From 82fc707ed88117e9c5e8341ea60eb947773b7f22 Mon Sep 17 00:00:00 2001
From: System Two <alexandre@55seeds.com>
Date: Sun, 5 Apr 2026 17:46:42 +0000
Subject: [PATCH 1/4] chore(develop): 034-eval

---
 src/cli/cli.rs                                |   7 +-
 src/cli/commands/eval/mod.rs                  |  62 ++++
 src/cli/commands/eval/report.rs               |  71 ++++
 src/cli/commands/eval/run.rs                  | 251 ++++++++++++++
 src/cli/commands/eval/score.rs                | 137 ++++++++
 src/cli/commands/eval/validate.rs             | 117 +++++++
 src/cli/commands/init.rs                      |   1 +
 src/cli/commands/mod.rs                       |   8 +
 src/core/manifest.rs                          |  27 ++
 src/core/packaging.rs                         |  10 +
 src/core/repository.rs                        |   2 +
 src/eval/artifacts.rs                         | 209 ++++++++++++
 src/eval/checks.rs                            | 315 ++++++++++++++++++
 src/eval/config.rs                            | 140 ++++++++
 src/eval/mod.rs                               |   8 +
 src/eval/runner.rs                            | 163 +++++++++
 src/eval/suite.rs                             | 277 +++++++++++++++
 src/eval/trace.rs                             | 101 ++++++
 src/lib.rs                                    |   1 +
 tests/cli/eval_tests.rs                       | 213 ++++++++++++
 tests/cli/mod.rs                              |   1 +
 tests/cli/repository_tests.rs                 |   1 +
 ...apshot_helpers__cli_directory_walking.snap |   1 +
 ...pers__cli_finds_skills_in_current_dir.snap |   1 +
 ...elpers__cli_invalid_repositories_path.snap |   1 +
 ...lpers__cli_repositories_path_argument.snap |   1 +
 ...i__snapshot_helpers__cli_with_env_var.snap |   1 +
 ...sts__cli__snapshot_helpers__eval_help.snap |  21 ++
 ...i__snapshot_helpers__eval_report_help.snap |  15 +
 ..._cli__snapshot_helpers__eval_run_help.snap |  20 ++
 ...li__snapshot_helpers__eval_score_help.snap |  16 +
 ..._snapshot_helpers__eval_validate_help.snap |  16 +
 ...sts__cli__snapshot_helpers__help_flag.snap |   1 +
 ...__cli__snapshot_helpers__help_no_args.snap |   2 +
 ...li__snapshot_helpers__help_short_flag.snap |   1 +
 ..._snapshot_helpers__serve_invalid_port.snap |   1 +
 ...shot_helpers__sources_add_missing_url.snap |   1 +
 tests/core_manifest_tests.rs                  |   1 +
 38 files changed, 2221 insertions(+), 1 deletion(-)
 create mode 100644 src/cli/commands/eval/mod.rs
 create mode 100644 src/cli/commands/eval/report.rs
 create mode 100644 src/cli/commands/eval/run.rs
 create mode 100644 src/cli/commands/eval/score.rs
 create mode 100644 src/cli/commands/eval/validate.rs
 create mode 100644 src/eval/artifacts.rs
 create mode 100644 src/eval/checks.rs
 create mode 100644 src/eval/config.rs
 create mode 100644 src/eval/mod.rs
 create mode 100644 src/eval/runner.rs
 create mode 100644 src/eval/suite.rs
 create mode 100644 src/eval/trace.rs
 create mode 100644 tests/cli/eval_tests.rs
 create mode 100644 tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_help.snap
 create mode 100644 tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_report_help.snap
 create mode 100644 tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_run_help.snap
 create mode 100644 tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_score_help.snap
 create mode 100644 tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_validate_help.snap

diff --git a/src/cli/cli.rs b/src/cli/cli.rs
index 5e26959b..b58eaf20 100644
--- a/src/cli/cli.rs
+++ b/src/cli/cli.rs
@@ -5,7 +5,7 @@ use fastskill::FastSkillService;
 use std::sync::Arc;
 
 use crate::cli::commands::{
-    add, analyze, auth, disable, init, install, list, marketplace, package, publish, read,
+    add, analyze, auth, disable, eval, init, install, list, marketplace, package, publish, read,
     registry, reindex, remove, repos, search, serve, show, sources, sync, update, version,
     Commands,
 };
@@ -109,6 +109,10 @@ impl Cli {
             return auth::execute_auth(args).await;
         }
 
+        if let Some(Commands::Eval(args)) = self.command {
+            return eval::execute_eval(args).await;
+        }
+
         // For other commands, we need to restore self.command, so we need a different approach
         // Actually, if we get here, command was None or not Init/Repository/Package/Publish/RegistryIndex
         let command = self.command;
@@ -155,6 +159,7 @@ impl Cli {
             | Some(Commands::Repos(_))
             | Some(Commands::Marketplace(_))
             | Some(Commands::Auth(_))
+            | Some(Commands::Eval(_))
             | Some(Commands::Version(_)) => unreachable!("Handled above"),
             None => {
                 // No subcommand - treat as skill ID for read command
diff --git a/src/cli/commands/eval/mod.rs b/src/cli/commands/eval/mod.rs
new file mode 100644
index 00000000..0b2888a5
--- /dev/null
+++ b/src/cli/commands/eval/mod.rs
@@ -0,0 +1,62 @@
+//! Eval command group for skill quality assurance
+
+pub mod report;
+pub mod run;
+pub mod score;
+pub mod validate;
+
+use crate::cli::error::CliResult;
+use clap::{Args, Subcommand};
+
+/// Eval command group
+#[derive(Debug, Args)]
+#[command(
+    about = "Evaluation commands for skill quality assurance",
+    after_help = "Examples:\n  fastskill eval validate\n  fastskill eval run --agent codex --output-dir /tmp/evals"
+)]
+pub struct EvalCommand {
+    #[command(subcommand)]
+    pub command: EvalSubcommand,
+}
+
+/// Eval subcommands
+#[derive(Debug, Subcommand)]
+pub enum EvalSubcommand {
+    /// Validate eval configuration and files
+    #[command(
+        about = "Validate eval configuration and files",
+        after_help = "Examples:\n  fastskill eval validate\n  fastskill eval validate --agent codex"
+    )]
+    Validate(validate::ValidateArgs),
+
+    /// Run eval cases against an agent
+    #[command(
+        about = "Run eval cases against an agent",
+        after_help = "Examples:\n  fastskill eval run --agent codex --output-dir /tmp/evals"
+    )]
+    Run(run::RunArgs),
+
+    /// Show a report for a completed eval run
+    #[command(
+        about = "Show a report for a completed eval run",
+        after_help = "Examples:\n  fastskill eval report --run-dir /tmp/evals/2026-04-01T14-32-10Z"
+    )]
+    Report(report::ReportArgs),
+
+    /// Re-score saved eval artifacts without running the agent again
+    #[command(
+        about = "Re-score saved eval artifacts without running the agent again",
+        after_help = "Examples:\n  fastskill eval score --run-dir /tmp/evals/2026-04-01T14-32-10Z"
+    )]
+    Score(score::ScoreArgs),
+}
+
+/// Execute the eval command group
+pub async fn execute_eval(args: EvalCommand) -> CliResult<()> {
+    match args.command {
+        EvalSubcommand::Validate(args) => validate::execute_validate(args).await,
+        EvalSubcommand::Run(args) => run::execute_run(args).await,
+        EvalSubcommand::Report(args) => report::execute_report(args).await,
+        EvalSubcommand::Score(args) => score::execute_score(args).await,
+    }
+}
diff --git a/src/cli/commands/eval/report.rs b/src/cli/commands/eval/report.rs
new file mode 100644
index 00000000..c7877b0b
--- /dev/null
+++ b/src/cli/commands/eval/report.rs
@@ -0,0 +1,71 @@
+//! Eval report subcommand - artifact summary and formatting
+
+use crate::cli::error::{CliError, CliResult};
+use clap::Args;
+use fastskill::eval::artifacts::read_summary;
+use std::path::PathBuf;
+
+/// Arguments for `fastskill eval report`
+#[derive(Debug, Args)]
+#[command(
+    about = "Show a report for a completed eval run",
+    after_help = "Examples:\n  fastskill eval report --run-dir /tmp/evals/2026-04-01T14-32-10Z\n  fastskill eval report --run-dir ./evals/2026-04-01T14-32-10Z --json"
+)]
+pub struct ReportArgs {
+    /// Path to the specific run directory
+    #[arg(long, required = true)]
+    pub run_dir: PathBuf,
+
+    /// Output as JSON
+    #[arg(long)]
+    pub json: bool,
+}
+
+/// Execute the `eval report` command
+pub async fn execute_report(args: ReportArgs) -> CliResult<()> {
+    if !args.run_dir.exists() {
+        return Err(CliError::Config(format!(
+            "EVAL_ARTIFACTS_CORRUPT: Run directory does not exist: {}",
+            args.run_dir.display()
+        )));
+    }
+
+    let summary = read_summary(&args.run_dir).map_err(|e| {
+        CliError::Config(format!(
+            "EVAL_ARTIFACTS_CORRUPT: Failed to read summary.json: {}",
+            e
+        ))
+    })?;
+
+    if args.json {
+        println!(
+            "{}",
+            serde_json::to_string_pretty(&summary).unwrap_or_default()
+        );
+    } else {
+        println!("Eval Report");
+        println!("  run_dir: {}", args.run_dir.display());
+        println!("  agent: {}", summary.agent);
+        if let Some(ref model) = summary.model {
+            println!("  model: {}", model);
+        }
+        println!(
+            "  result: {}",
+            if summary.suite_pass {
+                "PASSED"
+            } else {
+                "FAILED"
+            }
+        );
+        println!("  cases: {}/{} passed", summary.passed, summary.total_cases);
+
+        if !summary.cases.is_empty() {
+            println!("\nCase Results:");
+            for case in &summary.cases {
+                println!("  [{}] {}", case.status, case.id);
+            }
+        }
+    }
+
+    Ok(())
+}
diff --git a/src/cli/commands/eval/run.rs b/src/cli/commands/eval/run.rs
new file mode 100644
index 00000000..483f49fa
--- /dev/null
+++ b/src/cli/commands/eval/run.rs
@@ -0,0 +1,251 @@
+//! Eval run subcommand - case execution orchestration
+
+use crate::cli::error::{CliError, CliResult};
+use aikit_sdk::{is_agent_available, is_runnable, runnable_agents};
+use chrono::Utc;
+use clap::Args;
+use fastskill::core::project::resolve_project_file;
+use fastskill::eval::artifacts::{
+    allocate_run_dir, write_case_artifacts, write_summary, CaseSummary, SummaryResult,
+};
+use fastskill::eval::checks::load_checks;
+use fastskill::eval::config::resolve_eval_config;
+use fastskill::eval::runner::{run_eval_case, CaseRunOptions};
+use fastskill::eval::suite::load_suite;
+use fastskill::eval::trace::{stdout_to_trace, trace_to_jsonl};
+use std::env;
+use std::path::PathBuf;
+
+/// Arguments for `fastskill eval run`
+#[derive(Debug, Args)]
+#[command(
+    about = "Run eval cases against an agent",
+    after_help = "Examples:\n  fastskill eval run --agent codex --output-dir /tmp/evals\n  fastskill eval run --agent claude --output-dir ./evals --case my-case\n  fastskill eval run --agent codex --output-dir ./evals --tag basic"
+)]
+pub struct RunArgs {
+    /// Agent to use for execution (required)
+    #[arg(long, required = true, value_parser = validate_agent_key_for_run)]
+    pub agent: String,
+
+    /// Output directory for artifacts (required)
+    #[arg(long, required = true)]
+    pub output_dir: PathBuf,
+
+    /// Optional model override forwarded to the agent
+    #[arg(long)]
+    pub model: Option<String>,
+
+    /// Filter: run only the case with this ID
+    #[arg(long)]
+    pub case: Option<String>,
+
+    /// Filter: run only cases with this tag
+    #[arg(long)]
+    pub tag: Option<String>,
+
+    /// Output as JSON
+    #[arg(long)]
+    pub json: bool,
+
+    /// Do not fail with non-zero exit code on suite failure
+    #[arg(long)]
+    pub no_fail: bool,
+}
+
+fn validate_agent_key_for_run(s: &str) -> Result<String, String> {
+    if is_runnable(s) {
+        Ok(s.to_string())
+    } else {
+        Err(format!(
+            "'{}' is not a supported agent. Supported: {}",
+            s,
+            runnable_agents().join(", ")
+        ))
+    }
+}
+
+/// Execute the `eval run` command
+pub async fn execute_run(args: RunArgs) -> CliResult<()> {
+    let current_dir = env::current_dir()
+        .map_err(|e| CliError::Config(format!("Failed to get current directory: {}", e)))?;
+
+    let resolution = resolve_project_file(&current_dir);
+    if !resolution.found {
+        return Err(CliError::Config(
+            "EVAL_CONFIG_MISSING: No skill-project.toml found. Run 'fastskill init' first."
+                .to_string(),
+        ));
+    }
+
+    let project_root = resolution
+        .path
+        .parent()
+        .unwrap_or(&current_dir)
+        .to_path_buf();
+
+    let eval_config = resolve_eval_config(&resolution.path, &project_root)
+        .map_err(|e| CliError::Config(e.to_string()))?;
+
+    // Check agent availability
+    if eval_config.fail_on_missing_agent && !is_agent_available(&args.agent) {
+        return Err(CliError::Config(format!(
+            "EVAL_AGENT_UNAVAILABLE: Agent '{}' is not available. Install it first.",
+            args.agent
+        )));
+    }
+
+    // Load suite
+    let mut suite =
+        load_suite(&eval_config.prompts_path).map_err(|e| CliError::Config(e.to_string()))?;
+
+    // Apply filters
+    if let Some(ref case_id) = args.case {
+        suite = suite.filter_by_id(case_id);
+        if suite.cases.is_empty() {
+            return Err(CliError::Config(format!(
+                "No case found with id '{}'",
+                case_id
+            )));
+        }
+    }
+    if let Some(ref tag) = args.tag {
+        suite = suite.filter_by_tag(tag);
+        if suite.cases.is_empty() {
+            return Err(CliError::Config(format!(
+                "No cases found with tag '{}'",
+                tag
+            )));
+        }
+    }
+
+    // Load checks if configured
+    let checks = if let Some(ref checks_path) = eval_config.checks_path {
+        load_checks(checks_path).map_err(|e| CliError::Config(e.to_string()))?
+    } else {
+        vec![]
+    };
+
+    // Allocate run directory
+    let run_id = Utc::now().format("%Y-%m-%dT%H-%M-%SZ").to_string();
+    std::fs::create_dir_all(&args.output_dir).map_err(|e| {
+        CliError::Config(format!(
+            "Failed to create output directory '{}': {}",
+            args.output_dir.display(),
+            e
+        ))
+    })?;
+    let run_dir =
+        allocate_run_dir(&args.output_dir, &run_id).map_err(|e| CliError::Config(e.to_string()))?;
+
+    let run_opts = CaseRunOptions {
+        agent_key: args.agent.clone(),
+        model: args.model.clone(),
+        project_root: project_root.clone(),
+        timeout_seconds: eval_config.timeout_seconds,
+    };
+
+    eprintln!(
+        "Running {} eval case(s) with agent '{}'...",
+        suite.cases.len(),
+        args.agent
+    );
+
+    let mut case_results = Vec::new();
+    let mut case_summaries = Vec::new();
+
+    for case in &suite.cases {
+        eprintln!("  Running case '{}'...", case.id);
+
+        let (run_output, case_result) = run_eval_case(case, &run_opts, &checks).await;
+
+        // Build trace
+        let trace_events = stdout_to_trace(&run_output.stdout);
+        let trace_jsonl = trace_to_jsonl(&trace_events);
+
+        // Write artifacts
+        if let Err(e) = write_case_artifacts(
+            &run_dir,
+            &case.id,
+            &run_output.stdout,
+            &run_output.stderr,
+            &trace_jsonl,
+            &case_result,
+        ) {
+            eprintln!(
+                "  warning: failed to write artifacts for case '{}': {}",
+                case.id, e
+            );
+        }
+
+        let summary_entry = CaseSummary {
+            id: case_result.id.clone(),
+            status: case_result.status.clone(),
+            command_count: case_result.command_count,
+            input_tokens: case_result.input_tokens,
+            output_tokens: case_result.output_tokens,
+        };
+
+        case_summaries.push(summary_entry);
+        case_results.push(case_result);
+    }
+
+    let passed = case_results
+        .iter()
+        .filter(|r| r.status == fastskill::eval::artifacts::CaseStatus::Passed)
+        .count();
+    let failed = case_results.len() - passed;
+    let suite_pass = failed == 0;
+
+    let summary = SummaryResult {
+        suite_pass,
+        agent: args.agent.clone(),
+        model: args.model.clone(),
+        total_cases: case_results.len(),
+        passed,
+        failed,
+        run_dir: run_dir.clone(),
+        checks_path: eval_config.checks_path.map(|p| {
+            if p.is_absolute() {
+                p
+            } else {
+                project_root.join(p)
+            }
+        }),
+        skill_project_root: project_root,
+        cases: case_summaries,
+    };
+
+    // Write summary
+    if let Err(e) = write_summary(&run_dir, &summary) {
+        eprintln!("warning: failed to write summary.json: {}", e);
+    }
+
+    if args.json {
+        println!(
+            "{}",
+            serde_json::to_string_pretty(&summary).unwrap_or_default()
+        );
+    } else {
+        println!(
+            "\nEval run complete: {}/{} passed",
+            passed,
+            case_results.len()
+        );
+        println!("  run_dir: {}", run_dir.display());
+        if suite_pass {
+            println!("  result: PASSED");
+        } else {
+            println!("  result: FAILED ({} case(s) failed)", failed);
+        }
+    }
+
+    if !suite_pass && !args.no_fail {
+        return Err(CliError::Config(format!(
+            "Eval suite failed: {}/{} cases passed",
+            passed,
+            case_results.len()
+        )));
+    }
+
+    Ok(())
+}
diff --git a/src/cli/commands/eval/score.rs b/src/cli/commands/eval/score.rs
new file mode 100644
index 00000000..6afa3911
--- /dev/null
+++ b/src/cli/commands/eval/score.rs
@@ -0,0 +1,137 @@
+//! Eval score subcommand - offline re-scoring from saved artifacts
+
+use crate::cli::error::{CliError, CliResult};
+use clap::Args;
+use fastskill::eval::artifacts::{read_summary, write_summary, CaseStatus};
+use fastskill::eval::checks::load_checks;
+use std::path::PathBuf;
+
+/// Arguments for `fastskill eval score`
+#[derive(Debug, Args)]
+#[command(
+    about = "Re-score saved eval artifacts without running the agent again",
+    after_help = "Examples:\n  fastskill eval score --run-dir /tmp/evals/2026-04-01T14-32-10Z\n  fastskill eval score --run-dir ./evals/2026-04-01T14-32-10Z --json"
+)]
+pub struct ScoreArgs {
+    /// Path to the run directory to re-score
+    #[arg(long, required = true)]
+    pub run_dir: PathBuf,
+
+    /// Output as JSON
+    #[arg(long)]
+    pub json: bool,
+
+    /// Do not fail with non-zero exit code on suite failure
+    #[arg(long)]
+    pub no_fail: bool,
+}
+
+/// Execute the `eval score` command
+pub async fn execute_score(args: ScoreArgs) -> CliResult<()> {
+    if !args.run_dir.exists() {
+        return Err(CliError::Config(format!(
+            "EVAL_ARTIFACTS_CORRUPT: Run directory does not exist: {}",
+            args.run_dir.display()
+        )));
+    }
+
+    // Read existing summary
+    let mut summary = read_summary(&args.run_dir).map_err(|e| {
+        CliError::Config(format!(
+            "EVAL_ARTIFACTS_CORRUPT: Failed to read summary.json: {}",
+            e
+        ))
+    })?;
+
+    // Validate that we have usable paths
+    let checks_path = summary.checks_path.as_ref().ok_or_else(|| {
+        CliError::Config(
+            "EVAL_ARTIFACTS_CORRUPT: summary.json lacks checks_path - cannot re-score".to_string(),
+        )
+    })?;
+
+    if !checks_path.exists() {
+        return Err(CliError::Config(format!(
+            "EVAL_ARTIFACTS_CORRUPT: checks_path '{}' does not exist",
+            checks_path.display()
+        )));
+    }
+
+    // Load checks
+    let checks = load_checks(checks_path).map_err(|e| CliError::Config(e.to_string()))?;
+
+    // Read existing case artifacts and re-score
+    let mut new_passed = 0;
+    let mut new_failed = 0;
+
+    let mut updated_cases = summary.cases.clone();
+
+    for case_summary in &mut updated_cases {
+        let case_dir = args.run_dir.join(&case_summary.id);
+        if !case_dir.exists() {
+            continue;
+        }
+
+        let stdout_path = case_dir.join("stdout.txt");
+        let trace_path = case_dir.join("trace.jsonl");
+
+        let stdout_content = std::fs::read_to_string(&stdout_path).unwrap_or_default();
+        let trace_jsonl = std::fs::read_to_string(&trace_path).unwrap_or_default();
+
+        let check_results = fastskill::eval::checks::run_checks(
+            &checks,
+            &stdout_content,
+            &trace_jsonl,
+            &summary.skill_project_root,
+        );
+
+        let all_passed = check_results.iter().all(|r| r.passed);
+        case_summary.status = if all_passed {
+            CaseStatus::Passed
+        } else {
+            CaseStatus::Failed
+        };
+
+        if all_passed {
+            new_passed += 1;
+        } else {
+            new_failed += 1;
+        }
+    }
+
+    summary.passed = new_passed;
+    summary.failed = new_failed;
+    summary.suite_pass = new_failed == 0;
+    summary.cases = updated_cases;
+
+    // Update summary.json
+    write_summary(&args.run_dir, &summary)
+        .map_err(|e| CliError::Config(format!("Failed to write updated summary.json: {}", e)))?;
+
+    if args.json {
+        println!(
+            "{}",
+            serde_json::to_string_pretty(&summary).unwrap_or_default()
+        );
+    } else {
+        println!("Re-scoring complete");
+        println!(
+            "  result: {}",
+            if summary.suite_pass {
+                "PASSED"
+            } else {
+                "FAILED"
+            }
+        );
+        println!("  cases: {}/{} passed", summary.passed, summary.total_cases);
+    }
+
+    if !summary.suite_pass && !args.no_fail {
+        return Err(CliError::Config(format!(
+            "Eval suite failed: {}/{} cases passed after re-scoring",
+            summary.passed, summary.total_cases
+        )));
+    }
+
+    Ok(())
+}
diff --git a/src/cli/commands/eval/validate.rs b/src/cli/commands/eval/validate.rs
new file mode 100644
index 00000000..54e52933
--- /dev/null
+++ b/src/cli/commands/eval/validate.rs
@@ -0,0 +1,117 @@
+//! Eval validate subcommand - configuration and file validation
+
+use crate::cli::error::{CliError, CliResult};
+use aikit_sdk::{is_agent_available, is_runnable, runnable_agents};
+use clap::Args;
+use fastskill::core::project::resolve_project_file;
+use fastskill::eval::config::resolve_eval_config;
+use std::env;
+
+/// Arguments for `fastskill eval validate`
+#[derive(Debug, Args)]
+#[command(
+    about = "Validate eval configuration and files",
+    after_help = "Examples:\n  fastskill eval validate\n  fastskill eval validate --agent codex"
+)]
+pub struct ValidateArgs {
+    /// Check agent availability for the specified agent key
+    #[arg(long, value_parser = validate_agent_key_parser)]
+    pub agent: Option<String>,
+
+    /// Output as JSON
+    #[arg(long)]
+    pub json: bool,
+}
+
+fn validate_agent_key_parser(s: &str) -> Result<String, String> {
+    if is_runnable(s) {
+        Ok(s.to_string())
+    } else {
+        Err(format!(
+            "'{}' is not a supported agent. Supported: {}",
+            s,
+            runnable_agents().join(", ")
+        ))
+    }
+}
+
+/// Execute the `eval validate` command
+pub async fn execute_validate(args: ValidateArgs) -> CliResult<()> {
+    let current_dir = env::current_dir()
+        .map_err(|e| CliError::Config(format!("Failed to get current directory: {}", e)))?;
+
+    let resolution = resolve_project_file(&current_dir);
+
+    if !resolution.found {
+        return Err(CliError::Config(
+            "EVAL_CONFIG_MISSING: No skill-project.toml found. Run 'fastskill init' first."
+                .to_string(),
+        ));
+    }
+
+    let project_root = resolution
+        .path
+        .parent()
+        .unwrap_or(&current_dir)
+        .to_path_buf();
+
+    let eval_config = resolve_eval_config(&resolution.path, &project_root)
+        .map_err(|e| CliError::Config(e.to_string()))?;
+
+    // Check agent availability if --agent was specified
+    if let Some(ref agent_key) = args.agent {
+        let available = is_agent_available(agent_key);
+        if !available && eval_config.fail_on_missing_agent {
+            return Err(CliError::Config(format!(
+                "EVAL_AGENT_UNAVAILABLE: Agent '{}' is not available. Install it or use --agent with an available agent.",
+                agent_key
+            )));
+        }
+        if !available {
+            eprintln!(
+                "warning: agent '{}' is not available (fail_on_missing_agent=false, continuing)",
+                agent_key
+            );
+        }
+    }
+
+    if args.json {
+        let output = serde_json::json!({
+            "valid": true,
+            "prompts_path": eval_config.prompts_path,
+            "checks_path": eval_config.checks_path,
+            "timeout_seconds": eval_config.timeout_seconds,
+            "fail_on_missing_agent": eval_config.fail_on_missing_agent,
+            "project_root": eval_config.project_root,
+        });
+        println!(
+            "{}",
+            serde_json::to_string_pretty(&output).unwrap_or_default()
+        );
+    } else {
+        println!("eval configuration: valid");
+        println!("  prompts: {}", eval_config.prompts_path.display());
+        if let Some(ref checks) = eval_config.checks_path {
+            println!("  checks: {}", checks.display());
+        }
+        println!("  timeout: {}s", eval_config.timeout_seconds);
+        println!(
+            "  fail_on_missing_agent: {}",
+            eval_config.fail_on_missing_agent
+        );
+        if let Some(ref agent_key) = args.agent {
+            let available = is_agent_available(agent_key);
+            println!(
+                "  agent '{}': {}",
+                agent_key,
+                if available {
+                    "available"
+                } else {
+                    "unavailable"
+                }
+            );
+        }
+    }
+
+    Ok(())
+}
diff --git a/src/cli/commands/init.rs b/src/cli/commands/init.rs
index ada79bc0..bbd97d69 100644
--- a/src/cli/commands/init.rs
+++ b/src/cli/commands/init.rs
@@ -255,6 +255,7 @@ fn build_skill_project(meta: InitMetadata<'_>) -> CliResult<SkillProjectToml> {
             server: None,
             install_depth: 5,
             skip_transitive: false,
+            eval: None,
         }),
     });
     validate_project_structure(true, dependencies.is_some())
diff --git a/src/cli/commands/mod.rs b/src/cli/commands/mod.rs
index b2499505..566585f8 100644
--- a/src/cli/commands/mod.rs
+++ b/src/cli/commands/mod.rs
@@ -5,6 +5,7 @@ pub mod analyze;
 pub mod auth;
 pub mod common;
 pub mod disable;
+pub mod eval;
 pub mod init;
 pub mod install;
 pub mod list;
@@ -57,6 +58,13 @@ pub enum Commands {
     )]
     Disable(disable::DisableArgs),
 
+    /// Evaluation commands for skill quality assurance
+    #[command(
+        about = "Evaluation commands for skill quality assurance",
+        after_help = "Examples:\n  fastskill eval validate\n  fastskill eval run --agent codex --output-dir /tmp/evals"
+    )]
+    Eval(eval::EvalCommand),
+
     /// Initialize skill-project.toml for skill authors
     #[command(
         about = "Initialize skill-project.toml in current skill directory",
diff --git a/src/core/manifest.rs b/src/core/manifest.rs
index abc171b3..50120010 100644
--- a/src/core/manifest.rs
+++ b/src/core/manifest.rs
@@ -273,6 +273,33 @@ pub struct FastSkillToolConfig {
     /// Skip transitive dependency resolution entirely (default: false)
     #[serde(default)]
     pub skip_transitive: bool,
+    /// Optional evaluation configuration
+    #[serde(default)]
+    pub eval: Option<EvalConfigToml>,
+}
+
+/// Evaluation configuration in TOML format ([tool.fastskill.eval])
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct EvalConfigToml {
+    /// Path to prompts CSV file (relative to skill project root)
+    pub prompts: PathBuf,
+    /// Optional path to checks TOML file
+    #[serde(default)]
+    pub checks: Option<PathBuf>,
+    /// Timeout in seconds for each eval case execution
+    #[serde(default = "default_eval_timeout_seconds")]
+    pub timeout_seconds: u64,
+    /// When true, `eval run` / `eval validate --agent` fail fast if the agent CLI is not available
+    #[serde(default = "default_fail_on_missing_agent")]
+    pub fail_on_missing_agent: bool,
+}
+
+fn default_eval_timeout_seconds() -> u64 {
+    900
+}
+
+fn default_fail_on_missing_agent() -> bool {
+    true
 }
 
 fn default_install_depth() -> u32 {
diff --git a/src/core/packaging.rs b/src/core/packaging.rs
index 892c8356..4c1ea9b8 100644
--- a/src/core/packaging.rs
+++ b/src/core/packaging.rs
@@ -157,6 +157,11 @@ pub fn package_skill_with_id(
             continue;
         }
 
+        // Skip evals/ directory from published artifacts
+        if relative_path_str.starts_with("evals/") || relative_path_str == "evals" {
+            continue;
+        }
+
         // Read file content
         let mut file_content = Vec::new();
         let mut file = fs::File::open(file_path).map_err(ServiceError::Io)?;
@@ -219,6 +224,11 @@ pub fn package_skill_with_id(
             continue;
         }
 
+        // Skip evals/ directory from published artifacts
+        if relative_path_str.starts_with("evals/") || relative_path_str == "evals" {
+            continue;
+        }
+
         let mut file_content = Vec::new();
         let mut file = fs::File::open(file_path).map_err(ServiceError::Io)?;
         file.read_to_end(&mut file_content)
diff --git a/src/core/repository.rs b/src/core/repository.rs
index a51163e6..5fcf1f49 100644
--- a/src/core/repository.rs
+++ b/src/core/repository.rs
@@ -251,6 +251,7 @@ impl RepositoryManager {
                     server: None,
                     install_depth: 5,
                     skip_transitive: false,
+                    eval: None,
                 }),
             });
         } else if let Some(ref mut tool) = project.tool {
@@ -262,6 +263,7 @@ impl RepositoryManager {
                     server: None,
                     install_depth: 5,
                     skip_transitive: false,
+                    eval: None,
                 });
             } else if let Some(ref mut fastskill) = tool.fastskill {
                 fastskill.repositories = Some(manifest_repos);
diff --git a/src/eval/artifacts.rs b/src/eval/artifacts.rs
new file mode 100644
index 00000000..b11a5b1e
--- /dev/null
+++ b/src/eval/artifacts.rs
@@ -0,0 +1,209 @@
+//! Artifact layout and persistence for eval runs
+
+use crate::eval::checks::CheckResult;
+use serde::{Deserialize, Serialize};
+use std::path::{Path, PathBuf};
+use thiserror::Error;
+
+/// Status of a single eval case
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "lowercase")]
+pub enum CaseStatus {
+    Passed,
+    Failed,
+    Error,
+    Skipped,
+}
+
+impl std::fmt::Display for CaseStatus {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            CaseStatus::Passed => write!(f, "passed"),
+            CaseStatus::Failed => write!(f, "failed"),
+            CaseStatus::Error => write!(f, "error"),
+            CaseStatus::Skipped => write!(f, "skipped"),
+        }
+    }
+}
+
+/// Per-case result
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct CaseResult {
+    pub id: String,
+    pub status: CaseStatus,
+    pub command_count: Option<usize>,
+    pub input_tokens: Option<u64>,
+    pub output_tokens: Option<u64>,
+    #[serde(default)]
+    pub check_results: Vec<CheckResult>,
+    pub error_message: Option<String>,
+}
+
+/// Aggregated run summary
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct SummaryResult {
+    pub suite_pass: bool,
+    pub agent: String,
+    pub model: Option<String>,
+    pub total_cases: usize,
+    pub passed: usize,
+    pub failed: usize,
+    pub run_dir: PathBuf,
+    pub checks_path: Option<PathBuf>,
+    pub skill_project_root: PathBuf,
+    pub cases: Vec<CaseSummary>,
+}
+
+/// Per-case summary entry in summary.json
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct CaseSummary {
+    pub id: String,
+    pub status: CaseStatus,
+    pub command_count: Option<usize>,
+    pub input_tokens: Option<u64>,
+    pub output_tokens: Option<u64>,
+}
+
+/// All artifacts from a completed run
+#[derive(Debug)]
+pub struct RunArtifacts {
+    pub run_id: String,
+    pub run_dir: PathBuf,
+    pub summary: SummaryResult,
+    pub case_results: Vec<CaseResult>,
+}
+
+/// Errors during artifact writing/reading
+#[derive(Debug, Error)]
+pub enum ArtifactsError {
+    #[error("EVAL_ARTIFACTS_CORRUPT: IO error: {0}")]
+    Io(#[from] std::io::Error),
+    #[error("EVAL_ARTIFACTS_CORRUPT: JSON error: {0}")]
+    Json(#[from] serde_json::Error),
+    #[error("EVAL_ARTIFACTS_CORRUPT: Missing required field: {0}")]
+    MissingField(String),
+}
+
+/// Allocate a run directory under output_dir using ISO 8601 timestamp format
+/// Appends numeric suffix if directory already exists
+pub fn allocate_run_dir(output_dir: &Path, run_id: &str) -> Result<PathBuf, ArtifactsError> {
+    let base = output_dir.join(run_id);
+    if !base.exists() {
+        std::fs::create_dir_all(&base)?;
+        return Ok(base);
+    }
+
+    // Append numeric suffix
+    for i in 2..=999 {
+        let candidate = output_dir.join(format!("{}-{}", run_id, i));
+        if !candidate.exists() {
+            std::fs::create_dir_all(&candidate)?;
+            return Ok(candidate);
+        }
+    }
+
+    // Fallback: use the base (it exists, will overwrite)
+    Ok(base)
+}
+
+/// Write per-case artifacts (stdout.txt, stderr.txt, trace.jsonl, result.json)
+pub fn write_case_artifacts(
+    run_dir: &Path,
+    case_id: &str,
+    stdout: &[u8],
+    stderr: &[u8],
+    trace_jsonl: &str,
+    result: &CaseResult,
+) -> Result<PathBuf, ArtifactsError> {
+    let case_dir = run_dir.join(case_id);
+    std::fs::create_dir_all(&case_dir)?;
+
+    std::fs::write(case_dir.join("stdout.txt"), stdout)?;
+    std::fs::write(case_dir.join("stderr.txt"), stderr)?;
+    std::fs::write(case_dir.join("trace.jsonl"), trace_jsonl)?;
+
+    let result_json = serde_json::to_string_pretty(result)?;
+    std::fs::write(case_dir.join("result.json"), result_json)?;
+
+    Ok(case_dir)
+}
+
+/// Write summary.json
+pub fn write_summary(run_dir: &Path, summary: &SummaryResult) -> Result<(), ArtifactsError> {
+    let summary_json = serde_json::to_string_pretty(summary)?;
+    std::fs::write(run_dir.join("summary.json"), summary_json)?;
+    Ok(())
+}
+
+/// Read summary.json from a run directory
+pub fn read_summary(run_dir: &Path) -> Result<SummaryResult, ArtifactsError> {
+    let summary_path = run_dir.join("summary.json");
+    let content = std::fs::read_to_string(&summary_path)?;
+    let summary: SummaryResult = serde_json::from_str(&content)?;
+    Ok(summary)
+}
+
+/// Read case result.json files from a run directory
+pub fn read_case_results(run_dir: &Path) -> Result<Vec<CaseResult>, ArtifactsError> {
+    let mut results = Vec::new();
+
+    let entries = std::fs::read_dir(run_dir)?;
+    for entry in entries.flatten() {
+        let path = entry.path();
+        if path.is_dir() {
+            let result_path = path.join("result.json");
+            if result_path.exists() {
+                let content = std::fs::read_to_string(&result_path)?;
+                let result: CaseResult = serde_json::from_str(&content)?;
+                results.push(result);
+            }
+        }
+    }
+
+    Ok(results)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tempfile::TempDir;
+
+    #[test]
+    fn test_allocate_run_dir_creates_new() {
+        let dir = TempDir::new().unwrap();
+        let run_dir = allocate_run_dir(dir.path(), "2026-04-01T14-00-00Z").unwrap();
+        assert!(run_dir.exists());
+        assert!(run_dir.ends_with("2026-04-01T14-00-00Z"));
+    }
+
+    #[test]
+    fn test_allocate_run_dir_suffix_on_conflict() {
+        let dir = TempDir::new().unwrap();
+        let run_dir1 = allocate_run_dir(dir.path(), "2026-04-01T14-00-00Z").unwrap();
+        let run_dir2 = allocate_run_dir(dir.path(), "2026-04-01T14-00-00Z").unwrap();
+        assert_ne!(run_dir1, run_dir2);
+        assert!(run_dir2.to_string_lossy().contains("-2"));
+    }
+
+    #[test]
+    fn test_write_and_read_summary() {
+        let dir = TempDir::new().unwrap();
+        let summary = SummaryResult {
+            suite_pass: true,
+            agent: "codex".to_string(),
+            model: None,
+            total_cases: 2,
+            passed: 2,
+            failed: 0,
+            run_dir: dir.path().to_path_buf(),
+            checks_path: None,
+            skill_project_root: dir.path().to_path_buf(),
+            cases: vec![],
+        };
+
+        write_summary(dir.path(), &summary).unwrap();
+        let read = read_summary(dir.path()).unwrap();
+        assert_eq!(read.total_cases, 2);
+        assert!(read.suite_pass);
+    }
+}
diff --git a/src/eval/checks.rs b/src/eval/checks.rs
new file mode 100644
index 00000000..a7d94e5e
--- /dev/null
+++ b/src/eval/checks.rs
@@ -0,0 +1,315 @@
+//! Deterministic check engine for eval artifact scoring
+
+use serde::{Deserialize, Serialize};
+use std::path::PathBuf;
+use thiserror::Error;
+
+/// A deterministic check definition loaded from checks.toml
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(tag = "name")]
+pub enum CheckDefinition {
+    /// Check whether a pattern appears (or doesn't appear) in the trace
+    #[serde(rename = "trigger_expectation")]
+    TriggerExpectation {
+        pattern: String,
+        /// true = pattern must appear; false = pattern must NOT appear
+        expected: bool,
+        #[serde(default = "default_required")]
+        required: bool,
+    },
+    /// Check whether the trace contains a line with this pattern
+    #[serde(rename = "command_contains")]
+    CommandContains {
+        pattern: String,
+        #[serde(default = "default_required")]
+        required: bool,
+    },
+    /// Check whether a file exists in the working directory after execution
+    #[serde(rename = "file_exists")]
+    FileExists {
+        path: PathBuf,
+        #[serde(default = "default_required")]
+        required: bool,
+    },
+    /// Check that the number of raw_json trace lines does not exceed a limit
+    #[serde(rename = "max_command_count")]
+    MaxCommandCount {
+        limit: usize,
+        #[serde(default = "default_required")]
+        required: bool,
+    },
+}
+
+fn default_required() -> bool {
+    true
+}
+
+impl CheckDefinition {
+    pub fn name(&self) -> &str {
+        match self {
+            CheckDefinition::TriggerExpectation { .. } => "trigger_expectation",
+            CheckDefinition::CommandContains { .. } => "command_contains",
+            CheckDefinition::FileExists { .. } => "file_exists",
+            CheckDefinition::MaxCommandCount { .. } => "max_command_count",
+        }
+    }
+
+    pub fn is_required(&self) -> bool {
+        match self {
+            CheckDefinition::TriggerExpectation { required, .. } => *required,
+            CheckDefinition::CommandContains { required, .. } => *required,
+            CheckDefinition::FileExists { required, .. } => *required,
+            CheckDefinition::MaxCommandCount { required, .. } => *required,
+        }
+    }
+}
+
+/// Result of a single check
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct CheckResult {
+    pub check_name: String,
+    pub passed: bool,
+    pub message: Option<String>,
+}
+
+/// TOML file structure for checks configuration
+#[derive(Debug, Deserialize)]
+pub struct ChecksToml {
+    #[serde(rename = "check", default)]
+    pub checks: Vec<CheckDefinition>,
+}
+
+/// Errors loading checks configuration
+#[derive(Debug, Error)]
+pub enum ChecksError {
+    #[error("EVAL_CHECKS_INVALID: Failed to read checks file: {0}")]
+    Io(#[from] std::io::Error),
+    #[error("EVAL_CHECKS_INVALID: Failed to parse checks TOML: {0}")]
+    Parse(#[from] toml::de::Error),
+}
+
+/// Load check definitions from a TOML file
+pub fn load_checks(path: &std::path::Path) -> Result<Vec<CheckDefinition>, ChecksError> {
+    let content = std::fs::read_to_string(path)?;
+    let parsed: ChecksToml = toml::from_str(&content)?;
+    Ok(parsed.checks)
+}
+
+/// Run all checks against captured stdout content and working directory
+pub fn run_checks(
+    checks: &[CheckDefinition],
+    stdout_content: &str,
+    trace_jsonl: &str,
+    working_dir: &std::path::Path,
+) -> Vec<CheckResult> {
+    checks
+        .iter()
+        .map(|check| run_single_check(check, stdout_content, trace_jsonl, working_dir))
+        .collect()
+}
+
+fn run_single_check(
+    check: &CheckDefinition,
+    stdout_content: &str,
+    trace_jsonl: &str,
+    working_dir: &std::path::Path,
+) -> CheckResult {
+    match check {
+        CheckDefinition::TriggerExpectation {
+            pattern, expected, ..
+        } => {
+            // Literal substring matching in stdout + trace
+            let combined = format!("{}\n{}", stdout_content, trace_jsonl);
+            let found = combined.contains(pattern.as_str());
+            let passed = found == *expected;
+            let message = if passed {
+                None
+            } else if *expected {
+                Some(format!("Pattern '{}' not found but was expected", pattern))
+            } else {
+                Some(format!("Pattern '{}' found but was not expected", pattern))
+            };
+            CheckResult {
+                check_name: "trigger_expectation".to_string(),
+                passed,
+                message,
+            }
+        }
+        CheckDefinition::CommandContains { pattern, .. } => {
+            let combined = format!("{}\n{}", stdout_content, trace_jsonl);
+            let passed = combined.contains(pattern.as_str());
+            let message = if passed {
+                None
+            } else {
+                Some(format!("Pattern '{}' not found in output", pattern))
+            };
+            CheckResult {
+                check_name: "command_contains".to_string(),
+                passed,
+                message,
+            }
+        }
+        CheckDefinition::FileExists { path, .. } => {
+            let full_path = working_dir.join(path);
+            let passed = full_path.exists();
+            let message = if passed {
+                None
+            } else {
+                Some(format!("File '{}' does not exist", path.display()))
+            };
+            CheckResult {
+                check_name: "file_exists".to_string(),
+                passed,
+                message,
+            }
+        }
+        CheckDefinition::MaxCommandCount { limit, .. } => {
+            // Count raw_json trace lines
+            let count = trace_jsonl
+                .lines()
+                .filter(|line| line.contains("\"raw_json\""))
+                .count();
+            let passed = count <= *limit;
+            let message = if passed {
+                None
+            } else {
+                Some(format!("Command count {} exceeds limit {}", count, limit))
+            };
+            CheckResult {
+                check_name: "max_command_count".to_string(),
+                passed,
+                message,
+            }
+        }
+    }
+}
+
+/// Aggregate check results: suite passes if all required checks pass
+pub fn suite_passes(results: &[CheckResult]) -> bool {
+    results.iter().all(|r| r.passed)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::path::Path;
+    use tempfile::TempDir;
+
+    #[test]
+    fn test_trigger_expectation_passes_when_pattern_found() {
+        let check = CheckDefinition::TriggerExpectation {
+            pattern: "fastskill".to_string(),
+            expected: true,
+            required: true,
+        };
+        let results = run_checks(&[check], "fastskill triggered", "", Path::new("/tmp"));
+        assert!(results[0].passed);
+    }
+
+    #[test]
+    fn test_trigger_expectation_fails_when_pattern_missing() {
+        let check = CheckDefinition::TriggerExpectation {
+            pattern: "fastskill".to_string(),
+            expected: true,
+            required: true,
+        };
+        let results = run_checks(&[check], "nothing here", "", Path::new("/tmp"));
+        assert!(!results[0].passed);
+    }
+
+    #[test]
+    fn test_trigger_expectation_negative_passes_when_pattern_absent() {
+        let check = CheckDefinition::TriggerExpectation {
+            pattern: "fastskill".to_string(),
+            expected: false,
+            required: true,
+        };
+        let results = run_checks(&[check], "no match", "", Path::new("/tmp"));
+        assert!(results[0].passed);
+    }
+
+    #[test]
+    fn test_file_exists_check_passes() {
+        let dir = TempDir::new().unwrap();
+        let file_path = dir.path().join("output.txt");
+        std::fs::write(&file_path, "content").unwrap();
+
+        let check = CheckDefinition::FileExists {
+            path: PathBuf::from("output.txt"),
+            required: true,
+        };
+        let results = run_checks(&[check], "", "", dir.path());
+        assert!(results[0].passed);
+    }
+
+    #[test]
+    fn test_file_exists_check_fails() {
+        let dir = TempDir::new().unwrap();
+        let check = CheckDefinition::FileExists {
+            path: PathBuf::from("missing.txt"),
+            required: true,
+        };
+        let results = run_checks(&[check], "", "", dir.path());
+        assert!(!results[0].passed);
+    }
+
+    #[test]
+    fn test_max_command_count_passes() {
+        let check = CheckDefinition::MaxCommandCount {
+            limit: 5,
+            required: true,
+        };
+        let trace = r#"{"type":"raw_json","seq":0}
+{"type":"raw_json","seq":1}
+{"type":"raw_line","seq":2}"#;
+        let results = run_checks(&[check], "", trace, Path::new("/tmp"));
+        assert!(results[0].passed);
+    }
+
+    #[test]
+    fn test_max_command_count_fails() {
+        let check = CheckDefinition::MaxCommandCount {
+            limit: 1,
+            required: true,
+        };
+        let trace = r#"{"type":"raw_json","seq":0}
+{"type":"raw_json","seq":1}
+{"type":"raw_json","seq":2}"#;
+        let results = run_checks(&[check], "", trace, Path::new("/tmp"));
+        assert!(!results[0].passed);
+    }
+
+    #[test]
+    fn test_suite_passes_all_passed() {
+        let results = vec![
+            CheckResult {
+                check_name: "a".to_string(),
+                passed: true,
+                message: None,
+            },
+            CheckResult {
+                check_name: "b".to_string(),
+                passed: true,
+                message: None,
+            },
+        ];
+        assert!(suite_passes(&results));
+    }
+
+    #[test]
+    fn test_suite_passes_any_failed() {
+        let results = vec![
+            CheckResult {
+                check_name: "a".to_string(),
+                passed: true,
+                message: None,
+            },
+            CheckResult {
+                check_name: "b".to_string(),
+                passed: false,
+                message: Some("failed".to_string()),
+            },
+        ];
+        assert!(!suite_passes(&results));
+    }
+}
diff --git a/src/eval/config.rs b/src/eval/config.rs
new file mode 100644
index 00000000..ecb6e5b5
--- /dev/null
+++ b/src/eval/config.rs
@@ -0,0 +1,140 @@
+//! Eval configuration resolution
+
+use crate::core::manifest::{EvalConfigToml, SkillProjectToml};
+use std::path::{Path, PathBuf};
+use thiserror::Error;
+
+/// Resolved eval configuration (after path resolution)
+#[derive(Debug, Clone)]
+pub struct EvalConfig {
+    /// Absolute path to prompts CSV file
+    pub prompts_path: PathBuf,
+    /// Absolute path to checks TOML file (optional)
+    pub checks_path: Option<PathBuf>,
+    /// Timeout in seconds for each case
+    pub timeout_seconds: u64,
+    /// Whether to fail fast if agent is not available
+    pub fail_on_missing_agent: bool,
+    /// Skill project root directory
+    pub project_root: PathBuf,
+}
+
+/// Errors during eval configuration resolution
+#[derive(Debug, Error)]
+pub enum EvalConfigError {
+    #[error("EVAL_CONFIG_MISSING: No [tool.fastskill.eval] section found in skill-project.toml")]
+    ConfigMissing,
+    #[error("EVAL_PROMPTS_NOT_FOUND: Prompts CSV not found: {0}")]
+    PromptsNotFound(PathBuf),
+    #[error("Failed to read skill-project.toml: {0}")]
+    Io(#[from] std::io::Error),
+    #[error("Failed to parse skill-project.toml: {0}")]
+    Parse(String),
+}
+
+/// Resolve eval configuration from a skill-project.toml file
+pub fn resolve_eval_config(
+    project_file: &Path,
+    project_root: &Path,
+) -> Result<EvalConfig, EvalConfigError> {
+    let content = std::fs::read_to_string(project_file)?;
+    let toml: SkillProjectToml =
+        toml::from_str(&content).map_err(|e| EvalConfigError::Parse(e.to_string()))?;
+
+    let eval_config = toml
+        .tool
+        .as_ref()
+        .and_then(|t| t.fastskill.as_ref())
+        .and_then(|f| f.eval.as_ref())
+        .ok_or(EvalConfigError::ConfigMissing)?;
+
+    resolve_from_toml(eval_config, project_root)
+}
+
+/// Resolve eval configuration from parsed TOML config
+pub fn resolve_from_toml(
+    config: &EvalConfigToml,
+    project_root: &Path,
+) -> Result<EvalConfig, EvalConfigError> {
+    let prompts_path = if config.prompts.is_absolute() {
+        config.prompts.clone()
+    } else {
+        project_root.join(&config.prompts)
+    };
+
+    if !prompts_path.exists() {
+        return Err(EvalConfigError::PromptsNotFound(prompts_path));
+    }
+
+    let checks_path = config.checks.as_ref().map(|p| {
+        if p.is_absolute() {
+            p.clone()
+        } else {
+            project_root.join(p)
+        }
+    });
+
+    Ok(EvalConfig {
+        prompts_path,
+        checks_path,
+        timeout_seconds: config.timeout_seconds,
+        fail_on_missing_agent: config.fail_on_missing_agent,
+        project_root: project_root.to_path_buf(),
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tempfile::TempDir;
+
+    #[test]
+    fn test_resolve_eval_config_missing() {
+        let dir = TempDir::new().unwrap();
+        let project_file = dir.path().join("skill-project.toml");
+        std::fs::write(&project_file, "[metadata]\nid = \"test\"\n").unwrap();
+
+        let result = resolve_eval_config(&project_file, dir.path());
+        assert!(matches!(result, Err(EvalConfigError::ConfigMissing)));
+    }
+
+    #[test]
+    fn test_resolve_eval_config_prompts_not_found() {
+        let dir = TempDir::new().unwrap();
+        let project_file = dir.path().join("skill-project.toml");
+        std::fs::write(
+            &project_file,
+            "[metadata]\nid = \"test\"\n\n[tool.fastskill.eval]\nprompts = \"evals/prompts.csv\"\ntimeout_seconds = 900\nfail_on_missing_agent = true\n",
+        )
+        .unwrap();
+
+        let result = resolve_eval_config(&project_file, dir.path());
+        assert!(matches!(result, Err(EvalConfigError::PromptsNotFound(_))));
+    }
+
+    #[test]
+    fn test_resolve_eval_config_success() {
+        let dir = TempDir::new().unwrap();
+        let evals_dir = dir.path().join("evals");
+        std::fs::create_dir_all(&evals_dir).unwrap();
+        let prompts_file = evals_dir.join("prompts.csv");
+        std::fs::write(
+            &prompts_file,
+            "id,prompt,should_trigger\ntest-1,hello,true\n",
+        )
+        .unwrap();
+
+        let project_file = dir.path().join("skill-project.toml");
+        std::fs::write(
+            &project_file,
+            "[metadata]\nid = \"test\"\n\n[tool.fastskill.eval]\nprompts = \"evals/prompts.csv\"\ntimeout_seconds = 600\nfail_on_missing_agent = false\n",
+        )
+        .unwrap();
+
+        let result = resolve_eval_config(&project_file, dir.path());
+        assert!(result.is_ok());
+        let config = result.unwrap();
+        assert_eq!(config.timeout_seconds, 600);
+        assert!(!config.fail_on_missing_agent);
+    }
+}
diff --git a/src/eval/mod.rs b/src/eval/mod.rs
new file mode 100644
index 00000000..bedfcb3c
--- /dev/null
+++ b/src/eval/mod.rs
@@ -0,0 +1,8 @@
+//! Evaluation domain types and services for skill quality assurance
+
+pub mod artifacts;
+pub mod checks;
+pub mod config;
+pub mod runner;
+pub mod suite;
+pub mod trace;
diff --git a/src/eval/runner.rs b/src/eval/runner.rs
new file mode 100644
index 00000000..24394f6c
--- /dev/null
+++ b/src/eval/runner.rs
@@ -0,0 +1,163 @@
+//! Eval runner implementation using aikit-sdk
+
+use crate::eval::artifacts::{CaseResult, CaseStatus};
+use crate::eval::checks::{run_checks, CheckDefinition};
+use crate::eval::suite::EvalCase;
+use crate::eval::trace::{stdout_to_trace, trace_to_jsonl};
+use aikit_sdk::{run_agent, RunOptions};
+use std::path::PathBuf;
+use thiserror::Error;
+
+/// Options for running a single eval case
+#[derive(Debug, Clone)]
+pub struct CaseRunOptions {
+    /// Agent key (e.g. "codex", "claude")
+    pub agent_key: String,
+    /// Optional model override
+    pub model: Option<String>,
+    /// Skill project root (used as working directory when no workspace_subdir)
+    pub project_root: PathBuf,
+    /// Timeout in seconds
+    pub timeout_seconds: u64,
+}
+
+/// Raw output from running a case
+#[derive(Debug)]
+pub struct CaseRunOutput {
+    pub stdout: Vec<u8>,
+    pub stderr: Vec<u8>,
+    pub exit_code: Option<i32>,
+    pub timed_out: bool,
+}
+
+/// Errors during case execution
+#[derive(Debug, Error)]
+pub enum RunnerError {
+    #[error("EVAL_AGENT_UNAVAILABLE: Agent '{0}' is not available")]
+    AgentUnavailable(String),
+    #[error("EVAL_CASE_TIMEOUT: Case timed out after {0}s")]
+    Timeout(u64),
+    #[error("Execution failed: {0}")]
+    ExecutionFailed(String),
+}
+
+/// Run a single eval case using aikit_sdk::run_agent via spawn_blocking.
+///
+/// This is the sole agent execution path per spec (acceptance criterion 29).
+/// Uses spawn_blocking because run_agent is synchronous.
+pub async fn run_eval_case(
+    case: &EvalCase,
+    opts: &CaseRunOptions,
+    checks: &[CheckDefinition],
+) -> (CaseRunOutput, CaseResult) {
+    let agent_key = opts.agent_key.clone();
+    let model = opts.model.clone();
+    let prompt = case.prompt.clone();
+
+    // Determine working directory
+    let working_dir = match &case.workspace_subdir {
+        Some(subdir) => opts.project_root.join(subdir),
+        None => opts.project_root.clone(),
+    };
+
+    // Build RunOptions using available aikit-sdk API
+    let run_opts = RunOptions {
+        model: model.clone(),
+        yolo: true,
+        stream: false,
+    };
+
+    // Run agent in a blocking thread to avoid blocking the async runtime
+    let result =
+        tokio::task::spawn_blocking(move || run_agent(&agent_key, &prompt, run_opts)).await;
+
+    let run_output = match result {
+        Ok(Ok(run_result)) => {
+            let exit_code = run_result.exit_code();
+            CaseRunOutput {
+                stdout: run_result.stdout,
+                stderr: run_result.stderr,
+                exit_code,
+                timed_out: false,
+            }
+        }
+        Ok(Err(e)) => CaseRunOutput {
+            stdout: vec![],
+            stderr: format!("Agent execution failed: {}", e).into_bytes(),
+            exit_code: None,
+            timed_out: false,
+        },
+        Err(e) => CaseRunOutput {
+            stdout: vec![],
+            stderr: format!("spawn_blocking failed: {}", e).into_bytes(),
+            exit_code: None,
+            timed_out: false,
+        },
+    };
+
+    // Build trace from stdout
+    let trace_events = stdout_to_trace(&run_output.stdout);
+    let trace_jsonl = trace_to_jsonl(&trace_events);
+    let stdout_str = String::from_utf8_lossy(&run_output.stdout).to_string();
+
+    // Count raw_json events
+    let command_count = trace_events
+        .iter()
+        .filter(|e| matches!(&e.payload, crate::eval::trace::TracePayload::RawJson { .. }))
+        .count();
+
+    // Run deterministic checks
+    let check_results = run_checks(checks, &stdout_str, &trace_jsonl, &working_dir);
+    let all_passed = check_results.iter().all(|r| r.passed);
+
+    let status = if run_output.timed_out {
+        CaseStatus::Error
+    } else if checks.is_empty() {
+        // No checks defined: pass if exit code is 0
+        if run_output.exit_code == Some(0) {
+            CaseStatus::Passed
+        } else {
+            CaseStatus::Failed
+        }
+    } else if all_passed {
+        CaseStatus::Passed
+    } else {
+        CaseStatus::Failed
+    };
+
+    let case_result = CaseResult {
+        id: case.id.clone(),
+        status,
+        command_count: Some(command_count),
+        input_tokens: None,
+        output_tokens: None,
+        check_results,
+        error_message: None,
+    };
+
+    (run_output, case_result)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_case_run_options_builder() {
+        let opts = CaseRunOptions {
+            agent_key: "codex".to_string(),
+            model: Some("gpt-4".to_string()),
+            project_root: PathBuf::from("/tmp"),
+            timeout_seconds: 300,
+        };
+        assert_eq!(opts.agent_key, "codex");
+        assert_eq!(opts.model, Some("gpt-4".to_string()));
+    }
+
+    #[test]
+    fn test_runner_error_display() {
+        let err = RunnerError::AgentUnavailable("codex".to_string());
+        assert!(err.to_string().contains("codex"));
+        assert!(err.to_string().contains("EVAL_AGENT_UNAVAILABLE"));
+    }
+}
diff --git a/src/eval/suite.rs b/src/eval/suite.rs
new file mode 100644
index 00000000..8d4df581
--- /dev/null
+++ b/src/eval/suite.rs
@@ -0,0 +1,277 @@
+//! Eval suite loading and CSV parsing
+
+use std::path::{Path, PathBuf};
+use thiserror::Error;
+
+/// An individual eval case definition
+#[derive(Debug, Clone)]
+pub struct EvalCase {
+    /// Unique case identifier (alphanumeric + hyphens)
+    pub id: String,
+    /// Prompt to send to the agent
+    pub prompt: String,
+    /// Whether the skill should trigger (documentation-only; checks.toml is authoritative for pass/fail)
+    pub should_trigger: bool,
+    /// Tags for filtering
+    pub tags: Vec<String>,
+    /// Optional workspace subdirectory (relative to skill project root)
+    pub workspace_subdir: Option<PathBuf>,
+}
+
+/// A collection of eval cases
+#[derive(Debug, Default)]
+pub struct EvalSuite {
+    pub cases: Vec<EvalCase>,
+}
+
+impl EvalSuite {
+    pub fn new(cases: Vec<EvalCase>) -> Self {
+        Self { cases }
+    }
+
+    /// Filter cases by ID
+    pub fn filter_by_id(&self, id: &str) -> EvalSuite {
+        EvalSuite {
+            cases: self.cases.iter().filter(|c| c.id == id).cloned().collect(),
+        }
+    }
+
+    /// Filter cases by tag
+    pub fn filter_by_tag(&self, tag: &str) -> EvalSuite {
+        EvalSuite {
+            cases: self
+                .cases
+                .iter()
+                .filter(|c| c.tags.iter().any(|t| t == tag))
+                .cloned()
+                .collect(),
+        }
+    }
+}
+
+/// Errors that can occur when loading an eval suite
+#[derive(Debug, Error)]
+pub enum SuiteError {
+    #[error("EVAL_PROMPTS_NOT_FOUND: Prompts CSV file not found: {0}")]
+    PromptsNotFound(PathBuf),
+    #[error("EVAL_INVALID_CSV: {0}")]
+    InvalidCsv(String),
+    #[error("EVAL_INVALID_CSV: IO error: {0}")]
+    Io(#[from] std::io::Error),
+}
+
+/// Load an eval suite from a CSV file
+///
+/// Expected CSV columns: id,prompt,should_trigger,tags,workspace_subdir
+pub fn load_suite(prompts_path: &Path) -> Result<EvalSuite, SuiteError> {
+    if !prompts_path.exists() {
+        return Err(SuiteError::PromptsNotFound(prompts_path.to_path_buf()));
+    }
+
+    let content = std::fs::read_to_string(prompts_path)?;
+    parse_prompts_csv(&content)
+}
+
+/// Parse prompts CSV content
+fn parse_prompts_csv(content: &str) -> Result<EvalSuite, SuiteError> {
+    let mut lines = content.lines();
+
+    // Parse header
+    let header = lines
+        .next()
+        .ok_or_else(|| SuiteError::InvalidCsv("CSV is empty".to_string()))?;
+    let headers: Vec<String> = parse_csv_line(header);
+
+    // Find column indices
+    let id_idx = find_col(&headers, "id")?;
+    let prompt_idx = find_col(&headers, "prompt")?;
+    let should_trigger_idx = find_col(&headers, "should_trigger")?;
+    let tags_idx = headers.iter().position(|h| h.trim() == "tags");
+    let workspace_subdir_idx = headers.iter().position(|h| h.trim() == "workspace_subdir");
+
+    let mut cases = Vec::new();
+
+    for (line_num, line) in lines.enumerate() {
+        let line = line.trim();
+        if line.is_empty() {
+            continue;
+        }
+
+        let cols = parse_csv_line(line);
+
+        let id = cols
+            .get(id_idx)
+            .map(|s| s.trim().to_string())
+            .filter(|s| !s.is_empty())
+            .ok_or_else(|| {
+                SuiteError::InvalidCsv(format!("Missing id at line {}", line_num + 2))
+            })?;
+
+        let prompt = cols
+            .get(prompt_idx)
+            .map(|s| s.trim_matches('"').trim().to_string())
+            .ok_or_else(|| {
+                SuiteError::InvalidCsv(format!("Missing prompt at line {}", line_num + 2))
+            })?;
+
+        let should_trigger_str = cols
+            .get(should_trigger_idx)
+            .map(|s| s.trim().to_lowercase())
+            .unwrap_or_else(|| "false".to_string());
+        let should_trigger = should_trigger_str == "true" || should_trigger_str == "1";
+
+        let tags = if let Some(idx) = tags_idx {
+            cols.get(idx)
+                .map(|s| {
+                    s.trim()
+                        .trim_matches('"')
+                        .split(',')
+                        .map(|t| t.trim().to_string())
+                        .filter(|t| !t.is_empty())
+                        .collect()
+                })
+                .unwrap_or_default()
+        } else {
+            vec![]
+        };
+
+        let workspace_subdir = if let Some(idx) = workspace_subdir_idx {
+            cols.get(idx).and_then(|s| {
+                let s = s.trim();
+                if s.is_empty() {
+                    None
+                } else {
+                    Some(PathBuf::from(s))
+                }
+            })
+        } else {
+            None
+        };
+
+        cases.push(EvalCase {
+            id,
+            prompt,
+            should_trigger,
+            tags,
+            workspace_subdir,
+        });
+    }
+
+    Ok(EvalSuite::new(cases))
+}
+
+fn find_col(headers: &[String], name: &str) -> Result<usize, SuiteError> {
+    headers
+        .iter()
+        .position(|h| h.trim() == name)
+        .ok_or_else(|| SuiteError::InvalidCsv(format!("Missing required column: {}", name)))
+}
+
+/// Simple CSV line parser that handles quoted fields
+fn parse_csv_line(line: &str) -> Vec<String> {
+    let mut fields = Vec::new();
+    let mut current = String::new();
+    let mut in_quotes = false;
+    let mut chars = line.chars().peekable();
+
+    while let Some(ch) = chars.next() {
+        match ch {
+            '"' => {
+                if in_quotes {
+                    // Check for escaped quote ""
+                    if chars.peek() == Some(&'"') {
+                        chars.next();
+                        current.push('"');
+                    } else {
+                        in_quotes = false;
+                    }
+                } else {
+                    in_quotes = true;
+                }
+            }
+            ',' if !in_quotes => {
+                fields.push(current.clone());
+                current.clear();
+            }
+            _ => {
+                current.push(ch);
+            }
+        }
+    }
+    fields.push(current);
+    fields
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_parse_prompts_csv_basic() {
+        let csv = "id,prompt,should_trigger,tags,workspace_subdir\n\
+                   test-1,\"Do something\",true,\"basic\",\n\
+                   test-2,\"Do nothing\",false,\"\",\n";
+        let suite = parse_prompts_csv(csv).unwrap();
+        assert_eq!(suite.cases.len(), 2);
+        assert_eq!(suite.cases[0].id, "test-1");
+        assert!(suite.cases[0].should_trigger);
+        assert_eq!(suite.cases[1].id, "test-2");
+        assert!(!suite.cases[1].should_trigger);
+    }
+
+    #[test]
+    fn test_parse_prompts_csv_missing_required_col() {
+        let csv = "id,prompt\ntest-1,hello\n";
+        let result = parse_prompts_csv(csv);
+        assert!(result.is_err());
+        assert!(result.unwrap_err().to_string().contains("should_trigger"));
+    }
+
+    #[test]
+    fn test_filter_by_id() {
+        let cases = vec![
+            EvalCase {
+                id: "a".to_string(),
+                prompt: "p1".to_string(),
+                should_trigger: true,
+                tags: vec![],
+                workspace_subdir: None,
+            },
+            EvalCase {
+                id: "b".to_string(),
+                prompt: "p2".to_string(),
+                should_trigger: false,
+                tags: vec![],
+                workspace_subdir: None,
+            },
+        ];
+        let suite = EvalSuite::new(cases);
+        let filtered = suite.filter_by_id("a");
+        assert_eq!(filtered.cases.len(), 1);
+        assert_eq!(filtered.cases[0].id, "a");
+    }
+
+    #[test]
+    fn test_filter_by_tag() {
+        let cases = vec![
+            EvalCase {
+                id: "a".to_string(),
+                prompt: "p1".to_string(),
+                should_trigger: true,
+                tags: vec!["foo".to_string(), "bar".to_string()],
+                workspace_subdir: None,
+            },
+            EvalCase {
+                id: "b".to_string(),
+                prompt: "p2".to_string(),
+                should_trigger: false,
+                tags: vec!["baz".to_string()],
+                workspace_subdir: None,
+            },
+        ];
+        let suite = EvalSuite::new(cases);
+        let filtered = suite.filter_by_tag("foo");
+        assert_eq!(filtered.cases.len(), 1);
+        assert_eq!(filtered.cases[0].id, "a");
+    }
+}
diff --git a/src/eval/trace.rs b/src/eval/trace.rs
new file mode 100644
index 00000000..ef67d6b0
--- /dev/null
+++ b/src/eval/trace.rs
@@ -0,0 +1,101 @@
+//! Trace event types for eval case execution
+
+use serde::{Deserialize, Serialize};
+
+/// A single line in a trace.jsonl file
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct TraceEvent {
+    /// Sequence number (0-based)
+    pub seq: usize,
+    /// Event payload
+    pub payload: TracePayload,
+}
+
+/// Payload of a trace event
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(tag = "type", rename_all = "snake_case")]
+pub enum TracePayload {
+    /// A raw JSON line from the agent
+    RawJson { data: serde_json::Value },
+    /// A raw text line from stdout
+    RawLine { line: String },
+    /// A raw bytes chunk (base64-encoded)
+    RawBytes { b64: String },
+    /// Execution error
+    Error { message: String },
+    /// Case timed out
+    Timeout,
+}
+
+/// Convert raw stdout lines to trace events
+pub fn stdout_to_trace(stdout: &[u8]) -> Vec<TraceEvent> {
+    let text = String::from_utf8_lossy(stdout);
+    let mut events = Vec::new();
+
+    for (seq, line) in text.lines().enumerate() {
+        let trimmed = line.trim();
+        if trimmed.is_empty() {
+            continue;
+        }
+
+        // Try to parse as JSON first
+        let payload = if let Ok(value) = serde_json::from_str::<serde_json::Value>(trimmed) {
+            TracePayload::RawJson { data: value }
+        } else {
+            TracePayload::RawLine {
+                line: line.to_string(),
+            }
+        };
+
+        events.push(TraceEvent { seq, payload });
+    }
+
+    events
+}
+
+/// Serialize trace events to JSONL format
+pub fn trace_to_jsonl(events: &[TraceEvent]) -> String {
+    events
+        .iter()
+        .filter_map(|e| serde_json::to_string(e).ok())
+        .collect::<Vec<_>>()
+        .join("\n")
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_stdout_to_trace_text_lines() {
+        let stdout = b"hello world\nfoo bar\n";
+        let events = stdout_to_trace(stdout);
+        assert_eq!(events.len(), 2);
+        assert_eq!(events[0].seq, 0);
+        assert!(
+            matches!(&events[0].payload, TracePayload::RawLine { line } if line == "hello world")
+        );
+    }
+
+    #[test]
+    fn test_stdout_to_trace_json_lines() {
+        let stdout = b"{\"key\": \"value\"}\nplain line\n";
+        let events = stdout_to_trace(stdout);
+        assert_eq!(events.len(), 2);
+        assert!(matches!(&events[0].payload, TracePayload::RawJson { .. }));
+        assert!(matches!(&events[1].payload, TracePayload::RawLine { .. }));
+    }
+
+    #[test]
+    fn test_trace_to_jsonl() {
+        let events = vec![TraceEvent {
+            seq: 0,
+            payload: TracePayload::RawLine {
+                line: "test".to_string(),
+            },
+        }];
+        let jsonl = trace_to_jsonl(&events);
+        assert!(jsonl.contains("\"seq\":0"));
+        assert!(jsonl.contains("raw_line"));
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 6eef1a09..4dc9dc85 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -49,6 +49,7 @@
 //! ```
 
 pub mod core;
+pub mod eval;
 pub mod events;
 pub mod execution;
 pub mod http;
diff --git a/tests/cli/eval_tests.rs b/tests/cli/eval_tests.rs
new file mode 100644
index 00000000..26241a82
--- /dev/null
+++ b/tests/cli/eval_tests.rs
@@ -0,0 +1,213 @@
+//! CLI integration tests for eval commands
+
+#![allow(clippy::all, clippy::unwrap_used, clippy::expect_used)]
+
+use super::snapshot_helpers::{
+    assert_snapshot_with_settings, cli_snapshot_settings, run_fastskill_command,
+};
+
+#[test]
+fn test_eval_help() {
+    let result = run_fastskill_command(&["eval", "--help"], None);
+    assert!(result.success);
+    assert_snapshot_with_settings("eval_help", &result.stdout, &cli_snapshot_settings());
+}
+
+#[test]
+fn test_eval_validate_help() {
+    let result = run_fastskill_command(&["eval", "validate", "--help"], None);
+    assert!(result.success);
+    assert_snapshot_with_settings(
+        "eval_validate_help",
+        &result.stdout,
+        &cli_snapshot_settings(),
+    );
+}
+
+#[test]
+fn test_eval_run_help() {
+    let result = run_fastskill_command(&["eval", "run", "--help"], None);
+    assert!(result.success);
+    assert_snapshot_with_settings("eval_run_help", &result.stdout, &cli_snapshot_settings());
+}
+
+#[test]
+fn test_eval_report_help() {
+    let result = run_fastskill_command(&["eval", "report", "--help"], None);
+    assert!(result.success);
+    assert_snapshot_with_settings("eval_report_help", &result.stdout, &cli_snapshot_settings());
+}
+
+#[test]
+fn test_eval_score_help() {
+    let result = run_fastskill_command(&["eval", "score", "--help"], None);
+    assert!(result.success);
+    assert_snapshot_with_settings("eval_score_help", &result.stdout, &cli_snapshot_settings());
+}
+
+#[test]
+fn test_eval_run_requires_agent() {
+    let result = run_fastskill_command(&["eval", "run", "--output-dir", "/tmp/evals"], None);
+    assert!(!result.success);
+    let combined = format!("{}{}", result.stdout, result.stderr);
+    assert!(
+        combined.contains("--agent") || combined.contains("agent"),
+        "Expected error about missing --agent, got: {}",
+        combined
+    );
+}
+
+#[test]
+fn test_eval_run_requires_output_dir() {
+    let result = run_fastskill_command(&["eval", "run", "--agent", "codex"], None);
+    assert!(!result.success);
+    let combined = format!("{}{}", result.stdout, result.stderr);
+    assert!(
+        combined.contains("--output-dir")
+            || combined.contains("output-dir")
+            || combined.contains("output_dir"),
+        "Expected error about missing --output-dir, got: {}",
+        combined
+    );
+}
+
+#[test]
+fn test_eval_run_rejects_unsupported_agent() {
+    let result = run_fastskill_command(
+        &[
+            "eval",
+            "run",
+            "--agent",
+            "unsupported-agent-xyz",
+            "--output-dir",
+            "/tmp/evals",
+        ],
+        None,
+    );
+    assert!(!result.success);
+    let combined = format!("{}{}", result.stdout, result.stderr);
+    assert!(
+        combined.contains("unsupported-agent-xyz") || combined.contains("not a supported agent"),
+        "Expected error about unsupported agent, got: {}",
+        combined
+    );
+}
+
+#[test]
+fn test_eval_validate_no_project_file() {
+    use tempfile::TempDir;
+    let dir = TempDir::new().unwrap();
+    let result = run_fastskill_command(&["eval", "validate"], Some(dir.path()));
+    assert!(!result.success);
+    let combined = format!("{}{}", result.stdout, result.stderr);
+    assert!(
+        combined.contains("skill-project.toml") || combined.contains("EVAL_CONFIG_MISSING"),
+        "Expected error about missing skill-project.toml, got: {}",
+        combined
+    );
+}
+
+#[test]
+fn test_eval_validate_no_eval_config() {
+    use std::fs;
+    use tempfile::TempDir;
+
+    let dir = TempDir::new().unwrap();
+    fs::write(
+        dir.path().join("skill-project.toml"),
+        "[metadata]\nid = \"test-skill\"\n",
+    )
+    .unwrap();
+
+    let result = run_fastskill_command(&["eval", "validate"], Some(dir.path()));
+    assert!(!result.success);
+    let combined = format!("{}{}", result.stdout, result.stderr);
+    assert!(
+        combined.contains("EVAL_CONFIG_MISSING") || combined.contains("eval"),
+        "Expected EVAL_CONFIG_MISSING error, got: {}",
+        combined
+    );
+}
+
+#[test]
+fn test_eval_validate_with_eval_config() {
+    use std::fs;
+    use tempfile::TempDir;
+
+    let dir = TempDir::new().unwrap();
+
+    // Create evals directory and prompts.csv
+    let evals_dir = dir.path().join("evals");
+    fs::create_dir_all(&evals_dir).unwrap();
+    fs::write(
+        evals_dir.join("prompts.csv"),
+        "id,prompt,should_trigger,tags,workspace_subdir\ntest-1,\"Test prompt\",true,\"basic\",\n",
+    )
+    .unwrap();
+
+    // Create SKILL.md so it's detected as skill context
+    fs::write(dir.path().join("SKILL.md"), "# Test Skill\n").unwrap();
+
+    // Create skill-project.toml with eval config
+    fs::write(
+        dir.path().join("skill-project.toml"),
+        "[metadata]\nid = \"test-skill\"\n\n[tool.fastskill.eval]\nprompts = \"evals/prompts.csv\"\ntimeout_seconds = 300\nfail_on_missing_agent = false\n",
+    )
+    .unwrap();
+
+    let result = run_fastskill_command(&["eval", "validate"], Some(dir.path()));
+    assert!(
+        result.success,
+        "Expected eval validate to succeed, got stdout: {}, stderr: {}",
+        result.stdout, result.stderr
+    );
+    assert!(
+        result.stdout.contains("valid") || result.stdout.contains("prompts"),
+        "Expected valid output, got: {}",
+        result.stdout
+    );
+}
+
+#[test]
+fn test_eval_report_requires_run_dir() {
+    let result = run_fastskill_command(&["eval", "report"], None);
+    assert!(!result.success);
+    let combined = format!("{}{}", result.stdout, result.stderr);
+    assert!(
+        combined.contains("--run-dir") || combined.contains("run-dir"),
+        "Expected error about missing --run-dir, got: {}",
+        combined
+    );
+}
+
+#[test]
+fn test_eval_score_requires_run_dir() {
+    let result = run_fastskill_command(&["eval", "score"], None);
+    assert!(!result.success);
+    let combined = format!("{}{}", result.stdout, result.stderr);
+    assert!(
+        combined.contains("--run-dir") || combined.contains("run-dir"),
+        "Expected error about missing --run-dir, got: {}",
+        combined
+    );
+}
+
+#[test]
+fn test_eval_report_nonexistent_run_dir() {
+    let result = run_fastskill_command(
+        &[
+            "eval",
+            "report",
+            "--run-dir",
+            "/tmp/nonexistent-fastskill-eval-dir-xyz123",
+        ],
+        None,
+    );
+    assert!(!result.success);
+    let combined = format!("{}{}", result.stdout, result.stderr);
+    assert!(
+        combined.contains("EVAL_ARTIFACTS_CORRUPT") || combined.contains("not exist"),
+        "Expected error about nonexistent dir, got: {}",
+        combined
+    );
+}
diff --git a/tests/cli/mod.rs b/tests/cli/mod.rs
index 50d76de7..cda88146 100644
--- a/tests/cli/mod.rs
+++ b/tests/cli/mod.rs
@@ -7,6 +7,7 @@ pub mod add_tests;
 pub mod analyze_cluster_tests;
 pub mod auth_e2e_tests;
 pub mod config_tests;
+pub mod eval_tests;
 pub mod example_tests;
 pub mod help_tests;
 pub mod init_tests;
diff --git a/tests/cli/repository_tests.rs b/tests/cli/repository_tests.rs
index 820a2f65..be64a6df 100644
--- a/tests/cli/repository_tests.rs
+++ b/tests/cli/repository_tests.rs
@@ -85,6 +85,7 @@ web-scraper = "1.0.0"
         server: None,
         install_depth: 5,
         skip_transitive: false,
+        eval: None,
     });
     let repos = fastskill_config.repositories.get_or_insert_with(Vec::new);
 
diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_directory_walking.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_directory_walking.snap
index b5125c60..5d169eea 100644
--- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_directory_walking.snap
+++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_directory_walking.snap
@@ -24,6 +24,7 @@ Commands:
   analyze      Diagnostic and analysis commands
   auth         Manage authentication for registries
   disable      Disable skills by ID
+  eval         Evaluation commands for skill quality assurance
   init         Initialize skill-project.toml in current skill directory
   install      Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow)
   list         List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info)
diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_finds_skills_in_current_dir.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_finds_skills_in_current_dir.snap
index b5125c60..5d169eea 100644
--- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_finds_skills_in_current_dir.snap
+++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_finds_skills_in_current_dir.snap
@@ -24,6 +24,7 @@ Commands:
   analyze      Diagnostic and analysis commands
   auth         Manage authentication for registries
   disable      Disable skills by ID
+  eval         Evaluation commands for skill quality assurance
   init         Initialize skill-project.toml in current skill directory
   install      Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow)
   list         List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info)
diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_invalid_repositories_path.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_invalid_repositories_path.snap
index b5125c60..5d169eea 100644
--- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_invalid_repositories_path.snap
+++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_invalid_repositories_path.snap
@@ -24,6 +24,7 @@ Commands:
   analyze      Diagnostic and analysis commands
   auth         Manage authentication for registries
   disable      Disable skills by ID
+  eval         Evaluation commands for skill quality assurance
   init         Initialize skill-project.toml in current skill directory
   install      Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow)
   list         List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info)
diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_repositories_path_argument.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_repositories_path_argument.snap
index b5125c60..5d169eea 100644
--- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_repositories_path_argument.snap
+++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_repositories_path_argument.snap
@@ -24,6 +24,7 @@ Commands:
   analyze      Diagnostic and analysis commands
   auth         Manage authentication for registries
   disable      Disable skills by ID
+  eval         Evaluation commands for skill quality assurance
   init         Initialize skill-project.toml in current skill directory
   install      Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow)
   list         List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info)
diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_with_env_var.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_with_env_var.snap
index b5125c60..5d169eea 100644
--- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_with_env_var.snap
+++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_with_env_var.snap
@@ -24,6 +24,7 @@ Commands:
   analyze      Diagnostic and analysis commands
   auth         Manage authentication for registries
   disable      Disable skills by ID
+  eval         Evaluation commands for skill quality assurance
   init         Initialize skill-project.toml in current skill directory
   install      Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow)
   list         List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info)
diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_help.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_help.snap
new file mode 100644
index 00000000..5b3baaaf
--- /dev/null
+++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_help.snap
@@ -0,0 +1,21 @@
+---
+source: tests/cli/snapshot_helpers.rs
+expression: normalized
+---
+Evaluation commands for skill quality assurance
+
+Usage: fastskill eval <COMMAND>
+
+Commands:
+  validate  Validate eval configuration and files
+  run       Run eval cases against an agent
+  report    Show a report for a completed eval run
+  score     Re-score saved eval artifacts without running the agent again
+  help      Print this message or the help of the given subcommand(s)
+
+Options:
+  -h, --help  Print help
+
+Examples:
+  fastskill eval validate
+  fastskill eval run --agent codex --output-dir [TEMP_DIR]
diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_report_help.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_report_help.snap
new file mode 100644
index 00000000..ba01eee7
--- /dev/null
+++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_report_help.snap
@@ -0,0 +1,15 @@
+---
+source: tests/cli/snapshot_helpers.rs
+expression: normalized
+---
+Show a report for a completed eval run
+
+Usage: fastskill eval report [OPTIONS] --run-dir <RUN_DIR>
+
+Options:
+      --run-dir <RUN_DIR>  Path to the specific run directory
+      --json               Output as JSON
+  -h, --help               Print help
+
+Examples:
+  fastskill eval report --run-dir [TEMP_DIR]
diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_run_help.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_run_help.snap
new file mode 100644
index 00000000..ef5354c5
--- /dev/null
+++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_run_help.snap
@@ -0,0 +1,20 @@
+---
+source: tests/cli/snapshot_helpers.rs
+expression: normalized
+---
+Run eval cases against an agent
+
+Usage: fastskill eval run [OPTIONS] --agent <AGENT> --output-dir <OUTPUT_DIR>
+
+Options:
+      --agent <AGENT>            Agent to use for execution (required)
+      --output-dir <OUTPUT_DIR>  Output directory for artifacts (required)
+      --model <MODEL>            Optional model override forwarded to the agent
+      --case <CASE>              Filter: run only the case with this ID
+      --tag <TAG>                Filter: run only cases with this tag
+      --json                     Output as JSON
+      --no-fail                  Do not fail with non-zero exit code on suite failure
+  -h, --help                     Print help
+
+Examples:
+  fastskill eval run --agent codex --output-dir [TEMP_DIR]
diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_score_help.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_score_help.snap
new file mode 100644
index 00000000..101a2950
--- /dev/null
+++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_score_help.snap
@@ -0,0 +1,16 @@
+---
+source: tests/cli/snapshot_helpers.rs
+expression: normalized
+---
+Re-score saved eval artifacts without running the agent again
+
+Usage: fastskill eval score [OPTIONS] --run-dir <RUN_DIR>
+
+Options:
+      --run-dir <RUN_DIR>  Path to the run directory to re-score
+      --json               Output as JSON
+      --no-fail            Do not fail with non-zero exit code on suite failure
+  -h, --help               Print help
+
+Examples:
+  fastskill eval score --run-dir [TEMP_DIR]
diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_validate_help.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_validate_help.snap
new file mode 100644
index 00000000..b6dbac3c
--- /dev/null
+++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_validate_help.snap
@@ -0,0 +1,16 @@
+---
+source: tests/cli/snapshot_helpers.rs
+expression: normalized
+---
+Validate eval configuration and files
+
+Usage: fastskill eval validate [OPTIONS]
+
+Options:
+      --agent <AGENT>  Check agent availability for the specified agent key
+      --json           Output as JSON
+  -h, --help           Print help
+
+Examples:
+  fastskill eval validate
+  fastskill eval validate --agent codex
diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_flag.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_flag.snap
index b5125c60..5d169eea 100644
--- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_flag.snap
+++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_flag.snap
@@ -24,6 +24,7 @@ Commands:
   analyze      Diagnostic and analysis commands
   auth         Manage authentication for registries
   disable      Disable skills by ID
+  eval         Evaluation commands for skill quality assurance
   init         Initialize skill-project.toml in current skill directory
   install      Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow)
   list         List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info)
diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_no_args.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_no_args.snap
index 89ab80e9..37023f23 100644
--- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_no_args.snap
+++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_no_args.snap
@@ -11,6 +11,7 @@ Commands:
   analyze      Diagnostic and analysis commands
   auth         Manage authentication for registries
   disable      Disable skills by ID
+  eval         Evaluation commands for skill quality assurance
   init         Initialize skill-project.toml in current skill directory
   install      Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow)
   list         List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info)
@@ -47,6 +48,7 @@ Commands:
   analyze      Diagnostic and analysis commands
   auth         Manage authentication for registries
   disable      Disable skills by ID
+  eval         Evaluation commands for skill quality assurance
   init         Initialize skill-project.toml in current skill directory
   install      Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow)
   list         List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info)
diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_short_flag.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_short_flag.snap
index a8ba135c..611645cf 100644
--- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_short_flag.snap
+++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_short_flag.snap
@@ -11,6 +11,7 @@ Commands:
   analyze      Diagnostic and analysis commands
   auth         Manage authentication for registries
   disable      Disable skills by ID
+  eval         Evaluation commands for skill quality assurance
   init         Initialize skill-project.toml in current skill directory
   install      Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow)
   list         List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info)
diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__serve_invalid_port.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__serve_invalid_port.snap
index 7b21deef..f78dbd2d 100644
--- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__serve_invalid_port.snap
+++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__serve_invalid_port.snap
@@ -11,6 +11,7 @@ Commands:
   analyze      Diagnostic and analysis commands
   auth         Manage authentication for registries
   disable      Disable skills by ID
+  eval         Evaluation commands for skill quality assurance
   init         Initialize skill-project.toml in current skill directory
   install      Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow)
   list         List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info)
diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__sources_add_missing_url.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__sources_add_missing_url.snap
index c6bbaf4d..cdf953d2 100644
--- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__sources_add_missing_url.snap
+++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__sources_add_missing_url.snap
@@ -11,6 +11,7 @@ Commands:
   analyze      Diagnostic and analysis commands
   auth         Manage authentication for registries
   disable      Disable skills by ID
+  eval         Evaluation commands for skill quality assurance
   init         Initialize skill-project.toml in current skill directory
   install      Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow)
   list         List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info)
diff --git a/tests/core_manifest_tests.rs b/tests/core_manifest_tests.rs
index f40d939a..18f2cf3e 100644
--- a/tests/core_manifest_tests.rs
+++ b/tests/core_manifest_tests.rs
@@ -170,6 +170,7 @@ fn test_validation_project_level() {
                 server: None,
                 install_depth: 5,
                 skip_transitive: false,
+                eval: None,
             }),
         }),
     };

From f505cd0008cd6e27cb0b3a0a358ea2b79fd5d43b Mon Sep 17 00:00:00 2001
From: System Two <alexandre@55seeds.com>
Date: Sun, 5 Apr 2026 18:01:20 +0000
Subject: [PATCH 2/4] chore(develop): 034-eval

---
 Cargo.lock                                    |  25 ++---
 src/cli/commands/eval/report.rs               |  15 ++-
 src/cli/commands/eval/run.rs                  |  51 ++++++---
 src/cli/commands/eval/score.rs                |  15 ++-
 src/cli/commands/eval/validate.rs             |  15 ++-
 src/eval/runner.rs                            | 102 +++++++++++++-----
 src/eval/trace.rs                             |  26 +++++
 ...i__snapshot_helpers__eval_report_help.snap |   3 +-
 ..._cli__snapshot_helpers__eval_run_help.snap |   3 +-
 ...li__snapshot_helpers__eval_score_help.snap |   3 +-
 ..._snapshot_helpers__eval_validate_help.snap |   7 +-
 11 files changed, 192 insertions(+), 73 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index bc8c4c12..2624d3a2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -42,12 +42,13 @@ dependencies = [
 
 [[package]]
 name = "aikit-sdk"
-version = "0.1.49"
-source = "git+https://github.com/goaikit/aikit?branch=main#60577a3d43d1396b24e2879644f8cf5b93408a4f"
+version = "0.1.50"
+source = "git+https://github.com/goaikit/aikit?branch=main#524f98f1db7b3facbfb4c0d49267b7dc067ada9b"
 dependencies = [
  "glob",
  "reqwest 0.12.28",
  "serde",
+ "serde_json",
  "tempfile",
  "toml",
  "walkdir",
@@ -1315,7 +1316,7 @@ dependencies = [
  "libc",
  "option-ext",
  "redox_users",
- "windows-sys 0.61.2",
+ "windows-sys 0.59.0",
 ]
 
 [[package]]
@@ -1416,7 +1417,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
 dependencies = [
  "libc",
- "windows-sys 0.61.2",
+ "windows-sys 0.59.0",
 ]
 
 [[package]]
@@ -2081,7 +2082,7 @@ dependencies = [
  "libc",
  "percent-encoding",
  "pin-project-lite",
- "socket2 0.6.2",
+ "socket2 0.5.10",
  "tokio",
  "tower-service",
  "tracing",
@@ -2650,7 +2651,7 @@ version = "0.50.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5"
 dependencies = [
- "windows-sys 0.61.2",
+ "windows-sys 0.59.0",
 ]
 
 [[package]]
@@ -2958,7 +2959,7 @@ dependencies = [
  "quinn-udp",
  "rustc-hash",
  "rustls 0.23.36",
- "socket2 0.6.2",
+ "socket2 0.5.10",
  "thiserror 2.0.18",
  "tokio",
  "tracing",
@@ -2995,9 +2996,9 @@ dependencies = [
  "cfg_aliases",
  "libc",
  "once_cell",
- "socket2 0.6.2",
+ "socket2 0.5.10",
  "tracing",
- "windows-sys 0.60.2",
+ "windows-sys 0.59.0",
 ]
 
 [[package]]
@@ -3343,7 +3344,7 @@ dependencies = [
  "errno",
  "libc",
  "linux-raw-sys",
- "windows-sys 0.61.2",
+ "windows-sys 0.59.0",
 ]
 
 [[package]]
@@ -3856,7 +3857,7 @@ dependencies = [
  "getrandom 0.3.4",
  "once_cell",
  "rustix",
- "windows-sys 0.61.2",
+ "windows-sys 0.59.0",
 ]
 
 [[package]]
@@ -4537,7 +4538,7 @@ version = "0.1.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
 dependencies = [
- "windows-sys 0.61.2",
+ "windows-sys 0.48.0",
 ]
 
 [[package]]
diff --git a/src/cli/commands/eval/report.rs b/src/cli/commands/eval/report.rs
index c7877b0b..a8dda912 100644
--- a/src/cli/commands/eval/report.rs
+++ b/src/cli/commands/eval/report.rs
@@ -1,8 +1,10 @@
 //! Eval report subcommand - artifact summary and formatting
 
+use crate::cli::commands::common::validate_format_args;
 use crate::cli::error::{CliError, CliResult};
 use clap::Args;
 use fastskill::eval::artifacts::read_summary;
+use fastskill::OutputFormat;
 use std::path::PathBuf;
 
 /// Arguments for `fastskill eval report`
@@ -16,13 +18,20 @@ pub struct ReportArgs {
     #[arg(long, required = true)]
     pub run_dir: PathBuf,
 
-    /// Output as JSON
-    #[arg(long)]
+    /// Output format: table, json, grid, xml (default: table)
+    #[arg(long, value_enum, help = "Output format: table, json, grid, xml")]
+    pub format: Option<OutputFormat>,
+
+    /// Shorthand for --format json
+    #[arg(long, help = "Shorthand for --format json")]
     pub json: bool,
 }
 
 /// Execute the `eval report` command
 pub async fn execute_report(args: ReportArgs) -> CliResult<()> {
+    let format = validate_format_args(&args.format, args.json)?;
+    let use_json = format == OutputFormat::Json;
+
     if !args.run_dir.exists() {
         return Err(CliError::Config(format!(
             "EVAL_ARTIFACTS_CORRUPT: Run directory does not exist: {}",
@@ -37,7 +46,7 @@ pub async fn execute_report(args: ReportArgs) -> CliResult<()> {
         ))
     })?;
 
-    if args.json {
+    if use_json {
         println!(
             "{}",
             serde_json::to_string_pretty(&summary).unwrap_or_default()
diff --git a/src/cli/commands/eval/run.rs b/src/cli/commands/eval/run.rs
index 483f49fa..58621a80 100644
--- a/src/cli/commands/eval/run.rs
+++ b/src/cli/commands/eval/run.rs
@@ -1,5 +1,6 @@
 //! Eval run subcommand - case execution orchestration
 
+use crate::cli::commands::common::validate_format_args;
 use crate::cli::error::{CliError, CliResult};
 use aikit_sdk::{is_agent_available, is_runnable, runnable_agents};
 use chrono::Utc;
@@ -12,7 +13,8 @@ use fastskill::eval::checks::load_checks;
 use fastskill::eval::config::resolve_eval_config;
 use fastskill::eval::runner::{run_eval_case, CaseRunOptions};
 use fastskill::eval::suite::load_suite;
-use fastskill::eval::trace::{stdout_to_trace, trace_to_jsonl};
+use fastskill::eval::trace::trace_to_jsonl;
+use fastskill::OutputFormat;
 use std::env;
 use std::path::PathBuf;
 
@@ -43,8 +45,12 @@ pub struct RunArgs {
     #[arg(long)]
     pub tag: Option<String>,
 
-    /// Output as JSON
-    #[arg(long)]
+    /// Output format: table, json, grid, xml (default: table)
+    #[arg(long, value_enum, help = "Output format: table, json, grid, xml")]
+    pub format: Option<OutputFormat>,
+
+    /// Shorthand for --format json
+    #[arg(long, help = "Shorthand for --format json")]
     pub json: bool,
 
     /// Do not fail with non-zero exit code on suite failure
@@ -66,6 +72,9 @@ fn validate_agent_key_for_run(s: &str) -> Result<String, String> {
 
 /// Execute the `eval run` command
 pub async fn execute_run(args: RunArgs) -> CliResult<()> {
+    let format = validate_format_args(&args.format, args.json)?;
+    let use_json = format == OutputFormat::Json;
+
     let current_dir = env::current_dir()
         .map_err(|e| CliError::Config(format!("Failed to get current directory: {}", e)))?;
 
@@ -144,22 +153,26 @@ pub async fn execute_run(args: RunArgs) -> CliResult<()> {
         timeout_seconds: eval_config.timeout_seconds,
     };
 
-    eprintln!(
-        "Running {} eval case(s) with agent '{}'...",
-        suite.cases.len(),
-        args.agent
-    );
+    if !use_json {
+        eprintln!(
+            "Running {} eval case(s) with agent '{}'...",
+            suite.cases.len(),
+            args.agent
+        );
+    }
 
     let mut case_results = Vec::new();
     let mut case_summaries = Vec::new();
 
     for case in &suite.cases {
-        eprintln!("  Running case '{}'...", case.id);
+        if !use_json {
+            eprintln!("  Running case '{}'...", case.id);
+        }
 
         let (run_output, case_result) = run_eval_case(case, &run_opts, &checks).await;
 
-        // Build trace
-        let trace_events = stdout_to_trace(&run_output.stdout);
+        // Build trace from stdout (since runner collects it)
+        let trace_events = fastskill::eval::trace::stdout_to_trace(&run_output.stdout);
         let trace_jsonl = trace_to_jsonl(&trace_events);
 
         // Write artifacts
@@ -171,10 +184,12 @@ pub async fn execute_run(args: RunArgs) -> CliResult<()> {
             &trace_jsonl,
             &case_result,
         ) {
-            eprintln!(
-                "  warning: failed to write artifacts for case '{}': {}",
-                case.id, e
-            );
+            if !use_json {
+                eprintln!(
+                    "  warning: failed to write artifacts for case '{}': {}",
+                    case.id, e
+                );
+            }
         }
 
         let summary_entry = CaseSummary {
@@ -217,10 +232,12 @@ pub async fn execute_run(args: RunArgs) -> CliResult<()> {
 
     // Write summary
     if let Err(e) = write_summary(&run_dir, &summary) {
-        eprintln!("warning: failed to write summary.json: {}", e);
+        if !use_json {
+            eprintln!("warning: failed to write summary.json: {}", e);
+        }
     }
 
-    if args.json {
+    if use_json {
         println!(
             "{}",
             serde_json::to_string_pretty(&summary).unwrap_or_default()
diff --git a/src/cli/commands/eval/score.rs b/src/cli/commands/eval/score.rs
index 6afa3911..6d9c84d1 100644
--- a/src/cli/commands/eval/score.rs
+++ b/src/cli/commands/eval/score.rs
@@ -1,9 +1,11 @@
 //! Eval score subcommand - offline re-scoring from saved artifacts
 
+use crate::cli::commands::common::validate_format_args;
 use crate::cli::error::{CliError, CliResult};
 use clap::Args;
 use fastskill::eval::artifacts::{read_summary, write_summary, CaseStatus};
 use fastskill::eval::checks::load_checks;
+use fastskill::OutputFormat;
 use std::path::PathBuf;
 
 /// Arguments for `fastskill eval score`
@@ -17,8 +19,12 @@ pub struct ScoreArgs {
     #[arg(long, required = true)]
     pub run_dir: PathBuf,
 
-    /// Output as JSON
-    #[arg(long)]
+    /// Output format: table, json, grid, xml (default: table)
+    #[arg(long, value_enum, help = "Output format: table, json, grid, xml")]
+    pub format: Option<OutputFormat>,
+
+    /// Shorthand for --format json
+    #[arg(long, help = "Shorthand for --format json")]
     pub json: bool,
 
     /// Do not fail with non-zero exit code on suite failure
@@ -28,6 +34,9 @@ pub struct ScoreArgs {
 
 /// Execute the `eval score` command
 pub async fn execute_score(args: ScoreArgs) -> CliResult<()> {
+    let format = validate_format_args(&args.format, args.json)?;
+    let use_json = format == OutputFormat::Json;
+
     if !args.run_dir.exists() {
         return Err(CliError::Config(format!(
             "EVAL_ARTIFACTS_CORRUPT: Run directory does not exist: {}",
@@ -108,7 +117,7 @@ pub async fn execute_score(args: ScoreArgs) -> CliResult<()> {
     write_summary(&args.run_dir, &summary)
         .map_err(|e| CliError::Config(format!("Failed to write updated summary.json: {}", e)))?;
 
-    if args.json {
+    if use_json {
         println!(
             "{}",
             serde_json::to_string_pretty(&summary).unwrap_or_default()
diff --git a/src/cli/commands/eval/validate.rs b/src/cli/commands/eval/validate.rs
index 54e52933..3c553c45 100644
--- a/src/cli/commands/eval/validate.rs
+++ b/src/cli/commands/eval/validate.rs
@@ -1,10 +1,12 @@
 //! Eval validate subcommand - configuration and file validation
 
+use crate::cli::commands::common::validate_format_args;
 use crate::cli::error::{CliError, CliResult};
 use aikit_sdk::{is_agent_available, is_runnable, runnable_agents};
 use clap::Args;
 use fastskill::core::project::resolve_project_file;
 use fastskill::eval::config::resolve_eval_config;
+use fastskill::OutputFormat;
 use std::env;
 
 /// Arguments for `fastskill eval validate`
@@ -18,8 +20,12 @@ pub struct ValidateArgs {
     #[arg(long, value_parser = validate_agent_key_parser)]
     pub agent: Option<String>,
 
-    /// Output as JSON
-    #[arg(long)]
+    /// Output format: table, json, grid, xml (default: table)
+    #[arg(long, value_enum, help = "Output format: table, json, grid, xml")]
+    pub format: Option<OutputFormat>,
+
+    /// Shorthand for --format json
+    #[arg(long, help = "Shorthand for --format json")]
     pub json: bool,
 }
 
@@ -37,6 +43,9 @@ fn validate_agent_key_parser(s: &str) -> Result<String, String> {
 
 /// Execute the `eval validate` command
 pub async fn execute_validate(args: ValidateArgs) -> CliResult<()> {
+    let format = validate_format_args(&args.format, args.json)?;
+    let use_json = format == OutputFormat::Json;
+
     let current_dir = env::current_dir()
         .map_err(|e| CliError::Config(format!("Failed to get current directory: {}", e)))?;
 
@@ -75,7 +84,7 @@ pub async fn execute_validate(args: ValidateArgs) -> CliResult<()> {
         }
     }
 
-    if args.json {
+    if use_json {
         let output = serde_json::json!({
             "valid": true,
             "prompts_path": eval_config.prompts_path,
diff --git a/src/eval/runner.rs b/src/eval/runner.rs
index 24394f6c..236a611f 100644
--- a/src/eval/runner.rs
+++ b/src/eval/runner.rs
@@ -3,9 +3,10 @@
 use crate::eval::artifacts::{CaseResult, CaseStatus};
 use crate::eval::checks::{run_checks, CheckDefinition};
 use crate::eval::suite::EvalCase;
-use crate::eval::trace::{stdout_to_trace, trace_to_jsonl};
-use aikit_sdk::{run_agent, RunOptions};
+use crate::eval::trace::{agent_events_to_trace, trace_to_jsonl};
+use aikit_sdk::{run_agent_events, AgentEvent, RunOptions};
 use std::path::PathBuf;
+use std::time::Duration;
 use thiserror::Error;
 
 /// Options for running a single eval case
@@ -41,10 +42,10 @@ pub enum RunnerError {
     ExecutionFailed(String),
 }
 
-/// Run a single eval case using aikit_sdk::run_agent via spawn_blocking.
+/// Run a single eval case using aikit_sdk::run_agent_events via spawn_blocking.
 ///
 /// This is the sole agent execution path per spec (acceptance criterion 29).
-/// Uses spawn_blocking because run_agent is synchronous.
+/// Uses spawn_blocking because run_agent_events is synchronous.
 pub async fn run_eval_case(
     case: &EvalCase,
     opts: &CaseRunOptions,
@@ -53,6 +54,7 @@ pub async fn run_eval_case(
     let agent_key = opts.agent_key.clone();
     let model = opts.model.clone();
     let prompt = case.prompt.clone();
+    let timeout_secs = opts.timeout_seconds;
 
     // Determine working directory
     let working_dir = match &case.workspace_subdir {
@@ -61,42 +63,84 @@ pub async fn run_eval_case(
     };
 
     // Build RunOptions using available aikit-sdk API
-    let run_opts = RunOptions {
-        model: model.clone(),
-        yolo: true,
-        stream: false,
-    };
+    let run_opts = RunOptions::new()
+        .with_model(model.clone().unwrap_or_default())
+        .with_yolo(true)
+        .with_stream(true);
 
     // Run agent in a blocking thread to avoid blocking the async runtime
-    let result =
-        tokio::task::spawn_blocking(move || run_agent(&agent_key, &prompt, run_opts)).await;
+    let timeout_duration = Duration::from_secs(timeout_secs);
+    let working_dir_clone = working_dir.clone();
+
+    let result = tokio::task::spawn_blocking(move || {
+        // Change to working directory before spawning the agent
+        let original_dir = std::env::current_dir().ok();
+        let dir_changed = std::env::set_current_dir(&working_dir_clone).is_ok();
+
+        // Collect events during execution
+        let mut events: Vec<AgentEvent> = Vec::new();
+
+        let result = run_agent_events(&agent_key, &prompt, run_opts, |ev| {
+            events.push(ev.clone());
+        });
+
+        // Restore original directory if we changed it
+        if dir_changed {
+            if let Some(orig) = original_dir {
+                let _ = std::env::set_current_dir(&orig);
+            }
+        }
+
+        (result, events)
+    });
 
-    let run_output = match result {
-        Ok(Ok(run_result)) => {
+    // Apply timeout wrapper
+    let timed_result = tokio::time::timeout(timeout_duration, result).await;
+
+    let (run_output, trace_events) = match timed_result {
+        Ok(Ok((Ok(run_result), events))) => {
             let exit_code = run_result.exit_code();
-            CaseRunOutput {
+            let output = CaseRunOutput {
                 stdout: run_result.stdout,
                 stderr: run_result.stderr,
                 exit_code,
                 timed_out: false,
-            }
+            };
+            let trace = agent_events_to_trace(&events);
+            (output, trace)
+        }
+        Ok(Ok((Err(e), events))) => {
+            let trace = agent_events_to_trace(&events);
+            let output = CaseRunOutput {
+                stdout: vec![],
+                stderr: format!("Agent execution failed: {}", e).into_bytes(),
+                exit_code: None,
+                timed_out: false,
+            };
+            (output, trace)
+        }
+        Ok(Err(e)) => {
+            let output = CaseRunOutput {
+                stdout: vec![],
+                stderr: format!("spawn_blocking failed: {}", e).into_bytes(),
+                exit_code: None,
+                timed_out: false,
+            };
+            (output, vec![])
+        }
+        Err(_) => {
+            // Timeout elapsed
+            let output = CaseRunOutput {
+                stdout: vec![],
+                stderr: format!("Case timed out after {}s", timeout_secs).into_bytes(),
+                exit_code: None,
+                timed_out: true,
+            };
+            (output, vec![])
         }
-        Ok(Err(e)) => CaseRunOutput {
-            stdout: vec![],
-            stderr: format!("Agent execution failed: {}", e).into_bytes(),
-            exit_code: None,
-            timed_out: false,
-        },
-        Err(e) => CaseRunOutput {
-            stdout: vec![],
-            stderr: format!("spawn_blocking failed: {}", e).into_bytes(),
-            exit_code: None,
-            timed_out: false,
-        },
     };
 
-    // Build trace from stdout
-    let trace_events = stdout_to_trace(&run_output.stdout);
+    // Build trace from collected events
     let trace_jsonl = trace_to_jsonl(&trace_events);
     let stdout_str = String::from_utf8_lossy(&run_output.stdout).to_string();
 
diff --git a/src/eval/trace.rs b/src/eval/trace.rs
index ef67d6b0..e028b1f8 100644
--- a/src/eval/trace.rs
+++ b/src/eval/trace.rs
@@ -1,5 +1,6 @@
 //! Trace event types for eval case execution
 
+use aikit_sdk::{AgentEvent, AgentEventPayload};
 use serde::{Deserialize, Serialize};
 
 /// A single line in a trace.jsonl file
@@ -27,6 +28,31 @@ pub enum TracePayload {
     Timeout,
 }
 
+/// Convert aikit-sdk AgentEvent to internal TraceEvent
+pub fn agent_events_to_trace(events: &[AgentEvent]) -> Vec<TraceEvent> {
+    events
+        .iter()
+        .map(|ev| {
+            let payload = match &ev.payload {
+                AgentEventPayload::JsonLine(value) => TracePayload::RawJson {
+                    data: value.clone(),
+                },
+                AgentEventPayload::RawLine(line) => TracePayload::RawLine { line: line.clone() },
+                AgentEventPayload::RawBytes(bytes) => {
+                    use base64::{engine::general_purpose::STANDARD, Engine as _};
+                    TracePayload::RawBytes {
+                        b64: STANDARD.encode(bytes),
+                    }
+                }
+            };
+            TraceEvent {
+                seq: ev.seq as usize,
+                payload,
+            }
+        })
+        .collect()
+}
+
 /// Convert raw stdout lines to trace events
 pub fn stdout_to_trace(stdout: &[u8]) -> Vec<TraceEvent> {
     let text = String::from_utf8_lossy(stdout);
diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_report_help.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_report_help.snap
index ba01eee7..3afaf53a 100644
--- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_report_help.snap
+++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_report_help.snap
@@ -8,7 +8,8 @@ Usage: fastskill eval report [OPTIONS] --run-dir <RUN_DIR>
 
 Options:
       --run-dir <RUN_DIR>  Path to the specific run directory
-      --json               Output as JSON
+      --format <FORMAT>    Output format: table, json, grid, xml
+      --json               Shorthand for --format json
   -h, --help               Print help
 
 Examples:
diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_run_help.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_run_help.snap
index ef5354c5..4bf17752 100644
--- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_run_help.snap
+++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_run_help.snap
@@ -12,7 +12,8 @@ Options:
       --model <MODEL>            Optional model override forwarded to the agent
       --case <CASE>              Filter: run only the case with this ID
       --tag <TAG>                Filter: run only cases with this tag
-      --json                     Output as JSON
+      --format <FORMAT>          Output format: table, json, grid, xml
+      --json                     Shorthand for --format json
       --no-fail                  Do not fail with non-zero exit code on suite failure
   -h, --help                     Print help
 
diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_score_help.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_score_help.snap
index 101a2950..252d9195 100644
--- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_score_help.snap
+++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_score_help.snap
@@ -8,7 +8,8 @@ Usage: fastskill eval score [OPTIONS] --run-dir <RUN_DIR>
 
 Options:
       --run-dir <RUN_DIR>  Path to the run directory to re-score
-      --json               Output as JSON
+      --format <FORMAT>    Output format: table, json, grid, xml
+      --json               Shorthand for --format json
       --no-fail            Do not fail with non-zero exit code on suite failure
   -h, --help               Print help
 
diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_validate_help.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_validate_help.snap
index b6dbac3c..9f0f8903 100644
--- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_validate_help.snap
+++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_validate_help.snap
@@ -7,9 +7,10 @@ Validate eval configuration and files
 Usage: fastskill eval validate [OPTIONS]
 
 Options:
-      --agent <AGENT>  Check agent availability for the specified agent key
-      --json           Output as JSON
-  -h, --help           Print help
+      --agent <AGENT>    Check agent availability for the specified agent key
+      --format <FORMAT>  Output format: table, json, grid, xml
+      --json             Shorthand for --format json
+  -h, --help             Print help
 
 Examples:
   fastskill eval validate

From a64a1676ee101a83d219e88b25ec05ff77871e61 Mon Sep 17 00:00:00 2001
From: System Two <alexandre@55seeds.com>
Date: Sun, 5 Apr 2026 18:45:33 +0000
Subject: [PATCH 3/4] chore(develop): 034-eval

---
 src/eval/runner.rs | 78 ++++++++++++++++++++++++++++------------------
 1 file changed, 48 insertions(+), 30 deletions(-)

diff --git a/src/eval/runner.rs b/src/eval/runner.rs
index 236a611f..af13915f 100644
--- a/src/eval/runner.rs
+++ b/src/eval/runner.rs
@@ -3,7 +3,7 @@
 use crate::eval::artifacts::{CaseResult, CaseStatus};
 use crate::eval::checks::{run_checks, CheckDefinition};
 use crate::eval::suite::EvalCase;
-use crate::eval::trace::{agent_events_to_trace, trace_to_jsonl};
+use crate::eval::trace::{agent_events_to_trace, trace_to_jsonl, TraceEvent, TracePayload};
 use aikit_sdk::{run_agent_events, AgentEvent, RunOptions};
 use std::path::PathBuf;
 use std::time::Duration;
@@ -42,6 +42,12 @@ pub enum RunnerError {
     ExecutionFailed(String),
 }
 
+/// Result of agent execution within spawn_blocking
+struct AgentExecutionResult {
+    result: Result<aikit_sdk::RunResult, aikit_sdk::RunError>,
+    events: Vec<AgentEvent>,
+}
+
 /// Run a single eval case using aikit_sdk::run_agent_events via spawn_blocking.
 ///
 /// This is the sole agent execution path per spec (acceptance criterion 29).
@@ -62,18 +68,18 @@ pub async fn run_eval_case(
         None => opts.project_root.clone(),
     };
 
-    // Build RunOptions using available aikit-sdk API
+    // Build RunOptions - note: aikit-sdk doesn't have with_timeout/with_current_dir yet
     let run_opts = RunOptions::new()
         .with_model(model.clone().unwrap_or_default())
         .with_yolo(true)
         .with_stream(true);
 
-    // Run agent in a blocking thread to avoid blocking the async runtime
-    let timeout_duration = Duration::from_secs(timeout_secs);
     let working_dir_clone = working_dir.clone();
 
-    let result = tokio::task::spawn_blocking(move || {
-        // Change to working directory before spawning the agent
+    // Run agent in a blocking thread to avoid blocking the async runtime
+    // We implement timeout ourselves using tokio::time::timeout
+    let spawn_result = tokio::task::spawn_blocking(move || {
+        // Change to working directory before spawning agent
         let original_dir = std::env::current_dir().ok();
         let dir_changed = std::env::set_current_dir(&working_dir_clone).is_ok();
 
@@ -91,35 +97,42 @@ pub async fn run_eval_case(
             }
         }
 
-        (result, events)
+        AgentExecutionResult { result, events }
     });
 
     // Apply timeout wrapper
-    let timed_result = tokio::time::timeout(timeout_duration, result).await;
+    let timeout_duration = Duration::from_secs(timeout_secs);
+    let timed_result = tokio::time::timeout(timeout_duration, spawn_result).await;
 
     let (run_output, trace_events) = match timed_result {
-        Ok(Ok((Ok(run_result), events))) => {
-            let exit_code = run_result.exit_code();
-            let output = CaseRunOutput {
-                stdout: run_result.stdout,
-                stderr: run_result.stderr,
-                exit_code,
-                timed_out: false,
-            };
-            let trace = agent_events_to_trace(&events);
-            (output, trace)
-        }
-        Ok(Ok((Err(e), events))) => {
-            let trace = agent_events_to_trace(&events);
-            let output = CaseRunOutput {
-                stdout: vec![],
-                stderr: format!("Agent execution failed: {}", e).into_bytes(),
-                exit_code: None,
-                timed_out: false,
-            };
-            (output, trace)
+        Ok(Ok(exec_result)) => {
+            // spawn_blocking succeeded, check agent result
+            match exec_result.result {
+                Ok(run_result) => {
+                    let exit_code = run_result.exit_code();
+                    let output = CaseRunOutput {
+                        stdout: run_result.stdout,
+                        stderr: run_result.stderr,
+                        exit_code,
+                        timed_out: false,
+                    };
+                    let trace = agent_events_to_trace(&exec_result.events);
+                    (output, trace)
+                }
+                Err(e) => {
+                    let trace = agent_events_to_trace(&exec_result.events);
+                    let output = CaseRunOutput {
+                        stdout: vec![],
+                        stderr: format!("Agent execution failed: {}", e).into_bytes(),
+                        exit_code: None,
+                        timed_out: false,
+                    };
+                    (output, trace)
+                }
+            }
         }
         Ok(Err(e)) => {
+            // spawn_blocking itself failed
             let output = CaseRunOutput {
                 stdout: vec![],
                 stderr: format!("spawn_blocking failed: {}", e).into_bytes(),
@@ -136,7 +149,12 @@ pub async fn run_eval_case(
                 exit_code: None,
                 timed_out: true,
             };
-            (output, vec![])
+            // Add a timeout marker to the trace
+            let timeout_event = TraceEvent {
+                seq: 0,
+                payload: TracePayload::Timeout,
+            };
+            (output, vec![timeout_event])
         }
     };
 
@@ -147,7 +165,7 @@ pub async fn run_eval_case(
     // Count raw_json events
     let command_count = trace_events
         .iter()
-        .filter(|e| matches!(&e.payload, crate::eval::trace::TracePayload::RawJson { .. }))
+        .filter(|e| matches!(&e.payload, TracePayload::RawJson { .. }))
         .count();
 
     // Run deterministic checks

From 4146607948bf46d3c012d5c8fc1f4d37606fe804 Mon Sep 17 00:00:00 2001
From: System Two <alexandre@55seeds.com>
Date: Mon, 6 Apr 2026 12:36:14 +0000
Subject: [PATCH 4/4] feat(eval): EvalRunner trait, SDK timeout/cwd, trace
 parity

---
 Cargo.lock                   |  22 +--
 src/cli/commands/eval/run.rs |  20 ++-
 src/cli/commands/package.rs  |   3 +
 src/core/packaging.rs        |   4 +-
 src/eval/checks.rs           |  35 ++--
 src/eval/mod.rs              |   4 +
 src/eval/runner.rs           | 317 ++++++++++++++++++++++-------------
 tests/cli/eval_tests.rs      |  88 ++++++++++
 8 files changed, 342 insertions(+), 151 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 2624d3a2..2ad1ecd6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -43,7 +43,7 @@ dependencies = [
 [[package]]
 name = "aikit-sdk"
 version = "0.1.50"
-source = "git+https://github.com/goaikit/aikit?branch=main#524f98f1db7b3facbfb4c0d49267b7dc067ada9b"
+source = "git+https://github.com/goaikit/aikit?branch=main#7380ef9c33c7915d32679da98a818cf7116967fb"
 dependencies = [
  "glob",
  "reqwest 0.12.28",
@@ -1316,7 +1316,7 @@ dependencies = [
  "libc",
  "option-ext",
  "redox_users",
- "windows-sys 0.59.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -1417,7 +1417,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
 dependencies = [
  "libc",
- "windows-sys 0.59.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -2082,7 +2082,7 @@ dependencies = [
  "libc",
  "percent-encoding",
  "pin-project-lite",
- "socket2 0.5.10",
+ "socket2 0.6.2",
  "tokio",
  "tower-service",
  "tracing",
@@ -2651,7 +2651,7 @@ version = "0.50.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5"
 dependencies = [
- "windows-sys 0.59.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -2959,7 +2959,7 @@ dependencies = [
  "quinn-udp",
  "rustc-hash",
  "rustls 0.23.36",
- "socket2 0.5.10",
+ "socket2 0.6.2",
  "thiserror 2.0.18",
  "tokio",
  "tracing",
@@ -2996,9 +2996,9 @@ dependencies = [
  "cfg_aliases",
  "libc",
  "once_cell",
- "socket2 0.5.10",
+ "socket2 0.6.2",
  "tracing",
- "windows-sys 0.59.0",
+ "windows-sys 0.60.2",
 ]
 
 [[package]]
@@ -3344,7 +3344,7 @@ dependencies = [
  "errno",
  "libc",
  "linux-raw-sys",
- "windows-sys 0.59.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -3857,7 +3857,7 @@ dependencies = [
  "getrandom 0.3.4",
  "once_cell",
  "rustix",
- "windows-sys 0.59.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -4538,7 +4538,7 @@ version = "0.1.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
 dependencies = [
- "windows-sys 0.48.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
diff --git a/src/cli/commands/eval/run.rs b/src/cli/commands/eval/run.rs
index 58621a80..8ce7fd2e 100644
--- a/src/cli/commands/eval/run.rs
+++ b/src/cli/commands/eval/run.rs
@@ -11,9 +11,8 @@ use fastskill::eval::artifacts::{
 };
 use fastskill::eval::checks::load_checks;
 use fastskill::eval::config::resolve_eval_config;
-use fastskill::eval::runner::{run_eval_case, CaseRunOptions};
+use fastskill::eval::runner::{AikitEvalRunner, CaseRunOptions, EvalRunner};
 use fastskill::eval::suite::load_suite;
-use fastskill::eval::trace::trace_to_jsonl;
 use fastskill::OutputFormat;
 use std::env;
 use std::path::PathBuf;
@@ -70,8 +69,16 @@ fn validate_agent_key_for_run(s: &str) -> Result<String, String> {
     }
 }
 
-/// Execute the `eval run` command
+/// Execute the `eval run` command using the default aikit-backed runner.
 pub async fn execute_run(args: RunArgs) -> CliResult<()> {
+    execute_run_with_runner(args, &AikitEvalRunner).await
+}
+
+/// Execute `eval run` with an injectable [`EvalRunner`] (tests or future adapters).
+pub async fn execute_run_with_runner<R: EvalRunner + ?Sized>(
+    args: RunArgs,
+    runner: &R,
+) -> CliResult<()> {
     let format = validate_format_args(&args.format, args.json)?;
     let use_json = format == OutputFormat::Json;
 
@@ -169,11 +176,8 @@ pub async fn execute_run(args: RunArgs) -> CliResult<()> {
             eprintln!("  Running case '{}'...", case.id);
         }
 
-        let (run_output, case_result) = run_eval_case(case, &run_opts, &checks).await;
-
-        // Build trace from stdout (since runner collects it)
-        let trace_events = fastskill::eval::trace::stdout_to_trace(&run_output.stdout);
-        let trace_jsonl = trace_to_jsonl(&trace_events);
+        let (run_output, case_result, trace_jsonl) =
+            runner.run_case(case, &run_opts, &checks).await;
 
         // Write artifacts
         if let Err(e) = write_case_artifacts(
diff --git a/src/cli/commands/package.rs b/src/cli/commands/package.rs
index eb58aaaa..21582f0a 100644
--- a/src/cli/commands/package.rs
+++ b/src/cli/commands/package.rs
@@ -48,6 +48,9 @@ pub enum PackagePreset {
 ///   fastskill package --git-diff base head  # Git-based change detection
 ///   fastskill package --skills id1 id2  # Package specific skills
 ///   fastskill package --force           # Package all skills
+///
+/// Note: the `evals/` directory is omitted from packaged skill ZIPs; exclusion is enforced in
+/// `fastskill::core::packaging` when building archives, not in this CLI module.
 #[derive(Debug, Args)]
 pub struct PackageArgs {
     /// Package preset command
diff --git a/src/core/packaging.rs b/src/core/packaging.rs
index 4c1ea9b8..976f79bb 100644
--- a/src/core/packaging.rs
+++ b/src/core/packaging.rs
@@ -19,7 +19,9 @@ pub fn package_skill(
     package_skill_with_id(skill_path, output_dir, version, None)
 }
 
-/// Package a skill directory into a ZIP file with explicit skill ID
+/// Package a skill directory into a ZIP file with explicit skill ID.
+///
+/// The `evals/` subtree is excluded from the archive (local eval suites are not published).
 pub fn package_skill_with_id(
     skill_path: &Path,
     output_dir: &Path,
diff --git a/src/eval/checks.rs b/src/eval/checks.rs
index a7d94e5e..47d460b5 100644
--- a/src/eval/checks.rs
+++ b/src/eval/checks.rs
@@ -4,6 +4,8 @@ use serde::{Deserialize, Serialize};
 use std::path::PathBuf;
 use thiserror::Error;
 
+use crate::eval::trace::{TraceEvent, TracePayload};
+
 /// A deterministic check definition loaded from checks.toml
 #[derive(Debug, Clone, Serialize, Deserialize)]
 #[serde(tag = "name")]
@@ -108,6 +110,15 @@ pub fn run_checks(
         .collect()
 }
 
+/// Count trace events with payload type `raw_json`.
+pub fn count_raw_json_events(trace_jsonl: &str) -> usize {
+    trace_jsonl
+        .lines()
+        .filter_map(|line| serde_json::from_str::<TraceEvent>(line).ok())
+        .filter(|event| matches!(event.payload, TracePayload::RawJson { .. }))
+        .count()
+}
+
 fn run_single_check(
     check: &CheckDefinition,
     stdout_content: &str,
@@ -165,10 +176,7 @@ fn run_single_check(
         }
         CheckDefinition::MaxCommandCount { limit, .. } => {
             // Count raw_json trace lines
-            let count = trace_jsonl
-                .lines()
-                .filter(|line| line.contains("\"raw_json\""))
-                .count();
+            let count = count_raw_json_events(trace_jsonl);
             let passed = count <= *limit;
             let message = if passed {
                 None
@@ -259,9 +267,9 @@ mod tests {
             limit: 5,
             required: true,
         };
-        let trace = r#"{"type":"raw_json","seq":0}
-{"type":"raw_json","seq":1}
-{"type":"raw_line","seq":2}"#;
+        let trace = r#"{"seq":0,"payload":{"type":"raw_json","data":{"cmd":"a"}}}
+{"seq":1,"payload":{"type":"raw_json","data":{"cmd":"b"}}}
+{"seq":2,"payload":{"type":"raw_line","line":"ok"}}"#;
         let results = run_checks(&[check], "", trace, Path::new("/tmp"));
         assert!(results[0].passed);
     }
@@ -272,13 +280,20 @@ mod tests {
             limit: 1,
             required: true,
         };
-        let trace = r#"{"type":"raw_json","seq":0}
-{"type":"raw_json","seq":1}
-{"type":"raw_json","seq":2}"#;
+        let trace = r#"{"seq":0,"payload":{"type":"raw_json","data":{"cmd":"a"}}}
+{"seq":1,"payload":{"type":"raw_json","data":{"cmd":"b"}}}
+{"seq":2,"payload":{"type":"raw_json","data":{"cmd":"c"}}}"#;
         let results = run_checks(&[check], "", trace, Path::new("/tmp"));
         assert!(!results[0].passed);
     }
 
+    #[test]
+    fn test_count_raw_json_events_ignores_substring_only() {
+        let trace = r#"{"seq":0,"payload":{"type":"raw_line","line":"mentions raw_json text"}}
+{"seq":1,"payload":{"type":"raw_json","data":{"cmd":"x"}}}"#;
+        assert_eq!(count_raw_json_events(trace), 1);
+    }
+
     #[test]
     fn test_suite_passes_all_passed() {
         let results = vec![
diff --git a/src/eval/mod.rs b/src/eval/mod.rs
index bedfcb3c..33ac78aa 100644
--- a/src/eval/mod.rs
+++ b/src/eval/mod.rs
@@ -6,3 +6,7 @@ pub mod config;
 pub mod runner;
 pub mod suite;
 pub mod trace;
+
+pub use runner::{
+    run_eval_case, AikitEvalRunner, CaseRunOptions, CaseRunOutput, EvalRunner, RunnerError,
+};
diff --git a/src/eval/runner.rs b/src/eval/runner.rs
index af13915f..c5495c41 100644
--- a/src/eval/runner.rs
+++ b/src/eval/runner.rs
@@ -1,10 +1,11 @@
 //! Eval runner implementation using aikit-sdk
 
 use crate::eval::artifacts::{CaseResult, CaseStatus};
-use crate::eval::checks::{run_checks, CheckDefinition};
+use crate::eval::checks::{count_raw_json_events, run_checks, CheckDefinition};
 use crate::eval::suite::EvalCase;
 use crate::eval::trace::{agent_events_to_trace, trace_to_jsonl, TraceEvent, TracePayload};
 use aikit_sdk::{run_agent_events, AgentEvent, RunOptions};
+use async_trait::async_trait;
 use std::path::PathBuf;
 use std::time::Duration;
 use thiserror::Error;
@@ -42,72 +43,78 @@ pub enum RunnerError {
     ExecutionFailed(String),
 }
 
+/// Abstraction over eval case execution (default: aikit-backed).
+#[async_trait]
+pub trait EvalRunner: Send + Sync {
+    /// Run one case, produce stdout/stderr capture, scored result, and canonical trace JSONL.
+    async fn run_case(
+        &self,
+        case: &EvalCase,
+        opts: &CaseRunOptions,
+        checks: &[CheckDefinition],
+    ) -> (CaseRunOutput, CaseResult, String);
+}
+
+/// Default runner: `aikit_sdk::run_agent_events` inside `spawn_blocking` with SDK timeout/cwd.
+#[derive(Debug, Clone, Copy, Default)]
+pub struct AikitEvalRunner;
+
 /// Result of agent execution within spawn_blocking
 struct AgentExecutionResult {
     result: Result<aikit_sdk::RunResult, aikit_sdk::RunError>,
     events: Vec<AgentEvent>,
 }
 
-/// Run a single eval case using aikit_sdk::run_agent_events via spawn_blocking.
-///
-/// This is the sole agent execution path per spec (acceptance criterion 29).
-/// Uses spawn_blocking because run_agent_events is synchronous.
-pub async fn run_eval_case(
-    case: &EvalCase,
-    opts: &CaseRunOptions,
-    checks: &[CheckDefinition],
-) -> (CaseRunOutput, CaseResult) {
-    let agent_key = opts.agent_key.clone();
-    let model = opts.model.clone();
-    let prompt = case.prompt.clone();
-    let timeout_secs = opts.timeout_seconds;
-
-    // Determine working directory
-    let working_dir = match &case.workspace_subdir {
-        Some(subdir) => opts.project_root.join(subdir),
-        None => opts.project_root.clone(),
-    };
-
-    // Build RunOptions - note: aikit-sdk doesn't have with_timeout/with_current_dir yet
-    let run_opts = RunOptions::new()
-        .with_model(model.clone().unwrap_or_default())
-        .with_yolo(true)
-        .with_stream(true);
-
-    let working_dir_clone = working_dir.clone();
-
-    // Run agent in a blocking thread to avoid blocking the async runtime
-    // We implement timeout ourselves using tokio::time::timeout
-    let spawn_result = tokio::task::spawn_blocking(move || {
-        // Change to working directory before spawning agent
-        let original_dir = std::env::current_dir().ok();
-        let dir_changed = std::env::set_current_dir(&working_dir_clone).is_ok();
-
-        // Collect events during execution
-        let mut events: Vec<AgentEvent> = Vec::new();
-
-        let result = run_agent_events(&agent_key, &prompt, run_opts, |ev| {
-            events.push(ev.clone());
-        });
+#[async_trait]
+impl EvalRunner for AikitEvalRunner {
+    async fn run_case(
+        &self,
+        case: &EvalCase,
+        opts: &CaseRunOptions,
+        checks: &[CheckDefinition],
+    ) -> (CaseRunOutput, CaseResult, String) {
+        self.run_case_inner(case, opts, checks).await
+    }
+}
+
+impl AikitEvalRunner {
+    async fn run_case_inner(
+        &self,
+        case: &EvalCase,
+        opts: &CaseRunOptions,
+        checks: &[CheckDefinition],
+    ) -> (CaseRunOutput, CaseResult, String) {
+        let agent_key = opts.agent_key.clone();
+        let model = opts.model.clone();
+        let prompt = case.prompt.clone();
+        let timeout_secs = opts.timeout_seconds;
 
-        // Restore original directory if we changed it
-        if dir_changed {
-            if let Some(orig) = original_dir {
-                let _ = std::env::set_current_dir(&orig);
+        let working_dir = match &case.workspace_subdir {
+            Some(subdir) => opts.project_root.join(subdir),
+            None => opts.project_root.clone(),
+        };
+
+        let mut run_opts = RunOptions::new()
+            .with_yolo(true)
+            .with_stream(true)
+            .with_timeout(Duration::from_secs(timeout_secs))
+            .with_current_dir(working_dir.clone());
+        if let Some(model_name) = model {
+            if !model_name.trim().is_empty() {
+                run_opts = run_opts.with_model(model_name);
             }
         }
 
-        AgentExecutionResult { result, events }
-    });
-
-    // Apply timeout wrapper
-    let timeout_duration = Duration::from_secs(timeout_secs);
-    let timed_result = tokio::time::timeout(timeout_duration, spawn_result).await;
+        let spawn_result = tokio::task::spawn_blocking(move || {
+            let mut events: Vec<AgentEvent> = Vec::new();
+            let result = run_agent_events(&agent_key, &prompt, run_opts, |ev| {
+                events.push(ev.clone());
+            });
+            AgentExecutionResult { result, events }
+        });
 
-    let (run_output, trace_events) = match timed_result {
-        Ok(Ok(exec_result)) => {
-            // spawn_blocking succeeded, check agent result
-            match exec_result.result {
+        let (run_output, trace_events) = match spawn_result.await {
+            Ok(exec_result) => match exec_result.result {
                 Ok(run_result) => {
                     let exit_code = run_result.exit_code();
                     let output = CaseRunOutput {
@@ -119,6 +126,33 @@ pub async fn run_eval_case(
                     let trace = agent_events_to_trace(&exec_result.events);
                     (output, trace)
                 }
+                Err(aikit_sdk::RunError::TimedOut {
+                    timeout, stderr, ..
+                }) => {
+                    let mut trace = agent_events_to_trace(&exec_result.events);
+                    trace.push(TraceEvent {
+                        seq: trace.len(),
+                        payload: TracePayload::Timeout,
+                    });
+                    let output = CaseRunOutput {
+                        stdout: vec![],
+                        stderr,
+                        exit_code: None,
+                        timed_out: true,
+                    };
+                    if output.stderr.is_empty() {
+                        let fallback = format!("Case timed out after {}s", timeout.as_secs());
+                        let output = CaseRunOutput {
+                            stdout: vec![],
+                            stderr: fallback.into_bytes(),
+                            exit_code: None,
+                            timed_out: true,
+                        };
+                        (output, trace)
+                    } else {
+                        (output, trace)
+                    }
+                }
                 Err(e) => {
                     let trace = agent_events_to_trace(&exec_result.events);
                     let output = CaseRunOutput {
@@ -129,80 +163,121 @@ pub async fn run_eval_case(
                     };
                     (output, trace)
                 }
+            },
+            Err(e) => {
+                let output = CaseRunOutput {
+                    stdout: vec![],
+                    stderr: format!("spawn_blocking failed: {}", e).into_bytes(),
+                    exit_code: None,
+                    timed_out: false,
+                };
+                (output, vec![])
             }
-        }
-        Ok(Err(e)) => {
-            // spawn_blocking itself failed
-            let output = CaseRunOutput {
-                stdout: vec![],
-                stderr: format!("spawn_blocking failed: {}", e).into_bytes(),
-                exit_code: None,
-                timed_out: false,
-            };
-            (output, vec![])
-        }
-        Err(_) => {
-            // Timeout elapsed
-            let output = CaseRunOutput {
-                stdout: vec![],
-                stderr: format!("Case timed out after {}s", timeout_secs).into_bytes(),
-                exit_code: None,
-                timed_out: true,
-            };
-            // Add a timeout marker to the trace
-            let timeout_event = TraceEvent {
-                seq: 0,
-                payload: TracePayload::Timeout,
-            };
-            (output, vec![timeout_event])
-        }
-    };
-
-    // Build trace from collected events
-    let trace_jsonl = trace_to_jsonl(&trace_events);
-    let stdout_str = String::from_utf8_lossy(&run_output.stdout).to_string();
-
-    // Count raw_json events
-    let command_count = trace_events
-        .iter()
-        .filter(|e| matches!(&e.payload, TracePayload::RawJson { .. }))
-        .count();
-
-    // Run deterministic checks
-    let check_results = run_checks(checks, &stdout_str, &trace_jsonl, &working_dir);
-    let all_passed = check_results.iter().all(|r| r.passed);
-
-    let status = if run_output.timed_out {
-        CaseStatus::Error
-    } else if checks.is_empty() {
-        // No checks defined: pass if exit code is 0
-        if run_output.exit_code == Some(0) {
+        };
+
+        let trace_jsonl = trace_to_jsonl(&trace_events);
+        let stdout_str = String::from_utf8_lossy(&run_output.stdout).to_string();
+        let command_count = count_raw_json_events(&trace_jsonl);
+        let check_results = run_checks(checks, &stdout_str, &trace_jsonl, &working_dir);
+        let all_passed = check_results.iter().all(|r| r.passed);
+
+        let status = if run_output.timed_out {
+            CaseStatus::Error
+        } else if checks.is_empty() {
+            if run_output.exit_code == Some(0) {
+                CaseStatus::Passed
+            } else {
+                CaseStatus::Failed
+            }
+        } else if all_passed {
             CaseStatus::Passed
         } else {
             CaseStatus::Failed
-        }
-    } else if all_passed {
-        CaseStatus::Passed
-    } else {
-        CaseStatus::Failed
-    };
-
-    let case_result = CaseResult {
-        id: case.id.clone(),
-        status,
-        command_count: Some(command_count),
-        input_tokens: None,
-        output_tokens: None,
-        check_results,
-        error_message: None,
-    };
-
-    (run_output, case_result)
+        };
+
+        let case_result = CaseResult {
+            id: case.id.clone(),
+            status,
+            command_count: Some(command_count),
+            input_tokens: None,
+            output_tokens: None,
+            check_results,
+            error_message: None,
+        };
+
+        (run_output, case_result, trace_jsonl)
+    }
+}
+
+/// Run a single eval case using the default [`AikitEvalRunner`].
+///
+/// Sole agent execution path per spec (acceptance criterion 29).
+pub async fn run_eval_case(
+    case: &EvalCase,
+    opts: &CaseRunOptions,
+    checks: &[CheckDefinition],
+) -> (CaseRunOutput, CaseResult, String) {
+    AikitEvalRunner.run_case(case, opts, checks).await
 }
 
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::eval::artifacts::CaseStatus;
+
+    /// Stub runner for trait wiring tests (no aikit).
+    struct StubEvalRunner;
+
+    #[async_trait]
+    impl EvalRunner for StubEvalRunner {
+        async fn run_case(
+            &self,
+            case: &EvalCase,
+            _opts: &CaseRunOptions,
+            _checks: &[CheckDefinition],
+        ) -> (CaseRunOutput, CaseResult, String) {
+            let trace_jsonl =
+                r#"{"seq":0,"payload":{"type":"raw_line","line":"stub"}}"#.to_string();
+            let out = CaseRunOutput {
+                stdout: b"ok".to_vec(),
+                stderr: vec![],
+                exit_code: Some(0),
+                timed_out: false,
+            };
+            let result = CaseResult {
+                id: case.id.clone(),
+                status: CaseStatus::Passed,
+                command_count: Some(0),
+                input_tokens: None,
+                output_tokens: None,
+                check_results: vec![],
+                error_message: None,
+            };
+            (out, result, trace_jsonl)
+        }
+    }
+
+    #[tokio::test]
+    async fn test_eval_runner_trait_stub_returns_expected_trace() {
+        let case = EvalCase {
+            id: "c1".to_string(),
+            prompt: "p".to_string(),
+            should_trigger: true,
+            tags: vec![],
+            workspace_subdir: None,
+        };
+        let opts = CaseRunOptions {
+            agent_key: "agent".to_string(),
+            model: None,
+            project_root: PathBuf::from("/tmp"),
+            timeout_seconds: 1,
+        };
+        let runner = StubEvalRunner;
+        let (out, res, trace) = runner.run_case(&case, &opts, &[]).await;
+        assert_eq!(out.exit_code, Some(0));
+        assert_eq!(res.id, "c1");
+        assert!(trace.contains("raw_line"));
+    }
 
     #[test]
     fn test_case_run_options_builder() {
diff --git a/tests/cli/eval_tests.rs b/tests/cli/eval_tests.rs
index 26241a82..f98a08f6 100644
--- a/tests/cli/eval_tests.rs
+++ b/tests/cli/eval_tests.rs
@@ -4,6 +4,7 @@
 
 use super::snapshot_helpers::{
     assert_snapshot_with_settings, cli_snapshot_settings, run_fastskill_command,
+    run_fastskill_command_with_env,
 };
 
 #[test]
@@ -211,3 +212,90 @@ fn test_eval_report_nonexistent_run_dir() {
         combined
     );
 }
+
+#[test]
+fn test_eval_run_persists_event_trace_jsonl() {
+    use serde_json::Value;
+    use std::env;
+    use std::fs;
+    use tempfile::TempDir;
+
+    let dir = TempDir::new().unwrap();
+    let evals_dir = dir.path().join("evals");
+    fs::create_dir_all(&evals_dir).unwrap();
+    fs::write(
+        evals_dir.join("prompts.csv"),
+        "id,prompt,should_trigger,tags,workspace_subdir\ntrace-case,\"test prompt\",true,\"basic\",\n",
+    )
+    .unwrap();
+    fs::write(dir.path().join("SKILL.md"), "# Test Skill\n").unwrap();
+    fs::write(
+        dir.path().join("skill-project.toml"),
+        "[metadata]\nid = \"test-skill\"\n\n[tool.fastskill.eval]\nprompts = \"evals/prompts.csv\"\ntimeout_seconds = 30\nfail_on_missing_agent = true\n",
+    )
+    .unwrap();
+
+    // Create a fake `agent` executable so aikit_sdk::is_agent_available("agent") succeeds.
+    let bin_dir = dir.path().join("bin");
+    fs::create_dir_all(&bin_dir).unwrap();
+    let agent_path = bin_dir.join("agent");
+    fs::write(
+        &agent_path,
+        "#!/usr/bin/env bash\nif [[ \"$1\" == \"--version\" ]]; then echo \"agent 0.1\"; exit 0; fi\necho '{\"event\":\"ok\"}'\nexit 0\n",
+    )
+    .unwrap();
+    #[cfg(unix)]
+    {
+        use std::os::unix::fs::PermissionsExt;
+        let mut perms = fs::metadata(&agent_path).unwrap().permissions();
+        perms.set_mode(0o755);
+        fs::set_permissions(&agent_path, perms).unwrap();
+    }
+
+    let output_dir = dir.path().join("out");
+    let path = env::var("PATH").unwrap_or_default();
+    let merged_path = format!("{}:{}", bin_dir.display(), path);
+    let env_vars = vec![("PATH", merged_path.as_str())];
+
+    let result = run_fastskill_command_with_env(
+        &[
+            "eval",
+            "run",
+            "--agent",
+            "agent",
+            "--output-dir",
+            output_dir.to_str().unwrap(),
+            "--case",
+            "trace-case",
+            "--json",
+        ],
+        &env_vars,
+        Some(dir.path()),
+    );
+    assert!(
+        result.success,
+        "Expected eval run to succeed, got stdout: {}, stderr: {}",
+        result.stdout, result.stderr
+    );
+
+    let json_start = result.stdout.find('{').unwrap();
+    let summary: Value = serde_json::from_str(&result.stdout[json_start..]).unwrap();
+    let run_dir = summary["run_dir"].as_str().unwrap();
+    let trace_path = std::path::Path::new(run_dir)
+        .join("trace-case")
+        .join("trace.jsonl");
+    let trace_jsonl = fs::read_to_string(&trace_path).unwrap();
+
+    assert!(
+        trace_jsonl.contains("\"type\":\"raw_json\""),
+        "expected persisted trace.jsonl to contain raw_json event, got: {}",
+        trace_jsonl
+    );
+
+    let result_path = std::path::Path::new(run_dir)
+        .join("trace-case")
+        .join("result.json");
+    let case_result: Value =
+        serde_json::from_str(&fs::read_to_string(result_path).unwrap()).unwrap();
+    assert_eq!(case_result["command_count"], 1);
+}