From 82fc707ed88117e9c5e8341ea60eb947773b7f22 Mon Sep 17 00:00:00 2001 From: System Two Date: Sun, 5 Apr 2026 17:46:42 +0000 Subject: [PATCH 1/4] chore(develop): 034-eval --- src/cli/cli.rs | 7 +- src/cli/commands/eval/mod.rs | 62 ++++ src/cli/commands/eval/report.rs | 71 ++++ src/cli/commands/eval/run.rs | 251 ++++++++++++++ src/cli/commands/eval/score.rs | 137 ++++++++ src/cli/commands/eval/validate.rs | 117 +++++++ src/cli/commands/init.rs | 1 + src/cli/commands/mod.rs | 8 + src/core/manifest.rs | 27 ++ src/core/packaging.rs | 10 + src/core/repository.rs | 2 + src/eval/artifacts.rs | 209 ++++++++++++ src/eval/checks.rs | 315 ++++++++++++++++++ src/eval/config.rs | 140 ++++++++ src/eval/mod.rs | 8 + src/eval/runner.rs | 163 +++++++++ src/eval/suite.rs | 277 +++++++++++++++ src/eval/trace.rs | 101 ++++++ src/lib.rs | 1 + tests/cli/eval_tests.rs | 213 ++++++++++++ tests/cli/mod.rs | 1 + tests/cli/repository_tests.rs | 1 + ...apshot_helpers__cli_directory_walking.snap | 1 + ...pers__cli_finds_skills_in_current_dir.snap | 1 + ...elpers__cli_invalid_repositories_path.snap | 1 + ...lpers__cli_repositories_path_argument.snap | 1 + ...i__snapshot_helpers__cli_with_env_var.snap | 1 + ...sts__cli__snapshot_helpers__eval_help.snap | 21 ++ ...i__snapshot_helpers__eval_report_help.snap | 15 + ..._cli__snapshot_helpers__eval_run_help.snap | 20 ++ ...li__snapshot_helpers__eval_score_help.snap | 16 + ..._snapshot_helpers__eval_validate_help.snap | 16 + ...sts__cli__snapshot_helpers__help_flag.snap | 1 + ...__cli__snapshot_helpers__help_no_args.snap | 2 + ...li__snapshot_helpers__help_short_flag.snap | 1 + ..._snapshot_helpers__serve_invalid_port.snap | 1 + ...shot_helpers__sources_add_missing_url.snap | 1 + tests/core_manifest_tests.rs | 1 + 38 files changed, 2221 insertions(+), 1 deletion(-) create mode 100644 src/cli/commands/eval/mod.rs create mode 100644 src/cli/commands/eval/report.rs create mode 100644 src/cli/commands/eval/run.rs create mode 100644 src/cli/commands/eval/score.rs create mode 100644 src/cli/commands/eval/validate.rs create mode 100644 src/eval/artifacts.rs create mode 100644 src/eval/checks.rs create mode 100644 src/eval/config.rs create mode 100644 src/eval/mod.rs create mode 100644 src/eval/runner.rs create mode 100644 src/eval/suite.rs create mode 100644 src/eval/trace.rs create mode 100644 tests/cli/eval_tests.rs create mode 100644 tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_help.snap create mode 100644 tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_report_help.snap create mode 100644 tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_run_help.snap create mode 100644 tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_score_help.snap create mode 100644 tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_validate_help.snap diff --git a/src/cli/cli.rs b/src/cli/cli.rs index 5e26959b..b58eaf20 100644 --- a/src/cli/cli.rs +++ b/src/cli/cli.rs @@ -5,7 +5,7 @@ use fastskill::FastSkillService; use std::sync::Arc; use crate::cli::commands::{ - add, analyze, auth, disable, init, install, list, marketplace, package, publish, read, + add, analyze, auth, disable, eval, init, install, list, marketplace, package, publish, read, registry, reindex, remove, repos, search, serve, show, sources, sync, update, version, Commands, }; @@ -109,6 +109,10 @@ impl Cli { return auth::execute_auth(args).await; } + if let Some(Commands::Eval(args)) = self.command { + return eval::execute_eval(args).await; + } + // For other commands, we need to restore self.command, so we need a different approach // Actually, if we get here, command was None or not Init/Repository/Package/Publish/RegistryIndex let command = self.command; @@ -155,6 +159,7 @@ impl Cli { | Some(Commands::Repos(_)) | Some(Commands::Marketplace(_)) | Some(Commands::Auth(_)) + | Some(Commands::Eval(_)) | Some(Commands::Version(_)) => unreachable!("Handled above"), None => { // No subcommand - treat as skill ID for read command diff --git a/src/cli/commands/eval/mod.rs b/src/cli/commands/eval/mod.rs new file mode 100644 index 00000000..0b2888a5 --- /dev/null +++ b/src/cli/commands/eval/mod.rs @@ -0,0 +1,62 @@ +//! Eval command group for skill quality assurance + +pub mod report; +pub mod run; +pub mod score; +pub mod validate; + +use crate::cli::error::CliResult; +use clap::{Args, Subcommand}; + +/// Eval command group +#[derive(Debug, Args)] +#[command( + about = "Evaluation commands for skill quality assurance", + after_help = "Examples:\n fastskill eval validate\n fastskill eval run --agent codex --output-dir /tmp/evals" +)] +pub struct EvalCommand { + #[command(subcommand)] + pub command: EvalSubcommand, +} + +/// Eval subcommands +#[derive(Debug, Subcommand)] +pub enum EvalSubcommand { + /// Validate eval configuration and files + #[command( + about = "Validate eval configuration and files", + after_help = "Examples:\n fastskill eval validate\n fastskill eval validate --agent codex" + )] + Validate(validate::ValidateArgs), + + /// Run eval cases against an agent + #[command( + about = "Run eval cases against an agent", + after_help = "Examples:\n fastskill eval run --agent codex --output-dir /tmp/evals" + )] + Run(run::RunArgs), + + /// Show a report for a completed eval run + #[command( + about = "Show a report for a completed eval run", + after_help = "Examples:\n fastskill eval report --run-dir /tmp/evals/2026-04-01T14-32-10Z" + )] + Report(report::ReportArgs), + + /// Re-score saved eval artifacts without running the agent again + #[command( + about = "Re-score saved eval artifacts without running the agent again", + after_help = "Examples:\n fastskill eval score --run-dir /tmp/evals/2026-04-01T14-32-10Z" + )] + Score(score::ScoreArgs), +} + +/// Execute the eval command group +pub async fn execute_eval(args: EvalCommand) -> CliResult<()> { + match args.command { + EvalSubcommand::Validate(args) => validate::execute_validate(args).await, + EvalSubcommand::Run(args) => run::execute_run(args).await, + EvalSubcommand::Report(args) => report::execute_report(args).await, + EvalSubcommand::Score(args) => score::execute_score(args).await, + } +} diff --git a/src/cli/commands/eval/report.rs b/src/cli/commands/eval/report.rs new file mode 100644 index 00000000..c7877b0b --- /dev/null +++ b/src/cli/commands/eval/report.rs @@ -0,0 +1,71 @@ +//! Eval report subcommand - artifact summary and formatting + +use crate::cli::error::{CliError, CliResult}; +use clap::Args; +use fastskill::eval::artifacts::read_summary; +use std::path::PathBuf; + +/// Arguments for `fastskill eval report` +#[derive(Debug, Args)] +#[command( + about = "Show a report for a completed eval run", + after_help = "Examples:\n fastskill eval report --run-dir /tmp/evals/2026-04-01T14-32-10Z\n fastskill eval report --run-dir ./evals/2026-04-01T14-32-10Z --json" +)] +pub struct ReportArgs { + /// Path to the specific run directory + #[arg(long, required = true)] + pub run_dir: PathBuf, + + /// Output as JSON + #[arg(long)] + pub json: bool, +} + +/// Execute the `eval report` command +pub async fn execute_report(args: ReportArgs) -> CliResult<()> { + if !args.run_dir.exists() { + return Err(CliError::Config(format!( + "EVAL_ARTIFACTS_CORRUPT: Run directory does not exist: {}", + args.run_dir.display() + ))); + } + + let summary = read_summary(&args.run_dir).map_err(|e| { + CliError::Config(format!( + "EVAL_ARTIFACTS_CORRUPT: Failed to read summary.json: {}", + e + )) + })?; + + if args.json { + println!( + "{}", + serde_json::to_string_pretty(&summary).unwrap_or_default() + ); + } else { + println!("Eval Report"); + println!(" run_dir: {}", args.run_dir.display()); + println!(" agent: {}", summary.agent); + if let Some(ref model) = summary.model { + println!(" model: {}", model); + } + println!( + " result: {}", + if summary.suite_pass { + "PASSED" + } else { + "FAILED" + } + ); + println!(" cases: {}/{} passed", summary.passed, summary.total_cases); + + if !summary.cases.is_empty() { + println!("\nCase Results:"); + for case in &summary.cases { + println!(" [{}] {}", case.status, case.id); + } + } + } + + Ok(()) +} diff --git a/src/cli/commands/eval/run.rs b/src/cli/commands/eval/run.rs new file mode 100644 index 00000000..483f49fa --- /dev/null +++ b/src/cli/commands/eval/run.rs @@ -0,0 +1,251 @@ +//! Eval run subcommand - case execution orchestration + +use crate::cli::error::{CliError, CliResult}; +use aikit_sdk::{is_agent_available, is_runnable, runnable_agents}; +use chrono::Utc; +use clap::Args; +use fastskill::core::project::resolve_project_file; +use fastskill::eval::artifacts::{ + allocate_run_dir, write_case_artifacts, write_summary, CaseSummary, SummaryResult, +}; +use fastskill::eval::checks::load_checks; +use fastskill::eval::config::resolve_eval_config; +use fastskill::eval::runner::{run_eval_case, CaseRunOptions}; +use fastskill::eval::suite::load_suite; +use fastskill::eval::trace::{stdout_to_trace, trace_to_jsonl}; +use std::env; +use std::path::PathBuf; + +/// Arguments for `fastskill eval run` +#[derive(Debug, Args)] +#[command( + about = "Run eval cases against an agent", + after_help = "Examples:\n fastskill eval run --agent codex --output-dir /tmp/evals\n fastskill eval run --agent claude --output-dir ./evals --case my-case\n fastskill eval run --agent codex --output-dir ./evals --tag basic" +)] +pub struct RunArgs { + /// Agent to use for execution (required) + #[arg(long, required = true, value_parser = validate_agent_key_for_run)] + pub agent: String, + + /// Output directory for artifacts (required) + #[arg(long, required = true)] + pub output_dir: PathBuf, + + /// Optional model override forwarded to the agent + #[arg(long)] + pub model: Option, + + /// Filter: run only the case with this ID + #[arg(long)] + pub case: Option, + + /// Filter: run only cases with this tag + #[arg(long)] + pub tag: Option, + + /// Output as JSON + #[arg(long)] + pub json: bool, + + /// Do not fail with non-zero exit code on suite failure + #[arg(long)] + pub no_fail: bool, +} + +fn validate_agent_key_for_run(s: &str) -> Result { + if is_runnable(s) { + Ok(s.to_string()) + } else { + Err(format!( + "'{}' is not a supported agent. Supported: {}", + s, + runnable_agents().join(", ") + )) + } +} + +/// Execute the `eval run` command +pub async fn execute_run(args: RunArgs) -> CliResult<()> { + let current_dir = env::current_dir() + .map_err(|e| CliError::Config(format!("Failed to get current directory: {}", e)))?; + + let resolution = resolve_project_file(¤t_dir); + if !resolution.found { + return Err(CliError::Config( + "EVAL_CONFIG_MISSING: No skill-project.toml found. Run 'fastskill init' first." + .to_string(), + )); + } + + let project_root = resolution + .path + .parent() + .unwrap_or(¤t_dir) + .to_path_buf(); + + let eval_config = resolve_eval_config(&resolution.path, &project_root) + .map_err(|e| CliError::Config(e.to_string()))?; + + // Check agent availability + if eval_config.fail_on_missing_agent && !is_agent_available(&args.agent) { + return Err(CliError::Config(format!( + "EVAL_AGENT_UNAVAILABLE: Agent '{}' is not available. Install it first.", + args.agent + ))); + } + + // Load suite + let mut suite = + load_suite(&eval_config.prompts_path).map_err(|e| CliError::Config(e.to_string()))?; + + // Apply filters + if let Some(ref case_id) = args.case { + suite = suite.filter_by_id(case_id); + if suite.cases.is_empty() { + return Err(CliError::Config(format!( + "No case found with id '{}'", + case_id + ))); + } + } + if let Some(ref tag) = args.tag { + suite = suite.filter_by_tag(tag); + if suite.cases.is_empty() { + return Err(CliError::Config(format!( + "No cases found with tag '{}'", + tag + ))); + } + } + + // Load checks if configured + let checks = if let Some(ref checks_path) = eval_config.checks_path { + load_checks(checks_path).map_err(|e| CliError::Config(e.to_string()))? + } else { + vec![] + }; + + // Allocate run directory + let run_id = Utc::now().format("%Y-%m-%dT%H-%M-%SZ").to_string(); + std::fs::create_dir_all(&args.output_dir).map_err(|e| { + CliError::Config(format!( + "Failed to create output directory '{}': {}", + args.output_dir.display(), + e + )) + })?; + let run_dir = + allocate_run_dir(&args.output_dir, &run_id).map_err(|e| CliError::Config(e.to_string()))?; + + let run_opts = CaseRunOptions { + agent_key: args.agent.clone(), + model: args.model.clone(), + project_root: project_root.clone(), + timeout_seconds: eval_config.timeout_seconds, + }; + + eprintln!( + "Running {} eval case(s) with agent '{}'...", + suite.cases.len(), + args.agent + ); + + let mut case_results = Vec::new(); + let mut case_summaries = Vec::new(); + + for case in &suite.cases { + eprintln!(" Running case '{}'...", case.id); + + let (run_output, case_result) = run_eval_case(case, &run_opts, &checks).await; + + // Build trace + let trace_events = stdout_to_trace(&run_output.stdout); + let trace_jsonl = trace_to_jsonl(&trace_events); + + // Write artifacts + if let Err(e) = write_case_artifacts( + &run_dir, + &case.id, + &run_output.stdout, + &run_output.stderr, + &trace_jsonl, + &case_result, + ) { + eprintln!( + " warning: failed to write artifacts for case '{}': {}", + case.id, e + ); + } + + let summary_entry = CaseSummary { + id: case_result.id.clone(), + status: case_result.status.clone(), + command_count: case_result.command_count, + input_tokens: case_result.input_tokens, + output_tokens: case_result.output_tokens, + }; + + case_summaries.push(summary_entry); + case_results.push(case_result); + } + + let passed = case_results + .iter() + .filter(|r| r.status == fastskill::eval::artifacts::CaseStatus::Passed) + .count(); + let failed = case_results.len() - passed; + let suite_pass = failed == 0; + + let summary = SummaryResult { + suite_pass, + agent: args.agent.clone(), + model: args.model.clone(), + total_cases: case_results.len(), + passed, + failed, + run_dir: run_dir.clone(), + checks_path: eval_config.checks_path.map(|p| { + if p.is_absolute() { + p + } else { + project_root.join(p) + } + }), + skill_project_root: project_root, + cases: case_summaries, + }; + + // Write summary + if let Err(e) = write_summary(&run_dir, &summary) { + eprintln!("warning: failed to write summary.json: {}", e); + } + + if args.json { + println!( + "{}", + serde_json::to_string_pretty(&summary).unwrap_or_default() + ); + } else { + println!( + "\nEval run complete: {}/{} passed", + passed, + case_results.len() + ); + println!(" run_dir: {}", run_dir.display()); + if suite_pass { + println!(" result: PASSED"); + } else { + println!(" result: FAILED ({} case(s) failed)", failed); + } + } + + if !suite_pass && !args.no_fail { + return Err(CliError::Config(format!( + "Eval suite failed: {}/{} cases passed", + passed, + case_results.len() + ))); + } + + Ok(()) +} diff --git a/src/cli/commands/eval/score.rs b/src/cli/commands/eval/score.rs new file mode 100644 index 00000000..6afa3911 --- /dev/null +++ b/src/cli/commands/eval/score.rs @@ -0,0 +1,137 @@ +//! Eval score subcommand - offline re-scoring from saved artifacts + +use crate::cli::error::{CliError, CliResult}; +use clap::Args; +use fastskill::eval::artifacts::{read_summary, write_summary, CaseStatus}; +use fastskill::eval::checks::load_checks; +use std::path::PathBuf; + +/// Arguments for `fastskill eval score` +#[derive(Debug, Args)] +#[command( + about = "Re-score saved eval artifacts without running the agent again", + after_help = "Examples:\n fastskill eval score --run-dir /tmp/evals/2026-04-01T14-32-10Z\n fastskill eval score --run-dir ./evals/2026-04-01T14-32-10Z --json" +)] +pub struct ScoreArgs { + /// Path to the run directory to re-score + #[arg(long, required = true)] + pub run_dir: PathBuf, + + /// Output as JSON + #[arg(long)] + pub json: bool, + + /// Do not fail with non-zero exit code on suite failure + #[arg(long)] + pub no_fail: bool, +} + +/// Execute the `eval score` command +pub async fn execute_score(args: ScoreArgs) -> CliResult<()> { + if !args.run_dir.exists() { + return Err(CliError::Config(format!( + "EVAL_ARTIFACTS_CORRUPT: Run directory does not exist: {}", + args.run_dir.display() + ))); + } + + // Read existing summary + let mut summary = read_summary(&args.run_dir).map_err(|e| { + CliError::Config(format!( + "EVAL_ARTIFACTS_CORRUPT: Failed to read summary.json: {}", + e + )) + })?; + + // Validate that we have usable paths + let checks_path = summary.checks_path.as_ref().ok_or_else(|| { + CliError::Config( + "EVAL_ARTIFACTS_CORRUPT: summary.json lacks checks_path - cannot re-score".to_string(), + ) + })?; + + if !checks_path.exists() { + return Err(CliError::Config(format!( + "EVAL_ARTIFACTS_CORRUPT: checks_path '{}' does not exist", + checks_path.display() + ))); + } + + // Load checks + let checks = load_checks(checks_path).map_err(|e| CliError::Config(e.to_string()))?; + + // Read existing case artifacts and re-score + let mut new_passed = 0; + let mut new_failed = 0; + + let mut updated_cases = summary.cases.clone(); + + for case_summary in &mut updated_cases { + let case_dir = args.run_dir.join(&case_summary.id); + if !case_dir.exists() { + continue; + } + + let stdout_path = case_dir.join("stdout.txt"); + let trace_path = case_dir.join("trace.jsonl"); + + let stdout_content = std::fs::read_to_string(&stdout_path).unwrap_or_default(); + let trace_jsonl = std::fs::read_to_string(&trace_path).unwrap_or_default(); + + let check_results = fastskill::eval::checks::run_checks( + &checks, + &stdout_content, + &trace_jsonl, + &summary.skill_project_root, + ); + + let all_passed = check_results.iter().all(|r| r.passed); + case_summary.status = if all_passed { + CaseStatus::Passed + } else { + CaseStatus::Failed + }; + + if all_passed { + new_passed += 1; + } else { + new_failed += 1; + } + } + + summary.passed = new_passed; + summary.failed = new_failed; + summary.suite_pass = new_failed == 0; + summary.cases = updated_cases; + + // Update summary.json + write_summary(&args.run_dir, &summary) + .map_err(|e| CliError::Config(format!("Failed to write updated summary.json: {}", e)))?; + + if args.json { + println!( + "{}", + serde_json::to_string_pretty(&summary).unwrap_or_default() + ); + } else { + println!("Re-scoring complete"); + println!( + " result: {}", + if summary.suite_pass { + "PASSED" + } else { + "FAILED" + } + ); + println!(" cases: {}/{} passed", summary.passed, summary.total_cases); + } + + if !summary.suite_pass && !args.no_fail { + return Err(CliError::Config(format!( + "Eval suite failed: {}/{} cases passed after re-scoring", + summary.passed, summary.total_cases + ))); + } + + Ok(()) +} diff --git a/src/cli/commands/eval/validate.rs b/src/cli/commands/eval/validate.rs new file mode 100644 index 00000000..54e52933 --- /dev/null +++ b/src/cli/commands/eval/validate.rs @@ -0,0 +1,117 @@ +//! Eval validate subcommand - configuration and file validation + +use crate::cli::error::{CliError, CliResult}; +use aikit_sdk::{is_agent_available, is_runnable, runnable_agents}; +use clap::Args; +use fastskill::core::project::resolve_project_file; +use fastskill::eval::config::resolve_eval_config; +use std::env; + +/// Arguments for `fastskill eval validate` +#[derive(Debug, Args)] +#[command( + about = "Validate eval configuration and files", + after_help = "Examples:\n fastskill eval validate\n fastskill eval validate --agent codex" +)] +pub struct ValidateArgs { + /// Check agent availability for the specified agent key + #[arg(long, value_parser = validate_agent_key_parser)] + pub agent: Option, + + /// Output as JSON + #[arg(long)] + pub json: bool, +} + +fn validate_agent_key_parser(s: &str) -> Result { + if is_runnable(s) { + Ok(s.to_string()) + } else { + Err(format!( + "'{}' is not a supported agent. Supported: {}", + s, + runnable_agents().join(", ") + )) + } +} + +/// Execute the `eval validate` command +pub async fn execute_validate(args: ValidateArgs) -> CliResult<()> { + let current_dir = env::current_dir() + .map_err(|e| CliError::Config(format!("Failed to get current directory: {}", e)))?; + + let resolution = resolve_project_file(¤t_dir); + + if !resolution.found { + return Err(CliError::Config( + "EVAL_CONFIG_MISSING: No skill-project.toml found. Run 'fastskill init' first." + .to_string(), + )); + } + + let project_root = resolution + .path + .parent() + .unwrap_or(¤t_dir) + .to_path_buf(); + + let eval_config = resolve_eval_config(&resolution.path, &project_root) + .map_err(|e| CliError::Config(e.to_string()))?; + + // Check agent availability if --agent was specified + if let Some(ref agent_key) = args.agent { + let available = is_agent_available(agent_key); + if !available && eval_config.fail_on_missing_agent { + return Err(CliError::Config(format!( + "EVAL_AGENT_UNAVAILABLE: Agent '{}' is not available. Install it or use --agent with an available agent.", + agent_key + ))); + } + if !available { + eprintln!( + "warning: agent '{}' is not available (fail_on_missing_agent=false, continuing)", + agent_key + ); + } + } + + if args.json { + let output = serde_json::json!({ + "valid": true, + "prompts_path": eval_config.prompts_path, + "checks_path": eval_config.checks_path, + "timeout_seconds": eval_config.timeout_seconds, + "fail_on_missing_agent": eval_config.fail_on_missing_agent, + "project_root": eval_config.project_root, + }); + println!( + "{}", + serde_json::to_string_pretty(&output).unwrap_or_default() + ); + } else { + println!("eval configuration: valid"); + println!(" prompts: {}", eval_config.prompts_path.display()); + if let Some(ref checks) = eval_config.checks_path { + println!(" checks: {}", checks.display()); + } + println!(" timeout: {}s", eval_config.timeout_seconds); + println!( + " fail_on_missing_agent: {}", + eval_config.fail_on_missing_agent + ); + if let Some(ref agent_key) = args.agent { + let available = is_agent_available(agent_key); + println!( + " agent '{}': {}", + agent_key, + if available { + "available" + } else { + "unavailable" + } + ); + } + } + + Ok(()) +} diff --git a/src/cli/commands/init.rs b/src/cli/commands/init.rs index ada79bc0..bbd97d69 100644 --- a/src/cli/commands/init.rs +++ b/src/cli/commands/init.rs @@ -255,6 +255,7 @@ fn build_skill_project(meta: InitMetadata<'_>) -> CliResult { server: None, install_depth: 5, skip_transitive: false, + eval: None, }), }); validate_project_structure(true, dependencies.is_some()) diff --git a/src/cli/commands/mod.rs b/src/cli/commands/mod.rs index b2499505..566585f8 100644 --- a/src/cli/commands/mod.rs +++ b/src/cli/commands/mod.rs @@ -5,6 +5,7 @@ pub mod analyze; pub mod auth; pub mod common; pub mod disable; +pub mod eval; pub mod init; pub mod install; pub mod list; @@ -57,6 +58,13 @@ pub enum Commands { )] Disable(disable::DisableArgs), + /// Evaluation commands for skill quality assurance + #[command( + about = "Evaluation commands for skill quality assurance", + after_help = "Examples:\n fastskill eval validate\n fastskill eval run --agent codex --output-dir /tmp/evals" + )] + Eval(eval::EvalCommand), + /// Initialize skill-project.toml for skill authors #[command( about = "Initialize skill-project.toml in current skill directory", diff --git a/src/core/manifest.rs b/src/core/manifest.rs index abc171b3..50120010 100644 --- a/src/core/manifest.rs +++ b/src/core/manifest.rs @@ -273,6 +273,33 @@ pub struct FastSkillToolConfig { /// Skip transitive dependency resolution entirely (default: false) #[serde(default)] pub skip_transitive: bool, + /// Optional evaluation configuration + #[serde(default)] + pub eval: Option, +} + +/// Evaluation configuration in TOML format ([tool.fastskill.eval]) +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct EvalConfigToml { + /// Path to prompts CSV file (relative to skill project root) + pub prompts: PathBuf, + /// Optional path to checks TOML file + #[serde(default)] + pub checks: Option, + /// Timeout in seconds for each eval case execution + #[serde(default = "default_eval_timeout_seconds")] + pub timeout_seconds: u64, + /// When true, `eval run` / `eval validate --agent` fail fast if the agent CLI is not available + #[serde(default = "default_fail_on_missing_agent")] + pub fail_on_missing_agent: bool, +} + +fn default_eval_timeout_seconds() -> u64 { + 900 +} + +fn default_fail_on_missing_agent() -> bool { + true } fn default_install_depth() -> u32 { diff --git a/src/core/packaging.rs b/src/core/packaging.rs index 892c8356..4c1ea9b8 100644 --- a/src/core/packaging.rs +++ b/src/core/packaging.rs @@ -157,6 +157,11 @@ pub fn package_skill_with_id( continue; } + // Skip evals/ directory from published artifacts + if relative_path_str.starts_with("evals/") || relative_path_str == "evals" { + continue; + } + // Read file content let mut file_content = Vec::new(); let mut file = fs::File::open(file_path).map_err(ServiceError::Io)?; @@ -219,6 +224,11 @@ pub fn package_skill_with_id( continue; } + // Skip evals/ directory from published artifacts + if relative_path_str.starts_with("evals/") || relative_path_str == "evals" { + continue; + } + let mut file_content = Vec::new(); let mut file = fs::File::open(file_path).map_err(ServiceError::Io)?; file.read_to_end(&mut file_content) diff --git a/src/core/repository.rs b/src/core/repository.rs index a51163e6..5fcf1f49 100644 --- a/src/core/repository.rs +++ b/src/core/repository.rs @@ -251,6 +251,7 @@ impl RepositoryManager { server: None, install_depth: 5, skip_transitive: false, + eval: None, }), }); } else if let Some(ref mut tool) = project.tool { @@ -262,6 +263,7 @@ impl RepositoryManager { server: None, install_depth: 5, skip_transitive: false, + eval: None, }); } else if let Some(ref mut fastskill) = tool.fastskill { fastskill.repositories = Some(manifest_repos); diff --git a/src/eval/artifacts.rs b/src/eval/artifacts.rs new file mode 100644 index 00000000..b11a5b1e --- /dev/null +++ b/src/eval/artifacts.rs @@ -0,0 +1,209 @@ +//! Artifact layout and persistence for eval runs + +use crate::eval::checks::CheckResult; +use serde::{Deserialize, Serialize}; +use std::path::{Path, PathBuf}; +use thiserror::Error; + +/// Status of a single eval case +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "lowercase")] +pub enum CaseStatus { + Passed, + Failed, + Error, + Skipped, +} + +impl std::fmt::Display for CaseStatus { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + CaseStatus::Passed => write!(f, "passed"), + CaseStatus::Failed => write!(f, "failed"), + CaseStatus::Error => write!(f, "error"), + CaseStatus::Skipped => write!(f, "skipped"), + } + } +} + +/// Per-case result +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CaseResult { + pub id: String, + pub status: CaseStatus, + pub command_count: Option, + pub input_tokens: Option, + pub output_tokens: Option, + #[serde(default)] + pub check_results: Vec, + pub error_message: Option, +} + +/// Aggregated run summary +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SummaryResult { + pub suite_pass: bool, + pub agent: String, + pub model: Option, + pub total_cases: usize, + pub passed: usize, + pub failed: usize, + pub run_dir: PathBuf, + pub checks_path: Option, + pub skill_project_root: PathBuf, + pub cases: Vec, +} + +/// Per-case summary entry in summary.json +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CaseSummary { + pub id: String, + pub status: CaseStatus, + pub command_count: Option, + pub input_tokens: Option, + pub output_tokens: Option, +} + +/// All artifacts from a completed run +#[derive(Debug)] +pub struct RunArtifacts { + pub run_id: String, + pub run_dir: PathBuf, + pub summary: SummaryResult, + pub case_results: Vec, +} + +/// Errors during artifact writing/reading +#[derive(Debug, Error)] +pub enum ArtifactsError { + #[error("EVAL_ARTIFACTS_CORRUPT: IO error: {0}")] + Io(#[from] std::io::Error), + #[error("EVAL_ARTIFACTS_CORRUPT: JSON error: {0}")] + Json(#[from] serde_json::Error), + #[error("EVAL_ARTIFACTS_CORRUPT: Missing required field: {0}")] + MissingField(String), +} + +/// Allocate a run directory under output_dir using ISO 8601 timestamp format +/// Appends numeric suffix if directory already exists +pub fn allocate_run_dir(output_dir: &Path, run_id: &str) -> Result { + let base = output_dir.join(run_id); + if !base.exists() { + std::fs::create_dir_all(&base)?; + return Ok(base); + } + + // Append numeric suffix + for i in 2..=999 { + let candidate = output_dir.join(format!("{}-{}", run_id, i)); + if !candidate.exists() { + std::fs::create_dir_all(&candidate)?; + return Ok(candidate); + } + } + + // Fallback: use the base (it exists, will overwrite) + Ok(base) +} + +/// Write per-case artifacts (stdout.txt, stderr.txt, trace.jsonl, result.json) +pub fn write_case_artifacts( + run_dir: &Path, + case_id: &str, + stdout: &[u8], + stderr: &[u8], + trace_jsonl: &str, + result: &CaseResult, +) -> Result { + let case_dir = run_dir.join(case_id); + std::fs::create_dir_all(&case_dir)?; + + std::fs::write(case_dir.join("stdout.txt"), stdout)?; + std::fs::write(case_dir.join("stderr.txt"), stderr)?; + std::fs::write(case_dir.join("trace.jsonl"), trace_jsonl)?; + + let result_json = serde_json::to_string_pretty(result)?; + std::fs::write(case_dir.join("result.json"), result_json)?; + + Ok(case_dir) +} + +/// Write summary.json +pub fn write_summary(run_dir: &Path, summary: &SummaryResult) -> Result<(), ArtifactsError> { + let summary_json = serde_json::to_string_pretty(summary)?; + std::fs::write(run_dir.join("summary.json"), summary_json)?; + Ok(()) +} + +/// Read summary.json from a run directory +pub fn read_summary(run_dir: &Path) -> Result { + let summary_path = run_dir.join("summary.json"); + let content = std::fs::read_to_string(&summary_path)?; + let summary: SummaryResult = serde_json::from_str(&content)?; + Ok(summary) +} + +/// Read case result.json files from a run directory +pub fn read_case_results(run_dir: &Path) -> Result, ArtifactsError> { + let mut results = Vec::new(); + + let entries = std::fs::read_dir(run_dir)?; + for entry in entries.flatten() { + let path = entry.path(); + if path.is_dir() { + let result_path = path.join("result.json"); + if result_path.exists() { + let content = std::fs::read_to_string(&result_path)?; + let result: CaseResult = serde_json::from_str(&content)?; + results.push(result); + } + } + } + + Ok(results) +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + #[test] + fn test_allocate_run_dir_creates_new() { + let dir = TempDir::new().unwrap(); + let run_dir = allocate_run_dir(dir.path(), "2026-04-01T14-00-00Z").unwrap(); + assert!(run_dir.exists()); + assert!(run_dir.ends_with("2026-04-01T14-00-00Z")); + } + + #[test] + fn test_allocate_run_dir_suffix_on_conflict() { + let dir = TempDir::new().unwrap(); + let run_dir1 = allocate_run_dir(dir.path(), "2026-04-01T14-00-00Z").unwrap(); + let run_dir2 = allocate_run_dir(dir.path(), "2026-04-01T14-00-00Z").unwrap(); + assert_ne!(run_dir1, run_dir2); + assert!(run_dir2.to_string_lossy().contains("-2")); + } + + #[test] + fn test_write_and_read_summary() { + let dir = TempDir::new().unwrap(); + let summary = SummaryResult { + suite_pass: true, + agent: "codex".to_string(), + model: None, + total_cases: 2, + passed: 2, + failed: 0, + run_dir: dir.path().to_path_buf(), + checks_path: None, + skill_project_root: dir.path().to_path_buf(), + cases: vec![], + }; + + write_summary(dir.path(), &summary).unwrap(); + let read = read_summary(dir.path()).unwrap(); + assert_eq!(read.total_cases, 2); + assert!(read.suite_pass); + } +} diff --git a/src/eval/checks.rs b/src/eval/checks.rs new file mode 100644 index 00000000..a7d94e5e --- /dev/null +++ b/src/eval/checks.rs @@ -0,0 +1,315 @@ +//! Deterministic check engine for eval artifact scoring + +use serde::{Deserialize, Serialize}; +use std::path::PathBuf; +use thiserror::Error; + +/// A deterministic check definition loaded from checks.toml +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "name")] +pub enum CheckDefinition { + /// Check whether a pattern appears (or doesn't appear) in the trace + #[serde(rename = "trigger_expectation")] + TriggerExpectation { + pattern: String, + /// true = pattern must appear; false = pattern must NOT appear + expected: bool, + #[serde(default = "default_required")] + required: bool, + }, + /// Check whether the trace contains a line with this pattern + #[serde(rename = "command_contains")] + CommandContains { + pattern: String, + #[serde(default = "default_required")] + required: bool, + }, + /// Check whether a file exists in the working directory after execution + #[serde(rename = "file_exists")] + FileExists { + path: PathBuf, + #[serde(default = "default_required")] + required: bool, + }, + /// Check that the number of raw_json trace lines does not exceed a limit + #[serde(rename = "max_command_count")] + MaxCommandCount { + limit: usize, + #[serde(default = "default_required")] + required: bool, + }, +} + +fn default_required() -> bool { + true +} + +impl CheckDefinition { + pub fn name(&self) -> &str { + match self { + CheckDefinition::TriggerExpectation { .. } => "trigger_expectation", + CheckDefinition::CommandContains { .. } => "command_contains", + CheckDefinition::FileExists { .. } => "file_exists", + CheckDefinition::MaxCommandCount { .. } => "max_command_count", + } + } + + pub fn is_required(&self) -> bool { + match self { + CheckDefinition::TriggerExpectation { required, .. } => *required, + CheckDefinition::CommandContains { required, .. } => *required, + CheckDefinition::FileExists { required, .. } => *required, + CheckDefinition::MaxCommandCount { required, .. } => *required, + } + } +} + +/// Result of a single check +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CheckResult { + pub check_name: String, + pub passed: bool, + pub message: Option, +} + +/// TOML file structure for checks configuration +#[derive(Debug, Deserialize)] +pub struct ChecksToml { + #[serde(rename = "check", default)] + pub checks: Vec, +} + +/// Errors loading checks configuration +#[derive(Debug, Error)] +pub enum ChecksError { + #[error("EVAL_CHECKS_INVALID: Failed to read checks file: {0}")] + Io(#[from] std::io::Error), + #[error("EVAL_CHECKS_INVALID: Failed to parse checks TOML: {0}")] + Parse(#[from] toml::de::Error), +} + +/// Load check definitions from a TOML file +pub fn load_checks(path: &std::path::Path) -> Result, ChecksError> { + let content = std::fs::read_to_string(path)?; + let parsed: ChecksToml = toml::from_str(&content)?; + Ok(parsed.checks) +} + +/// Run all checks against captured stdout content and working directory +pub fn run_checks( + checks: &[CheckDefinition], + stdout_content: &str, + trace_jsonl: &str, + working_dir: &std::path::Path, +) -> Vec { + checks + .iter() + .map(|check| run_single_check(check, stdout_content, trace_jsonl, working_dir)) + .collect() +} + +fn run_single_check( + check: &CheckDefinition, + stdout_content: &str, + trace_jsonl: &str, + working_dir: &std::path::Path, +) -> CheckResult { + match check { + CheckDefinition::TriggerExpectation { + pattern, expected, .. + } => { + // Literal substring matching in stdout + trace + let combined = format!("{}\n{}", stdout_content, trace_jsonl); + let found = combined.contains(pattern.as_str()); + let passed = found == *expected; + let message = if passed { + None + } else if *expected { + Some(format!("Pattern '{}' not found but was expected", pattern)) + } else { + Some(format!("Pattern '{}' found but was not expected", pattern)) + }; + CheckResult { + check_name: "trigger_expectation".to_string(), + passed, + message, + } + } + CheckDefinition::CommandContains { pattern, .. } => { + let combined = format!("{}\n{}", stdout_content, trace_jsonl); + let passed = combined.contains(pattern.as_str()); + let message = if passed { + None + } else { + Some(format!("Pattern '{}' not found in output", pattern)) + }; + CheckResult { + check_name: "command_contains".to_string(), + passed, + message, + } + } + CheckDefinition::FileExists { path, .. } => { + let full_path = working_dir.join(path); + let passed = full_path.exists(); + let message = if passed { + None + } else { + Some(format!("File '{}' does not exist", path.display())) + }; + CheckResult { + check_name: "file_exists".to_string(), + passed, + message, + } + } + CheckDefinition::MaxCommandCount { limit, .. } => { + // Count raw_json trace lines + let count = trace_jsonl + .lines() + .filter(|line| line.contains("\"raw_json\"")) + .count(); + let passed = count <= *limit; + let message = if passed { + None + } else { + Some(format!("Command count {} exceeds limit {}", count, limit)) + }; + CheckResult { + check_name: "max_command_count".to_string(), + passed, + message, + } + } + } +} + +/// Aggregate check results: suite passes if all required checks pass +pub fn suite_passes(results: &[CheckResult]) -> bool { + results.iter().all(|r| r.passed) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::path::Path; + use tempfile::TempDir; + + #[test] + fn test_trigger_expectation_passes_when_pattern_found() { + let check = CheckDefinition::TriggerExpectation { + pattern: "fastskill".to_string(), + expected: true, + required: true, + }; + let results = run_checks(&[check], "fastskill triggered", "", Path::new("/tmp")); + assert!(results[0].passed); + } + + #[test] + fn test_trigger_expectation_fails_when_pattern_missing() { + let check = CheckDefinition::TriggerExpectation { + pattern: "fastskill".to_string(), + expected: true, + required: true, + }; + let results = run_checks(&[check], "nothing here", "", Path::new("/tmp")); + assert!(!results[0].passed); + } + + #[test] + fn test_trigger_expectation_negative_passes_when_pattern_absent() { + let check = CheckDefinition::TriggerExpectation { + pattern: "fastskill".to_string(), + expected: false, + required: true, + }; + let results = run_checks(&[check], "no match", "", Path::new("/tmp")); + assert!(results[0].passed); + } + + #[test] + fn test_file_exists_check_passes() { + let dir = TempDir::new().unwrap(); + let file_path = dir.path().join("output.txt"); + std::fs::write(&file_path, "content").unwrap(); + + let check = CheckDefinition::FileExists { + path: PathBuf::from("output.txt"), + required: true, + }; + let results = run_checks(&[check], "", "", dir.path()); + assert!(results[0].passed); + } + + #[test] + fn test_file_exists_check_fails() { + let dir = TempDir::new().unwrap(); + let check = CheckDefinition::FileExists { + path: PathBuf::from("missing.txt"), + required: true, + }; + let results = run_checks(&[check], "", "", dir.path()); + assert!(!results[0].passed); + } + + #[test] + fn test_max_command_count_passes() { + let check = CheckDefinition::MaxCommandCount { + limit: 5, + required: true, + }; + let trace = r#"{"type":"raw_json","seq":0} +{"type":"raw_json","seq":1} +{"type":"raw_line","seq":2}"#; + let results = run_checks(&[check], "", trace, Path::new("/tmp")); + assert!(results[0].passed); + } + + #[test] + fn test_max_command_count_fails() { + let check = CheckDefinition::MaxCommandCount { + limit: 1, + required: true, + }; + let trace = r#"{"type":"raw_json","seq":0} +{"type":"raw_json","seq":1} +{"type":"raw_json","seq":2}"#; + let results = run_checks(&[check], "", trace, Path::new("/tmp")); + assert!(!results[0].passed); + } + + #[test] + fn test_suite_passes_all_passed() { + let results = vec![ + CheckResult { + check_name: "a".to_string(), + passed: true, + message: None, + }, + CheckResult { + check_name: "b".to_string(), + passed: true, + message: None, + }, + ]; + assert!(suite_passes(&results)); + } + + #[test] + fn test_suite_passes_any_failed() { + let results = vec![ + CheckResult { + check_name: "a".to_string(), + passed: true, + message: None, + }, + CheckResult { + check_name: "b".to_string(), + passed: false, + message: Some("failed".to_string()), + }, + ]; + assert!(!suite_passes(&results)); + } +} diff --git a/src/eval/config.rs b/src/eval/config.rs new file mode 100644 index 00000000..ecb6e5b5 --- /dev/null +++ b/src/eval/config.rs @@ -0,0 +1,140 @@ +//! Eval configuration resolution + +use crate::core::manifest::{EvalConfigToml, SkillProjectToml}; +use std::path::{Path, PathBuf}; +use thiserror::Error; + +/// Resolved eval configuration (after path resolution) +#[derive(Debug, Clone)] +pub struct EvalConfig { + /// Absolute path to prompts CSV file + pub prompts_path: PathBuf, + /// Absolute path to checks TOML file (optional) + pub checks_path: Option, + /// Timeout in seconds for each case + pub timeout_seconds: u64, + /// Whether to fail fast if agent is not available + pub fail_on_missing_agent: bool, + /// Skill project root directory + pub project_root: PathBuf, +} + +/// Errors during eval configuration resolution +#[derive(Debug, Error)] +pub enum EvalConfigError { + #[error("EVAL_CONFIG_MISSING: No [tool.fastskill.eval] section found in skill-project.toml")] + ConfigMissing, + #[error("EVAL_PROMPTS_NOT_FOUND: Prompts CSV not found: {0}")] + PromptsNotFound(PathBuf), + #[error("Failed to read skill-project.toml: {0}")] + Io(#[from] std::io::Error), + #[error("Failed to parse skill-project.toml: {0}")] + Parse(String), +} + +/// Resolve eval configuration from a skill-project.toml file +pub fn resolve_eval_config( + project_file: &Path, + project_root: &Path, +) -> Result { + let content = std::fs::read_to_string(project_file)?; + let toml: SkillProjectToml = + toml::from_str(&content).map_err(|e| EvalConfigError::Parse(e.to_string()))?; + + let eval_config = toml + .tool + .as_ref() + .and_then(|t| t.fastskill.as_ref()) + .and_then(|f| f.eval.as_ref()) + .ok_or(EvalConfigError::ConfigMissing)?; + + resolve_from_toml(eval_config, project_root) +} + +/// Resolve eval configuration from parsed TOML config +pub fn resolve_from_toml( + config: &EvalConfigToml, + project_root: &Path, +) -> Result { + let prompts_path = if config.prompts.is_absolute() { + config.prompts.clone() + } else { + project_root.join(&config.prompts) + }; + + if !prompts_path.exists() { + return Err(EvalConfigError::PromptsNotFound(prompts_path)); + } + + let checks_path = config.checks.as_ref().map(|p| { + if p.is_absolute() { + p.clone() + } else { + project_root.join(p) + } + }); + + Ok(EvalConfig { + prompts_path, + checks_path, + timeout_seconds: config.timeout_seconds, + fail_on_missing_agent: config.fail_on_missing_agent, + project_root: project_root.to_path_buf(), + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + #[test] + fn test_resolve_eval_config_missing() { + let dir = TempDir::new().unwrap(); + let project_file = dir.path().join("skill-project.toml"); + std::fs::write(&project_file, "[metadata]\nid = \"test\"\n").unwrap(); + + let result = resolve_eval_config(&project_file, dir.path()); + assert!(matches!(result, Err(EvalConfigError::ConfigMissing))); + } + + #[test] + fn test_resolve_eval_config_prompts_not_found() { + let dir = TempDir::new().unwrap(); + let project_file = dir.path().join("skill-project.toml"); + std::fs::write( + &project_file, + "[metadata]\nid = \"test\"\n\n[tool.fastskill.eval]\nprompts = \"evals/prompts.csv\"\ntimeout_seconds = 900\nfail_on_missing_agent = true\n", + ) + .unwrap(); + + let result = resolve_eval_config(&project_file, dir.path()); + assert!(matches!(result, Err(EvalConfigError::PromptsNotFound(_)))); + } + + #[test] + fn test_resolve_eval_config_success() { + let dir = TempDir::new().unwrap(); + let evals_dir = dir.path().join("evals"); + std::fs::create_dir_all(&evals_dir).unwrap(); + let prompts_file = evals_dir.join("prompts.csv"); + std::fs::write( + &prompts_file, + "id,prompt,should_trigger\ntest-1,hello,true\n", + ) + .unwrap(); + + let project_file = dir.path().join("skill-project.toml"); + std::fs::write( + &project_file, + "[metadata]\nid = \"test\"\n\n[tool.fastskill.eval]\nprompts = \"evals/prompts.csv\"\ntimeout_seconds = 600\nfail_on_missing_agent = false\n", + ) + .unwrap(); + + let result = resolve_eval_config(&project_file, dir.path()); + assert!(result.is_ok()); + let config = result.unwrap(); + assert_eq!(config.timeout_seconds, 600); + assert!(!config.fail_on_missing_agent); + } +} diff --git a/src/eval/mod.rs b/src/eval/mod.rs new file mode 100644 index 00000000..bedfcb3c --- /dev/null +++ b/src/eval/mod.rs @@ -0,0 +1,8 @@ +//! Evaluation domain types and services for skill quality assurance + +pub mod artifacts; +pub mod checks; +pub mod config; +pub mod runner; +pub mod suite; +pub mod trace; diff --git a/src/eval/runner.rs b/src/eval/runner.rs new file mode 100644 index 00000000..24394f6c --- /dev/null +++ b/src/eval/runner.rs @@ -0,0 +1,163 @@ +//! Eval runner implementation using aikit-sdk + +use crate::eval::artifacts::{CaseResult, CaseStatus}; +use crate::eval::checks::{run_checks, CheckDefinition}; +use crate::eval::suite::EvalCase; +use crate::eval::trace::{stdout_to_trace, trace_to_jsonl}; +use aikit_sdk::{run_agent, RunOptions}; +use std::path::PathBuf; +use thiserror::Error; + +/// Options for running a single eval case +#[derive(Debug, Clone)] +pub struct CaseRunOptions { + /// Agent key (e.g. "codex", "claude") + pub agent_key: String, + /// Optional model override + pub model: Option, + /// Skill project root (used as working directory when no workspace_subdir) + pub project_root: PathBuf, + /// Timeout in seconds + pub timeout_seconds: u64, +} + +/// Raw output from running a case +#[derive(Debug)] +pub struct CaseRunOutput { + pub stdout: Vec, + pub stderr: Vec, + pub exit_code: Option, + pub timed_out: bool, +} + +/// Errors during case execution +#[derive(Debug, Error)] +pub enum RunnerError { + #[error("EVAL_AGENT_UNAVAILABLE: Agent '{0}' is not available")] + AgentUnavailable(String), + #[error("EVAL_CASE_TIMEOUT: Case timed out after {0}s")] + Timeout(u64), + #[error("Execution failed: {0}")] + ExecutionFailed(String), +} + +/// Run a single eval case using aikit_sdk::run_agent via spawn_blocking. +/// +/// This is the sole agent execution path per spec (acceptance criterion 29). +/// Uses spawn_blocking because run_agent is synchronous. +pub async fn run_eval_case( + case: &EvalCase, + opts: &CaseRunOptions, + checks: &[CheckDefinition], +) -> (CaseRunOutput, CaseResult) { + let agent_key = opts.agent_key.clone(); + let model = opts.model.clone(); + let prompt = case.prompt.clone(); + + // Determine working directory + let working_dir = match &case.workspace_subdir { + Some(subdir) => opts.project_root.join(subdir), + None => opts.project_root.clone(), + }; + + // Build RunOptions using available aikit-sdk API + let run_opts = RunOptions { + model: model.clone(), + yolo: true, + stream: false, + }; + + // Run agent in a blocking thread to avoid blocking the async runtime + let result = + tokio::task::spawn_blocking(move || run_agent(&agent_key, &prompt, run_opts)).await; + + let run_output = match result { + Ok(Ok(run_result)) => { + let exit_code = run_result.exit_code(); + CaseRunOutput { + stdout: run_result.stdout, + stderr: run_result.stderr, + exit_code, + timed_out: false, + } + } + Ok(Err(e)) => CaseRunOutput { + stdout: vec![], + stderr: format!("Agent execution failed: {}", e).into_bytes(), + exit_code: None, + timed_out: false, + }, + Err(e) => CaseRunOutput { + stdout: vec![], + stderr: format!("spawn_blocking failed: {}", e).into_bytes(), + exit_code: None, + timed_out: false, + }, + }; + + // Build trace from stdout + let trace_events = stdout_to_trace(&run_output.stdout); + let trace_jsonl = trace_to_jsonl(&trace_events); + let stdout_str = String::from_utf8_lossy(&run_output.stdout).to_string(); + + // Count raw_json events + let command_count = trace_events + .iter() + .filter(|e| matches!(&e.payload, crate::eval::trace::TracePayload::RawJson { .. })) + .count(); + + // Run deterministic checks + let check_results = run_checks(checks, &stdout_str, &trace_jsonl, &working_dir); + let all_passed = check_results.iter().all(|r| r.passed); + + let status = if run_output.timed_out { + CaseStatus::Error + } else if checks.is_empty() { + // No checks defined: pass if exit code is 0 + if run_output.exit_code == Some(0) { + CaseStatus::Passed + } else { + CaseStatus::Failed + } + } else if all_passed { + CaseStatus::Passed + } else { + CaseStatus::Failed + }; + + let case_result = CaseResult { + id: case.id.clone(), + status, + command_count: Some(command_count), + input_tokens: None, + output_tokens: None, + check_results, + error_message: None, + }; + + (run_output, case_result) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_case_run_options_builder() { + let opts = CaseRunOptions { + agent_key: "codex".to_string(), + model: Some("gpt-4".to_string()), + project_root: PathBuf::from("/tmp"), + timeout_seconds: 300, + }; + assert_eq!(opts.agent_key, "codex"); + assert_eq!(opts.model, Some("gpt-4".to_string())); + } + + #[test] + fn test_runner_error_display() { + let err = RunnerError::AgentUnavailable("codex".to_string()); + assert!(err.to_string().contains("codex")); + assert!(err.to_string().contains("EVAL_AGENT_UNAVAILABLE")); + } +} diff --git a/src/eval/suite.rs b/src/eval/suite.rs new file mode 100644 index 00000000..8d4df581 --- /dev/null +++ b/src/eval/suite.rs @@ -0,0 +1,277 @@ +//! Eval suite loading and CSV parsing + +use std::path::{Path, PathBuf}; +use thiserror::Error; + +/// An individual eval case definition +#[derive(Debug, Clone)] +pub struct EvalCase { + /// Unique case identifier (alphanumeric + hyphens) + pub id: String, + /// Prompt to send to the agent + pub prompt: String, + /// Whether the skill should trigger (documentation-only; checks.toml is authoritative for pass/fail) + pub should_trigger: bool, + /// Tags for filtering + pub tags: Vec, + /// Optional workspace subdirectory (relative to skill project root) + pub workspace_subdir: Option, +} + +/// A collection of eval cases +#[derive(Debug, Default)] +pub struct EvalSuite { + pub cases: Vec, +} + +impl EvalSuite { + pub fn new(cases: Vec) -> Self { + Self { cases } + } + + /// Filter cases by ID + pub fn filter_by_id(&self, id: &str) -> EvalSuite { + EvalSuite { + cases: self.cases.iter().filter(|c| c.id == id).cloned().collect(), + } + } + + /// Filter cases by tag + pub fn filter_by_tag(&self, tag: &str) -> EvalSuite { + EvalSuite { + cases: self + .cases + .iter() + .filter(|c| c.tags.iter().any(|t| t == tag)) + .cloned() + .collect(), + } + } +} + +/// Errors that can occur when loading an eval suite +#[derive(Debug, Error)] +pub enum SuiteError { + #[error("EVAL_PROMPTS_NOT_FOUND: Prompts CSV file not found: {0}")] + PromptsNotFound(PathBuf), + #[error("EVAL_INVALID_CSV: {0}")] + InvalidCsv(String), + #[error("EVAL_INVALID_CSV: IO error: {0}")] + Io(#[from] std::io::Error), +} + +/// Load an eval suite from a CSV file +/// +/// Expected CSV columns: id,prompt,should_trigger,tags,workspace_subdir +pub fn load_suite(prompts_path: &Path) -> Result { + if !prompts_path.exists() { + return Err(SuiteError::PromptsNotFound(prompts_path.to_path_buf())); + } + + let content = std::fs::read_to_string(prompts_path)?; + parse_prompts_csv(&content) +} + +/// Parse prompts CSV content +fn parse_prompts_csv(content: &str) -> Result { + let mut lines = content.lines(); + + // Parse header + let header = lines + .next() + .ok_or_else(|| SuiteError::InvalidCsv("CSV is empty".to_string()))?; + let headers: Vec = parse_csv_line(header); + + // Find column indices + let id_idx = find_col(&headers, "id")?; + let prompt_idx = find_col(&headers, "prompt")?; + let should_trigger_idx = find_col(&headers, "should_trigger")?; + let tags_idx = headers.iter().position(|h| h.trim() == "tags"); + let workspace_subdir_idx = headers.iter().position(|h| h.trim() == "workspace_subdir"); + + let mut cases = Vec::new(); + + for (line_num, line) in lines.enumerate() { + let line = line.trim(); + if line.is_empty() { + continue; + } + + let cols = parse_csv_line(line); + + let id = cols + .get(id_idx) + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .ok_or_else(|| { + SuiteError::InvalidCsv(format!("Missing id at line {}", line_num + 2)) + })?; + + let prompt = cols + .get(prompt_idx) + .map(|s| s.trim_matches('"').trim().to_string()) + .ok_or_else(|| { + SuiteError::InvalidCsv(format!("Missing prompt at line {}", line_num + 2)) + })?; + + let should_trigger_str = cols + .get(should_trigger_idx) + .map(|s| s.trim().to_lowercase()) + .unwrap_or_else(|| "false".to_string()); + let should_trigger = should_trigger_str == "true" || should_trigger_str == "1"; + + let tags = if let Some(idx) = tags_idx { + cols.get(idx) + .map(|s| { + s.trim() + .trim_matches('"') + .split(',') + .map(|t| t.trim().to_string()) + .filter(|t| !t.is_empty()) + .collect() + }) + .unwrap_or_default() + } else { + vec![] + }; + + let workspace_subdir = if let Some(idx) = workspace_subdir_idx { + cols.get(idx).and_then(|s| { + let s = s.trim(); + if s.is_empty() { + None + } else { + Some(PathBuf::from(s)) + } + }) + } else { + None + }; + + cases.push(EvalCase { + id, + prompt, + should_trigger, + tags, + workspace_subdir, + }); + } + + Ok(EvalSuite::new(cases)) +} + +fn find_col(headers: &[String], name: &str) -> Result { + headers + .iter() + .position(|h| h.trim() == name) + .ok_or_else(|| SuiteError::InvalidCsv(format!("Missing required column: {}", name))) +} + +/// Simple CSV line parser that handles quoted fields +fn parse_csv_line(line: &str) -> Vec { + let mut fields = Vec::new(); + let mut current = String::new(); + let mut in_quotes = false; + let mut chars = line.chars().peekable(); + + while let Some(ch) = chars.next() { + match ch { + '"' => { + if in_quotes { + // Check for escaped quote "" + if chars.peek() == Some(&'"') { + chars.next(); + current.push('"'); + } else { + in_quotes = false; + } + } else { + in_quotes = true; + } + } + ',' if !in_quotes => { + fields.push(current.clone()); + current.clear(); + } + _ => { + current.push(ch); + } + } + } + fields.push(current); + fields +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_prompts_csv_basic() { + let csv = "id,prompt,should_trigger,tags,workspace_subdir\n\ + test-1,\"Do something\",true,\"basic\",\n\ + test-2,\"Do nothing\",false,\"\",\n"; + let suite = parse_prompts_csv(csv).unwrap(); + assert_eq!(suite.cases.len(), 2); + assert_eq!(suite.cases[0].id, "test-1"); + assert!(suite.cases[0].should_trigger); + assert_eq!(suite.cases[1].id, "test-2"); + assert!(!suite.cases[1].should_trigger); + } + + #[test] + fn test_parse_prompts_csv_missing_required_col() { + let csv = "id,prompt\ntest-1,hello\n"; + let result = parse_prompts_csv(csv); + assert!(result.is_err()); + assert!(result.unwrap_err().to_string().contains("should_trigger")); + } + + #[test] + fn test_filter_by_id() { + let cases = vec![ + EvalCase { + id: "a".to_string(), + prompt: "p1".to_string(), + should_trigger: true, + tags: vec![], + workspace_subdir: None, + }, + EvalCase { + id: "b".to_string(), + prompt: "p2".to_string(), + should_trigger: false, + tags: vec![], + workspace_subdir: None, + }, + ]; + let suite = EvalSuite::new(cases); + let filtered = suite.filter_by_id("a"); + assert_eq!(filtered.cases.len(), 1); + assert_eq!(filtered.cases[0].id, "a"); + } + + #[test] + fn test_filter_by_tag() { + let cases = vec![ + EvalCase { + id: "a".to_string(), + prompt: "p1".to_string(), + should_trigger: true, + tags: vec!["foo".to_string(), "bar".to_string()], + workspace_subdir: None, + }, + EvalCase { + id: "b".to_string(), + prompt: "p2".to_string(), + should_trigger: false, + tags: vec!["baz".to_string()], + workspace_subdir: None, + }, + ]; + let suite = EvalSuite::new(cases); + let filtered = suite.filter_by_tag("foo"); + assert_eq!(filtered.cases.len(), 1); + assert_eq!(filtered.cases[0].id, "a"); + } +} diff --git a/src/eval/trace.rs b/src/eval/trace.rs new file mode 100644 index 00000000..ef67d6b0 --- /dev/null +++ b/src/eval/trace.rs @@ -0,0 +1,101 @@ +//! Trace event types for eval case execution + +use serde::{Deserialize, Serialize}; + +/// A single line in a trace.jsonl file +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TraceEvent { + /// Sequence number (0-based) + pub seq: usize, + /// Event payload + pub payload: TracePayload, +} + +/// Payload of a trace event +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum TracePayload { + /// A raw JSON line from the agent + RawJson { data: serde_json::Value }, + /// A raw text line from stdout + RawLine { line: String }, + /// A raw bytes chunk (base64-encoded) + RawBytes { b64: String }, + /// Execution error + Error { message: String }, + /// Case timed out + Timeout, +} + +/// Convert raw stdout lines to trace events +pub fn stdout_to_trace(stdout: &[u8]) -> Vec { + let text = String::from_utf8_lossy(stdout); + let mut events = Vec::new(); + + for (seq, line) in text.lines().enumerate() { + let trimmed = line.trim(); + if trimmed.is_empty() { + continue; + } + + // Try to parse as JSON first + let payload = if let Ok(value) = serde_json::from_str::(trimmed) { + TracePayload::RawJson { data: value } + } else { + TracePayload::RawLine { + line: line.to_string(), + } + }; + + events.push(TraceEvent { seq, payload }); + } + + events +} + +/// Serialize trace events to JSONL format +pub fn trace_to_jsonl(events: &[TraceEvent]) -> String { + events + .iter() + .filter_map(|e| serde_json::to_string(e).ok()) + .collect::>() + .join("\n") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_stdout_to_trace_text_lines() { + let stdout = b"hello world\nfoo bar\n"; + let events = stdout_to_trace(stdout); + assert_eq!(events.len(), 2); + assert_eq!(events[0].seq, 0); + assert!( + matches!(&events[0].payload, TracePayload::RawLine { line } if line == "hello world") + ); + } + + #[test] + fn test_stdout_to_trace_json_lines() { + let stdout = b"{\"key\": \"value\"}\nplain line\n"; + let events = stdout_to_trace(stdout); + assert_eq!(events.len(), 2); + assert!(matches!(&events[0].payload, TracePayload::RawJson { .. })); + assert!(matches!(&events[1].payload, TracePayload::RawLine { .. })); + } + + #[test] + fn test_trace_to_jsonl() { + let events = vec![TraceEvent { + seq: 0, + payload: TracePayload::RawLine { + line: "test".to_string(), + }, + }]; + let jsonl = trace_to_jsonl(&events); + assert!(jsonl.contains("\"seq\":0")); + assert!(jsonl.contains("raw_line")); + } +} diff --git a/src/lib.rs b/src/lib.rs index 6eef1a09..4dc9dc85 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -49,6 +49,7 @@ //! ``` pub mod core; +pub mod eval; pub mod events; pub mod execution; pub mod http; diff --git a/tests/cli/eval_tests.rs b/tests/cli/eval_tests.rs new file mode 100644 index 00000000..26241a82 --- /dev/null +++ b/tests/cli/eval_tests.rs @@ -0,0 +1,213 @@ +//! CLI integration tests for eval commands + +#![allow(clippy::all, clippy::unwrap_used, clippy::expect_used)] + +use super::snapshot_helpers::{ + assert_snapshot_with_settings, cli_snapshot_settings, run_fastskill_command, +}; + +#[test] +fn test_eval_help() { + let result = run_fastskill_command(&["eval", "--help"], None); + assert!(result.success); + assert_snapshot_with_settings("eval_help", &result.stdout, &cli_snapshot_settings()); +} + +#[test] +fn test_eval_validate_help() { + let result = run_fastskill_command(&["eval", "validate", "--help"], None); + assert!(result.success); + assert_snapshot_with_settings( + "eval_validate_help", + &result.stdout, + &cli_snapshot_settings(), + ); +} + +#[test] +fn test_eval_run_help() { + let result = run_fastskill_command(&["eval", "run", "--help"], None); + assert!(result.success); + assert_snapshot_with_settings("eval_run_help", &result.stdout, &cli_snapshot_settings()); +} + +#[test] +fn test_eval_report_help() { + let result = run_fastskill_command(&["eval", "report", "--help"], None); + assert!(result.success); + assert_snapshot_with_settings("eval_report_help", &result.stdout, &cli_snapshot_settings()); +} + +#[test] +fn test_eval_score_help() { + let result = run_fastskill_command(&["eval", "score", "--help"], None); + assert!(result.success); + assert_snapshot_with_settings("eval_score_help", &result.stdout, &cli_snapshot_settings()); +} + +#[test] +fn test_eval_run_requires_agent() { + let result = run_fastskill_command(&["eval", "run", "--output-dir", "/tmp/evals"], None); + assert!(!result.success); + let combined = format!("{}{}", result.stdout, result.stderr); + assert!( + combined.contains("--agent") || combined.contains("agent"), + "Expected error about missing --agent, got: {}", + combined + ); +} + +#[test] +fn test_eval_run_requires_output_dir() { + let result = run_fastskill_command(&["eval", "run", "--agent", "codex"], None); + assert!(!result.success); + let combined = format!("{}{}", result.stdout, result.stderr); + assert!( + combined.contains("--output-dir") + || combined.contains("output-dir") + || combined.contains("output_dir"), + "Expected error about missing --output-dir, got: {}", + combined + ); +} + +#[test] +fn test_eval_run_rejects_unsupported_agent() { + let result = run_fastskill_command( + &[ + "eval", + "run", + "--agent", + "unsupported-agent-xyz", + "--output-dir", + "/tmp/evals", + ], + None, + ); + assert!(!result.success); + let combined = format!("{}{}", result.stdout, result.stderr); + assert!( + combined.contains("unsupported-agent-xyz") || combined.contains("not a supported agent"), + "Expected error about unsupported agent, got: {}", + combined + ); +} + +#[test] +fn test_eval_validate_no_project_file() { + use tempfile::TempDir; + let dir = TempDir::new().unwrap(); + let result = run_fastskill_command(&["eval", "validate"], Some(dir.path())); + assert!(!result.success); + let combined = format!("{}{}", result.stdout, result.stderr); + assert!( + combined.contains("skill-project.toml") || combined.contains("EVAL_CONFIG_MISSING"), + "Expected error about missing skill-project.toml, got: {}", + combined + ); +} + +#[test] +fn test_eval_validate_no_eval_config() { + use std::fs; + use tempfile::TempDir; + + let dir = TempDir::new().unwrap(); + fs::write( + dir.path().join("skill-project.toml"), + "[metadata]\nid = \"test-skill\"\n", + ) + .unwrap(); + + let result = run_fastskill_command(&["eval", "validate"], Some(dir.path())); + assert!(!result.success); + let combined = format!("{}{}", result.stdout, result.stderr); + assert!( + combined.contains("EVAL_CONFIG_MISSING") || combined.contains("eval"), + "Expected EVAL_CONFIG_MISSING error, got: {}", + combined + ); +} + +#[test] +fn test_eval_validate_with_eval_config() { + use std::fs; + use tempfile::TempDir; + + let dir = TempDir::new().unwrap(); + + // Create evals directory and prompts.csv + let evals_dir = dir.path().join("evals"); + fs::create_dir_all(&evals_dir).unwrap(); + fs::write( + evals_dir.join("prompts.csv"), + "id,prompt,should_trigger,tags,workspace_subdir\ntest-1,\"Test prompt\",true,\"basic\",\n", + ) + .unwrap(); + + // Create SKILL.md so it's detected as skill context + fs::write(dir.path().join("SKILL.md"), "# Test Skill\n").unwrap(); + + // Create skill-project.toml with eval config + fs::write( + dir.path().join("skill-project.toml"), + "[metadata]\nid = \"test-skill\"\n\n[tool.fastskill.eval]\nprompts = \"evals/prompts.csv\"\ntimeout_seconds = 300\nfail_on_missing_agent = false\n", + ) + .unwrap(); + + let result = run_fastskill_command(&["eval", "validate"], Some(dir.path())); + assert!( + result.success, + "Expected eval validate to succeed, got stdout: {}, stderr: {}", + result.stdout, result.stderr + ); + assert!( + result.stdout.contains("valid") || result.stdout.contains("prompts"), + "Expected valid output, got: {}", + result.stdout + ); +} + +#[test] +fn test_eval_report_requires_run_dir() { + let result = run_fastskill_command(&["eval", "report"], None); + assert!(!result.success); + let combined = format!("{}{}", result.stdout, result.stderr); + assert!( + combined.contains("--run-dir") || combined.contains("run-dir"), + "Expected error about missing --run-dir, got: {}", + combined + ); +} + +#[test] +fn test_eval_score_requires_run_dir() { + let result = run_fastskill_command(&["eval", "score"], None); + assert!(!result.success); + let combined = format!("{}{}", result.stdout, result.stderr); + assert!( + combined.contains("--run-dir") || combined.contains("run-dir"), + "Expected error about missing --run-dir, got: {}", + combined + ); +} + +#[test] +fn test_eval_report_nonexistent_run_dir() { + let result = run_fastskill_command( + &[ + "eval", + "report", + "--run-dir", + "/tmp/nonexistent-fastskill-eval-dir-xyz123", + ], + None, + ); + assert!(!result.success); + let combined = format!("{}{}", result.stdout, result.stderr); + assert!( + combined.contains("EVAL_ARTIFACTS_CORRUPT") || combined.contains("not exist"), + "Expected error about nonexistent dir, got: {}", + combined + ); +} diff --git a/tests/cli/mod.rs b/tests/cli/mod.rs index 50d76de7..cda88146 100644 --- a/tests/cli/mod.rs +++ b/tests/cli/mod.rs @@ -7,6 +7,7 @@ pub mod add_tests; pub mod analyze_cluster_tests; pub mod auth_e2e_tests; pub mod config_tests; +pub mod eval_tests; pub mod example_tests; pub mod help_tests; pub mod init_tests; diff --git a/tests/cli/repository_tests.rs b/tests/cli/repository_tests.rs index 820a2f65..be64a6df 100644 --- a/tests/cli/repository_tests.rs +++ b/tests/cli/repository_tests.rs @@ -85,6 +85,7 @@ web-scraper = "1.0.0" server: None, install_depth: 5, skip_transitive: false, + eval: None, }); let repos = fastskill_config.repositories.get_or_insert_with(Vec::new); diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_directory_walking.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_directory_walking.snap index b5125c60..5d169eea 100644 --- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_directory_walking.snap +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_directory_walking.snap @@ -24,6 +24,7 @@ Commands: analyze Diagnostic and analysis commands auth Manage authentication for registries disable Disable skills by ID + eval Evaluation commands for skill quality assurance init Initialize skill-project.toml in current skill directory install Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow) list List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info) diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_finds_skills_in_current_dir.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_finds_skills_in_current_dir.snap index b5125c60..5d169eea 100644 --- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_finds_skills_in_current_dir.snap +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_finds_skills_in_current_dir.snap @@ -24,6 +24,7 @@ Commands: analyze Diagnostic and analysis commands auth Manage authentication for registries disable Disable skills by ID + eval Evaluation commands for skill quality assurance init Initialize skill-project.toml in current skill directory install Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow) list List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info) diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_invalid_repositories_path.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_invalid_repositories_path.snap index b5125c60..5d169eea 100644 --- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_invalid_repositories_path.snap +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_invalid_repositories_path.snap @@ -24,6 +24,7 @@ Commands: analyze Diagnostic and analysis commands auth Manage authentication for registries disable Disable skills by ID + eval Evaluation commands for skill quality assurance init Initialize skill-project.toml in current skill directory install Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow) list List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info) diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_repositories_path_argument.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_repositories_path_argument.snap index b5125c60..5d169eea 100644 --- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_repositories_path_argument.snap +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_repositories_path_argument.snap @@ -24,6 +24,7 @@ Commands: analyze Diagnostic and analysis commands auth Manage authentication for registries disable Disable skills by ID + eval Evaluation commands for skill quality assurance init Initialize skill-project.toml in current skill directory install Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow) list List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info) diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_with_env_var.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_with_env_var.snap index b5125c60..5d169eea 100644 --- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_with_env_var.snap +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_with_env_var.snap @@ -24,6 +24,7 @@ Commands: analyze Diagnostic and analysis commands auth Manage authentication for registries disable Disable skills by ID + eval Evaluation commands for skill quality assurance init Initialize skill-project.toml in current skill directory install Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow) list List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info) diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_help.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_help.snap new file mode 100644 index 00000000..5b3baaaf --- /dev/null +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_help.snap @@ -0,0 +1,21 @@ +--- +source: tests/cli/snapshot_helpers.rs +expression: normalized +--- +Evaluation commands for skill quality assurance + +Usage: fastskill eval + +Commands: + validate Validate eval configuration and files + run Run eval cases against an agent + report Show a report for a completed eval run + score Re-score saved eval artifacts without running the agent again + help Print this message or the help of the given subcommand(s) + +Options: + -h, --help Print help + +Examples: + fastskill eval validate + fastskill eval run --agent codex --output-dir [TEMP_DIR] diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_report_help.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_report_help.snap new file mode 100644 index 00000000..ba01eee7 --- /dev/null +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_report_help.snap @@ -0,0 +1,15 @@ +--- +source: tests/cli/snapshot_helpers.rs +expression: normalized +--- +Show a report for a completed eval run + +Usage: fastskill eval report [OPTIONS] --run-dir + +Options: + --run-dir Path to the specific run directory + --json Output as JSON + -h, --help Print help + +Examples: + fastskill eval report --run-dir [TEMP_DIR] diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_run_help.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_run_help.snap new file mode 100644 index 00000000..ef5354c5 --- /dev/null +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_run_help.snap @@ -0,0 +1,20 @@ +--- +source: tests/cli/snapshot_helpers.rs +expression: normalized +--- +Run eval cases against an agent + +Usage: fastskill eval run [OPTIONS] --agent --output-dir + +Options: + --agent Agent to use for execution (required) + --output-dir Output directory for artifacts (required) + --model Optional model override forwarded to the agent + --case Filter: run only the case with this ID + --tag Filter: run only cases with this tag + --json Output as JSON + --no-fail Do not fail with non-zero exit code on suite failure + -h, --help Print help + +Examples: + fastskill eval run --agent codex --output-dir [TEMP_DIR] diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_score_help.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_score_help.snap new file mode 100644 index 00000000..101a2950 --- /dev/null +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_score_help.snap @@ -0,0 +1,16 @@ +--- +source: tests/cli/snapshot_helpers.rs +expression: normalized +--- +Re-score saved eval artifacts without running the agent again + +Usage: fastskill eval score [OPTIONS] --run-dir + +Options: + --run-dir Path to the run directory to re-score + --json Output as JSON + --no-fail Do not fail with non-zero exit code on suite failure + -h, --help Print help + +Examples: + fastskill eval score --run-dir [TEMP_DIR] diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_validate_help.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_validate_help.snap new file mode 100644 index 00000000..b6dbac3c --- /dev/null +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_validate_help.snap @@ -0,0 +1,16 @@ +--- +source: tests/cli/snapshot_helpers.rs +expression: normalized +--- +Validate eval configuration and files + +Usage: fastskill eval validate [OPTIONS] + +Options: + --agent Check agent availability for the specified agent key + --json Output as JSON + -h, --help Print help + +Examples: + fastskill eval validate + fastskill eval validate --agent codex diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_flag.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_flag.snap index b5125c60..5d169eea 100644 --- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_flag.snap +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_flag.snap @@ -24,6 +24,7 @@ Commands: analyze Diagnostic and analysis commands auth Manage authentication for registries disable Disable skills by ID + eval Evaluation commands for skill quality assurance init Initialize skill-project.toml in current skill directory install Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow) list List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info) diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_no_args.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_no_args.snap index 89ab80e9..37023f23 100644 --- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_no_args.snap +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_no_args.snap @@ -11,6 +11,7 @@ Commands: analyze Diagnostic and analysis commands auth Manage authentication for registries disable Disable skills by ID + eval Evaluation commands for skill quality assurance init Initialize skill-project.toml in current skill directory install Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow) list List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info) @@ -47,6 +48,7 @@ Commands: analyze Diagnostic and analysis commands auth Manage authentication for registries disable Disable skills by ID + eval Evaluation commands for skill quality assurance init Initialize skill-project.toml in current skill directory install Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow) list List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info) diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_short_flag.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_short_flag.snap index a8ba135c..611645cf 100644 --- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_short_flag.snap +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_short_flag.snap @@ -11,6 +11,7 @@ Commands: analyze Diagnostic and analysis commands auth Manage authentication for registries disable Disable skills by ID + eval Evaluation commands for skill quality assurance init Initialize skill-project.toml in current skill directory install Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow) list List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info) diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__serve_invalid_port.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__serve_invalid_port.snap index 7b21deef..f78dbd2d 100644 --- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__serve_invalid_port.snap +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__serve_invalid_port.snap @@ -11,6 +11,7 @@ Commands: analyze Diagnostic and analysis commands auth Manage authentication for registries disable Disable skills by ID + eval Evaluation commands for skill quality assurance init Initialize skill-project.toml in current skill directory install Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow) list List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info) diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__sources_add_missing_url.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__sources_add_missing_url.snap index c6bbaf4d..cdf953d2 100644 --- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__sources_add_missing_url.snap +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__sources_add_missing_url.snap @@ -11,6 +11,7 @@ Commands: analyze Diagnostic and analysis commands auth Manage authentication for registries disable Disable skills by ID + eval Evaluation commands for skill quality assurance init Initialize skill-project.toml in current skill directory install Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow) list List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info) diff --git a/tests/core_manifest_tests.rs b/tests/core_manifest_tests.rs index f40d939a..18f2cf3e 100644 --- a/tests/core_manifest_tests.rs +++ b/tests/core_manifest_tests.rs @@ -170,6 +170,7 @@ fn test_validation_project_level() { server: None, install_depth: 5, skip_transitive: false, + eval: None, }), }), }; From f505cd0008cd6e27cb0b3a0a358ea2b79fd5d43b Mon Sep 17 00:00:00 2001 From: System Two Date: Sun, 5 Apr 2026 18:01:20 +0000 Subject: [PATCH 2/4] chore(develop): 034-eval --- Cargo.lock | 25 ++--- src/cli/commands/eval/report.rs | 15 ++- src/cli/commands/eval/run.rs | 51 ++++++--- src/cli/commands/eval/score.rs | 15 ++- src/cli/commands/eval/validate.rs | 15 ++- src/eval/runner.rs | 102 +++++++++++++----- src/eval/trace.rs | 26 +++++ ...i__snapshot_helpers__eval_report_help.snap | 3 +- ..._cli__snapshot_helpers__eval_run_help.snap | 3 +- ...li__snapshot_helpers__eval_score_help.snap | 3 +- ..._snapshot_helpers__eval_validate_help.snap | 7 +- 11 files changed, 192 insertions(+), 73 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bc8c4c12..2624d3a2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -42,12 +42,13 @@ dependencies = [ [[package]] name = "aikit-sdk" -version = "0.1.49" -source = "git+https://github.com/goaikit/aikit?branch=main#60577a3d43d1396b24e2879644f8cf5b93408a4f" +version = "0.1.50" +source = "git+https://github.com/goaikit/aikit?branch=main#524f98f1db7b3facbfb4c0d49267b7dc067ada9b" dependencies = [ "glob", "reqwest 0.12.28", "serde", + "serde_json", "tempfile", "toml", "walkdir", @@ -1315,7 +1316,7 @@ dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -1416,7 +1417,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -2081,7 +2082,7 @@ dependencies = [ "libc", "percent-encoding", "pin-project-lite", - "socket2 0.6.2", + "socket2 0.5.10", "tokio", "tower-service", "tracing", @@ -2650,7 +2651,7 @@ version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -2958,7 +2959,7 @@ dependencies = [ "quinn-udp", "rustc-hash", "rustls 0.23.36", - "socket2 0.6.2", + "socket2 0.5.10", "thiserror 2.0.18", "tokio", "tracing", @@ -2995,9 +2996,9 @@ dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2 0.6.2", + "socket2 0.5.10", "tracing", - "windows-sys 0.60.2", + "windows-sys 0.59.0", ] [[package]] @@ -3343,7 +3344,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -3856,7 +3857,7 @@ dependencies = [ "getrandom 0.3.4", "once_cell", "rustix", - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -4537,7 +4538,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.48.0", ] [[package]] diff --git a/src/cli/commands/eval/report.rs b/src/cli/commands/eval/report.rs index c7877b0b..a8dda912 100644 --- a/src/cli/commands/eval/report.rs +++ b/src/cli/commands/eval/report.rs @@ -1,8 +1,10 @@ //! Eval report subcommand - artifact summary and formatting +use crate::cli::commands::common::validate_format_args; use crate::cli::error::{CliError, CliResult}; use clap::Args; use fastskill::eval::artifacts::read_summary; +use fastskill::OutputFormat; use std::path::PathBuf; /// Arguments for `fastskill eval report` @@ -16,13 +18,20 @@ pub struct ReportArgs { #[arg(long, required = true)] pub run_dir: PathBuf, - /// Output as JSON - #[arg(long)] + /// Output format: table, json, grid, xml (default: table) + #[arg(long, value_enum, help = "Output format: table, json, grid, xml")] + pub format: Option, + + /// Shorthand for --format json + #[arg(long, help = "Shorthand for --format json")] pub json: bool, } /// Execute the `eval report` command pub async fn execute_report(args: ReportArgs) -> CliResult<()> { + let format = validate_format_args(&args.format, args.json)?; + let use_json = format == OutputFormat::Json; + if !args.run_dir.exists() { return Err(CliError::Config(format!( "EVAL_ARTIFACTS_CORRUPT: Run directory does not exist: {}", @@ -37,7 +46,7 @@ pub async fn execute_report(args: ReportArgs) -> CliResult<()> { )) })?; - if args.json { + if use_json { println!( "{}", serde_json::to_string_pretty(&summary).unwrap_or_default() diff --git a/src/cli/commands/eval/run.rs b/src/cli/commands/eval/run.rs index 483f49fa..58621a80 100644 --- a/src/cli/commands/eval/run.rs +++ b/src/cli/commands/eval/run.rs @@ -1,5 +1,6 @@ //! Eval run subcommand - case execution orchestration +use crate::cli::commands::common::validate_format_args; use crate::cli::error::{CliError, CliResult}; use aikit_sdk::{is_agent_available, is_runnable, runnable_agents}; use chrono::Utc; @@ -12,7 +13,8 @@ use fastskill::eval::checks::load_checks; use fastskill::eval::config::resolve_eval_config; use fastskill::eval::runner::{run_eval_case, CaseRunOptions}; use fastskill::eval::suite::load_suite; -use fastskill::eval::trace::{stdout_to_trace, trace_to_jsonl}; +use fastskill::eval::trace::trace_to_jsonl; +use fastskill::OutputFormat; use std::env; use std::path::PathBuf; @@ -43,8 +45,12 @@ pub struct RunArgs { #[arg(long)] pub tag: Option, - /// Output as JSON - #[arg(long)] + /// Output format: table, json, grid, xml (default: table) + #[arg(long, value_enum, help = "Output format: table, json, grid, xml")] + pub format: Option, + + /// Shorthand for --format json + #[arg(long, help = "Shorthand for --format json")] pub json: bool, /// Do not fail with non-zero exit code on suite failure @@ -66,6 +72,9 @@ fn validate_agent_key_for_run(s: &str) -> Result { /// Execute the `eval run` command pub async fn execute_run(args: RunArgs) -> CliResult<()> { + let format = validate_format_args(&args.format, args.json)?; + let use_json = format == OutputFormat::Json; + let current_dir = env::current_dir() .map_err(|e| CliError::Config(format!("Failed to get current directory: {}", e)))?; @@ -144,22 +153,26 @@ pub async fn execute_run(args: RunArgs) -> CliResult<()> { timeout_seconds: eval_config.timeout_seconds, }; - eprintln!( - "Running {} eval case(s) with agent '{}'...", - suite.cases.len(), - args.agent - ); + if !use_json { + eprintln!( + "Running {} eval case(s) with agent '{}'...", + suite.cases.len(), + args.agent + ); + } let mut case_results = Vec::new(); let mut case_summaries = Vec::new(); for case in &suite.cases { - eprintln!(" Running case '{}'...", case.id); + if !use_json { + eprintln!(" Running case '{}'...", case.id); + } let (run_output, case_result) = run_eval_case(case, &run_opts, &checks).await; - // Build trace - let trace_events = stdout_to_trace(&run_output.stdout); + // Build trace from stdout (since runner collects it) + let trace_events = fastskill::eval::trace::stdout_to_trace(&run_output.stdout); let trace_jsonl = trace_to_jsonl(&trace_events); // Write artifacts @@ -171,10 +184,12 @@ pub async fn execute_run(args: RunArgs) -> CliResult<()> { &trace_jsonl, &case_result, ) { - eprintln!( - " warning: failed to write artifacts for case '{}': {}", - case.id, e - ); + if !use_json { + eprintln!( + " warning: failed to write artifacts for case '{}': {}", + case.id, e + ); + } } let summary_entry = CaseSummary { @@ -217,10 +232,12 @@ pub async fn execute_run(args: RunArgs) -> CliResult<()> { // Write summary if let Err(e) = write_summary(&run_dir, &summary) { - eprintln!("warning: failed to write summary.json: {}", e); + if !use_json { + eprintln!("warning: failed to write summary.json: {}", e); + } } - if args.json { + if use_json { println!( "{}", serde_json::to_string_pretty(&summary).unwrap_or_default() diff --git a/src/cli/commands/eval/score.rs b/src/cli/commands/eval/score.rs index 6afa3911..6d9c84d1 100644 --- a/src/cli/commands/eval/score.rs +++ b/src/cli/commands/eval/score.rs @@ -1,9 +1,11 @@ //! Eval score subcommand - offline re-scoring from saved artifacts +use crate::cli::commands::common::validate_format_args; use crate::cli::error::{CliError, CliResult}; use clap::Args; use fastskill::eval::artifacts::{read_summary, write_summary, CaseStatus}; use fastskill::eval::checks::load_checks; +use fastskill::OutputFormat; use std::path::PathBuf; /// Arguments for `fastskill eval score` @@ -17,8 +19,12 @@ pub struct ScoreArgs { #[arg(long, required = true)] pub run_dir: PathBuf, - /// Output as JSON - #[arg(long)] + /// Output format: table, json, grid, xml (default: table) + #[arg(long, value_enum, help = "Output format: table, json, grid, xml")] + pub format: Option, + + /// Shorthand for --format json + #[arg(long, help = "Shorthand for --format json")] pub json: bool, /// Do not fail with non-zero exit code on suite failure @@ -28,6 +34,9 @@ pub struct ScoreArgs { /// Execute the `eval score` command pub async fn execute_score(args: ScoreArgs) -> CliResult<()> { + let format = validate_format_args(&args.format, args.json)?; + let use_json = format == OutputFormat::Json; + if !args.run_dir.exists() { return Err(CliError::Config(format!( "EVAL_ARTIFACTS_CORRUPT: Run directory does not exist: {}", @@ -108,7 +117,7 @@ pub async fn execute_score(args: ScoreArgs) -> CliResult<()> { write_summary(&args.run_dir, &summary) .map_err(|e| CliError::Config(format!("Failed to write updated summary.json: {}", e)))?; - if args.json { + if use_json { println!( "{}", serde_json::to_string_pretty(&summary).unwrap_or_default() diff --git a/src/cli/commands/eval/validate.rs b/src/cli/commands/eval/validate.rs index 54e52933..3c553c45 100644 --- a/src/cli/commands/eval/validate.rs +++ b/src/cli/commands/eval/validate.rs @@ -1,10 +1,12 @@ //! Eval validate subcommand - configuration and file validation +use crate::cli::commands::common::validate_format_args; use crate::cli::error::{CliError, CliResult}; use aikit_sdk::{is_agent_available, is_runnable, runnable_agents}; use clap::Args; use fastskill::core::project::resolve_project_file; use fastskill::eval::config::resolve_eval_config; +use fastskill::OutputFormat; use std::env; /// Arguments for `fastskill eval validate` @@ -18,8 +20,12 @@ pub struct ValidateArgs { #[arg(long, value_parser = validate_agent_key_parser)] pub agent: Option, - /// Output as JSON - #[arg(long)] + /// Output format: table, json, grid, xml (default: table) + #[arg(long, value_enum, help = "Output format: table, json, grid, xml")] + pub format: Option, + + /// Shorthand for --format json + #[arg(long, help = "Shorthand for --format json")] pub json: bool, } @@ -37,6 +43,9 @@ fn validate_agent_key_parser(s: &str) -> Result { /// Execute the `eval validate` command pub async fn execute_validate(args: ValidateArgs) -> CliResult<()> { + let format = validate_format_args(&args.format, args.json)?; + let use_json = format == OutputFormat::Json; + let current_dir = env::current_dir() .map_err(|e| CliError::Config(format!("Failed to get current directory: {}", e)))?; @@ -75,7 +84,7 @@ pub async fn execute_validate(args: ValidateArgs) -> CliResult<()> { } } - if args.json { + if use_json { let output = serde_json::json!({ "valid": true, "prompts_path": eval_config.prompts_path, diff --git a/src/eval/runner.rs b/src/eval/runner.rs index 24394f6c..236a611f 100644 --- a/src/eval/runner.rs +++ b/src/eval/runner.rs @@ -3,9 +3,10 @@ use crate::eval::artifacts::{CaseResult, CaseStatus}; use crate::eval::checks::{run_checks, CheckDefinition}; use crate::eval::suite::EvalCase; -use crate::eval::trace::{stdout_to_trace, trace_to_jsonl}; -use aikit_sdk::{run_agent, RunOptions}; +use crate::eval::trace::{agent_events_to_trace, trace_to_jsonl}; +use aikit_sdk::{run_agent_events, AgentEvent, RunOptions}; use std::path::PathBuf; +use std::time::Duration; use thiserror::Error; /// Options for running a single eval case @@ -41,10 +42,10 @@ pub enum RunnerError { ExecutionFailed(String), } -/// Run a single eval case using aikit_sdk::run_agent via spawn_blocking. +/// Run a single eval case using aikit_sdk::run_agent_events via spawn_blocking. /// /// This is the sole agent execution path per spec (acceptance criterion 29). -/// Uses spawn_blocking because run_agent is synchronous. +/// Uses spawn_blocking because run_agent_events is synchronous. pub async fn run_eval_case( case: &EvalCase, opts: &CaseRunOptions, @@ -53,6 +54,7 @@ pub async fn run_eval_case( let agent_key = opts.agent_key.clone(); let model = opts.model.clone(); let prompt = case.prompt.clone(); + let timeout_secs = opts.timeout_seconds; // Determine working directory let working_dir = match &case.workspace_subdir { @@ -61,42 +63,84 @@ pub async fn run_eval_case( }; // Build RunOptions using available aikit-sdk API - let run_opts = RunOptions { - model: model.clone(), - yolo: true, - stream: false, - }; + let run_opts = RunOptions::new() + .with_model(model.clone().unwrap_or_default()) + .with_yolo(true) + .with_stream(true); // Run agent in a blocking thread to avoid blocking the async runtime - let result = - tokio::task::spawn_blocking(move || run_agent(&agent_key, &prompt, run_opts)).await; + let timeout_duration = Duration::from_secs(timeout_secs); + let working_dir_clone = working_dir.clone(); + + let result = tokio::task::spawn_blocking(move || { + // Change to working directory before spawning the agent + let original_dir = std::env::current_dir().ok(); + let dir_changed = std::env::set_current_dir(&working_dir_clone).is_ok(); + + // Collect events during execution + let mut events: Vec = Vec::new(); + + let result = run_agent_events(&agent_key, &prompt, run_opts, |ev| { + events.push(ev.clone()); + }); + + // Restore original directory if we changed it + if dir_changed { + if let Some(orig) = original_dir { + let _ = std::env::set_current_dir(&orig); + } + } + + (result, events) + }); - let run_output = match result { - Ok(Ok(run_result)) => { + // Apply timeout wrapper + let timed_result = tokio::time::timeout(timeout_duration, result).await; + + let (run_output, trace_events) = match timed_result { + Ok(Ok((Ok(run_result), events))) => { let exit_code = run_result.exit_code(); - CaseRunOutput { + let output = CaseRunOutput { stdout: run_result.stdout, stderr: run_result.stderr, exit_code, timed_out: false, - } + }; + let trace = agent_events_to_trace(&events); + (output, trace) + } + Ok(Ok((Err(e), events))) => { + let trace = agent_events_to_trace(&events); + let output = CaseRunOutput { + stdout: vec![], + stderr: format!("Agent execution failed: {}", e).into_bytes(), + exit_code: None, + timed_out: false, + }; + (output, trace) + } + Ok(Err(e)) => { + let output = CaseRunOutput { + stdout: vec![], + stderr: format!("spawn_blocking failed: {}", e).into_bytes(), + exit_code: None, + timed_out: false, + }; + (output, vec![]) + } + Err(_) => { + // Timeout elapsed + let output = CaseRunOutput { + stdout: vec![], + stderr: format!("Case timed out after {}s", timeout_secs).into_bytes(), + exit_code: None, + timed_out: true, + }; + (output, vec![]) } - Ok(Err(e)) => CaseRunOutput { - stdout: vec![], - stderr: format!("Agent execution failed: {}", e).into_bytes(), - exit_code: None, - timed_out: false, - }, - Err(e) => CaseRunOutput { - stdout: vec![], - stderr: format!("spawn_blocking failed: {}", e).into_bytes(), - exit_code: None, - timed_out: false, - }, }; - // Build trace from stdout - let trace_events = stdout_to_trace(&run_output.stdout); + // Build trace from collected events let trace_jsonl = trace_to_jsonl(&trace_events); let stdout_str = String::from_utf8_lossy(&run_output.stdout).to_string(); diff --git a/src/eval/trace.rs b/src/eval/trace.rs index ef67d6b0..e028b1f8 100644 --- a/src/eval/trace.rs +++ b/src/eval/trace.rs @@ -1,5 +1,6 @@ //! Trace event types for eval case execution +use aikit_sdk::{AgentEvent, AgentEventPayload}; use serde::{Deserialize, Serialize}; /// A single line in a trace.jsonl file @@ -27,6 +28,31 @@ pub enum TracePayload { Timeout, } +/// Convert aikit-sdk AgentEvent to internal TraceEvent +pub fn agent_events_to_trace(events: &[AgentEvent]) -> Vec { + events + .iter() + .map(|ev| { + let payload = match &ev.payload { + AgentEventPayload::JsonLine(value) => TracePayload::RawJson { + data: value.clone(), + }, + AgentEventPayload::RawLine(line) => TracePayload::RawLine { line: line.clone() }, + AgentEventPayload::RawBytes(bytes) => { + use base64::{engine::general_purpose::STANDARD, Engine as _}; + TracePayload::RawBytes { + b64: STANDARD.encode(bytes), + } + } + }; + TraceEvent { + seq: ev.seq as usize, + payload, + } + }) + .collect() +} + /// Convert raw stdout lines to trace events pub fn stdout_to_trace(stdout: &[u8]) -> Vec { let text = String::from_utf8_lossy(stdout); diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_report_help.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_report_help.snap index ba01eee7..3afaf53a 100644 --- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_report_help.snap +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_report_help.snap @@ -8,7 +8,8 @@ Usage: fastskill eval report [OPTIONS] --run-dir Options: --run-dir Path to the specific run directory - --json Output as JSON + --format Output format: table, json, grid, xml + --json Shorthand for --format json -h, --help Print help Examples: diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_run_help.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_run_help.snap index ef5354c5..4bf17752 100644 --- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_run_help.snap +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_run_help.snap @@ -12,7 +12,8 @@ Options: --model Optional model override forwarded to the agent --case Filter: run only the case with this ID --tag Filter: run only cases with this tag - --json Output as JSON + --format Output format: table, json, grid, xml + --json Shorthand for --format json --no-fail Do not fail with non-zero exit code on suite failure -h, --help Print help diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_score_help.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_score_help.snap index 101a2950..252d9195 100644 --- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_score_help.snap +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_score_help.snap @@ -8,7 +8,8 @@ Usage: fastskill eval score [OPTIONS] --run-dir Options: --run-dir Path to the run directory to re-score - --json Output as JSON + --format Output format: table, json, grid, xml + --json Shorthand for --format json --no-fail Do not fail with non-zero exit code on suite failure -h, --help Print help diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_validate_help.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_validate_help.snap index b6dbac3c..9f0f8903 100644 --- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_validate_help.snap +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_validate_help.snap @@ -7,9 +7,10 @@ Validate eval configuration and files Usage: fastskill eval validate [OPTIONS] Options: - --agent Check agent availability for the specified agent key - --json Output as JSON - -h, --help Print help + --agent Check agent availability for the specified agent key + --format Output format: table, json, grid, xml + --json Shorthand for --format json + -h, --help Print help Examples: fastskill eval validate From a64a1676ee101a83d219e88b25ec05ff77871e61 Mon Sep 17 00:00:00 2001 From: System Two Date: Sun, 5 Apr 2026 18:45:33 +0000 Subject: [PATCH 3/4] chore(develop): 034-eval --- src/eval/runner.rs | 78 ++++++++++++++++++++++++++++------------------ 1 file changed, 48 insertions(+), 30 deletions(-) diff --git a/src/eval/runner.rs b/src/eval/runner.rs index 236a611f..af13915f 100644 --- a/src/eval/runner.rs +++ b/src/eval/runner.rs @@ -3,7 +3,7 @@ use crate::eval::artifacts::{CaseResult, CaseStatus}; use crate::eval::checks::{run_checks, CheckDefinition}; use crate::eval::suite::EvalCase; -use crate::eval::trace::{agent_events_to_trace, trace_to_jsonl}; +use crate::eval::trace::{agent_events_to_trace, trace_to_jsonl, TraceEvent, TracePayload}; use aikit_sdk::{run_agent_events, AgentEvent, RunOptions}; use std::path::PathBuf; use std::time::Duration; @@ -42,6 +42,12 @@ pub enum RunnerError { ExecutionFailed(String), } +/// Result of agent execution within spawn_blocking +struct AgentExecutionResult { + result: Result, + events: Vec, +} + /// Run a single eval case using aikit_sdk::run_agent_events via spawn_blocking. /// /// This is the sole agent execution path per spec (acceptance criterion 29). @@ -62,18 +68,18 @@ pub async fn run_eval_case( None => opts.project_root.clone(), }; - // Build RunOptions using available aikit-sdk API + // Build RunOptions - note: aikit-sdk doesn't have with_timeout/with_current_dir yet let run_opts = RunOptions::new() .with_model(model.clone().unwrap_or_default()) .with_yolo(true) .with_stream(true); - // Run agent in a blocking thread to avoid blocking the async runtime - let timeout_duration = Duration::from_secs(timeout_secs); let working_dir_clone = working_dir.clone(); - let result = tokio::task::spawn_blocking(move || { - // Change to working directory before spawning the agent + // Run agent in a blocking thread to avoid blocking the async runtime + // We implement timeout ourselves using tokio::time::timeout + let spawn_result = tokio::task::spawn_blocking(move || { + // Change to working directory before spawning agent let original_dir = std::env::current_dir().ok(); let dir_changed = std::env::set_current_dir(&working_dir_clone).is_ok(); @@ -91,35 +97,42 @@ pub async fn run_eval_case( } } - (result, events) + AgentExecutionResult { result, events } }); // Apply timeout wrapper - let timed_result = tokio::time::timeout(timeout_duration, result).await; + let timeout_duration = Duration::from_secs(timeout_secs); + let timed_result = tokio::time::timeout(timeout_duration, spawn_result).await; let (run_output, trace_events) = match timed_result { - Ok(Ok((Ok(run_result), events))) => { - let exit_code = run_result.exit_code(); - let output = CaseRunOutput { - stdout: run_result.stdout, - stderr: run_result.stderr, - exit_code, - timed_out: false, - }; - let trace = agent_events_to_trace(&events); - (output, trace) - } - Ok(Ok((Err(e), events))) => { - let trace = agent_events_to_trace(&events); - let output = CaseRunOutput { - stdout: vec![], - stderr: format!("Agent execution failed: {}", e).into_bytes(), - exit_code: None, - timed_out: false, - }; - (output, trace) + Ok(Ok(exec_result)) => { + // spawn_blocking succeeded, check agent result + match exec_result.result { + Ok(run_result) => { + let exit_code = run_result.exit_code(); + let output = CaseRunOutput { + stdout: run_result.stdout, + stderr: run_result.stderr, + exit_code, + timed_out: false, + }; + let trace = agent_events_to_trace(&exec_result.events); + (output, trace) + } + Err(e) => { + let trace = agent_events_to_trace(&exec_result.events); + let output = CaseRunOutput { + stdout: vec![], + stderr: format!("Agent execution failed: {}", e).into_bytes(), + exit_code: None, + timed_out: false, + }; + (output, trace) + } + } } Ok(Err(e)) => { + // spawn_blocking itself failed let output = CaseRunOutput { stdout: vec![], stderr: format!("spawn_blocking failed: {}", e).into_bytes(), @@ -136,7 +149,12 @@ pub async fn run_eval_case( exit_code: None, timed_out: true, }; - (output, vec![]) + // Add a timeout marker to the trace + let timeout_event = TraceEvent { + seq: 0, + payload: TracePayload::Timeout, + }; + (output, vec![timeout_event]) } }; @@ -147,7 +165,7 @@ pub async fn run_eval_case( // Count raw_json events let command_count = trace_events .iter() - .filter(|e| matches!(&e.payload, crate::eval::trace::TracePayload::RawJson { .. })) + .filter(|e| matches!(&e.payload, TracePayload::RawJson { .. })) .count(); // Run deterministic checks From 4146607948bf46d3c012d5c8fc1f4d37606fe804 Mon Sep 17 00:00:00 2001 From: System Two Date: Mon, 6 Apr 2026 12:36:14 +0000 Subject: [PATCH 4/4] feat(eval): EvalRunner trait, SDK timeout/cwd, trace parity --- Cargo.lock | 22 +-- src/cli/commands/eval/run.rs | 20 ++- src/cli/commands/package.rs | 3 + src/core/packaging.rs | 4 +- src/eval/checks.rs | 35 ++-- src/eval/mod.rs | 4 + src/eval/runner.rs | 317 ++++++++++++++++++++++------------- tests/cli/eval_tests.rs | 88 ++++++++++ 8 files changed, 342 insertions(+), 151 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2624d3a2..2ad1ecd6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -43,7 +43,7 @@ dependencies = [ [[package]] name = "aikit-sdk" version = "0.1.50" -source = "git+https://github.com/goaikit/aikit?branch=main#524f98f1db7b3facbfb4c0d49267b7dc067ada9b" +source = "git+https://github.com/goaikit/aikit?branch=main#7380ef9c33c7915d32679da98a818cf7116967fb" dependencies = [ "glob", "reqwest 0.12.28", @@ -1316,7 +1316,7 @@ dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -1417,7 +1417,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -2082,7 +2082,7 @@ dependencies = [ "libc", "percent-encoding", "pin-project-lite", - "socket2 0.5.10", + "socket2 0.6.2", "tokio", "tower-service", "tracing", @@ -2651,7 +2651,7 @@ version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -2959,7 +2959,7 @@ dependencies = [ "quinn-udp", "rustc-hash", "rustls 0.23.36", - "socket2 0.5.10", + "socket2 0.6.2", "thiserror 2.0.18", "tokio", "tracing", @@ -2996,9 +2996,9 @@ dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2 0.5.10", + "socket2 0.6.2", "tracing", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] @@ -3344,7 +3344,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -3857,7 +3857,7 @@ dependencies = [ "getrandom 0.3.4", "once_cell", "rustix", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -4538,7 +4538,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.48.0", + "windows-sys 0.61.2", ] [[package]] diff --git a/src/cli/commands/eval/run.rs b/src/cli/commands/eval/run.rs index 58621a80..8ce7fd2e 100644 --- a/src/cli/commands/eval/run.rs +++ b/src/cli/commands/eval/run.rs @@ -11,9 +11,8 @@ use fastskill::eval::artifacts::{ }; use fastskill::eval::checks::load_checks; use fastskill::eval::config::resolve_eval_config; -use fastskill::eval::runner::{run_eval_case, CaseRunOptions}; +use fastskill::eval::runner::{AikitEvalRunner, CaseRunOptions, EvalRunner}; use fastskill::eval::suite::load_suite; -use fastskill::eval::trace::trace_to_jsonl; use fastskill::OutputFormat; use std::env; use std::path::PathBuf; @@ -70,8 +69,16 @@ fn validate_agent_key_for_run(s: &str) -> Result { } } -/// Execute the `eval run` command +/// Execute the `eval run` command using the default aikit-backed runner. pub async fn execute_run(args: RunArgs) -> CliResult<()> { + execute_run_with_runner(args, &AikitEvalRunner).await +} + +/// Execute `eval run` with an injectable [`EvalRunner`] (tests or future adapters). +pub async fn execute_run_with_runner( + args: RunArgs, + runner: &R, +) -> CliResult<()> { let format = validate_format_args(&args.format, args.json)?; let use_json = format == OutputFormat::Json; @@ -169,11 +176,8 @@ pub async fn execute_run(args: RunArgs) -> CliResult<()> { eprintln!(" Running case '{}'...", case.id); } - let (run_output, case_result) = run_eval_case(case, &run_opts, &checks).await; - - // Build trace from stdout (since runner collects it) - let trace_events = fastskill::eval::trace::stdout_to_trace(&run_output.stdout); - let trace_jsonl = trace_to_jsonl(&trace_events); + let (run_output, case_result, trace_jsonl) = + runner.run_case(case, &run_opts, &checks).await; // Write artifacts if let Err(e) = write_case_artifacts( diff --git a/src/cli/commands/package.rs b/src/cli/commands/package.rs index eb58aaaa..21582f0a 100644 --- a/src/cli/commands/package.rs +++ b/src/cli/commands/package.rs @@ -48,6 +48,9 @@ pub enum PackagePreset { /// fastskill package --git-diff base head # Git-based change detection /// fastskill package --skills id1 id2 # Package specific skills /// fastskill package --force # Package all skills +/// +/// Note: the `evals/` directory is omitted from packaged skill ZIPs; exclusion is enforced in +/// `fastskill::core::packaging` when building archives, not in this CLI module. #[derive(Debug, Args)] pub struct PackageArgs { /// Package preset command diff --git a/src/core/packaging.rs b/src/core/packaging.rs index 4c1ea9b8..976f79bb 100644 --- a/src/core/packaging.rs +++ b/src/core/packaging.rs @@ -19,7 +19,9 @@ pub fn package_skill( package_skill_with_id(skill_path, output_dir, version, None) } -/// Package a skill directory into a ZIP file with explicit skill ID +/// Package a skill directory into a ZIP file with explicit skill ID. +/// +/// The `evals/` subtree is excluded from the archive (local eval suites are not published). pub fn package_skill_with_id( skill_path: &Path, output_dir: &Path, diff --git a/src/eval/checks.rs b/src/eval/checks.rs index a7d94e5e..47d460b5 100644 --- a/src/eval/checks.rs +++ b/src/eval/checks.rs @@ -4,6 +4,8 @@ use serde::{Deserialize, Serialize}; use std::path::PathBuf; use thiserror::Error; +use crate::eval::trace::{TraceEvent, TracePayload}; + /// A deterministic check definition loaded from checks.toml #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(tag = "name")] @@ -108,6 +110,15 @@ pub fn run_checks( .collect() } +/// Count trace events with payload type `raw_json`. +pub fn count_raw_json_events(trace_jsonl: &str) -> usize { + trace_jsonl + .lines() + .filter_map(|line| serde_json::from_str::(line).ok()) + .filter(|event| matches!(event.payload, TracePayload::RawJson { .. })) + .count() +} + fn run_single_check( check: &CheckDefinition, stdout_content: &str, @@ -165,10 +176,7 @@ fn run_single_check( } CheckDefinition::MaxCommandCount { limit, .. } => { // Count raw_json trace lines - let count = trace_jsonl - .lines() - .filter(|line| line.contains("\"raw_json\"")) - .count(); + let count = count_raw_json_events(trace_jsonl); let passed = count <= *limit; let message = if passed { None @@ -259,9 +267,9 @@ mod tests { limit: 5, required: true, }; - let trace = r#"{"type":"raw_json","seq":0} -{"type":"raw_json","seq":1} -{"type":"raw_line","seq":2}"#; + let trace = r#"{"seq":0,"payload":{"type":"raw_json","data":{"cmd":"a"}}} +{"seq":1,"payload":{"type":"raw_json","data":{"cmd":"b"}}} +{"seq":2,"payload":{"type":"raw_line","line":"ok"}}"#; let results = run_checks(&[check], "", trace, Path::new("/tmp")); assert!(results[0].passed); } @@ -272,13 +280,20 @@ mod tests { limit: 1, required: true, }; - let trace = r#"{"type":"raw_json","seq":0} -{"type":"raw_json","seq":1} -{"type":"raw_json","seq":2}"#; + let trace = r#"{"seq":0,"payload":{"type":"raw_json","data":{"cmd":"a"}}} +{"seq":1,"payload":{"type":"raw_json","data":{"cmd":"b"}}} +{"seq":2,"payload":{"type":"raw_json","data":{"cmd":"c"}}}"#; let results = run_checks(&[check], "", trace, Path::new("/tmp")); assert!(!results[0].passed); } + #[test] + fn test_count_raw_json_events_ignores_substring_only() { + let trace = r#"{"seq":0,"payload":{"type":"raw_line","line":"mentions raw_json text"}} +{"seq":1,"payload":{"type":"raw_json","data":{"cmd":"x"}}}"#; + assert_eq!(count_raw_json_events(trace), 1); + } + #[test] fn test_suite_passes_all_passed() { let results = vec![ diff --git a/src/eval/mod.rs b/src/eval/mod.rs index bedfcb3c..33ac78aa 100644 --- a/src/eval/mod.rs +++ b/src/eval/mod.rs @@ -6,3 +6,7 @@ pub mod config; pub mod runner; pub mod suite; pub mod trace; + +pub use runner::{ + run_eval_case, AikitEvalRunner, CaseRunOptions, CaseRunOutput, EvalRunner, RunnerError, +}; diff --git a/src/eval/runner.rs b/src/eval/runner.rs index af13915f..c5495c41 100644 --- a/src/eval/runner.rs +++ b/src/eval/runner.rs @@ -1,10 +1,11 @@ //! Eval runner implementation using aikit-sdk use crate::eval::artifacts::{CaseResult, CaseStatus}; -use crate::eval::checks::{run_checks, CheckDefinition}; +use crate::eval::checks::{count_raw_json_events, run_checks, CheckDefinition}; use crate::eval::suite::EvalCase; use crate::eval::trace::{agent_events_to_trace, trace_to_jsonl, TraceEvent, TracePayload}; use aikit_sdk::{run_agent_events, AgentEvent, RunOptions}; +use async_trait::async_trait; use std::path::PathBuf; use std::time::Duration; use thiserror::Error; @@ -42,72 +43,78 @@ pub enum RunnerError { ExecutionFailed(String), } +/// Abstraction over eval case execution (default: aikit-backed). +#[async_trait] +pub trait EvalRunner: Send + Sync { + /// Run one case, produce stdout/stderr capture, scored result, and canonical trace JSONL. + async fn run_case( + &self, + case: &EvalCase, + opts: &CaseRunOptions, + checks: &[CheckDefinition], + ) -> (CaseRunOutput, CaseResult, String); +} + +/// Default runner: `aikit_sdk::run_agent_events` inside `spawn_blocking` with SDK timeout/cwd. +#[derive(Debug, Clone, Copy, Default)] +pub struct AikitEvalRunner; + /// Result of agent execution within spawn_blocking struct AgentExecutionResult { result: Result, events: Vec, } -/// Run a single eval case using aikit_sdk::run_agent_events via spawn_blocking. -/// -/// This is the sole agent execution path per spec (acceptance criterion 29). -/// Uses spawn_blocking because run_agent_events is synchronous. -pub async fn run_eval_case( - case: &EvalCase, - opts: &CaseRunOptions, - checks: &[CheckDefinition], -) -> (CaseRunOutput, CaseResult) { - let agent_key = opts.agent_key.clone(); - let model = opts.model.clone(); - let prompt = case.prompt.clone(); - let timeout_secs = opts.timeout_seconds; - - // Determine working directory - let working_dir = match &case.workspace_subdir { - Some(subdir) => opts.project_root.join(subdir), - None => opts.project_root.clone(), - }; - - // Build RunOptions - note: aikit-sdk doesn't have with_timeout/with_current_dir yet - let run_opts = RunOptions::new() - .with_model(model.clone().unwrap_or_default()) - .with_yolo(true) - .with_stream(true); - - let working_dir_clone = working_dir.clone(); - - // Run agent in a blocking thread to avoid blocking the async runtime - // We implement timeout ourselves using tokio::time::timeout - let spawn_result = tokio::task::spawn_blocking(move || { - // Change to working directory before spawning agent - let original_dir = std::env::current_dir().ok(); - let dir_changed = std::env::set_current_dir(&working_dir_clone).is_ok(); - - // Collect events during execution - let mut events: Vec = Vec::new(); - - let result = run_agent_events(&agent_key, &prompt, run_opts, |ev| { - events.push(ev.clone()); - }); +#[async_trait] +impl EvalRunner for AikitEvalRunner { + async fn run_case( + &self, + case: &EvalCase, + opts: &CaseRunOptions, + checks: &[CheckDefinition], + ) -> (CaseRunOutput, CaseResult, String) { + self.run_case_inner(case, opts, checks).await + } +} + +impl AikitEvalRunner { + async fn run_case_inner( + &self, + case: &EvalCase, + opts: &CaseRunOptions, + checks: &[CheckDefinition], + ) -> (CaseRunOutput, CaseResult, String) { + let agent_key = opts.agent_key.clone(); + let model = opts.model.clone(); + let prompt = case.prompt.clone(); + let timeout_secs = opts.timeout_seconds; - // Restore original directory if we changed it - if dir_changed { - if let Some(orig) = original_dir { - let _ = std::env::set_current_dir(&orig); + let working_dir = match &case.workspace_subdir { + Some(subdir) => opts.project_root.join(subdir), + None => opts.project_root.clone(), + }; + + let mut run_opts = RunOptions::new() + .with_yolo(true) + .with_stream(true) + .with_timeout(Duration::from_secs(timeout_secs)) + .with_current_dir(working_dir.clone()); + if let Some(model_name) = model { + if !model_name.trim().is_empty() { + run_opts = run_opts.with_model(model_name); } } - AgentExecutionResult { result, events } - }); - - // Apply timeout wrapper - let timeout_duration = Duration::from_secs(timeout_secs); - let timed_result = tokio::time::timeout(timeout_duration, spawn_result).await; + let spawn_result = tokio::task::spawn_blocking(move || { + let mut events: Vec = Vec::new(); + let result = run_agent_events(&agent_key, &prompt, run_opts, |ev| { + events.push(ev.clone()); + }); + AgentExecutionResult { result, events } + }); - let (run_output, trace_events) = match timed_result { - Ok(Ok(exec_result)) => { - // spawn_blocking succeeded, check agent result - match exec_result.result { + let (run_output, trace_events) = match spawn_result.await { + Ok(exec_result) => match exec_result.result { Ok(run_result) => { let exit_code = run_result.exit_code(); let output = CaseRunOutput { @@ -119,6 +126,33 @@ pub async fn run_eval_case( let trace = agent_events_to_trace(&exec_result.events); (output, trace) } + Err(aikit_sdk::RunError::TimedOut { + timeout, stderr, .. + }) => { + let mut trace = agent_events_to_trace(&exec_result.events); + trace.push(TraceEvent { + seq: trace.len(), + payload: TracePayload::Timeout, + }); + let output = CaseRunOutput { + stdout: vec![], + stderr, + exit_code: None, + timed_out: true, + }; + if output.stderr.is_empty() { + let fallback = format!("Case timed out after {}s", timeout.as_secs()); + let output = CaseRunOutput { + stdout: vec![], + stderr: fallback.into_bytes(), + exit_code: None, + timed_out: true, + }; + (output, trace) + } else { + (output, trace) + } + } Err(e) => { let trace = agent_events_to_trace(&exec_result.events); let output = CaseRunOutput { @@ -129,80 +163,121 @@ pub async fn run_eval_case( }; (output, trace) } + }, + Err(e) => { + let output = CaseRunOutput { + stdout: vec![], + stderr: format!("spawn_blocking failed: {}", e).into_bytes(), + exit_code: None, + timed_out: false, + }; + (output, vec![]) } - } - Ok(Err(e)) => { - // spawn_blocking itself failed - let output = CaseRunOutput { - stdout: vec![], - stderr: format!("spawn_blocking failed: {}", e).into_bytes(), - exit_code: None, - timed_out: false, - }; - (output, vec![]) - } - Err(_) => { - // Timeout elapsed - let output = CaseRunOutput { - stdout: vec![], - stderr: format!("Case timed out after {}s", timeout_secs).into_bytes(), - exit_code: None, - timed_out: true, - }; - // Add a timeout marker to the trace - let timeout_event = TraceEvent { - seq: 0, - payload: TracePayload::Timeout, - }; - (output, vec![timeout_event]) - } - }; - - // Build trace from collected events - let trace_jsonl = trace_to_jsonl(&trace_events); - let stdout_str = String::from_utf8_lossy(&run_output.stdout).to_string(); - - // Count raw_json events - let command_count = trace_events - .iter() - .filter(|e| matches!(&e.payload, TracePayload::RawJson { .. })) - .count(); - - // Run deterministic checks - let check_results = run_checks(checks, &stdout_str, &trace_jsonl, &working_dir); - let all_passed = check_results.iter().all(|r| r.passed); - - let status = if run_output.timed_out { - CaseStatus::Error - } else if checks.is_empty() { - // No checks defined: pass if exit code is 0 - if run_output.exit_code == Some(0) { + }; + + let trace_jsonl = trace_to_jsonl(&trace_events); + let stdout_str = String::from_utf8_lossy(&run_output.stdout).to_string(); + let command_count = count_raw_json_events(&trace_jsonl); + let check_results = run_checks(checks, &stdout_str, &trace_jsonl, &working_dir); + let all_passed = check_results.iter().all(|r| r.passed); + + let status = if run_output.timed_out { + CaseStatus::Error + } else if checks.is_empty() { + if run_output.exit_code == Some(0) { + CaseStatus::Passed + } else { + CaseStatus::Failed + } + } else if all_passed { CaseStatus::Passed } else { CaseStatus::Failed - } - } else if all_passed { - CaseStatus::Passed - } else { - CaseStatus::Failed - }; - - let case_result = CaseResult { - id: case.id.clone(), - status, - command_count: Some(command_count), - input_tokens: None, - output_tokens: None, - check_results, - error_message: None, - }; - - (run_output, case_result) + }; + + let case_result = CaseResult { + id: case.id.clone(), + status, + command_count: Some(command_count), + input_tokens: None, + output_tokens: None, + check_results, + error_message: None, + }; + + (run_output, case_result, trace_jsonl) + } +} + +/// Run a single eval case using the default [`AikitEvalRunner`]. +/// +/// Sole agent execution path per spec (acceptance criterion 29). +pub async fn run_eval_case( + case: &EvalCase, + opts: &CaseRunOptions, + checks: &[CheckDefinition], +) -> (CaseRunOutput, CaseResult, String) { + AikitEvalRunner.run_case(case, opts, checks).await } #[cfg(test)] mod tests { use super::*; + use crate::eval::artifacts::CaseStatus; + + /// Stub runner for trait wiring tests (no aikit). + struct StubEvalRunner; + + #[async_trait] + impl EvalRunner for StubEvalRunner { + async fn run_case( + &self, + case: &EvalCase, + _opts: &CaseRunOptions, + _checks: &[CheckDefinition], + ) -> (CaseRunOutput, CaseResult, String) { + let trace_jsonl = + r#"{"seq":0,"payload":{"type":"raw_line","line":"stub"}}"#.to_string(); + let out = CaseRunOutput { + stdout: b"ok".to_vec(), + stderr: vec![], + exit_code: Some(0), + timed_out: false, + }; + let result = CaseResult { + id: case.id.clone(), + status: CaseStatus::Passed, + command_count: Some(0), + input_tokens: None, + output_tokens: None, + check_results: vec![], + error_message: None, + }; + (out, result, trace_jsonl) + } + } + + #[tokio::test] + async fn test_eval_runner_trait_stub_returns_expected_trace() { + let case = EvalCase { + id: "c1".to_string(), + prompt: "p".to_string(), + should_trigger: true, + tags: vec![], + workspace_subdir: None, + }; + let opts = CaseRunOptions { + agent_key: "agent".to_string(), + model: None, + project_root: PathBuf::from("/tmp"), + timeout_seconds: 1, + }; + let runner = StubEvalRunner; + let (out, res, trace) = runner.run_case(&case, &opts, &[]).await; + assert_eq!(out.exit_code, Some(0)); + assert_eq!(res.id, "c1"); + assert!(trace.contains("raw_line")); + } #[test] fn test_case_run_options_builder() { diff --git a/tests/cli/eval_tests.rs b/tests/cli/eval_tests.rs index 26241a82..f98a08f6 100644 --- a/tests/cli/eval_tests.rs +++ b/tests/cli/eval_tests.rs @@ -4,6 +4,7 @@ use super::snapshot_helpers::{ assert_snapshot_with_settings, cli_snapshot_settings, run_fastskill_command, + run_fastskill_command_with_env, }; #[test] @@ -211,3 +212,90 @@ fn test_eval_report_nonexistent_run_dir() { combined ); } + +#[test] +fn test_eval_run_persists_event_trace_jsonl() { + use serde_json::Value; + use std::env; + use std::fs; + use tempfile::TempDir; + + let dir = TempDir::new().unwrap(); + let evals_dir = dir.path().join("evals"); + fs::create_dir_all(&evals_dir).unwrap(); + fs::write( + evals_dir.join("prompts.csv"), + "id,prompt,should_trigger,tags,workspace_subdir\ntrace-case,\"test prompt\",true,\"basic\",\n", + ) + .unwrap(); + fs::write(dir.path().join("SKILL.md"), "# Test Skill\n").unwrap(); + fs::write( + dir.path().join("skill-project.toml"), + "[metadata]\nid = \"test-skill\"\n\n[tool.fastskill.eval]\nprompts = \"evals/prompts.csv\"\ntimeout_seconds = 30\nfail_on_missing_agent = true\n", + ) + .unwrap(); + + // Create a fake `agent` executable so aikit_sdk::is_agent_available("agent") succeeds. + let bin_dir = dir.path().join("bin"); + fs::create_dir_all(&bin_dir).unwrap(); + let agent_path = bin_dir.join("agent"); + fs::write( + &agent_path, + "#!/usr/bin/env bash\nif [[ \"$1\" == \"--version\" ]]; then echo \"agent 0.1\"; exit 0; fi\necho '{\"event\":\"ok\"}'\nexit 0\n", + ) + .unwrap(); + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let mut perms = fs::metadata(&agent_path).unwrap().permissions(); + perms.set_mode(0o755); + fs::set_permissions(&agent_path, perms).unwrap(); + } + + let output_dir = dir.path().join("out"); + let path = env::var("PATH").unwrap_or_default(); + let merged_path = format!("{}:{}", bin_dir.display(), path); + let env_vars = vec![("PATH", merged_path.as_str())]; + + let result = run_fastskill_command_with_env( + &[ + "eval", + "run", + "--agent", + "agent", + "--output-dir", + output_dir.to_str().unwrap(), + "--case", + "trace-case", + "--json", + ], + &env_vars, + Some(dir.path()), + ); + assert!( + result.success, + "Expected eval run to succeed, got stdout: {}, stderr: {}", + result.stdout, result.stderr + ); + + let json_start = result.stdout.find('{').unwrap(); + let summary: Value = serde_json::from_str(&result.stdout[json_start..]).unwrap(); + let run_dir = summary["run_dir"].as_str().unwrap(); + let trace_path = std::path::Path::new(run_dir) + .join("trace-case") + .join("trace.jsonl"); + let trace_jsonl = fs::read_to_string(&trace_path).unwrap(); + + assert!( + trace_jsonl.contains("\"type\":\"raw_json\""), + "expected persisted trace.jsonl to contain raw_json event, got: {}", + trace_jsonl + ); + + let result_path = std::path::Path::new(run_dir) + .join("trace-case") + .join("result.json"); + let case_result: Value = + serde_json::from_str(&fs::read_to_string(result_path).unwrap()).unwrap(); + assert_eq!(case_result["command_count"], 1); +}