diff --git a/Cargo.lock b/Cargo.lock index bc8c4c1..2ad1ecd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -42,12 +42,13 @@ dependencies = [ [[package]] name = "aikit-sdk" -version = "0.1.49" -source = "git+https://github.com/goaikit/aikit?branch=main#60577a3d43d1396b24e2879644f8cf5b93408a4f" +version = "0.1.50" +source = "git+https://github.com/goaikit/aikit?branch=main#7380ef9c33c7915d32679da98a818cf7116967fb" dependencies = [ "glob", "reqwest 0.12.28", "serde", + "serde_json", "tempfile", "toml", "walkdir", diff --git a/src/cli/cli.rs b/src/cli/cli.rs index 5e26959..b58eaf2 100644 --- a/src/cli/cli.rs +++ b/src/cli/cli.rs @@ -5,7 +5,7 @@ use fastskill::FastSkillService; use std::sync::Arc; use crate::cli::commands::{ - add, analyze, auth, disable, init, install, list, marketplace, package, publish, read, + add, analyze, auth, disable, eval, init, install, list, marketplace, package, publish, read, registry, reindex, remove, repos, search, serve, show, sources, sync, update, version, Commands, }; @@ -109,6 +109,10 @@ impl Cli { return auth::execute_auth(args).await; } + if let Some(Commands::Eval(args)) = self.command { + return eval::execute_eval(args).await; + } + // For other commands, we need to restore self.command, so we need a different approach // Actually, if we get here, command was None or not Init/Repository/Package/Publish/RegistryIndex let command = self.command; @@ -155,6 +159,7 @@ impl Cli { | Some(Commands::Repos(_)) | Some(Commands::Marketplace(_)) | Some(Commands::Auth(_)) + | Some(Commands::Eval(_)) | Some(Commands::Version(_)) => unreachable!("Handled above"), None => { // No subcommand - treat as skill ID for read command diff --git a/src/cli/commands/eval/mod.rs b/src/cli/commands/eval/mod.rs new file mode 100644 index 0000000..0b2888a --- /dev/null +++ b/src/cli/commands/eval/mod.rs @@ -0,0 +1,62 @@ +//! Eval command group for skill quality assurance + +pub mod report; +pub mod run; +pub mod score; +pub mod validate; + +use crate::cli::error::CliResult; +use clap::{Args, Subcommand}; + +/// Eval command group +#[derive(Debug, Args)] +#[command( + about = "Evaluation commands for skill quality assurance", + after_help = "Examples:\n fastskill eval validate\n fastskill eval run --agent codex --output-dir /tmp/evals" +)] +pub struct EvalCommand { + #[command(subcommand)] + pub command: EvalSubcommand, +} + +/// Eval subcommands +#[derive(Debug, Subcommand)] +pub enum EvalSubcommand { + /// Validate eval configuration and files + #[command( + about = "Validate eval configuration and files", + after_help = "Examples:\n fastskill eval validate\n fastskill eval validate --agent codex" + )] + Validate(validate::ValidateArgs), + + /// Run eval cases against an agent + #[command( + about = "Run eval cases against an agent", + after_help = "Examples:\n fastskill eval run --agent codex --output-dir /tmp/evals" + )] + Run(run::RunArgs), + + /// Show a report for a completed eval run + #[command( + about = "Show a report for a completed eval run", + after_help = "Examples:\n fastskill eval report --run-dir /tmp/evals/2026-04-01T14-32-10Z" + )] + Report(report::ReportArgs), + + /// Re-score saved eval artifacts without running the agent again + #[command( + about = "Re-score saved eval artifacts without running the agent again", + after_help = "Examples:\n fastskill eval score --run-dir /tmp/evals/2026-04-01T14-32-10Z" + )] + Score(score::ScoreArgs), +} + +/// Execute the eval command group +pub async fn execute_eval(args: EvalCommand) -> CliResult<()> { + match args.command { + EvalSubcommand::Validate(args) => validate::execute_validate(args).await, + EvalSubcommand::Run(args) => run::execute_run(args).await, + EvalSubcommand::Report(args) => report::execute_report(args).await, + EvalSubcommand::Score(args) => score::execute_score(args).await, + } +} diff --git a/src/cli/commands/eval/report.rs b/src/cli/commands/eval/report.rs new file mode 100644 index 0000000..a8dda91 --- /dev/null +++ b/src/cli/commands/eval/report.rs @@ -0,0 +1,80 @@ +//! Eval report subcommand - artifact summary and formatting + +use crate::cli::commands::common::validate_format_args; +use crate::cli::error::{CliError, CliResult}; +use clap::Args; +use fastskill::eval::artifacts::read_summary; +use fastskill::OutputFormat; +use std::path::PathBuf; + +/// Arguments for `fastskill eval report` +#[derive(Debug, Args)] +#[command( + about = "Show a report for a completed eval run", + after_help = "Examples:\n fastskill eval report --run-dir /tmp/evals/2026-04-01T14-32-10Z\n fastskill eval report --run-dir ./evals/2026-04-01T14-32-10Z --json" +)] +pub struct ReportArgs { + /// Path to the specific run directory + #[arg(long, required = true)] + pub run_dir: PathBuf, + + /// Output format: table, json, grid, xml (default: table) + #[arg(long, value_enum, help = "Output format: table, json, grid, xml")] + pub format: Option, + + /// Shorthand for --format json + #[arg(long, help = "Shorthand for --format json")] + pub json: bool, +} + +/// Execute the `eval report` command +pub async fn execute_report(args: ReportArgs) -> CliResult<()> { + let format = validate_format_args(&args.format, args.json)?; + let use_json = format == OutputFormat::Json; + + if !args.run_dir.exists() { + return Err(CliError::Config(format!( + "EVAL_ARTIFACTS_CORRUPT: Run directory does not exist: {}", + args.run_dir.display() + ))); + } + + let summary = read_summary(&args.run_dir).map_err(|e| { + CliError::Config(format!( + "EVAL_ARTIFACTS_CORRUPT: Failed to read summary.json: {}", + e + )) + })?; + + if use_json { + println!( + "{}", + serde_json::to_string_pretty(&summary).unwrap_or_default() + ); + } else { + println!("Eval Report"); + println!(" run_dir: {}", args.run_dir.display()); + println!(" agent: {}", summary.agent); + if let Some(ref model) = summary.model { + println!(" model: {}", model); + } + println!( + " result: {}", + if summary.suite_pass { + "PASSED" + } else { + "FAILED" + } + ); + println!(" cases: {}/{} passed", summary.passed, summary.total_cases); + + if !summary.cases.is_empty() { + println!("\nCase Results:"); + for case in &summary.cases { + println!(" [{}] {}", case.status, case.id); + } + } + } + + Ok(()) +} diff --git a/src/cli/commands/eval/run.rs b/src/cli/commands/eval/run.rs new file mode 100644 index 0000000..8ce7fd2 --- /dev/null +++ b/src/cli/commands/eval/run.rs @@ -0,0 +1,272 @@ +//! Eval run subcommand - case execution orchestration + +use crate::cli::commands::common::validate_format_args; +use crate::cli::error::{CliError, CliResult}; +use aikit_sdk::{is_agent_available, is_runnable, runnable_agents}; +use chrono::Utc; +use clap::Args; +use fastskill::core::project::resolve_project_file; +use fastskill::eval::artifacts::{ + allocate_run_dir, write_case_artifacts, write_summary, CaseSummary, SummaryResult, +}; +use fastskill::eval::checks::load_checks; +use fastskill::eval::config::resolve_eval_config; +use fastskill::eval::runner::{AikitEvalRunner, CaseRunOptions, EvalRunner}; +use fastskill::eval::suite::load_suite; +use fastskill::OutputFormat; +use std::env; +use std::path::PathBuf; + +/// Arguments for `fastskill eval run` +#[derive(Debug, Args)] +#[command( + about = "Run eval cases against an agent", + after_help = "Examples:\n fastskill eval run --agent codex --output-dir /tmp/evals\n fastskill eval run --agent claude --output-dir ./evals --case my-case\n fastskill eval run --agent codex --output-dir ./evals --tag basic" +)] +pub struct RunArgs { + /// Agent to use for execution (required) + #[arg(long, required = true, value_parser = validate_agent_key_for_run)] + pub agent: String, + + /// Output directory for artifacts (required) + #[arg(long, required = true)] + pub output_dir: PathBuf, + + /// Optional model override forwarded to the agent + #[arg(long)] + pub model: Option, + + /// Filter: run only the case with this ID + #[arg(long)] + pub case: Option, + + /// Filter: run only cases with this tag + #[arg(long)] + pub tag: Option, + + /// Output format: table, json, grid, xml (default: table) + #[arg(long, value_enum, help = "Output format: table, json, grid, xml")] + pub format: Option, + + /// Shorthand for --format json + #[arg(long, help = "Shorthand for --format json")] + pub json: bool, + + /// Do not fail with non-zero exit code on suite failure + #[arg(long)] + pub no_fail: bool, +} + +fn validate_agent_key_for_run(s: &str) -> Result { + if is_runnable(s) { + Ok(s.to_string()) + } else { + Err(format!( + "'{}' is not a supported agent. Supported: {}", + s, + runnable_agents().join(", ") + )) + } +} + +/// Execute the `eval run` command using the default aikit-backed runner. +pub async fn execute_run(args: RunArgs) -> CliResult<()> { + execute_run_with_runner(args, &AikitEvalRunner).await +} + +/// Execute `eval run` with an injectable [`EvalRunner`] (tests or future adapters). +pub async fn execute_run_with_runner( + args: RunArgs, + runner: &R, +) -> CliResult<()> { + let format = validate_format_args(&args.format, args.json)?; + let use_json = format == OutputFormat::Json; + + let current_dir = env::current_dir() + .map_err(|e| CliError::Config(format!("Failed to get current directory: {}", e)))?; + + let resolution = resolve_project_file(¤t_dir); + if !resolution.found { + return Err(CliError::Config( + "EVAL_CONFIG_MISSING: No skill-project.toml found. Run 'fastskill init' first." + .to_string(), + )); + } + + let project_root = resolution + .path + .parent() + .unwrap_or(¤t_dir) + .to_path_buf(); + + let eval_config = resolve_eval_config(&resolution.path, &project_root) + .map_err(|e| CliError::Config(e.to_string()))?; + + // Check agent availability + if eval_config.fail_on_missing_agent && !is_agent_available(&args.agent) { + return Err(CliError::Config(format!( + "EVAL_AGENT_UNAVAILABLE: Agent '{}' is not available. Install it first.", + args.agent + ))); + } + + // Load suite + let mut suite = + load_suite(&eval_config.prompts_path).map_err(|e| CliError::Config(e.to_string()))?; + + // Apply filters + if let Some(ref case_id) = args.case { + suite = suite.filter_by_id(case_id); + if suite.cases.is_empty() { + return Err(CliError::Config(format!( + "No case found with id '{}'", + case_id + ))); + } + } + if let Some(ref tag) = args.tag { + suite = suite.filter_by_tag(tag); + if suite.cases.is_empty() { + return Err(CliError::Config(format!( + "No cases found with tag '{}'", + tag + ))); + } + } + + // Load checks if configured + let checks = if let Some(ref checks_path) = eval_config.checks_path { + load_checks(checks_path).map_err(|e| CliError::Config(e.to_string()))? + } else { + vec![] + }; + + // Allocate run directory + let run_id = Utc::now().format("%Y-%m-%dT%H-%M-%SZ").to_string(); + std::fs::create_dir_all(&args.output_dir).map_err(|e| { + CliError::Config(format!( + "Failed to create output directory '{}': {}", + args.output_dir.display(), + e + )) + })?; + let run_dir = + allocate_run_dir(&args.output_dir, &run_id).map_err(|e| CliError::Config(e.to_string()))?; + + let run_opts = CaseRunOptions { + agent_key: args.agent.clone(), + model: args.model.clone(), + project_root: project_root.clone(), + timeout_seconds: eval_config.timeout_seconds, + }; + + if !use_json { + eprintln!( + "Running {} eval case(s) with agent '{}'...", + suite.cases.len(), + args.agent + ); + } + + let mut case_results = Vec::new(); + let mut case_summaries = Vec::new(); + + for case in &suite.cases { + if !use_json { + eprintln!(" Running case '{}'...", case.id); + } + + let (run_output, case_result, trace_jsonl) = + runner.run_case(case, &run_opts, &checks).await; + + // Write artifacts + if let Err(e) = write_case_artifacts( + &run_dir, + &case.id, + &run_output.stdout, + &run_output.stderr, + &trace_jsonl, + &case_result, + ) { + if !use_json { + eprintln!( + " warning: failed to write artifacts for case '{}': {}", + case.id, e + ); + } + } + + let summary_entry = CaseSummary { + id: case_result.id.clone(), + status: case_result.status.clone(), + command_count: case_result.command_count, + input_tokens: case_result.input_tokens, + output_tokens: case_result.output_tokens, + }; + + case_summaries.push(summary_entry); + case_results.push(case_result); + } + + let passed = case_results + .iter() + .filter(|r| r.status == fastskill::eval::artifacts::CaseStatus::Passed) + .count(); + let failed = case_results.len() - passed; + let suite_pass = failed == 0; + + let summary = SummaryResult { + suite_pass, + agent: args.agent.clone(), + model: args.model.clone(), + total_cases: case_results.len(), + passed, + failed, + run_dir: run_dir.clone(), + checks_path: eval_config.checks_path.map(|p| { + if p.is_absolute() { + p + } else { + project_root.join(p) + } + }), + skill_project_root: project_root, + cases: case_summaries, + }; + + // Write summary + if let Err(e) = write_summary(&run_dir, &summary) { + if !use_json { + eprintln!("warning: failed to write summary.json: {}", e); + } + } + + if use_json { + println!( + "{}", + serde_json::to_string_pretty(&summary).unwrap_or_default() + ); + } else { + println!( + "\nEval run complete: {}/{} passed", + passed, + case_results.len() + ); + println!(" run_dir: {}", run_dir.display()); + if suite_pass { + println!(" result: PASSED"); + } else { + println!(" result: FAILED ({} case(s) failed)", failed); + } + } + + if !suite_pass && !args.no_fail { + return Err(CliError::Config(format!( + "Eval suite failed: {}/{} cases passed", + passed, + case_results.len() + ))); + } + + Ok(()) +} diff --git a/src/cli/commands/eval/score.rs b/src/cli/commands/eval/score.rs new file mode 100644 index 0000000..6d9c84d --- /dev/null +++ b/src/cli/commands/eval/score.rs @@ -0,0 +1,146 @@ +//! Eval score subcommand - offline re-scoring from saved artifacts + +use crate::cli::commands::common::validate_format_args; +use crate::cli::error::{CliError, CliResult}; +use clap::Args; +use fastskill::eval::artifacts::{read_summary, write_summary, CaseStatus}; +use fastskill::eval::checks::load_checks; +use fastskill::OutputFormat; +use std::path::PathBuf; + +/// Arguments for `fastskill eval score` +#[derive(Debug, Args)] +#[command( + about = "Re-score saved eval artifacts without running the agent again", + after_help = "Examples:\n fastskill eval score --run-dir /tmp/evals/2026-04-01T14-32-10Z\n fastskill eval score --run-dir ./evals/2026-04-01T14-32-10Z --json" +)] +pub struct ScoreArgs { + /// Path to the run directory to re-score + #[arg(long, required = true)] + pub run_dir: PathBuf, + + /// Output format: table, json, grid, xml (default: table) + #[arg(long, value_enum, help = "Output format: table, json, grid, xml")] + pub format: Option, + + /// Shorthand for --format json + #[arg(long, help = "Shorthand for --format json")] + pub json: bool, + + /// Do not fail with non-zero exit code on suite failure + #[arg(long)] + pub no_fail: bool, +} + +/// Execute the `eval score` command +pub async fn execute_score(args: ScoreArgs) -> CliResult<()> { + let format = validate_format_args(&args.format, args.json)?; + let use_json = format == OutputFormat::Json; + + if !args.run_dir.exists() { + return Err(CliError::Config(format!( + "EVAL_ARTIFACTS_CORRUPT: Run directory does not exist: {}", + args.run_dir.display() + ))); + } + + // Read existing summary + let mut summary = read_summary(&args.run_dir).map_err(|e| { + CliError::Config(format!( + "EVAL_ARTIFACTS_CORRUPT: Failed to read summary.json: {}", + e + )) + })?; + + // Validate that we have usable paths + let checks_path = summary.checks_path.as_ref().ok_or_else(|| { + CliError::Config( + "EVAL_ARTIFACTS_CORRUPT: summary.json lacks checks_path - cannot re-score".to_string(), + ) + })?; + + if !checks_path.exists() { + return Err(CliError::Config(format!( + "EVAL_ARTIFACTS_CORRUPT: checks_path '{}' does not exist", + checks_path.display() + ))); + } + + // Load checks + let checks = load_checks(checks_path).map_err(|e| CliError::Config(e.to_string()))?; + + // Read existing case artifacts and re-score + let mut new_passed = 0; + let mut new_failed = 0; + + let mut updated_cases = summary.cases.clone(); + + for case_summary in &mut updated_cases { + let case_dir = args.run_dir.join(&case_summary.id); + if !case_dir.exists() { + continue; + } + + let stdout_path = case_dir.join("stdout.txt"); + let trace_path = case_dir.join("trace.jsonl"); + + let stdout_content = std::fs::read_to_string(&stdout_path).unwrap_or_default(); + let trace_jsonl = std::fs::read_to_string(&trace_path).unwrap_or_default(); + + let check_results = fastskill::eval::checks::run_checks( + &checks, + &stdout_content, + &trace_jsonl, + &summary.skill_project_root, + ); + + let all_passed = check_results.iter().all(|r| r.passed); + case_summary.status = if all_passed { + CaseStatus::Passed + } else { + CaseStatus::Failed + }; + + if all_passed { + new_passed += 1; + } else { + new_failed += 1; + } + } + + summary.passed = new_passed; + summary.failed = new_failed; + summary.suite_pass = new_failed == 0; + summary.cases = updated_cases; + + // Update summary.json + write_summary(&args.run_dir, &summary) + .map_err(|e| CliError::Config(format!("Failed to write updated summary.json: {}", e)))?; + + if use_json { + println!( + "{}", + serde_json::to_string_pretty(&summary).unwrap_or_default() + ); + } else { + println!("Re-scoring complete"); + println!( + " result: {}", + if summary.suite_pass { + "PASSED" + } else { + "FAILED" + } + ); + println!(" cases: {}/{} passed", summary.passed, summary.total_cases); + } + + if !summary.suite_pass && !args.no_fail { + return Err(CliError::Config(format!( + "Eval suite failed: {}/{} cases passed after re-scoring", + summary.passed, summary.total_cases + ))); + } + + Ok(()) +} diff --git a/src/cli/commands/eval/validate.rs b/src/cli/commands/eval/validate.rs new file mode 100644 index 0000000..3c553c4 --- /dev/null +++ b/src/cli/commands/eval/validate.rs @@ -0,0 +1,126 @@ +//! Eval validate subcommand - configuration and file validation + +use crate::cli::commands::common::validate_format_args; +use crate::cli::error::{CliError, CliResult}; +use aikit_sdk::{is_agent_available, is_runnable, runnable_agents}; +use clap::Args; +use fastskill::core::project::resolve_project_file; +use fastskill::eval::config::resolve_eval_config; +use fastskill::OutputFormat; +use std::env; + +/// Arguments for `fastskill eval validate` +#[derive(Debug, Args)] +#[command( + about = "Validate eval configuration and files", + after_help = "Examples:\n fastskill eval validate\n fastskill eval validate --agent codex" +)] +pub struct ValidateArgs { + /// Check agent availability for the specified agent key + #[arg(long, value_parser = validate_agent_key_parser)] + pub agent: Option, + + /// Output format: table, json, grid, xml (default: table) + #[arg(long, value_enum, help = "Output format: table, json, grid, xml")] + pub format: Option, + + /// Shorthand for --format json + #[arg(long, help = "Shorthand for --format json")] + pub json: bool, +} + +fn validate_agent_key_parser(s: &str) -> Result { + if is_runnable(s) { + Ok(s.to_string()) + } else { + Err(format!( + "'{}' is not a supported agent. Supported: {}", + s, + runnable_agents().join(", ") + )) + } +} + +/// Execute the `eval validate` command +pub async fn execute_validate(args: ValidateArgs) -> CliResult<()> { + let format = validate_format_args(&args.format, args.json)?; + let use_json = format == OutputFormat::Json; + + let current_dir = env::current_dir() + .map_err(|e| CliError::Config(format!("Failed to get current directory: {}", e)))?; + + let resolution = resolve_project_file(¤t_dir); + + if !resolution.found { + return Err(CliError::Config( + "EVAL_CONFIG_MISSING: No skill-project.toml found. Run 'fastskill init' first." + .to_string(), + )); + } + + let project_root = resolution + .path + .parent() + .unwrap_or(¤t_dir) + .to_path_buf(); + + let eval_config = resolve_eval_config(&resolution.path, &project_root) + .map_err(|e| CliError::Config(e.to_string()))?; + + // Check agent availability if --agent was specified + if let Some(ref agent_key) = args.agent { + let available = is_agent_available(agent_key); + if !available && eval_config.fail_on_missing_agent { + return Err(CliError::Config(format!( + "EVAL_AGENT_UNAVAILABLE: Agent '{}' is not available. Install it or use --agent with an available agent.", + agent_key + ))); + } + if !available { + eprintln!( + "warning: agent '{}' is not available (fail_on_missing_agent=false, continuing)", + agent_key + ); + } + } + + if use_json { + let output = serde_json::json!({ + "valid": true, + "prompts_path": eval_config.prompts_path, + "checks_path": eval_config.checks_path, + "timeout_seconds": eval_config.timeout_seconds, + "fail_on_missing_agent": eval_config.fail_on_missing_agent, + "project_root": eval_config.project_root, + }); + println!( + "{}", + serde_json::to_string_pretty(&output).unwrap_or_default() + ); + } else { + println!("eval configuration: valid"); + println!(" prompts: {}", eval_config.prompts_path.display()); + if let Some(ref checks) = eval_config.checks_path { + println!(" checks: {}", checks.display()); + } + println!(" timeout: {}s", eval_config.timeout_seconds); + println!( + " fail_on_missing_agent: {}", + eval_config.fail_on_missing_agent + ); + if let Some(ref agent_key) = args.agent { + let available = is_agent_available(agent_key); + println!( + " agent '{}': {}", + agent_key, + if available { + "available" + } else { + "unavailable" + } + ); + } + } + + Ok(()) +} diff --git a/src/cli/commands/init.rs b/src/cli/commands/init.rs index ada79bc..bbd97d6 100644 --- a/src/cli/commands/init.rs +++ b/src/cli/commands/init.rs @@ -255,6 +255,7 @@ fn build_skill_project(meta: InitMetadata<'_>) -> CliResult { server: None, install_depth: 5, skip_transitive: false, + eval: None, }), }); validate_project_structure(true, dependencies.is_some()) diff --git a/src/cli/commands/mod.rs b/src/cli/commands/mod.rs index b249950..566585f 100644 --- a/src/cli/commands/mod.rs +++ b/src/cli/commands/mod.rs @@ -5,6 +5,7 @@ pub mod analyze; pub mod auth; pub mod common; pub mod disable; +pub mod eval; pub mod init; pub mod install; pub mod list; @@ -57,6 +58,13 @@ pub enum Commands { )] Disable(disable::DisableArgs), + /// Evaluation commands for skill quality assurance + #[command( + about = "Evaluation commands for skill quality assurance", + after_help = "Examples:\n fastskill eval validate\n fastskill eval run --agent codex --output-dir /tmp/evals" + )] + Eval(eval::EvalCommand), + /// Initialize skill-project.toml for skill authors #[command( about = "Initialize skill-project.toml in current skill directory", diff --git a/src/cli/commands/package.rs b/src/cli/commands/package.rs index eb58aaa..21582f0 100644 --- a/src/cli/commands/package.rs +++ b/src/cli/commands/package.rs @@ -48,6 +48,9 @@ pub enum PackagePreset { /// fastskill package --git-diff base head # Git-based change detection /// fastskill package --skills id1 id2 # Package specific skills /// fastskill package --force # Package all skills +/// +/// Note: the `evals/` directory is omitted from packaged skill ZIPs; exclusion is enforced in +/// `fastskill::core::packaging` when building archives, not in this CLI module. #[derive(Debug, Args)] pub struct PackageArgs { /// Package preset command diff --git a/src/core/manifest.rs b/src/core/manifest.rs index abc171b..5012001 100644 --- a/src/core/manifest.rs +++ b/src/core/manifest.rs @@ -273,6 +273,33 @@ pub struct FastSkillToolConfig { /// Skip transitive dependency resolution entirely (default: false) #[serde(default)] pub skip_transitive: bool, + /// Optional evaluation configuration + #[serde(default)] + pub eval: Option, +} + +/// Evaluation configuration in TOML format ([tool.fastskill.eval]) +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct EvalConfigToml { + /// Path to prompts CSV file (relative to skill project root) + pub prompts: PathBuf, + /// Optional path to checks TOML file + #[serde(default)] + pub checks: Option, + /// Timeout in seconds for each eval case execution + #[serde(default = "default_eval_timeout_seconds")] + pub timeout_seconds: u64, + /// When true, `eval run` / `eval validate --agent` fail fast if the agent CLI is not available + #[serde(default = "default_fail_on_missing_agent")] + pub fail_on_missing_agent: bool, +} + +fn default_eval_timeout_seconds() -> u64 { + 900 +} + +fn default_fail_on_missing_agent() -> bool { + true } fn default_install_depth() -> u32 { diff --git a/src/core/packaging.rs b/src/core/packaging.rs index 892c835..976f79b 100644 --- a/src/core/packaging.rs +++ b/src/core/packaging.rs @@ -19,7 +19,9 @@ pub fn package_skill( package_skill_with_id(skill_path, output_dir, version, None) } -/// Package a skill directory into a ZIP file with explicit skill ID +/// Package a skill directory into a ZIP file with explicit skill ID. +/// +/// The `evals/` subtree is excluded from the archive (local eval suites are not published). pub fn package_skill_with_id( skill_path: &Path, output_dir: &Path, @@ -157,6 +159,11 @@ pub fn package_skill_with_id( continue; } + // Skip evals/ directory from published artifacts + if relative_path_str.starts_with("evals/") || relative_path_str == "evals" { + continue; + } + // Read file content let mut file_content = Vec::new(); let mut file = fs::File::open(file_path).map_err(ServiceError::Io)?; @@ -219,6 +226,11 @@ pub fn package_skill_with_id( continue; } + // Skip evals/ directory from published artifacts + if relative_path_str.starts_with("evals/") || relative_path_str == "evals" { + continue; + } + let mut file_content = Vec::new(); let mut file = fs::File::open(file_path).map_err(ServiceError::Io)?; file.read_to_end(&mut file_content) diff --git a/src/core/repository.rs b/src/core/repository.rs index a51163e..5fcf1f4 100644 --- a/src/core/repository.rs +++ b/src/core/repository.rs @@ -251,6 +251,7 @@ impl RepositoryManager { server: None, install_depth: 5, skip_transitive: false, + eval: None, }), }); } else if let Some(ref mut tool) = project.tool { @@ -262,6 +263,7 @@ impl RepositoryManager { server: None, install_depth: 5, skip_transitive: false, + eval: None, }); } else if let Some(ref mut fastskill) = tool.fastskill { fastskill.repositories = Some(manifest_repos); diff --git a/src/eval/artifacts.rs b/src/eval/artifacts.rs new file mode 100644 index 0000000..b11a5b1 --- /dev/null +++ b/src/eval/artifacts.rs @@ -0,0 +1,209 @@ +//! Artifact layout and persistence for eval runs + +use crate::eval::checks::CheckResult; +use serde::{Deserialize, Serialize}; +use std::path::{Path, PathBuf}; +use thiserror::Error; + +/// Status of a single eval case +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "lowercase")] +pub enum CaseStatus { + Passed, + Failed, + Error, + Skipped, +} + +impl std::fmt::Display for CaseStatus { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + CaseStatus::Passed => write!(f, "passed"), + CaseStatus::Failed => write!(f, "failed"), + CaseStatus::Error => write!(f, "error"), + CaseStatus::Skipped => write!(f, "skipped"), + } + } +} + +/// Per-case result +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CaseResult { + pub id: String, + pub status: CaseStatus, + pub command_count: Option, + pub input_tokens: Option, + pub output_tokens: Option, + #[serde(default)] + pub check_results: Vec, + pub error_message: Option, +} + +/// Aggregated run summary +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SummaryResult { + pub suite_pass: bool, + pub agent: String, + pub model: Option, + pub total_cases: usize, + pub passed: usize, + pub failed: usize, + pub run_dir: PathBuf, + pub checks_path: Option, + pub skill_project_root: PathBuf, + pub cases: Vec, +} + +/// Per-case summary entry in summary.json +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CaseSummary { + pub id: String, + pub status: CaseStatus, + pub command_count: Option, + pub input_tokens: Option, + pub output_tokens: Option, +} + +/// All artifacts from a completed run +#[derive(Debug)] +pub struct RunArtifacts { + pub run_id: String, + pub run_dir: PathBuf, + pub summary: SummaryResult, + pub case_results: Vec, +} + +/// Errors during artifact writing/reading +#[derive(Debug, Error)] +pub enum ArtifactsError { + #[error("EVAL_ARTIFACTS_CORRUPT: IO error: {0}")] + Io(#[from] std::io::Error), + #[error("EVAL_ARTIFACTS_CORRUPT: JSON error: {0}")] + Json(#[from] serde_json::Error), + #[error("EVAL_ARTIFACTS_CORRUPT: Missing required field: {0}")] + MissingField(String), +} + +/// Allocate a run directory under output_dir using ISO 8601 timestamp format +/// Appends numeric suffix if directory already exists +pub fn allocate_run_dir(output_dir: &Path, run_id: &str) -> Result { + let base = output_dir.join(run_id); + if !base.exists() { + std::fs::create_dir_all(&base)?; + return Ok(base); + } + + // Append numeric suffix + for i in 2..=999 { + let candidate = output_dir.join(format!("{}-{}", run_id, i)); + if !candidate.exists() { + std::fs::create_dir_all(&candidate)?; + return Ok(candidate); + } + } + + // Fallback: use the base (it exists, will overwrite) + Ok(base) +} + +/// Write per-case artifacts (stdout.txt, stderr.txt, trace.jsonl, result.json) +pub fn write_case_artifacts( + run_dir: &Path, + case_id: &str, + stdout: &[u8], + stderr: &[u8], + trace_jsonl: &str, + result: &CaseResult, +) -> Result { + let case_dir = run_dir.join(case_id); + std::fs::create_dir_all(&case_dir)?; + + std::fs::write(case_dir.join("stdout.txt"), stdout)?; + std::fs::write(case_dir.join("stderr.txt"), stderr)?; + std::fs::write(case_dir.join("trace.jsonl"), trace_jsonl)?; + + let result_json = serde_json::to_string_pretty(result)?; + std::fs::write(case_dir.join("result.json"), result_json)?; + + Ok(case_dir) +} + +/// Write summary.json +pub fn write_summary(run_dir: &Path, summary: &SummaryResult) -> Result<(), ArtifactsError> { + let summary_json = serde_json::to_string_pretty(summary)?; + std::fs::write(run_dir.join("summary.json"), summary_json)?; + Ok(()) +} + +/// Read summary.json from a run directory +pub fn read_summary(run_dir: &Path) -> Result { + let summary_path = run_dir.join("summary.json"); + let content = std::fs::read_to_string(&summary_path)?; + let summary: SummaryResult = serde_json::from_str(&content)?; + Ok(summary) +} + +/// Read case result.json files from a run directory +pub fn read_case_results(run_dir: &Path) -> Result, ArtifactsError> { + let mut results = Vec::new(); + + let entries = std::fs::read_dir(run_dir)?; + for entry in entries.flatten() { + let path = entry.path(); + if path.is_dir() { + let result_path = path.join("result.json"); + if result_path.exists() { + let content = std::fs::read_to_string(&result_path)?; + let result: CaseResult = serde_json::from_str(&content)?; + results.push(result); + } + } + } + + Ok(results) +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + #[test] + fn test_allocate_run_dir_creates_new() { + let dir = TempDir::new().unwrap(); + let run_dir = allocate_run_dir(dir.path(), "2026-04-01T14-00-00Z").unwrap(); + assert!(run_dir.exists()); + assert!(run_dir.ends_with("2026-04-01T14-00-00Z")); + } + + #[test] + fn test_allocate_run_dir_suffix_on_conflict() { + let dir = TempDir::new().unwrap(); + let run_dir1 = allocate_run_dir(dir.path(), "2026-04-01T14-00-00Z").unwrap(); + let run_dir2 = allocate_run_dir(dir.path(), "2026-04-01T14-00-00Z").unwrap(); + assert_ne!(run_dir1, run_dir2); + assert!(run_dir2.to_string_lossy().contains("-2")); + } + + #[test] + fn test_write_and_read_summary() { + let dir = TempDir::new().unwrap(); + let summary = SummaryResult { + suite_pass: true, + agent: "codex".to_string(), + model: None, + total_cases: 2, + passed: 2, + failed: 0, + run_dir: dir.path().to_path_buf(), + checks_path: None, + skill_project_root: dir.path().to_path_buf(), + cases: vec![], + }; + + write_summary(dir.path(), &summary).unwrap(); + let read = read_summary(dir.path()).unwrap(); + assert_eq!(read.total_cases, 2); + assert!(read.suite_pass); + } +} diff --git a/src/eval/checks.rs b/src/eval/checks.rs new file mode 100644 index 0000000..47d460b --- /dev/null +++ b/src/eval/checks.rs @@ -0,0 +1,330 @@ +//! Deterministic check engine for eval artifact scoring + +use serde::{Deserialize, Serialize}; +use std::path::PathBuf; +use thiserror::Error; + +use crate::eval::trace::{TraceEvent, TracePayload}; + +/// A deterministic check definition loaded from checks.toml +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "name")] +pub enum CheckDefinition { + /// Check whether a pattern appears (or doesn't appear) in the trace + #[serde(rename = "trigger_expectation")] + TriggerExpectation { + pattern: String, + /// true = pattern must appear; false = pattern must NOT appear + expected: bool, + #[serde(default = "default_required")] + required: bool, + }, + /// Check whether the trace contains a line with this pattern + #[serde(rename = "command_contains")] + CommandContains { + pattern: String, + #[serde(default = "default_required")] + required: bool, + }, + /// Check whether a file exists in the working directory after execution + #[serde(rename = "file_exists")] + FileExists { + path: PathBuf, + #[serde(default = "default_required")] + required: bool, + }, + /// Check that the number of raw_json trace lines does not exceed a limit + #[serde(rename = "max_command_count")] + MaxCommandCount { + limit: usize, + #[serde(default = "default_required")] + required: bool, + }, +} + +fn default_required() -> bool { + true +} + +impl CheckDefinition { + pub fn name(&self) -> &str { + match self { + CheckDefinition::TriggerExpectation { .. } => "trigger_expectation", + CheckDefinition::CommandContains { .. } => "command_contains", + CheckDefinition::FileExists { .. } => "file_exists", + CheckDefinition::MaxCommandCount { .. } => "max_command_count", + } + } + + pub fn is_required(&self) -> bool { + match self { + CheckDefinition::TriggerExpectation { required, .. } => *required, + CheckDefinition::CommandContains { required, .. } => *required, + CheckDefinition::FileExists { required, .. } => *required, + CheckDefinition::MaxCommandCount { required, .. } => *required, + } + } +} + +/// Result of a single check +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CheckResult { + pub check_name: String, + pub passed: bool, + pub message: Option, +} + +/// TOML file structure for checks configuration +#[derive(Debug, Deserialize)] +pub struct ChecksToml { + #[serde(rename = "check", default)] + pub checks: Vec, +} + +/// Errors loading checks configuration +#[derive(Debug, Error)] +pub enum ChecksError { + #[error("EVAL_CHECKS_INVALID: Failed to read checks file: {0}")] + Io(#[from] std::io::Error), + #[error("EVAL_CHECKS_INVALID: Failed to parse checks TOML: {0}")] + Parse(#[from] toml::de::Error), +} + +/// Load check definitions from a TOML file +pub fn load_checks(path: &std::path::Path) -> Result, ChecksError> { + let content = std::fs::read_to_string(path)?; + let parsed: ChecksToml = toml::from_str(&content)?; + Ok(parsed.checks) +} + +/// Run all checks against captured stdout content and working directory +pub fn run_checks( + checks: &[CheckDefinition], + stdout_content: &str, + trace_jsonl: &str, + working_dir: &std::path::Path, +) -> Vec { + checks + .iter() + .map(|check| run_single_check(check, stdout_content, trace_jsonl, working_dir)) + .collect() +} + +/// Count trace events with payload type `raw_json`. +pub fn count_raw_json_events(trace_jsonl: &str) -> usize { + trace_jsonl + .lines() + .filter_map(|line| serde_json::from_str::(line).ok()) + .filter(|event| matches!(event.payload, TracePayload::RawJson { .. })) + .count() +} + +fn run_single_check( + check: &CheckDefinition, + stdout_content: &str, + trace_jsonl: &str, + working_dir: &std::path::Path, +) -> CheckResult { + match check { + CheckDefinition::TriggerExpectation { + pattern, expected, .. + } => { + // Literal substring matching in stdout + trace + let combined = format!("{}\n{}", stdout_content, trace_jsonl); + let found = combined.contains(pattern.as_str()); + let passed = found == *expected; + let message = if passed { + None + } else if *expected { + Some(format!("Pattern '{}' not found but was expected", pattern)) + } else { + Some(format!("Pattern '{}' found but was not expected", pattern)) + }; + CheckResult { + check_name: "trigger_expectation".to_string(), + passed, + message, + } + } + CheckDefinition::CommandContains { pattern, .. } => { + let combined = format!("{}\n{}", stdout_content, trace_jsonl); + let passed = combined.contains(pattern.as_str()); + let message = if passed { + None + } else { + Some(format!("Pattern '{}' not found in output", pattern)) + }; + CheckResult { + check_name: "command_contains".to_string(), + passed, + message, + } + } + CheckDefinition::FileExists { path, .. } => { + let full_path = working_dir.join(path); + let passed = full_path.exists(); + let message = if passed { + None + } else { + Some(format!("File '{}' does not exist", path.display())) + }; + CheckResult { + check_name: "file_exists".to_string(), + passed, + message, + } + } + CheckDefinition::MaxCommandCount { limit, .. } => { + // Count raw_json trace lines + let count = count_raw_json_events(trace_jsonl); + let passed = count <= *limit; + let message = if passed { + None + } else { + Some(format!("Command count {} exceeds limit {}", count, limit)) + }; + CheckResult { + check_name: "max_command_count".to_string(), + passed, + message, + } + } + } +} + +/// Aggregate check results: suite passes if all required checks pass +pub fn suite_passes(results: &[CheckResult]) -> bool { + results.iter().all(|r| r.passed) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::path::Path; + use tempfile::TempDir; + + #[test] + fn test_trigger_expectation_passes_when_pattern_found() { + let check = CheckDefinition::TriggerExpectation { + pattern: "fastskill".to_string(), + expected: true, + required: true, + }; + let results = run_checks(&[check], "fastskill triggered", "", Path::new("/tmp")); + assert!(results[0].passed); + } + + #[test] + fn test_trigger_expectation_fails_when_pattern_missing() { + let check = CheckDefinition::TriggerExpectation { + pattern: "fastskill".to_string(), + expected: true, + required: true, + }; + let results = run_checks(&[check], "nothing here", "", Path::new("/tmp")); + assert!(!results[0].passed); + } + + #[test] + fn test_trigger_expectation_negative_passes_when_pattern_absent() { + let check = CheckDefinition::TriggerExpectation { + pattern: "fastskill".to_string(), + expected: false, + required: true, + }; + let results = run_checks(&[check], "no match", "", Path::new("/tmp")); + assert!(results[0].passed); + } + + #[test] + fn test_file_exists_check_passes() { + let dir = TempDir::new().unwrap(); + let file_path = dir.path().join("output.txt"); + std::fs::write(&file_path, "content").unwrap(); + + let check = CheckDefinition::FileExists { + path: PathBuf::from("output.txt"), + required: true, + }; + let results = run_checks(&[check], "", "", dir.path()); + assert!(results[0].passed); + } + + #[test] + fn test_file_exists_check_fails() { + let dir = TempDir::new().unwrap(); + let check = CheckDefinition::FileExists { + path: PathBuf::from("missing.txt"), + required: true, + }; + let results = run_checks(&[check], "", "", dir.path()); + assert!(!results[0].passed); + } + + #[test] + fn test_max_command_count_passes() { + let check = CheckDefinition::MaxCommandCount { + limit: 5, + required: true, + }; + let trace = r#"{"seq":0,"payload":{"type":"raw_json","data":{"cmd":"a"}}} +{"seq":1,"payload":{"type":"raw_json","data":{"cmd":"b"}}} +{"seq":2,"payload":{"type":"raw_line","line":"ok"}}"#; + let results = run_checks(&[check], "", trace, Path::new("/tmp")); + assert!(results[0].passed); + } + + #[test] + fn test_max_command_count_fails() { + let check = CheckDefinition::MaxCommandCount { + limit: 1, + required: true, + }; + let trace = r#"{"seq":0,"payload":{"type":"raw_json","data":{"cmd":"a"}}} +{"seq":1,"payload":{"type":"raw_json","data":{"cmd":"b"}}} +{"seq":2,"payload":{"type":"raw_json","data":{"cmd":"c"}}}"#; + let results = run_checks(&[check], "", trace, Path::new("/tmp")); + assert!(!results[0].passed); + } + + #[test] + fn test_count_raw_json_events_ignores_substring_only() { + let trace = r#"{"seq":0,"payload":{"type":"raw_line","line":"mentions raw_json text"}} +{"seq":1,"payload":{"type":"raw_json","data":{"cmd":"x"}}}"#; + assert_eq!(count_raw_json_events(trace), 1); + } + + #[test] + fn test_suite_passes_all_passed() { + let results = vec![ + CheckResult { + check_name: "a".to_string(), + passed: true, + message: None, + }, + CheckResult { + check_name: "b".to_string(), + passed: true, + message: None, + }, + ]; + assert!(suite_passes(&results)); + } + + #[test] + fn test_suite_passes_any_failed() { + let results = vec![ + CheckResult { + check_name: "a".to_string(), + passed: true, + message: None, + }, + CheckResult { + check_name: "b".to_string(), + passed: false, + message: Some("failed".to_string()), + }, + ]; + assert!(!suite_passes(&results)); + } +} diff --git a/src/eval/config.rs b/src/eval/config.rs new file mode 100644 index 0000000..ecb6e5b --- /dev/null +++ b/src/eval/config.rs @@ -0,0 +1,140 @@ +//! Eval configuration resolution + +use crate::core::manifest::{EvalConfigToml, SkillProjectToml}; +use std::path::{Path, PathBuf}; +use thiserror::Error; + +/// Resolved eval configuration (after path resolution) +#[derive(Debug, Clone)] +pub struct EvalConfig { + /// Absolute path to prompts CSV file + pub prompts_path: PathBuf, + /// Absolute path to checks TOML file (optional) + pub checks_path: Option, + /// Timeout in seconds for each case + pub timeout_seconds: u64, + /// Whether to fail fast if agent is not available + pub fail_on_missing_agent: bool, + /// Skill project root directory + pub project_root: PathBuf, +} + +/// Errors during eval configuration resolution +#[derive(Debug, Error)] +pub enum EvalConfigError { + #[error("EVAL_CONFIG_MISSING: No [tool.fastskill.eval] section found in skill-project.toml")] + ConfigMissing, + #[error("EVAL_PROMPTS_NOT_FOUND: Prompts CSV not found: {0}")] + PromptsNotFound(PathBuf), + #[error("Failed to read skill-project.toml: {0}")] + Io(#[from] std::io::Error), + #[error("Failed to parse skill-project.toml: {0}")] + Parse(String), +} + +/// Resolve eval configuration from a skill-project.toml file +pub fn resolve_eval_config( + project_file: &Path, + project_root: &Path, +) -> Result { + let content = std::fs::read_to_string(project_file)?; + let toml: SkillProjectToml = + toml::from_str(&content).map_err(|e| EvalConfigError::Parse(e.to_string()))?; + + let eval_config = toml + .tool + .as_ref() + .and_then(|t| t.fastskill.as_ref()) + .and_then(|f| f.eval.as_ref()) + .ok_or(EvalConfigError::ConfigMissing)?; + + resolve_from_toml(eval_config, project_root) +} + +/// Resolve eval configuration from parsed TOML config +pub fn resolve_from_toml( + config: &EvalConfigToml, + project_root: &Path, +) -> Result { + let prompts_path = if config.prompts.is_absolute() { + config.prompts.clone() + } else { + project_root.join(&config.prompts) + }; + + if !prompts_path.exists() { + return Err(EvalConfigError::PromptsNotFound(prompts_path)); + } + + let checks_path = config.checks.as_ref().map(|p| { + if p.is_absolute() { + p.clone() + } else { + project_root.join(p) + } + }); + + Ok(EvalConfig { + prompts_path, + checks_path, + timeout_seconds: config.timeout_seconds, + fail_on_missing_agent: config.fail_on_missing_agent, + project_root: project_root.to_path_buf(), + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + #[test] + fn test_resolve_eval_config_missing() { + let dir = TempDir::new().unwrap(); + let project_file = dir.path().join("skill-project.toml"); + std::fs::write(&project_file, "[metadata]\nid = \"test\"\n").unwrap(); + + let result = resolve_eval_config(&project_file, dir.path()); + assert!(matches!(result, Err(EvalConfigError::ConfigMissing))); + } + + #[test] + fn test_resolve_eval_config_prompts_not_found() { + let dir = TempDir::new().unwrap(); + let project_file = dir.path().join("skill-project.toml"); + std::fs::write( + &project_file, + "[metadata]\nid = \"test\"\n\n[tool.fastskill.eval]\nprompts = \"evals/prompts.csv\"\ntimeout_seconds = 900\nfail_on_missing_agent = true\n", + ) + .unwrap(); + + let result = resolve_eval_config(&project_file, dir.path()); + assert!(matches!(result, Err(EvalConfigError::PromptsNotFound(_)))); + } + + #[test] + fn test_resolve_eval_config_success() { + let dir = TempDir::new().unwrap(); + let evals_dir = dir.path().join("evals"); + std::fs::create_dir_all(&evals_dir).unwrap(); + let prompts_file = evals_dir.join("prompts.csv"); + std::fs::write( + &prompts_file, + "id,prompt,should_trigger\ntest-1,hello,true\n", + ) + .unwrap(); + + let project_file = dir.path().join("skill-project.toml"); + std::fs::write( + &project_file, + "[metadata]\nid = \"test\"\n\n[tool.fastskill.eval]\nprompts = \"evals/prompts.csv\"\ntimeout_seconds = 600\nfail_on_missing_agent = false\n", + ) + .unwrap(); + + let result = resolve_eval_config(&project_file, dir.path()); + assert!(result.is_ok()); + let config = result.unwrap(); + assert_eq!(config.timeout_seconds, 600); + assert!(!config.fail_on_missing_agent); + } +} diff --git a/src/eval/mod.rs b/src/eval/mod.rs new file mode 100644 index 0000000..33ac78a --- /dev/null +++ b/src/eval/mod.rs @@ -0,0 +1,12 @@ +//! Evaluation domain types and services for skill quality assurance + +pub mod artifacts; +pub mod checks; +pub mod config; +pub mod runner; +pub mod suite; +pub mod trace; + +pub use runner::{ + run_eval_case, AikitEvalRunner, CaseRunOptions, CaseRunOutput, EvalRunner, RunnerError, +}; diff --git a/src/eval/runner.rs b/src/eval/runner.rs new file mode 100644 index 0000000..c5495c4 --- /dev/null +++ b/src/eval/runner.rs @@ -0,0 +1,300 @@ +//! Eval runner implementation using aikit-sdk + +use crate::eval::artifacts::{CaseResult, CaseStatus}; +use crate::eval::checks::{count_raw_json_events, run_checks, CheckDefinition}; +use crate::eval::suite::EvalCase; +use crate::eval::trace::{agent_events_to_trace, trace_to_jsonl, TraceEvent, TracePayload}; +use aikit_sdk::{run_agent_events, AgentEvent, RunOptions}; +use async_trait::async_trait; +use std::path::PathBuf; +use std::time::Duration; +use thiserror::Error; + +/// Options for running a single eval case +#[derive(Debug, Clone)] +pub struct CaseRunOptions { + /// Agent key (e.g. "codex", "claude") + pub agent_key: String, + /// Optional model override + pub model: Option, + /// Skill project root (used as working directory when no workspace_subdir) + pub project_root: PathBuf, + /// Timeout in seconds + pub timeout_seconds: u64, +} + +/// Raw output from running a case +#[derive(Debug)] +pub struct CaseRunOutput { + pub stdout: Vec, + pub stderr: Vec, + pub exit_code: Option, + pub timed_out: bool, +} + +/// Errors during case execution +#[derive(Debug, Error)] +pub enum RunnerError { + #[error("EVAL_AGENT_UNAVAILABLE: Agent '{0}' is not available")] + AgentUnavailable(String), + #[error("EVAL_CASE_TIMEOUT: Case timed out after {0}s")] + Timeout(u64), + #[error("Execution failed: {0}")] + ExecutionFailed(String), +} + +/// Abstraction over eval case execution (default: aikit-backed). +#[async_trait] +pub trait EvalRunner: Send + Sync { + /// Run one case, produce stdout/stderr capture, scored result, and canonical trace JSONL. + async fn run_case( + &self, + case: &EvalCase, + opts: &CaseRunOptions, + checks: &[CheckDefinition], + ) -> (CaseRunOutput, CaseResult, String); +} + +/// Default runner: `aikit_sdk::run_agent_events` inside `spawn_blocking` with SDK timeout/cwd. +#[derive(Debug, Clone, Copy, Default)] +pub struct AikitEvalRunner; + +/// Result of agent execution within spawn_blocking +struct AgentExecutionResult { + result: Result, + events: Vec, +} + +#[async_trait] +impl EvalRunner for AikitEvalRunner { + async fn run_case( + &self, + case: &EvalCase, + opts: &CaseRunOptions, + checks: &[CheckDefinition], + ) -> (CaseRunOutput, CaseResult, String) { + self.run_case_inner(case, opts, checks).await + } +} + +impl AikitEvalRunner { + async fn run_case_inner( + &self, + case: &EvalCase, + opts: &CaseRunOptions, + checks: &[CheckDefinition], + ) -> (CaseRunOutput, CaseResult, String) { + let agent_key = opts.agent_key.clone(); + let model = opts.model.clone(); + let prompt = case.prompt.clone(); + let timeout_secs = opts.timeout_seconds; + + let working_dir = match &case.workspace_subdir { + Some(subdir) => opts.project_root.join(subdir), + None => opts.project_root.clone(), + }; + + let mut run_opts = RunOptions::new() + .with_yolo(true) + .with_stream(true) + .with_timeout(Duration::from_secs(timeout_secs)) + .with_current_dir(working_dir.clone()); + if let Some(model_name) = model { + if !model_name.trim().is_empty() { + run_opts = run_opts.with_model(model_name); + } + } + + let spawn_result = tokio::task::spawn_blocking(move || { + let mut events: Vec = Vec::new(); + let result = run_agent_events(&agent_key, &prompt, run_opts, |ev| { + events.push(ev.clone()); + }); + AgentExecutionResult { result, events } + }); + + let (run_output, trace_events) = match spawn_result.await { + Ok(exec_result) => match exec_result.result { + Ok(run_result) => { + let exit_code = run_result.exit_code(); + let output = CaseRunOutput { + stdout: run_result.stdout, + stderr: run_result.stderr, + exit_code, + timed_out: false, + }; + let trace = agent_events_to_trace(&exec_result.events); + (output, trace) + } + Err(aikit_sdk::RunError::TimedOut { + timeout, stderr, .. + }) => { + let mut trace = agent_events_to_trace(&exec_result.events); + trace.push(TraceEvent { + seq: trace.len(), + payload: TracePayload::Timeout, + }); + let output = CaseRunOutput { + stdout: vec![], + stderr, + exit_code: None, + timed_out: true, + }; + if output.stderr.is_empty() { + let fallback = format!("Case timed out after {}s", timeout.as_secs()); + let output = CaseRunOutput { + stdout: vec![], + stderr: fallback.into_bytes(), + exit_code: None, + timed_out: true, + }; + (output, trace) + } else { + (output, trace) + } + } + Err(e) => { + let trace = agent_events_to_trace(&exec_result.events); + let output = CaseRunOutput { + stdout: vec![], + stderr: format!("Agent execution failed: {}", e).into_bytes(), + exit_code: None, + timed_out: false, + }; + (output, trace) + } + }, + Err(e) => { + let output = CaseRunOutput { + stdout: vec![], + stderr: format!("spawn_blocking failed: {}", e).into_bytes(), + exit_code: None, + timed_out: false, + }; + (output, vec![]) + } + }; + + let trace_jsonl = trace_to_jsonl(&trace_events); + let stdout_str = String::from_utf8_lossy(&run_output.stdout).to_string(); + let command_count = count_raw_json_events(&trace_jsonl); + let check_results = run_checks(checks, &stdout_str, &trace_jsonl, &working_dir); + let all_passed = check_results.iter().all(|r| r.passed); + + let status = if run_output.timed_out { + CaseStatus::Error + } else if checks.is_empty() { + if run_output.exit_code == Some(0) { + CaseStatus::Passed + } else { + CaseStatus::Failed + } + } else if all_passed { + CaseStatus::Passed + } else { + CaseStatus::Failed + }; + + let case_result = CaseResult { + id: case.id.clone(), + status, + command_count: Some(command_count), + input_tokens: None, + output_tokens: None, + check_results, + error_message: None, + }; + + (run_output, case_result, trace_jsonl) + } +} + +/// Run a single eval case using the default [`AikitEvalRunner`]. +/// +/// Sole agent execution path per spec (acceptance criterion 29). +pub async fn run_eval_case( + case: &EvalCase, + opts: &CaseRunOptions, + checks: &[CheckDefinition], +) -> (CaseRunOutput, CaseResult, String) { + AikitEvalRunner.run_case(case, opts, checks).await +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::eval::artifacts::CaseStatus; + + /// Stub runner for trait wiring tests (no aikit). + struct StubEvalRunner; + + #[async_trait] + impl EvalRunner for StubEvalRunner { + async fn run_case( + &self, + case: &EvalCase, + _opts: &CaseRunOptions, + _checks: &[CheckDefinition], + ) -> (CaseRunOutput, CaseResult, String) { + let trace_jsonl = + r#"{"seq":0,"payload":{"type":"raw_line","line":"stub"}}"#.to_string(); + let out = CaseRunOutput { + stdout: b"ok".to_vec(), + stderr: vec![], + exit_code: Some(0), + timed_out: false, + }; + let result = CaseResult { + id: case.id.clone(), + status: CaseStatus::Passed, + command_count: Some(0), + input_tokens: None, + output_tokens: None, + check_results: vec![], + error_message: None, + }; + (out, result, trace_jsonl) + } + } + + #[tokio::test] + async fn test_eval_runner_trait_stub_returns_expected_trace() { + let case = EvalCase { + id: "c1".to_string(), + prompt: "p".to_string(), + should_trigger: true, + tags: vec![], + workspace_subdir: None, + }; + let opts = CaseRunOptions { + agent_key: "agent".to_string(), + model: None, + project_root: PathBuf::from("/tmp"), + timeout_seconds: 1, + }; + let runner = StubEvalRunner; + let (out, res, trace) = runner.run_case(&case, &opts, &[]).await; + assert_eq!(out.exit_code, Some(0)); + assert_eq!(res.id, "c1"); + assert!(trace.contains("raw_line")); + } + + #[test] + fn test_case_run_options_builder() { + let opts = CaseRunOptions { + agent_key: "codex".to_string(), + model: Some("gpt-4".to_string()), + project_root: PathBuf::from("/tmp"), + timeout_seconds: 300, + }; + assert_eq!(opts.agent_key, "codex"); + assert_eq!(opts.model, Some("gpt-4".to_string())); + } + + #[test] + fn test_runner_error_display() { + let err = RunnerError::AgentUnavailable("codex".to_string()); + assert!(err.to_string().contains("codex")); + assert!(err.to_string().contains("EVAL_AGENT_UNAVAILABLE")); + } +} diff --git a/src/eval/suite.rs b/src/eval/suite.rs new file mode 100644 index 0000000..8d4df58 --- /dev/null +++ b/src/eval/suite.rs @@ -0,0 +1,277 @@ +//! Eval suite loading and CSV parsing + +use std::path::{Path, PathBuf}; +use thiserror::Error; + +/// An individual eval case definition +#[derive(Debug, Clone)] +pub struct EvalCase { + /// Unique case identifier (alphanumeric + hyphens) + pub id: String, + /// Prompt to send to the agent + pub prompt: String, + /// Whether the skill should trigger (documentation-only; checks.toml is authoritative for pass/fail) + pub should_trigger: bool, + /// Tags for filtering + pub tags: Vec, + /// Optional workspace subdirectory (relative to skill project root) + pub workspace_subdir: Option, +} + +/// A collection of eval cases +#[derive(Debug, Default)] +pub struct EvalSuite { + pub cases: Vec, +} + +impl EvalSuite { + pub fn new(cases: Vec) -> Self { + Self { cases } + } + + /// Filter cases by ID + pub fn filter_by_id(&self, id: &str) -> EvalSuite { + EvalSuite { + cases: self.cases.iter().filter(|c| c.id == id).cloned().collect(), + } + } + + /// Filter cases by tag + pub fn filter_by_tag(&self, tag: &str) -> EvalSuite { + EvalSuite { + cases: self + .cases + .iter() + .filter(|c| c.tags.iter().any(|t| t == tag)) + .cloned() + .collect(), + } + } +} + +/// Errors that can occur when loading an eval suite +#[derive(Debug, Error)] +pub enum SuiteError { + #[error("EVAL_PROMPTS_NOT_FOUND: Prompts CSV file not found: {0}")] + PromptsNotFound(PathBuf), + #[error("EVAL_INVALID_CSV: {0}")] + InvalidCsv(String), + #[error("EVAL_INVALID_CSV: IO error: {0}")] + Io(#[from] std::io::Error), +} + +/// Load an eval suite from a CSV file +/// +/// Expected CSV columns: id,prompt,should_trigger,tags,workspace_subdir +pub fn load_suite(prompts_path: &Path) -> Result { + if !prompts_path.exists() { + return Err(SuiteError::PromptsNotFound(prompts_path.to_path_buf())); + } + + let content = std::fs::read_to_string(prompts_path)?; + parse_prompts_csv(&content) +} + +/// Parse prompts CSV content +fn parse_prompts_csv(content: &str) -> Result { + let mut lines = content.lines(); + + // Parse header + let header = lines + .next() + .ok_or_else(|| SuiteError::InvalidCsv("CSV is empty".to_string()))?; + let headers: Vec = parse_csv_line(header); + + // Find column indices + let id_idx = find_col(&headers, "id")?; + let prompt_idx = find_col(&headers, "prompt")?; + let should_trigger_idx = find_col(&headers, "should_trigger")?; + let tags_idx = headers.iter().position(|h| h.trim() == "tags"); + let workspace_subdir_idx = headers.iter().position(|h| h.trim() == "workspace_subdir"); + + let mut cases = Vec::new(); + + for (line_num, line) in lines.enumerate() { + let line = line.trim(); + if line.is_empty() { + continue; + } + + let cols = parse_csv_line(line); + + let id = cols + .get(id_idx) + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .ok_or_else(|| { + SuiteError::InvalidCsv(format!("Missing id at line {}", line_num + 2)) + })?; + + let prompt = cols + .get(prompt_idx) + .map(|s| s.trim_matches('"').trim().to_string()) + .ok_or_else(|| { + SuiteError::InvalidCsv(format!("Missing prompt at line {}", line_num + 2)) + })?; + + let should_trigger_str = cols + .get(should_trigger_idx) + .map(|s| s.trim().to_lowercase()) + .unwrap_or_else(|| "false".to_string()); + let should_trigger = should_trigger_str == "true" || should_trigger_str == "1"; + + let tags = if let Some(idx) = tags_idx { + cols.get(idx) + .map(|s| { + s.trim() + .trim_matches('"') + .split(',') + .map(|t| t.trim().to_string()) + .filter(|t| !t.is_empty()) + .collect() + }) + .unwrap_or_default() + } else { + vec![] + }; + + let workspace_subdir = if let Some(idx) = workspace_subdir_idx { + cols.get(idx).and_then(|s| { + let s = s.trim(); + if s.is_empty() { + None + } else { + Some(PathBuf::from(s)) + } + }) + } else { + None + }; + + cases.push(EvalCase { + id, + prompt, + should_trigger, + tags, + workspace_subdir, + }); + } + + Ok(EvalSuite::new(cases)) +} + +fn find_col(headers: &[String], name: &str) -> Result { + headers + .iter() + .position(|h| h.trim() == name) + .ok_or_else(|| SuiteError::InvalidCsv(format!("Missing required column: {}", name))) +} + +/// Simple CSV line parser that handles quoted fields +fn parse_csv_line(line: &str) -> Vec { + let mut fields = Vec::new(); + let mut current = String::new(); + let mut in_quotes = false; + let mut chars = line.chars().peekable(); + + while let Some(ch) = chars.next() { + match ch { + '"' => { + if in_quotes { + // Check for escaped quote "" + if chars.peek() == Some(&'"') { + chars.next(); + current.push('"'); + } else { + in_quotes = false; + } + } else { + in_quotes = true; + } + } + ',' if !in_quotes => { + fields.push(current.clone()); + current.clear(); + } + _ => { + current.push(ch); + } + } + } + fields.push(current); + fields +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_prompts_csv_basic() { + let csv = "id,prompt,should_trigger,tags,workspace_subdir\n\ + test-1,\"Do something\",true,\"basic\",\n\ + test-2,\"Do nothing\",false,\"\",\n"; + let suite = parse_prompts_csv(csv).unwrap(); + assert_eq!(suite.cases.len(), 2); + assert_eq!(suite.cases[0].id, "test-1"); + assert!(suite.cases[0].should_trigger); + assert_eq!(suite.cases[1].id, "test-2"); + assert!(!suite.cases[1].should_trigger); + } + + #[test] + fn test_parse_prompts_csv_missing_required_col() { + let csv = "id,prompt\ntest-1,hello\n"; + let result = parse_prompts_csv(csv); + assert!(result.is_err()); + assert!(result.unwrap_err().to_string().contains("should_trigger")); + } + + #[test] + fn test_filter_by_id() { + let cases = vec![ + EvalCase { + id: "a".to_string(), + prompt: "p1".to_string(), + should_trigger: true, + tags: vec![], + workspace_subdir: None, + }, + EvalCase { + id: "b".to_string(), + prompt: "p2".to_string(), + should_trigger: false, + tags: vec![], + workspace_subdir: None, + }, + ]; + let suite = EvalSuite::new(cases); + let filtered = suite.filter_by_id("a"); + assert_eq!(filtered.cases.len(), 1); + assert_eq!(filtered.cases[0].id, "a"); + } + + #[test] + fn test_filter_by_tag() { + let cases = vec![ + EvalCase { + id: "a".to_string(), + prompt: "p1".to_string(), + should_trigger: true, + tags: vec!["foo".to_string(), "bar".to_string()], + workspace_subdir: None, + }, + EvalCase { + id: "b".to_string(), + prompt: "p2".to_string(), + should_trigger: false, + tags: vec!["baz".to_string()], + workspace_subdir: None, + }, + ]; + let suite = EvalSuite::new(cases); + let filtered = suite.filter_by_tag("foo"); + assert_eq!(filtered.cases.len(), 1); + assert_eq!(filtered.cases[0].id, "a"); + } +} diff --git a/src/eval/trace.rs b/src/eval/trace.rs new file mode 100644 index 0000000..e028b1f --- /dev/null +++ b/src/eval/trace.rs @@ -0,0 +1,127 @@ +//! Trace event types for eval case execution + +use aikit_sdk::{AgentEvent, AgentEventPayload}; +use serde::{Deserialize, Serialize}; + +/// A single line in a trace.jsonl file +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TraceEvent { + /// Sequence number (0-based) + pub seq: usize, + /// Event payload + pub payload: TracePayload, +} + +/// Payload of a trace event +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum TracePayload { + /// A raw JSON line from the agent + RawJson { data: serde_json::Value }, + /// A raw text line from stdout + RawLine { line: String }, + /// A raw bytes chunk (base64-encoded) + RawBytes { b64: String }, + /// Execution error + Error { message: String }, + /// Case timed out + Timeout, +} + +/// Convert aikit-sdk AgentEvent to internal TraceEvent +pub fn agent_events_to_trace(events: &[AgentEvent]) -> Vec { + events + .iter() + .map(|ev| { + let payload = match &ev.payload { + AgentEventPayload::JsonLine(value) => TracePayload::RawJson { + data: value.clone(), + }, + AgentEventPayload::RawLine(line) => TracePayload::RawLine { line: line.clone() }, + AgentEventPayload::RawBytes(bytes) => { + use base64::{engine::general_purpose::STANDARD, Engine as _}; + TracePayload::RawBytes { + b64: STANDARD.encode(bytes), + } + } + }; + TraceEvent { + seq: ev.seq as usize, + payload, + } + }) + .collect() +} + +/// Convert raw stdout lines to trace events +pub fn stdout_to_trace(stdout: &[u8]) -> Vec { + let text = String::from_utf8_lossy(stdout); + let mut events = Vec::new(); + + for (seq, line) in text.lines().enumerate() { + let trimmed = line.trim(); + if trimmed.is_empty() { + continue; + } + + // Try to parse as JSON first + let payload = if let Ok(value) = serde_json::from_str::(trimmed) { + TracePayload::RawJson { data: value } + } else { + TracePayload::RawLine { + line: line.to_string(), + } + }; + + events.push(TraceEvent { seq, payload }); + } + + events +} + +/// Serialize trace events to JSONL format +pub fn trace_to_jsonl(events: &[TraceEvent]) -> String { + events + .iter() + .filter_map(|e| serde_json::to_string(e).ok()) + .collect::>() + .join("\n") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_stdout_to_trace_text_lines() { + let stdout = b"hello world\nfoo bar\n"; + let events = stdout_to_trace(stdout); + assert_eq!(events.len(), 2); + assert_eq!(events[0].seq, 0); + assert!( + matches!(&events[0].payload, TracePayload::RawLine { line } if line == "hello world") + ); + } + + #[test] + fn test_stdout_to_trace_json_lines() { + let stdout = b"{\"key\": \"value\"}\nplain line\n"; + let events = stdout_to_trace(stdout); + assert_eq!(events.len(), 2); + assert!(matches!(&events[0].payload, TracePayload::RawJson { .. })); + assert!(matches!(&events[1].payload, TracePayload::RawLine { .. })); + } + + #[test] + fn test_trace_to_jsonl() { + let events = vec![TraceEvent { + seq: 0, + payload: TracePayload::RawLine { + line: "test".to_string(), + }, + }]; + let jsonl = trace_to_jsonl(&events); + assert!(jsonl.contains("\"seq\":0")); + assert!(jsonl.contains("raw_line")); + } +} diff --git a/src/lib.rs b/src/lib.rs index 6eef1a0..4dc9dc8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -49,6 +49,7 @@ //! ``` pub mod core; +pub mod eval; pub mod events; pub mod execution; pub mod http; diff --git a/tests/cli/eval_tests.rs b/tests/cli/eval_tests.rs new file mode 100644 index 0000000..f98a08f --- /dev/null +++ b/tests/cli/eval_tests.rs @@ -0,0 +1,301 @@ +//! CLI integration tests for eval commands + +#![allow(clippy::all, clippy::unwrap_used, clippy::expect_used)] + +use super::snapshot_helpers::{ + assert_snapshot_with_settings, cli_snapshot_settings, run_fastskill_command, + run_fastskill_command_with_env, +}; + +#[test] +fn test_eval_help() { + let result = run_fastskill_command(&["eval", "--help"], None); + assert!(result.success); + assert_snapshot_with_settings("eval_help", &result.stdout, &cli_snapshot_settings()); +} + +#[test] +fn test_eval_validate_help() { + let result = run_fastskill_command(&["eval", "validate", "--help"], None); + assert!(result.success); + assert_snapshot_with_settings( + "eval_validate_help", + &result.stdout, + &cli_snapshot_settings(), + ); +} + +#[test] +fn test_eval_run_help() { + let result = run_fastskill_command(&["eval", "run", "--help"], None); + assert!(result.success); + assert_snapshot_with_settings("eval_run_help", &result.stdout, &cli_snapshot_settings()); +} + +#[test] +fn test_eval_report_help() { + let result = run_fastskill_command(&["eval", "report", "--help"], None); + assert!(result.success); + assert_snapshot_with_settings("eval_report_help", &result.stdout, &cli_snapshot_settings()); +} + +#[test] +fn test_eval_score_help() { + let result = run_fastskill_command(&["eval", "score", "--help"], None); + assert!(result.success); + assert_snapshot_with_settings("eval_score_help", &result.stdout, &cli_snapshot_settings()); +} + +#[test] +fn test_eval_run_requires_agent() { + let result = run_fastskill_command(&["eval", "run", "--output-dir", "/tmp/evals"], None); + assert!(!result.success); + let combined = format!("{}{}", result.stdout, result.stderr); + assert!( + combined.contains("--agent") || combined.contains("agent"), + "Expected error about missing --agent, got: {}", + combined + ); +} + +#[test] +fn test_eval_run_requires_output_dir() { + let result = run_fastskill_command(&["eval", "run", "--agent", "codex"], None); + assert!(!result.success); + let combined = format!("{}{}", result.stdout, result.stderr); + assert!( + combined.contains("--output-dir") + || combined.contains("output-dir") + || combined.contains("output_dir"), + "Expected error about missing --output-dir, got: {}", + combined + ); +} + +#[test] +fn test_eval_run_rejects_unsupported_agent() { + let result = run_fastskill_command( + &[ + "eval", + "run", + "--agent", + "unsupported-agent-xyz", + "--output-dir", + "/tmp/evals", + ], + None, + ); + assert!(!result.success); + let combined = format!("{}{}", result.stdout, result.stderr); + assert!( + combined.contains("unsupported-agent-xyz") || combined.contains("not a supported agent"), + "Expected error about unsupported agent, got: {}", + combined + ); +} + +#[test] +fn test_eval_validate_no_project_file() { + use tempfile::TempDir; + let dir = TempDir::new().unwrap(); + let result = run_fastskill_command(&["eval", "validate"], Some(dir.path())); + assert!(!result.success); + let combined = format!("{}{}", result.stdout, result.stderr); + assert!( + combined.contains("skill-project.toml") || combined.contains("EVAL_CONFIG_MISSING"), + "Expected error about missing skill-project.toml, got: {}", + combined + ); +} + +#[test] +fn test_eval_validate_no_eval_config() { + use std::fs; + use tempfile::TempDir; + + let dir = TempDir::new().unwrap(); + fs::write( + dir.path().join("skill-project.toml"), + "[metadata]\nid = \"test-skill\"\n", + ) + .unwrap(); + + let result = run_fastskill_command(&["eval", "validate"], Some(dir.path())); + assert!(!result.success); + let combined = format!("{}{}", result.stdout, result.stderr); + assert!( + combined.contains("EVAL_CONFIG_MISSING") || combined.contains("eval"), + "Expected EVAL_CONFIG_MISSING error, got: {}", + combined + ); +} + +#[test] +fn test_eval_validate_with_eval_config() { + use std::fs; + use tempfile::TempDir; + + let dir = TempDir::new().unwrap(); + + // Create evals directory and prompts.csv + let evals_dir = dir.path().join("evals"); + fs::create_dir_all(&evals_dir).unwrap(); + fs::write( + evals_dir.join("prompts.csv"), + "id,prompt,should_trigger,tags,workspace_subdir\ntest-1,\"Test prompt\",true,\"basic\",\n", + ) + .unwrap(); + + // Create SKILL.md so it's detected as skill context + fs::write(dir.path().join("SKILL.md"), "# Test Skill\n").unwrap(); + + // Create skill-project.toml with eval config + fs::write( + dir.path().join("skill-project.toml"), + "[metadata]\nid = \"test-skill\"\n\n[tool.fastskill.eval]\nprompts = \"evals/prompts.csv\"\ntimeout_seconds = 300\nfail_on_missing_agent = false\n", + ) + .unwrap(); + + let result = run_fastskill_command(&["eval", "validate"], Some(dir.path())); + assert!( + result.success, + "Expected eval validate to succeed, got stdout: {}, stderr: {}", + result.stdout, result.stderr + ); + assert!( + result.stdout.contains("valid") || result.stdout.contains("prompts"), + "Expected valid output, got: {}", + result.stdout + ); +} + +#[test] +fn test_eval_report_requires_run_dir() { + let result = run_fastskill_command(&["eval", "report"], None); + assert!(!result.success); + let combined = format!("{}{}", result.stdout, result.stderr); + assert!( + combined.contains("--run-dir") || combined.contains("run-dir"), + "Expected error about missing --run-dir, got: {}", + combined + ); +} + +#[test] +fn test_eval_score_requires_run_dir() { + let result = run_fastskill_command(&["eval", "score"], None); + assert!(!result.success); + let combined = format!("{}{}", result.stdout, result.stderr); + assert!( + combined.contains("--run-dir") || combined.contains("run-dir"), + "Expected error about missing --run-dir, got: {}", + combined + ); +} + +#[test] +fn test_eval_report_nonexistent_run_dir() { + let result = run_fastskill_command( + &[ + "eval", + "report", + "--run-dir", + "/tmp/nonexistent-fastskill-eval-dir-xyz123", + ], + None, + ); + assert!(!result.success); + let combined = format!("{}{}", result.stdout, result.stderr); + assert!( + combined.contains("EVAL_ARTIFACTS_CORRUPT") || combined.contains("not exist"), + "Expected error about nonexistent dir, got: {}", + combined + ); +} + +#[test] +fn test_eval_run_persists_event_trace_jsonl() { + use serde_json::Value; + use std::env; + use std::fs; + use tempfile::TempDir; + + let dir = TempDir::new().unwrap(); + let evals_dir = dir.path().join("evals"); + fs::create_dir_all(&evals_dir).unwrap(); + fs::write( + evals_dir.join("prompts.csv"), + "id,prompt,should_trigger,tags,workspace_subdir\ntrace-case,\"test prompt\",true,\"basic\",\n", + ) + .unwrap(); + fs::write(dir.path().join("SKILL.md"), "# Test Skill\n").unwrap(); + fs::write( + dir.path().join("skill-project.toml"), + "[metadata]\nid = \"test-skill\"\n\n[tool.fastskill.eval]\nprompts = \"evals/prompts.csv\"\ntimeout_seconds = 30\nfail_on_missing_agent = true\n", + ) + .unwrap(); + + // Create a fake `agent` executable so aikit_sdk::is_agent_available("agent") succeeds. + let bin_dir = dir.path().join("bin"); + fs::create_dir_all(&bin_dir).unwrap(); + let agent_path = bin_dir.join("agent"); + fs::write( + &agent_path, + "#!/usr/bin/env bash\nif [[ \"$1\" == \"--version\" ]]; then echo \"agent 0.1\"; exit 0; fi\necho '{\"event\":\"ok\"}'\nexit 0\n", + ) + .unwrap(); + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let mut perms = fs::metadata(&agent_path).unwrap().permissions(); + perms.set_mode(0o755); + fs::set_permissions(&agent_path, perms).unwrap(); + } + + let output_dir = dir.path().join("out"); + let path = env::var("PATH").unwrap_or_default(); + let merged_path = format!("{}:{}", bin_dir.display(), path); + let env_vars = vec![("PATH", merged_path.as_str())]; + + let result = run_fastskill_command_with_env( + &[ + "eval", + "run", + "--agent", + "agent", + "--output-dir", + output_dir.to_str().unwrap(), + "--case", + "trace-case", + "--json", + ], + &env_vars, + Some(dir.path()), + ); + assert!( + result.success, + "Expected eval run to succeed, got stdout: {}, stderr: {}", + result.stdout, result.stderr + ); + + let json_start = result.stdout.find('{').unwrap(); + let summary: Value = serde_json::from_str(&result.stdout[json_start..]).unwrap(); + let run_dir = summary["run_dir"].as_str().unwrap(); + let trace_path = std::path::Path::new(run_dir) + .join("trace-case") + .join("trace.jsonl"); + let trace_jsonl = fs::read_to_string(&trace_path).unwrap(); + + assert!( + trace_jsonl.contains("\"type\":\"raw_json\""), + "expected persisted trace.jsonl to contain raw_json event, got: {}", + trace_jsonl + ); + + let result_path = std::path::Path::new(run_dir) + .join("trace-case") + .join("result.json"); + let case_result: Value = + serde_json::from_str(&fs::read_to_string(result_path).unwrap()).unwrap(); + assert_eq!(case_result["command_count"], 1); +} diff --git a/tests/cli/mod.rs b/tests/cli/mod.rs index 50d76de..cda8814 100644 --- a/tests/cli/mod.rs +++ b/tests/cli/mod.rs @@ -7,6 +7,7 @@ pub mod add_tests; pub mod analyze_cluster_tests; pub mod auth_e2e_tests; pub mod config_tests; +pub mod eval_tests; pub mod example_tests; pub mod help_tests; pub mod init_tests; diff --git a/tests/cli/repository_tests.rs b/tests/cli/repository_tests.rs index 820a2f6..be64a6d 100644 --- a/tests/cli/repository_tests.rs +++ b/tests/cli/repository_tests.rs @@ -85,6 +85,7 @@ web-scraper = "1.0.0" server: None, install_depth: 5, skip_transitive: false, + eval: None, }); let repos = fastskill_config.repositories.get_or_insert_with(Vec::new); diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_directory_walking.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_directory_walking.snap index b5125c6..5d169ee 100644 --- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_directory_walking.snap +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_directory_walking.snap @@ -24,6 +24,7 @@ Commands: analyze Diagnostic and analysis commands auth Manage authentication for registries disable Disable skills by ID + eval Evaluation commands for skill quality assurance init Initialize skill-project.toml in current skill directory install Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow) list List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info) diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_finds_skills_in_current_dir.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_finds_skills_in_current_dir.snap index b5125c6..5d169ee 100644 --- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_finds_skills_in_current_dir.snap +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_finds_skills_in_current_dir.snap @@ -24,6 +24,7 @@ Commands: analyze Diagnostic and analysis commands auth Manage authentication for registries disable Disable skills by ID + eval Evaluation commands for skill quality assurance init Initialize skill-project.toml in current skill directory install Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow) list List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info) diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_invalid_repositories_path.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_invalid_repositories_path.snap index b5125c6..5d169ee 100644 --- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_invalid_repositories_path.snap +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_invalid_repositories_path.snap @@ -24,6 +24,7 @@ Commands: analyze Diagnostic and analysis commands auth Manage authentication for registries disable Disable skills by ID + eval Evaluation commands for skill quality assurance init Initialize skill-project.toml in current skill directory install Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow) list List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info) diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_repositories_path_argument.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_repositories_path_argument.snap index b5125c6..5d169ee 100644 --- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_repositories_path_argument.snap +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_repositories_path_argument.snap @@ -24,6 +24,7 @@ Commands: analyze Diagnostic and analysis commands auth Manage authentication for registries disable Disable skills by ID + eval Evaluation commands for skill quality assurance init Initialize skill-project.toml in current skill directory install Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow) list List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info) diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_with_env_var.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_with_env_var.snap index b5125c6..5d169ee 100644 --- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_with_env_var.snap +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__cli_with_env_var.snap @@ -24,6 +24,7 @@ Commands: analyze Diagnostic and analysis commands auth Manage authentication for registries disable Disable skills by ID + eval Evaluation commands for skill quality assurance init Initialize skill-project.toml in current skill directory install Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow) list List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info) diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_help.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_help.snap new file mode 100644 index 0000000..5b3baaa --- /dev/null +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_help.snap @@ -0,0 +1,21 @@ +--- +source: tests/cli/snapshot_helpers.rs +expression: normalized +--- +Evaluation commands for skill quality assurance + +Usage: fastskill eval + +Commands: + validate Validate eval configuration and files + run Run eval cases against an agent + report Show a report for a completed eval run + score Re-score saved eval artifacts without running the agent again + help Print this message or the help of the given subcommand(s) + +Options: + -h, --help Print help + +Examples: + fastskill eval validate + fastskill eval run --agent codex --output-dir [TEMP_DIR] diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_report_help.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_report_help.snap new file mode 100644 index 0000000..3afaf53 --- /dev/null +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_report_help.snap @@ -0,0 +1,16 @@ +--- +source: tests/cli/snapshot_helpers.rs +expression: normalized +--- +Show a report for a completed eval run + +Usage: fastskill eval report [OPTIONS] --run-dir + +Options: + --run-dir Path to the specific run directory + --format Output format: table, json, grid, xml + --json Shorthand for --format json + -h, --help Print help + +Examples: + fastskill eval report --run-dir [TEMP_DIR] diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_run_help.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_run_help.snap new file mode 100644 index 0000000..4bf1775 --- /dev/null +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_run_help.snap @@ -0,0 +1,21 @@ +--- +source: tests/cli/snapshot_helpers.rs +expression: normalized +--- +Run eval cases against an agent + +Usage: fastskill eval run [OPTIONS] --agent --output-dir + +Options: + --agent Agent to use for execution (required) + --output-dir Output directory for artifacts (required) + --model Optional model override forwarded to the agent + --case Filter: run only the case with this ID + --tag Filter: run only cases with this tag + --format Output format: table, json, grid, xml + --json Shorthand for --format json + --no-fail Do not fail with non-zero exit code on suite failure + -h, --help Print help + +Examples: + fastskill eval run --agent codex --output-dir [TEMP_DIR] diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_score_help.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_score_help.snap new file mode 100644 index 0000000..252d919 --- /dev/null +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_score_help.snap @@ -0,0 +1,17 @@ +--- +source: tests/cli/snapshot_helpers.rs +expression: normalized +--- +Re-score saved eval artifacts without running the agent again + +Usage: fastskill eval score [OPTIONS] --run-dir + +Options: + --run-dir Path to the run directory to re-score + --format Output format: table, json, grid, xml + --json Shorthand for --format json + --no-fail Do not fail with non-zero exit code on suite failure + -h, --help Print help + +Examples: + fastskill eval score --run-dir [TEMP_DIR] diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_validate_help.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_validate_help.snap new file mode 100644 index 0000000..9f0f890 --- /dev/null +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__eval_validate_help.snap @@ -0,0 +1,17 @@ +--- +source: tests/cli/snapshot_helpers.rs +expression: normalized +--- +Validate eval configuration and files + +Usage: fastskill eval validate [OPTIONS] + +Options: + --agent Check agent availability for the specified agent key + --format Output format: table, json, grid, xml + --json Shorthand for --format json + -h, --help Print help + +Examples: + fastskill eval validate + fastskill eval validate --agent codex diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_flag.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_flag.snap index b5125c6..5d169ee 100644 --- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_flag.snap +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_flag.snap @@ -24,6 +24,7 @@ Commands: analyze Diagnostic and analysis commands auth Manage authentication for registries disable Disable skills by ID + eval Evaluation commands for skill quality assurance init Initialize skill-project.toml in current skill directory install Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow) list List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info) diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_no_args.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_no_args.snap index 89ab80e..37023f2 100644 --- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_no_args.snap +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_no_args.snap @@ -11,6 +11,7 @@ Commands: analyze Diagnostic and analysis commands auth Manage authentication for registries disable Disable skills by ID + eval Evaluation commands for skill quality assurance init Initialize skill-project.toml in current skill directory install Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow) list List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info) @@ -47,6 +48,7 @@ Commands: analyze Diagnostic and analysis commands auth Manage authentication for registries disable Disable skills by ID + eval Evaluation commands for skill quality assurance init Initialize skill-project.toml in current skill directory install Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow) list List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info) diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_short_flag.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_short_flag.snap index a8ba135..611645c 100644 --- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_short_flag.snap +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__help_short_flag.snap @@ -11,6 +11,7 @@ Commands: analyze Diagnostic and analysis commands auth Manage authentication for registries disable Disable skills by ID + eval Evaluation commands for skill quality assurance init Initialize skill-project.toml in current skill directory install Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow) list List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info) diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__serve_invalid_port.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__serve_invalid_port.snap index 7b21dee..f78dbd2 100644 --- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__serve_invalid_port.snap +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__serve_invalid_port.snap @@ -11,6 +11,7 @@ Commands: analyze Diagnostic and analysis commands auth Manage authentication for registries disable Disable skills by ID + eval Evaluation commands for skill quality assurance init Initialize skill-project.toml in current skill directory install Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow) list List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info) diff --git a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__sources_add_missing_url.snap b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__sources_add_missing_url.snap index c6bbaf4..cdf953d 100644 --- a/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__sources_add_missing_url.snap +++ b/tests/cli/snapshots/cli_tests__cli__snapshot_helpers__sources_add_missing_url.snap @@ -11,6 +11,7 @@ Commands: analyze Diagnostic and analysis commands auth Manage authentication for registries disable Disable skills by ID + eval Evaluation commands for skill quality assurance init Initialize skill-project.toml in current skill directory install Apply manifest: install skills from skill-project.toml [dependencies] (canonical command for manifest-driven workflow) list List installed skills and reconcile with skill-project.toml and skills.lock (shows name, description, and flags by default; use --details for full info) diff --git a/tests/core_manifest_tests.rs b/tests/core_manifest_tests.rs index f40d939..18f2cf3 100644 --- a/tests/core_manifest_tests.rs +++ b/tests/core_manifest_tests.rs @@ -170,6 +170,7 @@ fn test_validation_project_level() { server: None, install_depth: 5, skip_transitive: false, + eval: None, }), }), };