Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 6 additions & 1 deletion src/cli/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use fastskill::FastSkillService;
use std::sync::Arc;

use crate::cli::commands::{
add, analyze, auth, disable, init, install, list, marketplace, package, publish, read,
add, analyze, auth, disable, eval, init, install, list, marketplace, package, publish, read,
registry, reindex, remove, repos, search, serve, show, sources, sync, update, version,
Commands,
};
Expand Down Expand Up @@ -109,6 +109,10 @@ impl Cli {
return auth::execute_auth(args).await;
}

if let Some(Commands::Eval(args)) = self.command {
return eval::execute_eval(args).await;
}

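// The early returns above each match one specific variant (e.g. Auth, Eval) and only take
// ownership of its arguments in the branch that returns, so `self.command` is still intact here.
// Reaching this point means the command is `None` or a variant not handled above
// (i.e. not Init/Repository/Package/Publish/RegistryIndex, nor Auth/Eval).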
let command = self.command;
Expand Down Expand Up @@ -155,6 +159,7 @@ impl Cli {
| Some(Commands::Repos(_))
| Some(Commands::Marketplace(_))
| Some(Commands::Auth(_))
| Some(Commands::Eval(_))
| Some(Commands::Version(_)) => unreachable!("Handled above"),
None => {
// No subcommand - treat as skill ID for read command
Expand Down
62 changes: 62 additions & 0 deletions src/cli/commands/eval/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
//! Eval command group for skill quality assurance

pub mod report;
pub mod run;
pub mod score;
pub mod validate;

use crate::cli::error::CliResult;
use clap::{Args, Subcommand};

/// Eval command group
// Maintainer notes are written as plain `//` comments on purpose: this type derives
// clap's `Args`, and clap's derive macros read `///` comments as help text, so anything
// added to a doc comment here could leak into `--help` output.
//
// Argument container for the `eval` command group. It carries nothing except the
// selected subcommand; `execute_eval` matches on `command` to pick the handler.
#[derive(Debug, Args)]
#[command(
about = "Evaluation commands for skill quality assurance",
after_help = "Examples:\n fastskill eval validate\n fastskill eval run --agent codex --output-dir /tmp/evals"
)]
pub struct EvalCommand {
// The subcommand chosen on the command line (validate / run / report / score).
#[command(subcommand)]
pub command: EvalSubcommand,
}

/// Eval subcommands
// One variant per `fastskill eval <name>` subcommand. Each variant wraps the argument
// struct declared in the sibling module of the same name, and `execute_eval` forwards
// that struct to the module's `execute_*` function.
//
// Maintainer notes are plain `//` comments because clap's derive macros read `///`
// comments on variants as help text. Every variant below also sets `about` explicitly
// to the same sentence as its `///` line, so keep the two in sync when editing.
#[derive(Debug, Subcommand)]
pub enum EvalSubcommand {
/// Validate eval configuration and files
// Handled by `validate::execute_validate`.
#[command(
about = "Validate eval configuration and files",
after_help = "Examples:\n fastskill eval validate\n fastskill eval validate --agent codex"
)]
Validate(validate::ValidateArgs),

/// Run eval cases against an agent
// Handled by `run::execute_run`.
#[command(
about = "Run eval cases against an agent",
after_help = "Examples:\n fastskill eval run --agent codex --output-dir /tmp/evals"
)]
Run(run::RunArgs),

/// Show a report for a completed eval run
// Handled by `report::execute_report`.
// NOTE(review): `ReportArgs` in report.rs carries its own `#[command(about, after_help)]`
// with a longer example list than the one here; presumably only one of the two ends up
// in `eval report --help` - confirm which and drop the other.
#[command(
about = "Show a report for a completed eval run",
after_help = "Examples:\n fastskill eval report --run-dir /tmp/evals/2026-04-01T14-32-10Z"
)]
Report(report::ReportArgs),

/// Re-score saved eval artifacts without running the agent again
// Handled by `score::execute_score`.
#[command(
about = "Re-score saved eval artifacts without running the agent again",
after_help = "Examples:\n fastskill eval score --run-dir /tmp/evals/2026-04-01T14-32-10Z"
)]
Score(score::ScoreArgs),
}

/// Execute the eval command group
pub async fn execute_eval(args: EvalCommand) -> CliResult<()> {
match args.command {
EvalSubcommand::Validate(args) => validate::execute_validate(args).await,
EvalSubcommand::Run(args) => run::execute_run(args).await,
EvalSubcommand::Report(args) => report::execute_report(args).await,
EvalSubcommand::Score(args) => score::execute_score(args).await,
}
}
80 changes: 80 additions & 0 deletions src/cli/commands/eval/report.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
//! Eval report subcommand - artifact summary and formatting

use crate::cli::commands::common::validate_format_args;
use crate::cli::error::{CliError, CliResult};
use clap::Args;
use fastskill::eval::artifacts::read_summary;
use fastskill::OutputFormat;
use std::path::PathBuf;

/// Arguments for `fastskill eval report`
// Maintainer notes are plain `//` comments on purpose: clap's derive macros read `///`
// comments on this struct and its fields as help text, so the `///` lines below are
// user-facing strings and are left untouched.
//
// Consumed by `execute_report`, which resolves `format` and `json` into one output format
// via `validate_format_args` and then reads the run summary from `run_dir`.
#[derive(Debug, Args)]
#[command(
about = "Show a report for a completed eval run",
after_help = "Examples:\n fastskill eval report --run-dir /tmp/evals/2026-04-01T14-32-10Z\n fastskill eval report --run-dir ./evals/2026-04-01T14-32-10Z --json"
)]
pub struct ReportArgs {
/// Path to the specific run directory
// NOTE(review): `required = true` is presumably redundant, since clap already treats a
// non-`Option` field as required - confirm before removing.
#[arg(long, required = true)]
pub run_dir: PathBuf,

/// Output format: table, json, grid, xml (default: table)
// `None` when the flag is omitted. The "default: table" mentioned above is not applied
// here; presumably `validate_format_args` supplies it - confirm.
// `execute_report` only distinguishes JSON from everything else, so table, grid and xml
// currently all produce the same plain-text report.
#[arg(long, value_enum, help = "Output format: table, json, grid, xml")]
pub format: Option<OutputFormat>,

/// Shorthand for --format json
// Boolean flag; passed together with `format` to `validate_format_args`.
#[arg(long, help = "Shorthand for --format json")]
pub json: bool,
}

/// Execute the `eval report` command.
///
/// Loads the summary of a completed eval run from `args.run_dir` and prints it to stdout.
/// With JSON output selected (`--json` or `--format json`) the summary is printed as
/// pretty-printed JSON. Every other format prints the same plain-text report: run
/// directory, agent, optional model, overall result, pass count, and one line per case.
///
/// # Errors
///
/// Returns [`CliError::Config`] when:
/// - the run directory does not exist (`EVAL_ARTIFACTS_CORRUPT`),
/// - `read_summary` fails (`EVAL_ARTIFACTS_CORRUPT`), or
/// - the summary cannot be serialized to JSON.
///
/// An error from `validate_format_args` is propagated as-is.
pub async fn execute_report(args: ReportArgs) -> CliResult<()> {
    let format = validate_format_args(&args.format, args.json)?;
    let use_json = format == OutputFormat::Json;

    // Fail early with a message that names the directory, instead of surfacing whatever
    // lower-level error `read_summary` would produce for a missing path.
    if !args.run_dir.exists() {
        return Err(CliError::Config(format!(
            "EVAL_ARTIFACTS_CORRUPT: Run directory does not exist: {}",
            args.run_dir.display()
        )));
    }

    let summary = read_summary(&args.run_dir).map_err(|e| {
        CliError::Config(format!(
            "EVAL_ARTIFACTS_CORRUPT: Failed to read summary.json: {}",
            e
        ))
    })?;

    if use_json {
        // Report a serialization failure as an error. Falling back to an empty string
        // would print a blank line and exit successfully, which a script consuming the
        // JSON could not tell apart from a real result.
        let rendered = serde_json::to_string_pretty(&summary).map_err(|e| {
            CliError::Config(format!(
                "Failed to serialize eval summary to JSON: {}",
                e
            ))
        })?;
        println!("{}", rendered);
    } else {
        println!("Eval Report");
        println!(" run_dir: {}", args.run_dir.display());
        println!(" agent: {}", summary.agent);
        // The model line is omitted when the summary has no model recorded.
        if let Some(model) = &summary.model {
            println!(" model: {}", model);
        }
        println!(
            " result: {}",
            if summary.suite_pass {
                "PASSED"
            } else {
                "FAILED"
            }
        );
        println!(" cases: {}/{} passed", summary.passed, summary.total_cases);

        if !summary.cases.is_empty() {
            println!("\nCase Results:");
            for case in &summary.cases {
                println!(" [{}] {}", case.status, case.id);
            }
        }
    }

    Ok(())
}
Loading
Loading