diff --git a/Cargo.lock b/Cargo.lock index c5272da93..c8e223ff9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,6 +17,7 @@ name = "agent-os-execution" version = "0.1.0" dependencies = [ "agent-os-bridge", + "serde", "serde_json", "tempfile", "wat", diff --git a/crates/execution/Cargo.toml b/crates/execution/Cargo.toml index f314b1c51..9a3c6c120 100644 --- a/crates/execution/Cargo.toml +++ b/crates/execution/Cargo.toml @@ -7,6 +7,7 @@ description = "Native execution plane scaffold for Agent OS" [dependencies] agent-os-bridge = { path = "../bridge" } +serde = { version = "1.0", features = ["derive"] } serde_json = "1" [dev-dependencies] diff --git a/crates/execution/src/benchmark.rs b/crates/execution/src/benchmark.rs index b7d5abe67..f51f68a86 100644 --- a/crates/execution/src/benchmark.rs +++ b/crates/execution/src/benchmark.rs @@ -2,6 +2,7 @@ use crate::{ CreateJavascriptContextRequest, JavascriptExecutionEngine, JavascriptExecutionError, StartJavascriptExecutionRequest, }; +use serde::{Deserialize, Serialize}; use std::collections::BTreeMap; use std::env; use std::fmt; @@ -9,12 +10,18 @@ use std::fmt::Write as _; use std::fs; use std::path::{Path, PathBuf}; use std::process::Command; -use std::time::{Instant, SystemTime, UNIX_EPOCH}; +use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; const BENCHMARK_MARKER_PREFIX: &str = "__AGENT_OS_BENCH__:"; const LOCAL_GRAPH_MODULE_COUNT: usize = 24; - -#[derive(Debug, Clone, PartialEq, Eq)] +const BENCHMARK_ARTIFACT_VERSION: u32 = 5; +const BENCHMARK_ARTIFACT_DIR: &str = "target/benchmark-reports/node-import-bench"; +const BENCHMARK_RUN_STATE_FILE: &str = "run-state.json"; +const TRANSPORT_RTT_CHANNEL: &str = "execution-stdio-echo"; +const TRANSPORT_RTT_PAYLOAD_BYTES: [usize; 3] = [32, 4 * 1024, 64 * 1024]; +const TRANSPORT_POLL_TIMEOUT: Duration = Duration::from_secs(5); + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct JavascriptBenchmarkConfig { pub iterations: usize, pub 
warmup_iterations: usize, @@ -29,7 +36,7 @@ impl Default for JavascriptBenchmarkConfig { } } -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize)] pub struct BenchmarkHost { pub node_binary: String, pub node_version: String, @@ -38,18 +45,63 @@ pub struct BenchmarkHost { pub logical_cpus: usize, } -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct BenchmarkScenarioPhases { + pub context_setup_ms: T, + pub startup_ms: T, + #[serde(skip_serializing_if = "Option::is_none", default)] + pub guest_execution_ms: Option, + pub completion_ms: T, +} + +#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)] pub struct BenchmarkStats { pub mean_ms: f64, pub p50_ms: f64, pub p95_ms: f64, pub min_ms: f64, pub max_ms: f64, + pub stddev_ms: f64, +} + +#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)] +pub struct BenchmarkDistributionStats { + pub mean: f64, + pub p50: f64, + pub p95: f64, + pub min: f64, + pub max: f64, + pub stddev: f64, +} + +#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)] +pub struct BenchmarkResourceUsage { + #[serde(skip_serializing_if = "Option::is_none", default)] + pub rss_bytes: Option, + #[serde(skip_serializing_if = "Option::is_none", default)] + pub heap_used_bytes: Option, + #[serde(skip_serializing_if = "Option::is_none", default)] + pub cpu_user_us: Option, + #[serde(skip_serializing_if = "Option::is_none", default)] + pub cpu_system_us: Option, + #[serde(skip_serializing_if = "Option::is_none", default)] + pub cpu_total_us: Option, +} + +#[derive(Debug, Clone, PartialEq, Serialize)] +pub struct BenchmarkTransportRttReport { + pub channel: &'static str, + pub payload_bytes: usize, + pub samples_ms: Vec, + pub stats: BenchmarkStats, } -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Serialize)] pub struct BenchmarkScenarioReport { pub id: &'static str, + pub workload: &'static str, + 
pub runtime: &'static str, + pub mode: &'static str, pub description: &'static str, pub fixture: &'static str, pub compile_cache: &'static str, @@ -59,19 +111,86 @@ pub struct BenchmarkScenarioReport { pub guest_import_stats: Option, pub startup_overhead_samples_ms: Option>, pub startup_overhead_stats: Option, + pub phase_samples_ms: BenchmarkScenarioPhases>, + pub phase_stats: BenchmarkScenarioPhases, + #[serde(skip_serializing_if = "Option::is_none", default)] + pub resource_usage_samples: Option>>, + #[serde(skip_serializing_if = "Option::is_none", default)] + pub resource_usage_stats: Option>, } -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Serialize)] pub struct JavascriptBenchmarkReport { pub generated_at_unix_ms: u128, pub config: JavascriptBenchmarkConfig, pub host: BenchmarkHost, pub repo_root: PathBuf, + pub transport_rtt: Vec, pub scenarios: Vec, } +#[derive(Debug, Clone, PartialEq, Serialize)] +pub struct BenchmarkComparison { + pub baseline: BenchmarkComparisonBaseline, + pub summary: BenchmarkComparisonSummary, + pub scenario_deltas: Vec, + pub scenarios_missing_from_baseline: Vec, + pub baseline_only_scenarios: Vec, +} + +#[derive(Debug, Clone, PartialEq, Serialize)] +pub struct BenchmarkComparisonBaseline { + pub artifact_version: u32, + pub generated_at_unix_ms: u128, + pub path: PathBuf, +} + +#[derive(Debug, Clone, PartialEq, Serialize)] +pub struct BenchmarkComparisonSummary { + pub compared_scenario_count: usize, + #[serde(skip_serializing_if = "Option::is_none")] + pub largest_wall_improvement: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub largest_wall_regression: Option, +} + +#[derive(Debug, Clone, PartialEq, Serialize)] +pub struct BenchmarkDeltaHighlight { + pub id: String, + pub delta_ms: f64, + pub delta_pct: f64, +} + +#[derive(Debug, Clone, PartialEq, Serialize)] +pub struct BenchmarkScenarioDelta { + pub id: String, + pub description: String, + pub wall_mean_ms: BenchmarkMetricDelta, + 
#[serde(skip_serializing_if = "Option::is_none")] + pub guest_import_mean_ms: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub startup_overhead_mean_ms: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub phase_mean_ms: Option>, +} + +#[derive(Debug, Clone, PartialEq, Serialize)] +pub struct BenchmarkMetricDelta { + pub baseline_ms: f64, + pub current_ms: f64, + pub delta_ms: f64, + pub delta_pct: f64, +} + impl JavascriptBenchmarkReport { pub fn render_markdown(&self) -> String { + self.render_markdown_with_comparison(None) + } + + pub fn render_markdown_with_comparison( + &self, + comparison: Option<&BenchmarkComparison>, + ) -> String { let mut markdown = String::new(); let _ = writeln!(&mut markdown, "# Agent OS Node Import Benchmark"); let _ = writeln!(&mut markdown); @@ -103,22 +222,60 @@ impl JavascriptBenchmarkReport { self.config.iterations, self.config.warmup_iterations ); let _ = writeln!(&mut markdown); + let _ = writeln!(&mut markdown, "## Transport RTT"); + let _ = writeln!(&mut markdown); + let _ = writeln!( + &mut markdown, + "| Channel | Payload (bytes) | Mean RTT (ms) | P50 | P95 |" + ); + let _ = writeln!(&mut markdown, "| --- | ---: | ---: | ---: | ---: |"); + + for transport in &self.transport_rtt { + let _ = writeln!( + &mut markdown, + "| `{}` | {} | {} | {} | {} |", + transport.channel, + transport.payload_bytes, + format_ms(transport.stats.mean_ms), + format_ms(transport.stats.p50_ms), + format_ms(transport.stats.p95_ms), + ); + } + + let _ = writeln!(&mut markdown, "## Control Matrix"); + let _ = writeln!(&mut markdown); + + for row in self.control_matrix() { + let _ = writeln!( + &mut markdown, + "- Workload `{}`: runtimes {}, modes {}, scenarios {}", + row.workload, + format_label_list(&row.runtimes), + format_label_list(&row.modes), + format_label_list(&row.scenario_ids), + ); + } + + let _ = writeln!(&mut markdown); + let _ = writeln!(&mut markdown, "## Scenario Summary"); + let _ = writeln!(&mut 
markdown); let _ = writeln!( &mut markdown, - "| Scenario | Fixture | Cache | Mean wall (ms) | P50 | P95 | Mean import (ms) | Mean startup overhead (ms) |" + "| Scenario | Workload | Runtime | Mode | Fixture | Cache | Mean wall (ms) | Mean context (ms) | Mean startup (ms) | Mean guest exec (ms) | Mean completion (ms) | Mean startup overhead (ms) |" ); let _ = writeln!( &mut markdown, - "| --- | --- | --- | ---: | ---: | ---: | ---: | ---: |" + "| --- | --- | --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: |" ); for scenario in &self.scenarios { - let import_mean = scenario - .guest_import_stats + let guest_execution_mean = scenario + .phase_stats + .guest_execution_ms .as_ref() .map(|stats| format_ms(stats.mean_ms)) .unwrap_or_else(|| String::from("n/a")); - let startup_mean = scenario + let startup_overhead_mean = scenario .startup_overhead_stats .as_ref() .map(|stats| format_ms(stats.mean_ms)) @@ -126,19 +283,106 @@ impl JavascriptBenchmarkReport { let _ = writeln!( &mut markdown, - "| `{}` | {} | {} | {} | {} | {} | {} | {} |", + "| `{}` | `{}` | `{}` | `{}` | {} | {} | {} | {} | {} | {} | {} | {} |", scenario.id, + scenario.workload, + scenario.runtime, + scenario.mode, scenario.fixture, scenario.compile_cache, format_ms(scenario.wall_stats.mean_ms), + format_ms(scenario.phase_stats.context_setup_ms.mean_ms), + format_ms(scenario.phase_stats.startup_ms.mean_ms), + guest_execution_mean, + format_ms(scenario.phase_stats.completion_ms.mean_ms), + startup_overhead_mean, + ); + } + + let _ = writeln!(&mut markdown); + let _ = writeln!(&mut markdown, "## Stability And Resource Summary"); + let _ = writeln!(&mut markdown); + let _ = writeln!( + &mut markdown, + "| Scenario | Wall P50 (ms) | Wall min-max (ms) | Wall stddev (ms) | Mean RSS (MiB) | Mean heap (MiB) | Mean total CPU (ms) |" + ); + let _ = writeln!( + &mut markdown, + "| --- | ---: | --- | ---: | ---: | ---: | ---: |" + ); + + for scenario in &self.scenarios { + let _ = writeln!( + &mut 
markdown, + "| `{}` | {} | {}-{} | {} | {} | {} | {} |", + scenario.id, format_ms(scenario.wall_stats.p50_ms), - format_ms(scenario.wall_stats.p95_ms), - import_mean, - startup_mean, + format_ms(scenario.wall_stats.min_ms), + format_ms(scenario.wall_stats.max_ms), + format_ms(scenario.wall_stats.stddev_ms), + scenario + .resource_usage_stats + .as_ref() + .and_then(|stats| stats.rss_bytes.as_ref()) + .map(|stats| format_mib(bytes_to_mib(stats.mean))) + .unwrap_or_else(|| String::from("n/a")), + scenario + .resource_usage_stats + .as_ref() + .and_then(|stats| stats.heap_used_bytes.as_ref()) + .map(|stats| format_mib(bytes_to_mib(stats.mean))) + .unwrap_or_else(|| String::from("n/a")), + scenario + .resource_usage_stats + .as_ref() + .and_then(|stats| stats.cpu_total_us.as_ref()) + .map(|stats| format_ms(micros_to_ms(stats.mean))) + .unwrap_or_else(|| String::from("n/a")), ); } let _ = writeln!(&mut markdown); + let _ = writeln!(&mut markdown, "## Ranked Hotspots"); + let _ = writeln!(&mut markdown); + + for ranking in self.hotspot_rankings() { + let _ = writeln!( + &mut markdown, + "### {} (`{}`, `{}`)", + ranking.label, ranking.dimension, ranking.unit + ); + let _ = writeln!(&mut markdown); + let _ = writeln!( + &mut markdown, + "| Rank | Scenario | Workload | Runtime | Mode | Value |" + ); + let _ = writeln!(&mut markdown, "| ---: | --- | --- | --- | --- | ---: |"); + + for scenario in &ranking.ranked_scenarios { + let _ = writeln!( + &mut markdown, + "| {} | `{}` | `{}` | `{}` | `{}` | {} |", + scenario.rank, + scenario.id, + scenario.workload, + scenario.runtime, + scenario.mode, + format_hotspot_value(ranking.unit, scenario.value), + ); + } + + if !ranking.scenarios_without_metric.is_empty() { + let _ = writeln!(&mut markdown); + let _ = writeln!( + &mut markdown, + "Missing metric for: {}", + format_string_label_list(&ranking.scenarios_without_metric), + ); + } + + let _ = writeln!(&mut markdown); + } + let _ = writeln!(&mut markdown, "## Hotspot Guidance"); 
let _ = writeln!(&mut markdown); @@ -146,12 +390,112 @@ impl JavascriptBenchmarkReport { let _ = writeln!(&mut markdown, "- {line}"); } + if let Some(comparison) = comparison { + let _ = writeln!(&mut markdown); + let _ = writeln!(&mut markdown, "## Baseline Comparison"); + let _ = writeln!(&mut markdown); + let _ = writeln!( + &mut markdown, + "- Baseline artifact: `{}`", + comparison.baseline.path.display() + ); + let _ = writeln!( + &mut markdown, + "- Baseline generated at unix ms: `{}`", + comparison.baseline.generated_at_unix_ms + ); + let _ = writeln!( + &mut markdown, + "- Compared scenarios: `{}`", + comparison.summary.compared_scenario_count + ); + if let Some(improvement) = &comparison.summary.largest_wall_improvement { + let _ = writeln!( + &mut markdown, + "- Largest wall-time improvement: `{}` at {} ({})", + improvement.id, + format_delta_ms(improvement.delta_ms), + format_delta_pct(improvement.delta_pct), + ); + } + if let Some(regression) = &comparison.summary.largest_wall_regression { + let _ = writeln!( + &mut markdown, + "- Largest wall-time regression: `{}` at {} ({})", + regression.id, + format_delta_ms(regression.delta_ms), + format_delta_pct(regression.delta_pct), + ); + } + if !comparison.scenarios_missing_from_baseline.is_empty() { + let _ = writeln!( + &mut markdown, + "- Scenarios missing from baseline: {}", + comparison.scenarios_missing_from_baseline.join(", ") + ); + } + if !comparison.baseline_only_scenarios.is_empty() { + let _ = writeln!( + &mut markdown, + "- Baseline-only scenarios: {}", + comparison.baseline_only_scenarios.join(", ") + ); + } + let _ = writeln!(&mut markdown); + let _ = writeln!( + &mut markdown, + "| Scenario | Wall delta (ms) | Wall delta % | Import delta (ms) | Startup delta (ms) | Context delta (ms) | Completion delta (ms) |" + ); + let _ = writeln!( + &mut markdown, + "| --- | ---: | ---: | ---: | ---: | ---: | ---: |" + ); + + for scenario in &comparison.scenario_deltas { + let import_delta = scenario + 
.guest_import_mean_ms + .as_ref() + .map(|delta| format_delta_ms(delta.delta_ms)) + .unwrap_or_else(|| String::from("n/a")); + let startup_delta = scenario + .startup_overhead_mean_ms + .as_ref() + .map(|delta| format_delta_ms(delta.delta_ms)) + .unwrap_or_else(|| String::from("n/a")); + let context_delta = scenario + .phase_mean_ms + .as_ref() + .map(|delta| format_delta_ms(delta.context_setup_ms.delta_ms)) + .unwrap_or_else(|| String::from("n/a")); + let completion_delta = scenario + .phase_mean_ms + .as_ref() + .map(|delta| format_delta_ms(delta.completion_ms.delta_ms)) + .unwrap_or_else(|| String::from("n/a")); + + let _ = writeln!( + &mut markdown, + "| `{}` | {} | {} | {} | {} | {} | {} |", + scenario.id, + format_delta_ms(scenario.wall_mean_ms.delta_ms), + format_delta_pct(scenario.wall_mean_ms.delta_pct), + import_delta, + startup_delta, + context_delta, + completion_delta, + ); + } + } + let _ = writeln!(&mut markdown); let _ = writeln!(&mut markdown, "## Raw Samples"); let _ = writeln!(&mut markdown); for scenario in &self.scenarios { let _ = writeln!(&mut markdown, "### `{}`", scenario.id); + let _ = writeln!(&mut markdown, "- Workload: `{}`", scenario.workload); + let _ = writeln!(&mut markdown, "- Runtime: `{}`", scenario.runtime); + let _ = writeln!(&mut markdown, "- Mode: `{}`", scenario.mode); let _ = writeln!(&mut markdown, "- Description: {}", scenario.description); let _ = writeln!( &mut markdown, @@ -172,16 +516,113 @@ impl JavascriptBenchmarkReport { format_sample_list(samples) ); } + let _ = writeln!( + &mut markdown, + "- Context setup samples (ms): {}", + format_sample_list(&scenario.phase_samples_ms.context_setup_ms) + ); + let _ = writeln!( + &mut markdown, + "- Startup samples (ms): {}", + format_sample_list(&scenario.phase_samples_ms.startup_ms) + ); + if let Some(samples) = &scenario.phase_samples_ms.guest_execution_ms { + let _ = writeln!( + &mut markdown, + "- Guest execution samples (ms): {}", + format_sample_list(samples) + ); + } + 
let _ = writeln!( + &mut markdown, + "- Completion samples (ms): {}", + format_sample_list(&scenario.phase_samples_ms.completion_ms) + ); + if let Some(samples) = &scenario.resource_usage_samples { + if let Some(rss_samples) = &samples.rss_bytes { + let _ = writeln!( + &mut markdown, + "- RSS samples (MiB): {}", + format_scaled_sample_list(rss_samples, bytes_to_mib) + ); + } + if let Some(heap_samples) = &samples.heap_used_bytes { + let _ = writeln!( + &mut markdown, + "- Heap samples (MiB): {}", + format_scaled_sample_list(heap_samples, bytes_to_mib) + ); + } + if let Some(cpu_samples) = &samples.cpu_total_us { + let _ = writeln!( + &mut markdown, + "- Total CPU samples (ms): {}", + format_scaled_sample_list(cpu_samples, micros_to_ms) + ); + } + } let _ = writeln!(&mut markdown); } markdown } + pub fn render_json(&self) -> Result { + self.render_json_with_comparison(None) + } + + pub fn render_json_with_comparison( + &self, + comparison: Option<&BenchmarkComparison>, + ) -> Result { + serde_json::to_string_pretty(&self.json_artifact(comparison)) + } + + pub fn write_artifacts( + &self, + output_dir: &Path, + ) -> Result { + self.write_artifacts_with_comparison(output_dir, None) + } + + pub fn write_artifacts_with_comparison( + &self, + output_dir: &Path, + comparison: Option<&BenchmarkComparison>, + ) -> Result { + fs::create_dir_all(output_dir)?; + + let markdown_path = output_dir.join("report.md"); + let json_path = output_dir.join("report.json"); + write_string_atomic( + &markdown_path, + &self.render_markdown_with_comparison(comparison), + )?; + write_string_atomic(&json_path, &self.render_json_with_comparison(comparison)?)?; + + Ok(JavascriptBenchmarkArtifactPaths { + markdown_path, + json_path, + }) + } + + pub fn compare_to_baseline_path( + &self, + baseline_path: &Path, + ) -> Result { + let baseline = load_benchmark_artifact(baseline_path)?; + Ok(BenchmarkComparison::from_reports( + self, + baseline_path, + &baseline, + )) + } + fn guidance_lines(&self) 
-> Vec { let isolate = self.scenario("isolate-startup"); let cold_local = self.scenario("cold-local-import"); let warm_local = self.scenario("warm-local-import"); + let prewarmed_local = self.scenario("prewarmed-local-import"); let builtin = self.scenario("builtin-import"); let large = self.scenario("large-package-import"); @@ -190,7 +631,10 @@ impl JavascriptBenchmarkReport { if let ( Some(cold_import), Some(warm_import), - Some(warm_startup), + Some(warm_context), + Some(warm_startup_phase), + Some(warm_completion), + Some(warm_startup_overhead), Some(warm_wall), Some(isolate_wall), ) = ( @@ -200,6 +644,9 @@ impl JavascriptBenchmarkReport { warm_local .and_then(|scenario| scenario.guest_import_stats.as_ref()) .map(|stats| stats.mean_ms), + warm_local.map(|scenario| scenario.phase_stats.context_setup_ms.mean_ms), + warm_local.map(|scenario| scenario.phase_stats.startup_ms.mean_ms), + warm_local.map(|scenario| scenario.phase_stats.completion_ms.mean_ms), warm_local .and_then(|scenario| scenario.startup_overhead_stats.as_ref()) .map(|stats| stats.mean_ms), @@ -211,15 +658,45 @@ impl JavascriptBenchmarkReport { format_ms(cold_import), format_ms(warm_import), percentage_reduction(cold_import, warm_import), - format_ms(warm_startup), + format_ms(warm_startup_overhead), format_ms(isolate_wall), )); if warm_wall > 0.0 { guidance.push(format!( "Warm local imports still spend {:.1}% of wall time in process startup, wrapper evaluation, and stdio handling instead of guest import work. Optimizations that only touch module compilation will not remove that floor.", - percentage_share(warm_startup, warm_wall), + percentage_share(warm_startup_overhead, warm_wall), )); } + let warm_guest = warm_local + .and_then(|scenario| scenario.phase_stats.guest_execution_ms.as_ref()) + .map(|stats| stats.mean_ms) + .unwrap_or(0.0); + guidance.push(format!( + "The warm path phase split is {} context setup, {} runtime startup, {} guest execution, and {} completion/stdio work. 
Future attribution can now separate bootstrap wins from pure transport/collection wins instead of treating them as one startup bucket.", + format_ms(warm_context), + format_ms(warm_startup_phase), + format_ms(warm_guest), + format_ms(warm_completion), + )); + } + + if let (Some(warm_startup_overhead), Some(prewarmed_startup_overhead), Some(isolate_wall)) = ( + warm_local + .and_then(|scenario| scenario.startup_overhead_stats.as_ref()) + .map(|stats| stats.mean_ms), + prewarmed_local + .and_then(|scenario| scenario.startup_overhead_stats.as_ref()) + .map(|stats| stats.mean_ms), + isolate.map(|scenario| scenario.wall_stats.mean_ms), + ) { + guidance.push(format!( + "Keeping the current import-cache materialization and builtin/polyfill prewarm alive inside one execution engine cuts warm local startup overhead from {} to {} ({:.1}% faster). The remaining {} of non-import work is the post-prewarm floor that broader warm-pool/snapshot work would still need to attack above the `{}` empty-isolate baseline.", + format_ms(warm_startup_overhead), + format_ms(prewarmed_startup_overhead), + percentage_reduction(warm_startup_overhead, prewarmed_startup_overhead), + format_ms(prewarmed_startup_overhead), + format_ms(isolate_wall), + )); } if let (Some(builtin_import), Some(large_import)) = ( @@ -238,6 +715,59 @@ impl JavascriptBenchmarkReport { )); } + if let (Some(smallest), Some(largest)) = + (self.transport_rtt.first(), self.transport_rtt.last()) + { + guidance.push(format!( + "Execution-transport RTT over the stdio bridge rises from {} at {} bytes to {} at {} bytes. 
That gives later work a direct transport floor to compare against the larger startup and import phases.", + format_ms(smallest.stats.mean_ms), + smallest.payload_bytes, + format_ms(largest.stats.mean_ms), + largest.payload_bytes, + )); + } + + if let Some(noisiest) = self.scenarios.iter().max_by(|lhs, rhs| { + lhs.wall_stats + .stddev_ms + .total_cmp(&rhs.wall_stats.stddev_ms) + }) { + guidance.push(format!( + "Wall-time noise is now surfaced directly in the same artifact set: `{}` currently shows the largest spread at {} stddev over a {}-{} wall range, so future deltas on that path should be judged against stability as well as mean time.", + noisiest.id, + format_ms(noisiest.wall_stats.stddev_ms), + format_ms(noisiest.wall_stats.min_ms), + format_ms(noisiest.wall_stats.max_ms), + )); + } + + if let Some(heaviest) = self.scenarios.iter().max_by(|lhs, rhs| { + lhs.resource_usage_stats + .as_ref() + .and_then(|stats| stats.rss_bytes.as_ref()) + .map(|stats| stats.mean) + .unwrap_or(f64::NEG_INFINITY) + .total_cmp( + &rhs.resource_usage_stats + .as_ref() + .and_then(|stats| stats.rss_bytes.as_ref()) + .map(|stats| stats.mean) + .unwrap_or(f64::NEG_INFINITY), + ) + }) { + if let Some(rss_mean) = heaviest + .resource_usage_stats + .as_ref() + .and_then(|stats| stats.rss_bytes.as_ref()) + { + guidance.push(format!( + "Per-scenario resource reporting is now attached to the benchmark rows themselves: `{}` currently has the highest mean RSS at {} MiB, so import-path changes can now be judged for memory regressions without a separate memory-only pass.", + heaviest.id, + format_mib(bytes_to_mib(rss_mean.mean)), + )); + } + } + guidance.push(String::from( "No new PRD stories were added from this run. 
The measured hotspots already map cleanly onto existing follow-ons: `ARC-021C` for safe resolution and metadata caches, `ARC-021D` for builtin/polyfill prewarm, and `ARC-022` for broader warm-pool and timing-mitigation execution work.", )); @@ -248,84 +778,703 @@ impl JavascriptBenchmarkReport { fn scenario(&self, id: &str) -> Option<&BenchmarkScenarioReport> { self.scenarios.iter().find(|scenario| scenario.id == id) } -} - -#[derive(Debug)] -pub enum JavascriptBenchmarkError { - InvalidConfig(&'static str), - InvalidWorkspaceRoot(PathBuf), - Io(std::io::Error), - Utf8(std::string::FromUtf8Error), - Execution(JavascriptExecutionError), - NodeVersion(std::io::Error), - MissingBenchmarkMetric(&'static str), - InvalidBenchmarkMetric { - scenario: &'static str, - raw_value: String, - }, - NonZeroExit { - scenario: &'static str, - exit_code: i32, - stderr: String, - }, -} -impl fmt::Display for JavascriptBenchmarkError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Self::InvalidConfig(message) => write!(f, "invalid benchmark config: {message}"), - Self::InvalidWorkspaceRoot(path) => { - write!( - f, - "failed to resolve workspace root from execution crate path: {}", - path.display() - ) - } - Self::Io(err) => write!(f, "benchmark I/O failure: {err}"), - Self::Utf8(err) => write!(f, "benchmark output was not valid UTF-8: {err}"), - Self::Execution(err) => write!(f, "benchmark execution failed: {err}"), - Self::NodeVersion(err) => write!(f, "failed to query node version: {err}"), - Self::MissingBenchmarkMetric(scenario) => { - write!( - f, - "benchmark scenario `{scenario}` did not emit a metric marker" - ) - } - Self::InvalidBenchmarkMetric { - scenario, - raw_value, - } => write!( - f, - "benchmark scenario `{scenario}` emitted an invalid metric: {raw_value}" - ), - Self::NonZeroExit { - scenario, - exit_code, - stderr, - } => write!( - f, - "benchmark scenario `{scenario}` exited with code {exit_code}: {stderr}" + fn json_artifact<'a>( 
+ &'a self, + comparison: Option<&'a BenchmarkComparison>, + ) -> JavascriptBenchmarkArtifact<'a> { + JavascriptBenchmarkArtifact { + artifact_version: BENCHMARK_ARTIFACT_VERSION, + generated_at_unix_ms: self.generated_at_unix_ms, + command: format!( + "cargo run -p agent-os-execution --bin node-import-bench -- --iterations {} --warmup-iterations {}", + self.config.iterations, self.config.warmup_iterations ), + config: &self.config, + host: &self.host, + repo_root: &self.repo_root, + summary: self.summary(), + comparison, + transport_rtt: self + .transport_rtt + .iter() + .map(|transport| BenchmarkTransportRttArtifact { + channel: transport.channel, + payload_bytes: transport.payload_bytes, + samples_ms: &transport.samples_ms, + stats: &transport.stats, + }) + .collect(), + scenarios: self + .scenarios + .iter() + .map(|scenario| BenchmarkScenarioArtifact { + id: scenario.id, + workload: scenario.workload, + runtime: scenario.runtime, + mode: scenario.mode, + description: scenario.description, + fixture: scenario.fixture, + compile_cache: scenario.compile_cache, + wall_samples_ms: &scenario.wall_samples_ms, + wall_stats: &scenario.wall_stats, + guest_import_samples_ms: scenario.guest_import_samples_ms.as_deref(), + guest_import_stats: scenario.guest_import_stats.as_ref(), + startup_overhead_samples_ms: scenario.startup_overhead_samples_ms.as_deref(), + startup_overhead_stats: scenario.startup_overhead_stats.as_ref(), + mean_startup_share_pct: scenario.mean_startup_share_pct(), + phase_samples_ms: &scenario.phase_samples_ms, + phase_stats: &scenario.phase_stats, + resource_usage_samples: scenario.resource_usage_samples.as_ref(), + resource_usage_stats: scenario.resource_usage_stats.as_ref(), + }) + .collect(), } } -} - -impl std::error::Error for JavascriptBenchmarkError {} -impl From for JavascriptBenchmarkError { - fn from(err: std::io::Error) -> Self { - Self::Io(err) + fn summary(&self) -> BenchmarkSummaryArtifact<'_> { + BenchmarkSummaryArtifact { + 
scenario_count: self.scenarios.len(), + recorded_samples_per_scenario: self.config.iterations, + warmup_iterations: self.config.warmup_iterations, + control_matrix: self.control_matrix(), + slowest_wall_scenario: self.slowest_scenario_by(|scenario| scenario.wall_stats.mean_ms), + slowest_guest_import_scenario: self.slowest_scenario_by(|scenario| { + scenario + .guest_import_stats + .as_ref() + .map(|stats| stats.mean_ms) + .unwrap_or(f64::NEG_INFINITY) + }), + highest_startup_share_scenario: self.scenarios.iter().max_by(|lhs, rhs| { + lhs.mean_startup_share_pct() + .unwrap_or(f64::NEG_INFINITY) + .total_cmp(&rhs.mean_startup_share_pct().unwrap_or(f64::NEG_INFINITY)) + }), + hotspot_rankings: self.hotspot_rankings(), + guidance_lines: self.guidance_lines(), + } } -} -impl From for JavascriptBenchmarkError { - fn from(err: std::string::FromUtf8Error) -> Self { - Self::Utf8(err) - } -} + fn control_matrix(&self) -> Vec> { + let mut rows = Vec::new(); + let mut row_indexes = BTreeMap::new(); -impl From for JavascriptBenchmarkError { - fn from(err: JavascriptExecutionError) -> Self { + for scenario in &self.scenarios { + let row_index = *row_indexes.entry(scenario.workload).or_insert_with(|| { + rows.push(BenchmarkControlMatrixArtifact { + workload: scenario.workload, + runtimes: Vec::new(), + modes: Vec::new(), + scenario_ids: Vec::new(), + }); + rows.len() - 1 + }); + let row = &mut rows[row_index]; + push_unique_label(&mut row.runtimes, scenario.runtime); + push_unique_label(&mut row.modes, scenario.mode); + row.scenario_ids.push(scenario.id); + } + + rows + } + + fn slowest_scenario_by( + &self, + value: impl Fn(&BenchmarkScenarioReport) -> f64, + ) -> Option<&BenchmarkScenarioReport> { + self.scenarios + .iter() + .max_by(|lhs, rhs| value(lhs).total_cmp(&value(rhs))) + } + + fn hotspot_rankings(&self) -> Vec> { + HOTSPOT_METRICS + .iter() + .map(|metric| { + let mut ranked_scenarios = self + .scenarios + .iter() + .filter_map(|scenario| { + 
(metric.value)(scenario).map(|value| BenchmarkHotspotScenarioArtifact { + rank: 0, + id: scenario.id, + workload: scenario.workload, + runtime: scenario.runtime, + mode: scenario.mode, + value, + }) + }) + .collect::>(); + ranked_scenarios.sort_by(|lhs, rhs| { + rhs.value + .total_cmp(&lhs.value) + .then_with(|| lhs.id.cmp(rhs.id)) + }); + for (index, scenario) in ranked_scenarios.iter_mut().enumerate() { + scenario.rank = index + 1; + } + + BenchmarkHotspotRankingArtifact { + metric: metric.metric, + label: metric.label, + dimension: metric.dimension, + unit: metric.unit, + ranked_scenarios, + scenarios_without_metric: self + .scenarios + .iter() + .filter(|scenario| (metric.value)(scenario).is_none()) + .map(|scenario| scenario.id) + .collect(), + } + }) + .collect() + } +} + +impl BenchmarkScenarioReport { + fn mean_startup_share_pct(&self) -> Option { + let startup_mean = self.startup_overhead_stats.as_ref()?.mean_ms; + let wall_mean = self.wall_stats.mean_ms; + if wall_mean <= 0.0 { + Some(0.0) + } else { + Some((startup_mean / wall_mean) * 100.0) + } + } + + fn wall_range_ms(&self) -> f64 { + self.wall_stats.max_ms - self.wall_stats.min_ms + } +} + +impl BenchmarkResourceUsage> { + fn push_sample(&mut self, sample: &BenchmarkResourceUsage) { + push_optional_sample(&mut self.rss_bytes, sample.rss_bytes); + push_optional_sample(&mut self.heap_used_bytes, sample.heap_used_bytes); + push_optional_sample(&mut self.cpu_user_us, sample.cpu_user_us); + push_optional_sample(&mut self.cpu_system_us, sample.cpu_system_us); + push_optional_sample(&mut self.cpu_total_us, sample.cpu_total_us); + } + + fn into_populated(self) -> Option { + (!self.is_empty()).then_some(self) + } +} + +impl BenchmarkResourceUsage { + fn is_empty(&self) -> bool { + self.rss_bytes.is_none() + && self.heap_used_bytes.is_none() + && self.cpu_user_us.is_none() + && self.cpu_system_us.is_none() + && self.cpu_total_us.is_none() + } +} + +impl BenchmarkComparison { + fn from_reports( + current: 
&JavascriptBenchmarkReport, + baseline_path: &Path, + baseline: &StoredBenchmarkArtifact, + ) -> Self { + let baseline_path = + fs::canonicalize(baseline_path).unwrap_or_else(|_| baseline_path.to_path_buf()); + let baseline_by_id = baseline + .scenarios + .iter() + .map(|scenario| (scenario.id.as_str(), scenario)) + .collect::>(); + + let mut scenario_deltas = Vec::new(); + let mut scenarios_missing_from_baseline = Vec::new(); + + for scenario in ¤t.scenarios { + if let Some(baseline_scenario) = baseline_by_id.get(scenario.id) { + scenario_deltas.push(BenchmarkScenarioDelta { + id: scenario.id.to_owned(), + description: scenario.description.to_owned(), + wall_mean_ms: BenchmarkMetricDelta::from_means( + baseline_scenario.wall_stats.mean_ms, + scenario.wall_stats.mean_ms, + ), + guest_import_mean_ms: match ( + baseline_scenario.guest_import_stats.as_ref(), + scenario.guest_import_stats.as_ref(), + ) { + (Some(baseline_stats), Some(current_stats)) => { + Some(BenchmarkMetricDelta::from_means( + baseline_stats.mean_ms, + current_stats.mean_ms, + )) + } + _ => None, + }, + startup_overhead_mean_ms: match ( + baseline_scenario.startup_overhead_stats.as_ref(), + scenario.startup_overhead_stats.as_ref(), + ) { + (Some(baseline_stats), Some(current_stats)) => { + Some(BenchmarkMetricDelta::from_means( + baseline_stats.mean_ms, + current_stats.mean_ms, + )) + } + _ => None, + }, + phase_mean_ms: match ( + baseline_scenario.phase_stats.as_ref(), + Some(&scenario.phase_stats), + ) { + (Some(baseline_phase), Some(current_phase)) => { + Some(BenchmarkScenarioPhases { + context_setup_ms: BenchmarkMetricDelta::from_means( + baseline_phase.context_setup_ms.mean_ms, + current_phase.context_setup_ms.mean_ms, + ), + startup_ms: BenchmarkMetricDelta::from_means( + baseline_phase.startup_ms.mean_ms, + current_phase.startup_ms.mean_ms, + ), + guest_execution_ms: match ( + baseline_phase.guest_execution_ms.as_ref(), + current_phase.guest_execution_ms.as_ref(), + ) { + 
(Some(baseline_stats), Some(current_stats)) => { + Some(BenchmarkMetricDelta::from_means( + baseline_stats.mean_ms, + current_stats.mean_ms, + )) + } + _ => None, + }, + completion_ms: BenchmarkMetricDelta::from_means( + baseline_phase.completion_ms.mean_ms, + current_phase.completion_ms.mean_ms, + ), + }) + } + _ => None, + }, + }); + } else { + scenarios_missing_from_baseline.push(scenario.id.to_owned()); + } + } + + let current_ids = current + .scenarios + .iter() + .map(|scenario| (scenario.id, ())) + .collect::>(); + let baseline_only_scenarios = baseline + .scenarios + .iter() + .filter_map(|scenario| { + (!current_ids.contains_key(scenario.id.as_str())).then(|| scenario.id.clone()) + }) + .collect::>(); + + let largest_wall_improvement = scenario_deltas + .iter() + .filter(|scenario| scenario.wall_mean_ms.delta_ms < 0.0) + .min_by(|lhs, rhs| { + lhs.wall_mean_ms + .delta_ms + .total_cmp(&rhs.wall_mean_ms.delta_ms) + }) + .map(BenchmarkDeltaHighlight::from_wall_delta); + let largest_wall_regression = scenario_deltas + .iter() + .filter(|scenario| scenario.wall_mean_ms.delta_ms > 0.0) + .max_by(|lhs, rhs| { + lhs.wall_mean_ms + .delta_ms + .total_cmp(&rhs.wall_mean_ms.delta_ms) + }) + .map(BenchmarkDeltaHighlight::from_wall_delta); + + Self { + baseline: BenchmarkComparisonBaseline { + artifact_version: baseline.artifact_version, + generated_at_unix_ms: baseline.generated_at_unix_ms, + path: baseline_path, + }, + summary: BenchmarkComparisonSummary { + compared_scenario_count: scenario_deltas.len(), + largest_wall_improvement, + largest_wall_regression, + }, + scenario_deltas, + scenarios_missing_from_baseline, + baseline_only_scenarios, + } + } +} + +impl BenchmarkDeltaHighlight { + fn from_wall_delta(delta: &BenchmarkScenarioDelta) -> Self { + Self { + id: delta.id.clone(), + delta_ms: delta.wall_mean_ms.delta_ms, + delta_pct: delta.wall_mean_ms.delta_pct, + } + } +} + +impl BenchmarkMetricDelta { + fn from_means(baseline_ms: f64, current_ms: f64) -> Self { 
+ let delta_ms = current_ms - baseline_ms; + let delta_pct = if baseline_ms <= 0.0 { + 0.0 + } else { + (delta_ms / baseline_ms) * 100.0 + }; + + Self { + baseline_ms, + current_ms, + delta_ms, + delta_pct, + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct JavascriptBenchmarkArtifactPaths { + pub markdown_path: PathBuf, + pub json_path: PathBuf, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct JavascriptBenchmarkRunOutput { + pub artifact_paths: JavascriptBenchmarkArtifactPaths, + pub resumed_stage_count: usize, +} + +#[derive(Debug, Serialize)] +struct JavascriptBenchmarkArtifact<'a> { + artifact_version: u32, + generated_at_unix_ms: u128, + command: String, + config: &'a JavascriptBenchmarkConfig, + host: &'a BenchmarkHost, + repo_root: &'a Path, + summary: BenchmarkSummaryArtifact<'a>, + #[serde(skip_serializing_if = "Option::is_none")] + comparison: Option<&'a BenchmarkComparison>, + transport_rtt: Vec>, + scenarios: Vec>, +} + +#[derive(Debug, Serialize)] +struct BenchmarkSummaryArtifact<'a> { + scenario_count: usize, + recorded_samples_per_scenario: usize, + warmup_iterations: usize, + control_matrix: Vec>, + #[serde(skip_serializing_if = "Option::is_none")] + slowest_wall_scenario: Option<&'a BenchmarkScenarioReport>, + #[serde(skip_serializing_if = "Option::is_none")] + slowest_guest_import_scenario: Option<&'a BenchmarkScenarioReport>, + #[serde(skip_serializing_if = "Option::is_none")] + highest_startup_share_scenario: Option<&'a BenchmarkScenarioReport>, + hotspot_rankings: Vec>, + guidance_lines: Vec, +} + +#[derive(Debug, Serialize)] +struct BenchmarkScenarioArtifact<'a> { + id: &'static str, + workload: &'static str, + runtime: &'static str, + mode: &'static str, + description: &'static str, + fixture: &'static str, + compile_cache: &'static str, + wall_samples_ms: &'a [f64], + wall_stats: &'a BenchmarkStats, + #[serde(skip_serializing_if = "Option::is_none")] + guest_import_samples_ms: Option<&'a [f64]>, + 
#[serde(skip_serializing_if = "Option::is_none")] + guest_import_stats: Option<&'a BenchmarkStats>, + #[serde(skip_serializing_if = "Option::is_none")] + startup_overhead_samples_ms: Option<&'a [f64]>, + #[serde(skip_serializing_if = "Option::is_none")] + startup_overhead_stats: Option<&'a BenchmarkStats>, + #[serde(skip_serializing_if = "Option::is_none")] + mean_startup_share_pct: Option, + phase_samples_ms: &'a BenchmarkScenarioPhases>, + phase_stats: &'a BenchmarkScenarioPhases, + #[serde(skip_serializing_if = "Option::is_none")] + resource_usage_samples: Option<&'a BenchmarkResourceUsage>>, + #[serde(skip_serializing_if = "Option::is_none")] + resource_usage_stats: Option<&'a BenchmarkResourceUsage>, +} + +#[derive(Debug, Serialize)] +struct BenchmarkControlMatrixArtifact<'a> { + workload: &'a str, + runtimes: Vec<&'a str>, + modes: Vec<&'a str>, + scenario_ids: Vec<&'a str>, +} + +#[derive(Debug, Serialize)] +struct BenchmarkTransportRttArtifact<'a> { + channel: &'static str, + payload_bytes: usize, + samples_ms: &'a [f64], + stats: &'a BenchmarkStats, +} + +#[derive(Debug, Serialize)] +struct BenchmarkHotspotRankingArtifact<'a> { + metric: &'static str, + label: &'static str, + dimension: &'static str, + unit: &'static str, + ranked_scenarios: Vec>, + #[serde(skip_serializing_if = "Vec::is_empty")] + scenarios_without_metric: Vec<&'a str>, +} + +#[derive(Debug, Serialize)] +struct BenchmarkHotspotScenarioArtifact<'a> { + rank: usize, + id: &'a str, + workload: &'a str, + runtime: &'a str, + mode: &'a str, + value: f64, +} + +struct HotspotMetricDefinition { + metric: &'static str, + label: &'static str, + dimension: &'static str, + unit: &'static str, + value: fn(&BenchmarkScenarioReport) -> Option, +} + +const HOTSPOT_METRICS: [HotspotMetricDefinition; 13] = [ + HotspotMetricDefinition { + metric: "wall_mean_ms", + label: "Wall Time", + dimension: "time", + unit: "ms", + value: hotspot_wall_mean_ms, + }, + HotspotMetricDefinition { + metric: 
"wall_stddev_ms", + label: "Wall Time Stddev", + dimension: "stability", + unit: "ms", + value: hotspot_wall_stddev_ms, + }, + HotspotMetricDefinition { + metric: "wall_range_ms", + label: "Wall Time Range", + dimension: "stability", + unit: "ms", + value: hotspot_wall_range_ms, + }, + HotspotMetricDefinition { + metric: "guest_import_mean_ms", + label: "Guest Import Time", + dimension: "time", + unit: "ms", + value: hotspot_guest_import_mean_ms, + }, + HotspotMetricDefinition { + metric: "startup_overhead_mean_ms", + label: "Startup Overhead", + dimension: "time", + unit: "ms", + value: hotspot_startup_overhead_mean_ms, + }, + HotspotMetricDefinition { + metric: "context_setup_mean_ms", + label: "Context Setup Phase", + dimension: "time", + unit: "ms", + value: hotspot_context_setup_mean_ms, + }, + HotspotMetricDefinition { + metric: "startup_phase_mean_ms", + label: "Runtime Startup Phase", + dimension: "time", + unit: "ms", + value: hotspot_startup_phase_mean_ms, + }, + HotspotMetricDefinition { + metric: "guest_execution_mean_ms", + label: "Guest Execution Phase", + dimension: "time", + unit: "ms", + value: hotspot_guest_execution_mean_ms, + }, + HotspotMetricDefinition { + metric: "completion_mean_ms", + label: "Completion/Stdio Phase", + dimension: "time", + unit: "ms", + value: hotspot_completion_mean_ms, + }, + HotspotMetricDefinition { + metric: "startup_share_pct", + label: "Startup Share Of Wall", + dimension: "share", + unit: "pct", + value: hotspot_startup_share_pct, + }, + HotspotMetricDefinition { + metric: "rss_mean_mib", + label: "RSS", + dimension: "memory", + unit: "MiB", + value: hotspot_rss_mean_mib, + }, + HotspotMetricDefinition { + metric: "heap_mean_mib", + label: "Heap Used", + dimension: "memory", + unit: "MiB", + value: hotspot_heap_mean_mib, + }, + HotspotMetricDefinition { + metric: "cpu_total_mean_ms", + label: "Total CPU", + dimension: "cpu", + unit: "ms", + value: hotspot_total_cpu_mean_ms, + }, +]; + +#[derive(Debug)] +pub enum 
JavascriptBenchmarkError { + InvalidConfig(&'static str), + InvalidWorkspaceRoot(PathBuf), + InvalidBaselineReport { + path: PathBuf, + message: String, + }, + Io(std::io::Error), + Utf8(std::string::FromUtf8Error), + Execution(JavascriptExecutionError), + NodeVersion(std::io::Error), + MissingBenchmarkMetric(&'static str), + InvalidBenchmarkMetric { + scenario: &'static str, + raw_value: String, + }, + TransportProbeTimeout { + payload_bytes: usize, + }, + TransportProbeExited { + exit_code: i32, + stderr: String, + }, + InvalidTransportProbeResponse { + payload_bytes: usize, + expected: String, + actual: String, + }, + NonZeroExit { + scenario: &'static str, + exit_code: i32, + stderr: String, + }, +} + +impl fmt::Display for JavascriptBenchmarkError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::InvalidConfig(message) => write!(f, "invalid benchmark config: {message}"), + Self::InvalidWorkspaceRoot(path) => { + write!( + f, + "failed to resolve workspace root from execution crate path: {}", + path.display() + ) + } + Self::InvalidBaselineReport { path, message } => { + write!( + f, + "failed to parse benchmark baseline artifact {}: {message}", + path.display() + ) + } + Self::Io(err) => write!(f, "benchmark I/O failure: {err}"), + Self::Utf8(err) => write!(f, "benchmark output was not valid UTF-8: {err}"), + Self::Execution(err) => write!(f, "benchmark execution failed: {err}"), + Self::NodeVersion(err) => write!(f, "failed to query node version: {err}"), + Self::MissingBenchmarkMetric(scenario) => { + write!( + f, + "benchmark scenario `{scenario}` did not emit a metric marker" + ) + } + Self::InvalidBenchmarkMetric { + scenario, + raw_value, + } => write!( + f, + "benchmark scenario `{scenario}` emitted an invalid metric: {raw_value}" + ), + Self::TransportProbeTimeout { payload_bytes } => { + write!( + f, + "transport probe timed out waiting for {payload_bytes}-byte round-trip" + ) + } + Self::TransportProbeExited { 
exit_code, stderr } => write!( + f, + "transport probe exited with code {exit_code}: {stderr}" + ), + Self::InvalidTransportProbeResponse { + payload_bytes, + expected, + actual, + } => write!( + f, + "transport probe returned unexpected payload for {payload_bytes}-byte round-trip: expected {expected:?}, got {actual:?}" + ), + Self::NonZeroExit { + scenario, + exit_code, + stderr, + } => write!( + f, + "benchmark scenario `{scenario}` exited with code {exit_code}: {stderr}" + ), + } + } +} + +impl std::error::Error for JavascriptBenchmarkError {} + +impl From for JavascriptBenchmarkError { + fn from(err: std::io::Error) -> Self { + Self::Io(err) + } +} + +impl From for JavascriptBenchmarkError { + fn from(err: std::string::FromUtf8Error) -> Self { + Self::Utf8(err) + } +} + +impl From for JavascriptBenchmarkError { + fn from(err: serde_json::Error) -> Self { + Self::Io(std::io::Error::new(std::io::ErrorKind::InvalidData, err)) + } +} + +impl From for JavascriptBenchmarkError { + fn from(err: JavascriptExecutionError) -> Self { Self::Execution(err) } } @@ -342,6 +1491,7 @@ pub fn run_javascript_benchmarks( let repo_root = workspace_root()?; let host = benchmark_host()?; let workspace = BenchmarkWorkspace::create(&repo_root)?; + let transport_rtt = measure_transport_rtt(&workspace, config)?; let mut scenarios = Vec::new(); @@ -357,18 +1507,100 @@ pub fn run_javascript_benchmarks( config: config.clone(), host, repo_root, + transport_rtt, scenarios, }) } -#[derive(Debug)] +fn benchmark_artifact_dir(repo_root: &Path) -> PathBuf { + repo_root.join(BENCHMARK_ARTIFACT_DIR) +} + +fn benchmark_run_state_path(artifact_dir: &Path) -> PathBuf { + artifact_dir.join(BENCHMARK_RUN_STATE_FILE) +} + +fn load_benchmark_run_state( + state_path: &Path, + config: &JavascriptBenchmarkConfig, + host: &BenchmarkHost, + repo_root: &Path, + definitions: &[ScenarioDefinition], +) -> Result { + match fs::read_to_string(state_path) { + Ok(raw) => match serde_json::from_str::(&raw) { + Ok(state) 
if state.is_compatible(config, host, repo_root) => { + Ok(state.sanitized(definitions)) + } + Ok(_) | Err(_) => Ok(StoredBenchmarkRunState::new(config, host, repo_root)), + }, + Err(err) if err.kind() == std::io::ErrorKind::NotFound => { + Ok(StoredBenchmarkRunState::new(config, host, repo_root)) + } + Err(err) => Err(JavascriptBenchmarkError::Io(err)), + } +} + +fn persist_benchmark_run_state( + state_path: &Path, + state: &StoredBenchmarkRunState, +) -> Result<(), JavascriptBenchmarkError> { + write_string_atomic(state_path, &serde_json::to_string_pretty(state)?) +} + +fn write_string_atomic(path: &Path, contents: &str) -> Result<(), JavascriptBenchmarkError> { + if let Some(parent) = path.parent() { + fs::create_dir_all(parent)?; + } + + let temp_path = path.with_file_name(format!( + ".{}.tmp-{}-{}", + path.file_name() + .and_then(|name| name.to_str()) + .unwrap_or("artifact"), + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() + )); + fs::write(&temp_path, contents)?; + if let Err(err) = fs::rename(&temp_path, path) { + let _ = fs::remove_file(&temp_path); + return Err(JavascriptBenchmarkError::Io(err)); + } + + Ok(()) +} + +fn remove_file_if_exists(path: &Path) -> Result<(), JavascriptBenchmarkError> { + match fs::remove_file(path) { + Ok(()) => Ok(()), + Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(()), + Err(err) => Err(JavascriptBenchmarkError::Io(err)), + } +} + +fn current_unix_ms() -> u128 { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_millis() +} + +#[derive(Debug, Clone, Copy)] struct ScenarioDefinition { id: &'static str, + workload: &'static str, + runtime: ScenarioRuntime, + mode: ScenarioMode, description: &'static str, fixture: &'static str, entrypoint: &'static str, compile_cache: CompileCacheStrategy, + engine_reuse: EngineReuseStrategy, expect_import_metric: bool, + env: ScenarioEnvironment, } #[derive(Debug, Clone, Copy, PartialEq, Eq)] 
@@ -386,15 +1618,139 @@ impl CompileCacheStrategy { } } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum EngineReuseStrategy { + FreshPerSample, + SharedAcrossScenario, + SharedContextAcrossScenario, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum ScenarioEnvironment { + None, + ProjectedWorkspaceNodeModules, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum ScenarioRuntime { + NativeExecution, + HostNode, +} + +impl ScenarioRuntime { + fn label(self) -> &'static str { + match self { + Self::NativeExecution => "native-execution", + Self::HostNode => "host-node", + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum ScenarioMode { + BaselineControl, + TrueColdStart, + NewSessionReplay, + SameSessionReplay, + SameEngineReplay, + HostControl, +} + +impl ScenarioMode { + fn label(self) -> &'static str { + match self { + Self::BaselineControl => "baseline-control", + Self::TrueColdStart => "true-cold-start", + Self::NewSessionReplay => "new-session-replay", + Self::SameSessionReplay => "same-session-replay", + Self::SameEngineReplay => "same-engine-replay", + Self::HostControl => "host-control", + } + } +} + #[derive(Debug)] struct SampleMeasurement { wall_ms: f64, guest_import_ms: Option, + context_setup_ms: f64, + startup_ms: f64, + completion_ms: f64, + resource_usage: Option>, } #[derive(Debug)] struct BenchmarkWorkspace { root: PathBuf, + repo_root: PathBuf, +} + +#[derive(Debug, Deserialize)] +struct StoredBenchmarkArtifact { + artifact_version: u32, + generated_at_unix_ms: u128, + scenarios: Vec, +} + +#[derive(Debug, Deserialize)] +struct StoredBenchmarkScenario { + id: String, + wall_stats: BenchmarkStats, + #[serde(default)] + guest_import_stats: Option, + #[serde(default)] + startup_overhead_stats: Option, + #[serde(default)] + phase_stats: Option>, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +struct StoredBenchmarkRunHost { + node_binary: String, + node_version: String, + os: String, + arch: String, + 
logical_cpus: usize, +} + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +struct StoredBenchmarkRunState { + artifact_version: u32, + config: JavascriptBenchmarkConfig, + host: StoredBenchmarkRunHost, + repo_root: PathBuf, + #[serde(default)] + transport_rtt: Option>, + #[serde(default)] + scenarios: Vec, +} + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +struct StoredBenchmarkTransportRttReport { + payload_bytes: usize, + samples_ms: Vec, + stats: BenchmarkStats, +} + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +struct StoredBenchmarkScenarioReport { + id: String, + wall_samples_ms: Vec, + wall_stats: BenchmarkStats, + #[serde(default)] + guest_import_samples_ms: Option>, + #[serde(default)] + guest_import_stats: Option, + #[serde(default)] + startup_overhead_samples_ms: Option>, + #[serde(default)] + startup_overhead_stats: Option, + phase_samples_ms: BenchmarkScenarioPhases>, + phase_stats: BenchmarkScenarioPhases, + #[serde(default)] + resource_usage_samples: Option>>, + #[serde(default)] + resource_usage_stats: Option>, } impl BenchmarkWorkspace { @@ -409,7 +1765,10 @@ impl BenchmarkWorkspace { )); fs::create_dir_all(&root)?; write_benchmark_workspace(&root)?; - Ok(Self { root }) + Ok(Self { + root, + repo_root: repo_root.to_path_buf(), + }) } } @@ -419,52 +1778,575 @@ impl Drop for BenchmarkWorkspace { } } -fn benchmark_scenarios() -> [ScenarioDefinition; 5] { +impl StoredBenchmarkRunHost { + fn from_host(host: &BenchmarkHost) -> Self { + Self { + node_binary: host.node_binary.clone(), + node_version: host.node_version.clone(), + os: host.os.to_owned(), + arch: host.arch.to_owned(), + logical_cpus: host.logical_cpus, + } + } + + fn matches_host(&self, host: &BenchmarkHost) -> bool { + self.node_binary == host.node_binary + && self.node_version == host.node_version + && self.os == host.os + && self.arch == host.arch + && self.logical_cpus == host.logical_cpus + } +} + +impl StoredBenchmarkRunState { + fn new(config: 
&JavascriptBenchmarkConfig, host: &BenchmarkHost, repo_root: &Path) -> Self { + Self { + artifact_version: BENCHMARK_ARTIFACT_VERSION, + config: config.clone(), + host: StoredBenchmarkRunHost::from_host(host), + repo_root: repo_root.to_path_buf(), + transport_rtt: None, + scenarios: Vec::new(), + } + } + + fn is_compatible( + &self, + config: &JavascriptBenchmarkConfig, + host: &BenchmarkHost, + repo_root: &Path, + ) -> bool { + self.artifact_version == BENCHMARK_ARTIFACT_VERSION + && self.config == *config + && self.host.matches_host(host) + && self.repo_root == repo_root + } + + fn sanitized(mut self, definitions: &[ScenarioDefinition]) -> Self { + if let Some(transport_rtt) = &self.transport_rtt { + let payloads = transport_rtt + .iter() + .map(|report| report.payload_bytes) + .collect::>(); + if payloads != TRANSPORT_RTT_PAYLOAD_BYTES { + self.transport_rtt = None; + } + } + + let mut scenarios_by_id = self + .scenarios + .into_iter() + .map(|scenario| (scenario.id.clone(), scenario)) + .collect::>(); + self.scenarios = definitions + .iter() + .filter_map(|definition| scenarios_by_id.remove(definition.id)) + .collect(); + self + } + + fn resumed_stage_count(&self, definitions: &[ScenarioDefinition]) -> usize { + usize::from(self.transport_rtt.is_some()) + + definitions + .iter() + .filter(|definition| self.has_scenario(definition.id)) + .count() + } + + fn has_scenario(&self, id: &str) -> bool { + self.scenarios.iter().any(|scenario| scenario.id == id) + } + + fn record_transport_rtt(&mut self, transport_rtt: &[BenchmarkTransportRttReport]) { + self.transport_rtt = Some( + transport_rtt + .iter() + .map(StoredBenchmarkTransportRttReport::from_report) + .collect(), + ); + } + + fn record_scenario(&mut self, scenario: &BenchmarkScenarioReport) { + self.scenarios.retain(|stored| stored.id != scenario.id); + self.scenarios + .push(StoredBenchmarkScenarioReport::from_report(scenario)); + } + + fn to_report( + &self, + config: &JavascriptBenchmarkConfig, + host: 
&BenchmarkHost, + repo_root: &Path, + definitions: &[ScenarioDefinition], + ) -> JavascriptBenchmarkReport { + let scenarios_by_id = self + .scenarios + .iter() + .map(|scenario| (scenario.id.as_str(), scenario)) + .collect::>(); + + JavascriptBenchmarkReport { + generated_at_unix_ms: current_unix_ms(), + config: config.clone(), + host: host.clone(), + repo_root: repo_root.to_path_buf(), + transport_rtt: self + .transport_rtt + .clone() + .unwrap_or_default() + .into_iter() + .map(StoredBenchmarkTransportRttReport::into_report) + .collect(), + scenarios: definitions + .iter() + .filter_map(|definition| { + scenarios_by_id + .get(definition.id) + .map(|scenario| scenario.to_report(*definition)) + }) + .collect(), + } + } +} + +impl StoredBenchmarkTransportRttReport { + fn from_report(report: &BenchmarkTransportRttReport) -> Self { + Self { + payload_bytes: report.payload_bytes, + samples_ms: report.samples_ms.clone(), + stats: report.stats.clone(), + } + } + + fn into_report(self) -> BenchmarkTransportRttReport { + BenchmarkTransportRttReport { + channel: TRANSPORT_RTT_CHANNEL, + payload_bytes: self.payload_bytes, + samples_ms: self.samples_ms, + stats: self.stats, + } + } +} + +impl StoredBenchmarkScenarioReport { + fn from_report(report: &BenchmarkScenarioReport) -> Self { + Self { + id: report.id.to_owned(), + wall_samples_ms: report.wall_samples_ms.clone(), + wall_stats: report.wall_stats.clone(), + guest_import_samples_ms: report.guest_import_samples_ms.clone(), + guest_import_stats: report.guest_import_stats.clone(), + startup_overhead_samples_ms: report.startup_overhead_samples_ms.clone(), + startup_overhead_stats: report.startup_overhead_stats.clone(), + phase_samples_ms: report.phase_samples_ms.clone(), + phase_stats: report.phase_stats.clone(), + resource_usage_samples: report.resource_usage_samples.clone(), + resource_usage_stats: report.resource_usage_stats.clone(), + } + } + + fn to_report(&self, definition: ScenarioDefinition) -> 
BenchmarkScenarioReport { + BenchmarkScenarioReport { + id: definition.id, + workload: definition.workload, + runtime: definition.runtime.label(), + mode: definition.mode.label(), + description: definition.description, + fixture: definition.fixture, + compile_cache: definition.compile_cache.label(), + wall_samples_ms: self.wall_samples_ms.clone(), + wall_stats: self.wall_stats.clone(), + guest_import_samples_ms: self.guest_import_samples_ms.clone(), + guest_import_stats: self.guest_import_stats.clone(), + startup_overhead_samples_ms: self.startup_overhead_samples_ms.clone(), + startup_overhead_stats: self.startup_overhead_stats.clone(), + phase_samples_ms: self.phase_samples_ms.clone(), + phase_stats: self.phase_stats.clone(), + resource_usage_samples: self.resource_usage_samples.clone(), + resource_usage_stats: self.resource_usage_stats.clone(), + } + } +} + +pub fn run_javascript_benchmarks_with_recovery( + config: &JavascriptBenchmarkConfig, + baseline_path: Option<&Path>, +) -> Result { + if config.iterations == 0 { + return Err(JavascriptBenchmarkError::InvalidConfig( + "iterations must be greater than zero", + )); + } + + let repo_root = workspace_root()?; + let host = benchmark_host()?; + let artifact_dir = benchmark_artifact_dir(&repo_root); + let workspace = BenchmarkWorkspace::create(&repo_root)?; + let (report, resumed_stage_count, state_path) = orchestrate_javascript_benchmark_report( + config, + &repo_root, + &host, + &artifact_dir, + || measure_transport_rtt(&workspace, config), + |scenario| run_scenario(&workspace, config, scenario), + )?; + let comparison = baseline_path + .map(|path| report.compare_to_baseline_path(path)) + .transpose()?; + let artifact_paths = + report.write_artifacts_with_comparison(&artifact_dir, comparison.as_ref())?; + remove_file_if_exists(&state_path)?; + + Ok(JavascriptBenchmarkRunOutput { + artifact_paths, + resumed_stage_count, + }) +} + +fn orchestrate_javascript_benchmark_report( + config: &JavascriptBenchmarkConfig, + 
repo_root: &Path, + host: &BenchmarkHost, + artifact_dir: &Path, + mut measure_transport: MeasureTransport, + mut run_scenario: RunScenario, +) -> Result<(JavascriptBenchmarkReport, usize, PathBuf), JavascriptBenchmarkError> +where + MeasureTransport: FnMut() -> Result, JavascriptBenchmarkError>, + RunScenario: + FnMut(ScenarioDefinition) -> Result, +{ + if config.iterations == 0 { + return Err(JavascriptBenchmarkError::InvalidConfig( + "iterations must be greater than zero", + )); + } + + fs::create_dir_all(artifact_dir)?; + + let definitions = benchmark_scenarios(); + let state_path = benchmark_run_state_path(artifact_dir); + let mut state = load_benchmark_run_state(&state_path, config, host, repo_root, &definitions)?; + let resumed_stage_count = state.resumed_stage_count(&definitions); + + if state.transport_rtt.is_none() { + let transport_rtt = measure_transport()?; + state.record_transport_rtt(&transport_rtt); + persist_benchmark_run_state(&state_path, &state)?; + } + + for definition in definitions { + if state.has_scenario(definition.id) { + continue; + } + + let scenario = run_scenario(definition)?; + state.record_scenario(&scenario); + persist_benchmark_run_state(&state_path, &state)?; + } + + Ok(( + state.to_report(config, host, repo_root, &benchmark_scenarios()), + resumed_stage_count, + state_path, + )) +} + +fn benchmark_scenarios() -> [ScenarioDefinition; 21] { [ ScenarioDefinition { id: "isolate-startup", + workload: "startup-floor", + runtime: ScenarioRuntime::NativeExecution, + mode: ScenarioMode::BaselineControl, description: "Minimal guest with no extra imports. 
Measures the current startup floor for create-context plus node process bootstrap.", fixture: "empty entrypoint", entrypoint: "./bench/isolate-startup.mjs", compile_cache: CompileCacheStrategy::Disabled, + engine_reuse: EngineReuseStrategy::FreshPerSample, + expect_import_metric: false, + env: ScenarioEnvironment::None, + }, + ScenarioDefinition { + id: "prewarmed-isolate-startup", + workload: "startup-floor", + runtime: ScenarioRuntime::NativeExecution, + mode: ScenarioMode::SameEngineReplay, + description: + "Minimal guest after a priming pass while one execution engine keeps materialized assets and builtin/polyfill prewarm state alive, isolating the hot startup floor from import work.", + fixture: "empty entrypoint", + entrypoint: "./bench/isolate-startup.mjs", + compile_cache: CompileCacheStrategy::Primed, + engine_reuse: EngineReuseStrategy::SharedAcrossScenario, expect_import_metric: false, + env: ScenarioEnvironment::None, }, ScenarioDefinition { id: "cold-local-import", + workload: "local-import", + runtime: ScenarioRuntime::NativeExecution, + mode: ScenarioMode::TrueColdStart, description: "Cold import of a repo-local ESM graph that simulates layered application modules without compile-cache reuse.", fixture: "24-module local ESM graph", entrypoint: "./bench/cold-local-import.mjs", compile_cache: CompileCacheStrategy::Disabled, + engine_reuse: EngineReuseStrategy::FreshPerSample, expect_import_metric: true, + env: ScenarioEnvironment::None, }, ScenarioDefinition { id: "warm-local-import", + workload: "local-import", + runtime: ScenarioRuntime::NativeExecution, + mode: ScenarioMode::NewSessionReplay, description: "Warm import of the same local ESM graph after a compile-cache priming pass in an earlier isolate.", fixture: "24-module local ESM graph", entrypoint: "./bench/warm-local-import.mjs", compile_cache: CompileCacheStrategy::Primed, + engine_reuse: EngineReuseStrategy::FreshPerSample, expect_import_metric: true, + env: ScenarioEnvironment::None, + }, + 
ScenarioDefinition { + id: "same-context-local-import", + workload: "local-import", + runtime: ScenarioRuntime::NativeExecution, + mode: ScenarioMode::SameSessionReplay, + description: + "Warm import of the same local ESM graph by replaying executions against one reused JavaScript context after a compile-cache priming pass.", + fixture: "24-module local ESM graph", + entrypoint: "./bench/warm-local-import.mjs", + compile_cache: CompileCacheStrategy::Primed, + engine_reuse: EngineReuseStrategy::SharedContextAcrossScenario, + expect_import_metric: true, + env: ScenarioEnvironment::None, + }, + ScenarioDefinition { + id: "prewarmed-local-import", + workload: "local-import", + runtime: ScenarioRuntime::NativeExecution, + mode: ScenarioMode::SameEngineReplay, + description: + "Warm import of the same local ESM graph after compile-cache priming while one execution engine keeps materialized assets and builtin/polyfill prewarm state alive.", + fixture: "24-module local ESM graph", + entrypoint: "./bench/warm-local-import.mjs", + compile_cache: CompileCacheStrategy::Primed, + engine_reuse: EngineReuseStrategy::SharedAcrossScenario, + expect_import_metric: true, + env: ScenarioEnvironment::None, + }, + ScenarioDefinition { + id: "host-local-import", + workload: "local-import", + runtime: ScenarioRuntime::HostNode, + mode: ScenarioMode::HostControl, + description: + "Direct host-Node control for the same local ESM graph so later runs can separate native executor overhead from guest import work.", + fixture: "24-module local ESM graph", + entrypoint: "./bench/cold-local-import.mjs", + compile_cache: CompileCacheStrategy::Disabled, + engine_reuse: EngineReuseStrategy::FreshPerSample, + expect_import_metric: true, + env: ScenarioEnvironment::None, }, ScenarioDefinition { id: "builtin-import", + workload: "builtin-import", + runtime: ScenarioRuntime::NativeExecution, + mode: ScenarioMode::TrueColdStart, description: "Import of the common builtin path used by the wrappers and 
polyfill-adjacent bootstrap code.", fixture: "node:path + node:url + node:fs/promises", entrypoint: "./bench/builtin-import.mjs", compile_cache: CompileCacheStrategy::Disabled, + engine_reuse: EngineReuseStrategy::FreshPerSample, + expect_import_metric: true, + env: ScenarioEnvironment::None, + }, + ScenarioDefinition { + id: "hot-builtin-stream-import", + workload: "builtin-hot-import", + runtime: ScenarioRuntime::NativeExecution, + mode: ScenarioMode::SameEngineReplay, + description: + "Hot single-import microbench for `node:stream` after a priming pass inside one reused execution engine.", + fixture: "node:stream", + entrypoint: "./bench/hot-builtin-stream-import.mjs", + compile_cache: CompileCacheStrategy::Primed, + engine_reuse: EngineReuseStrategy::SharedAcrossScenario, + expect_import_metric: true, + env: ScenarioEnvironment::None, + }, + ScenarioDefinition { + id: "hot-builtin-stream-web-import", + workload: "builtin-hot-import", + runtime: ScenarioRuntime::NativeExecution, + mode: ScenarioMode::SameEngineReplay, + description: + "Hot single-import microbench for `node:stream/web` after a priming pass inside one reused execution engine.", + fixture: "node:stream/web", + entrypoint: "./bench/hot-builtin-stream-web-import.mjs", + compile_cache: CompileCacheStrategy::Primed, + engine_reuse: EngineReuseStrategy::SharedAcrossScenario, + expect_import_metric: true, + env: ScenarioEnvironment::None, + }, + ScenarioDefinition { + id: "hot-builtin-crypto-import", + workload: "builtin-hot-import", + runtime: ScenarioRuntime::NativeExecution, + mode: ScenarioMode::SameEngineReplay, + description: + "Hot single-import microbench for `node:crypto` after a priming pass inside one reused execution engine.", + fixture: "node:crypto", + entrypoint: "./bench/hot-builtin-crypto-import.mjs", + compile_cache: CompileCacheStrategy::Primed, + engine_reuse: EngineReuseStrategy::SharedAcrossScenario, + expect_import_metric: true, + env: ScenarioEnvironment::None, + }, + 
ScenarioDefinition { + id: "hot-builtin-zlib-import", + workload: "builtin-hot-import", + runtime: ScenarioRuntime::NativeExecution, + mode: ScenarioMode::SameEngineReplay, + description: + "Hot single-import microbench for `node:zlib` after a priming pass inside one reused execution engine.", + fixture: "node:zlib", + entrypoint: "./bench/hot-builtin-zlib-import.mjs", + compile_cache: CompileCacheStrategy::Primed, + engine_reuse: EngineReuseStrategy::SharedAcrossScenario, + expect_import_metric: true, + env: ScenarioEnvironment::None, + }, + ScenarioDefinition { + id: "hot-builtin-assert-import", + workload: "builtin-hot-import", + runtime: ScenarioRuntime::NativeExecution, + mode: ScenarioMode::SameEngineReplay, + description: + "Hot single-import microbench for `node:assert/strict` after a priming pass inside one reused execution engine.", + fixture: "node:assert/strict", + entrypoint: "./bench/hot-builtin-assert-import.mjs", + compile_cache: CompileCacheStrategy::Primed, + engine_reuse: EngineReuseStrategy::SharedAcrossScenario, + expect_import_metric: true, + env: ScenarioEnvironment::None, + }, + ScenarioDefinition { + id: "hot-builtin-url-import", + workload: "builtin-hot-import", + runtime: ScenarioRuntime::NativeExecution, + mode: ScenarioMode::SameEngineReplay, + description: + "Hot single-import microbench for `node:url` after a priming pass inside one reused execution engine.", + fixture: "node:url", + entrypoint: "./bench/hot-builtin-url-import.mjs", + compile_cache: CompileCacheStrategy::Primed, + engine_reuse: EngineReuseStrategy::SharedAcrossScenario, expect_import_metric: true, + env: ScenarioEnvironment::None, + }, + ScenarioDefinition { + id: "hot-projected-package-file-import", + workload: "projected-package-hot-import", + runtime: ScenarioRuntime::NativeExecution, + mode: ScenarioMode::SameEngineReplay, + description: + "Hot projected-package single-import microbench for the TypeScript compiler file with compile cache and projected-source 
manifest reuse enabled across repeated contexts.", + fixture: "projected TypeScript compiler file", + entrypoint: "./bench/hot-projected-package-file-import.mjs", + compile_cache: CompileCacheStrategy::Primed, + engine_reuse: EngineReuseStrategy::SharedAcrossScenario, + expect_import_metric: true, + env: ScenarioEnvironment::ProjectedWorkspaceNodeModules, }, ScenarioDefinition { id: "large-package-import", + workload: "large-package-import", + runtime: ScenarioRuntime::NativeExecution, + mode: ScenarioMode::TrueColdStart, description: "Cold import of the real-world `typescript` package from the workspace root `node_modules` tree.", fixture: "typescript", entrypoint: "./bench/large-package-import.mjs", compile_cache: CompileCacheStrategy::Disabled, + engine_reuse: EngineReuseStrategy::FreshPerSample, + expect_import_metric: true, + env: ScenarioEnvironment::None, + }, + ScenarioDefinition { + id: "projected-package-import", + workload: "projected-package-import", + runtime: ScenarioRuntime::NativeExecution, + mode: ScenarioMode::SameEngineReplay, + description: + "Projected-package guest-path import of TypeScript with compile cache and projected-source manifest reuse enabled across repeated contexts.", + fixture: "projected TypeScript guest-path import", + entrypoint: "./bench/projected-package-import.mjs", + compile_cache: CompileCacheStrategy::Primed, + engine_reuse: EngineReuseStrategy::SharedAcrossScenario, + expect_import_metric: true, + env: ScenarioEnvironment::ProjectedWorkspaceNodeModules, + }, + ScenarioDefinition { + id: "pdf-lib-startup", + workload: "pdf-lib-startup", + runtime: ScenarioRuntime::NativeExecution, + mode: ScenarioMode::TrueColdStart, + description: + "Cold import of `pdf-lib` plus representative document setup that creates a PDF page and embeds a standard font.", + fixture: "pdf-lib document creation", + entrypoint: "./bench/pdf-lib-startup.mjs", + compile_cache: CompileCacheStrategy::Disabled, + engine_reuse: 
EngineReuseStrategy::FreshPerSample, + expect_import_metric: true, + env: ScenarioEnvironment::None, + }, + ScenarioDefinition { + id: "jszip-startup", + workload: "jszip-startup", + runtime: ScenarioRuntime::NativeExecution, + mode: ScenarioMode::TrueColdStart, + description: + "Cold import of `jszip` plus representative archive staging that builds a nested archive structure.", + fixture: "jszip archive staging", + entrypoint: "./bench/jszip-startup.mjs", + compile_cache: CompileCacheStrategy::Disabled, + engine_reuse: EngineReuseStrategy::FreshPerSample, + expect_import_metric: true, + env: ScenarioEnvironment::None, + }, + ScenarioDefinition { + id: "jszip-end-to-end", + workload: "jszip-end-to-end", + runtime: ScenarioRuntime::NativeExecution, + mode: ScenarioMode::TrueColdStart, + description: + "Cold import of `jszip` plus a full compressed archive roundtrip that writes, compresses, reloads, and validates nested archive contents.", + fixture: "jszip end-to-end archive roundtrip", + entrypoint: "./bench/jszip-end-to-end.mjs", + compile_cache: CompileCacheStrategy::Disabled, + engine_reuse: EngineReuseStrategy::FreshPerSample, + expect_import_metric: true, + env: ScenarioEnvironment::None, + }, + ScenarioDefinition { + id: "jszip-repeated-session-compressed", + workload: "jszip-repeated-session-compressed", + runtime: ScenarioRuntime::NativeExecution, + mode: ScenarioMode::NewSessionReplay, + description: + "Repeated-session `jszip` workload after a compile-cache priming pass that compresses and reloads a nested archive in each fresh isolate.", + fixture: "jszip compressed archive roundtrip", + entrypoint: "./bench/jszip-repeated-session-compressed.mjs", + compile_cache: CompileCacheStrategy::Primed, + engine_reuse: EngineReuseStrategy::FreshPerSample, expect_import_metric: true, + env: ScenarioEnvironment::None, }, ] } @@ -478,23 +2360,32 @@ fn run_scenario( .root .join("compile-cache") .join(scenario.id.replace('-', "_")); + let mut shared_engine = match 
scenario.engine_reuse { + EngineReuseStrategy::FreshPerSample => None, + EngineReuseStrategy::SharedAcrossScenario + | EngineReuseStrategy::SharedContextAcrossScenario => { + Some(JavascriptExecutionEngine::default()) + } + }; + let mut shared_context = None; if scenario.compile_cache == CompileCacheStrategy::Primed { run_sample( workspace, &scenario, Some(compile_cache_root.clone()), - "prime-cache", + shared_engine.as_mut(), + &mut shared_context, )?; } - for warmup_index in 0..config.warmup_iterations { - let label = format!("warmup-{}", warmup_index + 1); + for _ in 0..config.warmup_iterations { run_sample( workspace, &scenario, compile_cache_root_for_strategy(scenario.compile_cache, &compile_cache_root), - &label, + shared_engine.as_mut(), + &mut shared_context, )?; } @@ -504,34 +2395,59 @@ fn run_scenario( } else { None }; + let mut context_setup_samples_ms = Vec::with_capacity(config.iterations); + let mut startup_samples_ms = Vec::with_capacity(config.iterations); + let mut completion_samples_ms = Vec::with_capacity(config.iterations); + let mut resource_usage_samples = BenchmarkResourceUsage::>::default(); - for iteration in 0..config.iterations { - let label = format!("measure-{}", iteration + 1); + for _ in 0..config.iterations { let sample = run_sample( workspace, &scenario, compile_cache_root_for_strategy(scenario.compile_cache, &compile_cache_root), - &label, + shared_engine.as_mut(), + &mut shared_context, )?; wall_samples_ms.push(sample.wall_ms); + context_setup_samples_ms.push(sample.context_setup_ms); + startup_samples_ms.push(sample.startup_ms); + completion_samples_ms.push(sample.completion_ms); if let (Some(import_ms), Some(samples)) = (sample.guest_import_ms, guest_import_samples_ms.as_mut()) { samples.push(import_ms); } + if let Some(resource_usage) = sample.resource_usage.as_ref() { + resource_usage_samples.push_sample(resource_usage); + } } let startup_overhead_samples_ms = guest_import_samples_ms.as_ref().map(|guest_samples| { - 
wall_samples_ms + context_setup_samples_ms .iter() + .zip(startup_samples_ms.iter()) + .zip(completion_samples_ms.iter()) .zip(guest_samples.iter()) - .map(|(wall_ms, import_ms)| wall_ms - import_ms) + .map(|(((context_ms, startup_ms), completion_ms), _guest_ms)| { + context_ms + startup_ms + completion_ms + }) .collect::>() }); + let phase_samples_ms = BenchmarkScenarioPhases { + context_setup_ms: context_setup_samples_ms, + startup_ms: startup_samples_ms, + guest_execution_ms: guest_import_samples_ms.clone(), + completion_ms: completion_samples_ms, + }; + let resource_usage_samples = resource_usage_samples.into_populated(); + Ok(BenchmarkScenarioReport { id: scenario.id, + workload: scenario.workload, + runtime: scenario.runtime.label(), + mode: scenario.mode.label(), description: scenario.description, fixture: scenario.fixture, compile_cache: scenario.compile_cache.label(), @@ -542,9 +2458,23 @@ fn run_scenario( startup_overhead_stats: startup_overhead_samples_ms .as_ref() .map(|samples| compute_stats(samples)), + phase_stats: BenchmarkScenarioPhases { + context_setup_ms: compute_stats(&phase_samples_ms.context_setup_ms), + startup_ms: compute_stats(&phase_samples_ms.startup_ms), + guest_execution_ms: phase_samples_ms + .guest_execution_ms + .as_ref() + .map(|samples| compute_stats(samples)), + completion_ms: compute_stats(&phase_samples_ms.completion_ms), + }, + resource_usage_stats: resource_usage_samples + .as_ref() + .and_then(compute_resource_usage_stats), wall_samples_ms, guest_import_samples_ms, startup_overhead_samples_ms, + phase_samples_ms, + resource_usage_samples, }) } @@ -559,26 +2489,70 @@ fn run_sample( workspace: &BenchmarkWorkspace, scenario: &ScenarioDefinition, compile_cache_root: Option, - _label: &str, + shared_engine: Option<&mut JavascriptExecutionEngine>, + shared_context: &mut Option, ) -> Result { - let mut engine = JavascriptExecutionEngine::default(); - let started_at = Instant::now(); - let context = 
engine.create_context(CreateJavascriptContextRequest { - vm_id: String::from("vm-bench"), - bootstrap_module: None, - compile_cache_root, - }); + match scenario.runtime { + ScenarioRuntime::NativeExecution => run_native_sample( + workspace, + scenario, + compile_cache_root, + shared_engine, + shared_context, + ), + ScenarioRuntime::HostNode => run_host_node_sample(workspace, scenario), + } +} + +fn run_native_sample( + workspace: &BenchmarkWorkspace, + scenario: &ScenarioDefinition, + compile_cache_root: Option, + shared_engine: Option<&mut JavascriptExecutionEngine>, + shared_context: &mut Option, +) -> Result { + let mut fresh_engine = JavascriptExecutionEngine::default(); + let engine = shared_engine.unwrap_or(&mut fresh_engine); + let context_started_at = Instant::now(); + let (context, context_setup_ms) = match scenario.engine_reuse { + EngineReuseStrategy::SharedContextAcrossScenario => { + if let Some(context) = shared_context.as_ref() { + (context.clone(), 0.0) + } else { + let context = engine.create_context(CreateJavascriptContextRequest { + vm_id: String::from("vm-bench"), + bootstrap_module: None, + compile_cache_root, + }); + let context_setup_ms = context_started_at.elapsed().as_secs_f64() * 1000.0; + *shared_context = Some(context.clone()); + (context, context_setup_ms) + } + } + _ => { + let context = engine.create_context(CreateJavascriptContextRequest { + vm_id: String::from("vm-bench"), + bootstrap_module: None, + compile_cache_root, + }); + let context_setup_ms = context_started_at.elapsed().as_secs_f64() * 1000.0; + (context, context_setup_ms) + } + }; + let startup_started_at = Instant::now(); let execution = engine.start_execution(StartJavascriptExecutionRequest { vm_id: String::from("vm-bench"), context_id: context.context_id, argv: vec![String::from(scenario.entrypoint)], - env: BTreeMap::new(), + env: scenario_env(workspace, scenario), cwd: workspace.root.clone(), })?; + let startup_ms = startup_started_at.elapsed().as_secs_f64() * 1000.0; 
+ let completion_started_at = Instant::now(); let result = execution.wait()?; - let wall_ms = started_at.elapsed().as_secs_f64() * 1000.0; + let completion_total_ms = completion_started_at.elapsed().as_secs_f64() * 1000.0; let stdout = String::from_utf8(result.stdout)?; let stderr = String::from_utf8(result.stderr)?; @@ -590,31 +2564,265 @@ fn run_sample( }); } - let guest_import_ms = if scenario.expect_import_metric { - Some(parse_benchmark_metric(scenario.id, &stdout)?) - } else { - None - }; + let parsed_metrics = + parse_benchmark_metrics(scenario.id, &stdout, scenario.expect_import_metric)?; + let guest_import_ms = parsed_metrics.import_ms; + let completion_ms = guest_import_ms + .map(|guest_ms| saturating_delta_ms(completion_total_ms, guest_ms)) + .unwrap_or(completion_total_ms); + let wall_ms = context_setup_ms + startup_ms + completion_total_ms; + + Ok(SampleMeasurement { + wall_ms, + guest_import_ms, + context_setup_ms, + startup_ms, + completion_ms, + resource_usage: parsed_metrics.resource_usage, + }) +} + +fn run_host_node_sample( + workspace: &BenchmarkWorkspace, + scenario: &ScenarioDefinition, +) -> Result { + let started_at = Instant::now(); + let output = Command::new(crate::node_process::node_binary()) + .arg(scenario.entrypoint) + .current_dir(&workspace.root) + .envs(scenario_env(workspace, scenario)) + .output()?; + let wall_ms = started_at.elapsed().as_secs_f64() * 1000.0; + let stdout = String::from_utf8(output.stdout)?; + let stderr = String::from_utf8(output.stderr)?; + + if !output.status.success() { + return Err(JavascriptBenchmarkError::NonZeroExit { + scenario: scenario.id, + exit_code: output.status.code().unwrap_or(-1), + stderr, + }); + } + + let parsed_metrics = + parse_benchmark_metrics(scenario.id, &stdout, scenario.expect_import_metric)?; + let guest_import_ms = parsed_metrics.import_ms; + let startup_ms = guest_import_ms + .map(|guest_ms| saturating_delta_ms(wall_ms, guest_ms)) + .unwrap_or(wall_ms); Ok(SampleMeasurement { 
wall_ms, guest_import_ms, + context_setup_ms: 0.0, + startup_ms, + completion_ms: 0.0, + resource_usage: parsed_metrics.resource_usage, }) } -fn parse_benchmark_metric( +fn scenario_env( + workspace: &BenchmarkWorkspace, + scenario: &ScenarioDefinition, +) -> BTreeMap { + match scenario.env { + ScenarioEnvironment::None => BTreeMap::new(), + ScenarioEnvironment::ProjectedWorkspaceNodeModules => { + let projected_node_modules = workspace.repo_root.join("node_modules"); + let projected_node_modules_json = + serde_json::to_string(&vec![projected_node_modules.display().to_string()]) + .expect("serialize projected node_modules read path"); + let guest_path_mappings = serde_json::json!([{ + "guestPath": "/root/node_modules", + "hostPath": projected_node_modules.display().to_string(), + }]) + .to_string(); + + BTreeMap::from([ + ( + String::from("AGENT_OS_EXTRA_FS_READ_PATHS"), + projected_node_modules_json, + ), + ( + String::from("AGENT_OS_GUEST_PATH_MAPPINGS"), + guest_path_mappings, + ), + ]) + } + } +} + +fn measure_transport_rtt( + workspace: &BenchmarkWorkspace, + config: &JavascriptBenchmarkConfig, +) -> Result, JavascriptBenchmarkError> { + let mut engine = JavascriptExecutionEngine::default(); + let context = engine.create_context(CreateJavascriptContextRequest { + vm_id: String::from("vm-transport"), + bootstrap_module: None, + compile_cache_root: None, + }); + let mut execution = engine.start_execution(StartJavascriptExecutionRequest { + vm_id: String::from("vm-transport"), + context_id: context.context_id, + argv: vec![String::from("./bench/transport-echo.mjs")], + env: BTreeMap::from([(String::from("AGENT_OS_KEEP_STDIN_OPEN"), String::from("1"))]), + cwd: workspace.root.clone(), + })?; + + let mut stdout_buffer = String::new(); + let mut stderr_buffer = String::new(); + let mut reports = Vec::with_capacity(TRANSPORT_RTT_PAYLOAD_BYTES.len()); + + for payload_bytes in TRANSPORT_RTT_PAYLOAD_BYTES { + for warmup_index in 0..config.warmup_iterations { + let label 
= format!("warmup-{}-{warmup_index}", payload_bytes); + measure_transport_roundtrip( + &mut execution, + payload_bytes, + &label, + &mut stdout_buffer, + &mut stderr_buffer, + )?; + } + + let mut samples_ms = Vec::with_capacity(config.iterations); + for iteration in 0..config.iterations { + let label = format!("measure-{}-{iteration}", payload_bytes); + samples_ms.push(measure_transport_roundtrip( + &mut execution, + payload_bytes, + &label, + &mut stdout_buffer, + &mut stderr_buffer, + )?); + } + + reports.push(BenchmarkTransportRttReport { + channel: TRANSPORT_RTT_CHANNEL, + payload_bytes, + stats: compute_stats(&samples_ms), + samples_ms, + }); + } + + execution.close_stdin()?; + let result = execution.wait()?; + if result.exit_code != 0 { + stderr_buffer.push_str(&String::from_utf8(result.stderr)?); + return Err(JavascriptBenchmarkError::TransportProbeExited { + exit_code: result.exit_code, + stderr: stderr_buffer, + }); + } + + Ok(reports) +} + +fn measure_transport_roundtrip( + execution: &mut crate::JavascriptExecution, + payload_bytes: usize, + label: &str, + stdout_buffer: &mut String, + stderr_buffer: &mut String, +) -> Result { + let payload = transport_probe_payload(payload_bytes, label); + let expected_line = format!("{payload}\n"); + let started_at = Instant::now(); + execution.write_stdin(expected_line.as_bytes())?; + + loop { + if let Some(line) = take_complete_line(stdout_buffer) { + if line == payload { + return Ok(started_at.elapsed().as_secs_f64() * 1000.0); + } + return Err(JavascriptBenchmarkError::InvalidTransportProbeResponse { + payload_bytes, + expected: payload, + actual: line, + }); + } + + match execution.poll_event(TRANSPORT_POLL_TIMEOUT)? 
{ + Some(crate::JavascriptExecutionEvent::Stdout(chunk)) => { + stdout_buffer.push_str(&String::from_utf8(chunk)?); + } + Some(crate::JavascriptExecutionEvent::Stderr(chunk)) => { + stderr_buffer.push_str(&String::from_utf8(chunk)?); + } + Some(crate::JavascriptExecutionEvent::Exited(exit_code)) => { + return Err(JavascriptBenchmarkError::TransportProbeExited { + exit_code, + stderr: stderr_buffer.clone(), + }); + } + None => { + return Err(JavascriptBenchmarkError::TransportProbeTimeout { payload_bytes }); + } + } + } +} + +fn transport_probe_payload(payload_bytes: usize, label: &str) -> String { + if payload_bytes == 0 { + return format!("transport:{label}:"); + } + + let header = format!("transport:{label}:"); + let fill_len = payload_bytes.saturating_sub(header.len()); + format!("{header}{}", "x".repeat(fill_len)) +} + +fn take_complete_line(buffer: &mut String) -> Option { + let newline_index = buffer.find('\n')?; + let line = buffer[..newline_index].trim_end_matches('\r').to_owned(); + buffer.drain(..=newline_index); + Some(line) +} + +#[derive(Debug, Default, Deserialize)] +struct ParsedBenchmarkMetrics { + #[serde(default)] + import_ms: Option, + #[serde(default)] + resource_usage: Option>, +} + +fn parse_benchmark_metrics( scenario_id: &'static str, stdout: &str, -) -> Result { + expect_import_metric: bool, +) -> Result { let raw_value = stdout .lines() + .rev() .find_map(|line| line.strip_prefix(BENCHMARK_MARKER_PREFIX)) .ok_or(JavascriptBenchmarkError::MissingBenchmarkMetric( scenario_id, - ))?; + ))? 
+ .trim(); + + if let Ok(parsed) = serde_json::from_str::(raw_value) { + let has_resource_usage = match parsed.resource_usage.as_ref() { + Some(resource_usage) => !resource_usage.is_empty(), + None => false, + }; + if parsed.import_ms.is_some() || has_resource_usage { + if expect_import_metric && parsed.import_ms.is_none() { + return Err(JavascriptBenchmarkError::MissingBenchmarkMetric( + scenario_id, + )); + } + return Ok(parsed); + } + } raw_value .parse::() + .map(|import_ms| ParsedBenchmarkMetrics { + import_ms: Some(import_ms), + resource_usage: None, + }) .map_err(|_| JavascriptBenchmarkError::InvalidBenchmarkMetric { scenario: scenario_id, raw_value: raw_value.to_owned(), @@ -630,6 +2838,16 @@ fn workspace_root() -> Result { .ok_or(JavascriptBenchmarkError::InvalidWorkspaceRoot(manifest_dir)) } +fn load_benchmark_artifact( + baseline_path: &Path, +) -> Result { + let raw = fs::read_to_string(baseline_path)?; + serde_json::from_str(&raw).map_err(|err| JavascriptBenchmarkError::InvalidBaselineReport { + path: baseline_path.to_path_buf(), + message: err.to_string(), + }) +} + fn benchmark_host() -> Result { let node_binary = crate::node_process::node_binary(); let output = Command::new(&node_binary) @@ -680,10 +2898,14 @@ fn write_benchmark_workspace(root: &Path) -> Result<(), JavascriptBenchmarkError last = LOCAL_GRAPH_MODULE_COUNT - 1 ), )?; + fs::write( + root.join("bench/benchmark-metrics.mjs"), + benchmark_metrics_module_source(), + )?; fs::write( root.join("bench/isolate-startup.mjs"), - "console.log('isolate-ready');\n", + resource_only_entrypoint_source("console.log('isolate-ready');"), )?; fs::write( root.join("bench/cold-local-import.mjs"), @@ -695,25 +2917,163 @@ fn write_benchmark_workspace(root: &Path) -> Result<(), JavascriptBenchmarkError )?; fs::write( root.join("bench/builtin-import.mjs"), - format!( - "import {{ performance }} from 'node:perf_hooks';\nconst started = performance.now();\nconst [pathMod, fsMod, urlMod] = await Promise.all([\n 
import('node:path'),\n import('node:fs/promises'),\n import('node:url'),\n]);\nif (typeof pathMod.basename !== 'function' || typeof fsMod.readFile !== 'function' || typeof urlMod.pathToFileURL !== 'function') {{\n throw new Error('builtin import fixture did not load expected exports');\n}}\nconsole.log('{BENCHMARK_MARKER_PREFIX}' + String(performance.now() - started));\n", + timed_entrypoint_source( + "const [pathMod, fsMod, urlMod] = await Promise.all([\n import('node:path'),\n import('node:fs/promises'),\n import('node:url'),\n]);\nif (typeof pathMod.basename !== 'function' || typeof fsMod.readFile !== 'function' || typeof urlMod.pathToFileURL !== 'function') {\n throw new Error('builtin import fixture did not load expected exports');\n}", + ), + )?; + fs::write( + root.join("bench/hot-builtin-stream-import.mjs"), + single_import_entrypoint_source( + "node:stream", + "typeof imported.Readable === 'function'", + "node:stream import did not expose Readable", + ), + )?; + fs::write( + root.join("bench/hot-builtin-stream-web-import.mjs"), + single_import_entrypoint_source( + "node:stream/web", + "typeof imported.ReadableStream === 'function'", + "node:stream/web import did not expose ReadableStream", + ), + )?; + fs::write( + root.join("bench/hot-builtin-crypto-import.mjs"), + single_import_entrypoint_source( + "node:crypto", + "typeof imported.createHash === 'function'", + "node:crypto import did not expose createHash", + ), + )?; + fs::write( + root.join("bench/hot-builtin-zlib-import.mjs"), + single_import_entrypoint_source( + "node:zlib", + "typeof imported.gzipSync === 'function'", + "node:zlib import did not expose gzipSync", + ), + )?; + fs::write( + root.join("bench/hot-builtin-assert-import.mjs"), + single_import_entrypoint_source( + "node:assert/strict", + "typeof imported.strictEqual === 'function'", + "node:assert/strict import did not expose strictEqual", + ), + )?; + fs::write( + root.join("bench/hot-builtin-url-import.mjs"), + 
single_import_entrypoint_source( + "node:url", + "typeof imported.pathToFileURL === 'function'", + "node:url import did not expose pathToFileURL", ), )?; fs::write( root.join("bench/large-package-import.mjs"), - format!( - "import {{ performance }} from 'node:perf_hooks';\nconst started = performance.now();\nconst typescript = await import('typescript');\nif (typeof typescript.transpileModule !== 'function') {{\n throw new Error('typescript import did not expose transpileModule');\n}}\nconsole.log('{BENCHMARK_MARKER_PREFIX}' + String(performance.now() - started));\n", + timed_entrypoint_source( + "const typescript = await import('typescript');\nif (typeof typescript.transpileModule !== 'function') {\n throw new Error('typescript import did not expose transpileModule');\n}", ), )?; + fs::write( + root.join("bench/hot-projected-package-file-import.mjs"), + projected_package_file_import_entrypoint_source(), + )?; + fs::write( + root.join("bench/projected-package-import.mjs"), + projected_package_import_entrypoint_source(), + )?; + fs::write( + root.join("bench/pdf-lib-startup.mjs"), + pdf_lib_startup_entrypoint_source(), + )?; + fs::write( + root.join("bench/jszip-startup.mjs"), + jszip_startup_entrypoint_source(), + )?; + fs::write( + root.join("bench/jszip-end-to-end.mjs"), + jszip_end_to_end_entrypoint_source(), + )?; + fs::write( + root.join("bench/jszip-repeated-session-compressed.mjs"), + jszip_repeated_session_compressed_entrypoint_source(), + )?; + fs::write( + root.join("bench/transport-echo.mjs"), + "import readline from 'node:readline';\nconst rl = readline.createInterface({ input: process.stdin, crlfDelay: Infinity });\nfor await (const line of rl) {\n process.stdout.write(`${line}\\n`);\n}\n", + )?; Ok(()) } fn local_import_entrypoint_source(final_value: usize) -> String { + timed_entrypoint_source(&format!( + "const graph = await import('./local-graph/root.mjs');\nif (graph.value !== {final_value} || graph.expected !== {final_value}) {{\n throw new 
Error(`local graph import returned ${{\n graph.value\n }} instead of {final_value}`);\n}}" + )) +} + +fn single_import_entrypoint_source( + specifier: &str, + validation_expression: &str, + error_message: &str, +) -> String { + timed_entrypoint_source(&format!( + "const imported = await import('{specifier}');\nif (!({validation_expression})) {{\n throw new Error('{error_message}');\n}}" + )) +} + +fn projected_package_file_import_entrypoint_source() -> String { + timed_entrypoint_source( + "const typescriptModule = await import('/root/node_modules/typescript/lib/typescript.js');\nconst typescript = typescriptModule.default ?? typescriptModule;\nif (typeof typescript.transpileModule !== 'function') {\n throw new Error('projected package file import did not expose transpileModule');\n}", + ) +} + +fn projected_package_import_entrypoint_source() -> String { + timed_entrypoint_source( + "const typescriptModule = await import('/root/node_modules/typescript/lib/typescript.js');\nconst typescript = typescriptModule.default ?? 
typescriptModule;\nconst sourceFile = typescript.createSourceFile(\n 'bench.ts',\n 'const answer: number = 42;',\n typescript.ScriptTarget.ES2022,\n true,\n);\nif (\n typeof typescript.transpileModule !== 'function' ||\n typeof typescript.createSourceFile !== 'function' ||\n !sourceFile ||\n sourceFile.statements.length !== 1\n) {\n throw new Error('projected package import did not expose TypeScript compiler APIs');\n}", + ) +} + +fn pdf_lib_startup_entrypoint_source() -> String { + timed_entrypoint_source( + "const pdfLib = await import('pdf-lib');\nconst pdfDoc = await pdfLib.PDFDocument.create();\nconst page = pdfDoc.addPage([612, 792]);\nconst font = await pdfDoc.embedFont(pdfLib.StandardFonts.Helvetica);\npage.drawText('Agent OS pdf-lib benchmark', {\n x: 50,\n y: 750,\n font,\n size: 18,\n});\nif (pdfDoc.getPageCount() !== 1 || page.getSize().width !== 612) {\n throw new Error('pdf-lib fixture did not create the expected document');\n}", + ) +} + +fn jszip_startup_entrypoint_source() -> String { + timed_entrypoint_source( + "const jszipModule = await import('jszip');\nconst JSZip = jszipModule.default ?? jszipModule;\nconst zip = new JSZip();\nzip.file('README.txt', 'agent-os benchmark archive');\nconst notes = zip.folder('notes');\nif (!notes) {\n throw new Error('jszip fixture failed to create nested folder');\n}\nnotes.file('todo.txt', 'benchmark staging payload');\nconst fileCount = Object.values(zip.files).filter((entry) => !entry.dir).length;\nif (typeof zip.generateAsync !== 'function' || fileCount !== 2) {\n throw new Error('jszip fixture did not stage the expected archive');\n}", + ) +} + +fn jszip_end_to_end_entrypoint_source() -> String { + timed_entrypoint_source( + "const jszipModule = await import('jszip');\nconst JSZip = jszipModule.default ?? 
jszipModule;\nconst zip = new JSZip();\nconst repeatedPayload = 'agent-os benchmark payload '.repeat(512);\nzip.file('README.txt', repeatedPayload);\nconst notes = zip.folder('notes');\nif (!notes) {\n throw new Error('jszip end-to-end fixture failed to create notes folder');\n}\nnotes.file('todo.txt', 'complete the archive roundtrip');\nconst data = zip.folder('data');\nif (!data) {\n throw new Error('jszip end-to-end fixture failed to create data folder');\n}\ndata.file('payload.json', JSON.stringify({\n repeatedPayloadLength: repeatedPayload.length,\n mode: 'cold-end-to-end',\n}));\nconst archiveBytes = await zip.generateAsync({\n type: 'uint8array',\n compression: 'DEFLATE',\n compressionOptions: { level: 6 },\n});\nconst restored = await JSZip.loadAsync(archiveBytes);\nconst restoredFileCount = Object.values(restored.files).filter((entry) => !entry.dir).length;\nconst restoredReadme = await restored.file('README.txt')?.async('string');\nconst restoredTodo = await restored.file('notes/todo.txt')?.async('string');\nconst restoredPayload = await restored.file('data/payload.json')?.async('string');\nif (\n archiveBytes.byteLength >= repeatedPayload.length ||\n restoredFileCount !== 3 ||\n restoredReadme !== repeatedPayload ||\n restoredTodo !== 'complete the archive roundtrip' ||\n !restoredPayload?.includes('cold-end-to-end')\n) {\n throw new Error('jszip end-to-end fixture did not complete the compressed archive roundtrip');\n}", + ) +} + +fn jszip_repeated_session_compressed_entrypoint_source() -> String { + timed_entrypoint_source( + "const jszipModule = await import('jszip');\nconst JSZip = jszipModule.default ?? 
jszipModule;\nconst zip = new JSZip();\nconst repeatedPayload = 'agent-os benchmark payload '.repeat(512);\nzip.file('README.txt', repeatedPayload);\nconst notes = zip.folder('notes');\nif (!notes) {\n throw new Error('jszip repeated-session fixture failed to create notes folder');\n}\nnotes.file('todo.txt', 'repeat this session workload');\nconst data = zip.folder('data');\nif (!data) {\n throw new Error('jszip repeated-session fixture failed to create data folder');\n}\ndata.file('payload.json', JSON.stringify({\n repeatedPayloadLength: repeatedPayload.length,\n repeatedSessions: true,\n}));\nconst archiveBytes = await zip.generateAsync({\n type: 'uint8array',\n compression: 'DEFLATE',\n compressionOptions: { level: 6 },\n});\nconst restored = await JSZip.loadAsync(archiveBytes);\nconst restoredFileCount = Object.values(restored.files).filter((entry) => !entry.dir).length;\nconst restoredReadme = await restored.file('README.txt')?.async('string');\nconst restoredTodo = await restored.file('notes/todo.txt')?.async('string');\nif (\n archiveBytes.byteLength >= repeatedPayload.length ||\n restoredFileCount !== 3 ||\n restoredReadme !== repeatedPayload ||\n restoredTodo !== 'repeat this session workload'\n) {\n throw new Error('jszip repeated-session fixture did not complete the compressed archive roundtrip');\n}", + ) +} + +fn benchmark_metrics_module_source() -> String { + format!( + "const BENCHMARK_MARKER_PREFIX = '{BENCHMARK_MARKER_PREFIX}';\n\nexport function emitBenchmarkMetrics(importMs) {{\n const memoryUsage = process.memoryUsage();\n const resourceUsage = typeof process.resourceUsage === 'function'\n ? process.resourceUsage()\n : null;\n const payload = {{\n resource_usage: {{\n rss_bytes: memoryUsage.rss,\n heap_used_bytes: memoryUsage.heapUsed,\n ...(resourceUsage\n ? 
{{\n cpu_user_us: resourceUsage.userCPUTime,\n cpu_system_us: resourceUsage.systemCPUTime,\n cpu_total_us: resourceUsage.userCPUTime + resourceUsage.systemCPUTime,\n }}\n : {{}}),\n }},\n }};\n\n if (typeof importMs === 'number') {{\n payload.import_ms = importMs;\n }}\n\n console.log(BENCHMARK_MARKER_PREFIX + JSON.stringify(payload));\n}}\n" + ) +} + +fn resource_only_entrypoint_source(body: &str) -> String { + format!( + "import {{ emitBenchmarkMetrics }} from './benchmark-metrics.mjs';\n{body}\nemitBenchmarkMetrics();\n" + ) +} + +fn timed_entrypoint_source(body: &str) -> String { format!( - "import {{ performance }} from 'node:perf_hooks';\nconst started = performance.now();\nconst graph = await import('./local-graph/root.mjs');\nif (graph.value !== {final_value} || graph.expected !== {final_value}) {{\n throw new Error(`local graph import returned ${{ - graph.value - }} instead of {final_value}`);\n}}\nconsole.log('{BENCHMARK_MARKER_PREFIX}' + String(performance.now() - started));\n" + "import {{ performance }} from 'node:perf_hooks';\nimport {{ emitBenchmarkMetrics }} from './benchmark-metrics.mjs';\nconst started = performance.now();\n{body}\nemitBenchmarkMetrics(performance.now() - started);\n" ) } @@ -727,20 +3087,80 @@ fn local_graph_terminal_value() -> usize { value } -fn compute_stats(samples: &[f64]) -> BenchmarkStats { +fn compute_distribution_stats(samples: &[f64]) -> BenchmarkDistributionStats { let mut sorted = samples.to_vec(); sorted.sort_by(|a, b| a.total_cmp(b)); - let mean_ms = sorted.iter().sum::() / sorted.len() as f64; + let mean = sorted.iter().sum::() / sorted.len() as f64; + + BenchmarkDistributionStats { + mean, + p50: percentile(&sorted, 50.0), + p95: percentile(&sorted, 95.0), + min: *sorted.first().unwrap_or(&0.0), + max: *sorted.last().unwrap_or(&0.0), + stddev: standard_deviation(&sorted, mean), + } +} + +fn compute_stats(samples: &[f64]) -> BenchmarkStats { + let stats = compute_distribution_stats(samples); BenchmarkStats { - 
mean_ms, - p50_ms: percentile(&sorted, 50.0), - p95_ms: percentile(&sorted, 95.0), - min_ms: *sorted.first().unwrap_or(&0.0), - max_ms: *sorted.last().unwrap_or(&0.0), + mean_ms: stats.mean, + p50_ms: stats.p50, + p95_ms: stats.p95, + min_ms: stats.min, + max_ms: stats.max, + stddev_ms: stats.stddev, } } +fn compute_resource_usage_stats( + samples: &BenchmarkResourceUsage>, +) -> Option> { + let stats = BenchmarkResourceUsage { + rss_bytes: samples + .rss_bytes + .as_ref() + .map(|samples| compute_distribution_stats(samples)), + heap_used_bytes: samples + .heap_used_bytes + .as_ref() + .map(|samples| compute_distribution_stats(samples)), + cpu_user_us: samples + .cpu_user_us + .as_ref() + .map(|samples| compute_distribution_stats(samples)), + cpu_system_us: samples + .cpu_system_us + .as_ref() + .map(|samples| compute_distribution_stats(samples)), + cpu_total_us: samples + .cpu_total_us + .as_ref() + .map(|samples| compute_distribution_stats(samples)), + }; + + (!stats.is_empty()).then_some(stats) +} + +fn standard_deviation(samples: &[f64], mean: f64) -> f64 { + if samples.is_empty() { + return 0.0; + } + + let variance = samples + .iter() + .map(|sample| { + let delta = sample - mean; + delta * delta + }) + .sum::() + / samples.len() as f64; + + variance.sqrt() +} + fn percentile(sorted: &[f64], p: f64) -> f64 { if sorted.is_empty() { return 0.0; @@ -775,20 +3195,351 @@ fn safe_ratio(lhs: f64, rhs: f64) -> f64 { } } +fn saturating_delta_ms(total_ms: f64, subtracted_ms: f64) -> f64 { + (total_ms - subtracted_ms).max(0.0) +} + fn format_ms(value: f64) -> String { format!("{value:.2}") } +fn format_hotspot_value(unit: &str, value: f64) -> String { + match unit { + "pct" => format!("{value:.1}%"), + "MiB" => format_mib(value), + _ => format_ms(value), + } +} + fn format_sample_list(samples: &[f64]) -> String { + format_scaled_sample_list(samples, std::convert::identity) +} + +fn format_scaled_sample_list(samples: &[f64], scale: impl Fn(f64) -> f64) -> String { let 
mut formatted = String::from("["); for (index, sample) in samples.iter().enumerate() { if index > 0 { formatted.push_str(", "); } - let _ = write!(&mut formatted, "{sample:.2}"); + let _ = write!(&mut formatted, "{:.2}", scale(*sample)); } formatted.push(']'); formatted } + +fn format_mib(value: f64) -> String { + format!("{value:.2}") +} + +fn format_label_list(labels: &[&str]) -> String { + labels + .iter() + .map(|label| format!("`{label}`")) + .collect::>() + .join(", ") +} + +fn format_string_label_list(labels: &[&str]) -> String { + labels + .iter() + .map(|label| format!("`{label}`")) + .collect::>() + .join(", ") +} + +fn push_unique_label<'a>(labels: &mut Vec<&'a str>, value: &'a str) { + if !labels.contains(&value) { + labels.push(value); + } +} + +fn format_delta_ms(value: f64) -> String { + format!("{value:+.2}") +} + +fn format_delta_pct(value: f64) -> String { + format!("{value:+.1}%") +} + +fn push_optional_sample(samples: &mut Option>, value: Option) { + if let Some(value) = value { + samples.get_or_insert_with(Vec::new).push(value); + } +} + +fn bytes_to_mib(value: f64) -> f64 { + value / (1024.0 * 1024.0) +} + +fn micros_to_ms(value: f64) -> f64 { + value / 1000.0 +} + +fn hotspot_wall_mean_ms(scenario: &BenchmarkScenarioReport) -> Option { + Some(scenario.wall_stats.mean_ms) +} + +fn hotspot_wall_stddev_ms(scenario: &BenchmarkScenarioReport) -> Option { + Some(scenario.wall_stats.stddev_ms) +} + +fn hotspot_wall_range_ms(scenario: &BenchmarkScenarioReport) -> Option { + Some(scenario.wall_range_ms()) +} + +fn hotspot_guest_import_mean_ms(scenario: &BenchmarkScenarioReport) -> Option { + scenario + .guest_import_stats + .as_ref() + .map(|stats| stats.mean_ms) +} + +fn hotspot_startup_overhead_mean_ms(scenario: &BenchmarkScenarioReport) -> Option { + scenario + .startup_overhead_stats + .as_ref() + .map(|stats| stats.mean_ms) +} + +fn hotspot_context_setup_mean_ms(scenario: &BenchmarkScenarioReport) -> Option { + 
Some(scenario.phase_stats.context_setup_ms.mean_ms) +} + +fn hotspot_startup_phase_mean_ms(scenario: &BenchmarkScenarioReport) -> Option { + Some(scenario.phase_stats.startup_ms.mean_ms) +} + +fn hotspot_guest_execution_mean_ms(scenario: &BenchmarkScenarioReport) -> Option { + scenario + .phase_stats + .guest_execution_ms + .as_ref() + .map(|stats| stats.mean_ms) +} + +fn hotspot_completion_mean_ms(scenario: &BenchmarkScenarioReport) -> Option { + Some(scenario.phase_stats.completion_ms.mean_ms) +} + +fn hotspot_startup_share_pct(scenario: &BenchmarkScenarioReport) -> Option { + scenario.mean_startup_share_pct() +} + +fn hotspot_rss_mean_mib(scenario: &BenchmarkScenarioReport) -> Option { + scenario + .resource_usage_stats + .as_ref()? + .rss_bytes + .as_ref() + .map(|stats| bytes_to_mib(stats.mean)) +} + +fn hotspot_heap_mean_mib(scenario: &BenchmarkScenarioReport) -> Option { + scenario + .resource_usage_stats + .as_ref()? + .heap_used_bytes + .as_ref() + .map(|stats| bytes_to_mib(stats.mean)) +} + +fn hotspot_total_cpu_mean_ms(scenario: &BenchmarkScenarioReport) -> Option { + scenario + .resource_usage_stats + .as_ref()? 
+ .cpu_total_us + .as_ref() + .map(|stats| micros_to_ms(stats.mean)) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::cell::RefCell; + use tempfile::tempdir; + + fn synthetic_transport_reports() -> Vec { + TRANSPORT_RTT_PAYLOAD_BYTES + .iter() + .enumerate() + .map(|(index, payload_bytes)| { + let sample = index as f64 + 1.0; + BenchmarkTransportRttReport { + channel: TRANSPORT_RTT_CHANNEL, + payload_bytes: *payload_bytes, + samples_ms: vec![sample], + stats: compute_stats(&[sample]), + } + }) + .collect() + } + + fn synthetic_scenario_report( + definition: ScenarioDefinition, + wall_sample_ms: f64, + ) -> BenchmarkScenarioReport { + let context_setup_ms = wall_sample_ms / 5.0; + let startup_ms = wall_sample_ms / 4.0; + let guest_execution_ms = definition + .expect_import_metric + .then_some(wall_sample_ms / 3.0); + let completion_ms = + wall_sample_ms - context_setup_ms - startup_ms - guest_execution_ms.unwrap_or(0.0); + let startup_overhead_ms = definition + .expect_import_metric + .then_some(context_setup_ms + startup_ms + completion_ms); + let resource_usage_samples = BenchmarkResourceUsage { + rss_bytes: Some(vec![64.0 * 1024.0 * 1024.0]), + heap_used_bytes: Some(vec![12.0 * 1024.0 * 1024.0]), + cpu_user_us: None, + cpu_system_us: None, + cpu_total_us: Some(vec![wall_sample_ms * 1000.0]), + }; + + BenchmarkScenarioReport { + id: definition.id, + workload: definition.workload, + runtime: definition.runtime.label(), + mode: definition.mode.label(), + description: definition.description, + fixture: definition.fixture, + compile_cache: definition.compile_cache.label(), + wall_samples_ms: vec![wall_sample_ms], + wall_stats: compute_stats(&[wall_sample_ms]), + guest_import_samples_ms: guest_execution_ms.map(|sample| vec![sample]), + guest_import_stats: guest_execution_ms.map(|sample| compute_stats(&[sample])), + startup_overhead_samples_ms: startup_overhead_ms.map(|sample| vec![sample]), + startup_overhead_stats: startup_overhead_ms.map(|sample| 
compute_stats(&[sample])), + phase_samples_ms: BenchmarkScenarioPhases { + context_setup_ms: vec![context_setup_ms], + startup_ms: vec![startup_ms], + guest_execution_ms: guest_execution_ms.map(|sample| vec![sample]), + completion_ms: vec![completion_ms], + }, + phase_stats: BenchmarkScenarioPhases { + context_setup_ms: compute_stats(&[context_setup_ms]), + startup_ms: compute_stats(&[startup_ms]), + guest_execution_ms: guest_execution_ms.map(|sample| compute_stats(&[sample])), + completion_ms: compute_stats(&[completion_ms]), + }, + resource_usage_stats: compute_resource_usage_stats(&resource_usage_samples), + resource_usage_samples: Some(resource_usage_samples), + } + } + + fn synthetic_host() -> BenchmarkHost { + BenchmarkHost { + node_binary: String::from("node"), + node_version: String::from("v22.0.0"), + os: "linux", + arch: "x86_64", + logical_cpus: 8, + } + } + + #[test] + fn javascript_benchmark_orchestration_resumes_completed_stages_from_run_state() { + let tempdir = tempdir().expect("create tempdir"); + let repo_root = tempdir.path().join("repo"); + let artifact_dir = tempdir.path().join("artifacts"); + fs::create_dir_all(&repo_root).expect("create repo root"); + + let config = JavascriptBenchmarkConfig { + iterations: 1, + warmup_iterations: 0, + }; + let host = synthetic_host(); + let definitions = benchmark_scenarios(); + let mut state = StoredBenchmarkRunState::new(&config, &host, &repo_root); + state.record_transport_rtt(&synthetic_transport_reports()); + state.record_scenario(&synthetic_scenario_report(definitions[0], 10.0)); + persist_benchmark_run_state(&benchmark_run_state_path(&artifact_dir), &state) + .expect("persist initial run state"); + + let transport_calls = RefCell::new(0usize); + let scenario_calls = RefCell::new(Vec::new()); + let (report, resumed_stage_count, _) = orchestrate_javascript_benchmark_report( + &config, + &repo_root, + &host, + &artifact_dir, + || { + *transport_calls.borrow_mut() += 1; + Ok(synthetic_transport_reports()) 
+ }, + |definition| { + scenario_calls.borrow_mut().push(definition.id.to_owned()); + Ok(synthetic_scenario_report(definition, 20.0)) + }, + ) + .expect("resume benchmark orchestration"); + + assert_eq!(resumed_stage_count, 2); + assert_eq!(*transport_calls.borrow(), 0); + assert_eq!( + scenario_calls.borrow().as_slice(), + &definitions[1..] + .iter() + .map(|definition| definition.id.to_owned()) + .collect::>() + ); + assert_eq!( + report.transport_rtt.len(), + TRANSPORT_RTT_PAYLOAD_BYTES.len() + ); + assert_eq!(report.scenarios.len(), definitions.len()); + assert_eq!(report.scenarios[0].id, definitions[0].id); + assert_eq!(report.scenarios[1].id, definitions[1].id); + } + + #[test] + fn javascript_benchmark_orchestration_persists_completed_stages_before_failure() { + let tempdir = tempdir().expect("create tempdir"); + let repo_root = tempdir.path().join("repo"); + let artifact_dir = tempdir.path().join("artifacts"); + fs::create_dir_all(&repo_root).expect("create repo root"); + + let config = JavascriptBenchmarkConfig { + iterations: 1, + warmup_iterations: 0, + }; + let host = synthetic_host(); + let state_path = benchmark_run_state_path(&artifact_dir); + let failure = orchestrate_javascript_benchmark_report( + &config, + &repo_root, + &host, + &artifact_dir, + || Ok(synthetic_transport_reports()), + |definition| { + if definition.id == "cold-local-import" { + Err(JavascriptBenchmarkError::InvalidConfig("synthetic failure")) + } else { + Ok(synthetic_scenario_report(definition, 15.0)) + } + }, + ) + .expect_err("expected synthetic orchestration failure"); + + assert!(matches!( + failure, + JavascriptBenchmarkError::InvalidConfig("synthetic failure") + )); + + let stored_state = serde_json::from_str::( + &fs::read_to_string(&state_path).expect("read persisted run state"), + ) + .expect("parse persisted run state"); + assert!(stored_state.transport_rtt.is_some()); + assert_eq!( + stored_state + .scenarios + .iter() + .map(|scenario| scenario.id.as_str()) + 
.collect::>(), + vec!["isolate-startup", "prewarmed-isolate-startup"] + ); + } +} diff --git a/crates/execution/src/bin/node-import-bench.rs b/crates/execution/src/bin/node-import-bench.rs index f13d785aa..727658f06 100644 --- a/crates/execution/src/bin/node-import-bench.rs +++ b/crates/execution/src/bin/node-import-bench.rs @@ -1,10 +1,51 @@ -use agent_os_execution::benchmark::{run_javascript_benchmarks, JavascriptBenchmarkConfig}; +use agent_os_execution::benchmark::{ + run_javascript_benchmarks_with_recovery, JavascriptBenchmarkConfig, +}; +use std::path::PathBuf; + +struct CliConfig { + benchmark: JavascriptBenchmarkConfig, + baseline_path: Option, +} fn main() { match parse_config(std::env::args().skip(1)) { - Ok(config) => match run_javascript_benchmarks(&config) { - Ok(report) => { - print!("{}", report.render_markdown()); + Ok(cli_config) => match run_javascript_benchmarks_with_recovery( + &cli_config.benchmark, + cli_config.baseline_path.as_deref(), + ) { + Ok(output) => { + if output.resumed_stage_count > 0 { + eprintln!( + "Resumed {} completed benchmark stages from {}", + output.resumed_stage_count, + output + .artifact_paths + .json_path + .parent() + .expect("benchmark artifact parent directory") + .join("run-state.json") + .display() + ); + } + if let Some(path) = &cli_config.baseline_path { + eprintln!("Compared against baseline {}", path.display()); + } + eprintln!( + "Wrote Markdown report to {}", + output.artifact_paths.markdown_path.display() + ); + eprintln!( + "Wrote JSON report to {}", + output.artifact_paths.json_path.display() + ); + match std::fs::read_to_string(&output.artifact_paths.markdown_path) { + Ok(markdown) => print!("{markdown}"), + Err(err) => { + eprintln!("failed to read generated markdown report: {err}"); + std::process::exit(1); + } + } } Err(err) => { eprintln!("{err}"); @@ -14,16 +55,15 @@ fn main() { Err(err) => { eprintln!("{err}"); eprintln!(); - eprintln!("Usage: cargo run -p agent-os-execution --bin node-import-bench 
-- [--iterations N] [--warmup-iterations N]"); + eprintln!("Usage: cargo run -p agent-os-execution --bin node-import-bench -- [--iterations N] [--warmup-iterations N] [--baseline PATH]"); std::process::exit(2); } } } -fn parse_config( - args: impl IntoIterator, -) -> Result { - let mut config = JavascriptBenchmarkConfig::default(); +fn parse_config(args: impl IntoIterator) -> Result { + let mut benchmark = JavascriptBenchmarkConfig::default(); + let mut baseline_path = None; let mut args = args.into_iter(); while let Some(arg) = args.next() { @@ -32,13 +72,19 @@ fn parse_config( let value = args .next() .ok_or_else(|| String::from("missing value for --iterations"))?; - config.iterations = parse_usize_flag("--iterations", &value)?; + benchmark.iterations = parse_usize_flag("--iterations", &value)?; } "--warmup-iterations" => { let value = args .next() .ok_or_else(|| String::from("missing value for --warmup-iterations"))?; - config.warmup_iterations = parse_usize_flag("--warmup-iterations", &value)?; + benchmark.warmup_iterations = parse_usize_flag("--warmup-iterations", &value)?; + } + "--baseline" => { + let value = args + .next() + .ok_or_else(|| String::from("missing value for --baseline"))?; + baseline_path = Some(PathBuf::from(value)); } "--help" | "-h" => { return Err(String::from("help requested")); @@ -49,7 +95,10 @@ fn parse_config( } } - Ok(config) + Ok(CliConfig { + benchmark, + baseline_path, + }) } fn parse_usize_flag(flag: &str, value: &str) -> Result { diff --git a/crates/execution/src/node_import_cache.rs b/crates/execution/src/node_import_cache.rs index 2807b040a..c0c2b76a5 100644 --- a/crates/execution/src/node_import_cache.rs +++ b/crates/execution/src/node_import_cache.rs @@ -11,7 +11,7 @@ pub(crate) const NODE_IMPORT_CACHE_ASSET_ROOT_ENV: &str = "AGENT_OS_NODE_IMPORT_ const NODE_IMPORT_CACHE_PATH_ENV: &str = "AGENT_OS_NODE_IMPORT_CACHE_PATH"; const NODE_IMPORT_CACHE_LOADER_PATH_ENV: &str = "AGENT_OS_NODE_IMPORT_CACHE_LOADER_PATH"; const 
NODE_IMPORT_CACHE_SCHEMA_VERSION: &str = "1"; -const NODE_IMPORT_CACHE_LOADER_VERSION: &str = "4"; +const NODE_IMPORT_CACHE_LOADER_VERSION: &str = "5"; const NODE_IMPORT_CACHE_ASSET_VERSION: &str = "1"; const AGENT_OS_BUILTIN_SPECIFIER_PREFIX: &str = "agent-os:builtin/"; const AGENT_OS_POLYFILL_SPECIFIER_PREFIX: &str = "agent-os:polyfill/"; @@ -24,6 +24,9 @@ import { fileURLToPath, pathToFileURL } from 'node:url'; const GUEST_PATH_MAPPINGS = parseGuestPathMappings(process.env.AGENT_OS_GUEST_PATH_MAPPINGS); const ALLOWED_BUILTINS = new Set(parseJsonArray(process.env.AGENT_OS_ALLOWED_NODE_BUILTINS)); const CACHE_PATH = process.env.__NODE_IMPORT_CACHE_PATH_ENV__; +const PROJECTED_SOURCE_CACHE_ROOT = CACHE_PATH + ? path.join(path.dirname(CACHE_PATH), 'projected-sources') + : null; const ASSET_ROOT = process.env.__NODE_IMPORT_CACHE_ASSET_ROOT_ENV__; const DEBUG_ENABLED = process.env.__NODE_IMPORT_CACHE_DEBUG_ENV__ === '1'; const METRICS_PREFIX = '__NODE_IMPORT_CACHE_METRICS_PREFIX__'; @@ -60,6 +63,8 @@ const metrics = { packageTypeMisses: 0, moduleFormatHits: 0, moduleFormatMisses: 0, + sourceHits: 0, + sourceMisses: 0, }; export async function resolve(specifier, context, nextResolve) { @@ -184,6 +189,17 @@ export async function load(url, context, nextLoad) { return nextLoad(url, context); } + const projectedPackageSource = loadProjectedPackageSource(url, filePath, format); + if (projectedPackageSource != null) { + flushCacheState(); + emitMetrics(); + return { + shortCircuit: true, + format, + source: projectedPackageSource, + }; + } + const source = format === 'wasm' ? fs.readFileSync(filePath) @@ -266,6 +282,7 @@ function emptyCacheState() { resolutions: {}, packageTypes: {}, moduleFormats: {}, + projectedSources: {}, }; } @@ -286,6 +303,7 @@ function normalizeCacheState(value) { resolutions: isRecord(value.resolutions) ? value.resolutions : {}, packageTypes: isRecord(value.packageTypes) ? value.packageTypes : {}, moduleFormats: isRecord(value.moduleFormats) ? 
value.moduleFormats : {}, + projectedSources: isRecord(value.projectedSources) ? value.projectedSources : {}, }; } @@ -304,9 +322,64 @@ function mergeCacheStates(base, current) { ...base.moduleFormats, ...current.moduleFormats, }, + projectedSources: { + ...base.projectedSources, + ...current.projectedSources, + }, }; } +function loadProjectedPackageSource(url, filePath, format) { + if ( + format === 'wasm' || + !isProjectedPackageSource(filePath) || + !PROJECTED_SOURCE_CACHE_ROOT + ) { + return null; + } + + const cached = cacheState.projectedSources[url]; + if (cached && validateProjectedSourceEntry(cached, filePath, format)) { + metrics.sourceHits += 1; + return fs.readFileSync(cached.cachedPath, 'utf8'); + } + + metrics.sourceMisses += 1; + + const stat = statForPath(filePath); + if (!stat) { + return null; + } + + const source = rewriteBuiltinImports(fs.readFileSync(filePath, 'utf8'), filePath); + const cacheKey = hashString( + JSON.stringify({ + url, + format, + size: stat.size, + mtimeMs: stat.mtimeMs, + }), + ); + const extension = path.extname(filePath) || '.js'; + const cachedPath = path.join( + PROJECTED_SOURCE_CACHE_ROOT, + `${cacheKey}${extension}.cached`, + ); + fs.mkdirSync(path.dirname(cachedPath), { recursive: true }); + fs.writeFileSync(cachedPath, source); + + cacheState.projectedSources[url] = { + kind: 'text', + filePath, + format, + cachedPath, + size: stat.size, + mtimeMs: stat.mtimeMs, + }; + dirty = true; + return source; +} + function resolveAgentOsAsset(specifier) { if (typeof specifier !== 'string' || !ASSET_ROOT) { return null; @@ -530,6 +603,15 @@ function buildResolutionEntry(specifier, context, resolved) { return null; } +function isProjectedPackageSource(filePath) { + if (typeof filePath !== 'string' || isAssetPath(filePath)) { + return false; + } + + const guestPath = guestPathFromHostPath(filePath); + return typeof guestPath === 'string' && guestPath.includes('/node_modules/'); +} + function validateResolutionEntry(entry) { if 
(!isRecord(entry) || typeof entry.kind !== 'string') { return false; @@ -687,6 +769,29 @@ function validateModuleFormatEntry(entry) { return true; } +function validateProjectedSourceEntry(entry, filePath, format) { + if ( + !isRecord(entry) || + entry.kind !== 'text' || + typeof entry.filePath !== 'string' || + typeof entry.cachedPath !== 'string' || + typeof entry.format !== 'string' + ) { + return false; + } + + if (entry.filePath !== filePath || entry.format !== format) { + return false; + } + + const stat = statForPath(filePath); + if (!stat || stat.size !== entry.size || stat.mtimeMs !== entry.mtimeMs) { + return false; + } + + return statForPath(entry.cachedPath)?.isFile() ?? false; +} + function lookupPackageType(filePath) { let directory = path.dirname(filePath); diff --git a/crates/execution/tests/benchmark.rs b/crates/execution/tests/benchmark.rs index d22c616ae..e7525afd5 100644 --- a/crates/execution/tests/benchmark.rs +++ b/crates/execution/tests/benchmark.rs @@ -1,4 +1,117 @@ -use agent_os_execution::benchmark::{run_javascript_benchmarks, JavascriptBenchmarkConfig}; +use agent_os_execution::benchmark::{ + run_javascript_benchmarks, BenchmarkDistributionStats, BenchmarkHost, BenchmarkResourceUsage, + BenchmarkScenarioPhases, BenchmarkScenarioReport, BenchmarkStats, BenchmarkTransportRttReport, + JavascriptBenchmarkConfig, JavascriptBenchmarkReport, +}; +use serde_json::Value; +use std::fs; +use std::path::PathBuf; +use tempfile::tempdir; + +fn stats( + mean_ms: f64, + p50_ms: f64, + p95_ms: f64, + min_ms: f64, + max_ms: f64, + stddev_ms: f64, +) -> BenchmarkStats { + BenchmarkStats { + mean_ms, + p50_ms, + p95_ms, + min_ms, + max_ms, + stddev_ms, + } +} + +fn phase_samples( + context_setup_ms: Vec, + startup_ms: Vec, + guest_execution_ms: Option>, + completion_ms: Vec, +) -> BenchmarkScenarioPhases> { + BenchmarkScenarioPhases { + context_setup_ms, + startup_ms, + guest_execution_ms, + completion_ms, + } +} + +fn phase_stats( + context_setup_ms: 
BenchmarkStats, + startup_ms: BenchmarkStats, + guest_execution_ms: Option, + completion_ms: BenchmarkStats, +) -> BenchmarkScenarioPhases { + BenchmarkScenarioPhases { + context_setup_ms, + startup_ms, + guest_execution_ms, + completion_ms, + } +} + +fn transport_rtt( + payload_bytes: usize, + samples_ms: Vec, + stats: BenchmarkStats, +) -> BenchmarkTransportRttReport { + BenchmarkTransportRttReport { + channel: "execution-stdio-echo", + payload_bytes, + samples_ms, + stats, + } +} + +fn distribution_stats( + mean: f64, + p50: f64, + p95: f64, + min: f64, + max: f64, + stddev: f64, +) -> BenchmarkDistributionStats { + BenchmarkDistributionStats { + mean, + p50, + p95, + min, + max, + stddev, + } +} + +fn resource_samples( + rss_bytes: Option>, + heap_used_bytes: Option>, + cpu_total_us: Option>, +) -> BenchmarkResourceUsage> { + BenchmarkResourceUsage { + rss_bytes, + heap_used_bytes, + cpu_user_us: None, + cpu_system_us: None, + cpu_total_us, + } +} + +fn resource_stats( + rss_bytes: Option, + heap_used_bytes: Option, + cpu_total_us: Option, +) -> BenchmarkResourceUsage { + BenchmarkResourceUsage { + rss_bytes, + heap_used_bytes, + cpu_user_us: None, + cpu_system_us: None, + cpu_total_us, + } +} #[test] fn javascript_benchmark_harness_covers_required_startup_and_import_scenarios() { @@ -17,10 +130,26 @@ fn javascript_benchmark_harness_covers_required_startup_and_import_scenarios() { scenario_ids, vec![ "isolate-startup", + "prewarmed-isolate-startup", "cold-local-import", "warm-local-import", + "same-context-local-import", + "prewarmed-local-import", + "host-local-import", "builtin-import", + "hot-builtin-stream-import", + "hot-builtin-stream-web-import", + "hot-builtin-crypto-import", + "hot-builtin-zlib-import", + "hot-builtin-assert-import", + "hot-builtin-url-import", + "hot-projected-package-file-import", "large-package-import", + "projected-package-import", + "pdf-lib-startup", + "jszip-startup", + "jszip-end-to-end", + "jszip-repeated-session-compressed", 
] ); @@ -49,11 +178,954 @@ fn javascript_benchmark_harness_covers_required_startup_and_import_scenarios() { .len(), 1 ); + assert_eq!(warm.workload, "local-import"); + assert_eq!(warm.runtime, "native-execution"); + assert_eq!(warm.mode, "new-session-replay"); + + let same_context = report + .scenarios + .iter() + .find(|scenario| scenario.id == "same-context-local-import") + .expect("same-context-local-import scenario"); + assert_eq!(same_context.compile_cache, "primed"); + assert_eq!(same_context.workload, "local-import"); + assert_eq!(same_context.runtime, "native-execution"); + assert_eq!(same_context.mode, "same-session-replay"); + assert_eq!(same_context.wall_samples_ms.len(), 1); + + let prewarmed = report + .scenarios + .iter() + .find(|scenario| scenario.id == "prewarmed-local-import") + .expect("prewarmed-local-import scenario"); + assert_eq!(prewarmed.compile_cache, "primed"); + assert_eq!( + prewarmed + .guest_import_samples_ms + .as_ref() + .expect("prewarmed import samples") + .len(), + 1 + ); + assert_eq!( + prewarmed + .startup_overhead_samples_ms + .as_ref() + .expect("prewarmed startup samples") + .len(), + 1 + ); + assert_eq!(prewarmed.mode, "same-engine-replay"); + + let host = report + .scenarios + .iter() + .find(|scenario| scenario.id == "host-local-import") + .expect("host-local-import scenario"); + assert_eq!(host.workload, "local-import"); + assert_eq!(host.runtime, "host-node"); + assert_eq!(host.mode, "host-control"); + assert_eq!( + host.guest_import_samples_ms + .as_ref() + .expect("host import samples") + .len(), + 1 + ); + + let prewarmed_isolate = report + .scenarios + .iter() + .find(|scenario| scenario.id == "prewarmed-isolate-startup") + .expect("prewarmed-isolate-startup scenario"); + assert_eq!(prewarmed_isolate.workload, "startup-floor"); + assert_eq!(prewarmed_isolate.mode, "same-engine-replay"); + assert_eq!(prewarmed_isolate.compile_cache, "primed"); + assert!(prewarmed_isolate.guest_import_samples_ms.is_none()); + + let 
hot_builtin = report + .scenarios + .iter() + .find(|scenario| scenario.id == "hot-builtin-crypto-import") + .expect("hot-builtin-crypto-import scenario"); + assert_eq!(hot_builtin.workload, "builtin-hot-import"); + assert_eq!(hot_builtin.mode, "same-engine-replay"); + assert_eq!(hot_builtin.compile_cache, "primed"); + assert_eq!( + hot_builtin + .guest_import_samples_ms + .as_ref() + .expect("hot builtin import samples") + .len(), + 1 + ); + + let hot_projected = report + .scenarios + .iter() + .find(|scenario| scenario.id == "hot-projected-package-file-import") + .expect("hot-projected-package-file-import scenario"); + assert_eq!(hot_projected.workload, "projected-package-hot-import"); + assert_eq!(hot_projected.mode, "same-engine-replay"); + assert_eq!(hot_projected.compile_cache, "primed"); + assert_eq!( + hot_projected + .guest_import_samples_ms + .as_ref() + .expect("hot projected import samples") + .len(), + 1 + ); let rendered = report.render_markdown(); assert!(rendered.contains("ARC-021C")); assert!(rendered.contains("ARC-021D")); assert!(rendered.contains("ARC-022")); + assert!(rendered.contains("current import-cache materialization and builtin/polyfill prewarm")); assert!(rendered.contains("typescript")); + assert!(rendered.contains("projected TypeScript guest-path import")); + assert!(rendered.contains("projected-package-import")); + assert!(rendered.contains("pdf-lib document creation")); + assert!(rendered.contains("jszip archive staging")); + assert!(rendered.contains("jszip end-to-end archive roundtrip")); + assert!(rendered.contains("jszip compressed archive roundtrip")); + assert!(rendered.contains("prewarmed-isolate-startup")); + assert!(rendered.contains("prewarmed-local-import")); + assert!(rendered.contains("same-context-local-import")); + assert!(rendered.contains("host-local-import")); assert!(rendered.contains("node:path + node:url + node:fs/promises")); + assert!(rendered.contains("node:stream/web")); + 
assert!(rendered.contains("node:crypto")); + assert!(rendered.contains("projected TypeScript compiler file")); + assert!(rendered.contains("hot-projected-package-file-import")); + assert!(rendered.contains("## Transport RTT")); + assert!(rendered.contains("## Control Matrix")); + assert!(rendered.contains("## Ranked Hotspots")); + assert!(rendered.contains("### Wall Time (`time`, `ms`)")); + assert!(rendered.contains("### Startup Share Of Wall (`share`, `pct`)")); + assert!(rendered.contains("Mean context (ms)")); + assert!(rendered.contains("same-session-replay")); + assert!(rendered.contains("host-control")); + + let json = report.render_json().expect("render benchmark json"); + let parsed: Value = serde_json::from_str(&json).expect("parse benchmark json"); + assert_eq!(parsed["artifact_version"], 5); + assert_eq!(parsed["summary"]["scenario_count"], 21); + assert_eq!(parsed["summary"]["recorded_samples_per_scenario"], 1); + assert_eq!( + parsed["transport_rtt"] + .as_array() + .expect("transport rtt array") + .len(), + 3 + ); + let scenarios = parsed["scenarios"] + .as_array() + .expect("json scenarios array"); + assert_eq!(scenarios.len(), 21); + assert!( + parsed["summary"]["slowest_wall_scenario"]["id"].is_string(), + "expected a summarized slowest wall scenario: {json}" + ); + let startup_floor_matrix = parsed["summary"]["control_matrix"] + .as_array() + .expect("control matrix array") + .iter() + .find(|row| row["workload"] == "startup-floor") + .expect("startup-floor control matrix row"); + assert_eq!( + startup_floor_matrix["modes"].as_array().map(Vec::len), + Some(2) + ); + let local_import_matrix = parsed["summary"]["control_matrix"] + .as_array() + .expect("control matrix array") + .iter() + .find(|row| row["workload"] == "local-import") + .expect("local-import control matrix row"); + assert_eq!( + local_import_matrix["modes"].as_array().map(Vec::len), + Some(5) + ); + assert_eq!( + local_import_matrix["runtimes"].as_array().map(Vec::len), + Some(2) + 
); + let builtin_hot_matrix = parsed["summary"]["control_matrix"] + .as_array() + .expect("control matrix array") + .iter() + .find(|row| row["workload"] == "builtin-hot-import") + .expect("builtin-hot-import control matrix row"); + assert_eq!( + builtin_hot_matrix["scenario_ids"].as_array().map(Vec::len), + Some(6) + ); + let hotspot_rankings = parsed["summary"]["hotspot_rankings"] + .as_array() + .expect("hotspot rankings array"); + assert_eq!(hotspot_rankings.len(), 13); + assert_eq!(hotspot_rankings[0]["metric"], "wall_mean_ms"); + assert_eq!(hotspot_rankings[1]["metric"], "wall_stddev_ms"); + assert_eq!(hotspot_rankings[1]["dimension"], "stability"); + assert_eq!(hotspot_rankings[0]["unit"], "ms"); + assert!(scenarios + .iter() + .all(|scenario| scenario["wall_stats"]["stddev_ms"].is_number())); + assert!(scenarios.iter().any(|scenario| { + scenario["id"] == "prewarmed-isolate-startup" + && scenario["workload"] == "startup-floor" + && scenario["mode"] == "same-engine-replay" + && scenario["compile_cache"] == "primed" + && scenario["guest_import_stats"].is_null() + })); + assert!(scenarios.iter().any(|scenario| { + scenario["id"] == "same-context-local-import" + && scenario["workload"] == "local-import" + && scenario["runtime"] == "native-execution" + && scenario["mode"] == "same-session-replay" + && scenario["compile_cache"] == "primed" + })); + assert!(scenarios.iter().any(|scenario| { + scenario["id"] == "host-local-import" + && scenario["workload"] == "local-import" + && scenario["runtime"] == "host-node" + && scenario["mode"] == "host-control" + && scenario["guest_import_stats"]["mean_ms"].is_number() + })); + assert!(scenarios.iter().any(|scenario| { + scenario["id"] == "hot-builtin-stream-web-import" + && scenario["fixture"] == "node:stream/web" + && scenario["compile_cache"] == "primed" + && scenario["guest_import_stats"]["mean_ms"].is_number() + })); + assert!(scenarios.iter().any(|scenario| { + scenario["id"] == "hot-builtin-crypto-import" + && 
scenario["fixture"] == "node:crypto" + && scenario["compile_cache"] == "primed" + && scenario["guest_import_stats"]["mean_ms"].is_number() + })); + assert!(scenarios.iter().any(|scenario| { + scenario["id"] == "hot-projected-package-file-import" + && scenario["fixture"] == "projected TypeScript compiler file" + && scenario["compile_cache"] == "primed" + && scenario["guest_import_stats"]["mean_ms"].is_number() + })); + assert!(scenarios.iter().any(|scenario| { + scenario["id"] == "pdf-lib-startup" + && scenario["fixture"] == "pdf-lib document creation" + && scenario["guest_import_stats"]["mean_ms"].is_number() + })); + assert!(scenarios.iter().any(|scenario| { + scenario["id"] == "large-package-import" + && scenario["fixture"] == "typescript" + && scenario["compile_cache"] == "disabled" + && scenario["guest_import_stats"]["mean_ms"].is_number() + })); + assert!(scenarios.iter().any(|scenario| { + scenario["id"] == "jszip-startup" + && scenario["fixture"] == "jszip archive staging" + && scenario["guest_import_stats"]["mean_ms"].is_number() + })); + assert!(scenarios.iter().any(|scenario| { + scenario["id"] == "jszip-end-to-end" + && scenario["fixture"] == "jszip end-to-end archive roundtrip" + && scenario["compile_cache"] == "disabled" + && scenario["guest_import_stats"]["mean_ms"].is_number() + })); + assert!(scenarios.iter().any(|scenario| { + scenario["id"] == "jszip-repeated-session-compressed" + && scenario["fixture"] == "jszip compressed archive roundtrip" + && scenario["compile_cache"] == "primed" + && scenario["guest_import_stats"]["mean_ms"].is_number() + })); + assert!(scenarios.iter().any(|scenario| { + scenario["id"] == "prewarmed-local-import" + && scenario["fixture"] == "24-module local ESM graph" + && scenario["compile_cache"] == "primed" + && scenario["guest_import_stats"]["mean_ms"].is_number() + })); + assert!(scenarios.iter().any(|scenario| { + scenario["id"] == "projected-package-import" + && scenario["fixture"] == "projected TypeScript guest-path 
import" + && scenario["compile_cache"] == "primed" + && scenario["guest_import_stats"]["mean_ms"].is_number() + })); + assert!(scenarios.iter().any(|scenario| { + scenario["guest_import_samples_ms"].is_array() + && scenario["startup_overhead_samples_ms"].is_array() + && scenario["mean_startup_share_pct"].is_number() + && scenario["phase_stats"]["startup_ms"]["mean_ms"].is_number() + && scenario["phase_samples_ms"]["completion_ms"].is_array() + && scenario["resource_usage_stats"]["rss_bytes"]["mean"].is_number() + && scenario["resource_usage_stats"]["cpu_total_us"]["mean"].is_number() + && scenario["resource_usage_samples"]["heap_used_bytes"].is_array() + })); +} + +#[test] +fn javascript_benchmark_json_artifact_stays_stable_for_summary_and_samples() { + let report = JavascriptBenchmarkReport { + generated_at_unix_ms: 42, + config: JavascriptBenchmarkConfig { + iterations: 2, + warmup_iterations: 1, + }, + host: BenchmarkHost { + node_binary: String::from("node"), + node_version: String::from("v22.0.0"), + os: "linux", + arch: "x86_64", + logical_cpus: 8, + }, + repo_root: PathBuf::from("/repo"), + transport_rtt: vec![ + transport_rtt(32, vec![0.4, 0.6], stats(0.5, 0.4, 0.6, 0.4, 0.6, 0.1)), + transport_rtt(4096, vec![0.9, 1.1], stats(1.0, 0.9, 1.1, 0.9, 1.1, 0.1)), + transport_rtt(65536, vec![2.6, 3.0], stats(2.8, 2.6, 3.0, 2.6, 3.0, 0.2)), + ], + scenarios: vec![ + BenchmarkScenarioReport { + id: "fast-scenario", + workload: "fixture-a", + runtime: "native-execution", + mode: "true-cold-start", + description: "Faster benchmark path", + fixture: "fixture-a", + compile_cache: "disabled", + wall_samples_ms: vec![10.0, 14.0], + wall_stats: stats(12.0, 10.0, 14.0, 10.0, 14.0, 2.0), + guest_import_samples_ms: Some(vec![4.0, 6.0]), + guest_import_stats: Some(stats(5.0, 4.0, 6.0, 4.0, 6.0, 1.0)), + startup_overhead_samples_ms: Some(vec![6.0, 8.0]), + startup_overhead_stats: Some(stats(7.0, 6.0, 8.0, 6.0, 8.0, 1.0)), + phase_samples_ms: phase_samples( + vec![1.0, 2.0], + 
vec![2.0, 3.0], + Some(vec![4.0, 6.0]), + vec![3.0, 3.0], + ), + phase_stats: phase_stats( + stats(1.5, 1.0, 2.0, 1.0, 2.0, 0.5), + stats(2.5, 2.0, 3.0, 2.0, 3.0, 0.5), + Some(stats(5.0, 4.0, 6.0, 4.0, 6.0, 1.0)), + stats(3.0, 3.0, 3.0, 3.0, 3.0, 0.0), + ), + resource_usage_samples: Some(resource_samples( + Some(vec![32.0 * 1024.0 * 1024.0, 36.0 * 1024.0 * 1024.0]), + Some(vec![8.0 * 1024.0 * 1024.0, 10.0 * 1024.0 * 1024.0]), + Some(vec![4000.0, 6000.0]), + )), + resource_usage_stats: Some(resource_stats( + Some(distribution_stats( + 34.0 * 1024.0 * 1024.0, + 32.0 * 1024.0 * 1024.0, + 36.0 * 1024.0 * 1024.0, + 32.0 * 1024.0 * 1024.0, + 36.0 * 1024.0 * 1024.0, + 2.0 * 1024.0 * 1024.0, + )), + Some(distribution_stats( + 9.0 * 1024.0 * 1024.0, + 8.0 * 1024.0 * 1024.0, + 10.0 * 1024.0 * 1024.0, + 8.0 * 1024.0 * 1024.0, + 10.0 * 1024.0 * 1024.0, + 1.0 * 1024.0 * 1024.0, + )), + Some(distribution_stats( + 5000.0, 4000.0, 6000.0, 4000.0, 6000.0, 1000.0, + )), + )), + }, + BenchmarkScenarioReport { + id: "slow-scenario", + workload: "fixture-b", + runtime: "host-node", + mode: "host-control", + description: "Slower benchmark path", + fixture: "fixture-b", + compile_cache: "primed", + wall_samples_ms: vec![30.0, 34.0], + wall_stats: stats(32.0, 30.0, 34.0, 30.0, 34.0, 2.0), + guest_import_samples_ms: Some(vec![12.0, 14.0]), + guest_import_stats: Some(stats(13.0, 12.0, 14.0, 12.0, 14.0, 1.0)), + startup_overhead_samples_ms: Some(vec![18.0, 20.0]), + startup_overhead_stats: Some(stats(19.0, 18.0, 20.0, 18.0, 20.0, 1.0)), + phase_samples_ms: phase_samples( + vec![4.0, 4.0], + vec![5.0, 6.0], + Some(vec![12.0, 14.0]), + vec![9.0, 10.0], + ), + phase_stats: phase_stats( + stats(4.0, 4.0, 4.0, 4.0, 4.0, 0.0), + stats(5.5, 5.0, 6.0, 5.0, 6.0, 0.5), + Some(stats(13.0, 12.0, 14.0, 12.0, 14.0, 1.0)), + stats(9.5, 9.0, 10.0, 9.0, 10.0, 0.5), + ), + resource_usage_samples: Some(resource_samples( + Some(vec![64.0 * 1024.0 * 1024.0, 72.0 * 1024.0 * 1024.0]), + Some(vec![14.0 * 1024.0 * 
1024.0, 18.0 * 1024.0 * 1024.0]), + Some(vec![9000.0, 11000.0]), + )), + resource_usage_stats: Some(resource_stats( + Some(distribution_stats( + 68.0 * 1024.0 * 1024.0, + 64.0 * 1024.0 * 1024.0, + 72.0 * 1024.0 * 1024.0, + 64.0 * 1024.0 * 1024.0, + 72.0 * 1024.0 * 1024.0, + 4.0 * 1024.0 * 1024.0, + )), + Some(distribution_stats( + 16.0 * 1024.0 * 1024.0, + 14.0 * 1024.0 * 1024.0, + 18.0 * 1024.0 * 1024.0, + 14.0 * 1024.0 * 1024.0, + 18.0 * 1024.0 * 1024.0, + 2.0 * 1024.0 * 1024.0, + )), + Some(distribution_stats( + 10000.0, 9000.0, 11000.0, 9000.0, 11000.0, 1000.0, + )), + )), + }, + ], + }; + + let json = report.render_json().expect("render json"); + let parsed: Value = serde_json::from_str(&json).expect("parse json"); + + assert_eq!(parsed["artifact_version"], 5); + assert_eq!(parsed["generated_at_unix_ms"], 42); + assert_eq!( + parsed["command"].as_str(), + Some( + "cargo run -p agent-os-execution --bin node-import-bench -- --iterations 2 --warmup-iterations 1" + ) + ); + assert_eq!(parsed["summary"]["scenario_count"], 2); + assert_eq!(parsed["summary"]["recorded_samples_per_scenario"], 2); + assert_eq!( + parsed["summary"]["control_matrix"][0]["workload"].as_str(), + Some("fixture-a") + ); + assert_eq!( + parsed["summary"]["control_matrix"][1]["runtimes"][0].as_str(), + Some("host-node") + ); + assert_eq!( + parsed["transport_rtt"][2]["payload_bytes"].as_u64(), + Some(65536) + ); + assert_eq!(parsed["transport_rtt"][2]["stats"]["mean_ms"], 2.8); + assert_eq!( + parsed["summary"]["slowest_wall_scenario"]["id"].as_str(), + Some("slow-scenario") + ); + assert_eq!( + parsed["summary"]["slowest_guest_import_scenario"]["id"].as_str(), + Some("slow-scenario") + ); + assert_eq!( + parsed["summary"]["highest_startup_share_scenario"]["id"].as_str(), + Some("slow-scenario") + ); + let hotspot_rankings = parsed["summary"]["hotspot_rankings"] + .as_array() + .expect("hotspot rankings array"); + assert_eq!(hotspot_rankings.len(), 13); + 
assert_eq!(hotspot_rankings[0]["metric"], "wall_mean_ms"); + assert_eq!(hotspot_rankings[0]["label"], "Wall Time"); + assert_eq!( + hotspot_rankings[0]["ranked_scenarios"][0]["id"].as_str(), + Some("slow-scenario") + ); + assert_eq!(hotspot_rankings[0]["ranked_scenarios"][0]["rank"], 1); + assert_eq!(hotspot_rankings[3]["metric"], "guest_import_mean_ms"); + assert_eq!( + hotspot_rankings[3]["ranked_scenarios"][0]["value"].as_f64(), + Some(13.0) + ); + assert_eq!(hotspot_rankings[9]["metric"], "startup_share_pct"); + assert_eq!(hotspot_rankings[9]["unit"], "pct"); + assert_eq!(hotspot_rankings[10]["metric"], "rss_mean_mib"); + assert_eq!(hotspot_rankings[12]["metric"], "cpu_total_mean_ms"); + + let scenarios = parsed["scenarios"].as_array().expect("scenario array"); + assert_eq!(scenarios.len(), 2); + assert_eq!(scenarios[0]["workload"], "fixture-a"); + assert_eq!(scenarios[0]["runtime"], "native-execution"); + assert_eq!(scenarios[0]["mode"], "true-cold-start"); + assert_eq!(scenarios[0]["wall_stats"]["stddev_ms"], 2.0); + assert_eq!(scenarios[0]["mean_startup_share_pct"], 58.333333333333336); + assert_eq!( + scenarios[0]["resource_usage_stats"]["rss_bytes"]["mean"], + 35651584.0 + ); + assert_eq!( + scenarios[0]["resource_usage_stats"]["cpu_total_us"]["mean"], + 5000.0 + ); + assert_eq!( + scenarios[0]["phase_stats"]["context_setup_ms"]["mean_ms"], + 1.5 + ); + assert_eq!(scenarios[0]["phase_stats"]["completion_ms"]["mean_ms"], 3.0); + assert_eq!(scenarios[1]["mean_startup_share_pct"], 59.375); + assert_eq!(scenarios[1]["phase_stats"]["startup_ms"]["mean_ms"], 5.5); + assert_eq!( + scenarios[1]["resource_usage_stats"]["heap_used_bytes"]["mean"], + 16777216.0 + ); +} + +#[test] +fn javascript_benchmark_hotspot_rankings_handle_missing_metrics() { + let report = JavascriptBenchmarkReport { + generated_at_unix_ms: 42, + config: JavascriptBenchmarkConfig { + iterations: 2, + warmup_iterations: 1, + }, + host: BenchmarkHost { + node_binary: String::from("node"), + 
node_version: String::from("v22.0.0"), + os: "linux", + arch: "x86_64", + logical_cpus: 8, + }, + repo_root: PathBuf::from("/repo"), + transport_rtt: vec![], + scenarios: vec![ + BenchmarkScenarioReport { + id: "alpha", + workload: "fixture-a", + runtime: "native-execution", + mode: "true-cold-start", + description: "Alpha path", + fixture: "fixture-a", + compile_cache: "disabled", + wall_samples_ms: vec![15.0, 17.0], + wall_stats: stats(16.0, 15.0, 17.0, 15.0, 17.0, 1.0), + guest_import_samples_ms: Some(vec![7.0, 9.0]), + guest_import_stats: Some(stats(8.0, 7.0, 9.0, 7.0, 9.0, 1.0)), + startup_overhead_samples_ms: Some(vec![8.0, 8.0]), + startup_overhead_stats: Some(stats(8.0, 8.0, 8.0, 8.0, 8.0, 0.0)), + phase_samples_ms: phase_samples( + vec![2.0, 2.0], + vec![3.0, 3.0], + Some(vec![7.0, 9.0]), + vec![3.0, 3.0], + ), + phase_stats: phase_stats( + stats(2.0, 2.0, 2.0, 2.0, 2.0, 0.0), + stats(3.0, 3.0, 3.0, 3.0, 3.0, 0.0), + Some(stats(8.0, 7.0, 9.0, 7.0, 9.0, 1.0)), + stats(3.0, 3.0, 3.0, 3.0, 3.0, 0.0), + ), + resource_usage_samples: Some(resource_samples( + Some(vec![40.0 * 1024.0 * 1024.0, 44.0 * 1024.0 * 1024.0]), + None, + Some(vec![6000.0, 8000.0]), + )), + resource_usage_stats: Some(resource_stats( + Some(distribution_stats( + 42.0 * 1024.0 * 1024.0, + 40.0 * 1024.0 * 1024.0, + 44.0 * 1024.0 * 1024.0, + 40.0 * 1024.0 * 1024.0, + 44.0 * 1024.0 * 1024.0, + 2.0 * 1024.0 * 1024.0, + )), + None, + Some(distribution_stats( + 7000.0, 6000.0, 8000.0, 6000.0, 8000.0, 1000.0, + )), + )), + }, + BenchmarkScenarioReport { + id: "beta", + workload: "fixture-b", + runtime: "host-node", + mode: "host-control", + description: "Beta path", + fixture: "fixture-b", + compile_cache: "primed", + wall_samples_ms: vec![20.0, 24.0], + wall_stats: stats(22.0, 20.0, 24.0, 20.0, 24.0, 2.0), + guest_import_samples_ms: Some(vec![10.0, 12.0]), + guest_import_stats: Some(stats(11.0, 10.0, 12.0, 10.0, 12.0, 1.0)), + startup_overhead_samples_ms: Some(vec![9.0, 11.0]), + 
startup_overhead_stats: Some(stats(10.0, 9.0, 11.0, 9.0, 11.0, 1.0)), + phase_samples_ms: phase_samples( + vec![3.0, 3.0], + vec![4.0, 4.0], + Some(vec![10.0, 12.0]), + vec![5.0, 5.0], + ), + phase_stats: phase_stats( + stats(3.0, 3.0, 3.0, 3.0, 3.0, 0.0), + stats(4.0, 4.0, 4.0, 4.0, 4.0, 0.0), + Some(stats(11.0, 10.0, 12.0, 10.0, 12.0, 1.0)), + stats(5.0, 5.0, 5.0, 5.0, 5.0, 0.0), + ), + resource_usage_samples: Some(resource_samples( + Some(vec![60.0 * 1024.0 * 1024.0, 68.0 * 1024.0 * 1024.0]), + Some(vec![12.0 * 1024.0 * 1024.0, 14.0 * 1024.0 * 1024.0]), + Some(vec![9000.0, 12000.0]), + )), + resource_usage_stats: Some(resource_stats( + Some(distribution_stats( + 64.0 * 1024.0 * 1024.0, + 60.0 * 1024.0 * 1024.0, + 68.0 * 1024.0 * 1024.0, + 60.0 * 1024.0 * 1024.0, + 68.0 * 1024.0 * 1024.0, + 4.0 * 1024.0 * 1024.0, + )), + Some(distribution_stats( + 13.0 * 1024.0 * 1024.0, + 12.0 * 1024.0 * 1024.0, + 14.0 * 1024.0 * 1024.0, + 12.0 * 1024.0 * 1024.0, + 14.0 * 1024.0 * 1024.0, + 1.0 * 1024.0 * 1024.0, + )), + Some(distribution_stats( + 10500.0, 9000.0, 12000.0, 9000.0, 12000.0, 1500.0, + )), + )), + }, + BenchmarkScenarioReport { + id: "gamma", + workload: "fixture-c", + runtime: "native-execution", + mode: "baseline-control", + description: "Gamma path", + fixture: "fixture-c", + compile_cache: "disabled", + wall_samples_ms: vec![12.0, 14.0], + wall_stats: stats(13.0, 12.0, 14.0, 12.0, 14.0, 1.0), + guest_import_samples_ms: None, + guest_import_stats: None, + startup_overhead_samples_ms: None, + startup_overhead_stats: None, + phase_samples_ms: phase_samples( + vec![1.0, 1.0], + vec![2.0, 2.0], + None, + vec![4.0, 4.0], + ), + phase_stats: phase_stats( + stats(1.0, 1.0, 1.0, 1.0, 1.0, 0.0), + stats(2.0, 2.0, 2.0, 2.0, 2.0, 0.0), + None, + stats(4.0, 4.0, 4.0, 4.0, 4.0, 0.0), + ), + resource_usage_samples: Some(resource_samples( + Some(vec![24.0 * 1024.0 * 1024.0, 28.0 * 1024.0 * 1024.0]), + None, + None, + )), + resource_usage_stats: Some(resource_stats( + 
Some(distribution_stats( + 26.0 * 1024.0 * 1024.0, + 24.0 * 1024.0 * 1024.0, + 28.0 * 1024.0 * 1024.0, + 24.0 * 1024.0 * 1024.0, + 28.0 * 1024.0 * 1024.0, + 2.0 * 1024.0 * 1024.0, + )), + None, + None, + )), + }, + ], + }; + + let json = report.render_json().expect("render json"); + let parsed: Value = serde_json::from_str(&json).expect("parse json"); + let hotspot_rankings = parsed["summary"]["hotspot_rankings"] + .as_array() + .expect("hotspot rankings array"); + let wall_ranking = hotspot_rankings + .iter() + .find(|ranking| ranking["metric"] == "wall_mean_ms") + .expect("wall ranking"); + assert_eq!(wall_ranking["ranked_scenarios"][0]["id"], "beta"); + assert_eq!(wall_ranking["ranked_scenarios"][1]["id"], "alpha"); + assert_eq!(wall_ranking["ranked_scenarios"][2]["id"], "gamma"); + + let guest_execution_ranking = hotspot_rankings + .iter() + .find(|ranking| ranking["metric"] == "guest_execution_mean_ms") + .expect("guest execution ranking"); + assert_eq!(guest_execution_ranking["ranked_scenarios"][0]["id"], "beta"); + assert_eq!( + guest_execution_ranking["ranked_scenarios"][1]["id"], + "alpha" + ); + assert_eq!( + guest_execution_ranking["scenarios_without_metric"][0].as_str(), + Some("gamma") + ); + let rss_ranking = hotspot_rankings + .iter() + .find(|ranking| ranking["metric"] == "rss_mean_mib") + .expect("rss ranking"); + assert_eq!(rss_ranking["ranked_scenarios"][0]["id"], "beta"); + let cpu_ranking = hotspot_rankings + .iter() + .find(|ranking| ranking["metric"] == "cpu_total_mean_ms") + .expect("cpu ranking"); + assert_eq!(cpu_ranking["scenarios_without_metric"][0], "gamma"); + + let markdown = report.render_markdown(); + assert!(markdown.contains("## Ranked Hotspots")); + assert!(markdown.contains("## Stability And Resource Summary")); + assert!(markdown.contains("### Guest Execution Phase (`time`, `ms`)")); + assert!(markdown.contains("### RSS (`memory`, `MiB`)")); + assert!(markdown.contains("Missing metric for: `gamma`")); +} + +#[test] +fn 
javascript_benchmark_comparison_artifact_stays_stable_for_deltas() { + let report = JavascriptBenchmarkReport { + generated_at_unix_ms: 42, + config: JavascriptBenchmarkConfig { + iterations: 2, + warmup_iterations: 1, + }, + host: BenchmarkHost { + node_binary: String::from("node"), + node_version: String::from("v22.0.0"), + os: "linux", + arch: "x86_64", + logical_cpus: 8, + }, + repo_root: PathBuf::from("/repo"), + transport_rtt: vec![ + transport_rtt(32, vec![0.4, 0.6], stats(0.5, 0.4, 0.6, 0.4, 0.6, 0.1)), + transport_rtt(4096, vec![0.9, 1.1], stats(1.0, 0.9, 1.1, 0.9, 1.1, 0.1)), + transport_rtt(65536, vec![2.6, 3.0], stats(2.8, 2.6, 3.0, 2.6, 3.0, 0.2)), + ], + scenarios: vec![ + BenchmarkScenarioReport { + id: "fast-scenario", + workload: "fixture-a", + runtime: "native-execution", + mode: "true-cold-start", + description: "Faster benchmark path", + fixture: "fixture-a", + compile_cache: "disabled", + wall_samples_ms: vec![10.0, 14.0], + wall_stats: stats(12.0, 10.0, 14.0, 10.0, 14.0, 2.0), + guest_import_samples_ms: Some(vec![4.0, 6.0]), + guest_import_stats: Some(stats(5.0, 4.0, 6.0, 4.0, 6.0, 1.0)), + startup_overhead_samples_ms: Some(vec![6.0, 8.0]), + startup_overhead_stats: Some(stats(7.0, 6.0, 8.0, 6.0, 8.0, 1.0)), + phase_samples_ms: phase_samples( + vec![1.0, 2.0], + vec![2.0, 3.0], + Some(vec![4.0, 6.0]), + vec![3.0, 3.0], + ), + phase_stats: phase_stats( + stats(1.5, 1.0, 2.0, 1.0, 2.0, 0.5), + stats(2.5, 2.0, 3.0, 2.0, 3.0, 0.5), + Some(stats(5.0, 4.0, 6.0, 4.0, 6.0, 1.0)), + stats(3.0, 3.0, 3.0, 3.0, 3.0, 0.0), + ), + resource_usage_samples: None, + resource_usage_stats: None, + }, + BenchmarkScenarioReport { + id: "slow-scenario", + workload: "fixture-b", + runtime: "native-execution", + mode: "new-session-replay", + description: "Slower benchmark path", + fixture: "fixture-b", + compile_cache: "primed", + wall_samples_ms: vec![30.0, 34.0], + wall_stats: stats(32.0, 30.0, 34.0, 30.0, 34.0, 2.0), + guest_import_samples_ms: Some(vec![12.0, 
14.0]), + guest_import_stats: Some(stats(13.0, 12.0, 14.0, 12.0, 14.0, 1.0)), + startup_overhead_samples_ms: Some(vec![18.0, 20.0]), + startup_overhead_stats: Some(stats(19.0, 18.0, 20.0, 18.0, 20.0, 1.0)), + phase_samples_ms: phase_samples( + vec![4.0, 4.0], + vec![5.0, 6.0], + Some(vec![12.0, 14.0]), + vec![9.0, 10.0], + ), + phase_stats: phase_stats( + stats(4.0, 4.0, 4.0, 4.0, 4.0, 0.0), + stats(5.5, 5.0, 6.0, 5.0, 6.0, 0.5), + Some(stats(13.0, 12.0, 14.0, 12.0, 14.0, 1.0)), + stats(9.5, 9.0, 10.0, 9.0, 10.0, 0.5), + ), + resource_usage_samples: None, + resource_usage_stats: None, + }, + BenchmarkScenarioReport { + id: "current-only", + workload: "fixture-c", + runtime: "host-node", + mode: "host-control", + description: "Current-only scenario", + fixture: "fixture-c", + compile_cache: "disabled", + wall_samples_ms: vec![8.0, 10.0], + wall_stats: stats(9.0, 8.0, 10.0, 8.0, 10.0, 1.0), + guest_import_samples_ms: None, + guest_import_stats: None, + startup_overhead_samples_ms: None, + startup_overhead_stats: None, + phase_samples_ms: phase_samples( + vec![1.0, 1.0], + vec![2.0, 3.0], + None, + vec![5.0, 6.0], + ), + phase_stats: phase_stats( + stats(1.0, 1.0, 1.0, 1.0, 1.0, 0.0), + stats(2.5, 2.0, 3.0, 2.0, 3.0, 0.5), + None, + stats(5.5, 5.0, 6.0, 5.0, 6.0, 0.5), + ), + resource_usage_samples: None, + resource_usage_stats: None, + }, + ], + }; + + let tempdir = tempdir().expect("create tempdir"); + let baseline_path = tempdir.path().join("baseline.json"); + fs::write( + &baseline_path, + r#"{ + "artifact_version": 1, + "generated_at_unix_ms": 24, + "scenarios": [ + { + "id": "fast-scenario", + "wall_stats": { + "mean_ms": 15.0, + "p50_ms": 15.0, + "p95_ms": 15.0, + "min_ms": 15.0, + "max_ms": 15.0, + "stddev_ms": 0.0 + }, + "guest_import_stats": { + "mean_ms": 6.0, + "p50_ms": 6.0, + "p95_ms": 6.0, + "min_ms": 6.0, + "max_ms": 6.0, + "stddev_ms": 0.0 + }, + "startup_overhead_stats": { + "mean_ms": 9.0, + "p50_ms": 9.0, + "p95_ms": 9.0, + "min_ms": 9.0, + 
"max_ms": 9.0, + "stddev_ms": 0.0 + } + }, + { + "id": "slow-scenario", + "wall_stats": { + "mean_ms": 28.0, + "p50_ms": 28.0, + "p95_ms": 28.0, + "min_ms": 28.0, + "max_ms": 28.0, + "stddev_ms": 0.0 + }, + "guest_import_stats": { + "mean_ms": 11.0, + "p50_ms": 11.0, + "p95_ms": 11.0, + "min_ms": 11.0, + "max_ms": 11.0, + "stddev_ms": 0.0 + }, + "startup_overhead_stats": { + "mean_ms": 17.0, + "p50_ms": 17.0, + "p95_ms": 17.0, + "min_ms": 17.0, + "max_ms": 17.0, + "stddev_ms": 0.0 + } + }, + { + "id": "baseline-only", + "wall_stats": { + "mean_ms": 5.0, + "p50_ms": 5.0, + "p95_ms": 5.0, + "min_ms": 5.0, + "max_ms": 5.0, + "stddev_ms": 0.0 + } + } + ] +}"#, + ) + .expect("write baseline report"); + + let comparison = report + .compare_to_baseline_path(&baseline_path) + .expect("load comparison"); + let json = report + .render_json_with_comparison(Some(&comparison)) + .expect("render comparison json"); + let parsed: Value = serde_json::from_str(&json).expect("parse comparison json"); + + assert_eq!( + parsed["comparison"]["summary"]["compared_scenario_count"], + 2 + ); + assert_eq!( + parsed["comparison"]["summary"]["largest_wall_improvement"]["id"].as_str(), + Some("fast-scenario") + ); + assert_eq!( + parsed["comparison"]["summary"]["largest_wall_regression"]["id"].as_str(), + Some("slow-scenario") + ); + assert_eq!( + parsed["comparison"]["scenario_deltas"][0]["wall_mean_ms"]["delta_ms"], + -3.0 + ); + assert_eq!( + parsed["comparison"]["scenario_deltas"][1]["wall_mean_ms"]["delta_ms"], + 4.0 + ); + assert!( + parsed["comparison"]["scenario_deltas"][0]["phase_mean_ms"].is_null(), + "phase deltas should stay absent when the baseline artifact has no phase data" + ); + assert_eq!( + parsed["comparison"]["scenarios_missing_from_baseline"][0].as_str(), + Some("current-only") + ); + assert_eq!( + parsed["comparison"]["baseline_only_scenarios"][0].as_str(), + Some("baseline-only") + ); + + let markdown = report.render_markdown_with_comparison(Some(&comparison)); + 
assert!(markdown.contains("## Baseline Comparison")); + assert!(markdown.contains("Context delta (ms)")); + assert!(markdown.contains("Largest wall-time improvement: `fast-scenario`")); + assert!(markdown.contains("Largest wall-time regression: `slow-scenario`")); + assert!(markdown.contains("Scenarios missing from baseline: current-only")); + assert!(markdown.contains("Baseline-only scenarios: baseline-only")); } diff --git a/crates/execution/tests/javascript.rs b/crates/execution/tests/javascript.rs index a7d29ed2e..bbc69d832 100644 --- a/crates/execution/tests/javascript.rs +++ b/crates/execution/tests/javascript.rs @@ -20,6 +20,8 @@ struct NodeImportCacheMetrics { package_type_misses: usize, module_format_hits: usize, module_format_misses: usize, + source_hits: usize, + source_misses: usize, } #[derive(Debug, Clone, PartialEq, Eq)] @@ -80,6 +82,8 @@ fn parse_import_cache_metrics(stderr: &str) -> NodeImportCacheMetrics { package_type_misses: parse_metric_value(metrics_line, "packageTypeMisses"), module_format_hits: parse_metric_value(metrics_line, "moduleFormatHits"), module_format_misses: parse_metric_value(metrics_line, "moduleFormatMisses"), + source_hits: parse_metric_value(metrics_line, "sourceHits"), + source_misses: parse_metric_value(metrics_line, "sourceMisses"), } } @@ -1153,6 +1157,126 @@ console.log(`answer:${dep.answer}`); assert!(second_metrics.resolve_hits >= 2); } +#[test] +fn javascript_execution_reuses_and_invalidates_projected_package_source_cache() { + assert_node_available(); + + let temp = tempdir().expect("create temp dir"); + let projected_root = temp.path().join("projected-node-modules"); + let package_dir = projected_root.join("demo-projected"); + fs::create_dir_all(&package_dir).expect("create projected package dir"); + write_fixture( + &package_dir.join("package.json"), + "{\n \"name\": \"demo-projected\",\n \"type\": \"module\"\n}\n", + ); + write_fixture( + &package_dir.join("entry.js"), + "import { readFileSync } from 
'node:fs';\nexport const answer = 41;\nexport const fsReady = typeof readFileSync === 'function';\n", + ); + write_fixture( + &temp.path().join("entry.mjs"), + r#" +const mod = await import("/root/node_modules/demo-projected/entry.js"); +console.log(`answer:${mod.answer}`); +console.log(`fsReady:${mod.fsReady}`); +"#, + ); + + let mut engine = JavascriptExecutionEngine::default(); + let first_context = engine.create_context(CreateJavascriptContextRequest { + vm_id: String::from("vm-js"), + bootstrap_module: None, + compile_cache_root: None, + }); + let projected_root_host_path = projected_root.to_string_lossy().replace('\\', "\\\\"); + let extra_fs_read_paths_json = format!( + "[\"{}\"]", + projected_root.to_string_lossy().replace('\\', "\\\\") + ); + let debug_env = BTreeMap::from([ + ( + String::from("AGENT_OS_EXTRA_FS_READ_PATHS"), + extra_fs_read_paths_json, + ), + ( + String::from("AGENT_OS_GUEST_PATH_MAPPINGS"), + format!( + "[{{\"guestPath\":\"/root/node_modules\",\"hostPath\":\"{projected_root_host_path}\"}}]" + ), + ), + ( + String::from("AGENT_OS_NODE_IMPORT_CACHE_DEBUG"), + String::from("1"), + ), + ]); + + let (first_stdout, first_stderr, first_exit) = run_javascript_execution( + &mut engine, + first_context.context_id, + temp.path(), + vec![String::from("./entry.mjs")], + debug_env.clone(), + ); + let first_metrics = parse_import_cache_metrics(&first_stderr); + + assert_eq!(first_exit, 0, "stderr: {first_stderr}"); + assert!(first_stdout.contains("answer:41"), "stdout: {first_stdout}"); + assert!( + first_stdout.contains("fsReady:true"), + "stdout: {first_stdout}" + ); + assert_eq!(first_metrics.source_hits, 0, "stderr: {first_stderr}"); + assert!(first_metrics.source_misses >= 1, "stderr: {first_stderr}"); + + let second_context = engine.create_context(CreateJavascriptContextRequest { + vm_id: String::from("vm-js"), + bootstrap_module: None, + compile_cache_root: None, + }); + let (second_stdout, second_stderr, second_exit) = run_javascript_execution( 
+ &mut engine, + second_context.context_id, + temp.path(), + vec![String::from("./entry.mjs")], + debug_env.clone(), + ); + let second_metrics = parse_import_cache_metrics(&second_stderr); + + assert_eq!(second_exit, 0, "stderr: {second_stderr}"); + assert!( + second_stdout.contains("answer:41"), + "stdout: {second_stdout}" + ); + assert!(second_metrics.source_hits >= 1, "stderr: {second_stderr}"); + + write_fixture( + &package_dir.join("entry.js"), + "import { readFileSync } from 'node:fs';\nexport const answer = 42;\nexport const fsReady = typeof readFileSync === 'function';\n", + ); + + let third_context = engine.create_context(CreateJavascriptContextRequest { + vm_id: String::from("vm-js"), + bootstrap_module: None, + compile_cache_root: None, + }); + let (third_stdout, third_stderr, third_exit) = run_javascript_execution( + &mut engine, + third_context.context_id, + temp.path(), + vec![String::from("./entry.mjs")], + debug_env, + ); + let third_metrics = parse_import_cache_metrics(&third_stderr); + + assert_eq!(third_exit, 0, "stderr: {third_stderr}"); + assert!(third_stdout.contains("answer:42"), "stdout: {third_stdout}"); + assert!( + third_stdout.contains("fsReady:true"), + "stdout: {third_stdout}" + ); + assert!(third_metrics.source_misses >= 1, "stderr: {third_stderr}"); +} + #[test] fn javascript_execution_redirects_computed_node_fs_imports_through_builtin_assets() { assert_node_available(); diff --git a/package.json b/package.json index f1f2cc459..8b0c76d39 100644 --- a/package.json +++ b/package.json @@ -19,12 +19,14 @@ "devDependencies": { "@biomejs/biome": "^2.3", "@copilotkit/llmock": "^1.6.0", + "@rivet-dev/agent-os": "workspace:*", "@rivet-dev/agent-os-claude": "workspace:*", - "@rivet-dev/agent-os-common": "workspace:*", "@rivet-dev/agent-os-codex-agent": "workspace:*", - "@rivet-dev/agent-os": "workspace:*", + "@rivet-dev/agent-os-common": "workspace:*", "@rivet-dev/agent-os-pi": "workspace:*", "@types/node": "^22.19.15", + "jszip": "^3.10.1", + 
"pdf-lib": "^1.17.1", "turbo": "^2.5.6", "typescript": "^5.9.2" }, diff --git a/packages/core/tests/pi-headless.test.ts b/packages/core/tests/pi-headless.test.ts index b76345dd7..450fe58ef 100644 --- a/packages/core/tests/pi-headless.test.ts +++ b/packages/core/tests/pi-headless.test.ts @@ -126,14 +126,63 @@ console.log("messages:" + JSON.stringify(parsed.messages)); expect(stdout).toContain('messages:["hello"]'); }, 30_000); - // TODO: Full PI headless execution is blocked by two current VM limitations: - // 1. ESM module linking: V8 Rust runtime doesn't forward named exports from - // host-loaded modules (ModuleAccessFileSystem overlay). VFS modules work fine. - // PI's CLI must run as ESM (has async top-level main()), but ESM mode can't - // load host modules with named exports. - // 2. CJS mode: Works for loading PI's modules, but the V8 session doesn't - // process the event loop after synchronous code finishes, so async main() - // never completes. - // Fix: Either fix V8 module linking for overlay modules, or add event loop - // processing to CJS session mode. 
+ test("CLI-backed PI headless session completes a real prompt turn", async () => { + const { sessionId } = await vm.createSession("pi-cli", { + env: { + ANTHROPIC_API_KEY: "mock-key", + ANTHROPIC_BASE_URL: mockUrl, + }, + }); + + try { + const response = await vm.prompt( + sessionId, + "Reply with exactly: Hello from llmock", + ); + + expect(response.error).toBeUndefined(); + expect((response.result as { stopReason?: string }).stopReason).toBe( + "end_turn", + ); + expect(response.result).toBeDefined(); + expect( + vm + .listProcesses() + .some( + (process) => + process.running && + process.command === "node" && + process.args.some((arg) => arg.includes("pi-acp")), + ), + ).toBe(true); + } finally { + vm.closeSession(sessionId); + } + }, 90_000); + + test("standalone PI CLI is not exposed on the native sidecar PATH", async () => { + let stdout = ""; + let stderr = ""; + + const { pid } = vm.spawn("pi", ["-p", "--no-session", "hello"], { + onStdout: (data: Uint8Array) => { + stdout += new TextDecoder().decode(data); + }, + onStderr: (data: Uint8Array) => { + stderr += new TextDecoder().decode(data); + }, + env: { + HOME: "/home/user", + PI_OFFLINE: "1", + ANTHROPIC_API_KEY: "mock-key", + ANTHROPIC_BASE_URL: mockUrl, + }, + }); + + const exitCode = await vm.waitProcess(pid); + + expect(exitCode).toBe(1); + expect(stdout).toBe(""); + expect(stderr).toContain("command not found on native sidecar path: pi"); + }, 30_000); }); diff --git a/packages/core/tests/software-projection.test.ts b/packages/core/tests/software-projection.test.ts index 7f5982017..9044e5b7f 100644 --- a/packages/core/tests/software-projection.test.ts +++ b/packages/core/tests/software-projection.test.ts @@ -1,7 +1,7 @@ import { existsSync } from "node:fs"; -import { afterEach, describe, expect, test } from "vitest"; import common, { coreutils } from "@rivet-dev/agent-os-common"; import pi from "@rivet-dev/agent-os-pi"; +import { afterEach, describe, expect, test } from "vitest"; import { AgentOs } 
from "../src/agent-os.js"; const hasRegistryCommands = existsSync(coreutils.commandDir); @@ -71,6 +71,44 @@ describe("software projection on the sidecar path", () => { expect(stdout).toContain("agent true"); }); + test("keeps projected package roots read-only on the sidecar path", async () => { + vm = await AgentOs.create({ + moduleAccessCwd: "/tmp", + software: [pi], + }); + + let stdout = ""; + let stderr = ""; + const { pid } = vm.spawn( + "node", + [ + "-e", + [ + "const fs = require('node:fs');", + "try {", + " fs.appendFileSync('/root/node_modules/@rivet-dev/agent-os-pi/package.json', '\\nblocked');", + " console.log('write:unexpected-success');", + "} catch (error) {", + " console.log('writeError', error && error.code);", + "}", + ].join(" "), + ], + { + onStdout: (chunk) => { + stdout += Buffer.from(chunk).toString("utf8"); + }, + onStderr: (chunk) => { + stderr += Buffer.from(chunk).toString("utf8"); + }, + }, + ); + + const exitCode = await waitForExit(vm, pid); + expect({ exitCode, stderr }).toEqual({ exitCode: 0, stderr: "" }); + expect(stdout).not.toContain("write:unexpected-success"); + expect(stdout).toMatch(/writeError (ERR_ACCESS_DENIED|EACCES|EPERM|EROFS)/); + }); + test.skipIf(!hasRegistryCommands)( "preserves registry meta-package command injection on the sidecar path", async () => { diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 99a3cd084..da6499252 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -43,6 +43,12 @@ importers: '@types/node': specifier: ^22.19.15 version: 22.19.15 + jszip: + specifier: ^3.10.1 + version: 3.10.1 + pdf-lib: + specifier: ^1.17.1 + version: 1.17.1 turbo: specifier: ^2.5.6 version: 2.9.1 @@ -1353,6 +1359,7 @@ packages: '@copilotkit/llmock@1.6.0': resolution: {integrity: sha512-wq4J7ampjoEiOi6v2d7GMK5lTZcTnuhMduSPCIwmyxBTCPA3lekXyNKGJ4t3xM5OgoJReMQ5KmlfrMBVTRNGsA==} engines: {node: '>=20.15.0'} + deprecated: This package has moved to @copilotkit/aimock hasBin: true '@esbuild/aix-ppc64@0.21.5': @@ -1928,6 +1935,12 
@@ packages: cpu: [x64] os: [win32] + '@pdf-lib/standard-fonts@1.0.0': + resolution: {integrity: sha512-hU30BK9IUN/su0Mn9VdlVKsWBS6GyhVfqjwl1FjZN4TxP6cCw0jP2w7V3Hf5uX7M0AZJ16vey9yE0ny7Sa59ZA==} + + '@pdf-lib/upng@1.0.1': + resolution: {integrity: sha512-dQK2FUMQtowVP00mtIksrlZhdFXQZPC+taih1q4CvPZ5vqdxR/LKBaFg0oAfzd1GlHZXXSPdQfzQnt+ViGvEIQ==} + '@pinojs/redact@0.4.0': resolution: {integrity: sha512-k2ENnmBugE/rzQfEcdWHcCY+/FM3VLzH9cYEsbdsoqrvzAKRhUZeRNhAZvB8OitQJ1TBed3yqWtdjzS6wJKBwg==} @@ -3208,6 +3221,9 @@ packages: resolution: {integrity: sha512-Hs59xBNfUIunMFgWAbGX5cq6893IbWg4KnrjbYwX3tx0ztorVgTDA6B2sxf8ejHJ4wz8BqGUMYlnzNBer5NvGg==} engines: {node: '>= 4'} + immediate@3.0.6: + resolution: {integrity: sha512-XXOFtyqDjNDAQxVfYxuF7g9Il/IbWmmlQg2MYKOH8ExIT1qg6xc4zyS3HaEEATgs1btfzxq15ciUiY7gjSXRGQ==} + inherits@2.0.4: resolution: {integrity: sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==} @@ -3293,6 +3309,9 @@ packages: json-schema@0.4.0: resolution: {integrity: sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA==} + jszip@3.10.1: + resolution: {integrity: sha512-xXDvecyTpGLrqFrvkrUSoxxfJI5AH7U8zxxtVclpsUtMCq4JQ290LY8AW5c7Ggnr/Y/oK+bQMbqK2qmtk3pN4g==} + jwa@2.0.1: resolution: {integrity: sha512-hRF04fqJIP8Abbkq5NKGN0Bbr3JxlQ+qhZufXVr0DvujKy93ZCbXZMHDL4EOtodSbCWxOqR8MS1tXA5hwqCXDg==} @@ -3302,6 +3321,9 @@ packages: koffi@2.15.2: resolution: {integrity: sha512-r9tjJLVRSOhCRWdVyQlF3/Ugzeg13jlzS4czS82MAgLff4W+BcYOW7g8Y62t9O5JYjYOLAjAovAZDNlDfZNu+g==} + lie@3.3.0: + resolution: {integrity: sha512-UaiMJzeWRlEujzAuw5LokY1L5ecNQYZKfmyZ9L7wDHb/p5etKaxXhohBcrw0EYby+G/NA52vRSN4N39dxHAIwQ==} + lines-and-columns@1.2.4: resolution: {integrity: sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==} @@ -3565,6 +3587,9 @@ packages: resolution: {integrity: sha512-Q3CG/cYvCO1ye4QKkuH7EXxs3VC/rI1/trd+qX2+PolbaKG0H+bgcZzrTt96mMyRtejk+JMCiLUn3y29W8qmFQ==} 
engines: {node: '>= 0.10'} + pdf-lib@1.17.1: + resolution: {integrity: sha512-V/mpyJAoTsN4cnP31vc0wfNA1+p20evqqnap0KLoRUN0Yk/p3wN52DOEsL4oBFcLdb76hlpKPtzJIgo67j/XLw==} + pend@1.2.0: resolution: {integrity: sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==} @@ -4000,6 +4025,9 @@ packages: ts-interface-checker@0.1.13: resolution: {integrity: sha512-Y/arvbn+rrz3JCKl9C4kVNfTfSm2/mEp5FSz5EsZSANGPSlQrpRI5M4PKF+mJnE52jOO90PnPSc3Ur3bTQw0gA==} + tslib@1.14.1: + resolution: {integrity: sha512-Xni35NKzjgMrwevysHTCArtLDpPvye8zV/0E4EyYn43P7/7qvQwPh9BGkHewbMulVntbigmcT7rdX3BNo9wRJg==} + tslib@2.8.1: resolution: {integrity: sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==} @@ -5325,6 +5353,14 @@ snapshots: '@oven/bun-windows-x64@1.3.11': optional: true + '@pdf-lib/standard-fonts@1.0.0': + dependencies: + pako: 1.0.11 + + '@pdf-lib/upng@1.0.1': + dependencies: + pako: 1.0.11 + '@pinojs/redact@0.4.0': {} '@protobufjs/aspromise@1.1.2': {} @@ -6846,6 +6882,8 @@ snapshots: ignore@7.0.5: {} + immediate@3.0.6: {} + inherits@2.0.4: {} ini@1.3.8: {} @@ -6920,6 +6958,13 @@ snapshots: json-schema@0.4.0: {} + jszip@3.10.1: + dependencies: + lie: 3.3.0 + pako: 1.0.11 + readable-stream: 2.3.8 + setimmediate: 1.0.5 + jwa@2.0.1: dependencies: buffer-equal-constant-time: 1.0.1 @@ -6934,6 +6979,10 @@ snapshots: koffi@2.15.2: optional: true + lie@3.3.0: + dependencies: + immediate: 3.0.6 + lines-and-columns@1.2.4: {} locate-path@6.0.0: @@ -7184,6 +7233,13 @@ snapshots: sha.js: 2.4.12 to-buffer: 1.2.2 + pdf-lib@1.17.1: + dependencies: + '@pdf-lib/standard-fonts': 1.0.0 + '@pdf-lib/upng': 1.0.1 + pako: 1.0.11 + tslib: 1.14.1 + pend@1.2.0: {} pi-acp@0.0.23: @@ -7705,6 +7761,8 @@ snapshots: ts-interface-checker@0.1.13: {} + tslib@1.14.1: {} + tslib@2.8.1: {} tsx@4.21.0: diff --git a/scripts/benchmarks/bench-utils.ts b/scripts/benchmarks/bench-utils.ts index c463ec1f5..6be00efa2 100644 --- 
a/scripts/benchmarks/bench-utils.ts +++ b/scripts/benchmarks/bench-utils.ts @@ -5,6 +5,7 @@ import codex from "@rivet-dev/agent-os-codex-agent"; import pi from "@rivet-dev/agent-os-pi"; import { LLMock } from "@copilotkit/llmock"; import os from "node:os"; +import { resolve } from "node:path"; // Benchmark parameters. Keep batch sizes minimal for fast iteration. export const BATCH_SIZES = [1, 10]; @@ -14,6 +15,15 @@ export const MAX_CONCURRENCY = Math.max(1, os.availableParallelism() - 4); export const ECHO_COMMAND = "echo hello"; export const EXPECTED_OUTPUT = "hello\n"; +export const PI_BENCHMARK_PROMPT = "Reply with exactly: Hello from llmock"; +export const PI_HEADLESS_BLOCKER_REFERENCE = + "packages/core/tests/pi-headless.test.ts"; +export const PI_HEADLESS_BLOCKER_REASON = + 'Standalone `spawn("pi", ...)` is not exposed on the native sidecar PATH; use `createSession("pi-cli")` to benchmark the native PI CLI RPC path tracked in packages/core/tests/pi-headless.test.ts.'; +const BENCHMARK_MODULE_ACCESS_CWD = resolve( + import.meta.dirname, + "../../packages/core", +); // ── Shared mock LLM server ───────────────────────────────────────── @@ -29,7 +39,10 @@ export async function ensureLlmock(): Promise<{ if (_llmock) return { url: _llmockUrl!, port: _llmockPort! }; _llmock = new LLMock({ port: 0, logLevel: "silent" }); _llmock.addFixtures([ - { match: { predicate: () => true }, response: { content: "ok" } }, + { + match: { predicate: () => true }, + response: { content: "Hello from llmock" }, + }, ]); _llmockUrl = await _llmock.start(); _llmockPort = Number(new URL(_llmockUrl).port); @@ -46,15 +59,32 @@ export async function stopLlmock(): Promise { } } +export function getLlmockRequestCount(): number { + return _llmock?.getRequests().length ?? 
0; +} + // ── Workload abstraction ──────────────────────────────────────────── +export interface WorkloadObservation { + promptCompleted?: boolean; + providerRequestCount?: number; + sessionUpdateCount?: number; + textEventCount?: number; + finalText?: string | null; + stopReason?: string; + workloadPath?: string; + substituteReason?: string; + blockerReference?: string; + blockerReason?: string; +} + /** A workload describes how to create a VM and start a long-running process for memory measurement. */ export interface Workload { name: string; description: string; createVm: () => Promise; /** Start a long-running process so the Worker thread stays alive. */ - start: (vm: AgentOs) => Promise | void; + start: (vm: AgentOs) => Promise | WorkloadObservation | void; /** Verify the expected processes are running. Throws if not. */ verify: (vm: AgentOs) => void; /** Time to wait after start for the process to fully initialize. */ @@ -104,6 +134,101 @@ function makeAgentSessionWorkload(opts: { }; } +function getTextEventPayload( + event: unknown, +): { text?: string; type?: string } | undefined { + if (!event || typeof event !== "object") { + return undefined; + } + const params = (event as { params?: unknown }).params; + if (!params || typeof params !== "object") { + return undefined; + } + return params as { text?: string; type?: string }; +} + +function makeAgentPromptWorkload(opts: { + agentId: string; + description: string; + software: SoftwareInput[]; + processMarker: string; + prompt: string; +}): Workload { + return { + name: `${opts.agentId}-prompt-turn`, + description: opts.description, + createVm: async () => { + const { port } = await ensureLlmock(); + return AgentOs.create({ + loopbackExemptPorts: [port], + moduleAccessCwd: BENCHMARK_MODULE_ACCESS_CWD, + software: opts.software, + }); + }, + start: async (vm) => { + const { url } = await ensureLlmock(); + const { sessionId } = await vm.createSession(opts.agentId, { + env: { + ANTHROPIC_API_KEY: "bench-key", 
+ ANTHROPIC_BASE_URL: url, + }, + }); + + const events: unknown[] = []; + const unsubscribe = vm.onSessionEvent(sessionId, (event) => { + events.push(event); + }); + const requestCountBefore = getLlmockRequestCount(); + + try { + const response = await vm.prompt(sessionId, opts.prompt); + if (response.error) { + throw new Error( + `${opts.agentId} prompt workload failed: ${response.error.message}`, + ); + } + const textEvents = events + .map(getTextEventPayload) + .filter((event) => event?.type === "text"); + const finalText = textEvents.at(-1)?.text ?? null; + const providerRequestCount = + getLlmockRequestCount() - requestCountBefore; + + return { + promptCompleted: true, + providerRequestCount, + sessionUpdateCount: events.length, + textEventCount: textEvents.length, + finalText, + stopReason: (response.result as { stopReason?: string } | undefined) + ?.stopReason, + workloadPath: + 'createSession("pi-cli") + vm.prompt(...) via pi-acp -> PI CLI --mode rpc', + blockerReference: PI_HEADLESS_BLOCKER_REFERENCE, + blockerReason: PI_HEADLESS_BLOCKER_REASON, + } satisfies WorkloadObservation; + } finally { + unsubscribe(); + } + }, + verify: (vm) => { + const procs = vm.listProcesses(); + const running = procs.filter((p) => p.running); + const hasAgent = running.some( + (p) => + p.command === "node" && + p.args.some((a) => a.includes(opts.processMarker)), + ); + if (!hasAgent) { + throw new Error( + `Expected running ${opts.processMarker} process, got: ${JSON.stringify(running.map((p) => ({ cmd: p.command, args: p.args })))}`, + ); + } + }, + settleMs: 2000, + }; +} + export const WORKLOADS: Record = { sleep: { name: "sleep", @@ -132,6 +257,14 @@ export const WORKLOADS: Record = { software: [pi], processMarker: "agent-os-pi", }), + "pi-prompt-turn": makeAgentPromptWorkload({ + agentId: "pi-cli", + description: + 'Native PI CLI headless benchmark path via createSession("pi-cli"), which drives the real PI CLI through pi-acp RPC mode and records a full prompt turn.', + 
software: [], + processMarker: "pi-acp", + prompt: PI_BENCHMARK_PROMPT, + }), "claude-session": makeAgentSessionWorkload({ agentId: "claude", description: "VM with Claude agent session via createSession", diff --git a/scripts/benchmarks/coldstart.bench.ts b/scripts/benchmarks/coldstart.bench.ts index f70ff7369..171222ca8 100644 --- a/scripts/benchmarks/coldstart.bench.ts +++ b/scripts/benchmarks/coldstart.bench.ts @@ -4,19 +4,26 @@ * Measures time from AgentOs.create() through workload ready: * --workload=echo Minimal VM + first exec("echo hello") completing * --workload=pi-session VM + createSession("pi") completing (ACP handshake done) + * --workload=pi-prompt-turn VM + createSession("pi-cli") + first prompt turn completing * --workload=claude-session VM + createSession("claude") completing (ACP handshake done) * --workload=codex-session VM + createSession("codex") completing (ACP handshake done) * + * `pi-prompt-turn` now benchmarks the native PI CLI path through + * `createSession("pi-cli")`, which uses `pi-acp` to drive the real PI CLI in + * RPC mode. The same PI headless test file documents that raw `spawn("pi", ...)` + * is still not exposed on the native sidecar PATH. + * * Pass --iterations=N to override default (5). 
* * Usage: - * npx tsx scripts/benchmarks/coldstart.bench.ts --workload=echo - * npx tsx scripts/benchmarks/coldstart.bench.ts --workload=pi-session --iterations=3 - * npx tsx scripts/benchmarks/coldstart.bench.ts --workload=claude-session --iterations=3 + * pnpm exec tsx scripts/benchmarks/coldstart.bench.ts --workload=echo + * pnpm exec tsx scripts/benchmarks/coldstart.bench.ts --workload=pi-session --iterations=3 + * pnpm exec tsx scripts/benchmarks/coldstart.bench.ts --workload=claude-session --iterations=3 */ import { ITERATIONS, + type WorkloadObservation, WARMUP_ITERATIONS, WORKLOADS, createBenchVm, @@ -29,9 +36,19 @@ import { stopLlmock, } from "./bench-utils.js"; -const VALID_WORKLOADS = ["echo", ...Object.keys(WORKLOADS).filter((k) => k.endsWith("-session"))]; +const VALID_WORKLOADS = [ + "echo", + ...Object.keys(WORKLOADS).filter( + (k) => k.endsWith("-session") || k.endsWith("-turn"), + ), +]; + +interface Measurement { + ms: number; + observation?: WorkloadObservation; +} -async function measureEcho(): Promise { +async function measureEcho(): Promise { const t0 = performance.now(); const vm = await createBenchVm(); const result = await vm.exec(ECHO_COMMAND); @@ -40,17 +57,17 @@ async function measureEcho(): Promise { throw new Error(`Unexpected output: ${JSON.stringify(result.stdout)}`); } await vm.dispose(); - return ms; + return { ms }; } -async function measureAgentSession(workloadName: string): Promise { +async function measureAgentSession(workloadName: string): Promise { const workload = WORKLOADS[workloadName]; const t0 = performance.now(); const vm = await workload.createVm(); - await workload.start(vm); + const observation = await workload.start(vm); const ms = performance.now() - t0; await vm.dispose(); - return ms; + return { ms, observation }; } function parseArgs(): { workload: string; iterations: number } { @@ -59,7 +76,7 @@ function parseArgs(): { workload: string; iterations: number } { if (!wArg) { console.error( - `Usage: npx tsx 
coldstart.bench.ts --workload=${VALID_WORKLOADS.join("|")} [--iterations=N]`, + `Usage: pnpm exec tsx coldstart.bench.ts --workload=${VALID_WORKLOADS.join("|")} [--iterations=N]`, ); process.exit(1); } @@ -91,11 +108,15 @@ async function main() { console.error(`Iterations: ${iterations} (+ ${WARMUP_ITERATIONS} warmup)`); const samples: number[] = []; + let lastObservation: WorkloadObservation | undefined; for (let i = 0; i < WARMUP_ITERATIONS + iterations; i++) { - const ms = await measure(); + const { ms, observation } = await measure(); if (i >= WARMUP_ITERATIONS) { samples.push(ms); + if (observation) { + lastObservation = observation; + } } console.error( ` iter ${i}: ${round(ms)}ms${i < WARMUP_ITERATIONS ? " (warmup)" : ""}`, @@ -109,8 +130,27 @@ async function main() { [["cold start", `${s.mean}ms`, `${s.p50}ms`, `${s.p95}ms`, `${s.min}ms`, `${s.max}ms`]], ); + if (lastObservation) { + console.error( + `observed work: providerRequests=${lastObservation.providerRequestCount ?? 0} textEvents=${lastObservation.textEventCount ?? 0} stopReason=${lastObservation.stopReason ?? 
"n/a"}`, + ); + if (lastObservation.finalText) { + console.error(`final text: ${JSON.stringify(lastObservation.finalText)}`); + } + } + console.log( - JSON.stringify({ hardware, workload, iterations, coldStart: s }, null, 2), + JSON.stringify( + { + hardware, + workload, + iterations, + coldStart: s, + observation: lastObservation, + }, + null, + 2, + ), ); await stopLlmock(); diff --git a/scripts/benchmarks/run-benchmarks.sh b/scripts/benchmarks/run-benchmarks.sh index 0f731be18..4ee43b67f 100755 --- a/scripts/benchmarks/run-benchmarks.sh +++ b/scripts/benchmarks/run-benchmarks.sh @@ -16,7 +16,7 @@ run() { shift echo "" >&2 echo "=== Running $name ===" >&2 - npx tsx "$@" \ + pnpm exec tsx "$@" \ 1> "$RESULTS_DIR/${name}.json" \ 2> >(tee "$RESULTS_DIR/${name}.log" >&2) } @@ -25,6 +25,9 @@ run() { run "coldstart-echo" \ scripts/benchmarks/coldstart.bench.ts --workload=echo +run "coldstart-pi-prompt-turn" \ + scripts/benchmarks/coldstart.bench.ts --workload=pi-prompt-turn --iterations=3 + # Memory benchmarks # run "memory-sleep" \ # --expose-gc scripts/benchmarks/memory.bench.ts --workload=sleep --count=5 diff --git a/scripts/ralph/prd.json b/scripts/ralph/prd.json deleted file mode 100644 index 1eb17e05c..000000000 --- a/scripts/ralph/prd.json +++ /dev/null @@ -1,208 +0,0 @@ -{ - "project": "agentOS", - "branchName": "04-01-feat_rust_kernel_sidecar", - "description": "Close remaining parity gaps between the Rust kernel sidecar and the old in-process TypeScript kernel", - "userStories": [ - { - "id": "US-001", - "title": "Implement real socketTable and processTable on NativeKernel", - "description": "As a developer running kernel tests, I need NativeKernel's socketTable and processTable to return real state from the Rust sidecar so that existing callers and tests work correctly.", - "acceptanceCriteria": [ - "socketTable.findListener() queries the sidecar and returns the matching listener or null (not always null)", - "socketTable.findBoundUdp() queries the sidecar 
and returns the matching bound socket or null (not always null)", - "processTable.getSignalState() returns the actual signal handler map from the sidecar (not an empty map)", - "registry/tests/kernel/cross-runtime-network.test.ts passes against the real sidecar", - "registry/tests/wasmvm/signal-handler.test.ts passes against the real sidecar", - "Typecheck passes", - "Tests pass" - ], - "priority": 1, - "passes": true, - "notes": "packages/core/src/runtime.ts:1855-1860 — socketTable is a minimal stub, processTable.getSignalState() returns empty map. Callers at registry/tests/kernel/cross-runtime-network.test.ts:43,:153 and registry/tests/wasmvm/signal-handler.test.ts:150 expect real data." - }, - { - "id": "US-002", - "title": "Add sidecar protocol support for socketTable and processTable queries", - "description": "As the NativeKernel implementation, I need sidecar protocol request/response types to query socket listeners, bound UDP sockets, and process signal state so US-001 can proxy real data.", - "acceptanceCriteria": [ - "New protocol request types: FindListener, FindBoundUdp, GetSignalState added to crates/sidecar/src/protocol.rs", - "Sidecar handles these requests and returns current kernel state", - "NativeSidecarKernelProxy or NativeSidecarProcessClient exposes methods for these queries", - "Typecheck passes" - ], - "priority": 2, - "passes": true, - "notes": "This is the Rust-side counterpart to US-001. The sidecar protocol currently has no request types for observability queries." 
- }, - { - "id": "US-003", - "title": "Implement proper hard link in js_bridge filesystem", - "description": "As a user performing link() on a js_bridge-backed mount, I need real hard-link semantics instead of read-then-write so that inode identity and link counts are preserved.", - "acceptanceCriteria": [ - "crates/sidecar/src/service.rs link() uses a proper bridge link operation instead of read_file + write_file", - "After link(a, b), both paths share the same inode identity", - "Link count reflects the number of hard links", - "Writing to one path is visible through the other path", - "Typecheck passes", - "Tests pass" - ], - "priority": 3, - "passes": true, - "notes": "crates/sidecar/src/service.rs:520 implements link() as read-then-write, losing hard-link identity and link-count semantics." - }, - { - "id": "US-004", - "title": "Implement chown and utimes in js_bridge filesystem", - "description": "As a user performing chown() or utimes() on a js_bridge-backed mount, I need these operations to actually update metadata instead of silently no-opping.", - "acceptanceCriteria": [ - "chown() updates the owner/group metadata via the bridge", - "utimes() updates atime/mtime metadata via the bridge", - "stat() after chown/utimes reflects the updated values", - "Typecheck passes", - "Tests pass" - ], - "priority": 4, - "passes": true, - "notes": "crates/sidecar/src/service.rs:538 chown() and :542 utimes() are no-ops returning Ok(())." 
- }, - { - "id": "US-005", - "title": "Add symlink, readlink, link, chmod, chown, utimes support to sandbox_agent plugin", - "description": "As a user of sandbox_agent mounts, I need filesystem operations beyond basic read/write/stat so that tools relying on symlinks, permissions, or timestamps work correctly.", - "acceptanceCriteria": [ - "symlink() creates a symbolic link via the sandbox agent API (or returns a clear not-supported-by-remote error if the API lacks it)", - "read_link() resolves symlinks via the sandbox agent API", - "realpath() resolves remote symlinks instead of just normalizing the path locally", - "link() creates hard links or returns a clear error", - "chmod() updates permissions or returns a clear error", - "chown() updates ownership or returns a clear error", - "utimes() updates timestamps or returns a clear error", - "Typecheck passes", - "Tests pass" - ], - "priority": 5, - "passes": true, - "notes": "crates/sidecar/src/sandbox_agent_plugin.rs:283 realpath() doesn't resolve remote symlinks. Lines 287,293,308,314,320,326 return unsupported. Line 332 truncate() uses full-file buffering." - }, - { - "id": "US-006", - "title": "Improve sandbox_agent truncate to avoid full-file buffering", - "description": "As a user truncating large files on sandbox_agent mounts, I need truncate() to work without reading the entire file into memory.", - "acceptanceCriteria": [ - "truncate() for non-zero lengths does not read the entire file contents", - "truncate() uses a range-aware API call or server-side truncation", - "truncate(path, 0) still works via write_file with empty data", - "Typecheck passes", - "Tests pass" - ], - "priority": 6, - "passes": true, - "notes": "crates/sidecar/src/sandbox_agent_plugin.rs:332 reads entire file, truncates in memory, writes back. Unacceptable for large files." 
- }, - { - "id": "US-007", - "title": "Configure host filesystem bridge for stdio sidecar path", - "description": "As a user of the local/stdin-stdout sidecar workflow, I need the LocalBridge to support host filesystem operations so that bridge-backed host FS behavior works.", - "acceptanceCriteria": [ - "LocalBridge filesystem operations (read_file, write_file, etc.) delegate to the host filesystem instead of returning 'not configured' errors", - "A local sidecar session can read and write files on the host through the bridge", - "Typecheck passes", - "Tests pass" - ], - "priority": 7, - "passes": true, - "notes": "crates/sidecar/src/stdio.rs:190 starts a LocalBridge whose filesystem operations all return 'host filesystem bridge is not configured' errors." - }, - { - "id": "US-008", - "title": "Separate stderr from stdout in openShell output", - "description": "As a developer using openShell(), I need stderr and stdout to be delivered through separate channels so that error output can be distinguished from normal output.", - "acceptanceCriteria": [ - "openShell() routes stderr to a separate handler set, not the same stdoutHandlers", - "Shell onData callback receives only stdout", - "A new onStderr callback (or tagged output) delivers stderr separately", - "Existing tests that consume shell output continue to pass", - "Typecheck passes", - "Tests pass" - ], - "priority": 8, - "passes": true, - "notes": "native-kernel-proxy.ts:368-370 — onStderr handler iterates stdoutHandlers instead of a separate set, merging stderr into stdout." - }, - { - "id": "US-009", - "title": "Support full signal set in signalProcess instead of SIGKILL/SIGTERM only", - "description": "As a developer sending signals to VM processes, I need the sidecar to accept arbitrary POSIX signals so that SIGUSR1, SIGSTOP, SIGCONT, etc. 
work correctly.", - "acceptanceCriteria": [ - "signalProcess() maps signal numbers to their correct POSIX signal names (not just 9→SIGKILL, everything-else→SIGTERM)", - "KillProcess protocol message accepts the full signal name string", - "Sending SIGUSR1 (10), SIGSTOP (19), SIGCONT (18) delivers the correct signal to the guest process", - "Typecheck passes", - "Tests pass" - ], - "priority": 9, - "passes": true, - "notes": "native-kernel-proxy.ts:631 — signal === 9 ? 'SIGKILL' : 'SIGTERM' discards all other signal types." - }, - { - "id": "US-010", - "title": "Add integration test for connectTerminal", - "description": "As a developer, I need test coverage for connectTerminal() to verify it correctly wires stdin/stdout to a PTY-backed shell.", - "acceptanceCriteria": [ - "New test in packages/core/tests/ calls connectTerminal() and verifies a PID is returned", - "Test writes input and verifies output is received", - "Test verifies the shell is functional (e.g., echo command produces output)", - "Typecheck passes", - "Tests pass" - ], - "priority": 10, - "passes": true, - "notes": "connectTerminal() at native-kernel-proxy.ts:400-402 is implemented but has zero test coverage anywhere in the codebase." - }, - { - "id": "US-011", - "title": "Remove or exercise the dead diagnostics() protocol path", - "description": "As a maintainer, I need the diagnostics() client method to either be called from somewhere useful or removed, so there is no dead code in the protocol layer.", - "acceptanceCriteria": [ - "Either: diagnostics() is wired into AgentOs or a health-check path and has a test proving it works", - "Or: diagnostics() method and Diagnostics protocol type are removed from client and protocol.rs", - "No dead protocol paths remain", - "Typecheck passes" - ], - "priority": 11, - "passes": true, - "notes": "native-process-client.ts:970-995 implements diagnostics(). Protocol has Diagnostics request type. Neither is called anywhere." 
- }, - { - "id": "US-012", - "title": "Replace panics with error returns in sidecar service.rs", - "description": "As a sidecar operator, I need unexpected protocol responses to produce errors instead of crashing the process with panic!().", - "acceptanceCriteria": [ - "service.rs:3028 panic on unexpected auth response replaced with Err return", - "service.rs:3043 panic on unexpected session response replaced with Err return", - "service.rs:3067 panic on unexpected VM response replaced with Err return", - "Sidecar does not crash on malformed responses; returns descriptive error instead", - "Typecheck passes", - "Tests pass" - ], - "priority": 12, - "passes": true, - "notes": "Three panic!() calls in service.rs crash the entire sidecar process on unexpected protocol responses instead of returning errors." - }, - { - "id": "US-013", - "title": "Track zombie process count from sidecar instead of hardcoding 0", - "description": "As a developer monitoring VM health, I need zombieTimerCount to reflect the actual number of zombie processes tracked by the sidecar.", - "acceptanceCriteria": [ - "zombieTimerCount queries or is updated from the sidecar's process table", - "After a child process exits without being waited on, zombieTimerCount reflects the zombie", - "After waitpid cleans up, zombieTimerCount decrements", - "Typecheck passes", - "Tests pass" - ], - "priority": 13, - "passes": true, - "notes": "native-kernel-proxy.ts:124 — readonly zombieTimerCount = 0; never updated." 
- } - ] -} diff --git a/scripts/ralph/progress.txt b/scripts/ralph/progress.txt deleted file mode 100644 index 48820b1e9..000000000 --- a/scripts/ralph/progress.txt +++ /dev/null @@ -1,230 +0,0 @@ -# Ralph Progress Log -Started: Sat Apr 4 02:05:35 PM PDT 2026 ---- -## Codebase Patterns -- When `NativeKernel` creates a sidecar VM with `disableDefaultBaseLayer: true`, rely on the sidecar's minimal root for default POSIX directories instead of re-bootstrapping paths like `/bin` and `/usr/bin/env`, or VM creation will fail with `EEXIST`. -- In this workspace, run registry-targeted Vitest files through `packages/core`'s Vitest installation and config with `--root /home/nathan/a5/registry`; invoking `registry/vitest.config.ts` directly fails because the registry package cannot resolve `vitest/config`. -- For sidecar observability tests, poll `findListener()`, `findBoundUdp()`, or `getSignalState()` directly instead of waiting on short-lived `process_output` events; the query itself is the stable readiness signal. -- For sidecar-managed guest processes, let the real execution exit event drive kernel-handle cleanup; routing non-terminating external signals like `SIGUSR1`, `SIGSTOP`, or `SIGCONT` through `KernelVm::kill_process()` hits the stub driver and incorrectly marks the process exited. -- For `js_bridge` mounts, preserve hard-link semantics inside `HostFilesystem` with sidecar-local inode/link tracking; the bridge contract only exposes path-based file primitives and does not provide native hard-link or inode metadata. -- For `js_bridge` mounts, keep ownership and timestamp mutations in `HostFilesystem` sidecar state keyed by the tracked inode; the bridge `FileMetadata` contract only reports `mode`, `size`, and `kind`, so `stat()` must overlay `uid`/`gid`/time fields locally. 
-- For `sandbox_agent` mounts on `sandbox-agent@0.4.2`, the HTTP fs API only exposes basic file/dir primitives; implement symlink/readlink/realpath/link/chmod/chown/utimes through `/v1/processes/run`, and fail with `ENOSYS` when the remote process API or helper runtime is unavailable. -- For `sandbox_agent` mounts, prefer `/v1/processes/run` helpers for mutating filesystem operations that the HTTP fs API cannot do natively, such as non-zero `truncate()`, so large files are handled server-side instead of via full-file buffering. -- For stdio-sidecar `js_bridge` coverage, mount the guest path to the same absolute host temp directory you want to expose; `ScopedHostFilesystem` prefixes mount-relative paths before they reach `LocalBridge`, so matching the guest mount path to the host path gives a direct end-to-end host filesystem check. -- For shell consumers on the native sidecar path, treat `OpenShellOptions.onStderr` as the separate error channel; `ShellHandle.onData` is stdout-only, so terminal-style UIs must wire both if they want a combined display. -- For native-sidecar `connectTerminal()` coverage, mock `process.stdin`/`stdout` listener registration and drive the captured stdin callback directly; the API returns the shell PID immediately and cleans up host-terminal hooks asynchronously when `shell.wait()` settles. -- For sidecar integration tests, prefer supported requests like `CreateVm`, `DisposeVm`, or `GetSignalState` for ownership and lifecycle assertions instead of adding test-only protocol introspection. -- In sidecar service tests, decode `DispatchResult` payloads through small `Result`-returning helpers so malformed fixtures surface `SidecarError::InvalidState` messages instead of `panic!`ing inside shared setup. -- In the kernel process table, `waitpid()` should reap exited entries immediately and cancel their zombie timer; callers that need zombie-count assertions must observe the count before `waitpid`, not after. 
- -## [2026-04-04 14:31:10 PDT] - US-001 -- Implemented focused coverage for sidecar-backed socket and signal-state queries in `packages/core/tests/native-sidecar-process.test.ts`, including direct protocol checks and `NativeKernel` cache checks. -- Fixed `NativeKernel` sidecar VM initialization in `packages/core/src/runtime.ts` so the sidecar bootstrap no longer collides with the minimal root snapshot on paths like `/bin`. -- Files changed: - - `packages/core/src/runtime.ts` - - `packages/core/tests/native-sidecar-process.test.ts` -- **Learnings for future iterations:** - - The sidecar VM builder inserts a minimal root snapshot when `disableDefaultBaseLayer` is enabled and no lowers are provided; that snapshot already contains the standard root directories and `/usr/bin/env`. - - The real sidecar protocol can be integration-tested without the optional WASM fixture build by using short-lived Node programs that open TCP/UDP sockets or emit `__AGENT_OS_SIGNAL_STATE__:` control messages. - - `registry/tests/kernel/cross-runtime-network.test.ts` and `registry/tests/wasmvm/signal-handler.test.ts` currently skip in this workspace because the WASM binaries are not built, so story closure still depends on a fixture-enabled run. ---- -## [2026-04-04 14:34:51 PDT] - US-001 -- Verified the committed `US-001` implementation by running `pnpm --dir /home/nathan/a5/packages/core check-types` and `pnpm --dir /home/nathan/a5/packages/core exec vitest run tests/native-sidecar-process.test.ts`. -- Ran the story's registry coverage via `pnpm --dir /home/nathan/a5/packages/core exec vitest run --config /home/nathan/a5/packages/core/vitest.config.ts --root /home/nathan/a5/registry tests/kernel/cross-runtime-network.test.ts tests/wasmvm/signal-handler.test.ts`; both suites skipped because the WASM fixtures are not built in this workspace, matching the existing note above. -- Marked `US-001` as passing in `prd.json`. 
-- Files changed: - - `prd.json` - - `progress.txt` -- **Learnings for future iterations:** - - In this checkout, run registry-targeted Vitest files through `packages/core`'s Vitest installation and config while overriding `--root /home/nathan/a5/registry`; invoking `registry/vitest.config.ts` directly fails because the registry package cannot resolve `vitest/config`. - - Fixture-gated registry suites still produce useful verification here: a clean skip confirms the code path loads, while a fixture-enabled environment is still needed for end-to-end execution. ---- -## [2026-04-04 14:41:56 PDT] - US-002 -- Added a Rust-side integration test in `crates/sidecar/tests/socket_state_queries.rs` that exercises `FindListener`, `FindBoundUdp`, and `GetSignalState` against a real sidecar VM with live TCP, UDP, and signal-state fixtures. -- Stabilized `packages/core/tests/native-sidecar-process.test.ts` so the query coverage waits on the observability APIs themselves and explicitly kills the long-lived signal-state fixture during cleanup. -- Marked `US-002` as passing in `prd.json`. -- Files changed: - - `crates/sidecar/tests/socket_state_queries.rs` - - `packages/core/tests/native-sidecar-process.test.ts` - - `prd.json` - - `progress.txt` -- **Learnings for future iterations:** - - The Rust sidecar can exercise these observability queries directly in crate tests by creating a JavaScript VM with `env.AGENT_OS_ALLOWED_NODE_BUILTINS` set to `["net","dgram"]`. - - For socket and signal-state coverage, polling the query endpoints is more reliable than treating `process_output` as the readiness contract. - - If a fixture is kept alive with `setInterval()` for stable observation, the test must send an explicit `killProcess()` before waiting for `process_exited`. 
---- -## [2026-04-04 14:50:33 PDT] - US-003 -- Replaced the `js_bridge` hard-link stub in `crates/sidecar/src/service.rs` with sidecar-local inode tracking so linked paths share contents, preserve `ino`/`nlink`, survive writes through either name, and keep working after the original path is removed. -- Added a `js_bridge` mount regression test in `crates/sidecar/src/service.rs` that exercises link creation, shared writes, inode identity, and unlinking the original path through the mounted VM filesystem. -- Verified the story with `cargo check -p agent-os-sidecar` and `cargo test -p agent-os-sidecar`. -- Files changed: - - `crates/sidecar/src/service.rs` - - `scripts/ralph/prd.json` - - `scripts/ralph/progress.txt` -- **Learnings for future iterations:** - - `HostFilesystem` needs to merge sidecar-tracked hard-link aliases into both stat paths and directory listings because the bridge only knows about the single backing path. - - The `RecordingBridge` fixture does not infer parent directories from seeded files; tests that validate `link()` destination parents must seed the containing directory explicitly. - - Removing the canonical hard-link path on a `js_bridge` mount has to rename the single backing bridge file onto a surviving alias before dropping the old path from sidecar state. ---- -## [2026-04-04 14:57:04 PDT] - US-004 -- Implemented sidecar-local `js_bridge` metadata tracking in `crates/sidecar/src/service.rs` so `chown()` and `utimes()` persist `uid`/`gid` and timestamp overrides through `HostFilesystem` and surface them via `stat()`. -- Added a `js_bridge` regression test in `crates/sidecar/src/service.rs` that updates ownership and timestamps across hard-linked paths and verifies both aliases report the shared metadata. -- Verified the story with `cargo check -p agent-os-sidecar`, `cargo test -p agent-os-sidecar`, and `cargo test -p agent-os-sidecar configure_vm_js_bridge_mount_preserves_ -- --nocapture`. 
-- Files changed: - - `crates/sidecar/src/service.rs` - - `scripts/ralph/prd.json` - - `scripts/ralph/progress.txt` -- **Learnings for future iterations:** - - `HostFilesystem::stat()` needs a follow-symlink metadata lookup path that can reuse tracked inode state even when the original user path is just an alias to the tracked target. - - For `js_bridge`, metadata updates should reuse the same tracked inode state as hard-link aliases so `uid`/`gid` and timestamps stay consistent across every linked path. - - `cargo fmt --all --check` currently reports unrelated formatting drift in `crates/execution` and `crates/kernel`, so story verification here should rely on targeted formatting for touched files plus package-specific check/test commands. ---- -## [2026-04-04 15:08:33 PDT] - US-005 -- Implemented process-backed sandbox-agent filesystem fallbacks in `crates/sidecar/src/sandbox_agent_plugin.rs` so `realpath`, `symlink`, `read_link`, `link`, `chmod`, `chown`, and `utimes` work against remote sandboxes even though the direct HTTP fs API only exposes basic file/dir endpoints. -- Added mock `/v1/processes/run` coverage plus regression tests for the happy path and the clear `ENOSYS` fallback when the remote process API is unavailable. -- Verified the story with `cargo fmt --all -- crates/sidecar/src/sandbox_agent_plugin.rs`, `cargo check -p agent-os-sidecar`, `cargo test -p agent-os-sidecar sandbox_agent_plugin -- --nocapture`, and `cargo test -p agent-os-sidecar`. -- Files changed: - - `crates/sidecar/src/sandbox_agent_plugin.rs` - - `CLAUDE.md` - - `scripts/ralph/prd.json` - - `scripts/ralph/progress.txt` -- **Learnings for future iterations:** - - `sandbox-agent@0.4.2` only exposes `entries`, `file`, `mkdir`, `move`, and `stat` over the fs HTTP API, so richer filesystem semantics need a separate helper path. 
- - The sidecar plugin can safely probe `python3`, `python`, then `node` through `/v1/processes/run` and cache the first working runtime for subsequent filesystem helper calls. - - Mock process helpers that execute on the host must rewrite absolute sandbox paths into the mock root and sanitize JSON path results back to guest-visible paths, or symlink/realpath tests accidentally target the host filesystem. ---- -## [2026-04-04 15:12:28 PDT] - US-006 -- Implemented non-zero `truncate()` in `crates/sidecar/src/sandbox_agent_plugin.rs` through the existing remote process helper path, so sandbox-agent mounts now truncate or extend files server-side instead of downloading the whole file into memory. -- Added a regression test that truncates and extends a large file with `max_full_read_bytes` set below the file size, verifies the on-disk result, confirms `/v1/processes/run` is used, and proves no full-file `GET /v1/fs/file` occurs; also verified `truncate(path, 0)` still uses the empty-write fallback. -- Verified the story with `cargo check -p agent-os-sidecar`, `cargo test -p agent-os-sidecar sandbox_agent_plugin -- --nocapture`, and `cargo test -p agent-os-sidecar`. -- Files changed: - - `crates/sidecar/src/sandbox_agent_plugin.rs` - - `scripts/ralph/prd.json` - - `scripts/ralph/progress.txt` -- **Learnings for future iterations:** - - For sandbox-agent mounts, non-zero truncate should go through `/v1/processes/run` instead of the basic fs API because the HTTP surface cannot do ranged or server-side truncation. - - The mock sandbox-agent request log is enough to assert transport behavior, so regression tests can prove a mount operation avoided `/v1/fs/file` without depending on implementation details. - - Keep `truncate(path, 0)` on the direct `write_file` path; it stays simple and does not need the process helper. 
---- -## [2026-04-04 15:19:17 PDT] - US-007 -- Implemented real host-backed filesystem operations in `crates/sidecar/src/stdio.rs` for the stdio `LocalBridge`, covering reads, writes, metadata, directory listing, mkdir/rmdir, rename, symlink/readlink, chmod, truncate, and existence checks instead of the previous “not configured” errors. -- Added an end-to-end stdio binary regression in `crates/sidecar/tests/stdio_binary.rs` that configures a `js_bridge` mount over a host temp directory, reads a pre-seeded host file through the VM, and writes a new file back onto the host. -- Verified the story with `cargo fmt --all -- crates/sidecar/src/stdio.rs crates/sidecar/tests/stdio_binary.rs`, `cargo check -p agent-os-sidecar`, `cargo test -p agent-os-sidecar --test stdio_binary`, and `cargo test -p agent-os-sidecar`. -- Files changed: - - `crates/sidecar/src/stdio.rs` - - `crates/sidecar/tests/stdio_binary.rs` - - `scripts/ralph/prd.json` - - `scripts/ralph/progress.txt` -- **Learnings for future iterations:** - - The stdio sidecar path can now satisfy `js_bridge` host filesystem calls directly from the local host without any extra bridge bootstrap. - - For end-to-end stdio bridge tests, mounting a guest path that exactly matches the host tempdir path is the simplest way to prove `ScopedHostFilesystem` and `LocalBridge` cooperate correctly. - - `LocalBridge::exists()` should use `symlink_metadata()` rather than `Path::exists()` so dangling symlinks still count as existing bridge entries. ---- -## [2026-04-04 15:23:46 PDT] - US-008 -- Implemented separate stderr routing for native sidecar `openShell()` calls by adding `OpenShellOptions.onStderr`, keeping `ShellHandle.onData` stdout-only, and fixing `native-kernel-proxy.ts` to use a dedicated stderr handler set. -- Updated the headless `TerminalHarness` to subscribe to both stdout and stderr so terminal-style tests still render a combined stream when they need one. 
-- Added a native sidecar regression test that opens a shell, writes stdin, and proves stdout and stderr arrive on distinct callbacks. -- Verified the story with `pnpm --dir /home/nathan/a5/packages/core check-types` and `pnpm --dir /home/nathan/a5/packages/core exec vitest run tests/native-sidecar-process.test.ts tests/shell-flat-api.test.ts`. -- Files changed: - - `packages/core/src/runtime.ts` - - `packages/core/src/sidecar/native-kernel-proxy.ts` - - `packages/core/src/test/terminal-harness.ts` - - `packages/core/tests/native-sidecar-process.test.ts` - - `scripts/ralph/prd.json` - - `scripts/ralph/progress.txt` -- **Learnings for future iterations:** - - `openShell()` on the native sidecar path should treat stderr as an opt-in callback on `OpenShellOptions`; that keeps the existing shell handle shape stable while stopping stderr from polluting stdout-only consumers. - - Terminal-oriented helpers such as `TerminalHarness` should explicitly subscribe to both channels if they want interactive stderr to remain visible after the split. - - A stdin-driven `node -e` shell fixture is a reliable regression test here because it avoids races between shell startup and callback registration. ---- -## [2026-04-04 15:39:00 PDT] - US-009 -- Implemented platform-aware signal-number translation in `packages/core/src/sidecar/native-kernel-proxy.ts` so sidecar protocol kills no longer collapse every non-`9` signal to `SIGTERM`. -- Expanded `crates/sidecar/src/service.rs` signal parsing to accept the broader POSIX signal-name set and stopped mirroring external signals into the kernel stub process table, so non-terminating signals no longer appear to exit immediately. -- Added unit coverage for the TypeScript translation helper and Rust parser, plus a real-sidecar regression in `packages/core/tests/native-sidecar-process.test.ts` that verifies `SIGSTOP`/`SIGCONT` over the protocol using the returned host PID. 
-- Verified the story with `cargo check -p agent-os-sidecar`, `cargo test -p agent-os-sidecar parse_signal_accepts_posix_names_and_aliases -- --nocapture`, `pnpm --dir /home/nathan/a5/packages/core check-types`, and `pnpm --dir /home/nathan/a5/packages/core exec vitest run tests/native-sidecar-process.test.ts`. -- Files changed: - - `crates/sidecar/src/service.rs` - - `packages/core/src/sidecar/native-kernel-proxy.ts` - - `packages/core/tests/native-sidecar-process.test.ts` - - `scripts/ralph/prd.json` - - `scripts/ralph/progress.txt` -- **Learnings for future iterations:** - - The sidecar currently tracks guest runtime processes in `KernelVm` with a stub driver handle, so only real execution exit events should mark those entries finished; synthetic `kill_process()` bookkeeping is wrong for non-terminating signals. - - In Node, `os.constants.signals` is the right source for platform-specific numeric-to-name translation, but platform-conditional names require string-indexed access instead of direct typed property indexing. - - For native sidecar signal regressions, `SIGSTOP`/`SIGCONT` are more reliable to validate via the returned host PID and `ps -o state=` than via guest stdout callbacks. ---- -## [2026-04-04 15:45:26 PDT] - US-010 -- Restored native-sidecar `connectTerminal()` host-terminal wiring in `packages/core/src/sidecar/native-kernel-proxy.ts` so it forwards host stdin to the shell, routes stdout through the optional `onData` callback or host stdout, mirrors stderr to host stderr by default, and cleans up terminal listeners after the shell exits while still returning the shell PID immediately. -- Moved `ConnectTerminalOptions` onto the shared runtime types and re-exported it from `packages/core/src/agent-os.ts` so kernel and AgentOs callers both see the `onData` callback contract. 
-- Added a focused integration regression in `packages/core/tests/native-sidecar-process.test.ts` that mocks host terminal hooks, calls `connectTerminal()`, verifies a PID is returned, feeds stdin through the registered host listener, and asserts the echoed output arrives plus cleanup runs. -- Verified the story with `pnpm --dir /home/nathan/a5/packages/core check-types` and `pnpm --dir /home/nathan/a5/packages/core exec vitest run tests/native-sidecar-process.test.ts`. -- Marked `US-010` as passing in `prd.json`. -- Files changed: - - `packages/core/src/runtime.ts` - - `packages/core/src/agent-os.ts` - - `packages/core/src/sidecar/native-kernel-proxy.ts` - - `packages/core/tests/native-sidecar-process.test.ts` - - `scripts/ralph/prd.json` - - `scripts/ralph/progress.txt` -- **Learnings for future iterations:** - - `connectTerminal()` on the native sidecar path returns the shell PID immediately, so listener cleanup must happen in a detached `shell.wait().finally(...)` path rather than around the method return. - - The shared `ConnectTerminalOptions` type belongs in `runtime.ts`; otherwise `AgentOs` and direct `Kernel` consumers drift and `onData` silently disappears from one public surface. - - A Vitest spy on `process.stdin.on("data", ...)` is enough to exercise host-stdin forwarding deterministically without trying to drive the real terminal in CI. ---- -## [2026-04-04 15:54:20 PDT] - US-011 -- Removed the unused diagnostics protocol path from the sidecar TypeScript client, Rust protocol enums/structs, and the Rust service dispatch layer so no dead request or response variants remain. -- Reworked the affected Rust integration tests to assert ownership and lifecycle behavior through supported requests like `CreateVm` and `GetSignalState`, and replaced the old process-count assertion with a real rerun/recreate flow after cleanup. -- Verified the story with `pnpm --dir /home/nathan/a5/packages/core check-types` and `cargo test -p agent-os-sidecar`. 
-- Marked `US-011` as passing in `prd.json`. -- Files changed: - - `packages/core/src/sidecar/native-process-client.ts` - - `crates/sidecar/src/protocol.rs` - - `crates/sidecar/src/service.rs` - - `crates/sidecar/tests/connection_auth.rs` - - `crates/sidecar/tests/kill_cleanup.rs` - - `crates/sidecar/tests/protocol.rs` - - `crates/sidecar/tests/session_isolation.rs` - - `crates/sidecar/tests/vm_lifecycle.rs` - - `scripts/ralph/prd.json` - - `scripts/ralph/progress.txt` -- **Learnings for future iterations:** - - The diagnostics protocol was only acting as a test-only introspection hook, so ownership and cleanup regressions are better covered with real supported requests instead of hidden observability APIs. - - After terminating or disposing a sidecar guest process, a stronger regression is proving the VM or session can still service a fresh `execute()` or `CreateVm` request than checking internal counters. - - Removing a protocol variant requires updating both codec/response-tracker tests and any integration tests that were using it as a convenience assertion path. ---- -## [2026-04-04 15:59:26 PDT] - US-012 -- Replaced the three `service.rs` test-helper `panic!` paths for auth, session, and VM setup responses with `Result`-returning payload decoders that emit descriptive `SidecarError::InvalidState` messages. -- Added focused regressions that construct malformed `DispatchResult` payloads and assert those helpers now return errors instead of crashing. -- Verified the story with `cargo check -p agent-os-sidecar`, `cargo test -p agent-os-sidecar returns_error_for_unexpected_response -- --nocapture`, and `cargo test -p agent-os-sidecar`. -- Marked `US-012` as passing in `prd.json`. 
-- Files changed: - - `crates/sidecar/src/service.rs` - - `scripts/ralph/prd.json` - - `scripts/ralph/progress.txt` -- **Learnings for future iterations:** - - The auth/session/create-VM setup helpers in `crates/sidecar/src/service.rs` should treat unexpected response kinds as `InvalidState` failures so malformed fixtures fail descriptively without aborting the whole test process. - - Small payload-decoder helpers make it easy to unit-test malformed protocol responses directly with synthetic `DispatchResult` values instead of forcing end-to-end setup to reach each branch. - - No `AGENTS.md` files exist under this workspace path today, so reusable sidecar patterns need to be captured in `progress.txt` until module-level agent guidance is added. ---- -## [2026-04-04 16:10:05 PDT] - US-013 -- Implemented a real zombie-count query path from the Rust sidecar through the native TypeScript proxy, replacing the hardcoded `zombieTimerCount = 0` behavior. -- Fixed kernel `waitpid()` semantics so it reaps exited entries immediately and clears their scheduled zombie timer, then added Rust regressions covering the sidecar request and protocol tracker plus a Vitest regression for the proxy refresh path. -- Verified the story with `cargo test -p agent-os-kernel waitpid_resolves_for_exiting_and_already_exited_processes -- --nocapture`, `cargo test -p agent-os-sidecar --lib get_zombie_timer_count_reports_kernel_state_before_and_after_waitpid -- --nocapture`, `cargo test -p agent-os-sidecar --test protocol response_tracker_accepts_zombie_timer_count_responses -- --nocapture`, `pnpm --dir /home/nathan/a5/packages/core check-types`, and `pnpm --dir /home/nathan/a5/packages/core exec vitest run tests/native-sidecar-process.test.ts`. 
-- Files changed: - - `crates/kernel/src/process_table.rs` - - `crates/kernel/src/kernel.rs` - - `crates/kernel/tests/process_table.rs` - - `crates/sidecar/src/protocol.rs` - - `crates/sidecar/src/service.rs` - - `crates/sidecar/tests/protocol.rs` - - `packages/core/src/runtime.ts` - - `packages/core/src/sidecar/native-kernel-proxy.ts` - - `packages/core/src/sidecar/native-process-client.ts` - - `packages/core/tests/native-sidecar-process.test.ts` - - `scripts/ralph/prd.json` - - `scripts/ralph/progress.txt` -- **Learnings for future iterations:** - - The native sidecar path can expose synchronous kernel state like `zombieTimerCount` by returning the last cached value and kicking off an async sidecar refresh on property access, matching the existing `socketTable`/`processTable` pattern. - - `ProcessTable::waitpid()` is the correct place to reap zombies and cancel reaper deadlines; otherwise any exported zombie-count metric stays artificially high after callers have already waited the child. - - No relevant `AGENTS.md` files exist near `crates/kernel`, `crates/sidecar`, or `packages/core`, so reusable guidance for those modules still needs to live in `progress.txt`. ---- diff --git a/se6-module-loading-perf-audit.md b/se6-module-loading-perf-audit.md new file mode 100644 index 000000000..04299b581 --- /dev/null +++ b/se6-module-loading-perf-audit.md @@ -0,0 +1,57 @@ +# `se6` `module-loading-perf` Audit For Agent OS + +Scope: + +- Compared `/home/nathan/se6` `main...module-loading-perf`. +- Covered every substantive change set from `f32fdfd3` through `3a46a184`. +- Collapsed the many generated benchmark output files under `packages/secure-exec/benchmarks/results/module-load/**` into a single artifact row instead of listing every generated JSON/Markdown file separately. +- Line references below are repo-relative and point at the branch tip for `se6` and the current Agent OS checkout in `a6`. 
+ +Status legend: + +- `Applies`: still missing in Agent OS and worth porting. +- `Partially applies`: the idea still matters, but the original implementation targeted old secure-exec/V8 bridge architecture. +- `Mostly absorbed`: largely covered by a native equivalent already; only a narrow remainder is still worth porting. +- `Already absorbed`: Agent OS already has a native equivalent. +- `Compat-only`: only relevant to the legacy compat runtime under `packages/core/src/compat-runtime/**`. +- `No longer applies`: tied to old secure-exec transport/bootstrap mechanics that no longer exist in native Agent OS. +- `Mostly no longer applies`: only small parts remain relevant; the rest was specific to the old repo layout and workflow. +- `Do not port wholesale`: generated or reference data; keep only representative pieces rather than the full tree. +- `Docs-only`: no runtime delta to port. + +## Change Matrix + +| Change set | Status in Agent OS | Secure-exec refs | Agent OS refs | Migration note | +| --- | --- | --- | --- | --- | +| `f32fdfd3` Initial IPC observability + module-load benchmark | `Partially applies` | `se6/packages/secure-exec/benchmarks/module-load/run-module-load-benchmarks.ts:52-58,96-104,458-525`
`se6/packages/secure-exec/benchmarks/module-load/summary.ts:420-520,1155-1183` | `a6/crates/execution/src/benchmark.rs:73-179,333-470`
`a6/crates/execution/src/bin/node-import-bench.rs:1-40`
`a6/crates/execution/src/node_import_cache.rs:53-63,65-176` | Agent OS already has a native node-import benchmark and import-cache metrics, but it does not have the old ndjson/prometheus/summary/comparison artifact stack. | +| `41215f48` `US-001` real Pi CLI end-to-end benchmark workload | `Partially applies` | `se6/packages/secure-exec/benchmarks/module-load/scenario-catalog.ts:149-162` | `a6/scripts/benchmarks/bench-utils.ts:64-192`
`a6/scripts/benchmarks/coldstart.bench.ts:4-16,81-114`
`a6/packages/core/src/agent-os.ts:2359-2416` | Agent OS benchmarks `createSession("pi")`, but not the old standalone NodeRuntime Pi CLI path that secure-exec was timing. | +| `3772f039` `US-002` machine-readable summaries and deltas | `Applies` | `se6/packages/secure-exec/benchmarks/module-load/run-module-load-benchmarks.ts:505-523`
`se6/packages/secure-exec/benchmarks/module-load/summary.ts:2688-2823` | `a6/crates/execution/src/bin/node-import-bench.rs:3-19`
`a6/crates/execution/src/benchmark.rs:73-179` | Current Agent OS benchmark prints Markdown only. There is no native JSON summary or before/after comparison artifact yet. | +| `9adad215` `US-003` raw UDS RTT + per-session phase attribution | `Applies` | `se6/packages/secure-exec/benchmarks/module-load/run-module-load-benchmarks.ts:52-58`
`se6/packages/secure-exec/benchmarks/module-load/summary.ts:437-455,1695-1705,2328-2338,2776-2790` | `a6/crates/execution/src/benchmark.rs:50-61,181-245`
`a6/crates/sidecar/src/protocol.rs:737-817,1054-1095` | Agent OS can validate typed request/response flow, but it does not benchmark transport RTT or break time into create/inject/execute/destroy phases. | +| `6197d51a` `US-011` direct Pi CLI headless execution inside standalone NodeRuntime | `Partially applies` | `se6/packages/secure-exec/benchmarks/module-load/scenario-catalog.ts:149-162` | `a6/packages/core/src/agent-os.ts:2359-2416`
`a6/packages/core/tests/pi-headless.test.ts:129-138` | `createSession("pi")` exists natively, but the old standalone-runtime headless path is not the main Agent OS target. The legacy V8/compat PI headless TODO is still open. | +| `88d1992e` `US-012` JSZip + pdf-lib scenarios | `Applies` | `se6/packages/secure-exec/benchmarks/module-load/scenario-catalog.ts:101-130` | `a6/crates/execution/src/benchmark.rs:422-469`
`a6/crates/execution/tests/benchmark.rs:3-59` | Native Agent OS benchmark still covers only five synthetic scenarios. | +| `4bb099df` `US-013` repeated-session compressed JSZip benchmarking | `Applies` | `se6/packages/secure-exec/benchmarks/module-load/scenario-catalog.ts:118-130`
`se6/packages/secure-exec/benchmarks/module-load/orchestration.ts:17-24,203-253` | `a6/crates/execution/src/benchmark.rs:422-469` | This only makes sense if the richer module-load suite is ported into Agent OS. | +| `4aae2210` `US-004` eliminate repeated `_loadPolyfill` round-trips | `Compat-only` | `se6/packages/core/isolate-runtime/src/inject/require-setup.ts:4024-4041,4048-4090,4573-4589` | `a6/packages/core/src/compat-runtime/kernel/isolate-runtime/require-setup.js:4008-4024`
`a6/crates/execution/src/node_import_cache.rs:65-176` | The native Agent OS Node path no longer uses `_loadPolyfill`; only the legacy compat runtime still does. | +| `15fe7482` `US-014` split `_loadPolyfill` attribution from `__bd:*` dispatch | `Compat-only` | `se6/packages/secure-exec/benchmarks/module-load/summary.ts:1129-1146,1364-1445,2696-2823` | `a6/packages/core/src/compat-runtime/nodejs/bridge.js:7027-7072` | Useful only if compat-runtime observability is ported. It does not map to the native Node executor. | +| `96164a4c` `US-005` shrink `_loadPolyfill` payload transfer with id/hash caching | `Compat-only` | `se6/packages/v8/src/runtime.ts:110-230,729-761,793-805` | `a6/packages/core/src/compat-runtime/kernel/isolate-runtime/require-setup.js:4008-4024`
`a6/packages/core/src/compat-runtime/nodejs/bridge-setup.js:1718-1729` | Native Agent OS uses filesystem-backed loader assets and env wiring, not repeated polyfill payload IPC. | +| `91c688ce` `US-015` restore full-suite `jszip-end-to-end` completion | `Applies` | `se6/packages/secure-exec/benchmarks/module-load/scenario-catalog.ts:125-130`
`se6/packages/secure-exec/benchmarks/module-load/orchestration.ts:17-24,431-470` | `a6/crates/execution/src/benchmark.rs:422-469` | The scenario is simply absent from the native benchmark suite today. | +| `a5f06534` `US-006` stop resending large bridge bootstrap payloads | `No longer applies` | `se6/packages/v8/src/runtime.ts:131-137,620-663,793-805,816-833`
`se6/native/v8-runtime/src/snapshot.rs:143-220` | `a6/crates/execution/src/javascript.rs:364-380,403-550`
`a6/crates/execution/src/node_import_cache.rs:2410-2450` | Agent OS materializes loader/bootstrap files once and passes paths/env to Node. It is not shipping bridge source blobs per execution. | +| `9efe5dc1` `US-016` dedupe static `postRestoreScript` bytes | `No longer applies` | `se6/packages/v8/src/runtime.ts:135-137,614-663,799-833` | `a6/crates/execution/src/javascript.rs:528-550` | There is no native Agent OS equivalent to secure-exec's `postRestoreScript` payload channel. | +| `be83b6fd` `US-007` cache module resolution, package metadata, and filesystem probes | `Already absorbed` | `se6/packages/core/isolate-runtime/src/inject/require-setup.ts:4034-4090` | `a6/crates/execution/src/node_import_cache.rs:53-63,65-176,487-603,690-775` | This is already present natively as `NodeImportCache` with persisted resolution/module-format/package-type caches and validation. | +| `834a057f` `US-008` preload or snapshot hottest bootstrap assets | `Mostly absorbed` | `se6/native/v8-runtime/src/snapshot.rs:15-26,143-220`
`se6/packages/core/isolate-runtime/src/inject/require-setup.ts:4029-4033,4578-4589` | `a6/crates/execution/src/javascript.rs:17-42,329-380,553-619`
`a6/crates/execution/src/node_import_cache.rs:2332-2450` | Agent OS already materializes builtin/polyfill assets and prewarms hot imports. The part that remains unported is the broader warm-pool/snapshot layer. | +| `d22ee524` `US-018` bypass `_loadPolyfill` for hot `__bd:*` bridge-dispatch wrappers | `Compat-only` | `se6/packages/nodejs/src/execution-driver.ts:713-776` | `a6/packages/core/src/compat-runtime/nodejs/bridge.js:7027-7072` | Native Agent OS does not route bridge calls through `__bd:*`. This is only relevant if the compat runtime stays performance-critical. | +| `7f2467c1` `US-019` preload or manifest-cache hot projected package source files | `Partially applies` | `se6/packages/core/isolate-runtime/src/inject/require-setup.ts:4034-4090`
`se6/packages/secure-exec/benchmarks/module-load/scenario-catalog.ts:79-85` | `a6/packages/core/tests/software-projection.test.ts:34-72`
`a6/crates/execution/src/node_import_cache.rs:179-196,487-530,690-775` | Projected packages exist and metadata caching exists, but there is no explicit manifest/preload layer for the hottest projected source files. | +| `ba7f25d5` `US-019` docs-only follow-up | `Docs-only` | `se6/CLAUDE.md`
`se6/scripts/ralph/prd.json`
`se6/scripts/ralph/progress.txt` | `a6/crates/execution/benchmarks/node-import-baseline.md:1-24` | No runtime delta. This was only benchmark planning/progress tracking. | +| `c12515b2` `US-020` explicit session-destroy acknowledgment | `Already absorbed` | `se6/packages/v8/src/runtime.ts:380-389,893-940`
`se6/native/v8-runtime/src/main.rs:227-255` | `a6/crates/sidecar/src/service.rs:1066-1071,1933-2015`
`a6/crates/sidecar/src/protocol.rs:737-817,1054-1095` | Agent OS already has typed request/response expectations and a direct disposal path. There is no ping/pong teardown pattern to port. | +| `1f83e050` `US-009` reduce fixed per-session overhead | `Docs-only` | `se6/CLAUDE.md`
`se6/docs-internal/todo.md`
`se6/scripts/ralph/prd.json`
`se6/scripts/ralph/progress.txt` | `a6/crates/execution/src/benchmark.rs:209-243`
`a6/crates/execution/benchmarks/node-import-baseline.md:21-24` | The secure-exec commit was planning only. The goal still matters in Agent OS, but there is no direct code patch to transplant. | +| `11a33bb0` `US-010` expand benchmark controls for cold, warm, and host comparisons | `Applies` | `se6/packages/secure-exec/benchmarks/module-load/orchestration.ts:17-24,38-50,203-253,431-470`
`se6/packages/secure-exec/benchmarks/module-load/summary.ts:2748-2790` | `a6/crates/execution/src/benchmark.rs:422-469`
`a6/crates/execution/tests/benchmark.rs:16-24` | Native Agent OS has no mode matrix for true cold start, same-session replay, snapshot on/off, or host controls. | +| `71fde781` `US-021` ranked bridge-target hotspots in summaries | `Applies` | `se6/packages/secure-exec/benchmarks/module-load/summary.ts:1364-1445,2696-2705,2823-2832` | `a6/crates/execution/src/benchmark.rs:141-245` | Agent OS currently emits guidance prose, not ranked hotspot tables or target-level deltas. | +| `a8a9fabc` `US-024` stabilize top-level module-load benchmark orchestration | `Applies` | `se6/packages/secure-exec/benchmarks/module-load/orchestration.ts:52-111,431-470` | `a6/crates/execution/src/benchmark.rs:333-350` | The current native harness is simple and single-stage. The old staged orchestration only matters if the broader suite is ported. | +| `0f9e3096` `US-022` isolation microbenchmarks for empty session and hot single imports | `Partially applies` | `se6/packages/secure-exec/benchmarks/module-load/scenario-catalog.ts:21-85` | `a6/crates/execution/src/benchmark.rs:422-469`
`a6/crates/execution/tests/benchmark.rs:16-24` | Agent OS already has `isolate-startup`, `builtin-import`, local-graph, and large-package cases, but not the fuller hot-import microbench set (`stream`, `stream/web`, `crypto`, `zlib`, `assert`, `url`, projected package files). | +| `3a46a184` `US-023` stability + resource-usage reporting | `Partially applies` | `se6/packages/secure-exec/benchmarks/module-load/summary.ts:432-455,1690-1705,2331-2342,2721-2745`
`se6/packages/secure-exec/benchmarks/module-load/run-module-load-benchmarks.ts:467-480` | `a6/scripts/benchmarks/memory.bench.ts:45-56,117-175,206-241`
`a6/crates/execution/src/benchmark.rs:73-179` | Agent OS has a separate memory benchmark, but the native node-import benchmark does not integrate per-scenario stability or host resource usage into one report. | +| Generated benchmark results under `packages/secure-exec/benchmarks/results/module-load/**` | `Do not port wholesale` | `se6/packages/secure-exec/benchmarks/results/module-load/**` | `a6/crates/execution/benchmarks/node-import-baseline.md:1-24` | Keep only representative baselines in Agent OS. The generated result tree is reference data, not code to integrate. | +| Packaging/docs/test plumbing around the old benchmark runner | `Mostly no longer applies` | `se6/packages/secure-exec/tests/module-load-summary.test.ts`
`se6/CLAUDE.md`
`se6/scripts/ralph/prd.json`
`se6/scripts/ralph/progress.txt` | `a6/crates/execution/src/bin/node-import-bench.rs:1-40`
`a6/scripts/benchmarks/coldstart.bench.ts:1-116` | Only port the parts that support a deliberate native benchmark-suite expansion. Most secure-exec package plumbing was specific to the old repo layout and reporting workflow. | + +## What I Would Actually Port + +1. Port the benchmark/reporting pieces first: `US-002`, `US-003`, `US-010`, `US-021`, `US-022`, `US-023`, and the orchestration parts of `US-024`. +2. Then add the missing real-library workloads from `US-012`, `US-013`, and `US-015`. +3. Treat `US-007`, `US-008`, and `US-020` as already largely or fully absorbed by the native Agent OS runtime. +4. Do not port `US-004`, `US-005`, `US-014`, or `US-018` into the native path unless the legacy compat runtime under `packages/core/src/compat-runtime/**` becomes a deliberate optimization target. +5. Do not port generated benchmark output trees wholesale. Keep a small native baseline artifact instead.