From 04f4cedddf94af3b9ef25dc5e5b5966884b0edfd Mon Sep 17 00:00:00 2001 From: MOZGIII Date: Wed, 25 Feb 2026 17:54:39 +0400 Subject: [PATCH 1/5] Split out ir parser into a separate crate --- Cargo.lock | 10 ++++++++++ Cargo.toml | 2 ++ crates/fuzzer/Cargo.toml | 1 + crates/fuzzer/src/harness.rs | 2 +- crates/ir-parser/Cargo.toml | 8 ++++++++ .../waymark_core/ir_parser.rs => ir-parser/src/lib.rs} | 2 +- crates/waymark/Cargo.toml | 1 + crates/waymark/src/bin/soak-harness.rs | 2 +- crates/waymark/src/waymark_core/cli/smoke.rs | 2 +- crates/waymark/src/waymark_core/mod.rs | 1 - 10 files changed, 26 insertions(+), 5 deletions(-) create mode 100644 crates/ir-parser/Cargo.toml rename crates/{waymark/src/waymark_core/ir_parser.rs => ir-parser/src/lib.rs} (99%) diff --git a/Cargo.lock b/Cargo.lock index e57393b9..ef968a32 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3367,6 +3367,7 @@ dependencies = [ "tracing-subscriber", "uuid", "waymark-dag", + "waymark-ir-parser", "waymark-observability-macros", "waymark-proto", ] @@ -3397,6 +3398,15 @@ dependencies = [ "uuid", "waymark", "waymark-dag", + "waymark-ir-parser", +] + +[[package]] +name = "waymark-ir-parser" +version = "0.1.0" +dependencies = [ + "regex", + "waymark-proto", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index ee36ff5d..e7fd75f8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,6 +5,7 @@ members = ["crates/*"] [workspace.dependencies] waymark = { path = "crates/waymark" } waymark-dag = { path = "crates/dag" } +waymark-ir-parser = { path = "crates/ir-parser" } waymark-proto = { path = "crates/proto" } anyhow = "1" @@ -12,6 +13,7 @@ clap = "4.5" proptest = "1.9" prost = "0.12" prost-types = "0.12" +regex = "1" rustc-hash = "2" serde = "1" serde_json = "1" diff --git a/crates/fuzzer/Cargo.toml b/crates/fuzzer/Cargo.toml index e31f9970..4e0fd1e0 100644 --- a/crates/fuzzer/Cargo.toml +++ b/crates/fuzzer/Cargo.toml @@ -14,3 +14,4 @@ uuid = { workspace = true, features = ["serde", "v4"] } tokio = { workspace = true 
} waymark = { workspace = true } waymark-dag = { workspace = true } +waymark-ir-parser = { workspace = true } diff --git a/crates/fuzzer/src/harness.rs b/crates/fuzzer/src/harness.rs index 242d2924..59dc38c2 100644 --- a/crates/fuzzer/src/harness.rs +++ b/crates/fuzzer/src/harness.rs @@ -15,11 +15,11 @@ use waymark::backends::{ MemoryBackend, QueuedInstance, WorkflowRegistration, WorkflowRegistryBackend, }; use waymark::messages::ast as ir; -use waymark::waymark_core::ir_parser::parse_program; use waymark::waymark_core::runloop::{RunLoop, RunLoopSupervisorConfig}; use waymark::waymark_core::runner::RunnerState; use waymark::workers::{ActionCallable, InlineWorkerPool, WorkerPoolError}; use waymark_dag::convert_to_dag; +use waymark_ir_parser::parse_program; pub async fn run_case(case_index: usize, case: &GeneratedCase) -> Result<()> { let program = parse_program(case.source.trim()).map_err(|err| { diff --git a/crates/ir-parser/Cargo.toml b/crates/ir-parser/Cargo.toml new file mode 100644 index 00000000..fee9b094 --- /dev/null +++ b/crates/ir-parser/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "waymark-ir-parser" +version = "0.1.0" +edition = "2024" + +[dependencies] +waymark-proto = { workspace = true } +regex = { workspace = true } diff --git a/crates/waymark/src/waymark_core/ir_parser.rs b/crates/ir-parser/src/lib.rs similarity index 99% rename from crates/waymark/src/waymark_core/ir_parser.rs rename to crates/ir-parser/src/lib.rs index a3f2bbf7..1af43324 100644 --- a/crates/waymark/src/waymark_core/ir_parser.rs +++ b/crates/ir-parser/src/lib.rs @@ -2,7 +2,7 @@ use std::fmt; -use crate::messages::ast as ir; +use waymark_proto::ast as ir; /// Raised when parsing the IR source representation fails. 
#[derive(Debug, Clone)] diff --git a/crates/waymark/Cargo.toml b/crates/waymark/Cargo.toml index d04b394a..83c06cd3 100644 --- a/crates/waymark/Cargo.toml +++ b/crates/waymark/Cargo.toml @@ -19,6 +19,7 @@ path = "src/bin/smoke.rs" [dependencies] waymark-proto = { workspace = true, features = ["serde", "client", "server"] } waymark-dag = { workspace = true } +waymark-ir-parser = { workspace = true } anyhow = "1" axum = "0.8" diff --git a/crates/waymark/src/bin/soak-harness.rs b/crates/waymark/src/bin/soak-harness.rs index 3503fe94..d4e77388 100644 --- a/crates/waymark/src/bin/soak-harness.rs +++ b/crates/waymark/src/bin/soak-harness.rs @@ -34,9 +34,9 @@ use waymark::backends::{ }; use waymark::db; use waymark::messages::ast as ir; -use waymark::waymark_core::ir_parser::parse_program; use waymark::waymark_core::runner::RunnerState; use waymark_dag::{DAG, convert_to_dag}; +use waymark_ir_parser::parse_program; const DEFAULT_DSN: &str = "postgresql://waymark:waymark@127.0.0.1:5433/waymark"; const DEFAULT_WORKFLOW_NAME: &str = "waymark_soak_timeout_mix_v1"; diff --git a/crates/waymark/src/waymark_core/cli/smoke.rs b/crates/waymark/src/waymark_core/cli/smoke.rs index 3625e952..abd34109 100644 --- a/crates/waymark/src/waymark_core/cli/smoke.rs +++ b/crates/waymark/src/waymark_core/cli/smoke.rs @@ -18,11 +18,11 @@ use crate::backends::{ use crate::messages::ast as ir; use crate::waymark_core::dag_viz::render_dag_image; use crate::waymark_core::ir_format::format_program; -use crate::waymark_core::ir_parser::parse_program; use crate::waymark_core::runloop::{RunLoop, RunLoopSupervisorConfig}; use crate::waymark_core::runner::RunnerState; use crate::workers::{PythonWorkerConfig, RemoteWorkerPool}; use waymark_dag::convert_to_dag; +use waymark_ir_parser::parse_program; #[derive(Parser, Debug)] #[command(name = "waymark-smoke", about = "Smoke check core-python components.")] diff --git a/crates/waymark/src/waymark_core/mod.rs b/crates/waymark/src/waymark_core/mod.rs index 
85f0c008..5e3b9090 100644 --- a/crates/waymark/src/waymark_core/mod.rs +++ b/crates/waymark/src/waymark_core/mod.rs @@ -4,7 +4,6 @@ pub mod cli; pub mod commit_barrier; pub mod dag_viz; pub mod ir_format; -pub mod ir_parser; pub mod lock; pub mod runloop; pub mod runner; From 3bc4b79bc9f0b3dad6528612127c5cadd5d076ba Mon Sep 17 00:00:00 2001 From: MOZGIII Date: Wed, 25 Feb 2026 21:37:23 +0400 Subject: [PATCH 2/5] Annotate prost for cargo-shear exclusion --- crates/proto/Cargo.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/crates/proto/Cargo.toml b/crates/proto/Cargo.toml index 277e7df3..79ea21bf 100644 --- a/crates/proto/Cargo.toml +++ b/crates/proto/Cargo.toml @@ -3,6 +3,9 @@ name = "waymark-proto" version = "0.1.0" edition = "2024" +[package.metadata.cargo-shear] +ignored = ["prost"] + [dependencies] prost = "0.12" prost-types = "0.12" From e0bb14fb2e20e30ea4c839a634cb13f9a1eed246 Mon Sep 17 00:00:00 2001 From: MOZGIII Date: Wed, 25 Feb 2026 17:55:14 +0400 Subject: [PATCH 3/5] Split out runner, runner state and backends into separate crates --- Cargo.lock | 158 +- Cargo.toml | 21 + crates/backend-memory/Cargo.toml | 36 + crates/backend-memory/src/core_backend.rs | 158 + .../src/garbage_collector_backend.rs | 15 + crates/backend-memory/src/lib.rs | 111 + .../backend-memory/src/scheduler_backend.rs | 209 ++ crates/backend-memory/src/webapp_backend.rs | 294 ++ .../src/worker_status_backend.rs | 13 + .../src/workflow_registry_backend.rs | 58 + crates/backends-core/Cargo.toml | 15 + crates/backends-core/src/lib.rs | 29 + crates/core-backend/Cargo.toml | 14 + crates/core-backend/src/data.rs | 150 + crates/core-backend/src/lib.rs | 58 + crates/garbage-collector-backend/Cargo.toml | 9 + crates/garbage-collector-backend/src/lib.rs | 20 + crates/observability-macros/src/lib.rs | 4 +- crates/observability/Cargo.toml | 8 + crates/observability/src/lib.rs | 8 + crates/runner-state/Cargo.toml | 16 + crates/runner-state/src/lib.rs | 5 + 
crates/runner-state/src/state.rs | 2206 ++++++++++++ crates/runner-state/src/util.rs | 12 + crates/runner-state/src/value_visitor.rs | 533 +++ crates/runner/Cargo.toml | 24 + crates/runner/src/executor.rs | 3015 +++++++++++++++++ crates/runner/src/expression_evaluator.rs | 1056 ++++++ crates/runner/src/lib.rs | 12 + crates/runner/src/replay.rs | 659 ++++ crates/runner/src/retry.rs | 137 + crates/runner/src/synthetic_exceptions.rs | 90 + crates/scheduler-backend/Cargo.toml | 10 + crates/scheduler-backend/src/lib.rs | 29 + crates/scheduler-core/Cargo.toml | 14 + crates/scheduler-core/src/lib.rs | 6 + crates/scheduler-core/src/types.rs | 139 + crates/scheduler-core/src/utils.rs | 181 + crates/webapp-backend/Cargo.toml | 10 + crates/webapp-backend/src/lib.rs | 54 + crates/webapp-core/Cargo.toml | 9 + crates/webapp-core/src/lib.rs | 299 ++ crates/worker-status-backend/Cargo.toml | 10 + crates/worker-status-backend/src/lib.rs | 32 + crates/workflow-registry-backend/Cargo.toml | 9 + crates/workflow-registry-backend/src/lib.rs | 35 + 46 files changed, 9984 insertions(+), 6 deletions(-) create mode 100644 crates/backend-memory/Cargo.toml create mode 100644 crates/backend-memory/src/core_backend.rs create mode 100644 crates/backend-memory/src/garbage_collector_backend.rs create mode 100644 crates/backend-memory/src/lib.rs create mode 100644 crates/backend-memory/src/scheduler_backend.rs create mode 100644 crates/backend-memory/src/webapp_backend.rs create mode 100644 crates/backend-memory/src/worker_status_backend.rs create mode 100644 crates/backend-memory/src/workflow_registry_backend.rs create mode 100644 crates/backends-core/Cargo.toml create mode 100644 crates/backends-core/src/lib.rs create mode 100644 crates/core-backend/Cargo.toml create mode 100644 crates/core-backend/src/data.rs create mode 100644 crates/core-backend/src/lib.rs create mode 100644 crates/garbage-collector-backend/Cargo.toml create mode 100644 crates/garbage-collector-backend/src/lib.rs create mode 
100644 crates/observability/Cargo.toml create mode 100644 crates/observability/src/lib.rs create mode 100644 crates/runner-state/Cargo.toml create mode 100644 crates/runner-state/src/lib.rs create mode 100644 crates/runner-state/src/state.rs create mode 100644 crates/runner-state/src/util.rs create mode 100644 crates/runner-state/src/value_visitor.rs create mode 100644 crates/runner/Cargo.toml create mode 100644 crates/runner/src/executor.rs create mode 100644 crates/runner/src/expression_evaluator.rs create mode 100644 crates/runner/src/lib.rs create mode 100644 crates/runner/src/replay.rs create mode 100644 crates/runner/src/retry.rs create mode 100644 crates/runner/src/synthetic_exceptions.rs create mode 100644 crates/scheduler-backend/Cargo.toml create mode 100644 crates/scheduler-backend/src/lib.rs create mode 100644 crates/scheduler-core/Cargo.toml create mode 100644 crates/scheduler-core/src/lib.rs create mode 100644 crates/scheduler-core/src/types.rs create mode 100644 crates/scheduler-core/src/utils.rs create mode 100644 crates/webapp-backend/Cargo.toml create mode 100644 crates/webapp-backend/src/lib.rs create mode 100644 crates/webapp-core/Cargo.toml create mode 100644 crates/webapp-core/src/lib.rs create mode 100644 crates/worker-status-backend/Cargo.toml create mode 100644 crates/worker-status-backend/src/lib.rs create mode 100644 crates/workflow-registry-backend/Cargo.toml create mode 100644 crates/workflow-registry-backend/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index ef968a32..4b31811b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1517,9 +1517,9 @@ checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" [[package]] name = "metrics" -version = "0.24.2" +version = "0.24.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25dea7ac8057892855ec285c440160265225438c3c45072613c25a4b26e98ef5" +checksum = "5d5312e9ba3771cfa961b585728215e3d972c950a3eed9252aa093d6301277e8" dependencies = [ "ahash", 
"portable-atomic", @@ -1847,9 +1847,9 @@ checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" [[package]] name = "portable-atomic" -version = "1.11.1" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" [[package]] name = "potential_utf" @@ -3372,6 +3372,48 @@ dependencies = [ "waymark-proto", ] +[[package]] +name = "waymark-backend-memory" +version = "0.1.0" +dependencies = [ + "async-trait", + "chrono", + "rmp-serde", + "serde_json", + "uuid", + "waymark-core-backend", + "waymark-garbage-collector-backend", + "waymark-scheduler-backend", + "waymark-scheduler-core", + "waymark-webapp-backend", + "waymark-webapp-core", + "waymark-worker-status-backend", + "waymark-workflow-registry-backend", +] + +[[package]] +name = "waymark-backends-core" +version = "0.1.0" +dependencies = [ + "serde_json", + "sqlx", + "thiserror", +] + +[[package]] +name = "waymark-core-backend" +version = "0.1.0" +dependencies = [ + "async-trait", + "chrono", + "serde", + "serde_json", + "uuid", + "waymark-backends-core", + "waymark-dag", + "waymark-runner-state", +] + [[package]] name = "waymark-dag" version = "0.1.0" @@ -3401,6 +3443,15 @@ dependencies = [ "waymark-ir-parser", ] +[[package]] +name = "waymark-garbage-collector-backend" +version = "0.1.0" +dependencies = [ + "async-trait", + "chrono", + "waymark-backends-core", +] + [[package]] name = "waymark-ir-parser" version = "0.1.0" @@ -3409,6 +3460,14 @@ dependencies = [ "waymark-proto", ] +[[package]] +name = "waymark-observability" +version = "0.1.0" +dependencies = [ + "tracing", + "waymark-observability-macros", +] + [[package]] name = "waymark-observability-macros" version = "0.1.0" @@ -3429,6 +3488,97 @@ dependencies = [ "tonic-build", ] +[[package]] +name = "waymark-runner" +version = "0.1.0" +dependencies = [ + 
"chrono", + "rustc-hash", + "serde_json", + "thiserror", + "tracing", + "uuid", + "waymark-backend-memory", + "waymark-core-backend", + "waymark-dag", + "waymark-ir-parser", + "waymark-observability", + "waymark-proto", + "waymark-runner-state", +] + +[[package]] +name = "waymark-runner-state" +version = "0.1.0" +dependencies = [ + "chrono", + "serde", + "serde_json", + "thiserror", + "uuid", + "waymark-dag", + "waymark-proto", +] + +[[package]] +name = "waymark-scheduler-backend" +version = "0.1.0" +dependencies = [ + "async-trait", + "uuid", + "waymark-backends-core", + "waymark-scheduler-core", +] + +[[package]] +name = "waymark-scheduler-core" +version = "0.1.0" +dependencies = [ + "chrono", + "cron", + "rand 0.8.5", + "serde", + "uuid", +] + +[[package]] +name = "waymark-webapp-backend" +version = "0.1.0" +dependencies = [ + "async-trait", + "uuid", + "waymark-backends-core", + "waymark-webapp-core", +] + +[[package]] +name = "waymark-webapp-core" +version = "0.1.0" +dependencies = [ + "chrono", + "serde", + "uuid", +] + +[[package]] +name = "waymark-worker-status-backend" +version = "0.1.0" +dependencies = [ + "async-trait", + "chrono", + "uuid", + "waymark-backends-core", +] + +[[package]] +name = "waymark-workflow-registry-backend" +version = "0.1.0" +dependencies = [ + "async-trait", + "uuid", + "waymark-backends-core", +] + [[package]] name = "webpki-roots" version = "0.26.11" diff --git a/Cargo.toml b/Cargo.toml index e7fd75f8..aa2f4ab0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,22 +4,43 @@ members = ["crates/*"] [workspace.dependencies] waymark = { path = "crates/waymark" } +waymark-backend-memory = { path = "crates/backend-memory" } +waymark-backends-core = { path = "crates/backends-core" } +waymark-core-backend = { path = "crates/core-backend" } waymark-dag = { path = "crates/dag" } +waymark-garbage-collector-backend = { path = "crates/garbage-collector-backend" } waymark-ir-parser = { path = "crates/ir-parser" } +waymark-observability = { path = 
"crates/observability" } +waymark-observability-macros = { path = "crates/observability-macros" } waymark-proto = { path = "crates/proto" } +waymark-runner = { path = "crates/runner" } +waymark-runner-state = { path = "crates/runner-state" } +waymark-scheduler-backend = { path = "crates/scheduler-backend" } +waymark-scheduler-core = { path = "crates/scheduler-core" } +waymark-webapp-backend = { path = "crates/webapp-backend" } +waymark-webapp-core = { path = "crates/webapp-core" } +waymark-worker-status-backend = { path = "crates/worker-status-backend" } +waymark-workflow-registry-backend = { path = "crates/workflow-registry-backend" } anyhow = "1" +async-trait = "0.1" +chrono = { version = "0.4", default-features = false } clap = "4.5" +cron = "0.12" proptest = "1.9" prost = "0.12" prost-types = "0.12" +rand = "0.8" regex = "1" +rmp-serde = "1" rustc-hash = "2" serde = "1" serde_json = "1" sha2 = "0.10" +sqlx = { version = "0.8", default-features = false } thiserror = "2" tokio = "1" tonic = "0.11" tonic-build = "0.11" +tracing = "0.1" uuid = "1" diff --git a/crates/backend-memory/Cargo.toml b/crates/backend-memory/Cargo.toml new file mode 100644 index 00000000..4346bbda --- /dev/null +++ b/crates/backend-memory/Cargo.toml @@ -0,0 +1,36 @@ +[package] +name = "waymark-backend-memory" +version = "0.1.0" +edition = "2024" + +[dependencies] +async-trait = { workspace = true } +chrono = { workspace = true } +rmp-serde = { workspace = true } +serde_json = { workspace = true } +uuid = { workspace = true } +waymark-core-backend = { workspace = true } +waymark-garbage-collector-backend = { workspace = true, optional = true } +waymark-scheduler-backend = { workspace = true, optional = true } +waymark-scheduler-core = { workspace = true } +waymark-worker-status-backend = { workspace = true } +waymark-workflow-registry-backend = { workspace = true } +waymark-webapp-backend = { workspace = true, optional = true } +waymark-webapp-core = { workspace = true, optional = true } + 
+[features] +default = [ + "core-backend", + "worker-status-backend", + "workflow-registry-backend", + "scheduler-backend", + "garbage-collector-backend", + "webapp-backend", +] + +core-backend = [] +garbage-collector-backend = ["dep:waymark-garbage-collector-backend"] +scheduler-backend = ["dep:waymark-scheduler-backend"] +worker-status-backend = [] +workflow-registry-backend = [] +webapp-backend = ["dep:waymark-webapp-backend", "dep:waymark-webapp-core"] diff --git a/crates/backend-memory/src/core_backend.rs b/crates/backend-memory/src/core_backend.rs new file mode 100644 index 00000000..d6e2da1e --- /dev/null +++ b/crates/backend-memory/src/core_backend.rs @@ -0,0 +1,158 @@ +use chrono::Utc; +use uuid::Uuid; +use waymark_core_backend::{ + ActionDone, BackendError, BackendResult, GraphUpdate, InstanceDone, InstanceLockStatus, + LockClaim, QueuedInstance, QueuedInstanceBatch, +}; + +#[async_trait::async_trait] +impl waymark_core_backend::CoreBackend for crate::MemoryBackend { + fn clone_box(&self) -> Box { + Box::new(self.clone()) + } + + async fn save_graphs( + &self, + claim: LockClaim, + graphs: &[GraphUpdate], + ) -> BackendResult> { + let mut stored = self.graph_updates.lock().expect("graph updates poisoned"); + stored.extend(graphs.iter().cloned()); + let mut guard = self.instance_locks.lock().expect("instance locks poisoned"); + let mut locks = Vec::with_capacity(graphs.len()); + for graph in graphs { + if let Some((Some(lock_uuid), lock_expires_at)) = guard.get_mut(&graph.instance_id) + && *lock_uuid == claim.lock_uuid + && lock_expires_at.is_none_or(|expires_at| expires_at < claim.lock_expires_at) + { + *lock_expires_at = Some(claim.lock_expires_at); + } + let (lock_uuid, lock_expires_at) = guard + .get(&graph.instance_id) + .cloned() + .unwrap_or((None, None)); + locks.push(InstanceLockStatus { + instance_id: graph.instance_id, + lock_uuid, + lock_expires_at, + }); + } + Ok(locks) + } + + async fn save_actions_done(&self, actions: &[ActionDone]) -> 
BackendResult<()> { + let mut stored = self.actions_done.lock().expect("actions done poisoned"); + stored.extend(actions.iter().cloned()); + Ok(()) + } + + async fn save_instances_done(&self, instances: &[InstanceDone]) -> BackendResult<()> { + let mut stored = self.instances_done.lock().expect("instances done poisoned"); + stored.extend(instances.iter().cloned()); + if !instances.is_empty() { + let mut locks = self.instance_locks.lock().expect("instance locks poisoned"); + for instance in instances { + locks.remove(&instance.executor_id); + } + } + Ok(()) + } + + async fn get_queued_instances( + &self, + size: usize, + claim: LockClaim, + ) -> BackendResult { + if size == 0 { + return Ok(QueuedInstanceBatch { + instances: Vec::new(), + }); + } + let queue = match &self.instance_queue { + Some(queue) => queue, + None => { + return Ok(QueuedInstanceBatch { + instances: Vec::new(), + }); + } + }; + let mut guard = queue.lock().expect("instance queue poisoned"); + let now = Utc::now(); + let mut instances = Vec::new(); + while instances.len() < size { + let Some(instance) = guard.front() else { + break; + }; + if let Some(scheduled_at) = instance.scheduled_at + && scheduled_at > now + { + break; + } + let instance = guard.pop_front().expect("instance queue empty"); + instances.push(instance); + } + if !instances.is_empty() { + let mut locks = self.instance_locks.lock().expect("instance locks poisoned"); + for instance in &instances { + locks.insert( + instance.instance_id, + (Some(claim.lock_uuid), Some(claim.lock_expires_at)), + ); + } + } + Ok(QueuedInstanceBatch { instances }) + } + + async fn queue_instances(&self, instances: &[QueuedInstance]) -> BackendResult<()> { + if instances.is_empty() { + return Ok(()); + } + let queue = self.instance_queue.as_ref().ok_or_else(|| { + BackendError::Message("memory backend missing instance queue".to_string()) + })?; + let mut guard = queue.lock().expect("instance queue poisoned"); + for instance in instances { + 
guard.push_back(instance.clone()); + } + Ok(()) + } + + async fn refresh_instance_locks( + &self, + claim: LockClaim, + instance_ids: &[Uuid], + ) -> BackendResult> { + let mut guard = self.instance_locks.lock().expect("instance locks poisoned"); + let mut locks = Vec::new(); + for instance_id in instance_ids { + let entry = guard + .entry(*instance_id) + .or_insert((Some(claim.lock_uuid), Some(claim.lock_expires_at))); + if entry.0 == Some(claim.lock_uuid) { + entry.1 = Some(claim.lock_expires_at); + } + locks.push(InstanceLockStatus { + instance_id: *instance_id, + lock_uuid: entry.0, + lock_expires_at: entry.1, + }); + } + Ok(locks) + } + + async fn release_instance_locks( + &self, + lock_uuid: Uuid, + instance_ids: &[Uuid], + ) -> BackendResult<()> { + let mut guard = self.instance_locks.lock().expect("instance locks poisoned"); + for instance_id in instance_ids { + if let Some((current_lock, _)) = guard.get(instance_id) + && *current_lock == Some(lock_uuid) + { + guard.remove(instance_id); + } + } + Ok(()) + } +} diff --git a/crates/backend-memory/src/garbage_collector_backend.rs b/crates/backend-memory/src/garbage_collector_backend.rs new file mode 100644 index 00000000..a1274935 --- /dev/null +++ b/crates/backend-memory/src/garbage_collector_backend.rs @@ -0,0 +1,15 @@ +use chrono::{DateTime, Utc}; +use waymark_garbage_collector_backend::{ + BackendResult, GarbageCollectionResult, GarbageCollectorBackend, +}; + +#[async_trait::async_trait] +impl GarbageCollectorBackend for crate::MemoryBackend { + async fn collect_done_instances( + &self, + _older_than: DateTime, + _limit: usize, + ) -> BackendResult { + Ok(GarbageCollectionResult::default()) + } +} diff --git a/crates/backend-memory/src/lib.rs b/crates/backend-memory/src/lib.rs new file mode 100644 index 00000000..e2ef56e4 --- /dev/null +++ b/crates/backend-memory/src/lib.rs @@ -0,0 +1,111 @@ +//! In-memory backend that prints persistence operations. 
+ +#[cfg(feature = "core-backend")] +mod core_backend; + +#[cfg(feature = "garbage-collector-backend")] +mod garbage_collector_backend; + +#[cfg(feature = "scheduler-backend")] +mod scheduler_backend; + +#[cfg(feature = "webapp-backend")] +mod webapp_backend; + +#[cfg(feature = "worker-status-backend")] +mod worker_status_backend; + +#[cfg(feature = "workflow-registry-backend")] +mod workflow_registry_backend; + +use std::collections::{HashMap, VecDeque}; +use std::sync::{Arc, Mutex}; + +use chrono::{DateTime, Utc}; +use uuid::Uuid; + +use waymark_core_backend::{ActionDone, GraphUpdate, InstanceDone, QueuedInstance}; +use waymark_scheduler_core::{ScheduleId, WorkflowSchedule}; +use waymark_worker_status_backend::WorkerStatusUpdate; +use waymark_workflow_registry_backend::WorkflowRegistration; + +type WorkflowVersionKey = (String, String); +type WorkflowVersionValue = (Uuid, WorkflowRegistration); +type WorkflowVersionStore = HashMap; +type InstanceLockStore = HashMap, Option>)>; + +/// Backend that stores updates in memory for tests or local runs. 
+#[derive(Clone)] +pub struct MemoryBackend { + instance_queue: Option>>>, + graph_updates: Arc>>, + actions_done: Arc>>, + instances_done: Arc>>, + worker_status_updates: Arc>>, + #[cfg_attr(not(feature = "workflow-registry-backend"), allow(dead_code))] + workflow_versions: Arc>, + #[cfg_attr(not(feature = "scheduler-backend"), allow(dead_code))] + schedules: Arc>>, + #[cfg_attr(not(feature = "core-backend"), allow(dead_code))] + instance_locks: Arc>, +} + +impl Default for MemoryBackend { + fn default() -> Self { + Self { + instance_queue: None, + graph_updates: Arc::new(Mutex::new(Vec::new())), + actions_done: Arc::new(Mutex::new(Vec::new())), + instances_done: Arc::new(Mutex::new(Vec::new())), + worker_status_updates: Arc::new(Mutex::new(Vec::new())), + workflow_versions: Arc::new(Mutex::new(HashMap::new())), + schedules: Arc::new(Mutex::new(HashMap::new())), + instance_locks: Arc::new(Mutex::new(HashMap::new())), + } + } +} + +impl MemoryBackend { + pub fn new() -> Self { + Self::default() + } + + pub fn with_queue(queue: Arc>>) -> Self { + Self { + instance_queue: Some(queue), + ..Self::default() + } + } + + pub fn instance_queue(&self) -> Option>>> { + self.instance_queue.clone() + } + + pub fn graph_updates(&self) -> Vec { + self.graph_updates + .lock() + .expect("graph updates poisoned") + .clone() + } + + pub fn actions_done(&self) -> Vec { + self.actions_done + .lock() + .expect("actions done poisoned") + .clone() + } + + pub fn instances_done(&self) -> Vec { + self.instances_done + .lock() + .expect("instances done poisoned") + .clone() + } + + pub fn worker_status_updates(&self) -> Vec { + self.worker_status_updates + .lock() + .expect("worker status updates poisoned") + .clone() + } +} diff --git a/crates/backend-memory/src/scheduler_backend.rs b/crates/backend-memory/src/scheduler_backend.rs new file mode 100644 index 00000000..a69aa94e --- /dev/null +++ b/crates/backend-memory/src/scheduler_backend.rs @@ -0,0 +1,209 @@ +use chrono::Utc; +use 
uuid::Uuid; +use waymark_core_backend::{BackendError, BackendResult}; +use waymark_scheduler_backend::SchedulerBackend; +use waymark_scheduler_core::{ + CreateScheduleParams, ScheduleId, ScheduleType, WorkflowSchedule, compute_next_run, +}; + +#[async_trait::async_trait] +impl SchedulerBackend for crate::MemoryBackend { + async fn upsert_schedule(&self, params: &CreateScheduleParams) -> BackendResult { + let mut guard = self.schedules.lock().expect("schedules poisoned"); + let existing_schedule = guard.iter().find_map(|(id, schedule)| { + if schedule.workflow_name == params.workflow_name + && schedule.schedule_name == params.schedule_name + { + Some((*id, schedule.clone())) + } else { + None + } + }); + let schedule_id = existing_schedule + .as_ref() + .map(|(id, _)| *id) + .unwrap_or_else(ScheduleId::new); + let now = Utc::now(); + let next_run_at = match existing_schedule + .as_ref() + .and_then(|(_, schedule)| schedule.next_run_at) + { + Some(next_run_at) => Some(next_run_at), + None => Some( + compute_next_run( + params.schedule_type, + params.cron_expression.as_deref(), + params.interval_seconds, + params.jitter_seconds, + None, + ) + .map_err(BackendError::Message)?, + ), + }; + let schedule = WorkflowSchedule { + id: schedule_id.0, + workflow_name: params.workflow_name.clone(), + schedule_name: params.schedule_name.clone(), + schedule_type: params.schedule_type.as_str().to_string(), + cron_expression: params.cron_expression.clone(), + interval_seconds: params.interval_seconds, + jitter_seconds: params.jitter_seconds, + input_payload: params.input_payload.clone(), + status: "active".to_string(), + next_run_at, + last_run_at: existing_schedule + .as_ref() + .and_then(|(_, schedule)| schedule.last_run_at), + last_instance_id: existing_schedule + .as_ref() + .and_then(|(_, schedule)| schedule.last_instance_id), + created_at: existing_schedule + .as_ref() + .map(|(_, schedule)| schedule.created_at) + .unwrap_or(now), + updated_at: now, + priority: 
params.priority, + allow_duplicate: params.allow_duplicate, + }; + guard.insert(schedule_id, schedule); + Ok(schedule_id) + } + + async fn get_schedule(&self, id: ScheduleId) -> BackendResult { + let guard = self.schedules.lock().expect("schedules poisoned"); + guard + .get(&id) + .cloned() + .ok_or_else(|| BackendError::Message(format!("schedule not found: {id}"))) + } + + async fn get_schedule_by_name( + &self, + workflow_name: &str, + schedule_name: &str, + ) -> BackendResult> { + let guard = self.schedules.lock().expect("schedules poisoned"); + Ok(guard + .values() + .find(|schedule| { + schedule.workflow_name == workflow_name + && schedule.schedule_name == schedule_name + && schedule.status != "deleted" + }) + .cloned()) + } + + async fn list_schedules( + &self, + limit: i64, + offset: i64, + ) -> BackendResult> { + let guard = self.schedules.lock().expect("schedules poisoned"); + let mut schedules: Vec<_> = guard + .values() + .filter(|schedule| schedule.status != "deleted") + .cloned() + .collect(); + schedules.sort_by(|a, b| { + (&a.workflow_name, &a.schedule_name).cmp(&(&b.workflow_name, &b.schedule_name)) + }); + let start = offset.max(0) as usize; + let end = start.saturating_add(limit.max(0) as usize); + Ok(schedules + .into_iter() + .skip(start) + .take(end - start) + .collect()) + } + + async fn count_schedules(&self) -> BackendResult { + let guard = self.schedules.lock().expect("schedules poisoned"); + Ok(guard + .values() + .filter(|schedule| schedule.status != "deleted") + .count() as i64) + } + + async fn update_schedule_status(&self, id: ScheduleId, status: &str) -> BackendResult { + let mut guard = self.schedules.lock().expect("schedules poisoned"); + if let Some(schedule) = guard.get_mut(&id) { + schedule.status = status.to_string(); + schedule.updated_at = Utc::now(); + Ok(true) + } else { + Ok(false) + } + } + + async fn delete_schedule(&self, id: ScheduleId) -> BackendResult { + SchedulerBackend::update_schedule_status(self, id, 
"deleted").await + } + + async fn find_due_schedules(&self, limit: i32) -> BackendResult> { + let guard = self.schedules.lock().expect("schedules poisoned"); + let now = Utc::now(); + let mut schedules: Vec<_> = guard + .values() + .filter(|schedule| { + schedule.status == "active" + && schedule + .next_run_at + .map(|next| next <= now) + .unwrap_or(false) + }) + .cloned() + .collect(); + schedules.sort_by_key(|schedule| schedule.next_run_at); + Ok(schedules.into_iter().take(limit as usize).collect()) + } + + async fn has_running_instance(&self, _schedule_id: ScheduleId) -> BackendResult { + Ok(false) + } + + async fn mark_schedule_executed( + &self, + schedule_id: ScheduleId, + instance_id: Uuid, + ) -> BackendResult<()> { + let mut guard = self.schedules.lock().expect("schedules poisoned"); + let schedule = guard + .get_mut(&schedule_id) + .ok_or_else(|| BackendError::Message(format!("schedule not found: {schedule_id}")))?; + let schedule_type = ScheduleType::parse(&schedule.schedule_type) + .ok_or_else(|| BackendError::Message("invalid schedule type".to_string()))?; + let next_run_at = compute_next_run( + schedule_type, + schedule.cron_expression.as_deref(), + schedule.interval_seconds, + schedule.jitter_seconds, + Some(Utc::now()), + ) + .map_err(BackendError::Message)?; + schedule.last_run_at = Some(Utc::now()); + schedule.last_instance_id = Some(instance_id); + schedule.next_run_at = Some(next_run_at); + schedule.updated_at = Utc::now(); + Ok(()) + } + + async fn skip_schedule_run(&self, schedule_id: ScheduleId) -> BackendResult<()> { + let mut guard = self.schedules.lock().expect("schedules poisoned"); + let schedule = guard + .get_mut(&schedule_id) + .ok_or_else(|| BackendError::Message(format!("schedule not found: {schedule_id}")))?; + let schedule_type = ScheduleType::parse(&schedule.schedule_type) + .ok_or_else(|| BackendError::Message("invalid schedule type".to_string()))?; + let next_run_at = compute_next_run( + schedule_type, + 
schedule.cron_expression.as_deref(), + schedule.interval_seconds, + schedule.jitter_seconds, + Some(Utc::now()), + ) + .map_err(BackendError::Message)?; + schedule.next_run_at = Some(next_run_at); + schedule.updated_at = Utc::now(); + Ok(()) + } +} diff --git a/crates/backend-memory/src/webapp_backend.rs b/crates/backend-memory/src/webapp_backend.rs new file mode 100644 index 00000000..883f4076 --- /dev/null +++ b/crates/backend-memory/src/webapp_backend.rs @@ -0,0 +1,294 @@ +use std::collections::HashMap; + +use chrono::Utc; +use uuid::Uuid; +use waymark_webapp_backend::{BackendError, BackendResult, WebappBackend}; +use waymark_webapp_core::{ + ExecutionGraphView, InstanceDetail, InstanceStatus, InstanceSummary, ScheduleDetail, + ScheduleInvocationSummary, ScheduleSummary, TimelineEntry, WorkerActionRow, + WorkerAggregateStats, WorkerStatus, +}; +use waymark_worker_status_backend::WorkerStatusUpdate; + +#[async_trait::async_trait] +impl WebappBackend for crate::MemoryBackend { + async fn count_instances(&self, _search: Option<&str>) -> BackendResult { + Ok(0) + } + + async fn list_instances( + &self, + _search: Option<&str>, + _limit: i64, + _offset: i64, + ) -> BackendResult> { + Ok(Vec::new()) + } + + async fn get_instance(&self, instance_id: Uuid) -> BackendResult { + Err(BackendError::Message(format!( + "instance not found: {instance_id}" + ))) + } + + async fn get_execution_graph( + &self, + _instance_id: Uuid, + ) -> BackendResult> { + Ok(None) + } + + async fn get_workflow_graph( + &self, + _instance_id: Uuid, + ) -> BackendResult> { + Ok(None) + } + + async fn get_action_results(&self, _instance_id: Uuid) -> BackendResult> { + Ok(Vec::new()) + } + + async fn get_distinct_workflows(&self) -> BackendResult> { + Ok(Vec::new()) + } + + async fn get_distinct_statuses(&self) -> BackendResult> { + Ok(vec![ + InstanceStatus::Queued.to_string(), + InstanceStatus::Running.to_string(), + InstanceStatus::Completed.to_string(), + InstanceStatus::Failed.to_string(), + 
]) + } + + async fn count_schedules(&self) -> BackendResult { + let guard = self.schedules.lock().expect("schedules poisoned"); + Ok(guard + .values() + .filter(|schedule| schedule.status != "deleted") + .count() as i64) + } + + async fn list_schedules(&self, limit: i64, offset: i64) -> BackendResult> { + let guard = self.schedules.lock().expect("schedules poisoned"); + let mut schedules: Vec<_> = guard + .values() + .filter(|schedule| schedule.status != "deleted") + .cloned() + .collect(); + schedules.sort_by(|a, b| { + (&a.workflow_name, &a.schedule_name).cmp(&(&b.workflow_name, &b.schedule_name)) + }); + + let start = offset.max(0) as usize; + let page_limit = limit.max(0) as usize; + Ok(schedules + .into_iter() + .skip(start) + .take(page_limit) + .map(|schedule| ScheduleSummary { + id: schedule.id.to_string(), + workflow_name: schedule.workflow_name, + schedule_name: schedule.schedule_name, + schedule_type: schedule.schedule_type, + cron_expression: schedule.cron_expression, + interval_seconds: schedule.interval_seconds, + status: schedule.status, + next_run_at: schedule.next_run_at.map(|dt| dt.to_rfc3339()), + last_run_at: schedule.last_run_at.map(|dt| dt.to_rfc3339()), + created_at: schedule.created_at.to_rfc3339(), + }) + .collect()) + } + + async fn get_schedule(&self, schedule_id: Uuid) -> BackendResult { + let guard = self.schedules.lock().expect("schedules poisoned"); + let schedule = guard + .values() + .find(|schedule| schedule.id == schedule_id) + .cloned() + .ok_or_else(|| BackendError::Message(format!("schedule not found: {schedule_id}")))?; + + let input_payload = schedule.input_payload.as_ref().and_then(|bytes| { + rmp_serde::from_slice::(bytes) + .ok() + .and_then(|value| serde_json::to_string_pretty(&value).ok()) + }); + + Ok(ScheduleDetail { + id: schedule.id.to_string(), + workflow_name: schedule.workflow_name, + schedule_name: schedule.schedule_name, + schedule_type: schedule.schedule_type, + cron_expression: schedule.cron_expression, + 
interval_seconds: schedule.interval_seconds, + jitter_seconds: schedule.jitter_seconds, + status: schedule.status, + next_run_at: schedule.next_run_at.map(|dt| dt.to_rfc3339()), + last_run_at: schedule.last_run_at.map(|dt| dt.to_rfc3339()), + last_instance_id: schedule.last_instance_id.map(|id| id.to_string()), + created_at: schedule.created_at.to_rfc3339(), + updated_at: schedule.updated_at.to_rfc3339(), + priority: schedule.priority, + allow_duplicate: schedule.allow_duplicate, + input_payload, + }) + } + + async fn count_schedule_invocations(&self, _schedule_id: Uuid) -> BackendResult { + Ok(0) + } + + async fn list_schedule_invocations( + &self, + _schedule_id: Uuid, + _limit: i64, + _offset: i64, + ) -> BackendResult> { + Ok(Vec::new()) + } + + async fn update_schedule_status(&self, schedule_id: Uuid, status: &str) -> BackendResult { + let mut guard = self.schedules.lock().expect("schedules poisoned"); + let Some(schedule) = guard + .values_mut() + .find(|schedule| schedule.id == schedule_id) + else { + return Ok(false); + }; + schedule.status = status.to_string(); + schedule.updated_at = Utc::now(); + Ok(true) + } + + async fn get_distinct_schedule_statuses(&self) -> BackendResult> { + Ok(vec!["active".to_string(), "paused".to_string()]) + } + + async fn get_distinct_schedule_types(&self) -> BackendResult> { + Ok(vec!["cron".to_string(), "interval".to_string()]) + } + + async fn get_worker_action_stats( + &self, + _window_minutes: i64, + ) -> BackendResult> { + let statuses = latest_worker_statuses( + &self + .worker_status_updates + .lock() + .expect("worker status updates poisoned"), + ); + + Ok(statuses + .into_iter() + .map(|status| WorkerActionRow { + pool_id: status.pool_id.to_string(), + active_workers: status.active_workers as i64, + actions_per_sec: format!("{:.1}", status.actions_per_sec), + throughput_per_min: status.throughput_per_min as i64, + total_completed: status.total_completed, + median_dequeue_ms: status.median_dequeue_ms, + 
median_handling_ms: status.median_handling_ms, + last_action_at: status.last_action_at.map(|dt| dt.to_rfc3339()), + updated_at: status.updated_at.to_rfc3339(), + }) + .collect()) + } + + async fn get_worker_aggregate_stats( + &self, + _window_minutes: i64, + ) -> BackendResult { + let statuses = latest_worker_statuses( + &self + .worker_status_updates + .lock() + .expect("worker status updates poisoned"), + ); + + let active_worker_count = statuses + .iter() + .map(|status| status.active_workers as i64) + .sum(); + let total_in_flight = statuses + .iter() + .filter_map(|status| status.total_in_flight) + .sum(); + let total_queue_depth = statuses + .iter() + .filter_map(|status| status.dispatch_queue_size) + .sum(); + let actions_per_sec = statuses + .iter() + .map(|status| status.actions_per_sec) + .sum::(); + + Ok(WorkerAggregateStats { + active_worker_count, + actions_per_sec: format!("{:.1}", actions_per_sec), + total_in_flight, + total_queue_depth, + }) + } + + async fn worker_status_table_exists(&self) -> bool { + !self + .worker_status_updates + .lock() + .expect("worker status updates poisoned") + .is_empty() + } + + async fn schedules_table_exists(&self) -> bool { + !self + .schedules + .lock() + .expect("schedules poisoned") + .is_empty() + } + + async fn get_worker_statuses(&self, _window_minutes: i64) -> BackendResult> { + Ok(latest_worker_statuses( + &self + .worker_status_updates + .lock() + .expect("worker status updates poisoned"), + )) + } +} + +fn latest_worker_statuses(updates: &[WorkerStatusUpdate]) -> Vec { + let mut by_pool: HashMap = HashMap::new(); + for update in updates { + by_pool.insert(update.pool_id, update.clone()); + } + + let now = Utc::now(); + let mut statuses: Vec<_> = by_pool + .into_values() + .map(|status| WorkerStatus { + pool_id: status.pool_id, + active_workers: status.active_workers, + throughput_per_min: status.throughput_per_min, + actions_per_sec: status.actions_per_sec, + total_completed: status.total_completed, + 
last_action_at: status.last_action_at, + updated_at: now, + median_dequeue_ms: status.median_dequeue_ms, + median_handling_ms: status.median_handling_ms, + dispatch_queue_size: Some(status.dispatch_queue_size), + total_in_flight: Some(status.total_in_flight), + median_instance_duration_secs: status.median_instance_duration_secs, + active_instance_count: status.active_instance_count, + total_instances_completed: status.total_instances_completed, + instances_per_sec: status.instances_per_sec, + instances_per_min: status.instances_per_min, + time_series: status.time_series, + }) + .collect(); + + statuses.sort_by(|left, right| right.actions_per_sec.total_cmp(&left.actions_per_sec)); + statuses +} diff --git a/crates/backend-memory/src/worker_status_backend.rs b/crates/backend-memory/src/worker_status_backend.rs new file mode 100644 index 00000000..dbca9794 --- /dev/null +++ b/crates/backend-memory/src/worker_status_backend.rs @@ -0,0 +1,13 @@ +use waymark_worker_status_backend::{BackendResult, WorkerStatusBackend, WorkerStatusUpdate}; + +#[async_trait::async_trait] +impl WorkerStatusBackend for crate::MemoryBackend { + async fn upsert_worker_status(&self, status: &WorkerStatusUpdate) -> BackendResult<()> { + let mut stored = self + .worker_status_updates + .lock() + .expect("worker status updates poisoned"); + stored.push(status.clone()); + Ok(()) + } +} diff --git a/crates/backend-memory/src/workflow_registry_backend.rs b/crates/backend-memory/src/workflow_registry_backend.rs new file mode 100644 index 00000000..e820b5a9 --- /dev/null +++ b/crates/backend-memory/src/workflow_registry_backend.rs @@ -0,0 +1,58 @@ +use uuid::Uuid; +use waymark_workflow_registry_backend::{ + BackendError, BackendResult, WorkflowRegistration, WorkflowRegistryBackend, WorkflowVersion, +}; + +#[async_trait::async_trait] +impl WorkflowRegistryBackend for crate::MemoryBackend { + async fn upsert_workflow_version( + &self, + registration: &WorkflowRegistration, + ) -> BackendResult { + let mut 
guard = self + .workflow_versions + .lock() + .expect("workflow versions poisoned"); + let key = ( + registration.workflow_name.clone(), + registration.workflow_version.clone(), + ); + if let Some((id, existing)) = guard.get(&key) { + if existing.ir_hash != registration.ir_hash { + return Err(BackendError::Message(format!( + "workflow version already exists with different IR hash: {}@{}", + registration.workflow_name, registration.workflow_version + ))); + } + return Ok(*id); + } + + let id = Uuid::new_v4(); + guard.insert(key, (id, registration.clone())); + Ok(id) + } + + async fn get_workflow_versions(&self, ids: &[Uuid]) -> BackendResult> { + if ids.is_empty() { + return Ok(Vec::new()); + } + let guard = self + .workflow_versions + .lock() + .expect("workflow versions poisoned"); + let mut versions = Vec::new(); + for (id, registration) in guard.values() { + if ids.contains(id) { + versions.push(WorkflowVersion { + id: *id, + workflow_name: registration.workflow_name.clone(), + workflow_version: registration.workflow_version.clone(), + ir_hash: registration.ir_hash.clone(), + program_proto: registration.program_proto.clone(), + concurrent: registration.concurrent, + }); + } + } + Ok(versions) + } +} diff --git a/crates/backends-core/Cargo.toml b/crates/backends-core/Cargo.toml new file mode 100644 index 00000000..194062c0 --- /dev/null +++ b/crates/backends-core/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "waymark-backends-core" +version = "0.1.0" +edition = "2024" + +[dependencies] +serde_json = { workspace = true } +thiserror = { workspace = true } +sqlx = { workspace = true, optional = true } + +[features] +default = ["sqlx-error"] + +# TODO: this has to abstracted away since not all backends will use sqlx. +sqlx-error = ["dep:sqlx"] diff --git a/crates/backends-core/src/lib.rs b/crates/backends-core/src/lib.rs new file mode 100644 index 00000000..50f807b4 --- /dev/null +++ b/crates/backends-core/src/lib.rs @@ -0,0 +1,29 @@ +//! 
Core primitives for various waymark subsystem backends. + +/// The common backend error. +/// +/// TODO: move away from a shared notion of backend error to use concrete error +/// type per-operation (rather than per-subsystem or per-crate). +#[derive(Debug, thiserror::Error)] +pub enum BackendError { + #[error("{0}")] + Message(String), + + #[error(transparent)] + Inner(Inner), + + #[error(transparent)] + Serialization(serde_json::Error), +} + +#[cfg(feature = "sqlx-error")] +pub type InnerError = sqlx::Error; + +#[cfg(not(feature = "sqlx-error"))] +pub type InnerError = (); + +/// Utility type alias for backend results. +/// +/// TODO: move away from the single-`Result` type aliases as we want to vary +/// rrors per-call. +pub type BackendResult = Result>; diff --git a/crates/core-backend/Cargo.toml b/crates/core-backend/Cargo.toml new file mode 100644 index 00000000..da2aa394 --- /dev/null +++ b/crates/core-backend/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "waymark-core-backend" +version = "0.1.0" +edition = "2024" + +[dependencies] +async-trait = { workspace = true } +uuid = { workspace = true } +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true } +chrono = { workspace = true } +waymark-runner-state = { workspace = true } +waymark-dag = { workspace = true } +waymark-backends-core = { workspace = true } diff --git a/crates/core-backend/src/data.rs b/crates/core-backend/src/data.rs new file mode 100644 index 00000000..d9320e71 --- /dev/null +++ b/crates/core-backend/src/data.rs @@ -0,0 +1,150 @@ +// The models that we use for our backends are similar to the ones that we +// have specified in our database/Postgres backend, but not 1:1. 
It's better for +// us to internally convert within the given backend + +use std::{ + collections::{HashMap, HashSet}, + sync::Arc, +}; + +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; +use waymark_dag::DAG; +use waymark_runner_state::{ExecutionEdge, ExecutionNode, NodeStatus, RunnerState}; + +#[derive(Clone, Debug, Serialize, Deserialize)] +/// Queued instance payload for the run loop. +pub struct QueuedInstance { + pub workflow_version_id: Uuid, + #[serde(default)] + pub schedule_id: Option, + #[serde(skip, default)] + pub dag: Option>, + pub entry_node: Uuid, + pub state: Option, + #[serde( + default = "default_action_results", + deserialize_with = "deserialize_action_results" + )] + pub action_results: HashMap, + #[serde(default = "default_instance_id")] + pub instance_id: Uuid, + #[serde(default)] + pub scheduled_at: Option>, +} + +#[derive(Clone, Debug)] +/// Result payload for queued instance polling. +pub struct QueuedInstanceBatch { + pub instances: Vec, +} + +#[derive(Clone, Debug)] +/// Lock claim settings for owned instances. +pub struct LockClaim { + pub lock_uuid: Uuid, + pub lock_expires_at: DateTime, +} + +#[derive(Clone, Debug)] +/// Current lock status for an instance. +pub struct InstanceLockStatus { + pub instance_id: Uuid, + pub lock_uuid: Option, + pub lock_expires_at: Option>, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +/// Completed instance payload with result or exception. +pub struct InstanceDone { + pub executor_id: Uuid, + pub entry_node: Uuid, + pub result: Option, + pub error: Option, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +/// Batch payload representing an updated execution graph snapshot. +/// +/// This intentionally stores only runtime nodes and edges (no DAG template or +/// derived caches) so persistence stays lightweight. 
+pub struct GraphUpdate { + pub instance_id: Uuid, + pub nodes: HashMap, + pub edges: HashSet, +} + +impl GraphUpdate { + pub fn from_state(instance_id: Uuid, state: &RunnerState) -> Self { + Self { + instance_id, + nodes: state.nodes.clone(), + edges: state.edges.clone(), + } + } + + pub fn next_scheduled_at(&self) -> DateTime { + let mut next: Option> = None; + for node in self.nodes.values() { + if matches!(node.status, NodeStatus::Completed | NodeStatus::Failed) { + continue; + } + if let Some(scheduled_at) = node.scheduled_at { + next = Some(match next { + Some(existing) => existing.min(scheduled_at), + None => scheduled_at, + }); + } + } + next.unwrap_or_else(Utc::now) + } +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +/// Batch payload representing a finished action attempt (success or failure). +pub struct ActionDone { + pub execution_id: Uuid, + pub attempt: i32, + pub status: ActionAttemptStatus, + pub started_at: Option>, + pub completed_at: Option>, + pub duration_ms: Option, + pub result: serde_json::Value, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ActionAttemptStatus { + Completed, + Failed, + TimedOut, +} + +impl std::fmt::Display for ActionAttemptStatus { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Completed => write!(f, "completed"), + Self::Failed => write!(f, "failed"), + Self::TimedOut => write!(f, "timed_out"), + } + } +} + +fn default_instance_id() -> Uuid { + Uuid::new_v4() +} + +fn default_action_results() -> HashMap { + HashMap::new() +} + +fn deserialize_action_results<'de, D>( + deserializer: D, +) -> Result, D::Error> +where + D: serde::Deserializer<'de>, +{ + let value = Option::>::deserialize(deserializer)?; + Ok(value.unwrap_or_default()) +} diff --git a/crates/core-backend/src/lib.rs b/crates/core-backend/src/lib.rs new file mode 100644 index 00000000..e38f5cdd --- /dev/null +++ 
b/crates/core-backend/src/lib.rs @@ -0,0 +1,58 @@ +//! Core backend traits for waymark. + +mod data; + +use uuid::Uuid; + +pub use waymark_backends_core::{BackendError, BackendResult}; + +pub use self::data::*; + +/// Abstract persistence backend for runner state. +#[async_trait::async_trait] +pub trait CoreBackend: Send + Sync { + fn clone_box(&self) -> Box; + + /// Persist updated execution graphs. + async fn save_graphs( + &self, + claim: LockClaim, + graphs: &[GraphUpdate], + ) -> BackendResult>; + + /// Persist finished action attempts (success or failure). + async fn save_actions_done(&self, actions: &[ActionDone]) -> BackendResult<()>; + + /// Return up to size queued instances without blocking. + async fn get_queued_instances( + &self, + size: usize, + claim: LockClaim, + ) -> BackendResult; + + /// Refresh lock expiry for owned instances. + async fn refresh_instance_locks( + &self, + claim: LockClaim, + instance_ids: &[Uuid], + ) -> BackendResult>; + + /// Release instance locks when evicting from memory. + async fn release_instance_locks( + &self, + lock_uuid: Uuid, + instance_ids: &[Uuid], + ) -> BackendResult<()>; + + /// Persist completed workflow instances. + async fn save_instances_done(&self, instances: &[InstanceDone]) -> BackendResult<()>; + + /// Insert queued instances for run-loop consumption. 
+ async fn queue_instances(&self, instances: &[QueuedInstance]) -> BackendResult<()>; +} + +impl Clone for Box { + fn clone(&self) -> Self { + self.clone_box() + } +} diff --git a/crates/garbage-collector-backend/Cargo.toml b/crates/garbage-collector-backend/Cargo.toml new file mode 100644 index 00000000..e1e4f300 --- /dev/null +++ b/crates/garbage-collector-backend/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "waymark-garbage-collector-backend" +version = "0.1.0" +edition = "2024" + +[dependencies] +async-trait = { workspace = true } +chrono = { workspace = true } +waymark-backends-core = { workspace = true } diff --git a/crates/garbage-collector-backend/src/lib.rs b/crates/garbage-collector-backend/src/lib.rs new file mode 100644 index 00000000..d3f2e234 --- /dev/null +++ b/crates/garbage-collector-backend/src/lib.rs @@ -0,0 +1,20 @@ +use chrono::{DateTime, Utc}; + +pub use waymark_backends_core::{BackendError, BackendResult}; + +#[derive(Clone, Copy, Debug, Default)] +/// Summary of a garbage collection sweep. +pub struct GarbageCollectionResult { + pub deleted_instances: usize, + pub deleted_actions: usize, +} + +/// Backend capability for deleting old finished workflow data. 
+#[async_trait::async_trait] +pub trait GarbageCollectorBackend: Send + Sync { + async fn collect_done_instances( + &self, + older_than: DateTime, + limit: usize, + ) -> BackendResult; +} diff --git a/crates/observability-macros/src/lib.rs b/crates/observability-macros/src/lib.rs index 9fc1df7b..e10c19ec 100644 --- a/crates/observability-macros/src/lib.rs +++ b/crates/observability-macros/src/lib.rs @@ -6,10 +6,10 @@ use syn::{ItemFn, parse_macro_input}; pub fn obs(args: TokenStream, input: TokenStream) -> TokenStream { let mut item = parse_macro_input!(input as ItemFn); let attr = if args.is_empty() { - syn::parse_quote!(#[cfg_attr(feature = "trace", tracing::instrument(skip_all))]) + syn::parse_quote!(#[cfg_attr(feature = "trace", ::waymark_observability::__inner::tracing::instrument(skip_all))]) } else { let args = proc_macro2::TokenStream::from(args); - syn::parse_quote!(#[cfg_attr(feature = "trace", tracing::instrument(#args))]) + syn::parse_quote!(#[cfg_attr(feature = "trace", ::waymark_observability::__inner::tracing::instrument(#args))]) }; item.attrs.push(attr); TokenStream::from(quote!(#item)) diff --git a/crates/observability/Cargo.toml b/crates/observability/Cargo.toml new file mode 100644 index 00000000..bc27b66c --- /dev/null +++ b/crates/observability/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "waymark-observability" +version = "0.1.0" +edition = "2024" + +[dependencies] +waymark-observability-macros = { workspace = true } +tracing = { workspace = true } diff --git a/crates/observability/src/lib.rs b/crates/observability/src/lib.rs new file mode 100644 index 00000000..d2fa50f6 --- /dev/null +++ b/crates/observability/src/lib.rs @@ -0,0 +1,8 @@ +pub use waymark_observability_macros::obs; + +#[doc(hidden)] +pub mod __inner { + pub mod tracing { + pub use tracing::instrument; + } +} diff --git a/crates/runner-state/Cargo.toml b/crates/runner-state/Cargo.toml new file mode 100644 index 00000000..6a64d994 --- /dev/null +++ 
b/crates/runner-state/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "waymark-runner-state" +version = "0.1.0" +edition = "2024" + +[dependencies] +chrono = { workspace = true, features = ["serde", "clock"] } +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true } +thiserror = { workspace = true } +uuid = { workspace = true } +waymark-dag = { workspace = true } +waymark-proto = { workspace = true } + +[features] +trace = [] diff --git a/crates/runner-state/src/lib.rs b/crates/runner-state/src/lib.rs new file mode 100644 index 00000000..5c7ae36b --- /dev/null +++ b/crates/runner-state/src/lib.rs @@ -0,0 +1,5 @@ +mod state; +mod util; +pub mod value_visitor; + +pub use self::state::*; diff --git a/crates/runner-state/src/state.rs b/crates/runner-state/src/state.rs new file mode 100644 index 00000000..da418624 --- /dev/null +++ b/crates/runner-state/src/state.rs @@ -0,0 +1,2206 @@ +//! Execution-time DAG state with unrolled nodes and symbolic values. + +use std::collections::{HashMap, HashSet}; +use std::fmt; +use std::sync::Arc; + +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +use crate::util::is_truthy; +use crate::value_visitor::{ValueExpr, collect_value_sources, resolve_value_tree}; +use waymark_dag::{ + ActionCallNode, AggregatorNode, AssignmentNode, DAG, DAGNode, EdgeType, FnCallNode, JoinNode, + ReturnNode, SleepNode, +}; +use waymark_proto::ast as ir; + +/// Raised when the runner state cannot be updated safely. 
+#[derive(Debug, thiserror::Error)] +#[error("{0}")] +pub struct RunnerStateError(pub String); + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct ActionCallSpec { + pub action_name: String, + pub module_name: Option, + pub kwargs: HashMap, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct LiteralValue { + pub value: serde_json::Value, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct VariableValue { + pub name: String, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct ActionResultValue { + pub node_id: Uuid, + pub action_name: String, + pub iteration_index: Option, + pub result_index: Option, +} + +impl ActionResultValue { + pub fn label(&self) -> String { + let mut label = self.action_name.clone(); + if let Some(idx) = self.iteration_index { + label = format!("{label}[{idx}]"); + } + if let Some(idx) = self.result_index { + label = format!("{label}[{idx}]"); + } + label + } +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct BinaryOpValue { + pub left: Box, + pub op: i32, + pub right: Box, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct UnaryOpValue { + pub op: i32, + pub operand: Box, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct ListValue { + pub elements: Vec, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct DictEntryValue { + pub key: ValueExpr, + pub value: ValueExpr, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct DictValue { + pub entries: Vec, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct IndexValue { + pub object: Box, + pub index: Box, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct DotValue { + pub object: Box, + pub attribute: String, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct FunctionCallValue { + pub name: String, + 
pub args: Vec, + pub kwargs: HashMap, + pub global_function: Option, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct SpreadValue { + pub collection: Box, + pub loop_var: String, + pub action: ActionCallSpec, +} + +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[serde(tag = "type", content = "data")] +pub enum NodeStatus { + Queued, + Running, + Completed, + Failed, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum ExecutionNodeType { + Input, + Output, + Assignment, + ActionCall, + FnCall, + Parallel, + Aggregator, + Branch, + Join, + Return, + Break, + Continue, + Sleep, + Expression, +} + +impl ExecutionNodeType { + pub fn as_str(&self) -> &'static str { + match self { + ExecutionNodeType::Input => "input", + ExecutionNodeType::Output => "output", + ExecutionNodeType::Assignment => "assignment", + ExecutionNodeType::ActionCall => "action_call", + ExecutionNodeType::FnCall => "fn_call", + ExecutionNodeType::Parallel => "parallel", + ExecutionNodeType::Aggregator => "aggregator", + ExecutionNodeType::Branch => "branch", + ExecutionNodeType::Join => "join", + ExecutionNodeType::Return => "return", + ExecutionNodeType::Break => "break", + ExecutionNodeType::Continue => "continue", + ExecutionNodeType::Sleep => "sleep", + ExecutionNodeType::Expression => "expression", + } + } +} + +impl TryFrom<&str> for ExecutionNodeType { + type Error = RunnerStateError; + + fn try_from(value: &str) -> Result { + match value { + "input" => Ok(ExecutionNodeType::Input), + "output" => Ok(ExecutionNodeType::Output), + "assignment" => Ok(ExecutionNodeType::Assignment), + "action_call" => Ok(ExecutionNodeType::ActionCall), + "fn_call" => Ok(ExecutionNodeType::FnCall), + "parallel" => Ok(ExecutionNodeType::Parallel), + "aggregator" => Ok(ExecutionNodeType::Aggregator), + "branch" => Ok(ExecutionNodeType::Branch), + "join" => Ok(ExecutionNodeType::Join), + "return" => Ok(ExecutionNodeType::Return), + "break" => 
Ok(ExecutionNodeType::Break), + "continue" => Ok(ExecutionNodeType::Continue), + "sleep" => Ok(ExecutionNodeType::Sleep), + "expression" => Ok(ExecutionNodeType::Expression), + _ => Err(RunnerStateError(format!( + "unknown execution node type: {value}" + ))), + } + } +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct ExecutionNode { + pub node_id: Uuid, + pub node_type: String, + pub label: String, + pub status: NodeStatus, + pub template_id: Option, + pub targets: Vec, + pub action: Option, + pub value_expr: Option, + pub assignments: HashMap, + pub action_attempt: i32, + #[serde(default)] + pub started_at: Option>, + #[serde(default)] + pub completed_at: Option>, + #[serde(default)] + pub scheduled_at: Option>, +} + +impl ExecutionNode { + pub fn node_type_enum(&self) -> Result { + ExecutionNodeType::try_from(self.node_type.as_str()) + } + + pub fn is_action_call(&self) -> bool { + matches!( + ExecutionNodeType::try_from(self.node_type.as_str()), + Ok(ExecutionNodeType::ActionCall) + ) + } + + pub fn is_sleep(&self) -> bool { + matches!( + ExecutionNodeType::try_from(self.node_type.as_str()), + Ok(ExecutionNodeType::Sleep) + ) + } +} + +#[derive(Clone, Debug, Default)] +pub struct QueueNodeParams { + pub node_id: Option, + pub template_id: Option, + pub targets: Option>, + pub action: Option, + pub value_expr: Option, + pub scheduled_at: Option>, +} + +#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub struct ExecutionEdge { + pub source: Uuid, + pub target: Uuid, + pub edge_type: EdgeType, +} + +/// Track queued/executed DAG nodes with an unrolled, symbolic state. +/// +/// Design overview: +/// - The runner state is not a variable heap; it is the runtime graph itself, +/// unrolled to the exact nodes that have been queued or executed. +/// - Each execution node stores assignments as symbolic expressions so action +/// results can be replayed later without having the concrete payloads. 
+/// - Data-flow edges encode which execution node supplies a value to another, +/// while state-machine edges encode execution ordering and control flow. This +/// mirrors how the ground truth IR->DAG functions. +/// +/// Expected usage: +/// - Callers queue nodes as the program executes (ie. the DAG template is +/// walked) so loops and spreads expand into explicit iterations. +/// - Callers never mutate variables directly; they record assignments on nodes +/// and let replay walk the graph to reconstruct values. +/// - Persisted state can be rehydrated only with nodes/edges. The constructor will +/// rebuild in-memory cache (like timeline ordering and latest assignment tracking). +/// +/// In short, RunnerState is the ground-truth runtime DAG: symbolic assignments +/// plus control/data edges, suitable for replay and visualization. +/// +/// Action nodes represent our "frontier" nodes. Because of how we construct the graph and always +/// greedily walk the state until we hit the next actions that are possible to run, we guarantee that +/// leaf nodes are only ever actions. +/// +/// Cycle walkthrough (mid-loop example): +/// Suppose we are partway through: +/// - results = [] +/// - for item in items: +/// - action_result = @action(item) +/// - results = results + [action_result + 1] +/// +/// On a single iteration update: +/// 1) The runner queues an action node for @action(item). +/// - A new execution node is created with a UUID id. +/// - Its assignments map action_result -> ActionResultValue(node_id). +/// - Data-flow edges are added from the node that last defined `item`. +/// 2) The runner queues the assignment node for results update. +/// - The RHS expression is materialized: +/// results + [action_result + 1] becomes a BinaryOpValue whose tree +/// contains the ActionResultValue from step (1), plus a LiteralValue(1). +/// - Data-flow edges are added from the prior results definition node and +/// from the action node created in step (1). 
+/// - Latest assignment tracking is updated so `results` now points to this +/// new execution node. +/// +/// After this iteration, the state graph has explicit nodes for the current +/// action and the results update. Subsequent iterations repeat the same +/// sequence, producing a chain of assignments where replay can reconstruct the +/// incremental `results` value by following data-flow edges. +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct RunnerState { + #[serde(skip, default)] + pub dag: Option>, + pub nodes: HashMap, + pub edges: HashSet, + pub ready_queue: Vec, + pub timeline: Vec, + link_queued_nodes: bool, + latest_assignments: HashMap, + graph_dirty: bool, +} + +impl RunnerState { + pub fn new( + dag: Option>, + nodes: Option>, + edges: Option>, + link_queued_nodes: bool, + ) -> Self { + let mut state = Self { + dag, + nodes: nodes.unwrap_or_default(), + edges: edges.unwrap_or_default(), + ready_queue: Vec::new(), + timeline: Vec::new(), + link_queued_nodes, + latest_assignments: HashMap::new(), + graph_dirty: false, + }; + if !state.nodes.is_empty() || !state.edges.is_empty() { + state.rehydrate_state(); + } + state + } + + /// TODO: make this `pub(crate)` again + pub fn set_link_queued_nodes(&mut self, value: bool) { + self.link_queued_nodes = value; + } + + /// TODO: make this `pub(crate)` again + pub fn latest_assignment(&self, name: &str) -> Option { + self.latest_assignments.get(name).copied() + } + + /// Queue a runtime node based on the DAG template and apply its effects. + /// + /// Use this when stepping through a compiled DAG so the runtime state mirrors + /// the template node (assignments, action results, and data-flow edges). + /// + /// Example IR: + /// - total = a + b + /// When the AssignmentNode template is queued, the execution node records + /// the symbolic BinaryOpValue and updates data-flow edges from a/b. 
+ pub fn queue_template_node( + &mut self, + template_id: &str, + iteration_index: Option, + ) -> Result { + let dag = self + .dag + .as_ref() + .ok_or_else(|| RunnerStateError("runner state has no DAG template".to_string()))?; + let template = dag + .nodes + .get(template_id) + .ok_or_else(|| RunnerStateError(format!("template node not found: {template_id}")))? + .clone(); + + let node_id = Uuid::new_v4(); + let node = ExecutionNode { + node_id, + node_type: template.node_type().to_string(), + label: template.label(), + status: NodeStatus::Queued, + template_id: Some(template_id.to_string()), + targets: self.node_targets(&template), + action: if let DAGNode::ActionCall(action_node) = &template { + Some(self.action_spec_from_node(action_node)) + } else { + None + }, + value_expr: None, + assignments: HashMap::new(), + action_attempt: if matches!(template, DAGNode::ActionCall(_)) { + 1 + } else { + 0 + }, + started_at: None, + completed_at: None, + scheduled_at: None, + }; + + self.register_node(node.clone())?; + self.apply_template_node(&node, &template, iteration_index)?; + Ok(node) + } + + /// Create a runtime node directly without a DAG template. + /// + /// Use this for ad-hoc nodes (tests, synthetic steps) and as a common + /// builder for higher-level queue helpers like queue_action. 
+ /// + /// Example: + /// - queue_node(node_type="assignment", label="results = []") + pub fn queue_node( + &mut self, + node_type: &str, + label: &str, + params: QueueNodeParams, + ) -> Result { + let node_type_enum = ExecutionNodeType::try_from(node_type)?; + let QueueNodeParams { + node_id, + template_id, + targets, + action, + value_expr, + scheduled_at, + } = params; + let node_id = node_id.unwrap_or_else(Uuid::new_v4); + let action_attempt = if matches!(node_type_enum, ExecutionNodeType::ActionCall) { + 1 + } else { + 0 + }; + let node = ExecutionNode { + node_id, + node_type: node_type.to_string(), + label: label.to_string(), + status: NodeStatus::Queued, + template_id, + targets: targets.unwrap_or_default(), + action, + value_expr, + assignments: HashMap::new(), + action_attempt, + started_at: None, + completed_at: None, + scheduled_at, + }; + self.register_node(node.clone())?; + Ok(node) + } + + /// Queue an action call from IR, respecting a local scope for loop vars. + /// + /// Use this during IR -> runner-state conversion (including spreads) so + /// action arguments are converted to symbolic expressions. + /// + /// Example IR: + /// - @double(value=item) + /// With local_scope={"item": LiteralValue(2)}, the queued action uses a + /// literal argument and links data-flow to the literal's source nodes. 
+ pub fn queue_action_call( + &mut self, + action: &ir::ActionCall, + targets: Option>, + iteration_index: Option, + local_scope: Option<&HashMap>, + ) -> Result { + let spec = self.action_spec_from_ir(action, local_scope); + let node = self.queue_node( + ExecutionNodeType::ActionCall.as_str(), + &format!("@{}()", spec.action_name), + QueueNodeParams { + targets: targets.clone(), + action: Some(spec.clone()), + ..QueueNodeParams::default() + }, + )?; + for value in spec.kwargs.values() { + self.record_data_flow_from_value(node.node_id, value); + } + let result = self.assign_action_results( + &node, + &spec.action_name, + targets.as_deref(), + iteration_index, + true, + )?; + if let Some(node_mut) = self.nodes.get_mut(&node.node_id) { + node_mut.value_expr = Some(ValueExpr::ActionResult(result.clone())); + } + Ok(result) + } + + pub fn mark_running(&mut self, node_id: Uuid) -> Result<(), RunnerStateError> { + let is_action = { + let node = self.get_node_mut(node_id)?; + node.status = NodeStatus::Running; + let is_action = node.is_action_call(); + if is_action { + node.started_at = Some(Utc::now()); + node.completed_at = None; + } + is_action + }; + self.ready_queue.retain(|id| id != &node_id); + if is_action { + self.mark_graph_dirty(); + } + Ok(()) + } + + pub fn mark_completed(&mut self, node_id: Uuid) -> Result<(), RunnerStateError> { + let is_action = { + let node = self.get_node_mut(node_id)?; + node.status = NodeStatus::Completed; + let is_action = node.is_action_call(); + if is_action { + node.completed_at = Some(Utc::now()); + } + node.scheduled_at = None; + is_action + }; + self.ready_queue.retain(|id| id != &node_id); + if is_action { + self.mark_graph_dirty(); + } + Ok(()) + } + + pub fn mark_failed(&mut self, node_id: Uuid) -> Result<(), RunnerStateError> { + let is_action = { + let node = self.get_node_mut(node_id)?; + node.status = NodeStatus::Failed; + let is_action = node.is_action_call(); + if is_action { + node.completed_at = Some(Utc::now()); + } 
+ node.scheduled_at = None; + is_action + }; + self.ready_queue.retain(|id| id != &node_id); + if is_action { + self.mark_graph_dirty(); + } + Ok(()) + } + + pub fn set_node_scheduled_at( + &mut self, + node_id: Uuid, + scheduled_at: Option>, + ) -> Result<(), RunnerStateError> { + let node = self.get_node_mut(node_id)?; + node.scheduled_at = scheduled_at; + self.mark_graph_dirty(); + Ok(()) + } + + pub fn increment_action_attempt(&mut self, node_id: Uuid) -> Result<(), RunnerStateError> { + let node = self.get_node_mut(node_id)?; + if !node.is_action_call() { + return Err(RunnerStateError( + "action attempt increment requires an action_call node".to_string(), + )); + } + node.action_attempt += 1; + self.mark_graph_dirty(); + Ok(()) + } + + /// Return and clear the graph dirty bit for durable execution. + /// + /// Only action nodes and their retry parameters must be persisted; other + /// nodes are deterministic from the ground-truth DAG definition. + pub fn consume_graph_dirty_for_durable_execution(&mut self) -> bool { + let dirty = self.graph_dirty; + self.graph_dirty = false; + dirty + } + + pub fn add_edge(&mut self, source: Uuid, target: Uuid, edge_type: EdgeType) { + self.register_edge(ExecutionEdge { + source, + target, + edge_type, + }); + } + + /// Insert a node into the runtime bookkeeping and optional control flow. + /// + /// Use this for all queued nodes so the ready queue, timeline, and implicit + /// state-machine edge ordering remain consistent. + /// + /// Example: + /// - queue node A then node B with link_queued_nodes=True + /// This creates a state-machine edge A -> B automatically. 
+ fn register_node(&mut self, node: ExecutionNode) -> Result<(), RunnerStateError> { + if self.nodes.contains_key(&node.node_id) { + return Err(RunnerStateError(format!( + "execution node already queued: {}", + node.node_id + ))); + } + self.nodes.insert(node.node_id, node.clone()); + self.ready_queue.push(node.node_id); + if node.is_action_call() { + self.mark_graph_dirty(); + } + if self.link_queued_nodes + && let Some(last) = self.timeline.last() + { + self.register_edge(ExecutionEdge { + source: *last, + target: node.node_id, + edge_type: EdgeType::StateMachine, + }); + } + self.timeline.push(node.node_id); + Ok(()) + } + + fn register_edge(&mut self, edge: ExecutionEdge) { + self.edges.insert(edge); + } + + fn mark_graph_dirty(&mut self) { + self.graph_dirty = true; + } + + /// Rebuild derived structures from persisted nodes and edges. + /// + /// Use this when loading a snapshot so timeline ordering, latest assignment + /// tracking, and ready queue reflect the current node set. + /// + /// Example: + /// - Given nodes {A, B} and edge A -> B, rehydration restores timeline + /// [A, B] and marks the latest assignment targets from node B. 
+ fn rehydrate_state(&mut self) { + self.timeline = self.build_timeline(); + self.latest_assignments.clear(); + for node_id in &self.timeline { + if let Some(node) = self.nodes.get(node_id) { + for target in node.assignments.keys() { + self.latest_assignments.insert(target.clone(), *node_id); + } + } + } + if self.ready_queue.is_empty() { + self.ready_queue = self + .timeline + .iter() + .filter(|node_id| { + self.nodes + .get(node_id) + .map(|node| node.status == NodeStatus::Queued) + .unwrap_or(false) + }) + .cloned() + .collect(); + } + } + + fn build_timeline(&self) -> Vec { + if self.edges.is_empty() { + return self.nodes.keys().cloned().collect(); + } + let mut adjacency: HashMap> = self + .nodes + .keys() + .map(|node_id| (*node_id, Vec::new())) + .collect(); + let mut in_degree: HashMap = + self.nodes.keys().map(|node_id| (*node_id, 0)).collect(); + let mut edges: Vec<&ExecutionEdge> = self.edges.iter().collect(); + edges.sort_by_key(|edge| (edge.source, edge.target)); + for edge in edges { + if edge.edge_type != EdgeType::StateMachine { + continue; + } + if adjacency.contains_key(&edge.source) && adjacency.contains_key(&edge.target) { + adjacency.entry(edge.source).or_default().push(edge.target); + *in_degree.entry(edge.target).or_insert(0) += 1; + } + } + let mut queue: Vec = in_degree + .iter() + .filter(|(_, degree)| **degree == 0) + .map(|(node_id, _)| *node_id) + .collect(); + queue.sort_by_key(|id| id.to_string()); + let mut order: Vec = Vec::new(); + while !queue.is_empty() { + let node_id = queue.remove(0); + order.push(node_id); + if let Some(neighbors) = adjacency.get(&node_id) { + let mut sorted = neighbors.clone(); + sorted.sort_by_key(|id| id.to_string()); + for neighbor in sorted { + if let Some(degree) = in_degree.get_mut(&neighbor) { + *degree -= 1; + if *degree == 0 { + queue.push(neighbor); + } + } + } + queue.sort_by_key(|id| id.to_string()); + } + } + let mut remaining: Vec = self + .nodes + .keys() + .filter(|node_id| 
!order.contains(node_id)) + .cloned() + .collect(); + remaining.sort_by_key(|id| id.to_string()); + order.extend(remaining); + order + } + + fn get_node_mut(&mut self, node_id: Uuid) -> Result<&mut ExecutionNode, RunnerStateError> { + self.nodes + .get_mut(&node_id) + .ok_or_else(|| RunnerStateError(format!("execution node not found: {node_id}"))) + } + + fn node_targets(&self, node: &DAGNode) -> Vec { + match node { + DAGNode::Assignment(AssignmentNode { + targets, target, .. + }) => { + if !targets.is_empty() { + return targets.clone(); + } + target.clone().map(|item| vec![item]).unwrap_or_default() + } + DAGNode::ActionCall(ActionCallNode { + targets, target, .. + }) => { + if let Some(list) = targets + && !list.is_empty() + { + return list.clone(); + } + target.clone().map(|item| vec![item]).unwrap_or_default() + } + DAGNode::FnCall(FnCallNode { + targets, target, .. + }) => { + if let Some(list) = targets + && !list.is_empty() + { + return list.clone(); + } + target.clone().map(|item| vec![item]).unwrap_or_default() + } + DAGNode::Join(JoinNode { + targets, target, .. + }) => { + if let Some(list) = targets + && !list.is_empty() + { + return list.clone(); + } + target.clone().map(|item| vec![item]).unwrap_or_default() + } + DAGNode::Aggregator(AggregatorNode { + targets, target, .. + }) => { + if let Some(list) = targets + && !list.is_empty() + { + return list.clone(); + } + target.clone().map(|item| vec![item]).unwrap_or_default() + } + DAGNode::Return(ReturnNode { + targets, target, .. + }) => { + if let Some(list) = targets + && !list.is_empty() + { + return list.clone(); + } + target.clone().map(|item| vec![item]).unwrap_or_default() + } + _ => Vec::new(), + } + } + + /// Apply DAG template semantics to a queued execution node. + /// + /// Use this right after queue_template_node so assignments, action result + /// references, and data-flow edges are populated from the template. 
+ /// + /// Example IR: + /// - total = @sum(values=items) + /// The ActionCallNode template produces an ActionResultValue and defines + /// total via assignments on the execution node. + fn apply_template_node( + &mut self, + exec_node: &ExecutionNode, + template: &DAGNode, + iteration_index: Option, + ) -> Result<(), RunnerStateError> { + match template { + DAGNode::Assignment(AssignmentNode { + assign_expr: Some(expr), + .. + }) => { + let value_expr = self.expr_to_value(expr, None)?; + if let Some(node_mut) = self.nodes.get_mut(&exec_node.node_id) { + node_mut.value_expr = Some(value_expr.clone()); + } + self.record_data_flow_from_value(exec_node.node_id, &value_expr); + let assignments = + self.build_assignments(&self.node_targets(template), &value_expr)?; + if let Some(node) = self.nodes.get_mut(&exec_node.node_id) { + node.assignments.extend(assignments.clone()); + } + self.mark_latest_assignments(exec_node.node_id, &assignments); + return Ok(()); + } + DAGNode::ActionCall(ActionCallNode { + action_name, + targets, + target, + .. + }) => { + let kwarg_values = self + .nodes + .get(&exec_node.node_id) + .and_then(|node| node.action.as_ref()) + .map(|action| action.kwargs.values().cloned().collect::>()) + .unwrap_or_default(); + for expr in &kwarg_values { + self.record_data_flow_from_value(exec_node.node_id, expr); + } + let targets = targets + .clone() + .or_else(|| target.clone().map(|item| vec![item])); + let result = self.assign_action_results( + exec_node, + action_name, + targets.as_deref(), + iteration_index, + true, + )?; + if let Some(node_mut) = self.nodes.get_mut(&exec_node.node_id) { + node_mut.value_expr = Some(ValueExpr::ActionResult(result)); + } + return Ok(()); + } + DAGNode::Sleep(SleepNode { + duration_expr: Some(expr), + .. 
+ }) => { + let value_expr = self.expr_to_value(expr, None)?; + if let Some(node_mut) = self.nodes.get_mut(&exec_node.node_id) { + node_mut.value_expr = Some(value_expr.clone()); + } + self.record_data_flow_from_value(exec_node.node_id, &value_expr); + return Ok(()); + } + DAGNode::FnCall(FnCallNode { + assign_expr: Some(expr), + .. + }) => { + let value_expr = self.expr_to_value(expr, None)?; + if let Some(node_mut) = self.nodes.get_mut(&exec_node.node_id) { + node_mut.value_expr = Some(value_expr.clone()); + } + self.record_data_flow_from_value(exec_node.node_id, &value_expr); + let assignments = + self.build_assignments(&self.node_targets(template), &value_expr)?; + if let Some(node) = self.nodes.get_mut(&exec_node.node_id) { + node.assignments.extend(assignments.clone()); + } + self.mark_latest_assignments(exec_node.node_id, &assignments); + return Ok(()); + } + DAGNode::Return(ReturnNode { + assign_expr: Some(expr), + target, + .. + }) => { + let value_expr = self.expr_to_value(expr, None)?; + if let Some(node_mut) = self.nodes.get_mut(&exec_node.node_id) { + node_mut.value_expr = Some(value_expr.clone()); + } + self.record_data_flow_from_value(exec_node.node_id, &value_expr); + let target = target.clone().unwrap_or_else(|| "result".to_string()); + let assignments = self.build_assignments(&[target], &value_expr)?; + if let Some(node) = self.nodes.get_mut(&exec_node.node_id) { + node.assignments.extend(assignments.clone()); + } + self.mark_latest_assignments(exec_node.node_id, &assignments); + return Ok(()); + } + _ => {} + } + Ok(()) + } + + /// Create symbolic action results and map them to targets. + /// + /// Use this when an action produces one or more results that are assigned + /// to variables (including tuple unpacking). + /// + /// `update_latest` controls whether assigned targets are published into + /// `latest_assignments` for downstream variable/data-flow resolution. 
+ /// + /// Use `update_latest = true` for user-visible assignments so later nodes + /// can resolve those target names through `latest_assignments`. + /// + /// Use `update_latest = false` for internal/synthetic bindings that should + /// not become globally visible variable definitions. Example: spread action + /// unroll nodes can bind an internal `_spread_result`, and the aggregator + /// later publishes the final user target. + /// + /// Example IR: + /// - a, b = @pair() + /// This yields ActionResultValue(node_id, result_index=0/1) for a and b. + /// + /// TODO: make this `pub(crate)` again + pub fn assign_action_results( + &mut self, + node: &ExecutionNode, + action_name: &str, + targets: Option<&[String]>, + iteration_index: Option, + update_latest: bool, + ) -> Result { + let result_ref = ActionResultValue { + node_id: node.node_id, + action_name: action_name.to_string(), + iteration_index, + result_index: None, + }; + let targets = targets.unwrap_or(&[]); + let assignments = + self.build_assignments(targets, &ValueExpr::ActionResult(result_ref.clone()))?; + if !assignments.is_empty() { + if let Some(node) = self.nodes.get_mut(&node.node_id) { + node.assignments.extend(assignments.clone()); + } + if update_latest { + self.mark_latest_assignments(node.node_id, &assignments); + } + } + Ok(result_ref) + } + + /// Expand an assignment into per-target symbolic values. + /// + /// Use this for single-target assignments, tuple unpacking, and action + /// multi-result binding to keep definitions explicit. + /// + /// Example IR: + /// - a, b = [1, 2] + /// Produces {"a": LiteralValue(1), "b": LiteralValue(2)}. 
+ fn build_assignments( + &self, + targets: &[String], + value: &ValueExpr, + ) -> Result, RunnerStateError> { + if targets.is_empty() { + return Ok(HashMap::new()); + } + if targets.len() == 1 { + let mut map = HashMap::new(); + // Keep single-target assignments symbolic to avoid recursively + // embedding prior values into each update (which can explode + // persisted runner_instances.state size/depth in loops). + map.insert(targets[0].clone(), value.clone()); + return Ok(map); + } + let value = self.materialize_value(value.clone()); + + match value { + ValueExpr::List(ListValue { elements }) => { + if elements.len() != targets.len() { + return Err(RunnerStateError("tuple unpacking mismatch".to_string())); + } + let mut map = HashMap::new(); + for (target, item) in targets.iter().zip(elements.into_iter()) { + map.insert(target.clone(), item); + } + Ok(map) + } + ValueExpr::ActionResult(action_value) => { + let mut map = HashMap::new(); + for (idx, target) in targets.iter().enumerate() { + map.insert( + target.clone(), + ValueExpr::ActionResult(ActionResultValue { + node_id: action_value.node_id, + action_name: action_value.action_name.clone(), + iteration_index: action_value.iteration_index, + result_index: Some(idx as i32), + }), + ); + } + Ok(map) + } + ValueExpr::FunctionCall(func_value) => { + let mut map = HashMap::new(); + for (idx, target) in targets.iter().enumerate() { + map.insert( + target.clone(), + ValueExpr::Index(IndexValue { + object: Box::new(ValueExpr::FunctionCall(func_value.clone())), + index: Box::new(ValueExpr::Literal(LiteralValue { + value: serde_json::Value::Number((idx as i64).into()), + })), + }), + ); + } + Ok(map) + } + ValueExpr::Index(index_value) => { + let mut map = HashMap::new(); + for (idx, target) in targets.iter().enumerate() { + map.insert( + target.clone(), + ValueExpr::Index(IndexValue { + object: Box::new(ValueExpr::Index(index_value.clone())), + index: Box::new(ValueExpr::Literal(LiteralValue { + value: 
serde_json::Value::Number((idx as i64).into()), + })), + }), + ); + } + Ok(map) + } + _ => Err(RunnerStateError("tuple unpacking mismatch".to_string())), + } + } + + /// Inline variable references and apply light constant folding. + /// + /// Use this before storing assignments so values are self-contained and + /// list concatenations are simplified. + /// + /// Example IR: + /// - xs = [1] + /// - ys = xs + [2] + /// Materialization turns ys into ListValue([1, 2]) rather than keeping xs. + pub fn materialize_value(&self, value: ValueExpr) -> ValueExpr { + let resolved = resolve_value_tree(&value, &|name, seen| { + self.resolve_variable_value(name, seen) + }); + if let ValueExpr::BinaryOp(BinaryOpValue { left, op, right }) = &resolved + && ir::BinaryOperator::try_from(*op).ok() == Some(ir::BinaryOperator::BinaryOpAdd) + && let (ValueExpr::List(left_list), ValueExpr::List(right_list)) = (&**left, &**right) + { + let mut elements = left_list.elements.clone(); + elements.extend(right_list.elements.clone()); + return ValueExpr::List(ListValue { elements }); + } + resolved + } + + /// Resolve a variable name to its latest symbolic definition. + /// + /// Use this when materializing expressions so variables become their + /// defining expression while guarding against cycles. + /// + /// Example IR: + /// - x = 1 + /// - y = x + 2 + /// When materializing y, the VariableValue("x") is replaced with the + /// LiteralValue(1), yielding a BinaryOpValue(1 + 2) instead of a reference + /// to x. This makes downstream replay use the symbolic expression rather + /// than requiring a separate variable lookup. 
+ fn resolve_variable_value(&self, name: &str, seen: &mut HashSet) -> ValueExpr { + if seen.contains(name) { + return ValueExpr::Variable(VariableValue { + name: name.to_string(), + }); + } + let node_id = match self.latest_assignments.get(name) { + Some(node_id) => *node_id, + None => { + return ValueExpr::Variable(VariableValue { + name: name.to_string(), + }); + } + }; + let node = match self.nodes.get(&node_id) { + Some(node) => node, + None => { + return ValueExpr::Variable(VariableValue { + name: name.to_string(), + }); + } + }; + let assigned = match node.assignments.get(name) { + Some(value) => value.clone(), + None => { + return ValueExpr::Variable(VariableValue { + name: name.to_string(), + }); + } + }; + // Avoid inlining self-referential updates such as `i = i + 1`. + // Returning the raw assignment here would inject one "extra step" + // into materialized consumers (e.g. loop guards), causing off-by-one + // behavior and deep recursive expression trees. + if value_expr_contains_variable(&assigned, name) { + return ValueExpr::Variable(VariableValue { + name: name.to_string(), + }); + } + if let ValueExpr::Variable(var) = &assigned { + seen.insert(name.to_string()); + return self.resolve_variable_value(&var.name, seen); + } + assigned + } + + /// TODO: make this `pub(crate)` again + pub fn mark_latest_assignments( + &mut self, + node_id: Uuid, + assignments: &HashMap, + ) { + for target in assignments.keys() { + self.latest_assignments.insert(target.clone(), node_id); + } + } + + /// Add data-flow edges implied by a value expression. + /// + /// Use this when a node consumes an expression so upstream dependencies are + /// encoded in the runtime graph. + /// + /// Example IR: + /// - total = @sum(values) + /// A data-flow edge is added from the values assignment node to the action. 
+ /// + /// TODO: make this `pub(crate)` again + pub fn record_data_flow_from_value(&mut self, node_id: Uuid, value: &ValueExpr) { + let source_ids = + collect_value_sources(value, &|name| self.latest_assignments.get(name).copied()); + self.record_data_flow_edges(node_id, &source_ids); + } + + /// Register data-flow edges from sources to the given node. + /// + /// Example: + /// - sources {A, B} and node C produce edges A -> C and B -> C. + fn record_data_flow_edges(&mut self, node_id: Uuid, source_ids: &HashSet) { + for source_id in source_ids { + if *source_id == node_id { + continue; + } + self.register_edge(ExecutionEdge { + source: *source_id, + target: node_id, + edge_type: EdgeType::DataFlow, + }); + } + } + + /// Convert an IR expression into a symbolic ValueExpr tree. + /// + /// Use this when interpreting IR statements or DAG templates into the + /// runtime state; it queues actions and spreads as needed. + /// + /// Example IR: + /// - total = base + 1 + /// Produces BinaryOpValue(VariableValue("base"), LiteralValue(1)). 
+ pub fn expr_to_value( + &mut self, + expr: &ir::Expr, + local_scope: Option<&HashMap>, + ) -> Result { + match expr.kind.as_ref() { + Some(ir::expr::Kind::Literal(lit)) => Ok(ValueExpr::Literal(LiteralValue { + value: literal_value(lit), + })), + Some(ir::expr::Kind::Variable(var)) => { + if let Some(scope) = local_scope + && let Some(value) = scope.get(&var.name) + { + return Ok(value.clone()); + } + Ok(ValueExpr::Variable(VariableValue { + name: var.name.clone(), + })) + } + Some(ir::expr::Kind::BinaryOp(op)) => { + let left = op + .left + .as_ref() + .ok_or_else(|| RunnerStateError("binary op missing left".to_string()))?; + let right = op + .right + .as_ref() + .ok_or_else(|| RunnerStateError("binary op missing right".to_string()))?; + let left_value = self.expr_to_value(left, local_scope)?; + let right_value = self.expr_to_value(right, local_scope)?; + Ok(self.binary_op_value(op.op, left_value, right_value)) + } + Some(ir::expr::Kind::UnaryOp(op)) => { + let operand = op + .operand + .as_ref() + .ok_or_else(|| RunnerStateError("unary op missing operand".to_string()))?; + let operand_value = self.expr_to_value(operand, local_scope)?; + Ok(self.unary_op_value(op.op, operand_value)) + } + Some(ir::expr::Kind::List(list)) => { + let elements = list + .elements + .iter() + .map(|item| self.expr_to_value(item, local_scope)) + .collect::, RunnerStateError>>()?; + Ok(ValueExpr::List(ListValue { elements })) + } + Some(ir::expr::Kind::Dict(dict_expr)) => { + let mut entries = Vec::new(); + for entry in &dict_expr.entries { + let key_expr = entry + .key + .as_ref() + .ok_or_else(|| RunnerStateError("dict entry missing key".to_string()))?; + let value_expr = entry + .value + .as_ref() + .ok_or_else(|| RunnerStateError("dict entry missing value".to_string()))?; + entries.push(DictEntryValue { + key: self.expr_to_value(key_expr, local_scope)?, + value: self.expr_to_value(value_expr, local_scope)?, + }); + } + Ok(ValueExpr::Dict(DictValue { entries })) + } + 
Some(ir::expr::Kind::Index(index)) => { + let object = index + .object + .as_ref() + .ok_or_else(|| RunnerStateError("index access missing object".to_string()))?; + let index_expr = index + .index + .as_ref() + .ok_or_else(|| RunnerStateError("index access missing index".to_string()))?; + let object_value = self.expr_to_value(object, local_scope)?; + let index_value = self.expr_to_value(index_expr, local_scope)?; + Ok(self.index_value(object_value, index_value)) + } + Some(ir::expr::Kind::Dot(dot)) => { + let object = dot + .object + .as_ref() + .ok_or_else(|| RunnerStateError("dot access missing object".to_string()))?; + Ok(ValueExpr::Dot(DotValue { + object: Box::new(self.expr_to_value(object, local_scope)?), + attribute: dot.attribute.clone(), + })) + } + Some(ir::expr::Kind::FunctionCall(call)) => { + let args = call + .args + .iter() + .map(|arg| self.expr_to_value(arg, local_scope)) + .collect::, RunnerStateError>>()?; + let mut kwargs = HashMap::new(); + for kw in &call.kwargs { + if let Some(value) = &kw.value { + kwargs.insert(kw.name.clone(), self.expr_to_value(value, local_scope)?); + } + } + let global_fn = if call.global_function != 0 { + Some(call.global_function) + } else { + None + }; + Ok(ValueExpr::FunctionCall(FunctionCallValue { + name: call.name.clone(), + args, + kwargs, + global_function: global_fn, + })) + } + Some(ir::expr::Kind::ActionCall(action)) => { + let result = self.queue_action_call(action, None, None, local_scope)?; + Ok(ValueExpr::ActionResult(result)) + } + Some(ir::expr::Kind::ParallelExpr(parallel)) => { + let mut calls = Vec::new(); + for call in ¶llel.calls { + calls.push(self.call_to_value(call, local_scope)?); + } + Ok(ValueExpr::List(ListValue { elements: calls })) + } + Some(ir::expr::Kind::SpreadExpr(spread)) => self.spread_expr_value(spread, local_scope), + None => Ok(ValueExpr::Literal(LiteralValue { + value: serde_json::Value::Null, + })), + } + } + + /// Convert an IR call (action/function) into a ValueExpr. 
+ /// + /// Use this for parallel expressions that contain mixed call types. + /// + /// Example IR: + /// - parallel { @double(x), helper(x) } + /// Action calls become ActionResultValue nodes; function calls become + /// FunctionCallValue expressions. + fn call_to_value( + &mut self, + call: &ir::Call, + local_scope: Option<&HashMap>, + ) -> Result { + match call.kind.as_ref() { + Some(ir::call::Kind::Action(action)) => Ok(ValueExpr::ActionResult( + self.queue_action_call(action, None, None, local_scope)?, + )), + Some(ir::call::Kind::Function(function)) => self.expr_to_value( + &ir::Expr { + kind: Some(ir::expr::Kind::FunctionCall(function.clone())), + span: None, + }, + local_scope, + ), + None => Ok(ValueExpr::Literal(LiteralValue { + value: serde_json::Value::Null, + })), + } + } + + /// Materialize a spread expression into concrete calls or a symbolic spread. + /// + /// Use this when converting IR spreads so known list collections unroll to + /// explicit action calls, while unknown collections stay symbolic. + /// + /// Example IR: + /// - spread [1, 2]:item -> @double(value=item) + /// Produces a ListValue of ActionResultValue entries for each item. 
+ fn spread_expr_value( + &mut self, + spread: &ir::SpreadExpr, + local_scope: Option<&HashMap>, + ) -> Result { + let collection = self.expr_to_value( + spread + .collection + .as_ref() + .ok_or_else(|| RunnerStateError("spread collection missing".to_string()))?, + local_scope, + )?; + if let ValueExpr::List(list) = &collection { + let mut results = Vec::new(); + for (idx, item) in list.elements.iter().enumerate() { + let mut scope = HashMap::new(); + scope.insert(spread.loop_var.clone(), item.clone()); + let result = self.queue_action_call( + spread + .action + .as_ref() + .ok_or_else(|| RunnerStateError("spread action missing".to_string()))?, + None, + Some(idx as i32), + Some(&scope), + )?; + results.push(ValueExpr::ActionResult(result)); + } + return Ok(ValueExpr::List(ListValue { elements: results })); + } + + let action_spec = self.action_spec_from_ir( + spread + .action + .as_ref() + .ok_or_else(|| RunnerStateError("spread action missing".to_string()))?, + None, + ); + Ok(ValueExpr::Spread(SpreadValue { + collection: Box::new(collection), + loop_var: spread.loop_var.clone(), + action: action_spec, + })) + } + + /// Build a binary-op value with simple constant folding. + /// + /// Use this when converting IR so literals and list concatenations are + /// simplified early. + /// + /// Example IR: + /// - total = 1 + 2 + /// Produces LiteralValue(3) instead of a BinaryOpValue. 
+    fn binary_op_value(&self, op: i32, left: ValueExpr, right: ValueExpr) -> ValueExpr {
+        // List + list is folded structurally (concatenation) even when elements
+        // are still symbolic; literal folding below requires both sides literal.
+        if ir::BinaryOperator::try_from(op).ok() == Some(ir::BinaryOperator::BinaryOpAdd)
+            && let (ValueExpr::List(left_list), ValueExpr::List(right_list)) = (&left, &right)
+        {
+            let mut elements = left_list.elements.clone();
+            elements.extend(right_list.elements.clone());
+            return ValueExpr::List(ListValue { elements });
+        }
+        if let (ValueExpr::Literal(left_val), ValueExpr::Literal(right_val)) = (&left, &right)
+            && let Some(folded) = fold_literal_binary(op, &left_val.value, &right_val.value)
+        {
+            return ValueExpr::Literal(LiteralValue { value: folded });
+        }
+        ValueExpr::BinaryOp(BinaryOpValue {
+            left: Box::new(left),
+            op,
+            right: Box::new(right),
+        })
+    }
+
+    /// Build a unary-op value with constant folding for literals.
+    ///
+    /// Example IR:
+    /// - neg = -1
+    /// Produces LiteralValue(-1) instead of UnaryOpValue.
+    fn unary_op_value(&self, op: i32, operand: ValueExpr) -> ValueExpr {
+        if let ValueExpr::Literal(lit) = &operand
+            && let Some(folded) = fold_literal_unary(op, &lit.value)
+        {
+            return ValueExpr::Literal(LiteralValue { value: folded });
+        }
+        ValueExpr::UnaryOp(UnaryOpValue {
+            op,
+            operand: Box::new(operand),
+        })
+    }
+
+    /// Build an index value, folding list literals when possible.
+    ///
+    /// Example IR:
+    /// - first = [10, 20][0]
+    /// Produces LiteralValue(10) when the list is fully literal.
+    fn index_value(&self, object: ValueExpr, index: ValueExpr) -> ValueExpr {
+        // Only non-negative, in-bounds integer indexes are folded; negative
+        // (Python-style from-the-end) indexes stay symbolic.
+        if let (ValueExpr::List(list), ValueExpr::Literal(idx)) = (&object, &index)
+            && let Some(idx) = idx.value.as_i64()
+            && idx >= 0
+            && (idx as usize) < list.elements.len()
+        {
+            return list.elements[idx as usize].clone();
+        }
+        ValueExpr::Index(IndexValue {
+            object: Box::new(object),
+            index: Box::new(index),
+        })
+    }
+
+    /// Extract an action call spec from a DAG node.
+    ///
+    /// Use this when queueing nodes from the DAG template.
+    ///
+    /// Example:
+    /// - ActionCallNode(action_name="double", kwargs={"value": "$x"})
+    /// Produces ActionCallSpec(action_name="double", kwargs={"value": VariableValue("x")}).
+    fn action_spec_from_node(&mut self, node: &ActionCallNode) -> ActionCallSpec {
+        let kwargs = node
+            .kwarg_exprs
+            .iter()
+            // NOTE(review): `.unwrap()` panics if `expr_to_value` fails;
+            // consider propagating the error instead.
+            .map(|(name, expr)| (name.clone(), self.expr_to_value(expr, None).unwrap()))
+            .collect();
+        ActionCallSpec {
+            action_name: node.action_name.clone(),
+            module_name: node.module_name.clone(),
+            kwargs,
+        }
+    }
+
+    /// Extract an action call spec from IR, applying local scope bindings.
+    ///
+    /// Example IR:
+    /// - @double(value=item) with local_scope["item"]=LiteralValue(2)
+    /// Produces kwargs {"value": LiteralValue(2)}.
+    fn action_spec_from_ir(
+        &mut self,
+        action: &ir::ActionCall,
+        // NOTE(review): the generic arguments of this HashMap appear stripped
+        // by patch extraction -- confirm the map's key/value types upstream.
+        local_scope: Option<&HashMap>,
+    ) -> ActionCallSpec {
+        let kwargs = action
+            .kwargs
+            .iter()
+            // Kwargs without a value expression are silently skipped.
+            .filter_map(|kw| kw.value.as_ref().map(|value| (kw.name.clone(), value)))
+            .map(|(name, value)| (name, self.expr_to_value(value, local_scope).unwrap()))
+            .collect();
+        ActionCallSpec {
+            action_name: action.action_name.clone(),
+            module_name: action.module_name.clone(),
+            kwargs,
+        }
+    }
+
+    /// Queue an action call from raw parameters and return a symbolic result.
+    ///
+    /// Use this when constructing runner state programmatically without IR
+    /// objects, while still wiring data-flow edges and assignments.
+    ///
+    /// Example:
+    /// - queue_action("double", targets=["out"], kwargs={"value": LiteralValue(2)})
+    /// Defines out via an ActionResultValue and records data-flow from the literal.
+    // NOTE(review): generic arguments in this signature appear stripped by
+    // patch extraction (presumably Option<Vec<String>>, Option<HashMap<String,
+    // ValueExpr>>, Option<String>, an Option<_> iteration index, and a Result
+    // alias) -- restore from the original file before applying.
+    pub fn queue_action(
+        &mut self,
+        action_name: &str,
+        targets: Option>,
+        kwargs: Option>,
+        module_name: Option,
+        iteration_index: Option,
+    ) -> Result {
+        let spec = ActionCallSpec {
+            action_name: action_name.to_string(),
+            module_name,
+            kwargs: kwargs.unwrap_or_default(),
+        };
+        let node = self.queue_node(
+            ExecutionNodeType::ActionCall.as_str(),
+            &format!("@{}()", spec.action_name),
+            QueueNodeParams {
+                targets: targets.clone(),
+                action: Some(spec.clone()),
+                ..QueueNodeParams::default()
+            },
+        )?;
+        // Wire data-flow edges from every kwarg value into the new node.
+        for value in spec.kwargs.values() {
+            self.record_data_flow_from_value(node.node_id, value);
+        }
+        let result = self.assign_action_results(
+            &node,
+            &spec.action_name,
+            targets.as_deref(),
+            iteration_index,
+            true,
+        )?;
+        if let Some(node) = self.nodes.get_mut(&node.node_id) {
+            node.value_expr = Some(ValueExpr::ActionResult(result.clone()));
+        }
+        Ok(result)
+    }
+
+    /// Record an IR assignment as a runtime node with symbolic values.
+    ///
+    /// Use this when interpreting IR statements into the unrolled runtime graph.
+    ///
+    /// Example IR:
+    /// - results = []
+    /// Produces an assignment node with targets ["results"] and a ListValue([]).
+    pub fn record_assignment(
+        &mut self,
+        targets: Vec,
+        expr: &ir::Expr,
+        node_id: Option,
+        label: Option,
+    ) -> Result {
+        // Convert IR to a symbolic ValueExpr first, then delegate.
+        let value_expr = self.expr_to_value(expr, None)?;
+        self.record_assignment_value(targets, value_expr, node_id, label)
+    }
+
+    /// Record a symbolic assignment node and update data-flow/definitions.
+    ///
+    /// Use this for assignments created programmatically after ValueExpr
+    /// construction (tests or state rewrites).
+    ///
+    /// Example:
+    /// - record_assignment_value(targets=["x"], value_expr=LiteralValue(1))
+    /// Creates an assignment node with x bound to LiteralValue(1).
+    // NOTE(review): generic arguments in this signature appear stripped by
+    // patch extraction (presumably Vec<String>, Option<Uuid>, Option<String>,
+    // and a Result alias) -- restore from the original file before applying.
+    pub fn record_assignment_value(
+        &mut self,
+        targets: Vec,
+        value_expr: ValueExpr,
+        node_id: Option,
+        label: Option,
+    ) -> Result {
+        // A caller-provided node id wins; otherwise mint a fresh UUID.
+        let exec_node_id = node_id.unwrap_or_else(Uuid::new_v4);
+        let node = self.queue_node(
+            "assignment",
+            label.as_deref().unwrap_or("assignment"),
+            QueueNodeParams {
+                node_id: Some(exec_node_id),
+                targets: Some(targets.clone()),
+                value_expr: Some(value_expr.clone()),
+                ..QueueNodeParams::default()
+            },
+        )?;
+        self.record_data_flow_from_value(exec_node_id, &value_expr);
+        let assignments = self.build_assignments(&targets, &value_expr)?;
+        if let Some(node_mut) = self.nodes.get_mut(&node.node_id) {
+            node_mut.assignments.extend(assignments.clone());
+        }
+        // Point the per-variable "latest definition" index at this node.
+        self.mark_latest_assignments(node.node_id, &assignments);
+        Ok(node)
+    }
+}
+
+/// Render a ValueExpr to a python-like string for debugging/visualization.
+///
+/// Example:
+/// - BinaryOpValue(VariableValue("a"), +, LiteralValue(1)) -> "a + 1"
+pub fn format_value(expr: &ValueExpr) -> String {
+    format_value_inner(expr, 0)
+}
+
+/// Recursive ValueExpr formatter with operator precedence handling.
+///
+/// Example:
+/// - (a + b) * c renders with parentheses when needed.
+// NOTE(review): several element-type annotations below appear stripped by
+// patch extraction (`let items: Vec = ...` is presumably `Vec<String>`) --
+// restore from the original file before applying.
+fn format_value_inner(expr: &ValueExpr, parent_prec: i32) -> String {
+    match expr {
+        ValueExpr::Literal(lit) => format_literal(&lit.value),
+        ValueExpr::Variable(var) => var.name.clone(),
+        ValueExpr::ActionResult(value) => value.label(),
+        ValueExpr::BinaryOp(value) => {
+            let (op_str, prec) = binary_operator(value.op);
+            let left = format_value_inner(&value.left, prec);
+            // Right side gets prec + 1 so equal-precedence chains render
+            // left-associatively (the right child is parenthesized).
+            let right = format_value_inner(&value.right, prec + 1);
+            let rendered = format!("{left} {op_str} {right}");
+            if prec < parent_prec {
+                format!("({rendered})")
+            } else {
+                rendered
+            }
+        }
+        ValueExpr::UnaryOp(value) => {
+            let (op_str, prec) = unary_operator(value.op);
+            let operand = format_value_inner(&value.operand, prec);
+            let rendered = format!("{op_str}{operand}");
+            if prec < parent_prec {
+                format!("({rendered})")
+            } else {
+                rendered
+            }
+        }
+        ValueExpr::List(value) => {
+            let items: Vec = value
+                .elements
+                .iter()
+                .map(|item| format_value_inner(item, 0))
+                .collect();
+            format!("[{}]", items.join(", "))
+        }
+        ValueExpr::Dict(value) => {
+            let entries: Vec = value
+                .entries
+                .iter()
+                .map(|entry| {
+                    format!(
+                        "{}: {}",
+                        format_value_inner(&entry.key, 0),
+                        format_value_inner(&entry.value, 0)
+                    )
+                })
+                .collect();
+            format!("{{{}}}", entries.join(", "))
+        }
+        ValueExpr::Index(value) => {
+            let prec = precedence("index");
+            let obj = format_value_inner(&value.object, prec);
+            // The index expression resets precedence; it is bracketed anyway.
+            let idx = format_value_inner(&value.index, 0);
+            let rendered = format!("{obj}[{idx}]");
+            if prec < parent_prec {
+                format!("({rendered})")
+            } else {
+                rendered
+            }
+        }
+        ValueExpr::Dot(value) => {
+            let prec = precedence("dot");
+            let obj = format_value_inner(&value.object, prec);
+            let rendered = format!("{obj}.{}", value.attribute);
+            if prec < parent_prec {
+                format!("({rendered})")
+            } else {
+                rendered
+            }
+        }
+        ValueExpr::FunctionCall(value) => {
+            let mut args: Vec = value
+                .args
+                .iter()
+                .map(|arg| format_value_inner(arg, 0))
+                .collect();
+            for (name, val) in &value.kwargs {
+                args.push(format!("{name}={}", format_value_inner(val, 0)));
+            }
+            format!("{}({})", value.name, args.join(", "))
+        }
+        ValueExpr::Spread(value) => {
+            let collection = format_value_inner(&value.collection, 0);
+            let mut args: Vec = Vec::new();
+            for (name, val) in &value.action.kwargs {
+                args.push(format!("{name}={}", format_value_inner(val, 0)));
+            }
+            let call = format!("@{}({})", value.action.action_name, args.join(", "));
+            format!("spread {collection}:{} -> {call}", value.loop_var)
+        }
+    }
+}
+
+// Structural check for whether `name` occurs as a VariableValue anywhere in
+// the tree; Literal and ActionResult leaves can never contain a variable.
+fn value_expr_contains_variable(expr: &ValueExpr, name: &str) -> bool {
+    match expr {
+        ValueExpr::Variable(var) => var.name == name,
+        ValueExpr::BinaryOp(value) => {
+            value_expr_contains_variable(&value.left, name)
+                || value_expr_contains_variable(&value.right, name)
+        }
+        ValueExpr::UnaryOp(value) => value_expr_contains_variable(&value.operand, name),
+        ValueExpr::List(value) => value
+            .elements
+            .iter()
+            .any(|item| value_expr_contains_variable(item, name)),
+        ValueExpr::Dict(value) => value.entries.iter().any(|entry| {
+            value_expr_contains_variable(&entry.key, name)
+                || value_expr_contains_variable(&entry.value, name)
+        }),
+        ValueExpr::Index(value) => {
+            value_expr_contains_variable(&value.object, name)
+                || value_expr_contains_variable(&value.index, name)
+        }
+        ValueExpr::Dot(value) => value_expr_contains_variable(&value.object, name),
+        ValueExpr::FunctionCall(value) => {
+            value
+                .args
+                .iter()
+                .any(|arg| value_expr_contains_variable(arg, name))
+                || value
+                    .kwargs
+                    .values()
+                    .any(|kwarg| value_expr_contains_variable(kwarg, name))
+        }
+        ValueExpr::Spread(value) => {
+            value_expr_contains_variable(&value.collection, name)
+                || value
+                    .action
+                    .kwargs
+                    .values()
+                    .any(|kwarg| value_expr_contains_variable(kwarg, name))
+        }
+        ValueExpr::Literal(_) | ValueExpr::ActionResult(_) => false,
+    }
+}
+
+/// Map binary operator enums to (symbol, precedence) for formatting.
+fn binary_operator(op: i32) -> (&'static str, i32) {
+    use ir::BinaryOperator as Op;
+    // Decode the prost i32 once; anything unknown renders as "?" with the
+    // lowest precedence so it is always parenthesized by the formatter.
+    match Op::try_from(op) {
+        Ok(Op::BinaryOpOr) => ("or", 10),
+        Ok(Op::BinaryOpAnd) => ("and", 20),
+        Ok(Op::BinaryOpEq) => ("==", 30),
+        Ok(Op::BinaryOpNe) => ("!=", 30),
+        Ok(Op::BinaryOpLt) => ("<", 30),
+        Ok(Op::BinaryOpLe) => ("<=", 30),
+        Ok(Op::BinaryOpGt) => (">", 30),
+        Ok(Op::BinaryOpGe) => (">=", 30),
+        Ok(Op::BinaryOpIn) => ("in", 30),
+        Ok(Op::BinaryOpNotIn) => ("not in", 30),
+        Ok(Op::BinaryOpAdd) => ("+", 40),
+        Ok(Op::BinaryOpSub) => ("-", 40),
+        Ok(Op::BinaryOpMul) => ("*", 50),
+        Ok(Op::BinaryOpDiv) => ("/", 50),
+        Ok(Op::BinaryOpFloorDiv) => ("//", 50),
+        Ok(Op::BinaryOpMod) => ("%", 50),
+        _ => ("?", 0),
+    }
+}
+
+/// Map unary operator enums to (symbol, precedence) for formatting.
+fn unary_operator(op: i32) -> (&'static str, i32) {
+    use ir::UnaryOperator as Op;
+    match Op::try_from(op) {
+        Ok(Op::UnaryOpNeg) => ("-", 60),
+        // Trailing space keeps "not x" from rendering as "notx".
+        Ok(Op::UnaryOpNot) => ("not ", 60),
+        _ => ("?", 0),
+    }
+}
+
+/// Return precedence for non-operator constructs like index/dot.
+fn precedence(kind: &str) -> i32 {
+    if matches!(kind, "index" | "dot") { 80 } else { 0 }
+}
+
+/// Format Python literals as source-like text.
+fn format_literal(value: &serde_json::Value) -> String {
+    use serde_json::Value;
+    match value {
+        Value::Null => "None".to_string(),
+        Value::Bool(true) => "True".to_string(),
+        Value::Bool(false) => "False".to_string(),
+        // JSON string encoding doubles as a python-ish quoted/escaped repr.
+        Value::String(text) => {
+            serde_json::to_string(text).unwrap_or_else(|_| format!("\"{text}\""))
+        }
+        other => other.to_string(),
+    }
+}
+
+/// Convert an IR literal into a Python value.
+///
+/// Example IR:
+/// - Literal(int_value=3) -> 3
+// NOTE(review): `-> serde_json::Value` generic context looks intact here, but
+// the `Option`/`Result` returns below this point appear stripped of their
+// type arguments by patch extraction (likely Option<serde_json::Value>).
+pub fn literal_value(lit: &ir::Literal) -> serde_json::Value {
+    match lit.value.as_ref() {
+        Some(ir::literal::Value::IntValue(value)) => serde_json::Value::Number((*value).into()),
+        // Non-finite floats cannot be represented; they collapse to Null.
+        Some(ir::literal::Value::FloatValue(value)) => serde_json::Number::from_f64(*value)
+            .map(serde_json::Value::Number)
+            .unwrap_or(serde_json::Value::Null),
+        Some(ir::literal::Value::StringValue(value)) => serde_json::Value::String(value.clone()),
+        Some(ir::literal::Value::BoolValue(value)) => serde_json::Value::Bool(*value),
+        Some(ir::literal::Value::IsNone(_)) => serde_json::Value::Null,
+        None => serde_json::Value::Null,
+    }
+}
+
+/// Try to fold a literal binary operation to a concrete value.
+///
+/// Example:
+/// - (1, 2, BINARY_OP_ADD) -> 3
+fn fold_literal_binary(
+    op: i32,
+    left: &serde_json::Value,
+    right: &serde_json::Value,
+) -> Option {
+    match ir::BinaryOperator::try_from(op).ok() {
+        Some(ir::BinaryOperator::BinaryOpAdd) => {
+            // Integer add is tried before float add to preserve int-ness.
+            if let (Some(left), Some(right)) = (left.as_i64(), right.as_i64()) {
+                return Some(serde_json::Value::Number((left + right).into()));
+            }
+            if let (Some(left), Some(right)) = (left.as_f64(), right.as_f64()) {
+                return serde_json::Number::from_f64(left + right).map(serde_json::Value::Number);
+            }
+            // String + string folds to concatenation, mirroring Python.
+            if let (Some(left), Some(right)) = (left.as_str(), right.as_str()) {
+                return Some(serde_json::Value::String(format!("{left}{right}")));
+            }
+            None
+        }
+        Some(ir::BinaryOperator::BinaryOpSub) => {
+            if let (Some(left), Some(right)) = (left.as_f64(), right.as_f64()) {
+                return serde_json::Number::from_f64(left - right).map(serde_json::Value::Number);
+            }
+            None
+        }
+        Some(ir::BinaryOperator::BinaryOpMul) => {
+            if let (Some(left), Some(right)) = (left.as_f64(), right.as_f64()) {
+                return serde_json::Number::from_f64(left * right).map(serde_json::Value::Number);
+            }
+            None
+        }
+        Some(ir::BinaryOperator::BinaryOpDiv) => {
+            if let (Some(left), Some(right)) = (left.as_f64(), right.as_f64()) {
+                // Division by zero yields inf/NaN; from_f64 rejects those,
+                // so the fold falls through to None.
+                return serde_json::Number::from_f64(left / right).map(serde_json::Value::Number);
+            }
+            None
+        }
+        Some(ir::BinaryOperator::BinaryOpFloorDiv) => {
+            if let (Some(left), Some(right)) = (left.as_f64(), right.as_f64()) {
+                // Guard against division by zero before flooring.
+                if right == 0.0 {
+                    return None;
+                }
+                let value = (left / right).floor();
+                return serde_json::Number::from_f64(value).map(serde_json::Value::Number);
+            }
+            None
+        }
+        Some(ir::BinaryOperator::BinaryOpMod) => {
+            if let (Some(left), Some(right)) = (left.as_f64(), right.as_f64()) {
+                return serde_json::Number::from_f64(left % right).map(serde_json::Value::Number);
+            }
+            None
+        }
+        _ => None,
+    }
+}
+
+/// Try to fold a literal unary operation to a concrete value.
+///
+/// Example:
+/// - (UNARY_OP_NEG, 4) -> -4
+fn fold_literal_unary(op: i32, operand: &serde_json::Value) -> Option {
+    match ir::UnaryOperator::try_from(op).ok() {
+        Some(ir::UnaryOperator::UnaryOpNeg) => operand
+            .as_f64()
+            .and_then(|value| serde_json::Number::from_f64(-value).map(serde_json::Value::Number)),
+        // `not` uses Python truthiness, shared via util::is_truthy.
+        Some(ir::UnaryOperator::UnaryOpNot) => Some(serde_json::Value::Bool(!is_truthy(operand))),
+        _ => None,
+    }
+}
+
+impl fmt::Display for NodeStatus {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let value = match self {
+            NodeStatus::Queued => "queued",
+            NodeStatus::Running => "running",
+            NodeStatus::Completed => "completed",
+            NodeStatus::Failed => "failed",
+        };
+        write!(f, "{value}")
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use serde_json::Value;
+    use waymark_proto::ast as ir;
+
+    // Builds the IR for `action_result + 2`, reused by the loop tests below.
+    fn action_plus_two_expr() -> ir::Expr {
+        ir::Expr {
+            kind: Some(ir::expr::Kind::BinaryOp(Box::new(ir::BinaryOp {
+                left: Some(Box::new(ir::Expr {
+                    kind: Some(ir::expr::Kind::Variable(ir::Variable {
+                        name: "action_result".to_string(),
+                    })),
+                    span: None,
+                })),
+                op: ir::BinaryOperator::BinaryOpAdd as i32,
+                right: Some(Box::new(ir::Expr {
+                    kind: Some(ir::expr::Kind::Literal(ir::Literal {
+                        value: Some(ir::literal::Value::IntValue(2)),
+                    })),
+                    span: None,
+                })),
+            }))),
+            span: None,
+        }
+    }
+
+    #[test]
+    fn test_runner_state_unrolls_loop_assignments() {
+        let mut state = RunnerState::new(None, None, None, true);
+
+        state
+            .queue_action(
+                "action",
+                Some(vec!["action_result".to_string()]),
+                None,
+                None,
+                Some(0),
+            )
+            .expect("queue action");
+        let first_list = ir::Expr {
+            kind: Some(ir::expr::Kind::List(ir::ListExpr {
+                elements: vec![action_plus_two_expr()],
+            })),
+            span: None,
+        };
+        state
+            .record_assignment(vec!["results".to_string()], &first_list, None, None)
+            .expect("record assignment");
+
+        state
+            .queue_action(
+                "action",
+                Some(vec!["action_result".to_string()]),
+                None,
+                None,
+                Some(1),
+            )
+            .expect("queue action");
+        let second_list = ir::Expr {
+            kind: Some(ir::expr::Kind::List(ir::ListExpr {
+                elements: vec![action_plus_two_expr()],
+            })),
+            span: None,
+        };
+        let concat_expr = ir::Expr {
+            kind: Some(ir::expr::Kind::BinaryOp(Box::new(ir::BinaryOp {
+                left: Some(Box::new(ir::Expr {
+                    kind: Some(ir::expr::Kind::Variable(ir::Variable {
+                        name: "results".to_string(),
+                    })),
+                    span: None,
+                })),
+                op: ir::BinaryOperator::BinaryOpAdd as i32,
+                right: Some(Box::new(second_list)),
+            }))),
+            span: None,
+        };
+        state
+            .record_assignment(vec!["results".to_string()], &concat_expr, None, None)
+            .expect("record assignment");
+
+        // Walk the timeline backwards to find the latest "results" binding.
+        let mut results: Option = None;
+        for node_id in state.timeline.iter().rev() {
+            let node = state.nodes.get(node_id).unwrap();
+            if let Some(value) = node.assignments.get("results") {
+                results = Some(value.clone());
+                break;
+            }
+        }
+
+        let results = results.expect("results assignment");
+        let binary = match results {
+            ValueExpr::BinaryOp(value) => value,
+            other => panic!("expected BinaryOpValue, got {other:?}"),
+        };
+
+        match binary.left.as_ref() {
+            ValueExpr::Variable(value) => assert_eq!(value.name, "results"),
+            other => panic!("expected VariableValue, got {other:?}"),
+        }
+
+        let right_list = match binary.right.as_ref() {
+            ValueExpr::List(value) => value,
+            other => panic!("expected ListValue, got {other:?}"),
+        };
+        assert_eq!(right_list.elements.len(), 1);
+
+        let item_bin = match &right_list.elements[0] {
+            ValueExpr::BinaryOp(value) => value,
+            other => panic!("expected BinaryOpValue, got {other:?}"),
+        };
+
+        match item_bin.left.as_ref() {
+            ValueExpr::Variable(value) => assert_eq!(value.name, "action_result"),
+            other => panic!("expected VariableValue, got {other:?}"),
+        }
+
+        match item_bin.right.as_ref() {
+            ValueExpr::Literal(value) => assert_eq!(value.value, Value::Number(2.into())),
+            other => panic!("expected LiteralValue, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn test_runner_state_single_target_assignments_stay_symbolic() {
+        let mut state = RunnerState::new(None, None, None, true);
+
+        let initial = ValueExpr::Dict(DictValue {
+            entries: vec![DictEntryValue {
+                key: ValueExpr::Literal(LiteralValue {
+                    value: Value::String("result".to_string()),
+                }),
+                value: ValueExpr::Literal(LiteralValue {
+                    value: Value::Number(1.into()),
+                }),
+            }],
+        });
+        state
+            .record_assignment_value(vec!["result".to_string()], initial, None, None)
+            .expect("record initial assignment");
+
+        let wrapped = ValueExpr::Dict(DictValue {
+            entries: vec![DictEntryValue {
+                key: ValueExpr::Literal(LiteralValue {
+                    value: Value::String("result".to_string()),
+                }),
+                value: ValueExpr::Variable(VariableValue {
+                    name: "result".to_string(),
+                }),
+            }],
+        });
+        state
+            .record_assignment_value(vec!["result".to_string()], wrapped, None, None)
+            .expect("record wrapped assignment");
+
+        let mut latest: Option = None;
+        for node_id in state.timeline.iter().rev() {
+            let node = state.nodes.get(node_id).expect("node");
+            if let Some(value) = node.assignments.get("result") {
+                latest = Some(value.clone());
+                break;
+            }
+        }
+        let latest = latest.expect("latest assignment");
+        let dict = match latest {
+            ValueExpr::Dict(value) => value,
+            other => panic!("expected DictValue, got {other:?}"),
+        };
+        assert_eq!(dict.entries.len(), 1);
+
+        match &dict.entries[0].value {
+            ValueExpr::Variable(value) => assert_eq!(value.name, "result"),
+            other => panic!("expected VariableValue, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn test_materialize_value_keeps_self_referential_variable_symbolic() {
+        let mut state = RunnerState::new(None, None, None, true);
+        state
+            .record_assignment_value(
+                vec!["count".to_string()],
+                ValueExpr::Literal(LiteralValue {
+                    value: Value::Number(0.into()),
+                }),
+                None,
+                None,
+            )
+            .expect("record initial count");
+        state
+            .record_assignment_value(
+                vec!["count".to_string()],
+                ValueExpr::BinaryOp(BinaryOpValue {
+                    left: Box::new(ValueExpr::Variable(VariableValue {
+                        name: "count".to_string(),
+                    })),
+                    op: ir::BinaryOperator::BinaryOpAdd as i32,
+                    right: Box::new(ValueExpr::Literal(LiteralValue {
+                        value: Value::Number(1.into()),
+                    })),
+                }),
+                None,
+                None,
+            )
+            .expect("record count update");
+
+        let materialized = state.materialize_value(ValueExpr::Variable(VariableValue {
+            name: "count".to_string(),
+        }));
+        match materialized {
+            ValueExpr::Variable(value) => assert_eq!(value.name, "count"),
+            other => panic!("expected VariableValue, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn test_runner_state_graph_dirty_for_action_updates() {
+        let mut state = RunnerState::new(None, None, None, true);
+        assert!(!state.consume_graph_dirty_for_durable_execution());
+
+        let action_result = state
+            .queue_action(
+                "action",
+                Some(vec!["action_result".to_string()]),
+                None,
+                None,
+                None,
+            )
+            .expect("queue action");
+        assert!(state.consume_graph_dirty_for_durable_execution());
+        assert!(!state.consume_graph_dirty_for_durable_execution());
+
+        state
+            .increment_action_attempt(action_result.node_id)
+            .expect("increment action attempt");
+        assert!(state.consume_graph_dirty_for_durable_execution());
+    }
+
+    #[test]
+    fn test_runner_state_graph_dirty_not_set_for_assignments() {
+        let mut state = RunnerState::new(None, None, None, true);
+        let value_expr = ValueExpr::Literal(LiteralValue {
+            value: Value::Number(1.into()),
+        });
+        state
+            .record_assignment_value(vec!["value".to_string()], value_expr, None, None)
+            .expect("record assignment");
+
+        assert!(!state.consume_graph_dirty_for_durable_execution());
+    }
+
+    #[test]
+    fn test_runner_state_records_action_start_stop_timestamps() {
+        let mut state = RunnerState::new(None, None, None, true);
+        let action_result = state
+            .queue_action(
+                "action",
+                Some(vec!["action_result".to_string()]),
+                None,
+                None,
+                None,
+            )
+            .expect("queue action");
+
+        // Clear queue-time dirty bit so lifecycle transitions are isolated.
+        assert!(state.consume_graph_dirty_for_durable_execution());
+
+        state
+            .mark_running(action_result.node_id)
+            .expect("mark running");
+        let started_at = state
+            .nodes
+            .get(&action_result.node_id)
+            .and_then(|node| node.started_at);
+        assert!(
+            started_at.is_some(),
+            "running action should record started_at"
+        );
+        assert!(
+            state
+                .nodes
+                .get(&action_result.node_id)
+                .and_then(|node| node.completed_at)
+                .is_none(),
+            "running action should clear completed_at"
+        );
+        assert!(
+            !state.ready_queue.contains(&action_result.node_id),
+            "running action should be removed from ready_queue"
+        );
+        assert!(state.consume_graph_dirty_for_durable_execution());
+
+        state
+            .mark_completed(action_result.node_id)
+            .expect("mark completed");
+        let completed_at = state
+            .nodes
+            .get(&action_result.node_id)
+            .and_then(|node| node.completed_at);
+        assert!(
+            completed_at.is_some(),
+            "completed action should record completed_at"
+        );
+        assert!(
+            completed_at >= started_at,
+            "completed_at should be at or after started_at"
+        );
+        assert!(state.consume_graph_dirty_for_durable_execution());
+    }
+}
diff --git a/crates/runner-state/src/util.rs b/crates/runner-state/src/util.rs
new file mode 100644
index 00000000..20768070
--- /dev/null
+++ b/crates/runner-state/src/util.rs
@@ -0,0 +1,12 @@
+// Python-style truthiness for JSON values: empty/zero/null are falsy.
+pub(crate) fn is_truthy(value: &serde_json::Value) -> bool {
+    match value {
+        serde_json::Value::Null => false,
+        serde_json::Value::Bool(value) => *value,
+        // Non-finite numbers fall back to false via unwrap_or.
+        serde_json::Value::Number(number) => {
+            number.as_f64().map(|value| value != 0.0).unwrap_or(false)
+        }
+        serde_json::Value::String(value) => !value.is_empty(),
+        serde_json::Value::Array(values) => !values.is_empty(),
+        serde_json::Value::Object(map) => !map.is_empty(),
+    }
+}
diff --git a/crates/runner-state/src/value_visitor.rs b/crates/runner-state/src/value_visitor.rs
new file mode 100644
index 00000000..fbc7736a
--- /dev/null
+++ b/crates/runner-state/src/value_visitor.rs
@@ -0,0 +1,533 @@
+//! Shared ValueExpr visitors for traversal, resolution, and evaluation.
+
+use std::collections::{HashMap, HashSet};
+
+use serde::{Deserialize, Serialize};
+use uuid::Uuid;
+
+use super::state::{
+    ActionCallSpec, ActionResultValue, BinaryOpValue, DictEntryValue, DictValue, DotValue,
+    FunctionCallValue, IndexValue, ListValue, LiteralValue, SpreadValue, UnaryOpValue,
+    VariableValue,
+};
+
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+#[serde(tag = "type", content = "data")]
+pub enum ValueExpr {
+    Literal(LiteralValue),
+    Variable(VariableValue),
+    ActionResult(ActionResultValue),
+    BinaryOp(BinaryOpValue),
+    UnaryOp(UnaryOpValue),
+    List(ListValue),
+    Dict(DictValue),
+    Index(IndexValue),
+    Dot(DotValue),
+    FunctionCall(FunctionCallValue),
+    Spread(SpreadValue),
+}
+
+/// Resolve variables inside a ValueExpr tree without executing actions.
+///
+/// Example IR:
+/// - y = x + 1 (where x -> LiteralValue(2))
+/// Produces BinaryOpValue(LiteralValue(2), +, LiteralValue(1)).
+// NOTE(review): generic arguments in several signatures below appear stripped
+// by patch extraction (e.g. `&mut HashSet` is presumably `HashSet<String>`,
+// `Option` presumably `Option<Uuid>`, and the evaluator's `Result` returns
+// presumably `Result<serde_json::Value, E>`) -- restore from the original.
+pub struct ValueExprResolver<'a> {
+    resolve_variable: &'a dyn Fn(&str, &mut HashSet) -> ValueExpr,
+    // Tracks variable names already being resolved, shared with the callback.
+    seen: &'a mut HashSet,
+}
+
+impl<'a> ValueExprResolver<'a> {
+    pub fn new(
+        resolve_variable: &'a dyn Fn(&str, &mut HashSet) -> ValueExpr,
+        seen: &'a mut HashSet,
+    ) -> Self {
+        Self {
+            resolve_variable,
+            seen,
+        }
+    }
+
+    // Rebuilds the tree bottom-up; only Variable leaves are substituted.
+    pub fn visit(&mut self, expr: &ValueExpr) -> ValueExpr {
+        match expr {
+            ValueExpr::Literal(value) => ValueExpr::Literal(value.clone()),
+            ValueExpr::Variable(value) => (self.resolve_variable)(&value.name, self.seen),
+            ValueExpr::ActionResult(value) => ValueExpr::ActionResult(value.clone()),
+            ValueExpr::BinaryOp(value) => ValueExpr::BinaryOp(BinaryOpValue {
+                left: Box::new(self.visit(&value.left)),
+                op: value.op,
+                right: Box::new(self.visit(&value.right)),
+            }),
+            ValueExpr::UnaryOp(value) => ValueExpr::UnaryOp(UnaryOpValue {
+                op: value.op,
+                operand: Box::new(self.visit(&value.operand)),
+            }),
+            ValueExpr::List(value) => ValueExpr::List(ListValue {
+                elements: value.elements.iter().map(|item| self.visit(item)).collect(),
+            }),
+            ValueExpr::Dict(value) => ValueExpr::Dict(DictValue {
+                entries: value
+                    .entries
+                    .iter()
+                    .map(|entry| DictEntryValue {
+                        key: self.visit(&entry.key),
+                        value: self.visit(&entry.value),
+                    })
+                    .collect(),
+            }),
+            ValueExpr::Index(value) => ValueExpr::Index(IndexValue {
+                object: Box::new(self.visit(&value.object)),
+                index: Box::new(self.visit(&value.index)),
+            }),
+            ValueExpr::Dot(value) => ValueExpr::Dot(DotValue {
+                object: Box::new(self.visit(&value.object)),
+                attribute: value.attribute.clone(),
+            }),
+            ValueExpr::FunctionCall(value) => ValueExpr::FunctionCall(FunctionCallValue {
+                name: value.name.clone(),
+                args: value.args.iter().map(|arg| self.visit(arg)).collect(),
+                kwargs: value
+                    .kwargs
+                    .iter()
+                    .map(|(name, arg)| (name.clone(), self.visit(arg)))
+                    .collect(),
+                global_function: value.global_function,
+            }),
+            ValueExpr::Spread(value) => {
+                let kwargs = value
+                    .action
+                    .kwargs
+                    .iter()
+                    .map(|(name, arg)| (name.clone(), self.visit(arg)))
+                    .collect::>();
+                let action = ActionCallSpec {
+                    action_name: value.action.action_name.clone(),
+                    module_name: value.action.module_name.clone(),
+                    kwargs,
+                };
+                ValueExpr::Spread(SpreadValue {
+                    collection: Box::new(self.visit(&value.collection)),
+                    loop_var: value.loop_var.clone(),
+                    action,
+                })
+            }
+        }
+    }
+}
+
+/// Collect execution node ids that supply data to a ValueExpr tree.
+///
+/// Example IR:
+/// - total = a + @sum(values)
+/// Returns the node ids that last defined `a` and the action node for sum().
+pub struct ValueExprSourceCollector<'a> {
+    resolve_variable: &'a dyn Fn(&str) -> Option,
+}
+
+impl<'a> ValueExprSourceCollector<'a> {
+    pub fn new(resolve_variable: &'a dyn Fn(&str) -> Option) -> Self {
+        Self { resolve_variable }
+    }
+
+    // Unions node ids from every subtree; literals contribute nothing.
+    pub fn visit(&self, expr: &ValueExpr) -> HashSet {
+        match expr {
+            ValueExpr::Literal(_) => HashSet::new(),
+            ValueExpr::Variable(value) => {
+                (self.resolve_variable)(&value.name).into_iter().collect()
+            }
+            ValueExpr::ActionResult(value) => [value.node_id].into_iter().collect(),
+            ValueExpr::BinaryOp(value) => {
+                let mut sources = self.visit(&value.left);
+                sources.extend(self.visit(&value.right));
+                sources
+            }
+            ValueExpr::UnaryOp(value) => self.visit(&value.operand),
+            ValueExpr::List(value) => {
+                let mut sources = HashSet::new();
+                for item in &value.elements {
+                    sources.extend(self.visit(item));
+                }
+                sources
+            }
+            ValueExpr::Dict(value) => {
+                let mut sources = HashSet::new();
+                for entry in &value.entries {
+                    sources.extend(self.visit(&entry.key));
+                    sources.extend(self.visit(&entry.value));
+                }
+                sources
+            }
+            ValueExpr::Index(value) => {
+                let mut sources = self.visit(&value.object);
+                sources.extend(self.visit(&value.index));
+                sources
+            }
+            ValueExpr::Dot(value) => self.visit(&value.object),
+            ValueExpr::FunctionCall(value) => {
+                let mut sources = HashSet::new();
+                for arg in &value.args {
+                    sources.extend(self.visit(arg));
+                }
+                for arg in value.kwargs.values() {
+                    sources.extend(self.visit(arg));
+                }
+                sources
+            }
+            ValueExpr::Spread(value) => {
+                let mut sources = self.visit(&value.collection);
+                for arg in value.action.kwargs.values() {
+                    sources.extend(self.visit(arg));
+                }
+                sources
+            }
+        }
+    }
+}
+
+/// Evaluate ValueExpr nodes into concrete Python values.
+///
+/// Example:
+/// - BinaryOpValue(VariableValue("a"), +, LiteralValue(1)) becomes the
+/// current value of a plus 1.
+// All behavior is injected via callbacks so the evaluator stays decoupled
+// from runner state and its error type E.
+pub struct ValueExprEvaluator<'a, E> {
+    resolve_variable: &'a dyn Fn(&str) -> Result,
+    resolve_action_result: &'a dyn Fn(&ActionResultValue) -> Result,
+    resolve_function_call: &'a ResolveFunctionCall<'a, E>,
+    apply_binary:
+        &'a dyn Fn(i32, serde_json::Value, serde_json::Value) -> Result,
+    apply_unary: &'a dyn Fn(i32, serde_json::Value) -> Result,
+    error_factory: &'a dyn Fn(&str) -> E,
+}
+
+type ResolveFunctionCall<'a, E> = dyn Fn(
+        &FunctionCallValue,
+        Vec,
+        HashMap,
+    ) -> Result
+    + 'a;
+
+impl<'a, E> ValueExprEvaluator<'a, E> {
+    pub fn new(
+        resolve_variable: &'a dyn Fn(&str) -> Result,
+        resolve_action_result: &'a dyn Fn(&ActionResultValue) -> Result,
+        resolve_function_call: &'a ResolveFunctionCall<'a, E>,
+        apply_binary: &'a dyn Fn(
+            i32,
+            serde_json::Value,
+            serde_json::Value,
+        ) -> Result,
+        apply_unary: &'a dyn Fn(i32, serde_json::Value) -> Result,
+        error_factory: &'a dyn Fn(&str) -> E,
+    ) -> Self {
+        Self {
+            resolve_variable,
+            resolve_action_result,
+            resolve_function_call,
+            apply_binary,
+            apply_unary,
+            error_factory,
+        }
+    }
+
+    pub fn visit(&self, expr: &ValueExpr) -> Result {
+        match expr {
+            ValueExpr::Literal(value) => Ok(value.value.clone()),
+            ValueExpr::Variable(value) => (self.resolve_variable)(&value.name),
+            ValueExpr::ActionResult(value) => (self.resolve_action_result)(value),
+            ValueExpr::BinaryOp(value) => {
+                let left = self.visit(&value.left)?;
+                let right = self.visit(&value.right)?;
+                (self.apply_binary)(value.op, left, right)
+            }
+            ValueExpr::UnaryOp(value) => {
+                let operand = self.visit(&value.operand)?;
+                (self.apply_unary)(value.op, operand)
+            }
+            ValueExpr::List(value) => {
+                let mut items = Vec::with_capacity(value.elements.len());
+                for item in &value.elements {
+                    items.push(self.visit(item)?);
+                }
+                Ok(serde_json::Value::Array(items))
+            }
+            ValueExpr::Dict(value) => {
+                let mut map = serde_json::Map::with_capacity(value.entries.len());
+                for entry in &value.entries {
+                    let key_value = self.visit(&entry.key)?;
+                    // Non-string keys fall back to their JSON rendering.
+                    let key = key_value
+                        .as_str()
+                        .map(|value| value.to_string())
+                        .unwrap_or_else(|| key_value.to_string());
+                    let entry_value = self.visit(&entry.value)?;
+                    map.insert(key, entry_value);
+                }
+                Ok(serde_json::Value::Object(map))
+            }
+            ValueExpr::Index(value) => {
+                let object = self.visit(&value.object)?;
+                let index = self.visit(&value.index)?;
+                match (object, index) {
+                    (serde_json::Value::Array(items), serde_json::Value::Number(idx)) => {
+                        // Non-integer numbers map to -1 and fail the range check.
+                        let idx = idx.as_i64().unwrap_or(-1);
+                        if idx < 0 || idx as usize >= items.len() {
+                            return Err((self.error_factory)("index out of range"));
+                        }
+                        Ok(items[idx as usize].clone())
+                    }
+                    (serde_json::Value::Object(map), serde_json::Value::String(key)) => map
+                        .get(&key)
+                        .cloned()
+                        .or_else(|| lookup_exception_value(&map, &key))
+                        .ok_or_else(|| (self.error_factory)("dict has no key")),
+                    _ => Err((self.error_factory)("unsupported index operation")),
+                }
+            }
+            ValueExpr::Dot(value) => {
+                let object = self.visit(&value.object)?;
+                if let serde_json::Value::Object(map) = object {
+                    return map
+                        .get(&value.attribute)
+                        .cloned()
+                        .or_else(|| lookup_exception_value(&map, &value.attribute))
+                        .ok_or_else(|| (self.error_factory)("dict has no key"));
+                }
+                Err((self.error_factory)("attribute not found"))
+            }
+            ValueExpr::FunctionCall(value) => {
+                let mut args = Vec::with_capacity(value.args.len());
+                for arg in &value.args {
+                    args.push(self.visit(arg)?);
+                }
+                let mut kwargs = HashMap::new();
+                for (name, arg) in &value.kwargs {
+                    kwargs.insert(name.clone(), self.visit(arg)?);
+                }
+                (self.resolve_function_call)(value, args, kwargs)
+            }
+            ValueExpr::Spread(_) => Err((self.error_factory)(
+                "cannot replay unresolved spread expression",
+            )),
+        }
+    }
+}
+
+// Only dicts shaped like serialized exceptions (with "type" and "message"
+// keys) expose their nested "values" object for attribute/key lookup.
+fn lookup_exception_value(
+    map: &serde_json::Map,
+    key: &str,
+) -> Option {
+    if !(map.contains_key("type") && map.contains_key("message")) {
+        return None;
+    }
+    map.get("values")
+        .and_then(|value| value.as_object())
+        .and_then(|values| values.get(key))
+        .cloned()
+}
+
+/// Recursively resolve variable references throughout a value tree.
+///
+/// Use this as the core materialization step before assignment storage.
+///
+/// Example IR:
+/// - z = (x + y) * 2
+/// The tree walk replaces VariableValue("x")/("y") with their latest
+/// symbolic definitions before storing z.
+pub fn resolve_value_tree(
+    value: &ValueExpr,
+    resolve_variable: &dyn Fn(&str, &mut HashSet) -> ValueExpr,
+) -> ValueExpr {
+    let mut seen = HashSet::new();
+    let mut resolver = ValueExprResolver::new(resolve_variable, &mut seen);
+    resolver.visit(value)
+}
+
+/// Find execution node ids that supply data to the given value.
+///
+/// Example IR:
+/// - total = a + @sum(values)
+/// Returns the latest assignment node for a and the action node for sum().
+pub fn collect_value_sources( + value: &ValueExpr, + resolve_variable: &dyn Fn(&str) -> Option, +) -> HashSet { + let collector = ValueExprSourceCollector::new(resolve_variable); + collector.visit(value) +} + +#[cfg(test)] +mod tests { + use std::collections::{HashMap, HashSet}; + + use serde_json::Value; + use uuid::Uuid; + + use super::*; + use waymark_proto::ast as ir; + + fn literal_int(value: i64) -> ValueExpr { + ValueExpr::Literal(LiteralValue { + value: Value::Number(value.into()), + }) + } + + #[test] + fn test_value_expr_resolver_visit_happy_path() { + let mut seen = HashSet::new(); + let resolve = |name: &str, _: &mut HashSet| { + if name == "x" { + literal_int(3) + } else { + literal_int(0) + } + }; + let mut resolver = ValueExprResolver::new(&resolve, &mut seen); + let expr = ValueExpr::BinaryOp(BinaryOpValue { + left: Box::new(ValueExpr::Variable(VariableValue { + name: "x".to_string(), + })), + op: ir::BinaryOperator::BinaryOpAdd as i32, + right: Box::new(literal_int(1)), + }); + + let resolved = resolver.visit(&expr); + match resolved { + ValueExpr::BinaryOp(value) => { + assert!(matches!(*value.left, ValueExpr::Literal(_))); + assert!(matches!(*value.right, ValueExpr::Literal(_))); + } + other => panic!("expected binary value, got {other:?}"), + } + } + + #[test] + fn test_value_expr_source_collector_visit_happy_path() { + let variable_source = Uuid::new_v4(); + let action_source = Uuid::new_v4(); + let resolve = |name: &str| { + if name == "x" { + Some(variable_source) + } else { + None + } + }; + let collector = ValueExprSourceCollector::new(&resolve); + let expr = ValueExpr::BinaryOp(BinaryOpValue { + left: Box::new(ValueExpr::Variable(VariableValue { + name: "x".to_string(), + })), + op: ir::BinaryOperator::BinaryOpAdd as i32, + right: Box::new(ValueExpr::ActionResult(ActionResultValue { + node_id: action_source, + action_name: "fetch".to_string(), + iteration_index: None, + result_index: None, + })), + }); + + let sources = 
collector.visit(&expr); + assert!(sources.contains(&variable_source)); + assert!(sources.contains(&action_source)); + } + + #[test] + fn test_value_expr_evaluator_visit_happy_path() { + let resolve_variable = |name: &str| -> Result { + if name == "x" { + Ok(Value::Number(2.into())) + } else { + Err(format!("unknown variable: {name}")) + } + }; + let resolve_action_result = + |_value: &ActionResultValue| -> Result { Ok(Value::Number(0.into())) }; + let resolve_function_call = + |_call: &FunctionCallValue, + args: Vec, + _kwargs: HashMap| + -> Result { Ok(Value::Number((args.len() as i64).into())) }; + let apply_binary = |_op: i32, left: Value, right: Value| -> Result { + match (left.as_i64(), right.as_i64()) { + (Some(left), Some(right)) => Ok(Value::Number((left + right).into())), + _ => Err("bad operands".to_string()), + } + }; + let apply_unary = |_op: i32, value: Value| -> Result { + Ok(Value::Bool(!value.as_bool().unwrap_or(false))) + }; + let error_factory = |message: &str| message.to_string(); + + let evaluator = ValueExprEvaluator::new( + &resolve_variable, + &resolve_action_result, + &resolve_function_call, + &apply_binary, + &apply_unary, + &error_factory, + ); + let expr = ValueExpr::BinaryOp(BinaryOpValue { + left: Box::new(ValueExpr::Variable(VariableValue { + name: "x".to_string(), + })), + op: ir::BinaryOperator::BinaryOpAdd as i32, + right: Box::new(literal_int(5)), + }); + + let value = evaluator.visit(&expr).expect("evaluate expression"); + assert_eq!(value, Value::Number(7.into())); + } + + #[test] + fn test_resolve_value_tree_happy_path() { + let expr = ValueExpr::List(ListValue { + elements: vec![ValueExpr::Variable(VariableValue { + name: "user_id".to_string(), + })], + }); + let resolve = |name: &str, _seen: &mut HashSet| { + if name == "user_id" { + ValueExpr::Literal(LiteralValue { + value: Value::String("abc".to_string()), + }) + } else { + ValueExpr::Literal(LiteralValue { value: Value::Null }) + } + }; + + let resolved = 
resolve_value_tree(&expr, &resolve); + match resolved { + ValueExpr::List(list) => { + assert_eq!(list.elements.len(), 1); + assert!(matches!(list.elements[0], ValueExpr::Literal(_))); + } + other => panic!("expected list value, got {other:?}"), + } + } + + #[test] + fn test_collect_value_sources_happy_path() { + let source_a = Uuid::new_v4(); + let source_b = Uuid::new_v4(); + let expr = ValueExpr::FunctionCall(FunctionCallValue { + name: "sum".to_string(), + args: vec![ValueExpr::Variable(VariableValue { + name: "a".to_string(), + })], + kwargs: HashMap::from([( + "other".to_string(), + ValueExpr::ActionResult(ActionResultValue { + node_id: source_b, + action_name: "compute".to_string(), + iteration_index: None, + result_index: None, + }), + )]), + global_function: None, + }); + let resolve = |name: &str| if name == "a" { Some(source_a) } else { None }; + + let sources = collect_value_sources(&expr, &resolve); + assert_eq!(sources.len(), 2); + assert!(sources.contains(&source_a)); + assert!(sources.contains(&source_b)); + } +} diff --git a/crates/runner/Cargo.toml b/crates/runner/Cargo.toml new file mode 100644 index 00000000..115de256 --- /dev/null +++ b/crates/runner/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "waymark-runner" +version = "0.1.0" +edition = "2024" + +[dependencies] +chrono = { workspace = true, features = ["serde"] } +rustc-hash = { workspace = true } +serde_json = { workspace = true } +thiserror = { workspace = true } +uuid = { workspace = true } +waymark-dag = { workspace = true } +waymark-proto = { workspace = true } +waymark-observability = { workspace = true } +waymark-runner-state = { workspace = true } +waymark-core-backend = { workspace = true } +tracing = { workspace = true } + +[dev-dependencies] +waymark-ir-parser = { workspace = true } +waymark-backend-memory = { workspace = true } + +[features] +trace = [] diff --git a/crates/runner/src/executor.rs b/crates/runner/src/executor.rs new file mode 100644 index 00000000..bae9a9c2 --- 
/dev/null
+++ b/crates/runner/src/executor.rs
@@ -0,0 +1,3015 @@
//! Incremental DAG executor for runner state graphs.

use std::cell::RefCell;
use std::collections::{HashMap, HashSet};
use std::sync::Arc;
use std::time::Duration;

use chrono::{DateTime, Utc};
use rustc_hash::FxHashMap;
use serde_json::Value;
use uuid::Uuid;

use crate::expression_evaluator::is_exception_value;
use crate::retry::{RetryDecision, RetryPolicyEvaluator, timeout_seconds_from_policies};
use crate::synthetic_exceptions::{SyntheticExceptionType, build_synthetic_exception_value};
use waymark_core_backend::{ActionAttemptStatus, ActionDone, CoreBackend, GraphUpdate};
use waymark_dag::{
    ActionCallNode, AggregatorNode, DAG, DAGEdge, DagEdgeIndex, EXCEPTION_SCOPE_VAR, EdgeType,
};
use waymark_observability::obs;
use waymark_proto::ast as ir;
use waymark_runner_state::value_visitor::ValueExpr;
use waymark_runner_state::{
    ActionCallSpec, ExecutionEdge, ExecutionNode, ExecutionNodeType, IndexValue, ListValue,
    LiteralValue, NodeStatus, QueueNodeParams, RunnerState, RunnerStateError,
};

/// Raised when the runner executor cannot advance safely.
#[derive(Debug, thiserror::Error)]
#[error("{0}")]
pub struct RunnerExecutorError(pub String);

/// Persistence payloads required before dispatching new actions.
///
/// These must reach the backends first so that any inflight actions can be
/// marked as failed before they are queued up again.
#[derive(Clone, Debug)]
pub struct DurableUpdates {
    pub actions_done: Vec<ActionDone>,
    pub graph_updates: Vec<GraphUpdate>,
}

/// Return value for executor steps with newly queued action nodes.
#[derive(Clone, Debug)]
pub struct ExecutorStep {
    pub actions: Vec<ExecutionNode>,
    pub sleep_requests: Vec<SleepRequest>,
    pub updates: Option<DurableUpdates>,
}

#[derive(Clone, Debug)]
/// Sleep requests emitted by the executor with wake-up times.
+pub struct SleepRequest { + pub node_id: Uuid, + pub wake_at: DateTime, +} + +/// Action result payloads keyed by execution node id. +type ExecutionResultMap = HashMap; + +struct FinishedNodeOutcome { + /// Node to continue graph traversal from. + start: Option, + /// Exception payload forwarded to exception edges. + exception_value: Option, + /// Durable attempt metadata for this finished action (if applicable). + action_done: Option, + /// Retry action to dispatch immediately after state transition. + retry_action: Option, +} + +#[derive(Default)] +struct IncrementAccumulator { + actions_done: Vec, + pending_starts: Vec<(ExecutionNode, Option)>, + actions: Vec, + sleep_requests: Vec, + seen_actions: HashSet, + seen_sleep_nodes: HashSet, +} + +impl IncrementAccumulator { + fn absorb_finished_outcome(&mut self, outcome: FinishedNodeOutcome) { + if let Some(start) = outcome.start { + self.pending_starts.push((start, outcome.exception_value)); + } + if let Some(done) = outcome.action_done { + self.actions_done.push(done); + } + if let Some(retry_action) = outcome.retry_action { + self.record_action(retry_action); + } + } + + fn record_action(&mut self, action: ExecutionNode) { + // Multiple finished nodes can converge on the same queued action. + if self.seen_actions.insert(action.node_id) { + self.actions.push(action); + } + } + + fn record_sleep_request(&mut self, sleep_request: SleepRequest) { + if self.seen_sleep_nodes.insert(sleep_request.node_id) { + self.sleep_requests.push(sleep_request); + } + } +} + +struct WalkOutcome { + actions: Vec, + sleep_requests: Vec, +} + +struct FinishedActionMetadata { + attempt: i32, + started_at: Option>, + result: Value, +} + +enum ActionFailureTransition { + RetryQueued(Box), + Failed, +} + +enum TemplateKind { + SpreadAction(Box), + Aggregator(String), + Regular(String), +} + +enum SleepDecision { + Completed, + Blocked(DateTime), +} + +/// Advance a DAG template using the current runner state and action results. 
+/// +/// The executor treats the DAG as a control-flow template. It queues runtime +/// execution nodes into RunnerState, unrolling loops/spreads into explicit +/// iterations, and stops when it encounters action calls that must be executed +/// by an external worker. +/// +/// This serves as a runner supervisor for a single instance that's owned +/// in memory by our logic. +/// +/// Each call to increment() starts from finished execution nodes, walks +/// downstream through inline nodes (assignments, branches, joins, etc.), and +/// returns any newly queued action nodes that are now unblocked. +pub struct RunnerExecutor { + dag: Arc, + state: RunnerState, + action_results: ExecutionResultMap, + backend: Option>, + template_index: DagEdgeIndex, + incoming_exec_edges: FxHashMap>, + /// Index: template_id -> list of execution node IDs with that template + template_to_exec_nodes: FxHashMap>, + /// Cached assignment evaluations for the current increment pass. + /// Cleared at the start of each increment call. + eval_cache: RefCell>, + instance_id: Option, + terminal_error: Option, +} + +impl RunnerExecutor { + pub fn new( + dag: Arc, + state: RunnerState, + // Action results keyed by execution node id. 
+ action_results: ExecutionResultMap, + backend: Option>, + ) -> Self { + let mut state = state; + state.dag = Some(dag.clone()); + state.set_link_queued_nodes(false); + + let template_index = dag.edge_index(); + let incoming_exec_edges = Self::build_incoming_exec_edges(&state); + let template_to_exec_nodes = Self::build_template_to_exec_nodes(&state); + + Self { + dag, + state, + action_results, + backend, + template_index, + incoming_exec_edges, + template_to_exec_nodes, + eval_cache: RefCell::new(FxHashMap::default()), + instance_id: None, + terminal_error: None, + } + } + + pub fn state(&self) -> &RunnerState { + &self.state + } + + pub fn state_mut(&mut self) -> &mut RunnerState { + &mut self.state + } + + pub fn dag(&self) -> &DAG { + &self.dag + } + + pub fn action_results(&self) -> &ExecutionResultMap { + &self.action_results + } + + pub fn instance_id(&self) -> Option { + self.instance_id + } + + pub fn set_instance_id(&mut self, instance_id: Uuid) { + self.instance_id = Some(instance_id); + } + + pub fn terminal_error(&self) -> Option<&Value> { + self.terminal_error.as_ref() + } + + pub(super) fn eval_cache_get(&self, key: &(Uuid, String)) -> Option { + self.eval_cache.borrow().get(key).cloned() + } + + pub(super) fn eval_cache_insert(&self, key: (Uuid, String), value: Value) { + self.eval_cache.borrow_mut().insert(key, value); + } + + /// Store an action result value for a specific execution node id. + pub fn set_action_result(&mut self, node_id: Uuid, result: Value) { + self.action_results.insert(node_id, result); + } + + /// Remove any cached action result for a specific execution node. + /// Used when re-queuing an action so we don't replay stale results. + pub fn clear_action_result(&mut self, node_id: Uuid) { + self.action_results.remove(&node_id); + } + + /// Resolve timeout policy seconds for an action node. 
+ pub fn action_timeout_seconds(&self, node_id: Uuid) -> Result { + let node = self.execution_node(node_id)?; + if !node.is_action_call() { + return Ok(0); + } + let Some(action_template) = self.template_action_for_execution_node(node)? else { + return Ok(0); + }; + Ok(timeout_seconds_from_policies(&action_template.policies).unwrap_or(0)) + } + + /// Fail inflight actions and return any that should be retried. + /// + /// Use this after recovering from a crash: running actions are treated as + /// failed, their attempt counter is incremented if retry policies allow, + /// and retryable nodes are re-queued for execution. + pub fn resume(&mut self) -> Result { + let mut finished_nodes = Vec::new(); + for (node_id, node) in &self.state.nodes { + if node.is_action_call() && node.status == NodeStatus::Running { + finished_nodes.push(*node_id); + self.action_results.insert( + *node_id, + build_synthetic_exception_value( + SyntheticExceptionType::ExecutorResume, + format!( + "action {node_id} was running during resume and is treated as failed" + ), + Vec::new(), + ), + ); + } + } + if finished_nodes.is_empty() { + let updates = self.collect_updates(Vec::new())?; + return Ok(ExecutorStep { + actions: Vec::new(), + sleep_requests: Vec::new(), + updates, + }); + } + self.increment(&finished_nodes) + } + + /// Advance execution for finished nodes in a single batch. + /// + /// Use this when multiple actions complete in the same tick so the graph + /// update and action inserts are persisted together. + #[obs] + pub fn increment( + &mut self, + finished_nodes: &[Uuid], + ) -> Result { + self.eval_cache.borrow_mut().clear(); + let mut accum = IncrementAccumulator::default(); + self.collect_increment_results(finished_nodes, &mut accum)?; + self.walk_pending_starts(&mut accum)?; + + let IncrementAccumulator { + actions_done, + actions, + sleep_requests, + .. 
+ } = accum; + let running_actions = self.mark_actions_running(&actions)?; + let updates = self.collect_updates(actions_done)?; + + // Note: Action timeouts and delayed retries require wall-clock tracking in the run loop. + // The executor only handles timeout failures once they surface as action results. + + Ok(ExecutorStep { + actions: running_actions, + sleep_requests, + updates, + }) + } + + fn collect_increment_results( + &mut self, + finished_nodes: &[Uuid], + accum: &mut IncrementAccumulator, + ) -> Result<(), RunnerExecutorError> { + for &node_id in finished_nodes { + accum.absorb_finished_outcome(self.apply_finished_node(node_id)?); + } + Ok(()) + } + + fn walk_pending_starts( + &mut self, + accum: &mut IncrementAccumulator, + ) -> Result<(), RunnerExecutorError> { + while let Some((start, exception_value)) = accum.pending_starts.pop() { + let outcome = self.walk_from(start, exception_value)?; + for action in outcome.actions { + accum.record_action(action); + } + for sleep_request in outcome.sleep_requests { + accum.record_sleep_request(sleep_request); + } + } + Ok(()) + } + + fn mark_actions_running( + &mut self, + actions: &[ExecutionNode], + ) -> Result, RunnerExecutorError> { + let mut running_actions = Vec::with_capacity(actions.len()); + for action in actions { + self.clear_action_result(action.node_id); + self.state + .mark_running(action.node_id) + .map_err(Self::state_error)?; + running_actions.push(self.execution_node_clone(action.node_id)?); + } + Ok(running_actions) + } + + /// Walk downstream from a node, executing inline nodes until blocked by an action node. 
+ #[obs] + fn walk_from( + &mut self, + node: ExecutionNode, + exception_value: Option, + ) -> Result { + let mut pending = vec![(node, exception_value)]; + let mut actions = Vec::new(); + let mut sleep_requests = Vec::new(); + let mut forwarded_completed: HashSet = HashSet::new(); + + while let Some((current, current_exception)) = pending.pop() { + // template_id is the DAG node id, not the execution id. + let template_node_id = match ¤t.template_id { + Some(id) => id, + None => continue, + }; + let edges = if let Some(template_edges) = self.template_index.outgoing(template_node_id) + { + self.select_edges(template_edges, ¤t, current_exception)? + } else { + continue; + }; + for edge in edges { + let successors = self.queue_successor(¤t, &edge)?; + for successor in successors { + self.handle_walk_successor( + successor, + &mut pending, + &mut actions, + &mut sleep_requests, + &mut forwarded_completed, + )?; + } + } + } + Ok(WalkOutcome { + actions, + sleep_requests, + }) + } + + fn handle_walk_successor( + &mut self, + successor: ExecutionNode, + pending: &mut Vec<(ExecutionNode, Option)>, + actions: &mut Vec, + sleep_requests: &mut Vec, + forwarded_completed: &mut HashSet, + ) -> Result<(), RunnerExecutorError> { + if self.forward_completed_successor(&successor, pending, forwarded_completed) { + return Ok(()); + } + if successor.is_action_call() { + actions.push(successor); + return Ok(()); + } + if successor.is_sleep() { + self.handle_sleep_successor(successor, pending, sleep_requests)?; + return Ok(()); + } + self.handle_inline_successor(successor, pending) + } + + fn forward_completed_successor( + &self, + successor: &ExecutionNode, + pending: &mut Vec<(ExecutionNode, Option)>, + forwarded_completed: &mut HashSet, + ) -> bool { + if successor.status != NodeStatus::Completed { + return false; + } + if forwarded_completed.insert(successor.node_id) { + // Rehydrated runs can revisit completed paths to recover downstream + // sleep/action work without mutating 
already completed nodes. + pending.push((successor.clone(), None)); + } + true + } + + fn handle_sleep_successor( + &mut self, + successor: ExecutionNode, + pending: &mut Vec<(ExecutionNode, Option)>, + sleep_requests: &mut Vec, + ) -> Result<(), RunnerExecutorError> { + if !self.inline_ready(&successor) { + return Ok(()); + } + match self.handle_sleep_node(&successor)? { + SleepDecision::Completed => pending.push((successor, None)), + SleepDecision::Blocked(wake_at) => sleep_requests.push(SleepRequest { + node_id: successor.node_id, + wake_at, + }), + } + Ok(()) + } + + fn handle_inline_successor( + &mut self, + successor: ExecutionNode, + pending: &mut Vec<(ExecutionNode, Option)>, + ) -> Result<(), RunnerExecutorError> { + if !self.inline_ready(&successor) { + return Ok(()); + } + self.execute_inline_node(&successor)?; + pending.push((successor, None)); + Ok(()) + } + + /// Update state for a finished node and return replay metadata. + #[obs] + fn apply_finished_node( + &mut self, + node_id: Uuid, + ) -> Result { + if self.execution_node(node_id)?.is_action_call() { + return self.apply_finished_action_node(node_id); + } + // Non-action nodes are inline runtime steps; completion is a status flip. 
+ self.state + .mark_completed(node_id) + .map_err(Self::state_error)?; + Ok(FinishedNodeOutcome { + start: Some(self.execution_node_clone(node_id)?), + exception_value: None, + action_done: None, + retry_action: None, + }) + } + + fn apply_finished_action_node( + &mut self, + node_id: Uuid, + ) -> Result { + let metadata = self.finished_action_metadata(node_id)?; + if is_exception_value(&metadata.result) { + return self.apply_exception_action_completion(node_id, metadata); + } + self.apply_successful_action_completion(node_id, metadata) + } + + fn finished_action_metadata( + &self, + node_id: Uuid, + ) -> Result { + let node = self.execution_node(node_id)?; + let result = + self.action_results.get(&node_id).cloned().ok_or_else(|| { + RunnerExecutorError(format!("missing action result for {node_id}")) + })?; + Ok(FinishedActionMetadata { + attempt: node.action_attempt, + started_at: node.started_at, + result, + }) + } + + fn apply_successful_action_completion( + &mut self, + node_id: Uuid, + metadata: FinishedActionMetadata, + ) -> Result { + self.state + .mark_completed(node_id) + .map_err(Self::state_error)?; + let assignments = self.execution_node(node_id)?.assignments.clone(); + if !assignments.is_empty() { + self.state.mark_latest_assignments(node_id, &assignments); + } + let completed_at = self + .execution_node(node_id)? 
+ .completed_at + .unwrap_or_else(Utc::now); + let action_done = build_action_done( + node_id, + metadata.attempt, + ActionAttemptStatus::Completed, + metadata.started_at, + completed_at, + metadata.result, + ); + Ok(FinishedNodeOutcome { + start: Some(self.execution_node_clone(node_id)?), + exception_value: None, + action_done: Some(action_done), + retry_action: None, + }) + } + + fn apply_exception_action_completion( + &mut self, + node_id: Uuid, + metadata: FinishedActionMetadata, + ) -> Result { + let exception_value = metadata.result; + let status = action_done_status_for_exception(&exception_value); + let finished_at = Utc::now(); + + match self.apply_action_failure_transition(node_id, Some(&exception_value), finished_at)? { + ActionFailureTransition::RetryQueued(retry_action) => { + // Retries are re-queued and dispatched in this same increment pass. + let action_done = build_action_done( + node_id, + metadata.attempt, + status, + metadata.started_at, + finished_at, + exception_value, + ); + Ok(FinishedNodeOutcome { + start: None, + exception_value: None, + action_done: Some(action_done), + retry_action: Some(*retry_action), + }) + } + ActionFailureTransition::Failed => { + // Terminal failures keep exception payloads on the node so exception + // handler edges can bind $__exception in downstream inline nodes. + if !self.failure_has_exception_handler(node_id, &exception_value)? + && self.terminal_error.is_none() + { + self.terminal_error = Some(exception_value.clone()); + } + let completed_at = self + .execution_node(node_id)? 
+ .completed_at + .unwrap_or(finished_at); + let action_done = build_action_done( + node_id, + metadata.attempt, + status, + metadata.started_at, + completed_at, + exception_value.clone(), + ); + Ok(FinishedNodeOutcome { + start: Some(self.execution_node_clone(node_id)?), + exception_value: Some(exception_value), + action_done: Some(action_done), + retry_action: None, + }) + } + } + } + + fn apply_action_failure_transition( + &mut self, + node_id: Uuid, + exception_value: Option<&Value>, + finished_at: DateTime, + ) -> Result { + let should_retry = { + let node = self.execution_node(node_id)?; + self.retry_decision(node, exception_value)?.should_retry + }; + if should_retry { + let retry_node = self.transition_action_to_retry(node_id, finished_at)?; + return Ok(ActionFailureTransition::RetryQueued(Box::new(retry_node))); + } + self.transition_action_to_failed(node_id, exception_value, finished_at)?; + Ok(ActionFailureTransition::Failed) + } + + fn transition_action_to_retry( + &mut self, + node_id: Uuid, + finished_at: DateTime, + ) -> Result { + // Retry transition invariants: + // 1) bump attempt counter before re-dispatch + // 2) return to queued status + // 3) keep completion timestamp for the failed attempt + self.state + .increment_action_attempt(node_id) + .map_err(Self::state_error)?; + let should_queue = !self.state.ready_queue.contains(&node_id); + { + let node = self.execution_node_mut(node_id)?; + node.status = NodeStatus::Queued; + node.started_at = None; + node.completed_at = Some(finished_at); + } + if should_queue { + self.state.ready_queue.push(node_id); + } + self.execution_node_clone(node_id) + } + + fn transition_action_to_failed( + &mut self, + node_id: Uuid, + exception_value: Option<&Value>, + finished_at: DateTime, + ) -> Result<(), RunnerExecutorError> { + self.state.mark_failed(node_id).map_err(Self::state_error)?; + self.execution_node_mut(node_id)?.completed_at = Some(finished_at); + if let Some(exception_value) = exception_value { + 
self.assign_exception_scope(node_id, exception_value.clone())?;
        }
        Ok(())
    }

    /// Bind the exception payload to the reserved exception-scope variable on
    /// the failed node so downstream handler edges can reference it.
    fn assign_exception_scope(
        &mut self,
        node_id: Uuid,
        exception_value: Value,
    ) -> Result<(), RunnerExecutorError> {
        let literal = ValueExpr::Literal(LiteralValue {
            value: exception_value,
        });
        let scoped = HashMap::from([(EXCEPTION_SCOPE_VAR.to_string(), literal.clone())]);
        self.execution_node_mut(node_id)?
            .assignments
            .insert(EXCEPTION_SCOPE_VAR.to_string(), literal);
        self.state.mark_latest_assignments(node_id, &scoped);
        Ok(())
    }

    /// Whether a terminal failure on `node_id` has a matching exception
    /// handler edge in the template graph.
    fn failure_has_exception_handler(
        &self,
        node_id: Uuid,
        exception_value: &Value,
    ) -> Result<bool, RunnerExecutorError> {
        let node = self.execution_node(node_id)?;
        let Some(template_id) = &node.template_id else {
            return Ok(false);
        };
        let Some(template_edges) = self.template_index.outgoing(template_id) else {
            return Ok(false);
        };
        let selected = self.select_edges(template_edges, node, Some(exception_value.clone()))?;
        Ok(selected
            .iter()
            .any(|edge| edge.edge_type == EdgeType::StateMachine))
    }

    /// Evaluate the node's retry policies against the current attempt count.
    fn retry_decision(
        &self,
        node: &ExecutionNode,
        exception_value: Option<&Value>,
    ) -> Result<RetryDecision, RunnerExecutorError> {
        let Some(action) = self.template_action_for_execution_node(node)? else {
            // No template action means no policies: never retry.
            return Ok(RetryDecision {
                should_retry: false,
            });
        };
        let exception_name = exception_value.and_then(exception_type);
        let evaluator = RetryPolicyEvaluator::new(&action.policies, exception_name);
        Ok(evaluator.decision(node.action_attempt))
    }

    /// Select outgoing edges based on guards and exception state.
+ fn select_edges( + &self, + edges: &[DAGEdge], + _node: &ExecutionNode, + exception_value: Option, + ) -> Result, RunnerExecutorError> { + // Fast path: exception handling + if let Some(exception_value) = exception_value { + let mut result = Vec::new(); + for edge in edges { + if edge.exception_types.is_some() && self.exception_matches(edge, &exception_value) + { + result.push(edge.clone()); + } + } + return Ok(result); + } + + // Check if we have any conditional edges (guards or else) + let has_guards = edges.iter().any(|e| e.guard_expr.is_some()); + let has_else = edges.iter().any(|e| e.is_else); + + if has_guards || has_else { + // Evaluate guards first + let mut passed = Vec::new(); + for edge in edges { + if edge.guard_expr.is_some() && self.evaluate_guard(edge.guard_expr.as_ref())? { + passed.push(edge.clone()); + } + } + if !passed.is_empty() { + return Ok(passed); + } + // Fall through to else edges + let mut else_edges = Vec::new(); + for edge in edges { + if edge.is_else { + else_edges.push(edge.clone()); + } + } + return Ok(else_edges); + } + + // Fast path: regular edges (no exceptions, guards, or else) + let mut result = Vec::with_capacity(edges.len()); + for edge in edges { + if edge.exception_types.is_none() { + result.push(edge.clone()); + } + } + Ok(result) + } + + /// Queue successor nodes for a template edge, handling spreads/aggregators. 
+ fn queue_successor( + &mut self, + source: &ExecutionNode, + edge: &DAGEdge, + ) -> Result, RunnerExecutorError> { + if edge.edge_type != EdgeType::StateMachine { + return Ok(Vec::new()); + } + + // Extract info from template without holding borrow across mutable calls + let kind = { + let template = self.dag.nodes.get(&edge.target).ok_or_else(|| { + RunnerExecutorError(format!("template node not found: {}", edge.target)) + })?; + + match template { + waymark_dag::DAGNode::ActionCall(action) if action.spread_loop_var.is_some() => { + TemplateKind::SpreadAction(Box::new(action.clone())) + } + waymark_dag::DAGNode::Aggregator(_) => { + TemplateKind::Aggregator(template.id().to_string()) + } + _ => TemplateKind::Regular(template.id().to_string()), + } + }; + + match kind { + TemplateKind::SpreadAction(action) => { + self.expand_spread_action(source, action.as_ref()) + } + TemplateKind::Aggregator(template_id) => { + if let Some(existing) = self.find_connected_successor(source.node_id, &template_id) + { + return Ok(vec![existing]); + } + let agg_node = self.get_or_create_aggregator(&template_id)?; + self.add_exec_edge(source.node_id, agg_node.node_id); + Ok(vec![agg_node]) + } + TemplateKind::Regular(template_id) => { + if let Some(existing) = self.find_connected_successor(source.node_id, &template_id) + { + return Ok(vec![existing]); + } + let exec_node = self.get_or_create_exec_node(&template_id)?; + self.add_exec_edge(source.node_id, exec_node.node_id); + Ok(vec![exec_node]) + } + } + } + + /// Unroll a spread action into per-item action nodes and a shared aggregator. + /// + /// Example IR: + /// - results = spread items:item -> @work(item=item) + /// Produces one action execution node per element in items and connects + /// them to a single aggregator node for results. 
+ fn expand_spread_action( + &mut self, + source: &ExecutionNode, + template: &ActionCallNode, + ) -> Result, RunnerExecutorError> { + let collection_expr = template.spread_collection_expr.as_ref().ok_or_else(|| { + RunnerExecutorError("spread action missing collection expression".to_string()) + })?; + let loop_var = template.spread_loop_var.as_ref().ok_or_else(|| { + RunnerExecutorError("spread action missing loop variable".to_string()) + })?; + let elements = self.expand_collection(collection_expr)?; + let agg_id = template.aggregates_to.as_ref().ok_or_else(|| { + RunnerExecutorError("spread action missing aggregator link".to_string()) + })?; + + let agg_node = self + .state + .queue_template_node(agg_id, None) + .map_err(|err| RunnerExecutorError(err.0))?; + if elements.is_empty() { + return Ok(vec![agg_node]); + } + + let mut created = Vec::new(); + for (idx, element) in elements.into_iter().enumerate() { + let exec_node = self.queue_action_from_template( + template, + Some(HashMap::from([(loop_var.clone(), element)])), + Some(idx as i32), + )?; + self.add_exec_edge(source.node_id, exec_node.node_id); + self.add_exec_edge(exec_node.node_id, agg_node.node_id); + created.push(exec_node); + } + Ok(created) + } + + /// Create an action execution node from a template with optional bindings. + /// + /// Example IR: + /// - @work(value=item) with local_scope{"item": LiteralValue(3)} + /// Produces an action node whose kwargs include the literal 3. 
+ fn queue_action_from_template( + &mut self, + template: &ActionCallNode, + local_scope: Option>, + iteration_index: Option, + ) -> Result { + let kwargs = template + .kwarg_exprs + .iter() + .map(|(name, expr)| { + let value = self + .state + .expr_to_value(expr, local_scope.as_ref()) + .map_err(|err| RunnerExecutorError(err.0))?; + Ok((name.clone(), value)) + }) + .collect::, RunnerExecutorError>>()?; + + let spec = ActionCallSpec { + action_name: template.action_name.clone(), + module_name: template.module_name.clone(), + kwargs, + }; + let targets = template + .targets + .clone() + .or_else(|| template.target.clone().map(|target| vec![target])) + .unwrap_or_default(); + let node = self + .state + .queue_node( + ExecutionNodeType::ActionCall.as_str(), + &template.label(), + QueueNodeParams { + template_id: Some(template.id.clone()), + targets: Some(targets.clone()), + action: Some(spec.clone()), + ..QueueNodeParams::default() + }, + ) + .map_err(|err| RunnerExecutorError(err.0))?; + for value in spec.kwargs.values() { + self.state.record_data_flow_from_value(node.node_id, value); + } + let result = self + .state + .assign_action_results( + &node, + &template.action_name, + Some(&targets), + iteration_index, + false, + ) + .map_err(|err| RunnerExecutorError(err.0))?; + if let Some(node_mut) = self.state.nodes.get_mut(&node.node_id) { + node_mut.value_expr = Some(ValueExpr::ActionResult(result)); + } + Ok(node) + } + + /// Execute a non-action node inline and update assignments/edges. 
+ fn execute_inline_node(&mut self, node: &ExecutionNode) -> Result<(), RunnerExecutorError> { + let template_id = node + .template_id + .as_ref() + .ok_or_else(|| RunnerExecutorError("inline node missing template id".to_string()))?; + let template = self.dag.nodes.get(template_id).ok_or_else(|| { + RunnerExecutorError(format!("template node not found: {template_id}")) + })?; + + let aggregator = match template { + waymark_dag::DAGNode::Aggregator(aggregator) => Some(aggregator.clone()), + _ => None, + }; + if let Some(aggregator) = aggregator { + self.apply_aggregator_assignments(node, &aggregator)?; + } + + self.state + .mark_completed(node.node_id) + .map_err(|err| RunnerExecutorError(err.0)) + } + + fn handle_sleep_node( + &mut self, + node: &ExecutionNode, + ) -> Result { + let now = Utc::now(); + let scheduled_at = self + .state + .nodes + .get(&node.node_id) + .and_then(|node| node.scheduled_at); + if let Some(wake_at) = scheduled_at { + if wake_at <= now { + self.state + .mark_completed(node.node_id) + .map_err(|err| RunnerExecutorError(err.0))?; + return Ok(SleepDecision::Completed); + } + return Ok(SleepDecision::Blocked(wake_at)); + } + + let value_expr = self + .state + .nodes + .get(&node.node_id) + .and_then(|node| node.value_expr.clone()) + .unwrap_or(ValueExpr::Literal(LiteralValue { + value: Value::Number(0.into()), + })); + let materialized = self.state.materialize_value(value_expr); + let duration_value = self.evaluate_value_expr(&materialized)?; + + let duration_secs = match duration_value { + Value::Number(value) => value.as_f64().ok_or_else(|| { + RunnerExecutorError("sleep duration must be a number".to_string()) + })?, + Value::Null => 0.0, + _ => { + return Err(RunnerExecutorError( + "sleep duration must be a number".to_string(), + )); + } + }; + + if !duration_secs.is_finite() { + return Err(RunnerExecutorError( + "sleep duration must be finite".to_string(), + )); + } + + if duration_secs <= 0.0 { + self.state + 
.mark_completed(node.node_id) + .map_err(|err| RunnerExecutorError(err.0))?; + return Ok(SleepDecision::Completed); + } + + let duration = Duration::from_secs_f64(duration_secs); + let chrono_duration = chrono::Duration::from_std(duration) + .map_err(|_| RunnerExecutorError("sleep duration is out of range".to_string()))?; + let wake_at = now + chrono_duration; + self.state + .set_node_scheduled_at(node.node_id, Some(wake_at)) + .map_err(|err| RunnerExecutorError(err.0))?; + Ok(SleepDecision::Blocked(wake_at)) + } + + /// Check if an inline node is ready to run based on incoming edges. + fn inline_ready(&self, node: &ExecutionNode) -> bool { + if node.status == NodeStatus::Completed { + return false; + } + let incoming = match self.incoming_exec_edges.get(&node.node_id) { + Some(edges) if !edges.is_empty() => edges, + _ => return true, // No incoming edges means ready + }; + + let template = match node + .template_id + .as_ref() + .and_then(|id| self.dag.nodes.get(id)) + { + Some(template) => template, + None => return false, + }; + + if let waymark_dag::DAGNode::Aggregator(_) = template { + if let Some(required) = self.template_index.incoming(template.id()) { + let connected = self.connected_template_sources(node.node_id); + if !required.is_subset(&connected) { + return false; + } + } + for edge in incoming { + if let Some(source) = self.state.nodes.get(&edge.source) { + if !matches!(source.status, NodeStatus::Completed | NodeStatus::Failed) { + return false; + } + } else { + return false; + } + } + return true; + } + + for edge in incoming { + if let Some(source) = self.state.nodes.get(&edge.source) { + if !matches!(source.status, NodeStatus::Completed | NodeStatus::Failed) { + return false; + } + } else { + return false; + } + } + true + } + + /// Populate aggregated list assignments for a ready aggregator node. 
+ /// + /// Example: + /// - results = spread items: @work(item) + /// When all action nodes complete, the aggregator assigns + /// results = [ActionResultValue(...), ...]. + fn apply_aggregator_assignments( + &mut self, + node: &ExecutionNode, + template: &AggregatorNode, + ) -> Result<(), RunnerExecutorError> { + let targets = template + .targets + .clone() + .or_else(|| template.target.clone().map(|target| vec![target])) + .unwrap_or_default(); + if targets.len() != 1 { + return Ok(()); + } + + let incoming_nodes: Vec = self + .incoming_exec_edges + .get(&node.node_id) + .cloned() + .unwrap_or_default() + .into_iter() + .filter(|edge| edge.edge_type == EdgeType::StateMachine) + .filter_map(|edge| self.state.nodes.get(&edge.source).cloned()) + .collect(); + + let mut values = Vec::new(); + for source in &incoming_nodes { + let value_expr = source.value_expr.clone().ok_or_else(|| { + RunnerExecutorError("aggregator missing source value".to_string()) + })?; + values.push(value_expr); + } + + let ordered = self.order_aggregated_values(&incoming_nodes, &values)?; + let list_value = ValueExpr::List(ListValue { elements: ordered }); + let assignment = HashMap::from([(targets[0].clone(), list_value.clone())]); + if let Some(node_mut) = self.state.nodes.get_mut(&node.node_id) { + node_mut.assignments.extend(assignment.clone()); + } + self.state + .mark_latest_assignments(node.node_id, &assignment); + self.state + .record_data_flow_from_value(node.node_id, &list_value); + Ok(()) + } + + /// Order aggregator values by spread iteration or parallel index. + fn order_aggregated_values( + &self, + sources: &[ExecutionNode], + values: &[ValueExpr], + ) -> Result, RunnerExecutorError> { + // Order by explicit iteration/parallel indices when available, then fall back to timeline. 
+ if sources.len() != values.len() { + return Err(RunnerExecutorError( + "aggregator sources/value mismatch".to_string(), + )); + } + let timeline_index: HashMap = self + .state + .timeline + .iter() + .enumerate() + .map(|(idx, node_id)| (*node_id, idx)) + .collect(); + let mut pairs: Vec<((i32, i32), ValueExpr)> = Vec::with_capacity(values.len()); + for (source, value) in sources.iter().zip(values.iter()) { + let key = self.aggregated_sort_key(source, value, &timeline_index); + pairs.push((key, value.clone())); + } + pairs.sort_by_key(|item| item.0); + Ok(pairs.into_iter().map(|(_, value)| value).collect()) + } + + fn aggregated_sort_key( + &self, + source: &ExecutionNode, + value: &ValueExpr, + timeline_index: &HashMap, + ) -> (i32, i32) { + let mut primary = 2; + let mut secondary = *timeline_index.get(&source.node_id).unwrap_or(&0) as i32; + if let ValueExpr::ActionResult(action) = value { + if let Some(iter_idx) = action.iteration_index { + primary = 0; + secondary = iter_idx; + } + } else if let Some(template_id) = &source.template_id + && let Some(waymark_dag::DAGNode::ActionCall(action)) = self.dag.nodes.get(template_id) + && let Some(idx) = action.parallel_index + { + primary = 1; + secondary = idx; + } + (primary, secondary) + } + + /// Expand a collection expression into element ValueExprs. + /// + /// Example IR: + /// - spread range(3):i -> @work(i) + /// Produces [LiteralValue(0), LiteralValue(1), LiteralValue(2)]. 
+ fn expand_collection( + &mut self, + expr: &ir::Expr, + ) -> Result, RunnerExecutorError> { + let value = Self::expr_to_value(expr)?; + let value = self.state.materialize_value(value); + if let ValueExpr::List(list) = value { + return Ok(list.elements); + } + + if let ValueExpr::ActionResult(action_value) = value.clone() { + let action_result = self.resolve_action_result(&action_value)?; + if let Value::Array(items) = action_result { + return Ok(items + .iter() + .enumerate() + .map(|(idx, _)| { + ValueExpr::Index(IndexValue { + object: Box::new(ValueExpr::ActionResult(action_value.clone())), + index: Box::new(ValueExpr::Literal(LiteralValue { + value: Value::Number((idx as i64).into()), + })), + }) + }) + .collect()); + } + return Err(RunnerExecutorError( + "spread collection is not iterable".to_string(), + )); + } + + let evaluated = self.evaluate_value_expr(&value)?; + if let Value::Array(items) = evaluated { + return Ok(items + .into_iter() + .map(|item| ValueExpr::Literal(LiteralValue { value: item })) + .collect()); + } + + Err(RunnerExecutorError( + "spread collection is not iterable".to_string(), + )) + } + + fn build_incoming_exec_edges(state: &RunnerState) -> FxHashMap> { + let mut incoming: FxHashMap> = FxHashMap::default(); + for edge in &state.edges { + if edge.edge_type != EdgeType::StateMachine { + continue; + } + incoming.entry(edge.target).or_default().push(edge.clone()); + } + incoming + } + + fn build_template_to_exec_nodes(state: &RunnerState) -> FxHashMap> { + let mut index: FxHashMap> = FxHashMap::default(); + for (node_id, node) in &state.nodes { + if let Some(template_id) = &node.template_id { + index.entry(template_id.clone()).or_default().push(*node_id); + } + } + index + } + + /// Register a new execution node in the template index + fn register_exec_node(&mut self, template_id: &str, node_id: Uuid) { + self.template_to_exec_nodes + .entry(template_id.to_string()) + .or_default() + .push(node_id); + } + + fn add_exec_edge(&mut self, 
source: Uuid, target: Uuid) { + let edge = ExecutionEdge { + source, + target, + edge_type: EdgeType::StateMachine, + }; + if self.state.edges.contains(&edge) { + return; + } + self.state.edges.insert(edge.clone()); + self.incoming_exec_edges + .entry(target) + .or_default() + .push(edge); + } + + fn connected_template_sources(&self, exec_node_id: Uuid) -> HashSet { + let mut connected = HashSet::new(); + for edge in self + .incoming_exec_edges + .get(&exec_node_id) + .cloned() + .unwrap_or_default() + { + if let Some(source) = self.state.nodes.get(&edge.source) + && let Some(template_id) = &source.template_id + { + connected.insert(template_id.clone()); + } + } + connected + } + + fn find_connected_successor( + &self, + source_id: Uuid, + template_id: &str, + ) -> Option { + for edge in &self.state.edges { + if edge.edge_type != EdgeType::StateMachine || edge.source != source_id { + continue; + } + let target = self.state.nodes.get(&edge.target)?; + if target.template_id.as_deref() == Some(template_id) { + return Some(target.clone()); + } + } + None + } + + fn get_or_create_aggregator( + &mut self, + template_id: &str, + ) -> Result { + let mut candidates: Vec = self + .state + .nodes + .values() + .filter(|node| { + node.template_id.as_deref() == Some(template_id) + && node.status != NodeStatus::Completed + }) + .cloned() + .collect(); + if !candidates.is_empty() { + let timeline_index: HashMap = self + .state + .timeline + .iter() + .enumerate() + .map(|(idx, node_id)| (*node_id, idx)) + .collect(); + candidates.sort_by_key(|node| { + std::cmp::Reverse(timeline_index.get(&node.node_id).copied().unwrap_or(0)) + }); + return Ok(candidates[0].clone()); + } + self.state + .queue_template_node(template_id, None) + .map_err(|err| RunnerExecutorError(err.0)) + } + + fn get_or_create_exec_node( + &mut self, + template_id: &str, + ) -> Result { + // Use the index to find candidate nodes - O(k) where k is nodes for this template + if let Some(node_ids) = 
self.template_to_exec_nodes.get(template_id) { + // Find the most recent non-completed node + let mut best_node_id: Option = None; + let mut best_timeline_pos: Option = None; + + for &node_id in node_ids { + if let Some(node) = self.state.nodes.get(&node_id) + && !matches!(node.status, NodeStatus::Completed | NodeStatus::Failed) + { + let timeline_pos = self.state.timeline.iter().position(|&id| id == node_id); + if let Some(pos) = timeline_pos { + if best_timeline_pos.is_none() || pos > best_timeline_pos.unwrap() { + best_timeline_pos = Some(pos); + best_node_id = Some(node_id); + } + } else if best_node_id.is_none() { + best_node_id = Some(node_id); + } + } + } + + if let Some(node_id) = best_node_id { + return self + .state + .nodes + .get(&node_id) + .cloned() + .ok_or_else(|| RunnerExecutorError(format!("node disappeared: {node_id}"))); + } + } + + // Create new node and register it in the index + let node = self + .state + .queue_template_node(template_id, None) + .map_err(|err| RunnerExecutorError(err.0))?; + self.register_exec_node(template_id, node.node_id); + Ok(node) + } + + fn execution_node(&self, node_id: Uuid) -> Result<&ExecutionNode, RunnerExecutorError> { + self.state + .nodes + .get(&node_id) + .ok_or_else(|| RunnerExecutorError(format!("execution node not found: {node_id}"))) + } + + fn execution_node_mut( + &mut self, + node_id: Uuid, + ) -> Result<&mut ExecutionNode, RunnerExecutorError> { + self.state + .nodes + .get_mut(&node_id) + .ok_or_else(|| RunnerExecutorError(format!("execution node not found: {node_id}"))) + } + + fn execution_node_clone(&self, node_id: Uuid) -> Result { + self.execution_node(node_id).cloned() + } + + fn template_action_for_execution_node( + &self, + node: &ExecutionNode, + ) -> Result, RunnerExecutorError> { + let Some(template_id) = node.template_id.as_ref() else { + return Ok(None); + }; + let template = self.dag.nodes.get(template_id).ok_or_else(|| { + RunnerExecutorError(format!("template node not found: 
{template_id}")) + })?; + match template { + waymark_dag::DAGNode::ActionCall(action) => Ok(Some(action)), + _ => Ok(None), + } + } + + fn state_error(err: RunnerStateError) -> RunnerExecutorError { + RunnerExecutorError(err.0) + } + + fn collect_updates( + &mut self, + actions_done: Vec, + ) -> Result, RunnerExecutorError> { + if self.backend.is_none() { + return Ok(None); + } + let graph_dirty = self.state.consume_graph_dirty_for_durable_execution(); + let mut graph_updates = Vec::new(); + if graph_dirty { + let instance_id = self.instance_id.ok_or_else(|| { + RunnerExecutorError("instance_id is required for graph persistence".to_string()) + })?; + graph_updates.push(GraphUpdate::from_state(instance_id, &self.state)); + } + let updates = DurableUpdates { + actions_done, + graph_updates, + }; + if updates.actions_done.is_empty() && updates.graph_updates.is_empty() { + Ok(None) + } else { + Ok(Some(updates)) + } + } +} + +fn exception_type(value: &Value) -> Option<&str> { + match value { + Value::Object(map) => map.get("type").and_then(|value| value.as_str()), + _ => None, + } +} + +fn action_done_status_for_exception(value: &Value) -> ActionAttemptStatus { + match SyntheticExceptionType::from_value(value) { + Some(SyntheticExceptionType::ExecutorResume) + | Some(SyntheticExceptionType::ActionTimeout) => ActionAttemptStatus::TimedOut, + None => ActionAttemptStatus::Failed, + } +} + +fn compute_action_duration_ms( + started_at: Option>, + completed_at: DateTime, +) -> Option { + started_at + .map(|started_at| { + completed_at + .signed_duration_since(started_at) + .num_milliseconds() + }) + .filter(|duration| *duration >= 0) +} + +fn build_action_done( + execution_id: Uuid, + attempt: i32, + status: ActionAttemptStatus, + started_at: Option>, + completed_at: DateTime, + result: Value, +) -> ActionDone { + ActionDone { + execution_id, + attempt, + status, + started_at, + completed_at: Some(completed_at), + duration_ms: compute_action_duration_ms(started_at, 
completed_at), + result, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::collections::{HashMap, HashSet}; + use std::sync::Arc; + + use waymark_backend_memory::MemoryBackend; + use waymark_dag::{ + ActionCallNode, ActionCallParams, AggregatorNode, AssignmentNode, DAG, DAGEdge, + convert_to_dag, + }; + use waymark_ir_parser::parse_program; + use waymark_proto::ast as ir; + use waymark_runner_state::{ExecutionEdge, ExecutionNode, NodeStatus, RunnerState}; + + fn variable(name: &str) -> ir::Expr { + ir::Expr { + kind: Some(ir::expr::Kind::Variable(ir::Variable { + name: name.to_string(), + })), + span: None, + } + } + + fn literal_int(value: i64) -> ir::Expr { + ir::Expr { + kind: Some(ir::expr::Kind::Literal(ir::Literal { + value: Some(ir::literal::Value::IntValue(value)), + })), + span: None, + } + } + + fn binary(left: ir::Expr, op: ir::BinaryOperator, right: ir::Expr) -> ir::Expr { + ir::Expr { + kind: Some(ir::expr::Kind::BinaryOp(Box::new(ir::BinaryOp { + left: Some(Box::new(left)), + op: op as i32, + right: Some(Box::new(right)), + }))), + span: None, + } + } + + #[test] + fn test_action_done_status_for_resume_exception_is_timed_out() { + let value = serde_json::json!({ + "type": "ExecutorResume", + "message": "resumed action timed out", + }); + assert_eq!( + action_done_status_for_exception(&value), + ActionAttemptStatus::TimedOut + ); + } + + #[test] + fn test_action_done_status_for_action_timeout_exception_is_timed_out() { + let value = serde_json::json!({ + "type": "ActionTimeout", + "message": "action timed out", + "timeout_seconds": 1, + "attempt": 1, + }); + assert_eq!( + action_done_status_for_exception(&value), + ActionAttemptStatus::TimedOut + ); + } + + #[test] + fn test_action_done_status_for_generic_exception_is_failed() { + let value = serde_json::json!({ + "type": "ValueError", + "message": "boom", + }); + assert_eq!( + action_done_status_for_exception(&value), + ActionAttemptStatus::Failed + ); + } + + #[test] + fn 
test_action_done_status_for_non_synthetic_timeout_error_is_failed() { + let value = serde_json::json!({ + "type": "TimeoutError", + "message": "user action raised timeout", + }); + assert_eq!( + action_done_status_for_exception(&value), + ActionAttemptStatus::Failed + ); + } + + #[test] + fn test_build_action_done_sets_duration_from_started_and_completed() { + let execution_id = Uuid::new_v4(); + let started_at = Utc::now(); + let completed_at = started_at + chrono::Duration::milliseconds(275); + let done = build_action_done( + execution_id, + 2, + ActionAttemptStatus::Completed, + Some(started_at), + completed_at, + serde_json::json!({"ok": true}), + ); + + assert_eq!(done.execution_id, execution_id); + assert_eq!(done.attempt, 2); + assert_eq!(done.status, ActionAttemptStatus::Completed); + assert_eq!(done.started_at, Some(started_at)); + assert_eq!(done.completed_at, Some(completed_at)); + assert_eq!(done.duration_ms, Some(275)); + } + + #[derive(Default)] + struct ActionNodeOptions { + policies: Vec, + spread_loop_var: Option, + spread_collection_expr: Option, + aggregates_to: Option, + } + + fn action_node( + node_id: &str, + action_name: &str, + kwarg_exprs: HashMap, + targets: Vec, + options: ActionNodeOptions, + ) -> ActionCallNode { + let ActionNodeOptions { + policies, + spread_loop_var, + spread_collection_expr, + aggregates_to, + } = options; + ActionCallNode::new( + node_id, + action_name, + ActionCallParams { + module_name: None, + kwargs: HashMap::new(), + kwarg_exprs, + policies, + targets: Some(targets), + target: None, + parallel_index: None, + aggregates_to, + spread_loop_var, + spread_collection_expr, + function_name: Some("main".to_string()), + }, + ) + } + + fn assignment_node( + node_id: &str, + targets: Vec, + assign_expr: ir::Expr, + ) -> AssignmentNode { + AssignmentNode::new( + node_id, + targets, + None, + Some(assign_expr), + None, + Some("main".to_string()), + ) + } + + fn aggregator_node( + node_id: &str, + aggregates_from: &str, + 
targets: Vec, + ) -> AggregatorNode { + AggregatorNode::new( + node_id, + aggregates_from, + Some(targets), + None, + "aggregate", + Some("main".to_string()), + ) + } + + fn snapshot_state( + state: &RunnerState, + action_results: &HashMap, + ) -> ( + HashMap, + HashSet, + HashMap, + ) { + ( + state.nodes.clone(), + state.edges.clone(), + action_results.clone(), + ) + } + + fn create_rehydrated_executor( + dag: &Arc, + nodes: HashMap, + edges: HashSet, + action_results: HashMap, + ) -> RunnerExecutor { + let state = RunnerState::new(Some(Arc::clone(dag)), Some(nodes), Some(edges), false); + RunnerExecutor::new(Arc::clone(dag), state, action_results, None) + } + + fn compare_executor_states(original: &RunnerExecutor, rehydrated: &RunnerExecutor) { + let orig_state = original.state(); + let rehy_state = rehydrated.state(); + assert_eq!( + orig_state.nodes.keys().collect::>(), + rehy_state.nodes.keys().collect::>(), + ); + for node_id in orig_state.nodes.keys() { + let orig_node = orig_state.nodes.get(node_id).unwrap(); + let rehy_node = rehy_state.nodes.get(node_id).unwrap(); + assert_eq!(orig_node.node_type, rehy_node.node_type); + assert_eq!(orig_node.status, rehy_node.status); + assert_eq!(orig_node.template_id, rehy_node.template_id); + assert_eq!(orig_node.targets, rehy_node.targets); + assert_eq!(orig_node.action_attempt, rehy_node.action_attempt); + } + assert_eq!(orig_state.edges, rehy_state.edges); + } + + fn completion_action_result(action: &ExecutionNode) -> Value { + Value::String(format!( + "{}:attempt{}", + action.template_id.as_deref().unwrap_or("unknown_action"), + action.action_attempt + )) + } + + fn dag_from_ir_source(source: &str) -> Arc { + let program = parse_program(source.trim()).expect("parse program"); + Arc::new(convert_to_dag(&program).expect("convert program to DAG")) + } + + fn build_executor_at_entry(dag: &Arc) -> (RunnerExecutor, Uuid) { + let mut state = RunnerState::new(Some(Arc::clone(dag)), None, None, false); + let entry_template 
= dag.entry_node.as_ref().expect("dag entry node"); + let entry_exec = state + .queue_template_node(entry_template, None) + .expect("queue entry node"); + ( + RunnerExecutor::new(Arc::clone(dag), state, HashMap::new(), None), + entry_exec.node_id, + ) + } + + type ActionResultFor = fn(&ExecutionNode) -> Value; + + struct RehydrateBranchHarness { + dag: Arc, + canonical: RunnerExecutor, + branches: Vec, + action_result_for: ActionResultFor, + } + + impl RehydrateBranchHarness { + const MAX_TICKS: usize = 256; + + fn new( + dag: Arc, + canonical: RunnerExecutor, + action_result_for: ActionResultFor, + ) -> Self { + let mut harness = Self { + dag, + canonical, + branches: Vec::new(), + action_result_for, + }; + harness.fork_from_canonical(); + harness + } + + fn run_and_assert(mut self) { + self.advance_canonical_with_forks(); + for (index, branch) in self.branches.iter_mut().enumerate() { + Self::advance_executor_to_completion(branch, self.action_result_for) + .unwrap_or_else(|err| panic!("branch {index} failed to complete: {err}")); + Self::assert_completed_executor_equivalent(&self.canonical, branch); + } + } + + fn fork_from_canonical(&mut self) { + let (nodes_snap, edges_snap, results_snap) = + snapshot_state(self.canonical.state(), self.canonical.action_results()); + self.branches.push(create_rehydrated_executor( + &self.dag, + nodes_snap, + edges_snap, + results_snap, + )); + } + + fn advance_canonical_with_forks(&mut self) { + let mut converged = false; + for _ in 0..Self::MAX_TICKS { + let progressed = Self::advance_executor_one_increment( + &mut self.canonical, + self.action_result_for, + ) + .expect("advance canonical executor"); + if !progressed { + converged = true; + break; + } + self.fork_from_canonical(); + } + assert!(converged, "canonical executor did not converge"); + assert!( + !self.branches.is_empty(), + "expected at least one rehydrated branch" + ); + } + + fn advance_executor_one_increment( + executor: &mut RunnerExecutor, + action_result_for: 
ActionResultFor, + ) -> Result { + let active_actions: Vec = executor + .state() + .nodes + .values() + .filter(|node| { + node.is_action_call() + && matches!(node.status, NodeStatus::Queued | NodeStatus::Running) + }) + .cloned() + .collect(); + for action in &active_actions { + if !executor.action_results().contains_key(&action.node_id) { + executor.set_action_result(action.node_id, action_result_for(action)); + } + } + + let mut finished_nodes: Vec = + active_actions.iter().map(|node| node.node_id).collect(); + finished_nodes.extend( + executor + .state() + .nodes + .values() + .filter(|node| { + node.status == NodeStatus::Queued + && node.is_sleep() + && node.scheduled_at.is_some() + }) + .map(|node| node.node_id), + ); + + if finished_nodes.is_empty() { + return Ok(false); + } + + let step = executor.increment(&finished_nodes)?; + for action in &step.actions { + if !executor.action_results().contains_key(&action.node_id) { + executor.set_action_result(action.node_id, action_result_for(action)); + } + } + for sleep_request in &step.sleep_requests { + executor + .state_mut() + .set_node_scheduled_at( + sleep_request.node_id, + Some(Utc::now() - chrono::Duration::seconds(1)), + ) + .map_err(|err| RunnerExecutorError(err.0))?; + } + Ok(true) + } + + fn advance_executor_to_completion( + executor: &mut RunnerExecutor, + action_result_for: ActionResultFor, + ) -> Result<(), RunnerExecutorError> { + for _ in 0..Self::MAX_TICKS { + if !Self::advance_executor_one_increment(executor, action_result_for)? 
{ + return Ok(()); + } + } + + Err(RunnerExecutorError( + "executor did not converge to completion".to_string(), + )) + } + + fn count_keyed(items: impl IntoIterator) -> HashMap { + let mut counts: HashMap = HashMap::new(); + for item in items { + *counts.entry(item).or_insert(0) += 1; + } + counts + } + + fn node_shape_counts(executor: &RunnerExecutor) -> HashMap { + Self::count_keyed(executor.state().nodes.values().map(|node| { + let mut targets = node.targets.clone(); + targets.sort(); + let mut assignment_keys: Vec = node.assignments.keys().cloned().collect(); + assignment_keys.sort(); + let mut action_kwarg_keys = node + .action + .as_ref() + .map(|action| action.kwargs.keys().cloned().collect::>()) + .unwrap_or_default(); + action_kwarg_keys.sort(); + format!( + "type={}|template={}|status={:?}|attempt={}|targets={targets:?}|assignments={assignment_keys:?}|action={}({action_kwarg_keys:?})|scheduled={}", + node.node_type, + node.template_id.clone().unwrap_or_default(), + node.status, + node.action_attempt, + node.action + .as_ref() + .map(|action| action.action_name.clone()) + .unwrap_or_default(), + node.scheduled_at.is_some(), + ) + })) + } + + fn edge_shape_counts(executor: &RunnerExecutor) -> HashMap { + Self::count_keyed(executor.state().edges.iter().map(|edge| { + let source = executor + .state() + .nodes + .get(&edge.source) + .expect("source node") + .template_id + .clone() + .unwrap_or_else(|| "__unknown_source".to_string()); + let target = executor + .state() + .nodes + .get(&edge.target) + .expect("target node") + .template_id + .clone() + .unwrap_or_else(|| "__unknown_target".to_string()); + format!("{source}-{:?}->{target}", edge.edge_type) + })) + } + + fn action_result_counts(executor: &RunnerExecutor) -> HashMap { + Self::count_keyed(executor.action_results().iter().map(|(node_id, value)| { + let template_id = executor + .state() + .nodes + .get(node_id) + .and_then(|node| node.template_id.clone()) + .unwrap_or_else(|| 
"__unknown_action".to_string()); + let rendered = + serde_json::to_string(value).expect("action result should serialize to JSON"); + format!("{template_id}:{rendered}") + })) + } + + fn assert_completed_executor_equivalent( + canonical: &RunnerExecutor, + rehydrated: &RunnerExecutor, + ) { + assert_eq!( + Self::node_shape_counts(canonical), + Self::node_shape_counts(rehydrated) + ); + assert_eq!( + Self::edge_shape_counts(canonical), + Self::edge_shape_counts(rehydrated) + ); + assert_eq!( + canonical.state().timeline.len(), + rehydrated.state().timeline.len() + ); + assert_eq!( + Self::action_result_counts(canonical), + Self::action_result_counts(rehydrated) + ); + assert_eq!( + canonical.state().ready_queue.is_empty(), + rehydrated.state().ready_queue.is_empty() + ); + + let replay_canonical = + crate::replay_variables(canonical.state(), canonical.action_results()) + .expect("replay canonical"); + let replay_rehydrated = + crate::replay_variables(rehydrated.state(), rehydrated.action_results()) + .expect("replay rehydrated"); + + let mut assignment_counts: HashMap = HashMap::new(); + for node in canonical.state().nodes.values() { + for target in node.assignments.keys() { + *assignment_counts.entry(target.clone()).or_insert(0) += 1; + } + } + let stable_canonical: HashMap = replay_canonical + .variables + .into_iter() + .filter(|(name, _)| assignment_counts.get(name).copied().unwrap_or(0) <= 1) + .collect(); + let stable_rehydrated: HashMap = replay_rehydrated + .variables + .into_iter() + .filter(|(name, _)| assignment_counts.get(name).copied().unwrap_or(0) <= 1) + .collect(); + assert_eq!(stable_canonical, stable_rehydrated); + } + } + + fn setup_linear_assignment_checkpoint() -> (Arc, RunnerExecutor) { + let dag = dag_from_ir_source( + r#" +fn main(input: [], output: [z]): + x = @fetch() + y = x + 1 + z = @process(value=y) + return z +"#, + ); + let (mut executor, entry_exec_id) = build_executor_at_entry(&dag); + + let first_step = executor + 
.increment(&[entry_exec_id]) + .expect("advance from entry"); + assert_eq!(first_step.actions.len(), 1); + let first_exec = first_step.actions[0].clone(); + executor.set_action_result(first_exec.node_id, Value::Number(10.into())); + + let step = executor.increment(&[first_exec.node_id]).expect("advance"); + assert_eq!(step.actions.len(), 1); + (dag, executor) + } + + fn setup_sleep_resume_checkpoint() -> (Arc, RunnerExecutor) { + let dag = dag_from_ir_source( + r#" +fn main(input: [], output: [resumed]): + seed = 1 + started = @get_timestamp() + sleep 60 + resumed = @get_timestamp() + return resumed +"#, + ); + let (mut executor, entry_exec_id) = build_executor_at_entry(&dag); + + let start_step = executor.increment(&[entry_exec_id]).expect("start"); + assert_eq!(start_step.actions.len(), 1); + let start_exec = start_step.actions[0].clone(); + executor.set_action_result(start_exec.node_id, Value::String("t0".to_string())); + + let sleep_step = executor + .increment(&[start_exec.node_id]) + .expect("advance to sleep"); + assert!(sleep_step.actions.is_empty()); + assert_eq!(sleep_step.sleep_requests.len(), 1); + (dag, executor) + } + + fn setup_spread_checkpoint() -> (Arc, RunnerExecutor) { + let dag = dag_from_ir_source( + r#" +fn main(input: [], output: [done]): + items = @get_items() + results = spread items:item -> @double(value=item) + done = @finalize(values=results) + return done +"#, + ); + let (mut executor, entry_exec_id) = build_executor_at_entry(&dag); + + let first_step = executor.increment(&[entry_exec_id]).expect("start"); + assert_eq!(first_step.actions.len(), 1); + let initial_exec = first_step.actions[0].clone(); + executor.set_action_result( + initial_exec.node_id, + Value::Array(vec![1.into(), 2.into(), 3.into()]), + ); + + let step1 = executor + .increment(&[initial_exec.node_id]) + .expect("expand spread"); + assert_eq!(step1.actions.len(), 3); + for (idx, node) in step1.actions.iter().enumerate() { + executor.set_action_result(node.node_id, 
Value::Number(((idx + 1) as i64).into())); + } + + let step2 = executor + .increment( + &step1 + .actions + .iter() + .map(|node| node.node_id) + .collect::>(), + ) + .expect("complete spread"); + assert_eq!(step2.actions.len(), 1); + (dag, executor) + } + + #[test] + fn test_executor_unblocks_downstream_action() { + let mut dag = DAG::default(); + + let action_start = action_node( + "action_start", + "fetch", + HashMap::new(), + vec!["x".to_string()], + ActionNodeOptions::default(), + ); + let assign_node = assignment_node( + "assign", + vec!["y".to_string()], + binary( + variable("x"), + ir::BinaryOperator::BinaryOpAdd, + literal_int(1), + ), + ); + let action_next = action_node( + "action_next", + "work", + HashMap::from([("value".to_string(), variable("y"))]), + vec!["z".to_string()], + ActionNodeOptions::default(), + ); + + dag.add_node(waymark_dag::DAGNode::ActionCall(action_start.clone())); + dag.add_node(waymark_dag::DAGNode::Assignment(assign_node.clone())); + dag.add_node(waymark_dag::DAGNode::ActionCall(action_next.clone())); + dag.add_edge(DAGEdge::state_machine( + action_start.id.clone(), + assign_node.id.clone(), + )); + dag.add_edge(DAGEdge::state_machine( + assign_node.id.clone(), + action_next.id.clone(), + )); + + let dag = Arc::new(dag); + let mut state = RunnerState::new(Some(dag.clone()), None, None, false); + let start_exec = state + .queue_template_node(&action_start.id, None) + .expect("queue"); + + let mut action_results = HashMap::new(); + action_results.insert(start_exec.node_id, Value::Number(10.into())); + let mut executor = RunnerExecutor::new(dag.clone(), state, action_results, None); + + let step = executor + .increment(&[start_exec.node_id]) + .expect("increment"); + assert_eq!(step.actions.len(), 1); + assert_eq!( + step.actions[0].template_id.as_deref(), + Some(action_next.id.as_str()) + ); + } + + #[test] + fn test_rehydrate_after_first_action_queued() { + let mut dag = DAG::default(); + let action1 = action_node( + "action1", + 
"fetch", + HashMap::new(), + vec!["x".to_string()], + ActionNodeOptions::default(), + ); + let action2 = action_node( + "action2", + "process", + HashMap::from([("value".to_string(), variable("x"))]), + vec!["y".to_string()], + ActionNodeOptions::default(), + ); + + dag.add_node(waymark_dag::DAGNode::ActionCall(action1.clone())); + dag.add_node(waymark_dag::DAGNode::ActionCall(action2.clone())); + dag.add_edge(DAGEdge::state_machine( + action1.id.clone(), + action2.id.clone(), + )); + + let dag = Arc::new(dag); + let mut state = RunnerState::new(Some(dag.clone()), None, None, false); + let exec1 = state.queue_template_node(&action1.id, None).expect("queue"); + let executor = RunnerExecutor::new(dag.clone(), state, HashMap::new(), None); + + let (nodes_snap, edges_snap, results_snap) = + snapshot_state(executor.state(), executor.action_results()); + let rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); + + compare_executor_states(&executor, &rehydrated); + let node = rehydrated.state().nodes.get(&exec1.node_id).expect("node"); + assert_eq!(node.status, NodeStatus::Queued); + } + + #[test] + fn test_rehydrate_after_action_completed_and_increment() { + let mut dag = DAG::default(); + let action1 = action_node( + "action1", + "fetch", + HashMap::new(), + vec!["x".to_string()], + ActionNodeOptions::default(), + ); + let action2 = action_node( + "action2", + "process", + HashMap::from([("value".to_string(), variable("x"))]), + vec!["y".to_string()], + ActionNodeOptions::default(), + ); + + dag.add_node(waymark_dag::DAGNode::ActionCall(action1.clone())); + dag.add_node(waymark_dag::DAGNode::ActionCall(action2.clone())); + dag.add_edge(DAGEdge::state_machine( + action1.id.clone(), + action2.id.clone(), + )); + + let dag = Arc::new(dag); + let mut state = RunnerState::new(Some(dag.clone()), None, None, false); + let exec1 = state.queue_template_node(&action1.id, None).expect("queue"); + + let mut action_results = HashMap::new(); + 
action_results.insert(exec1.node_id, Value::Number(42.into())); + let mut executor = RunnerExecutor::new(dag.clone(), state, action_results, None); + + let step = executor.increment(&[exec1.node_id]).expect("increment"); + assert_eq!(step.actions.len(), 1); + let exec2 = &step.actions[0]; + assert_eq!(exec2.template_id.as_deref(), Some(action2.id.as_str())); + + let (nodes_snap, edges_snap, results_snap) = + snapshot_state(executor.state(), executor.action_results()); + let rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); + compare_executor_states(&executor, &rehydrated); + + let node1 = rehydrated.state().nodes.get(&exec1.node_id).unwrap(); + assert_eq!(node1.status, NodeStatus::Completed); + let node2 = rehydrated.state().nodes.get(&exec2.node_id).unwrap(); + assert_eq!(node2.status, NodeStatus::Running); + } + + #[test] + fn test_rehydrate_multi_step_chain() { + let mut dag = DAG::default(); + let action1 = action_node( + "action1", + "step1", + HashMap::new(), + vec!["a".to_string()], + ActionNodeOptions::default(), + ); + let action2 = action_node( + "action2", + "step2", + HashMap::from([("input".to_string(), variable("a"))]), + vec!["b".to_string()], + ActionNodeOptions::default(), + ); + let action3 = action_node( + "action3", + "step3", + HashMap::from([("input".to_string(), variable("b"))]), + vec!["c".to_string()], + ActionNodeOptions::default(), + ); + + dag.add_node(waymark_dag::DAGNode::ActionCall(action1.clone())); + dag.add_node(waymark_dag::DAGNode::ActionCall(action2.clone())); + dag.add_node(waymark_dag::DAGNode::ActionCall(action3.clone())); + dag.add_edge(DAGEdge::state_machine( + action1.id.clone(), + action2.id.clone(), + )); + dag.add_edge(DAGEdge::state_machine( + action2.id.clone(), + action3.id.clone(), + )); + + let dag = Arc::new(dag); + let mut state = RunnerState::new(Some(dag.clone()), None, None, false); + let exec1 = state.queue_template_node(&action1.id, None).expect("queue"); + let mut executor 
= RunnerExecutor::new(dag.clone(), state, HashMap::new(), None); + + let (nodes_snap, edges_snap, results_snap) = + snapshot_state(executor.state(), executor.action_results()); + let rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); + compare_executor_states(&executor, &rehydrated); + + executor.set_action_result(exec1.node_id, Value::Number(10.into())); + let step1 = executor.increment(&[exec1.node_id]).expect("increment"); + let exec2 = step1.actions[0].clone(); + + let (nodes_snap, edges_snap, results_snap) = + snapshot_state(executor.state(), executor.action_results()); + let rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); + compare_executor_states(&executor, &rehydrated); + + executor.set_action_result(exec2.node_id, Value::Number(20.into())); + let step2 = executor.increment(&[exec2.node_id]).expect("increment"); + let exec3 = step2.actions[0].clone(); + + let (nodes_snap, edges_snap, results_snap) = + snapshot_state(executor.state(), executor.action_results()); + let rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); + compare_executor_states(&executor, &rehydrated); + + executor.set_action_result(exec3.node_id, Value::Number(30.into())); + let step3 = executor.increment(&[exec3.node_id]).expect("increment"); + assert!(step3.actions.is_empty()); + + let (nodes_snap, edges_snap, results_snap) = + snapshot_state(executor.state(), executor.action_results()); + let rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); + compare_executor_states(&executor, &rehydrated); + + for node in rehydrated.state().nodes.values() { + if node.is_action_call() { + assert_eq!(node.status, NodeStatus::Completed); + } + } + } + + #[test] + fn test_rehydrate_with_assignment_node() { + let mut dag = DAG::default(); + let action1 = action_node( + "action1", + "fetch", + HashMap::new(), + vec!["x".to_string()], + ActionNodeOptions::default(), + ); + 
let assign = assignment_node( + "assign", + vec!["y".to_string()], + binary( + variable("x"), + ir::BinaryOperator::BinaryOpAdd, + literal_int(1), + ), + ); + let action2 = action_node( + "action2", + "process", + HashMap::from([("value".to_string(), variable("y"))]), + vec!["z".to_string()], + ActionNodeOptions::default(), + ); + + dag.add_node(waymark_dag::DAGNode::ActionCall(action1.clone())); + dag.add_node(waymark_dag::DAGNode::Assignment(assign.clone())); + dag.add_node(waymark_dag::DAGNode::ActionCall(action2.clone())); + dag.add_edge(DAGEdge::state_machine( + action1.id.clone(), + assign.id.clone(), + )); + dag.add_edge(DAGEdge::state_machine( + assign.id.clone(), + action2.id.clone(), + )); + + let dag = Arc::new(dag); + let mut state = RunnerState::new(Some(dag.clone()), None, None, false); + let exec1 = state.queue_template_node(&action1.id, None).expect("queue"); + + let mut action_results = HashMap::new(); + action_results.insert(exec1.node_id, Value::Number(10.into())); + let mut executor = RunnerExecutor::new(dag.clone(), state, action_results, None); + + let step = executor.increment(&[exec1.node_id]).expect("increment"); + assert_eq!(step.actions.len(), 1); + assert_eq!( + step.actions[0].template_id.as_deref(), + Some(action2.id.as_str()) + ); + + let (nodes_snap, edges_snap, results_snap) = + snapshot_state(executor.state(), executor.action_results()); + let rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); + compare_executor_states(&executor, &rehydrated); + + let assign_nodes: Vec<_> = rehydrated + .state() + .nodes + .values() + .filter(|node| node.template_id.as_deref() == Some(&assign.id)) + .collect(); + assert_eq!(assign_nodes.len(), 1); + assert_eq!(assign_nodes[0].status, NodeStatus::Completed); + assert!(assign_nodes[0].assignments.contains_key("y")); + } + + #[test] + fn test_rehydrate_preserves_action_kwargs() { + let mut dag = DAG::default(); + let action1 = action_node( + "action1", + "compute", + 
HashMap::from([ + ("a".to_string(), literal_int(5)), + ( + "b".to_string(), + ir::Expr { + kind: Some(ir::expr::Kind::Literal(ir::Literal { + value: Some(ir::literal::Value::StringValue("test".to_string())), + })), + span: None, + }, + ), + ]), + vec!["result".to_string()], + ActionNodeOptions::default(), + ); + + dag.add_node(waymark_dag::DAGNode::ActionCall(action1.clone())); + let dag = Arc::new(dag); + let mut state = RunnerState::new(Some(dag.clone()), None, None, false); + let exec1 = state.queue_template_node(&action1.id, None).expect("queue"); + let executor = RunnerExecutor::new(dag.clone(), state, HashMap::new(), None); + + let (nodes_snap, edges_snap, results_snap) = + snapshot_state(executor.state(), executor.action_results()); + let rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); + + let orig_node = executor.state().nodes.get(&exec1.node_id).unwrap(); + let rehy_node = rehydrated.state().nodes.get(&exec1.node_id).unwrap(); + assert!(orig_node.action.is_some()); + assert!(rehy_node.action.is_some()); + let orig_action = orig_node.action.as_ref().unwrap(); + let rehy_action = rehy_node.action.as_ref().unwrap(); + assert_eq!(orig_action.action_name, rehy_action.action_name); + let orig_keys: HashSet<_> = orig_action.kwargs.keys().cloned().collect(); + let rehy_keys: HashSet<_> = rehy_action.kwargs.keys().cloned().collect(); + assert_eq!(orig_keys, rehy_keys); + } + + #[test] + fn test_rehydrate_increments_from_same_position() { + let mut dag = DAG::default(); + let action1 = action_node( + "action1", + "first", + HashMap::new(), + vec!["x".to_string()], + ActionNodeOptions::default(), + ); + let action2 = action_node( + "action2", + "second", + HashMap::new(), + vec!["y".to_string()], + ActionNodeOptions::default(), + ); + dag.add_node(waymark_dag::DAGNode::ActionCall(action1.clone())); + dag.add_node(waymark_dag::DAGNode::ActionCall(action2.clone())); + dag.add_edge(DAGEdge::state_machine( + action1.id.clone(), + 
action2.id.clone(), + )); + + let dag = Arc::new(dag); + let mut state = RunnerState::new(Some(dag.clone()), None, None, false); + let exec1 = state.queue_template_node(&action1.id, None).expect("queue"); + + let mut action_results = HashMap::new(); + action_results.insert(exec1.node_id, Value::Number(100.into())); + let mut executor = RunnerExecutor::new(dag.clone(), state, action_results, None); + + let (nodes_snap, edges_snap, results_snap) = + snapshot_state(executor.state(), executor.action_results()); + let mut rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); + + let orig_step = executor.increment(&[exec1.node_id]).expect("increment"); + let rehy_step = rehydrated.increment(&[exec1.node_id]).expect("increment"); + assert_eq!(orig_step.actions.len(), rehy_step.actions.len()); + assert_eq!( + orig_step.actions[0].template_id, + rehy_step.actions[0].template_id + ); + } + + #[test] + fn test_rehydrate_resume_marks_running_as_retryable() { + let mut dag = DAG::default(); + let action1 = action_node( + "action1", + "work", + HashMap::new(), + vec!["x".to_string()], + ActionNodeOptions { + policies: vec![ir::PolicyBracket { + kind: Some(ir::policy_bracket::Kind::Retry(ir::RetryPolicy { + max_retries: 3, + backoff: None, + exception_types: vec!["ExecutorResume".to_string()], + })), + }], + ..ActionNodeOptions::default() + }, + ); + dag.add_node(waymark_dag::DAGNode::ActionCall(action1.clone())); + + let dag = Arc::new(dag); + let mut state = RunnerState::new(Some(dag.clone()), None, None, false); + let exec1 = state.queue_template_node(&action1.id, None).expect("queue"); + state.mark_running(exec1.node_id).expect("mark running"); + + let executor = RunnerExecutor::new(dag.clone(), state, HashMap::new(), None); + let (nodes_snap, edges_snap, results_snap) = + snapshot_state(executor.state(), executor.action_results()); + let mut rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); + + assert_eq!( + 
rehydrated.state().nodes.get(&exec1.node_id).unwrap().status, + NodeStatus::Running + ); + + let step = rehydrated.resume().expect("resume"); + assert_eq!(step.actions.len(), 1); + assert_eq!(step.actions[0].node_id, exec1.node_id); + let node = rehydrated.state().nodes.get(&exec1.node_id).unwrap(); + assert_eq!(node.status, NodeStatus::Running); + assert_eq!(node.action_attempt, 2); + assert!(node.started_at.is_some()); + } + + #[test] + fn test_increment_records_failed_action_attempt() { + let mut dag = DAG::default(); + let action = action_node( + "action1", + "work", + HashMap::new(), + vec!["x".to_string()], + ActionNodeOptions::default(), + ); + dag.add_node(waymark_dag::DAGNode::ActionCall(action.clone())); + + let dag = Arc::new(dag); + let mut state = RunnerState::new(Some(dag.clone()), None, None, false); + let exec = state.queue_template_node(&action.id, None).expect("queue"); + + let mut executor = RunnerExecutor::new( + dag, + state, + HashMap::new(), + Some(Arc::new(MemoryBackend::new())), + ); + executor.set_instance_id(Uuid::new_v4()); + executor.set_action_result( + exec.node_id, + serde_json::json!({"type": "ValueError", "message": "boom"}), + ); + + let step = executor.increment(&[exec.node_id]).expect("increment"); + let updates = step.updates.expect("durable updates"); + assert_eq!(updates.actions_done.len(), 1); + assert_eq!(updates.actions_done[0].execution_id, exec.node_id); + assert_eq!(updates.actions_done[0].attempt, 1); + assert_eq!( + updates.actions_done[0] + .result + .get("type") + .and_then(Value::as_str), + Some("ValueError") + ); + assert_eq!( + executor + .state() + .nodes + .get(&exec.node_id) + .map(|n| n.status.clone()), + Some(NodeStatus::Failed) + ); + } + + #[test] + fn test_increment_records_failed_attempt_before_retry() { + let mut dag = DAG::default(); + let action = action_node( + "action1", + "work", + HashMap::new(), + vec!["x".to_string()], + ActionNodeOptions { + policies: vec![ir::PolicyBracket { + kind: 
Some(ir::policy_bracket::Kind::Retry(ir::RetryPolicy { + max_retries: 2, + backoff: None, + exception_types: Vec::new(), + })), + }], + ..ActionNodeOptions::default() + }, + ); + dag.add_node(waymark_dag::DAGNode::ActionCall(action.clone())); + + let dag = Arc::new(dag); + let mut state = RunnerState::new(Some(dag.clone()), None, None, false); + let exec = state.queue_template_node(&action.id, None).expect("queue"); + + let mut executor = RunnerExecutor::new( + dag, + state, + HashMap::new(), + Some(Arc::new(MemoryBackend::new())), + ); + executor.set_instance_id(Uuid::new_v4()); + executor.set_action_result( + exec.node_id, + serde_json::json!({"type": "ValueError", "message": "retry me"}), + ); + + let first_step = executor + .increment(&[exec.node_id]) + .expect("first increment"); + assert_eq!(first_step.actions.len(), 1); + assert_eq!(first_step.actions[0].node_id, exec.node_id); + let first_updates = first_step.updates.expect("first durable updates"); + assert_eq!(first_updates.actions_done.len(), 1); + assert_eq!(first_updates.actions_done[0].attempt, 1); + assert_eq!( + executor + .state() + .nodes + .get(&exec.node_id) + .map(|n| n.status.clone()), + Some(NodeStatus::Running) + ); + assert_eq!( + executor + .state() + .nodes + .get(&exec.node_id) + .map(|n| n.action_attempt), + Some(2) + ); + + executor.set_action_result(exec.node_id, Value::String("ok".to_string())); + let second_step = executor + .increment(&[exec.node_id]) + .expect("second increment"); + let second_updates = second_step.updates.expect("second durable updates"); + assert_eq!(second_updates.actions_done.len(), 1); + assert_eq!(second_updates.actions_done[0].attempt, 2); + assert_eq!( + executor + .state() + .nodes + .get(&exec.node_id) + .map(|n| n.status.clone()), + Some(NodeStatus::Completed) + ); + } + + #[test] + fn test_rehydrate_replay_variables_consistent() { + let mut dag = DAG::default(); + let action1 = action_node( + "action1", + "fetch", + HashMap::new(), + 
vec!["x".to_string()], + ActionNodeOptions::default(), + ); + let assign = assignment_node( + "assign", + vec!["doubled".to_string()], + binary( + variable("x"), + ir::BinaryOperator::BinaryOpMul, + literal_int(2), + ), + ); + + dag.add_node(waymark_dag::DAGNode::ActionCall(action1.clone())); + dag.add_node(waymark_dag::DAGNode::Assignment(assign.clone())); + dag.add_edge(DAGEdge::state_machine( + action1.id.clone(), + assign.id.clone(), + )); + + let dag = Arc::new(dag); + let mut state = RunnerState::new(Some(dag.clone()), None, None, false); + let exec1 = state.queue_template_node(&action1.id, None).expect("queue"); + + let mut action_results = HashMap::new(); + action_results.insert(exec1.node_id, Value::Number(21.into())); + let mut executor = RunnerExecutor::new(dag.clone(), state, action_results, None); + executor.increment(&[exec1.node_id]).expect("increment"); + + let orig_replay = + crate::replay_variables(executor.state(), executor.action_results()).expect("replay"); + + let (nodes_snap, edges_snap, results_snap) = + snapshot_state(executor.state(), executor.action_results()); + let rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); + + let rehy_replay = crate::replay_variables(rehydrated.state(), rehydrated.action_results()) + .expect("replay"); + assert_eq!(orig_replay.variables, rehy_replay.variables); + assert_eq!( + rehy_replay.variables.get("doubled"), + Some(&Value::Number(42.into())) + ); + } + + #[test] + fn test_rehydrate_completion_equivalent_across_ir_scenarios() { + let (linear_dag, linear_executor) = setup_linear_assignment_checkpoint(); + RehydrateBranchHarness::new(linear_dag, linear_executor, completion_action_result) + .run_and_assert(); + + let (sleep_dag, sleep_executor) = setup_sleep_resume_checkpoint(); + RehydrateBranchHarness::new(sleep_dag, sleep_executor, completion_action_result) + .run_and_assert(); + + let (spread_dag, spread_executor) = setup_spread_checkpoint(); + 
RehydrateBranchHarness::new(spread_dag, spread_executor, completion_action_result) + .run_and_assert(); + } + + #[test] + fn test_rehydrate_spread_action_with_aggregator() { + let mut dag = DAG::default(); + let initial_action = action_node( + "initial", + "get_items", + HashMap::new(), + vec!["items".to_string()], + ActionNodeOptions::default(), + ); + let spread_action = action_node( + "spread_action", + "process_item", + HashMap::from([("item".to_string(), variable("item"))]), + vec!["item_result".to_string()], + ActionNodeOptions { + spread_loop_var: Some("item".to_string()), + spread_collection_expr: Some(variable("items")), + aggregates_to: Some("aggregator".to_string()), + ..ActionNodeOptions::default() + }, + ); + let aggregator = + aggregator_node("aggregator", "spread_action", vec!["results".to_string()]); + + dag.add_node(waymark_dag::DAGNode::ActionCall(initial_action.clone())); + dag.add_node(waymark_dag::DAGNode::ActionCall(spread_action.clone())); + dag.add_node(waymark_dag::DAGNode::Aggregator(aggregator.clone())); + dag.add_edge(DAGEdge::state_machine( + initial_action.id.clone(), + spread_action.id.clone(), + )); + dag.add_edge(DAGEdge::state_machine( + spread_action.id.clone(), + aggregator.id.clone(), + )); + + let dag = Arc::new(dag); + let mut state = RunnerState::new(Some(dag.clone()), None, None, false); + let initial_exec = state + .queue_template_node(&initial_action.id, None) + .expect("queue"); + + let mut action_results = HashMap::new(); + action_results.insert( + initial_exec.node_id, + Value::Array(vec![1.into(), 2.into(), 3.into()]), + ); + let mut executor = RunnerExecutor::new(dag.clone(), state, action_results, None); + + let step1 = executor + .increment(&[initial_exec.node_id]) + .expect("increment"); + assert_eq!(step1.actions.len(), 3); + + let (nodes_snap, edges_snap, results_snap) = + snapshot_state(executor.state(), executor.action_results()); + let rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, 
results_snap); + + compare_executor_states(&executor, &rehydrated); + let action_nodes: Vec<_> = executor + .state() + .nodes + .values() + .filter(|node| { + node.is_action_call() && node.template_id.as_deref() == Some(&spread_action.id) + }) + .collect(); + assert_eq!(action_nodes.len(), 3); + for action_node in action_nodes { + let rehy_node = rehydrated.state().nodes.get(&action_node.node_id).unwrap(); + assert_eq!(rehy_node.node_type, action_node.node_type); + assert_eq!(rehy_node.status, action_node.status); + } + } + + #[test] + fn test_rehydrate_full_spread_execution() { + let mut dag = DAG::default(); + let initial_action = action_node( + "initial", + "get_items", + HashMap::new(), + vec!["items".to_string()], + ActionNodeOptions::default(), + ); + let spread_action = action_node( + "spread_action", + "double", + HashMap::from([("value".to_string(), variable("item"))]), + vec!["item_result".to_string()], + ActionNodeOptions { + spread_loop_var: Some("item".to_string()), + spread_collection_expr: Some(variable("items")), + aggregates_to: Some("aggregator".to_string()), + ..ActionNodeOptions::default() + }, + ); + let aggregator = + aggregator_node("aggregator", "spread_action", vec!["results".to_string()]); + + dag.add_node(waymark_dag::DAGNode::ActionCall(initial_action.clone())); + dag.add_node(waymark_dag::DAGNode::ActionCall(spread_action.clone())); + dag.add_node(waymark_dag::DAGNode::Aggregator(aggregator.clone())); + dag.add_edge(DAGEdge::state_machine( + initial_action.id.clone(), + spread_action.id.clone(), + )); + dag.add_edge(DAGEdge::state_machine( + spread_action.id.clone(), + aggregator.id.clone(), + )); + + let dag = Arc::new(dag); + let mut state = RunnerState::new(Some(dag.clone()), None, None, false); + let initial_exec = state + .queue_template_node(&initial_action.id, None) + .expect("queue"); + + let mut action_results = HashMap::new(); + action_results.insert( + initial_exec.node_id, + Value::Array(vec![10.into(), 20.into()]), + ); + 
let mut executor = RunnerExecutor::new(dag.clone(), state, action_results.clone(), None); + + let step1 = executor + .increment(&[initial_exec.node_id]) + .expect("increment"); + let spread_nodes = step1.actions; + assert_eq!(spread_nodes.len(), 2); + + let (nodes_snap, edges_snap, results_snap) = + snapshot_state(executor.state(), executor.action_results()); + let rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); + compare_executor_states(&executor, &rehydrated); + + for (idx, node) in spread_nodes.iter().enumerate() { + executor.set_action_result(node.node_id, Value::Number(((idx + 1) * 100).into())); + } + + let _step2 = executor + .increment(&spread_nodes.iter().map(|n| n.node_id).collect::>()) + .expect("increment"); + + let (nodes_snap, edges_snap, results_snap) = + snapshot_state(executor.state(), executor.action_results()); + let rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); + compare_executor_states(&executor, &rehydrated); + + let agg_nodes: Vec<_> = rehydrated + .state() + .nodes + .values() + .filter(|node| node.template_id.as_deref() == Some(&aggregator.id)) + .collect(); + assert_eq!(agg_nodes.len(), 1); + assert_eq!(agg_nodes[0].status, NodeStatus::Completed); + assert!(agg_nodes[0].assignments.contains_key("results")); + } + + #[test] + fn test_rehydrate_timeline_ordering_preserved() { + let mut dag = DAG::default(); + let mut actions = Vec::new(); + for i in 0..4 { + actions.push(action_node( + &format!("action{i}"), + &format!("step{i}"), + HashMap::new(), + vec![format!("x{i}")], + ActionNodeOptions::default(), + )); + } + for action in &actions { + dag.add_node(waymark_dag::DAGNode::ActionCall(action.clone())); + } + for i in 0..actions.len() - 1 { + dag.add_edge(DAGEdge::state_machine( + actions[i].id.clone(), + actions[i + 1].id.clone(), + )); + } + + let dag = Arc::new(dag); + let mut state = RunnerState::new(Some(dag.clone()), None, None, false); + let mut exec_nodes: 
Vec = Vec::new(); + exec_nodes.push( + state + .queue_template_node(&actions[0].id, None) + .expect("queue"), + ); + let mut executor = RunnerExecutor::new(dag.clone(), state, HashMap::new(), None); + + for i in 0..3 { + executor.set_action_result( + exec_nodes.last().unwrap().node_id, + Value::Number((i * 10).into()), + ); + let step = executor + .increment(&[exec_nodes.last().unwrap().node_id]) + .expect("increment"); + if !step.actions.is_empty() { + exec_nodes.push(step.actions[0].clone()); + } + } + + let (nodes_snap, edges_snap, results_snap) = + snapshot_state(executor.state(), executor.action_results()); + let rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); + + let orig_timeline = executor.state().timeline.clone(); + let rehy_timeline = rehydrated.state().timeline.clone(); + assert_eq!(orig_timeline.len(), rehy_timeline.len()); + assert_eq!( + orig_timeline.iter().collect::>(), + rehy_timeline.iter().collect::>() + ); + } + + #[test] + fn test_rehydrate_ready_queue_rebuilt_for_running_actions() { + let mut dag = DAG::default(); + let action1 = action_node( + "action1", + "first", + HashMap::new(), + vec!["x".to_string()], + ActionNodeOptions::default(), + ); + let action2 = action_node( + "action2", + "second", + HashMap::new(), + vec!["y".to_string()], + ActionNodeOptions::default(), + ); + + dag.add_node(waymark_dag::DAGNode::ActionCall(action1.clone())); + dag.add_node(waymark_dag::DAGNode::ActionCall(action2.clone())); + dag.add_edge(DAGEdge::state_machine( + action1.id.clone(), + action2.id.clone(), + )); + + let dag = Arc::new(dag); + let mut state = RunnerState::new(Some(dag.clone()), None, None, false); + let exec1 = state.queue_template_node(&action1.id, None).expect("queue"); + + let mut action_results = HashMap::new(); + action_results.insert(exec1.node_id, Value::Number(50.into())); + let mut executor = RunnerExecutor::new(dag.clone(), state, action_results, None); + let step = 
executor.increment(&[exec1.node_id]).expect("increment"); + let exec2 = step.actions[0].clone(); + + let (nodes_snap, edges_snap, results_snap) = + snapshot_state(executor.state(), executor.action_results()); + let rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); + + let queued_nodes: Vec<_> = rehydrated + .state() + .nodes + .values() + .filter(|node| node.status == NodeStatus::Queued) + .collect(); + assert!(queued_nodes.is_empty()); + let running_nodes: Vec<_> = rehydrated + .state() + .nodes + .values() + .filter(|node| node.status == NodeStatus::Running) + .collect(); + assert_eq!(running_nodes.len(), 1); + assert_eq!(running_nodes[0].node_id, exec2.node_id); + assert!( + rehydrated.state().ready_queue.is_empty(), + "rehydration should not requeue running action nodes" + ); + } +} diff --git a/crates/runner/src/expression_evaluator.rs b/crates/runner/src/expression_evaluator.rs new file mode 100644 index 00000000..dac989a9 --- /dev/null +++ b/crates/runner/src/expression_evaluator.rs @@ -0,0 +1,1056 @@ +use std::cell::RefCell; +use std::collections::{HashMap, HashSet}; +use std::rc::Rc; + +use serde_json::Value; +use uuid::Uuid; + +use waymark_dag::{DAGEdge, EdgeType}; +use waymark_observability::obs; +use waymark_proto::ast as ir; +use waymark_runner_state::{ + ActionCallSpec, ActionResultValue, BinaryOpValue, DictEntryValue, DictValue, DotValue, + FunctionCallValue, IndexValue, ListValue, LiteralValue, UnaryOpValue, VariableValue, + literal_value, + value_visitor::{ValueExpr, ValueExprEvaluator}, +}; + +use super::{RunnerExecutor, RunnerExecutorError}; + +impl RunnerExecutor { + /// Convert a pure IR expression into a ValueExpr without side effects. 
+ pub(super) fn expr_to_value(expr: &ir::Expr) -> Result { + match expr.kind.as_ref() { + Some(ir::expr::Kind::Literal(lit)) => Ok(ValueExpr::Literal(LiteralValue { + value: literal_value(lit), + })), + Some(ir::expr::Kind::Variable(var)) => Ok(ValueExpr::Variable(VariableValue { + name: var.name.clone(), + })), + Some(ir::expr::Kind::BinaryOp(op)) => { + let left = op + .left + .as_ref() + .ok_or_else(|| RunnerExecutorError("binary op missing left".to_string()))?; + let right = op + .right + .as_ref() + .ok_or_else(|| RunnerExecutorError("binary op missing right".to_string()))?; + Ok(ValueExpr::BinaryOp(BinaryOpValue { + left: Box::new(Self::expr_to_value(left)?), + op: op.op, + right: Box::new(Self::expr_to_value(right)?), + })) + } + Some(ir::expr::Kind::UnaryOp(op)) => { + let operand = op + .operand + .as_ref() + .ok_or_else(|| RunnerExecutorError("unary op missing operand".to_string()))?; + Ok(ValueExpr::UnaryOp(UnaryOpValue { + op: op.op, + operand: Box::new(Self::expr_to_value(operand)?), + })) + } + Some(ir::expr::Kind::List(list)) => { + let mut elements = Vec::new(); + for item in &list.elements { + elements.push(Self::expr_to_value(item)?); + } + Ok(ValueExpr::List(ListValue { elements })) + } + Some(ir::expr::Kind::Dict(dict_expr)) => { + let mut entries = Vec::new(); + for entry in &dict_expr.entries { + let key = entry + .key + .as_ref() + .ok_or_else(|| RunnerExecutorError("dict entry missing key".to_string()))?; + let value = entry.value.as_ref().ok_or_else(|| { + RunnerExecutorError("dict entry missing value".to_string()) + })?; + entries.push(DictEntryValue { + key: Self::expr_to_value(key)?, + value: Self::expr_to_value(value)?, + }); + } + Ok(ValueExpr::Dict(DictValue { entries })) + } + Some(ir::expr::Kind::Index(index)) => { + let object = index.object.as_ref().ok_or_else(|| { + RunnerExecutorError("index access missing object".to_string()) + })?; + let index_expr = index + .index + .as_ref() + .ok_or_else(|| RunnerExecutorError("index access 
missing index".to_string()))?; + Ok(ValueExpr::Index(IndexValue { + object: Box::new(Self::expr_to_value(object)?), + index: Box::new(Self::expr_to_value(index_expr)?), + })) + } + Some(ir::expr::Kind::Dot(dot)) => { + let object = dot + .object + .as_ref() + .ok_or_else(|| RunnerExecutorError("dot access missing object".to_string()))?; + Ok(ValueExpr::Dot(DotValue { + object: Box::new(Self::expr_to_value(object)?), + attribute: dot.attribute.clone(), + })) + } + Some(ir::expr::Kind::FunctionCall(call)) => { + let mut args = Vec::new(); + for arg in &call.args { + args.push(Self::expr_to_value(arg)?); + } + let mut kwargs = HashMap::new(); + for kw in &call.kwargs { + if let Some(value) = &kw.value { + kwargs.insert(kw.name.clone(), Self::expr_to_value(value)?); + } + } + let global_fn = if call.global_function != 0 { + Some(call.global_function) + } else { + None + }; + Ok(ValueExpr::FunctionCall(FunctionCallValue { + name: call.name.clone(), + args, + kwargs, + global_function: global_fn, + })) + } + Some( + ir::expr::Kind::ActionCall(_) + | ir::expr::Kind::ParallelExpr(_) + | ir::expr::Kind::SpreadExpr(_), + ) => Err(RunnerExecutorError( + "action/spread calls not allowed in guard expressions".to_string(), + )), + None => Ok(ValueExpr::Literal(LiteralValue { value: Value::Null })), + } + } + + /// Evaluate a guard expression using current symbolic assignments. + pub(super) fn evaluate_guard( + &self, + expr: Option<&ir::Expr>, + ) -> Result { + let expr = match expr { + Some(expr) => expr, + None => return Ok(false), + }; + let value_expr = self.state().materialize_value(Self::expr_to_value(expr)?); + let result = self.evaluate_value_expr(&value_expr)?; + Ok(is_truthy(&result)) + } + + /// Resolve an action's symbolic kwargs to concrete Python values. + /// + /// Example: + /// - spec.kwargs={"value": VariableValue("x")} + /// - with x assigned to LiteralValue(10), returns {"value": 10}. 
+ #[obs] + pub fn resolve_action_kwargs( + &self, + node_id: Uuid, + action: &ActionCallSpec, + ) -> Result, RunnerExecutorError> { + let mut resolved = HashMap::new(); + for (name, expr) in &action.kwargs { + resolved.insert( + name.clone(), + self.evaluate_value_expr_for_node(expr, Some(node_id))?, + ); + } + Ok(resolved) + } + + /// Evaluate a ValueExpr into a concrete Python value. + #[obs] + pub(super) fn evaluate_value_expr( + &self, + expr: &ValueExpr, + ) -> Result { + self.evaluate_value_expr_for_node(expr, None) + } + + fn evaluate_value_expr_for_node( + &self, + expr: &ValueExpr, + current_node_id: Option, + ) -> Result { + let stack = Rc::new(RefCell::new(HashSet::new())); + let resolve_variable = { + let stack = stack.clone(); + let this = self; + move |name: &str| { + this.evaluate_variable_with_context(current_node_id, name, stack.clone()) + } + }; + let resolve_action_result = { + let this = self; + move |value: &ActionResultValue| this.resolve_action_result(value) + }; + let resolve_function_call = { + let this = self; + move |value: &FunctionCallValue, args, kwargs| { + this.evaluate_function_call(value, args, kwargs) + } + }; + let apply_binary = |op, left, right| Self::apply_binary(op, left, right); + let apply_unary = |op, operand| Self::apply_unary(op, operand); + let error_factory = |message: &str| RunnerExecutorError(message.to_string()); + let evaluator = ValueExprEvaluator::new( + &resolve_variable, + &resolve_action_result, + &resolve_function_call, + &apply_binary, + &apply_unary, + &error_factory, + ); + evaluator.visit(expr) + } + + fn find_variable_source_node(&self, current_node_id: Uuid, name: &str) -> Option { + let timeline_index: HashMap = self + .state() + .timeline + .iter() + .enumerate() + .map(|(idx, node_id)| (*node_id, idx)) + .collect(); + + self.state() + .edges + .iter() + .filter(|edge| edge.edge_type == EdgeType::DataFlow && edge.target == current_node_id) + .map(|edge| edge.source) + .filter(|source| { + self.state() 
+ .nodes + .get(source) + .map(|node| node.assignments.contains_key(name)) + .unwrap_or(false) + }) + .max_by_key(|source| timeline_index.get(source).copied().unwrap_or(0)) + } + + fn evaluate_variable_with_context( + &self, + current_node_id: Option, + name: &str, + stack: Rc>>, + ) -> Result { + let node_id = current_node_id + .and_then(|node_id| self.find_variable_source_node(node_id, name)) + .or_else(|| self.state().latest_assignment(name)) + .ok_or_else(|| RunnerExecutorError(format!("variable not found: {name}")))?; + self.evaluate_assignment(node_id, name, stack) + } + + pub(super) fn evaluate_assignment( + &self, + node_id: Uuid, + target: &str, + stack: Rc>>, + ) -> Result { + let key = (node_id, target.to_string()); + if let Some(value) = self.eval_cache_get(&key) { + return Ok(value); + } + if stack.borrow().contains(&key) { + return Err(RunnerExecutorError(format!( + "recursive assignment detected for {target}" + ))); + } + + let node = self + .state() + .nodes + .get(&node_id) + .ok_or_else(|| RunnerExecutorError(format!("missing assignment for {target}")))?; + let expr = node + .assignments + .get(target) + .ok_or_else(|| RunnerExecutorError(format!("missing assignment for {target}")))?; + + stack.borrow_mut().insert(key.clone()); + let resolve_variable = { + let stack = stack.clone(); + let this = self; + move |name: &str| { + this.evaluate_variable_with_context(Some(node_id), name, stack.clone()) + } + }; + let resolve_action_result = { + let this = self; + move |value: &ActionResultValue| this.resolve_action_result(value) + }; + let resolve_function_call = { + let this = self; + move |value: &FunctionCallValue, args, kwargs| { + this.evaluate_function_call(value, args, kwargs) + } + }; + let apply_binary = |op, left, right| Self::apply_binary(op, left, right); + let apply_unary = |op, operand| Self::apply_unary(op, operand); + let error_factory = |message: &str| RunnerExecutorError(message.to_string()); + let evaluator = ValueExprEvaluator::new( + 
&resolve_variable, + &resolve_action_result, + &resolve_function_call, + &apply_binary, + &apply_unary, + &error_factory, + ); + let value = evaluator.visit(expr)?; + stack.borrow_mut().remove(&key); + self.eval_cache_insert(key, value.clone()); + Ok(value) + } + + pub(super) fn resolve_action_result( + &self, + expr: &ActionResultValue, + ) -> Result { + let value = self + .action_results() + .get(&expr.node_id) + .cloned() + .ok_or_else(|| { + RunnerExecutorError(format!("missing action result for {}", expr.node_id)) + })?; + if let Some(idx) = expr.result_index { + if let Value::Array(items) = value { + let idx = idx as usize; + return items.get(idx).cloned().ok_or_else(|| { + RunnerExecutorError(format!( + "action result for {} has no index {}", + expr.node_id, idx + )) + }); + } + return Err(RunnerExecutorError(format!( + "action result for {} has no index {}", + expr.node_id, idx + ))); + } + Ok(value) + } + + pub(super) fn evaluate_function_call( + &self, + expr: &FunctionCallValue, + args: Vec, + kwargs: HashMap, + ) -> Result { + if let Some(global_fn) = expr.global_function + && global_fn != ir::GlobalFunction::Unspecified as i32 + { + return self.evaluate_global_function(global_fn, args, kwargs); + } + Err(RunnerExecutorError(format!( + "cannot evaluate non-global function call: {}", + expr.name + ))) + } + + pub(super) fn evaluate_global_function( + &self, + global_function: i32, + args: Vec, + kwargs: HashMap, + ) -> Result { + let error = executor_error; + match ir::GlobalFunction::try_from(global_function).ok() { + Some(ir::GlobalFunction::Range) => Ok(range_from_args(&args).into()), + Some(ir::GlobalFunction::Len) => { + if let Some(first) = args.first() { + return Ok(Value::Number(len_of_value(first, error)?)); + } + if let Some(items) = kwargs.get("items") { + return Ok(Value::Number(len_of_value(items, error)?)); + } + Err(RunnerExecutorError("len() missing argument".to_string())) + } + Some(ir::GlobalFunction::Enumerate) => { + let items = if 
let Some(first) = args.first() { + first.clone() + } else if let Some(items) = kwargs.get("items") { + items.clone() + } else { + return Err(RunnerExecutorError( + "enumerate() missing argument".to_string(), + )); + }; + let list = match items { + Value::Array(items) => items, + _ => return Err(RunnerExecutorError("enumerate() expects list".to_string())), + }; + let pairs: Vec = list + .into_iter() + .enumerate() + .map(|(idx, item)| Value::Array(vec![Value::Number((idx as i64).into()), item])) + .collect(); + Ok(Value::Array(pairs)) + } + Some(ir::GlobalFunction::Isexception) => { + if let Some(first) = args.first() { + return Ok(Value::Bool(is_exception_value(first))); + } + if let Some(value) = kwargs.get("value") { + return Ok(Value::Bool(is_exception_value(value))); + } + Err(RunnerExecutorError( + "isexception() missing argument".to_string(), + )) + } + Some(ir::GlobalFunction::Unspecified) | None => Err(RunnerExecutorError( + "global function unspecified".to_string(), + )), + } + } + + pub(super) fn apply_binary( + op: i32, + left: Value, + right: Value, + ) -> Result { + let error = executor_error; + match ir::BinaryOperator::try_from(op).ok() { + Some(ir::BinaryOperator::BinaryOpOr) => { + if is_truthy(&left) { + Ok(left) + } else { + Ok(right) + } + } + Some(ir::BinaryOperator::BinaryOpAnd) => { + if is_truthy(&left) { + Ok(right) + } else { + Ok(left) + } + } + Some(ir::BinaryOperator::BinaryOpEq) => Ok(Value::Bool(left == right)), + Some(ir::BinaryOperator::BinaryOpNe) => Ok(Value::Bool(left != right)), + Some(ir::BinaryOperator::BinaryOpLt) => { + compare_values(left, right, |a, b| a < b, error) + } + Some(ir::BinaryOperator::BinaryOpLe) => { + compare_values(left, right, |a, b| a <= b, error) + } + Some(ir::BinaryOperator::BinaryOpGt) => { + compare_values(left, right, |a, b| a > b, error) + } + Some(ir::BinaryOperator::BinaryOpGe) => { + compare_values(left, right, |a, b| a >= b, error) + } + Some(ir::BinaryOperator::BinaryOpIn) => 
Ok(Value::Bool(value_in(&left, &right))), + Some(ir::BinaryOperator::BinaryOpNotIn) => Ok(Value::Bool(!value_in(&left, &right))), + Some(ir::BinaryOperator::BinaryOpAdd) => add_values(left, right, error), + Some(ir::BinaryOperator::BinaryOpSub) => { + numeric_op(left, right, |a, b| a - b, true, error) + } + Some(ir::BinaryOperator::BinaryOpMul) => { + numeric_op(left, right, |a, b| a * b, true, error) + } + Some(ir::BinaryOperator::BinaryOpDiv) => { + numeric_op(left, right, |a, b| a / b, false, error) + } + Some(ir::BinaryOperator::BinaryOpFloorDiv) => { + numeric_op(left, right, |a, b| (a / b).floor(), true, error) + } + Some(ir::BinaryOperator::BinaryOpMod) => { + numeric_op(left, right, |a, b| a % b, true, error) + } + Some(ir::BinaryOperator::BinaryOpUnspecified) | None => Err(RunnerExecutorError( + "binary operator unspecified".to_string(), + )), + } + } + + pub(super) fn apply_unary(op: i32, operand: Value) -> Result { + match ir::UnaryOperator::try_from(op).ok() { + Some(ir::UnaryOperator::UnaryOpNeg) => { + if let Some(value) = int_value(&operand) { + return Ok(Value::Number((-value).into())); + } + match operand.as_f64() { + Some(value) => Ok(Value::Number( + serde_json::Number::from_f64(-value) + .unwrap_or_else(|| serde_json::Number::from(0)), + )), + None => Err(RunnerExecutorError("unary neg expects number".to_string())), + } + } + Some(ir::UnaryOperator::UnaryOpNot) => Ok(Value::Bool(!is_truthy(&operand))), + Some(ir::UnaryOperator::UnaryOpUnspecified) | None => Err(RunnerExecutorError( + "unary operator unspecified".to_string(), + )), + } + } + + pub(super) fn exception_matches(&self, edge: &DAGEdge, exception_value: &Value) -> bool { + let exception_types = match &edge.exception_types { + Some(types) => types, + None => return false, + }; + if exception_types.is_empty() { + return true; + } + let exc_name = match exception_value { + Value::Object(map) => map + .get("type") + .and_then(|value| value.as_str()) + .map(|value| value.to_string()), + _ 
=> None, + }; + if let Some(name) = exc_name { + return exception_types.iter().any(|value| value == &name); + } + false + } +} + +fn executor_error(message: &'static str) -> RunnerExecutorError { + RunnerExecutorError(message.to_string()) +} + +pub(crate) fn int_value(value: &Value) -> Option { + value + .as_i64() + .or_else(|| value.as_u64().and_then(|value| i64::try_from(value).ok())) +} + +pub(crate) fn numeric_op( + left: Value, + right: Value, + op: impl Fn(f64, f64) -> f64, + prefer_int: bool, + error: fn(&'static str) -> E, +) -> Result { + let left_num = left + .as_f64() + .ok_or_else(|| error("numeric operation expects number"))?; + let right_num = right + .as_f64() + .ok_or_else(|| error("numeric operation expects number"))?; + let result = op(left_num, right_num); + if prefer_int && int_value(&left).is_some() && int_value(&right).is_some() && result.is_finite() + { + let rounded = result.round(); + if (result - rounded).abs() < 1e-9 + && rounded >= (i64::MIN as f64) + && rounded <= (i64::MAX as f64) + { + return Ok(Value::Number((rounded as i64).into())); + } + } + Ok(Value::Number( + serde_json::Number::from_f64(result).unwrap_or_else(|| serde_json::Number::from(0)), + )) +} + +pub(crate) fn add_values( + left: Value, + right: Value, + error: fn(&'static str) -> E, +) -> Result { + if let (Value::Array(mut left), Value::Array(right)) = (left.clone(), right.clone()) { + left.extend(right); + return Ok(Value::Array(left)); + } + if let (Some(left), Some(right)) = (left.as_str(), right.as_str()) { + return Ok(Value::String(format!("{left}{right}"))); + } + numeric_op(left, right, |a, b| a + b, true, error) +} + +pub(crate) fn compare_values( + left: Value, + right: Value, + op: impl Fn(f64, f64) -> bool, + error: fn(&'static str) -> E, +) -> Result { + let left = left + .as_f64() + .ok_or_else(|| error("comparison expects number"))?; + let right = right + .as_f64() + .ok_or_else(|| error("comparison expects number"))?; + Ok(Value::Bool(op(left, right))) +} 
+ +pub(crate) fn value_in(value: &Value, container: &Value) -> bool { + match container { + Value::Array(items) => items.iter().any(|item| item == value), + Value::Object(map) => value + .as_str() + .map(|key| map.contains_key(key)) + .unwrap_or(false), + Value::String(text) => value + .as_str() + .map(|needle| text.contains(needle)) + .unwrap_or(false), + _ => false, + } +} + +pub(crate) fn is_truthy(value: &Value) -> bool { + match value { + Value::Null => false, + Value::Bool(value) => *value, + Value::Number(number) => number.as_f64().map(|value| value != 0.0).unwrap_or(false), + Value::String(value) => !value.is_empty(), + Value::Array(values) => !values.is_empty(), + Value::Object(map) => !map.is_empty(), + } +} + +pub(crate) fn is_exception_value(value: &Value) -> bool { + if let Value::Object(map) = value { + return map.contains_key("type") && map.contains_key("message"); + } + false +} + +pub(crate) fn len_of_value( + value: &Value, + error: fn(&'static str) -> E, +) -> Result { + let len = match value { + Value::Array(items) => items.len() as i64, + Value::String(text) => text.len() as i64, + Value::Object(map) => map.len() as i64, + _ => return Err(error("len() expects list, string, or dict")), + }; + Ok(len.into()) +} + +pub(crate) fn range_from_args(args: &[Value]) -> Vec { + let mut start = 0i64; + let mut end = 0i64; + let mut step = 1i64; + if args.len() == 1 { + end = args[0].as_i64().unwrap_or(0); + } else if args.len() >= 2 { + start = args[0].as_i64().unwrap_or(0); + end = args[1].as_i64().unwrap_or(0); + if args.len() >= 3 { + step = args[2].as_i64().unwrap_or(1); + } + } + if step == 0 { + return Vec::new(); + } + let mut values = Vec::new(); + if step > 0 { + let mut current = start; + while current < end { + values.push(Value::Number(current.into())); + current += step; + } + } else { + let mut current = start; + while current > end { + values.push(Value::Number(current.into())); + current += step; + } + } + values +} + +#[cfg(test)] +mod 
tests { + use std::cell::RefCell; + use std::collections::{HashMap, HashSet}; + use std::rc::Rc; + use std::sync::Arc; + + use uuid::Uuid; + + use super::*; + use waymark_dag::{DAG, DAGEdge}; + use waymark_ir_parser::IRParser; + use waymark_proto::ast as ir; + use waymark_runner_state::{ + ActionCallSpec, ActionResultValue, BinaryOpValue, FunctionCallValue, LiteralValue, + RunnerState, VariableValue, value_visitor::ValueExpr, + }; + + fn parse_expr(source: &str) -> ir::Expr { + IRParser::new(" ") + .parse_expr(source) + .expect("parse expression") + } + + fn literal_int(value: i64) -> ValueExpr { + ValueExpr::Literal(LiteralValue { + value: Value::Number(value.into()), + }) + } + + fn empty_executor() -> RunnerExecutor { + let dag = Arc::new(DAG::default()); + let state = RunnerState::new(Some(Arc::clone(&dag)), None, None, false); + RunnerExecutor::new(dag, state, HashMap::new(), None) + } + + fn executor_with_assignment(name: &str, value: ValueExpr) -> RunnerExecutor { + let dag = Arc::new(DAG::default()); + let mut state = RunnerState::new(Some(Arc::clone(&dag)), None, None, false); + state + .record_assignment_value( + vec![name.to_string()], + value, + None, + Some("test assignment".to_string()), + ) + .expect("record assignment"); + RunnerExecutor::new(dag, state, HashMap::new(), None) + } + + #[test] + fn test_expr_to_value_happy_path() { + let expr = parse_expr("x + 2"); + let value = RunnerExecutor::expr_to_value(&expr).expect("convert expression"); + match value { + ValueExpr::BinaryOp(binary) => { + assert!(matches!(*binary.left, ValueExpr::Variable(_))); + assert!(matches!(*binary.right, ValueExpr::Literal(_))); + } + other => panic!("expected binary op, got {other:?}"), + } + } + + #[test] + fn test_evaluate_guard_happy_path() { + let executor = executor_with_assignment("x", literal_int(2)); + let guard = parse_expr("x > 1"); + let result = executor + .evaluate_guard(Some(&guard)) + .expect("evaluate guard"); + assert!(result); + } + + #[test] + fn 
test_resolve_action_kwargs_happy_path() { + let executor = executor_with_assignment("x", literal_int(10)); + let action = ActionCallSpec { + action_name: "double".to_string(), + module_name: Some("tests".to_string()), + kwargs: HashMap::from([( + "value".to_string(), + ValueExpr::Variable(VariableValue { + name: "x".to_string(), + }), + )]), + }; + let resolved = executor + .resolve_action_kwargs(Uuid::new_v4(), &action) + .expect("resolve kwargs"); + assert_eq!(resolved.get("value"), Some(&Value::Number(10.into()))); + } + + #[test] + fn test_resolve_action_kwargs_uses_data_flow_for_self_referential_targets() { + let dag = Arc::new(DAG::default()); + let mut state = RunnerState::new(Some(Arc::clone(&dag)), None, None, false); + state + .record_assignment_value( + vec!["current".to_string()], + literal_int(0), + None, + Some("current = 0".to_string()), + ) + .expect("record current"); + let action_result = state + .queue_action( + "increment", + Some(vec!["current".to_string()]), + Some(HashMap::from([( + "value".to_string(), + ValueExpr::Variable(VariableValue { + name: "current".to_string(), + }), + )])), + None, + None, + ) + .expect("queue increment"); + let action_node = state + .nodes + .get(&action_result.node_id) + .expect("action node") + .clone(); + let action_spec = action_node.action.expect("action spec"); + + let executor = RunnerExecutor::new(dag, state, HashMap::new(), None); + let resolved = executor + .resolve_action_kwargs(action_result.node_id, &action_spec) + .expect("resolve kwargs"); + assert_eq!(resolved.get("value"), Some(&Value::Number(0.into()))); + } + + #[test] + fn test_evaluate_value_expr_happy_path() { + let executor = executor_with_assignment("x", literal_int(3)); + let expr = ValueExpr::BinaryOp(waymark_runner_state::BinaryOpValue { + left: Box::new(ValueExpr::Variable(VariableValue { + name: "x".to_string(), + })), + op: ir::BinaryOperator::BinaryOpAdd as i32, + right: Box::new(literal_int(1)), + }); + let value = executor + 
.evaluate_value_expr(&expr) + .expect("evaluate value expression"); + assert_eq!(value, Value::Number(4.into())); + } + + #[test] + fn test_evaluate_variable_happy_path() { + let executor = executor_with_assignment("value", literal_int(5)); + let stack = Rc::new(RefCell::new(HashSet::new())); + let value = executor + .evaluate_variable_with_context(None, "value", stack) + .expect("evaluate variable"); + assert_eq!(value, Value::Number(5.into())); + } + + #[test] + fn test_evaluate_assignment_happy_path() { + let executor = executor_with_assignment("value", literal_int(9)); + let node_id = executor + .state() + .latest_assignment("value") + .expect("latest assignment"); + let stack = Rc::new(RefCell::new(HashSet::new())); + let value = executor + .evaluate_assignment(node_id, "value", stack) + .expect("evaluate assignment"); + assert_eq!(value, Value::Number(9.into())); + } + + #[test] + fn test_evaluate_assignment_uses_data_flow_for_self_referential_updates() { + let dag = Arc::new(DAG::default()); + let mut state = RunnerState::new(Some(Arc::clone(&dag)), None, None, false); + state + .record_assignment_value( + vec!["count".to_string()], + literal_int(0), + None, + Some("count = 0".to_string()), + ) + .expect("record initial count"); + state + .record_assignment_value( + vec!["count".to_string()], + ValueExpr::BinaryOp(BinaryOpValue { + left: Box::new(ValueExpr::Variable(VariableValue { + name: "count".to_string(), + })), + op: ir::BinaryOperator::BinaryOpAdd as i32, + right: Box::new(literal_int(1)), + }), + None, + Some("count = count + 1".to_string()), + ) + .expect("record updated count"); + + let executor = RunnerExecutor::new(dag, state, HashMap::new(), None); + let node_id = executor + .state() + .latest_assignment("count") + .expect("latest assignment"); + let stack = Rc::new(RefCell::new(HashSet::new())); + let value = executor + .evaluate_assignment(node_id, "count", stack) + .expect("evaluate self-referential assignment"); + assert_eq!(value, 
Value::Number(1.into())); + } + + #[test] + fn test_resolve_action_result_happy_path() { + let mut executor = empty_executor(); + let action_id = Uuid::new_v4(); + executor.set_action_result( + action_id, + Value::Array(vec![Value::Number(7.into()), Value::Number(8.into())]), + ); + let result = executor + .resolve_action_result(&ActionResultValue { + node_id: action_id, + action_name: "fetch".to_string(), + iteration_index: None, + result_index: Some(1), + }) + .expect("resolve action result"); + assert_eq!(result, Value::Number(8.into())); + } + + #[test] + fn test_evaluate_function_call_happy_path() { + let executor = empty_executor(); + let value = executor + .evaluate_function_call( + &FunctionCallValue { + name: "len".to_string(), + args: Vec::new(), + kwargs: HashMap::new(), + global_function: Some(ir::GlobalFunction::Len as i32), + }, + vec![Value::Array(vec![Value::Null, Value::Null])], + HashMap::new(), + ) + .expect("evaluate function call"); + assert_eq!(value, Value::Number(2.into())); + } + + #[test] + fn test_evaluate_global_function_happy_path() { + let executor = empty_executor(); + let value = executor + .evaluate_global_function( + ir::GlobalFunction::Range as i32, + vec![Value::Number(1.into()), Value::Number(4.into())], + HashMap::new(), + ) + .expect("evaluate global function"); + assert_eq!( + value, + Value::Array(vec![ + Value::Number(1.into()), + Value::Number(2.into()), + Value::Number(3.into()) + ]) + ); + } + + #[test] + fn test_apply_binary_happy_path() { + let value = RunnerExecutor::apply_binary( + ir::BinaryOperator::BinaryOpAdd as i32, + Value::Number(2.into()), + Value::Number(3.into()), + ) + .expect("apply binary"); + assert_eq!(value, Value::Number(5.into())); + } + + #[test] + fn test_apply_unary_happy_path() { + let value = + RunnerExecutor::apply_unary(ir::UnaryOperator::UnaryOpNot as i32, Value::Bool(true)) + .expect("apply unary"); + assert_eq!(value, Value::Bool(false)); + } + + #[test] + fn 
test_exception_matches_happy_path() { + let executor = empty_executor(); + let edge = DAGEdge::state_machine_with_exception("a", "b", vec!["ValueError".to_string()]); + let exception = serde_json::json!({ + "type": "ValueError", + "message": "boom", + }); + assert!(executor.exception_matches(&edge, &exception)); + } + + #[test] + fn test_executor_error_happy_path() { + let error = executor_error("hello"); + assert_eq!(error.0, "hello"); + } + + #[test] + fn test_int_value_happy_path() { + let value = Value::Number(7_u64.into()); + assert_eq!(int_value(&value), Some(7)); + } + + #[test] + fn test_numeric_op_happy_path() { + let value = numeric_op( + Value::Number(10.into()), + Value::Number(3.into()), + |a, b| a + b, + true, + executor_error, + ) + .expect("numeric op"); + assert_eq!(value, Value::Number(13.into())); + } + + #[test] + fn test_add_values_happy_path() { + let value = add_values( + Value::String("hello ".to_string()), + Value::String("world".to_string()), + executor_error, + ) + .expect("add values"); + assert_eq!(value, Value::String("hello world".to_string())); + } + + #[test] + fn test_compare_values_happy_path() { + let value = compare_values( + Value::Number(3.into()), + Value::Number(5.into()), + |a, b| a < b, + executor_error, + ) + .expect("compare values"); + assert_eq!(value, Value::Bool(true)); + } + + #[test] + fn test_value_in_happy_path() { + let container = Value::Array(vec![Value::Number(1.into()), Value::Number(2.into())]); + assert!(value_in(&Value::Number(2.into()), &container)); + } + + #[test] + fn test_is_truthy_happy_path() { + assert!(is_truthy(&Value::String("non-empty".to_string()))); + } + + #[test] + fn test_is_exception_value_happy_path() { + let value = serde_json::json!({ + "type": "RuntimeError", + "message": "bad", + }); + assert!(is_exception_value(&value)); + } + + #[test] + fn test_len_of_value_happy_path() { + let value = Value::Array(vec![Value::Null, Value::Null, Value::Null]); + let len = len_of_value(&value, 
executor_error).expect("length"); + assert_eq!(len.as_i64(), Some(3)); + } + + #[test] + fn test_range_from_args_happy_path() { + let values = range_from_args(&[ + Value::Number(0.into()), + Value::Number(5.into()), + Value::Number(2.into()), + ]); + assert_eq!( + values, + vec![ + Value::Number(0.into()), + Value::Number(2.into()), + Value::Number(4.into()) + ] + ); + } +} diff --git a/crates/runner/src/lib.rs b/crates/runner/src/lib.rs new file mode 100644 index 00000000..ed59081f --- /dev/null +++ b/crates/runner/src/lib.rs @@ -0,0 +1,12 @@ +//! Runner utilities. + +pub mod executor; +pub mod expression_evaluator; +pub mod replay; +pub(crate) mod retry; +pub(crate) mod synthetic_exceptions; + +pub use executor::{ + DurableUpdates, ExecutorStep, RunnerExecutor, RunnerExecutorError, SleepRequest, +}; +pub use replay::{ReplayError, ReplayResult, replay_action_kwargs, replay_variables}; diff --git a/crates/runner/src/replay.rs b/crates/runner/src/replay.rs new file mode 100644 index 00000000..ffb413a1 --- /dev/null +++ b/crates/runner/src/replay.rs @@ -0,0 +1,659 @@ +//! Replay variable values from a runner state snapshot. + +use std::cell::RefCell; +use std::collections::{HashMap, HashSet}; +use std::rc::Rc; + +use serde_json::Value; +use uuid::Uuid; + +use crate::expression_evaluator::{ + add_values, compare_values, int_value, is_exception_value, is_truthy, len_of_value, numeric_op, + range_from_args, value_in, +}; +use waymark_dag::{EXCEPTION_SCOPE_VAR, EdgeType}; +use waymark_proto::ast as ir; +use waymark_runner_state::{ + ActionResultValue, FunctionCallValue, RunnerState, + value_visitor::{ValueExpr, ValueExprEvaluator}, +}; + +/// Raised when replay cannot reconstruct variable values. +#[derive(Debug, thiserror::Error)] +#[error("{0}")] +pub struct ReplayError(pub String); + +#[derive(Clone, Debug)] +pub struct ReplayResult { + pub variables: HashMap, +} + +/// Replay variable values from a runner state snapshot. 
+pub struct ReplayEngine<'a> { + state: &'a RunnerState, + action_results: &'a HashMap, + cache: RefCell>, + timeline: Vec, + index: HashMap, + incoming_data: HashMap>, +} + +impl<'a> ReplayEngine<'a> { + /// Prepare replay state derived from a runner snapshot. + /// + /// We precompute a timeline index and incoming data-flow map so lookups are + /// O(1) during evaluation. + /// + /// Example: + /// - timeline = [node_a, node_b] + /// - index[node_b] == 1 and incoming data edges are pre-sorted. + pub fn new(state: &'a RunnerState, action_results: &'a HashMap) -> Self { + let timeline = if state.timeline.is_empty() { + state.nodes.keys().cloned().collect() + } else { + state.timeline.clone() + }; + let index = timeline + .iter() + .enumerate() + .map(|(idx, node_id)| (*node_id, idx)) + .collect(); + let incoming_data = build_incoming_data_map(state, &index); + Self { + state, + action_results, + cache: RefCell::new(HashMap::new()), + timeline, + index, + incoming_data, + } + } + + /// Replay variable values by scanning assignments from newest to oldest. + /// + /// We walk the timeline in reverse to capture the latest assignment for each + /// variable and skip older definitions once a value is known. This mirrors + /// "last write wins" semantics while avoiding redundant evaluation work. + /// + /// Example: + /// - x = 1 + /// - x = 2 + /// Reverse traversal yields x=2 without evaluating the older assignment. 
+ pub fn replay_variables(&self) -> Result { + let mut variables: HashMap = HashMap::new(); + for node_id in self.timeline.iter().rev() { + let node = match self.state.nodes.get(node_id) { + Some(node) => node, + None => continue, + }; + if node.assignments.is_empty() { + continue; + } + for target in node.assignments.keys() { + if variables.contains_key(target) { + continue; + } + let value = self.evaluate_assignment( + *node_id, + target, + Rc::new(RefCell::new(HashSet::new())), + )?; + variables.insert(target.clone(), value); + } + } + Ok(ReplayResult { variables }) + } + + /// Replay concrete kwargs for an action execution node. + /// + /// This resolves symbolic kwargs from the action node in the context of + /// the node's incoming data-flow edges. + pub fn replay_action_kwargs( + &self, + node_id: Uuid, + ) -> Result, ReplayError> { + let node = self + .state + .nodes + .get(&node_id) + .ok_or_else(|| ReplayError(format!("action node not found: {node_id}")))?; + let action = node + .action + .as_ref() + .ok_or_else(|| ReplayError(format!("node is not an action call: {node_id}")))?; + let mut resolved = HashMap::new(); + for (name, expr) in &action.kwargs { + let value = self.evaluate_value_expr_at_node(node_id, expr)?; + resolved.insert(name.clone(), value); + } + Ok(resolved) + } + + /// Evaluate a single assignment expression with cycle detection. + /// + /// We memoize evaluated (node, target) pairs and guard against recursive + /// references by tracking a stack of active evaluations. + /// + /// Example: + /// - x = y + 1 + /// - y = 2 + /// Evaluating x resolves y first, then computes x. 
+ fn evaluate_assignment( + &self, + node_id: Uuid, + target: &str, + stack: Rc>>, + ) -> Result { + let key = (node_id, target.to_string()); + if let Some(value) = self.cache.borrow().get(&key) { + return Ok(value.clone()); + } + if stack.borrow().contains(&key) { + return Err(ReplayError(format!( + "recursive assignment detected for {target} in {node_id}" + ))); + } + + let node = + self.state.nodes.get(&node_id).ok_or_else(|| { + ReplayError(format!("missing assignment for {target} in {node_id}")) + })?; + let expr = node + .assignments + .get(target) + .ok_or_else(|| ReplayError(format!("missing assignment for {target} in {node_id}")))?; + + stack.borrow_mut().insert(key.clone()); + let resolve_variable = { + let stack = stack.clone(); + let this = self; + move |name: &str| this.resolve_variable(node_id, name, stack.clone()) + }; + let resolve_action_result = { + let this = self; + move |value: &ActionResultValue| this.resolve_action_result(value) + }; + let resolve_function_call = { + let this = self; + move |value: &FunctionCallValue, args, kwargs| { + this.evaluate_function_call(value, args, kwargs) + } + }; + let apply_binary = |op, left, right| apply_binary(op, left, right); + let apply_unary = |op, operand| apply_unary(op, operand); + let error_factory = |message: &str| ReplayError(message.to_string()); + let evaluator = ValueExprEvaluator::new( + &resolve_variable, + &resolve_action_result, + &resolve_function_call, + &apply_binary, + &apply_unary, + &error_factory, + ); + let value = evaluator.visit(expr)?; + stack.borrow_mut().remove(&key); + self.cache.borrow_mut().insert(key, value.clone()); + Ok(value) + } + + fn evaluate_value_expr_at_node( + &self, + node_id: Uuid, + expr: &ValueExpr, + ) -> Result { + let stack = Rc::new(RefCell::new(HashSet::new())); + let resolve_variable = { + let stack = stack.clone(); + let this = self; + move |name: &str| this.resolve_variable(node_id, name, stack.clone()) + }; + let resolve_action_result = { + let this = 
self; + move |value: &ActionResultValue| this.resolve_action_result(value) + }; + let resolve_function_call = { + let this = self; + move |value: &FunctionCallValue, args, kwargs| { + this.evaluate_function_call(value, args, kwargs) + } + }; + let apply_binary = |op, left, right| apply_binary(op, left, right); + let apply_unary = |op, operand| apply_unary(op, operand); + let error_factory = |message: &str| ReplayError(message.to_string()); + let evaluator = ValueExprEvaluator::new( + &resolve_variable, + &resolve_action_result, + &resolve_function_call, + &apply_binary, + &apply_unary, + &error_factory, + ); + evaluator.visit(expr) + } + + /// Resolve a variable reference via data-flow edges. + /// + /// This walks to the closest upstream definition and replays that + /// assignment for the requested variable. + /// + /// Example: + /// - action_1 defines x + /// - assign_2 uses x + /// Resolving x from assign_2 evaluates action_1's assignment. + fn resolve_variable( + &self, + current_node_id: Uuid, + name: &str, + stack: Rc>>, + ) -> Result { + let mut source_node_id = self.find_variable_source_node(current_node_id, name); + if source_node_id.is_none() && name == EXCEPTION_SCOPE_VAR { + source_node_id = self.state.latest_assignment(name); + } + let source_node_id = source_node_id.ok_or_else(|| { + ReplayError(format!("variable not found via data-flow edges: {name}")) + })?; + self.evaluate_assignment(source_node_id, name, stack) + } + + /// Find the nearest upstream node that defines the variable. + /// + /// We consult pre-sorted incoming data edges and ignore sources that are + /// later in the timeline than the current node. + /// + /// Example: + /// - if node_b comes after node_a, node_b cannot be a source for node_a. 
+ fn find_variable_source_node(&self, current_node_id: Uuid, name: &str) -> Option<Uuid> { + let sources = self.incoming_data.get(&current_node_id)?; + let current_idx = self + .index + .get(&current_node_id) + .copied() + .unwrap_or(self.index.len()); + for source_id in sources { + if self.index.get(source_id).copied().unwrap_or(0) > current_idx { + continue; + } + if let Some(node) = self.state.nodes.get(source_id) + && node.assignments.contains_key(name) + { + return Some(*source_id); + } + } + None + } + + /// Fetch an action result by node id, handling indexed results. + /// + /// Example: + /// - result = @fetch() + /// - result[0] + /// The evaluator looks up the action result and returns index 0. + fn resolve_action_result(&self, expr: &ActionResultValue) -> Result<Value, ReplayError> { + let value = self + .action_results + .get(&expr.node_id) + .cloned() + .ok_or_else(|| ReplayError(format!("missing action result for {}", expr.node_id)))?; + if let Some(idx) = expr.result_index { + if let Value::Array(items) = value { + let idx = idx as usize; + return items.get(idx).cloned().ok_or_else(|| { + ReplayError(format!( + "action result for {} has no index {}", + expr.node_id, idx + )) + }); + } + return Err(ReplayError(format!( + "action result for {} has no index {}", + expr.node_id, idx + ))); + } + Ok(value) + } + + /// Evaluate a function call during replay. + /// + /// Only global functions are supported because user-defined functions are + /// not available in this replay context. 
+ /// + /// Example: + /// - len(items=[1, 2]) -> 2 + fn evaluate_function_call( + &self, + expr: &FunctionCallValue, + args: Vec<Value>, + kwargs: HashMap<String, Value>, + ) -> Result<Value, ReplayError> { + if let Some(global_fn) = expr.global_function + && global_fn != ir::GlobalFunction::Unspecified as i32 + { + return evaluate_global_function(global_fn, args, kwargs); + } + Err(ReplayError(format!( + "cannot replay non-global function call: {}", + expr.name + ))) + } +} + +fn replay_error(message: &'static str) -> ReplayError { + ReplayError(message.to_string()) +} + +/// Apply a binary operator to replayed operands. +/// +/// Example: +/// - left=1, right=2, op=ADD -> 3 +fn apply_binary(op: i32, left: Value, right: Value) -> Result<Value, ReplayError> { + let error = replay_error; + match ir::BinaryOperator::try_from(op).ok() { + Some(ir::BinaryOperator::BinaryOpOr) => { + if is_truthy(&left) { + Ok(left) + } else { + Ok(right) + } + } + Some(ir::BinaryOperator::BinaryOpAnd) => { + if is_truthy(&left) { + Ok(right) + } else { + Ok(left) + } + } + Some(ir::BinaryOperator::BinaryOpEq) => Ok(Value::Bool(left == right)), + Some(ir::BinaryOperator::BinaryOpNe) => Ok(Value::Bool(left != right)), + Some(ir::BinaryOperator::BinaryOpLt) => compare_values(left, right, |a, b| a < b, error), + Some(ir::BinaryOperator::BinaryOpLe) => compare_values(left, right, |a, b| a <= b, error), + Some(ir::BinaryOperator::BinaryOpGt) => compare_values(left, right, |a, b| a > b, error), + Some(ir::BinaryOperator::BinaryOpGe) => compare_values(left, right, |a, b| a >= b, error), + Some(ir::BinaryOperator::BinaryOpIn) => Ok(Value::Bool(value_in(&left, &right))), + Some(ir::BinaryOperator::BinaryOpNotIn) => Ok(Value::Bool(!value_in(&left, &right))), + Some(ir::BinaryOperator::BinaryOpAdd) => add_values(left, right, error), + Some(ir::BinaryOperator::BinaryOpSub) => numeric_op(left, right, |a, b| a - b, true, error), + Some(ir::BinaryOperator::BinaryOpMul) => numeric_op(left, right, |a, b| a * b, true, error), + 
Some(ir::BinaryOperator::BinaryOpDiv) => { + numeric_op(left, right, |a, b| a / b, false, error) + } + Some(ir::BinaryOperator::BinaryOpFloorDiv) => { + numeric_op(left, right, |a, b| (a / b).floor(), true, error) + } + Some(ir::BinaryOperator::BinaryOpMod) => numeric_op(left, right, |a, b| a % b, true, error), + Some(ir::BinaryOperator::BinaryOpUnspecified) | None => { + Err(ReplayError("binary operator unspecified".to_string())) + } + } +} + +/// Apply a unary operator to a replayed operand. +/// +/// Example: +/// - op=NOT, operand=True -> False +fn apply_unary(op: i32, operand: Value) -> Result<Value, ReplayError> { + match ir::UnaryOperator::try_from(op).ok() { + Some(ir::UnaryOperator::UnaryOpNeg) => { + if let Some(value) = int_value(&operand) { + return Ok(Value::Number((-value).into())); + } + match operand.as_f64() { + Some(value) => Ok(Value::Number( + serde_json::Number::from_f64(-value) + .unwrap_or_else(|| serde_json::Number::from(0)), + )), + None => Err(ReplayError("unary neg expects number".to_string())), + } + } + Some(ir::UnaryOperator::UnaryOpNot) => Ok(Value::Bool(!is_truthy(&operand))), + Some(ir::UnaryOperator::UnaryOpUnspecified) | None => { + Err(ReplayError("unary operator unspecified".to_string())) + } + } +} + +/// Evaluate supported global helper functions. 
+/// +/// Example: +/// - range(0, 3) -> [0, 1, 2] +/// - isexception(value={"type": "...", "message": "..."}) -> True +fn evaluate_global_function( + global_function: i32, + args: Vec<Value>, + kwargs: HashMap<String, Value>, +) -> Result<Value, ReplayError> { + match ir::GlobalFunction::try_from(global_function).ok() { + Some(ir::GlobalFunction::Range) => Ok(range_from_args(&args).into()), + Some(ir::GlobalFunction::Len) => { + if let Some(first) = args.first() { + return Ok(Value::Number(len_of_value(first, replay_error)?)); + } + if let Some(items) = kwargs.get("items") { + return Ok(Value::Number(len_of_value(items, replay_error)?)); + } + Err(ReplayError("len() missing argument".to_string())) + } + Some(ir::GlobalFunction::Enumerate) => { + let items = if let Some(first) = args.first() { + first.clone() + } else if let Some(items) = kwargs.get("items") { + items.clone() + } else { + return Err(ReplayError("enumerate() missing argument".to_string())); + }; + let list = match items { + Value::Array(items) => items, + _ => return Err(ReplayError("enumerate() expects list".to_string())), + }; + let pairs: Vec<Value> = list + .into_iter() + .enumerate() + .map(|(idx, item)| Value::Array(vec![Value::Number((idx as i64).into()), item])) + .collect(); + Ok(Value::Array(pairs)) + } + Some(ir::GlobalFunction::Isexception) => { + if let Some(first) = args.first() { + return Ok(Value::Bool(is_exception_value(first))); + } + if let Some(value) = kwargs.get("value") { + return Ok(Value::Bool(is_exception_value(value))); + } + Err(ReplayError("isexception() missing argument".to_string())) + } + Some(ir::GlobalFunction::Unspecified) | None => { + Err(ReplayError("global function unspecified".to_string())) + } + } +} + +/// Build a reverse index of incoming data-flow edges. +/// +/// Sources are sorted from most-recent to oldest by timeline index so +/// lookups can short-circuit on the first viable definition. 
+fn build_incoming_data_map( + state: &RunnerState, + index: &HashMap, +) -> HashMap> { + let mut incoming: HashMap> = HashMap::new(); + for edge in &state.edges { + if edge.edge_type != EdgeType::DataFlow { + continue; + } + incoming.entry(edge.target).or_default().push(edge.source); + } + for (_target, sources) in incoming.iter_mut() { + sources.sort_by_key(|node_id| { + ( + index.get(node_id).copied().unwrap_or(0), + node_id.to_string(), + ) + }); + sources.reverse(); + } + incoming +} + +/// Replay variable values from a runner state snapshot. +/// +/// This is a convenience wrapper around ReplayEngine that prefers the latest +/// assignment for each variable and returns a fully materialized mapping. +pub fn replay_variables( + state: &RunnerState, + action_results: &HashMap, +) -> Result { + ReplayEngine::new(state, action_results).replay_variables() +} + +/// Replay concrete kwargs for a specific action node from a state snapshot. +pub fn replay_action_kwargs( + state: &RunnerState, + action_results: &HashMap, + node_id: Uuid, +) -> Result, ReplayError> { + ReplayEngine::new(state, action_results).replay_action_kwargs(node_id) +} + +#[cfg(test)] +mod tests { + use super::*; + use waymark_proto::ast as ir; + use waymark_runner_state::{RunnerState, VariableValue, value_visitor::ValueExpr}; + + fn action_plus_two_expr() -> ir::Expr { + ir::Expr { + kind: Some(ir::expr::Kind::BinaryOp(Box::new(ir::BinaryOp { + left: Some(Box::new(ir::Expr { + kind: Some(ir::expr::Kind::Variable(ir::Variable { + name: "action_result".to_string(), + })), + span: None, + })), + op: ir::BinaryOperator::BinaryOpAdd as i32, + right: Some(Box::new(ir::Expr { + kind: Some(ir::expr::Kind::Literal(ir::Literal { + value: Some(ir::literal::Value::IntValue(2)), + })), + span: None, + })), + }))), + span: None, + } + } + + #[test] + fn test_replay_variables_resolves_action_results() { + let mut state = RunnerState::new(None, None, None, true); + + let action0 = state + .queue_action( + 
"action", + Some(vec!["action_result".to_string()]), + None, + None, + Some(0), + ) + .expect("queue action"); + let first_list = ir::Expr { + kind: Some(ir::expr::Kind::List(ir::ListExpr { + elements: vec![action_plus_two_expr()], + })), + span: None, + }; + state + .record_assignment(vec!["results".to_string()], &first_list, None, None) + .expect("record assignment"); + + let action1 = state + .queue_action( + "action", + Some(vec!["action_result".to_string()]), + None, + None, + Some(1), + ) + .expect("queue action"); + let second_list = ir::Expr { + kind: Some(ir::expr::Kind::List(ir::ListExpr { + elements: vec![action_plus_two_expr()], + })), + span: None, + }; + let concat_expr = ir::Expr { + kind: Some(ir::expr::Kind::BinaryOp(Box::new(ir::BinaryOp { + left: Some(Box::new(ir::Expr { + kind: Some(ir::expr::Kind::Variable(ir::Variable { + name: "results".to_string(), + })), + span: None, + })), + op: ir::BinaryOperator::BinaryOpAdd as i32, + right: Some(Box::new(second_list)), + }))), + span: None, + }; + state + .record_assignment(vec!["results".to_string()], &concat_expr, None, None) + .expect("record assignment"); + + let replayed = replay_variables( + &state, + &HashMap::from([ + (action0.node_id, Value::Number(1.into())), + (action1.node_id, Value::Number(2.into())), + ]), + ) + .expect("replay"); + + assert_eq!( + replayed.variables.get("results"), + Some(&Value::Array(vec![3.into(), 4.into()])), + ); + } + + #[test] + fn test_replay_action_kwargs_resolves_variable_inputs() { + let mut state = RunnerState::new(None, None, None, true); + + let number_expr = ir::Expr { + kind: Some(ir::expr::Kind::Literal(ir::Literal { + value: Some(ir::literal::Value::IntValue(7)), + })), + span: None, + }; + state + .record_assignment( + vec!["number".to_string()], + &number_expr, + None, + Some("number = 7".to_string()), + ) + .expect("record assignment"); + + let kwargs = HashMap::from([( + "value".to_string(), + ValueExpr::Variable(VariableValue { + name: 
"number".to_string(), + }), + )]); + + let action = state + .queue_action( + "compute", + Some(vec!["result".to_string()]), + Some(kwargs), + Some("tests".to_string()), + None, + ) + .expect("queue action"); + + let kwargs = replay_action_kwargs( + &state, + &HashMap::from([(action.node_id, Value::Number(14.into()))]), + action.node_id, + ) + .expect("replay kwargs"); + + assert_eq!(kwargs.get("value"), Some(&Value::Number(7.into()))); + } +} diff --git a/crates/runner/src/retry.rs b/crates/runner/src/retry.rs new file mode 100644 index 00000000..a24f7a2d --- /dev/null +++ b/crates/runner/src/retry.rs @@ -0,0 +1,137 @@ +//! Retry/timeout policy helpers shared by runner components. + +use waymark_proto::ast as ir; + +#[derive(Clone, Debug)] +pub(crate) struct RetryDecision { + pub(crate) should_retry: bool, +} + +pub(crate) struct RetryPolicyEvaluator<'a> { + policies: &'a [ir::PolicyBracket], + exception_name: Option<&'a str>, +} + +fn is_synthetic_runtime_exception(exception_name: Option<&str>) -> bool { + matches!(exception_name, Some("ExecutorResume" | "ActionTimeout")) +} + +impl<'a> RetryPolicyEvaluator<'a> { + pub(crate) fn new(policies: &'a [ir::PolicyBracket], exception_name: Option<&'a str>) -> Self { + Self { + policies, + exception_name, + } + } + + pub(crate) fn decision(&self, attempt: i32) -> RetryDecision { + let mut max_retries: i32 = 0; + let mut matched_policy = false; + + for policy in self.policies { + let Some(ir::policy_bracket::Kind::Retry(retry)) = policy.kind.as_ref() else { + continue; + }; + let matches_exception = if retry.exception_types.is_empty() { + // Synthetic runtime exceptions (resume/timeout) can represent in-flight + // work that may still be running out-of-band. Require explicit opt-in + // exception filters before retrying these cases. 
+ !is_synthetic_runtime_exception(self.exception_name) + } else if let Some(name) = self.exception_name { + retry.exception_types.iter().any(|value| value == name) + } else { + false + }; + if !matches_exception { + continue; + } + matched_policy = true; + max_retries = max_retries.max(retry.max_retries as i32); + } + + let should_retry = matched_policy && attempt - 1 < max_retries; + + RetryDecision { should_retry } + } +} + +pub(crate) fn timeout_seconds_from_policies(policies: &[ir::PolicyBracket]) -> Option<u32> { + let mut timeout_seconds: Option<u64> = None; + for policy in policies { + let Some(ir::policy_bracket::Kind::Timeout(timeout)) = policy.kind.as_ref() else { + continue; + }; + let seconds = timeout + .timeout + .as_ref() + .map(|duration| duration.seconds) + .unwrap_or(0); + if seconds == 0 { + continue; + } + timeout_seconds = Some(match timeout_seconds { + Some(existing) => existing.min(seconds), + None => seconds, + }); + } + timeout_seconds.map(|seconds| seconds.min(u64::from(u32::MAX)) as u32) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn retry_policy(max_retries: u32, exception_types: Vec<&str>) -> ir::PolicyBracket { + ir::PolicyBracket { + kind: Some(ir::policy_bracket::Kind::Retry(ir::RetryPolicy { + exception_types: exception_types + .into_iter() + .map(ToString::to_string) + .collect(), + max_retries, + backoff: None, + })), + } + } + + fn timeout_policy(seconds: u64) -> ir::PolicyBracket { + ir::PolicyBracket { + kind: Some(ir::policy_bracket::Kind::Timeout(ir::TimeoutPolicy { + timeout: Some(ir::Duration { seconds }), + })), + } + } + + #[test] + fn retry_policy_evaluator_happy_path() { + let policies = vec![ + retry_policy(1, vec!["ValueError"]), + retry_policy(3, Vec::new()), + ]; + let decision = RetryPolicyEvaluator::new(&policies, Some("ValueError")).decision(2); + assert!(decision.should_retry); + + let exhausted = RetryPolicyEvaluator::new(&policies, Some("ValueError")).decision(4); + assert!(!exhausted.should_retry); + } + + 
#[test] + fn retry_policy_evaluator_wildcard_does_not_retry_synthetic_timeout() { + let policies = vec![retry_policy(3, Vec::new())]; + let decision = RetryPolicyEvaluator::new(&policies, Some("ActionTimeout")).decision(1); + assert!(!decision.should_retry); + } + + #[test] + fn retry_policy_evaluator_explicit_timeout_retry_happy_path() { + let policies = vec![retry_policy(2, vec!["ActionTimeout"])]; + let decision = RetryPolicyEvaluator::new(&policies, Some("ActionTimeout")).decision(1); + assert!(decision.should_retry); + } + + #[test] + fn timeout_seconds_from_policies_happy_path() { + let policies = vec![timeout_policy(30), timeout_policy(10), timeout_policy(0)]; + assert_eq!(timeout_seconds_from_policies(&policies), Some(10)); + } +} diff --git a/crates/runner/src/synthetic_exceptions.rs b/crates/runner/src/synthetic_exceptions.rs new file mode 100644 index 00000000..df89b71f --- /dev/null +++ b/crates/runner/src/synthetic_exceptions.rs @@ -0,0 +1,90 @@ +//! Synthetic exception helpers produced by Rust runtime coordination paths. 
+ +use serde_json::Value; + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub(crate) enum SyntheticExceptionType { + ExecutorResume, + ActionTimeout, +} + +impl SyntheticExceptionType { + pub(crate) fn as_type_str(self) -> &'static str { + match self { + Self::ExecutorResume => "ExecutorResume", + Self::ActionTimeout => "ActionTimeout", + } + } + + fn from_type_str(value: &str) -> Option<Self> { + match value { + "ExecutorResume" => Some(Self::ExecutorResume), + "ActionTimeout" => Some(Self::ActionTimeout), + _ => None, + } + } + + pub(crate) fn from_value(value: &Value) -> Option<Self> { + let Value::Object(map) = value else { + return None; + }; + map.get("type") + .and_then(Value::as_str) + .and_then(Self::from_type_str) + } +} + +pub(crate) fn build_synthetic_exception_value( + exception_type: SyntheticExceptionType, + message: impl Into<String>, + fields: Vec<(String, Value)>, +) -> Value { + let mut map = serde_json::Map::new(); + map.insert( + "type".to_string(), + Value::String(exception_type.as_type_str().to_string()), + ); + map.insert("message".to_string(), Value::String(message.into())); + for (key, value) in fields { + map.insert(key, value); + } + Value::Object(map) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn synthetic_exception_from_value_happy_path() { + let value = serde_json::json!({"type": "ActionTimeout", "message": "x"}); + assert_eq!( + SyntheticExceptionType::from_value(&value), + Some(SyntheticExceptionType::ActionTimeout) + ); + } + + #[test] + fn build_synthetic_exception_value_happy_path() { + let value = build_synthetic_exception_value( + SyntheticExceptionType::ExecutorResume, + "resume", + vec![( + "attempt".to_string(), + Value::Number(serde_json::Number::from(2)), + )], + ); + let Value::Object(map) = value else { + panic!("expected object value"); + }; + assert_eq!( + map.get("type"), + Some(&Value::String("ExecutorResume".to_string())) + ); + assert_eq!( + map.get("message"), + Some(&Value::String("resume".to_string())) + ); + 
assert_eq!(map.get("attempt"), Some(&Value::Number(2.into()))); + } +} diff --git a/crates/scheduler-backend/Cargo.toml b/crates/scheduler-backend/Cargo.toml new file mode 100644 index 00000000..6af1c2bb --- /dev/null +++ b/crates/scheduler-backend/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "waymark-scheduler-backend" +version = "0.1.0" +edition = "2024" + +[dependencies] +async-trait = { workspace = true } +uuid = { workspace = true } +waymark-backends-core = { workspace = true } +waymark-scheduler-core = { workspace = true } diff --git a/crates/scheduler-backend/src/lib.rs b/crates/scheduler-backend/src/lib.rs new file mode 100644 index 00000000..613cc4ac --- /dev/null +++ b/crates/scheduler-backend/src/lib.rs @@ -0,0 +1,29 @@ +use uuid::Uuid; + +pub use waymark_backends_core::{BackendError, BackendResult}; +use waymark_scheduler_core::{CreateScheduleParams, ScheduleId, WorkflowSchedule}; + +/// Backend capability for workflow schedule persistence. +#[async_trait::async_trait] +pub trait SchedulerBackend: Send + Sync { + async fn upsert_schedule(&self, params: &CreateScheduleParams) -> BackendResult; + async fn get_schedule(&self, id: ScheduleId) -> BackendResult; + async fn get_schedule_by_name( + &self, + workflow_name: &str, + schedule_name: &str, + ) -> BackendResult>; + async fn list_schedules(&self, limit: i64, offset: i64) + -> BackendResult>; + async fn count_schedules(&self) -> BackendResult; + async fn update_schedule_status(&self, id: ScheduleId, status: &str) -> BackendResult; + async fn delete_schedule(&self, id: ScheduleId) -> BackendResult; + async fn find_due_schedules(&self, limit: i32) -> BackendResult>; + async fn has_running_instance(&self, schedule_id: ScheduleId) -> BackendResult; + async fn mark_schedule_executed( + &self, + schedule_id: ScheduleId, + instance_id: Uuid, + ) -> BackendResult<()>; + async fn skip_schedule_run(&self, schedule_id: ScheduleId) -> BackendResult<()>; +} diff --git a/crates/scheduler-core/Cargo.toml 
b/crates/scheduler-core/Cargo.toml new file mode 100644 index 00000000..9659e878 --- /dev/null +++ b/crates/scheduler-core/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "waymark-scheduler-core" +version = "0.1.0" +edition = "2024" + +[dependencies] +uuid = { workspace = true, features = ["serde", "v4"] } +chrono = { workspace = true, features = ["serde"] } +serde = { workspace = true, features = ["derive"] } +cron = { workspace = true } +rand = { workspace = true } + +[dev-dependencies] +chrono = { workspace = true, features = ["clock"] } diff --git a/crates/scheduler-core/src/lib.rs b/crates/scheduler-core/src/lib.rs new file mode 100644 index 00000000..02d2783b --- /dev/null +++ b/crates/scheduler-core/src/lib.rs @@ -0,0 +1,6 @@ +mod types; +mod utils; + +pub use self::types::*; + +pub use self::utils::*; diff --git a/crates/scheduler-core/src/types.rs b/crates/scheduler-core/src/types.rs new file mode 100644 index 00000000..4f8c9104 --- /dev/null +++ b/crates/scheduler-core/src/types.rs @@ -0,0 +1,139 @@ +//! Schedule types. + +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +/// Unique identifier for a schedule. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub struct ScheduleId(pub Uuid); + +impl ScheduleId { + pub fn new() -> Self { + Self(Uuid::new_v4()) + } +} + +impl Default for ScheduleId { + fn default() -> Self { + Self::new() + } +} + +impl std::fmt::Display for ScheduleId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +/// Type of schedule. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum ScheduleType { + Cron, + Interval, +} + +impl ScheduleType { + pub fn as_str(&self) -> &'static str { + match self { + Self::Cron => "cron", + Self::Interval => "interval", + } + } + + pub fn parse(s: &str) -> Option { + match s { + "cron" => Some(Self::Cron), + "interval" => Some(Self::Interval), + _ => None, + } + } +} + +impl std::fmt::Display for ScheduleType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.as_str()) + } +} + +/// Status of a workflow schedule. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum ScheduleStatus { + Active, + Paused, + Deleted, +} + +impl ScheduleStatus { + pub fn as_str(&self) -> &'static str { + match self { + Self::Active => "active", + Self::Paused => "paused", + Self::Deleted => "deleted", + } + } + + pub fn parse(s: &str) -> Option { + match s { + "active" => Some(Self::Active), + "paused" => Some(Self::Paused), + "deleted" => Some(Self::Deleted), + _ => None, + } + } +} + +impl std::fmt::Display for ScheduleStatus { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.as_str()) + } +} + +/// A workflow schedule (recurring execution). +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WorkflowSchedule { + pub id: Uuid, + pub workflow_name: String, + pub schedule_name: String, + pub schedule_type: String, + pub cron_expression: Option, + pub interval_seconds: Option, + pub jitter_seconds: i64, + pub input_payload: Option>, + pub status: String, + pub next_run_at: Option>, + pub last_run_at: Option>, + pub last_instance_id: Option, + pub created_at: DateTime, + pub updated_at: DateTime, + pub priority: i32, + pub allow_duplicate: bool, +} + +impl WorkflowSchedule { + /// Get the schedule type as an enum. 
+ pub fn schedule_type_enum(&self) -> Option { + ScheduleType::parse(&self.schedule_type) + } + + /// Get the status as an enum. + pub fn status_enum(&self) -> Option { + ScheduleStatus::parse(&self.status) + } +} + +/// Parameters for creating a schedule. +#[derive(Debug, Clone)] +pub struct CreateScheduleParams { + pub workflow_name: String, + pub schedule_name: String, + pub schedule_type: ScheduleType, + pub cron_expression: Option, + pub interval_seconds: Option, + pub jitter_seconds: i64, + pub input_payload: Option>, + pub priority: i32, + pub allow_duplicate: bool, +} diff --git a/crates/scheduler-core/src/utils.rs b/crates/scheduler-core/src/utils.rs new file mode 100644 index 00000000..4530329f --- /dev/null +++ b/crates/scheduler-core/src/utils.rs @@ -0,0 +1,181 @@ +//! Cron and interval schedule utilities. +//! +//! This module provides utilities for computing the next run time for +//! cron expressions and fixed intervals. +//! +//! Note: This module accepts standard 5-field Unix cron expressions +//! (minute, hour, day-of-month, month, day-of-week) and converts them +//! to 6-field format (with seconds) for the `cron` crate. + +use chrono::{DateTime, Utc}; +use cron::Schedule; +use rand::Rng; +use std::str::FromStr; + +use super::ScheduleType; + +/// Convert a 5-field Unix cron expression to 6-field format. +/// +/// The `cron` crate requires 6 fields (sec min hour dom month dow), +/// but standard Unix cron uses 5 fields (min hour dom month dow). +/// This function prepends "0 " to run at second 0 of each match. +fn normalize_cron_expr(cron_expr: &str) -> String { + let fields: Vec<&str> = cron_expr.split_whitespace().collect(); + if fields.len() == 5 { + // Standard 5-field cron: prepend "0" for seconds + format!("0 {}", cron_expr) + } else { + // Already 6+ fields, use as-is + cron_expr.to_string() + } +} + +/// Compute the next run time for a cron expression. 
+/// +/// Accepts standard 5-field Unix cron expressions (e.g., "0 * * * *" for hourly) +/// or 6-field expressions with seconds. +/// +/// Returns the next occurrence after the current time (UTC). +pub fn next_cron_run(cron_expr: &str) -> Result<DateTime<Utc>, String> { + let normalized = normalize_cron_expr(cron_expr); + let schedule = Schedule::from_str(&normalized) + .map_err(|e| format!("Invalid cron expression '{}': {}", cron_expr, e))?; + schedule + .upcoming(Utc) + .next() + .ok_or_else(|| "No upcoming schedule found".to_string()) +} + +/// Compute the next run time for an interval-based schedule. +/// +/// If `last_run_at` is provided, the next run is `last_run_at + interval_seconds`. +/// Otherwise, the next run is `now + interval_seconds`. +pub fn next_interval_run( + interval_seconds: i64, + last_run_at: Option<DateTime<Utc>>, +) -> DateTime<Utc> { + let base = last_run_at.unwrap_or_else(Utc::now); + base + chrono::Duration::seconds(interval_seconds) +} + +/// Validate a cron expression without computing the next run. +/// +/// Accepts standard 5-field Unix cron expressions or 6-field expressions. +pub fn validate_cron(cron_expr: &str) -> Result<(), String> { + let normalized = normalize_cron_expr(cron_expr); + Schedule::from_str(&normalized) + .map(|_| ()) + .map_err(|e| format!("Invalid cron expression '{}': {}", cron_expr, e)) +} + +/// Apply a random jitter delay (in seconds) to a scheduled time. +/// +/// If `jitter_seconds` is 0, the base time is returned unchanged. +pub fn apply_jitter(base: DateTime<Utc>, jitter_seconds: i64) -> Result<DateTime<Utc>, String> { + if jitter_seconds < 0 { + return Err("jitter_seconds must be non-negative".to_string()); + } + if jitter_seconds == 0 { + return Ok(base); + } + let jitter = rand::thread_rng().gen_range(0..=jitter_seconds); + Ok(base + chrono::Duration::seconds(jitter)) +} + +/// Compute the next run time for a schedule type with optional jitter. 
+pub fn compute_next_run( + schedule_type: ScheduleType, + cron_expression: Option<&str>, + interval_seconds: Option<i64>, + jitter_seconds: i64, + last_run_at: Option<DateTime<Utc>>, +) -> Result<DateTime<Utc>, String> { + let base = match schedule_type { + ScheduleType::Cron => { + let expr = cron_expression.ok_or_else(|| "cron expression required".to_string())?; + next_cron_run(expr)? + } + ScheduleType::Interval => { + let seconds = + interval_seconds.ok_or_else(|| "interval_seconds required".to_string())?; + next_interval_run(seconds, last_run_at) + } + }; + + apply_jitter(base, jitter_seconds) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_normalize_cron_expr() { + // 5-field should get "0 " prepended + assert_eq!(normalize_cron_expr("* * * * *"), "0 * * * * *"); + assert_eq!(normalize_cron_expr("0 * * * *"), "0 0 * * * *"); + + // 6-field should remain unchanged + assert_eq!(normalize_cron_expr("0 0 * * * *"), "0 0 * * * *"); + } + + #[test] + fn test_valid_cron_expression() { + // Standard 5-field Unix cron expressions + assert!(validate_cron("0 * * * *").is_ok()); + assert!(validate_cron("0 0 * * *").is_ok()); + assert!(validate_cron("* * * * *").is_ok()); + + // 6-field expression with seconds + assert!(validate_cron("0 0 * * * *").is_ok()); + } + + #[test] + fn test_invalid_cron_expression() { + assert!(validate_cron("invalid").is_err()); + assert!(validate_cron("").is_err()); + } + + #[test] + fn test_next_cron_run() { + // Every minute should return a time in the future + let next = next_cron_run("* * * * *").unwrap(); + assert!(next > Utc::now()); + } + + #[test] + fn test_next_interval_run_from_now() { + let before = Utc::now(); + let next = next_interval_run(3600, None); + let after = Utc::now(); + + // Should be approximately 1 hour from now + assert!(next >= before + chrono::Duration::seconds(3600)); + assert!(next <= after + chrono::Duration::seconds(3600)); + } + + #[test] + fn test_next_interval_run_from_last() { + let last_run = Utc::now() - 
chrono::Duration::seconds(1800); + let next = next_interval_run(3600, Some(last_run)); + + // Should be 1 hour after last_run (30 minutes from now) + let expected = last_run + chrono::Duration::seconds(3600); + assert_eq!(next, expected); + } + + #[test] + fn test_apply_jitter_zero() { + let base = Utc::now(); + let jittered = apply_jitter(base, 0).unwrap(); + assert_eq!(jittered, base); + } + + #[test] + fn test_apply_jitter_range() { + let base = Utc::now(); + let jittered = apply_jitter(base, 5).unwrap(); + assert!(jittered >= base); + assert!(jittered <= base + chrono::Duration::seconds(5)); + } +} diff --git a/crates/webapp-backend/Cargo.toml b/crates/webapp-backend/Cargo.toml new file mode 100644 index 00000000..735810b3 --- /dev/null +++ b/crates/webapp-backend/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "waymark-webapp-backend" +version = "0.1.0" +edition = "2024" + +[dependencies] +async-trait = { workspace = true } +uuid = { workspace = true } +waymark-backends-core = { workspace = true } +waymark-webapp-core = { workspace = true } diff --git a/crates/webapp-backend/src/lib.rs b/crates/webapp-backend/src/lib.rs new file mode 100644 index 00000000..354e0e67 --- /dev/null +++ b/crates/webapp-backend/src/lib.rs @@ -0,0 +1,54 @@ +use uuid::Uuid; +pub use waymark_backends_core::{BackendError, BackendResult}; +use waymark_webapp_core::{ + ExecutionGraphView, InstanceDetail, InstanceSummary, ScheduleDetail, ScheduleInvocationSummary, + ScheduleSummary, TimelineEntry, WorkerActionRow, WorkerAggregateStats, WorkerStatus, +}; + +/// Backend capability for webapp-specific queries. 
+#[async_trait::async_trait] +pub trait WebappBackend: Send + Sync { + async fn count_instances(&self, search: Option<&str>) -> BackendResult; + async fn list_instances( + &self, + search: Option<&str>, + limit: i64, + offset: i64, + ) -> BackendResult>; + async fn get_instance(&self, instance_id: Uuid) -> BackendResult; + async fn get_execution_graph( + &self, + instance_id: Uuid, + ) -> BackendResult>; + async fn get_workflow_graph( + &self, + instance_id: Uuid, + ) -> BackendResult>; + async fn get_action_results(&self, instance_id: Uuid) -> BackendResult>; + async fn get_distinct_workflows(&self) -> BackendResult>; + async fn get_distinct_statuses(&self) -> BackendResult>; + async fn count_schedules(&self) -> BackendResult; + async fn list_schedules(&self, limit: i64, offset: i64) -> BackendResult>; + async fn get_schedule(&self, schedule_id: Uuid) -> BackendResult; + async fn count_schedule_invocations(&self, schedule_id: Uuid) -> BackendResult; + async fn list_schedule_invocations( + &self, + schedule_id: Uuid, + limit: i64, + offset: i64, + ) -> BackendResult>; + async fn update_schedule_status(&self, schedule_id: Uuid, status: &str) -> BackendResult; + async fn get_distinct_schedule_statuses(&self) -> BackendResult>; + async fn get_distinct_schedule_types(&self) -> BackendResult>; + async fn get_worker_action_stats( + &self, + window_minutes: i64, + ) -> BackendResult>; + async fn get_worker_aggregate_stats( + &self, + window_minutes: i64, + ) -> BackendResult; + async fn worker_status_table_exists(&self) -> bool; + async fn schedules_table_exists(&self) -> bool; + async fn get_worker_statuses(&self, window_minutes: i64) -> BackendResult>; +} diff --git a/crates/webapp-core/Cargo.toml b/crates/webapp-core/Cargo.toml new file mode 100644 index 00000000..2b51dc6d --- /dev/null +++ b/crates/webapp-core/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "waymark-webapp-core" +version = "0.1.0" +edition = "2024" + +[dependencies] +uuid = { workspace = true, features 
= ["serde"] } +chrono = { workspace = true, features = ["serde"] } +serde = { workspace = true, features = ["derive"] } diff --git a/crates/webapp-core/src/lib.rs b/crates/webapp-core/src/lib.rs new file mode 100644 index 00000000..7805c428 --- /dev/null +++ b/crates/webapp-core/src/lib.rs @@ -0,0 +1,299 @@ +//! Shared types for the webapp. + +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +/// Configuration for the webapp server. +#[derive(Debug, Clone)] +pub struct WebappConfig { + pub enabled: bool, + pub host: String, + pub port: u16, +} + +impl Default for WebappConfig { + fn default() -> Self { + Self { + enabled: false, + host: "0.0.0.0".to_string(), + port: 24119, + } + } +} + +impl WebappConfig { + /// Create config from environment variables. + pub fn from_env() -> Self { + let enabled = std::env::var("WAYMARK_WEBAPP_ENABLED") + .map(|v| v == "true" || v == "1") + .unwrap_or(false); + + let (host, port) = std::env::var("WAYMARK_WEBAPP_ADDR") + .ok() + .and_then(|addr| { + let parts: Vec<&str> = addr.split(':').collect(); + if parts.len() == 2 { + let host = parts[0].to_string(); + let port = parts[1].parse().ok()?; + Some((host, port)) + } else { + None + } + }) + .unwrap_or_else(|| ("0.0.0.0".to_string(), 24119)); + + Self { + enabled, + host, + port, + } + } + + /// Get the bind address. + pub fn bind_addr(&self) -> String { + format!("{}:{}", self.host, self.port) + } +} + +/// Instance status. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum InstanceStatus { + Queued, + Running, + Completed, + Failed, +} + +impl std::fmt::Display for InstanceStatus { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Queued => write!(f, "queued"), + Self::Running => write!(f, "running"), + Self::Completed => write!(f, "completed"), + Self::Failed => write!(f, "failed"), + } + } +} + +/// Summary of a workflow instance for listing. +#[derive(Debug, Clone, Serialize)] +pub struct InstanceSummary { + pub id: Uuid, + pub entry_node: Uuid, + pub created_at: DateTime, + pub status: InstanceStatus, + pub workflow_name: Option, + pub input_preview: String, +} + +/// Full details of a workflow instance. +#[derive(Debug, Clone, Serialize)] +pub struct InstanceDetail { + pub id: Uuid, + pub entry_node: Uuid, + pub created_at: DateTime, + pub status: InstanceStatus, + pub workflow_name: Option, + pub input_payload: String, + pub result_payload: String, + pub error_payload: Option, +} + +/// Node in the execution graph for display. +#[derive(Debug, Clone, Serialize)] +pub struct ExecutionNodeView { + pub id: String, + pub node_type: String, + pub label: String, + pub status: String, + pub action_name: Option, + pub module_name: Option, +} + +/// Edge in the execution graph for display. +#[derive(Debug, Clone, Serialize)] +pub struct ExecutionEdgeView { + pub source: String, + pub target: String, + pub edge_type: String, +} + +/// Execution graph data for rendering. +#[derive(Debug, Clone, Serialize)] +pub struct ExecutionGraphView { + pub nodes: Vec, + pub edges: Vec, +} + +/// Timeline entry for an action execution. 
+#[derive(Debug, Clone, Serialize)] +pub struct TimelineEntry { + pub action_id: String, + pub action_name: String, + pub module_name: Option, + pub status: String, + pub attempt_number: i32, + pub dispatched_at: Option, + pub completed_at: Option, + pub duration_ms: Option, + pub request_preview: String, + pub response_preview: String, + pub error: Option, +} + +/// Action log entry with full details. +#[derive(Debug, Clone, Serialize)] +pub struct ActionLogEntry { + pub action_id: String, + pub action_name: String, + pub module_name: Option, + pub status: String, + pub attempt_number: i32, + pub dispatched_at: Option, + pub completed_at: Option, + pub duration_ms: Option, + pub request: String, + pub response: String, + pub error: Option, +} + +/// Response for the workflow run data API. +#[derive(Debug, Serialize)] +pub struct WorkflowRunDataResponse { + pub nodes: Vec, + pub timeline: Vec, + pub page: i64, + pub per_page: i64, + pub total: i64, + pub has_more: bool, +} + +/// Response for action logs API. +#[derive(Debug, Serialize)] +pub struct ActionLogsResponse { + pub logs: Vec, +} + +/// Filter values response. +#[derive(Debug, Serialize)] +pub struct FilterValuesResponse { + pub values: Vec, +} + +/// Health check response. +#[derive(Debug, Serialize)] +pub struct HealthResponse { + pub status: &'static str, + pub service: &'static str, +} + +/// Export format for a workflow instance. +#[derive(Debug, Serialize)] +pub struct WorkflowInstanceExport { + pub export_version: &'static str, + pub exported_at: String, + pub instance: InstanceExportInfo, + pub nodes: Vec, + pub timeline: Vec, +} + +/// Full worker status for webapp display. 
+#[derive(Debug, Clone)] +pub struct WorkerStatus { + pub pool_id: Uuid, + pub active_workers: i32, + pub throughput_per_min: f64, + pub actions_per_sec: f64, + pub total_completed: i64, + pub last_action_at: Option>, + pub updated_at: DateTime, + pub median_dequeue_ms: Option, + pub median_handling_ms: Option, + pub dispatch_queue_size: Option, + pub total_in_flight: Option, + pub median_instance_duration_secs: Option, + pub active_instance_count: i32, + pub total_instances_completed: i64, + pub instances_per_sec: f64, + pub instances_per_min: f64, + pub time_series: Option>, +} + +/// Worker action stats row for display. +#[derive(Debug, Clone)] +pub struct WorkerActionRow { + pub pool_id: String, + pub active_workers: i64, + pub actions_per_sec: String, + pub throughput_per_min: i64, + pub total_completed: i64, + pub median_dequeue_ms: Option, + pub median_handling_ms: Option, + pub last_action_at: Option, + pub updated_at: String, +} + +/// Aggregate worker stats for overview cards. +#[derive(Debug, Clone)] +pub struct WorkerAggregateStats { + pub active_worker_count: i64, + pub actions_per_sec: String, + pub total_in_flight: i64, + pub total_queue_depth: i64, +} + +/// Instance info for export. +#[derive(Debug, Serialize)] +pub struct InstanceExportInfo { + pub id: String, + pub status: String, + pub created_at: String, + pub input_payload: String, + pub result_payload: String, +} + +/// Schedule summary for listing. +#[derive(Debug, Clone, Serialize)] +pub struct ScheduleSummary { + pub id: String, + pub workflow_name: String, + pub schedule_name: String, + pub schedule_type: String, + pub cron_expression: Option, + pub interval_seconds: Option, + pub status: String, + pub next_run_at: Option, + pub last_run_at: Option, + pub created_at: String, +} + +/// Full schedule details. 
+#[derive(Debug, Clone, Serialize)] +pub struct ScheduleDetail { + pub id: String, + pub workflow_name: String, + pub schedule_name: String, + pub schedule_type: String, + pub cron_expression: Option, + pub interval_seconds: Option, + pub jitter_seconds: i64, + pub status: String, + pub next_run_at: Option, + pub last_run_at: Option, + pub last_instance_id: Option, + pub created_at: String, + pub updated_at: String, + pub priority: i32, + pub allow_duplicate: bool, + pub input_payload: Option, +} + +/// Invocation summary row for schedule detail pages. +#[derive(Debug, Clone, Serialize)] +pub struct ScheduleInvocationSummary { + pub id: Uuid, + pub created_at: DateTime, + pub status: InstanceStatus, +} diff --git a/crates/worker-status-backend/Cargo.toml b/crates/worker-status-backend/Cargo.toml new file mode 100644 index 00000000..ff50466a --- /dev/null +++ b/crates/worker-status-backend/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "waymark-worker-status-backend" +version = "0.1.0" +edition = "2024" + +[dependencies] +async-trait = { workspace = true } +uuid = { workspace = true } +chrono = { workspace = true } +waymark-backends-core = { workspace = true } diff --git a/crates/worker-status-backend/src/lib.rs b/crates/worker-status-backend/src/lib.rs new file mode 100644 index 00000000..bc23eb4e --- /dev/null +++ b/crates/worker-status-backend/src/lib.rs @@ -0,0 +1,32 @@ +//! Worker status backend. + +use uuid::Uuid; + +pub use waymark_backends_core::{BackendError, BackendResult}; + +/// Worker status update for persistence. 
+#[derive(Clone, Debug)] +pub struct WorkerStatusUpdate { + pub pool_id: Uuid, + pub throughput_per_min: f64, + pub total_completed: i64, + pub last_action_at: Option>, + pub median_dequeue_ms: Option, + pub median_handling_ms: Option, + pub dispatch_queue_size: i64, + pub total_in_flight: i64, + pub active_workers: i32, + pub actions_per_sec: f64, + pub median_instance_duration_secs: Option, + pub active_instance_count: i32, + pub total_instances_completed: i64, + pub instances_per_sec: f64, + pub instances_per_min: f64, + pub time_series: Option>, +} + +/// Backend capability for recording worker status metrics. +#[async_trait::async_trait] +pub trait WorkerStatusBackend: Send + Sync { + async fn upsert_worker_status(&self, status: &WorkerStatusUpdate) -> BackendResult<()>; +} diff --git a/crates/workflow-registry-backend/Cargo.toml b/crates/workflow-registry-backend/Cargo.toml new file mode 100644 index 00000000..2dc85a4d --- /dev/null +++ b/crates/workflow-registry-backend/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "waymark-workflow-registry-backend" +version = "0.1.0" +edition = "2024" + +[dependencies] +async-trait = { workspace = true } +uuid = { workspace = true } +waymark-backends-core = { workspace = true } diff --git a/crates/workflow-registry-backend/src/lib.rs b/crates/workflow-registry-backend/src/lib.rs new file mode 100644 index 00000000..041c8482 --- /dev/null +++ b/crates/workflow-registry-backend/src/lib.rs @@ -0,0 +1,35 @@ +use uuid::Uuid; + +pub use waymark_backends_core::{BackendError, BackendResult}; + +/// Registration payload for storing workflow DAG metadata. +#[derive(Clone, Debug)] +pub struct WorkflowRegistration { + pub workflow_name: String, + pub workflow_version: String, + pub ir_hash: String, + pub program_proto: Vec, + pub concurrent: bool, +} + +#[derive(Clone, Debug)] +/// Stored workflow version metadata and IR payload. 
+pub struct WorkflowVersion { + pub id: Uuid, + pub workflow_name: String, + pub workflow_version: String, + pub ir_hash: String, + pub program_proto: Vec, + pub concurrent: bool, +} + +/// Backend capability for registering workflow DAGs. +#[async_trait::async_trait] +pub trait WorkflowRegistryBackend: Send + Sync { + async fn upsert_workflow_version( + &self, + registration: &WorkflowRegistration, + ) -> BackendResult; + + async fn get_workflow_versions(&self, ids: &[Uuid]) -> BackendResult>; +} From 6105f1679cab71aff3e33ac96241266372c40654 Mon Sep 17 00:00:00 2001 From: MOZGIII Date: Wed, 25 Feb 2026 21:23:46 +0400 Subject: [PATCH 4/5] Extract the rest of the crates --- Cargo.lock | 74 +- Cargo.toml | 6 + crates/backend-fault-injection/Cargo.toml | 12 + crates/backend-fault-injection/src/lib.rs | 128 + crates/backend-memory/Cargo.toml | 1 + crates/backend-memory/src/core_backend.rs | 5 +- .../src/garbage_collector_backend.rs | 5 +- .../backend-memory/src/scheduler_backend.rs | 2 +- crates/backend-memory/src/webapp_backend.rs | 3 +- crates/backend-postgres-migrations/Cargo.toml | 7 + crates/backend-postgres-migrations/build.rs | 3 + .../migrations/0001_init.sql | 115 + .../0002_runner_actions_done_execution_id.sql | 7 + .../migrations/0003_instance_locks.sql | 12 + .../migrations/0004_workflow_versions.sql | 21 + ...5_runner_instances_workflow_version_id.sql | 7 + .../0006_drop_unused_runner_tables.sql | 4 + .../0007_runner_instances_schedule_id.sql | 5 + .../0008_runner_actions_done_timing.sql | 14 + .../0009_instance_search_columns.sql | 63 + crates/backend-postgres-migrations/src/lib.rs | 8 + crates/backend-postgres/Cargo.toml | 39 + crates/backend-postgres/src/core.rs | 1993 ++++++++++++++ crates/backend-postgres/src/lib.rs | 115 + crates/backend-postgres/src/registry.rs | 146 ++ crates/backend-postgres/src/scheduler.rs | 605 +++++ crates/backend-postgres/src/test_helpers.rs | 27 + crates/backend-postgres/src/webapp.rs | 2329 +++++++++++++++++ 
crates/backends-core/src/lib.rs | 7 + crates/core-backend/src/lib.rs | 2 +- crates/dag/Cargo.toml | 2 +- crates/dag/src/builder/test_helpers.rs | 2 +- crates/dag/src/validate.rs | 2 +- crates/garbage-collector-backend/src/lib.rs | 2 +- crates/integration-support/Cargo.toml | 10 + crates/integration-support/src/lib.rs | 5 + crates/integration-support/src/postgres.rs | 103 + crates/runner/src/lib.rs | 4 +- crates/runner/src/synthetic_exceptions.rs | 8 +- crates/test-support/Cargo.toml | 8 + crates/test-support/src/lib.rs | 5 + crates/test-support/src/postgres.rs | 15 + crates/webapp-backend/src/lib.rs | 2 +- crates/webapp-core/src/lib.rs | 52 - 44 files changed, 5914 insertions(+), 71 deletions(-) create mode 100644 crates/backend-fault-injection/Cargo.toml create mode 100644 crates/backend-fault-injection/src/lib.rs create mode 100644 crates/backend-postgres-migrations/Cargo.toml create mode 100644 crates/backend-postgres-migrations/build.rs create mode 100644 crates/backend-postgres-migrations/migrations/0001_init.sql create mode 100644 crates/backend-postgres-migrations/migrations/0002_runner_actions_done_execution_id.sql create mode 100644 crates/backend-postgres-migrations/migrations/0003_instance_locks.sql create mode 100644 crates/backend-postgres-migrations/migrations/0004_workflow_versions.sql create mode 100644 crates/backend-postgres-migrations/migrations/0005_runner_instances_workflow_version_id.sql create mode 100644 crates/backend-postgres-migrations/migrations/0006_drop_unused_runner_tables.sql create mode 100644 crates/backend-postgres-migrations/migrations/0007_runner_instances_schedule_id.sql create mode 100644 crates/backend-postgres-migrations/migrations/0008_runner_actions_done_timing.sql create mode 100644 crates/backend-postgres-migrations/migrations/0009_instance_search_columns.sql create mode 100644 crates/backend-postgres-migrations/src/lib.rs create mode 100644 crates/backend-postgres/Cargo.toml create mode 100644 
crates/backend-postgres/src/core.rs create mode 100644 crates/backend-postgres/src/lib.rs create mode 100644 crates/backend-postgres/src/registry.rs create mode 100644 crates/backend-postgres/src/scheduler.rs create mode 100644 crates/backend-postgres/src/test_helpers.rs create mode 100644 crates/backend-postgres/src/webapp.rs create mode 100644 crates/integration-support/Cargo.toml create mode 100644 crates/integration-support/src/lib.rs create mode 100644 crates/integration-support/src/postgres.rs create mode 100644 crates/test-support/Cargo.toml create mode 100644 crates/test-support/src/lib.rs create mode 100644 crates/test-support/src/postgres.rs diff --git a/Cargo.lock b/Cargo.lock index 4b31811b..9737bb86 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3372,6 +3372,18 @@ dependencies = [ "waymark-proto", ] +[[package]] +name = "waymark-backend-fault-injection" +version = "0.1.0" +dependencies = [ + "async-trait", + "uuid", + "waymark-backend-memory", + "waymark-backends-core", + "waymark-core-backend", + "waymark-workflow-registry-backend", +] + [[package]] name = "waymark-backend-memory" version = "0.1.0" @@ -3381,16 +3393,58 @@ dependencies = [ "rmp-serde", "serde_json", "uuid", + "waymark-backends-core", + "waymark-core-backend", + "waymark-garbage-collector-backend", + "waymark-scheduler-backend", + "waymark-scheduler-core", + "waymark-webapp-backend", + "waymark-webapp-core", + "waymark-worker-status-backend", + "waymark-workflow-registry-backend", +] + +[[package]] +name = "waymark-backend-postgres" +version = "0.1.0" +dependencies = [ + "async-trait", + "chrono", + "prost 0.12.6", + "rmp-serde", + "serde", + "serde_json", + "serial_test", + "sqlx", + "tokio", + "tracing", + "uuid", + "waymark-backend-postgres-migrations", + "waymark-backends-core", "waymark-core-backend", + "waymark-dag", "waymark-garbage-collector-backend", + "waymark-ir-parser", + "waymark-observability", + "waymark-proto", + "waymark-runner", + "waymark-runner-state", 
"waymark-scheduler-backend", "waymark-scheduler-core", + "waymark-test-support", "waymark-webapp-backend", "waymark-webapp-core", "waymark-worker-status-backend", "waymark-workflow-registry-backend", ] +[[package]] +name = "waymark-backend-postgres-migrations" +version = "0.1.0" +dependencies = [ + "sqlx", +] + [[package]] name = "waymark-backends-core" version = "0.1.0" @@ -3422,7 +3476,7 @@ dependencies = [ "serde", "thiserror", "uuid", - "waymark", + "waymark-ir-parser", "waymark-proto", ] @@ -3452,6 +3506,16 @@ dependencies = [ "waymark-backends-core", ] +[[package]] +name = "waymark-integration-support" +version = "0.1.0" +dependencies = [ + "anyhow", + "sqlx", + "tokio", + "waymark-backend-postgres-migrations", +] + [[package]] name = "waymark-ir-parser" version = "0.1.0" @@ -3541,6 +3605,14 @@ dependencies = [ "uuid", ] +[[package]] +name = "waymark-test-support" +version = "0.1.0" +dependencies = [ + "sqlx", + "waymark-integration-support", +] + [[package]] name = "waymark-webapp-backend" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index aa2f4ab0..4c881345 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,11 +4,15 @@ members = ["crates/*"] [workspace.dependencies] waymark = { path = "crates/waymark" } +waymark-backend-fault-injection = { path = "crates/backend-fault-injection" } waymark-backend-memory = { path = "crates/backend-memory" } +waymark-backend-postgres = { path = "crates/backend-postgres" } +waymark-backend-postgres-migrations = { path = "crates/backend-postgres-migrations" } waymark-backends-core = { path = "crates/backends-core" } waymark-core-backend = { path = "crates/core-backend" } waymark-dag = { path = "crates/dag" } waymark-garbage-collector-backend = { path = "crates/garbage-collector-backend" } +waymark-integration-support = { path = "crates/integration-support" } waymark-ir-parser = { path = "crates/ir-parser" } waymark-observability = { path = "crates/observability" } waymark-observability-macros = { path = 
"crates/observability-macros" } @@ -17,6 +21,7 @@ waymark-runner = { path = "crates/runner" } waymark-runner-state = { path = "crates/runner-state" } waymark-scheduler-backend = { path = "crates/scheduler-backend" } waymark-scheduler-core = { path = "crates/scheduler-core" } +waymark-test-support = { path = "crates/test-support" } waymark-webapp-backend = { path = "crates/webapp-backend" } waymark-webapp-core = { path = "crates/webapp-core" } waymark-worker-status-backend = { path = "crates/worker-status-backend" } @@ -36,6 +41,7 @@ rmp-serde = "1" rustc-hash = "2" serde = "1" serde_json = "1" +serial_test = "2" sha2 = "0.10" sqlx = { version = "0.8", default-features = false } thiserror = "2" diff --git a/crates/backend-fault-injection/Cargo.toml b/crates/backend-fault-injection/Cargo.toml new file mode 100644 index 00000000..1b592ba1 --- /dev/null +++ b/crates/backend-fault-injection/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "waymark-backend-fault-injection" +version = "0.1.0" +edition = "2024" + +[dependencies] +async-trait = { workspace = true } +uuid = { workspace = true } +waymark-backends-core = { workspace = true } +waymark-backend-memory = { workspace = true } +waymark-core-backend = { workspace = true } +waymark-workflow-registry-backend = { workspace = true } diff --git a/crates/backend-fault-injection/src/lib.rs b/crates/backend-fault-injection/src/lib.rs new file mode 100644 index 00000000..4a43d305 --- /dev/null +++ b/crates/backend-fault-injection/src/lib.rs @@ -0,0 +1,128 @@ +use std::sync::{ + Arc, + atomic::{AtomicBool, AtomicUsize, Ordering as AtomicOrdering}, +}; + +use uuid::Uuid; +use waymark_backend_memory::MemoryBackend; +use waymark_backends_core::{BackendError, BackendResult}; +use waymark_core_backend::{ + CoreBackend, GraphUpdate, InstanceDone, InstanceLockStatus, LockClaim, QueuedInstanceBatch, +}; +use waymark_workflow_registry_backend::{ + WorkflowRegistration, WorkflowRegistryBackend, WorkflowVersion, +}; + +#[derive(Clone)] 
+pub struct FaultInjectingBackend { + inner: MemoryBackend, + fail_get_queued_instances_with_depth_limit: Arc, + get_queued_instances_calls: Arc, +} + +impl FaultInjectingBackend { + pub fn with_depth_limit_poll_failures(inner: MemoryBackend) -> Self { + Self { + inner, + fail_get_queued_instances_with_depth_limit: Arc::new(AtomicBool::new(true)), + get_queued_instances_calls: Arc::new(AtomicUsize::new(0)), + } + } + + pub fn get_queued_instances_calls(&self) -> usize { + self.get_queued_instances_calls.load(AtomicOrdering::SeqCst) + } + + pub fn queue_len(&self) -> usize { + self.inner + .instance_queue() + .as_ref() + .map(|queue| queue.lock().expect("queue poisoned").len()) + .unwrap_or(0) + } + + pub fn instances_done_len(&self) -> usize { + self.inner.instances_done().len() + } +} + +#[async_trait::async_trait] +impl CoreBackend for FaultInjectingBackend { + fn clone_box(&self) -> Box { + Box::new(self.clone()) + } + + async fn save_graphs( + &self, + claim: LockClaim, + graphs: &[GraphUpdate], + ) -> BackendResult> { + self.inner.save_graphs(claim, graphs).await + } + + async fn save_actions_done( + &self, + actions: &[waymark_core_backend::ActionDone], + ) -> BackendResult<()> { + self.inner.save_actions_done(actions).await + } + + async fn save_instances_done(&self, instances: &[InstanceDone]) -> BackendResult<()> { + self.inner.save_instances_done(instances).await + } + + async fn get_queued_instances( + &self, + size: usize, + claim: LockClaim, + ) -> BackendResult { + self.get_queued_instances_calls + .fetch_add(1, AtomicOrdering::SeqCst); + if self + .fail_get_queued_instances_with_depth_limit + .load(AtomicOrdering::SeqCst) + { + return Err(BackendError::Message("depth limit exceeded".to_string())); + } + self.inner.get_queued_instances(size, claim).await + } + + async fn queue_instances( + &self, + instances: &[waymark_core_backend::QueuedInstance], + ) -> BackendResult<()> { + self.inner.queue_instances(instances).await + } + + async fn 
refresh_instance_locks( + &self, + claim: LockClaim, + instance_ids: &[Uuid], + ) -> BackendResult> { + self.inner.refresh_instance_locks(claim, instance_ids).await + } + + async fn release_instance_locks( + &self, + lock_uuid: Uuid, + instance_ids: &[Uuid], + ) -> BackendResult<()> { + self.inner + .release_instance_locks(lock_uuid, instance_ids) + .await + } +} + +#[async_trait::async_trait] +impl WorkflowRegistryBackend for FaultInjectingBackend { + async fn upsert_workflow_version( + &self, + registration: &WorkflowRegistration, + ) -> BackendResult { + self.inner.upsert_workflow_version(registration).await + } + + async fn get_workflow_versions(&self, ids: &[Uuid]) -> BackendResult> { + self.inner.get_workflow_versions(ids).await + } +} diff --git a/crates/backend-memory/Cargo.toml b/crates/backend-memory/Cargo.toml index 4346bbda..203e0f35 100644 --- a/crates/backend-memory/Cargo.toml +++ b/crates/backend-memory/Cargo.toml @@ -9,6 +9,7 @@ chrono = { workspace = true } rmp-serde = { workspace = true } serde_json = { workspace = true } uuid = { workspace = true } +waymark-backends-core = { workspace = true } waymark-core-backend = { workspace = true } waymark-garbage-collector-backend = { workspace = true, optional = true } waymark-scheduler-backend = { workspace = true, optional = true } diff --git a/crates/backend-memory/src/core_backend.rs b/crates/backend-memory/src/core_backend.rs index d6e2da1e..49a40330 100644 --- a/crates/backend-memory/src/core_backend.rs +++ b/crates/backend-memory/src/core_backend.rs @@ -1,8 +1,9 @@ use chrono::Utc; use uuid::Uuid; +use waymark_backends_core::{BackendError, BackendResult}; use waymark_core_backend::{ - ActionDone, BackendError, BackendResult, GraphUpdate, InstanceDone, InstanceLockStatus, - LockClaim, QueuedInstance, QueuedInstanceBatch, + ActionDone, GraphUpdate, InstanceDone, InstanceLockStatus, LockClaim, QueuedInstance, + QueuedInstanceBatch, }; #[async_trait::async_trait] diff --git 
a/crates/backend-memory/src/garbage_collector_backend.rs b/crates/backend-memory/src/garbage_collector_backend.rs index a1274935..6a4cda66 100644 --- a/crates/backend-memory/src/garbage_collector_backend.rs +++ b/crates/backend-memory/src/garbage_collector_backend.rs @@ -1,7 +1,6 @@ use chrono::{DateTime, Utc}; -use waymark_garbage_collector_backend::{ - BackendResult, GarbageCollectionResult, GarbageCollectorBackend, -}; +use waymark_backends_core::BackendResult; +use waymark_garbage_collector_backend::{GarbageCollectionResult, GarbageCollectorBackend}; #[async_trait::async_trait] impl GarbageCollectorBackend for crate::MemoryBackend { diff --git a/crates/backend-memory/src/scheduler_backend.rs b/crates/backend-memory/src/scheduler_backend.rs index a69aa94e..3764f489 100644 --- a/crates/backend-memory/src/scheduler_backend.rs +++ b/crates/backend-memory/src/scheduler_backend.rs @@ -1,6 +1,6 @@ use chrono::Utc; use uuid::Uuid; -use waymark_core_backend::{BackendError, BackendResult}; +use waymark_backends_core::{BackendError, BackendResult}; use waymark_scheduler_backend::SchedulerBackend; use waymark_scheduler_core::{ CreateScheduleParams, ScheduleId, ScheduleType, WorkflowSchedule, compute_next_run, diff --git a/crates/backend-memory/src/webapp_backend.rs b/crates/backend-memory/src/webapp_backend.rs index 883f4076..5bcca7c1 100644 --- a/crates/backend-memory/src/webapp_backend.rs +++ b/crates/backend-memory/src/webapp_backend.rs @@ -2,7 +2,8 @@ use std::collections::HashMap; use chrono::Utc; use uuid::Uuid; -use waymark_webapp_backend::{BackendError, BackendResult, WebappBackend}; +use waymark_backends_core::{BackendError, BackendResult}; +use waymark_webapp_backend::WebappBackend; use waymark_webapp_core::{ ExecutionGraphView, InstanceDetail, InstanceStatus, InstanceSummary, ScheduleDetail, ScheduleInvocationSummary, ScheduleSummary, TimelineEntry, WorkerActionRow, diff --git a/crates/backend-postgres-migrations/Cargo.toml 
b/crates/backend-postgres-migrations/Cargo.toml new file mode 100644 index 00000000..f84ad14c --- /dev/null +++ b/crates/backend-postgres-migrations/Cargo.toml @@ -0,0 +1,7 @@ +[package] +name = "waymark-backend-postgres-migrations" +version = "0.1.0" +edition = "2024" + +[dependencies] +sqlx = { workspace = true, features = ["postgres", "macros", "migrate"] } diff --git a/crates/backend-postgres-migrations/build.rs b/crates/backend-postgres-migrations/build.rs new file mode 100644 index 00000000..3a8149ef --- /dev/null +++ b/crates/backend-postgres-migrations/build.rs @@ -0,0 +1,3 @@ +fn main() { + println!("cargo:rerun-if-changed=migrations"); +} diff --git a/crates/backend-postgres-migrations/migrations/0001_init.sql b/crates/backend-postgres-migrations/migrations/0001_init.sql new file mode 100644 index 00000000..dbb6b7da --- /dev/null +++ b/crates/backend-postgres-migrations/migrations/0001_init.sql @@ -0,0 +1,115 @@ +-- Waymark core schema (baseline) + +CREATE EXTENSION IF NOT EXISTS pgcrypto; + +-- --------------------------------------------------------------------------- +-- Workflow definitions +-- --------------------------------------------------------------------------- + +CREATE TABLE workflow_versions ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + workflow_name TEXT NOT NULL, + dag_hash TEXT NOT NULL, + program_proto BYTEA NOT NULL, + concurrent BOOLEAN NOT NULL DEFAULT false, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + UNIQUE(workflow_name, dag_hash) +); + +CREATE INDEX idx_workflow_versions_name ON workflow_versions(workflow_name); + +-- --------------------------------------------------------------------------- +-- Runner persistence tables +-- --------------------------------------------------------------------------- + +CREATE TABLE runner_graph_updates ( + id BIGSERIAL PRIMARY KEY, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + state BYTEA NOT NULL +); + +CREATE TABLE runner_actions_done ( + id BIGSERIAL PRIMARY KEY, + 
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + node_id UUID NOT NULL, + action_name TEXT NOT NULL, + attempt INTEGER NOT NULL, + result BYTEA +); + +CREATE TABLE runner_instances ( + instance_id UUID PRIMARY KEY, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + entry_node UUID NOT NULL, + state BYTEA, + result BYTEA, + error BYTEA +); + +CREATE TABLE runner_instances_done ( + id BIGSERIAL PRIMARY KEY, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + executor_id UUID NOT NULL, + entry_node UUID NOT NULL, + result BYTEA, + error BYTEA +); + +CREATE TABLE queued_instances ( + instance_id UUID PRIMARY KEY, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + payload BYTEA NOT NULL +); + +-- --------------------------------------------------------------------------- +-- Scheduler +-- --------------------------------------------------------------------------- + +CREATE TABLE workflow_schedules ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + workflow_name TEXT NOT NULL, + schedule_name TEXT NOT NULL, + schedule_type TEXT NOT NULL, + cron_expression TEXT, + interval_seconds BIGINT, + jitter_seconds BIGINT NOT NULL DEFAULT 0, + input_payload BYTEA, + status TEXT NOT NULL DEFAULT 'active', + next_run_at TIMESTAMPTZ, + last_run_at TIMESTAMPTZ, + last_instance_id UUID, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + priority INT NOT NULL DEFAULT 0, + allow_duplicate BOOLEAN NOT NULL DEFAULT false, + UNIQUE(workflow_name, schedule_name) +); + +CREATE INDEX idx_schedules_due ON workflow_schedules(next_run_at) + WHERE status = 'active' AND next_run_at IS NOT NULL; + +-- --------------------------------------------------------------------------- +-- Worker status metrics +-- --------------------------------------------------------------------------- + +CREATE TABLE worker_status ( + pool_id UUID NOT NULL, + worker_id BIGINT NOT NULL, + throughput_per_min DOUBLE PRECISION NOT NULL DEFAULT 0, + total_completed BIGINT NOT 
NULL DEFAULT 0, + last_action_at TIMESTAMPTZ, + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + median_dequeue_ms BIGINT, + median_handling_ms BIGINT, + dispatch_queue_size BIGINT, + total_in_flight BIGINT, + active_workers INT NOT NULL DEFAULT 0, + actions_per_sec DOUBLE PRECISION NOT NULL DEFAULT 0, + median_instance_duration_secs DOUBLE PRECISION, + active_instance_count INT NOT NULL DEFAULT 0, + total_instances_completed BIGINT NOT NULL DEFAULT 0, + instances_per_sec DOUBLE PRECISION NOT NULL DEFAULT 0, + instances_per_min DOUBLE PRECISION NOT NULL DEFAULT 0, + time_series BYTEA, + PRIMARY KEY (pool_id, worker_id) +); diff --git a/crates/backend-postgres-migrations/migrations/0002_runner_actions_done_execution_id.sql b/crates/backend-postgres-migrations/migrations/0002_runner_actions_done_execution_id.sql new file mode 100644 index 00000000..b4bce178 --- /dev/null +++ b/crates/backend-postgres-migrations/migrations/0002_runner_actions_done_execution_id.sql @@ -0,0 +1,7 @@ +-- Rename runner action identifier to execution_id and drop stored action name. + +ALTER TABLE runner_actions_done + RENAME COLUMN node_id TO execution_id; + +ALTER TABLE runner_actions_done + DROP COLUMN action_name; diff --git a/crates/backend-postgres-migrations/migrations/0003_instance_locks.sql b/crates/backend-postgres-migrations/migrations/0003_instance_locks.sql new file mode 100644 index 00000000..6b826d18 --- /dev/null +++ b/crates/backend-postgres-migrations/migrations/0003_instance_locks.sql @@ -0,0 +1,12 @@ +-- Add scheduling and locking for queued instances. 
+ +ALTER TABLE queued_instances + ADD COLUMN scheduled_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + ADD COLUMN lock_uuid UUID, + ADD COLUMN lock_expires_at TIMESTAMPTZ; + +CREATE INDEX IF NOT EXISTS idx_queued_instances_scheduled_at + ON queued_instances(scheduled_at); + +CREATE INDEX IF NOT EXISTS idx_queued_instances_lock_expires_at + ON queued_instances(lock_expires_at); diff --git a/crates/backend-postgres-migrations/migrations/0004_workflow_versions.sql b/crates/backend-postgres-migrations/migrations/0004_workflow_versions.sql new file mode 100644 index 00000000..daf3b54d --- /dev/null +++ b/crates/backend-postgres-migrations/migrations/0004_workflow_versions.sql @@ -0,0 +1,21 @@ +-- Workflow versions: replace dag_hash with workflow_version + ir_hash + +ALTER TABLE workflow_versions + RENAME COLUMN dag_hash TO workflow_version; + +ALTER TABLE workflow_versions + ADD COLUMN ir_hash TEXT; + +UPDATE workflow_versions +SET ir_hash = workflow_version +WHERE ir_hash IS NULL; + +ALTER TABLE workflow_versions + ALTER COLUMN ir_hash SET NOT NULL; + +ALTER TABLE workflow_versions + DROP CONSTRAINT IF EXISTS workflow_versions_workflow_name_dag_hash_key; + +ALTER TABLE workflow_versions + ADD CONSTRAINT workflow_versions_workflow_name_version_key + UNIQUE (workflow_name, workflow_version); diff --git a/crates/backend-postgres-migrations/migrations/0005_runner_instances_workflow_version_id.sql b/crates/backend-postgres-migrations/migrations/0005_runner_instances_workflow_version_id.sql new file mode 100644 index 00000000..6d09937b --- /dev/null +++ b/crates/backend-postgres-migrations/migrations/0005_runner_instances_workflow_version_id.sql @@ -0,0 +1,7 @@ +-- Persist workflow version on instances so webapp can show workflow names. 
+ +ALTER TABLE runner_instances + ADD COLUMN workflow_version_id UUID; + +CREATE INDEX IF NOT EXISTS idx_runner_instances_workflow_version_id + ON runner_instances(workflow_version_id); diff --git a/crates/backend-postgres-migrations/migrations/0006_drop_unused_runner_tables.sql b/crates/backend-postgres-migrations/migrations/0006_drop_unused_runner_tables.sql new file mode 100644 index 00000000..d3b1f272 --- /dev/null +++ b/crates/backend-postgres-migrations/migrations/0006_drop_unused_runner_tables.sql @@ -0,0 +1,4 @@ +-- Remove legacy tables no longer used by runtime or webapp. + +DROP TABLE IF EXISTS runner_graph_updates; +DROP TABLE IF EXISTS runner_instances_done; diff --git a/crates/backend-postgres-migrations/migrations/0007_runner_instances_schedule_id.sql b/crates/backend-postgres-migrations/migrations/0007_runner_instances_schedule_id.sql new file mode 100644 index 00000000..06cb1385 --- /dev/null +++ b/crates/backend-postgres-migrations/migrations/0007_runner_instances_schedule_id.sql @@ -0,0 +1,5 @@ +ALTER TABLE runner_instances +ADD COLUMN IF NOT EXISTS schedule_id UUID; + +CREATE INDEX IF NOT EXISTS idx_runner_instances_schedule_id_created_at + ON runner_instances(schedule_id, created_at DESC); diff --git a/crates/backend-postgres-migrations/migrations/0008_runner_actions_done_timing.sql b/crates/backend-postgres-migrations/migrations/0008_runner_actions_done_timing.sql new file mode 100644 index 00000000..b1b5551d --- /dev/null +++ b/crates/backend-postgres-migrations/migrations/0008_runner_actions_done_timing.sql @@ -0,0 +1,14 @@ +-- Persist per-attempt lifecycle metadata for action history and timeline rendering. 
+ +ALTER TABLE runner_actions_done + ADD COLUMN status TEXT, + ADD COLUMN started_at TIMESTAMPTZ, + ADD COLUMN completed_at TIMESTAMPTZ, + ADD COLUMN duration_ms BIGINT; + +ALTER TABLE runner_actions_done + ADD CONSTRAINT runner_actions_done_status_check + CHECK (status IS NULL OR status IN ('completed', 'failed', 'timed_out')); + +CREATE INDEX idx_runner_actions_done_execution_attempt + ON runner_actions_done (execution_id, attempt); diff --git a/crates/backend-postgres-migrations/migrations/0009_instance_search_columns.sql b/crates/backend-postgres-migrations/migrations/0009_instance_search_columns.sql new file mode 100644 index 00000000..948c6aca --- /dev/null +++ b/crates/backend-postgres-migrations/migrations/0009_instance_search_columns.sql @@ -0,0 +1,63 @@ +-- Persist workflow/status instance metadata for indexed search in webapp queries. + +ALTER TABLE runner_instances + ADD COLUMN IF NOT EXISTS workflow_name TEXT, + ADD COLUMN IF NOT EXISTS current_status TEXT; + +ALTER TABLE queued_instances + ADD COLUMN IF NOT EXISTS workflow_name TEXT, + ADD COLUMN IF NOT EXISTS current_status TEXT; + +UPDATE runner_instances AS ri +SET workflow_name = wv.workflow_name +FROM workflow_versions wv +WHERE ri.workflow_name IS NULL + AND ri.workflow_version_id = wv.id; + +UPDATE runner_instances +SET current_status = CASE + WHEN error IS NOT NULL THEN 'failed' + WHEN result IS NOT NULL THEN 'completed' + WHEN state IS NOT NULL THEN 'running' + ELSE 'queued' +END +WHERE current_status IS NULL; + +UPDATE queued_instances AS qi +SET workflow_name = ri.workflow_name +FROM runner_instances ri +WHERE qi.workflow_name IS NULL + AND qi.instance_id = ri.instance_id; + +UPDATE queued_instances +SET current_status = CASE + WHEN lock_uuid IS NULL THEN 'queued' + ELSE 'running' +END +WHERE current_status IS NULL; + +ALTER TABLE runner_instances + ADD CONSTRAINT runner_instances_current_status_check + CHECK ( + current_status IS NULL + OR current_status IN ('queued', 'running', 
'completed', 'failed') + ); + +ALTER TABLE queued_instances + ADD CONSTRAINT queued_instances_current_status_check + CHECK ( + current_status IS NULL + OR current_status IN ('queued', 'running') + ); + +CREATE INDEX IF NOT EXISTS idx_runner_instances_workflow_name + ON runner_instances(workflow_name); + +CREATE INDEX IF NOT EXISTS idx_runner_instances_current_status + ON runner_instances(current_status); + +CREATE INDEX IF NOT EXISTS idx_queued_instances_workflow_name + ON queued_instances(workflow_name); + +CREATE INDEX IF NOT EXISTS idx_queued_instances_current_status + ON queued_instances(current_status); diff --git a/crates/backend-postgres-migrations/src/lib.rs b/crates/backend-postgres-migrations/src/lib.rs new file mode 100644 index 00000000..82495aeb --- /dev/null +++ b/crates/backend-postgres-migrations/src/lib.rs @@ -0,0 +1,8 @@ +//! Migrations for the postgres backend. + +use sqlx::PgPool; + +/// Run the embedded SQLx migrations. +pub async fn run(pool: &PgPool) -> Result<(), sqlx::migrate::MigrateError> { + sqlx::migrate!().run(pool).await +} diff --git a/crates/backend-postgres/Cargo.toml b/crates/backend-postgres/Cargo.toml new file mode 100644 index 00000000..f61f582f --- /dev/null +++ b/crates/backend-postgres/Cargo.toml @@ -0,0 +1,39 @@ +[package] +name = "waymark-backend-postgres" +version = "0.1.0" +edition = "2024" + +[dependencies] +async-trait = { workspace = true } +chrono = { workspace = true } +rmp-serde = { workspace = true } +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true } +sqlx = { workspace = true, features = ["uuid", "chrono"] } +tokio = { workspace = true, features = ["macros"] } +tracing = { workspace = true } +uuid = { workspace = true } +waymark-backend-postgres-migrations = { workspace = true } +waymark-backends-core = { workspace = true } +waymark-core-backend = { workspace = true } +waymark-dag = { workspace = true } +waymark-proto = { workspace = true } +waymark-garbage-collector-backend = 
{ workspace = true } +waymark-observability = { workspace = true } +waymark-runner = { workspace = true } +waymark-runner-state = { workspace = true } +waymark-scheduler-backend = { workspace = true } +waymark-scheduler-core = { workspace = true } +waymark-webapp-backend = { workspace = true } +waymark-webapp-core = { workspace = true } +waymark-worker-status-backend = { workspace = true } +waymark-workflow-registry-backend = { workspace = true } +prost = { workspace = true } + +[dev-dependencies] +serial_test = { workspace = true } +waymark-test-support = { workspace = true } +waymark-ir-parser = { workspace = true } + +[features] +trace = [] diff --git a/crates/backend-postgres/src/core.rs b/crates/backend-postgres/src/core.rs new file mode 100644 index 00000000..bc46747e --- /dev/null +++ b/crates/backend-postgres/src/core.rs @@ -0,0 +1,1993 @@ +use std::collections::HashMap; +use std::future::Future; +use std::time::Duration as StdDuration; + +use chrono::{DateTime, Utc}; +use sqlx::{Postgres, QueryBuilder, Row}; +use tracing::warn; +use uuid::Uuid; +use waymark_garbage_collector_backend::{GarbageCollectionResult, GarbageCollectorBackend}; +use waymark_scheduler_backend::{BackendError, BackendResult}; +use waymark_worker_status_backend::{WorkerStatusBackend, WorkerStatusUpdate}; + +use super::PostgresBackend; +use waymark_core_backend::{ + ActionDone, GraphUpdate, InstanceDone, InstanceLockStatus, LockClaim, QueuedInstance, + QueuedInstanceBatch, +}; +use waymark_observability::obs; +use waymark_runner_state::RunnerState; + +const INSTANCE_STATUS_QUEUED: &str = "queued"; +const INSTANCE_STATUS_RUNNING: &str = "running"; +const INSTANCE_STATUS_COMPLETED: &str = "completed"; +const INSTANCE_STATUS_FAILED: &str = "failed"; +const TRANSIENT_DEADLOCK_SQLSTATE: &str = "40P01"; +const TRANSIENT_SERIALIZATION_SQLSTATE: &str = "40001"; +const TRANSIENT_RETRY_MAX_ATTEMPTS: usize = 3; +const TRANSIENT_RETRY_INITIAL_BACKOFF_MS: u64 = 25; +const 
TRANSIENT_RETRY_MAX_BACKOFF_MS: u64 = 250; + +fn instance_result_is_error_wrapper(result: &serde_json::Value) -> bool { + let serde_json::Value::Object(map) = result else { + return false; + }; + map.len() == 1 + && (map.contains_key("error") + || map.contains_key("__exception__") + || map.contains_key("exception")) +} + +fn instance_done_status(instance: &InstanceDone) -> &'static str { + if instance.error.is_some() + || instance + .result + .as_ref() + .is_some_and(instance_result_is_error_wrapper) + { + INSTANCE_STATUS_FAILED + } else { + INSTANCE_STATUS_COMPLETED + } +} + +fn is_transient_sqlstate(code: &str) -> bool { + matches!( + code, + TRANSIENT_DEADLOCK_SQLSTATE | TRANSIENT_SERIALIZATION_SQLSTATE + ) +} + +fn is_transient_backend_error(err: &BackendError) -> bool { + match err { + BackendError::Inner(sqlx::Error::Database(db_err)) => { + db_err.code().as_deref().is_some_and(is_transient_sqlstate) + } + // Fallback for cases where sqlstate is not preserved in wrapping. + BackendError::Message(message) => { + message.contains("deadlock detected") + || message.contains("could not serialize access due to") + } + _ => false, + } +} + +async fn retry_transient_backend( + operation: &'static str, + mut op: Op, +) -> BackendResult +where + Op: FnMut() -> Fut, + Fut: Future>, +{ + let mut attempt = 0usize; + let mut backoff_ms = TRANSIENT_RETRY_INITIAL_BACKOFF_MS; + loop { + match op().await { + Ok(value) => return Ok(value), + Err(err) + if attempt < TRANSIENT_RETRY_MAX_ATTEMPTS && is_transient_backend_error(&err) => + { + attempt += 1; + warn!( + operation, + attempt, + error = %err, + "transient database error; retrying" + ); + tokio::time::sleep(StdDuration::from_millis(backoff_ms)).await; + backoff_ms = + std::cmp::min(backoff_ms.saturating_mul(2), TRANSIENT_RETRY_MAX_BACKOFF_MS); + } + Err(err) => return Err(err), + } + } +} + +impl PostgresBackend { + /// Insert queued instances for run-loop consumption. 
+ #[obs] + pub async fn queue_instances(&self, instances: &[QueuedInstance]) -> BackendResult<()> { + if instances.is_empty() { + return Ok(()); + } + let workflow_version_ids: Vec = instances + .iter() + .map(|instance| instance.workflow_version_id) + .collect(); + let workflow_rows = + sqlx::query("SELECT id, workflow_name FROM workflow_versions WHERE id = ANY($1)") + .bind(&workflow_version_ids) + .fetch_all(&self.pool) + .await?; + let mut workflow_names_by_version_id: HashMap = + HashMap::with_capacity(workflow_rows.len()); + for row in workflow_rows { + workflow_names_by_version_id.insert(row.get("id"), row.get("workflow_name")); + } + + let mut queued_payloads = Vec::new(); + let mut runner_payloads = Vec::new(); + for instance in instances { + let state = instance.state.as_ref().ok_or_else(|| { + BackendError::Message("queued instance missing runner state".to_string()) + })?; + let scheduled_at = instance.scheduled_at.unwrap_or_else(Utc::now); + let workflow_name = workflow_names_by_version_id + .get(&instance.workflow_version_id) + .cloned(); + let mut payload_instance = instance.clone(); + payload_instance.scheduled_at = Some(scheduled_at); + queued_payloads.push(( + payload_instance.instance_id, + scheduled_at, + workflow_name.clone(), + INSTANCE_STATUS_QUEUED, + Self::serialize(&payload_instance)?, + )); + let graph = GraphUpdate::from_state(instance.instance_id, state); + runner_payloads.push(( + instance.instance_id, + instance.entry_node, + instance.workflow_version_id, + instance.schedule_id, + workflow_name, + INSTANCE_STATUS_QUEUED, + Self::serialize(&graph)?, + )); + } + + let mut queued_builder: QueryBuilder = QueryBuilder::new( + "INSERT INTO queued_instances (instance_id, scheduled_at, workflow_name, current_status, payload) ", + ); + queued_builder.push_values( + queued_payloads.iter(), + |mut builder, (id, scheduled_at, workflow_name, current_status, payload)| { + builder + .push_bind(*id) + .push_bind(*scheduled_at) + 
.push_bind(workflow_name.as_deref()) + .push_bind(*current_status) + .push_bind(payload.as_slice()); + }, + ); + + let mut runner_builder: QueryBuilder = QueryBuilder::new( + "INSERT INTO runner_instances (instance_id, entry_node, workflow_version_id, schedule_id, workflow_name, current_status, state) ", + ); + runner_builder.push_values( + runner_payloads.iter(), + |mut builder, + ( + id, + entry, + workflow_version_id, + schedule_id, + workflow_name, + current_status, + payload, + )| { + builder + .push_bind(*id) + .push_bind(*entry) + .push_bind(*workflow_version_id) + .push_bind(*schedule_id) + .push_bind(workflow_name.as_deref()) + .push_bind(*current_status) + .push_bind(payload.as_slice()); + }, + ); + + let mut tx = self.pool.begin().await?; + Self::count_query(&self.query_counts, "insert:queued_instances"); + Self::count_batch_size( + &self.batch_size_counts, + "insert:queued_instances", + instances.len(), + ); + queued_builder.build().execute(&mut *tx).await?; + Self::count_query(&self.query_counts, "insert:runner_instances"); + Self::count_batch_size( + &self.batch_size_counts, + "insert:runner_instances", + instances.len(), + ); + runner_builder.build().execute(&mut *tx).await?; + tx.commit().await?; + Ok(()) + } + + /// Upsert worker status for monitoring and activity graphs. 
+ #[obs] + pub async fn upsert_worker_status(&self, status: &WorkerStatusUpdate) -> BackendResult<()> { + Self::count_query(&self.query_counts, "upsert:worker_status"); + sqlx::query( + r#" + INSERT INTO worker_status ( + pool_id, + worker_id, + throughput_per_min, + total_completed, + last_action_at, + updated_at, + median_dequeue_ms, + median_handling_ms, + dispatch_queue_size, + total_in_flight, + active_workers, + actions_per_sec, + median_instance_duration_secs, + active_instance_count, + total_instances_completed, + instances_per_sec, + instances_per_min, + time_series + ) + VALUES ($1, 0, $2, $3, $4, NOW(), $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16) + ON CONFLICT (pool_id, worker_id) + DO UPDATE SET + throughput_per_min = EXCLUDED.throughput_per_min, + total_completed = EXCLUDED.total_completed, + last_action_at = EXCLUDED.last_action_at, + updated_at = EXCLUDED.updated_at, + median_dequeue_ms = EXCLUDED.median_dequeue_ms, + median_handling_ms = EXCLUDED.median_handling_ms, + dispatch_queue_size = EXCLUDED.dispatch_queue_size, + total_in_flight = EXCLUDED.total_in_flight, + active_workers = EXCLUDED.active_workers, + actions_per_sec = EXCLUDED.actions_per_sec, + median_instance_duration_secs = EXCLUDED.median_instance_duration_secs, + active_instance_count = EXCLUDED.active_instance_count, + total_instances_completed = EXCLUDED.total_instances_completed, + instances_per_sec = EXCLUDED.instances_per_sec, + instances_per_min = EXCLUDED.instances_per_min, + time_series = EXCLUDED.time_series + "#, + ) + .bind(status.pool_id) + .bind(status.throughput_per_min) + .bind(status.total_completed) + .bind(status.last_action_at) + .bind(status.median_dequeue_ms) + .bind(status.median_handling_ms) + .bind(status.dispatch_queue_size) + .bind(status.total_in_flight) + .bind(status.active_workers) + .bind(status.actions_per_sec) + .bind(status.median_instance_duration_secs) + .bind(status.active_instance_count) + .bind(status.total_instances_completed) + 
.bind(status.instances_per_sec) + .bind(status.instances_per_min) + .bind(&status.time_series) + .execute(&self.pool) + .await?; + + Ok(()) + } + + /// Clear expired queue locks so they can be claimed again by the runloop. + /// + /// This uses the same `FOR UPDATE SKIP LOCKED` claim pattern as dequeue to + /// avoid blocking under concurrent sweepers. + #[obs] + pub async fn reclaim_expired_instance_locks(&self, size: usize) -> BackendResult { + if size == 0 { + return Ok(0); + } + + let now = Utc::now(); + let mut tx = self.pool.begin().await?; + Self::count_query(&self.query_counts, "update:queued_instances_expired_unlock"); + let rows = sqlx::query( + r#" + WITH expired AS ( + SELECT instance_id + FROM queued_instances + WHERE lock_uuid IS NOT NULL + AND lock_expires_at <= $1 + ORDER BY lock_expires_at, scheduled_at, created_at + LIMIT $2 + FOR UPDATE SKIP LOCKED + ) + UPDATE queued_instances AS qi + SET lock_uuid = NULL, + lock_expires_at = NULL + FROM expired + WHERE qi.instance_id = expired.instance_id + RETURNING qi.instance_id + "#, + ) + .bind(now) + .bind(size as i64) + .fetch_all(&mut *tx) + .await?; + + if !rows.is_empty() { + let instance_ids: Vec = rows.iter().map(|row| row.get("instance_id")).collect(); + sqlx::query( + "UPDATE runner_instances SET current_status = $2 WHERE instance_id = ANY($1) AND result IS NULL AND error IS NULL", + ) + .bind(&instance_ids) + .bind(INSTANCE_STATUS_QUEUED) + .execute(&mut *tx) + .await?; + } + + tx.commit().await?; + + if !rows.is_empty() { + Self::count_batch_size( + &self.batch_size_counts, + "update:queued_instances_expired_unlock", + rows.len(), + ); + } + + Ok(rows.len()) + } + + /// Delete old finished instances and their action attempt rows. 
+ #[obs] + pub async fn collect_done_instances_impl( + &self, + older_than: DateTime, + limit: usize, + ) -> BackendResult { + if limit == 0 { + return Ok(GarbageCollectionResult::default()); + } + + let mut tx = self.pool.begin().await?; + Self::count_query(&self.query_counts, "select:runner_instances_gc_candidates"); + let candidate_rows = sqlx::query( + r#" + SELECT instance_id, state + FROM runner_instances + WHERE created_at < $1 + AND (result IS NOT NULL OR error IS NOT NULL) + ORDER BY created_at, instance_id + LIMIT $2 + FOR UPDATE SKIP LOCKED + "#, + ) + .bind(older_than) + .bind(limit as i64) + .fetch_all(&mut *tx) + .await?; + + if candidate_rows.is_empty() { + tx.commit().await?; + return Ok(GarbageCollectionResult::default()); + } + + let mut instance_ids = Vec::with_capacity(candidate_rows.len()); + let mut action_execution_ids = Vec::new(); + for row in candidate_rows { + let instance_id: Uuid = row.get("instance_id"); + let state_payload: Option> = row.get("state"); + instance_ids.push(instance_id); + + let Some(state_payload) = state_payload else { + continue; + }; + match Self::deserialize::(&state_payload) { + Ok(graph) => { + for (execution_id, node) in graph.nodes { + if node.is_action_call() { + action_execution_ids.push(execution_id); + } + } + } + Err(err) => { + warn!( + %instance_id, + error = %err, + "failed to decode runner state while collecting garbage" + ); + } + } + } + + action_execution_ids.sort_unstable(); + action_execution_ids.dedup(); + let deleted_actions = if action_execution_ids.is_empty() { + 0 + } else { + Self::count_query(&self.query_counts, "delete:runner_actions_done_gc"); + let result = + sqlx::query("DELETE FROM runner_actions_done WHERE execution_id = ANY($1)") + .bind(&action_execution_ids) + .execute(&mut *tx) + .await?; + let rows = result.rows_affected() as usize; + Self::count_batch_size( + &self.batch_size_counts, + "delete:runner_actions_done_gc", + rows, + ); + rows + }; + + 
Self::count_query(&self.query_counts, "delete:queued_instances_gc"); + let _ = sqlx::query("DELETE FROM queued_instances WHERE instance_id = ANY($1)") + .bind(&instance_ids) + .execute(&mut *tx) + .await?; + + Self::count_query(&self.query_counts, "delete:runner_instances_gc"); + let deleted_instances_result = + sqlx::query("DELETE FROM runner_instances WHERE instance_id = ANY($1)") + .bind(&instance_ids) + .execute(&mut *tx) + .await?; + let deleted_instances = deleted_instances_result.rows_affected() as usize; + Self::count_batch_size( + &self.batch_size_counts, + "delete:runner_instances_gc", + deleted_instances, + ); + tx.commit().await?; + + Ok(GarbageCollectionResult { + deleted_instances, + deleted_actions, + }) + } + + #[obs] + async fn save_graphs_impl( + &self, + claim: LockClaim, + graphs: &[GraphUpdate], + ) -> BackendResult> { + retry_transient_backend("save_graphs_impl", || { + let claim = claim.clone(); + async move { self.save_graphs_once(claim, graphs).await } + }) + .await + } + + async fn save_graphs_once( + &self, + claim: LockClaim, + graphs: &[GraphUpdate], + ) -> BackendResult> { + if graphs.is_empty() { + return Ok(Vec::new()); + } + let mut payloads = Vec::with_capacity(graphs.len()); + for graph in graphs { + payloads.push(( + graph.instance_id, + graph.next_scheduled_at(), + claim.lock_expires_at, + Self::serialize(graph)?, + )); + } + + Self::count_query(&self.query_counts, "update:queued_instances_scheduled_at"); + Self::count_batch_size( + &self.batch_size_counts, + "update:queued_instances_scheduled_at", + payloads.len(), + ); + let now = Utc::now(); + let mut schedule_builder: QueryBuilder = QueryBuilder::new( + "UPDATE queued_instances AS qi SET scheduled_at = v.scheduled_at, lock_expires_at = CASE WHEN qi.lock_expires_at IS NULL OR qi.lock_expires_at < v.lock_expires_at THEN v.lock_expires_at ELSE qi.lock_expires_at END FROM (", + ); + schedule_builder.push_values( + payloads.iter(), + |mut b, (instance_id, scheduled_at, 
lock_expires_at, _payload)| { + b.push_bind(*instance_id) + .push_bind(*scheduled_at) + .push_bind(*lock_expires_at); + }, + ); + schedule_builder.push( + ") AS v(instance_id, scheduled_at, lock_expires_at) + WHERE qi.instance_id = v.instance_id + AND qi.lock_uuid = ", + ); + schedule_builder.push_bind(claim.lock_uuid); + schedule_builder.push(" AND (qi.lock_expires_at IS NULL OR qi.lock_expires_at > "); + schedule_builder.push_bind(now); + schedule_builder.push(")"); + schedule_builder.build().execute(&self.pool).await?; + + Self::count_query(&self.query_counts, "update:runner_instances_state"); + Self::count_batch_size( + &self.batch_size_counts, + "update:runner_instances_state", + payloads.len(), + ); + let mut runner_builder: QueryBuilder = + QueryBuilder::new("UPDATE runner_instances AS ri SET state = v.state FROM ("); + runner_builder.push_values( + payloads.iter(), + |mut b, (instance_id, _scheduled_at, _lock_expires_at, payload)| { + b.push_bind(*instance_id).push_bind(payload.as_slice()); + }, + ); + runner_builder.push( + ") AS v(instance_id, state) + JOIN queued_instances qi ON qi.instance_id = v.instance_id + WHERE ri.instance_id = v.instance_id + AND qi.lock_uuid = ", + ); + runner_builder.push_bind(claim.lock_uuid); + runner_builder.push(" AND (qi.lock_expires_at IS NULL OR qi.lock_expires_at > "); + runner_builder.push_bind(now); + runner_builder.push(")"); + runner_builder.build().execute(&self.pool).await?; + + let ids: Vec = graphs.iter().map(|graph| graph.instance_id).collect(); + let lock_rows = sqlx::query( + "SELECT instance_id, lock_uuid, lock_expires_at FROM queued_instances WHERE instance_id = ANY($1)", + ) + .bind(&ids) + .fetch_all(&self.pool) + .await?; + + let mut lock_map: HashMap = HashMap::new(); + for row in lock_rows { + let instance_id: Uuid = row.get(0); + lock_map.insert( + instance_id, + InstanceLockStatus { + instance_id, + lock_uuid: row.get(1), + lock_expires_at: row.get(2), + }, + ); + } + + let mut locks = 
Vec::with_capacity(ids.len()); + for instance_id in ids { + locks.push( + lock_map + .get(&instance_id) + .cloned() + .unwrap_or(InstanceLockStatus { + instance_id, + lock_uuid: None, + lock_expires_at: None, + }), + ); + } + Ok(locks) + } + + #[obs] + async fn save_actions_done_impl(&self, actions: &[ActionDone]) -> BackendResult<()> { + if actions.is_empty() { + return Ok(()); + } + Self::count_query(&self.query_counts, "insert:runner_actions_done"); + Self::count_batch_size( + &self.batch_size_counts, + "insert:runner_actions_done", + actions.len(), + ); + let mut payloads = Vec::new(); + for action in actions { + payloads.push(( + action.execution_id, + action.attempt, + action.status.to_string(), + action.started_at, + action.completed_at, + action.duration_ms, + Self::serialize(&action.result)?, + )); + } + let mut builder: QueryBuilder = QueryBuilder::new( + "INSERT INTO runner_actions_done (execution_id, attempt, status, started_at, completed_at, duration_ms, result) ", + ); + builder.push_values( + payloads.iter(), + |mut b, (execution_id, attempt, status, started_at, completed_at, duration_ms, payload)| { + b.push_bind(*execution_id) + .push_bind(*attempt) + .push_bind(status.as_str()) + .push_bind(*started_at) + .push_bind(*completed_at) + .push_bind(*duration_ms) + .push_bind(payload.as_slice()); + }, + ); + builder.build().execute(&self.pool).await?; + Ok(()) + } + + #[obs] + async fn get_queued_instances_impl( + &self, + size: usize, + claim: LockClaim, + ) -> BackendResult { + retry_transient_backend("get_queued_instances_impl", || { + let claim = claim.clone(); + async move { self.get_queued_instances_once(size, claim).await } + }) + .await + } + + async fn get_queued_instances_once( + &self, + size: usize, + claim: LockClaim, + ) -> BackendResult { + if size == 0 { + return Ok(QueuedInstanceBatch { + instances: Vec::new(), + }); + } + let now = Utc::now(); + let mut tx = self.pool.begin().await?; + Self::count_query(&self.query_counts, 
"select:queued_instances"); + let rows = sqlx::query( + r#" + WITH claimed AS ( + SELECT instance_id, payload + FROM queued_instances + WHERE scheduled_at <= $1 + AND (lock_uuid IS NULL OR lock_expires_at <= $1) + ORDER BY scheduled_at, created_at + LIMIT $2 + FOR UPDATE SKIP LOCKED + ), + updated AS ( + UPDATE queued_instances AS qi + SET lock_uuid = $3, + lock_expires_at = $4 + FROM claimed + WHERE qi.instance_id = claimed.instance_id + RETURNING qi.instance_id, claimed.payload + ) + SELECT updated.instance_id, updated.payload, ri.state + FROM updated + JOIN runner_instances ri ON ri.instance_id = updated.instance_id + "#, + ) + .bind(now) + .bind(size as i64) + .bind(claim.lock_uuid) + .bind(claim.lock_expires_at) + .fetch_all(&mut *tx) + .await?; + + if rows.is_empty() { + tx.commit().await?; + return Ok(QueuedInstanceBatch { + instances: Vec::new(), + }); + } + + let claimed_instance_ids: Vec = + rows.iter().map(|row| row.get("instance_id")).collect(); + sqlx::query("UPDATE runner_instances SET current_status = $2 WHERE instance_id = ANY($1)") + .bind(&claimed_instance_ids) + .bind(INSTANCE_STATUS_RUNNING) + .execute(&mut *tx) + .await?; + + Self::count_batch_size( + &self.batch_size_counts, + "select:queued_instances", + rows.len(), + ); + tx.commit().await?; + + let mut instances = Vec::new(); + let mut action_node_ids_by_instance: HashMap> = HashMap::new(); + let mut all_action_node_ids: Vec = Vec::new(); + for row in rows { + let instance_id: Uuid = row.get(0); + let payload: Vec = row.get(1); + let state_payload: Option> = row.get(2); + let mut instance: QueuedInstance = Self::deserialize(&payload)?; + instance.instance_id = instance_id; + if let Some(state_payload) = state_payload { + let graph: GraphUpdate = Self::deserialize(&state_payload)?; + let action_node_ids: Vec = graph + .nodes + .iter() + .filter_map(|(node_id, node)| node.is_action_call().then_some(*node_id)) + .collect(); + if !action_node_ids.is_empty() { + 
all_action_node_ids.extend(action_node_ids.iter().copied()); + action_node_ids_by_instance.insert(instance_id, action_node_ids); + } + instance.state = Some(RunnerState::new( + None, + Some(graph.nodes), + Some(graph.edges), + false, + )); + } + instances.push(instance); + } + + if !all_action_node_ids.is_empty() { + all_action_node_ids.sort_unstable(); + all_action_node_ids.dedup(); + + Self::count_query( + &self.query_counts, + "select:runner_actions_done_by_execution_id", + ); + let rows = sqlx::query( + r#" + SELECT DISTINCT ON (execution_id) + execution_id, + result + FROM runner_actions_done + WHERE execution_id = ANY($1) + ORDER BY execution_id, attempt DESC, id DESC + "#, + ) + .bind(&all_action_node_ids) + .fetch_all(&self.pool) + .await?; + + let mut action_results_by_execution_id: HashMap = + HashMap::new(); + for row in rows { + let execution_id: Uuid = row.get("execution_id"); + let result_payload: Option> = row.get("result"); + let Some(result_payload) = result_payload else { + continue; + }; + let result: serde_json::Value = Self::deserialize(&result_payload)?; + action_results_by_execution_id.insert(execution_id, result); + } + + for instance in &mut instances { + let Some(action_node_ids) = action_node_ids_by_instance.get(&instance.instance_id) + else { + continue; + }; + for node_id in action_node_ids { + if let Some(result) = action_results_by_execution_id.get(node_id) { + instance.action_results.insert(*node_id, result.clone()); + } + } + } + } + + Ok(QueuedInstanceBatch { instances }) + } + + #[obs] + async fn save_instances_done_impl(&self, instances: &[InstanceDone]) -> BackendResult<()> { + retry_transient_backend("save_instances_done_impl", || async move { + self.save_instances_done_once(instances).await + }) + .await + } + + async fn save_instances_done_once(&self, instances: &[InstanceDone]) -> BackendResult<()> { + if instances.is_empty() { + return Ok(()); + } + let ids: Vec = instances + .iter() + .map(|instance| instance.executor_id) 
+ .collect(); + + let mut tx = self.pool.begin().await?; + Self::count_query(&self.query_counts, "delete:queued_instances_by_id"); + sqlx::query("DELETE FROM queued_instances WHERE instance_id = ANY($1)") + .bind(&ids) + .execute(&mut *tx) + .await?; + + Self::count_query(&self.query_counts, "update:runner_instances_result"); + Self::count_batch_size( + &self.batch_size_counts, + "update:runner_instances_result", + instances.len(), + ); + let mut payloads = Vec::with_capacity(instances.len()); + for instance in instances { + let current_status = instance_done_status(instance); + let result = match &instance.result { + Some(value) => Some(Self::serialize(value)?), + None => None, + }; + let error = match &instance.error { + Some(value) => Some(Self::serialize(value)?), + None => None, + }; + payloads.push((instance.executor_id, current_status, result, error)); + } + let mut builder: QueryBuilder = QueryBuilder::new( + "UPDATE runner_instances AS ri SET result = v.result, error = v.error, current_status = v.current_status FROM (", + ); + builder.push_values( + payloads.iter(), + |mut b, (instance_id, current_status, result, error)| { + b.push_bind(*instance_id) + .push_bind(*current_status) + .push_bind(result.as_deref()) + .push_bind(error.as_deref()); + }, + ); + builder.push( + ") AS v(instance_id, current_status, result, error) WHERE ri.instance_id = v.instance_id", + ); + builder.build().execute(&mut *tx).await?; + tx.commit().await?; + Ok(()) + } +} + +#[async_trait::async_trait] +impl waymark_core_backend::CoreBackend for PostgresBackend { + fn clone_box(&self) -> Box { + Box::new(self.clone()) + } + + async fn save_graphs( + &self, + claim: waymark_core_backend::LockClaim, + graphs: &[waymark_core_backend::GraphUpdate], + ) -> BackendResult> { + self.save_graphs_impl(claim, graphs).await + } + + async fn save_actions_done(&self, actions: &[ActionDone]) -> BackendResult<()> { + self.save_actions_done_impl(actions).await + } + + async fn get_queued_instances( + 
&self, + size: usize, + claim: LockClaim, + ) -> BackendResult { + self.get_queued_instances_impl(size, claim).await + } + + async fn save_instances_done(&self, instances: &[InstanceDone]) -> BackendResult<()> { + self.save_instances_done_impl(instances).await + } + + async fn refresh_instance_locks( + &self, + claim: LockClaim, + instance_ids: &[Uuid], + ) -> BackendResult> { + retry_transient_backend("refresh_instance_locks", || { + let claim = claim.clone(); + async move { self.refresh_instance_locks_once(claim, instance_ids).await } + }) + .await + } + + async fn release_instance_locks( + &self, + lock_uuid: Uuid, + instance_ids: &[Uuid], + ) -> BackendResult<()> { + if instance_ids.is_empty() { + return Ok(()); + } + Self::count_query(&self.query_counts, "update:queued_instances_release"); + let released_rows = sqlx::query( + r#" + WITH releasable AS ( + SELECT instance_id + FROM queued_instances + WHERE instance_id = ANY($1) + AND lock_uuid = $2 + FOR UPDATE SKIP LOCKED + ), + released AS ( + UPDATE queued_instances AS qi + SET lock_uuid = NULL, + lock_expires_at = NULL + FROM releasable + WHERE qi.instance_id = releasable.instance_id + RETURNING qi.instance_id + ) + SELECT instance_id FROM released + "#, + ) + .bind(instance_ids) + .bind(lock_uuid) + .fetch_all(&self.pool) + .await?; + + if !released_rows.is_empty() { + let released_instance_ids: Vec = released_rows + .iter() + .map(|row| row.get("instance_id")) + .collect(); + sqlx::query( + "UPDATE runner_instances SET current_status = $2 WHERE instance_id = ANY($1) AND result IS NULL AND error IS NULL", + ) + .bind(&released_instance_ids) + .bind(INSTANCE_STATUS_QUEUED) + .execute(&self.pool) + .await?; + } + + Ok(()) + } + + async fn queue_instances( + &self, + instances: &[waymark_core_backend::QueuedInstance], + ) -> BackendResult<()> { + PostgresBackend::queue_instances(self, instances).await + } +} + +impl PostgresBackend { + async fn refresh_instance_locks_once( + &self, + claim: LockClaim, + 
instance_ids: &[Uuid], + ) -> BackendResult> { + if instance_ids.is_empty() { + return Ok(Vec::new()); + } + Self::count_query(&self.query_counts, "update:queued_instances_lock"); + sqlx::query( + r#" + WITH claimable AS ( + SELECT instance_id + FROM queued_instances + WHERE instance_id = ANY($2) + AND lock_uuid = $3 + FOR UPDATE SKIP LOCKED + ) + UPDATE queued_instances AS qi + SET lock_expires_at = $1 + FROM claimable + WHERE qi.instance_id = claimable.instance_id + "#, + ) + .bind(claim.lock_expires_at) + .bind(instance_ids) + .bind(claim.lock_uuid) + .execute(&self.pool) + .await?; + let rows = sqlx::query( + "SELECT instance_id, lock_uuid, lock_expires_at FROM queued_instances WHERE instance_id = ANY($1)", + ) + .bind(instance_ids) + .fetch_all(&self.pool) + .await?; + let mut locks = Vec::with_capacity(rows.len()); + for row in rows { + locks.push(InstanceLockStatus { + instance_id: row.get(0), + lock_uuid: row.get(1), + lock_expires_at: row.get(2), + }); + } + Ok(locks) + } +} + +#[async_trait::async_trait] +impl GarbageCollectorBackend for PostgresBackend { + async fn collect_done_instances( + &self, + older_than: DateTime, + limit: usize, + ) -> BackendResult { + self.collect_done_instances_impl(older_than, limit).await + } +} + +#[async_trait::async_trait] +impl WorkerStatusBackend for PostgresBackend { + async fn upsert_worker_status(&self, status: &WorkerStatusUpdate) -> BackendResult<()> { + PostgresBackend::upsert_worker_status(self, status).await + } +} + +#[cfg(test)] +mod tests { + use std::collections::{HashMap, HashSet}; + use std::sync::Arc; + use std::sync::atomic::{AtomicUsize, Ordering}; + use std::time::Duration as StdDuration; + + use chrono::{DateTime, Duration, Utc}; + use serial_test::serial; + use sqlx::Row; + use uuid::Uuid; + use waymark_core_backend::{ActionAttemptStatus, CoreBackend}; + + use super::super::test_helpers::setup_backend; + use super::*; + + use waymark_dag::EdgeType; + use waymark_runner_state::{ActionCallSpec, 
ExecutionNode, NodeStatus}; + + fn sample_runner_state() -> RunnerState { + RunnerState::new(None, None, None, false) + } + + fn sample_queued_instance(instance_id: Uuid, entry_node: Uuid) -> QueuedInstance { + QueuedInstance { + workflow_version_id: Uuid::new_v4(), + schedule_id: None, + dag: None, + entry_node, + state: Some(sample_runner_state()), + action_results: HashMap::new(), + instance_id, + scheduled_at: Some(Utc::now() - Duration::seconds(1)), + } + } + + fn sample_execution_node(node_id: Uuid) -> ExecutionNode { + ExecutionNode { + node_id, + node_type: "action_call".to_string(), + label: "@tests.action()".to_string(), + status: NodeStatus::Queued, + template_id: Some("n0".to_string()), + targets: Vec::new(), + action: Some(ActionCallSpec { + action_name: "tests.action".to_string(), + module_name: Some("tests".to_string()), + kwargs: HashMap::new(), + }), + value_expr: None, + assignments: HashMap::new(), + action_attempt: 1, + started_at: None, + completed_at: None, + scheduled_at: Some(Utc::now() + Duration::seconds(15)), + } + } + + fn sample_lock_claim() -> LockClaim { + LockClaim { + lock_uuid: Uuid::new_v4(), + lock_expires_at: Utc::now() + Duration::seconds(30), + } + } + + async fn insert_workflow_version_row( + backend: &PostgresBackend, + workflow_version_id: Uuid, + workflow_name: &str, + ) { + sqlx::query( + "INSERT INTO workflow_versions (id, workflow_name, workflow_version, ir_hash, program_proto, concurrent) VALUES ($1, $2, $3, $4, $5, $6)", + ) + .bind(workflow_version_id) + .bind(workflow_name) + .bind("v1") + .bind(format!("hash-{workflow_name}")) + .bind(vec![0_u8]) + .bind(false) + .execute(backend.pool()) + .await + .expect("insert workflow version row"); + } + + async fn claim_instance(backend: &PostgresBackend, instance_id: Uuid) -> LockClaim { + let claim = sample_lock_claim(); + let batch = CoreBackend::get_queued_instances(backend, 10, claim.clone()) + .await + .expect("claim queued instance"); + 
assert_eq!(batch.instances.len(), 1); + assert_eq!(batch.instances[0].instance_id, instance_id); + claim + } + + #[serial(postgres)] + #[tokio::test] + async fn core_queue_instances_happy_path() { + let backend = setup_backend().await; + let instance_id = Uuid::new_v4(); + let entry_node = Uuid::new_v4(); + let queued = sample_queued_instance(instance_id, entry_node); + let expected_workflow_version_id = queued.workflow_version_id; + + CoreBackend::queue_instances(&backend, &[queued]) + .await + .expect("queue instances"); + + let queued_count: i64 = + sqlx::query_scalar("SELECT COUNT(*) FROM queued_instances WHERE instance_id = $1") + .bind(instance_id) + .fetch_one(backend.pool()) + .await + .expect("queued count"); + assert_eq!(queued_count, 1); + + let runner_count: i64 = + sqlx::query_scalar("SELECT COUNT(*) FROM runner_instances WHERE instance_id = $1") + .bind(instance_id) + .fetch_one(backend.pool()) + .await + .expect("runner count"); + assert_eq!(runner_count, 1); + + let workflow_version_id: Option = sqlx::query_scalar( + "SELECT workflow_version_id FROM runner_instances WHERE instance_id = $1", + ) + .bind(instance_id) + .fetch_one(backend.pool()) + .await + .expect("runner workflow version"); + assert_eq!(workflow_version_id, Some(expected_workflow_version_id)); + + let runner_status: Option = sqlx::query_scalar( + "SELECT current_status FROM runner_instances WHERE instance_id = $1", + ) + .bind(instance_id) + .fetch_one(backend.pool()) + .await + .expect("runner current status"); + assert_eq!(runner_status.as_deref(), Some(INSTANCE_STATUS_QUEUED)); + + let queued_status: Option = sqlx::query_scalar( + "SELECT current_status FROM queued_instances WHERE instance_id = $1", + ) + .bind(instance_id) + .fetch_one(backend.pool()) + .await + .expect("queued current status"); + assert_eq!(queued_status.as_deref(), Some(INSTANCE_STATUS_QUEUED)); + } + + #[serial(postgres)] + #[tokio::test] + async fn core_queue_instances_persists_workflow_name_when_registered() 
{ + let backend = setup_backend().await; + let instance_id = Uuid::new_v4(); + let entry_node = Uuid::new_v4(); + let workflow_version_id = Uuid::new_v4(); + insert_workflow_version_row(&backend, workflow_version_id, "tests.searchable").await; + + let queued = QueuedInstance { + workflow_version_id, + schedule_id: None, + dag: None, + entry_node, + state: Some(sample_runner_state()), + action_results: HashMap::new(), + instance_id, + scheduled_at: Some(Utc::now()), + }; + + CoreBackend::queue_instances(&backend, &[queued]) + .await + .expect("queue instances"); + + let runner_workflow_name: Option = + sqlx::query_scalar("SELECT workflow_name FROM runner_instances WHERE instance_id = $1") + .bind(instance_id) + .fetch_one(backend.pool()) + .await + .expect("runner workflow_name"); + assert_eq!(runner_workflow_name.as_deref(), Some("tests.searchable")); + + let queued_workflow_name: Option = + sqlx::query_scalar("SELECT workflow_name FROM queued_instances WHERE instance_id = $1") + .bind(instance_id) + .fetch_one(backend.pool()) + .await + .expect("queued workflow_name"); + assert_eq!(queued_workflow_name.as_deref(), Some("tests.searchable")); + } + + #[serial(postgres)] + #[tokio::test] + async fn core_get_queued_instances_updates_runner_status_without_mutating_queue_status() { + let backend = setup_backend().await; + let instance_id = Uuid::new_v4(); + let entry_node = Uuid::new_v4(); + let queued = sample_queued_instance(instance_id, entry_node); + CoreBackend::queue_instances(&backend, &[queued]) + .await + .expect("queue instances"); + + let claim = sample_lock_claim(); + let batch = CoreBackend::get_queued_instances(&backend, 1, claim.clone()) + .await + .expect("get queued instances"); + assert_eq!(batch.instances.len(), 1); + assert_eq!(batch.instances[0].instance_id, instance_id); + + let row = sqlx::query("SELECT lock_uuid FROM queued_instances WHERE instance_id = $1") + .bind(instance_id) + .fetch_one(backend.pool()) + .await + .expect("queued lock row"); 
+ let lock_uuid: Option = row.get("lock_uuid"); + assert_eq!(lock_uuid, Some(claim.lock_uuid)); + + let queued_status: Option = sqlx::query_scalar( + "SELECT current_status FROM queued_instances WHERE instance_id = $1", + ) + .bind(instance_id) + .fetch_one(backend.pool()) + .await + .expect("queued current status"); + assert_eq!(queued_status.as_deref(), Some(INSTANCE_STATUS_QUEUED)); + + let runner_status: Option = sqlx::query_scalar( + "SELECT current_status FROM runner_instances WHERE instance_id = $1", + ) + .bind(instance_id) + .fetch_one(backend.pool()) + .await + .expect("runner current status"); + assert_eq!(runner_status.as_deref(), Some(INSTANCE_STATUS_RUNNING)); + } + + #[serial(postgres)] + #[tokio::test] + async fn core_get_queued_instances_restores_action_results_from_actions_done() { + let backend = setup_backend().await; + let instance_id = Uuid::new_v4(); + let entry_node = Uuid::new_v4(); + CoreBackend::queue_instances(&backend, &[sample_queued_instance(instance_id, entry_node)]) + .await + .expect("queue instances"); + + let initial_claim = sample_lock_claim(); + let initial_batch = CoreBackend::get_queued_instances(&backend, 1, initial_claim.clone()) + .await + .expect("initial claim"); + assert_eq!(initial_batch.instances.len(), 1); + + let execution_id = Uuid::new_v4(); + let mut completed_action_node = sample_execution_node(execution_id); + completed_action_node.status = NodeStatus::Completed; + completed_action_node.scheduled_at = None; + + let graph = GraphUpdate { + instance_id, + nodes: HashMap::from([(execution_id, completed_action_node)]), + edges: std::collections::HashSet::new(), + }; + CoreBackend::save_graphs( + &backend, + initial_claim.clone(), + std::slice::from_ref(&graph), + ) + .await + .expect("persist graph"); + + CoreBackend::save_actions_done( + &backend, + &[ActionDone { + execution_id, + attempt: 1, + status: ActionAttemptStatus::Completed, + started_at: None, + completed_at: Some(Utc::now()), + duration_ms: None, + 
result: serde_json::json!({"ok": true}), + }], + ) + .await + .expect("persist action result"); + + CoreBackend::release_instance_locks(&backend, initial_claim.lock_uuid, &[instance_id]) + .await + .expect("release initial lock"); + + let queued_status: Option = sqlx::query_scalar( + "SELECT current_status FROM queued_instances WHERE instance_id = $1", + ) + .bind(instance_id) + .fetch_one(backend.pool()) + .await + .expect("queued current status after release"); + assert_eq!(queued_status.as_deref(), Some(INSTANCE_STATUS_QUEUED)); + + let runner_status: Option = sqlx::query_scalar( + "SELECT current_status FROM runner_instances WHERE instance_id = $1", + ) + .bind(instance_id) + .fetch_one(backend.pool()) + .await + .expect("runner current status after release"); + assert_eq!(runner_status.as_deref(), Some(INSTANCE_STATUS_QUEUED)); + + let second_claim = sample_lock_claim(); + let batch = CoreBackend::get_queued_instances(&backend, 1, second_claim) + .await + .expect("rehydrate instance"); + assert_eq!(batch.instances.len(), 1); + assert_eq!( + batch.instances[0].action_results.get(&execution_id), + Some(&serde_json::json!({"ok": true})) + ); + } + + #[serial(postgres)] + #[tokio::test] + async fn core_save_graphs_happy_path() { + let backend = setup_backend().await; + let instance_id = Uuid::new_v4(); + let entry_node = Uuid::new_v4(); + CoreBackend::queue_instances(&backend, &[sample_queued_instance(instance_id, entry_node)]) + .await + .expect("queue instances"); + let claim = claim_instance(&backend, instance_id).await; + + let execution_id = Uuid::new_v4(); + let mut nodes = HashMap::new(); + nodes.insert(execution_id, sample_execution_node(execution_id)); + let graph = GraphUpdate { + instance_id, + nodes, + edges: std::collections::HashSet::from([waymark_runner_state::ExecutionEdge { + source: execution_id, + target: execution_id, + edge_type: EdgeType::StateMachine, + }]), + }; + let extended_claim = LockClaim { + lock_uuid: claim.lock_uuid, + 
lock_expires_at: claim.lock_expires_at + Duration::seconds(120), + }; + + let locks = CoreBackend::save_graphs( + &backend, + extended_claim.clone(), + std::slice::from_ref(&graph), + ) + .await + .expect("save graphs"); + assert_eq!(locks.len(), 1); + assert_eq!(locks[0].instance_id, instance_id); + assert_eq!(locks[0].lock_uuid, Some(claim.lock_uuid)); + assert_eq!( + locks[0] + .lock_expires_at + .map(|value| value.timestamp_micros()), + Some(extended_claim.lock_expires_at.timestamp_micros()), + ); + + let state_payload: Option> = + sqlx::query_scalar("SELECT state FROM runner_instances WHERE instance_id = $1") + .bind(instance_id) + .fetch_one(backend.pool()) + .await + .expect("runner state payload"); + let decoded: GraphUpdate = rmp_serde::from_slice(&state_payload.expect("state payload")) + .expect("decode graph update"); + assert_eq!(decoded.nodes.len(), 1); + assert_eq!(decoded.edges.len(), 1); + } + + #[serial(postgres)] + #[tokio::test] + async fn core_save_graphs_returns_lock_status_for_duplicate_instance_updates() { + let backend = setup_backend().await; + let instance_id = Uuid::new_v4(); + let entry_node = Uuid::new_v4(); + CoreBackend::queue_instances(&backend, &[sample_queued_instance(instance_id, entry_node)]) + .await + .expect("queue instances"); + let claim = claim_instance(&backend, instance_id).await; + + let first_node_id = Uuid::new_v4(); + let second_node_id = Uuid::new_v4(); + let first_graph = GraphUpdate { + instance_id, + nodes: HashMap::from([(first_node_id, sample_execution_node(first_node_id))]), + edges: HashSet::new(), + }; + let second_graph = GraphUpdate { + instance_id, + nodes: HashMap::from([(second_node_id, sample_execution_node(second_node_id))]), + edges: HashSet::new(), + }; + + let locks = CoreBackend::save_graphs( + &backend, + claim.clone(), + &[first_graph.clone(), second_graph.clone()], + ) + .await + .expect("save duplicate instance graphs"); + assert_eq!(locks.len(), 2); + assert_eq!(locks[0].instance_id, 
instance_id); + assert_eq!(locks[1].instance_id, instance_id); + assert_eq!(locks[0].lock_uuid, Some(claim.lock_uuid)); + assert_eq!(locks[1].lock_uuid, Some(claim.lock_uuid)); + } + + #[serial(postgres)] + #[tokio::test] + async fn core_save_actions_done_happy_path() { + let backend = setup_backend().await; + let execution_id = Uuid::new_v4(); + CoreBackend::save_actions_done( + &backend, + &[ActionDone { + execution_id, + attempt: 1, + status: ActionAttemptStatus::Completed, + started_at: None, + completed_at: Some(Utc::now()), + duration_ms: None, + result: serde_json::json!({"ok": true}), + }], + ) + .await + .expect("save actions done"); + + let row = sqlx::query( + "SELECT execution_id, attempt, status, started_at, completed_at, duration_ms, result FROM runner_actions_done WHERE execution_id = $1", + ) + .bind(execution_id) + .fetch_one(backend.pool()) + .await + .expect("action row"); + + assert_eq!(row.get::("execution_id"), execution_id); + assert_eq!(row.get::("attempt"), 1); + assert_eq!(row.get::("status"), "completed"); + assert!( + row.get::>, _>("completed_at") + .is_some() + ); + let payload: Option> = row.get("result"); + let decoded: serde_json::Value = + rmp_serde::from_slice(&payload.expect("action payload")).expect("decode action"); + assert_eq!(decoded, serde_json::json!({"ok": true})); + } + + #[serial(postgres)] + #[tokio::test] + async fn core_refresh_instance_locks_happy_path() { + let backend = setup_backend().await; + let instance_id = Uuid::new_v4(); + let entry_node = Uuid::new_v4(); + CoreBackend::queue_instances(&backend, &[sample_queued_instance(instance_id, entry_node)]) + .await + .expect("queue instances"); + let claim = claim_instance(&backend, instance_id).await; + + let refreshed_expiry = Utc::now() + Duration::seconds(120); + let refreshed = CoreBackend::refresh_instance_locks( + &backend, + LockClaim { + lock_uuid: claim.lock_uuid, + lock_expires_at: refreshed_expiry, + }, + &[instance_id], + ) + .await + .expect("refresh 
locks"); + + assert_eq!(refreshed.len(), 1); + assert_eq!(refreshed[0].instance_id, instance_id); + assert_eq!(refreshed[0].lock_uuid, Some(claim.lock_uuid)); + assert_eq!( + refreshed[0] + .lock_expires_at + .map(|value| value.timestamp_micros()), + Some(refreshed_expiry.timestamp_micros()), + ); + } + + #[serial(postgres)] + #[tokio::test] + async fn core_refresh_instance_locks_skip_locked_does_not_block_or_override() { + let backend = setup_backend().await; + let instance_id = Uuid::new_v4(); + let entry_node = Uuid::new_v4(); + CoreBackend::queue_instances(&backend, &[sample_queued_instance(instance_id, entry_node)]) + .await + .expect("queue instances"); + let claim = claim_instance(&backend, instance_id).await; + + let mut tx = backend.pool().begin().await.expect("begin lock tx"); + sqlx::query("SELECT instance_id FROM queued_instances WHERE instance_id = $1 FOR UPDATE") + .bind(instance_id) + .fetch_one(&mut *tx) + .await + .expect("lock queued row"); + + let refreshed_expiry = Utc::now() + Duration::seconds(120); + let refreshed = tokio::time::timeout( + StdDuration::from_millis(300), + CoreBackend::refresh_instance_locks( + &backend, + LockClaim { + lock_uuid: claim.lock_uuid, + lock_expires_at: refreshed_expiry, + }, + &[instance_id], + ), + ) + .await + .expect("refresh should not block") + .expect("refresh locks"); + + assert_eq!(refreshed.len(), 1); + assert_eq!(refreshed[0].instance_id, instance_id); + assert_eq!(refreshed[0].lock_uuid, Some(claim.lock_uuid)); + assert_eq!( + refreshed[0] + .lock_expires_at + .map(|value| value.timestamp_micros()), + Some(claim.lock_expires_at.timestamp_micros()), + ); + } + + #[serial(postgres)] + #[tokio::test] + async fn core_release_instance_locks_happy_path() { + let backend = setup_backend().await; + let instance_id = Uuid::new_v4(); + let entry_node = Uuid::new_v4(); + CoreBackend::queue_instances(&backend, &[sample_queued_instance(instance_id, entry_node)]) + .await + .expect("queue instances"); + let claim = 
claim_instance(&backend, instance_id).await; + + CoreBackend::release_instance_locks(&backend, claim.lock_uuid, &[instance_id]) + .await + .expect("release locks"); + + let row = sqlx::query( + "SELECT lock_uuid, lock_expires_at FROM queued_instances WHERE instance_id = $1", + ) + .bind(instance_id) + .fetch_one(backend.pool()) + .await + .expect("lock row"); + let lock_uuid: Option = row.get("lock_uuid"); + let lock_expires_at: Option> = row.get("lock_expires_at"); + assert!(lock_uuid.is_none()); + assert!(lock_expires_at.is_none()); + + let queued_status: Option = sqlx::query_scalar( + "SELECT current_status FROM queued_instances WHERE instance_id = $1", + ) + .bind(instance_id) + .fetch_one(backend.pool()) + .await + .expect("queued current status after release"); + assert_eq!(queued_status.as_deref(), Some(INSTANCE_STATUS_QUEUED)); + + let runner_status: Option = sqlx::query_scalar( + "SELECT current_status FROM runner_instances WHERE instance_id = $1", + ) + .bind(instance_id) + .fetch_one(backend.pool()) + .await + .expect("runner current status after release"); + assert_eq!(runner_status.as_deref(), Some(INSTANCE_STATUS_QUEUED)); + } + + #[serial(postgres)] + #[tokio::test] + async fn core_reclaim_expired_instance_locks_happy_path() { + let backend = setup_backend().await; + let expired_id = Uuid::new_v4(); + let live_id = Uuid::new_v4(); + let entry_node = Uuid::new_v4(); + CoreBackend::queue_instances( + &backend, + &[ + sample_queued_instance(expired_id, entry_node), + sample_queued_instance(live_id, entry_node), + ], + ) + .await + .expect("queue instances"); + + let claim = sample_lock_claim(); + let claimed = CoreBackend::get_queued_instances(&backend, 10, claim.clone()) + .await + .expect("claim queued instances"); + assert_eq!(claimed.instances.len(), 2); + + let expired_at = Utc::now() - Duration::seconds(1); + let live_at = Utc::now() + Duration::seconds(60); + sqlx::query( + r#" + UPDATE queued_instances + SET lock_expires_at = CASE + WHEN 
instance_id = $1 THEN $3 + ELSE $4 + END + WHERE instance_id IN ($1, $2) + "#, + ) + .bind(expired_id) + .bind(live_id) + .bind(expired_at) + .bind(live_at) + .execute(backend.pool()) + .await + .expect("set lock expiries"); + + let reclaimed = backend + .reclaim_expired_instance_locks(10) + .await + .expect("reclaim expired locks"); + assert_eq!(reclaimed, 1); + + let rows = sqlx::query( + "SELECT instance_id, lock_uuid, lock_expires_at FROM queued_instances WHERE instance_id IN ($1, $2)", + ) + .bind(expired_id) + .bind(live_id) + .fetch_all(backend.pool()) + .await + .expect("fetch lock rows"); + let mut lock_rows: HashMap, Option>)> = + HashMap::new(); + for row in rows { + let instance_id: Uuid = row.get("instance_id"); + let lock_uuid: Option = row.get("lock_uuid"); + let lock_expires_at: Option> = row.get("lock_expires_at"); + lock_rows.insert(instance_id, (lock_uuid, lock_expires_at)); + } + + let expired_lock = lock_rows.get(&expired_id).expect("expired lock row"); + assert_eq!(*expired_lock, (None, None)); + + let expired_runner_status: Option = sqlx::query_scalar( + "SELECT current_status FROM runner_instances WHERE instance_id = $1", + ) + .bind(expired_id) + .fetch_one(backend.pool()) + .await + .expect("expired runner status"); + assert_eq!( + expired_runner_status.as_deref(), + Some(INSTANCE_STATUS_QUEUED) + ); + + let live_lock = lock_rows.get(&live_id).expect("live lock row"); + assert_eq!(live_lock.0, Some(claim.lock_uuid)); + assert_eq!( + live_lock.1.map(|value| value.timestamp_micros()), + Some(live_at.timestamp_micros()), + ); + + let live_runner_status: Option = sqlx::query_scalar( + "SELECT current_status FROM runner_instances WHERE instance_id = $1", + ) + .bind(live_id) + .fetch_one(backend.pool()) + .await + .expect("live runner status"); + assert_eq!(live_runner_status.as_deref(), Some(INSTANCE_STATUS_RUNNING)); + } + + #[serial(postgres)] + #[tokio::test] + async fn core_save_instances_done_happy_path() { + let backend = 
setup_backend().await; + let instance_id = Uuid::new_v4(); + let entry_node = Uuid::new_v4(); + CoreBackend::queue_instances(&backend, &[sample_queued_instance(instance_id, entry_node)]) + .await + .expect("queue instances"); + + CoreBackend::save_instances_done( + &backend, + &[InstanceDone { + executor_id: instance_id, + entry_node, + result: Some(serde_json::json!({"value": 3})), + error: None, + }], + ) + .await + .expect("save instances done"); + + let result_payload: Option> = + sqlx::query_scalar("SELECT result FROM runner_instances WHERE instance_id = $1") + .bind(instance_id) + .fetch_one(backend.pool()) + .await + .expect("result payload"); + let decoded: serde_json::Value = + rmp_serde::from_slice(&result_payload.expect("stored result")).expect("decode result"); + assert_eq!(decoded, serde_json::json!({"value": 3})); + + let queued_count: i64 = + sqlx::query_scalar("SELECT COUNT(*) FROM queued_instances WHERE instance_id = $1") + .bind(instance_id) + .fetch_one(backend.pool()) + .await + .expect("queued count"); + assert_eq!(queued_count, 0); + + let runner_status: Option = sqlx::query_scalar( + "SELECT current_status FROM runner_instances WHERE instance_id = $1", + ) + .bind(instance_id) + .fetch_one(backend.pool()) + .await + .expect("runner status"); + assert_eq!(runner_status.as_deref(), Some(INSTANCE_STATUS_COMPLETED)); + } + + #[serial(postgres)] + #[tokio::test] + async fn core_save_instances_done_updates_runner_even_if_queue_row_missing() { + let backend = setup_backend().await; + let instance_id = Uuid::new_v4(); + let entry_node = Uuid::new_v4(); + CoreBackend::queue_instances(&backend, &[sample_queued_instance(instance_id, entry_node)]) + .await + .expect("queue instances"); + + sqlx::query("DELETE FROM queued_instances WHERE instance_id = $1") + .bind(instance_id) + .execute(backend.pool()) + .await + .expect("delete queued row"); + + CoreBackend::save_instances_done( + &backend, + &[InstanceDone { + executor_id: instance_id, + entry_node, + 
result: Some(serde_json::json!({"value": 11})), + error: None, + }], + ) + .await + .expect("save instances done without queue row"); + + let runner_status: Option = sqlx::query_scalar( + "SELECT current_status FROM runner_instances WHERE instance_id = $1", + ) + .bind(instance_id) + .fetch_one(backend.pool()) + .await + .expect("runner status"); + assert_eq!(runner_status.as_deref(), Some(INSTANCE_STATUS_COMPLETED)); + } + + #[serial(postgres)] + #[tokio::test] + async fn core_retry_transient_deadlock_sqlstate_happy_path() { + let backend = setup_backend().await; + let pool = backend.pool().clone(); + let attempts = Arc::new(AtomicUsize::new(0)); + let result = retry_transient_backend("core_retry_test", || { + let pool = pool.clone(); + let attempts = Arc::clone(&attempts); + async move { + let attempt = attempts.fetch_add(1, Ordering::SeqCst); + if attempt < 2 { + sqlx::query( + "DO $$ BEGIN RAISE EXCEPTION 'simulated deadlock' USING ERRCODE='40P01'; END $$;", + ) + .execute(&pool) + .await?; + } + Ok(()) + } + }) + .await; + + assert!(result.is_ok()); + assert_eq!(attempts.load(Ordering::SeqCst), 3); + } + + #[serial(postgres)] + #[tokio::test] + async fn core_retry_non_transient_sqlstate_fails_without_retry() { + let backend = setup_backend().await; + let pool = backend.pool().clone(); + let attempts = Arc::new(AtomicUsize::new(0)); + let result = retry_transient_backend("core_retry_non_transient_test", || { + let pool = pool.clone(); + let attempts = Arc::clone(&attempts); + async move { + attempts.fetch_add(1, Ordering::SeqCst); + sqlx::query( + "DO $$ BEGIN RAISE EXCEPTION 'simulated unique violation' USING ERRCODE='23505'; END $$;", + ) + .execute(&pool) + .await?; + Ok::<(), BackendError>(()) + } + }) + .await; + + assert!(result.is_err()); + assert_eq!(attempts.load(Ordering::SeqCst), 1); + } + + #[serial(postgres)] + #[tokio::test] + async fn garbage_collector_deletes_old_done_instances_and_actions() { + let backend = setup_backend().await; + let 
instance_id = Uuid::new_v4(); + let execution_id = Uuid::new_v4(); + let entry_node = Uuid::new_v4(); + let workflow_version_id = Uuid::new_v4(); + + let state = GraphUpdate { + instance_id, + nodes: HashMap::from([(execution_id, sample_execution_node(execution_id))]), + edges: HashSet::new(), + }; + let state_payload = PostgresBackend::serialize(&state).expect("serialize state"); + let result_payload = + PostgresBackend::serialize(&serde_json::json!({"ok": true})).expect("serialize done"); + let action_payload = + PostgresBackend::serialize(&serde_json::json!({"value": 1})).expect("serialize action"); + + sqlx::query( + "INSERT INTO runner_instances (instance_id, entry_node, workflow_version_id, created_at, state, result) VALUES ($1, $2, $3, $4, $5, $6)", + ) + .bind(instance_id) + .bind(entry_node) + .bind(workflow_version_id) + .bind(Utc::now() - Duration::hours(30)) + .bind(state_payload) + .bind(result_payload) + .execute(backend.pool()) + .await + .expect("insert old done instance"); + + sqlx::query( + "INSERT INTO runner_actions_done (execution_id, attempt, status, result) VALUES ($1, $2, $3, $4)", + ) + .bind(execution_id) + .bind(1_i32) + .bind("completed") + .bind(action_payload) + .execute(backend.pool()) + .await + .expect("insert action row"); + + let result = GarbageCollectorBackend::collect_done_instances( + &backend, + Utc::now() - Duration::hours(24), + 100, + ) + .await + .expect("collect done instances"); + + assert_eq!(result.deleted_instances, 1); + assert_eq!(result.deleted_actions, 1); + + let remaining_instances: i64 = + sqlx::query_scalar("SELECT COUNT(*) FROM runner_instances WHERE instance_id = $1") + .bind(instance_id) + .fetch_one(backend.pool()) + .await + .expect("count instances"); + assert_eq!(remaining_instances, 0); + + let remaining_actions: i64 = + sqlx::query_scalar("SELECT COUNT(*) FROM runner_actions_done WHERE execution_id = $1") + .bind(execution_id) + .fetch_one(backend.pool()) + .await + .expect("count actions"); + 
assert_eq!(remaining_actions, 0); + } + + #[serial(postgres)] + #[tokio::test] + async fn garbage_collector_keeps_recent_done_instances() { + let backend = setup_backend().await; + let instance_id = Uuid::new_v4(); + let entry_node = Uuid::new_v4(); + let workflow_version_id = Uuid::new_v4(); + let state_payload = PostgresBackend::serialize(&GraphUpdate { + instance_id, + nodes: HashMap::new(), + edges: HashSet::new(), + }) + .expect("serialize state"); + let result_payload = + PostgresBackend::serialize(&serde_json::json!({"ok": true})).expect("serialize done"); + + sqlx::query( + "INSERT INTO runner_instances (instance_id, entry_node, workflow_version_id, created_at, state, result) VALUES ($1, $2, $3, $4, $5, $6)", + ) + .bind(instance_id) + .bind(entry_node) + .bind(workflow_version_id) + .bind(Utc::now() - Duration::hours(1)) + .bind(state_payload) + .bind(result_payload) + .execute(backend.pool()) + .await + .expect("insert recent done instance"); + + let result = GarbageCollectorBackend::collect_done_instances( + &backend, + Utc::now() - Duration::hours(24), + 100, + ) + .await + .expect("collect done instances"); + + assert_eq!(result.deleted_instances, 0); + assert_eq!(result.deleted_actions, 0); + + let remaining_instances: i64 = + sqlx::query_scalar("SELECT COUNT(*) FROM runner_instances WHERE instance_id = $1") + .bind(instance_id) + .fetch_one(backend.pool()) + .await + .expect("count instances"); + assert_eq!(remaining_instances, 1); + } + + #[serial(postgres)] + #[tokio::test] + async fn worker_status_backend_upsert_worker_status_happy_path() { + let backend = setup_backend().await; + let pool_id = Uuid::new_v4(); + + WorkerStatusBackend::upsert_worker_status( + &backend, + &WorkerStatusUpdate { + pool_id, + throughput_per_min: 180.0, + total_completed: 20, + last_action_at: Some(Utc::now()), + median_dequeue_ms: Some(5), + median_handling_ms: Some(12), + dispatch_queue_size: 3, + total_in_flight: 2, + active_workers: 4, + actions_per_sec: 3.0, + 
median_instance_duration_secs: Some(0.2), + active_instance_count: 1, + total_instances_completed: 8, + instances_per_sec: 0.5, + instances_per_min: 30.0, + time_series: None, + }, + ) + .await + .expect("upsert worker status"); + + let row = sqlx::query( + "SELECT total_completed, active_workers, actions_per_sec FROM worker_status WHERE pool_id = $1", + ) + .bind(pool_id) + .fetch_one(backend.pool()) + .await + .expect("worker status row"); + assert_eq!(row.get::("total_completed"), 20); + assert_eq!(row.get::("active_workers"), 4); + assert_eq!(row.get::("actions_per_sec"), 3.0); + } +} diff --git a/crates/backend-postgres/src/lib.rs b/crates/backend-postgres/src/lib.rs new file mode 100644 index 00000000..2b4e3821 --- /dev/null +++ b/crates/backend-postgres/src/lib.rs @@ -0,0 +1,115 @@ +//! Postgres backend for persisting runner state and action results. + +mod core; +mod registry; +mod scheduler; +#[cfg(test)] +mod test_helpers; +mod webapp; + +use std::collections::HashMap; +use std::sync::{Arc, Mutex}; + +use sqlx::PgPool; +use waymark_backends_core::{BackendError, BackendResult}; +use waymark_observability::obs; + +/// Persist runner state and action results in Postgres. +#[derive(Clone)] +pub struct PostgresBackend { + pool: PgPool, + query_counts: Arc>>, + batch_size_counts: Arc>>>, +} + +impl PostgresBackend { + pub fn new(pool: PgPool) -> Self { + Self { + pool, + query_counts: Arc::new(Mutex::new(HashMap::new())), + batch_size_counts: Arc::new(Mutex::new(HashMap::new())), + } + } + + #[obs] + pub async fn connect(dsn: &str) -> BackendResult { + let pool = PgPool::connect(dsn).await?; + waymark_backend_postgres_migrations::run(&pool) + .await + .map_err(|err| BackendError::Message(err.to_string()))?; + Ok(Self::new(pool)) + } + + pub fn pool(&self) -> &PgPool { + &self.pool + } + + /// Delete all queued instances from the backing table. 
+ #[obs] + pub async fn clear_queue(&self) -> BackendResult<()> { + Self::count_query(&self.query_counts, "delete:queued_instances_all"); + sqlx::query("DELETE FROM queued_instances") + .execute(&self.pool) + .await?; + Ok(()) + } + + /// Delete all persisted runner data for a clean benchmark run. + #[obs] + pub async fn clear_all(&self) -> BackendResult<()> { + Self::count_query(&self.query_counts, "truncate:runner_tables"); + sqlx::query( + r#" + TRUNCATE runner_actions_done, + runner_instances, + queued_instances + RESTART IDENTITY + "#, + ) + .execute(&self.pool) + .await?; + Ok(()) + } + + pub fn query_counts(&self) -> HashMap { + self.query_counts + .lock() + .expect("query counts poisoned") + .clone() + } + + pub fn batch_size_counts(&self) -> HashMap> { + self.batch_size_counts + .lock() + .expect("batch size counts poisoned") + .clone() + } + + pub(crate) fn count_query(counts: &Arc>>, label: &str) { + let mut guard = counts.lock().expect("query counts poisoned"); + *guard.entry(label.to_string()).or_insert(0) += 1; + } + + pub(crate) fn count_batch_size( + counts: &Arc>>>, + label: &str, + size: usize, + ) { + if size == 0 { + return; + } + let mut guard = counts.lock().expect("batch size counts poisoned"); + let entry = guard.entry(label.to_string()).or_default(); + *entry.entry(size).or_insert(0) += 1; + } + + pub(crate) fn serialize(value: &T) -> Result, BackendError> { + rmp_serde::to_vec_named(value).map_err(|e| BackendError::Message(e.to_string())) + } + + pub(crate) fn deserialize( + payload: &[u8], + ) -> Result { + rmp_serde::from_slice(payload).map_err(|e| BackendError::Message(e.to_string())) + } +} diff --git a/crates/backend-postgres/src/registry.rs b/crates/backend-postgres/src/registry.rs new file mode 100644 index 00000000..94fc1e2c --- /dev/null +++ b/crates/backend-postgres/src/registry.rs @@ -0,0 +1,146 @@ +use sqlx::Row; +use uuid::Uuid; +use waymark_backends_core::{BackendError, BackendResult}; +use 
waymark_workflow_registry_backend::{ + WorkflowRegistration, WorkflowRegistryBackend, WorkflowVersion, +}; + +use super::PostgresBackend; + +#[async_trait::async_trait] +impl WorkflowRegistryBackend for PostgresBackend { + async fn upsert_workflow_version( + &self, + registration: &WorkflowRegistration, + ) -> BackendResult { + let inserted = sqlx::query( + r#" + INSERT INTO workflow_versions + (workflow_name, workflow_version, ir_hash, program_proto, concurrent) + VALUES ($1, $2, $3, $4, $5) + ON CONFLICT (workflow_name, workflow_version) + DO NOTHING + RETURNING id + "#, + ) + .bind(®istration.workflow_name) + .bind(®istration.workflow_version) + .bind(®istration.ir_hash) + .bind(®istration.program_proto) + .bind(registration.concurrent) + .fetch_optional(&self.pool) + .await?; + + if let Some(row) = inserted { + let id: Uuid = row.get("id"); + return Ok(id); + } + + let row = sqlx::query( + r#" + SELECT id, ir_hash + FROM workflow_versions + WHERE workflow_name = $1 AND workflow_version = $2 + "#, + ) + .bind(®istration.workflow_name) + .bind(®istration.workflow_version) + .fetch_one(&self.pool) + .await?; + + let id: Uuid = row.get("id"); + let existing_hash: String = row.get("ir_hash"); + if existing_hash != registration.ir_hash { + return Err(BackendError::Message(format!( + "workflow version already exists with different IR hash: {}@{}", + registration.workflow_name, registration.workflow_version + ))); + } + + Ok(id) + } + + async fn get_workflow_versions(&self, ids: &[Uuid]) -> BackendResult> { + if ids.is_empty() { + return Ok(Vec::new()); + } + let rows = sqlx::query( + r#" + SELECT id, workflow_name, workflow_version, ir_hash, program_proto, concurrent + FROM workflow_versions + WHERE id = ANY($1) + "#, + ) + .bind(ids) + .fetch_all(&self.pool) + .await?; + + let mut versions = Vec::with_capacity(rows.len()); + for row in rows { + versions.push(WorkflowVersion { + id: row.get("id"), + workflow_name: row.get("workflow_name"), + workflow_version: 
row.get("workflow_version"), + ir_hash: row.get("ir_hash"), + program_proto: row.get("program_proto"), + concurrent: row.get("concurrent"), + }); + } + Ok(versions) + } +} + +#[cfg(test)] +mod tests { + use serial_test::serial; + + use super::super::test_helpers::setup_backend; + use waymark_workflow_registry_backend::{WorkflowRegistration, WorkflowRegistryBackend}; + + fn sample_registration(version: &str) -> WorkflowRegistration { + WorkflowRegistration { + workflow_name: "tests.workflow".to_string(), + workflow_version: version.to_string(), + ir_hash: format!("hash-{version}"), + program_proto: vec![1, 2, 3, 4], + concurrent: true, + } + } + + #[serial(postgres)] + #[tokio::test] + async fn workflow_registry_upsert_workflow_version_happy_path() { + let backend = setup_backend().await; + let registration = sample_registration("v1"); + + let id = WorkflowRegistryBackend::upsert_workflow_version(&backend, ®istration) + .await + .expect("insert workflow version"); + let repeat_id = WorkflowRegistryBackend::upsert_workflow_version(&backend, ®istration) + .await + .expect("idempotent workflow upsert"); + + assert_eq!(id, repeat_id); + } + + #[serial(postgres)] + #[tokio::test] + async fn workflow_registry_get_workflow_versions_happy_path() { + let backend = setup_backend().await; + let registration = sample_registration("v2"); + let id = WorkflowRegistryBackend::upsert_workflow_version(&backend, ®istration) + .await + .expect("insert workflow version"); + + let versions = WorkflowRegistryBackend::get_workflow_versions(&backend, &[id]) + .await + .expect("get workflow versions"); + assert_eq!(versions.len(), 1); + assert_eq!(versions[0].id, id); + assert_eq!(versions[0].workflow_name, registration.workflow_name); + assert_eq!(versions[0].workflow_version, registration.workflow_version); + assert_eq!(versions[0].ir_hash, registration.ir_hash); + assert_eq!(versions[0].program_proto, registration.program_proto); + assert_eq!(versions[0].concurrent, 
registration.concurrent); + } +} diff --git a/crates/backend-postgres/src/scheduler.rs b/crates/backend-postgres/src/scheduler.rs new file mode 100644 index 00000000..e47f2114 --- /dev/null +++ b/crates/backend-postgres/src/scheduler.rs @@ -0,0 +1,605 @@ +use chrono::{DateTime, Utc}; +use sqlx::Row; +use uuid::Uuid; +use waymark_backends_core::{BackendError, BackendResult}; +use waymark_scheduler_backend::SchedulerBackend; + +use waymark_scheduler_core::compute_next_run; +use waymark_scheduler_core::{CreateScheduleParams, ScheduleId, ScheduleType, WorkflowSchedule}; + +#[async_trait::async_trait] +impl SchedulerBackend for crate::PostgresBackend { + async fn upsert_schedule(&self, params: &CreateScheduleParams) -> BackendResult { + let next_run_at = compute_next_run( + params.schedule_type, + params.cron_expression.as_deref(), + params.interval_seconds, + params.jitter_seconds, + None, + ) + .map_err(BackendError::Message)?; + + let row = sqlx::query( + r#" + INSERT INTO workflow_schedules + (workflow_name, schedule_name, schedule_type, cron_expression, interval_seconds, + jitter_seconds, input_payload, next_run_at, priority, allow_duplicate) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10) + ON CONFLICT (workflow_name, schedule_name) + DO UPDATE SET + schedule_type = EXCLUDED.schedule_type, + cron_expression = EXCLUDED.cron_expression, + interval_seconds = EXCLUDED.interval_seconds, + jitter_seconds = EXCLUDED.jitter_seconds, + input_payload = EXCLUDED.input_payload, + next_run_at = COALESCE(workflow_schedules.next_run_at, EXCLUDED.next_run_at), + priority = EXCLUDED.priority, + allow_duplicate = EXCLUDED.allow_duplicate, + status = 'active', + updated_at = NOW() + RETURNING id + "#, + ) + .bind(¶ms.workflow_name) + .bind(¶ms.schedule_name) + .bind(params.schedule_type.as_str()) + .bind(¶ms.cron_expression) + .bind(params.interval_seconds) + .bind(params.jitter_seconds) + .bind(¶ms.input_payload) + .bind(next_run_at) + .bind(params.priority) + 
.bind(params.allow_duplicate) + .fetch_one(&self.pool) + .await?; + + let id: Uuid = row.get("id"); + Ok(ScheduleId(id)) + } + + async fn get_schedule(&self, id: ScheduleId) -> BackendResult { + let schedule = sqlx::query_as::<_, ScheduleRow>( + r#" + SELECT id, workflow_name, schedule_name, schedule_type, cron_expression, interval_seconds, + jitter_seconds, input_payload, status, next_run_at, last_run_at, last_instance_id, + created_at, updated_at, priority, allow_duplicate + FROM workflow_schedules + WHERE id = $1 + "#, + ) + .bind(id.0) + .fetch_optional(&self.pool) + .await? + .ok_or_else(|| BackendError::Message(format!("schedule not found: {}", id)))?; + + Ok(schedule.into()) + } + + async fn get_schedule_by_name( + &self, + workflow_name: &str, + schedule_name: &str, + ) -> BackendResult> { + let schedule = sqlx::query_as::<_, ScheduleRow>( + r#" + SELECT id, workflow_name, schedule_name, schedule_type, cron_expression, interval_seconds, + jitter_seconds, input_payload, status, next_run_at, last_run_at, last_instance_id, + created_at, updated_at, priority, allow_duplicate + FROM workflow_schedules + WHERE workflow_name = $1 AND schedule_name = $2 AND status != 'deleted' + "#, + ) + .bind(workflow_name) + .bind(schedule_name) + .fetch_optional(&self.pool) + .await?; + + Ok(schedule.map(Into::into)) + } + + async fn list_schedules( + &self, + limit: i64, + offset: i64, + ) -> BackendResult> { + let rows = sqlx::query_as::<_, ScheduleRow>( + r#" + SELECT id, workflow_name, schedule_name, schedule_type, cron_expression, interval_seconds, + jitter_seconds, input_payload, status, next_run_at, last_run_at, last_instance_id, + created_at, updated_at, priority, allow_duplicate + FROM workflow_schedules + WHERE status != 'deleted' + ORDER BY workflow_name, schedule_name + LIMIT $1 OFFSET $2 + "#, + ) + .bind(limit) + .bind(offset) + .fetch_all(&self.pool) + .await?; + + Ok(rows.into_iter().map(Into::into).collect()) + } + + async fn count_schedules(&self) -> 
BackendResult { + let count = sqlx::query_scalar::<_, i64>( + "SELECT COUNT(*) FROM workflow_schedules WHERE status != 'deleted'", + ) + .fetch_one(&self.pool) + .await?; + + Ok(count) + } + + async fn update_schedule_status(&self, id: ScheduleId, status: &str) -> BackendResult { + let result = sqlx::query( + r#" + UPDATE workflow_schedules + SET status = $2, updated_at = NOW() + WHERE id = $1 + "#, + ) + .bind(id.0) + .bind(status) + .execute(&self.pool) + .await?; + + Ok(result.rows_affected() > 0) + } + + async fn delete_schedule(&self, id: ScheduleId) -> BackendResult { + SchedulerBackend::update_schedule_status(self, id, "deleted").await + } + + async fn find_due_schedules(&self, limit: i32) -> BackendResult> { + let rows = sqlx::query_as::<_, ScheduleRow>( + r#" + SELECT id, workflow_name, schedule_name, schedule_type, cron_expression, interval_seconds, + jitter_seconds, input_payload, status, next_run_at, last_run_at, last_instance_id, + created_at, updated_at, priority, allow_duplicate + FROM workflow_schedules + WHERE status = 'active' + AND next_run_at IS NOT NULL + AND next_run_at <= NOW() + ORDER BY next_run_at + FOR UPDATE SKIP LOCKED + LIMIT $1 + "#, + ) + .bind(limit) + .fetch_all(&self.pool) + .await?; + + Ok(rows.into_iter().map(Into::into).collect()) + } + + async fn has_running_instance(&self, schedule_id: ScheduleId) -> BackendResult { + let has_running = sqlx::query_scalar::<_, bool>( + r#" + SELECT EXISTS( + SELECT 1 + FROM runner_instances ri + JOIN queued_instances qi ON qi.instance_id = ri.instance_id + WHERE ri.schedule_id = $1 + ) + "#, + ) + .bind(schedule_id.0) + .fetch_one(&self.pool) + .await?; + + Ok(has_running) + } + + async fn mark_schedule_executed( + &self, + schedule_id: ScheduleId, + instance_id: Uuid, + ) -> BackendResult<()> { + let schedule = SchedulerBackend::get_schedule(self, schedule_id).await?; + let schedule_type = ScheduleType::parse(&schedule.schedule_type) + .ok_or_else(|| BackendError::Message("invalid schedule 
type".to_string()))?; + let next_run_at = compute_next_run( + schedule_type, + schedule.cron_expression.as_deref(), + schedule.interval_seconds, + schedule.jitter_seconds, + Some(Utc::now()), + ) + .map_err(BackendError::Message)?; + + sqlx::query( + r#" + UPDATE workflow_schedules + SET last_run_at = NOW(), + last_instance_id = $2, + next_run_at = $3, + updated_at = NOW() + WHERE id = $1 + "#, + ) + .bind(schedule_id.0) + .bind(instance_id) + .bind(next_run_at) + .execute(&self.pool) + .await?; + + Ok(()) + } + + async fn skip_schedule_run(&self, schedule_id: ScheduleId) -> BackendResult<()> { + let schedule = SchedulerBackend::get_schedule(self, schedule_id).await?; + let schedule_type = ScheduleType::parse(&schedule.schedule_type) + .ok_or_else(|| BackendError::Message("invalid schedule type".to_string()))?; + let next_run_at = compute_next_run( + schedule_type, + schedule.cron_expression.as_deref(), + schedule.interval_seconds, + schedule.jitter_seconds, + Some(Utc::now()), + ) + .map_err(BackendError::Message)?; + + sqlx::query( + r#" + UPDATE workflow_schedules + SET next_run_at = $2, updated_at = NOW() + WHERE id = $1 + "#, + ) + .bind(schedule_id.0) + .bind(next_run_at) + .execute(&self.pool) + .await?; + + Ok(()) + } +} + +#[derive(sqlx::FromRow)] +struct ScheduleRow { + id: Uuid, + workflow_name: String, + schedule_name: String, + schedule_type: String, + cron_expression: Option, + interval_seconds: Option, + jitter_seconds: i64, + input_payload: Option>, + status: String, + next_run_at: Option>, + last_run_at: Option>, + last_instance_id: Option, + created_at: DateTime, + updated_at: DateTime, + priority: i32, + allow_duplicate: bool, +} + +impl From for WorkflowSchedule { + fn from(row: ScheduleRow) -> Self { + Self { + id: row.id, + workflow_name: row.workflow_name, + schedule_name: row.schedule_name, + schedule_type: row.schedule_type, + cron_expression: row.cron_expression, + interval_seconds: row.interval_seconds, + jitter_seconds: 
row.jitter_seconds, + input_payload: row.input_payload, + status: row.status, + next_run_at: row.next_run_at, + last_run_at: row.last_run_at, + last_instance_id: row.last_instance_id, + created_at: row.created_at, + updated_at: row.updated_at, + priority: row.priority, + allow_duplicate: row.allow_duplicate, + } + } +} + +#[cfg(test)] +mod tests { + use chrono::Utc; + use serial_test::serial; + use sqlx::Row; + use uuid::Uuid; + + use crate::PostgresBackend; + + use super::super::test_helpers::setup_backend; + use super::*; + use waymark_scheduler_backend::SchedulerBackend; + use waymark_scheduler_core::CreateScheduleParams; + + fn sample_params(schedule_name: &str) -> CreateScheduleParams { + CreateScheduleParams { + workflow_name: "tests.workflow".to_string(), + schedule_name: schedule_name.to_string(), + schedule_type: ScheduleType::Interval, + cron_expression: None, + interval_seconds: Some(60), + jitter_seconds: 0, + input_payload: Some(vec![1, 2, 3]), + priority: 3, + allow_duplicate: true, + } + } + + async fn insert_schedule(backend: &PostgresBackend, schedule_name: &str) -> ScheduleId { + SchedulerBackend::upsert_schedule(backend, &sample_params(schedule_name)) + .await + .expect("upsert schedule") + } + + #[serial(postgres)] + #[tokio::test] + async fn scheduler_upsert_schedule_happy_path() { + let backend = setup_backend().await; + + let id = insert_schedule(&backend, "upsert").await; + let row = sqlx::query("SELECT id FROM workflow_schedules WHERE id = $1") + .bind(id.0) + .fetch_one(backend.pool()) + .await + .expect("select schedule"); + + assert_eq!(row.get::("id"), id.0); + } + + #[serial(postgres)] + #[tokio::test] + async fn scheduler_upsert_schedule_preserves_existing_next_run_at() { + let backend = setup_backend().await; + + let id = insert_schedule(&backend, "preserve-next-run").await; + sqlx::query( + "UPDATE workflow_schedules SET next_run_at = NOW() + INTERVAL '2 days' WHERE id = $1", + ) + .bind(id.0) + .execute(backend.pool()) + .await + 
.expect("force next_run_at"); + + let before: Option> = + sqlx::query_scalar("SELECT next_run_at FROM workflow_schedules WHERE id = $1") + .bind(id.0) + .fetch_one(backend.pool()) + .await + .expect("select next_run_at before"); + + let upserted_id = + SchedulerBackend::upsert_schedule(&backend, &sample_params("preserve-next-run")) + .await + .expect("upsert existing schedule"); + assert_eq!(upserted_id.0, id.0); + + let after: Option> = + sqlx::query_scalar("SELECT next_run_at FROM workflow_schedules WHERE id = $1") + .bind(id.0) + .fetch_one(backend.pool()) + .await + .expect("select next_run_at after"); + + assert_eq!(after, before); + } + + #[serial(postgres)] + #[tokio::test] + async fn scheduler_get_schedule_happy_path() { + let backend = setup_backend().await; + + let id = insert_schedule(&backend, "get").await; + let schedule = SchedulerBackend::get_schedule(&backend, id) + .await + .expect("get schedule"); + + assert_eq!(schedule.id, id.0); + assert_eq!(schedule.schedule_name, "get"); + assert_eq!(schedule.workflow_name, "tests.workflow"); + } + + #[serial(postgres)] + #[tokio::test] + async fn scheduler_get_schedule_by_name_happy_path() { + let backend = setup_backend().await; + + let id = insert_schedule(&backend, "by-name").await; + let schedule = + SchedulerBackend::get_schedule_by_name(&backend, "tests.workflow", "by-name") + .await + .expect("get schedule by name") + .expect("expected schedule"); + + assert_eq!(schedule.id, id.0); + assert_eq!(schedule.schedule_name, "by-name"); + } + + #[serial(postgres)] + #[tokio::test] + async fn scheduler_list_schedules_happy_path() { + let backend = setup_backend().await; + + insert_schedule(&backend, "a-list").await; + insert_schedule(&backend, "b-list").await; + + let schedules = SchedulerBackend::list_schedules(&backend, 10, 0) + .await + .expect("list schedules"); + + assert_eq!(schedules.len(), 2); + assert_eq!(schedules[0].schedule_name, "a-list"); + assert_eq!(schedules[1].schedule_name, "b-list"); + } + 
+ #[serial(postgres)] + #[tokio::test] + async fn scheduler_count_schedules_happy_path() { + let backend = setup_backend().await; + + insert_schedule(&backend, "count-a").await; + insert_schedule(&backend, "count-b").await; + + let count = SchedulerBackend::count_schedules(&backend) + .await + .expect("count schedules"); + assert_eq!(count, 2); + } + + #[serial(postgres)] + #[tokio::test] + async fn scheduler_update_schedule_status_happy_path() { + let backend = setup_backend().await; + + let id = insert_schedule(&backend, "status").await; + let updated = SchedulerBackend::update_schedule_status(&backend, id, "paused") + .await + .expect("update schedule status"); + assert!(updated); + + let status: String = + sqlx::query_scalar("SELECT status FROM workflow_schedules WHERE id = $1") + .bind(id.0) + .fetch_one(backend.pool()) + .await + .expect("select status"); + assert_eq!(status, "paused"); + } + + #[serial(postgres)] + #[tokio::test] + async fn scheduler_delete_schedule_happy_path() { + let backend = setup_backend().await; + + let id = insert_schedule(&backend, "delete").await; + let deleted = SchedulerBackend::delete_schedule(&backend, id) + .await + .expect("delete schedule"); + assert!(deleted); + + let status: String = + sqlx::query_scalar("SELECT status FROM workflow_schedules WHERE id = $1") + .bind(id.0) + .fetch_one(backend.pool()) + .await + .expect("select status"); + assert_eq!(status, "deleted"); + } + + #[serial(postgres)] + #[tokio::test] + async fn scheduler_find_due_schedules_happy_path() { + let backend = setup_backend().await; + + let id = insert_schedule(&backend, "due").await; + sqlx::query( + "UPDATE workflow_schedules SET next_run_at = NOW() - INTERVAL '1 minute' WHERE id = $1", + ) + .bind(id.0) + .execute(backend.pool()) + .await + .expect("force schedule due"); + + let due = SchedulerBackend::find_due_schedules(&backend, 10) + .await + .expect("find due schedules"); + assert_eq!(due.len(), 1); + assert_eq!(due[0].id, id.0); + } + + 
#[serial(postgres)] + #[tokio::test] + async fn scheduler_has_running_instance_happy_path() { + let backend = setup_backend().await; + + let has_running = SchedulerBackend::has_running_instance(&backend, ScheduleId::new()) + .await + .expect("has running instance"); + assert!(!has_running); + } + + #[serial(postgres)] + #[tokio::test] + async fn scheduler_has_running_instance_true_with_queued_instance() { + let backend = setup_backend().await; + + let schedule_id = insert_schedule(&backend, "running-instance").await; + let instance_id = Uuid::new_v4(); + sqlx::query( + "INSERT INTO runner_instances (instance_id, entry_node, schedule_id) VALUES ($1, $2, $3)", + ) + .bind(instance_id) + .bind(Uuid::new_v4()) + .bind(schedule_id.0) + .execute(backend.pool()) + .await + .expect("insert runner instance"); + sqlx::query("INSERT INTO queued_instances (instance_id, payload) VALUES ($1, $2)") + .bind(instance_id) + .bind(vec![0_u8]) + .execute(backend.pool()) + .await + .expect("insert queued instance"); + + let has_running = SchedulerBackend::has_running_instance(&backend, schedule_id) + .await + .expect("has running instance"); + assert!(has_running); + } + + #[serial(postgres)] + #[tokio::test] + async fn scheduler_mark_schedule_executed_happy_path() { + let backend = setup_backend().await; + + let id = insert_schedule(&backend, "mark-executed").await; + let instance_id = Uuid::new_v4(); + SchedulerBackend::mark_schedule_executed(&backend, id, instance_id) + .await + .expect("mark schedule executed"); + + let row = sqlx::query( + "SELECT last_instance_id, last_run_at, next_run_at FROM workflow_schedules WHERE id = $1", + ) + .bind(id.0) + .fetch_one(backend.pool()) + .await + .expect("select schedule"); + + let last_instance_id: Option = row.get("last_instance_id"); + let last_run_at: Option> = row.get("last_run_at"); + let next_run_at: Option> = row.get("next_run_at"); + + assert_eq!(last_instance_id, Some(instance_id)); + assert!(last_run_at.is_some()); + 
assert!(next_run_at.is_some()); + } + + #[serial(postgres)] + #[tokio::test] + async fn scheduler_skip_schedule_run_happy_path() { + let backend = setup_backend().await; + + let id = insert_schedule(&backend, "skip").await; + sqlx::query( + "UPDATE workflow_schedules SET next_run_at = NOW() - INTERVAL '1 minute' WHERE id = $1", + ) + .bind(id.0) + .execute(backend.pool()) + .await + .expect("force schedule due"); + + SchedulerBackend::skip_schedule_run(&backend, id) + .await + .expect("skip schedule run"); + + let next_run_at: Option> = + sqlx::query_scalar("SELECT next_run_at FROM workflow_schedules WHERE id = $1") + .bind(id.0) + .fetch_one(backend.pool()) + .await + .expect("select next_run_at"); + assert!(next_run_at.expect("next_run_at").gt(&Utc::now())); + } +} diff --git a/crates/backend-postgres/src/test_helpers.rs b/crates/backend-postgres/src/test_helpers.rs new file mode 100644 index 00000000..addb1ad4 --- /dev/null +++ b/crates/backend-postgres/src/test_helpers.rs @@ -0,0 +1,27 @@ +use sqlx::PgPool; + +use super::PostgresBackend; +use waymark_test_support::postgres_setup; + +pub(super) async fn setup_backend() -> PostgresBackend { + let pool = postgres_setup().await; + reset_database(&pool).await; + PostgresBackend::new(pool) +} + +pub(super) async fn reset_database(pool: &PgPool) { + sqlx::query( + r#" + TRUNCATE runner_actions_done, + queued_instances, + runner_instances, + workflow_versions, + workflow_schedules, + worker_status + RESTART IDENTITY CASCADE + "#, + ) + .execute(pool) + .await + .expect("truncate postgres tables"); +} diff --git a/crates/backend-postgres/src/webapp.rs b/crates/backend-postgres/src/webapp.rs new file mode 100644 index 00000000..e3f50ced --- /dev/null +++ b/crates/backend-postgres/src/webapp.rs @@ -0,0 +1,2329 @@ +use std::collections::HashMap; + +use chrono::{DateTime, Utc}; +use prost::Message; +use serde_json::Value; +use sqlx::{Postgres, QueryBuilder, Row}; + +use uuid::Uuid; + +use 
waymark_backends_core::{BackendError, BackendResult}; +use waymark_core_backend::GraphUpdate; +use waymark_dag::{DAGNode, EdgeType, convert_to_dag}; +use waymark_proto::ast as ir; +use waymark_runner::replay_action_kwargs; +use waymark_runner_state::{ + ActionCallSpec, ExecutionNode, NodeStatus, RunnerState, format_value, value_visitor::ValueExpr, +}; +use waymark_webapp_core::{ + ExecutionEdgeView, ExecutionGraphView, ExecutionNodeView, InstanceDetail, InstanceStatus, + InstanceSummary, ScheduleDetail, ScheduleInvocationSummary, ScheduleSummary, TimelineEntry, + WorkerActionRow, WorkerAggregateStats, WorkerStatus, +}; + +const INSTANCE_STATUS_FALLBACK_SQL: &str = r#" +CASE + WHEN ri.error IS NOT NULL THEN 'failed' + WHEN ri.result IS NOT NULL THEN 'completed' + WHEN ri.state IS NOT NULL THEN 'running' + ELSE 'queued' +END +"#; + +#[derive(Debug, Clone, PartialEq, Eq)] +enum InstanceSearchToken { + Term(String), + And, + Or, + LParen, + RParen, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +enum InstanceSearchExpr { + Term(String), + And(Box, Box), + Or(Box, Box), +} + +struct InstanceSearchParser { + tokens: Vec, + position: usize, +} + +impl InstanceSearchParser { + fn new(tokens: Vec) -> Self { + Self { + tokens, + position: 0, + } + } + + fn parse(mut self) -> Option { + let expr = self.parse_or()?; + if self.position == self.tokens.len() { + Some(expr) + } else { + None + } + } + + fn parse_or(&mut self) -> Option { + let mut expr = self.parse_and()?; + while self.consume_or() { + let rhs = self.parse_and()?; + expr = InstanceSearchExpr::Or(Box::new(expr), Box::new(rhs)); + } + Some(expr) + } + + fn parse_and(&mut self) -> Option { + let mut expr = self.parse_primary()?; + loop { + if self.consume_and() || self.peek_is_primary_start() { + let rhs = self.parse_primary()?; + expr = InstanceSearchExpr::And(Box::new(expr), Box::new(rhs)); + continue; + } + break; + } + Some(expr) + } + + fn parse_primary(&mut self) -> Option { + match self.peek()? 
{ + InstanceSearchToken::Term(term) => { + let term = term.clone(); + self.position += 1; + Some(InstanceSearchExpr::Term(term)) + } + InstanceSearchToken::LParen => { + self.position += 1; + let expr = self.parse_or()?; + if !self.consume_rparen() { + return None; + } + Some(expr) + } + InstanceSearchToken::And | InstanceSearchToken::Or | InstanceSearchToken::RParen => { + None + } + } + } + + fn consume_and(&mut self) -> bool { + if matches!(self.peek(), Some(InstanceSearchToken::And)) { + self.position += 1; + true + } else { + false + } + } + + fn consume_or(&mut self) -> bool { + if matches!(self.peek(), Some(InstanceSearchToken::Or)) { + self.position += 1; + true + } else { + false + } + } + + fn consume_rparen(&mut self) -> bool { + if matches!(self.peek(), Some(InstanceSearchToken::RParen)) { + self.position += 1; + true + } else { + false + } + } + + fn peek_is_primary_start(&self) -> bool { + matches!( + self.peek(), + Some(InstanceSearchToken::Term(_)) | Some(InstanceSearchToken::LParen) + ) + } + + fn peek(&self) -> Option<&InstanceSearchToken> { + self.tokens.get(self.position) + } +} + +fn tokenize_instance_search(search: &str) -> Vec { + let mut chars = search.chars().peekable(); + let mut tokens = Vec::new(); + + while let Some(ch) = chars.peek().copied() { + if ch.is_whitespace() { + chars.next(); + continue; + } + if ch == '(' { + chars.next(); + tokens.push(InstanceSearchToken::LParen); + continue; + } + if ch == ')' { + chars.next(); + tokens.push(InstanceSearchToken::RParen); + continue; + } + if ch == '"' { + chars.next(); + let mut quoted = String::new(); + for next in chars.by_ref() { + if next == '"' { + break; + } + quoted.push(next); + } + if !quoted.is_empty() { + tokens.push(InstanceSearchToken::Term(quoted)); + } + continue; + } + + let mut term = String::new(); + while let Some(next) = chars.peek().copied() { + if next.is_whitespace() || next == '(' || next == ')' { + break; + } + term.push(next); + chars.next(); + } + if 
term.is_empty() { + continue; + } + + match term.to_ascii_uppercase().as_str() { + "AND" => tokens.push(InstanceSearchToken::And), + "OR" => tokens.push(InstanceSearchToken::Or), + _ => tokens.push(InstanceSearchToken::Term(term)), + } + } + + tokens +} + +fn parse_instance_search_expr(search: &str) -> Option { + let trimmed = search.trim(); + if trimmed.is_empty() { + return None; + } + + let tokens = tokenize_instance_search(trimmed); + if tokens.is_empty() { + return None; + } + + InstanceSearchParser::new(tokens) + .parse() + .or_else(|| Some(InstanceSearchExpr::Term(trimmed.to_string()))) +} + +fn push_instance_search_expr_sql( + builder: &mut QueryBuilder<'_, Postgres>, + expr: &InstanceSearchExpr, +) { + match expr { + InstanceSearchExpr::Term(term) => { + let pattern = format!("%{term}%"); + builder.push("("); + builder.push("COALESCE(ri.workflow_name, wv.workflow_name, '') ILIKE "); + builder.push_bind(pattern.clone()); + builder.push(" OR COALESCE(ri.current_status, "); + builder.push(INSTANCE_STATUS_FALLBACK_SQL); + builder.push(", '') ILIKE "); + builder.push_bind(pattern); + builder.push(")"); + } + InstanceSearchExpr::And(left, right) => { + builder.push("("); + push_instance_search_expr_sql(builder, left); + builder.push(" AND "); + push_instance_search_expr_sql(builder, right); + builder.push(")"); + } + InstanceSearchExpr::Or(left, right) => { + builder.push("("); + push_instance_search_expr_sql(builder, left); + builder.push(" OR "); + push_instance_search_expr_sql(builder, right); + builder.push(")"); + } + } +} + +fn parse_instance_status(status: &str) -> Option { + match status { + "queued" => Some(InstanceStatus::Queued), + "running" => Some(InstanceStatus::Running), + "completed" => Some(InstanceStatus::Completed), + "failed" => Some(InstanceStatus::Failed), + _ => None, + } +} + +#[async_trait::async_trait] +impl waymark_webapp_backend::WebappBackend for crate::PostgresBackend { + async fn count_instances(&self, search: Option<&str>) -> 
BackendResult { + let mut builder: QueryBuilder = QueryBuilder::new( + r#" + SELECT COUNT(*)::BIGINT + FROM runner_instances ri + LEFT JOIN workflow_versions wv ON wv.id = ri.workflow_version_id + "#, + ); + + if let Some(search_expr) = search.and_then(parse_instance_search_expr) { + builder.push(" WHERE "); + push_instance_search_expr_sql(&mut builder, &search_expr); + } + + let count: i64 = builder.build_query_scalar().fetch_one(&self.pool).await?; + Ok(count) + } + + async fn list_instances( + &self, + search: Option<&str>, + limit: i64, + offset: i64, + ) -> BackendResult> { + let mut builder: QueryBuilder = QueryBuilder::new( + r#" + SELECT + ri.instance_id, + ri.entry_node, + ri.created_at, + ri.state, + ri.result, + ri.error, + COALESCE(ri.workflow_name, wv.workflow_name) AS workflow_name, + COALESCE(ri.current_status, + CASE + WHEN ri.error IS NOT NULL THEN 'failed' + WHEN ri.result IS NOT NULL THEN 'completed' + WHEN ri.state IS NOT NULL THEN 'running' + ELSE 'queued' + END + ) AS current_status + FROM runner_instances ri + LEFT JOIN workflow_versions wv ON wv.id = ri.workflow_version_id + "#, + ); + if let Some(search_expr) = search.and_then(parse_instance_search_expr) { + builder.push(" WHERE "); + push_instance_search_expr_sql(&mut builder, &search_expr); + } + builder.push(" ORDER BY ri.created_at DESC, ri.instance_id DESC LIMIT "); + builder.push_bind(limit); + builder.push(" OFFSET "); + builder.push_bind(offset); + let rows = builder.build().fetch_all(&self.pool).await?; + + let mut instances = Vec::new(); + for row in rows { + let instance_id: Uuid = row.get("instance_id"); + let entry_node: Uuid = row.get("entry_node"); + let created_at: DateTime = row.get("created_at"); + let state_bytes: Option> = row.get("state"); + let result_bytes: Option> = row.get("result"); + let error_bytes: Option> = row.get("error"); + let workflow_name: Option = row.get("workflow_name"); + let current_status: Option = row.get("current_status"); + + let status = 
current_status + .as_deref() + .and_then(parse_instance_status) + .unwrap_or_else(|| determine_status(&state_bytes, &result_bytes, &error_bytes)); + let input_preview = extract_input_preview(&state_bytes); + + instances.push(InstanceSummary { + id: instance_id, + entry_node, + created_at, + status, + workflow_name, + input_preview, + }); + } + + Ok(instances) + } + + async fn get_instance(&self, instance_id: Uuid) -> BackendResult { + let row = sqlx::query( + r#" + SELECT + ri.instance_id, + ri.entry_node, + ri.created_at, + ri.state, + ri.result, + ri.error, + COALESCE(ri.workflow_name, wv.workflow_name) AS workflow_name, + COALESCE(ri.current_status, + CASE + WHEN ri.error IS NOT NULL THEN 'failed' + WHEN ri.result IS NOT NULL THEN 'completed' + WHEN ri.state IS NOT NULL THEN 'running' + ELSE 'queued' + END + ) AS current_status + FROM runner_instances ri + LEFT JOIN workflow_versions wv ON wv.id = ri.workflow_version_id + WHERE ri.instance_id = $1 + "#, + ) + .bind(instance_id) + .fetch_optional(&self.pool) + .await? 
+ .ok_or_else(|| BackendError::Message(format!("instance not found: {}", instance_id)))?; + + let instance_id: Uuid = row.get("instance_id"); + let entry_node: Uuid = row.get("entry_node"); + let created_at: DateTime = row.get("created_at"); + let state_bytes: Option> = row.get("state"); + let result_bytes: Option> = row.get("result"); + let error_bytes: Option> = row.get("error"); + let workflow_name: Option = row.get("workflow_name"); + let current_status: Option = row.get("current_status"); + + let status = current_status + .as_deref() + .and_then(parse_instance_status) + .unwrap_or_else(|| determine_status(&state_bytes, &result_bytes, &error_bytes)); + let input_payload = format_input_payload(&state_bytes); + let result_payload = format_instance_result_payload(status, &result_bytes, &error_bytes); + let error_payload = format_error(&error_bytes); + + Ok(InstanceDetail { + id: instance_id, + entry_node, + created_at, + status, + workflow_name, + input_payload, + result_payload, + error_payload, + }) + } + + async fn get_execution_graph( + &self, + instance_id: Uuid, + ) -> BackendResult> { + let row = sqlx::query( + r#" + SELECT state FROM runner_instances WHERE instance_id = $1 + "#, + ) + .bind(instance_id) + .fetch_optional(&self.pool) + .await?; + + let Some(row) = row else { + return Ok(None); + }; + + let state_bytes: Option> = row.get("state"); + let Some(state_bytes) = state_bytes else { + return Ok(None); + }; + + let graph_update: GraphUpdate = rmp_serde::from_slice(&state_bytes) + .map_err(|e| BackendError::Message(format!("failed to decode state: {}", e)))?; + + let nodes: Vec = graph_update + .nodes + .values() + .map(|node| ExecutionNodeView { + id: node.node_id.to_string(), + node_type: node.node_type.clone(), + label: node.label.clone(), + status: format_node_status(&node.status), + action_name: node.action.as_ref().map(|a| a.action_name.clone()), + module_name: node.action.as_ref().and_then(|a| a.module_name.clone()), + }) + .collect(); + + let 
edges: Vec = graph_update + .edges + .iter() + .map(|edge| ExecutionEdgeView { + source: edge.source.to_string(), + target: edge.target.to_string(), + edge_type: format!("{:?}", edge.edge_type), + }) + .collect(); + + Ok(Some(ExecutionGraphView { nodes, edges })) + } + + async fn get_workflow_graph( + &self, + instance_id: Uuid, + ) -> BackendResult> { + let row = sqlx::query( + r#" + SELECT ri.state, wv.program_proto + FROM runner_instances ri + JOIN workflow_versions wv ON wv.id = ri.workflow_version_id + WHERE ri.instance_id = $1 + "#, + ) + .bind(instance_id) + .fetch_optional(&self.pool) + .await?; + + let Some(row) = row else { + return Ok(None); + }; + + let program_proto: Vec = row.get("program_proto"); + let program = ir::Program::decode(&program_proto[..]) + .map_err(|err| BackendError::Message(format!("failed to decode workflow IR: {err}")))?; + let dag = convert_to_dag(&program).map_err(|err| { + BackendError::Message(format!("failed to convert workflow DAG: {err}")) + })?; + + let mut template_statuses: HashMap = HashMap::new(); + let state_bytes: Option> = row.get("state"); + if let Some(state_bytes) = state_bytes { + let graph_update: GraphUpdate = rmp_serde::from_slice(&state_bytes) + .map_err(|err| BackendError::Message(format!("failed to decode state: {err}")))?; + + for node in graph_update.nodes.values() { + let Some(template_id) = node.template_id.as_ref() else { + continue; + }; + template_statuses + .entry(template_id.clone()) + .and_modify(|existing| { + *existing = merge_template_status(existing, &node.status); + }) + .or_insert_with(|| node.status.clone()); + } + } + + let mut node_ids: Vec = dag.nodes.keys().cloned().collect(); + node_ids.sort(); + let nodes: Vec = node_ids + .into_iter() + .filter_map(|node_id| { + let node = dag.nodes.get(&node_id)?; + let status = template_statuses + .get(&node_id) + .map(format_node_status) + .unwrap_or_else(|| "pending".to_string()); + let (action_name, module_name) = match node { + 
DAGNode::ActionCall(action) => { + (Some(action.action_name.clone()), action.module_name.clone()) + } + _ => (None, None), + }; + + Some(ExecutionNodeView { + id: node_id, + node_type: node.node_type().to_string(), + label: node.label(), + status, + action_name, + module_name, + }) + }) + .collect(); + + let edges: Vec = dag + .edges + .iter() + .filter(|edge| edge.edge_type == EdgeType::StateMachine) + .map(|edge| ExecutionEdgeView { + source: edge.source.clone(), + target: edge.target.clone(), + edge_type: if edge.is_loop_back { + "state_machine_loop_back".to_string() + } else { + "state_machine".to_string() + }, + }) + .collect(); + + Ok(Some(ExecutionGraphView { nodes, edges })) + } + + async fn get_action_results(&self, instance_id: Uuid) -> BackendResult> { + let row = sqlx::query( + r#" + SELECT state + FROM runner_instances + WHERE instance_id = $1 + "#, + ) + .bind(instance_id) + .fetch_optional(&self.pool) + .await?; + + let Some(row) = row else { + return Ok(Vec::new()); + }; + let state_bytes: Option> = row.get("state"); + let Some(state_bytes) = state_bytes else { + return Ok(Vec::new()); + }; + let graph_update: GraphUpdate = rmp_serde::from_slice(&state_bytes) + .map_err(|e| BackendError::Message(format!("failed to decode state: {}", e)))?; + + let runner_state = RunnerState::new( + None, + Some(graph_update.nodes.clone()), + Some(graph_update.edges), + false, + ); + let action_nodes: HashMap = graph_update + .nodes + .into_iter() + .filter(|(_, node)| node.is_action_call()) + .collect(); + if action_nodes.is_empty() { + return Ok(Vec::new()); + } + let execution_ids: Vec = action_nodes.keys().copied().collect(); + + let rows = sqlx::query( + r#" + SELECT created_at, execution_id, attempt, status, started_at, completed_at, duration_ms, result + FROM runner_actions_done + WHERE execution_id = ANY($1) + ORDER BY created_at ASC, attempt ASC + "#, + ) + .bind(&execution_ids) + .fetch_all(&self.pool) + .await?; + + let mut decoded_rows = 
Vec::with_capacity(rows.len()); + for row in rows { + let created_at: DateTime = row.get("created_at"); + let execution_id: Uuid = row.get("execution_id"); + let attempt: i32 = row.get("attempt"); + let status: Option = row.get("status"); + let started_at: Option> = row.get("started_at"); + let completed_at: Option> = row.get("completed_at"); + let duration_ms: Option = row.get("duration_ms"); + let result_bytes: Option> = row.get("result"); + let result = result_bytes + .as_deref() + .map(decode_msgpack_json) + .transpose()?; + decoded_rows.push(DecodedActionResultRow { + created_at, + execution_id, + attempt, + status, + started_at, + completed_at, + duration_ms, + result, + }); + } + + // Replay needs the current known action outputs by execution id. + let mut action_results = HashMap::new(); + for row in &decoded_rows { + if let Some(result) = &row.result { + action_results.insert(row.execution_id, result.clone()); + } + } + + let mut request_preview_cache: HashMap = HashMap::new(); + let mut entries = Vec::with_capacity(decoded_rows.len()); + for row in decoded_rows { + let node = action_nodes.get(&row.execution_id); + let action_name = node + .and_then(|n| n.action.as_ref().map(|a| a.action_name.clone())) + .unwrap_or_default(); + let module_name = + node.and_then(|n| n.action.as_ref().and_then(|a| a.module_name.clone())); + + let request_preview = + if let Some(existing) = request_preview_cache.get(&row.execution_id) { + existing.clone() + } else { + let rendered = render_action_request_preview( + node.and_then(|n| n.action.as_ref()), + &runner_state, + &action_results, + row.execution_id, + ); + request_preview_cache.insert(row.execution_id, rendered.clone()); + rendered + }; + + let (response_preview, error) = match &row.result { + Some(value) => format_action_result(value), + None => ("(no result)".to_string(), None), + }; + let status = row.status.clone().unwrap_or_else(|| { + if error.is_some() { + "failed".to_string() + } else { + 
"completed".to_string() + } + }); + let (dispatched_at, completed_at, duration_ms) = if row.started_at.is_some() + || row.completed_at.is_some() + || row.duration_ms.is_some() + { + ( + Some(row.started_at.unwrap_or(row.created_at).to_rfc3339()), + Some(row.completed_at.unwrap_or(row.created_at).to_rfc3339()), + row.duration_ms, + ) + } else { + action_timing_from_state(node, row.attempt, row.created_at) + }; + + entries.push(TimelineEntry { + action_id: row.execution_id.to_string(), + action_name, + module_name, + status, + attempt_number: row.attempt, + dispatched_at, + completed_at, + duration_ms, + request_preview, + response_preview, + error, + }); + } + + Ok(entries) + } + + async fn get_distinct_workflows(&self) -> BackendResult> { + let rows = sqlx::query( + r#" + SELECT DISTINCT COALESCE(ri.workflow_name, wv.workflow_name) AS workflow_name + FROM runner_instances ri + LEFT JOIN workflow_versions wv ON wv.id = ri.workflow_version_id + WHERE COALESCE(ri.workflow_name, wv.workflow_name) IS NOT NULL + ORDER BY workflow_name + "#, + ) + .fetch_all(&self.pool) + .await?; + + let mut workflows = Vec::with_capacity(rows.len()); + for row in rows { + let workflow_name: String = row.get("workflow_name"); + workflows.push(workflow_name); + } + Ok(workflows) + } + + async fn get_distinct_statuses(&self) -> BackendResult> { + Ok(vec![ + "queued".to_string(), + "running".to_string(), + "completed".to_string(), + "failed".to_string(), + ]) + } + + async fn count_schedules(&self) -> BackendResult { + let count = sqlx::query_scalar::<_, i64>( + "SELECT COUNT(*) FROM workflow_schedules WHERE status != 'deleted'", + ) + .fetch_one(&self.pool) + .await?; + + Ok(count) + } + + async fn list_schedules(&self, limit: i64, offset: i64) -> BackendResult> { + let rows = sqlx::query( + r#" + SELECT id, workflow_name, schedule_name, schedule_type, cron_expression, interval_seconds, + status, next_run_at, last_run_at, created_at + FROM workflow_schedules + WHERE status != 'deleted' + 
ORDER BY workflow_name, schedule_name + LIMIT $1 OFFSET $2 + "#, + ) + .bind(limit) + .bind(offset) + .fetch_all(&self.pool) + .await?; + + let mut schedules = Vec::new(); + for row in rows { + schedules.push(ScheduleSummary { + id: row.get::("id").to_string(), + workflow_name: row.get("workflow_name"), + schedule_name: row.get("schedule_name"), + schedule_type: row.get("schedule_type"), + cron_expression: row.get("cron_expression"), + interval_seconds: row.get("interval_seconds"), + status: row.get("status"), + next_run_at: row + .get::>, _>("next_run_at") + .map(|dt| dt.to_rfc3339()), + last_run_at: row + .get::>, _>("last_run_at") + .map(|dt| dt.to_rfc3339()), + created_at: row.get::, _>("created_at").to_rfc3339(), + }); + } + + Ok(schedules) + } + + async fn get_schedule(&self, schedule_id: Uuid) -> BackendResult { + let row = sqlx::query( + r#" + SELECT id, workflow_name, schedule_name, schedule_type, cron_expression, interval_seconds, + jitter_seconds, input_payload, status, next_run_at, last_run_at, last_instance_id, + created_at, updated_at, priority, allow_duplicate + FROM workflow_schedules + WHERE id = $1 + "#, + ) + .bind(schedule_id) + .fetch_optional(&self.pool) + .await? 
+ .ok_or_else(|| BackendError::Message(format!("schedule not found: {}", schedule_id)))?; + + let input_payload: Option = row + .get::>, _>("input_payload") + .and_then(|bytes| { + rmp_serde::from_slice::(&bytes) + .ok() + .map(|v| serde_json::to_string_pretty(&v).unwrap_or_default()) + }); + + Ok(ScheduleDetail { + id: row.get::("id").to_string(), + workflow_name: row.get("workflow_name"), + schedule_name: row.get("schedule_name"), + schedule_type: row.get("schedule_type"), + cron_expression: row.get("cron_expression"), + interval_seconds: row.get("interval_seconds"), + jitter_seconds: row.get("jitter_seconds"), + status: row.get("status"), + next_run_at: row + .get::>, _>("next_run_at") + .map(|dt| dt.to_rfc3339()), + last_run_at: row + .get::>, _>("last_run_at") + .map(|dt| dt.to_rfc3339()), + last_instance_id: row + .get::, _>("last_instance_id") + .map(|id| id.to_string()), + created_at: row.get::, _>("created_at").to_rfc3339(), + updated_at: row.get::, _>("updated_at").to_rfc3339(), + priority: row.get("priority"), + allow_duplicate: row.get("allow_duplicate"), + input_payload, + }) + } + + async fn count_schedule_invocations(&self, schedule_id: Uuid) -> BackendResult { + let count = sqlx::query_scalar::<_, i64>( + r#" + SELECT COUNT(*) + FROM runner_instances + WHERE schedule_id = $1 + "#, + ) + .bind(schedule_id) + .fetch_one(&self.pool) + .await?; + Ok(count) + } + + async fn list_schedule_invocations( + &self, + schedule_id: Uuid, + limit: i64, + offset: i64, + ) -> BackendResult> { + let rows = sqlx::query( + r#" + SELECT instance_id, created_at, state, result, error + FROM runner_instances + WHERE schedule_id = $1 + ORDER BY created_at DESC, instance_id DESC + LIMIT $2 OFFSET $3 + "#, + ) + .bind(schedule_id) + .bind(limit) + .bind(offset) + .fetch_all(&self.pool) + .await?; + + let mut invocations = Vec::with_capacity(rows.len()); + for row in rows { + let state_bytes: Option> = row.get("state"); + let result_bytes: Option> = row.get("result"); + let 
error_bytes: Option> = row.get("error"); + + invocations.push(ScheduleInvocationSummary { + id: row.get("instance_id"), + created_at: row.get("created_at"), + status: determine_status(&state_bytes, &result_bytes, &error_bytes), + }); + } + + Ok(invocations) + } + + async fn update_schedule_status(&self, schedule_id: Uuid, status: &str) -> BackendResult { + let result = sqlx::query( + r#" + UPDATE workflow_schedules + SET status = $2, updated_at = NOW() + WHERE id = $1 + "#, + ) + .bind(schedule_id) + .bind(status) + .execute(&self.pool) + .await?; + + Ok(result.rows_affected() > 0) + } + + async fn get_distinct_schedule_statuses(&self) -> BackendResult> { + Ok(vec!["active".to_string(), "paused".to_string()]) + } + + async fn get_distinct_schedule_types(&self) -> BackendResult> { + Ok(vec!["cron".to_string(), "interval".to_string()]) + } + + async fn get_worker_action_stats( + &self, + window_minutes: i64, + ) -> BackendResult> { + let rows = sqlx::query( + r#" + SELECT + pool_id, + COUNT(DISTINCT worker_id) as active_workers, + SUM(throughput_per_min) / 60.0 as actions_per_sec, + SUM(throughput_per_min) as throughput_per_min, + COALESCE(SUM(total_completed), 0)::BIGINT as total_completed, + PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY median_dequeue_ms) as median_dequeue_ms, + PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY median_handling_ms) as median_handling_ms, + MAX(last_action_at) as last_action_at, + MAX(updated_at) as updated_at + FROM worker_status + WHERE updated_at > NOW() - INTERVAL '1 minute' * $1 + GROUP BY pool_id + ORDER BY actions_per_sec DESC + "#, + ) + .bind(window_minutes) + .fetch_all(&self.pool) + .await?; + + let mut stats = Vec::new(); + for row in rows { + stats.push(WorkerActionRow { + pool_id: row.get::("pool_id").to_string(), + active_workers: row.get::("active_workers"), + actions_per_sec: format!("{:.1}", row.get::("actions_per_sec")), + throughput_per_min: row.get::("throughput_per_min") as i64, + total_completed: 
row.get::("total_completed"), + median_dequeue_ms: row + .get::, _>("median_dequeue_ms") + .map(|v| v as i64), + median_handling_ms: row + .get::, _>("median_handling_ms") + .map(|v| v as i64), + last_action_at: row + .get::>, _>("last_action_at") + .map(|dt| dt.to_rfc3339()), + updated_at: row.get::, _>("updated_at").to_rfc3339(), + }); + } + + Ok(stats) + } + + async fn get_worker_aggregate_stats( + &self, + window_minutes: i64, + ) -> BackendResult { + let row = sqlx::query( + r#" + SELECT + COUNT(DISTINCT worker_id) as active_worker_count, + COALESCE(SUM(throughput_per_min) / 60.0, 0) as actions_per_sec, + COALESCE(SUM(total_in_flight), 0)::BIGINT as total_in_flight, + COALESCE(SUM(dispatch_queue_size), 0)::BIGINT as total_queue_depth + FROM worker_status + WHERE updated_at > NOW() - INTERVAL '1 minute' * $1 + "#, + ) + .bind(window_minutes) + .fetch_one(&self.pool) + .await?; + + Ok(WorkerAggregateStats { + active_worker_count: row.get::("active_worker_count"), + actions_per_sec: format!("{:.1}", row.get::("actions_per_sec")), + total_in_flight: row.get::("total_in_flight"), + total_queue_depth: row.get::("total_queue_depth"), + }) + } + + async fn worker_status_table_exists(&self) -> bool { + sqlx::query_scalar::<_, bool>( + r#" + SELECT EXISTS ( + SELECT FROM information_schema.tables + WHERE table_name = 'worker_status' + ) + "#, + ) + .fetch_one(&self.pool) + .await + .unwrap_or(false) + } + + async fn schedules_table_exists(&self) -> bool { + sqlx::query_scalar::<_, bool>( + r#" + SELECT EXISTS ( + SELECT FROM information_schema.tables + WHERE table_name = 'workflow_schedules' + ) + "#, + ) + .fetch_one(&self.pool) + .await + .unwrap_or(false) + } + + async fn get_worker_statuses(&self, window_minutes: i64) -> BackendResult> { + let rows = sqlx::query( + r#" + SELECT + pool_id, + MAX(active_workers) as active_workers, + COALESCE(SUM(throughput_per_min), 0) as throughput_per_min, + COALESCE(SUM(throughput_per_min) / 60.0, 0) as actions_per_sec, + 
COALESCE(SUM(total_completed), 0)::BIGINT as total_completed, + MAX(last_action_at) as last_action_at, + MAX(updated_at) as updated_at, + PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY median_dequeue_ms) as median_dequeue_ms, + PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY median_handling_ms) as median_handling_ms, + MAX(dispatch_queue_size) as dispatch_queue_size, + MAX(total_in_flight) as total_in_flight, + MAX(median_instance_duration_secs) as median_instance_duration_secs, + MAX(active_instance_count) as active_instance_count, + ( + SELECT COUNT(*)::BIGINT + FROM runner_instances ri + WHERE ri.result IS NOT NULL + AND ri.error IS NULL + ) as total_instances_completed, + MAX(instances_per_sec) as instances_per_sec, + MAX(instances_per_min) as instances_per_min, + ( + SELECT time_series FROM worker_status ws2 + WHERE ws2.pool_id = worker_status.pool_id + AND ws2.time_series IS NOT NULL + ORDER BY ws2.updated_at DESC LIMIT 1 + ) as time_series + FROM worker_status + WHERE updated_at > NOW() - INTERVAL '1 minute' * $1 + GROUP BY pool_id + ORDER BY actions_per_sec DESC + "#, + ) + .bind(window_minutes) + .fetch_all(&self.pool) + .await?; + + let mut statuses = Vec::new(); + for row in rows { + statuses.push(WorkerStatus { + pool_id: row.get::("pool_id"), + active_workers: row.get::, _>("active_workers").unwrap_or(0), + throughput_per_min: row.get::("throughput_per_min"), + actions_per_sec: row.get::("actions_per_sec"), + total_completed: row.get::("total_completed"), + last_action_at: row.get::>, _>("last_action_at"), + updated_at: row.get::, _>("updated_at"), + median_dequeue_ms: row + .get::, _>("median_dequeue_ms") + .map(|v| v as i64), + median_handling_ms: row + .get::, _>("median_handling_ms") + .map(|v| v as i64), + dispatch_queue_size: row.get::, _>("dispatch_queue_size"), + total_in_flight: row.get::, _>("total_in_flight"), + median_instance_duration_secs: row + .get::, _>("median_instance_duration_secs"), + active_instance_count: row + .get::, 
_>("active_instance_count") + .unwrap_or(0), + total_instances_completed: row + .get::, _>("total_instances_completed") + .unwrap_or(0), + instances_per_sec: row + .get::, _>("instances_per_sec") + .unwrap_or(0.0), + instances_per_min: row + .get::, _>("instances_per_min") + .unwrap_or(0.0), + time_series: row.get::>, _>("time_series"), + }); + } + + Ok(statuses) + } +} + +struct DecodedActionResultRow { + created_at: DateTime, + execution_id: Uuid, + attempt: i32, + status: Option, + started_at: Option>, + completed_at: Option>, + duration_ms: Option, + result: Option, +} + +fn decode_msgpack_json(bytes: &[u8]) -> BackendResult { + rmp_serde::from_slice::(bytes) + .map_err(|err| BackendError::Message(format!("failed to decode action result: {err}"))) +} + +fn render_action_request_preview( + action: Option<&ActionCallSpec>, + state: &RunnerState, + action_results: &HashMap, + node_id: Uuid, +) -> String { + let Some(action) = action else { + return "{}".to_string(); + }; + + match replay_action_kwargs(state, action_results, node_id) { + Ok(kwargs) => { + let rendered_map: serde_json::Map = kwargs.into_iter().collect(); + pretty_json(&Value::Object(rendered_map)) + } + Err(_) => format_symbolic_kwargs(action), + } +} + +fn format_symbolic_kwargs(action: &ActionCallSpec) -> String { + if action.kwargs.is_empty() { + return "{}".to_string(); + } + let rendered_map: serde_json::Map = action + .kwargs + .iter() + .map(|(name, expr)| (name.clone(), Value::String(format_value(expr)))) + .collect(); + pretty_json(&Value::Object(rendered_map)) +} + +fn action_timing_from_state( + node: Option<&ExecutionNode>, + attempt: i32, + fallback_completed_at: DateTime, +) -> (Option, Option, Option) { + // Node timing fields represent the latest attempt for this execution id. + // For historical retries, fall back to row timestamps from actions_done. 
+ let Some(node) = node else { + let at = fallback_completed_at.to_rfc3339(); + return (Some(at.clone()), Some(at), None); + }; + if node.action_attempt != attempt { + let at = fallback_completed_at.to_rfc3339(); + return (Some(at.clone()), Some(at), None); + } + + let dispatched_at = node + .started_at + .map(|value| value.to_rfc3339()) + .unwrap_or_else(|| fallback_completed_at.to_rfc3339()); + let completed_dt = node.completed_at.unwrap_or(fallback_completed_at); + let completed_at = completed_dt.to_rfc3339(); + let duration_ms = node + .started_at + .map(|started_at| { + completed_dt + .signed_duration_since(started_at) + .num_milliseconds() + }) + .filter(|duration| *duration >= 0); + + (Some(dispatched_at), Some(completed_at), duration_ms) +} + +fn format_action_result(value: &Value) -> (String, Option) { + let preview = pretty_json(value); + let error = extract_action_error(value); + (preview, error) +} + +fn extract_action_error(value: &Value) -> Option { + let Value::Object(map) = value else { + return None; + }; + let message = map.get("message").and_then(Value::as_str); + let is_exception = map.contains_key("type") && map.contains_key("message"); + if is_exception { + return Some(message.unwrap_or("action failed").to_string()); + } + map.get("error") + .and_then(Value::as_str) + .map(|msg| msg.to_string()) +} + +fn pretty_json(value: &Value) -> String { + serde_json::to_string_pretty(value).unwrap_or_else(|_| "{}".to_string()) +} + +fn determine_status( + state_bytes: &Option>, + result_bytes: &Option>, + error_bytes: &Option>, +) -> InstanceStatus { + if error_bytes.is_some() { + return InstanceStatus::Failed; + } + if result_bytes + .as_deref() + .is_some_and(result_payload_is_error_wrapper) + { + return InstanceStatus::Failed; + } + if result_bytes.is_some() { + return InstanceStatus::Completed; + } + if state_bytes.is_some() { + return InstanceStatus::Running; + } + InstanceStatus::Queued +} + +fn extract_input_preview(state_bytes: &Option>) -> 
String { + let Some(bytes) = state_bytes else { + return "{}".to_string(); + }; + + match rmp_serde::from_slice::(bytes) { + Ok(graph) => { + let count = graph.nodes.len(); + format!("{{nodes: {count}}}") + } + Err(_) => "{}".to_string(), + } +} + +fn format_input_payload(state_bytes: &Option>) -> String { + let Some(bytes) = state_bytes else { + return "{}".to_string(); + }; + + match rmp_serde::from_slice::(bytes) { + Ok(graph) => format_extracted_inputs(&graph.nodes), + Err(_) => "{}".to_string(), + } +} + +fn format_extracted_inputs(nodes: &HashMap) -> String { + let mut input_pairs: Vec<(String, Value)> = nodes + .values() + .filter_map(extract_input_assignment) + .collect(); + if input_pairs.is_empty() { + return "{}".to_string(); + } + input_pairs.sort_by(|(left, _), (right, _)| left.cmp(right)); + let input_map: serde_json::Map = input_pairs.into_iter().collect(); + pretty_json(&Value::Object(input_map)) +} + +fn extract_input_assignment(node: &ExecutionNode) -> Option<(String, Value)> { + let (name, raw_value) = parse_input_assignment_label(&node.label)?; + + if let Ok(value) = serde_json::from_str::(raw_value) { + return Some((name.to_string(), value)); + } + + if let Some(value_expr) = node.assignments.get(name) { + return Some((name.to_string(), value_expr_to_json(value_expr))); + } + + Some((name.to_string(), Value::String(raw_value.to_string()))) +} + +fn parse_input_assignment_label(label: &str) -> Option<(&str, &str)> { + let payload = label.strip_prefix("input ")?; + payload.split_once(" = ") +} + +fn value_expr_to_json(value_expr: &ValueExpr) -> Value { + match value_expr { + ValueExpr::Literal(value) => value.value.clone(), + ValueExpr::List(value) => { + Value::Array(value.elements.iter().map(value_expr_to_json).collect()) + } + ValueExpr::Dict(value) => { + let mut map = serde_json::Map::new(); + for entry in &value.entries { + let key = match value_expr_to_json(&entry.key) { + Value::String(key) => key, + other => other.to_string(), + }; + 
map.insert(key, value_expr_to_json(&entry.value)); + } + Value::Object(map) + } + _ => Value::String(format_value(value_expr)), + } +} + +fn format_instance_result_payload( + status: InstanceStatus, + result_bytes: &Option>, + error_bytes: &Option>, +) -> String { + match status { + InstanceStatus::Failed => { + let payload = error_bytes.as_deref().or(result_bytes.as_deref()); + let Some(bytes) = payload else { + return "(failed)".to_string(); + }; + match rmp_serde::from_slice::(bytes) { + Ok(value) => pretty_json(&normalize_error_payload(value)), + Err(_) => "(decode error)".to_string(), + } + } + InstanceStatus::Completed => { + let Some(bytes) = result_bytes else { + return "(pending)".to_string(); + }; + match rmp_serde::from_slice::(bytes) { + Ok(value) => pretty_json(&normalize_success_payload(value)), + Err(_) => "(decode error)".to_string(), + } + } + InstanceStatus::Running | InstanceStatus::Queued => "(pending)".to_string(), + } +} + +fn normalize_success_payload(value: Value) -> Value { + let Value::Object(mut map) = value else { + return value; + }; + map.remove("result").unwrap_or(Value::Object(map)) +} + +fn normalize_error_payload(value: Value) -> Value { + let Value::Object(mut map) = value else { + return value; + }; + + if let Some(error) = map.remove("error") { + return normalize_error_payload(error); + } + if let Some(exception) = map.remove("__exception__") { + return normalize_error_payload(exception); + } + if let Some(exception) = map.remove("exception") { + return normalize_error_payload(exception); + } + + Value::Object(map) +} + +fn result_payload_is_error_wrapper(bytes: &[u8]) -> bool { + let Ok(value) = rmp_serde::from_slice::(bytes) else { + return false; + }; + let Value::Object(map) = value else { + return false; + }; + map.len() == 1 + && (map.contains_key("error") + || map.contains_key("__exception__") + || map.contains_key("exception")) +} + +fn format_error(error_bytes: &Option>) -> Option { + let bytes = error_bytes.as_ref()?; 
+ + match rmp_serde::from_slice::(bytes) { + Ok(value) => Some(pretty_json(&normalize_error_payload(value))), + Err(_) => Some("(decode error)".to_string()), + } +} + +fn format_node_status(status: &NodeStatus) -> String { + match status { + NodeStatus::Queued => "queued".to_string(), + NodeStatus::Running => "running".to_string(), + NodeStatus::Completed => "completed".to_string(), + NodeStatus::Failed => "failed".to_string(), + } +} + +fn merge_template_status(existing: &NodeStatus, new_status: &NodeStatus) -> NodeStatus { + if node_status_rank(new_status) > node_status_rank(existing) { + new_status.clone() + } else { + existing.clone() + } +} + +fn node_status_rank(status: &NodeStatus) -> u8 { + match status { + NodeStatus::Completed => 0, + NodeStatus::Queued => 1, + NodeStatus::Running => 2, + NodeStatus::Failed => 3, + } +} + +#[cfg(test)] +mod tests { + use std::collections::{HashMap, HashSet}; + + use chrono::{Duration as ChronoDuration, Utc}; + use prost::Message; + use serial_test::serial; + use uuid::Uuid; + use waymark_scheduler_backend::SchedulerBackend; + use waymark_webapp_backend::WebappBackend; + use waymark_worker_status_backend::{WorkerStatusBackend, WorkerStatusUpdate}; + use waymark_workflow_registry_backend::{WorkflowRegistration, WorkflowRegistryBackend}; + + use crate::PostgresBackend; + + use super::super::test_helpers::setup_backend; + use super::*; + + use waymark_dag::EdgeType; + use waymark_ir_parser::parse_program; + use waymark_runner_state::{ + ActionCallSpec, ExecutionEdge, ExecutionNode, LiteralValue, NodeStatus, + value_visitor::ValueExpr, + }; + use waymark_scheduler_core::{CreateScheduleParams, ScheduleType}; + + #[test] + fn format_extracted_inputs_happy_path() { + let mut nodes = HashMap::new(); + let mut first_assignments = HashMap::new(); + first_assignments.insert( + "iterations".to_string(), + ValueExpr::Literal(LiteralValue { + value: serde_json::json!(3), + }), + ); + nodes.insert( + Uuid::new_v4(), + ExecutionNode { + 
node_id: Uuid::new_v4(), + node_type: "assignment".to_string(), + label: "input iterations = 3".to_string(), + status: NodeStatus::Completed, + template_id: None, + targets: vec!["iterations".to_string()], + action: None, + value_expr: None, + assignments: first_assignments, + action_attempt: 0, + started_at: None, + completed_at: None, + scheduled_at: None, + }, + ); + + let mut second_assignments = HashMap::new(); + second_assignments.insert( + "sleep_seconds".to_string(), + ValueExpr::Literal(LiteralValue { + value: serde_json::json!(20), + }), + ); + nodes.insert( + Uuid::new_v4(), + ExecutionNode { + node_id: Uuid::new_v4(), + node_type: "assignment".to_string(), + label: "input sleep_seconds = 20".to_string(), + status: NodeStatus::Completed, + template_id: None, + targets: vec!["sleep_seconds".to_string()], + action: None, + value_expr: None, + assignments: second_assignments, + action_attempt: 0, + started_at: None, + completed_at: None, + scheduled_at: None, + }, + ); + + let rendered = format_extracted_inputs(&nodes); + let value: Value = serde_json::from_str(&rendered).expect("decode rendered input payload"); + assert_eq!( + value, + serde_json::json!({ + "iterations": 3, + "sleep_seconds": 20 + }) + ); + } + + #[test] + fn format_instance_result_payload_unwraps_success_result_wrapper() { + let result_bytes = + rmp_serde::to_vec_named(&serde_json::json!({"result": {"total_iterations": 3}})) + .expect("encode result"); + let rendered = + format_instance_result_payload(InstanceStatus::Completed, &Some(result_bytes), &None); + let value: Value = serde_json::from_str(&rendered).expect("decode result payload"); + assert_eq!(value, serde_json::json!({"total_iterations": 3})); + } + + #[test] + fn format_instance_result_payload_unwraps_error_wrapper() { + let error_bytes = rmp_serde::to_vec_named(&serde_json::json!({ + "error": { + "__exception__": { + "type": "ValueError", + "message": "boom" + } + } + })) + .expect("encode error"); + let rendered = + 
format_instance_result_payload(InstanceStatus::Failed, &None, &Some(error_bytes)); + let value: Value = serde_json::from_str(&rendered).expect("decode result payload"); + assert_eq!( + value, + serde_json::json!({ + "type": "ValueError", + "message": "boom" + }) + ); + } + + #[test] + fn determine_status_marks_wrapped_result_errors_as_failed() { + let result_bytes = + rmp_serde::to_vec_named(&serde_json::json!({"error": {"message": "boom"}})) + .expect("encode result error"); + let status = determine_status(&None, &Some(result_bytes), &None); + assert_eq!(status, InstanceStatus::Failed); + } + + #[test] + fn parse_instance_search_expr_handles_boolean_operators() { + let parsed = parse_instance_search_expr("(alpha OR beta) AND running"); + assert_eq!( + parsed, + Some(InstanceSearchExpr::And( + Box::new(InstanceSearchExpr::Or( + Box::new(InstanceSearchExpr::Term("alpha".to_string())), + Box::new(InstanceSearchExpr::Term("beta".to_string())), + )), + Box::new(InstanceSearchExpr::Term("running".to_string())), + )) + ); + } + + #[test] + fn parse_instance_search_expr_falls_back_for_unbalanced_parentheses() { + let parsed = parse_instance_search_expr("(alpha OR beta"); + assert_eq!( + parsed, + Some(InstanceSearchExpr::Term("(alpha OR beta".to_string())) + ); + } + + #[test] + fn action_timing_from_state_uses_state_timestamps_for_latest_attempt() { + let started_at = Utc::now() - ChronoDuration::milliseconds(1500); + let completed_at = started_at + ChronoDuration::milliseconds(450); + let fallback = Utc::now(); + let node = ExecutionNode { + node_id: Uuid::new_v4(), + node_type: "action_call".to_string(), + label: "@tests.action()".to_string(), + status: NodeStatus::Completed, + template_id: Some("n0".to_string()), + targets: Vec::new(), + action: Some(ActionCallSpec { + action_name: "tests.action".to_string(), + module_name: Some("tests".to_string()), + kwargs: HashMap::new(), + }), + value_expr: None, + assignments: HashMap::new(), + action_attempt: 2, + started_at: 
Some(started_at), + completed_at: Some(completed_at), + scheduled_at: None, + }; + + let (dispatched_at, finished_at, duration_ms) = + action_timing_from_state(Some(&node), 2, fallback); + assert_eq!(dispatched_at, Some(started_at.to_rfc3339())); + assert_eq!(finished_at, Some(completed_at.to_rfc3339())); + assert_eq!(duration_ms, Some(450)); + } + + #[test] + fn action_timing_from_state_falls_back_for_prior_attempt_rows() { + let started_at = Utc::now() - ChronoDuration::milliseconds(1200); + let completed_at = started_at + ChronoDuration::milliseconds(600); + let fallback = Utc::now(); + let node = ExecutionNode { + node_id: Uuid::new_v4(), + node_type: "action_call".to_string(), + label: "@tests.action()".to_string(), + status: NodeStatus::Completed, + template_id: Some("n0".to_string()), + targets: Vec::new(), + action: Some(ActionCallSpec { + action_name: "tests.action".to_string(), + module_name: Some("tests".to_string()), + kwargs: HashMap::new(), + }), + value_expr: None, + assignments: HashMap::new(), + action_attempt: 3, + started_at: Some(started_at), + completed_at: Some(completed_at), + scheduled_at: None, + }; + + let (dispatched_at, finished_at, duration_ms) = + action_timing_from_state(Some(&node), 2, fallback); + assert_eq!(dispatched_at, Some(fallback.to_rfc3339())); + assert_eq!(finished_at, Some(fallback.to_rfc3339())); + assert_eq!(duration_ms, None); + } + + fn sample_execution_node(execution_id: Uuid) -> ExecutionNode { + ExecutionNode { + node_id: execution_id, + node_type: "action_call".to_string(), + label: "@tests.action()".to_string(), + status: NodeStatus::Queued, + template_id: Some("n0".to_string()), + targets: Vec::new(), + action: Some(ActionCallSpec { + action_name: "tests.action".to_string(), + module_name: Some("tests".to_string()), + kwargs: HashMap::from([( + "value".to_string(), + ValueExpr::Literal(LiteralValue { + value: serde_json::json!(7), + }), + )]), + }), + value_expr: None, + assignments: HashMap::new(), + 
action_attempt: 1, + started_at: None, + completed_at: None, + scheduled_at: Some(Utc::now()), + } + } + + fn sample_graph(instance_id: Uuid, execution_id: Uuid) -> GraphUpdate { + let mut nodes = HashMap::new(); + nodes.insert(execution_id, sample_execution_node(execution_id)); + + GraphUpdate { + instance_id, + nodes, + edges: HashSet::from([ExecutionEdge { + source: execution_id, + target: execution_id, + edge_type: EdgeType::StateMachine, + }]), + } + } + + async fn insert_instance_with_graph_with_workflow( + backend: &PostgresBackend, + workflow_name: &str, + ) -> (Uuid, Uuid, Uuid) { + let instance_id = Uuid::new_v4(); + let entry_node = Uuid::new_v4(); + let execution_id = Uuid::new_v4(); + let workflow_version_id = insert_workflow_version(backend, workflow_name).await; + let graph = sample_graph(instance_id, execution_id); + let state_payload = rmp_serde::to_vec_named(&graph).expect("encode graph update"); + + sqlx::query( + "INSERT INTO runner_instances (instance_id, entry_node, workflow_version_id, state) VALUES ($1, $2, $3, $4)", + ) + .bind(instance_id) + .bind(entry_node) + .bind(workflow_version_id) + .bind(state_payload) + .execute(backend.pool()) + .await + .expect("insert runner instance"); + + (instance_id, entry_node, execution_id) + } + + async fn insert_instance_with_graph(backend: &PostgresBackend) -> (Uuid, Uuid, Uuid) { + insert_instance_with_graph_with_workflow(backend, "tests.workflow").await + } + + async fn insert_action_result(backend: &PostgresBackend, execution_id: Uuid) { + let payload = rmp_serde::to_vec_named(&serde_json::json!({"ok": true})) + .expect("encode action result"); + sqlx::query( + "INSERT INTO runner_actions_done (execution_id, attempt, result) VALUES ($1, $2, $3)", + ) + .bind(execution_id) + .bind(1_i32) + .bind(payload) + .execute(backend.pool()) + .await + .expect("insert action result"); + } + + fn sample_program_proto() -> Vec { + let source = r#" +fn main(input: [x], output: [y]): + y = @tests.action(value=x) + 
return y +"#; + let program = parse_program(source.trim()).expect("parse program"); + program.encode_to_vec() + } + + fn loop_program_proto() -> Vec { + let source = r#" +fn main(input: [items], output: [total]): + total = 0 + for item in items: + total = total + item + return total +"#; + let program = parse_program(source.trim()).expect("parse loop program"); + program.encode_to_vec() + } + + async fn insert_workflow_version(backend: &PostgresBackend, workflow_name: &str) -> Uuid { + WorkflowRegistryBackend::upsert_workflow_version( + backend, + &WorkflowRegistration { + workflow_name: workflow_name.to_string(), + workflow_version: "v1".to_string(), + ir_hash: format!("hash-{workflow_name}"), + program_proto: sample_program_proto(), + concurrent: false, + }, + ) + .await + .expect("insert workflow version") + } + + async fn insert_loop_workflow_version(backend: &PostgresBackend, workflow_name: &str) -> Uuid { + WorkflowRegistryBackend::upsert_workflow_version( + backend, + &WorkflowRegistration { + workflow_name: workflow_name.to_string(), + workflow_version: "v1-loop".to_string(), + ir_hash: format!("hash-loop-{workflow_name}"), + program_proto: loop_program_proto(), + concurrent: false, + }, + ) + .await + .expect("insert loop workflow version") + } + + async fn insert_schedule(backend: &PostgresBackend, schedule_name: &str) -> Uuid { + SchedulerBackend::upsert_schedule( + backend, + &CreateScheduleParams { + workflow_name: "tests.workflow".to_string(), + schedule_name: schedule_name.to_string(), + schedule_type: ScheduleType::Interval, + cron_expression: None, + interval_seconds: Some(60), + jitter_seconds: 0, + input_payload: Some( + rmp_serde::to_vec_named(&serde_json::json!({"k": "v"})) + .expect("encode payload"), + ), + priority: 0, + allow_duplicate: false, + }, + ) + .await + .expect("upsert schedule") + .0 + } + + async fn insert_scheduled_instance( + backend: &PostgresBackend, + schedule_id: Uuid, + created_at: DateTime, + with_result: bool, + ) -> 
Uuid { + let instance_id = Uuid::new_v4(); + let entry_node = Uuid::new_v4(); + let execution_id = Uuid::new_v4(); + let workflow_version_id = insert_workflow_version(backend, "tests.workflow").await; + let graph = sample_graph(instance_id, execution_id); + let state_payload = rmp_serde::to_vec_named(&graph).expect("encode graph update"); + let result_payload = if with_result { + Some( + rmp_serde::to_vec_named(&serde_json::json!({"result": {"ok": true}})) + .expect("encode result"), + ) + } else { + None + }; + + sqlx::query( + "INSERT INTO runner_instances (instance_id, entry_node, workflow_version_id, schedule_id, created_at, state, result, error) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)", + ) + .bind(instance_id) + .bind(entry_node) + .bind(workflow_version_id) + .bind(schedule_id) + .bind(created_at) + .bind(state_payload) + .bind(result_payload) + .bind(Option::>::None) + .execute(backend.pool()) + .await + .expect("insert scheduled instance"); + + instance_id + } + + async fn insert_worker_status(backend: &PostgresBackend, pool_id: Uuid) { + WorkerStatusBackend::upsert_worker_status( + backend, + &WorkerStatusUpdate { + pool_id, + throughput_per_min: 180.0, + total_completed: 20, + last_action_at: Some(Utc::now()), + median_dequeue_ms: Some(5), + median_handling_ms: Some(12), + dispatch_queue_size: 3, + total_in_flight: 2, + active_workers: 4, + actions_per_sec: 3.0, + median_instance_duration_secs: Some(0.2), + active_instance_count: 1, + total_instances_completed: 8, + instances_per_sec: 0.5, + instances_per_min: 30.0, + time_series: None, + }, + ) + .await + .expect("upsert worker status"); + } + + #[serial(postgres)] + #[tokio::test] + async fn webapp_count_instances_happy_path() { + let backend = setup_backend().await; + insert_instance_with_graph(&backend).await; + + let count = WebappBackend::count_instances(&backend, None) + .await + .expect("count instances"); + assert_eq!(count, 1); + } + + #[serial(postgres)] + #[tokio::test] + async fn 
webapp_count_instances_applies_search_expression() { + let backend = setup_backend().await; + let (alpha_id, _, _) = + insert_instance_with_graph_with_workflow(&backend, "tests.alpha").await; + let (beta_id, _, _) = + insert_instance_with_graph_with_workflow(&backend, "tests.beta").await; + assert_ne!(alpha_id, beta_id); + + let completed_payload = + rmp_serde::to_vec_named(&serde_json::json!({"result": {"ok": true}})) + .expect("encode completed payload"); + sqlx::query( + "UPDATE runner_instances SET result = $2, current_status = $3 WHERE instance_id = $1", + ) + .bind(beta_id) + .bind(completed_payload) + .bind("completed") + .execute(backend.pool()) + .await + .expect("mark beta completed"); + + let alpha_count = WebappBackend::count_instances(&backend, Some("alpha")) + .await + .expect("count alpha"); + assert_eq!(alpha_count, 1); + + let completed_count = WebappBackend::count_instances(&backend, Some("completed")) + .await + .expect("count completed"); + assert_eq!(completed_count, 1); + + let combined = WebappBackend::count_instances(&backend, Some("(alpha OR completed)")) + .await + .expect("count combined"); + assert_eq!(combined, 2); + } + + #[serial(postgres)] + #[tokio::test] + async fn webapp_list_instances_happy_path() { + let backend = setup_backend().await; + let (instance_id, _, _) = insert_instance_with_graph(&backend).await; + + let instances = WebappBackend::list_instances(&backend, None, 10, 0) + .await + .expect("list instances"); + + assert_eq!(instances.len(), 1); + assert_eq!(instances[0].id, instance_id); + assert_eq!(instances[0].status, InstanceStatus::Running); + assert_eq!( + instances[0].workflow_name, + Some("tests.workflow".to_string()) + ); + } + + #[serial(postgres)] + #[tokio::test] + async fn webapp_list_instances_applies_search_expression() { + let backend = setup_backend().await; + let (alpha_id, _, _) = + insert_instance_with_graph_with_workflow(&backend, "tests.alpha").await; + let _ = 
insert_instance_with_graph_with_workflow(&backend, "tests.beta").await; + + let alpha_instances = WebappBackend::list_instances(&backend, Some("alpha"), 10, 0) + .await + .expect("list alpha"); + assert_eq!(alpha_instances.len(), 1); + assert_eq!(alpha_instances[0].id, alpha_id); + + let running_instances = + WebappBackend::list_instances(&backend, Some("(alpha OR beta) AND running"), 10, 0) + .await + .expect("list running instances"); + assert_eq!(running_instances.len(), 2); + } + + #[serial(postgres)] + #[tokio::test] + async fn webapp_get_instance_happy_path() { + let backend = setup_backend().await; + let (instance_id, _, _) = insert_instance_with_graph(&backend).await; + + let instance = WebappBackend::get_instance(&backend, instance_id) + .await + .expect("get instance"); + + assert_eq!(instance.id, instance_id); + assert_eq!(instance.status, InstanceStatus::Running); + assert_eq!(instance.workflow_name, Some("tests.workflow".to_string())); + } + + #[serial(postgres)] + #[tokio::test] + async fn webapp_workflow_name_prefers_registered_workflow_name() { + let backend = setup_backend().await; + let (instance_id, entry_node, execution_id) = + insert_instance_with_graph_with_workflow(&backend, "tests.workflow_name").await; + + let list = WebappBackend::list_instances(&backend, None, 10, 0) + .await + .expect("list instances"); + assert_eq!(list.len(), 1); + assert_eq!(list[0].id, instance_id); + assert_eq!( + list[0].workflow_name, + Some("tests.workflow_name".to_string()) + ); + + let detail = WebappBackend::get_instance(&backend, instance_id) + .await + .expect("get instance"); + assert_eq!(detail.id, instance_id); + assert_eq!(detail.entry_node, entry_node); + assert_eq!( + detail.workflow_name, + Some("tests.workflow_name".to_string()) + ); + + let graph = WebappBackend::get_execution_graph(&backend, instance_id) + .await + .expect("get graph") + .expect("graph"); + assert!( + graph + .nodes + .iter() + .any(|node| node.id == execution_id.to_string()), + 
"expected action node to remain intact" + ); + } + + #[serial(postgres)] + #[tokio::test] + async fn webapp_get_execution_graph_happy_path() { + let backend = setup_backend().await; + let (instance_id, _, execution_id) = insert_instance_with_graph(&backend).await; + + let graph = WebappBackend::get_execution_graph(&backend, instance_id) + .await + .expect("get execution graph") + .expect("expected execution graph"); + + assert_eq!(graph.nodes.len(), 1); + assert_eq!(graph.edges.len(), 1); + assert_eq!(graph.nodes[0].id, execution_id.to_string()); + assert_eq!(graph.nodes[0].action_name, Some("tests.action".to_string())); + } + + #[serial(postgres)] + #[tokio::test] + async fn webapp_get_workflow_graph_uses_template_node_ids() { + let backend = setup_backend().await; + let (instance_id, _, execution_id) = insert_instance_with_graph(&backend).await; + + let graph = WebappBackend::get_workflow_graph(&backend, instance_id) + .await + .expect("get workflow graph") + .expect("expected workflow graph"); + + assert!(!graph.nodes.is_empty(), "workflow graph should have nodes"); + assert!( + graph + .nodes + .iter() + .all(|node| node.id != execution_id.to_string()), + "workflow graph should use template node ids, not runtime execution ids" + ); + assert!( + graph + .nodes + .iter() + .any(|node| node.node_type == "action_call"), + "workflow graph should include action_call template nodes" + ); + } + + #[serial(postgres)] + #[tokio::test] + async fn webapp_get_workflow_graph_marks_loop_back_edges() { + let backend = setup_backend().await; + let instance_id = Uuid::new_v4(); + let entry_node = Uuid::new_v4(); + let execution_id = Uuid::new_v4(); + let workflow_version_id = + insert_loop_workflow_version(&backend, "tests.loop_workflow").await; + let graph = sample_graph(instance_id, execution_id); + let state_payload = rmp_serde::to_vec_named(&graph).expect("encode graph update"); + + sqlx::query( + "INSERT INTO runner_instances (instance_id, entry_node, workflow_version_id, 
state) VALUES ($1, $2, $3, $4)", + ) + .bind(instance_id) + .bind(entry_node) + .bind(workflow_version_id) + .bind(state_payload) + .execute(backend.pool()) + .await + .expect("insert loop runner instance"); + + let workflow_graph = WebappBackend::get_workflow_graph(&backend, instance_id) + .await + .expect("get workflow graph") + .expect("expected workflow graph"); + + assert!( + workflow_graph + .edges + .iter() + .any(|edge| edge.edge_type == "state_machine_loop_back"), + "loop workflows should emit at least one loop_back edge" + ); + } + + #[serial(postgres)] + #[tokio::test] + async fn webapp_get_action_results_happy_path() { + let backend = setup_backend().await; + let (instance_id, _, execution_id) = insert_instance_with_graph(&backend).await; + insert_action_result(&backend, execution_id).await; + + let entries = WebappBackend::get_action_results(&backend, instance_id) + .await + .expect("get action results"); + + assert_eq!(entries.len(), 1); + assert_eq!(entries[0].action_id, execution_id.to_string()); + assert_eq!(entries[0].action_name, "tests.action"); + assert_eq!(entries[0].status, "completed"); + assert!(entries[0].request_preview.contains("\"value\": 7")); + } + + #[serial(postgres)] + #[tokio::test] + async fn webapp_get_distinct_workflows_happy_path() { + let backend = setup_backend().await; + insert_instance_with_graph_with_workflow(&backend, "tests.workflow_a").await; + insert_instance_with_graph_with_workflow(&backend, "tests.workflow_b").await; + + let workflows = WebappBackend::get_distinct_workflows(&backend) + .await + .expect("get distinct workflows"); + assert_eq!( + workflows, + vec![ + "tests.workflow_a".to_string(), + "tests.workflow_b".to_string() + ] + ); + } + + #[serial(postgres)] + #[tokio::test] + async fn webapp_get_distinct_statuses_happy_path() { + let backend = setup_backend().await; + + let statuses = WebappBackend::get_distinct_statuses(&backend) + .await + .expect("get distinct statuses"); + assert_eq!(statuses, 
vec!["queued", "running", "completed", "failed"]); + } + + #[serial(postgres)] + #[tokio::test] + async fn webapp_count_schedules_happy_path() { + let backend = setup_backend().await; + insert_schedule(&backend, "count").await; + + let count = WebappBackend::count_schedules(&backend) + .await + .expect("count schedules"); + assert_eq!(count, 1); + } + + #[serial(postgres)] + #[tokio::test] + async fn webapp_list_schedules_happy_path() { + let backend = setup_backend().await; + let schedule_id = insert_schedule(&backend, "list").await; + + let schedules = WebappBackend::list_schedules(&backend, 10, 0) + .await + .expect("list schedules"); + assert_eq!(schedules.len(), 1); + assert_eq!(schedules[0].id, schedule_id.to_string()); + assert_eq!(schedules[0].schedule_name, "list"); + } + + #[serial(postgres)] + #[tokio::test] + async fn webapp_get_schedule_happy_path() { + let backend = setup_backend().await; + let schedule_id = insert_schedule(&backend, "detail").await; + + let schedule = WebappBackend::get_schedule(&backend, schedule_id) + .await + .expect("get schedule"); + assert_eq!(schedule.id, schedule_id.to_string()); + assert_eq!(schedule.schedule_name, "detail"); + } + + #[serial(postgres)] + #[tokio::test] + async fn webapp_schedule_invocations_are_filtered_by_schedule_id() { + let backend = setup_backend().await; + let schedule_id = insert_schedule(&backend, "invocations-a").await; + let other_schedule_id = insert_schedule(&backend, "invocations-b").await; + + let running_instance_id = insert_scheduled_instance( + &backend, + schedule_id, + Utc::now() - ChronoDuration::minutes(2), + false, + ) + .await; + let completed_instance_id = insert_scheduled_instance( + &backend, + schedule_id, + Utc::now() - ChronoDuration::minutes(1), + true, + ) + .await; + let _other_instance_id = + insert_scheduled_instance(&backend, other_schedule_id, Utc::now(), true).await; + + let total = WebappBackend::count_schedule_invocations(&backend, schedule_id) + .await + 
.expect("count schedule invocations"); + assert_eq!(total, 2); + + let invocations = WebappBackend::list_schedule_invocations(&backend, schedule_id, 10, 0) + .await + .expect("list schedule invocations"); + assert_eq!(invocations.len(), 2); + assert_eq!(invocations[0].id, completed_instance_id); + assert_eq!(invocations[0].status, InstanceStatus::Completed); + assert_eq!(invocations[1].id, running_instance_id); + assert_eq!(invocations[1].status, InstanceStatus::Running); + } + + #[serial(postgres)] + #[tokio::test] + async fn webapp_update_schedule_status_happy_path() { + let backend = setup_backend().await; + let schedule_id = insert_schedule(&backend, "update").await; + + let updated = WebappBackend::update_schedule_status(&backend, schedule_id, "paused") + .await + .expect("update schedule status"); + assert!(updated); + + let schedule = WebappBackend::get_schedule(&backend, schedule_id) + .await + .expect("get schedule"); + assert_eq!(schedule.status, "paused"); + } + + #[serial(postgres)] + #[tokio::test] + async fn webapp_get_distinct_schedule_statuses_happy_path() { + let backend = setup_backend().await; + + let statuses = WebappBackend::get_distinct_schedule_statuses(&backend) + .await + .expect("get distinct schedule statuses"); + assert_eq!(statuses, vec!["active", "paused"]); + } + + #[serial(postgres)] + #[tokio::test] + async fn webapp_get_distinct_schedule_types_happy_path() { + let backend = setup_backend().await; + + let types = WebappBackend::get_distinct_schedule_types(&backend) + .await + .expect("get distinct schedule types"); + assert_eq!(types, vec!["cron", "interval"]); + } + + #[serial(postgres)] + #[tokio::test] + async fn webapp_get_worker_action_stats_happy_path() { + let backend = setup_backend().await; + let pool_id = Uuid::new_v4(); + insert_worker_status(&backend, pool_id).await; + + let rows = WebappBackend::get_worker_action_stats(&backend, 60) + .await + .expect("get worker action stats"); + assert_eq!(rows.len(), 1); + 
assert_eq!(rows[0].pool_id, pool_id.to_string()); + assert_eq!(rows[0].total_completed, 20); + } + + #[serial(postgres)] + #[tokio::test] + async fn webapp_get_worker_aggregate_stats_happy_path() { + let backend = setup_backend().await; + insert_worker_status(&backend, Uuid::new_v4()).await; + + let aggregate = WebappBackend::get_worker_aggregate_stats(&backend, 60) + .await + .expect("get worker aggregate stats"); + assert_eq!(aggregate.active_worker_count, 1); + assert_eq!(aggregate.total_in_flight, 2); + assert_eq!(aggregate.total_queue_depth, 3); + } + + #[serial(postgres)] + #[tokio::test] + async fn webapp_worker_status_table_exists_happy_path() { + let backend = setup_backend().await; + + assert!(WebappBackend::worker_status_table_exists(&backend).await); + } + + #[serial(postgres)] + #[tokio::test] + async fn webapp_schedules_table_exists_happy_path() { + let backend = setup_backend().await; + + assert!(WebappBackend::schedules_table_exists(&backend).await); + } + + #[serial(postgres)] + #[tokio::test] + async fn webapp_get_worker_statuses_happy_path() { + let backend = setup_backend().await; + let pool_id = Uuid::new_v4(); + insert_worker_status(&backend, pool_id).await; + let (completed_instance_id, _, _) = insert_instance_with_graph(&backend).await; + let completed_payload = + rmp_serde::to_vec_named(&serde_json::json!({"ok": true})).expect("encode result"); + sqlx::query("UPDATE runner_instances SET result = $2 WHERE instance_id = $1") + .bind(completed_instance_id) + .bind(completed_payload) + .execute(backend.pool()) + .await + .expect("mark instance completed"); + + let (failed_instance_id, _, _) = insert_instance_with_graph(&backend).await; + let error_payload = rmp_serde::to_vec_named(&serde_json::json!({ + "type": "Exception", + "message": "boom", + })) + .expect("encode error"); + sqlx::query("UPDATE runner_instances SET error = $2 WHERE instance_id = $1") + .bind(failed_instance_id) + .bind(error_payload) + .execute(backend.pool()) + .await + 
.expect("mark instance failed"); + + let statuses = WebappBackend::get_worker_statuses(&backend, 60) + .await + .expect("get worker statuses"); + assert_eq!(statuses.len(), 1); + assert_eq!(statuses[0].pool_id, pool_id); + assert_eq!(statuses[0].total_completed, 20); + assert_eq!(statuses[0].total_instances_completed, 1); + assert_eq!(statuses[0].total_in_flight, Some(2)); + assert_eq!(statuses[0].dispatch_queue_size, Some(3)); + } +} diff --git a/crates/backends-core/src/lib.rs b/crates/backends-core/src/lib.rs index 50f807b4..ee49d385 100644 --- a/crates/backends-core/src/lib.rs +++ b/crates/backends-core/src/lib.rs @@ -27,3 +27,10 @@ pub type InnerError = (); /// TODO: move away from the single-`Result` type aliases as we want to vary /// rrors per-call. pub type BackendResult = Result>; + +#[cfg(feature = "sqlx-error")] +impl From for BackendError { + fn from(value: sqlx::Error) -> Self { + Self::Inner(value) + } +} diff --git a/crates/core-backend/src/lib.rs b/crates/core-backend/src/lib.rs index e38f5cdd..5e876b29 100644 --- a/crates/core-backend/src/lib.rs +++ b/crates/core-backend/src/lib.rs @@ -4,7 +4,7 @@ mod data; use uuid::Uuid; -pub use waymark_backends_core::{BackendError, BackendResult}; +use waymark_backends_core::BackendResult; pub use self::data::*; diff --git a/crates/dag/Cargo.toml b/crates/dag/Cargo.toml index 54abfd13..a8bda894 100644 --- a/crates/dag/Cargo.toml +++ b/crates/dag/Cargo.toml @@ -11,4 +11,4 @@ uuid = { workspace = true, features = ["serde", "v4"] } waymark-proto = { workspace = true, features = ["serde"] } [dev-dependencies] -waymark = { workspace = true } +waymark-ir-parser = { workspace = true } diff --git a/crates/dag/src/builder/test_helpers.rs b/crates/dag/src/builder/test_helpers.rs index ed33ca00..0c9811c1 100644 --- a/crates/dag/src/builder/test_helpers.rs +++ b/crates/dag/src/builder/test_helpers.rs @@ -1,5 +1,5 @@ use crate::{DAG, DAGConverter, convert_to_dag}; -use waymark::waymark_core::ir_parser::parse_program; +use 
waymark_ir_parser::parse_program; use waymark_proto::ast as ir; pub(super) fn dedent(source: &str) -> String { diff --git a/crates/dag/src/validate.rs b/crates/dag/src/validate.rs index b6aa67e1..0e48504e 100644 --- a/crates/dag/src/validate.rs +++ b/crates/dag/src/validate.rs @@ -361,7 +361,7 @@ fn collect_expr_variables(expr: &ir::Expr, vars: &mut HashSet) { mod tests { use super::validate_dag; use crate::convert_to_dag; - use waymark::waymark_core::ir_parser::parse_program; + use waymark_ir_parser::parse_program; #[test] fn validate_dag_rejects_unresolved_variable_reference() { diff --git a/crates/garbage-collector-backend/src/lib.rs b/crates/garbage-collector-backend/src/lib.rs index d3f2e234..af8badff 100644 --- a/crates/garbage-collector-backend/src/lib.rs +++ b/crates/garbage-collector-backend/src/lib.rs @@ -1,6 +1,6 @@ use chrono::{DateTime, Utc}; -pub use waymark_backends_core::{BackendError, BackendResult}; +use waymark_backends_core::BackendResult; #[derive(Clone, Copy, Debug, Default)] /// Summary of a garbage collection sweep. diff --git a/crates/integration-support/Cargo.toml b/crates/integration-support/Cargo.toml new file mode 100644 index 00000000..1e24644c --- /dev/null +++ b/crates/integration-support/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "waymark-integration-support" +version = "0.1.0" +edition = "2024" + +[dependencies] +anyhow = { workspace = true } +sqlx = { workspace = true } +tokio = { workspace = true, features = ["process", "time", "sync"] } +waymark-backend-postgres-migrations = { workspace = true } diff --git a/crates/integration-support/src/lib.rs b/crates/integration-support/src/lib.rs new file mode 100644 index 00000000..db198a86 --- /dev/null +++ b/crates/integration-support/src/lib.rs @@ -0,0 +1,5 @@ +//! Shared integration harness helpers used by test binaries and Rust tests. 
+ +mod postgres; + +pub use postgres::{LOCAL_POSTGRES_DSN, connect_pool, ensure_local_postgres}; diff --git a/crates/integration-support/src/postgres.rs b/crates/integration-support/src/postgres.rs new file mode 100644 index 00000000..59c9caec --- /dev/null +++ b/crates/integration-support/src/postgres.rs @@ -0,0 +1,103 @@ +//! Shared Postgres bootstrap for integration harnesses. + +use std::path::PathBuf; +use std::time::{Duration, Instant}; + +use anyhow::{Context, Result, anyhow, bail}; +use sqlx::{PgPool, postgres::PgPoolOptions}; +use tokio::process::Command; +use tokio::sync::OnceCell; + +pub const LOCAL_POSTGRES_DSN: &str = "postgresql://waymark:waymark@127.0.0.1:5433/waymark"; + +const READY_TIMEOUT: Duration = Duration::from_secs(45); +const RETRY_DELAY: Duration = Duration::from_millis(500); +const POOL_MAX_CONNECTIONS: u32 = 32; +const POOL_ACQUIRE_TIMEOUT: Duration = Duration::from_secs(15); + +static LOCAL_POSTGRES_BOOTSTRAPPED: OnceCell<()> = OnceCell::const_new(); + +/// Ensure the default local Postgres is available and migrated. +/// +/// This helper is intended for local integration workflows where the default +/// DSN maps to the repository docker-compose service. +pub async fn ensure_local_postgres() -> Result<()> { + LOCAL_POSTGRES_BOOTSTRAPPED + .get_or_try_init(|| async { ensure_local_postgres_impl().await }) + .await?; + Ok(()) +} + +/// Connect a PgPool using integration defaults. +pub async fn connect_pool(dsn: &str) -> Result { + Ok(PgPoolOptions::new() + .max_connections(POOL_MAX_CONNECTIONS) + .acquire_timeout(POOL_ACQUIRE_TIMEOUT) + .connect(dsn) + .await?) 
+} + +async fn ensure_local_postgres_impl() -> Result<()> { + if let Ok(pool) = connect_pool(LOCAL_POSTGRES_DSN).await { + waymark_backend_postgres_migrations::run(&pool) + .await + .context("run migrations for existing local postgres")?; + pool.close().await; + return Ok(()); + } + + run_compose_up().await?; + let pool = wait_for_postgres(LOCAL_POSTGRES_DSN).await?; + waymark_backend_postgres_migrations::run(&pool) + .await + .context("run migrations for local postgres")?; + pool.close().await; + Ok(()) +} + +async fn run_compose_up() -> Result<()> { + let root = project_root(); + let status = Command::new("docker") + .arg("compose") + .arg("-f") + .arg("../../docker-compose.yml") + .arg("up") + .arg("-d") + .arg("postgres") + .current_dir(&root) + .status() + .await + .with_context(|| format!("failed to run docker compose in {}", root.display()))?; + + if !status.success() { + bail!("docker compose up -d postgres exited with status {status}"); + } + + Ok(()) +} + +async fn wait_for_postgres(dsn: &str) -> Result { + let deadline = Instant::now() + READY_TIMEOUT; + let mut last_error = None; + + while Instant::now() < deadline { + match connect_pool(dsn).await { + Ok(pool) => return Ok(pool), + Err(err) => { + last_error = Some(err); + tokio::time::sleep(RETRY_DELAY).await; + } + } + } + + Err(anyhow!( + "timed out waiting for postgres at {dsn}; last error: {}", + last_error + .map(|err| err.to_string()) + .unwrap_or_else(|| "unknown".to_string()) + )) +} + +fn project_root() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) +} diff --git a/crates/runner/src/lib.rs b/crates/runner/src/lib.rs index ed59081f..684a49a0 100644 --- a/crates/runner/src/lib.rs +++ b/crates/runner/src/lib.rs @@ -4,7 +4,9 @@ pub mod executor; pub mod expression_evaluator; pub mod replay; pub(crate) mod retry; -pub(crate) mod synthetic_exceptions; + +/// TODO: make `pub(crate)` +pub mod synthetic_exceptions; pub use executor::{ DurableUpdates, ExecutorStep, RunnerExecutor, 
RunnerExecutorError, SleepRequest, diff --git a/crates/runner/src/synthetic_exceptions.rs b/crates/runner/src/synthetic_exceptions.rs index df89b71f..5bd2be0d 100644 --- a/crates/runner/src/synthetic_exceptions.rs +++ b/crates/runner/src/synthetic_exceptions.rs @@ -3,13 +3,13 @@ use serde_json::Value; #[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub(crate) enum SyntheticExceptionType { +pub enum SyntheticExceptionType { ExecutorResume, ActionTimeout, } impl SyntheticExceptionType { - pub(crate) fn as_type_str(self) -> &'static str { + pub fn as_type_str(self) -> &'static str { match self { Self::ExecutorResume => "ExecutorResume", Self::ActionTimeout => "ActionTimeout", @@ -24,7 +24,7 @@ impl SyntheticExceptionType { } } - pub(crate) fn from_value(value: &Value) -> Option { + pub fn from_value(value: &Value) -> Option { let Value::Object(map) = value else { return None; }; @@ -34,7 +34,7 @@ impl SyntheticExceptionType { } } -pub(crate) fn build_synthetic_exception_value( +pub fn build_synthetic_exception_value( exception_type: SyntheticExceptionType, message: impl Into, fields: Vec<(String, Value)>, diff --git a/crates/test-support/Cargo.toml b/crates/test-support/Cargo.toml new file mode 100644 index 00000000..829d2395 --- /dev/null +++ b/crates/test-support/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "waymark-test-support" +version = "0.1.0" +edition = "2024" + +[dependencies] +sqlx = { workspace = true } +waymark-integration-support = { workspace = true } diff --git a/crates/test-support/src/lib.rs b/crates/test-support/src/lib.rs new file mode 100644 index 00000000..5e34abaa --- /dev/null +++ b/crates/test-support/src/lib.rs @@ -0,0 +1,5 @@ +//! Shared test fixtures for Rust tests. + +mod postgres; + +pub use postgres::postgres_setup; diff --git a/crates/test-support/src/postgres.rs b/crates/test-support/src/postgres.rs new file mode 100644 index 00000000..e76bf812 --- /dev/null +++ b/crates/test-support/src/postgres.rs @@ -0,0 +1,15 @@ +//! 
Shared Postgres fixture bootstrapped from root docker-compose. + +use sqlx::PgPool; + +use waymark_integration_support::{LOCAL_POSTGRES_DSN, connect_pool, ensure_local_postgres}; + +/// Ensure test Postgres is available and migrated, then return a pooled connection. +pub async fn postgres_setup() -> PgPool { + ensure_local_postgres() + .await + .unwrap_or_else(|err| panic!("postgres_setup bootstrap failed: {err:#}")); + connect_pool(LOCAL_POSTGRES_DSN) + .await + .unwrap_or_else(|err| panic!("postgres_setup connect failed: {err:#}")) +} diff --git a/crates/webapp-backend/src/lib.rs b/crates/webapp-backend/src/lib.rs index 354e0e67..bc8f365c 100644 --- a/crates/webapp-backend/src/lib.rs +++ b/crates/webapp-backend/src/lib.rs @@ -1,5 +1,5 @@ use uuid::Uuid; -pub use waymark_backends_core::{BackendError, BackendResult}; +use waymark_backends_core::BackendResult; use waymark_webapp_core::{ ExecutionGraphView, InstanceDetail, InstanceSummary, ScheduleDetail, ScheduleInvocationSummary, ScheduleSummary, TimelineEntry, WorkerActionRow, WorkerAggregateStats, WorkerStatus, diff --git a/crates/webapp-core/src/lib.rs b/crates/webapp-core/src/lib.rs index 7805c428..61a4a453 100644 --- a/crates/webapp-core/src/lib.rs +++ b/crates/webapp-core/src/lib.rs @@ -4,58 +4,6 @@ use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; use uuid::Uuid; -/// Configuration for the webapp server. -#[derive(Debug, Clone)] -pub struct WebappConfig { - pub enabled: bool, - pub host: String, - pub port: u16, -} - -impl Default for WebappConfig { - fn default() -> Self { - Self { - enabled: false, - host: "0.0.0.0".to_string(), - port: 24119, - } - } -} - -impl WebappConfig { - /// Create config from environment variables. 
- pub fn from_env() -> Self { - let enabled = std::env::var("WAYMARK_WEBAPP_ENABLED") - .map(|v| v == "true" || v == "1") - .unwrap_or(false); - - let (host, port) = std::env::var("WAYMARK_WEBAPP_ADDR") - .ok() - .and_then(|addr| { - let parts: Vec<&str> = addr.split(':').collect(); - if parts.len() == 2 { - let host = parts[0].to_string(); - let port = parts[1].parse().ok()?; - Some((host, port)) - } else { - None - } - }) - .unwrap_or_else(|| ("0.0.0.0".to_string(), 24119)); - - Self { - enabled, - host, - port, - } - } - - /// Get the bind address. - pub fn bind_addr(&self) -> String { - format!("{}:{}", self.host, self.port) - } -} - /// Instance status. #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] #[serde(rename_all = "lowercase")] From f15dca096936ebfddc9ecdd3011a905d04b82107 Mon Sep 17 00:00:00 2001 From: MOZGIII Date: Wed, 25 Feb 2026 21:46:00 +0400 Subject: [PATCH 5/5] Integrate separate crates into waymark main crate --- Cargo.lock | 23 +- crates/fuzzer/Cargo.toml | 4 + crates/fuzzer/src/harness.rs | 8 +- crates/waymark/Cargo.toml | 23 +- crates/waymark/migrations/0001_init.sql | 115 - .../0002_runner_actions_done_execution_id.sql | 7 - .../migrations/0003_instance_locks.sql | 12 - .../migrations/0004_workflow_versions.sql | 21 - ...5_runner_instances_workflow_version_id.sql | 7 - .../0006_drop_unused_runner_tables.sql | 4 - .../0007_runner_instances_schedule_id.sql | 5 - .../0008_runner_actions_done_timing.sql | 14 - .../0009_instance_search_columns.sql | 63 - crates/waymark/src/backends/base.rs | 366 -- crates/waymark/src/backends/memory.rs | 814 ----- crates/waymark/src/backends/mod.rs | 15 - crates/waymark/src/backends/postgres/core.rs | 1992 ----------- crates/waymark/src/backends/postgres/mod.rs | 116 - .../waymark/src/backends/postgres/registry.rs | 146 - .../src/backends/postgres/scheduler.rs | 604 ---- .../src/backends/postgres/test_helpers.rs | 27 - .../waymark/src/backends/postgres/webapp.rs | 2324 ------------- 
crates/waymark/src/bin/integration_test.rs | 15 +- crates/waymark/src/bin/soak-harness.rs | 11 +- crates/waymark/src/bin/start-workers.rs | 5 +- crates/waymark/src/bin/waymark-bridge.rs | 22 +- crates/waymark/src/db.rs | 14 - crates/waymark/src/garbage_collector/task.rs | 5 +- crates/waymark/src/integration_support/mod.rs | 5 - .../src/integration_support/postgres.rs | 105 - crates/waymark/src/lib.rs | 10 +- crates/waymark/src/observability.rs | 2 +- crates/waymark/src/scheduler/mod.rs | 4 - crates/waymark/src/scheduler/task.rs | 32 +- crates/waymark/src/scheduler/types.rs | 139 - crates/waymark/src/scheduler/utils.rs | 181 - crates/waymark/src/test_support/mod.rs | 5 - crates/waymark/src/test_support/postgres.rs | 15 - .../waymark/src/waymark_core/cli/benchmark.rs | 15 +- crates/waymark/src/waymark_core/cli/smoke.rs | 8 +- crates/waymark/src/waymark_core/ir_format.rs | 2 +- crates/waymark/src/waymark_core/lock.rs | 5 +- crates/waymark/src/waymark_core/mod.rs | 3 - crates/waymark/src/waymark_core/runloop.rs | 32 +- .../waymark/src/waymark_core/runloop/tests.rs | 136 +- .../src/waymark_core/runner/executor.rs | 3031 ----------------- .../runner/expression_evaluator.rs | 1058 ------ crates/waymark/src/waymark_core/runner/mod.rs | 19 - .../waymark/src/waymark_core/runner/replay.rs | 658 ---- .../waymark/src/waymark_core/runner/retry.rs | 137 - .../waymark/src/waymark_core/runner/state.rs | 2201 ------------ .../runner/synthetic_exceptions.rs | 90 - .../src/waymark_core/runner/value_visitor.rs | 533 --- crates/waymark/src/webapp/server.rs | 45 +- crates/waymark/src/webapp/types.rs | 248 +- crates/waymark/src/workers/status.rs | 2 +- 56 files changed, 162 insertions(+), 15341 deletions(-) delete mode 100644 crates/waymark/migrations/0001_init.sql delete mode 100644 crates/waymark/migrations/0002_runner_actions_done_execution_id.sql delete mode 100644 crates/waymark/migrations/0003_instance_locks.sql delete mode 100644 crates/waymark/migrations/0004_workflow_versions.sql 
delete mode 100644 crates/waymark/migrations/0005_runner_instances_workflow_version_id.sql delete mode 100644 crates/waymark/migrations/0006_drop_unused_runner_tables.sql delete mode 100644 crates/waymark/migrations/0007_runner_instances_schedule_id.sql delete mode 100644 crates/waymark/migrations/0008_runner_actions_done_timing.sql delete mode 100644 crates/waymark/migrations/0009_instance_search_columns.sql delete mode 100644 crates/waymark/src/backends/base.rs delete mode 100644 crates/waymark/src/backends/memory.rs delete mode 100644 crates/waymark/src/backends/mod.rs delete mode 100644 crates/waymark/src/backends/postgres/core.rs delete mode 100644 crates/waymark/src/backends/postgres/mod.rs delete mode 100644 crates/waymark/src/backends/postgres/registry.rs delete mode 100644 crates/waymark/src/backends/postgres/scheduler.rs delete mode 100644 crates/waymark/src/backends/postgres/test_helpers.rs delete mode 100644 crates/waymark/src/backends/postgres/webapp.rs delete mode 100644 crates/waymark/src/db.rs delete mode 100644 crates/waymark/src/integration_support/mod.rs delete mode 100644 crates/waymark/src/integration_support/postgres.rs delete mode 100644 crates/waymark/src/scheduler/types.rs delete mode 100644 crates/waymark/src/scheduler/utils.rs delete mode 100644 crates/waymark/src/test_support/mod.rs delete mode 100644 crates/waymark/src/test_support/postgres.rs delete mode 100644 crates/waymark/src/waymark_core/runner/executor.rs delete mode 100644 crates/waymark/src/waymark_core/runner/expression_evaluator.rs delete mode 100644 crates/waymark/src/waymark_core/runner/mod.rs delete mode 100644 crates/waymark/src/waymark_core/runner/replay.rs delete mode 100644 crates/waymark/src/waymark_core/runner/retry.rs delete mode 100644 crates/waymark/src/waymark_core/runner/state.rs delete mode 100644 crates/waymark/src/waymark_core/runner/synthetic_exceptions.rs delete mode 100644 crates/waymark/src/waymark_core/runner/value_visitor.rs diff --git a/Cargo.lock 
b/Cargo.lock index 9737bb86..08e29a10 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3366,10 +3366,27 @@ dependencies = [ "tracing-chrome", "tracing-subscriber", "uuid", + "waymark-backend-fault-injection", + "waymark-backend-memory", + "waymark-backend-postgres", + "waymark-backend-postgres-migrations", + "waymark-backends-core", + "waymark-core-backend", "waymark-dag", + "waymark-garbage-collector-backend", + "waymark-integration-support", "waymark-ir-parser", - "waymark-observability-macros", + "waymark-observability", "waymark-proto", + "waymark-runner", + "waymark-runner-state", + "waymark-scheduler-backend", + "waymark-scheduler-core", + "waymark-test-support", + "waymark-webapp-backend", + "waymark-webapp-core", + "waymark-worker-status-backend", + "waymark-workflow-registry-backend", ] [[package]] @@ -3493,8 +3510,12 @@ dependencies = [ "tokio", "uuid", "waymark", + "waymark-backend-memory", + "waymark-core-backend", "waymark-dag", "waymark-ir-parser", + "waymark-runner-state", + "waymark-workflow-registry-backend", ] [[package]] diff --git a/crates/fuzzer/Cargo.toml b/crates/fuzzer/Cargo.toml index 4e0fd1e0..8c7c039d 100644 --- a/crates/fuzzer/Cargo.toml +++ b/crates/fuzzer/Cargo.toml @@ -15,3 +15,7 @@ tokio = { workspace = true } waymark = { workspace = true } waymark-dag = { workspace = true } waymark-ir-parser = { workspace = true } +waymark-runner-state = { workspace = true } +waymark-backend-memory = { workspace = true } +waymark-core-backend = { workspace = true } +waymark-workflow-registry-backend = { workspace = true } diff --git a/crates/fuzzer/src/harness.rs b/crates/fuzzer/src/harness.rs index 59dc38c2..2bec4043 100644 --- a/crates/fuzzer/src/harness.rs +++ b/crates/fuzzer/src/harness.rs @@ -9,17 +9,17 @@ use prost::Message; use serde_json::Value; use sha2::{Digest, Sha256}; use uuid::Uuid; +use waymark_backend_memory::MemoryBackend; +use waymark_core_backend::QueuedInstance; +use waymark_workflow_registry_backend::{WorkflowRegistration, 
WorkflowRegistryBackend as _}; use super::generator::GeneratedCase; -use waymark::backends::{ - MemoryBackend, QueuedInstance, WorkflowRegistration, WorkflowRegistryBackend, -}; use waymark::messages::ast as ir; use waymark::waymark_core::runloop::{RunLoop, RunLoopSupervisorConfig}; -use waymark::waymark_core::runner::RunnerState; use waymark::workers::{ActionCallable, InlineWorkerPool, WorkerPoolError}; use waymark_dag::convert_to_dag; use waymark_ir_parser::parse_program; +use waymark_runner_state::RunnerState; pub async fn run_case(case_index: usize, case: &GeneratedCase) -> Result<()> { let program = parse_program(case.source.trim()).map_err(|err| { diff --git a/crates/waymark/Cargo.toml b/crates/waymark/Cargo.toml index 83c06cd3..91b817bf 100644 --- a/crates/waymark/Cargo.toml +++ b/crates/waymark/Cargo.toml @@ -17,9 +17,25 @@ name = "smoke" path = "src/bin/smoke.rs" [dependencies] -waymark-proto = { workspace = true, features = ["serde", "client", "server"] } +waymark-core-backend = { workspace = true } waymark-dag = { workspace = true } waymark-ir-parser = { workspace = true } +waymark-observability = { workspace = true } +waymark-proto = { workspace = true, features = ["serde", "client", "server"] } +waymark-runner = { workspace = true } +waymark-runner-state = { workspace = true } +waymark-webapp-backend = { workspace = true } +waymark-webapp-core = { workspace = true } +waymark-garbage-collector-backend = { workspace = true } +waymark-scheduler-backend = { workspace = true } +waymark-scheduler-core = { workspace = true } +waymark-backends-core = { workspace = true } +waymark-integration-support = { workspace = true } +waymark-backend-postgres = { workspace = true } +waymark-backend-postgres-migrations = { workspace = true } +waymark-workflow-registry-backend = { workspace = true } +waymark-worker-status-backend = { workspace = true } +waymark-backend-memory = { workspace = true } anyhow = "1" axum = "0.8" @@ -51,7 +67,6 @@ tracing-subscriber = { version = 
"0.3", features = ["env-filter"] } tracing-chrome = "0.7" metrics = "0.24" regex = "1" -waymark-observability-macros = { path = "../observability-macros" } console-subscriber = { version = "0.5", optional = true } [features] @@ -59,6 +74,10 @@ trace = [] observability = ["trace", "dep:console-subscriber"] [dev-dependencies] +waymark-backend-fault-injection = { workspace = true } +waymark-backend-memory = { workspace = true } +waymark-test-support = { workspace = true } + serial_test = "2" tower = { version = "0.5", features = ["util"] } http-body-util = "0.1" diff --git a/crates/waymark/migrations/0001_init.sql b/crates/waymark/migrations/0001_init.sql deleted file mode 100644 index dbb6b7da..00000000 --- a/crates/waymark/migrations/0001_init.sql +++ /dev/null @@ -1,115 +0,0 @@ --- Waymark core schema (baseline) - -CREATE EXTENSION IF NOT EXISTS pgcrypto; - --- --------------------------------------------------------------------------- --- Workflow definitions --- --------------------------------------------------------------------------- - -CREATE TABLE workflow_versions ( - id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - workflow_name TEXT NOT NULL, - dag_hash TEXT NOT NULL, - program_proto BYTEA NOT NULL, - concurrent BOOLEAN NOT NULL DEFAULT false, - created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - UNIQUE(workflow_name, dag_hash) -); - -CREATE INDEX idx_workflow_versions_name ON workflow_versions(workflow_name); - --- --------------------------------------------------------------------------- --- Runner persistence tables --- --------------------------------------------------------------------------- - -CREATE TABLE runner_graph_updates ( - id BIGSERIAL PRIMARY KEY, - created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - state BYTEA NOT NULL -); - -CREATE TABLE runner_actions_done ( - id BIGSERIAL PRIMARY KEY, - created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - node_id UUID NOT NULL, - action_name TEXT NOT NULL, - attempt INTEGER NOT NULL, - result BYTEA -); - 
-CREATE TABLE runner_instances ( - instance_id UUID PRIMARY KEY, - created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - entry_node UUID NOT NULL, - state BYTEA, - result BYTEA, - error BYTEA -); - -CREATE TABLE runner_instances_done ( - id BIGSERIAL PRIMARY KEY, - created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - executor_id UUID NOT NULL, - entry_node UUID NOT NULL, - result BYTEA, - error BYTEA -); - -CREATE TABLE queued_instances ( - instance_id UUID PRIMARY KEY, - created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - payload BYTEA NOT NULL -); - --- --------------------------------------------------------------------------- --- Scheduler --- --------------------------------------------------------------------------- - -CREATE TABLE workflow_schedules ( - id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - workflow_name TEXT NOT NULL, - schedule_name TEXT NOT NULL, - schedule_type TEXT NOT NULL, - cron_expression TEXT, - interval_seconds BIGINT, - jitter_seconds BIGINT NOT NULL DEFAULT 0, - input_payload BYTEA, - status TEXT NOT NULL DEFAULT 'active', - next_run_at TIMESTAMPTZ, - last_run_at TIMESTAMPTZ, - last_instance_id UUID, - created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - priority INT NOT NULL DEFAULT 0, - allow_duplicate BOOLEAN NOT NULL DEFAULT false, - UNIQUE(workflow_name, schedule_name) -); - -CREATE INDEX idx_schedules_due ON workflow_schedules(next_run_at) - WHERE status = 'active' AND next_run_at IS NOT NULL; - --- --------------------------------------------------------------------------- --- Worker status metrics --- --------------------------------------------------------------------------- - -CREATE TABLE worker_status ( - pool_id UUID NOT NULL, - worker_id BIGINT NOT NULL, - throughput_per_min DOUBLE PRECISION NOT NULL DEFAULT 0, - total_completed BIGINT NOT NULL DEFAULT 0, - last_action_at TIMESTAMPTZ, - updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - median_dequeue_ms BIGINT, - median_handling_ms BIGINT, 
- dispatch_queue_size BIGINT, - total_in_flight BIGINT, - active_workers INT NOT NULL DEFAULT 0, - actions_per_sec DOUBLE PRECISION NOT NULL DEFAULT 0, - median_instance_duration_secs DOUBLE PRECISION, - active_instance_count INT NOT NULL DEFAULT 0, - total_instances_completed BIGINT NOT NULL DEFAULT 0, - instances_per_sec DOUBLE PRECISION NOT NULL DEFAULT 0, - instances_per_min DOUBLE PRECISION NOT NULL DEFAULT 0, - time_series BYTEA, - PRIMARY KEY (pool_id, worker_id) -); diff --git a/crates/waymark/migrations/0002_runner_actions_done_execution_id.sql b/crates/waymark/migrations/0002_runner_actions_done_execution_id.sql deleted file mode 100644 index b4bce178..00000000 --- a/crates/waymark/migrations/0002_runner_actions_done_execution_id.sql +++ /dev/null @@ -1,7 +0,0 @@ --- Rename runner action identifier to execution_id and drop stored action name. - -ALTER TABLE runner_actions_done - RENAME COLUMN node_id TO execution_id; - -ALTER TABLE runner_actions_done - DROP COLUMN action_name; diff --git a/crates/waymark/migrations/0003_instance_locks.sql b/crates/waymark/migrations/0003_instance_locks.sql deleted file mode 100644 index 6b826d18..00000000 --- a/crates/waymark/migrations/0003_instance_locks.sql +++ /dev/null @@ -1,12 +0,0 @@ --- Add scheduling and locking for queued instances. 
- -ALTER TABLE queued_instances - ADD COLUMN scheduled_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - ADD COLUMN lock_uuid UUID, - ADD COLUMN lock_expires_at TIMESTAMPTZ; - -CREATE INDEX IF NOT EXISTS idx_queued_instances_scheduled_at - ON queued_instances(scheduled_at); - -CREATE INDEX IF NOT EXISTS idx_queued_instances_lock_expires_at - ON queued_instances(lock_expires_at); diff --git a/crates/waymark/migrations/0004_workflow_versions.sql b/crates/waymark/migrations/0004_workflow_versions.sql deleted file mode 100644 index daf3b54d..00000000 --- a/crates/waymark/migrations/0004_workflow_versions.sql +++ /dev/null @@ -1,21 +0,0 @@ --- Workflow versions: replace dag_hash with workflow_version + ir_hash - -ALTER TABLE workflow_versions - RENAME COLUMN dag_hash TO workflow_version; - -ALTER TABLE workflow_versions - ADD COLUMN ir_hash TEXT; - -UPDATE workflow_versions -SET ir_hash = workflow_version -WHERE ir_hash IS NULL; - -ALTER TABLE workflow_versions - ALTER COLUMN ir_hash SET NOT NULL; - -ALTER TABLE workflow_versions - DROP CONSTRAINT IF EXISTS workflow_versions_workflow_name_dag_hash_key; - -ALTER TABLE workflow_versions - ADD CONSTRAINT workflow_versions_workflow_name_version_key - UNIQUE (workflow_name, workflow_version); diff --git a/crates/waymark/migrations/0005_runner_instances_workflow_version_id.sql b/crates/waymark/migrations/0005_runner_instances_workflow_version_id.sql deleted file mode 100644 index 6d09937b..00000000 --- a/crates/waymark/migrations/0005_runner_instances_workflow_version_id.sql +++ /dev/null @@ -1,7 +0,0 @@ --- Persist workflow version on instances so webapp can show workflow names. 
- -ALTER TABLE runner_instances - ADD COLUMN workflow_version_id UUID; - -CREATE INDEX IF NOT EXISTS idx_runner_instances_workflow_version_id - ON runner_instances(workflow_version_id); diff --git a/crates/waymark/migrations/0006_drop_unused_runner_tables.sql b/crates/waymark/migrations/0006_drop_unused_runner_tables.sql deleted file mode 100644 index d3b1f272..00000000 --- a/crates/waymark/migrations/0006_drop_unused_runner_tables.sql +++ /dev/null @@ -1,4 +0,0 @@ --- Remove legacy tables no longer used by runtime or webapp. - -DROP TABLE IF EXISTS runner_graph_updates; -DROP TABLE IF EXISTS runner_instances_done; diff --git a/crates/waymark/migrations/0007_runner_instances_schedule_id.sql b/crates/waymark/migrations/0007_runner_instances_schedule_id.sql deleted file mode 100644 index 06cb1385..00000000 --- a/crates/waymark/migrations/0007_runner_instances_schedule_id.sql +++ /dev/null @@ -1,5 +0,0 @@ -ALTER TABLE runner_instances -ADD COLUMN IF NOT EXISTS schedule_id UUID; - -CREATE INDEX IF NOT EXISTS idx_runner_instances_schedule_id_created_at - ON runner_instances(schedule_id, created_at DESC); diff --git a/crates/waymark/migrations/0008_runner_actions_done_timing.sql b/crates/waymark/migrations/0008_runner_actions_done_timing.sql deleted file mode 100644 index b1b5551d..00000000 --- a/crates/waymark/migrations/0008_runner_actions_done_timing.sql +++ /dev/null @@ -1,14 +0,0 @@ --- Persist per-attempt lifecycle metadata for action history and timeline rendering. 
- -ALTER TABLE runner_actions_done - ADD COLUMN status TEXT, - ADD COLUMN started_at TIMESTAMPTZ, - ADD COLUMN completed_at TIMESTAMPTZ, - ADD COLUMN duration_ms BIGINT; - -ALTER TABLE runner_actions_done - ADD CONSTRAINT runner_actions_done_status_check - CHECK (status IS NULL OR status IN ('completed', 'failed', 'timed_out')); - -CREATE INDEX idx_runner_actions_done_execution_attempt - ON runner_actions_done (execution_id, attempt); diff --git a/crates/waymark/migrations/0009_instance_search_columns.sql b/crates/waymark/migrations/0009_instance_search_columns.sql deleted file mode 100644 index 948c6aca..00000000 --- a/crates/waymark/migrations/0009_instance_search_columns.sql +++ /dev/null @@ -1,63 +0,0 @@ --- Persist workflow/status instance metadata for indexed search in webapp queries. - -ALTER TABLE runner_instances - ADD COLUMN IF NOT EXISTS workflow_name TEXT, - ADD COLUMN IF NOT EXISTS current_status TEXT; - -ALTER TABLE queued_instances - ADD COLUMN IF NOT EXISTS workflow_name TEXT, - ADD COLUMN IF NOT EXISTS current_status TEXT; - -UPDATE runner_instances AS ri -SET workflow_name = wv.workflow_name -FROM workflow_versions wv -WHERE ri.workflow_name IS NULL - AND ri.workflow_version_id = wv.id; - -UPDATE runner_instances -SET current_status = CASE - WHEN error IS NOT NULL THEN 'failed' - WHEN result IS NOT NULL THEN 'completed' - WHEN state IS NOT NULL THEN 'running' - ELSE 'queued' -END -WHERE current_status IS NULL; - -UPDATE queued_instances AS qi -SET workflow_name = ri.workflow_name -FROM runner_instances ri -WHERE qi.workflow_name IS NULL - AND qi.instance_id = ri.instance_id; - -UPDATE queued_instances -SET current_status = CASE - WHEN lock_uuid IS NULL THEN 'queued' - ELSE 'running' -END -WHERE current_status IS NULL; - -ALTER TABLE runner_instances - ADD CONSTRAINT runner_instances_current_status_check - CHECK ( - current_status IS NULL - OR current_status IN ('queued', 'running', 'completed', 'failed') - ); - -ALTER TABLE queued_instances - ADD 
CONSTRAINT queued_instances_current_status_check - CHECK ( - current_status IS NULL - OR current_status IN ('queued', 'running') - ); - -CREATE INDEX IF NOT EXISTS idx_runner_instances_workflow_name - ON runner_instances(workflow_name); - -CREATE INDEX IF NOT EXISTS idx_runner_instances_current_status - ON runner_instances(current_status); - -CREATE INDEX IF NOT EXISTS idx_queued_instances_workflow_name - ON queued_instances(workflow_name); - -CREATE INDEX IF NOT EXISTS idx_queued_instances_current_status - ON queued_instances(current_status); diff --git a/crates/waymark/src/backends/base.rs b/crates/waymark/src/backends/base.rs deleted file mode 100644 index 92c17a3f..00000000 --- a/crates/waymark/src/backends/base.rs +++ /dev/null @@ -1,366 +0,0 @@ -//! Backend interfaces for persisting runner state and action results. - -use std::collections::{HashMap, HashSet}; -use std::sync::Arc; - -use chrono::{DateTime, Utc}; -use serde::{Deserialize, Deserializer, Serialize}; -use serde_json::Value; -use tonic::async_trait; -use uuid::Uuid; - -use crate::scheduler::{CreateScheduleParams, ScheduleId, WorkflowSchedule}; -use crate::waymark_core::runner::state::{ExecutionEdge, ExecutionNode, NodeStatus, RunnerState}; -use crate::webapp::{ - ExecutionGraphView, InstanceDetail, InstanceSummary, ScheduleDetail, ScheduleInvocationSummary, - ScheduleSummary, TimelineEntry, WorkerActionRow, WorkerAggregateStats, WorkerStatus, -}; -use waymark_dag::DAG; - -#[derive(Debug, thiserror::Error)] -pub enum BackendError { - #[error("{0}")] - Message(String), - #[error(transparent)] - Sqlx(#[from] sqlx::Error), - #[error(transparent)] - Serialization(#[from] serde_json::Error), -} - -pub type BackendResult = Result; - -fn default_instance_id() -> Uuid { - Uuid::new_v4() -} - -fn default_action_results() -> HashMap { - HashMap::new() -} - -fn deserialize_action_results<'de, D>(deserializer: D) -> Result, D::Error> -where - D: Deserializer<'de>, -{ - let value = 
Option::>::deserialize(deserializer)?; - Ok(value.unwrap_or_default()) -} - -// The models that we use for our backends are similar to the ones that we -// have specified in our database/Postgres backend, but not 1:1. It's better for -// us to internally convert within the given backend - -#[derive(Clone, Debug, Serialize, Deserialize)] -/// Queued instance payload for the run loop. -pub struct QueuedInstance { - pub workflow_version_id: Uuid, - #[serde(default)] - pub schedule_id: Option, - #[serde(skip, default)] - pub dag: Option>, - pub entry_node: Uuid, - pub state: Option, - #[serde( - default = "default_action_results", - deserialize_with = "deserialize_action_results" - )] - pub action_results: HashMap, - #[serde(default = "default_instance_id")] - pub instance_id: Uuid, - #[serde(default)] - pub scheduled_at: Option>, -} - -#[derive(Clone, Debug)] -/// Result payload for queued instance polling. -pub struct QueuedInstanceBatch { - pub instances: Vec, -} - -#[derive(Clone, Debug)] -/// Lock claim settings for owned instances. -pub struct LockClaim { - pub lock_uuid: Uuid, - pub lock_expires_at: DateTime, -} - -#[derive(Clone, Debug)] -/// Current lock status for an instance. -pub struct InstanceLockStatus { - pub instance_id: Uuid, - pub lock_uuid: Option, - pub lock_expires_at: Option>, -} - -#[derive(Clone, Debug, Serialize, Deserialize)] -/// Completed instance payload with result or exception. -pub struct InstanceDone { - pub executor_id: Uuid, - pub entry_node: Uuid, - pub result: Option, - pub error: Option, -} - -#[derive(Clone, Debug, Serialize, Deserialize)] -/// Batch payload representing an updated execution graph snapshot. -/// -/// This intentionally stores only runtime nodes and edges (no DAG template or -/// derived caches) so persistence stays lightweight. 
-pub struct GraphUpdate { - pub instance_id: Uuid, - pub nodes: HashMap, - pub edges: HashSet, -} - -impl GraphUpdate { - pub fn from_state(instance_id: Uuid, state: &RunnerState) -> Self { - Self { - instance_id, - nodes: state.nodes.clone(), - edges: state.edges.clone(), - } - } - - pub fn next_scheduled_at(&self) -> DateTime { - let mut next: Option> = None; - for node in self.nodes.values() { - if matches!(node.status, NodeStatus::Completed | NodeStatus::Failed) { - continue; - } - if let Some(scheduled_at) = node.scheduled_at { - next = Some(match next { - Some(existing) => existing.min(scheduled_at), - None => scheduled_at, - }); - } - } - next.unwrap_or_else(Utc::now) - } -} - -#[derive(Clone, Debug, Serialize, Deserialize)] -/// Batch payload representing a finished action attempt (success or failure). -pub struct ActionDone { - pub execution_id: Uuid, - pub attempt: i32, - pub status: ActionAttemptStatus, - pub started_at: Option>, - pub completed_at: Option>, - pub duration_ms: Option, - pub result: Value, -} - -#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)] -#[serde(rename_all = "snake_case")] -pub enum ActionAttemptStatus { - Completed, - Failed, - TimedOut, -} - -impl std::fmt::Display for ActionAttemptStatus { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Self::Completed => write!(f, "completed"), - Self::Failed => write!(f, "failed"), - Self::TimedOut => write!(f, "timed_out"), - } - } -} - -/// Worker status update for persistence. 
-#[derive(Clone, Debug)] -pub struct WorkerStatusUpdate { - pub pool_id: Uuid, - pub throughput_per_min: f64, - pub total_completed: i64, - pub last_action_at: Option>, - pub median_dequeue_ms: Option, - pub median_handling_ms: Option, - pub dispatch_queue_size: i64, - pub total_in_flight: i64, - pub active_workers: i32, - pub actions_per_sec: f64, - pub median_instance_duration_secs: Option, - pub active_instance_count: i32, - pub total_instances_completed: i64, - pub instances_per_sec: f64, - pub instances_per_min: f64, - pub time_series: Option>, -} - -/// Backend capability for recording worker status metrics. -#[async_trait] -pub trait WorkerStatusBackend: Send + Sync { - async fn upsert_worker_status(&self, status: &WorkerStatusUpdate) -> BackendResult<()>; -} - -/// Abstract persistence backend for runner state. -#[async_trait] -pub trait CoreBackend: Send + Sync { - fn clone_box(&self) -> Box; - - /// Persist updated execution graphs. - async fn save_graphs( - &self, - claim: LockClaim, - graphs: &[GraphUpdate], - ) -> BackendResult>; - - /// Persist finished action attempts (success or failure). - async fn save_actions_done(&self, actions: &[ActionDone]) -> BackendResult<()>; - - /// Return up to size queued instances without blocking. - async fn get_queued_instances( - &self, - size: usize, - claim: LockClaim, - ) -> BackendResult; - - /// Refresh lock expiry for owned instances. - async fn refresh_instance_locks( - &self, - claim: LockClaim, - instance_ids: &[Uuid], - ) -> BackendResult>; - - /// Release instance locks when evicting from memory. - async fn release_instance_locks( - &self, - lock_uuid: Uuid, - instance_ids: &[Uuid], - ) -> BackendResult<()>; - - /// Persist completed workflow instances. - async fn save_instances_done(&self, instances: &[InstanceDone]) -> BackendResult<()>; - - /// Insert queued instances for run-loop consumption. 
- async fn queue_instances(&self, instances: &[QueuedInstance]) -> BackendResult<()>; -} - -/// Registration payload for storing workflow DAG metadata. -#[derive(Clone, Debug)] -pub struct WorkflowRegistration { - pub workflow_name: String, - pub workflow_version: String, - pub ir_hash: String, - pub program_proto: Vec, - pub concurrent: bool, -} - -#[derive(Clone, Debug)] -/// Stored workflow version metadata and IR payload. -pub struct WorkflowVersion { - pub id: Uuid, - pub workflow_name: String, - pub workflow_version: String, - pub ir_hash: String, - pub program_proto: Vec, - pub concurrent: bool, -} - -/// Backend capability for registering workflow DAGs. -#[async_trait] -pub trait WorkflowRegistryBackend: Send + Sync { - async fn upsert_workflow_version( - &self, - registration: &WorkflowRegistration, - ) -> BackendResult; - - async fn get_workflow_versions(&self, ids: &[Uuid]) -> BackendResult>; -} - -/// Backend capability for workflow schedule persistence. -#[async_trait] -pub trait SchedulerBackend: Send + Sync { - async fn upsert_schedule(&self, params: &CreateScheduleParams) -> BackendResult; - async fn get_schedule(&self, id: ScheduleId) -> BackendResult; - async fn get_schedule_by_name( - &self, - workflow_name: &str, - schedule_name: &str, - ) -> BackendResult>; - async fn list_schedules(&self, limit: i64, offset: i64) - -> BackendResult>; - async fn count_schedules(&self) -> BackendResult; - async fn update_schedule_status(&self, id: ScheduleId, status: &str) -> BackendResult; - async fn delete_schedule(&self, id: ScheduleId) -> BackendResult; - async fn find_due_schedules(&self, limit: i32) -> BackendResult>; - async fn has_running_instance(&self, schedule_id: ScheduleId) -> BackendResult; - async fn mark_schedule_executed( - &self, - schedule_id: ScheduleId, - instance_id: Uuid, - ) -> BackendResult<()>; - async fn skip_schedule_run(&self, schedule_id: ScheduleId) -> BackendResult<()>; -} - -#[derive(Clone, Copy, Debug, Default)] -/// Summary of 
a garbage collection sweep. -pub struct GarbageCollectionResult { - pub deleted_instances: usize, - pub deleted_actions: usize, -} - -/// Backend capability for deleting old finished workflow data. -#[async_trait] -pub trait GarbageCollectorBackend: Send + Sync { - async fn collect_done_instances( - &self, - older_than: DateTime, - limit: usize, - ) -> BackendResult; -} - -/// Backend capability for webapp-specific queries. -#[async_trait] -pub trait WebappBackend: Send + Sync { - async fn count_instances(&self, search: Option<&str>) -> BackendResult; - async fn list_instances( - &self, - search: Option<&str>, - limit: i64, - offset: i64, - ) -> BackendResult>; - async fn get_instance(&self, instance_id: Uuid) -> BackendResult; - async fn get_execution_graph( - &self, - instance_id: Uuid, - ) -> BackendResult>; - async fn get_workflow_graph( - &self, - instance_id: Uuid, - ) -> BackendResult>; - async fn get_action_results(&self, instance_id: Uuid) -> BackendResult>; - async fn get_distinct_workflows(&self) -> BackendResult>; - async fn get_distinct_statuses(&self) -> BackendResult>; - async fn count_schedules(&self) -> BackendResult; - async fn list_schedules(&self, limit: i64, offset: i64) -> BackendResult>; - async fn get_schedule(&self, schedule_id: Uuid) -> BackendResult; - async fn count_schedule_invocations(&self, schedule_id: Uuid) -> BackendResult; - async fn list_schedule_invocations( - &self, - schedule_id: Uuid, - limit: i64, - offset: i64, - ) -> BackendResult>; - async fn update_schedule_status(&self, schedule_id: Uuid, status: &str) -> BackendResult; - async fn get_distinct_schedule_statuses(&self) -> BackendResult>; - async fn get_distinct_schedule_types(&self) -> BackendResult>; - async fn get_worker_action_stats( - &self, - window_minutes: i64, - ) -> BackendResult>; - async fn get_worker_aggregate_stats( - &self, - window_minutes: i64, - ) -> BackendResult; - async fn worker_status_table_exists(&self) -> bool; - async fn 
schedules_table_exists(&self) -> bool; - async fn get_worker_statuses(&self, window_minutes: i64) -> BackendResult>; -} - -impl Clone for Box { - fn clone(&self) -> Self { - self.clone_box() - } -} diff --git a/crates/waymark/src/backends/memory.rs b/crates/waymark/src/backends/memory.rs deleted file mode 100644 index c49bc6e0..00000000 --- a/crates/waymark/src/backends/memory.rs +++ /dev/null @@ -1,814 +0,0 @@ -//! In-memory backend that prints persistence operations. - -use std::collections::{HashMap, VecDeque}; -use std::sync::{Arc, Mutex}; - -use chrono::{DateTime, Utc}; -use uuid::Uuid; - -use super::base::{ - ActionDone, BackendError, BackendResult, CoreBackend, GarbageCollectionResult, - GarbageCollectorBackend, GraphUpdate, InstanceDone, InstanceLockStatus, LockClaim, - QueuedInstance, QueuedInstanceBatch, SchedulerBackend, WebappBackend, WorkerStatusBackend, - WorkerStatusUpdate, WorkflowRegistration, WorkflowRegistryBackend, WorkflowVersion, -}; -use crate::scheduler::compute_next_run; -use crate::scheduler::{CreateScheduleParams, ScheduleId, ScheduleType, WorkflowSchedule}; -use crate::webapp::{ - ExecutionGraphView, InstanceDetail, InstanceStatus, InstanceSummary, ScheduleDetail, - ScheduleInvocationSummary, ScheduleSummary, TimelineEntry, WorkerActionRow, - WorkerAggregateStats, WorkerStatus, -}; -use tonic::async_trait; - -type WorkflowVersionKey = (String, String); -type WorkflowVersionValue = (Uuid, WorkflowRegistration); -type WorkflowVersionStore = HashMap; -type InstanceLockStore = HashMap, Option>)>; - -/// Backend that stores updates in memory for tests or local runs. 
-#[derive(Clone)] -pub struct MemoryBackend { - instance_queue: Option>>>, - graph_updates: Arc>>, - actions_done: Arc>>, - instances_done: Arc>>, - worker_status_updates: Arc>>, - workflow_versions: Arc>, - schedules: Arc>>, - instance_locks: Arc>, -} - -impl Default for MemoryBackend { - fn default() -> Self { - Self { - instance_queue: None, - graph_updates: Arc::new(Mutex::new(Vec::new())), - actions_done: Arc::new(Mutex::new(Vec::new())), - instances_done: Arc::new(Mutex::new(Vec::new())), - worker_status_updates: Arc::new(Mutex::new(Vec::new())), - workflow_versions: Arc::new(Mutex::new(HashMap::new())), - schedules: Arc::new(Mutex::new(HashMap::new())), - instance_locks: Arc::new(Mutex::new(HashMap::new())), - } - } -} - -impl MemoryBackend { - pub fn new() -> Self { - Self::default() - } - - pub fn with_queue(queue: Arc>>) -> Self { - Self { - instance_queue: Some(queue), - ..Self::default() - } - } - - pub fn instance_queue(&self) -> Option>>> { - self.instance_queue.clone() - } - - pub fn graph_updates(&self) -> Vec { - self.graph_updates - .lock() - .expect("graph updates poisoned") - .clone() - } - - pub fn actions_done(&self) -> Vec { - self.actions_done - .lock() - .expect("actions done poisoned") - .clone() - } - - pub fn instances_done(&self) -> Vec { - self.instances_done - .lock() - .expect("instances done poisoned") - .clone() - } - - pub fn worker_status_updates(&self) -> Vec { - self.worker_status_updates - .lock() - .expect("worker status updates poisoned") - .clone() - } -} - -#[async_trait] -impl CoreBackend for MemoryBackend { - fn clone_box(&self) -> Box { - Box::new(self.clone()) - } - - async fn save_graphs( - &self, - claim: LockClaim, - graphs: &[GraphUpdate], - ) -> BackendResult> { - let mut stored = self.graph_updates.lock().expect("graph updates poisoned"); - stored.extend(graphs.iter().cloned()); - let mut guard = self.instance_locks.lock().expect("instance locks poisoned"); - let mut locks = Vec::with_capacity(graphs.len()); - 
for graph in graphs { - if let Some((Some(lock_uuid), lock_expires_at)) = guard.get_mut(&graph.instance_id) - && *lock_uuid == claim.lock_uuid - && lock_expires_at.is_none_or(|expires_at| expires_at < claim.lock_expires_at) - { - *lock_expires_at = Some(claim.lock_expires_at); - } - let (lock_uuid, lock_expires_at) = guard - .get(&graph.instance_id) - .cloned() - .unwrap_or((None, None)); - locks.push(InstanceLockStatus { - instance_id: graph.instance_id, - lock_uuid, - lock_expires_at, - }); - } - Ok(locks) - } - - async fn save_actions_done(&self, actions: &[ActionDone]) -> BackendResult<()> { - let mut stored = self.actions_done.lock().expect("actions done poisoned"); - stored.extend(actions.iter().cloned()); - Ok(()) - } - - async fn save_instances_done(&self, instances: &[InstanceDone]) -> BackendResult<()> { - let mut stored = self.instances_done.lock().expect("instances done poisoned"); - stored.extend(instances.iter().cloned()); - if !instances.is_empty() { - let mut locks = self.instance_locks.lock().expect("instance locks poisoned"); - for instance in instances { - locks.remove(&instance.executor_id); - } - } - Ok(()) - } - - async fn get_queued_instances( - &self, - size: usize, - claim: LockClaim, - ) -> BackendResult { - if size == 0 { - return Ok(QueuedInstanceBatch { - instances: Vec::new(), - }); - } - let queue = match &self.instance_queue { - Some(queue) => queue, - None => { - return Ok(QueuedInstanceBatch { - instances: Vec::new(), - }); - } - }; - let mut guard = queue.lock().expect("instance queue poisoned"); - let now = Utc::now(); - let mut instances = Vec::new(); - while instances.len() < size { - let Some(instance) = guard.front() else { - break; - }; - if let Some(scheduled_at) = instance.scheduled_at - && scheduled_at > now - { - break; - } - let instance = guard.pop_front().expect("instance queue empty"); - instances.push(instance); - } - if !instances.is_empty() { - let mut locks = self.instance_locks.lock().expect("instance locks 
poisoned"); - for instance in &instances { - locks.insert( - instance.instance_id, - (Some(claim.lock_uuid), Some(claim.lock_expires_at)), - ); - } - } - Ok(QueuedInstanceBatch { instances }) - } - - async fn queue_instances(&self, instances: &[QueuedInstance]) -> BackendResult<()> { - if instances.is_empty() { - return Ok(()); - } - let queue = self.instance_queue.as_ref().ok_or_else(|| { - BackendError::Message("memory backend missing instance queue".to_string()) - })?; - let mut guard = queue.lock().expect("instance queue poisoned"); - for instance in instances { - guard.push_back(instance.clone()); - } - Ok(()) - } - - async fn refresh_instance_locks( - &self, - claim: LockClaim, - instance_ids: &[Uuid], - ) -> BackendResult> { - let mut guard = self.instance_locks.lock().expect("instance locks poisoned"); - let mut locks = Vec::new(); - for instance_id in instance_ids { - let entry = guard - .entry(*instance_id) - .or_insert((Some(claim.lock_uuid), Some(claim.lock_expires_at))); - if entry.0 == Some(claim.lock_uuid) { - entry.1 = Some(claim.lock_expires_at); - } - locks.push(InstanceLockStatus { - instance_id: *instance_id, - lock_uuid: entry.0, - lock_expires_at: entry.1, - }); - } - Ok(locks) - } - - async fn release_instance_locks( - &self, - lock_uuid: Uuid, - instance_ids: &[Uuid], - ) -> BackendResult<()> { - let mut guard = self.instance_locks.lock().expect("instance locks poisoned"); - for instance_id in instance_ids { - if let Some((current_lock, _)) = guard.get(instance_id) - && *current_lock == Some(lock_uuid) - { - guard.remove(instance_id); - } - } - Ok(()) - } -} - -#[async_trait] -impl WorkerStatusBackend for MemoryBackend { - async fn upsert_worker_status(&self, status: &WorkerStatusUpdate) -> BackendResult<()> { - let mut stored = self - .worker_status_updates - .lock() - .expect("worker status updates poisoned"); - stored.push(status.clone()); - Ok(()) - } -} - -#[async_trait] -impl WorkflowRegistryBackend for MemoryBackend { - async fn 
upsert_workflow_version( - &self, - registration: &WorkflowRegistration, - ) -> BackendResult { - let mut guard = self - .workflow_versions - .lock() - .expect("workflow versions poisoned"); - let key = ( - registration.workflow_name.clone(), - registration.workflow_version.clone(), - ); - if let Some((id, existing)) = guard.get(&key) { - if existing.ir_hash != registration.ir_hash { - return Err(BackendError::Message(format!( - "workflow version already exists with different IR hash: {}@{}", - registration.workflow_name, registration.workflow_version - ))); - } - return Ok(*id); - } - - let id = Uuid::new_v4(); - guard.insert(key, (id, registration.clone())); - Ok(id) - } - - async fn get_workflow_versions(&self, ids: &[Uuid]) -> BackendResult> { - if ids.is_empty() { - return Ok(Vec::new()); - } - let guard = self - .workflow_versions - .lock() - .expect("workflow versions poisoned"); - let mut versions = Vec::new(); - for (id, registration) in guard.values() { - if ids.contains(id) { - versions.push(WorkflowVersion { - id: *id, - workflow_name: registration.workflow_name.clone(), - workflow_version: registration.workflow_version.clone(), - ir_hash: registration.ir_hash.clone(), - program_proto: registration.program_proto.clone(), - concurrent: registration.concurrent, - }); - } - } - Ok(versions) - } -} - -#[async_trait] -impl SchedulerBackend for MemoryBackend { - async fn upsert_schedule(&self, params: &CreateScheduleParams) -> BackendResult { - let mut guard = self.schedules.lock().expect("schedules poisoned"); - let existing_schedule = guard.iter().find_map(|(id, schedule)| { - if schedule.workflow_name == params.workflow_name - && schedule.schedule_name == params.schedule_name - { - Some((*id, schedule.clone())) - } else { - None - } - }); - let schedule_id = existing_schedule - .as_ref() - .map(|(id, _)| *id) - .unwrap_or_else(ScheduleId::new); - let now = Utc::now(); - let next_run_at = match existing_schedule - .as_ref() - .and_then(|(_, schedule)| 
schedule.next_run_at) - { - Some(next_run_at) => Some(next_run_at), - None => Some( - compute_next_run( - params.schedule_type, - params.cron_expression.as_deref(), - params.interval_seconds, - params.jitter_seconds, - None, - ) - .map_err(BackendError::Message)?, - ), - }; - let schedule = WorkflowSchedule { - id: schedule_id.0, - workflow_name: params.workflow_name.clone(), - schedule_name: params.schedule_name.clone(), - schedule_type: params.schedule_type.as_str().to_string(), - cron_expression: params.cron_expression.clone(), - interval_seconds: params.interval_seconds, - jitter_seconds: params.jitter_seconds, - input_payload: params.input_payload.clone(), - status: "active".to_string(), - next_run_at, - last_run_at: existing_schedule - .as_ref() - .and_then(|(_, schedule)| schedule.last_run_at), - last_instance_id: existing_schedule - .as_ref() - .and_then(|(_, schedule)| schedule.last_instance_id), - created_at: existing_schedule - .as_ref() - .map(|(_, schedule)| schedule.created_at) - .unwrap_or(now), - updated_at: now, - priority: params.priority, - allow_duplicate: params.allow_duplicate, - }; - guard.insert(schedule_id, schedule); - Ok(schedule_id) - } - - async fn get_schedule(&self, id: ScheduleId) -> BackendResult { - let guard = self.schedules.lock().expect("schedules poisoned"); - guard - .get(&id) - .cloned() - .ok_or_else(|| BackendError::Message(format!("schedule not found: {id}"))) - } - - async fn get_schedule_by_name( - &self, - workflow_name: &str, - schedule_name: &str, - ) -> BackendResult> { - let guard = self.schedules.lock().expect("schedules poisoned"); - Ok(guard - .values() - .find(|schedule| { - schedule.workflow_name == workflow_name - && schedule.schedule_name == schedule_name - && schedule.status != "deleted" - }) - .cloned()) - } - - async fn list_schedules( - &self, - limit: i64, - offset: i64, - ) -> BackendResult> { - let guard = self.schedules.lock().expect("schedules poisoned"); - let mut schedules: Vec<_> = guard - 
.values() - .filter(|schedule| schedule.status != "deleted") - .cloned() - .collect(); - schedules.sort_by(|a, b| { - (&a.workflow_name, &a.schedule_name).cmp(&(&b.workflow_name, &b.schedule_name)) - }); - let start = offset.max(0) as usize; - let end = start.saturating_add(limit.max(0) as usize); - Ok(schedules - .into_iter() - .skip(start) - .take(end - start) - .collect()) - } - - async fn count_schedules(&self) -> BackendResult { - let guard = self.schedules.lock().expect("schedules poisoned"); - Ok(guard - .values() - .filter(|schedule| schedule.status != "deleted") - .count() as i64) - } - - async fn update_schedule_status(&self, id: ScheduleId, status: &str) -> BackendResult { - let mut guard = self.schedules.lock().expect("schedules poisoned"); - if let Some(schedule) = guard.get_mut(&id) { - schedule.status = status.to_string(); - schedule.updated_at = Utc::now(); - Ok(true) - } else { - Ok(false) - } - } - - async fn delete_schedule(&self, id: ScheduleId) -> BackendResult { - SchedulerBackend::update_schedule_status(self, id, "deleted").await - } - - async fn find_due_schedules(&self, limit: i32) -> BackendResult> { - let guard = self.schedules.lock().expect("schedules poisoned"); - let now = Utc::now(); - let mut schedules: Vec<_> = guard - .values() - .filter(|schedule| { - schedule.status == "active" - && schedule - .next_run_at - .map(|next| next <= now) - .unwrap_or(false) - }) - .cloned() - .collect(); - schedules.sort_by_key(|schedule| schedule.next_run_at); - Ok(schedules.into_iter().take(limit as usize).collect()) - } - - async fn has_running_instance(&self, _schedule_id: ScheduleId) -> BackendResult { - Ok(false) - } - - async fn mark_schedule_executed( - &self, - schedule_id: ScheduleId, - instance_id: Uuid, - ) -> BackendResult<()> { - let mut guard = self.schedules.lock().expect("schedules poisoned"); - let schedule = guard - .get_mut(&schedule_id) - .ok_or_else(|| BackendError::Message(format!("schedule not found: {schedule_id}")))?; - let 
schedule_type = ScheduleType::parse(&schedule.schedule_type) - .ok_or_else(|| BackendError::Message("invalid schedule type".to_string()))?; - let next_run_at = compute_next_run( - schedule_type, - schedule.cron_expression.as_deref(), - schedule.interval_seconds, - schedule.jitter_seconds, - Some(Utc::now()), - ) - .map_err(BackendError::Message)?; - schedule.last_run_at = Some(Utc::now()); - schedule.last_instance_id = Some(instance_id); - schedule.next_run_at = Some(next_run_at); - schedule.updated_at = Utc::now(); - Ok(()) - } - - async fn skip_schedule_run(&self, schedule_id: ScheduleId) -> BackendResult<()> { - let mut guard = self.schedules.lock().expect("schedules poisoned"); - let schedule = guard - .get_mut(&schedule_id) - .ok_or_else(|| BackendError::Message(format!("schedule not found: {schedule_id}")))?; - let schedule_type = ScheduleType::parse(&schedule.schedule_type) - .ok_or_else(|| BackendError::Message("invalid schedule type".to_string()))?; - let next_run_at = compute_next_run( - schedule_type, - schedule.cron_expression.as_deref(), - schedule.interval_seconds, - schedule.jitter_seconds, - Some(Utc::now()), - ) - .map_err(BackendError::Message)?; - schedule.next_run_at = Some(next_run_at); - schedule.updated_at = Utc::now(); - Ok(()) - } -} - -#[async_trait] -impl GarbageCollectorBackend for MemoryBackend { - async fn collect_done_instances( - &self, - _older_than: DateTime, - _limit: usize, - ) -> BackendResult { - Ok(GarbageCollectionResult::default()) - } -} - -#[async_trait] -impl WebappBackend for MemoryBackend { - async fn count_instances(&self, _search: Option<&str>) -> BackendResult { - Ok(0) - } - - async fn list_instances( - &self, - _search: Option<&str>, - _limit: i64, - _offset: i64, - ) -> BackendResult> { - Ok(Vec::new()) - } - - async fn get_instance(&self, instance_id: Uuid) -> BackendResult { - Err(BackendError::Message(format!( - "instance not found: {instance_id}" - ))) - } - - async fn get_execution_graph( - &self, - 
_instance_id: Uuid, - ) -> BackendResult> { - Ok(None) - } - - async fn get_workflow_graph( - &self, - _instance_id: Uuid, - ) -> BackendResult> { - Ok(None) - } - - async fn get_action_results(&self, _instance_id: Uuid) -> BackendResult> { - Ok(Vec::new()) - } - - async fn get_distinct_workflows(&self) -> BackendResult> { - Ok(Vec::new()) - } - - async fn get_distinct_statuses(&self) -> BackendResult> { - Ok(vec![ - InstanceStatus::Queued.to_string(), - InstanceStatus::Running.to_string(), - InstanceStatus::Completed.to_string(), - InstanceStatus::Failed.to_string(), - ]) - } - - async fn count_schedules(&self) -> BackendResult { - let guard = self.schedules.lock().expect("schedules poisoned"); - Ok(guard - .values() - .filter(|schedule| schedule.status != "deleted") - .count() as i64) - } - - async fn list_schedules(&self, limit: i64, offset: i64) -> BackendResult> { - let guard = self.schedules.lock().expect("schedules poisoned"); - let mut schedules: Vec<_> = guard - .values() - .filter(|schedule| schedule.status != "deleted") - .cloned() - .collect(); - schedules.sort_by(|a, b| { - (&a.workflow_name, &a.schedule_name).cmp(&(&b.workflow_name, &b.schedule_name)) - }); - - let start = offset.max(0) as usize; - let page_limit = limit.max(0) as usize; - Ok(schedules - .into_iter() - .skip(start) - .take(page_limit) - .map(|schedule| ScheduleSummary { - id: schedule.id.to_string(), - workflow_name: schedule.workflow_name, - schedule_name: schedule.schedule_name, - schedule_type: schedule.schedule_type, - cron_expression: schedule.cron_expression, - interval_seconds: schedule.interval_seconds, - status: schedule.status, - next_run_at: schedule.next_run_at.map(|dt| dt.to_rfc3339()), - last_run_at: schedule.last_run_at.map(|dt| dt.to_rfc3339()), - created_at: schedule.created_at.to_rfc3339(), - }) - .collect()) - } - - async fn get_schedule(&self, schedule_id: Uuid) -> BackendResult { - let guard = self.schedules.lock().expect("schedules poisoned"); - let schedule = 
guard - .values() - .find(|schedule| schedule.id == schedule_id) - .cloned() - .ok_or_else(|| BackendError::Message(format!("schedule not found: {schedule_id}")))?; - - let input_payload = schedule.input_payload.as_ref().and_then(|bytes| { - rmp_serde::from_slice::(bytes) - .ok() - .and_then(|value| serde_json::to_string_pretty(&value).ok()) - }); - - Ok(ScheduleDetail { - id: schedule.id.to_string(), - workflow_name: schedule.workflow_name, - schedule_name: schedule.schedule_name, - schedule_type: schedule.schedule_type, - cron_expression: schedule.cron_expression, - interval_seconds: schedule.interval_seconds, - jitter_seconds: schedule.jitter_seconds, - status: schedule.status, - next_run_at: schedule.next_run_at.map(|dt| dt.to_rfc3339()), - last_run_at: schedule.last_run_at.map(|dt| dt.to_rfc3339()), - last_instance_id: schedule.last_instance_id.map(|id| id.to_string()), - created_at: schedule.created_at.to_rfc3339(), - updated_at: schedule.updated_at.to_rfc3339(), - priority: schedule.priority, - allow_duplicate: schedule.allow_duplicate, - input_payload, - }) - } - - async fn count_schedule_invocations(&self, _schedule_id: Uuid) -> BackendResult { - Ok(0) - } - - async fn list_schedule_invocations( - &self, - _schedule_id: Uuid, - _limit: i64, - _offset: i64, - ) -> BackendResult> { - Ok(Vec::new()) - } - - async fn update_schedule_status(&self, schedule_id: Uuid, status: &str) -> BackendResult { - let mut guard = self.schedules.lock().expect("schedules poisoned"); - let Some(schedule) = guard - .values_mut() - .find(|schedule| schedule.id == schedule_id) - else { - return Ok(false); - }; - schedule.status = status.to_string(); - schedule.updated_at = Utc::now(); - Ok(true) - } - - async fn get_distinct_schedule_statuses(&self) -> BackendResult> { - Ok(vec!["active".to_string(), "paused".to_string()]) - } - - async fn get_distinct_schedule_types(&self) -> BackendResult> { - Ok(vec!["cron".to_string(), "interval".to_string()]) - } - - async fn 
get_worker_action_stats( - &self, - _window_minutes: i64, - ) -> BackendResult> { - let statuses = latest_worker_statuses( - &self - .worker_status_updates - .lock() - .expect("worker status updates poisoned"), - ); - - Ok(statuses - .into_iter() - .map(|status| WorkerActionRow { - pool_id: status.pool_id.to_string(), - active_workers: status.active_workers as i64, - actions_per_sec: format!("{:.1}", status.actions_per_sec), - throughput_per_min: status.throughput_per_min as i64, - total_completed: status.total_completed, - median_dequeue_ms: status.median_dequeue_ms, - median_handling_ms: status.median_handling_ms, - last_action_at: status.last_action_at.map(|dt| dt.to_rfc3339()), - updated_at: status.updated_at.to_rfc3339(), - }) - .collect()) - } - - async fn get_worker_aggregate_stats( - &self, - _window_minutes: i64, - ) -> BackendResult { - let statuses = latest_worker_statuses( - &self - .worker_status_updates - .lock() - .expect("worker status updates poisoned"), - ); - - let active_worker_count = statuses - .iter() - .map(|status| status.active_workers as i64) - .sum(); - let total_in_flight = statuses - .iter() - .filter_map(|status| status.total_in_flight) - .sum(); - let total_queue_depth = statuses - .iter() - .filter_map(|status| status.dispatch_queue_size) - .sum(); - let actions_per_sec = statuses - .iter() - .map(|status| status.actions_per_sec) - .sum::(); - - Ok(WorkerAggregateStats { - active_worker_count, - actions_per_sec: format!("{:.1}", actions_per_sec), - total_in_flight, - total_queue_depth, - }) - } - - async fn worker_status_table_exists(&self) -> bool { - !self - .worker_status_updates - .lock() - .expect("worker status updates poisoned") - .is_empty() - } - - async fn schedules_table_exists(&self) -> bool { - !self - .schedules - .lock() - .expect("schedules poisoned") - .is_empty() - } - - async fn get_worker_statuses(&self, _window_minutes: i64) -> BackendResult> { - Ok(latest_worker_statuses( - &self - .worker_status_updates - 
.lock() - .expect("worker status updates poisoned"), - )) - } -} - -fn latest_worker_statuses(updates: &[WorkerStatusUpdate]) -> Vec { - let mut by_pool: HashMap = HashMap::new(); - for update in updates { - by_pool.insert(update.pool_id, update.clone()); - } - - let now = Utc::now(); - let mut statuses: Vec<_> = by_pool - .into_values() - .map(|status| WorkerStatus { - pool_id: status.pool_id, - active_workers: status.active_workers, - throughput_per_min: status.throughput_per_min, - actions_per_sec: status.actions_per_sec, - total_completed: status.total_completed, - last_action_at: status.last_action_at, - updated_at: now, - median_dequeue_ms: status.median_dequeue_ms, - median_handling_ms: status.median_handling_ms, - dispatch_queue_size: Some(status.dispatch_queue_size), - total_in_flight: Some(status.total_in_flight), - median_instance_duration_secs: status.median_instance_duration_secs, - active_instance_count: status.active_instance_count, - total_instances_completed: status.total_instances_completed, - instances_per_sec: status.instances_per_sec, - instances_per_min: status.instances_per_min, - time_series: status.time_series, - }) - .collect(); - - statuses.sort_by(|left, right| right.actions_per_sec.total_cmp(&left.actions_per_sec)); - statuses -} diff --git a/crates/waymark/src/backends/mod.rs b/crates/waymark/src/backends/mod.rs deleted file mode 100644 index 7fbd84ad..00000000 --- a/crates/waymark/src/backends/mod.rs +++ /dev/null @@ -1,15 +0,0 @@ -//! Backend implementations for runner persistence. 
- -mod base; -mod memory; -mod postgres; - -pub use base::{ - ActionAttemptStatus, ActionDone, BackendError, BackendResult, CoreBackend, - GarbageCollectionResult, GarbageCollectorBackend, GraphUpdate, InstanceDone, - InstanceLockStatus, LockClaim, QueuedInstance, QueuedInstanceBatch, SchedulerBackend, - WebappBackend, WorkerStatusBackend, WorkerStatusUpdate, WorkflowRegistration, - WorkflowRegistryBackend, WorkflowVersion, -}; -pub use memory::MemoryBackend; -pub use postgres::PostgresBackend; diff --git a/crates/waymark/src/backends/postgres/core.rs b/crates/waymark/src/backends/postgres/core.rs deleted file mode 100644 index c827b4c0..00000000 --- a/crates/waymark/src/backends/postgres/core.rs +++ /dev/null @@ -1,1992 +0,0 @@ -use std::collections::HashMap; -use std::future::Future; -use std::time::Duration as StdDuration; - -use chrono::{DateTime, Utc}; -use sqlx::{Postgres, QueryBuilder, Row}; -use tonic::async_trait; -use tracing::warn; -use uuid::Uuid; - -use super::PostgresBackend; -use crate::backends::base::{ - ActionDone, BackendError, BackendResult, CoreBackend, GarbageCollectionResult, - GarbageCollectorBackend, GraphUpdate, InstanceDone, InstanceLockStatus, LockClaim, - QueuedInstance, QueuedInstanceBatch, WorkerStatusBackend, WorkerStatusUpdate, -}; -use crate::observability::obs; -use crate::waymark_core::runner::state::RunnerState; - -const INSTANCE_STATUS_QUEUED: &str = "queued"; -const INSTANCE_STATUS_RUNNING: &str = "running"; -const INSTANCE_STATUS_COMPLETED: &str = "completed"; -const INSTANCE_STATUS_FAILED: &str = "failed"; -const TRANSIENT_DEADLOCK_SQLSTATE: &str = "40P01"; -const TRANSIENT_SERIALIZATION_SQLSTATE: &str = "40001"; -const TRANSIENT_RETRY_MAX_ATTEMPTS: usize = 3; -const TRANSIENT_RETRY_INITIAL_BACKOFF_MS: u64 = 25; -const TRANSIENT_RETRY_MAX_BACKOFF_MS: u64 = 250; - -fn instance_result_is_error_wrapper(result: &serde_json::Value) -> bool { - let serde_json::Value::Object(map) = result else { - return false; - }; - map.len() == 
1 - && (map.contains_key("error") - || map.contains_key("__exception__") - || map.contains_key("exception")) -} - -fn instance_done_status(instance: &InstanceDone) -> &'static str { - if instance.error.is_some() - || instance - .result - .as_ref() - .is_some_and(instance_result_is_error_wrapper) - { - INSTANCE_STATUS_FAILED - } else { - INSTANCE_STATUS_COMPLETED - } -} - -fn is_transient_sqlstate(code: &str) -> bool { - matches!( - code, - TRANSIENT_DEADLOCK_SQLSTATE | TRANSIENT_SERIALIZATION_SQLSTATE - ) -} - -fn is_transient_backend_error(err: &BackendError) -> bool { - match err { - BackendError::Sqlx(sqlx::Error::Database(db_err)) => { - db_err.code().as_deref().is_some_and(is_transient_sqlstate) - } - // Fallback for cases where sqlstate is not preserved in wrapping. - BackendError::Message(message) => { - message.contains("deadlock detected") - || message.contains("could not serialize access due to") - } - _ => false, - } -} - -async fn retry_transient_backend( - operation: &'static str, - mut op: Op, -) -> BackendResult -where - Op: FnMut() -> Fut, - Fut: Future>, -{ - let mut attempt = 0usize; - let mut backoff_ms = TRANSIENT_RETRY_INITIAL_BACKOFF_MS; - loop { - match op().await { - Ok(value) => return Ok(value), - Err(err) - if attempt < TRANSIENT_RETRY_MAX_ATTEMPTS && is_transient_backend_error(&err) => - { - attempt += 1; - warn!( - operation, - attempt, - error = %err, - "transient database error; retrying" - ); - tokio::time::sleep(StdDuration::from_millis(backoff_ms)).await; - backoff_ms = - std::cmp::min(backoff_ms.saturating_mul(2), TRANSIENT_RETRY_MAX_BACKOFF_MS); - } - Err(err) => return Err(err), - } - } -} - -impl PostgresBackend { - /// Insert queued instances for run-loop consumption. 
- #[obs] - pub async fn queue_instances(&self, instances: &[QueuedInstance]) -> BackendResult<()> { - if instances.is_empty() { - return Ok(()); - } - let workflow_version_ids: Vec = instances - .iter() - .map(|instance| instance.workflow_version_id) - .collect(); - let workflow_rows = - sqlx::query("SELECT id, workflow_name FROM workflow_versions WHERE id = ANY($1)") - .bind(&workflow_version_ids) - .fetch_all(&self.pool) - .await?; - let mut workflow_names_by_version_id: HashMap = - HashMap::with_capacity(workflow_rows.len()); - for row in workflow_rows { - workflow_names_by_version_id.insert(row.get("id"), row.get("workflow_name")); - } - - let mut queued_payloads = Vec::new(); - let mut runner_payloads = Vec::new(); - for instance in instances { - let state = instance.state.as_ref().ok_or_else(|| { - BackendError::Message("queued instance missing runner state".to_string()) - })?; - let scheduled_at = instance.scheduled_at.unwrap_or_else(Utc::now); - let workflow_name = workflow_names_by_version_id - .get(&instance.workflow_version_id) - .cloned(); - let mut payload_instance = instance.clone(); - payload_instance.scheduled_at = Some(scheduled_at); - queued_payloads.push(( - payload_instance.instance_id, - scheduled_at, - workflow_name.clone(), - INSTANCE_STATUS_QUEUED, - Self::serialize(&payload_instance)?, - )); - let graph = GraphUpdate::from_state(instance.instance_id, state); - runner_payloads.push(( - instance.instance_id, - instance.entry_node, - instance.workflow_version_id, - instance.schedule_id, - workflow_name, - INSTANCE_STATUS_QUEUED, - Self::serialize(&graph)?, - )); - } - - let mut queued_builder: QueryBuilder = QueryBuilder::new( - "INSERT INTO queued_instances (instance_id, scheduled_at, workflow_name, current_status, payload) ", - ); - queued_builder.push_values( - queued_payloads.iter(), - |mut builder, (id, scheduled_at, workflow_name, current_status, payload)| { - builder - .push_bind(*id) - .push_bind(*scheduled_at) - 
.push_bind(workflow_name.as_deref()) - .push_bind(*current_status) - .push_bind(payload.as_slice()); - }, - ); - - let mut runner_builder: QueryBuilder = QueryBuilder::new( - "INSERT INTO runner_instances (instance_id, entry_node, workflow_version_id, schedule_id, workflow_name, current_status, state) ", - ); - runner_builder.push_values( - runner_payloads.iter(), - |mut builder, - ( - id, - entry, - workflow_version_id, - schedule_id, - workflow_name, - current_status, - payload, - )| { - builder - .push_bind(*id) - .push_bind(*entry) - .push_bind(*workflow_version_id) - .push_bind(*schedule_id) - .push_bind(workflow_name.as_deref()) - .push_bind(*current_status) - .push_bind(payload.as_slice()); - }, - ); - - let mut tx = self.pool.begin().await?; - Self::count_query(&self.query_counts, "insert:queued_instances"); - Self::count_batch_size( - &self.batch_size_counts, - "insert:queued_instances", - instances.len(), - ); - queued_builder.build().execute(&mut *tx).await?; - Self::count_query(&self.query_counts, "insert:runner_instances"); - Self::count_batch_size( - &self.batch_size_counts, - "insert:runner_instances", - instances.len(), - ); - runner_builder.build().execute(&mut *tx).await?; - tx.commit().await?; - Ok(()) - } - - /// Upsert worker status for monitoring and activity graphs. 
- #[obs] - pub async fn upsert_worker_status(&self, status: &WorkerStatusUpdate) -> BackendResult<()> { - Self::count_query(&self.query_counts, "upsert:worker_status"); - sqlx::query( - r#" - INSERT INTO worker_status ( - pool_id, - worker_id, - throughput_per_min, - total_completed, - last_action_at, - updated_at, - median_dequeue_ms, - median_handling_ms, - dispatch_queue_size, - total_in_flight, - active_workers, - actions_per_sec, - median_instance_duration_secs, - active_instance_count, - total_instances_completed, - instances_per_sec, - instances_per_min, - time_series - ) - VALUES ($1, 0, $2, $3, $4, NOW(), $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16) - ON CONFLICT (pool_id, worker_id) - DO UPDATE SET - throughput_per_min = EXCLUDED.throughput_per_min, - total_completed = EXCLUDED.total_completed, - last_action_at = EXCLUDED.last_action_at, - updated_at = EXCLUDED.updated_at, - median_dequeue_ms = EXCLUDED.median_dequeue_ms, - median_handling_ms = EXCLUDED.median_handling_ms, - dispatch_queue_size = EXCLUDED.dispatch_queue_size, - total_in_flight = EXCLUDED.total_in_flight, - active_workers = EXCLUDED.active_workers, - actions_per_sec = EXCLUDED.actions_per_sec, - median_instance_duration_secs = EXCLUDED.median_instance_duration_secs, - active_instance_count = EXCLUDED.active_instance_count, - total_instances_completed = EXCLUDED.total_instances_completed, - instances_per_sec = EXCLUDED.instances_per_sec, - instances_per_min = EXCLUDED.instances_per_min, - time_series = EXCLUDED.time_series - "#, - ) - .bind(status.pool_id) - .bind(status.throughput_per_min) - .bind(status.total_completed) - .bind(status.last_action_at) - .bind(status.median_dequeue_ms) - .bind(status.median_handling_ms) - .bind(status.dispatch_queue_size) - .bind(status.total_in_flight) - .bind(status.active_workers) - .bind(status.actions_per_sec) - .bind(status.median_instance_duration_secs) - .bind(status.active_instance_count) - .bind(status.total_instances_completed) - 
.bind(status.instances_per_sec) - .bind(status.instances_per_min) - .bind(&status.time_series) - .execute(&self.pool) - .await?; - - Ok(()) - } - - /// Clear expired queue locks so they can be claimed again by the runloop. - /// - /// This uses the same `FOR UPDATE SKIP LOCKED` claim pattern as dequeue to - /// avoid blocking under concurrent sweepers. - #[obs] - pub async fn reclaim_expired_instance_locks(&self, size: usize) -> BackendResult { - if size == 0 { - return Ok(0); - } - - let now = Utc::now(); - let mut tx = self.pool.begin().await?; - Self::count_query(&self.query_counts, "update:queued_instances_expired_unlock"); - let rows = sqlx::query( - r#" - WITH expired AS ( - SELECT instance_id - FROM queued_instances - WHERE lock_uuid IS NOT NULL - AND lock_expires_at <= $1 - ORDER BY lock_expires_at, scheduled_at, created_at - LIMIT $2 - FOR UPDATE SKIP LOCKED - ) - UPDATE queued_instances AS qi - SET lock_uuid = NULL, - lock_expires_at = NULL - FROM expired - WHERE qi.instance_id = expired.instance_id - RETURNING qi.instance_id - "#, - ) - .bind(now) - .bind(size as i64) - .fetch_all(&mut *tx) - .await?; - - if !rows.is_empty() { - let instance_ids: Vec = rows.iter().map(|row| row.get("instance_id")).collect(); - sqlx::query( - "UPDATE runner_instances SET current_status = $2 WHERE instance_id = ANY($1) AND result IS NULL AND error IS NULL", - ) - .bind(&instance_ids) - .bind(INSTANCE_STATUS_QUEUED) - .execute(&mut *tx) - .await?; - } - - tx.commit().await?; - - if !rows.is_empty() { - Self::count_batch_size( - &self.batch_size_counts, - "update:queued_instances_expired_unlock", - rows.len(), - ); - } - - Ok(rows.len()) - } - - /// Delete old finished instances and their action attempt rows. 
- #[obs] - pub async fn collect_done_instances_impl( - &self, - older_than: DateTime, - limit: usize, - ) -> BackendResult { - if limit == 0 { - return Ok(GarbageCollectionResult::default()); - } - - let mut tx = self.pool.begin().await?; - Self::count_query(&self.query_counts, "select:runner_instances_gc_candidates"); - let candidate_rows = sqlx::query( - r#" - SELECT instance_id, state - FROM runner_instances - WHERE created_at < $1 - AND (result IS NOT NULL OR error IS NOT NULL) - ORDER BY created_at, instance_id - LIMIT $2 - FOR UPDATE SKIP LOCKED - "#, - ) - .bind(older_than) - .bind(limit as i64) - .fetch_all(&mut *tx) - .await?; - - if candidate_rows.is_empty() { - tx.commit().await?; - return Ok(GarbageCollectionResult::default()); - } - - let mut instance_ids = Vec::with_capacity(candidate_rows.len()); - let mut action_execution_ids = Vec::new(); - for row in candidate_rows { - let instance_id: Uuid = row.get("instance_id"); - let state_payload: Option> = row.get("state"); - instance_ids.push(instance_id); - - let Some(state_payload) = state_payload else { - continue; - }; - match Self::deserialize::(&state_payload) { - Ok(graph) => { - for (execution_id, node) in graph.nodes { - if node.is_action_call() { - action_execution_ids.push(execution_id); - } - } - } - Err(err) => { - warn!( - %instance_id, - error = %err, - "failed to decode runner state while collecting garbage" - ); - } - } - } - - action_execution_ids.sort_unstable(); - action_execution_ids.dedup(); - let deleted_actions = if action_execution_ids.is_empty() { - 0 - } else { - Self::count_query(&self.query_counts, "delete:runner_actions_done_gc"); - let result = - sqlx::query("DELETE FROM runner_actions_done WHERE execution_id = ANY($1)") - .bind(&action_execution_ids) - .execute(&mut *tx) - .await?; - let rows = result.rows_affected() as usize; - Self::count_batch_size( - &self.batch_size_counts, - "delete:runner_actions_done_gc", - rows, - ); - rows - }; - - 
Self::count_query(&self.query_counts, "delete:queued_instances_gc"); - let _ = sqlx::query("DELETE FROM queued_instances WHERE instance_id = ANY($1)") - .bind(&instance_ids) - .execute(&mut *tx) - .await?; - - Self::count_query(&self.query_counts, "delete:runner_instances_gc"); - let deleted_instances_result = - sqlx::query("DELETE FROM runner_instances WHERE instance_id = ANY($1)") - .bind(&instance_ids) - .execute(&mut *tx) - .await?; - let deleted_instances = deleted_instances_result.rows_affected() as usize; - Self::count_batch_size( - &self.batch_size_counts, - "delete:runner_instances_gc", - deleted_instances, - ); - tx.commit().await?; - - Ok(GarbageCollectionResult { - deleted_instances, - deleted_actions, - }) - } - - #[obs] - async fn save_graphs_impl( - &self, - claim: LockClaim, - graphs: &[GraphUpdate], - ) -> BackendResult> { - retry_transient_backend("save_graphs_impl", || { - let claim = claim.clone(); - async move { self.save_graphs_once(claim, graphs).await } - }) - .await - } - - async fn save_graphs_once( - &self, - claim: LockClaim, - graphs: &[GraphUpdate], - ) -> BackendResult> { - if graphs.is_empty() { - return Ok(Vec::new()); - } - let mut payloads = Vec::with_capacity(graphs.len()); - for graph in graphs { - payloads.push(( - graph.instance_id, - graph.next_scheduled_at(), - claim.lock_expires_at, - Self::serialize(graph)?, - )); - } - - Self::count_query(&self.query_counts, "update:queued_instances_scheduled_at"); - Self::count_batch_size( - &self.batch_size_counts, - "update:queued_instances_scheduled_at", - payloads.len(), - ); - let now = Utc::now(); - let mut schedule_builder: QueryBuilder = QueryBuilder::new( - "UPDATE queued_instances AS qi SET scheduled_at = v.scheduled_at, lock_expires_at = CASE WHEN qi.lock_expires_at IS NULL OR qi.lock_expires_at < v.lock_expires_at THEN v.lock_expires_at ELSE qi.lock_expires_at END FROM (", - ); - schedule_builder.push_values( - payloads.iter(), - |mut b, (instance_id, scheduled_at, 
lock_expires_at, _payload)| { - b.push_bind(*instance_id) - .push_bind(*scheduled_at) - .push_bind(*lock_expires_at); - }, - ); - schedule_builder.push( - ") AS v(instance_id, scheduled_at, lock_expires_at) - WHERE qi.instance_id = v.instance_id - AND qi.lock_uuid = ", - ); - schedule_builder.push_bind(claim.lock_uuid); - schedule_builder.push(" AND (qi.lock_expires_at IS NULL OR qi.lock_expires_at > "); - schedule_builder.push_bind(now); - schedule_builder.push(")"); - schedule_builder.build().execute(&self.pool).await?; - - Self::count_query(&self.query_counts, "update:runner_instances_state"); - Self::count_batch_size( - &self.batch_size_counts, - "update:runner_instances_state", - payloads.len(), - ); - let mut runner_builder: QueryBuilder = - QueryBuilder::new("UPDATE runner_instances AS ri SET state = v.state FROM ("); - runner_builder.push_values( - payloads.iter(), - |mut b, (instance_id, _scheduled_at, _lock_expires_at, payload)| { - b.push_bind(*instance_id).push_bind(payload.as_slice()); - }, - ); - runner_builder.push( - ") AS v(instance_id, state) - JOIN queued_instances qi ON qi.instance_id = v.instance_id - WHERE ri.instance_id = v.instance_id - AND qi.lock_uuid = ", - ); - runner_builder.push_bind(claim.lock_uuid); - runner_builder.push(" AND (qi.lock_expires_at IS NULL OR qi.lock_expires_at > "); - runner_builder.push_bind(now); - runner_builder.push(")"); - runner_builder.build().execute(&self.pool).await?; - - let ids: Vec = graphs.iter().map(|graph| graph.instance_id).collect(); - let lock_rows = sqlx::query( - "SELECT instance_id, lock_uuid, lock_expires_at FROM queued_instances WHERE instance_id = ANY($1)", - ) - .bind(&ids) - .fetch_all(&self.pool) - .await?; - - let mut lock_map: HashMap = HashMap::new(); - for row in lock_rows { - let instance_id: Uuid = row.get(0); - lock_map.insert( - instance_id, - InstanceLockStatus { - instance_id, - lock_uuid: row.get(1), - lock_expires_at: row.get(2), - }, - ); - } - - let mut locks = 
Vec::with_capacity(ids.len()); - for instance_id in ids { - locks.push( - lock_map - .get(&instance_id) - .cloned() - .unwrap_or(InstanceLockStatus { - instance_id, - lock_uuid: None, - lock_expires_at: None, - }), - ); - } - Ok(locks) - } - - #[obs] - async fn save_actions_done_impl(&self, actions: &[ActionDone]) -> BackendResult<()> { - if actions.is_empty() { - return Ok(()); - } - Self::count_query(&self.query_counts, "insert:runner_actions_done"); - Self::count_batch_size( - &self.batch_size_counts, - "insert:runner_actions_done", - actions.len(), - ); - let mut payloads = Vec::new(); - for action in actions { - payloads.push(( - action.execution_id, - action.attempt, - action.status.to_string(), - action.started_at, - action.completed_at, - action.duration_ms, - Self::serialize(&action.result)?, - )); - } - let mut builder: QueryBuilder = QueryBuilder::new( - "INSERT INTO runner_actions_done (execution_id, attempt, status, started_at, completed_at, duration_ms, result) ", - ); - builder.push_values( - payloads.iter(), - |mut b, (execution_id, attempt, status, started_at, completed_at, duration_ms, payload)| { - b.push_bind(*execution_id) - .push_bind(*attempt) - .push_bind(status.as_str()) - .push_bind(*started_at) - .push_bind(*completed_at) - .push_bind(*duration_ms) - .push_bind(payload.as_slice()); - }, - ); - builder.build().execute(&self.pool).await?; - Ok(()) - } - - #[obs] - async fn get_queued_instances_impl( - &self, - size: usize, - claim: LockClaim, - ) -> BackendResult { - retry_transient_backend("get_queued_instances_impl", || { - let claim = claim.clone(); - async move { self.get_queued_instances_once(size, claim).await } - }) - .await - } - - async fn get_queued_instances_once( - &self, - size: usize, - claim: LockClaim, - ) -> BackendResult { - if size == 0 { - return Ok(QueuedInstanceBatch { - instances: Vec::new(), - }); - } - let now = Utc::now(); - let mut tx = self.pool.begin().await?; - Self::count_query(&self.query_counts, 
"select:queued_instances"); - let rows = sqlx::query( - r#" - WITH claimed AS ( - SELECT instance_id, payload - FROM queued_instances - WHERE scheduled_at <= $1 - AND (lock_uuid IS NULL OR lock_expires_at <= $1) - ORDER BY scheduled_at, created_at - LIMIT $2 - FOR UPDATE SKIP LOCKED - ), - updated AS ( - UPDATE queued_instances AS qi - SET lock_uuid = $3, - lock_expires_at = $4 - FROM claimed - WHERE qi.instance_id = claimed.instance_id - RETURNING qi.instance_id, claimed.payload - ) - SELECT updated.instance_id, updated.payload, ri.state - FROM updated - JOIN runner_instances ri ON ri.instance_id = updated.instance_id - "#, - ) - .bind(now) - .bind(size as i64) - .bind(claim.lock_uuid) - .bind(claim.lock_expires_at) - .fetch_all(&mut *tx) - .await?; - - if rows.is_empty() { - tx.commit().await?; - return Ok(QueuedInstanceBatch { - instances: Vec::new(), - }); - } - - let claimed_instance_ids: Vec = - rows.iter().map(|row| row.get("instance_id")).collect(); - sqlx::query("UPDATE runner_instances SET current_status = $2 WHERE instance_id = ANY($1)") - .bind(&claimed_instance_ids) - .bind(INSTANCE_STATUS_RUNNING) - .execute(&mut *tx) - .await?; - - Self::count_batch_size( - &self.batch_size_counts, - "select:queued_instances", - rows.len(), - ); - tx.commit().await?; - - let mut instances = Vec::new(); - let mut action_node_ids_by_instance: HashMap> = HashMap::new(); - let mut all_action_node_ids: Vec = Vec::new(); - for row in rows { - let instance_id: Uuid = row.get(0); - let payload: Vec = row.get(1); - let state_payload: Option> = row.get(2); - let mut instance: QueuedInstance = Self::deserialize(&payload)?; - instance.instance_id = instance_id; - if let Some(state_payload) = state_payload { - let graph: GraphUpdate = Self::deserialize(&state_payload)?; - let action_node_ids: Vec = graph - .nodes - .iter() - .filter_map(|(node_id, node)| node.is_action_call().then_some(*node_id)) - .collect(); - if !action_node_ids.is_empty() { - 
all_action_node_ids.extend(action_node_ids.iter().copied()); - action_node_ids_by_instance.insert(instance_id, action_node_ids); - } - instance.state = Some(RunnerState::new( - None, - Some(graph.nodes), - Some(graph.edges), - false, - )); - } - instances.push(instance); - } - - if !all_action_node_ids.is_empty() { - all_action_node_ids.sort_unstable(); - all_action_node_ids.dedup(); - - Self::count_query( - &self.query_counts, - "select:runner_actions_done_by_execution_id", - ); - let rows = sqlx::query( - r#" - SELECT DISTINCT ON (execution_id) - execution_id, - result - FROM runner_actions_done - WHERE execution_id = ANY($1) - ORDER BY execution_id, attempt DESC, id DESC - "#, - ) - .bind(&all_action_node_ids) - .fetch_all(&self.pool) - .await?; - - let mut action_results_by_execution_id: HashMap = - HashMap::new(); - for row in rows { - let execution_id: Uuid = row.get("execution_id"); - let result_payload: Option> = row.get("result"); - let Some(result_payload) = result_payload else { - continue; - }; - let result: serde_json::Value = Self::deserialize(&result_payload)?; - action_results_by_execution_id.insert(execution_id, result); - } - - for instance in &mut instances { - let Some(action_node_ids) = action_node_ids_by_instance.get(&instance.instance_id) - else { - continue; - }; - for node_id in action_node_ids { - if let Some(result) = action_results_by_execution_id.get(node_id) { - instance.action_results.insert(*node_id, result.clone()); - } - } - } - } - - Ok(QueuedInstanceBatch { instances }) - } - - #[obs] - async fn save_instances_done_impl(&self, instances: &[InstanceDone]) -> BackendResult<()> { - retry_transient_backend("save_instances_done_impl", || async move { - self.save_instances_done_once(instances).await - }) - .await - } - - async fn save_instances_done_once(&self, instances: &[InstanceDone]) -> BackendResult<()> { - if instances.is_empty() { - return Ok(()); - } - let ids: Vec = instances - .iter() - .map(|instance| instance.executor_id) 
- .collect(); - - let mut tx = self.pool.begin().await?; - Self::count_query(&self.query_counts, "delete:queued_instances_by_id"); - sqlx::query("DELETE FROM queued_instances WHERE instance_id = ANY($1)") - .bind(&ids) - .execute(&mut *tx) - .await?; - - Self::count_query(&self.query_counts, "update:runner_instances_result"); - Self::count_batch_size( - &self.batch_size_counts, - "update:runner_instances_result", - instances.len(), - ); - let mut payloads = Vec::with_capacity(instances.len()); - for instance in instances { - let current_status = instance_done_status(instance); - let result = match &instance.result { - Some(value) => Some(Self::serialize(value)?), - None => None, - }; - let error = match &instance.error { - Some(value) => Some(Self::serialize(value)?), - None => None, - }; - payloads.push((instance.executor_id, current_status, result, error)); - } - let mut builder: QueryBuilder = QueryBuilder::new( - "UPDATE runner_instances AS ri SET result = v.result, error = v.error, current_status = v.current_status FROM (", - ); - builder.push_values( - payloads.iter(), - |mut b, (instance_id, current_status, result, error)| { - b.push_bind(*instance_id) - .push_bind(*current_status) - .push_bind(result.as_deref()) - .push_bind(error.as_deref()); - }, - ); - builder.push( - ") AS v(instance_id, current_status, result, error) WHERE ri.instance_id = v.instance_id", - ); - builder.build().execute(&mut *tx).await?; - tx.commit().await?; - Ok(()) - } -} - -#[async_trait] -impl CoreBackend for PostgresBackend { - fn clone_box(&self) -> Box { - Box::new(self.clone()) - } - - async fn save_graphs( - &self, - claim: LockClaim, - graphs: &[GraphUpdate], - ) -> BackendResult> { - self.save_graphs_impl(claim, graphs).await - } - - async fn save_actions_done(&self, actions: &[ActionDone]) -> BackendResult<()> { - self.save_actions_done_impl(actions).await - } - - async fn get_queued_instances( - &self, - size: usize, - claim: LockClaim, - ) -> BackendResult { - 
self.get_queued_instances_impl(size, claim).await - } - - async fn save_instances_done(&self, instances: &[InstanceDone]) -> BackendResult<()> { - self.save_instances_done_impl(instances).await - } - - async fn refresh_instance_locks( - &self, - claim: LockClaim, - instance_ids: &[Uuid], - ) -> BackendResult> { - retry_transient_backend("refresh_instance_locks", || { - let claim = claim.clone(); - async move { self.refresh_instance_locks_once(claim, instance_ids).await } - }) - .await - } - - async fn release_instance_locks( - &self, - lock_uuid: Uuid, - instance_ids: &[Uuid], - ) -> BackendResult<()> { - if instance_ids.is_empty() { - return Ok(()); - } - Self::count_query(&self.query_counts, "update:queued_instances_release"); - let released_rows = sqlx::query( - r#" - WITH releasable AS ( - SELECT instance_id - FROM queued_instances - WHERE instance_id = ANY($1) - AND lock_uuid = $2 - FOR UPDATE SKIP LOCKED - ), - released AS ( - UPDATE queued_instances AS qi - SET lock_uuid = NULL, - lock_expires_at = NULL - FROM releasable - WHERE qi.instance_id = releasable.instance_id - RETURNING qi.instance_id - ) - SELECT instance_id FROM released - "#, - ) - .bind(instance_ids) - .bind(lock_uuid) - .fetch_all(&self.pool) - .await?; - - if !released_rows.is_empty() { - let released_instance_ids: Vec = released_rows - .iter() - .map(|row| row.get("instance_id")) - .collect(); - sqlx::query( - "UPDATE runner_instances SET current_status = $2 WHERE instance_id = ANY($1) AND result IS NULL AND error IS NULL", - ) - .bind(&released_instance_ids) - .bind(INSTANCE_STATUS_QUEUED) - .execute(&self.pool) - .await?; - } - - Ok(()) - } - - async fn queue_instances(&self, instances: &[QueuedInstance]) -> BackendResult<()> { - PostgresBackend::queue_instances(self, instances).await - } -} - -impl PostgresBackend { - async fn refresh_instance_locks_once( - &self, - claim: LockClaim, - instance_ids: &[Uuid], - ) -> BackendResult> { - if instance_ids.is_empty() { - return Ok(Vec::new()); - 
} - Self::count_query(&self.query_counts, "update:queued_instances_lock"); - sqlx::query( - r#" - WITH claimable AS ( - SELECT instance_id - FROM queued_instances - WHERE instance_id = ANY($2) - AND lock_uuid = $3 - FOR UPDATE SKIP LOCKED - ) - UPDATE queued_instances AS qi - SET lock_expires_at = $1 - FROM claimable - WHERE qi.instance_id = claimable.instance_id - "#, - ) - .bind(claim.lock_expires_at) - .bind(instance_ids) - .bind(claim.lock_uuid) - .execute(&self.pool) - .await?; - let rows = sqlx::query( - "SELECT instance_id, lock_uuid, lock_expires_at FROM queued_instances WHERE instance_id = ANY($1)", - ) - .bind(instance_ids) - .fetch_all(&self.pool) - .await?; - let mut locks = Vec::with_capacity(rows.len()); - for row in rows { - locks.push(InstanceLockStatus { - instance_id: row.get(0), - lock_uuid: row.get(1), - lock_expires_at: row.get(2), - }); - } - Ok(locks) - } -} - -#[async_trait] -impl GarbageCollectorBackend for PostgresBackend { - async fn collect_done_instances( - &self, - older_than: DateTime, - limit: usize, - ) -> BackendResult { - self.collect_done_instances_impl(older_than, limit).await - } -} - -#[async_trait] -impl WorkerStatusBackend for PostgresBackend { - async fn upsert_worker_status(&self, status: &WorkerStatusUpdate) -> BackendResult<()> { - PostgresBackend::upsert_worker_status(self, status).await - } -} - -#[cfg(test)] -mod tests { - use std::collections::{HashMap, HashSet}; - use std::sync::Arc; - use std::sync::atomic::{AtomicUsize, Ordering}; - use std::time::Duration as StdDuration; - - use chrono::{DateTime, Duration, Utc}; - use serial_test::serial; - use sqlx::Row; - use uuid::Uuid; - - use super::super::test_helpers::setup_backend; - use super::*; - use crate::backends::{ - ActionAttemptStatus, CoreBackend, GarbageCollectorBackend, WorkerStatusBackend, - }; - use crate::waymark_core::runner::state::{ActionCallSpec, ExecutionNode, NodeStatus}; - use waymark_dag::EdgeType; - - fn sample_runner_state() -> RunnerState { - 
RunnerState::new(None, None, None, false) - } - - fn sample_queued_instance(instance_id: Uuid, entry_node: Uuid) -> QueuedInstance { - QueuedInstance { - workflow_version_id: Uuid::new_v4(), - schedule_id: None, - dag: None, - entry_node, - state: Some(sample_runner_state()), - action_results: HashMap::new(), - instance_id, - scheduled_at: Some(Utc::now() - Duration::seconds(1)), - } - } - - fn sample_execution_node(node_id: Uuid) -> ExecutionNode { - ExecutionNode { - node_id, - node_type: "action_call".to_string(), - label: "@tests.action()".to_string(), - status: NodeStatus::Queued, - template_id: Some("n0".to_string()), - targets: Vec::new(), - action: Some(ActionCallSpec { - action_name: "tests.action".to_string(), - module_name: Some("tests".to_string()), - kwargs: HashMap::new(), - }), - value_expr: None, - assignments: HashMap::new(), - action_attempt: 1, - started_at: None, - completed_at: None, - scheduled_at: Some(Utc::now() + Duration::seconds(15)), - } - } - - fn sample_lock_claim() -> LockClaim { - LockClaim { - lock_uuid: Uuid::new_v4(), - lock_expires_at: Utc::now() + Duration::seconds(30), - } - } - - async fn insert_workflow_version_row( - backend: &PostgresBackend, - workflow_version_id: Uuid, - workflow_name: &str, - ) { - sqlx::query( - "INSERT INTO workflow_versions (id, workflow_name, workflow_version, ir_hash, program_proto, concurrent) VALUES ($1, $2, $3, $4, $5, $6)", - ) - .bind(workflow_version_id) - .bind(workflow_name) - .bind("v1") - .bind(format!("hash-{workflow_name}")) - .bind(vec![0_u8]) - .bind(false) - .execute(backend.pool()) - .await - .expect("insert workflow version row"); - } - - async fn claim_instance(backend: &PostgresBackend, instance_id: Uuid) -> LockClaim { - let claim = sample_lock_claim(); - let batch = CoreBackend::get_queued_instances(backend, 10, claim.clone()) - .await - .expect("claim queued instance"); - assert_eq!(batch.instances.len(), 1); - assert_eq!(batch.instances[0].instance_id, instance_id); - claim - 
} - - #[serial(postgres)] - #[tokio::test] - async fn core_queue_instances_happy_path() { - let backend = setup_backend().await; - let instance_id = Uuid::new_v4(); - let entry_node = Uuid::new_v4(); - let queued = sample_queued_instance(instance_id, entry_node); - let expected_workflow_version_id = queued.workflow_version_id; - - CoreBackend::queue_instances(&backend, &[queued]) - .await - .expect("queue instances"); - - let queued_count: i64 = - sqlx::query_scalar("SELECT COUNT(*) FROM queued_instances WHERE instance_id = $1") - .bind(instance_id) - .fetch_one(backend.pool()) - .await - .expect("queued count"); - assert_eq!(queued_count, 1); - - let runner_count: i64 = - sqlx::query_scalar("SELECT COUNT(*) FROM runner_instances WHERE instance_id = $1") - .bind(instance_id) - .fetch_one(backend.pool()) - .await - .expect("runner count"); - assert_eq!(runner_count, 1); - - let workflow_version_id: Option = sqlx::query_scalar( - "SELECT workflow_version_id FROM runner_instances WHERE instance_id = $1", - ) - .bind(instance_id) - .fetch_one(backend.pool()) - .await - .expect("runner workflow version"); - assert_eq!(workflow_version_id, Some(expected_workflow_version_id)); - - let runner_status: Option = sqlx::query_scalar( - "SELECT current_status FROM runner_instances WHERE instance_id = $1", - ) - .bind(instance_id) - .fetch_one(backend.pool()) - .await - .expect("runner current status"); - assert_eq!(runner_status.as_deref(), Some(INSTANCE_STATUS_QUEUED)); - - let queued_status: Option = sqlx::query_scalar( - "SELECT current_status FROM queued_instances WHERE instance_id = $1", - ) - .bind(instance_id) - .fetch_one(backend.pool()) - .await - .expect("queued current status"); - assert_eq!(queued_status.as_deref(), Some(INSTANCE_STATUS_QUEUED)); - } - - #[serial(postgres)] - #[tokio::test] - async fn core_queue_instances_persists_workflow_name_when_registered() { - let backend = setup_backend().await; - let instance_id = Uuid::new_v4(); - let entry_node = 
Uuid::new_v4(); - let workflow_version_id = Uuid::new_v4(); - insert_workflow_version_row(&backend, workflow_version_id, "tests.searchable").await; - - let queued = QueuedInstance { - workflow_version_id, - schedule_id: None, - dag: None, - entry_node, - state: Some(sample_runner_state()), - action_results: HashMap::new(), - instance_id, - scheduled_at: Some(Utc::now()), - }; - - CoreBackend::queue_instances(&backend, &[queued]) - .await - .expect("queue instances"); - - let runner_workflow_name: Option = - sqlx::query_scalar("SELECT workflow_name FROM runner_instances WHERE instance_id = $1") - .bind(instance_id) - .fetch_one(backend.pool()) - .await - .expect("runner workflow_name"); - assert_eq!(runner_workflow_name.as_deref(), Some("tests.searchable")); - - let queued_workflow_name: Option = - sqlx::query_scalar("SELECT workflow_name FROM queued_instances WHERE instance_id = $1") - .bind(instance_id) - .fetch_one(backend.pool()) - .await - .expect("queued workflow_name"); - assert_eq!(queued_workflow_name.as_deref(), Some("tests.searchable")); - } - - #[serial(postgres)] - #[tokio::test] - async fn core_get_queued_instances_updates_runner_status_without_mutating_queue_status() { - let backend = setup_backend().await; - let instance_id = Uuid::new_v4(); - let entry_node = Uuid::new_v4(); - let queued = sample_queued_instance(instance_id, entry_node); - CoreBackend::queue_instances(&backend, &[queued]) - .await - .expect("queue instances"); - - let claim = sample_lock_claim(); - let batch = CoreBackend::get_queued_instances(&backend, 1, claim.clone()) - .await - .expect("get queued instances"); - assert_eq!(batch.instances.len(), 1); - assert_eq!(batch.instances[0].instance_id, instance_id); - - let row = sqlx::query("SELECT lock_uuid FROM queued_instances WHERE instance_id = $1") - .bind(instance_id) - .fetch_one(backend.pool()) - .await - .expect("queued lock row"); - let lock_uuid: Option = row.get("lock_uuid"); - assert_eq!(lock_uuid, Some(claim.lock_uuid)); 
- - let queued_status: Option = sqlx::query_scalar( - "SELECT current_status FROM queued_instances WHERE instance_id = $1", - ) - .bind(instance_id) - .fetch_one(backend.pool()) - .await - .expect("queued current status"); - assert_eq!(queued_status.as_deref(), Some(INSTANCE_STATUS_QUEUED)); - - let runner_status: Option = sqlx::query_scalar( - "SELECT current_status FROM runner_instances WHERE instance_id = $1", - ) - .bind(instance_id) - .fetch_one(backend.pool()) - .await - .expect("runner current status"); - assert_eq!(runner_status.as_deref(), Some(INSTANCE_STATUS_RUNNING)); - } - - #[serial(postgres)] - #[tokio::test] - async fn core_get_queued_instances_restores_action_results_from_actions_done() { - let backend = setup_backend().await; - let instance_id = Uuid::new_v4(); - let entry_node = Uuid::new_v4(); - CoreBackend::queue_instances(&backend, &[sample_queued_instance(instance_id, entry_node)]) - .await - .expect("queue instances"); - - let initial_claim = sample_lock_claim(); - let initial_batch = CoreBackend::get_queued_instances(&backend, 1, initial_claim.clone()) - .await - .expect("initial claim"); - assert_eq!(initial_batch.instances.len(), 1); - - let execution_id = Uuid::new_v4(); - let mut completed_action_node = sample_execution_node(execution_id); - completed_action_node.status = NodeStatus::Completed; - completed_action_node.scheduled_at = None; - - let graph = GraphUpdate { - instance_id, - nodes: HashMap::from([(execution_id, completed_action_node)]), - edges: std::collections::HashSet::new(), - }; - CoreBackend::save_graphs( - &backend, - initial_claim.clone(), - std::slice::from_ref(&graph), - ) - .await - .expect("persist graph"); - - CoreBackend::save_actions_done( - &backend, - &[ActionDone { - execution_id, - attempt: 1, - status: ActionAttemptStatus::Completed, - started_at: None, - completed_at: Some(Utc::now()), - duration_ms: None, - result: serde_json::json!({"ok": true}), - }], - ) - .await - .expect("persist action result"); - - 
CoreBackend::release_instance_locks(&backend, initial_claim.lock_uuid, &[instance_id]) - .await - .expect("release initial lock"); - - let queued_status: Option = sqlx::query_scalar( - "SELECT current_status FROM queued_instances WHERE instance_id = $1", - ) - .bind(instance_id) - .fetch_one(backend.pool()) - .await - .expect("queued current status after release"); - assert_eq!(queued_status.as_deref(), Some(INSTANCE_STATUS_QUEUED)); - - let runner_status: Option = sqlx::query_scalar( - "SELECT current_status FROM runner_instances WHERE instance_id = $1", - ) - .bind(instance_id) - .fetch_one(backend.pool()) - .await - .expect("runner current status after release"); - assert_eq!(runner_status.as_deref(), Some(INSTANCE_STATUS_QUEUED)); - - let second_claim = sample_lock_claim(); - let batch = CoreBackend::get_queued_instances(&backend, 1, second_claim) - .await - .expect("rehydrate instance"); - assert_eq!(batch.instances.len(), 1); - assert_eq!( - batch.instances[0].action_results.get(&execution_id), - Some(&serde_json::json!({"ok": true})) - ); - } - - #[serial(postgres)] - #[tokio::test] - async fn core_save_graphs_happy_path() { - let backend = setup_backend().await; - let instance_id = Uuid::new_v4(); - let entry_node = Uuid::new_v4(); - CoreBackend::queue_instances(&backend, &[sample_queued_instance(instance_id, entry_node)]) - .await - .expect("queue instances"); - let claim = claim_instance(&backend, instance_id).await; - - let execution_id = Uuid::new_v4(); - let mut nodes = HashMap::new(); - nodes.insert(execution_id, sample_execution_node(execution_id)); - let graph = GraphUpdate { - instance_id, - nodes, - edges: std::collections::HashSet::from([ - crate::waymark_core::runner::state::ExecutionEdge { - source: execution_id, - target: execution_id, - edge_type: EdgeType::StateMachine, - }, - ]), - }; - let extended_claim = LockClaim { - lock_uuid: claim.lock_uuid, - lock_expires_at: claim.lock_expires_at + Duration::seconds(120), - }; - - let locks = 
CoreBackend::save_graphs( - &backend, - extended_claim.clone(), - std::slice::from_ref(&graph), - ) - .await - .expect("save graphs"); - assert_eq!(locks.len(), 1); - assert_eq!(locks[0].instance_id, instance_id); - assert_eq!(locks[0].lock_uuid, Some(claim.lock_uuid)); - assert_eq!( - locks[0] - .lock_expires_at - .map(|value| value.timestamp_micros()), - Some(extended_claim.lock_expires_at.timestamp_micros()), - ); - - let state_payload: Option> = - sqlx::query_scalar("SELECT state FROM runner_instances WHERE instance_id = $1") - .bind(instance_id) - .fetch_one(backend.pool()) - .await - .expect("runner state payload"); - let decoded: GraphUpdate = rmp_serde::from_slice(&state_payload.expect("state payload")) - .expect("decode graph update"); - assert_eq!(decoded.nodes.len(), 1); - assert_eq!(decoded.edges.len(), 1); - } - - #[serial(postgres)] - #[tokio::test] - async fn core_save_graphs_returns_lock_status_for_duplicate_instance_updates() { - let backend = setup_backend().await; - let instance_id = Uuid::new_v4(); - let entry_node = Uuid::new_v4(); - CoreBackend::queue_instances(&backend, &[sample_queued_instance(instance_id, entry_node)]) - .await - .expect("queue instances"); - let claim = claim_instance(&backend, instance_id).await; - - let first_node_id = Uuid::new_v4(); - let second_node_id = Uuid::new_v4(); - let first_graph = GraphUpdate { - instance_id, - nodes: HashMap::from([(first_node_id, sample_execution_node(first_node_id))]), - edges: HashSet::new(), - }; - let second_graph = GraphUpdate { - instance_id, - nodes: HashMap::from([(second_node_id, sample_execution_node(second_node_id))]), - edges: HashSet::new(), - }; - - let locks = CoreBackend::save_graphs( - &backend, - claim.clone(), - &[first_graph.clone(), second_graph.clone()], - ) - .await - .expect("save duplicate instance graphs"); - assert_eq!(locks.len(), 2); - assert_eq!(locks[0].instance_id, instance_id); - assert_eq!(locks[1].instance_id, instance_id); - assert_eq!(locks[0].lock_uuid, 
Some(claim.lock_uuid)); - assert_eq!(locks[1].lock_uuid, Some(claim.lock_uuid)); - } - - #[serial(postgres)] - #[tokio::test] - async fn core_save_actions_done_happy_path() { - let backend = setup_backend().await; - let execution_id = Uuid::new_v4(); - CoreBackend::save_actions_done( - &backend, - &[ActionDone { - execution_id, - attempt: 1, - status: ActionAttemptStatus::Completed, - started_at: None, - completed_at: Some(Utc::now()), - duration_ms: None, - result: serde_json::json!({"ok": true}), - }], - ) - .await - .expect("save actions done"); - - let row = sqlx::query( - "SELECT execution_id, attempt, status, started_at, completed_at, duration_ms, result FROM runner_actions_done WHERE execution_id = $1", - ) - .bind(execution_id) - .fetch_one(backend.pool()) - .await - .expect("action row"); - - assert_eq!(row.get::("execution_id"), execution_id); - assert_eq!(row.get::("attempt"), 1); - assert_eq!(row.get::("status"), "completed"); - assert!( - row.get::>, _>("completed_at") - .is_some() - ); - let payload: Option> = row.get("result"); - let decoded: serde_json::Value = - rmp_serde::from_slice(&payload.expect("action payload")).expect("decode action"); - assert_eq!(decoded, serde_json::json!({"ok": true})); - } - - #[serial(postgres)] - #[tokio::test] - async fn core_refresh_instance_locks_happy_path() { - let backend = setup_backend().await; - let instance_id = Uuid::new_v4(); - let entry_node = Uuid::new_v4(); - CoreBackend::queue_instances(&backend, &[sample_queued_instance(instance_id, entry_node)]) - .await - .expect("queue instances"); - let claim = claim_instance(&backend, instance_id).await; - - let refreshed_expiry = Utc::now() + Duration::seconds(120); - let refreshed = CoreBackend::refresh_instance_locks( - &backend, - LockClaim { - lock_uuid: claim.lock_uuid, - lock_expires_at: refreshed_expiry, - }, - &[instance_id], - ) - .await - .expect("refresh locks"); - - assert_eq!(refreshed.len(), 1); - assert_eq!(refreshed[0].instance_id, instance_id); 
- assert_eq!(refreshed[0].lock_uuid, Some(claim.lock_uuid)); - assert_eq!( - refreshed[0] - .lock_expires_at - .map(|value| value.timestamp_micros()), - Some(refreshed_expiry.timestamp_micros()), - ); - } - - #[serial(postgres)] - #[tokio::test] - async fn core_refresh_instance_locks_skip_locked_does_not_block_or_override() { - let backend = setup_backend().await; - let instance_id = Uuid::new_v4(); - let entry_node = Uuid::new_v4(); - CoreBackend::queue_instances(&backend, &[sample_queued_instance(instance_id, entry_node)]) - .await - .expect("queue instances"); - let claim = claim_instance(&backend, instance_id).await; - - let mut tx = backend.pool().begin().await.expect("begin lock tx"); - sqlx::query("SELECT instance_id FROM queued_instances WHERE instance_id = $1 FOR UPDATE") - .bind(instance_id) - .fetch_one(&mut *tx) - .await - .expect("lock queued row"); - - let refreshed_expiry = Utc::now() + Duration::seconds(120); - let refreshed = tokio::time::timeout( - StdDuration::from_millis(300), - CoreBackend::refresh_instance_locks( - &backend, - LockClaim { - lock_uuid: claim.lock_uuid, - lock_expires_at: refreshed_expiry, - }, - &[instance_id], - ), - ) - .await - .expect("refresh should not block") - .expect("refresh locks"); - - assert_eq!(refreshed.len(), 1); - assert_eq!(refreshed[0].instance_id, instance_id); - assert_eq!(refreshed[0].lock_uuid, Some(claim.lock_uuid)); - assert_eq!( - refreshed[0] - .lock_expires_at - .map(|value| value.timestamp_micros()), - Some(claim.lock_expires_at.timestamp_micros()), - ); - } - - #[serial(postgres)] - #[tokio::test] - async fn core_release_instance_locks_happy_path() { - let backend = setup_backend().await; - let instance_id = Uuid::new_v4(); - let entry_node = Uuid::new_v4(); - CoreBackend::queue_instances(&backend, &[sample_queued_instance(instance_id, entry_node)]) - .await - .expect("queue instances"); - let claim = claim_instance(&backend, instance_id).await; - - CoreBackend::release_instance_locks(&backend, 
claim.lock_uuid, &[instance_id]) - .await - .expect("release locks"); - - let row = sqlx::query( - "SELECT lock_uuid, lock_expires_at FROM queued_instances WHERE instance_id = $1", - ) - .bind(instance_id) - .fetch_one(backend.pool()) - .await - .expect("lock row"); - let lock_uuid: Option = row.get("lock_uuid"); - let lock_expires_at: Option> = row.get("lock_expires_at"); - assert!(lock_uuid.is_none()); - assert!(lock_expires_at.is_none()); - - let queued_status: Option = sqlx::query_scalar( - "SELECT current_status FROM queued_instances WHERE instance_id = $1", - ) - .bind(instance_id) - .fetch_one(backend.pool()) - .await - .expect("queued current status after release"); - assert_eq!(queued_status.as_deref(), Some(INSTANCE_STATUS_QUEUED)); - - let runner_status: Option = sqlx::query_scalar( - "SELECT current_status FROM runner_instances WHERE instance_id = $1", - ) - .bind(instance_id) - .fetch_one(backend.pool()) - .await - .expect("runner current status after release"); - assert_eq!(runner_status.as_deref(), Some(INSTANCE_STATUS_QUEUED)); - } - - #[serial(postgres)] - #[tokio::test] - async fn core_reclaim_expired_instance_locks_happy_path() { - let backend = setup_backend().await; - let expired_id = Uuid::new_v4(); - let live_id = Uuid::new_v4(); - let entry_node = Uuid::new_v4(); - CoreBackend::queue_instances( - &backend, - &[ - sample_queued_instance(expired_id, entry_node), - sample_queued_instance(live_id, entry_node), - ], - ) - .await - .expect("queue instances"); - - let claim = sample_lock_claim(); - let claimed = CoreBackend::get_queued_instances(&backend, 10, claim.clone()) - .await - .expect("claim queued instances"); - assert_eq!(claimed.instances.len(), 2); - - let expired_at = Utc::now() - Duration::seconds(1); - let live_at = Utc::now() + Duration::seconds(60); - sqlx::query( - r#" - UPDATE queued_instances - SET lock_expires_at = CASE - WHEN instance_id = $1 THEN $3 - ELSE $4 - END - WHERE instance_id IN ($1, $2) - "#, - ) - .bind(expired_id) 
- .bind(live_id) - .bind(expired_at) - .bind(live_at) - .execute(backend.pool()) - .await - .expect("set lock expiries"); - - let reclaimed = backend - .reclaim_expired_instance_locks(10) - .await - .expect("reclaim expired locks"); - assert_eq!(reclaimed, 1); - - let rows = sqlx::query( - "SELECT instance_id, lock_uuid, lock_expires_at FROM queued_instances WHERE instance_id IN ($1, $2)", - ) - .bind(expired_id) - .bind(live_id) - .fetch_all(backend.pool()) - .await - .expect("fetch lock rows"); - let mut lock_rows: HashMap, Option>)> = - HashMap::new(); - for row in rows { - let instance_id: Uuid = row.get("instance_id"); - let lock_uuid: Option = row.get("lock_uuid"); - let lock_expires_at: Option> = row.get("lock_expires_at"); - lock_rows.insert(instance_id, (lock_uuid, lock_expires_at)); - } - - let expired_lock = lock_rows.get(&expired_id).expect("expired lock row"); - assert_eq!(*expired_lock, (None, None)); - - let expired_runner_status: Option = sqlx::query_scalar( - "SELECT current_status FROM runner_instances WHERE instance_id = $1", - ) - .bind(expired_id) - .fetch_one(backend.pool()) - .await - .expect("expired runner status"); - assert_eq!( - expired_runner_status.as_deref(), - Some(INSTANCE_STATUS_QUEUED) - ); - - let live_lock = lock_rows.get(&live_id).expect("live lock row"); - assert_eq!(live_lock.0, Some(claim.lock_uuid)); - assert_eq!( - live_lock.1.map(|value| value.timestamp_micros()), - Some(live_at.timestamp_micros()), - ); - - let live_runner_status: Option = sqlx::query_scalar( - "SELECT current_status FROM runner_instances WHERE instance_id = $1", - ) - .bind(live_id) - .fetch_one(backend.pool()) - .await - .expect("live runner status"); - assert_eq!(live_runner_status.as_deref(), Some(INSTANCE_STATUS_RUNNING)); - } - - #[serial(postgres)] - #[tokio::test] - async fn core_save_instances_done_happy_path() { - let backend = setup_backend().await; - let instance_id = Uuid::new_v4(); - let entry_node = Uuid::new_v4(); - 
CoreBackend::queue_instances(&backend, &[sample_queued_instance(instance_id, entry_node)]) - .await - .expect("queue instances"); - - CoreBackend::save_instances_done( - &backend, - &[InstanceDone { - executor_id: instance_id, - entry_node, - result: Some(serde_json::json!({"value": 3})), - error: None, - }], - ) - .await - .expect("save instances done"); - - let result_payload: Option> = - sqlx::query_scalar("SELECT result FROM runner_instances WHERE instance_id = $1") - .bind(instance_id) - .fetch_one(backend.pool()) - .await - .expect("result payload"); - let decoded: serde_json::Value = - rmp_serde::from_slice(&result_payload.expect("stored result")).expect("decode result"); - assert_eq!(decoded, serde_json::json!({"value": 3})); - - let queued_count: i64 = - sqlx::query_scalar("SELECT COUNT(*) FROM queued_instances WHERE instance_id = $1") - .bind(instance_id) - .fetch_one(backend.pool()) - .await - .expect("queued count"); - assert_eq!(queued_count, 0); - - let runner_status: Option = sqlx::query_scalar( - "SELECT current_status FROM runner_instances WHERE instance_id = $1", - ) - .bind(instance_id) - .fetch_one(backend.pool()) - .await - .expect("runner status"); - assert_eq!(runner_status.as_deref(), Some(INSTANCE_STATUS_COMPLETED)); - } - - #[serial(postgres)] - #[tokio::test] - async fn core_save_instances_done_updates_runner_even_if_queue_row_missing() { - let backend = setup_backend().await; - let instance_id = Uuid::new_v4(); - let entry_node = Uuid::new_v4(); - CoreBackend::queue_instances(&backend, &[sample_queued_instance(instance_id, entry_node)]) - .await - .expect("queue instances"); - - sqlx::query("DELETE FROM queued_instances WHERE instance_id = $1") - .bind(instance_id) - .execute(backend.pool()) - .await - .expect("delete queued row"); - - CoreBackend::save_instances_done( - &backend, - &[InstanceDone { - executor_id: instance_id, - entry_node, - result: Some(serde_json::json!({"value": 11})), - error: None, - }], - ) - .await - 
.expect("save instances done without queue row"); - - let runner_status: Option = sqlx::query_scalar( - "SELECT current_status FROM runner_instances WHERE instance_id = $1", - ) - .bind(instance_id) - .fetch_one(backend.pool()) - .await - .expect("runner status"); - assert_eq!(runner_status.as_deref(), Some(INSTANCE_STATUS_COMPLETED)); - } - - #[serial(postgres)] - #[tokio::test] - async fn core_retry_transient_deadlock_sqlstate_happy_path() { - let backend = setup_backend().await; - let pool = backend.pool().clone(); - let attempts = Arc::new(AtomicUsize::new(0)); - let result = retry_transient_backend("core_retry_test", || { - let pool = pool.clone(); - let attempts = Arc::clone(&attempts); - async move { - let attempt = attempts.fetch_add(1, Ordering::SeqCst); - if attempt < 2 { - sqlx::query( - "DO $$ BEGIN RAISE EXCEPTION 'simulated deadlock' USING ERRCODE='40P01'; END $$;", - ) - .execute(&pool) - .await?; - } - Ok(()) - } - }) - .await; - - assert!(result.is_ok()); - assert_eq!(attempts.load(Ordering::SeqCst), 3); - } - - #[serial(postgres)] - #[tokio::test] - async fn core_retry_non_transient_sqlstate_fails_without_retry() { - let backend = setup_backend().await; - let pool = backend.pool().clone(); - let attempts = Arc::new(AtomicUsize::new(0)); - let result = retry_transient_backend("core_retry_non_transient_test", || { - let pool = pool.clone(); - let attempts = Arc::clone(&attempts); - async move { - attempts.fetch_add(1, Ordering::SeqCst); - sqlx::query( - "DO $$ BEGIN RAISE EXCEPTION 'simulated unique violation' USING ERRCODE='23505'; END $$;", - ) - .execute(&pool) - .await?; - Ok::<(), BackendError>(()) - } - }) - .await; - - assert!(result.is_err()); - assert_eq!(attempts.load(Ordering::SeqCst), 1); - } - - #[serial(postgres)] - #[tokio::test] - async fn garbage_collector_deletes_old_done_instances_and_actions() { - let backend = setup_backend().await; - let instance_id = Uuid::new_v4(); - let execution_id = Uuid::new_v4(); - let entry_node = 
Uuid::new_v4(); - let workflow_version_id = Uuid::new_v4(); - - let state = GraphUpdate { - instance_id, - nodes: HashMap::from([(execution_id, sample_execution_node(execution_id))]), - edges: HashSet::new(), - }; - let state_payload = PostgresBackend::serialize(&state).expect("serialize state"); - let result_payload = - PostgresBackend::serialize(&serde_json::json!({"ok": true})).expect("serialize done"); - let action_payload = - PostgresBackend::serialize(&serde_json::json!({"value": 1})).expect("serialize action"); - - sqlx::query( - "INSERT INTO runner_instances (instance_id, entry_node, workflow_version_id, created_at, state, result) VALUES ($1, $2, $3, $4, $5, $6)", - ) - .bind(instance_id) - .bind(entry_node) - .bind(workflow_version_id) - .bind(Utc::now() - Duration::hours(30)) - .bind(state_payload) - .bind(result_payload) - .execute(backend.pool()) - .await - .expect("insert old done instance"); - - sqlx::query( - "INSERT INTO runner_actions_done (execution_id, attempt, status, result) VALUES ($1, $2, $3, $4)", - ) - .bind(execution_id) - .bind(1_i32) - .bind("completed") - .bind(action_payload) - .execute(backend.pool()) - .await - .expect("insert action row"); - - let result = GarbageCollectorBackend::collect_done_instances( - &backend, - Utc::now() - Duration::hours(24), - 100, - ) - .await - .expect("collect done instances"); - - assert_eq!(result.deleted_instances, 1); - assert_eq!(result.deleted_actions, 1); - - let remaining_instances: i64 = - sqlx::query_scalar("SELECT COUNT(*) FROM runner_instances WHERE instance_id = $1") - .bind(instance_id) - .fetch_one(backend.pool()) - .await - .expect("count instances"); - assert_eq!(remaining_instances, 0); - - let remaining_actions: i64 = - sqlx::query_scalar("SELECT COUNT(*) FROM runner_actions_done WHERE execution_id = $1") - .bind(execution_id) - .fetch_one(backend.pool()) - .await - .expect("count actions"); - assert_eq!(remaining_actions, 0); - } - - #[serial(postgres)] - #[tokio::test] - async fn 
garbage_collector_keeps_recent_done_instances() { - let backend = setup_backend().await; - let instance_id = Uuid::new_v4(); - let entry_node = Uuid::new_v4(); - let workflow_version_id = Uuid::new_v4(); - let state_payload = PostgresBackend::serialize(&GraphUpdate { - instance_id, - nodes: HashMap::new(), - edges: HashSet::new(), - }) - .expect("serialize state"); - let result_payload = - PostgresBackend::serialize(&serde_json::json!({"ok": true})).expect("serialize done"); - - sqlx::query( - "INSERT INTO runner_instances (instance_id, entry_node, workflow_version_id, created_at, state, result) VALUES ($1, $2, $3, $4, $5, $6)", - ) - .bind(instance_id) - .bind(entry_node) - .bind(workflow_version_id) - .bind(Utc::now() - Duration::hours(1)) - .bind(state_payload) - .bind(result_payload) - .execute(backend.pool()) - .await - .expect("insert recent done instance"); - - let result = GarbageCollectorBackend::collect_done_instances( - &backend, - Utc::now() - Duration::hours(24), - 100, - ) - .await - .expect("collect done instances"); - - assert_eq!(result.deleted_instances, 0); - assert_eq!(result.deleted_actions, 0); - - let remaining_instances: i64 = - sqlx::query_scalar("SELECT COUNT(*) FROM runner_instances WHERE instance_id = $1") - .bind(instance_id) - .fetch_one(backend.pool()) - .await - .expect("count instances"); - assert_eq!(remaining_instances, 1); - } - - #[serial(postgres)] - #[tokio::test] - async fn worker_status_backend_upsert_worker_status_happy_path() { - let backend = setup_backend().await; - let pool_id = Uuid::new_v4(); - - WorkerStatusBackend::upsert_worker_status( - &backend, - &WorkerStatusUpdate { - pool_id, - throughput_per_min: 180.0, - total_completed: 20, - last_action_at: Some(Utc::now()), - median_dequeue_ms: Some(5), - median_handling_ms: Some(12), - dispatch_queue_size: 3, - total_in_flight: 2, - active_workers: 4, - actions_per_sec: 3.0, - median_instance_duration_secs: Some(0.2), - active_instance_count: 1, - 
total_instances_completed: 8, - instances_per_sec: 0.5, - instances_per_min: 30.0, - time_series: None, - }, - ) - .await - .expect("upsert worker status"); - - let row = sqlx::query( - "SELECT total_completed, active_workers, actions_per_sec FROM worker_status WHERE pool_id = $1", - ) - .bind(pool_id) - .fetch_one(backend.pool()) - .await - .expect("worker status row"); - assert_eq!(row.get::("total_completed"), 20); - assert_eq!(row.get::("active_workers"), 4); - assert_eq!(row.get::("actions_per_sec"), 3.0); - } -} diff --git a/crates/waymark/src/backends/postgres/mod.rs b/crates/waymark/src/backends/postgres/mod.rs deleted file mode 100644 index 4bec275d..00000000 --- a/crates/waymark/src/backends/postgres/mod.rs +++ /dev/null @@ -1,116 +0,0 @@ -//! Postgres backend for persisting runner state and action results. - -mod core; -mod registry; -mod scheduler; -#[cfg(test)] -mod test_helpers; -mod webapp; - -use std::collections::HashMap; -use std::sync::{Arc, Mutex}; - -use sqlx::PgPool; - -use crate::db; -use crate::observability::obs; - -use super::base::{BackendError, BackendResult}; - -/// Persist runner state and action results in Postgres. -#[derive(Clone)] -pub struct PostgresBackend { - pool: PgPool, - query_counts: Arc>>, - batch_size_counts: Arc>>>, -} - -impl PostgresBackend { - pub fn new(pool: PgPool) -> Self { - Self { - pool, - query_counts: Arc::new(Mutex::new(HashMap::new())), - batch_size_counts: Arc::new(Mutex::new(HashMap::new())), - } - } - - #[obs] - pub async fn connect(dsn: &str) -> BackendResult { - let pool = PgPool::connect(dsn).await?; - db::run_migrations(&pool).await?; - Ok(Self::new(pool)) - } - - pub fn pool(&self) -> &PgPool { - &self.pool - } - - /// Delete all queued instances from the backing table. 
- #[obs] - pub async fn clear_queue(&self) -> BackendResult<()> { - Self::count_query(&self.query_counts, "delete:queued_instances_all"); - sqlx::query("DELETE FROM queued_instances") - .execute(&self.pool) - .await?; - Ok(()) - } - - /// Delete all persisted runner data for a clean benchmark run. - #[obs] - pub async fn clear_all(&self) -> BackendResult<()> { - Self::count_query(&self.query_counts, "truncate:runner_tables"); - sqlx::query( - r#" - TRUNCATE runner_actions_done, - runner_instances, - queued_instances - RESTART IDENTITY - "#, - ) - .execute(&self.pool) - .await?; - Ok(()) - } - - pub fn query_counts(&self) -> HashMap { - self.query_counts - .lock() - .expect("query counts poisoned") - .clone() - } - - pub fn batch_size_counts(&self) -> HashMap> { - self.batch_size_counts - .lock() - .expect("batch size counts poisoned") - .clone() - } - - pub(super) fn count_query(counts: &Arc>>, label: &str) { - let mut guard = counts.lock().expect("query counts poisoned"); - *guard.entry(label.to_string()).or_insert(0) += 1; - } - - pub(super) fn count_batch_size( - counts: &Arc>>>, - label: &str, - size: usize, - ) { - if size == 0 { - return; - } - let mut guard = counts.lock().expect("batch size counts poisoned"); - let entry = guard.entry(label.to_string()).or_default(); - *entry.entry(size).or_insert(0) += 1; - } - - pub(super) fn serialize(value: &T) -> Result, BackendError> { - rmp_serde::to_vec_named(value).map_err(|e| BackendError::Message(e.to_string())) - } - - pub(super) fn deserialize( - payload: &[u8], - ) -> Result { - rmp_serde::from_slice(payload).map_err(|e| BackendError::Message(e.to_string())) - } -} diff --git a/crates/waymark/src/backends/postgres/registry.rs b/crates/waymark/src/backends/postgres/registry.rs deleted file mode 100644 index c8fb5a68..00000000 --- a/crates/waymark/src/backends/postgres/registry.rs +++ /dev/null @@ -1,146 +0,0 @@ -use sqlx::Row; -use tonic::async_trait; -use uuid::Uuid; - -use super::PostgresBackend; -use 
crate::backends::base::{ - BackendError, BackendResult, WorkflowRegistration, WorkflowRegistryBackend, WorkflowVersion, -}; - -#[async_trait] -impl WorkflowRegistryBackend for PostgresBackend { - async fn upsert_workflow_version( - &self, - registration: &WorkflowRegistration, - ) -> BackendResult { - let inserted = sqlx::query( - r#" - INSERT INTO workflow_versions - (workflow_name, workflow_version, ir_hash, program_proto, concurrent) - VALUES ($1, $2, $3, $4, $5) - ON CONFLICT (workflow_name, workflow_version) - DO NOTHING - RETURNING id - "#, - ) - .bind(®istration.workflow_name) - .bind(®istration.workflow_version) - .bind(®istration.ir_hash) - .bind(®istration.program_proto) - .bind(registration.concurrent) - .fetch_optional(&self.pool) - .await?; - - if let Some(row) = inserted { - let id: Uuid = row.get("id"); - return Ok(id); - } - - let row = sqlx::query( - r#" - SELECT id, ir_hash - FROM workflow_versions - WHERE workflow_name = $1 AND workflow_version = $2 - "#, - ) - .bind(®istration.workflow_name) - .bind(®istration.workflow_version) - .fetch_one(&self.pool) - .await?; - - let id: Uuid = row.get("id"); - let existing_hash: String = row.get("ir_hash"); - if existing_hash != registration.ir_hash { - return Err(BackendError::Message(format!( - "workflow version already exists with different IR hash: {}@{}", - registration.workflow_name, registration.workflow_version - ))); - } - - Ok(id) - } - - async fn get_workflow_versions(&self, ids: &[Uuid]) -> BackendResult> { - if ids.is_empty() { - return Ok(Vec::new()); - } - let rows = sqlx::query( - r#" - SELECT id, workflow_name, workflow_version, ir_hash, program_proto, concurrent - FROM workflow_versions - WHERE id = ANY($1) - "#, - ) - .bind(ids) - .fetch_all(&self.pool) - .await?; - - let mut versions = Vec::with_capacity(rows.len()); - for row in rows { - versions.push(WorkflowVersion { - id: row.get("id"), - workflow_name: row.get("workflow_name"), - workflow_version: row.get("workflow_version"), - 
ir_hash: row.get("ir_hash"), - program_proto: row.get("program_proto"), - concurrent: row.get("concurrent"), - }); - } - Ok(versions) - } -} - -#[cfg(test)] -mod tests { - use serial_test::serial; - - use super::super::test_helpers::setup_backend; - use crate::backends::{WorkflowRegistration, WorkflowRegistryBackend}; - - fn sample_registration(version: &str) -> WorkflowRegistration { - WorkflowRegistration { - workflow_name: "tests.workflow".to_string(), - workflow_version: version.to_string(), - ir_hash: format!("hash-{version}"), - program_proto: vec![1, 2, 3, 4], - concurrent: true, - } - } - - #[serial(postgres)] - #[tokio::test] - async fn workflow_registry_upsert_workflow_version_happy_path() { - let backend = setup_backend().await; - let registration = sample_registration("v1"); - - let id = WorkflowRegistryBackend::upsert_workflow_version(&backend, ®istration) - .await - .expect("insert workflow version"); - let repeat_id = WorkflowRegistryBackend::upsert_workflow_version(&backend, ®istration) - .await - .expect("idempotent workflow upsert"); - - assert_eq!(id, repeat_id); - } - - #[serial(postgres)] - #[tokio::test] - async fn workflow_registry_get_workflow_versions_happy_path() { - let backend = setup_backend().await; - let registration = sample_registration("v2"); - let id = WorkflowRegistryBackend::upsert_workflow_version(&backend, ®istration) - .await - .expect("insert workflow version"); - - let versions = WorkflowRegistryBackend::get_workflow_versions(&backend, &[id]) - .await - .expect("get workflow versions"); - assert_eq!(versions.len(), 1); - assert_eq!(versions[0].id, id); - assert_eq!(versions[0].workflow_name, registration.workflow_name); - assert_eq!(versions[0].workflow_version, registration.workflow_version); - assert_eq!(versions[0].ir_hash, registration.ir_hash); - assert_eq!(versions[0].program_proto, registration.program_proto); - assert_eq!(versions[0].concurrent, registration.concurrent); - } -} diff --git 
a/crates/waymark/src/backends/postgres/scheduler.rs b/crates/waymark/src/backends/postgres/scheduler.rs deleted file mode 100644 index 5eb00735..00000000 --- a/crates/waymark/src/backends/postgres/scheduler.rs +++ /dev/null @@ -1,604 +0,0 @@ -use chrono::{DateTime, Utc}; -use sqlx::Row; -use tonic::async_trait; -use uuid::Uuid; - -use super::PostgresBackend; -use crate::backends::base::{BackendError, BackendResult, SchedulerBackend}; -use crate::scheduler::compute_next_run; -use crate::scheduler::{CreateScheduleParams, ScheduleId, ScheduleType, WorkflowSchedule}; - -#[async_trait] -impl SchedulerBackend for PostgresBackend { - async fn upsert_schedule(&self, params: &CreateScheduleParams) -> BackendResult { - let next_run_at = compute_next_run( - params.schedule_type, - params.cron_expression.as_deref(), - params.interval_seconds, - params.jitter_seconds, - None, - ) - .map_err(BackendError::Message)?; - - let row = sqlx::query( - r#" - INSERT INTO workflow_schedules - (workflow_name, schedule_name, schedule_type, cron_expression, interval_seconds, - jitter_seconds, input_payload, next_run_at, priority, allow_duplicate) - VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10) - ON CONFLICT (workflow_name, schedule_name) - DO UPDATE SET - schedule_type = EXCLUDED.schedule_type, - cron_expression = EXCLUDED.cron_expression, - interval_seconds = EXCLUDED.interval_seconds, - jitter_seconds = EXCLUDED.jitter_seconds, - input_payload = EXCLUDED.input_payload, - next_run_at = COALESCE(workflow_schedules.next_run_at, EXCLUDED.next_run_at), - priority = EXCLUDED.priority, - allow_duplicate = EXCLUDED.allow_duplicate, - status = 'active', - updated_at = NOW() - RETURNING id - "#, - ) - .bind(¶ms.workflow_name) - .bind(¶ms.schedule_name) - .bind(params.schedule_type.as_str()) - .bind(¶ms.cron_expression) - .bind(params.interval_seconds) - .bind(params.jitter_seconds) - .bind(¶ms.input_payload) - .bind(next_run_at) - .bind(params.priority) - .bind(params.allow_duplicate) - 
.fetch_one(&self.pool) - .await?; - - let id: Uuid = row.get("id"); - Ok(ScheduleId(id)) - } - - async fn get_schedule(&self, id: ScheduleId) -> BackendResult { - let schedule = sqlx::query_as::<_, ScheduleRow>( - r#" - SELECT id, workflow_name, schedule_name, schedule_type, cron_expression, interval_seconds, - jitter_seconds, input_payload, status, next_run_at, last_run_at, last_instance_id, - created_at, updated_at, priority, allow_duplicate - FROM workflow_schedules - WHERE id = $1 - "#, - ) - .bind(id.0) - .fetch_optional(&self.pool) - .await? - .ok_or_else(|| BackendError::Message(format!("schedule not found: {}", id)))?; - - Ok(schedule.into()) - } - - async fn get_schedule_by_name( - &self, - workflow_name: &str, - schedule_name: &str, - ) -> BackendResult> { - let schedule = sqlx::query_as::<_, ScheduleRow>( - r#" - SELECT id, workflow_name, schedule_name, schedule_type, cron_expression, interval_seconds, - jitter_seconds, input_payload, status, next_run_at, last_run_at, last_instance_id, - created_at, updated_at, priority, allow_duplicate - FROM workflow_schedules - WHERE workflow_name = $1 AND schedule_name = $2 AND status != 'deleted' - "#, - ) - .bind(workflow_name) - .bind(schedule_name) - .fetch_optional(&self.pool) - .await?; - - Ok(schedule.map(Into::into)) - } - - async fn list_schedules( - &self, - limit: i64, - offset: i64, - ) -> BackendResult> { - let rows = sqlx::query_as::<_, ScheduleRow>( - r#" - SELECT id, workflow_name, schedule_name, schedule_type, cron_expression, interval_seconds, - jitter_seconds, input_payload, status, next_run_at, last_run_at, last_instance_id, - created_at, updated_at, priority, allow_duplicate - FROM workflow_schedules - WHERE status != 'deleted' - ORDER BY workflow_name, schedule_name - LIMIT $1 OFFSET $2 - "#, - ) - .bind(limit) - .bind(offset) - .fetch_all(&self.pool) - .await?; - - Ok(rows.into_iter().map(Into::into).collect()) - } - - async fn count_schedules(&self) -> BackendResult { - let count = 
sqlx::query_scalar::<_, i64>( - "SELECT COUNT(*) FROM workflow_schedules WHERE status != 'deleted'", - ) - .fetch_one(&self.pool) - .await?; - - Ok(count) - } - - async fn update_schedule_status(&self, id: ScheduleId, status: &str) -> BackendResult { - let result = sqlx::query( - r#" - UPDATE workflow_schedules - SET status = $2, updated_at = NOW() - WHERE id = $1 - "#, - ) - .bind(id.0) - .bind(status) - .execute(&self.pool) - .await?; - - Ok(result.rows_affected() > 0) - } - - async fn delete_schedule(&self, id: ScheduleId) -> BackendResult { - SchedulerBackend::update_schedule_status(self, id, "deleted").await - } - - async fn find_due_schedules(&self, limit: i32) -> BackendResult> { - let rows = sqlx::query_as::<_, ScheduleRow>( - r#" - SELECT id, workflow_name, schedule_name, schedule_type, cron_expression, interval_seconds, - jitter_seconds, input_payload, status, next_run_at, last_run_at, last_instance_id, - created_at, updated_at, priority, allow_duplicate - FROM workflow_schedules - WHERE status = 'active' - AND next_run_at IS NOT NULL - AND next_run_at <= NOW() - ORDER BY next_run_at - FOR UPDATE SKIP LOCKED - LIMIT $1 - "#, - ) - .bind(limit) - .fetch_all(&self.pool) - .await?; - - Ok(rows.into_iter().map(Into::into).collect()) - } - - async fn has_running_instance(&self, schedule_id: ScheduleId) -> BackendResult { - let has_running = sqlx::query_scalar::<_, bool>( - r#" - SELECT EXISTS( - SELECT 1 - FROM runner_instances ri - JOIN queued_instances qi ON qi.instance_id = ri.instance_id - WHERE ri.schedule_id = $1 - ) - "#, - ) - .bind(schedule_id.0) - .fetch_one(&self.pool) - .await?; - - Ok(has_running) - } - - async fn mark_schedule_executed( - &self, - schedule_id: ScheduleId, - instance_id: Uuid, - ) -> BackendResult<()> { - let schedule = SchedulerBackend::get_schedule(self, schedule_id).await?; - let schedule_type = ScheduleType::parse(&schedule.schedule_type) - .ok_or_else(|| BackendError::Message("invalid schedule type".to_string()))?; - let 
next_run_at = compute_next_run( - schedule_type, - schedule.cron_expression.as_deref(), - schedule.interval_seconds, - schedule.jitter_seconds, - Some(Utc::now()), - ) - .map_err(BackendError::Message)?; - - sqlx::query( - r#" - UPDATE workflow_schedules - SET last_run_at = NOW(), - last_instance_id = $2, - next_run_at = $3, - updated_at = NOW() - WHERE id = $1 - "#, - ) - .bind(schedule_id.0) - .bind(instance_id) - .bind(next_run_at) - .execute(&self.pool) - .await?; - - Ok(()) - } - - async fn skip_schedule_run(&self, schedule_id: ScheduleId) -> BackendResult<()> { - let schedule = SchedulerBackend::get_schedule(self, schedule_id).await?; - let schedule_type = ScheduleType::parse(&schedule.schedule_type) - .ok_or_else(|| BackendError::Message("invalid schedule type".to_string()))?; - let next_run_at = compute_next_run( - schedule_type, - schedule.cron_expression.as_deref(), - schedule.interval_seconds, - schedule.jitter_seconds, - Some(Utc::now()), - ) - .map_err(BackendError::Message)?; - - sqlx::query( - r#" - UPDATE workflow_schedules - SET next_run_at = $2, updated_at = NOW() - WHERE id = $1 - "#, - ) - .bind(schedule_id.0) - .bind(next_run_at) - .execute(&self.pool) - .await?; - - Ok(()) - } -} - -#[derive(sqlx::FromRow)] -struct ScheduleRow { - id: Uuid, - workflow_name: String, - schedule_name: String, - schedule_type: String, - cron_expression: Option, - interval_seconds: Option, - jitter_seconds: i64, - input_payload: Option>, - status: String, - next_run_at: Option>, - last_run_at: Option>, - last_instance_id: Option, - created_at: DateTime, - updated_at: DateTime, - priority: i32, - allow_duplicate: bool, -} - -impl From for WorkflowSchedule { - fn from(row: ScheduleRow) -> Self { - Self { - id: row.id, - workflow_name: row.workflow_name, - schedule_name: row.schedule_name, - schedule_type: row.schedule_type, - cron_expression: row.cron_expression, - interval_seconds: row.interval_seconds, - jitter_seconds: row.jitter_seconds, - input_payload: 
row.input_payload, - status: row.status, - next_run_at: row.next_run_at, - last_run_at: row.last_run_at, - last_instance_id: row.last_instance_id, - created_at: row.created_at, - updated_at: row.updated_at, - priority: row.priority, - allow_duplicate: row.allow_duplicate, - } - } -} - -#[cfg(test)] -mod tests { - use chrono::Utc; - use serial_test::serial; - use sqlx::Row; - use uuid::Uuid; - - use super::super::test_helpers::setup_backend; - use super::*; - use crate::backends::SchedulerBackend; - use crate::scheduler::CreateScheduleParams; - - fn sample_params(schedule_name: &str) -> CreateScheduleParams { - CreateScheduleParams { - workflow_name: "tests.workflow".to_string(), - schedule_name: schedule_name.to_string(), - schedule_type: ScheduleType::Interval, - cron_expression: None, - interval_seconds: Some(60), - jitter_seconds: 0, - input_payload: Some(vec![1, 2, 3]), - priority: 3, - allow_duplicate: true, - } - } - - async fn insert_schedule(backend: &PostgresBackend, schedule_name: &str) -> ScheduleId { - SchedulerBackend::upsert_schedule(backend, &sample_params(schedule_name)) - .await - .expect("upsert schedule") - } - - #[serial(postgres)] - #[tokio::test] - async fn scheduler_upsert_schedule_happy_path() { - let backend = setup_backend().await; - - let id = insert_schedule(&backend, "upsert").await; - let row = sqlx::query("SELECT id FROM workflow_schedules WHERE id = $1") - .bind(id.0) - .fetch_one(backend.pool()) - .await - .expect("select schedule"); - - assert_eq!(row.get::("id"), id.0); - } - - #[serial(postgres)] - #[tokio::test] - async fn scheduler_upsert_schedule_preserves_existing_next_run_at() { - let backend = setup_backend().await; - - let id = insert_schedule(&backend, "preserve-next-run").await; - sqlx::query( - "UPDATE workflow_schedules SET next_run_at = NOW() + INTERVAL '2 days' WHERE id = $1", - ) - .bind(id.0) - .execute(backend.pool()) - .await - .expect("force next_run_at"); - - let before: Option> = - sqlx::query_scalar("SELECT 
next_run_at FROM workflow_schedules WHERE id = $1") - .bind(id.0) - .fetch_one(backend.pool()) - .await - .expect("select next_run_at before"); - - let upserted_id = - SchedulerBackend::upsert_schedule(&backend, &sample_params("preserve-next-run")) - .await - .expect("upsert existing schedule"); - assert_eq!(upserted_id.0, id.0); - - let after: Option> = - sqlx::query_scalar("SELECT next_run_at FROM workflow_schedules WHERE id = $1") - .bind(id.0) - .fetch_one(backend.pool()) - .await - .expect("select next_run_at after"); - - assert_eq!(after, before); - } - - #[serial(postgres)] - #[tokio::test] - async fn scheduler_get_schedule_happy_path() { - let backend = setup_backend().await; - - let id = insert_schedule(&backend, "get").await; - let schedule = SchedulerBackend::get_schedule(&backend, id) - .await - .expect("get schedule"); - - assert_eq!(schedule.id, id.0); - assert_eq!(schedule.schedule_name, "get"); - assert_eq!(schedule.workflow_name, "tests.workflow"); - } - - #[serial(postgres)] - #[tokio::test] - async fn scheduler_get_schedule_by_name_happy_path() { - let backend = setup_backend().await; - - let id = insert_schedule(&backend, "by-name").await; - let schedule = - SchedulerBackend::get_schedule_by_name(&backend, "tests.workflow", "by-name") - .await - .expect("get schedule by name") - .expect("expected schedule"); - - assert_eq!(schedule.id, id.0); - assert_eq!(schedule.schedule_name, "by-name"); - } - - #[serial(postgres)] - #[tokio::test] - async fn scheduler_list_schedules_happy_path() { - let backend = setup_backend().await; - - insert_schedule(&backend, "a-list").await; - insert_schedule(&backend, "b-list").await; - - let schedules = SchedulerBackend::list_schedules(&backend, 10, 0) - .await - .expect("list schedules"); - - assert_eq!(schedules.len(), 2); - assert_eq!(schedules[0].schedule_name, "a-list"); - assert_eq!(schedules[1].schedule_name, "b-list"); - } - - #[serial(postgres)] - #[tokio::test] - async fn 
scheduler_count_schedules_happy_path() { - let backend = setup_backend().await; - - insert_schedule(&backend, "count-a").await; - insert_schedule(&backend, "count-b").await; - - let count = SchedulerBackend::count_schedules(&backend) - .await - .expect("count schedules"); - assert_eq!(count, 2); - } - - #[serial(postgres)] - #[tokio::test] - async fn scheduler_update_schedule_status_happy_path() { - let backend = setup_backend().await; - - let id = insert_schedule(&backend, "status").await; - let updated = SchedulerBackend::update_schedule_status(&backend, id, "paused") - .await - .expect("update schedule status"); - assert!(updated); - - let status: String = - sqlx::query_scalar("SELECT status FROM workflow_schedules WHERE id = $1") - .bind(id.0) - .fetch_one(backend.pool()) - .await - .expect("select status"); - assert_eq!(status, "paused"); - } - - #[serial(postgres)] - #[tokio::test] - async fn scheduler_delete_schedule_happy_path() { - let backend = setup_backend().await; - - let id = insert_schedule(&backend, "delete").await; - let deleted = SchedulerBackend::delete_schedule(&backend, id) - .await - .expect("delete schedule"); - assert!(deleted); - - let status: String = - sqlx::query_scalar("SELECT status FROM workflow_schedules WHERE id = $1") - .bind(id.0) - .fetch_one(backend.pool()) - .await - .expect("select status"); - assert_eq!(status, "deleted"); - } - - #[serial(postgres)] - #[tokio::test] - async fn scheduler_find_due_schedules_happy_path() { - let backend = setup_backend().await; - - let id = insert_schedule(&backend, "due").await; - sqlx::query( - "UPDATE workflow_schedules SET next_run_at = NOW() - INTERVAL '1 minute' WHERE id = $1", - ) - .bind(id.0) - .execute(backend.pool()) - .await - .expect("force schedule due"); - - let due = SchedulerBackend::find_due_schedules(&backend, 10) - .await - .expect("find due schedules"); - assert_eq!(due.len(), 1); - assert_eq!(due[0].id, id.0); - } - - #[serial(postgres)] - #[tokio::test] - async fn 
scheduler_has_running_instance_happy_path() { - let backend = setup_backend().await; - - let has_running = SchedulerBackend::has_running_instance(&backend, ScheduleId::new()) - .await - .expect("has running instance"); - assert!(!has_running); - } - - #[serial(postgres)] - #[tokio::test] - async fn scheduler_has_running_instance_true_with_queued_instance() { - let backend = setup_backend().await; - - let schedule_id = insert_schedule(&backend, "running-instance").await; - let instance_id = Uuid::new_v4(); - sqlx::query( - "INSERT INTO runner_instances (instance_id, entry_node, schedule_id) VALUES ($1, $2, $3)", - ) - .bind(instance_id) - .bind(Uuid::new_v4()) - .bind(schedule_id.0) - .execute(backend.pool()) - .await - .expect("insert runner instance"); - sqlx::query("INSERT INTO queued_instances (instance_id, payload) VALUES ($1, $2)") - .bind(instance_id) - .bind(vec![0_u8]) - .execute(backend.pool()) - .await - .expect("insert queued instance"); - - let has_running = SchedulerBackend::has_running_instance(&backend, schedule_id) - .await - .expect("has running instance"); - assert!(has_running); - } - - #[serial(postgres)] - #[tokio::test] - async fn scheduler_mark_schedule_executed_happy_path() { - let backend = setup_backend().await; - - let id = insert_schedule(&backend, "mark-executed").await; - let instance_id = Uuid::new_v4(); - SchedulerBackend::mark_schedule_executed(&backend, id, instance_id) - .await - .expect("mark schedule executed"); - - let row = sqlx::query( - "SELECT last_instance_id, last_run_at, next_run_at FROM workflow_schedules WHERE id = $1", - ) - .bind(id.0) - .fetch_one(backend.pool()) - .await - .expect("select schedule"); - - let last_instance_id: Option = row.get("last_instance_id"); - let last_run_at: Option> = row.get("last_run_at"); - let next_run_at: Option> = row.get("next_run_at"); - - assert_eq!(last_instance_id, Some(instance_id)); - assert!(last_run_at.is_some()); - assert!(next_run_at.is_some()); - } - - #[serial(postgres)] - 
#[tokio::test] - async fn scheduler_skip_schedule_run_happy_path() { - let backend = setup_backend().await; - - let id = insert_schedule(&backend, "skip").await; - sqlx::query( - "UPDATE workflow_schedules SET next_run_at = NOW() - INTERVAL '1 minute' WHERE id = $1", - ) - .bind(id.0) - .execute(backend.pool()) - .await - .expect("force schedule due"); - - SchedulerBackend::skip_schedule_run(&backend, id) - .await - .expect("skip schedule run"); - - let next_run_at: Option> = - sqlx::query_scalar("SELECT next_run_at FROM workflow_schedules WHERE id = $1") - .bind(id.0) - .fetch_one(backend.pool()) - .await - .expect("select next_run_at"); - assert!(next_run_at.expect("next_run_at").gt(&Utc::now())); - } -} diff --git a/crates/waymark/src/backends/postgres/test_helpers.rs b/crates/waymark/src/backends/postgres/test_helpers.rs deleted file mode 100644 index dd03cd7f..00000000 --- a/crates/waymark/src/backends/postgres/test_helpers.rs +++ /dev/null @@ -1,27 +0,0 @@ -use sqlx::PgPool; - -use super::PostgresBackend; -use crate::test_support::postgres_setup; - -pub(super) async fn setup_backend() -> PostgresBackend { - let pool = postgres_setup().await; - reset_database(&pool).await; - PostgresBackend::new(pool) -} - -pub(super) async fn reset_database(pool: &PgPool) { - sqlx::query( - r#" - TRUNCATE runner_actions_done, - queued_instances, - runner_instances, - workflow_versions, - workflow_schedules, - worker_status - RESTART IDENTITY CASCADE - "#, - ) - .execute(pool) - .await - .expect("truncate postgres tables"); -} diff --git a/crates/waymark/src/backends/postgres/webapp.rs b/crates/waymark/src/backends/postgres/webapp.rs deleted file mode 100644 index 8f6b932f..00000000 --- a/crates/waymark/src/backends/postgres/webapp.rs +++ /dev/null @@ -1,2324 +0,0 @@ -use std::collections::HashMap; - -use chrono::{DateTime, Utc}; -use prost::Message; -use serde_json::Value; -use sqlx::{Postgres, QueryBuilder, Row}; -use tonic::async_trait; -use uuid::Uuid; - -use 
super::PostgresBackend; -use crate::backends::base::{BackendError, BackendResult, GraphUpdate, WebappBackend}; -use crate::messages::ast as ir; -use crate::waymark_core::runner::state::{ActionCallSpec, ExecutionNode, NodeStatus}; -use crate::waymark_core::runner::{RunnerState, ValueExpr, format_value, replay_action_kwargs}; -use crate::webapp::{ - ExecutionEdgeView, ExecutionGraphView, ExecutionNodeView, InstanceDetail, InstanceStatus, - InstanceSummary, ScheduleDetail, ScheduleInvocationSummary, ScheduleSummary, TimelineEntry, - WorkerActionRow, WorkerAggregateStats, WorkerStatus, -}; -use waymark_dag::{DAGNode, EdgeType, convert_to_dag}; - -const INSTANCE_STATUS_FALLBACK_SQL: &str = r#" -CASE - WHEN ri.error IS NOT NULL THEN 'failed' - WHEN ri.result IS NOT NULL THEN 'completed' - WHEN ri.state IS NOT NULL THEN 'running' - ELSE 'queued' -END -"#; - -#[derive(Debug, Clone, PartialEq, Eq)] -enum InstanceSearchToken { - Term(String), - And, - Or, - LParen, - RParen, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -enum InstanceSearchExpr { - Term(String), - And(Box, Box), - Or(Box, Box), -} - -struct InstanceSearchParser { - tokens: Vec, - position: usize, -} - -impl InstanceSearchParser { - fn new(tokens: Vec) -> Self { - Self { - tokens, - position: 0, - } - } - - fn parse(mut self) -> Option { - let expr = self.parse_or()?; - if self.position == self.tokens.len() { - Some(expr) - } else { - None - } - } - - fn parse_or(&mut self) -> Option { - let mut expr = self.parse_and()?; - while self.consume_or() { - let rhs = self.parse_and()?; - expr = InstanceSearchExpr::Or(Box::new(expr), Box::new(rhs)); - } - Some(expr) - } - - fn parse_and(&mut self) -> Option { - let mut expr = self.parse_primary()?; - loop { - if self.consume_and() || self.peek_is_primary_start() { - let rhs = self.parse_primary()?; - expr = InstanceSearchExpr::And(Box::new(expr), Box::new(rhs)); - continue; - } - break; - } - Some(expr) - } - - fn parse_primary(&mut self) -> Option { - match 
self.peek()? { - InstanceSearchToken::Term(term) => { - let term = term.clone(); - self.position += 1; - Some(InstanceSearchExpr::Term(term)) - } - InstanceSearchToken::LParen => { - self.position += 1; - let expr = self.parse_or()?; - if !self.consume_rparen() { - return None; - } - Some(expr) - } - InstanceSearchToken::And | InstanceSearchToken::Or | InstanceSearchToken::RParen => { - None - } - } - } - - fn consume_and(&mut self) -> bool { - if matches!(self.peek(), Some(InstanceSearchToken::And)) { - self.position += 1; - true - } else { - false - } - } - - fn consume_or(&mut self) -> bool { - if matches!(self.peek(), Some(InstanceSearchToken::Or)) { - self.position += 1; - true - } else { - false - } - } - - fn consume_rparen(&mut self) -> bool { - if matches!(self.peek(), Some(InstanceSearchToken::RParen)) { - self.position += 1; - true - } else { - false - } - } - - fn peek_is_primary_start(&self) -> bool { - matches!( - self.peek(), - Some(InstanceSearchToken::Term(_)) | Some(InstanceSearchToken::LParen) - ) - } - - fn peek(&self) -> Option<&InstanceSearchToken> { - self.tokens.get(self.position) - } -} - -fn tokenize_instance_search(search: &str) -> Vec { - let mut chars = search.chars().peekable(); - let mut tokens = Vec::new(); - - while let Some(ch) = chars.peek().copied() { - if ch.is_whitespace() { - chars.next(); - continue; - } - if ch == '(' { - chars.next(); - tokens.push(InstanceSearchToken::LParen); - continue; - } - if ch == ')' { - chars.next(); - tokens.push(InstanceSearchToken::RParen); - continue; - } - if ch == '"' { - chars.next(); - let mut quoted = String::new(); - for next in chars.by_ref() { - if next == '"' { - break; - } - quoted.push(next); - } - if !quoted.is_empty() { - tokens.push(InstanceSearchToken::Term(quoted)); - } - continue; - } - - let mut term = String::new(); - while let Some(next) = chars.peek().copied() { - if next.is_whitespace() || next == '(' || next == ')' { - break; - } - term.push(next); - chars.next(); - } - 
if term.is_empty() { - continue; - } - - match term.to_ascii_uppercase().as_str() { - "AND" => tokens.push(InstanceSearchToken::And), - "OR" => tokens.push(InstanceSearchToken::Or), - _ => tokens.push(InstanceSearchToken::Term(term)), - } - } - - tokens -} - -fn parse_instance_search_expr(search: &str) -> Option { - let trimmed = search.trim(); - if trimmed.is_empty() { - return None; - } - - let tokens = tokenize_instance_search(trimmed); - if tokens.is_empty() { - return None; - } - - InstanceSearchParser::new(tokens) - .parse() - .or_else(|| Some(InstanceSearchExpr::Term(trimmed.to_string()))) -} - -fn push_instance_search_expr_sql( - builder: &mut QueryBuilder<'_, Postgres>, - expr: &InstanceSearchExpr, -) { - match expr { - InstanceSearchExpr::Term(term) => { - let pattern = format!("%{term}%"); - builder.push("("); - builder.push("COALESCE(ri.workflow_name, wv.workflow_name, '') ILIKE "); - builder.push_bind(pattern.clone()); - builder.push(" OR COALESCE(ri.current_status, "); - builder.push(INSTANCE_STATUS_FALLBACK_SQL); - builder.push(", '') ILIKE "); - builder.push_bind(pattern); - builder.push(")"); - } - InstanceSearchExpr::And(left, right) => { - builder.push("("); - push_instance_search_expr_sql(builder, left); - builder.push(" AND "); - push_instance_search_expr_sql(builder, right); - builder.push(")"); - } - InstanceSearchExpr::Or(left, right) => { - builder.push("("); - push_instance_search_expr_sql(builder, left); - builder.push(" OR "); - push_instance_search_expr_sql(builder, right); - builder.push(")"); - } - } -} - -fn parse_instance_status(status: &str) -> Option { - match status { - "queued" => Some(InstanceStatus::Queued), - "running" => Some(InstanceStatus::Running), - "completed" => Some(InstanceStatus::Completed), - "failed" => Some(InstanceStatus::Failed), - _ => None, - } -} - -#[async_trait] -impl WebappBackend for PostgresBackend { - async fn count_instances(&self, search: Option<&str>) -> BackendResult { - let mut builder: 
QueryBuilder = QueryBuilder::new( - r#" - SELECT COUNT(*)::BIGINT - FROM runner_instances ri - LEFT JOIN workflow_versions wv ON wv.id = ri.workflow_version_id - "#, - ); - - if let Some(search_expr) = search.and_then(parse_instance_search_expr) { - builder.push(" WHERE "); - push_instance_search_expr_sql(&mut builder, &search_expr); - } - - let count: i64 = builder.build_query_scalar().fetch_one(&self.pool).await?; - Ok(count) - } - - async fn list_instances( - &self, - search: Option<&str>, - limit: i64, - offset: i64, - ) -> BackendResult> { - let mut builder: QueryBuilder = QueryBuilder::new( - r#" - SELECT - ri.instance_id, - ri.entry_node, - ri.created_at, - ri.state, - ri.result, - ri.error, - COALESCE(ri.workflow_name, wv.workflow_name) AS workflow_name, - COALESCE(ri.current_status, - CASE - WHEN ri.error IS NOT NULL THEN 'failed' - WHEN ri.result IS NOT NULL THEN 'completed' - WHEN ri.state IS NOT NULL THEN 'running' - ELSE 'queued' - END - ) AS current_status - FROM runner_instances ri - LEFT JOIN workflow_versions wv ON wv.id = ri.workflow_version_id - "#, - ); - if let Some(search_expr) = search.and_then(parse_instance_search_expr) { - builder.push(" WHERE "); - push_instance_search_expr_sql(&mut builder, &search_expr); - } - builder.push(" ORDER BY ri.created_at DESC, ri.instance_id DESC LIMIT "); - builder.push_bind(limit); - builder.push(" OFFSET "); - builder.push_bind(offset); - let rows = builder.build().fetch_all(&self.pool).await?; - - let mut instances = Vec::new(); - for row in rows { - let instance_id: Uuid = row.get("instance_id"); - let entry_node: Uuid = row.get("entry_node"); - let created_at: DateTime = row.get("created_at"); - let state_bytes: Option> = row.get("state"); - let result_bytes: Option> = row.get("result"); - let error_bytes: Option> = row.get("error"); - let workflow_name: Option = row.get("workflow_name"); - let current_status: Option = row.get("current_status"); - - let status = current_status - .as_deref() - 
.and_then(parse_instance_status) - .unwrap_or_else(|| determine_status(&state_bytes, &result_bytes, &error_bytes)); - let input_preview = extract_input_preview(&state_bytes); - - instances.push(InstanceSummary { - id: instance_id, - entry_node, - created_at, - status, - workflow_name, - input_preview, - }); - } - - Ok(instances) - } - - async fn get_instance(&self, instance_id: Uuid) -> BackendResult { - let row = sqlx::query( - r#" - SELECT - ri.instance_id, - ri.entry_node, - ri.created_at, - ri.state, - ri.result, - ri.error, - COALESCE(ri.workflow_name, wv.workflow_name) AS workflow_name, - COALESCE(ri.current_status, - CASE - WHEN ri.error IS NOT NULL THEN 'failed' - WHEN ri.result IS NOT NULL THEN 'completed' - WHEN ri.state IS NOT NULL THEN 'running' - ELSE 'queued' - END - ) AS current_status - FROM runner_instances ri - LEFT JOIN workflow_versions wv ON wv.id = ri.workflow_version_id - WHERE ri.instance_id = $1 - "#, - ) - .bind(instance_id) - .fetch_optional(&self.pool) - .await? 
- .ok_or_else(|| BackendError::Message(format!("instance not found: {}", instance_id)))?; - - let instance_id: Uuid = row.get("instance_id"); - let entry_node: Uuid = row.get("entry_node"); - let created_at: DateTime = row.get("created_at"); - let state_bytes: Option> = row.get("state"); - let result_bytes: Option> = row.get("result"); - let error_bytes: Option> = row.get("error"); - let workflow_name: Option = row.get("workflow_name"); - let current_status: Option = row.get("current_status"); - - let status = current_status - .as_deref() - .and_then(parse_instance_status) - .unwrap_or_else(|| determine_status(&state_bytes, &result_bytes, &error_bytes)); - let input_payload = format_input_payload(&state_bytes); - let result_payload = format_instance_result_payload(status, &result_bytes, &error_bytes); - let error_payload = format_error(&error_bytes); - - Ok(InstanceDetail { - id: instance_id, - entry_node, - created_at, - status, - workflow_name, - input_payload, - result_payload, - error_payload, - }) - } - - async fn get_execution_graph( - &self, - instance_id: Uuid, - ) -> BackendResult> { - let row = sqlx::query( - r#" - SELECT state FROM runner_instances WHERE instance_id = $1 - "#, - ) - .bind(instance_id) - .fetch_optional(&self.pool) - .await?; - - let Some(row) = row else { - return Ok(None); - }; - - let state_bytes: Option> = row.get("state"); - let Some(state_bytes) = state_bytes else { - return Ok(None); - }; - - let graph_update: GraphUpdate = rmp_serde::from_slice(&state_bytes) - .map_err(|e| BackendError::Message(format!("failed to decode state: {}", e)))?; - - let nodes: Vec = graph_update - .nodes - .values() - .map(|node| ExecutionNodeView { - id: node.node_id.to_string(), - node_type: node.node_type.clone(), - label: node.label.clone(), - status: format_node_status(&node.status), - action_name: node.action.as_ref().map(|a| a.action_name.clone()), - module_name: node.action.as_ref().and_then(|a| a.module_name.clone()), - }) - .collect(); - - let 
edges: Vec = graph_update - .edges - .iter() - .map(|edge| ExecutionEdgeView { - source: edge.source.to_string(), - target: edge.target.to_string(), - edge_type: format!("{:?}", edge.edge_type), - }) - .collect(); - - Ok(Some(ExecutionGraphView { nodes, edges })) - } - - async fn get_workflow_graph( - &self, - instance_id: Uuid, - ) -> BackendResult> { - let row = sqlx::query( - r#" - SELECT ri.state, wv.program_proto - FROM runner_instances ri - JOIN workflow_versions wv ON wv.id = ri.workflow_version_id - WHERE ri.instance_id = $1 - "#, - ) - .bind(instance_id) - .fetch_optional(&self.pool) - .await?; - - let Some(row) = row else { - return Ok(None); - }; - - let program_proto: Vec = row.get("program_proto"); - let program = ir::Program::decode(&program_proto[..]) - .map_err(|err| BackendError::Message(format!("failed to decode workflow IR: {err}")))?; - let dag = convert_to_dag(&program).map_err(|err| { - BackendError::Message(format!("failed to convert workflow DAG: {err}")) - })?; - - let mut template_statuses: HashMap = HashMap::new(); - let state_bytes: Option> = row.get("state"); - if let Some(state_bytes) = state_bytes { - let graph_update: GraphUpdate = rmp_serde::from_slice(&state_bytes) - .map_err(|err| BackendError::Message(format!("failed to decode state: {err}")))?; - - for node in graph_update.nodes.values() { - let Some(template_id) = node.template_id.as_ref() else { - continue; - }; - template_statuses - .entry(template_id.clone()) - .and_modify(|existing| { - *existing = merge_template_status(existing, &node.status); - }) - .or_insert_with(|| node.status.clone()); - } - } - - let mut node_ids: Vec = dag.nodes.keys().cloned().collect(); - node_ids.sort(); - let nodes: Vec = node_ids - .into_iter() - .filter_map(|node_id| { - let node = dag.nodes.get(&node_id)?; - let status = template_statuses - .get(&node_id) - .map(format_node_status) - .unwrap_or_else(|| "pending".to_string()); - let (action_name, module_name) = match node { - 
DAGNode::ActionCall(action) => { - (Some(action.action_name.clone()), action.module_name.clone()) - } - _ => (None, None), - }; - - Some(ExecutionNodeView { - id: node_id, - node_type: node.node_type().to_string(), - label: node.label(), - status, - action_name, - module_name, - }) - }) - .collect(); - - let edges: Vec = dag - .edges - .iter() - .filter(|edge| edge.edge_type == EdgeType::StateMachine) - .map(|edge| ExecutionEdgeView { - source: edge.source.clone(), - target: edge.target.clone(), - edge_type: if edge.is_loop_back { - "state_machine_loop_back".to_string() - } else { - "state_machine".to_string() - }, - }) - .collect(); - - Ok(Some(ExecutionGraphView { nodes, edges })) - } - - async fn get_action_results(&self, instance_id: Uuid) -> BackendResult> { - let row = sqlx::query( - r#" - SELECT state - FROM runner_instances - WHERE instance_id = $1 - "#, - ) - .bind(instance_id) - .fetch_optional(&self.pool) - .await?; - - let Some(row) = row else { - return Ok(Vec::new()); - }; - let state_bytes: Option> = row.get("state"); - let Some(state_bytes) = state_bytes else { - return Ok(Vec::new()); - }; - let graph_update: GraphUpdate = rmp_serde::from_slice(&state_bytes) - .map_err(|e| BackendError::Message(format!("failed to decode state: {}", e)))?; - - let runner_state = RunnerState::new( - None, - Some(graph_update.nodes.clone()), - Some(graph_update.edges), - false, - ); - let action_nodes: HashMap = graph_update - .nodes - .into_iter() - .filter(|(_, node)| node.is_action_call()) - .collect(); - if action_nodes.is_empty() { - return Ok(Vec::new()); - } - let execution_ids: Vec = action_nodes.keys().copied().collect(); - - let rows = sqlx::query( - r#" - SELECT created_at, execution_id, attempt, status, started_at, completed_at, duration_ms, result - FROM runner_actions_done - WHERE execution_id = ANY($1) - ORDER BY created_at ASC, attempt ASC - "#, - ) - .bind(&execution_ids) - .fetch_all(&self.pool) - .await?; - - let mut decoded_rows = 
Vec::with_capacity(rows.len()); - for row in rows { - let created_at: DateTime = row.get("created_at"); - let execution_id: Uuid = row.get("execution_id"); - let attempt: i32 = row.get("attempt"); - let status: Option = row.get("status"); - let started_at: Option> = row.get("started_at"); - let completed_at: Option> = row.get("completed_at"); - let duration_ms: Option = row.get("duration_ms"); - let result_bytes: Option> = row.get("result"); - let result = result_bytes - .as_deref() - .map(decode_msgpack_json) - .transpose()?; - decoded_rows.push(DecodedActionResultRow { - created_at, - execution_id, - attempt, - status, - started_at, - completed_at, - duration_ms, - result, - }); - } - - // Replay needs the current known action outputs by execution id. - let mut action_results = HashMap::new(); - for row in &decoded_rows { - if let Some(result) = &row.result { - action_results.insert(row.execution_id, result.clone()); - } - } - - let mut request_preview_cache: HashMap = HashMap::new(); - let mut entries = Vec::with_capacity(decoded_rows.len()); - for row in decoded_rows { - let node = action_nodes.get(&row.execution_id); - let action_name = node - .and_then(|n| n.action.as_ref().map(|a| a.action_name.clone())) - .unwrap_or_default(); - let module_name = - node.and_then(|n| n.action.as_ref().and_then(|a| a.module_name.clone())); - - let request_preview = - if let Some(existing) = request_preview_cache.get(&row.execution_id) { - existing.clone() - } else { - let rendered = render_action_request_preview( - node.and_then(|n| n.action.as_ref()), - &runner_state, - &action_results, - row.execution_id, - ); - request_preview_cache.insert(row.execution_id, rendered.clone()); - rendered - }; - - let (response_preview, error) = match &row.result { - Some(value) => format_action_result(value), - None => ("(no result)".to_string(), None), - }; - let status = row.status.clone().unwrap_or_else(|| { - if error.is_some() { - "failed".to_string() - } else { - 
"completed".to_string() - } - }); - let (dispatched_at, completed_at, duration_ms) = if row.started_at.is_some() - || row.completed_at.is_some() - || row.duration_ms.is_some() - { - ( - Some(row.started_at.unwrap_or(row.created_at).to_rfc3339()), - Some(row.completed_at.unwrap_or(row.created_at).to_rfc3339()), - row.duration_ms, - ) - } else { - action_timing_from_state(node, row.attempt, row.created_at) - }; - - entries.push(TimelineEntry { - action_id: row.execution_id.to_string(), - action_name, - module_name, - status, - attempt_number: row.attempt, - dispatched_at, - completed_at, - duration_ms, - request_preview, - response_preview, - error, - }); - } - - Ok(entries) - } - - async fn get_distinct_workflows(&self) -> BackendResult> { - let rows = sqlx::query( - r#" - SELECT DISTINCT COALESCE(ri.workflow_name, wv.workflow_name) AS workflow_name - FROM runner_instances ri - LEFT JOIN workflow_versions wv ON wv.id = ri.workflow_version_id - WHERE COALESCE(ri.workflow_name, wv.workflow_name) IS NOT NULL - ORDER BY workflow_name - "#, - ) - .fetch_all(&self.pool) - .await?; - - let mut workflows = Vec::with_capacity(rows.len()); - for row in rows { - let workflow_name: String = row.get("workflow_name"); - workflows.push(workflow_name); - } - Ok(workflows) - } - - async fn get_distinct_statuses(&self) -> BackendResult> { - Ok(vec![ - "queued".to_string(), - "running".to_string(), - "completed".to_string(), - "failed".to_string(), - ]) - } - - async fn count_schedules(&self) -> BackendResult { - let count = sqlx::query_scalar::<_, i64>( - "SELECT COUNT(*) FROM workflow_schedules WHERE status != 'deleted'", - ) - .fetch_one(&self.pool) - .await?; - - Ok(count) - } - - async fn list_schedules(&self, limit: i64, offset: i64) -> BackendResult> { - let rows = sqlx::query( - r#" - SELECT id, workflow_name, schedule_name, schedule_type, cron_expression, interval_seconds, - status, next_run_at, last_run_at, created_at - FROM workflow_schedules - WHERE status != 'deleted' - 
ORDER BY workflow_name, schedule_name - LIMIT $1 OFFSET $2 - "#, - ) - .bind(limit) - .bind(offset) - .fetch_all(&self.pool) - .await?; - - let mut schedules = Vec::new(); - for row in rows { - schedules.push(ScheduleSummary { - id: row.get::("id").to_string(), - workflow_name: row.get("workflow_name"), - schedule_name: row.get("schedule_name"), - schedule_type: row.get("schedule_type"), - cron_expression: row.get("cron_expression"), - interval_seconds: row.get("interval_seconds"), - status: row.get("status"), - next_run_at: row - .get::>, _>("next_run_at") - .map(|dt| dt.to_rfc3339()), - last_run_at: row - .get::>, _>("last_run_at") - .map(|dt| dt.to_rfc3339()), - created_at: row.get::, _>("created_at").to_rfc3339(), - }); - } - - Ok(schedules) - } - - async fn get_schedule(&self, schedule_id: Uuid) -> BackendResult { - let row = sqlx::query( - r#" - SELECT id, workflow_name, schedule_name, schedule_type, cron_expression, interval_seconds, - jitter_seconds, input_payload, status, next_run_at, last_run_at, last_instance_id, - created_at, updated_at, priority, allow_duplicate - FROM workflow_schedules - WHERE id = $1 - "#, - ) - .bind(schedule_id) - .fetch_optional(&self.pool) - .await? 
- .ok_or_else(|| BackendError::Message(format!("schedule not found: {}", schedule_id)))?; - - let input_payload: Option = row - .get::>, _>("input_payload") - .and_then(|bytes| { - rmp_serde::from_slice::(&bytes) - .ok() - .map(|v| serde_json::to_string_pretty(&v).unwrap_or_default()) - }); - - Ok(ScheduleDetail { - id: row.get::("id").to_string(), - workflow_name: row.get("workflow_name"), - schedule_name: row.get("schedule_name"), - schedule_type: row.get("schedule_type"), - cron_expression: row.get("cron_expression"), - interval_seconds: row.get("interval_seconds"), - jitter_seconds: row.get("jitter_seconds"), - status: row.get("status"), - next_run_at: row - .get::>, _>("next_run_at") - .map(|dt| dt.to_rfc3339()), - last_run_at: row - .get::>, _>("last_run_at") - .map(|dt| dt.to_rfc3339()), - last_instance_id: row - .get::, _>("last_instance_id") - .map(|id| id.to_string()), - created_at: row.get::, _>("created_at").to_rfc3339(), - updated_at: row.get::, _>("updated_at").to_rfc3339(), - priority: row.get("priority"), - allow_duplicate: row.get("allow_duplicate"), - input_payload, - }) - } - - async fn count_schedule_invocations(&self, schedule_id: Uuid) -> BackendResult { - let count = sqlx::query_scalar::<_, i64>( - r#" - SELECT COUNT(*) - FROM runner_instances - WHERE schedule_id = $1 - "#, - ) - .bind(schedule_id) - .fetch_one(&self.pool) - .await?; - Ok(count) - } - - async fn list_schedule_invocations( - &self, - schedule_id: Uuid, - limit: i64, - offset: i64, - ) -> BackendResult> { - let rows = sqlx::query( - r#" - SELECT instance_id, created_at, state, result, error - FROM runner_instances - WHERE schedule_id = $1 - ORDER BY created_at DESC, instance_id DESC - LIMIT $2 OFFSET $3 - "#, - ) - .bind(schedule_id) - .bind(limit) - .bind(offset) - .fetch_all(&self.pool) - .await?; - - let mut invocations = Vec::with_capacity(rows.len()); - for row in rows { - let state_bytes: Option> = row.get("state"); - let result_bytes: Option> = row.get("result"); - let 
error_bytes: Option> = row.get("error"); - - invocations.push(ScheduleInvocationSummary { - id: row.get("instance_id"), - created_at: row.get("created_at"), - status: determine_status(&state_bytes, &result_bytes, &error_bytes), - }); - } - - Ok(invocations) - } - - async fn update_schedule_status(&self, schedule_id: Uuid, status: &str) -> BackendResult { - let result = sqlx::query( - r#" - UPDATE workflow_schedules - SET status = $2, updated_at = NOW() - WHERE id = $1 - "#, - ) - .bind(schedule_id) - .bind(status) - .execute(&self.pool) - .await?; - - Ok(result.rows_affected() > 0) - } - - async fn get_distinct_schedule_statuses(&self) -> BackendResult> { - Ok(vec!["active".to_string(), "paused".to_string()]) - } - - async fn get_distinct_schedule_types(&self) -> BackendResult> { - Ok(vec!["cron".to_string(), "interval".to_string()]) - } - - async fn get_worker_action_stats( - &self, - window_minutes: i64, - ) -> BackendResult> { - let rows = sqlx::query( - r#" - SELECT - pool_id, - COUNT(DISTINCT worker_id) as active_workers, - SUM(throughput_per_min) / 60.0 as actions_per_sec, - SUM(throughput_per_min) as throughput_per_min, - COALESCE(SUM(total_completed), 0)::BIGINT as total_completed, - PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY median_dequeue_ms) as median_dequeue_ms, - PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY median_handling_ms) as median_handling_ms, - MAX(last_action_at) as last_action_at, - MAX(updated_at) as updated_at - FROM worker_status - WHERE updated_at > NOW() - INTERVAL '1 minute' * $1 - GROUP BY pool_id - ORDER BY actions_per_sec DESC - "#, - ) - .bind(window_minutes) - .fetch_all(&self.pool) - .await?; - - let mut stats = Vec::new(); - for row in rows { - stats.push(WorkerActionRow { - pool_id: row.get::("pool_id").to_string(), - active_workers: row.get::("active_workers"), - actions_per_sec: format!("{:.1}", row.get::("actions_per_sec")), - throughput_per_min: row.get::("throughput_per_min") as i64, - total_completed: 
row.get::("total_completed"), - median_dequeue_ms: row - .get::, _>("median_dequeue_ms") - .map(|v| v as i64), - median_handling_ms: row - .get::, _>("median_handling_ms") - .map(|v| v as i64), - last_action_at: row - .get::>, _>("last_action_at") - .map(|dt| dt.to_rfc3339()), - updated_at: row.get::, _>("updated_at").to_rfc3339(), - }); - } - - Ok(stats) - } - - async fn get_worker_aggregate_stats( - &self, - window_minutes: i64, - ) -> BackendResult { - let row = sqlx::query( - r#" - SELECT - COUNT(DISTINCT worker_id) as active_worker_count, - COALESCE(SUM(throughput_per_min) / 60.0, 0) as actions_per_sec, - COALESCE(SUM(total_in_flight), 0)::BIGINT as total_in_flight, - COALESCE(SUM(dispatch_queue_size), 0)::BIGINT as total_queue_depth - FROM worker_status - WHERE updated_at > NOW() - INTERVAL '1 minute' * $1 - "#, - ) - .bind(window_minutes) - .fetch_one(&self.pool) - .await?; - - Ok(WorkerAggregateStats { - active_worker_count: row.get::("active_worker_count"), - actions_per_sec: format!("{:.1}", row.get::("actions_per_sec")), - total_in_flight: row.get::("total_in_flight"), - total_queue_depth: row.get::("total_queue_depth"), - }) - } - - async fn worker_status_table_exists(&self) -> bool { - sqlx::query_scalar::<_, bool>( - r#" - SELECT EXISTS ( - SELECT FROM information_schema.tables - WHERE table_name = 'worker_status' - ) - "#, - ) - .fetch_one(&self.pool) - .await - .unwrap_or(false) - } - - async fn schedules_table_exists(&self) -> bool { - sqlx::query_scalar::<_, bool>( - r#" - SELECT EXISTS ( - SELECT FROM information_schema.tables - WHERE table_name = 'workflow_schedules' - ) - "#, - ) - .fetch_one(&self.pool) - .await - .unwrap_or(false) - } - - async fn get_worker_statuses(&self, window_minutes: i64) -> BackendResult> { - let rows = sqlx::query( - r#" - SELECT - pool_id, - MAX(active_workers) as active_workers, - COALESCE(SUM(throughput_per_min), 0) as throughput_per_min, - COALESCE(SUM(throughput_per_min) / 60.0, 0) as actions_per_sec, - 
COALESCE(SUM(total_completed), 0)::BIGINT as total_completed, - MAX(last_action_at) as last_action_at, - MAX(updated_at) as updated_at, - PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY median_dequeue_ms) as median_dequeue_ms, - PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY median_handling_ms) as median_handling_ms, - MAX(dispatch_queue_size) as dispatch_queue_size, - MAX(total_in_flight) as total_in_flight, - MAX(median_instance_duration_secs) as median_instance_duration_secs, - MAX(active_instance_count) as active_instance_count, - ( - SELECT COUNT(*)::BIGINT - FROM runner_instances ri - WHERE ri.result IS NOT NULL - AND ri.error IS NULL - ) as total_instances_completed, - MAX(instances_per_sec) as instances_per_sec, - MAX(instances_per_min) as instances_per_min, - ( - SELECT time_series FROM worker_status ws2 - WHERE ws2.pool_id = worker_status.pool_id - AND ws2.time_series IS NOT NULL - ORDER BY ws2.updated_at DESC LIMIT 1 - ) as time_series - FROM worker_status - WHERE updated_at > NOW() - INTERVAL '1 minute' * $1 - GROUP BY pool_id - ORDER BY actions_per_sec DESC - "#, - ) - .bind(window_minutes) - .fetch_all(&self.pool) - .await?; - - let mut statuses = Vec::new(); - for row in rows { - statuses.push(WorkerStatus { - pool_id: row.get::("pool_id"), - active_workers: row.get::, _>("active_workers").unwrap_or(0), - throughput_per_min: row.get::("throughput_per_min"), - actions_per_sec: row.get::("actions_per_sec"), - total_completed: row.get::("total_completed"), - last_action_at: row.get::>, _>("last_action_at"), - updated_at: row.get::, _>("updated_at"), - median_dequeue_ms: row - .get::, _>("median_dequeue_ms") - .map(|v| v as i64), - median_handling_ms: row - .get::, _>("median_handling_ms") - .map(|v| v as i64), - dispatch_queue_size: row.get::, _>("dispatch_queue_size"), - total_in_flight: row.get::, _>("total_in_flight"), - median_instance_duration_secs: row - .get::, _>("median_instance_duration_secs"), - active_instance_count: row - .get::, 
_>("active_instance_count") - .unwrap_or(0), - total_instances_completed: row - .get::, _>("total_instances_completed") - .unwrap_or(0), - instances_per_sec: row - .get::, _>("instances_per_sec") - .unwrap_or(0.0), - instances_per_min: row - .get::, _>("instances_per_min") - .unwrap_or(0.0), - time_series: row.get::>, _>("time_series"), - }); - } - - Ok(statuses) - } -} - -struct DecodedActionResultRow { - created_at: DateTime, - execution_id: Uuid, - attempt: i32, - status: Option, - started_at: Option>, - completed_at: Option>, - duration_ms: Option, - result: Option, -} - -fn decode_msgpack_json(bytes: &[u8]) -> BackendResult { - rmp_serde::from_slice::(bytes) - .map_err(|err| BackendError::Message(format!("failed to decode action result: {err}"))) -} - -fn render_action_request_preview( - action: Option<&ActionCallSpec>, - state: &RunnerState, - action_results: &HashMap, - node_id: Uuid, -) -> String { - let Some(action) = action else { - return "{}".to_string(); - }; - - match replay_action_kwargs(state, action_results, node_id) { - Ok(kwargs) => { - let rendered_map: serde_json::Map = kwargs.into_iter().collect(); - pretty_json(&Value::Object(rendered_map)) - } - Err(_) => format_symbolic_kwargs(action), - } -} - -fn format_symbolic_kwargs(action: &ActionCallSpec) -> String { - if action.kwargs.is_empty() { - return "{}".to_string(); - } - let rendered_map: serde_json::Map = action - .kwargs - .iter() - .map(|(name, expr)| (name.clone(), Value::String(format_value(expr)))) - .collect(); - pretty_json(&Value::Object(rendered_map)) -} - -fn action_timing_from_state( - node: Option<&ExecutionNode>, - attempt: i32, - fallback_completed_at: DateTime, -) -> (Option, Option, Option) { - // Node timing fields represent the latest attempt for this execution id. - // For historical retries, fall back to row timestamps from actions_done. 
- let Some(node) = node else { - let at = fallback_completed_at.to_rfc3339(); - return (Some(at.clone()), Some(at), None); - }; - if node.action_attempt != attempt { - let at = fallback_completed_at.to_rfc3339(); - return (Some(at.clone()), Some(at), None); - } - - let dispatched_at = node - .started_at - .map(|value| value.to_rfc3339()) - .unwrap_or_else(|| fallback_completed_at.to_rfc3339()); - let completed_dt = node.completed_at.unwrap_or(fallback_completed_at); - let completed_at = completed_dt.to_rfc3339(); - let duration_ms = node - .started_at - .map(|started_at| { - completed_dt - .signed_duration_since(started_at) - .num_milliseconds() - }) - .filter(|duration| *duration >= 0); - - (Some(dispatched_at), Some(completed_at), duration_ms) -} - -fn format_action_result(value: &Value) -> (String, Option) { - let preview = pretty_json(value); - let error = extract_action_error(value); - (preview, error) -} - -fn extract_action_error(value: &Value) -> Option { - let Value::Object(map) = value else { - return None; - }; - let message = map.get("message").and_then(Value::as_str); - let is_exception = map.contains_key("type") && map.contains_key("message"); - if is_exception { - return Some(message.unwrap_or("action failed").to_string()); - } - map.get("error") - .and_then(Value::as_str) - .map(|msg| msg.to_string()) -} - -fn pretty_json(value: &Value) -> String { - serde_json::to_string_pretty(value).unwrap_or_else(|_| "{}".to_string()) -} - -fn determine_status( - state_bytes: &Option>, - result_bytes: &Option>, - error_bytes: &Option>, -) -> InstanceStatus { - if error_bytes.is_some() { - return InstanceStatus::Failed; - } - if result_bytes - .as_deref() - .is_some_and(result_payload_is_error_wrapper) - { - return InstanceStatus::Failed; - } - if result_bytes.is_some() { - return InstanceStatus::Completed; - } - if state_bytes.is_some() { - return InstanceStatus::Running; - } - InstanceStatus::Queued -} - -fn extract_input_preview(state_bytes: &Option>) -> 
String { - let Some(bytes) = state_bytes else { - return "{}".to_string(); - }; - - match rmp_serde::from_slice::(bytes) { - Ok(graph) => { - let count = graph.nodes.len(); - format!("{{nodes: {count}}}") - } - Err(_) => "{}".to_string(), - } -} - -fn format_input_payload(state_bytes: &Option>) -> String { - let Some(bytes) = state_bytes else { - return "{}".to_string(); - }; - - match rmp_serde::from_slice::(bytes) { - Ok(graph) => format_extracted_inputs(&graph.nodes), - Err(_) => "{}".to_string(), - } -} - -fn format_extracted_inputs(nodes: &HashMap) -> String { - let mut input_pairs: Vec<(String, Value)> = nodes - .values() - .filter_map(extract_input_assignment) - .collect(); - if input_pairs.is_empty() { - return "{}".to_string(); - } - input_pairs.sort_by(|(left, _), (right, _)| left.cmp(right)); - let input_map: serde_json::Map = input_pairs.into_iter().collect(); - pretty_json(&Value::Object(input_map)) -} - -fn extract_input_assignment(node: &ExecutionNode) -> Option<(String, Value)> { - let (name, raw_value) = parse_input_assignment_label(&node.label)?; - - if let Ok(value) = serde_json::from_str::(raw_value) { - return Some((name.to_string(), value)); - } - - if let Some(value_expr) = node.assignments.get(name) { - return Some((name.to_string(), value_expr_to_json(value_expr))); - } - - Some((name.to_string(), Value::String(raw_value.to_string()))) -} - -fn parse_input_assignment_label(label: &str) -> Option<(&str, &str)> { - let payload = label.strip_prefix("input ")?; - payload.split_once(" = ") -} - -fn value_expr_to_json(value_expr: &ValueExpr) -> Value { - match value_expr { - ValueExpr::Literal(value) => value.value.clone(), - ValueExpr::List(value) => { - Value::Array(value.elements.iter().map(value_expr_to_json).collect()) - } - ValueExpr::Dict(value) => { - let mut map = serde_json::Map::new(); - for entry in &value.entries { - let key = match value_expr_to_json(&entry.key) { - Value::String(key) => key, - other => other.to_string(), - }; - 
map.insert(key, value_expr_to_json(&entry.value)); - } - Value::Object(map) - } - _ => Value::String(format_value(value_expr)), - } -} - -fn format_instance_result_payload( - status: InstanceStatus, - result_bytes: &Option>, - error_bytes: &Option>, -) -> String { - match status { - InstanceStatus::Failed => { - let payload = error_bytes.as_deref().or(result_bytes.as_deref()); - let Some(bytes) = payload else { - return "(failed)".to_string(); - }; - match rmp_serde::from_slice::(bytes) { - Ok(value) => pretty_json(&normalize_error_payload(value)), - Err(_) => "(decode error)".to_string(), - } - } - InstanceStatus::Completed => { - let Some(bytes) = result_bytes else { - return "(pending)".to_string(); - }; - match rmp_serde::from_slice::(bytes) { - Ok(value) => pretty_json(&normalize_success_payload(value)), - Err(_) => "(decode error)".to_string(), - } - } - InstanceStatus::Running | InstanceStatus::Queued => "(pending)".to_string(), - } -} - -fn normalize_success_payload(value: Value) -> Value { - let Value::Object(mut map) = value else { - return value; - }; - map.remove("result").unwrap_or(Value::Object(map)) -} - -fn normalize_error_payload(value: Value) -> Value { - let Value::Object(mut map) = value else { - return value; - }; - - if let Some(error) = map.remove("error") { - return normalize_error_payload(error); - } - if let Some(exception) = map.remove("__exception__") { - return normalize_error_payload(exception); - } - if let Some(exception) = map.remove("exception") { - return normalize_error_payload(exception); - } - - Value::Object(map) -} - -fn result_payload_is_error_wrapper(bytes: &[u8]) -> bool { - let Ok(value) = rmp_serde::from_slice::(bytes) else { - return false; - }; - let Value::Object(map) = value else { - return false; - }; - map.len() == 1 - && (map.contains_key("error") - || map.contains_key("__exception__") - || map.contains_key("exception")) -} - -fn format_error(error_bytes: &Option>) -> Option { - let bytes = error_bytes.as_ref()?; 
- - match rmp_serde::from_slice::(bytes) { - Ok(value) => Some(pretty_json(&normalize_error_payload(value))), - Err(_) => Some("(decode error)".to_string()), - } -} - -fn format_node_status(status: &NodeStatus) -> String { - match status { - NodeStatus::Queued => "queued".to_string(), - NodeStatus::Running => "running".to_string(), - NodeStatus::Completed => "completed".to_string(), - NodeStatus::Failed => "failed".to_string(), - } -} - -fn merge_template_status(existing: &NodeStatus, new_status: &NodeStatus) -> NodeStatus { - if node_status_rank(new_status) > node_status_rank(existing) { - new_status.clone() - } else { - existing.clone() - } -} - -fn node_status_rank(status: &NodeStatus) -> u8 { - match status { - NodeStatus::Completed => 0, - NodeStatus::Queued => 1, - NodeStatus::Running => 2, - NodeStatus::Failed => 3, - } -} - -#[cfg(test)] -mod tests { - use std::collections::{HashMap, HashSet}; - - use chrono::{Duration as ChronoDuration, Utc}; - use prost::Message; - use serial_test::serial; - use uuid::Uuid; - - use super::super::test_helpers::setup_backend; - use super::*; - use crate::backends::{ - SchedulerBackend, WebappBackend, WorkerStatusBackend, WorkerStatusUpdate, - WorkflowRegistration, WorkflowRegistryBackend, - }; - use crate::scheduler::{CreateScheduleParams, ScheduleType}; - use crate::waymark_core::ir_parser::parse_program; - use crate::waymark_core::runner::ValueExpr; - use crate::waymark_core::runner::state::{ - ActionCallSpec, ExecutionEdge, ExecutionNode, LiteralValue, NodeStatus, - }; - use waymark_dag::EdgeType; - - #[test] - fn format_extracted_inputs_happy_path() { - let mut nodes = HashMap::new(); - let mut first_assignments = HashMap::new(); - first_assignments.insert( - "iterations".to_string(), - ValueExpr::Literal(LiteralValue { - value: serde_json::json!(3), - }), - ); - nodes.insert( - Uuid::new_v4(), - ExecutionNode { - node_id: Uuid::new_v4(), - node_type: "assignment".to_string(), - label: "input iterations = 
3".to_string(), - status: NodeStatus::Completed, - template_id: None, - targets: vec!["iterations".to_string()], - action: None, - value_expr: None, - assignments: first_assignments, - action_attempt: 0, - started_at: None, - completed_at: None, - scheduled_at: None, - }, - ); - - let mut second_assignments = HashMap::new(); - second_assignments.insert( - "sleep_seconds".to_string(), - ValueExpr::Literal(LiteralValue { - value: serde_json::json!(20), - }), - ); - nodes.insert( - Uuid::new_v4(), - ExecutionNode { - node_id: Uuid::new_v4(), - node_type: "assignment".to_string(), - label: "input sleep_seconds = 20".to_string(), - status: NodeStatus::Completed, - template_id: None, - targets: vec!["sleep_seconds".to_string()], - action: None, - value_expr: None, - assignments: second_assignments, - action_attempt: 0, - started_at: None, - completed_at: None, - scheduled_at: None, - }, - ); - - let rendered = format_extracted_inputs(&nodes); - let value: Value = serde_json::from_str(&rendered).expect("decode rendered input payload"); - assert_eq!( - value, - serde_json::json!({ - "iterations": 3, - "sleep_seconds": 20 - }) - ); - } - - #[test] - fn format_instance_result_payload_unwraps_success_result_wrapper() { - let result_bytes = - rmp_serde::to_vec_named(&serde_json::json!({"result": {"total_iterations": 3}})) - .expect("encode result"); - let rendered = - format_instance_result_payload(InstanceStatus::Completed, &Some(result_bytes), &None); - let value: Value = serde_json::from_str(&rendered).expect("decode result payload"); - assert_eq!(value, serde_json::json!({"total_iterations": 3})); - } - - #[test] - fn format_instance_result_payload_unwraps_error_wrapper() { - let error_bytes = rmp_serde::to_vec_named(&serde_json::json!({ - "error": { - "__exception__": { - "type": "ValueError", - "message": "boom" - } - } - })) - .expect("encode error"); - let rendered = - format_instance_result_payload(InstanceStatus::Failed, &None, &Some(error_bytes)); - let value: Value 
= serde_json::from_str(&rendered).expect("decode result payload"); - assert_eq!( - value, - serde_json::json!({ - "type": "ValueError", - "message": "boom" - }) - ); - } - - #[test] - fn determine_status_marks_wrapped_result_errors_as_failed() { - let result_bytes = - rmp_serde::to_vec_named(&serde_json::json!({"error": {"message": "boom"}})) - .expect("encode result error"); - let status = determine_status(&None, &Some(result_bytes), &None); - assert_eq!(status, InstanceStatus::Failed); - } - - #[test] - fn parse_instance_search_expr_handles_boolean_operators() { - let parsed = parse_instance_search_expr("(alpha OR beta) AND running"); - assert_eq!( - parsed, - Some(InstanceSearchExpr::And( - Box::new(InstanceSearchExpr::Or( - Box::new(InstanceSearchExpr::Term("alpha".to_string())), - Box::new(InstanceSearchExpr::Term("beta".to_string())), - )), - Box::new(InstanceSearchExpr::Term("running".to_string())), - )) - ); - } - - #[test] - fn parse_instance_search_expr_falls_back_for_unbalanced_parentheses() { - let parsed = parse_instance_search_expr("(alpha OR beta"); - assert_eq!( - parsed, - Some(InstanceSearchExpr::Term("(alpha OR beta".to_string())) - ); - } - - #[test] - fn action_timing_from_state_uses_state_timestamps_for_latest_attempt() { - let started_at = Utc::now() - ChronoDuration::milliseconds(1500); - let completed_at = started_at + ChronoDuration::milliseconds(450); - let fallback = Utc::now(); - let node = ExecutionNode { - node_id: Uuid::new_v4(), - node_type: "action_call".to_string(), - label: "@tests.action()".to_string(), - status: NodeStatus::Completed, - template_id: Some("n0".to_string()), - targets: Vec::new(), - action: Some(ActionCallSpec { - action_name: "tests.action".to_string(), - module_name: Some("tests".to_string()), - kwargs: HashMap::new(), - }), - value_expr: None, - assignments: HashMap::new(), - action_attempt: 2, - started_at: Some(started_at), - completed_at: Some(completed_at), - scheduled_at: None, - }; - - let 
(dispatched_at, finished_at, duration_ms) = - action_timing_from_state(Some(&node), 2, fallback); - assert_eq!(dispatched_at, Some(started_at.to_rfc3339())); - assert_eq!(finished_at, Some(completed_at.to_rfc3339())); - assert_eq!(duration_ms, Some(450)); - } - - #[test] - fn action_timing_from_state_falls_back_for_prior_attempt_rows() { - let started_at = Utc::now() - ChronoDuration::milliseconds(1200); - let completed_at = started_at + ChronoDuration::milliseconds(600); - let fallback = Utc::now(); - let node = ExecutionNode { - node_id: Uuid::new_v4(), - node_type: "action_call".to_string(), - label: "@tests.action()".to_string(), - status: NodeStatus::Completed, - template_id: Some("n0".to_string()), - targets: Vec::new(), - action: Some(ActionCallSpec { - action_name: "tests.action".to_string(), - module_name: Some("tests".to_string()), - kwargs: HashMap::new(), - }), - value_expr: None, - assignments: HashMap::new(), - action_attempt: 3, - started_at: Some(started_at), - completed_at: Some(completed_at), - scheduled_at: None, - }; - - let (dispatched_at, finished_at, duration_ms) = - action_timing_from_state(Some(&node), 2, fallback); - assert_eq!(dispatched_at, Some(fallback.to_rfc3339())); - assert_eq!(finished_at, Some(fallback.to_rfc3339())); - assert_eq!(duration_ms, None); - } - - fn sample_execution_node(execution_id: Uuid) -> ExecutionNode { - ExecutionNode { - node_id: execution_id, - node_type: "action_call".to_string(), - label: "@tests.action()".to_string(), - status: NodeStatus::Queued, - template_id: Some("n0".to_string()), - targets: Vec::new(), - action: Some(ActionCallSpec { - action_name: "tests.action".to_string(), - module_name: Some("tests".to_string()), - kwargs: HashMap::from([( - "value".to_string(), - ValueExpr::Literal(LiteralValue { - value: serde_json::json!(7), - }), - )]), - }), - value_expr: None, - assignments: HashMap::new(), - action_attempt: 1, - started_at: None, - completed_at: None, - scheduled_at: Some(Utc::now()), - } - 
} - - fn sample_graph(instance_id: Uuid, execution_id: Uuid) -> GraphUpdate { - let mut nodes = HashMap::new(); - nodes.insert(execution_id, sample_execution_node(execution_id)); - - GraphUpdate { - instance_id, - nodes, - edges: HashSet::from([ExecutionEdge { - source: execution_id, - target: execution_id, - edge_type: EdgeType::StateMachine, - }]), - } - } - - async fn insert_instance_with_graph_with_workflow( - backend: &PostgresBackend, - workflow_name: &str, - ) -> (Uuid, Uuid, Uuid) { - let instance_id = Uuid::new_v4(); - let entry_node = Uuid::new_v4(); - let execution_id = Uuid::new_v4(); - let workflow_version_id = insert_workflow_version(backend, workflow_name).await; - let graph = sample_graph(instance_id, execution_id); - let state_payload = rmp_serde::to_vec_named(&graph).expect("encode graph update"); - - sqlx::query( - "INSERT INTO runner_instances (instance_id, entry_node, workflow_version_id, state) VALUES ($1, $2, $3, $4)", - ) - .bind(instance_id) - .bind(entry_node) - .bind(workflow_version_id) - .bind(state_payload) - .execute(backend.pool()) - .await - .expect("insert runner instance"); - - (instance_id, entry_node, execution_id) - } - - async fn insert_instance_with_graph(backend: &PostgresBackend) -> (Uuid, Uuid, Uuid) { - insert_instance_with_graph_with_workflow(backend, "tests.workflow").await - } - - async fn insert_action_result(backend: &PostgresBackend, execution_id: Uuid) { - let payload = rmp_serde::to_vec_named(&serde_json::json!({"ok": true})) - .expect("encode action result"); - sqlx::query( - "INSERT INTO runner_actions_done (execution_id, attempt, result) VALUES ($1, $2, $3)", - ) - .bind(execution_id) - .bind(1_i32) - .bind(payload) - .execute(backend.pool()) - .await - .expect("insert action result"); - } - - fn sample_program_proto() -> Vec { - let source = r#" -fn main(input: [x], output: [y]): - y = @tests.action(value=x) - return y -"#; - let program = parse_program(source.trim()).expect("parse program"); - 
program.encode_to_vec() - } - - fn loop_program_proto() -> Vec { - let source = r#" -fn main(input: [items], output: [total]): - total = 0 - for item in items: - total = total + item - return total -"#; - let program = parse_program(source.trim()).expect("parse loop program"); - program.encode_to_vec() - } - - async fn insert_workflow_version(backend: &PostgresBackend, workflow_name: &str) -> Uuid { - WorkflowRegistryBackend::upsert_workflow_version( - backend, - &WorkflowRegistration { - workflow_name: workflow_name.to_string(), - workflow_version: "v1".to_string(), - ir_hash: format!("hash-{workflow_name}"), - program_proto: sample_program_proto(), - concurrent: false, - }, - ) - .await - .expect("insert workflow version") - } - - async fn insert_loop_workflow_version(backend: &PostgresBackend, workflow_name: &str) -> Uuid { - WorkflowRegistryBackend::upsert_workflow_version( - backend, - &WorkflowRegistration { - workflow_name: workflow_name.to_string(), - workflow_version: "v1-loop".to_string(), - ir_hash: format!("hash-loop-{workflow_name}"), - program_proto: loop_program_proto(), - concurrent: false, - }, - ) - .await - .expect("insert loop workflow version") - } - - async fn insert_schedule(backend: &PostgresBackend, schedule_name: &str) -> Uuid { - SchedulerBackend::upsert_schedule( - backend, - &CreateScheduleParams { - workflow_name: "tests.workflow".to_string(), - schedule_name: schedule_name.to_string(), - schedule_type: ScheduleType::Interval, - cron_expression: None, - interval_seconds: Some(60), - jitter_seconds: 0, - input_payload: Some( - rmp_serde::to_vec_named(&serde_json::json!({"k": "v"})) - .expect("encode payload"), - ), - priority: 0, - allow_duplicate: false, - }, - ) - .await - .expect("upsert schedule") - .0 - } - - async fn insert_scheduled_instance( - backend: &PostgresBackend, - schedule_id: Uuid, - created_at: DateTime, - with_result: bool, - ) -> Uuid { - let instance_id = Uuid::new_v4(); - let entry_node = Uuid::new_v4(); - let 
execution_id = Uuid::new_v4(); - let workflow_version_id = insert_workflow_version(backend, "tests.workflow").await; - let graph = sample_graph(instance_id, execution_id); - let state_payload = rmp_serde::to_vec_named(&graph).expect("encode graph update"); - let result_payload = if with_result { - Some( - rmp_serde::to_vec_named(&serde_json::json!({"result": {"ok": true}})) - .expect("encode result"), - ) - } else { - None - }; - - sqlx::query( - "INSERT INTO runner_instances (instance_id, entry_node, workflow_version_id, schedule_id, created_at, state, result, error) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)", - ) - .bind(instance_id) - .bind(entry_node) - .bind(workflow_version_id) - .bind(schedule_id) - .bind(created_at) - .bind(state_payload) - .bind(result_payload) - .bind(Option::>::None) - .execute(backend.pool()) - .await - .expect("insert scheduled instance"); - - instance_id - } - - async fn insert_worker_status(backend: &PostgresBackend, pool_id: Uuid) { - WorkerStatusBackend::upsert_worker_status( - backend, - &WorkerStatusUpdate { - pool_id, - throughput_per_min: 180.0, - total_completed: 20, - last_action_at: Some(Utc::now()), - median_dequeue_ms: Some(5), - median_handling_ms: Some(12), - dispatch_queue_size: 3, - total_in_flight: 2, - active_workers: 4, - actions_per_sec: 3.0, - median_instance_duration_secs: Some(0.2), - active_instance_count: 1, - total_instances_completed: 8, - instances_per_sec: 0.5, - instances_per_min: 30.0, - time_series: None, - }, - ) - .await - .expect("upsert worker status"); - } - - #[serial(postgres)] - #[tokio::test] - async fn webapp_count_instances_happy_path() { - let backend = setup_backend().await; - insert_instance_with_graph(&backend).await; - - let count = WebappBackend::count_instances(&backend, None) - .await - .expect("count instances"); - assert_eq!(count, 1); - } - - #[serial(postgres)] - #[tokio::test] - async fn webapp_count_instances_applies_search_expression() { - let backend = setup_backend().await; - 
let (alpha_id, _, _) = - insert_instance_with_graph_with_workflow(&backend, "tests.alpha").await; - let (beta_id, _, _) = - insert_instance_with_graph_with_workflow(&backend, "tests.beta").await; - assert_ne!(alpha_id, beta_id); - - let completed_payload = - rmp_serde::to_vec_named(&serde_json::json!({"result": {"ok": true}})) - .expect("encode completed payload"); - sqlx::query( - "UPDATE runner_instances SET result = $2, current_status = $3 WHERE instance_id = $1", - ) - .bind(beta_id) - .bind(completed_payload) - .bind("completed") - .execute(backend.pool()) - .await - .expect("mark beta completed"); - - let alpha_count = WebappBackend::count_instances(&backend, Some("alpha")) - .await - .expect("count alpha"); - assert_eq!(alpha_count, 1); - - let completed_count = WebappBackend::count_instances(&backend, Some("completed")) - .await - .expect("count completed"); - assert_eq!(completed_count, 1); - - let combined = WebappBackend::count_instances(&backend, Some("(alpha OR completed)")) - .await - .expect("count combined"); - assert_eq!(combined, 2); - } - - #[serial(postgres)] - #[tokio::test] - async fn webapp_list_instances_happy_path() { - let backend = setup_backend().await; - let (instance_id, _, _) = insert_instance_with_graph(&backend).await; - - let instances = WebappBackend::list_instances(&backend, None, 10, 0) - .await - .expect("list instances"); - - assert_eq!(instances.len(), 1); - assert_eq!(instances[0].id, instance_id); - assert_eq!(instances[0].status, InstanceStatus::Running); - assert_eq!( - instances[0].workflow_name, - Some("tests.workflow".to_string()) - ); - } - - #[serial(postgres)] - #[tokio::test] - async fn webapp_list_instances_applies_search_expression() { - let backend = setup_backend().await; - let (alpha_id, _, _) = - insert_instance_with_graph_with_workflow(&backend, "tests.alpha").await; - let _ = insert_instance_with_graph_with_workflow(&backend, "tests.beta").await; - - let alpha_instances = 
WebappBackend::list_instances(&backend, Some("alpha"), 10, 0) - .await - .expect("list alpha"); - assert_eq!(alpha_instances.len(), 1); - assert_eq!(alpha_instances[0].id, alpha_id); - - let running_instances = - WebappBackend::list_instances(&backend, Some("(alpha OR beta) AND running"), 10, 0) - .await - .expect("list running instances"); - assert_eq!(running_instances.len(), 2); - } - - #[serial(postgres)] - #[tokio::test] - async fn webapp_get_instance_happy_path() { - let backend = setup_backend().await; - let (instance_id, _, _) = insert_instance_with_graph(&backend).await; - - let instance = WebappBackend::get_instance(&backend, instance_id) - .await - .expect("get instance"); - - assert_eq!(instance.id, instance_id); - assert_eq!(instance.status, InstanceStatus::Running); - assert_eq!(instance.workflow_name, Some("tests.workflow".to_string())); - } - - #[serial(postgres)] - #[tokio::test] - async fn webapp_workflow_name_prefers_registered_workflow_name() { - let backend = setup_backend().await; - let (instance_id, entry_node, execution_id) = - insert_instance_with_graph_with_workflow(&backend, "tests.workflow_name").await; - - let list = WebappBackend::list_instances(&backend, None, 10, 0) - .await - .expect("list instances"); - assert_eq!(list.len(), 1); - assert_eq!(list[0].id, instance_id); - assert_eq!( - list[0].workflow_name, - Some("tests.workflow_name".to_string()) - ); - - let detail = WebappBackend::get_instance(&backend, instance_id) - .await - .expect("get instance"); - assert_eq!(detail.id, instance_id); - assert_eq!(detail.entry_node, entry_node); - assert_eq!( - detail.workflow_name, - Some("tests.workflow_name".to_string()) - ); - - let graph = WebappBackend::get_execution_graph(&backend, instance_id) - .await - .expect("get graph") - .expect("graph"); - assert!( - graph - .nodes - .iter() - .any(|node| node.id == execution_id.to_string()), - "expected action node to remain intact" - ); - } - - #[serial(postgres)] - #[tokio::test] - async fn 
webapp_get_execution_graph_happy_path() { - let backend = setup_backend().await; - let (instance_id, _, execution_id) = insert_instance_with_graph(&backend).await; - - let graph = WebappBackend::get_execution_graph(&backend, instance_id) - .await - .expect("get execution graph") - .expect("expected execution graph"); - - assert_eq!(graph.nodes.len(), 1); - assert_eq!(graph.edges.len(), 1); - assert_eq!(graph.nodes[0].id, execution_id.to_string()); - assert_eq!(graph.nodes[0].action_name, Some("tests.action".to_string())); - } - - #[serial(postgres)] - #[tokio::test] - async fn webapp_get_workflow_graph_uses_template_node_ids() { - let backend = setup_backend().await; - let (instance_id, _, execution_id) = insert_instance_with_graph(&backend).await; - - let graph = WebappBackend::get_workflow_graph(&backend, instance_id) - .await - .expect("get workflow graph") - .expect("expected workflow graph"); - - assert!(!graph.nodes.is_empty(), "workflow graph should have nodes"); - assert!( - graph - .nodes - .iter() - .all(|node| node.id != execution_id.to_string()), - "workflow graph should use template node ids, not runtime execution ids" - ); - assert!( - graph - .nodes - .iter() - .any(|node| node.node_type == "action_call"), - "workflow graph should include action_call template nodes" - ); - } - - #[serial(postgres)] - #[tokio::test] - async fn webapp_get_workflow_graph_marks_loop_back_edges() { - let backend = setup_backend().await; - let instance_id = Uuid::new_v4(); - let entry_node = Uuid::new_v4(); - let execution_id = Uuid::new_v4(); - let workflow_version_id = - insert_loop_workflow_version(&backend, "tests.loop_workflow").await; - let graph = sample_graph(instance_id, execution_id); - let state_payload = rmp_serde::to_vec_named(&graph).expect("encode graph update"); - - sqlx::query( - "INSERT INTO runner_instances (instance_id, entry_node, workflow_version_id, state) VALUES ($1, $2, $3, $4)", - ) - .bind(instance_id) - .bind(entry_node) - 
.bind(workflow_version_id) - .bind(state_payload) - .execute(backend.pool()) - .await - .expect("insert loop runner instance"); - - let workflow_graph = WebappBackend::get_workflow_graph(&backend, instance_id) - .await - .expect("get workflow graph") - .expect("expected workflow graph"); - - assert!( - workflow_graph - .edges - .iter() - .any(|edge| edge.edge_type == "state_machine_loop_back"), - "loop workflows should emit at least one loop_back edge" - ); - } - - #[serial(postgres)] - #[tokio::test] - async fn webapp_get_action_results_happy_path() { - let backend = setup_backend().await; - let (instance_id, _, execution_id) = insert_instance_with_graph(&backend).await; - insert_action_result(&backend, execution_id).await; - - let entries = WebappBackend::get_action_results(&backend, instance_id) - .await - .expect("get action results"); - - assert_eq!(entries.len(), 1); - assert_eq!(entries[0].action_id, execution_id.to_string()); - assert_eq!(entries[0].action_name, "tests.action"); - assert_eq!(entries[0].status, "completed"); - assert!(entries[0].request_preview.contains("\"value\": 7")); - } - - #[serial(postgres)] - #[tokio::test] - async fn webapp_get_distinct_workflows_happy_path() { - let backend = setup_backend().await; - insert_instance_with_graph_with_workflow(&backend, "tests.workflow_a").await; - insert_instance_with_graph_with_workflow(&backend, "tests.workflow_b").await; - - let workflows = WebappBackend::get_distinct_workflows(&backend) - .await - .expect("get distinct workflows"); - assert_eq!( - workflows, - vec![ - "tests.workflow_a".to_string(), - "tests.workflow_b".to_string() - ] - ); - } - - #[serial(postgres)] - #[tokio::test] - async fn webapp_get_distinct_statuses_happy_path() { - let backend = setup_backend().await; - - let statuses = WebappBackend::get_distinct_statuses(&backend) - .await - .expect("get distinct statuses"); - assert_eq!(statuses, vec!["queued", "running", "completed", "failed"]); - } - - #[serial(postgres)] - 
#[tokio::test] - async fn webapp_count_schedules_happy_path() { - let backend = setup_backend().await; - insert_schedule(&backend, "count").await; - - let count = WebappBackend::count_schedules(&backend) - .await - .expect("count schedules"); - assert_eq!(count, 1); - } - - #[serial(postgres)] - #[tokio::test] - async fn webapp_list_schedules_happy_path() { - let backend = setup_backend().await; - let schedule_id = insert_schedule(&backend, "list").await; - - let schedules = WebappBackend::list_schedules(&backend, 10, 0) - .await - .expect("list schedules"); - assert_eq!(schedules.len(), 1); - assert_eq!(schedules[0].id, schedule_id.to_string()); - assert_eq!(schedules[0].schedule_name, "list"); - } - - #[serial(postgres)] - #[tokio::test] - async fn webapp_get_schedule_happy_path() { - let backend = setup_backend().await; - let schedule_id = insert_schedule(&backend, "detail").await; - - let schedule = WebappBackend::get_schedule(&backend, schedule_id) - .await - .expect("get schedule"); - assert_eq!(schedule.id, schedule_id.to_string()); - assert_eq!(schedule.schedule_name, "detail"); - } - - #[serial(postgres)] - #[tokio::test] - async fn webapp_schedule_invocations_are_filtered_by_schedule_id() { - let backend = setup_backend().await; - let schedule_id = insert_schedule(&backend, "invocations-a").await; - let other_schedule_id = insert_schedule(&backend, "invocations-b").await; - - let running_instance_id = insert_scheduled_instance( - &backend, - schedule_id, - Utc::now() - ChronoDuration::minutes(2), - false, - ) - .await; - let completed_instance_id = insert_scheduled_instance( - &backend, - schedule_id, - Utc::now() - ChronoDuration::minutes(1), - true, - ) - .await; - let _other_instance_id = - insert_scheduled_instance(&backend, other_schedule_id, Utc::now(), true).await; - - let total = WebappBackend::count_schedule_invocations(&backend, schedule_id) - .await - .expect("count schedule invocations"); - assert_eq!(total, 2); - - let invocations = 
WebappBackend::list_schedule_invocations(&backend, schedule_id, 10, 0) - .await - .expect("list schedule invocations"); - assert_eq!(invocations.len(), 2); - assert_eq!(invocations[0].id, completed_instance_id); - assert_eq!(invocations[0].status, InstanceStatus::Completed); - assert_eq!(invocations[1].id, running_instance_id); - assert_eq!(invocations[1].status, InstanceStatus::Running); - } - - #[serial(postgres)] - #[tokio::test] - async fn webapp_update_schedule_status_happy_path() { - let backend = setup_backend().await; - let schedule_id = insert_schedule(&backend, "update").await; - - let updated = WebappBackend::update_schedule_status(&backend, schedule_id, "paused") - .await - .expect("update schedule status"); - assert!(updated); - - let schedule = WebappBackend::get_schedule(&backend, schedule_id) - .await - .expect("get schedule"); - assert_eq!(schedule.status, "paused"); - } - - #[serial(postgres)] - #[tokio::test] - async fn webapp_get_distinct_schedule_statuses_happy_path() { - let backend = setup_backend().await; - - let statuses = WebappBackend::get_distinct_schedule_statuses(&backend) - .await - .expect("get distinct schedule statuses"); - assert_eq!(statuses, vec!["active", "paused"]); - } - - #[serial(postgres)] - #[tokio::test] - async fn webapp_get_distinct_schedule_types_happy_path() { - let backend = setup_backend().await; - - let types = WebappBackend::get_distinct_schedule_types(&backend) - .await - .expect("get distinct schedule types"); - assert_eq!(types, vec!["cron", "interval"]); - } - - #[serial(postgres)] - #[tokio::test] - async fn webapp_get_worker_action_stats_happy_path() { - let backend = setup_backend().await; - let pool_id = Uuid::new_v4(); - insert_worker_status(&backend, pool_id).await; - - let rows = WebappBackend::get_worker_action_stats(&backend, 60) - .await - .expect("get worker action stats"); - assert_eq!(rows.len(), 1); - assert_eq!(rows[0].pool_id, pool_id.to_string()); - assert_eq!(rows[0].total_completed, 20); - 
} - - #[serial(postgres)] - #[tokio::test] - async fn webapp_get_worker_aggregate_stats_happy_path() { - let backend = setup_backend().await; - insert_worker_status(&backend, Uuid::new_v4()).await; - - let aggregate = WebappBackend::get_worker_aggregate_stats(&backend, 60) - .await - .expect("get worker aggregate stats"); - assert_eq!(aggregate.active_worker_count, 1); - assert_eq!(aggregate.total_in_flight, 2); - assert_eq!(aggregate.total_queue_depth, 3); - } - - #[serial(postgres)] - #[tokio::test] - async fn webapp_worker_status_table_exists_happy_path() { - let backend = setup_backend().await; - - assert!(WebappBackend::worker_status_table_exists(&backend).await); - } - - #[serial(postgres)] - #[tokio::test] - async fn webapp_schedules_table_exists_happy_path() { - let backend = setup_backend().await; - - assert!(WebappBackend::schedules_table_exists(&backend).await); - } - - #[serial(postgres)] - #[tokio::test] - async fn webapp_get_worker_statuses_happy_path() { - let backend = setup_backend().await; - let pool_id = Uuid::new_v4(); - insert_worker_status(&backend, pool_id).await; - let (completed_instance_id, _, _) = insert_instance_with_graph(&backend).await; - let completed_payload = - rmp_serde::to_vec_named(&serde_json::json!({"ok": true})).expect("encode result"); - sqlx::query("UPDATE runner_instances SET result = $2 WHERE instance_id = $1") - .bind(completed_instance_id) - .bind(completed_payload) - .execute(backend.pool()) - .await - .expect("mark instance completed"); - - let (failed_instance_id, _, _) = insert_instance_with_graph(&backend).await; - let error_payload = rmp_serde::to_vec_named(&serde_json::json!({ - "type": "Exception", - "message": "boom", - })) - .expect("encode error"); - sqlx::query("UPDATE runner_instances SET error = $2 WHERE instance_id = $1") - .bind(failed_instance_id) - .bind(error_payload) - .execute(backend.pool()) - .await - .expect("mark instance failed"); - - let statuses = WebappBackend::get_worker_statuses(&backend, 
60) - .await - .expect("get worker statuses"); - assert_eq!(statuses.len(), 1); - assert_eq!(statuses[0].pool_id, pool_id); - assert_eq!(statuses[0].total_completed, 20); - assert_eq!(statuses[0].total_instances_completed, 1); - assert_eq!(statuses[0].total_in_flight, Some(2)); - assert_eq!(statuses[0].dispatch_queue_size, Some(3)); - } -} diff --git a/crates/waymark/src/bin/integration_test.rs b/crates/waymark/src/bin/integration_test.rs index 35f86fdf..e7faa3ae 100644 --- a/crates/waymark/src/bin/integration_test.rs +++ b/crates/waymark/src/bin/integration_test.rs @@ -19,17 +19,16 @@ use serde_json::Value; use sqlx::Row; use uuid::Uuid; -use waymark::backends::{ - CoreBackend, MemoryBackend, PostgresBackend, QueuedInstance, WorkflowRegistration, - WorkflowRegistryBackend, -}; -use waymark::db; -use waymark::integration_support::{LOCAL_POSTGRES_DSN, connect_pool, ensure_local_postgres}; use waymark::messages::ast as ir; use waymark::waymark_core::runloop::{RunLoop, RunLoopSupervisorConfig}; -use waymark::waymark_core::runner::RunnerState; use waymark::workers::{PythonWorkerConfig, RemoteWorkerPool}; +use waymark_backend_memory::MemoryBackend; +use waymark_backend_postgres::PostgresBackend; +use waymark_core_backend::{CoreBackend, QueuedInstance}; use waymark_dag::{DAG, convert_to_dag}; +use waymark_integration_support::{LOCAL_POSTGRES_DSN, connect_pool, ensure_local_postgres}; +use waymark_runner_state::RunnerState; +use waymark_workflow_registry_backend::{WorkflowRegistration, WorkflowRegistryBackend}; #[derive(Parser, Debug)] #[command(name = "integration_test")] @@ -452,7 +451,7 @@ async fn connect_postgres_backend() -> Result { let pool = connect_pool(&dsn) .await .with_context(|| format!("connect postgres backend: {dsn}"))?; - db::run_migrations(&pool) + waymark_backend_postgres_migrations::run(&pool) .await .context("run postgres migrations for integration runner")?; Ok(PostgresBackend::new(pool)) diff --git a/crates/waymark/src/bin/soak-harness.rs 
b/crates/waymark/src/bin/soak-harness.rs index d4e77388..2bccbb4e 100644 --- a/crates/waymark/src/bin/soak-harness.rs +++ b/crates/waymark/src/bin/soak-harness.rs @@ -29,14 +29,13 @@ use tokio::process::{Child, Command}; use tracing::{error, info, warn}; use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt}; use uuid::Uuid; -use waymark::backends::{ - PostgresBackend, QueuedInstance, WorkflowRegistration, WorkflowRegistryBackend, -}; -use waymark::db; use waymark::messages::ast as ir; -use waymark::waymark_core::runner::RunnerState; +use waymark_backend_postgres::PostgresBackend; +use waymark_core_backend::QueuedInstance; use waymark_dag::{DAG, convert_to_dag}; use waymark_ir_parser::parse_program; +use waymark_runner_state::RunnerState; +use waymark_workflow_registry_backend::{WorkflowRegistration, WorkflowRegistryBackend as _}; const DEFAULT_DSN: &str = "postgresql://waymark:waymark@127.0.0.1:5433/waymark"; const DEFAULT_WORKFLOW_NAME: &str = "waymark_soak_timeout_mix_v1"; @@ -287,7 +286,7 @@ async fn main() -> Result<()> { } let pool = wait_for_database(&args.dsn, DB_READY_TIMEOUT).await?; - db::run_migrations(&pool) + waymark_backend_postgres_migrations::run(&pool) .await .context("run migrations before soak")?; diff --git a/crates/waymark/src/bin/start-workers.rs b/crates/waymark/src/bin/start-workers.rs index 2bf9733d..8aa1c493 100644 --- a/crates/waymark/src/bin/start-workers.rs +++ b/crates/waymark/src/bin/start-workers.rs @@ -43,13 +43,12 @@ use tracing::{error, info, warn}; use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt}; use uuid::Uuid; -use waymark::backends::PostgresBackend; use waymark::config::WorkerConfig; -use waymark::db; use waymark::messages::ast as ir; use waymark::scheduler::{DagResolver, WorkflowDag}; use waymark::waymark_core::runloop::{RunLoopSupervisorConfig, runloop_supervisor}; use waymark::{PythonWorkerConfig, RemoteWorkerPool, WebappServer, spawn_status_reporter}; +use 
waymark_backend_postgres::PostgresBackend; use waymark_dag::convert_to_dag; #[tokio::main] @@ -87,7 +86,7 @@ async fn main() -> Result<()> { // Initialize the database and backend. let pool = PgPool::connect(&config.database_url).await?; - db::run_migrations(&pool).await?; + waymark_backend_postgres_migrations::run(&pool).await?; let backend = PostgresBackend::new(pool); // Start the worker pool (bridge + python workers). diff --git a/crates/waymark/src/bin/waymark-bridge.rs b/crates/waymark/src/bin/waymark-bridge.rs index 1bc6ac18..878e6a6e 100644 --- a/crates/waymark/src/bin/waymark-bridge.rs +++ b/crates/waymark/src/bin/waymark-bridge.rs @@ -29,18 +29,22 @@ use tracing::{debug, info}; use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt}; use uuid::Uuid; -use waymark::backends::{ - ActionDone, BackendError, BackendResult, CoreBackend, GraphUpdate, InstanceDone, - InstanceLockStatus, LockClaim, PostgresBackend, QueuedInstance, QueuedInstanceBatch, - SchedulerBackend, WorkflowRegistration, WorkflowRegistryBackend, WorkflowVersion, -}; -use waymark::db; use waymark::messages::{self, ast as ir, proto}; -use waymark::scheduler::{CreateScheduleParams, ScheduleId, ScheduleStatus, ScheduleType}; use waymark::waymark_core::runloop::{RunLoop, RunLoopSupervisorConfig}; -use waymark::waymark_core::runner::RunnerState; use waymark::workers::{ActionCompletion, ActionRequest, BaseWorkerPool, WorkerPoolError}; +use waymark_backend_postgres::PostgresBackend; +use waymark_backends_core::{BackendError, BackendResult}; +use waymark_core_backend::{ + ActionDone, CoreBackend, GraphUpdate, InstanceDone, InstanceLockStatus, LockClaim, + QueuedInstance, QueuedInstanceBatch, +}; use waymark_dag::convert_to_dag; +use waymark_runner_state::RunnerState; +use waymark_scheduler_backend::SchedulerBackend as _; +use waymark_scheduler_core::{CreateScheduleParams, ScheduleId, ScheduleStatus, ScheduleType}; +use waymark_workflow_registry_backend::{ + WorkflowRegistration, 
WorkflowRegistryBackend, WorkflowVersion, +}; const DEFAULT_GRPC_ADDR: &str = "127.0.0.1:24117"; @@ -52,7 +56,7 @@ struct WorkflowStore { impl WorkflowStore { async fn connect(dsn: &str) -> Result { let pool = PgPool::connect(dsn).await?; - db::run_migrations(&pool).await?; + waymark_backend_postgres_migrations::run(&pool).await?; let backend = PostgresBackend::new(pool); Ok(Self { backend }) } diff --git a/crates/waymark/src/db.rs b/crates/waymark/src/db.rs deleted file mode 100644 index f89f0e04..00000000 --- a/crates/waymark/src/db.rs +++ /dev/null @@ -1,14 +0,0 @@ -//! Database helpers shared across services. - -use sqlx::PgPool; - -use crate::backends::{BackendError, BackendResult}; - -/// Run the embedded SQLx migrations. -pub async fn run_migrations(pool: &PgPool) -> BackendResult<()> { - sqlx::migrate!() - .run(pool) - .await - .map_err(|err| BackendError::Message(err.to_string()))?; - Ok(()) -} diff --git a/crates/waymark/src/garbage_collector/task.rs b/crates/waymark/src/garbage_collector/task.rs index a96a280d..e1f673ef 100644 --- a/crates/waymark/src/garbage_collector/task.rs +++ b/crates/waymark/src/garbage_collector/task.rs @@ -6,8 +6,7 @@ use std::time::Duration; use chrono::Utc; use tracing::{debug, error, info}; - -use crate::backends::{GarbageCollectionResult, GarbageCollectorBackend}; +use waymark_garbage_collector_backend::{GarbageCollectionResult, GarbageCollectorBackend}; /// Configuration for the garbage collector task. 
#[derive(Debug, Clone)] @@ -120,9 +119,9 @@ mod tests { use chrono::{Duration as ChronoDuration, Utc}; use tonic::async_trait; + use waymark_backends_core::BackendResult; use super::*; - use crate::backends::{BackendResult, GarbageCollectorBackend}; #[derive(Clone)] struct StubGarbageCollectorBackend { diff --git a/crates/waymark/src/integration_support/mod.rs b/crates/waymark/src/integration_support/mod.rs deleted file mode 100644 index db198a86..00000000 --- a/crates/waymark/src/integration_support/mod.rs +++ /dev/null @@ -1,5 +0,0 @@ -//! Shared integration harness helpers used by test binaries and Rust tests. - -mod postgres; - -pub use postgres::{LOCAL_POSTGRES_DSN, connect_pool, ensure_local_postgres}; diff --git a/crates/waymark/src/integration_support/postgres.rs b/crates/waymark/src/integration_support/postgres.rs deleted file mode 100644 index 5a876f2d..00000000 --- a/crates/waymark/src/integration_support/postgres.rs +++ /dev/null @@ -1,105 +0,0 @@ -//! Shared Postgres bootstrap for integration harnesses. - -use std::path::PathBuf; -use std::time::{Duration, Instant}; - -use anyhow::{Context, Result, anyhow, bail}; -use sqlx::{PgPool, postgres::PgPoolOptions}; -use tokio::process::Command; -use tokio::sync::OnceCell; - -use crate::db; - -pub const LOCAL_POSTGRES_DSN: &str = "postgresql://waymark:waymark@127.0.0.1:5433/waymark"; - -const READY_TIMEOUT: Duration = Duration::from_secs(45); -const RETRY_DELAY: Duration = Duration::from_millis(500); -const POOL_MAX_CONNECTIONS: u32 = 32; -const POOL_ACQUIRE_TIMEOUT: Duration = Duration::from_secs(15); - -static LOCAL_POSTGRES_BOOTSTRAPPED: OnceCell<()> = OnceCell::const_new(); - -/// Ensure the default local Postgres is available and migrated. -/// -/// This helper is intended for local integration workflows where the default -/// DSN maps to the repository docker-compose service. 
-pub async fn ensure_local_postgres() -> Result<()> { - LOCAL_POSTGRES_BOOTSTRAPPED - .get_or_try_init(|| async { ensure_local_postgres_impl().await }) - .await?; - Ok(()) -} - -/// Connect a PgPool using integration defaults. -pub async fn connect_pool(dsn: &str) -> Result { - Ok(PgPoolOptions::new() - .max_connections(POOL_MAX_CONNECTIONS) - .acquire_timeout(POOL_ACQUIRE_TIMEOUT) - .connect(dsn) - .await?) -} - -async fn ensure_local_postgres_impl() -> Result<()> { - if let Ok(pool) = connect_pool(LOCAL_POSTGRES_DSN).await { - db::run_migrations(&pool) - .await - .context("run migrations for existing local postgres")?; - pool.close().await; - return Ok(()); - } - - run_compose_up().await?; - let pool = wait_for_postgres(LOCAL_POSTGRES_DSN).await?; - db::run_migrations(&pool) - .await - .context("run migrations for local postgres")?; - pool.close().await; - Ok(()) -} - -async fn run_compose_up() -> Result<()> { - let root = project_root(); - let status = Command::new("docker") - .arg("compose") - .arg("-f") - .arg("../../docker-compose.yml") - .arg("up") - .arg("-d") - .arg("postgres") - .current_dir(&root) - .status() - .await - .with_context(|| format!("failed to run docker compose in {}", root.display()))?; - - if !status.success() { - bail!("docker compose up -d postgres exited with status {status}"); - } - - Ok(()) -} - -async fn wait_for_postgres(dsn: &str) -> Result { - let deadline = Instant::now() + READY_TIMEOUT; - let mut last_error = None; - - while Instant::now() < deadline { - match connect_pool(dsn).await { - Ok(pool) => return Ok(pool), - Err(err) => { - last_error = Some(err); - tokio::time::sleep(RETRY_DELAY).await; - } - } - } - - Err(anyhow!( - "timed out waiting for postgres at {dsn}; last error: {}", - last_error - .map(|err| err.to_string()) - .unwrap_or_else(|| "unknown".to_string()) - )) -} - -fn project_root() -> PathBuf { - PathBuf::from(env!("CARGO_MANIFEST_DIR")) -} diff --git a/crates/waymark/src/lib.rs b/crates/waymark/src/lib.rs 
index 66945900..568c1ba0 100644 --- a/crates/waymark/src/lib.rs +++ b/crates/waymark/src/lib.rs @@ -1,17 +1,12 @@ //! Waymark - worker pool infrastructure plus the core IR/runtime port. -pub mod backends; pub mod config; -pub mod db; pub mod garbage_collector; -pub mod integration_support; pub mod messages; pub mod observability; pub mod pool_status; pub mod scheduler; pub mod server_worker; -#[cfg(test)] -pub mod test_support; pub mod waymark_core; pub mod webapp; pub mod workers; @@ -21,10 +16,7 @@ pub use garbage_collector::{GarbageCollectorConfig, GarbageCollectorTask}; pub use messages::{MessageError, ast as ir_ast, proto, workflow_argument_value_to_json}; pub use observability::obs; pub use pool_status::{PoolTimeSeries, TimeSeriesEntry, TimeSeriesJsonEntry}; -pub use scheduler::{ - CreateScheduleParams, ScheduleId, ScheduleType, SchedulerConfig, SchedulerTask, - WorkflowSchedule, -}; +pub use scheduler::{SchedulerConfig, SchedulerTask}; pub use server_worker::{WorkerBridgeChannels, WorkerBridgeServer}; pub use webapp::{WebappConfig, WebappServer}; pub use workers::{ diff --git a/crates/waymark/src/observability.rs b/crates/waymark/src/observability.rs index dbb8a7af..49c1700f 100644 --- a/crates/waymark/src/observability.rs +++ b/crates/waymark/src/observability.rs @@ -1,6 +1,6 @@ //! Observability helpers for optional tracing instrumentation. -pub use waymark_observability_macros::obs; +pub use waymark_observability::obs; #[cfg(feature = "trace")] use std::sync::OnceLock; diff --git a/crates/waymark/src/scheduler/mod.rs b/crates/waymark/src/scheduler/mod.rs index d1ba2abb..d14adfc4 100644 --- a/crates/waymark/src/scheduler/mod.rs +++ b/crates/waymark/src/scheduler/mod.rs @@ -6,9 +6,5 @@ //! 
- Cron and interval utilities mod task; -mod types; -mod utils; pub use task::{DagResolver, SchedulerConfig, SchedulerTask, WorkflowDag}; -pub use types::{CreateScheduleParams, ScheduleId, ScheduleStatus, ScheduleType, WorkflowSchedule}; -pub use utils::{apply_jitter, compute_next_run, next_cron_run, next_interval_run, validate_cron}; diff --git a/crates/waymark/src/scheduler/task.rs b/crates/waymark/src/scheduler/task.rs index 9f9550cd..283745cc 100644 --- a/crates/waymark/src/scheduler/task.rs +++ b/crates/waymark/src/scheduler/task.rs @@ -9,9 +9,9 @@ use std::time::Duration; use serde_json::Value; use tracing::{debug, error, info}; use uuid::Uuid; +use waymark_core_backend::QueuedInstance; +use waymark_scheduler_core::{ScheduleId, WorkflowSchedule}; -use super::types::{ScheduleId, WorkflowSchedule}; -use crate::backends::{CoreBackend, QueuedInstance, SchedulerBackend}; use crate::messages; use crate::messages::ast as ir; use waymark_dag::DAG; @@ -53,7 +53,8 @@ pub struct SchedulerTask { impl SchedulerTask where - B: CoreBackend + SchedulerBackend + Clone + Send + Sync + 'static, + B: waymark_core_backend::CoreBackend + waymark_scheduler_backend::SchedulerBackend, + B: Clone + Send + Sync + 'static, { /// Run the scheduler loop. 
pub async fn run(self, shutdown: tokio_util::sync::WaitForCancellationFutureOwned) { @@ -153,12 +154,8 @@ where .as_ref() .ok_or_else(|| "DAG has no entry node".to_string())?; - let mut state = crate::waymark_core::runner::RunnerState::new( - Some(Arc::clone(&dag)), - None, - None, - false, - ); + let mut state = + waymark_runner_state::RunnerState::new(Some(Arc::clone(&dag)), None, None, false); if let Some(input_payload) = schedule.input_payload.as_deref() { let inputs = messages::workflow_arguments_to_json(input_payload) .ok_or_else(|| "failed to decode schedule input payload".to_string())?; @@ -278,14 +275,16 @@ mod tests { use chrono::{Duration as ChronoDuration, Utc}; use prost::Message; use serde_json::Value; + use waymark_backend_memory::MemoryBackend; + use waymark_core_backend::{CoreBackend, LockClaim}; + use waymark_scheduler_backend::SchedulerBackend; + use waymark_scheduler_core::{CreateScheduleParams, ScheduleType}; use super::*; - use crate::backends::{CoreBackend, LockClaim, MemoryBackend, SchedulerBackend}; use crate::messages::proto; - use crate::scheduler::{CreateScheduleParams, ScheduleType}; - use crate::waymark_core::ir_parser::parse_program; - use crate::waymark_core::runner::RunnerExecutor; use waymark_dag::convert_to_dag; + use waymark_ir_parser::parse_program; + use waymark_runner::RunnerExecutor; fn workflow_args_payload(key: &str, value: i64) -> Vec { proto::WorkflowArguments { @@ -374,11 +373,8 @@ fn main(input: [number], output: [result]): let state = queued.state.clone().expect("queued state"); let mut executor = RunnerExecutor::new(Arc::clone(&dag), state, queued.action_results.clone(), None); - let replay = crate::waymark_core::runner::replay_variables( - executor.state(), - executor.action_results(), - ) - .expect("replay inputs"); + let replay = waymark_runner::replay_variables(executor.state(), executor.action_results()) + .expect("replay inputs"); assert_eq!( replay.variables.get("number"), Some(&Value::Number(7.into())) diff 
--git a/crates/waymark/src/scheduler/types.rs b/crates/waymark/src/scheduler/types.rs deleted file mode 100644 index 4f8c9104..00000000 --- a/crates/waymark/src/scheduler/types.rs +++ /dev/null @@ -1,139 +0,0 @@ -//! Schedule types. - -use chrono::{DateTime, Utc}; -use serde::{Deserialize, Serialize}; -use uuid::Uuid; - -/// Unique identifier for a schedule. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] -pub struct ScheduleId(pub Uuid); - -impl ScheduleId { - pub fn new() -> Self { - Self(Uuid::new_v4()) - } -} - -impl Default for ScheduleId { - fn default() -> Self { - Self::new() - } -} - -impl std::fmt::Display for ScheduleId { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.0) - } -} - -/// Type of schedule. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -#[serde(rename_all = "lowercase")] -pub enum ScheduleType { - Cron, - Interval, -} - -impl ScheduleType { - pub fn as_str(&self) -> &'static str { - match self { - Self::Cron => "cron", - Self::Interval => "interval", - } - } - - pub fn parse(s: &str) -> Option { - match s { - "cron" => Some(Self::Cron), - "interval" => Some(Self::Interval), - _ => None, - } - } -} - -impl std::fmt::Display for ScheduleType { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.as_str()) - } -} - -/// Status of a workflow schedule. 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -#[serde(rename_all = "lowercase")] -pub enum ScheduleStatus { - Active, - Paused, - Deleted, -} - -impl ScheduleStatus { - pub fn as_str(&self) -> &'static str { - match self { - Self::Active => "active", - Self::Paused => "paused", - Self::Deleted => "deleted", - } - } - - pub fn parse(s: &str) -> Option { - match s { - "active" => Some(Self::Active), - "paused" => Some(Self::Paused), - "deleted" => Some(Self::Deleted), - _ => None, - } - } -} - -impl std::fmt::Display for ScheduleStatus { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.as_str()) - } -} - -/// A workflow schedule (recurring execution). -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct WorkflowSchedule { - pub id: Uuid, - pub workflow_name: String, - pub schedule_name: String, - pub schedule_type: String, - pub cron_expression: Option, - pub interval_seconds: Option, - pub jitter_seconds: i64, - pub input_payload: Option>, - pub status: String, - pub next_run_at: Option>, - pub last_run_at: Option>, - pub last_instance_id: Option, - pub created_at: DateTime, - pub updated_at: DateTime, - pub priority: i32, - pub allow_duplicate: bool, -} - -impl WorkflowSchedule { - /// Get the schedule type as an enum. - pub fn schedule_type_enum(&self) -> Option { - ScheduleType::parse(&self.schedule_type) - } - - /// Get the status as an enum. - pub fn status_enum(&self) -> Option { - ScheduleStatus::parse(&self.status) - } -} - -/// Parameters for creating a schedule. 
-#[derive(Debug, Clone)] -pub struct CreateScheduleParams { - pub workflow_name: String, - pub schedule_name: String, - pub schedule_type: ScheduleType, - pub cron_expression: Option, - pub interval_seconds: Option, - pub jitter_seconds: i64, - pub input_payload: Option>, - pub priority: i32, - pub allow_duplicate: bool, -} diff --git a/crates/waymark/src/scheduler/utils.rs b/crates/waymark/src/scheduler/utils.rs deleted file mode 100644 index 4530329f..00000000 --- a/crates/waymark/src/scheduler/utils.rs +++ /dev/null @@ -1,181 +0,0 @@ -//! Cron and interval schedule utilities. -//! -//! This module provides utilities for computing the next run time for -//! cron expressions and fixed intervals. -//! -//! Note: This module accepts standard 5-field Unix cron expressions -//! (minute, hour, day-of-month, month, day-of-week) and converts them -//! to 6-field format (with seconds) for the `cron` crate. - -use chrono::{DateTime, Utc}; -use cron::Schedule; -use rand::Rng; -use std::str::FromStr; - -use super::ScheduleType; - -/// Convert a 5-field Unix cron expression to 6-field format. -/// -/// The `cron` crate requires 6 fields (sec min hour dom month dow), -/// but standard Unix cron uses 5 fields (min hour dom month dow). -/// This function prepends "0 " to run at second 0 of each match. -fn normalize_cron_expr(cron_expr: &str) -> String { - let fields: Vec<&str> = cron_expr.split_whitespace().collect(); - if fields.len() == 5 { - // Standard 5-field cron: prepend "0" for seconds - format!("0 {}", cron_expr) - } else { - // Already 6+ fields, use as-is - cron_expr.to_string() - } -} - -/// Compute the next run time for a cron expression. -/// -/// Accepts standard 5-field Unix cron expressions (e.g., "0 * * * *" for hourly) -/// or 6-field expressions with seconds. -/// -/// Returns the next occurrence after the current time (UTC). 
-pub fn next_cron_run(cron_expr: &str) -> Result, String> { - let normalized = normalize_cron_expr(cron_expr); - let schedule = Schedule::from_str(&normalized) - .map_err(|e| format!("Invalid cron expression '{}': {}", cron_expr, e))?; - schedule - .upcoming(Utc) - .next() - .ok_or_else(|| "No upcoming schedule found".to_string()) -} - -/// Compute the next run time for an interval-based schedule. -/// -/// If `last_run_at` is provided, the next run is `last_run_at + interval_seconds`. -/// Otherwise, the next run is `now + interval_seconds`. -pub fn next_interval_run( - interval_seconds: i64, - last_run_at: Option>, -) -> DateTime { - let base = last_run_at.unwrap_or_else(Utc::now); - base + chrono::Duration::seconds(interval_seconds) -} - -/// Validate a cron expression without computing the next run. -/// -/// Accepts standard 5-field Unix cron expressions or 6-field expressions. -pub fn validate_cron(cron_expr: &str) -> Result<(), String> { - let normalized = normalize_cron_expr(cron_expr); - Schedule::from_str(&normalized) - .map(|_| ()) - .map_err(|e| format!("Invalid cron expression '{}': {}", cron_expr, e)) -} - -/// Apply a random jitter delay (in seconds) to a scheduled time. -/// -/// If `jitter_seconds` is 0, the base time is returned unchanged. -pub fn apply_jitter(base: DateTime, jitter_seconds: i64) -> Result, String> { - if jitter_seconds < 0 { - return Err("jitter_seconds must be non-negative".to_string()); - } - if jitter_seconds == 0 { - return Ok(base); - } - let jitter = rand::thread_rng().gen_range(0..=jitter_seconds); - Ok(base + chrono::Duration::seconds(jitter)) -} - -/// Compute the next run time for a schedule type with optional jitter. 
-pub fn compute_next_run( - schedule_type: ScheduleType, - cron_expression: Option<&str>, - interval_seconds: Option, - jitter_seconds: i64, - last_run_at: Option>, -) -> Result, String> { - let base = match schedule_type { - ScheduleType::Cron => { - let expr = cron_expression.ok_or_else(|| "cron expression required".to_string())?; - next_cron_run(expr)? - } - ScheduleType::Interval => { - let seconds = - interval_seconds.ok_or_else(|| "interval_seconds required".to_string())?; - next_interval_run(seconds, last_run_at) - } - }; - - apply_jitter(base, jitter_seconds) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_normalize_cron_expr() { - // 5-field should get "0 " prepended - assert_eq!(normalize_cron_expr("* * * * *"), "0 * * * * *"); - assert_eq!(normalize_cron_expr("0 * * * *"), "0 0 * * * *"); - - // 6-field should remain unchanged - assert_eq!(normalize_cron_expr("0 0 * * * *"), "0 0 * * * *"); - } - - #[test] - fn test_valid_cron_expression() { - // Standard 5-field Unix cron expressions - assert!(validate_cron("0 * * * *").is_ok()); - assert!(validate_cron("0 0 * * *").is_ok()); - assert!(validate_cron("* * * * *").is_ok()); - - // 6-field expression with seconds - assert!(validate_cron("0 0 * * * *").is_ok()); - } - - #[test] - fn test_invalid_cron_expression() { - assert!(validate_cron("invalid").is_err()); - assert!(validate_cron("").is_err()); - } - - #[test] - fn test_next_cron_run() { - // Every minute should return a time in the future - let next = next_cron_run("* * * * *").unwrap(); - assert!(next > Utc::now()); - } - - #[test] - fn test_next_interval_run_from_now() { - let before = Utc::now(); - let next = next_interval_run(3600, None); - let after = Utc::now(); - - // Should be approximately 1 hour from now - assert!(next >= before + chrono::Duration::seconds(3600)); - assert!(next <= after + chrono::Duration::seconds(3600)); - } - - #[test] - fn test_next_interval_run_from_last() { - let last_run = Utc::now() - 
chrono::Duration::seconds(1800); - let next = next_interval_run(3600, Some(last_run)); - - // Should be 1 hour after last_run (30 minutes from now) - let expected = last_run + chrono::Duration::seconds(3600); - assert_eq!(next, expected); - } - - #[test] - fn test_apply_jitter_zero() { - let base = Utc::now(); - let jittered = apply_jitter(base, 0).unwrap(); - assert_eq!(jittered, base); - } - - #[test] - fn test_apply_jitter_range() { - let base = Utc::now(); - let jittered = apply_jitter(base, 5).unwrap(); - assert!(jittered >= base); - assert!(jittered <= base + chrono::Duration::seconds(5)); - } -} diff --git a/crates/waymark/src/test_support/mod.rs b/crates/waymark/src/test_support/mod.rs deleted file mode 100644 index 5e34abaa..00000000 --- a/crates/waymark/src/test_support/mod.rs +++ /dev/null @@ -1,5 +0,0 @@ -//! Shared test fixtures for Rust tests. - -mod postgres; - -pub use postgres::postgres_setup; diff --git a/crates/waymark/src/test_support/postgres.rs b/crates/waymark/src/test_support/postgres.rs deleted file mode 100644 index 1fb9e50f..00000000 --- a/crates/waymark/src/test_support/postgres.rs +++ /dev/null @@ -1,15 +0,0 @@ -//! Shared Postgres fixture bootstrapped from root docker-compose. - -use sqlx::PgPool; - -use crate::integration_support::{LOCAL_POSTGRES_DSN, connect_pool, ensure_local_postgres}; - -/// Ensure test Postgres is available and migrated, then return a pooled connection. 
-pub async fn postgres_setup() -> PgPool { - ensure_local_postgres() - .await - .unwrap_or_else(|err| panic!("postgres_setup bootstrap failed: {err:#}")); - connect_pool(LOCAL_POSTGRES_DSN) - .await - .unwrap_or_else(|err| panic!("postgres_setup connect failed: {err:#}")) -} diff --git a/crates/waymark/src/waymark_core/cli/benchmark.rs b/crates/waymark/src/waymark_core/cli/benchmark.rs index 99558eea..241afdd1 100644 --- a/crates/waymark/src/waymark_core/cli/benchmark.rs +++ b/crates/waymark/src/waymark_core/cli/benchmark.rs @@ -12,12 +12,11 @@ use serde_json::Value; use sha2::{Digest, Sha256}; use sqlx::PgPool; use uuid::Uuid; +use waymark_backend_postgres::PostgresBackend; +use waymark_core_backend::QueuedInstance; +use waymark_integration_support::{LOCAL_POSTGRES_DSN, ensure_local_postgres}; +use waymark_workflow_registry_backend::{WorkflowRegistration, WorkflowRegistryBackend as _}; -use crate::backends::{ - PostgresBackend, QueuedInstance, WorkflowRegistration, WorkflowRegistryBackend, -}; -use crate::db; -use crate::integration_support::{LOCAL_POSTGRES_DSN, ensure_local_postgres}; use crate::messages::ast as ir; use crate::observability::obs; use crate::waymark_core::cli::smoke::{ @@ -25,9 +24,9 @@ use crate::waymark_core::cli::smoke::{ build_try_except_program, build_while_loop_program, literal_from_value, }; use crate::waymark_core::runloop::{RunLoop, RunLoopSupervisorConfig}; -use crate::waymark_core::runner::RunnerState; use crate::workers::{ActionCallable, InlineWorkerPool, WorkerPoolError}; use waymark_dag::convert_to_dag; +use waymark_runner_state::RunnerState; const DEFAULT_DSN: &str = LOCAL_POSTGRES_DSN; const DEFAULT_MAX_CONCURRENT_INSTANCES: usize = 500; @@ -318,7 +317,9 @@ async fn run_benchmark( } let pool = PgPool::connect(dsn).await.expect("connect postgres"); drop_benchmark_tables(&pool).await; - db::run_migrations(&pool).await.expect("run migrations"); + waymark_backend_postgres_migrations::run(&pool) + .await + .expect("run migrations"); let 
backend = PostgresBackend::new(pool); backend.clear_all().await.expect("clear all"); let total = queue_benchmark_instances(&backend, &cases, count_per_case, batch_size).await; diff --git a/crates/waymark/src/waymark_core/cli/smoke.rs b/crates/waymark/src/waymark_core/cli/smoke.rs index abd34109..bb5a49c3 100644 --- a/crates/waymark/src/waymark_core/cli/smoke.rs +++ b/crates/waymark/src/waymark_core/cli/smoke.rs @@ -11,18 +11,18 @@ use prost::Message; use serde_json::Value; use sha2::{Digest, Sha256}; use uuid::Uuid; +use waymark_backend_memory::MemoryBackend; +use waymark_core_backend::QueuedInstance; +use waymark_workflow_registry_backend::{WorkflowRegistration, WorkflowRegistryBackend as _}; -use crate::backends::{ - MemoryBackend, QueuedInstance, WorkflowRegistration, WorkflowRegistryBackend, -}; use crate::messages::ast as ir; use crate::waymark_core::dag_viz::render_dag_image; use crate::waymark_core::ir_format::format_program; use crate::waymark_core::runloop::{RunLoop, RunLoopSupervisorConfig}; -use crate::waymark_core::runner::RunnerState; use crate::workers::{PythonWorkerConfig, RemoteWorkerPool}; use waymark_dag::convert_to_dag; use waymark_ir_parser::parse_program; +use waymark_runner_state::RunnerState; #[derive(Parser, Debug)] #[command(name = "waymark-smoke", about = "Smoke check core-python components.")] diff --git a/crates/waymark/src/waymark_core/ir_format.rs b/crates/waymark/src/waymark_core/ir_format.rs index 57a9460a..45795f0e 100644 --- a/crates/waymark/src/waymark_core/ir_format.rs +++ b/crates/waymark/src/waymark_core/ir_format.rs @@ -569,7 +569,7 @@ pub fn format_program(program: &ir::Program) -> String { #[cfg(test)] mod tests { use super::{DEFAULT_INDENT, format_program}; - use crate::waymark_core::ir_parser::IRParser; + use waymark_ir_parser::IRParser; #[test] fn test_format_program_happy_path() { diff --git a/crates/waymark/src/waymark_core/lock.rs b/crates/waymark/src/waymark_core/lock.rs index 6838407d..3f3acd5d 100644 --- 
a/crates/waymark/src/waymark_core/lock.rs +++ b/crates/waymark/src/waymark_core/lock.rs @@ -8,8 +8,7 @@ use chrono::{Duration as ChronoDuration, Utc}; use uuid::Uuid; use tracing::{debug, info, warn}; - -use crate::backends::{CoreBackend, LockClaim}; +use waymark_core_backend::LockClaim; #[derive(Clone)] pub struct InstanceLockTracker { @@ -60,7 +59,7 @@ impl InstanceLockTracker { } pub fn spawn_lock_heartbeat( - backend: Arc, + backend: Arc, tracker: InstanceLockTracker, heartbeat_interval: Duration, lock_ttl: Duration, diff --git a/crates/waymark/src/waymark_core/mod.rs b/crates/waymark/src/waymark_core/mod.rs index 5e3b9090..d5a4ec27 100644 --- a/crates/waymark/src/waymark_core/mod.rs +++ b/crates/waymark/src/waymark_core/mod.rs @@ -6,11 +6,8 @@ pub mod dag_viz; pub mod ir_format; pub mod lock; pub mod runloop; -pub mod runner; -pub use crate::backends::{InstanceDone, QueuedInstance}; pub use crate::workers::{ActionCompletion, ActionRequest, BaseWorkerPool, InlineWorkerPool}; pub use dag_viz::{build_dag_graph, render_dag_image}; pub use ir_format::format_program; pub use runloop::RunLoop; -pub use runner::RunnerState; diff --git a/crates/waymark/src/waymark_core/runloop.rs b/crates/waymark/src/waymark_core/runloop.rs index 6257bda0..407e4a3b 100644 --- a/crates/waymark/src/waymark_core/runloop.rs +++ b/crates/waymark/src/waymark_core/runloop.rs @@ -15,24 +15,26 @@ use serde_json::Value; use tokio::sync::mpsc; use tracing::{debug, error, info, warn}; use uuid::Uuid; - -use crate::backends::{ - ActionDone, BackendError, CoreBackend, GraphUpdate, InstanceDone, InstanceLockStatus, - LockClaim, QueuedInstance, QueuedInstanceBatch, WorkflowRegistryBackend, +use waymark_backends_core::BackendError; +use waymark_core_backend::{ + ActionDone, GraphUpdate, InstanceDone, InstanceLockStatus, LockClaim, QueuedInstance, + QueuedInstanceBatch, }; +use waymark_workflow_registry_backend::WorkflowRegistryBackend; + use crate::messages::ast as ir; use crate::observability::obs; 
use crate::waymark_core::commit_barrier::{CommitBarrier, DeferredInstanceEvent}; use crate::waymark_core::lock::{InstanceLockTracker, spawn_lock_heartbeat}; -use crate::waymark_core::runner::synthetic_exceptions::{ +use crate::workers::{ActionCompletion, ActionRequest, BaseWorkerPool, WorkerPoolError}; +use waymark_dag::{DAG, DAGNode, OutputNode, ReturnNode, convert_to_dag}; +use waymark_runner::synthetic_exceptions::{ SyntheticExceptionType, build_synthetic_exception_value, }; -use crate::waymark_core::runner::{ +use waymark_runner::{ DurableUpdates, ExecutorStep, RunnerExecutor, RunnerExecutorError, SleepRequest, replay_variables, }; -use crate::workers::{ActionCompletion, ActionRequest, BaseWorkerPool, WorkerPoolError}; -use waymark_dag::{DAG, DAGNode, OutputNode, ReturnNode, convert_to_dag}; /// Raised when the run loop cannot coordinate execution. #[derive(Debug, thiserror::Error)] @@ -351,7 +353,7 @@ impl ShardExecutor { fn run_executor_shard( shard_id: usize, - backend: Arc, + backend: Arc, receiver: std_mpsc::Receiver, sender: mpsc::UnboundedSender, ) { @@ -530,7 +532,7 @@ fn run_executor_shard( /// Run loop that fans out executor work across CPU-bound shard threads. 
pub struct RunLoop { worker_pool: Arc, - core_backend: Arc, + core_backend: Arc, registry_backend: Arc, workflow_cache: HashMap>, max_concurrent_instances: usize, @@ -566,7 +568,7 @@ pub struct RunLoopSupervisorConfig { impl RunLoop { pub fn new( worker_pool: impl BaseWorkerPool + 'static, - backend: impl CoreBackend + WorkflowRegistryBackend + 'static, + backend: impl waymark_core_backend::CoreBackend + WorkflowRegistryBackend + 'static, config: RunLoopSupervisorConfig, ) -> Self { Self::new_internal( @@ -580,7 +582,7 @@ impl RunLoop { pub fn new_with_shutdown( worker_pool: impl BaseWorkerPool + 'static, - backend: impl CoreBackend + WorkflowRegistryBackend + 'static, + backend: impl waymark_core_backend::CoreBackend + WorkflowRegistryBackend + 'static, config: RunLoopSupervisorConfig, shutdown_token: tokio_util::sync::CancellationToken, ) -> Self { @@ -589,14 +591,14 @@ impl RunLoop { fn new_internal( worker_pool: impl BaseWorkerPool + 'static, - backend: impl CoreBackend + WorkflowRegistryBackend + 'static, + backend: impl waymark_core_backend::CoreBackend + WorkflowRegistryBackend + 'static, config: RunLoopSupervisorConfig, shutdown_token: tokio_util::sync::CancellationToken, exit_on_idle: bool, ) -> Self { let max_concurrent_instances = std::cmp::max(1, config.max_concurrent_instances); let backend = Arc::new(backend); - let core_backend: Arc = backend.clone(); + let core_backend: Arc = backend.clone(); let registry_backend: Arc = backend; Self { worker_pool: Arc::new(worker_pool), @@ -1766,7 +1768,7 @@ pub async fn runloop_supervisor( config: RunLoopSupervisorConfig, shutdown_token: tokio_util::sync::CancellationToken, ) where - B: CoreBackend + WorkflowRegistryBackend + Clone + Send + Sync + 'static, + B: waymark_core_backend::CoreBackend + WorkflowRegistryBackend + Clone + Send + Sync + 'static, W: BaseWorkerPool + Clone + Send + Sync + 'static, { let mut backoff = Duration::from_millis(200); diff --git a/crates/waymark/src/waymark_core/runloop/tests.rs 
b/crates/waymark/src/waymark_core/runloop/tests.rs index 24b0ea34..53d6c634 100644 --- a/crates/waymark/src/waymark_core/runloop/tests.rs +++ b/crates/waymark/src/waymark_core/runloop/tests.rs @@ -1,141 +1,23 @@ use super::*; use std::collections::{HashMap, VecDeque}; -use std::sync::{ - Arc, Mutex, - atomic::{AtomicBool, AtomicUsize, Ordering as AtomicOrdering}, -}; +use std::sync::{Arc, Mutex}; use std::time::Duration; use chrono::Utc; use prost::Message; use sha2::{Digest, Sha256}; -use tonic::async_trait; +use waymark_backend_fault_injection::FaultInjectingBackend; +use waymark_backend_memory::MemoryBackend; +use waymark_core_backend::{ActionAttemptStatus, CoreBackend}; +use waymark_workflow_registry_backend::WorkflowRegistration; -use crate::backends::{ - ActionAttemptStatus, BackendError, BackendResult, CoreBackend, GraphUpdate, InstanceDone, - InstanceLockStatus, LockClaim, MemoryBackend, QueuedInstanceBatch, WorkflowRegistration, - WorkflowRegistryBackend, WorkflowVersion, -}; use crate::messages::ast as ir; -use crate::waymark_core::ir_parser::parse_program; -use crate::waymark_core::runner::RunnerState; -use crate::waymark_core::runner::state::NodeStatus; use crate::workers::ActionCallable; -use waymark_dag::convert_to_dag; - -#[derive(Clone)] -struct FaultInjectingBackend { - inner: MemoryBackend, - fail_get_queued_instances_with_depth_limit: Arc, - get_queued_instances_calls: Arc, -} - -impl FaultInjectingBackend { - fn with_depth_limit_poll_failures(inner: MemoryBackend) -> Self { - Self { - inner, - fail_get_queued_instances_with_depth_limit: Arc::new(AtomicBool::new(true)), - get_queued_instances_calls: Arc::new(AtomicUsize::new(0)), - } - } - - fn get_queued_instances_calls(&self) -> usize { - self.get_queued_instances_calls.load(AtomicOrdering::SeqCst) - } - - fn queue_len(&self) -> usize { - self.inner - .instance_queue() - .as_ref() - .map(|queue| queue.lock().expect("queue poisoned").len()) - .unwrap_or(0) - } - - fn instances_done_len(&self) -> 
usize { - self.inner.instances_done().len() - } -} - -#[async_trait] -impl CoreBackend for FaultInjectingBackend { - fn clone_box(&self) -> Box { - Box::new(self.clone()) - } - - async fn save_graphs( - &self, - claim: LockClaim, - graphs: &[GraphUpdate], - ) -> BackendResult> { - self.inner.save_graphs(claim, graphs).await - } - - async fn save_actions_done( - &self, - actions: &[crate::backends::ActionDone], - ) -> BackendResult<()> { - self.inner.save_actions_done(actions).await - } - async fn save_instances_done(&self, instances: &[InstanceDone]) -> BackendResult<()> { - self.inner.save_instances_done(instances).await - } - - async fn get_queued_instances( - &self, - size: usize, - claim: LockClaim, - ) -> BackendResult { - self.get_queued_instances_calls - .fetch_add(1, AtomicOrdering::SeqCst); - if self - .fail_get_queued_instances_with_depth_limit - .load(AtomicOrdering::SeqCst) - { - return Err(BackendError::Message("depth limit exceeded".to_string())); - } - self.inner.get_queued_instances(size, claim).await - } - - async fn queue_instances( - &self, - instances: &[crate::backends::QueuedInstance], - ) -> BackendResult<()> { - self.inner.queue_instances(instances).await - } - - async fn refresh_instance_locks( - &self, - claim: LockClaim, - instance_ids: &[Uuid], - ) -> BackendResult> { - self.inner.refresh_instance_locks(claim, instance_ids).await - } - - async fn release_instance_locks( - &self, - lock_uuid: Uuid, - instance_ids: &[Uuid], - ) -> BackendResult<()> { - self.inner - .release_instance_locks(lock_uuid, instance_ids) - .await - } -} - -#[async_trait] -impl WorkflowRegistryBackend for FaultInjectingBackend { - async fn upsert_workflow_version( - &self, - registration: &WorkflowRegistration, - ) -> BackendResult { - self.inner.upsert_workflow_version(registration).await - } - - async fn get_workflow_versions(&self, ids: &[Uuid]) -> BackendResult> { - self.inner.get_workflow_versions(ids).await - } -} +use waymark_dag::convert_to_dag; +use 
waymark_ir_parser::parse_program; +use waymark_runner_state::NodeStatus; +use waymark_runner_state::RunnerState; fn default_test_config(lock_uuid: Uuid) -> RunLoopSupervisorConfig { RunLoopSupervisorConfig { diff --git a/crates/waymark/src/waymark_core/runner/executor.rs b/crates/waymark/src/waymark_core/runner/executor.rs deleted file mode 100644 index e0572688..00000000 --- a/crates/waymark/src/waymark_core/runner/executor.rs +++ /dev/null @@ -1,3031 +0,0 @@ -//! Incremental DAG executor for runner state graphs. - -use std::cell::RefCell; -use std::collections::{HashMap, HashSet}; -use std::sync::Arc; -use std::time::Duration; - -use chrono::{DateTime, Utc}; -use rustc_hash::FxHashMap; -use serde_json::Value; -use uuid::Uuid; - -use crate::backends::{ActionAttemptStatus, ActionDone, CoreBackend, GraphUpdate}; -use crate::messages::ast as ir; -use crate::observability::obs; -use crate::waymark_core::runner::expression_evaluator::is_exception_value; -use crate::waymark_core::runner::retry::{ - RetryDecision, RetryPolicyEvaluator, timeout_seconds_from_policies, -}; -use crate::waymark_core::runner::state::{ - ActionCallSpec, ExecutionEdge, ExecutionNode, ExecutionNodeType, IndexValue, ListValue, - LiteralValue, NodeStatus, QueueNodeParams, RunnerState, RunnerStateError, -}; -use crate::waymark_core::runner::synthetic_exceptions::{ - SyntheticExceptionType, build_synthetic_exception_value, -}; -use crate::waymark_core::runner::value_visitor::ValueExpr; -use waymark_dag::{ - ActionCallNode, AggregatorNode, DAG, DAGEdge, DagEdgeIndex, EXCEPTION_SCOPE_VAR, EdgeType, -}; - -/// Raised when the runner executor cannot advance safely. -#[derive(Debug, thiserror::Error)] -#[error("{0}")] -pub struct RunnerExecutorError(pub String); - -#[derive(Clone, Debug)] -/// Persistence payloads required before dispatching new actions. 
-/// These need to be written to the backends in order to ensure that we can mark any -/// inflight actions as failed before queuing them up again -pub struct DurableUpdates { - pub actions_done: Vec, - pub graph_updates: Vec, -} - -#[derive(Clone, Debug)] -/// Return value for executor steps with newly queued action nodes. -pub struct ExecutorStep { - pub actions: Vec, - pub sleep_requests: Vec, - pub updates: Option, -} - -#[derive(Clone, Debug)] -/// Sleep requests emitted by the executor with wake-up times. -pub struct SleepRequest { - pub node_id: Uuid, - pub wake_at: DateTime, -} - -/// Action result payloads keyed by execution node id. -type ExecutionResultMap = HashMap; - -struct FinishedNodeOutcome { - /// Node to continue graph traversal from. - start: Option, - /// Exception payload forwarded to exception edges. - exception_value: Option, - /// Durable attempt metadata for this finished action (if applicable). - action_done: Option, - /// Retry action to dispatch immediately after state transition. - retry_action: Option, -} - -#[derive(Default)] -struct IncrementAccumulator { - actions_done: Vec, - pending_starts: Vec<(ExecutionNode, Option)>, - actions: Vec, - sleep_requests: Vec, - seen_actions: HashSet, - seen_sleep_nodes: HashSet, -} - -impl IncrementAccumulator { - fn absorb_finished_outcome(&mut self, outcome: FinishedNodeOutcome) { - if let Some(start) = outcome.start { - self.pending_starts.push((start, outcome.exception_value)); - } - if let Some(done) = outcome.action_done { - self.actions_done.push(done); - } - if let Some(retry_action) = outcome.retry_action { - self.record_action(retry_action); - } - } - - fn record_action(&mut self, action: ExecutionNode) { - // Multiple finished nodes can converge on the same queued action. 
- if self.seen_actions.insert(action.node_id) { - self.actions.push(action); - } - } - - fn record_sleep_request(&mut self, sleep_request: SleepRequest) { - if self.seen_sleep_nodes.insert(sleep_request.node_id) { - self.sleep_requests.push(sleep_request); - } - } -} - -struct WalkOutcome { - actions: Vec, - sleep_requests: Vec, -} - -struct FinishedActionMetadata { - attempt: i32, - started_at: Option>, - result: Value, -} - -enum ActionFailureTransition { - RetryQueued(Box), - Failed, -} - -enum TemplateKind { - SpreadAction(Box), - Aggregator(String), - Regular(String), -} - -enum SleepDecision { - Completed, - Blocked(DateTime), -} - -/// Advance a DAG template using the current runner state and action results. -/// -/// The executor treats the DAG as a control-flow template. It queues runtime -/// execution nodes into RunnerState, unrolling loops/spreads into explicit -/// iterations, and stops when it encounters action calls that must be executed -/// by an external worker. -/// -/// This serves as a runner supervisor for a single instance that's owned -/// in memory by our logic. -/// -/// Each call to increment() starts from finished execution nodes, walks -/// downstream through inline nodes (assignments, branches, joins, etc.), and -/// returns any newly queued action nodes that are now unblocked. -pub struct RunnerExecutor { - dag: Arc, - state: RunnerState, - action_results: ExecutionResultMap, - backend: Option>, - template_index: DagEdgeIndex, - incoming_exec_edges: FxHashMap>, - /// Index: template_id -> list of execution node IDs with that template - template_to_exec_nodes: FxHashMap>, - /// Cached assignment evaluations for the current increment pass. - /// Cleared at the start of each increment call. - eval_cache: RefCell>, - instance_id: Option, - terminal_error: Option, -} - -impl RunnerExecutor { - pub fn new( - dag: Arc, - state: RunnerState, - // Action results keyed by execution node id. 
- action_results: ExecutionResultMap, - backend: Option>, - ) -> Self { - let mut state = state; - state.dag = Some(dag.clone()); - state.set_link_queued_nodes(false); - - let template_index = dag.edge_index(); - let incoming_exec_edges = Self::build_incoming_exec_edges(&state); - let template_to_exec_nodes = Self::build_template_to_exec_nodes(&state); - - Self { - dag, - state, - action_results, - backend, - template_index, - incoming_exec_edges, - template_to_exec_nodes, - eval_cache: RefCell::new(FxHashMap::default()), - instance_id: None, - terminal_error: None, - } - } - - pub fn state(&self) -> &RunnerState { - &self.state - } - - pub fn state_mut(&mut self) -> &mut RunnerState { - &mut self.state - } - - pub fn dag(&self) -> &DAG { - &self.dag - } - - pub fn action_results(&self) -> &ExecutionResultMap { - &self.action_results - } - - pub fn instance_id(&self) -> Option { - self.instance_id - } - - pub fn set_instance_id(&mut self, instance_id: Uuid) { - self.instance_id = Some(instance_id); - } - - pub fn terminal_error(&self) -> Option<&Value> { - self.terminal_error.as_ref() - } - - pub(super) fn eval_cache_get(&self, key: &(Uuid, String)) -> Option { - self.eval_cache.borrow().get(key).cloned() - } - - pub(super) fn eval_cache_insert(&self, key: (Uuid, String), value: Value) { - self.eval_cache.borrow_mut().insert(key, value); - } - - /// Store an action result value for a specific execution node id. - pub fn set_action_result(&mut self, node_id: Uuid, result: Value) { - self.action_results.insert(node_id, result); - } - - /// Remove any cached action result for a specific execution node. - /// Used when re-queuing an action so we don't replay stale results. - pub fn clear_action_result(&mut self, node_id: Uuid) { - self.action_results.remove(&node_id); - } - - /// Resolve timeout policy seconds for an action node. 
- pub fn action_timeout_seconds(&self, node_id: Uuid) -> Result { - let node = self.execution_node(node_id)?; - if !node.is_action_call() { - return Ok(0); - } - let Some(action_template) = self.template_action_for_execution_node(node)? else { - return Ok(0); - }; - Ok(timeout_seconds_from_policies(&action_template.policies).unwrap_or(0)) - } - - /// Fail inflight actions and return any that should be retried. - /// - /// Use this after recovering from a crash: running actions are treated as - /// failed, their attempt counter is incremented if retry policies allow, - /// and retryable nodes are re-queued for execution. - pub fn resume(&mut self) -> Result { - let mut finished_nodes = Vec::new(); - for (node_id, node) in &self.state.nodes { - if node.is_action_call() && node.status == NodeStatus::Running { - finished_nodes.push(*node_id); - self.action_results.insert( - *node_id, - build_synthetic_exception_value( - SyntheticExceptionType::ExecutorResume, - format!( - "action {node_id} was running during resume and is treated as failed" - ), - Vec::new(), - ), - ); - } - } - if finished_nodes.is_empty() { - let updates = self.collect_updates(Vec::new())?; - return Ok(ExecutorStep { - actions: Vec::new(), - sleep_requests: Vec::new(), - updates, - }); - } - self.increment(&finished_nodes) - } - - /// Advance execution for finished nodes in a single batch. - /// - /// Use this when multiple actions complete in the same tick so the graph - /// update and action inserts are persisted together. - #[obs] - pub fn increment( - &mut self, - finished_nodes: &[Uuid], - ) -> Result { - self.eval_cache.borrow_mut().clear(); - let mut accum = IncrementAccumulator::default(); - self.collect_increment_results(finished_nodes, &mut accum)?; - self.walk_pending_starts(&mut accum)?; - - let IncrementAccumulator { - actions_done, - actions, - sleep_requests, - .. 
- } = accum; - let running_actions = self.mark_actions_running(&actions)?; - let updates = self.collect_updates(actions_done)?; - - // Note: Action timeouts and delayed retries require wall-clock tracking in the run loop. - // The executor only handles timeout failures once they surface as action results. - - Ok(ExecutorStep { - actions: running_actions, - sleep_requests, - updates, - }) - } - - fn collect_increment_results( - &mut self, - finished_nodes: &[Uuid], - accum: &mut IncrementAccumulator, - ) -> Result<(), RunnerExecutorError> { - for &node_id in finished_nodes { - accum.absorb_finished_outcome(self.apply_finished_node(node_id)?); - } - Ok(()) - } - - fn walk_pending_starts( - &mut self, - accum: &mut IncrementAccumulator, - ) -> Result<(), RunnerExecutorError> { - while let Some((start, exception_value)) = accum.pending_starts.pop() { - let outcome = self.walk_from(start, exception_value)?; - for action in outcome.actions { - accum.record_action(action); - } - for sleep_request in outcome.sleep_requests { - accum.record_sleep_request(sleep_request); - } - } - Ok(()) - } - - fn mark_actions_running( - &mut self, - actions: &[ExecutionNode], - ) -> Result, RunnerExecutorError> { - let mut running_actions = Vec::with_capacity(actions.len()); - for action in actions { - self.clear_action_result(action.node_id); - self.state - .mark_running(action.node_id) - .map_err(Self::state_error)?; - running_actions.push(self.execution_node_clone(action.node_id)?); - } - Ok(running_actions) - } - - /// Walk downstream from a node, executing inline nodes until blocked by an action node. 
- #[obs] - fn walk_from( - &mut self, - node: ExecutionNode, - exception_value: Option, - ) -> Result { - let mut pending = vec![(node, exception_value)]; - let mut actions = Vec::new(); - let mut sleep_requests = Vec::new(); - let mut forwarded_completed: HashSet = HashSet::new(); - - while let Some((current, current_exception)) = pending.pop() { - // template_id is the DAG node id, not the execution id. - let template_node_id = match ¤t.template_id { - Some(id) => id, - None => continue, - }; - let edges = if let Some(template_edges) = self.template_index.outgoing(template_node_id) - { - self.select_edges(template_edges, ¤t, current_exception)? - } else { - continue; - }; - for edge in edges { - let successors = self.queue_successor(¤t, &edge)?; - for successor in successors { - self.handle_walk_successor( - successor, - &mut pending, - &mut actions, - &mut sleep_requests, - &mut forwarded_completed, - )?; - } - } - } - Ok(WalkOutcome { - actions, - sleep_requests, - }) - } - - fn handle_walk_successor( - &mut self, - successor: ExecutionNode, - pending: &mut Vec<(ExecutionNode, Option)>, - actions: &mut Vec, - sleep_requests: &mut Vec, - forwarded_completed: &mut HashSet, - ) -> Result<(), RunnerExecutorError> { - if self.forward_completed_successor(&successor, pending, forwarded_completed) { - return Ok(()); - } - if successor.is_action_call() { - actions.push(successor); - return Ok(()); - } - if successor.is_sleep() { - self.handle_sleep_successor(successor, pending, sleep_requests)?; - return Ok(()); - } - self.handle_inline_successor(successor, pending) - } - - fn forward_completed_successor( - &self, - successor: &ExecutionNode, - pending: &mut Vec<(ExecutionNode, Option)>, - forwarded_completed: &mut HashSet, - ) -> bool { - if successor.status != NodeStatus::Completed { - return false; - } - if forwarded_completed.insert(successor.node_id) { - // Rehydrated runs can revisit completed paths to recover downstream - // sleep/action work without mutating 
already completed nodes. - pending.push((successor.clone(), None)); - } - true - } - - fn handle_sleep_successor( - &mut self, - successor: ExecutionNode, - pending: &mut Vec<(ExecutionNode, Option)>, - sleep_requests: &mut Vec, - ) -> Result<(), RunnerExecutorError> { - if !self.inline_ready(&successor) { - return Ok(()); - } - match self.handle_sleep_node(&successor)? { - SleepDecision::Completed => pending.push((successor, None)), - SleepDecision::Blocked(wake_at) => sleep_requests.push(SleepRequest { - node_id: successor.node_id, - wake_at, - }), - } - Ok(()) - } - - fn handle_inline_successor( - &mut self, - successor: ExecutionNode, - pending: &mut Vec<(ExecutionNode, Option)>, - ) -> Result<(), RunnerExecutorError> { - if !self.inline_ready(&successor) { - return Ok(()); - } - self.execute_inline_node(&successor)?; - pending.push((successor, None)); - Ok(()) - } - - /// Update state for a finished node and return replay metadata. - #[obs] - fn apply_finished_node( - &mut self, - node_id: Uuid, - ) -> Result { - if self.execution_node(node_id)?.is_action_call() { - return self.apply_finished_action_node(node_id); - } - // Non-action nodes are inline runtime steps; completion is a status flip. 
- self.state - .mark_completed(node_id) - .map_err(Self::state_error)?; - Ok(FinishedNodeOutcome { - start: Some(self.execution_node_clone(node_id)?), - exception_value: None, - action_done: None, - retry_action: None, - }) - } - - fn apply_finished_action_node( - &mut self, - node_id: Uuid, - ) -> Result { - let metadata = self.finished_action_metadata(node_id)?; - if is_exception_value(&metadata.result) { - return self.apply_exception_action_completion(node_id, metadata); - } - self.apply_successful_action_completion(node_id, metadata) - } - - fn finished_action_metadata( - &self, - node_id: Uuid, - ) -> Result { - let node = self.execution_node(node_id)?; - let result = - self.action_results.get(&node_id).cloned().ok_or_else(|| { - RunnerExecutorError(format!("missing action result for {node_id}")) - })?; - Ok(FinishedActionMetadata { - attempt: node.action_attempt, - started_at: node.started_at, - result, - }) - } - - fn apply_successful_action_completion( - &mut self, - node_id: Uuid, - metadata: FinishedActionMetadata, - ) -> Result { - self.state - .mark_completed(node_id) - .map_err(Self::state_error)?; - let assignments = self.execution_node(node_id)?.assignments.clone(); - if !assignments.is_empty() { - self.state.mark_latest_assignments(node_id, &assignments); - } - let completed_at = self - .execution_node(node_id)? 
- .completed_at - .unwrap_or_else(Utc::now); - let action_done = build_action_done( - node_id, - metadata.attempt, - ActionAttemptStatus::Completed, - metadata.started_at, - completed_at, - metadata.result, - ); - Ok(FinishedNodeOutcome { - start: Some(self.execution_node_clone(node_id)?), - exception_value: None, - action_done: Some(action_done), - retry_action: None, - }) - } - - fn apply_exception_action_completion( - &mut self, - node_id: Uuid, - metadata: FinishedActionMetadata, - ) -> Result { - let exception_value = metadata.result; - let status = action_done_status_for_exception(&exception_value); - let finished_at = Utc::now(); - - match self.apply_action_failure_transition(node_id, Some(&exception_value), finished_at)? { - ActionFailureTransition::RetryQueued(retry_action) => { - // Retries are re-queued and dispatched in this same increment pass. - let action_done = build_action_done( - node_id, - metadata.attempt, - status, - metadata.started_at, - finished_at, - exception_value, - ); - Ok(FinishedNodeOutcome { - start: None, - exception_value: None, - action_done: Some(action_done), - retry_action: Some(*retry_action), - }) - } - ActionFailureTransition::Failed => { - // Terminal failures keep exception payloads on the node so exception - // handler edges can bind $__exception in downstream inline nodes. - if !self.failure_has_exception_handler(node_id, &exception_value)? - && self.terminal_error.is_none() - { - self.terminal_error = Some(exception_value.clone()); - } - let completed_at = self - .execution_node(node_id)? 
- .completed_at - .unwrap_or(finished_at); - let action_done = build_action_done( - node_id, - metadata.attempt, - status, - metadata.started_at, - completed_at, - exception_value.clone(), - ); - Ok(FinishedNodeOutcome { - start: Some(self.execution_node_clone(node_id)?), - exception_value: Some(exception_value), - action_done: Some(action_done), - retry_action: None, - }) - } - } - } - - fn apply_action_failure_transition( - &mut self, - node_id: Uuid, - exception_value: Option<&Value>, - finished_at: DateTime, - ) -> Result { - let should_retry = { - let node = self.execution_node(node_id)?; - self.retry_decision(node, exception_value)?.should_retry - }; - if should_retry { - let retry_node = self.transition_action_to_retry(node_id, finished_at)?; - return Ok(ActionFailureTransition::RetryQueued(Box::new(retry_node))); - } - self.transition_action_to_failed(node_id, exception_value, finished_at)?; - Ok(ActionFailureTransition::Failed) - } - - fn transition_action_to_retry( - &mut self, - node_id: Uuid, - finished_at: DateTime, - ) -> Result { - // Retry transition invariants: - // 1) bump attempt counter before re-dispatch - // 2) return to queued status - // 3) keep completion timestamp for the failed attempt - self.state - .increment_action_attempt(node_id) - .map_err(Self::state_error)?; - let should_queue = !self.state.ready_queue.contains(&node_id); - { - let node = self.execution_node_mut(node_id)?; - node.status = NodeStatus::Queued; - node.started_at = None; - node.completed_at = Some(finished_at); - } - if should_queue { - self.state.ready_queue.push(node_id); - } - self.execution_node_clone(node_id) - } - - fn transition_action_to_failed( - &mut self, - node_id: Uuid, - exception_value: Option<&Value>, - finished_at: DateTime, - ) -> Result<(), RunnerExecutorError> { - self.state.mark_failed(node_id).map_err(Self::state_error)?; - self.execution_node_mut(node_id)?.completed_at = Some(finished_at); - if let Some(exception_value) = exception_value { - 
self.assign_exception_scope(node_id, exception_value.clone())?; - } - Ok(()) - } - - fn assign_exception_scope( - &mut self, - node_id: Uuid, - exception_value: Value, - ) -> Result<(), RunnerExecutorError> { - let exception_expr = ValueExpr::Literal(LiteralValue { - value: exception_value, - }); - let mut exception_assignment = HashMap::new(); - exception_assignment.insert(EXCEPTION_SCOPE_VAR.to_string(), exception_expr.clone()); - self.execution_node_mut(node_id)? - .assignments - .insert(EXCEPTION_SCOPE_VAR.to_string(), exception_expr); - self.state - .mark_latest_assignments(node_id, &exception_assignment); - Ok(()) - } - - fn failure_has_exception_handler( - &self, - node_id: Uuid, - exception_value: &Value, - ) -> Result { - let node = self.execution_node(node_id)?; - let template_id = match &node.template_id { - Some(id) => id, - None => return Ok(false), - }; - let template_edges = match self.template_index.outgoing(template_id) { - Some(edges) => edges, - None => return Ok(false), - }; - let selected = self.select_edges(template_edges, node, Some(exception_value.clone()))?; - Ok(selected - .iter() - .any(|edge| edge.edge_type == EdgeType::StateMachine)) - } - - fn retry_decision( - &self, - node: &ExecutionNode, - exception_value: Option<&Value>, - ) -> Result { - let Some(action) = self.template_action_for_execution_node(node)? else { - return Ok(RetryDecision { - should_retry: false, - }); - }; - let exception_name = exception_value.and_then(exception_type); - let evaluator = RetryPolicyEvaluator::new(&action.policies, exception_name); - Ok(evaluator.decision(node.action_attempt)) - } - - /// Select outgoing edges based on guards and exception state. 
- fn select_edges( - &self, - edges: &[DAGEdge], - _node: &ExecutionNode, - exception_value: Option, - ) -> Result, RunnerExecutorError> { - // Fast path: exception handling - if let Some(exception_value) = exception_value { - let mut result = Vec::new(); - for edge in edges { - if edge.exception_types.is_some() && self.exception_matches(edge, &exception_value) - { - result.push(edge.clone()); - } - } - return Ok(result); - } - - // Check if we have any conditional edges (guards or else) - let has_guards = edges.iter().any(|e| e.guard_expr.is_some()); - let has_else = edges.iter().any(|e| e.is_else); - - if has_guards || has_else { - // Evaluate guards first - let mut passed = Vec::new(); - for edge in edges { - if edge.guard_expr.is_some() && self.evaluate_guard(edge.guard_expr.as_ref())? { - passed.push(edge.clone()); - } - } - if !passed.is_empty() { - return Ok(passed); - } - // Fall through to else edges - let mut else_edges = Vec::new(); - for edge in edges { - if edge.is_else { - else_edges.push(edge.clone()); - } - } - return Ok(else_edges); - } - - // Fast path: regular edges (no exceptions, guards, or else) - let mut result = Vec::with_capacity(edges.len()); - for edge in edges { - if edge.exception_types.is_none() { - result.push(edge.clone()); - } - } - Ok(result) - } - - /// Queue successor nodes for a template edge, handling spreads/aggregators. 
- fn queue_successor( - &mut self, - source: &ExecutionNode, - edge: &DAGEdge, - ) -> Result, RunnerExecutorError> { - if edge.edge_type != EdgeType::StateMachine { - return Ok(Vec::new()); - } - - // Extract info from template without holding borrow across mutable calls - let kind = { - let template = self.dag.nodes.get(&edge.target).ok_or_else(|| { - RunnerExecutorError(format!("template node not found: {}", edge.target)) - })?; - - match template { - waymark_dag::DAGNode::ActionCall(action) if action.spread_loop_var.is_some() => { - TemplateKind::SpreadAction(Box::new(action.clone())) - } - waymark_dag::DAGNode::Aggregator(_) => { - TemplateKind::Aggregator(template.id().to_string()) - } - _ => TemplateKind::Regular(template.id().to_string()), - } - }; - - match kind { - TemplateKind::SpreadAction(action) => { - self.expand_spread_action(source, action.as_ref()) - } - TemplateKind::Aggregator(template_id) => { - if let Some(existing) = self.find_connected_successor(source.node_id, &template_id) - { - return Ok(vec![existing]); - } - let agg_node = self.get_or_create_aggregator(&template_id)?; - self.add_exec_edge(source.node_id, agg_node.node_id); - Ok(vec![agg_node]) - } - TemplateKind::Regular(template_id) => { - if let Some(existing) = self.find_connected_successor(source.node_id, &template_id) - { - return Ok(vec![existing]); - } - let exec_node = self.get_or_create_exec_node(&template_id)?; - self.add_exec_edge(source.node_id, exec_node.node_id); - Ok(vec![exec_node]) - } - } - } - - /// Unroll a spread action into per-item action nodes and a shared aggregator. - /// - /// Example IR: - /// - results = spread items:item -> @work(item=item) - /// Produces one action execution node per element in items and connects - /// them to a single aggregator node for results. 
- fn expand_spread_action( - &mut self, - source: &ExecutionNode, - template: &ActionCallNode, - ) -> Result, RunnerExecutorError> { - let collection_expr = template.spread_collection_expr.as_ref().ok_or_else(|| { - RunnerExecutorError("spread action missing collection expression".to_string()) - })?; - let loop_var = template.spread_loop_var.as_ref().ok_or_else(|| { - RunnerExecutorError("spread action missing loop variable".to_string()) - })?; - let elements = self.expand_collection(collection_expr)?; - let agg_id = template.aggregates_to.as_ref().ok_or_else(|| { - RunnerExecutorError("spread action missing aggregator link".to_string()) - })?; - - let agg_node = self - .state - .queue_template_node(agg_id, None) - .map_err(|err| RunnerExecutorError(err.0))?; - if elements.is_empty() { - return Ok(vec![agg_node]); - } - - let mut created = Vec::new(); - for (idx, element) in elements.into_iter().enumerate() { - let exec_node = self.queue_action_from_template( - template, - Some(HashMap::from([(loop_var.clone(), element)])), - Some(idx as i32), - )?; - self.add_exec_edge(source.node_id, exec_node.node_id); - self.add_exec_edge(exec_node.node_id, agg_node.node_id); - created.push(exec_node); - } - Ok(created) - } - - /// Create an action execution node from a template with optional bindings. - /// - /// Example IR: - /// - @work(value=item) with local_scope{"item": LiteralValue(3)} - /// Produces an action node whose kwargs include the literal 3. 
- fn queue_action_from_template( - &mut self, - template: &ActionCallNode, - local_scope: Option>, - iteration_index: Option, - ) -> Result { - let kwargs = template - .kwarg_exprs - .iter() - .map(|(name, expr)| { - let value = self - .state - .expr_to_value(expr, local_scope.as_ref()) - .map_err(|err| RunnerExecutorError(err.0))?; - Ok((name.clone(), value)) - }) - .collect::, RunnerExecutorError>>()?; - - let spec = ActionCallSpec { - action_name: template.action_name.clone(), - module_name: template.module_name.clone(), - kwargs, - }; - let targets = template - .targets - .clone() - .or_else(|| template.target.clone().map(|target| vec![target])) - .unwrap_or_default(); - let node = self - .state - .queue_node( - ExecutionNodeType::ActionCall.as_str(), - &template.label(), - QueueNodeParams { - template_id: Some(template.id.clone()), - targets: Some(targets.clone()), - action: Some(spec.clone()), - ..QueueNodeParams::default() - }, - ) - .map_err(|err| RunnerExecutorError(err.0))?; - for value in spec.kwargs.values() { - self.state.record_data_flow_from_value(node.node_id, value); - } - let result = self - .state - .assign_action_results( - &node, - &template.action_name, - Some(&targets), - iteration_index, - false, - ) - .map_err(|err| RunnerExecutorError(err.0))?; - if let Some(node_mut) = self.state.nodes.get_mut(&node.node_id) { - node_mut.value_expr = Some(ValueExpr::ActionResult(result)); - } - Ok(node) - } - - /// Execute a non-action node inline and update assignments/edges. 
- fn execute_inline_node(&mut self, node: &ExecutionNode) -> Result<(), RunnerExecutorError> { - let template_id = node - .template_id - .as_ref() - .ok_or_else(|| RunnerExecutorError("inline node missing template id".to_string()))?; - let template = self.dag.nodes.get(template_id).ok_or_else(|| { - RunnerExecutorError(format!("template node not found: {template_id}")) - })?; - - let aggregator = match template { - waymark_dag::DAGNode::Aggregator(aggregator) => Some(aggregator.clone()), - _ => None, - }; - if let Some(aggregator) = aggregator { - self.apply_aggregator_assignments(node, &aggregator)?; - } - - self.state - .mark_completed(node.node_id) - .map_err(|err| RunnerExecutorError(err.0)) - } - - fn handle_sleep_node( - &mut self, - node: &ExecutionNode, - ) -> Result { - let now = Utc::now(); - let scheduled_at = self - .state - .nodes - .get(&node.node_id) - .and_then(|node| node.scheduled_at); - if let Some(wake_at) = scheduled_at { - if wake_at <= now { - self.state - .mark_completed(node.node_id) - .map_err(|err| RunnerExecutorError(err.0))?; - return Ok(SleepDecision::Completed); - } - return Ok(SleepDecision::Blocked(wake_at)); - } - - let value_expr = self - .state - .nodes - .get(&node.node_id) - .and_then(|node| node.value_expr.clone()) - .unwrap_or(ValueExpr::Literal(LiteralValue { - value: Value::Number(0.into()), - })); - let materialized = self.state.materialize_value(value_expr); - let duration_value = self.evaluate_value_expr(&materialized)?; - - let duration_secs = match duration_value { - Value::Number(value) => value.as_f64().ok_or_else(|| { - RunnerExecutorError("sleep duration must be a number".to_string()) - })?, - Value::Null => 0.0, - _ => { - return Err(RunnerExecutorError( - "sleep duration must be a number".to_string(), - )); - } - }; - - if !duration_secs.is_finite() { - return Err(RunnerExecutorError( - "sleep duration must be finite".to_string(), - )); - } - - if duration_secs <= 0.0 { - self.state - 
.mark_completed(node.node_id) - .map_err(|err| RunnerExecutorError(err.0))?; - return Ok(SleepDecision::Completed); - } - - let duration = Duration::from_secs_f64(duration_secs); - let chrono_duration = chrono::Duration::from_std(duration) - .map_err(|_| RunnerExecutorError("sleep duration is out of range".to_string()))?; - let wake_at = now + chrono_duration; - self.state - .set_node_scheduled_at(node.node_id, Some(wake_at)) - .map_err(|err| RunnerExecutorError(err.0))?; - Ok(SleepDecision::Blocked(wake_at)) - } - - /// Check if an inline node is ready to run based on incoming edges. - fn inline_ready(&self, node: &ExecutionNode) -> bool { - if node.status == NodeStatus::Completed { - return false; - } - let incoming = match self.incoming_exec_edges.get(&node.node_id) { - Some(edges) if !edges.is_empty() => edges, - _ => return true, // No incoming edges means ready - }; - - let template = match node - .template_id - .as_ref() - .and_then(|id| self.dag.nodes.get(id)) - { - Some(template) => template, - None => return false, - }; - - if let waymark_dag::DAGNode::Aggregator(_) = template { - if let Some(required) = self.template_index.incoming(template.id()) { - let connected = self.connected_template_sources(node.node_id); - if !required.is_subset(&connected) { - return false; - } - } - for edge in incoming { - if let Some(source) = self.state.nodes.get(&edge.source) { - if !matches!(source.status, NodeStatus::Completed | NodeStatus::Failed) { - return false; - } - } else { - return false; - } - } - return true; - } - - for edge in incoming { - if let Some(source) = self.state.nodes.get(&edge.source) { - if !matches!(source.status, NodeStatus::Completed | NodeStatus::Failed) { - return false; - } - } else { - return false; - } - } - true - } - - /// Populate aggregated list assignments for a ready aggregator node. 
- /// - /// Example: - /// - results = spread items: @work(item) - /// When all action nodes complete, the aggregator assigns - /// results = [ActionResultValue(...), ...]. - fn apply_aggregator_assignments( - &mut self, - node: &ExecutionNode, - template: &AggregatorNode, - ) -> Result<(), RunnerExecutorError> { - let targets = template - .targets - .clone() - .or_else(|| template.target.clone().map(|target| vec![target])) - .unwrap_or_default(); - if targets.len() != 1 { - return Ok(()); - } - - let incoming_nodes: Vec = self - .incoming_exec_edges - .get(&node.node_id) - .cloned() - .unwrap_or_default() - .into_iter() - .filter(|edge| edge.edge_type == EdgeType::StateMachine) - .filter_map(|edge| self.state.nodes.get(&edge.source).cloned()) - .collect(); - - let mut values = Vec::new(); - for source in &incoming_nodes { - let value_expr = source.value_expr.clone().ok_or_else(|| { - RunnerExecutorError("aggregator missing source value".to_string()) - })?; - values.push(value_expr); - } - - let ordered = self.order_aggregated_values(&incoming_nodes, &values)?; - let list_value = ValueExpr::List(ListValue { elements: ordered }); - let assignment = HashMap::from([(targets[0].clone(), list_value.clone())]); - if let Some(node_mut) = self.state.nodes.get_mut(&node.node_id) { - node_mut.assignments.extend(assignment.clone()); - } - self.state - .mark_latest_assignments(node.node_id, &assignment); - self.state - .record_data_flow_from_value(node.node_id, &list_value); - Ok(()) - } - - /// Order aggregator values by spread iteration or parallel index. - fn order_aggregated_values( - &self, - sources: &[ExecutionNode], - values: &[ValueExpr], - ) -> Result, RunnerExecutorError> { - // Order by explicit iteration/parallel indices when available, then fall back to timeline. 
- if sources.len() != values.len() { - return Err(RunnerExecutorError( - "aggregator sources/value mismatch".to_string(), - )); - } - let timeline_index: HashMap = self - .state - .timeline - .iter() - .enumerate() - .map(|(idx, node_id)| (*node_id, idx)) - .collect(); - let mut pairs: Vec<((i32, i32), ValueExpr)> = Vec::with_capacity(values.len()); - for (source, value) in sources.iter().zip(values.iter()) { - let key = self.aggregated_sort_key(source, value, &timeline_index); - pairs.push((key, value.clone())); - } - pairs.sort_by_key(|item| item.0); - Ok(pairs.into_iter().map(|(_, value)| value).collect()) - } - - fn aggregated_sort_key( - &self, - source: &ExecutionNode, - value: &ValueExpr, - timeline_index: &HashMap, - ) -> (i32, i32) { - let mut primary = 2; - let mut secondary = *timeline_index.get(&source.node_id).unwrap_or(&0) as i32; - if let ValueExpr::ActionResult(action) = value { - if let Some(iter_idx) = action.iteration_index { - primary = 0; - secondary = iter_idx; - } - } else if let Some(template_id) = &source.template_id - && let Some(waymark_dag::DAGNode::ActionCall(action)) = self.dag.nodes.get(template_id) - && let Some(idx) = action.parallel_index - { - primary = 1; - secondary = idx; - } - (primary, secondary) - } - - /// Expand a collection expression into element ValueExprs. - /// - /// Example IR: - /// - spread range(3):i -> @work(i) - /// Produces [LiteralValue(0), LiteralValue(1), LiteralValue(2)]. 
- fn expand_collection( - &mut self, - expr: &ir::Expr, - ) -> Result, RunnerExecutorError> { - let value = Self::expr_to_value(expr)?; - let value = self.state.materialize_value(value); - if let ValueExpr::List(list) = value { - return Ok(list.elements); - } - - if let ValueExpr::ActionResult(action_value) = value.clone() { - let action_result = self.resolve_action_result(&action_value)?; - if let Value::Array(items) = action_result { - return Ok(items - .iter() - .enumerate() - .map(|(idx, _)| { - ValueExpr::Index(IndexValue { - object: Box::new(ValueExpr::ActionResult(action_value.clone())), - index: Box::new(ValueExpr::Literal(LiteralValue { - value: Value::Number((idx as i64).into()), - })), - }) - }) - .collect()); - } - return Err(RunnerExecutorError( - "spread collection is not iterable".to_string(), - )); - } - - let evaluated = self.evaluate_value_expr(&value)?; - if let Value::Array(items) = evaluated { - return Ok(items - .into_iter() - .map(|item| ValueExpr::Literal(LiteralValue { value: item })) - .collect()); - } - - Err(RunnerExecutorError( - "spread collection is not iterable".to_string(), - )) - } - - fn build_incoming_exec_edges(state: &RunnerState) -> FxHashMap> { - let mut incoming: FxHashMap> = FxHashMap::default(); - for edge in &state.edges { - if edge.edge_type != EdgeType::StateMachine { - continue; - } - incoming.entry(edge.target).or_default().push(edge.clone()); - } - incoming - } - - fn build_template_to_exec_nodes(state: &RunnerState) -> FxHashMap> { - let mut index: FxHashMap> = FxHashMap::default(); - for (node_id, node) in &state.nodes { - if let Some(template_id) = &node.template_id { - index.entry(template_id.clone()).or_default().push(*node_id); - } - } - index - } - - /// Register a new execution node in the template index - fn register_exec_node(&mut self, template_id: &str, node_id: Uuid) { - self.template_to_exec_nodes - .entry(template_id.to_string()) - .or_default() - .push(node_id); - } - - fn add_exec_edge(&mut self, 
source: Uuid, target: Uuid) { - let edge = ExecutionEdge { - source, - target, - edge_type: EdgeType::StateMachine, - }; - if self.state.edges.contains(&edge) { - return; - } - self.state.edges.insert(edge.clone()); - self.incoming_exec_edges - .entry(target) - .or_default() - .push(edge); - } - - fn connected_template_sources(&self, exec_node_id: Uuid) -> HashSet { - let mut connected = HashSet::new(); - for edge in self - .incoming_exec_edges - .get(&exec_node_id) - .cloned() - .unwrap_or_default() - { - if let Some(source) = self.state.nodes.get(&edge.source) - && let Some(template_id) = &source.template_id - { - connected.insert(template_id.clone()); - } - } - connected - } - - fn find_connected_successor( - &self, - source_id: Uuid, - template_id: &str, - ) -> Option { - for edge in &self.state.edges { - if edge.edge_type != EdgeType::StateMachine || edge.source != source_id { - continue; - } - let target = self.state.nodes.get(&edge.target)?; - if target.template_id.as_deref() == Some(template_id) { - return Some(target.clone()); - } - } - None - } - - fn get_or_create_aggregator( - &mut self, - template_id: &str, - ) -> Result { - let mut candidates: Vec = self - .state - .nodes - .values() - .filter(|node| { - node.template_id.as_deref() == Some(template_id) - && node.status != NodeStatus::Completed - }) - .cloned() - .collect(); - if !candidates.is_empty() { - let timeline_index: HashMap = self - .state - .timeline - .iter() - .enumerate() - .map(|(idx, node_id)| (*node_id, idx)) - .collect(); - candidates.sort_by_key(|node| { - std::cmp::Reverse(timeline_index.get(&node.node_id).copied().unwrap_or(0)) - }); - return Ok(candidates[0].clone()); - } - self.state - .queue_template_node(template_id, None) - .map_err(|err| RunnerExecutorError(err.0)) - } - - fn get_or_create_exec_node( - &mut self, - template_id: &str, - ) -> Result { - // Use the index to find candidate nodes - O(k) where k is nodes for this template - if let Some(node_ids) = 
self.template_to_exec_nodes.get(template_id) { - // Find the most recent non-completed node - let mut best_node_id: Option = None; - let mut best_timeline_pos: Option = None; - - for &node_id in node_ids { - if let Some(node) = self.state.nodes.get(&node_id) - && !matches!(node.status, NodeStatus::Completed | NodeStatus::Failed) - { - let timeline_pos = self.state.timeline.iter().position(|&id| id == node_id); - if let Some(pos) = timeline_pos { - if best_timeline_pos.is_none() || pos > best_timeline_pos.unwrap() { - best_timeline_pos = Some(pos); - best_node_id = Some(node_id); - } - } else if best_node_id.is_none() { - best_node_id = Some(node_id); - } - } - } - - if let Some(node_id) = best_node_id { - return self - .state - .nodes - .get(&node_id) - .cloned() - .ok_or_else(|| RunnerExecutorError(format!("node disappeared: {node_id}"))); - } - } - - // Create new node and register it in the index - let node = self - .state - .queue_template_node(template_id, None) - .map_err(|err| RunnerExecutorError(err.0))?; - self.register_exec_node(template_id, node.node_id); - Ok(node) - } - - fn execution_node(&self, node_id: Uuid) -> Result<&ExecutionNode, RunnerExecutorError> { - self.state - .nodes - .get(&node_id) - .ok_or_else(|| RunnerExecutorError(format!("execution node not found: {node_id}"))) - } - - fn execution_node_mut( - &mut self, - node_id: Uuid, - ) -> Result<&mut ExecutionNode, RunnerExecutorError> { - self.state - .nodes - .get_mut(&node_id) - .ok_or_else(|| RunnerExecutorError(format!("execution node not found: {node_id}"))) - } - - fn execution_node_clone(&self, node_id: Uuid) -> Result { - self.execution_node(node_id).cloned() - } - - fn template_action_for_execution_node( - &self, - node: &ExecutionNode, - ) -> Result, RunnerExecutorError> { - let Some(template_id) = node.template_id.as_ref() else { - return Ok(None); - }; - let template = self.dag.nodes.get(template_id).ok_or_else(|| { - RunnerExecutorError(format!("template node not found: 
{template_id}")) - })?; - match template { - waymark_dag::DAGNode::ActionCall(action) => Ok(Some(action)), - _ => Ok(None), - } - } - - fn state_error(err: RunnerStateError) -> RunnerExecutorError { - RunnerExecutorError(err.0) - } - - fn collect_updates( - &mut self, - actions_done: Vec, - ) -> Result, RunnerExecutorError> { - if self.backend.is_none() { - return Ok(None); - } - let graph_dirty = self.state.consume_graph_dirty_for_durable_execution(); - let mut graph_updates = Vec::new(); - if graph_dirty { - let instance_id = self.instance_id.ok_or_else(|| { - RunnerExecutorError("instance_id is required for graph persistence".to_string()) - })?; - graph_updates.push(GraphUpdate::from_state(instance_id, &self.state)); - } - let updates = DurableUpdates { - actions_done, - graph_updates, - }; - if updates.actions_done.is_empty() && updates.graph_updates.is_empty() { - Ok(None) - } else { - Ok(Some(updates)) - } - } -} - -fn exception_type(value: &Value) -> Option<&str> { - match value { - Value::Object(map) => map.get("type").and_then(|value| value.as_str()), - _ => None, - } -} - -fn action_done_status_for_exception(value: &Value) -> ActionAttemptStatus { - match SyntheticExceptionType::from_value(value) { - Some(SyntheticExceptionType::ExecutorResume) - | Some(SyntheticExceptionType::ActionTimeout) => ActionAttemptStatus::TimedOut, - None => ActionAttemptStatus::Failed, - } -} - -fn compute_action_duration_ms( - started_at: Option>, - completed_at: DateTime, -) -> Option { - started_at - .map(|started_at| { - completed_at - .signed_duration_since(started_at) - .num_milliseconds() - }) - .filter(|duration| *duration >= 0) -} - -fn build_action_done( - execution_id: Uuid, - attempt: i32, - status: ActionAttemptStatus, - started_at: Option>, - completed_at: DateTime, - result: Value, -) -> ActionDone { - ActionDone { - execution_id, - attempt, - status, - started_at, - completed_at: Some(completed_at), - duration_ms: compute_action_duration_ms(started_at, 
completed_at), - result, - } -} - -#[cfg(test)] -mod tests { - use super::*; - use std::collections::{HashMap, HashSet}; - use std::sync::Arc; - - use crate::backends::MemoryBackend; - use crate::messages::ast as ir; - use crate::waymark_core::ir_parser::parse_program; - use crate::waymark_core::runner::state::{ - ExecutionEdge, ExecutionNode, NodeStatus, RunnerState, - }; - use waymark_dag::{ - ActionCallNode, ActionCallParams, AggregatorNode, AssignmentNode, DAG, DAGEdge, - convert_to_dag, - }; - - fn variable(name: &str) -> ir::Expr { - ir::Expr { - kind: Some(ir::expr::Kind::Variable(ir::Variable { - name: name.to_string(), - })), - span: None, - } - } - - fn literal_int(value: i64) -> ir::Expr { - ir::Expr { - kind: Some(ir::expr::Kind::Literal(ir::Literal { - value: Some(ir::literal::Value::IntValue(value)), - })), - span: None, - } - } - - fn binary(left: ir::Expr, op: ir::BinaryOperator, right: ir::Expr) -> ir::Expr { - ir::Expr { - kind: Some(ir::expr::Kind::BinaryOp(Box::new(ir::BinaryOp { - left: Some(Box::new(left)), - op: op as i32, - right: Some(Box::new(right)), - }))), - span: None, - } - } - - #[test] - fn test_action_done_status_for_resume_exception_is_timed_out() { - let value = serde_json::json!({ - "type": "ExecutorResume", - "message": "resumed action timed out", - }); - assert_eq!( - action_done_status_for_exception(&value), - ActionAttemptStatus::TimedOut - ); - } - - #[test] - fn test_action_done_status_for_action_timeout_exception_is_timed_out() { - let value = serde_json::json!({ - "type": "ActionTimeout", - "message": "action timed out", - "timeout_seconds": 1, - "attempt": 1, - }); - assert_eq!( - action_done_status_for_exception(&value), - ActionAttemptStatus::TimedOut - ); - } - - #[test] - fn test_action_done_status_for_generic_exception_is_failed() { - let value = serde_json::json!({ - "type": "ValueError", - "message": "boom", - }); - assert_eq!( - action_done_status_for_exception(&value), - ActionAttemptStatus::Failed - ); - } - - 
#[test] - fn test_action_done_status_for_non_synthetic_timeout_error_is_failed() { - let value = serde_json::json!({ - "type": "TimeoutError", - "message": "user action raised timeout", - }); - assert_eq!( - action_done_status_for_exception(&value), - ActionAttemptStatus::Failed - ); - } - - #[test] - fn test_build_action_done_sets_duration_from_started_and_completed() { - let execution_id = Uuid::new_v4(); - let started_at = Utc::now(); - let completed_at = started_at + chrono::Duration::milliseconds(275); - let done = build_action_done( - execution_id, - 2, - ActionAttemptStatus::Completed, - Some(started_at), - completed_at, - serde_json::json!({"ok": true}), - ); - - assert_eq!(done.execution_id, execution_id); - assert_eq!(done.attempt, 2); - assert_eq!(done.status, ActionAttemptStatus::Completed); - assert_eq!(done.started_at, Some(started_at)); - assert_eq!(done.completed_at, Some(completed_at)); - assert_eq!(done.duration_ms, Some(275)); - } - - #[derive(Default)] - struct ActionNodeOptions { - policies: Vec, - spread_loop_var: Option, - spread_collection_expr: Option, - aggregates_to: Option, - } - - fn action_node( - node_id: &str, - action_name: &str, - kwarg_exprs: HashMap, - targets: Vec, - options: ActionNodeOptions, - ) -> ActionCallNode { - let ActionNodeOptions { - policies, - spread_loop_var, - spread_collection_expr, - aggregates_to, - } = options; - ActionCallNode::new( - node_id, - action_name, - ActionCallParams { - module_name: None, - kwargs: HashMap::new(), - kwarg_exprs, - policies, - targets: Some(targets), - target: None, - parallel_index: None, - aggregates_to, - spread_loop_var, - spread_collection_expr, - function_name: Some("main".to_string()), - }, - ) - } - - fn assignment_node( - node_id: &str, - targets: Vec, - assign_expr: ir::Expr, - ) -> AssignmentNode { - AssignmentNode::new( - node_id, - targets, - None, - Some(assign_expr), - None, - Some("main".to_string()), - ) - } - - fn aggregator_node( - node_id: &str, - 
aggregates_from: &str, - targets: Vec, - ) -> AggregatorNode { - AggregatorNode::new( - node_id, - aggregates_from, - Some(targets), - None, - "aggregate", - Some("main".to_string()), - ) - } - - fn snapshot_state( - state: &RunnerState, - action_results: &HashMap, - ) -> ( - HashMap, - HashSet, - HashMap, - ) { - ( - state.nodes.clone(), - state.edges.clone(), - action_results.clone(), - ) - } - - fn create_rehydrated_executor( - dag: &Arc, - nodes: HashMap, - edges: HashSet, - action_results: HashMap, - ) -> RunnerExecutor { - let state = RunnerState::new(Some(Arc::clone(dag)), Some(nodes), Some(edges), false); - RunnerExecutor::new(Arc::clone(dag), state, action_results, None) - } - - fn compare_executor_states(original: &RunnerExecutor, rehydrated: &RunnerExecutor) { - let orig_state = original.state(); - let rehy_state = rehydrated.state(); - assert_eq!( - orig_state.nodes.keys().collect::>(), - rehy_state.nodes.keys().collect::>(), - ); - for node_id in orig_state.nodes.keys() { - let orig_node = orig_state.nodes.get(node_id).unwrap(); - let rehy_node = rehy_state.nodes.get(node_id).unwrap(); - assert_eq!(orig_node.node_type, rehy_node.node_type); - assert_eq!(orig_node.status, rehy_node.status); - assert_eq!(orig_node.template_id, rehy_node.template_id); - assert_eq!(orig_node.targets, rehy_node.targets); - assert_eq!(orig_node.action_attempt, rehy_node.action_attempt); - } - assert_eq!(orig_state.edges, rehy_state.edges); - } - - fn completion_action_result(action: &ExecutionNode) -> Value { - Value::String(format!( - "{}:attempt{}", - action.template_id.as_deref().unwrap_or("unknown_action"), - action.action_attempt - )) - } - - fn dag_from_ir_source(source: &str) -> Arc { - let program = parse_program(source.trim()).expect("parse program"); - Arc::new(convert_to_dag(&program).expect("convert program to DAG")) - } - - fn build_executor_at_entry(dag: &Arc) -> (RunnerExecutor, Uuid) { - let mut state = RunnerState::new(Some(Arc::clone(dag)), None, None, 
false); - let entry_template = dag.entry_node.as_ref().expect("dag entry node"); - let entry_exec = state - .queue_template_node(entry_template, None) - .expect("queue entry node"); - ( - RunnerExecutor::new(Arc::clone(dag), state, HashMap::new(), None), - entry_exec.node_id, - ) - } - - type ActionResultFor = fn(&ExecutionNode) -> Value; - - struct RehydrateBranchHarness { - dag: Arc, - canonical: RunnerExecutor, - branches: Vec, - action_result_for: ActionResultFor, - } - - impl RehydrateBranchHarness { - const MAX_TICKS: usize = 256; - - fn new( - dag: Arc, - canonical: RunnerExecutor, - action_result_for: ActionResultFor, - ) -> Self { - let mut harness = Self { - dag, - canonical, - branches: Vec::new(), - action_result_for, - }; - harness.fork_from_canonical(); - harness - } - - fn run_and_assert(mut self) { - self.advance_canonical_with_forks(); - for (index, branch) in self.branches.iter_mut().enumerate() { - Self::advance_executor_to_completion(branch, self.action_result_for) - .unwrap_or_else(|err| panic!("branch {index} failed to complete: {err}")); - Self::assert_completed_executor_equivalent(&self.canonical, branch); - } - } - - fn fork_from_canonical(&mut self) { - let (nodes_snap, edges_snap, results_snap) = - snapshot_state(self.canonical.state(), self.canonical.action_results()); - self.branches.push(create_rehydrated_executor( - &self.dag, - nodes_snap, - edges_snap, - results_snap, - )); - } - - fn advance_canonical_with_forks(&mut self) { - let mut converged = false; - for _ in 0..Self::MAX_TICKS { - let progressed = Self::advance_executor_one_increment( - &mut self.canonical, - self.action_result_for, - ) - .expect("advance canonical executor"); - if !progressed { - converged = true; - break; - } - self.fork_from_canonical(); - } - assert!(converged, "canonical executor did not converge"); - assert!( - !self.branches.is_empty(), - "expected at least one rehydrated branch" - ); - } - - fn advance_executor_one_increment( - executor: &mut 
RunnerExecutor, - action_result_for: ActionResultFor, - ) -> Result { - let active_actions: Vec = executor - .state() - .nodes - .values() - .filter(|node| { - node.is_action_call() - && matches!(node.status, NodeStatus::Queued | NodeStatus::Running) - }) - .cloned() - .collect(); - for action in &active_actions { - if !executor.action_results().contains_key(&action.node_id) { - executor.set_action_result(action.node_id, action_result_for(action)); - } - } - - let mut finished_nodes: Vec = - active_actions.iter().map(|node| node.node_id).collect(); - finished_nodes.extend( - executor - .state() - .nodes - .values() - .filter(|node| { - node.status == NodeStatus::Queued - && node.is_sleep() - && node.scheduled_at.is_some() - }) - .map(|node| node.node_id), - ); - - if finished_nodes.is_empty() { - return Ok(false); - } - - let step = executor.increment(&finished_nodes)?; - for action in &step.actions { - if !executor.action_results().contains_key(&action.node_id) { - executor.set_action_result(action.node_id, action_result_for(action)); - } - } - for sleep_request in &step.sleep_requests { - executor - .state_mut() - .set_node_scheduled_at( - sleep_request.node_id, - Some(Utc::now() - chrono::Duration::seconds(1)), - ) - .map_err(|err| RunnerExecutorError(err.0))?; - } - Ok(true) - } - - fn advance_executor_to_completion( - executor: &mut RunnerExecutor, - action_result_for: ActionResultFor, - ) -> Result<(), RunnerExecutorError> { - for _ in 0..Self::MAX_TICKS { - if !Self::advance_executor_one_increment(executor, action_result_for)? 
{ - return Ok(()); - } - } - - Err(RunnerExecutorError( - "executor did not converge to completion".to_string(), - )) - } - - fn count_keyed(items: impl IntoIterator) -> HashMap { - let mut counts: HashMap = HashMap::new(); - for item in items { - *counts.entry(item).or_insert(0) += 1; - } - counts - } - - fn node_shape_counts(executor: &RunnerExecutor) -> HashMap { - Self::count_keyed(executor.state().nodes.values().map(|node| { - let mut targets = node.targets.clone(); - targets.sort(); - let mut assignment_keys: Vec = node.assignments.keys().cloned().collect(); - assignment_keys.sort(); - let mut action_kwarg_keys = node - .action - .as_ref() - .map(|action| action.kwargs.keys().cloned().collect::>()) - .unwrap_or_default(); - action_kwarg_keys.sort(); - format!( - "type={}|template={}|status={:?}|attempt={}|targets={targets:?}|assignments={assignment_keys:?}|action={}({action_kwarg_keys:?})|scheduled={}", - node.node_type, - node.template_id.clone().unwrap_or_default(), - node.status, - node.action_attempt, - node.action - .as_ref() - .map(|action| action.action_name.clone()) - .unwrap_or_default(), - node.scheduled_at.is_some(), - ) - })) - } - - fn edge_shape_counts(executor: &RunnerExecutor) -> HashMap { - Self::count_keyed(executor.state().edges.iter().map(|edge| { - let source = executor - .state() - .nodes - .get(&edge.source) - .expect("source node") - .template_id - .clone() - .unwrap_or_else(|| "__unknown_source".to_string()); - let target = executor - .state() - .nodes - .get(&edge.target) - .expect("target node") - .template_id - .clone() - .unwrap_or_else(|| "__unknown_target".to_string()); - format!("{source}-{:?}->{target}", edge.edge_type) - })) - } - - fn action_result_counts(executor: &RunnerExecutor) -> HashMap { - Self::count_keyed(executor.action_results().iter().map(|(node_id, value)| { - let template_id = executor - .state() - .nodes - .get(node_id) - .and_then(|node| node.template_id.clone()) - .unwrap_or_else(|| 
"__unknown_action".to_string()); - let rendered = - serde_json::to_string(value).expect("action result should serialize to JSON"); - format!("{template_id}:{rendered}") - })) - } - - fn assert_completed_executor_equivalent( - canonical: &RunnerExecutor, - rehydrated: &RunnerExecutor, - ) { - assert_eq!( - Self::node_shape_counts(canonical), - Self::node_shape_counts(rehydrated) - ); - assert_eq!( - Self::edge_shape_counts(canonical), - Self::edge_shape_counts(rehydrated) - ); - assert_eq!( - canonical.state().timeline.len(), - rehydrated.state().timeline.len() - ); - assert_eq!( - Self::action_result_counts(canonical), - Self::action_result_counts(rehydrated) - ); - assert_eq!( - canonical.state().ready_queue.is_empty(), - rehydrated.state().ready_queue.is_empty() - ); - - let replay_canonical = crate::waymark_core::runner::replay_variables( - canonical.state(), - canonical.action_results(), - ) - .expect("replay canonical"); - let replay_rehydrated = crate::waymark_core::runner::replay_variables( - rehydrated.state(), - rehydrated.action_results(), - ) - .expect("replay rehydrated"); - - let mut assignment_counts: HashMap = HashMap::new(); - for node in canonical.state().nodes.values() { - for target in node.assignments.keys() { - *assignment_counts.entry(target.clone()).or_insert(0) += 1; - } - } - let stable_canonical: HashMap = replay_canonical - .variables - .into_iter() - .filter(|(name, _)| assignment_counts.get(name).copied().unwrap_or(0) <= 1) - .collect(); - let stable_rehydrated: HashMap = replay_rehydrated - .variables - .into_iter() - .filter(|(name, _)| assignment_counts.get(name).copied().unwrap_or(0) <= 1) - .collect(); - assert_eq!(stable_canonical, stable_rehydrated); - } - } - - fn setup_linear_assignment_checkpoint() -> (Arc, RunnerExecutor) { - let dag = dag_from_ir_source( - r#" -fn main(input: [], output: [z]): - x = @fetch() - y = x + 1 - z = @process(value=y) - return z -"#, - ); - let (mut executor, entry_exec_id) = 
build_executor_at_entry(&dag); - - let first_step = executor - .increment(&[entry_exec_id]) - .expect("advance from entry"); - assert_eq!(first_step.actions.len(), 1); - let first_exec = first_step.actions[0].clone(); - executor.set_action_result(first_exec.node_id, Value::Number(10.into())); - - let step = executor.increment(&[first_exec.node_id]).expect("advance"); - assert_eq!(step.actions.len(), 1); - (dag, executor) - } - - fn setup_sleep_resume_checkpoint() -> (Arc, RunnerExecutor) { - let dag = dag_from_ir_source( - r#" -fn main(input: [], output: [resumed]): - seed = 1 - started = @get_timestamp() - sleep 60 - resumed = @get_timestamp() - return resumed -"#, - ); - let (mut executor, entry_exec_id) = build_executor_at_entry(&dag); - - let start_step = executor.increment(&[entry_exec_id]).expect("start"); - assert_eq!(start_step.actions.len(), 1); - let start_exec = start_step.actions[0].clone(); - executor.set_action_result(start_exec.node_id, Value::String("t0".to_string())); - - let sleep_step = executor - .increment(&[start_exec.node_id]) - .expect("advance to sleep"); - assert!(sleep_step.actions.is_empty()); - assert_eq!(sleep_step.sleep_requests.len(), 1); - (dag, executor) - } - - fn setup_spread_checkpoint() -> (Arc, RunnerExecutor) { - let dag = dag_from_ir_source( - r#" -fn main(input: [], output: [done]): - items = @get_items() - results = spread items:item -> @double(value=item) - done = @finalize(values=results) - return done -"#, - ); - let (mut executor, entry_exec_id) = build_executor_at_entry(&dag); - - let first_step = executor.increment(&[entry_exec_id]).expect("start"); - assert_eq!(first_step.actions.len(), 1); - let initial_exec = first_step.actions[0].clone(); - executor.set_action_result( - initial_exec.node_id, - Value::Array(vec![1.into(), 2.into(), 3.into()]), - ); - - let step1 = executor - .increment(&[initial_exec.node_id]) - .expect("expand spread"); - assert_eq!(step1.actions.len(), 3); - for (idx, node) in 
step1.actions.iter().enumerate() { - executor.set_action_result(node.node_id, Value::Number(((idx + 1) as i64).into())); - } - - let step2 = executor - .increment( - &step1 - .actions - .iter() - .map(|node| node.node_id) - .collect::>(), - ) - .expect("complete spread"); - assert_eq!(step2.actions.len(), 1); - (dag, executor) - } - - #[test] - fn test_executor_unblocks_downstream_action() { - let mut dag = DAG::default(); - - let action_start = action_node( - "action_start", - "fetch", - HashMap::new(), - vec!["x".to_string()], - ActionNodeOptions::default(), - ); - let assign_node = assignment_node( - "assign", - vec!["y".to_string()], - binary( - variable("x"), - ir::BinaryOperator::BinaryOpAdd, - literal_int(1), - ), - ); - let action_next = action_node( - "action_next", - "work", - HashMap::from([("value".to_string(), variable("y"))]), - vec!["z".to_string()], - ActionNodeOptions::default(), - ); - - dag.add_node(waymark_dag::DAGNode::ActionCall(action_start.clone())); - dag.add_node(waymark_dag::DAGNode::Assignment(assign_node.clone())); - dag.add_node(waymark_dag::DAGNode::ActionCall(action_next.clone())); - dag.add_edge(DAGEdge::state_machine( - action_start.id.clone(), - assign_node.id.clone(), - )); - dag.add_edge(DAGEdge::state_machine( - assign_node.id.clone(), - action_next.id.clone(), - )); - - let dag = Arc::new(dag); - let mut state = RunnerState::new(Some(dag.clone()), None, None, false); - let start_exec = state - .queue_template_node(&action_start.id, None) - .expect("queue"); - - let mut action_results = HashMap::new(); - action_results.insert(start_exec.node_id, Value::Number(10.into())); - let mut executor = RunnerExecutor::new(dag.clone(), state, action_results, None); - - let step = executor - .increment(&[start_exec.node_id]) - .expect("increment"); - assert_eq!(step.actions.len(), 1); - assert_eq!( - step.actions[0].template_id.as_deref(), - Some(action_next.id.as_str()) - ); - } - - #[test] - fn test_rehydrate_after_first_action_queued() 
{ - let mut dag = DAG::default(); - let action1 = action_node( - "action1", - "fetch", - HashMap::new(), - vec!["x".to_string()], - ActionNodeOptions::default(), - ); - let action2 = action_node( - "action2", - "process", - HashMap::from([("value".to_string(), variable("x"))]), - vec!["y".to_string()], - ActionNodeOptions::default(), - ); - - dag.add_node(waymark_dag::DAGNode::ActionCall(action1.clone())); - dag.add_node(waymark_dag::DAGNode::ActionCall(action2.clone())); - dag.add_edge(DAGEdge::state_machine( - action1.id.clone(), - action2.id.clone(), - )); - - let dag = Arc::new(dag); - let mut state = RunnerState::new(Some(dag.clone()), None, None, false); - let exec1 = state.queue_template_node(&action1.id, None).expect("queue"); - let executor = RunnerExecutor::new(dag.clone(), state, HashMap::new(), None); - - let (nodes_snap, edges_snap, results_snap) = - snapshot_state(executor.state(), executor.action_results()); - let rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); - - compare_executor_states(&executor, &rehydrated); - let node = rehydrated.state().nodes.get(&exec1.node_id).expect("node"); - assert_eq!(node.status, NodeStatus::Queued); - } - - #[test] - fn test_rehydrate_after_action_completed_and_increment() { - let mut dag = DAG::default(); - let action1 = action_node( - "action1", - "fetch", - HashMap::new(), - vec!["x".to_string()], - ActionNodeOptions::default(), - ); - let action2 = action_node( - "action2", - "process", - HashMap::from([("value".to_string(), variable("x"))]), - vec!["y".to_string()], - ActionNodeOptions::default(), - ); - - dag.add_node(waymark_dag::DAGNode::ActionCall(action1.clone())); - dag.add_node(waymark_dag::DAGNode::ActionCall(action2.clone())); - dag.add_edge(DAGEdge::state_machine( - action1.id.clone(), - action2.id.clone(), - )); - - let dag = Arc::new(dag); - let mut state = RunnerState::new(Some(dag.clone()), None, None, false); - let exec1 = state.queue_template_node(&action1.id, 
None).expect("queue"); - - let mut action_results = HashMap::new(); - action_results.insert(exec1.node_id, Value::Number(42.into())); - let mut executor = RunnerExecutor::new(dag.clone(), state, action_results, None); - - let step = executor.increment(&[exec1.node_id]).expect("increment"); - assert_eq!(step.actions.len(), 1); - let exec2 = &step.actions[0]; - assert_eq!(exec2.template_id.as_deref(), Some(action2.id.as_str())); - - let (nodes_snap, edges_snap, results_snap) = - snapshot_state(executor.state(), executor.action_results()); - let rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); - compare_executor_states(&executor, &rehydrated); - - let node1 = rehydrated.state().nodes.get(&exec1.node_id).unwrap(); - assert_eq!(node1.status, NodeStatus::Completed); - let node2 = rehydrated.state().nodes.get(&exec2.node_id).unwrap(); - assert_eq!(node2.status, NodeStatus::Running); - } - - #[test] - fn test_rehydrate_multi_step_chain() { - let mut dag = DAG::default(); - let action1 = action_node( - "action1", - "step1", - HashMap::new(), - vec!["a".to_string()], - ActionNodeOptions::default(), - ); - let action2 = action_node( - "action2", - "step2", - HashMap::from([("input".to_string(), variable("a"))]), - vec!["b".to_string()], - ActionNodeOptions::default(), - ); - let action3 = action_node( - "action3", - "step3", - HashMap::from([("input".to_string(), variable("b"))]), - vec!["c".to_string()], - ActionNodeOptions::default(), - ); - - dag.add_node(waymark_dag::DAGNode::ActionCall(action1.clone())); - dag.add_node(waymark_dag::DAGNode::ActionCall(action2.clone())); - dag.add_node(waymark_dag::DAGNode::ActionCall(action3.clone())); - dag.add_edge(DAGEdge::state_machine( - action1.id.clone(), - action2.id.clone(), - )); - dag.add_edge(DAGEdge::state_machine( - action2.id.clone(), - action3.id.clone(), - )); - - let dag = Arc::new(dag); - let mut state = RunnerState::new(Some(dag.clone()), None, None, false); - let exec1 = 
state.queue_template_node(&action1.id, None).expect("queue"); - let mut executor = RunnerExecutor::new(dag.clone(), state, HashMap::new(), None); - - let (nodes_snap, edges_snap, results_snap) = - snapshot_state(executor.state(), executor.action_results()); - let rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); - compare_executor_states(&executor, &rehydrated); - - executor.set_action_result(exec1.node_id, Value::Number(10.into())); - let step1 = executor.increment(&[exec1.node_id]).expect("increment"); - let exec2 = step1.actions[0].clone(); - - let (nodes_snap, edges_snap, results_snap) = - snapshot_state(executor.state(), executor.action_results()); - let rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); - compare_executor_states(&executor, &rehydrated); - - executor.set_action_result(exec2.node_id, Value::Number(20.into())); - let step2 = executor.increment(&[exec2.node_id]).expect("increment"); - let exec3 = step2.actions[0].clone(); - - let (nodes_snap, edges_snap, results_snap) = - snapshot_state(executor.state(), executor.action_results()); - let rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); - compare_executor_states(&executor, &rehydrated); - - executor.set_action_result(exec3.node_id, Value::Number(30.into())); - let step3 = executor.increment(&[exec3.node_id]).expect("increment"); - assert!(step3.actions.is_empty()); - - let (nodes_snap, edges_snap, results_snap) = - snapshot_state(executor.state(), executor.action_results()); - let rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); - compare_executor_states(&executor, &rehydrated); - - for node in rehydrated.state().nodes.values() { - if node.is_action_call() { - assert_eq!(node.status, NodeStatus::Completed); - } - } - } - - #[test] - fn test_rehydrate_with_assignment_node() { - let mut dag = DAG::default(); - let action1 = action_node( - "action1", - "fetch", 
- HashMap::new(), - vec!["x".to_string()], - ActionNodeOptions::default(), - ); - let assign = assignment_node( - "assign", - vec!["y".to_string()], - binary( - variable("x"), - ir::BinaryOperator::BinaryOpAdd, - literal_int(1), - ), - ); - let action2 = action_node( - "action2", - "process", - HashMap::from([("value".to_string(), variable("y"))]), - vec!["z".to_string()], - ActionNodeOptions::default(), - ); - - dag.add_node(waymark_dag::DAGNode::ActionCall(action1.clone())); - dag.add_node(waymark_dag::DAGNode::Assignment(assign.clone())); - dag.add_node(waymark_dag::DAGNode::ActionCall(action2.clone())); - dag.add_edge(DAGEdge::state_machine( - action1.id.clone(), - assign.id.clone(), - )); - dag.add_edge(DAGEdge::state_machine( - assign.id.clone(), - action2.id.clone(), - )); - - let dag = Arc::new(dag); - let mut state = RunnerState::new(Some(dag.clone()), None, None, false); - let exec1 = state.queue_template_node(&action1.id, None).expect("queue"); - - let mut action_results = HashMap::new(); - action_results.insert(exec1.node_id, Value::Number(10.into())); - let mut executor = RunnerExecutor::new(dag.clone(), state, action_results, None); - - let step = executor.increment(&[exec1.node_id]).expect("increment"); - assert_eq!(step.actions.len(), 1); - assert_eq!( - step.actions[0].template_id.as_deref(), - Some(action2.id.as_str()) - ); - - let (nodes_snap, edges_snap, results_snap) = - snapshot_state(executor.state(), executor.action_results()); - let rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); - compare_executor_states(&executor, &rehydrated); - - let assign_nodes: Vec<_> = rehydrated - .state() - .nodes - .values() - .filter(|node| node.template_id.as_deref() == Some(&assign.id)) - .collect(); - assert_eq!(assign_nodes.len(), 1); - assert_eq!(assign_nodes[0].status, NodeStatus::Completed); - assert!(assign_nodes[0].assignments.contains_key("y")); - } - - #[test] - fn test_rehydrate_preserves_action_kwargs() { - let 
mut dag = DAG::default(); - let action1 = action_node( - "action1", - "compute", - HashMap::from([ - ("a".to_string(), literal_int(5)), - ( - "b".to_string(), - ir::Expr { - kind: Some(ir::expr::Kind::Literal(ir::Literal { - value: Some(ir::literal::Value::StringValue("test".to_string())), - })), - span: None, - }, - ), - ]), - vec!["result".to_string()], - ActionNodeOptions::default(), - ); - - dag.add_node(waymark_dag::DAGNode::ActionCall(action1.clone())); - let dag = Arc::new(dag); - let mut state = RunnerState::new(Some(dag.clone()), None, None, false); - let exec1 = state.queue_template_node(&action1.id, None).expect("queue"); - let executor = RunnerExecutor::new(dag.clone(), state, HashMap::new(), None); - - let (nodes_snap, edges_snap, results_snap) = - snapshot_state(executor.state(), executor.action_results()); - let rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); - - let orig_node = executor.state().nodes.get(&exec1.node_id).unwrap(); - let rehy_node = rehydrated.state().nodes.get(&exec1.node_id).unwrap(); - assert!(orig_node.action.is_some()); - assert!(rehy_node.action.is_some()); - let orig_action = orig_node.action.as_ref().unwrap(); - let rehy_action = rehy_node.action.as_ref().unwrap(); - assert_eq!(orig_action.action_name, rehy_action.action_name); - let orig_keys: HashSet<_> = orig_action.kwargs.keys().cloned().collect(); - let rehy_keys: HashSet<_> = rehy_action.kwargs.keys().cloned().collect(); - assert_eq!(orig_keys, rehy_keys); - } - - #[test] - fn test_rehydrate_increments_from_same_position() { - let mut dag = DAG::default(); - let action1 = action_node( - "action1", - "first", - HashMap::new(), - vec!["x".to_string()], - ActionNodeOptions::default(), - ); - let action2 = action_node( - "action2", - "second", - HashMap::new(), - vec!["y".to_string()], - ActionNodeOptions::default(), - ); - dag.add_node(waymark_dag::DAGNode::ActionCall(action1.clone())); - 
dag.add_node(waymark_dag::DAGNode::ActionCall(action2.clone())); - dag.add_edge(DAGEdge::state_machine( - action1.id.clone(), - action2.id.clone(), - )); - - let dag = Arc::new(dag); - let mut state = RunnerState::new(Some(dag.clone()), None, None, false); - let exec1 = state.queue_template_node(&action1.id, None).expect("queue"); - - let mut action_results = HashMap::new(); - action_results.insert(exec1.node_id, Value::Number(100.into())); - let mut executor = RunnerExecutor::new(dag.clone(), state, action_results, None); - - let (nodes_snap, edges_snap, results_snap) = - snapshot_state(executor.state(), executor.action_results()); - let mut rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); - - let orig_step = executor.increment(&[exec1.node_id]).expect("increment"); - let rehy_step = rehydrated.increment(&[exec1.node_id]).expect("increment"); - assert_eq!(orig_step.actions.len(), rehy_step.actions.len()); - assert_eq!( - orig_step.actions[0].template_id, - rehy_step.actions[0].template_id - ); - } - - #[test] - fn test_rehydrate_resume_marks_running_as_retryable() { - let mut dag = DAG::default(); - let action1 = action_node( - "action1", - "work", - HashMap::new(), - vec!["x".to_string()], - ActionNodeOptions { - policies: vec![ir::PolicyBracket { - kind: Some(ir::policy_bracket::Kind::Retry(ir::RetryPolicy { - max_retries: 3, - backoff: None, - exception_types: vec!["ExecutorResume".to_string()], - })), - }], - ..ActionNodeOptions::default() - }, - ); - dag.add_node(waymark_dag::DAGNode::ActionCall(action1.clone())); - - let dag = Arc::new(dag); - let mut state = RunnerState::new(Some(dag.clone()), None, None, false); - let exec1 = state.queue_template_node(&action1.id, None).expect("queue"); - state.mark_running(exec1.node_id).expect("mark running"); - - let executor = RunnerExecutor::new(dag.clone(), state, HashMap::new(), None); - let (nodes_snap, edges_snap, results_snap) = - snapshot_state(executor.state(), 
executor.action_results()); - let mut rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); - - assert_eq!( - rehydrated.state().nodes.get(&exec1.node_id).unwrap().status, - NodeStatus::Running - ); - - let step = rehydrated.resume().expect("resume"); - assert_eq!(step.actions.len(), 1); - assert_eq!(step.actions[0].node_id, exec1.node_id); - let node = rehydrated.state().nodes.get(&exec1.node_id).unwrap(); - assert_eq!(node.status, NodeStatus::Running); - assert_eq!(node.action_attempt, 2); - assert!(node.started_at.is_some()); - } - - #[test] - fn test_increment_records_failed_action_attempt() { - let mut dag = DAG::default(); - let action = action_node( - "action1", - "work", - HashMap::new(), - vec!["x".to_string()], - ActionNodeOptions::default(), - ); - dag.add_node(waymark_dag::DAGNode::ActionCall(action.clone())); - - let dag = Arc::new(dag); - let mut state = RunnerState::new(Some(dag.clone()), None, None, false); - let exec = state.queue_template_node(&action.id, None).expect("queue"); - - let mut executor = RunnerExecutor::new( - dag, - state, - HashMap::new(), - Some(Arc::new(MemoryBackend::new())), - ); - executor.set_instance_id(Uuid::new_v4()); - executor.set_action_result( - exec.node_id, - serde_json::json!({"type": "ValueError", "message": "boom"}), - ); - - let step = executor.increment(&[exec.node_id]).expect("increment"); - let updates = step.updates.expect("durable updates"); - assert_eq!(updates.actions_done.len(), 1); - assert_eq!(updates.actions_done[0].execution_id, exec.node_id); - assert_eq!(updates.actions_done[0].attempt, 1); - assert_eq!( - updates.actions_done[0] - .result - .get("type") - .and_then(Value::as_str), - Some("ValueError") - ); - assert_eq!( - executor - .state() - .nodes - .get(&exec.node_id) - .map(|n| n.status.clone()), - Some(NodeStatus::Failed) - ); - } - - #[test] - fn test_increment_records_failed_attempt_before_retry() { - let mut dag = DAG::default(); - let action = action_node( - 
"action1", - "work", - HashMap::new(), - vec!["x".to_string()], - ActionNodeOptions { - policies: vec![ir::PolicyBracket { - kind: Some(ir::policy_bracket::Kind::Retry(ir::RetryPolicy { - max_retries: 2, - backoff: None, - exception_types: Vec::new(), - })), - }], - ..ActionNodeOptions::default() - }, - ); - dag.add_node(waymark_dag::DAGNode::ActionCall(action.clone())); - - let dag = Arc::new(dag); - let mut state = RunnerState::new(Some(dag.clone()), None, None, false); - let exec = state.queue_template_node(&action.id, None).expect("queue"); - - let mut executor = RunnerExecutor::new( - dag, - state, - HashMap::new(), - Some(Arc::new(MemoryBackend::new())), - ); - executor.set_instance_id(Uuid::new_v4()); - executor.set_action_result( - exec.node_id, - serde_json::json!({"type": "ValueError", "message": "retry me"}), - ); - - let first_step = executor - .increment(&[exec.node_id]) - .expect("first increment"); - assert_eq!(first_step.actions.len(), 1); - assert_eq!(first_step.actions[0].node_id, exec.node_id); - let first_updates = first_step.updates.expect("first durable updates"); - assert_eq!(first_updates.actions_done.len(), 1); - assert_eq!(first_updates.actions_done[0].attempt, 1); - assert_eq!( - executor - .state() - .nodes - .get(&exec.node_id) - .map(|n| n.status.clone()), - Some(NodeStatus::Running) - ); - assert_eq!( - executor - .state() - .nodes - .get(&exec.node_id) - .map(|n| n.action_attempt), - Some(2) - ); - - executor.set_action_result(exec.node_id, Value::String("ok".to_string())); - let second_step = executor - .increment(&[exec.node_id]) - .expect("second increment"); - let second_updates = second_step.updates.expect("second durable updates"); - assert_eq!(second_updates.actions_done.len(), 1); - assert_eq!(second_updates.actions_done[0].attempt, 2); - assert_eq!( - executor - .state() - .nodes - .get(&exec.node_id) - .map(|n| n.status.clone()), - Some(NodeStatus::Completed) - ); - } - - #[test] - fn 
test_rehydrate_replay_variables_consistent() { - let mut dag = DAG::default(); - let action1 = action_node( - "action1", - "fetch", - HashMap::new(), - vec!["x".to_string()], - ActionNodeOptions::default(), - ); - let assign = assignment_node( - "assign", - vec!["doubled".to_string()], - binary( - variable("x"), - ir::BinaryOperator::BinaryOpMul, - literal_int(2), - ), - ); - - dag.add_node(waymark_dag::DAGNode::ActionCall(action1.clone())); - dag.add_node(waymark_dag::DAGNode::Assignment(assign.clone())); - dag.add_edge(DAGEdge::state_machine( - action1.id.clone(), - assign.id.clone(), - )); - - let dag = Arc::new(dag); - let mut state = RunnerState::new(Some(dag.clone()), None, None, false); - let exec1 = state.queue_template_node(&action1.id, None).expect("queue"); - - let mut action_results = HashMap::new(); - action_results.insert(exec1.node_id, Value::Number(21.into())); - let mut executor = RunnerExecutor::new(dag.clone(), state, action_results, None); - executor.increment(&[exec1.node_id]).expect("increment"); - - let orig_replay = crate::waymark_core::runner::replay_variables( - executor.state(), - executor.action_results(), - ) - .expect("replay"); - - let (nodes_snap, edges_snap, results_snap) = - snapshot_state(executor.state(), executor.action_results()); - let rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); - - let rehy_replay = crate::waymark_core::runner::replay_variables( - rehydrated.state(), - rehydrated.action_results(), - ) - .expect("replay"); - assert_eq!(orig_replay.variables, rehy_replay.variables); - assert_eq!( - rehy_replay.variables.get("doubled"), - Some(&Value::Number(42.into())) - ); - } - - #[test] - fn test_rehydrate_completion_equivalent_across_ir_scenarios() { - let (linear_dag, linear_executor) = setup_linear_assignment_checkpoint(); - RehydrateBranchHarness::new(linear_dag, linear_executor, completion_action_result) - .run_and_assert(); - - let (sleep_dag, sleep_executor) = 
setup_sleep_resume_checkpoint(); - RehydrateBranchHarness::new(sleep_dag, sleep_executor, completion_action_result) - .run_and_assert(); - - let (spread_dag, spread_executor) = setup_spread_checkpoint(); - RehydrateBranchHarness::new(spread_dag, spread_executor, completion_action_result) - .run_and_assert(); - } - - #[test] - fn test_rehydrate_spread_action_with_aggregator() { - let mut dag = DAG::default(); - let initial_action = action_node( - "initial", - "get_items", - HashMap::new(), - vec!["items".to_string()], - ActionNodeOptions::default(), - ); - let spread_action = action_node( - "spread_action", - "process_item", - HashMap::from([("item".to_string(), variable("item"))]), - vec!["item_result".to_string()], - ActionNodeOptions { - spread_loop_var: Some("item".to_string()), - spread_collection_expr: Some(variable("items")), - aggregates_to: Some("aggregator".to_string()), - ..ActionNodeOptions::default() - }, - ); - let aggregator = - aggregator_node("aggregator", "spread_action", vec!["results".to_string()]); - - dag.add_node(waymark_dag::DAGNode::ActionCall(initial_action.clone())); - dag.add_node(waymark_dag::DAGNode::ActionCall(spread_action.clone())); - dag.add_node(waymark_dag::DAGNode::Aggregator(aggregator.clone())); - dag.add_edge(DAGEdge::state_machine( - initial_action.id.clone(), - spread_action.id.clone(), - )); - dag.add_edge(DAGEdge::state_machine( - spread_action.id.clone(), - aggregator.id.clone(), - )); - - let dag = Arc::new(dag); - let mut state = RunnerState::new(Some(dag.clone()), None, None, false); - let initial_exec = state - .queue_template_node(&initial_action.id, None) - .expect("queue"); - - let mut action_results = HashMap::new(); - action_results.insert( - initial_exec.node_id, - Value::Array(vec![1.into(), 2.into(), 3.into()]), - ); - let mut executor = RunnerExecutor::new(dag.clone(), state, action_results, None); - - let step1 = executor - .increment(&[initial_exec.node_id]) - .expect("increment"); - 
assert_eq!(step1.actions.len(), 3); - - let (nodes_snap, edges_snap, results_snap) = - snapshot_state(executor.state(), executor.action_results()); - let rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); - - compare_executor_states(&executor, &rehydrated); - let action_nodes: Vec<_> = executor - .state() - .nodes - .values() - .filter(|node| { - node.is_action_call() && node.template_id.as_deref() == Some(&spread_action.id) - }) - .collect(); - assert_eq!(action_nodes.len(), 3); - for action_node in action_nodes { - let rehy_node = rehydrated.state().nodes.get(&action_node.node_id).unwrap(); - assert_eq!(rehy_node.node_type, action_node.node_type); - assert_eq!(rehy_node.status, action_node.status); - } - } - - #[test] - fn test_rehydrate_full_spread_execution() { - let mut dag = DAG::default(); - let initial_action = action_node( - "initial", - "get_items", - HashMap::new(), - vec!["items".to_string()], - ActionNodeOptions::default(), - ); - let spread_action = action_node( - "spread_action", - "double", - HashMap::from([("value".to_string(), variable("item"))]), - vec!["item_result".to_string()], - ActionNodeOptions { - spread_loop_var: Some("item".to_string()), - spread_collection_expr: Some(variable("items")), - aggregates_to: Some("aggregator".to_string()), - ..ActionNodeOptions::default() - }, - ); - let aggregator = - aggregator_node("aggregator", "spread_action", vec!["results".to_string()]); - - dag.add_node(waymark_dag::DAGNode::ActionCall(initial_action.clone())); - dag.add_node(waymark_dag::DAGNode::ActionCall(spread_action.clone())); - dag.add_node(waymark_dag::DAGNode::Aggregator(aggregator.clone())); - dag.add_edge(DAGEdge::state_machine( - initial_action.id.clone(), - spread_action.id.clone(), - )); - dag.add_edge(DAGEdge::state_machine( - spread_action.id.clone(), - aggregator.id.clone(), - )); - - let dag = Arc::new(dag); - let mut state = RunnerState::new(Some(dag.clone()), None, None, false); - let initial_exec 
= state - .queue_template_node(&initial_action.id, None) - .expect("queue"); - - let mut action_results = HashMap::new(); - action_results.insert( - initial_exec.node_id, - Value::Array(vec![10.into(), 20.into()]), - ); - let mut executor = RunnerExecutor::new(dag.clone(), state, action_results.clone(), None); - - let step1 = executor - .increment(&[initial_exec.node_id]) - .expect("increment"); - let spread_nodes = step1.actions; - assert_eq!(spread_nodes.len(), 2); - - let (nodes_snap, edges_snap, results_snap) = - snapshot_state(executor.state(), executor.action_results()); - let rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); - compare_executor_states(&executor, &rehydrated); - - for (idx, node) in spread_nodes.iter().enumerate() { - executor.set_action_result(node.node_id, Value::Number(((idx + 1) * 100).into())); - } - - let _step2 = executor - .increment(&spread_nodes.iter().map(|n| n.node_id).collect::>()) - .expect("increment"); - - let (nodes_snap, edges_snap, results_snap) = - snapshot_state(executor.state(), executor.action_results()); - let rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); - compare_executor_states(&executor, &rehydrated); - - let agg_nodes: Vec<_> = rehydrated - .state() - .nodes - .values() - .filter(|node| node.template_id.as_deref() == Some(&aggregator.id)) - .collect(); - assert_eq!(agg_nodes.len(), 1); - assert_eq!(agg_nodes[0].status, NodeStatus::Completed); - assert!(agg_nodes[0].assignments.contains_key("results")); - } - - #[test] - fn test_rehydrate_timeline_ordering_preserved() { - let mut dag = DAG::default(); - let mut actions = Vec::new(); - for i in 0..4 { - actions.push(action_node( - &format!("action{i}"), - &format!("step{i}"), - HashMap::new(), - vec![format!("x{i}")], - ActionNodeOptions::default(), - )); - } - for action in &actions { - dag.add_node(waymark_dag::DAGNode::ActionCall(action.clone())); - } - for i in 0..actions.len() - 1 { - 
dag.add_edge(DAGEdge::state_machine( - actions[i].id.clone(), - actions[i + 1].id.clone(), - )); - } - - let dag = Arc::new(dag); - let mut state = RunnerState::new(Some(dag.clone()), None, None, false); - let mut exec_nodes: Vec = Vec::new(); - exec_nodes.push( - state - .queue_template_node(&actions[0].id, None) - .expect("queue"), - ); - let mut executor = RunnerExecutor::new(dag.clone(), state, HashMap::new(), None); - - for i in 0..3 { - executor.set_action_result( - exec_nodes.last().unwrap().node_id, - Value::Number((i * 10).into()), - ); - let step = executor - .increment(&[exec_nodes.last().unwrap().node_id]) - .expect("increment"); - if !step.actions.is_empty() { - exec_nodes.push(step.actions[0].clone()); - } - } - - let (nodes_snap, edges_snap, results_snap) = - snapshot_state(executor.state(), executor.action_results()); - let rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); - - let orig_timeline = executor.state().timeline.clone(); - let rehy_timeline = rehydrated.state().timeline.clone(); - assert_eq!(orig_timeline.len(), rehy_timeline.len()); - assert_eq!( - orig_timeline.iter().collect::>(), - rehy_timeline.iter().collect::>() - ); - } - - #[test] - fn test_rehydrate_ready_queue_rebuilt_for_running_actions() { - let mut dag = DAG::default(); - let action1 = action_node( - "action1", - "first", - HashMap::new(), - vec!["x".to_string()], - ActionNodeOptions::default(), - ); - let action2 = action_node( - "action2", - "second", - HashMap::new(), - vec!["y".to_string()], - ActionNodeOptions::default(), - ); - - dag.add_node(waymark_dag::DAGNode::ActionCall(action1.clone())); - dag.add_node(waymark_dag::DAGNode::ActionCall(action2.clone())); - dag.add_edge(DAGEdge::state_machine( - action1.id.clone(), - action2.id.clone(), - )); - - let dag = Arc::new(dag); - let mut state = RunnerState::new(Some(dag.clone()), None, None, false); - let exec1 = state.queue_template_node(&action1.id, None).expect("queue"); - - let mut 
action_results = HashMap::new(); - action_results.insert(exec1.node_id, Value::Number(50.into())); - let mut executor = RunnerExecutor::new(dag.clone(), state, action_results, None); - let step = executor.increment(&[exec1.node_id]).expect("increment"); - let exec2 = step.actions[0].clone(); - - let (nodes_snap, edges_snap, results_snap) = - snapshot_state(executor.state(), executor.action_results()); - let rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); - - let queued_nodes: Vec<_> = rehydrated - .state() - .nodes - .values() - .filter(|node| node.status == NodeStatus::Queued) - .collect(); - assert!(queued_nodes.is_empty()); - let running_nodes: Vec<_> = rehydrated - .state() - .nodes - .values() - .filter(|node| node.status == NodeStatus::Running) - .collect(); - assert_eq!(running_nodes.len(), 1); - assert_eq!(running_nodes[0].node_id, exec2.node_id); - assert!( - rehydrated.state().ready_queue.is_empty(), - "rehydration should not requeue running action nodes" - ); - } -} diff --git a/crates/waymark/src/waymark_core/runner/expression_evaluator.rs b/crates/waymark/src/waymark_core/runner/expression_evaluator.rs deleted file mode 100644 index 96d908e2..00000000 --- a/crates/waymark/src/waymark_core/runner/expression_evaluator.rs +++ /dev/null @@ -1,1058 +0,0 @@ -use std::cell::RefCell; -use std::collections::{HashMap, HashSet}; -use std::rc::Rc; - -use serde_json::Value; -use uuid::Uuid; - -use crate::messages::ast as ir; -use crate::observability::obs; -use crate::waymark_core::runner::state::{ - ActionCallSpec, ActionResultValue, BinaryOpValue, DictEntryValue, DictValue, DotValue, - FunctionCallValue, IndexValue, ListValue, LiteralValue, UnaryOpValue, VariableValue, - literal_value, -}; -use crate::waymark_core::runner::value_visitor::{ValueExpr, ValueExprEvaluator}; -use waymark_dag::{DAGEdge, EdgeType}; - -use super::{RunnerExecutor, RunnerExecutorError}; - -impl RunnerExecutor { - /// Convert a pure IR expression into a 
ValueExpr without side effects. - pub(super) fn expr_to_value(expr: &ir::Expr) -> Result { - match expr.kind.as_ref() { - Some(ir::expr::Kind::Literal(lit)) => Ok(ValueExpr::Literal(LiteralValue { - value: literal_value(lit), - })), - Some(ir::expr::Kind::Variable(var)) => Ok(ValueExpr::Variable(VariableValue { - name: var.name.clone(), - })), - Some(ir::expr::Kind::BinaryOp(op)) => { - let left = op - .left - .as_ref() - .ok_or_else(|| RunnerExecutorError("binary op missing left".to_string()))?; - let right = op - .right - .as_ref() - .ok_or_else(|| RunnerExecutorError("binary op missing right".to_string()))?; - Ok(ValueExpr::BinaryOp(BinaryOpValue { - left: Box::new(Self::expr_to_value(left)?), - op: op.op, - right: Box::new(Self::expr_to_value(right)?), - })) - } - Some(ir::expr::Kind::UnaryOp(op)) => { - let operand = op - .operand - .as_ref() - .ok_or_else(|| RunnerExecutorError("unary op missing operand".to_string()))?; - Ok(ValueExpr::UnaryOp(UnaryOpValue { - op: op.op, - operand: Box::new(Self::expr_to_value(operand)?), - })) - } - Some(ir::expr::Kind::List(list)) => { - let mut elements = Vec::new(); - for item in &list.elements { - elements.push(Self::expr_to_value(item)?); - } - Ok(ValueExpr::List(ListValue { elements })) - } - Some(ir::expr::Kind::Dict(dict_expr)) => { - let mut entries = Vec::new(); - for entry in &dict_expr.entries { - let key = entry - .key - .as_ref() - .ok_or_else(|| RunnerExecutorError("dict entry missing key".to_string()))?; - let value = entry.value.as_ref().ok_or_else(|| { - RunnerExecutorError("dict entry missing value".to_string()) - })?; - entries.push(DictEntryValue { - key: Self::expr_to_value(key)?, - value: Self::expr_to_value(value)?, - }); - } - Ok(ValueExpr::Dict(DictValue { entries })) - } - Some(ir::expr::Kind::Index(index)) => { - let object = index.object.as_ref().ok_or_else(|| { - RunnerExecutorError("index access missing object".to_string()) - })?; - let index_expr = index - .index - .as_ref() - .ok_or_else(|| 
RunnerExecutorError("index access missing index".to_string()))?; - Ok(ValueExpr::Index(IndexValue { - object: Box::new(Self::expr_to_value(object)?), - index: Box::new(Self::expr_to_value(index_expr)?), - })) - } - Some(ir::expr::Kind::Dot(dot)) => { - let object = dot - .object - .as_ref() - .ok_or_else(|| RunnerExecutorError("dot access missing object".to_string()))?; - Ok(ValueExpr::Dot(DotValue { - object: Box::new(Self::expr_to_value(object)?), - attribute: dot.attribute.clone(), - })) - } - Some(ir::expr::Kind::FunctionCall(call)) => { - let mut args = Vec::new(); - for arg in &call.args { - args.push(Self::expr_to_value(arg)?); - } - let mut kwargs = HashMap::new(); - for kw in &call.kwargs { - if let Some(value) = &kw.value { - kwargs.insert(kw.name.clone(), Self::expr_to_value(value)?); - } - } - let global_fn = if call.global_function != 0 { - Some(call.global_function) - } else { - None - }; - Ok(ValueExpr::FunctionCall(FunctionCallValue { - name: call.name.clone(), - args, - kwargs, - global_function: global_fn, - })) - } - Some( - ir::expr::Kind::ActionCall(_) - | ir::expr::Kind::ParallelExpr(_) - | ir::expr::Kind::SpreadExpr(_), - ) => Err(RunnerExecutorError( - "action/spread calls not allowed in guard expressions".to_string(), - )), - None => Ok(ValueExpr::Literal(LiteralValue { value: Value::Null })), - } - } - - /// Evaluate a guard expression using current symbolic assignments. - pub(super) fn evaluate_guard( - &self, - expr: Option<&ir::Expr>, - ) -> Result { - let expr = match expr { - Some(expr) => expr, - None => return Ok(false), - }; - let value_expr = self.state().materialize_value(Self::expr_to_value(expr)?); - let result = self.evaluate_value_expr(&value_expr)?; - Ok(is_truthy(&result)) - } - - /// Resolve an action's symbolic kwargs to concrete Python values. - /// - /// Example: - /// - spec.kwargs={"value": VariableValue("x")} - /// - with x assigned to LiteralValue(10), returns {"value": 10}. 
- #[obs] - pub fn resolve_action_kwargs( - &self, - node_id: Uuid, - action: &ActionCallSpec, - ) -> Result, RunnerExecutorError> { - let mut resolved = HashMap::new(); - for (name, expr) in &action.kwargs { - resolved.insert( - name.clone(), - self.evaluate_value_expr_for_node(expr, Some(node_id))?, - ); - } - Ok(resolved) - } - - /// Evaluate a ValueExpr into a concrete Python value. - #[obs] - pub(super) fn evaluate_value_expr( - &self, - expr: &ValueExpr, - ) -> Result { - self.evaluate_value_expr_for_node(expr, None) - } - - fn evaluate_value_expr_for_node( - &self, - expr: &ValueExpr, - current_node_id: Option, - ) -> Result { - let stack = Rc::new(RefCell::new(HashSet::new())); - let resolve_variable = { - let stack = stack.clone(); - let this = self; - move |name: &str| { - this.evaluate_variable_with_context(current_node_id, name, stack.clone()) - } - }; - let resolve_action_result = { - let this = self; - move |value: &ActionResultValue| this.resolve_action_result(value) - }; - let resolve_function_call = { - let this = self; - move |value: &FunctionCallValue, args, kwargs| { - this.evaluate_function_call(value, args, kwargs) - } - }; - let apply_binary = |op, left, right| Self::apply_binary(op, left, right); - let apply_unary = |op, operand| Self::apply_unary(op, operand); - let error_factory = |message: &str| RunnerExecutorError(message.to_string()); - let evaluator = ValueExprEvaluator::new( - &resolve_variable, - &resolve_action_result, - &resolve_function_call, - &apply_binary, - &apply_unary, - &error_factory, - ); - evaluator.visit(expr) - } - - fn find_variable_source_node(&self, current_node_id: Uuid, name: &str) -> Option { - let timeline_index: HashMap = self - .state() - .timeline - .iter() - .enumerate() - .map(|(idx, node_id)| (*node_id, idx)) - .collect(); - - self.state() - .edges - .iter() - .filter(|edge| edge.edge_type == EdgeType::DataFlow && edge.target == current_node_id) - .map(|edge| edge.source) - .filter(|source| { - self.state() 
- .nodes - .get(source) - .map(|node| node.assignments.contains_key(name)) - .unwrap_or(false) - }) - .max_by_key(|source| timeline_index.get(source).copied().unwrap_or(0)) - } - - fn evaluate_variable_with_context( - &self, - current_node_id: Option, - name: &str, - stack: Rc>>, - ) -> Result { - let node_id = current_node_id - .and_then(|node_id| self.find_variable_source_node(node_id, name)) - .or_else(|| self.state().latest_assignment(name)) - .ok_or_else(|| RunnerExecutorError(format!("variable not found: {name}")))?; - self.evaluate_assignment(node_id, name, stack) - } - - pub(super) fn evaluate_assignment( - &self, - node_id: Uuid, - target: &str, - stack: Rc>>, - ) -> Result { - let key = (node_id, target.to_string()); - if let Some(value) = self.eval_cache_get(&key) { - return Ok(value); - } - if stack.borrow().contains(&key) { - return Err(RunnerExecutorError(format!( - "recursive assignment detected for {target}" - ))); - } - - let node = self - .state() - .nodes - .get(&node_id) - .ok_or_else(|| RunnerExecutorError(format!("missing assignment for {target}")))?; - let expr = node - .assignments - .get(target) - .ok_or_else(|| RunnerExecutorError(format!("missing assignment for {target}")))?; - - stack.borrow_mut().insert(key.clone()); - let resolve_variable = { - let stack = stack.clone(); - let this = self; - move |name: &str| { - this.evaluate_variable_with_context(Some(node_id), name, stack.clone()) - } - }; - let resolve_action_result = { - let this = self; - move |value: &ActionResultValue| this.resolve_action_result(value) - }; - let resolve_function_call = { - let this = self; - move |value: &FunctionCallValue, args, kwargs| { - this.evaluate_function_call(value, args, kwargs) - } - }; - let apply_binary = |op, left, right| Self::apply_binary(op, left, right); - let apply_unary = |op, operand| Self::apply_unary(op, operand); - let error_factory = |message: &str| RunnerExecutorError(message.to_string()); - let evaluator = ValueExprEvaluator::new( - 
&resolve_variable, - &resolve_action_result, - &resolve_function_call, - &apply_binary, - &apply_unary, - &error_factory, - ); - let value = evaluator.visit(expr)?; - stack.borrow_mut().remove(&key); - self.eval_cache_insert(key, value.clone()); - Ok(value) - } - - pub(super) fn resolve_action_result( - &self, - expr: &ActionResultValue, - ) -> Result { - let value = self - .action_results() - .get(&expr.node_id) - .cloned() - .ok_or_else(|| { - RunnerExecutorError(format!("missing action result for {}", expr.node_id)) - })?; - if let Some(idx) = expr.result_index { - if let Value::Array(items) = value { - let idx = idx as usize; - return items.get(idx).cloned().ok_or_else(|| { - RunnerExecutorError(format!( - "action result for {} has no index {}", - expr.node_id, idx - )) - }); - } - return Err(RunnerExecutorError(format!( - "action result for {} has no index {}", - expr.node_id, idx - ))); - } - Ok(value) - } - - pub(super) fn evaluate_function_call( - &self, - expr: &FunctionCallValue, - args: Vec, - kwargs: HashMap, - ) -> Result { - if let Some(global_fn) = expr.global_function - && global_fn != ir::GlobalFunction::Unspecified as i32 - { - return self.evaluate_global_function(global_fn, args, kwargs); - } - Err(RunnerExecutorError(format!( - "cannot evaluate non-global function call: {}", - expr.name - ))) - } - - pub(super) fn evaluate_global_function( - &self, - global_function: i32, - args: Vec, - kwargs: HashMap, - ) -> Result { - let error = executor_error; - match ir::GlobalFunction::try_from(global_function).ok() { - Some(ir::GlobalFunction::Range) => Ok(range_from_args(&args).into()), - Some(ir::GlobalFunction::Len) => { - if let Some(first) = args.first() { - return Ok(Value::Number(len_of_value(first, error)?)); - } - if let Some(items) = kwargs.get("items") { - return Ok(Value::Number(len_of_value(items, error)?)); - } - Err(RunnerExecutorError("len() missing argument".to_string())) - } - Some(ir::GlobalFunction::Enumerate) => { - let items = if 
let Some(first) = args.first() { - first.clone() - } else if let Some(items) = kwargs.get("items") { - items.clone() - } else { - return Err(RunnerExecutorError( - "enumerate() missing argument".to_string(), - )); - }; - let list = match items { - Value::Array(items) => items, - _ => return Err(RunnerExecutorError("enumerate() expects list".to_string())), - }; - let pairs: Vec = list - .into_iter() - .enumerate() - .map(|(idx, item)| Value::Array(vec![Value::Number((idx as i64).into()), item])) - .collect(); - Ok(Value::Array(pairs)) - } - Some(ir::GlobalFunction::Isexception) => { - if let Some(first) = args.first() { - return Ok(Value::Bool(is_exception_value(first))); - } - if let Some(value) = kwargs.get("value") { - return Ok(Value::Bool(is_exception_value(value))); - } - Err(RunnerExecutorError( - "isexception() missing argument".to_string(), - )) - } - Some(ir::GlobalFunction::Unspecified) | None => Err(RunnerExecutorError( - "global function unspecified".to_string(), - )), - } - } - - pub(super) fn apply_binary( - op: i32, - left: Value, - right: Value, - ) -> Result { - let error = executor_error; - match ir::BinaryOperator::try_from(op).ok() { - Some(ir::BinaryOperator::BinaryOpOr) => { - if is_truthy(&left) { - Ok(left) - } else { - Ok(right) - } - } - Some(ir::BinaryOperator::BinaryOpAnd) => { - if is_truthy(&left) { - Ok(right) - } else { - Ok(left) - } - } - Some(ir::BinaryOperator::BinaryOpEq) => Ok(Value::Bool(left == right)), - Some(ir::BinaryOperator::BinaryOpNe) => Ok(Value::Bool(left != right)), - Some(ir::BinaryOperator::BinaryOpLt) => { - compare_values(left, right, |a, b| a < b, error) - } - Some(ir::BinaryOperator::BinaryOpLe) => { - compare_values(left, right, |a, b| a <= b, error) - } - Some(ir::BinaryOperator::BinaryOpGt) => { - compare_values(left, right, |a, b| a > b, error) - } - Some(ir::BinaryOperator::BinaryOpGe) => { - compare_values(left, right, |a, b| a >= b, error) - } - Some(ir::BinaryOperator::BinaryOpIn) => 
Ok(Value::Bool(value_in(&left, &right))), - Some(ir::BinaryOperator::BinaryOpNotIn) => Ok(Value::Bool(!value_in(&left, &right))), - Some(ir::BinaryOperator::BinaryOpAdd) => add_values(left, right, error), - Some(ir::BinaryOperator::BinaryOpSub) => { - numeric_op(left, right, |a, b| a - b, true, error) - } - Some(ir::BinaryOperator::BinaryOpMul) => { - numeric_op(left, right, |a, b| a * b, true, error) - } - Some(ir::BinaryOperator::BinaryOpDiv) => { - numeric_op(left, right, |a, b| a / b, false, error) - } - Some(ir::BinaryOperator::BinaryOpFloorDiv) => { - numeric_op(left, right, |a, b| (a / b).floor(), true, error) - } - Some(ir::BinaryOperator::BinaryOpMod) => { - numeric_op(left, right, |a, b| a % b, true, error) - } - Some(ir::BinaryOperator::BinaryOpUnspecified) | None => Err(RunnerExecutorError( - "binary operator unspecified".to_string(), - )), - } - } - - pub(super) fn apply_unary(op: i32, operand: Value) -> Result { - match ir::UnaryOperator::try_from(op).ok() { - Some(ir::UnaryOperator::UnaryOpNeg) => { - if let Some(value) = int_value(&operand) { - return Ok(Value::Number((-value).into())); - } - match operand.as_f64() { - Some(value) => Ok(Value::Number( - serde_json::Number::from_f64(-value) - .unwrap_or_else(|| serde_json::Number::from(0)), - )), - None => Err(RunnerExecutorError("unary neg expects number".to_string())), - } - } - Some(ir::UnaryOperator::UnaryOpNot) => Ok(Value::Bool(!is_truthy(&operand))), - Some(ir::UnaryOperator::UnaryOpUnspecified) | None => Err(RunnerExecutorError( - "unary operator unspecified".to_string(), - )), - } - } - - pub(super) fn exception_matches(&self, edge: &DAGEdge, exception_value: &Value) -> bool { - let exception_types = match &edge.exception_types { - Some(types) => types, - None => return false, - }; - if exception_types.is_empty() { - return true; - } - let exc_name = match exception_value { - Value::Object(map) => map - .get("type") - .and_then(|value| value.as_str()) - .map(|value| value.to_string()), - _ 
=> None, - }; - if let Some(name) = exc_name { - return exception_types.iter().any(|value| value == &name); - } - false - } -} - -fn executor_error(message: &'static str) -> RunnerExecutorError { - RunnerExecutorError(message.to_string()) -} - -pub(crate) fn int_value(value: &Value) -> Option { - value - .as_i64() - .or_else(|| value.as_u64().and_then(|value| i64::try_from(value).ok())) -} - -pub(crate) fn numeric_op( - left: Value, - right: Value, - op: impl Fn(f64, f64) -> f64, - prefer_int: bool, - error: fn(&'static str) -> E, -) -> Result { - let left_num = left - .as_f64() - .ok_or_else(|| error("numeric operation expects number"))?; - let right_num = right - .as_f64() - .ok_or_else(|| error("numeric operation expects number"))?; - let result = op(left_num, right_num); - if prefer_int && int_value(&left).is_some() && int_value(&right).is_some() && result.is_finite() - { - let rounded = result.round(); - if (result - rounded).abs() < 1e-9 - && rounded >= (i64::MIN as f64) - && rounded <= (i64::MAX as f64) - { - return Ok(Value::Number((rounded as i64).into())); - } - } - Ok(Value::Number( - serde_json::Number::from_f64(result).unwrap_or_else(|| serde_json::Number::from(0)), - )) -} - -pub(crate) fn add_values( - left: Value, - right: Value, - error: fn(&'static str) -> E, -) -> Result { - if let (Value::Array(mut left), Value::Array(right)) = (left.clone(), right.clone()) { - left.extend(right); - return Ok(Value::Array(left)); - } - if let (Some(left), Some(right)) = (left.as_str(), right.as_str()) { - return Ok(Value::String(format!("{left}{right}"))); - } - numeric_op(left, right, |a, b| a + b, true, error) -} - -pub(crate) fn compare_values( - left: Value, - right: Value, - op: impl Fn(f64, f64) -> bool, - error: fn(&'static str) -> E, -) -> Result { - let left = left - .as_f64() - .ok_or_else(|| error("comparison expects number"))?; - let right = right - .as_f64() - .ok_or_else(|| error("comparison expects number"))?; - Ok(Value::Bool(op(left, right))) -} 
- -pub(crate) fn value_in(value: &Value, container: &Value) -> bool { - match container { - Value::Array(items) => items.iter().any(|item| item == value), - Value::Object(map) => value - .as_str() - .map(|key| map.contains_key(key)) - .unwrap_or(false), - Value::String(text) => value - .as_str() - .map(|needle| text.contains(needle)) - .unwrap_or(false), - _ => false, - } -} - -pub(crate) fn is_truthy(value: &Value) -> bool { - match value { - Value::Null => false, - Value::Bool(value) => *value, - Value::Number(number) => number.as_f64().map(|value| value != 0.0).unwrap_or(false), - Value::String(value) => !value.is_empty(), - Value::Array(values) => !values.is_empty(), - Value::Object(map) => !map.is_empty(), - } -} - -pub(crate) fn is_exception_value(value: &Value) -> bool { - if let Value::Object(map) = value { - return map.contains_key("type") && map.contains_key("message"); - } - false -} - -pub(crate) fn len_of_value( - value: &Value, - error: fn(&'static str) -> E, -) -> Result { - let len = match value { - Value::Array(items) => items.len() as i64, - Value::String(text) => text.len() as i64, - Value::Object(map) => map.len() as i64, - _ => return Err(error("len() expects list, string, or dict")), - }; - Ok(len.into()) -} - -pub(crate) fn range_from_args(args: &[Value]) -> Vec { - let mut start = 0i64; - let mut end = 0i64; - let mut step = 1i64; - if args.len() == 1 { - end = args[0].as_i64().unwrap_or(0); - } else if args.len() >= 2 { - start = args[0].as_i64().unwrap_or(0); - end = args[1].as_i64().unwrap_or(0); - if args.len() >= 3 { - step = args[2].as_i64().unwrap_or(1); - } - } - if step == 0 { - return Vec::new(); - } - let mut values = Vec::new(); - if step > 0 { - let mut current = start; - while current < end { - values.push(Value::Number(current.into())); - current += step; - } - } else { - let mut current = start; - while current > end { - values.push(Value::Number(current.into())); - current += step; - } - } - values -} - -#[cfg(test)] -mod 
tests { - use std::cell::RefCell; - use std::collections::{HashMap, HashSet}; - use std::rc::Rc; - use std::sync::Arc; - - use uuid::Uuid; - - use super::*; - use crate::messages::ast as ir; - use crate::waymark_core::ir_parser::IRParser; - use crate::waymark_core::runner::RunnerState; - use crate::waymark_core::runner::state::{ - ActionCallSpec, ActionResultValue, BinaryOpValue, FunctionCallValue, LiteralValue, - VariableValue, - }; - use crate::waymark_core::runner::value_visitor::ValueExpr; - use waymark_dag::{DAG, DAGEdge}; - - fn parse_expr(source: &str) -> ir::Expr { - IRParser::new(" ") - .parse_expr(source) - .expect("parse expression") - } - - fn literal_int(value: i64) -> ValueExpr { - ValueExpr::Literal(LiteralValue { - value: Value::Number(value.into()), - }) - } - - fn empty_executor() -> RunnerExecutor { - let dag = Arc::new(DAG::default()); - let state = RunnerState::new(Some(Arc::clone(&dag)), None, None, false); - RunnerExecutor::new(dag, state, HashMap::new(), None) - } - - fn executor_with_assignment(name: &str, value: ValueExpr) -> RunnerExecutor { - let dag = Arc::new(DAG::default()); - let mut state = RunnerState::new(Some(Arc::clone(&dag)), None, None, false); - state - .record_assignment_value( - vec![name.to_string()], - value, - None, - Some("test assignment".to_string()), - ) - .expect("record assignment"); - RunnerExecutor::new(dag, state, HashMap::new(), None) - } - - #[test] - fn test_expr_to_value_happy_path() { - let expr = parse_expr("x + 2"); - let value = RunnerExecutor::expr_to_value(&expr).expect("convert expression"); - match value { - ValueExpr::BinaryOp(binary) => { - assert!(matches!(*binary.left, ValueExpr::Variable(_))); - assert!(matches!(*binary.right, ValueExpr::Literal(_))); - } - other => panic!("expected binary op, got {other:?}"), - } - } - - #[test] - fn test_evaluate_guard_happy_path() { - let executor = executor_with_assignment("x", literal_int(2)); - let guard = parse_expr("x > 1"); - let result = executor - 
.evaluate_guard(Some(&guard)) - .expect("evaluate guard"); - assert!(result); - } - - #[test] - fn test_resolve_action_kwargs_happy_path() { - let executor = executor_with_assignment("x", literal_int(10)); - let action = ActionCallSpec { - action_name: "double".to_string(), - module_name: Some("tests".to_string()), - kwargs: HashMap::from([( - "value".to_string(), - ValueExpr::Variable(VariableValue { - name: "x".to_string(), - }), - )]), - }; - let resolved = executor - .resolve_action_kwargs(Uuid::new_v4(), &action) - .expect("resolve kwargs"); - assert_eq!(resolved.get("value"), Some(&Value::Number(10.into()))); - } - - #[test] - fn test_resolve_action_kwargs_uses_data_flow_for_self_referential_targets() { - let dag = Arc::new(DAG::default()); - let mut state = RunnerState::new(Some(Arc::clone(&dag)), None, None, false); - state - .record_assignment_value( - vec!["current".to_string()], - literal_int(0), - None, - Some("current = 0".to_string()), - ) - .expect("record current"); - let action_result = state - .queue_action( - "increment", - Some(vec!["current".to_string()]), - Some(HashMap::from([( - "value".to_string(), - ValueExpr::Variable(VariableValue { - name: "current".to_string(), - }), - )])), - None, - None, - ) - .expect("queue increment"); - let action_node = state - .nodes - .get(&action_result.node_id) - .expect("action node") - .clone(); - let action_spec = action_node.action.expect("action spec"); - - let executor = RunnerExecutor::new(dag, state, HashMap::new(), None); - let resolved = executor - .resolve_action_kwargs(action_result.node_id, &action_spec) - .expect("resolve kwargs"); - assert_eq!(resolved.get("value"), Some(&Value::Number(0.into()))); - } - - #[test] - fn test_evaluate_value_expr_happy_path() { - let executor = executor_with_assignment("x", literal_int(3)); - let expr = ValueExpr::BinaryOp(crate::waymark_core::runner::state::BinaryOpValue { - left: Box::new(ValueExpr::Variable(VariableValue { - name: "x".to_string(), - })), - op: 
ir::BinaryOperator::BinaryOpAdd as i32, - right: Box::new(literal_int(1)), - }); - let value = executor - .evaluate_value_expr(&expr) - .expect("evaluate value expression"); - assert_eq!(value, Value::Number(4.into())); - } - - #[test] - fn test_evaluate_variable_happy_path() { - let executor = executor_with_assignment("value", literal_int(5)); - let stack = Rc::new(RefCell::new(HashSet::new())); - let value = executor - .evaluate_variable_with_context(None, "value", stack) - .expect("evaluate variable"); - assert_eq!(value, Value::Number(5.into())); - } - - #[test] - fn test_evaluate_assignment_happy_path() { - let executor = executor_with_assignment("value", literal_int(9)); - let node_id = executor - .state() - .latest_assignment("value") - .expect("latest assignment"); - let stack = Rc::new(RefCell::new(HashSet::new())); - let value = executor - .evaluate_assignment(node_id, "value", stack) - .expect("evaluate assignment"); - assert_eq!(value, Value::Number(9.into())); - } - - #[test] - fn test_evaluate_assignment_uses_data_flow_for_self_referential_updates() { - let dag = Arc::new(DAG::default()); - let mut state = RunnerState::new(Some(Arc::clone(&dag)), None, None, false); - state - .record_assignment_value( - vec!["count".to_string()], - literal_int(0), - None, - Some("count = 0".to_string()), - ) - .expect("record initial count"); - state - .record_assignment_value( - vec!["count".to_string()], - ValueExpr::BinaryOp(BinaryOpValue { - left: Box::new(ValueExpr::Variable(VariableValue { - name: "count".to_string(), - })), - op: ir::BinaryOperator::BinaryOpAdd as i32, - right: Box::new(literal_int(1)), - }), - None, - Some("count = count + 1".to_string()), - ) - .expect("record updated count"); - - let executor = RunnerExecutor::new(dag, state, HashMap::new(), None); - let node_id = executor - .state() - .latest_assignment("count") - .expect("latest assignment"); - let stack = Rc::new(RefCell::new(HashSet::new())); - let value = executor - 
.evaluate_assignment(node_id, "count", stack) - .expect("evaluate self-referential assignment"); - assert_eq!(value, Value::Number(1.into())); - } - - #[test] - fn test_resolve_action_result_happy_path() { - let mut executor = empty_executor(); - let action_id = Uuid::new_v4(); - executor.set_action_result( - action_id, - Value::Array(vec![Value::Number(7.into()), Value::Number(8.into())]), - ); - let result = executor - .resolve_action_result(&ActionResultValue { - node_id: action_id, - action_name: "fetch".to_string(), - iteration_index: None, - result_index: Some(1), - }) - .expect("resolve action result"); - assert_eq!(result, Value::Number(8.into())); - } - - #[test] - fn test_evaluate_function_call_happy_path() { - let executor = empty_executor(); - let value = executor - .evaluate_function_call( - &FunctionCallValue { - name: "len".to_string(), - args: Vec::new(), - kwargs: HashMap::new(), - global_function: Some(ir::GlobalFunction::Len as i32), - }, - vec![Value::Array(vec![Value::Null, Value::Null])], - HashMap::new(), - ) - .expect("evaluate function call"); - assert_eq!(value, Value::Number(2.into())); - } - - #[test] - fn test_evaluate_global_function_happy_path() { - let executor = empty_executor(); - let value = executor - .evaluate_global_function( - ir::GlobalFunction::Range as i32, - vec![Value::Number(1.into()), Value::Number(4.into())], - HashMap::new(), - ) - .expect("evaluate global function"); - assert_eq!( - value, - Value::Array(vec![ - Value::Number(1.into()), - Value::Number(2.into()), - Value::Number(3.into()) - ]) - ); - } - - #[test] - fn test_apply_binary_happy_path() { - let value = RunnerExecutor::apply_binary( - ir::BinaryOperator::BinaryOpAdd as i32, - Value::Number(2.into()), - Value::Number(3.into()), - ) - .expect("apply binary"); - assert_eq!(value, Value::Number(5.into())); - } - - #[test] - fn test_apply_unary_happy_path() { - let value = - RunnerExecutor::apply_unary(ir::UnaryOperator::UnaryOpNot as i32, Value::Bool(true)) - 
.expect("apply unary"); - assert_eq!(value, Value::Bool(false)); - } - - #[test] - fn test_exception_matches_happy_path() { - let executor = empty_executor(); - let edge = DAGEdge::state_machine_with_exception("a", "b", vec!["ValueError".to_string()]); - let exception = serde_json::json!({ - "type": "ValueError", - "message": "boom", - }); - assert!(executor.exception_matches(&edge, &exception)); - } - - #[test] - fn test_executor_error_happy_path() { - let error = executor_error("hello"); - assert_eq!(error.0, "hello"); - } - - #[test] - fn test_int_value_happy_path() { - let value = Value::Number(7_u64.into()); - assert_eq!(int_value(&value), Some(7)); - } - - #[test] - fn test_numeric_op_happy_path() { - let value = numeric_op( - Value::Number(10.into()), - Value::Number(3.into()), - |a, b| a + b, - true, - executor_error, - ) - .expect("numeric op"); - assert_eq!(value, Value::Number(13.into())); - } - - #[test] - fn test_add_values_happy_path() { - let value = add_values( - Value::String("hello ".to_string()), - Value::String("world".to_string()), - executor_error, - ) - .expect("add values"); - assert_eq!(value, Value::String("hello world".to_string())); - } - - #[test] - fn test_compare_values_happy_path() { - let value = compare_values( - Value::Number(3.into()), - Value::Number(5.into()), - |a, b| a < b, - executor_error, - ) - .expect("compare values"); - assert_eq!(value, Value::Bool(true)); - } - - #[test] - fn test_value_in_happy_path() { - let container = Value::Array(vec![Value::Number(1.into()), Value::Number(2.into())]); - assert!(value_in(&Value::Number(2.into()), &container)); - } - - #[test] - fn test_is_truthy_happy_path() { - assert!(is_truthy(&Value::String("non-empty".to_string()))); - } - - #[test] - fn test_is_exception_value_happy_path() { - let value = serde_json::json!({ - "type": "RuntimeError", - "message": "bad", - }); - assert!(is_exception_value(&value)); - } - - #[test] - fn test_len_of_value_happy_path() { - let value = 
Value::Array(vec![Value::Null, Value::Null, Value::Null]); - let len = len_of_value(&value, executor_error).expect("length"); - assert_eq!(len.as_i64(), Some(3)); - } - - #[test] - fn test_range_from_args_happy_path() { - let values = range_from_args(&[ - Value::Number(0.into()), - Value::Number(5.into()), - Value::Number(2.into()), - ]); - assert_eq!( - values, - vec![ - Value::Number(0.into()), - Value::Number(2.into()), - Value::Number(4.into()) - ] - ); - } -} diff --git a/crates/waymark/src/waymark_core/runner/mod.rs b/crates/waymark/src/waymark_core/runner/mod.rs deleted file mode 100644 index 4e7a491d..00000000 --- a/crates/waymark/src/waymark_core/runner/mod.rs +++ /dev/null @@ -1,19 +0,0 @@ -//! Runner utilities. - -pub mod executor; -pub mod expression_evaluator; -pub mod replay; -pub(crate) mod retry; -pub mod state; -pub(crate) mod synthetic_exceptions; -pub mod value_visitor; - -pub use executor::{ - DurableUpdates, ExecutorStep, RunnerExecutor, RunnerExecutorError, SleepRequest, -}; -pub use replay::{ReplayError, ReplayResult, replay_action_kwargs, replay_variables}; -pub use state::{ - ActionCallSpec, ActionResultValue, ExecutionEdge, ExecutionNode, NodeStatus, RunnerState, - RunnerStateError, format_value, -}; -pub use value_visitor::ValueExpr; diff --git a/crates/waymark/src/waymark_core/runner/replay.rs b/crates/waymark/src/waymark_core/runner/replay.rs deleted file mode 100644 index 246caace..00000000 --- a/crates/waymark/src/waymark_core/runner/replay.rs +++ /dev/null @@ -1,658 +0,0 @@ -//! Replay variable values from a runner state snapshot. 
- -use std::cell::RefCell; -use std::collections::{HashMap, HashSet}; -use std::rc::Rc; - -use serde_json::Value; -use uuid::Uuid; - -use crate::messages::ast as ir; -use crate::waymark_core::runner::expression_evaluator::{ - add_values, compare_values, int_value, is_exception_value, is_truthy, len_of_value, numeric_op, - range_from_args, value_in, -}; -use crate::waymark_core::runner::state::{ActionResultValue, FunctionCallValue, RunnerState}; -use crate::waymark_core::runner::value_visitor::{ValueExpr, ValueExprEvaluator}; -use waymark_dag::{EXCEPTION_SCOPE_VAR, EdgeType}; - -/// Raised when replay cannot reconstruct variable values. -#[derive(Debug, thiserror::Error)] -#[error("{0}")] -pub struct ReplayError(pub String); - -#[derive(Clone, Debug)] -pub struct ReplayResult { - pub variables: HashMap, -} - -/// Replay variable values from a runner state snapshot. -pub struct ReplayEngine<'a> { - state: &'a RunnerState, - action_results: &'a HashMap, - cache: RefCell>, - timeline: Vec, - index: HashMap, - incoming_data: HashMap>, -} - -impl<'a> ReplayEngine<'a> { - /// Prepare replay state derived from a runner snapshot. - /// - /// We precompute a timeline index and incoming data-flow map so lookups are - /// O(1) during evaluation. - /// - /// Example: - /// - timeline = [node_a, node_b] - /// - index[node_b] == 1 and incoming data edges are pre-sorted. - pub fn new(state: &'a RunnerState, action_results: &'a HashMap) -> Self { - let timeline = if state.timeline.is_empty() { - state.nodes.keys().cloned().collect() - } else { - state.timeline.clone() - }; - let index = timeline - .iter() - .enumerate() - .map(|(idx, node_id)| (*node_id, idx)) - .collect(); - let incoming_data = build_incoming_data_map(state, &index); - Self { - state, - action_results, - cache: RefCell::new(HashMap::new()), - timeline, - index, - incoming_data, - } - } - - /// Replay variable values by scanning assignments from newest to oldest. 
- /// - /// We walk the timeline in reverse to capture the latest assignment for each - /// variable and skip older definitions once a value is known. This mirrors - /// "last write wins" semantics while avoiding redundant evaluation work. - /// - /// Example: - /// - x = 1 - /// - x = 2 - /// Reverse traversal yields x=2 without evaluating the older assignment. - pub fn replay_variables(&self) -> Result { - let mut variables: HashMap = HashMap::new(); - for node_id in self.timeline.iter().rev() { - let node = match self.state.nodes.get(node_id) { - Some(node) => node, - None => continue, - }; - if node.assignments.is_empty() { - continue; - } - for target in node.assignments.keys() { - if variables.contains_key(target) { - continue; - } - let value = self.evaluate_assignment( - *node_id, - target, - Rc::new(RefCell::new(HashSet::new())), - )?; - variables.insert(target.clone(), value); - } - } - Ok(ReplayResult { variables }) - } - - /// Replay concrete kwargs for an action execution node. - /// - /// This resolves symbolic kwargs from the action node in the context of - /// the node's incoming data-flow edges. - pub fn replay_action_kwargs( - &self, - node_id: Uuid, - ) -> Result, ReplayError> { - let node = self - .state - .nodes - .get(&node_id) - .ok_or_else(|| ReplayError(format!("action node not found: {node_id}")))?; - let action = node - .action - .as_ref() - .ok_or_else(|| ReplayError(format!("node is not an action call: {node_id}")))?; - let mut resolved = HashMap::new(); - for (name, expr) in &action.kwargs { - let value = self.evaluate_value_expr_at_node(node_id, expr)?; - resolved.insert(name.clone(), value); - } - Ok(resolved) - } - - /// Evaluate a single assignment expression with cycle detection. - /// - /// We memoize evaluated (node, target) pairs and guard against recursive - /// references by tracking a stack of active evaluations. - /// - /// Example: - /// - x = y + 1 - /// - y = 2 - /// Evaluating x resolves y first, then computes x. 
- fn evaluate_assignment( - &self, - node_id: Uuid, - target: &str, - stack: Rc>>, - ) -> Result { - let key = (node_id, target.to_string()); - if let Some(value) = self.cache.borrow().get(&key) { - return Ok(value.clone()); - } - if stack.borrow().contains(&key) { - return Err(ReplayError(format!( - "recursive assignment detected for {target} in {node_id}" - ))); - } - - let node = - self.state.nodes.get(&node_id).ok_or_else(|| { - ReplayError(format!("missing assignment for {target} in {node_id}")) - })?; - let expr = node - .assignments - .get(target) - .ok_or_else(|| ReplayError(format!("missing assignment for {target} in {node_id}")))?; - - stack.borrow_mut().insert(key.clone()); - let resolve_variable = { - let stack = stack.clone(); - let this = self; - move |name: &str| this.resolve_variable(node_id, name, stack.clone()) - }; - let resolve_action_result = { - let this = self; - move |value: &ActionResultValue| this.resolve_action_result(value) - }; - let resolve_function_call = { - let this = self; - move |value: &FunctionCallValue, args, kwargs| { - this.evaluate_function_call(value, args, kwargs) - } - }; - let apply_binary = |op, left, right| apply_binary(op, left, right); - let apply_unary = |op, operand| apply_unary(op, operand); - let error_factory = |message: &str| ReplayError(message.to_string()); - let evaluator = ValueExprEvaluator::new( - &resolve_variable, - &resolve_action_result, - &resolve_function_call, - &apply_binary, - &apply_unary, - &error_factory, - ); - let value = evaluator.visit(expr)?; - stack.borrow_mut().remove(&key); - self.cache.borrow_mut().insert(key, value.clone()); - Ok(value) - } - - fn evaluate_value_expr_at_node( - &self, - node_id: Uuid, - expr: &ValueExpr, - ) -> Result { - let stack = Rc::new(RefCell::new(HashSet::new())); - let resolve_variable = { - let stack = stack.clone(); - let this = self; - move |name: &str| this.resolve_variable(node_id, name, stack.clone()) - }; - let resolve_action_result = { - let this = 
self; - move |value: &ActionResultValue| this.resolve_action_result(value) - }; - let resolve_function_call = { - let this = self; - move |value: &FunctionCallValue, args, kwargs| { - this.evaluate_function_call(value, args, kwargs) - } - }; - let apply_binary = |op, left, right| apply_binary(op, left, right); - let apply_unary = |op, operand| apply_unary(op, operand); - let error_factory = |message: &str| ReplayError(message.to_string()); - let evaluator = ValueExprEvaluator::new( - &resolve_variable, - &resolve_action_result, - &resolve_function_call, - &apply_binary, - &apply_unary, - &error_factory, - ); - evaluator.visit(expr) - } - - /// Resolve a variable reference via data-flow edges. - /// - /// This walks to the closest upstream definition and replays that - /// assignment for the requested variable. - /// - /// Example: - /// - action_1 defines x - /// - assign_2 uses x - /// Resolving x from assign_2 evaluates action_1's assignment. - fn resolve_variable( - &self, - current_node_id: Uuid, - name: &str, - stack: Rc>>, - ) -> Result { - let mut source_node_id = self.find_variable_source_node(current_node_id, name); - if source_node_id.is_none() && name == EXCEPTION_SCOPE_VAR { - source_node_id = self.state.latest_assignment(name); - } - let source_node_id = source_node_id.ok_or_else(|| { - ReplayError(format!("variable not found via data-flow edges: {name}")) - })?; - self.evaluate_assignment(source_node_id, name, stack) - } - - /// Find the nearest upstream node that defines the variable. - /// - /// We consult pre-sorted incoming data edges and ignore sources that are - /// later in the timeline than the current node. - /// - /// Example: - /// - if node_b comes after node_a, node_b cannot be a source for node_a. 
- fn find_variable_source_node(&self, current_node_id: Uuid, name: &str) -> Option { - let sources = self.incoming_data.get(¤t_node_id)?; - let current_idx = self - .index - .get(¤t_node_id) - .copied() - .unwrap_or(self.index.len()); - for source_id in sources { - if self.index.get(source_id).copied().unwrap_or(0) > current_idx { - continue; - } - if let Some(node) = self.state.nodes.get(source_id) - && node.assignments.contains_key(name) - { - return Some(*source_id); - } - } - None - } - - /// Fetch an action result by node id, handling indexed results. - /// - /// Example: - /// - result = @fetch() - /// - result[0] - /// The evaluator looks up the action result and returns index 0. - fn resolve_action_result(&self, expr: &ActionResultValue) -> Result { - let value = self - .action_results - .get(&expr.node_id) - .cloned() - .ok_or_else(|| ReplayError(format!("missing action result for {}", expr.node_id)))?; - if let Some(idx) = expr.result_index { - if let Value::Array(items) = value { - let idx = idx as usize; - return items.get(idx).cloned().ok_or_else(|| { - ReplayError(format!( - "action result for {} has no index {}", - expr.node_id, idx - )) - }); - } - return Err(ReplayError(format!( - "action result for {} has no index {}", - expr.node_id, idx - ))); - } - Ok(value) - } - - /// Evaluate a function call during replay. - /// - /// Only global functions are supported because user-defined functions are - /// not available in this replay context. 
- /// - /// Example: - /// - len(items=[1, 2]) -> 2 - fn evaluate_function_call( - &self, - expr: &FunctionCallValue, - args: Vec, - kwargs: HashMap, - ) -> Result { - if let Some(global_fn) = expr.global_function - && global_fn != ir::GlobalFunction::Unspecified as i32 - { - return evaluate_global_function(global_fn, args, kwargs); - } - Err(ReplayError(format!( - "cannot replay non-global function call: {}", - expr.name - ))) - } -} - -fn replay_error(message: &'static str) -> ReplayError { - ReplayError(message.to_string()) -} - -/// Apply a binary operator to replayed operands. -/// -/// Example: -/// - left=1, right=2, op=ADD -> 3 -fn apply_binary(op: i32, left: Value, right: Value) -> Result { - let error = replay_error; - match ir::BinaryOperator::try_from(op).ok() { - Some(ir::BinaryOperator::BinaryOpOr) => { - if is_truthy(&left) { - Ok(left) - } else { - Ok(right) - } - } - Some(ir::BinaryOperator::BinaryOpAnd) => { - if is_truthy(&left) { - Ok(right) - } else { - Ok(left) - } - } - Some(ir::BinaryOperator::BinaryOpEq) => Ok(Value::Bool(left == right)), - Some(ir::BinaryOperator::BinaryOpNe) => Ok(Value::Bool(left != right)), - Some(ir::BinaryOperator::BinaryOpLt) => compare_values(left, right, |a, b| a < b, error), - Some(ir::BinaryOperator::BinaryOpLe) => compare_values(left, right, |a, b| a <= b, error), - Some(ir::BinaryOperator::BinaryOpGt) => compare_values(left, right, |a, b| a > b, error), - Some(ir::BinaryOperator::BinaryOpGe) => compare_values(left, right, |a, b| a >= b, error), - Some(ir::BinaryOperator::BinaryOpIn) => Ok(Value::Bool(value_in(&left, &right))), - Some(ir::BinaryOperator::BinaryOpNotIn) => Ok(Value::Bool(!value_in(&left, &right))), - Some(ir::BinaryOperator::BinaryOpAdd) => add_values(left, right, error), - Some(ir::BinaryOperator::BinaryOpSub) => numeric_op(left, right, |a, b| a - b, true, error), - Some(ir::BinaryOperator::BinaryOpMul) => numeric_op(left, right, |a, b| a * b, true, error), - 
Some(ir::BinaryOperator::BinaryOpDiv) => { - numeric_op(left, right, |a, b| a / b, false, error) - } - Some(ir::BinaryOperator::BinaryOpFloorDiv) => { - numeric_op(left, right, |a, b| (a / b).floor(), true, error) - } - Some(ir::BinaryOperator::BinaryOpMod) => numeric_op(left, right, |a, b| a % b, true, error), - Some(ir::BinaryOperator::BinaryOpUnspecified) | None => { - Err(ReplayError("binary operator unspecified".to_string())) - } - } -} - -/// Apply a unary operator to a replayed operand. -/// -/// Example: -/// - op=NOT, operand=True -> False -fn apply_unary(op: i32, operand: Value) -> Result { - match ir::UnaryOperator::try_from(op).ok() { - Some(ir::UnaryOperator::UnaryOpNeg) => { - if let Some(value) = int_value(&operand) { - return Ok(Value::Number((-value).into())); - } - match operand.as_f64() { - Some(value) => Ok(Value::Number( - serde_json::Number::from_f64(-value) - .unwrap_or_else(|| serde_json::Number::from(0)), - )), - None => Err(ReplayError("unary neg expects number".to_string())), - } - } - Some(ir::UnaryOperator::UnaryOpNot) => Ok(Value::Bool(!is_truthy(&operand))), - Some(ir::UnaryOperator::UnaryOpUnspecified) | None => { - Err(ReplayError("unary operator unspecified".to_string())) - } - } -} - -/// Evaluate supported global helper functions. 
-/// -/// Example: -/// - range(0, 3) -> [0, 1, 2] -/// - isexception(value={"type": "...", "message": "..."}) -> True -fn evaluate_global_function( - global_function: i32, - args: Vec, - kwargs: HashMap, -) -> Result { - match ir::GlobalFunction::try_from(global_function).ok() { - Some(ir::GlobalFunction::Range) => Ok(range_from_args(&args).into()), - Some(ir::GlobalFunction::Len) => { - if let Some(first) = args.first() { - return Ok(Value::Number(len_of_value(first, replay_error)?)); - } - if let Some(items) = kwargs.get("items") { - return Ok(Value::Number(len_of_value(items, replay_error)?)); - } - Err(ReplayError("len() missing argument".to_string())) - } - Some(ir::GlobalFunction::Enumerate) => { - let items = if let Some(first) = args.first() { - first.clone() - } else if let Some(items) = kwargs.get("items") { - items.clone() - } else { - return Err(ReplayError("enumerate() missing argument".to_string())); - }; - let list = match items { - Value::Array(items) => items, - _ => return Err(ReplayError("enumerate() expects list".to_string())), - }; - let pairs: Vec = list - .into_iter() - .enumerate() - .map(|(idx, item)| Value::Array(vec![Value::Number((idx as i64).into()), item])) - .collect(); - Ok(Value::Array(pairs)) - } - Some(ir::GlobalFunction::Isexception) => { - if let Some(first) = args.first() { - return Ok(Value::Bool(is_exception_value(first))); - } - if let Some(value) = kwargs.get("value") { - return Ok(Value::Bool(is_exception_value(value))); - } - Err(ReplayError("isexception() missing argument".to_string())) - } - Some(ir::GlobalFunction::Unspecified) | None => { - Err(ReplayError("global function unspecified".to_string())) - } - } -} - -/// Build a reverse index of incoming data-flow edges. -/// -/// Sources are sorted from most-recent to oldest by timeline index so -/// lookups can short-circuit on the first viable definition. 
-fn build_incoming_data_map( - state: &RunnerState, - index: &HashMap, -) -> HashMap> { - let mut incoming: HashMap> = HashMap::new(); - for edge in &state.edges { - if edge.edge_type != EdgeType::DataFlow { - continue; - } - incoming.entry(edge.target).or_default().push(edge.source); - } - for (_target, sources) in incoming.iter_mut() { - sources.sort_by_key(|node_id| { - ( - index.get(node_id).copied().unwrap_or(0), - node_id.to_string(), - ) - }); - sources.reverse(); - } - incoming -} - -/// Replay variable values from a runner state snapshot. -/// -/// This is a convenience wrapper around ReplayEngine that prefers the latest -/// assignment for each variable and returns a fully materialized mapping. -pub fn replay_variables( - state: &RunnerState, - action_results: &HashMap, -) -> Result { - ReplayEngine::new(state, action_results).replay_variables() -} - -/// Replay concrete kwargs for a specific action node from a state snapshot. -pub fn replay_action_kwargs( - state: &RunnerState, - action_results: &HashMap, - node_id: Uuid, -) -> Result, ReplayError> { - ReplayEngine::new(state, action_results).replay_action_kwargs(node_id) -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::messages::ast as ir; - use crate::waymark_core::runner::state::{RunnerState, VariableValue}; - use crate::waymark_core::runner::value_visitor::ValueExpr; - - fn action_plus_two_expr() -> ir::Expr { - ir::Expr { - kind: Some(ir::expr::Kind::BinaryOp(Box::new(ir::BinaryOp { - left: Some(Box::new(ir::Expr { - kind: Some(ir::expr::Kind::Variable(ir::Variable { - name: "action_result".to_string(), - })), - span: None, - })), - op: ir::BinaryOperator::BinaryOpAdd as i32, - right: Some(Box::new(ir::Expr { - kind: Some(ir::expr::Kind::Literal(ir::Literal { - value: Some(ir::literal::Value::IntValue(2)), - })), - span: None, - })), - }))), - span: None, - } - } - - #[test] - fn test_replay_variables_resolves_action_results() { - let mut state = RunnerState::new(None, None, None, true); 
- - let action0 = state - .queue_action( - "action", - Some(vec!["action_result".to_string()]), - None, - None, - Some(0), - ) - .expect("queue action"); - let first_list = ir::Expr { - kind: Some(ir::expr::Kind::List(ir::ListExpr { - elements: vec![action_plus_two_expr()], - })), - span: None, - }; - state - .record_assignment(vec!["results".to_string()], &first_list, None, None) - .expect("record assignment"); - - let action1 = state - .queue_action( - "action", - Some(vec!["action_result".to_string()]), - None, - None, - Some(1), - ) - .expect("queue action"); - let second_list = ir::Expr { - kind: Some(ir::expr::Kind::List(ir::ListExpr { - elements: vec![action_plus_two_expr()], - })), - span: None, - }; - let concat_expr = ir::Expr { - kind: Some(ir::expr::Kind::BinaryOp(Box::new(ir::BinaryOp { - left: Some(Box::new(ir::Expr { - kind: Some(ir::expr::Kind::Variable(ir::Variable { - name: "results".to_string(), - })), - span: None, - })), - op: ir::BinaryOperator::BinaryOpAdd as i32, - right: Some(Box::new(second_list)), - }))), - span: None, - }; - state - .record_assignment(vec!["results".to_string()], &concat_expr, None, None) - .expect("record assignment"); - - let replayed = replay_variables( - &state, - &HashMap::from([ - (action0.node_id, Value::Number(1.into())), - (action1.node_id, Value::Number(2.into())), - ]), - ) - .expect("replay"); - - assert_eq!( - replayed.variables.get("results"), - Some(&Value::Array(vec![3.into(), 4.into()])), - ); - } - - #[test] - fn test_replay_action_kwargs_resolves_variable_inputs() { - let mut state = RunnerState::new(None, None, None, true); - - let number_expr = ir::Expr { - kind: Some(ir::expr::Kind::Literal(ir::Literal { - value: Some(ir::literal::Value::IntValue(7)), - })), - span: None, - }; - state - .record_assignment( - vec!["number".to_string()], - &number_expr, - None, - Some("number = 7".to_string()), - ) - .expect("record assignment"); - - let kwargs = HashMap::from([( - "value".to_string(), - 
ValueExpr::Variable(VariableValue { - name: "number".to_string(), - }), - )]); - - let action = state - .queue_action( - "compute", - Some(vec!["result".to_string()]), - Some(kwargs), - Some("tests".to_string()), - None, - ) - .expect("queue action"); - - let kwargs = replay_action_kwargs( - &state, - &HashMap::from([(action.node_id, Value::Number(14.into()))]), - action.node_id, - ) - .expect("replay kwargs"); - - assert_eq!(kwargs.get("value"), Some(&Value::Number(7.into()))); - } -} diff --git a/crates/waymark/src/waymark_core/runner/retry.rs b/crates/waymark/src/waymark_core/runner/retry.rs deleted file mode 100644 index e6fb4d70..00000000 --- a/crates/waymark/src/waymark_core/runner/retry.rs +++ /dev/null @@ -1,137 +0,0 @@ -//! Retry/timeout policy helpers shared by runner components. - -use crate::messages::ast as ir; - -#[derive(Clone, Debug)] -pub(crate) struct RetryDecision { - pub(crate) should_retry: bool, -} - -pub(crate) struct RetryPolicyEvaluator<'a> { - policies: &'a [ir::PolicyBracket], - exception_name: Option<&'a str>, -} - -fn is_synthetic_runtime_exception(exception_name: Option<&str>) -> bool { - matches!(exception_name, Some("ExecutorResume" | "ActionTimeout")) -} - -impl<'a> RetryPolicyEvaluator<'a> { - pub(crate) fn new(policies: &'a [ir::PolicyBracket], exception_name: Option<&'a str>) -> Self { - Self { - policies, - exception_name, - } - } - - pub(crate) fn decision(&self, attempt: i32) -> RetryDecision { - let mut max_retries: i32 = 0; - let mut matched_policy = false; - - for policy in self.policies { - let Some(ir::policy_bracket::Kind::Retry(retry)) = policy.kind.as_ref() else { - continue; - }; - let matches_exception = if retry.exception_types.is_empty() { - // Synthetic runtime exceptions (resume/timeout) can represent in-flight - // work that may still be running out-of-band. Require explicit opt-in - // exception filters before retrying these cases. 
- !is_synthetic_runtime_exception(self.exception_name) - } else if let Some(name) = self.exception_name { - retry.exception_types.iter().any(|value| value == name) - } else { - false - }; - if !matches_exception { - continue; - } - matched_policy = true; - max_retries = max_retries.max(retry.max_retries as i32); - } - - let should_retry = matched_policy && attempt - 1 < max_retries; - - RetryDecision { should_retry } - } -} - -pub(crate) fn timeout_seconds_from_policies(policies: &[ir::PolicyBracket]) -> Option { - let mut timeout_seconds: Option = None; - for policy in policies { - let Some(ir::policy_bracket::Kind::Timeout(timeout)) = policy.kind.as_ref() else { - continue; - }; - let seconds = timeout - .timeout - .as_ref() - .map(|duration| duration.seconds) - .unwrap_or(0); - if seconds == 0 { - continue; - } - timeout_seconds = Some(match timeout_seconds { - Some(existing) => existing.min(seconds), - None => seconds, - }); - } - timeout_seconds.map(|seconds| seconds.min(u64::from(u32::MAX)) as u32) -} - -#[cfg(test)] -mod tests { - use super::*; - - fn retry_policy(max_retries: u32, exception_types: Vec<&str>) -> ir::PolicyBracket { - ir::PolicyBracket { - kind: Some(ir::policy_bracket::Kind::Retry(ir::RetryPolicy { - exception_types: exception_types - .into_iter() - .map(ToString::to_string) - .collect(), - max_retries, - backoff: None, - })), - } - } - - fn timeout_policy(seconds: u64) -> ir::PolicyBracket { - ir::PolicyBracket { - kind: Some(ir::policy_bracket::Kind::Timeout(ir::TimeoutPolicy { - timeout: Some(ir::Duration { seconds }), - })), - } - } - - #[test] - fn retry_policy_evaluator_happy_path() { - let policies = vec![ - retry_policy(1, vec!["ValueError"]), - retry_policy(3, Vec::new()), - ]; - let decision = RetryPolicyEvaluator::new(&policies, Some("ValueError")).decision(2); - assert!(decision.should_retry); - - let exhausted = RetryPolicyEvaluator::new(&policies, Some("ValueError")).decision(4); - assert!(!exhausted.should_retry); - } - - 
#[test] - fn retry_policy_evaluator_wildcard_does_not_retry_synthetic_timeout() { - let policies = vec![retry_policy(3, Vec::new())]; - let decision = RetryPolicyEvaluator::new(&policies, Some("ActionTimeout")).decision(1); - assert!(!decision.should_retry); - } - - #[test] - fn retry_policy_evaluator_explicit_timeout_retry_happy_path() { - let policies = vec![retry_policy(2, vec!["ActionTimeout"])]; - let decision = RetryPolicyEvaluator::new(&policies, Some("ActionTimeout")).decision(1); - assert!(decision.should_retry); - } - - #[test] - fn timeout_seconds_from_policies_happy_path() { - let policies = vec![timeout_policy(30), timeout_policy(10), timeout_policy(0)]; - assert_eq!(timeout_seconds_from_policies(&policies), Some(10)); - } -} diff --git a/crates/waymark/src/waymark_core/runner/state.rs b/crates/waymark/src/waymark_core/runner/state.rs deleted file mode 100644 index dd9e68be..00000000 --- a/crates/waymark/src/waymark_core/runner/state.rs +++ /dev/null @@ -1,2201 +0,0 @@ -//! Execution-time DAG state with unrolled nodes and symbolic values. - -use std::collections::{HashMap, HashSet}; -use std::fmt; -use std::sync::Arc; - -use chrono::{DateTime, Utc}; -use serde::{Deserialize, Serialize}; -use uuid::Uuid; - -use crate::messages::ast as ir; -use crate::waymark_core::runner::expression_evaluator::is_truthy; -use crate::waymark_core::runner::value_visitor::{ - ValueExpr, collect_value_sources, resolve_value_tree, -}; -use waymark_dag::{ - ActionCallNode, AggregatorNode, AssignmentNode, DAG, DAGNode, EdgeType, FnCallNode, JoinNode, - ReturnNode, SleepNode, -}; - -/// Raised when the runner state cannot be updated safely. 
-#[derive(Debug, thiserror::Error)] -#[error("{0}")] -pub struct RunnerStateError(pub String); - -#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] -pub struct ActionCallSpec { - pub action_name: String, - pub module_name: Option, - pub kwargs: HashMap, -} - -#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] -pub struct LiteralValue { - pub value: serde_json::Value, -} - -#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] -pub struct VariableValue { - pub name: String, -} - -#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] -pub struct ActionResultValue { - pub node_id: Uuid, - pub action_name: String, - pub iteration_index: Option, - pub result_index: Option, -} - -impl ActionResultValue { - pub fn label(&self) -> String { - let mut label = self.action_name.clone(); - if let Some(idx) = self.iteration_index { - label = format!("{label}[{idx}]"); - } - if let Some(idx) = self.result_index { - label = format!("{label}[{idx}]"); - } - label - } -} - -#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] -pub struct BinaryOpValue { - pub left: Box, - pub op: i32, - pub right: Box, -} - -#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] -pub struct UnaryOpValue { - pub op: i32, - pub operand: Box, -} - -#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] -pub struct ListValue { - pub elements: Vec, -} - -#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] -pub struct DictEntryValue { - pub key: ValueExpr, - pub value: ValueExpr, -} - -#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] -pub struct DictValue { - pub entries: Vec, -} - -#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] -pub struct IndexValue { - pub object: Box, - pub index: Box, -} - -#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] -pub struct DotValue { - pub object: Box, - pub attribute: String, -} - -#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] -pub struct FunctionCallValue { - pub name: String, - 
pub args: Vec, - pub kwargs: HashMap, - pub global_function: Option, -} - -#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] -pub struct SpreadValue { - pub collection: Box, - pub loop_var: String, - pub action: ActionCallSpec, -} - -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] -#[serde(tag = "type", content = "data")] -pub enum NodeStatus { - Queued, - Running, - Completed, - Failed, -} - -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub enum ExecutionNodeType { - Input, - Output, - Assignment, - ActionCall, - FnCall, - Parallel, - Aggregator, - Branch, - Join, - Return, - Break, - Continue, - Sleep, - Expression, -} - -impl ExecutionNodeType { - pub fn as_str(&self) -> &'static str { - match self { - ExecutionNodeType::Input => "input", - ExecutionNodeType::Output => "output", - ExecutionNodeType::Assignment => "assignment", - ExecutionNodeType::ActionCall => "action_call", - ExecutionNodeType::FnCall => "fn_call", - ExecutionNodeType::Parallel => "parallel", - ExecutionNodeType::Aggregator => "aggregator", - ExecutionNodeType::Branch => "branch", - ExecutionNodeType::Join => "join", - ExecutionNodeType::Return => "return", - ExecutionNodeType::Break => "break", - ExecutionNodeType::Continue => "continue", - ExecutionNodeType::Sleep => "sleep", - ExecutionNodeType::Expression => "expression", - } - } -} - -impl TryFrom<&str> for ExecutionNodeType { - type Error = RunnerStateError; - - fn try_from(value: &str) -> Result { - match value { - "input" => Ok(ExecutionNodeType::Input), - "output" => Ok(ExecutionNodeType::Output), - "assignment" => Ok(ExecutionNodeType::Assignment), - "action_call" => Ok(ExecutionNodeType::ActionCall), - "fn_call" => Ok(ExecutionNodeType::FnCall), - "parallel" => Ok(ExecutionNodeType::Parallel), - "aggregator" => Ok(ExecutionNodeType::Aggregator), - "branch" => Ok(ExecutionNodeType::Branch), - "join" => Ok(ExecutionNodeType::Join), - "return" => Ok(ExecutionNodeType::Return), - "break" => 
Ok(ExecutionNodeType::Break), - "continue" => Ok(ExecutionNodeType::Continue), - "sleep" => Ok(ExecutionNodeType::Sleep), - "expression" => Ok(ExecutionNodeType::Expression), - _ => Err(RunnerStateError(format!( - "unknown execution node type: {value}" - ))), - } - } -} - -#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] -pub struct ExecutionNode { - pub node_id: Uuid, - pub node_type: String, - pub label: String, - pub status: NodeStatus, - pub template_id: Option, - pub targets: Vec, - pub action: Option, - pub value_expr: Option, - pub assignments: HashMap, - pub action_attempt: i32, - #[serde(default)] - pub started_at: Option>, - #[serde(default)] - pub completed_at: Option>, - #[serde(default)] - pub scheduled_at: Option>, -} - -impl ExecutionNode { - pub fn node_type_enum(&self) -> Result { - ExecutionNodeType::try_from(self.node_type.as_str()) - } - - pub fn is_action_call(&self) -> bool { - matches!( - ExecutionNodeType::try_from(self.node_type.as_str()), - Ok(ExecutionNodeType::ActionCall) - ) - } - - pub fn is_sleep(&self) -> bool { - matches!( - ExecutionNodeType::try_from(self.node_type.as_str()), - Ok(ExecutionNodeType::Sleep) - ) - } -} - -#[derive(Clone, Debug, Default)] -pub struct QueueNodeParams { - pub node_id: Option, - pub template_id: Option, - pub targets: Option>, - pub action: Option, - pub value_expr: Option, - pub scheduled_at: Option>, -} - -#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] -pub struct ExecutionEdge { - pub source: Uuid, - pub target: Uuid, - pub edge_type: EdgeType, -} - -/// Track queued/executed DAG nodes with an unrolled, symbolic state. -/// -/// Design overview: -/// - The runner state is not a variable heap; it is the runtime graph itself, -/// unrolled to the exact nodes that have been queued or executed. -/// - Each execution node stores assignments as symbolic expressions so action -/// results can be replayed later without having the concrete payloads. 
-/// - Data-flow edges encode which execution node supplies a value to another, -/// while state-machine edges encode execution ordering and control flow. This -/// mirrors how the ground truth IR->DAG functions. -/// -/// Expected usage: -/// - Callers queue nodes as the program executes (ie. the DAG template is -/// walked) so loops and spreads expand into explicit iterations. -/// - Callers never mutate variables directly; they record assignments on nodes -/// and let replay walk the graph to reconstruct values. -/// - Persisted state can be rehydrated only with nodes/edges. The constructor will -/// rebuild in-memory cache (like timeline ordering and latest assignment tracking). -/// -/// In short, RunnerState is the ground-truth runtime DAG: symbolic assignments -/// plus control/data edges, suitable for replay and visualization. -/// -/// Action nodes represent our "frontier" nodes. Because of how we construct the graph and always -/// greedily walk the state until we hit the next actions that are possible to run, we guarantee that -/// leaf nodes are only ever actions. -/// -/// Cycle walkthrough (mid-loop example): -/// Suppose we are partway through: -/// - results = [] -/// - for item in items: -/// - action_result = @action(item) -/// - results = results + [action_result + 1] -/// -/// On a single iteration update: -/// 1) The runner queues an action node for @action(item). -/// - A new execution node is created with a UUID id. -/// - Its assignments map action_result -> ActionResultValue(node_id). -/// - Data-flow edges are added from the node that last defined `item`. -/// 2) The runner queues the assignment node for results update. -/// - The RHS expression is materialized: -/// results + [action_result + 1] becomes a BinaryOpValue whose tree -/// contains the ActionResultValue from step (1), plus a LiteralValue(1). -/// - Data-flow edges are added from the prior results definition node and -/// from the action node created in step (1). 
-/// - Latest assignment tracking is updated so `results` now points to this -/// new execution node. -/// -/// After this iteration, the state graph has explicit nodes for the current -/// action and the results update. Subsequent iterations repeat the same -/// sequence, producing a chain of assignments where replay can reconstruct the -/// incremental `results` value by following data-flow edges. -#[derive(Clone, Debug, Serialize, Deserialize)] -pub struct RunnerState { - #[serde(skip, default)] - pub dag: Option>, - pub nodes: HashMap, - pub edges: HashSet, - pub ready_queue: Vec, - pub timeline: Vec, - link_queued_nodes: bool, - latest_assignments: HashMap, - graph_dirty: bool, -} - -impl RunnerState { - pub fn new( - dag: Option>, - nodes: Option>, - edges: Option>, - link_queued_nodes: bool, - ) -> Self { - let mut state = Self { - dag, - nodes: nodes.unwrap_or_default(), - edges: edges.unwrap_or_default(), - ready_queue: Vec::new(), - timeline: Vec::new(), - link_queued_nodes, - latest_assignments: HashMap::new(), - graph_dirty: false, - }; - if !state.nodes.is_empty() || !state.edges.is_empty() { - state.rehydrate_state(); - } - state - } - - pub(crate) fn set_link_queued_nodes(&mut self, value: bool) { - self.link_queued_nodes = value; - } - - pub(crate) fn latest_assignment(&self, name: &str) -> Option { - self.latest_assignments.get(name).copied() - } - - /// Queue a runtime node based on the DAG template and apply its effects. - /// - /// Use this when stepping through a compiled DAG so the runtime state mirrors - /// the template node (assignments, action results, and data-flow edges). - /// - /// Example IR: - /// - total = a + b - /// When the AssignmentNode template is queued, the execution node records - /// the symbolic BinaryOpValue and updates data-flow edges from a/b. 
- pub fn queue_template_node( - &mut self, - template_id: &str, - iteration_index: Option, - ) -> Result { - let dag = self - .dag - .as_ref() - .ok_or_else(|| RunnerStateError("runner state has no DAG template".to_string()))?; - let template = dag - .nodes - .get(template_id) - .ok_or_else(|| RunnerStateError(format!("template node not found: {template_id}")))? - .clone(); - - let node_id = Uuid::new_v4(); - let node = ExecutionNode { - node_id, - node_type: template.node_type().to_string(), - label: template.label(), - status: NodeStatus::Queued, - template_id: Some(template_id.to_string()), - targets: self.node_targets(&template), - action: if let DAGNode::ActionCall(action_node) = &template { - Some(self.action_spec_from_node(action_node)) - } else { - None - }, - value_expr: None, - assignments: HashMap::new(), - action_attempt: if matches!(template, DAGNode::ActionCall(_)) { - 1 - } else { - 0 - }, - started_at: None, - completed_at: None, - scheduled_at: None, - }; - - self.register_node(node.clone())?; - self.apply_template_node(&node, &template, iteration_index)?; - Ok(node) - } - - /// Create a runtime node directly without a DAG template. - /// - /// Use this for ad-hoc nodes (tests, synthetic steps) and as a common - /// builder for higher-level queue helpers like queue_action. 
- /// - /// Example: - /// - queue_node(node_type="assignment", label="results = []") - pub fn queue_node( - &mut self, - node_type: &str, - label: &str, - params: QueueNodeParams, - ) -> Result { - let node_type_enum = ExecutionNodeType::try_from(node_type)?; - let QueueNodeParams { - node_id, - template_id, - targets, - action, - value_expr, - scheduled_at, - } = params; - let node_id = node_id.unwrap_or_else(Uuid::new_v4); - let action_attempt = if matches!(node_type_enum, ExecutionNodeType::ActionCall) { - 1 - } else { - 0 - }; - let node = ExecutionNode { - node_id, - node_type: node_type.to_string(), - label: label.to_string(), - status: NodeStatus::Queued, - template_id, - targets: targets.unwrap_or_default(), - action, - value_expr, - assignments: HashMap::new(), - action_attempt, - started_at: None, - completed_at: None, - scheduled_at, - }; - self.register_node(node.clone())?; - Ok(node) - } - - /// Queue an action call from IR, respecting a local scope for loop vars. - /// - /// Use this during IR -> runner-state conversion (including spreads) so - /// action arguments are converted to symbolic expressions. - /// - /// Example IR: - /// - @double(value=item) - /// With local_scope={"item": LiteralValue(2)}, the queued action uses a - /// literal argument and links data-flow to the literal's source nodes. 
- pub fn queue_action_call( - &mut self, - action: &ir::ActionCall, - targets: Option>, - iteration_index: Option, - local_scope: Option<&HashMap>, - ) -> Result { - let spec = self.action_spec_from_ir(action, local_scope); - let node = self.queue_node( - ExecutionNodeType::ActionCall.as_str(), - &format!("@{}()", spec.action_name), - QueueNodeParams { - targets: targets.clone(), - action: Some(spec.clone()), - ..QueueNodeParams::default() - }, - )?; - for value in spec.kwargs.values() { - self.record_data_flow_from_value(node.node_id, value); - } - let result = self.assign_action_results( - &node, - &spec.action_name, - targets.as_deref(), - iteration_index, - true, - )?; - if let Some(node_mut) = self.nodes.get_mut(&node.node_id) { - node_mut.value_expr = Some(ValueExpr::ActionResult(result.clone())); - } - Ok(result) - } - - pub fn mark_running(&mut self, node_id: Uuid) -> Result<(), RunnerStateError> { - let is_action = { - let node = self.get_node_mut(node_id)?; - node.status = NodeStatus::Running; - let is_action = node.is_action_call(); - if is_action { - node.started_at = Some(Utc::now()); - node.completed_at = None; - } - is_action - }; - self.ready_queue.retain(|id| id != &node_id); - if is_action { - self.mark_graph_dirty(); - } - Ok(()) - } - - pub fn mark_completed(&mut self, node_id: Uuid) -> Result<(), RunnerStateError> { - let is_action = { - let node = self.get_node_mut(node_id)?; - node.status = NodeStatus::Completed; - let is_action = node.is_action_call(); - if is_action { - node.completed_at = Some(Utc::now()); - } - node.scheduled_at = None; - is_action - }; - self.ready_queue.retain(|id| id != &node_id); - if is_action { - self.mark_graph_dirty(); - } - Ok(()) - } - - pub fn mark_failed(&mut self, node_id: Uuid) -> Result<(), RunnerStateError> { - let is_action = { - let node = self.get_node_mut(node_id)?; - node.status = NodeStatus::Failed; - let is_action = node.is_action_call(); - if is_action { - node.completed_at = Some(Utc::now()); - } 
- node.scheduled_at = None; - is_action - }; - self.ready_queue.retain(|id| id != &node_id); - if is_action { - self.mark_graph_dirty(); - } - Ok(()) - } - - pub fn set_node_scheduled_at( - &mut self, - node_id: Uuid, - scheduled_at: Option>, - ) -> Result<(), RunnerStateError> { - let node = self.get_node_mut(node_id)?; - node.scheduled_at = scheduled_at; - self.mark_graph_dirty(); - Ok(()) - } - - pub fn increment_action_attempt(&mut self, node_id: Uuid) -> Result<(), RunnerStateError> { - let node = self.get_node_mut(node_id)?; - if !node.is_action_call() { - return Err(RunnerStateError( - "action attempt increment requires an action_call node".to_string(), - )); - } - node.action_attempt += 1; - self.mark_graph_dirty(); - Ok(()) - } - - /// Return and clear the graph dirty bit for durable execution. - /// - /// Only action nodes and their retry parameters must be persisted; other - /// nodes are deterministic from the ground-truth DAG definition. - pub fn consume_graph_dirty_for_durable_execution(&mut self) -> bool { - let dirty = self.graph_dirty; - self.graph_dirty = false; - dirty - } - - pub fn add_edge(&mut self, source: Uuid, target: Uuid, edge_type: EdgeType) { - self.register_edge(ExecutionEdge { - source, - target, - edge_type, - }); - } - - /// Insert a node into the runtime bookkeeping and optional control flow. - /// - /// Use this for all queued nodes so the ready queue, timeline, and implicit - /// state-machine edge ordering remain consistent. - /// - /// Example: - /// - queue node A then node B with link_queued_nodes=True - /// This creates a state-machine edge A -> B automatically. 
- fn register_node(&mut self, node: ExecutionNode) -> Result<(), RunnerStateError> { - if self.nodes.contains_key(&node.node_id) { - return Err(RunnerStateError(format!( - "execution node already queued: {}", - node.node_id - ))); - } - self.nodes.insert(node.node_id, node.clone()); - self.ready_queue.push(node.node_id); - if node.is_action_call() { - self.mark_graph_dirty(); - } - if self.link_queued_nodes - && let Some(last) = self.timeline.last() - { - self.register_edge(ExecutionEdge { - source: *last, - target: node.node_id, - edge_type: EdgeType::StateMachine, - }); - } - self.timeline.push(node.node_id); - Ok(()) - } - - fn register_edge(&mut self, edge: ExecutionEdge) { - self.edges.insert(edge); - } - - fn mark_graph_dirty(&mut self) { - self.graph_dirty = true; - } - - /// Rebuild derived structures from persisted nodes and edges. - /// - /// Use this when loading a snapshot so timeline ordering, latest assignment - /// tracking, and ready queue reflect the current node set. - /// - /// Example: - /// - Given nodes {A, B} and edge A -> B, rehydration restores timeline - /// [A, B] and marks the latest assignment targets from node B. 
- fn rehydrate_state(&mut self) { - self.timeline = self.build_timeline(); - self.latest_assignments.clear(); - for node_id in &self.timeline { - if let Some(node) = self.nodes.get(node_id) { - for target in node.assignments.keys() { - self.latest_assignments.insert(target.clone(), *node_id); - } - } - } - if self.ready_queue.is_empty() { - self.ready_queue = self - .timeline - .iter() - .filter(|node_id| { - self.nodes - .get(node_id) - .map(|node| node.status == NodeStatus::Queued) - .unwrap_or(false) - }) - .cloned() - .collect(); - } - } - - fn build_timeline(&self) -> Vec { - if self.edges.is_empty() { - return self.nodes.keys().cloned().collect(); - } - let mut adjacency: HashMap> = self - .nodes - .keys() - .map(|node_id| (*node_id, Vec::new())) - .collect(); - let mut in_degree: HashMap = - self.nodes.keys().map(|node_id| (*node_id, 0)).collect(); - let mut edges: Vec<&ExecutionEdge> = self.edges.iter().collect(); - edges.sort_by_key(|edge| (edge.source, edge.target)); - for edge in edges { - if edge.edge_type != EdgeType::StateMachine { - continue; - } - if adjacency.contains_key(&edge.source) && adjacency.contains_key(&edge.target) { - adjacency.entry(edge.source).or_default().push(edge.target); - *in_degree.entry(edge.target).or_insert(0) += 1; - } - } - let mut queue: Vec = in_degree - .iter() - .filter(|(_, degree)| **degree == 0) - .map(|(node_id, _)| *node_id) - .collect(); - queue.sort_by_key(|id| id.to_string()); - let mut order: Vec = Vec::new(); - while !queue.is_empty() { - let node_id = queue.remove(0); - order.push(node_id); - if let Some(neighbors) = adjacency.get(&node_id) { - let mut sorted = neighbors.clone(); - sorted.sort_by_key(|id| id.to_string()); - for neighbor in sorted { - if let Some(degree) = in_degree.get_mut(&neighbor) { - *degree -= 1; - if *degree == 0 { - queue.push(neighbor); - } - } - } - queue.sort_by_key(|id| id.to_string()); - } - } - let mut remaining: Vec = self - .nodes - .keys() - .filter(|node_id| 
!order.contains(node_id)) - .cloned() - .collect(); - remaining.sort_by_key(|id| id.to_string()); - order.extend(remaining); - order - } - - fn get_node_mut(&mut self, node_id: Uuid) -> Result<&mut ExecutionNode, RunnerStateError> { - self.nodes - .get_mut(&node_id) - .ok_or_else(|| RunnerStateError(format!("execution node not found: {node_id}"))) - } - - fn node_targets(&self, node: &DAGNode) -> Vec { - match node { - DAGNode::Assignment(AssignmentNode { - targets, target, .. - }) => { - if !targets.is_empty() { - return targets.clone(); - } - target.clone().map(|item| vec![item]).unwrap_or_default() - } - DAGNode::ActionCall(ActionCallNode { - targets, target, .. - }) => { - if let Some(list) = targets - && !list.is_empty() - { - return list.clone(); - } - target.clone().map(|item| vec![item]).unwrap_or_default() - } - DAGNode::FnCall(FnCallNode { - targets, target, .. - }) => { - if let Some(list) = targets - && !list.is_empty() - { - return list.clone(); - } - target.clone().map(|item| vec![item]).unwrap_or_default() - } - DAGNode::Join(JoinNode { - targets, target, .. - }) => { - if let Some(list) = targets - && !list.is_empty() - { - return list.clone(); - } - target.clone().map(|item| vec![item]).unwrap_or_default() - } - DAGNode::Aggregator(AggregatorNode { - targets, target, .. - }) => { - if let Some(list) = targets - && !list.is_empty() - { - return list.clone(); - } - target.clone().map(|item| vec![item]).unwrap_or_default() - } - DAGNode::Return(ReturnNode { - targets, target, .. - }) => { - if let Some(list) = targets - && !list.is_empty() - { - return list.clone(); - } - target.clone().map(|item| vec![item]).unwrap_or_default() - } - _ => Vec::new(), - } - } - - /// Apply DAG template semantics to a queued execution node. - /// - /// Use this right after queue_template_node so assignments, action result - /// references, and data-flow edges are populated from the template. 
- /// - /// Example IR: - /// - total = @sum(values=items) - /// The ActionCallNode template produces an ActionResultValue and defines - /// total via assignments on the execution node. - fn apply_template_node( - &mut self, - exec_node: &ExecutionNode, - template: &DAGNode, - iteration_index: Option, - ) -> Result<(), RunnerStateError> { - match template { - DAGNode::Assignment(AssignmentNode { - assign_expr: Some(expr), - .. - }) => { - let value_expr = self.expr_to_value(expr, None)?; - if let Some(node_mut) = self.nodes.get_mut(&exec_node.node_id) { - node_mut.value_expr = Some(value_expr.clone()); - } - self.record_data_flow_from_value(exec_node.node_id, &value_expr); - let assignments = - self.build_assignments(&self.node_targets(template), &value_expr)?; - if let Some(node) = self.nodes.get_mut(&exec_node.node_id) { - node.assignments.extend(assignments.clone()); - } - self.mark_latest_assignments(exec_node.node_id, &assignments); - return Ok(()); - } - DAGNode::ActionCall(ActionCallNode { - action_name, - targets, - target, - .. - }) => { - let kwarg_values = self - .nodes - .get(&exec_node.node_id) - .and_then(|node| node.action.as_ref()) - .map(|action| action.kwargs.values().cloned().collect::>()) - .unwrap_or_default(); - for expr in &kwarg_values { - self.record_data_flow_from_value(exec_node.node_id, expr); - } - let targets = targets - .clone() - .or_else(|| target.clone().map(|item| vec![item])); - let result = self.assign_action_results( - exec_node, - action_name, - targets.as_deref(), - iteration_index, - true, - )?; - if let Some(node_mut) = self.nodes.get_mut(&exec_node.node_id) { - node_mut.value_expr = Some(ValueExpr::ActionResult(result)); - } - return Ok(()); - } - DAGNode::Sleep(SleepNode { - duration_expr: Some(expr), - .. 
- }) => { - let value_expr = self.expr_to_value(expr, None)?; - if let Some(node_mut) = self.nodes.get_mut(&exec_node.node_id) { - node_mut.value_expr = Some(value_expr.clone()); - } - self.record_data_flow_from_value(exec_node.node_id, &value_expr); - return Ok(()); - } - DAGNode::FnCall(FnCallNode { - assign_expr: Some(expr), - .. - }) => { - let value_expr = self.expr_to_value(expr, None)?; - if let Some(node_mut) = self.nodes.get_mut(&exec_node.node_id) { - node_mut.value_expr = Some(value_expr.clone()); - } - self.record_data_flow_from_value(exec_node.node_id, &value_expr); - let assignments = - self.build_assignments(&self.node_targets(template), &value_expr)?; - if let Some(node) = self.nodes.get_mut(&exec_node.node_id) { - node.assignments.extend(assignments.clone()); - } - self.mark_latest_assignments(exec_node.node_id, &assignments); - return Ok(()); - } - DAGNode::Return(ReturnNode { - assign_expr: Some(expr), - target, - .. - }) => { - let value_expr = self.expr_to_value(expr, None)?; - if let Some(node_mut) = self.nodes.get_mut(&exec_node.node_id) { - node_mut.value_expr = Some(value_expr.clone()); - } - self.record_data_flow_from_value(exec_node.node_id, &value_expr); - let target = target.clone().unwrap_or_else(|| "result".to_string()); - let assignments = self.build_assignments(&[target], &value_expr)?; - if let Some(node) = self.nodes.get_mut(&exec_node.node_id) { - node.assignments.extend(assignments.clone()); - } - self.mark_latest_assignments(exec_node.node_id, &assignments); - return Ok(()); - } - _ => {} - } - Ok(()) - } - - /// Create symbolic action results and map them to targets. - /// - /// Use this when an action produces one or more results that are assigned - /// to variables (including tuple unpacking). - /// - /// `update_latest` controls whether assigned targets are published into - /// `latest_assignments` for downstream variable/data-flow resolution. 
- /// - /// Use `update_latest = true` for user-visible assignments so later nodes - /// can resolve those target names through `latest_assignments`. - /// - /// Use `update_latest = false` for internal/synthetic bindings that should - /// not become globally visible variable definitions. Example: spread action - /// unroll nodes can bind an internal `_spread_result`, and the aggregator - /// later publishes the final user target. - /// - /// Example IR: - /// - a, b = @pair() - /// This yields ActionResultValue(node_id, result_index=0/1) for a and b. - pub(crate) fn assign_action_results( - &mut self, - node: &ExecutionNode, - action_name: &str, - targets: Option<&[String]>, - iteration_index: Option, - update_latest: bool, - ) -> Result { - let result_ref = ActionResultValue { - node_id: node.node_id, - action_name: action_name.to_string(), - iteration_index, - result_index: None, - }; - let targets = targets.unwrap_or(&[]); - let assignments = - self.build_assignments(targets, &ValueExpr::ActionResult(result_ref.clone()))?; - if !assignments.is_empty() { - if let Some(node) = self.nodes.get_mut(&node.node_id) { - node.assignments.extend(assignments.clone()); - } - if update_latest { - self.mark_latest_assignments(node.node_id, &assignments); - } - } - Ok(result_ref) - } - - /// Expand an assignment into per-target symbolic values. - /// - /// Use this for single-target assignments, tuple unpacking, and action - /// multi-result binding to keep definitions explicit. - /// - /// Example IR: - /// - a, b = [1, 2] - /// Produces {"a": LiteralValue(1), "b": LiteralValue(2)}. 
- fn build_assignments( - &self, - targets: &[String], - value: &ValueExpr, - ) -> Result, RunnerStateError> { - if targets.is_empty() { - return Ok(HashMap::new()); - } - if targets.len() == 1 { - let mut map = HashMap::new(); - // Keep single-target assignments symbolic to avoid recursively - // embedding prior values into each update (which can explode - // persisted runner_instances.state size/depth in loops). - map.insert(targets[0].clone(), value.clone()); - return Ok(map); - } - let value = self.materialize_value(value.clone()); - - match value { - ValueExpr::List(ListValue { elements }) => { - if elements.len() != targets.len() { - return Err(RunnerStateError("tuple unpacking mismatch".to_string())); - } - let mut map = HashMap::new(); - for (target, item) in targets.iter().zip(elements.into_iter()) { - map.insert(target.clone(), item); - } - Ok(map) - } - ValueExpr::ActionResult(action_value) => { - let mut map = HashMap::new(); - for (idx, target) in targets.iter().enumerate() { - map.insert( - target.clone(), - ValueExpr::ActionResult(ActionResultValue { - node_id: action_value.node_id, - action_name: action_value.action_name.clone(), - iteration_index: action_value.iteration_index, - result_index: Some(idx as i32), - }), - ); - } - Ok(map) - } - ValueExpr::FunctionCall(func_value) => { - let mut map = HashMap::new(); - for (idx, target) in targets.iter().enumerate() { - map.insert( - target.clone(), - ValueExpr::Index(IndexValue { - object: Box::new(ValueExpr::FunctionCall(func_value.clone())), - index: Box::new(ValueExpr::Literal(LiteralValue { - value: serde_json::Value::Number((idx as i64).into()), - })), - }), - ); - } - Ok(map) - } - ValueExpr::Index(index_value) => { - let mut map = HashMap::new(); - for (idx, target) in targets.iter().enumerate() { - map.insert( - target.clone(), - ValueExpr::Index(IndexValue { - object: Box::new(ValueExpr::Index(index_value.clone())), - index: Box::new(ValueExpr::Literal(LiteralValue { - value: 
serde_json::Value::Number((idx as i64).into()), - })), - }), - ); - } - Ok(map) - } - _ => Err(RunnerStateError("tuple unpacking mismatch".to_string())), - } - } - - /// Inline variable references and apply light constant folding. - /// - /// Use this before storing assignments so values are self-contained and - /// list concatenations are simplified. - /// - /// Example IR: - /// - xs = [1] - /// - ys = xs + [2] - /// Materialization turns ys into ListValue([1, 2]) rather than keeping xs. - pub(crate) fn materialize_value(&self, value: ValueExpr) -> ValueExpr { - let resolved = resolve_value_tree(&value, &|name, seen| { - self.resolve_variable_value(name, seen) - }); - if let ValueExpr::BinaryOp(BinaryOpValue { left, op, right }) = &resolved - && ir::BinaryOperator::try_from(*op).ok() == Some(ir::BinaryOperator::BinaryOpAdd) - && let (ValueExpr::List(left_list), ValueExpr::List(right_list)) = (&**left, &**right) - { - let mut elements = left_list.elements.clone(); - elements.extend(right_list.elements.clone()); - return ValueExpr::List(ListValue { elements }); - } - resolved - } - - /// Resolve a variable name to its latest symbolic definition. - /// - /// Use this when materializing expressions so variables become their - /// defining expression while guarding against cycles. - /// - /// Example IR: - /// - x = 1 - /// - y = x + 2 - /// When materializing y, the VariableValue("x") is replaced with the - /// LiteralValue(1), yielding a BinaryOpValue(1 + 2) instead of a reference - /// to x. This makes downstream replay use the symbolic expression rather - /// than requiring a separate variable lookup. 
- fn resolve_variable_value(&self, name: &str, seen: &mut HashSet) -> ValueExpr { - if seen.contains(name) { - return ValueExpr::Variable(VariableValue { - name: name.to_string(), - }); - } - let node_id = match self.latest_assignments.get(name) { - Some(node_id) => *node_id, - None => { - return ValueExpr::Variable(VariableValue { - name: name.to_string(), - }); - } - }; - let node = match self.nodes.get(&node_id) { - Some(node) => node, - None => { - return ValueExpr::Variable(VariableValue { - name: name.to_string(), - }); - } - }; - let assigned = match node.assignments.get(name) { - Some(value) => value.clone(), - None => { - return ValueExpr::Variable(VariableValue { - name: name.to_string(), - }); - } - }; - // Avoid inlining self-referential updates such as `i = i + 1`. - // Returning the raw assignment here would inject one "extra step" - // into materialized consumers (e.g. loop guards), causing off-by-one - // behavior and deep recursive expression trees. - if value_expr_contains_variable(&assigned, name) { - return ValueExpr::Variable(VariableValue { - name: name.to_string(), - }); - } - if let ValueExpr::Variable(var) = &assigned { - seen.insert(name.to_string()); - return self.resolve_variable_value(&var.name, seen); - } - assigned - } - - pub(crate) fn mark_latest_assignments( - &mut self, - node_id: Uuid, - assignments: &HashMap, - ) { - for target in assignments.keys() { - self.latest_assignments.insert(target.clone(), node_id); - } - } - - /// Add data-flow edges implied by a value expression. - /// - /// Use this when a node consumes an expression so upstream dependencies are - /// encoded in the runtime graph. - /// - /// Example IR: - /// - total = @sum(values) - /// A data-flow edge is added from the values assignment node to the action. 
- pub(crate) fn record_data_flow_from_value(&mut self, node_id: Uuid, value: &ValueExpr) { - let source_ids = - collect_value_sources(value, &|name| self.latest_assignments.get(name).copied()); - self.record_data_flow_edges(node_id, &source_ids); - } - - /// Register data-flow edges from sources to the given node. - /// - /// Example: - /// - sources {A, B} and node C produce edges A -> C and B -> C. - fn record_data_flow_edges(&mut self, node_id: Uuid, source_ids: &HashSet) { - for source_id in source_ids { - if *source_id == node_id { - continue; - } - self.register_edge(ExecutionEdge { - source: *source_id, - target: node_id, - edge_type: EdgeType::DataFlow, - }); - } - } - - /// Convert an IR expression into a symbolic ValueExpr tree. - /// - /// Use this when interpreting IR statements or DAG templates into the - /// runtime state; it queues actions and spreads as needed. - /// - /// Example IR: - /// - total = base + 1 - /// Produces BinaryOpValue(VariableValue("base"), LiteralValue(1)). 
- pub fn expr_to_value( - &mut self, - expr: &ir::Expr, - local_scope: Option<&HashMap>, - ) -> Result { - match expr.kind.as_ref() { - Some(ir::expr::Kind::Literal(lit)) => Ok(ValueExpr::Literal(LiteralValue { - value: literal_value(lit), - })), - Some(ir::expr::Kind::Variable(var)) => { - if let Some(scope) = local_scope - && let Some(value) = scope.get(&var.name) - { - return Ok(value.clone()); - } - Ok(ValueExpr::Variable(VariableValue { - name: var.name.clone(), - })) - } - Some(ir::expr::Kind::BinaryOp(op)) => { - let left = op - .left - .as_ref() - .ok_or_else(|| RunnerStateError("binary op missing left".to_string()))?; - let right = op - .right - .as_ref() - .ok_or_else(|| RunnerStateError("binary op missing right".to_string()))?; - let left_value = self.expr_to_value(left, local_scope)?; - let right_value = self.expr_to_value(right, local_scope)?; - Ok(self.binary_op_value(op.op, left_value, right_value)) - } - Some(ir::expr::Kind::UnaryOp(op)) => { - let operand = op - .operand - .as_ref() - .ok_or_else(|| RunnerStateError("unary op missing operand".to_string()))?; - let operand_value = self.expr_to_value(operand, local_scope)?; - Ok(self.unary_op_value(op.op, operand_value)) - } - Some(ir::expr::Kind::List(list)) => { - let elements = list - .elements - .iter() - .map(|item| self.expr_to_value(item, local_scope)) - .collect::, RunnerStateError>>()?; - Ok(ValueExpr::List(ListValue { elements })) - } - Some(ir::expr::Kind::Dict(dict_expr)) => { - let mut entries = Vec::new(); - for entry in &dict_expr.entries { - let key_expr = entry - .key - .as_ref() - .ok_or_else(|| RunnerStateError("dict entry missing key".to_string()))?; - let value_expr = entry - .value - .as_ref() - .ok_or_else(|| RunnerStateError("dict entry missing value".to_string()))?; - entries.push(DictEntryValue { - key: self.expr_to_value(key_expr, local_scope)?, - value: self.expr_to_value(value_expr, local_scope)?, - }); - } - Ok(ValueExpr::Dict(DictValue { entries })) - } - 
Some(ir::expr::Kind::Index(index)) => { - let object = index - .object - .as_ref() - .ok_or_else(|| RunnerStateError("index access missing object".to_string()))?; - let index_expr = index - .index - .as_ref() - .ok_or_else(|| RunnerStateError("index access missing index".to_string()))?; - let object_value = self.expr_to_value(object, local_scope)?; - let index_value = self.expr_to_value(index_expr, local_scope)?; - Ok(self.index_value(object_value, index_value)) - } - Some(ir::expr::Kind::Dot(dot)) => { - let object = dot - .object - .as_ref() - .ok_or_else(|| RunnerStateError("dot access missing object".to_string()))?; - Ok(ValueExpr::Dot(DotValue { - object: Box::new(self.expr_to_value(object, local_scope)?), - attribute: dot.attribute.clone(), - })) - } - Some(ir::expr::Kind::FunctionCall(call)) => { - let args = call - .args - .iter() - .map(|arg| self.expr_to_value(arg, local_scope)) - .collect::, RunnerStateError>>()?; - let mut kwargs = HashMap::new(); - for kw in &call.kwargs { - if let Some(value) = &kw.value { - kwargs.insert(kw.name.clone(), self.expr_to_value(value, local_scope)?); - } - } - let global_fn = if call.global_function != 0 { - Some(call.global_function) - } else { - None - }; - Ok(ValueExpr::FunctionCall(FunctionCallValue { - name: call.name.clone(), - args, - kwargs, - global_function: global_fn, - })) - } - Some(ir::expr::Kind::ActionCall(action)) => { - let result = self.queue_action_call(action, None, None, local_scope)?; - Ok(ValueExpr::ActionResult(result)) - } - Some(ir::expr::Kind::ParallelExpr(parallel)) => { - let mut calls = Vec::new(); - for call in ¶llel.calls { - calls.push(self.call_to_value(call, local_scope)?); - } - Ok(ValueExpr::List(ListValue { elements: calls })) - } - Some(ir::expr::Kind::SpreadExpr(spread)) => self.spread_expr_value(spread, local_scope), - None => Ok(ValueExpr::Literal(LiteralValue { - value: serde_json::Value::Null, - })), - } - } - - /// Convert an IR call (action/function) into a ValueExpr. 
- /// - /// Use this for parallel expressions that contain mixed call types. - /// - /// Example IR: - /// - parallel { @double(x), helper(x) } - /// Action calls become ActionResultValue nodes; function calls become - /// FunctionCallValue expressions. - fn call_to_value( - &mut self, - call: &ir::Call, - local_scope: Option<&HashMap>, - ) -> Result { - match call.kind.as_ref() { - Some(ir::call::Kind::Action(action)) => Ok(ValueExpr::ActionResult( - self.queue_action_call(action, None, None, local_scope)?, - )), - Some(ir::call::Kind::Function(function)) => self.expr_to_value( - &ir::Expr { - kind: Some(ir::expr::Kind::FunctionCall(function.clone())), - span: None, - }, - local_scope, - ), - None => Ok(ValueExpr::Literal(LiteralValue { - value: serde_json::Value::Null, - })), - } - } - - /// Materialize a spread expression into concrete calls or a symbolic spread. - /// - /// Use this when converting IR spreads so known list collections unroll to - /// explicit action calls, while unknown collections stay symbolic. - /// - /// Example IR: - /// - spread [1, 2]:item -> @double(value=item) - /// Produces a ListValue of ActionResultValue entries for each item. 
- fn spread_expr_value( - &mut self, - spread: &ir::SpreadExpr, - local_scope: Option<&HashMap>, - ) -> Result { - let collection = self.expr_to_value( - spread - .collection - .as_ref() - .ok_or_else(|| RunnerStateError("spread collection missing".to_string()))?, - local_scope, - )?; - if let ValueExpr::List(list) = &collection { - let mut results = Vec::new(); - for (idx, item) in list.elements.iter().enumerate() { - let mut scope = HashMap::new(); - scope.insert(spread.loop_var.clone(), item.clone()); - let result = self.queue_action_call( - spread - .action - .as_ref() - .ok_or_else(|| RunnerStateError("spread action missing".to_string()))?, - None, - Some(idx as i32), - Some(&scope), - )?; - results.push(ValueExpr::ActionResult(result)); - } - return Ok(ValueExpr::List(ListValue { elements: results })); - } - - let action_spec = self.action_spec_from_ir( - spread - .action - .as_ref() - .ok_or_else(|| RunnerStateError("spread action missing".to_string()))?, - None, - ); - Ok(ValueExpr::Spread(SpreadValue { - collection: Box::new(collection), - loop_var: spread.loop_var.clone(), - action: action_spec, - })) - } - - /// Build a binary-op value with simple constant folding. - /// - /// Use this when converting IR so literals and list concatenations are - /// simplified early. - /// - /// Example IR: - /// - total = 1 + 2 - /// Produces LiteralValue(3) instead of a BinaryOpValue. 
- fn binary_op_value(&self, op: i32, left: ValueExpr, right: ValueExpr) -> ValueExpr { - if ir::BinaryOperator::try_from(op).ok() == Some(ir::BinaryOperator::BinaryOpAdd) - && let (ValueExpr::List(left_list), ValueExpr::List(right_list)) = (&left, &right) - { - let mut elements = left_list.elements.clone(); - elements.extend(right_list.elements.clone()); - return ValueExpr::List(ListValue { elements }); - } - if let (ValueExpr::Literal(left_val), ValueExpr::Literal(right_val)) = (&left, &right) - && let Some(folded) = fold_literal_binary(op, &left_val.value, &right_val.value) - { - return ValueExpr::Literal(LiteralValue { value: folded }); - } - ValueExpr::BinaryOp(BinaryOpValue { - left: Box::new(left), - op, - right: Box::new(right), - }) - } - - /// Build a unary-op value with constant folding for literals. - /// - /// Example IR: - /// - neg = -1 - /// Produces LiteralValue(-1) instead of UnaryOpValue. - fn unary_op_value(&self, op: i32, operand: ValueExpr) -> ValueExpr { - if let ValueExpr::Literal(lit) = &operand - && let Some(folded) = fold_literal_unary(op, &lit.value) - { - return ValueExpr::Literal(LiteralValue { value: folded }); - } - ValueExpr::UnaryOp(UnaryOpValue { - op, - operand: Box::new(operand), - }) - } - - /// Build an index value, folding list literals when possible. - /// - /// Example IR: - /// - first = [10, 20][0] - /// Produces LiteralValue(10) when the list is fully literal. - fn index_value(&self, object: ValueExpr, index: ValueExpr) -> ValueExpr { - if let (ValueExpr::List(list), ValueExpr::Literal(idx)) = (&object, &index) - && let Some(idx) = idx.value.as_i64() - && idx >= 0 - && (idx as usize) < list.elements.len() - { - return list.elements[idx as usize].clone(); - } - ValueExpr::Index(IndexValue { - object: Box::new(object), - index: Box::new(index), - }) - } - - /// Extract an action call spec from a DAG node. - /// - /// Use this when queueing nodes from the DAG template. 
- /// - /// Example: - /// - ActionCallNode(action_name="double", kwargs={"value": "$x"}) - /// Produces ActionCallSpec(action_name="double", kwargs={"value": VariableValue("x")}). - fn action_spec_from_node(&mut self, node: &ActionCallNode) -> ActionCallSpec { - let kwargs = node - .kwarg_exprs - .iter() - .map(|(name, expr)| (name.clone(), self.expr_to_value(expr, None).unwrap())) - .collect(); - ActionCallSpec { - action_name: node.action_name.clone(), - module_name: node.module_name.clone(), - kwargs, - } - } - - /// Extract an action call spec from IR, applying local scope bindings. - /// - /// Example IR: - /// - @double(value=item) with local_scope["item"]=LiteralValue(2) - /// Produces kwargs {"value": LiteralValue(2)}. - fn action_spec_from_ir( - &mut self, - action: &ir::ActionCall, - local_scope: Option<&HashMap>, - ) -> ActionCallSpec { - let kwargs = action - .kwargs - .iter() - .filter_map(|kw| kw.value.as_ref().map(|value| (kw.name.clone(), value))) - .map(|(name, value)| (name, self.expr_to_value(value, local_scope).unwrap())) - .collect(); - ActionCallSpec { - action_name: action.action_name.clone(), - module_name: action.module_name.clone(), - kwargs, - } - } - - /// Queue an action call from raw parameters and return a symbolic result. - /// - /// Use this when constructing runner state programmatically without IR - /// objects, while still wiring data-flow edges and assignments. - /// - /// Example: - /// - queue_action("double", targets=["out"], kwargs={"value": LiteralValue(2)}) - /// Defines out via an ActionResultValue and records data-flow from the literal. 
- pub fn queue_action( - &mut self, - action_name: &str, - targets: Option>, - kwargs: Option>, - module_name: Option, - iteration_index: Option, - ) -> Result { - let spec = ActionCallSpec { - action_name: action_name.to_string(), - module_name, - kwargs: kwargs.unwrap_or_default(), - }; - let node = self.queue_node( - ExecutionNodeType::ActionCall.as_str(), - &format!("@{}()", spec.action_name), - QueueNodeParams { - targets: targets.clone(), - action: Some(spec.clone()), - ..QueueNodeParams::default() - }, - )?; - for value in spec.kwargs.values() { - self.record_data_flow_from_value(node.node_id, value); - } - let result = self.assign_action_results( - &node, - &spec.action_name, - targets.as_deref(), - iteration_index, - true, - )?; - if let Some(node) = self.nodes.get_mut(&node.node_id) { - node.value_expr = Some(ValueExpr::ActionResult(result.clone())); - } - Ok(result) - } - - /// Record an IR assignment as a runtime node with symbolic values. - /// - /// Use this when interpreting IR statements into the unrolled runtime graph. - /// - /// Example IR: - /// - results = [] - /// Produces an assignment node with targets ["results"] and a ListValue([]). - pub fn record_assignment( - &mut self, - targets: Vec, - expr: &ir::Expr, - node_id: Option, - label: Option, - ) -> Result { - let value_expr = self.expr_to_value(expr, None)?; - self.record_assignment_value(targets, value_expr, node_id, label) - } - - /// Record a symbolic assignment node and update data-flow/definitions. - /// - /// Use this for assignments created programmatically after ValueExpr - /// construction (tests or state rewrites). - /// - /// Example: - /// - record_assignment_value(targets=["x"], value_expr=LiteralValue(1)) - /// Creates an assignment node with x bound to LiteralValue(1). 
- pub fn record_assignment_value( - &mut self, - targets: Vec, - value_expr: ValueExpr, - node_id: Option, - label: Option, - ) -> Result { - let exec_node_id = node_id.unwrap_or_else(Uuid::new_v4); - let node = self.queue_node( - "assignment", - label.as_deref().unwrap_or("assignment"), - QueueNodeParams { - node_id: Some(exec_node_id), - targets: Some(targets.clone()), - value_expr: Some(value_expr.clone()), - ..QueueNodeParams::default() - }, - )?; - self.record_data_flow_from_value(exec_node_id, &value_expr); - let assignments = self.build_assignments(&targets, &value_expr)?; - if let Some(node_mut) = self.nodes.get_mut(&node.node_id) { - node_mut.assignments.extend(assignments.clone()); - } - self.mark_latest_assignments(node.node_id, &assignments); - Ok(node) - } -} - -/// Render a ValueExpr to a python-like string for debugging/visualization. -/// -/// Example: -/// - BinaryOpValue(VariableValue("a"), +, LiteralValue(1)) -> "a + 1" -pub fn format_value(expr: &ValueExpr) -> String { - format_value_inner(expr, 0) -} - -/// Recursive ValueExpr formatter with operator precedence handling. -/// -/// Example: -/// - (a + b) * c renders with parentheses when needed. 
-fn format_value_inner(expr: &ValueExpr, parent_prec: i32) -> String { - match expr { - ValueExpr::Literal(lit) => format_literal(&lit.value), - ValueExpr::Variable(var) => var.name.clone(), - ValueExpr::ActionResult(value) => value.label(), - ValueExpr::BinaryOp(value) => { - let (op_str, prec) = binary_operator(value.op); - let left = format_value_inner(&value.left, prec); - let right = format_value_inner(&value.right, prec + 1); - let rendered = format!("{left} {op_str} {right}"); - if prec < parent_prec { - format!("({rendered})") - } else { - rendered - } - } - ValueExpr::UnaryOp(value) => { - let (op_str, prec) = unary_operator(value.op); - let operand = format_value_inner(&value.operand, prec); - let rendered = format!("{op_str}{operand}"); - if prec < parent_prec { - format!("({rendered})") - } else { - rendered - } - } - ValueExpr::List(value) => { - let items: Vec = value - .elements - .iter() - .map(|item| format_value_inner(item, 0)) - .collect(); - format!("[{}]", items.join(", ")) - } - ValueExpr::Dict(value) => { - let entries: Vec = value - .entries - .iter() - .map(|entry| { - format!( - "{}: {}", - format_value_inner(&entry.key, 0), - format_value_inner(&entry.value, 0) - ) - }) - .collect(); - format!("{{{}}}", entries.join(", ")) - } - ValueExpr::Index(value) => { - let prec = precedence("index"); - let obj = format_value_inner(&value.object, prec); - let idx = format_value_inner(&value.index, 0); - let rendered = format!("{obj}[{idx}]"); - if prec < parent_prec { - format!("({rendered})") - } else { - rendered - } - } - ValueExpr::Dot(value) => { - let prec = precedence("dot"); - let obj = format_value_inner(&value.object, prec); - let rendered = format!("{obj}.{}", value.attribute); - if prec < parent_prec { - format!("({rendered})") - } else { - rendered - } - } - ValueExpr::FunctionCall(value) => { - let mut args: Vec = value - .args - .iter() - .map(|arg| format_value_inner(arg, 0)) - .collect(); - for (name, val) in &value.kwargs { - 
args.push(format!("{name}={}", format_value_inner(val, 0))); - } - format!("{}({})", value.name, args.join(", ")) - } - ValueExpr::Spread(value) => { - let collection = format_value_inner(&value.collection, 0); - let mut args: Vec = Vec::new(); - for (name, val) in &value.action.kwargs { - args.push(format!("{name}={}", format_value_inner(val, 0))); - } - let call = format!("@{}({})", value.action.action_name, args.join(", ")); - format!("spread {collection}:{} -> {call}", value.loop_var) - } - } -} - -fn value_expr_contains_variable(expr: &ValueExpr, name: &str) -> bool { - match expr { - ValueExpr::Variable(var) => var.name == name, - ValueExpr::BinaryOp(value) => { - value_expr_contains_variable(&value.left, name) - || value_expr_contains_variable(&value.right, name) - } - ValueExpr::UnaryOp(value) => value_expr_contains_variable(&value.operand, name), - ValueExpr::List(value) => value - .elements - .iter() - .any(|item| value_expr_contains_variable(item, name)), - ValueExpr::Dict(value) => value.entries.iter().any(|entry| { - value_expr_contains_variable(&entry.key, name) - || value_expr_contains_variable(&entry.value, name) - }), - ValueExpr::Index(value) => { - value_expr_contains_variable(&value.object, name) - || value_expr_contains_variable(&value.index, name) - } - ValueExpr::Dot(value) => value_expr_contains_variable(&value.object, name), - ValueExpr::FunctionCall(value) => { - value - .args - .iter() - .any(|arg| value_expr_contains_variable(arg, name)) - || value - .kwargs - .values() - .any(|kwarg| value_expr_contains_variable(kwarg, name)) - } - ValueExpr::Spread(value) => { - value_expr_contains_variable(&value.collection, name) - || value - .action - .kwargs - .values() - .any(|kwarg| value_expr_contains_variable(kwarg, name)) - } - ValueExpr::Literal(_) | ValueExpr::ActionResult(_) => false, - } -} - -/// Map binary operator enums to (symbol, precedence) for formatting. 
-fn binary_operator(op: i32) -> (&'static str, i32) { - match ir::BinaryOperator::try_from(op).ok() { - Some(ir::BinaryOperator::BinaryOpOr) => ("or", 10), - Some(ir::BinaryOperator::BinaryOpAnd) => ("and", 20), - Some(ir::BinaryOperator::BinaryOpEq) => ("==", 30), - Some(ir::BinaryOperator::BinaryOpNe) => ("!=", 30), - Some(ir::BinaryOperator::BinaryOpLt) => ("<", 30), - Some(ir::BinaryOperator::BinaryOpLe) => ("<=", 30), - Some(ir::BinaryOperator::BinaryOpGt) => (">", 30), - Some(ir::BinaryOperator::BinaryOpGe) => (">=", 30), - Some(ir::BinaryOperator::BinaryOpIn) => ("in", 30), - Some(ir::BinaryOperator::BinaryOpNotIn) => ("not in", 30), - Some(ir::BinaryOperator::BinaryOpAdd) => ("+", 40), - Some(ir::BinaryOperator::BinaryOpSub) => ("-", 40), - Some(ir::BinaryOperator::BinaryOpMul) => ("*", 50), - Some(ir::BinaryOperator::BinaryOpDiv) => ("/", 50), - Some(ir::BinaryOperator::BinaryOpFloorDiv) => ("//", 50), - Some(ir::BinaryOperator::BinaryOpMod) => ("%", 50), - _ => ("?", 0), - } -} - -/// Map unary operator enums to (symbol, precedence) for formatting. -fn unary_operator(op: i32) -> (&'static str, i32) { - match ir::UnaryOperator::try_from(op).ok() { - Some(ir::UnaryOperator::UnaryOpNeg) => ("-", 60), - Some(ir::UnaryOperator::UnaryOpNot) => ("not ", 60), - _ => ("?", 0), - } -} - -/// Return precedence for non-operator constructs like index/dot. -fn precedence(kind: &str) -> i32 { - match kind { - "index" | "dot" => 80, - _ => 0, - } -} - -/// Format Python literals as source-like text. -fn format_literal(value: &serde_json::Value) -> String { - match value { - serde_json::Value::Null => "None".to_string(), - serde_json::Value::Bool(value) => { - if *value { - "True".to_string() - } else { - "False".to_string() - } - } - serde_json::Value::String(value) => { - serde_json::to_string(value).unwrap_or_else(|_| format!("\"{value}\"")) - } - _ => value.to_string(), - } -} - -/// Convert an IR literal into a Python value. 
-/// -/// Example IR: -/// - Literal(int_value=3) -> 3 -pub(crate) fn literal_value(lit: &ir::Literal) -> serde_json::Value { - match lit.value.as_ref() { - Some(ir::literal::Value::IntValue(value)) => serde_json::Value::Number((*value).into()), - Some(ir::literal::Value::FloatValue(value)) => serde_json::Number::from_f64(*value) - .map(serde_json::Value::Number) - .unwrap_or(serde_json::Value::Null), - Some(ir::literal::Value::StringValue(value)) => serde_json::Value::String(value.clone()), - Some(ir::literal::Value::BoolValue(value)) => serde_json::Value::Bool(*value), - Some(ir::literal::Value::IsNone(_)) => serde_json::Value::Null, - None => serde_json::Value::Null, - } -} - -/// Try to fold a literal binary operation to a concrete value. -/// -/// Example: -/// - (1, 2, BINARY_OP_ADD) -> 3 -fn fold_literal_binary( - op: i32, - left: &serde_json::Value, - right: &serde_json::Value, -) -> Option { - match ir::BinaryOperator::try_from(op).ok() { - Some(ir::BinaryOperator::BinaryOpAdd) => { - if let (Some(left), Some(right)) = (left.as_i64(), right.as_i64()) { - return Some(serde_json::Value::Number((left + right).into())); - } - if let (Some(left), Some(right)) = (left.as_f64(), right.as_f64()) { - return serde_json::Number::from_f64(left + right).map(serde_json::Value::Number); - } - if let (Some(left), Some(right)) = (left.as_str(), right.as_str()) { - return Some(serde_json::Value::String(format!("{left}{right}"))); - } - None - } - Some(ir::BinaryOperator::BinaryOpSub) => { - if let (Some(left), Some(right)) = (left.as_f64(), right.as_f64()) { - return serde_json::Number::from_f64(left - right).map(serde_json::Value::Number); - } - None - } - Some(ir::BinaryOperator::BinaryOpMul) => { - if let (Some(left), Some(right)) = (left.as_f64(), right.as_f64()) { - return serde_json::Number::from_f64(left * right).map(serde_json::Value::Number); - } - None - } - Some(ir::BinaryOperator::BinaryOpDiv) => { - if let (Some(left), Some(right)) = (left.as_f64(), 
right.as_f64()) { - return serde_json::Number::from_f64(left / right).map(serde_json::Value::Number); - } - None - } - Some(ir::BinaryOperator::BinaryOpFloorDiv) => { - if let (Some(left), Some(right)) = (left.as_f64(), right.as_f64()) { - if right == 0.0 { - return None; - } - let value = (left / right).floor(); - return serde_json::Number::from_f64(value).map(serde_json::Value::Number); - } - None - } - Some(ir::BinaryOperator::BinaryOpMod) => { - if let (Some(left), Some(right)) = (left.as_f64(), right.as_f64()) { - return serde_json::Number::from_f64(left % right).map(serde_json::Value::Number); - } - None - } - _ => None, - } -} - -/// Try to fold a literal unary operation to a concrete value. -/// -/// Example: -/// - (UNARY_OP_NEG, 4) -> -4 -fn fold_literal_unary(op: i32, operand: &serde_json::Value) -> Option { - match ir::UnaryOperator::try_from(op).ok() { - Some(ir::UnaryOperator::UnaryOpNeg) => operand - .as_f64() - .and_then(|value| serde_json::Number::from_f64(-value).map(serde_json::Value::Number)), - Some(ir::UnaryOperator::UnaryOpNot) => Some(serde_json::Value::Bool(!is_truthy(operand))), - _ => None, - } -} - -impl fmt::Display for NodeStatus { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let value = match self { - NodeStatus::Queued => "queued", - NodeStatus::Running => "running", - NodeStatus::Completed => "completed", - NodeStatus::Failed => "failed", - }; - write!(f, "{value}") - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::messages::ast as ir; - use serde_json::Value; - - fn action_plus_two_expr() -> ir::Expr { - ir::Expr { - kind: Some(ir::expr::Kind::BinaryOp(Box::new(ir::BinaryOp { - left: Some(Box::new(ir::Expr { - kind: Some(ir::expr::Kind::Variable(ir::Variable { - name: "action_result".to_string(), - })), - span: None, - })), - op: ir::BinaryOperator::BinaryOpAdd as i32, - right: Some(Box::new(ir::Expr { - kind: Some(ir::expr::Kind::Literal(ir::Literal { - value: Some(ir::literal::Value::IntValue(2)), 
- })), - span: None, - })), - }))), - span: None, - } - } - - #[test] - fn test_runner_state_unrolls_loop_assignments() { - let mut state = RunnerState::new(None, None, None, true); - - state - .queue_action( - "action", - Some(vec!["action_result".to_string()]), - None, - None, - Some(0), - ) - .expect("queue action"); - let first_list = ir::Expr { - kind: Some(ir::expr::Kind::List(ir::ListExpr { - elements: vec![action_plus_two_expr()], - })), - span: None, - }; - state - .record_assignment(vec!["results".to_string()], &first_list, None, None) - .expect("record assignment"); - - state - .queue_action( - "action", - Some(vec!["action_result".to_string()]), - None, - None, - Some(1), - ) - .expect("queue action"); - let second_list = ir::Expr { - kind: Some(ir::expr::Kind::List(ir::ListExpr { - elements: vec![action_plus_two_expr()], - })), - span: None, - }; - let concat_expr = ir::Expr { - kind: Some(ir::expr::Kind::BinaryOp(Box::new(ir::BinaryOp { - left: Some(Box::new(ir::Expr { - kind: Some(ir::expr::Kind::Variable(ir::Variable { - name: "results".to_string(), - })), - span: None, - })), - op: ir::BinaryOperator::BinaryOpAdd as i32, - right: Some(Box::new(second_list)), - }))), - span: None, - }; - state - .record_assignment(vec!["results".to_string()], &concat_expr, None, None) - .expect("record assignment"); - - let mut results: Option = None; - for node_id in state.timeline.iter().rev() { - let node = state.nodes.get(node_id).unwrap(); - if let Some(value) = node.assignments.get("results") { - results = Some(value.clone()); - break; - } - } - - let results = results.expect("results assignment"); - let binary = match results { - ValueExpr::BinaryOp(value) => value, - other => panic!("expected BinaryOpValue, got {other:?}"), - }; - - match binary.left.as_ref() { - ValueExpr::Variable(value) => assert_eq!(value.name, "results"), - other => panic!("expected VariableValue, got {other:?}"), - } - - let right_list = match binary.right.as_ref() { - 
ValueExpr::List(value) => value, - other => panic!("expected ListValue, got {other:?}"), - }; - assert_eq!(right_list.elements.len(), 1); - - let item_bin = match &right_list.elements[0] { - ValueExpr::BinaryOp(value) => value, - other => panic!("expected BinaryOpValue, got {other:?}"), - }; - - match item_bin.left.as_ref() { - ValueExpr::Variable(value) => assert_eq!(value.name, "action_result"), - other => panic!("expected VariableValue, got {other:?}"), - } - - match item_bin.right.as_ref() { - ValueExpr::Literal(value) => assert_eq!(value.value, Value::Number(2.into())), - other => panic!("expected LiteralValue, got {other:?}"), - } - } - - #[test] - fn test_runner_state_single_target_assignments_stay_symbolic() { - let mut state = RunnerState::new(None, None, None, true); - - let initial = ValueExpr::Dict(DictValue { - entries: vec![DictEntryValue { - key: ValueExpr::Literal(LiteralValue { - value: Value::String("result".to_string()), - }), - value: ValueExpr::Literal(LiteralValue { - value: Value::Number(1.into()), - }), - }], - }); - state - .record_assignment_value(vec!["result".to_string()], initial, None, None) - .expect("record initial assignment"); - - let wrapped = ValueExpr::Dict(DictValue { - entries: vec![DictEntryValue { - key: ValueExpr::Literal(LiteralValue { - value: Value::String("result".to_string()), - }), - value: ValueExpr::Variable(VariableValue { - name: "result".to_string(), - }), - }], - }); - state - .record_assignment_value(vec!["result".to_string()], wrapped, None, None) - .expect("record wrapped assignment"); - - let mut latest: Option = None; - for node_id in state.timeline.iter().rev() { - let node = state.nodes.get(node_id).expect("node"); - if let Some(value) = node.assignments.get("result") { - latest = Some(value.clone()); - break; - } - } - let latest = latest.expect("latest assignment"); - let dict = match latest { - ValueExpr::Dict(value) => value, - other => panic!("expected DictValue, got {other:?}"), - }; - 
assert_eq!(dict.entries.len(), 1); - match &dict.entries[0].value { - ValueExpr::Variable(value) => assert_eq!(value.name, "result"), - other => panic!("expected VariableValue, got {other:?}"), - } - } - - #[test] - fn test_materialize_value_keeps_self_referential_variable_symbolic() { - let mut state = RunnerState::new(None, None, None, true); - state - .record_assignment_value( - vec!["count".to_string()], - ValueExpr::Literal(LiteralValue { - value: Value::Number(0.into()), - }), - None, - None, - ) - .expect("record initial count"); - state - .record_assignment_value( - vec!["count".to_string()], - ValueExpr::BinaryOp(BinaryOpValue { - left: Box::new(ValueExpr::Variable(VariableValue { - name: "count".to_string(), - })), - op: ir::BinaryOperator::BinaryOpAdd as i32, - right: Box::new(ValueExpr::Literal(LiteralValue { - value: Value::Number(1.into()), - })), - }), - None, - None, - ) - .expect("record count update"); - - let materialized = state.materialize_value(ValueExpr::Variable(VariableValue { - name: "count".to_string(), - })); - match materialized { - ValueExpr::Variable(value) => assert_eq!(value.name, "count"), - other => panic!("expected VariableValue, got {other:?}"), - } - } - - #[test] - fn test_runner_state_graph_dirty_for_action_updates() { - let mut state = RunnerState::new(None, None, None, true); - assert!(!state.consume_graph_dirty_for_durable_execution()); - - let action_result = state - .queue_action( - "action", - Some(vec!["action_result".to_string()]), - None, - None, - None, - ) - .expect("queue action"); - assert!(state.consume_graph_dirty_for_durable_execution()); - assert!(!state.consume_graph_dirty_for_durable_execution()); - - state - .increment_action_attempt(action_result.node_id) - .expect("increment action attempt"); - assert!(state.consume_graph_dirty_for_durable_execution()); - } - - #[test] - fn test_runner_state_graph_dirty_not_set_for_assignments() { - let mut state = RunnerState::new(None, None, None, true); - let 
value_expr = ValueExpr::Literal(LiteralValue { - value: Value::Number(1.into()), - }); - state - .record_assignment_value(vec!["value".to_string()], value_expr, None, None) - .expect("record assignment"); - - assert!(!state.consume_graph_dirty_for_durable_execution()); - } - - #[test] - fn test_runner_state_records_action_start_stop_timestamps() { - let mut state = RunnerState::new(None, None, None, true); - let action_result = state - .queue_action( - "action", - Some(vec!["action_result".to_string()]), - None, - None, - None, - ) - .expect("queue action"); - - // Clear queue-time dirty bit so lifecycle transitions are isolated. - assert!(state.consume_graph_dirty_for_durable_execution()); - - state - .mark_running(action_result.node_id) - .expect("mark running"); - let started_at = state - .nodes - .get(&action_result.node_id) - .and_then(|node| node.started_at); - assert!( - started_at.is_some(), - "running action should record started_at" - ); - assert!( - state - .nodes - .get(&action_result.node_id) - .and_then(|node| node.completed_at) - .is_none(), - "running action should clear completed_at" - ); - assert!( - !state.ready_queue.contains(&action_result.node_id), - "running action should be removed from ready_queue" - ); - assert!(state.consume_graph_dirty_for_durable_execution()); - - state - .mark_completed(action_result.node_id) - .expect("mark completed"); - let completed_at = state - .nodes - .get(&action_result.node_id) - .and_then(|node| node.completed_at); - assert!( - completed_at.is_some(), - "completed action should record completed_at" - ); - assert!( - completed_at >= started_at, - "completed_at should be at or after started_at" - ); - assert!(state.consume_graph_dirty_for_durable_execution()); - } -} diff --git a/crates/waymark/src/waymark_core/runner/synthetic_exceptions.rs b/crates/waymark/src/waymark_core/runner/synthetic_exceptions.rs deleted file mode 100644 index df89b71f..00000000 --- 
a/crates/waymark/src/waymark_core/runner/synthetic_exceptions.rs +++ /dev/null @@ -1,90 +0,0 @@ -//! Synthetic exception helpers produced by Rust runtime coordination paths. - -use serde_json::Value; - -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub(crate) enum SyntheticExceptionType { - ExecutorResume, - ActionTimeout, -} - -impl SyntheticExceptionType { - pub(crate) fn as_type_str(self) -> &'static str { - match self { - Self::ExecutorResume => "ExecutorResume", - Self::ActionTimeout => "ActionTimeout", - } - } - - fn from_type_str(value: &str) -> Option { - match value { - "ExecutorResume" => Some(Self::ExecutorResume), - "ActionTimeout" => Some(Self::ActionTimeout), - _ => None, - } - } - - pub(crate) fn from_value(value: &Value) -> Option { - let Value::Object(map) = value else { - return None; - }; - map.get("type") - .and_then(Value::as_str) - .and_then(Self::from_type_str) - } -} - -pub(crate) fn build_synthetic_exception_value( - exception_type: SyntheticExceptionType, - message: impl Into, - fields: Vec<(String, Value)>, -) -> Value { - let mut map = serde_json::Map::new(); - map.insert( - "type".to_string(), - Value::String(exception_type.as_type_str().to_string()), - ); - map.insert("message".to_string(), Value::String(message.into())); - for (key, value) in fields { - map.insert(key, value); - } - Value::Object(map) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn synthetic_exception_from_value_happy_path() { - let value = serde_json::json!({"type": "ActionTimeout", "message": "x"}); - assert_eq!( - SyntheticExceptionType::from_value(&value), - Some(SyntheticExceptionType::ActionTimeout) - ); - } - - #[test] - fn build_synthetic_exception_value_happy_path() { - let value = build_synthetic_exception_value( - SyntheticExceptionType::ExecutorResume, - "resume", - vec![( - "attempt".to_string(), - Value::Number(serde_json::Number::from(2)), - )], - ); - let Value::Object(map) = value else { - panic!("expected object value"); - }; - 
assert_eq!( - map.get("type"), - Some(&Value::String("ExecutorResume".to_string())) - ); - assert_eq!( - map.get("message"), - Some(&Value::String("resume".to_string())) - ); - assert_eq!(map.get("attempt"), Some(&Value::Number(2.into()))); - } -} diff --git a/crates/waymark/src/waymark_core/runner/value_visitor.rs b/crates/waymark/src/waymark_core/runner/value_visitor.rs deleted file mode 100644 index 82f02db1..00000000 --- a/crates/waymark/src/waymark_core/runner/value_visitor.rs +++ /dev/null @@ -1,533 +0,0 @@ -//! Shared ValueExpr visitors for traversal, resolution, and evaluation. - -use std::collections::{HashMap, HashSet}; - -use serde::{Deserialize, Serialize}; -use uuid::Uuid; - -use super::state::{ - ActionCallSpec, ActionResultValue, BinaryOpValue, DictEntryValue, DictValue, DotValue, - FunctionCallValue, IndexValue, ListValue, LiteralValue, SpreadValue, UnaryOpValue, - VariableValue, -}; - -#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] -#[serde(tag = "type", content = "data")] -pub enum ValueExpr { - Literal(LiteralValue), - Variable(VariableValue), - ActionResult(ActionResultValue), - BinaryOp(BinaryOpValue), - UnaryOp(UnaryOpValue), - List(ListValue), - Dict(DictValue), - Index(IndexValue), - Dot(DotValue), - FunctionCall(FunctionCallValue), - Spread(SpreadValue), -} - -/// Resolve variables inside a ValueExpr tree without executing actions. -/// -/// Example IR: -/// - y = x + 1 (where x -> LiteralValue(2)) -/// Produces BinaryOpValue(LiteralValue(2), +, LiteralValue(1)). 
-pub struct ValueExprResolver<'a> { - resolve_variable: &'a dyn Fn(&str, &mut HashSet) -> ValueExpr, - seen: &'a mut HashSet, -} - -impl<'a> ValueExprResolver<'a> { - pub fn new( - resolve_variable: &'a dyn Fn(&str, &mut HashSet) -> ValueExpr, - seen: &'a mut HashSet, - ) -> Self { - Self { - resolve_variable, - seen, - } - } - - pub fn visit(&mut self, expr: &ValueExpr) -> ValueExpr { - match expr { - ValueExpr::Literal(value) => ValueExpr::Literal(value.clone()), - ValueExpr::Variable(value) => (self.resolve_variable)(&value.name, self.seen), - ValueExpr::ActionResult(value) => ValueExpr::ActionResult(value.clone()), - ValueExpr::BinaryOp(value) => ValueExpr::BinaryOp(BinaryOpValue { - left: Box::new(self.visit(&value.left)), - op: value.op, - right: Box::new(self.visit(&value.right)), - }), - ValueExpr::UnaryOp(value) => ValueExpr::UnaryOp(UnaryOpValue { - op: value.op, - operand: Box::new(self.visit(&value.operand)), - }), - ValueExpr::List(value) => ValueExpr::List(ListValue { - elements: value.elements.iter().map(|item| self.visit(item)).collect(), - }), - ValueExpr::Dict(value) => ValueExpr::Dict(DictValue { - entries: value - .entries - .iter() - .map(|entry| DictEntryValue { - key: self.visit(&entry.key), - value: self.visit(&entry.value), - }) - .collect(), - }), - ValueExpr::Index(value) => ValueExpr::Index(IndexValue { - object: Box::new(self.visit(&value.object)), - index: Box::new(self.visit(&value.index)), - }), - ValueExpr::Dot(value) => ValueExpr::Dot(DotValue { - object: Box::new(self.visit(&value.object)), - attribute: value.attribute.clone(), - }), - ValueExpr::FunctionCall(value) => ValueExpr::FunctionCall(FunctionCallValue { - name: value.name.clone(), - args: value.args.iter().map(|arg| self.visit(arg)).collect(), - kwargs: value - .kwargs - .iter() - .map(|(name, arg)| (name.clone(), self.visit(arg))) - .collect(), - global_function: value.global_function, - }), - ValueExpr::Spread(value) => { - let kwargs = value - .action - .kwargs - 
.iter() - .map(|(name, arg)| (name.clone(), self.visit(arg))) - .collect::>(); - let action = ActionCallSpec { - action_name: value.action.action_name.clone(), - module_name: value.action.module_name.clone(), - kwargs, - }; - ValueExpr::Spread(SpreadValue { - collection: Box::new(self.visit(&value.collection)), - loop_var: value.loop_var.clone(), - action, - }) - } - } - } -} - -/// Collect execution node ids that supply data to a ValueExpr tree. -/// -/// Example IR: -/// - total = a + @sum(values) -/// Returns the node ids that last defined `a` and the action node for sum(). -pub struct ValueExprSourceCollector<'a> { - resolve_variable: &'a dyn Fn(&str) -> Option, -} - -impl<'a> ValueExprSourceCollector<'a> { - pub fn new(resolve_variable: &'a dyn Fn(&str) -> Option) -> Self { - Self { resolve_variable } - } - - pub fn visit(&self, expr: &ValueExpr) -> HashSet { - match expr { - ValueExpr::Literal(_) => HashSet::new(), - ValueExpr::Variable(value) => { - (self.resolve_variable)(&value.name).into_iter().collect() - } - ValueExpr::ActionResult(value) => [value.node_id].into_iter().collect(), - ValueExpr::BinaryOp(value) => { - let mut sources = self.visit(&value.left); - sources.extend(self.visit(&value.right)); - sources - } - ValueExpr::UnaryOp(value) => self.visit(&value.operand), - ValueExpr::List(value) => { - let mut sources = HashSet::new(); - for item in &value.elements { - sources.extend(self.visit(item)); - } - sources - } - ValueExpr::Dict(value) => { - let mut sources = HashSet::new(); - for entry in &value.entries { - sources.extend(self.visit(&entry.key)); - sources.extend(self.visit(&entry.value)); - } - sources - } - ValueExpr::Index(value) => { - let mut sources = self.visit(&value.object); - sources.extend(self.visit(&value.index)); - sources - } - ValueExpr::Dot(value) => self.visit(&value.object), - ValueExpr::FunctionCall(value) => { - let mut sources = HashSet::new(); - for arg in &value.args { - sources.extend(self.visit(arg)); - } - for arg 
in value.kwargs.values() { - sources.extend(self.visit(arg)); - } - sources - } - ValueExpr::Spread(value) => { - let mut sources = self.visit(&value.collection); - for arg in value.action.kwargs.values() { - sources.extend(self.visit(arg)); - } - sources - } - } - } -} - -/// Evaluate ValueExpr nodes into concrete Python values. -/// -/// Example: -/// - BinaryOpValue(VariableValue("a"), +, LiteralValue(1)) becomes the -/// current value of a plus 1. -pub struct ValueExprEvaluator<'a, E> { - resolve_variable: &'a dyn Fn(&str) -> Result, - resolve_action_result: &'a dyn Fn(&ActionResultValue) -> Result, - resolve_function_call: &'a ResolveFunctionCall<'a, E>, - apply_binary: - &'a dyn Fn(i32, serde_json::Value, serde_json::Value) -> Result, - apply_unary: &'a dyn Fn(i32, serde_json::Value) -> Result, - error_factory: &'a dyn Fn(&str) -> E, -} - -type ResolveFunctionCall<'a, E> = dyn Fn( - &FunctionCallValue, - Vec, - HashMap, - ) -> Result - + 'a; - -impl<'a, E> ValueExprEvaluator<'a, E> { - pub fn new( - resolve_variable: &'a dyn Fn(&str) -> Result, - resolve_action_result: &'a dyn Fn(&ActionResultValue) -> Result, - resolve_function_call: &'a ResolveFunctionCall<'a, E>, - apply_binary: &'a dyn Fn( - i32, - serde_json::Value, - serde_json::Value, - ) -> Result, - apply_unary: &'a dyn Fn(i32, serde_json::Value) -> Result, - error_factory: &'a dyn Fn(&str) -> E, - ) -> Self { - Self { - resolve_variable, - resolve_action_result, - resolve_function_call, - apply_binary, - apply_unary, - error_factory, - } - } - - pub fn visit(&self, expr: &ValueExpr) -> Result { - match expr { - ValueExpr::Literal(value) => Ok(value.value.clone()), - ValueExpr::Variable(value) => (self.resolve_variable)(&value.name), - ValueExpr::ActionResult(value) => (self.resolve_action_result)(value), - ValueExpr::BinaryOp(value) => { - let left = self.visit(&value.left)?; - let right = self.visit(&value.right)?; - (self.apply_binary)(value.op, left, right) - } - ValueExpr::UnaryOp(value) => { - 
let operand = self.visit(&value.operand)?; - (self.apply_unary)(value.op, operand) - } - ValueExpr::List(value) => { - let mut items = Vec::with_capacity(value.elements.len()); - for item in &value.elements { - items.push(self.visit(item)?); - } - Ok(serde_json::Value::Array(items)) - } - ValueExpr::Dict(value) => { - let mut map = serde_json::Map::with_capacity(value.entries.len()); - for entry in &value.entries { - let key_value = self.visit(&entry.key)?; - let key = key_value - .as_str() - .map(|value| value.to_string()) - .unwrap_or_else(|| key_value.to_string()); - let entry_value = self.visit(&entry.value)?; - map.insert(key, entry_value); - } - Ok(serde_json::Value::Object(map)) - } - ValueExpr::Index(value) => { - let object = self.visit(&value.object)?; - let index = self.visit(&value.index)?; - match (object, index) { - (serde_json::Value::Array(items), serde_json::Value::Number(idx)) => { - let idx = idx.as_i64().unwrap_or(-1); - if idx < 0 || idx as usize >= items.len() { - return Err((self.error_factory)("index out of range")); - } - Ok(items[idx as usize].clone()) - } - (serde_json::Value::Object(map), serde_json::Value::String(key)) => map - .get(&key) - .cloned() - .or_else(|| lookup_exception_value(&map, &key)) - .ok_or_else(|| (self.error_factory)("dict has no key")), - _ => Err((self.error_factory)("unsupported index operation")), - } - } - ValueExpr::Dot(value) => { - let object = self.visit(&value.object)?; - if let serde_json::Value::Object(map) = object { - return map - .get(&value.attribute) - .cloned() - .or_else(|| lookup_exception_value(&map, &value.attribute)) - .ok_or_else(|| (self.error_factory)("dict has no key")); - } - Err((self.error_factory)("attribute not found")) - } - ValueExpr::FunctionCall(value) => { - let mut args = Vec::with_capacity(value.args.len()); - for arg in &value.args { - args.push(self.visit(arg)?); - } - let mut kwargs = HashMap::new(); - for (name, arg) in &value.kwargs { - kwargs.insert(name.clone(), 
self.visit(arg)?); - } - (self.resolve_function_call)(value, args, kwargs) - } - ValueExpr::Spread(_) => Err((self.error_factory)( - "cannot replay unresolved spread expression", - )), - } - } -} - -fn lookup_exception_value( - map: &serde_json::Map, - key: &str, -) -> Option { - if !(map.contains_key("type") && map.contains_key("message")) { - return None; - } - map.get("values") - .and_then(|value| value.as_object()) - .and_then(|values| values.get(key)) - .cloned() -} - -/// Recursively resolve variable references throughout a value tree. -/// -/// Use this as the core materialization step before assignment storage. -/// -/// Example IR: -/// - z = (x + y) * 2 -/// The tree walk replaces VariableValue("x")/("y") with their latest -/// symbolic definitions before storing z. -pub fn resolve_value_tree( - value: &ValueExpr, - resolve_variable: &dyn Fn(&str, &mut HashSet) -> ValueExpr, -) -> ValueExpr { - let mut seen = HashSet::new(); - let mut resolver = ValueExprResolver::new(resolve_variable, &mut seen); - resolver.visit(value) -} - -/// Find execution node ids that supply data to the given value. -/// -/// Example IR: -/// - total = a + @sum(values) -/// Returns the latest assignment node for a and the action node for sum(). 
-pub fn collect_value_sources( - value: &ValueExpr, - resolve_variable: &dyn Fn(&str) -> Option, -) -> HashSet { - let collector = ValueExprSourceCollector::new(resolve_variable); - collector.visit(value) -} - -#[cfg(test)] -mod tests { - use std::collections::{HashMap, HashSet}; - - use serde_json::Value; - use uuid::Uuid; - - use super::*; - use crate::messages::ast as ir; - - fn literal_int(value: i64) -> ValueExpr { - ValueExpr::Literal(LiteralValue { - value: Value::Number(value.into()), - }) - } - - #[test] - fn test_value_expr_resolver_visit_happy_path() { - let mut seen = HashSet::new(); - let resolve = |name: &str, _: &mut HashSet| { - if name == "x" { - literal_int(3) - } else { - literal_int(0) - } - }; - let mut resolver = ValueExprResolver::new(&resolve, &mut seen); - let expr = ValueExpr::BinaryOp(BinaryOpValue { - left: Box::new(ValueExpr::Variable(VariableValue { - name: "x".to_string(), - })), - op: ir::BinaryOperator::BinaryOpAdd as i32, - right: Box::new(literal_int(1)), - }); - - let resolved = resolver.visit(&expr); - match resolved { - ValueExpr::BinaryOp(value) => { - assert!(matches!(*value.left, ValueExpr::Literal(_))); - assert!(matches!(*value.right, ValueExpr::Literal(_))); - } - other => panic!("expected binary value, got {other:?}"), - } - } - - #[test] - fn test_value_expr_source_collector_visit_happy_path() { - let variable_source = Uuid::new_v4(); - let action_source = Uuid::new_v4(); - let resolve = |name: &str| { - if name == "x" { - Some(variable_source) - } else { - None - } - }; - let collector = ValueExprSourceCollector::new(&resolve); - let expr = ValueExpr::BinaryOp(BinaryOpValue { - left: Box::new(ValueExpr::Variable(VariableValue { - name: "x".to_string(), - })), - op: ir::BinaryOperator::BinaryOpAdd as i32, - right: Box::new(ValueExpr::ActionResult(ActionResultValue { - node_id: action_source, - action_name: "fetch".to_string(), - iteration_index: None, - result_index: None, - })), - }); - - let sources = 
collector.visit(&expr); - assert!(sources.contains(&variable_source)); - assert!(sources.contains(&action_source)); - } - - #[test] - fn test_value_expr_evaluator_visit_happy_path() { - let resolve_variable = |name: &str| -> Result { - if name == "x" { - Ok(Value::Number(2.into())) - } else { - Err(format!("unknown variable: {name}")) - } - }; - let resolve_action_result = - |_value: &ActionResultValue| -> Result { Ok(Value::Number(0.into())) }; - let resolve_function_call = - |_call: &FunctionCallValue, - args: Vec, - _kwargs: HashMap| - -> Result { Ok(Value::Number((args.len() as i64).into())) }; - let apply_binary = |_op: i32, left: Value, right: Value| -> Result { - match (left.as_i64(), right.as_i64()) { - (Some(left), Some(right)) => Ok(Value::Number((left + right).into())), - _ => Err("bad operands".to_string()), - } - }; - let apply_unary = |_op: i32, value: Value| -> Result { - Ok(Value::Bool(!value.as_bool().unwrap_or(false))) - }; - let error_factory = |message: &str| message.to_string(); - - let evaluator = ValueExprEvaluator::new( - &resolve_variable, - &resolve_action_result, - &resolve_function_call, - &apply_binary, - &apply_unary, - &error_factory, - ); - let expr = ValueExpr::BinaryOp(BinaryOpValue { - left: Box::new(ValueExpr::Variable(VariableValue { - name: "x".to_string(), - })), - op: ir::BinaryOperator::BinaryOpAdd as i32, - right: Box::new(literal_int(5)), - }); - - let value = evaluator.visit(&expr).expect("evaluate expression"); - assert_eq!(value, Value::Number(7.into())); - } - - #[test] - fn test_resolve_value_tree_happy_path() { - let expr = ValueExpr::List(ListValue { - elements: vec![ValueExpr::Variable(VariableValue { - name: "user_id".to_string(), - })], - }); - let resolve = |name: &str, _seen: &mut HashSet| { - if name == "user_id" { - ValueExpr::Literal(LiteralValue { - value: Value::String("abc".to_string()), - }) - } else { - ValueExpr::Literal(LiteralValue { value: Value::Null }) - } - }; - - let resolved = 
resolve_value_tree(&expr, &resolve); - match resolved { - ValueExpr::List(list) => { - assert_eq!(list.elements.len(), 1); - assert!(matches!(list.elements[0], ValueExpr::Literal(_))); - } - other => panic!("expected list value, got {other:?}"), - } - } - - #[test] - fn test_collect_value_sources_happy_path() { - let source_a = Uuid::new_v4(); - let source_b = Uuid::new_v4(); - let expr = ValueExpr::FunctionCall(FunctionCallValue { - name: "sum".to_string(), - args: vec![ValueExpr::Variable(VariableValue { - name: "a".to_string(), - })], - kwargs: HashMap::from([( - "other".to_string(), - ValueExpr::ActionResult(ActionResultValue { - node_id: source_b, - action_name: "compute".to_string(), - iteration_index: None, - result_index: None, - }), - )]), - global_function: None, - }); - let resolve = |name: &str| if name == "a" { Some(source_a) } else { None }; - - let sources = collect_value_sources(&expr, &resolve); - assert_eq!(sources.len(), 2); - assert!(sources.contains(&source_a)); - assert!(sources.contains(&source_b)); - } -} diff --git a/crates/waymark/src/webapp/server.rs b/crates/waymark/src/webapp/server.rs index 43818ca3..9afda350 100644 --- a/crates/waymark/src/webapp/server.rs +++ b/crates/waymark/src/webapp/server.rs @@ -16,12 +16,15 @@ use tera::{Context as TeraContext, Tera}; use tokio::net::TcpListener; use tracing::{error, info}; use uuid::Uuid; +use waymark_webapp_backend::WebappBackend; +use waymark_webapp_core::WorkerStatus; -use super::types::{ +use waymark_webapp_core::{ ActionLogsResponse, FilterValuesResponse, HealthResponse, InstanceExportInfo, TimelineEntry, - WebappConfig, WorkflowInstanceExport, WorkflowRunDataResponse, + WorkflowInstanceExport, WorkflowRunDataResponse, }; -use crate::backends::WebappBackend; + +use crate::WebappConfig; // Embed templates at compile time const TEMPLATE_BASE: &str = include_str!("../../templates/base.html"); @@ -367,7 +370,7 @@ async fn get_action_logs( let logs: Vec<_> = timeline .into_iter() .filter(|e| 
e.action_id == action_id_str) - .map(|e| super::types::ActionLogEntry { + .map(|e| waymark_webapp_core::ActionLogEntry { action_id: e.action_id, action_name: e.action_name, module_name: e.module_name, @@ -736,7 +739,7 @@ struct InvocationRow { fn render_invocations_page( templates: &Tera, - instances: &[super::types::InstanceSummary], + instances: &[waymark_webapp_core::InstanceSummary], current_page: i64, total_pages: i64, search_query: Option, @@ -812,8 +815,8 @@ struct GraphNode { fn render_instance_detail_page( templates: &Tera, - instance: &super::types::InstanceDetail, - graph: Option, + instance: &waymark_webapp_core::InstanceDetail, + graph: Option, ) -> String { let graph_data = graph .as_ref() @@ -843,8 +846,8 @@ fn render_instance_detail_page( render_template(templates, "workflow_run.html", &context) } -fn build_graph_data(graph: &super::types::ExecutionGraphView) -> GraphData { - let action_nodes: Vec<&super::types::ExecutionNodeView> = graph +fn build_graph_data(graph: &waymark_webapp_core::ExecutionGraphView) -> GraphData { + let action_nodes: Vec<&waymark_webapp_core::ExecutionNodeView> = graph .nodes .iter() .filter(|node| is_action_node(&node.node_type)) @@ -1055,7 +1058,7 @@ struct ScheduleRow { fn render_schedules_page( templates: &Tera, - schedules: &[super::types::ScheduleSummary], + schedules: &[waymark_webapp_core::ScheduleSummary], current_page: i64, total_pages: i64, total_count: i64, @@ -1136,8 +1139,8 @@ struct ScheduleInvocationRow { fn render_schedule_detail_page( templates: &Tera, - schedule: &super::types::ScheduleDetail, - invocations: &[super::types::ScheduleInvocationSummary], + schedule: &waymark_webapp_core::ScheduleDetail, + invocations: &[waymark_webapp_core::ScheduleInvocationSummary], current_page: i64, total_pages: i64, ) -> String { @@ -1234,11 +1237,7 @@ struct WorkerInstanceRowView { updated_at: String, } -fn render_workers_page( - templates: &Tera, - statuses: &[super::WorkerStatus], - window_minutes: i64, -) -> String { 
+fn render_workers_page(templates: &Tera, statuses: &[WorkerStatus], window_minutes: i64) -> String { use crate::pool_status::PoolTimeSeries; // Build action rows @@ -1373,13 +1372,15 @@ mod tests { use sqlx::postgres::PgPoolOptions; use tower::util::ServiceExt; use uuid::Uuid; + use waymark_backend_memory::MemoryBackend; + use waymark_backend_postgres::PostgresBackend; + use waymark_webapp_backend::WebappBackend; + use waymark_worker_status_backend::{WorkerStatusBackend as _, WorkerStatusUpdate}; use super::{WebappState, build_graph_data, build_router, init_templates}; - use crate::backends::{ - MemoryBackend, PostgresBackend, WebappBackend, WorkerStatusBackend, WorkerStatusUpdate, - }; - use crate::test_support::postgres_setup; - use crate::webapp::{ExecutionEdgeView, ExecutionGraphView, ExecutionNodeView}; + + use waymark_test_support::postgres_setup; + use waymark_webapp_core::{ExecutionEdgeView, ExecutionGraphView, ExecutionNodeView}; #[test] fn build_graph_data_projects_internal_nodes_to_action_dependencies() { diff --git a/crates/waymark/src/webapp/types.rs b/crates/waymark/src/webapp/types.rs index 7805c428..0b2ec6e8 100644 --- a/crates/waymark/src/webapp/types.rs +++ b/crates/waymark/src/webapp/types.rs @@ -1,8 +1,4 @@ -//! Shared types for the webapp. - -use chrono::{DateTime, Utc}; -use serde::{Deserialize, Serialize}; -use uuid::Uuid; +//! Shared types for the webapp server. /// Configuration for the webapp server. #[derive(Debug, Clone)] @@ -55,245 +51,3 @@ impl WebappConfig { format!("{}:{}", self.host, self.port) } } - -/// Instance status. 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -#[serde(rename_all = "lowercase")] -pub enum InstanceStatus { - Queued, - Running, - Completed, - Failed, -} - -impl std::fmt::Display for InstanceStatus { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Self::Queued => write!(f, "queued"), - Self::Running => write!(f, "running"), - Self::Completed => write!(f, "completed"), - Self::Failed => write!(f, "failed"), - } - } -} - -/// Summary of a workflow instance for listing. -#[derive(Debug, Clone, Serialize)] -pub struct InstanceSummary { - pub id: Uuid, - pub entry_node: Uuid, - pub created_at: DateTime, - pub status: InstanceStatus, - pub workflow_name: Option, - pub input_preview: String, -} - -/// Full details of a workflow instance. -#[derive(Debug, Clone, Serialize)] -pub struct InstanceDetail { - pub id: Uuid, - pub entry_node: Uuid, - pub created_at: DateTime, - pub status: InstanceStatus, - pub workflow_name: Option, - pub input_payload: String, - pub result_payload: String, - pub error_payload: Option, -} - -/// Node in the execution graph for display. -#[derive(Debug, Clone, Serialize)] -pub struct ExecutionNodeView { - pub id: String, - pub node_type: String, - pub label: String, - pub status: String, - pub action_name: Option, - pub module_name: Option, -} - -/// Edge in the execution graph for display. -#[derive(Debug, Clone, Serialize)] -pub struct ExecutionEdgeView { - pub source: String, - pub target: String, - pub edge_type: String, -} - -/// Execution graph data for rendering. -#[derive(Debug, Clone, Serialize)] -pub struct ExecutionGraphView { - pub nodes: Vec, - pub edges: Vec, -} - -/// Timeline entry for an action execution. 
-#[derive(Debug, Clone, Serialize)] -pub struct TimelineEntry { - pub action_id: String, - pub action_name: String, - pub module_name: Option, - pub status: String, - pub attempt_number: i32, - pub dispatched_at: Option, - pub completed_at: Option, - pub duration_ms: Option, - pub request_preview: String, - pub response_preview: String, - pub error: Option, -} - -/// Action log entry with full details. -#[derive(Debug, Clone, Serialize)] -pub struct ActionLogEntry { - pub action_id: String, - pub action_name: String, - pub module_name: Option, - pub status: String, - pub attempt_number: i32, - pub dispatched_at: Option, - pub completed_at: Option, - pub duration_ms: Option, - pub request: String, - pub response: String, - pub error: Option, -} - -/// Response for the workflow run data API. -#[derive(Debug, Serialize)] -pub struct WorkflowRunDataResponse { - pub nodes: Vec, - pub timeline: Vec, - pub page: i64, - pub per_page: i64, - pub total: i64, - pub has_more: bool, -} - -/// Response for action logs API. -#[derive(Debug, Serialize)] -pub struct ActionLogsResponse { - pub logs: Vec, -} - -/// Filter values response. -#[derive(Debug, Serialize)] -pub struct FilterValuesResponse { - pub values: Vec, -} - -/// Health check response. -#[derive(Debug, Serialize)] -pub struct HealthResponse { - pub status: &'static str, - pub service: &'static str, -} - -/// Export format for a workflow instance. -#[derive(Debug, Serialize)] -pub struct WorkflowInstanceExport { - pub export_version: &'static str, - pub exported_at: String, - pub instance: InstanceExportInfo, - pub nodes: Vec, - pub timeline: Vec, -} - -/// Full worker status for webapp display. 
-#[derive(Debug, Clone)] -pub struct WorkerStatus { - pub pool_id: Uuid, - pub active_workers: i32, - pub throughput_per_min: f64, - pub actions_per_sec: f64, - pub total_completed: i64, - pub last_action_at: Option>, - pub updated_at: DateTime, - pub median_dequeue_ms: Option, - pub median_handling_ms: Option, - pub dispatch_queue_size: Option, - pub total_in_flight: Option, - pub median_instance_duration_secs: Option, - pub active_instance_count: i32, - pub total_instances_completed: i64, - pub instances_per_sec: f64, - pub instances_per_min: f64, - pub time_series: Option>, -} - -/// Worker action stats row for display. -#[derive(Debug, Clone)] -pub struct WorkerActionRow { - pub pool_id: String, - pub active_workers: i64, - pub actions_per_sec: String, - pub throughput_per_min: i64, - pub total_completed: i64, - pub median_dequeue_ms: Option, - pub median_handling_ms: Option, - pub last_action_at: Option, - pub updated_at: String, -} - -/// Aggregate worker stats for overview cards. -#[derive(Debug, Clone)] -pub struct WorkerAggregateStats { - pub active_worker_count: i64, - pub actions_per_sec: String, - pub total_in_flight: i64, - pub total_queue_depth: i64, -} - -/// Instance info for export. -#[derive(Debug, Serialize)] -pub struct InstanceExportInfo { - pub id: String, - pub status: String, - pub created_at: String, - pub input_payload: String, - pub result_payload: String, -} - -/// Schedule summary for listing. -#[derive(Debug, Clone, Serialize)] -pub struct ScheduleSummary { - pub id: String, - pub workflow_name: String, - pub schedule_name: String, - pub schedule_type: String, - pub cron_expression: Option, - pub interval_seconds: Option, - pub status: String, - pub next_run_at: Option, - pub last_run_at: Option, - pub created_at: String, -} - -/// Full schedule details. 
-#[derive(Debug, Clone, Serialize)] -pub struct ScheduleDetail { - pub id: String, - pub workflow_name: String, - pub schedule_name: String, - pub schedule_type: String, - pub cron_expression: Option, - pub interval_seconds: Option, - pub jitter_seconds: i64, - pub status: String, - pub next_run_at: Option, - pub last_run_at: Option, - pub last_instance_id: Option, - pub created_at: String, - pub updated_at: String, - pub priority: i32, - pub allow_duplicate: bool, - pub input_payload: Option, -} - -/// Invocation summary row for schedule detail pages. -#[derive(Debug, Clone, Serialize)] -pub struct ScheduleInvocationSummary { - pub id: Uuid, - pub created_at: DateTime, - pub status: InstanceStatus, -} diff --git a/crates/waymark/src/workers/status.rs b/crates/waymark/src/workers/status.rs index c9428602..03fec671 100644 --- a/crates/waymark/src/workers/status.rs +++ b/crates/waymark/src/workers/status.rs @@ -9,8 +9,8 @@ use std::time::Duration; use chrono::{DateTime, Utc}; use tracing::{info, warn}; use uuid::Uuid; +use waymark_worker_status_backend::{WorkerStatusBackend, WorkerStatusUpdate}; -use crate::backends::{WorkerStatusBackend, WorkerStatusUpdate}; use crate::pool_status::{PoolTimeSeries, TimeSeriesEntry}; #[derive(Debug, Clone)]