diff --git a/.gitignore b/.gitignore index 1cf1d602..5a5e2a84 100644 --- a/.gitignore +++ b/.gitignore @@ -26,6 +26,9 @@ share/python-wheels/ *.egg MANIFEST +# Exception for crates. +!/crates/lib + # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. diff --git a/Cargo.lock b/Cargo.lock index bf98727e..08e29a10 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1517,9 +1517,9 @@ checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" [[package]] name = "metrics" -version = "0.24.2" +version = "0.24.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25dea7ac8057892855ec285c440160265225438c3c45072613c25a4b26e98ef5" +checksum = "5d5312e9ba3771cfa961b585728215e3d972c950a3eed9252aa093d6301277e8" dependencies = [ "ahash", "portable-atomic", @@ -1847,9 +1847,9 @@ checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" [[package]] name = "portable-atomic" -version = "1.11.1" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" [[package]] name = "potential_utf" @@ -2529,7 +2529,7 @@ dependencies = [ "serde_json", "sha2", "smallvec", - "thiserror 2.0.17", + "thiserror", "tokio", "tokio-stream", "tracing", @@ -2614,7 +2614,7 @@ dependencies = [ "smallvec", "sqlx-core", "stringprep", - "thiserror 2.0.17", + "thiserror", "tracing", "uuid", "whoami", @@ -2653,7 +2653,7 @@ dependencies = [ "smallvec", "sqlx-core", "stringprep", - "thiserror 2.0.17", + "thiserror", "tracing", "uuid", "whoami", @@ -2679,7 +2679,7 @@ dependencies = [ "serde", "serde_urlencoded", "sqlx-core", - "thiserror 2.0.17", + "thiserror", "tracing", "url", "uuid", @@ -2783,33 +2783,13 @@ dependencies = [ "unicode-segmentation", ] -[[package]] -name = "thiserror" -version = "1.0.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" -dependencies = [ - "thiserror-impl 1.0.69", -] - [[package]] name = "thiserror" version = "2.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" dependencies = [ - "thiserror-impl 2.0.17", -] - -[[package]] -name = "thiserror-impl" -version = "1.0.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" -dependencies = [ - "proc-macro2", - "quote", - "syn", + "thiserror-impl", ] [[package]] @@ -3375,9 +3355,10 @@ dependencies = [ "sha2", "sqlx", "tera", - "thiserror 1.0.69", + "thiserror", "tokio", "tokio-stream", + "tokio-util", "tonic 0.11.0", "tonic-health", "tower 0.5.2", @@ -3385,9 +3366,123 @@ dependencies = [ "tracing-chrome", "tracing-subscriber", "uuid", + "waymark-backend-fault-injection", + "waymark-backend-memory", + "waymark-backend-postgres", + "waymark-backend-postgres-migrations", + "waymark-backends-core", + "waymark-core-backend", "waymark-dag", - "waymark-observability-macros", + "waymark-garbage-collector-backend", + "waymark-integration-support", + "waymark-ir-parser", + "waymark-observability", "waymark-proto", + "waymark-runner", + "waymark-runner-state", + "waymark-scheduler-backend", + "waymark-scheduler-core", + "waymark-test-support", + "waymark-webapp-backend", + "waymark-webapp-core", + "waymark-worker-status-backend", + "waymark-workflow-registry-backend", +] + +[[package]] +name = "waymark-backend-fault-injection" +version = "0.1.0" +dependencies = [ + "async-trait", + "uuid", + "waymark-backend-memory", + "waymark-backends-core", + "waymark-core-backend", + "waymark-workflow-registry-backend", +] + +[[package]] +name = "waymark-backend-memory" +version = "0.1.0" +dependencies = [ + "async-trait", + "chrono", + "rmp-serde", + "serde_json", + "uuid", + "waymark-backends-core", + "waymark-core-backend", + "waymark-garbage-collector-backend", + "waymark-scheduler-backend", + "waymark-scheduler-core", + "waymark-webapp-backend", + "waymark-webapp-core", + "waymark-worker-status-backend", + "waymark-workflow-registry-backend", +] + +[[package]] +name = "waymark-backend-postgres" +version = "0.1.0" +dependencies = [ + "async-trait", + "chrono", + "prost 0.12.6", + "rmp-serde", + "serde", + "serde_json", + "serial_test", + "sqlx", + "tokio", + "tracing", + "uuid", + "waymark-backend-postgres-migrations", + "waymark-backends-core", + "waymark-core-backend", + "waymark-dag", + "waymark-garbage-collector-backend", + "waymark-ir-parser", + "waymark-observability", + "waymark-proto", + "waymark-runner", + "waymark-runner-state", + "waymark-scheduler-backend", + "waymark-scheduler-core", + "waymark-test-support", + "waymark-webapp-backend", + "waymark-webapp-core", + "waymark-worker-status-backend", + "waymark-workflow-registry-backend", +] + +[[package]] +name = "waymark-backend-postgres-migrations" +version = "0.1.0" +dependencies = [ + "sqlx", +] + +[[package]] +name = "waymark-backends-core" +version = "0.1.0" +dependencies = [ + "serde_json", + "sqlx", + "thiserror", +] + +[[package]] +name = "waymark-core-backend" +version = "0.1.0" +dependencies = [ + "async-trait", + "chrono", + "serde", + "serde_json", + "uuid", + "waymark-backends-core", + "waymark-dag", + "waymark-runner-state", ] [[package]] @@ -3396,9 +3491,9 @@ version = "0.1.0" dependencies = [ "rustc-hash", "serde", - "thiserror 1.0.69", + "thiserror", "uuid", - "waymark", + "waymark-ir-parser", "waymark-proto", ] @@ -3415,7 +3510,47 @@ dependencies = [ "tokio", "uuid", "waymark", + "waymark-backend-memory", + "waymark-core-backend", "waymark-dag", + "waymark-ir-parser", + "waymark-runner-state", + "waymark-workflow-registry-backend", +] + +[[package]] +name = "waymark-garbage-collector-backend" +version = "0.1.0" +dependencies = [ + "async-trait", + "chrono", + "waymark-backends-core", +] + +[[package]] +name = "waymark-integration-support" +version = "0.1.0" +dependencies = [ + "anyhow", + "sqlx", + "tokio", + "waymark-backend-postgres-migrations", +] + +[[package]] +name = "waymark-ir-parser" +version = "0.1.0" +dependencies = [ + "regex", + "waymark-proto", +] + +[[package]] +name = "waymark-observability" +version = "0.1.0" +dependencies = [ + "tracing", + "waymark-observability-macros", ] [[package]] @@ -3438,6 +3573,105 @@ dependencies = [ "tonic-build", ] +[[package]] +name = "waymark-runner" +version = "0.1.0" +dependencies = [ + "chrono", + "rustc-hash", + "serde_json", + "thiserror", + "tracing", + "uuid", + "waymark-backend-memory", + "waymark-core-backend", + "waymark-dag", + "waymark-ir-parser", + "waymark-observability", + "waymark-proto", + "waymark-runner-state", +] + +[[package]] +name = "waymark-runner-state" +version = "0.1.0" +dependencies = [ + "chrono", + "serde", + "serde_json", + "thiserror", + "uuid", + "waymark-dag", + "waymark-proto", +] + +[[package]] +name = "waymark-scheduler-backend" +version = "0.1.0" +dependencies = [ + "async-trait", + "uuid", + "waymark-backends-core", + "waymark-scheduler-core", +] + +[[package]] +name = "waymark-scheduler-core" +version = "0.1.0" +dependencies = [ + "chrono", + "cron", + "rand 0.8.5", + "serde", + "uuid", +] + +[[package]] +name = "waymark-test-support" +version = "0.1.0" +dependencies = [ + "sqlx", + "waymark-integration-support", +] + +[[package]] +name = "waymark-webapp-backend" +version = "0.1.0" +dependencies = [ + "async-trait", + "uuid", + "waymark-backends-core", + "waymark-webapp-core", +] + +[[package]] +name = "waymark-webapp-core" +version = "0.1.0" +dependencies = [ + "chrono", + "serde", + "uuid", +] + +[[package]] +name = "waymark-worker-status-backend" +version = "0.1.0" +dependencies = [ + "async-trait", + "chrono", + "uuid", + "waymark-backends-core", +] + +[[package]] +name = "waymark-workflow-registry-backend" +version = "0.1.0" +dependencies = [ + "async-trait", + "uuid", + "waymark-backends-core", +] + [[package]] name = "webpki-roots" version = "0.26.11" diff --git a/Cargo.toml b/Cargo.toml index c75a7299..853e1067 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,23 +1,52 @@ [workspace] resolver = "3" -members = ["crates/*"] +members = ["crates/lib/*", "crates/bin/*", "crates/waymark"] [workspace.dependencies] waymark = { path = "crates/waymark" } -waymark-dag = { path = "crates/dag" } -waymark-proto = { path = "crates/proto" } +waymark-backend-fault-injection = { path = "crates/lib/backend-fault-injection" } +waymark-backend-memory = { path = "crates/lib/backend-memory" } +waymark-backend-postgres = { path = "crates/lib/backend-postgres" } +waymark-backend-postgres-migrations = { path = "crates/lib/backend-postgres-migrations" } +waymark-backends-core = { path = "crates/lib/backends-core" } +waymark-core-backend = { path = "crates/lib/core-backend" } +waymark-dag = { path = "crates/lib/dag" } +waymark-garbage-collector-backend = { path = "crates/lib/garbage-collector-backend" } +waymark-integration-support = { path = "crates/lib/integration-support" } +waymark-ir-parser = { path = "crates/lib/ir-parser" } +waymark-observability = { path = "crates/lib/observability" } +waymark-observability-macros = { path = "crates/lib/observability-macros" } +waymark-proto = { path = "crates/lib/proto" } +waymark-runner = { path = "crates/lib/runner" } +waymark-runner-state = { path = "crates/lib/runner-state" } +waymark-scheduler-backend = { path = "crates/lib/scheduler-backend" } +waymark-scheduler-core = { path = "crates/lib/scheduler-core" } +waymark-test-support = { path = "crates/lib/test-support" } +waymark-webapp-backend = { path = "crates/lib/webapp-backend" } +waymark-webapp-core = { path = "crates/lib/webapp-core" } +waymark-worker-status-backend = { path = "crates/lib/worker-status-backend" } +waymark-workflow-registry-backend = { path = "crates/lib/workflow-registry-backend" } anyhow = "1" +async-trait = "0.1" +chrono = { version = "0.4", default-features = false } clap = "4.5" +cron = "0.12" proptest = "1.9" prost = "0.12" prost-types = "0.12" +rand = "0.8" +regex = "1" +rmp-serde = "1" rustc-hash = "2" serde = "1" serde_json = "1" +serial_test = "2" sha2 = "0.10" -thiserror = "1" +sqlx = { version = "0.8", default-features = false } +thiserror = "2" tokio = "1" tonic = "0.11" tonic-build = "0.11" +tracing = "0.1" uuid = "1" diff --git a/crates/fuzzer/Cargo.toml b/crates/bin/fuzzer/Cargo.toml similarity index 65% rename from crates/fuzzer/Cargo.toml rename to crates/bin/fuzzer/Cargo.toml index e31f9970..8c7c039d 100644 --- a/crates/fuzzer/Cargo.toml +++ b/crates/bin/fuzzer/Cargo.toml @@ -14,3 +14,8 @@ uuid = { workspace = true, features = ["serde", "v4"] } tokio = { workspace = true } waymark = { workspace = true } waymark-dag = { workspace = true } +waymark-ir-parser = { workspace = true } +waymark-runner-state = { workspace = true } +waymark-backend-memory = { workspace = true } +waymark-core-backend = { workspace = true } +waymark-workflow-registry-backend = { workspace = true } diff --git a/crates/fuzzer/src/bin/waymark-fuzz.rs b/crates/bin/fuzzer/src/bin/waymark-fuzz.rs similarity index 100% rename from crates/fuzzer/src/bin/waymark-fuzz.rs rename to crates/bin/fuzzer/src/bin/waymark-fuzz.rs diff --git a/crates/fuzzer/src/generator.rs b/crates/bin/fuzzer/src/generator.rs similarity index 100% rename from crates/fuzzer/src/generator.rs rename to crates/bin/fuzzer/src/generator.rs diff --git a/crates/fuzzer/src/harness.rs b/crates/bin/fuzzer/src/harness.rs similarity index 96% rename from crates/fuzzer/src/harness.rs rename to crates/bin/fuzzer/src/harness.rs index 242d2924..2bec4043 100644 --- a/crates/fuzzer/src/harness.rs +++ b/crates/bin/fuzzer/src/harness.rs @@ -9,17 +9,17 @@ use prost::Message; use serde_json::Value; use sha2::{Digest, Sha256}; use uuid::Uuid; +use waymark_backend_memory::MemoryBackend; +use waymark_core_backend::QueuedInstance; +use waymark_workflow_registry_backend::{WorkflowRegistration, WorkflowRegistryBackend as _}; use super::generator::GeneratedCase; -use waymark::backends::{ - MemoryBackend, QueuedInstance, WorkflowRegistration, WorkflowRegistryBackend, -}; use waymark::messages::ast as ir; -use waymark::waymark_core::ir_parser::parse_program; use waymark::waymark_core::runloop::{RunLoop, RunLoopSupervisorConfig}; -use waymark::waymark_core::runner::RunnerState; use waymark::workers::{ActionCallable, InlineWorkerPool, WorkerPoolError}; use waymark_dag::convert_to_dag; +use waymark_ir_parser::parse_program; +use waymark_runner_state::RunnerState; pub async fn run_case(case_index: usize, case: &GeneratedCase) -> Result<()> { let program = parse_program(case.source.trim()).map_err(|err| { diff --git a/crates/fuzzer/src/lib.rs b/crates/bin/fuzzer/src/lib.rs similarity index 100% rename from crates/fuzzer/src/lib.rs rename to crates/bin/fuzzer/src/lib.rs diff --git a/crates/lib/backend-fault-injection/Cargo.toml b/crates/lib/backend-fault-injection/Cargo.toml new file mode 100644 index 00000000..1b592ba1 --- /dev/null +++ b/crates/lib/backend-fault-injection/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "waymark-backend-fault-injection" +version = "0.1.0" +edition = "2024" + +[dependencies] +async-trait = { workspace = true } +uuid = { workspace = true } +waymark-backends-core = { workspace = true } +waymark-backend-memory = { workspace = true } +waymark-core-backend = { workspace = true } +waymark-workflow-registry-backend = { workspace = true } diff --git a/crates/lib/backend-fault-injection/src/lib.rs b/crates/lib/backend-fault-injection/src/lib.rs new file mode 100644 index 00000000..4a43d305 --- /dev/null +++ b/crates/lib/backend-fault-injection/src/lib.rs @@ -0,0 +1,128 @@ +use std::sync::{ + Arc, + atomic::{AtomicBool, AtomicUsize, Ordering as AtomicOrdering}, +}; + +use uuid::Uuid; +use waymark_backend_memory::MemoryBackend; +use waymark_backends_core::{BackendError, BackendResult}; +use waymark_core_backend::{ + CoreBackend, GraphUpdate, InstanceDone, InstanceLockStatus, LockClaim, QueuedInstanceBatch, +}; +use waymark_workflow_registry_backend::{ + WorkflowRegistration, WorkflowRegistryBackend, WorkflowVersion, +}; + +#[derive(Clone)] +pub struct FaultInjectingBackend { + inner: MemoryBackend, + fail_get_queued_instances_with_depth_limit: Arc, + get_queued_instances_calls: Arc, +} + +impl FaultInjectingBackend { + pub fn with_depth_limit_poll_failures(inner: MemoryBackend) -> Self { + Self { + inner, + fail_get_queued_instances_with_depth_limit: Arc::new(AtomicBool::new(true)), + get_queued_instances_calls: Arc::new(AtomicUsize::new(0)), + } + } + + pub fn get_queued_instances_calls(&self) -> usize { + self.get_queued_instances_calls.load(AtomicOrdering::SeqCst) + } + + pub fn queue_len(&self) -> usize { + self.inner + .instance_queue() + .as_ref() + .map(|queue| queue.lock().expect("queue poisoned").len()) + .unwrap_or(0) + } + + pub fn instances_done_len(&self) -> usize { + self.inner.instances_done().len() + } +} + +#[async_trait::async_trait] +impl CoreBackend for FaultInjectingBackend { + fn clone_box(&self) -> Box { + Box::new(self.clone()) + } + + async fn save_graphs( + &self, + claim: LockClaim, + graphs: &[GraphUpdate], + ) -> BackendResult> { + self.inner.save_graphs(claim, graphs).await + } + + async fn save_actions_done( + &self, + actions: &[waymark_core_backend::ActionDone], + ) -> BackendResult<()> { + self.inner.save_actions_done(actions).await + } + + async fn save_instances_done(&self, instances: &[InstanceDone]) -> BackendResult<()> { + self.inner.save_instances_done(instances).await + } + + async fn get_queued_instances( + &self, + size: usize, + claim: LockClaim, + ) -> BackendResult { + self.get_queued_instances_calls + .fetch_add(1, AtomicOrdering::SeqCst); + if self + .fail_get_queued_instances_with_depth_limit + .load(AtomicOrdering::SeqCst) + { + return Err(BackendError::Message("depth limit exceeded".to_string())); + } + self.inner.get_queued_instances(size, claim).await + } + + async fn queue_instances( + &self, + instances: &[waymark_core_backend::QueuedInstance], + ) -> BackendResult<()> { + self.inner.queue_instances(instances).await + } + + async fn refresh_instance_locks( + &self, + claim: LockClaim, + instance_ids: &[Uuid], + ) -> BackendResult> { + self.inner.refresh_instance_locks(claim, instance_ids).await + } + + async fn release_instance_locks( + &self, + lock_uuid: Uuid, + instance_ids: &[Uuid], + ) -> BackendResult<()> { + self.inner + .release_instance_locks(lock_uuid, instance_ids) + .await + } +} + +#[async_trait::async_trait] +impl WorkflowRegistryBackend for FaultInjectingBackend { + async fn upsert_workflow_version( + &self, + registration: &WorkflowRegistration, + ) -> BackendResult { + self.inner.upsert_workflow_version(registration).await + } + + async fn get_workflow_versions(&self, ids: &[Uuid]) -> BackendResult> { + self.inner.get_workflow_versions(ids).await + } +} diff --git a/crates/lib/backend-memory/Cargo.toml b/crates/lib/backend-memory/Cargo.toml new file mode 100644 index 00000000..203e0f35 --- /dev/null +++ b/crates/lib/backend-memory/Cargo.toml @@ -0,0 +1,37 @@ +[package] +name = "waymark-backend-memory" +version = "0.1.0" +edition = "2024" + +[dependencies] +async-trait = { workspace = true } +chrono = { workspace = true } +rmp-serde = { workspace = true } +serde_json = { workspace = true } +uuid = { workspace = true } +waymark-backends-core = { workspace = true } +waymark-core-backend = { workspace = true } +waymark-garbage-collector-backend = { workspace = true, optional = true } +waymark-scheduler-backend = { workspace = true, optional = true } +waymark-scheduler-core = { workspace = true } +waymark-worker-status-backend = { workspace = true } +waymark-workflow-registry-backend = { workspace = true } +waymark-webapp-backend = { workspace = true, optional = true } +waymark-webapp-core = { workspace = true, optional = true } + +[features] +default = [ + "core-backend", + "worker-status-backend", + "workflow-registry-backend", + "scheduler-backend", + "garbage-collector-backend", + "webapp-backend", +] + +core-backend = [] +garbage-collector-backend = ["dep:waymark-garbage-collector-backend"] +scheduler-backend = ["dep:waymark-scheduler-backend"] +worker-status-backend = [] +workflow-registry-backend = [] +webapp-backend = ["dep:waymark-webapp-backend", "dep:waymark-webapp-core"] diff --git a/crates/lib/backend-memory/src/core_backend.rs b/crates/lib/backend-memory/src/core_backend.rs new file mode 100644 index 00000000..49a40330 --- /dev/null +++ b/crates/lib/backend-memory/src/core_backend.rs @@ -0,0 +1,159 @@ +use chrono::Utc; +use uuid::Uuid; +use waymark_backends_core::{BackendError, BackendResult}; +use waymark_core_backend::{ + ActionDone, GraphUpdate, InstanceDone, InstanceLockStatus, LockClaim, QueuedInstance, + QueuedInstanceBatch, +}; + +#[async_trait::async_trait] +impl waymark_core_backend::CoreBackend for crate::MemoryBackend { + fn clone_box(&self) -> Box { + Box::new(self.clone()) + } + + async fn save_graphs( + &self, + claim: LockClaim, + graphs: &[GraphUpdate], + ) -> BackendResult> { + let mut stored = self.graph_updates.lock().expect("graph updates poisoned"); + stored.extend(graphs.iter().cloned()); + let mut guard = self.instance_locks.lock().expect("instance locks poisoned"); + let mut locks = Vec::with_capacity(graphs.len()); + for graph in graphs { + if let Some((Some(lock_uuid), lock_expires_at)) = guard.get_mut(&graph.instance_id) + && *lock_uuid == claim.lock_uuid + && lock_expires_at.is_none_or(|expires_at| expires_at < claim.lock_expires_at) + { + *lock_expires_at = Some(claim.lock_expires_at); + } + let (lock_uuid, lock_expires_at) = guard + .get(&graph.instance_id) + .cloned() + .unwrap_or((None, None)); + locks.push(InstanceLockStatus { + instance_id: graph.instance_id, + lock_uuid, + lock_expires_at, + }); + } + Ok(locks) + } + + async fn save_actions_done(&self, actions: &[ActionDone]) -> BackendResult<()> { + let mut stored = self.actions_done.lock().expect("actions done poisoned"); + stored.extend(actions.iter().cloned()); + Ok(()) + } + + async fn save_instances_done(&self, instances: &[InstanceDone]) -> BackendResult<()> { + let mut stored = self.instances_done.lock().expect("instances done poisoned"); + stored.extend(instances.iter().cloned()); + if !instances.is_empty() { + let mut locks = self.instance_locks.lock().expect("instance locks poisoned"); + for instance in instances { + locks.remove(&instance.executor_id); + } + } + Ok(()) + } + + async fn get_queued_instances( + &self, + size: usize, + claim: LockClaim, + ) -> BackendResult { + if size == 0 { + return Ok(QueuedInstanceBatch { + instances: Vec::new(), + }); + } + let queue = match &self.instance_queue { + Some(queue) => queue, + None => { + return Ok(QueuedInstanceBatch { + instances: Vec::new(), + }); + } + }; + let mut guard = queue.lock().expect("instance queue poisoned"); + let now = Utc::now(); + let mut instances = Vec::new(); + while instances.len() < size { + let Some(instance) = guard.front() else { + break; + }; + if let Some(scheduled_at) = instance.scheduled_at + && scheduled_at > now + { + break; + } + let instance = guard.pop_front().expect("instance queue empty"); + instances.push(instance); + } + if !instances.is_empty() { + let mut locks = self.instance_locks.lock().expect("instance locks poisoned"); + for instance in &instances { + locks.insert( + instance.instance_id, + (Some(claim.lock_uuid), Some(claim.lock_expires_at)), + ); + } + } + Ok(QueuedInstanceBatch { instances }) + } + + async fn queue_instances(&self, instances: &[QueuedInstance]) -> BackendResult<()> { + if instances.is_empty() { + return Ok(()); + } + let queue = self.instance_queue.as_ref().ok_or_else(|| { + BackendError::Message("memory backend missing instance queue".to_string()) + })?; + let mut guard = queue.lock().expect("instance queue poisoned"); + for instance in instances { + guard.push_back(instance.clone()); + } + Ok(()) + } + + async fn refresh_instance_locks( + &self, + claim: LockClaim, + instance_ids: &[Uuid], + ) -> BackendResult> { + let mut guard = self.instance_locks.lock().expect("instance locks poisoned"); + let mut locks = Vec::new(); + for instance_id in instance_ids { + let entry = guard + .entry(*instance_id) + .or_insert((Some(claim.lock_uuid), Some(claim.lock_expires_at))); + if entry.0 == Some(claim.lock_uuid) { + entry.1 = Some(claim.lock_expires_at); + } + locks.push(InstanceLockStatus { + instance_id: *instance_id, + lock_uuid: entry.0, + lock_expires_at: entry.1, + }); + } + Ok(locks) + } + + async fn release_instance_locks( + &self, + lock_uuid: Uuid, + instance_ids: &[Uuid], + ) -> BackendResult<()> { + let mut guard = self.instance_locks.lock().expect("instance locks poisoned"); + for instance_id in instance_ids { + if let Some((current_lock, _)) = guard.get(instance_id) + && *current_lock == Some(lock_uuid) + { + guard.remove(instance_id); + } + } + Ok(()) + } +} diff --git a/crates/lib/backend-memory/src/garbage_collector_backend.rs b/crates/lib/backend-memory/src/garbage_collector_backend.rs new file mode 100644 index 00000000..6a4cda66 --- /dev/null +++ b/crates/lib/backend-memory/src/garbage_collector_backend.rs @@ -0,0 +1,14 @@ +use chrono::{DateTime, Utc}; +use waymark_backends_core::BackendResult; +use waymark_garbage_collector_backend::{GarbageCollectionResult, GarbageCollectorBackend}; + +#[async_trait::async_trait] +impl GarbageCollectorBackend for crate::MemoryBackend { + async fn collect_done_instances( + &self, + _older_than: DateTime, + _limit: usize, + ) -> BackendResult { + Ok(GarbageCollectionResult::default()) + } +} diff --git a/crates/lib/backend-memory/src/lib.rs b/crates/lib/backend-memory/src/lib.rs new file mode 100644 index 00000000..e2ef56e4 --- /dev/null +++ b/crates/lib/backend-memory/src/lib.rs @@ -0,0 +1,111 @@ +//! In-memory backend that prints persistence operations. + +#[cfg(feature = "core-backend")] +mod core_backend; + +#[cfg(feature = "garbage-collector-backend")] +mod garbage_collector_backend; + +#[cfg(feature = "scheduler-backend")] +mod scheduler_backend; + +#[cfg(feature = "webapp-backend")] +mod webapp_backend; + +#[cfg(feature = "worker-status-backend")] +mod worker_status_backend; + +#[cfg(feature = "workflow-registry-backend")] +mod workflow_registry_backend; + +use std::collections::{HashMap, VecDeque}; +use std::sync::{Arc, Mutex}; + +use chrono::{DateTime, Utc}; +use uuid::Uuid; + +use waymark_core_backend::{ActionDone, GraphUpdate, InstanceDone, QueuedInstance}; +use waymark_scheduler_core::{ScheduleId, WorkflowSchedule}; +use waymark_worker_status_backend::WorkerStatusUpdate; +use waymark_workflow_registry_backend::WorkflowRegistration; + +type WorkflowVersionKey = (String, String); +type WorkflowVersionValue = (Uuid, WorkflowRegistration); +type WorkflowVersionStore = HashMap; +type InstanceLockStore = HashMap, Option>)>; + +/// Backend that stores updates in memory for tests or local runs. +#[derive(Clone)] +pub struct MemoryBackend { + instance_queue: Option>>>, + graph_updates: Arc>>, + actions_done: Arc>>, + instances_done: Arc>>, + worker_status_updates: Arc>>, + #[cfg_attr(not(feature = "workflow-registry-backend"), allow(dead_code))] + workflow_versions: Arc>, + #[cfg_attr(not(feature = "scheduler-backend"), allow(dead_code))] + schedules: Arc>>, + #[cfg_attr(not(feature = "core-backend"), allow(dead_code))] + instance_locks: Arc>, +} + +impl Default for MemoryBackend { + fn default() -> Self { + Self { + instance_queue: None, + graph_updates: Arc::new(Mutex::new(Vec::new())), + actions_done: Arc::new(Mutex::new(Vec::new())), + instances_done: Arc::new(Mutex::new(Vec::new())), + worker_status_updates: Arc::new(Mutex::new(Vec::new())), + workflow_versions: Arc::new(Mutex::new(HashMap::new())), + schedules: Arc::new(Mutex::new(HashMap::new())), + instance_locks: Arc::new(Mutex::new(HashMap::new())), + } + } +} + +impl MemoryBackend { + pub fn new() -> Self { + Self::default() + } + + pub fn with_queue(queue: Arc>>) -> Self { + Self { + instance_queue: Some(queue), + ..Self::default() + } + } + + pub fn instance_queue(&self) -> Option>>> { + self.instance_queue.clone() + } + + pub fn graph_updates(&self) -> Vec { + self.graph_updates + .lock() + .expect("graph updates poisoned") + .clone() + } + + pub fn actions_done(&self) -> Vec { + self.actions_done + .lock() + .expect("actions done poisoned") + .clone() + } + + pub fn instances_done(&self) -> Vec { + self.instances_done + .lock() + .expect("instances done poisoned") + .clone() + } + + pub fn worker_status_updates(&self) -> Vec { + self.worker_status_updates + .lock() + .expect("worker status updates poisoned") + .clone() + } +} diff --git a/crates/lib/backend-memory/src/scheduler_backend.rs b/crates/lib/backend-memory/src/scheduler_backend.rs new file mode 100644 index 00000000..3764f489 --- /dev/null +++ b/crates/lib/backend-memory/src/scheduler_backend.rs @@ -0,0 +1,209 @@ +use chrono::Utc; +use uuid::Uuid; +use waymark_backends_core::{BackendError, BackendResult}; +use waymark_scheduler_backend::SchedulerBackend; +use waymark_scheduler_core::{ + CreateScheduleParams, ScheduleId, ScheduleType, WorkflowSchedule, compute_next_run, +}; + +#[async_trait::async_trait] +impl SchedulerBackend for crate::MemoryBackend { + async fn upsert_schedule(&self, params: &CreateScheduleParams) -> BackendResult { + let mut guard = self.schedules.lock().expect("schedules poisoned"); + let existing_schedule = guard.iter().find_map(|(id, schedule)| { + if schedule.workflow_name == params.workflow_name + && schedule.schedule_name == params.schedule_name + { + Some((*id, schedule.clone())) + } else { + None + } + }); + let schedule_id = existing_schedule + .as_ref() + .map(|(id, _)| *id) + .unwrap_or_else(ScheduleId::new); + let now = Utc::now(); + let next_run_at = match existing_schedule + .as_ref() + .and_then(|(_, schedule)| schedule.next_run_at) + { + Some(next_run_at) => Some(next_run_at), + None => Some( + compute_next_run( + params.schedule_type, + params.cron_expression.as_deref(), + params.interval_seconds, + params.jitter_seconds, + None, + ) + .map_err(BackendError::Message)?, + ), + }; + let schedule = WorkflowSchedule { + id: schedule_id.0, + workflow_name: params.workflow_name.clone(), + schedule_name: params.schedule_name.clone(), + schedule_type: params.schedule_type.as_str().to_string(), + cron_expression: params.cron_expression.clone(), + interval_seconds: params.interval_seconds, + jitter_seconds: params.jitter_seconds, + input_payload: params.input_payload.clone(), + status: "active".to_string(), + next_run_at, + last_run_at: existing_schedule + .as_ref() + .and_then(|(_, schedule)| schedule.last_run_at), + last_instance_id: existing_schedule + .as_ref() + .and_then(|(_, schedule)| schedule.last_instance_id), + created_at: existing_schedule + .as_ref() + .map(|(_, schedule)| schedule.created_at) + .unwrap_or(now), + updated_at: now, + priority: params.priority, + allow_duplicate: params.allow_duplicate, + }; + guard.insert(schedule_id, schedule); + Ok(schedule_id) + } + + async fn get_schedule(&self, id: ScheduleId) -> BackendResult { + let guard = self.schedules.lock().expect("schedules poisoned"); + guard + .get(&id) + .cloned() + .ok_or_else(|| BackendError::Message(format!("schedule not found: {id}"))) + } + + async fn get_schedule_by_name( + &self, + workflow_name: &str, + schedule_name: &str, + ) -> BackendResult> { + let guard = self.schedules.lock().expect("schedules poisoned"); + Ok(guard + .values() + .find(|schedule| { + schedule.workflow_name == workflow_name + && schedule.schedule_name == schedule_name + && schedule.status != "deleted" + }) + .cloned()) + } + + async fn list_schedules( + &self, + limit: i64, + offset: i64, + ) -> BackendResult> { + let guard = self.schedules.lock().expect("schedules poisoned"); + let mut schedules: Vec<_> = guard + .values() + .filter(|schedule| schedule.status != "deleted") + .cloned() + .collect(); + schedules.sort_by(|a, b| { + (&a.workflow_name, &a.schedule_name).cmp(&(&b.workflow_name, &b.schedule_name)) + }); + let start = offset.max(0) as usize; + let end = start.saturating_add(limit.max(0) as usize); + Ok(schedules + .into_iter() + .skip(start) + .take(end - start) + .collect()) + } + + async fn count_schedules(&self) -> BackendResult { + let guard = self.schedules.lock().expect("schedules poisoned"); + Ok(guard + .values() + .filter(|schedule| schedule.status != "deleted") + .count() as i64) + } + + async fn update_schedule_status(&self, id: ScheduleId, status: &str) -> BackendResult { + let mut guard = self.schedules.lock().expect("schedules poisoned"); + if let Some(schedule) = guard.get_mut(&id) { + schedule.status = status.to_string(); + schedule.updated_at = Utc::now(); + Ok(true) + } else { + Ok(false) + } + } + + async fn delete_schedule(&self, id: ScheduleId) -> BackendResult { + SchedulerBackend::update_schedule_status(self, id, "deleted").await + } + + async fn find_due_schedules(&self, limit: i32) -> BackendResult> { + let guard = self.schedules.lock().expect("schedules poisoned"); + let now = Utc::now(); + let mut schedules: Vec<_> = guard + .values() + .filter(|schedule| { + schedule.status == "active" + && schedule + .next_run_at + .map(|next| next <= now) + .unwrap_or(false) + }) + .cloned() + .collect(); + schedules.sort_by_key(|schedule| schedule.next_run_at); + Ok(schedules.into_iter().take(limit as usize).collect()) + } + + async fn has_running_instance(&self, _schedule_id: ScheduleId) -> BackendResult { + Ok(false) + } + + async fn mark_schedule_executed( + &self, + schedule_id: ScheduleId, + instance_id: Uuid, + ) -> BackendResult<()> { + let mut guard = self.schedules.lock().expect("schedules poisoned"); + let schedule = guard + .get_mut(&schedule_id) + .ok_or_else(|| BackendError::Message(format!("schedule not found: {schedule_id}")))?; + let schedule_type = ScheduleType::parse(&schedule.schedule_type) + .ok_or_else(|| BackendError::Message("invalid schedule type".to_string()))?; + let next_run_at = compute_next_run( + schedule_type, + schedule.cron_expression.as_deref(), + schedule.interval_seconds, + schedule.jitter_seconds, + Some(Utc::now()), + ) + .map_err(BackendError::Message)?; + schedule.last_run_at = Some(Utc::now()); + schedule.last_instance_id = Some(instance_id); + schedule.next_run_at = Some(next_run_at); + schedule.updated_at = Utc::now(); + Ok(()) + } + + async fn skip_schedule_run(&self, schedule_id: ScheduleId) -> BackendResult<()> { + let mut guard = self.schedules.lock().expect("schedules poisoned"); + let schedule = guard + .get_mut(&schedule_id) + .ok_or_else(|| BackendError::Message(format!("schedule not found: {schedule_id}")))?; + let schedule_type = ScheduleType::parse(&schedule.schedule_type) + .ok_or_else(|| BackendError::Message("invalid schedule type".to_string()))?; + let next_run_at = compute_next_run( + schedule_type, + schedule.cron_expression.as_deref(), + schedule.interval_seconds, + schedule.jitter_seconds, + Some(Utc::now()), + ) + .map_err(BackendError::Message)?; + schedule.next_run_at = Some(next_run_at); + schedule.updated_at = Utc::now(); + Ok(()) + } +} diff --git a/crates/lib/backend-memory/src/webapp_backend.rs b/crates/lib/backend-memory/src/webapp_backend.rs new file mode 100644 index 00000000..5bcca7c1 --- /dev/null +++ b/crates/lib/backend-memory/src/webapp_backend.rs @@ -0,0 +1,295 @@ +use std::collections::HashMap; + +use chrono::Utc; +use uuid::Uuid; +use waymark_backends_core::{BackendError, BackendResult}; +use waymark_webapp_backend::WebappBackend; +use waymark_webapp_core::{ + ExecutionGraphView, InstanceDetail, InstanceStatus, InstanceSummary, ScheduleDetail, + ScheduleInvocationSummary, ScheduleSummary, TimelineEntry, WorkerActionRow, + WorkerAggregateStats, WorkerStatus, +}; +use waymark_worker_status_backend::WorkerStatusUpdate; + +#[async_trait::async_trait] +impl WebappBackend for crate::MemoryBackend { + async fn count_instances(&self, _search: Option<&str>) -> BackendResult { + Ok(0) + } + + async fn list_instances( + &self, + _search: Option<&str>, + _limit: i64, + _offset: i64, + ) -> BackendResult> { + Ok(Vec::new()) + } + + async fn get_instance(&self, instance_id: Uuid) -> BackendResult { + Err(BackendError::Message(format!( + "instance not found: {instance_id}" + ))) + } + + async fn get_execution_graph( + &self, + _instance_id: Uuid, + ) -> BackendResult> { + Ok(None) + } + + async fn get_workflow_graph( + &self, + _instance_id: Uuid, + ) -> BackendResult> { + Ok(None) + } + + async fn get_action_results(&self, _instance_id: Uuid) -> BackendResult> { + Ok(Vec::new()) + } + + async fn get_distinct_workflows(&self) -> BackendResult> { + Ok(Vec::new()) + } + + async fn get_distinct_statuses(&self) -> BackendResult> { + Ok(vec![ + InstanceStatus::Queued.to_string(), + InstanceStatus::Running.to_string(), + InstanceStatus::Completed.to_string(), + InstanceStatus::Failed.to_string(), + ]) + } + + async fn count_schedules(&self) -> BackendResult { + let guard = self.schedules.lock().expect("schedules poisoned"); + Ok(guard + .values() + .filter(|schedule| schedule.status != "deleted") + .count() as i64) + } + + async fn list_schedules(&self, limit: i64, offset: i64) -> BackendResult> { + let guard = self.schedules.lock().expect("schedules poisoned"); + let mut schedules: Vec<_> = guard + .values() + .filter(|schedule| schedule.status != "deleted") + .cloned() + .collect(); + schedules.sort_by(|a, b| { + (&a.workflow_name, &a.schedule_name).cmp(&(&b.workflow_name, &b.schedule_name)) + }); + + let start = offset.max(0) as usize; + let page_limit = limit.max(0) as usize; + Ok(schedules + .into_iter() + .skip(start) + .take(page_limit) + .map(|schedule| ScheduleSummary { + id: schedule.id.to_string(), + workflow_name: schedule.workflow_name, + schedule_name: schedule.schedule_name, + schedule_type: schedule.schedule_type, + cron_expression: schedule.cron_expression, + interval_seconds: schedule.interval_seconds, + status: schedule.status, + next_run_at: schedule.next_run_at.map(|dt| dt.to_rfc3339()), + last_run_at: schedule.last_run_at.map(|dt| dt.to_rfc3339()), + created_at: schedule.created_at.to_rfc3339(), + }) + .collect()) + } + + async fn get_schedule(&self, schedule_id: Uuid) -> BackendResult { + let guard = self.schedules.lock().expect("schedules poisoned"); + let schedule = guard + .values() + .find(|schedule| schedule.id == schedule_id) + .cloned() + .ok_or_else(|| BackendError::Message(format!("schedule not found: {schedule_id}")))?; + + let input_payload = schedule.input_payload.as_ref().and_then(|bytes| { + rmp_serde::from_slice::(bytes) + .ok() + .and_then(|value| serde_json::to_string_pretty(&value).ok()) + }); + + Ok(ScheduleDetail { + id: schedule.id.to_string(), + workflow_name: schedule.workflow_name, + schedule_name: schedule.schedule_name, + schedule_type: schedule.schedule_type, + cron_expression: schedule.cron_expression, + interval_seconds: schedule.interval_seconds, + jitter_seconds: schedule.jitter_seconds, + status: schedule.status, + next_run_at: schedule.next_run_at.map(|dt| dt.to_rfc3339()), + last_run_at: schedule.last_run_at.map(|dt| dt.to_rfc3339()), + last_instance_id: schedule.last_instance_id.map(|id| id.to_string()), + created_at: schedule.created_at.to_rfc3339(), + updated_at: schedule.updated_at.to_rfc3339(), + priority: schedule.priority, + allow_duplicate: schedule.allow_duplicate, + input_payload, + }) + } + + async fn count_schedule_invocations(&self, _schedule_id: Uuid) -> BackendResult { + Ok(0) + } + + async fn list_schedule_invocations( + &self, + _schedule_id: Uuid, + _limit: i64, + _offset: i64, + ) -> BackendResult> { + Ok(Vec::new()) + } + + async fn update_schedule_status(&self, schedule_id: Uuid, status: &str) -> BackendResult { + let mut guard = self.schedules.lock().expect("schedules poisoned"); + let Some(schedule) = guard + .values_mut() + .find(|schedule| schedule.id == schedule_id) + else { + return Ok(false); + }; + schedule.status = status.to_string(); + schedule.updated_at = Utc::now(); + Ok(true) + } + + async fn get_distinct_schedule_statuses(&self) -> BackendResult> { + Ok(vec!["active".to_string(), "paused".to_string()]) + } + + async fn get_distinct_schedule_types(&self) -> BackendResult> { + Ok(vec!["cron".to_string(), "interval".to_string()]) + } + + async fn get_worker_action_stats( + &self, + _window_minutes: i64, + ) -> BackendResult> { + let statuses = latest_worker_statuses( + &self + .worker_status_updates + .lock() + .expect("worker status updates poisoned"), + ); + + Ok(statuses + .into_iter() + .map(|status| WorkerActionRow { + pool_id: status.pool_id.to_string(), + active_workers: status.active_workers as i64, + actions_per_sec: format!("{:.1}", status.actions_per_sec), + throughput_per_min: status.throughput_per_min as i64, + total_completed: status.total_completed, + median_dequeue_ms: status.median_dequeue_ms, + median_handling_ms: status.median_handling_ms, + last_action_at: status.last_action_at.map(|dt| dt.to_rfc3339()), + updated_at: status.updated_at.to_rfc3339(), + }) + .collect()) + } + + async fn get_worker_aggregate_stats( + &self, + _window_minutes: i64, + ) -> BackendResult { + let statuses = latest_worker_statuses( + &self + .worker_status_updates + .lock() + .expect("worker status updates poisoned"), + ); + + let active_worker_count = statuses + .iter() + .map(|status| status.active_workers as i64) + .sum(); + let total_in_flight = statuses + .iter() + .filter_map(|status| status.total_in_flight) + .sum(); + let total_queue_depth = statuses + .iter() + .filter_map(|status| status.dispatch_queue_size) + .sum(); + let actions_per_sec = statuses + .iter() + .map(|status| status.actions_per_sec) + .sum::(); + + Ok(WorkerAggregateStats { + active_worker_count, + actions_per_sec: format!("{:.1}", actions_per_sec), + total_in_flight, + total_queue_depth, + }) + } + + async fn worker_status_table_exists(&self) -> bool { + !self + .worker_status_updates + .lock() + .expect("worker status updates poisoned") + .is_empty() + } + + async fn schedules_table_exists(&self) -> bool { + !self + .schedules + .lock() + .expect("schedules poisoned") + .is_empty() + } + + async fn get_worker_statuses(&self, _window_minutes: i64) -> BackendResult> { + Ok(latest_worker_statuses( + &self + .worker_status_updates + .lock() + .expect("worker status updates poisoned"), + )) + } +} + +fn latest_worker_statuses(updates: &[WorkerStatusUpdate]) -> Vec { + let mut by_pool: HashMap = HashMap::new(); + for update in updates { + by_pool.insert(update.pool_id, update.clone()); + } + + let now = Utc::now(); + let mut statuses: Vec<_> = by_pool + .into_values() + .map(|status| WorkerStatus { + pool_id: status.pool_id, + active_workers: status.active_workers, + throughput_per_min: status.throughput_per_min, + actions_per_sec: status.actions_per_sec, + total_completed: status.total_completed, + last_action_at: status.last_action_at, + updated_at: now, + median_dequeue_ms: status.median_dequeue_ms, + median_handling_ms: status.median_handling_ms, + dispatch_queue_size: Some(status.dispatch_queue_size), + total_in_flight: Some(status.total_in_flight), + median_instance_duration_secs: status.median_instance_duration_secs, + active_instance_count: status.active_instance_count, + total_instances_completed: status.total_instances_completed, + instances_per_sec: status.instances_per_sec, + instances_per_min: status.instances_per_min, + time_series: status.time_series, + }) + .collect(); + + statuses.sort_by(|left, right| right.actions_per_sec.total_cmp(&left.actions_per_sec)); + statuses +} diff --git a/crates/lib/backend-memory/src/worker_status_backend.rs b/crates/lib/backend-memory/src/worker_status_backend.rs new file mode 100644 index 00000000..dbca9794 --- /dev/null +++ b/crates/lib/backend-memory/src/worker_status_backend.rs @@ -0,0 +1,13 @@ +use waymark_worker_status_backend::{BackendResult, WorkerStatusBackend, WorkerStatusUpdate}; + +#[async_trait::async_trait] +impl WorkerStatusBackend for crate::MemoryBackend { + async fn upsert_worker_status(&self, status: &WorkerStatusUpdate) -> BackendResult<()> { + let mut stored = self + .worker_status_updates + .lock() + .expect("worker status updates poisoned"); + stored.push(status.clone()); + Ok(()) + } +} diff --git a/crates/lib/backend-memory/src/workflow_registry_backend.rs b/crates/lib/backend-memory/src/workflow_registry_backend.rs new file mode 100644 index 00000000..e820b5a9 --- /dev/null +++ b/crates/lib/backend-memory/src/workflow_registry_backend.rs @@ -0,0 +1,58 @@ +use uuid::Uuid; +use waymark_workflow_registry_backend::{ + BackendError, BackendResult, WorkflowRegistration, WorkflowRegistryBackend, WorkflowVersion, +}; + +#[async_trait::async_trait] +impl WorkflowRegistryBackend for crate::MemoryBackend { + async fn upsert_workflow_version( + &self, + registration: &WorkflowRegistration, + ) -> BackendResult { + let mut guard = self + .workflow_versions + .lock() + .expect("workflow versions poisoned"); + let key = ( + registration.workflow_name.clone(), + registration.workflow_version.clone(), + ); + if let Some((id, existing)) = guard.get(&key) { + if existing.ir_hash != registration.ir_hash { + return Err(BackendError::Message(format!( + "workflow version already exists with different IR hash: {}@{}", + registration.workflow_name, registration.workflow_version + ))); + } + return Ok(*id); + } + + let id = Uuid::new_v4(); + guard.insert(key, (id, registration.clone())); + Ok(id) + } + + async fn get_workflow_versions(&self, ids: &[Uuid]) -> BackendResult> { + if ids.is_empty() { + return Ok(Vec::new()); + } + let guard = self + .workflow_versions + .lock() + .expect("workflow versions poisoned"); + let mut versions = Vec::new(); + for (id, registration) in guard.values() { + if ids.contains(id) { + versions.push(WorkflowVersion { + id: *id, + workflow_name: registration.workflow_name.clone(), + workflow_version: registration.workflow_version.clone(), + ir_hash: registration.ir_hash.clone(), + program_proto: registration.program_proto.clone(), + concurrent: registration.concurrent, + }); + } + } + Ok(versions) + } +} diff --git a/crates/lib/backend-postgres-migrations/Cargo.toml b/crates/lib/backend-postgres-migrations/Cargo.toml new file mode 100644 index 00000000..f84ad14c --- /dev/null +++ b/crates/lib/backend-postgres-migrations/Cargo.toml @@ -0,0 +1,7 @@ +[package] +name = "waymark-backend-postgres-migrations" +version = "0.1.0" +edition = "2024" + +[dependencies] +sqlx = { workspace = true, features = ["postgres", "macros", "migrate"] } diff --git a/crates/lib/backend-postgres-migrations/build.rs b/crates/lib/backend-postgres-migrations/build.rs new file mode 100644 index 00000000..3a8149ef --- /dev/null +++ b/crates/lib/backend-postgres-migrations/build.rs @@ -0,0 +1,3 @@ +fn main() { + println!("cargo:rerun-if-changed=migrations"); +} diff --git a/crates/waymark/migrations/0001_init.sql b/crates/lib/backend-postgres-migrations/migrations/0001_init.sql similarity index 100% rename from crates/waymark/migrations/0001_init.sql rename to crates/lib/backend-postgres-migrations/migrations/0001_init.sql diff --git a/crates/waymark/migrations/0002_runner_actions_done_execution_id.sql b/crates/lib/backend-postgres-migrations/migrations/0002_runner_actions_done_execution_id.sql similarity index 100% rename from crates/waymark/migrations/0002_runner_actions_done_execution_id.sql rename to crates/lib/backend-postgres-migrations/migrations/0002_runner_actions_done_execution_id.sql diff --git a/crates/waymark/migrations/0003_instance_locks.sql b/crates/lib/backend-postgres-migrations/migrations/0003_instance_locks.sql similarity index 100% rename from crates/waymark/migrations/0003_instance_locks.sql rename to crates/lib/backend-postgres-migrations/migrations/0003_instance_locks.sql diff --git a/crates/waymark/migrations/0004_workflow_versions.sql b/crates/lib/backend-postgres-migrations/migrations/0004_workflow_versions.sql similarity index 100% rename from crates/waymark/migrations/0004_workflow_versions.sql rename to crates/lib/backend-postgres-migrations/migrations/0004_workflow_versions.sql diff --git a/crates/waymark/migrations/0005_runner_instances_workflow_version_id.sql b/crates/lib/backend-postgres-migrations/migrations/0005_runner_instances_workflow_version_id.sql similarity index 100% rename from crates/waymark/migrations/0005_runner_instances_workflow_version_id.sql rename to crates/lib/backend-postgres-migrations/migrations/0005_runner_instances_workflow_version_id.sql diff --git a/crates/waymark/migrations/0006_drop_unused_runner_tables.sql b/crates/lib/backend-postgres-migrations/migrations/0006_drop_unused_runner_tables.sql similarity index 100% rename from crates/waymark/migrations/0006_drop_unused_runner_tables.sql rename to crates/lib/backend-postgres-migrations/migrations/0006_drop_unused_runner_tables.sql diff --git a/crates/waymark/migrations/0007_runner_instances_schedule_id.sql b/crates/lib/backend-postgres-migrations/migrations/0007_runner_instances_schedule_id.sql similarity index 100% rename from crates/waymark/migrations/0007_runner_instances_schedule_id.sql rename to crates/lib/backend-postgres-migrations/migrations/0007_runner_instances_schedule_id.sql diff --git a/crates/waymark/migrations/0008_runner_actions_done_timing.sql b/crates/lib/backend-postgres-migrations/migrations/0008_runner_actions_done_timing.sql similarity index 100% rename from crates/waymark/migrations/0008_runner_actions_done_timing.sql rename to crates/lib/backend-postgres-migrations/migrations/0008_runner_actions_done_timing.sql diff --git a/crates/waymark/migrations/0009_instance_search_columns.sql b/crates/lib/backend-postgres-migrations/migrations/0009_instance_search_columns.sql similarity index 100% rename from crates/waymark/migrations/0009_instance_search_columns.sql rename to crates/lib/backend-postgres-migrations/migrations/0009_instance_search_columns.sql diff --git a/crates/lib/backend-postgres-migrations/src/lib.rs b/crates/lib/backend-postgres-migrations/src/lib.rs new file mode 100644 index 00000000..82495aeb --- /dev/null +++ b/crates/lib/backend-postgres-migrations/src/lib.rs @@ -0,0 +1,8 @@ +//! Migrations for the postgres backend. + +use sqlx::PgPool; + +/// Run the embedded SQLx migrations. +pub async fn run(pool: &PgPool) -> Result<(), sqlx::migrate::MigrateError> { + sqlx::migrate!().run(pool).await +} diff --git a/crates/lib/backend-postgres/Cargo.toml b/crates/lib/backend-postgres/Cargo.toml new file mode 100644 index 00000000..f61f582f --- /dev/null +++ b/crates/lib/backend-postgres/Cargo.toml @@ -0,0 +1,39 @@ +[package] +name = "waymark-backend-postgres" +version = "0.1.0" +edition = "2024" + +[dependencies] +async-trait = { workspace = true } +chrono = { workspace = true } +rmp-serde = { workspace = true } +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true } +sqlx = { workspace = true, features = ["uuid", "chrono"] } +tokio = { workspace = true, features = ["macros"] } +tracing = { workspace = true } +uuid = { workspace = true } +waymark-backend-postgres-migrations = { workspace = true } +waymark-backends-core = { workspace = true } +waymark-core-backend = { workspace = true } +waymark-dag = { workspace = true } +waymark-proto = { workspace = true } +waymark-garbage-collector-backend = { workspace = true } +waymark-observability = { workspace = true } +waymark-runner = { workspace = true } +waymark-runner-state = { workspace = true } +waymark-scheduler-backend = { workspace = true } +waymark-scheduler-core = { workspace = true } +waymark-webapp-backend = { workspace = true } +waymark-webapp-core = { workspace = true } +waymark-worker-status-backend = { workspace = true } +waymark-workflow-registry-backend = { workspace = true } +prost = { workspace = true } + +[dev-dependencies] +serial_test = { workspace = true } +waymark-test-support = { workspace = true } +waymark-ir-parser = { workspace = true } + +[features] +trace = [] diff --git a/crates/waymark/src/backends/postgres/core.rs b/crates/lib/backend-postgres/src/core.rs similarity index 98% rename from crates/waymark/src/backends/postgres/core.rs rename to crates/lib/backend-postgres/src/core.rs index c827b4c0..bc46747e 100644 --- a/crates/waymark/src/backends/postgres/core.rs +++ b/crates/lib/backend-postgres/src/core.rs @@ -4,18 +4,19 @@ use std::time::Duration as StdDuration; use chrono::{DateTime, Utc}; use sqlx::{Postgres, QueryBuilder, Row}; -use tonic::async_trait; use tracing::warn; use uuid::Uuid; +use waymark_garbage_collector_backend::{GarbageCollectionResult, GarbageCollectorBackend}; +use waymark_scheduler_backend::{BackendError, BackendResult}; +use waymark_worker_status_backend::{WorkerStatusBackend, WorkerStatusUpdate}; use super::PostgresBackend; -use crate::backends::base::{ - ActionDone, BackendError, BackendResult, CoreBackend, GarbageCollectionResult, - GarbageCollectorBackend, GraphUpdate, InstanceDone, InstanceLockStatus, LockClaim, - QueuedInstance, QueuedInstanceBatch, WorkerStatusBackend, WorkerStatusUpdate, +use waymark_core_backend::{ + ActionDone, GraphUpdate, InstanceDone, InstanceLockStatus, LockClaim, QueuedInstance, + QueuedInstanceBatch, }; -use crate::observability::obs; -use crate::waymark_core::runner::state::RunnerState; +use waymark_observability::obs; +use waymark_runner_state::RunnerState; const INSTANCE_STATUS_QUEUED: &str = "queued"; const INSTANCE_STATUS_RUNNING: &str = "running"; @@ -59,7 +60,7 @@ fn is_transient_sqlstate(code: &str) -> bool { fn is_transient_backend_error(err: &BackendError) -> bool { match err { - BackendError::Sqlx(sqlx::Error::Database(db_err)) => { + BackendError::Inner(sqlx::Error::Database(db_err)) => { db_err.code().as_deref().is_some_and(is_transient_sqlstate) } // Fallback for cases where sqlstate is not preserved in wrapping. @@ -842,16 +843,16 @@ impl PostgresBackend { } } -#[async_trait] -impl CoreBackend for PostgresBackend { - fn clone_box(&self) -> Box { +#[async_trait::async_trait] +impl waymark_core_backend::CoreBackend for PostgresBackend { + fn clone_box(&self) -> Box { Box::new(self.clone()) } async fn save_graphs( &self, - claim: LockClaim, - graphs: &[GraphUpdate], + claim: waymark_core_backend::LockClaim, + graphs: &[waymark_core_backend::GraphUpdate], ) -> BackendResult> { self.save_graphs_impl(claim, graphs).await } @@ -935,7 +936,10 @@ impl CoreBackend for PostgresBackend { Ok(()) } - async fn queue_instances(&self, instances: &[QueuedInstance]) -> BackendResult<()> { + async fn queue_instances( + &self, + instances: &[waymark_core_backend::QueuedInstance], + ) -> BackendResult<()> { PostgresBackend::queue_instances(self, instances).await } } @@ -988,7 +992,7 @@ impl PostgresBackend { } } -#[async_trait] +#[async_trait::async_trait] impl GarbageCollectorBackend for PostgresBackend { async fn collect_done_instances( &self, @@ -999,7 +1003,7 @@ impl GarbageCollectorBackend for PostgresBackend { } } -#[async_trait] +#[async_trait::async_trait] impl WorkerStatusBackend for PostgresBackend { async fn upsert_worker_status(&self, status: &WorkerStatusUpdate) -> BackendResult<()> { PostgresBackend::upsert_worker_status(self, status).await @@ -1017,14 +1021,13 @@ mod tests { use serial_test::serial; use sqlx::Row; use uuid::Uuid; + use waymark_core_backend::{ActionAttemptStatus, CoreBackend}; use super::super::test_helpers::setup_backend; use super::*; - use crate::backends::{ - ActionAttemptStatus, CoreBackend, GarbageCollectorBackend, WorkerStatusBackend, - }; - use crate::waymark_core::runner::state::{ActionCallSpec, ExecutionNode, NodeStatus}; + use waymark_dag::EdgeType; + use waymark_runner_state::{ActionCallSpec, ExecutionNode, NodeStatus}; fn sample_runner_state() -> RunnerState { RunnerState::new(None, None, None, false) @@ -1343,13 +1346,11 @@ mod tests { let graph = GraphUpdate { instance_id, nodes, - edges: std::collections::HashSet::from([ - crate::waymark_core::runner::state::ExecutionEdge { - source: execution_id, - target: execution_id, - edge_type: EdgeType::StateMachine, - }, - ]), + edges: std::collections::HashSet::from([waymark_runner_state::ExecutionEdge { + source: execution_id, + target: execution_id, + edge_type: EdgeType::StateMachine, + }]), }; let extended_claim = LockClaim { lock_uuid: claim.lock_uuid, diff --git a/crates/waymark/src/backends/postgres/mod.rs b/crates/lib/backend-postgres/src/lib.rs similarity index 86% rename from crates/waymark/src/backends/postgres/mod.rs rename to crates/lib/backend-postgres/src/lib.rs index 4bec275d..2b4e3821 100644 --- a/crates/waymark/src/backends/postgres/mod.rs +++ b/crates/lib/backend-postgres/src/lib.rs @@ -11,11 +11,8 @@ use std::collections::HashMap; use std::sync::{Arc, Mutex}; use sqlx::PgPool; - -use crate::db; -use crate::observability::obs; - -use super::base::{BackendError, BackendResult}; +use waymark_backends_core::{BackendError, BackendResult}; +use waymark_observability::obs; /// Persist runner state and action results in Postgres. #[derive(Clone)] @@ -37,7 +34,9 @@ impl PostgresBackend { #[obs] pub async fn connect(dsn: &str) -> BackendResult { let pool = PgPool::connect(dsn).await?; - db::run_migrations(&pool).await?; + waymark_backend_postgres_migrations::run(&pool) + .await + .map_err(|err| BackendError::Message(err.to_string()))?; Ok(Self::new(pool)) } @@ -86,12 +85,12 @@ impl PostgresBackend { .clone() } - pub(super) fn count_query(counts: &Arc>>, label: &str) { + pub(crate) fn count_query(counts: &Arc>>, label: &str) { let mut guard = counts.lock().expect("query counts poisoned"); *guard.entry(label.to_string()).or_insert(0) += 1; } - pub(super) fn count_batch_size( + pub(crate) fn count_batch_size( counts: &Arc>>>, label: &str, size: usize, @@ -104,11 +103,11 @@ impl PostgresBackend { *entry.entry(size).or_insert(0) += 1; } - pub(super) fn serialize(value: &T) -> Result, BackendError> { + pub(crate) fn serialize(value: &T) -> Result, BackendError> { rmp_serde::to_vec_named(value).map_err(|e| BackendError::Message(e.to_string())) } - pub(super) fn deserialize( + pub(crate) fn deserialize( payload: &[u8], ) -> Result { rmp_serde::from_slice(payload).map_err(|e| BackendError::Message(e.to_string())) diff --git a/crates/waymark/src/backends/postgres/registry.rs b/crates/lib/backend-postgres/src/registry.rs similarity index 94% rename from crates/waymark/src/backends/postgres/registry.rs rename to crates/lib/backend-postgres/src/registry.rs index c8fb5a68..94fc1e2c 100644 --- a/crates/waymark/src/backends/postgres/registry.rs +++ b/crates/lib/backend-postgres/src/registry.rs @@ -1,13 +1,13 @@ use sqlx::Row; -use tonic::async_trait; use uuid::Uuid; +use waymark_backends_core::{BackendError, BackendResult}; +use waymark_workflow_registry_backend::{ + WorkflowRegistration, WorkflowRegistryBackend, WorkflowVersion, +}; use super::PostgresBackend; -use crate::backends::base::{ - BackendError, BackendResult, WorkflowRegistration, WorkflowRegistryBackend, WorkflowVersion, -}; -#[async_trait] +#[async_trait::async_trait] impl WorkflowRegistryBackend for PostgresBackend { async fn upsert_workflow_version( &self, @@ -95,7 +95,7 @@ mod tests { use serial_test::serial; use super::super::test_helpers::setup_backend; - use crate::backends::{WorkflowRegistration, WorkflowRegistryBackend}; + use waymark_workflow_registry_backend::{WorkflowRegistration, WorkflowRegistryBackend}; fn sample_registration(version: &str) -> WorkflowRegistration { WorkflowRegistration { diff --git a/crates/waymark/src/backends/postgres/scheduler.rs b/crates/lib/backend-postgres/src/scheduler.rs similarity index 97% rename from crates/waymark/src/backends/postgres/scheduler.rs rename to crates/lib/backend-postgres/src/scheduler.rs index 5eb00735..e47f2114 100644 --- a/crates/waymark/src/backends/postgres/scheduler.rs +++ b/crates/lib/backend-postgres/src/scheduler.rs @@ -1,15 +1,14 @@ use chrono::{DateTime, Utc}; use sqlx::Row; -use tonic::async_trait; use uuid::Uuid; +use waymark_backends_core::{BackendError, BackendResult}; +use waymark_scheduler_backend::SchedulerBackend; -use super::PostgresBackend; -use crate::backends::base::{BackendError, BackendResult, SchedulerBackend}; -use crate::scheduler::compute_next_run; -use crate::scheduler::{CreateScheduleParams, ScheduleId, ScheduleType, WorkflowSchedule}; +use waymark_scheduler_core::compute_next_run; +use waymark_scheduler_core::{CreateScheduleParams, ScheduleId, ScheduleType, WorkflowSchedule}; -#[async_trait] -impl SchedulerBackend for PostgresBackend { +#[async_trait::async_trait] +impl SchedulerBackend for crate::PostgresBackend { async fn upsert_schedule(&self, params: &CreateScheduleParams) -> BackendResult { let next_run_at = compute_next_run( params.schedule_type, @@ -307,10 +306,12 @@ mod tests { use sqlx::Row; use uuid::Uuid; + use crate::PostgresBackend; + use super::super::test_helpers::setup_backend; use super::*; - use crate::backends::SchedulerBackend; - use crate::scheduler::CreateScheduleParams; + use waymark_scheduler_backend::SchedulerBackend; + use waymark_scheduler_core::CreateScheduleParams; fn sample_params(schedule_name: &str) -> CreateScheduleParams { CreateScheduleParams { diff --git a/crates/waymark/src/backends/postgres/test_helpers.rs b/crates/lib/backend-postgres/src/test_helpers.rs similarity index 93% rename from crates/waymark/src/backends/postgres/test_helpers.rs rename to crates/lib/backend-postgres/src/test_helpers.rs index dd03cd7f..addb1ad4 100644 --- a/crates/waymark/src/backends/postgres/test_helpers.rs +++ b/crates/lib/backend-postgres/src/test_helpers.rs @@ -1,7 +1,7 @@ use sqlx::PgPool; use super::PostgresBackend; -use crate::test_support::postgres_setup; +use waymark_test_support::postgres_setup; pub(super) async fn setup_backend() -> PostgresBackend { let pool = postgres_setup().await; diff --git a/crates/waymark/src/backends/postgres/webapp.rs b/crates/lib/backend-postgres/src/webapp.rs similarity index 98% rename from crates/waymark/src/backends/postgres/webapp.rs rename to crates/lib/backend-postgres/src/webapp.rs index 8f6b932f..e3f50ced 100644 --- a/crates/waymark/src/backends/postgres/webapp.rs +++ b/crates/lib/backend-postgres/src/webapp.rs @@ -4,20 +4,22 @@ use chrono::{DateTime, Utc}; use prost::Message; use serde_json::Value; use sqlx::{Postgres, QueryBuilder, Row}; -use tonic::async_trait; + use uuid::Uuid; -use super::PostgresBackend; -use crate::backends::base::{BackendError, BackendResult, GraphUpdate, WebappBackend}; -use crate::messages::ast as ir; -use crate::waymark_core::runner::state::{ActionCallSpec, ExecutionNode, NodeStatus}; -use crate::waymark_core::runner::{RunnerState, ValueExpr, format_value, replay_action_kwargs}; -use crate::webapp::{ +use waymark_backends_core::{BackendError, BackendResult}; +use waymark_core_backend::GraphUpdate; +use waymark_dag::{DAGNode, EdgeType, convert_to_dag}; +use waymark_proto::ast as ir; +use waymark_runner::replay_action_kwargs; +use waymark_runner_state::{ + ActionCallSpec, ExecutionNode, NodeStatus, RunnerState, format_value, value_visitor::ValueExpr, +}; +use waymark_webapp_core::{ ExecutionEdgeView, ExecutionGraphView, ExecutionNodeView, InstanceDetail, InstanceStatus, InstanceSummary, ScheduleDetail, ScheduleInvocationSummary, ScheduleSummary, TimelineEntry, WorkerActionRow, WorkerAggregateStats, WorkerStatus, }; -use waymark_dag::{DAGNode, EdgeType, convert_to_dag}; const INSTANCE_STATUS_FALLBACK_SQL: &str = r#" CASE @@ -263,8 +265,8 @@ fn parse_instance_status(status: &str) -> Option { } } -#[async_trait] -impl WebappBackend for PostgresBackend { +#[async_trait::async_trait] +impl waymark_webapp_backend::WebappBackend for crate::PostgresBackend { async fn count_instances(&self, search: Option<&str>) -> BackendResult { let mut builder: QueryBuilder = QueryBuilder::new( r#" @@ -1407,20 +1409,23 @@ mod tests { use prost::Message; use serial_test::serial; use uuid::Uuid; + use waymark_scheduler_backend::SchedulerBackend; + use waymark_webapp_backend::WebappBackend; + use waymark_worker_status_backend::{WorkerStatusBackend, WorkerStatusUpdate}; + use waymark_workflow_registry_backend::{WorkflowRegistration, WorkflowRegistryBackend}; + + use crate::PostgresBackend; use super::super::test_helpers::setup_backend; use super::*; - use crate::backends::{ - SchedulerBackend, WebappBackend, WorkerStatusBackend, WorkerStatusUpdate, - WorkflowRegistration, WorkflowRegistryBackend, - }; - use crate::scheduler::{CreateScheduleParams, ScheduleType}; - use crate::waymark_core::ir_parser::parse_program; - use crate::waymark_core::runner::ValueExpr; - use crate::waymark_core::runner::state::{ + + use waymark_dag::EdgeType; + use waymark_ir_parser::parse_program; + use waymark_runner_state::{ ActionCallSpec, ExecutionEdge, ExecutionNode, LiteralValue, NodeStatus, + value_visitor::ValueExpr, }; - use waymark_dag::EdgeType; + use waymark_scheduler_core::{CreateScheduleParams, ScheduleType}; #[test] fn format_extracted_inputs_happy_path() { diff --git a/crates/lib/backends-core/Cargo.toml b/crates/lib/backends-core/Cargo.toml new file mode 100644 index 00000000..194062c0 --- /dev/null +++ b/crates/lib/backends-core/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "waymark-backends-core" +version = "0.1.0" +edition = "2024" + +[dependencies] +serde_json = { workspace = true } +thiserror = { workspace = true } +sqlx = { workspace = true, optional = true } + +[features] +default = ["sqlx-error"] + +# TODO: this has to abstracted away since not all backends will use sqlx. +sqlx-error = ["dep:sqlx"] diff --git a/crates/lib/backends-core/src/lib.rs b/crates/lib/backends-core/src/lib.rs new file mode 100644 index 00000000..ee49d385 --- /dev/null +++ b/crates/lib/backends-core/src/lib.rs @@ -0,0 +1,36 @@ +//! Core primitives for various waymark subsystem backends. + +/// The common backend error. +/// +/// TODO: move away from a shared notion of backend error to use concrete error +/// type per-operation (rather than per-subsystem or per-crate). +#[derive(Debug, thiserror::Error)] +pub enum BackendError { + #[error("{0}")] + Message(String), + + #[error(transparent)] + Inner(Inner), + + #[error(transparent)] + Serialization(serde_json::Error), +} + +#[cfg(feature = "sqlx-error")] +pub type InnerError = sqlx::Error; + +#[cfg(not(feature = "sqlx-error"))] +pub type InnerError = (); + +/// Utility type alias for backend results. +/// +/// TODO: move away from the single-`Result` type aliases as we want to vary +/// rrors per-call. +pub type BackendResult = Result>; + +#[cfg(feature = "sqlx-error")] +impl From for BackendError { + fn from(value: sqlx::Error) -> Self { + Self::Inner(value) + } +} diff --git a/crates/lib/core-backend/Cargo.toml b/crates/lib/core-backend/Cargo.toml new file mode 100644 index 00000000..da2aa394 --- /dev/null +++ b/crates/lib/core-backend/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "waymark-core-backend" +version = "0.1.0" +edition = "2024" + +[dependencies] +async-trait = { workspace = true } +uuid = { workspace = true } +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true } +chrono = { workspace = true } +waymark-runner-state = { workspace = true } +waymark-dag = { workspace = true } +waymark-backends-core = { workspace = true } diff --git a/crates/lib/core-backend/src/data.rs b/crates/lib/core-backend/src/data.rs new file mode 100644 index 00000000..d9320e71 --- /dev/null +++ b/crates/lib/core-backend/src/data.rs @@ -0,0 +1,150 @@ +// The models that we use for our backends are similar to the ones that we +// have specified in our database/Postgres backend, but not 1:1. It's better for +// us to internally convert within the given backend + +use std::{ + collections::{HashMap, HashSet}, + sync::Arc, +}; + +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; +use waymark_dag::DAG; +use waymark_runner_state::{ExecutionEdge, ExecutionNode, NodeStatus, RunnerState}; + +#[derive(Clone, Debug, Serialize, Deserialize)] +/// Queued instance payload for the run loop. +pub struct QueuedInstance { + pub workflow_version_id: Uuid, + #[serde(default)] + pub schedule_id: Option, + #[serde(skip, default)] + pub dag: Option>, + pub entry_node: Uuid, + pub state: Option, + #[serde( + default = "default_action_results", + deserialize_with = "deserialize_action_results" + )] + pub action_results: HashMap, + #[serde(default = "default_instance_id")] + pub instance_id: Uuid, + #[serde(default)] + pub scheduled_at: Option>, +} + +#[derive(Clone, Debug)] +/// Result payload for queued instance polling. +pub struct QueuedInstanceBatch { + pub instances: Vec, +} + +#[derive(Clone, Debug)] +/// Lock claim settings for owned instances. +pub struct LockClaim { + pub lock_uuid: Uuid, + pub lock_expires_at: DateTime, +} + +#[derive(Clone, Debug)] +/// Current lock status for an instance. +pub struct InstanceLockStatus { + pub instance_id: Uuid, + pub lock_uuid: Option, + pub lock_expires_at: Option>, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +/// Completed instance payload with result or exception. +pub struct InstanceDone { + pub executor_id: Uuid, + pub entry_node: Uuid, + pub result: Option, + pub error: Option, +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +/// Batch payload representing an updated execution graph snapshot. +/// +/// This intentionally stores only runtime nodes and edges (no DAG template or +/// derived caches) so persistence stays lightweight. +pub struct GraphUpdate { + pub instance_id: Uuid, + pub nodes: HashMap, + pub edges: HashSet, +} + +impl GraphUpdate { + pub fn from_state(instance_id: Uuid, state: &RunnerState) -> Self { + Self { + instance_id, + nodes: state.nodes.clone(), + edges: state.edges.clone(), + } + } + + pub fn next_scheduled_at(&self) -> DateTime { + let mut next: Option> = None; + for node in self.nodes.values() { + if matches!(node.status, NodeStatus::Completed | NodeStatus::Failed) { + continue; + } + if let Some(scheduled_at) = node.scheduled_at { + next = Some(match next { + Some(existing) => existing.min(scheduled_at), + None => scheduled_at, + }); + } + } + next.unwrap_or_else(Utc::now) + } +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +/// Batch payload representing a finished action attempt (success or failure). +pub struct ActionDone { + pub execution_id: Uuid, + pub attempt: i32, + pub status: ActionAttemptStatus, + pub started_at: Option>, + pub completed_at: Option>, + pub duration_ms: Option, + pub result: serde_json::Value, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ActionAttemptStatus { + Completed, + Failed, + TimedOut, +} + +impl std::fmt::Display for ActionAttemptStatus { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Completed => write!(f, "completed"), + Self::Failed => write!(f, "failed"), + Self::TimedOut => write!(f, "timed_out"), + } + } +} + +fn default_instance_id() -> Uuid { + Uuid::new_v4() +} + +fn default_action_results() -> HashMap { + HashMap::new() +} + +fn deserialize_action_results<'de, D>( + deserializer: D, +) -> Result, D::Error> +where + D: serde::Deserializer<'de>, +{ + let value = Option::>::deserialize(deserializer)?; + Ok(value.unwrap_or_default()) +} diff --git a/crates/lib/core-backend/src/lib.rs b/crates/lib/core-backend/src/lib.rs new file mode 100644 index 00000000..5e876b29 --- /dev/null +++ b/crates/lib/core-backend/src/lib.rs @@ -0,0 +1,58 @@ +//! Core backend traits for waymark. + +mod data; + +use uuid::Uuid; + +use waymark_backends_core::BackendResult; + +pub use self::data::*; + +/// Abstract persistence backend for runner state. +#[async_trait::async_trait] +pub trait CoreBackend: Send + Sync { + fn clone_box(&self) -> Box; + + /// Persist updated execution graphs. + async fn save_graphs( + &self, + claim: LockClaim, + graphs: &[GraphUpdate], + ) -> BackendResult>; + + /// Persist finished action attempts (success or failure). + async fn save_actions_done(&self, actions: &[ActionDone]) -> BackendResult<()>; + + /// Return up to size queued instances without blocking. + async fn get_queued_instances( + &self, + size: usize, + claim: LockClaim, + ) -> BackendResult; + + /// Refresh lock expiry for owned instances. + async fn refresh_instance_locks( + &self, + claim: LockClaim, + instance_ids: &[Uuid], + ) -> BackendResult>; + + /// Release instance locks when evicting from memory. + async fn release_instance_locks( + &self, + lock_uuid: Uuid, + instance_ids: &[Uuid], + ) -> BackendResult<()>; + + /// Persist completed workflow instances. + async fn save_instances_done(&self, instances: &[InstanceDone]) -> BackendResult<()>; + + /// Insert queued instances for run-loop consumption. + async fn queue_instances(&self, instances: &[QueuedInstance]) -> BackendResult<()>; +} + +impl Clone for Box { + fn clone(&self) -> Self { + self.clone_box() + } +} diff --git a/crates/dag/Cargo.toml b/crates/lib/dag/Cargo.toml similarity index 89% rename from crates/dag/Cargo.toml rename to crates/lib/dag/Cargo.toml index 54abfd13..a8bda894 100644 --- a/crates/dag/Cargo.toml +++ b/crates/lib/dag/Cargo.toml @@ -11,4 +11,4 @@ uuid = { workspace = true, features = ["serde", "v4"] } waymark-proto = { workspace = true, features = ["serde"] } [dev-dependencies] -waymark = { workspace = true } +waymark-ir-parser = { workspace = true } diff --git a/crates/dag/src/builder/assignments.rs b/crates/lib/dag/src/builder/assignments.rs similarity index 100% rename from crates/dag/src/builder/assignments.rs rename to crates/lib/dag/src/builder/assignments.rs diff --git a/crates/dag/src/builder/conditionals.rs b/crates/lib/dag/src/builder/conditionals.rs similarity index 100% rename from crates/dag/src/builder/conditionals.rs rename to crates/lib/dag/src/builder/conditionals.rs diff --git a/crates/dag/src/builder/converter.rs b/crates/lib/dag/src/builder/converter.rs similarity index 100% rename from crates/dag/src/builder/converter.rs rename to crates/lib/dag/src/builder/converter.rs diff --git a/crates/dag/src/builder/data_flow.rs b/crates/lib/dag/src/builder/data_flow.rs similarity index 100% rename from crates/dag/src/builder/data_flow.rs rename to crates/lib/dag/src/builder/data_flow.rs diff --git a/crates/dag/src/builder/exceptions.rs b/crates/lib/dag/src/builder/exceptions.rs similarity index 100% rename from crates/dag/src/builder/exceptions.rs rename to crates/lib/dag/src/builder/exceptions.rs diff --git a/crates/dag/src/builder/expansion.rs b/crates/lib/dag/src/builder/expansion.rs similarity index 100% rename from crates/dag/src/builder/expansion.rs rename to crates/lib/dag/src/builder/expansion.rs diff --git a/crates/dag/src/builder/loops.rs b/crates/lib/dag/src/builder/loops.rs similarity index 100% rename from crates/dag/src/builder/loops.rs rename to crates/lib/dag/src/builder/loops.rs diff --git a/crates/dag/src/builder/mod.rs b/crates/lib/dag/src/builder/mod.rs similarity index 100% rename from crates/dag/src/builder/mod.rs rename to crates/lib/dag/src/builder/mod.rs diff --git a/crates/dag/src/builder/spreads.rs b/crates/lib/dag/src/builder/spreads.rs similarity index 100% rename from crates/dag/src/builder/spreads.rs rename to crates/lib/dag/src/builder/spreads.rs diff --git a/crates/dag/src/builder/test_helpers.rs b/crates/lib/dag/src/builder/test_helpers.rs similarity index 96% rename from crates/dag/src/builder/test_helpers.rs rename to crates/lib/dag/src/builder/test_helpers.rs index ed33ca00..0c9811c1 100644 --- a/crates/dag/src/builder/test_helpers.rs +++ b/crates/lib/dag/src/builder/test_helpers.rs @@ -1,5 +1,5 @@ use crate::{DAG, DAGConverter, convert_to_dag}; -use waymark::waymark_core::ir_parser::parse_program; +use waymark_ir_parser::parse_program; use waymark_proto::ast as ir; pub(super) fn dedent(source: &str) -> String { diff --git a/crates/dag/src/builder/utils.rs b/crates/lib/dag/src/builder/utils.rs similarity index 100% rename from crates/dag/src/builder/utils.rs rename to crates/lib/dag/src/builder/utils.rs diff --git a/crates/dag/src/lib.rs b/crates/lib/dag/src/lib.rs similarity index 100% rename from crates/dag/src/lib.rs rename to crates/lib/dag/src/lib.rs diff --git a/crates/dag/src/models.rs b/crates/lib/dag/src/models.rs similarity index 100% rename from crates/dag/src/models.rs rename to crates/lib/dag/src/models.rs diff --git a/crates/dag/src/nodes.rs b/crates/lib/dag/src/nodes.rs similarity index 100% rename from crates/dag/src/nodes.rs rename to crates/lib/dag/src/nodes.rs diff --git a/crates/dag/src/validate.rs b/crates/lib/dag/src/validate.rs similarity index 99% rename from crates/dag/src/validate.rs rename to crates/lib/dag/src/validate.rs index b6aa67e1..0e48504e 100644 --- a/crates/dag/src/validate.rs +++ b/crates/lib/dag/src/validate.rs @@ -361,7 +361,7 @@ fn collect_expr_variables(expr: &ir::Expr, vars: &mut HashSet) { mod tests { use super::validate_dag; use crate::convert_to_dag; - use waymark::waymark_core::ir_parser::parse_program; + use waymark_ir_parser::parse_program; #[test] fn validate_dag_rejects_unresolved_variable_reference() { diff --git a/crates/lib/garbage-collector-backend/Cargo.toml b/crates/lib/garbage-collector-backend/Cargo.toml new file mode 100644 index 00000000..e1e4f300 --- /dev/null +++ b/crates/lib/garbage-collector-backend/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "waymark-garbage-collector-backend" +version = "0.1.0" +edition = "2024" + +[dependencies] +async-trait = { workspace = true } +chrono = { workspace = true } +waymark-backends-core = { workspace = true } diff --git a/crates/lib/garbage-collector-backend/src/lib.rs b/crates/lib/garbage-collector-backend/src/lib.rs new file mode 100644 index 00000000..af8badff --- /dev/null +++ b/crates/lib/garbage-collector-backend/src/lib.rs @@ -0,0 +1,20 @@ +use chrono::{DateTime, Utc}; + +use waymark_backends_core::BackendResult; + +#[derive(Clone, Copy, Debug, Default)] +/// Summary of a garbage collection sweep. +pub struct GarbageCollectionResult { + pub deleted_instances: usize, + pub deleted_actions: usize, +} + +/// Backend capability for deleting old finished workflow data. +#[async_trait::async_trait] +pub trait GarbageCollectorBackend: Send + Sync { + async fn collect_done_instances( + &self, + older_than: DateTime, + limit: usize, + ) -> BackendResult; +} diff --git a/crates/lib/integration-support/Cargo.toml b/crates/lib/integration-support/Cargo.toml new file mode 100644 index 00000000..1e24644c --- /dev/null +++ b/crates/lib/integration-support/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "waymark-integration-support" +version = "0.1.0" +edition = "2024" + +[dependencies] +anyhow = { workspace = true } +sqlx = { workspace = true } +tokio = { workspace = true, features = ["process", "time", "sync"] } +waymark-backend-postgres-migrations = { workspace = true } diff --git a/crates/waymark/src/integration_support/mod.rs b/crates/lib/integration-support/src/lib.rs similarity index 100% rename from crates/waymark/src/integration_support/mod.rs rename to crates/lib/integration-support/src/lib.rs diff --git a/crates/waymark/src/integration_support/postgres.rs b/crates/lib/integration-support/src/postgres.rs similarity index 96% rename from crates/waymark/src/integration_support/postgres.rs rename to crates/lib/integration-support/src/postgres.rs index 5a876f2d..59c9caec 100644 --- a/crates/waymark/src/integration_support/postgres.rs +++ b/crates/lib/integration-support/src/postgres.rs @@ -8,8 +8,6 @@ use sqlx::{PgPool, postgres::PgPoolOptions}; use tokio::process::Command; use tokio::sync::OnceCell; -use crate::db; - pub const LOCAL_POSTGRES_DSN: &str = "postgresql://waymark:waymark@127.0.0.1:5433/waymark"; const READY_TIMEOUT: Duration = Duration::from_secs(45); @@ -41,7 +39,7 @@ pub async fn connect_pool(dsn: &str) -> Result { async fn ensure_local_postgres_impl() -> Result<()> { if let Ok(pool) = connect_pool(LOCAL_POSTGRES_DSN).await { - db::run_migrations(&pool) + waymark_backend_postgres_migrations::run(&pool) .await .context("run migrations for existing local postgres")?; pool.close().await; @@ -50,7 +48,7 @@ async fn ensure_local_postgres_impl() -> Result<()> { run_compose_up().await?; let pool = wait_for_postgres(LOCAL_POSTGRES_DSN).await?; - db::run_migrations(&pool) + waymark_backend_postgres_migrations::run(&pool) .await .context("run migrations for local postgres")?; pool.close().await; diff --git a/crates/lib/ir-parser/Cargo.toml b/crates/lib/ir-parser/Cargo.toml new file mode 100644 index 00000000..fee9b094 --- /dev/null +++ b/crates/lib/ir-parser/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "waymark-ir-parser" +version = "0.1.0" +edition = "2024" + +[dependencies] +waymark-proto = { workspace = true } +regex = { workspace = true } diff --git a/crates/waymark/src/waymark_core/ir_parser.rs b/crates/lib/ir-parser/src/lib.rs similarity index 99% rename from crates/waymark/src/waymark_core/ir_parser.rs rename to crates/lib/ir-parser/src/lib.rs index a3f2bbf7..1af43324 100644 --- a/crates/waymark/src/waymark_core/ir_parser.rs +++ b/crates/lib/ir-parser/src/lib.rs @@ -2,7 +2,7 @@ use std::fmt; -use crate::messages::ast as ir; +use waymark_proto::ast as ir; /// Raised when parsing the IR source representation fails. #[derive(Debug, Clone)] diff --git a/crates/observability-macros/Cargo.toml b/crates/lib/observability-macros/Cargo.toml similarity index 100% rename from crates/observability-macros/Cargo.toml rename to crates/lib/observability-macros/Cargo.toml diff --git a/crates/observability-macros/src/lib.rs b/crates/lib/observability-macros/src/lib.rs similarity index 62% rename from crates/observability-macros/src/lib.rs rename to crates/lib/observability-macros/src/lib.rs index 9fc1df7b..e10c19ec 100644 --- a/crates/observability-macros/src/lib.rs +++ b/crates/lib/observability-macros/src/lib.rs @@ -6,10 +6,10 @@ use syn::{ItemFn, parse_macro_input}; pub fn obs(args: TokenStream, input: TokenStream) -> TokenStream { let mut item = parse_macro_input!(input as ItemFn); let attr = if args.is_empty() { - syn::parse_quote!(#[cfg_attr(feature = "trace", tracing::instrument(skip_all))]) + syn::parse_quote!(#[cfg_attr(feature = "trace", ::waymark_observability::__inner::tracing::instrument(skip_all))]) } else { let args = proc_macro2::TokenStream::from(args); - syn::parse_quote!(#[cfg_attr(feature = "trace", tracing::instrument(#args))]) + syn::parse_quote!(#[cfg_attr(feature = "trace", ::waymark_observability::__inner::tracing::instrument(#args))]) }; item.attrs.push(attr); TokenStream::from(quote!(#item)) diff --git a/crates/lib/observability/Cargo.toml b/crates/lib/observability/Cargo.toml new file mode 100644 index 00000000..bc27b66c --- /dev/null +++ b/crates/lib/observability/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "waymark-observability" +version = "0.1.0" +edition = "2024" + +[dependencies] +waymark-observability-macros = { workspace = true } +tracing = { workspace = true } diff --git a/crates/lib/observability/src/lib.rs b/crates/lib/observability/src/lib.rs new file mode 100644 index 00000000..d2fa50f6 --- /dev/null +++ b/crates/lib/observability/src/lib.rs @@ -0,0 +1,8 @@ +pub use waymark_observability_macros::obs; + +#[doc(hidden)] +pub mod __inner { + pub mod tracing { + pub use tracing::instrument; + } +} diff --git a/crates/proto/Cargo.toml b/crates/lib/proto/Cargo.toml similarity index 88% rename from crates/proto/Cargo.toml rename to crates/lib/proto/Cargo.toml index 277e7df3..79ea21bf 100644 --- a/crates/proto/Cargo.toml +++ b/crates/lib/proto/Cargo.toml @@ -3,6 +3,9 @@ name = "waymark-proto" version = "0.1.0" edition = "2024" +[package.metadata.cargo-shear] +ignored = ["prost"] + [dependencies] prost = "0.12" prost-types = "0.12" diff --git a/crates/proto/build.rs b/crates/lib/proto/build.rs similarity index 97% rename from crates/proto/build.rs rename to crates/lib/proto/build.rs index 9cbb4369..33a80357 100644 --- a/crates/proto/build.rs +++ b/crates/lib/proto/build.rs @@ -1,4 +1,4 @@ -const PROTO_DIR: &str = "../../proto"; +const PROTO_DIR: &str = "../../../proto"; fn if_feature_enabled( builder: tonic_build::Builder, diff --git a/crates/proto/src/lib.rs b/crates/lib/proto/src/lib.rs similarity index 100% rename from crates/proto/src/lib.rs rename to crates/lib/proto/src/lib.rs diff --git a/crates/lib/runner-state/Cargo.toml b/crates/lib/runner-state/Cargo.toml new file mode 100644 index 00000000..6a64d994 --- /dev/null +++ b/crates/lib/runner-state/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "waymark-runner-state" +version = "0.1.0" +edition = "2024" + +[dependencies] +chrono = { workspace = true, features = ["serde", "clock"] } +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true } +thiserror = { workspace = true } +uuid = { workspace = true } +waymark-dag = { workspace = true } +waymark-proto = { workspace = true } + +[features] +trace = [] diff --git a/crates/lib/runner-state/src/lib.rs b/crates/lib/runner-state/src/lib.rs new file mode 100644 index 00000000..5c7ae36b --- /dev/null +++ b/crates/lib/runner-state/src/lib.rs @@ -0,0 +1,5 @@ +mod state; +mod util; +pub mod value_visitor; + +pub use self::state::*; diff --git a/crates/waymark/src/waymark_core/runner/state.rs b/crates/lib/runner-state/src/state.rs similarity index 99% rename from crates/waymark/src/waymark_core/runner/state.rs rename to crates/lib/runner-state/src/state.rs index dd9e68be..da418624 100644 --- a/crates/waymark/src/waymark_core/runner/state.rs +++ b/crates/lib/runner-state/src/state.rs @@ -8,15 +8,13 @@ use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; use uuid::Uuid; -use crate::messages::ast as ir; -use crate::waymark_core::runner::expression_evaluator::is_truthy; -use crate::waymark_core::runner::value_visitor::{ - ValueExpr, collect_value_sources, resolve_value_tree, -}; +use crate::util::is_truthy; +use crate::value_visitor::{ValueExpr, collect_value_sources, resolve_value_tree}; use waymark_dag::{ ActionCallNode, AggregatorNode, AssignmentNode, DAG, DAGNode, EdgeType, FnCallNode, JoinNode, ReturnNode, SleepNode, }; +use waymark_proto::ast as ir; /// Raised when the runner state cannot be updated safely. #[derive(Debug, thiserror::Error)] @@ -335,11 +333,13 @@ impl RunnerState { state } - pub(crate) fn set_link_queued_nodes(&mut self, value: bool) { + /// TODO: make this `pub(crate)` again + pub fn set_link_queued_nodes(&mut self, value: bool) { self.link_queued_nodes = value; } - pub(crate) fn latest_assignment(&self, name: &str) -> Option { + /// TODO: make this `pub(crate)` again + pub fn latest_assignment(&self, name: &str) -> Option { self.latest_assignments.get(name).copied() } @@ -915,7 +915,9 @@ impl RunnerState { /// Example IR: /// - a, b = @pair() /// This yields ActionResultValue(node_id, result_index=0/1) for a and b. - pub(crate) fn assign_action_results( + /// + /// TODO: make this `pub(crate)` again + pub fn assign_action_results( &mut self, node: &ExecutionNode, action_name: &str, @@ -1038,7 +1040,7 @@ impl RunnerState { /// - xs = [1] /// - ys = xs + [2] /// Materialization turns ys into ListValue([1, 2]) rather than keeping xs. - pub(crate) fn materialize_value(&self, value: ValueExpr) -> ValueExpr { + pub fn materialize_value(&self, value: ValueExpr) -> ValueExpr { let resolved = resolve_value_tree(&value, &|name, seen| { self.resolve_variable_value(name, seen) }); @@ -1111,7 +1113,8 @@ impl RunnerState { assigned } - pub(crate) fn mark_latest_assignments( + /// TODO: make this `pub(crate)` again + pub fn mark_latest_assignments( &mut self, node_id: Uuid, assignments: &HashMap, @@ -1129,7 +1132,9 @@ impl RunnerState { /// Example IR: /// - total = @sum(values) /// A data-flow edge is added from the values assignment node to the action. - pub(crate) fn record_data_flow_from_value(&mut self, node_id: Uuid, value: &ValueExpr) { + /// + /// TODO: make this `pub(crate)` again + pub fn record_data_flow_from_value(&mut self, node_id: Uuid, value: &ValueExpr) { let source_ids = collect_value_sources(value, &|name| self.latest_assignments.get(name).copied()); self.record_data_flow_edges(node_id, &source_ids); @@ -1790,7 +1795,7 @@ fn format_literal(value: &serde_json::Value) -> String { /// /// Example IR: /// - Literal(int_value=3) -> 3 -pub(crate) fn literal_value(lit: &ir::Literal) -> serde_json::Value { +pub fn literal_value(lit: &ir::Literal) -> serde_json::Value { match lit.value.as_ref() { Some(ir::literal::Value::IntValue(value)) => serde_json::Value::Number((*value).into()), Some(ir::literal::Value::FloatValue(value)) => serde_json::Number::from_f64(*value) @@ -1892,8 +1897,8 @@ impl fmt::Display for NodeStatus { #[cfg(test)] mod tests { use super::*; - use crate::messages::ast as ir; use serde_json::Value; + use waymark_proto::ast as ir; fn action_plus_two_expr() -> ir::Expr { ir::Expr { diff --git a/crates/lib/runner-state/src/util.rs b/crates/lib/runner-state/src/util.rs new file mode 100644 index 00000000..20768070 --- /dev/null +++ b/crates/lib/runner-state/src/util.rs @@ -0,0 +1,12 @@ +pub(crate) fn is_truthy(value: &serde_json::Value) -> bool { + match value { + serde_json::Value::Null => false, + serde_json::Value::Bool(value) => *value, + serde_json::Value::Number(number) => { + number.as_f64().map(|value| value != 0.0).unwrap_or(false) + } + serde_json::Value::String(value) => !value.is_empty(), + serde_json::Value::Array(values) => !values.is_empty(), + serde_json::Value::Object(map) => !map.is_empty(), + } +} diff --git a/crates/waymark/src/waymark_core/runner/value_visitor.rs b/crates/lib/runner-state/src/value_visitor.rs similarity index 99% rename from crates/waymark/src/waymark_core/runner/value_visitor.rs rename to crates/lib/runner-state/src/value_visitor.rs index 82f02db1..fbc7736a 100644 --- a/crates/waymark/src/waymark_core/runner/value_visitor.rs +++ b/crates/lib/runner-state/src/value_visitor.rs @@ -362,7 +362,7 @@ mod tests { use uuid::Uuid; use super::*; - use crate::messages::ast as ir; + use waymark_proto::ast as ir; fn literal_int(value: i64) -> ValueExpr { ValueExpr::Literal(LiteralValue { diff --git a/crates/lib/runner/Cargo.toml b/crates/lib/runner/Cargo.toml new file mode 100644 index 00000000..115de256 --- /dev/null +++ b/crates/lib/runner/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "waymark-runner" +version = "0.1.0" +edition = "2024" + +[dependencies] +chrono = { workspace = true, features = ["serde"] } +rustc-hash = { workspace = true } +serde_json = { workspace = true } +thiserror = { workspace = true } +uuid = { workspace = true } +waymark-dag = { workspace = true } +waymark-proto = { workspace = true } +waymark-observability = { workspace = true } +waymark-runner-state = { workspace = true } +waymark-core-backend = { workspace = true } +tracing = { workspace = true } + +[dev-dependencies] +waymark-ir-parser = { workspace = true } +waymark-backend-memory = { workspace = true } + +[features] +trace = [] diff --git a/crates/waymark/src/waymark_core/runner/executor.rs b/crates/lib/runner/src/executor.rs similarity index 98% rename from crates/waymark/src/waymark_core/runner/executor.rs rename to crates/lib/runner/src/executor.rs index e0572688..bae9a9c2 100644 --- a/crates/waymark/src/waymark_core/runner/executor.rs +++ b/crates/lib/runner/src/executor.rs @@ -10,24 +10,20 @@ use rustc_hash::FxHashMap; use serde_json::Value; use uuid::Uuid; -use crate::backends::{ActionAttemptStatus, ActionDone, CoreBackend, GraphUpdate}; -use crate::messages::ast as ir; -use crate::observability::obs; -use crate::waymark_core::runner::expression_evaluator::is_exception_value; -use crate::waymark_core::runner::retry::{ - RetryDecision, RetryPolicyEvaluator, timeout_seconds_from_policies, +use crate::expression_evaluator::is_exception_value; +use crate::retry::{RetryDecision, RetryPolicyEvaluator, timeout_seconds_from_policies}; +use crate::synthetic_exceptions::{SyntheticExceptionType, build_synthetic_exception_value}; +use waymark_core_backend::{ActionAttemptStatus, ActionDone, CoreBackend, GraphUpdate}; +use waymark_dag::{ + ActionCallNode, AggregatorNode, DAG, DAGEdge, DagEdgeIndex, EXCEPTION_SCOPE_VAR, EdgeType, }; -use crate::waymark_core::runner::state::{ +use waymark_observability::obs; +use waymark_proto::ast as ir; +use waymark_runner_state::value_visitor::ValueExpr; +use waymark_runner_state::{ ActionCallSpec, ExecutionEdge, ExecutionNode, ExecutionNodeType, IndexValue, ListValue, LiteralValue, NodeStatus, QueueNodeParams, RunnerState, RunnerStateError, }; -use crate::waymark_core::runner::synthetic_exceptions::{ - SyntheticExceptionType, build_synthetic_exception_value, -}; -use crate::waymark_core::runner::value_visitor::ValueExpr; -use waymark_dag::{ - ActionCallNode, AggregatorNode, DAG, DAGEdge, DagEdgeIndex, EXCEPTION_SCOPE_VAR, EdgeType, -}; /// Raised when the runner executor cannot advance safely. #[derive(Debug, thiserror::Error)] @@ -1505,16 +1501,14 @@ mod tests { use std::collections::{HashMap, HashSet}; use std::sync::Arc; - use crate::backends::MemoryBackend; - use crate::messages::ast as ir; - use crate::waymark_core::ir_parser::parse_program; - use crate::waymark_core::runner::state::{ - ExecutionEdge, ExecutionNode, NodeStatus, RunnerState, - }; + use waymark_backend_memory::MemoryBackend; use waymark_dag::{ ActionCallNode, ActionCallParams, AggregatorNode, AssignmentNode, DAG, DAGEdge, convert_to_dag, }; + use waymark_ir_parser::parse_program; + use waymark_proto::ast as ir; + use waymark_runner_state::{ExecutionEdge, ExecutionNode, NodeStatus, RunnerState}; fn variable(name: &str) -> ir::Expr { ir::Expr { @@ -1992,16 +1986,12 @@ mod tests { rehydrated.state().ready_queue.is_empty() ); - let replay_canonical = crate::waymark_core::runner::replay_variables( - canonical.state(), - canonical.action_results(), - ) - .expect("replay canonical"); - let replay_rehydrated = crate::waymark_core::runner::replay_variables( - rehydrated.state(), - rehydrated.action_results(), - ) - .expect("replay rehydrated"); + let replay_canonical = + crate::replay_variables(canonical.state(), canonical.action_results()) + .expect("replay canonical"); + let replay_rehydrated = + crate::replay_variables(rehydrated.state(), rehydrated.action_results()) + .expect("replay rehydrated"); let mut assignment_counts: HashMap = HashMap::new(); for node in canonical.state().nodes.values() { @@ -2712,21 +2702,15 @@ fn main(input: [], output: [done]): let mut executor = RunnerExecutor::new(dag.clone(), state, action_results, None); executor.increment(&[exec1.node_id]).expect("increment"); - let orig_replay = crate::waymark_core::runner::replay_variables( - executor.state(), - executor.action_results(), - ) - .expect("replay"); + let orig_replay = + crate::replay_variables(executor.state(), executor.action_results()).expect("replay"); let (nodes_snap, edges_snap, results_snap) = snapshot_state(executor.state(), executor.action_results()); let rehydrated = create_rehydrated_executor(&dag, nodes_snap, edges_snap, results_snap); - let rehy_replay = crate::waymark_core::runner::replay_variables( - rehydrated.state(), - rehydrated.action_results(), - ) - .expect("replay"); + let rehy_replay = crate::replay_variables(rehydrated.state(), rehydrated.action_results()) + .expect("replay"); assert_eq!(orig_replay.variables, rehy_replay.variables); assert_eq!( rehy_replay.variables.get("doubled"), diff --git a/crates/waymark/src/waymark_core/runner/expression_evaluator.rs b/crates/lib/runner/src/expression_evaluator.rs similarity index 98% rename from crates/waymark/src/waymark_core/runner/expression_evaluator.rs rename to crates/lib/runner/src/expression_evaluator.rs index 96d908e2..dac989a9 100644 --- a/crates/waymark/src/waymark_core/runner/expression_evaluator.rs +++ b/crates/lib/runner/src/expression_evaluator.rs @@ -5,15 +5,15 @@ use std::rc::Rc; use serde_json::Value; use uuid::Uuid; -use crate::messages::ast as ir; -use crate::observability::obs; -use crate::waymark_core::runner::state::{ +use waymark_dag::{DAGEdge, EdgeType}; +use waymark_observability::obs; +use waymark_proto::ast as ir; +use waymark_runner_state::{ ActionCallSpec, ActionResultValue, BinaryOpValue, DictEntryValue, DictValue, DotValue, FunctionCallValue, IndexValue, ListValue, LiteralValue, UnaryOpValue, VariableValue, literal_value, + value_visitor::{ValueExpr, ValueExprEvaluator}, }; -use crate::waymark_core::runner::value_visitor::{ValueExpr, ValueExprEvaluator}; -use waymark_dag::{DAGEdge, EdgeType}; use super::{RunnerExecutor, RunnerExecutorError}; @@ -674,15 +674,13 @@ mod tests { use uuid::Uuid; use super::*; - use crate::messages::ast as ir; - use crate::waymark_core::ir_parser::IRParser; - use crate::waymark_core::runner::RunnerState; - use crate::waymark_core::runner::state::{ + use waymark_dag::{DAG, DAGEdge}; + use waymark_ir_parser::IRParser; + use waymark_proto::ast as ir; + use waymark_runner_state::{ ActionCallSpec, ActionResultValue, BinaryOpValue, FunctionCallValue, LiteralValue, - VariableValue, + RunnerState, VariableValue, value_visitor::ValueExpr, }; - use crate::waymark_core::runner::value_visitor::ValueExpr; - use waymark_dag::{DAG, DAGEdge}; fn parse_expr(source: &str) -> ir::Expr { IRParser::new(" ") @@ -801,7 +799,7 @@ mod tests { #[test] fn test_evaluate_value_expr_happy_path() { let executor = executor_with_assignment("x", literal_int(3)); - let expr = ValueExpr::BinaryOp(crate::waymark_core::runner::state::BinaryOpValue { + let expr = ValueExpr::BinaryOp(waymark_runner_state::BinaryOpValue { left: Box::new(ValueExpr::Variable(VariableValue { name: "x".to_string(), })), diff --git a/crates/waymark/src/waymark_core/runner/mod.rs b/crates/lib/runner/src/lib.rs similarity index 53% rename from crates/waymark/src/waymark_core/runner/mod.rs rename to crates/lib/runner/src/lib.rs index 4e7a491d..684a49a0 100644 --- a/crates/waymark/src/waymark_core/runner/mod.rs +++ b/crates/lib/runner/src/lib.rs @@ -4,16 +4,11 @@ pub mod executor; pub mod expression_evaluator; pub mod replay; pub(crate) mod retry; -pub mod state; -pub(crate) mod synthetic_exceptions; -pub mod value_visitor; + +/// TODO: make `pub(crate)` +pub mod synthetic_exceptions; pub use executor::{ DurableUpdates, ExecutorStep, RunnerExecutor, RunnerExecutorError, SleepRequest, }; pub use replay::{ReplayError, ReplayResult, replay_action_kwargs, replay_variables}; -pub use state::{ - ActionCallSpec, ActionResultValue, ExecutionEdge, ExecutionNode, NodeStatus, RunnerState, - RunnerStateError, format_value, -}; -pub use value_visitor::ValueExpr; diff --git a/crates/waymark/src/waymark_core/runner/replay.rs b/crates/lib/runner/src/replay.rs similarity index 98% rename from crates/waymark/src/waymark_core/runner/replay.rs rename to crates/lib/runner/src/replay.rs index 246caace..ffb413a1 100644 --- a/crates/waymark/src/waymark_core/runner/replay.rs +++ b/crates/lib/runner/src/replay.rs @@ -7,14 +7,16 @@ use std::rc::Rc; use serde_json::Value; use uuid::Uuid; -use crate::messages::ast as ir; -use crate::waymark_core::runner::expression_evaluator::{ +use crate::expression_evaluator::{ add_values, compare_values, int_value, is_exception_value, is_truthy, len_of_value, numeric_op, range_from_args, value_in, }; -use crate::waymark_core::runner::state::{ActionResultValue, FunctionCallValue, RunnerState}; -use crate::waymark_core::runner::value_visitor::{ValueExpr, ValueExprEvaluator}; use waymark_dag::{EXCEPTION_SCOPE_VAR, EdgeType}; +use waymark_proto::ast as ir; +use waymark_runner_state::{ + ActionResultValue, FunctionCallValue, RunnerState, + value_visitor::{ValueExpr, ValueExprEvaluator}, +}; /// Raised when replay cannot reconstruct variable values. #[derive(Debug, thiserror::Error)] @@ -515,9 +517,8 @@ pub fn replay_action_kwargs( #[cfg(test)] mod tests { use super::*; - use crate::messages::ast as ir; - use crate::waymark_core::runner::state::{RunnerState, VariableValue}; - use crate::waymark_core::runner::value_visitor::ValueExpr; + use waymark_proto::ast as ir; + use waymark_runner_state::{RunnerState, VariableValue, value_visitor::ValueExpr}; fn action_plus_two_expr() -> ir::Expr { ir::Expr { diff --git a/crates/waymark/src/waymark_core/runner/retry.rs b/crates/lib/runner/src/retry.rs similarity index 99% rename from crates/waymark/src/waymark_core/runner/retry.rs rename to crates/lib/runner/src/retry.rs index e6fb4d70..a24f7a2d 100644 --- a/crates/waymark/src/waymark_core/runner/retry.rs +++ b/crates/lib/runner/src/retry.rs @@ -1,6 +1,6 @@ //! Retry/timeout policy helpers shared by runner components. -use crate::messages::ast as ir; +use waymark_proto::ast as ir; #[derive(Clone, Debug)] pub(crate) struct RetryDecision { diff --git a/crates/waymark/src/waymark_core/runner/synthetic_exceptions.rs b/crates/lib/runner/src/synthetic_exceptions.rs similarity index 91% rename from crates/waymark/src/waymark_core/runner/synthetic_exceptions.rs rename to crates/lib/runner/src/synthetic_exceptions.rs index df89b71f..5bd2be0d 100644 --- a/crates/waymark/src/waymark_core/runner/synthetic_exceptions.rs +++ b/crates/lib/runner/src/synthetic_exceptions.rs @@ -3,13 +3,13 @@ use serde_json::Value; #[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub(crate) enum SyntheticExceptionType { +pub enum SyntheticExceptionType { ExecutorResume, ActionTimeout, } impl SyntheticExceptionType { - pub(crate) fn as_type_str(self) -> &'static str { + pub fn as_type_str(self) -> &'static str { match self { Self::ExecutorResume => "ExecutorResume", Self::ActionTimeout => "ActionTimeout", @@ -24,7 +24,7 @@ impl SyntheticExceptionType { } } - pub(crate) fn from_value(value: &Value) -> Option { + pub fn from_value(value: &Value) -> Option { let Value::Object(map) = value else { return None; }; @@ -34,7 +34,7 @@ impl SyntheticExceptionType { } } -pub(crate) fn build_synthetic_exception_value( +pub fn build_synthetic_exception_value( exception_type: SyntheticExceptionType, message: impl Into, fields: Vec<(String, Value)>, diff --git a/crates/lib/scheduler-backend/Cargo.toml b/crates/lib/scheduler-backend/Cargo.toml new file mode 100644 index 00000000..6af1c2bb --- /dev/null +++ b/crates/lib/scheduler-backend/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "waymark-scheduler-backend" +version = "0.1.0" +edition = "2024" + +[dependencies] +async-trait = { workspace = true } +uuid = { workspace = true } +waymark-backends-core = { workspace = true } +waymark-scheduler-core = { workspace = true } diff --git a/crates/lib/scheduler-backend/src/lib.rs b/crates/lib/scheduler-backend/src/lib.rs new file mode 100644 index 00000000..613cc4ac --- /dev/null +++ b/crates/lib/scheduler-backend/src/lib.rs @@ -0,0 +1,29 @@ +use uuid::Uuid; + +pub use waymark_backends_core::{BackendError, BackendResult}; +use waymark_scheduler_core::{CreateScheduleParams, ScheduleId, WorkflowSchedule}; + +/// Backend capability for workflow schedule persistence. +#[async_trait::async_trait] +pub trait SchedulerBackend: Send + Sync { + async fn upsert_schedule(&self, params: &CreateScheduleParams) -> BackendResult; + async fn get_schedule(&self, id: ScheduleId) -> BackendResult; + async fn get_schedule_by_name( + &self, + workflow_name: &str, + schedule_name: &str, + ) -> BackendResult>; + async fn list_schedules(&self, limit: i64, offset: i64) + -> BackendResult>; + async fn count_schedules(&self) -> BackendResult; + async fn update_schedule_status(&self, id: ScheduleId, status: &str) -> BackendResult; + async fn delete_schedule(&self, id: ScheduleId) -> BackendResult; + async fn find_due_schedules(&self, limit: i32) -> BackendResult>; + async fn has_running_instance(&self, schedule_id: ScheduleId) -> BackendResult; + async fn mark_schedule_executed( + &self, + schedule_id: ScheduleId, + instance_id: Uuid, + ) -> BackendResult<()>; + async fn skip_schedule_run(&self, schedule_id: ScheduleId) -> BackendResult<()>; +} diff --git a/crates/lib/scheduler-core/Cargo.toml b/crates/lib/scheduler-core/Cargo.toml new file mode 100644 index 00000000..9659e878 --- /dev/null +++ b/crates/lib/scheduler-core/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "waymark-scheduler-core" +version = "0.1.0" +edition = "2024" + +[dependencies] +uuid = { workspace = true, features = ["serde", "v4"] } +chrono = { workspace = true, features = ["serde"] } +serde = { workspace = true, features = ["derive"] } +cron = { workspace = true } +rand = { workspace = true } + +[dev-dependencies] +chrono = { workspace = true, features = ["clock"] } diff --git a/crates/lib/scheduler-core/src/lib.rs b/crates/lib/scheduler-core/src/lib.rs new file mode 100644 index 00000000..02d2783b --- /dev/null +++ b/crates/lib/scheduler-core/src/lib.rs @@ -0,0 +1,6 @@ +mod types; +mod utils; + +pub use self::types::*; + +pub use self::utils::*; diff --git a/crates/waymark/src/scheduler/types.rs b/crates/lib/scheduler-core/src/types.rs similarity index 100% rename from crates/waymark/src/scheduler/types.rs rename to crates/lib/scheduler-core/src/types.rs diff --git a/crates/waymark/src/scheduler/utils.rs b/crates/lib/scheduler-core/src/utils.rs similarity index 100% rename from crates/waymark/src/scheduler/utils.rs rename to crates/lib/scheduler-core/src/utils.rs diff --git a/crates/lib/test-support/Cargo.toml b/crates/lib/test-support/Cargo.toml new file mode 100644 index 00000000..829d2395 --- /dev/null +++ b/crates/lib/test-support/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "waymark-test-support" +version = "0.1.0" +edition = "2024" + +[dependencies] +sqlx = { workspace = true } +waymark-integration-support = { workspace = true } diff --git a/crates/waymark/src/test_support/mod.rs b/crates/lib/test-support/src/lib.rs similarity index 100% rename from crates/waymark/src/test_support/mod.rs rename to crates/lib/test-support/src/lib.rs diff --git a/crates/waymark/src/test_support/postgres.rs b/crates/lib/test-support/src/postgres.rs similarity index 83% rename from crates/waymark/src/test_support/postgres.rs rename to crates/lib/test-support/src/postgres.rs index 1fb9e50f..e76bf812 100644 --- a/crates/waymark/src/test_support/postgres.rs +++ b/crates/lib/test-support/src/postgres.rs @@ -2,7 +2,7 @@ use sqlx::PgPool; -use crate::integration_support::{LOCAL_POSTGRES_DSN, connect_pool, ensure_local_postgres}; +use waymark_integration_support::{LOCAL_POSTGRES_DSN, connect_pool, ensure_local_postgres}; /// Ensure test Postgres is available and migrated, then return a pooled connection. pub async fn postgres_setup() -> PgPool { diff --git a/crates/lib/webapp-backend/Cargo.toml b/crates/lib/webapp-backend/Cargo.toml new file mode 100644 index 00000000..735810b3 --- /dev/null +++ b/crates/lib/webapp-backend/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "waymark-webapp-backend" +version = "0.1.0" +edition = "2024" + +[dependencies] +async-trait = { workspace = true } +uuid = { workspace = true } +waymark-backends-core = { workspace = true } +waymark-webapp-core = { workspace = true } diff --git a/crates/lib/webapp-backend/src/lib.rs b/crates/lib/webapp-backend/src/lib.rs new file mode 100644 index 00000000..bc8f365c --- /dev/null +++ b/crates/lib/webapp-backend/src/lib.rs @@ -0,0 +1,54 @@ +use uuid::Uuid; +use waymark_backends_core::BackendResult; +use waymark_webapp_core::{ + ExecutionGraphView, InstanceDetail, InstanceSummary, ScheduleDetail, ScheduleInvocationSummary, + ScheduleSummary, TimelineEntry, WorkerActionRow, WorkerAggregateStats, WorkerStatus, +}; + +/// Backend capability for webapp-specific queries. +#[async_trait::async_trait] +pub trait WebappBackend: Send + Sync { + async fn count_instances(&self, search: Option<&str>) -> BackendResult; + async fn list_instances( + &self, + search: Option<&str>, + limit: i64, + offset: i64, + ) -> BackendResult>; + async fn get_instance(&self, instance_id: Uuid) -> BackendResult; + async fn get_execution_graph( + &self, + instance_id: Uuid, + ) -> BackendResult>; + async fn get_workflow_graph( + &self, + instance_id: Uuid, + ) -> BackendResult>; + async fn get_action_results(&self, instance_id: Uuid) -> BackendResult>; + async fn get_distinct_workflows(&self) -> BackendResult>; + async fn get_distinct_statuses(&self) -> BackendResult>; + async fn count_schedules(&self) -> BackendResult; + async fn list_schedules(&self, limit: i64, offset: i64) -> BackendResult>; + async fn get_schedule(&self, schedule_id: Uuid) -> BackendResult; + async fn count_schedule_invocations(&self, schedule_id: Uuid) -> BackendResult; + async fn list_schedule_invocations( + &self, + schedule_id: Uuid, + limit: i64, + offset: i64, + ) -> BackendResult>; + async fn update_schedule_status(&self, schedule_id: Uuid, status: &str) -> BackendResult; + async fn get_distinct_schedule_statuses(&self) -> BackendResult>; + async fn get_distinct_schedule_types(&self) -> BackendResult>; + async fn get_worker_action_stats( + &self, + window_minutes: i64, + ) -> BackendResult>; + async fn get_worker_aggregate_stats( + &self, + window_minutes: i64, + ) -> BackendResult; + async fn worker_status_table_exists(&self) -> bool; + async fn schedules_table_exists(&self) -> bool; + async fn get_worker_statuses(&self, window_minutes: i64) -> BackendResult>; +} diff --git a/crates/lib/webapp-core/Cargo.toml b/crates/lib/webapp-core/Cargo.toml new file mode 100644 index 00000000..2b51dc6d --- /dev/null +++ b/crates/lib/webapp-core/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "waymark-webapp-core" +version = "0.1.0" +edition = "2024" + +[dependencies] +uuid = { workspace = true, features = ["serde"] } +chrono = { workspace = true, features = ["serde"] } +serde = { workspace = true, features = ["derive"] } diff --git a/crates/lib/webapp-core/src/lib.rs b/crates/lib/webapp-core/src/lib.rs new file mode 100644 index 00000000..61a4a453 --- /dev/null +++ b/crates/lib/webapp-core/src/lib.rs @@ -0,0 +1,247 @@ +//! Shared types for the webapp. + +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +/// Instance status. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum InstanceStatus { + Queued, + Running, + Completed, + Failed, +} + +impl std::fmt::Display for InstanceStatus { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Queued => write!(f, "queued"), + Self::Running => write!(f, "running"), + Self::Completed => write!(f, "completed"), + Self::Failed => write!(f, "failed"), + } + } +} + +/// Summary of a workflow instance for listing. +#[derive(Debug, Clone, Serialize)] +pub struct InstanceSummary { + pub id: Uuid, + pub entry_node: Uuid, + pub created_at: DateTime, + pub status: InstanceStatus, + pub workflow_name: Option, + pub input_preview: String, +} + +/// Full details of a workflow instance. +#[derive(Debug, Clone, Serialize)] +pub struct InstanceDetail { + pub id: Uuid, + pub entry_node: Uuid, + pub created_at: DateTime, + pub status: InstanceStatus, + pub workflow_name: Option, + pub input_payload: String, + pub result_payload: String, + pub error_payload: Option, +} + +/// Node in the execution graph for display. +#[derive(Debug, Clone, Serialize)] +pub struct ExecutionNodeView { + pub id: String, + pub node_type: String, + pub label: String, + pub status: String, + pub action_name: Option, + pub module_name: Option, +} + +/// Edge in the execution graph for display. +#[derive(Debug, Clone, Serialize)] +pub struct ExecutionEdgeView { + pub source: String, + pub target: String, + pub edge_type: String, +} + +/// Execution graph data for rendering. +#[derive(Debug, Clone, Serialize)] +pub struct ExecutionGraphView { + pub nodes: Vec, + pub edges: Vec, +} + +/// Timeline entry for an action execution. +#[derive(Debug, Clone, Serialize)] +pub struct TimelineEntry { + pub action_id: String, + pub action_name: String, + pub module_name: Option, + pub status: String, + pub attempt_number: i32, + pub dispatched_at: Option, + pub completed_at: Option, + pub duration_ms: Option, + pub request_preview: String, + pub response_preview: String, + pub error: Option, +} + +/// Action log entry with full details. +#[derive(Debug, Clone, Serialize)] +pub struct ActionLogEntry { + pub action_id: String, + pub action_name: String, + pub module_name: Option, + pub status: String, + pub attempt_number: i32, + pub dispatched_at: Option, + pub completed_at: Option, + pub duration_ms: Option, + pub request: String, + pub response: String, + pub error: Option, +} + +/// Response for the workflow run data API. +#[derive(Debug, Serialize)] +pub struct WorkflowRunDataResponse { + pub nodes: Vec, + pub timeline: Vec, + pub page: i64, + pub per_page: i64, + pub total: i64, + pub has_more: bool, +} + +/// Response for action logs API. +#[derive(Debug, Serialize)] +pub struct ActionLogsResponse { + pub logs: Vec, +} + +/// Filter values response. +#[derive(Debug, Serialize)] +pub struct FilterValuesResponse { + pub values: Vec, +} + +/// Health check response. +#[derive(Debug, Serialize)] +pub struct HealthResponse { + pub status: &'static str, + pub service: &'static str, +} + +/// Export format for a workflow instance. +#[derive(Debug, Serialize)] +pub struct WorkflowInstanceExport { + pub export_version: &'static str, + pub exported_at: String, + pub instance: InstanceExportInfo, + pub nodes: Vec, + pub timeline: Vec, +} + +/// Full worker status for webapp display. +#[derive(Debug, Clone)] +pub struct WorkerStatus { + pub pool_id: Uuid, + pub active_workers: i32, + pub throughput_per_min: f64, + pub actions_per_sec: f64, + pub total_completed: i64, + pub last_action_at: Option>, + pub updated_at: DateTime, + pub median_dequeue_ms: Option, + pub median_handling_ms: Option, + pub dispatch_queue_size: Option, + pub total_in_flight: Option, + pub median_instance_duration_secs: Option, + pub active_instance_count: i32, + pub total_instances_completed: i64, + pub instances_per_sec: f64, + pub instances_per_min: f64, + pub time_series: Option>, +} + +/// Worker action stats row for display. +#[derive(Debug, Clone)] +pub struct WorkerActionRow { + pub pool_id: String, + pub active_workers: i64, + pub actions_per_sec: String, + pub throughput_per_min: i64, + pub total_completed: i64, + pub median_dequeue_ms: Option, + pub median_handling_ms: Option, + pub last_action_at: Option, + pub updated_at: String, +} + +/// Aggregate worker stats for overview cards. +#[derive(Debug, Clone)] +pub struct WorkerAggregateStats { + pub active_worker_count: i64, + pub actions_per_sec: String, + pub total_in_flight: i64, + pub total_queue_depth: i64, +} + +/// Instance info for export. +#[derive(Debug, Serialize)] +pub struct InstanceExportInfo { + pub id: String, + pub status: String, + pub created_at: String, + pub input_payload: String, + pub result_payload: String, +} + +/// Schedule summary for listing. +#[derive(Debug, Clone, Serialize)] +pub struct ScheduleSummary { + pub id: String, + pub workflow_name: String, + pub schedule_name: String, + pub schedule_type: String, + pub cron_expression: Option, + pub interval_seconds: Option, + pub status: String, + pub next_run_at: Option, + pub last_run_at: Option, + pub created_at: String, +} + +/// Full schedule details. +#[derive(Debug, Clone, Serialize)] +pub struct ScheduleDetail { + pub id: String, + pub workflow_name: String, + pub schedule_name: String, + pub schedule_type: String, + pub cron_expression: Option, + pub interval_seconds: Option, + pub jitter_seconds: i64, + pub status: String, + pub next_run_at: Option, + pub last_run_at: Option, + pub last_instance_id: Option, + pub created_at: String, + pub updated_at: String, + pub priority: i32, + pub allow_duplicate: bool, + pub input_payload: Option, +} + +/// Invocation summary row for schedule detail pages. +#[derive(Debug, Clone, Serialize)] +pub struct ScheduleInvocationSummary { + pub id: Uuid, + pub created_at: DateTime, + pub status: InstanceStatus, +} diff --git a/crates/lib/worker-status-backend/Cargo.toml b/crates/lib/worker-status-backend/Cargo.toml new file mode 100644 index 00000000..ff50466a --- /dev/null +++ b/crates/lib/worker-status-backend/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "waymark-worker-status-backend" +version = "0.1.0" +edition = "2024" + +[dependencies] +async-trait = { workspace = true } +uuid = { workspace = true } +chrono = { workspace = true } +waymark-backends-core = { workspace = true } diff --git a/crates/lib/worker-status-backend/src/lib.rs b/crates/lib/worker-status-backend/src/lib.rs new file mode 100644 index 00000000..bc23eb4e --- /dev/null +++ b/crates/lib/worker-status-backend/src/lib.rs @@ -0,0 +1,32 @@ +//! Worker status backend. + +use uuid::Uuid; + +pub use waymark_backends_core::{BackendError, BackendResult}; + +/// Worker status update for persistence. +#[derive(Clone, Debug)] +pub struct WorkerStatusUpdate { + pub pool_id: Uuid, + pub throughput_per_min: f64, + pub total_completed: i64, + pub last_action_at: Option>, + pub median_dequeue_ms: Option, + pub median_handling_ms: Option, + pub dispatch_queue_size: i64, + pub total_in_flight: i64, + pub active_workers: i32, + pub actions_per_sec: f64, + pub median_instance_duration_secs: Option, + pub active_instance_count: i32, + pub total_instances_completed: i64, + pub instances_per_sec: f64, + pub instances_per_min: f64, + pub time_series: Option>, +} + +/// Backend capability for recording worker status metrics. +#[async_trait::async_trait] +pub trait WorkerStatusBackend: Send + Sync { + async fn upsert_worker_status(&self, status: &WorkerStatusUpdate) -> BackendResult<()>; +} diff --git a/crates/lib/workflow-registry-backend/Cargo.toml b/crates/lib/workflow-registry-backend/Cargo.toml new file mode 100644 index 00000000..2dc85a4d --- /dev/null +++ b/crates/lib/workflow-registry-backend/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "waymark-workflow-registry-backend" +version = "0.1.0" +edition = "2024" + +[dependencies] +async-trait = { workspace = true } +uuid = { workspace = true } +waymark-backends-core = { workspace = true } diff --git a/crates/lib/workflow-registry-backend/src/lib.rs b/crates/lib/workflow-registry-backend/src/lib.rs new file mode 100644 index 00000000..041c8482 --- /dev/null +++ b/crates/lib/workflow-registry-backend/src/lib.rs @@ -0,0 +1,35 @@ +use uuid::Uuid; + +pub use waymark_backends_core::{BackendError, BackendResult}; + +/// Registration payload for storing workflow DAG metadata. +#[derive(Clone, Debug)] +pub struct WorkflowRegistration { + pub workflow_name: String, + pub workflow_version: String, + pub ir_hash: String, + pub program_proto: Vec, + pub concurrent: bool, +} + +#[derive(Clone, Debug)] +/// Stored workflow version metadata and IR payload. +pub struct WorkflowVersion { + pub id: Uuid, + pub workflow_name: String, + pub workflow_version: String, + pub ir_hash: String, + pub program_proto: Vec, + pub concurrent: bool, +} + +/// Backend capability for registering workflow DAGs. +#[async_trait::async_trait] +pub trait WorkflowRegistryBackend: Send + Sync { + async fn upsert_workflow_version( + &self, + registration: &WorkflowRegistration, + ) -> BackendResult; + + async fn get_workflow_versions(&self, ids: &[Uuid]) -> BackendResult>; +} diff --git a/crates/waymark/Cargo.toml b/crates/waymark/Cargo.toml index 998ba4f9..91b817bf 100644 --- a/crates/waymark/Cargo.toml +++ b/crates/waymark/Cargo.toml @@ -17,8 +17,25 @@ name = "smoke" path = "src/bin/smoke.rs" [dependencies] -waymark-proto = { workspace = true, features = ["serde", "client", "server"] } +waymark-core-backend = { workspace = true } waymark-dag = { workspace = true } +waymark-ir-parser = { workspace = true } +waymark-observability = { workspace = true } +waymark-proto = { workspace = true, features = ["serde", "client", "server"] } +waymark-runner = { workspace = true } +waymark-runner-state = { workspace = true } +waymark-webapp-backend = { workspace = true } +waymark-webapp-core = { workspace = true } +waymark-garbage-collector-backend = { workspace = true } +waymark-scheduler-backend = { workspace = true } +waymark-scheduler-core = { workspace = true } +waymark-backends-core = { workspace = true } +waymark-integration-support = { workspace = true } +waymark-backend-postgres = { workspace = true } +waymark-backend-postgres-migrations = { workspace = true } +waymark-workflow-registry-backend = { workspace = true } +waymark-worker-status-backend = { workspace = true } +waymark-backend-memory = { workspace = true } anyhow = "1" axum = "0.8" @@ -39,9 +56,10 @@ sqlx = { version = "0.8", default-features = false, features = ["runtime-tokio-r tera = "1" uuid = { version = "1", features = ["serde", "v4"] } dotenvy = "0.15" -thiserror = "1" +thiserror = { workspace = true } tokio = { version = "1", features = ["full"] } tokio-stream = { version = "0.1", features = ["net"] } +tokio-util = "0.7" tonic = { version = "0.11", features = ["transport"] } tonic-health = "0.11" tracing = "0.1" @@ -49,7 +67,6 @@ tracing-subscriber = { version = "0.3", features = ["env-filter"] } tracing-chrome = "0.7" metrics = "0.24" regex = "1" -waymark-observability-macros = { path = "../observability-macros" } console-subscriber = { version = "0.5", optional = true } [features] @@ -57,6 +74,10 @@ trace = [] observability = ["trace", "dep:console-subscriber"] [dev-dependencies] +waymark-backend-fault-injection = { workspace = true } +waymark-backend-memory = { workspace = true } +waymark-test-support = { workspace = true } + serial_test = "2" tower = { version = "0.5", features = ["util"] } http-body-util = "0.1" diff --git a/crates/waymark/src/backends/base.rs b/crates/waymark/src/backends/base.rs deleted file mode 100644 index 92c17a3f..00000000 --- a/crates/waymark/src/backends/base.rs +++ /dev/null @@ -1,366 +0,0 @@ -//! Backend interfaces for persisting runner state and action results. - -use std::collections::{HashMap, HashSet}; -use std::sync::Arc; - -use chrono::{DateTime, Utc}; -use serde::{Deserialize, Deserializer, Serialize}; -use serde_json::Value; -use tonic::async_trait; -use uuid::Uuid; - -use crate::scheduler::{CreateScheduleParams, ScheduleId, WorkflowSchedule}; -use crate::waymark_core::runner::state::{ExecutionEdge, ExecutionNode, NodeStatus, RunnerState}; -use crate::webapp::{ - ExecutionGraphView, InstanceDetail, InstanceSummary, ScheduleDetail, ScheduleInvocationSummary, - ScheduleSummary, TimelineEntry, WorkerActionRow, WorkerAggregateStats, WorkerStatus, -}; -use waymark_dag::DAG; - -#[derive(Debug, thiserror::Error)] -pub enum BackendError { - #[error("{0}")] - Message(String), - #[error(transparent)] - Sqlx(#[from] sqlx::Error), - #[error(transparent)] - Serialization(#[from] serde_json::Error), -} - -pub type BackendResult = Result; - -fn default_instance_id() -> Uuid { - Uuid::new_v4() -} - -fn default_action_results() -> HashMap { - HashMap::new() -} - -fn deserialize_action_results<'de, D>(deserializer: D) -> Result, D::Error> -where - D: Deserializer<'de>, -{ - let value = Option::>::deserialize(deserializer)?; - Ok(value.unwrap_or_default()) -} - -// The models that we use for our backends are similar to the ones that we -// have specified in our database/Postgres backend, but not 1:1. It's better for -// us to internally convert within the given backend - -#[derive(Clone, Debug, Serialize, Deserialize)] -/// Queued instance payload for the run loop. -pub struct QueuedInstance { - pub workflow_version_id: Uuid, - #[serde(default)] - pub schedule_id: Option, - #[serde(skip, default)] - pub dag: Option>, - pub entry_node: Uuid, - pub state: Option, - #[serde( - default = "default_action_results", - deserialize_with = "deserialize_action_results" - )] - pub action_results: HashMap, - #[serde(default = "default_instance_id")] - pub instance_id: Uuid, - #[serde(default)] - pub scheduled_at: Option>, -} - -#[derive(Clone, Debug)] -/// Result payload for queued instance polling. -pub struct QueuedInstanceBatch { - pub instances: Vec, -} - -#[derive(Clone, Debug)] -/// Lock claim settings for owned instances. -pub struct LockClaim { - pub lock_uuid: Uuid, - pub lock_expires_at: DateTime, -} - -#[derive(Clone, Debug)] -/// Current lock status for an instance. -pub struct InstanceLockStatus { - pub instance_id: Uuid, - pub lock_uuid: Option, - pub lock_expires_at: Option>, -} - -#[derive(Clone, Debug, Serialize, Deserialize)] -/// Completed instance payload with result or exception. -pub struct InstanceDone { - pub executor_id: Uuid, - pub entry_node: Uuid, - pub result: Option, - pub error: Option, -} - -#[derive(Clone, Debug, Serialize, Deserialize)] -/// Batch payload representing an updated execution graph snapshot. -/// -/// This intentionally stores only runtime nodes and edges (no DAG template or -/// derived caches) so persistence stays lightweight. -pub struct GraphUpdate { - pub instance_id: Uuid, - pub nodes: HashMap, - pub edges: HashSet, -} - -impl GraphUpdate { - pub fn from_state(instance_id: Uuid, state: &RunnerState) -> Self { - Self { - instance_id, - nodes: state.nodes.clone(), - edges: state.edges.clone(), - } - } - - pub fn next_scheduled_at(&self) -> DateTime { - let mut next: Option> = None; - for node in self.nodes.values() { - if matches!(node.status, NodeStatus::Completed | NodeStatus::Failed) { - continue; - } - if let Some(scheduled_at) = node.scheduled_at { - next = Some(match next { - Some(existing) => existing.min(scheduled_at), - None => scheduled_at, - }); - } - } - next.unwrap_or_else(Utc::now) - } -} - -#[derive(Clone, Debug, Serialize, Deserialize)] -/// Batch payload representing a finished action attempt (success or failure). -pub struct ActionDone { - pub execution_id: Uuid, - pub attempt: i32, - pub status: ActionAttemptStatus, - pub started_at: Option>, - pub completed_at: Option>, - pub duration_ms: Option, - pub result: Value, -} - -#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)] -#[serde(rename_all = "snake_case")] -pub enum ActionAttemptStatus { - Completed, - Failed, - TimedOut, -} - -impl std::fmt::Display for ActionAttemptStatus { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Self::Completed => write!(f, "completed"), - Self::Failed => write!(f, "failed"), - Self::TimedOut => write!(f, "timed_out"), - } - } -} - -/// Worker status update for persistence. -#[derive(Clone, Debug)] -pub struct WorkerStatusUpdate { - pub pool_id: Uuid, - pub throughput_per_min: f64, - pub total_completed: i64, - pub last_action_at: Option>, - pub median_dequeue_ms: Option, - pub median_handling_ms: Option, - pub dispatch_queue_size: i64, - pub total_in_flight: i64, - pub active_workers: i32, - pub actions_per_sec: f64, - pub median_instance_duration_secs: Option, - pub active_instance_count: i32, - pub total_instances_completed: i64, - pub instances_per_sec: f64, - pub instances_per_min: f64, - pub time_series: Option>, -} - -/// Backend capability for recording worker status metrics. -#[async_trait] -pub trait WorkerStatusBackend: Send + Sync { - async fn upsert_worker_status(&self, status: &WorkerStatusUpdate) -> BackendResult<()>; -} - -/// Abstract persistence backend for runner state. -#[async_trait] -pub trait CoreBackend: Send + Sync { - fn clone_box(&self) -> Box; - - /// Persist updated execution graphs. - async fn save_graphs( - &self, - claim: LockClaim, - graphs: &[GraphUpdate], - ) -> BackendResult>; - - /// Persist finished action attempts (success or failure). - async fn save_actions_done(&self, actions: &[ActionDone]) -> BackendResult<()>; - - /// Return up to size queued instances without blocking. - async fn get_queued_instances( - &self, - size: usize, - claim: LockClaim, - ) -> BackendResult; - - /// Refresh lock expiry for owned instances. - async fn refresh_instance_locks( - &self, - claim: LockClaim, - instance_ids: &[Uuid], - ) -> BackendResult>; - - /// Release instance locks when evicting from memory. - async fn release_instance_locks( - &self, - lock_uuid: Uuid, - instance_ids: &[Uuid], - ) -> BackendResult<()>; - - /// Persist completed workflow instances. - async fn save_instances_done(&self, instances: &[InstanceDone]) -> BackendResult<()>; - - /// Insert queued instances for run-loop consumption. - async fn queue_instances(&self, instances: &[QueuedInstance]) -> BackendResult<()>; -} - -/// Registration payload for storing workflow DAG metadata. -#[derive(Clone, Debug)] -pub struct WorkflowRegistration { - pub workflow_name: String, - pub workflow_version: String, - pub ir_hash: String, - pub program_proto: Vec, - pub concurrent: bool, -} - -#[derive(Clone, Debug)] -/// Stored workflow version metadata and IR payload. -pub struct WorkflowVersion { - pub id: Uuid, - pub workflow_name: String, - pub workflow_version: String, - pub ir_hash: String, - pub program_proto: Vec, - pub concurrent: bool, -} - -/// Backend capability for registering workflow DAGs. -#[async_trait] -pub trait WorkflowRegistryBackend: Send + Sync { - async fn upsert_workflow_version( - &self, - registration: &WorkflowRegistration, - ) -> BackendResult; - - async fn get_workflow_versions(&self, ids: &[Uuid]) -> BackendResult>; -} - -/// Backend capability for workflow schedule persistence. -#[async_trait] -pub trait SchedulerBackend: Send + Sync { - async fn upsert_schedule(&self, params: &CreateScheduleParams) -> BackendResult; - async fn get_schedule(&self, id: ScheduleId) -> BackendResult; - async fn get_schedule_by_name( - &self, - workflow_name: &str, - schedule_name: &str, - ) -> BackendResult>; - async fn list_schedules(&self, limit: i64, offset: i64) - -> BackendResult>; - async fn count_schedules(&self) -> BackendResult; - async fn update_schedule_status(&self, id: ScheduleId, status: &str) -> BackendResult; - async fn delete_schedule(&self, id: ScheduleId) -> BackendResult; - async fn find_due_schedules(&self, limit: i32) -> BackendResult>; - async fn has_running_instance(&self, schedule_id: ScheduleId) -> BackendResult; - async fn mark_schedule_executed( - &self, - schedule_id: ScheduleId, - instance_id: Uuid, - ) -> BackendResult<()>; - async fn skip_schedule_run(&self, schedule_id: ScheduleId) -> BackendResult<()>; -} - -#[derive(Clone, Copy, Debug, Default)] -/// Summary of a garbage collection sweep. -pub struct GarbageCollectionResult { - pub deleted_instances: usize, - pub deleted_actions: usize, -} - -/// Backend capability for deleting old finished workflow data. -#[async_trait] -pub trait GarbageCollectorBackend: Send + Sync { - async fn collect_done_instances( - &self, - older_than: DateTime, - limit: usize, - ) -> BackendResult; -} - -/// Backend capability for webapp-specific queries. -#[async_trait] -pub trait WebappBackend: Send + Sync { - async fn count_instances(&self, search: Option<&str>) -> BackendResult; - async fn list_instances( - &self, - search: Option<&str>, - limit: i64, - offset: i64, - ) -> BackendResult>; - async fn get_instance(&self, instance_id: Uuid) -> BackendResult; - async fn get_execution_graph( - &self, - instance_id: Uuid, - ) -> BackendResult>; - async fn get_workflow_graph( - &self, - instance_id: Uuid, - ) -> BackendResult>; - async fn get_action_results(&self, instance_id: Uuid) -> BackendResult>; - async fn get_distinct_workflows(&self) -> BackendResult>; - async fn get_distinct_statuses(&self) -> BackendResult>; - async fn count_schedules(&self) -> BackendResult; - async fn list_schedules(&self, limit: i64, offset: i64) -> BackendResult>; - async fn get_schedule(&self, schedule_id: Uuid) -> BackendResult; - async fn count_schedule_invocations(&self, schedule_id: Uuid) -> BackendResult; - async fn list_schedule_invocations( - &self, - schedule_id: Uuid, - limit: i64, - offset: i64, - ) -> BackendResult>; - async fn update_schedule_status(&self, schedule_id: Uuid, status: &str) -> BackendResult; - async fn get_distinct_schedule_statuses(&self) -> BackendResult>; - async fn get_distinct_schedule_types(&self) -> BackendResult>; - async fn get_worker_action_stats( - &self, - window_minutes: i64, - ) -> BackendResult>; - async fn get_worker_aggregate_stats( - &self, - window_minutes: i64, - ) -> BackendResult; - async fn worker_status_table_exists(&self) -> bool; - async fn schedules_table_exists(&self) -> bool; - async fn get_worker_statuses(&self, window_minutes: i64) -> BackendResult>; -} - -impl Clone for Box { - fn clone(&self) -> Self { - self.clone_box() - } -} diff --git a/crates/waymark/src/backends/memory.rs b/crates/waymark/src/backends/memory.rs deleted file mode 100644 index c49bc6e0..00000000 --- a/crates/waymark/src/backends/memory.rs +++ /dev/null @@ -1,814 +0,0 @@ -//! In-memory backend that prints persistence operations. - -use std::collections::{HashMap, VecDeque}; -use std::sync::{Arc, Mutex}; - -use chrono::{DateTime, Utc}; -use uuid::Uuid; - -use super::base::{ - ActionDone, BackendError, BackendResult, CoreBackend, GarbageCollectionResult, - GarbageCollectorBackend, GraphUpdate, InstanceDone, InstanceLockStatus, LockClaim, - QueuedInstance, QueuedInstanceBatch, SchedulerBackend, WebappBackend, WorkerStatusBackend, - WorkerStatusUpdate, WorkflowRegistration, WorkflowRegistryBackend, WorkflowVersion, -}; -use crate::scheduler::compute_next_run; -use crate::scheduler::{CreateScheduleParams, ScheduleId, ScheduleType, WorkflowSchedule}; -use crate::webapp::{ - ExecutionGraphView, InstanceDetail, InstanceStatus, InstanceSummary, ScheduleDetail, - ScheduleInvocationSummary, ScheduleSummary, TimelineEntry, WorkerActionRow, - WorkerAggregateStats, WorkerStatus, -}; -use tonic::async_trait; - -type WorkflowVersionKey = (String, String); -type WorkflowVersionValue = (Uuid, WorkflowRegistration); -type WorkflowVersionStore = HashMap; -type InstanceLockStore = HashMap, Option>)>; - -/// Backend that stores updates in memory for tests or local runs. -#[derive(Clone)] -pub struct MemoryBackend { - instance_queue: Option>>>, - graph_updates: Arc>>, - actions_done: Arc>>, - instances_done: Arc>>, - worker_status_updates: Arc>>, - workflow_versions: Arc>, - schedules: Arc>>, - instance_locks: Arc>, -} - -impl Default for MemoryBackend { - fn default() -> Self { - Self { - instance_queue: None, - graph_updates: Arc::new(Mutex::new(Vec::new())), - actions_done: Arc::new(Mutex::new(Vec::new())), - instances_done: Arc::new(Mutex::new(Vec::new())), - worker_status_updates: Arc::new(Mutex::new(Vec::new())), - workflow_versions: Arc::new(Mutex::new(HashMap::new())), - schedules: Arc::new(Mutex::new(HashMap::new())), - instance_locks: Arc::new(Mutex::new(HashMap::new())), - } - } -} - -impl MemoryBackend { - pub fn new() -> Self { - Self::default() - } - - pub fn with_queue(queue: Arc>>) -> Self { - Self { - instance_queue: Some(queue), - ..Self::default() - } - } - - pub fn instance_queue(&self) -> Option>>> { - self.instance_queue.clone() - } - - pub fn graph_updates(&self) -> Vec { - self.graph_updates - .lock() - .expect("graph updates poisoned") - .clone() - } - - pub fn actions_done(&self) -> Vec { - self.actions_done - .lock() - .expect("actions done poisoned") - .clone() - } - - pub fn instances_done(&self) -> Vec { - self.instances_done - .lock() - .expect("instances done poisoned") - .clone() - } - - pub fn worker_status_updates(&self) -> Vec { - self.worker_status_updates - .lock() - .expect("worker status updates poisoned") - .clone() - } -} - -#[async_trait] -impl CoreBackend for MemoryBackend { - fn clone_box(&self) -> Box { - Box::new(self.clone()) - } - - async fn save_graphs( - &self, - claim: LockClaim, - graphs: &[GraphUpdate], - ) -> BackendResult> { - let mut stored = self.graph_updates.lock().expect("graph updates poisoned"); - stored.extend(graphs.iter().cloned()); - let mut guard = self.instance_locks.lock().expect("instance locks poisoned"); - let mut locks = Vec::with_capacity(graphs.len()); - for graph in graphs { - if let Some((Some(lock_uuid), lock_expires_at)) = guard.get_mut(&graph.instance_id) - && *lock_uuid == claim.lock_uuid - && lock_expires_at.is_none_or(|expires_at| expires_at < claim.lock_expires_at) - { - *lock_expires_at = Some(claim.lock_expires_at); - } - let (lock_uuid, lock_expires_at) = guard - .get(&graph.instance_id) - .cloned() - .unwrap_or((None, None)); - locks.push(InstanceLockStatus { - instance_id: graph.instance_id, - lock_uuid, - lock_expires_at, - }); - } - Ok(locks) - } - - async fn save_actions_done(&self, actions: &[ActionDone]) -> BackendResult<()> { - let mut stored = self.actions_done.lock().expect("actions done poisoned"); - stored.extend(actions.iter().cloned()); - Ok(()) - } - - async fn save_instances_done(&self, instances: &[InstanceDone]) -> BackendResult<()> { - let mut stored = self.instances_done.lock().expect("instances done poisoned"); - stored.extend(instances.iter().cloned()); - if !instances.is_empty() { - let mut locks = self.instance_locks.lock().expect("instance locks poisoned"); - for instance in instances { - locks.remove(&instance.executor_id); - } - } - Ok(()) - } - - async fn get_queued_instances( - &self, - size: usize, - claim: LockClaim, - ) -> BackendResult { - if size == 0 { - return Ok(QueuedInstanceBatch { - instances: Vec::new(), - }); - } - let queue = match &self.instance_queue { - Some(queue) => queue, - None => { - return Ok(QueuedInstanceBatch { - instances: Vec::new(), - }); - } - }; - let mut guard = queue.lock().expect("instance queue poisoned"); - let now = Utc::now(); - let mut instances = Vec::new(); - while instances.len() < size { - let Some(instance) = guard.front() else { - break; - }; - if let Some(scheduled_at) = instance.scheduled_at - && scheduled_at > now - { - break; - } - let instance = guard.pop_front().expect("instance queue empty"); - instances.push(instance); - } - if !instances.is_empty() { - let mut locks = self.instance_locks.lock().expect("instance locks poisoned"); - for instance in &instances { - locks.insert( - instance.instance_id, - (Some(claim.lock_uuid), Some(claim.lock_expires_at)), - ); - } - } - Ok(QueuedInstanceBatch { instances }) - } - - async fn queue_instances(&self, instances: &[QueuedInstance]) -> BackendResult<()> { - if instances.is_empty() { - return Ok(()); - } - let queue = self.instance_queue.as_ref().ok_or_else(|| { - BackendError::Message("memory backend missing instance queue".to_string()) - })?; - let mut guard = queue.lock().expect("instance queue poisoned"); - for instance in instances { - guard.push_back(instance.clone()); - } - Ok(()) - } - - async fn refresh_instance_locks( - &self, - claim: LockClaim, - instance_ids: &[Uuid], - ) -> BackendResult> { - let mut guard = self.instance_locks.lock().expect("instance locks poisoned"); - let mut locks = Vec::new(); - for instance_id in instance_ids { - let entry = guard - .entry(*instance_id) - .or_insert((Some(claim.lock_uuid), Some(claim.lock_expires_at))); - if entry.0 == Some(claim.lock_uuid) { - entry.1 = Some(claim.lock_expires_at); - } - locks.push(InstanceLockStatus { - instance_id: *instance_id, - lock_uuid: entry.0, - lock_expires_at: entry.1, - }); - } - Ok(locks) - } - - async fn release_instance_locks( - &self, - lock_uuid: Uuid, - instance_ids: &[Uuid], - ) -> BackendResult<()> { - let mut guard = self.instance_locks.lock().expect("instance locks poisoned"); - for instance_id in instance_ids { - if let Some((current_lock, _)) = guard.get(instance_id) - && *current_lock == Some(lock_uuid) - { - guard.remove(instance_id); - } - } - Ok(()) - } -} - -#[async_trait] -impl WorkerStatusBackend for MemoryBackend { - async fn upsert_worker_status(&self, status: &WorkerStatusUpdate) -> BackendResult<()> { - let mut stored = self - .worker_status_updates - .lock() - .expect("worker status updates poisoned"); - stored.push(status.clone()); - Ok(()) - } -} - -#[async_trait] -impl WorkflowRegistryBackend for MemoryBackend { - async fn upsert_workflow_version( - &self, - registration: &WorkflowRegistration, - ) -> BackendResult { - let mut guard = self - .workflow_versions - .lock() - .expect("workflow versions poisoned"); - let key = ( - registration.workflow_name.clone(), - registration.workflow_version.clone(), - ); - if let Some((id, existing)) = guard.get(&key) { - if existing.ir_hash != registration.ir_hash { - return Err(BackendError::Message(format!( - "workflow version already exists with different IR hash: {}@{}", - registration.workflow_name, registration.workflow_version - ))); - } - return Ok(*id); - } - - let id = Uuid::new_v4(); - guard.insert(key, (id, registration.clone())); - Ok(id) - } - - async fn get_workflow_versions(&self, ids: &[Uuid]) -> BackendResult> { - if ids.is_empty() { - return Ok(Vec::new()); - } - let guard = self - .workflow_versions - .lock() - .expect("workflow versions poisoned"); - let mut versions = Vec::new(); - for (id, registration) in guard.values() { - if ids.contains(id) { - versions.push(WorkflowVersion { - id: *id, - workflow_name: registration.workflow_name.clone(), - workflow_version: registration.workflow_version.clone(), - ir_hash: registration.ir_hash.clone(), - program_proto: registration.program_proto.clone(), - concurrent: registration.concurrent, - }); - } - } - Ok(versions) - } -} - -#[async_trait] -impl SchedulerBackend for MemoryBackend { - async fn upsert_schedule(&self, params: &CreateScheduleParams) -> BackendResult { - let mut guard = self.schedules.lock().expect("schedules poisoned"); - let existing_schedule = guard.iter().find_map(|(id, schedule)| { - if schedule.workflow_name == params.workflow_name - && schedule.schedule_name == params.schedule_name - { - Some((*id, schedule.clone())) - } else { - None - } - }); - let schedule_id = existing_schedule - .as_ref() - .map(|(id, _)| *id) - .unwrap_or_else(ScheduleId::new); - let now = Utc::now(); - let next_run_at = match existing_schedule - .as_ref() - .and_then(|(_, schedule)| schedule.next_run_at) - { - Some(next_run_at) => Some(next_run_at), - None => Some( - compute_next_run( - params.schedule_type, - params.cron_expression.as_deref(), - params.interval_seconds, - params.jitter_seconds, - None, - ) - .map_err(BackendError::Message)?, - ), - }; - let schedule = WorkflowSchedule { - id: schedule_id.0, - workflow_name: params.workflow_name.clone(), - schedule_name: params.schedule_name.clone(), - schedule_type: params.schedule_type.as_str().to_string(), - cron_expression: params.cron_expression.clone(), - interval_seconds: params.interval_seconds, - jitter_seconds: params.jitter_seconds, - input_payload: params.input_payload.clone(), - status: "active".to_string(), - next_run_at, - last_run_at: existing_schedule - .as_ref() - .and_then(|(_, schedule)| schedule.last_run_at), - last_instance_id: existing_schedule - .as_ref() - .and_then(|(_, schedule)| schedule.last_instance_id), - created_at: existing_schedule - .as_ref() - .map(|(_, schedule)| schedule.created_at) - .unwrap_or(now), - updated_at: now, - priority: params.priority, - allow_duplicate: params.allow_duplicate, - }; - guard.insert(schedule_id, schedule); - Ok(schedule_id) - } - - async fn get_schedule(&self, id: ScheduleId) -> BackendResult { - let guard = self.schedules.lock().expect("schedules poisoned"); - guard - .get(&id) - .cloned() - .ok_or_else(|| BackendError::Message(format!("schedule not found: {id}"))) - } - - async fn get_schedule_by_name( - &self, - workflow_name: &str, - schedule_name: &str, - ) -> BackendResult> { - let guard = self.schedules.lock().expect("schedules poisoned"); - Ok(guard - .values() - .find(|schedule| { - schedule.workflow_name == workflow_name - && schedule.schedule_name == schedule_name - && schedule.status != "deleted" - }) - .cloned()) - } - - async fn list_schedules( - &self, - limit: i64, - offset: i64, - ) -> BackendResult> { - let guard = self.schedules.lock().expect("schedules poisoned"); - let mut schedules: Vec<_> = guard - .values() - .filter(|schedule| schedule.status != "deleted") - .cloned() - .collect(); - schedules.sort_by(|a, b| { - (&a.workflow_name, &a.schedule_name).cmp(&(&b.workflow_name, &b.schedule_name)) - }); - let start = offset.max(0) as usize; - let end = start.saturating_add(limit.max(0) as usize); - Ok(schedules - .into_iter() - .skip(start) - .take(end - start) - .collect()) - } - - async fn count_schedules(&self) -> BackendResult { - let guard = self.schedules.lock().expect("schedules poisoned"); - Ok(guard - .values() - .filter(|schedule| schedule.status != "deleted") - .count() as i64) - } - - async fn update_schedule_status(&self, id: ScheduleId, status: &str) -> BackendResult { - let mut guard = self.schedules.lock().expect("schedules poisoned"); - if let Some(schedule) = guard.get_mut(&id) { - schedule.status = status.to_string(); - schedule.updated_at = Utc::now(); - Ok(true) - } else { - Ok(false) - } - } - - async fn delete_schedule(&self, id: ScheduleId) -> BackendResult { - SchedulerBackend::update_schedule_status(self, id, "deleted").await - } - - async fn find_due_schedules(&self, limit: i32) -> BackendResult> { - let guard = self.schedules.lock().expect("schedules poisoned"); - let now = Utc::now(); - let mut schedules: Vec<_> = guard - .values() - .filter(|schedule| { - schedule.status == "active" - && schedule - .next_run_at - .map(|next| next <= now) - .unwrap_or(false) - }) - .cloned() - .collect(); - schedules.sort_by_key(|schedule| schedule.next_run_at); - Ok(schedules.into_iter().take(limit as usize).collect()) - } - - async fn has_running_instance(&self, _schedule_id: ScheduleId) -> BackendResult { - Ok(false) - } - - async fn mark_schedule_executed( - &self, - schedule_id: ScheduleId, - instance_id: Uuid, - ) -> BackendResult<()> { - let mut guard = self.schedules.lock().expect("schedules poisoned"); - let schedule = guard - .get_mut(&schedule_id) - .ok_or_else(|| BackendError::Message(format!("schedule not found: {schedule_id}")))?; - let schedule_type = ScheduleType::parse(&schedule.schedule_type) - .ok_or_else(|| BackendError::Message("invalid schedule type".to_string()))?; - let next_run_at = compute_next_run( - schedule_type, - schedule.cron_expression.as_deref(), - schedule.interval_seconds, - schedule.jitter_seconds, - Some(Utc::now()), - ) - .map_err(BackendError::Message)?; - schedule.last_run_at = Some(Utc::now()); - schedule.last_instance_id = Some(instance_id); - schedule.next_run_at = Some(next_run_at); - schedule.updated_at = Utc::now(); - Ok(()) - } - - async fn skip_schedule_run(&self, schedule_id: ScheduleId) -> BackendResult<()> { - let mut guard = self.schedules.lock().expect("schedules poisoned"); - let schedule = guard - .get_mut(&schedule_id) - .ok_or_else(|| BackendError::Message(format!("schedule not found: {schedule_id}")))?; - let schedule_type = ScheduleType::parse(&schedule.schedule_type) - .ok_or_else(|| BackendError::Message("invalid schedule type".to_string()))?; - let next_run_at = compute_next_run( - schedule_type, - schedule.cron_expression.as_deref(), - schedule.interval_seconds, - schedule.jitter_seconds, - Some(Utc::now()), - ) - .map_err(BackendError::Message)?; - schedule.next_run_at = Some(next_run_at); - schedule.updated_at = Utc::now(); - Ok(()) - } -} - -#[async_trait] -impl GarbageCollectorBackend for MemoryBackend { - async fn collect_done_instances( - &self, - _older_than: DateTime, - _limit: usize, - ) -> BackendResult { - Ok(GarbageCollectionResult::default()) - } -} - -#[async_trait] -impl WebappBackend for MemoryBackend { - async fn count_instances(&self, _search: Option<&str>) -> BackendResult { - Ok(0) - } - - async fn list_instances( - &self, - _search: Option<&str>, - _limit: i64, - _offset: i64, - ) -> BackendResult> { - Ok(Vec::new()) - } - - async fn get_instance(&self, instance_id: Uuid) -> BackendResult { - Err(BackendError::Message(format!( - "instance not found: {instance_id}" - ))) - } - - async fn get_execution_graph( - &self, - _instance_id: Uuid, - ) -> BackendResult> { - Ok(None) - } - - async fn get_workflow_graph( - &self, - _instance_id: Uuid, - ) -> BackendResult> { - Ok(None) - } - - async fn get_action_results(&self, _instance_id: Uuid) -> BackendResult> { - Ok(Vec::new()) - } - - async fn get_distinct_workflows(&self) -> BackendResult> { - Ok(Vec::new()) - } - - async fn get_distinct_statuses(&self) -> BackendResult> { - Ok(vec![ - InstanceStatus::Queued.to_string(), - InstanceStatus::Running.to_string(), - InstanceStatus::Completed.to_string(), - InstanceStatus::Failed.to_string(), - ]) - } - - async fn count_schedules(&self) -> BackendResult { - let guard = self.schedules.lock().expect("schedules poisoned"); - Ok(guard - .values() - .filter(|schedule| schedule.status != "deleted") - .count() as i64) - } - - async fn list_schedules(&self, limit: i64, offset: i64) -> BackendResult> { - let guard = self.schedules.lock().expect("schedules poisoned"); - let mut schedules: Vec<_> = guard - .values() - .filter(|schedule| schedule.status != "deleted") - .cloned() - .collect(); - schedules.sort_by(|a, b| { - (&a.workflow_name, &a.schedule_name).cmp(&(&b.workflow_name, &b.schedule_name)) - }); - - let start = offset.max(0) as usize; - let page_limit = limit.max(0) as usize; - Ok(schedules - .into_iter() - .skip(start) - .take(page_limit) - .map(|schedule| ScheduleSummary { - id: schedule.id.to_string(), - workflow_name: schedule.workflow_name, - schedule_name: schedule.schedule_name, - schedule_type: schedule.schedule_type, - cron_expression: schedule.cron_expression, - interval_seconds: schedule.interval_seconds, - status: schedule.status, - next_run_at: schedule.next_run_at.map(|dt| dt.to_rfc3339()), - last_run_at: schedule.last_run_at.map(|dt| dt.to_rfc3339()), - created_at: schedule.created_at.to_rfc3339(), - }) - .collect()) - } - - async fn get_schedule(&self, schedule_id: Uuid) -> BackendResult { - let guard = self.schedules.lock().expect("schedules poisoned"); - let schedule = guard - .values() - .find(|schedule| schedule.id == schedule_id) - .cloned() - .ok_or_else(|| BackendError::Message(format!("schedule not found: {schedule_id}")))?; - - let input_payload = schedule.input_payload.as_ref().and_then(|bytes| { - rmp_serde::from_slice::(bytes) - .ok() - .and_then(|value| serde_json::to_string_pretty(&value).ok()) - }); - - Ok(ScheduleDetail { - id: schedule.id.to_string(), - workflow_name: schedule.workflow_name, - schedule_name: schedule.schedule_name, - schedule_type: schedule.schedule_type, - cron_expression: schedule.cron_expression, - interval_seconds: schedule.interval_seconds, - jitter_seconds: schedule.jitter_seconds, - status: schedule.status, - next_run_at: schedule.next_run_at.map(|dt| dt.to_rfc3339()), - last_run_at: schedule.last_run_at.map(|dt| dt.to_rfc3339()), - last_instance_id: schedule.last_instance_id.map(|id| id.to_string()), - created_at: schedule.created_at.to_rfc3339(), - updated_at: schedule.updated_at.to_rfc3339(), - priority: schedule.priority, - allow_duplicate: schedule.allow_duplicate, - input_payload, - }) - } - - async fn count_schedule_invocations(&self, _schedule_id: Uuid) -> BackendResult { - Ok(0) - } - - async fn list_schedule_invocations( - &self, - _schedule_id: Uuid, - _limit: i64, - _offset: i64, - ) -> BackendResult> { - Ok(Vec::new()) - } - - async fn update_schedule_status(&self, schedule_id: Uuid, status: &str) -> BackendResult { - let mut guard = self.schedules.lock().expect("schedules poisoned"); - let Some(schedule) = guard - .values_mut() - .find(|schedule| schedule.id == schedule_id) - else { - return Ok(false); - }; - schedule.status = status.to_string(); - schedule.updated_at = Utc::now(); - Ok(true) - } - - async fn get_distinct_schedule_statuses(&self) -> BackendResult> { - Ok(vec!["active".to_string(), "paused".to_string()]) - } - - async fn get_distinct_schedule_types(&self) -> BackendResult> { - Ok(vec!["cron".to_string(), "interval".to_string()]) - } - - async fn get_worker_action_stats( - &self, - _window_minutes: i64, - ) -> BackendResult> { - let statuses = latest_worker_statuses( - &self - .worker_status_updates - .lock() - .expect("worker status updates poisoned"), - ); - - Ok(statuses - .into_iter() - .map(|status| WorkerActionRow { - pool_id: status.pool_id.to_string(), - active_workers: status.active_workers as i64, - actions_per_sec: format!("{:.1}", status.actions_per_sec), - throughput_per_min: status.throughput_per_min as i64, - total_completed: status.total_completed, - median_dequeue_ms: status.median_dequeue_ms, - median_handling_ms: status.median_handling_ms, - last_action_at: status.last_action_at.map(|dt| dt.to_rfc3339()), - updated_at: status.updated_at.to_rfc3339(), - }) - .collect()) - } - - async fn get_worker_aggregate_stats( - &self, - _window_minutes: i64, - ) -> BackendResult { - let statuses = latest_worker_statuses( - &self - .worker_status_updates - .lock() - .expect("worker status updates poisoned"), - ); - - let active_worker_count = statuses - .iter() - .map(|status| status.active_workers as i64) - .sum(); - let total_in_flight = statuses - .iter() - .filter_map(|status| status.total_in_flight) - .sum(); - let total_queue_depth = statuses - .iter() - .filter_map(|status| status.dispatch_queue_size) - .sum(); - let actions_per_sec = statuses - .iter() - .map(|status| status.actions_per_sec) - .sum::(); - - Ok(WorkerAggregateStats { - active_worker_count, - actions_per_sec: format!("{:.1}", actions_per_sec), - total_in_flight, - total_queue_depth, - }) - } - - async fn worker_status_table_exists(&self) -> bool { - !self - .worker_status_updates - .lock() - .expect("worker status updates poisoned") - .is_empty() - } - - async fn schedules_table_exists(&self) -> bool { - !self - .schedules - .lock() - .expect("schedules poisoned") - .is_empty() - } - - async fn get_worker_statuses(&self, _window_minutes: i64) -> BackendResult> { - Ok(latest_worker_statuses( - &self - .worker_status_updates - .lock() - .expect("worker status updates poisoned"), - )) - } -} - -fn latest_worker_statuses(updates: &[WorkerStatusUpdate]) -> Vec { - let mut by_pool: HashMap = HashMap::new(); - for update in updates { - by_pool.insert(update.pool_id, update.clone()); - } - - let now = Utc::now(); - let mut statuses: Vec<_> = by_pool - .into_values() - .map(|status| WorkerStatus { - pool_id: status.pool_id, - active_workers: status.active_workers, - throughput_per_min: status.throughput_per_min, - actions_per_sec: status.actions_per_sec, - total_completed: status.total_completed, - last_action_at: status.last_action_at, - updated_at: now, - median_dequeue_ms: status.median_dequeue_ms, - median_handling_ms: status.median_handling_ms, - dispatch_queue_size: Some(status.dispatch_queue_size), - total_in_flight: Some(status.total_in_flight), - median_instance_duration_secs: status.median_instance_duration_secs, - active_instance_count: status.active_instance_count, - total_instances_completed: status.total_instances_completed, - instances_per_sec: status.instances_per_sec, - instances_per_min: status.instances_per_min, - time_series: status.time_series, - }) - .collect(); - - statuses.sort_by(|left, right| right.actions_per_sec.total_cmp(&left.actions_per_sec)); - statuses -} diff --git a/crates/waymark/src/backends/mod.rs b/crates/waymark/src/backends/mod.rs deleted file mode 100644 index 7fbd84ad..00000000 --- a/crates/waymark/src/backends/mod.rs +++ /dev/null @@ -1,15 +0,0 @@ -//! Backend implementations for runner persistence. - -mod base; -mod memory; -mod postgres; - -pub use base::{ - ActionAttemptStatus, ActionDone, BackendError, BackendResult, CoreBackend, - GarbageCollectionResult, GarbageCollectorBackend, GraphUpdate, InstanceDone, - InstanceLockStatus, LockClaim, QueuedInstance, QueuedInstanceBatch, SchedulerBackend, - WebappBackend, WorkerStatusBackend, WorkerStatusUpdate, WorkflowRegistration, - WorkflowRegistryBackend, WorkflowVersion, -}; -pub use memory::MemoryBackend; -pub use postgres::PostgresBackend; diff --git a/crates/waymark/src/bin/integration_test.rs b/crates/waymark/src/bin/integration_test.rs index 35f86fdf..e7faa3ae 100644 --- a/crates/waymark/src/bin/integration_test.rs +++ b/crates/waymark/src/bin/integration_test.rs @@ -19,17 +19,16 @@ use serde_json::Value; use sqlx::Row; use uuid::Uuid; -use waymark::backends::{ - CoreBackend, MemoryBackend, PostgresBackend, QueuedInstance, WorkflowRegistration, - WorkflowRegistryBackend, -}; -use waymark::db; -use waymark::integration_support::{LOCAL_POSTGRES_DSN, connect_pool, ensure_local_postgres}; use waymark::messages::ast as ir; use waymark::waymark_core::runloop::{RunLoop, RunLoopSupervisorConfig}; -use waymark::waymark_core::runner::RunnerState; use waymark::workers::{PythonWorkerConfig, RemoteWorkerPool}; +use waymark_backend_memory::MemoryBackend; +use waymark_backend_postgres::PostgresBackend; +use waymark_core_backend::{CoreBackend, QueuedInstance}; use waymark_dag::{DAG, convert_to_dag}; +use waymark_integration_support::{LOCAL_POSTGRES_DSN, connect_pool, ensure_local_postgres}; +use waymark_runner_state::RunnerState; +use waymark_workflow_registry_backend::{WorkflowRegistration, WorkflowRegistryBackend}; #[derive(Parser, Debug)] #[command(name = "integration_test")] @@ -452,7 +451,7 @@ async fn connect_postgres_backend() -> Result { let pool = connect_pool(&dsn) .await .with_context(|| format!("connect postgres backend: {dsn}"))?; - db::run_migrations(&pool) + waymark_backend_postgres_migrations::run(&pool) .await .context("run postgres migrations for integration runner")?; Ok(PostgresBackend::new(pool)) diff --git a/crates/waymark/src/bin/soak-harness.rs b/crates/waymark/src/bin/soak-harness.rs index 3503fe94..2bccbb4e 100644 --- a/crates/waymark/src/bin/soak-harness.rs +++ b/crates/waymark/src/bin/soak-harness.rs @@ -29,14 +29,13 @@ use tokio::process::{Child, Command}; use tracing::{error, info, warn}; use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt}; use uuid::Uuid; -use waymark::backends::{ - PostgresBackend, QueuedInstance, WorkflowRegistration, WorkflowRegistryBackend, -}; -use waymark::db; use waymark::messages::ast as ir; -use waymark::waymark_core::ir_parser::parse_program; -use waymark::waymark_core::runner::RunnerState; +use waymark_backend_postgres::PostgresBackend; +use waymark_core_backend::QueuedInstance; use waymark_dag::{DAG, convert_to_dag}; +use waymark_ir_parser::parse_program; +use waymark_runner_state::RunnerState; +use waymark_workflow_registry_backend::{WorkflowRegistration, WorkflowRegistryBackend as _}; const DEFAULT_DSN: &str = "postgresql://waymark:waymark@127.0.0.1:5433/waymark"; const DEFAULT_WORKFLOW_NAME: &str = "waymark_soak_timeout_mix_v1"; @@ -287,7 +286,7 @@ async fn main() -> Result<()> { } let pool = wait_for_database(&args.dsn, DB_READY_TIMEOUT).await?; - db::run_migrations(&pool) + waymark_backend_postgres_migrations::run(&pool) .await .context("run migrations before soak")?; diff --git a/crates/waymark/src/bin/start-workers.rs b/crates/waymark/src/bin/start-workers.rs index 20773cb5..8aa1c493 100644 --- a/crates/waymark/src/bin/start-workers.rs +++ b/crates/waymark/src/bin/start-workers.rs @@ -39,21 +39,16 @@ use anyhow::Result; use prost::Message; use sqlx::{PgPool, Row}; use tokio::signal; -use tokio::sync::watch; use tracing::{error, info, warn}; use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt}; use uuid::Uuid; -use waymark::backends::PostgresBackend; use waymark::config::WorkerConfig; -use waymark::db; use waymark::messages::ast as ir; use waymark::scheduler::{DagResolver, WorkflowDag}; use waymark::waymark_core::runloop::{RunLoopSupervisorConfig, runloop_supervisor}; -use waymark::{ - PythonWorkerConfig, RemoteWorkerPool, WebappServer, spawn_garbage_collector, spawn_scheduler, - spawn_status_reporter, -}; +use waymark::{PythonWorkerConfig, RemoteWorkerPool, WebappServer, spawn_status_reporter}; +use waymark_backend_postgres::PostgresBackend; use waymark_dag::convert_to_dag; #[tokio::main] @@ -86,9 +81,12 @@ async fn main() -> Result<()> { "starting worker infrastructure" ); + // Wire shutdown coordination. + let shutdown_token = tokio_util::sync::CancellationToken::new(); + // Initialize the database and backend. let pool = PgPool::connect(&config.database_url).await?; - db::run_migrations(&pool).await?; + waymark_backend_postgres_migrations::run(&pool).await?; let backend = PostgresBackend::new(pool); // Start the worker pool (bridge + python workers). @@ -117,15 +115,29 @@ async fn main() -> Result<()> { // Start the scheduler loop. let dag_resolver = build_dag_resolver(backend.pool().clone()); - let (scheduler_handle, scheduler_shutdown) = - spawn_scheduler(backend.clone(), config.scheduler.clone(), dag_resolver); + let scheduler_handle = { + let shutdown = shutdown_token.clone().cancelled_owned(); + let task = waymark::SchedulerTask { + backend: backend.clone(), + config: config.scheduler.clone(), + dag_resolver, + }; + tokio::spawn(task.run(shutdown)) + }; info!( poll_interval_ms = config.scheduler.poll_interval.as_millis(), batch_size = config.scheduler.batch_size, "scheduler task started" ); - let (garbage_collector_handle, garbage_collector_shutdown) = - spawn_garbage_collector(backend.clone(), config.garbage_collector.clone()); + + let garbage_collector_handle = { + let shutdown = shutdown_token.clone().cancelled_owned(); + let task = waymark::GarbageCollectorTask { + backend: backend.clone(), + config: config.garbage_collector.clone(), + }; + tokio::spawn(task.run(shutdown)) + }; info!( interval_ms = config.garbage_collector.interval.as_millis(), batch_size = config.garbage_collector.batch_size, @@ -133,8 +145,6 @@ async fn main() -> Result<()> { "garbage collector task started" ); - // Wire shutdown coordination. - let (shutdown_tx, shutdown_rx) = watch::channel(false); let active_instance_gauge = Arc::new(AtomicUsize::new(0)); // Start status reporting. @@ -145,28 +155,24 @@ async fn main() -> Result<()> { remote_pool.clone(), active_instance_gauge.clone(), config.profile_interval, - shutdown_rx.clone(), + shutdown_token.clone().cancelled_owned(), ); let expired_lock_reclaimer_handle = spawn_expired_lock_reclaimer( backend.clone(), config.expired_lock_reclaimer_interval, config.expired_lock_reclaimer_batch_size, - shutdown_rx.clone(), + shutdown_token.clone().cancelled_owned(), ); let shutdown_handle = tokio::spawn({ - let shutdown_tx = shutdown_tx.clone(); - let scheduler_shutdown = scheduler_shutdown.clone(); - let garbage_collector_shutdown = garbage_collector_shutdown.clone(); + let shutdown_token = shutdown_token.clone(); async move { if let Err(err) = wait_for_shutdown().await { error!(error = %err, "shutdown signal listener failed"); return; } info!("shutdown signal received"); - let _ = shutdown_tx.send(true); - let _ = scheduler_shutdown.send(true); - let _ = garbage_collector_shutdown.send(true); + shutdown_token.cancel(); } }); @@ -188,7 +194,7 @@ async fn main() -> Result<()> { skip_sleep: false, active_instance_gauge: Some(active_instance_gauge), }, - shutdown_rx, + shutdown_token, ) .await; @@ -248,7 +254,7 @@ fn spawn_expired_lock_reclaimer( backend: PostgresBackend, interval: Duration, batch_size: usize, - mut shutdown_rx: watch::Receiver, + shutdown: tokio_util::sync::WaitForCancellationFutureOwned, ) -> tokio::task::JoinHandle<()> { tokio::spawn(async move { let mut ticker = tokio::time::interval(interval); @@ -257,6 +263,7 @@ fn spawn_expired_lock_reclaimer( interval_ms = interval.as_millis(), batch_size, "expired lock reclaimer started" ); + let mut shutdown = std::pin::pin!(shutdown); loop { tokio::select! { _ = ticker.tick() => { @@ -282,11 +289,9 @@ fn spawn_expired_lock_reclaimer( ); } } - _ = shutdown_rx.changed() => { - if *shutdown_rx.borrow() { - info!("expired lock reclaimer shutting down"); - break; - } + _ = &mut shutdown => { + info!("expired lock reclaimer shutting down"); + break; } } } diff --git a/crates/waymark/src/bin/waymark-bridge.rs b/crates/waymark/src/bin/waymark-bridge.rs index 1bc6ac18..878e6a6e 100644 --- a/crates/waymark/src/bin/waymark-bridge.rs +++ b/crates/waymark/src/bin/waymark-bridge.rs @@ -29,18 +29,22 @@ use tracing::{debug, info}; use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt}; use uuid::Uuid; -use waymark::backends::{ - ActionDone, BackendError, BackendResult, CoreBackend, GraphUpdate, InstanceDone, - InstanceLockStatus, LockClaim, PostgresBackend, QueuedInstance, QueuedInstanceBatch, - SchedulerBackend, WorkflowRegistration, WorkflowRegistryBackend, WorkflowVersion, -}; -use waymark::db; use waymark::messages::{self, ast as ir, proto}; -use waymark::scheduler::{CreateScheduleParams, ScheduleId, ScheduleStatus, ScheduleType}; use waymark::waymark_core::runloop::{RunLoop, RunLoopSupervisorConfig}; -use waymark::waymark_core::runner::RunnerState; use waymark::workers::{ActionCompletion, ActionRequest, BaseWorkerPool, WorkerPoolError}; +use waymark_backend_postgres::PostgresBackend; +use waymark_backends_core::{BackendError, BackendResult}; +use waymark_core_backend::{ + ActionDone, CoreBackend, GraphUpdate, InstanceDone, InstanceLockStatus, LockClaim, + QueuedInstance, QueuedInstanceBatch, +}; use waymark_dag::convert_to_dag; +use waymark_runner_state::RunnerState; +use waymark_scheduler_backend::SchedulerBackend as _; +use waymark_scheduler_core::{CreateScheduleParams, ScheduleId, ScheduleStatus, ScheduleType}; +use waymark_workflow_registry_backend::{ + WorkflowRegistration, WorkflowRegistryBackend, WorkflowVersion, +}; const DEFAULT_GRPC_ADDR: &str = "127.0.0.1:24117"; @@ -52,7 +56,7 @@ struct WorkflowStore { impl WorkflowStore { async fn connect(dsn: &str) -> Result { let pool = PgPool::connect(dsn).await?; - db::run_migrations(&pool).await?; + waymark_backend_postgres_migrations::run(&pool).await?; let backend = PostgresBackend::new(pool); Ok(Self { backend }) } diff --git a/crates/waymark/src/db.rs b/crates/waymark/src/db.rs deleted file mode 100644 index f89f0e04..00000000 --- a/crates/waymark/src/db.rs +++ /dev/null @@ -1,14 +0,0 @@ -//! Database helpers shared across services. - -use sqlx::PgPool; - -use crate::backends::{BackendError, BackendResult}; - -/// Run the embedded SQLx migrations. -pub async fn run_migrations(pool: &PgPool) -> BackendResult<()> { - sqlx::migrate!() - .run(pool) - .await - .map_err(|err| BackendError::Message(err.to_string()))?; - Ok(()) -} diff --git a/crates/waymark/src/garbage_collector/mod.rs b/crates/waymark/src/garbage_collector/mod.rs index fe29bbc0..72ce00a1 100644 --- a/crates/waymark/src/garbage_collector/mod.rs +++ b/crates/waymark/src/garbage_collector/mod.rs @@ -2,4 +2,4 @@ mod task; -pub use task::{GarbageCollectorConfig, GarbageCollectorTask, spawn_garbage_collector}; +pub use task::{GarbageCollectorConfig, GarbageCollectorTask}; diff --git a/crates/waymark/src/garbage_collector/task.rs b/crates/waymark/src/garbage_collector/task.rs index 39eaf803..e1f673ef 100644 --- a/crates/waymark/src/garbage_collector/task.rs +++ b/crates/waymark/src/garbage_collector/task.rs @@ -5,10 +5,8 @@ use std::time::Duration; use chrono::Utc; -use tokio::sync::watch; use tracing::{debug, error, info}; - -use crate::backends::{GarbageCollectionResult, GarbageCollectorBackend}; +use waymark_garbage_collector_backend::{GarbageCollectionResult, GarbageCollectorBackend}; /// Configuration for the garbage collector task. #[derive(Debug, Clone)] @@ -33,29 +31,16 @@ impl Default for GarbageCollectorConfig { /// Background garbage collector task. pub struct GarbageCollectorTask { - backend: B, - config: GarbageCollectorConfig, - shutdown_rx: watch::Receiver, + pub backend: B, + pub config: GarbageCollectorConfig, } impl GarbageCollectorTask where B: GarbageCollectorBackend + Clone + Send + Sync + 'static, { - pub fn new( - backend: B, - config: GarbageCollectorConfig, - shutdown_rx: watch::Receiver, - ) -> Self { - Self { - backend, - config, - shutdown_rx, - } - } - /// Run the garbage collector loop. - pub async fn run(mut self) { + pub async fn run(self, shutdown: tokio_util::sync::WaitForCancellationFutureOwned) { info!( interval_ms = self.config.interval.as_millis(), batch_size = self.config.batch_size, @@ -63,13 +48,13 @@ where "garbage collector task started" ); + let mut shutdown = std::pin::pin!(shutdown); + loop { tokio::select! { - _ = self.shutdown_rx.changed() => { - if *self.shutdown_rx.borrow() { - info!("garbage collector task shutting down"); - break; - } + _ = &mut shutdown => { + info!("garbage collector task shutting down"); + break; } _ = tokio::time::sleep(self.config.interval) => { if let Err(err) = self.collect_until_drained().await { @@ -126,20 +111,6 @@ where } } -/// Convenience function to spawn a garbage collector task. -pub fn spawn_garbage_collector( - backend: B, - config: GarbageCollectorConfig, -) -> (tokio::task::JoinHandle<()>, watch::Sender) -where - B: GarbageCollectorBackend + Clone + Send + Sync + 'static, -{ - let (shutdown_tx, shutdown_rx) = watch::channel(false); - let task = GarbageCollectorTask::new(backend, config, shutdown_rx); - let handle = tokio::spawn(task.run()); - (handle, shutdown_tx) -} - #[cfg(test)] mod tests { use std::collections::VecDeque; @@ -148,9 +119,9 @@ mod tests { use chrono::{Duration as ChronoDuration, Utc}; use tonic::async_trait; + use waymark_backends_core::BackendResult; use super::*; - use crate::backends::{BackendResult, GarbageCollectorBackend}; #[derive(Clone)] struct StubGarbageCollectorBackend { @@ -199,16 +170,14 @@ mod tests { observed_limits: Arc::new(Mutex::new(Vec::new())), observed_cutoffs: Arc::new(Mutex::new(Vec::new())), }; - let (_shutdown_tx, shutdown_rx) = watch::channel(false); - let task = GarbageCollectorTask::new( - backend.clone(), - GarbageCollectorConfig { + let task = GarbageCollectorTask { + backend: backend.clone(), + config: GarbageCollectorConfig { interval: Duration::from_secs(60), batch_size: 2, retention: Duration::from_secs(24 * 60 * 60), }, - shutdown_rx, - ); + }; task.collect_until_drained() .await @@ -229,16 +198,14 @@ mod tests { observed_limits: Arc::new(Mutex::new(Vec::new())), observed_cutoffs: Arc::new(Mutex::new(Vec::new())), }; - let (_shutdown_tx, shutdown_rx) = watch::channel(false); - let task = GarbageCollectorTask::new( - backend.clone(), - GarbageCollectorConfig { + let task = GarbageCollectorTask { + backend: backend.clone(), + config: GarbageCollectorConfig { interval: Duration::from_secs(60), batch_size: 3, retention: Duration::from_secs(24 * 60 * 60), }, - shutdown_rx, - ); + }; let before = Utc::now(); task.collect_until_drained() diff --git a/crates/waymark/src/lib.rs b/crates/waymark/src/lib.rs index b9193e66..568c1ba0 100644 --- a/crates/waymark/src/lib.rs +++ b/crates/waymark/src/lib.rs @@ -1,32 +1,22 @@ //! Waymark - worker pool infrastructure plus the core IR/runtime port. -pub mod backends; pub mod config; -pub mod db; pub mod garbage_collector; -pub mod integration_support; pub mod messages; pub mod observability; pub mod pool_status; pub mod scheduler; pub mod server_worker; -#[cfg(test)] -pub mod test_support; pub mod waymark_core; pub mod webapp; pub mod workers; // Worker infrastructure (preserved from the legacy Rust core). -pub use garbage_collector::{ - GarbageCollectorConfig, GarbageCollectorTask, spawn_garbage_collector, -}; +pub use garbage_collector::{GarbageCollectorConfig, GarbageCollectorTask}; pub use messages::{MessageError, ast as ir_ast, proto, workflow_argument_value_to_json}; pub use observability::obs; pub use pool_status::{PoolTimeSeries, TimeSeriesEntry, TimeSeriesJsonEntry}; -pub use scheduler::{ - CreateScheduleParams, ScheduleId, ScheduleType, SchedulerConfig, SchedulerTask, - WorkflowSchedule, spawn_scheduler, -}; +pub use scheduler::{SchedulerConfig, SchedulerTask}; pub use server_worker::{WorkerBridgeChannels, WorkerBridgeServer}; pub use webapp::{WebappConfig, WebappServer}; pub use workers::{ diff --git a/crates/waymark/src/observability.rs b/crates/waymark/src/observability.rs index dbb8a7af..49c1700f 100644 --- a/crates/waymark/src/observability.rs +++ b/crates/waymark/src/observability.rs @@ -1,6 +1,6 @@ //! Observability helpers for optional tracing instrumentation. -pub use waymark_observability_macros::obs; +pub use waymark_observability::obs; #[cfg(feature = "trace")] use std::sync::OnceLock; diff --git a/crates/waymark/src/scheduler/mod.rs b/crates/waymark/src/scheduler/mod.rs index 4c6e8c08..d14adfc4 100644 --- a/crates/waymark/src/scheduler/mod.rs +++ b/crates/waymark/src/scheduler/mod.rs @@ -6,9 +6,5 @@ //! - Cron and interval utilities mod task; -mod types; -mod utils; -pub use task::{DagResolver, SchedulerConfig, SchedulerTask, WorkflowDag, spawn_scheduler}; -pub use types::{CreateScheduleParams, ScheduleId, ScheduleStatus, ScheduleType, WorkflowSchedule}; -pub use utils::{apply_jitter, compute_next_run, next_cron_run, next_interval_run, validate_cron}; +pub use task::{DagResolver, SchedulerConfig, SchedulerTask, WorkflowDag}; diff --git a/crates/waymark/src/scheduler/task.rs b/crates/waymark/src/scheduler/task.rs index 9ad0da0b..283745cc 100644 --- a/crates/waymark/src/scheduler/task.rs +++ b/crates/waymark/src/scheduler/task.rs @@ -7,12 +7,11 @@ use std::sync::Arc; use std::time::Duration; use serde_json::Value; -use tokio::sync::watch; use tracing::{debug, error, info}; use uuid::Uuid; +use waymark_core_backend::QueuedInstance; +use waymark_scheduler_core::{ScheduleId, WorkflowSchedule}; -use super::types::{ScheduleId, WorkflowSchedule}; -use crate::backends::{CoreBackend, QueuedInstance, SchedulerBackend}; use crate::messages; use crate::messages::ast as ir; use waymark_dag::DAG; @@ -45,48 +44,33 @@ impl Default for SchedulerConfig { /// Background scheduler task. pub struct SchedulerTask { - backend: B, - config: SchedulerConfig, - shutdown_rx: watch::Receiver, + pub backend: B, + pub config: SchedulerConfig, /// Function to get the DAG for a workflow. /// This should look up the workflow definition and return its DAG. - dag_resolver: DagResolver, + pub dag_resolver: DagResolver, } impl SchedulerTask where - B: CoreBackend + SchedulerBackend + Clone + Send + Sync + 'static, + B: waymark_core_backend::CoreBackend + waymark_scheduler_backend::SchedulerBackend, + B: Clone + Send + Sync + 'static, { - /// Create a new scheduler task. - pub fn new( - backend: B, - config: SchedulerConfig, - shutdown_rx: watch::Receiver, - dag_resolver: DagResolver, - ) -> Self { - Self { - backend, - config, - shutdown_rx, - dag_resolver, - } - } - /// Run the scheduler loop. - pub async fn run(mut self) { + pub async fn run(self, shutdown: tokio_util::sync::WaitForCancellationFutureOwned) { info!( poll_interval_ms = self.config.poll_interval.as_millis(), batch_size = self.config.batch_size, "scheduler task started" ); + let mut shutdown = std::pin::pin!(shutdown); + loop { tokio::select! { - _ = self.shutdown_rx.changed() => { - if *self.shutdown_rx.borrow() { - info!("scheduler task shutting down"); - break; - } + _ = &mut shutdown => { + info!("scheduler task shutting down"); + break; } _ = tokio::time::sleep(self.config.poll_interval) => { if let Err(e) = self.poll_and_fire().await { @@ -170,12 +154,8 @@ where .as_ref() .ok_or_else(|| "DAG has no entry node".to_string())?; - let mut state = crate::waymark_core::runner::RunnerState::new( - Some(Arc::clone(&dag)), - None, - None, - false, - ); + let mut state = + waymark_runner_state::RunnerState::new(Some(Arc::clone(&dag)), None, None, false); if let Some(input_payload) = schedule.input_payload.as_deref() { let inputs = messages::workflow_arguments_to_json(input_payload) .ok_or_else(|| "failed to decode schedule input payload".to_string())?; @@ -287,21 +267,6 @@ fn literal_from_json_value(value: &Value) -> ir::Expr { } } -/// Convenience function to spawn a scheduler task. -pub fn spawn_scheduler( - backend: B, - config: SchedulerConfig, - dag_resolver: DagResolver, -) -> (tokio::task::JoinHandle<()>, watch::Sender) -where - B: CoreBackend + SchedulerBackend + Clone + Send + Sync + 'static, -{ - let (shutdown_tx, shutdown_rx) = watch::channel(false); - let task = SchedulerTask::new(backend, config, shutdown_rx, dag_resolver); - let handle = tokio::spawn(task.run()); - (handle, shutdown_tx) -} - #[cfg(test)] mod tests { use std::collections::VecDeque; @@ -310,14 +275,16 @@ mod tests { use chrono::{Duration as ChronoDuration, Utc}; use prost::Message; use serde_json::Value; + use waymark_backend_memory::MemoryBackend; + use waymark_core_backend::{CoreBackend, LockClaim}; + use waymark_scheduler_backend::SchedulerBackend; + use waymark_scheduler_core::{CreateScheduleParams, ScheduleType}; use super::*; - use crate::backends::{CoreBackend, LockClaim, MemoryBackend, SchedulerBackend}; use crate::messages::proto; - use crate::scheduler::{CreateScheduleParams, ScheduleType}; - use crate::waymark_core::ir_parser::parse_program; - use crate::waymark_core::runner::RunnerExecutor; use waymark_dag::convert_to_dag; + use waymark_ir_parser::parse_program; + use waymark_runner::RunnerExecutor; fn workflow_args_payload(key: &str, value: i64) -> Vec { proto::WorkflowArguments { @@ -339,7 +306,6 @@ mod tests { async fn scheduler_fire_schedule_applies_input_payload_to_state() { let queue = Arc::new(Mutex::new(VecDeque::new())); let backend = MemoryBackend::with_queue(queue); - let (_shutdown_tx, shutdown_rx) = watch::channel(false); let source = r#" fn main(input: [number], output: [result]): @@ -362,12 +328,11 @@ fn main(input: [number], output: [result]): } }); - let scheduler = SchedulerTask::new( - backend.clone(), - SchedulerConfig::default(), - shutdown_rx, + let scheduler = SchedulerTask { + backend: backend.clone(), + config: SchedulerConfig::default(), dag_resolver, - ); + }; SchedulerBackend::upsert_schedule( &backend, &CreateScheduleParams { @@ -408,11 +373,8 @@ fn main(input: [number], output: [result]): let state = queued.state.clone().expect("queued state"); let mut executor = RunnerExecutor::new(Arc::clone(&dag), state, queued.action_results.clone(), None); - let replay = crate::waymark_core::runner::replay_variables( - executor.state(), - executor.action_results(), - ) - .expect("replay inputs"); + let replay = waymark_runner::replay_variables(executor.state(), executor.action_results()) + .expect("replay inputs"); assert_eq!( replay.variables.get("number"), Some(&Value::Number(7.into())) diff --git a/crates/waymark/src/waymark_core/cli/benchmark.rs b/crates/waymark/src/waymark_core/cli/benchmark.rs index 99558eea..241afdd1 100644 --- a/crates/waymark/src/waymark_core/cli/benchmark.rs +++ b/crates/waymark/src/waymark_core/cli/benchmark.rs @@ -12,12 +12,11 @@ use serde_json::Value; use sha2::{Digest, Sha256}; use sqlx::PgPool; use uuid::Uuid; +use waymark_backend_postgres::PostgresBackend; +use waymark_core_backend::QueuedInstance; +use waymark_integration_support::{LOCAL_POSTGRES_DSN, ensure_local_postgres}; +use waymark_workflow_registry_backend::{WorkflowRegistration, WorkflowRegistryBackend as _}; -use crate::backends::{ - PostgresBackend, QueuedInstance, WorkflowRegistration, WorkflowRegistryBackend, -}; -use crate::db; -use crate::integration_support::{LOCAL_POSTGRES_DSN, ensure_local_postgres}; use crate::messages::ast as ir; use crate::observability::obs; use crate::waymark_core::cli::smoke::{ @@ -25,9 +24,9 @@ use crate::waymark_core::cli::smoke::{ build_try_except_program, build_while_loop_program, literal_from_value, }; use crate::waymark_core::runloop::{RunLoop, RunLoopSupervisorConfig}; -use crate::waymark_core::runner::RunnerState; use crate::workers::{ActionCallable, InlineWorkerPool, WorkerPoolError}; use waymark_dag::convert_to_dag; +use waymark_runner_state::RunnerState; const DEFAULT_DSN: &str = LOCAL_POSTGRES_DSN; const DEFAULT_MAX_CONCURRENT_INSTANCES: usize = 500; @@ -318,7 +317,9 @@ async fn run_benchmark( } let pool = PgPool::connect(dsn).await.expect("connect postgres"); drop_benchmark_tables(&pool).await; - db::run_migrations(&pool).await.expect("run migrations"); + waymark_backend_postgres_migrations::run(&pool) + .await + .expect("run migrations"); let backend = PostgresBackend::new(pool); backend.clear_all().await.expect("clear all"); let total = queue_benchmark_instances(&backend, &cases, count_per_case, batch_size).await; diff --git a/crates/waymark/src/waymark_core/cli/smoke.rs b/crates/waymark/src/waymark_core/cli/smoke.rs index 3625e952..bb5a49c3 100644 --- a/crates/waymark/src/waymark_core/cli/smoke.rs +++ b/crates/waymark/src/waymark_core/cli/smoke.rs @@ -11,18 +11,18 @@ use prost::Message; use serde_json::Value; use sha2::{Digest, Sha256}; use uuid::Uuid; +use waymark_backend_memory::MemoryBackend; +use waymark_core_backend::QueuedInstance; +use waymark_workflow_registry_backend::{WorkflowRegistration, WorkflowRegistryBackend as _}; -use crate::backends::{ - MemoryBackend, QueuedInstance, WorkflowRegistration, WorkflowRegistryBackend, -}; use crate::messages::ast as ir; use crate::waymark_core::dag_viz::render_dag_image; use crate::waymark_core::ir_format::format_program; -use crate::waymark_core::ir_parser::parse_program; use crate::waymark_core::runloop::{RunLoop, RunLoopSupervisorConfig}; -use crate::waymark_core::runner::RunnerState; use crate::workers::{PythonWorkerConfig, RemoteWorkerPool}; use waymark_dag::convert_to_dag; +use waymark_ir_parser::parse_program; +use waymark_runner_state::RunnerState; #[derive(Parser, Debug)] #[command(name = "waymark-smoke", about = "Smoke check core-python components.")] diff --git a/crates/waymark/src/waymark_core/ir_format.rs b/crates/waymark/src/waymark_core/ir_format.rs index 57a9460a..45795f0e 100644 --- a/crates/waymark/src/waymark_core/ir_format.rs +++ b/crates/waymark/src/waymark_core/ir_format.rs @@ -569,7 +569,7 @@ pub fn format_program(program: &ir::Program) -> String { #[cfg(test)] mod tests { use super::{DEFAULT_INDENT, format_program}; - use crate::waymark_core::ir_parser::IRParser; + use waymark_ir_parser::IRParser; #[test] fn test_format_program_happy_path() { diff --git a/crates/waymark/src/waymark_core/lock.rs b/crates/waymark/src/waymark_core/lock.rs index d270bc53..3f3acd5d 100644 --- a/crates/waymark/src/waymark_core/lock.rs +++ b/crates/waymark/src/waymark_core/lock.rs @@ -1,17 +1,14 @@ //! Instance lock tracking and heartbeat maintenance. use std::collections::HashSet; -use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Mutex}; use std::time::Duration; use chrono::{Duration as ChronoDuration, Utc}; -use tokio::sync::Notify; use uuid::Uuid; use tracing::{debug, info, warn}; - -use crate::backends::{CoreBackend, LockClaim}; +use waymark_core_backend::LockClaim; #[derive(Clone)] pub struct InstanceLockTracker { @@ -62,21 +59,17 @@ impl InstanceLockTracker { } pub fn spawn_lock_heartbeat( - backend: Arc, + backend: Arc, tracker: InstanceLockTracker, heartbeat_interval: Duration, lock_ttl: Duration, - stop: Arc, - stop_notify: Arc, + stop: tokio_util::sync::WaitForCancellationFutureOwned, ) -> tokio::task::JoinHandle<()> { tokio::spawn(async move { + let mut stop = std::pin::pin!(stop); loop { - if stop.load(Ordering::SeqCst) { - info!("lock heartbeat stop flag set"); - break; - } tokio::select! { - _ = stop_notify.notified() => { + _ = &mut stop => { info!("lock heartbeat stop notified"); break; } diff --git a/crates/waymark/src/waymark_core/mod.rs b/crates/waymark/src/waymark_core/mod.rs index 85f0c008..d5a4ec27 100644 --- a/crates/waymark/src/waymark_core/mod.rs +++ b/crates/waymark/src/waymark_core/mod.rs @@ -4,14 +4,10 @@ pub mod cli; pub mod commit_barrier; pub mod dag_viz; pub mod ir_format; -pub mod ir_parser; pub mod lock; pub mod runloop; -pub mod runner; -pub use crate::backends::{InstanceDone, QueuedInstance}; pub use crate::workers::{ActionCompletion, ActionRequest, BaseWorkerPool, InlineWorkerPool}; pub use dag_viz::{build_dag_graph, render_dag_image}; pub use ir_format::format_program; pub use runloop::RunLoop; -pub use runner::RunnerState; diff --git a/crates/waymark/src/waymark_core/runloop.rs b/crates/waymark/src/waymark_core/runloop.rs index 9bb739d4..407e4a3b 100644 --- a/crates/waymark/src/waymark_core/runloop.rs +++ b/crates/waymark/src/waymark_core/runloop.rs @@ -3,7 +3,7 @@ use std::collections::{HashMap, HashSet, VecDeque}; use std::sync::{ Arc, - atomic::{AtomicBool, AtomicUsize, Ordering}, + atomic::{AtomicUsize, Ordering}, mpsc as std_mpsc, }; use std::thread; @@ -12,27 +12,29 @@ use std::time::{Duration, Instant}; use chrono::{DateTime, Utc}; use prost::Message; use serde_json::Value; -use tokio::sync::{Notify, mpsc, watch}; +use tokio::sync::mpsc; use tracing::{debug, error, info, warn}; use uuid::Uuid; - -use crate::backends::{ - ActionDone, BackendError, CoreBackend, GraphUpdate, InstanceDone, InstanceLockStatus, - LockClaim, QueuedInstance, QueuedInstanceBatch, WorkflowRegistryBackend, +use waymark_backends_core::BackendError; +use waymark_core_backend::{ + ActionDone, GraphUpdate, InstanceDone, InstanceLockStatus, LockClaim, QueuedInstance, + QueuedInstanceBatch, }; +use waymark_workflow_registry_backend::WorkflowRegistryBackend; + use crate::messages::ast as ir; use crate::observability::obs; use crate::waymark_core::commit_barrier::{CommitBarrier, DeferredInstanceEvent}; use crate::waymark_core::lock::{InstanceLockTracker, spawn_lock_heartbeat}; -use crate::waymark_core::runner::synthetic_exceptions::{ +use crate::workers::{ActionCompletion, ActionRequest, BaseWorkerPool, WorkerPoolError}; +use waymark_dag::{DAG, DAGNode, OutputNode, ReturnNode, convert_to_dag}; +use waymark_runner::synthetic_exceptions::{ SyntheticExceptionType, build_synthetic_exception_value, }; -use crate::waymark_core::runner::{ +use waymark_runner::{ DurableUpdates, ExecutorStep, RunnerExecutor, RunnerExecutorError, SleepRequest, replay_variables, }; -use crate::workers::{ActionCompletion, ActionRequest, BaseWorkerPool, WorkerPoolError}; -use waymark_dag::{DAG, DAGNode, OutputNode, ReturnNode, convert_to_dag}; /// Raised when the run loop cannot coordinate execution. #[derive(Debug, thiserror::Error)] @@ -134,58 +136,33 @@ enum PersistAck { }, } -async fn send_instance_message_with_stop( - instance_tx: &mpsc::Sender, - message: InstanceMessage, - stop_notify: &Notify, +async fn send_with_stop( + tx: &mpsc::Sender, + item: T, + stop: tokio_util::sync::WaitForCancellationFuture<'_>, + kind: &'static str, ) -> bool { - let send_fut = instance_tx.send(message); + let send_fut = tx.send(item); tokio::pin!(send_fut); - let mut warned = false; - loop { - tokio::select! { - res = &mut send_fut => { - if res.is_err() { - warn!("instance poller receiver dropped"); - return false; - } - return true; - } - _ = stop_notify.notified() => { - info!("instance poller stop notified during send"); - return false; - } - _ = tokio::time::sleep(Duration::from_secs(2)), if !warned => { - warn!("instance poller send pending >2s"); - warned = true; - } - } - } -} -async fn send_persist_command_with_stop( - persist_tx: &mpsc::Sender, - command: PersistCommand, - stop_notify: &Notify, -) -> bool { - let send_fut = persist_tx.send(command); - tokio::pin!(send_fut); + let mut stop = std::pin::pin!(stop); + let mut warned = false; loop { tokio::select! { res = &mut send_fut => { if res.is_err() { - warn!("persistence task receiver dropped"); + warn!(%kind, "receiver dropped"); return false; } return true; } - _ = stop_notify.notified() => { - info!("persist sender stop notified during send"); + _ = &mut stop => { + info!(%kind, "sender stop notified during send"); return false; } _ = tokio::time::sleep(Duration::from_secs(2)), if !warned => { - warn!("persist command send pending >2s"); + warn!(%kind, "send pending >2s"); warned = true; } } @@ -376,7 +353,7 @@ impl ShardExecutor { fn run_executor_shard( shard_id: usize, - backend: Arc, + backend: Arc, receiver: std_mpsc::Receiver, sender: mpsc::UnboundedSender, ) { @@ -555,7 +532,7 @@ fn run_executor_shard( /// Run loop that fans out executor work across CPU-bound shard threads. pub struct RunLoop { worker_pool: Arc, - core_backend: Arc, + core_backend: Arc, registry_backend: Arc, workflow_cache: HashMap>, max_concurrent_instances: usize, @@ -569,7 +546,8 @@ pub struct RunLoop { evict_sleep_threshold: Duration, skip_sleep: bool, active_instance_gauge: Option>, - shutdown_rx: Option>, + shutdown_token: tokio_util::sync::CancellationToken, + exit_on_idle: bool, } #[derive(Clone, Debug)] @@ -590,30 +568,37 @@ pub struct RunLoopSupervisorConfig { impl RunLoop { pub fn new( worker_pool: impl BaseWorkerPool + 'static, - backend: impl CoreBackend + WorkflowRegistryBackend + 'static, + backend: impl waymark_core_backend::CoreBackend + WorkflowRegistryBackend + 'static, config: RunLoopSupervisorConfig, ) -> Self { - Self::new_internal(worker_pool, backend, config, None) + Self::new_internal( + worker_pool, + backend, + config, + tokio_util::sync::CancellationToken::new(), + true, + ) } pub fn new_with_shutdown( worker_pool: impl BaseWorkerPool + 'static, - backend: impl CoreBackend + WorkflowRegistryBackend + 'static, + backend: impl waymark_core_backend::CoreBackend + WorkflowRegistryBackend + 'static, config: RunLoopSupervisorConfig, - shutdown_rx: watch::Receiver, + shutdown_token: tokio_util::sync::CancellationToken, ) -> Self { - Self::new_internal(worker_pool, backend, config, Some(shutdown_rx)) + Self::new_internal(worker_pool, backend, config, shutdown_token, false) } fn new_internal( worker_pool: impl BaseWorkerPool + 'static, - backend: impl CoreBackend + WorkflowRegistryBackend + 'static, + backend: impl waymark_core_backend::CoreBackend + WorkflowRegistryBackend + 'static, config: RunLoopSupervisorConfig, - shutdown_rx: Option>, + shutdown_token: tokio_util::sync::CancellationToken, + exit_on_idle: bool, ) -> Self { let max_concurrent_instances = std::cmp::max(1, config.max_concurrent_instances); let backend = Arc::new(backend); - let core_backend: Arc = backend.clone(); + let core_backend: Arc = backend.clone(); let registry_backend: Arc = backend; Self { worker_pool: Arc::new(worker_pool), @@ -636,7 +621,8 @@ impl RunLoop { evict_sleep_threshold: config.evict_sleep_threshold, skip_sleep: config.skip_sleep, active_instance_gauge: config.active_instance_gauge.clone(), - shutdown_rx, + shutdown_token, + exit_on_idle, } } @@ -891,30 +877,28 @@ impl RunLoop { let (completion_tx, mut completion_rx) = mpsc::channel::>(32); let (instance_tx, mut instance_rx) = mpsc::channel::(16); let (sleep_tx, mut sleep_rx) = mpsc::unbounded_channel::(); - let stop = Arc::new(AtomicBool::new(false)); - let stop_notify = Arc::new(Notify::new()); + let lock_tracker = InstanceLockTracker::new(self.lock_uuid); let lock_handle = spawn_lock_heartbeat( self.core_backend.clone(), lock_tracker.clone(), self.lock_heartbeat, self.lock_ttl, - stop.clone(), - stop_notify.clone(), + self.shutdown_token.clone().cancelled_owned(), ); let worker_pool = self.worker_pool.clone(); - let completion_stop = stop.clone(); - let completion_notify = stop_notify.clone(); + let completion_shutdown_token = self.shutdown_token.clone(); let completion_handle = tokio::spawn(async move { + let _completion_shutdown_guard = completion_shutdown_token.drop_guard_ref(); loop { - if completion_stop.load(Ordering::SeqCst) { + if completion_shutdown_token.is_cancelled() { info!("completion task stop flag set"); break; } debug!("completion task awaiting completions"); let completions = tokio::select! { - _ = completion_notify.notified() => { + _ = completion_shutdown_token.cancelled() => { info!("completion task stop notified"); break; } @@ -930,35 +914,21 @@ impl RunLoop { count = completions.len(), "completion task sending completions" ); - let send_fut = completion_tx.send(completions); - tokio::pin!(send_fut); - let mut warned = false; - let mut stop_during_send = false; - let send_result = loop { - tokio::select! { - res = &mut send_fut => break Some(res), - _ = completion_notify.notified() => { - info!("completion task stop notified during send"); - stop_during_send = true; - break None; - } - _ = tokio::time::sleep(Duration::from_secs(2)), if !warned => { - warn!("completion task send pending >2s"); - warned = true; - } - } - }; - if stop_during_send { - break; - } - if send_result.is_none() || send_result.unwrap().is_err() { - warn!("completion task receiver dropped"); + + if !send_with_stop( + &completion_tx, + completions, + completion_shutdown_token.cancelled(), + "completions", + ) + .await + { break; } + debug!("completion task sent completions"); } info!("completion task exiting"); - completion_notify.notify_waiters(); }); let backend = self.core_backend.clone(); @@ -967,11 +937,11 @@ impl RunLoop { let lock_uuid = self.lock_uuid; let lock_ttl = self.lock_ttl; let instance_available_slots = Arc::clone(&available_instance_slots); - let instance_stop = stop.clone(); - let instance_notify = stop_notify.clone(); + let instance_shutdown_token = self.shutdown_token.clone(); let instance_handle = tokio::spawn(async move { + let _instance_shutdown_guard = instance_shutdown_token.drop_guard_ref(); loop { - if instance_stop.load(Ordering::SeqCst) { + if instance_shutdown_token.is_cancelled() { info!("instance poller stop flag set"); break; } @@ -1005,7 +975,14 @@ impl RunLoop { } Err(err) => InstanceMessage::Error(err), }; - if !send_instance_message_with_stop(&instance_tx, message, &instance_notify).await { + if !send_with_stop( + &instance_tx, + message, + instance_shutdown_token.cancelled(), + "instance message", + ) + .await + { break; } if poll_interval > Duration::ZERO { @@ -1015,7 +992,6 @@ impl RunLoop { } } info!("instance poller exiting"); - instance_notify.notify_waiters(); }); const PERSIST_COALESCE_WINDOW: Duration = Duration::from_millis(2); @@ -1175,18 +1151,15 @@ impl RunLoop { let mut commit_barrier: CommitBarrier = CommitBarrier::new(); let mut instances_idle = false; let mut instances_done_pending: Vec = Vec::new(); - let mut run_result = Ok(()); - let mut shutdown_rx = self.shutdown_rx.clone(); + let shutdown_token = self.shutdown_token.clone(); - loop { - if let Some(rx) = shutdown_rx.as_ref() - && *rx.borrow() - { + let mut run_result = 'runloop: loop { + if shutdown_token.is_cancelled() { info!("runloop exiting: shutdown requested"); - break; + break 'runloop Ok(()); } - if shutdown_rx.is_none() + if self.exit_on_idle && instances_idle && executor_shards.is_empty() && sleeping_nodes.is_empty() @@ -1196,18 +1169,13 @@ impl RunLoop { blocked = blocked_until_by_instance.len(), "runloop exiting: idle with no active executors" ); - break; + break 'runloop Ok(()); } - let has_shutdown = shutdown_rx.is_some(); - let shutdown_rx_fut = async { shutdown_rx.as_mut().unwrap().changed().await.is_ok() }; let first_event = tokio::select! { - shutdown_signal = shutdown_rx_fut, if has_shutdown => { - if !shutdown_signal || shutdown_rx.as_ref().is_some_and(|rx| *rx.borrow()) { - info!("runloop exiting: shutdown requested"); - break; - } - None + _ = shutdown_token.cancelled() => { + info!("runloop exiting: shutdown requested"); + break 'runloop Ok(()); } Some(completions) = completion_rx.recv() => { Some(CoordinatorEvent::Completions(completions)) @@ -1235,7 +1203,7 @@ impl RunLoop { } else => { warn!("runloop exiting: event channels closed"); - break; + break 'runloop Ok(()); }, }; @@ -1267,8 +1235,7 @@ impl RunLoop { } CoordinatorEvent::Instance(InstanceMessage::Error(err)) => { warn!(error = %err, "runloop exiting: instance poller backend error"); - run_result = Err(RunLoopError::Backend(err)); - break; + break 'runloop Err(RunLoopError::Backend(err)); } CoordinatorEvent::Shard(event) => match event { ShardEvent::Step(step) => all_steps.push(step), @@ -1308,15 +1275,11 @@ impl RunLoop { } InstanceMessage::Error(err) => { warn!(error = %err, "runloop exiting: instance poller backend error"); - run_result = Err(RunLoopError::Backend(err)); - break; + break 'runloop Err(RunLoopError::Backend(err)); } } } - if run_result.is_err() { - warn!("runloop exiting: error after draining instance messages"); - break; - } + while let Ok(event) = event_rx.try_recv() { match event { ShardEvent::Step(step) => all_steps.push(step), @@ -1334,10 +1297,7 @@ impl RunLoop { } } } - if run_result.is_err() { - warn!("runloop exiting: error after draining shard events"); - break; - } + while let Ok(wake) = sleep_rx.try_recv() { all_wakes.push(wake); } @@ -1418,8 +1378,7 @@ impl RunLoop { ) .await { - run_result = Err(err); - break; + break 'runloop Err(err); } for step in batch.steps { if !batch.instance_ids.contains(&step.executor_id) { @@ -1434,13 +1393,10 @@ impl RunLoop { continue; } if let Err(err) = self.apply_confirmed_step(step, &mut state) { - run_result = Err(err); - break; + break 'runloop Err(err); } } - if run_result.is_err() { - break; - } + for instance_id in batch.instance_ids { if evict_ids.contains(&instance_id) { state.commit_barrier.remove_instance(instance_id); @@ -1457,14 +1413,10 @@ impl RunLoop { } PersistAck::StepsPersistFailed { batch_id, error } => { warn!(batch_id, error = %error, "persist step batch failed"); - run_result = Err(error); - break; + break 'runloop Err(error); } } } - if run_result.is_err() { - break; - } } if !all_completions.is_empty() { @@ -1587,8 +1539,7 @@ impl RunLoop { if had_instances { instances_idle = false; if let Err(err) = self.hydrate_instances(&mut all_instances).await { - run_result = Err(err); - break; + break 'runloop Err(err); } debug!(count = all_instances.len(), "hydrated queued instances"); let mut by_shard: HashMap> = HashMap::new(); @@ -1680,7 +1631,7 @@ impl RunLoop { .map(|update| update.instance_id) .collect(); let batch_id = commit_barrier.register_batch(instance_ids.clone(), all_steps); - if !send_persist_command_with_stop( + if !send_with_stop( &persist_tx, PersistCommand { batch_id, @@ -1689,7 +1640,8 @@ impl RunLoop { actions_done, graph_updates, }, - &stop_notify, + shutdown_token.cancelled(), + "persist command", ) .await { @@ -1698,10 +1650,9 @@ impl RunLoop { commit_barrier.remove_instance(instance_id); } } - run_result = Err(RunLoopError::Message( + break 'runloop Err(RunLoopError::Message( "failed to submit persist batch to persistence task".to_string(), )); - break; } } @@ -1737,8 +1688,7 @@ impl RunLoop { sleep_tx: &sleep_tx, }; if let Err(err) = self.evict_instances(&evict_ids, &mut state).await { - run_result = Err(err); - break; + break 'runloop Err(err); } for instance_id in evict_ids { state.commit_barrier.remove_instance(instance_id); @@ -1751,10 +1701,9 @@ impl RunLoop { if instances_done_pending.len() >= self.instance_done_batch_size && let Err(err) = self.flush_instances_done(&mut instances_done_pending).await { - run_result = Err(err); - break; + break 'runloop Err(err); } - } + }; info!( instances_idle, @@ -1775,8 +1724,7 @@ impl RunLoop { } drop(persist_tx); let _ = persist_handle.await; - stop.store(true, Ordering::SeqCst); - stop_notify.notify_waiters(); + shutdown_token.cancel(); let _ = completion_handle.await; let _ = instance_handle.await; let _ = lock_handle.await; @@ -1818,9 +1766,9 @@ pub async fn runloop_supervisor( backend: B, worker_pool: W, config: RunLoopSupervisorConfig, - shutdown_rx: watch::Receiver, + shutdown_token: tokio_util::sync::CancellationToken, ) where - B: CoreBackend + WorkflowRegistryBackend + Clone + Send + Sync + 'static, + B: waymark_core_backend::CoreBackend + WorkflowRegistryBackend + Clone + Send + Sync + 'static, W: BaseWorkerPool + Clone + Send + Sync + 'static, { let mut backoff = Duration::from_millis(200); @@ -1829,7 +1777,7 @@ pub async fn runloop_supervisor( let poll_interval = config.poll_interval; loop { - if *shutdown_rx.borrow() { + if shutdown_token.is_cancelled() { break; } @@ -1844,12 +1792,12 @@ pub async fn runloop_supervisor( worker_pool.clone(), backend.clone(), config.clone(), - shutdown_rx.clone(), + shutdown_token.child_token(), ); let result = runloop.run().await; - if *shutdown_rx.borrow() { + if shutdown_token.is_cancelled() { break; } @@ -1983,1081 +1931,4 @@ fn build_instance_done( } #[cfg(test)] -mod tests { - use super::*; - use std::collections::{HashMap, VecDeque}; - use std::sync::{ - Arc, Mutex, - atomic::{AtomicBool, AtomicUsize, Ordering as AtomicOrdering}, - }; - use std::time::Duration; - - use chrono::Utc; - use prost::Message; - use sha2::{Digest, Sha256}; - use tonic::async_trait; - - use crate::backends::{ - ActionAttemptStatus, BackendError, BackendResult, CoreBackend, GraphUpdate, InstanceDone, - InstanceLockStatus, LockClaim, MemoryBackend, QueuedInstanceBatch, WorkflowRegistration, - WorkflowRegistryBackend, WorkflowVersion, - }; - use crate::messages::ast as ir; - use crate::waymark_core::ir_parser::parse_program; - use crate::waymark_core::runner::RunnerState; - use crate::waymark_core::runner::state::NodeStatus; - use crate::workers::ActionCallable; - use waymark_dag::convert_to_dag; - - #[derive(Clone)] - struct FaultInjectingBackend { - inner: MemoryBackend, - fail_get_queued_instances_with_depth_limit: Arc, - get_queued_instances_calls: Arc, - } - - impl FaultInjectingBackend { - fn with_depth_limit_poll_failures(inner: MemoryBackend) -> Self { - Self { - inner, - fail_get_queued_instances_with_depth_limit: Arc::new(AtomicBool::new(true)), - get_queued_instances_calls: Arc::new(AtomicUsize::new(0)), - } - } - - fn get_queued_instances_calls(&self) -> usize { - self.get_queued_instances_calls.load(AtomicOrdering::SeqCst) - } - - fn queue_len(&self) -> usize { - self.inner - .instance_queue() - .as_ref() - .map(|queue| queue.lock().expect("queue poisoned").len()) - .unwrap_or(0) - } - - fn instances_done_len(&self) -> usize { - self.inner.instances_done().len() - } - } - - #[async_trait] - impl CoreBackend for FaultInjectingBackend { - fn clone_box(&self) -> Box { - Box::new(self.clone()) - } - - async fn save_graphs( - &self, - claim: LockClaim, - graphs: &[GraphUpdate], - ) -> BackendResult> { - self.inner.save_graphs(claim, graphs).await - } - - async fn save_actions_done( - &self, - actions: &[crate::backends::ActionDone], - ) -> BackendResult<()> { - self.inner.save_actions_done(actions).await - } - - async fn save_instances_done(&self, instances: &[InstanceDone]) -> BackendResult<()> { - self.inner.save_instances_done(instances).await - } - - async fn get_queued_instances( - &self, - size: usize, - claim: LockClaim, - ) -> BackendResult { - self.get_queued_instances_calls - .fetch_add(1, AtomicOrdering::SeqCst); - if self - .fail_get_queued_instances_with_depth_limit - .load(AtomicOrdering::SeqCst) - { - return Err(BackendError::Message("depth limit exceeded".to_string())); - } - self.inner.get_queued_instances(size, claim).await - } - - async fn queue_instances( - &self, - instances: &[crate::backends::QueuedInstance], - ) -> BackendResult<()> { - self.inner.queue_instances(instances).await - } - - async fn refresh_instance_locks( - &self, - claim: LockClaim, - instance_ids: &[Uuid], - ) -> BackendResult> { - self.inner.refresh_instance_locks(claim, instance_ids).await - } - - async fn release_instance_locks( - &self, - lock_uuid: Uuid, - instance_ids: &[Uuid], - ) -> BackendResult<()> { - self.inner - .release_instance_locks(lock_uuid, instance_ids) - .await - } - } - - #[async_trait] - impl WorkflowRegistryBackend for FaultInjectingBackend { - async fn upsert_workflow_version( - &self, - registration: &WorkflowRegistration, - ) -> BackendResult { - self.inner.upsert_workflow_version(registration).await - } - - async fn get_workflow_versions(&self, ids: &[Uuid]) -> BackendResult> { - self.inner.get_workflow_versions(ids).await - } - } - - fn default_test_config(lock_uuid: Uuid) -> RunLoopSupervisorConfig { - RunLoopSupervisorConfig { - max_concurrent_instances: 25, - executor_shards: 1, - instance_done_batch_size: None, - poll_interval: Duration::from_millis(10), - persistence_interval: Duration::from_millis(10), - lock_uuid, - lock_ttl: Duration::from_secs(15), - lock_heartbeat: Duration::from_secs(5), - evict_sleep_threshold: Duration::from_secs(10), - skip_sleep: false, - active_instance_gauge: None, - } - } - - #[tokio::test] - async fn test_runloop_executes_actions() { - let source = r#" -fn main(input: [x], output: [y]): - y = @tests.fixtures.test_actions.double(value=x) - return y -"#; - let program = parse_program(source.trim()).expect("parse program"); - let program_proto = program.encode_to_vec(); - let ir_hash = format!("{:x}", Sha256::digest(&program_proto)); - let dag = Arc::new(convert_to_dag(&program).expect("convert to dag")); - - let mut state = RunnerState::new(Some(Arc::clone(&dag)), None, None, false); - let _ = state - .record_assignment( - vec!["x".to_string()], - &ir::Expr { - kind: Some(ir::expr::Kind::Literal(ir::Literal { - value: Some(ir::literal::Value::IntValue(4)), - })), - span: None, - }, - None, - Some("input x = 4".to_string()), - ) - .expect("record assignment"); - let entry_node = dag - .entry_node - .as_ref() - .expect("DAG entry node not found") - .clone(); - let entry_exec = state - .queue_template_node(&entry_node, None) - .expect("queue entry node"); - - let queue = Arc::new(Mutex::new(VecDeque::new())); - let backend = MemoryBackend::with_queue(queue.clone()); - let workflow_version_id = backend - .upsert_workflow_version(&WorkflowRegistration { - workflow_name: "test".to_string(), - workflow_version: ir_hash.clone(), - ir_hash, - program_proto, - concurrent: false, - }) - .await - .expect("register workflow version"); - - let mut actions: HashMap = HashMap::new(); - actions.insert( - "double".to_string(), - Arc::new(|kwargs| { - Box::pin(async move { - let value = kwargs - .get("value") - .and_then(|value| value.as_i64()) - .unwrap_or(0); - Ok(Value::Number((value * 2).into())) - }) - }), - ); - let worker_pool = crate::workers::InlineWorkerPool::new(actions); - - let mut runloop = RunLoop::new( - worker_pool, - backend.clone(), - RunLoopSupervisorConfig { - max_concurrent_instances: 25, - executor_shards: 1, - instance_done_batch_size: None, - poll_interval: Duration::from_secs_f64(0.0), - persistence_interval: Duration::from_secs_f64(0.1), - lock_uuid: Uuid::new_v4(), - lock_ttl: Duration::from_secs(15), - lock_heartbeat: Duration::from_secs(5), - evict_sleep_threshold: Duration::from_secs(10), - skip_sleep: false, - active_instance_gauge: None, - }, - ); - queue.lock().expect("queue lock").push_back(QueuedInstance { - workflow_version_id, - schedule_id: None, - dag: None, - entry_node: entry_exec.node_id, - state: Some(state), - action_results: HashMap::new(), - instance_id: Uuid::new_v4(), - scheduled_at: None, - }); - - runloop.run().await.expect("runloop"); - let instances_done = backend.instances_done(); - assert_eq!(instances_done.len(), 1); - let done = &instances_done[0]; - let output = done.result.clone().expect("instance result"); - let Value::Object(map) = output else { - panic!("expected output object"); - }; - assert_eq!(map.get("y"), Some(&Value::Number(8.into()))); - } - - #[tokio::test] - async fn test_runloop_times_out_action_and_persists_timestamps() { - let source = r#" -fn main(input: [], output: [y]): - y = @tests.fixtures.test_actions.hang()[timeout: 1 s] - return y -"#; - let program = parse_program(source.trim()).expect("parse program"); - let program_proto = program.encode_to_vec(); - let ir_hash = format!("{:x}", Sha256::digest(&program_proto)); - let dag = Arc::new(convert_to_dag(&program).expect("convert to dag")); - - let mut state = RunnerState::new(Some(Arc::clone(&dag)), None, None, false); - let entry_node = dag - .entry_node - .as_ref() - .expect("DAG entry node not found") - .clone(); - let entry_exec = state - .queue_template_node(&entry_node, None) - .expect("queue entry node"); - - let queue = Arc::new(Mutex::new(VecDeque::new())); - let backend = MemoryBackend::with_queue(queue.clone()); - let workflow_version_id = backend - .upsert_workflow_version(&WorkflowRegistration { - workflow_name: "test_timeout".to_string(), - workflow_version: ir_hash.clone(), - ir_hash, - program_proto, - concurrent: false, - }) - .await - .expect("register workflow version"); - - let mut actions: HashMap = HashMap::new(); - actions.insert( - "hang".to_string(), - Arc::new(|_kwargs| { - Box::pin(async move { - tokio::time::sleep(Duration::from_secs(5)).await; - Ok(Value::String("late".to_string())) - }) - }), - ); - let worker_pool = crate::workers::InlineWorkerPool::new(actions); - - let mut runloop = RunLoop::new( - worker_pool, - backend.clone(), - RunLoopSupervisorConfig { - max_concurrent_instances: 25, - executor_shards: 1, - instance_done_batch_size: None, - poll_interval: Duration::from_secs_f64(0.0), - persistence_interval: Duration::from_secs_f64(0.05), - lock_uuid: Uuid::new_v4(), - lock_ttl: Duration::from_secs(15), - lock_heartbeat: Duration::from_secs(5), - evict_sleep_threshold: Duration::from_secs(10), - skip_sleep: false, - active_instance_gauge: None, - }, - ); - queue.lock().expect("queue lock").push_back(QueuedInstance { - workflow_version_id, - schedule_id: None, - dag: None, - entry_node: entry_exec.node_id, - state: Some(state), - action_results: HashMap::new(), - instance_id: Uuid::new_v4(), - scheduled_at: None, - }); - - runloop.run().await.expect("runloop"); - - let actions_done = backend.actions_done(); - assert_eq!(actions_done.len(), 1); - let action_done = &actions_done[0]; - assert_eq!(action_done.status, ActionAttemptStatus::TimedOut); - assert!(action_done.started_at.is_some()); - assert!(action_done.completed_at.is_some()); - assert!(action_done.duration_ms.is_some()); - - let execution_id = action_done.execution_id; - let graph_updates = backend.graph_updates(); - let mut saw_running_snapshot = false; - let mut saw_failed_snapshot = false; - for update in graph_updates { - let Some(node) = update.nodes.get(&execution_id) else { - continue; - }; - if node.status == NodeStatus::Running && node.started_at.is_some() { - saw_running_snapshot = true; - } - if node.status == NodeStatus::Failed - && node.started_at.is_some() - && node.completed_at.is_some() - { - saw_failed_snapshot = true; - } - } - assert!(saw_running_snapshot, "expected running graph snapshot"); - assert!(saw_failed_snapshot, "expected failed graph snapshot"); - - let instances_done = backend.instances_done(); - assert_eq!(instances_done.len(), 1); - assert!(instances_done[0].result.is_none()); - let Value::Object(error_obj) = instances_done[0] - .error - .clone() - .expect("instance error payload") - else { - panic!("expected error payload object"); - }; - assert_eq!( - error_obj.get("type"), - Some(&Value::String("ActionTimeout".to_string())) - ); - } - - #[tokio::test] - async fn test_runloop_marks_instance_failed_on_executor_error() { - let source = r#" -fn main(input: [x], output: [y]): - y = @tests.fixtures.test_actions.double(value=x) - return y -"#; - let program = parse_program(source.trim()).expect("parse program"); - let program_proto = program.encode_to_vec(); - let ir_hash = format!("{:x}", Sha256::digest(&program_proto)); - let dag = Arc::new(convert_to_dag(&program).expect("convert to dag")); - - // Intentionally omit input assignment so action kwarg resolution fails at runtime. - let mut state = RunnerState::new(Some(Arc::clone(&dag)), None, None, false); - let entry_node = dag - .entry_node - .as_ref() - .expect("DAG entry node not found") - .clone(); - let entry_exec = state - .queue_template_node(&entry_node, None) - .expect("queue entry node"); - - let queue = Arc::new(Mutex::new(VecDeque::new())); - let backend = MemoryBackend::with_queue(queue.clone()); - let workflow_version_id = backend - .upsert_workflow_version(&WorkflowRegistration { - workflow_name: "test".to_string(), - workflow_version: ir_hash.clone(), - ir_hash, - program_proto, - concurrent: false, - }) - .await - .expect("register workflow version"); - - let worker_pool = crate::workers::InlineWorkerPool::new(HashMap::new()); - let mut runloop = RunLoop::new( - worker_pool, - backend.clone(), - RunLoopSupervisorConfig { - max_concurrent_instances: 25, - executor_shards: 1, - instance_done_batch_size: None, - poll_interval: Duration::from_secs_f64(0.0), - persistence_interval: Duration::from_secs_f64(0.1), - lock_uuid: Uuid::new_v4(), - lock_ttl: Duration::from_secs(15), - lock_heartbeat: Duration::from_secs(5), - evict_sleep_threshold: Duration::from_secs(10), - skip_sleep: false, - active_instance_gauge: None, - }, - ); - let instance_id = Uuid::new_v4(); - queue.lock().expect("queue lock").push_back(QueuedInstance { - workflow_version_id, - schedule_id: None, - dag: None, - entry_node: entry_exec.node_id, - state: Some(state), - action_results: HashMap::new(), - instance_id, - scheduled_at: None, - }); - - runloop - .run() - .await - .expect("runloop should continue after instance failure"); - let instances_done = backend.instances_done(); - assert_eq!(instances_done.len(), 1); - - let done = &instances_done[0]; - assert_eq!(done.executor_id, instance_id); - assert!(done.result.is_none()); - let error = done.error.as_ref().expect("instance error"); - let Value::Object(error_obj) = error else { - panic!("expected error payload object"); - }; - assert_eq!( - error_obj.get("type"), - Some(&Value::String("ExecutionError".to_string())) - ); - let message = error_obj - .get("message") - .and_then(Value::as_str) - .expect("error message"); - assert!(message.contains("variable not found: x")); - } - - #[tokio::test] - async fn test_runloop_executes_for_loop_action_assignments() { - let source = r#" -fn main(input: [limit], output: [result]): - current = 0 - iterations = 0 - for _ in range(limit): - current = @tests.fixtures.test_actions.increment(value=current) - iterations = iterations + 1 - result = @tests.fixtures.test_actions.pack(limit=limit, final=current, iterations=iterations) - return result -"#; - let program = parse_program(source.trim()).expect("parse program"); - let program_proto = program.encode_to_vec(); - let ir_hash = format!("{:x}", Sha256::digest(&program_proto)); - let dag = Arc::new(convert_to_dag(&program).expect("convert to dag")); - - let mut state = RunnerState::new(Some(Arc::clone(&dag)), None, None, false); - let _ = state - .record_assignment( - vec!["limit".to_string()], - &ir::Expr { - kind: Some(ir::expr::Kind::Literal(ir::Literal { - value: Some(ir::literal::Value::IntValue(4)), - })), - span: None, - }, - None, - Some("input limit = 4".to_string()), - ) - .expect("record assignment"); - let entry_node = dag - .entry_node - .as_ref() - .expect("DAG entry node not found") - .clone(); - let entry_exec = state - .queue_template_node(&entry_node, None) - .expect("queue entry node"); - - let queue = Arc::new(Mutex::new(VecDeque::new())); - let backend = MemoryBackend::with_queue(queue.clone()); - let workflow_version_id = backend - .upsert_workflow_version(&WorkflowRegistration { - workflow_name: "test_loop_actions".to_string(), - workflow_version: ir_hash.clone(), - ir_hash, - program_proto, - concurrent: false, - }) - .await - .expect("register workflow version"); - - let mut actions: HashMap = HashMap::new(); - let increment_inputs = Arc::new(Mutex::new(Vec::new())); - let increment_inputs_clone = Arc::clone(&increment_inputs); - actions.insert( - "increment".to_string(), - Arc::new(move |kwargs| { - let increment_inputs = Arc::clone(&increment_inputs_clone); - Box::pin(async move { - let value = kwargs - .get("value") - .and_then(|value| value.as_i64()) - .unwrap_or(0); - increment_inputs - .lock() - .expect("increment inputs lock") - .push(value); - Ok(Value::Number((value + 1).into())) - }) - }), - ); - actions.insert( - "pack".to_string(), - Arc::new(|kwargs| { - Box::pin(async move { - let limit = kwargs.get("limit").cloned().unwrap_or(Value::Null); - let final_value = kwargs.get("final").cloned().unwrap_or(Value::Null); - let iterations = kwargs.get("iterations").cloned().unwrap_or(Value::Null); - Ok(Value::Object( - [ - ("limit".to_string(), limit), - ("final".to_string(), final_value), - ("iterations".to_string(), iterations), - ] - .into_iter() - .collect(), - )) - }) - }), - ); - let worker_pool = crate::workers::InlineWorkerPool::new(actions); - - let mut runloop = RunLoop::new( - worker_pool, - backend.clone(), - RunLoopSupervisorConfig { - max_concurrent_instances: 25, - executor_shards: 1, - instance_done_batch_size: None, - poll_interval: Duration::from_secs_f64(0.0), - persistence_interval: Duration::from_secs_f64(0.1), - lock_uuid: Uuid::new_v4(), - lock_ttl: Duration::from_secs(15), - lock_heartbeat: Duration::from_secs(5), - evict_sleep_threshold: Duration::from_secs(10), - skip_sleep: false, - active_instance_gauge: None, - }, - ); - queue.lock().expect("queue lock").push_back(QueuedInstance { - workflow_version_id, - schedule_id: None, - dag: None, - entry_node: entry_exec.node_id, - state: Some(state), - action_results: HashMap::new(), - instance_id: Uuid::new_v4(), - scheduled_at: None, - }); - - runloop.run().await.expect("runloop"); - let instances_done = backend.instances_done(); - assert_eq!(instances_done.len(), 1); - let done = &instances_done[0]; - let output = done.result.clone().expect("instance result"); - let Value::Object(map) = output else { - panic!("expected output object"); - }; - let Value::Object(result_map) = map - .get("result") - .cloned() - .expect("result payload should include result") - else { - panic!("expected nested result object"); - }; - assert_eq!( - *increment_inputs.lock().expect("increment inputs lock"), - vec![0, 1, 2, 3] - ); - assert_eq!(result_map.get("limit"), Some(&Value::Number(4.into()))); - assert_eq!(result_map.get("final"), Some(&Value::Number(4.into()))); - assert_eq!(result_map.get("iterations"), Some(&Value::Number(4.into()))); - } - - #[tokio::test] - async fn test_instance_poller_send_unblocks_on_stop_notification() { - let (instance_tx, mut instance_rx) = mpsc::channel::(1); - instance_tx - .send(InstanceMessage::Batch { - instances: Vec::new(), - }) - .await - .expect("seed channel"); - - let stop_notify = Arc::new(Notify::new()); - let send_task = tokio::spawn({ - let instance_tx = instance_tx.clone(); - let stop_notify = Arc::clone(&stop_notify); - async move { - send_instance_message_with_stop( - &instance_tx, - InstanceMessage::Batch { - instances: Vec::new(), - }, - &stop_notify, - ) - .await - } - }); - - tokio::time::sleep(Duration::from_millis(20)).await; - stop_notify.notify_waiters(); - let sent = tokio::time::timeout(Duration::from_millis(300), send_task) - .await - .expect("send task should complete") - .expect("send task should not panic"); - assert!(!sent, "send should abort when stop is notified"); - - let _ = instance_rx.recv().await; - } - - #[tokio::test] - async fn test_instance_poller_send_succeeds_when_channel_has_capacity() { - let (instance_tx, mut instance_rx) = mpsc::channel::(1); - let stop_notify = Notify::new(); - let sent = send_instance_message_with_stop( - &instance_tx, - InstanceMessage::Batch { - instances: Vec::new(), - }, - &stop_notify, - ) - .await; - assert!(sent); - - let received = instance_rx.recv().await.expect("queued message"); - match received { - InstanceMessage::Batch { instances } => assert!(instances.is_empty()), - InstanceMessage::Error(err) => panic!("unexpected error message: {err}"), - } - } - - #[tokio::test] - async fn test_runloop_supervisor_restarts_on_depth_limit_backend_errors() { - let queue = Arc::new(Mutex::new(VecDeque::new())); - let backend = - FaultInjectingBackend::with_depth_limit_poll_failures(MemoryBackend::with_queue(queue)); - let worker_pool = crate::workers::InlineWorkerPool::new(HashMap::new()); - let (shutdown_tx, shutdown_rx) = watch::channel(false); - - let supervisor = tokio::spawn(runloop_supervisor( - backend.clone(), - worker_pool, - default_test_config(Uuid::new_v4()), - shutdown_rx, - )); - - tokio::time::sleep(Duration::from_millis(750)).await; - shutdown_tx.send(true).expect("send shutdown"); - tokio::time::timeout(Duration::from_secs(2), supervisor) - .await - .expect("supervisor should stop") - .expect("supervisor task should not panic"); - - assert!( - backend.get_queued_instances_calls() >= 2, - "expected multiple polling attempts while supervisor restarts" - ); - } - - #[tokio::test] - async fn test_runloop_supervisor_reproduces_no_progress_with_continued_queue_growth() { - let queue = Arc::new(Mutex::new(VecDeque::new())); - let backend = - FaultInjectingBackend::with_depth_limit_poll_failures(MemoryBackend::with_queue(queue)); - let worker_pool = crate::workers::InlineWorkerPool::new(HashMap::new()); - let (shutdown_tx, shutdown_rx) = watch::channel(false); - - let supervisor = tokio::spawn(runloop_supervisor( - backend.clone(), - worker_pool, - default_test_config(Uuid::new_v4()), - shutdown_rx, - )); - - for _ in 0..20 { - backend - .queue_instances(&[QueuedInstance { - workflow_version_id: Uuid::new_v4(), - schedule_id: None, - dag: None, - entry_node: Uuid::new_v4(), - state: None, - action_results: HashMap::new(), - instance_id: Uuid::new_v4(), - scheduled_at: None, - }]) - .await - .expect("queue synthetic instance"); - } - - tokio::time::sleep(Duration::from_millis(500)).await; - shutdown_tx.send(true).expect("send shutdown"); - tokio::time::timeout(Duration::from_secs(2), supervisor) - .await - .expect("supervisor should stop") - .expect("supervisor task should not panic"); - - assert!( - backend.get_queued_instances_calls() >= 1, - "expected polling attempts during stall simulation" - ); - assert!( - backend.queue_len() >= 20, - "queued work should continue to grow when poller cannot read instances" - ); - assert_eq!( - backend.instances_done_len(), - 0, - "no instances should complete while poller is failing" - ); - } - - #[tokio::test] - async fn test_runloop_marks_instance_failed_when_rehydrated_state_is_missing_action_result() { - let source = r#" -fn main(input: [x], output: [y]): - y = @tests.fixtures.test_actions.double(value=x) - return y -"#; - let program = parse_program(source.trim()).expect("parse program"); - let program_proto = program.encode_to_vec(); - let ir_hash = format!("{:x}", Sha256::digest(&program_proto)); - let dag = Arc::new(convert_to_dag(&program).expect("convert to dag")); - - let mut state = RunnerState::new(Some(Arc::clone(&dag)), None, None, false); - let _ = state - .record_assignment( - vec!["x".to_string()], - &ir::Expr { - kind: Some(ir::expr::Kind::Literal(ir::Literal { - value: Some(ir::literal::Value::IntValue(4)), - })), - span: None, - }, - None, - Some("input x = 4".to_string()), - ) - .expect("record assignment"); - let template_entry_node = dag - .entry_node - .as_ref() - .expect("DAG entry node not found") - .clone(); - let entry_exec = state - .queue_template_node(&template_entry_node, None) - .expect("queue entry node"); - let mut bootstrap_executor = - RunnerExecutor::new(Arc::clone(&dag), state, HashMap::new(), None); - let bootstrap_step = bootstrap_executor - .increment(&[entry_exec.node_id]) - .expect("bootstrap increment should materialize action node"); - let action_exec = bootstrap_step - .actions - .first() - .expect("bootstrap should queue one action call") - .clone(); - - // Simulate a reclaimed instance whose graph says the action execution node - // has finished, but action_results payload was lost. - bootstrap_executor - .state_mut() - .mark_completed(action_exec.node_id) - .expect("mark action completed"); - bootstrap_executor.state_mut().ready_queue.clear(); - assert!( - bootstrap_executor - .state() - .nodes - .get(&action_exec.node_id) - .is_some_and(|node| node.is_action_call() && node.status == NodeStatus::Completed), - "expected completed action execution node" - ); - let state = bootstrap_executor.state().clone(); - - let queue = Arc::new(Mutex::new(VecDeque::new())); - let backend = MemoryBackend::with_queue(queue.clone()); - let workflow_version_id = backend - .upsert_workflow_version(&WorkflowRegistration { - workflow_name: "test_missing_action_result".to_string(), - workflow_version: ir_hash.clone(), - ir_hash, - program_proto, - concurrent: false, - }) - .await - .expect("register workflow version"); - - let worker_pool = crate::workers::InlineWorkerPool::new(HashMap::new()); - let mut runloop = RunLoop::new( - worker_pool, - backend.clone(), - default_test_config(Uuid::new_v4()), - ); - let instance_id = Uuid::new_v4(); - queue.lock().expect("queue lock").push_back(QueuedInstance { - workflow_version_id, - schedule_id: None, - dag: None, - entry_node: action_exec.node_id, - state: Some(state), - action_results: HashMap::new(), - instance_id, - scheduled_at: None, - }); - - runloop - .run() - .await - .expect("runloop should continue after instance failure"); - let instances_done = backend.instances_done(); - assert_eq!(instances_done.len(), 1); - assert_eq!(instances_done[0].executor_id, instance_id); - let Value::Object(error_obj) = instances_done[0] - .error - .clone() - .expect("instance error payload") - else { - panic!("expected error payload object"); - }; - assert_eq!( - error_obj.get("type"), - Some(&Value::String("ExecutionError".to_string())) - ); - let message = error_obj - .get("message") - .and_then(Value::as_str) - .expect("error message"); - assert!( - message.contains("missing action result for"), - "expected missing action result error, got: {message}" - ); - } - - #[tokio::test] - async fn test_runloop_marks_instance_failed_with_dict_key_error() { - let source = r#" -fn main(input: [], output: [result]): - payload = @tests.fixtures.test_actions.make_payload() - result = payload["missing"] - return result -"#; - let program = parse_program(source.trim()).expect("parse program"); - let program_proto = program.encode_to_vec(); - let ir_hash = format!("{:x}", Sha256::digest(&program_proto)); - let dag = Arc::new(convert_to_dag(&program).expect("convert to dag")); - - let mut state = RunnerState::new(Some(Arc::clone(&dag)), None, None, false); - let entry_node = dag - .entry_node - .as_ref() - .expect("DAG entry node not found") - .clone(); - let entry_exec = state - .queue_template_node(&entry_node, None) - .expect("queue entry node"); - - let queue = Arc::new(Mutex::new(VecDeque::new())); - let backend = MemoryBackend::with_queue(queue.clone()); - let workflow_version_id = backend - .upsert_workflow_version(&WorkflowRegistration { - workflow_name: "test_dict_key_error".to_string(), - workflow_version: ir_hash.clone(), - ir_hash, - program_proto, - concurrent: false, - }) - .await - .expect("register workflow version"); - - let mut actions: HashMap = HashMap::new(); - actions.insert( - "make_payload".to_string(), - Arc::new(|_kwargs| { - Box::pin(async move { - Ok(Value::Object( - [("present".to_string(), Value::Number(1.into()))] - .into_iter() - .collect(), - )) - }) - }), - ); - let worker_pool = crate::workers::InlineWorkerPool::new(actions); - let mut runloop = RunLoop::new( - worker_pool, - backend.clone(), - default_test_config(Uuid::new_v4()), - ); - let instance_id = Uuid::new_v4(); - queue.lock().expect("queue lock").push_back(QueuedInstance { - workflow_version_id, - schedule_id: None, - dag: None, - entry_node: entry_exec.node_id, - state: Some(state), - action_results: HashMap::new(), - instance_id, - scheduled_at: None, - }); - - runloop - .run() - .await - .expect("runloop should continue after instance failure"); - let instances_done = backend.instances_done(); - assert_eq!(instances_done.len(), 1); - assert_eq!(instances_done[0].executor_id, instance_id); - let Value::Object(error_obj) = instances_done[0] - .error - .clone() - .expect("instance error payload") - else { - panic!("expected error payload object"); - }; - let message = error_obj - .get("message") - .and_then(Value::as_str) - .expect("error message"); - assert!( - message.contains("dict has no key"), - "expected dict key error, got: {message}" - ); - } - - #[tokio::test] - async fn test_runloop_marks_instance_failed_with_attribute_error() { - let source = r#" -fn main(input: [], output: [result]): - payload = @tests.fixtures.test_actions.make_number() - result = payload.missing - return result -"#; - let program = parse_program(source.trim()).expect("parse program"); - let program_proto = program.encode_to_vec(); - let ir_hash = format!("{:x}", Sha256::digest(&program_proto)); - let dag = Arc::new(convert_to_dag(&program).expect("convert to dag")); - - let mut state = RunnerState::new(Some(Arc::clone(&dag)), None, None, false); - let entry_node = dag - .entry_node - .as_ref() - .expect("DAG entry node not found") - .clone(); - let entry_exec = state - .queue_template_node(&entry_node, None) - .expect("queue entry node"); - - let queue = Arc::new(Mutex::new(VecDeque::new())); - let backend = MemoryBackend::with_queue(queue.clone()); - let workflow_version_id = backend - .upsert_workflow_version(&WorkflowRegistration { - workflow_name: "test_attribute_error".to_string(), - workflow_version: ir_hash.clone(), - ir_hash, - program_proto, - concurrent: false, - }) - .await - .expect("register workflow version"); - - let mut actions: HashMap = HashMap::new(); - actions.insert( - "make_number".to_string(), - Arc::new(|_kwargs| Box::pin(async move { Ok(Value::Number(7.into())) })), - ); - let worker_pool = crate::workers::InlineWorkerPool::new(actions); - let mut runloop = RunLoop::new( - worker_pool, - backend.clone(), - default_test_config(Uuid::new_v4()), - ); - let instance_id = Uuid::new_v4(); - queue.lock().expect("queue lock").push_back(QueuedInstance { - workflow_version_id, - schedule_id: None, - dag: None, - entry_node: entry_exec.node_id, - state: Some(state), - action_results: HashMap::new(), - instance_id, - scheduled_at: None, - }); - - runloop - .run() - .await - .expect("runloop should continue after instance failure"); - let instances_done = backend.instances_done(); - assert_eq!(instances_done.len(), 1); - assert_eq!(instances_done[0].executor_id, instance_id); - let Value::Object(error_obj) = instances_done[0] - .error - .clone() - .expect("instance error payload") - else { - panic!("expected error payload object"); - }; - let message = error_obj - .get("message") - .and_then(Value::as_str) - .expect("error message"); - assert!( - message.contains("attribute not found"), - "expected attribute error, got: {message}" - ); - } - - #[test] - fn test_lock_mismatches_ignores_expired_lock_with_matching_owner() { - let backend = MemoryBackend::new(); - let worker_pool = crate::workers::InlineWorkerPool::new(HashMap::new()); - let lock_uuid = Uuid::new_v4(); - let runloop = RunLoop::new(worker_pool, backend, default_test_config(lock_uuid)); - - let instance_id = Uuid::new_v4(); - let statuses = vec![InstanceLockStatus { - instance_id, - lock_uuid: Some(lock_uuid), - lock_expires_at: Some(Utc::now() - chrono::Duration::seconds(60)), - }]; - assert!( - runloop.lock_mismatches(&statuses).is_empty(), - "matching lock UUID should not evict solely due to stale expiry" - ); - - let mismatched = vec![InstanceLockStatus { - instance_id, - lock_uuid: Some(Uuid::new_v4()), - lock_expires_at: Some(Utc::now() + chrono::Duration::seconds(60)), - }]; - let evict_ids = runloop.lock_mismatches(&mismatched); - assert_eq!(evict_ids, HashSet::from([instance_id])); - } -} +mod tests; diff --git a/crates/waymark/src/waymark_core/runloop/tests.rs b/crates/waymark/src/waymark_core/runloop/tests.rs new file mode 100644 index 00000000..53d6c634 --- /dev/null +++ b/crates/waymark/src/waymark_core/runloop/tests.rs @@ -0,0 +1,964 @@ +use super::*; +use std::collections::{HashMap, VecDeque}; +use std::sync::{Arc, Mutex}; +use std::time::Duration; + +use chrono::Utc; +use prost::Message; +use sha2::{Digest, Sha256}; +use waymark_backend_fault_injection::FaultInjectingBackend; +use waymark_backend_memory::MemoryBackend; +use waymark_core_backend::{ActionAttemptStatus, CoreBackend}; +use waymark_workflow_registry_backend::WorkflowRegistration; + +use crate::messages::ast as ir; +use crate::workers::ActionCallable; + +use waymark_dag::convert_to_dag; +use waymark_ir_parser::parse_program; +use waymark_runner_state::NodeStatus; +use waymark_runner_state::RunnerState; + +fn default_test_config(lock_uuid: Uuid) -> RunLoopSupervisorConfig { + RunLoopSupervisorConfig { + max_concurrent_instances: 25, + executor_shards: 1, + instance_done_batch_size: None, + poll_interval: Duration::from_millis(10), + persistence_interval: Duration::from_millis(10), + lock_uuid, + lock_ttl: Duration::from_secs(15), + lock_heartbeat: Duration::from_secs(5), + evict_sleep_threshold: Duration::from_secs(10), + skip_sleep: false, + active_instance_gauge: None, + } +} + +#[tokio::test] +async fn test_runloop_executes_actions() { + let source = r#" +fn main(input: [x], output: [y]): + y = @tests.fixtures.test_actions.double(value=x) + return y +"#; + let program = parse_program(source.trim()).expect("parse program"); + let program_proto = program.encode_to_vec(); + let ir_hash = format!("{:x}", Sha256::digest(&program_proto)); + let dag = Arc::new(convert_to_dag(&program).expect("convert to dag")); + + let mut state = RunnerState::new(Some(Arc::clone(&dag)), None, None, false); + let _ = state + .record_assignment( + vec!["x".to_string()], + &ir::Expr { + kind: Some(ir::expr::Kind::Literal(ir::Literal { + value: Some(ir::literal::Value::IntValue(4)), + })), + span: None, + }, + None, + Some("input x = 4".to_string()), + ) + .expect("record assignment"); + let entry_node = dag + .entry_node + .as_ref() + .expect("DAG entry node not found") + .clone(); + let entry_exec = state + .queue_template_node(&entry_node, None) + .expect("queue entry node"); + + let queue = Arc::new(Mutex::new(VecDeque::new())); + let backend = MemoryBackend::with_queue(queue.clone()); + let workflow_version_id = backend + .upsert_workflow_version(&WorkflowRegistration { + workflow_name: "test".to_string(), + workflow_version: ir_hash.clone(), + ir_hash, + program_proto, + concurrent: false, + }) + .await + .expect("register workflow version"); + + let mut actions: HashMap = HashMap::new(); + actions.insert( + "double".to_string(), + Arc::new(|kwargs| { + Box::pin(async move { + let value = kwargs + .get("value") + .and_then(|value| value.as_i64()) + .unwrap_or(0); + Ok(Value::Number((value * 2).into())) + }) + }), + ); + let worker_pool = crate::workers::InlineWorkerPool::new(actions); + + let mut runloop = RunLoop::new( + worker_pool, + backend.clone(), + RunLoopSupervisorConfig { + max_concurrent_instances: 25, + executor_shards: 1, + instance_done_batch_size: None, + poll_interval: Duration::from_secs_f64(0.0), + persistence_interval: Duration::from_secs_f64(0.1), + lock_uuid: Uuid::new_v4(), + lock_ttl: Duration::from_secs(15), + lock_heartbeat: Duration::from_secs(5), + evict_sleep_threshold: Duration::from_secs(10), + skip_sleep: false, + active_instance_gauge: None, + }, + ); + queue.lock().expect("queue lock").push_back(QueuedInstance { + workflow_version_id, + schedule_id: None, + dag: None, + entry_node: entry_exec.node_id, + state: Some(state), + action_results: HashMap::new(), + instance_id: Uuid::new_v4(), + scheduled_at: None, + }); + + tracing::info!("1"); + + runloop.run().await.expect("runloop"); + + tracing::info!("1"); + + let instances_done = backend.instances_done(); + assert_eq!(instances_done.len(), 1); + let done = &instances_done[0]; + let output = done.result.clone().expect("instance result"); + let Value::Object(map) = output else { + panic!("expected output object"); + }; + assert_eq!(map.get("y"), Some(&Value::Number(8.into()))); +} + +#[tokio::test] +async fn test_runloop_times_out_action_and_persists_timestamps() { + let source = r#" +fn main(input: [], output: [y]): + y = @tests.fixtures.test_actions.hang()[timeout: 1 s] + return y +"#; + let program = parse_program(source.trim()).expect("parse program"); + let program_proto = program.encode_to_vec(); + let ir_hash = format!("{:x}", Sha256::digest(&program_proto)); + let dag = Arc::new(convert_to_dag(&program).expect("convert to dag")); + + let mut state = RunnerState::new(Some(Arc::clone(&dag)), None, None, false); + let entry_node = dag + .entry_node + .as_ref() + .expect("DAG entry node not found") + .clone(); + let entry_exec = state + .queue_template_node(&entry_node, None) + .expect("queue entry node"); + + let queue = Arc::new(Mutex::new(VecDeque::new())); + let backend = MemoryBackend::with_queue(queue.clone()); + let workflow_version_id = backend + .upsert_workflow_version(&WorkflowRegistration { + workflow_name: "test_timeout".to_string(), + workflow_version: ir_hash.clone(), + ir_hash, + program_proto, + concurrent: false, + }) + .await + .expect("register workflow version"); + + let mut actions: HashMap = HashMap::new(); + actions.insert( + "hang".to_string(), + Arc::new(|_kwargs| { + Box::pin(async move { + tokio::time::sleep(Duration::from_secs(5)).await; + Ok(Value::String("late".to_string())) + }) + }), + ); + let worker_pool = crate::workers::InlineWorkerPool::new(actions); + + let mut runloop = RunLoop::new( + worker_pool, + backend.clone(), + RunLoopSupervisorConfig { + max_concurrent_instances: 25, + executor_shards: 1, + instance_done_batch_size: None, + poll_interval: Duration::from_secs_f64(0.0), + persistence_interval: Duration::from_secs_f64(0.05), + lock_uuid: Uuid::new_v4(), + lock_ttl: Duration::from_secs(15), + lock_heartbeat: Duration::from_secs(5), + evict_sleep_threshold: Duration::from_secs(10), + skip_sleep: false, + active_instance_gauge: None, + }, + ); + queue.lock().expect("queue lock").push_back(QueuedInstance { + workflow_version_id, + schedule_id: None, + dag: None, + entry_node: entry_exec.node_id, + state: Some(state), + action_results: HashMap::new(), + instance_id: Uuid::new_v4(), + scheduled_at: None, + }); + + runloop.run().await.expect("runloop"); + + let actions_done = backend.actions_done(); + assert_eq!(actions_done.len(), 1); + let action_done = &actions_done[0]; + assert_eq!(action_done.status, ActionAttemptStatus::TimedOut); + assert!(action_done.started_at.is_some()); + assert!(action_done.completed_at.is_some()); + assert!(action_done.duration_ms.is_some()); + + let execution_id = action_done.execution_id; + let graph_updates = backend.graph_updates(); + let mut saw_running_snapshot = false; + let mut saw_failed_snapshot = false; + for update in graph_updates { + let Some(node) = update.nodes.get(&execution_id) else { + continue; + }; + if node.status == NodeStatus::Running && node.started_at.is_some() { + saw_running_snapshot = true; + } + if node.status == NodeStatus::Failed + && node.started_at.is_some() + && node.completed_at.is_some() + { + saw_failed_snapshot = true; + } + } + assert!(saw_running_snapshot, "expected running graph snapshot"); + assert!(saw_failed_snapshot, "expected failed graph snapshot"); + + let instances_done = backend.instances_done(); + assert_eq!(instances_done.len(), 1); + assert!(instances_done[0].result.is_none()); + let Value::Object(error_obj) = instances_done[0] + .error + .clone() + .expect("instance error payload") + else { + panic!("expected error payload object"); + }; + assert_eq!( + error_obj.get("type"), + Some(&Value::String("ActionTimeout".to_string())) + ); +} + +#[tokio::test] +async fn test_runloop_marks_instance_failed_on_executor_error() { + let source = r#" +fn main(input: [x], output: [y]): + y = @tests.fixtures.test_actions.double(value=x) + return y +"#; + let program = parse_program(source.trim()).expect("parse program"); + let program_proto = program.encode_to_vec(); + let ir_hash = format!("{:x}", Sha256::digest(&program_proto)); + let dag = Arc::new(convert_to_dag(&program).expect("convert to dag")); + + // Intentionally omit input assignment so action kwarg resolution fails at runtime. + let mut state = RunnerState::new(Some(Arc::clone(&dag)), None, None, false); + let entry_node = dag + .entry_node + .as_ref() + .expect("DAG entry node not found") + .clone(); + let entry_exec = state + .queue_template_node(&entry_node, None) + .expect("queue entry node"); + + let queue = Arc::new(Mutex::new(VecDeque::new())); + let backend = MemoryBackend::with_queue(queue.clone()); + let workflow_version_id = backend + .upsert_workflow_version(&WorkflowRegistration { + workflow_name: "test".to_string(), + workflow_version: ir_hash.clone(), + ir_hash, + program_proto, + concurrent: false, + }) + .await + .expect("register workflow version"); + + let worker_pool = crate::workers::InlineWorkerPool::new(HashMap::new()); + let mut runloop = RunLoop::new( + worker_pool, + backend.clone(), + RunLoopSupervisorConfig { + max_concurrent_instances: 25, + executor_shards: 1, + instance_done_batch_size: None, + poll_interval: Duration::from_secs_f64(0.0), + persistence_interval: Duration::from_secs_f64(0.1), + lock_uuid: Uuid::new_v4(), + lock_ttl: Duration::from_secs(15), + lock_heartbeat: Duration::from_secs(5), + evict_sleep_threshold: Duration::from_secs(10), + skip_sleep: false, + active_instance_gauge: None, + }, + ); + let instance_id = Uuid::new_v4(); + queue.lock().expect("queue lock").push_back(QueuedInstance { + workflow_version_id, + schedule_id: None, + dag: None, + entry_node: entry_exec.node_id, + state: Some(state), + action_results: HashMap::new(), + instance_id, + scheduled_at: None, + }); + + runloop + .run() + .await + .expect("runloop should continue after instance failure"); + let instances_done = backend.instances_done(); + assert_eq!(instances_done.len(), 1); + + let done = &instances_done[0]; + assert_eq!(done.executor_id, instance_id); + assert!(done.result.is_none()); + let error = done.error.as_ref().expect("instance error"); + let Value::Object(error_obj) = error else { + panic!("expected error payload object"); + }; + assert_eq!( + error_obj.get("type"), + Some(&Value::String("ExecutionError".to_string())) + ); + let message = error_obj + .get("message") + .and_then(Value::as_str) + .expect("error message"); + assert!(message.contains("variable not found: x")); +} + +#[tokio::test] +async fn test_runloop_executes_for_loop_action_assignments() { + let source = r#" +fn main(input: [limit], output: [result]): + current = 0 + iterations = 0 + for _ in range(limit): + current = @tests.fixtures.test_actions.increment(value=current) + iterations = iterations + 1 + result = @tests.fixtures.test_actions.pack(limit=limit, final=current, iterations=iterations) + return result +"#; + let program = parse_program(source.trim()).expect("parse program"); + let program_proto = program.encode_to_vec(); + let ir_hash = format!("{:x}", Sha256::digest(&program_proto)); + let dag = Arc::new(convert_to_dag(&program).expect("convert to dag")); + + let mut state = RunnerState::new(Some(Arc::clone(&dag)), None, None, false); + let _ = state + .record_assignment( + vec!["limit".to_string()], + &ir::Expr { + kind: Some(ir::expr::Kind::Literal(ir::Literal { + value: Some(ir::literal::Value::IntValue(4)), + })), + span: None, + }, + None, + Some("input limit = 4".to_string()), + ) + .expect("record assignment"); + let entry_node = dag + .entry_node + .as_ref() + .expect("DAG entry node not found") + .clone(); + let entry_exec = state + .queue_template_node(&entry_node, None) + .expect("queue entry node"); + + let queue = Arc::new(Mutex::new(VecDeque::new())); + let backend = MemoryBackend::with_queue(queue.clone()); + let workflow_version_id = backend + .upsert_workflow_version(&WorkflowRegistration { + workflow_name: "test_loop_actions".to_string(), + workflow_version: ir_hash.clone(), + ir_hash, + program_proto, + concurrent: false, + }) + .await + .expect("register workflow version"); + + let mut actions: HashMap = HashMap::new(); + let increment_inputs = Arc::new(Mutex::new(Vec::new())); + let increment_inputs_clone = Arc::clone(&increment_inputs); + actions.insert( + "increment".to_string(), + Arc::new(move |kwargs| { + let increment_inputs = Arc::clone(&increment_inputs_clone); + Box::pin(async move { + let value = kwargs + .get("value") + .and_then(|value| value.as_i64()) + .unwrap_or(0); + increment_inputs + .lock() + .expect("increment inputs lock") + .push(value); + Ok(Value::Number((value + 1).into())) + }) + }), + ); + actions.insert( + "pack".to_string(), + Arc::new(|kwargs| { + Box::pin(async move { + let limit = kwargs.get("limit").cloned().unwrap_or(Value::Null); + let final_value = kwargs.get("final").cloned().unwrap_or(Value::Null); + let iterations = kwargs.get("iterations").cloned().unwrap_or(Value::Null); + Ok(Value::Object( + [ + ("limit".to_string(), limit), + ("final".to_string(), final_value), + ("iterations".to_string(), iterations), + ] + .into_iter() + .collect(), + )) + }) + }), + ); + let worker_pool = crate::workers::InlineWorkerPool::new(actions); + + let mut runloop = RunLoop::new( + worker_pool, + backend.clone(), + RunLoopSupervisorConfig { + max_concurrent_instances: 25, + executor_shards: 1, + instance_done_batch_size: None, + poll_interval: Duration::from_secs_f64(0.0), + persistence_interval: Duration::from_secs_f64(0.1), + lock_uuid: Uuid::new_v4(), + lock_ttl: Duration::from_secs(15), + lock_heartbeat: Duration::from_secs(5), + evict_sleep_threshold: Duration::from_secs(10), + skip_sleep: false, + active_instance_gauge: None, + }, + ); + queue.lock().expect("queue lock").push_back(QueuedInstance { + workflow_version_id, + schedule_id: None, + dag: None, + entry_node: entry_exec.node_id, + state: Some(state), + action_results: HashMap::new(), + instance_id: Uuid::new_v4(), + scheduled_at: None, + }); + + runloop.run().await.expect("runloop"); + let instances_done = backend.instances_done(); + assert_eq!(instances_done.len(), 1); + let done = &instances_done[0]; + let output = done.result.clone().expect("instance result"); + let Value::Object(map) = output else { + panic!("expected output object"); + }; + let Value::Object(result_map) = map + .get("result") + .cloned() + .expect("result payload should include result") + else { + panic!("expected nested result object"); + }; + assert_eq!( + *increment_inputs.lock().expect("increment inputs lock"), + vec![0, 1, 2, 3] + ); + assert_eq!(result_map.get("limit"), Some(&Value::Number(4.into()))); + assert_eq!(result_map.get("final"), Some(&Value::Number(4.into()))); + assert_eq!(result_map.get("iterations"), Some(&Value::Number(4.into()))); +} + +#[tokio::test] +async fn test_instance_poller_send_unblocks_on_stop_notification() { + let (instance_tx, mut instance_rx) = mpsc::channel::(1); + instance_tx + .send(InstanceMessage::Batch { + instances: Vec::new(), + }) + .await + .expect("seed channel"); + + let shutdown_token = tokio_util::sync::CancellationToken::new(); + let send_task = tokio::spawn({ + let instance_tx = instance_tx.clone(); + let shutdown_token = shutdown_token.clone(); + async move { + send_with_stop( + &instance_tx, + InstanceMessage::Batch { + instances: Vec::new(), + }, + shutdown_token.cancelled(), + "instance message", + ) + .await + } + }); + + tokio::time::sleep(Duration::from_millis(20)).await; + shutdown_token.cancel(); + let sent = tokio::time::timeout(Duration::from_millis(300), send_task) + .await + .expect("send task should complete") + .expect("send task should not panic"); + assert!(!sent, "send should abort when stop is notified"); + + let _ = instance_rx.recv().await; +} + +#[tokio::test] +async fn test_instance_poller_send_succeeds_when_channel_has_capacity() { + let (instance_tx, mut instance_rx) = mpsc::channel::(1); + let shutdown_token = tokio_util::sync::CancellationToken::new(); + let sent = send_with_stop( + &instance_tx, + InstanceMessage::Batch { + instances: Vec::new(), + }, + shutdown_token.cancelled(), + "instance message", + ) + .await; + assert!(sent); + + let received = instance_rx.recv().await.expect("queued message"); + match received { + InstanceMessage::Batch { instances } => assert!(instances.is_empty()), + InstanceMessage::Error(err) => panic!("unexpected error message: {err}"), + } +} + +#[tokio::test] +async fn test_runloop_supervisor_restarts_on_depth_limit_backend_errors() { + let queue = Arc::new(Mutex::new(VecDeque::new())); + let backend = + FaultInjectingBackend::with_depth_limit_poll_failures(MemoryBackend::with_queue(queue)); + let worker_pool = crate::workers::InlineWorkerPool::new(HashMap::new()); + let shutdown_token = tokio_util::sync::CancellationToken::new(); + + let supervisor = tokio::spawn(runloop_supervisor( + backend.clone(), + worker_pool, + default_test_config(Uuid::new_v4()), + shutdown_token.clone(), + )); + + tokio::time::sleep(Duration::from_millis(750)).await; + shutdown_token.cancel(); + tokio::time::timeout(Duration::from_secs(2), supervisor) + .await + .expect("supervisor should stop") + .expect("supervisor task should not panic"); + + assert!( + backend.get_queued_instances_calls() >= 2, + "expected multiple polling attempts while supervisor restarts" + ); +} + +#[tokio::test] +async fn test_runloop_supervisor_reproduces_no_progress_with_continued_queue_growth() { + let queue = Arc::new(Mutex::new(VecDeque::new())); + let backend = + FaultInjectingBackend::with_depth_limit_poll_failures(MemoryBackend::with_queue(queue)); + let worker_pool = crate::workers::InlineWorkerPool::new(HashMap::new()); + let shutdown_token = tokio_util::sync::CancellationToken::new(); + + let supervisor = tokio::spawn(runloop_supervisor( + backend.clone(), + worker_pool, + default_test_config(Uuid::new_v4()), + shutdown_token.clone(), + )); + + for _ in 0..20 { + backend + .queue_instances(&[QueuedInstance { + workflow_version_id: Uuid::new_v4(), + schedule_id: None, + dag: None, + entry_node: Uuid::new_v4(), + state: None, + action_results: HashMap::new(), + instance_id: Uuid::new_v4(), + scheduled_at: None, + }]) + .await + .expect("queue synthetic instance"); + } + + tokio::time::sleep(Duration::from_millis(500)).await; + shutdown_token.cancel(); + tokio::time::timeout(Duration::from_secs(2), supervisor) + .await + .expect("supervisor should stop") + .expect("supervisor task should not panic"); + + assert!( + backend.get_queued_instances_calls() >= 1, + "expected polling attempts during stall simulation" + ); + assert!( + backend.queue_len() >= 20, + "queued work should continue to grow when poller cannot read instances" + ); + assert_eq!( + backend.instances_done_len(), + 0, + "no instances should complete while poller is failing" + ); +} + +#[tokio::test] +async fn test_runloop_marks_instance_failed_when_rehydrated_state_is_missing_action_result() { + let source = r#" +fn main(input: [x], output: [y]): + y = @tests.fixtures.test_actions.double(value=x) + return y +"#; + let program = parse_program(source.trim()).expect("parse program"); + let program_proto = program.encode_to_vec(); + let ir_hash = format!("{:x}", Sha256::digest(&program_proto)); + let dag = Arc::new(convert_to_dag(&program).expect("convert to dag")); + + let mut state = RunnerState::new(Some(Arc::clone(&dag)), None, None, false); + let _ = state + .record_assignment( + vec!["x".to_string()], + &ir::Expr { + kind: Some(ir::expr::Kind::Literal(ir::Literal { + value: Some(ir::literal::Value::IntValue(4)), + })), + span: None, + }, + None, + Some("input x = 4".to_string()), + ) + .expect("record assignment"); + let template_entry_node = dag + .entry_node + .as_ref() + .expect("DAG entry node not found") + .clone(); + let entry_exec = state + .queue_template_node(&template_entry_node, None) + .expect("queue entry node"); + let mut bootstrap_executor = RunnerExecutor::new(Arc::clone(&dag), state, HashMap::new(), None); + let bootstrap_step = bootstrap_executor + .increment(&[entry_exec.node_id]) + .expect("bootstrap increment should materialize action node"); + let action_exec = bootstrap_step + .actions + .first() + .expect("bootstrap should queue one action call") + .clone(); + + // Simulate a reclaimed instance whose graph says the action execution node + // has finished, but action_results payload was lost. + bootstrap_executor + .state_mut() + .mark_completed(action_exec.node_id) + .expect("mark action completed"); + bootstrap_executor.state_mut().ready_queue.clear(); + assert!( + bootstrap_executor + .state() + .nodes + .get(&action_exec.node_id) + .is_some_and(|node| node.is_action_call() && node.status == NodeStatus::Completed), + "expected completed action execution node" + ); + let state = bootstrap_executor.state().clone(); + + let queue = Arc::new(Mutex::new(VecDeque::new())); + let backend = MemoryBackend::with_queue(queue.clone()); + let workflow_version_id = backend + .upsert_workflow_version(&WorkflowRegistration { + workflow_name: "test_missing_action_result".to_string(), + workflow_version: ir_hash.clone(), + ir_hash, + program_proto, + concurrent: false, + }) + .await + .expect("register workflow version"); + + let worker_pool = crate::workers::InlineWorkerPool::new(HashMap::new()); + let mut runloop = RunLoop::new( + worker_pool, + backend.clone(), + default_test_config(Uuid::new_v4()), + ); + let instance_id = Uuid::new_v4(); + queue.lock().expect("queue lock").push_back(QueuedInstance { + workflow_version_id, + schedule_id: None, + dag: None, + entry_node: action_exec.node_id, + state: Some(state), + action_results: HashMap::new(), + instance_id, + scheduled_at: None, + }); + + runloop + .run() + .await + .expect("runloop should continue after instance failure"); + let instances_done = backend.instances_done(); + assert_eq!(instances_done.len(), 1); + assert_eq!(instances_done[0].executor_id, instance_id); + let Value::Object(error_obj) = instances_done[0] + .error + .clone() + .expect("instance error payload") + else { + panic!("expected error payload object"); + }; + assert_eq!( + error_obj.get("type"), + Some(&Value::String("ExecutionError".to_string())) + ); + let message = error_obj + .get("message") + .and_then(Value::as_str) + .expect("error message"); + assert!( + message.contains("missing action result for"), + "expected missing action result error, got: {message}" + ); +} + +#[tokio::test] +async fn test_runloop_marks_instance_failed_with_dict_key_error() { + let source = r#" +fn main(input: [], output: [result]): + payload = @tests.fixtures.test_actions.make_payload() + result = payload["missing"] + return result +"#; + let program = parse_program(source.trim()).expect("parse program"); + let program_proto = program.encode_to_vec(); + let ir_hash = format!("{:x}", Sha256::digest(&program_proto)); + let dag = Arc::new(convert_to_dag(&program).expect("convert to dag")); + + let mut state = RunnerState::new(Some(Arc::clone(&dag)), None, None, false); + let entry_node = dag + .entry_node + .as_ref() + .expect("DAG entry node not found") + .clone(); + let entry_exec = state + .queue_template_node(&entry_node, None) + .expect("queue entry node"); + + let queue = Arc::new(Mutex::new(VecDeque::new())); + let backend = MemoryBackend::with_queue(queue.clone()); + let workflow_version_id = backend + .upsert_workflow_version(&WorkflowRegistration { + workflow_name: "test_dict_key_error".to_string(), + workflow_version: ir_hash.clone(), + ir_hash, + program_proto, + concurrent: false, + }) + .await + .expect("register workflow version"); + + let mut actions: HashMap = HashMap::new(); + actions.insert( + "make_payload".to_string(), + Arc::new(|_kwargs| { + Box::pin(async move { + Ok(Value::Object( + [("present".to_string(), Value::Number(1.into()))] + .into_iter() + .collect(), + )) + }) + }), + ); + let worker_pool = crate::workers::InlineWorkerPool::new(actions); + let mut runloop = RunLoop::new( + worker_pool, + backend.clone(), + default_test_config(Uuid::new_v4()), + ); + let instance_id = Uuid::new_v4(); + queue.lock().expect("queue lock").push_back(QueuedInstance { + workflow_version_id, + schedule_id: None, + dag: None, + entry_node: entry_exec.node_id, + state: Some(state), + action_results: HashMap::new(), + instance_id, + scheduled_at: None, + }); + + runloop + .run() + .await + .expect("runloop should continue after instance failure"); + let instances_done = backend.instances_done(); + assert_eq!(instances_done.len(), 1); + assert_eq!(instances_done[0].executor_id, instance_id); + let Value::Object(error_obj) = instances_done[0] + .error + .clone() + .expect("instance error payload") + else { + panic!("expected error payload object"); + }; + let message = error_obj + .get("message") + .and_then(Value::as_str) + .expect("error message"); + assert!( + message.contains("dict has no key"), + "expected dict key error, got: {message}" + ); +} + +#[tokio::test] +async fn test_runloop_marks_instance_failed_with_attribute_error() { + let source = r#" +fn main(input: [], output: [result]): + payload = @tests.fixtures.test_actions.make_number() + result = payload.missing + return result +"#; + let program = parse_program(source.trim()).expect("parse program"); + let program_proto = program.encode_to_vec(); + let ir_hash = format!("{:x}", Sha256::digest(&program_proto)); + let dag = Arc::new(convert_to_dag(&program).expect("convert to dag")); + + let mut state = RunnerState::new(Some(Arc::clone(&dag)), None, None, false); + let entry_node = dag + .entry_node + .as_ref() + .expect("DAG entry node not found") + .clone(); + let entry_exec = state + .queue_template_node(&entry_node, None) + .expect("queue entry node"); + + let queue = Arc::new(Mutex::new(VecDeque::new())); + let backend = MemoryBackend::with_queue(queue.clone()); + let workflow_version_id = backend + .upsert_workflow_version(&WorkflowRegistration { + workflow_name: "test_attribute_error".to_string(), + workflow_version: ir_hash.clone(), + ir_hash, + program_proto, + concurrent: false, + }) + .await + .expect("register workflow version"); + + let mut actions: HashMap = HashMap::new(); + actions.insert( + "make_number".to_string(), + Arc::new(|_kwargs| Box::pin(async move { Ok(Value::Number(7.into())) })), + ); + let worker_pool = crate::workers::InlineWorkerPool::new(actions); + let mut runloop = RunLoop::new( + worker_pool, + backend.clone(), + default_test_config(Uuid::new_v4()), + ); + let instance_id = Uuid::new_v4(); + queue.lock().expect("queue lock").push_back(QueuedInstance { + workflow_version_id, + schedule_id: None, + dag: None, + entry_node: entry_exec.node_id, + state: Some(state), + action_results: HashMap::new(), + instance_id, + scheduled_at: None, + }); + + runloop + .run() + .await + .expect("runloop should continue after instance failure"); + let instances_done = backend.instances_done(); + assert_eq!(instances_done.len(), 1); + assert_eq!(instances_done[0].executor_id, instance_id); + let Value::Object(error_obj) = instances_done[0] + .error + .clone() + .expect("instance error payload") + else { + panic!("expected error payload object"); + }; + let message = error_obj + .get("message") + .and_then(Value::as_str) + .expect("error message"); + assert!( + message.contains("attribute not found"), + "expected attribute error, got: {message}" + ); +} + +#[test] +fn test_lock_mismatches_ignores_expired_lock_with_matching_owner() { + let backend = MemoryBackend::new(); + let worker_pool = crate::workers::InlineWorkerPool::new(HashMap::new()); + let lock_uuid = Uuid::new_v4(); + let runloop = RunLoop::new(worker_pool, backend, default_test_config(lock_uuid)); + + let instance_id = Uuid::new_v4(); + let statuses = vec![InstanceLockStatus { + instance_id, + lock_uuid: Some(lock_uuid), + lock_expires_at: Some(Utc::now() - chrono::Duration::seconds(60)), + }]; + assert!( + runloop.lock_mismatches(&statuses).is_empty(), + "matching lock UUID should not evict solely due to stale expiry" + ); + + let mismatched = vec![InstanceLockStatus { + instance_id, + lock_uuid: Some(Uuid::new_v4()), + lock_expires_at: Some(Utc::now() + chrono::Duration::seconds(60)), + }]; + let evict_ids = runloop.lock_mismatches(&mismatched); + assert_eq!(evict_ids, HashSet::from([instance_id])); +} diff --git a/crates/waymark/src/webapp/server.rs b/crates/waymark/src/webapp/server.rs index 43818ca3..9afda350 100644 --- a/crates/waymark/src/webapp/server.rs +++ b/crates/waymark/src/webapp/server.rs @@ -16,12 +16,15 @@ use tera::{Context as TeraContext, Tera}; use tokio::net::TcpListener; use tracing::{error, info}; use uuid::Uuid; +use waymark_webapp_backend::WebappBackend; +use waymark_webapp_core::WorkerStatus; -use super::types::{ +use waymark_webapp_core::{ ActionLogsResponse, FilterValuesResponse, HealthResponse, InstanceExportInfo, TimelineEntry, - WebappConfig, WorkflowInstanceExport, WorkflowRunDataResponse, + WorkflowInstanceExport, WorkflowRunDataResponse, }; -use crate::backends::WebappBackend; + +use crate::WebappConfig; // Embed templates at compile time const TEMPLATE_BASE: &str = include_str!("../../templates/base.html"); @@ -367,7 +370,7 @@ async fn get_action_logs( let logs: Vec<_> = timeline .into_iter() .filter(|e| e.action_id == action_id_str) - .map(|e| super::types::ActionLogEntry { + .map(|e| waymark_webapp_core::ActionLogEntry { action_id: e.action_id, action_name: e.action_name, module_name: e.module_name, @@ -736,7 +739,7 @@ struct InvocationRow { fn render_invocations_page( templates: &Tera, - instances: &[super::types::InstanceSummary], + instances: &[waymark_webapp_core::InstanceSummary], current_page: i64, total_pages: i64, search_query: Option, @@ -812,8 +815,8 @@ struct GraphNode { fn render_instance_detail_page( templates: &Tera, - instance: &super::types::InstanceDetail, - graph: Option, + instance: &waymark_webapp_core::InstanceDetail, + graph: Option, ) -> String { let graph_data = graph .as_ref() @@ -843,8 +846,8 @@ fn render_instance_detail_page( render_template(templates, "workflow_run.html", &context) } -fn build_graph_data(graph: &super::types::ExecutionGraphView) -> GraphData { - let action_nodes: Vec<&super::types::ExecutionNodeView> = graph +fn build_graph_data(graph: &waymark_webapp_core::ExecutionGraphView) -> GraphData { + let action_nodes: Vec<&waymark_webapp_core::ExecutionNodeView> = graph .nodes .iter() .filter(|node| is_action_node(&node.node_type)) @@ -1055,7 +1058,7 @@ struct ScheduleRow { fn render_schedules_page( templates: &Tera, - schedules: &[super::types::ScheduleSummary], + schedules: &[waymark_webapp_core::ScheduleSummary], current_page: i64, total_pages: i64, total_count: i64, @@ -1136,8 +1139,8 @@ struct ScheduleInvocationRow { fn render_schedule_detail_page( templates: &Tera, - schedule: &super::types::ScheduleDetail, - invocations: &[super::types::ScheduleInvocationSummary], + schedule: &waymark_webapp_core::ScheduleDetail, + invocations: &[waymark_webapp_core::ScheduleInvocationSummary], current_page: i64, total_pages: i64, ) -> String { @@ -1234,11 +1237,7 @@ struct WorkerInstanceRowView { updated_at: String, } -fn render_workers_page( - templates: &Tera, - statuses: &[super::WorkerStatus], - window_minutes: i64, -) -> String { +fn render_workers_page(templates: &Tera, statuses: &[WorkerStatus], window_minutes: i64) -> String { use crate::pool_status::PoolTimeSeries; // Build action rows @@ -1373,13 +1372,15 @@ mod tests { use sqlx::postgres::PgPoolOptions; use tower::util::ServiceExt; use uuid::Uuid; + use waymark_backend_memory::MemoryBackend; + use waymark_backend_postgres::PostgresBackend; + use waymark_webapp_backend::WebappBackend; + use waymark_worker_status_backend::{WorkerStatusBackend as _, WorkerStatusUpdate}; use super::{WebappState, build_graph_data, build_router, init_templates}; - use crate::backends::{ - MemoryBackend, PostgresBackend, WebappBackend, WorkerStatusBackend, WorkerStatusUpdate, - }; - use crate::test_support::postgres_setup; - use crate::webapp::{ExecutionEdgeView, ExecutionGraphView, ExecutionNodeView}; + + use waymark_test_support::postgres_setup; + use waymark_webapp_core::{ExecutionEdgeView, ExecutionGraphView, ExecutionNodeView}; #[test] fn build_graph_data_projects_internal_nodes_to_action_dependencies() { diff --git a/crates/waymark/src/webapp/types.rs b/crates/waymark/src/webapp/types.rs index 7805c428..0b2ec6e8 100644 --- a/crates/waymark/src/webapp/types.rs +++ b/crates/waymark/src/webapp/types.rs @@ -1,8 +1,4 @@ -//! Shared types for the webapp. - -use chrono::{DateTime, Utc}; -use serde::{Deserialize, Serialize}; -use uuid::Uuid; +//! Shared types for the webapp server. /// Configuration for the webapp server. #[derive(Debug, Clone)] @@ -55,245 +51,3 @@ impl WebappConfig { format!("{}:{}", self.host, self.port) } } - -/// Instance status. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -#[serde(rename_all = "lowercase")] -pub enum InstanceStatus { - Queued, - Running, - Completed, - Failed, -} - -impl std::fmt::Display for InstanceStatus { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Self::Queued => write!(f, "queued"), - Self::Running => write!(f, "running"), - Self::Completed => write!(f, "completed"), - Self::Failed => write!(f, "failed"), - } - } -} - -/// Summary of a workflow instance for listing. -#[derive(Debug, Clone, Serialize)] -pub struct InstanceSummary { - pub id: Uuid, - pub entry_node: Uuid, - pub created_at: DateTime, - pub status: InstanceStatus, - pub workflow_name: Option, - pub input_preview: String, -} - -/// Full details of a workflow instance. -#[derive(Debug, Clone, Serialize)] -pub struct InstanceDetail { - pub id: Uuid, - pub entry_node: Uuid, - pub created_at: DateTime, - pub status: InstanceStatus, - pub workflow_name: Option, - pub input_payload: String, - pub result_payload: String, - pub error_payload: Option, -} - -/// Node in the execution graph for display. -#[derive(Debug, Clone, Serialize)] -pub struct ExecutionNodeView { - pub id: String, - pub node_type: String, - pub label: String, - pub status: String, - pub action_name: Option, - pub module_name: Option, -} - -/// Edge in the execution graph for display. -#[derive(Debug, Clone, Serialize)] -pub struct ExecutionEdgeView { - pub source: String, - pub target: String, - pub edge_type: String, -} - -/// Execution graph data for rendering. -#[derive(Debug, Clone, Serialize)] -pub struct ExecutionGraphView { - pub nodes: Vec, - pub edges: Vec, -} - -/// Timeline entry for an action execution. -#[derive(Debug, Clone, Serialize)] -pub struct TimelineEntry { - pub action_id: String, - pub action_name: String, - pub module_name: Option, - pub status: String, - pub attempt_number: i32, - pub dispatched_at: Option, - pub completed_at: Option, - pub duration_ms: Option, - pub request_preview: String, - pub response_preview: String, - pub error: Option, -} - -/// Action log entry with full details. -#[derive(Debug, Clone, Serialize)] -pub struct ActionLogEntry { - pub action_id: String, - pub action_name: String, - pub module_name: Option, - pub status: String, - pub attempt_number: i32, - pub dispatched_at: Option, - pub completed_at: Option, - pub duration_ms: Option, - pub request: String, - pub response: String, - pub error: Option, -} - -/// Response for the workflow run data API. -#[derive(Debug, Serialize)] -pub struct WorkflowRunDataResponse { - pub nodes: Vec, - pub timeline: Vec, - pub page: i64, - pub per_page: i64, - pub total: i64, - pub has_more: bool, -} - -/// Response for action logs API. -#[derive(Debug, Serialize)] -pub struct ActionLogsResponse { - pub logs: Vec, -} - -/// Filter values response. -#[derive(Debug, Serialize)] -pub struct FilterValuesResponse { - pub values: Vec, -} - -/// Health check response. -#[derive(Debug, Serialize)] -pub struct HealthResponse { - pub status: &'static str, - pub service: &'static str, -} - -/// Export format for a workflow instance. -#[derive(Debug, Serialize)] -pub struct WorkflowInstanceExport { - pub export_version: &'static str, - pub exported_at: String, - pub instance: InstanceExportInfo, - pub nodes: Vec, - pub timeline: Vec, -} - -/// Full worker status for webapp display. -#[derive(Debug, Clone)] -pub struct WorkerStatus { - pub pool_id: Uuid, - pub active_workers: i32, - pub throughput_per_min: f64, - pub actions_per_sec: f64, - pub total_completed: i64, - pub last_action_at: Option>, - pub updated_at: DateTime, - pub median_dequeue_ms: Option, - pub median_handling_ms: Option, - pub dispatch_queue_size: Option, - pub total_in_flight: Option, - pub median_instance_duration_secs: Option, - pub active_instance_count: i32, - pub total_instances_completed: i64, - pub instances_per_sec: f64, - pub instances_per_min: f64, - pub time_series: Option>, -} - -/// Worker action stats row for display. -#[derive(Debug, Clone)] -pub struct WorkerActionRow { - pub pool_id: String, - pub active_workers: i64, - pub actions_per_sec: String, - pub throughput_per_min: i64, - pub total_completed: i64, - pub median_dequeue_ms: Option, - pub median_handling_ms: Option, - pub last_action_at: Option, - pub updated_at: String, -} - -/// Aggregate worker stats for overview cards. -#[derive(Debug, Clone)] -pub struct WorkerAggregateStats { - pub active_worker_count: i64, - pub actions_per_sec: String, - pub total_in_flight: i64, - pub total_queue_depth: i64, -} - -/// Instance info for export. -#[derive(Debug, Serialize)] -pub struct InstanceExportInfo { - pub id: String, - pub status: String, - pub created_at: String, - pub input_payload: String, - pub result_payload: String, -} - -/// Schedule summary for listing. -#[derive(Debug, Clone, Serialize)] -pub struct ScheduleSummary { - pub id: String, - pub workflow_name: String, - pub schedule_name: String, - pub schedule_type: String, - pub cron_expression: Option, - pub interval_seconds: Option, - pub status: String, - pub next_run_at: Option, - pub last_run_at: Option, - pub created_at: String, -} - -/// Full schedule details. -#[derive(Debug, Clone, Serialize)] -pub struct ScheduleDetail { - pub id: String, - pub workflow_name: String, - pub schedule_name: String, - pub schedule_type: String, - pub cron_expression: Option, - pub interval_seconds: Option, - pub jitter_seconds: i64, - pub status: String, - pub next_run_at: Option, - pub last_run_at: Option, - pub last_instance_id: Option, - pub created_at: String, - pub updated_at: String, - pub priority: i32, - pub allow_duplicate: bool, - pub input_payload: Option, -} - -/// Invocation summary row for schedule detail pages. -#[derive(Debug, Clone, Serialize)] -pub struct ScheduleInvocationSummary { - pub id: Uuid, - pub created_at: DateTime, - pub status: InstanceStatus, -} diff --git a/crates/waymark/src/workers/status.rs b/crates/waymark/src/workers/status.rs index dfd0a962..03fec671 100644 --- a/crates/waymark/src/workers/status.rs +++ b/crates/waymark/src/workers/status.rs @@ -7,11 +7,10 @@ use std::sync::{ use std::time::Duration; use chrono::{DateTime, Utc}; -use tokio::sync::watch; use tracing::{info, warn}; use uuid::Uuid; +use waymark_worker_status_backend::{WorkerStatusBackend, WorkerStatusUpdate}; -use crate::backends::{WorkerStatusBackend, WorkerStatusUpdate}; use crate::pool_status::{PoolTimeSeries, TimeSeriesEntry}; #[derive(Debug, Clone)] @@ -37,7 +36,7 @@ pub fn spawn_status_reporter( worker_pool: P, active_instances: Arc, interval: Duration, - mut shutdown_rx: watch::Receiver, + shutdown: tokio_util::sync::WaitForCancellationFutureOwned, ) -> tokio::task::JoinHandle<()> where B: WorkerStatusBackend + Send + Sync + 'static, @@ -54,6 +53,8 @@ where "status reporter started" ); + let mut shutdown = std::pin::pin!(shutdown); + loop { tokio::select! { _ = ticker.tick() => { @@ -99,11 +100,9 @@ where warn!(error = %err, "failed to update worker status"); } } - _ = shutdown_rx.changed() => { - if *shutdown_rx.borrow() { - info!("status reporter shutting down"); - break; - } + _ = &mut shutdown => { + info!("status reporter shutting down"); + break; } } }