diff --git a/Cargo.lock b/Cargo.lock index 83728d95..c6a1d88f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10034,6 +10034,7 @@ dependencies = [ "rustls", "serde", "serde_json", + "serial_test", "sha2 0.10.9", "testcontainers", "thiserror 2.0.12", @@ -10293,6 +10294,15 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "scc" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46e6f046b7fef48e2660c57ed794263155d713de679057f2d0c169bfc6e756cc" +dependencies = [ + "sdd", +] + [[package]] name = "schannel" version = "0.1.27" @@ -10349,6 +10359,12 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "sdd" +version = "3.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "490dcfcbfef26be6800d11870ff2df8774fa6e86d047e3e8c8a76b25655e41ca" + [[package]] name = "sec1" version = "0.7.3" @@ -10597,6 +10613,31 @@ dependencies = [ "serde", ] +[[package]] +name = "serial_test" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b258109f244e1d6891bf1053a55d63a5cd4f8f4c30cf9a1280989f80e7a1fa9" +dependencies = [ + "futures", + "log", + "once_cell", + "parking_lot", + "scc", + "serial_test_derive", +] + +[[package]] +name = "serial_test_derive" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d69265a08751de7844521fd15003ae0a888e035773ba05695c5c759a6f89eef" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.104", +] + [[package]] name = "sha1" version = "0.10.6" diff --git a/crates/rollup-boost/Cargo.toml b/crates/rollup-boost/Cargo.toml index 5f1bf069..49967399 100644 --- a/crates/rollup-boost/Cargo.toml +++ b/crates/rollup-boost/Cargo.toml @@ -66,6 +66,7 @@ parking_lot = "0.12.3" tokio-util = { version = "0.7.13" } [dev-dependencies] +serial_test = "3" rand = "0.9.0" 
time = { version = "0.3.36", features = ["macros", "formatting", "parsing"] } op-alloy-consensus = "0.17.2" diff --git a/crates/rollup-boost/src/health.rs b/crates/rollup-boost/src/health.rs index fed2e27c..045f958b 100644 --- a/crates/rollup-boost/src/health.rs +++ b/crates/rollup-boost/src/health.rs @@ -16,6 +16,7 @@ use crate::{EngineApiExt, ExecutionMode, Health, Probes}; pub struct HealthHandle { pub probes: Arc, pub execution_mode: Arc>, + pub l2_client: Arc, pub builder_client: Arc, pub health_check_interval: Duration, pub max_unsafe_interval: u64, @@ -26,6 +27,7 @@ impl HealthHandle { pub fn new( probes: Arc, execution_mode: Arc>, + l2_client: Arc, builder_client: Arc, health_check_interval: Duration, max_unsafe_interval: u64, @@ -33,6 +35,7 @@ impl HealthHandle { Self { probes, execution_mode, + l2_client, builder_client, health_check_interval, max_unsafe_interval, @@ -46,34 +49,63 @@ impl HealthHandle { let mut timestamp = MonotonicTimestamp::new(); loop { - let latest_unsafe = match self - .builder_client + let t = timestamp.tick(); + + // Check L2 client health. 
If it's unhealthy, set the health status to ServiceUnavailable + // If in disabled or dry run execution mode, set the health status to Healthy if the l2 client is healthy + match self + .l2_client .get_block_by_number(BlockNumberOrTag::Latest, false) .await { - Ok(block) => block, - Err(e) => { - warn!(target: "rollup_boost::health", "Failed to get unsafe block from builder client: {} - updating health status", e); - if self.execution_mode.lock().is_enabled() { - self.probes.set_health(Health::PartialContent); + Ok(block) => { + if t.saturating_sub(block.header.timestamp) + .gt(&self.max_unsafe_interval) + { + warn!(target: "rollup_boost::health", curr_unix = %t, unsafe_unix = %block.header.timestamp, "L2 client - unsafe block timestamp is too old, updating health status to ServiceUnavailable"); + self.probes.set_health(Health::ServiceUnavailable); + sleep_until(Instant::now() + self.health_check_interval).await; + continue; + } else if self.execution_mode.lock().is_disabled() + || self.execution_mode.lock().is_dry_run() + { + self.probes.set_health(Health::Healthy); + sleep_until(Instant::now() + self.health_check_interval).await; + continue; } + } + Err(e) => { + warn!(target: "rollup_boost::health", "L2 client - Failed to get unsafe block {} - updating health status", e); + self.probes.set_health(Health::ServiceUnavailable); sleep_until(Instant::now() + self.health_check_interval).await; continue; } }; - let t = timestamp.tick(); - if t.saturating_sub(latest_unsafe.header.timestamp) - .gt(&self.max_unsafe_interval) - { - warn!(target: "rollup_boost::health", curr_unix = %t, unsafe_unix = %latest_unsafe.header.timestamp, "Unsafe block timestamp is too old updating health status"); - if self.execution_mode.lock().is_enabled() { - self.probes.set_health(Health::PartialContent); - } - } else { - self.probes.set_health(Health::Healthy); + if self.execution_mode.lock().is_enabled() { + // Only check builder client health if execution mode is enabled + // If it's 
unhealthy, set the health status to PartialContent + match self + .builder_client + .get_block_by_number(BlockNumberOrTag::Latest, false) + .await + { + Ok(block) => { + if t.saturating_sub(block.header.timestamp) + .gt(&self.max_unsafe_interval) + { + warn!(target: "rollup_boost::health", curr_unix = %t, unsafe_unix = %block.header.timestamp, "Builder client - unsafe block timestamp is too old updating health status"); + self.probes.set_health(Health::PartialContent); + } else { + self.probes.set_health(Health::Healthy); + } + } + Err(e) => { + warn!(target: "rollup_boost::health", "Builder client - Failed to get unsafe block {} - updating health status", e); + self.probes.set_health(Health::PartialContent); + } + }; } - sleep_until(Instant::now() + self.health_check_interval).await; } }) @@ -141,6 +173,7 @@ mod tests { use super::*; use crate::{Probes, payload::PayloadSource}; + use serial_test::serial; pub struct MockHttpServer { addr: SocketAddr, @@ -258,15 +291,23 @@ mod tests { Ok(hyper::Response::new(response.to_string())) } + #[serial] #[tokio::test] async fn test_health_check_healthy() -> eyre::Result<()> { - tokio::time::sleep(tokio::time::Duration::from_secs(1)).await; let probes = Arc::new(Probes::default()); let now = SystemTime::now() .duration_since(UNIX_EPOCH) .expect("Time went backwards") .as_secs(); + let l2 = MockHttpServer::serve(handler, now).await.unwrap(); + let l2_client = Arc::new(RpcClient::new( + format!("http://{}", l2.addr).parse::()?, + JwtSecret::random(), + 100, + PayloadSource::L2, + )?); + let builder = MockHttpServer::serve(handler, now).await.unwrap(); let builder_client = Arc::new(RpcClient::new( format!("http://{}", builder.addr).parse::()?, @@ -278,6 +319,7 @@ mod tests { let health_handle = HealthHandle { probes: probes.clone(), execution_mode: Arc::new(Mutex::new(ExecutionMode::Enabled)), + l2_client: l2_client.clone(), builder_client: builder_client.clone(), health_check_interval: Duration::from_secs(60), 
max_unsafe_interval: 5, @@ -289,16 +331,26 @@ mod tests { Ok(()) } + #[serial] #[tokio::test] - async fn test_health_check_exceeds_max_unsafe_interval() -> eyre::Result<()> { - tokio::time::sleep(tokio::time::Duration::from_secs(1)).await; + async fn test_health_check_builder_exceeds_max_unsafe_interval() -> eyre::Result<()> { let probes = Arc::new(Probes::default()); let now = SystemTime::now() .duration_since(UNIX_EPOCH) .expect("Time went backwards") .as_secs(); - let builder = MockHttpServer::serve(handler, now - 10).await.unwrap(); + // L2 healthy + let l2 = MockHttpServer::serve(handler, now).await.unwrap(); + let l2_client = Arc::new(RpcClient::new( + format!("http://{}", l2.addr).parse::()?, + JwtSecret::random(), + 100, + PayloadSource::L2, + )?); + + // Builder unhealthy + let builder = MockHttpServer::serve(handler, now - 10).await.unwrap(); let builder_client = Arc::new(RpcClient::new( format!("http://{}", builder.addr).parse::()?, JwtSecret::random(), @@ -309,6 +361,7 @@ mod tests { let health_handle = HealthHandle { probes: probes.clone(), execution_mode: Arc::new(Mutex::new(ExecutionMode::Enabled)), + l2_client: l2_client.clone(), builder_client: builder_client.clone(), health_check_interval: Duration::from_secs(60), max_unsafe_interval: 5, @@ -320,15 +373,65 @@ mod tests { Ok(()) } + #[serial] + #[tokio::test] + async fn test_health_check_l2_exceeds_max_unsafe_interval() -> eyre::Result<()> { + let probes = Arc::new(Probes::default()); + let now = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("Time went backwards") + .as_secs(); + + // L2 unhealthy + let l2 = MockHttpServer::serve(handler, now - 10).await.unwrap(); + let l2_client = Arc::new(RpcClient::new( + format!("http://{}", l2.addr).parse::()?, + JwtSecret::random(), + 100, + PayloadSource::L2, + )?); + + // Builder healthy + let builder = MockHttpServer::serve(handler, now).await.unwrap(); + let builder_client = Arc::new(RpcClient::new( + format!("http://{}", 
builder.addr).parse::()?, + JwtSecret::random(), + 100, + PayloadSource::Builder, + )?); + + let health_handle = HealthHandle { + probes: probes.clone(), + execution_mode: Arc::new(Mutex::new(ExecutionMode::Enabled)), + l2_client: l2_client.clone(), + builder_client: builder_client.clone(), + health_check_interval: Duration::from_secs(60), + max_unsafe_interval: 5, + }; + + health_handle.spawn(); + tokio::time::sleep(Duration::from_secs(2)).await; + assert!(matches!(probes.health(), Health::ServiceUnavailable)); + Ok(()) + } + + #[serial] #[tokio::test] async fn test_health_check_exceeds_max_unsafe_interval_execution_mode_disabled() -> eyre::Result<()> { - tokio::time::sleep(tokio::time::Duration::from_secs(1)).await; let probes = Arc::new(Probes::default()); let now = SystemTime::now() .duration_since(UNIX_EPOCH) .expect("Time went backwards") .as_secs(); + // L2 healthy + let l2 = MockHttpServer::serve(handler, now).await.unwrap(); + let l2_client = Arc::new(RpcClient::new( + format!("http://{}", l2.addr).parse::()?, + JwtSecret::random(), + 100, + PayloadSource::L2, + )?); let builder = MockHttpServer::serve(handler, now - 10).await.unwrap(); let builder_client = Arc::new(RpcClient::new( @@ -341,6 +444,7 @@ mod tests { let health_handle = HealthHandle { probes: probes.clone(), execution_mode: Arc::new(Mutex::new(ExecutionMode::Disabled)), + l2_client: l2_client.clone(), builder_client: builder_client.clone(), health_check_interval: Duration::from_secs(60), max_unsafe_interval: 5, @@ -352,15 +456,23 @@ mod tests { Ok(()) } + #[serial] #[tokio::test] async fn test_health_check_exceeds_max_unsafe_interval_execution_mode_dryrun() -> eyre::Result<()> { - tokio::time::sleep(tokio::time::Duration::from_secs(1)).await; let probes = Arc::new(Probes::default()); let now = SystemTime::now() .duration_since(UNIX_EPOCH) .expect("Time went backwards") .as_secs(); + // L2 healthy + let l2 = MockHttpServer::serve(handler, now).await.unwrap(); + let l2_client = 
Arc::new(RpcClient::new( + format!("http://{}", l2.addr).parse::()?, + JwtSecret::random(), + 100, + PayloadSource::L2, + )?); let builder = MockHttpServer::serve(handler, now - 10).await.unwrap(); let builder_client = Arc::new(RpcClient::new( @@ -373,6 +485,7 @@ mod tests { let health_handle = HealthHandle { probes: probes.clone(), execution_mode: Arc::new(Mutex::new(ExecutionMode::DryRun)), + l2_client: l2_client.clone(), builder_client: builder_client.clone(), health_check_interval: Duration::from_secs(60), max_unsafe_interval: 5, @@ -384,10 +497,24 @@ mod tests { Ok(()) } + #[serial] #[tokio::test] - async fn test_health_check_service_unavailable() -> eyre::Result<()> { - tokio::time::sleep(tokio::time::Duration::from_secs(1)).await; + async fn test_health_check_service_builder_unavailable() -> eyre::Result<()> { let probes = Arc::new(Probes::default()); + let now = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("Time went backwards") + .as_secs(); + // L2 healthy + let l2 = MockHttpServer::serve(handler, now).await.unwrap(); + let l2_client = Arc::new(RpcClient::new( + format!("http://{}", l2.addr).parse::()?, + JwtSecret::random(), + 100, + PayloadSource::L2, + )?); + + // Builder unhealthy let builder_client = Arc::new(RpcClient::new( "http://127.0.0.1:6000".parse::()?, JwtSecret::random(), @@ -398,6 +525,7 @@ mod tests { let health_handle = HealthHandle { probes: probes.clone(), execution_mode: Arc::new(Mutex::new(ExecutionMode::Enabled)), + l2_client: l2_client.clone(), builder_client: builder_client.clone(), health_check_interval: Duration::from_secs(60), max_unsafe_interval: 5, @@ -409,9 +537,51 @@ mod tests { Ok(()) } + #[serial] + #[tokio::test] + async fn test_health_check_service_l2_unavailable() -> eyre::Result<()> { + let probes = Arc::new(Probes::default()); + let now = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("Time went backwards") + .as_secs(); + + // L2 returns an error + let l2_client = Arc::new(RpcClient::new( + 
"http://127.0.0.1:6000".parse::()?, + JwtSecret::random(), + 100, + PayloadSource::L2, + )?); + + // Builder healthy + let builder = MockHttpServer::serve(handler, now).await.unwrap(); + let builder_client = Arc::new(RpcClient::new( + format!("http://{}", builder.addr).parse::()?, + JwtSecret::random(), + 100, + PayloadSource::Builder, + )?); + + let health_handle = HealthHandle { + probes: probes.clone(), + execution_mode: Arc::new(Mutex::new(ExecutionMode::Enabled)), + l2_client: l2_client.clone(), + builder_client: builder_client.clone(), + health_check_interval: Duration::from_secs(60), + max_unsafe_interval: 5, + }; + + health_handle.spawn(); + tokio::time::sleep(Duration::from_secs(2)).await; + assert!(matches!(probes.health(), Health::ServiceUnavailable)); + Ok(()) + } + + #[serial] #[tokio::test] async fn tick_advances_after_sleep() { - let mut ts = MonotonicTimestamp::new(); + let mut ts: MonotonicTimestamp = MonotonicTimestamp::new(); let t1 = ts.tick(); tokio::time::sleep(Duration::from_secs(1)).await; let t2 = ts.tick(); @@ -419,6 +589,7 @@ mod tests { assert!(t2 >= t1 + 1,); } + #[serial] #[tokio::test] async fn tick_matches_system_clock() { let mut ts = MonotonicTimestamp::new(); diff --git a/crates/rollup-boost/src/server.rs b/crates/rollup-boost/src/server.rs index d6f315a2..60472559 100644 --- a/crates/rollup-boost/src/server.rs +++ b/crates/rollup-boost/src/server.rs @@ -82,6 +82,7 @@ where let handle = HealthHandle::new( self.probes.clone(), self.execution_mode.clone(), + self.l2_client.clone(), self.builder_client.clone(), Duration::from_secs(health_check_interval), max_unsafe_interval,