From 18744a51e8aea55c839da56208ae919210bbcab0 Mon Sep 17 00:00:00 2001 From: James Pine Date: Sat, 7 Mar 2026 18:26:39 -0800 Subject: [PATCH 1/4] =?UTF-8?q?fix:=20browser=20context=20management=20?= =?UTF-8?q?=E2=80=94=20dedup=20snapshots,=20consecutive=20loop=20guard,=20?= =?UTF-8?q?overflow=20resilience?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Loop guard was counting total lifetime calls per (tool, args) hash, so browser_snapshot (empty args) got permanently blocked after 7 uses even when other tools ran between each call. Now tracks consecutive identical calls — the counter resets whenever a different tool runs. Observation tools (browser_snapshot, browser_tab_list) also get the poll multiplier and are excluded from ping-pong detection since snapshot→click is normal browser workflow. Worker context overflow was killing the worker after 3 retries of force-compact. Now: dedup stale tool results before every LLM call (replaces all but the most recent browser_snapshot/browser_tab_list result with a one-liner), pre-prompt compaction check every segment boundary, and reduced segment size (25→15 turns) for more frequent checks. Overflow retry kept at 2 as a safety net since dedup+compaction should prevent hitting it. Also fixed the get_element_center error message to tell the model to run browser_snapshot instead of suggesting screenshots/scrolling. --- src/agent/worker.rs | 127 +++++++++++++++++++++++++++++++++++++-- src/hooks/loop_guard.rs | 130 +++++++++++++++++++++++++++++++++++++++- src/tools/browser.rs | 3 +- 3 files changed, 253 insertions(+), 7 deletions(-) diff --git a/src/agent/worker.rs b/src/agent/worker.rs index acf4af1de..5778526e4 100644 --- a/src/agent/worker.rs +++ b/src/agent/worker.rs @@ -9,17 +9,28 @@ use crate::llm::routing::is_context_overflow_error; use crate::{AgentDeps, ChannelId, ProcessId, ProcessType, WorkerId}; use rig::agent::AgentBuilder; use rig::completion::CompletionModel; +use std::collections::HashMap; use std::fmt::Write as _; use std::path::PathBuf; use tokio::sync::{mpsc, watch}; use uuid::Uuid; /// How many turns per segment before we check context and potentially compact. -const TURNS_PER_SEGMENT: usize = 25; +/// +/// Kept relatively low so compaction checks run frequently. Fast models can +/// burn through many tool-call turns quickly, and each turn may add large +/// tool results (browser snapshots, shell output). Checking every 15 turns +/// instead of 25 reduces the chance of blowing past the context window +/// within a single segment. +const TURNS_PER_SEGMENT: usize = 15; /// Max consecutive context overflow recoveries before giving up. -/// Prevents infinite compact-retry loops if something is fundamentally wrong. -const MAX_OVERFLOW_RETRIES: usize = 3; +/// Each retry dedup-strips stale tool results and force-compacts 75% of +/// remaining messages. Two retries is enough to handle the edge case where +/// a single message is enormous — beyond that, something is fundamentally +/// broken (system prompt alone exceeds the context window, or the compaction +/// floor of 4 messages is still too large). +const MAX_OVERFLOW_RETRIES: usize = 2; /// Max segments before the worker gives up and returns a partial result. /// Prevents unbounded worker loops when the LLM keeps hitting max_turns @@ -311,6 +322,17 @@ impl Worker { loop { segments_run += 1; + // Pre-prompt maintenance: dedup stale tool results and check + // context usage *before* each LLM call, not just at segment + // boundaries. Fast models can accumulate large tool results + // within a single segment and exceed the context window before + // we ever reach a checkpoint. + if segments_run > 1 { + dedup_tool_results(&mut history); + self.maybe_compact_history(&mut compacted_history, &mut history) + .await; + } + match self .hook .prompt_with_tool_nudge_retry(&agent, &mut history, &prompt) @@ -337,6 +359,7 @@ impl Worker { } self.persist_transcript(&compacted_history, &history).await; + dedup_tool_results(&mut history); self.maybe_compact_history(&mut compacted_history, &mut history) .await; prompt = @@ -377,6 +400,7 @@ impl Worker { "context overflow, compacting and retrying" ); self.hook.send_status("compacting (overflow recovery)"); + dedup_tool_results(&mut history); self.force_compact_history(&mut compacted_history, &mut history) .await; prompt = "Continue where you left off. Do not repeat completed work. \ @@ -440,7 +464,8 @@ impl Worker { self.state = WorkerState::Running; self.hook.send_status("processing follow-up"); - // Compact before follow-up if needed + // Dedup stale tool results and compact before follow-up if needed + dedup_tool_results(&mut history); self.maybe_compact_history(&mut compacted_history, &mut history) .await; @@ -474,6 +499,7 @@ impl Worker { "follow-up context overflow, compacting and retrying" ); self.hook.send_status("compacting (overflow recovery)"); + dedup_tool_results(&mut history); self.force_compact_history(&mut compacted_history, &mut history) .await; let prompt_engine = self.deps.runtime_config.prompts.load(); @@ -865,6 +891,99 @@ impl Worker { } } +/// Tool names whose results are bulky and superseded by the latest call. +/// Only the most recent result for each tool is kept in full; older results +/// are replaced with a short marker to save context space. +/// +/// This runs in-place on the history before every LLM call, so the model +/// always has the latest snapshot but doesn't waste context on stale ones. +const DEDUP_TOOL_RESULTS: &[&str] = &["browser_snapshot", "browser_tab_list"]; + +/// Replace all but the most recent result for each tool in `DEDUP_TOOL_RESULTS` +/// with a short placeholder. This dramatically reduces context usage for +/// browser-heavy workflows where `browser_snapshot` returns large ARIA trees +/// on every call. +/// +/// The full results are still preserved in the transcript (via +/// `compacted_history` and `persist_transcript`). +fn dedup_tool_results(history: &mut [rig::message::Message]) { + // Step 1: Build a map from tool-call ID → tool name for dedup-eligible tools. + // We need this because ToolResult only has call_id, not the tool name. + let mut call_id_to_tool: HashMap = HashMap::new(); + for message in history.iter() { + if let rig::message::Message::Assistant { content, .. } = message { + for item in content.iter() { + if let rig::message::AssistantContent::ToolCall(tc) = item { + if DEDUP_TOOL_RESULTS.contains(&tc.function.name.as_str()) { + // Rig uses call_id when present, falls back to id. + let effective_id = tc.call_id.as_ref().unwrap_or(&tc.id); + call_id_to_tool.insert(effective_id.clone(), tc.function.name.clone()); + } + } + } + } + } + + if call_id_to_tool.is_empty() { + return; + } + + // Step 2: Find the last (most recent) result index for each tool name. + let mut last_result_index: HashMap<&str, usize> = HashMap::new(); + for (message_index, message) in history.iter().enumerate() { + if let rig::message::Message::User { content } = message { + for item in content.iter() { + if let rig::message::UserContent::ToolResult(tr) = item { + if let Some(call_id) = &tr.call_id + && let Some(tool_name) = call_id_to_tool.get(call_id) + { + last_result_index.insert( + // Safe: tool_name came from DEDUP_TOOL_RESULTS which is 'static + DEDUP_TOOL_RESULTS + .iter() + .find(|&&name| name == tool_name) + .expect("tool name came from DEDUP_TOOL_RESULTS"), + message_index, + ); + } + } + } + } + } + + // Step 3: Replace older results with a compact marker. + let mut replaced = 0usize; + for (message_index, message) in history.iter_mut().enumerate() { + if let rig::message::Message::User { content } = message { + for item in content.iter_mut() { + if let rig::message::UserContent::ToolResult(tr) = item { + if let Some(call_id) = &tr.call_id + && let Some(tool_name) = call_id_to_tool.get(call_id) + { + let is_last = last_result_index + .get(tool_name.as_str()) + .is_some_and(|&last| last == message_index); + + if !is_last { + tr.content = rig::OneOrMany::one( + rig::message::ToolResultContent::text(format!( + "[{tool_name} output superseded by a more recent call — \ + see latest {tool_name} result below]" + )), + ); + replaced += 1; + } + } + } + } + } + } + + if replaced > 0 { + tracing::debug!(replaced, "deduped stale tool results in history"); + } +} + /// Build a recap of removed worker history for the compaction marker. /// /// Extracts tool calls, assistant text, and tool results so the worker diff --git a/src/hooks/loop_guard.rs b/src/hooks/loop_guard.rs index 7d37e4294..8fcbe5c24 100644 --- a/src/hooks/loop_guard.rs +++ b/src/hooks/loop_guard.rs @@ -8,6 +8,14 @@ use std::collections::{HashMap, HashSet, VecDeque}; const POLL_TOOLS: &[&str] = &["shell"]; const HISTORY_CAPACITY: usize = 30; +/// Tools that read changing state and are expected to be called repeatedly with +/// identical arguments across a session. `browser_snapshot` always takes `{}` +/// but returns different content after each page mutation. These tools get the +/// poll multiplier on their consecutive-call threshold and are excluded from +/// ping-pong detection patterns (since alternating snapshot→click is the normal +/// browser workflow, not a stuck loop). +const OBSERVATION_TOOLS: &[&str] = &["browser_snapshot", "browser_tab_list"]; + /// Avoids hashing multi-MB tool outputs while still catching identical short /// results. Large outputs that differ only in the tail (growing log files, /// etc.) won't hash-match, which is correct — the tool is returning new data. @@ -79,7 +87,15 @@ pub enum LoopGuardVerdict { /// within one Rig `agent.prompt()` invocation, reset at prompt boundaries. pub struct LoopGuard { config: LoopGuardConfig, + /// Consecutive call count per `(tool, args)` hash. Reset to 0 for all + /// hashes except the current one whenever a *different* call is made. + /// This means `browser_snapshot` (empty args) can be called 50 times + /// across a session as long as other tools run in between — but 7 calls + /// in a row with nothing else is a real loop. call_counts: HashMap, + /// The call hash of the most recent `check()`. Used to detect whether the + /// current call is a continuation of the same repeated sequence. + last_call_hash: Option, total_calls: u32, outcome_counts: HashMap, // Call hashes poisoned by outcome detection — next check() auto-blocks. @@ -94,6 +110,7 @@ impl LoopGuard { Self { config, call_counts: HashMap::new(), + last_call_hash: None, total_calls: 0, outcome_counts: HashMap::new(), blocked_outcomes: HashSet::new(), @@ -105,6 +122,7 @@ impl LoopGuard { pub fn reset(&mut self) { self.call_counts.clear(); + self.last_call_hash = None; self.total_calls = 0; self.outcome_counts.clear(); self.blocked_outcomes.clear(); @@ -142,6 +160,20 @@ impl LoopGuard { )); } + // Reset consecutive counts when the call hash changes. A different + // tool call in between means the model is doing real work, not looping. + // This prevents observation tools like `browser_snapshot` (which always + // have the same empty args) from being permanently blocked after N + // total uses across a long session. + let is_same_as_last = self.last_call_hash.as_ref() == Some(&call_hash); + if !is_same_as_last { + self.call_counts.clear(); + // Also reset per-hash warnings so the model gets fresh warnings + // if it starts a new loop with this tool later. + self.warnings_emitted.clear(); + } + self.last_call_hash = Some(call_hash.clone()); + let count = self.call_counts.entry(call_hash.clone()).or_insert(0); *count += 1; let count_value = *count; @@ -240,6 +272,14 @@ impl LoopGuard { if a != b && tail[2] == a && tail[3] == b && tail[4] == a && tail[5] == b { let tool_a = self.resolve_tool_name(a); let tool_b = self.resolve_tool_name(b); + + // Observation tools alternate with action tools as part of + // normal workflow (snapshot→click→snapshot→click). Don't flag + // these patterns as ping-pong. + if Self::involves_observation_tool(&tool_a, &tool_b) { + return None; + } + return Some(format!( "Ping-pong detected: tools '{}' and '{}' are alternating \ repeatedly. Break the cycle by trying a different approach.", @@ -266,6 +306,14 @@ impl LoopGuard { let tool_a = self.resolve_tool_name(a); let tool_b = self.resolve_tool_name(b); let tool_c = self.resolve_tool_name(c); + + if Self::involves_observation_tool(&tool_a, &tool_b) + || Self::involves_observation_tool(&tool_a, &tool_c) + || Self::involves_observation_tool(&tool_b, &tool_c) + { + return None; + } + return Some(format!( "Ping-pong detected: tools '{}', '{}', '{}' are cycling \ repeatedly. Break the cycle by trying a different approach.", @@ -277,6 +325,13 @@ impl LoopGuard { None } + /// Returns true if either tool is an observation tool. Observation tools + /// (like `browser_snapshot`) naturally alternate with action tools as part + /// of normal workflow and should not be flagged as ping-pong. + fn involves_observation_tool(tool_a: &str, tool_b: &str) -> bool { + OBSERVATION_TOOLS.contains(&tool_a) || OBSERVATION_TOOLS.contains(&tool_b) + } + fn count_ping_pong_repeats(&self) -> u32 { let history: Vec<&String> = self.recent_calls.iter().collect(); let length = history.len(); @@ -332,9 +387,16 @@ impl LoopGuard { .unwrap_or_else(|| "unknown".to_string()) } - // Poll tools get relaxed thresholds because they're expected to be - // called repeatedly (checking build output, watching deployment status). + // Poll/observation tools get relaxed thresholds because they're expected + // to be called repeatedly (checking build output, watching deployment + // status, re-snapshotting browser state after page mutations). fn is_poll_call(tool_name: &str, args: &str) -> bool { + // Observation tools are always considered poll calls — they read + // changing state with identical args every time. + if OBSERVATION_TOOLS.contains(&tool_name) { + return true; + } + if POLL_TOOLS.contains(&tool_name) { let args_lower = args.to_lowercase(); if args.len() < 200 @@ -614,6 +676,70 @@ mod tests { assert_eq!(guard.total_calls, 1); } + #[test] + fn non_consecutive_identical_calls_allowed() { + // This is the browser_snapshot bug fix: calling the same parameterless + // tool many times across a session should be fine as long as other + // tools run in between (the page state changes between snapshots). + let mut guard = LoopGuard::new(worker_config()); + let snapshot_args = "{}"; + let click_args = r#"{"index": 3}"#; + + // Simulate 20 rounds of snapshot → click → snapshot → click... + for _ in 0..20 { + let verdict = guard.check("browser_snapshot", snapshot_args); + assert_eq!( + verdict, + LoopGuardVerdict::Allow, + "browser_snapshot should be allowed when interleaved with other tools" + ); + let verdict = guard.check("browser_click", click_args); + assert_eq!(verdict, LoopGuardVerdict::Allow); + } + } + + #[test] + fn consecutive_identical_calls_still_blocked() { + // Calling the same tool with the same args many times IN A ROW should + // still trigger the block — that's a real loop. + let mut guard = LoopGuard::new(worker_config()); + let args = r#"{"query":"same thing"}"#; + + // Worker warn_threshold = 4. Calls 1-3 are Allow, call 4 hits warn. + for _ in 0..3 { + assert_eq!(guard.check("web_search", args), LoopGuardVerdict::Allow); + } + // Call 4 hits warn threshold. + let verdict = guard.check("web_search", args); + assert!(matches!(verdict, LoopGuardVerdict::Block(_))); + } + + #[test] + fn observation_tool_consecutive_gets_relaxed_threshold() { + // Observation tools like browser_snapshot get the poll multiplier even + // when called consecutively, so they can handle legitimate sequences + // of multiple snapshots. + let mut guard = LoopGuard::new(worker_config()); + let args = "{}"; + + // Worker warn_threshold=4, poll_multiplier=3, so effective warn=12. + // Calls 1-11 should all be allowed. + for i in 1..=11 { + let verdict = guard.check("browser_snapshot", args); + assert_eq!( + verdict, + LoopGuardVerdict::Allow, + "Call {i} should be allowed for observation tool" + ); + } + // Call 12 should trigger a warning. + let verdict = guard.check("browser_snapshot", args); + assert!( + matches!(verdict, LoopGuardVerdict::Block(ref message) if message.contains("Warning")), + "Expected warning at relaxed threshold, got: {verdict:?}" + ); + } + #[test] fn warning_bucket_escalates_to_block() { let config = LoopGuardConfig { diff --git a/src/tools/browser.rs b/src/tools/browser.rs index fb231e09e..d0590846c 100644 --- a/src/tools/browser.rs +++ b/src/tools/browser.rs @@ -2235,7 +2235,8 @@ async fn get_element_center( .map_err(|error| { BrowserError::new(format!( "failed to get box model for element: {error}. \ - The element may not be visible — try scrolling or taking a screenshot." + The element may not be visible or may have been removed from the DOM. \ + Run browser_snapshot to get the current interactable elements and their indices." )) })?; From c9dc6becb09321613b0978c6deea647983085897 Mon Sep 17 00:00:00 2001 From: James Pine Date: Sat, 7 Mar 2026 18:49:31 -0800 Subject: [PATCH 2/4] =?UTF-8?q?fix:=20address=20review=20feedback=20?= =?UTF-8?q?=E2=80=94=20dedup=20item=20tracking,=20XOR=20ping-pong=20exempt?= =?UTF-8?q?ion,=20retain=20pingpong=20warnings?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix dedup doc comment: transcript is not lossless since dedup mutates history in-place before persist_transcript runs. - Track (message_index, item_index) in dedup so multiple ToolResult entries in the same User message are handled correctly. - Use XOR in involves_observation_tool so two observation tools alternating (snapshot↔tab_list) is still caught as ping-pong. - Retain pingpong_ warning buckets when resetting consecutive counts so max_warnings_per_call escalation still works for alternating patterns. --- src/agent/worker.rs | 22 ++++++++++++---------- src/hooks/loop_guard.rs | 19 ++++++++++++------- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/src/agent/worker.rs b/src/agent/worker.rs index 5778526e4..35c3f1864 100644 --- a/src/agent/worker.rs +++ b/src/agent/worker.rs @@ -904,8 +904,8 @@ const DEDUP_TOOL_RESULTS: &[&str] = &["browser_snapshot", "browser_tab_list"]; /// browser-heavy workflows where `browser_snapshot` returns large ARIA trees /// on every call. /// -/// The full results are still preserved in the transcript (via -/// `compacted_history` and `persist_transcript`). +/// Note: this mutates `history` in-place, so superseded results are also +/// replaced in the persisted transcript. fn dedup_tool_results(history: &mut [rig::message::Message]) { // Step 1: Build a map from tool-call ID → tool name for dedup-eligible tools. // We need this because ToolResult only has call_id, not the tool name. @@ -928,22 +928,24 @@ fn dedup_tool_results(history: &mut [rig::message::Message]) { return; } - // Step 2: Find the last (most recent) result index for each tool name. - let mut last_result_index: HashMap<&str, usize> = HashMap::new(); + // Step 2: Find the last (most recent) result position for each tool name. + // Tracked as (message_index, item_index) since Rig can pack multiple + // ToolResult entries into a single User message. + let mut last_result_position: HashMap<&str, (usize, usize)> = HashMap::new(); for (message_index, message) in history.iter().enumerate() { if let rig::message::Message::User { content } = message { - for item in content.iter() { + for (item_index, item) in content.iter().enumerate() { if let rig::message::UserContent::ToolResult(tr) = item { if let Some(call_id) = &tr.call_id && let Some(tool_name) = call_id_to_tool.get(call_id) { - last_result_index.insert( + last_result_position.insert( // Safe: tool_name came from DEDUP_TOOL_RESULTS which is 'static DEDUP_TOOL_RESULTS .iter() .find(|&&name| name == tool_name) .expect("tool name came from DEDUP_TOOL_RESULTS"), - message_index, + (message_index, item_index), ); } } @@ -955,14 +957,14 @@ fn dedup_tool_results(history: &mut [rig::message::Message]) { let mut replaced = 0usize; for (message_index, message) in history.iter_mut().enumerate() { if let rig::message::Message::User { content } = message { - for item in content.iter_mut() { + for (item_index, item) in content.iter_mut().enumerate() { if let rig::message::UserContent::ToolResult(tr) = item { if let Some(call_id) = &tr.call_id && let Some(tool_name) = call_id_to_tool.get(call_id) { - let is_last = last_result_index + let is_last = last_result_position .get(tool_name.as_str()) - .is_some_and(|&last| last == message_index); + .is_some_and(|&last| last == (message_index, item_index)); if !is_last { tr.content = rig::OneOrMany::one( diff --git a/src/hooks/loop_guard.rs b/src/hooks/loop_guard.rs index 8fcbe5c24..984e62ad7 100644 --- a/src/hooks/loop_guard.rs +++ b/src/hooks/loop_guard.rs @@ -168,9 +168,12 @@ impl LoopGuard { let is_same_as_last = self.last_call_hash.as_ref() == Some(&call_hash); if !is_same_as_last { self.call_counts.clear(); - // Also reset per-hash warnings so the model gets fresh warnings - // if it starts a new loop with this tool later. - self.warnings_emitted.clear(); + // Reset per-hash warnings so the model gets fresh warnings if it + // starts a new loop with this tool later. Retain ping-pong warning + // counters so the max_warnings_per_call escalation still works for + // alternating patterns detected across tool switches. + self.warnings_emitted + .retain(|key, _| key.starts_with("pingpong_")); } self.last_call_hash = Some(call_hash.clone()); @@ -325,11 +328,13 @@ impl LoopGuard { None } - /// Returns true if either tool is an observation tool. Observation tools - /// (like `browser_snapshot`) naturally alternate with action tools as part - /// of normal workflow and should not be flagged as ping-pong. + /// Returns true if exactly one tool is an observation tool. Observation + /// tools (like `browser_snapshot`) naturally alternate with action tools + /// as part of normal workflow (snapshot→click) and should not be flagged + /// as ping-pong. Two observation tools alternating (snapshot↔tab_list) + /// is still suspicious and should be caught. fn involves_observation_tool(tool_a: &str, tool_b: &str) -> bool { - OBSERVATION_TOOLS.contains(&tool_a) || OBSERVATION_TOOLS.contains(&tool_b) + OBSERVATION_TOOLS.contains(&tool_a) ^ OBSERVATION_TOOLS.contains(&tool_b) } fn count_ping_pong_repeats(&self) -> u32 { From 6966cea2729eac752fc2e3bd4acc790dafab0295 Mon Sep 17 00:00:00 2001 From: James Pine Date: Sat, 7 Mar 2026 19:02:01 -0800 Subject: [PATCH 3/4] fix: collapse nested if blocks for clippy collapsible_if --- src/agent/worker.rs | 69 ++++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 36 deletions(-) diff --git a/src/agent/worker.rs b/src/agent/worker.rs index 35c3f1864..7a5449179 100644 --- a/src/agent/worker.rs +++ b/src/agent/worker.rs @@ -913,12 +913,12 @@ fn dedup_tool_results(history: &mut [rig::message::Message]) { for message in history.iter() { if let rig::message::Message::Assistant { content, .. } = message { for item in content.iter() { - if let rig::message::AssistantContent::ToolCall(tc) = item { - if DEDUP_TOOL_RESULTS.contains(&tc.function.name.as_str()) { - // Rig uses call_id when present, falls back to id. - let effective_id = tc.call_id.as_ref().unwrap_or(&tc.id); - call_id_to_tool.insert(effective_id.clone(), tc.function.name.clone()); - } + if let rig::message::AssistantContent::ToolCall(tc) = item + && DEDUP_TOOL_RESULTS.contains(&tc.function.name.as_str()) + { + // Rig uses call_id when present, falls back to id. + let effective_id = tc.call_id.as_ref().unwrap_or(&tc.id); + call_id_to_tool.insert(effective_id.clone(), tc.function.name.clone()); } } } @@ -935,19 +935,18 @@ fn dedup_tool_results(history: &mut [rig::message::Message]) { for (message_index, message) in history.iter().enumerate() { if let rig::message::Message::User { content } = message { for (item_index, item) in content.iter().enumerate() { - if let rig::message::UserContent::ToolResult(tr) = item { - if let Some(call_id) = &tr.call_id - && let Some(tool_name) = call_id_to_tool.get(call_id) - { - last_result_position.insert( - // Safe: tool_name came from DEDUP_TOOL_RESULTS which is 'static - DEDUP_TOOL_RESULTS - .iter() - .find(|&&name| name == tool_name) - .expect("tool name came from DEDUP_TOOL_RESULTS"), - (message_index, item_index), - ); - } + if let rig::message::UserContent::ToolResult(tr) = item + && let Some(call_id) = &tr.call_id + && let Some(tool_name) = call_id_to_tool.get(call_id) + { + last_result_position.insert( + // Safe: tool_name came from DEDUP_TOOL_RESULTS which is 'static + DEDUP_TOOL_RESULTS + .iter() + .find(|&&name| name == tool_name) + .expect("tool name came from DEDUP_TOOL_RESULTS"), + (message_index, item_index), + ); } } } @@ -958,23 +957,21 @@ fn dedup_tool_results(history: &mut [rig::message::Message]) { for (message_index, message) in history.iter_mut().enumerate() { if let rig::message::Message::User { content } = message { for (item_index, item) in content.iter_mut().enumerate() { - if let rig::message::UserContent::ToolResult(tr) = item { - if let Some(call_id) = &tr.call_id - && let Some(tool_name) = call_id_to_tool.get(call_id) - { - let is_last = last_result_position - .get(tool_name.as_str()) - .is_some_and(|&last| last == (message_index, item_index)); - - if !is_last { - tr.content = rig::OneOrMany::one( - rig::message::ToolResultContent::text(format!( - "[{tool_name} output superseded by a more recent call — \ - see latest {tool_name} result below]" - )), - ); - replaced += 1; - } + if let rig::message::UserContent::ToolResult(tr) = item + && let Some(call_id) = &tr.call_id + && let Some(tool_name) = call_id_to_tool.get(call_id) + { + let is_last = last_result_position + .get(tool_name.as_str()) + .is_some_and(|&last| last == (message_index, item_index)); + + if !is_last { + tr.content = + rig::OneOrMany::one(rig::message::ToolResultContent::text(format!( + "[{tool_name} output superseded by a more recent call — \ + see latest {tool_name} result below]" + ))); + replaced += 1; } } } From 9be5251d562cdc5353ed1118c52307e79c7e73ee Mon Sep 17 00:00:00 2001 From: James Pine Date: Sat, 7 Mar 2026 19:32:40 -0800 Subject: [PATCH 4/4] fix: retry transient provider errors instead of killing worker Workers now catch retriable errors (upstream 500s, timeouts, rate limits that survived model-level retries) and back off with exponential delay before retrying, up to 5 attempts. Previously any error that wasn't a context overflow or cancellation killed the worker immediately. Also added missing patterns to is_retriable_error: generic server errors like 'The server had an error' and '500' status codes that OpenRouter wraps in various phrasings. --- src/agent/worker.rs | 76 ++++++++++++++++++++++++++++++++++++++++++++- src/llm/routing.rs | 6 ++++ 2 files changed, 81 insertions(+), 1 deletion(-) diff --git a/src/agent/worker.rs b/src/agent/worker.rs index 7a5449179..1f248ab79 100644 --- a/src/agent/worker.rs +++ b/src/agent/worker.rs @@ -5,7 +5,7 @@ use crate::config::BrowserConfig; use crate::error::Result; use crate::hooks::{SpacebotHook, ToolNudgePolicy}; use crate::llm::SpacebotModel; -use crate::llm::routing::is_context_overflow_error; +use crate::llm::routing::{is_context_overflow_error, is_retriable_error}; use crate::{AgentDeps, ChannelId, ProcessId, ProcessType, WorkerId}; use rig::agent::AgentBuilder; use rig::completion::CompletionModel; @@ -32,6 +32,15 @@ const TURNS_PER_SEGMENT: usize = 15; /// floor of 4 messages is still too large). const MAX_OVERFLOW_RETRIES: usize = 2; +/// Max consecutive transient provider error retries before giving up. +/// Transient errors (upstream 500s, timeouts, rate limits that survived +/// model-level retries) get a backoff-and-retry at the worker level so +/// the worker survives temporary provider outages. +const MAX_TRANSIENT_RETRIES: usize = 5; + +/// Base delay for worker-level transient error backoff (doubles each retry). +const TRANSIENT_RETRY_BASE_DELAY: std::time::Duration = std::time::Duration::from_secs(5); + /// Max segments before the worker gives up and returns a partial result. /// Prevents unbounded worker loops when the LLM keeps hitting max_turns /// without completing the task. @@ -313,6 +322,7 @@ impl Worker { let mut prompt = self.task.clone(); let mut segments_run = 0; let mut overflow_retries = 0; + let mut transient_retries = 0; let mut result = if resuming { // For resumed workers, synthesize a "result" from the task @@ -343,6 +353,7 @@ impl Worker { } Err(rig::completion::PromptError::MaxTurnsError { .. }) => { overflow_retries = 0; + transient_retries = 0; if segments_run >= MAX_SEGMENTS { tracing::warn!( @@ -408,6 +419,43 @@ impl Worker { has been compacted." .into(); } + Err(error) if is_retriable_error(&error.to_string()) => { + transient_retries += 1; + if transient_retries > MAX_TRANSIENT_RETRIES { + self.state = WorkerState::Failed; + self.hook.send_status("failed"); + self.write_failure_log(&history, &format!( + "transient provider error after {MAX_TRANSIENT_RETRIES} retries: {error}" + )); + self.persist_transcript(&compacted_history, &history).await; + tracing::error!( + worker_id = %self.id, + retries = MAX_TRANSIENT_RETRIES, + %error, + "worker transient error retries exhausted" + ); + return Err(crate::error::AgentError::Other(error.into()).into()); + } + + let delay = + TRANSIENT_RETRY_BASE_DELAY * 2u32.pow((transient_retries - 1) as u32); + tracing::warn!( + worker_id = %self.id, + attempt = transient_retries, + delay_secs = delay.as_secs(), + %error, + "transient provider error, backing off and retrying" + ); + self.hook.send_status(format!( + "provider error, retrying in {}s ({transient_retries}/{MAX_TRANSIENT_RETRIES})", + delay.as_secs() + )); + tokio::time::sleep(delay).await; + + // Don't change the prompt — just retry with the same + // state. The LLM never saw this request so there's + // nothing to "continue" from. + } Err(error) => { self.state = WorkerState::Failed; self.hook.send_status("failed"); @@ -471,6 +519,7 @@ impl Worker { let mut follow_up_prompt = follow_up.clone(); let mut follow_up_overflow_retries = 0; + let mut follow_up_transient_retries = 0u32; let follow_up_hook = self .hook .clone() @@ -506,6 +555,31 @@ impl Worker { let overflow_msg = prompt_engine.render_system_worker_overflow()?; follow_up_prompt = format!("{follow_up}\n\n{overflow_msg}"); } + Err(error) if is_retriable_error(&error.to_string()) => { + follow_up_transient_retries += 1; + if follow_up_transient_retries > MAX_TRANSIENT_RETRIES as u32 { + let failure_reason = format!( + "follow-up transient error after {MAX_TRANSIENT_RETRIES} retries: {error}" + ); + self.write_failure_log(&history, &failure_reason); + tracing::error!(worker_id = %self.id, %error, "follow-up transient retries exhausted"); + break Err(failure_reason); + } + let delay = TRANSIENT_RETRY_BASE_DELAY + * 2u32.pow(follow_up_transient_retries - 1); + tracing::warn!( + worker_id = %self.id, + attempt = follow_up_transient_retries, + delay_secs = delay.as_secs(), + %error, + "follow-up transient error, backing off and retrying" + ); + self.hook.send_status(format!( + "provider error, retrying in {}s ({follow_up_transient_retries}/{MAX_TRANSIENT_RETRIES})", + delay.as_secs() + )); + tokio::time::sleep(delay).await; + } Err(error) => { let failure_reason = format!("follow-up failed: {error}"); self.write_failure_log(&history, &failure_reason); diff --git a/src/llm/routing.rs b/src/llm/routing.rs index 0c3b6744c..d6a435dec 100644 --- a/src/llm/routing.rs +++ b/src/llm/routing.rs @@ -120,6 +120,7 @@ pub fn is_retriable_error(error_message: &str) -> bool { let lower = error_message.to_lowercase(); // Rate limits and server errors lower.contains("429") + || lower.contains("500") || lower.contains("502") || lower.contains("503") || lower.contains("504") @@ -127,6 +128,11 @@ pub fn is_retriable_error(error_message: &str) -> bool { || lower.contains("overloaded") || lower.contains("timeout") || lower.contains("connection") + // Generic server errors (OpenRouter wraps upstream 500s in various + // phrasings like "The server had an error while processing your request") + || lower.contains("server error") + || lower.contains("server had an error") + || lower.contains("internal error") // Empty/malformed responses are transient provider issues || lower.contains("empty response") || lower.contains("failed to read response body")