From 18744a51e8aea55c839da56208ae919210bbcab0 Mon Sep 17 00:00:00 2001
From: James Pine <ijamespine@me.com>
Date: Sat, 7 Mar 2026 18:26:39 -0800
Subject: [PATCH 1/4] =?UTF-8?q?fix:=20browser=20context=20management=20?=
 =?UTF-8?q?=E2=80=94=20dedup=20snapshots,=20consecutive=20loop=20guard,=20?=
 =?UTF-8?q?overflow=20resilience?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Loop guard was counting total lifetime calls per (tool, args) hash, so
browser_snapshot (empty args) got permanently blocked after 7 uses even
when other tools ran between each call. Now tracks consecutive identical
calls — the counter resets whenever a different tool runs. Observation
tools (browser_snapshot, browser_tab_list) also get the poll multiplier
and are excluded from ping-pong detection since snapshot→click is normal
browser workflow.

Worker context overflow was killing the worker after 3 retries of
force-compact. Now: dedup stale tool results before every LLM call
(replaces all but the most recent browser_snapshot/browser_tab_list
result with a one-liner), pre-prompt compaction check every segment
boundary, and reduced segment size (25→15 turns) for more frequent
checks. Overflow retry kept at 2 as a safety net since dedup+compaction
should prevent hitting it.

Also fixed the get_element_center error message to tell the model to
run browser_snapshot instead of suggesting screenshots/scrolling.
---
 src/agent/worker.rs     | 127 +++++++++++++++++++++++++++++++++++++--
 src/hooks/loop_guard.rs | 130 +++++++++++++++++++++++++++++++++++++++-
 src/tools/browser.rs    |   3 +-
 3 files changed, 253 insertions(+), 7 deletions(-)

diff --git a/src/agent/worker.rs b/src/agent/worker.rs
index acf4af1de..5778526e4 100644
--- a/src/agent/worker.rs
+++ b/src/agent/worker.rs
@@ -9,17 +9,28 @@ use crate::llm::routing::is_context_overflow_error;
 use crate::{AgentDeps, ChannelId, ProcessId, ProcessType, WorkerId};
 use rig::agent::AgentBuilder;
 use rig::completion::CompletionModel;
+use std::collections::HashMap;
 use std::fmt::Write as _;
 use std::path::PathBuf;
 use tokio::sync::{mpsc, watch};
 use uuid::Uuid;
 
 /// How many turns per segment before we check context and potentially compact.
-const TURNS_PER_SEGMENT: usize = 25;
+///
+/// Kept relatively low so compaction checks run frequently. Fast models can
+/// burn through many tool-call turns quickly, and each turn may add large
+/// tool results (browser snapshots, shell output). Checking every 15 turns
+/// instead of 25 reduces the chance of blowing past the context window
+/// within a single segment.
+const TURNS_PER_SEGMENT: usize = 15;
 
 /// Max consecutive context overflow recoveries before giving up.
-/// Prevents infinite compact-retry loops if something is fundamentally wrong.
-const MAX_OVERFLOW_RETRIES: usize = 3;
+/// Each retry dedup-strips stale tool results and force-compacts 75% of
+/// remaining messages. Two retries is enough to handle the edge case where
+/// a single message is enormous — beyond that, something is fundamentally
+/// broken (system prompt alone exceeds the context window, or the compaction
+/// floor of 4 messages is still too large).
+const MAX_OVERFLOW_RETRIES: usize = 2;
 
 /// Max segments before the worker gives up and returns a partial result.
 /// Prevents unbounded worker loops when the LLM keeps hitting max_turns
@@ -311,6 +322,17 @@ impl Worker {
             loop {
                 segments_run += 1;
 
+                // Pre-prompt maintenance: dedup stale tool results and check
+                // context usage *before* each LLM call, not just at segment
+                // boundaries. Fast models can accumulate large tool results
+                // within a single segment and exceed the context window before
+                // we ever reach a checkpoint.
+                if segments_run > 1 {
+                    dedup_tool_results(&mut history);
+                    self.maybe_compact_history(&mut compacted_history, &mut history)
+                        .await;
+                }
+
                 match self
                     .hook
                     .prompt_with_tool_nudge_retry(&agent, &mut history, &prompt)
@@ -337,6 +359,7 @@ impl Worker {
                         }
 
                         self.persist_transcript(&compacted_history, &history).await;
+                        dedup_tool_results(&mut history);
                         self.maybe_compact_history(&mut compacted_history, &mut history)
                             .await;
                         prompt =
@@ -377,6 +400,7 @@ impl Worker {
                             "context overflow, compacting and retrying"
                         );
                         self.hook.send_status("compacting (overflow recovery)");
+                        dedup_tool_results(&mut history);
                         self.force_compact_history(&mut compacted_history, &mut history)
                             .await;
                         prompt = "Continue where you left off. Do not repeat completed work. \
@@ -440,7 +464,8 @@ impl Worker {
                 self.state = WorkerState::Running;
                 self.hook.send_status("processing follow-up");
 
-                // Compact before follow-up if needed
+                // Dedup stale tool results and compact before follow-up if needed
+                dedup_tool_results(&mut history);
                 self.maybe_compact_history(&mut compacted_history, &mut history)
                     .await;
 
@@ -474,6 +499,7 @@ impl Worker {
                                 "follow-up context overflow, compacting and retrying"
                             );
                             self.hook.send_status("compacting (overflow recovery)");
+                            dedup_tool_results(&mut history);
                             self.force_compact_history(&mut compacted_history, &mut history)
                                 .await;
                             let prompt_engine = self.deps.runtime_config.prompts.load();
@@ -865,6 +891,99 @@ impl Worker {
     }
 }
 
+/// Tool names whose results are bulky and superseded by the latest call.
+/// Only the most recent result for each tool is kept in full; older results
+/// are replaced with a short marker to save context space.
+///
+/// This runs in-place on the history before every LLM call, so the model
+/// always has the latest snapshot but doesn't waste context on stale ones.
+const DEDUP_TOOL_RESULTS: &[&str] = &["browser_snapshot", "browser_tab_list"];
+
+/// Replace all but the most recent result for each tool in `DEDUP_TOOL_RESULTS`
+/// with a short placeholder. This dramatically reduces context usage for
+/// browser-heavy workflows where `browser_snapshot` returns large ARIA trees
+/// on every call.
+///
+/// The full results are still preserved in the transcript (via
+/// `compacted_history` and `persist_transcript`).
+fn dedup_tool_results(history: &mut [rig::message::Message]) {
+    // Step 1: Build a map from tool-call ID → tool name for dedup-eligible tools.
+    // We need this because ToolResult only has call_id, not the tool name.
+    let mut call_id_to_tool: HashMap<String, String> = HashMap::new();
+    for message in history.iter() {
+        if let rig::message::Message::Assistant { content, .. } = message {
+            for item in content.iter() {
+                if let rig::message::AssistantContent::ToolCall(tc) = item {
+                    if DEDUP_TOOL_RESULTS.contains(&tc.function.name.as_str()) {
+                        // Rig uses call_id when present, falls back to id.
+                        let effective_id = tc.call_id.as_ref().unwrap_or(&tc.id);
+                        call_id_to_tool.insert(effective_id.clone(), tc.function.name.clone());
+                    }
+                }
+            }
+        }
+    }
+
+    if call_id_to_tool.is_empty() {
+        return;
+    }
+
+    // Step 2: Find the last (most recent) result index for each tool name.
+    let mut last_result_index: HashMap<&str, usize> = HashMap::new();
+    for (message_index, message) in history.iter().enumerate() {
+        if let rig::message::Message::User { content } = message {
+            for item in content.iter() {
+                if let rig::message::UserContent::ToolResult(tr) = item {
+                    if let Some(call_id) = &tr.call_id
+                        && let Some(tool_name) = call_id_to_tool.get(call_id)
+                    {
+                        last_result_index.insert(
+                            // Safe: tool_name came from DEDUP_TOOL_RESULTS which is 'static
+                            DEDUP_TOOL_RESULTS
+                                .iter()
+                                .find(|&&name| name == tool_name)
+                                .expect("tool name came from DEDUP_TOOL_RESULTS"),
+                            message_index,
+                        );
+                    }
+                }
+            }
+        }
+    }
+
+    // Step 3: Replace older results with a compact marker.
+    let mut replaced = 0usize;
+    for (message_index, message) in history.iter_mut().enumerate() {
+        if let rig::message::Message::User { content } = message {
+            for item in content.iter_mut() {
+                if let rig::message::UserContent::ToolResult(tr) = item {
+                    if let Some(call_id) = &tr.call_id
+                        && let Some(tool_name) = call_id_to_tool.get(call_id)
+                    {
+                        let is_last = last_result_index
+                            .get(tool_name.as_str())
+                            .is_some_and(|&last| last == message_index);
+
+                        if !is_last {
+                            tr.content = rig::OneOrMany::one(
+                                rig::message::ToolResultContent::text(format!(
+                                    "[{tool_name} output superseded by a more recent call — \
+                                     see latest {tool_name} result below]"
+                                )),
+                            );
+                            replaced += 1;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    if replaced > 0 {
+        tracing::debug!(replaced, "deduped stale tool results in history");
+    }
+}
+
 /// Build a recap of removed worker history for the compaction marker.
 ///
 /// Extracts tool calls, assistant text, and tool results so the worker
diff --git a/src/hooks/loop_guard.rs b/src/hooks/loop_guard.rs
index 7d37e4294..8fcbe5c24 100644
--- a/src/hooks/loop_guard.rs
+++ b/src/hooks/loop_guard.rs
@@ -8,6 +8,14 @@ use std::collections::{HashMap, HashSet, VecDeque};
 const POLL_TOOLS: &[&str] = &["shell"];
 const HISTORY_CAPACITY: usize = 30;
 
+/// Tools that read changing state and are expected to be called repeatedly with
+/// identical arguments across a session. `browser_snapshot` always takes `{}`
+/// but returns different content after each page mutation. These tools get the
+/// poll multiplier on their consecutive-call threshold and are excluded from
+/// ping-pong detection patterns (since alternating snapshot→click is the normal
+/// browser workflow, not a stuck loop).
+const OBSERVATION_TOOLS: &[&str] = &["browser_snapshot", "browser_tab_list"];
+
 /// Avoids hashing multi-MB tool outputs while still catching identical short
 /// results. Large outputs that differ only in the tail (growing log files,
 /// etc.) won't hash-match, which is correct — the tool is returning new data.
@@ -79,7 +87,15 @@ pub enum LoopGuardVerdict {
 /// within one Rig `agent.prompt()` invocation, reset at prompt boundaries.
 pub struct LoopGuard {
     config: LoopGuardConfig,
+    /// Consecutive call count per `(tool, args)` hash. Reset to 0 for all
+    /// hashes except the current one whenever a *different* call is made.
+    /// This means `browser_snapshot` (empty args) can be called 50 times
+    /// across a session as long as other tools run in between — but 7 calls
+    /// in a row with nothing else is a real loop.
     call_counts: HashMap<String, u32>,
+    /// The call hash of the most recent `check()`. Used to detect whether the
+    /// current call is a continuation of the same repeated sequence.
+    last_call_hash: Option<String>,
     total_calls: u32,
     outcome_counts: HashMap<String, u32>,
     // Call hashes poisoned by outcome detection — next check() auto-blocks.
@@ -94,6 +110,7 @@ impl LoopGuard {
         Self {
             config,
             call_counts: HashMap::new(),
+            last_call_hash: None,
             total_calls: 0,
             outcome_counts: HashMap::new(),
             blocked_outcomes: HashSet::new(),
@@ -105,6 +122,7 @@ impl LoopGuard {
 
     pub fn reset(&mut self) {
         self.call_counts.clear();
+        self.last_call_hash = None;
         self.total_calls = 0;
         self.outcome_counts.clear();
         self.blocked_outcomes.clear();
@@ -142,6 +160,20 @@ impl LoopGuard {
             ));
         }
 
+        // Reset consecutive counts when the call hash changes. A different
+        // tool call in between means the model is doing real work, not looping.
+        // This prevents observation tools like `browser_snapshot` (which always
+        // have the same empty args) from being permanently blocked after N
+        // total uses across a long session.
+        let is_same_as_last = self.last_call_hash.as_ref() == Some(&call_hash);
+        if !is_same_as_last {
+            self.call_counts.clear();
+            // Also reset per-hash warnings so the model gets fresh warnings
+            // if it starts a new loop with this tool later.
+            self.warnings_emitted.clear();
+        }
+        self.last_call_hash = Some(call_hash.clone());
+
         let count = self.call_counts.entry(call_hash.clone()).or_insert(0);
         *count += 1;
         let count_value = *count;
@@ -240,6 +272,14 @@ impl LoopGuard {
             if a != b && tail[2] == a && tail[3] == b && tail[4] == a && tail[5] == b {
                 let tool_a = self.resolve_tool_name(a);
                 let tool_b = self.resolve_tool_name(b);
+
+                // Observation tools alternate with action tools as part of
+                // normal workflow (snapshot→click→snapshot→click). Don't flag
+                // these patterns as ping-pong.
+                if Self::involves_observation_tool(&tool_a, &tool_b) {
+                    return None;
+                }
+
                 return Some(format!(
                     "Ping-pong detected: tools '{}' and '{}' are alternating \
                      repeatedly. Break the cycle by trying a different approach.",
@@ -266,6 +306,14 @@ impl LoopGuard {
                 let tool_a = self.resolve_tool_name(a);
                 let tool_b = self.resolve_tool_name(b);
                 let tool_c = self.resolve_tool_name(c);
+
+                if Self::involves_observation_tool(&tool_a, &tool_b)
+                    || Self::involves_observation_tool(&tool_a, &tool_c)
+                    || Self::involves_observation_tool(&tool_b, &tool_c)
+                {
+                    return None;
+                }
+
                 return Some(format!(
                     "Ping-pong detected: tools '{}', '{}', '{}' are cycling \
                      repeatedly. Break the cycle by trying a different approach.",
@@ -277,6 +325,13 @@ impl LoopGuard {
         None
     }
 
+    /// Returns true if either tool is an observation tool. Observation tools
+    /// (like `browser_snapshot`) naturally alternate with action tools as part
+    /// of normal workflow and should not be flagged as ping-pong.
+    fn involves_observation_tool(tool_a: &str, tool_b: &str) -> bool {
+        OBSERVATION_TOOLS.contains(&tool_a) || OBSERVATION_TOOLS.contains(&tool_b)
+    }
+
     fn count_ping_pong_repeats(&self) -> u32 {
         let history: Vec<&String> = self.recent_calls.iter().collect();
         let length = history.len();
@@ -332,9 +387,16 @@ impl LoopGuard {
             .unwrap_or_else(|| "unknown".to_string())
     }
 
-    // Poll tools get relaxed thresholds because they're expected to be
-    // called repeatedly (checking build output, watching deployment status).
+    // Poll/observation tools get relaxed thresholds because they're expected
+    // to be called repeatedly (checking build output, watching deployment
+    // status, re-snapshotting browser state after page mutations).
     fn is_poll_call(tool_name: &str, args: &str) -> bool {
+        // Observation tools are always considered poll calls — they read
+        // changing state with identical args every time.
+        if OBSERVATION_TOOLS.contains(&tool_name) {
+            return true;
+        }
+
         if POLL_TOOLS.contains(&tool_name) {
             let args_lower = args.to_lowercase();
             if args.len() < 200
@@ -614,6 +676,70 @@ mod tests {
         assert_eq!(guard.total_calls, 1);
     }
 
+    #[test]
+    fn non_consecutive_identical_calls_allowed() {
+        // This is the browser_snapshot bug fix: calling the same parameterless
+        // tool many times across a session should be fine as long as other
+        // tools run in between (the page state changes between snapshots).
+        let mut guard = LoopGuard::new(worker_config());
+        let snapshot_args = "{}";
+        let click_args = r#"{"index": 3}"#;
+
+        // Simulate 20 rounds of snapshot → click → snapshot → click...
+        for _ in 0..20 {
+            let verdict = guard.check("browser_snapshot", snapshot_args);
+            assert_eq!(
+                verdict,
+                LoopGuardVerdict::Allow,
+                "browser_snapshot should be allowed when interleaved with other tools"
+            );
+            let verdict = guard.check("browser_click", click_args);
+            assert_eq!(verdict, LoopGuardVerdict::Allow);
+        }
+    }
+
+    #[test]
+    fn consecutive_identical_calls_still_blocked() {
+        // Calling the same tool with the same args many times IN A ROW should
+        // still trigger the block — that's a real loop.
+        let mut guard = LoopGuard::new(worker_config());
+        let args = r#"{"query":"same thing"}"#;
+
+        // Worker warn_threshold = 4. Calls 1-3 are Allow, call 4 hits warn.
+        for _ in 0..3 {
+            assert_eq!(guard.check("web_search", args), LoopGuardVerdict::Allow);
+        }
+        // Call 4 hits warn threshold.
+        let verdict = guard.check("web_search", args);
+        assert!(matches!(verdict, LoopGuardVerdict::Block(_)));
+    }
+
+    #[test]
+    fn observation_tool_consecutive_gets_relaxed_threshold() {
+        // Observation tools like browser_snapshot get the poll multiplier even
+        // when called consecutively, so they can handle legitimate sequences
+        // of multiple snapshots.
+        let mut guard = LoopGuard::new(worker_config());
+        let args = "{}";
+
+        // Worker warn_threshold=4, poll_multiplier=3, so effective warn=12.
+        // Calls 1-11 should all be allowed.
+        for i in 1..=11 {
+            let verdict = guard.check("browser_snapshot", args);
+            assert_eq!(
+                verdict,
+                LoopGuardVerdict::Allow,
+                "Call {i} should be allowed for observation tool"
+            );
+        }
+        // Call 12 should trigger a warning.
+        let verdict = guard.check("browser_snapshot", args);
+        assert!(
+            matches!(verdict, LoopGuardVerdict::Block(ref message) if message.contains("Warning")),
+            "Expected warning at relaxed threshold, got: {verdict:?}"
+        );
+    }
+
     #[test]
     fn warning_bucket_escalates_to_block() {
         let config = LoopGuardConfig {
diff --git a/src/tools/browser.rs b/src/tools/browser.rs
index fb231e09e..d0590846c 100644
--- a/src/tools/browser.rs
+++ b/src/tools/browser.rs
@@ -2235,7 +2235,8 @@ async fn get_element_center(
         .map_err(|error| {
             BrowserError::new(format!(
                 "failed to get box model for element: {error}. \
-                 The element may not be visible — try scrolling or taking a screenshot."
+                 The element may not be visible or may have been removed from the DOM. \
+                 Run browser_snapshot to get the current interactable elements and their indices."
             ))
         })?;
 

From c9dc6becb09321613b0978c6deea647983085897 Mon Sep 17 00:00:00 2001
From: James Pine <ijamespine@me.com>
Date: Sat, 7 Mar 2026 18:49:31 -0800
Subject: [PATCH 2/4] =?UTF-8?q?fix:=20address=20review=20feedback=20?=
 =?UTF-8?q?=E2=80=94=20dedup=20item=20tracking,=20XOR=20ping-pong=20exempt?=
 =?UTF-8?q?ion,=20retain=20pingpong=20warnings?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Fix dedup doc comment: transcript is not lossless since dedup mutates
  history in-place before persist_transcript runs.
- Track (message_index, item_index) in dedup so multiple ToolResult
  entries in the same User message are handled correctly.
- Use XOR in involves_observation_tool so two observation tools
  alternating (snapshot↔tab_list) is still caught as ping-pong.
- Retain pingpong_ warning buckets when resetting consecutive counts
  so max_warnings_per_call escalation still works for alternating
  patterns.
---
 src/agent/worker.rs     | 22 ++++++++++++----------
 src/hooks/loop_guard.rs | 19 ++++++++++++-------
 2 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/src/agent/worker.rs b/src/agent/worker.rs
index 5778526e4..35c3f1864 100644
--- a/src/agent/worker.rs
+++ b/src/agent/worker.rs
@@ -904,8 +904,8 @@ const DEDUP_TOOL_RESULTS: &[&str] = &["browser_snapshot", "browser_tab_list"];
 /// browser-heavy workflows where `browser_snapshot` returns large ARIA trees
 /// on every call.
 ///
-/// The full results are still preserved in the transcript (via
-/// `compacted_history` and `persist_transcript`).
+/// Note: this mutates `history` in-place, so superseded results are also
+/// replaced in the persisted transcript.
 fn dedup_tool_results(history: &mut [rig::message::Message]) {
     // Step 1: Build a map from tool-call ID → tool name for dedup-eligible tools.
     // We need this because ToolResult only has call_id, not the tool name.
@@ -928,22 +928,24 @@ fn dedup_tool_results(history: &mut [rig::message::Message]) {
         return;
     }
 
-    // Step 2: Find the last (most recent) result index for each tool name.
-    let mut last_result_index: HashMap<&str, usize> = HashMap::new();
+    // Step 2: Find the last (most recent) result position for each tool name.
+    // Tracked as (message_index, item_index) since Rig can pack multiple
+    // ToolResult entries into a single User message.
+    let mut last_result_position: HashMap<&str, (usize, usize)> = HashMap::new();
     for (message_index, message) in history.iter().enumerate() {
         if let rig::message::Message::User { content } = message {
-            for item in content.iter() {
+            for (item_index, item) in content.iter().enumerate() {
                 if let rig::message::UserContent::ToolResult(tr) = item {
                     if let Some(call_id) = &tr.call_id
                         && let Some(tool_name) = call_id_to_tool.get(call_id)
                     {
-                        last_result_index.insert(
+                        last_result_position.insert(
                             // Safe: tool_name came from DEDUP_TOOL_RESULTS which is 'static
                             DEDUP_TOOL_RESULTS
                                 .iter()
                                 .find(|&&name| name == tool_name)
                                 .expect("tool name came from DEDUP_TOOL_RESULTS"),
-                            message_index,
+                            (message_index, item_index),
                         );
                     }
                 }
@@ -955,14 +957,14 @@ fn dedup_tool_results(history: &mut [rig::message::Message]) {
     let mut replaced = 0usize;
     for (message_index, message) in history.iter_mut().enumerate() {
         if let rig::message::Message::User { content } = message {
-            for item in content.iter_mut() {
+            for (item_index, item) in content.iter_mut().enumerate() {
                 if let rig::message::UserContent::ToolResult(tr) = item {
                     if let Some(call_id) = &tr.call_id
                         && let Some(tool_name) = call_id_to_tool.get(call_id)
                     {
-                        let is_last = last_result_index
+                        let is_last = last_result_position
                             .get(tool_name.as_str())
-                            .is_some_and(|&last| last == message_index);
+                            .is_some_and(|&last| last == (message_index, item_index));
 
                         if !is_last {
                             tr.content = rig::OneOrMany::one(
diff --git a/src/hooks/loop_guard.rs b/src/hooks/loop_guard.rs
index 8fcbe5c24..984e62ad7 100644
--- a/src/hooks/loop_guard.rs
+++ b/src/hooks/loop_guard.rs
@@ -168,9 +168,12 @@ impl LoopGuard {
         let is_same_as_last = self.last_call_hash.as_ref() == Some(&call_hash);
         if !is_same_as_last {
             self.call_counts.clear();
-            // Also reset per-hash warnings so the model gets fresh warnings
-            // if it starts a new loop with this tool later.
-            self.warnings_emitted.clear();
+            // Reset per-hash warnings so the model gets fresh warnings if it
+            // starts a new loop with this tool later. Retain ping-pong warning
+            // counters so the max_warnings_per_call escalation still works for
+            // alternating patterns detected across tool switches.
+            self.warnings_emitted
+                .retain(|key, _| key.starts_with("pingpong_"));
         }
         self.last_call_hash = Some(call_hash.clone());
 
@@ -325,11 +328,13 @@ impl LoopGuard {
         None
     }
 
-    /// Returns true if either tool is an observation tool. Observation tools
-    /// (like `browser_snapshot`) naturally alternate with action tools as part
-    /// of normal workflow and should not be flagged as ping-pong.
+    /// Returns true if exactly one tool is an observation tool. Observation
+    /// tools (like `browser_snapshot`) naturally alternate with action tools
+    /// as part of normal workflow (snapshot→click) and should not be flagged
+    /// as ping-pong. Two observation tools alternating (snapshot↔tab_list)
+    /// is still suspicious and should be caught.
     fn involves_observation_tool(tool_a: &str, tool_b: &str) -> bool {
-        OBSERVATION_TOOLS.contains(&tool_a) || OBSERVATION_TOOLS.contains(&tool_b)
+        OBSERVATION_TOOLS.contains(&tool_a) ^ OBSERVATION_TOOLS.contains(&tool_b)
     }
 
     fn count_ping_pong_repeats(&self) -> u32 {

From 6966cea2729eac752fc2e3bd4acc790dafab0295 Mon Sep 17 00:00:00 2001
From: James Pine <ijamespine@me.com>
Date: Sat, 7 Mar 2026 19:02:01 -0800
Subject: [PATCH 3/4] fix: collapse nested if blocks for clippy collapsible_if

---
 src/agent/worker.rs | 69 ++++++++++++++++++++++-----------------------
 1 file changed, 33 insertions(+), 36 deletions(-)

diff --git a/src/agent/worker.rs b/src/agent/worker.rs
index 35c3f1864..7a5449179 100644
--- a/src/agent/worker.rs
+++ b/src/agent/worker.rs
@@ -913,12 +913,12 @@ fn dedup_tool_results(history: &mut [rig::message::Message]) {
     for message in history.iter() {
         if let rig::message::Message::Assistant { content, .. } = message {
             for item in content.iter() {
-                if let rig::message::AssistantContent::ToolCall(tc) = item {
-                    if DEDUP_TOOL_RESULTS.contains(&tc.function.name.as_str()) {
-                        // Rig uses call_id when present, falls back to id.
-                        let effective_id = tc.call_id.as_ref().unwrap_or(&tc.id);
-                        call_id_to_tool.insert(effective_id.clone(), tc.function.name.clone());
-                    }
+                if let rig::message::AssistantContent::ToolCall(tc) = item
+                    && DEDUP_TOOL_RESULTS.contains(&tc.function.name.as_str())
+                {
+                    // Rig uses call_id when present, falls back to id.
+                    let effective_id = tc.call_id.as_ref().unwrap_or(&tc.id);
+                    call_id_to_tool.insert(effective_id.clone(), tc.function.name.clone());
                 }
             }
         }
@@ -935,19 +935,18 @@ fn dedup_tool_results(history: &mut [rig::message::Message]) {
     for (message_index, message) in history.iter().enumerate() {
         if let rig::message::Message::User { content } = message {
             for (item_index, item) in content.iter().enumerate() {
-                if let rig::message::UserContent::ToolResult(tr) = item {
-                    if let Some(call_id) = &tr.call_id
-                        && let Some(tool_name) = call_id_to_tool.get(call_id)
-                    {
-                        last_result_position.insert(
-                            // Safe: tool_name came from DEDUP_TOOL_RESULTS which is 'static
-                            DEDUP_TOOL_RESULTS
-                                .iter()
-                                .find(|&&name| name == tool_name)
-                                .expect("tool name came from DEDUP_TOOL_RESULTS"),
-                            (message_index, item_index),
-                        );
-                    }
+                if let rig::message::UserContent::ToolResult(tr) = item
+                    && let Some(call_id) = &tr.call_id
+                    && let Some(tool_name) = call_id_to_tool.get(call_id)
+                {
+                    last_result_position.insert(
+                        // Safe: tool_name came from DEDUP_TOOL_RESULTS which is 'static
+                        DEDUP_TOOL_RESULTS
+                            .iter()
+                            .find(|&&name| name == tool_name)
+                            .expect("tool name came from DEDUP_TOOL_RESULTS"),
+                        (message_index, item_index),
+                    );
                 }
             }
         }
@@ -958,23 +957,21 @@ fn dedup_tool_results(history: &mut [rig::message::Message]) {
     for (message_index, message) in history.iter_mut().enumerate() {
         if let rig::message::Message::User { content } = message {
             for (item_index, item) in content.iter_mut().enumerate() {
-                if let rig::message::UserContent::ToolResult(tr) = item {
-                    if let Some(call_id) = &tr.call_id
-                        && let Some(tool_name) = call_id_to_tool.get(call_id)
-                    {
-                        let is_last = last_result_position
-                            .get(tool_name.as_str())
-                            .is_some_and(|&last| last == (message_index, item_index));
-
-                        if !is_last {
-                            tr.content = rig::OneOrMany::one(
-                                rig::message::ToolResultContent::text(format!(
-                                    "[{tool_name} output superseded by a more recent call — \
-                                     see latest {tool_name} result below]"
-                                )),
-                            );
-                            replaced += 1;
-                        }
+                if let rig::message::UserContent::ToolResult(tr) = item
+                    && let Some(call_id) = &tr.call_id
+                    && let Some(tool_name) = call_id_to_tool.get(call_id)
+                {
+                    let is_last = last_result_position
+                        .get(tool_name.as_str())
+                        .is_some_and(|&last| last == (message_index, item_index));
+
+                    if !is_last {
+                        tr.content =
+                            rig::OneOrMany::one(rig::message::ToolResultContent::text(format!(
+                                "[{tool_name} output superseded by a more recent call — \
+                                 see latest {tool_name} result below]"
+                            )));
+                        replaced += 1;
                     }
                 }
             }

From 9be5251d562cdc5353ed1118c52307e79c7e73ee Mon Sep 17 00:00:00 2001
From: James Pine <ijamespine@me.com>
Date: Sat, 7 Mar 2026 19:32:40 -0800
Subject: [PATCH 4/4] fix: retry transient provider errors instead of killing
 worker

Workers now catch retriable errors (upstream 500s, timeouts, rate limits
that survived model-level retries) and back off with exponential delay
before retrying, up to 5 attempts. Previously any error that wasn't a
context overflow or cancellation killed the worker immediately.

Also added missing patterns to is_retriable_error: generic server errors
like 'The server had an error' and '500' status codes that OpenRouter
wraps in various phrasings.
---
 src/agent/worker.rs | 76 ++++++++++++++++++++++++++++++++++++++++++++-
 src/llm/routing.rs  |  6 ++++
 2 files changed, 81 insertions(+), 1 deletion(-)

diff --git a/src/agent/worker.rs b/src/agent/worker.rs
index 7a5449179..1f248ab79 100644
--- a/src/agent/worker.rs
+++ b/src/agent/worker.rs
@@ -5,7 +5,7 @@ use crate::config::BrowserConfig;
 use crate::error::Result;
 use crate::hooks::{SpacebotHook, ToolNudgePolicy};
 use crate::llm::SpacebotModel;
-use crate::llm::routing::is_context_overflow_error;
+use crate::llm::routing::{is_context_overflow_error, is_retriable_error};
 use crate::{AgentDeps, ChannelId, ProcessId, ProcessType, WorkerId};
 use rig::agent::AgentBuilder;
 use rig::completion::CompletionModel;
@@ -32,6 +32,15 @@ const TURNS_PER_SEGMENT: usize = 15;
 /// floor of 4 messages is still too large).
 const MAX_OVERFLOW_RETRIES: usize = 2;
 
+/// Max consecutive transient provider error retries before giving up.
+/// Transient errors (upstream 500s, timeouts, rate limits that survived
+/// model-level retries) get a backoff-and-retry at the worker level so
+/// the worker survives temporary provider outages.
+const MAX_TRANSIENT_RETRIES: usize = 5;
+
+/// Base delay for worker-level transient error backoff (doubles each retry).
+const TRANSIENT_RETRY_BASE_DELAY: std::time::Duration = std::time::Duration::from_secs(5);
+
 /// Max segments before the worker gives up and returns a partial result.
 /// Prevents unbounded worker loops when the LLM keeps hitting max_turns
 /// without completing the task.
@@ -313,6 +322,7 @@ impl Worker {
         let mut prompt = self.task.clone();
         let mut segments_run = 0;
         let mut overflow_retries = 0;
+        let mut transient_retries = 0;
 
         let mut result = if resuming {
             // For resumed workers, synthesize a "result" from the task
@@ -343,6 +353,7 @@ impl Worker {
                     }
                     Err(rig::completion::PromptError::MaxTurnsError { .. }) => {
                         overflow_retries = 0;
+                        transient_retries = 0;
 
                         if segments_run >= MAX_SEGMENTS {
                             tracing::warn!(
@@ -408,6 +419,43 @@ impl Worker {
                               has been compacted."
                             .into();
                     }
+                    Err(error) if is_retriable_error(&error.to_string()) => {
+                        transient_retries += 1;
+                        if transient_retries > MAX_TRANSIENT_RETRIES {
+                            self.state = WorkerState::Failed;
+                            self.hook.send_status("failed");
+                            self.write_failure_log(&history, &format!(
+                                "transient provider error after {MAX_TRANSIENT_RETRIES} retries: {error}"
+                            ));
+                            self.persist_transcript(&compacted_history, &history).await;
+                            tracing::error!(
+                                worker_id = %self.id,
+                                retries = MAX_TRANSIENT_RETRIES,
+                                %error,
+                                "worker transient error retries exhausted"
+                            );
+                            return Err(crate::error::AgentError::Other(error.into()).into());
+                        }
+
+                        let delay =
+                            TRANSIENT_RETRY_BASE_DELAY * 2u32.pow((transient_retries - 1) as u32);
+                        tracing::warn!(
+                            worker_id = %self.id,
+                            attempt = transient_retries,
+                            delay_secs = delay.as_secs(),
+                            %error,
+                            "transient provider error, backing off and retrying"
+                        );
+                        self.hook.send_status(format!(
+                            "provider error, retrying in {}s ({transient_retries}/{MAX_TRANSIENT_RETRIES})",
+                            delay.as_secs()
+                        ));
+                        tokio::time::sleep(delay).await;
+
+                        // Don't change the prompt — just retry with the same
+                        // state. The LLM never saw this request so there's
+                        // nothing to "continue" from.
+                    }
                     Err(error) => {
                         self.state = WorkerState::Failed;
                         self.hook.send_status("failed");
@@ -471,6 +519,7 @@ impl Worker {
 
                 let mut follow_up_prompt = follow_up.clone();
                 let mut follow_up_overflow_retries = 0;
+                let mut follow_up_transient_retries = 0u32;
                 let follow_up_hook = self
                     .hook
                     .clone()
@@ -506,6 +555,31 @@ impl Worker {
                             let overflow_msg = prompt_engine.render_system_worker_overflow()?;
                             follow_up_prompt = format!("{follow_up}\n\n{overflow_msg}");
                         }
+                        Err(error) if is_retriable_error(&error.to_string()) => {
+                            follow_up_transient_retries += 1;
+                            if follow_up_transient_retries > MAX_TRANSIENT_RETRIES as u32 {
+                                let failure_reason = format!(
+                                    "follow-up transient error after {MAX_TRANSIENT_RETRIES} retries: {error}"
+                                );
+                                self.write_failure_log(&history, &failure_reason);
+                                tracing::error!(worker_id = %self.id, %error, "follow-up transient retries exhausted");
+                                break Err(failure_reason);
+                            }
+                            let delay = TRANSIENT_RETRY_BASE_DELAY
+                                * 2u32.pow(follow_up_transient_retries - 1);
+                            tracing::warn!(
+                                worker_id = %self.id,
+                                attempt = follow_up_transient_retries,
+                                delay_secs = delay.as_secs(),
+                                %error,
+                                "follow-up transient error, backing off and retrying"
+                            );
+                            self.hook.send_status(format!(
+                                "provider error, retrying in {}s ({follow_up_transient_retries}/{MAX_TRANSIENT_RETRIES})",
+                                delay.as_secs()
+                            ));
+                            tokio::time::sleep(delay).await;
+                        }
                         Err(error) => {
                             let failure_reason = format!("follow-up failed: {error}");
                             self.write_failure_log(&history, &failure_reason);
diff --git a/src/llm/routing.rs b/src/llm/routing.rs
index 0c3b6744c..d6a435dec 100644
--- a/src/llm/routing.rs
+++ b/src/llm/routing.rs
@@ -120,6 +120,7 @@ pub fn is_retriable_error(error_message: &str) -> bool {
     let lower = error_message.to_lowercase();
     // Rate limits and server errors
     lower.contains("429")
+        || lower.contains("500")
         || lower.contains("502")
         || lower.contains("503")
         || lower.contains("504")
@@ -127,6 +128,11 @@ pub fn is_retriable_error(error_message: &str) -> bool {
         || lower.contains("overloaded")
         || lower.contains("timeout")
         || lower.contains("connection")
+        // Generic server errors (OpenRouter wraps upstream 500s in various
+        // phrasings like "The server had an error while processing your request")
+        || lower.contains("server error")
+        || lower.contains("server had an error")
+        || lower.contains("internal error")
         // Empty/malformed responses are transient provider issues
         || lower.contains("empty response")
         || lower.contains("failed to read response body")