diff --git a/src/agent/worker.rs b/src/agent/worker.rs index acf4af1de..1f248ab79 100644 --- a/src/agent/worker.rs +++ b/src/agent/worker.rs @@ -5,21 +5,41 @@ use crate::config::BrowserConfig; use crate::error::Result; use crate::hooks::{SpacebotHook, ToolNudgePolicy}; use crate::llm::SpacebotModel; -use crate::llm::routing::is_context_overflow_error; +use crate::llm::routing::{is_context_overflow_error, is_retriable_error}; use crate::{AgentDeps, ChannelId, ProcessId, ProcessType, WorkerId}; use rig::agent::AgentBuilder; use rig::completion::CompletionModel; +use std::collections::HashMap; use std::fmt::Write as _; use std::path::PathBuf; use tokio::sync::{mpsc, watch}; use uuid::Uuid; /// How many turns per segment before we check context and potentially compact. -const TURNS_PER_SEGMENT: usize = 25; +/// +/// Kept relatively low so compaction checks run frequently. Fast models can +/// burn through many tool-call turns quickly, and each turn may add large +/// tool results (browser snapshots, shell output). Checking every 15 turns +/// instead of 25 reduces the chance of blowing past the context window +/// within a single segment. +const TURNS_PER_SEGMENT: usize = 15; /// Max consecutive context overflow recoveries before giving up. -/// Prevents infinite compact-retry loops if something is fundamentally wrong. -const MAX_OVERFLOW_RETRIES: usize = 3; +/// Each retry dedup-strips stale tool results and force-compacts 75% of +/// remaining messages. Two retries is enough to handle the edge case where +/// a single message is enormous — beyond that, something is fundamentally +/// broken (system prompt alone exceeds the context window, or the compaction +/// floor of 4 messages is still too large). +const MAX_OVERFLOW_RETRIES: usize = 2; + +/// Max consecutive transient provider error retries before giving up. +/// Transient errors (upstream 500s, timeouts, rate limits that survived +/// model-level retries) get a backoff-and-retry at the worker level so +/// the worker survives temporary provider outages. +const MAX_TRANSIENT_RETRIES: usize = 5; + +/// Base delay for worker-level transient error backoff (doubles each retry). +const TRANSIENT_RETRY_BASE_DELAY: std::time::Duration = std::time::Duration::from_secs(5); /// Max segments before the worker gives up and returns a partial result. /// Prevents unbounded worker loops when the LLM keeps hitting max_turns @@ -302,6 +322,7 @@ impl Worker { let mut prompt = self.task.clone(); let mut segments_run = 0; let mut overflow_retries = 0; + let mut transient_retries = 0; let mut result = if resuming { // For resumed workers, synthesize a "result" from the task @@ -311,6 +332,17 @@ impl Worker { loop { segments_run += 1; + // Pre-prompt maintenance: dedup stale tool results and check + // context usage *before* each LLM call, not just at segment + // boundaries. Fast models can accumulate large tool results + // within a single segment and exceed the context window before + // we ever reach a checkpoint. + if segments_run > 1 { + dedup_tool_results(&mut history); + self.maybe_compact_history(&mut compacted_history, &mut history) + .await; + } + match self .hook .prompt_with_tool_nudge_retry(&agent, &mut history, &prompt) @@ -321,6 +353,7 @@ impl Worker { } Err(rig::completion::PromptError::MaxTurnsError { .. }) => { overflow_retries = 0; + transient_retries = 0; if segments_run >= MAX_SEGMENTS { tracing::warn!( @@ -337,6 +370,7 @@ impl Worker { } self.persist_transcript(&compacted_history, &history).await; + dedup_tool_results(&mut history); self.maybe_compact_history(&mut compacted_history, &mut history) .await; prompt = @@ -377,6 +411,7 @@ impl Worker { "context overflow, compacting and retrying" ); self.hook.send_status("compacting (overflow recovery)"); + dedup_tool_results(&mut history); self.force_compact_history(&mut compacted_history, &mut history) .await; prompt = "Continue where you left off. Do not repeat completed work. \ @@ -384,6 +419,43 @@ impl Worker { has been compacted." .into(); } + Err(error) if is_retriable_error(&error.to_string()) => { + transient_retries += 1; + if transient_retries > MAX_TRANSIENT_RETRIES { + self.state = WorkerState::Failed; + self.hook.send_status("failed"); + self.write_failure_log(&history, &format!( + "transient provider error after {MAX_TRANSIENT_RETRIES} retries: {error}" + )); + self.persist_transcript(&compacted_history, &history).await; + tracing::error!( + worker_id = %self.id, + retries = MAX_TRANSIENT_RETRIES, + %error, + "worker transient error retries exhausted" + ); + return Err(crate::error::AgentError::Other(error.into()).into()); + } + + let delay = + TRANSIENT_RETRY_BASE_DELAY * 2u32.pow((transient_retries - 1) as u32); + tracing::warn!( + worker_id = %self.id, + attempt = transient_retries, + delay_secs = delay.as_secs(), + %error, + "transient provider error, backing off and retrying" + ); + self.hook.send_status(format!( + "provider error, retrying in {}s ({transient_retries}/{MAX_TRANSIENT_RETRIES})", + delay.as_secs() + )); + tokio::time::sleep(delay).await; + + // Don't change the prompt — just retry with the same + // state. The LLM never saw this request so there's + // nothing to "continue" from. + } Err(error) => { self.state = WorkerState::Failed; self.hook.send_status("failed"); @@ -440,12 +512,14 @@ impl Worker { self.state = WorkerState::Running; self.hook.send_status("processing follow-up"); - // Compact before follow-up if needed + // Dedup stale tool results and compact before follow-up if needed + dedup_tool_results(&mut history); self.maybe_compact_history(&mut compacted_history, &mut history) .await; let mut follow_up_prompt = follow_up.clone(); let mut follow_up_overflow_retries = 0; + let mut follow_up_transient_retries = 0u32; let follow_up_hook = self .hook .clone() @@ -474,12 +548,38 @@ impl Worker { "follow-up context overflow, compacting and retrying" ); self.hook.send_status("compacting (overflow recovery)"); + dedup_tool_results(&mut history); self.force_compact_history(&mut compacted_history, &mut history) .await; let prompt_engine = self.deps.runtime_config.prompts.load(); let overflow_msg = prompt_engine.render_system_worker_overflow()?; follow_up_prompt = format!("{follow_up}\n\n{overflow_msg}"); } + Err(error) if is_retriable_error(&error.to_string()) => { + follow_up_transient_retries += 1; + if follow_up_transient_retries > MAX_TRANSIENT_RETRIES as u32 { + let failure_reason = format!( + "follow-up transient error after {MAX_TRANSIENT_RETRIES} retries: {error}" + ); + self.write_failure_log(&history, &failure_reason); + tracing::error!(worker_id = %self.id, %error, "follow-up transient retries exhausted"); + break Err(failure_reason); + } + let delay = TRANSIENT_RETRY_BASE_DELAY + * 2u32.pow(follow_up_transient_retries - 1); + tracing::warn!( + worker_id = %self.id, + attempt = follow_up_transient_retries, + delay_secs = delay.as_secs(), + %error, + "follow-up transient error, backing off and retrying" + ); + self.hook.send_status(format!( + "provider error, retrying in {}s ({follow_up_transient_retries}/{MAX_TRANSIENT_RETRIES})", + delay.as_secs() + )); + tokio::time::sleep(delay).await; + } Err(error) => { let failure_reason = format!("follow-up failed: {error}"); self.write_failure_log(&history, &failure_reason); @@ -865,6 +965,98 @@ impl Worker { } } +/// Tool names whose results are bulky and superseded by the latest call. +/// Only the most recent result for each tool is kept in full; older results +/// are replaced with a short marker to save context space. +/// +/// This runs in-place on the history before every LLM call, so the model +/// always has the latest snapshot but doesn't waste context on stale ones. +const DEDUP_TOOL_RESULTS: &[&str] = &["browser_snapshot", "browser_tab_list"]; + +/// Replace all but the most recent result for each tool in `DEDUP_TOOL_RESULTS` +/// with a short placeholder. This dramatically reduces context usage for +/// browser-heavy workflows where `browser_snapshot` returns large ARIA trees +/// on every call. +/// +/// Note: this mutates `history` in-place, so superseded results are also +/// replaced in the persisted transcript. +fn dedup_tool_results(history: &mut [rig::message::Message]) { + // Step 1: Build a map from tool-call ID → tool name for dedup-eligible tools. + // We need this because ToolResult only has call_id, not the tool name. + let mut call_id_to_tool: HashMap = HashMap::new(); + for message in history.iter() { + if let rig::message::Message::Assistant { content, .. } = message { + for item in content.iter() { + if let rig::message::AssistantContent::ToolCall(tc) = item + && DEDUP_TOOL_RESULTS.contains(&tc.function.name.as_str()) + { + // Rig uses call_id when present, falls back to id. + let effective_id = tc.call_id.as_ref().unwrap_or(&tc.id); + call_id_to_tool.insert(effective_id.clone(), tc.function.name.clone()); + } + } + } + } + + if call_id_to_tool.is_empty() { + return; + } + + // Step 2: Find the last (most recent) result position for each tool name. + // Tracked as (message_index, item_index) since Rig can pack multiple + // ToolResult entries into a single User message. + let mut last_result_position: HashMap<&str, (usize, usize)> = HashMap::new(); + for (message_index, message) in history.iter().enumerate() { + if let rig::message::Message::User { content } = message { + for (item_index, item) in content.iter().enumerate() { + if let rig::message::UserContent::ToolResult(tr) = item + && let Some(call_id) = &tr.call_id + && let Some(tool_name) = call_id_to_tool.get(call_id) + { + last_result_position.insert( + // Safe: tool_name came from DEDUP_TOOL_RESULTS which is 'static + DEDUP_TOOL_RESULTS + .iter() + .find(|&&name| name == tool_name) + .expect("tool name came from DEDUP_TOOL_RESULTS"), + (message_index, item_index), + ); + } + } + } + } + + // Step 3: Replace older results with a compact marker. + let mut replaced = 0usize; + for (message_index, message) in history.iter_mut().enumerate() { + if let rig::message::Message::User { content } = message { + for (item_index, item) in content.iter_mut().enumerate() { + if let rig::message::UserContent::ToolResult(tr) = item + && let Some(call_id) = &tr.call_id + && let Some(tool_name) = call_id_to_tool.get(call_id) + { + let is_last = last_result_position + .get(tool_name.as_str()) + .is_some_and(|&last| last == (message_index, item_index)); + + if !is_last { + tr.content = + rig::OneOrMany::one(rig::message::ToolResultContent::text(format!( + "[{tool_name} output superseded by a more recent call — \ + see latest {tool_name} result below]" + ))); + replaced += 1; + } + } + } + } + } + + if replaced > 0 { + tracing::debug!(replaced, "deduped stale tool results in history"); + } +} + /// Build a recap of removed worker history for the compaction marker. /// /// Extracts tool calls, assistant text, and tool results so the worker diff --git a/src/hooks/loop_guard.rs b/src/hooks/loop_guard.rs index 7d37e4294..984e62ad7 100644 --- a/src/hooks/loop_guard.rs +++ b/src/hooks/loop_guard.rs @@ -8,6 +8,14 @@ use std::collections::{HashMap, HashSet, VecDeque}; const POLL_TOOLS: &[&str] = &["shell"]; const HISTORY_CAPACITY: usize = 30; +/// Tools that read changing state and are expected to be called repeatedly with +/// identical arguments across a session. `browser_snapshot` always takes `{}` +/// but returns different content after each page mutation. These tools get the +/// poll multiplier on their consecutive-call threshold and are excluded from +/// ping-pong detection patterns (since alternating snapshot→click is the normal +/// browser workflow, not a stuck loop). +const OBSERVATION_TOOLS: &[&str] = &["browser_snapshot", "browser_tab_list"]; + /// Avoids hashing multi-MB tool outputs while still catching identical short /// results. Large outputs that differ only in the tail (growing log files, /// etc.) won't hash-match, which is correct — the tool is returning new data. @@ -79,7 +87,15 @@ pub enum LoopGuardVerdict { /// within one Rig `agent.prompt()` invocation, reset at prompt boundaries. pub struct LoopGuard { config: LoopGuardConfig, + /// Consecutive call count per `(tool, args)` hash. Reset to 0 for all + /// hashes except the current one whenever a *different* call is made. + /// This means `browser_snapshot` (empty args) can be called 50 times + /// across a session as long as other tools run in between — but 7 calls + /// in a row with nothing else is a real loop. call_counts: HashMap, + /// The call hash of the most recent `check()`. Used to detect whether the + /// current call is a continuation of the same repeated sequence. + last_call_hash: Option, total_calls: u32, outcome_counts: HashMap, // Call hashes poisoned by outcome detection — next check() auto-blocks. @@ -94,6 +110,7 @@ impl LoopGuard { Self { config, call_counts: HashMap::new(), + last_call_hash: None, total_calls: 0, outcome_counts: HashMap::new(), blocked_outcomes: HashSet::new(), @@ -105,6 +122,7 @@ impl LoopGuard { pub fn reset(&mut self) { self.call_counts.clear(); + self.last_call_hash = None; self.total_calls = 0; self.outcome_counts.clear(); self.blocked_outcomes.clear(); @@ -142,6 +160,23 @@ impl LoopGuard { )); } + // Reset consecutive counts when the call hash changes. A different + // tool call in between means the model is doing real work, not looping. + // This prevents observation tools like `browser_snapshot` (which always + // have the same empty args) from being permanently blocked after N + // total uses across a long session. + let is_same_as_last = self.last_call_hash.as_ref() == Some(&call_hash); + if !is_same_as_last { + self.call_counts.clear(); + // Reset per-hash warnings so the model gets fresh warnings if it + // starts a new loop with this tool later. Retain ping-pong warning + // counters so the max_warnings_per_call escalation still works for + // alternating patterns detected across tool switches. + self.warnings_emitted + .retain(|key, _| key.starts_with("pingpong_")); + } + self.last_call_hash = Some(call_hash.clone()); + let count = self.call_counts.entry(call_hash.clone()).or_insert(0); *count += 1; let count_value = *count; @@ -240,6 +275,14 @@ impl LoopGuard { if a != b && tail[2] == a && tail[3] == b && tail[4] == a && tail[5] == b { let tool_a = self.resolve_tool_name(a); let tool_b = self.resolve_tool_name(b); + + // Observation tools alternate with action tools as part of + // normal workflow (snapshot→click→snapshot→click). Don't flag + // these patterns as ping-pong. + if Self::involves_observation_tool(&tool_a, &tool_b) { + return None; + } + return Some(format!( "Ping-pong detected: tools '{}' and '{}' are alternating \ repeatedly. Break the cycle by trying a different approach.", @@ -266,6 +309,14 @@ impl LoopGuard { let tool_a = self.resolve_tool_name(a); let tool_b = self.resolve_tool_name(b); let tool_c = self.resolve_tool_name(c); + + if Self::involves_observation_tool(&tool_a, &tool_b) + || Self::involves_observation_tool(&tool_a, &tool_c) + || Self::involves_observation_tool(&tool_b, &tool_c) + { + return None; + } + return Some(format!( "Ping-pong detected: tools '{}', '{}', '{}' are cycling \ repeatedly. Break the cycle by trying a different approach.", @@ -277,6 +328,15 @@ impl LoopGuard { None } + /// Returns true if exactly one tool is an observation tool. Observation + /// tools (like `browser_snapshot`) naturally alternate with action tools + /// as part of normal workflow (snapshot→click) and should not be flagged + /// as ping-pong. Two observation tools alternating (snapshot↔tab_list) + /// is still suspicious and should be caught. + fn involves_observation_tool(tool_a: &str, tool_b: &str) -> bool { + OBSERVATION_TOOLS.contains(&tool_a) ^ OBSERVATION_TOOLS.contains(&tool_b) + } + fn count_ping_pong_repeats(&self) -> u32 { let history: Vec<&String> = self.recent_calls.iter().collect(); let length = history.len(); @@ -332,9 +392,16 @@ impl LoopGuard { .unwrap_or_else(|| "unknown".to_string()) } - // Poll tools get relaxed thresholds because they're expected to be - // called repeatedly (checking build output, watching deployment status). + // Poll/observation tools get relaxed thresholds because they're expected + // to be called repeatedly (checking build output, watching deployment + // status, re-snapshotting browser state after page mutations). fn is_poll_call(tool_name: &str, args: &str) -> bool { + // Observation tools are always considered poll calls — they read + // changing state with identical args every time. + if OBSERVATION_TOOLS.contains(&tool_name) { + return true; + } + if POLL_TOOLS.contains(&tool_name) { let args_lower = args.to_lowercase(); if args.len() < 200 @@ -614,6 +681,70 @@ mod tests { assert_eq!(guard.total_calls, 1); } + #[test] + fn non_consecutive_identical_calls_allowed() { + // This is the browser_snapshot bug fix: calling the same parameterless + // tool many times across a session should be fine as long as other + // tools run in between (the page state changes between snapshots). + let mut guard = LoopGuard::new(worker_config()); + let snapshot_args = "{}"; + let click_args = r#"{"index": 3}"#; + + // Simulate 20 rounds of snapshot → click → snapshot → click... + for _ in 0..20 { + let verdict = guard.check("browser_snapshot", snapshot_args); + assert_eq!( + verdict, + LoopGuardVerdict::Allow, + "browser_snapshot should be allowed when interleaved with other tools" + ); + let verdict = guard.check("browser_click", click_args); + assert_eq!(verdict, LoopGuardVerdict::Allow); + } + } + + #[test] + fn consecutive_identical_calls_still_blocked() { + // Calling the same tool with the same args many times IN A ROW should + // still trigger the block — that's a real loop. + let mut guard = LoopGuard::new(worker_config()); + let args = r#"{"query":"same thing"}"#; + + // Worker warn_threshold = 4. Calls 1-3 are Allow, call 4 hits warn. + for _ in 0..3 { + assert_eq!(guard.check("web_search", args), LoopGuardVerdict::Allow); + } + // Call 4 hits warn threshold. + let verdict = guard.check("web_search", args); + assert!(matches!(verdict, LoopGuardVerdict::Block(_))); + } + + #[test] + fn observation_tool_consecutive_gets_relaxed_threshold() { + // Observation tools like browser_snapshot get the poll multiplier even + // when called consecutively, so they can handle legitimate sequences + // of multiple snapshots. + let mut guard = LoopGuard::new(worker_config()); + let args = "{}"; + + // Worker warn_threshold=4, poll_multiplier=3, so effective warn=12. + // Calls 1-11 should all be allowed. + for i in 1..=11 { + let verdict = guard.check("browser_snapshot", args); + assert_eq!( + verdict, + LoopGuardVerdict::Allow, + "Call {i} should be allowed for observation tool" + ); + } + // Call 12 should trigger a warning. + let verdict = guard.check("browser_snapshot", args); + assert!( + matches!(verdict, LoopGuardVerdict::Block(ref message) if message.contains("Warning")), + "Expected warning at relaxed threshold, got: {verdict:?}" + ); + } + #[test] fn warning_bucket_escalates_to_block() { let config = LoopGuardConfig { diff --git a/src/llm/routing.rs b/src/llm/routing.rs index 0c3b6744c..d6a435dec 100644 --- a/src/llm/routing.rs +++ b/src/llm/routing.rs @@ -120,6 +120,7 @@ pub fn is_retriable_error(error_message: &str) -> bool { let lower = error_message.to_lowercase(); // Rate limits and server errors lower.contains("429") + || lower.contains("500") || lower.contains("502") || lower.contains("503") || lower.contains("504") @@ -127,6 +128,11 @@ pub fn is_retriable_error(error_message: &str) -> bool { || lower.contains("overloaded") || lower.contains("timeout") || lower.contains("connection") + // Generic server errors (OpenRouter wraps upstream 500s in various + // phrasings like "The server had an error while processing your request") + || lower.contains("server error") + || lower.contains("server had an error") + || lower.contains("internal error") // Empty/malformed responses are transient provider issues || lower.contains("empty response") || lower.contains("failed to read response body") diff --git a/src/tools/browser.rs b/src/tools/browser.rs index fb231e09e..d0590846c 100644 --- a/src/tools/browser.rs +++ b/src/tools/browser.rs @@ -2235,7 +2235,8 @@ async fn get_element_center( .map_err(|error| { BrowserError::new(format!( "failed to get box model for element: {error}. \ - The element may not be visible — try scrolling or taking a screenshot." + The element may not be visible or may have been removed from the DOM. \ + Run browser_snapshot to get the current interactable elements and their indices." )) })?;