From a660a36968a7006cc1f2f3764e4ec106a7348f6a Mon Sep 17 00:00:00 2001
From: Kostas Karachalios <vrinek@hey.com>
Date: Fri, 20 Feb 2026 20:00:02 +0100
Subject: [PATCH 1/3] fix: add 429 rate-limit retry with countdown to HTTP
 transport

When the Anthropic API returns HTTP 429, the app now retries up to 5
times honoring the Retry-After header, with a visible per-second
countdown via on_retry callback. Both _http_stream_sse (streaming) and
_http_json (model listing) are covered. Non-429 errors still raise
immediately. Connection-timeout retries remain independent.
---
 agent/engine.py                               |   5 +
 agent/model.py                                | 157 +++++--
 ...2-20-429-rate-limit-handling-brainstorm.md |  62 +++
 ...026-02-20-fix-429-rate-limit-retry-plan.md | 437 ++++++++++++++++++
 tests/conftest.py                             |   4 +-
 tests/test_model_complex.py                   |   6 +-
 tests/test_rate_limit.py                      | 369 +++++++++++++++
 7 files changed, 994 insertions(+), 46 deletions(-)
 create mode 100644 docs/brainstorms/2026-02-20-429-rate-limit-handling-brainstorm.md
 create mode 100644 docs/plans/2026-02-20-fix-429-rate-limit-retry-plan.md
 create mode 100644 tests/test_rate_limit.py

diff --git a/agent/engine.py b/agent/engine.py
index 8bd2b65a..f04845df 100644
--- a/agent/engine.py
+++ b/agent/engine.py
@@ -325,6 +325,9 @@ def _solve_recursive(
             # Stream thinking/text deltas only for top-level calls
             if on_content_delta and depth == 0 and hasattr(model, "on_content_delta"):
                 model.on_content_delta = on_content_delta
+            # Rate-limit messages fire at all depths
+            if on_event and hasattr(model, "on_retry"):
+                model.on_retry = lambda msg: self._emit(msg, on_event)
             try:
                 turn = model.complete(conversation)
             except ModelError as exc:
@@ -333,6 +336,8 @@ def _solve_recursive(
             finally:
                 if hasattr(model, "on_content_delta"):
                     model.on_content_delta = None
+                if hasattr(model, "on_retry"):
+                    model.on_retry = None
             elapsed = time.monotonic() - t0
 
             if replay_logger:
diff --git a/agent/model.py b/agent/model.py
index b82e5f28..cdebd7b8 100644
--- a/agent/model.py
+++ b/agent/model.py
@@ -2,6 +2,7 @@
 
 import json
 import socket
+import time
 import urllib.error
 import urllib.request
 from dataclasses import dataclass, field
@@ -95,37 +96,83 @@ def _extract_content(content: object) -> str:
     return ""
 
 
+def _parse_retry_after(exc: urllib.error.HTTPError, default: int = 5) -> int:
+    """Parse Retry-After header from an HTTPError, clamped to [1, 120]."""
+    raw = exc.headers.get("Retry-After")
+    if raw is None:
+        return default
+    try:
+        return max(1, min(120, int(raw)))
+    except (ValueError, TypeError):
+        return default
+
+
+def _notify_retry(on_retry: "Callable[[str], None] | None", msg: str) -> None:
+    """Fire on_retry callback, swallowing any exception."""
+    if on_retry is not None:
+        try:
+            on_retry(msg)
+        except Exception:
+            pass
+
+
+def _sleep_with_countdown(
+    seconds: int,
+    attempt: int,
+    max_attempts: int,
+    on_retry: "Callable[[str], None] | None",
+) -> None:
+    """Sleep for *seconds*, firing on_retry with a countdown each second."""
+    for remaining in range(seconds, 0, -1):
+        _notify_retry(
+            on_retry,
+            f"Rate limited (attempt {attempt}/{max_attempts}). "
+            f"Retrying in {remaining}s...",
+        )
+        time.sleep(1)
+
+
 def _http_json(
     url: str,
     method: str,
     headers: dict[str, str],
     payload: dict[str, Any] | None = None,
     timeout_sec: int = 90,
+    max_rate_limit_retries: int = 5,
 ) -> dict[str, Any]:
-    req = urllib.request.Request(
-        url=url,
-        data=(json.dumps(payload).encode("utf-8") if payload is not None else None),
-        headers=headers,
-        method=method,
-    )
-    try:
-        with urllib.request.urlopen(req, timeout=timeout_sec) as resp:
-            raw = resp.read().decode("utf-8", errors="replace")
-    except urllib.error.HTTPError as exc:  # pragma: no cover - network path
-        body = exc.read().decode("utf-8", errors="replace")
-        raise ModelError(f"HTTP {exc.code} calling {url}: {body}") from exc
-    except urllib.error.URLError as exc:  # pragma: no cover - network path
-        raise ModelError(f"Connection error calling {url}: {exc}") from exc
-    except OSError as exc:  # pragma: no cover - bare socket.timeout, etc.
-        raise ModelError(f"Network error calling {url}: {exc}") from exc
+    for rate_limit_attempt in range(1, max_rate_limit_retries + 1):
+        req = urllib.request.Request(
+            url=url,
+            data=(json.dumps(payload).encode("utf-8") if payload is not None else None),
+            headers=headers,
+            method=method,
+        )
+        try:
+            with urllib.request.urlopen(req, timeout=timeout_sec) as resp:
+                raw = resp.read().decode("utf-8", errors="replace")
+        except urllib.error.HTTPError as exc:
+            if exc.code == 429:
+                retry_after = _parse_retry_after(exc)
+                _sleep_with_countdown(retry_after, rate_limit_attempt, max_rate_limit_retries, None)
+                continue
+            body = exc.read().decode("utf-8", errors="replace")[:8192]
+            raise ModelError(f"HTTP {exc.code} calling {url}: {body}") from exc
+        except urllib.error.URLError as exc:  # pragma: no cover - network path
+            raise ModelError(f"Connection error calling {url}: {exc}") from exc
+        except OSError as exc:  # pragma: no cover - bare socket.timeout, etc.
+            raise ModelError(f"Network error calling {url}: {exc}") from exc
 
-    try:
-        parsed = json.loads(raw)
-    except json.JSONDecodeError as exc:
-        raise ModelError(f"Non-JSON response from {url}: {raw[:500]}") from exc
-    if not isinstance(parsed, dict):
-        raise ModelError(f"Unexpected non-object JSON from {url}: {type(parsed)!r}")
-    return parsed
+        try:
+            parsed = json.loads(raw)
+        except json.JSONDecodeError as exc:
+            raise ModelError(f"Non-JSON response from {url}: {raw[:500]}") from exc
+        if not isinstance(parsed, dict):
+            raise ModelError(f"Unexpected non-object JSON from {url}: {type(parsed)!r}")
+        return parsed
+
+    raise ModelError(
+        f"HTTP 429 calling {url}: rate limited, exhausted {max_rate_limit_retries} retries"
+    )
 
 
 def _extend_socket_timeout(resp: Any, timeout: float) -> None:
@@ -212,32 +259,54 @@ def _http_stream_sse(
     stream_timeout: float = 120,
     max_retries: int = 3,
     on_sse_event: "Callable[[str, dict[str, Any]], None] | None" = None,
+    on_retry: "Callable[[str], None] | None" = None,
+    max_rate_limit_retries: int = 5,
 ) -> list[tuple[str, dict[str, Any]]]:
     """Stream an SSE endpoint with first-byte timeout and retry logic."""
     data = json.dumps(payload).encode("utf-8")
 
-    last_exc: Exception | None = None
-    for attempt in range(max_retries):
-        req = urllib.request.Request(url=url, data=data, headers=headers, method=method)
-        try:
-            resp = urllib.request.urlopen(req, timeout=first_byte_timeout)
-        except urllib.error.HTTPError as exc:
-            body = exc.read().decode("utf-8", errors="replace")
-            raise ModelError(f"HTTP {exc.code} calling {url}: {body}") from exc
-        except (socket.timeout, urllib.error.URLError, OSError) as exc:
-            # Timeout or connection error — retry
-            last_exc = exc
+    for rate_limit_attempt in range(1, max_rate_limit_retries + 1):
+        got_429 = False
+        retry_after = 0
+
+        last_exc: Exception | None = None
+        for attempt in range(max_retries):
+            req = urllib.request.Request(url=url, data=data, headers=headers, method=method)
+            try:
+                resp = urllib.request.urlopen(req, timeout=first_byte_timeout)
+            except urllib.error.HTTPError as exc:
+                if exc.code == 429:
+                    retry_after = _parse_retry_after(exc)
+                    got_429 = True
+                    break
+                body = exc.read().decode("utf-8", errors="replace")[:8192]
+                raise ModelError(f"HTTP {exc.code} calling {url}: {body}") from exc
+            except (socket.timeout, urllib.error.URLError, OSError) as exc:
+                # Timeout or connection error — retry
+                last_exc = exc
+                continue
+
+            # First byte received — extend timeout for the rest of the stream
+            _extend_socket_timeout(resp, stream_timeout)
+            try:
+                return _read_sse_events(resp, on_sse_event=on_sse_event)
+            finally:
+                resp.close()
+        else:
+            if not got_429:
+                raise ModelError(
+                    f"Timed out after {max_retries} attempts calling {url}: {last_exc}"
+                )
+
+        if got_429:
+            _sleep_with_countdown(retry_after, rate_limit_attempt, max_rate_limit_retries, on_retry)
             continue
 
-        # First byte received — extend timeout for the rest of the stream
-        _extend_socket_timeout(resp, stream_timeout)
-        try:
-            return _read_sse_events(resp, on_sse_event=on_sse_event)
-        finally:
-            resp.close()
+        # If we reach here without got_429 and without returning, something went wrong
+        break  # pragma: no cover
 
     raise ModelError(
-        f"Timed out after {max_retries} attempts calling {url}: {last_exc}"
+        f"HTTP 429 calling {url}: rate limited, exhausted {max_rate_limit_retries} retries"
     )
 
 
@@ -577,6 +646,7 @@ class OpenAICompatibleModel:
     strict_tools: bool = True
     tool_defs: list[dict[str, Any]] | None = None
     on_content_delta: Callable[[str, str], None] | None = None
+    on_retry: Callable[[str], None] | None = None
 
     def _is_reasoning_model(self) -> bool:
         """OpenAI reasoning models (o-series, gpt-5 series) have different API constraints."""
@@ -656,6 +726,7 @@ def _forward_delta(_event_type: str, data: dict[str, Any]) -> None:
                 payload=payload,
                 stream_timeout=self.timeout_sec,
                 on_sse_event=sse_cb,
+                on_retry=self.on_retry,
             )
             parsed = _accumulate_openai_stream(events)
         except ModelError as exc:
@@ -675,6 +746,7 @@ def _forward_delta(_event_type: str, data: dict[str, Any]) -> None:
                 payload=payload,
                 stream_timeout=self.timeout_sec,
                 on_sse_event=sse_cb,
+                on_retry=self.on_retry,
             )
             parsed = _accumulate_openai_stream(events)
 
@@ -771,6 +843,7 @@ class AnthropicModel:
     timeout_sec: int = 300
     tool_defs: list[dict[str, Any]] | None = None
     on_content_delta: Callable[[str, str], None] | None = None
+    on_retry: Callable[[str], None] | None = None
 
     def create_conversation(self, system_prompt: str, initial_user_message: str) -> Conversation:
         messages: list[Any] = [
@@ -849,6 +922,7 @@ def _forward_delta(_event_type: str, data: dict[str, Any]) -> None:
                 payload=payload,
                 stream_timeout=self.timeout_sec,
                 on_sse_event=sse_cb,
+                on_retry=self.on_retry,
             )
             parsed = _accumulate_anthropic_stream(events)
         except ModelError as exc:
@@ -869,6 +943,7 @@ def _forward_delta(_event_type: str, data: dict[str, Any]) -> None:
                 payload=payload,
                 stream_timeout=self.timeout_sec,
                 on_sse_event=sse_cb,
+                on_retry=self.on_retry,
             )
             parsed = _accumulate_anthropic_stream(events)
 
diff --git a/docs/brainstorms/2026-02-20-429-rate-limit-handling-brainstorm.md b/docs/brainstorms/2026-02-20-429-rate-limit-handling-brainstorm.md
new file mode 100644
index 00000000..5275b42c
--- /dev/null
+++ b/docs/brainstorms/2026-02-20-429-rate-limit-handling-brainstorm.md
@@ -0,0 +1,62 @@
+# Brainstorm: Graceful 429 Rate Limit Handling
+
+**Date:** 2026-02-20
+**Status:** Ready for planning
+
+## What We're Building
+
+Retry logic for HTTP 429 (rate limit) responses from the Anthropic API. When the
+app hits a rate limit, it should wait using the `Retry-After` header and retry
+automatically — up to 5 attempts — with a visible countdown in the TUI so the
+user knows what's happening.
+
+## Why This Approach
+
+**Problem:** The app currently treats 429 as a fatal error. `_http_stream_sse()`
+catches `HTTPError` and immediately raises `ModelError`, which terminates the
+engine's solve loop. The user sees an error and has to restart.
+
+**Approach A (inline retry)** was chosen over a retry decorator or model-layer
+retry because:
+
+- The existing `_http_stream_sse()` already has a retry loop for connection
+  errors — extending it for 429 is natural
+- Only two call sites need updating (`_http_stream_sse` and `_http_json`), which
+  doesn't justify a new abstraction
+- Keeps HTTP retry concerns in the HTTP transport layer where they belong
+
+## Key Decisions
+
+1. **Scope: Anthropic 429 only (for now).** Other providers and other transient
+   errors (5xx) are out of scope. The shared code path means expanding later is
+   straightforward.
+
+2. **Retry strategy: Respect `Retry-After` header, max 5 attempts.** Anthropic
+   includes this header in 429 responses. We honor it directly rather than
+   implementing our own exponential backoff.
+
+3. **UX: Visible countdown.** During the wait, show a live countdown
+   (e.g., "Rate limited. Retrying in 8s... 7s... 6s...") in the TUI event stream
+   so the user knows the app hasn't hung.
+
+4. **Both call paths covered.** Retry logic goes in both `_http_stream_sse()`
+   (streaming LLM calls) and `_http_json()` (model listing).
+
+5. **Failure mode: ModelError as today.** After 5 failed attempts, raise the same
+   `ModelError` the app raises now. No new exception subclasses.
+
+6. **Implementation: Inline in transport functions.** No decorators, no new
+   abstractions. Extend the existing retry pattern.
+
+## Affected Code
+
+| File | Function | Change |
+|------|----------|--------|
+| `agent/model.py` | `_http_stream_sse()` | Add 429 detection in HTTPError handler, parse Retry-After, sleep with countdown, loop |
+| `agent/model.py` | `_http_json()` | Same pattern |
+| `agent/model.py` | Both functions' signatures | Add optional `on_retry` callback for countdown display |
+| `agent/engine.py` | Call sites of above | Thread `on_event` through as `on_retry` callback |
+
+## Open Questions
+
+None — all questions resolved during brainstorming.
diff --git a/docs/plans/2026-02-20-fix-429-rate-limit-retry-plan.md b/docs/plans/2026-02-20-fix-429-rate-limit-retry-plan.md
new file mode 100644
index 00000000..2e6e3c99
--- /dev/null
+++ b/docs/plans/2026-02-20-fix-429-rate-limit-retry-plan.md
@@ -0,0 +1,437 @@
+---
+title: "fix: Graceful 429 rate limit retry with countdown"
+type: fix
+status: completed
+date: 2026-02-20
+deepened: 2026-02-20
+brainstorm: docs/brainstorms/2026-02-20-429-rate-limit-handling-brainstorm.md
+---
+
+# fix: Graceful 429 Rate Limit Retry with Countdown
+
+## Enhancement Summary
+
+**Deepened on:** 2026-02-20
+**Sections enhanced:** 6
+**Research agents used:** architecture-strategist, kieran-python-reviewer,
+code-simplicity-reviewer, performance-oracle, security-sentinel,
+pattern-recognition-specialist, best-practices-researcher,
+framework-docs-researcher
+
+### Key Improvements
+
+1. Extract `_parse_retry_after()` and `_sleep_with_countdown()` as pure/reusable
+   helpers -- keeps the retry loop body clean and individually testable
+2. `on_retry` fires at ALL depths (not depth-gated like `on_content_delta`) so
+   rate-limit messages always reach the TUI regardless of recursion depth
+3. Truncate HTTP error response bodies to 8 KB before embedding in `ModelError`
+   messages to prevent memory issues from large error payloads
+
+### New Considerations Discovered
+
+- Anthropic uses HTTP 529 for `overloaded_error` (separate from 429
+  `rate_limit_error`). Out of scope for v1 but the architecture accommodates it
+  trivially by adding `exc.code in (429, 529)`.
+- The worst-case hang is 5 rate-limit retries x 120s cap = ~10 minutes per call.
+  Document as known limitation; a total wall-clock timeout can be added later.
+- `on_retry` callback must be wrapped in try/except (matching `_emit` pattern)
+  so a broken callback never kills the retry loop.
+- `import time` is not currently in `agent/model.py` and must be added.
+- Tests must patch `time.sleep` via `unittest.mock.patch("agent.model.time.sleep")`
+  to avoid actually sleeping.
+
+## Overview
+
+When the app receives an HTTP 429 from the Anthropic API, it crashes the solve
+loop. This plan adds retry logic with `Retry-After` header support and a visible
+TUI countdown, so the user sees what's happening and the app recovers
+automatically.
+
+## Problem Statement
+
+`_http_stream_sse()` catches `urllib.error.HTTPError` and immediately raises
+`ModelError` regardless of status code. The engine's `_solve_recursive()` catches
+that `ModelError` and terminates the task. The user sees an error and must
+re-submit. Rate limits are transient -- the app should wait and retry.
+
+## Proposed Solution
+
+Add 429 detection and retry-with-countdown inline in both transport functions
+(`_http_stream_sse` and `_http_json`). Honor the `Retry-After` header. Fire
+status messages through an `on_retry` callback for TUI display.
+
+**Key design decisions (from brainstorm):**
+
+- **Generic 429 handling** in shared transport functions. The code checks
+  `exc.code == 429` regardless of provider. "Anthropic only" refers to testing
+  scope, not implementation restriction.
+- **Separate retry budget** from connection-timeout retries. 429 retries (max 5)
+  and connection retries (existing max 3) are independent counters.
+- **`on_retry` callback signature:** `Callable[[str], None]` -- receives a
+  human-readable status message. Matches the `on_event` pattern. The transport
+  function generates the string; the caller decides how to display it.
+- **`on_retry` lifecycle:** Follows the `on_content_delta` pattern -- set as a
+  field on model instances, managed by the engine before/after `complete()`.
+- **`_http_json()` retries silently** for now. No callback threading through the
+  4-function model-listing chain. Countdown display is for streaming calls only.
+- **No parallel worker coordination.** Each worker retries independently.
+  Document as a known limitation.
+
+## Technical Considerations
+
+### `Retry-After` header parsing
+
+- Parse as integer seconds only (Anthropic's format). If missing or unparseable,
+  fall back to **5 seconds**.
+- **Floor:** 1 second. **Cap:** 120 seconds. Prevents tight loops and absurd waits.
+- Access via `exc.headers.get("Retry-After")` on `urllib.error.HTTPError`.
+
+#### Research Insights
+
+**Anthropic API behavior (confirmed from official docs):**
+
+- 429 error body: `{"type": "error", "error": {"type": "rate_limit_error", "message": "..."}}`
+- `Retry-After` header: integer seconds format (not HTTP-date)
+- Additional headers on every response: `anthropic-ratelimit-requests-limit`,
+  `anthropic-ratelimit-requests-remaining`, `anthropic-ratelimit-requests-reset`,
+  and matching `tokens-*` variants. These can inform proactive throttling in a
+  future enhancement but are not needed for v1.
+- HTTP 529 (`overloaded_error`) is a separate status code. Not handled in v1.
+
+**Implementation: Extract `_parse_retry_after()` as a pure function:**
+
+```python
+def _parse_retry_after(exc: urllib.error.HTTPError, default: int = 5) -> int:
+    """Parse Retry-After header from an HTTPError, clamped to [1, 120]."""
+    raw = exc.headers.get("Retry-After")
+    if raw is None:
+        return default
+    try:
+        return max(1, min(120, int(raw)))
+    except (ValueError, TypeError):
+        return default
+```
+
+This is independently testable without mocking any HTTP machinery.
+
+### Retry loop architecture in `_http_stream_sse()`
+
+The existing function has a retry loop for connection errors. The 429 retry wraps
+the entire existing attempt logic as an outer concern:
+
+```
+for rate_limit_attempt in range(max_rate_limit_retries):  # new outer loop (5)
+    for attempt in range(max_retries):                     # existing inner loop (3)
+        try:
+            resp = urlopen(req, ...)
+        except HTTPError as exc:
+            if exc.code == 429:
+                break  # break inner, trigger outer retry with sleep
+            raise ModelError(...)
+        except (timeout, URLError, OSError):
+            continue  # existing connection retry
+    else:
+        # connection retries exhausted
+        raise ModelError(...)
+    if got_429:
+        sleep with countdown
+        continue  # outer loop retry
+    return events  # success
+```
+
+#### Research Insights
+
+**Nested loop sentinel pattern:** The `got_429` sentinel variable mediates
+between the inner `break` and the outer `continue`. This is the simplest pattern
+that keeps the two retry budgets independent. An alternative (extracting the
+inner loop into `_urlopen_with_retries()`) was considered but adds a function
+boundary without reducing complexity for the current scope.
+
+**Callback safety:** The `on_retry` callback must be wrapped in try/except to
+prevent a broken callback from killing the retry loop:
+
+```python
+def _notify_retry(on_retry: "Callable[[str], None] | None", msg: str) -> None:
+    if on_retry is not None:
+        try:
+            on_retry(msg)
+        except Exception:
+            pass  # never let a callback kill the retry loop
+```
+
+**Sleep with countdown helper:**
+
+```python
+def _sleep_with_countdown(
+    seconds: int,
+    attempt: int,
+    max_attempts: int,
+    on_retry: "Callable[[str], None] | None",
+) -> None:
+    """Sleep for `seconds`, firing on_retry with a countdown each second."""
+    for remaining in range(seconds, 0, -1):
+        _notify_retry(
+            on_retry,
+            f"Rate limited (attempt {attempt}/{max_attempts}). "
+            f"Retrying in {remaining}s...",
+        )
+        time.sleep(1)
+```
+
+### Threading safety
+
+- `time.sleep()` blocks the calling thread. Safe on main thread (TUI refresh
+  runs on its own thread). Safe on worker threads (they're dedicated to their
+  task).
+- `on_retry` callback fires from the sleeping thread. The existing `on_event`
+  callback is already called from worker threads (in `_run_one_tool`), so the
+  TUI's console locking handles concurrent output.
+- `KeyboardInterrupt` during `time.sleep()` propagates naturally. The engine's
+  `finally` block clears `on_content_delta`/`on_retry`. Verify `_ThinkingDisplay`
+  stops cleanly.
+
+### TUI interaction
+
+- During a 429 wait, the `_ThinkingDisplay` ("Thinking...") may still be active.
+  The retry countdown fires through `on_event`, which appears in the event log.
+  This is adequate for v1 -- a more polished approach (stopping the thinking
+  display and replacing it with a countdown widget) can follow later.
+
+### Security considerations
+
+#### Research Insights
+
+- **Truncate error response bodies:** HTTP error responses can contain large
+  payloads. Cap at 8 KB before embedding in `ModelError` messages to prevent
+  memory issues. Apply in both `_http_stream_sse` and `_http_json`.
+- **Keep `on_retry` messages application-controlled:** The retry message strings
+  are generated by our code, not echoed from the server response. This prevents
+  any server-supplied content from reaching the TUI unescaped.
+- **No credentials in error messages:** The existing pattern already strips
+  `Authorization` headers from error context. No changes needed.
+
+## System-Wide Impact
+
+- **Interaction graph:** `_http_stream_sse()` is called by
+  `OpenAICompatibleModel.complete()` and `AnthropicModel.complete()`, both of
+  which have a fallback retry path for unsupported parameters. All 4 call sites
+  (2 primary + 2 fallback) need `on_retry` threaded through.
+- **Error propagation:** After 5 retries, `ModelError` propagates exactly as
+  today. The error message includes the retry count for debuggability. Format:
+  `"HTTP 429 calling {url}: rate limited, exhausted {N} retries"` (matches
+  existing `"HTTP {code} calling {url}: {detail}"` pattern).
+- **State lifecycle risks:** None. The retry loop is stateless -- it only reads
+  the `Retry-After` header and sleeps. No persistent state is created or modified.
+- **API surface parity:** `_http_json()` gets the same retry logic but without
+  the callback (silent retry). `_http_stream_sse()` gets retry + callback.
+- **Test helpers:** `mock_openai_stream` and `mock_anthropic_stream` in
+  `conftest.py` must accept the new `on_retry` parameter.
+
+#### Research Insights
+
+- **`on_retry` depth gating:** Unlike `on_content_delta` (which only fires at
+  `depth == 0`), `on_retry` should fire at ALL depths. Rate-limit messages are
+  operational status, not content -- the user should always see them regardless
+  of recursion depth. Wire it unconditionally in `_solve_recursive()`.
+- **`finally` block parity:** The `finally` block in `_solve_recursive()` must
+  clear both `on_content_delta` and `on_retry`. Use the existing `hasattr` guard
+  pattern.
+- **Worst-case timing:** 5 rate-limit retries x 120s cap = ~10 minutes max hang
+  per single `_http_stream_sse` call. With the inner connection retry loop, the
+  theoretical ceiling is 5 x (3 connection attempts + 120s sleep) but in practice
+  connection retries are fast failures. Document as known limitation.
+
+## Acceptance Criteria
+
+### Functional
+
+- [x] 429 with `Retry-After: N` header triggers automatic retry after N seconds
+- [x] Missing `Retry-After` header falls back to 5-second wait
+- [x] `Retry-After` values are clamped to [1, 120] seconds
+- [x] Max 5 retry attempts before raising `ModelError`
+- [x] `on_retry` callback receives messages like `"Rate limited (attempt 2/5). Retrying in 8s..."`
+- [x] Countdown ticks once per second (`"...7s"`, `"...6s"`, etc.)
+- [x] Non-429 HTTP errors still raise immediately (no regression)
+- [x] Connection-timeout retries still work independently of 429 retries
+- [x] `_http_json()` retries 429 silently (no callback, same retry logic)
+- [x] After exhausting retries, error message includes retry count
+- [x] Parameter-fallback retry paths (`reasoning_effort`, `thinking`) also get
+      429 protection on their second `_http_stream_sse()` call
+
+### Non-Functional
+
+- [x] Thread-safe: parallel workers can retry independently without corruption
+- [x] `KeyboardInterrupt` during sleep cancels cleanly (no TUI corruption)
+- [x] No new dependencies (stdlib `time.sleep` only)
+- [x] Error response bodies truncated to 8 KB in `ModelError` messages
+- [x] `on_retry` callback failures never kill the retry loop
+
+## MVP
+
+### Step 1: Add 429 retry to `_http_stream_sse()`
+
+**`agent/model.py`**
+
+Add `import time` at top of file. Add `_parse_retry_after()`,
+`_notify_retry()`, and `_sleep_with_countdown()` as module-level helpers. Add
+`on_retry` and `max_rate_limit_retries` parameters to `_http_stream_sse()`.
+Restructure the retry loop with an outer 429-retry layer.
+
+```python
+# agent/model.py - new helpers (above _http_stream_sse)
+
+def _parse_retry_after(exc: urllib.error.HTTPError, default: int = 5) -> int:
+    """Parse Retry-After header from an HTTPError, clamped to [1, 120]."""
+    raw = exc.headers.get("Retry-After")
+    if raw is None:
+        return default
+    try:
+        return max(1, min(120, int(raw)))
+    except (ValueError, TypeError):
+        return default
+
+
+def _notify_retry(on_retry: "Callable[[str], None] | None", msg: str) -> None:
+    """Fire on_retry callback, swallowing any exception."""
+    if on_retry is not None:
+        try:
+            on_retry(msg)
+        except Exception:
+            pass
+
+
+def _sleep_with_countdown(
+    seconds: int,
+    attempt: int,
+    max_attempts: int,
+    on_retry: "Callable[[str], None] | None",
+) -> None:
+    """Sleep for `seconds`, firing on_retry with a countdown each second."""
+    for remaining in range(seconds, 0, -1):
+        _notify_retry(
+            on_retry,
+            f"Rate limited (attempt {attempt}/{max_attempts}). "
+            f"Retrying in {remaining}s...",
+        )
+        time.sleep(1)
+```
+
+```python
+# agent/model.py - _http_stream_sse signature
+def _http_stream_sse(
+    url: str,
+    method: str,
+    headers: dict[str, str],
+    payload: dict[str, Any],
+    first_byte_timeout: float = 10,
+    stream_timeout: float = 120,
+    max_retries: int = 3,
+    on_sse_event: "Callable[[str, dict[str, Any]], None] | None" = None,
+    on_retry: "Callable[[str], None] | None" = None,        # NEW
+    max_rate_limit_retries: int = 5,                         # NEW
+) -> list[tuple[str, dict[str, Any]]]:
+```
+
+### Step 2: Add 429 retry to `_http_json()`
+
+**`agent/model.py`**
+
+Add a retry loop to `_http_json()` for 429 responses. Silent (no callback) since
+the model-listing call chain doesn't pass callbacks.
+
+```python
+# agent/model.py - _http_json with retry loop
+def _http_json(
+    url: str,
+    method: str,
+    headers: dict[str, str],
+    payload: dict[str, Any] | None = None,
+    timeout_sec: int = 90,
+    max_rate_limit_retries: int = 5,                         # NEW
+) -> dict[str, Any]:
+```
+
+### Step 3: Thread `on_retry` through model classes
+
+**`agent/model.py`**
+
+Add `on_retry: Callable[[str], None] | None = None` field to both
+`OpenAICompatibleModel` and `AnthropicModel` (alongside existing
+`on_content_delta`). Pass it through to `_http_stream_sse()` in `complete()` --
+all 4 call sites (2 primary + 2 fallback).
+
+### Step 4: Wire `on_retry` in the engine
+
+**`agent/engine.py`**
+
+Set `model.on_retry` in `_solve_recursive()` before calling `model.complete()`,
+and clear it in the `finally` block. Wire it **unconditionally** (not gated by
+`depth == 0` like `on_content_delta`) so rate-limit messages always reach the TUI.
+
+```python
+# engine.py - in _solve_recursive, alongside on_content_delta setup
+# NOTE: on_retry fires at ALL depths (not depth-gated)
+model.on_retry = (lambda msg: self._emit(msg, on_event)) if on_event else None
+```
+
+```python
+# engine.py - in finally block, clear both callbacks
+if hasattr(model, "on_content_delta"):
+    model.on_content_delta = None
+if hasattr(model, "on_retry"):
+    model.on_retry = None
+```
+
+### Step 5: Update test helpers
+
+**`tests/conftest.py`**
+
+Update `mock_openai_stream` and `mock_anthropic_stream` wrapper signatures to
+accept `on_retry=None` and `max_rate_limit_retries=5`.
+
+### Step 6: Write tests (TDD -- these come first in practice)
+
+**`tests/test_rate_limit.py`** (new file)
+
+Tests must patch `time.sleep` via `unittest.mock.patch("agent.model.time.sleep")`
+to avoid actually sleeping during test runs.
+
+- `test_parse_retry_after_with_valid_header`
+- `test_parse_retry_after_missing_header_uses_default`
+- `test_parse_retry_after_clamped_to_bounds`
+- `test_parse_retry_after_non_integer_uses_default`
+- `test_429_retries_and_succeeds`
+- `test_429_exhausts_retries_raises_model_error`
+- `test_429_on_retry_callback_invoked_with_countdown`
+- `test_non_429_http_error_raises_immediately` (regression guard)
+- `test_429_in_http_json_retries_silently`
+- `test_connection_retry_and_429_retry_are_independent`
+- `test_notify_retry_swallows_callback_exceptions`
+
+## Known Limitations
+
+- **Parallel worker stampede:** Multiple `ThreadPoolExecutor` workers can hit 429
+  simultaneously and all retry at the same Retry-After expiry. No coordination
+  between workers. Acceptable for v1; consider adding jitter later.
+- **`_ThinkingDisplay` overlap:** During a 429 wait, the thinking spinner
+  continues showing "Thinking..." alongside the rate-limit event message. A
+  dedicated countdown widget can be added in a follow-up.
+- **`_http_json()` retries silently:** No countdown for `/model list`. Acceptable
+  since model listing is a quick, infrequent operation.
+- **SSE-level rate limit errors** (Anthropic `overloaded_error` in-stream) are
+  not covered. Only HTTP-level 429 is handled.
+- **HTTP 529 (`overloaded_error`)** is not retried. The architecture supports
+  adding `exc.code in (429, 529)` trivially in a follow-up.
+- **Worst-case hang:** 5 retries x 120s cap = ~10 minutes per call with no
+  total wall-clock timeout. Acceptable for v1; a cumulative timeout can be added.
+
+## References
+
+- Brainstorm: `docs/brainstorms/2026-02-20-429-rate-limit-handling-brainstorm.md`
+- Transport functions: `agent/model.py:98` (`_http_json`), `agent/model.py:206`
+  (`_http_stream_sse`)
+- Engine error handling: `agent/engine.py:329`
+- Existing retry tests: `tests/test_streaming.py:195`
+- Test helpers: `tests/conftest.py:164`
+- Anthropic rate limits docs: https://docs.anthropic.com/en/api/rate-limits
+- Anthropic error types: https://docs.anthropic.com/en/api/errors
diff --git a/tests/conftest.py b/tests/conftest.py
index 85b5ac48..83107f72 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -163,7 +163,7 @@ def _anthropic_dict_to_events(
 
 def mock_openai_stream(fake_http_json_fn):
     """Wrap a _http_json-style mock into a _http_stream_sse-style mock for OpenAI."""
-    def wrapper(url, method, headers, payload, first_byte_timeout=10, stream_timeout=120, max_retries=3, on_sse_event=None):
+    def wrapper(url, method, headers, payload, first_byte_timeout=10, stream_timeout=120, max_retries=3, on_sse_event=None, on_retry=None, max_rate_limit_retries=5):
         result = fake_http_json_fn(url, method, headers, payload=payload, timeout_sec=stream_timeout)
         return _openai_dict_to_events(result)
     return wrapper
@@ -171,7 +171,7 @@ def wrapper(url, method, headers, payload, first_byte_timeout=10, stream_timeout
 
 def mock_anthropic_stream(fake_http_json_fn):
     """Wrap a _http_json-style mock into a _http_stream_sse-style mock for Anthropic."""
-    def wrapper(url, method, headers, payload, first_byte_timeout=10, stream_timeout=120, max_retries=3, on_sse_event=None):
+    def wrapper(url, method, headers, payload, first_byte_timeout=10, stream_timeout=120, max_retries=3, on_sse_event=None, on_retry=None, max_rate_limit_retries=5):
         result = fake_http_json_fn(url, method, headers, payload=payload, timeout_sec=stream_timeout)
         return _anthropic_dict_to_events(result)
     return wrapper
diff --git a/tests/test_model_complex.py b/tests/test_model_complex.py
index 3e6a26c2..e8165cf8 100644
--- a/tests/test_model_complex.py
+++ b/tests/test_model_complex.py
@@ -116,7 +116,7 @@ def test_sorted_models_ordering(self) -> None:
     # 17-19. OpenAICompatibleModel error paths
     # ------------------------------------------------------------------ #
     def test_openai_missing_content_raises(self) -> None:
-        def fake_stream_sse(url, method, headers, payload, first_byte_timeout=10, stream_timeout=120, max_retries=3, on_sse_event=None):
+        def fake_stream_sse(url, method, headers, payload, first_byte_timeout=10, stream_timeout=120, max_retries=3, on_sse_event=None, on_retry=None, max_rate_limit_retries=5):
             # Return events that accumulate to empty choices
             return [("", {"choices": [{"delta": {}, "finish_reason": "stop"}]})]
 
@@ -515,7 +515,7 @@ def test_openai_create_conversation_stores_system_prompt(self) -> None:
     # 37. OpenAI non-retryable ModelError re-raised
     # ------------------------------------------------------------------ #
     def test_openai_non_retryable_error_raised(self) -> None:
-        def fake_stream_sse(url, method, headers, payload, first_byte_timeout=10, stream_timeout=120, max_retries=3, on_sse_event=None):
+        def fake_stream_sse(url, method, headers, payload, first_byte_timeout=10, stream_timeout=120, max_retries=3, on_sse_event=None, on_retry=None, max_rate_limit_retries=5):
             raise ModelError("HTTP 500 server error")
 
         with patch("agent.model._http_stream_sse", fake_stream_sse):
@@ -747,7 +747,7 @@ def fake_http_json(url, method, headers, payload=None, timeout_sec=90):
     # 49. Anthropic non-retryable ModelError re-raised
     # ------------------------------------------------------------------ #
     def test_anthropic_non_retryable_error_raised(self) -> None:
-        def fake_stream_sse(url, method, headers, payload, first_byte_timeout=10, stream_timeout=120, max_retries=3, on_sse_event=None):
+        def fake_stream_sse(url, method, headers, payload, first_byte_timeout=10, stream_timeout=120, max_retries=3, on_sse_event=None, on_retry=None, max_rate_limit_retries=5):
             raise ModelError("HTTP 500 server error")
 
         with patch("agent.model._http_stream_sse", fake_stream_sse):
diff --git a/tests/test_rate_limit.py b/tests/test_rate_limit.py
new file mode 100644
index 00000000..214ea4be
--- /dev/null
+++ b/tests/test_rate_limit.py
@@ -0,0 +1,369 @@
+"""Tests for HTTP 429 rate-limit retry logic in agent.model."""
+from __future__ import annotations
+
+import io
+import socket
+import unittest
+import urllib.error
+from unittest.mock import MagicMock, call, patch
+
+from agent.model import (
+    ModelError,
+    _http_json,
+    _http_stream_sse,
+    _notify_retry,
+    _parse_retry_after,
+    _sleep_with_countdown,
+)
+
+
+# ---------------------------------------------------------------------------
+# _parse_retry_after
+# ---------------------------------------------------------------------------
+
+
+class ParseRetryAfterTests(unittest.TestCase):
+    """Tests for the _parse_retry_after pure helper."""
+
+    def _make_exc(self, retry_after: str | None) -> urllib.error.HTTPError:
+        headers = {}
+        if retry_after is not None:
+            headers["Retry-After"] = retry_after
+        exc = urllib.error.HTTPError(
+            url="http://test",
+            code=429,
+            msg="Too Many Requests",
+            hdrs=headers,
+            fp=io.BytesIO(b"{}"),
+        )
+        return exc
+
+    def test_valid_integer_header(self) -> None:
+        exc = self._make_exc("10")
+        self.assertEqual(_parse_retry_after(exc), 10)
+
+    def test_missing_header_uses_default(self) -> None:
+        exc = self._make_exc(None)
+        self.assertEqual(_parse_retry_after(exc), 5)
+
+    def test_missing_header_custom_default(self) -> None:
+        exc = self._make_exc(None)
+        self.assertEqual(_parse_retry_after(exc, default=30), 30)
+
+    def test_clamped_below_floor(self) -> None:
+        exc = self._make_exc("0")
+        self.assertEqual(_parse_retry_after(exc), 1)
+
+    def test_clamped_above_cap(self) -> None:
+        exc = self._make_exc("999")
+        self.assertEqual(_parse_retry_after(exc), 120)
+
+    def test_non_integer_uses_default(self) -> None:
+        exc = self._make_exc("not-a-number")
+        self.assertEqual(_parse_retry_after(exc), 5)
+
+    def test_negative_value_clamped_to_floor(self) -> None:
+        exc = self._make_exc("-5")
+        self.assertEqual(_parse_retry_after(exc), 1)
+
+
+# ---------------------------------------------------------------------------
+# _notify_retry
+# ---------------------------------------------------------------------------
+
+
+class NotifyRetryTests(unittest.TestCase):
+    """Tests for the _notify_retry callback wrapper."""
+
+    def test_calls_callback_with_message(self) -> None:
+        cb = MagicMock()
+        _notify_retry(cb, "test message")
+        cb.assert_called_once_with("test message")
+
+    def test_none_callback_is_noop(self) -> None:
+        # Should not raise
+        _notify_retry(None, "test message")
+
+    def test_swallows_callback_exception(self) -> None:
+        def broken_cb(msg: str) -> None:
+            raise RuntimeError("boom")
+
+        # Should not raise
+        _notify_retry(broken_cb, "test message")
+
+
+# ---------------------------------------------------------------------------
+# _sleep_with_countdown
+# ---------------------------------------------------------------------------
+
+
+class SleepWithCountdownTests(unittest.TestCase):
+    """Tests for the _sleep_with_countdown helper."""
+
+    @patch("agent.model.time.sleep")
+    def test_sleeps_correct_number_of_seconds(self, mock_sleep: MagicMock) -> None:
+        cb = MagicMock()
+        _sleep_with_countdown(seconds=3, attempt=1, max_attempts=5, on_retry=cb)
+        self.assertEqual(mock_sleep.call_count, 3)
+        mock_sleep.assert_has_calls([call(1), call(1), call(1)])
+
+    @patch("agent.model.time.sleep")
+    def test_countdown_messages_descend(self, mock_sleep: MagicMock) -> None:
+        cb = MagicMock()
+        _sleep_with_countdown(seconds=3, attempt=2, max_attempts=5, on_retry=cb)
+        messages = [c.args[0] for c in cb.call_args_list]
+        self.assertEqual(len(messages), 3)
+        self.assertIn("3s", messages[0])
+        self.assertIn("2s", messages[1])
+        self.assertIn("1s", messages[2])
+        # All messages should include attempt info
+        for msg in messages:
+            self.assertIn("attempt 2/5", msg)
+
+    @patch("agent.model.time.sleep")
+    def test_none_callback_still_sleeps(self, mock_sleep: MagicMock) -> None:
+        _sleep_with_countdown(seconds=2, attempt=1, max_attempts=5, on_retry=None)
+        self.assertEqual(mock_sleep.call_count, 2)
+
+
+# ---------------------------------------------------------------------------
+# _http_stream_sse — 429 retry logic
+# ---------------------------------------------------------------------------
+
+
+def _make_successful_response() -> MagicMock:
+    """Create a mock HTTP response that yields a valid SSE stream."""
+    data = 'data: {"choices":[{"delta":{"content":"ok"},"finish_reason":"stop"}]}\n\ndata: [DONE]\n'
+    resp = MagicMock()
+    resp.__iter__ = lambda self: iter(data.encode().split(b"\n"))
+    resp.__enter__ = lambda self: self
+    resp.__exit__ = lambda self, *a: None
+    resp.fp = MagicMock()
+    resp.close = MagicMock()
+    return resp
+
+
+def _make_429_error(retry_after: str | None = "5") -> urllib.error.HTTPError:
+    """Create a 429 HTTPError with optional Retry-After header."""
+    headers = {}
+    if retry_after is not None:
+        headers["Retry-After"] = retry_after
+    return urllib.error.HTTPError(
+        url="http://test",
+        code=429,
+        msg="Too Many Requests",
+        hdrs=headers,
+        fp=io.BytesIO(b'{"type":"error","error":{"type":"rate_limit_error","message":"rate limited"}}'),
+    )
+
+
+class HttpStreamSSE429Tests(unittest.TestCase):
+    """Tests for 429 retry logic in _http_stream_sse."""
+
+    @patch("agent.model.time.sleep")
+    def test_429_retries_and_succeeds(self, mock_sleep: MagicMock) -> None:
+        """First call returns 429, second succeeds."""
+        call_count = 0
+
+        def fake_urlopen(req, timeout=None):
+            nonlocal call_count
+            call_count += 1
+            if call_count == 1:
+                raise _make_429_error("2")
+            return _make_successful_response()
+
+        with patch("agent.model.urllib.request.urlopen", fake_urlopen):
+            events = _http_stream_sse(
+                url="http://test/v1/chat/completions",
+                method="POST",
+                headers={},
+                payload={"model": "test"},
+                max_rate_limit_retries=5,
+            )
+        self.assertEqual(call_count, 2)
+        self.assertTrue(len(events) > 0)
+
+    @patch("agent.model.time.sleep")
+    def test_429_exhausts_retries_raises_model_error(self, mock_sleep: MagicMock) -> None:
+        """All attempts return 429 → ModelError with retry count."""
+
+        def fake_urlopen(req, timeout=None):
+            raise _make_429_error("1")
+
+        with patch("agent.model.urllib.request.urlopen", fake_urlopen):
+            with self.assertRaises(ModelError) as ctx:
+                _http_stream_sse(
+                    url="http://test/v1/chat/completions",
+                    method="POST",
+                    headers={},
+                    payload={"model": "test"},
+                    max_rate_limit_retries=3,
+                )
+            msg = str(ctx.exception)
+            self.assertIn("429", msg)
+            self.assertIn("3", msg)  # retry count in message
+
+    @patch("agent.model.time.sleep")
+    def test_429_on_retry_callback_invoked_with_countdown(self, mock_sleep: MagicMock) -> None:
+        """on_retry callback receives countdown messages during 429 wait."""
+        call_count = 0
+        retry_messages: list[str] = []
+
+        def fake_urlopen(req, timeout=None):
+            nonlocal call_count
+            call_count += 1
+            if call_count == 1:
+                raise _make_429_error("2")
+            return _make_successful_response()
+
+        def on_retry_cb(msg: str) -> None:
+            retry_messages.append(msg)
+
+        with patch("agent.model.urllib.request.urlopen", fake_urlopen):
+            _http_stream_sse(
+                url="http://test/v1/chat/completions",
+                method="POST",
+                headers={},
+                payload={"model": "test"},
+                on_retry=on_retry_cb,
+                max_rate_limit_retries=5,
+            )
+        # Should have countdown messages (2s, 1s)
+        self.assertEqual(len(retry_messages), 2)
+        self.assertIn("2s", retry_messages[0])
+        self.assertIn("1s", retry_messages[1])
+
+    def test_non_429_http_error_raises_immediately(self) -> None:
+        """HTTP 400 errors should raise immediately without retrying."""
+        call_count = 0
+
+        def fake_urlopen(req, timeout=None):
+            nonlocal call_count
+            call_count += 1
+            raise urllib.error.HTTPError(
+                url="http://test",
+                code=400,
+                msg="Bad Request",
+                hdrs={},
+                fp=io.BytesIO(b'{"error": "bad request"}'),
+            )
+
+        with patch("agent.model.urllib.request.urlopen", fake_urlopen):
+            with self.assertRaises(ModelError) as ctx:
+                _http_stream_sse(
+                    url="http://test/v1/chat/completions",
+                    method="POST",
+                    headers={},
+                    payload={"model": "test"},
+                    max_rate_limit_retries=5,
+                )
+            self.assertIn("HTTP 400", str(ctx.exception))
+        self.assertEqual(call_count, 1)
+
+    @patch("agent.model.time.sleep")
+    def test_connection_retry_and_429_retry_are_independent(self, mock_sleep: MagicMock) -> None:
+        """Connection retries exhaust independently, then 429 retry kicks in."""
+        call_count = 0
+
+        def fake_urlopen(req, timeout=None):
+            nonlocal call_count
+            call_count += 1
+            # First 2 calls: connection timeout (uses connection retry budget)
+            if call_count <= 2:
+                raise socket.timeout("timed out")
+            # Third call succeeds on connection but gets 429
+            if call_count == 3:
+                raise _make_429_error("1")
+            # Fourth call succeeds
+            return _make_successful_response()
+
+        with patch("agent.model.urllib.request.urlopen", fake_urlopen):
+            events = _http_stream_sse(
+                url="http://test/v1/chat/completions",
+                method="POST",
+                headers={},
+                payload={"model": "test"},
+                max_retries=3,
+                max_rate_limit_retries=5,
+            )
+        self.assertEqual(call_count, 4)
+        self.assertTrue(len(events) > 0)
+
+    @patch("agent.model.time.sleep")
+    def test_429_without_retry_after_uses_fallback(self, mock_sleep: MagicMock) -> None:
+        """429 without Retry-After header falls back to 5-second wait."""
+        call_count = 0
+
+        def fake_urlopen(req, timeout=None):
+            nonlocal call_count
+            call_count += 1
+            if call_count == 1:
+                raise _make_429_error(None)  # No Retry-After header
+            return _make_successful_response()
+
+        with patch("agent.model.urllib.request.urlopen", fake_urlopen):
+            _http_stream_sse(
+                url="http://test/v1/chat/completions",
+                method="POST",
+                headers={},
+                payload={"model": "test"},
+                max_rate_limit_retries=5,
+            )
+        # Default fallback is 5 seconds → 5 sleep calls
+        self.assertEqual(mock_sleep.call_count, 5)
+
+
+# ---------------------------------------------------------------------------
+# _http_json — 429 retry logic
+# ---------------------------------------------------------------------------
+
+
+class HttpJson429Tests(unittest.TestCase):
+    """Tests for 429 retry logic in _http_json."""
+
+    @patch("agent.model.time.sleep")
+    def test_429_retries_silently_and_succeeds(self, mock_sleep: MagicMock) -> None:
+        """_http_json retries on 429 without any callback."""
+        call_count = 0
+
+        def fake_urlopen(req, timeout=None):
+            nonlocal call_count
+            call_count += 1
+            if call_count == 1:
+                raise _make_429_error("1")
+            resp = MagicMock()
+            resp.read.return_value = b'{"data": "ok"}'
+            resp.__enter__ = lambda self: self
+            resp.__exit__ = lambda self, *a: None
+            return resp
+
+        with patch("agent.model.urllib.request.urlopen", fake_urlopen):
+            result = _http_json(
+                url="http://test/v1/models",
+                method="GET",
+                headers={},
+                max_rate_limit_retries=5,
+            )
+        self.assertEqual(call_count, 2)
+        self.assertEqual(result["data"], "ok")
+
+    @patch("agent.model.time.sleep")
+    def test_429_exhausts_retries_raises_model_error(self, mock_sleep: MagicMock) -> None:
+        def fake_urlopen(req, timeout=None):
+            raise _make_429_error("1")
+
+        with patch("agent.model.urllib.request.urlopen", fake_urlopen):
+            with self.assertRaises(ModelError) as ctx:
+                _http_json(
+                    url="http://test/v1/models",
+                    method="GET",
+                    headers={},
+                    max_rate_limit_retries=3,
+                )
+            msg = str(ctx.exception)
+            self.assertIn("429", msg)
+            self.assertIn("3", msg)
+
+
+if __name__ == "__main__":
+    unittest.main()

From fccc947907e81be4a1a3700ab6af842565de0dd9 Mon Sep 17 00:00:00 2001
From: Kostas Karachalios <vrinek@hey.com>
Date: Fri, 20 Feb 2026 20:14:27 +0100
Subject: [PATCH 2/3] test: add engine integration test for on_retry wiring

Verify that 429 retry messages reach the engine's on_event callback
end-to-end, and that model.on_retry is cleared after complete() returns.
---
 tests/test_rate_limit.py | 66 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/tests/test_rate_limit.py b/tests/test_rate_limit.py
index 214ea4be..bd9f7a43 100644
--- a/tests/test_rate_limit.py
+++ b/tests/test_rate_limit.py
@@ -3,18 +3,25 @@
 
 import io
 import socket
+import tempfile
 import unittest
 import urllib.error
+from pathlib import Path
 from unittest.mock import MagicMock, call, patch
 
+from agent.config import AgentConfig
+from agent.engine import RLMEngine
 from agent.model import (
+    AnthropicModel,
     ModelError,
+    ModelTurn,
     _http_json,
     _http_stream_sse,
     _notify_retry,
     _parse_retry_after,
     _sleep_with_countdown,
 )
+from agent.tools import WorkspaceTools
 
 
 # ---------------------------------------------------------------------------
@@ -365,5 +372,64 @@ def fake_urlopen(req, timeout=None):
             self.assertIn("3", msg)
 
 
+# ---------------------------------------------------------------------------
+# Engine integration — on_retry wiring
+# ---------------------------------------------------------------------------
+
+
+class EngineOnRetryWiringTests(unittest.TestCase):
+    """Verify the engine sets/clears model.on_retry and messages reach on_event."""
+
+    @patch("agent.model.time.sleep")
+    def test_engine_on_event_receives_retry_messages(self, mock_sleep: MagicMock) -> None:
+        """A 429 during model.complete() should surface retry messages via on_event."""
+        call_count = 0
+
+        def fake_urlopen(req, timeout=None):
+            nonlocal call_count
+            call_count += 1
+            if call_count == 1:
+                raise _make_429_error("1")
+            # Second call: return a valid Anthropic SSE response
+            data = (
+                'event: message_start\ndata: {"type":"message_start","message":{"usage":{"input_tokens":10}}}\n\n'
+                'event: content_block_start\ndata: {"type":"content_block_start","index":0,"content_block":{"type":"text","text":""}}\n\n'
+                'event: content_block_delta\ndata: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"done"}}\n\n'
+                'event: content_block_stop\ndata: {"type":"content_block_stop","index":0}\n\n'
+                'event: message_delta\ndata: {"type":"message_delta","delta":{"stop_reason":"end_turn"},"usage":{"output_tokens":5}}\n\n'
+                'event: message_stop\ndata: {"type":"message_stop"}\n\n'
+            )
+            resp = MagicMock()
+            resp.__iter__ = lambda self: iter(data.encode().split(b"\n"))
+            resp.__enter__ = lambda self: self
+            resp.__exit__ = lambda self, *a: None
+            resp.fp = MagicMock()
+            resp.close = MagicMock()
+            return resp
+
+        events_received: list[str] = []
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            root = Path(tmpdir)
+            cfg = AgentConfig(workspace=root, max_depth=0, max_steps_per_call=1)
+            tools = WorkspaceTools(root=root)
+            model = AnthropicModel(model="test-model", api_key="test-key")
+            engine = RLMEngine(model=model, tools=tools, config=cfg)
+
+            with patch("agent.model.urllib.request.urlopen", fake_urlopen):
+                engine.solve(
+                    "test objective",
+                    on_event=lambda msg: events_received.append(msg),
+                )
+
+        # The retry message should appear in the event stream
+        retry_events = [e for e in events_received if "Rate limited" in e]
+        self.assertTrue(len(retry_events) > 0, f"Expected retry events, got: {events_received}")
+        self.assertIn("1s", retry_events[0])
+
+        # on_retry should be cleared after complete() returns
+        self.assertIsNone(model.on_retry)
+
+
 if __name__ == "__main__":
     unittest.main()

From 79aac6bea70c9f37a6c70fe5dbb4f1890e68b0c8 Mon Sep 17 00:00:00 2001
From: Kostas Karachalios <vrinek@hey.com>
Date: Fri, 20 Feb 2026 20:22:57 +0100
Subject: [PATCH 3/3] fix: add depth/step prefix to rate-limit retry messages

Retry countdown messages now include the [d0/s6] prefix matching
all other engine trace lines, for consistent TUI output.
---
 agent/engine.py          | 4 +++-
 tests/test_rate_limit.py | 3 ++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/agent/engine.py b/agent/engine.py
index f04845df..2c209c56 100644
--- a/agent/engine.py
+++ b/agent/engine.py
@@ -327,7 +327,9 @@ def _solve_recursive(
                 model.on_content_delta = on_content_delta
             # Rate-limit messages fire at all depths
             if on_event and hasattr(model, "on_retry"):
-                model.on_retry = lambda msg: self._emit(msg, on_event)
+                model.on_retry = lambda msg, _d=depth, _s=step: self._emit(
+                    f"[d{_d}/s{_s}] {msg}", on_event
+                )
             try:
                 turn = model.complete(conversation)
             except ModelError as exc:
diff --git a/tests/test_rate_limit.py b/tests/test_rate_limit.py
index bd9f7a43..69d43f7d 100644
--- a/tests/test_rate_limit.py
+++ b/tests/test_rate_limit.py
@@ -422,10 +422,11 @@ def fake_urlopen(req, timeout=None):
                     on_event=lambda msg: events_received.append(msg),
                 )
 
-        # The retry message should appear in the event stream
+        # The retry message should appear in the event stream with depth/step prefix
         retry_events = [e for e in events_received if "Rate limited" in e]
         self.assertTrue(len(retry_events) > 0, f"Expected retry events, got: {events_received}")
         self.assertIn("1s", retry_events[0])
+        self.assertRegex(retry_events[0], r"\[d\d+/s\d+\]")
 
         # on_retry should be cleared after complete() returns
         self.assertIsNone(model.on_retry)