Skip to content

Commit edf83ac

Browse files
author
cellwebb
committed
feat(streaming): simplify to streaming-only implementation
- Remove create_message() method from base provider interface
- Remove all backwards compatibility code and dual-mode logic
- Implement streaming-only architecture across all providers
- Add real-time SSE streaming with proper error handling
- Simplify agent loop to use streaming by default
- Update tests to work with streaming-only interface
- Remove environment variable disable options
- Improve user experience with consistent real-time responses

BREAKING CHANGE: create_message() method removed - use stream_message() only
1 parent cd1ce53 commit edf83ac

7 files changed

Lines changed: 307 additions & 90 deletions

File tree

src/clippy/agent/conversation.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -256,12 +256,17 @@ def compact_conversation(
256256
# Build temporary conversation for summarization
257257
summarization_conversation = [system_msg] + to_summarize + [summary_request]
258258

259-
# Call LLM to create summary
260-
response = provider.create_message(
259+
# Call LLM to create summary using streaming
260+
response_content = ""
261+
for chunk in provider.stream_message(
261262
messages=summarization_conversation,
262263
tools=[], # No tools needed for summarization
263264
model=model,
264-
)
265+
):
266+
if chunk.get("content"):
267+
response_content += chunk["content"]
268+
269+
response = {"content": response_content}
265270

266271
summary_content = response.get("content", "")
267272
if not summary_content:

src/clippy/agent/loop.py

Lines changed: 66 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -145,12 +145,10 @@ def run_agent_loop(
145145

146146
logger.debug(f"Loaded {len(tools)} tools for iteration {iteration}")
147147

148-
# Call provider (returns OpenAI message dict)
148+
# Call provider using streaming
149149
try:
150-
response = config.provider.create_message(
151-
messages=conversation_history,
152-
tools=tools,
153-
model=config.model,
150+
response = _process_streaming_response(
151+
config.provider, conversation_history, tools, config.model, config.console
154152
)
155153

156154
# Track token usage from this API call
@@ -205,12 +203,7 @@ def run_agent_loop(
205203
# Add to conversation history
206204
conversation_history.append(assistant_message)
207205

208-
# Print assistant's text response to the user
209-
if response.get("content"):
210-
content = response["content"]
211-
if isinstance(content, str) and content.strip():
212-
cleaned_content = content.lstrip("\n")
213-
config.console.print(f"\n[bold blue][📎][/bold blue] {escape(cleaned_content)}")
206+
# Note: Response content is already displayed during streaming
214207

215208
# Save conversation automatically after each assistant message
216209
if config.parent_agent is not None:
@@ -288,6 +281,68 @@ def run_agent_loop(
288281
# Note: No maximum iterations limit - loop runs until agent completes or is interrupted
289282

290283

284+
def _process_streaming_response(
285+
provider: LLMProvider,
286+
conversation_history: list[dict[str, Any]],
287+
tools: list[dict[str, Any]],
288+
model: str,
289+
console: ConsoleProtocol,
290+
) -> dict[str, Any]:
291+
"""
292+
Process streaming response from provider and display in real-time.
293+
294+
Args:
295+
provider: LLM provider instance
296+
conversation_history: Current conversation history
297+
tools: List of available tools
298+
model: Model identifier
299+
console: Console for output
300+
301+
Returns:
302+
Final consolidated response
303+
"""
304+
# Start with an empty header for the response
305+
console.print("\n[bold blue][📎][/bold blue] ", end="")
306+
307+
# Track accumulated content and tool calls
308+
accumulated_content = ""
309+
accumulated_tool_calls: list[dict[str, Any]] = []
310+
311+
for chunk in provider.stream_message(conversation_history, tools, model):
312+
# Handle streaming content deltas
313+
if chunk.get("delta") and chunk.get("content"):
314+
# Print the chunk directly for real-time display
315+
console.print(escape(chunk["content"]), end="")
316+
accumulated_content += chunk["content"]
317+
elif chunk.get("content") and not chunk.get("delta"):
318+
# Final full content (for non-streaming models like codex)
319+
accumulated_content = chunk["content"]
320+
console.print(escape(accumulated_content))
321+
elif not chunk.get("delta"):
322+
# Final chunk with complete response
323+
if chunk.get("content"):
324+
accumulated_content = chunk["content"]
325+
if not accumulated_content.strip() and accumulated_tool_calls:
326+
# Only tool calls, no content - print newline
327+
console.print()
328+
329+
# Return the final consolidated response
330+
return {
331+
"role": "assistant",
332+
"content": accumulated_content if accumulated_content else None,
333+
"tool_calls": accumulated_tool_calls if accumulated_tool_calls else None,
334+
"finish_reason": chunk.get("finish_reason"),
335+
}
336+
337+
# If we get here, return what we accumulated
338+
return {
339+
"role": "assistant",
340+
"content": accumulated_content if accumulated_content else None,
341+
"tool_calls": accumulated_tool_calls if accumulated_tool_calls else None,
342+
"finish_reason": "stop",
343+
}
344+
345+
291346
def _display_auto_compaction_notification(console: ConsoleProtocol, stats: dict[str, Any]) -> None:
292347
"""
293348
Display a subtle but informative auto-compaction notification.

src/clippy/llm/base.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from __future__ import annotations
44

5+
from collections.abc import Iterator
56
from dataclasses import dataclass, field
67
from typing import Any
78

@@ -29,23 +30,23 @@ class LLMResponse:
2930
class BaseProvider:
    """Abstract base for all LLM providers.

    Concrete providers implement stream_message() to yield OpenAI-format
    response chunks; there is no non-streaming entry point.
    """

    def stream_message(
        self,
        messages: list[dict[str, Any]],
        tools: list[dict[str, Any]] | None = None,
        model: str = "gpt-5-mini",
        **kwargs: Any,
    ) -> Iterator[dict[str, Any]]:
        """Stream a chat completion.

        Args:
            messages: List of messages in OpenAI format
            tools: Optional list of tool definitions in OpenAI format
            model: Model identifier
            **kwargs: Additional provider-specific arguments

        Yields:
            Streaming response chunks in OpenAI format

        Raises:
            NotImplementedError: Always; subclasses must override.
        """
        raise NotImplementedError
5152

src/clippy/llm/http_client.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22

33
from __future__ import annotations
44

5+
import json as _json
56
import logging
7+
from collections.abc import Iterator
68
from typing import Any
79

810
import httpx
@@ -71,6 +73,61 @@ def post_with_retry(
7173
return response
7274

7375

76+
def stream_with_retry(
    client: httpx.Client,
    url: str,
    json: dict[str, Any],
    headers: dict[str, str],
) -> Iterator[dict[str, Any]]:
    """POST with streaming response for SSE.

    Args:
        client: httpx Client instance
        url: URL to POST to
        json: JSON payload
        headers: HTTP headers

    Yields:
        Parsed SSE data chunks

    Raises:
        RetryableHTTPError: For retryable status codes
        httpx.HTTPStatusError: For non-retryable errors
    """
    try:
        with client.stream(
            "POST",
            url,
            json=json,
            headers=headers,
        ) as response:
            if response.status_code in RETRYABLE_STATUS_CODES:
                logger.warning(f"Retryable error {response.status_code} from {url}")
                raise RetryableHTTPError(response.status_code, "Retryable status code")

            response.raise_for_status()

            # Parse SSE (Server-Sent Events) stream
            for line in response.iter_lines():
                if not line.strip():
                    continue  # Skip empty keep-alive lines
                if line.startswith("data:"):
                    # BUGFIX: per the SSE spec the "data:" prefix is followed
                    # by an *optional* single space; accept both forms instead
                    # of silently dropping "data:{...}" lines.
                    data = line[5:]
                    if data.startswith(" "):
                        data = data[1:]
                    if data.strip() == "[DONE]":
                        break
                    try:
                        yield _json.loads(data)
                    except _json.JSONDecodeError:
                        logger.warning(f"Failed to parse SSE data: {data}")
                        continue

    except httpx.HTTPStatusError as e:
        if e.response.status_code in RETRYABLE_STATUS_CODES:
            # BUGFIX: chain the original error so the HTTP failure is not
            # lost from tracebacks.
            # NOTE(review): e.response is a streamed response; .text may
            # require the body to have been read - confirm against httpx
            # behavior for unread streaming responses.
            raise RetryableHTTPError(e.response.status_code, e.response.text) from e
        raise
74131
def create_client(timeout: httpx.Timeout | None = None) -> httpx.Client:
75132
"""Create a configured httpx client.
76133

0 commit comments

Comments
 (0)