Merged
1 change: 1 addition & 0 deletions Cargo.lock


1 change: 1 addition & 0 deletions Cargo.toml
@@ -15,6 +15,7 @@ clap = { version = "4", features = ["derive", "env"] }
dirs = "5"
dotenvy = "0.15"
glob = "0.3"
once_cell = "1"
regex = "1"
rustyline = { version = "17", features = ["with-file-history"] }
serde = { version = "1", features = ["derive"] }
183 changes: 183 additions & 0 deletions IDEAS.md
@@ -0,0 +1,183 @@
# Future Optimization Ideas for `-O` Mode

This document captures ideas for future enhancements to the `-O` optimization flag, building on research showing that shorter, denser prompts improve LLM performance.

## Research Foundation

- **Context Rot**: Accuracy degrades as prompts grow longer (Chroma study on 18 models)
- **LLMLingua**: 20x compression with only 1.5% performance loss
- **Positive Framing**: "Do this" outperforms "don't do this" in prompts
- **Signal Density**: "Find the smallest set of high-signal tokens that maximize the likelihood of your desired outcome"

Sources:
- https://github.com/microsoft/LLMLingua
- https://www.anthropic.com/engineering/effective-context-engineering-for-ai-agents
- https://gritdaily.com/impact-prompt-length-llm-performance/

---

## Implemented Layers

### Layer 1: Terse System Prompt
- Reduced from ~60 tokens to ~15 tokens
- Positive framing: "AI-to-AI mode. Maximum information density. Structure over prose. No narration."

### Layer 2: Compressed Tool Schemas
- Tool descriptions shortened (e.g., "Read file content. Paths relative to project root." → "Read file")
- Parameter descriptions stripped in optimize mode
- Uses `SchemaOptions` struct for extensibility

---

## Future Layers

### Layer 3: Tool Result Compression

**Concept**: Strip metadata from tool results in `-O` mode.

Current Read result:
```json
{
  "path": "foo.rs",
  "offset": 0,
  "truncated": false,
  "content": "...",
  "sha256": "abc123",
  "lines": 42
}
```

Optimized result:
```json
{"content": "..."}
```

**Implementation**:
- Add `optimize` flag to `tools::execute()`
- Conditionally strip fields: `path`, `offset`, `truncated`, `sha256`, `lines`
- Keep only essential data needed for task completion

**Estimated token savings**: 30-50% per tool result
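A minimal sketch of the stripping step, assuming the tool result has already been parsed into a string map (`compress_result` is a hypothetical name; the real implementation would operate on a `serde_json::Value` inside `tools::execute()`):

```rust
use std::collections::BTreeMap;

/// Hypothetical helper: retain only the fields essential for task
/// completion, dropping metadata like `path`, `sha256`, and `lines`.
fn compress_result(result: &BTreeMap<String, String>) -> BTreeMap<String, String> {
    const KEEP: &[&str] = &["content"];
    result
        .iter()
        .filter(|(key, _)| KEEP.contains(&key.as_str()))
        .map(|(key, value)| (key.clone(), value.clone()))
        .collect()
}

fn main() {
    let mut full = BTreeMap::new();
    full.insert("path".to_string(), "foo.rs".to_string());
    full.insert("content".to_string(), "fn main() {}".to_string());
    full.insert("sha256".to_string(), "abc123".to_string());

    let compressed = compress_result(&full);
    assert_eq!(compressed.len(), 1);
    assert!(compressed.contains_key("content"));
}
```

The allow-list inverts naturally: keeping only essential fields stays correct as new metadata fields are added, whereas a deny-list would need updating each time.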

---

### Layer 4: History Summarization

**Concept**: Compress older conversation turns to maintain context while reducing tokens.

**Approaches**:
1. **Sliding Window**: Keep only last N turns in full, summarize older ones
2. **Semantic Compression**: Use small model to compress verbose assistant responses
3. **Result Deduplication**: Merge repeated tool results (e.g., multiple Read calls on same file)

**Implementation ideas**:
- Add `conversation_compressor` module
- Trigger compression when context exceeds threshold
- Preserve tool call/result structure for agent continuity

**Research reference**: LLMLingua-2 achieves 3-6x faster compression with task-agnostic distillation
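The sliding-window approach can be sketched as follows (hypothetical `compress_history` function; a real version would summarize the dropped turns with a small model rather than emit a one-line placeholder):

```rust
/// Keep the last `window` messages verbatim and collapse everything
/// older into a single summary placeholder.
fn compress_history(messages: &[String], window: usize) -> Vec<String> {
    if messages.len() <= window {
        return messages.to_vec();
    }
    let dropped = messages.len() - window;
    let mut out = vec![format!("[summary of {dropped} earlier messages]")];
    out.extend_from_slice(&messages[dropped..]);
    out
}

fn main() {
    let history: Vec<String> = (1..=10).map(|i| format!("turn {i}")).collect();
    let compact = compress_history(&history, 3);
    assert_eq!(compact.len(), 4); // 1 summary + 3 recent turns
    assert_eq!(compact[3], "turn 10");
}
```

Note that a production version must preserve tool call/result pairing across the window boundary, or the provider API will reject the history.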

---

### Layer 5: Output Style Enforcement

**Concept**: Enforce structured output format in `-O` mode.

**Current**: LLM outputs natural language explanations mixed with actions
**Optimized**: Pure structured output, no prose

**Implementation ideas**:
1. **Structured Output Schema**: Add JSON schema for responses
2. **Response Format Instruction**: "Respond only with tool calls or structured JSON"
3. **Post-processing**: Strip explanation text, keep only actions

**Example transformation**:
```
Before: "I'll read the config file to understand the settings. Let me use the Read tool..."
After: [tool_call: Read, path: "config.toml"]
```

**Trade-off**: May reduce transparency for human review, but ideal for AI-to-AI pipelines
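A rough sketch of option 3 (post-processing), assuming bracketed tool-call lines as in the example above; the real filter would target whatever structured format Layer 5 settles on:

```rust
/// Hypothetical post-processor: drop free-form prose, keep only lines
/// that look like structured actions (bracketed tool-call lines).
fn strip_prose(response: &str) -> String {
    response
        .lines()
        .filter(|line| line.trim_start().starts_with('['))
        .collect::<Vec<_>>()
        .join("\n")
}

fn main() {
    let raw = "I'll read the config file first.\n[tool_call: Read, path: \"config.toml\"]";
    assert_eq!(strip_prose(raw), "[tool_call: Read, path: \"config.toml\"]");
}
```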

---

### Layer 6: Dynamic Tool Injection

**Concept**: Only include tool schemas likely needed for the current task.

**Current**: All 8 tools included in every request
**Optimized**: Analyze prompt, inject relevant subset

**Heuristics**:
- "read", "view", "show" → Read, Grep, Glob
- "edit", "modify", "change" → Read, Edit, Write
- "run", "execute", "test", "build" → Bash
- "find", "search" → Grep, Glob
- "delegate", "subagent" → Task

**Implementation**:
- Add `infer_tools_from_prompt(prompt: &str) -> Vec<ToolName>`
- Apply before schema generation
- Fall back to full toolset if uncertain
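The heuristics above can be sketched as a keyword table (tool names beyond the seven mentioned in this document are placeholders — `WebFetch` here is invented to round out the set of 8):

```rust
/// Map prompt keywords to the tools likely needed; fall back to the
/// full toolset when no heuristic matches.
fn infer_tools_from_prompt(prompt: &str) -> Vec<&'static str> {
    // Placeholder full toolset; the eighth name is hypothetical.
    const FULL_TOOLSET: &[&str] = &[
        "Read", "Edit", "Write", "Grep", "Glob", "Bash", "Task", "WebFetch",
    ];
    let lowered = prompt.to_lowercase();
    let rules: &[(&[&str], &[&str])] = &[
        (&["read", "view", "show"], &["Read", "Grep", "Glob"]),
        (&["edit", "modify", "change"], &["Read", "Edit", "Write"]),
        (&["run", "execute", "test", "build"], &["Bash"]),
        (&["find", "search"], &["Grep", "Glob"]),
        (&["delegate", "subagent"], &["Task"]),
    ];
    let mut tools: Vec<&'static str> = Vec::new();
    for (keywords, names) in rules {
        if keywords.iter().any(|k| lowered.contains(*k)) {
            for &name in names.iter() {
                if !tools.contains(&name) {
                    tools.push(name);
                }
            }
        }
    }
    if tools.is_empty() {
        FULL_TOOLSET.to_vec()
    } else {
        tools
    }
}

fn main() {
    assert_eq!(infer_tools_from_prompt("run the unit tests"), vec!["Bash"]);
    assert_eq!(infer_tools_from_prompt("tell me a joke").len(), 8);
}
```

Substring matching is deliberately loose ("tests" matches "test"); the fallback makes false negatives the only real risk, and those degrade to the current behavior.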

---

### Layer 7: CodeAgents-Style Pseudocode

**Concept**: Use structured pseudocode instead of natural language for reasoning.

**Research**: CodeAgents framework reduces tokens by 55-87%.

**Current**:
```
I need to first read the file to understand its structure, then I'll make the edit...
```

**Optimized**:
```
PLAN: Read("src/main.rs") -> Edit(find="old", replace="new")
```

**Implementation**:
- Add `--reasoning-format=pseudocode` option
- Train/prompt model to use structured planning notation
- Parse pseudocode for execution
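The parsing step could look roughly like this (a toy sketch splitting on the `->` separator; a real implementation would need a proper grammar for nested arguments):

```rust
/// Parse a "PLAN:" line into (tool, raw-arguments) steps.
fn parse_plan(line: &str) -> Vec<(String, String)> {
    let Some(body) = line.strip_prefix("PLAN:") else {
        return Vec::new();
    };
    body.split("->")
        .filter_map(|step| {
            let step = step.trim();
            let open = step.find('(')?;
            let close = step.rfind(')')?;
            Some((step[..open].to_string(), step[open + 1..close].to_string()))
        })
        .collect()
}

fn main() {
    let steps = parse_plan(r#"PLAN: Read("src/main.rs") -> Edit(find="old", replace="new")"#);
    assert_eq!(steps.len(), 2);
    assert_eq!(steps[0].0, "Read");
    assert_eq!(steps[1].0, "Edit");
}
```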

---

## Measurement & Validation

To validate optimization effectiveness:

1. **Token Counting**: Compare input/output tokens with and without `-O`
2. **Task Success Rate**: Ensure optimizations don't reduce accuracy
3. **Latency**: Measure time-to-first-token improvement
4. **Cost**: Calculate API cost savings

**Suggested benchmarks**:
- Simple file read/edit tasks
- Multi-step refactoring tasks
- Codebase exploration tasks

---

## Configuration Ideas

Future `SchemaOptions` extensions:
```rust
pub struct SchemaOptions {
    pub optimize: bool,
    // Future fields:
    pub compress_results: bool,
    pub dynamic_tools: bool,
    pub pseudocode_reasoning: bool,
    pub max_history_turns: Option<usize>,
}
```

Command-line exposure:
```
yo -O # Enable all optimizations
yo -O --no-compress # Optimize schemas but not results
yo --optimize-level=2 # Granular control
```
2 changes: 0 additions & 2 deletions fixtures/mcp_calc_server/src/main.rs
@@ -6,8 +6,6 @@ use std::io::{self, BufRead, Write};

#[derive(Deserialize)]
struct JsonRpcRequest {
#[allow(dead_code)]
jsonrpc: String,
id: Option<u64>,
method: String,
params: Option<Value>,
91 changes: 73 additions & 18 deletions src/agent.rs
@@ -2,7 +2,7 @@

use crate::{
cli::Context,
llm,
llm::{self, LlmClient},
plan::{self, PlanPhase},
policy::Decision,
tools,
@@ -34,6 +34,15 @@ impl CommandStats {
}
}

/// Result of a turn, including stats and continuation info
#[derive(Debug, Default, Clone)]
pub struct TurnResult {
pub stats: CommandStats,
/// If true, a Stop hook requested continuation with the given prompt
pub force_continue: bool,
pub continue_prompt: Option<String>,
}

const SYSTEM_PROMPT: &str = r#"You are an agentic coding assistant running locally.
You can only access files via tools. All paths are relative to the project root.
Use Glob/Grep to find files before Read. Before Edit/Write, explain what you will change.
@@ -53,12 +62,8 @@ fn verbose(ctx: &Context, message: &str) {
}
}

pub fn run_turn(
ctx: &Context,
user_input: &str,
messages: &mut Vec<Value>,
) -> Result<CommandStats> {
let mut stats = CommandStats::default();
pub fn run_turn(ctx: &Context, user_input: &str, messages: &mut Vec<Value>) -> Result<TurnResult> {
let mut result = TurnResult::default();
let _ = ctx.transcript.borrow_mut().user_message(user_input);

messages.push(json!({
@@ -111,9 +116,10 @@ pub fn run_turn(
}

// Get built-in tool schemas (including Task for main agent) and add MCP tools
let schema_opts = tools::SchemaOptions::new(ctx.args.optimize);
let mut tool_schemas = if in_planning_mode {
// In planning mode, only provide read-only tools
tools::schemas()
tools::schemas(&schema_opts)
.into_iter()
.filter(|schema| {
if let Some(name) = schema
@@ -128,7 +134,7 @@
})
.collect()
} else {
tools::schemas_with_task()
tools::schemas_with_task(&schema_opts)
};

// Only add MCP tools if not in planning mode
@@ -187,6 +193,11 @@ pub fn run_turn(
SYSTEM_PROMPT.to_string()
};

// Add optimization mode instructions if -O flag is set
if ctx.args.optimize {
system_prompt.push_str("\n\nAI-to-AI mode. Maximum information density. Structure over prose. No narration.");
}

// Add skill pack index
let skill_index = ctx.skill_index.borrow();
let skill_prompt = skill_index.format_for_prompt(50);
@@ -222,8 +233,25 @@

// Track token usage from this LLM call
if let Some(usage) = &response.usage {
stats.input_tokens += usage.prompt_tokens;
stats.output_tokens += usage.completion_tokens;
result.stats.input_tokens += usage.prompt_tokens;
result.stats.output_tokens += usage.completion_tokens;

// Record cost for this operation
let turn_number = *ctx.turn_counter.borrow();
let op = ctx.session_costs.borrow_mut().record_operation(
turn_number,
&target.model,
usage.prompt_tokens,
usage.completion_tokens,
);

// Log token usage to transcript
let _ = ctx.transcript.borrow_mut().token_usage(
&target.model,
usage.prompt_tokens,
usage.completion_tokens,
op.cost_usd,
);
}

if response.choices.is_empty() {
@@ -234,6 +262,13 @@
let choice = &response.choices[0];
let msg = &choice.message;

// Warn if response was truncated due to length limit
if choice.finish_reason.as_deref() == Some("length") {
eprintln!(
"⚠️ Response truncated (max tokens reached). Consider increasing max_tokens or using /compact."
);
}

if let Some(content) = &msg.content {
if !content.is_empty() {
println!("{}", content);
@@ -311,7 +346,7 @@ pub fn run_turn(
let args: Value = serde_json::from_str(&tc.function.arguments).unwrap_or(json!({}));

// Count this tool use
stats.tool_uses += 1;
result.stats.tool_uses += 1;

trace(
ctx,
@@ -411,9 +446,9 @@ pub fn run_turn(
}
} else if name == "Task" {
// Execute Task tool (subagent delegation)
let (result, sub_stats) = tools::task::execute(args.clone(), ctx)?;
stats.merge(&sub_stats);
result
let (task_result, sub_stats) = tools::task::execute(args.clone(), ctx)?;
result.stats.merge(&sub_stats);
task_result
} else if name.starts_with("mcp.") {
// Execute MCP tool
let start = std::time::Instant::now();
@@ -508,8 +543,28 @@ pub fn run_turn(
}
}

// Run Stop hooks (note: force_continue not implemented yet)
let _ = ctx.hooks.borrow().on_stop("end_turn", None);
// Run Stop hooks - may request continuation
let last_assistant_message = messages.iter().rev().find_map(|m| {
if m["role"].as_str() == Some("assistant") {
m["content"].as_str().map(|s| s.to_string())
} else {
None
}
});

let (force_continue, continue_prompt) = ctx
.hooks
.borrow()
.on_stop("end_turn", last_assistant_message.as_deref());

// If force_continue is requested, signal to caller to run another turn
if force_continue {
if let Some(prompt) = continue_prompt {
result.force_continue = true;
result.continue_prompt = Some(prompt);
verbose(ctx, "Stop hook requested continuation");
}
}

Ok(stats)
Ok(result)
}