diff --git a/agent/config.py b/agent/config.py index 86d368c4..83239de2 100644 --- a/agent/config.py +++ b/agent/config.py @@ -35,6 +35,9 @@ class AgentConfig: voyage_api_key: str | None = None max_depth: int = 4 max_steps_per_call: int = 100 + budget_extension_enabled: bool = True + budget_extension_block_steps: int = 20 + budget_extension_max_blocks: int = 2 max_observation_chars: int = 6000 command_timeout_sec: int = 45 shell: str = "/bin/sh" @@ -45,6 +48,10 @@ class AgentConfig: session_root_dir: str = ".openplanter" max_persisted_observations: int = 400 max_solve_seconds: int = 0 + rate_limit_max_retries: int = 12 + rate_limit_backoff_base_sec: float = 1.0 + rate_limit_backoff_max_sec: float = 60.0 + rate_limit_retry_after_cap_sec: float = 120.0 recursive: bool = True min_subtask_depth: int = 0 acceptance_criteria: bool = True @@ -68,6 +75,18 @@ def from_env(cls, workspace: str | Path) -> "AgentConfig": "OPENPLANTER_BASE_URL", "https://api.openai.com/v1", ) + budget_extension_enabled = ( + os.getenv("OPENPLANTER_BUDGET_EXTENSION_ENABLED", "true").strip().lower() + in {"1", "true", "yes"} + ) + budget_extension_block_steps = max( + 1, + int(os.getenv("OPENPLANTER_BUDGET_EXTENSION_BLOCK_STEPS", "20")), + ) + budget_extension_max_blocks = max( + 0, + int(os.getenv("OPENPLANTER_BUDGET_EXTENSION_MAX_BLOCKS", "2")), + ) return cls( workspace=ws, provider=os.getenv("OPENPLANTER_PROVIDER", "auto").strip().lower() or "auto", @@ -89,6 +108,9 @@ def from_env(cls, workspace: str | Path) -> "AgentConfig": voyage_api_key=voyage_api_key, max_depth=int(os.getenv("OPENPLANTER_MAX_DEPTH", "4")), max_steps_per_call=int(os.getenv("OPENPLANTER_MAX_STEPS", "100")), + budget_extension_enabled=budget_extension_enabled, + budget_extension_block_steps=budget_extension_block_steps, + budget_extension_max_blocks=budget_extension_max_blocks, max_observation_chars=int(os.getenv("OPENPLANTER_MAX_OBS_CHARS", "6000")), command_timeout_sec=int(os.getenv("OPENPLANTER_CMD_TIMEOUT", "45")), 
shell=os.getenv("OPENPLANTER_SHELL", "/bin/sh"), @@ -99,6 +121,16 @@ def from_env(cls, workspace: str | Path) -> "AgentConfig": session_root_dir=os.getenv("OPENPLANTER_SESSION_DIR", ".openplanter"), max_persisted_observations=int(os.getenv("OPENPLANTER_MAX_PERSISTED_OBS", "400")), max_solve_seconds=int(os.getenv("OPENPLANTER_MAX_SOLVE_SECONDS", "0")), + rate_limit_max_retries=int(os.getenv("OPENPLANTER_RATE_LIMIT_MAX_RETRIES", "12")), + rate_limit_backoff_base_sec=float( + os.getenv("OPENPLANTER_RATE_LIMIT_BACKOFF_BASE_SEC", "1.0") + ), + rate_limit_backoff_max_sec=float( + os.getenv("OPENPLANTER_RATE_LIMIT_BACKOFF_MAX_SEC", "60.0") + ), + rate_limit_retry_after_cap_sec=float( + os.getenv("OPENPLANTER_RATE_LIMIT_RETRY_AFTER_CAP_SEC", "120.0") + ), recursive=os.getenv("OPENPLANTER_RECURSIVE", "true").strip().lower() in ("1", "true", "yes"), min_subtask_depth=int(os.getenv("OPENPLANTER_MIN_SUBTASK_DEPTH", "0")), acceptance_criteria=os.getenv("OPENPLANTER_ACCEPTANCE_CRITERIA", "true").strip().lower() in ("1", "true", "yes"), diff --git a/agent/engine.py b/agent/engine.py index 06c526ca..33ff033c 100644 --- a/agent/engine.py +++ b/agent/engine.py @@ -1,6 +1,7 @@ from __future__ import annotations import json +import random import re import time import threading @@ -12,7 +13,7 @@ from typing import Any, Callable from .config import AgentConfig -from .model import BaseModel, ImageData, ModelError, ModelTurn, ToolCall, ToolResult +from .model import BaseModel, ImageData, ModelError, ModelTurn, RateLimitError, ToolCall, ToolResult from .prompts import build_system_prompt from .replay_log import ReplayLogger from .tool_defs import get_tool_definitions @@ -23,6 +24,36 @@ ContentDeltaCallback = Callable[[str, str], None] +_RECON_TOOL_NAMES = { + "list_files", + "search_files", + "repo_map", + "web_search", + "fetch_url", + "read_file", + "read_image", + "list_artifacts", + "read_artifact", +} +_ARTIFACT_TOOL_NAMES = { + "write_file", + "apply_patch", + "edit_file", + 
"hashline_edit", +} +_WEAK_STRUCTURAL_META_PATTERNS = ( + re.compile(r"^\s*(here(?:'s| is)\s+(?:my|the)\s+(?:plan|approach|analysis))\b", re.I), +) +_STRONG_PROCESS_META_PATTERNS = ( + re.compile(r"\b(i\s+(?:will|can|should|need to|want to|am going to|plan to))\b", re.I), + re.compile(r"\b(let me|next,?\s+i\s+will|i\s+should\s+start\s+by)\b", re.I), +) +_META_DELIVERABLE_OBJECTIVE_PATTERN = re.compile( + r"\b(plan(?:ning)?|approach|strategy|outline|spec(?:ification)?|design|roadmap|proposal|review|audit|analysis|analyze|brainstorm)\b", + re.I, +) + + def _summarize_args(args: dict[str, Any], max_len: int = 120) -> str: """One-line summary of tool call arguments.""" parts: list[str] = [] @@ -63,6 +94,10 @@ def _summarize_observation(text: str, max_len: int = 200) -> str: } _DEFAULT_CONTEXT_WINDOW = 128_000 _CONDENSATION_THRESHOLD = 0.75 +_BUDGET_EXTENSION_WINDOW = 12 +_MIN_EXTENSION_PROGRESS_SIGNALS = 2 +_MIN_MEANINGFUL_RESULT_CHARS = 24 +_NON_PROGRESS_TOOL_NAMES = _RECON_TOOL_NAMES | {"think"} def _model_tier(model_name: str, reasoning_effort: str | None = None) -> int: @@ -122,15 +157,16 @@ def summary(self, max_items: int = 12, max_chars: int = 8000) -> str: @dataclass class TurnSummary: - """Lightweight summary of a completed agent turn for session continuity.""" + """Compact, serializable summary for a completed top-level turn.""" + turn_number: int objective: str - result_preview: str # first ~200 chars - timestamp: str # ISO 8601 UTC + result_preview: str + timestamp: str steps_used: int = 0 replay_seq_start: int = 0 - def to_dict(self) -> dict[str, Any]: + def to_dict(self) -> dict[str, int | str]: return { "turn_number": self.turn_number, "objective": self.objective, @@ -141,17 +177,235 @@ def to_dict(self) -> dict[str, Any]: } @classmethod - def from_dict(cls, d: dict[str, Any]) -> "TurnSummary": + def from_dict(cls, payload: dict[str, object]) -> "TurnSummary": return cls( - turn_number=d["turn_number"], - objective=d["objective"], - 
result_preview=d["result_preview"], - timestamp=d["timestamp"], - steps_used=d.get("steps_used", 0), - replay_seq_start=d.get("replay_seq_start", 0), + turn_number=int(payload["turn_number"]), + objective=str(payload.get("objective", "")), + result_preview=str(payload.get("result_preview", "")), + timestamp=str(payload.get("timestamp", "")), + steps_used=int(payload.get("steps_used", 0) or 0), + replay_seq_start=int(payload.get("replay_seq_start", 0) or 0), ) +@dataclass +class StepProgressRecord: + step: int + phase: str + step_signature: str + tool_count: int + failed_tool_step: bool + successful_action_signatures: set[str] = field(default_factory=set) + state_delta_signatures: set[str] = field(default_factory=set) + completed_previews: list[str] = field(default_factory=list) + + +def _normalize_progress_fragment(text: str, max_len: int = 120) -> str: + collapsed = re.sub(r"\s+", " ", text.strip().lower()) + collapsed = re.sub(r"^(?:\[[^\]]+\]\s*)+", "", collapsed) + if len(collapsed) > max_len: + collapsed = collapsed[: max_len - 3] + "..." 
+ return collapsed + + +def _action_signature(name: str, args: dict[str, Any]) -> str: + payload = json.dumps(args, sort_keys=True, separators=(",", ":"), ensure_ascii=True) + payload = payload[:160] + return f"{name}|{payload}" + + +def _looks_like_failed_tool_result(name: str, result: ToolResult) -> bool: + if result.is_error: + return True + content = result.content.strip() + normalized = _normalize_progress_fragment(content, max_len=200) + exit_match = re.search(r"\[exit_code=(-?\d+)\]", content) + if exit_match: + try: + if int(exit_match.group(1)) != 0: + return True + except ValueError: + pass + failure_prefixes = ( + "file not found:", + "path is a directory, not a file:", + "failed to ", + "blocked:", + "blocked by policy:", + "unsupported image format:", + "image too large:", + "max recursion depth reached;", + "cannot delegate to higher-tier model", + "task cancelled.", + "tool ", + ) + if normalized.startswith(failure_prefixes): + return True + if normalized.startswith("search_files requires ") or normalized.startswith("read_file requires "): + return True + if normalized.startswith("run_shell requires ") or normalized.startswith("apply_patch requires "): + return True + return " crashed:" in normalized + + +def _build_step_progress_record( + step: int, + phase: str, + tool_calls: list[ToolCall], + results: list[ToolResult], +) -> StepProgressRecord: + tool_names = [tc.name for tc in tool_calls] + has_artifact = any(name in _ARTIFACT_TOOL_NAMES for name in tool_names) + failed_results = [ + _looks_like_failed_tool_result(tool_call.name, result) + for tool_call, result in zip(tool_calls, results) + ] + has_error = any(failed_results) + record = StepProgressRecord( + step=step, + phase=phase, + step_signature=f"{','.join(sorted(tool_names))}|artifact={int(has_artifact)}|error={int(has_error)}", + tool_count=len(tool_calls), + failed_tool_step=has_error, + ) + for tool_call, result, failed_result in zip(tool_calls, results, failed_results): + if 
failed_result or tool_call.name in _NON_PROGRESS_TOOL_NAMES: + continue + normalized_result = _normalize_progress_fragment(result.content) + if len(normalized_result) < _MIN_MEANINGFUL_RESULT_CHARS: + continue + record.successful_action_signatures.add(_action_signature(tool_call.name, tool_call.arguments)) + record.state_delta_signatures.add(f"{tool_call.name}|{normalized_result}") + preview = _summarize_observation(result.content) + if preview not in record.completed_previews: + record.completed_previews.append(preview) + return record + + +def _evaluate_budget_extension( + records: list[StepProgressRecord], + *, + recon_streak: int, +) -> dict[str, Any]: + window = records[-_BUDGET_EXTENSION_WINDOW:] + tool_steps = sum(1 for record in window if record.tool_count > 0) + failed_steps = sum(1 for record in window if record.failed_tool_step) + failure_ratio = (failed_steps / tool_steps) if tool_steps else 0.0 + + repeated_signature_streak = 1 + current_streak = 1 + previous_signature: str | None = None + for record in window: + if previous_signature is not None and record.step_signature == previous_signature: + current_streak += 1 + else: + current_streak = 1 + previous_signature = record.step_signature + repeated_signature_streak = max(repeated_signature_streak, current_streak) + + prior_action_signatures: set[str] = set() + for record in records[: max(0, len(records) - len(window))]: + prior_action_signatures.update(record.successful_action_signatures) + + recent_action_signatures: set[str] = set() + recent_state_delta_signatures: set[str] = set() + has_build_or_finalize = False + for record in window: + recent_action_signatures.update(record.successful_action_signatures) + recent_state_delta_signatures.update(record.state_delta_signatures) + has_build_or_finalize = has_build_or_finalize or record.phase in {"build", "finalize"} + + novel_action_signatures = recent_action_signatures - prior_action_signatures + positive_signals = 0 + if len(novel_action_signatures) 
>= 2: + positive_signals += 1 + if len(recent_state_delta_signatures) >= 2: + positive_signals += 1 + if has_build_or_finalize: + positive_signals += 1 + + blockers: list[str] = [] + if repeated_signature_streak >= 3: + blockers.append("repeated_signatures") + if failure_ratio > 0.6: + blockers.append("high_failure_ratio") + if recon_streak >= 4: + blockers.append("recon_streak") + + return { + "eligible": not blockers and positive_signals >= _MIN_EXTENSION_PROGRESS_SIGNALS, + "window_size": len(window), + "repeated_signature_streak": repeated_signature_streak, + "failure_ratio": failure_ratio, + "novel_action_count": len(novel_action_signatures), + "state_delta_count": len(recent_state_delta_signatures), + "has_build_or_finalize": has_build_or_finalize, + "positive_signals": positive_signals, + "blockers": blockers, + } + + +def _suggest_next_actions( + objective: str, + evaluation: dict[str, Any], + recent_previews: list[str], +) -> list[str]: + actions: list[str] = [] + blockers = set(evaluation.get("blockers", [])) + if "repeated_signatures" in blockers: + actions.append("Stop retrying the same command pattern and switch to a different source or tactic.") + if "high_failure_ratio" in blockers: + actions.append("Triage the failing tool calls first so the next run is not dominated by avoidable errors.") + if "recon_streak" in blockers: + actions.append("Move from exploration into artifact-building or synthesis before doing more reconnaissance.") + if recent_previews: + actions.append("Turn the completed findings below into a concrete artifact or summary before resuming deeper work.") + actions.append(f"Resume the objective with a narrower next slice: {objective}") + return actions[:4] + + +def _render_partial_completion( + objective: str, + loop_metrics: dict[str, Any], + evaluation: dict[str, Any], + records: list[StepProgressRecord], +) -> str: + recent_previews: list[str] = [] + for record in reversed(records[-_BUDGET_EXTENSION_WINDOW:]): + for preview in 
record.completed_previews: + if preview not in recent_previews: + recent_previews.append(preview) + if len(recent_previews) >= 3: + break + if len(recent_previews) >= 3: + break + next_actions = _suggest_next_actions(objective, evaluation, recent_previews) + completed = recent_previews or ["The run gathered additional context but did not converge on a final artifact before the bounded limit."] + remaining = ( + "Finish the deliverable using the completed work below and avoid repeating the stalled loop." + if recent_previews + else "Finish the deliverable with a narrower plan or a different tactic." + ) + reason = str(loop_metrics.get("termination_reason", "budget_no_progress")) + header = ( + f"Partial completion for objective: {objective}\n" + f"Stopped after {int(loop_metrics.get('steps', 0))} steps " + f"with {int(loop_metrics.get('extensions_granted', 0))} budget extension(s). " + f"Termination reason: {reason}." + ) + completed_block = "\n".join(f"- {item}" for item in completed) + next_actions_block = "\n".join(f"- {item}" for item in next_actions) + return ( + f"{header}\n\n" + "Completed work:\n" + f"{completed_block}\n\n" + "Remaining work:\n" + f"- {remaining}\n\n" + "Suggested next actions:\n" + f"{next_actions_block}" + ) + + @dataclass class RLMEngine: model: BaseModel @@ -167,6 +421,7 @@ class RLMEngine: _shell_command_counts: dict[tuple[int, str], int] = field(default_factory=dict) _cancel: threading.Event = field(default_factory=threading.Event) _pending_image: threading.local = field(default_factory=threading.local) + last_loop_metrics: dict[str, Any] = field(default_factory=dict) def __post_init__(self) -> None: if not self.system_prompt: @@ -197,6 +452,7 @@ def solve_with_context( on_content_delta: ContentDeltaCallback | None = None, replay_logger: ReplayLogger | None = None, turn_history: list[TurnSummary] | None = None, + question_reasoning_packet: dict[str, Any] | None = None, ) -> tuple[str, ExternalContext]: if not objective.strip(): return 
"No objective provided.", context or ExternalContext() @@ -216,6 +472,7 @@ def solve_with_context( deadline=deadline, replay_logger=replay_logger, turn_history=turn_history, + question_reasoning_packet=question_reasoning_packet, ) finally: cleanup = getattr(self.tools, "cleanup_bg_jobs", None) @@ -298,6 +555,19 @@ def _judge_result( except Exception as exc: return f"PASS\n(judge error: {exc})" + def _objective_allows_meta_final(self, objective: str) -> bool: + return bool(_META_DELIVERABLE_OBJECTIVE_PATTERN.search(objective)) + + def _is_meta_final_text(self, text: str, objective: str = "") -> bool: + stripped = text.strip() + if not stripped: + return True + if any(pattern.search(stripped) for pattern in _STRONG_PROCESS_META_PATTERNS): + return True + if any(pattern.search(stripped) for pattern in _WEAK_STRUCTURAL_META_PATTERNS): + return not self._objective_allows_meta_final(objective) + return False + def _solve_recursive( self, objective: str, @@ -310,6 +580,7 @@ def _solve_recursive( model_override: BaseModel | None = None, replay_logger: ReplayLogger | None = None, turn_history: list[TurnSummary] | None = None, + question_reasoning_packet: dict[str, Any] | None = None, ) -> str: model = model_override or self.model @@ -349,11 +620,42 @@ def _solve_recursive( f"{len(turn_history)} prior turn(s). " f"Read replay.jsonl/events.jsonl in session_dir for full details." 
) + if depth == 0 and question_reasoning_packet is not None: + initial_msg_dict["question_reasoning_packet"] = question_reasoning_packet initial_message = json.dumps(initial_msg_dict, ensure_ascii=True) conversation = model.create_conversation(self.system_prompt, initial_message) - if replay_logger and replay_logger._seq == 0: + loop_metrics: dict[str, Any] = { + "steps": 0, + "model_turns": 0, + "tool_calls": 0, + "phase_counts": {"investigate": 0, "build": 0, "iterate": 0, "finalize": 0}, + "recon_streak": 0, + "max_recon_streak": 0, + "guardrail_warnings": 0, + "final_rejections": 0, + "last_guardrail_streak": 0, + "budget_extension_enabled": bool(self.config.budget_extension_enabled), + "budget_extension_block_steps": int(self.config.budget_extension_block_steps), + "budget_extension_max_blocks": int(self.config.budget_extension_max_blocks), + "extensions_granted": 0, + "extension_eligible_checks": 0, + "extension_denials_no_progress": 0, + "extension_denials_cap": 0, + "termination_reason": "", + } + step_records: list[StepProgressRecord] = [] + active_step_budget = self.config.max_steps_per_call + max_total_steps = self.config.max_steps_per_call + ( + self.config.budget_extension_block_steps * self.config.budget_extension_max_blocks + if self.config.budget_extension_enabled + else 0 + ) + + self.last_loop_metrics = loop_metrics + + if replay_logger and replay_logger.needs_header: replay_logger.write_header( provider=type(model).__name__, model=getattr(model, "model", "(unknown)"), @@ -364,12 +666,16 @@ def _solve_recursive( temperature=getattr(model, "temperature", None), ) - for step in range(1, self.config.max_steps_per_call + 1): + for step in range(1, max_total_steps + 1): if self._cancel.is_set(): self._emit(f"[d{depth}] cancelled by user", on_event) + loop_metrics["termination_reason"] = "cancelled" + self.last_loop_metrics = loop_metrics return "Task cancelled." 
if deadline and time.monotonic() > deadline: self._emit(f"[d{depth}] wall-clock limit reached", on_event) + loop_metrics["termination_reason"] = "time_limit" + self.last_loop_metrics = loop_metrics return "Time limit exceeded. Try a more focused objective." self._emit(f"[d{depth}/s{step}] calling model...", on_event) t0 = time.monotonic() @@ -377,14 +683,56 @@ def _solve_recursive( if on_content_delta and depth == 0 and hasattr(model, "on_content_delta"): model.on_content_delta = on_content_delta try: - turn = model.complete(conversation) + rate_limit_retries = 0 + while True: + if self._cancel.is_set(): + self._emit(f"[d{depth}] cancelled by user", on_event) + self.last_loop_metrics = loop_metrics + return "Task cancelled." + try: + turn = model.complete(conversation) + break + except RateLimitError as exc: + if rate_limit_retries >= self.config.rate_limit_max_retries: + self._emit(f"[d{depth}/s{step}] model error: {exc}", on_event) + loop_metrics["termination_reason"] = "model_error" + self.last_loop_metrics = loop_metrics + return f"Model error at depth {depth}, step {step}: {exc}" + rate_limit_retries += 1 + delay: float | None = None + if exc.retry_after_sec is not None: + delay = min( + max(exc.retry_after_sec, 0.0), + self.config.rate_limit_retry_after_cap_sec, + ) + if delay is None: + delay = self.config.rate_limit_backoff_base_sec * (2 ** (rate_limit_retries - 1)) + delay += random.uniform(0.0, 0.25) + delay = min(delay, self.config.rate_limit_backoff_max_sec) + if deadline and (time.monotonic() + delay) > deadline: + self._emit(f"[d{depth}] wall-clock limit reached", on_event) + loop_metrics["termination_reason"] = "time_limit" + self.last_loop_metrics = loop_metrics + return "Time limit exceeded. Try a more focused objective." + provider_code = f" ({exc.provider_code})" if exc.provider_code is not None else "" + self._emit( + f"[d{depth}/s{step}] rate limited{provider_code}. 
" + f"Sleeping {delay:.1f}s before retry {rate_limit_retries}/{self.config.rate_limit_max_retries}...", + on_event, + ) + if delay > 0: + time.sleep(delay) except ModelError as exc: self._emit(f"[d{depth}/s{step}] model error: {exc}", on_event) + loop_metrics["termination_reason"] = "model_error" + self.last_loop_metrics = loop_metrics return f"Model error at depth {depth}, step {step}: {exc}" finally: if hasattr(model, "on_content_delta"): model.on_content_delta = None elapsed = time.monotonic() - t0 + loop_metrics["steps"] = step + loop_metrics["model_turns"] += 1 if replay_logger: try: @@ -434,6 +782,7 @@ def _solve_recursive( "output_tokens": turn.output_tokens, "elapsed_sec": round(elapsed, 2), "is_final": False, + "phase": "model", } ) except Exception: @@ -441,11 +790,31 @@ def _solve_recursive( # No tool calls + text present = final answer if not turn.tool_calls and turn.text: + if self._is_meta_final_text(turn.text, objective): + loop_metrics["final_rejections"] += 1 + self._emit( + f"[d{depth}/s{step}] rejected meta final-answer text; requesting concrete completion", + on_event, + ) + rejection_result = ToolResult( + tool_call_id="meta-final-reject", + name="system", + content=( + "Final-answer candidate rejected: response is meta/process text. " + "Provide a concrete completion summary (what was produced/changed) " + "instead of describing what you will do next." + ), + ) + model.append_tool_results(conversation, [rejection_result]) + continue + loop_metrics["phase_counts"]["finalize"] += 1 + loop_metrics["termination_reason"] = "success" preview = turn.text[:200] + "..." 
if len(turn.text) > 200 else turn.text self._emit( f"[d{depth}/s{step}] final answer ({len(turn.text)} chars, {elapsed:.1f}s): {preview}", on_event, ) + self.last_loop_metrics = loop_metrics if on_step: try: on_step( @@ -456,6 +825,8 @@ def _solve_recursive( "action": {"name": "final", "arguments": {"text": turn.text}}, "observation": turn.text, "is_final": True, + "phase": "finalize", + "loop_metrics": dict(loop_metrics), } ) except Exception: @@ -475,6 +846,23 @@ def _solve_recursive( # Log tool calls from model tc_names = [tc.name for tc in turn.tool_calls] + loop_metrics["tool_calls"] += len(tc_names) + has_recon = any(name in _RECON_TOOL_NAMES for name in tc_names) + has_artifact = any(name in _ARTIFACT_TOOL_NAMES for name in tc_names) + if has_recon and not has_artifact and all(name in _RECON_TOOL_NAMES for name in tc_names): + loop_metrics["recon_streak"] += 1 + loop_metrics["phase_counts"]["investigate"] += 1 + elif has_artifact: + loop_metrics["recon_streak"] = 0 + loop_metrics["last_guardrail_streak"] = 0 + loop_metrics["phase_counts"]["build"] += 1 + else: + loop_metrics["recon_streak"] = 0 + loop_metrics["last_guardrail_streak"] = 0 + loop_metrics["phase_counts"]["iterate"] += 1 + loop_metrics["max_recon_streak"] = max( + int(loop_metrics["max_recon_streak"]), int(loop_metrics["recon_streak"]) + ) self._emit( f"[d{depth}/s{step}] model returned {len(turn.tool_calls)} tool call(s) ({elapsed:.1f}s): {', '.join(tc_names)}", on_event, @@ -512,6 +900,7 @@ def _solve_recursive( if parallel and final_answer is None: group_id = f"d{depth}-s{step}-{time.monotonic_ns()}" + use_parallel_owner = len(parallel) > 1 begin_group = getattr(self.tools, "begin_parallel_write_group", None) end_group = getattr(self.tools, "end_parallel_write_group", None) if callable(begin_group): @@ -526,7 +915,7 @@ def _solve_recursive( deadline=deadline, current_model=model, replay_logger=replay_logger, parallel_group_id=group_id, - parallel_owner=f"{tc.id or 'tc'}:{idx}", + 
parallel_owner=(f"{tc.id or 'tc'}:{idx}" if use_parallel_owner else None), ): idx for idx, tc in parallel } @@ -546,7 +935,7 @@ def _solve_recursive( # Timestamp + step budget + context usage awareness if final_answer is None and results: - budget_total = self.config.max_steps_per_call + budget_total = active_step_budget remaining = budget_total - step ts_tag = f"[{datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')}]" budget_tag = f"[Step {step}/{budget_total}]" @@ -583,6 +972,42 @@ def _solve_recursive( image=rl.image, ) + phase_name = ( + "build" + if has_artifact + else "investigate" + if has_recon and all(name in _RECON_TOOL_NAMES for name in tc_names) + else "iterate" + ) + step_records.append( + _build_step_progress_record( + step=step, + phase=phase_name, + tool_calls=turn.tool_calls, + results=results, + ) + ) + + if ( + final_answer is None + and results + and int(loop_metrics["recon_streak"]) >= 3 + and not has_artifact + and int(loop_metrics.get("last_guardrail_streak", 0)) == 0 + ): + loop_metrics["guardrail_warnings"] += 1 + loop_metrics["last_guardrail_streak"] = int(loop_metrics["recon_streak"]) + soft_warning = ToolResult( + "recon-guardrail", + "system", + ( + "Soft guardrail: you've spent multiple consecutive steps in read/list/search mode " + "without producing artifacts. Move to implementation now (edit files, run targeted " + "validation, and return concrete outputs)." 
+ ), + ) + results.append(soft_warning) + # Plan injection — find newest *.plan.md in session dir, append to last result if self.session_dir is not None and results and final_answer is None: try: @@ -615,14 +1040,69 @@ def _solve_recursive( if final_answer is not None: self._emit(f"[d{depth}] completed in {step} step(s)", on_event) + loop_metrics["termination_reason"] = "success" + self.last_loop_metrics = loop_metrics return final_answer for r in results: context.add(f"[depth {depth} step {step}]\n{r.content}") - return ( - f"Step budget exhausted at depth {depth} for objective: {objective}\n" - "Please try with a more specific task, higher step budget, or deeper recursion." + if step >= active_step_budget: + evaluation = _evaluate_budget_extension( + step_records, + recon_streak=int(loop_metrics.get("recon_streak", 0)), + ) + loop_metrics["extension_eligible_checks"] = int( + loop_metrics.get("extension_eligible_checks", 0) + ) + 1 + loop_metrics["last_budget_extension_eval"] = evaluation + can_extend = ( + self.config.budget_extension_enabled + and int(loop_metrics.get("extensions_granted", 0)) < self.config.budget_extension_max_blocks + and bool(evaluation.get("eligible")) + ) + if can_extend: + loop_metrics["extensions_granted"] = int(loop_metrics.get("extensions_granted", 0)) + 1 + active_step_budget += self.config.budget_extension_block_steps + extension_notice = ToolResult( + tool_call_id="budget-extension", + name="system", + content=( + "Progress-based budget extension granted. You have a small number of extra steps. " + "Finish the deliverable now and avoid repeating the same loop." 
+ ), + ) + model.append_tool_results(conversation, [extension_notice]) + continue + + if int(loop_metrics.get("extensions_granted", 0)) >= self.config.budget_extension_max_blocks: + loop_metrics["extension_denials_cap"] = int(loop_metrics.get("extension_denials_cap", 0)) + 1 + loop_metrics["termination_reason"] = "budget_cap" + else: + loop_metrics["extension_denials_no_progress"] = int( + loop_metrics.get("extension_denials_no_progress", 0) + ) + 1 + loop_metrics["termination_reason"] = "budget_no_progress" + self.last_loop_metrics = loop_metrics + return _render_partial_completion(objective, loop_metrics, evaluation, step_records) + + loop_metrics["termination_reason"] = "budget_cap" + self.last_loop_metrics = loop_metrics + return _render_partial_completion( + objective, + loop_metrics, + { + "eligible": False, + "window_size": 0, + "repeated_signature_streak": 0, + "failure_ratio": 0.0, + "novel_action_count": 0, + "state_delta_count": 0, + "has_build_or_finalize": False, + "positive_signals": 0, + "blockers": ["max_total_steps"], + }, + step_records, ) def _run_one_tool( @@ -667,6 +1147,7 @@ def _run_one_tool( current_model=current_model, replay_logger=replay_logger, step=step, + child_conversation_owner=parallel_owner, ) except Exception as exc: observation = f"Tool {tc.name} crashed: {type(exc).__name__}: {exc}" @@ -714,6 +1195,7 @@ def _apply_tool_call( current_model: BaseModel | None = None, replay_logger: ReplayLogger | None = None, step: int = 0, + child_conversation_owner: str | None = None, ) -> tuple[bool, str]: name = tool_call.name args = tool_call.arguments @@ -881,7 +1363,10 @@ def _apply_tool_call( subtask_model = self._model_cache[cache_key] self._emit(f"[d{depth}] >> entering subtask: {objective}", on_event) - child_logger = replay_logger.child(depth, step) if replay_logger else None + child_logger = ( + replay_logger.child(depth, step, owner=child_conversation_owner) + if replay_logger else None + ) subtask_result = self._solve_recursive( 
objective=objective, depth=depth + 1, @@ -937,7 +1422,10 @@ def _apply_tool_call( cur.tool_defs = get_tool_definitions(include_subtask=False, include_acceptance_criteria=self.config.acceptance_criteria) self._emit(f"[d{depth}] >> executing leaf: {objective}", on_event) - child_logger = replay_logger.child(depth, step) if replay_logger else None + child_logger = ( + replay_logger.child(depth, step, owner=child_conversation_owner) + if replay_logger else None + ) exec_result = self._solve_recursive( objective=objective, depth=depth + 1, diff --git a/agent/investigation_state.py b/agent/investigation_state.py new file mode 100644 index 00000000..2a94813d --- /dev/null +++ b/agent/investigation_state.py @@ -0,0 +1,1107 @@ +from __future__ import annotations + +import copy +import json +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +SCHEMA_VERSION = "1.0.0" +ONTOLOGY_NAMESPACE = "openplanter.core" +ONTOLOGY_VERSION = "2026-03" +LOW_CONFIDENCE_THRESHOLD = 0.60 +VERY_LOW_CONFIDENCE_THRESHOLD = 0.40 +MAX_CANDIDATE_ACTIONS = 24 +REQUIRED_EVIDENCE_COUNT = 1 +_PRIORITY_RANK = {"critical": 0, "high": 1, "medium": 2, "low": 3} +_SUGGESTED_TOOLS = { + "search": ["web_search", "fetch_url", "search_files", "read_file"], + "verify_claim": ["web_search", "fetch_url", "read_file", "search_files"], +} +_LEGACY_KNOWN_KEYS = { + "session_id", + "saved_at", + "external_observations", + "observations", + "turn_history", + "loop_metrics", +} + + +def utc_now_iso() -> str: + return datetime.now(timezone.utc).isoformat() + + +def default_state(session_id: str, now: str | None = None) -> dict[str, Any]: + ts = now or utc_now_iso() + return { + "schema_version": SCHEMA_VERSION, + "session_id": session_id, + "created_at": ts, + "updated_at": ts, + "objective": "", + "ontology": { + "namespace": ONTOLOGY_NAMESPACE, + "version": ONTOLOGY_VERSION, + }, + "entities": {}, + "links": {}, + "claims": {}, + "evidence": {}, + "hypotheses": {}, + "questions": 
{}, + "tasks": {}, + "actions": {}, + "provenance_nodes": {}, + "confidence_profiles": {}, + "timeline": [], + "indexes": { + "by_external_ref": {}, + "by_tag": {}, + }, + "legacy": { + "external_observations": [], + "turn_history": [], + "loop_metrics": {}, + "extra_fields": {}, + }, + } + + +def normalize_legacy_state(session_id: str, raw_state: dict[str, Any]) -> dict[str, Any]: + state = raw_state if isinstance(raw_state, dict) else {} + observations = state.get("external_observations") + if not isinstance(observations, list): + observations = _observations_from_rust_state(state) + + normalized = { + "session_id": str(state.get("session_id") or session_id), + "saved_at": str(state.get("saved_at") or utc_now_iso()), + "external_observations": _string_list(observations), + "turn_history": _json_list(state.get("turn_history")), + "loop_metrics": _json_object(state.get("loop_metrics")), + } + normalized.update(_extra_fields_from_legacy_state(state)) + return normalized + + +def migrate_legacy_state( + session_id: str, + legacy_state: dict[str, Any], + now: str | None = None, +) -> dict[str, Any]: + normalized = normalize_legacy_state(session_id, legacy_state) + ts = now or str(normalized.get("saved_at") or utc_now_iso()) + migrated = default_state(session_id=session_id, now=ts) + migrated["updated_at"] = ts + migrated["legacy"] = { + "external_observations": list(normalized.get("external_observations", [])), + "turn_history": _json_list(normalized.get("turn_history")), + "loop_metrics": _json_object(normalized.get("loop_metrics")), + "extra_fields": { + key: value + for key, value in normalized.items() + if key not in {"session_id", "saved_at", "external_observations", "turn_history", "loop_metrics"} + }, + } + return upsert_legacy_observations(migrated, migrated["legacy"]["external_observations"], now=ts) + + +def state_to_legacy_projection(state: dict[str, Any], session_id: str) -> dict[str, Any]: + legacy = state.get("legacy", {}) + legacy_dict = legacy if 
isinstance(legacy, dict) else {} + projected = { + "session_id": str(state.get("session_id") or session_id), + "saved_at": str(state.get("updated_at") or utc_now_iso()), + "external_observations": _legacy_observations_from_state(state), + "turn_history": _json_list(legacy_dict.get("turn_history")), + "loop_metrics": _json_object(legacy_dict.get("loop_metrics")), + } + extras = legacy_dict.get("extra_fields") + if isinstance(extras, dict): + projected.update(copy.deepcopy(extras)) + return projected + + +def upsert_legacy_observations( + state: dict[str, Any], + observations: list[str], + now: str | None = None, +) -> dict[str, Any]: + ts = now or utc_now_iso() + out = copy.deepcopy(state) + out.setdefault("schema_version", SCHEMA_VERSION) + out.setdefault("session_id", "") + out.setdefault("created_at", ts) + out["updated_at"] = ts + out.setdefault( + "ontology", + { + "namespace": ONTOLOGY_NAMESPACE, + "version": ONTOLOGY_VERSION, + }, + ) + out.setdefault("entities", {}) + out.setdefault("links", {}) + out.setdefault("claims", {}) + out.setdefault("hypotheses", {}) + out.setdefault("questions", {}) + out.setdefault("tasks", {}) + out.setdefault("actions", {}) + out.setdefault("provenance_nodes", {}) + out.setdefault("confidence_profiles", {}) + out.setdefault("timeline", []) + + indexes = out.setdefault("indexes", {}) + if not isinstance(indexes, dict): + indexes = {} + out["indexes"] = indexes + by_external_ref = indexes.setdefault("by_external_ref", {}) + if not isinstance(by_external_ref, dict): + by_external_ref = {} + indexes["by_external_ref"] = by_external_ref + indexes.setdefault("by_tag", {}) + + legacy = out.setdefault("legacy", {}) + if not isinstance(legacy, dict): + legacy = {} + out["legacy"] = legacy + legacy["external_observations"] = [str(item) for item in observations] + legacy.setdefault("turn_history", []) + legacy.setdefault("loop_metrics", {}) + legacy.setdefault("extra_fields", {}) + + evidence = out.setdefault("evidence", {}) + if not 
isinstance(evidence, dict): + evidence = {} + out["evidence"] = evidence + + for index, observation in enumerate(observations): + evidence_id = _legacy_evidence_id(index) + source_uri = _legacy_source_uri(index) + existing = evidence.get(evidence_id) + record = existing if isinstance(existing, dict) else {} + created_at = str(record.get("created_at") or ts) + record.update( + { + "id": evidence_id, + "evidence_type": "legacy_observation", + "content": str(observation), + "source_uri": source_uri, + "normalization": { + "kind": "legacy_observation", + "normalization_version": "legacy-v1", + }, + "provenance_ids": [], + "confidence_id": None, + "created_at": created_at, + "updated_at": ts, + } + ) + evidence[evidence_id] = record + by_external_ref[source_uri] = evidence_id + + keep_ids = {_legacy_evidence_id(index) for index in range(len(observations))} + for evidence_id in list(evidence.keys()): + record = evidence.get(evidence_id) + if _is_legacy_evidence(evidence_id, record) and evidence_id not in keep_ids: + del evidence[evidence_id] + + for key in list(by_external_ref.keys()): + value = by_external_ref.get(key) + if ( + isinstance(key, str) + and key.startswith("state.json#external_observations[") + and isinstance(value, str) + and value.startswith("ev_legacy_") + and value not in keep_ids + ): + del by_external_ref[key] + + return out + + +def load_investigation_state(path: Path) -> dict[str, Any]: + state = json.loads(path.read_text(encoding="utf-8")) + if not isinstance(state, dict): + raise json.JSONDecodeError("Investigation state must be a JSON object", str(path), 0) + return state + + +def save_investigation_state(path: Path, state: dict[str, Any]) -> None: + path.write_text(json.dumps(state, indent=2), encoding="utf-8") + + +def build_question_reasoning_packet( + state: dict[str, Any], + *, + max_questions: int = 8, + max_evidence_per_item: int = 6, +) -> dict[str, Any]: + """Build a question-centric reasoning packet from canonical typed state.""" + + 
questions = state.get("questions") if isinstance(state.get("questions"), dict) else {} + claims = state.get("claims") if isinstance(state.get("claims"), dict) else {} + evidence = state.get("evidence") if isinstance(state.get("evidence"), dict) else {} + provenance_nodes = state.get("provenance_nodes") if isinstance(state.get("provenance_nodes"), dict) else {} + entities = state.get("entities") if isinstance(state.get("entities"), dict) else {} + links = state.get("links") if isinstance(state.get("links"), dict) else {} + + unresolved_questions: list[dict[str, Any]] = [] + question_records: dict[str, dict[str, Any]] = {} + for question_id, raw_question in questions.items(): + if not isinstance(raw_question, dict): + continue + origin = raw_question.get("origin") if isinstance(raw_question.get("origin"), dict) else {} + status = str(raw_question.get("status") or "open").lower() + if status in {"resolved", "closed", "wont_fix", "won't_fix"}: + continue + + normalized_question = { + "id": str(raw_question.get("id") or question_id), + "question": str(raw_question.get("question_text") or raw_question.get("question") or ""), + "status": status, + "priority": str(raw_question.get("priority") or "medium").lower(), + "claim_ids": _id_list(raw_question.get("claim_ids") or raw_question.get("claims") or origin.get("claim_ids")), + "evidence_ids": _id_list(raw_question.get("evidence_ids") or origin.get("evidence_ids"))[:max_evidence_per_item], + "triggers": _id_list( + raw_question.get("trigger") + or raw_question.get("triggers") + or origin.get("trigger") + or origin.get("triggers") + ), + "updated_at": str(raw_question.get("updated_at") or ""), + } + unresolved_questions.append(normalized_question) + question_records[normalized_question["id"]] = raw_question + + unresolved_questions.sort(key=_question_priority_sort_key) + focus_questions = unresolved_questions[: max(1, max_questions)] + + supported: list[dict[str, Any]] = [] + contested: list[dict[str, Any]] = [] + 
unresolved: list[dict[str, Any]] = [] + contradictions: list[dict[str, Any]] = [] + claim_records: dict[str, dict[str, Any]] = {} + claim_summaries: dict[str, dict[str, Any]] = {} + + for claim_id, raw_claim in claims.items(): + if not isinstance(raw_claim, dict): + continue + normalized_claim_id = str(raw_claim.get("id") or claim_id) + claim_status = str(raw_claim.get("status") or "unresolved").lower() + support_ids = _id_list( + raw_claim.get("support_evidence_ids") + or raw_claim.get("evidence_support_ids") + or raw_claim.get("evidence_ids") + ) + contradiction_ids = _id_list( + raw_claim.get("contradiction_evidence_ids") + or raw_claim.get("evidence_contra_ids") + or raw_claim.get("contradict_evidence_ids") + ) + confidence = raw_claim.get("confidence") + if confidence is None: + confidence = raw_claim.get("confidence_score") + + claim_summary = { + "id": normalized_claim_id, + "claim": str(raw_claim.get("claim_text") or raw_claim.get("text") or ""), + "status": claim_status, + "confidence": confidence, + "support_evidence_ids": support_ids[:max_evidence_per_item], + "contradiction_evidence_ids": contradiction_ids[:max_evidence_per_item], + } + claim_records[normalized_claim_id] = raw_claim + claim_summaries[normalized_claim_id] = claim_summary + + if contradiction_ids: + contradictions.append( + { + "claim_id": normalized_claim_id, + "support_evidence_ids": support_ids[:max_evidence_per_item], + "contradiction_evidence_ids": contradiction_ids[:max_evidence_per_item], + } + ) + + if claim_status == "supported": + supported.append(claim_summary) + elif claim_status == "contested" or contradiction_ids: + contested.append(claim_summary) + else: + unresolved.append(claim_summary) + + evidence_index: dict[str, dict[str, Any]] = {} + for evidence_id in _collect_evidence_ids(focus_questions, supported, contested, unresolved): + record = evidence.get(evidence_id) + if not isinstance(record, dict): + continue + evidence_index[evidence_id] = { + "evidence_type": 
record.get("evidence_type"), + "provenance_ids": _id_list(record.get("provenance_ids")), + "source_uri": record.get("source_uri"), + "confidence_id": record.get("confidence_id"), + } + + question_ids_by_claim: dict[str, list[str]] = {} + for question in unresolved_questions: + for claim_id in question["claim_ids"]: + question_ids_by_claim.setdefault(claim_id, []).append(question["id"]) + + candidate_actions = _build_candidate_actions( + focus_questions=focus_questions, + unresolved_questions=unresolved_questions, + question_records=question_records, + question_ids_by_claim=question_ids_by_claim, + claim_records=claim_records, + claim_summaries=claim_summaries, + evidence=evidence, + evidence_index=evidence_index, + provenance_nodes=provenance_nodes, + entities=entities, + links=links, + max_evidence_per_item=max_evidence_per_item, + ) + + return { + "reasoning_mode": "question_centric", + "loop": [ + "select_unresolved_question", + "gather_discriminating_evidence", + "update_claim_status_and_confidence", + "record_contradictions", + "synthesize_supported_contested_unresolved", + ], + "focus_question_ids": [item["id"] for item in focus_questions], + "unresolved_questions": focus_questions, + "findings": { + "supported": supported, + "contested": contested, + "unresolved": unresolved, + }, + "contradictions": contradictions, + "evidence_index": evidence_index, + "candidate_actions": candidate_actions, + } + + +def _string_list(value: Any) -> list[str]: + if not isinstance(value, list): + return [] + return [str(item) for item in value] + + +def _json_list(value: Any) -> list[Any]: + if not isinstance(value, list): + return [] + return copy.deepcopy(value) + + +def _json_object(value: Any) -> dict[str, Any]: + if not isinstance(value, dict): + return {} + return copy.deepcopy(value) + + +def _id_list(value: Any) -> list[str]: + if not isinstance(value, list): + return [] + return [str(item) for item in value if item is not None] + + +def 
_question_priority_sort_key(question: dict[str, Any]) -> tuple[int, str]: + priority = str(question.get("priority") or "medium").lower() + question_id = str(question.get("id") or "") + return (_PRIORITY_RANK.get(priority, 9), question_id) + + +def _build_candidate_actions( + *, + focus_questions: list[dict[str, Any]], + unresolved_questions: list[dict[str, Any]], + question_records: dict[str, dict[str, Any]], + question_ids_by_claim: dict[str, list[str]], + claim_records: dict[str, dict[str, Any]], + claim_summaries: dict[str, dict[str, Any]], + evidence: dict[str, Any], + evidence_index: dict[str, dict[str, Any]], + provenance_nodes: dict[str, Any], + entities: dict[str, Any], + links: dict[str, Any], + max_evidence_per_item: int, +) -> list[dict[str, Any]]: + actions: list[dict[str, Any]] = [] + seen_ids: set[str] = set() + + for question in focus_questions: + question_id = question["id"] + linked_claim_ids = [claim_id for claim_id in question["claim_ids"] if claim_id in claim_summaries] + action_type = "verify_claim" if linked_claim_ids else "search" + evidence_ids = _limit_unique_ids( + question["evidence_ids"] + + [ + evidence_id + for claim_id in linked_claim_ids + for evidence_id in _claim_evidence_ids(claim_summaries[claim_id]) + ], + max_evidence_per_item, + ) + claim_statuses = [str(claim_summaries[claim_id]["status"]) for claim_id in linked_claim_ids] + reason_codes = ["question_unresolved"] + if any(status in {"unresolved", "proposed"} for status in claim_statuses): + reason_codes.append("claim_unresolved") + if any(_claim_is_low_confidence(claim_summaries[claim_id]) for claim_id in linked_claim_ids): + reason_codes.append("claim_low_confidence") + action = { + "id": f"ca_q_{question_id}", + "action_type": action_type, + "status": "proposed", + "priority": _normalize_priority(question.get("priority")), + "opened_by_question_id": question_id, + "target_question_ids": [question_id], + "target_claim_ids": linked_claim_ids, + "rationale": { + 
"reason_codes": _dedupe_strings(reason_codes), + "question_status": question.get("status"), + "claim_statuses": sorted(set(claim_statuses)), + "current_evidence_count": len(evidence_ids), + "blocking": True, + }, + "required_inputs": { + "question_ids": [question_id], + "claim_ids": linked_claim_ids, + "evidence_ids": evidence_ids, + "entity_ids": _limit_unique_ids( + _collect_related_entity_ids( + question_records.get(question_id, {}), + *[claim_records.get(claim_id, {}) for claim_id in linked_claim_ids], + ), + max_evidence_per_item, + ), + "external_dependencies": [], + }, + "required_sources": _collect_required_sources( + question_records.get(question_id, {}), + *[claim_records.get(claim_id, {}) for claim_id in linked_claim_ids], + evidence_ids=evidence_ids, + evidence=evidence, + provenance_nodes=provenance_nodes, + ), + "suggested_tools": list(_SUGGESTED_TOOLS[action_type]), + "expected_payoff": _build_expected_payoff(action_type, _normalize_priority(question.get("priority"))), + "evidence_gap_refs": _dedupe_gap_refs( + _build_question_gap_refs(question_id, evidence_ids) + + [ + gap + for claim_id in linked_claim_ids + for gap in _build_claim_gap_refs( + claim_id=claim_id, + opened_by_question_id=question_id, + claim_summary=claim_summaries[claim_id], + ) + ] + ), + "ontology_object_refs": _dedupe_object_refs( + _build_ontology_object_refs( + question_ids=[question_id], + claim_ids=linked_claim_ids, + evidence_ids=evidence_ids, + question_records=question_records, + claim_records=claim_records, + evidence=evidence, + provenance_nodes=provenance_nodes, + entities=entities, + links=links, + ) + ), + } + if action["id"] not in seen_ids: + seen_ids.add(action["id"]) + actions.append(action) + + for claim_id, claim_summary in claim_summaries.items(): + claim_status = str(claim_summary.get("status") or "unresolved").lower() + confidence = _parse_confidence(claim_summary.get("confidence")) + if claim_status in {"retracted", "resolved", "closed"}: + continue + if not 
( + claim_status in {"unresolved", "proposed"} + or confidence is None + or confidence < LOW_CONFIDENCE_THRESHOLD + ): + continue + opened_by_question_id = next(iter(question_ids_by_claim.get(claim_id, [])), None) + question_priority = None + if opened_by_question_id is not None: + question_priority = _question_priority(unresolved_questions, opened_by_question_id) + priority = _merge_priority(_claim_priority(claim_status, confidence), question_priority) + evidence_ids = _claim_evidence_ids(claim_summary) + action = { + "id": f"ca_c_{claim_id}", + "action_type": "verify_claim", + "status": "proposed", + "priority": priority, + "opened_by_question_id": opened_by_question_id, + "target_question_ids": [opened_by_question_id] if opened_by_question_id else [], + "target_claim_ids": [claim_id], + "rationale": { + "reason_codes": _dedupe_strings( + _claim_reason_codes(claim_status, confidence) + + (["question_unresolved"] if opened_by_question_id else []) + ), + "claim_status": claim_status, + "confidence": confidence, + "current_evidence_count": len(evidence_ids), + "blocking": True, + }, + "required_inputs": { + "question_ids": [opened_by_question_id] if opened_by_question_id else [], + "claim_ids": [claim_id], + "evidence_ids": evidence_ids, + "entity_ids": _limit_unique_ids( + _collect_related_entity_ids( + claim_records.get(claim_id, {}), + question_records.get(opened_by_question_id, {}) if opened_by_question_id else {}, + ), + max_evidence_per_item, + ), + "external_dependencies": [], + }, + "required_sources": _collect_required_sources( + claim_records.get(claim_id, {}), + question_records.get(opened_by_question_id, {}) if opened_by_question_id else {}, + evidence_ids=evidence_ids, + evidence=evidence, + provenance_nodes=provenance_nodes, + ), + "suggested_tools": list(_SUGGESTED_TOOLS["verify_claim"]), + "expected_payoff": _build_expected_payoff("verify_claim", priority), + "evidence_gap_refs": _dedupe_gap_refs( + _build_claim_gap_refs( + claim_id=claim_id, + 
opened_by_question_id=opened_by_question_id, + claim_summary=claim_summary, + ) + ), + "ontology_object_refs": _dedupe_object_refs( + _build_ontology_object_refs( + question_ids=[opened_by_question_id] if opened_by_question_id else [], + claim_ids=[claim_id], + evidence_ids=evidence_ids, + question_records=question_records, + claim_records=claim_records, + evidence=evidence, + provenance_nodes=provenance_nodes, + entities=entities, + links=links, + ) + ), + } + if action["id"] not in seen_ids: + seen_ids.add(action["id"]) + actions.append(action) + + actions.sort(key=_candidate_action_sort_key) + return actions[:MAX_CANDIDATE_ACTIONS] + + +def _normalize_priority(priority: Any) -> str: + value = str(priority or "medium").lower() + return value if value in _PRIORITY_RANK else "medium" + + +def _question_priority(questions: list[dict[str, Any]], question_id: str) -> str | None: + for question in questions: + if question.get("id") == question_id: + return _normalize_priority(question.get("priority")) + return None + + +def _merge_priority(*priorities: str | None) -> str: + normalized = [_normalize_priority(priority) for priority in priorities if priority] + if not normalized: + return "medium" + return min(normalized, key=lambda value: (_PRIORITY_RANK.get(value, 9), value)) + + +def _claim_priority(claim_status: str, confidence: float | None) -> str: + if claim_status in {"unresolved", "proposed"}: + return "high" + if confidence is None: + return "high" + if confidence <= VERY_LOW_CONFIDENCE_THRESHOLD: + return "high" + if confidence < LOW_CONFIDENCE_THRESHOLD: + return "medium" + return "low" + + +def _parse_confidence(value: Any) -> float | None: + if value is None or isinstance(value, bool): + return None + if isinstance(value, (int, float)): + parsed = float(value) + elif isinstance(value, str): + try: + parsed = float(value.strip()) + except ValueError: + return None + else: + return None + return max(0.0, min(1.0, parsed)) + + +def 
_claim_evidence_ids(claim_summary: dict[str, Any]) -> list[str]: + return _limit_unique_ids( + _id_list(claim_summary.get("support_evidence_ids")) + + _id_list(claim_summary.get("contradiction_evidence_ids")), + 10_000, + ) + + +def _claim_reason_codes(claim_status: str, confidence: float | None) -> list[str]: + reason_codes: list[str] = [] + if claim_status in {"unresolved", "proposed"}: + reason_codes.append("claim_unresolved") + if confidence is None: + reason_codes.append("claim_missing_confidence") + elif confidence < LOW_CONFIDENCE_THRESHOLD: + reason_codes.append("claim_low_confidence") + return reason_codes + + +def _claim_is_low_confidence(claim_summary: dict[str, Any]) -> bool: + confidence = _parse_confidence(claim_summary.get("confidence")) + return confidence is None or confidence < LOW_CONFIDENCE_THRESHOLD + + +def _limit_unique_ids(values: list[str], max_items: int) -> list[str]: + out: list[str] = [] + seen: set[str] = set() + for value in values: + normalized = str(value) + if not normalized or normalized in seen: + continue + seen.add(normalized) + out.append(normalized) + if len(out) >= max_items: + break + return out + + +def _dedupe_strings(values: list[str]) -> list[str]: + return _limit_unique_ids(values, len(values) or 1) + + +def _build_expected_payoff(action_type: str, priority: str) -> dict[str, float]: + base = { + "critical": 0.90, + "high": 0.75, + "medium": 0.55, + "low": 0.35, + }.get(priority, 0.55) + graph_expansion_value = 0.40 if action_type == "search" else 0.30 + payoff_score = round((0.45 * base) + (0.35 * base) + (0.20 * graph_expansion_value), 4) + return { + "uncertainty_reduction": round(base, 4), + "decision_impact": round(base, 4), + "graph_expansion_value": round(graph_expansion_value, 4), + "payoff_score": payoff_score, + } + + +def _build_question_gap_refs(question_id: str, evidence_ids: list[str]) -> list[dict[str, Any]]: + if evidence_ids: + return [] + return [ + { + "gap_id": 
f"gap:question:{question_id}:missing_evidence", + "kind": "missing_evidence", + "scope": "question", + "question_id": question_id, + "current_evidence_ids": [], + "current_evidence_count": 0, + "required_evidence_count": REQUIRED_EVIDENCE_COUNT, + "blocking": True, + } + ] + + +def _build_claim_gap_refs( + *, + claim_id: str, + opened_by_question_id: str | None, + claim_summary: dict[str, Any], +) -> list[dict[str, Any]]: + support_ids = _id_list(claim_summary.get("support_evidence_ids")) + contradiction_ids = _id_list(claim_summary.get("contradiction_evidence_ids")) + evidence_ids = _limit_unique_ids(support_ids + contradiction_ids, 10_000) + confidence = _parse_confidence(claim_summary.get("confidence")) + claim_status = str(claim_summary.get("status") or "unresolved").lower() + refs: list[dict[str, Any]] = [] + if not evidence_ids: + refs.append( + { + "gap_id": f"gap:claim:{claim_id}:missing_evidence", + "kind": "missing_evidence", + "scope": "claim", + "question_id": opened_by_question_id, + "claim_id": claim_id, + "current_evidence_ids": [], + "current_evidence_count": 0, + "required_evidence_count": REQUIRED_EVIDENCE_COUNT, + "blocking": True, + } + ) + if claim_status in {"unresolved", "contested", "proposed"} and evidence_ids and (not support_ids or not contradiction_ids): + refs.append( + { + "gap_id": f"gap:claim:{claim_id}:missing_counter_evidence", + "kind": "missing_counter_evidence", + "scope": "claim", + "question_id": opened_by_question_id, + "claim_id": claim_id, + "current_evidence_ids": evidence_ids, + "current_evidence_count": len(evidence_ids), + "required_evidence_count": REQUIRED_EVIDENCE_COUNT, + "blocking": True, + } + ) + if confidence is None: + refs.append( + { + "gap_id": f"gap:claim:{claim_id}:missing_confidence", + "kind": "missing_confidence", + "scope": "claim", + "question_id": opened_by_question_id, + "claim_id": claim_id, + "current_evidence_ids": evidence_ids, + "current_evidence_count": len(evidence_ids), + 
"required_evidence_count": REQUIRED_EVIDENCE_COUNT, + "blocking": True, + } + ) + elif confidence < LOW_CONFIDENCE_THRESHOLD: + refs.append( + { + "gap_id": f"gap:claim:{claim_id}:low_confidence", + "kind": "low_confidence", + "scope": "claim", + "question_id": opened_by_question_id, + "claim_id": claim_id, + "current_evidence_ids": evidence_ids, + "current_evidence_count": len(evidence_ids), + "required_evidence_count": REQUIRED_EVIDENCE_COUNT, + "blocking": True, + } + ) + return refs + + +def _dedupe_gap_refs(refs: list[dict[str, Any]]) -> list[dict[str, Any]]: + out: list[dict[str, Any]] = [] + seen: set[str] = set() + for ref in refs: + gap_id = str(ref.get("gap_id") or "") + if not gap_id or gap_id in seen: + continue + seen.add(gap_id) + out.append(ref) + return out + + +def _build_ontology_object_refs( + *, + question_ids: list[str], + claim_ids: list[str], + evidence_ids: list[str], + question_records: dict[str, dict[str, Any]], + claim_records: dict[str, dict[str, Any]], + evidence: dict[str, Any], + provenance_nodes: dict[str, Any], + entities: dict[str, Any], + links: dict[str, Any], +) -> list[dict[str, Any]]: + refs: list[dict[str, Any]] = [] + for question_id in question_ids: + record = question_records.get(question_id, {}) + refs.append( + _object_ref( + object_id=question_id, + object_type="question", + relation="opened_by", + label=str(record.get("question_text") or record.get("question") or question_id), + ) + ) + refs.extend(_entity_and_link_refs(record, entities=entities, links=links)) + for claim_id in claim_ids: + record = claim_records.get(claim_id, {}) + refs.append( + _object_ref( + object_id=claim_id, + object_type="claim", + relation="targets", + label=str(record.get("claim_text") or record.get("text") or claim_id), + ) + ) + refs.extend(_entity_and_link_refs(record, entities=entities, links=links)) + for evidence_id in evidence_ids: + record = evidence.get(evidence_id) + if not isinstance(record, dict): + continue + refs.append( + 
_object_ref( + object_id=evidence_id, + object_type="evidence", + relation="depends_on", + label=str(record.get("source_uri") or record.get("evidence_type") or evidence_id), + ) + ) + refs.extend(_entity_and_link_refs(record, entities=entities, links=links)) + for provenance_id in _id_list(record.get("provenance_ids")): + provenance = provenance_nodes.get(provenance_id) if isinstance(provenance_nodes.get(provenance_id), dict) else {} + refs.append( + _object_ref( + object_id=provenance_id, + object_type="provenance_node", + relation="supported_by", + label=str( + provenance.get("title") + or provenance.get("name") + or provenance.get("source_uri") + or provenance_id + ), + ) + ) + confidence_id = record.get("confidence_id") + if confidence_id is not None: + refs.append( + _object_ref( + object_id=str(confidence_id), + object_type="confidence_profile", + relation="depends_on", + ) + ) + return refs + + +def _entity_and_link_refs( + record: dict[str, Any], + *, + entities: dict[str, Any], + links: dict[str, Any], +) -> list[dict[str, Any]]: + refs: list[dict[str, Any]] = [] + for entity_id in _collect_related_entity_ids(record): + entity = entities.get(entity_id) if isinstance(entities.get(entity_id), dict) else {} + refs.append( + _object_ref( + object_id=entity_id, + object_type="entity", + relation="about", + label=str(entity.get("name") or entity.get("label") or entity_id), + ) + ) + for link_id in _collect_related_link_ids(record): + link = links.get(link_id) if isinstance(links.get(link_id), dict) else {} + refs.append( + _object_ref( + object_id=link_id, + object_type="link", + relation="about", + label=str(link.get("label") or link.get("type") or link_id), + ) + ) + return refs + + +def _object_ref( + *, + object_id: str, + object_type: str, + relation: str, + label: str | None = None, +) -> dict[str, Any]: + ref = { + "object_id": object_id, + "object_type": object_type, + "relation": relation, + } + if label: + ref["label"] = label + return ref + + +def 
_dedupe_object_refs(refs: list[dict[str, Any]]) -> list[dict[str, Any]]: + out: list[dict[str, Any]] = [] + seen: set[tuple[str, str]] = set() + for ref in refs: + object_id = str(ref.get("object_id") or "") + relation = str(ref.get("relation") or "") + if not object_id: + continue + key = (object_id, relation) + if key in seen: + continue + seen.add(key) + out.append(ref) + return out + + +def _collect_related_entity_ids(*records: dict[str, Any]) -> list[str]: + keys = ( + "subject_refs", + "related_entity_ids", + "entity_ids", + "entities", + "about_entity_ids", + "subject_entity_ids", + "object_entity_ids", + "target_entity_ids", + ) + return _collect_nested_ids(keys, *records) + + +def _collect_related_link_ids(*records: dict[str, Any]) -> list[str]: + return _collect_nested_ids(("link_ids", "links"), *records) + + +def _collect_nested_ids(keys: tuple[str, ...], *records: dict[str, Any]) -> list[str]: + values: list[str] = [] + for record in records: + if not isinstance(record, dict): + continue + for key in keys: + raw_value = record.get(key) + if isinstance(raw_value, list): + values.extend(str(item) for item in raw_value if item is not None) + elif raw_value is not None and not isinstance(raw_value, dict): + values.append(str(raw_value)) + return _limit_unique_ids(values, 10_000) + + +def _collect_required_sources( + *records: dict[str, Any], + evidence_ids: list[str], + evidence: dict[str, Any], + provenance_nodes: dict[str, Any], +) -> list[str]: + sources: list[str] = [] + for record in records: + if not isinstance(record, dict): + continue + sources.extend(_extract_source_values(record)) + for provenance_id in _id_list(record.get("provenance_ids")): + provenance = provenance_nodes.get(provenance_id) + if isinstance(provenance, dict): + sources.extend(_extract_source_values(provenance)) + for evidence_id in evidence_ids: + record = evidence.get(evidence_id) + if not isinstance(record, dict): + continue + sources.extend(_extract_source_values(record)) + 
for provenance_id in _id_list(record.get("provenance_ids")): + provenance = provenance_nodes.get(provenance_id) + if isinstance(provenance, dict): + sources.extend(_extract_source_values(provenance)) + return _limit_unique_ids(sources, 32) + + +def _extract_source_values(record: dict[str, Any]) -> list[str]: + values: list[str] = [] + for key in ("source_uri", "canonical_source_uri", "url"): + value = record.get(key) + if value: + values.append(str(value)) + for key in ("source_uris", "required_sources", "sources", "urls"): + value = record.get(key) + if isinstance(value, list): + values.extend(str(item) for item in value if item) + return _limit_unique_ids(values, 32) + + +def _candidate_action_sort_key(action: dict[str, Any]) -> tuple[int, int, str]: + action_id = str(action.get("id") or "") + kind_rank = 0 if action_id.startswith("ca_q_") else 1 + priority = _normalize_priority(action.get("priority")) + return (_PRIORITY_RANK.get(priority, 9), kind_rank, action_id) + + +def _collect_evidence_ids(*collections: list[dict[str, Any]]) -> list[str]: + seen: set[str] = set() + out: list[str] = [] + for collection in collections: + for item in collection: + if not isinstance(item, dict): + continue + for key in ("evidence_ids", "support_evidence_ids", "contradiction_evidence_ids"): + values = item.get(key) + if not isinstance(values, list): + continue + for value in values: + evidence_id = str(value) + if evidence_id in seen: + continue + seen.add(evidence_id) + out.append(evidence_id) + return out + + +def _observations_from_rust_state(state: dict[str, Any]) -> list[str]: + observations = state.get("observations") + if not isinstance(observations, list): + return [] + + out: list[str] = [] + for item in observations: + if not isinstance(item, dict): + continue + content = item.get("content") + if isinstance(content, str): + out.append(content) + return out + + +def _extra_fields_from_legacy_state(state: dict[str, Any]) -> dict[str, Any]: + extras: dict[str, Any] = {} 
+ for key, value in state.items(): + if key not in _LEGACY_KNOWN_KEYS: + extras[key] = copy.deepcopy(value) + return extras + + +def _legacy_observations_from_state(state: dict[str, Any]) -> list[str]: + legacy = state.get("legacy", {}) + if isinstance(legacy, dict): + persisted = legacy.get("external_observations") + if isinstance(persisted, list): + return [str(item) for item in persisted] + + evidence = state.get("evidence", {}) + if isinstance(evidence, dict): + legacy_records: list[tuple[str, str]] = [] + for evidence_id, record in evidence.items(): + if not _is_legacy_evidence(str(evidence_id), record): + continue + content = record.get("content") if isinstance(record, dict) else None + if isinstance(content, str): + legacy_records.append((str(evidence_id), content)) + legacy_records.sort(key=lambda item: item[0]) + return [content for _, content in legacy_records] + + return [] + + +def _legacy_evidence_id(index: int) -> str: + return f"ev_legacy_{index + 1:06d}" + + +def _legacy_source_uri(index: int) -> str: + return f"state.json#external_observations[{index}]" + + +def _is_legacy_evidence(evidence_id: str, record: Any) -> bool: + if not evidence_id.startswith("ev_legacy_") or not isinstance(record, dict): + return False + normalization = record.get("normalization") + return isinstance(normalization, dict) and normalization.get("kind") == "legacy_observation" diff --git a/agent/model.py b/agent/model.py index 30bc3ff7..a029dae1 100644 --- a/agent/model.py +++ b/agent/model.py @@ -15,6 +15,23 @@ class ModelError(RuntimeError): pass +class RateLimitError(ModelError): + def __init__( + self, + message: str, + *, + status_code: int | None = None, + provider_code: str | int | None = None, + body: str = "", + retry_after_sec: float | None = None, + ) -> None: + super().__init__(message) + self.status_code = status_code + self.provider_code = provider_code + self.body = body + self.retry_after_sec = retry_after_sec + + # 
--------------------------------------------------------------------------- # Core data types # --------------------------------------------------------------------------- diff --git a/agent/prompts.py b/agent/prompts.py index 57129d6b..b58ccf6c 100644 --- a/agent/prompts.py +++ b/agent/prompts.py @@ -346,8 +346,10 @@ - {session_dir}/events.jsonl — Trace events log (JSONL). Each record has a timestamp, event type ("objective", "trace", "step", "result"), and payload. Use this for a lightweight overview of objectives and results without full messages. -- {session_dir}/state.json — Persisted external context observations from prior turns. - This is what feeds the external_context_summary in your initial message. +- {session_dir}/investigation_state.json — Canonical typed session state with + structured evidence plus a legacy projection of prior observations. +- {session_dir}/state.json — Legacy compatibility projection of session state. + This still feeds the external_context_summary in your initial message. These files grow throughout the session. If you need to recall prior analysis, check what you did before, or pick up where you left off, read these logs. @@ -364,7 +366,7 @@ - objective: the objective given to that turn - result_preview: first ~200 characters of the turn's result - timestamp: ISO 8601 UTC when the turn ran - - steps_used: how many engine steps were consumed + - steps_used: how many replayed model calls the turn produced, including delegated child conversations - replay_seq_start: starting sequence number in replay.jsonl Use turn history to: @@ -377,6 +379,43 @@ """ +QUESTION_REASONING_SECTION = """ +== QUESTION-CENTRIC REASONING == +Your initial message may contain a "question_reasoning_packet" derived from +{session_dir}/investigation_state.json. Use question-centric reasoning over +document-centric "read more then synthesize" behavior. 
+ +Run this loop until step budget is low or high-priority questions are resolved: +1) Select the next unresolved question from question_reasoning_packet.focus_question_ids + or question_reasoning_packet.unresolved_questions. +2) Gather discriminating evidence targeted at that question. +3) Update related claims in investigation_state.claims with explicit status + (supported / contested / unresolved), confidence, and cited evidence IDs. +4) Record contradictions explicitly, preserving both supporting and contradictory + evidence with provenance IDs instead of collapsing disagreement. +5) Only then synthesize, and repeat for remaining unresolved questions. + +Rules: +- Ground reasoning in typed state references, not raw transcript quotes. Prefer + question IDs, claim IDs, evidence IDs, provenance IDs, and candidate action IDs. +- Treat question_reasoning_packet.candidate_actions as machine-readable, read-only + planner suggestions. Use them to prioritize next steps, but do not assume they + were persisted as canonical tasks/actions. +- Prefer the highest-priority, highest-payoff candidate actions when choosing what + to do next. +- Do not mark a claim supported without support evidence IDs. +- Do not resolve a question without explicit claim/evidence linkage. +- Prefer provenance-backed evidence over uncited notes. + +Final deliverables MUST separate findings into three sections: +- Supported Findings +- Contested Findings +- Unresolved Findings + +Each item should cite the relevant evidence/provenance IDs. +""" + + WIKI_SECTION = """ == DATA SOURCES WIKI == A runtime wiki of data source documentation is available at .openplanter/wiki/. @@ -384,9 +423,14 @@ data sources are documented. Each entry describes access methods, schemas, coverage, and cross-reference potential. 
-When you discover new information about a data source — updated URLs, new fields, -cross-reference joins, data quality issues, or entirely new sources — update the -relevant entry or create a new one using .openplanter/wiki/template.md. +Treat the wiki as a derived knowledge surface, not your primary memory store. +Primary continuity comes from {session_dir}/investigation_state.json and explicit +evidence/provenance IDs. + +When you discover durable, non-duplicative information about a data source — +updated URLs, new fields, cross-reference joins, data quality issues, or +entirely new sources — update the relevant entry or create a new one using +.openplanter/wiki/template.md. Avoid noisy repeat edits that do not add facts. === MANDATORY WIKI INDEXING === For EVERY investigation, you MUST maintain the wiki as a living knowledge map: @@ -415,6 +459,7 @@ def build_system_prompt( prompt = SYSTEM_PROMPT_BASE prompt += SESSION_LOGS_SECTION prompt += TURN_HISTORY_SECTION + prompt += QUESTION_REASONING_SECTION prompt += WIKI_SECTION if recursive: prompt += RECURSIVE_SECTION diff --git a/agent/replay_log.py b/agent/replay_log.py index 96a399a7..849ad280 100644 --- a/agent/replay_log.py +++ b/agent/replay_log.py @@ -2,12 +2,36 @@ from __future__ import annotations +import hashlib import json -import time +import re +import threading from dataclasses import dataclass, field from datetime import datetime, timezone from pathlib import Path -from typing import Any +from typing import Any, ClassVar + +_OWNER_SLUG_MAX_CHARS = 24 + + +def _normalize_owner_slug(owner: str) -> str: + normalized = re.sub(r"[^A-Za-z0-9._-]+", "_", owner.strip()) + normalized = re.sub(r"_+", "_", normalized).strip("._-") + if not normalized: + return "anon" + return normalized[:_OWNER_SLUG_MAX_CHARS] + + +def _owner_hash(owner: str) -> str: + return hashlib.sha1(owner.encode("utf-8")).hexdigest()[:8] + + +@dataclass +class _ReplayFileState: + """Shared sequencing state for a single replay log 
file.""" + + lock: threading.RLock = field(default_factory=threading.RLock) + next_seq: int | None = None @dataclass @@ -23,13 +47,45 @@ class ReplayLogger: path: Path conversation_id: str = "root" + force_snapshot_first_call: bool = False _seq: int = field(default=0, init=False) _last_msg_count: int = field(default=0, init=False) + _has_call: bool = field(default=False, init=False) + _has_header: bool = field(default=False, init=False) + _registry_path: Path = field(init=False, repr=False) + _file_state: _ReplayFileState = field(init=False, repr=False) + + _registry_lock: ClassVar[threading.Lock] = threading.Lock() + _file_states: ClassVar[dict[Path, _ReplayFileState]] = {} + + def __post_init__(self) -> None: + self._registry_path = self.path.resolve() + self._file_state = self._get_file_state(self._registry_path) + self._seq = self.current_seq + self._hydrate_conversation_state() + if self.force_snapshot_first_call: + self._has_call = False + self._last_msg_count = 0 - def child(self, depth: int, step: int) -> "ReplayLogger": + @property + def needs_header(self) -> bool: + return not self._has_header + + @property + def current_seq(self) -> int: + with self._file_state.lock: + return self._ensure_next_seq_locked() + + def child(self, depth: int, step: int, owner: str | None = None) -> "ReplayLogger": """Create a child logger for a subtask conversation.""" child_id = f"{self.conversation_id}/d{depth}s{step}" - return ReplayLogger(path=self.path, conversation_id=child_id) + if owner is not None: + child_id = f"{child_id}/o{_normalize_owner_slug(owner)}_{_owner_hash(owner)}" + return ReplayLogger( + path=self.path, + conversation_id=child_id, + force_snapshot_first_call=self.force_snapshot_first_call, + ) def write_header( self, @@ -55,7 +111,9 @@ def write_header( record["reasoning_effort"] = reasoning_effort if temperature is not None: record["temperature"] = temperature - self._append(record) + with self._file_state.lock: + self._append_locked(record) + 
self._has_header = True def log_call( self, @@ -68,28 +126,97 @@ def log_call( output_tokens: int = 0, elapsed_sec: float = 0.0, ) -> None: - record: dict[str, Any] = { - "type": "call", - "conversation_id": self.conversation_id, - "seq": self._seq, - "depth": depth, - "step": step, - "ts": datetime.now(timezone.utc).isoformat(), - } - if self._seq == 0: - record["messages_snapshot"] = messages - else: - record["messages_delta"] = messages[self._last_msg_count:] - record["response"] = response - record["input_tokens"] = input_tokens - record["output_tokens"] = output_tokens - record["elapsed_sec"] = round(elapsed_sec, 3) - - self._last_msg_count = len(messages) - self._seq += 1 - self._append(record) - - def _append(self, record: dict[str, Any]) -> None: + with self._file_state.lock: + seq = self._ensure_next_seq_locked() + record: dict[str, Any] = { + "type": "call", + "conversation_id": self.conversation_id, + "seq": seq, + "depth": depth, + "step": step, + "ts": datetime.now(timezone.utc).isoformat(), + } + if not self._has_call: + record["messages_snapshot"] = messages + else: + record["messages_delta"] = messages[self._last_msg_count:] + record["response"] = response + record["input_tokens"] = input_tokens + record["output_tokens"] = output_tokens + record["elapsed_sec"] = round(elapsed_sec, 3) + + self._append_locked(record) + self._file_state.next_seq = seq + 1 + self._seq = self._file_state.next_seq + self._last_msg_count = len(messages) + self._has_call = True + + @classmethod + def _get_file_state(cls, path: Path) -> _ReplayFileState: + with cls._registry_lock: + state = cls._file_states.get(path) + if state is None: + state = _ReplayFileState() + cls._file_states[path] = state + return state + + def _ensure_next_seq_locked(self) -> int: + if self._file_state.next_seq is None: + self._file_state.next_seq = self._scan_next_seq() + return self._file_state.next_seq + + def _scan_next_seq(self) -> int: + if not self.path.exists(): + return 0 + next_seq = 0 + 
for raw_line in self.path.read_text(encoding="utf-8").splitlines(): + line = raw_line.strip() + if not line: + continue + try: + record = json.loads(line) + except json.JSONDecodeError: + continue + seq = record.get("seq") + if isinstance(seq, int) and seq >= next_seq: + next_seq = seq + 1 + return next_seq + + def _hydrate_conversation_state(self) -> None: + with self._file_state.lock: + if not self.path.exists(): + return + msg_count = 0 + has_call = False + has_header = False + for raw_line in self.path.read_text(encoding="utf-8").splitlines(): + line = raw_line.strip() + if not line: + continue + try: + record = json.loads(line) + except json.JSONDecodeError: + continue + if record.get("conversation_id") != self.conversation_id: + continue + if record.get("type") == "header": + has_header = True + continue + if record.get("type") != "call": + continue + has_call = True + snapshot = record.get("messages_snapshot") + if isinstance(snapshot, list): + msg_count = len(snapshot) + continue + delta = record.get("messages_delta") + if isinstance(delta, list): + msg_count += len(delta) + self._has_call = has_call + self._has_header = has_header + self._last_msg_count = msg_count + + def _append_locked(self, record: dict[str, Any]) -> None: self.path.parent.mkdir(parents=True, exist_ok=True) with self.path.open("a", encoding="utf-8") as fh: fh.write(json.dumps(record, ensure_ascii=True, default=str) + "\n") diff --git a/agent/runtime.py b/agent/runtime.py index d28b070e..415d0232 100644 --- a/agent/runtime.py +++ b/agent/runtime.py @@ -4,13 +4,23 @@ import re import secrets import shutil -from dataclasses import dataclass +from dataclasses import dataclass, field from datetime import datetime, timezone from pathlib import Path from typing import Any, Callable from .config import AgentConfig from .engine import ContentDeltaCallback, ExternalContext, RLMEngine, StepCallback, TurnSummary +from .investigation_state import ( + build_question_reasoning_packet, + default_state, 
+ load_investigation_state, + migrate_legacy_state, + normalize_legacy_state, + save_investigation_state, + state_to_legacy_projection, + upsert_legacy_observations, +) from .replay_log import ReplayLogger EventCallback = Callable[[str], None] @@ -33,10 +43,24 @@ def _safe_component(text: str) -> str: return re.sub(r"[^A-Za-z0-9._-]+", "-", text).strip("-") or "artifact" +def _has_reasoning_content(packet: dict[str, Any]) -> bool: + findings = packet.get("findings", {}) + if packet.get("focus_question_ids"): + return True + if packet.get("contradictions"): + return True + if packet.get("candidate_actions"): + return True + if not isinstance(findings, dict): + return False + return any(findings.get(key) for key in ("supported", "contested", "unresolved")) + + @dataclass class SessionStore: workspace: Path session_root_dir: str = ".openplanter" + _warnings: list[str] = field(default_factory=list, init=False, repr=False) def __post_init__(self) -> None: self.workspace = self.workspace.expanduser().resolve() @@ -53,6 +77,9 @@ def _metadata_path(self, session_id: str) -> Path: def _state_path(self, session_id: str) -> Path: return self._session_dir(session_id) / "state.json" + def _investigation_state_path(self, session_id: str) -> Path: + return self._session_dir(session_id) / "investigation_state.json" + def _events_path(self, session_id: str) -> Path: return self._session_dir(session_id) / "events.jsonl" @@ -134,21 +161,112 @@ def open_session( state = self.load_state(sid) return sid, state, created_new + def _warn(self, message: str) -> None: + self._warnings.append(message) + + def drain_warnings(self) -> list[str]: + warnings = list(self._warnings) + self._warnings.clear() + return warnings + + def _try_load_investigation_state( + self, + investigation_path: Path, + *, + on_invalid: str, + ) -> dict[str, Any] | None: + try: + return load_investigation_state(investigation_path) + except json.JSONDecodeError: + self._warn( + f"Session investigation state is invalid 
JSON: {investigation_path}; {on_invalid}." + ) + return None + def load_state(self, session_id: str) -> dict[str, Any]: + investigation_path = self._investigation_state_path(session_id) + if investigation_path.exists(): + typed_state = self._try_load_investigation_state( + investigation_path, + on_invalid="falling back to legacy state", + ) + if typed_state is not None: + return state_to_legacy_projection(typed_state, session_id=session_id) + state_path = self._state_path(session_id) if not state_path.exists(): return { "session_id": session_id, + "saved_at": _utc_now(), "external_observations": [], } try: - return json.loads(state_path.read_text(encoding="utf-8")) + raw_state = json.loads(state_path.read_text(encoding="utf-8")) + except json.JSONDecodeError as exc: + raise SessionError(f"Session state is invalid JSON: {state_path}") from exc + if not isinstance(raw_state, dict): + raise SessionError(f"Session state must be a JSON object: {state_path}") + return normalize_legacy_state(session_id, raw_state) + + def load_typed_state(self, session_id: str) -> dict[str, Any]: + investigation_path = self._investigation_state_path(session_id) + if investigation_path.exists(): + typed_state = self._try_load_investigation_state( + investigation_path, + on_invalid="continuing without typed reasoning state", + ) + if typed_state is not None: + return typed_state + + state_path = self._state_path(session_id) + if not state_path.exists(): + return default_state(session_id=session_id) + try: + raw_state = json.loads(state_path.read_text(encoding="utf-8")) except json.JSONDecodeError as exc: raise SessionError(f"Session state is invalid JSON: {state_path}") from exc + if not isinstance(raw_state, dict): + raise SessionError(f"Session state must be a JSON object: {state_path}") + return migrate_legacy_state(session_id=session_id, legacy_state=raw_state) def save_state(self, session_id: str, state: dict[str, Any]) -> None: + normalized_legacy = normalize_legacy_state(session_id, 
state) state_path = self._state_path(session_id) - state_path.write_text(json.dumps(state, indent=2), encoding="utf-8") + state_path.write_text(json.dumps(normalized_legacy, indent=2), encoding="utf-8") + + investigation_path = self._investigation_state_path(session_id) + if investigation_path.exists(): + typed_state = self._try_load_investigation_state( + investigation_path, + on_invalid="preserving the corrupt typed state file and writing legacy state only", + ) + if typed_state is None: + self._touch_metadata(session_id) + return + else: + typed_state = migrate_legacy_state(session_id=session_id, legacy_state=normalized_legacy) + + typed_state = upsert_legacy_observations( + typed_state, + normalized_legacy["external_observations"], + now=normalized_legacy.get("saved_at"), + ) + legacy = typed_state.setdefault("legacy", {}) + if not isinstance(legacy, dict): + legacy = {} + typed_state["legacy"] = legacy + legacy["turn_history"] = normalized_legacy.get("turn_history", []) + legacy["loop_metrics"] = normalized_legacy.get("loop_metrics", {}) + legacy["extra_fields"] = { + key: value + for key, value in normalized_legacy.items() + if key not in {"session_id", "saved_at", "external_observations", "turn_history", "loop_metrics"} + } + + typed_state["session_id"] = session_id + typed_state["updated_at"] = normalized_legacy.get("saved_at", _utc_now()) + typed_state.setdefault("created_at", typed_state["updated_at"]) + save_investigation_state(investigation_path, typed_state) self._touch_metadata(session_id) def append_event(self, session_id: str, event_type: str, payload: dict[str, Any]) -> None: @@ -228,6 +346,21 @@ class SessionRuntime: max_persisted_observations: int = 400 turn_history: list[TurnSummary] | None = None max_turn_summaries: int = 50 + loop_metrics: dict[str, Any] | None = None + + def _flush_store_warnings(self, emit: EventCallback | None = None) -> None: + for message in self.store.drain_warnings(): + if emit is not None: + emit(message) + continue + 
try: + self.store.append_event( + self.session_id, + "trace", + {"message": message}, + ) + except OSError: + pass @classmethod def bootstrap( @@ -265,6 +398,24 @@ def bootstrap( except (KeyError, TypeError): pass max_turns = max(1, config.max_turn_summaries) + raw_loop_metrics = state.get("loop_metrics", {}) + loop_metrics: dict[str, Any] = raw_loop_metrics if isinstance(raw_loop_metrics, dict) else {} + loop_metrics.setdefault("turns", 0) + loop_metrics.setdefault("steps", 0) + loop_metrics.setdefault("model_turns", 0) + loop_metrics.setdefault("tool_calls", 0) + loop_metrics.setdefault("guardrail_warnings", 0) + loop_metrics.setdefault("final_rejections", 0) + loop_metrics.setdefault("extensions_granted", 0) + loop_metrics.setdefault("extension_eligible_checks", 0) + loop_metrics.setdefault("extension_denials_no_progress", 0) + loop_metrics.setdefault("extension_denials_cap", 0) + loop_metrics.setdefault("termination_reason", "") + loop_metrics.setdefault("phase_counts", {}) + if not isinstance(loop_metrics["phase_counts"], dict): + loop_metrics["phase_counts"] = {} + for phase in ("investigate", "build", "iterate", "finalize"): + loop_metrics["phase_counts"].setdefault(phase, 0) runtime = cls( engine=engine, @@ -274,6 +425,7 @@ def bootstrap( max_persisted_observations=max_obs, turn_history=turn_history[-max_turns:], max_turn_summaries=max_turns, + loop_metrics=loop_metrics, ) try: runtime.store.append_event( @@ -283,10 +435,12 @@ def bootstrap( ) except OSError: pass + runtime._flush_store_warnings() try: runtime._persist_state() except OSError: pass + runtime._flush_store_warnings() return runtime def solve( @@ -359,8 +513,14 @@ def _combined_on_step(step_event: dict[str, Any]) -> None: pass replay_path = self.store._session_dir(self.session_id) / "replay.jsonl" - replay_logger = ReplayLogger(path=replay_path) - replay_seq_start = replay_logger._seq + replay_logger = ReplayLogger(path=replay_path, force_snapshot_first_call=True) + replay_seq_start = 
replay_logger.current_seq + + typed_state = self.store.load_typed_state(self.session_id) + self._flush_store_warnings(_on_event) + question_reasoning_packet = build_question_reasoning_packet(typed_state) + if not _has_reasoning_content(question_reasoning_packet): + question_reasoning_packet = None result, updated_context = self.engine.solve_with_context( objective=objective, @@ -370,15 +530,55 @@ def _combined_on_step(step_event: dict[str, Any]) -> None: on_content_delta=on_content_delta, replay_logger=replay_logger, turn_history=self.turn_history, + question_reasoning_packet=question_reasoning_packet, ) self.context = updated_context + latest_loop_metrics = self.engine.last_loop_metrics if isinstance(self.engine.last_loop_metrics, dict) else {} + if self.loop_metrics is None: + self.loop_metrics = { + "turns": 0, + "steps": 0, + "model_turns": 0, + "tool_calls": 0, + "guardrail_warnings": 0, + "final_rejections": 0, + "extensions_granted": 0, + "extension_eligible_checks": 0, + "extension_denials_no_progress": 0, + "extension_denials_cap": 0, + "termination_reason": "", + "phase_counts": {"investigate": 0, "build": 0, "iterate": 0, "finalize": 0}, + } + self.loop_metrics["turns"] = int(self.loop_metrics.get("turns", 0)) + 1 + self.loop_metrics["steps"] = int(self.loop_metrics.get("steps", 0)) + int(latest_loop_metrics.get("steps", 0)) + self.loop_metrics["model_turns"] = int(self.loop_metrics.get("model_turns", 0)) + int(latest_loop_metrics.get("model_turns", 0)) + self.loop_metrics["tool_calls"] = int(self.loop_metrics.get("tool_calls", 0)) + int(latest_loop_metrics.get("tool_calls", 0)) + self.loop_metrics["guardrail_warnings"] = int(self.loop_metrics.get("guardrail_warnings", 0)) + int(latest_loop_metrics.get("guardrail_warnings", 0)) + self.loop_metrics["final_rejections"] = int(self.loop_metrics.get("final_rejections", 0)) + int(latest_loop_metrics.get("final_rejections", 0)) + self.loop_metrics["extensions_granted"] = 
int(self.loop_metrics.get("extensions_granted", 0)) + int(latest_loop_metrics.get("extensions_granted", 0)) + self.loop_metrics["extension_eligible_checks"] = int(self.loop_metrics.get("extension_eligible_checks", 0)) + int(latest_loop_metrics.get("extension_eligible_checks", 0)) + self.loop_metrics["extension_denials_no_progress"] = int(self.loop_metrics.get("extension_denials_no_progress", 0)) + int(latest_loop_metrics.get("extension_denials_no_progress", 0)) + self.loop_metrics["extension_denials_cap"] = int(self.loop_metrics.get("extension_denials_cap", 0)) + int(latest_loop_metrics.get("extension_denials_cap", 0)) + self.loop_metrics["termination_reason"] = str(latest_loop_metrics.get("termination_reason", "")) + phase_counts = self.loop_metrics.setdefault("phase_counts", {}) + latest_phase_counts = latest_loop_metrics.get("phase_counts", {}) + if not isinstance(phase_counts, dict): + phase_counts = {} + self.loop_metrics["phase_counts"] = phase_counts + if not isinstance(latest_phase_counts, dict): + latest_phase_counts = {} + for phase in ("investigate", "build", "iterate", "finalize"): + phase_counts[phase] = int(phase_counts.get(phase, 0)) + int(latest_phase_counts.get(phase, 0)) + self.loop_metrics["last_turn"] = latest_loop_metrics + # Generate turn summary if self.turn_history is None: self.turn_history = [] turn_number = (self.turn_history[-1].turn_number + 1) if self.turn_history else 1 result_preview = result[:200] + "..." 
if len(result) > 200 else result - steps_used = replay_logger._seq - replay_seq_start + replay_seq_end = replay_logger.current_seq + steps_used = max(0, replay_seq_end - replay_seq_start) summary = TurnSummary( turn_number=turn_number, objective=objective, @@ -402,6 +602,7 @@ def _combined_on_step(step_event: dict[str, Any]) -> None: self._persist_state() except OSError: pass + self._flush_store_warnings(_on_event) return result def _persist_state(self) -> None: @@ -414,5 +615,6 @@ def _persist_state(self) -> None: } if self.turn_history: state["turn_history"] = [t.to_dict() for t in self.turn_history] + if self.loop_metrics: + state["loop_metrics"] = self.loop_metrics self.store.save_state(self.session_id, state) - diff --git a/openplanter-desktop/crates/op-core/src/config.rs b/openplanter-desktop/crates/op-core/src/config.rs index f6ff3039..194e9a16 100644 --- a/openplanter-desktop/crates/op-core/src/config.rs +++ b/openplanter-desktop/crates/op-core/src/config.rs @@ -32,6 +32,13 @@ fn env_int(key: &str, default: i64) -> i64 { .unwrap_or(default) } +fn env_float(key: &str, default: f64) -> f64 { + env::var(key) + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(default) +} + fn env_bool(key: &str, default: bool) -> bool { match env::var(key) { Ok(v) => matches!(v.trim().to_lowercase().as_str(), "1" | "true" | "yes"), @@ -70,6 +77,9 @@ pub struct AgentConfig { // Limits pub max_depth: i64, pub max_steps_per_call: i64, + pub budget_extension_enabled: bool, + pub budget_extension_block_steps: i64, + pub budget_extension_max_blocks: i64, pub max_observation_chars: i64, pub command_timeout_sec: i64, pub shell: String, @@ -80,6 +90,10 @@ pub struct AgentConfig { pub session_root_dir: String, pub max_persisted_observations: i64, pub max_solve_seconds: i64, + pub rate_limit_max_retries: i64, + pub rate_limit_backoff_base_sec: f64, + pub rate_limit_backoff_max_sec: f64, + pub rate_limit_retry_after_cap_sec: f64, pub recursive: bool, pub min_subtask_depth: i64, pub 
acceptance_criteria: bool, @@ -111,6 +125,9 @@ impl Default for AgentConfig { voyage_api_key: None, max_depth: 4, max_steps_per_call: 100, + budget_extension_enabled: true, + budget_extension_block_steps: 20, + budget_extension_max_blocks: 2, max_observation_chars: 6000, command_timeout_sec: 45, shell: "/bin/sh".into(), @@ -121,6 +138,10 @@ impl Default for AgentConfig { session_root_dir: ".openplanter".into(), max_persisted_observations: 400, max_solve_seconds: 0, + rate_limit_max_retries: 12, + rate_limit_backoff_base_sec: 1.0, + rate_limit_backoff_max_sec: 60.0, + rate_limit_retry_after_cap_sec: 120.0, recursive: true, min_subtask_depth: 0, acceptance_criteria: true, @@ -209,6 +230,11 @@ impl AgentConfig { voyage_api_key, max_depth: env_int("OPENPLANTER_MAX_DEPTH", 4), max_steps_per_call: env_int("OPENPLANTER_MAX_STEPS", 100), + budget_extension_enabled: env_bool("OPENPLANTER_BUDGET_EXTENSION_ENABLED", true), + budget_extension_block_steps: env_int("OPENPLANTER_BUDGET_EXTENSION_BLOCK_STEPS", 20) + .max(1), + budget_extension_max_blocks: env_int("OPENPLANTER_BUDGET_EXTENSION_MAX_BLOCKS", 2) + .max(0), max_observation_chars: env_int("OPENPLANTER_MAX_OBS_CHARS", 6000), command_timeout_sec: env_int("OPENPLANTER_CMD_TIMEOUT", 45), shell: env_or("OPENPLANTER_SHELL", "/bin/sh"), @@ -219,6 +245,13 @@ impl AgentConfig { session_root_dir: env_or("OPENPLANTER_SESSION_DIR", ".openplanter"), max_persisted_observations: env_int("OPENPLANTER_MAX_PERSISTED_OBS", 400), max_solve_seconds: env_int("OPENPLANTER_MAX_SOLVE_SECONDS", 0), + rate_limit_max_retries: env_int("OPENPLANTER_RATE_LIMIT_MAX_RETRIES", 12), + rate_limit_backoff_base_sec: env_float("OPENPLANTER_RATE_LIMIT_BACKOFF_BASE_SEC", 1.0), + rate_limit_backoff_max_sec: env_float("OPENPLANTER_RATE_LIMIT_BACKOFF_MAX_SEC", 60.0), + rate_limit_retry_after_cap_sec: env_float( + "OPENPLANTER_RATE_LIMIT_RETRY_AFTER_CAP_SEC", + 120.0, + ), recursive: env_bool("OPENPLANTER_RECURSIVE", true), min_subtask_depth: 
env_int("OPENPLANTER_MIN_SUBTASK_DEPTH", 0), acceptance_criteria: env_bool("OPENPLANTER_ACCEPTANCE_CRITERIA", true), @@ -263,6 +296,13 @@ mod tests { assert_eq!(cfg.reasoning_effort, Some("high".into())); assert_eq!(cfg.max_depth, 4); assert_eq!(cfg.max_steps_per_call, 100); + assert!(cfg.budget_extension_enabled); + assert_eq!(cfg.budget_extension_block_steps, 20); + assert_eq!(cfg.budget_extension_max_blocks, 2); + assert_eq!(cfg.rate_limit_max_retries, 12); + assert_eq!(cfg.rate_limit_backoff_base_sec, 1.0); + assert_eq!(cfg.rate_limit_backoff_max_sec, 60.0); + assert_eq!(cfg.rate_limit_retry_after_cap_sec, 120.0); assert!(cfg.recursive); assert!(cfg.acceptance_criteria); assert!(!cfg.demo); @@ -299,8 +339,15 @@ mod tests { "OPENPLANTER_ANTHROPIC_API_KEY", "ANTHROPIC_API_KEY", "OPENPLANTER_MAX_DEPTH", + "OPENPLANTER_BUDGET_EXTENSION_ENABLED", + "OPENPLANTER_BUDGET_EXTENSION_BLOCK_STEPS", + "OPENPLANTER_BUDGET_EXTENSION_MAX_BLOCKS", "OPENPLANTER_RECURSIVE", "OPENPLANTER_DEMO", + "OPENPLANTER_RATE_LIMIT_MAX_RETRIES", + "OPENPLANTER_RATE_LIMIT_BACKOFF_BASE_SEC", + "OPENPLANTER_RATE_LIMIT_BACKOFF_MAX_SEC", + "OPENPLANTER_RATE_LIMIT_RETRY_AFTER_CAP_SEC", ]; // Save original values let saved: Vec<_> = keys @@ -321,6 +368,13 @@ mod tests { assert_eq!(cfg.model, "claude-opus-4-6"); assert_eq!(cfg.reasoning_effort, Some("high".into())); assert_eq!(cfg.max_depth, 4); + assert!(cfg.budget_extension_enabled); + assert_eq!(cfg.budget_extension_block_steps, 20); + assert_eq!(cfg.budget_extension_max_blocks, 2); + assert_eq!(cfg.rate_limit_max_retries, 12); + assert_eq!(cfg.rate_limit_backoff_base_sec, 1.0); + assert_eq!(cfg.rate_limit_backoff_max_sec, 60.0); + assert_eq!(cfg.rate_limit_retry_after_cap_sec, 120.0); assert!(cfg.recursive); assert!(!cfg.demo); assert!(cfg.openai_api_key.is_none()); @@ -332,9 +386,16 @@ mod tests { env::set_var("OPENPLANTER_MODEL", "gpt-5.2"); env::set_var("OPENPLANTER_REASONING_EFFORT", "low"); env::set_var("OPENPLANTER_MAX_DEPTH", "8"); + 
env::set_var("OPENPLANTER_BUDGET_EXTENSION_ENABLED", "false"); + env::set_var("OPENPLANTER_BUDGET_EXTENSION_BLOCK_STEPS", "9"); + env::set_var("OPENPLANTER_BUDGET_EXTENSION_MAX_BLOCKS", "1"); env::set_var("OPENPLANTER_RECURSIVE", "false"); env::set_var("OPENPLANTER_DEMO", "true"); env::set_var("OPENAI_API_KEY", "sk-test123"); + env::set_var("OPENPLANTER_RATE_LIMIT_MAX_RETRIES", "5"); + env::set_var("OPENPLANTER_RATE_LIMIT_BACKOFF_BASE_SEC", "2.5"); + env::set_var("OPENPLANTER_RATE_LIMIT_BACKOFF_MAX_SEC", "30.0"); + env::set_var("OPENPLANTER_RATE_LIMIT_RETRY_AFTER_CAP_SEC", "90.0"); } let cfg = AgentConfig::from_env("/tmp"); @@ -342,6 +403,13 @@ assert_eq!(cfg.model, "gpt-5.2"); assert_eq!(cfg.reasoning_effort, Some("low".into())); assert_eq!(cfg.max_depth, 8); + assert!(!cfg.budget_extension_enabled); + assert_eq!(cfg.budget_extension_block_steps, 9); + assert_eq!(cfg.budget_extension_max_blocks, 1); + assert_eq!(cfg.rate_limit_max_retries, 5); + assert_eq!(cfg.rate_limit_backoff_base_sec, 2.5); + assert_eq!(cfg.rate_limit_backoff_max_sec, 30.0); + assert_eq!(cfg.rate_limit_retry_after_cap_sec, 90.0); assert!(!cfg.recursive); assert!(cfg.demo); assert_eq!(cfg.openai_api_key, Some("sk-test123".into())); diff --git a/openplanter-desktop/crates/op-core/src/engine/context.rs b/openplanter-desktop/crates/op-core/src/engine/context.rs index 0b522c2f..85fa1fb2 100644 --- a/openplanter-desktop/crates/op-core/src/engine/context.rs +++ b/openplanter-desktop/crates/op-core/src/engine/context.rs @@ -1,9 +1,17 @@ // External context and turn summary types for multi-turn sessions. use serde::{Deserialize, Serialize}; +use serde_json::Value; use std::path::Path; use tokio::fs; +use super::investigation_state::InvestigationState; + +struct ResolvedInvestigationState { + state: InvestigationState, + legacy_rust_observations: Option<Vec<Observation>>, +} + /// Summary of a completed turn for inclusion in subsequent prompts. 
#[derive(Debug, Clone, Serialize, Deserialize)] pub struct TurnSummary { @@ -45,23 +53,57 @@ impl ExternalContext { }); } - /// Load external context from state.json in the session directory. + /// Load external context from canonical investigation_state.json or legacy state.json. pub async fn load(session_dir: &Path) -> std::io::Result<Self> { - let path = session_dir.join("state.json"); - if !path.exists() { - return Ok(Self::new()); + let session_id = session_dir + .file_name() + .and_then(|value| value.to_str()) + .unwrap_or_default(); + let resolved = resolve_investigation_state(session_dir, session_id).await?; + if let Some(observations) = resolved.legacy_rust_observations { + return Ok(Self { observations }); } - let content = fs::read_to_string(&path).await?; - serde_json::from_str(&content) - .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e)) + Ok(Self { + observations: resolved + .state + .legacy_observations() + .into_iter() + .map(|content| Observation { + source: "legacy".to_string(), + timestamp: String::new(), + content, + }) + .collect(), + }) } - /// Save external context to state.json in the session directory. + /// Save external context to additive investigation_state.json and legacy state.json. 
pub async fn save(&self, session_dir: &Path) -> std::io::Result<()> { - let path = session_dir.join("state.json"); - let json = serde_json::to_string_pretty(self) - .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; - fs::write(&path, json).await + let session_id = session_dir + .file_name() + .and_then(|value| value.to_str()) + .unwrap_or_default(); + let typed_path = session_dir.join("investigation_state.json"); + let legacy_path = session_dir.join("state.json"); + + let mut typed_state = load_or_migrate_investigation_state(session_dir).await?; + if typed_state.session_id.is_empty() { + typed_state.session_id = session_id.to_string(); + } + let observations: Vec<String> = self + .observations + .iter() + .map(|observation| observation.content.clone()) + .collect(); + typed_state.merge_legacy_updates(&observations, None, None, None); + + let typed_json = serde_json::to_string_pretty(&typed_state) + .map_err(|e| std::io::Error::other(e.to_string()))?; + fs::write(&typed_path, typed_json).await?; + + let legacy_json = serde_json::to_string_pretty(&typed_state.to_legacy_python_projection()) + .map_err(|e| std::io::Error::other(e.to_string()))?; + fs::write(&legacy_path, legacy_json).await } } @@ -71,6 +113,126 @@ impl Default for ExternalContext { } } +pub async fn load_or_migrate_investigation_state( + session_dir: &Path, +) -> std::io::Result<InvestigationState> { + let session_id = session_dir + .file_name() + .and_then(|value| value.to_str()) + .unwrap_or_default(); + load_existing_investigation_state(session_dir, session_id).await +} + +async fn load_existing_investigation_state( + session_dir: &Path, + session_id: &str, +) -> std::io::Result<InvestigationState> { + Ok(resolve_investigation_state(session_dir, session_id) + .await? + .state) +} + +async fn resolve_investigation_state( + session_dir: &Path, + session_id: &str, +) -> std::io::Result<ResolvedInvestigationState> { + let typed_path = session_dir.join("investigation_state.json"); + if let Some(state) = try_load_typed_state(&typed_path).await? 
{
+        return Ok(ResolvedInvestigationState {
+            state,
+            legacy_rust_observations: None,
+        });
+    }
+
+    let legacy_path = session_dir.join("state.json");
+    if !legacy_path.exists() {
+        return Ok(ResolvedInvestigationState {
+            state: InvestigationState::new(session_id),
+            legacy_rust_observations: None,
+        });
+    }
+
+    let content = fs::read_to_string(&legacy_path).await?;
+    let value: Value = serde_json::from_str(&content)
+        .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
+
+    if legacy_python_observations(&value).is_some() {
+        return Ok(ResolvedInvestigationState {
+            state: InvestigationState::from_legacy_python_state(session_id, &value),
+            legacy_rust_observations: None,
+        });
+    }
+    if let Some(observations) = legacy_rust_observations(&value) {
+        return Ok(ResolvedInvestigationState {
+            state: InvestigationState::from_legacy_rust_state(session_id, &value),
+            legacy_rust_observations: Some(observations),
+        });
+    }
+
+    Err(std::io::Error::new(
+        std::io::ErrorKind::InvalidData,
+        "state.json format not recognized",
+    ))
+}
+
+async fn try_load_typed_state(path: &Path) -> std::io::Result<Option<InvestigationState>> {
+    if !path.exists() {
+        return Ok(None);
+    }
+
+    let content = match fs::read_to_string(path).await {
+        Ok(content) => content,
+        Err(err) if err.kind() == std::io::ErrorKind::InvalidData => return Ok(None),
+        Err(err) => return Err(err),
+    };
+
+    match serde_json::from_str(&content) {
+        Ok(state) => Ok(Some(state)),
+        Err(_) => Ok(None),
+    }
+}
+
+fn legacy_python_observations(value: &Value) -> Option<Vec<String>> {
+    value
+        .as_object()?
+        .get("external_observations")?
+ .as_array() + .map(|items| { + items + .iter() + .filter_map(Value::as_str) + .map(ToString::to_string) + .collect() + }) +} + +fn legacy_rust_observations(value: &Value) -> Option> { + let observations = value.as_object()?.get("observations")?.as_array()?; + Some( + observations + .iter() + .filter_map(|item| item.as_object()) + .map(|item| Observation { + source: item + .get("source") + .and_then(Value::as_str) + .unwrap_or("legacy") + .to_string(), + timestamp: item + .get("timestamp") + .and_then(Value::as_str) + .unwrap_or_default() + .to_string(), + content: item + .get("content") + .and_then(Value::as_str) + .unwrap_or_default() + .to_string(), + }) + .collect(), + ) +} + #[cfg(test)] mod tests { use super::*; @@ -102,6 +264,8 @@ mod tests { let loaded = ExternalContext::load(tmp.path()).await.unwrap(); assert_eq!(loaded.observations.len(), 1); assert_eq!(loaded.observations[0].content, "test observation"); + assert!(tmp.path().join("investigation_state.json").exists()); + assert!(tmp.path().join("state.json").exists()); } #[tokio::test] @@ -111,6 +275,277 @@ mod tests { assert!(ctx.observations.is_empty()); } + #[tokio::test] + async fn test_load_legacy_python_state_shape() { + let tmp = tempdir().unwrap(); + fs::write( + tmp.path().join("state.json"), + r#"{"session_id":"sid","external_observations":["one","two"]}"#, + ) + .await + .unwrap(); + + let ctx = ExternalContext::load(tmp.path()).await.unwrap(); + assert_eq!(ctx.observations.len(), 2); + assert_eq!(ctx.observations[0].content, "one"); + assert_eq!(ctx.observations[1].content, "two"); + } + + #[tokio::test] + async fn test_load_or_migrate_investigation_state_prefers_typed_state() { + let tmp = tempdir().unwrap(); + fs::write( + tmp.path().join("investigation_state.json"), + r#"{"schema_version":"1.0.0","session_id":"sid","questions":{"q_1":{"id":"q_1","question_text":"keep me"}}}"#, + ) + .await + .unwrap(); + fs::write( + tmp.path().join("state.json"), + 
r#"{"session_id":"sid","external_observations":["legacy"]}"#, + ) + .await + .unwrap(); + + let state = load_or_migrate_investigation_state(tmp.path()) + .await + .unwrap(); + assert!(state.questions.contains_key("q_1")); + assert!(state.legacy.external_observations.is_empty()); + } + + #[tokio::test] + async fn test_load_or_migrate_investigation_state_migrates_legacy_state() { + let tmp = tempdir().unwrap(); + fs::write( + tmp.path().join("state.json"), + r#"{"session_id":"sid","external_observations":["legacy one"]}"#, + ) + .await + .unwrap(); + + let state = load_or_migrate_investigation_state(tmp.path()) + .await + .unwrap(); + assert_eq!(state.legacy.external_observations, vec!["legacy one"]); + assert_eq!( + state.evidence["ev_legacy_000001"]["content"], + Value::String("legacy one".to_string()) + ); + } + + #[tokio::test] + async fn test_load_legacy_rust_state_shape() { + let tmp = tempdir().unwrap(); + fs::write( + tmp.path().join("state.json"), + r#"{"observations":[{"source":"wiki","timestamp":"2026-03-13T00:00:00Z","content":"one"},{"source":"tool","timestamp":"2026-03-13T00:00:01Z","content":"two"}]}"#, + ) + .await + .unwrap(); + + let ctx = ExternalContext::load(tmp.path()).await.unwrap(); + assert_eq!(ctx.observations.len(), 2); + assert_eq!(ctx.observations[0].source, "wiki"); + assert_eq!(ctx.observations[1].content, "two"); + } + + #[tokio::test] + async fn test_load_typed_state_falls_back_to_evidence() { + let tmp = tempdir().unwrap(); + fs::write( + tmp.path().join("investigation_state.json"), + r#"{"schema_version":"1.0.0","session_id":"sid","evidence":{"ev_legacy_000002":{"content":"two","normalization":{"kind":"legacy_observation"}},"ev_legacy_000001":{"content":"one","normalization":{"kind":"legacy_observation"}}}}"#, + ) + .await + .unwrap(); + + let ctx = ExternalContext::load(tmp.path()).await.unwrap(); + assert_eq!(ctx.observations.len(), 2); + assert_eq!(ctx.observations[0].content, "one"); + assert_eq!(ctx.observations[1].content, 
"two"); + } + + #[tokio::test] + async fn test_invalid_typed_state_falls_back_to_legacy_python_state() { + let tmp = tempdir().unwrap(); + let typed_path = tmp.path().join("investigation_state.json"); + let corrupt_typed = "{not-json"; + fs::write(&typed_path, corrupt_typed).await.unwrap(); + fs::write( + tmp.path().join("state.json"), + r#"{"session_id":"sid","external_observations":["legacy fallback"]}"#, + ) + .await + .unwrap(); + + let ctx = ExternalContext::load(tmp.path()).await.unwrap(); + assert_eq!(ctx.observations.len(), 1); + assert_eq!(ctx.observations[0].content, "legacy fallback"); + + let state = load_or_migrate_investigation_state(tmp.path()) + .await + .unwrap(); + assert_eq!(state.legacy.external_observations, vec!["legacy fallback"]); + assert_eq!( + state.evidence["ev_legacy_000001"]["content"], + Value::String("legacy fallback".to_string()) + ); + assert_eq!( + fs::read_to_string(&typed_path).await.unwrap(), + corrupt_typed + ); + } + + #[tokio::test] + async fn test_invalid_typed_state_falls_back_to_legacy_rust_observations() { + let tmp = tempdir().unwrap(); + let typed_path = tmp.path().join("investigation_state.json"); + fs::write(&typed_path, "{not-json").await.unwrap(); + fs::write( + tmp.path().join("state.json"), + r#"{"observations":[{"source":"wiki","timestamp":"2026-03-13T00:00:00Z","content":"one"},{"source":"tool","timestamp":"2026-03-13T00:00:01Z","content":"two"}]}"#, + ) + .await + .unwrap(); + + let ctx = ExternalContext::load(tmp.path()).await.unwrap(); + assert_eq!(ctx.observations.len(), 2); + assert_eq!(ctx.observations[0].source, "wiki"); + assert_eq!(ctx.observations[0].timestamp, "2026-03-13T00:00:00Z"); + assert_eq!(ctx.observations[1].content, "two"); + + let state = load_or_migrate_investigation_state(tmp.path()) + .await + .unwrap(); + assert_eq!(state.legacy.external_observations, vec!["one", "two"]); + assert_eq!(fs::read_to_string(&typed_path).await.unwrap(), "{not-json"); + } + + #[tokio::test] + async fn 
test_invalid_typed_state_without_legacy_returns_empty_state() { + let tmp = tempdir().unwrap(); + let typed_path = tmp.path().join("investigation_state.json"); + fs::write(&typed_path, "{not-json").await.unwrap(); + + let ctx = ExternalContext::load(tmp.path()).await.unwrap(); + assert!(ctx.observations.is_empty()); + + let state = load_or_migrate_investigation_state(tmp.path()) + .await + .unwrap(); + assert_eq!( + state.session_id, + tmp.path() + .file_name() + .and_then(|value| value.to_str()) + .unwrap_or_default() + ); + assert!(state.legacy.external_observations.is_empty()); + assert!(state.evidence.is_empty()); + assert_eq!(fs::read_to_string(&typed_path).await.unwrap(), "{not-json"); + } + + #[tokio::test] + async fn test_invalid_typed_state_with_malformed_legacy_remains_error() { + let tmp = tempdir().unwrap(); + fs::write(tmp.path().join("investigation_state.json"), "{not-json") + .await + .unwrap(); + fs::write(tmp.path().join("state.json"), "{still-not-json") + .await + .unwrap(); + + let ctx_err = ExternalContext::load(tmp.path()).await.unwrap_err(); + assert_eq!(ctx_err.kind(), std::io::ErrorKind::InvalidData); + + let state_err = load_or_migrate_investigation_state(tmp.path()) + .await + .unwrap_err(); + assert_eq!(state_err.kind(), std::io::ErrorKind::InvalidData); + } + + #[tokio::test] + async fn test_save_preserves_existing_typed_fields_and_extra_fields() { + let tmp = tempdir().unwrap(); + fs::write( + tmp.path().join("investigation_state.json"), + r#"{ + "schema_version": "1.0.0", + "session_id": "", + "created_at": "2026-03-13T00:00:00Z", + "updated_at": "2026-03-13T00:00:00Z", + "objective": "", + "ontology": {"namespace": "openplanter.core", "version": "2026-03"}, + "entities": {}, + "links": {}, + "claims": {}, + "evidence": { + "ev_legacy_000002": { + "id": "ev_legacy_000002", + "content": "stale", + "normalization": {"kind": "legacy_observation"} + }, + "ev_other": { + "id": "ev_other", + "content": "keep me", + "normalization": {"kind": 
"web_fetch"} + } + }, + "hypotheses": {}, + "questions": {"q_1": {"id": "q_1", "question_text": "keep me"}}, + "tasks": {}, + "actions": {}, + "provenance_nodes": {}, + "confidence_profiles": {}, + "timeline": [], + "indexes": {"by_external_ref": {}, "by_tag": {}}, + "legacy": { + "external_observations": ["stale"], + "turn_history": [{"turn_number": 2}], + "loop_metrics": {"turns": 2}, + "extra_fields": {"custom_field": "persist"} + } +}"#, + ) + .await + .unwrap(); + + let mut ctx = ExternalContext::new(); + ctx.add_observation("wiki", "fresh"); + ctx.save(tmp.path()).await.unwrap(); + + let typed: Value = serde_json::from_str( + &fs::read_to_string(tmp.path().join("investigation_state.json")) + .await + .unwrap(), + ) + .unwrap(); + assert_eq!( + typed["questions"]["q_1"]["question_text"], + Value::String("keep me".to_string()) + ); + assert!(typed["evidence"].get("ev_other").is_some()); + assert!(typed["evidence"].get("ev_legacy_000002").is_none()); + assert_eq!( + typed["evidence"]["ev_legacy_000001"]["content"], + Value::String("fresh".to_string()) + ); + + let legacy: Value = serde_json::from_str( + &fs::read_to_string(tmp.path().join("state.json")) + .await + .unwrap(), + ) + .unwrap(); + assert_eq!( + legacy["external_observations"], + serde_json::json!(["fresh"]) + ); + assert_eq!(legacy["custom_field"], Value::String("persist".to_string())); + assert_eq!(legacy["loop_metrics"]["turns"], Value::from(2)); + } + #[test] fn test_turn_summary_serialization() { let ts = TurnSummary { diff --git a/openplanter-desktop/crates/op-core/src/engine/curator.rs b/openplanter-desktop/crates/op-core/src/engine/curator.rs index e0015567..610fa8c0 100644 --- a/openplanter-desktop/crates/op-core/src/engine/curator.rs +++ b/openplanter-desktop/crates/op-core/src/engine/curator.rs @@ -1,16 +1,18 @@ -/// Background wiki curator agent. +/// Checkpointed wiki curator synthesizer. /// -/// Runs as a non-blocking background task after each main agent step. 
-/// Reads the latest step context, decides if wiki updates are needed, -/// and writes to `.openplanter/wiki/` using a restricted tool set. +/// Runs at explicit solve-loop phase boundaries and updates wiki files from +/// typed state deltas rather than raw transcript slices. +use std::collections::BTreeSet; +use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; use crate::builder::build_model; use crate::config::AgentConfig; +use crate::events::LoopPhase; use crate::model::Message; -use crate::tools::defs::build_curator_tool_defs; use crate::tools::WorkspaceTools; +use crate::tools::defs::build_curator_tool_defs; /// Result of a curator run. #[derive(Debug, Clone)] @@ -19,21 +21,49 @@ pub struct CuratorResult { pub files_changed: u32, } -const CURATOR_SYSTEM_PROMPT: &str = r#"You are the Wiki Curator, a background agent that maintains the investigation wiki. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CuratorToolObservation { + pub tool_call_id: String, + pub tool_name: String, + pub arguments_json: String, + pub output_excerpt: String, + pub is_error: bool, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CuratorStateDelta { + pub step: u32, + pub phase: LoopPhase, + pub objective: String, + pub observations: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CuratorCheckpoint { + pub boundary: String, + pub deltas: Vec, +} + +const CURATOR_SYSTEM_PROMPT: &str = r#"You are the Wiki Curator Synthesizer. + +You run ONLY at explicit solve-loop phase boundaries and receive typed checkpoint +deltas rather than raw transcript slices. -Your ONLY job is to update the wiki at .openplanter/wiki/ based on the main agent's latest step. +The wiki at `.openplanter/wiki/` is a DERIVED knowledge surface. It is not the +agent's primary memory store. == RULES == -1. You may ONLY modify files under .openplanter/wiki/ -2. Read .openplanter/wiki/index.md first to understand existing entries -3. 
If the main agent discovered a new data source, create a wiki entry using the template format -4. If the main agent found new information about an existing source, update the relevant entry -5. Update .openplanter/wiki/index.md to link any new entries in the correct category table -6. Use EXACT source names in Cross-Reference sections to power the knowledge graph -7. If nothing in the step context is wiki-relevant, respond with ONLY: "No wiki updates needed" -8. Keep entries factual and concise — document what was found, not speculation -9. Never modify files outside .openplanter/wiki/ -10. Maximum 8 tool calls — be efficient +1. You may ONLY modify files under `.openplanter/wiki/`. +2. Read `.openplanter/wiki/index.md` before writing so links and exact source names stay consistent. +3. Use ONLY tool-grounded facts from the checkpoint payload. Do not invent or infer unsupported details. +4. Preserve provenance. When adding facts, keep concise evidence anchors using the originating step, tool name, and tool call ID. +5. Eliminate duplicate and noisy updates. Prefer a no-op over restating facts already captured in the wiki. +6. Ignore low-information operational traces unless they reveal durable source facts worth documenting. +7. If the checkpoint contains no wiki-relevant net-new knowledge, respond with EXACTLY: "No wiki updates needed". +8. Keep entries factual and concise. Document what was learned, not speculation. +9. Prefer `edit_file` over whole-file rewrites when possible. +10. Only use `write_file` or `edit_file` for mutations. +11. Maximum 8 tool calls. == WIKI ENTRY TEMPLATE == When creating a new entry, use this format: @@ -61,58 +91,67 @@ Brief description of what this data source provides. - [Other Source Name]: how they connect - [Another Source]: join key or relationship -== STEP CONTEXT == -Below is the main agent's latest step. 
Analyze it for wiki-relevant discoveries."#; +== CHECKPOINT PAYLOAD == +Below is a typed checkpoint payload with per-step tool observations. Analyze it +for durable wiki-relevant discoveries."#; /// Maximum number of tool-call steps for the curator. const MAX_CURATOR_STEPS: usize = 8; +const MAX_TOOL_OUTPUT_EXCERPT: usize = 1_200; -/// Maximum chars of context to extract from the main agent's messages. -const MAX_CONTEXT_CHARS: usize = 8_000; +fn trim_excerpt(raw: &str) -> String { + if raw.len() <= MAX_TOOL_OUTPUT_EXCERPT { + return raw.to_string(); + } -/// Extract the latest step context from the main conversation. -/// -/// Walks backwards from the end to find the last Assistant message, -/// then collects it plus any subsequent Tool messages. -pub fn extract_step_context(messages: &[Message]) -> String { - let mut context = String::new(); - - // Find last Assistant message index - let assistant_idx = messages.iter().rposition(|m| matches!(m, Message::Assistant { .. })); - let start = match assistant_idx { - Some(idx) => idx, - None => return context, + let end = if raw.is_char_boundary(MAX_TOOL_OUTPUT_EXCERPT) { + MAX_TOOL_OUTPUT_EXCERPT + } else { + raw.char_indices() + .map(|(idx, _)| idx) + .take_while(|idx| *idx < MAX_TOOL_OUTPUT_EXCERPT) + .last() + .unwrap_or(0) }; - for msg in &messages[start..] { - match msg { - Message::Assistant { content, tool_calls } => { - context.push_str("=== Assistant ===\n"); - context.push_str(content); - context.push('\n'); - if let Some(tcs) = tool_calls { - for tc in tcs { - context.push_str(&format!("[Tool call: {}]\n", tc.name)); - } - } - } - Message::Tool { content, .. 
} => { - context.push_str("=== Tool Result ===\n"); - context.push_str(content); - context.push('\n'); + let mut trimmed = raw[..end].to_string(); + trimmed.push_str("\n...[truncated]"); + trimmed +} + +pub fn build_state_delta( + step: u32, + phase: LoopPhase, + objective: &str, + tools: &[(String, String, String, String, bool)], +) -> Option { + let observations = tools + .iter() + .filter_map(|(id, name, args, content, is_error)| { + if content.trim().is_empty() && !*is_error { + return None; } - _ => {} - } - } - // Truncate to budget - if context.len() > MAX_CONTEXT_CHARS { - let end = context.floor_char_boundary(MAX_CONTEXT_CHARS); - context.truncate(end); - context.push_str("\n...[truncated]"); + Some(CuratorToolObservation { + tool_call_id: id.clone(), + tool_name: name.clone(), + arguments_json: args.clone(), + output_excerpt: trim_excerpt(content), + is_error: *is_error, + }) + }) + .collect::>(); + + if observations.is_empty() { + return None; } - context + Some(CuratorStateDelta { + step, + phase, + objective: objective.to_string(), + observations, + }) } /// Curator tool names — the subset of tools the curator is allowed to use. @@ -122,62 +161,85 @@ pub const CURATOR_TOOL_NAMES: &[&str] = &[ "read_file", "write_file", "edit_file", - "apply_patch", - "hashline_edit", "think", ]; -/// Run the curator agent with the given step context. -/// -/// Creates its own model instance and tool set, runs a mini agentic loop -/// with restricted tools, and returns a summary of changes made. +/// Legacy context entry point retained for migration and initialization flows. 
pub async fn run_curator( context: &str, config: &AgentConfig, cancel: CancellationToken, ) -> Result { - if context.is_empty() { + if context.trim().is_empty() { return Ok(CuratorResult { - summary: "No context to curate".into(), + summary: "No checkpoint deltas to curate".into(), files_changed: 0, }); } - // Build model - let model = build_model(config).map_err(|e| e.to_string())?; + let checkpoint = CuratorCheckpoint { + boundary: "migration_context".to_string(), + deltas: vec![CuratorStateDelta { + step: 0, + phase: LoopPhase::Iterate, + objective: "workspace initialization wiki rewrite".to_string(), + observations: vec![CuratorToolObservation { + tool_call_id: "migration_context".to_string(), + tool_name: "workspace_init".to_string(), + arguments_json: "{}".to_string(), + output_excerpt: trim_excerpt(context), + is_error: false, + }], + }], + }; + run_curator_checkpoint(&checkpoint, config, cancel).await +} +/// Run the curator agent with an explicit checkpoint payload. +pub async fn run_curator_checkpoint( + checkpoint: &CuratorCheckpoint, + config: &AgentConfig, + cancel: CancellationToken, +) -> Result { + if checkpoint.deltas.is_empty() { + return Ok(CuratorResult { + summary: "No checkpoint deltas to curate".into(), + files_changed: 0, + }); + } + + let model = build_model(config).map_err(|e| e.to_string())?; let provider = model.provider_name().to_string(); let tool_defs = build_curator_tool_defs(&provider); - let mut tools = WorkspaceTools::new(config); + let mut tools = WorkspaceTools::new_curator(config); let mut messages = vec![ Message::System { content: CURATOR_SYSTEM_PROMPT.to_string(), }, Message::User { - content: context.to_string(), + content: serde_json::to_string_pretty(checkpoint) + .map_err(|e| format!("failed to serialize checkpoint: {e}"))?, }, ]; - let mut files_changed: u32 = 0; - let mut summary_parts: Vec = Vec::new(); + let mut touched_paths = BTreeSet::new(); + let mut summary_parts = Vec::new(); - // Mini agentic loop - for _step 
in 1..=MAX_CURATOR_STEPS { + for _ in 1..=MAX_CURATOR_STEPS { if cancel.is_cancelled() { + tools.cleanup(); return Ok(CuratorResult { summary: "Curator cancelled".into(), - files_changed, + files_changed: touched_paths.len() as u32, }); } - // Call model (non-streaming — curator runs silently) let turn = model .chat(&messages, &tool_defs) .await .map_err(|e| e.to_string())?; - // Append assistant message let tool_calls_opt = if turn.tool_calls.is_empty() { None } else { @@ -188,30 +250,29 @@ pub async fn run_curator( tool_calls: tool_calls_opt, }); - // No tool calls → curator is done if turn.tool_calls.is_empty() { - if turn.text.contains("No wiki updates needed") { + if turn.text.trim() == "No wiki updates needed" { + tools.cleanup(); return Ok(CuratorResult { summary: "No wiki updates needed".into(), files_changed: 0, }); } - if !turn.text.is_empty() && summary_parts.is_empty() { - summary_parts.push(turn.text.clone()); + if !turn.text.trim().is_empty() { + summary_parts.push(turn.text.trim().to_string()); } break; } - // Execute tool calls for tc in &turn.tool_calls { if cancel.is_cancelled() { + tools.cleanup(); return Ok(CuratorResult { summary: "Curator cancelled".into(), - files_changed, + files_changed: touched_paths.len() as u32, }); } - // Validate tool is in allowed set if !CURATOR_TOOL_NAMES.contains(&tc.name.as_str()) { messages.push(Message::Tool { tool_call_id: tc.id.clone(), @@ -221,16 +282,10 @@ pub async fn run_curator( } let result = tools.execute(&tc.name, &tc.arguments).await; - - // Track file modifications - if matches!(tc.name.as_str(), "write_file" | "edit_file" | "apply_patch" | "hashline_edit") - && !result.is_error - { - files_changed += 1; - // Extract path for summary + if matches!(tc.name.as_str(), "write_file" | "edit_file") && !result.is_error { if let Ok(args) = serde_json::from_str::(&tc.arguments) { - if let Some(path) = args.get("path").and_then(|p| p.as_str()) { - summary_parts.push(format!("Updated {}", path)); + if let 
Some(path) = args.get("path").and_then(|value| value.as_str()) { + touched_paths.insert(path.to_string()); } } } @@ -244,6 +299,10 @@ pub async fn run_curator( tools.cleanup(); + if !touched_paths.is_empty() { + summary_parts.push(format!("Updated {} wiki file(s)", touched_paths.len())); + } + let summary = if summary_parts.is_empty() { "Curator completed with no changes".into() } else { @@ -252,98 +311,63 @@ pub async fn run_curator( Ok(CuratorResult { summary, - files_changed, + files_changed: touched_paths.len() as u32, }) } #[cfg(test)] mod tests { use super::*; - use crate::model::ToolCall; - - #[test] - fn test_extract_step_context_empty() { - let messages: Vec = vec![]; - assert_eq!(extract_step_context(&messages), ""); - } - - #[test] - fn test_extract_step_context_no_assistant() { - let messages = vec![ - Message::System { content: "sys".into() }, - Message::User { content: "hello".into() }, - ]; - assert_eq!(extract_step_context(&messages), ""); - } - - #[test] - fn test_extract_step_context_with_tool_calls() { - let messages = vec![ - Message::System { content: "sys".into() }, - Message::User { content: "investigate".into() }, - Message::Assistant { - content: "I'll search for data.".into(), - tool_calls: Some(vec![ToolCall { - id: "t1".into(), - name: "web_search".into(), - arguments: r#"{"query":"test"}"#.into(), - }]), - }, - Message::Tool { - tool_call_id: "t1".into(), - content: "Search results here".into(), - }, - ]; - let ctx = extract_step_context(&messages); - assert!(ctx.contains("I'll search for data")); - assert!(ctx.contains("web_search")); - assert!(ctx.contains("Search results here")); - } #[test] - fn test_extract_step_context_truncation() { - let big_content = "x".repeat(MAX_CONTEXT_CHARS + 1000); - let messages = vec![Message::Assistant { - content: big_content, - tool_calls: None, - }]; - let ctx = extract_step_context(&messages); - assert!(ctx.len() <= MAX_CONTEXT_CHARS + 50); // +50 for prefix/suffix - 
assert!(ctx.contains("[truncated]")); + fn test_build_state_delta_trims_tool_output() { + let tools = vec![( + "call-1".to_string(), + "read_file".to_string(), + "{\"path\":\"a.md\"}".to_string(), + "x".repeat(MAX_TOOL_OUTPUT_EXCERPT + 64), + false, + )]; + + let delta = + build_state_delta(3, LoopPhase::Investigate, "Investigate sources", &tools).unwrap(); + + assert_eq!(delta.step, 3); + assert_eq!(delta.phase, LoopPhase::Investigate); + assert_eq!(delta.observations.len(), 1); + assert!(delta.observations[0].output_excerpt.contains("[truncated]")); } #[test] - fn test_extract_step_context_last_assistant_only() { - let messages = vec![ - Message::Assistant { - content: "old step".into(), - tool_calls: None, - }, - Message::User { content: "continue".into() }, - Message::Assistant { - content: "new step".into(), - tool_calls: Some(vec![ToolCall { - id: "t2".into(), - name: "read_file".into(), - arguments: "{}".into(), - }]), - }, - Message::Tool { - tool_call_id: "t2".into(), - content: "file contents".into(), - }, - ]; - let ctx = extract_step_context(&messages); - assert!(!ctx.contains("old step")); - assert!(ctx.contains("new step")); - assert!(ctx.contains("file contents")); + fn test_build_state_delta_skips_empty_success_observations() { + let tools = vec![( + "call-1".to_string(), + "read_file".to_string(), + "{}".to_string(), + String::new(), + false, + )]; + + assert!(build_state_delta(1, LoopPhase::Investigate, "Investigate", &tools).is_none()); } #[test] fn test_curator_tool_names_no_dangerous_tools() { for name in CURATOR_TOOL_NAMES { - assert!(!["web_search", "fetch_url", "run_shell", "run_shell_bg", "check_shell_bg", "kill_shell_bg"] - .contains(name), "Curator should not have access to {name}"); + assert!( + ![ + "web_search", + "fetch_url", + "run_shell", + "run_shell_bg", + "check_shell_bg", + "kill_shell_bg", + "apply_patch", + "hashline_edit" + ] + .contains(name), + "Curator should not have access to {name}" + ); } } } diff --git 
a/openplanter-desktop/crates/op-core/src/engine/investigation_state.rs b/openplanter-desktop/crates/op-core/src/engine/investigation_state.rs
new file mode 100644
index 00000000..c4e80d2b
--- /dev/null
+++ b/openplanter-desktop/crates/op-core/src/engine/investigation_state.rs
@@ -0,0 +1,2251 @@
+use chrono::Utc;
+use serde::{Deserialize, Serialize};
+use serde_json::{Map, Value};
+use std::collections::{BTreeMap, BTreeSet};
+
+const SCHEMA_VERSION: &str = "1.0.0";
+const ONTOLOGY_NAMESPACE: &str = "openplanter.core";
+const ONTOLOGY_VERSION: &str = "2026-03";
+const LOW_CONFIDENCE_THRESHOLD: f64 = 0.60;
+const VERY_LOW_CONFIDENCE_THRESHOLD: f64 = 0.40;
+const MAX_CANDIDATE_ACTIONS: usize = 24;
+const REQUIRED_EVIDENCE_COUNT: usize = 1;
+const PLANNER_GENERATED_BY: &str = "question_reasoning_packet.v1";
+const LEGACY_KNOWN_KEYS: &[&str] = &[
+    "session_id",
+    "saved_at",
+    "external_observations",
+    "observations",
+    "turn_history",
+    "loop_metrics",
+];
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct InvestigationState {
+    #[serde(default = "default_schema_version")]
+    pub schema_version: String,
+    #[serde(default)]
+    pub session_id: String,
+    #[serde(default)]
+    pub created_at: String,
+    #[serde(default)]
+    pub updated_at: String,
+    #[serde(default)]
+    pub objective: String,
+    #[serde(default)]
+    pub ontology: Ontology,
+    #[serde(default)]
+    pub entities: BTreeMap<String, Value>,
+    #[serde(default)]
+    pub links: BTreeMap<String, Value>,
+    #[serde(default)]
+    pub claims: BTreeMap<String, Value>,
+    #[serde(default)]
+    pub evidence: BTreeMap<String, Value>,
+    #[serde(default)]
+    pub hypotheses: BTreeMap<String, Value>,
+    #[serde(default)]
+    pub questions: BTreeMap<String, Value>,
+    #[serde(default)]
+    pub tasks: BTreeMap<String, Value>,
+    #[serde(default)]
+    pub actions: BTreeMap<String, Value>,
+    #[serde(default)]
+    pub provenance_nodes: BTreeMap<String, Value>,
+    #[serde(default)]
+    pub confidence_profiles: BTreeMap<String, Value>,
+    #[serde(default)]
+    pub timeline: Vec<Value>,
+    #[serde(default)]
+    pub indexes: Indexes,
+    #[serde(default)]
+    pub legacy: LegacyState,
+}
+
+#[derive(Debug, Clone,
Serialize, Deserialize)]
+pub struct Ontology {
+    #[serde(default = "default_ontology_namespace")]
+    pub namespace: String,
+    #[serde(default = "default_ontology_version")]
+    pub version: String,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, Default)]
+pub struct LegacyState {
+    #[serde(default)]
+    pub external_observations: Vec<String>,
+    #[serde(default)]
+    pub turn_history: Vec<Value>,
+    #[serde(default)]
+    pub loop_metrics: Map<String, Value>,
+    #[serde(default)]
+    pub extra_fields: BTreeMap<String, Value>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, Default)]
+pub struct Indexes {
+    #[serde(default)]
+    pub by_external_ref: BTreeMap<String, Value>,
+    #[serde(default)]
+    pub by_tag: BTreeMap<String, Value>,
+}
+
+impl Default for InvestigationState {
+    fn default() -> Self {
+        Self::new("")
+    }
+}
+
+impl Default for Ontology {
+    fn default() -> Self {
+        Self {
+            namespace: default_ontology_namespace(),
+            version: default_ontology_version(),
+        }
+    }
+}
+
+impl InvestigationState {
+    pub fn new(session_id: &str) -> Self {
+        let ts = now();
+        Self {
+            schema_version: default_schema_version(),
+            session_id: session_id.to_string(),
+            created_at: ts.clone(),
+            updated_at: ts,
+            objective: String::new(),
+            ontology: Ontology::default(),
+            entities: BTreeMap::new(),
+            links: BTreeMap::new(),
+            claims: BTreeMap::new(),
+            evidence: BTreeMap::new(),
+            hypotheses: BTreeMap::new(),
+            questions: BTreeMap::new(),
+            tasks: BTreeMap::new(),
+            actions: BTreeMap::new(),
+            provenance_nodes: BTreeMap::new(),
+            confidence_profiles: BTreeMap::new(),
+            timeline: vec![],
+            indexes: Indexes::default(),
+            legacy: LegacyState::default(),
+        }
+    }
+
+    pub fn from_legacy_python_state(session_id: &str, legacy_json: &Value) -> Self {
+        let mut state = Self::new(session_id);
+        let Some(obj) = legacy_json.as_object() else {
+            return state;
+        };
+
+        if let Some(saved_at) = obj.get("saved_at").and_then(Value::as_str) {
+            state.updated_at = saved_at.to_string();
+            state.created_at = saved_at.to_string();
+        }
+        if let Some(session_id) =
obj.get("session_id").and_then(Value::as_str) { + state.session_id = session_id.to_string(); + } + state.legacy.external_observations = obj + .get("external_observations") + .and_then(Value::as_array) + .map(|items| string_vec(items)) + .unwrap_or_default(); + state.legacy.turn_history = obj + .get("turn_history") + .and_then(Value::as_array) + .cloned() + .unwrap_or_default(); + state.legacy.loop_metrics = obj + .get("loop_metrics") + .and_then(Value::as_object) + .cloned() + .unwrap_or_default(); + state.legacy.extra_fields = extra_fields_from_object(obj); + let observations = state.legacy.external_observations.clone(); + state.merge_legacy_updates( + &observations, + Some(&state.legacy.turn_history.clone()), + Some(&state.legacy.loop_metrics.clone()), + Some(&state.legacy.extra_fields.clone()), + ); + state + } + + pub fn from_legacy_rust_state(session_id: &str, legacy_json: &Value) -> Self { + let mut state = Self::new(session_id); + let Some(obj) = legacy_json.as_object() else { + return state; + }; + + state.legacy.external_observations = obj + .get("observations") + .and_then(Value::as_array) + .map(|items| { + items + .iter() + .filter_map(|item| item.get("content").and_then(Value::as_str)) + .map(ToString::to_string) + .collect() + }) + .unwrap_or_default(); + state.legacy.extra_fields = extra_fields_from_object(obj); + let observations = state.legacy.external_observations.clone(); + state.merge_legacy_updates( + &observations, + Some(&state.legacy.turn_history.clone()), + Some(&state.legacy.loop_metrics.clone()), + Some(&state.legacy.extra_fields.clone()), + ); + state + } + + pub fn legacy_observations(&self) -> Vec { + if !self.legacy.external_observations.is_empty() { + return self.legacy.external_observations.clone(); + } + + let mut observations: Vec<(String, String)> = self + .evidence + .iter() + .filter_map(|(evidence_id, record)| { + if !is_legacy_evidence(evidence_id, record) { + return None; + } + record + .get("content") + 
.and_then(Value::as_str) + .map(|content| (evidence_id.clone(), content.to_string())) + }) + .collect(); + observations.sort_by(|left, right| left.0.cmp(&right.0)); + observations + .into_iter() + .map(|(_, content)| content) + .collect() + } + + pub fn merge_legacy_updates( + &mut self, + observations: &[String], + turn_history: Option<&[Value]>, + loop_metrics: Option<&Map>, + extra_fields: Option<&BTreeMap>, + ) { + let ts = now(); + if self.created_at.is_empty() { + self.created_at = ts.clone(); + } + self.updated_at = ts.clone(); + self.schema_version = default_schema_version(); + self.legacy.external_observations = observations.to_vec(); + if let Some(turn_history) = turn_history { + self.legacy.turn_history = turn_history.to_vec(); + } + if let Some(loop_metrics) = loop_metrics { + self.legacy.loop_metrics = loop_metrics.clone(); + } + if let Some(extra_fields) = extra_fields { + self.legacy.extra_fields = extra_fields.clone(); + } + + for (index, observation) in observations.iter().enumerate() { + let evidence_id = legacy_evidence_id(index); + let source_uri = legacy_source_uri(index); + let created_at = self + .evidence + .get(&evidence_id) + .and_then(|value| value.get("created_at")) + .and_then(Value::as_str) + .unwrap_or(ts.as_str()) + .to_string(); + self.evidence.insert( + evidence_id.clone(), + serde_json::json!({ + "id": evidence_id, + "evidence_type": "legacy_observation", + "content": observation, + "source_uri": source_uri, + "normalization": { + "kind": "legacy_observation", + "normalization_version": "legacy-v1", + }, + "provenance_ids": [], + "confidence_id": Value::Null, + "created_at": created_at, + "updated_at": ts, + }), + ); + self.indexes + .by_external_ref + .insert(source_uri, Value::String(legacy_evidence_id(index))); + } + + let keep_ids: BTreeSet = (0..observations.len()).map(legacy_evidence_id).collect(); + self.evidence.retain(|evidence_id, record| { + !is_legacy_evidence(evidence_id, record) || keep_ids.contains(evidence_id) + 
}); + self.indexes.by_external_ref.retain(|source_ref, target| { + if !source_ref.starts_with("state.json#external_observations[") { + return true; + } + target + .as_str() + .map(|target| keep_ids.contains(target)) + .unwrap_or(false) + }); + } + + pub fn to_legacy_python_projection(&self) -> Value { + let mut projected = Map::new(); + projected.insert( + "session_id".to_string(), + Value::String(self.session_id.clone()), + ); + projected.insert( + "saved_at".to_string(), + Value::String(self.updated_at.clone()), + ); + projected.insert( + "external_observations".to_string(), + Value::Array( + self.legacy_observations() + .into_iter() + .map(Value::String) + .collect(), + ), + ); + projected.insert( + "turn_history".to_string(), + Value::Array(self.legacy.turn_history.clone()), + ); + projected.insert( + "loop_metrics".to_string(), + Value::Object(self.legacy.loop_metrics.clone()), + ); + for (key, value) in &self.legacy.extra_fields { + projected + .entry(key.clone()) + .or_insert_with(|| value.clone()); + } + Value::Object(projected) + } +} + +pub fn build_question_reasoning_packet( + state: &InvestigationState, + max_questions: usize, + max_evidence_per_item: usize, +) -> Value { + let mut unresolved_questions: Vec = state + .questions + .iter() + .filter_map(|(question_id, raw_question)| { + let question = raw_question.as_object()?; + let status = question + .get("status") + .and_then(Value::as_str) + .unwrap_or("open") + .to_ascii_lowercase(); + if matches!( + status.as_str(), + "resolved" | "closed" | "wont_fix" | "won't_fix" + ) { + return None; + } + + Some(normalize_unresolved_question( + question_id, + question, + &status, + max_evidence_per_item, + )) + }) + .collect(); + unresolved_questions.sort_by(question_priority_sort_key); + unresolved_questions.truncate(std::cmp::max(1, max_questions)); + let focus_question_ids = unresolved_questions + .iter() + .filter_map(|item| { + item.get("id") + .and_then(Value::as_str) + .map(ToString::to_string) + }) + 
.collect::>(); + + let mut supported = Vec::new(); + let mut contested = Vec::new(); + let mut unresolved = Vec::new(); + let mut contradictions = Vec::new(); + + for (claim_id, raw_claim) in &state.claims { + let Some(claim) = raw_claim.as_object() else { + continue; + }; + let claim_status = claim + .get("status") + .and_then(Value::as_str) + .unwrap_or("unresolved") + .to_ascii_lowercase(); + let support_ids = claim_support_evidence_ids(claim, max_evidence_per_item); + let contradiction_ids = claim_contradiction_evidence_ids(claim, max_evidence_per_item); + let has_contradictions = !contradiction_ids.is_empty(); + let confidence = claim + .get("confidence") + .cloned() + .or_else(|| claim.get("confidence_score").cloned()) + .unwrap_or(Value::Null); + let claim_summary = serde_json::json!({ + "id": claim.get("id").and_then(Value::as_str).unwrap_or(claim_id), + "claim": claim + .get("claim_text") + .and_then(Value::as_str) + .or_else(|| claim.get("text").and_then(Value::as_str)) + .unwrap_or_default(), + "status": claim_status, + "confidence": confidence, + "support_evidence_ids": support_ids, + "contradiction_evidence_ids": contradiction_ids, + }); + + if has_contradictions { + contradictions.push(serde_json::json!({ + "claim_id": claim.get("id").and_then(Value::as_str).unwrap_or(claim_id), + "support_evidence_ids": claim_summary["support_evidence_ids"].clone(), + "contradiction_evidence_ids": claim_summary["contradiction_evidence_ids"].clone(), + })); + } + + if claim_status == "supported" { + supported.push(claim_summary); + } else if claim_status == "contested" || has_contradictions { + contested.push(claim_summary); + } else { + unresolved.push(claim_summary); + } + } + + let mut evidence_index = Map::new(); + for evidence_id in + collect_evidence_ids(&[&unresolved_questions, &supported, &contested, &unresolved]) + { + let Some(record) = state.evidence.get(&evidence_id).and_then(Value::as_object) else { + continue; + }; + evidence_index.insert( + 
evidence_id.clone(), + serde_json::json!({ + "evidence_type": record.get("evidence_type").cloned().unwrap_or(Value::Null), + "provenance_ids": id_list(record.get("provenance_ids")), + "source_uri": record.get("source_uri").cloned().unwrap_or(Value::Null), + "confidence_id": record.get("confidence_id").cloned().unwrap_or(Value::Null), + }), + ); + } + let candidate_actions = build_candidate_actions( + state, + &unresolved_questions, + &focus_question_ids, + max_evidence_per_item, + ); + + serde_json::json!({ + "reasoning_mode": "question_centric", + "loop": [ + "select_unresolved_question", + "gather_discriminating_evidence", + "update_claim_status_and_confidence", + "record_contradictions", + "synthesize_supported_contested_unresolved", + ], + "focus_question_ids": focus_question_ids, + "unresolved_questions": unresolved_questions, + "findings": { + "supported": supported, + "contested": contested, + "unresolved": unresolved, + }, + "contradictions": contradictions, + "evidence_index": evidence_index, + "candidate_actions": candidate_actions, + }) +} + +pub fn has_reasoning_content(packet: &Value) -> bool { + let Some(obj) = packet.as_object() else { + return false; + }; + if obj + .get("candidate_actions") + .and_then(Value::as_array) + .is_some_and(|items| !items.is_empty()) + { + return true; + } + if obj + .get("focus_question_ids") + .and_then(Value::as_array) + .is_some_and(|items| !items.is_empty()) + { + return true; + } + if obj + .get("contradictions") + .and_then(Value::as_array) + .is_some_and(|items| !items.is_empty()) + { + return true; + } + obj.get("findings") + .and_then(Value::as_object) + .is_some_and(|findings| { + ["supported", "contested", "unresolved"].iter().any(|key| { + findings + .get(*key) + .and_then(Value::as_array) + .is_some_and(|items| !items.is_empty()) + }) + }) +} + +fn build_candidate_actions( + state: &InvestigationState, + focus_questions: &[Value], + focus_question_ids: &[String], + max_evidence_per_item: usize, +) -> Vec { + 
let mut actions = Vec::new(); + let mut seen = BTreeSet::new(); + + for question in focus_questions { + let Some(question_obj) = question.as_object() else { + continue; + }; + let question_id = question_obj + .get("id") + .and_then(Value::as_str) + .unwrap_or_default(); + if question_id.is_empty() { + continue; + } + let question_record = find_question_record(state, question_id).unwrap_or(question_obj); + let claim_ids = question_claim_ids(question_obj); + let question_evidence_ids = question_evidence_ids(question_obj, max_evidence_per_item); + let linked_claim_evidence_ids = claim_ids + .iter() + .flat_map(|claim_id| claim_evidence_ids(state, claim_id, max_evidence_per_item)) + .collect::>(); + let evidence_ids = dedupe_strings( + question_evidence_ids + .iter() + .chain(linked_claim_evidence_ids.iter()) + .cloned() + .collect(), + ); + let entity_ids = question_entity_ids(state, question_record, &claim_ids, &evidence_ids); + let mut reason_codes = vec!["question_unresolved".to_string()]; + for claim_id in &claim_ids { + for code in claim_reason_codes(state, claim_id) { + if !reason_codes.contains(&code) { + reason_codes.push(code); + } + } + } + let evidence_gap_refs = build_question_gap_refs( + state, + question_id, + question_record, + &claim_ids, + &question_evidence_ids, + max_evidence_per_item, + ); + let dependency_refs = gap_ids(&evidence_gap_refs); + let ontology_object_refs = build_ontology_object_refs( + state, + Some(question_id), + &claim_ids, + &evidence_ids, + &entity_ids, + &dependency_refs, + ); + let mut source_records = vec![question_record]; + for claim_id in &claim_ids { + let Some(claim) = state.claims.get(claim_id).and_then(Value::as_object) else { + continue; + }; + source_records.push(claim); + } + let required_sources = collect_required_sources(state, &source_records, &evidence_ids); + let action_type = if claim_ids.is_empty() { + "search" + } else { + "verify_claim" + }; + let priority = normalize_priority( + question_obj + 
.get("priority") + .and_then(Value::as_str) + .unwrap_or("medium"), + ); + let action_id = format!("ca_q_{question_id}"); + if seen.insert(action_id.clone()) { + actions.push(serde_json::json!({ + "id": action_id, + "action_type": action_type, + "status": "proposed", + "priority": priority, + "title": format!("Resolve question {question_id}"), + "description": format!("Advance question {question_id} using discriminating evidence tied to canonical state refs."), + "opened_by_question_id": question_id, + "target_question_ids": [question_id], + "target_claim_ids": claim_ids, + "reason_codes": reason_codes.clone(), + "rationale": { + "summary": "question_unresolved", + "reason_codes": reason_codes, + "blocking_gap_ids": dependency_refs, + }, + "required_sources": required_sources, + "required_inputs": { + "question_ids": [question_id], + "claim_ids": claim_ids, + "evidence_ids": evidence_ids, + "entity_ids": entity_ids, + "external_dependencies": Vec::::new(), + }, + "expected_payoff": payoff_for_priority(priority, action_type), + "suggested_tools": suggested_tools(action_type), + "evidence_gap_refs": evidence_gap_refs, + "ontology_object_refs": ontology_object_refs, + "generated_by": PLANNER_GENERATED_BY, + })); + } + } + + for (claim_id, raw_claim) in &state.claims { + let Some(claim) = raw_claim.as_object() else { + continue; + }; + let claim_status = claim_status(claim); + if matches!(claim_status.as_str(), "retracted" | "resolved" | "closed") { + continue; + } + let confidence = claim_confidence(claim); + let reason_codes = claim_reason_codes(state, claim_id); + if reason_codes.is_empty() { + continue; + } + let evidence_ids = claim_evidence_ids(state, claim_id, max_evidence_per_item); + let entity_ids = claim_entity_ids(state, claim, &evidence_ids); + let evidence_gap_refs = build_claim_gap_refs( + state, + claim_id, + &claim_status, + confidence, + &evidence_ids, + max_evidence_per_item, + ); + let dependency_refs = gap_ids(&evidence_gap_refs); + let 
opened_by_question_id = focus_questions + .iter() + .filter_map(Value::as_object) + .find(|question| { + question_claim_ids(question) + .iter() + .any(|candidate| candidate == claim_id) + }) + .and_then(|question| question.get("id").and_then(Value::as_str)) + .map(ToString::to_string); + let target_question_ids = opened_by_question_id + .clone() + .into_iter() + .filter(|question_id| { + focus_question_ids + .iter() + .any(|candidate| candidate == question_id) + }) + .collect::>(); + let ontology_object_refs = build_ontology_object_refs( + state, + opened_by_question_id.as_deref(), + &[claim_id.clone()], + &evidence_ids, + &entity_ids, + &dependency_refs, + ); + let mut source_records = vec![claim]; + if let Some(question_id) = opened_by_question_id.as_deref() { + if let Some(question) = find_question_record(state, question_id) { + source_records.push(question); + } + } + let required_sources = collect_required_sources(state, &source_records, &evidence_ids); + let priority = claim_candidate_priority(&claim_status, confidence); + let action_id = format!("ca_c_{claim_id}"); + if seen.insert(action_id.clone()) { + actions.push(serde_json::json!({ + "id": action_id, + "action_type": "verify_claim", + "status": "proposed", + "priority": priority, + "title": format!("Verify claim {claim_id}"), + "description": format!("Raise confidence for claim {claim_id} with additional cited evidence and contradiction tracking."), + "opened_by_question_id": opened_by_question_id, + "target_question_ids": target_question_ids, + "target_claim_ids": [claim_id], + "reason_codes": reason_codes.clone(), + "rationale": { + "summary": "claim_requires_verification", + "reason_codes": reason_codes, + "blocking_gap_ids": dependency_refs, + }, + "required_sources": required_sources, + "required_inputs": { + "question_ids": target_question_ids, + "claim_ids": [claim_id], + "evidence_ids": evidence_ids, + "entity_ids": entity_ids, + "external_dependencies": Vec::::new(), + }, + "expected_payoff": 
payoff_for_priority(priority, "verify_claim"), + "suggested_tools": suggested_tools("verify_claim"), + "evidence_gap_refs": evidence_gap_refs, + "ontology_object_refs": ontology_object_refs, + "generated_by": PLANNER_GENERATED_BY, + })); + } + } + + actions.sort_by(candidate_action_sort_key); + actions.truncate(MAX_CANDIDATE_ACTIONS); + actions +} + +fn candidate_action_sort_key(left: &Value, right: &Value) -> std::cmp::Ordering { + let left_priority = question_priority_rank(left.get("priority").and_then(Value::as_str)); + let right_priority = question_priority_rank(right.get("priority").and_then(Value::as_str)); + left_priority + .cmp(&right_priority) + .then_with(|| candidate_action_origin_rank(left).cmp(&candidate_action_origin_rank(right))) + .then_with(|| { + left.get("id") + .and_then(Value::as_str) + .unwrap_or_default() + .cmp(right.get("id").and_then(Value::as_str).unwrap_or_default()) + }) +} + +fn candidate_action_origin_rank(action: &Value) -> u8 { + match action + .get("id") + .and_then(Value::as_str) + .unwrap_or_default() + .starts_with("ca_q_") + { + true => 0, + false => 1, + } +} + +fn normalize_priority(priority: &str) -> &'static str { + match priority.to_ascii_lowercase().as_str() { + "critical" => "critical", + "high" => "high", + "medium" => "medium", + "low" => "low", + _ => "medium", + } +} + +fn claim_candidate_priority(claim_status: &str, confidence: Option) -> &'static str { + if matches!(claim_status, "unresolved" | "proposed") { + "high" + } else if confidence.is_some_and(|value| value <= VERY_LOW_CONFIDENCE_THRESHOLD) { + "high" + } else { + "medium" + } +} + +fn claim_status(claim: &Map) -> String { + claim + .get("status") + .and_then(Value::as_str) + .unwrap_or("unresolved") + .to_ascii_lowercase() +} + +fn claim_confidence(claim: &Map) -> Option { + parse_confidence( + claim + .get("confidence") + .or_else(|| claim.get("confidence_score")), + ) +} + +fn find_question_record<'a>( + state: &'a InvestigationState, + question_id: 
&str, +) -> Option<&'a Map> { + state + .questions + .get(question_id) + .and_then(Value::as_object) + .or_else(|| { + state + .questions + .values() + .filter_map(Value::as_object) + .find(|record| record.get("id").and_then(Value::as_str) == Some(question_id)) + }) +} + +fn question_claim_ids(question: &Map) -> Vec { + id_list( + question + .get("claim_ids") + .or_else(|| question.get("claims")) + .or_else(|| { + question + .get("origin") + .and_then(Value::as_object) + .and_then(|origin| origin.get("claim_ids")) + }), + ) +} + +fn question_trigger_ids(question: &Map) -> Vec { + let origin = question.get("origin").and_then(Value::as_object); + id_list( + question + .get("trigger") + .or_else(|| question.get("triggers")) + .or_else(|| origin.and_then(|origin| origin.get("trigger"))) + .or_else(|| origin.and_then(|origin| origin.get("triggers"))), + ) +} + +fn question_evidence_ids( + question: &Map, + max_evidence_per_item: usize, +) -> Vec { + limit_ids( + question.get("evidence_ids").or_else(|| { + question + .get("origin") + .and_then(Value::as_object) + .and_then(|origin| origin.get("evidence_ids")) + }), + max_evidence_per_item, + ) +} + +fn normalize_unresolved_question( + question_id: &str, + question: &Map, + status: &str, + max_evidence_per_item: usize, +) -> Value { + serde_json::json!({ + "id": question.get("id").and_then(Value::as_str).unwrap_or(question_id), + "question": question + .get("question_text") + .and_then(Value::as_str) + .or_else(|| question.get("question").and_then(Value::as_str)) + .unwrap_or_default(), + "status": status, + "priority": question + .get("priority") + .and_then(Value::as_str) + .unwrap_or("medium") + .to_ascii_lowercase(), + "claim_ids": question_claim_ids(question), + "evidence_ids": question_evidence_ids(question, max_evidence_per_item), + "triggers": question_trigger_ids(question), + "updated_at": question + .get("updated_at") + .and_then(Value::as_str) + .unwrap_or_default(), + }) +} + +fn claim_evidence_ids( + state: 
&InvestigationState, + claim_id: &str, + max_evidence_per_item: usize, +) -> Vec { + let Some(claim) = state.claims.get(claim_id).and_then(Value::as_object) else { + return Vec::new(); + }; + dedupe_strings( + claim_support_evidence_ids(claim, max_evidence_per_item) + .into_iter() + .chain(claim_contradiction_evidence_ids( + claim, + max_evidence_per_item, + )) + .collect(), + ) +} + +fn claim_support_evidence_ids( + claim: &Map, + max_evidence_per_item: usize, +) -> Vec { + limit_ids( + claim + .get("support_evidence_ids") + .or_else(|| claim.get("evidence_support_ids")) + .or_else(|| claim.get("evidence_ids")), + max_evidence_per_item, + ) +} + +fn claim_contradiction_evidence_ids( + claim: &Map, + max_evidence_per_item: usize, +) -> Vec { + limit_ids( + claim + .get("contradiction_evidence_ids") + .or_else(|| claim.get("evidence_contra_ids")) + .or_else(|| claim.get("contradict_evidence_ids")), + max_evidence_per_item, + ) +} + +fn claim_reason_codes(state: &InvestigationState, claim_id: &str) -> Vec { + let Some(claim) = state.claims.get(claim_id).and_then(Value::as_object) else { + return Vec::new(); + }; + let claim_status = claim_status(claim); + let confidence = claim_confidence(claim); + let mut reason_codes = Vec::new(); + if matches!(claim_status.as_str(), "unresolved" | "proposed") { + reason_codes.push("claim_unresolved".to_string()); + } + if confidence.is_none() { + reason_codes.push("claim_missing_confidence".to_string()); + } else if confidence.is_some_and(|value| value < LOW_CONFIDENCE_THRESHOLD) { + reason_codes.push("claim_low_confidence".to_string()); + } + reason_codes +} + +fn build_question_gap_refs( + state: &InvestigationState, + question_id: &str, + question: &Map, + claim_ids: &[String], + question_evidence_ids: &[String], + max_evidence_per_item: usize, +) -> Vec { + let mut refs = Vec::new(); + if question_evidence_ids.is_empty() { + refs.push(serde_json::json!({ + "gap_id": format!("gap:question:{question_id}:missing_evidence"), + 
"kind": "missing_evidence", + "scope": "question", + "question_id": question_id, + "claim_id": Value::Null, + "current_evidence_ids": [], + "current_evidence_count": 0, + "required_evidence_count": REQUIRED_EVIDENCE_COUNT, + "blocking": true, + })); + } + let related_entity_ids = id_list(question.get("related_entity_ids")); + if !related_entity_ids.is_empty() && question_evidence_ids.is_empty() { + refs.push(serde_json::json!({ + "gap_id": format!("gap:question:{question_id}:missing_entity_evidence"), + "kind": "missing_evidence", + "scope": "question", + "question_id": question_id, + "claim_id": Value::Null, + "current_evidence_ids": question_evidence_ids, + "current_evidence_count": question_evidence_ids.len(), + "required_evidence_count": REQUIRED_EVIDENCE_COUNT, + "blocking": true, + })); + } + for claim_id in claim_ids { + refs.extend(build_claim_gap_refs( + state, + claim_id, + &state + .claims + .get(claim_id) + .and_then(Value::as_object) + .map(claim_status) + .unwrap_or_else(|| "unresolved".to_string()), + state + .claims + .get(claim_id) + .and_then(Value::as_object) + .and_then(claim_confidence), + &claim_evidence_ids(state, claim_id, max_evidence_per_item), + max_evidence_per_item, + )); + } + dedupe_objects_by_id(refs, "gap_id") +} + +fn build_claim_gap_refs( + state: &InvestigationState, + claim_id: &str, + claim_status: &str, + confidence: Option, + evidence_ids: &[String], + max_evidence_per_item: usize, +) -> Vec { + let Some(claim) = state.claims.get(claim_id).and_then(Value::as_object) else { + return Vec::new(); + }; + let support_ids = claim_support_evidence_ids(claim, max_evidence_per_item); + let contradiction_ids = claim_contradiction_evidence_ids(claim, max_evidence_per_item); + let mut refs = Vec::new(); + if evidence_ids.is_empty() { + refs.push(serde_json::json!({ + "gap_id": format!("gap:claim:{claim_id}:missing_evidence"), + "kind": "missing_evidence", + "scope": "claim", + "question_id": Value::Null, + "claim_id": claim_id, + 
"current_evidence_ids": [], + "current_evidence_count": 0, + "required_evidence_count": REQUIRED_EVIDENCE_COUNT, + "blocking": true, + })); + } + if matches!( + claim_status.to_ascii_lowercase().as_str(), + "unresolved" | "contested" | "proposed" + ) && (!support_ids.is_empty() || !contradiction_ids.is_empty()) + && (support_ids.is_empty() || contradiction_ids.is_empty()) + { + refs.push(serde_json::json!({ + "gap_id": format!("gap:claim:{claim_id}:missing_counter_evidence"), + "kind": "missing_counter_evidence", + "scope": "claim", + "question_id": Value::Null, + "claim_id": claim_id, + "current_evidence_ids": evidence_ids, + "current_evidence_count": evidence_ids.len(), + "required_evidence_count": REQUIRED_EVIDENCE_COUNT, + "blocking": true, + })); + } + if confidence.is_none() { + refs.push(serde_json::json!({ + "gap_id": format!("gap:claim:{claim_id}:missing_confidence"), + "kind": "missing_confidence", + "scope": "claim", + "question_id": Value::Null, + "claim_id": claim_id, + "current_evidence_ids": evidence_ids, + "current_evidence_count": evidence_ids.len(), + "required_evidence_count": REQUIRED_EVIDENCE_COUNT, + "blocking": true, + })); + } else if confidence.is_some_and(|value| value < LOW_CONFIDENCE_THRESHOLD) { + refs.push(serde_json::json!({ + "gap_id": format!("gap:claim:{claim_id}:low_confidence"), + "kind": "low_confidence", + "scope": "claim", + "question_id": Value::Null, + "claim_id": claim_id, + "current_evidence_ids": evidence_ids, + "current_evidence_count": evidence_ids.len(), + "required_evidence_count": REQUIRED_EVIDENCE_COUNT, + "blocking": true, + })); + } + refs +} + +fn gap_ids(gap_refs: &[Value]) -> Vec { + gap_refs + .iter() + .filter_map(|item| { + item.get("gap_id") + .and_then(Value::as_str) + .map(ToString::to_string) + }) + .collect() +} + +fn suggested_tools(action_type: &str) -> Vec<&'static str> { + match action_type { + "search" => vec!["web_search", "fetch_url", "search_files", "read_file"], + _ => vec!["web_search", 
"fetch_url", "read_file", "search_files"], + } +} + +fn payoff_for_priority(priority: &str, action_type: &str) -> Value { + let base = match priority { + "critical" => 0.90, + "high" => 0.75, + "medium" => 0.55, + "low" => 0.35, + _ => 0.55, + }; + let graph_expansion_value = if action_type == "search" { 0.40 } else { 0.30 }; + let estimated_cost = 0.0; + let payoff_score = + (0.45 * base) + (0.35 * base) + (0.20 * graph_expansion_value) - estimated_cost; + serde_json::json!({ + "uncertainty_reduction": base, + "decision_impact": base, + "graph_expansion_value": graph_expansion_value, + "estimated_cost": estimated_cost, + "payoff_score": payoff_score, + }) +} + +fn collect_required_sources( + state: &InvestigationState, + records: &[&Map], + evidence_ids: &[String], +) -> Vec { + let mut sources = BTreeSet::new(); + for record in records { + for candidate in source_values_from_record(record) { + sources.insert(candidate); + } + for provenance_id in id_list(record.get("provenance_ids")) { + let Some(provenance) = state + .provenance_nodes + .get(&provenance_id) + .and_then(Value::as_object) + else { + continue; + }; + for candidate in source_values_from_record(provenance) { + sources.insert(candidate); + } + } + } + for evidence_id in evidence_ids { + let Some(record) = state.evidence.get(evidence_id).and_then(Value::as_object) else { + continue; + }; + for candidate in source_values_from_record(record) { + sources.insert(candidate); + } + for provenance_id in id_list(record.get("provenance_ids")) { + let Some(provenance) = state + .provenance_nodes + .get(&provenance_id) + .and_then(Value::as_object) + else { + continue; + }; + for candidate in source_values_from_record(provenance) { + sources.insert(candidate); + } + } + } + sources.into_iter().collect() +} + +fn source_values_from_record(record: &Map) -> Vec { + let mut values = Vec::new(); + for key in ["source_uri", "canonical_source_uri", "url"] { + if let Some(source) = record.get(key).and_then(Value::as_str) 
{ + if !source.trim().is_empty() { + values.push(source.to_string()); + } + } + } + for key in ["source_uris", "required_sources", "sources", "urls"] { + values.extend(id_list(record.get(key))); + } + values +} + +fn build_ontology_object_refs( + state: &InvestigationState, + question_id: Option<&str>, + claim_ids: &[String], + evidence_ids: &[String], + entity_ids: &[String], + dependency_refs: &[String], +) -> Vec { + let mut refs = Vec::new(); + let mut seen = BTreeSet::new(); + if let Some(question_id) = question_id { + add_object_ref( + &mut refs, + &mut seen, + question_id, + "question", + "opened_by", + state.questions.get(question_id), + ); + } + for claim_id in claim_ids { + add_object_ref( + &mut refs, + &mut seen, + claim_id, + "claim", + "targets", + state.claims.get(claim_id), + ); + } + for evidence_id in evidence_ids { + add_object_ref( + &mut refs, + &mut seen, + evidence_id, + "evidence", + "depends_on", + state.evidence.get(evidence_id), + ); + let Some(record) = state.evidence.get(evidence_id).and_then(Value::as_object) else { + continue; + }; + for provenance_id in id_list(record.get("provenance_ids")) { + add_object_ref( + &mut refs, + &mut seen, + &provenance_id, + "provenance_node", + "supported_by", + state.provenance_nodes.get(&provenance_id), + ); + } + if let Some(confidence_id) = record.get("confidence_id").and_then(Value::as_str) { + add_object_ref( + &mut refs, + &mut seen, + confidence_id, + "confidence_profile", + "scored_by", + state.confidence_profiles.get(confidence_id), + ); + } + } + for entity_id in entity_ids { + add_object_ref( + &mut refs, + &mut seen, + entity_id, + "entity", + "about", + state.entities.get(entity_id), + ); + } + for dependency_ref in dependency_refs { + add_object_ref( + &mut refs, + &mut seen, + dependency_ref, + "evidence_gap", + "blocked_by", + None, + ); + } + refs +} + +fn add_object_ref( + refs: &mut Vec, + seen: &mut BTreeSet, + object_id: &str, + object_type: &str, + relation: &str, + record: 
Option<&Value>, +) { + let key = format!("{object_type}:{object_id}:{relation}"); + if !seen.insert(key) { + return; + } + refs.push(serde_json::json!({ + "object_id": object_id, + "object_type": object_type, + "relation": relation, + "label": record.and_then(object_label), + })); +} + +fn object_label(record: &Value) -> Option { + let obj = record.as_object()?; + for key in [ + "title", + "label", + "name", + "question_text", + "question", + "claim_text", + "text", + "content", + ] { + if let Some(value) = obj.get(key).and_then(Value::as_str) { + if !value.trim().is_empty() { + return Some(safe_label(value)); + } + } + } + obj.get("source_uri") + .and_then(Value::as_str) + .map(safe_label) +} + +fn safe_label(value: &str) -> String { + let trimmed = value.trim(); + let end = trimmed.floor_char_boundary(trimmed.len().min(96)); + trimmed[..end].to_string() +} + +fn question_entity_ids( + state: &InvestigationState, + question: &Map, + claim_ids: &[String], + evidence_ids: &[String], +) -> Vec { + let mut ids = collect_related_object_ids( + question, + &[ + "related_entity_ids", + "entity_ids", + "entities", + "target_entity_ids", + ], + ); + for claim_id in claim_ids { + let Some(claim) = state.claims.get(claim_id).and_then(Value::as_object) else { + continue; + }; + ids.extend(claim_entity_ids(state, claim, evidence_ids)); + } + dedupe_strings(ids) +} + +fn claim_entity_ids( + state: &InvestigationState, + claim: &Map, + evidence_ids: &[String], +) -> Vec { + let mut ids = collect_related_object_ids( + claim, + &[ + "subject_refs", + "related_entity_ids", + "entity_ids", + "entities", + "subject_entity_ids", + "object_entity_ids", + "about_entity_ids", + ], + ); + for evidence_id in evidence_ids { + let Some(evidence) = state.evidence.get(evidence_id).and_then(Value::as_object) else { + continue; + }; + ids.extend(collect_related_object_ids( + evidence, + &[ + "related_entity_ids", + "entity_ids", + "entities", + "subject_entity_ids", + "object_entity_ids", + 
"about_entity_ids", + ], + )); + } + dedupe_strings(ids) +} + +fn collect_related_object_ids(record: &Map, keys: &[&str]) -> Vec { + let mut ids = Vec::new(); + for key in keys { + ids.extend(id_list(record.get(*key))); + } + ids +} + +fn dedupe_strings(items: Vec) -> Vec { + let mut seen = BTreeSet::new(); + let mut out = Vec::new(); + for item in items { + if item.trim().is_empty() || !seen.insert(item.clone()) { + continue; + } + out.push(item); + } + out +} + +fn dedupe_objects_by_id(items: Vec, key: &str) -> Vec { + let mut seen = BTreeSet::new(); + let mut out = Vec::new(); + for item in items { + let Some(id) = item.get(key).and_then(Value::as_str) else { + continue; + }; + if seen.insert(id.to_string()) { + out.push(item); + } + } + out +} + +fn parse_confidence(value: Option<&Value>) -> Option { + let value = value?; + let parsed = if let Some(number) = value.as_f64() { + Some(number) + } else { + value + .as_str() + .and_then(|text| text.trim().parse::().ok()) + }?; + Some(parsed.clamp(0.0, 1.0)) +} + +fn default_schema_version() -> String { + SCHEMA_VERSION.to_string() +} + +fn default_ontology_namespace() -> String { + ONTOLOGY_NAMESPACE.to_string() +} + +fn default_ontology_version() -> String { + ONTOLOGY_VERSION.to_string() +} + +fn now() -> String { + Utc::now().to_rfc3339() +} + +fn legacy_evidence_id(index: usize) -> String { + format!("ev_legacy_{:06}", index + 1) +} + +fn legacy_source_uri(index: usize) -> String { + format!("state.json#external_observations[{index}]") +} + +fn string_vec(items: &[Value]) -> Vec { + items + .iter() + .filter_map(Value::as_str) + .map(ToString::to_string) + .collect() +} + +fn extra_fields_from_object(obj: &Map) -> BTreeMap { + obj.iter() + .filter(|(key, _)| !LEGACY_KNOWN_KEYS.contains(&key.as_str())) + .map(|(key, value)| (key.clone(), value.clone())) + .collect() +} + +fn is_legacy_evidence(evidence_id: &str, record: &Value) -> bool { + if !evidence_id.starts_with("ev_legacy_") { + return false; + } + record + 
.get("normalization") + .and_then(Value::as_object) + .and_then(|normalization| normalization.get("kind")) + .and_then(Value::as_str) + == Some("legacy_observation") +} + +fn id_list(value: Option<&Value>) -> Vec { + value + .and_then(Value::as_array) + .map(|items| { + items + .iter() + .filter(|item| !item.is_null()) + .map(stringify_value) + .collect() + }) + .unwrap_or_default() +} + +fn limit_ids(value: Option<&Value>, max_items: usize) -> Vec { + let mut ids = id_list(value); + ids.truncate(max_items); + ids +} + +fn stringify_value(value: &Value) -> String { + value + .as_str() + .map(ToString::to_string) + .unwrap_or_else(|| value.to_string()) +} + +fn question_priority_sort_key(left: &Value, right: &Value) -> std::cmp::Ordering { + let left_rank = question_priority_rank(left.get("priority").and_then(Value::as_str)); + let right_rank = question_priority_rank(right.get("priority").and_then(Value::as_str)); + left_rank.cmp(&right_rank).then_with(|| { + left.get("id") + .and_then(Value::as_str) + .unwrap_or_default() + .cmp(right.get("id").and_then(Value::as_str).unwrap_or_default()) + }) +} + +fn question_priority_rank(priority: Option<&str>) -> u8 { + match priority.unwrap_or("medium").to_ascii_lowercase().as_str() { + "critical" => 0, + "high" => 1, + "medium" => 2, + "low" => 3, + _ => 9, + } +} + +fn collect_evidence_ids(collections: &[&Vec]) -> Vec { + let mut seen = BTreeSet::new(); + let mut out = Vec::new(); + for collection in collections { + for item in *collection { + let Some(obj) = item.as_object() else { + continue; + }; + for key in [ + "evidence_ids", + "support_evidence_ids", + "contradiction_evidence_ids", + ] { + let Some(values) = obj.get(key).and_then(Value::as_array) else { + continue; + }; + for value in values { + let evidence_id = stringify_value(value); + if seen.insert(evidence_id.clone()) { + out.push(evidence_id); + } + } + } + } + } + out +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn 
migrates_legacy_python_state_with_extra_fields() { + let legacy = serde_json::json!({ + "session_id": "sid", + "saved_at": "2026-03-13T00:00:00Z", + "external_observations": ["obs-a", "obs-b"], + "turn_history": [{"turn_number": 1}], + "loop_metrics": {"turns": 1}, + "custom_field": "keep-me" + }); + + let state = InvestigationState::from_legacy_python_state("sid", &legacy); + assert_eq!(state.legacy.external_observations, vec!["obs-a", "obs-b"]); + assert_eq!( + state.legacy.extra_fields.get("custom_field"), + Some(&Value::String("keep-me".to_string())) + ); + assert_eq!( + state.evidence["ev_legacy_000001"]["source_uri"], + Value::String("state.json#external_observations[0]".to_string()) + ); + } + + #[test] + fn merge_legacy_updates_preserves_non_legacy_fields_and_prunes_old_legacy_entries() { + let mut state = InvestigationState::new("sid"); + state.questions.insert( + "q_1".to_string(), + serde_json::json!({"id": "q_1", "question_text": "keep me"}), + ); + state.evidence.insert( + "ev_other".to_string(), + serde_json::json!({ + "id": "ev_other", + "content": "keep me", + "normalization": {"kind": "web_fetch"} + }), + ); + state.evidence.insert( + "ev_legacy_000002".to_string(), + serde_json::json!({ + "id": "ev_legacy_000002", + "content": "remove me", + "normalization": {"kind": "legacy_observation"} + }), + ); + let extra_fields = BTreeMap::from([( + "custom_field".to_string(), + Value::String("after".to_string()), + )]); + + state.merge_legacy_updates(&[String::from("fresh")], None, None, Some(&extra_fields)); + + assert!(state.questions.contains_key("q_1")); + assert!(state.evidence.contains_key("ev_other")); + assert!(!state.evidence.contains_key("ev_legacy_000002")); + assert_eq!( + state.evidence["ev_legacy_000001"]["content"], + Value::String("fresh".to_string()) + ); + assert_eq!( + state.legacy.extra_fields.get("custom_field"), + Some(&Value::String("after".to_string())) + ); + } + + #[test] + fn 
build_question_reasoning_packet_groups_findings_and_contradictions() { + let mut state = InvestigationState::new("sid"); + state.questions.insert( + "q_2".to_string(), + serde_json::json!({ + "id": "q_2", + "question_text": "Is claim 2 true?", + "status": "open", + "priority": "high", + "claim_ids": ["cl_2"], + "evidence_ids": ["ev_2"], + }), + ); + state.questions.insert( + "q_1".to_string(), + serde_json::json!({ + "id": "q_1", + "question_text": "Is claim 1 true?", + "status": "open", + "priority": "critical", + "claim_ids": ["cl_1"], + "evidence_ids": ["ev_1", "ev_3"], + }), + ); + state.questions.insert( + "q_done".to_string(), + serde_json::json!({ + "id": "q_done", + "question_text": "Ignore", + "status": "resolved", + }), + ); + state.claims.insert( + "cl_1".to_string(), + serde_json::json!({ + "claim_text": "Claim supported", + "status": "supported", + "support_evidence_ids": ["ev_1"], + "confidence": 0.91, + }), + ); + state.claims.insert( + "cl_2".to_string(), + serde_json::json!({ + "claim_text": "Claim contested", + "status": "contested", + "support_evidence_ids": ["ev_2"], + "contradiction_evidence_ids": ["ev_3"], + "confidence_score": 0.4, + }), + ); + state.claims.insert( + "cl_3".to_string(), + serde_json::json!({ + "claim_text": "Claim unresolved", + "status": "unresolved", + "evidence_ids": ["ev_4"], + }), + ); + state.evidence.insert( + "ev_1".to_string(), + serde_json::json!({"evidence_type": "doc", "provenance_ids": ["pv_1"], "source_uri": "s1"}), + ); + state.evidence.insert( + "ev_2".to_string(), + serde_json::json!({"evidence_type": "doc", "provenance_ids": ["pv_2"], "source_uri": "s2"}), + ); + state.evidence.insert( + "ev_3".to_string(), + serde_json::json!({"evidence_type": "doc", "provenance_ids": ["pv_3"], "source_uri": "s3"}), + ); + state.evidence.insert( + "ev_4".to_string(), + serde_json::json!({"evidence_type": "doc", "provenance_ids": ["pv_4"], "source_uri": "s4"}), + ); + + let packet = build_question_reasoning_packet(&state, 8, 
6); + + assert_eq!( + packet["reasoning_mode"], + Value::String("question_centric".to_string()) + ); + assert_eq!( + packet["focus_question_ids"], + serde_json::json!(["q_1", "q_2"]) + ); + assert_eq!( + packet["findings"]["supported"][0]["id"], + Value::String("cl_1".to_string()) + ); + assert_eq!( + packet["findings"]["contested"][0]["id"], + Value::String("cl_2".to_string()) + ); + assert_eq!( + packet["findings"]["unresolved"][0]["id"], + Value::String("cl_3".to_string()) + ); + assert_eq!( + packet["contradictions"][0]["claim_id"], + Value::String("cl_2".to_string()) + ); + assert!(packet["evidence_index"].get("ev_3").is_some()); + assert_eq!( + packet["candidate_actions"][0]["id"], + Value::String("ca_q_q_1".to_string()) + ); + assert_eq!( + packet["candidate_actions"][0]["required_sources"], + serde_json::json!(["s1", "s3"]) + ); + assert_eq!( + packet["candidate_actions"][1]["id"], + Value::String("ca_q_q_2".to_string()) + ); + assert_eq!( + packet["candidate_actions"][2]["reason_codes"], + serde_json::json!(["claim_low_confidence"]) + ); + assert_eq!( + packet["candidate_actions"][2]["evidence_gap_refs"][0]["kind"], + Value::String("low_confidence".to_string()) + ); + assert_eq!( + packet["candidate_actions"][3]["id"], + Value::String("ca_c_cl_3".to_string()) + ); + assert_eq!( + packet["candidate_actions"][3]["evidence_gap_refs"][0]["kind"], + Value::String("missing_counter_evidence".to_string()) + ); + assert!(has_reasoning_content(&packet)); + } + + #[test] + fn candidate_actions_keep_entity_inputs_entity_only_and_collect_question_sources() { + let mut state = InvestigationState::new("sid"); + state.questions.insert( + "q_1".to_string(), + serde_json::json!({ + "id": "q_1", + "question_text": "What source confirms the claim?", + "status": "open", + "priority": "high", + "claim_ids": ["cl_1"], + "resolution_claim_id": "cl_resolution", + "provenance_ids": ["pv_q_1"], + }), + ); + state.claims.insert( + "cl_1".to_string(), + serde_json::json!({ + "id": 
"cl_1", + "claim_text": "Needs evidence", + "status": "proposed", + "evidence_ids": [], + "confidence": 0.2, + }), + ); + state.provenance_nodes.insert( + "pv_q_1".to_string(), + serde_json::json!({ + "id": "pv_q_1", + "source_uri": "https://question-source.test", + }), + ); + + let packet = build_question_reasoning_packet(&state, 8, 6); + let action = packet["candidate_actions"] + .as_array() + .and_then(|items| { + items + .iter() + .find(|item| item.get("id") == Some(&Value::String("ca_q_q_1".to_string()))) + }) + .expect("question action"); + + assert_eq!( + action["required_inputs"]["entity_ids"], + serde_json::json!([]) + ); + assert_eq!( + action["required_sources"], + serde_json::json!(["https://question-source.test"]) + ); + assert!( + !action["ontology_object_refs"] + .as_array() + .is_some_and(|refs| refs + .iter() + .any(|item| item.get("object_type") + == Some(&Value::String("entity".to_string())))) + ); + } + + #[test] + fn question_normalization_falls_back_to_origin_claims_and_evidence() { + let mut state = InvestigationState::new("sid"); + state.questions.insert( + "q_origin".to_string(), + serde_json::json!({ + "id": "q_origin", + "question_text": "Origin-backed question", + "status": "open", + "priority": "high", + "origin": { + "claim_ids": ["cl_origin"], + "evidence_ids": ["ev_origin_1", "ev_origin_2"], + }, + }), + ); + state.claims.insert( + "cl_origin".to_string(), + serde_json::json!({ + "id": "cl_origin", + "claim_text": "Origin-backed claim", + "status": "supported", + "support_evidence_ids": ["ev_claim"], + "confidence": 0.8, + }), + ); + state.evidence.insert( + "ev_origin_1".to_string(), + serde_json::json!({ + "evidence_type": "doc", + "source_uri": "https://origin-question-1.test", + }), + ); + state.evidence.insert( + "ev_origin_2".to_string(), + serde_json::json!({ + "evidence_type": "doc", + "source_uri": "https://origin-question-2.test", + }), + ); + state.evidence.insert( + "ev_claim".to_string(), + serde_json::json!({ + 
"evidence_type": "doc", + "source_uri": "https://origin-claim.test", + }), + ); + + let packet = build_question_reasoning_packet(&state, 8, 1); + let normalized_question = packet["unresolved_questions"] + .as_array() + .and_then(|items| items.first()) + .expect("normalized question"); + let question_action = packet["candidate_actions"] + .as_array() + .and_then(|items| { + items.iter().find(|item| { + item.get("id") == Some(&Value::String("ca_q_q_origin".to_string())) + }) + }) + .expect("question action"); + + assert_eq!( + packet["focus_question_ids"], + serde_json::json!(["q_origin"]) + ); + assert_eq!( + normalized_question["claim_ids"], + serde_json::json!(["cl_origin"]) + ); + assert_eq!( + normalized_question["evidence_ids"], + serde_json::json!(["ev_origin_1"]) + ); + assert_eq!( + question_action["target_claim_ids"], + serde_json::json!(["cl_origin"]) + ); + assert_eq!( + question_action["required_inputs"]["evidence_ids"], + serde_json::json!(["ev_origin_1", "ev_claim"]) + ); + assert!(packet["evidence_index"].get("ev_origin_1").is_some()); + assert!(packet["evidence_index"].get("ev_claim").is_some()); + assert!(packet["evidence_index"].get("ev_origin_2").is_none()); + } + + #[test] + fn question_normalization_uses_origin_trigger_aliases() { + let mut state = InvestigationState::new("sid"); + state.questions.insert( + "q_origin_trigger".to_string(), + serde_json::json!({ + "id": "q_origin_trigger", + "question_text": "Origin trigger question", + "status": "open", + "priority": "high", + "origin": { + "trigger": ["trigger_a"], + }, + }), + ); + state.questions.insert( + "q_origin_triggers".to_string(), + serde_json::json!({ + "id": "q_origin_triggers", + "question_text": "Origin triggers question", + "status": "open", + "priority": "medium", + "origin": { + "triggers": ["trigger_b", "trigger_c"], + }, + }), + ); + + let packet = build_question_reasoning_packet(&state, 8, 6); + let questions = packet["unresolved_questions"] + .as_array() + 
.expect("unresolved questions"); + let trigger_question = questions + .iter() + .find(|item| item.get("id") == Some(&Value::String("q_origin_trigger".to_string()))) + .expect("origin trigger question"); + let triggers_question = questions + .iter() + .find(|item| item.get("id") == Some(&Value::String("q_origin_triggers".to_string()))) + .expect("origin triggers question"); + + assert_eq!( + trigger_question["triggers"], + serde_json::json!(["trigger_a"]) + ); + assert_eq!( + triggers_question["triggers"], + serde_json::json!(["trigger_b", "trigger_c"]) + ); + } + + #[test] + fn question_normalization_prefers_top_level_values_over_origin() { + let mut state = InvestigationState::new("sid"); + state.questions.insert( + "q_override".to_string(), + serde_json::json!({ + "id": "q_override", + "question_text": "Override question", + "status": "open", + "priority": "high", + "claim_ids": ["cl_top"], + "evidence_ids": ["ev_top"], + "trigger": ["trigger_top"], + "origin": { + "claim_ids": ["cl_origin"], + "evidence_ids": ["ev_origin"], + "trigger": ["trigger_origin"], + "triggers": ["trigger_origin_fallback"], + }, + }), + ); + state.claims.insert( + "cl_top".to_string(), + serde_json::json!({ + "id": "cl_top", + "claim_text": "Top-level claim", + "status": "supported", + "support_evidence_ids": ["ev_claim_top"], + "confidence": 0.9, + }), + ); + state.claims.insert( + "cl_origin".to_string(), + serde_json::json!({ + "id": "cl_origin", + "claim_text": "Origin claim", + "status": "supported", + "support_evidence_ids": ["ev_claim_origin"], + "confidence": 0.9, + }), + ); + state.evidence.insert( + "ev_top".to_string(), + serde_json::json!({ + "evidence_type": "doc", + "source_uri": "https://top-level-question.test", + }), + ); + state.evidence.insert( + "ev_claim_top".to_string(), + serde_json::json!({ + "evidence_type": "doc", + "source_uri": "https://top-level-claim.test", + }), + ); + state.evidence.insert( + "ev_claim_origin".to_string(), + serde_json::json!({ + 
"evidence_type": "doc", + "source_uri": "https://origin-claim.test", + }), + ); + + let packet = build_question_reasoning_packet(&state, 8, 6); + let normalized_question = packet["unresolved_questions"] + .as_array() + .and_then(|items| items.first()) + .expect("normalized question"); + let question_action = packet["candidate_actions"] + .as_array() + .and_then(|items| { + items.iter().find(|item| { + item.get("id") == Some(&Value::String("ca_q_q_override".to_string())) + }) + }) + .expect("question action"); + + assert_eq!( + normalized_question["claim_ids"], + serde_json::json!(["cl_top"]) + ); + assert_eq!( + normalized_question["evidence_ids"], + serde_json::json!(["ev_top"]) + ); + assert_eq!( + normalized_question["triggers"], + serde_json::json!(["trigger_top"]) + ); + assert_eq!( + question_action["target_claim_ids"], + serde_json::json!(["cl_top"]) + ); + assert_eq!( + question_action["required_inputs"]["evidence_ids"], + serde_json::json!(["ev_top", "ev_claim_top"]) + ); + assert_eq!( + question_action["required_sources"], + serde_json::json!([ + "https://top-level-claim.test", + "https://top-level-question.test" + ]) + ); + } + + #[test] + fn reasoning_packet_uses_canonical_claim_evidence_aliases_in_findings_and_sources() { + let mut state = InvestigationState::new("sid"); + state.questions.insert( + "q_alias".to_string(), + serde_json::json!({ + "id": "q_alias", + "question_text": "What evidence supports the alias-backed claim?", + "status": "open", + "priority": "high", + "claim_ids": ["cl_alias"], + "evidence_ids": [], + }), + ); + state.claims.insert( + "cl_alias".to_string(), + serde_json::json!({ + "id": "cl_alias", + "claim_text": "Alias-backed claim", + "status": "contested", + "evidence_support_ids": ["ev_support_alias"], + "evidence_contra_ids": ["ev_contra_alias"], + "confidence": 0.4, + }), + ); + state.evidence.insert( + "ev_support_alias".to_string(), + serde_json::json!({ + "evidence_type": "doc", + "source_uri": "https://support.test", + 
}), + ); + state.evidence.insert( + "ev_contra_alias".to_string(), + serde_json::json!({ + "evidence_type": "doc", + "source_uri": "https://contra.test", + }), + ); + + let packet = build_question_reasoning_packet(&state, 8, 6); + + assert_eq!( + packet["findings"]["contested"][0]["support_evidence_ids"], + serde_json::json!(["ev_support_alias"]) + ); + assert_eq!( + packet["findings"]["contested"][0]["contradiction_evidence_ids"], + serde_json::json!(["ev_contra_alias"]) + ); + assert_eq!( + packet["contradictions"][0]["support_evidence_ids"], + serde_json::json!(["ev_support_alias"]) + ); + assert_eq!( + packet["contradictions"][0]["contradiction_evidence_ids"], + serde_json::json!(["ev_contra_alias"]) + ); + assert!(packet["evidence_index"].get("ev_support_alias").is_some()); + assert!(packet["evidence_index"].get("ev_contra_alias").is_some()); + + let actions = packet["candidate_actions"] + .as_array() + .expect("candidate actions"); + let question_action = actions + .iter() + .find(|action| action.get("id") == Some(&Value::String("ca_q_q_alias".to_string()))) + .expect("question action"); + let claim_action = actions + .iter() + .find(|action| action.get("id") == Some(&Value::String("ca_c_cl_alias".to_string()))) + .expect("claim action"); + + assert_eq!( + question_action["required_sources"], + serde_json::json!(["https://contra.test", "https://support.test"]) + ); + assert_eq!( + claim_action["required_sources"], + serde_json::json!(["https://contra.test", "https://support.test"]) + ); + } + + #[test] + fn candidate_actions_skip_resolved_and_closed_claims() { + let mut state = InvestigationState::new("sid"); + state.claims.insert( + "cl_resolved".to_string(), + serde_json::json!({ + "id": "cl_resolved", + "claim_text": "Resolved claim", + "status": "resolved", + "confidence": 0.2, + "support_evidence_ids": ["ev_resolved"], + }), + ); + state.claims.insert( + "cl_closed".to_string(), + serde_json::json!({ + "id": "cl_closed", + "claim_text": "Closed claim", + 
"status": "closed", + "support_evidence_ids": ["ev_closed"], + }), + ); + state.claims.insert( + "cl_retracted".to_string(), + serde_json::json!({ + "id": "cl_retracted", + "claim_text": "Retracted claim", + "status": "retracted", + "confidence": 0.1, + }), + ); + state.claims.insert( + "cl_control".to_string(), + serde_json::json!({ + "id": "cl_control", + "claim_text": "Low-confidence supported claim", + "status": "supported", + "confidence": 0.2, + "support_evidence_ids": ["ev_control"], + }), + ); + + let packet = build_question_reasoning_packet(&state, 8, 6); + let ids = packet["candidate_actions"] + .as_array() + .expect("candidate actions") + .iter() + .filter_map(|action| action.get("id").and_then(Value::as_str)) + .map(ToString::to_string) + .collect::>(); + + assert!(ids.contains(&"ca_c_cl_control".to_string())); + assert!(!ids.contains(&"ca_c_cl_resolved".to_string())); + assert!(!ids.contains(&"ca_c_cl_closed".to_string())); + assert!(!ids.contains(&"ca_c_cl_retracted".to_string())); + } + + #[test] + fn proposed_claims_emit_missing_counter_evidence_gap() { + let mut state = InvestigationState::new("sid"); + state.claims.insert( + "cl_proposed".to_string(), + serde_json::json!({ + "id": "cl_proposed", + "claim_text": "Proposed claim", + "status": "proposed", + "support_evidence_ids": ["ev_proposed"], + "confidence": 0.2, + }), + ); + state.evidence.insert( + "ev_proposed".to_string(), + serde_json::json!({ + "evidence_type": "doc", + "source_uri": "https://proposed.test", + }), + ); + + let packet = build_question_reasoning_packet(&state, 8, 6); + let claim_action = packet["candidate_actions"] + .as_array() + .and_then(|items| { + items.iter().find(|item| { + item.get("id") == Some(&Value::String("ca_c_cl_proposed".to_string())) + }) + }) + .expect("proposed claim action"); + + assert!( + claim_action["evidence_gap_refs"] + .as_array() + .is_some_and(|refs| refs.iter().any(|gap| { + gap.get("kind") == 
Some(&Value::String("missing_counter_evidence".to_string())) + })) + ); + } + + #[test] + fn has_reasoning_content_returns_false_for_empty_packet() { + let packet = serde_json::json!({ + "focus_question_ids": [], + "findings": { + "supported": [], + "contested": [], + "unresolved": [], + }, + "contradictions": [], + }); + assert!(!has_reasoning_content(&packet)); + } + + #[test] + fn has_reasoning_content_returns_true_for_candidate_actions_only() { + let packet = serde_json::json!({ + "focus_question_ids": [], + "findings": { + "supported": [], + "contested": [], + "unresolved": [], + }, + "contradictions": [], + "candidate_actions": [ + {"id": "ca_c_cl_9", "action_type": "verify_claim", "status": "proposed"} + ], + }); + assert!(has_reasoning_content(&packet)); + } +} diff --git a/openplanter-desktop/crates/op-core/src/engine/judge.rs b/openplanter-desktop/crates/op-core/src/engine/judge.rs index 355000cd..0e4be82e 100644 --- a/openplanter-desktop/crates/op-core/src/engine/judge.rs +++ b/openplanter-desktop/crates/op-core/src/engine/judge.rs @@ -86,13 +86,11 @@ impl Default for AcceptanceCriteriaJudge { /// Extract significant terms from criteria text (words >= 4 chars, excluding stop words). 
fn extract_terms(text: &str) -> Vec<&str> { const STOP_WORDS: &[&str] = &[ - "the", "and", "for", "are", "but", "not", "you", "all", - "can", "has", "her", "was", "one", "our", "out", "with", - "that", "this", "have", "from", "they", "been", "said", - "each", "which", "their", "will", "other", "about", "many", - "then", "them", "these", "some", "would", "make", "like", - "into", "could", "time", "very", "when", "what", "your", - "there", "should", "must", "also", + "the", "and", "for", "are", "but", "not", "you", "all", "can", "has", "her", "was", "one", + "our", "out", "with", "that", "this", "have", "from", "they", "been", "said", "each", + "which", "their", "will", "other", "about", "many", "then", "them", "these", "some", + "would", "make", "like", "into", "could", "time", "very", "when", "what", "your", "there", + "should", "must", "also", ]; text.split_whitespace() diff --git a/openplanter-desktop/crates/op-core/src/engine/mod.rs b/openplanter-desktop/crates/op-core/src/engine/mod.rs index cdf2847e..ea3b1517 100644 --- a/openplanter-desktop/crates/op-core/src/engine/mod.rs +++ b/openplanter-desktop/crates/op-core/src/engine/mod.rs @@ -5,93 +5,116 @@ pub mod context; pub mod curator; +pub mod investigation_state; pub mod judge; -use tokio::sync::mpsc; -use tokio::task::JoinHandle; +use std::collections::HashSet; +use std::time::Duration; + +use anyhow::anyhow; +use chrono::Utc; +use serde_json::{Map, Value}; use tokio_util::sync::CancellationToken; use crate::builder::build_model; use crate::config::AgentConfig; -use crate::events::{DeltaEvent, DeltaKind, StepEvent, TokenUsage}; -use crate::model::Message; +use crate::events::{ + CompletionMeta, DeltaEvent, DeltaKind, LoopMetrics, LoopPhase, StepEvent, TokenUsage, +}; +use crate::model::{BaseModel, Message, ModelTurn, RateLimitError}; use crate::prompts::build_system_prompt; -use crate::tools::defs::build_tool_defs; use crate::tools::WorkspaceTools; +use crate::tools::defs::build_tool_defs; -use 
self::curator::{extract_step_context, run_curator, CuratorResult}; +use self::curator::{ + CuratorCheckpoint, CuratorStateDelta, build_state_delta, run_curator_checkpoint, +}; -/// Outcome from a background curator task (success or error). -enum CuratorOutcome { - Done(CuratorResult), - Error(String), +#[derive(Debug, Clone, Default)] +pub struct SolveInitialContext { + pub session_id: Option, + pub session_dir: Option, + pub question_reasoning_packet: Option, } -/// Abort all in-flight curator tasks. -fn abort_curators(handles: &mut Vec>) { - for h in handles.drain(..) { - h.abort(); - } +fn take_curator_phase_checkpoint( + pending_deltas: &mut Vec, + active_phase: &mut Option, + next_phase: LoopPhase, +) -> Option { + let checkpoint = match active_phase.as_ref() { + Some(previous_phase) if previous_phase != &next_phase && !pending_deltas.is_empty() => { + Some(CuratorCheckpoint { + boundary: format!("phase_transition:{previous_phase:?}->{next_phase:?}"), + deltas: std::mem::take(pending_deltas), + }) + } + _ => None, + }; + + *active_phase = Some(next_phase); + checkpoint } -/// Drain completed curator results from the channel, inject system messages -/// and emit events for any that changed files. 
-fn drain_curator_results( - rx: &mut mpsc::UnboundedReceiver, - messages: &mut Vec, - emitter: &dyn SolveEmitter, -) { - while let Ok(outcome) = rx.try_recv() { - match outcome { - CuratorOutcome::Done(result) => { - if result.files_changed > 0 { - emitter.emit_trace(&format!( - "[curator] wiki updated: {} ({} files)", - result.summary, result.files_changed - )); - messages.push(Message::System { - content: format!("[Wiki Curator] {}", result.summary), - }); - emitter.emit_curator_update(&result.summary, result.files_changed); - } - } - CuratorOutcome::Error(e) => { - emitter.emit_trace(&format!("[curator] error: {e}")); - } - } +fn take_pending_curator_checkpoint( + pending_deltas: &mut Vec, + boundary: &str, +) -> Option { + if pending_deltas.is_empty() { + return None; } + + Some(CuratorCheckpoint { + boundary: boundary.to_string(), + deltas: std::mem::take(pending_deltas), + }) } -/// Wait for in-flight curators (up to timeout), drain final results, abort rest. -async fn finish_curators( - handles: &mut Vec>, - rx: &mut mpsc::UnboundedReceiver, - messages: &mut Vec, +async fn emit_curator_checkpoint( + checkpoint: CuratorCheckpoint, + config: &AgentConfig, + cancel: &CancellationToken, emitter: &dyn SolveEmitter, ) { - if handles.is_empty() { - return; - } emitter.emit_trace(&format!( - "[curator] waiting for {} in-flight curator(s)...", - handles.len() + "[curator] synthesizing checkpoint at {} ({} deltas)", + checkpoint.boundary, + checkpoint.deltas.len() )); - // Wait up to 30 seconds total for all curators to finish - let deadline = tokio::time::Instant::now() + std::time::Duration::from_secs(30); - for h in handles.iter_mut() { - let remaining = deadline - tokio::time::Instant::now(); - if remaining.is_zero() { - break; + match run_curator_checkpoint(&checkpoint, config, cancel.clone()).await { + Ok(result) if result.files_changed > 0 => { + emitter.emit_trace(&format!( + "[curator] wiki updated: {} ({} files)", + result.summary, result.files_changed + 
)); + emitter.emit_curator_update(&result.summary, result.files_changed); + } + Ok(_) => { + emitter.emit_trace(&format!( + "[curator] no net wiki updates at {}", + checkpoint.boundary + )); + } + Err(err) => { + emitter.emit_trace(&format!( + "[curator] checkpoint {} error: {err}", + checkpoint.boundary + )); } - let _ = tokio::time::timeout(remaining, h).await; } +} - // Final drain - drain_curator_results(rx, messages, emitter); - - // Abort any still running - abort_curators(handles); +async fn flush_pending_curator_checkpoint( + pending_deltas: &mut Vec, + boundary: &str, + config: &AgentConfig, + cancel: &CancellationToken, + emitter: &dyn SolveEmitter, +) { + if let Some(checkpoint) = take_pending_curator_checkpoint(pending_deltas, boundary) { + emit_curator_checkpoint(checkpoint, config, cancel, emitter).await; + } } // Abstraction for emitting solve events. @@ -102,9 +125,23 @@ pub trait SolveEmitter: Send + Sync { fn emit_trace(&self, message: &str); fn emit_delta(&self, event: DeltaEvent); fn emit_step(&self, event: StepEvent); - fn emit_complete(&self, result: &str); + fn emit_complete( + &self, + result: &str, + loop_metrics: Option, + completion: Option, + ); fn emit_error(&self, message: &str); - /// Called when a background curator finishes updating wiki files. + fn emit_loop_health( + &self, + _depth: u32, + _step: u32, + _phase: LoopPhase, + _metrics: LoopMetrics, + _is_final: bool, + ) { + } + /// Called when a checkpointed curator finishes updating wiki files. /// Default no-op — override in TauriEmitter/LoggingEmitter. fn emit_curator_update(&self, _summary: &str, _files_changed: u32) {} } @@ -114,11 +151,7 @@ pub trait SolveEmitter: Send + Sync { // This is a placeholder until the full engine is implemented in Phase 4. // It emits the standard event sequence so the frontend can be developed // and tested against a working backend. 
-pub async fn demo_solve( - objective: &str, - emitter: &dyn SolveEmitter, - cancel: CancellationToken, -) { +pub async fn demo_solve(objective: &str, emitter: &dyn SolveEmitter, cancel: CancellationToken) { emitter.emit_trace(&format!("Solving: {objective}")); if cancel.is_cancelled() { @@ -154,6 +187,26 @@ pub async fn demo_solve( tokio::time::sleep(std::time::Duration::from_millis(50)).await; } + let loop_metrics = LoopMetrics { + steps: 1, + model_turns: 1, + tool_calls: 0, + investigate_steps: 0, + build_steps: 0, + iterate_steps: 0, + finalize_steps: 1, + recon_streak: 0, + max_recon_streak: 0, + guardrail_warnings: 0, + final_rejections: 0, + extensions_granted: 0, + extension_eligible_checks: 0, + extension_denials_no_progress: 0, + extension_denials_cap: 0, + termination_reason: "success".into(), + }; + emitter.emit_loop_health(0, 1, LoopPhase::Finalize, loop_metrics.clone(), true); + // Emit step summary emitter.emit_step(StepEvent { depth: 0, @@ -165,9 +218,11 @@ pub async fn demo_solve( }, elapsed_ms: 350, is_final: true, + loop_phase: Some(LoopPhase::Finalize), + loop_metrics: Some(loop_metrics.clone()), }); - emitter.emit_complete(&response); + emitter.emit_complete(&response, Some(loop_metrics), None); } /// Rough token estimate: ~4 chars per token. @@ -176,11 +231,18 @@ fn estimate_tokens(messages: &[Message]) -> usize { .iter() .map(|m| match m { Message::System { content } | Message::User { content } => content.len(), - Message::Assistant { content, tool_calls } => { + Message::Assistant { + content, + tool_calls, + } => { content.len() + tool_calls .as_ref() - .map(|tcs| tcs.iter().map(|tc| tc.arguments.len() + tc.name.len()).sum()) + .map(|tcs| { + tcs.iter() + .map(|tc| tc.arguments.len() + tc.name.len()) + .sum() + }) .unwrap_or(0) } Message::Tool { content, .. 
} => content.len(), @@ -189,6 +251,61 @@ fn estimate_tokens(messages: &[Message]) -> usize { / 4 } +fn safe_prefix(text: &str, max_chars: usize) -> &str { + let end = text.floor_char_boundary(text.len().min(max_chars)); + &text[..end] +} + +fn build_initial_user_message( + objective: &str, + config: &AgentConfig, + initial_context: Option<&SolveInitialContext>, +) -> Result { + let Some(initial_context) = initial_context else { + return Ok(objective.to_string()); + }; + + let mut payload = Map::new(); + payload.insert( + "timestamp".to_string(), + Value::String(Utc::now().to_rfc3339()), + ); + payload.insert( + "objective".to_string(), + Value::String(objective.to_string()), + ); + payload.insert( + "max_steps_per_call".to_string(), + Value::from(config.max_steps_per_call), + ); + payload.insert( + "workspace".to_string(), + Value::String(config.workspace.display().to_string()), + ); + if let Some(session_id) = initial_context + .session_id + .as_ref() + .filter(|value| !value.trim().is_empty()) + { + payload.insert("session_id".to_string(), Value::String(session_id.clone())); + } + if let Some(session_dir) = initial_context + .session_dir + .as_ref() + .filter(|value| !value.trim().is_empty()) + { + payload.insert( + "session_dir".to_string(), + Value::String(session_dir.clone()), + ); + } + if let Some(packet) = initial_context.question_reasoning_packet.clone() { + payload.insert("question_reasoning_packet".to_string(), packet); + } + + serde_json::to_string(&payload) +} + /// Compact conversation context when it grows too large. /// /// Keeps the system prompt, user objective, and the most recent messages @@ -206,13 +323,454 @@ fn compact_messages(messages: &mut Vec, max_tokens: usize) { for i in 2..protected_tail { if let Message::Tool { content, .. 
} = &mut messages[i] { if content.len() > 200 { - let preview = &content[..content.len().min(150)]; + let preview = safe_prefix(content, 150); *content = format!("{preview}\n...[truncated — older tool result]"); } } } } +fn compute_rate_limit_delay_sec( + config: &AgentConfig, + retry_count: usize, + err: &RateLimitError, +) -> f64 { + let retry_after_cap = config.rate_limit_retry_after_cap_sec.max(0.0); + let backoff_max = config.rate_limit_backoff_max_sec.max(0.0); + let delay = err + .retry_after_sec + .map(|value| value.max(0.0).min(retry_after_cap)) + .unwrap_or_else(|| { + let base = config.rate_limit_backoff_base_sec.max(0.0); + base * 2_f64.powi((retry_count.saturating_sub(1)) as i32) + }); + delay.min(backoff_max) +} + +async fn chat_stream_with_rate_limit_retries( + model: &dyn BaseModel, + messages: &[Message], + tool_defs: &[serde_json::Value], + on_delta: &(dyn Fn(DeltaEvent) + Send + Sync), + cancel: &CancellationToken, + config: &AgentConfig, + emitter: &dyn SolveEmitter, + step: usize, +) -> anyhow::Result { + let max_retries = config.rate_limit_max_retries.max(0) as usize; + let mut retries = 0usize; + + loop { + if cancel.is_cancelled() { + return Err(anyhow!("Cancelled")); + } + + match model + .chat_stream(messages, tool_defs, on_delta, cancel) + .await + { + Ok(turn) => return Ok(turn), + Err(err) => { + if let Some(rate_limit) = err.downcast_ref::() { + if retries >= max_retries { + return Err(err); + } + retries += 1; + let delay_sec = compute_rate_limit_delay_sec(config, retries, rate_limit); + let provider_code = rate_limit + .provider_code + .as_deref() + .map(|code| format!(" ({code})")) + .unwrap_or_default(); + emitter.emit_trace(&format!( + "[d0/s{step}] rate limited{provider_code}. Sleeping {delay_sec:.1}s before retry {retries}/{max_retries}..." + )); + if delay_sec > 0.0 { + tokio::select! 
{ + _ = cancel.cancelled() => return Err(anyhow!("Cancelled")), + _ = tokio::time::sleep(Duration::from_secs_f64(delay_sec)) => {} + } + } + continue; + } + return Err(err); + } + } + } +} + +fn objective_allows_meta_final(objective: &str) -> bool { + objective + .split(|c: char| !c.is_ascii_alphanumeric()) + .filter(|token| !token.is_empty()) + .any(|token| { + matches!( + token.to_ascii_lowercase().as_str(), + "plan" + | "planning" + | "approach" + | "strategy" + | "outline" + | "spec" + | "specification" + | "design" + | "roadmap" + | "proposal" + | "review" + | "audit" + | "analysis" + | "analyze" + | "brainstorm" + ) + }) +} + +fn is_meta_final_text(text: &str, objective: &str) -> bool { + let stripped = text.trim(); + if stripped.is_empty() { + return true; + } + let lower = stripped.to_ascii_lowercase(); + let weak_structural_meta = [ + "here is my plan", + "here's my plan", + "here is the plan", + "here's the plan", + "here is my approach", + "here's my approach", + "here is the approach", + "here's the approach", + "here is my analysis", + "here's my analysis", + "here is the analysis", + "here's the analysis", + ]; + let padded = format!(" {lower} "); + let strong_process_meta = [ + " i will ", + " i can ", + " i should ", + " i need to ", + " i want to ", + " i am going to ", + " plan to ", + " let me ", + " next, i will ", + " next i will ", + " i should start by ", + ]; + if strong_process_meta + .iter() + .any(|needle| padded.contains(needle)) + { + return true; + } + if weak_structural_meta.iter().any(|p| lower.starts_with(p)) { + return !objective_allows_meta_final(objective); + } + false +} + +fn is_recon_tool(name: &str) -> bool { + matches!( + name, + "list_files" + | "search_files" + | "repo_map" + | "web_search" + | "fetch_url" + | "read_file" + | "read_image" + | "list_artifacts" + | "read_artifact" + ) +} + +fn is_artifact_tool(name: &str) -> bool { + matches!( + name, + "write_file" | "apply_patch" | "edit_file" | "hashline_edit" + ) +} + 
+fn classify_loop_phase(tool_calls: &[crate::model::ToolCall], is_final: bool) -> LoopPhase { + if is_final { + return LoopPhase::Finalize; + } + if tool_calls.is_empty() { + return LoopPhase::Iterate; + } + let has_recon = tool_calls.iter().any(|tc| is_recon_tool(&tc.name)); + let has_artifact = tool_calls.iter().any(|tc| is_artifact_tool(&tc.name)); + if has_artifact { + LoopPhase::Build + } else if has_recon && tool_calls.iter().all(|tc| is_recon_tool(&tc.name)) { + LoopPhase::Investigate + } else { + LoopPhase::Iterate + } +} + +fn increment_phase(metrics: &mut LoopMetrics, phase: &LoopPhase) { + match phase { + LoopPhase::Investigate => metrics.investigate_steps += 1, + LoopPhase::Build => metrics.build_steps += 1, + LoopPhase::Iterate => metrics.iterate_steps += 1, + LoopPhase::Finalize => metrics.finalize_steps += 1, + } +} + +fn should_emit_recon_guardrail(recon_streak: u32, last_guardrail_streak: u32) -> bool { + recon_streak >= 3 && last_guardrail_streak == 0 +} + +const BUDGET_EXTENSION_WINDOW: usize = 12; +const MIN_MEANINGFUL_RESULT_CHARS: usize = 24; +const MIN_EXTENSION_PROGRESS_SIGNALS: usize = 2; + +#[derive(Debug, Clone)] +struct StepProgressRecord { + phase: LoopPhase, + step_signature: String, + tool_count: usize, + failed_tool_step: bool, + successful_action_signatures: HashSet, + state_delta_signatures: HashSet, + completed_previews: Vec, +} + +fn normalize_progress_fragment(text: &str, max_len: usize) -> String { + let mut normalized = text.split_whitespace().collect::>().join(" "); + normalized = normalized.to_lowercase(); + while normalized.starts_with('[') { + if let Some(idx) = normalized.find(']') { + normalized = normalized[idx + 1..].trim_start().to_string(); + } else { + break; + } + } + if normalized.len() > max_len { + normalized = safe_prefix(&normalized, max_len).to_string(); + } + normalized +} + +fn summarize_observation(text: &str, max_len: usize) -> String { + let first = text.lines().next().unwrap_or("").trim(); + if 
first.len() > max_len { + format!("{}...", safe_prefix(first, max_len.saturating_sub(3))) + } else { + first.to_string() + } +} + +fn is_non_progress_tool(name: &str) -> bool { + is_recon_tool(name) || name == "think" +} + +fn action_signature(name: &str, args: &str) -> String { + format!("{}|{}", name, normalize_progress_fragment(args, 160)) +} + +fn build_step_progress_record( + tool_calls: &[crate::model::ToolCall], + observations: &[(String, String, String, String, bool)], + phase: LoopPhase, +) -> StepProgressRecord { + let tool_names: Vec<&str> = tool_calls.iter().map(|tc| tc.name.as_str()).collect(); + let has_artifact = tool_names.iter().any(|name| is_artifact_tool(name)); + let has_error = observations.iter().any(|(_, _, _, _, is_error)| *is_error); + let mut record = StepProgressRecord { + phase, + step_signature: format!( + "{}|artifact={}|error={}", + tool_names.join(","), + if has_artifact { 1 } else { 0 }, + if has_error { 1 } else { 0 } + ), + tool_count: tool_calls.len(), + failed_tool_step: has_error, + successful_action_signatures: HashSet::new(), + state_delta_signatures: HashSet::new(), + completed_previews: Vec::new(), + }; + for (_, name, args, content, is_error) in observations { + if *is_error || is_non_progress_tool(name) { + continue; + } + let normalized = normalize_progress_fragment(content, 120); + if normalized.len() < MIN_MEANINGFUL_RESULT_CHARS { + continue; + } + record + .successful_action_signatures + .insert(action_signature(name, args)); + record + .state_delta_signatures + .insert(format!("{}|{}", name, normalized)); + let preview = summarize_observation(content, 120); + if !preview.is_empty() && !record.completed_previews.contains(&preview) { + record.completed_previews.push(preview); + } + } + record +} + +fn evaluate_budget_extension( + records: &[StepProgressRecord], + recon_streak: u32, +) -> (bool, Map) { + let start = records.len().saturating_sub(BUDGET_EXTENSION_WINDOW); + let window = &records[start..]; + + let 
tool_steps = window.iter().filter(|record| record.tool_count > 0).count(); + let failed_steps = window.iter().filter(|record| record.failed_tool_step).count(); + let failure_ratio = if tool_steps == 0 { + 0.0 + } else { + failed_steps as f64 / tool_steps as f64 + }; + + let mut repeated_signature_streak = 1usize; + let mut current_streak = 1usize; + let mut previous_signature: Option<&str> = None; + for record in window { + match previous_signature { + Some(previous) if previous == record.step_signature => { + current_streak += 1; + } + _ => { + current_streak = 1; + previous_signature = Some(record.step_signature.as_str()); + } + } + repeated_signature_streak = repeated_signature_streak.max(current_streak); + } + + let mut prior_action_signatures = HashSet::new(); + for record in &records[..start] { + prior_action_signatures.extend(record.successful_action_signatures.iter().cloned()); + } + + let mut recent_action_signatures = HashSet::new(); + let mut recent_state_delta_signatures = HashSet::new(); + let mut has_build_or_finalize = false; + for record in window { + recent_action_signatures.extend(record.successful_action_signatures.iter().cloned()); + recent_state_delta_signatures.extend(record.state_delta_signatures.iter().cloned()); + has_build_or_finalize |= matches!(record.phase, LoopPhase::Build | LoopPhase::Finalize); + } + + let novel_action_count = recent_action_signatures + .difference(&prior_action_signatures) + .count(); + let state_delta_count = recent_state_delta_signatures.len(); + let positive_signals = usize::from(novel_action_count >= 2) + + usize::from(state_delta_count >= 2) + + usize::from(has_build_or_finalize); + + let mut blockers = Vec::new(); + if repeated_signature_streak >= 3 { + blockers.push("repeated_signatures"); + } + if failure_ratio > 0.6 { + blockers.push("high_failure_ratio"); + } + if recon_streak >= 4 { + blockers.push("recon_streak"); + } + + let mut payload = Map::new(); + payload.insert("window_size".into(), 
Value::from(window.len() as u64)); + payload.insert( + "repeated_signature_streak".into(), + Value::from(repeated_signature_streak as u64), + ); + payload.insert("failure_ratio".into(), Value::from(failure_ratio)); + payload.insert("novel_action_count".into(), Value::from(novel_action_count as u64)); + payload.insert("state_delta_count".into(), Value::from(state_delta_count as u64)); + payload.insert("has_build_or_finalize".into(), Value::from(has_build_or_finalize)); + payload.insert("positive_signals".into(), Value::from(positive_signals as u64)); + payload.insert( + "blockers".into(), + Value::Array( + blockers + .iter() + .map(|blocker| Value::from((*blocker).to_string())) + .collect(), + ), + ); + + ( + blockers.is_empty() && positive_signals >= MIN_EXTENSION_PROGRESS_SIGNALS, + payload, + ) +} + +fn build_partial_completion_text( + objective: &str, + loop_metrics: &LoopMetrics, + records: &[StepProgressRecord], +) -> String { + let mut completed_previews = Vec::new(); + for record in records.iter().rev().take(BUDGET_EXTENSION_WINDOW) { + for preview in &record.completed_previews { + if !completed_previews.contains(preview) { + completed_previews.push(preview.clone()); + } + if completed_previews.len() >= 3 { + break; + } + } + if completed_previews.len() >= 3 { + break; + } + } + + let completed_block = if completed_previews.is_empty() { + "- The run gathered additional context but did not converge on a final artifact before the bounded limit.".to_string() + } else { + completed_previews + .iter() + .map(|item| format!("- {item}")) + .collect::>() + .join("\n") + }; + + let mut next_actions = Vec::new(); + if loop_metrics.termination_reason == "budget_no_progress" { + next_actions.push( + "Stop repeating the stalled loop and resume with a narrower next slice or a different tactic." 
+ .to_string(), + ); + } + if loop_metrics.termination_reason == "budget_cap" { + next_actions.push( + "Resume from the saved state and focus on finishing the deliverable instead of reopening the full search space." + .to_string(), + ); + } + next_actions.push(format!("Continue the objective with the strongest completed lead: {objective}")); + next_actions.push( + "Turn the completed work below into a concrete artifact or summary before doing more exploration." + .to_string(), + ); + + format!( + "Partial completion for objective: {objective}\nStopped after {} steps with {} budget extension(s). Termination reason: {}.\n\nCompleted work:\n{}\n\nRemaining work:\n- Finish the deliverable using the completed work below and avoid repeating the stalled loop.\n\nSuggested next actions:\n{}", + loop_metrics.steps, + loop_metrics.extensions_granted, + loop_metrics.termination_reason, + completed_block, + next_actions + .iter() + .take(4) + .map(|item| format!("- {item}")) + .collect::>() + .join("\n") + ) +} + /// Real solve flow with a multi-step agentic loop. /// /// Calls the model with tool definitions. If the model returns tool calls, @@ -225,6 +783,17 @@ pub async fn solve( config: &AgentConfig, emitter: &dyn SolveEmitter, cancel: CancellationToken, +) { + solve_with_initial_context(objective, config, emitter, cancel, None).await; +} + +/// Real solve flow with optional initial structured context. +pub async fn solve_with_initial_context( + objective: &str, + config: &AgentConfig, + emitter: &dyn SolveEmitter, + cancel: CancellationToken, + initial_context: Option, ) { if config.demo { return demo_solve(objective, emitter, cancel).await; @@ -240,63 +809,106 @@ pub async fn solve( }; let provider = model.provider_name().to_string(); - emitter.emit_trace(&format!( - "Solving with {}/{}", - provider, - model.model_name() - )); + emitter.emit_trace(&format!("Solving with {}/{}", provider, model.model_name())); // 2. 
Build tools and messages let tool_defs = build_tool_defs(&provider); let mut tools = WorkspaceTools::new(config); - let system_prompt = build_system_prompt( - config.recursive, - config.acceptance_criteria, - config.demo, - ); + let system_prompt = + build_system_prompt(config.recursive, config.acceptance_criteria, config.demo); + let initial_user_message = match build_initial_user_message( + objective, + config, + initial_context.as_ref(), + ) { + Ok(message) => message, + Err(err) => { + emitter.emit_trace(&format!( + "[solve] failed to serialize initial context; falling back to plain objective: {err}" + )); + objective.to_string() + } + }; let mut messages = vec![ Message::System { content: system_prompt, }, Message::User { - content: objective.to_string(), + content: initial_user_message, }, ]; - let max_steps = config.max_steps_per_call as usize; - - // 3. Background curator channel - let (curator_tx, mut curator_rx) = mpsc::unbounded_channel::(); - let mut curator_handles: Vec> = Vec::new(); + let mut loop_metrics = LoopMetrics::default(); + let mut last_guardrail_streak = 0u32; + let mut active_curator_phase: Option = None; + let mut pending_curator_deltas: Vec = Vec::new(); + let mut step_records: Vec = Vec::new(); + let mut active_step_budget = config.max_steps_per_call.max(1) as usize; + let max_total_steps = active_step_budget + + if config.budget_extension_enabled { + (config.budget_extension_block_steps.max(1) * config.budget_extension_max_blocks.max(0)) + as usize + } else { + 0 + }; // 4. 
Agentic loop - for step in 1..=max_steps { + for step in 1..=max_total_steps { if cancel.is_cancelled() { - emitter.emit_error("Cancelled"); tools.cleanup(); - abort_curators(&mut curator_handles); + loop_metrics.termination_reason = "cancelled".into(); + flush_pending_curator_checkpoint( + &mut pending_curator_deltas, + "cancelled", + config, + &cancel, + emitter, + ) + .await; + emitter.emit_error("Cancelled"); return; } - // Drain completed curator results and inject as system messages - drain_curator_results(&mut curator_rx, &mut messages, emitter); - let step_start = std::time::Instant::now(); // Compact context if it's grown too large (~100k token budget) compact_messages(&mut messages, 100_000); // Call model with streaming - let turn = match model - .chat_stream(&messages, &tool_defs, &|delta| emitter.emit_delta(delta), &cancel) - .await + let turn = match chat_stream_with_rate_limit_retries( + model.as_ref(), + &messages, + &tool_defs, + &|delta| emitter.emit_delta(delta), + &cancel, + config, + emitter, + step, + ) + .await { Ok(t) => t, Err(e) => { let msg = e.to_string(); tools.cleanup(); - abort_curators(&mut curator_handles); + loop_metrics.termination_reason = if msg == "Cancelled" { + "cancelled".into() + } else { + "model_error".into() + }; + flush_pending_curator_checkpoint( + &mut pending_curator_deltas, + if msg == "Cancelled" { + "cancelled" + } else { + "model_error" + }, + config, + &cancel, + emitter, + ) + .await; if msg == "Cancelled" { emitter.emit_error("Cancelled"); } else { @@ -306,6 +918,9 @@ pub async fn solve( } }; + loop_metrics.steps = step as u32; + loop_metrics.model_turns += 1; + // Append assistant message to conversation let tool_calls_opt = if turn.tool_calls.is_empty() { None @@ -317,8 +932,31 @@ pub async fn solve( tool_calls: tool_calls_opt, }); - // No tool calls → final answer + // No tool calls → final answer (unless rejected by governance) if turn.tool_calls.is_empty() { + if turn.text.trim().is_empty() { + 
emitter.emit_trace(&format!( + "[d0/s{step}] empty model response, requesting tool use or concrete final answer" + )); + messages.push(Message::User { + content: "No tool calls and no final answer were returned. Continue solving: use tools if needed or return the concrete final deliverable.".to_string(), + }); + continue; + } + if is_meta_final_text(&turn.text, objective) { + loop_metrics.final_rejections += 1; + emitter.emit_trace(&format!( + "[d0/s{step}] rejected meta final answer; requesting concrete deliverable" + )); + messages.push(Message::User { + content: "Your previous response was process/meta commentary rather than a concrete final answer. Continue solving: use tools if needed and return a direct final deliverable.".to_string(), + }); + continue; + } + let phase = LoopPhase::Finalize; + increment_phase(&mut loop_metrics, &phase); + loop_metrics.termination_reason = "success".into(); + emitter.emit_loop_health(0, step as u32, phase.clone(), loop_metrics.clone(), true); let tool_name = None; emitter.emit_step(StepEvent { depth: 0, @@ -330,35 +968,108 @@ pub async fn solve( }, elapsed_ms: step_start.elapsed().as_millis() as u64, is_final: true, + loop_phase: Some(phase), + loop_metrics: Some(loop_metrics.clone()), }); - emitter.emit_complete(&turn.text); + flush_pending_curator_checkpoint( + &mut pending_curator_deltas, + "finalize", + config, + &cancel, + emitter, + ) + .await; + emitter.emit_complete(&turn.text, Some(loop_metrics.clone()), None); tools.cleanup(); - // Wait for in-flight curators before exiting - finish_curators(&mut curator_handles, &mut curator_rx, &mut messages, emitter).await; return; } + loop_metrics.tool_calls += turn.tool_calls.len() as u32; + // Execute each tool call and collect results + let mut tool_observations: Vec<(String, String, String, String, bool)> = Vec::new(); for tc in &turn.tool_calls { if cancel.is_cancelled() { - emitter.emit_error("Cancelled"); tools.cleanup(); - abort_curators(&mut curator_handles); + 
flush_pending_curator_checkpoint( + &mut pending_curator_deltas, + "cancelled", + config, + &cancel, + emitter, + ) + .await; + emitter.emit_error("Cancelled"); return; } emitter.emit_trace(&format!("Executing tool: {} ({})", tc.name, tc.id)); let result = tools.execute(&tc.name, &tc.arguments).await; - - if result.is_error { - emitter.emit_trace(&format!("Tool {} error: {}", tc.name, &result.content[..result.content.len().min(200)])); + let result_content = result.content; + let result_is_error = result.is_error; + + if result_is_error { + emitter.emit_trace(&format!( + "Tool {} error: {}", + tc.name, + safe_prefix(&result_content, 200) + )); } messages.push(Message::Tool { tool_call_id: tc.id.clone(), - content: result.content, + content: result_content.clone(), + }); + tool_observations.push(( + tc.id.clone(), + tc.name.clone(), + tc.arguments.clone(), + result_content, + result_is_error, + )); + } + + let phase = classify_loop_phase(&turn.tool_calls, false); + if let Some(checkpoint) = take_curator_phase_checkpoint( + &mut pending_curator_deltas, + &mut active_curator_phase, + phase.clone(), + ) { + emit_curator_checkpoint(checkpoint, config, &cancel, emitter).await; + } + + if let Some(delta) = + build_state_delta(step as u32, phase.clone(), objective, &tool_observations) + { + pending_curator_deltas.push(delta); + } + if matches!(phase, LoopPhase::Investigate) { + loop_metrics.recon_streak += 1; + } else { + loop_metrics.recon_streak = 0; + last_guardrail_streak = 0; + } + loop_metrics.max_recon_streak = + loop_metrics.max_recon_streak.max(loop_metrics.recon_streak); + increment_phase(&mut loop_metrics, &phase); + if matches!(phase, LoopPhase::Investigate) + && should_emit_recon_guardrail(loop_metrics.recon_streak, last_guardrail_streak) + { + loop_metrics.guardrail_warnings += 1; + last_guardrail_streak = loop_metrics.recon_streak; + emitter.emit_trace(&format!( + "[d0/s{step}] soft guardrail: multiple consecutive recon steps without artifacts; nudging 
toward implementation" + )); + messages.push(Message::User { + content: "Soft guardrail: you've spent multiple consecutive steps in read/list/search mode without producing artifacts. Move to implementation now: edit files, run targeted validation, and return concrete outputs.".to_string(), }); } + step_records.push(build_step_progress_record( + &turn.tool_calls, + &tool_observations, + phase.clone(), + )); + emitter.emit_loop_health(0, step as u32, phase.clone(), loop_metrics.clone(), false); // Emit step (non-final) AFTER tools execute so the frontend // can refresh the wiki graph with newly written files. @@ -373,44 +1084,101 @@ pub async fn solve( }, elapsed_ms: step_start.elapsed().as_millis() as u64, is_final: false, + loop_phase: Some(phase), + loop_metrics: Some(loop_metrics.clone()), }); - // Spawn background curator after each non-final step - let context = extract_step_context(&messages); - if !context.is_empty() { - let tx = curator_tx.clone(); - let curator_cfg = config.clone(); - let curator_cancel = cancel.clone(); - emitter.emit_trace(&format!("[curator] spawning for step {step}")); - curator_handles.push(tokio::spawn(async move { - let outcome = match run_curator(&context, &curator_cfg, curator_cancel).await { - Ok(result) => CuratorOutcome::Done(result), - Err(e) => CuratorOutcome::Error(e), - }; - let _ = tx.send(outcome); - })); - } - // Budget warnings - let remaining = max_steps - step; - if remaining == max_steps / 2 { + let remaining = active_step_budget.saturating_sub(step); + if remaining == active_step_budget / 2 { emitter.emit_trace(&format!( - "Step budget: {remaining}/{max_steps} steps remaining (50%)" + "Step budget: {remaining}/{active_step_budget} steps remaining (50%)" )); - } else if remaining == max_steps / 4 { + } else if remaining == active_step_budget / 4 { emitter.emit_trace(&format!( - "Step budget: {remaining}/{max_steps} steps remaining (25%)" + "Step budget: {remaining}/{active_step_budget} steps remaining (25%)" )); } + + 
if step >= active_step_budget { + let (eligible, evaluation) = + evaluate_budget_extension(&step_records, loop_metrics.recon_streak); + loop_metrics.extension_eligible_checks += 1; + emitter.emit_trace(&format!( + "[d0/s{step}] budget boundary reached: eligible={} evaluation={}", + eligible, + Value::Object(evaluation.clone()) + )); + let can_extend = config.budget_extension_enabled + && loop_metrics.extensions_granted < config.budget_extension_max_blocks as u32 + && eligible; + if can_extend { + loop_metrics.extensions_granted += 1; + active_step_budget += config.budget_extension_block_steps.max(1) as usize; + messages.push(Message::User { + content: "Progress-based budget extension granted. You have a small number of extra steps. Finish the deliverable now and avoid repeating the same loop.".to_string(), + }); + continue; + } + + if loop_metrics.extensions_granted >= config.budget_extension_max_blocks as u32 { + loop_metrics.extension_denials_cap += 1; + loop_metrics.termination_reason = "budget_cap".into(); + } else { + loop_metrics.extension_denials_no_progress += 1; + loop_metrics.termination_reason = "budget_no_progress".into(); + } + + tools.cleanup(); + flush_pending_curator_checkpoint( + &mut pending_curator_deltas, + "budget_exhausted", + config, + &cancel, + emitter, + ) + .await; + emitter.emit_complete( + &build_partial_completion_text(objective, &loop_metrics, &step_records), + Some(loop_metrics.clone()), + Some(CompletionMeta { + kind: "partial".into(), + reason: loop_metrics.termination_reason.clone(), + steps_used: loop_metrics.steps, + max_steps: active_step_budget as u32, + extensions_granted: loop_metrics.extensions_granted, + extension_block_steps: config.budget_extension_block_steps.max(1) as u32, + extension_max_blocks: config.budget_extension_max_blocks.max(0) as u32, + }), + ); + return; + } } // Budget exhausted tools.cleanup(); - finish_curators(&mut curator_handles, &mut curator_rx, &mut messages, emitter).await; - 
emitter.emit_error(&format!( - "Step budget exhausted after {max_steps} steps. \ - The model did not produce a final answer within the allowed steps." - )); + loop_metrics.termination_reason = "budget_cap".into(); + flush_pending_curator_checkpoint( + &mut pending_curator_deltas, + "budget_exhausted", + config, + &cancel, + emitter, + ) + .await; + emitter.emit_complete( + &build_partial_completion_text(objective, &loop_metrics, &step_records), + Some(loop_metrics.clone()), + Some(CompletionMeta { + kind: "partial".into(), + reason: loop_metrics.termination_reason.clone(), + steps_used: loop_metrics.steps, + max_steps: active_step_budget as u32, + extensions_granted: loop_metrics.extensions_granted, + extension_block_steps: config.budget_extension_block_steps.max(1) as u32, + extension_max_blocks: config.budget_extension_max_blocks.max(0) as u32, + }), + ); } #[cfg(test)] @@ -418,6 +1186,33 @@ mod tests { use super::*; use std::sync::{Arc, Mutex}; + fn tool_call(name: &str) -> crate::model::ToolCall { + crate::model::ToolCall { + id: format!("call-{name}"), + name: name.to_string(), + arguments: "{}".to_string(), + } + } + + fn progress_record( + phase: LoopPhase, + step_signature: &str, + action_sigs: &[&str], + delta_sigs: &[&str], + previews: &[&str], + failed_tool_step: bool, + ) -> StepProgressRecord { + StepProgressRecord { + phase, + step_signature: step_signature.to_string(), + tool_count: 1, + failed_tool_step, + successful_action_signatures: action_sigs.iter().map(|s| (*s).to_string()).collect(), + state_delta_signatures: delta_sigs.iter().map(|s| (*s).to_string()).collect(), + completed_previews: previews.iter().map(|s| (*s).to_string()).collect(), + } + } + #[derive(Debug, Clone)] #[allow(dead_code)] enum RecordedEvent { @@ -460,13 +1255,15 @@ mod tests { } fn emit_step(&self, event: StepEvent) { - self.events - .lock() - .unwrap() - .push(RecordedEvent::Step(event)); + self.events.lock().unwrap().push(RecordedEvent::Step(event)); } - fn 
emit_complete(&self, result: &str) { + fn emit_complete( + &self, + result: &str, + _loop_metrics: Option, + _completion: Option, + ) { self.events .lock() .unwrap() @@ -489,7 +1286,11 @@ mod tests { demo_solve("Test objective", &emitter, token).await; let events = emitter.events(); - assert!(events.len() >= 4, "expected at least 4 events, got {}", events.len()); + assert!( + events.len() >= 4, + "expected at least 4 events, got {}", + events.len() + ); // First event: trace assert!(matches!(&events[0], RecordedEvent::Trace(_))); @@ -531,8 +1332,13 @@ mod tests { .any(|e| matches!(e, RecordedEvent::Error(m) if m == "Cancelled")); assert!(has_error, "expected a Cancelled error event"); - let has_complete = events.iter().any(|e| matches!(e, RecordedEvent::Complete(_))); - assert!(!has_complete, "should not have a Complete event when cancelled"); + let has_complete = events + .iter() + .any(|e| matches!(e, RecordedEvent::Complete(_))); + assert!( + !has_complete, + "should not have a Complete event when cancelled" + ); } #[tokio::test] @@ -607,7 +1413,10 @@ mod tests { let has_error = recorded .iter() .any(|e| matches!(e, RecordedEvent::Error(m) if m == "Cancelled")); - assert!(has_error, "expected Cancelled error after mid-flight cancel"); + assert!( + has_error, + "expected Cancelled error after mid-flight cancel" + ); // Should NOT have a Complete event let has_complete = recorded @@ -619,6 +1428,140 @@ mod tests { ); } + #[test] + fn test_evaluate_budget_extension_grants_on_real_progress() { + let records = vec![ + progress_record( + LoopPhase::Build, + "write_file|artifact=1|error=0", + &["write_file|{\"path\":\"a.txt\"}"], + &["write_file|wrote a.txt"], + &["Wrote a.txt"], + false, + ), + progress_record( + LoopPhase::Build, + "write_file|artifact=1|error=0", + &["write_file|{\"path\":\"b.txt\"}"], + &["write_file|wrote b.txt"], + &["Wrote b.txt"], + false, + ), + ]; + + let (eligible, payload) = evaluate_budget_extension(&records, 0); + assert!(eligible, 
"expected progress window to earn an extension"); + assert_eq!(payload.get("novel_action_count"), Some(&Value::from(2u64))); + assert_eq!(payload.get("state_delta_count"), Some(&Value::from(2u64))); + assert_eq!( + payload.get("blockers"), + Some(&Value::Array(Vec::new())) + ); + } + + #[test] + fn test_evaluate_budget_extension_blocks_repeated_signatures() { + let records = vec![ + progress_record( + LoopPhase::Investigate, + "run_shell|artifact=0|error=0", + &["run_shell|{\"command\":\"echo a\"}"], + &["run_shell|echo a"], + &["echo a"], + false, + ), + progress_record( + LoopPhase::Investigate, + "run_shell|artifact=0|error=0", + &["run_shell|{\"command\":\"echo b\"}"], + &["run_shell|echo b"], + &["echo b"], + false, + ), + progress_record( + LoopPhase::Investigate, + "run_shell|artifact=0|error=0", + &["run_shell|{\"command\":\"echo c\"}"], + &["run_shell|echo c"], + &["echo c"], + false, + ), + ]; + + let (eligible, payload) = evaluate_budget_extension(&records, 0); + assert!(!eligible, "repeated signatures should block extension"); + let blockers = payload + .get("blockers") + .and_then(Value::as_array) + .cloned() + .unwrap_or_default(); + assert!(blockers.contains(&Value::from("repeated_signatures"))); + } + + #[test] + fn test_normalize_progress_fragment_truncates_on_utf8_boundary() { + let normalized = + normalize_progress_fragment("[Step 1/100] [Context 10/20] 日本語テスト", 7); + + assert_eq!(normalized, "日本"); + assert!(normalized.len() <= 7); + } + + #[test] + fn test_summarize_observation_truncates_on_utf8_boundary() { + let summary = summarize_observation("abc日本語の長い説明\nsecond line", 8); + + assert_eq!(summary, "abc..."); + assert!(summary.ends_with("...")); + } + + #[test] + fn test_summarize_observation_small_limit_still_returns_ellipsis() { + let summary = summarize_observation("日本語の長い説明", 2); + + assert_eq!(summary, "..."); + } + + #[test] + fn test_build_partial_completion_text_mentions_budget_reason_and_preview() { + let records = 
vec![progress_record( + LoopPhase::Build, + "write_file|artifact=1|error=0", + &["write_file|{\"path\":\"artifact.txt\"}"], + &["write_file|wrote artifact"], + &["Wrote 8 chars to artifact.txt"], + false, + )]; + let loop_metrics = LoopMetrics { + steps: 4, + model_turns: 4, + tool_calls: 2, + investigate_steps: 0, + build_steps: 1, + iterate_steps: 0, + finalize_steps: 0, + recon_streak: 0, + max_recon_streak: 0, + guardrail_warnings: 0, + final_rejections: 0, + extensions_granted: 1, + extension_eligible_checks: 2, + extension_denials_no_progress: 0, + extension_denials_cap: 1, + termination_reason: "budget_cap".into(), + }; + + let text = build_partial_completion_text( + "finish the artifact", + &loop_metrics, + &records, + ); + + assert!(text.contains("Partial completion for objective: finish the artifact")); + assert!(text.contains("Termination reason: budget_cap")); + assert!(text.contains("Wrote 8 chars to artifact.txt")); + } + #[tokio::test] async fn test_demo_solve_spawned_task_completes() { // Simulates the exact pattern used in agent.rs: @@ -658,24 +1601,166 @@ mod tests { assert!(complete_text.contains("Spawned test")); } + #[test] + fn test_take_curator_phase_checkpoint_flushes_previous_phase_only() { + let mut pending = vec![CuratorStateDelta { + step: 1, + phase: LoopPhase::Investigate, + objective: "Investigate sources".to_string(), + observations: vec![crate::engine::curator::CuratorToolObservation { + tool_call_id: "call-1".to_string(), + tool_name: "read_file".to_string(), + arguments_json: "{}".to_string(), + output_excerpt: "source details".to_string(), + is_error: false, + }], + }]; + let mut active_phase = Some(LoopPhase::Investigate); + + let checkpoint = + take_curator_phase_checkpoint(&mut pending, &mut active_phase, LoopPhase::Build) + .expect("phase transition should flush checkpoint"); + + assert_eq!(checkpoint.boundary, "phase_transition:Investigate->Build"); + assert_eq!(checkpoint.deltas.len(), 1); + 
assert_eq!(checkpoint.deltas[0].phase, LoopPhase::Investigate); + assert!(pending.is_empty()); + assert_eq!(active_phase, Some(LoopPhase::Build)); + } + + #[test] + fn test_take_curator_phase_checkpoint_initializes_without_flush() { + let mut pending = Vec::new(); + let mut active_phase = None; + + let checkpoint = + take_curator_phase_checkpoint(&mut pending, &mut active_phase, LoopPhase::Investigate); + + assert!(checkpoint.is_none()); + assert_eq!(active_phase, Some(LoopPhase::Investigate)); + } + + #[test] + fn test_take_pending_curator_checkpoint_returns_none_when_empty() { + let mut pending = Vec::new(); + assert!(take_pending_curator_checkpoint(&mut pending, "finalize").is_none()); + } + #[test] fn test_estimate_tokens() { let messages = vec![ - Message::System { content: "System prompt".into() }, // 13 chars - Message::User { content: "Hello".into() }, // 5 chars - Message::Tool { tool_call_id: "t1".into(), content: "x".repeat(4000) }, + Message::System { + content: "System prompt".into(), + }, // 13 chars + Message::User { + content: "Hello".into(), + }, // 5 chars + Message::Tool { + tool_call_id: "t1".into(), + content: "x".repeat(4000), + }, ]; let tokens = estimate_tokens(&messages); // (13 + 5 + 4000) / 4 = 1004 assert_eq!(tokens, 1004); } + #[test] + fn test_build_initial_user_message_preserves_plain_objective_without_context() { + let config = AgentConfig::default(); + let message = build_initial_user_message("just objective", &config, None).unwrap(); + assert_eq!(message, "just objective"); + } + + #[test] + fn test_build_initial_user_message_includes_context_payload() { + let config = AgentConfig::default(); + let message = build_initial_user_message( + "investigate", + &config, + Some(&SolveInitialContext { + session_id: Some("session-1".to_string()), + session_dir: Some("/tmp/session-1".to_string()), + question_reasoning_packet: Some(serde_json::json!({ + "reasoning_mode": "question_centric", + "focus_question_ids": ["q_1"], + 
"candidate_actions": [{ + "id": "ca_q_q_1", + "action_type": "verify_claim", + "status": "proposed", + }], + "findings": { + "supported": [], + "contested": [], + "unresolved": [], + }, + "contradictions": [], + "evidence_index": {}, + })), + }), + ) + .unwrap(); + + let parsed: Value = serde_json::from_str(&message).unwrap(); + assert_eq!( + parsed["objective"], + Value::String("investigate".to_string()) + ); + assert_eq!(parsed["session_id"], Value::String("session-1".to_string())); + assert_eq!( + parsed["session_dir"], + Value::String("/tmp/session-1".to_string()) + ); + assert_eq!( + parsed["question_reasoning_packet"]["focus_question_ids"], + serde_json::json!(["q_1"]) + ); + assert_eq!( + parsed["question_reasoning_packet"]["candidate_actions"][0]["id"], + serde_json::json!("ca_q_q_1") + ); + assert!(parsed.get("timestamp").is_some()); + assert_eq!( + parsed["max_steps_per_call"], + Value::from(config.max_steps_per_call) + ); + } + + #[test] + fn test_build_initial_user_message_omits_packet_when_empty() { + let config = AgentConfig::default(); + let message = build_initial_user_message( + "investigate", + &config, + Some(&SolveInitialContext { + session_id: Some("session-1".to_string()), + session_dir: Some("/tmp/session-1".to_string()), + question_reasoning_packet: None, + }), + ) + .unwrap(); + + let parsed: Value = serde_json::from_str(&message).unwrap(); + assert!(parsed.get("question_reasoning_packet").is_none()); + assert_eq!( + parsed["objective"], + Value::String("investigate".to_string()) + ); + } + #[test] fn test_compact_messages_no_op_when_under_limit() { let mut messages = vec![ - Message::System { content: "System".into() }, - Message::User { content: "Hello".into() }, - Message::Tool { tool_call_id: "t1".into(), content: "Short result".into() }, + Message::System { + content: "System".into(), + }, + Message::User { + content: "Hello".into(), + }, + Message::Tool { + tool_call_id: "t1".into(), + content: "Short result".into(), + }, ]; 
compact_messages(&mut messages, 100_000); // Should be unchanged @@ -688,14 +1773,24 @@ mod tests { fn test_compact_messages_truncates_old_tool_results() { let big_result = "x".repeat(8000); let mut messages = vec![ - Message::System { content: "System".into() }, - Message::User { content: "Hello".into() }, + Message::System { + content: "System".into(), + }, + Message::User { + content: "Hello".into(), + }, ]; // Add 15 old steps (assistant + tool pairs) to exceed keep_recent for i in 0..15 { - messages.push(Message::Assistant { content: format!("step{i}"), tool_calls: None }); - messages.push(Message::Tool { tool_call_id: format!("t{i}"), content: big_result.clone() }); + messages.push(Message::Assistant { + content: format!("step{i}"), + tool_calls: None, + }); + messages.push(Message::Tool { + tool_call_id: format!("t{i}"), + content: big_result.clone(), + }); } // Total: ~(6 + 5 + 15*(5+8000)) / 4 ≈ 30_000 tokens @@ -704,14 +1799,87 @@ mod tests { // Old tool result (index 3, early in the list) should be truncated if let Message::Tool { content, .. } = &messages[3] { - assert!(content.len() < 300, "old tool result should be truncated, got {} chars", content.len()); + assert!( + content.len() < 300, + "old tool result should be truncated, got {} chars", + content.len() + ); assert!(content.contains("truncated")); } // Recent tool result (last one) should be intact - let last_tool = messages.iter().rev().find(|m| matches!(m, Message::Tool { .. })).unwrap(); + let last_tool = messages + .iter() + .rev() + .find(|m| matches!(m, Message::Tool { .. })) + .unwrap(); if let Message::Tool { content, .. 
} = last_tool { assert_eq!(content.len(), 8000, "recent tool result should be intact"); } } + + #[test] + fn test_is_meta_final_text_rejects_empty_and_strong_process_meta() { + assert!(is_meta_final_text("", "Answer the question directly")); + assert!(is_meta_final_text( + "I should start by checking the workspace layout.", + "Answer the question directly" + )); + assert!(!is_meta_final_text( + "Completed the fix and updated the failing test.", + "Answer the question directly" + )); + } + + #[test] + fn test_is_meta_final_text_respects_objective_policy_for_structural_meta() { + assert!(is_meta_final_text( + "Here is my plan for finishing the task.", + "Answer the question directly" + )); + assert!(!is_meta_final_text( + "Here is my plan for finishing the task.", + "Write a plan for finishing the task" + )); + assert!(is_meta_final_text( + "Here is my plan: I will inspect files and then implement.", + "Write a plan for finishing the task" + )); + } + + #[test] + fn test_classify_loop_phase_recon_only_is_investigate() { + let phase = classify_loop_phase(&[tool_call("read_file"), tool_call("list_files")], false); + assert_eq!(phase, LoopPhase::Investigate); + } + + #[test] + fn test_classify_loop_phase_artifact_tools_are_build() { + let phase = classify_loop_phase(&[tool_call("read_file"), tool_call("write_file")], false); + assert_eq!(phase, LoopPhase::Build); + } + + #[test] + fn test_classify_loop_phase_mixed_recon_and_non_recon_is_iterate() { + let phase = classify_loop_phase(&[tool_call("read_file"), tool_call("run_shell")], false); + assert_eq!(phase, LoopPhase::Iterate); + } + + #[test] + fn test_should_emit_recon_guardrail_once_per_episode() { + let mut last_guardrail_streak = 0; + + assert!(!should_emit_recon_guardrail(1, last_guardrail_streak)); + assert!(!should_emit_recon_guardrail(2, last_guardrail_streak)); + assert!(should_emit_recon_guardrail(3, last_guardrail_streak)); + + last_guardrail_streak = 3; + assert!(!should_emit_recon_guardrail(4, 
last_guardrail_streak)); + assert!(!should_emit_recon_guardrail(5, last_guardrail_streak)); + + last_guardrail_streak = 0; + assert!(!should_emit_recon_guardrail(1, last_guardrail_streak)); + assert!(!should_emit_recon_guardrail(2, last_guardrail_streak)); + assert!(should_emit_recon_guardrail(3, last_guardrail_streak)); + } } diff --git a/openplanter-desktop/crates/op-core/src/events.rs b/openplanter-desktop/crates/op-core/src/events.rs index 70a648a1..5434e4a1 100644 --- a/openplanter-desktop/crates/op-core/src/events.rs +++ b/openplanter-desktop/crates/op-core/src/events.rs @@ -18,6 +18,42 @@ pub struct StepEvent { pub tokens: TokenUsage, pub elapsed_ms: u64, pub is_final: bool, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub loop_phase: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub loop_metrics: Option, +} + +/// High-level phase classification for the current loop step. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum LoopPhase { + Investigate, + Build, + Iterate, + Finalize, +} + +/// Cumulative loop telemetry for health and governance UX. +#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] +#[serde(default)] +pub struct LoopMetrics { + pub steps: u32, + pub model_turns: u32, + pub tool_calls: u32, + pub investigate_steps: u32, + pub build_steps: u32, + pub iterate_steps: u32, + pub finalize_steps: u32, + pub recon_streak: u32, + pub max_recon_streak: u32, + pub guardrail_warnings: u32, + pub final_rejections: u32, + pub extensions_granted: u32, + pub extension_eligible_checks: u32, + pub extension_denials_no_progress: u32, + pub extension_denials_cap: u32, + pub termination_reason: String, } /// Token usage counters. @@ -44,10 +80,52 @@ pub enum DeltaKind { ToolCallArgs, } +/// Agent solve completed successfully. 
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum CompletionKind { + Final, + Partial, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum CompletionReason { + FinalAnswer, + BudgetNoProgress, + BudgetCap, +} + +#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)] +#[serde(default)] +pub struct CompletionMeta { + pub kind: String, + pub reason: String, + pub steps_used: u32, + pub max_steps: u32, + pub extensions_granted: u32, + pub extension_block_steps: u32, + pub extension_max_blocks: u32, +} + /// Agent solve completed successfully. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct CompleteEvent { pub result: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub loop_metrics: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub completion: Option, +} + +/// Periodic loop health telemetry event. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct LoopHealthEvent { + pub depth: u32, + pub step: u32, + pub phase: LoopPhase, + pub metrics: LoopMetrics, + pub is_final: bool, } /// Agent encountered an error. @@ -56,7 +134,7 @@ pub struct ErrorEvent { pub message: String, } -/// Background wiki curator completed an update. +/// Checkpointed wiki curator completed an update. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct CuratorUpdateEvent { pub summary: String, @@ -112,6 +190,7 @@ pub enum AgentEvent { Complete(CompleteEvent), Error(ErrorEvent), WikiUpdated(GraphData), + LoopHealth(LoopHealthEvent), } /// Configuration view sent to the frontend. 
@@ -250,6 +329,8 @@ mod tests { }, elapsed_ms: 2345, is_final: false, + loop_phase: None, + loop_metrics: None, }; let json = serde_json::to_string(&step).unwrap(); let parsed: serde_json::Value = serde_json::from_str(&json).unwrap(); @@ -258,4 +339,44 @@ mod tests { assert_eq!(parsed["tool_name"], "read_file"); assert_eq!(parsed["tokens"]["input_tokens"], 1234); } + #[test] + fn test_loop_metrics_deserialize_backfills_new_fields() { + let parsed: LoopMetrics = serde_json::from_str( + r#"{ + "steps": 2, + "model_turns": 2, + "tool_calls": 1, + "investigate_steps": 1, + "build_steps": 0, + "iterate_steps": 0, + "finalize_steps": 1, + "recon_streak": 0, + "max_recon_streak": 1, + "final_rejections": 1 + }"#, + ) + .unwrap(); + + assert_eq!( + parsed, + LoopMetrics { + steps: 2, + model_turns: 2, + tool_calls: 1, + investigate_steps: 1, + build_steps: 0, + iterate_steps: 0, + finalize_steps: 1, + recon_streak: 0, + max_recon_streak: 1, + guardrail_warnings: 0, + final_rejections: 1, + extensions_granted: 0, + extension_eligible_checks: 0, + extension_denials_no_progress: 0, + extension_denials_cap: 0, + termination_reason: String::new(), + } + ); + } } diff --git a/openplanter-desktop/crates/op-core/src/lib.rs b/openplanter-desktop/crates/op-core/src/lib.rs index 05b9c49a..62efa5cf 100644 --- a/openplanter-desktop/crates/op-core/src/lib.rs +++ b/openplanter-desktop/crates/op-core/src/lib.rs @@ -1,11 +1,11 @@ +pub mod builder; pub mod config; -pub mod prompts; pub mod credentials; -pub mod settings; -pub mod builder; +pub mod engine; pub mod events; pub mod model; -pub mod engine; -pub mod tools; +pub mod prompts; pub mod session; +pub mod settings; +pub mod tools; pub mod wiki; diff --git a/openplanter-desktop/crates/op-core/src/model/mod.rs b/openplanter-desktop/crates/op-core/src/model/mod.rs index 4f2781ec..81b04ca3 100644 --- a/openplanter-desktop/crates/op-core/src/model/mod.rs +++ b/openplanter-desktop/crates/op-core/src/model/mod.rs @@ -8,6 +8,24 @@ use 
serde::{Deserialize, Serialize}; use crate::events::DeltaEvent; use tokio_util::sync::CancellationToken; +/// Structured model error for provider rate limiting. +#[derive(Debug, Clone)] +pub struct RateLimitError { + pub message: String, + pub status_code: Option, + pub provider_code: Option, + pub body: String, + pub retry_after_sec: Option, +} + +impl std::fmt::Display for RateLimitError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.message) + } +} + +impl std::error::Error for RateLimitError {} + /// A single tool call returned by the model. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ToolCall { diff --git a/openplanter-desktop/crates/op-core/src/prompts.rs b/openplanter-desktop/crates/op-core/src/prompts.rs index 83a5e85a..8cb3faea 100644 --- a/openplanter-desktop/crates/op-core/src/prompts.rs +++ b/openplanter-desktop/crates/op-core/src/prompts.rs @@ -214,8 +214,10 @@ logs you can read with read_file to recall prior work: - {session_dir}/events.jsonl — Trace events log (JSONL). Each record has a timestamp, event type ("objective", "trace", "step", "result"), and payload. Use this for a lightweight overview of objectives and results without full messages. -- {session_dir}/state.json — Persisted external context observations from prior turns. - This is what feeds the external_context_summary in your initial message. +- {session_dir}/investigation_state.json — Canonical typed session state with + structured evidence plus a legacy projection of prior observations. +- {session_dir}/state.json — Legacy compatibility projection of session state. + This still feeds the external_context_summary in your initial message. These files grow throughout the session. If you need to recall prior analysis, check what you did before, or pick up where you left off, read these logs. @@ -230,7 +232,7 @@ from prior turns in this session. 
Each entry has: - objective: the objective given to that turn - result_preview: first ~200 characters of the turn's result - timestamp: ISO 8601 UTC when the turn ran - - steps_used: how many engine steps were consumed + - steps_used: how many replayed model calls the turn produced, including delegated child conversations - replay_seq_start: starting sequence number in replay.jsonl Use turn history to: @@ -241,6 +243,42 @@ Use turn history to: For full details of any prior turn, read the session logs: replay.jsonl (full transcript) or events.jsonl (lightweight trace)."#; +pub const QUESTION_REASONING_SECTION: &str = r#" +== QUESTION-CENTRIC REASONING == +Your initial message may contain a "question_reasoning_packet" derived from +{session_dir}/investigation_state.json. Use question-centric reasoning over +document-centric "read more then synthesize" behavior. + +Run this loop until step budget is low or high-priority questions are resolved: +1) Select the next unresolved question from question_reasoning_packet.focus_question_ids + or question_reasoning_packet.unresolved_questions. +2) Gather discriminating evidence targeted at that question. +3) Update related claims in investigation_state.claims with explicit status + (supported / contested / unresolved), confidence, and cited evidence IDs. +4) Record contradictions explicitly, preserving both supporting and contradictory + evidence with provenance IDs instead of collapsing disagreement. +5) Only then synthesize, and repeat for remaining unresolved questions. + +Rules: +- Ground reasoning in typed state references, not raw transcript quotes. Prefer + question IDs, claim IDs, evidence IDs, and provenance IDs. +- Do not mark a claim supported without support evidence IDs. +- Do not resolve a question without explicit claim/evidence linkage. +- Prefer provenance-backed evidence over uncited notes. +- `question_reasoning_packet.candidate_actions` is a machine-readable, read-only + planner surface. 
Use it to prioritize next steps, but do not assume those + actions were persisted as canonical tasks or executed actions yet. +- Keep any use of `candidate_actions` structured: preserve rationale, + required_sources, expected_payoff, evidence_gap_refs, and ontology_object_refs + instead of turning them into prose recommendations. + +Final deliverables MUST separate findings into three sections: +- Supported Findings +- Contested Findings +- Unresolved Findings + +Each item should cite the relevant evidence/provenance IDs."#; + pub const WIKI_SECTION: &str = r#" == DATA SOURCES WIKI == A runtime wiki of data source documentation is available at .openplanter/wiki/. @@ -248,9 +286,14 @@ Read .openplanter/wiki/index.md at the start of any investigation to see what data sources are documented. Each entry describes access methods, schemas, coverage, and cross-reference potential. -When you discover new information about a data source — updated URLs, new fields, -cross-reference joins, data quality issues, or entirely new sources — update the -relevant entry or create a new one using .openplanter/wiki/template.md. +Treat the wiki as a derived knowledge surface, not your primary memory store. +Primary continuity comes from {session_dir}/investigation_state.json and explicit +evidence/provenance IDs. + +When you discover durable, non-duplicative information about a data source — +updated URLs, new fields, cross-reference joins, data quality issues, or +entirely new sources — update the relevant entry or create a new one using +.openplanter/wiki/template.md. Avoid noisy repeat edits that do not add facts. 
=== MANDATORY WIKI INDEXING === For EVERY investigation, you MUST maintain the wiki as a living knowledge map: @@ -393,6 +436,7 @@ pub fn build_system_prompt(recursive: bool, acceptance_criteria: bool, demo: boo prompt.push_str(SYSTEM_PROMPT_BASE); prompt.push_str(SESSION_LOGS_SECTION); prompt.push_str(TURN_HISTORY_SECTION); + prompt.push_str(QUESTION_REASONING_SECTION); prompt.push_str(WIKI_SECTION); if recursive { prompt.push_str(RECURSIVE_SECTION); @@ -416,6 +460,8 @@ mod tests { assert!(prompt.contains("You are OpenPlanter")); assert!(prompt.contains("SESSION LOGS AND TRANSCRIPTS")); assert!(prompt.contains("TURN HISTORY")); + assert!(prompt.contains("QUESTION-CENTRIC REASONING")); + assert!(prompt.contains("candidate_actions")); assert!(prompt.contains("DATA SOURCES WIKI")); assert!(!prompt.contains("REPL STRUCTURE")); assert!(!prompt.contains("ACCEPTANCE CRITERIA")); @@ -447,6 +493,7 @@ mod tests { let base_pos = prompt.find("You are OpenPlanter").unwrap(); let session_pos = prompt.find("SESSION LOGS AND TRANSCRIPTS").unwrap(); let turn_pos = prompt.find("TURN HISTORY").unwrap(); + let question_pos = prompt.find("QUESTION-CENTRIC REASONING").unwrap(); let wiki_pos = prompt.find("DATA SOURCES WIKI").unwrap(); let repl_pos = prompt.find("REPL STRUCTURE").unwrap(); let accept_pos = prompt.find("ACCEPTANCE CRITERIA").unwrap(); @@ -454,7 +501,8 @@ mod tests { assert!(base_pos < session_pos); assert!(session_pos < turn_pos); - assert!(turn_pos < wiki_pos); + assert!(turn_pos < question_pos); + assert!(question_pos < wiki_pos); assert!(wiki_pos < repl_pos); assert!(repl_pos < accept_pos); assert!(accept_pos < demo_pos); diff --git a/openplanter-desktop/crates/op-core/src/session/mod.rs b/openplanter-desktop/crates/op-core/src/session/mod.rs index 83085b45..3ef1e89c 100644 --- a/openplanter-desktop/crates/op-core/src/session/mod.rs +++ b/openplanter-desktop/crates/op-core/src/session/mod.rs @@ -1,6 +1,6 @@ +pub mod credentials; /// Session store and runtime. 
/// /// Full implementation in Phase 5. pub mod replay; pub mod settings; -pub mod credentials; diff --git a/openplanter-desktop/crates/op-core/src/session/replay.rs b/openplanter-desktop/crates/op-core/src/session/replay.rs index 367c27cc..df6895aa 100644 --- a/openplanter-desktop/crates/op-core/src/session/replay.rs +++ b/openplanter-desktop/crates/op-core/src/session/replay.rs @@ -58,6 +58,7 @@ impl ReplayLogger { /// Append an entry to the replay log. pub async fn append(&mut self, mut entry: ReplayEntry) -> std::io::Result<()> { + self.seq = self.seq.max(Self::max_seq_from_file(&self.path).await?); self.seq += 1; entry.seq = self.seq; if entry.timestamp.is_empty() { @@ -77,6 +78,29 @@ impl ReplayLogger { Ok(()) } + async fn max_seq_from_file(path: &Path) -> std::io::Result { + if !path.exists() { + return Ok(0); + } + let content = fs::read_to_string(path).await?; + let mut max_seq = 0_u64; + for line in content.lines() { + let trimmed = line.trim(); + if trimmed.is_empty() { + continue; + } + match serde_json::from_str::(trimmed) { + Ok(entry) => { + max_seq = max_seq.max(entry.seq); + } + Err(e) => { + eprintln!("[replay] skipping malformed line: {e}"); + } + } + } + Ok(max_seq) + } + /// Read all entries from a session's replay log. 
pub async fn read_all(session_dir: &Path) -> std::io::Result> { let path = session_dir.join("replay.jsonl"); @@ -179,13 +203,11 @@ mod tests { step_tokens_out: Some(2100), step_elapsed: Some(5000), step_model_preview: Some("The analysis shows...".into()), - step_tool_calls: Some(vec![ - StepToolCallEntry { - name: "read_file".into(), - key_arg: "/src/main.ts".into(), - elapsed: 1200, - }, - ]), + step_tool_calls: Some(vec![StepToolCallEntry { + name: "read_file".into(), + key_arg: "/src/main.ts".into(), + elapsed: 1200, + }]), }; logger.append(entry).await.unwrap(); @@ -218,7 +240,8 @@ mod tests { step_elapsed: None, step_model_preview: None, step_tool_calls: None, - }).unwrap(), + }) + .unwrap(), serde_json::to_string(&ReplayEntry { seq: 2, timestamp: "2026-01-01T00:01:00Z".into(), @@ -232,7 +255,8 @@ mod tests { step_elapsed: None, step_model_preview: None, step_tool_calls: None, - }).unwrap(), + }) + .unwrap(), ); fs::write(&path, content).await.unwrap(); @@ -285,9 +309,124 @@ mod tests { }; logger.append(entry).await.unwrap(); - let content = fs::read_to_string(tmp.path().join("replay.jsonl")).await.unwrap(); + let content = fs::read_to_string(tmp.path().join("replay.jsonl")) + .await + .unwrap(); assert!(!content.contains("tool_name")); assert!(!content.contains("step_number")); assert!(!content.contains("step_tool_calls")); } + + #[tokio::test] + async fn test_append_continues_seq_from_existing_file() { + let tmp = tempdir().unwrap(); + let path = tmp.path().join("replay.jsonl"); + let content = format!( + "{}\n{}\n", + serde_json::to_string(&ReplayEntry { + seq: 4, + timestamp: "2026-01-01T00:00:00Z".into(), + role: "user".into(), + content: "first".into(), + tool_name: None, + is_rendered: None, + step_number: None, + step_tokens_in: None, + step_tokens_out: None, + step_elapsed: None, + step_model_preview: None, + step_tool_calls: None, + }) + .unwrap(), + serde_json::to_string(&ReplayEntry { + seq: 6, + timestamp: "2026-01-01T00:01:00Z".into(), + role: 
"assistant".into(), + content: "second".into(), + tool_name: None, + is_rendered: None, + step_number: None, + step_tokens_in: None, + step_tokens_out: None, + step_elapsed: None, + step_model_preview: None, + step_tool_calls: None, + }) + .unwrap(), + ); + fs::write(&path, content).await.unwrap(); + + let mut logger = ReplayLogger::new(tmp.path()); + logger + .append(ReplayEntry { + seq: 0, + timestamp: String::new(), + role: "user".into(), + content: "third".into(), + tool_name: None, + is_rendered: None, + step_number: None, + step_tokens_in: None, + step_tokens_out: None, + step_elapsed: None, + step_model_preview: None, + step_tool_calls: None, + }) + .await + .unwrap(); + + let entries = ReplayLogger::read_all(tmp.path()).await.unwrap(); + assert_eq!(entries.last().unwrap().seq, 7); + } + + #[tokio::test] + async fn test_append_ignores_malformed_lines_when_scanning_seq() { + let tmp = tempdir().unwrap(); + let path = tmp.path().join("replay.jsonl"); + fs::write( + &path, + format!( + "{}\nnot json\n", + serde_json::to_string(&ReplayEntry { + seq: 2, + timestamp: "2026-01-01T00:00:00Z".into(), + role: "user".into(), + content: "first".into(), + tool_name: None, + is_rendered: None, + step_number: None, + step_tokens_in: None, + step_tokens_out: None, + step_elapsed: None, + step_model_preview: None, + step_tool_calls: None, + }) + .unwrap() + ), + ) + .await + .unwrap(); + + let mut logger = ReplayLogger::new(tmp.path()); + logger + .append(ReplayEntry { + seq: 0, + timestamp: String::new(), + role: "assistant".into(), + content: "next".into(), + tool_name: None, + is_rendered: None, + step_number: None, + step_tokens_in: None, + step_tokens_out: None, + step_elapsed: None, + step_model_preview: None, + step_tool_calls: None, + }) + .await + .unwrap(); + + let entries = ReplayLogger::read_all(tmp.path()).await.unwrap(); + assert_eq!(entries.last().unwrap().seq, 3); + } } diff --git a/openplanter-desktop/crates/op-core/src/tools/filesystem.rs 
b/openplanter-desktop/crates/op-core/src/tools/filesystem.rs index bee02d2f..2c67d62b 100644 --- a/openplanter-desktop/crates/op-core/src/tools/filesystem.rs +++ b/openplanter-desktop/crates/op-core/src/tools/filesystem.rs @@ -1,5 +1,4 @@ /// Filesystem tools: read, write, edit, list, search. - use std::collections::HashSet; use std::path::{Path, PathBuf}; use std::process::Command; @@ -20,10 +19,7 @@ pub(crate) fn clip(text: &str, max_chars: usize) -> String { } let end = text.floor_char_boundary(max_chars); let omitted = text.len() - end; - format!( - "{}\n\n...[truncated {omitted} chars]...", - &text[..end] - ) + format!("{}\n\n...[truncated {omitted} chars]...", &text[..end]) } pub(crate) fn resolve_path(root: &Path, raw_path: &str) -> Result { @@ -374,12 +370,7 @@ pub fn search_files( let rel = entry.path().strip_prefix(root).unwrap_or(entry.path()); for (idx, line) in text.lines().enumerate() { if line.to_lowercase().contains(&lower_query) { - matches.push(format!( - "{}:{}:{}", - rel.to_string_lossy(), - idx + 1, - line - )); + matches.push(format!("{}:{}:{}", rel.to_string_lossy(), idx + 1, line)); if matches.len() >= max_hits { let mut result = matches.join("\n"); result.push_str("\n...[match limit reached]..."); @@ -467,13 +458,7 @@ mod tests { let dir = TempDir::new().unwrap(); std::fs::write(dir.path().join("test.txt"), "hello world").unwrap(); let mut files_read = HashSet::new(); - let result = edit_file( - dir.path(), - "test.txt", - "hello", - "goodbye", - &mut files_read, - ); + let result = edit_file(dir.path(), "test.txt", "hello", "goodbye", &mut files_read); assert!(!result.is_error); assert_eq!( std::fs::read_to_string(dir.path().join("test.txt")).unwrap(), diff --git a/openplanter-desktop/crates/op-core/src/tools/mod.rs b/openplanter-desktop/crates/op-core/src/tools/mod.rs index 6781f4e2..6ae2065d 100644 --- a/openplanter-desktop/crates/op-core/src/tools/mod.rs +++ b/openplanter-desktop/crates/op-core/src/tools/mod.rs @@ -37,9 +37,16 @@ impl 
ToolResult { } } +#[derive(Debug, Clone)] +enum ToolScope { + FullWorkspace, + CuratorWikiOnly { allowed_root: PathBuf }, +} + /// Central dispatcher for workspace tools. pub struct WorkspaceTools { root: PathBuf, + scope: ToolScope, shell_path: String, command_timeout_sec: u64, max_shell_output_chars: usize, @@ -57,6 +64,7 @@ impl WorkspaceTools { pub fn new(config: &AgentConfig) -> Self { Self { root: config.workspace.clone(), + scope: ToolScope::FullWorkspace, shell_path: config.shell.clone(), command_timeout_sec: config.command_timeout_sec as u64, max_shell_output_chars: config.max_shell_output_chars as usize, @@ -71,6 +79,46 @@ impl WorkspaceTools { } } + pub fn new_curator(config: &AgentConfig) -> Self { + let allowed_root = filesystem::resolve_path( + &config.workspace, + &format!("{}/wiki", config.session_root_dir), + ) + .unwrap_or_else(|_| config.workspace.join(&config.session_root_dir).join("wiki")); + Self { + root: config.workspace.clone(), + scope: ToolScope::CuratorWikiOnly { allowed_root }, + shell_path: config.shell.clone(), + command_timeout_sec: config.command_timeout_sec as u64, + max_shell_output_chars: config.max_shell_output_chars as usize, + max_file_chars: config.max_file_chars as usize, + max_files_listed: config.max_files_listed as usize, + max_search_hits: config.max_search_hits as usize, + max_observation_chars: config.max_observation_chars as usize, + exa_api_key: config.exa_api_key.clone(), + exa_base_url: config.exa_base_url.clone(), + files_read: HashSet::new(), + bg_jobs: shell::BgJobs::new(), + } + } + + fn enforce_write_scope(&self, raw_path: &str) -> Result<(), ToolResult> { + match &self.scope { + ToolScope::FullWorkspace => Ok(()), + ToolScope::CuratorWikiOnly { allowed_root } => { + let resolved = + filesystem::resolve_path(&self.root, raw_path).map_err(ToolResult::error)?; + if resolved == *allowed_root || resolved.starts_with(allowed_root) { + Ok(()) + } else { + Err(ToolResult::error( + "Curator writes are restricted to 
.openplanter/wiki/**".to_string(), + )) + } + } + } + } + /// Execute a tool by name with JSON arguments string. /// Returns the tool result, clipped to max_observation_chars. pub async fn execute(&mut self, name: &str, args_json: &str) -> ToolResult { @@ -93,12 +141,18 @@ impl WorkspaceTools { "write_file" => { let path = args.get("path").and_then(|v| v.as_str()).unwrap_or(""); let content = args.get("content").and_then(|v| v.as_str()).unwrap_or(""); + if let Err(result) = self.enforce_write_scope(path) { + return result; + } filesystem::write_file(&self.root, path, content, &mut self.files_read) } "edit_file" => { let path = args.get("path").and_then(|v| v.as_str()).unwrap_or(""); let old_text = args.get("old_text").and_then(|v| v.as_str()).unwrap_or(""); let new_text = args.get("new_text").and_then(|v| v.as_str()).unwrap_or(""); + if let Err(result) = self.enforce_write_scope(path) { + return result; + } filesystem::edit_file( &self.root, path, diff --git a/openplanter-desktop/crates/op-core/src/tools/patching.rs b/openplanter-desktop/crates/op-core/src/tools/patching.rs index 8a136b09..2db9d1d6 100644 --- a/openplanter-desktop/crates/op-core/src/tools/patching.rs +++ b/openplanter-desktop/crates/op-core/src/tools/patching.rs @@ -1,5 +1,4 @@ /// Codex-style patch application and hashline editing. 
- use std::collections::HashSet; use std::path::{Path, PathBuf}; @@ -24,8 +23,13 @@ fn resolve_path(root: &Path, raw_path: &str) -> Result { // ── Codex-style patch format ── enum PatchOp { - Add { path: String, content: String }, - Delete { path: String }, + Add { + path: String, + content: String, + }, + Delete { + path: String, + }, Update { path: String, move_to: Option, @@ -66,10 +70,7 @@ fn parse_agent_patch(text: &str) -> Result, String> { let line = body[i].trim(); if line.starts_with("*** Add File:") { - let path = line - .trim_start_matches("*** Add File:") - .trim() - .to_string(); + let path = line.trim_start_matches("*** Add File:").trim().to_string(); i += 1; let mut content_lines: Vec = Vec::new(); while i < body.len() && !body[i].trim().starts_with("***") { @@ -174,11 +175,7 @@ fn parse_chunks(raw_lines: &[&str]) -> Vec { } } -fn find_subsequence( - haystack: &[String], - needle: &[String], - start_idx: usize, -) -> Option { +fn find_subsequence(haystack: &[String], needle: &[String], start_idx: usize) -> Option { if needle.is_empty() { return Some(start_idx.min(haystack.len())); } @@ -202,8 +199,7 @@ fn find_subsequence( } // Pass 2: whitespace-normalized match - let normalize = - |s: &str| -> String { s.split_whitespace().collect::>().join(" ") }; + let normalize = |s: &str| -> String { s.split_whitespace().collect::>().join(" ") }; let norm_needle: Vec = needle.iter().map(|s| normalize(s)).collect(); for i in 0..=max_start { @@ -219,11 +215,7 @@ fn find_subsequence( None } -pub fn apply_patch( - root: &Path, - patch_text: &str, - files_read: &mut HashSet, -) -> ToolResult { +pub fn apply_patch(root: &Path, patch_text: &str, files_read: &mut HashSet) -> ToolResult { if patch_text.trim().is_empty() { return ToolResult::error("apply_patch requires non-empty patch text".into()); } @@ -248,9 +240,7 @@ pub fn apply_patch( let _ = std::fs::create_dir_all(parent); } if let Err(e) = std::fs::write(&resolved, &content) { - return ToolResult::error(format!( 
- "Patch failed: could not write {path}: {e}" - )); + return ToolResult::error(format!("Patch failed: could not write {path}: {e}")); } files_read.insert(resolved); added.push(path); @@ -261,9 +251,7 @@ pub fn apply_patch( Err(e) => return ToolResult::error(format!("Patch failed: {e}")), }; if !resolved.exists() { - return ToolResult::error(format!( - "Patch failed: file not found: {path}" - )); + return ToolResult::error(format!("Patch failed: file not found: {path}")); } if let Err(e) = std::fs::remove_file(&resolved) { return ToolResult::error(format!( @@ -286,14 +274,13 @@ pub fn apply_patch( Err(e) => { return ToolResult::error(format!( "Patch failed: could not read {path}: {e}" - )) + )); } }; files_read.insert(resolved.clone()); let had_trailing_newline = content.ends_with('\n'); - let mut lines: Vec = - content.lines().map(|l| l.to_string()).collect(); + let mut lines: Vec = content.lines().map(|l| l.to_string()).collect(); let mut cursor = 0usize; for chunk in &chunks { @@ -333,9 +320,7 @@ pub fn apply_patch( let _ = std::fs::create_dir_all(parent); } if let Err(e) = std::fs::write(&target, &result) { - return ToolResult::error(format!( - "Patch failed: could not write {path}: {e}" - )); + return ToolResult::error(format!("Patch failed: could not write {path}: {e}")); } files_read.insert(target); updated.push(path); @@ -414,10 +399,8 @@ pub fn hashline_edit( new_lines: vec![new_line], }); } else if let Some(range) = edit.get("replace_lines") { - let start_anchor = - range.get("start").and_then(|v| v.as_str()).unwrap_or(""); - let end_anchor = - range.get("end").and_then(|v| v.as_str()).unwrap_or(""); + let start_anchor = range.get("start").and_then(|v| v.as_str()).unwrap_or(""); + let end_anchor = range.get("end").and_then(|v| v.as_str()).unwrap_or(""); let (start, err) = validate_anchor(start_anchor, &line_hashes, &lines); if let Some(e) = err { return ToolResult::error(e); @@ -427,12 +410,9 @@ pub fn hashline_edit( return ToolResult::error(e); } if end < 
start { - return ToolResult::error(format!( - "End line {end} is before start line {start}" - )); + return ToolResult::error(format!("End line {end} is before start line {start}")); } - let raw_content = - edit.get("content").and_then(|v| v.as_str()).unwrap_or(""); + let raw_content = edit.get("content").and_then(|v| v.as_str()).unwrap_or(""); let new_lines: Vec = raw_content .lines() .map(|l| HASHLINE_PREFIX_RE.replace(l, "").to_string()) @@ -443,15 +423,12 @@ pub fn hashline_edit( end, new_lines, }); - } else if let Some(anchor) = - edit.get("insert_after").and_then(|v| v.as_str()) - { + } else if let Some(anchor) = edit.get("insert_after").and_then(|v| v.as_str()) { let (lineno, err) = validate_anchor(anchor, &line_hashes, &lines); if let Some(e) = err { return ToolResult::error(e); } - let raw_content = - edit.get("content").and_then(|v| v.as_str()).unwrap_or(""); + let raw_content = edit.get("content").and_then(|v| v.as_str()).unwrap_or(""); let new_lines: Vec = raw_content .lines() .map(|l| HASHLINE_PREFIX_RE.replace(l, "").to_string()) @@ -483,13 +460,9 @@ pub fn hashline_edit( } } "replace" => { - let old_slice: Vec = - lines[edit.start - 1..edit.end].to_vec(); + let old_slice: Vec = lines[edit.start - 1..edit.end].to_vec(); if old_slice != edit.new_lines { - lines.splice( - edit.start - 1..edit.end, - edit.new_lines.iter().cloned(), - ); + lines.splice(edit.start - 1..edit.end, edit.new_lines.iter().cloned()); changed += 1; } } @@ -527,9 +500,7 @@ fn validate_anchor( if parts.len() != 2 || parts[1].len() != 2 { return ( 0, - Some(format!( - "Invalid anchor format: {anchor:?} (expected N:HH)" - )), + Some(format!("Invalid anchor format: {anchor:?} (expected N:HH)")), ); } let lineno: usize = match parts[0].parse() { @@ -537,10 +508,8 @@ fn validate_anchor( Err(_) => { return ( 0, - Some(format!( - "Invalid anchor format: {anchor:?} (expected N:HH)" - )), - ) + Some(format!("Invalid anchor format: {anchor:?} (expected N:HH)")), + ); } }; let expected_hash = 
parts[1]; @@ -553,10 +522,7 @@ fn validate_anchor( )), ); } - let actual_hash = line_hashes - .get(&lineno) - .map(|s| s.as_str()) - .unwrap_or(""); + let actual_hash = line_hashes.get(&lineno).map(|s| s.as_str()).unwrap_or(""); if actual_hash != expected_hash { let ctx_start = lineno.saturating_sub(2).max(1); let ctx_end = (lineno + 2).min(lines.len()); @@ -565,10 +531,7 @@ fn validate_anchor( format!( " {}:{}|{}", i, - line_hashes - .get(&i) - .map(|s| s.as_str()) - .unwrap_or("??"), + line_hashes.get(&i).map(|s| s.as_str()).unwrap_or("??"), lines[i - 1] ) }) @@ -603,8 +566,7 @@ mod tests { let result = apply_patch(dir.path(), patch, &mut files_read); assert!(!result.is_error, "error: {}", result.content); assert!(result.content.contains("Added")); - let content = - std::fs::read_to_string(dir.path().join("new_file.txt")).unwrap(); + let content = std::fs::read_to_string(dir.path().join("new_file.txt")).unwrap(); assert_eq!(content, "hello\nworld\n"); } @@ -625,8 +587,7 @@ mod tests { #[test] fn test_apply_patch_update_file() { let dir = TempDir::new().unwrap(); - std::fs::write(dir.path().join("test.txt"), "line1\nline2\nline3\n") - .unwrap(); + std::fs::write(dir.path().join("test.txt"), "line1\nline2\nline3\n").unwrap(); let mut files_read = HashSet::new(); let patch = "\ *** Begin Patch @@ -639,8 +600,7 @@ mod tests { *** End Patch"; let result = apply_patch(dir.path(), patch, &mut files_read); assert!(!result.is_error, "error: {}", result.content); - let content = - std::fs::read_to_string(dir.path().join("test.txt")).unwrap(); + let content = std::fs::read_to_string(dir.path().join("test.txt")).unwrap(); assert!(content.contains("line2_modified")); assert!(!content.contains("\nline2\n")); } @@ -656,11 +616,9 @@ mod tests { "set_line": format!("2:{hash}"), "content": "BBB" })]; - let result = - hashline_edit(dir.path(), "test.txt", &edits, &mut files_read); + let result = hashline_edit(dir.path(), "test.txt", &edits, &mut files_read); 
assert!(!result.is_error, "error: {}", result.content); - let content = - std::fs::read_to_string(dir.path().join("test.txt")).unwrap(); + let content = std::fs::read_to_string(dir.path().join("test.txt")).unwrap(); assert!(content.contains("BBB")); assert!(!content.contains("\nbbb\n")); } @@ -676,11 +634,9 @@ mod tests { "insert_after": format!("2:{hash}"), "content": "inserted_line" })]; - let result = - hashline_edit(dir.path(), "test.txt", &edits, &mut files_read); + let result = hashline_edit(dir.path(), "test.txt", &edits, &mut files_read); assert!(!result.is_error, "error: {}", result.content); - let content = - std::fs::read_to_string(dir.path().join("test.txt")).unwrap(); + let content = std::fs::read_to_string(dir.path().join("test.txt")).unwrap(); let lines: Vec<&str> = content.lines().collect(); assert_eq!(lines[2], "inserted_line"); } @@ -693,16 +649,14 @@ mod tests { #[test] fn test_find_subsequence_exact() { - let haystack: Vec = - vec!["a".into(), "b".into(), "c".into()]; + let haystack: Vec = vec!["a".into(), "b".into(), "c".into()]; let needle: Vec = vec!["b".into(), "c".into()]; assert_eq!(find_subsequence(&haystack, &needle, 0), Some(1)); } #[test] fn test_find_subsequence_whitespace() { - let haystack: Vec = - vec!["a".into(), " b ".into(), "c".into()]; + let haystack: Vec = vec!["a".into(), " b ".into(), "c".into()]; let needle: Vec = vec!["b".into(), "c".into()]; assert_eq!(find_subsequence(&haystack, &needle, 0), Some(1)); } diff --git a/openplanter-desktop/crates/op-core/src/tools/shell.rs b/openplanter-desktop/crates/op-core/src/tools/shell.rs index f1023b37..5f032f75 100644 --- a/openplanter-desktop/crates/op-core/src/tools/shell.rs +++ b/openplanter-desktop/crates/op-core/src/tools/shell.rs @@ -1,5 +1,4 @@ /// Shell execution tools: run_shell, run_shell_bg, check_shell_bg, kill_shell_bg. 
- use std::collections::HashMap; use std::path::Path; use std::process::{Child, Command, Stdio}; @@ -22,10 +21,7 @@ fn clip(text: &str, max_chars: usize) -> String { } let end = text.floor_char_boundary(max_chars); let omitted = text.len() - end; - format!( - "{}\n\n...[truncated {omitted} chars]...", - &text[..end] - ) + format!("{}\n\n...[truncated {omitted} chars]...", &text[..end]) } fn check_shell_policy(command: &str) -> Option { @@ -134,18 +130,11 @@ pub fn run_shell( let stderr = String::from_utf8_lossy(&output.stderr); let code = output.status.code().unwrap_or(-1); - let merged = format!( - "$ {command}\n[exit_code={code}]\n[stdout]\n{stdout}\n[stderr]\n{stderr}" - ); + let merged = format!("$ {command}\n[exit_code={code}]\n[stdout]\n{stdout}\n[stderr]\n{stderr}"); ToolResult::ok(clip(&merged, max_output_chars)) } -pub fn run_shell_bg( - root: &Path, - shell: &str, - command: &str, - bg_jobs: &mut BgJobs, -) -> ToolResult { +pub fn run_shell_bg(root: &Path, shell: &str, command: &str, bg_jobs: &mut BgJobs) -> ToolResult { if let Some(err) = check_shell_policy(command) { return ToolResult::error(err); } @@ -195,11 +184,7 @@ pub fn run_shell_bg( )) } -pub fn check_shell_bg( - job_id: u32, - bg_jobs: &mut BgJobs, - max_output_chars: usize, -) -> ToolResult { +pub fn check_shell_bg(job_id: u32, bg_jobs: &mut BgJobs, max_output_chars: usize) -> ToolResult { let job = match bg_jobs.jobs.get_mut(&job_id) { Some(j) => j, None => return ToolResult::error(format!("No background job with id {job_id}")), @@ -220,9 +205,7 @@ pub fn check_shell_bg( } Ok(None) => { let pid = job.child.id(); - ToolResult::ok(format!( - "[job {job_id} still running, pid={pid}]\n{output}" - )) + ToolResult::ok(format!("[job {job_id} still running, pid={pid}]\n{output}")) } Err(e) => ToolResult::error(format!("Error checking job {job_id}: {e}")), } @@ -258,13 +241,7 @@ mod tests { #[test] fn test_run_shell_heredoc_blocked() { let dir = TempDir::new().unwrap(); - let result = run_shell( - 
dir.path(), - "/bin/sh", - "cat << EOF\nhello\nEOF", - 10, - 16000, - ); + let result = run_shell(dir.path(), "/bin/sh", "cat << EOF\nhello\nEOF", 10, 16000); assert!(result.is_error); assert!(result.content.contains("BLOCKED")); } diff --git a/openplanter-desktop/crates/op-core/src/wiki/matching.rs b/openplanter-desktop/crates/op-core/src/wiki/matching.rs index 4955b0ac..48620b1d 100644 --- a/openplanter-desktop/crates/op-core/src/wiki/matching.rs +++ b/openplanter-desktop/crates/op-core/src/wiki/matching.rs @@ -18,15 +18,13 @@ impl NameRegistry { /// Register a canonical name for an entity. pub fn register(&mut self, name: &str, entity_id: &str) { - self.entries - .push((name.to_string(), entity_id.to_string())); + self.entries.push((name.to_string(), entity_id.to_string())); } /// Register multiple aliases for the same entity. pub fn register_aliases(&mut self, aliases: &[String], entity_id: &str) { for alias in aliases { - self.entries - .push((alias.clone(), entity_id.to_string())); + self.entries.push((alias.clone(), entity_id.to_string())); } } @@ -123,10 +121,7 @@ mod tests { fn test_aliases() { let mut reg = NameRegistry::new(); reg.register("Acme Corp", "acme-corp"); - reg.register_aliases( - &["AC".to_string(), "Acme".to_string()], - "acme-corp", - ); + reg.register_aliases(&["AC".to_string(), "Acme".to_string()], "acme-corp"); assert_eq!(reg.len(), 3); let result = reg.find_best("Acme"); diff --git a/openplanter-desktop/crates/op-core/src/wiki/mod.rs b/openplanter-desktop/crates/op-core/src/wiki/mod.rs index 149037b4..02051a6d 100644 --- a/openplanter-desktop/crates/op-core/src/wiki/mod.rs +++ b/openplanter-desktop/crates/op-core/src/wiki/mod.rs @@ -1,6 +1,6 @@ +pub mod matching; /// Wiki knowledge graph model (petgraph). /// /// Full implementation in Phase 5. 
pub mod parser; -pub mod matching; pub mod watcher; diff --git a/openplanter-desktop/crates/op-core/src/wiki/parser.rs b/openplanter-desktop/crates/op-core/src/wiki/parser.rs index 638222e3..f24746b5 100644 --- a/openplanter-desktop/crates/op-core/src/wiki/parser.rs +++ b/openplanter-desktop/crates/op-core/src/wiki/parser.rs @@ -37,9 +37,7 @@ pub fn parse_index(content: &str) -> Vec { if trimmed.starts_with("### ") { current_category = trimmed[4..].trim().to_lowercase(); // Normalize common category names - current_category = current_category - .replace(' ', "-") - .replace('_', "-"); + current_category = current_category.replace(' ', "-").replace('_', "-"); continue; } diff --git a/openplanter-desktop/crates/op-core/src/wiki/watcher.rs b/openplanter-desktop/crates/op-core/src/wiki/watcher.rs index a9d4ca86..19abf89d 100644 --- a/openplanter-desktop/crates/op-core/src/wiki/watcher.rs +++ b/openplanter-desktop/crates/op-core/src/wiki/watcher.rs @@ -37,34 +37,35 @@ impl WikiWatcher { ) -> std::io::Result<(Self, mpsc::UnboundedReceiver)> { let (tx, rx) = mpsc::unbounded_channel(); - let mut watcher = notify::recommended_watcher(move |result: Result| { - let event = match result { - Ok(e) => e, - Err(err) => { - eprintln!("[wiki-watcher] error: {err}"); - return; - } - }; - - let kind = match event.kind { - EventKind::Create(_) => WikiChangeKind::Created, - EventKind::Modify(_) => WikiChangeKind::Modified, - EventKind::Remove(_) => WikiChangeKind::Deleted, - _ => return, - }; - - for path in event.paths { - // Only watch .md files - if path.extension().and_then(|e| e.to_str()) != Some("md") { - continue; + let mut watcher = + notify::recommended_watcher(move |result: Result| { + let event = match result { + Ok(e) => e, + Err(err) => { + eprintln!("[wiki-watcher] error: {err}"); + return; + } + }; + + let kind = match event.kind { + EventKind::Create(_) => WikiChangeKind::Created, + EventKind::Modify(_) => WikiChangeKind::Modified, + EventKind::Remove(_) => 
WikiChangeKind::Deleted, + _ => return, + }; + + for path in event.paths { + // Only watch .md files + if path.extension().and_then(|e| e.to_str()) != Some("md") { + continue; + } + let _ = tx.send(WikiChangeEvent { + path, + kind: kind.clone(), + }); } - let _ = tx.send(WikiChangeEvent { - path, - kind: kind.clone(), - }); - } - }) - .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; + }) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; watcher .watch(&wiki_dir, RecursiveMode::Recursive) @@ -83,7 +84,7 @@ impl WikiWatcher { mod tests { use super::*; use tempfile::tempdir; - use tokio::time::{sleep, Duration}; + use tokio::time::{Duration, sleep}; #[tokio::test] async fn test_watcher_detects_create() { diff --git a/openplanter-desktop/crates/op-core/tests/test_model_streaming.rs b/openplanter-desktop/crates/op-core/tests/test_model_streaming.rs index 5e792de0..2b8eab1a 100644 --- a/openplanter-desktop/crates/op-core/tests/test_model_streaming.rs +++ b/openplanter-desktop/crates/op-core/tests/test_model_streaming.rs @@ -13,7 +13,7 @@ use axum::routing::post; use axum::Router; use tokio_util::sync::CancellationToken; -use op_core::events::{DeltaEvent, DeltaKind}; +use op_core::events::{CompletionMeta, DeltaEvent, DeltaKind, LoopMetrics}; use op_core::model::openai::OpenAIModel; use op_core::model::anthropic::AnthropicModel; use op_core::model::{BaseModel, Message}; @@ -448,7 +448,12 @@ async fn test_solve_with_mock_anthropic() { fn emit_step(&self, event: StepEvent) { self.events.lock().unwrap().push(Ev::Step(event)); } - fn emit_complete(&self, result: &str) { + fn emit_complete( + &self, + result: &str, + _: Option, + _: Option, + ) { self.events.lock().unwrap().push(Ev::Complete(result.to_string())); } fn emit_error(&self, message: &str) { @@ -539,7 +544,12 @@ async fn test_solve_with_mock_openai() { fn emit_step(&self, event: StepEvent) { self.events.lock().unwrap().push(Ev2::Step(event)); } - fn emit_complete(&self, result: 
&str) { + fn emit_complete( + &self, + result: &str, + _: Option, + _: Option, + ) { self.events.lock().unwrap().push(Ev2::Complete(result.to_string())); } fn emit_error(&self, message: &str) { @@ -619,7 +629,7 @@ async fn test_solve_http_error_emits_error() { fn emit_trace(&self, _: &str) {} fn emit_delta(&self, _: DeltaEvent) {} fn emit_step(&self, _: StepEvent) {} - fn emit_complete(&self, _: &str) {} + fn emit_complete(&self, _: &str, _: Option, _: Option) {} fn emit_error(&self, msg: &str) { self.errors.lock().unwrap().push(msg.to_string()); } @@ -664,7 +674,7 @@ async fn test_solve_cancel_emits_cancelled() { fn emit_trace(&self, _: &str) {} fn emit_delta(&self, _: DeltaEvent) {} fn emit_step(&self, _: StepEvent) {} - fn emit_complete(&self, _: &str) {} + fn emit_complete(&self, _: &str, _: Option, _: Option) {} fn emit_error(&self, msg: &str) { self.events.lock().unwrap().push(msg.to_string()); } @@ -707,7 +717,12 @@ async fn test_solve_demo_mode_bypasses_llm() { fn emit_trace(&self, _: &str) {} fn emit_delta(&self, _: DeltaEvent) {} fn emit_step(&self, _: StepEvent) {} - fn emit_complete(&self, result: &str) { + fn emit_complete( + &self, + result: &str, + _: Option, + _: Option, + ) { self.events.lock().unwrap().push(result.to_string()); } fn emit_error(&self, msg: &str) { @@ -746,7 +761,7 @@ async fn test_solve_missing_key_emits_error() { fn emit_trace(&self, _: &str) {} fn emit_delta(&self, _: DeltaEvent) {} fn emit_step(&self, _: StepEvent) {} - fn emit_complete(&self, _: &str) {} + fn emit_complete(&self, _: &str, _: Option, _: Option) {} fn emit_error(&self, msg: &str) { self.errors.lock().unwrap().push(msg.to_string()); } @@ -872,7 +887,12 @@ async fn test_solve_multi_step_agentic_loop() { fn emit_step(&self, event: StepEvent) { self.events.lock().unwrap().push(Ev3::Step(event)); } - fn emit_complete(&self, result: &str) { + fn emit_complete( + &self, + result: &str, + _: Option, + _: Option, + ) { 
self.events.lock().unwrap().push(Ev3::Complete(result.to_string())); } fn emit_error(&self, message: &str) { diff --git a/openplanter-desktop/crates/op-tauri/src/bridge.rs b/openplanter-desktop/crates/op-tauri/src/bridge.rs index 8904bd1d..ec7294f2 100644 --- a/openplanter-desktop/crates/op-tauri/src/bridge.rs +++ b/openplanter-desktop/crates/op-tauri/src/bridge.rs @@ -10,9 +10,53 @@ use std::sync::{Arc, Mutex}; use tauri::{AppHandle, Emitter}; use op_core::engine::SolveEmitter; -use op_core::events::{CompleteEvent, CuratorUpdateEvent, DeltaEvent, DeltaKind, ErrorEvent, StepEvent, TraceEvent}; +use op_core::events::{ + CompleteEvent, CompletionMeta, CuratorUpdateEvent, DeltaEvent, DeltaKind, ErrorEvent, + LoopHealthEvent, LoopMetrics, LoopPhase, StepEvent, TraceEvent, +}; use op_core::session::replay::{ReplayEntry, ReplayLogger, StepToolCallEntry}; +const MAX_STEP_MODEL_PREVIEW_CHARS: usize = 4 * 1024; +const MAX_TOOL_ARGS_CAPTURE_CHARS: usize = 16 * 1024; +const MAX_DELTA_LOG_CHARS: usize = 120; + +fn preview_text(text: &str, max_chars: usize) -> String { + if text.len() <= max_chars { + return text.to_string(); + } + + let end = text.floor_char_boundary(max_chars); + format!("{}...[truncated {} chars]", &text[..end], text.len() - end) +} + +fn append_with_cap(buffer: &mut String, text: &str, max_chars: usize, truncated: &mut bool) { + if *truncated { + return; + } + if buffer.len() >= max_chars { + *truncated = true; + return; + } + + let remaining = max_chars - buffer.len(); + let end = text.floor_char_boundary(text.len().min(remaining)); + buffer.push_str(&text[..end]); + if end < text.len() { + *truncated = true; + } +} + +fn format_model_preview(buffer: &str, truncated: bool) -> Option { + let trimmed = buffer.trim(); + if trimmed.is_empty() { + None + } else if truncated { + Some(format!("{trimmed}\n...[truncated]")) + } else { + Some(trimmed.to_string()) + } +} + pub struct TauriEmitter { handle: AppHandle, } @@ -35,21 +79,48 @@ impl SolveEmitter for 
TauriEmitter { } fn emit_delta(&self, event: DeltaEvent) { - eprintln!("[bridge] delta: kind={:?} text={:?}", event.kind, event.text); + match event.kind { + DeltaKind::ToolCallArgs => eprintln!( + "[bridge] delta: kind={:?} len={} preview={:?}", + event.kind, + event.text.len(), + preview_text(&event.text, MAX_DELTA_LOG_CHARS) + ), + _ if event.text.len() > MAX_DELTA_LOG_CHARS => eprintln!( + "[bridge] delta: kind={:?} len={} preview={:?}", + event.kind, + event.text.len(), + preview_text(&event.text, MAX_DELTA_LOG_CHARS) + ), + _ => eprintln!( + "[bridge] delta: kind={:?} text={:?}", + event.kind, event.text + ), + } let _ = self.handle.emit("agent:delta", event); } fn emit_step(&self, event: StepEvent) { - eprintln!("[bridge] step: depth={} step={} is_final={}", event.depth, event.step, event.is_final); + eprintln!( + "[bridge] step: depth={} step={} is_final={}", + event.depth, event.step, event.is_final + ); let _ = self.handle.emit("agent:step", event); } - fn emit_complete(&self, result: &str) { + fn emit_complete( + &self, + result: &str, + loop_metrics: Option, + completion: Option, + ) { eprintln!("[bridge] complete: {result}"); let _ = self.handle.emit( "agent:complete", CompleteEvent { result: result.to_string(), + loop_metrics, + completion, }, ); } @@ -64,6 +135,26 @@ impl SolveEmitter for TauriEmitter { ); } + fn emit_loop_health( + &self, + depth: u32, + step: u32, + phase: LoopPhase, + metrics: LoopMetrics, + is_final: bool, + ) { + let _ = self.handle.emit( + "agent:loop-health", + LoopHealthEvent { + depth, + step, + phase, + metrics, + is_final, + }, + ); + } + fn emit_curator_update(&self, summary: &str, files_changed: u32) { eprintln!("[bridge] curator update: {summary} ({files_changed} files)"); let _ = self.handle.emit( @@ -85,12 +176,16 @@ pub struct LoggingEmitter { replay: Arc>, /// Accumulated streaming text for the current step (std::sync for non-async ops). streaming_buf: Mutex, + /// Whether the current step preview was truncated. 
+ streaming_truncated: Mutex, /// Tool calls accumulated during the current step. step_tool_calls: Mutex>, /// Name of the tool currently being generated. current_tool: Mutex, /// Accumulated args JSON for the current tool. current_args_buf: Mutex, + /// Whether the current tool args buffer was truncated. + current_args_truncated: Mutex, } /// A tool call being accumulated during streaming. @@ -122,9 +217,11 @@ impl LoggingEmitter { inner, replay: Arc::new(tokio::sync::Mutex::new(replay)), streaming_buf: Mutex::new(String::new()), + streaming_truncated: Mutex::new(false), step_tool_calls: Mutex::new(Vec::new()), current_tool: Mutex::new(String::new()), current_args_buf: Mutex::new(String::new()), + current_args_truncated: Mutex::new(false), } } } @@ -138,12 +235,19 @@ impl SolveEmitter for LoggingEmitter { // Accumulate streaming data for step summary logging (sync — no I/O) match event.kind { DeltaKind::Text => { - self.streaming_buf.lock().unwrap().push_str(&event.text); + let mut truncated = self.streaming_truncated.lock().unwrap(); + append_with_cap( + &mut self.streaming_buf.lock().unwrap(), + &event.text, + MAX_STEP_MODEL_PREVIEW_CHARS, + &mut truncated, + ); } DeltaKind::ToolCallStart => { let tool_name = event.text.clone(); *self.current_tool.lock().unwrap() = tool_name.clone(); *self.current_args_buf.lock().unwrap() = String::new(); + *self.current_args_truncated.lock().unwrap() = false; self.step_tool_calls.lock().unwrap().push(PendingToolCall { name: tool_name, key_arg: String::new(), @@ -152,7 +256,13 @@ impl SolveEmitter for LoggingEmitter { } DeltaKind::ToolCallArgs => { let mut buf = self.current_args_buf.lock().unwrap(); - buf.push_str(&event.text); + let mut truncated = self.current_args_truncated.lock().unwrap(); + append_with_cap( + &mut buf, + &event.text, + MAX_TOOL_ARGS_CAPTURE_CHARS, + &mut truncated, + ); let tool_name = self.current_tool.lock().unwrap().clone(); if let Some(key_arg) = extract_key_arg(&tool_name, &buf) { let mut calls = 
self.step_tool_calls.lock().unwrap(); @@ -171,8 +281,7 @@ impl SolveEmitter for LoggingEmitter { // Collect accumulated data (sync) let model_preview = { let buf = self.streaming_buf.lock().unwrap(); - let trimmed = buf.trim().to_string(); - if trimmed.is_empty() { None } else { Some(trimmed) } + format_model_preview(&buf, *self.streaming_truncated.lock().unwrap()) }; let step_tools: Vec = { @@ -218,12 +327,21 @@ impl SolveEmitter for LoggingEmitter { // Reset buffers for next step self.streaming_buf.lock().unwrap().clear(); + *self.streaming_truncated.lock().unwrap() = false; self.step_tool_calls.lock().unwrap().clear(); + self.current_tool.lock().unwrap().clear(); + self.current_args_buf.lock().unwrap().clear(); + *self.current_args_truncated.lock().unwrap() = false; self.inner.emit_step(event); } - fn emit_complete(&self, result: &str) { + fn emit_complete( + &self, + result: &str, + loop_metrics: Option, + completion: Option, + ) { let entry = ReplayEntry { seq: 0, timestamp: String::new(), @@ -249,13 +367,25 @@ impl SolveEmitter for LoggingEmitter { }); }); - self.inner.emit_complete(result); + self.inner.emit_complete(result, loop_metrics, completion); } fn emit_error(&self, message: &str) { self.inner.emit_error(message); } + fn emit_loop_health( + &self, + depth: u32, + step: u32, + phase: LoopPhase, + metrics: LoopMetrics, + is_final: bool, + ) { + self.inner + .emit_loop_health(depth, step, phase, metrics, is_final); + } + fn emit_curator_update(&self, summary: &str, files_changed: u32) { // Log curator update to replay let entry = ReplayEntry { @@ -301,7 +431,7 @@ mod tests { fn emit_trace(&self, _: &str) {} fn emit_delta(&self, _: DeltaEvent) {} fn emit_step(&self, _: StepEvent) {} - fn emit_complete(&self, _: &str) {} + fn emit_complete(&self, _: &str, _: Option, _: Option) {} fn emit_error(&self, _: &str) {} } @@ -327,7 +457,12 @@ mod tests { assert_eq!(step.step_number, Some(1)); assert!(step.step_tokens_in.is_some()); 
assert!(step.step_model_preview.is_some()); - assert!(step.step_model_preview.as_ref().unwrap().contains("Test persistence")); + assert!( + step.step_model_preview + .as_ref() + .unwrap() + .contains("Test persistence") + ); let assistant = entries.iter().find(|e| e.role == "assistant"); assert!(assistant.is_some(), "expected an assistant entry"); @@ -355,20 +490,23 @@ mod tests { // 1. Log user message let mut replay = ReplayLogger::new(tmp.path()); - replay.append(ReplayEntry { - seq: 0, - timestamp: String::new(), - role: "user".into(), - content: "Roundtrip test".into(), - tool_name: None, - is_rendered: None, - step_number: None, - step_tokens_in: None, - step_tokens_out: None, - step_elapsed: None, - step_model_preview: None, - step_tool_calls: None, - }).await.unwrap(); + replay + .append(ReplayEntry { + seq: 0, + timestamp: String::new(), + role: "user".into(), + content: "Roundtrip test".into(), + tool_name: None, + is_rendered: None, + step_number: None, + step_tokens_in: None, + step_tokens_out: None, + step_elapsed: None, + step_model_preview: None, + step_tool_calls: None, + }) + .await + .unwrap(); // 2. Run demo_solve through LoggingEmitter let emitter = LoggingEmitter::new(NullEmitter, replay); @@ -377,7 +515,11 @@ mod tests { // 3. 
Read back full conversation let entries = ReplayLogger::read_all(tmp.path()).await.unwrap(); - assert!(entries.len() >= 3, "expected user + step-summary + assistant, got {}", entries.len()); + assert!( + entries.len() >= 3, + "expected user + step-summary + assistant, got {}", + entries.len() + ); assert_eq!(entries[0].role, "user"); assert_eq!(entries[0].content, "Roundtrip test"); @@ -394,4 +536,106 @@ mod tests { assert_eq!(entry.seq, (i + 1) as u64); } } + + #[derive(Default)] + struct CapturingEmitter { + deltas: Arc>>, + } + + impl SolveEmitter for CapturingEmitter { + fn emit_trace(&self, _: &str) {} + fn emit_delta(&self, event: DeltaEvent) { + self.deltas.lock().unwrap().push(event); + } + fn emit_step(&self, _: StepEvent) {} + fn emit_complete(&self, _: &str, _: Option, _: Option) {} + fn emit_error(&self, _: &str) {} + } + + #[tokio::test(flavor = "multi_thread")] + async fn test_logging_emitter_caps_model_preview_and_preserves_deltas() { + let tmp = tempdir().unwrap(); + let replay = ReplayLogger::new(tmp.path()); + let inner = CapturingEmitter::default(); + let deltas = inner.deltas.clone(); + let emitter = LoggingEmitter::new(inner, replay); + let big_text = "x".repeat(MAX_STEP_MODEL_PREVIEW_CHARS + 256); + + emitter.emit_delta(DeltaEvent { + kind: DeltaKind::Text, + text: big_text.clone(), + }); + emitter.emit_step(StepEvent { + depth: 0, + step: 1, + tool_name: None, + tokens: Default::default(), + elapsed_ms: 1, + is_final: false, + loop_phase: None, + loop_metrics: None, + }); + + let entries = ReplayLogger::read_all(tmp.path()).await.unwrap(); + let step = entries + .iter() + .find(|entry| entry.role == "step-summary") + .unwrap(); + let preview = step.step_model_preview.as_ref().unwrap(); + assert!(preview.contains("[truncated]")); + assert!(preview.len() < big_text.len()); + + let captured = deltas.lock().unwrap(); + assert_eq!(captured.len(), 1); + assert_eq!(captured[0].text, big_text); + } + + #[tokio::test(flavor = "multi_thread")] + async 
fn test_logging_emitter_caps_tool_args_buffer_and_keeps_key_arg() { + let tmp = tempdir().unwrap(); + let replay = ReplayLogger::new(tmp.path()); + let inner = CapturingEmitter::default(); + let deltas = inner.deltas.clone(); + let emitter = LoggingEmitter::new(inner, replay); + let filler = "x".repeat(MAX_TOOL_ARGS_CAPTURE_CHARS + 512); + + emitter.emit_delta(DeltaEvent { + kind: DeltaKind::ToolCallStart, + text: "read_file".to_string(), + }); + emitter.emit_delta(DeltaEvent { + kind: DeltaKind::ToolCallArgs, + text: "{\"path\":\"foo.md\",\"other\":\"".to_string(), + }); + emitter.emit_delta(DeltaEvent { + kind: DeltaKind::ToolCallArgs, + text: filler.clone(), + }); + + assert!(emitter.current_args_buf.lock().unwrap().len() <= MAX_TOOL_ARGS_CAPTURE_CHARS); + assert!(*emitter.current_args_truncated.lock().unwrap()); + + emitter.emit_step(StepEvent { + depth: 0, + step: 1, + tool_name: Some("read_file".into()), + tokens: Default::default(), + elapsed_ms: 1, + is_final: false, + loop_phase: None, + loop_metrics: None, + }); + + let entries = ReplayLogger::read_all(tmp.path()).await.unwrap(); + let step = entries + .iter() + .find(|entry| entry.role == "step-summary") + .unwrap(); + let tool_calls = step.step_tool_calls.as_ref().unwrap(); + assert_eq!(tool_calls[0].key_arg, "foo.md"); + + let captured = deltas.lock().unwrap(); + assert_eq!(captured.len(), 3); + assert_eq!(captured[2].text, filler); + } } diff --git a/openplanter-desktop/crates/op-tauri/src/commands/agent.rs b/openplanter-desktop/crates/op-tauri/src/commands/agent.rs index 2144ab50..f2a7b3af 100644 --- a/openplanter-desktop/crates/op-tauri/src/commands/agent.rs +++ b/openplanter-desktop/crates/op-tauri/src/commands/agent.rs @@ -1,11 +1,45 @@ +use std::path::Path; + use tauri::{AppHandle, Emitter, State}; use tokio_util::sync::CancellationToken; use crate::bridge::{LoggingEmitter, TauriEmitter}; use crate::commands::session::sessions_dir; use crate::state::AppState; +use 
op_core::engine::context::load_or_migrate_investigation_state; +use op_core::engine::investigation_state::{ + build_question_reasoning_packet, has_reasoning_content, +}; +use op_core::engine::{SolveEmitter, SolveInitialContext}; use op_core::session::replay::{ReplayEntry, ReplayLogger}; +async fn build_solve_initial_context( + session_dir: &Path, + session_id: &str, +) -> (SolveInitialContext, Option) { + let mut initial_context = SolveInitialContext { + session_id: Some(session_id.to_string()), + session_dir: Some(session_dir.display().to_string()), + question_reasoning_packet: None, + }; + + match load_or_migrate_investigation_state(session_dir).await { + Ok(state) => { + let packet = build_question_reasoning_packet(&state, 8, 6); + if has_reasoning_content(&packet) { + initial_context.question_reasoning_packet = Some(packet); + } + (initial_context, None) + } + Err(err) => ( + initial_context, + Some(format!( + "[solve] failed to load investigation state for reasoning packet; continuing without packet: {err}" + )), + ), + } +} + /// Start solving an objective. Result streamed via events. 
#[tauri::command] pub async fn solve( @@ -55,10 +89,33 @@ pub async fn solve( } let emitter = LoggingEmitter::new(TauriEmitter::new(app), replay); + let cwd = std::env::current_dir() + .map(|dir| dir.display().to_string()) + .unwrap_or_else(|_| "".to_string()); + emitter.emit_trace(&format!( + "[solve] pid={} cwd={} workspace={} session={}", + std::process::id(), + cwd, + cfg.workspace.display(), + session_id + )); + emitter.emit_trace(&format!("[startup:info] {}", state.startup_trace())); + let (initial_context, initial_context_warning) = + build_solve_initial_context(&session_dir, &session_id).await; + if let Some(warning) = initial_context_warning.as_deref() { + emitter.emit_trace(warning); + } tokio::spawn(async move { let result = tokio::spawn(async move { - op_core::engine::solve(&objective, &cfg, &emitter, token).await; + op_core::engine::solve_with_initial_context( + &objective, + &cfg, + &emitter, + token, + Some(initial_context), + ) + .await; }) .await; @@ -67,12 +124,7 @@ pub async fn solve( if let Err(e) = result { let msg = format!("Internal error: {e}"); eprintln!("[bridge] panic: {msg}"); - let _ = error_handle.emit( - "agent:error", - op_core::events::ErrorEvent { - message: msg, - }, - ); + let _ = error_handle.emit("agent:error", op_core::events::ErrorEvent { message: msg }); } }); @@ -81,9 +133,7 @@ pub async fn solve( /// Cancel a running solve. 
#[tauri::command] -pub async fn cancel( - state: State<'_, AppState>, -) -> Result<(), String> { +pub async fn cancel(state: State<'_, AppState>) -> Result<(), String> { let token = state.cancel_token.lock().await; token.cancel(); Ok(()) @@ -95,3 +145,54 @@ pub async fn debug_log(msg: String) -> Result<(), String> { eprintln!("[frontend] {msg}"); Ok(()) } + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::tempdir; + use tokio::fs; + + #[tokio::test] + async fn test_build_solve_initial_context_includes_packet_when_state_has_reasoning() { + let tmp = tempdir().unwrap(); + fs::write( + tmp.path().join("investigation_state.json"), + r#"{ + "schema_version":"1.0.0", + "session_id":"sid", + "questions":{"q_1":{"id":"q_1","question_text":"Open question","status":"open","priority":"high","claim_ids":["cl_1"]}}, + "claims":{"cl_1":{"id":"cl_1","claim_text":"Needs support","status":"unresolved","evidence_ids":["ev_1"]}}, + "evidence":{"ev_1":{"id":"ev_1","evidence_type":"web_fetch","source_uri":"https://example.test","provenance_ids":["pv_1"]}} + }"#, + ) + .await + .unwrap(); + + let (context, warning) = build_solve_initial_context(tmp.path(), "sid").await; + assert!(warning.is_none()); + let packet = context + .question_reasoning_packet + .expect("packet should be present"); + assert_eq!(packet["focus_question_ids"], serde_json::json!(["q_1"])); + assert_eq!( + packet["candidate_actions"][0]["id"], + serde_json::json!("ca_q_q_1") + ); + assert_eq!(context.session_id, Some("sid".to_string())); + assert_eq!(context.session_dir, Some(tmp.path().display().to_string())); + } + + #[tokio::test] + async fn test_build_solve_initial_context_degrades_to_no_packet_on_load_failure() { + let tmp = tempdir().unwrap(); + fs::write(tmp.path().join("investigation_state.json"), "{not-json") + .await + .unwrap(); + + let (context, warning) = build_solve_initial_context(tmp.path(), "sid").await; + assert!(warning.is_some()); + assert!(context.question_reasoning_packet.is_none()); + 
assert_eq!(context.session_id, Some("sid".to_string())); + assert_eq!(context.session_dir, Some(tmp.path().display().to_string())); + } +} diff --git a/openplanter-desktop/crates/op-tauri/src/commands/session.rs b/openplanter-desktop/crates/op-tauri/src/commands/session.rs index cc975958..d504afbc 100644 --- a/openplanter-desktop/crates/op-tauri/src/commands/session.rs +++ b/openplanter-desktop/crates/op-tauri/src/commands/session.rs @@ -1,9 +1,9 @@ -use std::fs; -use std::path::{Path, PathBuf}; -use tauri::State; use crate::state::AppState; use op_core::events::SessionInfo; use op_core::session::replay::{ReplayEntry, ReplayLogger}; +use std::fs; +use std::path::{Path, PathBuf}; +use tauri::State; /// Get the sessions directory path from config. pub async fn sessions_dir(state: &State<'_, AppState>) -> PathBuf { @@ -54,11 +54,7 @@ pub fn create_session(dir: &Path) -> Result { fs::create_dir_all(dir)?; let now = chrono::Utc::now(); - let new_id = format!( - "{}-{:08x}", - now.format("%Y%m%d-%H%M%S"), - rand_hex() - ); + let new_id = format!("{}-{:08x}", now.format("%Y%m%d-%H%M%S"), rand_hex()); let session_dir = dir.join(&new_id); fs::create_dir_all(&session_dir)?; @@ -120,10 +116,7 @@ pub async fn open_session( /// Delete a session by removing its directory. 
#[tauri::command] -pub async fn delete_session( - id: String, - state: State<'_, AppState>, -) -> Result<(), String> { +pub async fn delete_session(id: String, state: State<'_, AppState>) -> Result<(), String> { let dir = sessions_dir(&state).await; let session_dir = dir.join(&id); @@ -135,7 +128,9 @@ pub async fn delete_session( } // Ensure it's actually a session directory (has metadata.json) if !session_dir.join("metadata.json").exists() { - return Err(format!("Session '{id}' has no metadata — refusing to delete")); + return Err(format!( + "Session '{id}' has no metadata — refusing to delete" + )); } fs::remove_dir_all(&session_dir).map_err(|e| format!("Failed to delete session: {e}"))?; @@ -156,7 +151,9 @@ pub async fn get_session_history( state: State<'_, AppState>, ) -> Result, String> { let dir = sessions_dir(&state).await.join(&session_id); - ReplayLogger::read_all(&dir).await.map_err(|e| e.to_string()) + ReplayLogger::read_all(&dir) + .await + .map_err(|e| e.to_string()) } /// Update session metadata: increment turn_count, set last_objective. 
@@ -172,13 +169,12 @@ pub async fn update_session_metadata( let mut info: SessionInfo = serde_json::from_str(&content) .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?; info.turn_count += 1; - info.last_objective = Some( - if objective.len() > 100 { - format!("{}...", &objective[..97]) - } else { - objective.to_string() - }, - ); + info.last_objective = Some(if objective.len() > 100 { + let end = objective.floor_char_boundary(97); + format!("{}...", &objective[..end]) + } else { + objective.to_string() + }); let json = serde_json::to_string_pretty(&info) .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; tokio::fs::write(&meta_path, json).await @@ -317,8 +313,14 @@ mod tests { let info = create_session(&dir).unwrap(); let session_dir = dir.join(&info.id); assert!(session_dir.exists(), "session dir should exist"); - assert!(session_dir.join("artifacts").exists(), "artifacts/ should exist"); - assert!(session_dir.join("metadata.json").exists(), "metadata.json should exist"); + assert!( + session_dir.join("artifacts").exists(), + "artifacts/ should exist" + ); + assert!( + session_dir.join("metadata.json").exists(), + "metadata.json should exist" + ); } #[test] diff --git a/openplanter-desktop/crates/op-tauri/src/commands/wiki.rs b/openplanter-desktop/crates/op-tauri/src/commands/wiki.rs index de84697e..53df0a6e 100644 --- a/openplanter-desktop/crates/op-tauri/src/commands/wiki.rs +++ b/openplanter-desktop/crates/op-tauri/src/commands/wiki.rs @@ -1,16 +1,15 @@ +use crate::state::AppState; +use op_core::events::{GraphData, GraphEdge, GraphNode, NodeType}; +use regex::Regex; use std::collections::{HashMap, HashSet}; use std::fs; use std::path::{Path, PathBuf}; use std::sync::LazyLock; -use regex::Regex; use tauri::State; -use crate::state::AppState; -use op_core::events::{GraphData, GraphEdge, GraphNode, NodeType}; static LINK_RE: LazyLock = LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\(([^)]+\.md)\)").unwrap()); -static CATEGORY_RE: 
LazyLock = - LazyLock::new(|| Regex::new(r"^#{2,3}\s+(.+)").unwrap()); +static CATEGORY_RE: LazyLock = LazyLock::new(|| Regex::new(r"^#{2,3}\s+(.+)").unwrap()); /// Walk up from `start` to find a directory containing `wiki/index.md`. /// Checks both `.openplanter/wiki/` (preferred) and `wiki/` at each level. @@ -117,27 +116,47 @@ pub fn parse_index_nodes(content: &str) -> Vec { /// Extract distinctive search terms from a node's label for text-based matching. fn search_terms_for_node(node: &GraphNode) -> Vec { let stopwords: HashSet<&str> = [ - "a", "an", "the", "of", "and", "or", "in", "to", "for", "by", - "on", "at", "is", "it", "its", "us", "gov", "list", - ].into_iter().collect(); + "a", "an", "the", "of", "and", "or", "in", "to", "for", "by", "on", "at", "is", "it", + "its", "us", "gov", "list", + ] + .into_iter() + .collect(); let generic: HashSet<&str> = [ - "federal", "state", "united", "states", "government", "bureau", - "department", "database", "national", "public", - ].into_iter().collect(); + "federal", + "state", + "united", + "states", + "government", + "bureau", + "department", + "database", + "national", + "public", + ] + .into_iter() + .collect(); let mut terms = Vec::new(); // Full label (lowercased) terms.push(node.label.to_lowercase()); - for word in node.label.split(|c: char| c.is_whitespace() || c == '/' || c == '(' || c == ')') { - let clean: String = word.chars() + for word in node + .label + .split(|c: char| c.is_whitespace() || c == '/' || c == '(' || c == ')') + { + let clean: String = word + .chars() .filter(|c| c.is_alphanumeric() || *c == '.' || *c == '-') .collect(); - if clean.is_empty() { continue; } + if clean.is_empty() { + continue; + } let lower = clean.to_lowercase(); - if stopwords.contains(lower.as_str()) { continue; } + if stopwords.contains(lower.as_str()) { + continue; + } // Acronyms: all uppercase, >= 2 chars (OCPF, FEC, EDGAR, FDIC, etc.) 
let alpha_chars: String = clean.chars().filter(|c| c.is_alphabetic()).collect(); @@ -165,15 +184,16 @@ pub fn find_cross_references(nodes: &[GraphNode], wiki_dir: &Path) -> Vec = HashSet::new(); // Pre-compute search terms for all nodes - let node_terms: Vec> = nodes.iter() - .map(|n| search_terms_for_node(n)) - .collect(); + let node_terms: Vec> = nodes.iter().map(|n| search_terms_for_node(n)).collect(); // Read all file contents upfront - let file_contents: HashMap = nodes.iter() + let file_contents: HashMap = nodes + .iter() .filter_map(|node| { let file_path = wiki_dir.join(&node.path); - fs::read_to_string(&file_path).ok().map(|c| (node.id.clone(), c)) + fs::read_to_string(&file_path) + .ok() + .map(|c| (node.id.clone(), c)) }) .collect(); @@ -207,11 +227,17 @@ pub fn find_cross_references(nodes: &[GraphNode], wiki_dir: &Path) -> Vec = nodes.iter().map(|n| n.id.clone()).collect(); // Find section IDs that are the source of at least one structural child edge - let parent_section_ids: HashSet<&str> = edges.iter() + let parent_section_ids: HashSet<&str> = edges + .iter() .filter(|e| { let label = e.label.as_deref().unwrap_or(""); (label == "has-section" || label == "contains") && node_ids.contains(&e.target) @@ -499,7 +526,8 @@ pub fn parse_source_file( .collect(); // IDs to remove: childless sections + empty-content facts - let remove_ids: HashSet = nodes.iter() + let remove_ids: HashSet = nodes + .iter() .filter(|n| { match n.node_type.as_ref() { Some(NodeType::Section) => !parent_section_ids.contains(n.id.as_str()), @@ -554,9 +582,10 @@ pub fn extract_cross_ref_edges( continue; } // Check if this fact is under a cross-reference section - let in_cross_ref = node.parent_id.as_ref().map_or(false, |pid| { - pid.contains("cross-reference") - }); + let in_cross_ref = node + .parent_id + .as_ref() + .map_or(false, |pid| pid.contains("cross-reference")); if !in_cross_ref { continue; } @@ -604,15 +633,21 @@ pub fn find_shared_field_edges(all_nodes: &[GraphNode]) -> Vec 
{ continue; } // Check if this fact is under a data-schema section - let in_data_schema = node.parent_id.as_ref().map_or(false, |pid| { - pid.contains("data-schema") - }); + let in_data_schema = node + .parent_id + .as_ref() + .map_or(false, |pid| pid.contains("data-schema")); if !in_data_schema { continue; } // Normalize field name: lowercase, strip backticks - let normalized = node.label.to_lowercase().replace('`', "").trim().to_string(); + let normalized = node + .label + .to_lowercase() + .replace('`', "") + .trim() + .to_string(); if !normalized.is_empty() { field_map.entry(normalized).or_default().push(node); } @@ -651,13 +686,16 @@ pub fn find_shared_field_edges(all_nodes: &[GraphNode]) -> Vec { /// Get the wiki knowledge graph data by parsing wiki/index.md and all source files. #[tauri::command] -pub async fn get_graph_data( - state: State<'_, AppState>, -) -> Result { +pub async fn get_graph_data(state: State<'_, AppState>) -> Result { let cfg = state.config.lock().await; let wiki_dir = match find_wiki_dir(&cfg.workspace) { Some(d) => d, - None => return Ok(GraphData { nodes: vec![], edges: vec![] }), + None => { + return Ok(GraphData { + nodes: vec![], + edges: vec![], + }); + } }; let index_path = wiki_dir.join("index.md"); @@ -690,15 +728,15 @@ pub async fn get_graph_data( let shared_field_edges = find_shared_field_edges(&all_nodes); all_edges.extend(shared_field_edges); - Ok(GraphData { nodes: all_nodes, edges: all_edges }) + Ok(GraphData { + nodes: all_nodes, + edges: all_edges, + }) } /// Read a wiki markdown file's contents, given a relative path like "wiki/fec.md". 
#[tauri::command] -pub async fn read_wiki_file( - path: String, - state: State<'_, AppState>, -) -> Result { +pub async fn read_wiki_file(path: String, state: State<'_, AppState>) -> Result { // Validate: must end in .md if !path.ends_with(".md") { return Err("Path must end in .md".into()); @@ -713,14 +751,16 @@ pub async fn read_wiki_file( } let cfg = state.config.lock().await; - let wiki_dir = find_wiki_dir(&cfg.workspace) - .ok_or_else(|| "Wiki directory not found".to_string())?; + let wiki_dir = + find_wiki_dir(&cfg.workspace).ok_or_else(|| "Wiki directory not found".to_string())?; let project_root = wiki_dir.parent().unwrap_or(&cfg.workspace); let resolved = project_root.join(&path); // Canonicalize and verify it's under the wiki dir - let canonical = resolved.canonicalize().map_err(|e| format!("File not found: {e}"))?; + let canonical = resolved + .canonicalize() + .map_err(|e| format!("File not found: {e}"))?; let canon_wiki = wiki_dir.canonicalize().map_err(|e| e.to_string())?; if !canonical.starts_with(&canon_wiki) { return Err("Path is outside wiki directory".into()); @@ -862,7 +902,9 @@ mod tests { label: "A".to_string(), category: "test".to_string(), path: "wiki/a.md".to_string(), - node_type: None, parent_id: None, content: None, + node_type: None, + parent_id: None, + content: None, }]; let edges = find_cross_references(&nodes, tmp.path()); assert!(edges.is_empty()); @@ -883,14 +925,18 @@ mod tests { label: "A".to_string(), category: "test".to_string(), path: "wiki/a.md".to_string(), - node_type: None, parent_id: None, content: None, + node_type: None, + parent_id: None, + content: None, }, GraphNode { id: "b".to_string(), label: "B".to_string(), category: "test".to_string(), path: "wiki/b.md".to_string(), - node_type: None, parent_id: None, content: None, + node_type: None, + parent_id: None, + content: None, }, ]; let edges = find_cross_references(&nodes, tmp.path()); @@ -996,7 +1042,10 @@ mod tests { // project_root should be .openplanter/ so 
joining with wiki/fec.md works let project_root = found.parent().unwrap(); let file_path = project_root.join(&nodes[0].path); - assert!(file_path.exists(), "should resolve to .openplanter/wiki/fec.md"); + assert!( + file_path.exists(), + "should resolve to .openplanter/wiki/fec.md" + ); } #[test] @@ -1046,7 +1095,11 @@ mod tests { let wiki_dir = tmp.path().join("wiki"); fs::create_dir_all(&wiki_dir).unwrap(); // File A mentions EDGAR (from B's label "SEC EDGAR") but doesn't link to it - fs::write(wiki_dir.join("a.md"), "Cross-reference with EDGAR filings for details.").unwrap(); + fs::write( + wiki_dir.join("a.md"), + "Cross-reference with EDGAR filings for details.", + ) + .unwrap(); fs::write(wiki_dir.join("b.md"), "# SEC EDGAR\nContent.").unwrap(); let nodes = vec![ @@ -1055,14 +1108,18 @@ mod tests { label: "FEC Data".to_string(), category: "campaign-finance".to_string(), path: "wiki/a.md".to_string(), - node_type: None, parent_id: None, content: None, + node_type: None, + parent_id: None, + content: None, }, GraphNode { id: "b".to_string(), label: "SEC EDGAR".to_string(), category: "corporate".to_string(), path: "wiki/b.md".to_string(), - node_type: None, parent_id: None, content: None, + node_type: None, + parent_id: None, + content: None, }, ]; let edges = find_cross_references(&nodes, tmp.path()); @@ -1080,17 +1137,20 @@ mod tests { // File A mentions its own label — should not create edge fs::write(wiki_dir.join("a.md"), "# EDGAR\nThis is SEC EDGAR data.").unwrap(); - let nodes = vec![ - GraphNode { - id: "a".to_string(), - label: "SEC EDGAR".to_string(), - category: "corporate".to_string(), - path: "wiki/a.md".to_string(), - node_type: None, parent_id: None, content: None, - }, - ]; + let nodes = vec![GraphNode { + id: "a".to_string(), + label: "SEC EDGAR".to_string(), + category: "corporate".to_string(), + path: "wiki/a.md".to_string(), + node_type: None, + parent_id: None, + content: None, + }]; let edges = find_cross_references(&nodes, tmp.path()); - 
assert!(edges.is_empty(), "should not create self-referencing edge from text mention"); + assert!( + edges.is_empty(), + "should not create self-referencing edge from text mention" + ); } #[test] @@ -1107,14 +1167,18 @@ mod tests { label: "EPA Data".to_string(), category: "regulatory".to_string(), path: "wiki/a.md".to_string(), - node_type: None, parent_id: None, content: None, + node_type: None, + parent_id: None, + content: None, }, GraphNode { id: "b".to_string(), label: "OSHA Inspections".to_string(), category: "regulatory".to_string(), path: "wiki/b.md".to_string(), - node_type: None, parent_id: None, content: None, + node_type: None, + parent_id: None, + content: None, }, ]; let edges = find_cross_references(&nodes, tmp.path()); @@ -1136,14 +1200,18 @@ mod tests { label: "A Data".to_string(), category: "test".to_string(), path: "wiki/a.md".to_string(), - node_type: None, parent_id: None, content: None, + node_type: None, + parent_id: None, + content: None, }, GraphNode { id: "b".to_string(), label: "SEC EDGAR".to_string(), category: "corporate".to_string(), path: "wiki/b.md".to_string(), - node_type: None, parent_id: None, content: None, + node_type: None, + parent_id: None, + content: None, }, ]; let edges = find_cross_references(&nodes, tmp.path()); @@ -1163,7 +1231,9 @@ mod tests { label: "A".to_string(), category: "test".to_string(), path: "wiki/a.md".to_string(), - node_type: None, parent_id: None, content: None, + node_type: None, + parent_id: None, + content: None, }]; let edges = find_cross_references(&nodes, tmp.path()); assert!(edges.is_empty(), "self-references should be excluded"); @@ -1174,7 +1244,10 @@ mod tests { #[test] fn test_slugify_basic() { assert_eq!(slugify("Data Schema"), "data-schema"); - assert_eq!(slugify("Cross-Reference Potential"), "cross-reference-potential"); + assert_eq!( + slugify("Cross-Reference Potential"), + "cross-reference-potential" + ); assert_eq!(slugify("Legal & Licensing"), "legal-licensing"); assert_eq!(slugify(" 
multiple spaces "), "multiple-spaces"); } @@ -1258,13 +1331,19 @@ mod tests { let (nodes, edges) = parse_source_file(&source, content); // Data Schema + 2 subsections + 2 facts = 5 assert_eq!(nodes.len(), 5); - let sections: Vec<_> = nodes.iter().filter(|n| n.node_type == Some(NodeType::Section)).collect(); + let sections: Vec<_> = nodes + .iter() + .filter(|n| n.node_type == Some(NodeType::Section)) + .collect(); assert_eq!(sections.len(), 3); // Subsections are children of the h2 assert_eq!(sections[1].parent_id.as_deref(), Some("fec::data-schema")); assert_eq!(sections[2].parent_id.as_deref(), Some("fec::data-schema")); // has-section edges - let has_section: Vec<_> = edges.iter().filter(|e| e.label.as_deref() == Some("has-section")).collect(); + let has_section: Vec<_> = edges + .iter() + .filter(|e| e.label.as_deref() == Some("has-section")) + .collect(); assert_eq!(has_section.len(), 3); } @@ -1275,16 +1354,26 @@ mod tests { let (nodes, edges) = parse_source_file(&source, content); // 1 section + 2 facts assert_eq!(nodes.len(), 3); - let facts: Vec<_> = nodes.iter().filter(|n| n.node_type == Some(NodeType::Fact)).collect(); + let facts: Vec<_> = nodes + .iter() + .filter(|n| n.node_type == Some(NodeType::Fact)) + .collect(); assert_eq!(facts.len(), 2); assert_eq!(facts[0].label, "Jurisdiction"); assert_eq!(facts[1].label, "Time range"); // Facts should have content assert!(facts[0].content.as_ref().unwrap().contains("Federal")); // Facts parented to section - assert!(facts.iter().all(|f| f.parent_id.as_deref() == Some("fec::coverage"))); + assert!( + facts + .iter() + .all(|f| f.parent_id.as_deref() == Some("fec::coverage")) + ); // Contains edges - let contains: Vec<_> = edges.iter().filter(|e| e.label.as_deref() == Some("contains")).collect(); + let contains: Vec<_> = edges + .iter() + .filter(|e| e.label.as_deref() == Some("contains")) + .collect(); assert_eq!(contains.len(), 2); } @@ -1293,13 +1382,22 @@ mod tests { let source = make_source("fec"); let 
content = "## Coverage\n\n- **Time range**:\n - Records: 1979-present\n - Contributions: 1979-present\n- **Jurisdiction**: Federal"; let (nodes, _) = parse_source_file(&source, content); - let facts: Vec<_> = nodes.iter().filter(|n| n.node_type == Some(NodeType::Fact)).collect(); + let facts: Vec<_> = nodes + .iter() + .filter(|n| n.node_type == Some(NodeType::Fact)) + .collect(); assert_eq!(facts.len(), 2); // Time range should have accumulated sub-bullets let time_range = facts.iter().find(|f| f.label == "Time range").unwrap(); let content = time_range.content.as_ref().unwrap(); - assert!(content.contains("Records: 1979-present"), "should contain sub-bullet"); - assert!(content.contains("Contributions: 1979-present"), "should contain second sub-bullet"); + assert!( + content.contains("Records: 1979-present"), + "should contain sub-bullet" + ); + assert!( + content.contains("Contributions: 1979-present"), + "should contain second sub-bullet" + ); } #[test] @@ -1308,7 +1406,10 @@ mod tests { // Bold bullet with NO sub-bullets and NO value after colon → should be pruned let content = "## Coverage\n\n- **Empty**:\n- **Jurisdiction**: Federal"; let (nodes, _) = parse_source_file(&source, content); - let facts: Vec<_> = nodes.iter().filter(|n| n.node_type == Some(NodeType::Fact)).collect(); + let facts: Vec<_> = nodes + .iter() + .filter(|n| n.node_type == Some(NodeType::Fact)) + .collect(); // "Empty" should be pruned, only "Jurisdiction" remains assert_eq!(facts.len(), 1); assert_eq!(facts[0].label, "Jurisdiction"); @@ -1320,7 +1421,10 @@ mod tests { let content = "## Data Schema\n\n| Field | Description |\n|-------|-------------|\n| `candidate_id` | Unique ID |\n| `name` | Full name |"; let (nodes, edges) = parse_source_file(&source, content); // 1 section + 2 fact rows (header + separator skipped) - let facts: Vec<_> = nodes.iter().filter(|n| n.node_type == Some(NodeType::Fact)).collect(); + let facts: Vec<_> = nodes + .iter() + .filter(|n| n.node_type == 
Some(NodeType::Fact)) + .collect(); assert_eq!(facts.len(), 2); assert_eq!(facts[0].label, "candidate_id"); // backticks stripped assert_eq!(facts[1].label, "name"); @@ -1331,7 +1435,10 @@ mod tests { let source = make_source("fec"); let content = "## Schema\n\n| Header1 | Header2 |\n| --- | --- |\n| value1 | desc1 |"; let (nodes, _edges) = parse_source_file(&source, content); - let facts: Vec<_> = nodes.iter().filter(|n| n.node_type == Some(NodeType::Fact)).collect(); + let facts: Vec<_> = nodes + .iter() + .filter(|n| n.node_type == Some(NodeType::Fact)) + .collect(); assert_eq!(facts.len(), 1); assert_eq!(facts[0].label, "value1"); } @@ -1339,11 +1446,17 @@ mod tests { #[test] fn test_parse_fact_parents_correct() { let source = make_source("fec"); - let content = "## Data Schema\n\n### Candidate Records\n\n| Field | Desc |\n|---|---|\n| cid | ID |"; + let content = + "## Data Schema\n\n### Candidate Records\n\n| Field | Desc |\n|---|---|\n| cid | ID |"; let (nodes, _) = parse_source_file(&source, content); let fact = nodes.iter().find(|n| n.label == "cid").unwrap(); // Fact should be parented to the h3 section, not the h2 - assert!(fact.parent_id.as_ref().unwrap().contains("candidate-records")); + assert!( + fact.parent_id + .as_ref() + .unwrap() + .contains("candidate-records") + ); } #[test] @@ -1352,7 +1465,10 @@ mod tests { // Two sections with same name, each with a fact so they survive pruning let content = "## Summary\n\n- **A**: 1\n\n## Summary\n\n- **B**: 2"; let (nodes, _) = parse_source_file(&source, content); - let sections: Vec<_> = nodes.iter().filter(|n| n.node_type == Some(NodeType::Section)).collect(); + let sections: Vec<_> = nodes + .iter() + .filter(|n| n.node_type == Some(NodeType::Section)) + .collect(); assert_eq!(sections.len(), 2); assert_eq!(sections[0].id, "fec::summary"); assert_eq!(sections[1].id, "fec::summary-2"); // deduplicated @@ -1393,15 +1509,27 @@ Overview paragraph. 
Links here."; let (nodes, edges) = parse_source_file(&source, content); - let sections: Vec<_> = nodes.iter().filter(|n| n.node_type == Some(NodeType::Section)).collect(); - let facts: Vec<_> = nodes.iter().filter(|n| n.node_type == Some(NodeType::Fact)).collect(); + let sections: Vec<_> = nodes + .iter() + .filter(|n| n.node_type == Some(NodeType::Section)) + .collect(); + let facts: Vec<_> = nodes + .iter() + .filter(|n| n.node_type == Some(NodeType::Fact)) + .collect(); // Summary and References pruned (no children), Coverage + Data Schema + Records remain = 3 assert_eq!(sections.len(), 3); // 2 bullets + 2 table rows = 4 facts assert_eq!(facts.len(), 4); // Structural edges: 2 has-section (Coverage→source, Data Schema→source) + 1 has-section (Records→Data Schema) + 4 contains - let has_section_count = edges.iter().filter(|e| e.label.as_deref() == Some("has-section")).count(); - let contains_count = edges.iter().filter(|e| e.label.as_deref() == Some("contains")).count(); + let has_section_count = edges + .iter() + .filter(|e| e.label.as_deref() == Some("has-section")) + .count(); + let contains_count = edges + .iter() + .filter(|e| e.label.as_deref() == Some("contains")) + .count(); assert_eq!(has_section_count, 3); assert_eq!(contains_count, 4); } @@ -1481,7 +1609,10 @@ Links here."; let all_nodes = vec![source_a.clone(), source_b.clone(), fact]; let source_nodes = vec![source_a, source_b]; let edges = extract_cross_ref_edges(&all_nodes, &source_nodes); - assert!(edges.is_empty(), "should only match facts under cross-reference sections"); + assert!( + edges.is_empty(), + "should only match facts under cross-reference sections" + ); } // ── find_shared_field_edges ── @@ -1532,7 +1663,10 @@ Links here."; content: None, }; let edges = find_shared_field_edges(&vec![fact_a, fact_b]); - assert!(edges.is_empty(), "should not create edge between same-source facts"); + assert!( + edges.is_empty(), + "should not create edge between same-source facts" + ); } #[test] @@ 
-1601,7 +1735,9 @@ Links here."; content: None, }; let edges = find_shared_field_edges(&vec![fact_a, fact_b]); - assert!(edges.is_empty(), "should only match facts under data-schema sections"); + assert!( + edges.is_empty(), + "should only match facts under data-schema sections" + ); } - } diff --git a/openplanter-desktop/crates/op-tauri/src/main.rs b/openplanter-desktop/crates/op-tauri/src/main.rs index e5b80c36..edf948cf 100644 --- a/openplanter-desktop/crates/op-tauri/src/main.rs +++ b/openplanter-desktop/crates/op-tauri/src/main.rs @@ -1,16 +1,19 @@ // Prevents additional console window on Windows in release. #![cfg_attr(not(debug_assertions), windows_subsystem = "windows")] -mod state; mod bridge; mod commands; +mod state; use state::AppState; fn main() { + let state = AppState::new(); + eprintln!("[startup:info] {}", state.startup_trace()); + tauri::Builder::default() .plugin(tauri_plugin_shell::init()) - .manage(AppState::new()) + .manage(state) .invoke_handler(tauri::generate_handler![ commands::agent::solve, commands::agent::cancel, @@ -27,8 +30,6 @@ fn main() { commands::wiki::get_graph_data, commands::wiki::read_wiki_file, ]) - .run(tauri::generate_context!( - "tauri.conf.json" - )) + .run(tauri::generate_context!("tauri.conf.json")) .expect("error while running tauri application"); } diff --git a/openplanter-desktop/crates/op-tauri/tauri.conf.json b/openplanter-desktop/crates/op-tauri/tauri.conf.json index d5d88bda..3d46600b 100644 --- a/openplanter-desktop/crates/op-tauri/tauri.conf.json +++ b/openplanter-desktop/crates/op-tauri/tauri.conf.json @@ -5,7 +5,7 @@ "build": { "frontendDist": "../../frontend/dist", "devUrl": "http://localhost:5173", - "beforeDevCommand": "", + "beforeDevCommand": "npm run dev", "beforeBuildCommand": "npm run build" }, "app": { diff --git a/openplanter-desktop/frontend/src/api/events.test.ts b/openplanter-desktop/frontend/src/api/events.test.ts index f7620ae7..859e859e 100644 --- 
a/openplanter-desktop/frontend/src/api/events.test.ts +++ b/openplanter-desktop/frontend/src/api/events.test.ts @@ -16,7 +16,9 @@ import { onAgentStep, onAgentDelta, onAgentComplete, + onAgentCompleteEvent, onAgentError, + onLoopHealth, onWikiUpdated, } from "./events"; @@ -46,7 +48,6 @@ describe("event listeners", () => { const handler = listeners.get("agent:step")!; const payload = { - type: "step", step: 1, depth: 0, tokens: { input_tokens: 100, output_tokens: 50 }, @@ -62,7 +63,7 @@ describe("event listeners", () => { await onAgentDelta(callback); const handler = listeners.get("agent:delta")!; - const payload = { type: "delta", kind: "text", text: "hello" }; + const payload = { kind: "text", text: "hello" }; handler({ payload }); expect(callback).toHaveBeenCalledWith(payload); }); @@ -72,10 +73,45 @@ describe("event listeners", () => { await onAgentComplete(callback); const handler = listeners.get("agent:complete")!; - handler({ payload: { result: "final answer" } }); + handler({ + payload: { + result: "final answer", + loop_metrics: { final_rejections: 1 }, + }, + }); expect(callback).toHaveBeenCalledWith("final answer"); }); + it("onAgentCompleteEvent registers listener and forwards full payload", async () => { + const callback = vi.fn(); + await onAgentCompleteEvent(callback); + + const handler = listeners.get("agent:complete")!; + const payload = { + result: "final answer", + loop_metrics: { + steps: 2, + model_turns: 2, + tool_calls: 1, + investigate_steps: 1, + build_steps: 0, + iterate_steps: 0, + finalize_steps: 1, + recon_streak: 0, + max_recon_streak: 1, + guardrail_warnings: 0, + final_rejections: 1, + extensions_granted: 0, + extension_eligible_checks: 0, + extension_denials_no_progress: 0, + extension_denials_cap: 0, + termination_reason: "success", + }, + }; + handler({ payload }); + expect(callback).toHaveBeenCalledWith(payload); + }); + it("onAgentError registers listener and extracts message", async () => { const callback = vi.fn(); await 
onAgentError(callback); @@ -98,6 +134,38 @@ describe("event listeners", () => { expect(callback).toHaveBeenCalledWith(graphData); }); + it("onLoopHealth registers listener and forwards payload", async () => { + const callback = vi.fn(); + await onLoopHealth(callback); + + const handler = listeners.get("agent:loop-health")!; + const payload = { + depth: 0, + step: 3, + phase: "investigate", + metrics: { + steps: 3, + model_turns: 3, + tool_calls: 2, + investigate_steps: 2, + build_steps: 0, + iterate_steps: 0, + finalize_steps: 0, + recon_streak: 2, + max_recon_streak: 2, + guardrail_warnings: 1, + final_rejections: 1, + extensions_granted: 0, + extension_eligible_checks: 1, + extension_denials_no_progress: 1, + extension_denials_cap: 0, + termination_reason: "budget_no_progress", + }, + is_final: false, + }; + handler({ payload }); + expect(callback).toHaveBeenCalledWith(payload); + }); it("all listeners return unlisten function", async () => { const noop = vi.fn(); const unlistens = await Promise.all([ @@ -105,7 +173,9 @@ describe("event listeners", () => { onAgentStep(noop), onAgentDelta(noop), onAgentComplete(noop), + onAgentCompleteEvent(noop), onAgentError(noop), + onLoopHealth(noop), onWikiUpdated(noop), ]); for (const u of unlistens) { diff --git a/openplanter-desktop/frontend/src/api/events.ts b/openplanter-desktop/frontend/src/api/events.ts index 30cb0704..f736485c 100644 --- a/openplanter-desktop/frontend/src/api/events.ts +++ b/openplanter-desktop/frontend/src/api/events.ts @@ -1,6 +1,13 @@ /** Tauri event subscriptions. 
*/ import { listen, type UnlistenFn } from "@tauri-apps/api/event"; -import type { AgentEvent, CuratorUpdateEvent, GraphData } from "./types"; +import type { + CompleteEvent, + CuratorUpdateEvent, + DeltaEvent, + GraphData, + LoopHealthEvent, + StepEvent, +} from "./types"; export function onAgentTrace( callback: (message: string) => void @@ -11,23 +18,27 @@ export function onAgentTrace( } export function onAgentStep( - callback: (event: AgentEvent & { type: "step" }) => void + callback: (event: StepEvent) => void ): Promise { - return listen("agent:step", (e) => callback(e.payload as any)); + return listen("agent:step", (e) => callback(e.payload)); } export function onAgentDelta( - callback: (event: AgentEvent & { type: "delta" }) => void + callback: (event: DeltaEvent) => void ): Promise { - return listen("agent:delta", (e) => callback(e.payload as any)); + return listen("agent:delta", (e) => callback(e.payload)); +} + +export function onAgentCompleteEvent( + callback: (event: CompleteEvent) => void +): Promise { + return listen("agent:complete", (e) => callback(e.payload)); } export function onAgentComplete( callback: (result: string) => void ): Promise { - return listen<{ result: string }>("agent:complete", (e) => - callback(e.payload.result) - ); + return onAgentCompleteEvent((event) => callback(event.result)); } export function onAgentError( @@ -51,3 +62,8 @@ export function onCuratorUpdate( callback(e.payload) ); } +export function onLoopHealth( + callback: (event: LoopHealthEvent) => void +): Promise { + return listen("agent:loop-health", (e) => callback(e.payload)); +} diff --git a/openplanter-desktop/frontend/src/api/types.ts b/openplanter-desktop/frontend/src/api/types.ts index a47e1fb4..6464f98c 100644 --- a/openplanter-desktop/frontend/src/api/types.ts +++ b/openplanter-desktop/frontend/src/api/types.ts @@ -9,6 +9,27 @@ export interface TraceEvent { message: string; } +export type LoopPhase = "investigate" | "build" | "iterate" | "finalize"; + +export 
interface LoopMetrics { + steps: number; + model_turns: number; + tool_calls: number; + investigate_steps: number; + build_steps: number; + iterate_steps: number; + finalize_steps: number; + recon_streak: number; + max_recon_streak: number; + guardrail_warnings: number; + final_rejections: number; + extensions_granted: number; + extension_eligible_checks: number; + extension_denials_no_progress: number; + extension_denials_cap: number; + termination_reason: string; +} + export interface StepEvent { depth: number; step: number; @@ -16,6 +37,8 @@ export interface StepEvent { tokens: TokenUsage; elapsed_ms: number; is_final: boolean; + loop_phase?: LoopPhase; + loop_metrics?: LoopMetrics; } export type DeltaKind = "text" | "thinking" | "tool_call_start" | "tool_call_args"; @@ -25,8 +48,28 @@ export interface DeltaEvent { text: string; } +export interface CompletionMeta { + kind: string; + reason: string; + steps_used: number; + max_steps: number; + extensions_granted: number; + extension_block_steps: number; + extension_max_blocks: number; +} + export interface CompleteEvent { result: string; + loop_metrics?: LoopMetrics; + completion?: CompletionMeta; +} + +export interface LoopHealthEvent { + depth: number; + step: number; + phase: LoopPhase; + metrics: LoopMetrics; + is_final: boolean; } export interface ErrorEvent { @@ -130,8 +173,19 @@ export interface ReplayEntry { export type AgentEvent = | { type: "trace"; message: string } - | { type: "step"; depth: number; step: number; tool_name: string | null; tokens: TokenUsage; elapsed_ms: number; is_final: boolean } + | { + type: "step"; + depth: number; + step: number; + tool_name: string | null; + tokens: TokenUsage; + elapsed_ms: number; + is_final: boolean; + loop_phase?: LoopPhase; + loop_metrics?: LoopMetrics; + } | { type: "delta"; kind: DeltaKind; text: string } - | { type: "complete"; result: string } + | { type: "complete"; result: string; loop_metrics?: LoopMetrics; completion?: CompletionMeta } | { type: 
"error"; message: string } - | { type: "wiki_updated"; nodes: GraphNode[]; edges: GraphEdge[] }; + | { type: "wiki_updated"; nodes: GraphNode[]; edges: GraphEdge[] } + | { type: "loop_health"; depth: number; step: number; phase: LoopPhase; metrics: LoopMetrics; is_final: boolean }; diff --git a/openplanter-desktop/frontend/src/components/App.ts b/openplanter-desktop/frontend/src/components/App.ts index 22047445..a2766096 100644 --- a/openplanter-desktop/frontend/src/components/App.ts +++ b/openplanter-desktop/frontend/src/components/App.ts @@ -98,6 +98,9 @@ async function switchToNewSession(sessionList: HTMLElement): Promise { outputTokens: 0, currentStep: 0, currentDepth: 0, + loopHealth: null, + lastLoopMetrics: null, + lastCompletion: null, inputQueue: [], })); // Dispatch event to clear ChatPane DOM @@ -155,6 +158,9 @@ async function switchToSession(sessionId: string, sessionList: HTMLElement): Pro outputTokens: 0, currentStep: 0, currentDepth: 0, + loopHealth: null, + lastLoopMetrics: null, + lastCompletion: null, inputQueue: [], })); // Dispatch event to clear ChatPane DOM diff --git a/openplanter-desktop/frontend/src/components/InputBar.ts b/openplanter-desktop/frontend/src/components/InputBar.ts index cad43240..d174cb45 100644 --- a/openplanter-desktop/frontend/src/components/InputBar.ts +++ b/openplanter-desktop/frontend/src/components/InputBar.ts @@ -96,6 +96,11 @@ export function createInputBar(): HTMLElement { appState.update((s) => ({ ...s, isRunning: true, + currentStep: 0, + currentDepth: 0, + loopHealth: null, + lastLoopMetrics: null, + lastCompletion: null, messages: [ ...s.messages, { @@ -224,6 +229,11 @@ export function createInputBar(): HTMLElement { appState.update((s) => ({ ...s, isRunning: true, + currentStep: 0, + currentDepth: 0, + loopHealth: null, + lastLoopMetrics: null, + lastCompletion: null, messages: [ ...s.messages, { diff --git a/openplanter-desktop/frontend/src/main.ts b/openplanter-desktop/frontend/src/main.ts index 
c797da10..193ba796 100644 --- a/openplanter-desktop/frontend/src/main.ts +++ b/openplanter-desktop/frontend/src/main.ts @@ -3,11 +3,12 @@ import { getConfig } from "./api/invoke"; import { onAgentTrace, onAgentDelta, - onAgentComplete, + onAgentCompleteEvent, onAgentError, onAgentStep, onWikiUpdated, onCuratorUpdate, + onLoopHealth, } from "./api/events"; import { appState } from "./state/store"; @@ -93,12 +94,11 @@ async function init() { outputTokens: s.outputTokens + event.tokens.output_tokens, currentStep: event.step, currentDepth: event.depth, + lastLoopMetrics: event.loop_metrics ?? s.lastLoopMetrics, })); // Dispatch to ChatPane for rich step summary rendering - window.dispatchEvent( - new CustomEvent("agent-step", { detail: event }) - ); + window.dispatchEvent(new CustomEvent("agent-step", { detail: event })); }); await onAgentDelta((event) => { @@ -106,21 +106,35 @@ async function init() { window.dispatchEvent(detail); }); - await onAgentComplete((result) => { + await onAgentCompleteEvent((event) => { appState.update((s) => ({ ...s, isRunning: false, currentStep: 0, currentDepth: 0, + loopHealth: null, + lastLoopMetrics: event.loop_metrics ?? s.lastLoopMetrics, + lastCompletion: event.completion ?? null, messages: [ ...s.messages, { id: crypto.randomUUID(), role: "assistant" as const, - content: result, + content: event.result, timestamp: Date.now(), isRendered: true, }, + ...(event.completion?.kind === "partial" + ? [ + { + id: crypto.randomUUID(), + role: "system" as const, + content: + "Partial completion: the run used its bounded step budget and stopped cleanly. 
Resume to continue from the saved state.", + timestamp: Date.now(), + }, + ] + : []), ], })); @@ -134,6 +148,8 @@ async function init() { isRunning: false, currentStep: 0, currentDepth: 0, + loopHealth: null, + lastCompletion: null, messages: [ ...s.messages, { @@ -154,22 +170,16 @@ async function init() { window.dispatchEvent(detail); }); - await onCuratorUpdate((event) => { + await onCuratorUpdate(() => { + // Notify graph pane to refresh with curator's wiki changes + window.dispatchEvent(new CustomEvent("curator-done")); + }); + await onLoopHealth((event) => { appState.update((s) => ({ ...s, - messages: [ - ...s.messages, - { - id: crypto.randomUUID(), - role: "system" as const, - content: `[Wiki Curator] ${event.summary}`, - timestamp: Date.now(), - }, - ], + loopHealth: event, + lastLoopMetrics: event.metrics, })); - - // Notify graph pane to refresh with curator's wiki changes - window.dispatchEvent(new CustomEvent("curator-done")); }); } @@ -180,7 +190,7 @@ function processQueue() { appState.update((s) => ({ ...s, inputQueue: rest })); // Dispatch queued-submit event for InputBar to pick up window.dispatchEvent( - new CustomEvent("queued-submit", { detail: { text: next } }) + new CustomEvent("queued-submit", { detail: { text: next } }), ); } } diff --git a/openplanter-desktop/frontend/src/state/store.ts b/openplanter-desktop/frontend/src/state/store.ts index 1cd3d3fb..5f5cc6ba 100644 --- a/openplanter-desktop/frontend/src/state/store.ts +++ b/openplanter-desktop/frontend/src/state/store.ts @@ -1,4 +1,9 @@ /** Simple observable state store. 
*/ +import type { + CompletionMeta, + LoopMetrics, + LoopHealthEvent, +} from "../api/types"; type Listener = (value: T) => void; export class Store { @@ -73,6 +78,9 @@ export interface AppState { maxStepsPerCall: number; currentStep: number; currentDepth: number; + loopHealth: LoopHealthEvent | null; + lastLoopMetrics: LoopMetrics | null; + lastCompletion: CompletionMeta | null; inputHistory: string[]; inputQueue: string[]; } @@ -92,6 +100,9 @@ export const appState = new Store({ maxStepsPerCall: 100, currentStep: 0, currentDepth: 0, + loopHealth: null, + lastLoopMetrics: null, + lastCompletion: null, inputHistory: [], inputQueue: [], }); diff --git a/tests/test_boundary_conditions.py b/tests/test_boundary_conditions.py index ab1d1714..17c452f9 100644 --- a/tests/test_boundary_conditions.py +++ b/tests/test_boundary_conditions.py @@ -145,7 +145,7 @@ def test_single_step_exhaustion(self) -> None: ]) engine = _make_engine(root, model, max_steps_per_call=1) result = engine.solve("one step only") - self.assertIn("Step budget exhausted", result) + self.assertIn("Partial completion for objective", result) # --------------------------------------------------------------------------- diff --git a/tests/test_engine.py b/tests/test_engine.py index c0780fb9..4e8c58bc 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -136,6 +136,98 @@ def test_runtime_policy_blocks_repeated_shell_command(self) -> None: "expected policy block observation in context", ) + def test_meta_text_not_accepted_as_final_answer(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + cfg = AgentConfig(workspace=root, max_depth=1, max_steps_per_call=4, acceptance_criteria=False) + tools = WorkspaceTools(root=root) + model = ScriptedModel( + scripted_turns=[ + ModelTurn(text="Here is my plan: I will inspect files and then implement.", stop_reason="end_turn"), + ModelTurn(text="Concrete result delivered.", stop_reason="end_turn"), + ] + ) + engine = 
RLMEngine(model=model, tools=tools, config=cfg) + result = engine.solve("meta final rejection") + self.assertEqual(result, "Concrete result delivered.") + self.assertEqual(engine.last_loop_metrics.get("final_rejections"), 1) + + def test_plan_objective_allows_structural_meta_final(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + cfg = AgentConfig(workspace=root, max_depth=1, max_steps_per_call=2, acceptance_criteria=False) + tools = WorkspaceTools(root=root) + model = ScriptedModel( + scripted_turns=[ + ModelTurn(text="Here is my plan for finishing the task.", stop_reason="end_turn"), + ] + ) + engine = RLMEngine(model=model, tools=tools, config=cfg) + result = engine.solve("Draft a plan for finishing the task") + self.assertEqual(result, "Here is my plan for finishing the task.") + self.assertEqual(engine.last_loop_metrics.get("final_rejections"), 0) + + def test_plan_objective_still_rejects_strong_process_meta(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + cfg = AgentConfig(workspace=root, max_depth=1, max_steps_per_call=4, acceptance_criteria=False) + tools = WorkspaceTools(root=root) + model = ScriptedModel( + scripted_turns=[ + ModelTurn(text="Here is my plan: I will inspect files and then implement.", stop_reason="end_turn"), + ModelTurn(text="Concrete planning deliverable.", stop_reason="end_turn"), + ] + ) + engine = RLMEngine(model=model, tools=tools, config=cfg) + result = engine.solve("Write an implementation plan for the fix") + self.assertEqual(result, "Concrete planning deliverable.") + self.assertEqual(engine.last_loop_metrics.get("final_rejections"), 1) + + def test_soft_guardrail_fires_once_per_recon_episode(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + cfg = AgentConfig(workspace=root, max_depth=1, max_steps_per_call=7, acceptance_criteria=False) + tools = WorkspaceTools(root=root) + model = ScriptedModel( + scripted_turns=[ + 
ModelTurn(tool_calls=[_tc("list_files")]), + ModelTurn(tool_calls=[_tc("search_files", query="x")]), + ModelTurn(tool_calls=[_tc("repo_map")]), + ModelTurn(tool_calls=[_tc("list_files")]), + ModelTurn(text="done", stop_reason="end_turn"), + ] + ) + engine = RLMEngine(model=model, tools=tools, config=cfg) + result, ctx = engine.solve_with_context("trigger recon guardrail") + self.assertEqual(result, "done") + warnings = [obs for obs in ctx.observations if "Soft guardrail" in obs] + self.assertEqual(len(warnings), 1) + self.assertEqual(int(engine.last_loop_metrics.get("guardrail_warnings", 0)), 1) + + def test_soft_guardrail_resets_for_second_recon_episode(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + cfg = AgentConfig(workspace=root, max_depth=1, max_steps_per_call=9, acceptance_criteria=False) + tools = WorkspaceTools(root=root) + model = ScriptedModel( + scripted_turns=[ + ModelTurn(tool_calls=[_tc("list_files")]), + ModelTurn(tool_calls=[_tc("search_files", query="x")]), + ModelTurn(tool_calls=[_tc("repo_map")]), + ModelTurn(tool_calls=[_tc("write_file", path="artifact.txt", content="data")]), + ModelTurn(tool_calls=[_tc("list_files")]), + ModelTurn(tool_calls=[_tc("search_files", query="x")]), + ModelTurn(tool_calls=[_tc("repo_map")]), + ModelTurn(text="done", stop_reason="end_turn"), + ] + ) + engine = RLMEngine(model=model, tools=tools, config=cfg) + result, ctx = engine.solve_with_context("trigger two recon episodes") + self.assertEqual(result, "done") + warnings = [obs for obs in ctx.observations if "Soft guardrail" in obs] + self.assertEqual(len(warnings), 2) + self.assertEqual(int(engine.last_loop_metrics.get("guardrail_warnings", 0)), 2) + class CustomSystemPromptTests(unittest.TestCase): def test_custom_system_prompt_override(self) -> None: @@ -172,6 +264,14 @@ def test_flat_prompt_excludes_repl(self) -> None: prompt = _build_system_prompt(recursive=False) self.assertNotIn("REPL STRUCTURE", prompt) + def 
test_prompt_includes_question_centric_reasoning_rules(self) -> None: + prompt = _build_system_prompt(recursive=False) + self.assertIn("QUESTION-CENTRIC REASONING", prompt) + self.assertIn("supported / contested / unresolved", prompt) + self.assertIn("Supported Findings", prompt) + self.assertIn("candidate_actions", prompt) + self.assertIn("machine-readable, read-only", prompt) + def test_recursive_initial_message_has_repl_hint(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: root = Path(tmpdir) @@ -219,6 +319,46 @@ def create_conversation(self, system_prompt: str, initial_user_message: str): parsed = json.loads(captured[0]) self.assertNotIn("repl_hint", parsed) + def test_initial_message_includes_question_reasoning_packet(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + cfg = AgentConfig(workspace=root, max_depth=2, max_steps_per_call=3, recursive=False) + tools = WorkspaceTools(root=root) + + captured: list[str] = [] + + class CapturingModel(ScriptedModel): + def create_conversation(self, system_prompt: str, initial_user_message: str): + captured.append(initial_user_message) + return super().create_conversation(system_prompt, initial_user_message) + + model = CapturingModel(scripted_turns=[ + ModelTurn(text="done", stop_reason="end_turn"), + ]) + engine = RLMEngine(model=model, tools=tools, config=cfg) + packet = { + "reasoning_mode": "question_centric", + "focus_question_ids": ["q_1"], + "unresolved_questions": [{"id": "q_1", "question": "Open question"}], + "findings": {"supported": [], "contested": [], "unresolved": []}, + "contradictions": [], + "evidence_index": {}, + "candidate_actions": [ + { + "id": "ca_q_q_1", + "action_type": "search", + "status": "proposed", + "priority": "high", + } + ], + } + + engine.solve_with_context("test objective", question_reasoning_packet=packet) + + self.assertEqual(len(captured), 1) + parsed = json.loads(captured[0]) + self.assertEqual(parsed["question_reasoning_packet"], 
packet) + @dataclass class ThreadSafeScriptedModel: diff --git a/tests/test_engine_complex.py b/tests/test_engine_complex.py index e5bb29b7..2bed324f 100644 --- a/tests/test_engine_complex.py +++ b/tests/test_engine_complex.py @@ -8,7 +8,7 @@ from conftest import _tc from agent.config import AgentConfig from agent.engine import RLMEngine, ExternalContext -from agent.model import ModelTurn, ScriptedModel +from agent.model import Conversation, ModelTurn, RateLimitError, ScriptedModel, ToolResult from agent.tools import WorkspaceTools @@ -33,7 +33,88 @@ def test_step_budget_exhaustion(self) -> None: ) engine = RLMEngine(model=model, tools=tools, config=cfg) result = engine.solve("infinite thinking") - self.assertIn("Step budget exhausted", result) + self.assertIn("Partial completion for objective", result) + self.assertEqual(engine.last_loop_metrics.get("termination_reason"), "budget_no_progress") + + def test_budget_extension_granted_on_real_progress(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + cfg = AgentConfig( + workspace=root, + max_depth=1, + max_steps_per_call=2, + budget_extension_enabled=True, + budget_extension_block_steps=2, + budget_extension_max_blocks=1, + ) + tools = WorkspaceTools(root=root) + model = ScriptedModel( + scripted_turns=[ + ModelTurn(tool_calls=[_tc("run_shell", command="printf 'alpha\\n'")]), + ModelTurn(tool_calls=[_tc("write_file", path="artifact.txt", content="artifact")]), + ModelTurn(text="done after extension", stop_reason="end_turn"), + ] + ) + engine = RLMEngine(model=model, tools=tools, config=cfg) + result = engine.solve("real progress") + self.assertEqual(result, "done after extension") + self.assertEqual(engine.last_loop_metrics.get("extensions_granted"), 1) + self.assertEqual(engine.last_loop_metrics.get("termination_reason"), "success") + + def test_budget_extension_denied_on_high_failure_ratio(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + cfg 
= AgentConfig( + workspace=root, + max_depth=1, + max_steps_per_call=3, + budget_extension_enabled=True, + budget_extension_block_steps=2, + budget_extension_max_blocks=1, + ) + tools = WorkspaceTools(root=root) + model = ScriptedModel( + scripted_turns=[ + ModelTurn(tool_calls=[_tc("read_file", path="missing-a.txt")]), + ModelTurn(tool_calls=[_tc("read_file", path="missing-b.txt")]), + ModelTurn(tool_calls=[_tc("run_shell", command="printf 'ok\\n'")]), + ] + ) + engine = RLMEngine(model=model, tools=tools, config=cfg) + result = engine.solve("failure-heavy objective") + self.assertIn("Partial completion for objective", result) + blockers = engine.last_loop_metrics.get("last_budget_extension_eval", {}).get("blockers", []) + self.assertIn("high_failure_ratio", blockers) + + def test_budget_extension_cap_produces_partial_completion(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + cfg = AgentConfig( + workspace=root, + max_depth=1, + max_steps_per_call=2, + budget_extension_enabled=True, + budget_extension_block_steps=2, + budget_extension_max_blocks=1, + ) + tools = WorkspaceTools(root=root) + model = ScriptedModel( + scripted_turns=[ + ModelTurn(tool_calls=[_tc("write_file", path="one.txt", content="one")]), + ModelTurn(tool_calls=[_tc("write_file", path="two.txt", content="two")]), + ModelTurn(tool_calls=[_tc("write_file", path="three.txt", content="three")]), + ModelTurn(tool_calls=[_tc("write_file", path="four.txt", content="four")]), + ] + ) + engine = RLMEngine(model=model, tools=tools, config=cfg) + result = engine.solve("cap objective") + self.assertIn("Partial completion for objective", result) + self.assertEqual(engine.last_loop_metrics.get("termination_reason"), "budget_cap") + self.assertEqual(engine.last_loop_metrics.get("extensions_granted"), 1) + self.assertLessEqual( + int(engine.last_loop_metrics.get("steps", 0)), + cfg.max_steps_per_call + cfg.budget_extension_block_steps * cfg.budget_extension_max_blocks, + ) # 
------------------------------------------------------------------ # 2. Nested subtasks at depth 2 (3-level recursion) @@ -617,7 +698,7 @@ def test_step_budget_message_includes_objective(self) -> None: ) engine = RLMEngine(model=model, tools=tools, config=cfg) result = engine.solve("my specific objective") - self.assertIn("Step budget exhausted", result) + self.assertIn("Partial completion for objective", result) self.assertIn("my specific objective", result) # ------------------------------------------------------------------ @@ -640,6 +721,117 @@ def test_think_tool_observation(self) -> None: self.assertEqual(result, "done") self.assertIn("Thought noted: my thought", returned_ctx.observations[0]) + # ------------------------------------------------------------------ + # 30. Rate-limit retries succeed without consuming extra step budget + # ------------------------------------------------------------------ + def test_rate_limit_retries_then_succeeds(self) -> None: + class RetryThenSuccessModel: + def __init__(self) -> None: + self.calls = 0 + + def create_conversation(self, system_prompt: str, initial_user_message: str) -> Conversation: + return Conversation(_provider_messages=[{"role": "user", "content": initial_user_message}]) + + def complete(self, conversation: Conversation) -> ModelTurn: + self.calls += 1 + if self.calls == 1: + raise RateLimitError( + "rate limit", + status_code=429, + provider_code="1302", + ) + return ModelTurn(text="done", stop_reason="end_turn") + + def append_assistant_turn(self, conversation: Conversation, turn: ModelTurn) -> None: + pass + + def append_tool_results(self, conversation: Conversation, results: list[ToolResult]) -> None: + pass + + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + cfg = AgentConfig( + workspace=root, + max_depth=1, + max_steps_per_call=1, + rate_limit_max_retries=3, + rate_limit_backoff_base_sec=0.0, + rate_limit_backoff_max_sec=0.0, + rate_limit_retry_after_cap_sec=0.0, + ) + tools = 
WorkspaceTools(root=root) + model = RetryThenSuccessModel() + engine = RLMEngine(model=model, tools=tools, config=cfg) + with patch("agent.engine.random.uniform", return_value=0.0): + result = engine.solve("retry test") + self.assertEqual(result, "done") + self.assertEqual(model.calls, 2) + + # ------------------------------------------------------------------ + # 31. Exhausted rate-limit retries surfaces model error + # ------------------------------------------------------------------ + def test_rate_limit_retries_exhausted_returns_model_error(self) -> None: + class AlwaysRateLimitModel: + def create_conversation(self, system_prompt: str, initial_user_message: str) -> Conversation: + return Conversation(_provider_messages=[{"role": "user", "content": initial_user_message}]) + + def complete(self, conversation: Conversation) -> ModelTurn: + raise RateLimitError("still rate limited", status_code=429, provider_code="1302") + + def append_assistant_turn(self, conversation: Conversation, turn: ModelTurn) -> None: + pass + + def append_tool_results(self, conversation: Conversation, results: list[ToolResult]) -> None: + pass + + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + cfg = AgentConfig( + workspace=root, + max_depth=1, + max_steps_per_call=1, + rate_limit_max_retries=2, + rate_limit_backoff_base_sec=0.0, + rate_limit_backoff_max_sec=0.0, + rate_limit_retry_after_cap_sec=0.0, + ) + tools = WorkspaceTools(root=root) + engine = RLMEngine(model=AlwaysRateLimitModel(), tools=tools, config=cfg) + with patch("agent.engine.random.uniform", return_value=0.0): + result = engine.solve("retry test") + self.assertIn("Model error at depth 0, step 1", result) + + # ------------------------------------------------------------------ + # 32. 
Deadline exits gracefully during rate-limit wait + # ------------------------------------------------------------------ + def test_rate_limit_wait_respects_deadline(self) -> None: + class SlowRateLimitModel: + def create_conversation(self, system_prompt: str, initial_user_message: str) -> Conversation: + return Conversation(_provider_messages=[{"role": "user", "content": initial_user_message}]) + + def complete(self, conversation: Conversation) -> ModelTurn: + raise RateLimitError("wait", status_code=429, retry_after_sec=10.0) + + def append_assistant_turn(self, conversation: Conversation, turn: ModelTurn) -> None: + pass + + def append_tool_results(self, conversation: Conversation, results: list[ToolResult]) -> None: + pass + + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + cfg = AgentConfig( + workspace=root, + max_depth=1, + max_steps_per_call=1, + max_solve_seconds=1, + rate_limit_max_retries=3, + ) + tools = WorkspaceTools(root=root) + engine = RLMEngine(model=SlowRateLimitModel(), tools=tools, config=cfg) + result = engine.solve("deadline retry test") + self.assertIn("Time limit exceeded", result) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_investigation_state.py b/tests/test_investigation_state.py new file mode 100644 index 00000000..73e4404a --- /dev/null +++ b/tests/test_investigation_state.py @@ -0,0 +1,371 @@ +from __future__ import annotations + +import json +import tempfile +import unittest +from pathlib import Path + +from agent.investigation_state import ( + build_question_reasoning_packet, + migrate_legacy_state, + state_to_legacy_projection, +) +from agent.runtime import SessionStore + + +class InvestigationStateMigrationTests(unittest.TestCase): + def test_migrate_legacy_state_creates_structured_evidence(self) -> None: + legacy = { + "session_id": "sid", + "saved_at": "2026-03-13T00:00:00+00:00", + "external_observations": ["obs a", "obs b"], + "turn_history": [{"turn_number": 1}], + "loop_metrics": 
{"turns": 1}, + "custom_field": "keep me", + } + state = migrate_legacy_state("sid", legacy) + + self.assertEqual(state["schema_version"], "1.0.0") + self.assertEqual(state["legacy"]["external_observations"], ["obs a", "obs b"]) + self.assertEqual(state["legacy"]["extra_fields"]["custom_field"], "keep me") + self.assertEqual( + state["evidence"]["ev_legacy_000001"]["evidence_type"], + "legacy_observation", + ) + self.assertEqual( + state["evidence"]["ev_legacy_000002"]["source_uri"], + "state.json#external_observations[1]", + ) + + def test_state_to_legacy_projection_falls_back_to_evidence(self) -> None: + state = { + "schema_version": "1.0.0", + "session_id": "sid", + "updated_at": "2026-03-13T00:00:00+00:00", + "legacy": {"turn_history": [], "loop_metrics": {}, "extra_fields": {"custom_field": "hello"}}, + "evidence": { + "ev_legacy_000002": { + "content": "second", + "normalization": {"kind": "legacy_observation"}, + }, + "ev_legacy_000001": { + "content": "first", + "normalization": {"kind": "legacy_observation"}, + }, + }, + } + + projected = state_to_legacy_projection(state, session_id="sid") + self.assertEqual(projected["external_observations"], ["first", "second"]) + self.assertEqual(projected["custom_field"], "hello") + + +class SessionStoreTypedStateTests(unittest.TestCase): + def test_save_state_writes_typed_file_and_typed_first_load_preserves_extras(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + store = SessionStore(workspace=root) + sid, _, _ = store.open_session(session_id="typed-save", resume=False) + + store.save_state( + sid, + { + "session_id": sid, + "saved_at": "2026-03-13T12:00:00+00:00", + "external_observations": ["alpha", "beta"], + "turn_history": [{"turn_number": 1}], + "loop_metrics": {"turns": 1}, + "custom_field": "hello", + }, + ) + + session_dir = root / ".openplanter" / "sessions" / sid + typed_path = session_dir / "investigation_state.json" + self.assertTrue(typed_path.exists()) + + typed = 
json.loads(typed_path.read_text(encoding="utf-8")) + self.assertEqual(typed["legacy"]["extra_fields"]["custom_field"], "hello") + self.assertEqual(typed["evidence"]["ev_legacy_000001"]["content"], "alpha") + + (session_dir / "state.json").write_text("{}", encoding="utf-8") + loaded = store.load_state(sid) + self.assertEqual(loaded["external_observations"], ["alpha", "beta"]) + self.assertEqual(loaded["custom_field"], "hello") + self.assertEqual(loaded["turn_history"], [{"turn_number": 1}]) + + def test_load_state_accepts_legacy_rust_external_context_shape(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + store = SessionStore(workspace=root) + sid, _, _ = store.open_session(session_id="rust-legacy", resume=False) + session_dir = root / ".openplanter" / "sessions" / sid + + (session_dir / "state.json").write_text( + json.dumps( + { + "observations": [ + { + "source": "wiki", + "timestamp": "2026-03-13T00:00:00Z", + "content": "obs one", + }, + { + "source": "tool", + "timestamp": "2026-03-13T00:00:01Z", + "content": "obs two", + }, + ], + "custom_field": "preserve-me", + } + ), + encoding="utf-8", + ) + + loaded = store.load_state(sid) + self.assertEqual(loaded["external_observations"], ["obs one", "obs two"]) + self.assertEqual(loaded["custom_field"], "preserve-me") + + def test_save_state_preserves_existing_typed_fields_and_prunes_only_legacy_evidence(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + store = SessionStore(workspace=root) + sid, _, _ = store.open_session(session_id="typed-merge", resume=False) + session_dir = root / ".openplanter" / "sessions" / sid + + typed = { + "schema_version": "1.0.0", + "session_id": sid, + "created_at": "2026-03-13T00:00:00+00:00", + "updated_at": "2026-03-13T00:00:00+00:00", + "objective": "", + "ontology": {"namespace": "openplanter.core", "version": "2026-03"}, + "entities": {}, + "links": {}, + "claims": {}, + "evidence": { + "ev_legacy_000001": { + 
"id": "ev_legacy_000001", + "content": "stale", + "normalization": {"kind": "legacy_observation"}, + }, + "ev_legacy_000002": { + "id": "ev_legacy_000002", + "content": "remove me", + "normalization": {"kind": "legacy_observation"}, + }, + "ev_other": { + "id": "ev_other", + "content": "keep me", + "normalization": {"kind": "web_fetch"}, + }, + }, + "hypotheses": {}, + "questions": {"q_1": {"id": "q_1", "question_text": "keep me"}}, + "tasks": {}, + "actions": {}, + "provenance_nodes": {}, + "confidence_profiles": {}, + "timeline": [], + "indexes": {"by_external_ref": {}, "by_tag": {}}, + "legacy": { + "external_observations": ["stale", "remove me"], + "turn_history": [], + "loop_metrics": {}, + "extra_fields": {"custom_field": "before"}, + }, + } + (session_dir / "investigation_state.json").write_text( + json.dumps(typed), + encoding="utf-8", + ) + + store.save_state( + sid, + { + "session_id": sid, + "saved_at": "2026-03-13T12:30:00+00:00", + "external_observations": ["fresh"], + "turn_history": [{"turn_number": 3}], + "loop_metrics": {"turns": 3}, + "custom_field": "after", + }, + ) + + updated = json.loads((session_dir / "investigation_state.json").read_text(encoding="utf-8")) + self.assertEqual(updated["questions"]["q_1"]["question_text"], "keep me") + self.assertIn("ev_other", updated["evidence"]) + self.assertEqual(updated["evidence"]["ev_legacy_000001"]["content"], "fresh") + self.assertNotIn("ev_legacy_000002", updated["evidence"]) + self.assertEqual(updated["legacy"]["extra_fields"]["custom_field"], "after") + + projected = json.loads((session_dir / "state.json").read_text(encoding="utf-8")) + self.assertEqual(projected["external_observations"], ["fresh"]) + self.assertEqual(projected["custom_field"], "after") + + +class QuestionReasoningPacketTests(unittest.TestCase): + def test_build_question_reasoning_packet_groups_findings_and_contradictions(self) -> None: + state = { + "questions": { + "q_2": { + "id": "q_2", + "question_text": "Is claim 2 true?", + 
"status": "open", + "priority": "high", + "claim_ids": ["cl_2"], + "evidence_ids": ["ev_2"], + }, + "q_1": { + "id": "q_1", + "question_text": "Is claim 1 true?", + "status": "open", + "priority": "critical", + "claim_ids": ["cl_1"], + "evidence_ids": ["ev_1", "ev_3"], + }, + "q_done": { + "id": "q_done", + "question_text": "Ignore", + "status": "resolved", + }, + }, + "claims": { + "cl_1": { + "claim_text": "Claim supported", + "status": "supported", + "support_evidence_ids": ["ev_1"], + "confidence": 0.91, + }, + "cl_2": { + "claim_text": "Claim contested", + "status": "contested", + "support_evidence_ids": ["ev_2"], + "contradiction_evidence_ids": ["ev_3"], + "confidence_score": 0.4, + }, + "cl_3": { + "claim_text": "Claim unresolved", + "status": "unresolved", + "evidence_ids": ["ev_4"], + }, + }, + "evidence": { + "ev_1": {"evidence_type": "doc", "provenance_ids": ["pv_1"], "source_uri": "s1"}, + "ev_2": {"evidence_type": "doc", "provenance_ids": ["pv_2"], "source_uri": "s2"}, + "ev_3": {"evidence_type": "doc", "provenance_ids": ["pv_3"], "source_uri": "s3"}, + "ev_4": {"evidence_type": "doc", "provenance_ids": ["pv_4"], "source_uri": "s4"}, + }, + } + + packet = build_question_reasoning_packet(state) + + self.assertEqual(packet["reasoning_mode"], "question_centric") + self.assertEqual(packet["focus_question_ids"], ["q_1", "q_2"]) + self.assertEqual(len(packet["findings"]["supported"]), 1) + self.assertEqual(packet["findings"]["supported"][0]["id"], "cl_1") + self.assertEqual(len(packet["findings"]["contested"]), 1) + self.assertEqual(packet["findings"]["contested"][0]["id"], "cl_2") + self.assertEqual(len(packet["findings"]["unresolved"]), 1) + self.assertEqual(packet["findings"]["unresolved"][0]["id"], "cl_3") + self.assertEqual(packet["contradictions"][0]["claim_id"], "cl_2") + self.assertIn("ev_3", packet["evidence_index"]) + self.assertEqual( + [action["id"] for action in packet["candidate_actions"]], + ["ca_q_q_1", "ca_q_q_2", "ca_c_cl_2", "ca_c_cl_3"], 
+ ) + self.assertEqual(packet["candidate_actions"][0]["required_sources"], ["s1", "s3"]) + self.assertEqual( + packet["candidate_actions"][1]["rationale"]["reason_codes"], + ["question_unresolved", "claim_low_confidence"], + ) + self.assertEqual( + packet["candidate_actions"][2]["evidence_gap_refs"][0]["kind"], + "low_confidence", + ) + self.assertEqual( + packet["candidate_actions"][3]["evidence_gap_refs"][0]["kind"], + "missing_counter_evidence", + ) + self.assertEqual(packet["candidate_actions"][3]["required_sources"], ["s4"]) + self.assertTrue(packet["candidate_actions"][0]["ontology_object_refs"]) + + def test_build_question_reasoning_packet_emits_candidate_actions_for_missing_evidence(self) -> None: + state = { + "questions": { + "q_1": { + "id": "q_1", + "question_text": "What source confirms the claim?", + "status": "open", + "priority": "high", + "claim_ids": ["cl_1"], + "evidence_ids": [], + } + }, + "claims": { + "cl_1": { + "id": "cl_1", + "claim_text": "Needs evidence", + "status": "proposed", + "evidence_ids": [], + "confidence": "0.20", + } + }, + "evidence": {}, + } + + packet = build_question_reasoning_packet(state) + + self.assertEqual(packet["candidate_actions"][0]["id"], "ca_q_q_1") + self.assertEqual(packet["candidate_actions"][0]["evidence_gap_refs"][0]["kind"], "missing_evidence") + self.assertEqual(packet["candidate_actions"][1]["id"], "ca_c_cl_1") + self.assertEqual( + packet["candidate_actions"][1]["rationale"]["reason_codes"], + ["claim_unresolved", "claim_low_confidence", "question_unresolved"], + ) + self.assertEqual(packet["candidate_actions"][1]["required_sources"], []) + + def test_build_question_reasoning_packet_keeps_entity_inputs_entity_only_and_collects_question_sources( + self, + ) -> None: + state = { + "questions": { + "q_1": { + "id": "q_1", + "question_text": "What source confirms the claim?", + "status": "open", + "priority": "high", + "claim_ids": ["cl_1"], + "resolution_claim_id": "cl_resolution", + "provenance_ids": 
["pv_q_1"], + } + }, + "claims": { + "cl_1": { + "id": "cl_1", + "claim_text": "Needs evidence", + "status": "proposed", + "evidence_ids": [], + "confidence": 0.2, + } + }, + "provenance_nodes": { + "pv_q_1": { + "id": "pv_q_1", + "source_uri": "https://question-source.test", + } + }, + "evidence": {}, + } + + packet = build_question_reasoning_packet(state) + question_action = next(action for action in packet["candidate_actions"] if action["id"] == "ca_q_q_1") + + self.assertEqual(question_action["required_inputs"]["entity_ids"], []) + self.assertEqual(question_action["required_sources"], ["https://question-source.test"]) + self.assertFalse( + any(ref.get("object_type") == "entity" for ref in question_action["ontology_object_refs"]) + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_replay_log.py b/tests/test_replay_log.py index ff31e7a9..fad6033f 100644 --- a/tests/test_replay_log.py +++ b/tests/test_replay_log.py @@ -4,6 +4,7 @@ import json import tempfile +import threading import unittest from pathlib import Path @@ -174,9 +175,35 @@ def test_child_logger(self) -> None: self.assertEqual(records[2]["conversation_id"], "root/d0s2") self.assertEqual(records[2]["model"], "m-child") self.assertEqual(records[3]["conversation_id"], "root/d0s2") - self.assertEqual(records[3]["seq"], 0) + self.assertEqual(records[3]["seq"], 1) self.assertIn("messages_snapshot", records[3]) + def test_child_logger_owner_suffix_keeps_ids_unique(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + p = Path(tmpdir) / "replay.jsonl" + parent = ReplayLogger(path=p) + + left = parent.child(depth=0, step=2, owner="call_subtask:0") + right = parent.child(depth=0, step=2, owner="call_subtask:1") + + self.assertNotEqual(left.conversation_id, right.conversation_id) + self.assertRegex(left.conversation_id, r"^root/d0s2/o[A-Za-z0-9._-]+_[0-9a-f]{8}$") + self.assertRegex(right.conversation_id, r"^root/d0s2/o[A-Za-z0-9._-]+_[0-9a-f]{8}$") + + def 
test_child_logger_owner_suffix_normalizes_and_hashes(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + p = Path(tmpdir) / "replay.jsonl" + parent = ReplayLogger(path=p) + + same_left = parent.child(depth=0, step=2, owner=" odd owner/with spaces? ") + same_right = parent.child(depth=0, step=2, owner=" odd owner/with spaces? ") + collided_slug_a = parent.child(depth=0, step=2, owner="abc/def") + collided_slug_b = parent.child(depth=0, step=2, owner="abc:def") + + self.assertEqual(same_left.conversation_id, same_right.conversation_id) + self.assertIn("/oodd_owner_with_spaces_", same_left.conversation_id) + self.assertNotEqual(collided_slug_a.conversation_id, collided_slug_b.conversation_id) + def test_creates_parent_dirs(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: p = Path(tmpdir) / "deep" / "nested" / "replay.jsonl" @@ -187,6 +214,149 @@ def test_creates_parent_dirs(self) -> None: ) self.assertTrue(p.exists()) + def test_initializes_seq_from_existing_file(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + p = Path(tmpdir) / "replay.jsonl" + p.write_text( + "\n".join([ + json.dumps({"type": "header", "conversation_id": "root"}), + json.dumps({"type": "call", "conversation_id": "root", "seq": 3, "messages_snapshot": [{"role": "user", "content": "hi"}]}), + "{malformed", + json.dumps({"type": "call", "conversation_id": "other", "seq": 8, "messages_snapshot": [{"role": "user", "content": "x"}]}), + ]) + + "\n", + encoding="utf-8", + ) + + logger = ReplayLogger(path=p) + logger.log_call( + depth=0, + step=2, + messages=[ + {"role": "user", "content": "hi"}, + {"role": "assistant", "content": "hello"}, + ], + response={"r": 1}, + ) + + records = [] + for line in p.read_text(encoding="utf-8").splitlines(): + line = line.strip() + if not line: + continue + try: + records.append(json.loads(line)) + except json.JSONDecodeError: + continue + calls = [r for r in records if r.get("type") == "call" and r.get("conversation_id") == 
"root"] + self.assertEqual(calls[-1]["seq"], 9) + self.assertIn("messages_delta", calls[-1]) + self.assertEqual(calls[-1]["messages_delta"], [{"role": "assistant", "content": "hello"}]) + + def test_force_snapshot_first_call_resets_root_message_latch(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + p = Path(tmpdir) / "replay.jsonl" + first = ReplayLogger(path=p, force_snapshot_first_call=True) + first.log_call( + depth=0, + step=1, + messages=[{"role": "user", "content": "turn one"}], + response={"r": 1}, + ) + + second = ReplayLogger(path=p, force_snapshot_first_call=True) + second.log_call( + depth=0, + step=1, + messages=[{"role": "user", "content": "turn two"}], + response={"r": 2}, + ) + + calls = [r for r in self._read_records(p) if r.get("type") == "call" and r.get("conversation_id") == "root"] + self.assertEqual(calls[0]["seq"], 0) + self.assertIn("messages_snapshot", calls[0]) + self.assertEqual(calls[1]["seq"], 1) + self.assertIn("messages_snapshot", calls[1]) + self.assertNotIn("messages_delta", calls[1]) + + def test_force_snapshot_first_call_propagates_to_child_logger(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + p = Path(tmpdir) / "replay.jsonl" + + first = ReplayLogger(path=p, force_snapshot_first_call=True) + child_first = first.child(depth=0, step=1) + child_first.log_call( + depth=1, + step=1, + messages=[{"role": "user", "content": "child turn one"}], + response={"r": 1}, + ) + + second = ReplayLogger(path=p, force_snapshot_first_call=True) + child_second = second.child(depth=0, step=1) + child_second.log_call( + depth=1, + step=1, + messages=[{"role": "user", "content": "child turn two"}], + response={"r": 2}, + ) + + calls = [ + r + for r in self._read_records(p) + if r.get("type") == "call" and r.get("conversation_id") == "root/d0s1" + ] + self.assertEqual(calls[0]["seq"], 0) + self.assertIn("messages_snapshot", calls[0]) + self.assertEqual(calls[1]["seq"], 1) + self.assertIn("messages_snapshot", calls[1]) + 
self.assertNotIn("messages_delta", calls[1]) + + def test_parallel_child_loggers_keep_seq_unique_and_contiguous(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + p = Path(tmpdir) / "replay.jsonl" + parent = ReplayLogger(path=p) + parent.log_call( + depth=0, + step=1, + messages=[{"role": "user", "content": "root"}], + response={"r": "root"}, + ) + + barrier = threading.Barrier(3) + errors: list[BaseException] = [] + + def _worker(step: int) -> None: + try: + child = parent.child(depth=0, step=step) + barrier.wait(timeout=5.0) + child.log_call( + depth=1, + step=1, + messages=[{"role": "user", "content": f"child-{step}"}], + response={"r": step}, + ) + except BaseException as exc: # pragma: no cover - surfaced below + errors.append(exc) + + threads = [ + threading.Thread(target=_worker, args=(1,)), + threading.Thread(target=_worker, args=(2,)), + ] + for thread in threads: + thread.start() + barrier.wait(timeout=5.0) + for thread in threads: + thread.join(timeout=5.0) + + if errors: + raise errors[0] + + call_records = [r for r in self._read_records(p) if r.get("type") == "call"] + seqs = [record["seq"] for record in call_records] + self.assertEqual(seqs, sorted(seqs)) + self.assertEqual(seqs, list(range(len(call_records)))) + class ReplayLoggerIntegrationTests(unittest.TestCase): def _read_records(self, path: Path) -> list[dict]: @@ -275,6 +445,69 @@ def test_subtask_logged_with_child_conversation(self) -> None: self.assertEqual(len(child_calls), 1) self.assertEqual(child_calls[0]["depth"], 1) + def test_parallel_subtasks_log_distinct_child_conversations_for_same_step(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + cfg = AgentConfig( + workspace=root, + max_depth=3, + max_steps_per_call=6, + recursive=True, + acceptance_criteria=False, + ) + tools = WorkspaceTools(root=root) + model = ScriptedModel( + scripted_turns=[ + ModelTurn(tool_calls=[ + _tc("subtask", objective="task A", model="worker-a"), + 
_tc("subtask", objective="task B", model="worker-b"), + ]), + ModelTurn(text="root done", stop_reason="end_turn"), + ] + ) + + def factory(model_name: str, _effort: str | None) -> ScriptedModel: + objective = "task A" if model_name == "worker-a" else "task B" + return ScriptedModel( + scripted_turns=[ + ModelTurn(text=f"{objective} done", stop_reason="end_turn"), + ] + ) + + engine = RLMEngine(model=model, tools=tools, config=cfg, model_factory=factory) + replay_path = root / "replay.jsonl" + replay_logger = ReplayLogger(path=replay_path) + + result, _ = engine.solve_with_context( + objective="top level", + replay_logger=replay_logger, + ) + self.assertEqual(result, "root done") + + records = self._read_records(replay_path) + headers = [r for r in records if r["type"] == "header"] + calls = [r for r in records if r["type"] == "call"] + + child_ids = sorted( + { + record["conversation_id"] + for record in headers + if record["conversation_id"].startswith("root/d0s1/o") + } + ) + self.assertEqual(len(child_ids), 2) + self.assertNotEqual(child_ids[0], child_ids[1]) + + root_calls = [c for c in calls if c["conversation_id"] == "root"] + self.assertEqual(len(root_calls), 2) + for child_id in child_ids: + child_headers = [h for h in headers if h["conversation_id"] == child_id] + child_calls = [c for c in calls if c["conversation_id"] == child_id] + self.assertEqual(len(child_headers), 1) + self.assertEqual(len(child_calls), 1) + self.assertEqual(child_calls[0]["depth"], 1) + self.assertIn("messages_snapshot", child_calls[0]) + def test_replay_log_via_runtime(self) -> None: """SessionRuntime.solve() creates replay.jsonl in session dir.""" with tempfile.TemporaryDirectory() as tmpdir: @@ -304,6 +537,37 @@ def test_replay_log_via_runtime(self) -> None: self.assertIn("header", types) self.assertIn("call", types) + def test_runtime_second_solve_starts_with_snapshot(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + cfg = 
AgentConfig(workspace=root, max_depth=1, max_steps_per_call=4) + tools = WorkspaceTools(root=root) + model = ScriptedModel( + scripted_turns=[ + ModelTurn(text="first", stop_reason="end_turn"), + ModelTurn(text="second", stop_reason="end_turn"), + ] + ) + engine = RLMEngine(model=model, tools=tools, config=cfg) + + from agent.runtime import SessionRuntime + + runtime = SessionRuntime.bootstrap(engine=engine, config=cfg, session_id="sess-replay-two", resume=False) + self.assertEqual(runtime.solve("first objective"), "first") + self.assertEqual(runtime.solve("second objective"), "second") + + replay_path = ( + root / cfg.session_root_dir / "sessions" / runtime.session_id / "replay.jsonl" + ) + records = self._read_records(replay_path) + calls = [r for r in records if r.get("type") == "call" and r.get("conversation_id") == "root"] + self.assertEqual(len(calls), 2) + self.assertEqual(calls[0]["seq"], 0) + self.assertIn("messages_snapshot", calls[0]) + self.assertEqual(calls[1]["seq"], 1) + self.assertIn("messages_snapshot", calls[1]) + self.assertNotIn("messages_delta", calls[1]) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_session.py b/tests/test_session.py index 0b6428ef..3a3b5f7e 100644 --- a/tests/test_session.py +++ b/tests/test_session.py @@ -9,7 +9,7 @@ from agent.config import AgentConfig from agent.engine import RLMEngine from agent.model import ModelTurn, ScriptedModel -from agent.runtime import SessionRuntime +from agent.runtime import SessionRuntime, _has_reasoning_content from agent.tools import WorkspaceTools @@ -61,6 +61,234 @@ def test_session_persist_and_resume(self) -> None: result2 = runtime2.solve("finish") self.assertEqual(result2, "second done") + def test_runtime_solve_injects_question_reasoning_packet_from_typed_state(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + cfg = AgentConfig( + workspace=root, + max_depth=1, + max_steps_per_call=2, + session_root_dir=".openplanter", + 
max_persisted_observations=50, + ) + + captured: list[str] = [] + + class CapturingModel(ScriptedModel): + def create_conversation(self, system_prompt: str, initial_user_message: str): + captured.append(initial_user_message) + return super().create_conversation(system_prompt, initial_user_message) + + model = CapturingModel(scripted_turns=[ModelTurn(text="ok", stop_reason="end_turn")]) + engine = RLMEngine(model=model, tools=WorkspaceTools(root=root), config=cfg) + runtime = SessionRuntime.bootstrap( + engine=engine, + config=cfg, + session_id="session-packet", + resume=False, + ) + + session_dir = root / ".openplanter" / "sessions" / "session-packet" + typed_state_path = session_dir / "investigation_state.json" + typed = json.loads(typed_state_path.read_text(encoding="utf-8")) + typed["questions"] = { + "q_1": { + "id": "q_1", + "question_text": "Open question", + "status": "open", + "priority": "high", + "claim_ids": ["cl_1"], + } + } + typed["claims"] = { + "cl_1": { + "id": "cl_1", + "claim_text": "Needs support", + "status": "unresolved", + "evidence_ids": ["ev_1"], + } + } + typed["evidence"] = { + "ev_1": { + "id": "ev_1", + "evidence_type": "web_fetch", + "source_uri": "https://example.test", + "provenance_ids": ["pv_1"], + } + } + typed_state_path.write_text(json.dumps(typed), encoding="utf-8") + + result = runtime.solve("continue") + + self.assertEqual(result, "ok") + self.assertEqual(len(captured), 1) + parsed = json.loads(captured[0]) + packet = parsed["question_reasoning_packet"] + self.assertEqual(packet["reasoning_mode"], "question_centric") + self.assertEqual(packet["focus_question_ids"], ["q_1"]) + self.assertEqual(packet["findings"]["unresolved"][0]["id"], "cl_1") + self.assertEqual(packet["candidate_actions"][0]["id"], "ca_q_q_1") + self.assertEqual(packet["candidate_actions"][1]["id"], "ca_c_cl_1") + self.assertEqual(packet["candidate_actions"][1]["required_sources"], ["https://example.test"]) + + def 
test_runtime_reasoning_gate_accepts_candidate_actions_only(self) -> None: + packet = { + "focus_question_ids": [], + "findings": {"supported": [], "contested": [], "unresolved": []}, + "contradictions": [], + "candidate_actions": [{"id": "ca_q_q_1"}], + } + + self.assertTrue(_has_reasoning_content(packet)) + + def test_runtime_resume_falls_back_to_legacy_state_when_typed_state_is_invalid(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + cfg = AgentConfig( + workspace=root, + max_depth=1, + max_steps_per_call=2, + session_root_dir=".openplanter", + max_persisted_observations=50, + ) + session_id = "session-invalid-typed-resume" + + engine1 = RLMEngine( + model=ScriptedModel(scripted_turns=[ModelTurn(text="ok", stop_reason="end_turn")]), + tools=WorkspaceTools(root=root), + config=cfg, + ) + SessionRuntime.bootstrap( + engine=engine1, + config=cfg, + session_id=session_id, + resume=False, + ) + + session_dir = root / ".openplanter" / "sessions" / session_id + state_path = session_dir / "state.json" + typed_state_path = session_dir / "investigation_state.json" + events_path = session_dir / "events.jsonl" + + legacy_state = json.loads(state_path.read_text(encoding="utf-8")) + legacy_state["external_observations"] = ["legacy fallback observation"] + state_path.write_text(json.dumps(legacy_state), encoding="utf-8") + typed_state_path.write_text("{not-json", encoding="utf-8") + + engine2 = RLMEngine( + model=ScriptedModel(scripted_turns=[ModelTurn(text="ok", stop_reason="end_turn")]), + tools=WorkspaceTools(root=root), + config=cfg, + ) + runtime = SessionRuntime.bootstrap( + engine=engine2, + config=cfg, + session_id=session_id, + resume=True, + ) + + self.assertIn("legacy fallback observation", runtime.context.observations) + self.assertEqual(typed_state_path.read_text(encoding="utf-8"), "{not-json") + + traces = [ + json.loads(line)["payload"]["message"] + for line in events_path.read_text(encoding="utf-8").splitlines() + if 
line.strip() and json.loads(line).get("type") == "trace" + ] + self.assertTrue( + any("falling back to legacy state" in trace for trace in traces), + traces, + ) + self.assertTrue( + any("preserving the corrupt typed state file" in trace for trace in traces), + traces, + ) + + def test_runtime_solve_continues_without_reasoning_packet_when_typed_state_is_invalid(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + cfg = AgentConfig( + workspace=root, + max_depth=1, + max_steps_per_call=2, + session_root_dir=".openplanter", + max_persisted_observations=50, + ) + + captured: list[str] = [] + + class CapturingModel(ScriptedModel): + def create_conversation(self, system_prompt: str, initial_user_message: str): + captured.append(initial_user_message) + return super().create_conversation(system_prompt, initial_user_message) + + model = CapturingModel(scripted_turns=[ModelTurn(text="ok", stop_reason="end_turn")]) + engine = RLMEngine(model=model, tools=WorkspaceTools(root=root), config=cfg) + runtime = SessionRuntime.bootstrap( + engine=engine, + config=cfg, + session_id="session-invalid-typed-solve", + resume=False, + ) + + session_dir = root / ".openplanter" / "sessions" / "session-invalid-typed-solve" + typed_state_path = session_dir / "investigation_state.json" + typed_state_path.write_text("{not-json", encoding="utf-8") + + events: list[str] = [] + result = runtime.solve("continue", on_event=events.append) + + self.assertEqual(result, "ok") + self.assertEqual(typed_state_path.read_text(encoding="utf-8"), "{not-json") + self.assertTrue( + any("continuing without typed reasoning state" in message for message in events), + events, + ) + + parsed = json.loads(captured[0]) + self.assertNotIn("question_reasoning_packet", parsed) + + def test_runtime_persist_preserves_corrupt_typed_state_file(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + cfg = AgentConfig( + workspace=root, + max_depth=1, + 
max_steps_per_call=2, + session_root_dir=".openplanter", + max_persisted_observations=50, + ) + + engine = RLMEngine( + model=ScriptedModel(scripted_turns=[ModelTurn(text="ok", stop_reason="end_turn")]), + tools=WorkspaceTools(root=root), + config=cfg, + ) + runtime = SessionRuntime.bootstrap( + engine=engine, + config=cfg, + session_id="session-invalid-typed-persist", + resume=False, + ) + + session_dir = root / ".openplanter" / "sessions" / "session-invalid-typed-persist" + state_path = session_dir / "state.json" + typed_state_path = session_dir / "investigation_state.json" + typed_state_path.write_text("{not-json", encoding="utf-8") + + runtime.context.observations.append("fresh observation") + runtime._persist_state() + + persisted = json.loads(state_path.read_text(encoding="utf-8")) + self.assertIn("fresh observation", persisted["external_observations"]) + self.assertEqual(typed_state_path.read_text(encoding="utf-8"), "{not-json") + self.assertTrue( + any( + "preserving the corrupt typed state file" in warning + for warning in runtime.store.drain_warnings() + ) + ) + def test_patch_artifact_saved(self) -> None: with tempfile.TemporaryDirectory() as tmpdir: root = Path(tmpdir) diff --git a/tests/test_turn_summaries.py b/tests/test_turn_summaries.py index c7e99828..ab61bb2b 100644 --- a/tests/test_turn_summaries.py +++ b/tests/test_turn_summaries.py @@ -282,6 +282,179 @@ def test_backward_compat_old_state_no_turn_history(self) -> None: self.assertEqual(len(rt.turn_history), 1) self.assertEqual(rt.turn_history[0].turn_number, 1) + def test_loop_metrics_persisted_and_loaded_additively(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + cfg = self._make_config(root) + + model1 = ScriptedModel( + scripted_turns=[ + ModelTurn(tool_calls=[_tc("list_files")]), + ModelTurn(text="done-1", stop_reason="end_turn"), + ] + ) + engine1 = RLMEngine(model=model1, tools=WorkspaceTools(root=root), config=cfg) + rt1 = SessionRuntime.bootstrap( + 
engine=engine1, config=cfg, session_id="sess-loop", resume=False, + ) + rt1.solve("first") + + state_path = root / ".openplanter" / "sessions" / "sess-loop" / "state.json" + state_after_first = json.loads(state_path.read_text(encoding="utf-8")) + self.assertIn("loop_metrics", state_after_first) + self.assertEqual(state_after_first["loop_metrics"]["turns"], 1) + + model2 = ScriptedModel( + scripted_turns=[ModelTurn(text="done-2", stop_reason="end_turn")] + ) + engine2 = RLMEngine(model=model2, tools=WorkspaceTools(root=root), config=cfg) + rt2 = SessionRuntime.bootstrap( + engine=engine2, config=cfg, session_id="sess-loop", resume=True, + ) + self.assertIn("turns", rt2.loop_metrics) + rt2.solve("second") + + state_after_second = json.loads(state_path.read_text(encoding="utf-8")) + self.assertEqual(state_after_second["loop_metrics"]["turns"], 2) + self.assertIn("last_turn", state_after_second["loop_metrics"]) + + def test_replay_seq_start_stays_monotonic_and_second_turn_starts_with_snapshot(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + cfg = self._make_config(root) + + model1 = ScriptedModel( + scripted_turns=[ModelTurn(text="done-1", stop_reason="end_turn")] + ) + engine1 = RLMEngine(model=model1, tools=WorkspaceTools(root=root), config=cfg) + rt1 = SessionRuntime.bootstrap( + engine=engine1, config=cfg, session_id="sess-replay-boundary", resume=False, + ) + rt1.solve("first turn") + + model2 = ScriptedModel( + scripted_turns=[ModelTurn(text="done-2", stop_reason="end_turn")] + ) + engine2 = RLMEngine(model=model2, tools=WorkspaceTools(root=root), config=cfg) + rt2 = SessionRuntime.bootstrap( + engine=engine2, config=cfg, session_id="sess-replay-boundary", resume=True, + ) + rt2.solve("second turn") + + state_path = root / ".openplanter" / "sessions" / "sess-replay-boundary" / "state.json" + state = json.loads(state_path.read_text(encoding="utf-8")) + history = state["turn_history"] + self.assertEqual(len(history), 2) + 
self.assertLess(history[0]["replay_seq_start"], history[1]["replay_seq_start"]) + + replay_path = root / ".openplanter" / "sessions" / "sess-replay-boundary" / "replay.jsonl" + records = [ + json.loads(line) + for line in replay_path.read_text(encoding="utf-8").splitlines() + if line.strip() + ] + calls = [r for r in records if r.get("type") == "call" and r.get("conversation_id") == "root"] + self.assertEqual(len(calls), 2) + self.assertIn("messages_snapshot", calls[0]) + self.assertIn("messages_snapshot", calls[1]) + self.assertNotIn("messages_delta", calls[1]) + + def test_steps_used_counts_parallel_child_conversations(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + cfg = self._make_config( + root, + max_depth=3, + max_steps_per_call=6, + recursive=True, + acceptance_criteria=False, + ) + + parent_model = ScriptedModel( + scripted_turns=[ + ModelTurn(tool_calls=[ + _tc("subtask", objective="task A", model="worker-a"), + _tc("subtask", objective="task B", model="worker-b"), + ]), + ModelTurn(text="root done", stop_reason="end_turn"), + ] + ) + + def factory(model_name: str, _effort: str | None) -> ScriptedModel: + if model_name == "worker-a": + return ScriptedModel( + scripted_turns=[ModelTurn(text="child A", stop_reason="end_turn")] + ) + if model_name == "worker-b": + return ScriptedModel( + scripted_turns=[ModelTurn(text="child B", stop_reason="end_turn")] + ) + raise AssertionError(f"unexpected model request: {model_name}") + + engine = RLMEngine( + model=parent_model, + tools=WorkspaceTools(root=root), + config=cfg, + model_factory=factory, + ) + runtime = SessionRuntime.bootstrap( + engine=engine, + config=cfg, + session_id="sess-parallel-steps", + resume=False, + ) + + result = runtime.solve("parallel task") + self.assertEqual(result, "root done") + + state_path = root / ".openplanter" / "sessions" / "sess-parallel-steps" / "state.json" + state = json.loads(state_path.read_text(encoding="utf-8")) + entry = 
state["turn_history"][0] + self.assertEqual(entry["steps_used"], 4) + self.assertEqual(entry["replay_seq_start"], 0) + + replay_path = root / ".openplanter" / "sessions" / "sess-parallel-steps" / "replay.jsonl" + records = [ + json.loads(line) + for line in replay_path.read_text(encoding="utf-8").splitlines() + if line.strip() + ] + calls = [r for r in records if r.get("type") == "call"] + self.assertEqual(len(calls), 4) + + def test_backward_compat_old_state_no_loop_metrics(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + root = Path(tmpdir) + cfg = self._make_config(root) + + session_dir = root / ".openplanter" / "sessions" / "sess-no-loop" + session_dir.mkdir(parents=True) + (session_dir / "artifacts").mkdir() + (session_dir / "metadata.json").write_text( + json.dumps({"session_id": "sess-no-loop", "workspace": str(root)}), + encoding="utf-8", + ) + (session_dir / "state.json").write_text( + json.dumps({ + "session_id": "sess-no-loop", + "saved_at": "2026-01-01T00:00:00Z", + "external_observations": [], + }), + encoding="utf-8", + ) + + model = ScriptedModel( + scripted_turns=[ModelTurn(text="resumed", stop_reason="end_turn")] + ) + engine = RLMEngine(model=model, tools=WorkspaceTools(root=root), config=cfg) + rt = SessionRuntime.bootstrap( + engine=engine, config=cfg, session_id="sess-no-loop", resume=True, + ) + self.assertIsNotNone(rt.loop_metrics) + self.assertEqual(rt.loop_metrics.get("turns"), 0) + rt.solve("new turn") + self.assertEqual(rt.loop_metrics.get("turns"), 1) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_user_stories.py b/tests/test_user_stories.py index 4ab5cdaa..10513874 100644 --- a/tests/test_user_stories.py +++ b/tests/test_user_stories.py @@ -1044,8 +1044,8 @@ def test_multiple_errors_then_success(self) -> None: ) self.assertGreaterEqual(error_count, 2) - def test_all_steps_fail_then_budget_exhausted(self) -> None: - """Every step errors; budget exhausted message still includes objective.""" + def 
test_all_steps_fail_then_returns_partial_completion(self) -> None: + """Every step errors; partial completion still includes objective.""" with tempfile.TemporaryDirectory() as tmpdir: root = Path(tmpdir) cfg = _make_config(root, max_steps_per_call=3) @@ -1060,8 +1060,9 @@ def test_all_steps_fail_then_budget_exhausted(self) -> None: runtime = _make_runtime(root, cfg, turns, "all-fail") result = runtime.solve("attempt impossible reads") - self.assertIn("Step budget exhausted", result) + self.assertIn("Partial completion for objective", result) self.assertIn("attempt impossible reads", result) + self.assertEqual(runtime.loop_metrics.get("termination_reason"), "budget_no_progress") # ===================================================================