diff --git a/pdd/agentic_common.py b/pdd/agentic_common.py index 30921bd9d..059095877 100644 --- a/pdd/agentic_common.py +++ b/pdd/agentic_common.py @@ -153,12 +153,16 @@ def classify_step_output( model: LiteLLM model identifier for classification. Returns: - TokenMatch if classified, None if classification fails or returns NONE. + TokenMatch when classified to a known token. + None when the classifier confidently returns NONE. + TokenMatch(token="CLASSIFICATION_ERROR") when classification fails + (timeout/rate-limit/provider error/parse failure), allowing callers to + distinguish "no token" from "classifier unavailable". """ try: from pdd.llm_invoke import llm_invoke except ImportError: - return None + return TokenMatch(tier="llm_classification_error", token="CLASSIFICATION_ERROR") token_list = ", ".join(expected_tokens) prompt = ( @@ -190,14 +194,14 @@ def classify_step_output( raw = result.get("result") or result.get("output", "") parsed = json.loads(raw) if isinstance(raw, str) else raw if not parsed: - return None + return TokenMatch(tier="llm_classification_error", token="CLASSIFICATION_ERROR") status = parsed.get("status", "NONE") if status == "NONE" or status not in expected_tokens: return None cost = result.get("cost") return TokenMatch(tier="llm_classification", token=status, cost=cost) except Exception: - return None + return TokenMatch(tier="llm_classification_error", token="CLASSIFICATION_ERROR") def substitute_template_variables( diff --git a/pdd/agentic_e2e_fix_orchestrator.py b/pdd/agentic_e2e_fix_orchestrator.py index 252d4ec8e..01947f4c3 100644 --- a/pdd/agentic_e2e_fix_orchestrator.py +++ b/pdd/agentic_e2e_fix_orchestrator.py @@ -1,6 +1,7 @@ from __future__ import annotations import hashlib +import logging import os import shlex import shutil @@ -10,7 +11,7 @@ import json from datetime import datetime from pathlib import Path -from typing import List, Tuple, Dict, Any, Optional, Set +from typing import List, Tuple, Dict, Any, Optional, Set, NamedTuple from rich.console import Console @@ -75,6 +76,7 @@ } console = Console() +logger = logging.getLogger(__name__) # Re-export for backward compatibility with tests that import from this module _detect_control_token = detect_control_token @@ -212,6 +214,150 @@ def _handle_verification_failure( return failure_type, import_error_retries, False +def _resolve_step9_loop_token(step_output: str, console: Console) -> Optional[str]: + """Resolve Step 9 loop-control token (tiers 1–3 + tier-4 fallback). + + Maps ALL_TESTS_PASS at step 9 to LOCAL_TESTS_PASS for downstream handling. + Tier-4 CLASSIFICATION_ERROR is treated as CONTINUE_CYCLE (transient classifier). + """ + tok = _classify_step_output(step_output, step_num=9) + if tok: + return tok + tier4 = classify_step_output( + step_output, ["ALL_TESTS_PASS", "CONTINUE_CYCLE", "MAX_CYCLES_REACHED"] + ) + if tier4 and tier4.token in ("ALL_TESTS_PASS", "CONTINUE_CYCLE", "MAX_CYCLES_REACHED"): + console.print( + "[yellow]Loop control token detected via LLM classification (tier 4)[/yellow]" + ) + return "LOCAL_TESTS_PASS" if tier4.token == "ALL_TESTS_PASS" else tier4.token + if tier4 and tier4.token == "CLASSIFICATION_ERROR": + console.print( + "[yellow]Step 9 classification unavailable (tier 4 error); " + "starting next cycle.[/yellow]" + ) + return "CONTINUE_CYCLE" + return None + + +class _Step9TokenApplyResult(NamedTuple): + """Outcome of applying a resolved Step 9 loop token (shared initial + retry paths).""" + + break_inner: bool + import_error_retries: int + success: Optional[bool] = None + final_message: Optional[str] = None + last_completed_step: Optional[int] = None + step_outputs_9: Optional[str] = None + verification_failure_context_update: Optional[str] = None # None = no change; str = set + + +def _apply_step9_resolved_token( + resolved_token: str, + step_output: str, + *, + issue_content: str, + changed_files: List[str], + cwd: Path, + initial_file_hashes: Dict[str, Optional[str]], + cycle_start_hashes: Dict[str, Optional[str]], + import_error_retries: int, + console: Console, + step_num: int, + current_cycle: int, + is_retry_path: bool = False, +) -> _Step9TokenApplyResult: + """Handle ALL_TESTS_PASS verification, MAX_CYCLES_REACHED, CONTINUE_CYCLE (+ hash guard). + + Single implementation for the primary Step 9 path and the one-shot Step 9 retry + so verification and convergence logic cannot diverge. + """ + tag = "Step 9 retry" if is_retry_path else "Step 9" + + if resolved_token in ("ALL_TESTS_PASS", "LOCAL_TESTS_PASS"): + test_files = _extract_test_files(issue_content, changed_files, cwd, initial_file_hashes) + if test_files: + verified, verify_output = _verify_tests_independently(test_files, cwd) + if verified: + console.print( + f"[green]LOCAL_TESTS_PASS verified by independent pytest run ({tag}).[/green]" + ) + return _Step9TokenApplyResult( + break_inner=True, + import_error_retries=import_error_retries, + success=True, + final_message="All tests passed after fixes (independently verified).", + ) + console.print( + f"[bold red]LLM claimed tests pass at {tag} but independent " + "verification FAILED.[/bold red]" + ) + _, new_retries, should_retry = _handle_verification_failure( + verify_output, import_error_retries, console + ) + failed_body = ( + f"VERIFICATION_FAILED: LLM claimed tests pass but pytest failed.\n{verify_output}" + ) + full_failed = f"FAILED: {failed_body}" + if should_retry: + return _Step9TokenApplyResult( + break_inner=True, + import_error_retries=new_retries, + step_outputs_9=full_failed, + verification_failure_context_update=verify_output, + last_completed_step=0, + ) + return _Step9TokenApplyResult( + break_inner=False, + import_error_retries=new_retries, + step_outputs_9=full_failed, + last_completed_step=step_num - 1, + ) + console.print(f"[green]LOCAL_TESTS_PASS detected in {tag}.[/green]") + return _Step9TokenApplyResult( + break_inner=True, + import_error_retries=import_error_retries, + success=True, + final_message="All tests passed after fixes.", + ) + + if resolved_token == "MAX_CYCLES_REACHED": + console.print(f"[yellow]MAX_CYCLES_REACHED detected in {tag}.[/yellow]") + return _Step9TokenApplyResult( + break_inner=True, + import_error_retries=import_error_retries, + final_message="Max cycles reached.", + ) + + if resolved_token == "CONTINUE_CYCLE": + cycle_changed = _detect_changed_files(cwd, cycle_start_hashes) + if not cycle_changed: + console.print( + f"[yellow]Warning: No file changes detected in cycle {current_cycle}. " + "Stopping to avoid infinite loop.[/yellow]" + ) + return _Step9TokenApplyResult( + break_inner=True, + import_error_retries=import_error_retries, + final_message=f"No progress made in cycle {current_cycle} (no file changes).", + ) + return _Step9TokenApplyResult( + break_inner=True, + import_error_retries=import_error_retries, + ) + + console.print( + "[yellow]Warning: No recognized loop control token found in Step 9. " + "Stopping workflow — missing or unknown token treated as terminal " + "condition.[/yellow]" + ) + return _Step9TokenApplyResult( + break_inner=True, + import_error_retries=import_error_retries, + final_message="Workflow stopped: no recognized loop control token in Step 9 output.", + ) + + def _check_e2e_environment(cwd: Path) -> Tuple[bool, str]: """Check if the executor environment has E2E test infrastructure. @@ -1420,9 +1566,14 @@ def run_agentic_e2e_fix_orchestrator( # Tier 4: LLM classification fallback — only ask about NOT_A_BUG # to avoid false positives (e.g., CODE_BUG being misclassified) tier4 = classify_step_output(step_output, ["NOT_A_BUG"]) - if tier4: + if tier4 and tier4.token == "NOT_A_BUG": console.print("[yellow]NOT_A_BUG detected via LLM classification (tier 4)[/yellow]") _step3_token = "NOT_A_BUG" + elif tier4 and tier4.token == "CLASSIFICATION_ERROR": + console.print( + "[yellow]Step 3 token classification unavailable (tier 4 error); " + "continuing without NOT_A_BUG fallback.[/yellow]" + ) if step_num == 3 and _step3_token == "NOT_A_BUG": # Block NOT_A_BUG if fixes were already applied in prior cycles has_fixed_units = any(s.get("fixed") for s in dev_unit_states.values()) @@ -1436,58 +1587,129 @@ def run_agentic_e2e_fix_orchestrator( # Check Loop Control (Step 9) if step_num == 9: - _step9_token = _classify_step_output(step_output, step_num=9) - if not _step9_token: - # Tier 4: LLM classification fallback - tier4 = classify_step_output( - step_output, ["ALL_TESTS_PASS", "CONTINUE_CYCLE", "MAX_CYCLES_REACHED"] + _step9_token = _resolve_step9_loop_token(step_output, console) + + def _merge_step9_apply(r: _Step9TokenApplyResult) -> None: + nonlocal import_error_retries, success, final_message + nonlocal last_completed_step, verification_failure_context + import_error_retries = r.import_error_retries + if r.step_outputs_9 is not None: + step_outputs[str(step_num)] = r.step_outputs_9 + if r.success is not None: + success = r.success + if r.final_message is not None: + final_message = r.final_message + if r.last_completed_step is not None: + last_completed_step = r.last_completed_step + if r.verification_failure_context_update is not None: + verification_failure_context = r.verification_failure_context_update + + if _step9_token is not None: + r9 = _apply_step9_resolved_token( + _step9_token, + step_output, + issue_content=issue_content, + changed_files=changed_files, + cwd=cwd, + initial_file_hashes=initial_file_hashes, + cycle_start_hashes=cycle_start_hashes, + import_error_retries=import_error_retries, + console=console, + step_num=step_num, + current_cycle=current_cycle, + is_retry_path=False, ) - if tier4: - console.print("[yellow]Loop control token detected via LLM classification (tier 4)[/yellow]") - _step9_token = "CONTINUE_CYCLE" # Safe default: keep cycling - if _step9_token in ("ALL_TESTS_PASS", "LOCAL_TESTS_PASS"): - # Independent verification: don't trust LLM output alone - test_files = _extract_test_files(issue_content, changed_files, cwd, initial_file_hashes) - if test_files: - verified, verify_output = _verify_tests_independently(test_files, cwd) - if verified: - console.print("[green]LOCAL_TESTS_PASS verified by independent pytest run (Step 9).[/green]") - success = True - final_message = "All tests passed after fixes (independently verified)." - break - else: - console.print("[bold red]LLM claimed tests pass at Step 9 but independent verification FAILED.[/bold red]") - _, import_error_retries, should_retry = _handle_verification_failure(verify_output, import_error_retries, console) - step_output = f"VERIFICATION_FAILED: LLM claimed tests pass but pytest failed.\n{verify_output}" - step_outputs[str(step_num)] = f"FAILED: {step_output}" - if should_retry: - verification_failure_context = verify_output - last_completed_step = 0 - break - last_completed_step = step_num - 1 - # Don't break — fall through to cycle increment - else: - console.print("[green]LOCAL_TESTS_PASS detected in Step 9.[/green]") - success = True - final_message = "All tests passed after fixes." + _merge_step9_apply(r9) + if r9.break_inner: break - elif _step9_token == "MAX_CYCLES_REACHED": - console.print("[yellow]MAX_CYCLES_REACHED detected in Step 9.[/yellow]") - final_message = "Max cycles reached." + elif not step_success: + # Provider/timeout failure with no control token: treat as transient, start next cycle. + console.print( + "[yellow]Step 9 failed with no parseable loop-control token; " + "starting next cycle.[/yellow]" + ) break - elif _step9_token == "CONTINUE_CYCLE": - # Check for progress before starting next cycle (5b) - cycle_changed = _detect_changed_files(cwd, cycle_start_hashes) - # Stop if NO progress was made this cycle (no file changes) - if not cycle_changed: - console.print(f"[yellow]Warning: No file changes detected in cycle {current_cycle}. Stopping to avoid infinite loop.[/yellow]") - final_message = f"No progress made in cycle {current_cycle} (no file changes)." - break - break # Break inner loop — outer loop will start next cycle else: - console.print("[yellow]Warning: No loop control token found in Step 9. Stopping workflow — missing token treated as terminal condition.[/yellow]") - final_message = "Workflow stopped: no loop control token in Step 9 output." - break + # Step 9 succeeded but emitted no token: retry once to get an explicit loop-control signal. + console.print( + "[yellow]No loop control token in Step 9 output; retrying Step 9 once.[/yellow]" + ) + retry_success, retry_output, retry_cost, retry_model = run_agentic_task( + instruction=formatted_prompt, + cwd=cwd, + verbose=verbose, + quiet=quiet, + timeout=base_timeout + timeout_adder, + label=f"cycle{current_cycle}_step9_retry", + max_retries=DEFAULT_MAX_RETRIES, + ) + total_cost += retry_cost + if retry_model: + model_used = retry_model + + # Persist retry accounting immediately so resumed state + # stays consistent (total_cost + which retry was used). + try: + step_outputs[f"{step_num}_retry"] = retry_output + state_data["total_cost"] = total_cost + state_data["model_used"] = model_used + state_data["step_outputs"] = step_outputs.copy() + state_data["last_saved_at"] = datetime.now().isoformat() + + new_gh_id = save_workflow_state( + cwd, + issue_number, + workflow_name, + state_data, + state_dir, + repo_owner, + repo_name, + use_github_state, + github_comment_id, + ) + if new_gh_id: + github_comment_id = new_gh_id + state_data["github_comment_id"] = github_comment_id + except Exception as e: + logger.debug( + "Best-effort Step 9 retry state save failed: %s", + e, + exc_info=True, + ) + + retry_token = _resolve_step9_loop_token(retry_output, console) + if retry_token is None and not retry_success: + console.print( + "[yellow]Step 9 retry failed with no parseable loop-control token; " + "starting next cycle.[/yellow]" + ) + break + if retry_token is None: + console.print( + "[bold yellow]Step 9 did not emit a loop control token after retry; " + "stopping.[/bold yellow]" + ) + final_message = ( + "Workflow stopped: Step 9 did not emit a loop control token after retry." + ) + break + r_retry = _apply_step9_resolved_token( + retry_token, + retry_output, + issue_content=issue_content, + changed_files=changed_files, + cwd=cwd, + initial_file_hashes=initial_file_hashes, + cycle_start_hashes=cycle_start_hashes, + import_error_retries=import_error_retries, + console=console, + step_num=step_num, + current_cycle=current_cycle, + is_retry_path=True, + ) + _merge_step9_apply(r_retry) + if r_retry.break_inner: + break # Check if we should exit the outer loop if success: diff --git a/pdd/prompts/agentic_e2e_fix_orchestrator_python.prompt b/pdd/prompts/agentic_e2e_fix_orchestrator_python.prompt index 3f5468d00..261712f5f 100644 --- a/pdd/prompts/agentic_e2e_fix_orchestrator_python.prompt +++ b/pdd/prompts/agentic_e2e_fix_orchestrator_python.prompt @@ -25,10 +25,15 @@ skipped_steps persists across cycles and is NOT cleared when step_outputs is cle This behavior is called `skipped_steps_memory`: even skipped steps contribute their prior stepN_output values into downstream prompts. The skipped_steps (or equivalent) state persists across cycles and is NOT cleared when step_outputs is cleared between cycles. -% Convergence (Issue #903) -Empty dev-units short-circuit: when Step 5 yields no dev-units, the orchestrator must short-circuit the cycle (skip dependent work) instead of continuing as if work remained. - -Per-cycle file-hash comparison: compare file hashes each cycle so unchanged files are not mistaken for fresh changes; use per-cycle file-hash snapshots to decide whether dev-units or code actually moved. +% Convergence Requirements (Issue #903) +The orchestrator must implement an empty dev-units short-circuit: +- If Step 5 reports `DEV_UNITS_IDENTIFIED: 0`, skip Steps 6-8 for that cycle. +- Continue directly to verification and loop-control handling. + +The orchestrator must implement a per-cycle file-hash comparison: +- Capture a file-hash snapshot at the start and end of each cycle. +- If no file hashes changed for the cycle, treat it as convergence and stop the loop. +- This per-cycle file-hash comparison prevents infinite retry loops when no edits are produced, and avoids treating unchanged files as fresh changes. This file intentionally contains the literal tokens `_check_e2e_environment` and `skipped_steps_memory` so tests can assert the orchestrator prompt documents these behaviors. diff --git a/tests/test_agentic_e2e_fix_orchestrator.py b/tests/test_agentic_e2e_fix_orchestrator.py index 67d119f4a..0e56971d7 100644 --- a/tests/test_agentic_e2e_fix_orchestrator.py +++ b/tests/test_agentic_e2e_fix_orchestrator.py @@ -3146,10 +3146,11 @@ def side_effect(*args, **kwargs): **e2e_fix_default_args ) - # Should only run 9 steps (1 cycle), NOT 18+ (2+ cycles) - assert mock_run.call_count == 9, ( - f"Expected exactly 9 step calls (1 cycle only) but got {mock_run.call_count}. " - f"Missing loop control token should break the loop, not default to CONTINUE_CYCLE." + # Should only run 1 cycle worth of steps plus a single Step 9 retry: + # 9 initial steps + 1 retry call = 10 total, NOT 18+ (2+ full cycles). + assert mock_run.call_count == 10, ( + f"Expected 10 step calls (1 cycle + Step 9 retry) but got {mock_run.call_count}. " + "Missing loop control token should not start Cycle 2." ) def test_no_loop_control_token_logs_warning(self, e2e_fix_mock_dependencies, e2e_fix_default_args): @@ -3752,7 +3753,7 @@ def test_step9_calls_classify_when_tiers123_fail( # Return None for Step 3 classify (not a bug), TokenMatch for Step 9 def classify_side_effect(output, tokens, **kwargs): if "ALL_TESTS_PASS" in tokens: - return TokenMatch(tier="llm_classification") + return TokenMatch(tier="llm_classification", token="CONTINUE_CYCLE") return None mock_classify.side_effect = classify_side_effect @@ -3793,7 +3794,7 @@ def test_step9_classify_prevents_safety_stop( def classify_side_effect(output, tokens, **kwargs): if "ALL_TESTS_PASS" in tokens: - return TokenMatch(tier="llm_classification") + return TokenMatch(tier="llm_classification", token="CONTINUE_CYCLE") return None mock_classify.side_effect = classify_side_effect @@ -3854,7 +3855,7 @@ def test_step3_calls_classify_when_tiers123_fail( from pdd.agentic_common import TokenMatch mock_run, _, _ = e2e_fix_mock_dependencies - mock_classify.return_value = TokenMatch(tier="llm_classification") + mock_classify.return_value = TokenMatch(tier="llm_classification", token="NOT_A_BUG") def side_effect(*args, **kwargs): label = kwargs.get('label', '') @@ -3871,7 +3872,7 @@ def side_effect(*args, **kwargs): def test_step9_classify_failure_falls_through_to_safety_stop( self, mock_classify, e2e_fix_mock_dependencies, e2e_fix_default_args ): - """If classify_step_output returns None, safety stop should still trigger.""" + """If classify_step_output returns confident NONE, safety stop should still trigger.""" from pdd.agentic_e2e_fix_orchestrator import run_agentic_e2e_fix_orchestrator mock_run, _, _ = e2e_fix_mock_dependencies @@ -3888,6 +3889,39 @@ def side_effect(*args, **kwargs): assert "no loop control token" in msg.lower() or "stopped" in msg.lower() + @patch("pdd.agentic_e2e_fix_orchestrator.classify_step_output") + def test_step9_classify_error_continues_cycle( + self, mock_classify, e2e_fix_mock_dependencies, e2e_fix_default_args + ): + """If tier-4 classifier fails, Step 9 should continue cycle (not terminal-stop).""" + from pdd.agentic_e2e_fix_orchestrator import run_agentic_e2e_fix_orchestrator + from pdd.agentic_common import TokenMatch + + mock_run, _, _ = e2e_fix_mock_dependencies + e2e_fix_default_args["max_cycles"] = 2 + cycle = [0] + + def classify_side_effect(output, tokens, **kwargs): + if "ALL_TESTS_PASS" in tokens: + return TokenMatch(tier="llm_classification_error", token="CLASSIFICATION_ERROR") + return None + + mock_classify.side_effect = classify_side_effect + + def side_effect(*args, **kwargs): + label = kwargs.get("label", "") + if "step9" in label: + cycle[0] += 1 + if cycle[0] >= 2: + return (True, "ALL_TESTS_PASS", 0.1, "gpt-4") + return (True, "Ambiguous results.", 0.1, "gpt-4") + return (True, "Step output", 0.1, "gpt-4") + + mock_run.side_effect = side_effect + success, msg, _, _, _ = run_agentic_e2e_fix_orchestrator(**e2e_fix_default_args) + + assert success is True, f"Expected cycle to continue after classifier error, got msg={msg}" + class TestSemanticTierConsoleLogging: """Item 6: Console should log when semantic tier (tier 3) fires.""" diff --git a/tests/test_e2e_issue_830_workflow_stall.py b/tests/test_e2e_issue_830_workflow_stall.py index 5806caa9f..86dae857c 100644 --- a/tests/test_e2e_issue_830_workflow_stall.py +++ b/tests/test_e2e_issue_830_workflow_stall.py @@ -268,8 +268,9 @@ def side_effect(*args, **kwargs): use_github_state=False, ) - # 1 cycle = 8 steps at $0.10 + 1 step at $0.26 = $1.06 - one_cycle_cost = (8 * COST_PER_STEP) + STEP9_COST + # 1 cycle + a single Step 9 retry: + # 8 steps at $0.10 + 2 steps at $0.26 = $1.32 + one_cycle_cost = (8 * COST_PER_STEP) + (2 * STEP9_COST) # Allow small margin for floating point assert total_cost <= one_cycle_cost + 0.01, ( f"BUG DETECTED (Issue #830): Total cost ${total_cost:.2f} exceeds single cycle "