From 7e23bfb557f1d498f3b49061394a3440375f679b Mon Sep 17 00:00:00 2001 From: Kulesh Shanmugasundaram <164083+kulesh@users.noreply.github.com> Date: Thu, 5 Feb 2026 19:35:01 -0500 Subject: [PATCH 1/8] docs: add refactor plan and FLY analysis --- docs/analysis/fly-callgraph.md | 95 +++++++++++++ docs/analysis/fly-invariants.md | 45 ++++++ docs/codex-refactor-02052026.md | 236 ++++++++++++++++++++++++++++++++ 3 files changed, 376 insertions(+) create mode 100644 docs/analysis/fly-callgraph.md create mode 100644 docs/analysis/fly-invariants.md create mode 100644 docs/codex-refactor-02052026.md diff --git a/docs/analysis/fly-callgraph.md b/docs/analysis/fly-callgraph.md new file mode 100644 index 0000000..a86c3a4 --- /dev/null +++ b/docs/analysis/fly-callgraph.md @@ -0,0 +1,95 @@ +# FLY Call Graph (2026-02-05) + +This document maps the current FLY execution flow from UI actions down to orchestration and execution. The goal is to identify extraction boundaries for a dedicated execution controller. 
+ +## Entry Points (User Actions) + +- `FlyScreen.on_mount()` + - `coordinator.reset_stale_in_progress()` + - `_refresh_waypoint_list()` + - `_select_next_waypoint(include_in_progress=True)` + - `_update_git_status()` + timer + - `_update_project_metrics()` + +- `action_start()` + - Handles retry of selected failed waypoint + - Handles resume from `PAUSED` + - Handles start from `READY` or after `CHART_REVIEW` / `LAND_REVIEW` + - Transitions via `coordinator.transition(...)` + - Sets `execution_state = RUNNING` + - `_execute_current_waypoint()` + +- `action_pause()` + - Sets `execution_state = PAUSE_PENDING` + - Cancels executor if running (logs pause) + +- `action_skip()` + - Marks current waypoint skipped (via selection change) + - `_select_next_waypoint()` + +- `action_back()` + - Transitions `FLY_* -> CHART_REVIEW` + - Switches phase to `chart` + +- `action_forward()` + - Validates `LAND_REVIEW` availability + - `coordinator.transition(LAND_REVIEW)` + `_switch_to_land_screen()` + +- Intervention flow + - `_handle_intervention(...)` → `InterventionModal` → `_on_intervention_result(...)` + +## Execution Flow + +- `_execute_current_waypoint()` + - Marks waypoint `IN_PROGRESS` + saves flight plan + - Builds `WaypointExecutor` with callbacks and limits + - `run_worker(self._run_executor())` + +- `_run_executor()` + - `WaypointExecutor.execute()` → returns `ExecutionResult` + +- `on_worker_state_changed()` + - Handles `InterventionNeededError` or other failures + - Calls `_handle_execution_result(result)` + +- `_handle_execution_result(result)` + - SUCCESS + - Mark COMPLETE + save + - Commit via git (receipt validation) + - Parent epic check + - Select next waypoint + - If all complete: transition `LAND_REVIEW` + - INTERVENTION_NEEDED / MAX_ITERATIONS / FAILED + - Mark FAILED + - Transition `FLY_INTERVENTION` + - CANCELLED + - Transition `FLY_PAUSED` + +## Cross-Cutting Services + +- `JourneyCoordinator` + - Transition validation and persistence + - Waypoint 
selection and completion checks + +- `WaypointExecutor` + - Iterative execution loop + - Calls progress callback with `ExecutionContext` + +- `ExecutionLogReader` / `ExecutionLogWriter` + - Audit trail for each waypoint + +- `GitService` + `ReceiptValidator` + - Receipt validation + - Commit/tag integration + +--- + +## Extraction Boundary (Target) + +Introduce `ExecutionController` to own the flow currently distributed across `FlyScreen`: +- `start / pause / resume / skip / retry` +- State transitions +- Selection logic + execution sequencing +- Handling of `ExecutionResult` + +`FlyScreen` should become a thin UI layer: inputs, rendering, and modal handling. diff --git a/docs/analysis/fly-invariants.md b/docs/analysis/fly-invariants.md new file mode 100644 index 0000000..daecf69 --- /dev/null +++ b/docs/analysis/fly-invariants.md @@ -0,0 +1,45 @@ +# FLY Invariants (2026-02-05) + +These invariants define expected behavior in FLY execution. They should be preserved during refactor and enforced through tests. + +## State and Transition Invariants + +- `JourneyCoordinator.transition(...)` is the single source of truth for journey state transitions. +- `ExecutionState` is a UI execution mode, but must be consistent with `JourneyState`: + - `ExecutionState.RUNNING` implies `JourneyState.FLY_EXECUTING`. + - `ExecutionState.PAUSED` implies `JourneyState.FLY_PAUSED`. + - `ExecutionState.INTERVENTION` implies `JourneyState.FLY_INTERVENTION`. + - `ExecutionState.DONE` implies all waypoints complete and `JourneyState.LAND_REVIEW` is reachable. +- Non-recoverable states should not be persisted as resume checkpoints. + +## Waypoint Status Invariants + +- When execution starts, current waypoint becomes `IN_PROGRESS`. +- On success, waypoint must be marked `COMPLETE`, persisted, and logged. +- On intervention or failure, waypoint must be marked `FAILED` (or `SKIPPED` for explicit skips). 
+- Parent epic completion is checked after a child completes, but epics are not auto-completed. + +## Selection Invariants + +- Selection prefers resumable waypoints (`IN_PROGRESS`, `FAILED`) when resuming. +- Selection should not allow a waypoint whose dependencies are incomplete. +- Epics become eligible only when all children complete. + +## Execution Invariants + +- Execution uses `WaypointExecutor` exclusively. +- UI must remain responsive (execution runs in background worker). +- Progress updates are handled on main thread via `call_later`. +- `ExecutionResult` drives state transitions; no silent fall-through. + +## Logging and Metrics Invariants + +- Each waypoint execution produces an execution log. +- Cost and token metrics are updated after each waypoint. +- Receipt validation must occur before auto-commit. + +## Recovery Invariants + +- Stale `IN_PROGRESS` waypoints are reset to `PENDING` on screen mount. +- Intervention must surface a modal with explicit user action choices. +- Rollback is best-effort and must not corrupt the flight plan state. diff --git a/docs/codex-refactor-02052026.md b/docs/codex-refactor-02052026.md new file mode 100644 index 0000000..accb858 --- /dev/null +++ b/docs/codex-refactor-02052026.md @@ -0,0 +1,236 @@ +# Waypoints Refactor Plan (2026-02-05) + +**Goal**: Address the current philosophical shortcomings (simplicity drift in FLY, incomplete flight tests, missing decision records, residual TODOs) while strengthening domain boundaries, testability, and iteration discipline. + +This plan follows the Waypoints philosophy: bikes not Rube Goldberg; explicit domain language; alternatives considered; staged implementation; tests first; artifacts and UX quality treated as first-class. + +--- + +## 1) Problem Framing (Symptoms vs Root Causes) + +### Symptoms +- `src/waypoints/tui/screens/fly.py` mixes UI, orchestration, execution, git, and process management. 
+- Flight tests described in `docs/testing-strategy.md` are not implemented (only `flight-tests/self-host/`). +- Architectural decisions are not documented in a durable, discoverable format (no ADRs). +- TODOs indicate incomplete reliability paths (rollback, project status, prompt summarization). + +### Root Causes +- FLY phase lacks a dedicated orchestration boundary with a minimal interface. +- Testing strategy is documented but not operationalized into a repeatable pipeline. +- Decision making is visible in review docs but not captured as formal architectural records. +- Recovery and rollback are acknowledged but not embedded in execution flow. + +--- + +## 2) Solution Space (Alternatives) + +### A) Minimal Reorg (Low risk, lowest impact) +- Move a few helper methods out of `fly.py` and keep orchestration in screen. +- Add one flight test (L0) to prove the pipeline. +- Document a single ADR. + +**Pros**: Fast, minimal change. +**Cons**: Doesn’t address cross-layer coupling or reliability; doesn’t scale. + +### B) Domain-First Refactor (Recommended) +- Introduce a dedicated FLY orchestration service to separate UI from execution. +- Implement a flight test harness with L0–L2 coverage. +- Add a lightweight ADR system and document the most significant changes. + +**Pros**: Aligns with “bicycles,” clarifies boundaries, testable in isolation. +**Cons**: Moderate effort; requires careful migrations. + +### C) Full Protocol-Driven Execution (High impact, high risk) +- Redesign FLY as a strict protocol engine with structured reports and stateful iteration. +- Add schema validation for all JSONL artifacts. +- Build a full QA system for acceptance criteria. + +**Pros**: Strong correctness guarantees. +**Cons**: Large refactor; not necessary to address current shortcomings. + +**Chosen**: **B) Domain-First Refactor**. It fixes current issues while keeping scope tight and allowing iterative upgrades to protocol rigor later. 
+ +--- + +## 3) Design Principles and Ubiquitous Language + +### Domain Language +- **Execution Session**: the lifecycle of executing a flight plan. +- **Execution Controller**: domain service that governs run/pause/resume, metrics, and waypoint transitions. +- **Execution Report**: structured record emitted per waypoint attempt. +- **Flight Test**: an input spec with expected artifacts and a validation script. +- **Checkpoint**: a recoverable state boundary with persisted artifacts. + +### Boundaries +- **TUI Screen**: render state, bind keys, dispatch domain commands. +- **Orchestration Layer**: enforce business invariants and state transitions. +- **Execution Engine**: run a waypoint with a protocol, return a report. +- **Persistence Layer**: versioned artifacts, recovery, replay. + +--- + +## 4) Implementation Plan (Phased) + +### Phase 0 — Baseline and Discovery (1-2 sessions) +**Objective**: Confirm scope, inventory risk, and lock acceptance tests before refactor. + +**Tasks** +- Map current FLY flow (screen → coordinator → executor → logs) into a call graph. +- List all FLY entry points, side effects, and persistence paths. +- Identify contract surfaces for extraction (inputs, outputs, invariants). + +**Artifacts** +- `docs/analysis/fly-callgraph.md` (new) +- `docs/analysis/fly-invariants.md` (new) + +**Acceptance Criteria** +- All current FLY interactions documented and traceable to code. + +--- + +### Phase 1 — Extract Execution Controller (Core Refactor) +**Objective**: Remove orchestration logic from `fly.py` and centralize it in the domain layer. + +**Design** +- Create `src/waypoints/orchestration/execution_controller.py`. 
+- Provide a narrow interface: + - `start_execution()` + - `pause_execution()` + - `resume_execution()` + - `execute_next()` + - `handle_intervention()` +- The controller owns: + - transitions between `FLY_READY`, `FLY_EXECUTING`, `FLY_PAUSED`, `FLY_INTERVENTION`, `LAND_REVIEW` + - selection of next waypoint via coordinator + - metrics aggregation per waypoint + - persistence of execution reports/logs + +**Tasks** +- Move execution state transitions and waypoint selection logic into controller. +- Keep UI-specific concerns in `fly.py` (rendering, key bindings, modal display). +- Introduce an `ExecutionReport` data model in `src/waypoints/fly/`. + +**Tests (TDD)** +- Add `tests/test_execution_controller.py` with happy-path and failure-path tests. +- Ensure controller behavior is deterministic and easily mocked in TUI tests. + +**Acceptance Criteria** +- `fly.py` no longer manages execution state transitions directly. +- `ExecutionController` is test-covered and used by TUI. +- All existing tests pass. + +--- + +### Phase 2 — Flight Test Harness (BDD) +**Objective**: Operationalize the documented testing strategy with L0–L2 flight tests. + +**Design** +- Create structure: + - `flight-tests/L0-hello-world/` + - `flight-tests/L1-todo-cli/` + - `flight-tests/L2-rest-api/` +- For each flight test: + - `input/idea.txt` + - `expected/min_files.txt` + - `expected/smoke_test.sh` + - `results/<timestamp>/` (generated) + +**Tasks** +- Add a small runner in `scripts/run_flight_test.py`. +- Document usage in `docs/testing-strategy.md`. + +**Tests** +- Add `tests/test_flight_test_runner.py` to validate runner behavior. + +**Acceptance Criteria** +- L0–L2 tests are runnable and repeatable locally. +- Results are stored in timestamped directories. + +--- + +### Phase 3 — Decision Records (ADR system) +**Objective**: Capture architectural decisions in a durable, searchable format. + +**Design** +- Add `docs/adr/README.md` (index). 
+- Create ADRs for: + - FLY execution boundary extraction + - Flight test harness + - Execution report model + +**Acceptance Criteria** +- ADR index linked from `docs/README.md` and `README.md`. + +--- + +### Phase 4 — Reliability Polish (Targeted TODOs) +**Objective**: Resolve remaining TODOs that affect reliability and trust. + +**Tasks** +- Implement rollback in coordinator when GitService supports it (or define explicit TODO with issue ID). +- Add `status` to `Project` model if still missing. +- Replace prompt prefix usage in `llm/prompts/fly.py` with proper spec summary. + +**Acceptance Criteria** +- Every TODO reported by `rg "TODO"` in the core runtime is resolved or turned into a tracked issue. + +--- + +## 5) Acceptance Criteria (Global) + +- FLY orchestration is isolated in `ExecutionController` and test-covered. +- TUI screens are thin and focused on display and user interaction. +- L0–L2 flight tests can be executed with a single command. +- ADRs exist and are linked from doc indexes. +- No runtime TODOs remain untracked. + +--- + +## 6) Testing Strategy Alignment + +**Unit** +- `tests/test_execution_controller.py` +- `tests/test_flight_test_runner.py` + +**Integration** +- FLY screen tests should mock the controller and verify UI flow only. + +**BDD / Acceptance** +- L0–L2 flight tests with smoke tests and minimal expected files. + +--- + +## 7) Migration & Compatibility + +- Provide adapters in `fly.py` to minimize UI breakage during refactor. +- Keep existing log formats; add new `ExecutionReport` as additive data. +- If schema versioning is introduced, add migration in `models/schema.py`. + +--- + +## 8) Work Breakdown (Issue-Oriented) + +1. **Execution Controller extraction** +2. **Execution report model** +3. **TUI FLY screen integration** +4. **Flight test runner + L0** +5. **L1 and L2 flight tests** +6. **ADR system + first three ADRs** +7. 
**TODO reliability fixes** + +--- + +## 9) Definition of Done + +- New architecture reviewed, tests passing, and behavior preserved. +- All acceptance criteria satisfied. +- Docs updated and consistent with implementation. +- Flight tests operational and repeatable. + +--- + +## 10) Ownership and Iteration + +This plan is staged to keep every step testable and reviewable. Each phase is an MVP for the next: the execution controller enables better tests, the flight tests expose reliability gaps, and ADRs preserve context for future contributors. + +If any step requires scope expansion, create a new ADR and update the plan rather than silently extending complexity. From ec022b4a729adb4cbe3182998723c99c742964af Mon Sep 17 00:00:00 2001 From: Kulesh Shanmugasundaram <164083+kulesh@users.noreply.github.com> Date: Thu, 5 Feb 2026 19:35:11 -0500 Subject: [PATCH 2/8] feat(fly): extract execution controller --- src/waypoints/fly/execution_report.py | 21 + src/waypoints/fly/state.py | 16 + src/waypoints/orchestration/__init__.py | 6 + .../orchestration/execution_controller.py | 336 +++++++++++++++ src/waypoints/tui/screens/fly.py | 395 +++++------------- tests/test_execution_controller.py | 170 ++++++++ tests/test_fly_screen.py | 6 +- 7 files changed, 659 insertions(+), 291 deletions(-) create mode 100644 src/waypoints/fly/execution_report.py create mode 100644 src/waypoints/fly/state.py create mode 100644 src/waypoints/orchestration/execution_controller.py create mode 100644 tests/test_execution_controller.py diff --git a/src/waypoints/fly/execution_report.py b/src/waypoints/fly/execution_report.py new file mode 100644 index 0000000..5762ab4 --- /dev/null +++ b/src/waypoints/fly/execution_report.py @@ -0,0 +1,21 @@ +"""Execution report for a waypoint run.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from datetime import datetime + +from waypoints.fly.executor import ExecutionResult + + +@dataclass(frozen=True, slots=True) +class 
ExecutionReport: + """Structured summary of a waypoint execution attempt.""" + + waypoint_id: str + result: ExecutionResult + started_at: datetime | None = None + completed_at: datetime | None = None + iterations: int | None = None + total_iterations: int | None = None + criteria_completed: set[int] = field(default_factory=set) diff --git a/src/waypoints/fly/state.py b/src/waypoints/fly/state.py new file mode 100644 index 0000000..9cfd1c1 --- /dev/null +++ b/src/waypoints/fly/state.py @@ -0,0 +1,16 @@ +"""Execution state model for the FLY phase.""" + +from __future__ import annotations + +from enum import Enum + + +class ExecutionState(Enum): + """State of waypoint execution.""" + + IDLE = "idle" + RUNNING = "running" + PAUSE_PENDING = "pause_pending" # Pause requested, finishing current waypoint + PAUSED = "paused" + DONE = "done" + INTERVENTION = "intervention" diff --git a/src/waypoints/orchestration/__init__.py b/src/waypoints/orchestration/__init__.py index b8fc042..319ab02 100644 --- a/src/waypoints/orchestration/__init__.py +++ b/src/waypoints/orchestration/__init__.py @@ -13,6 +13,10 @@ """ from waypoints.orchestration.coordinator import JourneyCoordinator +from waypoints.orchestration.execution_controller import ( + ExecutionController, + ExecutionDirective, +) from waypoints.orchestration.types import ( ChunkCallback, CompletionStatus, @@ -24,6 +28,8 @@ __all__ = [ "JourneyCoordinator", + "ExecutionController", + "ExecutionDirective", "NextAction", "CompletionStatus", "ProgressCallback", diff --git a/src/waypoints/orchestration/execution_controller.py b/src/waypoints/orchestration/execution_controller.py new file mode 100644 index 0000000..c0be62a --- /dev/null +++ b/src/waypoints/orchestration/execution_controller.py @@ -0,0 +1,336 @@ +"""Execution controller for FLY phase orchestration.""" + +from __future__ import annotations + +import logging +from collections.abc import Callable +from dataclasses import dataclass +from datetime import UTC, datetime 
+from typing import TYPE_CHECKING, Literal + +from waypoints.fly.execution_report import ExecutionReport +from waypoints.fly.executor import ( + ExecutionContext, + ExecutionResult, + WaypointExecutor, +) +from waypoints.fly.intervention import Intervention, InterventionAction, InterventionResult +from waypoints.fly.state import ExecutionState +from waypoints.models import JourneyState, Waypoint, WaypointStatus +from waypoints.orchestration.coordinator import JourneyCoordinator + +if TYPE_CHECKING: + from waypoints.llm.metrics import MetricsCollector + +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True, slots=True) +class ExecutionDirective: + """Directive returned by the controller after an execution event.""" + + action: Literal["execute", "pause", "intervention", "land", "noop"] + waypoint: Waypoint | None = None + message: str | None = None + completed: Waypoint | None = None + + +class ExecutionController: + """Orchestrates execution logic for the FLY phase.""" + + def __init__(self, coordinator: JourneyCoordinator) -> None: + self.coordinator = coordinator + self.execution_state = ExecutionState.IDLE + self._current_intervention: Intervention | None = None + self._additional_iterations = 0 + self._execution_started_at: datetime | None = None + self.last_report: ExecutionReport | None = None + + @property + def current_waypoint(self) -> Waypoint | None: + """Get the current waypoint from the coordinator.""" + return self.coordinator.current_waypoint + + @current_waypoint.setter + def current_waypoint(self, waypoint: Waypoint | None) -> None: + """Set the current waypoint on the coordinator.""" + self.coordinator.current_waypoint = waypoint + + def initialize(self) -> None: + """Initialize execution state on screen mount.""" + self.coordinator.reset_stale_in_progress() + self.select_next_waypoint(include_in_progress=True) + + def select_next_waypoint(self, include_in_progress: bool = False) -> Waypoint | None: + """Select the next eligible 
waypoint and update execution state.""" + wp = self.coordinator.select_next_waypoint(include_failed=include_in_progress) + if wp: + return wp + + status = self.coordinator.get_completion_status() + pending = status.pending + status.in_progress + + if status.all_complete: + self.execution_state = ExecutionState.DONE + elif status.blocked > 0 or pending > 0 or status.failed > 0: + self.execution_state = ExecutionState.PAUSED + else: + self.execution_state = ExecutionState.PAUSED + + return None + + def start(self, selected_waypoint: Waypoint | None) -> ExecutionDirective: + """Start or resume execution.""" + if selected_waypoint and selected_waypoint.status == WaypointStatus.FAILED: + selected_waypoint.status = WaypointStatus.PENDING + self.coordinator.save_flight_plan() + self.current_waypoint = selected_waypoint + self._transition_to_executing() + self.execution_state = ExecutionState.RUNNING + return ExecutionDirective( + action="execute", + waypoint=selected_waypoint, + message=f"Retrying {selected_waypoint.id}", + ) + + if self.execution_state == ExecutionState.DONE: + status = self.coordinator.get_completion_status() + if status.failed > 0 or status.blocked > 0: + return ExecutionDirective( + action="noop", + message="Select a failed waypoint and press 'r' to retry", + ) + return ExecutionDirective(action="noop", message="All waypoints complete!") + + if self.execution_state == ExecutionState.PAUSED: + self.select_next_waypoint(include_in_progress=True) + if not self.current_waypoint: + status = self.coordinator.get_completion_status() + if status.failed > 0: + return ExecutionDirective( + action="noop", + message="Select a failed waypoint and press 'r' to retry", + ) + return ExecutionDirective(action="noop", message="No waypoints to resume") + self._transition_to_executing() + self.execution_state = ExecutionState.RUNNING + return ExecutionDirective(action="execute", waypoint=self.current_waypoint) + + if not self.current_waypoint: + 
self.select_next_waypoint() + if not self.current_waypoint: + return ExecutionDirective( + action="noop", message="No waypoints ready to execute" + ) + + self._transition_to_executing() + self.execution_state = ExecutionState.RUNNING + return ExecutionDirective(action="execute", waypoint=self.current_waypoint) + + def request_pause(self) -> bool: + """Request pause after current waypoint.""" + if self.execution_state != ExecutionState.RUNNING: + return False + self.execution_state = ExecutionState.PAUSE_PENDING + return True + + def build_executor( + self, + *, + waypoint: Waypoint, + spec: str, + on_progress: Callable[[ExecutionContext], None] | None, + max_iterations: int, + metrics_collector: "MetricsCollector | None", + host_validations_enabled: bool, + ) -> WaypointExecutor: + """Create a WaypointExecutor and mark waypoint as in progress.""" + waypoint.status = WaypointStatus.IN_PROGRESS + self.coordinator.save_flight_plan() + self._execution_started_at = datetime.now(UTC) + + total_iterations = max_iterations + self.consume_additional_iterations() + return WaypointExecutor( + project=self.coordinator.project, + waypoint=waypoint, + spec=spec, + on_progress=on_progress, + max_iterations=total_iterations, + metrics_collector=metrics_collector, + host_validations_enabled=host_validations_enabled, + ) + + def handle_execution_result( + self, result: ExecutionResult | None + ) -> ExecutionDirective: + """Handle execution result and update state.""" + waypoint = self.current_waypoint + completed_at = datetime.now(UTC) + normalized = result or ExecutionResult.FAILED + + if waypoint: + self.last_report = ExecutionReport( + waypoint_id=waypoint.id, + result=normalized, + started_at=self._execution_started_at, + completed_at=completed_at, + ) + + if normalized == ExecutionResult.SUCCESS: + if waypoint: + waypoint.status = WaypointStatus.COMPLETE + waypoint.completed_at = completed_at + self.coordinator.save_flight_plan() + 
self.coordinator.check_parent_completion(waypoint) + + if self.execution_state == ExecutionState.PAUSE_PENDING: + self.coordinator.transition(JourneyState.FLY_PAUSED) + self.execution_state = ExecutionState.PAUSED + return ExecutionDirective( + action="pause", waypoint=waypoint, completed=waypoint + ) + + if self.execution_state == ExecutionState.RUNNING: + next_wp = self.select_next_waypoint() + if next_wp: + return ExecutionDirective( + action="execute", + waypoint=next_wp, + completed=waypoint, + ) + if self.execution_state == ExecutionState.DONE: + self.coordinator.transition(JourneyState.LAND_REVIEW) + return ExecutionDirective(action="land", completed=waypoint) + return ExecutionDirective(action="pause", completed=waypoint) + + return ExecutionDirective(action="noop", completed=waypoint) + + if normalized == ExecutionResult.CANCELLED: + self.coordinator.transition(JourneyState.FLY_PAUSED) + self.execution_state = ExecutionState.PAUSED + return ExecutionDirective(action="pause", waypoint=waypoint) + + if normalized in ( + ExecutionResult.INTERVENTION_NEEDED, + ExecutionResult.MAX_ITERATIONS, + ExecutionResult.FAILED, + ): + self._mark_waypoint_failed() + self.coordinator.transition(JourneyState.FLY_INTERVENTION) + self.execution_state = ExecutionState.INTERVENTION + message = self._result_message(normalized) + return ExecutionDirective( + action="intervention", waypoint=waypoint, message=message + ) + + self._mark_waypoint_failed() + self.coordinator.transition(JourneyState.FLY_INTERVENTION) + self.execution_state = ExecutionState.INTERVENTION + return ExecutionDirective( + action="intervention", + waypoint=waypoint, + message="Waypoint execution failed", + ) + + def prepare_intervention(self, intervention: Intervention) -> ExecutionDirective: + """Record an intervention and transition state.""" + self._current_intervention = intervention + self._mark_waypoint_failed() + self.coordinator.transition(JourneyState.FLY_INTERVENTION) + self.execution_state = 
ExecutionState.INTERVENTION + return ExecutionDirective( + action="intervention", + waypoint=intervention.waypoint, + message=intervention.error_summary, + ) + + def resolve_intervention( + self, result: InterventionResult | None + ) -> ExecutionDirective: + """Resolve an intervention and return next directive.""" + if result is None: + return ExecutionDirective(action="noop", message="Intervention cancelled") + + if not self._current_intervention: + return ExecutionDirective(action="noop", message="No intervention to resolve") + + waypoint = self._current_intervention.waypoint + + if result.action == InterventionAction.RETRY: + self._additional_iterations = result.additional_iterations + waypoint.status = WaypointStatus.IN_PROGRESS + self.coordinator.save_flight_plan() + self.coordinator.transition(JourneyState.FLY_EXECUTING) + self.execution_state = ExecutionState.RUNNING + self._current_intervention = None + return ExecutionDirective(action="execute", waypoint=waypoint) + + if result.action == InterventionAction.SKIP: + waypoint.status = WaypointStatus.SKIPPED + self.coordinator.save_flight_plan() + self.coordinator.transition(JourneyState.FLY_PAUSED) + self.coordinator.transition(JourneyState.FLY_EXECUTING) + self.execution_state = ExecutionState.RUNNING + next_wp = self.select_next_waypoint() + self._current_intervention = None + if next_wp: + return ExecutionDirective(action="execute", waypoint=next_wp) + if self.execution_state == ExecutionState.DONE: + self.coordinator.transition(JourneyState.LAND_REVIEW) + return ExecutionDirective(action="land") + return ExecutionDirective(action="pause") + + if result.action == InterventionAction.EDIT: + self.coordinator.transition(JourneyState.FLY_PAUSED) + self.execution_state = ExecutionState.PAUSED + self._current_intervention = None + return ExecutionDirective( + action="pause", + message="Edit waypoint in flight plan, then retry", + ) + + if result.action == InterventionAction.ROLLBACK: + 
self.coordinator.transition(JourneyState.FLY_PAUSED) + self.coordinator.transition(JourneyState.FLY_READY) + self.execution_state = ExecutionState.IDLE + self._current_intervention = None + return ExecutionDirective(action="noop", message="Rollback requested") + + if result.action == InterventionAction.ABORT: + self.coordinator.transition(JourneyState.FLY_PAUSED) + self.execution_state = ExecutionState.PAUSED + self._current_intervention = None + return ExecutionDirective(action="pause", message="Execution aborted") + + self._current_intervention = None + return ExecutionDirective(action="pause") + + def consume_additional_iterations(self) -> int: + """Consume additional iterations requested during intervention.""" + extra = self._additional_iterations + self._additional_iterations = 0 + return extra + + def _transition_to_executing(self) -> None: + journey = self.coordinator.project.journey + if journey and journey.state in ( + JourneyState.CHART_REVIEW, + JourneyState.LAND_REVIEW, + ): + self.coordinator.transition(JourneyState.FLY_READY) + self.coordinator.transition(JourneyState.FLY_EXECUTING) + + def _mark_waypoint_failed(self) -> None: + if self.current_waypoint: + self.current_waypoint.status = WaypointStatus.FAILED + self.coordinator.save_flight_plan() + + @staticmethod + def _result_message(result: ExecutionResult) -> str: + if result == ExecutionResult.INTERVENTION_NEEDED: + return "Human intervention needed" + if result == ExecutionResult.MAX_ITERATIONS: + return "Max iterations reached" + if result == ExecutionResult.FAILED: + return "Execution failed" + return "Execution failed" diff --git a/src/waypoints/tui/screens/fly.py b/src/waypoints/tui/screens/fly.py index deccd50..62dcfbe 100644 --- a/src/waypoints/tui/screens/fly.py +++ b/src/waypoints/tui/screens/fly.py @@ -4,7 +4,6 @@ import re import subprocess from datetime import UTC, datetime -from enum import Enum from pathlib import Path from typing import TYPE_CHECKING, Any, cast @@ -23,9 +22,7 @@ 
from waypoints.git.receipt import ChecklistReceipt from waypoints.tui.app import WaypointsApp -from waypoints.fly.execution_log import ( - ExecutionLog as ExecLogType, -) +from waypoints.fly.execution_log import ExecutionLog as ExecLogType from waypoints.fly.execution_log import ( ExecutionLogReader, ) @@ -41,11 +38,12 @@ InterventionNeededError, InterventionResult, ) +from waypoints.fly.state import ExecutionState from waypoints.git import GitConfig, GitService, ReceiptValidator from waypoints.models import JourneyState, Project from waypoints.models.flight_plan import FlightPlan from waypoints.models.waypoint import Waypoint, WaypointStatus -from waypoints.orchestration import JourneyCoordinator +from waypoints.orchestration import ExecutionController, JourneyCoordinator from waypoints.tui.screens.intervention import InterventionModal from waypoints.tui.utils import ( format_token_count, @@ -95,17 +93,6 @@ def _format_project_metrics( return " · ".join(parts) if parts else "" -class ExecutionState(Enum): - """State of waypoint execution.""" - - IDLE = "idle" - RUNNING = "running" - PAUSE_PENDING = "pause_pending" # Pause requested, finishing current waypoint - PAUSED = "paused" - DONE = "done" - INTERVENTION = "intervention" - - # Regex patterns for markdown CODE_BLOCK_PATTERN = re.compile(r"```(\w+)?\n(.*?)```", re.DOTALL) BOLD_PATTERN = re.compile(r"\*\*(.+?)\*\*") @@ -1256,10 +1243,9 @@ def __init__( project=project, flight_plan=flight_plan, ) + self.execution_controller = ExecutionController(self.coordinator) self._executor: WaypointExecutor | None = None - self._current_intervention: Intervention | None = None - self._additional_iterations: int = 0 # Timer tracking self._execution_start: datetime | None = None @@ -1276,12 +1262,12 @@ def waypoints_app(self) -> "WaypointsApp": @property def current_waypoint(self) -> Waypoint | None: """Get the currently selected waypoint (delegated to coordinator).""" - return self.coordinator.current_waypoint + return 
self.execution_controller.current_waypoint @current_waypoint.setter def current_waypoint(self, waypoint: Waypoint | None) -> None: """Set the currently selected waypoint (delegated to coordinator).""" - self.coordinator.current_waypoint = waypoint + self.execution_controller.current_waypoint = waypoint def compose(self) -> ComposeResult: yield StatusHeader() @@ -1313,14 +1299,17 @@ def on_mount(self) -> None: # Reflect initial state in status bar self._update_status_bar(self.execution_state) - # Clean up stale IN_PROGRESS from previous sessions (via coordinator) - self.coordinator.reset_stale_in_progress() + # Clean up stale IN_PROGRESS from previous sessions and select next waypoint + self.execution_controller.initialize() # Update waypoint list with cost data self._refresh_waypoint_list() - # Select resumable waypoint (failed/in-progress) or first pending - self._select_next_waypoint(include_in_progress=True) + # Sync UI with selected waypoint (if any) + self._sync_current_waypoint_details() + + # Sync execution state after initialization + self.execution_state = self.execution_controller.execution_state # Update status bar with initial state (watcher doesn't fire on mount) self._update_status_bar(self.execution_state) @@ -1424,6 +1413,24 @@ def _get_waypoint_tokens(self, waypoint_id: str) -> tuple[int, int] | None: return tokens_by_waypoint.get(waypoint_id) return None + def _sync_current_waypoint_details( + self, active_waypoint_id: str | None = None + ) -> None: + """Sync the detail panel with the current waypoint.""" + if not self.current_waypoint: + return + + detail_panel = self.query_one("#waypoint-detail", WaypointDetailPanel) + cost = self._get_waypoint_cost(self.current_waypoint.id) + tokens = self._get_waypoint_tokens(self.current_waypoint.id) + detail_panel.show_waypoint( + self.current_waypoint, + project=self.project, + active_waypoint_id=active_waypoint_id, + cost=cost, + tokens=tokens, + ) + def _get_completion_status(self) -> tuple[bool, int, 
int, int]: """Analyze waypoint completion status. @@ -1449,40 +1456,19 @@ def _select_next_waypoint(self, include_in_progress: bool = False) -> None: "=== Selection round (include_in_progress=%s) ===", include_in_progress ) - # Delegate selection to coordinator - wp = self.coordinator.select_next_waypoint(include_failed=include_in_progress) + # Delegate selection to execution controller + wp = self.execution_controller.select_next_waypoint( + include_in_progress=include_in_progress + ) if wp: # Waypoint selected - update UI - logger.info("SELECTED %s via coordinator", wp.id) - detail_panel = self.query_one("#waypoint-detail", WaypointDetailPanel) - cost = self._get_waypoint_cost(wp.id) - tokens = self._get_waypoint_tokens(wp.id) - detail_panel.show_waypoint( - wp, - project=self.project, - active_waypoint_id=None, - cost=cost, - tokens=tokens, - ) + logger.info("SELECTED %s via execution controller", wp.id) + self._sync_current_waypoint_details() return - # No eligible waypoints found - check why - all_complete, pending, failed, blocked = self._get_completion_status() - - if all_complete: - logger.info("All waypoints complete - DONE") - self.execution_state = ExecutionState.DONE - elif blocked > 0: - logger.info("Waypoints blocked by %d failed waypoint(s)", failed) - self.execution_state = ExecutionState.PAUSED - elif pending > 0: - logger.info("%d waypoints pending with unmet dependencies", pending) - self.execution_state = ExecutionState.PAUSED - else: - # Only failed waypoints remain - logger.info("Only failed waypoints remain (%d)", failed) - self.execution_state = ExecutionState.PAUSED + # No eligible waypoints found - sync state from controller + self.execution_state = self.execution_controller.execution_state def _get_state_message(self, state: ExecutionState) -> str: """Get the status bar message for a given execution state.""" @@ -1611,89 +1597,20 @@ def watch_execution_state(self, state: ExecutionState) -> None: def action_start(self) -> None: """Start or 
resume waypoint execution.""" - # Check if user has selected a specific failed waypoint to retry list_panel = self.query_one("#waypoint-list", WaypointListPanel) selected = list_panel.selected_waypoint - if selected and selected.status == WaypointStatus.FAILED: - # User wants to retry this specific failed waypoint - selected.status = WaypointStatus.PENDING - self._save_flight_plan() - self._refresh_waypoint_list() - self.current_waypoint = selected - self.notify(f"Retrying {selected.id}") - - # Update detail panel to show this waypoint - detail_panel = self.query_one("#waypoint-detail", WaypointDetailPanel) - cost = self._get_waypoint_cost(selected.id) - tokens = self._get_waypoint_tokens(selected.id) - detail_panel.show_waypoint( - selected, - project=self.project, - active_waypoint_id=None, - cost=cost, - tokens=tokens, - ) - - # Transition journey state and execute - journey = self.project.journey - if journey and journey.state in ( - JourneyState.FLY_PAUSED, - JourneyState.FLY_INTERVENTION, - ): - self.coordinator.transition(JourneyState.FLY_EXECUTING) - elif journey and journey.state == JourneyState.CHART_REVIEW: - self.coordinator.transition(JourneyState.FLY_READY) - self.coordinator.transition(JourneyState.FLY_EXECUTING) - else: - self.coordinator.transition(JourneyState.FLY_EXECUTING) - self.execution_state = ExecutionState.RUNNING - self._execute_current_waypoint() - return + directive = self.execution_controller.start(selected) + if directive.message: + self.notify(directive.message) - if self.execution_state == ExecutionState.DONE: - # Check if there are actually failed waypoints to retry - _, _, failed, blocked = self._get_completion_status() - if failed > 0 or blocked > 0: - self.notify("Select a failed waypoint and press 'r' to retry") - else: - self.notify("All waypoints complete!") - return + self.execution_state = self.execution_controller.execution_state - # Handle resume from paused state - if self.execution_state == ExecutionState.PAUSED: - # 
Find waypoint to resume (in_progress first, then pending) - self._select_next_waypoint(include_in_progress=True) - if not self.current_waypoint: - # Check if there are failed waypoints user could retry - _, _, failed, blocked = self._get_completion_status() - if failed > 0: - self.notify("Select a failed waypoint and press 'r' to retry") - else: - self.notify("No waypoints to resume") - return - # Transition journey state: FLY_PAUSED -> FLY_EXECUTING - self.coordinator.transition(JourneyState.FLY_EXECUTING) - self.execution_state = ExecutionState.RUNNING - self._execute_current_waypoint() + if directive.action != "execute": return - if not self.current_waypoint: - self._select_next_waypoint() - if not self.current_waypoint: - self.notify("No waypoints ready to execute") - return - - # Transition journey state to FLY_EXECUTING - # Handle case where we came from Chart or Land (CHART_REVIEW/LAND_REVIEW) - journey = self.project.journey - if journey and journey.state in ( - JourneyState.CHART_REVIEW, - JourneyState.LAND_REVIEW, - ): - self.coordinator.transition(JourneyState.FLY_READY) - self.coordinator.transition(JourneyState.FLY_EXECUTING) - self.execution_state = ExecutionState.RUNNING + self._refresh_waypoint_list() + self._sync_current_waypoint_details() self._execute_current_waypoint() def action_toggle_host_validations(self) -> None: @@ -1714,8 +1631,8 @@ def action_toggle_host_validations(self) -> None: def action_pause(self) -> None: """Pause execution after current waypoint.""" - if self.execution_state == ExecutionState.RUNNING: - self.execution_state = ExecutionState.PAUSE_PENDING + if self.execution_controller.request_pause(): + self.execution_state = self.execution_controller.execution_state if self._executor: self._executor.cancel() # Log pause request @@ -1815,10 +1732,6 @@ def _execute_current_waypoint(self) -> None: detail_panel = self.query_one("#waypoint-detail", WaypointDetailPanel) log = detail_panel.execution_log - # Update status to IN_PROGRESS 
- self.current_waypoint.status = WaypointStatus.IN_PROGRESS - self._save_flight_plan() - # Mark this as the active waypoint for output tracking detail_panel._showing_output_for = self.current_waypoint.id @@ -1839,16 +1752,12 @@ def _execute_current_waypoint(self) -> None: # Calculate max iterations (default + any additional from retry) from waypoints.fly.executor import MAX_ITERATIONS - max_iters = MAX_ITERATIONS + self._additional_iterations - self._additional_iterations = 0 # Reset for next execution - # Create executor with progress callback - self._executor = WaypointExecutor( - project=self.project, + self._executor = self.execution_controller.build_executor( waypoint=self.current_waypoint, spec=self.spec, on_progress=self._on_execution_progress, - max_iterations=max_iters, + max_iterations=MAX_ITERATIONS, metrics_collector=self.waypoints_app.metrics_collector, host_validations_enabled=self.waypoints_app.host_validations_enabled, ) @@ -1973,98 +1882,64 @@ def _handle_execution_result(self, result: ExecutionResult | None) -> None: detail_panel = self.query_one("#waypoint-detail", WaypointDetailPanel) log = detail_panel.execution_log + completed_waypoint = self.current_waypoint + # Update header cost display after execution self.waypoints_app.update_header_cost() # Update project metrics (cost and time) after execution self._update_project_metrics() - if self.current_waypoint: - cost = self._get_waypoint_cost(self.current_waypoint.id) - tokens = self._get_waypoint_tokens(self.current_waypoint.id) + if completed_waypoint: + cost = self._get_waypoint_cost(completed_waypoint.id) + tokens = self._get_waypoint_tokens(completed_waypoint.id) detail_panel.update_metrics(cost, tokens) - if result == ExecutionResult.SUCCESS: - # Mark complete - if self.current_waypoint: - self.current_waypoint.status = WaypointStatus.COMPLETE - self.current_waypoint.completed_at = datetime.now(UTC) - log.log_success(f"Waypoint {self.current_waypoint.id} complete!") - - 
self._live_criteria_completed = ( - ExecutionLogReader.get_completed_criteria( - self.project, - self.current_waypoint.id, - ) - ) + directive = self.execution_controller.handle_execution_result(result) + self.execution_state = self.execution_controller.execution_state - # Show verification summary - self._log_verification_summary(self.current_waypoint, log) + if directive.completed: + log.log_success(f"Waypoint {directive.completed.id} complete!") - # Check if parent epic should be auto-completed - self._check_parent_completion(self.current_waypoint) + self._live_criteria_completed = ExecutionLogReader.get_completed_criteria( + self.project, + directive.completed.id, + ) - self._save_flight_plan() + # Show verification summary + self._log_verification_summary(directive.completed, log) - # Commit waypoint completion (validates receipt first) - self._commit_waypoint(self.current_waypoint) + # Commit waypoint completion (validates receipt first) + self._commit_waypoint(directive.completed) - # Reset live criteria tracking for next waypoint - self._live_criteria_completed = set() + # Reset live criteria tracking for next waypoint + self._live_criteria_completed = set() detail_panel.clear_iteration() self._refresh_waypoint_list() - # Move to next waypoint if not paused/pausing - if self.execution_state == ExecutionState.RUNNING: - self._select_next_waypoint() - if self.current_waypoint: - self._execute_current_waypoint() - else: - # _select_next_waypoint sets execution_state appropriately - # Only transition to LAND_REVIEW if truly all complete - # (state is DONE). mypy doesn't track state modification. 
- if self.execution_state == ExecutionState.DONE: # type: ignore[comparison-overlap] - self.coordinator.transition(JourneyState.LAND_REVIEW) - self._switch_to_land_screen() - elif self.execution_state == ExecutionState.PAUSE_PENDING: - # Pause was requested, now actually pause - # Transition journey state: FLY_EXECUTING -> FLY_PAUSED - self.coordinator.transition(JourneyState.FLY_PAUSED) - self.execution_state = ExecutionState.PAUSED + if directive.action == "execute": + self._sync_current_waypoint_details() + self._execute_current_waypoint() + return - elif result == ExecutionResult.INTERVENTION_NEEDED: - log.log_error("Human intervention needed") - self._mark_waypoint_failed() - # Transition journey state: FLY_EXECUTING -> FLY_INTERVENTION - self.coordinator.transition(JourneyState.FLY_INTERVENTION) - self.execution_state = ExecutionState.INTERVENTION - self.query_one(StatusHeader).set_error() - self.notify("Waypoint needs human intervention", severity="warning") - - elif result == ExecutionResult.MAX_ITERATIONS: - log.log_error("Max iterations reached without completion") - self._mark_waypoint_failed() - # Transition journey state: FLY_EXECUTING -> FLY_INTERVENTION - self.coordinator.transition(JourneyState.FLY_INTERVENTION) - self.execution_state = ExecutionState.INTERVENTION - self.query_one(StatusHeader).set_error() - self.notify("Max iterations reached", severity="error") - - elif result == ExecutionResult.CANCELLED: - log.write_log("Execution cancelled") - # Transition journey state: FLY_EXECUTING -> FLY_PAUSED - self.coordinator.transition(JourneyState.FLY_PAUSED) - self.execution_state = ExecutionState.PAUSED - - else: # FAILED or None - log.log_error("Execution failed") - self._mark_waypoint_failed() - # Transition journey state: FLY_EXECUTING -> FLY_INTERVENTION - self.coordinator.transition(JourneyState.FLY_INTERVENTION) - self.execution_state = ExecutionState.INTERVENTION + if directive.action == "land": + self._switch_to_land_screen() + return + + if 
directive.action == "intervention": + log.log_error(directive.message or "Human intervention needed") self.query_one(StatusHeader).set_error() - self.notify("Waypoint execution failed", severity="error") + if directive.message: + self.notify(directive.message, severity="warning") + self._refresh_waypoint_list() + return + + if directive.action == "pause": + if result == ExecutionResult.CANCELLED: + log.write_log("Execution cancelled") + self._refresh_waypoint_list() + return def _handle_intervention(self, intervention: Intervention) -> None: """Handle an intervention request by showing the modal.""" @@ -2076,15 +1951,9 @@ def _handle_intervention(self, intervention: Intervention) -> None: log.log_error(f"Intervention needed: {type_label}") log.write_log(intervention.error_summary[:500]) - # Store the intervention for retry handling - self._current_intervention = intervention - - # Mark waypoint as failed (can be retried) - self._mark_waypoint_failed() - - # Transition journey state: FLY_EXECUTING -> FLY_INTERVENTION - self.coordinator.transition(JourneyState.FLY_INTERVENTION) - self.execution_state = ExecutionState.INTERVENTION + # Record the intervention and update state + self.execution_controller.prepare_intervention(intervention) + self.execution_state = self.execution_controller.execution_state self.query_one(StatusHeader).set_error() # Show the intervention modal @@ -2118,82 +1987,38 @@ def _on_intervention_result(self, result: InterventionResult | None) -> None: ) if result.action == InterventionAction.RETRY: - # Retry with additional iterations log.write_log( f"Retrying with {result.additional_iterations} additional iterations" ) - self._additional_iterations = result.additional_iterations - - # Reset waypoint status for retry - if self.current_waypoint: - self.current_waypoint.status = WaypointStatus.IN_PROGRESS - self._save_flight_plan() - self._refresh_waypoint_list() - - # Transition: FLY_INTERVENTION -> FLY_EXECUTING - 
self.coordinator.transition(JourneyState.FLY_EXECUTING) - self.execution_state = ExecutionState.RUNNING - self.query_one(StatusHeader).set_normal() - self._execute_current_waypoint() - elif result.action == InterventionAction.SKIP: - # Skip this waypoint and move to next log.write_log("Skipping waypoint") - if self.current_waypoint: - self.current_waypoint.status = WaypointStatus.SKIPPED - self._save_flight_plan() - self._refresh_waypoint_list() - - # Transition: FLY_INTERVENTION -> FLY_PAUSED -> FLY_EXECUTING - self.coordinator.transition(JourneyState.FLY_PAUSED) - self.coordinator.transition(JourneyState.FLY_EXECUTING) - self.execution_state = ExecutionState.RUNNING - self.query_one(StatusHeader).set_normal() - self._select_next_waypoint() - if self.current_waypoint: - self._execute_current_waypoint() - else: - # _select_next_waypoint sets execution_state appropriately - # Only transition to LAND_REVIEW and notify if truly all complete - if self.execution_state == ExecutionState.DONE: - self.coordinator.transition(JourneyState.LAND_REVIEW) - self.notify("All waypoints complete!") - self._switch_to_land_screen() - elif result.action == InterventionAction.EDIT: - # Open waypoint editor - for now, just notify log.write_log("Edit waypoint requested") self.notify( "Edit waypoint in flight plan, then press 'r' to retry", severity="information", ) - # Stay in intervention state until user edits and retries - # Transition: FLY_INTERVENTION -> FLY_PAUSED - self.coordinator.transition(JourneyState.FLY_PAUSED) - self.execution_state = ExecutionState.PAUSED - self.query_one(StatusHeader).set_normal() - elif result.action == InterventionAction.ROLLBACK: - # Rollback to last safe tag log.write_log("Rolling back to last safe tag") self._rollback_to_safe_tag(result.rollback_tag) - # Transition: FLY_INTERVENTION -> FLY_READY - self.coordinator.transition(JourneyState.FLY_PAUSED) - self.coordinator.transition(JourneyState.FLY_READY) - self.execution_state = ExecutionState.IDLE - 
self.query_one(StatusHeader).set_normal() - elif result.action == InterventionAction.ABORT: - # Abort execution log.write_log("Execution aborted") - # Transition: FLY_INTERVENTION -> FLY_PAUSED - self.coordinator.transition(JourneyState.FLY_PAUSED) - self.execution_state = ExecutionState.PAUSED - self.query_one(StatusHeader).set_normal() self.notify("Execution aborted") - # Clear the current intervention - self._current_intervention = None + directive = self.execution_controller.resolve_intervention(result) + if directive.message: + self.notify(directive.message) + self.execution_state = self.execution_controller.execution_state + self.query_one(StatusHeader).set_normal() + + self._refresh_waypoint_list() + + if directive.action == "execute": + self._sync_current_waypoint_details() + self._execute_current_waypoint() + elif directive.action == "land": + self.notify("All waypoints complete!") + self._switch_to_land_screen() def _rollback_to_safe_tag(self, tag: str | None) -> None: """Rollback git to the specified tag or find the last safe one.""" @@ -2231,14 +2056,6 @@ def _rollback_to_safe_tag(self, tag: str | None) -> None: else: self.notify(f"Rollback failed: {result.message}", severity="error") - def _mark_waypoint_failed(self) -> None: - """Mark the current waypoint as failed and save.""" - if self.current_waypoint: - self.current_waypoint.status = WaypointStatus.FAILED - self._save_flight_plan() - # Update the tree display - self._refresh_waypoint_list() - def _refresh_waypoint_list( self, execution_state: ExecutionState | None = None ) -> None: diff --git a/tests/test_execution_controller.py b/tests/test_execution_controller.py new file mode 100644 index 0000000..6e5c168 --- /dev/null +++ b/tests/test_execution_controller.py @@ -0,0 +1,170 @@ +"""Tests for ExecutionController behavior.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from waypoints.fly.executor import ExecutionResult +from waypoints.fly.intervention 
import Intervention, InterventionAction, InterventionResult, InterventionType +from waypoints.fly.state import ExecutionState +from waypoints.models.flight_plan import FlightPlan +from waypoints.models.journey import Journey, JourneyState +from waypoints.models.waypoint import Waypoint, WaypointStatus +from waypoints.orchestration import ExecutionController, JourneyCoordinator + + +class DummyProject: + """Minimal project stub for execution controller tests.""" + + def __init__(self, *, journey_state: JourneyState) -> None: + self.slug = "test-project" + self.journey = Journey(state=journey_state, project_slug=self.slug) + + def get_path(self) -> Path: + return Path("/tmp/test-project") + + def save(self) -> None: + return None + + def transition_journey(self, target: JourneyState, reason: str | None = None) -> None: + if self.journey is None: + self.journey = Journey.new(self.slug) + self.journey = self.journey.transition(target, reason=reason) + + +def make_controller( + *, + journey_state: JourneyState, + flight_plan: FlightPlan, + current_waypoint: Waypoint | None = None, +) -> ExecutionController: + project = DummyProject(journey_state=journey_state) + coordinator = JourneyCoordinator(project=project, flight_plan=flight_plan) + coordinator.current_waypoint = current_waypoint + controller = ExecutionController(coordinator) + return controller + + +def test_start_retries_failed_selected() -> None: + flight_plan = FlightPlan() + failed = Waypoint( + id="WP-001", + title="Failed", + objective="Fix it", + status=WaypointStatus.FAILED, + ) + flight_plan.add_waypoint(failed) + + controller = make_controller( + journey_state=JourneyState.FLY_PAUSED, + flight_plan=flight_plan, + current_waypoint=failed, + ) + + directive = controller.start(failed) + + assert directive.action == "execute" + assert controller.execution_state == ExecutionState.RUNNING + assert failed.status == WaypointStatus.PENDING + assert controller.coordinator.project.journey.state == 
JourneyState.FLY_EXECUTING + + +def test_handle_success_transitions_to_land() -> None: + flight_plan = FlightPlan() + waypoint = Waypoint( + id="WP-001", + title="Only", + objective="Complete", + status=WaypointStatus.PENDING, + ) + flight_plan.add_waypoint(waypoint) + + controller = make_controller( + journey_state=JourneyState.FLY_EXECUTING, + flight_plan=flight_plan, + current_waypoint=waypoint, + ) + controller.execution_state = ExecutionState.RUNNING + + directive = controller.handle_execution_result(ExecutionResult.SUCCESS) + + assert directive.action == "land" + assert waypoint.status == WaypointStatus.COMPLETE + assert waypoint.completed_at is not None + assert controller.coordinator.project.journey.state == JourneyState.LAND_REVIEW + + +def test_prepare_intervention_marks_failed() -> None: + flight_plan = FlightPlan() + waypoint = Waypoint( + id="WP-002", + title="Needs help", + objective="Intervene", + status=WaypointStatus.PENDING, + ) + flight_plan.add_waypoint(waypoint) + + controller = make_controller( + journey_state=JourneyState.FLY_EXECUTING, + flight_plan=flight_plan, + current_waypoint=waypoint, + ) + + intervention = Intervention( + type=InterventionType.EXECUTION_ERROR, + waypoint=waypoint, + iteration=1, + max_iterations=10, + error_summary="boom", + ) + + directive = controller.prepare_intervention(intervention) + + assert directive.action == "intervention" + assert controller.execution_state == ExecutionState.INTERVENTION + assert waypoint.status == WaypointStatus.FAILED + assert controller.coordinator.project.journey.state == JourneyState.FLY_INTERVENTION + + +def test_resolve_intervention_skip_selects_next() -> None: + flight_plan = FlightPlan() + first = Waypoint( + id="WP-001", + title="First", + objective="Skip", + status=WaypointStatus.PENDING, + ) + second = Waypoint( + id="WP-002", + title="Second", + objective="Next", + status=WaypointStatus.PENDING, + ) + flight_plan.add_waypoint(first) + flight_plan.add_waypoint(second) + + 
controller = make_controller( + journey_state=JourneyState.FLY_INTERVENTION, + flight_plan=flight_plan, + current_waypoint=first, + ) + + intervention = Intervention( + type=InterventionType.EXECUTION_ERROR, + waypoint=first, + iteration=2, + max_iterations=10, + error_summary="skip", + ) + controller.prepare_intervention(intervention) + + directive = controller.resolve_intervention( + InterventionResult(action=InterventionAction.SKIP) + ) + + assert first.status == WaypointStatus.SKIPPED + assert directive.action == "execute" + assert directive.waypoint == second + assert controller.execution_state == ExecutionState.RUNNING diff --git a/tests/test_fly_screen.py b/tests/test_fly_screen.py index 1d9a168..0f8f1ca 100644 --- a/tests/test_fly_screen.py +++ b/tests/test_fly_screen.py @@ -4,8 +4,9 @@ from waypoints.models.flight_plan import FlightPlan from waypoints.models.waypoint import Waypoint, WaypointStatus -from waypoints.orchestration import JourneyCoordinator -from waypoints.tui.screens.fly import ExecutionState, FlyScreen +from waypoints.orchestration import ExecutionController, JourneyCoordinator +from waypoints.fly.state import ExecutionState +from waypoints.tui.screens.fly import FlyScreen def make_test_screen(flight_plan: FlightPlan) -> FlyScreen: @@ -28,6 +29,7 @@ def get_path(self): project=MockProject(), # type: ignore flight_plan=flight_plan, ) + screen.execution_controller = ExecutionController(screen.coordinator) return screen From 74edefb2c263132167e5e90116444e87d22b78ec Mon Sep 17 00:00:00 2001 From: Kulesh Shanmugasundaram <164083+kulesh@users.noreply.github.com> Date: Thu, 5 Feb 2026 19:35:23 -0500 Subject: [PATCH 3/8] feat(testing): add flight test harness and fixtures --- docs/testing-strategy.md | 29 +++- .../L0-hello-world/expected/min_files.txt | 4 + .../L0-hello-world/expected/smoke_test.sh | 6 + flight-tests/L0-hello-world/input/idea.txt | 1 + .../L1-todo-cli/expected/min_files.txt | 4 + .../L1-todo-cli/expected/smoke_test.sh | 5 + 
flight-tests/L1-todo-cli/input/idea.txt | 1 + .../L2-rest-api/expected/min_files.txt | 4 + .../L2-rest-api/expected/smoke_test.sh | 5 + flight-tests/L2-rest-api/input/idea.txt | 1 + scripts/run_flight_test.py | 138 ++++++++++++++++++ tests/test_flight_test_runner.py | 82 +++++++++++ 12 files changed, 273 insertions(+), 7 deletions(-) create mode 100644 flight-tests/L0-hello-world/expected/min_files.txt create mode 100755 flight-tests/L0-hello-world/expected/smoke_test.sh create mode 100644 flight-tests/L0-hello-world/input/idea.txt create mode 100644 flight-tests/L1-todo-cli/expected/min_files.txt create mode 100755 flight-tests/L1-todo-cli/expected/smoke_test.sh create mode 100644 flight-tests/L1-todo-cli/input/idea.txt create mode 100644 flight-tests/L2-rest-api/expected/min_files.txt create mode 100755 flight-tests/L2-rest-api/expected/smoke_test.sh create mode 100644 flight-tests/L2-rest-api/input/idea.txt create mode 100644 scripts/run_flight_test.py create mode 100644 tests/test_flight_test_runner.py diff --git a/docs/testing-strategy.md b/docs/testing-strategy.md index 097b57e..501589e 100644 --- a/docs/testing-strategy.md +++ b/docs/testing-strategy.md @@ -55,23 +55,38 @@ A set of reference projects with increasing complexity that Waypoints should be flight-tests/ ├── L0-hello-world/ │ ├── input/ -│ │ └── idea.txt # "A CLI that prints hello world" +│ │ └── idea.txt # "A Python CLI named hello_world..." │ ├── expected/ │ │ ├── min_files.txt # Minimum expected files -│ │ └── smoke_test.sh # ./hello should print "Hello" +│ │ └── smoke_test.sh # uv run python -m hello_world │ └── results/ # Generated on each run -│ └── 2026-01-08-run1/ +│ └── 2026-02-05-120000/ ├── L1-todo-cli/ │ ├── input/ │ │ └── idea.txt │ ├── expected/ │ │ ├── min_files.txt -│ │ ├── acceptance_criteria.yaml +│ │ └── smoke_test.sh +│ └── results/ +├── L2-rest-api/ +│ ├── input/ +│ │ └── idea.txt +│ ├── expected/ +│ │ ├── min_files.txt │ │ └── smoke_test.sh │ └── results/ ... 
``` +Run a flight test against an existing generated project: + +```bash +uv run python scripts/run_flight_test.py flight-tests/L0-hello-world --project-path /path/to/generated/project +``` + +Each run writes a `meta.json` summary and optional `smoke_test.log` into the +results directory for auditability and regression tracking. + ### Flight Test Metrics For each flight test run, capture: @@ -294,9 +309,9 @@ def verify_artifacts(project: Project) -> ArtifactReport: ## Implementation Roadmap ### Phase 1: Foundation -1. Create `flight-tests/` directory structure -2. Implement L0-L1 flight tests (hello world, todo CLI) -3. Create smoke test runner +1. Create `flight-tests/` directory structure (done) +2. Implement L0-L2 flight tests (hello world, todo CLI, REST API) (done) +3. Create smoke test runner (`scripts/run_flight_test.py`) (done) ### Phase 2: Quality Gates 4. Implement LLM-as-judge for idea briefs and specs diff --git a/flight-tests/L0-hello-world/expected/min_files.txt b/flight-tests/L0-hello-world/expected/min_files.txt new file mode 100644 index 0000000..0a4e2a8 --- /dev/null +++ b/flight-tests/L0-hello-world/expected/min_files.txt @@ -0,0 +1,4 @@ +README.md +pyproject.toml +src/hello_world/__init__.py +src/hello_world/__main__.py diff --git a/flight-tests/L0-hello-world/expected/smoke_test.sh b/flight-tests/L0-hello-world/expected/smoke_test.sh new file mode 100755 index 0000000..f51e8a4 --- /dev/null +++ b/flight-tests/L0-hello-world/expected/smoke_test.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash +set -euo pipefail + +uv sync --quiet +output=$(uv run python -m hello_world) +echo "$output" | grep -q "Hello, World" diff --git a/flight-tests/L0-hello-world/input/idea.txt b/flight-tests/L0-hello-world/input/idea.txt new file mode 100644 index 0000000..0181c04 --- /dev/null +++ b/flight-tests/L0-hello-world/input/idea.txt @@ -0,0 +1 @@ +Build a Python CLI named hello_world that prints "Hello, World!" when run. 
diff --git a/flight-tests/L1-todo-cli/expected/min_files.txt b/flight-tests/L1-todo-cli/expected/min_files.txt new file mode 100644 index 0000000..7868809 --- /dev/null +++ b/flight-tests/L1-todo-cli/expected/min_files.txt @@ -0,0 +1,4 @@ +README.md +pyproject.toml +src/todo_cli/__init__.py +src/todo_cli/__main__.py diff --git a/flight-tests/L1-todo-cli/expected/smoke_test.sh b/flight-tests/L1-todo-cli/expected/smoke_test.sh new file mode 100755 index 0000000..7ace12a --- /dev/null +++ b/flight-tests/L1-todo-cli/expected/smoke_test.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +set -euo pipefail + +uv sync --quiet +uv run python -m todo_cli --help >/dev/null diff --git a/flight-tests/L1-todo-cli/input/idea.txt b/flight-tests/L1-todo-cli/input/idea.txt new file mode 100644 index 0000000..d3ca1fb --- /dev/null +++ b/flight-tests/L1-todo-cli/input/idea.txt @@ -0,0 +1 @@ +Build a Python CLI named todo_cli with commands add/list/done and store tasks in a local JSON file. diff --git a/flight-tests/L2-rest-api/expected/min_files.txt b/flight-tests/L2-rest-api/expected/min_files.txt new file mode 100644 index 0000000..f7959ec --- /dev/null +++ b/flight-tests/L2-rest-api/expected/min_files.txt @@ -0,0 +1,4 @@ +README.md +pyproject.toml +src/todo_api/__init__.py +src/todo_api/__main__.py diff --git a/flight-tests/L2-rest-api/expected/smoke_test.sh b/flight-tests/L2-rest-api/expected/smoke_test.sh new file mode 100755 index 0000000..f35f5d6 --- /dev/null +++ b/flight-tests/L2-rest-api/expected/smoke_test.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +set -euo pipefail + +uv sync --quiet +uv run python -m todo_api --help >/dev/null diff --git a/flight-tests/L2-rest-api/input/idea.txt b/flight-tests/L2-rest-api/input/idea.txt new file mode 100644 index 0000000..807c028 --- /dev/null +++ b/flight-tests/L2-rest-api/input/idea.txt @@ -0,0 +1 @@ +Build a Python REST API server named todo_api with CRUD endpoints for /todos and a /health endpoint. Use SQLite for storage. 
diff --git a/scripts/run_flight_test.py b/scripts/run_flight_test.py new file mode 100644 index 0000000..b07a859 --- /dev/null +++ b/scripts/run_flight_test.py @@ -0,0 +1,138 @@ +"""Flight test runner for Waypoints reference projects.""" + +from __future__ import annotations + +import argparse +import json +import shutil +import subprocess +from dataclasses import dataclass +from datetime import UTC, datetime +from pathlib import Path + + +@dataclass(frozen=True, slots=True) +class SmokeResult: + """Outcome of smoke test execution.""" + + ran: bool + exit_code: int | None = None + log_path: Path | None = None + + +def _timestamp() -> str: + return datetime.now(UTC).strftime("%Y-%m-%d-%H%M%S") + + +def _read_lines(path: Path) -> list[str]: + return [line.strip() for line in path.read_text().splitlines() if line.strip()] + + +def _run_smoke_test(script: Path, project_path: Path, results_dir: Path) -> SmokeResult: + log_path = results_dir / "smoke_test.log" + result = subprocess.run( + ["bash", str(script)], + cwd=project_path, + capture_output=True, + text=True, + check=False, + ) + log_path.write_text( + f"$ bash {script}\n\n{result.stdout}\n{result.stderr}", + encoding="utf-8", + ) + return SmokeResult(ran=True, exit_code=result.returncode, log_path=log_path) + + +def _write_meta(results_dir: Path, data: dict[str, object]) -> None: + meta_path = results_dir / "meta.json" + meta_path.write_text(json.dumps(data, indent=2), encoding="utf-8") + + +def run(flight_test_dir: Path, project_path: Path, *, skip_smoke: bool) -> int: + input_dir = flight_test_dir / "input" + expected_dir = flight_test_dir / "expected" + + idea_path = input_dir / "idea.txt" + min_files_path = expected_dir / "min_files.txt" + smoke_script = expected_dir / "smoke_test.sh" + + if not idea_path.exists(): + raise FileNotFoundError(f"Missing idea file: {idea_path}") + if not min_files_path.exists(): + raise FileNotFoundError(f"Missing min_files file: {min_files_path}") + if not project_path.exists(): 
+ raise FileNotFoundError(f"Project path not found: {project_path}") + + results_dir = flight_test_dir / "results" / _timestamp() + started_at = datetime.now(UTC) + results_dir.mkdir(parents=True, exist_ok=True) + + shutil.copy(idea_path, results_dir / "idea.txt") + + required_files = _read_lines(min_files_path) + missing = [ + str(path) + for path in required_files + if not (project_path / path).exists() + ] + + smoke_result = SmokeResult(ran=False) + if smoke_script.exists() and not skip_smoke: + smoke_result = _run_smoke_test(smoke_script, project_path, results_dir) + + success = not missing and ( + not smoke_result.ran or smoke_result.exit_code == 0 + ) + + _write_meta( + results_dir, + { + "flight_test": flight_test_dir.name, + "project_path": str(project_path), + "started_at": started_at.isoformat(), + "completed_at": datetime.now(UTC).isoformat(), + "required_files": required_files, + "missing_files": missing, + "smoke_test": { + "ran": smoke_result.ran, + "exit_code": smoke_result.exit_code, + "log_path": str(smoke_result.log_path) if smoke_result.log_path else None, + }, + "success": success, + }, + ) + + summary = "PASS" if success else "FAIL" + print(f"{flight_test_dir.name}: {summary}") + if missing: + print("Missing files:") + for path in missing: + print(f" - {path}") + if smoke_result.ran and smoke_result.exit_code != 0: + print(f"Smoke test failed (exit {smoke_result.exit_code}). 
See {smoke_result.log_path}") + + return 0 if success else 1 + + +def main() -> int: + parser = argparse.ArgumentParser(description="Run a Waypoints flight test.") + parser.add_argument("flight_test", type=Path, help="Path to flight test directory") + parser.add_argument( + "--project-path", + type=Path, + required=True, + help="Path to generated project to validate", + ) + parser.add_argument( + "--skip-smoke", + action="store_true", + help="Skip running smoke_test.sh", + ) + + args = parser.parse_args() + return run(args.flight_test, args.project_path, skip_smoke=args.skip_smoke) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/test_flight_test_runner.py b/tests/test_flight_test_runner.py new file mode 100644 index 0000000..6224258 --- /dev/null +++ b/tests/test_flight_test_runner.py @@ -0,0 +1,82 @@ +"""Tests for the flight test runner.""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +sys.path.append(str(ROOT)) + +from scripts.run_flight_test import run + + +def _setup_flight_test(root: Path, *, min_files: list[str]) -> Path: + flight_test = root / "flight-test" + (flight_test / "input").mkdir(parents=True) + (flight_test / "expected").mkdir(parents=True) + + (flight_test / "input" / "idea.txt").write_text( + "A test idea", encoding="utf-8" + ) + (flight_test / "expected" / "min_files.txt").write_text( + "\n".join(min_files) + "\n", encoding="utf-8" + ) + return flight_test + + +def _latest_results(results_dir: Path) -> Path: + results = sorted(results_dir.iterdir()) + assert results + return results[-1] + + +def test_runner_records_success(tmp_path: Path) -> None: + flight_test = _setup_flight_test(tmp_path, min_files=["README.md"]) + project_path = tmp_path / "project" + project_path.mkdir() + (project_path / "README.md").write_text("ok", encoding="utf-8") + + exit_code = run(flight_test, project_path, skip_smoke=True) + + assert exit_code 
== 0 + results_dir = _latest_results(flight_test / "results") + meta = json.loads((results_dir / "meta.json").read_text(encoding="utf-8")) + assert meta["success"] is True + assert (results_dir / "idea.txt").exists() + + +def test_runner_detects_missing_files(tmp_path: Path) -> None: + flight_test = _setup_flight_test(tmp_path, min_files=["README.md", "missing.txt"]) + project_path = tmp_path / "project" + project_path.mkdir() + (project_path / "README.md").write_text("ok", encoding="utf-8") + + exit_code = run(flight_test, project_path, skip_smoke=True) + + assert exit_code == 1 + results_dir = _latest_results(flight_test / "results") + meta = json.loads((results_dir / "meta.json").read_text(encoding="utf-8")) + assert meta["success"] is False + assert "missing.txt" in meta["missing_files"] + + +def test_runner_writes_smoke_log(tmp_path: Path) -> None: + flight_test = _setup_flight_test(tmp_path, min_files=["README.md"]) + project_path = tmp_path / "project" + project_path.mkdir() + (project_path / "README.md").write_text("ok", encoding="utf-8") + + smoke_script = flight_test / "expected" / "smoke_test.sh" + smoke_script.write_text("#!/usr/bin/env bash\nexit 0\n", encoding="utf-8") + smoke_script.chmod(0o755) + + exit_code = run(flight_test, project_path, skip_smoke=False) + + assert exit_code == 0 + results_dir = _latest_results(flight_test / "results") + meta = json.loads((results_dir / "meta.json").read_text(encoding="utf-8")) + assert meta["smoke_test"]["ran"] is True + log_path = Path(meta["smoke_test"]["log_path"]) + assert log_path.exists() From cfe88e83dd13deefd0ad83dbf2d2efafbb272a5a Mon Sep 17 00:00:00 2001 From: Kulesh Shanmugasundaram <164083+kulesh@users.noreply.github.com> Date: Thu, 5 Feb 2026 19:35:30 -0500 Subject: [PATCH 4/8] docs: add ADRs and link documentation --- README.md | 1 + docs/README.md | 1 + docs/adr/0001-execution-controller.md | 28 +++++++++++++++++++++++++++ docs/adr/0002-flight-test-harness.md | 28 +++++++++++++++++++++++++++ 
docs/adr/0003-execution-report.md | 21 ++++++++++++++++++++ docs/adr/README.md | 9 +++++++++ 6 files changed, 88 insertions(+) create mode 100644 docs/adr/0001-execution-controller.md create mode 100644 docs/adr/0002-flight-test-harness.md create mode 100644 docs/adr/0003-execution-report.md create mode 100644 docs/adr/README.md diff --git a/README.md b/README.md index e9b00b2..5d02844 100644 --- a/README.md +++ b/README.md @@ -167,6 +167,7 @@ docs/ - [docs/journey-state-machine.md](./docs/journey-state-machine.md) - Journey states and transitions - [docs/genspec-format.md](./docs/genspec-format.md) - Genspec format reference - [docs/testing-strategy.md](./docs/testing-strategy.md) - Testing approach +- [docs/adr/README.md](./docs/adr/README.md) - Architecture decision records - [docs/architecture-roadmap.md](./docs/architecture-roadmap.md) - Architecture roadmap ## How It Works diff --git a/docs/README.md b/docs/README.md index dfcf9ad..8a85713 100644 --- a/docs/README.md +++ b/docs/README.md @@ -13,6 +13,7 @@ This folder contains the product, architecture, and testing references for Waypo - [journey-state-machine.md](./journey-state-machine.md) - Journey states and transitions - [architecture-roadmap.md](./architecture-roadmap.md) - Long-term architecture plan - [unix-architecture-plan.md](./unix-architecture-plan.md) - UNIX-style architecture notes +- [adr/README.md](./adr/README.md) - Architecture decision records ## Protocols and Formats diff --git a/docs/adr/0001-execution-controller.md b/docs/adr/0001-execution-controller.md new file mode 100644 index 0000000..6dd43f3 --- /dev/null +++ b/docs/adr/0001-execution-controller.md @@ -0,0 +1,28 @@ +# ADR 0001: Extract Execution Controller + +Date: 2026-02-05 +Status: Accepted + +## Context + +The FLY phase mixed UI, orchestration, execution, and state transitions inside +`src/waypoints/tui/screens/fly.py`. This coupling made the execution flow harder +to test, reason about, and evolve. 
A dedicated orchestration boundary was +needed to align with the “bicycle” philosophy and centralize execution logic. + +## Decision + +Introduce `ExecutionController` in `src/waypoints/orchestration/` to own: +- Execution state transitions +- Waypoint selection and sequencing +- Result handling and intervention flow + +Move `ExecutionState` into `src/waypoints/fly/state.py` to make it a shared +execution concept rather than a UI-local enum. + +## Consequences + +- FLY screen becomes thinner and more focused on UI concerns. +- Execution logic is testable in isolation with unit tests. +- Additional orchestration features (rollback, richer reports) have a clear + home without bloating the UI layer. diff --git a/docs/adr/0002-flight-test-harness.md b/docs/adr/0002-flight-test-harness.md new file mode 100644 index 0000000..3f3f461 --- /dev/null +++ b/docs/adr/0002-flight-test-harness.md @@ -0,0 +1,28 @@ +# ADR 0002: Flight Test Harness + +Date: 2026-02-05 +Status: Accepted + +## Context + +The testing strategy defined flight tests (L0–L5) but lacked operational tooling. +To improve iteration discipline, we needed a repeatable harness that records +results and validates generated projects against minimal expectations. + +## Decision + +Add `scripts/run_flight_test.py` to execute a flight test against an existing +project directory. The runner: +- Creates timestamped results directories +- Validates minimum expected files +- Runs optional smoke tests +- Writes a `meta.json` summary + +Seed L0–L2 fixtures under `flight-tests/` to make the harness immediately usable. + +## Consequences + +- Provides a repeatable baseline for flight test validation. +- Creates an audit trail for regressions and improvements. +- Keeps generation concerns decoupled from validation so the harness is usable + before full automation is in place. 
diff --git a/docs/adr/0003-execution-report.md b/docs/adr/0003-execution-report.md new file mode 100644 index 0000000..a062847 --- /dev/null +++ b/docs/adr/0003-execution-report.md @@ -0,0 +1,21 @@ +# ADR 0003: Execution Report Model + +Date: 2026-02-05 +Status: Accepted + +## Context + +Execution outcomes were logged but lacked a structured report for summarizing +waypoint attempts. This made it hard to aggregate metrics or build future +observability features on top of execution artifacts. + +## Decision + +Introduce `ExecutionReport` as a structured summary of a waypoint execution +attempt, capturing result, timestamps, and completion data. + +## Consequences + +- Establishes a durable schema for execution summaries. +- Enables future aggregation and reporting without parsing logs. +- Keeps the report model independent of UI layers. diff --git a/docs/adr/README.md b/docs/adr/README.md new file mode 100644 index 0000000..bc7727f --- /dev/null +++ b/docs/adr/README.md @@ -0,0 +1,9 @@ +# Architecture Decision Records + +This directory captures the key architectural decisions for Waypoints. 
+ +## Index + +- [ADR 0001: Extract Execution Controller](./0001-execution-controller.md) +- [ADR 0002: Flight Test Harness](./0002-flight-test-harness.md) +- [ADR 0003: Execution Report Model](./0003-execution-report.md) From 7f0d477fc97a086bb29f779c44dc9862c4f1e815 Mon Sep 17 00:00:00 2001 From: Kulesh Shanmugasundaram <164083+kulesh@users.noreply.github.com> Date: Thu, 5 Feb 2026 19:35:39 -0500 Subject: [PATCH 5/8] feat(reliability): add project status and rollback --- src/waypoints/llm/prompts/fly.py | 3 +-- src/waypoints/models/__init__.py | 3 ++- src/waypoints/models/project.py | 17 +++++++++++++++++ src/waypoints/orchestration/coordinator.py | 20 ++++++++++++++++---- src/waypoints/tui/screens/land.py | 4 +++- tests/test_models.py | 22 +++++++++++++++++++++- 6 files changed, 60 insertions(+), 9 deletions(-) diff --git a/src/waypoints/llm/prompts/fly.py b/src/waypoints/llm/prompts/fly.py index 2a50812..fd74d3b 100644 --- a/src/waypoints/llm/prompts/fly.py +++ b/src/waypoints/llm/prompts/fly.py @@ -47,8 +47,7 @@ def build_execution_prompt( {criteria_list} {resolution_notes} -## Product Spec Summary -## TODO: We should use proper summary of the spec not prefix! +## Product Spec Excerpt (first 2000 chars) {spec[:2000]}{"..." 
if len(spec) > 2000 else ""} ## Working Directory diff --git a/src/waypoints/models/__init__.py b/src/waypoints/models/__init__.py index 0d35aba..ae1127e 100644 --- a/src/waypoints/models/__init__.py +++ b/src/waypoints/models/__init__.py @@ -12,7 +12,7 @@ Journey, JourneyState, ) -from .project import Project, slugify +from .project import Project, ProjectStatus, slugify from .session import SessionReader, SessionWriter from .state_manager import JourneyStateManager, StateGuardError from .waypoint import Waypoint, WaypointStatus @@ -29,6 +29,7 @@ "MessageRole", "PHASE_TO_STATE", "Project", + "ProjectStatus", "JourneyStateManager", "RECOVERABLE_STATES", "RECOVERY_MAP", diff --git a/src/waypoints/models/project.py b/src/waypoints/models/project.py index a192d35..3bcb2f6 100644 --- a/src/waypoints/models/project.py +++ b/src/waypoints/models/project.py @@ -6,6 +6,7 @@ import re import shutil from dataclasses import dataclass, field +from enum import Enum from datetime import UTC, datetime from pathlib import Path from typing import TYPE_CHECKING, Any @@ -36,6 +37,13 @@ def slugify(name: str) -> str: return slug or "unnamed-project" +class ProjectStatus(Enum): + """Lifecycle status for a project.""" + + ACTIVE = "active" + CLOSED = "closed" + + @dataclass class Project: """A waypoints project containing sessions and documents.""" @@ -46,6 +54,7 @@ class Project: updated_at: datetime initial_idea: str = "" summary: str = "" # LLM-generated project summary + status: ProjectStatus = ProjectStatus.ACTIVE journey: Journey | None = field(default=None, repr=False) @classmethod @@ -107,6 +116,7 @@ def to_dict(self) -> dict[str, Any]: "updated_at": self.updated_at.isoformat(), "initial_idea": self.initial_idea, "summary": self.summary, + "status": self.status.value, } if self.journey is not None: data["journey"] = self.journey.to_dict() @@ -121,6 +131,12 @@ def from_dict(cls, data: dict[str, Any]) -> "Project": if "journey" in data: journey = Journey.from_dict(data["journey"]) 
+ status_value = data.get("status", ProjectStatus.ACTIVE.value) + try: + status = ProjectStatus(status_value) + except ValueError: + status = ProjectStatus.ACTIVE + return cls( name=data["name"], slug=data["slug"], @@ -128,6 +144,7 @@ def from_dict(cls, data: dict[str, Any]) -> "Project": updated_at=datetime.fromisoformat(data["updated_at"]), initial_idea=data.get("initial_idea", ""), summary=data.get("summary", ""), + status=status, journey=journey, ) diff --git a/src/waypoints/orchestration/coordinator.py b/src/waypoints/orchestration/coordinator.py index 7f38407..bde4cf8 100644 --- a/src/waypoints/orchestration/coordinator.py +++ b/src/waypoints/orchestration/coordinator.py @@ -461,10 +461,22 @@ def handle_intervention( return NextAction(action="complete") elif action == InterventionAction.ROLLBACK: - # Rollback to tag and pause - # TODO: Implement rollback when GitService supports it - # if self.git and rollback_tag: - # self.git.rollback_to_tag(rollback_tag) + if not rollback_tag: + return NextAction(action="pause", message="Rollback tag required") + + if self.git: + result = self.git.reset_hard(rollback_tag) + if not result.success: + return NextAction( + action="pause", + message=f"Rollback failed: {result.message}", + ) + else: + return NextAction( + action="pause", + message="Rollback requested but git is not configured", + ) + waypoint.status = WaypointStatus.PENDING self.save_flight_plan() return NextAction(action="pause", message=f"Rolled back to {rollback_tag}") diff --git a/src/waypoints/tui/screens/land.py b/src/waypoints/tui/screens/land.py index d3691df..bb6c0de 100644 --- a/src/waypoints/tui/screens/land.py +++ b/src/waypoints/tui/screens/land.py @@ -25,6 +25,7 @@ from waypoints.git.service import GitService from waypoints.llm.metrics import MetricsCollector from waypoints.models import JourneyState, Project +from waypoints.models.project import ProjectStatus from waypoints.models.flight_plan import FlightPlan, FlightPlanReader from 
waypoints.models.waypoint import WaypointStatus from waypoints.orchestration import JourneyCoordinator @@ -995,7 +996,8 @@ def action_new_iteration(self) -> None: def action_close_project(self) -> None: """Mark project as closed.""" - # TODO: Add status field to Project model + self.project.status = ProjectStatus.CLOSED + self.project.save() self.notify(f"Project '{self.project.name}' marked as closed") from waypoints.tui.screens.project_selection import ProjectSelectionScreen diff --git a/tests/test_models.py b/tests/test_models.py index b109632..18fd69d 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -7,7 +7,7 @@ import pytest from waypoints.models.flight_plan import FlightPlan, FlightPlanReader, FlightPlanWriter -from waypoints.models.project import Project, slugify +from waypoints.models.project import Project, ProjectStatus, slugify from waypoints.models.waypoint import Waypoint, WaypointStatus @@ -664,6 +664,26 @@ def test_create_project(self, temp_projects_dir: Path) -> None: assert project.journey is not None assert (temp_projects_dir / "test-project" / "project.json").exists() + def test_project_status_defaults_active(self, temp_projects_dir: Path) -> None: + """New projects should default to ACTIVE status.""" + project = Project.create("Status Test") + + assert project.status == ProjectStatus.ACTIVE + assert project.to_dict()["status"] == ProjectStatus.ACTIVE.value + + def test_project_status_from_dict(self) -> None: + """Deserialize project status from dictionary.""" + data = { + "name": "Closed Project", + "slug": "closed-project", + "created_at": "2026-01-10T10:00:00", + "updated_at": "2026-01-10T11:00:00", + "status": "closed", + } + project = Project.from_dict(data) + + assert project.status == ProjectStatus.CLOSED + def test_project_directories_created(self, temp_projects_dir: Path) -> None: """Project directories are created on creation.""" project = Project.create("Dir Test") From cb0925889026251d3a3a2dbf1bea219391ac197d Mon Sep 17 
00:00:00 2001 From: Kulesh Shanmugasundaram <164083+kulesh@users.noreply.github.com> Date: Thu, 5 Feb 2026 19:54:30 -0500 Subject: [PATCH 6/8] chore: fix lint and typing issues --- scripts/run_flight_test.py | 12 ++++++++-- src/waypoints/models/project.py | 2 +- .../orchestration/execution_controller.py | 21 +++++++++++++----- src/waypoints/tui/screens/land.py | 2 +- tests/test_execution_controller.py | 13 +++++++---- tests/test_flight_test_runner.py | 22 ++++++++++++++----- tests/test_fly_screen.py | 2 +- 7 files changed, 54 insertions(+), 20 deletions(-) diff --git a/scripts/run_flight_test.py b/scripts/run_flight_test.py index b07a859..f13cc16 100644 --- a/scripts/run_flight_test.py +++ b/scripts/run_flight_test.py @@ -85,6 +85,10 @@ def run(flight_test_dir: Path, project_path: Path, *, skip_smoke: bool) -> int: not smoke_result.ran or smoke_result.exit_code == 0 ) + log_path_value = ( + str(smoke_result.log_path) if smoke_result.log_path else None + ) + _write_meta( results_dir, { @@ -97,7 +101,7 @@ def run(flight_test_dir: Path, project_path: Path, *, skip_smoke: bool) -> int: "smoke_test": { "ran": smoke_result.ran, "exit_code": smoke_result.exit_code, - "log_path": str(smoke_result.log_path) if smoke_result.log_path else None, + "log_path": log_path_value, }, "success": success, }, @@ -110,7 +114,11 @@ def run(flight_test_dir: Path, project_path: Path, *, skip_smoke: bool) -> int: for path in missing: print(f" - {path}") if smoke_result.ran and smoke_result.exit_code != 0: - print(f"Smoke test failed (exit {smoke_result.exit_code}). See {smoke_result.log_path}") + message = ( + f"Smoke test failed (exit {smoke_result.exit_code}). 
" + f"See {smoke_result.log_path}" + ) + print(message) return 0 if success else 1 diff --git a/src/waypoints/models/project.py b/src/waypoints/models/project.py index 3bcb2f6..a132442 100644 --- a/src/waypoints/models/project.py +++ b/src/waypoints/models/project.py @@ -6,8 +6,8 @@ import re import shutil from dataclasses import dataclass, field -from enum import Enum from datetime import UTC, datetime +from enum import Enum from pathlib import Path from typing import TYPE_CHECKING, Any diff --git a/src/waypoints/orchestration/execution_controller.py b/src/waypoints/orchestration/execution_controller.py index c0be62a..7b96bff 100644 --- a/src/waypoints/orchestration/execution_controller.py +++ b/src/waypoints/orchestration/execution_controller.py @@ -14,7 +14,11 @@ ExecutionResult, WaypointExecutor, ) -from waypoints.fly.intervention import Intervention, InterventionAction, InterventionResult +from waypoints.fly.intervention import ( + Intervention, + InterventionAction, + InterventionResult, +) from waypoints.fly.state import ExecutionState from waypoints.models import JourneyState, Waypoint, WaypointStatus from waypoints.orchestration.coordinator import JourneyCoordinator @@ -61,7 +65,9 @@ def initialize(self) -> None: self.coordinator.reset_stale_in_progress() self.select_next_waypoint(include_in_progress=True) - def select_next_waypoint(self, include_in_progress: bool = False) -> Waypoint | None: + def select_next_waypoint( + self, include_in_progress: bool = False + ) -> Waypoint | None: """Select the next eligible waypoint and update execution state.""" wp = self.coordinator.select_next_waypoint(include_failed=include_in_progress) if wp: @@ -111,7 +117,9 @@ def start(self, selected_waypoint: Waypoint | None) -> ExecutionDirective: action="noop", message="Select a failed waypoint and press 'r' to retry", ) - return ExecutionDirective(action="noop", message="No waypoints to resume") + return ExecutionDirective( + action="noop", message="No waypoints to resume" 
+ ) self._transition_to_executing() self.execution_state = ExecutionState.RUNNING return ExecutionDirective(action="execute", waypoint=self.current_waypoint) @@ -198,7 +206,8 @@ def handle_execution_result( waypoint=next_wp, completed=waypoint, ) - if self.execution_state == ExecutionState.DONE: + status = self.coordinator.get_completion_status() + if status.all_complete: self.coordinator.transition(JourneyState.LAND_REVIEW) return ExecutionDirective(action="land", completed=waypoint) return ExecutionDirective(action="pause", completed=waypoint) @@ -252,7 +261,9 @@ def resolve_intervention( return ExecutionDirective(action="noop", message="Intervention cancelled") if not self._current_intervention: - return ExecutionDirective(action="noop", message="No intervention to resolve") + return ExecutionDirective( + action="noop", message="No intervention to resolve" + ) waypoint = self._current_intervention.waypoint diff --git a/src/waypoints/tui/screens/land.py b/src/waypoints/tui/screens/land.py index bb6c0de..e4908ce 100644 --- a/src/waypoints/tui/screens/land.py +++ b/src/waypoints/tui/screens/land.py @@ -25,8 +25,8 @@ from waypoints.git.service import GitService from waypoints.llm.metrics import MetricsCollector from waypoints.models import JourneyState, Project -from waypoints.models.project import ProjectStatus from waypoints.models.flight_plan import FlightPlan, FlightPlanReader +from waypoints.models.project import ProjectStatus from waypoints.models.waypoint import WaypointStatus from waypoints.orchestration import JourneyCoordinator from waypoints.tui.utils import format_duration, format_token_count diff --git a/tests/test_execution_controller.py b/tests/test_execution_controller.py index 6e5c168..8820081 100644 --- a/tests/test_execution_controller.py +++ b/tests/test_execution_controller.py @@ -4,10 +4,13 @@ from pathlib import Path -import pytest - from waypoints.fly.executor import ExecutionResult -from waypoints.fly.intervention import Intervention, 
InterventionAction, InterventionResult, InterventionType +from waypoints.fly.intervention import ( + Intervention, + InterventionAction, + InterventionResult, + InterventionType, +) from waypoints.fly.state import ExecutionState from waypoints.models.flight_plan import FlightPlan from waypoints.models.journey import Journey, JourneyState @@ -28,7 +31,9 @@ def get_path(self) -> Path: def save(self) -> None: return None - def transition_journey(self, target: JourneyState, reason: str | None = None) -> None: + def transition_journey( + self, target: JourneyState, reason: str | None = None + ) -> None: if self.journey is None: self.journey = Journey.new(self.slug) self.journey = self.journey.transition(target, reason=reason) diff --git a/tests/test_flight_test_runner.py b/tests/test_flight_test_runner.py index 6224258..cd9d5c9 100644 --- a/tests/test_flight_test_runner.py +++ b/tests/test_flight_test_runner.py @@ -2,14 +2,24 @@ from __future__ import annotations +import importlib.util import json -import sys from pathlib import Path ROOT = Path(__file__).resolve().parents[1] -sys.path.append(str(ROOT)) -from scripts.run_flight_test import run + +def _load_run() -> callable: + module_path = ROOT / "scripts" / "run_flight_test.py" + spec = importlib.util.spec_from_file_location("run_flight_test", module_path) + if spec is None or spec.loader is None: + raise RuntimeError("Unable to load flight test runner") + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module.run + + +RUN = _load_run() def _setup_flight_test(root: Path, *, min_files: list[str]) -> Path: @@ -38,7 +48,7 @@ def test_runner_records_success(tmp_path: Path) -> None: project_path.mkdir() (project_path / "README.md").write_text("ok", encoding="utf-8") - exit_code = run(flight_test, project_path, skip_smoke=True) + exit_code = RUN(flight_test, project_path, skip_smoke=True) assert exit_code == 0 results_dir = _latest_results(flight_test / "results") @@ -53,7 +63,7 @@ 
def test_runner_detects_missing_files(tmp_path: Path) -> None: project_path.mkdir() (project_path / "README.md").write_text("ok", encoding="utf-8") - exit_code = run(flight_test, project_path, skip_smoke=True) + exit_code = RUN(flight_test, project_path, skip_smoke=True) assert exit_code == 1 results_dir = _latest_results(flight_test / "results") @@ -72,7 +82,7 @@ def test_runner_writes_smoke_log(tmp_path: Path) -> None: smoke_script.write_text("#!/usr/bin/env bash\nexit 0\n", encoding="utf-8") smoke_script.chmod(0o755) - exit_code = run(flight_test, project_path, skip_smoke=False) + exit_code = RUN(flight_test, project_path, skip_smoke=False) assert exit_code == 0 results_dir = _latest_results(flight_test / "results") diff --git a/tests/test_fly_screen.py b/tests/test_fly_screen.py index 0f8f1ca..79ab2f8 100644 --- a/tests/test_fly_screen.py +++ b/tests/test_fly_screen.py @@ -2,10 +2,10 @@ import pytest +from waypoints.fly.state import ExecutionState from waypoints.models.flight_plan import FlightPlan from waypoints.models.waypoint import Waypoint, WaypointStatus from waypoints.orchestration import ExecutionController, JourneyCoordinator -from waypoints.fly.state import ExecutionState from waypoints.tui.screens.fly import FlyScreen From 09f79459a51b8a0fbf409048cbe68a2028be7a5c Mon Sep 17 00:00:00 2001 From: Kulesh Shanmugasundaram <164083+kulesh@users.noreply.github.com> Date: Thu, 5 Feb 2026 20:08:05 -0500 Subject: [PATCH 7/8] refactor(fly): delegate commit handling to coordinator --- src/waypoints/orchestration/__init__.py | 4 + src/waypoints/orchestration/coordinator.py | 139 ++++++++++++++++----- src/waypoints/orchestration/types.py | 20 +++ src/waypoints/tui/screens/fly.py | 107 +++------------- 4 files changed, 149 insertions(+), 121 deletions(-) diff --git a/src/waypoints/orchestration/__init__.py b/src/waypoints/orchestration/__init__.py index 319ab02..ad9c5cc 100644 --- a/src/waypoints/orchestration/__init__.py +++ 
b/src/waypoints/orchestration/__init__.py @@ -19,6 +19,8 @@ ) from waypoints.orchestration.types import ( ChunkCallback, + CommitNotice, + CommitOutcome, CompletionStatus, NextAction, ProgressCallback, @@ -28,6 +30,8 @@ __all__ = [ "JourneyCoordinator", + "CommitNotice", + "CommitOutcome", "ExecutionController", "ExecutionDirective", "NextAction", diff --git a/src/waypoints/orchestration/coordinator.py b/src/waypoints/orchestration/coordinator.py index bde4cf8..60a427f 100644 --- a/src/waypoints/orchestration/coordinator.py +++ b/src/waypoints/orchestration/coordinator.py @@ -58,6 +58,8 @@ ) from waypoints.orchestration.types import ( ChunkCallback, + CommitNotice, + CommitOutcome, CompletionStatus, NextAction, ProgressCallback, @@ -360,7 +362,7 @@ def handle_execution_result( # Commit if git is available if self.git: - self._commit_waypoint(waypoint) + self.commit_waypoint(waypoint) # Find next waypoint next_wp = self.select_next_waypoint() @@ -542,47 +544,116 @@ def get_completion_status(self) -> CompletionStatus: # ─── FLY Phase: Git Integration ────────────────────────────────────── - def _commit_waypoint(self, waypoint: Waypoint) -> bool: - """Commit waypoint changes to git. - - Validates receipt exists before committing. + def commit_waypoint(self, waypoint: Waypoint) -> CommitOutcome: + """Commit waypoint completion if receipt is valid. Returns: - True if commit successful, False otherwise + CommitOutcome describing the commit result. 
""" - if self.git is None: - return False + from waypoints.git import GitConfig, GitService, ReceiptValidator + + config = GitConfig.load(self.project.slug) + if not config.auto_commit: + logger.debug("Auto-commit disabled, skipping") + return CommitOutcome(status="skipped", reason="auto_commit_disabled") + + git = self.git or GitService(self.project.get_path()) + notices: list[CommitNotice] = [] + + if not git.is_git_repo(): + if config.auto_init: + init_result = git.init_repo() + if init_result.success: + notices.append( + CommitNotice( + message="Initialized git repository", + severity="info", + ) + ) + else: + logger.warning("Failed to init git repo: %s", init_result.message) + return CommitOutcome( + status="skipped", + reason="auto_init_failed", + notices=tuple(notices), + ) + else: + logger.debug("Not a git repo and auto-init disabled") + return CommitOutcome(status="skipped", reason="auto_init_disabled") + + if config.run_checklist: + validator = ReceiptValidator() + receipt_path = validator.find_latest_receipt(self.project, waypoint.id) + + if receipt_path: + validation_result = validator.validate(receipt_path) + if not validation_result.valid: + logger.warning( + "Skipping commit - receipt invalid: %s", + validation_result.message, + ) + notices.append( + CommitNotice( + message=f"Skipping commit: {validation_result.message}", + severity="warning", + ) + ) + return CommitOutcome( + status="skipped", + reason="receipt_invalid", + notices=tuple(notices), + ) + logger.info("Receipt validated: %s", receipt_path) + else: + logger.warning("Skipping commit - no receipt found for %s", waypoint.id) + notices.append( + CommitNotice( + message=f"Skipping commit: no receipt for {waypoint.id}", + severity="warning", + ) + ) + return CommitOutcome( + status="skipped", + reason="receipt_missing", + notices=tuple(notices), + ) - from waypoints.git.receipt import ReceiptValidator + git.stage_project_files(self.project.slug) - validator = ReceiptValidator() - receipt_path 
= validator.find_latest_receipt(self.project, waypoint.id) + commit_msg = f"feat({self.project.slug}): Complete {waypoint.title}" + result = git.commit(commit_msg) - if receipt_path is None: - logger.warning("No receipt found for %s, skipping commit", waypoint.id) - return False + if result.success: + if "Nothing to commit" in result.message: + logger.info("Nothing to commit for %s", waypoint.id) + return CommitOutcome(status="skipped", reason="nothing_to_commit") - result = validator.validate(receipt_path) - if not result.valid: - logger.warning("Receipt invalid for %s: %s", waypoint.id, result.message) - return False + notices.append( + CommitNotice(message=f"Committed: {waypoint.id}", severity="info") + ) + commit_hash = git.get_head_commit() or "" - # Create commit - try: - # Stage all changed files - self.git.stage_files(".") - commit_result = self.git.commit(f"feat({waypoint.id}): {waypoint.title}") - if not commit_result.success: - logger.warning( - "Commit failed for %s: %s", waypoint.id, commit_result.message - ) - return False - self.git.tag(f"waypoint/{waypoint.id}") - logger.info("Committed waypoint: %s", waypoint.id) - return True - except Exception as e: - logger.error("Failed to commit waypoint %s: %s", waypoint.id, e) - return False + if config.create_waypoint_tags: + tag_name = f"{self.project.slug}/{waypoint.id}" + git.tag(tag_name, f"Completed waypoint: {waypoint.title}") + + return CommitOutcome( + status="success", + commit_hash=commit_hash, + commit_msg=commit_msg, + notices=tuple(notices), + ) + + logger.error("Commit failed: %s", result.message) + notices.append( + CommitNotice(message=f"Commit failed: {result.message}", severity="error") + ) + return CommitOutcome( + status="failure", + commit_msg=commit_msg, + message=result.message, + notices=tuple(notices), + ) # ─── CHART Phase: Flight Plan Generation ───────────────────────────── diff --git a/src/waypoints/orchestration/types.py b/src/waypoints/orchestration/types.py index 
a80f2e4..876d612 100644 --- a/src/waypoints/orchestration/types.py +++ b/src/waypoints/orchestration/types.py @@ -38,6 +38,26 @@ class ProgressUpdate: # --- Result Types --- +@dataclass(frozen=True, slots=True) +class CommitNotice: + """User-facing notice from git commit handling.""" + + message: str + severity: Literal["info", "warning", "error"] = "info" + + +@dataclass(frozen=True, slots=True) +class CommitOutcome: + """Outcome of a git commit attempt for a waypoint.""" + + status: Literal["skipped", "success", "failure"] + commit_hash: str | None = None + commit_msg: str | None = None + message: str | None = None + reason: str | None = None + notices: tuple[CommitNotice, ...] = () + + @dataclass class NextAction: """What should happen next after an operation. diff --git a/src/waypoints/tui/screens/fly.py b/src/waypoints/tui/screens/fly.py index 62dcfbe..b750e4d 100644 --- a/src/waypoints/tui/screens/fly.py +++ b/src/waypoints/tui/screens/fly.py @@ -39,7 +39,7 @@ InterventionResult, ) from waypoints.fly.state import ExecutionState -from waypoints.git import GitConfig, GitService, ReceiptValidator +from waypoints.git import GitService, ReceiptValidator from waypoints.models import JourneyState, Project from waypoints.models.flight_plan import FlightPlan from waypoints.models.waypoint import Waypoint, WaypointStatus @@ -1910,7 +1910,25 @@ def _handle_execution_result(self, result: ExecutionResult | None) -> None: self._log_verification_summary(directive.completed, log) # Commit waypoint completion (validates receipt first) - self._commit_waypoint(directive.completed) + commit_outcome = self.coordinator.commit_waypoint(directive.completed) + for notice in commit_outcome.notices: + if notice.severity == "info": + self.notify(notice.message) + else: + self.notify(notice.message, severity=notice.severity) + if self._executor and self._executor._log_writer: + if commit_outcome.status == "success": + self._executor._log_writer.log_git_commit( + True, + 
commit_outcome.commit_hash or "", + commit_outcome.commit_msg or "", + ) + elif commit_outcome.status == "failure": + self._executor._log_writer.log_git_commit( + False, + "", + commit_outcome.message or "Commit failed", + ) # Reset live criteria tracking for next waypoint self._live_criteria_completed = set() @@ -2119,91 +2137,6 @@ def _log_verification_summary(self, waypoint: Waypoint, log: ExecutionLog) -> No else: log.write_log("[yellow]⚠ No receipt found[/]") - def _commit_waypoint(self, waypoint: Waypoint) -> None: - """Commit waypoint completion if receipt is valid. - - Implements the "trust but verify" pattern: - - Model already produced receipt during execution - - We validate receipt exists and is well-formed - - If valid, commit the changes - - If invalid, skip commit but don't block - """ - project_path = self.project.get_path() - config = GitConfig.load(self.project.slug) - - if not config.auto_commit: - logger.debug("Auto-commit disabled, skipping") - return - - git = GitService(project_path) - - # Auto-init if needed - if not git.is_git_repo(): - if config.auto_init: - init_result = git.init_repo() - if init_result.success: - self.notify("Initialized git repository") - else: - logger.warning("Failed to init git repo: %s", init_result.message) - return - else: - logger.debug("Not a git repo and auto-init disabled") - return - - # Validate receipt (the "dog" checking the "pilot's" work) - if config.run_checklist: - validator = ReceiptValidator() - receipt_path = validator.find_latest_receipt(self.project, waypoint.id) - - if receipt_path: - validation_result = validator.validate(receipt_path) - if not validation_result.valid: - logger.warning( - "Skipping commit - receipt invalid: %s", - validation_result.message, - ) - self.notify( - f"Skipping commit: {validation_result.message}", - severity="warning", - ) - return - logger.info("Receipt validated: %s", receipt_path) - else: - logger.warning("Skipping commit - no receipt found for %s", waypoint.id) - 
self.notify( - f"Skipping commit: no receipt for {waypoint.id}", severity="warning" - ) - return - - # Stage project files and commit - git.stage_project_files(self.project.slug) - - # Build commit message - commit_msg = f"feat({self.project.slug}): Complete {waypoint.title}" - result = git.commit(commit_msg) - - if result.success: - if "Nothing to commit" not in result.message: - logger.info("Committed: %s", commit_msg) - self.notify(f"Committed: {waypoint.id}") - # Log successful git commit - if self._executor and self._executor._log_writer: - commit_hash = git.get_head_commit() or "" - self._executor._log_writer.log_git_commit( - True, commit_hash, commit_msg - ) - - # Create tag for waypoint if configured - if config.create_waypoint_tags: - tag_name = f"{self.project.slug}/{waypoint.id}" - git.tag(tag_name, f"Completed waypoint: {waypoint.title}") - else: - logger.error("Commit failed: %s", result.message) - self.notify(f"Commit failed: {result.message}", severity="error") - # Log failed git commit - if self._executor and self._executor._log_writer: - self._executor._log_writer.log_git_commit(False, "", result.message) - def _check_parent_completion(self, completed_waypoint: Waypoint) -> None: """Check if parent epic is ready for execution. 
From a2f181501b13f8cf4c746de84c7833385c152b4d Mon Sep 17 00:00:00 2001 From: Kulesh Shanmugasundaram <164083+kulesh@users.noreply.github.com> Date: Thu, 5 Feb 2026 21:02:42 -0500 Subject: [PATCH 8/8] refactor(fly): move rollback and verification into orchestration --- src/waypoints/orchestration/__init__.py | 4 + src/waypoints/orchestration/coordinator.py | 70 +++++++++--- .../orchestration/execution_controller.py | 39 ++++++- src/waypoints/orchestration/types.py | 22 ++++ src/waypoints/tui/screens/fly.py | 103 ++++++------------ 5 files changed, 152 insertions(+), 86 deletions(-) diff --git a/src/waypoints/orchestration/__init__.py b/src/waypoints/orchestration/__init__.py index ad9c5cc..2149e07 100644 --- a/src/waypoints/orchestration/__init__.py +++ b/src/waypoints/orchestration/__init__.py @@ -25,7 +25,9 @@ NextAction, ProgressCallback, ProgressUpdate, + RollbackOutcome, TextStream, + VerificationSummary, ) __all__ = [ @@ -38,6 +40,8 @@ "CompletionStatus", "ProgressCallback", "ProgressUpdate", + "RollbackOutcome", "ChunkCallback", "TextStream", + "VerificationSummary", ] diff --git a/src/waypoints/orchestration/coordinator.py b/src/waypoints/orchestration/coordinator.py index 60a427f..49510fd 100644 --- a/src/waypoints/orchestration/coordinator.py +++ b/src/waypoints/orchestration/coordinator.py @@ -64,6 +64,8 @@ NextAction, ProgressCallback, ProgressUpdate, + RollbackOutcome, + VerificationSummary, ) if TYPE_CHECKING: @@ -422,6 +424,35 @@ def check_parent_completion(self, waypoint: Waypoint) -> None: # ─── FLY Phase: Intervention Handling ──────────────────────────────── + def rollback_to_tag(self, tag: str | None) -> RollbackOutcome: + """Rollback git state to a tag and reload the flight plan.""" + if not tag: + return RollbackOutcome(status="failure", message="Rollback tag required") + + from waypoints.git import GitService + + git = self.git or GitService(self.project.get_path()) + if not git.is_git_repo(): + return RollbackOutcome( + status="failure", + 
message="Not a git repository - cannot rollback", + ) + + result = git.reset_hard(tag) + if not result.success: + return RollbackOutcome( + status="failure", + message=f"Rollback failed: {result.message}", + ) + + self._flight_plan = self._load_flight_plan() + self._current_waypoint = None + return RollbackOutcome( + status="success", + message=f"Rolled back to {tag}", + flight_plan=self._flight_plan, + ) + def handle_intervention( self, intervention: Intervention, @@ -466,22 +497,10 @@ def handle_intervention( if not rollback_tag: return NextAction(action="pause", message="Rollback tag required") - if self.git: - result = self.git.reset_hard(rollback_tag) - if not result.success: - return NextAction( - action="pause", - message=f"Rollback failed: {result.message}", - ) - else: - return NextAction( - action="pause", - message="Rollback requested but git is not configured", - ) - - waypoint.status = WaypointStatus.PENDING - self.save_flight_plan() - return NextAction(action="pause", message=f"Rolled back to {rollback_tag}") + outcome = self.rollback_to_tag(rollback_tag) + if outcome.status == "failure": + return NextAction(action="pause", message=outcome.message) + return NextAction(action="pause", message=outcome.message) elif action == InterventionAction.ABORT: # Mark failed and stop @@ -544,6 +563,25 @@ def get_completion_status(self) -> CompletionStatus: # ─── FLY Phase: Git Integration ────────────────────────────────────── + def build_verification_summary( + self, waypoint: Waypoint, completed_criteria: set[int] + ) -> VerificationSummary: + """Build verification summary for a waypoint.""" + from waypoints.git.receipt import ReceiptValidator + + validator = ReceiptValidator() + receipt_path = validator.find_latest_receipt(self.project, waypoint.id) + receipt_validation = None + if receipt_path: + receipt_validation = validator.validate(receipt_path) + + return VerificationSummary( + total_criteria=len(waypoint.acceptance_criteria), + 
completed_criteria=frozenset(completed_criteria), + receipt_path=receipt_path, + receipt_validation=receipt_validation, + ) + def commit_waypoint(self, waypoint: Waypoint) -> CommitOutcome: """Commit waypoint completion if receipt is valid. diff --git a/src/waypoints/orchestration/execution_controller.py b/src/waypoints/orchestration/execution_controller.py index 7b96bff..68c053e 100644 --- a/src/waypoints/orchestration/execution_controller.py +++ b/src/waypoints/orchestration/execution_controller.py @@ -37,6 +37,7 @@ class ExecutionDirective: waypoint: Waypoint | None = None message: str | None = None completed: Waypoint | None = None + reload_flight_plan: bool = False class ExecutionController: @@ -241,6 +242,28 @@ def handle_execution_result( message="Waypoint execution failed", ) + def request_land(self) -> ExecutionDirective: + """Request transition to LAND, returning a directive.""" + journey = self.coordinator.project.journey + if journey and journey.state == JourneyState.LAND_REVIEW: + return ExecutionDirective(action="land") + + status = self.coordinator.get_completion_status() + if status.all_complete: + self.coordinator.transition(JourneyState.LAND_REVIEW) + return ExecutionDirective(action="land") + + if self.execution_state == ExecutionState.DONE: + return ExecutionDirective( + action="pause", + message="Cannot land yet - some waypoints are blocked or failed", + ) + + return ExecutionDirective( + action="pause", + message="Cannot land yet - waypoints still in progress", + ) + def prepare_intervention(self, intervention: Intervention) -> ExecutionDirective: """Record an intervention and transition state.""" self._current_intervention = intervention @@ -301,11 +324,21 @@ def resolve_intervention( ) if result.action == InterventionAction.ROLLBACK: + outcome = self.coordinator.rollback_to_tag(result.rollback_tag) self.coordinator.transition(JourneyState.FLY_PAUSED) - self.coordinator.transition(JourneyState.FLY_READY) - self.execution_state = 
ExecutionState.IDLE self._current_intervention = None - return ExecutionDirective(action="noop", message="Rollback requested") + + if outcome.status == "success": + self.coordinator.transition(JourneyState.FLY_READY) + self.execution_state = ExecutionState.IDLE + return ExecutionDirective( + action="pause", + message=outcome.message, + reload_flight_plan=True, + ) + + self.execution_state = ExecutionState.PAUSED + return ExecutionDirective(action="pause", message=outcome.message) if result.action == InterventionAction.ABORT: self.coordinator.transition(JourneyState.FLY_PAUSED) diff --git a/src/waypoints/orchestration/types.py b/src/waypoints/orchestration/types.py index 876d612..f9a85a9 100644 --- a/src/waypoints/orchestration/types.py +++ b/src/waypoints/orchestration/types.py @@ -6,10 +6,13 @@ from collections.abc import AsyncIterator, Callable from dataclasses import dataclass, field +from pathlib import Path from typing import TYPE_CHECKING, Literal if TYPE_CHECKING: from waypoints.fly.intervention import Intervention + from waypoints.git.receipt import ReceiptValidationResult + from waypoints.models.flight_plan import FlightPlan from waypoints.models.waypoint import Waypoint @@ -58,6 +61,25 @@ class CommitOutcome: notices: tuple[CommitNotice, ...] = () +@dataclass(frozen=True, slots=True) +class VerificationSummary: + """Summary of acceptance criteria and receipt validation for a waypoint.""" + + total_criteria: int + completed_criteria: frozenset[int] + receipt_path: Path | None = None + receipt_validation: "ReceiptValidationResult | None" = None + + +@dataclass(frozen=True, slots=True) +class RollbackOutcome: + """Outcome of a rollback attempt.""" + + status: Literal["success", "failure"] + message: str + flight_plan: "FlightPlan | None" = None + + @dataclass class NextAction: """What should happen next after an operation. 
diff --git a/src/waypoints/tui/screens/fly.py b/src/waypoints/tui/screens/fly.py index b750e4d..9afc730 100644 --- a/src/waypoints/tui/screens/fly.py +++ b/src/waypoints/tui/screens/fly.py @@ -39,7 +39,6 @@ InterventionResult, ) from waypoints.fly.state import ExecutionState -from waypoints.git import GitService, ReceiptValidator from waypoints.models import JourneyState, Project from waypoints.models.flight_plan import FlightPlan from waypoints.models.waypoint import Waypoint, WaypointStatus @@ -415,11 +414,16 @@ class WaypointDetailPanel(Vertical): """ def __init__( - self, project: Project, flight_plan: FlightPlan, **kwargs: Any + self, + project: Project, + flight_plan: FlightPlan, + coordinator: JourneyCoordinator, + **kwargs: Any, ) -> None: super().__init__(**kwargs) self._project = project self._flight_plan = flight_plan + self._coordinator = coordinator self._waypoint: Waypoint | None = None self._waypoint_cost: float | None = None self._waypoint_tokens: tuple[int, int] | None = None @@ -555,6 +559,10 @@ def _format_metrics_line( metrics_parts.append(f"Cost: ${cost:.2f}") return " · ".join(metrics_parts) + def update_flight_plan(self, flight_plan: FlightPlan) -> None: + """Update the flight plan reference for detail rendering.""" + self._flight_plan = flight_plan + def _update_output_for_waypoint( self, waypoint: Waypoint, active_waypoint_id: str | None ) -> None: @@ -862,6 +870,9 @@ def _log_historical_verification( ) total_criteria = len(waypoint.acceptance_criteria) completed_count = len(completed_criteria) + summary = self._coordinator.build_verification_summary( + waypoint, completed_criteria + ) if total_criteria > 0: for i, criterion in enumerate(waypoint.acceptance_criteria): @@ -878,11 +889,8 @@ def _log_historical_verification( ) # Check receipt status - validator = ReceiptValidator() - receipt_path = validator.find_latest_receipt(self._project, waypoint.id) - - if receipt_path: - result = validator.validate(receipt_path) + if 
summary.receipt_path and summary.receipt_validation: + result = summary.receipt_validation if result.valid: log.write_log("[green]✓ Receipt validated[/]") else: @@ -893,7 +901,7 @@ def _log_historical_verification( if result.receipt: detail_panel = self.query_one("#waypoint-detail", WaypointDetailPanel) detail_panel._log_soft_validation_evidence( - log, result.receipt, receipt_path + log, result.receipt, summary.receipt_path ) else: log.write_log("[yellow]⚠ No receipt found[/]") @@ -1276,6 +1284,7 @@ def compose(self) -> ComposeResult: right=WaypointDetailPanel( project=self.project, flight_plan=self.flight_plan, + coordinator=self.coordinator, id="waypoint-detail", ), left_pct=33, @@ -2016,19 +2025,25 @@ def _on_intervention_result(self, result: InterventionResult | None) -> None: "Edit waypoint in flight plan, then press 'r' to retry", severity="information", ) - elif result.action == InterventionAction.ROLLBACK: - log.write_log("Rolling back to last safe tag") - self._rollback_to_safe_tag(result.rollback_tag) elif result.action == InterventionAction.ABORT: log.write_log("Execution aborted") self.notify("Execution aborted") directive = self.execution_controller.resolve_intervention(result) if directive.message: + if result.action == InterventionAction.ROLLBACK: + log.write_log(directive.message) self.notify(directive.message) self.execution_state = self.execution_controller.execution_state self.query_one(StatusHeader).set_normal() + if directive.reload_flight_plan: + if self.coordinator.flight_plan: + self.flight_plan = self.coordinator.flight_plan + detail_panel = self.query_one("#waypoint-detail", WaypointDetailPanel) + detail_panel.update_flight_plan(self.flight_plan) + self._sync_current_waypoint_details() + self._refresh_waypoint_list() if directive.action == "execute": @@ -2038,42 +2053,6 @@ def _on_intervention_result(self, result: InterventionResult | None) -> None: self.notify("All waypoints complete!") self._switch_to_land_screen() - def 
_rollback_to_safe_tag(self, tag: str | None) -> None: - """Rollback git to the specified tag or find the last safe one.""" - git = GitService(self.project.get_path()) - - if not git.is_git_repo(): - self.notify("Not a git repository - cannot rollback", severity="error") - return - - if tag: - # Use specified tag - target_tag = tag - else: - # Find last safe tag (project/WP-* pattern) - # This is a simplified version - a full implementation would list tags - self.notify( - "No rollback tag specified - please use git manually", - severity="warning", - ) - return - - # Perform the rollback - result = git.reset_hard(target_tag) - if result.success: - self.notify(f"Rolled back to {target_tag}") - # Reload flight plan from disk after reset - flight_plan_path = self.project.get_path() / "flight-plan.jsonl" - if flight_plan_path.exists(): - from waypoints.models.flight_plan import FlightPlanReader - - loaded = FlightPlanReader.load(self.project) - if loaded: - self.flight_plan = loaded - self._refresh_waypoint_list() - else: - self.notify(f"Rollback failed: {result.message}", severity="error") - def _refresh_waypoint_list( self, execution_state: ExecutionState | None = None ) -> None: @@ -2099,6 +2078,9 @@ def _log_verification_summary(self, waypoint: Waypoint, log: ExecutionLog) -> No log.log_heading("Verification Summary") # Report live acceptance criteria status + summary = self.coordinator.build_verification_summary( + waypoint, self._live_criteria_completed + ) total_criteria = len(waypoint.acceptance_criteria) live_completed = len(self._live_criteria_completed) @@ -2117,11 +2099,8 @@ def _log_verification_summary(self, waypoint: Waypoint, log: ExecutionLog) -> No ) # Check receipt status - validator = ReceiptValidator() - receipt_path = validator.find_latest_receipt(self.project, waypoint.id) - - if receipt_path: - result = validator.validate(receipt_path) + if summary.receipt_path and summary.receipt_validation: + result = summary.receipt_validation if result.valid: 
log.write_log("[green]✓ Receipt validated[/]") else: @@ -2132,7 +2111,7 @@ def _log_verification_summary(self, waypoint: Waypoint, log: ExecutionLog) -> No if result.receipt: detail_panel = self.query_one("#waypoint-detail", WaypointDetailPanel) detail_panel._log_soft_validation_evidence( - log, result.receipt, receipt_path + log, result.receipt, summary.receipt_path ) else: log.write_log("[yellow]⚠ No receipt found[/]") @@ -2151,22 +2130,12 @@ def _check_parent_completion(self, completed_waypoint: Waypoint) -> None: def action_forward(self) -> None: """Go forward to Land screen if available.""" - # Check if Land is available (all waypoints complete or already in LAND_REVIEW) - journey = self.project.journey - if journey and journey.state == JourneyState.LAND_REVIEW: + directive = self.execution_controller.request_land() + if directive.action == "land": self._switch_to_land_screen() return - - # Check if all waypoints are complete - all_complete, pending, failed, blocked = self._get_completion_status() - if all_complete: - self.coordinator.transition(JourneyState.LAND_REVIEW) - self._switch_to_land_screen() - elif self.execution_state == ExecutionState.DONE: - # DONE but not all_complete - blocked waypoints - self.notify("Cannot land yet - some waypoints are blocked or failed") - else: - self.notify("Cannot land yet - waypoints still in progress") + if directive.message: + self.notify(directive.message) def action_shrink_left(self) -> None: """Shrink the left pane."""