sourcegraph
diff --git a/.github/workflows/task_smoke_matrix.yml b/.github/workflows/task_smoke_matrix.yml
Lines changed: 84 additions & 0 deletions
diff --git a/AGENTS.md b/AGENTS.md
Lines changed: 18 additions & 16 deletions
diff --git a/CLAUDE.md b/CLAUDE.md
Lines changed: 18 additions & 16 deletions
diff --git a/agents/claude_baseline_agent.py b/agents/claude_baseline_agent.py
Lines changed: 15 additions & 2 deletions
@@ -0,0 +1,84 @@
+name: Task smoke matrix
+
+on:  # triggers: manual dispatch, PRs against main, and pushes to main (both path-filtered)
+  workflow_dispatch:
+  pull_request:
+    branches: [main]
+    paths:  # same filter list as push.paths below
+      - "benchmarks/**"
+      - "configs/validate_one_per_benchmark.sh"
+      - "configs/registry_smoke_matrix.json"
+      - "scripts/validate_tasks_preflight.py"
+      - "docs/reference/TASK_CONTRACT.md"
+      - ".github/workflows/task_smoke_matrix.yml"  # re-run when this workflow file itself changes
+  push:
+    branches: [main]
+    paths:  # same filter list as pull_request.paths above
+      - "benchmarks/**"
+      - "configs/validate_one_per_benchmark.sh"
+      - "configs/registry_smoke_matrix.json"
+      - "scripts/validate_tasks_preflight.py"
+      - "docs/reference/TASK_CONTRACT.md"
+      - ".github/workflows/task_smoke_matrix.yml"  # re-run when this workflow file itself changes
+
+jobs:
+  contract-audit:  # runs the preflight validator in --contract-only mode and fails on non-allowlisted issues
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"  # quoted so YAML keeps the string "3.10" instead of the float 3.1
+
+      - name: Full contract audit  # writes the validator's JSON to contract_audit.json, then filters it with inline Python
+        run: |  # NOTE(review): default run shell is fail-fast; if the validator exits non-zero on findings, the allowlist filter below never runs - confirm its exit code
+          python3 scripts/validate_tasks_preflight.py --all --contract-only --summary-by-check --format json > contract_audit.json
+          python3 - <<'PY'
+          import json
+          with open("contract_audit.json") as f:
+              data = json.load(f)
+          allowed = {"daytona_storage_over_10g"}
+          disallowed = [issue for issue in data["issues"] if issue["check"] not in allowed]
+          if disallowed:
+              for issue in disallowed[:50]:
+                  print(issue)
+              raise SystemExit(f"{len(disallowed)} disallowed contract issues found")
+          print(data["summary_by_check"])
+          PY
+
+  smoke-runtime:  # runs configs/validate_one_per_benchmark.sh in --smoke-runtime mode once per matrix variant
+    runs-on: ubuntu-latest
+    timeout-minutes: 90
+    strategy:
+      fail-fast: false  # one failing variant must not cancel the others
+      matrix:
+        variant:  # every value needs a matching arm in the case statement of the smoke step
+          - baseline
+          - sg-only
+          - artifact-only
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"  # quoted so YAML keeps the string "3.10" instead of the float 3.1
+
+      - name: Smoke curated registry matrix
+        run: |
+          set -euo pipefail
+          extra_args=()
+          case "${{ matrix.variant }}" in
+            baseline) ;;  # baseline passes no variant flag
+            sg-only | artifact-only) extra_args+=("--${{ matrix.variant }}") ;;  # flag is "--" followed by the variant name
+            *) echo "unhandled smoke variant '${{ matrix.variant }}'" >&2; exit 1 ;;  # fail loudly instead of silently running as baseline
+          esac
+          bash configs/validate_one_per_benchmark.sh \
+            --selection-file configs/registry_smoke_matrix.json \
+            --smoke-runtime \
+            --smoke-timeout-sec 300 \
+            --smoke-timeout-overrides "csb_sdlc_design=450,csb_sdlc_document=450,csb_sdlc_feature=600,csb_sdlc_fix=600,csb_sdlc_refactor=600,csb_sdlc_test=450,csb_sdlc_understand=450" \
+            --max-concurrent 2 \
+            "${extra_args[@]}"
@@ -10,14 +10,17 @@ full operations manual.
 - Before commit/push, run `python3 scripts/repo_health.py` (or `--quick` for docs/config-only changes).
 - Prefer a **remote execution environment** (e.g., Daytona) for large benchmark runs; use local Docker only when a task’s image or registry is incompatible with your cloud environment. See `docs/DAYTONA.md`.
 - Set **parallelism based on your own account and model limits**. Avoid exceeding documented concurrency or rate caps for your environment or provider.
+- Before launching any benchmark batch, check account readiness with `python3 scripts/check_infra.py` or `python3 scripts/account_health.py status`. Do not assume OAuth accounts are usable just because credentials exist.
 
-## Beads Prerequisite
+## Beads Prerequisite and Usage
 - Keep the Beads CLI (`bd`, alias `beads`) up to date before running agent workflows that rely on task graphs.
 - Install or update with the official installer:
 ```bash
 curl -fsSL https://raw.githubusercontent.com/steveyegge/beads/main/scripts/install.sh | bash
 ```
 - Verify install/version with `bd --version` (or `beads --version`).
+- Do not use `bd edit`; use non-interactive `bd create/update/close --json` or stdin-based `--description=-`.
+- Typical flow: `bd ready --json`, `bd create ... --json`, `bd update <id> --claim`, `bd close <id> --reason "Done"`.
 
 ## Minimal Loading Policy
 - Default load order: this file + one relevant skill + one relevant doc.
@@ -41,7 +44,17 @@ curl -fsSL https://raw.githubusercontent.com/steveyegge/beads/main/scripts/insta
 - Compact after exploration, before multi-file edits.
 - Compact after launching a benchmark batch.
 - Compact after completing a triage batch or report generation pass.
-- Use `docs/ops/HANDOFF_TEMPLATE.md` when handing work to a new session.
+- When handing work to a new session, use the generic `/handoff` skill to generate an inline copy/paste handoff prompt.
+- Do not create a markdown handoff file unless the user explicitly asks for one.
+- Use `docs/ops/HANDOFF_TEMPLATE.md` as a checklist for what the handoff should include.
+
+## Landing the Plane (Session Completion)
+- Track remaining follow-up in issues or beads.
+- Run `python3 scripts/repo_health.py` (or `--quick` for docs/config-only changes).
+- Update issue/task status.
+- `git pull --rebase && git push && git status` and confirm `main` is up to date with `origin/main`.
+- Clean up and hand off using `/handoff` plus `docs/ops/HANDOFF_TEMPLATE.md`.
+- Work is not complete until push succeeds.
 
 ## Canonical Maps
 - `docs/START_HERE_BY_TASK.md` - task-based read order
@@ -82,8 +95,8 @@ curl -fsSL https://raw.githubusercontent.com/steveyegge/beads/main/scripts/insta
 - Timing fields (`started_at`, `finished_at`) live at the **top level** of `result.json`, not nested under `timing`.
 - `trajectory.json` is generated by Harbor's `_convert_events_to_trajectory()` post-processing, NOT by Claude Code CLI directly.
 - SWE-bench `test.sh` redirects stdout to a temp file -- Harbor never sees the parser's `START_TEST_OUTPUT`/`END_TEST_OUTPUT` markers via its normal capture.
-- Token usage data lives in `trajectory.json` per-step metrics with tool attribution. `TranscriptParser` only parses plain text transcripts and ignores trajectory.json.
-- Harbor task contract requires writing to `/logs/verifier/reward.txt`. MCP integration happens at the agent runner level, not the individual task level.
+- Token usage data lives in `trajectory.json`; plain transcript parsers do not see it.
+- Harbor task contract requires writing `/logs/verifier/reward.txt`.
 
 ### Validation / Scoring
 - `validators.py` is duplicated across `ccb_build` tasks. Changes must be applied to **all copies** (verify with `sha256sum`).
@@ -92,12 +105,10 @@ curl -fsSL https://raw.githubusercontent.com/steveyegge/beads/main/scripts/insta
 - Trial directory names are truncated with hash suffixes (e.g., `c_api_graphql_expert_079_archite__pm9xcPn`). The real task name lives in `config.json` at `task.path`.
 - LoCoBench task IDs contain multi-word fields (e.g., `game_engine`, `cross_file_refactoring`). Use the 3-digit task number as a positional anchor for parsing instead of rigid regexes that assume single-word fields.
 
-### Gitignore
-- Unanchored `.gitignore` patterns like `dirname/` match at **any directory level**. Use `/dirname/` to anchor to root only. (e.g., `10figure/` inadvertently blocked `benchmarks/10figure/` from being committed.)
-
 ### Git / Auth
 - `gh auth refresh` without `-s <scope>` is a no-op for adding scopes. Must use `gh auth refresh -h github.com -s write:packages` explicitly.
 - Environment variables must be **explicitly exported** for Harbor subprocesses. Use `set -a` before sourcing `.env.local`.
+- Account readiness is tracked in `runs/state/account_health.json`. Launchers source `configs/_common.sh`, filter out unsafe accounts before launch, and record recent runtime rate-limit observations there for operator context.
 - GitHub push protection blocks synthetic/fake API keys in test data. Use `git reset --soft origin/main` to squash intermediate commits that contained fake credentials.
 - Shallow clones (`--depth 1`) fail on push to GitHub with `remote: fatal: did not receive expected object`. Always use full clones for repos that will be pushed.
 - Some repos use `master` as default branch. Detect with `git symbolic-ref refs/remotes/origin/HEAD` and remap to `main` if needed.
@@ -108,15 +119,6 @@ curl -fsSL https://raw.githubusercontent.com/steveyegge/beads/main/scripts/insta
 - `with open(log) as f: subprocess.Popen(stdout=f)` closes the file handle immediately after `Popen()` returns. Use `open()` without context manager for long-running subprocesses.
 - macOS ships Bash 3.2 which lacks associative arrays (`declare -A`). Use pipe-delimited string arrays with `IFS='|' read -r` for compatibility.
 
-### Dashboard / Streamlit
-- Streamlit widget keys in loops must include an index or unique ID to avoid `DuplicateElementKey` errors (e.g., `key=f"nav_{idx}_{page}"` not `key=f"nav_{page}"`).
-- `st.session_state` cannot be modified after widget instantiation. Use `on_click` callback pattern that sets state before widget rerender.
-- Sidebar config below navigation menu is invisible without scrolling. Put critical UI controls in the main content area using `st.columns()`.
-- Always check actual dataclass field names before writing view code. Common mismatches: `agent_results` vs `agent_metrics`, `anomalies` vs `total_anomalies`, dict access vs object attributes.
-- Process handles stored in `st.session_state` are lost on browser refresh. For long-running background processes, use file-based persistent tracking (e.g., `.dashboard_runs/` JSON files) instead.
-- Prefer `st.dataframe` over `st.columns()` with buttons for tabular data -- column layouts squash buttons at narrow viewports.
-- Metric precision matters: use 4+ decimal places for reward/duration comparisons. Rounding to 2 decimals silently loses information needed for meaningful comparison.
-
 ### LLM Judge
 - Always include "Respond with valid JSON only (escape all quotes and special characters)" in judge prompts. Unescaped quotes in LLM-generated JSON break parsing.
 - Judge should use task-type-aware evaluation: different rubrics for code implementation, architectural understanding, and bug fix tasks.
 
@@ -10,14 +10,17 @@ full operations manual.
 - Before commit/push, run `python3 scripts/repo_health.py` (or `--quick` for docs/config-only changes).
 - Prefer a **remote execution environment** (e.g., Daytona) for large benchmark runs; use local Docker only when a task’s image or registry is incompatible with your cloud environment. See `docs/DAYTONA.md`.
 - Set **parallelism based on your own account and model limits**. Avoid exceeding documented concurrency or rate caps for your environment or provider.
+- Before launching any benchmark batch, check account readiness with `python3 scripts/check_infra.py` or `python3 scripts/account_health.py status`. Do not assume OAuth accounts are usable just because credentials exist.
 
-## Beads Prerequisite
+## Beads Prerequisite and Usage
 - Keep the Beads CLI (`bd`, alias `beads`) up to date before running agent workflows that rely on task graphs.
 - Install or update with the official installer:
 ```bash
 curl -fsSL https://raw.githubusercontent.com/steveyegge/beads/main/scripts/install.sh | bash
 ```
 - Verify install/version with `bd --version` (or `beads --version`).
+- Do not use `bd edit`; use non-interactive `bd create/update/close --json` or stdin-based `--description=-`.
+- Typical flow: `bd ready --json`, `bd create ... --json`, `bd update <id> --claim`, `bd close <id> --reason "Done"`.
 
 ## Minimal Loading Policy
 - Default load order: this file + one relevant skill + one relevant doc.
@@ -41,7 +44,17 @@ curl -fsSL https://raw.githubusercontent.com/steveyegge/beads/main/scripts/insta
 - Compact after exploration, before multi-file edits.
 - Compact after launching a benchmark batch.
 - Compact after completing a triage batch or report generation pass.
-- Use `docs/ops/HANDOFF_TEMPLATE.md` when handing work to a new session.
+- When handing work to a new session, use the generic `/handoff` skill to generate an inline copy/paste handoff prompt.
+- Do not create a markdown handoff file unless the user explicitly asks for one.
+- Use `docs/ops/HANDOFF_TEMPLATE.md` as a checklist for what the handoff should include.
+
+## Landing the Plane (Session Completion)
+- Track remaining follow-up in issues or beads.
+- Run `python3 scripts/repo_health.py` (or `--quick` for docs/config-only changes).
+- Update issue/task status.
+- `git pull --rebase && git push && git status` and confirm `main` is up to date with `origin/main`.
+- Clean up and hand off using `/handoff` plus `docs/ops/HANDOFF_TEMPLATE.md`.
+- Work is not complete until push succeeds.
 
 ## Canonical Maps
 - `docs/START_HERE_BY_TASK.md` - task-based read order
@@ -82,8 +95,8 @@ curl -fsSL https://raw.githubusercontent.com/steveyegge/beads/main/scripts/insta
 - Timing fields (`started_at`, `finished_at`) live at the **top level** of `result.json`, not nested under `timing`.
 - `trajectory.json` is generated by Harbor's `_convert_events_to_trajectory()` post-processing, NOT by Claude Code CLI directly.
 - SWE-bench `test.sh` redirects stdout to a temp file -- Harbor never sees the parser's `START_TEST_OUTPUT`/`END_TEST_OUTPUT` markers via its normal capture.
-- Token usage data lives in `trajectory.json` per-step metrics with tool attribution. `TranscriptParser` only parses plain text transcripts and ignores trajectory.json.
-- Harbor task contract requires writing to `/logs/verifier/reward.txt`. MCP integration happens at the agent runner level, not the individual task level.
+- Token usage data lives in `trajectory.json`; plain transcript parsers do not see it.
+- Harbor task contract requires writing `/logs/verifier/reward.txt`.
 
 ### Validation / Scoring
 - `validators.py` is duplicated across `ccb_build` tasks. Changes must be applied to **all copies** (verify with `sha256sum`).
@@ -92,12 +105,10 @@ curl -fsSL https://raw.githubusercontent.com/steveyegge/beads/main/scripts/insta
 - Trial directory names are truncated with hash suffixes (e.g., `c_api_graphql_expert_079_archite__pm9xcPn`). The real task name lives in `config.json` at `task.path`.
 - LoCoBench task IDs contain multi-word fields (e.g., `game_engine`, `cross_file_refactoring`). Use the 3-digit task number as a positional anchor for parsing instead of rigid regexes that assume single-word fields.
 
-### Gitignore
-- Unanchored `.gitignore` patterns like `dirname/` match at **any directory level**. Use `/dirname/` to anchor to root only. (e.g., `10figure/` inadvertently blocked `benchmarks/10figure/` from being committed.)
-
 ### Git / Auth
 - `gh auth refresh` without `-s <scope>` is a no-op for adding scopes. Must use `gh auth refresh -h github.com -s write:packages` explicitly.
 - Environment variables must be **explicitly exported** for Harbor subprocesses. Use `set -a` before sourcing `.env.local`.
+- Account readiness is tracked in `runs/state/account_health.json`. Launchers source `configs/_common.sh`, filter out unsafe accounts before launch, and record recent runtime rate-limit observations there for operator context.
 - GitHub push protection blocks synthetic/fake API keys in test data. Use `git reset --soft origin/main` to squash intermediate commits that contained fake credentials.
 - Shallow clones (`--depth 1`) fail on push to GitHub with `remote: fatal: did not receive expected object`. Always use full clones for repos that will be pushed.
 - Some repos use `master` as default branch. Detect with `git symbolic-ref refs/remotes/origin/HEAD` and remap to `main` if needed.
@@ -108,15 +119,6 @@ curl -fsSL https://raw.githubusercontent.com/steveyegge/beads/main/scripts/insta
 - `with open(log) as f: subprocess.Popen(stdout=f)` closes the file handle immediately after `Popen()` returns. Use `open()` without context manager for long-running subprocesses.
 - macOS ships Bash 3.2 which lacks associative arrays (`declare -A`). Use pipe-delimited string arrays with `IFS='|' read -r` for compatibility.
 
-### Dashboard / Streamlit
-- Streamlit widget keys in loops must include an index or unique ID to avoid `DuplicateElementKey` errors (e.g., `key=f"nav_{idx}_{page}"` not `key=f"nav_{page}"`).
-- `st.session_state` cannot be modified after widget instantiation. Use `on_click` callback pattern that sets state before widget rerender.
-- Sidebar config below navigation menu is invisible without scrolling. Put critical UI controls in the main content area using `st.columns()`.
-- Always check actual dataclass field names before writing view code. Common mismatches: `agent_results` vs `agent_metrics`, `anomalies` vs `total_anomalies`, dict access vs object attributes.
-- Process handles stored in `st.session_state` are lost on browser refresh. For long-running background processes, use file-based persistent tracking (e.g., `.dashboard_runs/` JSON files) instead.
-- Prefer `st.dataframe` over `st.columns()` with buttons for tabular data -- column layouts squash buttons at narrow viewports.
-- Metric precision matters: use 4+ decimal places for reward/duration comparisons. Rounding to 2 decimals silently loses information needed for meaningful comparison.
-
 ### LLM Judge
 - Always include "Respond with valid JSON only (escape all quotes and special characters)" in judge prompts. Unescaped quotes in LLM-generated JSON break parsing.
 - Judge should use task-type-aware evaluation: different rubrics for code implementation, architectural understanding, and bug fix tasks.
 
@@ -941,6 +941,18 @@ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
             if cmd.command and "claude " in cmd.command:
                 # Start with the base command
                 modified_command = cmd.command
+
+                # Harbor's upstream Claude command pipes stream-json output through
+                # `tee` to write /logs/agent/claude-code.txt. In long-running Claude
+                # sessions this can hang after the final answer if descendant
+                # processes inherit the pipeline FDs, leaving Harbor waiting for EOF
+                # long after the agent has logically finished. Redirect directly to
+                # the transcript file instead; Harbor reads the downloaded file for
+                # post-run analysis, so live tee output is unnecessary here.
+                modified_command = modified_command.replace(
+                    "2>&1 </dev/null | stdbuf -oL tee /logs/agent/claude-code.txt",
+                    "</dev/null >/logs/agent/claude-code.txt 2>&1",
+                )
 
                 # Insert flags after "claude " - build them up incrementally
                 flags_to_insert = []
@@ -1123,8 +1135,9 @@ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
 
                 modified_command = (
                     f"{setup_cmds}{file_cmds} && "
-                    f"{run_cmd} ; "
-                    "chmod -R a+rX /logs 2>/dev/null || true"
+                    f"{run_cmd}; _run_status=$?; "
+                    "chmod -R a+rX /logs 2>/dev/null || true; "
+                    "exit $_run_status"
                 )
 
                 # CRITICAL: Add autonomous environment variables