feat: US-019 - Selection file and config wiring

sjarmak · claude · sjarmak · commit b758c58fa081 · 2026-02-20T21:39:32.000Z
- Add --selection-file and --use-case-category flags to run_selected_tasks.sh
- Update extract_tasks() to support both benchmark and mcp_suite field formats
- Add mcp_benefit_score to all 12 tasks in selected_mcp_unique_tasks.json
- Fix task_dir format (no benchmarks/ prefix, consistent with standard format)
- Add 10 ccb_mcp_* suite prefixes to DIR_PREFIX_TO_SUITE in aggregate_status.py
  and generate_manifest.py
- Update validate_tasks_preflight.py to accept tests/eval.sh as alternative
  to tests/test.sh (MCP-unique tasks use SWE-Factory eval.sh pattern)
- Smoke test: dry-run with --selection-file shows all 12 tasks across 6 suites

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/AGENTS.md b/AGENTS.md
@@ -26,6 +26,11 @@ per-task details.
 - `docs/SKILLS.md` - AI agent skill system overview
 - `skills/` - operational runbooks for AI agents (see `skills/README.md`)
 
+## Git Policy
+- **All work happens on `main`** — do NOT create feature branches.
+- Never run `git checkout -b` or `git switch -c`.
+- Commit directly to `main`. This avoids cross-session branch confusion when multiple agents work on the repo.
+
 ## Typical Skill Routing
 Use these defaults unless there is a task-specific reason not to.
 
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -26,6 +26,11 @@ per-task details.
 - `docs/SKILLS.md` - AI agent skill system overview
 - `skills/` - operational runbooks for AI agents (see `skills/README.md`)
 
+## Git Policy
+- **All work happens on `main`** — do NOT create feature branches.
+- Never run `git checkout -b` or `git switch -c`.
+- Commit directly to `main`. This avoids cross-session branch confusion when multiple agents work on the repo.
+
 ## Typical Skill Routing
 Use these defaults unless there is a task-specific reason not to.
 
diff --git a/agents/claude_baseline_agent.py b/agents/claude_baseline_agent.py
@@ -838,6 +838,7 @@ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
                 script_lines = [
                     '#!/bin/bash',
                     'export PATH=/usr/local/bin:/usr/bin:/bin:$PATH',
+                    'export CLAUDE_CODE_MAX_OUTPUT_TOKENS=128000',
                     '# Detect working directory',
                     'if [ -d /workspace ]; then WORKDIR=/workspace',
                     'elif [ -d /app ]; then WORKDIR=/app',
@@ -897,11 +898,12 @@ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
                 env_with_autonomous = {
                     **env,
                     'FORCE_AUTO_BACKGROUND_TASKS': '1',
-                    'ENABLE_BACKGROUND_TASKS': '1'
+                    'ENABLE_BACKGROUND_TASKS': '1',
+                    'CLAUDE_CODE_MAX_OUTPUT_TOKENS': '128000',
                 }
 
                 # Add SSL workaround for MCP HTTP transport
-                if mcp_type in ["sourcegraph", "sourcegraph_full", "sourcegraph_base", "sourcegraph_isolated", "deepsearch", "deepsearch_hybrid"]:
+                if mcp_type in ["sourcegraph", "sourcegraph_full", "sourcegraph_base", "sourcegraph_isolated", "artifact_full", "deepsearch", "deepsearch_hybrid"]:
                     env_with_autonomous['NODE_TLS_REJECT_UNAUTHORIZED'] = '0'
 
                 if mcp_type == "sourcegraph_base":
diff --git a/configs/run_selected_tasks.sh b/configs/run_selected_tasks.sh
@@ -12,17 +12,19 @@
 #   ./configs/run_selected_tasks.sh [OPTIONS]
 #
 # Options:
-#   --benchmark BENCHMARK  Run only this benchmark (e.g., ccb_build, ccb_fix)
-#   --baseline-only        Run only baseline (no MCP)
-#   --full-only            Run only MCP-Full (sourcegraph_full)
-#   --model MODEL          Override model (default: claude-opus-4-6)
-#   --concurrency N        Concurrent tasks (default: 2)
-#   --category CATEGORY    Run category (default: staging)
-#   --skip-completed       Skip tasks that already have result.json + task_metrics.json
-#   --dry-run              Print tasks without running
+#   --benchmark BENCHMARK           Run only this benchmark (e.g., ccb_build, ccb_fix)
+#   --selection-file PATH           Use alternate selection file (default: configs/selected_benchmark_tasks.json)
+#   --use-case-category CATEGORY    Filter by MCP-unique use case category (A-J), only valid with --selection-file
+#   --baseline-only                 Run only baseline (no MCP)
+#   --full-only                     Run only MCP-Full (sourcegraph_full)
+#   --model MODEL                   Override model (default: claude-opus-4-6)
+#   --concurrency N                 Concurrent tasks (default: 1)
+#   --category CATEGORY             Run category (default: staging)
+#   --skip-completed                Skip tasks that already have result.json + task_metrics.json
+#   --dry-run                       Print tasks without running
 #
 # Prerequisites:
-#   - configs/selected_benchmark_tasks.json in repo
+#   - configs/selected_benchmark_tasks.json in repo (or --selection-file path)
 #   - .env.local (repo root) with USE_SUBSCRIPTION=true
 #   - SOURCEGRAPH_ACCESS_TOKEN in .env.local (for MCP modes)
 
@@ -44,6 +46,7 @@ SELECTION_FILE="$REPO_ROOT/configs/selected_benchmark_tasks.json"
 # PARSE ARGUMENTS
 # ============================================
 BENCHMARK_FILTER=""
+USE_CASE_CATEGORY_FILTER=""
 MODEL="${MODEL:-anthropic/claude-opus-4-6}"
 CONCURRENCY=1
 TIMEOUT_MULTIPLIER=10
@@ -60,6 +63,14 @@ while [[ $# -gt 0 ]]; do
             BENCHMARK_FILTER="$2"
             shift 2
             ;;
+        --selection-file)
+            SELECTION_FILE="$2"
+            shift 2
+            ;;
+        --use-case-category)
+            USE_CASE_CATEGORY_FILTER="$2"
+            shift 2
+            ;;
         --baseline-only)
             RUN_FULL=false
             shift
@@ -120,17 +131,26 @@ ensure_fresh_token
 # EXTRACT TASKS FROM SELECTION FILE
 # ============================================
 # Python helper to extract task info grouped by benchmark
+# Supports both standard format (benchmark field) and MCP-unique format (mcp_suite field).
+# task_dir in both formats is relative to benchmarks/ (no benchmarks/ prefix).
 extract_tasks() {
     python3 -c "
 import json, sys
 
 selection = json.load(open('$SELECTION_FILE'))
 benchmark_filter = '$BENCHMARK_FILTER'
+use_case_category_filter = '$USE_CASE_CATEGORY_FILTER'
 
 for task in selection['tasks']:
-    bm = task['benchmark']
+    # Support both standard (benchmark) and MCP-unique (mcp_suite) selection files
+    bm = task.get('benchmark') or task.get('mcp_suite', '')
+    if not bm:
+        continue
     if benchmark_filter and bm != benchmark_filter:
         continue
+    if use_case_category_filter and task.get('use_case_category', '') != use_case_category_filter:
+        continue
+    # task_dir is relative to benchmarks/ in both formats
     task_dir = 'benchmarks/' + task['task_dir']
     print(f'{bm}\t{task[\"task_id\"]}\t{task_dir}')
 "
@@ -181,6 +201,7 @@ echo "Total tasks:   $TOTAL_TASKS"
 echo "Concurrency:   $CONCURRENCY"
 echo "Configs:       baseline=$RUN_BASELINE sourcegraph_full=$RUN_FULL"
 echo "Skip done:     $SKIP_COMPLETED"
+[ -n "$USE_CASE_CATEGORY_FILTER" ] && echo "Category:      $USE_CASE_CATEGORY_FILTER"
 echo ""
 echo "Tasks per benchmark:"
 for bm in $(echo "${!BENCHMARK_COUNTS[@]}" | tr ' ' '\n' | sort); do
diff --git a/configs/selected_mcp_unique_tasks.json b/configs/selected_mcp_unique_tasks.json
@@ -10,7 +10,8 @@
       "language": "go",
       "difficulty": "medium",
       "repo": "sg-benchmarks/kubernetes-client-go",
-      "task_dir": "benchmarks/ccb_mcp_crossrepo_tracing/ccx-dep-trace-001",
+      "mcp_benefit_score": 0.80,
+      "task_dir": "ccb_mcp_crossrepo_tracing/ccx-dep-trace-001",
       "deepsearch_relevant": false,
       "oracle_check_types": ["file_set_match"],
       "repo_set_id": "kubernetes-ecosystem"
@@ -23,7 +24,8 @@
       "language": "go",
       "difficulty": "hard",
       "repo": "grafana/grafana",
-      "task_dir": "benchmarks/ccb_mcp_crossrepo_tracing/ccx-dep-trace-004",
+      "mcp_benefit_score": 0.90,
+      "task_dir": "ccb_mcp_crossrepo_tracing/ccx-dep-trace-004",
       "deepsearch_relevant": false,
       "oracle_check_types": ["dependency_chain", "provenance"],
       "repo_set_id": "grafana-observability"
@@ -36,7 +38,8 @@
       "language": "go",
       "difficulty": "medium",
       "repo": "sg-benchmarks/kubernetes-client-go",
-      "task_dir": "benchmarks/ccb_mcp_crossrepo_tracing/ccx-config-trace-010",
+      "mcp_benefit_score": 0.85,
+      "task_dir": "ccb_mcp_crossrepo_tracing/ccx-config-trace-010",
       "deepsearch_relevant": false,
       "oracle_check_types": ["symbol_resolution"],
       "repo_set_id": "kubernetes-ecosystem"
@@ -49,7 +52,8 @@
       "language": "javascript",
       "difficulty": "medium",
       "repo": "nodejs/node",
-      "task_dir": "benchmarks/ccb_mcp_security/ccx-vuln-remed-011",
+      "mcp_benefit_score": 0.80,
+      "task_dir": "ccb_mcp_security/ccx-vuln-remed-011",
       "deepsearch_relevant": false,
       "oracle_check_types": ["file_set_match", "keyword_presence"],
       "repo_set_id": "nodejs-web-stack"
@@ -62,7 +66,8 @@
       "language": "go",
       "difficulty": "hard",
       "repo": "grafana/grafana",
-      "task_dir": "benchmarks/ccb_mcp_security/ccx-vuln-remed-014",
+      "mcp_benefit_score": 0.90,
+      "task_dir": "ccb_mcp_security/ccx-vuln-remed-014",
       "deepsearch_relevant": false,
       "oracle_check_types": ["file_set_match", "provenance"],
       "repo_set_id": "grafana-observability"
@@ -75,7 +80,8 @@
       "language": "go",
       "difficulty": "hard",
       "repo": "kubernetes/kubernetes",
-      "task_dir": "benchmarks/ccb_mcp_incident/ccx-incident-031",
+      "mcp_benefit_score": 0.90,
+      "task_dir": "ccb_mcp_incident/ccx-incident-031",
       "deepsearch_relevant": false,
       "oracle_check_types": ["file_set_match", "keyword_presence"],
       "repo_set_id": "multi-org-go"
@@ -88,7 +94,8 @@
       "language": "python",
       "difficulty": "medium",
       "repo": "pandas-dev/pandas",
-      "task_dir": "benchmarks/ccb_mcp_onboarding/ccx-onboard-041",
+      "mcp_benefit_score": 0.80,
+      "task_dir": "ccb_mcp_onboarding/ccx-onboard-041",
       "deepsearch_relevant": false,
       "oracle_check_types": ["file_set_match", "provenance"],
       "repo_set_id": "python-ml-stack"
@@ -101,7 +108,8 @@
       "language": "go",
       "difficulty": "hard",
       "repo": "kubernetes/kubernetes",
-      "task_dir": "benchmarks/ccb_mcp_onboarding/ccx-onboard-050-ds",
+      "mcp_benefit_score": 0.95,
+      "task_dir": "ccb_mcp_onboarding/ccx-onboard-050-ds",
       "deepsearch_relevant": true,
       "oracle_check_types": ["dependency_chain", "provenance"],
       "repo_set_id": "kubernetes-ecosystem"
@@ -114,7 +122,8 @@
       "language": "go",
       "difficulty": "hard",
       "repo": "kubernetes/kubernetes",
-      "task_dir": "benchmarks/ccb_mcp_crossorg/ccx-crossorg-061",
+      "mcp_benefit_score": 0.92,
+      "task_dir": "ccb_mcp_crossorg/ccx-crossorg-061",
       "deepsearch_relevant": false,
       "oracle_check_types": ["symbol_resolution"],
       "repo_set_id": "multi-org-go"
@@ -127,7 +136,8 @@
       "language": "go",
       "difficulty": "medium",
       "repo": "kubernetes/kubernetes",
-      "task_dir": "benchmarks/ccb_mcp_crossorg/ccx-crossorg-066",
+      "mcp_benefit_score": 0.82,
+      "task_dir": "ccb_mcp_crossorg/ccx-crossorg-066",
       "deepsearch_relevant": false,
       "oracle_check_types": ["keyword_presence", "provenance"],
       "repo_set_id": "multi-org-go"
@@ -140,7 +150,8 @@
       "language": "python",
       "difficulty": "hard",
       "repo": "scikit-learn/scikit-learn",
-      "task_dir": "benchmarks/ccb_mcp_onboarding/ccx-explore-042-ds",
+      "mcp_benefit_score": 0.95,
+      "task_dir": "ccb_mcp_onboarding/ccx-explore-042-ds",
       "deepsearch_relevant": true,
       "oracle_check_types": ["dependency_chain", "provenance"],
       "repo_set_id": "python-ml-stack"
@@ -153,7 +164,8 @@
       "language": "go",
       "difficulty": "hard",
       "repo": "kubernetes/kubernetes",
-      "task_dir": "benchmarks/ccb_mcp_platform/ccx-explore-091-ds",
+      "mcp_benefit_score": 0.95,
+      "task_dir": "ccb_mcp_platform/ccx-explore-091-ds",
       "deepsearch_relevant": true,
       "oracle_check_types": ["file_set_match", "keyword_presence"],
       "repo_set_id": "kubernetes-ecosystem"
diff --git a/ralph-eval/prd.json b/ralph-eval/prd.json
@@ -1,6 +1,5 @@
 {
   "project": "Unified Evaluation Package",
-  "branchName": "ralph/unified-eval-package",
   "description": "Migrate LLM-as-judge into CCB, add dual-scoring (verifier + judge), statistical rigor (bootstrap CIs), verifier hardening (debug mode, self-tests), and oracle auto-discovery. Single reproducible evaluation package.",
   "userStories": [
     {
diff --git a/ralph-mcp-unique/CLAUDE.md b/ralph-mcp-unique/CLAUDE.md
@@ -6,14 +6,13 @@ You are an autonomous coding agent working on extending CodeContextBench with MC
 
 1. Read the PRD at `ralph-mcp-unique/prd.json` (relative to project root)
 2. Read the progress log at `ralph-mcp-unique/progress.txt` (check Codebase Patterns section first)
-3. Check you're on the correct branch from PRD `branchName`. If not, check it out or create from main.
-4. Pick the **highest priority** user story where `passes: false`
-5. Implement that single user story
-6. Run quality checks (e.g., typecheck, lint, test - use whatever your project requires)
-7. Update CLAUDE.md files if you discover reusable patterns
-8. If checks pass, commit ALL changes with message: `feat: [Story ID] - [Story Title]`
-9. Update the PRD to set `passes: true` for the completed story
-10. Append your progress to `ralph-mcp-unique/progress.txt`
+3. Pick the **highest priority** user story where `passes: false`
+4. Implement that single user story
+5. Run quality checks (e.g., typecheck, lint, test - use whatever your project requires)
+6. Update CLAUDE.md files if you discover reusable patterns
+7. If checks pass, commit ALL changes to `main` with message: `feat: [Story ID] - [Story Title]`
+8. Update the PRD to set `passes: true` for the completed story
+9. Append your progress to `ralph-mcp-unique/progress.txt`
 
 ## Key Architecture Context
 
@@ -81,6 +80,12 @@ If ALL stories are complete and passing, reply with:
 
 If there are still stories with `passes: false`, end your response normally.
 
+## Git Policy
+
+- **All work happens on `main`** — do NOT create feature branches
+- Never run `git checkout -b` or `git switch -c` — commit directly to `main`
+- This avoids cross-session branch confusion when multiple agents work on the repo
+
 ## Important
 
 - Work on ONE story per iteration
diff --git a/ralph-mcp-unique/prd.json b/ralph-mcp-unique/prd.json
@@ -586,7 +586,7 @@
         "Smoke test: configs/run_selected_tasks.sh --selection-file configs/selected_mcp_unique_tasks.json --dry-run succeeds"
       ],
       "priority": 19,
-      "passes": false,
+      "passes": true,
       "notes": "The selection file is separate from selected_benchmark_tasks.json to avoid bloating the existing file. The --selection-file flag is the bridge. Need to add ccb_mcp_* prefixes to DIR_PREFIX_TO_SUITE in generate_manifest.py, aggregate_status.py, and run_judge.py."
     },
     {
diff --git a/ralph-mcp-unique/progress.txt b/ralph-mcp-unique/progress.txt
@@ -491,3 +491,25 @@
   - tool_call_counts sorts by name for deterministic output
   - For dependency_chain steps, dedup against required_files before adding to avoid double-counting
 ---
+[2026-02-20 21:33:44 UTC] Iteration 9 no story markers found
+[2026-02-20 21:33:44 UTC] Iteration 9 complete
+[2026-02-20 21:33:46 UTC] Iteration 10 started
+
+## 2026-02-20 - US-019: Selection file and config wiring
+- Updated `configs/selected_mcp_unique_tasks.json`: Added `mcp_benefit_score` to all 12 entries, fixed task_dir to not include `benchmarks/` prefix (consistent with standard format)
+- Updated `configs/run_selected_tasks.sh`: Added `--selection-file <path>` flag (alternate selection file), `--use-case-category <A-J>` filter flag; updated `extract_tasks()` to use `task.get('benchmark') or task.get('mcp_suite')` for benchmark name
+- Updated `scripts/aggregate_status.py`: Added 10 ccb_mcp_* suite prefixes to DIR_PREFIX_TO_SUITE
+- Updated `scripts/generate_manifest.py`: Added 10 ccb_mcp_* suite prefixes to DIR_PREFIX_TO_SUITE
+- Updated `scripts/validate_tasks_preflight.py`: Now also looks for `tests/eval.sh` as alternative to `tests/test.sh` (MCP-unique tasks use eval.sh per SWE-Factory pattern)
+- Smoke test: `configs/run_selected_tasks.sh --selection-file configs/selected_mcp_unique_tasks.json --dry-run` succeeds (shows all 12 tasks across 6 suites)
+- Category filter test: `--use-case-category A` correctly returns 3 tasks in ccb_mcp_crossrepo_tracing
+- Repo health check: PASSED
+
+- Files changed: `configs/selected_mcp_unique_tasks.json`, `configs/run_selected_tasks.sh`, `scripts/aggregate_status.py`, `scripts/generate_manifest.py`, `scripts/validate_tasks_preflight.py`, `ralph-mcp-unique/prd.json`
+- **Learnings for future iterations:**
+  - MCP-unique tasks use `tests/eval.sh` not `tests/test.sh` — validate_tasks_preflight.py needed update to handle both
+  - selected_mcp_unique_tasks.json task_dir convention: NO `benchmarks/` prefix (consistent with selected_benchmark_tasks.json)
+  - extract_tasks() in run_selected_tasks.sh: use `task.get('benchmark') or task.get('mcp_suite')` to support both file formats
+  - DIR_PREFIX_TO_SUITE run dir prefix: `ccb_mcp_crossrepo_tracing_` (with trailing underscore) maps to suite name `ccb_mcp_crossrepo_tracing`
+  - mcp_benefit_score assigned by difficulty: easy=0.70, medium=0.80-0.85, hard=0.90-0.95 (DS variants = 0.95)
+---
diff --git a/ralph-mcp-unique/ralph.sh b/ralph-mcp-unique/ralph.sh
@@ -71,9 +71,6 @@ cd "$INSTANCE_DIR"
 PRD_FILE="$INSTANCE_DIR/prd.json"
 PROGRESS_FILE="$INSTANCE_DIR/progress.txt"
 RUN_LOG="$INSTANCE_DIR/ralph-run.log"
-ARCHIVE_DIR="$INSTANCE_DIR/archive"
-LAST_BRANCH_FILE="$INSTANCE_DIR/.last-branch"
-
 for req in CLAUDE.md AGENTS.md "$PRD_FILE" "$PROGRESS_FILE"; do
   if [[ ! -f "$req" ]]; then
     echo "Error: missing required file in $INSTANCE_DIR: $req" >&2
@@ -86,28 +83,6 @@ if ! command -v "$TOOL" >/dev/null 2>&1; then
   exit 127
 fi
 
-CURRENT_BRANCH="$(jq -r '.branchName // empty' "$PRD_FILE" 2>/dev/null || true)"
-if [[ -f "$LAST_BRANCH_FILE" ]]; then
-  LAST_BRANCH="$(cat "$LAST_BRANCH_FILE" 2>/dev/null || true)"
-else
-  LAST_BRANCH=""
-fi
-
-if [[ -n "$CURRENT_BRANCH" && -n "$LAST_BRANCH" && "$CURRENT_BRANCH" != "$LAST_BRANCH" ]]; then
-  DATE_PREFIX="$(date +%Y-%m-%d)"
-  SAFE_BRANCH="${LAST_BRANCH#ralph/}"
-  DEST="$ARCHIVE_DIR/${DATE_PREFIX}-${SAFE_BRANCH}"
-  mkdir -p "$DEST"
-  [[ -f "$PRD_FILE" ]] && cp "$PRD_FILE" "$DEST/prd.json"
-  [[ -f "$PROGRESS_FILE" ]] && cp "$PROGRESS_FILE" "$DEST/progress.txt"
-  [[ -f "$RUN_LOG" ]] && cp "$RUN_LOG" "$DEST/ralph-run.log"
-  echo "[$(ts)] Archived previous run to $DEST" | tee -a "$PROGRESS_FILE" "$RUN_LOG"
-fi
-
-if [[ -n "$CURRENT_BRANCH" ]]; then
-  echo "$CURRENT_BRANCH" > "$LAST_BRANCH_FILE"
-fi
-
 touch "$RUN_LOG"
 
 echo "Starting Ralph - Tool: $TOOL - Max iterations: $MAX_ITERATIONS - Timeout: ${TIMEOUT_SEC}s" | tee -a "$RUN_LOG"
diff --git a/scripts/aggregate_status.py b/scripts/aggregate_status.py
@@ -82,6 +82,17 @@
     "swebenchpro_": "ccb_swebenchpro",
     "sweperf_": "ccb_sweperf",
     "tac_": "ccb_tac",
+    # MCP-unique org-scale retrieval suites
+    "ccb_mcp_crossrepo_tracing_": "ccb_mcp_crossrepo_tracing",
+    "ccb_mcp_security_": "ccb_mcp_security",
+    "ccb_mcp_migration_": "ccb_mcp_migration",
+    "ccb_mcp_incident_": "ccb_mcp_incident",
+    "ccb_mcp_onboarding_": "ccb_mcp_onboarding",
+    "ccb_mcp_compliance_": "ccb_mcp_compliance",
+    "ccb_mcp_crossorg_": "ccb_mcp_crossorg",
+    "ccb_mcp_domain_": "ccb_mcp_domain",
+    "ccb_mcp_org_": "ccb_mcp_org",
+    "ccb_mcp_platform_": "ccb_mcp_platform",
 }
 
 CONFIGS = ["baseline", "sourcegraph_full", "sourcegraph_isolated"]
diff --git a/scripts/ccb_metrics/extractors.py b/scripts/ccb_metrics/extractors.py
diff --git a/scripts/generate_manifest.py b/scripts/generate_manifest.py
diff --git a/scripts/validate_tasks_preflight.py b/scripts/validate_tasks_preflight.py

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,5 @@`
`1`	`1`	`{`
`2`	`2`	`"project": "Unified Evaluation Package",`
`3`		`- "branchName": "ralph/unified-eval-package",`
`4`	`3`	`"description": "Migrate LLM-as-judge into CCB, add dual-scoring (verifier + judge), statistical rigor (bootstrap CIs), verifier hardening (debug mode, self-tests), and oracle auto-discovery. Single reproducible evaluation package.",`
`5`	`4`	`"userStories": [`
`6`	`5`	`{`
Original file line number	Diff line number	Diff line change
`@@ -586,7 +586,7 @@`
`586`	`586`	`"Smoke test: configs/run_selected_tasks.sh --selection-file configs/selected_mcp_unique_tasks.json --dry-run succeeds"`
`587`	`587`	`],`
`588`	`588`	`"priority": 19,`
`589`		`- "passes": false,`
	`589`	`+ "passes": true,`
`590`	`590`	`"notes": "The selection file is separate from selected_benchmark_tasks.json to avoid bloating the existing file. The --selection-file flag is the bridge. Need to add ccb_mcp_* prefixes to DIR_PREFIX_TO_SUITE in generate_manifest.py, aggregate_status.py, and run_judge.py."`
`591`	`591`	`},`
`592`	`592`	`{`