Skip to content

Commit b758c58

Browse files
sjarmak and claude committed
feat: US-019 - Selection file and config wiring
- Add --selection-file and --use-case-category flags to run_selected_tasks.sh
- Update extract_tasks() to support both benchmark and mcp_suite field formats
- Add mcp_benefit_score to all 12 tasks in selected_mcp_unique_tasks.json
- Fix task_dir format (no benchmarks/ prefix, consistent with standard format)
- Add 10 ccb_mcp_* suite prefixes to DIR_PREFIX_TO_SUITE in aggregate_status.py and generate_manifest.py
- Update validate_tasks_preflight.py to accept tests/eval.sh as an alternative to tests/test.sh (MCP-unique tasks use SWE-Factory eval.sh pattern)
- Smoke test: dry-run with --selection-file shows all 12 tasks across 6 suites

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 46f42cc commit b758c58

File tree

14 files changed

+251
-65
lines changed

14 files changed

+251
-65
lines changed

AGENTS.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,11 @@ per-task details.
2626
- `docs/SKILLS.md` - AI agent skill system overview
2727
- `skills/` - operational runbooks for AI agents (see `skills/README.md`)
2828

29+
## Git Policy
30+
- **All work happens on `main`** — do NOT create feature branches.
31+
- Never run `git checkout -b` or `git switch -c`.
32+
- Commit directly to `main`. This avoids cross-session branch confusion when multiple agents work on the repo.
33+
2934
## Typical Skill Routing
3035
Use these defaults unless there is a task-specific reason not to.
3136

CLAUDE.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,11 @@ per-task details.
2626
- `docs/SKILLS.md` - AI agent skill system overview
2727
- `skills/` - operational runbooks for AI agents (see `skills/README.md`)
2828

29+
## Git Policy
30+
- **All work happens on `main`** — do NOT create feature branches.
31+
- Never run `git checkout -b` or `git switch -c`.
32+
- Commit directly to `main`. This avoids cross-session branch confusion when multiple agents work on the repo.
33+
2934
## Typical Skill Routing
3035
Use these defaults unless there is a task-specific reason not to.
3136

agents/claude_baseline_agent.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -838,6 +838,7 @@ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
838838
script_lines = [
839839
'#!/bin/bash',
840840
'export PATH=/usr/local/bin:/usr/bin:/bin:$PATH',
841+
'export CLAUDE_CODE_MAX_OUTPUT_TOKENS=128000',
841842
'# Detect working directory',
842843
'if [ -d /workspace ]; then WORKDIR=/workspace',
843844
'elif [ -d /app ]; then WORKDIR=/app',
@@ -897,11 +898,12 @@ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
897898
env_with_autonomous = {
898899
**env,
899900
'FORCE_AUTO_BACKGROUND_TASKS': '1',
900-
'ENABLE_BACKGROUND_TASKS': '1'
901+
'ENABLE_BACKGROUND_TASKS': '1',
902+
'CLAUDE_CODE_MAX_OUTPUT_TOKENS': '128000',
901903
}
902904

903905
# Add SSL workaround for MCP HTTP transport
904-
if mcp_type in ["sourcegraph", "sourcegraph_full", "sourcegraph_base", "sourcegraph_isolated", "deepsearch", "deepsearch_hybrid"]:
906+
if mcp_type in ["sourcegraph", "sourcegraph_full", "sourcegraph_base", "sourcegraph_isolated", "artifact_full", "deepsearch", "deepsearch_hybrid"]:
905907
env_with_autonomous['NODE_TLS_REJECT_UNAUTHORIZED'] = '0'
906908

907909
if mcp_type == "sourcegraph_base":

configs/run_selected_tasks.sh

Lines changed: 31 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -12,17 +12,19 @@
1212
# ./configs/run_selected_tasks.sh [OPTIONS]
1313
#
1414
# Options:
15-
# --benchmark BENCHMARK Run only this benchmark (e.g., ccb_build, ccb_fix)
16-
# --baseline-only Run only baseline (no MCP)
17-
# --full-only Run only MCP-Full (sourcegraph_full)
18-
# --model MODEL Override model (default: claude-opus-4-6)
19-
# --concurrency N Concurrent tasks (default: 2)
20-
# --category CATEGORY Run category (default: staging)
21-
# --skip-completed Skip tasks that already have result.json + task_metrics.json
22-
# --dry-run Print tasks without running
15+
# --benchmark BENCHMARK Run only this benchmark (e.g., ccb_build, ccb_fix)
16+
# --selection-file PATH Use alternate selection file (default: selected_benchmark_tasks.json)
17+
# --use-case-category CATEGORY Filter by MCP-unique use case category (A-J), only valid with --selection-file
18+
# --baseline-only Run only baseline (no MCP)
19+
# --full-only Run only MCP-Full (sourcegraph_full)
20+
# --model MODEL Override model (default: claude-opus-4-6)
21+
# --concurrency N Concurrent tasks (default: 1)
22+
# --category CATEGORY Run category (default: staging)
23+
# --skip-completed Skip tasks that already have result.json + task_metrics.json
24+
# --dry-run Print tasks without running
2325
#
2426
# Prerequisites:
25-
# - configs/selected_benchmark_tasks.json in repo
27+
# - configs/selected_benchmark_tasks.json in repo (or --selection-file path)
2628
# - .env.local (repo root) with USE_SUBSCRIPTION=true
2729
# - SOURCEGRAPH_ACCESS_TOKEN in .env.local (for MCP modes)
2830

@@ -44,6 +46,7 @@ SELECTION_FILE="$REPO_ROOT/configs/selected_benchmark_tasks.json"
4446
# PARSE ARGUMENTS
4547
# ============================================
4648
BENCHMARK_FILTER=""
49+
USE_CASE_CATEGORY_FILTER=""
4750
MODEL="${MODEL:-anthropic/claude-opus-4-6}"
4851
CONCURRENCY=1
4952
TIMEOUT_MULTIPLIER=10
@@ -60,6 +63,14 @@ while [[ $# -gt 0 ]]; do
6063
BENCHMARK_FILTER="$2"
6164
shift 2
6265
;;
66+
--selection-file)
67+
SELECTION_FILE="$2"
68+
shift 2
69+
;;
70+
--use-case-category)
71+
USE_CASE_CATEGORY_FILTER="$2"
72+
shift 2
73+
;;
6374
--baseline-only)
6475
RUN_FULL=false
6576
shift
@@ -120,17 +131,26 @@ ensure_fresh_token
120131
# EXTRACT TASKS FROM SELECTION FILE
121132
# ============================================
122133
# Python helper to extract task info grouped by benchmark
134+
# Supports both standard format (benchmark field) and MCP-unique format (mcp_suite field).
135+
# task_dir in both formats is relative to benchmarks/ (no benchmarks/ prefix).
123136
extract_tasks() {
124137
python3 -c "
125138
import json, sys
126139
127140
selection = json.load(open('$SELECTION_FILE'))
128141
benchmark_filter = '$BENCHMARK_FILTER'
142+
use_case_category_filter = '$USE_CASE_CATEGORY_FILTER'
129143
130144
for task in selection['tasks']:
131-
bm = task['benchmark']
145+
# Support both standard (benchmark) and MCP-unique (mcp_suite) selection files
146+
bm = task.get('benchmark') or task.get('mcp_suite', '')
147+
if not bm:
148+
continue
132149
if benchmark_filter and bm != benchmark_filter:
133150
continue
151+
if use_case_category_filter and task.get('use_case_category', '') != use_case_category_filter:
152+
continue
153+
# task_dir is relative to benchmarks/ in both formats
134154
task_dir = 'benchmarks/' + task['task_dir']
135155
print(f'{bm}\t{task[\"task_id\"]}\t{task_dir}')
136156
"
@@ -181,6 +201,7 @@ echo "Total tasks: $TOTAL_TASKS"
181201
echo "Concurrency: $CONCURRENCY"
182202
echo "Configs: baseline=$RUN_BASELINE sourcegraph_full=$RUN_FULL"
183203
echo "Skip done: $SKIP_COMPLETED"
204+
[ -n "$USE_CASE_CATEGORY_FILTER" ] && echo "Category: $USE_CASE_CATEGORY_FILTER"
184205
echo ""
185206
echo "Tasks per benchmark:"
186207
for bm in $(echo "${!BENCHMARK_COUNTS[@]}" | tr ' ' '\n' | sort); do

configs/selected_mcp_unique_tasks.json

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010
"language": "go",
1111
"difficulty": "medium",
1212
"repo": "sg-benchmarks/kubernetes-client-go",
13-
"task_dir": "benchmarks/ccb_mcp_crossrepo_tracing/ccx-dep-trace-001",
13+
"mcp_benefit_score": 0.80,
14+
"task_dir": "ccb_mcp_crossrepo_tracing/ccx-dep-trace-001",
1415
"deepsearch_relevant": false,
1516
"oracle_check_types": ["file_set_match"],
1617
"repo_set_id": "kubernetes-ecosystem"
@@ -23,7 +24,8 @@
2324
"language": "go",
2425
"difficulty": "hard",
2526
"repo": "grafana/grafana",
26-
"task_dir": "benchmarks/ccb_mcp_crossrepo_tracing/ccx-dep-trace-004",
27+
"mcp_benefit_score": 0.90,
28+
"task_dir": "ccb_mcp_crossrepo_tracing/ccx-dep-trace-004",
2729
"deepsearch_relevant": false,
2830
"oracle_check_types": ["dependency_chain", "provenance"],
2931
"repo_set_id": "grafana-observability"
@@ -36,7 +38,8 @@
3638
"language": "go",
3739
"difficulty": "medium",
3840
"repo": "sg-benchmarks/kubernetes-client-go",
39-
"task_dir": "benchmarks/ccb_mcp_crossrepo_tracing/ccx-config-trace-010",
41+
"mcp_benefit_score": 0.85,
42+
"task_dir": "ccb_mcp_crossrepo_tracing/ccx-config-trace-010",
4043
"deepsearch_relevant": false,
4144
"oracle_check_types": ["symbol_resolution"],
4245
"repo_set_id": "kubernetes-ecosystem"
@@ -49,7 +52,8 @@
4952
"language": "javascript",
5053
"difficulty": "medium",
5154
"repo": "nodejs/node",
52-
"task_dir": "benchmarks/ccb_mcp_security/ccx-vuln-remed-011",
55+
"mcp_benefit_score": 0.80,
56+
"task_dir": "ccb_mcp_security/ccx-vuln-remed-011",
5357
"deepsearch_relevant": false,
5458
"oracle_check_types": ["file_set_match", "keyword_presence"],
5559
"repo_set_id": "nodejs-web-stack"
@@ -62,7 +66,8 @@
6266
"language": "go",
6367
"difficulty": "hard",
6468
"repo": "grafana/grafana",
65-
"task_dir": "benchmarks/ccb_mcp_security/ccx-vuln-remed-014",
69+
"mcp_benefit_score": 0.90,
70+
"task_dir": "ccb_mcp_security/ccx-vuln-remed-014",
6671
"deepsearch_relevant": false,
6772
"oracle_check_types": ["file_set_match", "provenance"],
6873
"repo_set_id": "grafana-observability"
@@ -75,7 +80,8 @@
7580
"language": "go",
7681
"difficulty": "hard",
7782
"repo": "kubernetes/kubernetes",
78-
"task_dir": "benchmarks/ccb_mcp_incident/ccx-incident-031",
83+
"mcp_benefit_score": 0.90,
84+
"task_dir": "ccb_mcp_incident/ccx-incident-031",
7985
"deepsearch_relevant": false,
8086
"oracle_check_types": ["file_set_match", "keyword_presence"],
8187
"repo_set_id": "multi-org-go"
@@ -88,7 +94,8 @@
8894
"language": "python",
8995
"difficulty": "medium",
9096
"repo": "pandas-dev/pandas",
91-
"task_dir": "benchmarks/ccb_mcp_onboarding/ccx-onboard-041",
97+
"mcp_benefit_score": 0.80,
98+
"task_dir": "ccb_mcp_onboarding/ccx-onboard-041",
9299
"deepsearch_relevant": false,
93100
"oracle_check_types": ["file_set_match", "provenance"],
94101
"repo_set_id": "python-ml-stack"
@@ -101,7 +108,8 @@
101108
"language": "go",
102109
"difficulty": "hard",
103110
"repo": "kubernetes/kubernetes",
104-
"task_dir": "benchmarks/ccb_mcp_onboarding/ccx-onboard-050-ds",
111+
"mcp_benefit_score": 0.95,
112+
"task_dir": "ccb_mcp_onboarding/ccx-onboard-050-ds",
105113
"deepsearch_relevant": true,
106114
"oracle_check_types": ["dependency_chain", "provenance"],
107115
"repo_set_id": "kubernetes-ecosystem"
@@ -114,7 +122,8 @@
114122
"language": "go",
115123
"difficulty": "hard",
116124
"repo": "kubernetes/kubernetes",
117-
"task_dir": "benchmarks/ccb_mcp_crossorg/ccx-crossorg-061",
125+
"mcp_benefit_score": 0.92,
126+
"task_dir": "ccb_mcp_crossorg/ccx-crossorg-061",
118127
"deepsearch_relevant": false,
119128
"oracle_check_types": ["symbol_resolution"],
120129
"repo_set_id": "multi-org-go"
@@ -127,7 +136,8 @@
127136
"language": "go",
128137
"difficulty": "medium",
129138
"repo": "kubernetes/kubernetes",
130-
"task_dir": "benchmarks/ccb_mcp_crossorg/ccx-crossorg-066",
139+
"mcp_benefit_score": 0.82,
140+
"task_dir": "ccb_mcp_crossorg/ccx-crossorg-066",
131141
"deepsearch_relevant": false,
132142
"oracle_check_types": ["keyword_presence", "provenance"],
133143
"repo_set_id": "multi-org-go"
@@ -140,7 +150,8 @@
140150
"language": "python",
141151
"difficulty": "hard",
142152
"repo": "scikit-learn/scikit-learn",
143-
"task_dir": "benchmarks/ccb_mcp_onboarding/ccx-explore-042-ds",
153+
"mcp_benefit_score": 0.95,
154+
"task_dir": "ccb_mcp_onboarding/ccx-explore-042-ds",
144155
"deepsearch_relevant": true,
145156
"oracle_check_types": ["dependency_chain", "provenance"],
146157
"repo_set_id": "python-ml-stack"
@@ -153,7 +164,8 @@
153164
"language": "go",
154165
"difficulty": "hard",
155166
"repo": "kubernetes/kubernetes",
156-
"task_dir": "benchmarks/ccb_mcp_platform/ccx-explore-091-ds",
167+
"mcp_benefit_score": 0.95,
168+
"task_dir": "ccb_mcp_platform/ccx-explore-091-ds",
157169
"deepsearch_relevant": true,
158170
"oracle_check_types": ["file_set_match", "keyword_presence"],
159171
"repo_set_id": "kubernetes-ecosystem"

ralph-eval/prd.json

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
{
22
"project": "Unified Evaluation Package",
3-
"branchName": "ralph/unified-eval-package",
43
"description": "Migrate LLM-as-judge into CCB, add dual-scoring (verifier + judge), statistical rigor (bootstrap CIs), verifier hardening (debug mode, self-tests), and oracle auto-discovery. Single reproducible evaluation package.",
54
"userStories": [
65
{

ralph-mcp-unique/CLAUDE.md

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,13 @@ You are an autonomous coding agent working on extending CodeContextBench with MC
66

77
1. Read the PRD at `ralph-mcp-unique/prd.json` (relative to project root)
88
2. Read the progress log at `ralph-mcp-unique/progress.txt` (check Codebase Patterns section first)
9-
3. Check you're on the correct branch from PRD `branchName`. If not, check it out or create from main.
10-
4. Pick the **highest priority** user story where `passes: false`
11-
5. Implement that single user story
12-
6. Run quality checks (e.g., typecheck, lint, test - use whatever your project requires)
13-
7. Update CLAUDE.md files if you discover reusable patterns
14-
8. If checks pass, commit ALL changes with message: `feat: [Story ID] - [Story Title]`
15-
9. Update the PRD to set `passes: true` for the completed story
16-
10. Append your progress to `ralph-mcp-unique/progress.txt`
9+
3. Pick the **highest priority** user story where `passes: false`
10+
4. Implement that single user story
11+
5. Run quality checks (e.g., typecheck, lint, test - use whatever your project requires)
12+
6. Update CLAUDE.md files if you discover reusable patterns
13+
7. If checks pass, commit ALL changes to `main` with message: `feat: [Story ID] - [Story Title]`
14+
8. Update the PRD to set `passes: true` for the completed story
15+
9. Append your progress to `ralph-mcp-unique/progress.txt`
1716

1817
## Key Architecture Context
1918

@@ -81,6 +80,12 @@ If ALL stories are complete and passing, reply with:
8180

8281
If there are still stories with `passes: false`, end your response normally.
8382

83+
## Git Policy
84+
85+
- **All work happens on `main`** — do NOT create feature branches
86+
- Never run `git checkout -b` or `git switch -c` — commit directly to `main`
87+
- This avoids cross-session branch confusion when multiple agents work on the repo
88+
8489
## Important
8590

8691
- Work on ONE story per iteration

ralph-mcp-unique/prd.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -586,7 +586,7 @@
586586
"Smoke test: configs/run_selected_tasks.sh --selection-file configs/selected_mcp_unique_tasks.json --dry-run succeeds"
587587
],
588588
"priority": 19,
589-
"passes": false,
589+
"passes": true,
590590
"notes": "The selection file is separate from selected_benchmark_tasks.json to avoid bloating the existing file. The --selection-file flag is the bridge. Need to add ccb_mcp_* prefixes to DIR_PREFIX_TO_SUITE in generate_manifest.py, aggregate_status.py, and run_judge.py."
591591
},
592592
{

ralph-mcp-unique/progress.txt

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -491,3 +491,25 @@
491491
- tool_call_counts sorts by name for deterministic output
492492
- For dependency_chain steps, dedup against required_files before adding to avoid double-counting
493493
---
494+
[2026-02-20 21:33:44 UTC] Iteration 9 no story markers found
495+
[2026-02-20 21:33:44 UTC] Iteration 9 complete
496+
[2026-02-20 21:33:46 UTC] Iteration 10 started
497+
498+
## 2026-02-20 - US-019: Selection file and config wiring
499+
- Updated `configs/selected_mcp_unique_tasks.json`: Added `mcp_benefit_score` to all 12 entries, fixed task_dir to not include `benchmarks/` prefix (consistent with standard format)
500+
- Updated `configs/run_selected_tasks.sh`: Added `--selection-file <path>` flag (alternate selection file), `--use-case-category <A-J>` filter flag; updated `extract_tasks()` to use `task.get('benchmark') or task.get('mcp_suite')` for benchmark name
501+
- Updated `scripts/aggregate_status.py`: Added 10 ccb_mcp_* suite prefixes to DIR_PREFIX_TO_SUITE
502+
- Updated `scripts/generate_manifest.py`: Added 10 ccb_mcp_* suite prefixes to DIR_PREFIX_TO_SUITE
503+
- Updated `scripts/validate_tasks_preflight.py`: Now also looks for `tests/eval.sh` as alternative to `tests/test.sh` (MCP-unique tasks use eval.sh per SWE-Factory pattern)
504+
- Smoke test: `configs/run_selected_tasks.sh --selection-file configs/selected_mcp_unique_tasks.json --dry-run` succeeds (shows all 12 tasks across 6 suites)
505+
- Category filter test: `--use-case-category A` correctly returns 3 tasks in ccb_mcp_crossrepo_tracing
506+
- Repo health check: PASSED
507+
508+
- Files changed: `configs/selected_mcp_unique_tasks.json`, `configs/run_selected_tasks.sh`, `scripts/aggregate_status.py`, `scripts/generate_manifest.py`, `scripts/validate_tasks_preflight.py`, `ralph-mcp-unique/prd.json`
509+
- **Learnings for future iterations:**
510+
- MCP-unique tasks use `tests/eval.sh` not `tests/test.sh` — validate_tasks_preflight.py needed update to handle both
511+
- selected_mcp_unique_tasks.json task_dir convention: NO `benchmarks/` prefix (consistent with selected_benchmark_tasks.json)
512+
- extract_tasks() in run_selected_tasks.sh: use `task.get('benchmark') or task.get('mcp_suite')` to support both file formats
513+
- DIR_PREFIX_TO_SUITE run dir prefix: `ccb_mcp_crossrepo_tracing_` (with trailing underscore) maps to suite name `ccb_mcp_crossrepo_tracing`
514+
- mcp_benefit_score assigned by difficulty: easy=0.70, medium=0.80-0.85, hard=0.90-0.95 (DS variants = 0.95)
515+
---

ralph-mcp-unique/ralph.sh

Lines changed: 0 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -71,9 +71,6 @@ cd "$INSTANCE_DIR"
7171
PRD_FILE="$INSTANCE_DIR/prd.json"
7272
PROGRESS_FILE="$INSTANCE_DIR/progress.txt"
7373
RUN_LOG="$INSTANCE_DIR/ralph-run.log"
74-
ARCHIVE_DIR="$INSTANCE_DIR/archive"
75-
LAST_BRANCH_FILE="$INSTANCE_DIR/.last-branch"
76-
7774
for req in CLAUDE.md AGENTS.md "$PRD_FILE" "$PROGRESS_FILE"; do
7875
if [[ ! -f "$req" ]]; then
7976
echo "Error: missing required file in $INSTANCE_DIR: $req" >&2
@@ -86,28 +83,6 @@ if ! command -v "$TOOL" >/dev/null 2>&1; then
8683
exit 127
8784
fi
8885

89-
CURRENT_BRANCH="$(jq -r '.branchName // empty' "$PRD_FILE" 2>/dev/null || true)"
90-
if [[ -f "$LAST_BRANCH_FILE" ]]; then
91-
LAST_BRANCH="$(cat "$LAST_BRANCH_FILE" 2>/dev/null || true)"
92-
else
93-
LAST_BRANCH=""
94-
fi
95-
96-
if [[ -n "$CURRENT_BRANCH" && -n "$LAST_BRANCH" && "$CURRENT_BRANCH" != "$LAST_BRANCH" ]]; then
97-
DATE_PREFIX="$(date +%Y-%m-%d)"
98-
SAFE_BRANCH="${LAST_BRANCH#ralph/}"
99-
DEST="$ARCHIVE_DIR/${DATE_PREFIX}-${SAFE_BRANCH}"
100-
mkdir -p "$DEST"
101-
[[ -f "$PRD_FILE" ]] && cp "$PRD_FILE" "$DEST/prd.json"
102-
[[ -f "$PROGRESS_FILE" ]] && cp "$PROGRESS_FILE" "$DEST/progress.txt"
103-
[[ -f "$RUN_LOG" ]] && cp "$RUN_LOG" "$DEST/ralph-run.log"
104-
echo "[$(ts)] Archived previous run to $DEST" | tee -a "$PROGRESS_FILE" "$RUN_LOG"
105-
fi
106-
107-
if [[ -n "$CURRENT_BRANCH" ]]; then
108-
echo "$CURRENT_BRANCH" > "$LAST_BRANCH_FILE"
109-
fi
110-
11186
touch "$RUN_LOG"
11287

11388
echo "Starting Ralph - Tool: $TOOL - Max iterations: $MAX_ITERATIONS - Timeout: ${TIMEOUT_SEC}s" | tee -a "$RUN_LOG"

0 commit comments

Comments
 (0)