Skip to content

Commit 5ae7c73

Browse files
sjarmak and claude committed
feat: add CSB_SKIP_CONFIRM for unattended benchmark runs
- Add CSB_SKIP_CONFIRM=1 env var to skip interactive confirmation gates
- Create launch_sonnet46_benchmark.sh for full 275-task runs with both agents
- Regenerate script registry

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 2e08d62 commit 5ae7c73

File tree

5 files changed

+123
-3
lines changed

5 files changed

+123
-3
lines changed

configs/_common.sh

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -301,7 +301,11 @@ except: print('unknown')
301301
fi
302302

303303
echo "----------------------------------------------"
304-
read -r -p "Press Enter to proceed, Ctrl+C to abort... " _
304+
if [ "${CSB_SKIP_CONFIRM:-}" = "1" ]; then
305+
echo "[auto] Skipping confirmation (CSB_SKIP_CONFIRM=1)"
306+
else
307+
read -r -p "Press Enter to proceed, Ctrl+C to abort... " _
308+
fi
305309
echo ""
306310
}
307311

configs/run_selected_tasks.sh

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -496,7 +496,11 @@ fi
496496
echo "----------------------------------------------"
497497
echo "Ready to launch $TOTAL_AGENT_RUNS agent runs ($PARALLEL_TASKS parallel)."
498498
echo ""
499-
read -r -p "Press Enter to proceed, Ctrl+C to abort... " _
499+
if [ "${CSB_SKIP_CONFIRM:-}" = "1" ]; then
500+
echo "[auto] Skipping confirmation (CSB_SKIP_CONFIRM=1)"
501+
else
502+
read -r -p "Press Enter to proceed, Ctrl+C to abort... " _
503+
fi
500504
echo ""
501505

502506
# ============================================

docs/ops/SCRIPT_INDEX.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,7 @@ Generated from `scripts/registry.json` by `scripts/generate_script_index.py`.
227227
- `scripts/integrate_answer_json_wave2.py` - Utility script for integrate answer json wave2.
228228
- `scripts/integrate_answer_json_wave3.py` - Utility script for integrate answer json wave3.
229229
- `scripts/judge_demo.py` - Utility script for judge demo.
230+
- `scripts/launch_sonnet46_benchmark.sh` - Utility script for launch sonnet46 benchmark.
230231
- `scripts/list_gemini_models.py` - Utility script for list gemini models.
231232
- `scripts/mirror_largerepo_expansion.sh` - Utility script for mirror largerepo expansion.
232233
- `scripts/organize_official_by_model.py` - Utility script for organize official by model.
scripts/launch_sonnet46_benchmark.sh

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
#!/bin/bash
# Launch full 275-task benchmark with Sonnet 4.6
#
# Two sequential runs:
#   1. Claude Code agent (baseline + MCP pairs)
#   2. OpenHands agent (baseline + MCP pairs)
#
# Each run: 62 task pairs × 2 configs = 124 concurrent Daytona sandboxes
# Total: 275 tasks × 2 configs × 2 agents = 1100 sandbox launches
#
# Usage:
#   ./scripts/launch_sonnet46_benchmark.sh                    # Both agents
#   ./scripts/launch_sonnet46_benchmark.sh --claude-only      # Claude Code only
#   ./scripts/launch_sonnet46_benchmark.sh --openhands-only   # OpenHands only
#   ./scripts/launch_sonnet46_benchmark.sh --dry-run          # Validate without running
#   ./scripts/launch_sonnet46_benchmark.sh --category NAME    # Override category (default: staging)
#
# Environment:
#   .env.local is sourced from the repo root when it exists (optional).
#   HARBOR_ENV, DAYTONA_OVERRIDE_STORAGE and CSB_SKIP_CONFIRM are exported for
#   the child scripts; CSB_SKIP_CONFIRM=1 makes their "Press Enter to proceed"
#   gates non-interactive so the run can be left unattended.

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$SCRIPT_DIR/.."
# All relative paths below (configs/..., .env.local, runs/...) are repo-root relative.
cd "$REPO_ROOT"

# Defaults
RUN_CLAUDE=true
RUN_OPENHANDS=true
DRY_RUN=""
MODEL="anthropic/claude-sonnet-4-6"
CATEGORY="staging"

while [[ $# -gt 0 ]]; do
  case "$1" in
    --claude-only) RUN_OPENHANDS=false; shift ;;
    --openhands-only) RUN_CLAUDE=false; shift ;;
    --dry-run) DRY_RUN="--dry-run"; shift ;;
    --category)
      # Without this check a trailing "--category" makes `shift 2` fail and
      # `set -e` terminates the script without any explanation.
      if [[ $# -lt 2 || -z "$2" ]]; then
        echo "Error: --category requires a value" >&2
        exit 1
      fi
      CATEGORY="$2"
      shift 2
      ;;
    *) echo "Unknown: $1" >&2; exit 1 ;;
  esac
done

# Passing both "--*-only" flags would disable every phase and still report
# success; treat that as a usage error instead.
if [[ "$RUN_CLAUDE" != true && "$RUN_OPENHANDS" != true ]]; then
  echo "Error: --claude-only and --openhands-only cannot be combined" >&2
  exit 1
fi

# Environment setup
# .env.local is optional. Only the "file is absent" case is silent; problems
# inside the file are reported but, as before, do not abort the launch.
if [[ -f .env.local ]]; then
  # shellcheck disable=SC1091
  source .env.local || echo "Warning: sourcing .env.local returned non-zero; continuing" >&2
fi
export HARBOR_ENV=daytona
export DAYTONA_OVERRIDE_STORAGE=10240
export CSB_SKIP_CONFIRM=1

echo "=============================================="
echo "CodeScaleBench Full Benchmark — Sonnet 4.6"
echo "=============================================="
echo "Model: $MODEL"
echo "Tasks: 275 (131 SDLC + 144 Org)"
echo "Configs: baseline-local-direct + mcp-remote-direct"
echo "Environment: Daytona (62 pairs = 124 concurrent sandboxes)"
echo "Category: $CATEGORY"
echo "Claude Code: $RUN_CLAUDE"
echo "OpenHands: $RUN_OPENHANDS"
echo "Dry run: ${DRY_RUN:-no}"
echo ""

# ─────────────────────────────────────────────
# Run 1: Claude Code
# ─────────────────────────────────────────────
if [ "$RUN_CLAUDE" = true ]; then
  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
  echo "Phase 1: Claude Code + Sonnet 4.6"
  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

  # Build the argument list as an array so the optional --dry-run flag is
  # added as a proper word instead of relying on unquoted expansion.
  claude_args=(--model "$MODEL" --category "$CATEGORY" --skip-prebuild)
  if [[ -n "$DRY_RUN" ]]; then
    claude_args+=("$DRY_RUN")
  fi

  bash configs/run_selected_tasks.sh "${claude_args[@]}"

  echo ""
  echo "Claude Code run complete."
  echo ""
fi

# ─────────────────────────────────────────────
# Run 2: OpenHands
# ─────────────────────────────────────────────
if [ "$RUN_OPENHANDS" = true ]; then
  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
  echo "Phase 2: OpenHands + Sonnet 4.6"
  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

  # The OpenHands launcher is not invoked at all in dry-run mode; only the
  # command that would have been run is printed.
  if [ -n "$DRY_RUN" ]; then
    echo "[DRY RUN] Would launch OpenHands with 275 tasks × 2 configs on Daytona"
    echo "[DRY RUN] Command: bash configs/openhands_2config.sh --model $MODEL --category $CATEGORY"
  else
    bash configs/openhands_2config.sh \
      --model "$MODEL" \
      --category "$CATEGORY"
  fi

  echo ""
  echo "OpenHands run complete."
  echo ""
fi

echo "=============================================="
echo "All benchmark runs finished."
echo "Results in: runs/$CATEGORY/"
echo "=============================================="

scripts/registry.json

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -962,6 +962,14 @@
962962
"language": "python",
963963
"summary": "Utility script for judge demo."
964964
},
965+
{
966+
"name": "launch_sonnet46_benchmark.sh",
967+
"path": "scripts/launch_sonnet46_benchmark.sh",
968+
"category": "misc",
969+
"status": "maintained",
970+
"language": "shell",
971+
"summary": "Utility script for launch sonnet46 benchmark."
972+
},
965973
{
966974
"name": "list_gemini_models.py",
967975
"path": "scripts/list_gemini_models.py",
@@ -1787,7 +1795,7 @@
17871795
"infra_mirrors": 23,
17881796
"library_helpers": 7,
17891797
"migration": 4,
1790-
"misc": 95,
1798+
"misc": 96,
17911799
"qa_quality": 10,
17921800
"submission_reporting": 7,
17931801
"task_creation_selection": 13,

0 commit comments

Comments
 (0)