Skip to content

Commit d89d2e9

Browse files
committed
Migrate final feature verifiers to validation_result
1 parent 8e9990f commit d89d2e9

File tree

6 files changed

+640
-66
lines changed

6 files changed

+640
-66
lines changed

benchmarks/csb_sdlc_feature/bustub-hyperloglog-impl-001/tests/test.sh

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ set -e
99

1010
TRAJECTORY_PATH="${TRAJECTORY_PATH:-/logs/trajectory.jsonl}"
1111
OUTPUT_PATH="/logs/tac_result.json"
12+
PASS_THRESHOLD="1.0"
1213

1314
if [ ! -f "$TRAJECTORY_PATH" ]; then
1415
echo '[]' > "$TRAJECTORY_PATH"
@@ -33,6 +34,118 @@ python_default /utils/eval.py \
3334

3435
mkdir -p /logs/verifier
3536

37+
# Write the canonical validation_result.v1alpha1 sidecar for this verifier.
# Reads the TAC result payload from $OUTPUT_PATH, derives a reward and
# per-checkpoint sub-scores, and writes /logs/verifier/validation_result.json.
write_validation_result() {
    python3 - "$OUTPUT_PATH" "$PASS_THRESHOLD" <<'PYEOF'
import json
import re
import sys

result_path, pass_threshold = sys.argv[1:3]
threshold = float(pass_threshold)
status = "scored"
reward = 0.0
result_payload = {}

try:
    with open(result_path) as f:
        result_payload = json.load(f)
except FileNotFoundError:
    status = "verifier_error"
    result_payload = {"error": f"Missing TAC result payload at {result_path}"}
except Exception as exc:
    status = "verifier_error"
    result_payload = {"error": f"Failed to parse TAC result payload: {exc}"}

# Guard against a syntactically valid but non-object payload (e.g. a bare
# JSON list or string).  Without this, every .get() below raises
# AttributeError, the heredoc exits non-zero, and the surrounding script
# (which runs under `set -e`) aborts with no sidecar written at all.
if not isinstance(result_payload, dict):
    status = "verifier_error"
    result_payload = {
        "error": f"TAC payload is not a JSON object: {type(result_payload).__name__}"
    }

if status == "scored":
    final_score = result_payload.get("final_score")
    if isinstance(final_score, dict):
        # Preferred shape: {"final_score": {"result": X, "total": Y}}.
        total = final_score.get("total", 0) or 0
        result = final_score.get("result", 0) or 0
        try:
            reward = round(float(result) / float(total), 4) if float(total) > 0 else 0.0
        except (TypeError, ValueError, ZeroDivisionError):
            reward = 0.0
            status = "verifier_error"
    elif "score" in result_payload:
        # Fallback shape: a flat {"score": X} payload.
        try:
            reward = float(result_payload.get("score", 0.0) or 0.0)
        except (TypeError, ValueError):
            reward = 0.0
            status = "verifier_error"
    else:
        status = "verifier_error"
        result_payload.setdefault("error", "TAC payload did not include final_score or score")

# Any recorded error forces the result into the non-scorable path.
if result_payload.get("error"):
    status = "verifier_error"
    reward = 0.0

def slugify(value: str) -> str:
    """Normalize a checkpoint name into a snake_case sub-score key."""
    value = re.sub(r"[^a-z0-9]+", "_", value.lower()).strip("_")
    return value or "unnamed_check"

# Collect per-checkpoint sub-scores; tolerate checkpoints that use
# "score", a boolean "passed", or "result" to convey their value.
checks = {}
checkpoints = result_payload.get("checkpoints", [])
if not isinstance(checkpoints, list):
    checkpoints = []
for index, checkpoint in enumerate(checkpoints, start=1):
    if isinstance(checkpoint, dict):
        raw_name = (
            checkpoint.get("name")
            or checkpoint.get("title")
            or checkpoint.get("description")
            or checkpoint.get("id")
            or f"checkpoint_{index}"
        )
        key = slugify(str(raw_name))
        value = checkpoint.get("score")
        if value is None and isinstance(checkpoint.get("passed"), bool):
            value = 1.0 if checkpoint["passed"] else 0.0
        if value is None:
            value = checkpoint.get("result")
        try:
            checks[key] = float(value)
        except (TypeError, ValueError):
            # Skip checkpoints whose value cannot be coerced to a float.
            continue

failure = None
if status != "scored":
    failure = {
        "code": "verifier_exception",
        "message": str(result_payload.get("error") or "TAC verifier did not produce a usable result"),
        "stage": "scoring",
    }

payload = {
    "schema_version": "validation_result.v1alpha1",
    "status": status,
    "scorable": status == "scored",
    "scorer_family": "checklist",
    "reward": reward,
    "pass_threshold": threshold,
    "passed": status == "scored" and reward >= threshold,
    "output_contract": {
        "mode": "unspecified",
        "primary_path": None,
        "required_artifact": False,
    },
    "sub_scores": {"checks": checks} if checks else {},
    "failure": failure,
    "legacy": {
        # Preserve the raw TAC payload for downstream consumers.
        "reward_json": result_payload,
    },
}

# Attach optional details only when the payload actually carried them.
details = {}
if isinstance(result_payload.get("final_score"), dict):
    details["final_score"] = result_payload["final_score"]
if result_payload.get("checkpoints") is not None:
    details["checkpoints"] = result_payload.get("checkpoints")
if details:
    payload["details"] = details

with open("/logs/verifier/validation_result.json", "w") as f:
    json.dump(payload, f, indent=2)
PYEOF
}
148+
36149
if [ -f "$OUTPUT_PATH" ]; then
37150
SCORE=$(python3 -c "
38151
import json
@@ -49,8 +162,17 @@ else:
49162
echo "TAC Score: $SCORE"
50163
echo "$SCORE" > /logs/verifier/reward.txt
51164
cp "$OUTPUT_PATH" /logs/verifier/reward.json 2>/dev/null || true
165+
write_validation_result
52166
exit 0
53167
else
54168
echo "0.0" > /logs/verifier/reward.txt
169+
python3 - <<'PYEOF'
170+
import json
171+
172+
fallback = {"score": 0.0, "error": "TAC evaluator did not produce /logs/tac_result.json"}
173+
with open("/logs/verifier/reward.json", "w") as f:
174+
json.dump(fallback, f, indent=2)
175+
PYEOF
176+
write_validation_result
55177
exit 0
56178
fi

benchmarks/csb_sdlc_feature/k8s-noschedule-taint-feat-001/tests/test.sh

Lines changed: 127 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@
66
# 2. Tests for the new effect pass
77
# 3. No regressions in existing taint logic
88
#
9-
# NOTE: A reward.json file exists alongside this task defining additional
10-
# manual evaluation criteria (e.g., architecture_understanding, code_quality).
11-
# These criteria are NOT automatically scored by this script and would require
12-
# an LLM judge to evaluate. This script handles automated scoring only.
9+
# NOTE: This task also ships a static rubric JSON file with additional manual
10+
# evaluation criteria (e.g., architecture_understanding, code_quality).
11+
# Those rubric criteria are descriptive only; this verifier scores automated
12+
# repo-state checks and emits the canonical validation_result sidecar.
1313

1414
set -e
1515

@@ -23,17 +23,137 @@ fi
2323
TASK_WORKDIR="${TASK_WORKDIR:-/workspace}"
2424
TASK_REPO_ROOT="${TASK_REPO_ROOT:-${VERIFY_REPO:-$TASK_WORKDIR}}"
2525
VERIFY_REPO="${VERIFY_REPO:-$TASK_REPO_ROOT}"
26+
TASK_OUTPUT="${TASK_OUTPUT:-/workspace/answer.json}"
27+
PASS_THRESHOLD="0.7"
28+
ARTIFACT_REQUIRED=false
29+
if [ "${ARTIFACT_ONLY:-false}" = "true" ]; then
30+
ARTIFACT_REQUIRED=true
31+
fi
2632

2733
cd "$TASK_REPO_ROOT"
2834

2935
# Create log directories
3036
mkdir -p /logs/verifier
3137

38+
TAINT_ADDED=0
39+
CHANGES_MADE=0
40+
TESTS_ADDED=0
41+
BUILD_OK=0
42+
UNIT_TEST_PASS=0
43+
44+
# Emit an invalid_output validation_result sidecar and a zero reward.
# $1 = machine-readable failure code, $2 = human-readable failure message.
write_invalid_output() {
    local fail_code="$1"
    local fail_message="$2"
    python3 - "$fail_code" "$fail_message" "$TASK_OUTPUT" "$ARTIFACT_REQUIRED" "$PASS_THRESHOLD" <<'PYEOF'
import json
import sys

# Positional arguments: code, message, primary output path,
# artifact-required flag ("true"/"false"), and the pass threshold.
argv = sys.argv
code = argv[1]
message = argv[2]
primary_path = argv[3]
required_artifact = argv[4] == "true"
threshold = float(argv[5])

contract = {
    "mode": "answer_json_bridge",
    "primary_path": primary_path,
    "required_artifact": required_artifact,
}
failure = {
    "code": code,
    "message": message,
    "stage": "output_validation",
}
payload = {
    "schema_version": "validation_result.v1alpha1",
    "status": "invalid_output",
    "scorable": False,
    "scorer_family": "repo_state_heuristic",
    "reward": 0.0,
    "pass_threshold": threshold,
    "passed": False,
    "output_contract": contract,
    "sub_scores": {},
    "failure": failure,
}
with open("/logs/verifier/validation_result.json", "w") as f:
    json.dump(payload, f, indent=2)
PYEOF
    echo "0.0" > /logs/verifier/reward.txt
}
77+
78+
# Emit a scored validation_result sidecar and mirror the reward into
# reward.txt.  $1 = numeric score in [0, 1], $2 = optional reason string.
#
# Fix: the per-check variables (TAINT_ADDED, CHANGES_MADE, TESTS_ADDED,
# UNIT_TEST_PASS, BUILD_OK) are plain shell variables that were never
# exported, so the python3 child process could not see them and every
# sub-score was always reported as 0.0.  They are now passed explicitly in
# the command environment alongside the change-detection counters.
write_scored_result() {
    local score="$1"
    local reason="${2:-}"
    VALIDATION_SCORE="$score" \
    VALIDATION_REASON="$reason" \
    TAINT_ADDED="${TAINT_ADDED:-0}" \
    CHANGES_MADE="${CHANGES_MADE:-0}" \
    TESTS_ADDED="${TESTS_ADDED:-0}" \
    UNIT_TEST_PASS="${UNIT_TEST_PASS:-0}" \
    BUILD_OK="${BUILD_OK:-0}" \
    UNSTAGED_COUNT="${UNSTAGED_COUNT:-0}" \
    STAGED_COUNT="${STAGED_COUNT:-0}" \
    UNTRACKED_COUNT="${UNTRACKED_COUNT:-0}" \
    COMMIT_COUNT="${COMMIT_COUNT:-0}" \
    python3 - "$TASK_OUTPUT" "$ARTIFACT_REQUIRED" "$PASS_THRESHOLD" <<'PYEOF'
import json
import os
import sys

primary_path, required_artifact, pass_threshold = sys.argv[1:4]
reward = float(os.environ.get("VALIDATION_SCORE", "0.0"))
threshold = float(pass_threshold)
# Binary per-check indicators, forwarded from the shell environment above.
checks = {
    "taint_added": float(os.environ.get("TAINT_ADDED", "0") or 0),
    "changes_made": float(os.environ.get("CHANGES_MADE", "0") or 0),
    "tests_added": float(os.environ.get("TESTS_ADDED", "0") or 0),
    "unit_test_pass": float(os.environ.get("UNIT_TEST_PASS", "0") or 0),
    "build_ok": float(os.environ.get("BUILD_OK", "0") or 0),
}
details = {
    # Static weights documenting how the shell computes the final score;
    # build_ok is a gate (score forced to 0.0 on failure), not a weight.
    "check_weights": {
        "taint_added": 0.3,
        "changes_made": 0.2,
        "tests_added": 0.2,
        "unit_test_pass": 0.3,
        "build_ok": "gating",
    },
    "change_detection": {
        "unstaged": int(os.environ.get("UNSTAGED_COUNT", "0") or 0),
        "staged": int(os.environ.get("STAGED_COUNT", "0") or 0),
        "untracked": int(os.environ.get("UNTRACKED_COUNT", "0") or 0),
        "commits": int(os.environ.get("COMMIT_COUNT", "0") or 0),
    },
}
reason = os.environ.get("VALIDATION_REASON")
if reason:
    details["reason"] = reason
payload = {
    "schema_version": "validation_result.v1alpha1",
    "status": "scored",
    "scorable": True,
    "scorer_family": "repo_state_heuristic",
    "reward": reward,
    "pass_threshold": threshold,
    "passed": reward >= threshold,
    "output_contract": {
        "mode": "answer_json_bridge",
        "primary_path": primary_path,
        "required_artifact": required_artifact == "true",
    },
    "sub_scores": {
        "checks": checks,
    },
    "failure": None,
    "details": details,
}
with open("/logs/verifier/validation_result.json", "w") as f:
    json.dump(payload, f, indent=2)
PYEOF
    echo "$score" > /logs/verifier/reward.txt
}
144+
32145
# Fix git safe.directory: the repo was cloned as root during Docker build,
33146
# but the verifier may run as a different user. Without this, all git
34147
# commands silently fail due to CVE-2022-24765 ownership checks.
35148
git config --global --add safe.directory "$TASK_REPO_ROOT" 2>/dev/null || true
36149

150+
if [ "${ARTIFACT_ONLY:-false}" = "true" ] && [ ! -f "${ANSWER_JSON:-$TASK_OUTPUT}" ]; then
151+
echo "Required answer.json artifact missing at ${ANSWER_JSON:-$TASK_OUTPUT}"
152+
write_invalid_output "missing_required_output" \
153+
"answer.json not found at ${ANSWER_JSON:-$TASK_OUTPUT}"
154+
exit 0
155+
fi
156+
37157
# Guard: if no code changes were made, the agent didn't execute successfully
38158
# Check unstaged changes, staged changes, untracked files, AND new commits
39159
UNSTAGED_COUNT=$(git diff --stat 2>/dev/null | wc -l)
@@ -63,7 +183,7 @@ fi
63183
echo "Change detection: unstaged=$UNSTAGED_COUNT staged=$STAGED_COUNT untracked=$UNTRACKED_COUNT commits=$COMMIT_COUNT (origin_ref=${ORIGIN_REF:-none})"
64184
if [ "$UNSTAGED_COUNT" -eq 0 ] && [ "$STAGED_COUNT" -eq 0 ] && [ "$UNTRACKED_COUNT" -eq 0 ] && [ "$COMMIT_COUNT" -eq 0 ]; then
65185
echo "No code changes detected — agent did not execute successfully"
66-
echo "0.0" > /logs/verifier/reward.txt
186+
write_scored_result "0.0" "no_code_changes"
67187
echo ""
68188
echo "[ ] Tests completed - Score: 0.0 (no changes)"
69189
exit 0
@@ -91,7 +211,7 @@ fi
91211

92212
if [ "$BUILD_OK" -eq 0 ]; then
93213
echo "Compilation failed — score set to 0.0"
94-
echo "0.0" > /logs/verifier/reward.txt
214+
write_scored_result "0.0" "build_failure"
95215
echo ""
96216
echo "[ ] Tests completed - Score: 0.0 (build failure)"
97217
exit 0
@@ -178,6 +298,6 @@ fi
178298
# Convert back to decimal (using awk for portable floating point)
179299
SCORE=$(awk "BEGIN {printf \"%.1f\", $SCORE_NUMERATOR / 10}")
180300

181-
echo "$SCORE" > /logs/verifier/reward.txt
301+
write_scored_result "$SCORE" "completed"
182302
echo ""
183303
echo "[x] Tests completed - Score: $SCORE"

0 commit comments

Comments (0)