Skip to content

Commit be8bff8

Browse files
sjarmak and claude committed
Complete validation_result migration for remaining 50 verifier tasks
Migrate all remaining verifier families (checklist, ir_checklist, f1, f1_hybrid, continuous, test_ratio) to emit a validation_result.json sidecar alongside reward.txt. All 264 active tasks now emit the v1alpha1 validation_result contract.

Families migrated:
- ir_checklist (17 tasks): composite IR scoring with sub_scores
- checklist (16 tasks): heterogeneous fault localization + doc gen
- f1_hybrid (7 tasks): detection F1 + fix quality blend
- continuous (5 tasks): weighted category checklist
- test_ratio (3 tasks): SWE-bench test pass ratio via EXIT trap
- f1 (2 tasks): pure F1 precision/recall scorer

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 699f9c8 commit be8bff8

File tree

54 files changed

+4359
-437
lines changed
  • benchmarks
    • csb_sdlc_debug
      • linux-acpi-backlight-fault-001/tests
      • linux-hda-intel-suspend-fault-001/tests
      • linux-iwlwifi-subdevice-fault-001/tests
      • linux-nfs-inode-revalidate-fault-001/tests
    • csb_sdlc_design
      • camel-routing-arch-001/tests
      • django-orm-query-arch-001/tests
      • k8s-crd-lifecycle-arch-001/tests
      • postgres-query-exec-arch-001/tests
    • csb_sdlc_document
      • docgen-inline-002/tests
      • envoy-arch-doc-gen-001/tests
      • envoy-migration-doc-gen-001/tests
      • k8s-apiserver-doc-gen-001/tests
      • k8s-applyconfig-doc-gen-001/tests
      • k8s-clientgo-doc-gen-001/tests
      • k8s-fairqueuing-doc-gen-001/tests
      • k8s-kubelet-cm-doc-gen-001/tests
      • kafka-api-doc-gen-001/tests
    • csb_sdlc_feature
      • camel-fix-protocol-feat-001/tests
      • envoy-grpc-server-impl-001/tests
      • k8s-runtime-object-impl-001/tests
      • strata-cds-tranche-feat-001/tests
    • csb_sdlc_fix
      • django-select-for-update-fix-001/tests
      • element-web-roomheaderbuttons-can-crash-fix-001/tests
      • element-web-unread-indicators-diverge-fix-001/tests
      • k8s-dra-scheduler-event-fix-001/tests
      • kafka-producer-bufpool-fix-001/tests
      • teleport-users-can-delete-fix-001/tests
    • csb_sdlc_refactor
      • k8s-score-normalizer-refac-001/tests
      • kafka-batch-accumulator-refac-001/tests
      • strata-fx-european-refac-001/tests
    • csb_sdlc_secure
      • curl-cve-triage-001/tests
      • k8s-rbac-auth-audit-001/tests
      • kafka-sasl-auth-audit-001/tests
    • csb_sdlc_test
      • aspnetcore-code-review-001/tests
      • calcom-code-review-001/tests
      • envoy-code-review-001/tests
      • ghost-code-review-001/tests
      • kafka-security-review-001/tests
      • terraform-code-review-001/tests
      • vscode-code-review-001/tests
    • csb_sdlc_understand
      • argocd-arch-orient-001/tests
      • cilium-ebpf-fault-qa-001/tests
      • cilium-project-orient-001/tests
      • envoy-request-routing-qa-001/tests
      • grafana-platform-orient-001/tests
      • kafka-build-orient-001/tests
      • kafka-contributor-workflow-001/tests
  • configs
  • docs/ops
  • scripts

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

54 files changed

+4359
-437
lines changed

benchmarks/csb_sdlc_debug/linux-acpi-backlight-fault-001/tests/test.sh

Lines changed: 65 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,22 +18,80 @@ cd "$TASK_REPO_ROOT"
1818
# Create log directories
1919
mkdir -p /logs/verifier
2020

21+
# ── validation_result sidecar ───────────────────────────────────────────────
22+
#######################################
# Write the validation_result.v1alpha1 sidecar JSON for this verifier run.
# Globals:
#   VALIDATION_RESULT_PATH (read, optional) - output file; defaults to
#     /logs/verifier/validation_result.json
# Arguments:
#   $1 - reward; anything that is not a finite number is recorded as 0.0
#   $2 - sub_scores as a JSON object (default: {}); a value that is not
#        valid JSON or not an object is recorded as {}
#   $3 - details as JSON (default: null); omitted from the payload when
#        empty or null
#   $4 - status (default: scored)
#   $5 - failure as JSON (default: null); when status is not "scored" and
#        no failure is supplied, a "missing_required_output" failure is used
# Outputs:
#   Writes the JSON payload to the output file.
# Returns:
#   Exit status of the python3 helper.
#######################################
write_validation_result() {
    local reward="${1:-}"
    local sub_scores_json="${2:-}"
    local details_json="${3:-null}"
    local status="${4:-scored}"
    local failure_json="${5:-null}"
    local out_path="${VALIDATION_RESULT_PATH:-/logs/verifier/validation_result.json}"

    # Plain empty-check instead of a "${2:-{\}}" default, which depends on
    # brace-escaping rules inside parameter expansion.
    if [ -z "$sub_scores_json" ]; then
        sub_scores_json='{}'
    fi

    python3 - "$reward" "$sub_scores_json" "$details_json" "$status" "$failure_json" "$out_path" <<'VR_PYEOF'
import json, math, os, sys

reward_str, sub_scores_raw, details_raw, status, failure_raw = sys.argv[1:6]
out_path = sys.argv[6]


def parse_json(raw, fallback):
    """Decode raw as JSON; return fallback when it cannot be decoded."""
    try:
        return json.loads(raw)
    except Exception:
        return fallback


try:
    reward = float(reward_str)
except (TypeError, ValueError):
    reward = 0.0
# float() accepts "nan"/"inf", which json.dump would emit as NaN/Infinity
# (not valid strict JSON), so treat them like an unparseable reward.
if not math.isfinite(reward):
    reward = 0.0

sub_scores = parse_json(sub_scores_raw, {})
if not isinstance(sub_scores, dict):
    # Keep the field an object even if the caller passed a list or scalar.
    sub_scores = {}
details = parse_json(details_raw, None)
failure = parse_json(failure_raw, None)

if status != "scored":
    failure = failure or {
        "code": "missing_required_output",
        "message": "Agent did not produce required output",
        "stage": "output_validation",
    }

payload = {
    "schema_version": "validation_result.v1alpha1",
    "status": status,
    "scorable": status == "scored",
    "scorer_family": "checklist",
    "reward": reward,
    "pass_threshold": 0.5,
    "passed": status == "scored" and reward >= 0.5,
    "output_contract": {
        "mode": "unspecified",
        "primary_path": None,
        "required_artifact": False,
    },
    "sub_scores": sub_scores,
    "failure": failure,
}
if details:
    payload["details"] = details

# Do not rely on the caller having created the directory beforehand.
os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
with open(out_path, "w") as f:
    json.dump(payload, f, indent=2)
VR_PYEOF
}
79+
80+
2181
# Check that the agent produced a result file
2282
if [ ! -f "/workspace/fault_localization_result.json" ]; then
2383
echo "FAIL: /workspace/fault_localization_result.json not found"
2484
echo "0.0" > /logs/verifier/reward.txt
25-
echo ""
26-
echo "Tests completed - Score: 0.0 (no result file)"
27-
exit 0
85+
write_validation_result "0.0" "{}" "null" "invalid_output"
86+
exit 0
2887
fi
2988

3089
# Validate JSON
3190
if ! python3 -c "import json; json.load(open('/workspace/fault_localization_result.json'))" 2>/dev/null; then
3291
echo "FAIL: fault_localization_result.json is not valid JSON"
3392
echo "0.0" > /logs/verifier/reward.txt
34-
echo ""
35-
echo "Tests completed - Score: 0.0 (invalid JSON)"
36-
exit 0
93+
write_validation_result "0.0" "{}" "null" "invalid_output"
94+
exit 0
3795
fi
3896

3997
# ── Scoring ──────────────────────────────────────────────
@@ -163,5 +221,6 @@ fi
163221
FINAL_SCORE=$(awk "BEGIN {printf \"%.1f\", $SCORE / $MAX_SCORE}")
164222

165223
echo "$FINAL_SCORE" > /logs/verifier/reward.txt
224+
write_validation_result "${FINAL_SCORE:-$SCORE}" "{}" "null" "scored"
166225
echo ""
167226
echo "Tests completed - Score: $FINAL_SCORE (${SCORE}/${MAX_SCORE} checks passed)"

benchmarks/csb_sdlc_debug/linux-hda-intel-suspend-fault-001/tests/test.sh

Lines changed: 65 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,22 +18,80 @@ cd "$TASK_REPO_ROOT"
1818
# Create log directories
1919
mkdir -p /logs/verifier
2020

21+
# ── validation_result sidecar ───────────────────────────────────────────────
22+
#######################################
# Write the validation_result.v1alpha1 sidecar JSON for this verifier run.
# Globals:
#   VALIDATION_RESULT_PATH (read, optional) - output file; defaults to
#     /logs/verifier/validation_result.json
# Arguments:
#   $1 - reward; anything that is not a finite number is recorded as 0.0
#   $2 - sub_scores as a JSON object (default: {}); a value that is not
#        valid JSON or not an object is recorded as {}
#   $3 - details as JSON (default: null); omitted from the payload when
#        empty or null
#   $4 - status (default: scored)
#   $5 - failure as JSON (default: null); when status is not "scored" and
#        no failure is supplied, a "missing_required_output" failure is used
# Outputs:
#   Writes the JSON payload to the output file.
# Returns:
#   Exit status of the python3 helper.
#######################################
write_validation_result() {
    local reward="${1:-}"
    local sub_scores_json="${2:-}"
    local details_json="${3:-null}"
    local status="${4:-scored}"
    local failure_json="${5:-null}"
    local out_path="${VALIDATION_RESULT_PATH:-/logs/verifier/validation_result.json}"

    # Plain empty-check instead of a "${2:-{\}}" default, which depends on
    # brace-escaping rules inside parameter expansion.
    if [ -z "$sub_scores_json" ]; then
        sub_scores_json='{}'
    fi

    python3 - "$reward" "$sub_scores_json" "$details_json" "$status" "$failure_json" "$out_path" <<'VR_PYEOF'
import json, math, os, sys

reward_str, sub_scores_raw, details_raw, status, failure_raw = sys.argv[1:6]
out_path = sys.argv[6]


def parse_json(raw, fallback):
    """Decode raw as JSON; return fallback when it cannot be decoded."""
    try:
        return json.loads(raw)
    except Exception:
        return fallback


try:
    reward = float(reward_str)
except (TypeError, ValueError):
    reward = 0.0
# float() accepts "nan"/"inf", which json.dump would emit as NaN/Infinity
# (not valid strict JSON), so treat them like an unparseable reward.
if not math.isfinite(reward):
    reward = 0.0

sub_scores = parse_json(sub_scores_raw, {})
if not isinstance(sub_scores, dict):
    # Keep the field an object even if the caller passed a list or scalar.
    sub_scores = {}
details = parse_json(details_raw, None)
failure = parse_json(failure_raw, None)

if status != "scored":
    failure = failure or {
        "code": "missing_required_output",
        "message": "Agent did not produce required output",
        "stage": "output_validation",
    }

payload = {
    "schema_version": "validation_result.v1alpha1",
    "status": status,
    "scorable": status == "scored",
    "scorer_family": "checklist",
    "reward": reward,
    "pass_threshold": 0.5,
    "passed": status == "scored" and reward >= 0.5,
    "output_contract": {
        "mode": "unspecified",
        "primary_path": None,
        "required_artifact": False,
    },
    "sub_scores": sub_scores,
    "failure": failure,
}
if details:
    payload["details"] = details

# Do not rely on the caller having created the directory beforehand.
os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
with open(out_path, "w") as f:
    json.dump(payload, f, indent=2)
VR_PYEOF
}
79+
80+
2181
# Check that the agent produced a result file
2282
if [ ! -f "/workspace/fault_localization_result.json" ]; then
2383
echo "FAIL: /workspace/fault_localization_result.json not found"
2484
echo "0.0" > /logs/verifier/reward.txt
25-
echo ""
26-
echo "Tests completed - Score: 0.0 (no result file)"
27-
exit 0
85+
write_validation_result "0.0" "{}" "null" "invalid_output"
86+
exit 0
2887
fi
2988

3089
# Validate JSON
3190
if ! python3 -c "import json; json.load(open('/workspace/fault_localization_result.json'))" 2>/dev/null; then
3291
echo "FAIL: fault_localization_result.json is not valid JSON"
3392
echo "0.0" > /logs/verifier/reward.txt
34-
echo ""
35-
echo "Tests completed - Score: 0.0 (invalid JSON)"
36-
exit 0
93+
write_validation_result "0.0" "{}" "null" "invalid_output"
94+
exit 0
3795
fi
3896

3997
# ── Scoring ──────────────────────────────────────────────
@@ -160,5 +218,6 @@ fi
160218
FINAL_SCORE=$(awk "BEGIN {printf \"%.1f\", $SCORE / $MAX_SCORE}")
161219

162220
echo "$FINAL_SCORE" > /logs/verifier/reward.txt
221+
write_validation_result "${FINAL_SCORE:-$SCORE}" "{}" "null" "scored"
163222
echo ""
164223
echo "Tests completed - Score: $FINAL_SCORE (${SCORE}/${MAX_SCORE} checks passed)"

benchmarks/csb_sdlc_debug/linux-iwlwifi-subdevice-fault-001/tests/test.sh

Lines changed: 65 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,22 +18,80 @@ cd "$TASK_REPO_ROOT"
1818
# Create log directories
1919
mkdir -p /logs/verifier
2020

21+
# ── validation_result sidecar ───────────────────────────────────────────────
22+
#######################################
# Write the validation_result.v1alpha1 sidecar JSON for this verifier run.
# Globals:
#   VALIDATION_RESULT_PATH (read, optional) - output file; defaults to
#     /logs/verifier/validation_result.json
# Arguments:
#   $1 - reward; anything that is not a finite number is recorded as 0.0
#   $2 - sub_scores as a JSON object (default: {}); a value that is not
#        valid JSON or not an object is recorded as {}
#   $3 - details as JSON (default: null); omitted from the payload when
#        empty or null
#   $4 - status (default: scored)
#   $5 - failure as JSON (default: null); when status is not "scored" and
#        no failure is supplied, a "missing_required_output" failure is used
# Outputs:
#   Writes the JSON payload to the output file.
# Returns:
#   Exit status of the python3 helper.
#######################################
write_validation_result() {
    local reward="${1:-}"
    local sub_scores_json="${2:-}"
    local details_json="${3:-null}"
    local status="${4:-scored}"
    local failure_json="${5:-null}"
    local out_path="${VALIDATION_RESULT_PATH:-/logs/verifier/validation_result.json}"

    # Plain empty-check instead of a "${2:-{\}}" default, which depends on
    # brace-escaping rules inside parameter expansion.
    if [ -z "$sub_scores_json" ]; then
        sub_scores_json='{}'
    fi

    python3 - "$reward" "$sub_scores_json" "$details_json" "$status" "$failure_json" "$out_path" <<'VR_PYEOF'
import json, math, os, sys

reward_str, sub_scores_raw, details_raw, status, failure_raw = sys.argv[1:6]
out_path = sys.argv[6]


def parse_json(raw, fallback):
    """Decode raw as JSON; return fallback when it cannot be decoded."""
    try:
        return json.loads(raw)
    except Exception:
        return fallback


try:
    reward = float(reward_str)
except (TypeError, ValueError):
    reward = 0.0
# float() accepts "nan"/"inf", which json.dump would emit as NaN/Infinity
# (not valid strict JSON), so treat them like an unparseable reward.
if not math.isfinite(reward):
    reward = 0.0

sub_scores = parse_json(sub_scores_raw, {})
if not isinstance(sub_scores, dict):
    # Keep the field an object even if the caller passed a list or scalar.
    sub_scores = {}
details = parse_json(details_raw, None)
failure = parse_json(failure_raw, None)

if status != "scored":
    failure = failure or {
        "code": "missing_required_output",
        "message": "Agent did not produce required output",
        "stage": "output_validation",
    }

payload = {
    "schema_version": "validation_result.v1alpha1",
    "status": status,
    "scorable": status == "scored",
    "scorer_family": "checklist",
    "reward": reward,
    "pass_threshold": 0.5,
    "passed": status == "scored" and reward >= 0.5,
    "output_contract": {
        "mode": "unspecified",
        "primary_path": None,
        "required_artifact": False,
    },
    "sub_scores": sub_scores,
    "failure": failure,
}
if details:
    payload["details"] = details

# Do not rely on the caller having created the directory beforehand.
os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
with open(out_path, "w") as f:
    json.dump(payload, f, indent=2)
VR_PYEOF
}
79+
80+
2181
# Check that the agent produced a result file
2282
if [ ! -f "/workspace/fault_localization_result.json" ]; then
2383
echo "FAIL: /workspace/fault_localization_result.json not found"
2484
echo "0.0" > /logs/verifier/reward.txt
25-
echo ""
26-
echo "Tests completed - Score: 0.0 (no result file)"
27-
exit 0
85+
write_validation_result "0.0" "{}" "null" "invalid_output"
86+
exit 0
2887
fi
2988

3089
# Validate JSON
3190
if ! python3 -c "import json; json.load(open('/workspace/fault_localization_result.json'))" 2>/dev/null; then
3291
echo "FAIL: fault_localization_result.json is not valid JSON"
3392
echo "0.0" > /logs/verifier/reward.txt
34-
echo ""
35-
echo "Tests completed - Score: 0.0 (invalid JSON)"
36-
exit 0
93+
write_validation_result "0.0" "{}" "null" "invalid_output"
94+
exit 0
3795
fi
3896

3997
# ── Scoring ──────────────────────────────────────────────
@@ -159,5 +217,6 @@ fi
159217
FINAL_SCORE=$(awk "BEGIN {printf \"%.1f\", $SCORE / $MAX_SCORE}")
160218

161219
echo "$FINAL_SCORE" > /logs/verifier/reward.txt
220+
write_validation_result "${FINAL_SCORE:-$SCORE}" "{}" "null" "scored"
162221
echo ""
163222
echo "Tests completed - Score: $FINAL_SCORE (${SCORE}/${MAX_SCORE} checks passed)"

benchmarks/csb_sdlc_debug/linux-nfs-inode-revalidate-fault-001/tests/test.sh

Lines changed: 65 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,22 +18,80 @@ cd "$TASK_REPO_ROOT"
1818
# Create log directories
1919
mkdir -p /logs/verifier
2020

21+
# ── validation_result sidecar ───────────────────────────────────────────────
22+
#######################################
# Write the validation_result.v1alpha1 sidecar JSON for this verifier run.
# Globals:
#   VALIDATION_RESULT_PATH (read, optional) - output file; defaults to
#     /logs/verifier/validation_result.json
# Arguments:
#   $1 - reward; anything that is not a finite number is recorded as 0.0
#   $2 - sub_scores as a JSON object (default: {}); a value that is not
#        valid JSON or not an object is recorded as {}
#   $3 - details as JSON (default: null); omitted from the payload when
#        empty or null
#   $4 - status (default: scored)
#   $5 - failure as JSON (default: null); when status is not "scored" and
#        no failure is supplied, a "missing_required_output" failure is used
# Outputs:
#   Writes the JSON payload to the output file.
# Returns:
#   Exit status of the python3 helper.
#######################################
write_validation_result() {
    local reward="${1:-}"
    local sub_scores_json="${2:-}"
    local details_json="${3:-null}"
    local status="${4:-scored}"
    local failure_json="${5:-null}"
    local out_path="${VALIDATION_RESULT_PATH:-/logs/verifier/validation_result.json}"

    # Plain empty-check instead of a "${2:-{\}}" default, which depends on
    # brace-escaping rules inside parameter expansion.
    if [ -z "$sub_scores_json" ]; then
        sub_scores_json='{}'
    fi

    python3 - "$reward" "$sub_scores_json" "$details_json" "$status" "$failure_json" "$out_path" <<'VR_PYEOF'
import json, math, os, sys

reward_str, sub_scores_raw, details_raw, status, failure_raw = sys.argv[1:6]
out_path = sys.argv[6]


def parse_json(raw, fallback):
    """Decode raw as JSON; return fallback when it cannot be decoded."""
    try:
        return json.loads(raw)
    except Exception:
        return fallback


try:
    reward = float(reward_str)
except (TypeError, ValueError):
    reward = 0.0
# float() accepts "nan"/"inf", which json.dump would emit as NaN/Infinity
# (not valid strict JSON), so treat them like an unparseable reward.
if not math.isfinite(reward):
    reward = 0.0

sub_scores = parse_json(sub_scores_raw, {})
if not isinstance(sub_scores, dict):
    # Keep the field an object even if the caller passed a list or scalar.
    sub_scores = {}
details = parse_json(details_raw, None)
failure = parse_json(failure_raw, None)

if status != "scored":
    failure = failure or {
        "code": "missing_required_output",
        "message": "Agent did not produce required output",
        "stage": "output_validation",
    }

payload = {
    "schema_version": "validation_result.v1alpha1",
    "status": status,
    "scorable": status == "scored",
    "scorer_family": "checklist",
    "reward": reward,
    "pass_threshold": 0.5,
    "passed": status == "scored" and reward >= 0.5,
    "output_contract": {
        "mode": "unspecified",
        "primary_path": None,
        "required_artifact": False,
    },
    "sub_scores": sub_scores,
    "failure": failure,
}
if details:
    payload["details"] = details

# Do not rely on the caller having created the directory beforehand.
os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
with open(out_path, "w") as f:
    json.dump(payload, f, indent=2)
VR_PYEOF
}
79+
80+
2181
# Check that the agent produced a result file
2282
if [ ! -f "/workspace/fault_localization_result.json" ]; then
2383
echo "FAIL: /workspace/fault_localization_result.json not found"
2484
echo "0.0" > /logs/verifier/reward.txt
25-
echo ""
26-
echo "Tests completed - Score: 0.0 (no result file)"
27-
exit 0
85+
write_validation_result "0.0" "{}" "null" "invalid_output"
86+
exit 0
2887
fi
2988

3089
# Validate JSON
3190
if ! python3 -c "import json; json.load(open('/workspace/fault_localization_result.json'))" 2>/dev/null; then
3291
echo "FAIL: fault_localization_result.json is not valid JSON"
3392
echo "0.0" > /logs/verifier/reward.txt
34-
echo ""
35-
echo "Tests completed - Score: 0.0 (invalid JSON)"
36-
exit 0
93+
write_validation_result "0.0" "{}" "null" "invalid_output"
94+
exit 0
3795
fi
3896

3997
# ── Scoring ──────────────────────────────────────────────
@@ -159,5 +217,6 @@ fi
159217
FINAL_SCORE=$(awk "BEGIN {printf \"%.1f\", $SCORE / $MAX_SCORE}")
160218

161219
echo "$FINAL_SCORE" > /logs/verifier/reward.txt
220+
write_validation_result "${FINAL_SCORE:-$SCORE}" "{}" "null" "scored"
162221
echo ""
163222
echo "Tests completed - Score: $FINAL_SCORE (${SCORE}/${MAX_SCORE} checks passed)"

0 commit comments

Comments
 (0)