Skip to content

Commit 8f60e2f

Browse files
committed
Add validation results to org oracle verifiers
1 parent d89d2e9 commit 8f60e2f

File tree

83 files changed

+14848
-1235
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

83 files changed

+14848
-1235
lines changed

benchmarks/csb_org_compliance/ccx-compliance-124/tests/eval.sh

Lines changed: 178 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,193 @@
11
#!/bin/bash
2-
# eval.sh — org-scale benchmark evaluator for CCX-compliance-124
2+
# eval.sh — MCP-unique benchmark evaluator for CCX-compliance-124
33
# Exit-code-first (SWE-Factory pattern):
44
# exit 0 — agent produced useful output (composite score > 0)
55
# exit 1 — total failure (composite score == 0 or missing answer)
66
#
77
# Writes /logs/verifier/reward.txt with the composite score [0.0, 1.0]
8+
# and /logs/verifier/validation_result.json with canonical verifier semantics.
89

910
set -euo pipefail
1011

11-
TASK_WORKDIR="${TASK_WORKDIR:-/workspace}"
12-
TASK_REPO_ROOT="${TASK_REPO_ROOT:-${VERIFY_REPO:-$TASK_WORKDIR}}"
13-
TASK_OUTPUT="${TASK_OUTPUT:-$TASK_WORKDIR/answer.json}"
14-
1512
# Identifier of the task scored by this evaluator.
TASK_ID="CCX-compliance-124"

# Overridable locations are read with printenv; an unset or empty variable
# falls through to the next candidate.
TASK_WORKDIR="$(printenv TASK_WORKDIR || true)"
if [ -z "$TASK_WORKDIR" ]; then
  TASK_WORKDIR="/workspace"
fi

# Repo root preference order: TASK_REPO_ROOT, then VERIFY_REPO, then the
# working directory resolved above.
TASK_REPO_ROOT="$(printenv TASK_REPO_ROOT || true)"
if [ -z "$TASK_REPO_ROOT" ]; then
  TASK_REPO_ROOT="$(printenv VERIFY_REPO || true)"
fi
if [ -z "$TASK_REPO_ROOT" ]; then
  TASK_REPO_ROOT="$TASK_WORKDIR"
fi

# The agent's answer file; defaults to answer.json inside the working directory.
TASK_OUTPUT="$(printenv TASK_OUTPUT || true)"
if [ -z "$TASK_OUTPUT" ]; then
  TASK_OUTPUT="$TASK_WORKDIR/answer.json"
fi
ANSWER_PATH="$TASK_OUTPUT"

# Fixed verifier inputs and outputs.
TASK_SPEC_PATH="/tests/task_spec.json"
ORACLE_CHECKS="/tests/oracle_checks.py"
REWARD_PATH="/logs/verifier/reward.txt"
VALIDATION_RESULT="/logs/verifier/validation_result.json"

# Values copied verbatim into validation_result.json.
VALIDATION_RESULT_SCHEMA="validation_result.v1alpha1"
SCORER_FAMILY="oracle_checks"
PASS_THRESHOLD="0.0"
2028

2129
mkdir -p /logs/verifier
2230

31+
#######################################
# Write a failure-shaped validation result as JSON.
# Globals (read):
#   VALIDATION_RESULT         path of the JSON file to write
#   TASK_OUTPUT               recorded as output_contract.primary_path
#   VALIDATION_RESULT_SCHEMA  recorded as schema_version
#   SCORER_FAMILY             recorded as scorer_family
#   PASS_THRESHOLD            recorded as pass_threshold (parsed as float)
# Arguments:
#   $1 - failure code
#   $2 - human-readable failure message
#   $3 - failure stage; "output_validation" yields status "invalid_output",
#        any other stage yields status "verifier_error"
# Outputs:
#   Overwrites $VALIDATION_RESULT with reward 0.0, passed false and
#   scorable false.
# Returns:
#   Exit status of the python3 process.
#######################################
write_validation_failure() {
  local code="$1"
  local message="$2"
  local stage="$3"
  # All values travel as argv; the quoted heredoc delimiter keeps the shell
  # from expanding anything inside the Python source.
  python3 - "$VALIDATION_RESULT" "$code" "$message" "$stage" "$TASK_OUTPUT" "$VALIDATION_RESULT_SCHEMA" "$SCORER_FAMILY" "$PASS_THRESHOLD" <<'PYEOF'
import json
import os
import sys

(
    output_path,
    code,
    message,
    stage,
    primary_path,
    schema_version,
    scorer_family,
    pass_threshold,
) = sys.argv[1:]

status = "invalid_output" if stage == "output_validation" else "verifier_error"
payload = {
    "schema_version": schema_version,
    "status": status,
    "scorable": False,
    "scorer_family": scorer_family,
    "reward": 0.0,
    "pass_threshold": float(pass_threshold),
    "passed": False,
    "output_contract": {
        "mode": "answer_json_native",
        "primary_path": primary_path,
        "required_artifact": True,
    },
    "sub_scores": {},
    "failure": {
        "code": code,
        "message": message,
        "stage": stage,
    },
}

# Create the destination directory so the failure report can be written even
# when nothing else has prepared it.
parent = os.path.dirname(output_path)
if parent:
    os.makedirs(parent, exist_ok=True)

with open(output_path, "w") as f:
    json.dump(payload, f, indent=2)
PYEOF
}
75+
76+
#######################################
# Run the oracle checks module against the answer and record the outcome.
# Globals (read):
#   ORACLE_CHECKS             path of the Python module providing
#                             run_all_checks(answer_path, task_spec_path)
#   ANSWER_PATH               answer file handed to run_all_checks
#   TASK_SPEC_PATH            task spec handed to run_all_checks
#   VALIDATION_RESULT         path of the JSON file to write
#   TASK_OUTPUT               recorded as output_contract.primary_path
#   VALIDATION_RESULT_SCHEMA  recorded as schema_version
#   SCORER_FAMILY             recorded as scorer_family
#   PASS_THRESHOLD            pass threshold (parsed as float)
# Arguments:
#   None
# Outputs:
#   Overwrites $VALIDATION_RESULT with the result payload and prints the
#   composite score (four decimals) as the only line on stdout. Text the
#   oracle module prints through Python's sys.stdout is sent to stderr.
# Returns:
#   Exit status of the python3 process (non-zero if the module cannot be
#   loaded or raises).
#######################################
run_oracle_validation() {
  python3 - "$ORACLE_CHECKS" "$ANSWER_PATH" "$TASK_SPEC_PATH" "$VALIDATION_RESULT" "$TASK_OUTPUT" "$VALIDATION_RESULT_SCHEMA" "$SCORER_FAMILY" "$PASS_THRESHOLD" <<'PYEOF'
import contextlib
import importlib.util
import json
import os
import sys

(
    oracle_checks_path,
    answer_path,
    task_spec_path,
    output_path,
    primary_path,
    schema_version,
    scorer_family,
    pass_threshold,
) = sys.argv[1:]

threshold = float(pass_threshold)

# The caller captures all of stdout and parses it as the score, so anything the
# oracle module prints while loading or running is diverted to stderr.
with contextlib.redirect_stdout(sys.stderr):
    spec = importlib.util.spec_from_file_location("oracle_checks", oracle_checks_path)
    if spec is None or spec.loader is None:
        raise RuntimeError(f"Failed to load oracle checks module from {oracle_checks_path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    check_result = module.run_all_checks(answer_path, task_spec_path)

# Metric key that represents each known check type in sub_scores.
PRIMARY_METRIC = {
    "symbol_resolution": "recall",
    "dependency_chain": "chain_recall",
    "provenance": "provenance_score",
    "keyword_presence": "keyword_recall",
    "test_ratio": "ratio",
}


def as_float(value) -> float:
    """Coerce a metric to float; None or non-numeric values count as 0.0."""
    if isinstance(value, bool):
        return 1.0 if value else 0.0
    try:
        return float(value)
    except (TypeError, ValueError):
        return 0.0


def primary_score(check_type: str, result) -> float:
    """Return the headline metric of one check result.

    Sub-scores are auxiliary to the composite score, so a malformed entry
    scores 0.0 instead of aborting the whole run.
    """
    if not isinstance(result, dict):
        return 0.0
    if check_type == "file_set_match":
        return as_float(result.get("weighted_f1", result.get("f1", 0.0)))
    if check_type == "json_schema_match":
        return 1.0 if result.get("valid") else 0.0
    return as_float(result.get(PRIMARY_METRIC.get(check_type, "score"), 0.0))


output_contract = {
    "mode": "answer_json_native",
    "primary_path": primary_path,
    "required_artifact": True,
}

if "error" in check_result:
    score = 0.0
    payload = {
        "schema_version": schema_version,
        "status": "verifier_error",
        "scorable": False,
        "scorer_family": scorer_family,
        "reward": 0.0,
        "pass_threshold": threshold,
        "passed": False,
        "output_contract": output_contract,
        "sub_scores": {},
        "failure": {
            "code": "oracle_checks_error",
            "message": str(check_result["error"]),
            "stage": "scoring",
        },
        "details": {
            "oracle_checks": check_result,
        },
        "composite_score": 0.0,
        "checks": {},
        "error": check_result["error"],
    }
else:
    raw_checks = check_result.get("checks", {})
    sub_scores = {}
    for check_type, result in raw_checks.items():
        sub_score = round(primary_score(check_type, result), 4)
        sub_scores[check_type] = {
            "score": sub_score,
            "passed": sub_score > 0.0,
        }

    # The composite score is parsed strictly: a non-numeric value is a verifier
    # fault and must surface as an error rather than a silent zero.
    score = round(float(check_result.get("composite_score", 0.0)), 4)
    payload = {
        "schema_version": schema_version,
        "status": "scored",
        "scorable": True,
        "scorer_family": scorer_family,
        "reward": score,
        "pass_threshold": threshold,
        "passed": score > threshold,
        "output_contract": output_contract,
        "sub_scores": sub_scores,
        "failure": None,
        "details": {
            "oracle_checks": check_result,
        },
        "composite_score": score,
        "checks": raw_checks,
    }

parent = os.path.dirname(output_path)
if parent:
    os.makedirs(parent, exist_ok=True)

with open(output_path, "w") as f:
    json.dump(payload, f, indent=2)

print(f"{score:.4f}")
PYEOF
}
190+
23191
echo "=== CCX-compliance-124 evaluator ==="
24192
echo "Task spec: $TASK_SPEC_PATH"
25193
echo "Answer: $ANSWER_PATH"
@@ -36,13 +204,15 @@ fi
36204
if [ ! -f "$ANSWER_PATH" ]; then
37205
echo "ERROR: answer.json not found at $ANSWER_PATH"
38206
echo "0.0" > "$REWARD_PATH"
207+
write_validation_failure "missing_required_output" "answer.json not found at $ANSWER_PATH" "output_validation"
39208
exit 1
40209
fi
41210

42211
# Validate answer is valid JSON
43212
# The path is handed to Python as an argument rather than spliced into the
# -c source, so quotes or backslashes in $ANSWER_PATH cannot break or alter
# the check. stderr is discarded on purpose: only the exit status matters.
if ! python3 -c 'import json, sys; json.load(open(sys.argv[1]))' "$ANSWER_PATH" 2>/dev/null; then
  echo "ERROR: answer.json is not valid JSON"
  echo "0.0" > "$REWARD_PATH"
  write_validation_failure "invalid_answer_json" "answer.json is not valid JSON" "output_validation"
  exit 1
fi
48218

@@ -52,16 +222,18 @@ echo "answer.json found and valid JSON"
52222
if [ ! -f "$ORACLE_CHECKS" ]; then
53223
echo "ERROR: oracle_checks.py not found at $ORACLE_CHECKS"
54224
echo "0.0" > "$REWARD_PATH"
225+
write_validation_failure "missing_oracle_checks" "oracle_checks.py not found at $ORACLE_CHECKS" "verifier_runtime"
55226
exit 1
56227
fi
57228

58229
echo "Running oracle checks..."
59-
SCORE=$(python3 "$ORACLE_CHECKS" --answer "$ANSWER_PATH" --spec "$TASK_SPEC_PATH" --verbose 2>&1 | tee /dev/stderr | tail -1) || true
230+
SCORE=$(run_oracle_validation) || true
60231

61232
# Validate score is a number
62233
if ! echo "$SCORE" | python3 -c "import sys; float(sys.stdin.read().strip())" 2>/dev/null; then
63234
echo "ERROR: oracle_checks.py did not return a valid score: $SCORE"
64235
echo "0.0" > "$REWARD_PATH"
236+
write_validation_failure "invalid_verifier_score" "oracle_checks.py did not return a valid score: $SCORE" "scoring"
65237
exit 1
66238
fi
67239

0 commit comments

Comments
 (0)