# 2. Tests for the new effect pass
# 3. No regressions in existing taint logic
#
# NOTE: This task also ships a static rubric JSON file with additional manual
# evaluation criteria (e.g., architecture_understanding, code_quality).
# Those rubric criteria are descriptive only; this verifier scores automated
# repo-state checks and emits the canonical validation_result sidecar.
|
# Abort on any unhandled command failure so a broken verifier run surfaces
# as an error rather than silently producing a bogus score.
set -e
|
|
# --- Configuration (every knob overridable via the environment) -----------
# TASK_WORKDIR: directory the agent worked in; TASK_REPO_ROOT / VERIFY_REPO
# both resolve to the repo under evaluation and are kept mutually defaulted
# for compatibility with older harness invocations.
TASK_WORKDIR="${TASK_WORKDIR:-/workspace}"
TASK_REPO_ROOT="${TASK_REPO_ROOT:-${VERIFY_REPO:-$TASK_WORKDIR}}"
VERIFY_REPO="${VERIFY_REPO:-$TASK_REPO_ROOT}"
# Primary output artifact the agent may be required to produce.
TASK_OUTPUT="${TASK_OUTPUT:-/workspace/answer.json}"
# Minimum reward counted as a pass. Previously hard-coded; now env-overridable
# like every other setting in this section (default unchanged: 0.7).
PASS_THRESHOLD="${PASS_THRESHOLD:-0.7}"
# The answer.json artifact is mandatory only in artifact-only mode.
ARTIFACT_REQUIRED=false
if [ "${ARTIFACT_ONLY:-false}" = "true" ]; then
  ARTIFACT_REQUIRED=true
fi
26 | 32 |
|
# Enter the repo under evaluation. Fail loudly if the workdir is missing:
# silently continuing would score the wrong directory.
cd "$TASK_REPO_ROOT" || { echo "FATAL: cannot cd to $TASK_REPO_ROOT" >&2; exit 1; }

# Create log directories for the verifier sidecar files.
mkdir -p /logs/verifier
31 | 37 |
|
# Per-check result flags, flipped to 1 by the scoring checks further below.
# They are exported so the python3 heredoc in write_scored_result can read
# them via os.environ — a plain (unexported) shell variable is invisible to
# the child process, which made every sub-score read as 0.
export TAINT_ADDED=0
export CHANGES_MADE=0
export TESTS_ADDED=0
export UNIT_TEST_PASS=0
export BUILD_OK=0
#######################################
# Emit an "invalid_output" validation_result sidecar plus a 0.0 reward file.
# Globals (read): TASK_OUTPUT, ARTIFACT_REQUIRED, PASS_THRESHOLD
# Arguments:
#   $1 - machine-readable failure code
#   $2 - human-readable failure message
# Outputs: /logs/verifier/validation_result.json, /logs/verifier/reward.txt
#######################################
write_invalid_output() {
  local failure_code="$1"
  local failure_message="$2"
  python3 - "$failure_code" "$failure_message" "$TASK_OUTPUT" "$ARTIFACT_REQUIRED" "$PASS_THRESHOLD" <<'PYEOF'
import json
import sys

code, message, primary_path, required_artifact, pass_threshold = sys.argv[1:6]

# Assemble the nested objects first; top-level key order below is part of
# the sidecar contract and must not change.
failure = {
    "code": code,
    "message": message,
    "stage": "output_validation",
}
contract = {
    "mode": "answer_json_bridge",
    "primary_path": primary_path,
    "required_artifact": required_artifact == "true",
}
payload = {
    "schema_version": "validation_result.v1alpha1",
    "status": "invalid_output",
    "scorable": False,
    "scorer_family": "repo_state_heuristic",
    "reward": 0.0,
    "pass_threshold": float(pass_threshold),
    "passed": False,
    "output_contract": contract,
    "sub_scores": {},
    "failure": failure,
}
result_path = "/logs/verifier/validation_result.json"
with open(result_path, "w") as sidecar:
    json.dump(payload, sidecar, indent=2)
PYEOF
  echo "0.0" > /logs/verifier/reward.txt
}
| 77 | + |
#######################################
# Emit the "scored" validation_result sidecar plus the legacy reward file.
# Globals (read): TASK_OUTPUT, ARTIFACT_REQUIRED, PASS_THRESHOLD,
#   TAINT_ADDED, CHANGES_MADE, TESTS_ADDED, UNIT_TEST_PASS, BUILD_OK,
#   UNSTAGED_COUNT, STAGED_COUNT, UNTRACKED_COUNT, COMMIT_COUNT
# Arguments:
#   $1 - reward score in [0, 1] as a decimal string
#   $2 - optional short reason recorded in details.reason
# Outputs: /logs/verifier/validation_result.json, /logs/verifier/reward.txt
#######################################
write_scored_result() {
  local score="$1"
  local reason="${2:-}"
  # BUG FIX: forward the per-check flags (TAINT_ADDED, CHANGES_MADE,
  # TESTS_ADDED, UNIT_TEST_PASS, BUILD_OK) through the environment prefix
  # exactly like the change-detection counters. The embedded python3 reads
  # them via os.environ, so unless they are forwarded (or exported) every
  # sub_scores.checks entry silently reads as 0.
  VALIDATION_SCORE="$score" \
  VALIDATION_REASON="$reason" \
  TAINT_ADDED="${TAINT_ADDED:-0}" \
  CHANGES_MADE="${CHANGES_MADE:-0}" \
  TESTS_ADDED="${TESTS_ADDED:-0}" \
  UNIT_TEST_PASS="${UNIT_TEST_PASS:-0}" \
  BUILD_OK="${BUILD_OK:-0}" \
  UNSTAGED_COUNT="${UNSTAGED_COUNT:-0}" \
  STAGED_COUNT="${STAGED_COUNT:-0}" \
  UNTRACKED_COUNT="${UNTRACKED_COUNT:-0}" \
  COMMIT_COUNT="${COMMIT_COUNT:-0}" \
  python3 - "$TASK_OUTPUT" "$ARTIFACT_REQUIRED" "$PASS_THRESHOLD" <<'PYEOF'
import json
import os
import sys

primary_path, required_artifact, pass_threshold = sys.argv[1:4]
reward = float(os.environ.get("VALIDATION_SCORE", "0.0"))
threshold = float(pass_threshold)
# 'or 0' guards against empty-string env values, which float() rejects.
checks = {
    "taint_added": float(os.environ.get("TAINT_ADDED", "0") or 0),
    "changes_made": float(os.environ.get("CHANGES_MADE", "0") or 0),
    "tests_added": float(os.environ.get("TESTS_ADDED", "0") or 0),
    "unit_test_pass": float(os.environ.get("UNIT_TEST_PASS", "0") or 0),
    "build_ok": float(os.environ.get("BUILD_OK", "0") or 0),
}
details = {
    # Static documentation of the scorer's weighting; build_ok is a
    # pass/fail gate rather than a weighted component.
    "check_weights": {
        "taint_added": 0.3,
        "changes_made": 0.2,
        "tests_added": 0.2,
        "unit_test_pass": 0.3,
        "build_ok": "gating",
    },
    "change_detection": {
        "unstaged": int(os.environ.get("UNSTAGED_COUNT", "0") or 0),
        "staged": int(os.environ.get("STAGED_COUNT", "0") or 0),
        "untracked": int(os.environ.get("UNTRACKED_COUNT", "0") or 0),
        "commits": int(os.environ.get("COMMIT_COUNT", "0") or 0),
    },
}
reason = os.environ.get("VALIDATION_REASON")
if reason:
    details["reason"] = reason
payload = {
    "schema_version": "validation_result.v1alpha1",
    "status": "scored",
    "scorable": True,
    "scorer_family": "repo_state_heuristic",
    "reward": reward,
    "pass_threshold": threshold,
    "passed": reward >= threshold,
    "output_contract": {
        "mode": "answer_json_bridge",
        "primary_path": primary_path,
        "required_artifact": required_artifact == "true",
    },
    "sub_scores": {
        "checks": checks,
    },
    "failure": None,
    "details": details,
}
with open("/logs/verifier/validation_result.json", "w") as f:
    json.dump(payload, f, indent=2)
PYEOF
  echo "$score" > /logs/verifier/reward.txt
}
| 144 | + |
# Fix git safe.directory: the repo was cloned as root during Docker build,
# but the verifier may run as a different user. Without this, all git
# commands silently fail due to CVE-2022-24765 ownership checks.
# Deliberately best-effort (2>/dev/null || true): if git is unavailable the
# change-detection counters below simply read as zero.
git config --global --add safe.directory "$TASK_REPO_ROOT" 2>/dev/null || true
36 | 149 |
|
# Output-contract gate: in artifact-only mode a missing answer.json is an
# invalid_output condition, not a scored 0.0. Use the ARTIFACT_REQUIRED flag
# computed above (previously this re-tested ARTIFACT_ONLY, bypassing the
# flag) and resolve the answer path once instead of expanding
# ${ANSWER_JSON:-$TASK_OUTPUT} three separate times.
if [ "$ARTIFACT_REQUIRED" = "true" ]; then
  ANSWER_PATH="${ANSWER_JSON:-$TASK_OUTPUT}"
  if [ ! -f "$ANSWER_PATH" ]; then
    echo "Required answer.json artifact missing at $ANSWER_PATH"
    write_invalid_output "missing_required_output" \
      "answer.json not found at $ANSWER_PATH"
    exit 0
  fi
fi
| 156 | + |
# Guard: if no code changes were made, the agent didn't execute successfully.
# Check unstaged changes, staged changes, untracked files, AND new commits.
# Count changed paths with --name-only rather than --stat lines: --stat
# appends a summary line, which inflated the "unstaged" figure recorded in
# the result details (the zero / non-zero gate behaves identically).
UNSTAGED_COUNT=$(git diff --name-only 2>/dev/null | wc -l)
|
echo "Change detection: unstaged=$UNSTAGED_COUNT staged=$STAGED_COUNT untracked=$UNTRACKED_COUNT commits=$COMMIT_COUNT (origin_ref=${ORIGIN_REF:-none})"
# Hard gate: with zero changes of any kind there is nothing to grade, so emit
# a scored 0.0 sidecar (reason "no_code_changes") and stop with exit 0 — the
# verifier itself succeeded even though the agent earned nothing.
if [ "$UNSTAGED_COUNT" -eq 0 ] && [ "$STAGED_COUNT" -eq 0 ] && [ "$UNTRACKED_COUNT" -eq 0 ] && [ "$COMMIT_COUNT" -eq 0 ]; then
  echo "No code changes detected — agent did not execute successfully"
  write_scored_result "0.0" "no_code_changes"
  echo ""
  echo "[ ] Tests completed - Score: 0.0 (no changes)"
  exit 0
|
91 | 211 |
|
# Gating check: if compilation failed, partial credit is meaningless — emit a
# scored 0.0 sidecar (reason "build_failure") and stop. This mirrors the
# "build_ok": "gating" entry in the emitted check_weights.
if [ "$BUILD_OK" -eq 0 ]; then
  echo "Compilation failed — score set to 0.0"
  write_scored_result "0.0" "build_failure"
  echo ""
  echo "[ ] Tests completed - Score: 0.0 (build failure)"
  exit 0
|
# Convert the accumulated integer numerator back to a one-decimal score
# (awk used for portable floating point). Pass the shell value in via
# `awk -v` instead of interpolating it into the program text: interpolation
# breaks with an awk syntax error when SCORE_NUMERATOR is empty or
# malformed, and is an injection-shaped idiom besides. With -v an empty
# value coerces to 0, yielding "0.0".
SCORE=$(awk -v num="$SCORE_NUMERATOR" 'BEGIN {printf "%.1f", num / 10}')

# Emit the canonical sidecar plus the legacy reward.txt, then report.
write_scored_result "$SCORE" "completed"
echo ""
echo "[x] Tests completed - Score: $SCORE"
0 commit comments