Skip to content

Commit d89d2e9

Browse files
committed
Migrate final feature verifiers to validation_result
1 parent 8e9990f commit d89d2e9

File tree

6 files changed

+640
-66
lines changed

6 files changed

+640
-66
lines changed

benchmarks/csb_sdlc_feature/bustub-hyperloglog-impl-001/tests/test.sh

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ set -e
99

1010
TRAJECTORY_PATH="${TRAJECTORY_PATH:-/logs/trajectory.jsonl}"
1111
OUTPUT_PATH="/logs/tac_result.json"
12+
PASS_THRESHOLD="1.0"
1213

1314
if [ ! -f "$TRAJECTORY_PATH" ]; then
1415
echo '[]' > "$TRAJECTORY_PATH"
@@ -33,6 +34,118 @@ python_default /utils/eval.py \
3334

3435
mkdir -p /logs/verifier
3536

37+
# Write the canonical validation_result.v1alpha1 sidecar for this verifier.
# Reads the TAC result payload from $OUTPUT_PATH, derives a reward and
# per-checkpoint sub-scores, and writes /logs/verifier/validation_result.json.
write_validation_result() {
    python3 - "$OUTPUT_PATH" "$PASS_THRESHOLD" <<'PYEOF'
import json
import re
import sys

result_path, pass_threshold = sys.argv[1:3]
threshold = float(pass_threshold)
status = "scored"
reward = 0.0
result_payload = {}

try:
    with open(result_path) as f:
        result_payload = json.load(f)
except FileNotFoundError:
    status = "verifier_error"
    result_payload = {"error": f"Missing TAC result payload at {result_path}"}
except Exception as exc:
    status = "verifier_error"
    result_payload = {"error": f"Failed to parse TAC result payload: {exc}"}

# Guard against a syntactically valid but non-object payload (e.g. a bare
# JSON list or string).  Without this, every .get() below raises
# AttributeError, the heredoc exits non-zero, and the surrounding script
# (which runs under `set -e`) aborts with no sidecar written at all.
if not isinstance(result_payload, dict):
    status = "verifier_error"
    result_payload = {
        "error": f"TAC payload is not a JSON object: {type(result_payload).__name__}"
    }

if status == "scored":
    final_score = result_payload.get("final_score")
    if isinstance(final_score, dict):
        # Preferred shape: {"final_score": {"result": X, "total": Y}}.
        total = final_score.get("total", 0) or 0
        result = final_score.get("result", 0) or 0
        try:
            reward = round(float(result) / float(total), 4) if float(total) > 0 else 0.0
        except (TypeError, ValueError, ZeroDivisionError):
            reward = 0.0
            status = "verifier_error"
    elif "score" in result_payload:
        # Fallback shape: a flat {"score": X} payload.
        try:
            reward = float(result_payload.get("score", 0.0) or 0.0)
        except (TypeError, ValueError):
            reward = 0.0
            status = "verifier_error"
    else:
        status = "verifier_error"
        result_payload.setdefault("error", "TAC payload did not include final_score or score")

# Any recorded error forces the result into the non-scorable path.
if result_payload.get("error"):
    status = "verifier_error"
    reward = 0.0

def slugify(value: str) -> str:
    """Normalize a checkpoint name into a snake_case sub-score key."""
    value = re.sub(r"[^a-z0-9]+", "_", value.lower()).strip("_")
    return value or "unnamed_check"

# Collect per-checkpoint sub-scores; tolerate checkpoints that use
# "score", a boolean "passed", or "result" to convey their value.
checks = {}
checkpoints = result_payload.get("checkpoints", [])
if not isinstance(checkpoints, list):
    checkpoints = []
for index, checkpoint in enumerate(checkpoints, start=1):
    if isinstance(checkpoint, dict):
        raw_name = (
            checkpoint.get("name")
            or checkpoint.get("title")
            or checkpoint.get("description")
            or checkpoint.get("id")
            or f"checkpoint_{index}"
        )
        key = slugify(str(raw_name))
        value = checkpoint.get("score")
        if value is None and isinstance(checkpoint.get("passed"), bool):
            value = 1.0 if checkpoint["passed"] else 0.0
        if value is None:
            value = checkpoint.get("result")
        try:
            checks[key] = float(value)
        except (TypeError, ValueError):
            # Skip checkpoints whose value cannot be coerced to a float.
            continue

failure = None
if status != "scored":
    failure = {
        "code": "verifier_exception",
        "message": str(result_payload.get("error") or "TAC verifier did not produce a usable result"),
        "stage": "scoring",
    }

payload = {
    "schema_version": "validation_result.v1alpha1",
    "status": status,
    "scorable": status == "scored",
    "scorer_family": "checklist",
    "reward": reward,
    "pass_threshold": threshold,
    "passed": status == "scored" and reward >= threshold,
    "output_contract": {
        "mode": "unspecified",
        "primary_path": None,
        "required_artifact": False,
    },
    "sub_scores": {"checks": checks} if checks else {},
    "failure": failure,
    "legacy": {
        # Preserve the raw TAC payload for downstream consumers.
        "reward_json": result_payload,
    },
}

# Attach optional details only when the payload actually carried them.
details = {}
if isinstance(result_payload.get("final_score"), dict):
    details["final_score"] = result_payload["final_score"]
if result_payload.get("checkpoints") is not None:
    details["checkpoints"] = result_payload.get("checkpoints")
if details:
    payload["details"] = details

with open("/logs/verifier/validation_result.json", "w") as f:
    json.dump(payload, f, indent=2)
PYEOF
}
148+
36149
if [ -f "$OUTPUT_PATH" ]; then
37150
SCORE=$(python3 -c "
38151
import json
@@ -49,8 +162,17 @@ else:
49162
echo "TAC Score: $SCORE"
50163
echo "$SCORE" > /logs/verifier/reward.txt
51164
cp "$OUTPUT_PATH" /logs/verifier/reward.json 2>/dev/null || true
165+
write_validation_result
52166
exit 0
53167
else
54168
echo "0.0" > /logs/verifier/reward.txt
169+
python3 - <<'PYEOF'
170+
import json
171+
172+
fallback = {"score": 0.0, "error": "TAC evaluator did not produce /logs/tac_result.json"}
173+
with open("/logs/verifier/reward.json", "w") as f:
174+
json.dump(fallback, f, indent=2)
175+
PYEOF
176+
write_validation_result
55177
exit 0
56178
fi

benchmarks/csb_sdlc_feature/k8s-noschedule-taint-feat-001/tests/test.sh

Lines changed: 127 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@
66
# 2. Tests for the new effect pass
77
# 3. No regressions in existing taint logic
88
#
9-
# NOTE: A reward.json file exists alongside this task defining additional
10-
# manual evaluation criteria (e.g., architecture_understanding, code_quality).
11-
# These criteria are NOT automatically scored by this script and would require
12-
# an LLM judge to evaluate. This script handles automated scoring only.
9+
# NOTE: This task also ships a static rubric JSON file with additional manual
10+
# evaluation criteria (e.g., architecture_understanding, code_quality).
11+
# Those rubric criteria are descriptive only; this verifier scores automated
12+
# repo-state checks and emits the canonical validation_result sidecar.
1313

1414
set -e
1515

@@ -23,17 +23,137 @@ fi
2323
TASK_WORKDIR="${TASK_WORKDIR:-/workspace}"
2424
TASK_REPO_ROOT="${TASK_REPO_ROOT:-${VERIFY_REPO:-$TASK_WORKDIR}}"
2525
VERIFY_REPO="${VERIFY_REPO:-$TASK_REPO_ROOT}"
26+
TASK_OUTPUT="${TASK_OUTPUT:-/workspace/answer.json}"
27+
PASS_THRESHOLD="0.7"
28+
ARTIFACT_REQUIRED=false
29+
if [ "${ARTIFACT_ONLY:-false}" = "true" ]; then
30+
ARTIFACT_REQUIRED=true
31+
fi
2632

2733
cd "$TASK_REPO_ROOT"
2834

2935
# Create log directories
3036
mkdir -p /logs/verifier
3137

38+
TAINT_ADDED=0
39+
CHANGES_MADE=0
40+
TESTS_ADDED=0
41+
BUILD_OK=0
42+
UNIT_TEST_PASS=0
43+
44+
# Emit an invalid_output validation_result sidecar and a zero reward.
# $1 = machine-readable failure code, $2 = human-readable failure message.
write_invalid_output() {
    local fail_code="$1"
    local fail_message="$2"
    python3 - "$fail_code" "$fail_message" "$TASK_OUTPUT" "$ARTIFACT_REQUIRED" "$PASS_THRESHOLD" <<'PYEOF'
import json
import sys

# Positional arguments: code, message, primary output path,
# artifact-required flag ("true"/"false"), and the pass threshold.
argv = sys.argv
code = argv[1]
message = argv[2]
primary_path = argv[3]
required_artifact = argv[4] == "true"
threshold = float(argv[5])

contract = {
    "mode": "answer_json_bridge",
    "primary_path": primary_path,
    "required_artifact": required_artifact,
}
failure = {
    "code": code,
    "message": message,
    "stage": "output_validation",
}
payload = {
    "schema_version": "validation_result.v1alpha1",
    "status": "invalid_output",
    "scorable": False,
    "scorer_family": "repo_state_heuristic",
    "reward": 0.0,
    "pass_threshold": threshold,
    "passed": False,
    "output_contract": contract,
    "sub_scores": {},
    "failure": failure,
}
with open("/logs/verifier/validation_result.json", "w") as f:
    json.dump(payload, f, indent=2)
PYEOF
    echo "0.0" > /logs/verifier/reward.txt
}
77+
78+
# Emit a scored validation_result sidecar and mirror the reward into
# reward.txt.  $1 = numeric score in [0, 1], $2 = optional reason string.
#
# Fix: the per-check variables (TAINT_ADDED, CHANGES_MADE, TESTS_ADDED,
# UNIT_TEST_PASS, BUILD_OK) are plain shell variables that were never
# exported, so the python3 child process could not see them and every
# sub-score was always reported as 0.0.  They are now passed explicitly in
# the command environment alongside the change-detection counters.
write_scored_result() {
    local score="$1"
    local reason="${2:-}"
    VALIDATION_SCORE="$score" \
    VALIDATION_REASON="$reason" \
    TAINT_ADDED="${TAINT_ADDED:-0}" \
    CHANGES_MADE="${CHANGES_MADE:-0}" \
    TESTS_ADDED="${TESTS_ADDED:-0}" \
    UNIT_TEST_PASS="${UNIT_TEST_PASS:-0}" \
    BUILD_OK="${BUILD_OK:-0}" \
    UNSTAGED_COUNT="${UNSTAGED_COUNT:-0}" \
    STAGED_COUNT="${STAGED_COUNT:-0}" \
    UNTRACKED_COUNT="${UNTRACKED_COUNT:-0}" \
    COMMIT_COUNT="${COMMIT_COUNT:-0}" \
    python3 - "$TASK_OUTPUT" "$ARTIFACT_REQUIRED" "$PASS_THRESHOLD" <<'PYEOF'
import json
import os
import sys

primary_path, required_artifact, pass_threshold = sys.argv[1:4]
reward = float(os.environ.get("VALIDATION_SCORE", "0.0"))
threshold = float(pass_threshold)
# Binary per-check indicators, forwarded from the shell environment above.
checks = {
    "taint_added": float(os.environ.get("TAINT_ADDED", "0") or 0),
    "changes_made": float(os.environ.get("CHANGES_MADE", "0") or 0),
    "tests_added": float(os.environ.get("TESTS_ADDED", "0") or 0),
    "unit_test_pass": float(os.environ.get("UNIT_TEST_PASS", "0") or 0),
    "build_ok": float(os.environ.get("BUILD_OK", "0") or 0),
}
details = {
    # Static weights documenting how the shell computes the final score;
    # build_ok is a gate (score forced to 0.0 on failure), not a weight.
    "check_weights": {
        "taint_added": 0.3,
        "changes_made": 0.2,
        "tests_added": 0.2,
        "unit_test_pass": 0.3,
        "build_ok": "gating",
    },
    "change_detection": {
        "unstaged": int(os.environ.get("UNSTAGED_COUNT", "0") or 0),
        "staged": int(os.environ.get("STAGED_COUNT", "0") or 0),
        "untracked": int(os.environ.get("UNTRACKED_COUNT", "0") or 0),
        "commits": int(os.environ.get("COMMIT_COUNT", "0") or 0),
    },
}
reason = os.environ.get("VALIDATION_REASON")
if reason:
    details["reason"] = reason
payload = {
    "schema_version": "validation_result.v1alpha1",
    "status": "scored",
    "scorable": True,
    "scorer_family": "repo_state_heuristic",
    "reward": reward,
    "pass_threshold": threshold,
    "passed": reward >= threshold,
    "output_contract": {
        "mode": "answer_json_bridge",
        "primary_path": primary_path,
        "required_artifact": required_artifact == "true",
    },
    "sub_scores": {
        "checks": checks,
    },
    "failure": None,
    "details": details,
}
with open("/logs/verifier/validation_result.json", "w") as f:
    json.dump(payload, f, indent=2)
PYEOF
    echo "$score" > /logs/verifier/reward.txt
}
144+
32145
# Fix git safe.directory: the repo was cloned as root during Docker build,
33146
# but the verifier may run as a different user. Without this, all git
34147
# commands silently fail due to CVE-2022-24765 ownership checks.
35148
git config --global --add safe.directory "$TASK_REPO_ROOT" 2>/dev/null || true
36149

150+
if [ "${ARTIFACT_ONLY:-false}" = "true" ] && [ ! -f "${ANSWER_JSON:-$TASK_OUTPUT}" ]; then
151+
echo "Required answer.json artifact missing at ${ANSWER_JSON:-$TASK_OUTPUT}"
152+
write_invalid_output "missing_required_output" \
153+
"answer.json not found at ${ANSWER_JSON:-$TASK_OUTPUT}"
154+
exit 0
155+
fi
156+
37157
# Guard: if no code changes were made, the agent didn't execute successfully
38158
# Check unstaged changes, staged changes, untracked files, AND new commits
39159
UNSTAGED_COUNT=$(git diff --stat 2>/dev/null | wc -l)
@@ -63,7 +183,7 @@ fi
63183
echo "Change detection: unstaged=$UNSTAGED_COUNT staged=$STAGED_COUNT untracked=$UNTRACKED_COUNT commits=$COMMIT_COUNT (origin_ref=${ORIGIN_REF:-none})"
64184
if [ "$UNSTAGED_COUNT" -eq 0 ] && [ "$STAGED_COUNT" -eq 0 ] && [ "$UNTRACKED_COUNT" -eq 0 ] && [ "$COMMIT_COUNT" -eq 0 ]; then
65185
echo "No code changes detected — agent did not execute successfully"
66-
echo "0.0" > /logs/verifier/reward.txt
186+
write_scored_result "0.0" "no_code_changes"
67187
echo ""
68188
echo "[ ] Tests completed - Score: 0.0 (no changes)"
69189
exit 0
@@ -91,7 +211,7 @@ fi
91211

92212
if [ "$BUILD_OK" -eq 0 ]; then
93213
echo "Compilation failed — score set to 0.0"
94-
echo "0.0" > /logs/verifier/reward.txt
214+
write_scored_result "0.0" "build_failure"
95215
echo ""
96216
echo "[ ] Tests completed - Score: 0.0 (build failure)"
97217
exit 0
@@ -178,6 +298,6 @@ fi
178298
# Convert back to decimal (using awk for portable floating point)
179299
SCORE=$(awk "BEGIN {printf \"%.1f\", $SCORE_NUMERATOR / 10}")
180300

181-
echo "$SCORE" > /logs/verifier/reward.txt
301+
write_scored_result "$SCORE" "completed"
182302
echo ""
183303
echo "[x] Tests completed - Score: $SCORE"

0 commit comments

Comments (0)