sourcegraph
diff --git a/‎benchmarks/csb_sdlc_debug/linux-acpi-backlight-fault-001/tests/test.sh‎
Lines changed: 65 additions & 6 deletions b/‎benchmarks/csb_sdlc_debug/linux-acpi-backlight-fault-001/tests/test.sh‎
Lines changed: 65 additions & 6 deletions
diff --git a/‎benchmarks/csb_sdlc_debug/linux-hda-intel-suspend-fault-001/tests/test.sh‎
Lines changed: 65 additions & 6 deletions b/‎benchmarks/csb_sdlc_debug/linux-hda-intel-suspend-fault-001/tests/test.sh‎
Lines changed: 65 additions & 6 deletions
diff --git a/‎benchmarks/csb_sdlc_debug/linux-iwlwifi-subdevice-fault-001/tests/test.sh‎
Lines changed: 65 additions & 6 deletions b/‎benchmarks/csb_sdlc_debug/linux-iwlwifi-subdevice-fault-001/tests/test.sh‎
Lines changed: 65 additions & 6 deletions
diff --git a/‎benchmarks/csb_sdlc_debug/linux-nfs-inode-revalidate-fault-001/tests/test.sh‎
Lines changed: 65 additions & 6 deletions b/‎benchmarks/csb_sdlc_debug/linux-nfs-inode-revalidate-fault-001/tests/test.sh‎
Lines changed: 65 additions & 6 deletions
@@ -18,22 +18,80 @@ cd "$TASK_REPO_ROOT"
 # Create log directories
 mkdir -p /logs/verifier
 
+# ── validation_result sidecar ───────────────────────────────────────────────
+write_validation_result() {
+    local reward="$1"
+    local sub_scores_json="${2:-{\}}"
+    local details_json="${3:-null}"
+    local status="${4:-scored}"
+    local failure_json="${5:-null}"
+    python3 - "$reward" "$sub_scores_json" "$details_json" "$status" "$failure_json" <<'VR_PYEOF'
+import json, sys
+
+reward_str, sub_scores_raw, details_raw, status, failure_raw = sys.argv[1:6]
+try:
+    reward = float(reward_str)
+except (TypeError, ValueError):
+    reward = 0.0
+try:
+    sub_scores = json.loads(sub_scores_raw)
+except Exception:
+    sub_scores = {}
+try:
+    details = json.loads(details_raw) if details_raw != "null" else None
+except Exception:
+    details = None
+try:
+    failure = json.loads(failure_raw) if failure_raw != "null" else None
+except Exception:
+    failure = None
+
+if status != "scored":
+    failure = failure or {
+        "code": "missing_required_output",
+        "message": "Agent did not produce required output",
+        "stage": "output_validation",
+    }
+
+payload = {
+    "schema_version": "validation_result.v1alpha1",
+    "status": status,
+    "scorable": status == "scored",
+    "scorer_family": "checklist",
+    "reward": reward,
+    "pass_threshold": 0.5,
+    "passed": status == "scored" and reward >= 0.5,
+    "output_contract": {
+        "mode": "unspecified",
+        "primary_path": None,
+        "required_artifact": False,
+    },
+    "sub_scores": sub_scores,
+    "failure": failure,
+}
+if details:
+    payload["details"] = details
+
+with open("/logs/verifier/validation_result.json", "w") as f:
+    json.dump(payload, f, indent=2)
+VR_PYEOF
+}
+
+
 # Check that the agent produced a result file
 if [ ! -f "/workspace/fault_localization_result.json" ]; then
     echo "FAIL: /workspace/fault_localization_result.json not found"
     echo "0.0" > /logs/verifier/reward.txt
-    echo ""
-    echo "Tests completed - Score: 0.0 (no result file)"
-    exit 0
+write_validation_result "0.0" "{}" "null" "invalid_output"
+exit 0
 fi
 
 # Validate JSON
 if ! python3 -c "import json; json.load(open('/workspace/fault_localization_result.json'))" 2>/dev/null; then
     echo "FAIL: fault_localization_result.json is not valid JSON"
     echo "0.0" > /logs/verifier/reward.txt
-    echo ""
-    echo "Tests completed - Score: 0.0 (invalid JSON)"
-    exit 0
+write_validation_result "0.0" "{}" "null" "invalid_output"
+exit 0
 fi
 
 # ── Scoring ──────────────────────────────────────────────
@@ -163,5 +221,6 @@ fi
 FINAL_SCORE=$(awk "BEGIN {printf \"%.1f\", $SCORE / $MAX_SCORE}")
 
 echo "$FINAL_SCORE" > /logs/verifier/reward.txt
+write_validation_result "${FINAL_SCORE:-$SCORE}" "{}" "null" "scored"
 echo ""
 echo "Tests completed - Score: $FINAL_SCORE (${SCORE}/${MAX_SCORE} checks passed)"
@@ -18,22 +18,80 @@ cd "$TASK_REPO_ROOT"
 # Create log directories
 mkdir -p /logs/verifier
 
+# ── validation_result sidecar ───────────────────────────────────────────────
+write_validation_result() {
+    local reward="$1"
+    local sub_scores_json="${2:-{\}}"
+    local details_json="${3:-null}"
+    local status="${4:-scored}"
+    local failure_json="${5:-null}"
+    python3 - "$reward" "$sub_scores_json" "$details_json" "$status" "$failure_json" <<'VR_PYEOF'
+import json, sys
+
+reward_str, sub_scores_raw, details_raw, status, failure_raw = sys.argv[1:6]
+try:
+    reward = float(reward_str)
+except (TypeError, ValueError):
+    reward = 0.0
+try:
+    sub_scores = json.loads(sub_scores_raw)
+except Exception:
+    sub_scores = {}
+try:
+    details = json.loads(details_raw) if details_raw != "null" else None
+except Exception:
+    details = None
+try:
+    failure = json.loads(failure_raw) if failure_raw != "null" else None
+except Exception:
+    failure = None
+
+if status != "scored":
+    failure = failure or {
+        "code": "missing_required_output",
+        "message": "Agent did not produce required output",
+        "stage": "output_validation",
+    }
+
+payload = {
+    "schema_version": "validation_result.v1alpha1",
+    "status": status,
+    "scorable": status == "scored",
+    "scorer_family": "checklist",
+    "reward": reward,
+    "pass_threshold": 0.5,
+    "passed": status == "scored" and reward >= 0.5,
+    "output_contract": {
+        "mode": "unspecified",
+        "primary_path": None,
+        "required_artifact": False,
+    },
+    "sub_scores": sub_scores,
+    "failure": failure,
+}
+if details:
+    payload["details"] = details
+
+with open("/logs/verifier/validation_result.json", "w") as f:
+    json.dump(payload, f, indent=2)
+VR_PYEOF
+}
+
+
 # Check that the agent produced a result file
 if [ ! -f "/workspace/fault_localization_result.json" ]; then
     echo "FAIL: /workspace/fault_localization_result.json not found"
     echo "0.0" > /logs/verifier/reward.txt
-    echo ""
-    echo "Tests completed - Score: 0.0 (no result file)"
-    exit 0
+write_validation_result "0.0" "{}" "null" "invalid_output"
+exit 0
 fi
 
 # Validate JSON
 if ! python3 -c "import json; json.load(open('/workspace/fault_localization_result.json'))" 2>/dev/null; then
     echo "FAIL: fault_localization_result.json is not valid JSON"
     echo "0.0" > /logs/verifier/reward.txt
-    echo ""
-    echo "Tests completed - Score: 0.0 (invalid JSON)"
-    exit 0
+write_validation_result "0.0" "{}" "null" "invalid_output"
+exit 0
 fi
 
 # ── Scoring ──────────────────────────────────────────────
@@ -160,5 +218,6 @@ fi
 FINAL_SCORE=$(awk "BEGIN {printf \"%.1f\", $SCORE / $MAX_SCORE}")
 
 echo "$FINAL_SCORE" > /logs/verifier/reward.txt
+write_validation_result "${FINAL_SCORE:-$SCORE}" "{}" "null" "scored"
 echo ""
 echo "Tests completed - Score: $FINAL_SCORE (${SCORE}/${MAX_SCORE} checks passed)"
@@ -18,22 +18,80 @@ cd "$TASK_REPO_ROOT"
 # Create log directories
 mkdir -p /logs/verifier
 
+# ── validation_result sidecar ───────────────────────────────────────────────
+write_validation_result() {
+    local reward="$1"
+    local sub_scores_json="${2:-{\}}"
+    local details_json="${3:-null}"
+    local status="${4:-scored}"
+    local failure_json="${5:-null}"
+    python3 - "$reward" "$sub_scores_json" "$details_json" "$status" "$failure_json" <<'VR_PYEOF'
+import json, sys
+
+reward_str, sub_scores_raw, details_raw, status, failure_raw = sys.argv[1:6]
+try:
+    reward = float(reward_str)
+except (TypeError, ValueError):
+    reward = 0.0
+try:
+    sub_scores = json.loads(sub_scores_raw)
+except Exception:
+    sub_scores = {}
+try:
+    details = json.loads(details_raw) if details_raw != "null" else None
+except Exception:
+    details = None
+try:
+    failure = json.loads(failure_raw) if failure_raw != "null" else None
+except Exception:
+    failure = None
+
+if status != "scored":
+    failure = failure or {
+        "code": "missing_required_output",
+        "message": "Agent did not produce required output",
+        "stage": "output_validation",
+    }
+
+payload = {
+    "schema_version": "validation_result.v1alpha1",
+    "status": status,
+    "scorable": status == "scored",
+    "scorer_family": "checklist",
+    "reward": reward,
+    "pass_threshold": 0.5,
+    "passed": status == "scored" and reward >= 0.5,
+    "output_contract": {
+        "mode": "unspecified",
+        "primary_path": None,
+        "required_artifact": False,
+    },
+    "sub_scores": sub_scores,
+    "failure": failure,
+}
+if details:
+    payload["details"] = details
+
+with open("/logs/verifier/validation_result.json", "w") as f:
+    json.dump(payload, f, indent=2)
+VR_PYEOF
+}
+
+
 # Check that the agent produced a result file
 if [ ! -f "/workspace/fault_localization_result.json" ]; then
     echo "FAIL: /workspace/fault_localization_result.json not found"
     echo "0.0" > /logs/verifier/reward.txt
-    echo ""
-    echo "Tests completed - Score: 0.0 (no result file)"
-    exit 0
+write_validation_result "0.0" "{}" "null" "invalid_output"
+exit 0
 fi
 
 # Validate JSON
 if ! python3 -c "import json; json.load(open('/workspace/fault_localization_result.json'))" 2>/dev/null; then
     echo "FAIL: fault_localization_result.json is not valid JSON"
     echo "0.0" > /logs/verifier/reward.txt
-    echo ""
-    echo "Tests completed - Score: 0.0 (invalid JSON)"
-    exit 0
+write_validation_result "0.0" "{}" "null" "invalid_output"
+exit 0
 fi
 
 # ── Scoring ──────────────────────────────────────────────
@@ -159,5 +217,6 @@ fi
 FINAL_SCORE=$(awk "BEGIN {printf \"%.1f\", $SCORE / $MAX_SCORE}")
 
 echo "$FINAL_SCORE" > /logs/verifier/reward.txt
+write_validation_result "${FINAL_SCORE:-$SCORE}" "{}" "null" "scored"
 echo ""
 echo "Tests completed - Score: $FINAL_SCORE (${SCORE}/${MAX_SCORE} checks passed)"
@@ -18,22 +18,80 @@ cd "$TASK_REPO_ROOT"
 # Create log directories
 mkdir -p /logs/verifier
 
+# ── validation_result sidecar ───────────────────────────────────────────────
+write_validation_result() {
+    local reward="$1"
+    local sub_scores_json="${2:-{\}}"
+    local details_json="${3:-null}"
+    local status="${4:-scored}"
+    local failure_json="${5:-null}"
+    python3 - "$reward" "$sub_scores_json" "$details_json" "$status" "$failure_json" <<'VR_PYEOF'
+import json, sys
+
+reward_str, sub_scores_raw, details_raw, status, failure_raw = sys.argv[1:6]
+try:
+    reward = float(reward_str)
+except (TypeError, ValueError):
+    reward = 0.0
+try:
+    sub_scores = json.loads(sub_scores_raw)
+except Exception:
+    sub_scores = {}
+try:
+    details = json.loads(details_raw) if details_raw != "null" else None
+except Exception:
+    details = None
+try:
+    failure = json.loads(failure_raw) if failure_raw != "null" else None
+except Exception:
+    failure = None
+
+if status != "scored":
+    failure = failure or {
+        "code": "missing_required_output",
+        "message": "Agent did not produce required output",
+        "stage": "output_validation",
+    }
+
+payload = {
+    "schema_version": "validation_result.v1alpha1",
+    "status": status,
+    "scorable": status == "scored",
+    "scorer_family": "checklist",
+    "reward": reward,
+    "pass_threshold": 0.5,
+    "passed": status == "scored" and reward >= 0.5,
+    "output_contract": {
+        "mode": "unspecified",
+        "primary_path": None,
+        "required_artifact": False,
+    },
+    "sub_scores": sub_scores,
+    "failure": failure,
+}
+if details:
+    payload["details"] = details
+
+with open("/logs/verifier/validation_result.json", "w") as f:
+    json.dump(payload, f, indent=2)
+VR_PYEOF
+}
+
+
 # Check that the agent produced a result file
 if [ ! -f "/workspace/fault_localization_result.json" ]; then
     echo "FAIL: /workspace/fault_localization_result.json not found"
     echo "0.0" > /logs/verifier/reward.txt
-    echo ""
-    echo "Tests completed - Score: 0.0 (no result file)"
-    exit 0
+write_validation_result "0.0" "{}" "null" "invalid_output"
+exit 0
 fi
 
 # Validate JSON
 if ! python3 -c "import json; json.load(open('/workspace/fault_localization_result.json'))" 2>/dev/null; then
     echo "FAIL: fault_localization_result.json is not valid JSON"
     echo "0.0" > /logs/verifier/reward.txt
-    echo ""
-    echo "Tests completed - Score: 0.0 (invalid JSON)"
-    exit 0
+write_validation_result "0.0" "{}" "null" "invalid_output"
+exit 0
 fi
 
 # ── Scoring ──────────────────────────────────────────────
@@ -159,5 +217,6 @@ fi
 FINAL_SCORE=$(awk "BEGIN {printf \"%.1f\", $SCORE / $MAX_SCORE}")
 
 echo "$FINAL_SCORE" > /logs/verifier/reward.txt
+write_validation_result "${FINAL_SCORE:-$SCORE}" "{}" "null" "scored"
 echo ""
 echo "Tests completed - Score: $FINAL_SCORE (${SCORE}/${MAX_SCORE} checks passed)"