Skip to content

Commit 53b2b86

Browse files
committed
Separate reward and pass semantics in reporting
1 parent 7c8a399 commit 53b2b86

File tree

10 files changed

+576
-27
lines changed

10 files changed

+576
-27
lines changed

docs/REPORT_CONTEXT.md

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,10 @@ The evaluation uses a multi-layer pipeline:
148148
and task rewards.
149149

150150
4. **Report generator**: Aggregates all layers into structured JSON and
151-
Markdown reports.
151+
Markdown reports. Reporting should preserve continuous `reward`,
152+
authoritative `passed` / `pass_threshold`, scorer family, and output
153+
contract separately rather than collapsing unlike verifier families into
154+
one implicitly calibrated scalar.
152155

153156
### 3.3 Scoring Types
154157

@@ -165,6 +168,12 @@ Different task categories use different verifier types:
165168
| **navigation-verified** | 0.0--1.0 | Regression proving (fail-on-buggy + pass-after-patch) |
166169
| **external** | 0.0--1.0 | TheAgentCompany tasks |
167170

171+
Canonical reporting now treats these families as separate semantic buckets.
172+
Mean reward remains useful within a family or benchmark, but aggregate views
173+
should either partition by `scorer_family` or clearly caveat mixed-family
174+
comparisons. Solved/pass status should come from verifier `passed`, not from
175+
recomputing `reward > 0`.
176+
168177
### 3.4 CodeScaleBench-Org Oracle Evaluation
169178

170179
Org tasks use a closed-world oracle system with 7 deterministic

scripts/csb_metrics/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,11 @@
55
from .extractors import extract_run_config
66
from .task_selection import (
77
load_selected_tasks,
8+
load_canonical_evaluation_audit,
89
build_task_index,
10+
build_task_contract_index,
911
enrich_runs,
12+
enrich_run_contracts,
1013
filter_runs_to_selected,
1114
)
1215

@@ -18,7 +21,10 @@
1821
"collect_retrieval_data",
1922
"extract_run_config",
2023
"load_selected_tasks",
24+
"load_canonical_evaluation_audit",
2125
"build_task_index",
26+
"build_task_contract_index",
2227
"enrich_runs",
28+
"enrich_run_contracts",
2329
"filter_runs_to_selected",
2430
]

scripts/csb_metrics/discovery.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,11 @@ def _process_task_dir(
230230
reward = extract_reward_from_file(reward_path)
231231
if reward is not None:
232232
tm.reward = reward
233-
tm.status = "passed" if reward > 0 else "failed"
233+
if tm.passed is None:
234+
tm.pass_threshold = 0.0 if tm.pass_threshold is None else tm.pass_threshold
235+
tm.passed = reward > 0
236+
if tm.status != "error":
237+
tm.status = "passed" if tm.passed else "failed"
234238

235239
# SWE-bench partial score
236240
if is_swebench:

scripts/csb_metrics/extractors.py

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,98 @@
2121
_WARNED_UNKNOWN_PRICING_MODELS: set[str] = set()
2222

2323

24+
def _coerce_float(value: object) -> Optional[float]:
25+
try:
26+
return float(value) # type: ignore[arg-type]
27+
except (TypeError, ValueError):
28+
return None
29+
30+
31+
def _coerce_bool(value: object) -> Optional[bool]:
32+
if isinstance(value, bool):
33+
return value
34+
return None
35+
36+
37+
def _infer_passed(reward: Optional[float], pass_threshold: Optional[float]) -> Optional[bool]:
38+
if reward is None:
39+
return None
40+
threshold = 0.0 if pass_threshold is None else pass_threshold
41+
if threshold <= 0.0:
42+
return reward > 0.0
43+
return reward >= threshold
44+
45+
46+
def _extract_validation_result_payload(result_json_path: Path) -> Optional[dict]:
47+
"""Load validation_result.json or compatible legacy sidecar payloads."""
48+
validation_result_path = result_json_path.parent / "verifier" / "validation_result.json"
49+
if not validation_result_path.is_file():
50+
return None
51+
try:
52+
data = json.loads(validation_result_path.read_text())
53+
except (OSError, json.JSONDecodeError):
54+
return None
55+
return data if isinstance(data, dict) else None
56+
57+
def _normalize_validation_result(payload: dict) -> dict[str, object]:
    """Normalize canonical and legacy validation_result payloads.

    Coerces the heterogeneous payload into a flat dict with a stable key set:
    continuous ``reward``, authoritative ``passed`` / ``pass_threshold``,
    scorer family, and output-contract metadata. Legacy key spellings
    (``overall_score``/``score``, ``classification``/``primary_output_path``)
    are accepted as fallbacks.
    """
    contract = payload.get("output_contract")
    if not isinstance(contract, dict):
        contract = {}

    # Reward: canonical key first, then legacy fallbacks.
    reward: Optional[float] = None
    for key in ("reward", "overall_score", "score"):
        reward = _coerce_float(payload.get(key))
        if reward is not None:
            break

    pass_threshold = _coerce_float(payload.get("pass_threshold"))

    # Explicit verifier flag wins; otherwise infer from reward/threshold.
    passed = _coerce_bool(payload.get("passed"))
    if passed is None:
        passed = _infer_passed(reward, pass_threshold)

    status = payload.get("status")
    if not isinstance(status, str):
        status = "scored" if reward is not None else None

    scorable = _coerce_bool(payload.get("scorable"))
    if scorable is None:
        scorable = (status == "scored") if status is not None else (reward is not None)

    family = payload.get("scorer_family")
    if not isinstance(family, str):
        family = None

    def _first_str(mapping: dict, *keys: str) -> Optional[str]:
        # First value among *keys* that is actually a string, else None.
        for k in keys:
            candidate = mapping.get(k)
            if isinstance(candidate, str):
                return candidate
        return None

    mode = _first_str(contract, "mode", "classification")
    primary_path = _first_str(contract, "primary_path", "primary_output_path")
    required_artifact = _coerce_bool(contract.get("required_artifact"))

    return {
        "reward": reward,
        "passed": passed,
        "pass_threshold": pass_threshold,
        "validation_status": status,
        "validation_scorable": scorable,
        "scorer_family": family,
        "output_contract_mode": mode,
        "output_contract_primary_path": primary_path,
        "output_contract_required_artifact": required_artifact,
    }
114+
115+
24116
def _parse_iso(ts: Optional[str]) -> Optional[datetime]:
25117
"""Parse an ISO 8601 timestamp, returning None on failure."""
26118
if not ts:
@@ -87,6 +179,12 @@ def extract_task_from_result_json(
87179

88180
# Reward
89181
reward = None
182+
validation_payload = _extract_validation_result_payload(path)
183+
normalized_validation = (
184+
_normalize_validation_result(validation_payload)
185+
if validation_payload is not None
186+
else None
187+
)
90188
verifier_result = data.get("verifier_result") or {}
91189
rewards = verifier_result.get("rewards") or {}
92190
for key in ("reward", "score"):
@@ -96,13 +194,35 @@ def extract_task_from_result_json(
96194
except (TypeError, ValueError):
97195
continue
98196
break
197+
if normalized_validation and normalized_validation.get("reward") is not None:
198+
reward = normalized_validation["reward"] # type: ignore[assignment]
199+
200+
passed = None
201+
pass_threshold = None
202+
scorer_family = None
203+
output_contract_mode = None
204+
output_contract_primary_path = None
205+
output_contract_required_artifact = None
206+
validation_status = None
207+
validation_scorable = None
208+
if normalized_validation:
209+
passed = normalized_validation.get("passed")
210+
pass_threshold = normalized_validation.get("pass_threshold")
211+
scorer_family = normalized_validation.get("scorer_family")
212+
output_contract_mode = normalized_validation.get("output_contract_mode")
213+
output_contract_primary_path = normalized_validation.get("output_contract_primary_path")
214+
output_contract_required_artifact = normalized_validation.get("output_contract_required_artifact")
215+
validation_status = normalized_validation.get("validation_status")
216+
validation_scorable = normalized_validation.get("validation_scorable")
99217

100218
# Status — agent timeouts are scored normally (verifier runs on partial work)
101219
exc = data.get("exception_info") or {}
102220
exc_type = exc.get("exception_type", exc.get("type", "")) if isinstance(exc, dict) else ""
103221
timed_out = bool(exc and exc_type == "AgentTimeoutError")
104222
if exc and not timed_out:
105223
status = "error"
224+
elif isinstance(passed, bool):
225+
status = "passed" if passed else "failed"
106226
elif reward is not None:
107227
status = "passed" if reward > 0 else "failed"
108228
else:
@@ -162,7 +282,25 @@ def extract_task_from_result_json(
162282
benchmark=benchmark,
163283
config_name=config_name,
164284
reward=reward,
285+
passed=passed if isinstance(passed, bool) else None,
286+
pass_threshold=pass_threshold if isinstance(pass_threshold, float) else None,
165287
status=status,
288+
scorer_family=scorer_family if isinstance(scorer_family, str) else None,
289+
output_contract_mode=output_contract_mode if isinstance(output_contract_mode, str) else None,
290+
output_contract_primary_path=(
291+
output_contract_primary_path
292+
if isinstance(output_contract_primary_path, str)
293+
else None
294+
),
295+
output_contract_required_artifact=(
296+
output_contract_required_artifact
297+
if isinstance(output_contract_required_artifact, bool)
298+
else None
299+
),
300+
validation_status=validation_status if isinstance(validation_status, str) else None,
301+
validation_scorable=(
302+
validation_scorable if isinstance(validation_scorable, bool) else None
303+
),
166304
timed_out=timed_out,
167305
wall_clock_seconds=wall_clock,
168306
agent_execution_seconds=agent_execution_seconds,

scripts/csb_metrics/models.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,15 @@ class TaskMetrics:
2424
# Scoring
2525
reward: Optional[float] = None
2626
partial_score: Optional[float] = None
27+
passed: Optional[bool] = None
28+
pass_threshold: Optional[float] = None
2729
status: str = "unknown" # passed / failed / error
30+
scorer_family: Optional[str] = None
31+
output_contract_mode: Optional[str] = None
32+
output_contract_primary_path: Optional[str] = None
33+
output_contract_required_artifact: Optional[bool] = None
34+
validation_status: Optional[str] = None
35+
validation_scorable: Optional[bool] = None
2836

2937
# LLM Judge (optional — separate from verifier reward)
3038
judge_score: Optional[float] = None
@@ -155,10 +163,16 @@ def mean_partial_score(self) -> Optional[float]:
155163

156164
@property
157165
def pass_rate(self) -> Optional[float]:
158-
scored = [t for t in self.tasks if t.status in ("passed", "failed")]
166+
scored = [
167+
t for t in self.tasks
168+
if t.passed is not None or t.status in ("passed", "failed")
169+
]
159170
if not scored:
160171
return None
161-
return sum(1 for t in scored if t.status == "passed") / len(scored)
172+
return sum(
173+
1 for t in scored
174+
if t.passed is True or (t.passed is None and t.status == "passed")
175+
) / len(scored)
162176

163177
@property
164178
def mean_judge_score(self) -> Optional[float]:

scripts/csb_metrics/task_selection.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,11 @@ def load_selected_tasks(path: str | Path) -> dict:
4747
return json.loads(Path(path).read_text())
4848

4949

50+
def load_canonical_evaluation_audit(path: str | Path) -> dict:
51+
"""Load configs/canonical_evaluation_audit.json."""
52+
return json.loads(Path(path).read_text())
53+
54+
5055
def build_task_index(selection: dict) -> dict[str, dict]:
5156
"""Build a task_id → task metadata lookup from the selection document.
5257
@@ -77,6 +82,19 @@ def build_task_index(selection: dict) -> dict[str, dict]:
7782
return index
7883

7984

def build_task_contract_index(audit: dict) -> dict[str, dict]:
    """Build a task_id → canonical evaluation contract lookup.

    Prefixed ids (``csb_``/``ccb_``) are also registered under their bare
    form so either spelling resolves; an explicit entry for the bare id is
    never clobbered by a derived alias.
    """
    index: dict[str, dict] = {}
    for task in audit.get("tasks", []):
        tid = _normalize_task_id(task["task_id"])
        index[tid] = task
        for prefix in ("csb_", "ccb_"):
            if tid.startswith(prefix):
                index.setdefault(tid[len(prefix):], task)
    return index
96+
97+
8098
def enrich_task_metrics(
8199
tm: TaskMetrics,
82100
task_index: dict[str, dict],
@@ -104,6 +122,27 @@ def enrich_task_metrics(
104122
tm.task_files_count = meta.get("files_count")
105123

106124

def enrich_task_contract_metrics(
    tm: TaskMetrics,
    contract_index: dict[str, dict],
) -> None:
    """Enrich TaskMetrics with canonical scorer-family and output-contract metadata.

    No-op when the task has no entry in *contract_index*. Only fills fields
    that are still None — values already extracted from run artifacts win.
    """
    contract = contract_index.get(_normalize_task_id(tm.task_id))
    if contract is None:
        return

    plan = contract.get("validation_result_plan") or {}
    evaluator = contract.get("evaluator") or {}
    output_contract = contract.get("output_contract") or {}

    if tm.scorer_family is None:
        tm.scorer_family = plan.get("scorer_family") or evaluator.get("family")
    if tm.output_contract_mode is None:
        tm.output_contract_mode = output_contract.get("classification")
    if tm.output_contract_primary_path is None:
        tm.output_contract_primary_path = output_contract.get("primary_output_path")
144+
145+
107146
def enrich_runs(
108147
runs: list[RunMetrics],
109148
task_index: dict[str, dict],
@@ -119,6 +158,16 @@ def enrich_runs(
119158
enrich_task_metrics(tm, task_index)
120159

121160

def enrich_run_contracts(
    runs: list[RunMetrics],
    contract_index: dict[str, dict],
) -> None:
    """Enrich all TaskMetrics within a list of RunMetrics with contract metadata."""
    for tm in (task for run in runs for task in run.tasks):
        enrich_task_contract_metrics(tm, contract_index)
169+
170+
122171
def filter_runs_to_selected(
123172
runs: list[RunMetrics],
124173
task_index: dict[str, dict],

0 commit comments

Comments
 (0)