2121_WARNED_UNKNOWN_PRICING_MODELS : set [str ] = set ()
2222
2323
def _coerce_float(value: object) -> Optional[float]:
    """Convert *value* to a float, returning None when that is not possible.

    Args:
        value: Any object; numbers and numeric strings convert, everything
            else (including None) does not.

    Returns:
        ``float(value)`` on success, otherwise None.
    """
    try:
        return float(value)  # type: ignore[arg-type]
    except (TypeError, ValueError, OverflowError):
        # OverflowError: an int too large to fit in a float. JSON permits
        # arbitrary-size integers, so treat it like any other unusable value
        # instead of letting it escape this best-effort helper.
        return None
29+
30+
def _coerce_bool(value: object) -> Optional[bool]:
    """Pass a genuine ``bool`` through unchanged; map anything else to None.

    No truthiness conversion is performed: ints, strings and other objects
    all yield None.
    """
    return value if isinstance(value, bool) else None
35+
36+
def _infer_passed(reward: Optional[float], pass_threshold: Optional[float]) -> Optional[bool]:
    """Derive a pass/fail verdict from a reward and an optional threshold.

    Returns None when there is no reward. A missing, zero or negative
    threshold means "any strictly positive reward passes"; otherwise the
    reward must be at least the threshold.
    """
    if reward is None:
        return None
    lacks_positive_threshold = pass_threshold is None or pass_threshold <= 0.0
    if lacks_positive_threshold:
        return reward > 0.0
    return reward >= pass_threshold
44+
45+
def _extract_validation_result_payload(result_json_path: Path) -> Optional[dict]:
    """Load the ``verifier/validation_result.json`` sidecar of a result file.

    Args:
        result_json_path: Path to a result JSON file. The sidecar is looked
            up at ``<result_json_path.parent>/verifier/validation_result.json``.

    Returns:
        The decoded JSON object when the sidecar exists, can be read and
        decoded, and its top level is a dict; otherwise None.
    """
    validation_result_path = result_json_path.parent / "verifier" / "validation_result.json"
    if not validation_result_path.is_file():
        return None
    try:
        # Read as UTF-8 explicitly so decoding does not depend on the locale.
        raw_text = validation_result_path.read_text(encoding="utf-8")
        data = json.loads(raw_text)
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError,
        # so a badly encoded sidecar is ignored rather than crashing the caller.
        return None
    return data if isinstance(data, dict) else None
56+
57+
def _normalize_validation_result(payload: dict) -> dict[str, object]:
    """Flatten a validation_result payload into a fixed set of keys.

    Alternate key names are accepted as fallbacks: ``overall_score`` then
    ``score`` for the reward, ``classification`` for the output-contract
    mode, and ``primary_output_path`` for the primary path. Values of the
    wrong type are reported as None. When ``passed``, ``status`` or
    ``scorable`` are absent or mistyped they are derived from the reward.
    """

    def _first_str(mapping: dict, *keys: str) -> Optional[str]:
        # Return the first value under *keys* that is a str, else None.
        for key in keys:
            candidate = mapping.get(key)
            if isinstance(candidate, str):
                return candidate
        return None

    contract = payload.get("output_contract")
    if not isinstance(contract, dict):
        contract = {}

    # First key that yields a usable number wins.
    reward = None
    for reward_key in ("reward", "overall_score", "score"):
        reward = _coerce_float(payload.get(reward_key))
        if reward is not None:
            break

    pass_threshold = _coerce_float(payload.get("pass_threshold"))

    # An explicit boolean verdict takes priority over one inferred from reward.
    passed = _coerce_bool(payload.get("passed"))
    if passed is None:
        passed = _infer_passed(reward, pass_threshold)

    status = payload.get("status")
    if not isinstance(status, str):
        status = None if reward is None else "scored"

    scorable = _coerce_bool(payload.get("scorable"))
    if scorable is None:
        if status is None:
            scorable = reward is not None
        else:
            scorable = status == "scored"

    return {
        "reward": reward,
        "passed": passed,
        "pass_threshold": pass_threshold,
        "validation_status": status,
        "validation_scorable": scorable,
        "scorer_family": _first_str(payload, "scorer_family"),
        "output_contract_mode": _first_str(contract, "mode", "classification"),
        "output_contract_primary_path": _first_str(
            contract, "primary_path", "primary_output_path"
        ),
        "output_contract_required_artifact": _coerce_bool(
            contract.get("required_artifact")
        ),
    }
114+
115+
24116def _parse_iso (ts : Optional [str ]) -> Optional [datetime ]:
25117 """Parse an ISO 8601 timestamp, returning None on failure."""
26118 if not ts :
@@ -87,6 +179,12 @@ def extract_task_from_result_json(
87179
88180 # Reward
89181 reward = None
182+ validation_payload = _extract_validation_result_payload (path )
183+ normalized_validation = (
184+ _normalize_validation_result (validation_payload )
185+ if validation_payload is not None
186+ else None
187+ )
90188 verifier_result = data .get ("verifier_result" ) or {}
91189 rewards = verifier_result .get ("rewards" ) or {}
92190 for key in ("reward" , "score" ):
@@ -96,13 +194,35 @@ def extract_task_from_result_json(
96194 except (TypeError , ValueError ):
97195 continue
98196 break
197+ if normalized_validation and normalized_validation .get ("reward" ) is not None :
198+ reward = normalized_validation ["reward" ] # type: ignore[assignment]
199+
200+ passed = None
201+ pass_threshold = None
202+ scorer_family = None
203+ output_contract_mode = None
204+ output_contract_primary_path = None
205+ output_contract_required_artifact = None
206+ validation_status = None
207+ validation_scorable = None
208+ if normalized_validation :
209+ passed = normalized_validation .get ("passed" )
210+ pass_threshold = normalized_validation .get ("pass_threshold" )
211+ scorer_family = normalized_validation .get ("scorer_family" )
212+ output_contract_mode = normalized_validation .get ("output_contract_mode" )
213+ output_contract_primary_path = normalized_validation .get ("output_contract_primary_path" )
214+ output_contract_required_artifact = normalized_validation .get ("output_contract_required_artifact" )
215+ validation_status = normalized_validation .get ("validation_status" )
216+ validation_scorable = normalized_validation .get ("validation_scorable" )
99217
100218 # Status — agent timeouts are scored normally (verifier runs on partial work)
101219 exc = data .get ("exception_info" ) or {}
102220 exc_type = exc .get ("exception_type" , exc .get ("type" , "" )) if isinstance (exc , dict ) else ""
103221 timed_out = bool (exc and exc_type == "AgentTimeoutError" )
104222 if exc and not timed_out :
105223 status = "error"
224+ elif isinstance (passed , bool ):
225+ status = "passed" if passed else "failed"
106226 elif reward is not None :
107227 status = "passed" if reward > 0 else "failed"
108228 else :
@@ -162,7 +282,25 @@ def extract_task_from_result_json(
162282 benchmark = benchmark ,
163283 config_name = config_name ,
164284 reward = reward ,
285+ passed = passed if isinstance (passed , bool ) else None ,
286+ pass_threshold = pass_threshold if isinstance (pass_threshold , float ) else None ,
165287 status = status ,
288+ scorer_family = scorer_family if isinstance (scorer_family , str ) else None ,
289+ output_contract_mode = output_contract_mode if isinstance (output_contract_mode , str ) else None ,
290+ output_contract_primary_path = (
291+ output_contract_primary_path
292+ if isinstance (output_contract_primary_path , str )
293+ else None
294+ ),
295+ output_contract_required_artifact = (
296+ output_contract_required_artifact
297+ if isinstance (output_contract_required_artifact , bool )
298+ else None
299+ ),
300+ validation_status = validation_status if isinstance (validation_status , str ) else None ,
301+ validation_scorable = (
302+ validation_scorable if isinstance (validation_scorable , bool ) else None
303+ ),
166304 timed_out = timed_out ,
167305 wall_clock_seconds = wall_clock ,
168306 agent_execution_seconds = agent_execution_seconds ,
0 commit comments