@@ -234,6 +234,7 @@ def classify_task(task_dir: Path, timeout_hours: float) -> dict:
234234 record ["error_fingerprint" ] = None
235235 record ["metrics" ] = {}
236236 record ["wall_clock_seconds" ] = None
237+ record ["timed_out" ] = False
237238 return record
238239
239240 # Parse result.json
@@ -251,6 +252,7 @@ def classify_task(task_dir: Path, timeout_hours: float) -> dict:
251252 }
252253 record ["metrics" ] = {}
253254 record ["wall_clock_seconds" ] = None
255+ record ["timed_out" ] = False
254256 return record
255257
256258 # Check for exception
@@ -276,14 +278,35 @@ def classify_task(task_dir: Path, timeout_hours: float) -> dict:
276278 record ["started_at" ] = data .get ("started_at" , "" )
277279 record ["finished_at" ] = data .get ("finished_at" , "" )
278280
279- if exception_info is not None :
281+ # Determine exception type for timeout-vs-error classification
282+ exception_type = ""
283+ if isinstance (exception_info , dict ):
284+ exception_type = exception_info .get (
285+ "exception_type" , exception_info .get ("type" , "" )
286+ )
287+
288+ if (
289+ exception_info is not None
290+ and exception_type == "AgentTimeoutError"
291+ and reward is not None
292+ ):
293+ # Agent timed out but verifier scored partial work — treat as scored result
294+ record ["timed_out" ] = True
295+ record ["error_fingerprint" ] = fingerprint_error (exception_info )
296+ if reward > 0 :
297+ record ["status" ] = "completed_pass"
298+ else :
299+ record ["status" ] = "completed_fail"
300+ elif exception_info is not None :
280301 record ["status" ] = "errored"
281302 record ["error_fingerprint" ] = fingerprint_error (exception_info )
282- elif reward is not None and reward > 0 :
283- record ["status" ] = "completed_pass"
284- record ["error_fingerprint" ] = None
303+ record ["timed_out" ] = False
285304 else :
286- record ["status" ] = "completed_fail"
305+ record ["timed_out" ] = False
306+ if reward is not None and reward > 0 :
307+ record ["status" ] = "completed_pass"
308+ else :
309+ record ["status" ] = "completed_fail"
287310 record ["error_fingerprint" ] = None
288311
289312 return record
@@ -355,6 +378,8 @@ def scan_all_tasks(
355378
356379 tasks .append (record )
357380 totals [record ["status" ]] += 1
381+ if record .get ("timed_out" ):
382+ totals ["timed_out" ] += 1
358383 by_suite [suite ][config ][record ["status" ]] += 1
359384
360385 # Accumulate error summary
@@ -592,12 +617,16 @@ def format_table(output: dict) -> str:
592617
593618 # Totals
594619 totals = output ["totals" ]
620+ timed_out_count = totals .pop ("timed_out" , 0 )
595621 total_all = sum (totals .values ())
596622 lines .append (f"TOTALS: { total_all } tasks" )
597623 for status in ("running" , "completed_pass" , "completed_fail" , "errored" , "timeout" ):
598624 count = totals .get (status , 0 )
599625 if count :
600626 lines .append (f" { status :20s} { count :>5d} " )
627+ if timed_out_count :
628+ lines .append (f" { 'timed_out (scored)' :20s} { timed_out_count :>5d} " )
629+ totals ["timed_out" ] = timed_out_count # restore for JSON output
601630 lines .append ("" )
602631
603632 # By suite/config breakdown
@@ -658,14 +687,18 @@ def format_table(output: dict) -> str:
658687
659688 # Task details (only non-pass or if few tasks)
660689 non_pass = [t for t in output ["tasks" ] if t ["status" ] != "completed_pass" ]
661- if non_pass :
662- lines .append (f"NON-PASSING TASKS ({ len (non_pass )} ):" )
663- for t in non_pass :
690+ timed_out_pass = [t for t in output ["tasks" ]
691+ if t ["status" ] == "completed_pass" and t .get ("timed_out" )]
692+ notable = non_pass + timed_out_pass
693+ if notable :
694+ lines .append (f"NON-PASSING / TIMED-OUT TASKS ({ len (notable )} ):" )
695+ for t in notable :
664696 fp_str = ""
665697 if t .get ("error_fingerprint" ):
666698 fp_str = f" [{ t ['error_fingerprint' ]['fingerprint_id' ]} ]"
667699 reward_str = f" reward={ t ['reward' ]:.2f} " if t ["reward" ] is not None else ""
668- lines .append (f" { t ['status' ]:16s} { t .get ('suite' ,'' ):20s} { t .get ('config' ,'' ):18s} { t ['task_name' ]} { reward_str } { fp_str } " )
700+ timeout_str = " [timed_out]" if t .get ("timed_out" ) else ""
701+ lines .append (f" { t ['status' ]:16s} { t .get ('suite' ,'' ):20s} { t .get ('config' ,'' ):18s} { t ['task_name' ]} { reward_str } { fp_str } { timeout_str } " )
669702
670703 return "\n " .join (lines )
671704
0 commit comments