Skip to content

Commit 96892dc

Browse files
sjarmak and claude committed
feat: US-021 - Rubric judge integration for hybrid tasks
- Add RUBRIC_CRITERIA_PROMPT to judge/prompts.py
- Add evaluate_with_criteria() method to LLMJudge engine
- Add _format_criteria() and _parse_criteria_scores() helpers
- Extend JudgeResult with criteria_scores, rubric_score, hybrid_composite fields
- Add --hybrid and --hybrid-weight flags to run_judge.py
- Auto-detect tests/criteria.json for criteria-bearing tasks
- Hybrid composite = verifier_weight * verifier_reward + (1-w) * rubric_score
- Backwards-compatible: hybrid fields absent when criteria.json not found
- Add ccb_mcp_* suite prefix mappings to run_judge.py DIR_PREFIX_TO_SUITE

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 38092ed commit 96892dc

File tree

7 files changed

+278
-3
lines changed

7 files changed

+278
-3
lines changed

ralph-mcp-unique/prd.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -619,7 +619,7 @@
619619
"Backwards compatible for tasks without criteria.json"
620620
],
621621
"priority": 21,
622-
"passes": false,
622+
"passes": true,
623623
"notes": "Only E and J-family DS tasks need rubric judging initially. Most tasks are fully deterministic."
624624
},
625625
{

ralph-mcp-unique/progress.txt

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -540,3 +540,30 @@
540540
- Table builders are None-returning optional functions — consistent with other optional tables (swebench_partial, search_patterns, etc.)
541541
- Collecting retrieval data is a separate pass over the runs dir (not integrated into task discovery) to avoid schema changes to TaskMetrics
542542
---
543+
[2026-02-20 21:44:43 UTC] Iteration 11 no story markers found
544+
[2026-02-20 21:44:43 UTC] Iteration 11 complete
545+
[2026-02-20 21:44:45 UTC] Iteration 12 started
546+
547+
## 2026-02-20 - US-021: Rubric judge integration for hybrid tasks
548+
- Extended `scripts/run_judge.py` with `--hybrid` and `--hybrid-weight` flags
549+
- Added `_find_criteria_json(task_id, benchmark, benchmarks_dir)` helper (normalizes slug, handles case)
550+
- Added `_load_criteria_json(path)` helper
551+
- Added `RUBRIC_CRITERIA_PROMPT` to `scripts/ccb_metrics/judge/prompts.py`
552+
- Added `_format_criteria(criteria)` and `_parse_criteria_scores(response, criteria)` helpers to `engine.py`
553+
- Added `evaluate_with_criteria(judge_input, criteria)` method to `LLMJudge` class
554+
- Updated `JudgeResult` dataclass with `criteria_scores`, `rubric_score`, `hybrid_composite`, `verifier_weight` fields
555+
- `to_dict()` conditionally includes hybrid fields (backwards-compatible: absent when criteria not used)
556+
- `_run_single_task()` runs rubric scoring as non-fatal optional step in hybrid mode
557+
- Added `ccb_mcp_*` prefix mappings to `run_judge.py`'s `DIR_PREFIX_TO_SUITE`
558+
- 8 smoke tests all pass; py_compile succeeds for all 5 modified files
559+
- Repo health check: PASSED
560+
561+
- Files changed: `scripts/ccb_metrics/judge/prompts.py`, `scripts/ccb_metrics/judge/models.py`, `scripts/ccb_metrics/judge/engine.py`, `scripts/ccb_metrics/judge/__init__.py`, `scripts/run_judge.py`, `ralph-mcp-unique/prd.json`, `ralph-mcp-unique/progress.txt`
562+
- **Learnings for future iterations:**
563+
- `_parse_criteria_scores` normalizes each criterion score to [0, max_score] before computing normalized mean
564+
- Rubric scoring is non-fatal in `_run_single_task`: catch exception, log warning, continue without rubric
565+
- `to_dict()` uses conditional inclusion (if self.criteria_scores:) for backwards compat — no schema change needed
566+
- `_find_criteria_json` handles slug normalization (underscore↔hyphen, case) for robust lookup
567+
- hybrid_composite uses configurable verifier_weight (default 0.6/0.4 split) validated in main()
568+
- Composite formula: composite = verifier_weight * verifier_reward + (1 - verifier_weight) * rubric_score
569+
---

scripts/ccb_metrics/judge/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,13 @@
1010

1111
from .engine import LLMJudge
1212
from .models import JudgeInput, JudgeResult, OracleBundle, normalize_score
13+
from .prompts import RUBRIC_CRITERIA_PROMPT
1314

1415
# Public names re-exported at the judge package level.
# RUBRIC_CRITERIA_PROMPT is exported so callers (e.g. run_judge.py) can
# reference the hybrid-rubric template without importing prompts directly.
__all__ = [
    "LLMJudge",
    "JudgeInput",
    "JudgeResult",
    "OracleBundle",
    "normalize_score",
    "RUBRIC_CRITERIA_PROMPT",
]

scripts/ccb_metrics/judge/engine.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
DIRECT_REVIEW_PROMPT,
2424
REFERENCE_COMPLETENESS_PROMPT,
2525
REFERENCE_CORRECTNESS_PROMPT,
26+
RUBRIC_CRITERIA_PROMPT,
2627
)
2728

2829
# ---------------------------------------------------------------------------
@@ -84,6 +85,56 @@ def _render_prompt(template: str, judge_input: JudgeInput) -> str:
8485
)
8586

8687

88+
def _format_criteria(criteria: list[dict]) -> str:
89+
"""Format a criteria list for inclusion in the rubric prompt."""
90+
lines: list[str] = []
91+
for i, c in enumerate(criteria, 1):
92+
metric = c.get("metric", f"criterion_{i}")
93+
max_score = c.get("max_score", 1)
94+
description = c.get("description", "")
95+
lines.append(f"Criterion {i}: **{metric}** (max score: {max_score})")
96+
lines.append(f" {description}")
97+
lines.append("")
98+
return "\n".join(lines).rstrip()
99+
100+
101+
def _parse_criteria_scores(
102+
response: dict, criteria: list[dict]
103+
) -> tuple[dict[str, dict], float]:
104+
"""Extract per-criterion scores from LLM response.
105+
106+
Returns:
107+
(criteria_scores, rubric_score) where rubric_score is normalized mean.
108+
"""
109+
raw = response.get("criteria_scores", {})
110+
criteria_scores: dict[str, dict] = {}
111+
normalized_scores: list[float] = []
112+
113+
for c in criteria:
114+
metric = c.get("metric", "")
115+
max_score = float(c.get("max_score", 1) or 1)
116+
entry = raw.get(metric, {})
117+
if isinstance(entry, dict):
118+
raw_score = float(entry.get("score", 0.0))
119+
reasoning = entry.get("reasoning", "")
120+
else:
121+
raw_score = 0.0
122+
reasoning = ""
123+
# Clamp to valid range
124+
raw_score = max(0.0, min(raw_score, max_score))
125+
normalized = raw_score / max_score if max_score > 0 else 0.0
126+
criteria_scores[metric] = {
127+
"score": raw_score,
128+
"max_score": max_score,
129+
"normalized_score": round(normalized, 4),
130+
"reasoning": reasoning,
131+
}
132+
normalized_scores.append(normalized)
133+
134+
rubric_score = sum(normalized_scores) / len(normalized_scores) if normalized_scores else 0.0
135+
return criteria_scores, round(rubric_score, 4)
136+
137+
87138
def _select_prompt(judge_input: JudgeInput) -> str:
88139
"""Select the appropriate prompt template based on oracle availability."""
89140
if judge_input.oracle_ground_truth and judge_input.oracle_ground_truth.strip():
@@ -267,6 +318,38 @@ def evaluate_with_voting(
267318
provenance={**provenance, "confidence": confidence_float},
268319
)
269320

321+
def evaluate_with_criteria(
    self,
    judge_input: JudgeInput,
    criteria: list[dict],
) -> tuple[dict[str, dict], float]:
    """Score agent output against AAA rubric criteria from criteria.json.

    Args:
        judge_input: Task input bundle.
        criteria: List of {metric, description, max_score} dicts.

    Returns:
        (criteria_scores, rubric_score) where:
            criteria_scores = {metric: {score, max_score, normalized_score, reasoning}}
            rubric_score = mean of normalized per-criterion scores (0.0 - 1.0)
    """
    # No criteria means nothing to score — short-circuit with empty result.
    if not criteria:
        return {}, 0.0

    prompt = RUBRIC_CRITERIA_PROMPT.format(
        task_description=judge_input.task_description,
        agent_output=judge_input.code_changes or "(no output)",
        criteria_text=_format_criteria(criteria),
    )
    reply = self._backend.call(
        "You are a precise rubric evaluator. Always respond with valid JSON only.",
        prompt,
    )
    return _parse_criteria_scores(reply, criteria)
352+
270353
# ---- internals ----
271354

272355
def _build_provenance(self, rounds: int, rationale: str = "") -> dict:

scripts/ccb_metrics/judge/models.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,15 @@ class JudgeResult:
5959
vote_distribution: dict = field(default_factory=dict)
6060
judged_at: str = ""
6161
provenance: dict = field(default_factory=dict)
62+
# Hybrid evaluation fields (populated when criteria.json is present)
63+
criteria_scores: dict = field(default_factory=dict)
64+
rubric_score: float = 0.0
65+
hybrid_composite: Optional[float] = None
66+
verifier_weight: float = 0.6
6267

6368
def to_dict(self) -> dict:
6469
"""Produce JSON-serializable output matching judge_result.schema.json."""
65-
return {
70+
result: dict = {
6671
"task_id": self.task_id,
6772
"benchmark": self.benchmark,
6873
"config": self.config,
@@ -77,6 +82,14 @@ def to_dict(self) -> dict:
7782
"vote_distribution": self.vote_distribution,
7883
"provenance": self.provenance,
7984
}
85+
# Include hybrid fields only when criteria scoring was performed
86+
if self.criteria_scores:
87+
result["criteria_scores"] = self.criteria_scores
88+
result["rubric_score"] = self.rubric_score
89+
if self.hybrid_composite is not None:
90+
result["hybrid_composite"] = self.hybrid_composite
91+
result["verifier_weight"] = self.verifier_weight
92+
return result
8093

8194

8295
# ---- Helpers ----

scripts/ccb_metrics/judge/prompts.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,36 @@
8989
}}
9090
"""
9191

92+
RUBRIC_CRITERIA_PROMPT = """\
93+
You are an expert evaluator assessing an AI agent's output against task-specific rubric criteria.
94+
95+
## Task Description
96+
{task_description}
97+
98+
## Agent Output
99+
{agent_output}
100+
101+
## Rubric Criteria
102+
Evaluate the agent's output against each criterion below. Assign a score from 0 to max_score \
103+
(inclusive). Use fractional values for partial credit.
104+
105+
{criteria_text}
106+
107+
## Instructions
108+
Score each criterion based on the description. Be evidence-based and cite specific output elements.
109+
110+
Respond with ONLY valid JSON in this exact format:
111+
{{
112+
"criteria_scores": {{
113+
"<metric_name>": {{
114+
"score": <float 0 to max_score>,
115+
"reasoning": "<brief explanation citing specific evidence>"
116+
}}
117+
}},
118+
"overall_reasoning": "<synthesis of the evaluation across all criteria>"
119+
}}
120+
"""
121+
92122
DIRECT_REVIEW_PROMPT = """\
93123
You are an expert code evaluator assessing an AI coding agent's output based on \
94124
the task description alone. No reference answer is available.

0 commit comments

Comments
 (0)