Skip to content

Commit 96892dc

Browse files
sjarmak and claude committed
feat: US-021 - Rubric judge integration for hybrid tasks
- Add RUBRIC_CRITERIA_PROMPT to judge/prompts.py
- Add evaluate_with_criteria() method to LLMJudge engine
- Add _format_criteria() and _parse_criteria_scores() helpers
- Extend JudgeResult with criteria_scores, rubric_score, hybrid_composite fields
- Add --hybrid and --hybrid-weight flags to run_judge.py
- Auto-detect tests/criteria.json for criteria-bearing tasks
- Hybrid composite = verifier_weight * verifier_reward + (1-w) * rubric_score
- Backwards-compatible: hybrid fields absent when criteria.json not found
- Add ccb_mcp_* suite prefix mappings to run_judge.py DIR_PREFIX_TO_SUITE

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 38092ed commit 96892dc

File tree

7 files changed

+278
-3
lines changed

7 files changed

+278
-3
lines changed

ralph-mcp-unique/prd.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -619,7 +619,7 @@
619619
"Backwards compatible for tasks without criteria.json"
620620
],
621621
"priority": 21,
622-
"passes": false,
622+
"passes": true,
623623
"notes": "Only E and J-family DS tasks need rubric judging initially. Most tasks are fully deterministic."
624624
},
625625
{

ralph-mcp-unique/progress.txt

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -540,3 +540,30 @@
540540
- Table builders are None-returning optional functions — consistent with other optional tables (swebench_partial, search_patterns, etc.)
541541
- Collecting retrieval data is a separate pass over the runs dir (not integrated into task discovery) to avoid schema changes to TaskMetrics
542542
---
543+
[2026-02-20 21:44:43 UTC] Iteration 11 no story markers found
544+
[2026-02-20 21:44:43 UTC] Iteration 11 complete
545+
[2026-02-20 21:44:45 UTC] Iteration 12 started
546+
547+
## 2026-02-20 - US-021: Rubric judge integration for hybrid tasks
548+
- Extended `scripts/run_judge.py` with `--hybrid` and `--hybrid-weight` flags
549+
- Added `_find_criteria_json(task_id, benchmark, benchmarks_dir)` helper (normalizes slug, handles case)
550+
- Added `_load_criteria_json(path)` helper
551+
- Added `RUBRIC_CRITERIA_PROMPT` to `scripts/ccb_metrics/judge/prompts.py`
552+
- Added `_format_criteria(criteria)` and `_parse_criteria_scores(response, criteria)` helpers to `engine.py`
553+
- Added `evaluate_with_criteria(judge_input, criteria)` method to `LLMJudge` class
554+
- Updated `JudgeResult` dataclass with `criteria_scores`, `rubric_score`, `hybrid_composite`, `verifier_weight` fields
555+
- `to_dict()` conditionally includes hybrid fields (backwards-compatible: absent when criteria not used)
556+
- `_run_single_task()` runs rubric scoring as non-fatal optional step in hybrid mode
557+
- Added `ccb_mcp_*` prefix mappings to `run_judge.py`'s `DIR_PREFIX_TO_SUITE`
558+
- 8 smoke tests all pass; py_compile succeeds for all 5 modified files
559+
- Repo health check: PASSED
560+
561+
- Files changed: `scripts/ccb_metrics/judge/prompts.py`, `scripts/ccb_metrics/judge/models.py`, `scripts/ccb_metrics/judge/engine.py`, `scripts/ccb_metrics/judge/__init__.py`, `scripts/run_judge.py`, `ralph-mcp-unique/prd.json`, `ralph-mcp-unique/progress.txt`
562+
- **Learnings for future iterations:**
563+
- `_parse_criteria_scores` normalizes each criterion score to [0, max_score] before computing normalized mean
564+
- Rubric scoring is non-fatal in `_run_single_task`: catch exception, log warning, continue without rubric
565+
- `to_dict()` uses conditional inclusion (if self.criteria_scores:) for backwards compat — no schema change needed
566+
- `_find_criteria_json` handles slug normalization (underscore↔hyphen, case) for robust lookup
567+
- hybrid_composite uses configurable verifier_weight (default 0.6/0.4 split) validated in main()
568+
- Composite formula: composite = verifier_weight * verifier_reward + (1 - verifier_weight) * rubric_score
569+
---

scripts/ccb_metrics/judge/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,13 @@
1010

1111
from .engine import LLMJudge
1212
from .models import JudgeInput, JudgeResult, OracleBundle, normalize_score
13+
from .prompts import RUBRIC_CRITERIA_PROMPT
1314

1415
# Public names re-exported at the judge package level.
# RUBRIC_CRITERIA_PROMPT is exported so callers (e.g. run_judge.py) can
# reference the hybrid-rubric template without importing prompts directly.
__all__ = [
    "LLMJudge",
    "JudgeInput",
    "JudgeResult",
    "OracleBundle",
    "normalize_score",
    "RUBRIC_CRITERIA_PROMPT",
]

scripts/ccb_metrics/judge/engine.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
DIRECT_REVIEW_PROMPT,
2424
REFERENCE_COMPLETENESS_PROMPT,
2525
REFERENCE_CORRECTNESS_PROMPT,
26+
RUBRIC_CRITERIA_PROMPT,
2627
)
2728

2829
# ---------------------------------------------------------------------------
@@ -84,6 +85,56 @@ def _render_prompt(template: str, judge_input: JudgeInput) -> str:
8485
)
8586

8687

88+
def _format_criteria(criteria: list[dict]) -> str:
89+
"""Format a criteria list for inclusion in the rubric prompt."""
90+
lines: list[str] = []
91+
for i, c in enumerate(criteria, 1):
92+
metric = c.get("metric", f"criterion_{i}")
93+
max_score = c.get("max_score", 1)
94+
description = c.get("description", "")
95+
lines.append(f"Criterion {i}: **{metric}** (max score: {max_score})")
96+
lines.append(f" {description}")
97+
lines.append("")
98+
return "\n".join(lines).rstrip()
99+
100+
101+
def _parse_criteria_scores(
102+
response: dict, criteria: list[dict]
103+
) -> tuple[dict[str, dict], float]:
104+
"""Extract per-criterion scores from LLM response.
105+
106+
Returns:
107+
(criteria_scores, rubric_score) where rubric_score is normalized mean.
108+
"""
109+
raw = response.get("criteria_scores", {})
110+
criteria_scores: dict[str, dict] = {}
111+
normalized_scores: list[float] = []
112+
113+
for c in criteria:
114+
metric = c.get("metric", "")
115+
max_score = float(c.get("max_score", 1) or 1)
116+
entry = raw.get(metric, {})
117+
if isinstance(entry, dict):
118+
raw_score = float(entry.get("score", 0.0))
119+
reasoning = entry.get("reasoning", "")
120+
else:
121+
raw_score = 0.0
122+
reasoning = ""
123+
# Clamp to valid range
124+
raw_score = max(0.0, min(raw_score, max_score))
125+
normalized = raw_score / max_score if max_score > 0 else 0.0
126+
criteria_scores[metric] = {
127+
"score": raw_score,
128+
"max_score": max_score,
129+
"normalized_score": round(normalized, 4),
130+
"reasoning": reasoning,
131+
}
132+
normalized_scores.append(normalized)
133+
134+
rubric_score = sum(normalized_scores) / len(normalized_scores) if normalized_scores else 0.0
135+
return criteria_scores, round(rubric_score, 4)
136+
137+
87138
def _select_prompt(judge_input: JudgeInput) -> str:
88139
"""Select the appropriate prompt template based on oracle availability."""
89140
if judge_input.oracle_ground_truth and judge_input.oracle_ground_truth.strip():
@@ -267,6 +318,38 @@ def evaluate_with_voting(
267318
provenance={**provenance, "confidence": confidence_float},
268319
)
269320

321+
def evaluate_with_criteria(
    self,
    judge_input: JudgeInput,
    criteria: list[dict],
) -> tuple[dict[str, dict], float]:
    """Score agent output against AAA rubric criteria from criteria.json.

    Args:
        judge_input: Task input bundle.
        criteria: List of {metric, description, max_score} dicts.

    Returns:
        (criteria_scores, rubric_score) where:
            criteria_scores = {metric: {score, max_score, normalized_score, reasoning}}
            rubric_score = mean of normalized per-criterion scores (0.0 - 1.0)
    """
    # No criteria means nothing to score — short-circuit with empty result.
    if not criteria:
        return {}, 0.0

    prompt = RUBRIC_CRITERIA_PROMPT.format(
        task_description=judge_input.task_description,
        agent_output=judge_input.code_changes or "(no output)",
        criteria_text=_format_criteria(criteria),
    )
    reply = self._backend.call(
        "You are a precise rubric evaluator. Always respond with valid JSON only.",
        prompt,
    )
    return _parse_criteria_scores(reply, criteria)
352+
270353
# ---- internals ----
271354

272355
def _build_provenance(self, rounds: int, rationale: str = "") -> dict:

scripts/ccb_metrics/judge/models.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,15 @@ class JudgeResult:
5959
vote_distribution: dict = field(default_factory=dict)
6060
judged_at: str = ""
6161
provenance: dict = field(default_factory=dict)
62+
# Hybrid evaluation fields (populated when criteria.json is present)
63+
criteria_scores: dict = field(default_factory=dict)
64+
rubric_score: float = 0.0
65+
hybrid_composite: Optional[float] = None
66+
verifier_weight: float = 0.6
6267

6368
def to_dict(self) -> dict:
6469
"""Produce JSON-serializable output matching judge_result.schema.json."""
65-
return {
70+
result: dict = {
6671
"task_id": self.task_id,
6772
"benchmark": self.benchmark,
6873
"config": self.config,
@@ -77,6 +82,14 @@ def to_dict(self) -> dict:
7782
"vote_distribution": self.vote_distribution,
7883
"provenance": self.provenance,
7984
}
85+
# Include hybrid fields only when criteria scoring was performed
86+
if self.criteria_scores:
87+
result["criteria_scores"] = self.criteria_scores
88+
result["rubric_score"] = self.rubric_score
89+
if self.hybrid_composite is not None:
90+
result["hybrid_composite"] = self.hybrid_composite
91+
result["verifier_weight"] = self.verifier_weight
92+
return result
8093

8194

8295
# ---- Helpers ----

scripts/ccb_metrics/judge/prompts.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,36 @@
8989
}}
9090
"""
9191

92+
RUBRIC_CRITERIA_PROMPT = """\
93+
You are an expert evaluator assessing an AI agent's output against task-specific rubric criteria.
94+
95+
## Task Description
96+
{task_description}
97+
98+
## Agent Output
99+
{agent_output}
100+
101+
## Rubric Criteria
102+
Evaluate the agent's output against each criterion below. Assign a score from 0 to max_score \
103+
(inclusive). Use fractional values for partial credit.
104+
105+
{criteria_text}
106+
107+
## Instructions
108+
Score each criterion based on the description. Be evidence-based and cite specific output elements.
109+
110+
Respond with ONLY valid JSON in this exact format:
111+
{{
112+
"criteria_scores": {{
113+
"<metric_name>": {{
114+
"score": <float 0 to max_score>,
115+
"reasoning": "<brief explanation citing specific evidence>"
116+
}}
117+
}},
118+
"overall_reasoning": "<synthesis of the evaluation across all criteria>"
119+
}}
120+
"""
121+
92122
DIRECT_REVIEW_PROMPT = """\
93123
You are an expert code evaluator assessing an AI coding agent's output based on \
94124
the task description alone. No reference answer is available.

0 commit comments

Comments
 (0)