|
23 | 23 | DIRECT_REVIEW_PROMPT, |
24 | 24 | REFERENCE_COMPLETENESS_PROMPT, |
25 | 25 | REFERENCE_CORRECTNESS_PROMPT, |
| 26 | + RUBRIC_CRITERIA_PROMPT, |
26 | 27 | ) |
27 | 28 |
|
28 | 29 | # --------------------------------------------------------------------------- |
@@ -84,6 +85,56 @@ def _render_prompt(template: str, judge_input: JudgeInput) -> str: |
84 | 85 | ) |
85 | 86 |
|
86 | 87 |
|
| 88 | +def _format_criteria(criteria: list[dict]) -> str: |
| 89 | + """Format a criteria list for inclusion in the rubric prompt.""" |
| 90 | + lines: list[str] = [] |
| 91 | + for i, c in enumerate(criteria, 1): |
| 92 | + metric = c.get("metric", f"criterion_{i}") |
| 93 | + max_score = c.get("max_score", 1) |
| 94 | + description = c.get("description", "") |
| 95 | + lines.append(f"Criterion {i}: **{metric}** (max score: {max_score})") |
| 96 | + lines.append(f" {description}") |
| 97 | + lines.append("") |
| 98 | + return "\n".join(lines).rstrip() |
| 99 | + |
| 100 | + |
| 101 | +def _parse_criteria_scores( |
| 102 | + response: dict, criteria: list[dict] |
| 103 | +) -> tuple[dict[str, dict], float]: |
| 104 | + """Extract per-criterion scores from LLM response. |
| 105 | +
|
| 106 | + Returns: |
| 107 | + (criteria_scores, rubric_score) where rubric_score is normalized mean. |
| 108 | + """ |
| 109 | + raw = response.get("criteria_scores", {}) |
| 110 | + criteria_scores: dict[str, dict] = {} |
| 111 | + normalized_scores: list[float] = [] |
| 112 | + |
| 113 | + for c in criteria: |
| 114 | + metric = c.get("metric", "") |
| 115 | + max_score = float(c.get("max_score", 1) or 1) |
| 116 | + entry = raw.get(metric, {}) |
| 117 | + if isinstance(entry, dict): |
| 118 | + raw_score = float(entry.get("score", 0.0)) |
| 119 | + reasoning = entry.get("reasoning", "") |
| 120 | + else: |
| 121 | + raw_score = 0.0 |
| 122 | + reasoning = "" |
| 123 | + # Clamp to valid range |
| 124 | + raw_score = max(0.0, min(raw_score, max_score)) |
| 125 | + normalized = raw_score / max_score if max_score > 0 else 0.0 |
| 126 | + criteria_scores[metric] = { |
| 127 | + "score": raw_score, |
| 128 | + "max_score": max_score, |
| 129 | + "normalized_score": round(normalized, 4), |
| 130 | + "reasoning": reasoning, |
| 131 | + } |
| 132 | + normalized_scores.append(normalized) |
| 133 | + |
| 134 | + rubric_score = sum(normalized_scores) / len(normalized_scores) if normalized_scores else 0.0 |
| 135 | + return criteria_scores, round(rubric_score, 4) |
| 136 | + |
| 137 | + |
87 | 138 | def _select_prompt(judge_input: JudgeInput) -> str: |
88 | 139 | """Select the appropriate prompt template based on oracle availability.""" |
89 | 140 | if judge_input.oracle_ground_truth and judge_input.oracle_ground_truth.strip(): |
@@ -267,6 +318,38 @@ def evaluate_with_voting( |
267 | 318 | provenance={**provenance, "confidence": confidence_float}, |
268 | 319 | ) |
269 | 320 |
|
def evaluate_with_criteria(
    self,
    judge_input: JudgeInput,
    criteria: list[dict],
) -> tuple[dict[str, dict], float]:
    """Score agent output against AAA rubric criteria from criteria.json.

    Renders the criteria into the rubric prompt, sends it to the backend
    in a single call, and parses the per-criterion scores from the reply.

    Args:
        judge_input: Task input bundle; its ``task_description`` and
            ``code_changes`` are inserted into the prompt.
        criteria: List of {metric, description, max_score} dicts.

    Returns:
        (criteria_scores, rubric_score) where:
            criteria_scores = {metric: {score, max_score, normalized_score, reasoning}}
            rubric_score = mean of normalized per-criterion scores (0.0 – 1.0)
        An empty ``criteria`` list short-circuits to ``({}, 0.0)`` without
        calling the backend.
    """
    # Nothing to grade: skip the backend round-trip entirely.
    if not criteria:
        return {}, 0.0

    rendered_criteria = _format_criteria(criteria)
    prompt_fields = {
        "task_description": judge_input.task_description,
        # Falsy output (None / empty string) is replaced by a placeholder.
        "agent_output": judge_input.code_changes or "(no output)",
        "criteria_text": rendered_criteria,
    }
    rubric_prompt = RUBRIC_CRITERIA_PROMPT.format(**prompt_fields)

    llm_reply = self._backend.call(
        "You are a precise rubric evaluator. Always respond with valid JSON only.",
        rubric_prompt,
    )
    return _parse_criteria_scores(llm_reply, criteria)
| 352 | + |
270 | 353 | # ---- internals ---- |
271 | 354 |
|
272 | 355 | def _build_provenance(self, rounds: int, rationale: str = "") -> dict: |
|
0 commit comments