-
Notifications
You must be signed in to change notification settings - Fork 6
Resolve judge code smell #102
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,169 @@ | ||||||||
| """CLI entrypoint for the judge script: argument parser and main async entrypoint.""" | ||||||||
|
|
||||||||
| import argparse | ||||||||
| from pathlib import Path | ||||||||
| from typing import Optional | ||||||||
|
|
||||||||
| from judge import judge_conversations, judge_single_conversation | ||||||||
| from judge.llm_judge import LLMJudge | ||||||||
| from judge.rubric_config import ConversationData, RubricConfig, load_conversations | ||||||||
| from judge.utils import parse_judge_models | ||||||||
| from utils.utils import parse_key_value_list | ||||||||
|
|
||||||||
|
|
||||||||
class _FreshDictDefaultParser(argparse.ArgumentParser):
    """ArgumentParser that never hands out its shared dict default.

    argparse stores a non-string ``default`` on the namespace as-is, so a
    ``default={}`` is the *same* dict object for every ``parse_args()`` call
    on one parser. If downstream code mutates that dict (e.g. sets a
    temperature), later parses would see the mutated value. This subclass
    swaps the shared default for a shallow copy after each parse.
    """

    _DICT_DEST = "judge_model_extra_params"

    def parse_known_args(self, args=None, namespace=None):
        """Parse as usual, then replace the shared dict default with a copy."""
        namespace, extras = super().parse_known_args(args, namespace)
        shared_default = self.get_default(self._DICT_DEST)
        # Only replace the value when it is literally the parser-owned
        # default object; user-supplied values are left untouched.
        if (
            isinstance(shared_default, dict)
            and getattr(namespace, self._DICT_DEST, None) is shared_default
        ):
            setattr(namespace, self._DICT_DEST, dict(shared_default))
        return namespace, extras


def get_parser() -> argparse.ArgumentParser:
    """Build and return the argument parser (for CLI and testing).

    Options registered:
        --conversation / --folder   mutually exclusive, one is required
        --rubrics                   one or more rubric files
        --judge-model               one or more ``model`` / ``model:count`` specs
        --judge-model-extra-params  key/value list parsed by
                                    ``parse_key_value_list``; defaults to a
                                    fresh empty dict on every parse
        --limit, --output, --max-concurrent, --per-judge, --verbose-workers

    Returns:
        A configured ``argparse.ArgumentParser`` (a subclass that protects the
        dict default of ``--judge-model-extra-params`` from cross-parse
        mutation).
    """
    parser = _FreshDictDefaultParser(
        description="Judge existing LLM conversations using rubrics"
    )

    # required source: exactly one of a single file or a run folder
    source_group = parser.add_mutually_exclusive_group(required=True)
    source_group.add_argument(
        "--conversation", "-c", help="Path to a single conversation file to judge"
    )
    source_group.add_argument(
        "--folder",
        "-f",
        help="Path to a conversation run folder "
        "(e.g. conversations/p_model__a_model__t6__r1__timestamp/)",
    )

    # rubrics
    parser.add_argument(
        "--rubrics",
        "-r",
        nargs="+",
        default=["data/rubric.tsv"],
        help="Rubric file(s) to use (default: data/rubric.tsv)",
    )

    # model
    parser.add_argument(
        "--judge-model",
        "-j",
        nargs="+",
        required=True,
        help=(
            "Model(s) to use for judging. "
            "Format: 'model' or 'model:count' for multiple instances. "
            "Can specify multiple models: --judge-model model1 model2:3. "
            "Examples: claude-sonnet-4-5-20250929, "
            "claude-sonnet-4-5-20250929:3, "
            "claude-sonnet-4-5-20250929:2 gpt-4o:1"
        ),
    )

    # The {} default is shared by argparse across parses; the parser subclass
    # above replaces it with a copy so callers may mutate the result safely.
    parser.add_argument(
        "--judge-model-extra-params",
        "-jep",
        help=(
            "Extra parameters for the judge model. "
            "Examples: temperature=0.7, max_tokens=1000. "
            "Default: temperature=0 (unless overridden)"
        ),
        type=parse_key_value_list,
        default={},
    )

    # optional limit
    parser.add_argument(
        "--limit",
        "-l",
        type=int,
        default=None,
        help="Limit number of conversations to judge (for debugging)",
    )

    # output folder
    parser.add_argument(
        "--output",
        "-o",
        default="evaluations",
        help="Output folder for evaluation results (default: evaluations)",
    )

    # concurrency control
    parser.add_argument(
        "--max-concurrent",
        "-m",
        type=int,
        default=None,
        help=(
            "Maximum number of concurrent workers (default: None). "
            "Set to a high number or omit for unlimited concurrency."
        ),
    )

    parser.add_argument(
        "--per-judge",
        "-pj",
        action="store_true",
        help=(
            "If set, --max-concurrent applies per judge model. "
            "Otherwise, it applies to total workers across all judges."
        ),
    )

    parser.add_argument(
        "--verbose-workers",
        "-vw",
        action="store_true",
        help="Enable verbose worker logging to show concurrency behavior",
    )

    return parser
|
|
||||||||
|
|
||||||||
| async def main(args: argparse.Namespace) -> Optional[str]: | ||||||||
| """Main async entrypoint for judging conversations.""" | ||||||||
| # Parse judge models from args (supports "model" or "model:count" format) | ||||||||
| judge_models = parse_judge_models(args.judge_model) | ||||||||
|
|
||||||||
| models_str = ", ".join(f"{model}x{count}" for model, count in judge_models.items()) | ||||||||
| print(f"🎯 LLM Judge | Models: {models_str}") | ||||||||
|
|
||||||||
| # Load rubric configuration once at startup | ||||||||
| print("📚 Loading rubric configuration...") | ||||||||
| rubric_config = await RubricConfig.load(rubric_folder="data") | ||||||||
|
||||||||
| rubric_config = await RubricConfig.load(rubric_folder="data") | |
| rubric_folder = getattr(args, "rubrics", None) or "data" | |
| rubric_config = await RubricConfig.load(rubric_folder=rubric_folder) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`--judge-model-extra-params` uses a mutable dict (`default={}`) as the argparse default. Because `LLMJudge` mutates this dict (e.g., sets `temperature`), subsequent `parse_args()` calls in the same process can inherit mutated defaults. Use `default=None` (and normalize to `{}` in `main`) or ensure a fresh dict per parse (e.g., copy).