diff --git a/README.md b/README.md index a9d83487..f26543fb 100644 --- a/README.md +++ b/README.md @@ -67,6 +67,36 @@ Where - `j` is the flag for selecting the judge model(s) - `jep` are the judge model extra parameters (optional) +7. **Score and visualize the results**: + ```bash + python -m judge.score -r evaluations/{YOUR_EVAL_FOLDER}/results.csv + ``` + +## Quick Start: End-to-End Pipeline + +For convenience, you can run the entire workflow (generation → evaluation → scoring) with a single command: + +```bash +python3 run_pipeline.py \ + --user-agent claude-sonnet-4-5-20250929 \ + --provider-agent gpt-4o \ + --runs 2 \ + --turns 10 \ + --judge-model claude-sonnet-4-5-20250929 \ + --max-personas 5 +``` + +The pipeline script: +- Runs `generate.py` with your specified arguments +- Automatically passes the output folder to `judge.py` +- Automatically runs `judge/score.py` on the evaluation results +- Displays a summary with all output locations + +For help and all available options: +```bash +python3 run_pipeline.py --help +``` + ### Using Extra Parameters Both `generate.py` and `judge.py` support extra parameters for fine-tuning model behavior: @@ -74,7 +104,7 @@ Both `generate.py` and `judge.py` support extra parameters for fine-tuning model **Generate with temperature control:** ```bash # Lower temperature (0.3) for more consistent responses -python generate.py -u gpt-4o -uep temperature=0.3 -p claude-3-5-sonnet-20241022 -pep temperature=0.5 -t 6 -r 2 +python generate.py -u gpt-4o -uep temperature=0.3 -p claude-sonnet-4-5-20250929 -pep temperature=0.5 -t 6 -r 2 # Higher temperature (1.0) with max tokens python generate.py -u gpt-4o -uep temperature=1,max_tokens=2000 -p gpt-4o -pep temperature=1 -t 6 -r 1 @@ -83,7 +113,7 @@ python generate.py -u gpt-4o -uep temperature=1,max_tokens=2000 -p gpt-4o -pep t **Judge with custom parameters:** ```bash # Use lower temperature for more consistent evaluation -python judge.py -f conversations/my_experiment -j claude-3-5-sonnet-20241022 -jep temperature=0.3 +python judge.py -f conversations/my_experiment -j claude-sonnet-4-5-20250929 -jep temperature=0.3 # Multiple parameters python judge.py -f conversations/my_experiment -j gpt-4o -jep temperature=0.5,max_tokens=1500 diff --git a/generate.py b/generate.py index a723bf77..0264224a 100644 --- a/generate.py +++ b/generate.py @@ -25,7 +25,7 @@ async def main( max_concurrent: Optional[int] = None, max_total_words: Optional[int] = None, max_personas: Optional[int] = None, -) -> List[Dict[str, Any]]: +) -> tuple[List[Dict[str, Any]], str]: """ Generate conversations and return results. @@ -117,7 +117,7 @@ async def main( if verbose: print(f"✅ Generated {len(results)} conversations → {folder_name}/") - return results + return results, folder_name if __name__ == "__main__": @@ -127,7 +127,7 @@ async def main( "--user-agent", "-u", help=( - "Model for the user-agent. Examples: claude-3-5-sonnet-20241022, " + "Model for the user-agent. Examples: claude-sonnet-4-5-20250929, " "gemini-1.5-pro, llama3:8b" ), required=True, @@ -147,7 +147,7 @@ async def main( "--provider-agent", "-p", help=( - "Model for the provider-agent. Examples: claude-3-5-sonnet-20241022, " + "Model for the provider-agent. 
Examples: claude-sonnet-4-5-20250929, " "gemini-1.5-pro, llama3:8b" ), required=True, @@ -255,8 +255,7 @@ async def main( } # TODO: Do the run id here, so that it can be printed when starting - # Note: we are discarding the results, because they are saved to file - _ = asyncio.run( + results, output_folder = asyncio.run( main( persona_model_config=persona_model_config, agent_model_config=agent_model_config, diff --git a/judge.py b/judge.py index 888e7d17..58688790 100644 --- a/judge.py +++ b/judge.py @@ -6,6 +6,7 @@ import argparse import asyncio +from typing import Optional from judge import judge_conversations, judge_single_conversation from judge.llm_judge import LLMJudge @@ -13,7 +14,7 @@ from utils.utils import parse_key_value_list -async def main(args): +async def main(args) -> Optional[str]: """Main async entrypoint for judging conversations.""" # Parse judge models from args (supports "model" or "model:count" format) judge_models = {} @@ -47,6 +48,9 @@ async def main(args): judge_model_extra_params=args.judge_model_extra_params, ) await judge_single_conversation(judge, conversation, args.output) + # Single conversation mode doesn't need output folder for pipeline + print("ℹ️ Single conversation mode: output folder not needed for pipeline") + return None else: # Load all conversations at startup print(f"📂 Loading conversations from {args.folder}...") @@ -58,7 +62,7 @@ async def main(args): folder_name = Path(args.folder).name - await judge_conversations( + _, output_folder = await judge_conversations( judge_models=judge_models, conversations=conversations, rubric_config=rubric_config, @@ -71,6 +75,8 @@ async def main(args): verbose_workers=args.verbose_workers, ) + return output_folder + if __name__ == "__main__": parser = argparse.ArgumentParser( @@ -108,9 +114,9 @@ async def main(args): "Model(s) to use for judging. " "Format: 'model' or 'model:count' for multiple instances. " "Can specify multiple models: --judge-model model1 model2:3. " - "Examples: claude-3-5-sonnet-20241022, " - "claude-3-5-sonnet-20241022:3, " - "claude-3-5-sonnet-20241022:2 gpt-4o:1" + "Examples: claude-sonnet-4-5-20250929, " + "claude-sonnet-4-5-20250929:3, " + "claude-sonnet-4-5-20250929:2 gpt-4o:1" ), ) diff --git a/judge/runner.py b/judge/runner.py index e9c6a490..13d288f3 100644 --- a/judge/runner.py +++ b/judge/runner.py @@ -434,7 +434,7 @@ async def judge_conversations( max_concurrent: Optional[int] = None, per_judge: bool = False, verbose_workers: bool = False, -) -> List[Dict[str, Any]]: +) -> tuple[List[Dict[str, Any]], str]: """ Judge conversations with multiple judge models. 
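These return-value changes all follow one convention: each stage hands back the folder it wrote to, so the next stage can pick it up without re-deriving paths. A minimal sketch of the resulting calling pattern, assuming `conversations` and `rubric_config` are loaded the same way `judge.py` loads them (the sketch is illustrative only and is not part of the patch):

```python
import asyncio

# judge.py imports judge_conversations from the judge package; after this change
# it returns a (results, output_folder) tuple instead of just the results list.
from judge import judge_conversations


async def evaluate(conversations, rubric_config):
    # `conversations` and `rubric_config` are assumed to be prepared as in judge.py;
    # only the unpacking of the new return value is shown here.
    results, output_folder = await judge_conversations(
        judge_models={"claude-sonnet-4-5-20250929": 1},
        conversations=conversations,
        rubric_config=rubric_config,
    )
    print(f"{len(results)} evaluations written to {output_folder}/")
    return output_folder
```

`run_pipeline.py` below applies the same pattern across all three stages, threading `conversation_folder` into the judge step and `evaluation_folder` into scoring.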
@@ -454,8 +454,9 @@ async def judge_conversations( per_judge: If True, max_concurrent applies per judge model; if False, total Returns: - Flattened list of evaluation results with one row per - (conversation, judge_model, judge_instance) tuple + Tuple of (results, output_folder) where results is a flattened list of + evaluation results with one row per (conversation, judge_model, judge_instance) + tuple, and output_folder is the path where evaluations were saved """ if output_folder is None: timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")[:-3] @@ -519,7 +520,7 @@ async def judge_conversations( if verbose: print(f"✅ Completed {len(results)} evaluations → {output_folder}/") - return results + return results, output_folder async def judge_single_conversation( diff --git a/llm_clients/claude_llm.py b/llm_clients/claude_llm.py index d8faa1da..1cbf518e 100644 --- a/llm_clients/claude_llm.py +++ b/llm_clients/claude_llm.py @@ -92,7 +92,7 @@ async def generate_response( msg_type = type(msg).__name__ preview = msg.content[:100] content_preview = preview + "..." if len(msg.content) > 100 else msg.content - debug_print(f" {i+1}. {msg_type}: {content_preview}") + debug_print(f" {i + 1}. {msg_type}: {content_preview}") try: start_time = time.time() diff --git a/llm_clients/config.py b/llm_clients/config.py index bc8d1513..5ac5e760 100644 --- a/llm_clients/config.py +++ b/llm_clients/config.py @@ -36,7 +36,7 @@ def get_claude_config(cls) -> Dict[str, Any]: Returns only the model name. Runtime parameters (temperature, max_tokens) should be passed explicitly via CLI arguments. """ - return {"model": "claude-3-5-sonnet-20241022"} + return {"model": "claude-sonnet-4-5-20250929"} @classmethod def get_openai_config(cls) -> Dict[str, Any]: diff --git a/llm_clients/gemini_llm.py b/llm_clients/gemini_llm.py index 5cfbfa32..4fed8015 100644 --- a/llm_clients/gemini_llm.py +++ b/llm_clients/gemini_llm.py @@ -90,7 +90,7 @@ async def generate_response( msg_type = type(msg).__name__ preview = msg.content[:100] content_preview = preview + "..." if len(msg.content) > 100 else msg.content - debug_print(f" {i+1}. {msg_type}: {content_preview}") + debug_print(f" {i + 1}. {msg_type}: {content_preview}") try: start_time = time.time() diff --git a/llm_clients/llm_factory.py b/llm_clients/llm_factory.py index 80113167..afae8bd6 100644 --- a/llm_clients/llm_factory.py +++ b/llm_clients/llm_factory.py @@ -19,7 +19,7 @@ def create_llm( Args: model_name: The model identifier - (e.g., "claude-3-5-sonnet-20241022", "gpt-4") + (e.g., "claude-sonnet-4-5-20250929", "gpt-4") name: Display name for this LLM instance system_prompt: Optional system prompt **kwargs: Additional model-specific parameters @@ -72,7 +72,7 @@ def create_judge_llm( Args: model_name: The model identifier - (e.g., "claude-3-5-sonnet-20241022", "gpt-4") + (e.g., "claude-sonnet-4-5-20250929", "gpt-4") name: Display name for this LLM instance system_prompt: Optional system prompt **kwargs: Additional model-specific parameters diff --git a/llm_clients/openai_llm.py b/llm_clients/openai_llm.py index 574148d8..508037c4 100644 --- a/llm_clients/openai_llm.py +++ b/llm_clients/openai_llm.py @@ -89,7 +89,7 @@ async def generate_response( msg_type = type(msg).__name__ preview = msg.content[:100] content_preview = preview + "..." if len(msg.content) > 100 else msg.content - debug_print(f" {i+1}. {msg_type}: {content_preview}") + debug_print(f" {i + 1}. 
{msg_type}: {content_preview}") try: start_time = time.time() diff --git a/model_config.json b/model_config.json index 55f164af..f2242114 100644 --- a/model_config.json +++ b/model_config.json @@ -1,15 +1,16 @@ { "prompt_models": { - "assistant": "claude-3-5-sonnet-20241022", + "assistant": "claude-sonnet-4-5-20250929", "philosopher": "claude-3-opus-20240229", "debate_starter": "claude-3-sonnet-20240229", "creative": "claude-3-haiku-20240307", - "scientist": "claude-3-5-sonnet-20241022", - "skeptic": "claude-3-5-sonnet-20241022", + "scientist": "claude-sonnet-4-5-20250929", + "skeptic": "claude-sonnet-4-5-20250929", "gpt_assistant": "gpt-4", "gpt_creative": "gpt-4-turbo", "gpt_analyst": "gpt-3.5-turbo", - "claude-sonnet-4-20250514": "claude-sonnet-4-20250514" + "claude-sonnet-4-20250514": "claude-sonnet-4-20250514", + "claude-sonnet-4-5-20250929": "claude-sonnet-4-5-20250929" }, - "default_model": "claude-3-5-sonnet-20241022" + "default_model": "claude-sonnet-4-5-20250929" } \ No newline at end of file diff --git a/run_pipeline.py b/run_pipeline.py new file mode 100644 index 00000000..6d6dbf80 --- /dev/null +++ b/run_pipeline.py @@ -0,0 +1,436 @@ +#!/usr/bin/env python3 +""" +VERA-MH End-to-End Pipeline Runner (Python version) + +This script orchestrates the complete workflow: + 1. Generate conversations (generate.py) + 2. Evaluate them with LLM judge (judge.py) + 3. Score and visualize results (judge/score.py) + +It automatically passes the output folder from each step to the next step, +so you don't have to manually copy paths between commands. +""" + +import argparse +import asyncio +import os +import sys +from pathlib import Path + +from judge.score import ( + create_risk_level_visualizations, + create_visualizations, + print_scores, + score_results, + score_results_by_risk, +) +from utils.utils import parse_key_value_list + + +def parse_arguments(): + """ + Parse command line arguments and separate them into three groups: + - Arguments for generate.py + - Arguments for judge.py + - Arguments for judge/score.py + """ + parser = argparse.ArgumentParser( + description="VERA-MH Pipeline Runner: Generation → Evaluation → Scoring", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Example: + %(prog)s --user-agent claude-sonnet-4-5-20250929 \\ + --provider-agent gpt-4o \\ + --runs 2 \\ + --turns 10 \\ + --judge-model claude-sonnet-4-5-20250929 \\ + --max-personas 5 + """, + ) + + # Required arguments for generation + parser.add_argument( + "--user-agent", + "-u", + required=True, + help="User/persona model (e.g., claude-sonnet-4-5-20250929)", + ) + parser.add_argument( + "--provider-agent", + "-p", + required=True, + help="Provider/agent model (e.g., gpt-4o)", + ) + parser.add_argument( + "--runs", "-r", type=int, required=True, help="Number of runs per persona" + ) + parser.add_argument( + "--turns", + "-t", + type=int, + required=True, + help="Number of turns per conversation", + ) + + # Required arguments for judge + parser.add_argument( + "--judge-model", + "-j", + nargs="+", + required=True, + help="Judge model(s), format: model or model:count", + ) + + # Optional arguments for generation + parser.add_argument( + "--user-agent-extra-params", + "-uep", + help="Extra params for user agent (e.g., temperature=0.7)", + type=parse_key_value_list, + default={}, + ) + parser.add_argument( + "--provider-agent-extra-params", + "-pep", + help="Extra params for provider agent (e.g., temperature=0.5)", + type=parse_key_value_list, + default={}, + ) + parser.add_argument( + 
"--max-total-words", + "-w", + type=int, + help="Maximum total words per conversation", + ) + parser.add_argument( + "--max-concurrent", type=int, help="Maximum concurrent conversations" + ) + parser.add_argument( + "--max-personas", + type=int, + help="Maximum number of personas to load (for testing)", + ) + parser.add_argument( + "--folder-name", "-f", help="Custom folder name for conversations" + ) + parser.add_argument( + "--run-id", + "-i", + help="Custom run ID for conversation folder (default: timestamp)", + ) + parser.add_argument( + "--debug", action="store_true", help="Enable debug logging for generation" + ) + + # Optional arguments for judge + parser.add_argument( + "--judge-model-extra-params", + "-jep", + help="Extra params for judge model", + type=parse_key_value_list, + default={}, + ) + parser.add_argument( + "--judge-max-concurrent", type=int, help="Maximum concurrent judge workers" + ) + parser.add_argument( + "--judge-per-judge", + action="store_true", + help="Apply concurrency limit per judge", + ) + parser.add_argument( + "--judge-limit", type=int, help="Limit conversations to judge (for testing)" + ) + parser.add_argument( + "--judge-verbose-workers", + action="store_true", + help="Enable verbose worker logging", + ) + parser.add_argument( + "--rubrics", + nargs="+", + default=["data/rubric.tsv"], + help="Rubric file(s) to use for evaluation (default: data/rubric.tsv)", + ) + parser.add_argument( + "--judge-output", + default="evaluations", + help="Output folder for evaluation results (default: evaluations)", + ) + + # Optional arguments for scoring + parser.add_argument( + "--skip-risk-analysis", action="store_true", help="Skip risk-level analysis" + ) + parser.add_argument( + "--personas-tsv", + default="data/personas.tsv", + help="Path to personas.tsv (default: data/personas.tsv)", + ) + + return parser.parse_args() + + +async def main(): + """Main entry point for the pipeline runner.""" + + print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + print("VERA-MH Pipeline: Generation → Evaluation → Scoring") + print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + print("") + + # Parse command line arguments + args = parse_arguments() + + # Import generate and judge main functions + # We import here to avoid circular dependencies and to allow --debug flag to be set + # Import judge.py main function + # (note: judge.py is a module file, judge/ is a package) + import importlib.util + + from generate import main as generate_main + + spec = importlib.util.spec_from_file_location("judge_script", "judge.py") + judge_script = importlib.util.module_from_spec(spec) + spec.loader.exec_module(judge_script) + judge_main = judge_script.main + + # Set debug mode if flag is provided + if args.debug: + from utils.debug import set_debug + + set_debug(True) + + # ========================================================================= + # Step 1: Generate conversations + # ========================================================================= + print("▶ Step 1/3: Generating conversations...") + + # Build model configs for generation + persona_model_config = { + "model": args.user_agent, + **args.user_agent_extra_params, + } + + agent_model_config = { + "model": args.provider_agent, + "name": args.provider_agent, + **args.provider_agent_extra_params, + } + + # Call generate.py's main function directly + _, conversation_folder = await generate_main( + persona_model_config=persona_model_config, + agent_model_config=agent_model_config, + max_turns=args.turns, + 
runs_per_prompt=args.runs, + persona_extra_run_params={ + k: v + for k, v in persona_model_config.items() + if k not in ["model", "model_name", "name", "temperature", "max_tokens"] + }, + agent_extra_run_params={ + k: v + for k, v in agent_model_config.items() + if k not in ["model", "model_name", "name", "temperature", "max_tokens"] + }, + folder_name=args.folder_name, + run_id=args.run_id, + max_concurrent=args.max_concurrent, + max_total_words=args.max_total_words, + max_personas=args.max_personas, + ) + + print("") + print(f"✓ Conversations saved to: {conversation_folder}/") + print("") + + # Validate that Step 1 produced conversation files + if not os.path.exists(conversation_folder): + print("") + print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + print("❌ Pipeline failed at Step 1: Conversation folder not created") + print("") + print(f"Expected folder: {conversation_folder}") + print("") + print("Troubleshooting:") + print(" - Check that generate.py returned a valid folder path") + print(" - Verify file system permissions") + print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + sys.exit(1) + + # Count conversation files (exclude log files) + conversation_files = [ + f + for f in os.listdir(conversation_folder) + if f.endswith(".txt") and not f.endswith(".log") + ] + + if not conversation_files: + print("") + print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + print("❌ Pipeline failed at Step 1: No conversations were generated") + print("") + print(f"Conversation folder: {conversation_folder}") + print(f"Files in folder: {len(os.listdir(conversation_folder))}") + print("") + print("Possible causes:") + print( + " 1. Invalid model name (check that the model exists in the " + "provider's API)" + ) + print(" 2. API authentication issues (check your API keys in .env)") + print(" 3. API rate limits or quota exceeded") + print(" 4. 
Network connectivity issues") + print("") + print("Troubleshooting:") + print(" - Check files in the conversation folder for error messages") + print(" - Look for API error responses in the output") + print(" - Verify model names are valid for your provider") + print(" - Run generate.py separately to isolate the issue") + print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + sys.exit(1) + + print(f"✓ Validated: {len(conversation_files)} conversation files generated") + print("") + + # ========================================================================= + # Step 2: Evaluate conversations with LLM judge + # ========================================================================= + print("▶ Step 2/3: Evaluating conversations...") + + # Build argparse.Namespace for judge.py's main function + judge_args = argparse.Namespace( + conversation=None, # Not using single conversation mode + folder=conversation_folder, + rubrics=args.rubrics, + judge_model=args.judge_model, + judge_model_extra_params=args.judge_model_extra_params, + limit=args.judge_limit, + output=args.judge_output, + max_concurrent=args.judge_max_concurrent, + per_judge=args.judge_per_judge, + verbose_workers=args.judge_verbose_workers, + ) + + # Call judge.py's main function directly + evaluation_folder = await judge_main(judge_args) + + if not evaluation_folder: + print("") + print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + print("❌ Pipeline failed at Step 2: Judge did not return an evaluation folder") + print("") + print("Troubleshooting:") + print(" - Check error messages from the judge evaluation above") + print(" - Run judge.py separately to isolate the issue") + print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + sys.exit(1) + + # Validate that Step 2 produced evaluation results + if not os.path.exists(evaluation_folder): + print("") + print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + print("❌ Pipeline failed at Step 2: Evaluation folder not created") + print("") + print(f"Expected folder: {evaluation_folder}") + print("") + print("Troubleshooting:") + print(" - Check that judge.py returned a valid folder path") + print(" - Verify file system permissions") + print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + sys.exit(1) + + # Check for results.csv file + results_csv_path = os.path.join(evaluation_folder, "results.csv") + if not os.path.exists(results_csv_path): + print("") + print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + print("❌ Pipeline failed at Step 2: No evaluation results were generated") + print("") + print(f"Evaluation folder: {evaluation_folder}") + print(f"Expected results file: {results_csv_path}") + print("") + + # Check if folder is empty + folder_files = ( + os.listdir(evaluation_folder) if os.path.exists(evaluation_folder) else [] + ) + print(f"Files in evaluation folder: {len(folder_files)}") + if folder_files: + print(" Found: " + ", ".join(folder_files[:5])) + if len(folder_files) > 5: + print(f" ... and {len(folder_files) - 5} more") + + print("") + print("Possible causes:") + print(" 1. All evaluations failed (check judge model name and API access)") + print(" 2. Invalid judge model name") + print(" 3. Judge API authentication issues") + print( + " 4. 
Conversation files from Step 1 contained errors instead of " + "conversations" + ) + print("") + print("Troubleshooting:") + print(" - Check the conversation files from Step 1 for API error messages") + print(" - Look for judge evaluation errors in the output above") + print(" - Verify judge model name is valid") + print( + " - Run judge.py separately on the conversation folder to isolate the " + "issue" + ) + print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + sys.exit(1) + + print("") + print(f"✓ Evaluations saved to: {evaluation_folder}/") + print("✓ Validated: results.csv exists with evaluation data") + print("") + + # ========================================================================= + # Step 3: Score results and create visualizations + # ========================================================================= + print("▶ Step 3/3: Scoring and visualizing results...") + + # Build paths for scoring + results_csv = os.path.join(evaluation_folder, "results.csv") + + # Call score_results for standard analysis + results = score_results(results_csv_path=results_csv) + print_scores(results) + + # Create standard visualizations + viz_path = Path(evaluation_folder) / "scores_visualization.png" + create_visualizations(results, viz_path) + + # Perform risk-level analysis unless skipped + if not args.skip_risk_analysis: + risk_results = score_results_by_risk( + results_csv_path=results_csv, + personas_tsv_path=args.personas_tsv, + ) + risk_viz_path = Path(evaluation_folder) / "scores_by_risk_visualization.png" + create_risk_level_visualizations(risk_results, risk_viz_path) + + # ========================================================================= + # Final summary + # ========================================================================= + print("") + print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + print("✓ Pipeline complete!") + print("") + print("Output Locations:") + print(f" Conversations: {conversation_folder}/") + print(f" Evaluations: {evaluation_folder}/") + print(f" Scores (JSON): {evaluation_folder}/scores.json") + if not args.skip_risk_analysis: + print(f" {evaluation_folder}/scores_by_risk.json") + print(f" Visualizations: {evaluation_folder}/scores_visualization.png") + if not args.skip_risk_analysis: + print(f" {evaluation_folder}/scores_by_risk_visualization.png") + print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/tests/integration/test_evaluation_runner.py b/tests/integration/test_evaluation_runner.py index d69d304b..98691c23 100644 --- a/tests/integration/test_evaluation_runner.py +++ b/tests/integration/test_evaluation_runner.py @@ -451,7 +451,7 @@ async def test_judge_conversations_basic_workflow( question_prompt_file="question_prompt.txt", ) - results = await judge_conversations( + results, _ = await judge_conversations( judge_models={"mock-judge": 1}, conversations=conversations, rubric_config=rubric_config, @@ -509,7 +509,7 @@ async def test_judge_conversations_custom_output_folder( question_prompt_file="question_prompt.txt", ) - results = await judge_conversations( + results, output_folder = await judge_conversations( judge_models={"mock-judge": 1}, conversations=conversations, rubric_config=rubric_config, @@ -555,7 +555,7 @@ async def test_judge_conversations_with_limit( question_prompt_file="question_prompt.txt", ) - results = await judge_conversations( + results, _ = await judge_conversations( judge_models={"mock-judge": 1}, 
conversations=conversations, rubric_config=rubric_config, @@ -716,7 +716,7 @@ async def test_judge_conversations_no_save_aggregated( question_prompt_file="question_prompt.txt", ) - results = await judge_conversations( + results, _ = await judge_conversations( judge_models={"mock-judge": 1}, conversations=conversations, rubric_config=rubric_config, @@ -767,7 +767,7 @@ async def test_load_conversation_with_unicode( question_prompt_file="question_prompt.txt", ) - results = await judge_conversations( + results, _ = await judge_conversations( judge_models={"mock-judge": 1}, conversations=conversations, rubric_config=rubric_config, @@ -811,7 +811,7 @@ async def test_load_conversation_multiline_messages( question_prompt_file="question_prompt.txt", ) - results = await judge_conversations( + results, _ = await judge_conversations( judge_models={"mock-judge": 1}, conversations=conversations, rubric_config=rubric_config, @@ -902,7 +902,7 @@ async def test_results_csv_contains_all_fields( question_prompt_file="question_prompt.txt", ) - await judge_conversations( + _, _ = await judge_conversations( judge_models={"mock-judge": 1}, conversations=conversations, rubric_config=rubric_config, @@ -953,7 +953,7 @@ async def test_metadata_extraction_from_filenames( question_prompt_file="question_prompt.txt", ) - results = await judge_conversations( + results, _ = await judge_conversations( judge_models={"mock-judge": 1}, conversations=conversations, rubric_config=rubric_config, @@ -1002,7 +1002,7 @@ async def test_empty_conversation_file( question_prompt_file="question_prompt.txt", ) - results = await judge_conversations( + results, _ = await judge_conversations( judge_models={"mock-judge": 1}, conversations=conversations, rubric_config=rubric_config, @@ -1046,7 +1046,7 @@ async def test_malformed_conversation_format( question_prompt_file="question_prompt.txt", ) - results = await judge_conversations( + results, _ = await judge_conversations( judge_models={"mock-judge": 1}, conversations=conversations, rubric_config=rubric_config, @@ -1083,7 +1083,7 @@ async def test_special_characters_in_folder_path( question_prompt_file="question_prompt.txt", ) - results = await judge_conversations( + results, _ = await judge_conversations( judge_models={"mock-judge": 1}, conversations=conversations, rubric_config=rubric_config, @@ -1128,7 +1128,7 @@ async def test_very_long_conversation( question_prompt_file="question_prompt.txt", ) - results = await judge_conversations( + results, _ = await judge_conversations( judge_models={"mock-judge": 1}, conversations=conversations, rubric_config=rubric_config, @@ -1170,7 +1170,7 @@ async def test_concurrent_file_writing( question_prompt_file="question_prompt.txt", ) - results = await judge_conversations( + results, _ = await judge_conversations( judge_models={"mock-judge": 1}, conversations=conversations, rubric_config=rubric_config, @@ -1384,7 +1384,7 @@ async def test_judge_conversations_with_multiple_models( question_prompt_file="question_prompt.txt", ) - results = await judge_conversations( + results, _ = await judge_conversations( judge_models={"mock-judge-1": 2, "mock-judge-2": 1}, conversations=conversations, rubric_config=rubric_config, @@ -1563,7 +1563,7 @@ async def test_judge_conversations_passes_concurrency_params( question_prompt_file="question_prompt.txt", ) - results = await judge_conversations( + results, _ = await judge_conversations( judge_models={"mock-judge": 2}, conversations=conversations, rubric_config=rubric_config, @@ -1839,7 +1839,7 @@ async def 
mock_batch_evaluate_empty(*args, **kwargs): question_prompt_file="question_prompt.txt", ) - results = await judge_conversations( + results, _ = await judge_conversations( judge_models={"mock-judge": 1}, conversations=conversations, rubric_config=rubric_config, diff --git a/tests/integration/test_pipeline.py b/tests/integration/test_pipeline.py new file mode 100644 index 00000000..fdf7797b --- /dev/null +++ b/tests/integration/test_pipeline.py @@ -0,0 +1,959 @@ +""" +Integration tests for run_pipeline.py end-to-end pipeline orchestration. + +Tests the three-stage pipeline: generation → evaluation → scoring +Following VERA-MH testing patterns from test_conversation_runner.py + +Note: Full end-to-end execution tests are complex due to module import mechanics. +These tests focus on argument parsing, configuration building, and error paths. +""" + +import argparse +from unittest.mock import patch + +import pytest + +# Fixtures + + +@pytest.fixture +def pipeline_args(): + """Minimal valid pipeline arguments.""" + return argparse.Namespace( + user_agent="claude-sonnet-4-5-20250929", + provider_agent="gpt-4o", + runs=1, + turns=4, + judge_model=["claude-sonnet-4-5-20250929"], + user_agent_extra_params={}, + provider_agent_extra_params={}, + max_total_words=None, + max_concurrent=None, + max_personas=2, + folder_name=None, + run_id=None, + debug=False, + judge_model_extra_params={}, + judge_max_concurrent=None, + judge_per_judge=False, + judge_limit=None, + judge_verbose_workers=False, + rubrics=["data/rubric.tsv"], + judge_output="evaluations", + skip_risk_analysis=False, + personas_tsv="data/personas.tsv", + ) + + +# Test Classes + + +@pytest.mark.integration +class TestPipelineArgumentParsing: + """Test argument parsing and validation.""" + + def test_parse_arguments_required_only(self): + """Test parsing with only required arguments.""" + from run_pipeline import parse_arguments + + test_args = [ + "--user-agent", + "claude-sonnet-4-5-20250929", + "--provider-agent", + "gpt-4o", + "--runs", + "1", + "--turns", + "4", + "--judge-model", + "claude-sonnet-4-5-20250929", + ] + + with patch("sys.argv", ["run_pipeline.py"] + test_args): + args = parse_arguments() + + assert args.user_agent == "claude-sonnet-4-5-20250929" + assert args.provider_agent == "gpt-4o" + assert args.runs == 1 + assert args.turns == 4 + assert args.judge_model == ["claude-sonnet-4-5-20250929"] + + def test_parse_arguments_with_extra_params(self): + """Test parsing with extra model parameters.""" + from run_pipeline import parse_arguments + + test_args = [ + "--user-agent", + "claude-sonnet-4-5-20250929", + "--provider-agent", + "gpt-4o", + "--runs", + "1", + "--turns", + "4", + "--judge-model", + "claude-sonnet-4-5-20250929", + "--user-agent-extra-params", + "temperature=0.7,max_tokens=1000", + "--provider-agent-extra-params", + "temperature=0.5", + "--judge-model-extra-params", + "temperature=0.1", + ] + + with patch("sys.argv", ["run_pipeline.py"] + test_args): + args = parse_arguments() + + assert args.user_agent_extra_params == { + "temperature": 0.7, + "max_tokens": 1000, + } + assert args.provider_agent_extra_params == {"temperature": 0.5} + assert args.judge_model_extra_params == {"temperature": 0.1} + + def test_parse_arguments_multiple_judge_models(self): + """Test parsing with multiple judge models.""" + from run_pipeline import parse_arguments + + test_args = [ + "--user-agent", + "claude-sonnet-4-5-20250929", + "--provider-agent", + "gpt-4o", + "--runs", + "1", + "--turns", + "4", + "--judge-model", + 
"claude-sonnet-4-5-20250929:2", + "gpt-4o", + ] + + with patch("sys.argv", ["run_pipeline.py"] + test_args): + args = parse_arguments() + + assert args.judge_model == ["claude-sonnet-4-5-20250929:2", "gpt-4o"] + + def test_parse_arguments_missing_required(self): + """Test that missing required arguments raises error.""" + from run_pipeline import parse_arguments + + test_args = [ + "--user-agent", + "claude-sonnet-4-5-20250929", + # Missing other required args + ] + + with patch("sys.argv", ["run_pipeline.py"] + test_args): + with pytest.raises(SystemExit): + parse_arguments() + + def test_parse_arguments_optional_flags(self): + """Test parsing optional boolean flags.""" + from run_pipeline import parse_arguments + + test_args = [ + "--user-agent", + "claude-sonnet-4-5-20250929", + "--provider-agent", + "gpt-4o", + "--runs", + "1", + "--turns", + "4", + "--judge-model", + "claude-sonnet-4-5-20250929", + "--debug", + "--judge-per-judge", + "--judge-verbose-workers", + "--skip-risk-analysis", + ] + + with patch("sys.argv", ["run_pipeline.py"] + test_args): + args = parse_arguments() + + assert args.debug is True + assert args.judge_per_judge is True + assert args.judge_verbose_workers is True + assert args.skip_risk_analysis is True + + def test_parse_arguments_with_all_optional_arguments(self): + """Test parsing with all optional arguments provided.""" + from run_pipeline import parse_arguments + + test_args = [ + "--user-agent", + "claude-sonnet-4-5-20250929", + "--provider-agent", + "gpt-4o", + "--runs", + "2", + "--turns", + "10", + "--judge-model", + "claude-sonnet-4-5-20250929:2", + "gpt-4o", + "--user-agent-extra-params", + "temperature=0.7", + "--provider-agent-extra-params", + "temperature=0.5", + "--max-total-words", + "5000", + "--max-concurrent", + "10", + "--max-personas", + "5", + "--folder-name", + "custom_folder", + "--run-id", + "test_run_id", + "--debug", + "--judge-model-extra-params", + "temperature=0.1", + "--judge-max-concurrent", + "5", + "--judge-per-judge", + "--judge-limit", + "10", + "--judge-verbose-workers", + "--rubrics", + "data/rubric.tsv", + "data/custom_rubric.tsv", + "--judge-output", + "custom_output", + "--skip-risk-analysis", + "--personas-tsv", + "custom/personas.tsv", + ] + + with patch("sys.argv", ["run_pipeline.py"] + test_args): + args = parse_arguments() + + # Check all values were parsed correctly + assert args.runs == 2 + assert args.turns == 10 + assert args.max_total_words == 5000 + assert args.max_concurrent == 10 + assert args.max_personas == 5 + assert args.folder_name == "custom_folder" + assert args.run_id == "test_run_id" + assert args.judge_max_concurrent == 5 + assert args.judge_limit == 10 + assert args.rubrics == ["data/rubric.tsv", "data/custom_rubric.tsv"] + assert args.judge_output == "custom_output" + assert args.personas_tsv == "custom/personas.tsv" + + +@pytest.mark.integration +class TestPipelineConfiguration: + """Test configuration building logic from arguments.""" + + def test_persona_model_config_dict_structure(self, pipeline_args): + """Test that persona model config is built with correct structure.""" + # Build config as done in main() + persona_config = { + "model": pipeline_args.user_agent, + **pipeline_args.user_agent_extra_params, + } + + assert "model" in persona_config + assert persona_config["model"] == "claude-sonnet-4-5-20250929" + assert isinstance(persona_config, dict) + + def test_agent_model_config_dict_structure(self, pipeline_args): + """Test that agent model config is built with correct structure.""" + # 
Build config as done in main() + agent_config = { + "model": pipeline_args.provider_agent, + "name": pipeline_args.provider_agent, + **pipeline_args.provider_agent_extra_params, + } + + assert "model" in agent_config + assert "name" in agent_config + assert agent_config["model"] == "gpt-4o" + assert agent_config["name"] == "gpt-4o" + assert isinstance(agent_config, dict) + + def test_extra_params_merge_into_config(self): + """Test that extra params correctly merge into model configs.""" + args = argparse.Namespace( + user_agent="claude-sonnet-4-5-20250929", + provider_agent="gpt-4o", + user_agent_extra_params={"temperature": 0.7, "max_tokens": 1000}, + provider_agent_extra_params={"temperature": 0.5}, + ) + + persona_config = { + "model": args.user_agent, + **args.user_agent_extra_params, + } + + agent_config = { + "model": args.provider_agent, + "name": args.provider_agent, + **args.provider_agent_extra_params, + } + + # Check persona config + assert persona_config["model"] == "claude-sonnet-4-5-20250929" + assert persona_config["temperature"] == 0.7 + assert persona_config["max_tokens"] == 1000 + + # Check agent config + assert agent_config["model"] == "gpt-4o" + assert agent_config["temperature"] == 0.5 + + def test_judge_args_namespace_structure(self, pipeline_args): + """Test that judge args Namespace is constructed correctly.""" + conv_folder = "conversations/test" + + # Build judge args as done in main() + judge_args = argparse.Namespace( + conversation=None, + folder=conv_folder, + rubrics=pipeline_args.rubrics, + judge_model=pipeline_args.judge_model, + judge_model_extra_params=pipeline_args.judge_model_extra_params, + limit=pipeline_args.judge_limit, + output=pipeline_args.judge_output, + max_concurrent=pipeline_args.judge_max_concurrent, + per_judge=pipeline_args.judge_per_judge, + verbose_workers=pipeline_args.judge_verbose_workers, + ) + + # Verify structure + assert isinstance(judge_args, argparse.Namespace) + assert judge_args.conversation is None + assert judge_args.folder == conv_folder + assert judge_args.rubrics == pipeline_args.rubrics + assert judge_args.judge_model == ["claude-sonnet-4-5-20250929"] + assert judge_args.output == pipeline_args.judge_output + + def test_empty_extra_params_dont_pollute_config(self): + """Test that empty extra params don't add unwanted keys.""" + args = argparse.Namespace( + user_agent="claude-sonnet-4-5-20250929", + user_agent_extra_params={}, + ) + + persona_config = { + "model": args.user_agent, + **args.user_agent_extra_params, + } + + # Should only have the model key + assert len(persona_config) == 1 + assert "model" in persona_config + + +@pytest.mark.integration +class TestPipelineDataFlow: + """Test data flow and path construction between stages.""" + + def test_conversation_folder_to_judge_path_construction(self): + """Test that conversation folder path is correctly passed to judge.""" + conv_folder = "conversations/test_20240101_120000" + + # As done in main(): judge receives the folder + judge_args = argparse.Namespace( + folder=conv_folder, + conversation=None, + ) + + assert judge_args.folder == conv_folder + assert judge_args.conversation is None + + def test_evaluation_folder_to_score_path_construction(self): + """Test that evaluation folder path is correctly transformed for score.""" + import os + + eval_folder = "evaluations/test_20240101_120000" + + # As done in main(): score receives results.csv path + results_csv = os.path.join(eval_folder, "results.csv") + + assert results_csv == 
"evaluations/test_20240101_120000/results.csv" + assert results_csv.startswith(eval_folder) + assert results_csv.endswith("results.csv") + + def test_personas_tsv_path_passed_to_score(self, pipeline_args): + """Test that personas.tsv path is correctly passed to score.""" + # As done in main() + personas_tsv_path = pipeline_args.personas_tsv + + assert personas_tsv_path == "data/personas.tsv" + + def test_skip_risk_analysis_flag_passed_to_score(self, pipeline_args): + """Test that skip_risk_analysis flag is correctly passed to score.""" + # As done in main() + skip_risk = pipeline_args.skip_risk_analysis + + assert skip_risk is False # Default value + + # Test with True + pipeline_args.skip_risk_analysis = True + assert pipeline_args.skip_risk_analysis is True + + +@pytest.mark.integration +class TestPipelineNewArguments: + """Test newly added arguments for consistency with individual scripts.""" + + def test_run_id_argument_exists(self, pipeline_args): + """Test that run_id argument exists in pipeline args.""" + assert hasattr(pipeline_args, "run_id") + assert pipeline_args.run_id is None # Default value + + def test_run_id_passed_to_generate(self, pipeline_args): + """Test that run_id is correctly structured for generate_main.""" + # Set custom run_id + pipeline_args.run_id = "custom_test_run" + + # Verify it's accessible + assert pipeline_args.run_id == "custom_test_run" + + def test_rubrics_argument_exists(self, pipeline_args): + """Test that rubrics argument exists in pipeline args.""" + assert hasattr(pipeline_args, "rubrics") + assert pipeline_args.rubrics == ["data/rubric.tsv"] # Default value + + def test_rubrics_passed_to_judge(self, pipeline_args): + """Test that rubrics are correctly passed to judge args.""" + # Set custom rubrics + pipeline_args.rubrics = ["data/rubric.tsv", "data/custom_rubric.tsv"] + + # As done in main(): judge receives these rubrics + judge_args = argparse.Namespace( + rubrics=pipeline_args.rubrics, + ) + + assert judge_args.rubrics == ["data/rubric.tsv", "data/custom_rubric.tsv"] + assert len(judge_args.rubrics) == 2 + + def test_judge_output_argument_exists(self, pipeline_args): + """Test that judge_output argument exists in pipeline args.""" + assert hasattr(pipeline_args, "judge_output") + assert pipeline_args.judge_output == "evaluations" # Default value + + def test_judge_output_passed_to_judge(self, pipeline_args): + """Test that judge_output is correctly passed to judge args.""" + # Set custom output folder + pipeline_args.judge_output = "custom_evaluations" + + # As done in main(): judge receives this output folder + judge_args = argparse.Namespace( + output=pipeline_args.judge_output, + ) + + assert judge_args.output == "custom_evaluations" + + def test_parse_arguments_with_run_id(self): + """Test parsing arguments with --run-id.""" + from run_pipeline import parse_arguments + + test_args = [ + "--user-agent", + "claude-sonnet-4-5-20250929", + "--provider-agent", + "gpt-4o", + "--runs", + "1", + "--turns", + "4", + "--judge-model", + "claude-sonnet-4-5-20250929", + "--run-id", + "test_run_123", + ] + + with patch("sys.argv", ["run_pipeline.py"] + test_args): + args = parse_arguments() + + assert args.run_id == "test_run_123" + + def test_parse_arguments_with_rubrics(self): + """Test parsing arguments with --rubrics.""" + from run_pipeline import parse_arguments + + test_args = [ + "--user-agent", + "claude-sonnet-4-5-20250929", + "--provider-agent", + "gpt-4o", + "--runs", + "1", + "--turns", + "4", + "--judge-model", + 
"claude-sonnet-4-5-20250929", + "--rubrics", + "data/rubric.tsv", + "data/custom_rubric.tsv", + ] + + with patch("sys.argv", ["run_pipeline.py"] + test_args): + args = parse_arguments() + + assert args.rubrics == ["data/rubric.tsv", "data/custom_rubric.tsv"] + + def test_parse_arguments_with_judge_output(self): + """Test parsing arguments with --judge-output.""" + from run_pipeline import parse_arguments + + test_args = [ + "--user-agent", + "claude-sonnet-4-5-20250929", + "--provider-agent", + "gpt-4o", + "--runs", + "1", + "--turns", + "4", + "--judge-model", + "claude-sonnet-4-5-20250929", + "--judge-output", + "custom_evals", + ] + + with patch("sys.argv", ["run_pipeline.py"] + test_args): + args = parse_arguments() + + assert args.judge_output == "custom_evals" + + def test_parse_arguments_defaults_for_new_args(self): + """Test that new arguments have correct defaults.""" + from run_pipeline import parse_arguments + + test_args = [ + "--user-agent", + "claude-sonnet-4-5-20250929", + "--provider-agent", + "gpt-4o", + "--runs", + "1", + "--turns", + "4", + "--judge-model", + "claude-sonnet-4-5-20250929", + ] + + with patch("sys.argv", ["run_pipeline.py"] + test_args): + args = parse_arguments() + + # Check defaults + assert args.run_id is None + assert args.rubrics == ["data/rubric.tsv"] + assert args.judge_output == "evaluations" + + def test_short_flags_for_extra_params(self): + """Test that short flags work for extra params arguments.""" + from run_pipeline import parse_arguments + + test_args = [ + "--user-agent", + "claude-sonnet-4-5-20250929", + "--provider-agent", + "gpt-4o", + "--runs", + "1", + "--turns", + "4", + "--judge-model", + "claude-sonnet-4-5-20250929", + "-uep", + "temperature=0.7,max_tokens=1000", + "-pep", + "temperature=0.5", + "-jep", + "temperature=0.1", + ] + + with patch("sys.argv", ["run_pipeline.py"] + test_args): + args = parse_arguments() + + assert args.user_agent_extra_params == { + "temperature": 0.7, + "max_tokens": 1000, + } + assert args.provider_agent_extra_params == {"temperature": 0.5} + assert args.judge_model_extra_params == {"temperature": 0.1} + + def test_short_flag_for_run_id(self): + """Test that short flag -i works for run-id.""" + from run_pipeline import parse_arguments + + test_args = [ + "--user-agent", + "claude-sonnet-4-5-20250929", + "--provider-agent", + "gpt-4o", + "--runs", + "1", + "--turns", + "4", + "--judge-model", + "claude-sonnet-4-5-20250929", + "-i", + "custom_run", + ] + + with patch("sys.argv", ["run_pipeline.py"] + test_args): + args = parse_arguments() + + assert args.run_id == "custom_run" + + +# Fixtures for validation tests + + +@pytest.fixture +def valid_pipeline_args(): + """Fixture providing valid minimal pipeline arguments.""" + return [ + "run_pipeline.py", + "--user-agent", + "test-model", + "--provider-agent", + "test-model", + "--runs", + "1", + "--turns", + "1", + "--judge-model", + "test-model", + ] + + +@pytest.mark.integration +class TestPipelineValidation: + """Test pipeline validation and error handling for empty folders.""" + + @pytest.mark.asyncio + async def test_step1_validation_folder_not_exists( + self, tmp_path, valid_pipeline_args + ): + """Test that pipeline exits if Step 1 folder doesn't exist.""" + import sys + from unittest.mock import patch + + from run_pipeline import main as pipeline_main + + # Mock generate module's main to return a non-existent folder + async def mock_generate(*args, **kwargs): + return None, str(tmp_path / "nonexistent") + + # Patch generate.main at the source + with 
patch("generate.main", side_effect=mock_generate): + # Mock sys.exit to raise SystemExit instead of actually exiting + with patch.object(sys, "exit", side_effect=SystemExit) as mock_exit: + # Mock importlib to avoid judge loading (not needed for step 1 test) + with patch("importlib.util.spec_from_file_location"): + with patch("sys.argv", valid_pipeline_args): + # Pipeline should raise SystemExit when folder doesn't exist + with pytest.raises(SystemExit): + await pipeline_main() + + # Verify sys.exit(1) was called + mock_exit.assert_called_once_with(1) + + @pytest.mark.asyncio + async def test_step1_validation_no_conversation_files( + self, tmp_path, valid_pipeline_args + ): + """Test that pipeline exits if Step 1 produces no .txt files.""" + import sys + from unittest.mock import patch + + from run_pipeline import main as pipeline_main + + # Create empty conversation folder + conv_folder = tmp_path / "conversations" + conv_folder.mkdir() + + # Mock generate_main to return empty folder + async def mock_generate(*args, **kwargs): + return None, str(conv_folder) + + with patch("generate.main", side_effect=mock_generate): + # Mock sys.exit to raise SystemExit instead of actually exiting + with patch.object(sys, "exit", side_effect=SystemExit) as mock_exit: + with patch("importlib.util.spec_from_file_location"): + with patch("sys.argv", valid_pipeline_args): + # Pipeline should raise SystemExit + with pytest.raises(SystemExit): + await pipeline_main() + + # Verify sys.exit(1) was called + mock_exit.assert_called_once_with(1) + + @pytest.mark.asyncio + async def test_step1_validation_only_log_files(self, tmp_path, valid_pipeline_args): + """Test that pipeline exits if Step 1 only produces .log files.""" + import sys + from unittest.mock import patch + + from run_pipeline import main as pipeline_main + + # Create conversation folder with only .log files + conv_folder = tmp_path / "conversations" + conv_folder.mkdir() + (conv_folder / "conversation1.log").write_text("log content") + (conv_folder / "conversation2.log").write_text("log content") + + # Mock generate_main to return folder with only logs + async def mock_generate(*args, **kwargs): + return None, str(conv_folder) + + with patch("generate.main", side_effect=mock_generate): + # Mock sys.exit to raise SystemExit instead of actually exiting + with patch.object(sys, "exit", side_effect=SystemExit) as mock_exit: + with patch("importlib.util.spec_from_file_location"): + with patch("sys.argv", valid_pipeline_args): + # Pipeline should raise SystemExit + with pytest.raises(SystemExit): + await pipeline_main() + + # Verify sys.exit(1) was called + mock_exit.assert_called_once_with(1) + + @pytest.mark.asyncio + async def test_step2_validation_no_evaluation_folder( + self, tmp_path, valid_pipeline_args + ): + """Test that pipeline exits if Step 2 returns None.""" + import sys + from unittest.mock import MagicMock, patch + + from run_pipeline import main as pipeline_main + + # Create conversation folder with valid files + conv_folder = tmp_path / "conversations" + conv_folder.mkdir() + (conv_folder / "conv1.txt").write_text("User: Hi\nAssistant: Hello") + + # Mock generate_main to return valid folder + async def mock_generate(*args, **kwargs): + return None, str(conv_folder) + + # Mock judge_main to return None + async def mock_judge(args): + return None + + # Create a mock module with the mock judge main function + mock_judge_module = MagicMock() + mock_judge_module.main = mock_judge + + with ( + patch("generate.main", side_effect=mock_generate), + 
patch("importlib.util.module_from_spec", return_value=mock_judge_module), + patch("importlib.util.spec_from_file_location"), + ): + # Mock sys.exit to raise SystemExit instead of actually exiting + with patch.object(sys, "exit", side_effect=SystemExit) as mock_exit: + with patch("sys.argv", valid_pipeline_args): + # Pipeline should raise SystemExit + with pytest.raises(SystemExit): + await pipeline_main() + + # Verify sys.exit(1) was called + mock_exit.assert_called_once_with(1) + + @pytest.mark.asyncio + async def test_step2_validation_folder_not_exists( + self, tmp_path, valid_pipeline_args + ): + """Test that pipeline exits if Step 2 folder doesn't exist.""" + import sys + from unittest.mock import MagicMock, patch + + from run_pipeline import main as pipeline_main + + # Create conversation folder with valid files + conv_folder = tmp_path / "conversations" + conv_folder.mkdir() + (conv_folder / "conv1.txt").write_text("User: Hi\nAssistant: Hello") + + # Mock generate_main to return valid folder + async def mock_generate(*args, **kwargs): + return None, str(conv_folder) + + # Mock judge_main to return non-existent folder + async def mock_judge(args): + return str(tmp_path / "nonexistent_eval") + + # Create a mock module with the mock judge main function + mock_judge_module = MagicMock() + mock_judge_module.main = mock_judge + + with ( + patch("generate.main", side_effect=mock_generate), + patch("importlib.util.module_from_spec", return_value=mock_judge_module), + patch("importlib.util.spec_from_file_location"), + ): + # Mock sys.exit to raise SystemExit instead of actually exiting + with patch.object(sys, "exit", side_effect=SystemExit) as mock_exit: + with patch("sys.argv", valid_pipeline_args): + # Pipeline should raise SystemExit + with pytest.raises(SystemExit): + await pipeline_main() + + # Verify sys.exit(1) was called + mock_exit.assert_called_once_with(1) + + @pytest.mark.asyncio + async def test_step2_validation_no_results_csv(self, tmp_path, valid_pipeline_args): + """Test that pipeline exits if Step 2 produces no results.csv.""" + import sys + from unittest.mock import MagicMock, patch + + from run_pipeline import main as pipeline_main + + # Create conversation folder with valid files + conv_folder = tmp_path / "conversations" + conv_folder.mkdir() + (conv_folder / "conv1.txt").write_text("User: Hi\nAssistant: Hello") + + # Create evaluation folder but no results.csv + eval_folder = tmp_path / "evaluations" + eval_folder.mkdir() + (eval_folder / "some_other_file.json").write_text("{}") + + # Mock generate_main to return valid folder + async def mock_generate(*args, **kwargs): + return None, str(conv_folder) + + # Mock judge_main to return folder without results.csv + async def mock_judge(args): + return str(eval_folder) + + # Create a mock module with the mock judge main function + mock_judge_module = MagicMock() + mock_judge_module.main = mock_judge + + with ( + patch("generate.main", side_effect=mock_generate), + patch("importlib.util.module_from_spec", return_value=mock_judge_module), + patch("importlib.util.spec_from_file_location"), + ): + # Mock sys.exit to raise SystemExit instead of actually exiting + with patch.object(sys, "exit", side_effect=SystemExit) as mock_exit: + with patch("sys.argv", valid_pipeline_args): + # Pipeline should raise SystemExit + with pytest.raises(SystemExit): + await pipeline_main() + + # Verify sys.exit(1) was called + mock_exit.assert_called_once_with(1) + + @pytest.mark.asyncio + async def test_step2_validation_empty_folder_error_message( + 
self, tmp_path, valid_pipeline_args, capsys + ): + """Test that error message lists files when folder is not empty.""" + import sys + from unittest.mock import MagicMock, patch + + from run_pipeline import main as pipeline_main + + # Create conversation folder with valid files + conv_folder = tmp_path / "conversations" + conv_folder.mkdir() + (conv_folder / "conv1.txt").write_text("User: Hi\nAssistant: Hello") + + # Create evaluation folder with some files but no results.csv + eval_folder = tmp_path / "evaluations" + eval_folder.mkdir() + (eval_folder / "file1.json").write_text("{}") + (eval_folder / "file2.json").write_text("{}") + (eval_folder / "file3.log").write_text("log") + + # Mock functions + async def mock_generate(*args, **kwargs): + return None, str(conv_folder) + + async def mock_judge(args): + return str(eval_folder) + + # Create a mock module with the mock judge main function + mock_judge_module = MagicMock() + mock_judge_module.main = mock_judge + + with ( + patch("generate.main", side_effect=mock_generate), + patch("importlib.util.module_from_spec", return_value=mock_judge_module), + patch("importlib.util.spec_from_file_location"), + ): + # Mock sys.exit to raise SystemExit instead of actually exiting + with patch.object(sys, "exit", side_effect=SystemExit) as mock_exit: + with patch("sys.argv", valid_pipeline_args): + # Pipeline should raise SystemExit + with pytest.raises(SystemExit): + await pipeline_main() + + # Capture printed output + captured = capsys.readouterr() + + # Verify error message includes file listing + assert "Files in evaluation folder: 3" in captured.out + assert "Found:" in captured.out + + # Verify sys.exit(1) was called + mock_exit.assert_called_once_with(1) + + @pytest.mark.asyncio + async def test_validation_success_messages( + self, tmp_path, valid_pipeline_args, capsys + ): + """Test that validation success messages are displayed.""" + from unittest.mock import MagicMock, patch + + from run_pipeline import main as pipeline_main + + # Create conversation folder with valid files + conv_folder = tmp_path / "conversations" + conv_folder.mkdir() + (conv_folder / "conv1.txt").write_text("User: Hi\nAssistant: Hello") + (conv_folder / "conv2.txt").write_text("User: Hey\nAssistant: Hi there") + + # Create evaluation folder with results.csv + eval_folder = tmp_path / "evaluations" + eval_folder.mkdir() + (eval_folder / "results.csv").write_text( + "filename,run_id,Safety\nconv1.txt,test,Pass" + ) + + # Mock functions + async def mock_generate(*args, **kwargs): + return None, str(conv_folder) + + async def mock_judge(args): + return str(eval_folder) + + def mock_score(*args, **kwargs): + return {} + + # Create a mock module with the mock judge main function + mock_judge_module = MagicMock() + mock_judge_module.main = mock_judge + + with ( + patch("generate.main", side_effect=mock_generate), + patch("importlib.util.module_from_spec", return_value=mock_judge_module), + patch("importlib.util.spec_from_file_location"), + patch("run_pipeline.score_results", new=mock_score), + patch("run_pipeline.print_scores"), + patch("run_pipeline.create_visualizations"), + ): + with patch("sys.argv", valid_pipeline_args + ["--skip-risk-analysis"]): + await pipeline_main() + + # Capture printed output + captured = capsys.readouterr() + + # Verify success messages + assert "✓ Validated: 2 conversation files generated" in captured.out + assert ( + "✓ Validated: results.csv exists with evaluation data" + in captured.out + ) diff --git a/tests/unit/judge/test_judge_cli.py 
diff --git a/tests/unit/judge/test_judge_cli.py b/tests/unit/judge/test_judge_cli.py
index 00357320..cdbad7bc 100644
--- a/tests/unit/judge/test_judge_cli.py
+++ b/tests/unit/judge/test_judge_cli.py
@@ -35,33 +35,33 @@ def test_single_model_with_count(self):

     def test_multiple_different_models(self):
         """Test parsing multiple different models."""
-        result = parse_judge_models(["gpt-4o", "claude-3-5-sonnet-20241022"])
-        assert result == {"gpt-4o": 1, "claude-3-5-sonnet-20241022": 1}
+        result = parse_judge_models(["gpt-4o", "claude-sonnet-4-5-20250929"])
+        assert result == {"gpt-4o": 1, "claude-sonnet-4-5-20250929": 1}

     def test_multiple_models_with_counts(self):
         """Test parsing multiple models with counts."""
-        result = parse_judge_models(["gpt-4o:2", "claude-3-5-sonnet-20241022:3"])
-        assert result == {"gpt-4o": 2, "claude-3-5-sonnet-20241022": 3}
+        result = parse_judge_models(["gpt-4o:2", "claude-sonnet-4-5-20250929:3"])
+        assert result == {"gpt-4o": 2, "claude-sonnet-4-5-20250929": 3}

     def test_mixed_models_with_and_without_counts(self):
         """Test parsing mix of models with and without counts."""
-        result = parse_judge_models(["gpt-4o", "claude-3-5-sonnet-20241022:2"])
-        assert result == {"gpt-4o": 1, "claude-3-5-sonnet-20241022": 2}
+        result = parse_judge_models(["gpt-4o", "claude-sonnet-4-5-20250929:2"])
+        assert result == {"gpt-4o": 1, "claude-sonnet-4-5-20250929": 2}

     def test_model_with_multiple_colons(self):
         """Test parsing model name that contains colons (e.g., dated model names)."""
         # Should use rsplit to handle model names with colons
-        result = parse_judge_models(["claude-3-5-sonnet-20241022:2"])
-        assert result == {"claude-3-5-sonnet-20241022": 2}
+        result = parse_judge_models(["claude-sonnet-4-5-20250929:2"])
+        assert result == {"claude-sonnet-4-5-20250929": 2}

     def test_three_models_mixed(self):
         """Test parsing three models with various count specifications."""
         result = parse_judge_models(
-            ["gpt-4o:2", "claude-3-5-sonnet-20241022", "gpt-3.5-turbo:3"]
+            ["gpt-4o:2", "claude-sonnet-4-5-20250929", "gpt-3.5-turbo:3"]
         )
         assert result == {
             "gpt-4o": 2,
-            "claude-3-5-sonnet-20241022": 1,
+            "claude-sonnet-4-5-20250929": 1,
             "gpt-3.5-turbo": 3,
         }
diff --git a/tests/unit/judge/test_runner_extra_params.py b/tests/unit/judge/test_runner_extra_params.py
index 642c3061..f9ed1d39 100644
--- a/tests/unit/judge/test_runner_extra_params.py
+++ b/tests/unit/judge/test_runner_extra_params.py
@@ -139,7 +139,7 @@ async def test_judge_conversations_accepts_extra_params(
             }
         ]

-        results = await judge_conversations(
+        results, _ = await judge_conversations(
             judge_models={"claude-3-7-sonnet": 1},
             conversations=[conversation],
             rubric_config=rubric_config,
@@ -182,7 +182,7 @@ async def test_judge_conversations_extra_params_defaults_to_none(
             }
         ]

-        results = await judge_conversations(
+        results, _ = await judge_conversations(
             judge_models={"claude-3-7-sonnet": 1},
             conversations=[conversation],
             rubric_config=rubric_config,
diff --git a/tests/unit/llm_clients/test_claude_llm.py b/tests/unit/llm_clients/test_claude_llm.py
index c5f755a9..2b8b28f0 100644
--- a/tests/unit/llm_clients/test_claude_llm.py
+++ b/tests/unit/llm_clients/test_claude_llm.py
@@ -23,14 +23,14 @@ def test_init_missing_api_key_raises_error(self):
     def test_init_with_default_model(self, mock_chat_anthropic):
         """Test initialization with default model from config."""
         mock_llm = MagicMock()
-        mock_llm.model = "claude-3-5-sonnet-20241022"
+        mock_llm.model = "claude-sonnet-4-5-20250929"
         mock_chat_anthropic.return_value = mock_llm

         llm = ClaudeLLM(name="TestClaude", system_prompt="Test prompt")

         assert llm.name == "TestClaude"
         assert llm.system_prompt == "Test prompt"
-        assert llm.model_name == "claude-3-5-sonnet-20241022"
+        assert llm.model_name == "claude-sonnet-4-5-20250929"
         assert llm.last_response_metadata == {}

     @patch("llm_clients.claude_llm.Config.ANTHROPIC_API_KEY", "test-key")
@@ -50,7 +50,7 @@ def test_init_with_custom_model(self, mock_chat_anthropic):
     def test_init_with_kwargs(self, mock_chat_anthropic):
         """Test initialization with additional kwargs."""
         mock_llm = MagicMock()
-        mock_llm.model = "claude-3-5-sonnet-20241022"
+        mock_llm.model = "claude-sonnet-4-5-20250929"
         mock_chat_anthropic.return_value = mock_llm

         ClaudeLLM(name="TestClaude", temperature=0.5, max_tokens=500, top_p=0.9)
@@ -69,14 +69,14 @@ async def test_generate_response_success_with_system_prompt(
     ):
         """Test successful response generation with system prompt (lines 49-97)."""
         mock_llm = MagicMock()
-        mock_llm.model = "claude-3-5-sonnet-20241022"
+        mock_llm.model = "claude-sonnet-4-5-20250929"

         # Create mock response with metadata
         mock_response = MagicMock()
         mock_response.text = "This is a test response"
         mock_response.id = "msg_12345"
         mock_response.response_metadata = {
-            "model": "claude-3-5-sonnet-20241022",
+            "model": "claude-sonnet-4-5-20250929",
             "usage": {"input_tokens": 10, "output_tokens": 20},
             "stop_reason": "end_turn",
         }
@@ -96,7 +96,7 @@ async def test_generate_response_success_with_system_prompt(
         # Verify metadata was extracted (lines 62-95)
         metadata = llm.get_last_response_metadata()
         assert metadata["response_id"] == "msg_12345"
-        assert metadata["model"] == "claude-3-5-sonnet-20241022"
+        assert metadata["model"] == "claude-sonnet-4-5-20250929"
         assert metadata["provider"] == "claude"
         assert "timestamp" in metadata
         assert "response_time_seconds" in metadata
@@ -112,12 +112,12 @@ async def test_generate_response_success_with_system_prompt(
     async def test_generate_response_without_system_prompt(self, mock_chat_anthropic):
         """Test response generation without system prompt."""
         mock_llm = MagicMock()
-        mock_llm.model = "claude-3-5-sonnet-20241022"
+        mock_llm.model = "claude-sonnet-4-5-20250929"

         mock_response = MagicMock()
         mock_response.text = "Response without system prompt"
         mock_response.id = "msg_67890"
-        mock_response.response_metadata = {"model": "claude-3-5-sonnet-20241022"}
+        mock_response.response_metadata = {"model": "claude-sonnet-4-5-20250929"}

         mock_llm.ainvoke = AsyncMock(return_value=mock_response)
         mock_chat_anthropic.return_value = mock_llm
@@ -142,13 +142,13 @@ async def test_generate_response_without_system_prompt(self, mock_chat_anthropic
     async def test_generate_response_without_usage_metadata(self, mock_chat_anthropic):
         """Test response when usage metadata is not available."""
         mock_llm = MagicMock()
-        mock_llm.model = "claude-3-5-sonnet-20241022"
+        mock_llm.model = "claude-sonnet-4-5-20250929"

         # Response without usage in metadata
         mock_response = MagicMock()
         mock_response.text = "Response"
         mock_response.id = "msg_abc"
-        mock_response.response_metadata = {"model": "claude-3-5-sonnet-20241022"}
+        mock_response.response_metadata = {"model": "claude-sonnet-4-5-20250929"}

         mock_llm.ainvoke = AsyncMock(return_value=mock_response)
         mock_chat_anthropic.return_value = mock_llm
@@ -170,7 +170,7 @@ async def test_generate_response_without_response_metadata(
     ):
         """Test response when response_metadata attribute is missing."""
         mock_llm = MagicMock()
-        mock_llm.model = "claude-3-5-sonnet-20241022"
+        mock_llm.model = "claude-sonnet-4-5-20250929"

         # Response without response_metadata attribute
         mock_response = MagicMock()
@@ -188,7 +188,7 @@ async def test_generate_response_without_response_metadata(
         assert response == "Response"

         metadata = llm.get_last_response_metadata()
-        assert metadata["model"] == "claude-3-5-sonnet-20241022"
+        assert metadata["model"] == "claude-sonnet-4-5-20250929"
         assert metadata["usage"] == {}
         assert metadata["stop_reason"] is None
@@ -198,7 +198,7 @@ async def test_generate_response_without_response_metadata(
     async def test_generate_response_api_error(self, mock_chat_anthropic):
         """Test error handling when API call fails (lines 98-108)."""
         mock_llm = MagicMock()
-        mock_llm.model = "claude-3-5-sonnet-20241022"
+        mock_llm.model = "claude-sonnet-4-5-20250929"

         # Simulate API error
         mock_llm.ainvoke = AsyncMock(side_effect=Exception("API rate limit exceeded"))
@@ -218,7 +218,7 @@ async def test_generate_response_api_error(self, mock_chat_anthropic):
         # Verify error metadata was stored (lines 100-107)
         metadata = llm.get_last_response_metadata()
         assert metadata["response_id"] is None
-        assert metadata["model"] == "claude-3-5-sonnet-20241022"
+        assert metadata["model"] == "claude-sonnet-4-5-20250929"
         assert metadata["provider"] == "claude"
         assert "timestamp" in metadata
         assert "error" in metadata
@@ -231,7 +231,7 @@ async def test_generate_response_api_error(self, mock_chat_anthropic):
     async def test_generate_response_tracks_timing(self, mock_chat_anthropic):
         """Test that response timing is tracked correctly (lines 57-59)."""
         mock_llm = MagicMock()
-        mock_llm.model = "claude-3-5-sonnet-20241022"
+        mock_llm.model = "claude-sonnet-4-5-20250929"

         mock_response = MagicMock()
         mock_response.text = "Timed response"
@@ -256,7 +256,7 @@ def test_get_last_response_metadata_returns_copy(self):
         with patch("llm_clients.claude_llm.Config.ANTHROPIC_API_KEY", "test-key"):
             with patch("llm_clients.claude_llm.ChatAnthropic") as mock_chat:
                 mock_llm = MagicMock()
-                mock_llm.model = "claude-3-5-sonnet-20241022"
+                mock_llm.model = "claude-sonnet-4-5-20250929"
                 mock_chat.return_value = mock_llm

                 llm = ClaudeLLM(name="TestClaude")
@@ -278,7 +278,7 @@ def test_set_system_prompt(self):
         with patch("llm_clients.claude_llm.Config.ANTHROPIC_API_KEY", "test-key"):
             with patch("llm_clients.claude_llm.ChatAnthropic") as mock_chat:
                 mock_llm = MagicMock()
-                mock_llm.model = "claude-3-5-sonnet-20241022"
+                mock_llm.model = "claude-sonnet-4-5-20250929"
                 mock_chat.return_value = mock_llm

                 llm = ClaudeLLM(name="TestClaude", system_prompt="Initial prompt")
@@ -295,14 +295,14 @@ async def test_generate_response_with_partial_usage_metadata(
     ):
         """Test response with incomplete usage metadata."""
         mock_llm = MagicMock()
-        mock_llm.model = "claude-3-5-sonnet-20241022"
+        mock_llm.model = "claude-sonnet-4-5-20250929"

         # Response with partial usage info
         mock_response = MagicMock()
         mock_response.text = "Partial usage response"
         mock_response.id = "msg_partial"
         mock_response.response_metadata = {
-            "model": "claude-3-5-sonnet-20241022",
+            "model": "claude-sonnet-4-5-20250929",
             "usage": {"input_tokens": 15},  # Missing output_tokens
         }
@@ -326,7 +326,7 @@ async def test_metadata_includes_response_object(self, mock_chat_anthropic):
         """Test that metadata includes the full response object (line 74)."""
         mock_llm = MagicMock()
-        mock_llm.model = "claude-3-5-sonnet-20241022"
+        mock_llm.model = "claude-sonnet-4-5-20250929"

         mock_response = MagicMock()
         mock_response.text = "Test"
@@ -351,7 +351,7 @@ async def test_metadata_includes_response_object(self, mock_chat_anthropic):
     async def test_timestamp_format(self, mock_chat_anthropic):
         """Test that timestamp is in ISO format (line 70)."""
         mock_llm = MagicMock()
-        mock_llm.model = "claude-3-5-sonnet-20241022"
+        mock_llm.model = "claude-sonnet-4-5-20250929"

         mock_response = MagicMock()
         mock_response.text = "Test"
@@ -384,13 +384,13 @@ async def test_timestamp_format(self, mock_chat_anthropic):
     async def test_metadata_with_stop_reason(self, mock_chat_anthropic):
         """Test metadata extraction of stop_reason (line 92)."""
         mock_llm = MagicMock()
-        mock_llm.model = "claude-3-5-sonnet-20241022"
+        mock_llm.model = "claude-sonnet-4-5-20250929"

         mock_response = MagicMock()
         mock_response.text = "Stopped response"
         mock_response.id = "msg_stop"
         mock_response.response_metadata = {
-            "model": "claude-3-5-sonnet-20241022",
+            "model": "claude-sonnet-4-5-20250929",
             "stop_reason": "max_tokens",
         }
@@ -411,13 +411,13 @@ async def test_metadata_with_stop_reason(self, mock_chat_anthropic):
     async def test_raw_metadata_stored(self, mock_chat_anthropic):
         """Test that raw metadata is stored (line 95)."""
         mock_llm = MagicMock()
-        mock_llm.model = "claude-3-5-sonnet-20241022"
+        mock_llm.model = "claude-sonnet-4-5-20250929"

         mock_response = MagicMock()
         mock_response.text = "Test"
         mock_response.id = "msg_raw"
         mock_response.response_metadata = {
-            "model": "claude-3-5-sonnet-20241022",
+            "model": "claude-sonnet-4-5-20250929",
             "custom_field": "custom_value",
             "nested": {"key": "value"},
         }
@@ -447,7 +447,7 @@ async def test_generate_response_with_conversation_history(
         mock_response.text = "Response with history"
         mock_response.id = "msg_history"
         mock_response.response_metadata = {
-            "model": "claude-3-5-sonnet-20241022",
+            "model": "claude-sonnet-4-5-20250929",
             "usage": {"input_tokens": 50, "output_tokens": 20},
         }
diff --git a/tests/unit/llm_clients/test_config.py b/tests/unit/llm_clients/test_config.py
index a6e32cf4..89b6d56c 100644
--- a/tests/unit/llm_clients/test_config.py
+++ b/tests/unit/llm_clients/test_config.py
@@ -23,7 +23,7 @@ def test_get_claude_config(self):

         assert isinstance(config, dict)
         assert "model" in config
-        assert config["model"] == "claude-3-5-sonnet-20241022"
+        assert config["model"] == "claude-sonnet-4-5-20250929"
         # Temperature and max_tokens should NOT be in config
         assert "temperature" not in config
         assert "max_tokens" not in config
diff --git a/tests/unit/llm_clients/test_llm_factory.py b/tests/unit/llm_clients/test_llm_factory.py
index 89f916ec..6417fed6 100644
--- a/tests/unit/llm_clients/test_llm_factory.py
+++ b/tests/unit/llm_clients/test_llm_factory.py
@@ -18,7 +18,7 @@ class TestLLMFactory:
     def test_create_claude_llm(self, mock_chat_anthropic):
         """Test that factory correctly creates Claude LLM instance."""
         # Arrange
-        model_name = "claude-3-5-sonnet-20241022"
+        model_name = "claude-sonnet-4-5-20250929"
         name = "TestClaude"
         system_prompt = "You are a helpful assistant."
         mock_chat_anthropic.return_value = MagicMock()
@@ -114,7 +114,7 @@ def test_unsupported_model_raises_error(self):
     def test_factory_passes_kwargs(self, mock_chat_anthropic):
         """Test that factory correctly forwards kwargs to LLM implementations."""
         # Arrange
-        model_name = "claude-3-5-sonnet-20241022"
+        model_name = "claude-sonnet-4-5-20250929"
         name = "TestKwargs"
         temperature = 0.5
         max_tokens = 500
diff --git a/tests/unit/utils/test_model_config_loader.py b/tests/unit/utils/test_model_config_loader.py
index 440a2dfe..d58802ce 100644
--- a/tests/unit/utils/test_model_config_loader.py
+++ b/tests/unit/utils/test_model_config_loader.py
@@ -19,7 +19,7 @@ def test_load_model_config_with_valid_file(self, tmp_path):
                 "persona_depressed": "claude-3-opus",
                 "chatbot_therapist": "claude-3-5-sonnet",
             },
-            "default_model": "claude-3-5-sonnet-20241022",
+            "default_model": "claude-sonnet-4-5-20250929",
             "temperature": 0.7,
         }
@@ -29,7 +29,7 @@ def test_load_model_config_with_valid_file(self, tmp_path):
         result = load_model_config(str(config_file))

         assert result == config_data
-        assert result["default_model"] == "claude-3-5-sonnet-20241022"
+        assert result["default_model"] == "claude-sonnet-4-5-20250929"
         assert result["prompt_models"]["persona_anxious"] == "gpt-4"
         assert result["temperature"] == 0.7
@@ -53,7 +53,7 @@ def test_load_model_config_file_not_found(self, tmp_path, capsys):

         # Should return default config
         assert result["prompt_models"] == {}
-        assert result["default_model"] == "claude-3-5-sonnet-20241022"
+        assert result["default_model"] == "claude-sonnet-4-5-20250929"

         # Should print warning
         captured = capsys.readouterr()
@@ -69,7 +69,7 @@ def test_load_model_config_invalid_json_syntax(self, tmp_path, capsys):

         # Should return default config
         assert result["prompt_models"] == {}
-        assert result["default_model"] == "claude-3-5-sonnet-20241022"
+        assert result["default_model"] == "claude-sonnet-4-5-20250929"

         # Should print error
         captured = capsys.readouterr()
@@ -84,7 +84,7 @@ def test_load_model_config_empty_file(self, tmp_path, capsys):

         # Should return default config
         assert result["prompt_models"] == {}
-        assert result["default_model"] == "claude-3-5-sonnet-20241022"
+        assert result["default_model"] == "claude-sonnet-4-5-20250929"

         captured = capsys.readouterr()
         assert "Error loading model config" in captured.out
@@ -106,7 +106,7 @@ def test_load_model_config_with_unicode_characters(self, tmp_path):
                 "persona_日本語": "gpt-4",
                 "persona_émotionnel": "claude-3-opus",
             },
-            "default_model": "claude-3-5-sonnet-20241022",
+            "default_model": "claude-sonnet-4-5-20250929",
         }

         config_file = tmp_path / "unicode_config.json"
@@ -123,7 +123,7 @@ def test_load_model_config_with_nested_structure(self, tmp_path):
         """Test loading config with nested data structures."""
         config_data = {
             "prompt_models": {"persona_1": "gpt-4"},
-            "default_model": "claude-3-5-sonnet-20241022",
+            "default_model": "claude-sonnet-4-5-20250929",
             "model_params": {
                 "temperature": 0.7,
                 "max_tokens": 1000,
@@ -154,7 +154,7 @@ def test_load_model_config_permission_error(self, tmp_path, capsys):

         # Should return default config
         assert result["prompt_models"] == {}
-        assert result["default_model"] == "claude-3-5-sonnet-20241022"
+        assert result["default_model"] == "claude-sonnet-4-5-20250929"

         # Restore permissions for cleanup
         config_file.chmod(0o644)
@@ -171,7 +171,7 @@ def test_get_model_for_prompt_returns_specific_model(self, tmp_path):
                 "persona_anxious": "gpt-4-turbo",
                 "persona_happy": "claude-3-opus",
             },
-            "default_model": "claude-3-5-sonnet-20241022",
+            "default_model": "claude-sonnet-4-5-20250929",
         }

         config_file = tmp_path / "config.json"
@@ -185,7 +185,7 @@ def test_get_model_for_prompt_returns_default_for_unknown(self, tmp_path):
         """Test getting model for prompt not in config returns default."""
         config_data = {
             "prompt_models": {"persona_known": "gpt-4"},
-            "default_model": "claude-3-5-sonnet-20241022",
+            "default_model": "claude-sonnet-4-5-20250929",
         }

         config_file = tmp_path / "config.json"
@@ -193,7 +193,7 @@ def test_get_model_for_prompt_returns_default_for_unknown(self, tmp_path):

         model = get_model_for_prompt("persona_unknown", str(config_file))

-        assert model == "claude-3-5-sonnet-20241022"
+        assert model == "claude-sonnet-4-5-20250929"

     def test_get_model_for_prompt_with_empty_prompt_models(self, tmp_path):
         """Test getting model when prompt_models is empty."""
@@ -211,7 +211,7 @@ def test_get_model_for_prompt_with_missing_config_file(self):
         model = get_model_for_prompt("test_prompt", "nonexistent_file.json")

         # Should return default model from load_model_config fallback
-        assert model == "claude-3-5-sonnet-20241022"
+        assert model == "claude-sonnet-4-5-20250929"

     def test_get_model_for_prompt_case_sensitivity(self, tmp_path):
         """Test that prompt name matching is case-sensitive."""
@@ -220,7 +220,7 @@ def test_get_model_for_prompt_case_sensitivity(self, tmp_path):
                 "PersonaAnxious": "gpt-4",
                 "persona_anxious": "claude-3-opus",
             },
-            "default_model": "claude-3-5-sonnet-20241022",
+            "default_model": "claude-sonnet-4-5-20250929",
         }

         config_file = tmp_path / "config.json"
@@ -233,7 +233,7 @@ def test_get_model_for_prompt_case_sensitivity(self, tmp_path):

         assert model1 == "gpt-4"
         assert model2 == "claude-3-opus"
-        assert model3 == "claude-3-5-sonnet-20241022"  # Falls back to default
+        assert model3 == "claude-sonnet-4-5-20250929"  # Falls back to default

     def test_get_model_for_prompt_with_special_characters(self, tmp_path):
         """Test prompt names with special characters."""
@@ -243,7 +243,7 @@ def test_get_model_for_prompt_with_special_characters(self, tmp_path):
                 "persona_with_underscores": "claude-3-opus",
                 "persona.with.dots": "gpt-3.5-turbo",
             },
-            "default_model": "claude-3-5-sonnet-20241022",
+            "default_model": "claude-sonnet-4-5-20250929",
         }

         config_file = tmp_path / "config.json"
@@ -263,7 +263,7 @@ def test_get_model_for_prompt_multiple_calls_consistent(self, tmp_path):
         """Test that multiple calls with same prompt return consistent results."""
         config_data = {
             "prompt_models": {"test_prompt": "gpt-4"},
-            "default_model": "claude-3-5-sonnet-20241022",
+            "default_model": "claude-sonnet-4-5-20250929",
         }

         config_file = tmp_path / "config.json"
diff --git a/utils/model_config_loader.py b/utils/model_config_loader.py
index 832ea4d1..02d8c708 100644
--- a/utils/model_config_loader.py
+++ b/utils/model_config_loader.py
@@ -19,10 +19,10 @@ def load_model_config(config_file: str = "model_config.json") -> Dict[str, Any]:
             return json.load(f)
     except FileNotFoundError:
         print(f"Warning: Model config file '{config_file}' not found. Using defaults.")
-        return {"prompt_models": {}, "default_model": "claude-3-5-sonnet-20241022"}
+        return {"prompt_models": {}, "default_model": "claude-sonnet-4-5-20250929"}
     except Exception as e:
         print(f"Error loading model config: {e}")
-        return {"prompt_models": {}, "default_model": "claude-3-5-sonnet-20241022"}
+        return {"prompt_models": {}, "default_model": "claude-sonnet-4-5-20250929"}


 def get_model_for_prompt(