diff --git a/README.md b/README.md index a9d83487..f26543fb 100644 --- a/README.md +++ b/README.md @@ -67,6 +67,36 @@ Where - `j` is the flag for selecting the judge model(s) - `jep` are the judge model extra parameters (optional) +7. **Score and visualize the results**: + ```bash + python -m judge.score -r evaluations/{YOUR_EVAL_FOLDER}/results.csv + ``` + +## Quick Start: End-to-End Pipeline + +For convenience, you can run the entire workflow (generation → evaluation → scoring) with a single command: + +```bash +python3 run_pipeline.py \ + --user-agent claude-sonnet-4-5-20250929 \ + --provider-agent gpt-4o \ + --runs 2 \ + --turns 10 \ + --judge-model claude-sonnet-4-5-20250929 \ + --max-personas 5 +``` + +The pipeline script: +- Runs `generate.py` with your specified arguments +- Automatically passes the output folder to `judge.py` +- Automatically runs `judge/score.py` on the evaluation results +- Displays a summary with all output locations + +For help and all available options: +```bash +python3 run_pipeline.py --help +``` + ### Using Extra Parameters Both `generate.py` and `judge.py` support extra parameters for fine-tuning model behavior: @@ -74,7 +104,7 @@ Both `generate.py` and `judge.py` support extra parameters for fine-tuning model **Generate with temperature control:** ```bash # Lower temperature (0.3) for more consistent responses -python generate.py -u gpt-4o -uep temperature=0.3 -p claude-3-5-sonnet-20241022 -pep temperature=0.5 -t 6 -r 2 +python generate.py -u gpt-4o -uep temperature=0.3 -p claude-sonnet-4-5-20250929 -pep temperature=0.5 -t 6 -r 2 # Higher temperature (1.0) with max tokens python generate.py -u gpt-4o -uep temperature=1,max_tokens=2000 -p gpt-4o -pep temperature=1 -t 6 -r 1 @@ -83,7 +113,7 @@ python generate.py -u gpt-4o -uep temperature=1,max_tokens=2000 -p gpt-4o -pep t **Judge with custom parameters:** ```bash # Use lower temperature for more consistent evaluation -python judge.py -f conversations/my_experiment -j claude-3-5-sonnet-20241022 -jep temperature=0.3 +python judge.py -f conversations/my_experiment -j claude-sonnet-4-5-20250929 -jep temperature=0.3 # Multiple parameters python judge.py -f conversations/my_experiment -j gpt-4o -jep temperature=0.5,max_tokens=1500 diff --git a/generate.py b/generate.py index a723bf77..0264224a 100644 --- a/generate.py +++ b/generate.py @@ -25,7 +25,7 @@ async def main( max_concurrent: Optional[int] = None, max_total_words: Optional[int] = None, max_personas: Optional[int] = None, -) -> List[Dict[str, Any]]: +) -> tuple[List[Dict[str, Any]], str]: """ Generate conversations and return results. @@ -117,7 +117,7 @@ async def main( if verbose: print(f"✅ Generated {len(results)} conversations → {folder_name}/") - return results + return results, folder_name if __name__ == "__main__": @@ -127,7 +127,7 @@ async def main( "--user-agent", "-u", help=( - "Model for the user-agent. Examples: claude-3-5-sonnet-20241022, " + "Model for the user-agent. Examples: claude-sonnet-4-5-20250929, " "gemini-1.5-pro, llama3:8b" ), required=True, @@ -147,7 +147,7 @@ async def main( "--provider-agent", "-p", help=( - "Model for the provider-agent. Examples: claude-3-5-sonnet-20241022, " + "Model for the provider-agent. 
Examples: claude-sonnet-4-5-20250929, " "gemini-1.5-pro, llama3:8b" ), required=True, @@ -255,8 +255,7 @@ async def main( } # TODO: Do the run id here, so that it can be printed when starting - # Note: we are discarding the results, because they are saved to file - _ = asyncio.run( + results, output_folder = asyncio.run( main( persona_model_config=persona_model_config, agent_model_config=agent_model_config, diff --git a/judge.py b/judge.py index 888e7d17..58688790 100644 --- a/judge.py +++ b/judge.py @@ -6,6 +6,7 @@ import argparse import asyncio +from typing import Optional from judge import judge_conversations, judge_single_conversation from judge.llm_judge import LLMJudge @@ -13,7 +14,7 @@ from utils.utils import parse_key_value_list -async def main(args): +async def main(args) -> Optional[str]: """Main async entrypoint for judging conversations.""" # Parse judge models from args (supports "model" or "model:count" format) judge_models = {} @@ -47,6 +48,9 @@ async def main(args): judge_model_extra_params=args.judge_model_extra_params, ) await judge_single_conversation(judge, conversation, args.output) + # Single conversation mode doesn't need output folder for pipeline + print("ℹ️ Single conversation mode: output folder not needed for pipeline") + return None else: # Load all conversations at startup print(f"📂 Loading conversations from {args.folder}...") @@ -58,7 +62,7 @@ async def main(args): folder_name = Path(args.folder).name - await judge_conversations( + _, output_folder = await judge_conversations( judge_models=judge_models, conversations=conversations, rubric_config=rubric_config, @@ -71,6 +75,8 @@ async def main(args): verbose_workers=args.verbose_workers, ) + return output_folder + if __name__ == "__main__": parser = argparse.ArgumentParser( @@ -108,9 +114,9 @@ async def main(args): "Model(s) to use for judging. " "Format: 'model' or 'model:count' for multiple instances. " "Can specify multiple models: --judge-model model1 model2:3. " - "Examples: claude-3-5-sonnet-20241022, " - "claude-3-5-sonnet-20241022:3, " - "claude-3-5-sonnet-20241022:2 gpt-4o:1" + "Examples: claude-sonnet-4-5-20250929, " + "claude-sonnet-4-5-20250929:3, " + "claude-sonnet-4-5-20250929:2 gpt-4o:1" ), ) diff --git a/judge/runner.py b/judge/runner.py index e9c6a490..13d288f3 100644 --- a/judge/runner.py +++ b/judge/runner.py @@ -434,7 +434,7 @@ async def judge_conversations( max_concurrent: Optional[int] = None, per_judge: bool = False, verbose_workers: bool = False, -) -> List[Dict[str, Any]]: +) -> tuple[List[Dict[str, Any]], str]: """ Judge conversations with multiple judge models. 
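These return-value changes all follow one convention: each stage hands back the folder it wrote to, so the next stage can pick it up without re-deriving paths. A minimal sketch of the resulting calling pattern, assuming `conversations` and `rubric_config` are loaded the same way `judge.py` loads them (the sketch is illustrative only and is not part of the patch):

```python
import asyncio

# judge.py imports judge_conversations from the judge package; after this change
# it returns a (results, output_folder) tuple instead of just the results list.
from judge import judge_conversations


async def evaluate(conversations, rubric_config):
    # `conversations` and `rubric_config` are assumed to be prepared as in judge.py;
    # only the unpacking of the new return value is shown here.
    results, output_folder = await judge_conversations(
        judge_models={"claude-sonnet-4-5-20250929": 1},
        conversations=conversations,
        rubric_config=rubric_config,
    )
    print(f"{len(results)} evaluations written to {output_folder}/")
    return output_folder
```

`run_pipeline.py` below applies the same pattern across all three stages, threading `conversation_folder` into the judge step and `evaluation_folder` into scoring.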
@@ -454,8 +454,9 @@ async def judge_conversations( per_judge: If True, max_concurrent applies per judge model; if False, total Returns: - Flattened list of evaluation results with one row per - (conversation, judge_model, judge_instance) tuple + Tuple of (results, output_folder) where results is a flattened list of + evaluation results with one row per (conversation, judge_model, judge_instance) + tuple, and output_folder is the path where evaluations were saved """ if output_folder is None: timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")[:-3] @@ -519,7 +520,7 @@ async def judge_conversations( if verbose: print(f"✅ Completed {len(results)} evaluations → {output_folder}/") - return results + return results, output_folder async def judge_single_conversation( diff --git a/llm_clients/claude_llm.py b/llm_clients/claude_llm.py index d8faa1da..1cbf518e 100644 --- a/llm_clients/claude_llm.py +++ b/llm_clients/claude_llm.py @@ -92,7 +92,7 @@ async def generate_response( msg_type = type(msg).__name__ preview = msg.content[:100] content_preview = preview + "..." if len(msg.content) > 100 else msg.content - debug_print(f" {i+1}. {msg_type}: {content_preview}") + debug_print(f" {i + 1}. {msg_type}: {content_preview}") try: start_time = time.time() diff --git a/llm_clients/config.py b/llm_clients/config.py index bc8d1513..5ac5e760 100644 --- a/llm_clients/config.py +++ b/llm_clients/config.py @@ -36,7 +36,7 @@ def get_claude_config(cls) -> Dict[str, Any]: Returns only the model name. Runtime parameters (temperature, max_tokens) should be passed explicitly via CLI arguments. """ - return {"model": "claude-3-5-sonnet-20241022"} + return {"model": "claude-sonnet-4-5-20250929"} @classmethod def get_openai_config(cls) -> Dict[str, Any]: diff --git a/llm_clients/gemini_llm.py b/llm_clients/gemini_llm.py index 5cfbfa32..4fed8015 100644 --- a/llm_clients/gemini_llm.py +++ b/llm_clients/gemini_llm.py @@ -90,7 +90,7 @@ async def generate_response( msg_type = type(msg).__name__ preview = msg.content[:100] content_preview = preview + "..." if len(msg.content) > 100 else msg.content - debug_print(f" {i+1}. {msg_type}: {content_preview}") + debug_print(f" {i + 1}. {msg_type}: {content_preview}") try: start_time = time.time() diff --git a/llm_clients/llm_factory.py b/llm_clients/llm_factory.py index 80113167..afae8bd6 100644 --- a/llm_clients/llm_factory.py +++ b/llm_clients/llm_factory.py @@ -19,7 +19,7 @@ def create_llm( Args: model_name: The model identifier - (e.g., "claude-3-5-sonnet-20241022", "gpt-4") + (e.g., "claude-sonnet-4-5-20250929", "gpt-4") name: Display name for this LLM instance system_prompt: Optional system prompt **kwargs: Additional model-specific parameters @@ -72,7 +72,7 @@ def create_judge_llm( Args: model_name: The model identifier - (e.g., "claude-3-5-sonnet-20241022", "gpt-4") + (e.g., "claude-sonnet-4-5-20250929", "gpt-4") name: Display name for this LLM instance system_prompt: Optional system prompt **kwargs: Additional model-specific parameters diff --git a/llm_clients/openai_llm.py b/llm_clients/openai_llm.py index 574148d8..508037c4 100644 --- a/llm_clients/openai_llm.py +++ b/llm_clients/openai_llm.py @@ -89,7 +89,7 @@ async def generate_response( msg_type = type(msg).__name__ preview = msg.content[:100] content_preview = preview + "..." if len(msg.content) > 100 else msg.content - debug_print(f" {i+1}. {msg_type}: {content_preview}") + debug_print(f" {i + 1}. 
{msg_type}: {content_preview}") try: start_time = time.time() diff --git a/model_config.json b/model_config.json index 55f164af..f2242114 100644 --- a/model_config.json +++ b/model_config.json @@ -1,15 +1,16 @@ { "prompt_models": { - "assistant": "claude-3-5-sonnet-20241022", + "assistant": "claude-sonnet-4-5-20250929", "philosopher": "claude-3-opus-20240229", "debate_starter": "claude-3-sonnet-20240229", "creative": "claude-3-haiku-20240307", - "scientist": "claude-3-5-sonnet-20241022", - "skeptic": "claude-3-5-sonnet-20241022", + "scientist": "claude-sonnet-4-5-20250929", + "skeptic": "claude-sonnet-4-5-20250929", "gpt_assistant": "gpt-4", "gpt_creative": "gpt-4-turbo", "gpt_analyst": "gpt-3.5-turbo", - "claude-sonnet-4-20250514": "claude-sonnet-4-20250514" + "claude-sonnet-4-20250514": "claude-sonnet-4-20250514", + "claude-sonnet-4-5-20250929": "claude-sonnet-4-5-20250929" }, - "default_model": "claude-3-5-sonnet-20241022" + "default_model": "claude-sonnet-4-5-20250929" } \ No newline at end of file diff --git a/run_pipeline.py b/run_pipeline.py new file mode 100644 index 00000000..6d6dbf80 --- /dev/null +++ b/run_pipeline.py @@ -0,0 +1,436 @@ +#!/usr/bin/env python3 +""" +VERA-MH End-to-End Pipeline Runner (Python version) + +This script orchestrates the complete workflow: + 1. Generate conversations (generate.py) + 2. Evaluate them with LLM judge (judge.py) + 3. Score and visualize results (judge/score.py) + +It automatically passes the output folder from each step to the next step, +so you don't have to manually copy paths between commands. +""" + +import argparse +import asyncio +import os +import sys +from pathlib import Path + +from judge.score import ( + create_risk_level_visualizations, + create_visualizations, + print_scores, + score_results, + score_results_by_risk, +) +from utils.utils import parse_key_value_list + + +def parse_arguments(): + """ + Parse command line arguments and separate them into three groups: + - Arguments for generate.py + - Arguments for judge.py + - Arguments for judge/score.py + """ + parser = argparse.ArgumentParser( + description="VERA-MH Pipeline Runner: Generation → Evaluation → Scoring", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Example: + %(prog)s --user-agent claude-sonnet-4-5-20250929 \\ + --provider-agent gpt-4o \\ + --runs 2 \\ + --turns 10 \\ + --judge-model claude-sonnet-4-5-20250929 \\ + --max-personas 5 + """, + ) + + # Required arguments for generation + parser.add_argument( + "--user-agent", + "-u", + required=True, + help="User/persona model (e.g., claude-sonnet-4-5-20250929)", + ) + parser.add_argument( + "--provider-agent", + "-p", + required=True, + help="Provider/agent model (e.g., gpt-4o)", + ) + parser.add_argument( + "--runs", "-r", type=int, required=True, help="Number of runs per persona" + ) + parser.add_argument( + "--turns", + "-t", + type=int, + required=True, + help="Number of turns per conversation", + ) + + # Required arguments for judge + parser.add_argument( + "--judge-model", + "-j", + nargs="+", + required=True, + help="Judge model(s), format: model or model:count", + ) + + # Optional arguments for generation + parser.add_argument( + "--user-agent-extra-params", + "-uep", + help="Extra params for user agent (e.g., temperature=0.7)", + type=parse_key_value_list, + default={}, + ) + parser.add_argument( + "--provider-agent-extra-params", + "-pep", + help="Extra params for provider agent (e.g., temperature=0.5)", + type=parse_key_value_list, + default={}, + ) + parser.add_argument( + 
"--max-total-words", + "-w", + type=int, + help="Maximum total words per conversation", + ) + parser.add_argument( + "--max-concurrent", type=int, help="Maximum concurrent conversations" + ) + parser.add_argument( + "--max-personas", + type=int, + help="Maximum number of personas to load (for testing)", + ) + parser.add_argument( + "--folder-name", "-f", help="Custom folder name for conversations" + ) + parser.add_argument( + "--run-id", + "-i", + help="Custom run ID for conversation folder (default: timestamp)", + ) + parser.add_argument( + "--debug", action="store_true", help="Enable debug logging for generation" + ) + + # Optional arguments for judge + parser.add_argument( + "--judge-model-extra-params", + "-jep", + help="Extra params for judge model", + type=parse_key_value_list, + default={}, + ) + parser.add_argument( + "--judge-max-concurrent", type=int, help="Maximum concurrent judge workers" + ) + parser.add_argument( + "--judge-per-judge", + action="store_true", + help="Apply concurrency limit per judge", + ) + parser.add_argument( + "--judge-limit", type=int, help="Limit conversations to judge (for testing)" + ) + parser.add_argument( + "--judge-verbose-workers", + action="store_true", + help="Enable verbose worker logging", + ) + parser.add_argument( + "--rubrics", + nargs="+", + default=["data/rubric.tsv"], + help="Rubric file(s) to use for evaluation (default: data/rubric.tsv)", + ) + parser.add_argument( + "--judge-output", + default="evaluations", + help="Output folder for evaluation results (default: evaluations)", + ) + + # Optional arguments for scoring + parser.add_argument( + "--skip-risk-analysis", action="store_true", help="Skip risk-level analysis" + ) + parser.add_argument( + "--personas-tsv", + default="data/personas.tsv", + help="Path to personas.tsv (default: data/personas.tsv)", + ) + + return parser.parse_args() + + +async def main(): + """Main entry point for the pipeline runner.""" + + print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + print("VERA-MH Pipeline: Generation → Evaluation → Scoring") + print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + print("") + + # Parse command line arguments + args = parse_arguments() + + # Import generate and judge main functions + # We import here to avoid circular dependencies and to allow --debug flag to be set + # Import judge.py main function + # (note: judge.py is a module file, judge/ is a package) + import importlib.util + + from generate import main as generate_main + + spec = importlib.util.spec_from_file_location("judge_script", "judge.py") + judge_script = importlib.util.module_from_spec(spec) + spec.loader.exec_module(judge_script) + judge_main = judge_script.main + + # Set debug mode if flag is provided + if args.debug: + from utils.debug import set_debug + + set_debug(True) + + # ========================================================================= + # Step 1: Generate conversations + # ========================================================================= + print("▶ Step 1/3: Generating conversations...") + + # Build model configs for generation + persona_model_config = { + "model": args.user_agent, + **args.user_agent_extra_params, + } + + agent_model_config = { + "model": args.provider_agent, + "name": args.provider_agent, + **args.provider_agent_extra_params, + } + + # Call generate.py's main function directly + _, conversation_folder = await generate_main( + persona_model_config=persona_model_config, + agent_model_config=agent_model_config, + max_turns=args.turns, + 
runs_per_prompt=args.runs, + persona_extra_run_params={ + k: v + for k, v in persona_model_config.items() + if k not in ["model", "model_name", "name", "temperature", "max_tokens"] + }, + agent_extra_run_params={ + k: v + for k, v in agent_model_config.items() + if k not in ["model", "model_name", "name", "temperature", "max_tokens"] + }, + folder_name=args.folder_name, + run_id=args.run_id, + max_concurrent=args.max_concurrent, + max_total_words=args.max_total_words, + max_personas=args.max_personas, + ) + + print("") + print(f"✓ Conversations saved to: {conversation_folder}/") + print("") + + # Validate that Step 1 produced conversation files + if not os.path.exists(conversation_folder): + print("") + print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + print("❌ Pipeline failed at Step 1: Conversation folder not created") + print("") + print(f"Expected folder: {conversation_folder}") + print("") + print("Troubleshooting:") + print(" - Check that generate.py returned a valid folder path") + print(" - Verify file system permissions") + print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + sys.exit(1) + + # Count conversation files (exclude log files) + conversation_files = [ + f + for f in os.listdir(conversation_folder) + if f.endswith(".txt") and not f.endswith(".log") + ] + + if not conversation_files: + print("") + print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + print("❌ Pipeline failed at Step 1: No conversations were generated") + print("") + print(f"Conversation folder: {conversation_folder}") + print(f"Files in folder: {len(os.listdir(conversation_folder))}") + print("") + print("Possible causes:") + print( + " 1. Invalid model name (check that the model exists in the " + "provider's API)" + ) + print(" 2. API authentication issues (check your API keys in .env)") + print(" 3. API rate limits or quota exceeded") + print(" 4. 
Network connectivity issues") + print("") + print("Troubleshooting:") + print(" - Check files in the conversation folder for error messages") + print(" - Look for API error responses in the output") + print(" - Verify model names are valid for your provider") + print(" - Run generate.py separately to isolate the issue") + print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + sys.exit(1) + + print(f"✓ Validated: {len(conversation_files)} conversation files generated") + print("") + + # ========================================================================= + # Step 2: Evaluate conversations with LLM judge + # ========================================================================= + print("▶ Step 2/3: Evaluating conversations...") + + # Build argparse.Namespace for judge.py's main function + judge_args = argparse.Namespace( + conversation=None, # Not using single conversation mode + folder=conversation_folder, + rubrics=args.rubrics, + judge_model=args.judge_model, + judge_model_extra_params=args.judge_model_extra_params, + limit=args.judge_limit, + output=args.judge_output, + max_concurrent=args.judge_max_concurrent, + per_judge=args.judge_per_judge, + verbose_workers=args.judge_verbose_workers, + ) + + # Call judge.py's main function directly + evaluation_folder = await judge_main(judge_args) + + if not evaluation_folder: + print("") + print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + print("❌ Pipeline failed at Step 2: Judge did not return an evaluation folder") + print("") + print("Troubleshooting:") + print(" - Check error messages from the judge evaluation above") + print(" - Run judge.py separately to isolate the issue") + print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + sys.exit(1) + + # Validate that Step 2 produced evaluation results + if not os.path.exists(evaluation_folder): + print("") + print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + print("❌ Pipeline failed at Step 2: Evaluation folder not created") + print("") + print(f"Expected folder: {evaluation_folder}") + print("") + print("Troubleshooting:") + print(" - Check that judge.py returned a valid folder path") + print(" - Verify file system permissions") + print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + sys.exit(1) + + # Check for results.csv file + results_csv_path = os.path.join(evaluation_folder, "results.csv") + if not os.path.exists(results_csv_path): + print("") + print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + print("❌ Pipeline failed at Step 2: No evaluation results were generated") + print("") + print(f"Evaluation folder: {evaluation_folder}") + print(f"Expected results file: {results_csv_path}") + print("") + + # Check if folder is empty + folder_files = ( + os.listdir(evaluation_folder) if os.path.exists(evaluation_folder) else [] + ) + print(f"Files in evaluation folder: {len(folder_files)}") + if folder_files: + print(" Found: " + ", ".join(folder_files[:5])) + if len(folder_files) > 5: + print(f" ... and {len(folder_files) - 5} more") + + print("") + print("Possible causes:") + print(" 1. All evaluations failed (check judge model name and API access)") + print(" 2. Invalid judge model name") + print(" 3. Judge API authentication issues") + print( + " 4. 
Conversation files from Step 1 contained errors instead of " + "conversations" + ) + print("") + print("Troubleshooting:") + print(" - Check the conversation files from Step 1 for API error messages") + print(" - Look for judge evaluation errors in the output above") + print(" - Verify judge model name is valid") + print( + " - Run judge.py separately on the conversation folder to isolate the " + "issue" + ) + print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + sys.exit(1) + + print("") + print(f"✓ Evaluations saved to: {evaluation_folder}/") + print("✓ Validated: results.csv exists with evaluation data") + print("") + + # ========================================================================= + # Step 3: Score results and create visualizations + # ========================================================================= + print("▶ Step 3/3: Scoring and visualizing results...") + + # Build paths for scoring + results_csv = os.path.join(evaluation_folder, "results.csv") + + # Call score_results for standard analysis + results = score_results(results_csv_path=results_csv) + print_scores(results) + + # Create standard visualizations + viz_path = Path(evaluation_folder) / "scores_visualization.png" + create_visualizations(results, viz_path) + + # Perform risk-level analysis unless skipped + if not args.skip_risk_analysis: + risk_results = score_results_by_risk( + results_csv_path=results_csv, + personas_tsv_path=args.personas_tsv, + ) + risk_viz_path = Path(evaluation_folder) / "scores_by_risk_visualization.png" + create_risk_level_visualizations(risk_results, risk_viz_path) + + # ========================================================================= + # Final summary + # ========================================================================= + print("") + print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + print("✓ Pipeline complete!") + print("") + print("Output Locations:") + print(f" Conversations: {conversation_folder}/") + print(f" Evaluations: {evaluation_folder}/") + print(f" Scores (JSON): {evaluation_folder}/scores.json") + if not args.skip_risk_analysis: + print(f" {evaluation_folder}/scores_by_risk.json") + print(f" Visualizations: {evaluation_folder}/scores_visualization.png") + if not args.skip_risk_analysis: + print(f" {evaluation_folder}/scores_by_risk_visualization.png") + print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/tests/integration/test_evaluation_runner.py b/tests/integration/test_evaluation_runner.py index d69d304b..98691c23 100644 --- a/tests/integration/test_evaluation_runner.py +++ b/tests/integration/test_evaluation_runner.py @@ -451,7 +451,7 @@ async def test_judge_conversations_basic_workflow( question_prompt_file="question_prompt.txt", ) - results = await judge_conversations( + results, _ = await judge_conversations( judge_models={"mock-judge": 1}, conversations=conversations, rubric_config=rubric_config, @@ -509,7 +509,7 @@ async def test_judge_conversations_custom_output_folder( question_prompt_file="question_prompt.txt", ) - results = await judge_conversations( + results, output_folder = await judge_conversations( judge_models={"mock-judge": 1}, conversations=conversations, rubric_config=rubric_config, @@ -555,7 +555,7 @@ async def test_judge_conversations_with_limit( question_prompt_file="question_prompt.txt", ) - results = await judge_conversations( + results, _ = await judge_conversations( judge_models={"mock-judge": 1}, 
conversations=conversations, rubric_config=rubric_config, @@ -716,7 +716,7 @@ async def test_judge_conversations_no_save_aggregated( question_prompt_file="question_prompt.txt", ) - results = await judge_conversations( + results, _ = await judge_conversations( judge_models={"mock-judge": 1}, conversations=conversations, rubric_config=rubric_config, @@ -767,7 +767,7 @@ async def test_load_conversation_with_unicode( question_prompt_file="question_prompt.txt", ) - results = await judge_conversations( + results, _ = await judge_conversations( judge_models={"mock-judge": 1}, conversations=conversations, rubric_config=rubric_config, @@ -811,7 +811,7 @@ async def test_load_conversation_multiline_messages( question_prompt_file="question_prompt.txt", ) - results = await judge_conversations( + results, _ = await judge_conversations( judge_models={"mock-judge": 1}, conversations=conversations, rubric_config=rubric_config, @@ -902,7 +902,7 @@ async def test_results_csv_contains_all_fields( question_prompt_file="question_prompt.txt", ) - await judge_conversations( + _, _ = await judge_conversations( judge_models={"mock-judge": 1}, conversations=conversations, rubric_config=rubric_config, @@ -953,7 +953,7 @@ async def test_metadata_extraction_from_filenames( question_prompt_file="question_prompt.txt", ) - results = await judge_conversations( + results, _ = await judge_conversations( judge_models={"mock-judge": 1}, conversations=conversations, rubric_config=rubric_config, @@ -1002,7 +1002,7 @@ async def test_empty_conversation_file( question_prompt_file="question_prompt.txt", ) - results = await judge_conversations( + results, _ = await judge_conversations( judge_models={"mock-judge": 1}, conversations=conversations, rubric_config=rubric_config, @@ -1046,7 +1046,7 @@ async def test_malformed_conversation_format( question_prompt_file="question_prompt.txt", ) - results = await judge_conversations( + results, _ = await judge_conversations( judge_models={"mock-judge": 1}, conversations=conversations, rubric_config=rubric_config, @@ -1083,7 +1083,7 @@ async def test_special_characters_in_folder_path( question_prompt_file="question_prompt.txt", ) - results = await judge_conversations( + results, _ = await judge_conversations( judge_models={"mock-judge": 1}, conversations=conversations, rubric_config=rubric_config, @@ -1128,7 +1128,7 @@ async def test_very_long_conversation( question_prompt_file="question_prompt.txt", ) - results = await judge_conversations( + results, _ = await judge_conversations( judge_models={"mock-judge": 1}, conversations=conversations, rubric_config=rubric_config, @@ -1170,7 +1170,7 @@ async def test_concurrent_file_writing( question_prompt_file="question_prompt.txt", ) - results = await judge_conversations( + results, _ = await judge_conversations( judge_models={"mock-judge": 1}, conversations=conversations, rubric_config=rubric_config, @@ -1384,7 +1384,7 @@ async def test_judge_conversations_with_multiple_models( question_prompt_file="question_prompt.txt", ) - results = await judge_conversations( + results, _ = await judge_conversations( judge_models={"mock-judge-1": 2, "mock-judge-2": 1}, conversations=conversations, rubric_config=rubric_config, @@ -1563,7 +1563,7 @@ async def test_judge_conversations_passes_concurrency_params( question_prompt_file="question_prompt.txt", ) - results = await judge_conversations( + results, _ = await judge_conversations( judge_models={"mock-judge": 2}, conversations=conversations, rubric_config=rubric_config, @@ -1839,7 +1839,7 @@ async def 
mock_batch_evaluate_empty(*args, **kwargs): question_prompt_file="question_prompt.txt", ) - results = await judge_conversations( + results, _ = await judge_conversations( judge_models={"mock-judge": 1}, conversations=conversations, rubric_config=rubric_config, diff --git a/tests/integration/test_pipeline.py b/tests/integration/test_pipeline.py new file mode 100644 index 00000000..fdf7797b --- /dev/null +++ b/tests/integration/test_pipeline.py @@ -0,0 +1,959 @@ +""" +Integration tests for run_pipeline.py end-to-end pipeline orchestration. + +Tests the three-stage pipeline: generation → evaluation → scoring +Following VERA-MH testing patterns from test_conversation_runner.py + +Note: Full end-to-end execution tests are complex due to module import mechanics. +These tests focus on argument parsing, configuration building, and error paths. +""" + +import argparse +from unittest.mock import patch + +import pytest + +# Fixtures + + +@pytest.fixture +def pipeline_args(): + """Minimal valid pipeline arguments.""" + return argparse.Namespace( + user_agent="claude-sonnet-4-5-20250929", + provider_agent="gpt-4o", + runs=1, + turns=4, + judge_model=["claude-sonnet-4-5-20250929"], + user_agent_extra_params={}, + provider_agent_extra_params={}, + max_total_words=None, + max_concurrent=None, + max_personas=2, + folder_name=None, + run_id=None, + debug=False, + judge_model_extra_params={}, + judge_max_concurrent=None, + judge_per_judge=False, + judge_limit=None, + judge_verbose_workers=False, + rubrics=["data/rubric.tsv"], + judge_output="evaluations", + skip_risk_analysis=False, + personas_tsv="data/personas.tsv", + ) + + +# Test Classes + + +@pytest.mark.integration +class TestPipelineArgumentParsing: + """Test argument parsing and validation.""" + + def test_parse_arguments_required_only(self): + """Test parsing with only required arguments.""" + from run_pipeline import parse_arguments + + test_args = [ + "--user-agent", + "claude-sonnet-4-5-20250929", + "--provider-agent", + "gpt-4o", + "--runs", + "1", + "--turns", + "4", + "--judge-model", + "claude-sonnet-4-5-20250929", + ] + + with patch("sys.argv", ["run_pipeline.py"] + test_args): + args = parse_arguments() + + assert args.user_agent == "claude-sonnet-4-5-20250929" + assert args.provider_agent == "gpt-4o" + assert args.runs == 1 + assert args.turns == 4 + assert args.judge_model == ["claude-sonnet-4-5-20250929"] + + def test_parse_arguments_with_extra_params(self): + """Test parsing with extra model parameters.""" + from run_pipeline import parse_arguments + + test_args = [ + "--user-agent", + "claude-sonnet-4-5-20250929", + "--provider-agent", + "gpt-4o", + "--runs", + "1", + "--turns", + "4", + "--judge-model", + "claude-sonnet-4-5-20250929", + "--user-agent-extra-params", + "temperature=0.7,max_tokens=1000", + "--provider-agent-extra-params", + "temperature=0.5", + "--judge-model-extra-params", + "temperature=0.1", + ] + + with patch("sys.argv", ["run_pipeline.py"] + test_args): + args = parse_arguments() + + assert args.user_agent_extra_params == { + "temperature": 0.7, + "max_tokens": 1000, + } + assert args.provider_agent_extra_params == {"temperature": 0.5} + assert args.judge_model_extra_params == {"temperature": 0.1} + + def test_parse_arguments_multiple_judge_models(self): + """Test parsing with multiple judge models.""" + from run_pipeline import parse_arguments + + test_args = [ + "--user-agent", + "claude-sonnet-4-5-20250929", + "--provider-agent", + "gpt-4o", + "--runs", + "1", + "--turns", + "4", + "--judge-model", + 
"claude-sonnet-4-5-20250929:2", + "gpt-4o", + ] + + with patch("sys.argv", ["run_pipeline.py"] + test_args): + args = parse_arguments() + + assert args.judge_model == ["claude-sonnet-4-5-20250929:2", "gpt-4o"] + + def test_parse_arguments_missing_required(self): + """Test that missing required arguments raises error.""" + from run_pipeline import parse_arguments + + test_args = [ + "--user-agent", + "claude-sonnet-4-5-20250929", + # Missing other required args + ] + + with patch("sys.argv", ["run_pipeline.py"] + test_args): + with pytest.raises(SystemExit): + parse_arguments() + + def test_parse_arguments_optional_flags(self): + """Test parsing optional boolean flags.""" + from run_pipeline import parse_arguments + + test_args = [ + "--user-agent", + "claude-sonnet-4-5-20250929", + "--provider-agent", + "gpt-4o", + "--runs", + "1", + "--turns", + "4", + "--judge-model", + "claude-sonnet-4-5-20250929", + "--debug", + "--judge-per-judge", + "--judge-verbose-workers", + "--skip-risk-analysis", + ] + + with patch("sys.argv", ["run_pipeline.py"] + test_args): + args = parse_arguments() + + assert args.debug is True + assert args.judge_per_judge is True + assert args.judge_verbose_workers is True + assert args.skip_risk_analysis is True + + def test_parse_arguments_with_all_optional_arguments(self): + """Test parsing with all optional arguments provided.""" + from run_pipeline import parse_arguments + + test_args = [ + "--user-agent", + "claude-sonnet-4-5-20250929", + "--provider-agent", + "gpt-4o", + "--runs", + "2", + "--turns", + "10", + "--judge-model", + "claude-sonnet-4-5-20250929:2", + "gpt-4o", + "--user-agent-extra-params", + "temperature=0.7", + "--provider-agent-extra-params", + "temperature=0.5", + "--max-total-words", + "5000", + "--max-concurrent", + "10", + "--max-personas", + "5", + "--folder-name", + "custom_folder", + "--run-id", + "test_run_id", + "--debug", + "--judge-model-extra-params", + "temperature=0.1", + "--judge-max-concurrent", + "5", + "--judge-per-judge", + "--judge-limit", + "10", + "--judge-verbose-workers", + "--rubrics", + "data/rubric.tsv", + "data/custom_rubric.tsv", + "--judge-output", + "custom_output", + "--skip-risk-analysis", + "--personas-tsv", + "custom/personas.tsv", + ] + + with patch("sys.argv", ["run_pipeline.py"] + test_args): + args = parse_arguments() + + # Check all values were parsed correctly + assert args.runs == 2 + assert args.turns == 10 + assert args.max_total_words == 5000 + assert args.max_concurrent == 10 + assert args.max_personas == 5 + assert args.folder_name == "custom_folder" + assert args.run_id == "test_run_id" + assert args.judge_max_concurrent == 5 + assert args.judge_limit == 10 + assert args.rubrics == ["data/rubric.tsv", "data/custom_rubric.tsv"] + assert args.judge_output == "custom_output" + assert args.personas_tsv == "custom/personas.tsv" + + +@pytest.mark.integration +class TestPipelineConfiguration: + """Test configuration building logic from arguments.""" + + def test_persona_model_config_dict_structure(self, pipeline_args): + """Test that persona model config is built with correct structure.""" + # Build config as done in main() + persona_config = { + "model": pipeline_args.user_agent, + **pipeline_args.user_agent_extra_params, + } + + assert "model" in persona_config + assert persona_config["model"] == "claude-sonnet-4-5-20250929" + assert isinstance(persona_config, dict) + + def test_agent_model_config_dict_structure(self, pipeline_args): + """Test that agent model config is built with correct structure.""" + # 
Build config as done in main() + agent_config = { + "model": pipeline_args.provider_agent, + "name": pipeline_args.provider_agent, + **pipeline_args.provider_agent_extra_params, + } + + assert "model" in agent_config + assert "name" in agent_config + assert agent_config["model"] == "gpt-4o" + assert agent_config["name"] == "gpt-4o" + assert isinstance(agent_config, dict) + + def test_extra_params_merge_into_config(self): + """Test that extra params correctly merge into model configs.""" + args = argparse.Namespace( + user_agent="claude-sonnet-4-5-20250929", + provider_agent="gpt-4o", + user_agent_extra_params={"temperature": 0.7, "max_tokens": 1000}, + provider_agent_extra_params={"temperature": 0.5}, + ) + + persona_config = { + "model": args.user_agent, + **args.user_agent_extra_params, + } + + agent_config = { + "model": args.provider_agent, + "name": args.provider_agent, + **args.provider_agent_extra_params, + } + + # Check persona config + assert persona_config["model"] == "claude-sonnet-4-5-20250929" + assert persona_config["temperature"] == 0.7 + assert persona_config["max_tokens"] == 1000 + + # Check agent config + assert agent_config["model"] == "gpt-4o" + assert agent_config["temperature"] == 0.5 + + def test_judge_args_namespace_structure(self, pipeline_args): + """Test that judge args Namespace is constructed correctly.""" + conv_folder = "conversations/test" + + # Build judge args as done in main() + judge_args = argparse.Namespace( + conversation=None, + folder=conv_folder, + rubrics=pipeline_args.rubrics, + judge_model=pipeline_args.judge_model, + judge_model_extra_params=pipeline_args.judge_model_extra_params, + limit=pipeline_args.judge_limit, + output=pipeline_args.judge_output, + max_concurrent=pipeline_args.judge_max_concurrent, + per_judge=pipeline_args.judge_per_judge, + verbose_workers=pipeline_args.judge_verbose_workers, + ) + + # Verify structure + assert isinstance(judge_args, argparse.Namespace) + assert judge_args.conversation is None + assert judge_args.folder == conv_folder + assert judge_args.rubrics == pipeline_args.rubrics + assert judge_args.judge_model == ["claude-sonnet-4-5-20250929"] + assert judge_args.output == pipeline_args.judge_output + + def test_empty_extra_params_dont_pollute_config(self): + """Test that empty extra params don't add unwanted keys.""" + args = argparse.Namespace( + user_agent="claude-sonnet-4-5-20250929", + user_agent_extra_params={}, + ) + + persona_config = { + "model": args.user_agent, + **args.user_agent_extra_params, + } + + # Should only have the model key + assert len(persona_config) == 1 + assert "model" in persona_config + + +@pytest.mark.integration +class TestPipelineDataFlow: + """Test data flow and path construction between stages.""" + + def test_conversation_folder_to_judge_path_construction(self): + """Test that conversation folder path is correctly passed to judge.""" + conv_folder = "conversations/test_20240101_120000" + + # As done in main(): judge receives the folder + judge_args = argparse.Namespace( + folder=conv_folder, + conversation=None, + ) + + assert judge_args.folder == conv_folder + assert judge_args.conversation is None + + def test_evaluation_folder_to_score_path_construction(self): + """Test that evaluation folder path is correctly transformed for score.""" + import os + + eval_folder = "evaluations/test_20240101_120000" + + # As done in main(): score receives results.csv path + results_csv = os.path.join(eval_folder, "results.csv") + + assert results_csv == 
"evaluations/test_20240101_120000/results.csv" + assert results_csv.startswith(eval_folder) + assert results_csv.endswith("results.csv") + + def test_personas_tsv_path_passed_to_score(self, pipeline_args): + """Test that personas.tsv path is correctly passed to score.""" + # As done in main() + personas_tsv_path = pipeline_args.personas_tsv + + assert personas_tsv_path == "data/personas.tsv" + + def test_skip_risk_analysis_flag_passed_to_score(self, pipeline_args): + """Test that skip_risk_analysis flag is correctly passed to score.""" + # As done in main() + skip_risk = pipeline_args.skip_risk_analysis + + assert skip_risk is False # Default value + + # Test with True + pipeline_args.skip_risk_analysis = True + assert pipeline_args.skip_risk_analysis is True + + +@pytest.mark.integration +class TestPipelineNewArguments: + """Test newly added arguments for consistency with individual scripts.""" + + def test_run_id_argument_exists(self, pipeline_args): + """Test that run_id argument exists in pipeline args.""" + assert hasattr(pipeline_args, "run_id") + assert pipeline_args.run_id is None # Default value + + def test_run_id_passed_to_generate(self, pipeline_args): + """Test that run_id is correctly structured for generate_main.""" + # Set custom run_id + pipeline_args.run_id = "custom_test_run" + + # Verify it's accessible + assert pipeline_args.run_id == "custom_test_run" + + def test_rubrics_argument_exists(self, pipeline_args): + """Test that rubrics argument exists in pipeline args.""" + assert hasattr(pipeline_args, "rubrics") + assert pipeline_args.rubrics == ["data/rubric.tsv"] # Default value + + def test_rubrics_passed_to_judge(self, pipeline_args): + """Test that rubrics are correctly passed to judge args.""" + # Set custom rubrics + pipeline_args.rubrics = ["data/rubric.tsv", "data/custom_rubric.tsv"] + + # As done in main(): judge receives these rubrics + judge_args = argparse.Namespace( + rubrics=pipeline_args.rubrics, + ) + + assert judge_args.rubrics == ["data/rubric.tsv", "data/custom_rubric.tsv"] + assert len(judge_args.rubrics) == 2 + + def test_judge_output_argument_exists(self, pipeline_args): + """Test that judge_output argument exists in pipeline args.""" + assert hasattr(pipeline_args, "judge_output") + assert pipeline_args.judge_output == "evaluations" # Default value + + def test_judge_output_passed_to_judge(self, pipeline_args): + """Test that judge_output is correctly passed to judge args.""" + # Set custom output folder + pipeline_args.judge_output = "custom_evaluations" + + # As done in main(): judge receives this output folder + judge_args = argparse.Namespace( + output=pipeline_args.judge_output, + ) + + assert judge_args.output == "custom_evaluations" + + def test_parse_arguments_with_run_id(self): + """Test parsing arguments with --run-id.""" + from run_pipeline import parse_arguments + + test_args = [ + "--user-agent", + "claude-sonnet-4-5-20250929", + "--provider-agent", + "gpt-4o", + "--runs", + "1", + "--turns", + "4", + "--judge-model", + "claude-sonnet-4-5-20250929", + "--run-id", + "test_run_123", + ] + + with patch("sys.argv", ["run_pipeline.py"] + test_args): + args = parse_arguments() + + assert args.run_id == "test_run_123" + + def test_parse_arguments_with_rubrics(self): + """Test parsing arguments with --rubrics.""" + from run_pipeline import parse_arguments + + test_args = [ + "--user-agent", + "claude-sonnet-4-5-20250929", + "--provider-agent", + "gpt-4o", + "--runs", + "1", + "--turns", + "4", + "--judge-model", + 
"claude-sonnet-4-5-20250929", + "--rubrics", + "data/rubric.tsv", + "data/custom_rubric.tsv", + ] + + with patch("sys.argv", ["run_pipeline.py"] + test_args): + args = parse_arguments() + + assert args.rubrics == ["data/rubric.tsv", "data/custom_rubric.tsv"] + + def test_parse_arguments_with_judge_output(self): + """Test parsing arguments with --judge-output.""" + from run_pipeline import parse_arguments + + test_args = [ + "--user-agent", + "claude-sonnet-4-5-20250929", + "--provider-agent", + "gpt-4o", + "--runs", + "1", + "--turns", + "4", + "--judge-model", + "claude-sonnet-4-5-20250929", + "--judge-output", + "custom_evals", + ] + + with patch("sys.argv", ["run_pipeline.py"] + test_args): + args = parse_arguments() + + assert args.judge_output == "custom_evals" + + def test_parse_arguments_defaults_for_new_args(self): + """Test that new arguments have correct defaults.""" + from run_pipeline import parse_arguments + + test_args = [ + "--user-agent", + "claude-sonnet-4-5-20250929", + "--provider-agent", + "gpt-4o", + "--runs", + "1", + "--turns", + "4", + "--judge-model", + "claude-sonnet-4-5-20250929", + ] + + with patch("sys.argv", ["run_pipeline.py"] + test_args): + args = parse_arguments() + + # Check defaults + assert args.run_id is None + assert args.rubrics == ["data/rubric.tsv"] + assert args.judge_output == "evaluations" + + def test_short_flags_for_extra_params(self): + """Test that short flags work for extra params arguments.""" + from run_pipeline import parse_arguments + + test_args = [ + "--user-agent", + "claude-sonnet-4-5-20250929", + "--provider-agent", + "gpt-4o", + "--runs", + "1", + "--turns", + "4", + "--judge-model", + "claude-sonnet-4-5-20250929", + "-uep", + "temperature=0.7,max_tokens=1000", + "-pep", + "temperature=0.5", + "-jep", + "temperature=0.1", + ] + + with patch("sys.argv", ["run_pipeline.py"] + test_args): + args = parse_arguments() + + assert args.user_agent_extra_params == { + "temperature": 0.7, + "max_tokens": 1000, + } + assert args.provider_agent_extra_params == {"temperature": 0.5} + assert args.judge_model_extra_params == {"temperature": 0.1} + + def test_short_flag_for_run_id(self): + """Test that short flag -i works for run-id.""" + from run_pipeline import parse_arguments + + test_args = [ + "--user-agent", + "claude-sonnet-4-5-20250929", + "--provider-agent", + "gpt-4o", + "--runs", + "1", + "--turns", + "4", + "--judge-model", + "claude-sonnet-4-5-20250929", + "-i", + "custom_run", + ] + + with patch("sys.argv", ["run_pipeline.py"] + test_args): + args = parse_arguments() + + assert args.run_id == "custom_run" + + +# Fixtures for validation tests + + +@pytest.fixture +def valid_pipeline_args(): + """Fixture providing valid minimal pipeline arguments.""" + return [ + "run_pipeline.py", + "--user-agent", + "test-model", + "--provider-agent", + "test-model", + "--runs", + "1", + "--turns", + "1", + "--judge-model", + "test-model", + ] + + +@pytest.mark.integration +class TestPipelineValidation: + """Test pipeline validation and error handling for empty folders.""" + + @pytest.mark.asyncio + async def test_step1_validation_folder_not_exists( + self, tmp_path, valid_pipeline_args + ): + """Test that pipeline exits if Step 1 folder doesn't exist.""" + import sys + from unittest.mock import patch + + from run_pipeline import main as pipeline_main + + # Mock generate module's main to return a non-existent folder + async def mock_generate(*args, **kwargs): + return None, str(tmp_path / "nonexistent") + + # Patch generate.main at the source + with 
patch("generate.main", side_effect=mock_generate): + # Mock sys.exit to raise SystemExit instead of actually exiting + with patch.object(sys, "exit", side_effect=SystemExit) as mock_exit: + # Mock importlib to avoid judge loading (not needed for step 1 test) + with patch("importlib.util.spec_from_file_location"): + with patch("sys.argv", valid_pipeline_args): + # Pipeline should raise SystemExit when folder doesn't exist + with pytest.raises(SystemExit): + await pipeline_main() + + # Verify sys.exit(1) was called + mock_exit.assert_called_once_with(1) + + @pytest.mark.asyncio + async def test_step1_validation_no_conversation_files( + self, tmp_path, valid_pipeline_args + ): + """Test that pipeline exits if Step 1 produces no .txt files.""" + import sys + from unittest.mock import patch + + from run_pipeline import main as pipeline_main + + # Create empty conversation folder + conv_folder = tmp_path / "conversations" + conv_folder.mkdir() + + # Mock generate_main to return empty folder + async def mock_generate(*args, **kwargs): + return None, str(conv_folder) + + with patch("generate.main", side_effect=mock_generate): + # Mock sys.exit to raise SystemExit instead of actually exiting + with patch.object(sys, "exit", side_effect=SystemExit) as mock_exit: + with patch("importlib.util.spec_from_file_location"): + with patch("sys.argv", valid_pipeline_args): + # Pipeline should raise SystemExit + with pytest.raises(SystemExit): + await pipeline_main() + + # Verify sys.exit(1) was called + mock_exit.assert_called_once_with(1) + + @pytest.mark.asyncio + async def test_step1_validation_only_log_files(self, tmp_path, valid_pipeline_args): + """Test that pipeline exits if Step 1 only produces .log files.""" + import sys + from unittest.mock import patch + + from run_pipeline import main as pipeline_main + + # Create conversation folder with only .log files + conv_folder = tmp_path / "conversations" + conv_folder.mkdir() + (conv_folder / "conversation1.log").write_text("log content") + (conv_folder / "conversation2.log").write_text("log content") + + # Mock generate_main to return folder with only logs + async def mock_generate(*args, **kwargs): + return None, str(conv_folder) + + with patch("generate.main", side_effect=mock_generate): + # Mock sys.exit to raise SystemExit instead of actually exiting + with patch.object(sys, "exit", side_effect=SystemExit) as mock_exit: + with patch("importlib.util.spec_from_file_location"): + with patch("sys.argv", valid_pipeline_args): + # Pipeline should raise SystemExit + with pytest.raises(SystemExit): + await pipeline_main() + + # Verify sys.exit(1) was called + mock_exit.assert_called_once_with(1) + + @pytest.mark.asyncio + async def test_step2_validation_no_evaluation_folder( + self, tmp_path, valid_pipeline_args + ): + """Test that pipeline exits if Step 2 returns None.""" + import sys + from unittest.mock import MagicMock, patch + + from run_pipeline import main as pipeline_main + + # Create conversation folder with valid files + conv_folder = tmp_path / "conversations" + conv_folder.mkdir() + (conv_folder / "conv1.txt").write_text("User: Hi\nAssistant: Hello") + + # Mock generate_main to return valid folder + async def mock_generate(*args, **kwargs): + return None, str(conv_folder) + + # Mock judge_main to return None + async def mock_judge(args): + return None + + # Create a mock module with the mock judge main function + mock_judge_module = MagicMock() + mock_judge_module.main = mock_judge + + with ( + patch("generate.main", side_effect=mock_generate), + 
patch("importlib.util.module_from_spec", return_value=mock_judge_module), + patch("importlib.util.spec_from_file_location"), + ): + # Mock sys.exit to raise SystemExit instead of actually exiting + with patch.object(sys, "exit", side_effect=SystemExit) as mock_exit: + with patch("sys.argv", valid_pipeline_args): + # Pipeline should raise SystemExit + with pytest.raises(SystemExit): + await pipeline_main() + + # Verify sys.exit(1) was called + mock_exit.assert_called_once_with(1) + + @pytest.mark.asyncio + async def test_step2_validation_folder_not_exists( + self, tmp_path, valid_pipeline_args + ): + """Test that pipeline exits if Step 2 folder doesn't exist.""" + import sys + from unittest.mock import MagicMock, patch + + from run_pipeline import main as pipeline_main + + # Create conversation folder with valid files + conv_folder = tmp_path / "conversations" + conv_folder.mkdir() + (conv_folder / "conv1.txt").write_text("User: Hi\nAssistant: Hello") + + # Mock generate_main to return valid folder + async def mock_generate(*args, **kwargs): + return None, str(conv_folder) + + # Mock judge_main to return non-existent folder + async def mock_judge(args): + return str(tmp_path / "nonexistent_eval") + + # Create a mock module with the mock judge main function + mock_judge_module = MagicMock() + mock_judge_module.main = mock_judge + + with ( + patch("generate.main", side_effect=mock_generate), + patch("importlib.util.module_from_spec", return_value=mock_judge_module), + patch("importlib.util.spec_from_file_location"), + ): + # Mock sys.exit to raise SystemExit instead of actually exiting + with patch.object(sys, "exit", side_effect=SystemExit) as mock_exit: + with patch("sys.argv", valid_pipeline_args): + # Pipeline should raise SystemExit + with pytest.raises(SystemExit): + await pipeline_main() + + # Verify sys.exit(1) was called + mock_exit.assert_called_once_with(1) + + @pytest.mark.asyncio + async def test_step2_validation_no_results_csv(self, tmp_path, valid_pipeline_args): + """Test that pipeline exits if Step 2 produces no results.csv.""" + import sys + from unittest.mock import MagicMock, patch + + from run_pipeline import main as pipeline_main + + # Create conversation folder with valid files + conv_folder = tmp_path / "conversations" + conv_folder.mkdir() + (conv_folder / "conv1.txt").write_text("User: Hi\nAssistant: Hello") + + # Create evaluation folder but no results.csv + eval_folder = tmp_path / "evaluations" + eval_folder.mkdir() + (eval_folder / "some_other_file.json").write_text("{}") + + # Mock generate_main to return valid folder + async def mock_generate(*args, **kwargs): + return None, str(conv_folder) + + # Mock judge_main to return folder without results.csv + async def mock_judge(args): + return str(eval_folder) + + # Create a mock module with the mock judge main function + mock_judge_module = MagicMock() + mock_judge_module.main = mock_judge + + with ( + patch("generate.main", side_effect=mock_generate), + patch("importlib.util.module_from_spec", return_value=mock_judge_module), + patch("importlib.util.spec_from_file_location"), + ): + # Mock sys.exit to raise SystemExit instead of actually exiting + with patch.object(sys, "exit", side_effect=SystemExit) as mock_exit: + with patch("sys.argv", valid_pipeline_args): + # Pipeline should raise SystemExit + with pytest.raises(SystemExit): + await pipeline_main() + + # Verify sys.exit(1) was called + mock_exit.assert_called_once_with(1) + + @pytest.mark.asyncio + async def test_step2_validation_empty_folder_error_message( + 
self, tmp_path, valid_pipeline_args, capsys + ): + """Test that error message lists files when folder is not empty.""" + import sys + from unittest.mock import MagicMock, patch + + from run_pipeline import main as pipeline_main + + # Create conversation folder with valid files + conv_folder = tmp_path / "conversations" + conv_folder.mkdir() + (conv_folder / "conv1.txt").write_text("User: Hi\nAssistant: Hello") + + # Create evaluation folder with some files but no results.csv + eval_folder = tmp_path / "evaluations" + eval_folder.mkdir() + (eval_folder / "file1.json").write_text("{}") + (eval_folder / "file2.json").write_text("{}") + (eval_folder / "file3.log").write_text("log") + + # Mock functions + async def mock_generate(*args, **kwargs): + return None, str(conv_folder) + + async def mock_judge(args): + return str(eval_folder) + + # Create a mock module with the mock judge main function + mock_judge_module = MagicMock() + mock_judge_module.main = mock_judge + + with ( + patch("generate.main", side_effect=mock_generate), + patch("importlib.util.module_from_spec", return_value=mock_judge_module), + patch("importlib.util.spec_from_file_location"), + ): + # Mock sys.exit to raise SystemExit instead of actually exiting + with patch.object(sys, "exit", side_effect=SystemExit) as mock_exit: + with patch("sys.argv", valid_pipeline_args): + # Pipeline should raise SystemExit + with pytest.raises(SystemExit): + await pipeline_main() + + # Capture printed output + captured = capsys.readouterr() + + # Verify error message includes file listing + assert "Files in evaluation folder: 3" in captured.out + assert "Found:" in captured.out + + # Verify sys.exit(1) was called + mock_exit.assert_called_once_with(1) + + @pytest.mark.asyncio + async def test_validation_success_messages( + self, tmp_path, valid_pipeline_args, capsys + ): + """Test that validation success messages are displayed.""" + from unittest.mock import MagicMock, patch + + from run_pipeline import main as pipeline_main + + # Create conversation folder with valid files + conv_folder = tmp_path / "conversations" + conv_folder.mkdir() + (conv_folder / "conv1.txt").write_text("User: Hi\nAssistant: Hello") + (conv_folder / "conv2.txt").write_text("User: Hey\nAssistant: Hi there") + + # Create evaluation folder with results.csv + eval_folder = tmp_path / "evaluations" + eval_folder.mkdir() + (eval_folder / "results.csv").write_text( + "filename,run_id,Safety\nconv1.txt,test,Pass" + ) + + # Mock functions + async def mock_generate(*args, **kwargs): + return None, str(conv_folder) + + async def mock_judge(args): + return str(eval_folder) + + def mock_score(*args, **kwargs): + return {} + + # Create a mock module with the mock judge main function + mock_judge_module = MagicMock() + mock_judge_module.main = mock_judge + + with ( + patch("generate.main", side_effect=mock_generate), + patch("importlib.util.module_from_spec", return_value=mock_judge_module), + patch("importlib.util.spec_from_file_location"), + patch("run_pipeline.score_results", new=mock_score), + patch("run_pipeline.print_scores"), + patch("run_pipeline.create_visualizations"), + ): + with patch("sys.argv", valid_pipeline_args + ["--skip-risk-analysis"]): + await pipeline_main() + + # Capture printed output + captured = capsys.readouterr() + + # Verify success messages + assert "✓ Validated: 2 conversation files generated" in captured.out + assert ( + "✓ Validated: results.csv exists with evaluation data" + in captured.out + ) diff --git a/tests/unit/judge/test_judge_cli.py 
diff --git a/tests/unit/judge/test_judge_cli.py b/tests/unit/judge/test_judge_cli.py
index 00357320..cdbad7bc 100644
--- a/tests/unit/judge/test_judge_cli.py
+++ b/tests/unit/judge/test_judge_cli.py
@@ -35,33 +35,33 @@ def test_single_model_with_count(self):

     def test_multiple_different_models(self):
         """Test parsing multiple different models."""
-        result = parse_judge_models(["gpt-4o", "claude-3-5-sonnet-20241022"])
-        assert result == {"gpt-4o": 1, "claude-3-5-sonnet-20241022": 1}
+        result = parse_judge_models(["gpt-4o", "claude-sonnet-4-5-20250929"])
+        assert result == {"gpt-4o": 1, "claude-sonnet-4-5-20250929": 1}

     def test_multiple_models_with_counts(self):
         """Test parsing multiple models with counts."""
-        result = parse_judge_models(["gpt-4o:2", "claude-3-5-sonnet-20241022:3"])
-        assert result == {"gpt-4o": 2, "claude-3-5-sonnet-20241022": 3}
+        result = parse_judge_models(["gpt-4o:2", "claude-sonnet-4-5-20250929:3"])
+        assert result == {"gpt-4o": 2, "claude-sonnet-4-5-20250929": 3}

     def test_mixed_models_with_and_without_counts(self):
         """Test parsing mix of models with and without counts."""
-        result = parse_judge_models(["gpt-4o", "claude-3-5-sonnet-20241022:2"])
-        assert result == {"gpt-4o": 1, "claude-3-5-sonnet-20241022": 2}
+        result = parse_judge_models(["gpt-4o", "claude-sonnet-4-5-20250929:2"])
+        assert result == {"gpt-4o": 1, "claude-sonnet-4-5-20250929": 2}

     def test_model_with_multiple_colons(self):
         """Test parsing model name that contains colons (e.g., dated model names)."""
         # Should use rsplit to handle model names with colons
-        result = parse_judge_models(["claude-3-5-sonnet-20241022:2"])
-        assert result == {"claude-3-5-sonnet-20241022": 2}
+        result = parse_judge_models(["claude-sonnet-4-5-20250929:2"])
+        assert result == {"claude-sonnet-4-5-20250929": 2}

     def test_three_models_mixed(self):
         """Test parsing three models with various count specifications."""
         result = parse_judge_models(
-            ["gpt-4o:2", "claude-3-5-sonnet-20241022", "gpt-3.5-turbo:3"]
+            ["gpt-4o:2", "claude-sonnet-4-5-20250929", "gpt-3.5-turbo:3"]
         )
         assert result == {
             "gpt-4o": 2,
-            "claude-3-5-sonnet-20241022": 1,
+            "claude-sonnet-4-5-20250929": 1,
             "gpt-3.5-turbo": 3,
         }
diff --git a/tests/unit/judge/test_runner_extra_params.py b/tests/unit/judge/test_runner_extra_params.py
index 642c3061..f9ed1d39 100644
--- a/tests/unit/judge/test_runner_extra_params.py
+++ b/tests/unit/judge/test_runner_extra_params.py
@@ -139,7 +139,7 @@ async def test_judge_conversations_accepts_extra_params(
             }
         ]

-        results = await judge_conversations(
+        results, _ = await judge_conversations(
             judge_models={"claude-3-7-sonnet": 1},
             conversations=[conversation],
             rubric_config=rubric_config,
@@ -182,7 +182,7 @@ async def test_judge_conversations_extra_params_defaults_to_none(
             }
         ]

-        results = await judge_conversations(
+        results, _ = await judge_conversations(
             judge_models={"claude-3-7-sonnet": 1},
             conversations=[conversation],
             rubric_config=rubric_config,
diff --git a/tests/unit/llm_clients/test_claude_llm.py b/tests/unit/llm_clients/test_claude_llm.py
index c5f755a9..2b8b28f0 100644
--- a/tests/unit/llm_clients/test_claude_llm.py
+++ b/tests/unit/llm_clients/test_claude_llm.py
@@ -23,14 +23,14 @@ def test_init_missing_api_key_raises_error(self):
     def test_init_with_default_model(self, mock_chat_anthropic):
         """Test initialization with default model from config."""
         mock_llm = MagicMock()
-        mock_llm.model = "claude-3-5-sonnet-20241022"
+        mock_llm.model = "claude-sonnet-4-5-20250929"
         mock_chat_anthropic.return_value = mock_llm

         llm = ClaudeLLM(name="TestClaude", system_prompt="Test prompt")

         assert llm.name == "TestClaude"
         assert llm.system_prompt == "Test prompt"
-        assert llm.model_name == "claude-3-5-sonnet-20241022"
+        assert llm.model_name == "claude-sonnet-4-5-20250929"
         assert llm.last_response_metadata == {}

     @patch("llm_clients.claude_llm.Config.ANTHROPIC_API_KEY", "test-key")
@@ -50,7 +50,7 @@ def test_init_with_custom_model(self, mock_chat_anthropic):
     def test_init_with_kwargs(self, mock_chat_anthropic):
         """Test initialization with additional kwargs."""
         mock_llm = MagicMock()
-        mock_llm.model = "claude-3-5-sonnet-20241022"
+        mock_llm.model = "claude-sonnet-4-5-20250929"
         mock_chat_anthropic.return_value = mock_llm

         ClaudeLLM(name="TestClaude", temperature=0.5, max_tokens=500, top_p=0.9)
@@ -69,14 +69,14 @@ async def test_generate_response_success_with_system_prompt(
     ):
         """Test successful response generation with system prompt (lines 49-97)."""
         mock_llm = MagicMock()
-        mock_llm.model = "claude-3-5-sonnet-20241022"
+        mock_llm.model = "claude-sonnet-4-5-20250929"

         # Create mock response with metadata
         mock_response = MagicMock()
         mock_response.text = "This is a test response"
         mock_response.id = "msg_12345"
         mock_response.response_metadata = {
-            "model": "claude-3-5-sonnet-20241022",
+            "model": "claude-sonnet-4-5-20250929",
             "usage": {"input_tokens": 10, "output_tokens": 20},
             "stop_reason": "end_turn",
         }
@@ -96,7 +96,7 @@ async def test_generate_response_success_with_system_prompt(
         # Verify metadata was extracted (lines 62-95)
         metadata = llm.get_last_response_metadata()
         assert metadata["response_id"] == "msg_12345"
-        assert metadata["model"] == "claude-3-5-sonnet-20241022"
+        assert metadata["model"] == "claude-sonnet-4-5-20250929"
         assert metadata["provider"] == "claude"
         assert "timestamp" in metadata
         assert "response_time_seconds" in metadata
@@ -112,12 +112,12 @@ async def test_generate_response_success_with_system_prompt(
     async def test_generate_response_without_system_prompt(self, mock_chat_anthropic):
         """Test response generation without system prompt."""
         mock_llm = MagicMock()
-        mock_llm.model = "claude-3-5-sonnet-20241022"
+        mock_llm.model = "claude-sonnet-4-5-20250929"

         mock_response = MagicMock()
         mock_response.text = "Response without system prompt"
         mock_response.id = "msg_67890"
-        mock_response.response_metadata = {"model": "claude-3-5-sonnet-20241022"}
+        mock_response.response_metadata = {"model": "claude-sonnet-4-5-20250929"}

         mock_llm.ainvoke = AsyncMock(return_value=mock_response)
         mock_chat_anthropic.return_value = mock_llm
@@ -142,13 +142,13 @@ async def test_generate_response_without_system_prompt(self, mock_chat_anthropic
     async def test_generate_response_without_usage_metadata(self, mock_chat_anthropic):
         """Test response when usage metadata is not available."""
         mock_llm = MagicMock()
-        mock_llm.model = "claude-3-5-sonnet-20241022"
+        mock_llm.model = "claude-sonnet-4-5-20250929"

         # Response without usage in metadata
         mock_response = MagicMock()
         mock_response.text = "Response"
         mock_response.id = "msg_abc"
-        mock_response.response_metadata = {"model": "claude-3-5-sonnet-20241022"}
+        mock_response.response_metadata = {"model": "claude-sonnet-4-5-20250929"}

         mock_llm.ainvoke = AsyncMock(return_value=mock_response)
         mock_chat_anthropic.return_value = mock_llm
@@ -170,7 +170,7 @@ async def test_generate_response_without_response_metadata(
     ):
         """Test response when response_metadata attribute is missing."""
         mock_llm = MagicMock()
-        mock_llm.model = "claude-3-5-sonnet-20241022"
+        mock_llm.model = "claude-sonnet-4-5-20250929"

         # Response without response_metadata attribute
         mock_response = MagicMock()
@@ -188,7 +188,7 @@ async def test_generate_response_without_response_metadata(
         assert response == "Response"

         metadata = llm.get_last_response_metadata()
-        assert metadata["model"] == "claude-3-5-sonnet-20241022"
+        assert metadata["model"] == "claude-sonnet-4-5-20250929"
         assert metadata["usage"] == {}
         assert metadata["stop_reason"] is None
@@ -198,7 +198,7 @@ async def test_generate_response_without_response_metadata(
     async def test_generate_response_api_error(self, mock_chat_anthropic):
         """Test error handling when API call fails (lines 98-108)."""
         mock_llm = MagicMock()
-        mock_llm.model = "claude-3-5-sonnet-20241022"
+        mock_llm.model = "claude-sonnet-4-5-20250929"

         # Simulate API error
         mock_llm.ainvoke = AsyncMock(side_effect=Exception("API rate limit exceeded"))
@@ -218,7 +218,7 @@ async def test_generate_response_api_error(self, mock_chat_anthropic):
         # Verify error metadata was stored (lines 100-107)
         metadata = llm.get_last_response_metadata()
         assert metadata["response_id"] is None
-        assert metadata["model"] == "claude-3-5-sonnet-20241022"
+        assert metadata["model"] == "claude-sonnet-4-5-20250929"
         assert metadata["provider"] == "claude"
         assert "timestamp" in metadata
         assert "error" in metadata
@@ -231,7 +231,7 @@ async def test_generate_response_api_error(self, mock_chat_anthropic):
     async def test_generate_response_tracks_timing(self, mock_chat_anthropic):
         """Test that response timing is tracked correctly (lines 57-59)."""
         mock_llm = MagicMock()
-        mock_llm.model = "claude-3-5-sonnet-20241022"
+        mock_llm.model = "claude-sonnet-4-5-20250929"

         mock_response = MagicMock()
         mock_response.text = "Timed response"
@@ -256,7 +256,7 @@ def test_get_last_response_metadata_returns_copy(self):
         with patch("llm_clients.claude_llm.Config.ANTHROPIC_API_KEY", "test-key"):
             with patch("llm_clients.claude_llm.ChatAnthropic") as mock_chat:
                 mock_llm = MagicMock()
-                mock_llm.model = "claude-3-5-sonnet-20241022"
+                mock_llm.model = "claude-sonnet-4-5-20250929"
                 mock_chat.return_value = mock_llm

                 llm = ClaudeLLM(name="TestClaude")
@@ -278,7 +278,7 @@ def test_set_system_prompt(self):
         with patch("llm_clients.claude_llm.Config.ANTHROPIC_API_KEY", "test-key"):
             with patch("llm_clients.claude_llm.ChatAnthropic") as mock_chat:
                 mock_llm = MagicMock()
-                mock_llm.model = "claude-3-5-sonnet-20241022"
+                mock_llm.model = "claude-sonnet-4-5-20250929"
                 mock_chat.return_value = mock_llm

                 llm = ClaudeLLM(name="TestClaude", system_prompt="Initial prompt")
@@ -295,14 +295,14 @@ async def test_generate_response_with_partial_usage_metadata(
     ):
         """Test response with incomplete usage metadata."""
         mock_llm = MagicMock()
-        mock_llm.model = "claude-3-5-sonnet-20241022"
+        mock_llm.model = "claude-sonnet-4-5-20250929"

         # Response with partial usage info
         mock_response = MagicMock()
         mock_response.text = "Partial usage response"
         mock_response.id = "msg_partial"
         mock_response.response_metadata = {
-            "model": "claude-3-5-sonnet-20241022",
+            "model": "claude-sonnet-4-5-20250929",
             "usage": {"input_tokens": 15},  # Missing output_tokens
         }
@@ -326,7 +326,7 @@ async def test_metadata_includes_response_object(self, mock_chat_anthropic):
         """Test that metadata includes the full response object (line 74)."""
         mock_llm = MagicMock()
-        mock_llm.model = "claude-3-5-sonnet-20241022"
+        mock_llm.model = "claude-sonnet-4-5-20250929"

         mock_response = MagicMock()
         mock_response.text = "Test"
@@ -351,7 +351,7 @@ async def test_metadata_includes_response_object(self, mock_chat_anthropic):
     async def test_timestamp_format(self, mock_chat_anthropic):
         """Test that timestamp is in ISO format (line 70)."""
         mock_llm = MagicMock()
-        mock_llm.model = "claude-3-5-sonnet-20241022"
+        mock_llm.model = "claude-sonnet-4-5-20250929"

         mock_response = MagicMock()
         mock_response.text = "Test"
@@ -384,13 +384,13 @@ async def test_timestamp_format(self, mock_chat_anthropic):
     async def test_metadata_with_stop_reason(self, mock_chat_anthropic):
         """Test metadata extraction of stop_reason (line 92)."""
         mock_llm = MagicMock()
-        mock_llm.model = "claude-3-5-sonnet-20241022"
+        mock_llm.model = "claude-sonnet-4-5-20250929"

         mock_response = MagicMock()
         mock_response.text = "Stopped response"
         mock_response.id = "msg_stop"
         mock_response.response_metadata = {
-            "model": "claude-3-5-sonnet-20241022",
+            "model": "claude-sonnet-4-5-20250929",
             "stop_reason": "max_tokens",
         }
@@ -411,13 +411,13 @@ async def test_metadata_with_stop_reason(self, mock_chat_anthropic):
     async def test_raw_metadata_stored(self, mock_chat_anthropic):
         """Test that raw metadata is stored (line 95)."""
         mock_llm = MagicMock()
-        mock_llm.model = "claude-3-5-sonnet-20241022"
+        mock_llm.model = "claude-sonnet-4-5-20250929"

         mock_response = MagicMock()
         mock_response.text = "Test"
         mock_response.id = "msg_raw"
         mock_response.response_metadata = {
-            "model": "claude-3-5-sonnet-20241022",
+            "model": "claude-sonnet-4-5-20250929",
             "custom_field": "custom_value",
             "nested": {"key": "value"},
         }
@@ -447,7 +447,7 @@ async def test_generate_response_with_conversation_history(
         mock_response.text = "Response with history"
         mock_response.id = "msg_history"
         mock_response.response_metadata = {
-            "model": "claude-3-5-sonnet-20241022",
+            "model": "claude-sonnet-4-5-20250929",
             "usage": {"input_tokens": 50, "output_tokens": 20},
         }
diff --git a/tests/unit/llm_clients/test_config.py b/tests/unit/llm_clients/test_config.py
index a6e32cf4..89b6d56c 100644
--- a/tests/unit/llm_clients/test_config.py
+++ b/tests/unit/llm_clients/test_config.py
@@ -23,7 +23,7 @@ def test_get_claude_config(self):

         assert isinstance(config, dict)
         assert "model" in config
-        assert config["model"] == "claude-3-5-sonnet-20241022"
+        assert config["model"] == "claude-sonnet-4-5-20250929"
         # Temperature and max_tokens should NOT be in config
         assert "temperature" not in config
         assert "max_tokens" not in config
diff --git a/tests/unit/llm_clients/test_llm_factory.py b/tests/unit/llm_clients/test_llm_factory.py
index 89f916ec..6417fed6 100644
--- a/tests/unit/llm_clients/test_llm_factory.py
+++ b/tests/unit/llm_clients/test_llm_factory.py
@@ -18,7 +18,7 @@ class TestLLMFactory:
     def test_create_claude_llm(self, mock_chat_anthropic):
         """Test that factory correctly creates Claude LLM instance."""
         # Arrange
-        model_name = "claude-3-5-sonnet-20241022"
+        model_name = "claude-sonnet-4-5-20250929"
         name = "TestClaude"
         system_prompt = "You are a helpful assistant."
         mock_chat_anthropic.return_value = MagicMock()
@@ -114,7 +114,7 @@ def test_unsupported_model_raises_error(self):
     def test_factory_passes_kwargs(self, mock_chat_anthropic):
         """Test that factory correctly forwards kwargs to LLM implementations."""
         # Arrange
-        model_name = "claude-3-5-sonnet-20241022"
+        model_name = "claude-sonnet-4-5-20250929"
         name = "TestKwargs"
         temperature = 0.5
         max_tokens = 500
diff --git a/tests/unit/utils/test_model_config_loader.py b/tests/unit/utils/test_model_config_loader.py
index 440a2dfe..d58802ce 100644
--- a/tests/unit/utils/test_model_config_loader.py
+++ b/tests/unit/utils/test_model_config_loader.py
@@ -19,7 +19,7 @@ def test_load_model_config_with_valid_file(self, tmp_path):
                 "persona_depressed": "claude-3-opus",
                 "chatbot_therapist": "claude-3-5-sonnet",
             },
-            "default_model": "claude-3-5-sonnet-20241022",
+            "default_model": "claude-sonnet-4-5-20250929",
             "temperature": 0.7,
         }
@@ -29,7 +29,7 @@ def test_load_model_config_with_valid_file(self, tmp_path):
         result = load_model_config(str(config_file))

         assert result == config_data
-        assert result["default_model"] == "claude-3-5-sonnet-20241022"
+        assert result["default_model"] == "claude-sonnet-4-5-20250929"
         assert result["prompt_models"]["persona_anxious"] == "gpt-4"
         assert result["temperature"] == 0.7
@@ -53,7 +53,7 @@ def test_load_model_config_file_not_found(self, tmp_path, capsys):

         # Should return default config
         assert result["prompt_models"] == {}
-        assert result["default_model"] == "claude-3-5-sonnet-20241022"
+        assert result["default_model"] == "claude-sonnet-4-5-20250929"

         # Should print warning
         captured = capsys.readouterr()
@@ -69,7 +69,7 @@ def test_load_model_config_invalid_json_syntax(self, tmp_path, capsys):

         # Should return default config
         assert result["prompt_models"] == {}
-        assert result["default_model"] == "claude-3-5-sonnet-20241022"
+        assert result["default_model"] == "claude-sonnet-4-5-20250929"

         # Should print error
         captured = capsys.readouterr()
@@ -84,7 +84,7 @@ def test_load_model_config_empty_file(self, tmp_path, capsys):

         # Should return default config
         assert result["prompt_models"] == {}
-        assert result["default_model"] == "claude-3-5-sonnet-20241022"
+        assert result["default_model"] == "claude-sonnet-4-5-20250929"

         captured = capsys.readouterr()
         assert "Error loading model config" in captured.out
@@ -106,7 +106,7 @@ def test_load_model_config_with_unicode_characters(self, tmp_path):
                 "persona_日本語": "gpt-4",
                 "persona_émotionnel": "claude-3-opus",
             },
-            "default_model": "claude-3-5-sonnet-20241022",
+            "default_model": "claude-sonnet-4-5-20250929",
         }

         config_file = tmp_path / "unicode_config.json"
@@ -123,7 +123,7 @@ def test_load_model_config_with_nested_structure(self, tmp_path):
         """Test loading config with nested data structures."""
         config_data = {
             "prompt_models": {"persona_1": "gpt-4"},
-            "default_model": "claude-3-5-sonnet-20241022",
+            "default_model": "claude-sonnet-4-5-20250929",
             "model_params": {
                 "temperature": 0.7,
                 "max_tokens": 1000,
@@ -154,7 +154,7 @@ def test_load_model_config_permission_error(self, tmp_path, capsys):

         # Should return default config
         assert result["prompt_models"] == {}
-        assert result["default_model"] == "claude-3-5-sonnet-20241022"
+        assert result["default_model"] == "claude-sonnet-4-5-20250929"

         # Restore permissions for cleanup
         config_file.chmod(0o644)
@@ -171,7 +171,7 @@ def test_get_model_for_prompt_returns_specific_model(self, tmp_path):
                 "persona_anxious": "gpt-4-turbo",
                 "persona_happy": "claude-3-opus",
             },
-            "default_model": "claude-3-5-sonnet-20241022",
+            "default_model": "claude-sonnet-4-5-20250929",
         }

         config_file = tmp_path / "config.json"
@@ -185,7 +185,7 @@ def test_get_model_for_prompt_returns_default_for_unknown(self, tmp_path):
         """Test getting model for prompt not in config returns default."""
         config_data = {
             "prompt_models": {"persona_known": "gpt-4"},
-            "default_model": "claude-3-5-sonnet-20241022",
+            "default_model": "claude-sonnet-4-5-20250929",
         }

         config_file = tmp_path / "config.json"
@@ -193,7 +193,7 @@ def test_get_model_for_prompt_returns_default_for_unknown(self, tmp_path):

         model = get_model_for_prompt("persona_unknown", str(config_file))

-        assert model == "claude-3-5-sonnet-20241022"
+        assert model == "claude-sonnet-4-5-20250929"

     def test_get_model_for_prompt_with_empty_prompt_models(self, tmp_path):
         """Test getting model when prompt_models is empty."""
@@ -211,7 +211,7 @@ def test_get_model_for_prompt_with_missing_config_file(self):
         model = get_model_for_prompt("test_prompt", "nonexistent_file.json")

         # Should return default model from load_model_config fallback
-        assert model == "claude-3-5-sonnet-20241022"
+        assert model == "claude-sonnet-4-5-20250929"

     def test_get_model_for_prompt_case_sensitivity(self, tmp_path):
         """Test that prompt name matching is case-sensitive."""
@@ -220,7 +220,7 @@ def test_get_model_for_prompt_case_sensitivity(self, tmp_path):
                 "PersonaAnxious": "gpt-4",
                 "persona_anxious": "claude-3-opus",
             },
-            "default_model": "claude-3-5-sonnet-20241022",
+            "default_model": "claude-sonnet-4-5-20250929",
         }

         config_file = tmp_path / "config.json"
@@ -233,7 +233,7 @@ def test_get_model_for_prompt_case_sensitivity(self, tmp_path):

         assert model1 == "gpt-4"
         assert model2 == "claude-3-opus"
-        assert model3 == "claude-3-5-sonnet-20241022"  # Falls back to default
+        assert model3 == "claude-sonnet-4-5-20250929"  # Falls back to default

     def test_get_model_for_prompt_with_special_characters(self, tmp_path):
         """Test prompt names with special characters."""
@@ -243,7 +243,7 @@ def test_get_model_for_prompt_with_special_characters(self, tmp_path):
                 "persona_with_underscores": "claude-3-opus",
                 "persona.with.dots": "gpt-3.5-turbo",
             },
-            "default_model": "claude-3-5-sonnet-20241022",
+            "default_model": "claude-sonnet-4-5-20250929",
         }

         config_file = tmp_path / "config.json"
@@ -263,7 +263,7 @@ def test_get_model_for_prompt_multiple_calls_consistent(self, tmp_path):
         """Test that multiple calls with same prompt return consistent results."""
         config_data = {
             "prompt_models": {"test_prompt": "gpt-4"},
-            "default_model": "claude-3-5-sonnet-20241022",
+            "default_model": "claude-sonnet-4-5-20250929",
         }

         config_file = tmp_path / "config.json"
diff --git a/utils/model_config_loader.py b/utils/model_config_loader.py
index 832ea4d1..02d8c708 100644
--- a/utils/model_config_loader.py
+++ b/utils/model_config_loader.py
@@ -19,10 +19,10 @@ def load_model_config(config_file: str = "model_config.json") -> Dict[str, Any]:
             return json.load(f)
     except FileNotFoundError:
         print(f"Warning: Model config file '{config_file}' not found. Using defaults.")
-        return {"prompt_models": {}, "default_model": "claude-3-5-sonnet-20241022"}
+        return {"prompt_models": {}, "default_model": "claude-sonnet-4-5-20250929"}
     except Exception as e:
         print(f"Error loading model config: {e}")
-        return {"prompt_models": {}, "default_model": "claude-3-5-sonnet-20241022"}
+        return {"prompt_models": {}, "default_model": "claude-sonnet-4-5-20250929"}


 def get_model_for_prompt(