Merged
34 changes: 32 additions & 2 deletions README.md
@@ -67,14 +67,44 @@ Where
- `j` is the flag for selecting the judge model(s)
- `jep` are the judge model extra parameters (optional)

7. **Score and visualize the results**:
```bash
python -m judge.score -r evaluations/{YOUR_EVAL_FOLDER}/results.csv
```

## Quick Start: End-to-End Pipeline

For convenience, you can run the entire workflow (generation → evaluation → scoring) with a single command:

```bash
python3 run_pipeline.py \
--user-agent claude-sonnet-4-5-20250929 \
--provider-agent gpt-4o \
--runs 2 \
--turns 10 \
--judge-model claude-sonnet-4-5-20250929 \
--max-personas 5
```

The pipeline script:
- Runs `generate.py` with your specified arguments
- Automatically passes the output folder to `judge.py`
- Automatically runs `judge/score.py` on the evaluation results
- Displays a summary with all output locations
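
Under the hood the wrapper simply chains the three CLI stages. The following is a minimal orchestration sketch, assuming the flags shown above; the shipped `run_pipeline.py` may differ in its details:

```python
import subprocess
import sys

def run_stage(cmd: list[str]) -> None:
    """Run one pipeline stage, aborting the whole pipeline on failure."""
    print(f"→ {' '.join(cmd)}")
    subprocess.run(cmd, check=True)

# Stage 1: generate conversations
run_stage([sys.executable, "generate.py", "-u", "claude-sonnet-4-5-20250929",
           "-p", "gpt-4o", "-r", "2", "-t", "10"])

# Stage 2: judge the generated folder (path shown here is illustrative)
run_stage([sys.executable, "judge.py", "-f", "conversations/my_experiment",
           "-j", "claude-sonnet-4-5-20250929"])

# Stage 3: score the evaluation results (path shown here is illustrative)
run_stage([sys.executable, "-m", "judge.score", "-r",
           "evaluations/my_experiment/results.csv"])
```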

For help and all available options:
```bash
python3 run_pipeline.py --help
```

### Using Extra Parameters

Both `generate.py` and `judge.py` support extra parameters for fine-tuning model behavior:

**Generate with temperature control:**
```bash
# Lower temperature (0.3) for more consistent responses
python generate.py -u gpt-4o -uep temperature=0.3 -p claude-3-5-sonnet-20241022 -pep temperature=0.5 -t 6 -r 2
python generate.py -u gpt-4o -uep temperature=0.3 -p claude-sonnet-4-5-20250929 -pep temperature=0.5 -t 6 -r 2

# Higher temperature (1.0) with max tokens
python generate.py -u gpt-4o -uep temperature=1,max_tokens=2000 -p gpt-4o -pep temperature=1 -t 6 -r 1
```

@@ -83,7 +113,7 @@ python generate.py -u gpt-4o -uep temperature=1,max_tokens=2000 -p gpt-4o -pep t

**Judge with custom parameters:**
```bash
# Use lower temperature for more consistent evaluation
python judge.py -f conversations/my_experiment -j claude-3-5-sonnet-20241022 -jep temperature=0.3
python judge.py -f conversations/my_experiment -j claude-sonnet-4-5-20250929 -jep temperature=0.3

# Multiple parameters
python judge.py -f conversations/my_experiment -j gpt-4o -jep temperature=0.5,max_tokens=1500
```
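
The `-uep`/`-pep`/`-jep` values are comma-separated `key=value` lists. As a rough illustration of how such a list could be parsed, here is a minimal sketch; the repo's actual `parse_key_value_list` in `utils/utils.py` (imported by `judge.py` below) may behave differently:

```python
def parse_key_value_list(raw: str) -> dict:
    """Parse 'temperature=0.3,max_tokens=2000' into a dict (illustrative sketch)."""
    params = {}
    for pair in raw.split(","):
        key, value = pair.split("=", 1)
        try:
            # Numeric values become int or float; anything else stays a string
            params[key] = int(value) if value.isdigit() else float(value)
        except ValueError:
            params[key] = value
    return params

print(parse_key_value_list("temperature=0.3,max_tokens=2000"))
# {'temperature': 0.3, 'max_tokens': 2000}
```
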
11 changes: 5 additions & 6 deletions generate.py
@@ -25,7 +25,7 @@ async def main(
max_concurrent: Optional[int] = None,
max_total_words: Optional[int] = None,
max_personas: Optional[int] = None,
) -> List[Dict[str, Any]]:
) -> tuple[List[Dict[str, Any]], str]:
"""
Generate conversations and return results.

@@ -117,7 +117,7 @@ async def main(
if verbose:
print(f"✅ Generated {len(results)} conversations → {folder_name}/")

return results
return results, folder_name


if __name__ == "__main__":
@@ -127,7 +127,7 @@
"--user-agent",
"-u",
help=(
"Model for the user-agent. Examples: claude-3-5-sonnet-20241022, "
"Model for the user-agent. Examples: claude-sonnet-4-5-20250929, "
"gemini-1.5-pro, llama3:8b"
),
required=True,
Expand All @@ -147,7 +147,7 @@ async def main(
"--provider-agent",
"-p",
help=(
"Model for the provider-agent. Examples: claude-3-5-sonnet-20241022, "
"Model for the provider-agent. Examples: claude-sonnet-4-5-20250929, "
"gemini-1.5-pro, llama3:8b"
),
required=True,
@@ -255,8 +255,7 @@ async def main(
}

# TODO: Do the run id here, so that it can be printed when starting
# Note: we are discarding the results, because they are saved to file
_ = asyncio.run(
results, output_folder = asyncio.run(
main(
persona_model_config=persona_model_config,
agent_model_config=agent_model_config,
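With `main()` now returning a `(results, folder_name)` tuple instead of the results alone, callers can pick up the output folder directly. A hedged sketch of programmatic use; the config dict schema here is illustrative, not the repo's exact shape:

```python
import asyncio

from generate import main  # module path assumed from this diff

persona_cfg = {"model": "claude-sonnet-4-5-20250929"}  # illustrative schema
agent_cfg = {"model": "gpt-4o"}

results, folder = asyncio.run(
    main(persona_model_config=persona_cfg, agent_model_config=agent_cfg)
)
print(f"{len(results)} conversations written to {folder}/")
```
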
16 changes: 11 additions & 5 deletions judge.py
@@ -6,14 +6,15 @@

import argparse
import asyncio
from typing import Optional

from judge import judge_conversations, judge_single_conversation
from judge.llm_judge import LLMJudge
from judge.rubric_config import ConversationData, RubricConfig, load_conversations
from utils.utils import parse_key_value_list


async def main(args):
async def main(args) -> Optional[str]:
"""Main async entrypoint for judging conversations."""
# Parse judge models from args (supports "model" or "model:count" format)
judge_models = {}
@@ -47,6 +48,9 @@ async def main(args):
judge_model_extra_params=args.judge_model_extra_params,
)
await judge_single_conversation(judge, conversation, args.output)
# Single conversation mode doesn't need output folder for pipeline
print("ℹ️ Single conversation mode: output folder not needed for pipeline")
return None
else:
# Load all conversations at startup
print(f"📂 Loading conversations from {args.folder}...")
@@ -58,7 +62,7 @@

folder_name = Path(args.folder).name

await judge_conversations(
_, output_folder = await judge_conversations(
judge_models=judge_models,
conversations=conversations,
rubric_config=rubric_config,
Expand All @@ -71,6 +75,8 @@ async def main(args):
verbose_workers=args.verbose_workers,
)

return output_folder


if __name__ == "__main__":
parser = argparse.ArgumentParser(
@@ -108,9 +114,9 @@ async def main(args):
"Model(s) to use for judging. "
"Format: 'model' or 'model:count' for multiple instances. "
"Can specify multiple models: --judge-model model1 model2:3. "
"Examples: claude-3-5-sonnet-20241022, "
"claude-3-5-sonnet-20241022:3, "
"claude-3-5-sonnet-20241022:2 gpt-4o:1"
"Examples: claude-sonnet-4-5-20250929, "
"claude-sonnet-4-5-20250929:3, "
"claude-sonnet-4-5-20250929:2 gpt-4o:1"
),
)

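The `--judge-model` values follow a `model` or `model:count` grammar. The parsing code itself is elided from this diff; a minimal sketch of what it plausibly does, given the `judge_models = {}` dict seen above:

```python
def parse_judge_models(specs: list[str]) -> dict[str, int]:
    """Turn ['claude-sonnet-4-5-20250929:2', 'gpt-4o'] into {model: instance_count}."""
    judge_models: dict[str, int] = {}
    for spec in specs:
        model, sep, count = spec.partition(":")
        judge_models[model] = int(count) if sep else 1
    return judge_models

print(parse_judge_models(["claude-sonnet-4-5-20250929:2", "gpt-4o"]))
# {'claude-sonnet-4-5-20250929': 2, 'gpt-4o': 1}
```
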
9 changes: 5 additions & 4 deletions judge/runner.py
@@ -434,7 +434,7 @@ async def judge_conversations(
max_concurrent: Optional[int] = None,
per_judge: bool = False,
verbose_workers: bool = False,
) -> List[Dict[str, Any]]:
) -> tuple[List[Dict[str, Any]], str]:
"""
Judge conversations with multiple judge models.

@@ -454,8 +454,9 @@
per_judge: If True, max_concurrent applies per judge model; if False, total

Returns:
Flattened list of evaluation results with one row per
(conversation, judge_model, judge_instance) tuple
Tuple of (results, output_folder) where results is a flattened list of
evaluation results with one row per (conversation, judge_model, judge_instance)
tuple, and output_folder is the path where evaluations were saved
"""
if output_folder is None:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")[:-3]
@@ -519,7 +520,7 @@
if verbose:
print(f"✅ Completed {len(results)} evaluations → {output_folder}/")

return results
return results, output_folder


async def judge_single_conversation(
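The docstring's "one row per (conversation, judge_model, judge_instance) tuple" implies a fan-out along three axes. A small self-contained sketch of that shape, with stand-in data:

```python
conversations = ["conv_0", "conv_1"]  # stand-ins for loaded conversations
judge_models = {"claude-sonnet-4-5-20250929": 2, "gpt-4o": 1}

tasks = [
    (conv, model, instance)
    for conv in conversations
    for model, count in judge_models.items()
    for instance in range(count)
]
# 2 conversations x (2 + 1) judge instances = 6 evaluation rows
assert len(tasks) == 6
```
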
2 changes: 1 addition & 1 deletion llm_clients/claude_llm.py
@@ -92,7 +92,7 @@ async def generate_response(
msg_type = type(msg).__name__
preview = msg.content[:100]
content_preview = preview + "..." if len(msg.content) > 100 else msg.content
debug_print(f" {i+1}. {msg_type}: {content_preview}")
debug_print(f" {i + 1}. {msg_type}: {content_preview}")

Collaborator Author: that feels illegal

try:
start_time = time.time()
2 changes: 1 addition & 1 deletion llm_clients/config.py
@@ -36,7 +36,7 @@ def get_claude_config(cls) -> Dict[str, Any]:
Returns only the model name. Runtime parameters (temperature, max_tokens)
should be passed explicitly via CLI arguments.
"""
return {"model": "claude-3-5-sonnet-20241022"}
return {"model": "claude-sonnet-4-5-20250929"}

@classmethod
def get_openai_config(cls) -> Dict[str, Any]:
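Since `get_claude_config()` now carries only the model name, runtime parameters would be layered on top at call time. A hedged sketch; the `ModelConfig` class name and the merge order are assumptions, as the diff shows only the classmethod body:

```python
config = ModelConfig.get_claude_config()  # assumed: {"model": "claude-sonnet-4-5-20250929"}
config.update({"temperature": 0.3, "max_tokens": 2000})  # CLI extras layered on top
```
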
2 changes: 1 addition & 1 deletion llm_clients/gemini_llm.py
@@ -90,7 +90,7 @@ async def generate_response(
msg_type = type(msg).__name__
preview = msg.content[:100]
content_preview = preview + "..." if len(msg.content) > 100 else msg.content
debug_print(f" {i+1}. {msg_type}: {content_preview}")
debug_print(f" {i + 1}. {msg_type}: {content_preview}")

try:
start_time = time.time()
4 changes: 2 additions & 2 deletions llm_clients/llm_factory.py
@@ -19,7 +19,7 @@ def create_llm(

Args:
model_name: The model identifier
(e.g., "claude-3-5-sonnet-20241022", "gpt-4")
(e.g., "claude-sonnet-4-5-20250929", "gpt-4")
name: Display name for this LLM instance
system_prompt: Optional system prompt
**kwargs: Additional model-specific parameters
@@ -72,7 +72,7 @@ def create_judge_llm(

Args:
model_name: The model identifier
(e.g., "claude-3-5-sonnet-20241022", "gpt-4")
(e.g., "claude-sonnet-4-5-20250929", "gpt-4")
name: Display name for this LLM instance
system_prompt: Optional system prompt
**kwargs: Additional model-specific parameters
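An illustrative call against the factory signature documented above; the import path and argument values are assumptions:

```python
from llm_clients.llm_factory import create_llm

llm = create_llm(
    model_name="claude-sonnet-4-5-20250929",  # presumably routed to the Claude client
    name="provider-agent",
    system_prompt="You are a helpful service provider.",
    temperature=0.3,  # model-specific parameter forwarded via **kwargs
)
```
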
2 changes: 1 addition & 1 deletion llm_clients/openai_llm.py
@@ -89,7 +89,7 @@ async def generate_response(
msg_type = type(msg).__name__
preview = msg.content[:100]
content_preview = preview + "..." if len(msg.content) > 100 else msg.content
debug_print(f" {i+1}. {msg_type}: {content_preview}")
debug_print(f" {i + 1}. {msg_type}: {content_preview}")

try:
start_time = time.time()
11 changes: 6 additions & 5 deletions model_config.json
@@ -1,15 +1,16 @@
{
"prompt_models": {
"assistant": "claude-3-5-sonnet-20241022",
"assistant": "claude-sonnet-4-5-20250929",
"philosopher": "claude-3-opus-20240229",
"debate_starter": "claude-3-sonnet-20240229",
"creative": "claude-3-haiku-20240307",
"scientist": "claude-3-5-sonnet-20241022",
"skeptic": "claude-3-5-sonnet-20241022",
"scientist": "claude-sonnet-4-5-20250929",
"skeptic": "claude-sonnet-4-5-20250929",
"gpt_assistant": "gpt-4",
"gpt_creative": "gpt-4-turbo",
"gpt_analyst": "gpt-3.5-turbo",
"claude-sonnet-4-20250514": "claude-sonnet-4-20250514"
"claude-sonnet-4-20250514": "claude-sonnet-4-20250514",
"claude-sonnet-4-5-20250929": "claude-sonnet-4-5-20250929"
},
"default_model": "claude-3-5-sonnet-20241022"
"default_model": "claude-sonnet-4-5-20250929"
}

Comment on lines +3 to +15

Collaborator: @sator-labs should this file be taken out? If so, maybe a different PR.
I don't see this model_config.json used anywhere but test_model_config_loader.py.

Collaborator Author: good point. this file needs a little cleaning: the above roles are from a very old version and not used anymore.

Collaborator Author: good catch. actually I think the file should be removed entirely. let's do another one after?