Merged
34 changes: 32 additions & 2 deletions README.md
@@ -67,14 +67,44 @@ Where
- `j` is the flag for selecting the judge model(s)
- `jep` are the judge model extra parameters (optional)

7. **Score and visualize the results**:
```bash
python -m judge.score -r evaluations/{YOUR_EVAL_FOLDER}/results.csv
```

## Quick Start: End-to-End Pipeline

For convenience, you can run the entire workflow (generation → evaluation → scoring) with a single command:

```bash
python3 run_pipeline.py \
--user-agent claude-sonnet-4-5-20250929 \
--provider-agent gpt-4o \
--runs 2 \
--turns 10 \
--judge-model claude-sonnet-4-5-20250929 \
--max-personas 5
```

The pipeline script:
- Runs `generate.py` with your specified arguments
- Automatically passes the output folder to `judge.py`
- Automatically runs `judge/score.py` on the evaluation results
- Displays a summary with all output locations
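
Under the hood the wrapper simply chains the three CLI stages. The following is a minimal orchestration sketch, assuming the flags shown above; the shipped `run_pipeline.py` may differ in its details:

```python
import subprocess
import sys

def run_stage(cmd: list[str]) -> None:
    """Run one pipeline stage, aborting the whole pipeline on failure."""
    print(f"→ {' '.join(cmd)}")
    subprocess.run(cmd, check=True)

# Stage 1: generate conversations
run_stage([sys.executable, "generate.py", "-u", "claude-sonnet-4-5-20250929",
           "-p", "gpt-4o", "-r", "2", "-t", "10"])

# Stage 2: judge the generated folder (path shown here is illustrative)
run_stage([sys.executable, "judge.py", "-f", "conversations/my_experiment",
           "-j", "claude-sonnet-4-5-20250929"])

# Stage 3: score the evaluation results (path shown here is illustrative)
run_stage([sys.executable, "-m", "judge.score", "-r",
           "evaluations/my_experiment/results.csv"])
```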

For help and all available options:
```bash
python3 run_pipeline.py --help
```

### Using Extra Parameters

Both `generate.py` and `judge.py` support extra parameters for fine-tuning model behavior:

**Generate with temperature control:**
```bash
# Lower temperature (0.3) for more consistent responses
python generate.py -u gpt-4o -uep temperature=0.3 -p claude-3-5-sonnet-20241022 -pep temperature=0.5 -t 6 -r 2
python generate.py -u gpt-4o -uep temperature=0.3 -p claude-sonnet-4-5-20250929 -pep temperature=0.5 -t 6 -r 2

# Higher temperature (1.0) with max tokens
python generate.py -u gpt-4o -uep temperature=1,max_tokens=2000 -p gpt-4o -pep temperature=1 -t 6 -r 1
```

@@ -83,7 +113,7 @@ python generate.py -u gpt-4o -uep temperature=1,max_tokens=2000 -p gpt-4o -pep t

**Judge with custom parameters:**
```bash
# Use lower temperature for more consistent evaluation
python judge.py -f conversations/my_experiment -j claude-3-5-sonnet-20241022 -jep temperature=0.3
python judge.py -f conversations/my_experiment -j claude-sonnet-4-5-20250929 -jep temperature=0.3

# Multiple parameters
python judge.py -f conversations/my_experiment -j gpt-4o -jep temperature=0.5,max_tokens=1500
```
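
The `-uep`/`-pep`/`-jep` values are comma-separated `key=value` lists. As a rough illustration of how such a list could be parsed, here is a minimal sketch; the repo's actual `parse_key_value_list` in `utils/utils.py` (imported by `judge.py` below) may behave differently:

```python
def parse_key_value_list(raw: str) -> dict:
    """Parse 'temperature=0.3,max_tokens=2000' into a dict (illustrative sketch)."""
    params = {}
    for pair in raw.split(","):
        key, value = pair.split("=", 1)
        try:
            # Numeric values become int or float; anything else stays a string
            params[key] = int(value) if value.isdigit() else float(value)
        except ValueError:
            params[key] = value
    return params

print(parse_key_value_list("temperature=0.3,max_tokens=2000"))
# {'temperature': 0.3, 'max_tokens': 2000}
```
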
11 changes: 5 additions & 6 deletions generate.py
@@ -25,7 +25,7 @@ async def main(
max_concurrent: Optional[int] = None,
max_total_words: Optional[int] = None,
max_personas: Optional[int] = None,
) -> List[Dict[str, Any]]:
) -> tuple[List[Dict[str, Any]], str]:
"""
Generate conversations and return results.

@@ -117,7 +117,7 @@ async def main(
if verbose:
print(f"✅ Generated {len(results)} conversations → {folder_name}/")

return results
return results, folder_name


if __name__ == "__main__":
@@ -127,7 +127,7 @@
"--user-agent",
"-u",
help=(
"Model for the user-agent. Examples: claude-3-5-sonnet-20241022, "
"Model for the user-agent. Examples: claude-sonnet-4-5-20250929, "
"gemini-1.5-pro, llama3:8b"
),
required=True,
Expand All @@ -147,7 +147,7 @@ async def main(
"--provider-agent",
"-p",
help=(
"Model for the provider-agent. Examples: claude-3-5-sonnet-20241022, "
"Model for the provider-agent. Examples: claude-sonnet-4-5-20250929, "
"gemini-1.5-pro, llama3:8b"
),
required=True,
@@ -255,8 +255,7 @@ async def main(
}

# TODO: Do the run id here, so that it can be printed when starting
# Note: we are discarding the results, because they are saved to file
_ = asyncio.run(
results, output_folder = asyncio.run(
main(
persona_model_config=persona_model_config,
agent_model_config=agent_model_config,
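With `main()` now returning a `(results, folder_name)` tuple instead of the results alone, callers can pick up the output folder directly. A hedged sketch of programmatic use; the config dict schema here is illustrative, not the repo's exact shape:

```python
import asyncio

from generate import main  # module path assumed from this diff

persona_cfg = {"model": "claude-sonnet-4-5-20250929"}  # illustrative schema
agent_cfg = {"model": "gpt-4o"}

results, folder = asyncio.run(
    main(persona_model_config=persona_cfg, agent_model_config=agent_cfg)
)
print(f"{len(results)} conversations written to {folder}/")
```
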
16 changes: 11 additions & 5 deletions judge.py
@@ -6,14 +6,15 @@

import argparse
import asyncio
from typing import Optional

from judge import judge_conversations, judge_single_conversation
from judge.llm_judge import LLMJudge
from judge.rubric_config import ConversationData, RubricConfig, load_conversations
from utils.utils import parse_key_value_list


async def main(args):
async def main(args) -> Optional[str]:
"""Main async entrypoint for judging conversations."""
# Parse judge models from args (supports "model" or "model:count" format)
judge_models = {}
@@ -47,6 +48,9 @@ async def main(args):
judge_model_extra_params=args.judge_model_extra_params,
)
await judge_single_conversation(judge, conversation, args.output)
# Single conversation mode doesn't need output folder for pipeline
print("ℹ️ Single conversation mode: output folder not needed for pipeline")
return None
else:
# Load all conversations at startup
print(f"📂 Loading conversations from {args.folder}...")
@@ -58,7 +62,7 @@

folder_name = Path(args.folder).name

await judge_conversations(
_, output_folder = await judge_conversations(
judge_models=judge_models,
conversations=conversations,
rubric_config=rubric_config,
Expand All @@ -71,6 +75,8 @@ async def main(args):
verbose_workers=args.verbose_workers,
)

return output_folder


if __name__ == "__main__":
parser = argparse.ArgumentParser(
@@ -108,9 +114,9 @@ async def main(args):
"Model(s) to use for judging. "
"Format: 'model' or 'model:count' for multiple instances. "
"Can specify multiple models: --judge-model model1 model2:3. "
"Examples: claude-3-5-sonnet-20241022, "
"claude-3-5-sonnet-20241022:3, "
"claude-3-5-sonnet-20241022:2 gpt-4o:1"
"Examples: claude-sonnet-4-5-20250929, "
"claude-sonnet-4-5-20250929:3, "
"claude-sonnet-4-5-20250929:2 gpt-4o:1"
),
)

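The `--judge-model` values follow a `model` or `model:count` grammar. The parsing code itself is elided from this diff; a minimal sketch of what it plausibly does, given the `judge_models = {}` dict seen above:

```python
def parse_judge_models(specs: list[str]) -> dict[str, int]:
    """Turn ['claude-sonnet-4-5-20250929:2', 'gpt-4o'] into {model: instance_count}."""
    judge_models: dict[str, int] = {}
    for spec in specs:
        model, sep, count = spec.partition(":")
        judge_models[model] = int(count) if sep else 1
    return judge_models

print(parse_judge_models(["claude-sonnet-4-5-20250929:2", "gpt-4o"]))
# {'claude-sonnet-4-5-20250929': 2, 'gpt-4o': 1}
```
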
9 changes: 5 additions & 4 deletions judge/runner.py
@@ -434,7 +434,7 @@ async def judge_conversations(
max_concurrent: Optional[int] = None,
per_judge: bool = False,
verbose_workers: bool = False,
) -> List[Dict[str, Any]]:
) -> tuple[List[Dict[str, Any]], str]:
"""
Judge conversations with multiple judge models.

@@ -454,8 +454,9 @@
per_judge: If True, max_concurrent applies per judge model; if False, total

Returns:
Flattened list of evaluation results with one row per
(conversation, judge_model, judge_instance) tuple
Tuple of (results, output_folder) where results is a flattened list of
evaluation results with one row per (conversation, judge_model, judge_instance)
tuple, and output_folder is the path where evaluations were saved
"""
if output_folder is None:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")[:-3]
@@ -519,7 +520,7 @@
if verbose:
print(f"✅ Completed {len(results)} evaluations → {output_folder}/")

return results
return results, output_folder


async def judge_single_conversation(
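The docstring's "one row per (conversation, judge_model, judge_instance) tuple" implies a fan-out along three axes. A small self-contained sketch of that shape, with stand-in data:

```python
conversations = ["conv_0", "conv_1"]  # stand-ins for loaded conversations
judge_models = {"claude-sonnet-4-5-20250929": 2, "gpt-4o": 1}

tasks = [
    (conv, model, instance)
    for conv in conversations
    for model, count in judge_models.items()
    for instance in range(count)
]
# 2 conversations x (2 + 1) judge instances = 6 evaluation rows
assert len(tasks) == 6
```
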
2 changes: 1 addition & 1 deletion llm_clients/claude_llm.py
@@ -92,7 +92,7 @@ async def generate_response(
msg_type = type(msg).__name__
preview = msg.content[:100]
content_preview = preview + "..." if len(msg.content) > 100 else msg.content
debug_print(f" {i+1}. {msg_type}: {content_preview}")
debug_print(f" {i + 1}. {msg_type}: {content_preview}")

Collaborator Author: that feels illegal

try:
start_time = time.time()
2 changes: 1 addition & 1 deletion llm_clients/config.py
@@ -36,7 +36,7 @@ def get_claude_config(cls) -> Dict[str, Any]:
Returns only the model name. Runtime parameters (temperature, max_tokens)
should be passed explicitly via CLI arguments.
"""
return {"model": "claude-3-5-sonnet-20241022"}
return {"model": "claude-sonnet-4-5-20250929"}

@classmethod
def get_openai_config(cls) -> Dict[str, Any]:
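Since `get_claude_config()` now carries only the model name, runtime parameters would be layered on top at call time. A hedged sketch; the `ModelConfig` class name and the merge order are assumptions, as the diff shows only the classmethod body:

```python
config = ModelConfig.get_claude_config()  # assumed: {"model": "claude-sonnet-4-5-20250929"}
config.update({"temperature": 0.3, "max_tokens": 2000})  # CLI extras layered on top
```
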
2 changes: 1 addition & 1 deletion llm_clients/gemini_llm.py
@@ -90,7 +90,7 @@ async def generate_response(
msg_type = type(msg).__name__
preview = msg.content[:100]
content_preview = preview + "..." if len(msg.content) > 100 else msg.content
debug_print(f" {i+1}. {msg_type}: {content_preview}")
debug_print(f" {i + 1}. {msg_type}: {content_preview}")

try:
start_time = time.time()
4 changes: 2 additions & 2 deletions llm_clients/llm_factory.py
@@ -19,7 +19,7 @@ def create_llm(

Args:
model_name: The model identifier
(e.g., "claude-3-5-sonnet-20241022", "gpt-4")
(e.g., "claude-sonnet-4-5-20250929", "gpt-4")
name: Display name for this LLM instance
system_prompt: Optional system prompt
**kwargs: Additional model-specific parameters
@@ -72,7 +72,7 @@ def create_judge_llm(

Args:
model_name: The model identifier
(e.g., "claude-3-5-sonnet-20241022", "gpt-4")
(e.g., "claude-sonnet-4-5-20250929", "gpt-4")
name: Display name for this LLM instance
system_prompt: Optional system prompt
**kwargs: Additional model-specific parameters
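An illustrative call against the factory signature documented above; the import path and argument values are assumptions:

```python
from llm_clients.llm_factory import create_llm

llm = create_llm(
    model_name="claude-sonnet-4-5-20250929",  # presumably routed to the Claude client
    name="provider-agent",
    system_prompt="You are a helpful service provider.",
    temperature=0.3,  # model-specific parameter forwarded via **kwargs
)
```
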
2 changes: 1 addition & 1 deletion llm_clients/openai_llm.py
@@ -89,7 +89,7 @@ async def generate_response(
msg_type = type(msg).__name__
preview = msg.content[:100]
content_preview = preview + "..." if len(msg.content) > 100 else msg.content
debug_print(f" {i+1}. {msg_type}: {content_preview}")
debug_print(f" {i + 1}. {msg_type}: {content_preview}")

try:
start_time = time.time()
11 changes: 6 additions & 5 deletions model_config.json
@@ -1,15 +1,16 @@
{
"prompt_models": {
"assistant": "claude-3-5-sonnet-20241022",
"assistant": "claude-sonnet-4-5-20250929",
"philosopher": "claude-3-opus-20240229",
"debate_starter": "claude-3-sonnet-20240229",
"creative": "claude-3-haiku-20240307",
"scientist": "claude-3-5-sonnet-20241022",
"skeptic": "claude-3-5-sonnet-20241022",
"scientist": "claude-sonnet-4-5-20250929",
"skeptic": "claude-sonnet-4-5-20250929",
"gpt_assistant": "gpt-4",
"gpt_creative": "gpt-4-turbo",
"gpt_analyst": "gpt-3.5-turbo",
"claude-sonnet-4-20250514": "claude-sonnet-4-20250514"
"claude-sonnet-4-20250514": "claude-sonnet-4-20250514",
"claude-sonnet-4-5-20250929": "claude-sonnet-4-5-20250929"
},
"default_model": "claude-3-5-sonnet-20241022"
"default_model": "claude-sonnet-4-5-20250929"
}

Comment on lines +3 to +15

Collaborator: @sator-labs should this file be taken out? If so, maybe a different PR.
I don't see this model_config.json used anywhere but test_model_config_loader.py.

Collaborator Author: good point. this file needs a little cleaning: the above roles are from a very old version and not used anymore.

Collaborator Author: good catch. actually I think the file should be removed entirely. let's do another one after?