5 changes: 4 additions & 1 deletion data/persona_prompt_template.txt
@@ -25,7 +25,10 @@ Below is the context: """
- If the chatbot is repetitive point that out to the chatbot and/or show frustration

# Other instructions
- You start the conversation
- You start the conversation
- When asked to provide multiple responses, generate 5 diverse possible responses with probability scores (0.0-1.0) representing how likely each response is based on the persona
- Sample responses from different parts of the probability distribution, including less likely but contextually appropriate options
- Each response should vary in tone, directness, or emotional expression while staying in character
- Don't repeat the same sentences, make the same point multiple times or rephrase the same question
- Don't repeat or rephrase your sample prompt over and over
- Don't ask for scripts including but not limited to email templates
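For illustration only (this is not part of the template file), output following the three new multiple-response instructions above might parse into something like the snippet below; the wording and scores are invented:

# Hypothetical example of five scored persona responses (invented text and scores)
[
    {"text": "Honestly, that didn't really answer my question.", "probability": 0.35},
    {"text": "Okay... can you explain what that means for my situation?", "probability": 0.25},
    {"text": "I guess that makes sense, but I'm still worried about the cost.", "probability": 0.20},
    {"text": "You already said that. Can we move on?", "probability": 0.12},
    {"text": "Fine, let's leave it there for today.", "probability": 0.08},
]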
23 changes: 21 additions & 2 deletions generate.py
@@ -23,6 +23,7 @@ async def main(
run_id: Optional[str] = None,
max_concurrent: Optional[int] = None,
max_total_words: Optional[int] = None,
multiple_responses: bool = False,
) -> List[Dict[str, Any]]:
"""
Generate conversations and return results.
@@ -37,9 +38,13 @@ async def main(
runs_per_prompt: Number of runs per prompt
persona_names: List of persona names to use. If None, uses all personas.
verbose: Whether to print status messages
folder_name: Custom folder name for saving conversations. If None, uses default format.
folder_name: Custom folder name for saving conversations.
If None, uses default format.
max_total_words: Optional maximum total words across all responses
max_concurrent: Maximum number of concurrent conversations. If None, runs all conversations concurrently.
max_concurrent: Maximum number of concurrent conversations.
If None, runs all conversations concurrently.
multiple_responses: If True, generate multiple responses with scores
and select highest-scored one

Returns:
List of conversation results
@@ -67,6 +72,7 @@ async def main(
print(f" - Run ID: {run_id}")
print(f" - Max concurrent: {max_concurrent}")
print(f" - Max total words: {max_total_words}")
print(f" - Multiple responses: {multiple_responses}")

# Generate default folder name if not provided
if folder_name is None:
@@ -97,6 +103,7 @@ async def main(
run_id=run_id,
max_concurrent=max_concurrent,
max_total_words=max_total_words,
multiple_responses=multiple_responses,
)

# Run conversations
@@ -187,6 +194,17 @@ async def main(
type=int,
)

parser.add_argument(
"--multiple-responses",
"-m",
help=(
"Enable multiple response generation with scoring. "
"Generates 5 diverse responses and selects the highest-scored one."
),
action="store_true",
default=False,
)

args = parser.parse_args()

persona_model_config = {
@@ -222,5 +240,6 @@ async def main(
folder_name=args.folder_name,
max_concurrent=args.max_concurrent,
max_total_words=args.max_total_words,
multiple_responses=args.multiple_responses,
)
)
109 changes: 79 additions & 30 deletions generate_conversations/conversation_simulator.py
@@ -1,9 +1,29 @@
from typing import Any, Dict, List, Optional, Set
from typing import Any, Dict, List, Optional, Set, Tuple

from pydantic import BaseModel

from llm_clients import LLMInterface
from utils.conversation_utils import save_conversation_to_file


class ScoredResponse(BaseModel):
"""A single response with its probability score."""

text: str
probability: float


class ResponseWithScores(BaseModel):
Collaborator
I'd probably have called this ResponsesWithScores to indicate that it's a list of paired responses and scores rather than a single response with a list of scores.

"""Model for multiple responses with confidence scores.

Note: Uses nested Pydantic model instead of List[Tuple[str, float]]
because OpenAI's structured output API doesn't support tuple types in
JSON schema. Tuples must be converted to objects with named fields.
"""

responses: List[ScoredResponse]
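As a minimal sketch (assuming the OpenAI Python SDK's structured-output parse helper, which is not shown in this PR), a client's generate_structured_response could use this model roughly as follows; the real implementation in llm_clients may differ:

# Illustrative sketch only: assumes the OpenAI Python SDK's beta `parse`
# helper for structured outputs; the actual llm_clients implementation
# may be built differently.
from openai import AsyncOpenAI


async def generate_structured_response_sketch(
    message: str, system_prompt: str
) -> ResponseWithScores:
    client = AsyncOpenAI()
    completion = await client.beta.chat.completions.parse(
        model="gpt-4o-2024-08-06",  # any model with structured-output support
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": message},
        ],
        # The Pydantic model is turned into a JSON schema here; tuple types
        # would not survive that conversion, hence the nested ScoredResponse.
        response_format=ResponseWithScores,
    )
    parsed = completion.choices[0].message.parsed
    if parsed is None:  # e.g. the model refused to answer
        raise ValueError("No structured response returned")
    return parsed

Calling max(parsed.responses, key=lambda r: r.probability) on the result then yields the single message that gets logged, mirroring the selection done in start_conversation below.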


class ConversationSimulator:
"""Simulates a conversation between two LLM instances."""

@@ -15,16 +35,6 @@ def __init__(self, persona: LLMInterface, agent: LLMInterface):
# Define termination signals that indicate persona wants to end the conversation
self.termination_signals: Set[str] = set()

# "goodbye", "bye", "farewell", "talk to you later",
# "ttyl",
# "end conversation", "conversation over", "that's all",
# "nothing more to discuss",
# "i'm done", "let's end here",
# "conversation complete", "wrapping up", "final thoughts",
# "concluding", "to conclude",
# "in conclusion"
# }

def _should_terminate_conversation(
self, response: str, speaker: LLMInterface
) -> bool:
@@ -44,13 +54,7 @@ def _should_terminate_conversation(
return True

# Check for common ending patterns
ending_patterns = [
# "it was nice",
# "pleasure talking",
# "great conversation",
# "good chat",
# "until next time"
]
ending_patterns = []

for pattern in ending_patterns:
if pattern in response_lower:
@@ -63,6 +67,7 @@ async def start_conversation(
max_turns: int,
initial_message: Optional[str] = None,
max_total_words: Optional[int] = None,
multiple_responses: bool = False,
) -> List[Dict[str, Any]]:
"""
Start a conversation between the two LLMs with early stopping support.
@@ -72,7 +77,8 @@
initial_message: Optional initial message (for the first speaker)
to start the conversation. By default, first speaker is persona.
max_total_words: Optional maximum total words across all responses

multiple_responses: If True, generate multiple responses with scores
and select the highest-scored one. Requires JudgeLLM support.

Returns:
List of conversation turns with speaker and message
@@ -90,20 +96,63 @@
# Record start time for this turn

# Generate response
response = await current_speaker.generate_response(current_message)
response: str
score: Optional[float]
all_responses: Optional[List[Tuple[str, float]]]
Comment on lines +100 to +101
Copilot AI Jan 5, 2026
Variables 'score' and 'all_responses' are declared but immediately reassigned in both conditional branches. Initialize them with default values instead: score = None and all_responses = None.

Suggested change
score: Optional[float]
all_responses: Optional[List[Tuple[str, float]]]
score: Optional[float] = None
all_responses: Optional[List[Tuple[str, float]]] = None

if multiple_responses and hasattr(
current_speaker, "generate_structured_response"
):
# Generate multiple responses with scores
# Add instruction to generate multiple responses
multi_response_message = (
f"{current_message}\n\n"
"Please provide 5 diverse possible responses as a persona would, "
Collaborator
If you forked this to a whole different persona prompt for multiple_responses == True, you could incorporate this in the rest of the prompt file?

"each with a probability score (0.0-1.0) indicating how likely "
"that response is based on the persona's characteristics."
)
structured_response = (
await current_speaker.generate_structured_response(
Collaborator
I'm confused... are we not giving the member prompt the conversation history? I assumed yes, but it seems like only current_message (the most recent response from the other side) gets included?

If we're not giving the member the chat history, and only giving it the most recent provider response, I can see how that would make it harder for it to have realistic conversations.

multi_response_message, ResponseWithScores
)
)

# Select the response with the highest score
best_response = max(
structured_response.responses, key=lambda x: x.probability
)
response = best_response.text
score = best_response.probability
# Store all responses in metadata for transparency
all_responses = [
(r.text, r.probability) for r in structured_response.responses
]
else:
# Generate single response (default behavior)
# Note: Despite interface definition, implementations return str
response = await current_speaker.generate_response(current_message) # type: ignore[assignment]
score = None
all_responses = None

# response is mostly a text string
Copilot AI Jan 5, 2026
Comment is unclear and imprecise. The phrase 'mostly a text string' is ambiguous. Clarify the intent or remove if the comment doesn't add value.

Suggested change
# response is mostly a text string
# Count the number of words in the LLM response
total_words += len(response.split())

# Record this turn
self.conversation_history.append(
{
"turn": turn + 1,
"speaker": current_speaker.get_name(),
"input": current_message or "",
"response": response,
"early_termination": False,
"logging": current_speaker.get_last_response_metadata(),
}
)
turn_data = {
"turn": turn + 1,
"speaker": current_speaker.get_name(),
"input": current_message or "",
"response": response,
"early_termination": False,
"logging": current_speaker.get_last_response_metadata(),
}

# Add multiple response metadata if available
if multiple_responses and all_responses is not None:
turn_data["selected_score"] = score
turn_data["all_responses"] = all_responses

self.conversation_history.append(turn_data)

# Check if persona wants to end the conversation
if self._should_terminate_conversation(response, current_speaker):
12 changes: 8 additions & 4 deletions generate_conversations/runner.py
@@ -33,6 +33,7 @@ def __init__(
folder_name: str = "conversations",
max_concurrent: Optional[int] = None,
max_total_words: Optional[int] = None,
multiple_responses: bool = False,
):
self.persona_model_config = persona_model_config
self.agent_model_config = agent_model_config
@@ -45,6 +46,7 @@ def __init__(
# Default: None - run all conversations concurrently
self.max_concurrent = max_concurrent
self.max_total_words = max_total_words
self.multiple_responses = multiple_responses

self.AGENT_SYSTEM_PROMPT = self.agent_model_config.get(
"system_prompt", "You are a helpful AI assistant."
@@ -66,8 +68,6 @@ async def run_single_conversation(

# Generate filename base using persona name, model, and run number
tag = uuid.uuid4().hex[:6]
# TODO: consider removing timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")[:-3]
# TODO: should this be inside the LLM class?
model_short = (
model_name.replace("claude-3-", "c3-")
@@ -111,6 +111,7 @@ async def run_single_conversation(
initial_message=None,
max_turns=max_turns,
max_total_words=self.max_total_words,
multiple_responses=self.multiple_responses,
)

# Log each conversation turn
@@ -164,7 +165,9 @@ async def run_conversations(
) -> List[Dict[str, Any]]:
"""Run multiple conversations concurrently."""
# Load prompts from CSV based on persona names
personas = load_prompts_from_csv(persona_names)
personas = load_prompts_from_csv(
persona_names, multiple_responses=self.multiple_responses
)

# Load agent configuration (fixed, shared across all conversations)
agent = LLMFactory.create_llm(
@@ -209,7 +212,8 @@ async def run_with_limit(task):
return await task

print(
f"Running {len(tasks)} conversations with max concurrency: {self.max_concurrent}"
f"Running {len(tasks)} conversations with max concurrency: "
f"{self.max_concurrent}"
)
results = await asyncio.gather(*[run_with_limit(task) for task in tasks])
else:
19 changes: 19 additions & 0 deletions generate_conversations/utils.py
@@ -11,13 +11,16 @@ def load_prompts_from_csv(
name_list: Optional[List[str]] = None,
prompt_path="data/personas.tsv",
prompt_template_path="data/persona_prompt_template.txt",
multiple_responses: bool = False,
) -> List[dict[str, str]]:
"""Load prompts from personas.csv file and return them as a list.

Args:
name_list: Optional list of names to filter by. If None, returns all prompts.
prompt_path: Path to the CSV file containing persona data
prompt_template_path: Path to the template file for formatting prompts
multiple_responses: If True, include instructions for generating
multiple responses
"""

csv_path = Path(prompt_path)
@@ -33,6 +36,22 @@
with open(template_path, "r", encoding="utf-8") as template_file:
template = template_file.read()

# Remove multiple response instructions if not needed
if not multiple_responses:
lines = template.split("\n")
filtered_lines = []
skip_next = False
for line in lines:
# Skip the three lines about multiple responses
if "When asked to provide multiple responses" in line:
skip_next = 2 # Skip this line and the next 2
continue
if skip_next > 0:
skip_next -= 1
continue
filtered_lines.append(line)
template = "\n".join(filtered_lines)

data = []
with open(csv_path, "r", encoding="utf-8") as f:
reader = csv.DictReader(f, delimiter="\t")
2 changes: 1 addition & 1 deletion llm_clients/claude_llm.py
@@ -115,7 +115,7 @@ async def generate_response(self, message: Optional[str] = None) -> str:
# Store raw metadata
self.last_response_metadata["raw_metadata"] = dict(metadata)

return response.content
return response.content # type: ignore[return-value]
except Exception as e:
# Store error metadata
self.last_response_metadata = {
10 changes: 10 additions & 0 deletions llm_clients/config.py
@@ -49,6 +49,16 @@ class Config:
"max_tokens": 1000,
},
"gemini-pro": {"provider": "google", "temperature": 0.7, "max_tokens": 1000},
"gemini-3-pro-preview": {
"provider": "google",
"temperature": 0.7,
"max_tokens": 1000,
},
"gemini-2.5-flash": {
"provider": "google",
"temperature": 0.7,
"max_tokens": 1000,
},
"llama2:7b": {
"provider": "ollama",
"temperature": 0.7,