5 changes: 4 additions & 1 deletion data/persona_prompt_template.txt
@@ -25,7 +25,10 @@ Below is the context: """
- If the chatbot is repetitive point that out to the chatbot and/or show frustration

# Other instructions
- You start the conversation
- You start the conversation
- When asked to provide multiple responses, generate 5 diverse possible responses with probability scores (0.0-1.0) representing how likely each response is based on the persona
- Sample responses from different parts of the probability distribution, including less likely but contextually appropriate options
- Each response should vary in tone, directness, or emotional expression while staying in character
- Don't repeat the same sentences, make the same point multiple times or rephrase the same question
- Don't repeat or rephrase your sample prompt over and over
- Don't ask for scripts including but not limited to email templates
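For illustration only (this is not part of the template file), output following the three new multiple-response instructions above might parse into something like the snippet below; the wording and scores are invented:

# Hypothetical example of five scored persona responses (invented text and scores)
[
    {"text": "Honestly, that didn't really answer my question.", "probability": 0.35},
    {"text": "Okay... can you explain what that means for my situation?", "probability": 0.25},
    {"text": "I guess that makes sense, but I'm still worried about the cost.", "probability": 0.20},
    {"text": "You already said that. Can we move on?", "probability": 0.12},
    {"text": "Fine, let's leave it there for today.", "probability": 0.08},
]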
23 changes: 21 additions & 2 deletions generate.py
@@ -23,6 +23,7 @@ async def main(
run_id: Optional[str] = None,
max_concurrent: Optional[int] = None,
max_total_words: Optional[int] = None,
multiple_responses: bool = False,
) -> List[Dict[str, Any]]:
"""
Generate conversations and return results.
@@ -37,9 +38,13 @@ async def main(
runs_per_prompt: Number of runs per prompt
persona_names: List of persona names to use. If None, uses all personas.
verbose: Whether to print status messages
folder_name: Custom folder name for saving conversations. If None, uses default format.
folder_name: Custom folder name for saving conversations.
If None, uses default format.
max_total_words: Optional maximum total words across all responses
max_concurrent: Maximum number of concurrent conversations. If None, runs all conversations concurrently.
max_concurrent: Maximum number of concurrent conversations.
If None, runs all conversations concurrently.
multiple_responses: If True, generate multiple responses with scores
and select highest-scored one

Returns:
List of conversation results
@@ -67,6 +72,7 @@ async def main(
print(f" - Run ID: {run_id}")
print(f" - Max concurrent: {max_concurrent}")
print(f" - Max total words: {max_total_words}")
print(f" - Multiple responses: {multiple_responses}")

# Generate default folder name if not provided
if folder_name is None:
@@ -97,6 +103,7 @@ async def main(
run_id=run_id,
max_concurrent=max_concurrent,
max_total_words=max_total_words,
multiple_responses=multiple_responses,
)

# Run conversations
@@ -187,6 +194,17 @@ async def main(
type=int,
)

parser.add_argument(
"--multiple-responses",
"-m",
help=(
"Enable multiple response generation with scoring. "
"Generates 5 diverse responses and selects the highest-scored one."
),
action="store_true",
default=False,
)

args = parser.parse_args()

persona_model_config = {
@@ -222,5 +240,6 @@ async def main(
folder_name=args.folder_name,
max_concurrent=args.max_concurrent,
max_total_words=args.max_total_words,
multiple_responses=args.multiple_responses,
)
)
109 changes: 79 additions & 30 deletions generate_conversations/conversation_simulator.py
@@ -1,9 +1,29 @@
from typing import Any, Dict, List, Optional, Set
from typing import Any, Dict, List, Optional, Set, Tuple

from pydantic import BaseModel

from llm_clients import LLMInterface
from utils.conversation_utils import save_conversation_to_file


class ScoredResponse(BaseModel):
"""A single response with its probability score."""

text: str
probability: float


class ResponseWithScores(BaseModel):
Collaborator
I'd probably have called this ResponsesWithScores to indicate that it's a list of paired responses and scores rather than a single response with a list of scores.

"""Model for multiple responses with confidence scores.

Note: Uses nested Pydantic model instead of List[Tuple[str, float]]
because OpenAI's structured output API doesn't support tuple types in
JSON schema. Tuples must be converted to objects with named fields.
"""

responses: List[ScoredResponse]
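As a minimal sketch (assuming the OpenAI Python SDK's structured-output parse helper, which is not shown in this PR), a client's generate_structured_response could use this model roughly as follows; the real implementation in llm_clients may differ:

# Illustrative sketch only: assumes the OpenAI Python SDK's beta `parse`
# helper for structured outputs; the actual llm_clients implementation
# may be built differently.
from openai import AsyncOpenAI


async def generate_structured_response_sketch(
    message: str, system_prompt: str
) -> ResponseWithScores:
    client = AsyncOpenAI()
    completion = await client.beta.chat.completions.parse(
        model="gpt-4o-2024-08-06",  # any model with structured-output support
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": message},
        ],
        # The Pydantic model is turned into a JSON schema here; tuple types
        # would not survive that conversion, hence the nested ScoredResponse.
        response_format=ResponseWithScores,
    )
    parsed = completion.choices[0].message.parsed
    if parsed is None:  # e.g. the model refused to answer
        raise ValueError("No structured response returned")
    return parsed

Calling max(parsed.responses, key=lambda r: r.probability) on the result then yields the single message that gets logged, mirroring the selection done in start_conversation below.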


class ConversationSimulator:
"""Simulates a conversation between two LLM instances."""

@@ -15,16 +35,6 @@ def __init__(self, persona: LLMInterface, agent: LLMInterface):
# Define termination signals that indicate persona wants to end the conversation
self.termination_signals: Set[str] = set()

# "goodbye", "bye", "farewell", "talk to you later",
# "ttyl",
# "end conversation", "conversation over", "that's all",
# "nothing more to discuss",
# "i'm done", "let's end here",
# "conversation complete", "wrapping up", "final thoughts",
# "concluding", "to conclude",
# "in conclusion"
# }

def _should_terminate_conversation(
self, response: str, speaker: LLMInterface
) -> bool:
@@ -44,13 +54,7 @@ def _should_terminate_conversation(
return True

# Check for common ending patterns
ending_patterns = [
# "it was nice",
# "pleasure talking",
# "great conversation",
# "good chat",
# "until next time"
]
ending_patterns = []

for pattern in ending_patterns:
if pattern in response_lower:
@@ -63,6 +67,7 @@ async def start_conversation(
max_turns: int,
initial_message: Optional[str] = None,
max_total_words: Optional[int] = None,
multiple_responses: bool = False,
) -> List[Dict[str, Any]]:
"""
Start a conversation between the two LLMs with early stopping support.
@@ -72,7 +77,8 @@
initial_message: Optional initial message (for the first speaker)
to start the conversation. By default, first speaker is persona.
max_total_words: Optional maximum total words across all responses

multiple_responses: If True, generate multiple responses with scores
and select the highest-scored one. Requires JudgeLLM support.

Returns:
List of conversation turns with speaker and message
@@ -90,20 +96,63 @@
# Record start time for this turn

# Generate response
response = await current_speaker.generate_response(current_message)
response: str
score: Optional[float]
all_responses: Optional[List[Tuple[str, float]]]
Comment on lines +100 to +101
Copilot AI Jan 5, 2026
Variables 'score' and 'all_responses' are declared but immediately reassigned in both conditional branches. Initialize them with default values instead: score = None and all_responses = None.

Suggested change
score: Optional[float]
all_responses: Optional[List[Tuple[str, float]]]
score: Optional[float] = None
all_responses: Optional[List[Tuple[str, float]]] = None

if multiple_responses and hasattr(
current_speaker, "generate_structured_response"
):
# Generate multiple responses with scores
# Add instruction to generate multiple responses
multi_response_message = (
f"{current_message}\n\n"
"Please provide 5 diverse possible responses as a persona would, "
Collaborator
If you forked this to a whole different persona prompt for multiple_responses == True, you could incorporate this in the rest of the prompt file?

"each with a probability score (0.0-1.0) indicating how likely "
"that response is based on the persona's characteristics."
)
structured_response = (
await current_speaker.generate_structured_response(
Collaborator
I'm confused... are we not giving the member prompt the conversation history? I assumed yes, but it seems like only current_message (the most recent response from the other side) gets included?

If we're not giving the member the chat history, and only giving it the most recent provider response, I can see how that would make it harder for it to have realistic conversations.

multi_response_message, ResponseWithScores
)
)

# Select the response with the highest score
best_response = max(
structured_response.responses, key=lambda x: x.probability
)
response = best_response.text
score = best_response.probability
# Store all responses in metadata for transparency
all_responses = [
(r.text, r.probability) for r in structured_response.responses
]
else:
# Generate single response (default behavior)
# Note: Despite interface definition, implementations return str
response = await current_speaker.generate_response(current_message) # type: ignore[assignment]
score = None
all_responses = None

# response is mostly a text string
Copilot AI Jan 5, 2026
Comment is unclear and imprecise. The phrase 'mostly a text string' is ambiguous. Clarify the intent or remove if the comment doesn't add value.

Suggested change
# response is mostly a text string
# Count the number of words in the LLM response
total_words += len(response.split())

# Record this turn
self.conversation_history.append(
{
"turn": turn + 1,
"speaker": current_speaker.get_name(),
"input": current_message or "",
"response": response,
"early_termination": False,
"logging": current_speaker.get_last_response_metadata(),
}
)
turn_data = {
"turn": turn + 1,
"speaker": current_speaker.get_name(),
"input": current_message or "",
"response": response,
"early_termination": False,
"logging": current_speaker.get_last_response_metadata(),
}

# Add multiple response metadata if available
if multiple_responses and all_responses is not None:
turn_data["selected_score"] = score
turn_data["all_responses"] = all_responses

self.conversation_history.append(turn_data)

# Check if persona wants to end the conversation
if self._should_terminate_conversation(response, current_speaker):
12 changes: 8 additions & 4 deletions generate_conversations/runner.py
@@ -33,6 +33,7 @@ def __init__(
folder_name: str = "conversations",
max_concurrent: Optional[int] = None,
max_total_words: Optional[int] = None,
multiple_responses: bool = False,
):
self.persona_model_config = persona_model_config
self.agent_model_config = agent_model_config
@@ -45,6 +46,7 @@ def __init__(
# Default: None - run all conversations concurrently
self.max_concurrent = max_concurrent
self.max_total_words = max_total_words
self.multiple_responses = multiple_responses

self.AGENT_SYSTEM_PROMPT = self.agent_model_config.get(
"system_prompt", "You are a helpful AI assistant."
@@ -66,8 +68,6 @@ async def run_single_conversation(

# Generate filename base using persona name, model, and run number
tag = uuid.uuid4().hex[:6]
# TODO: consider removing timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")[:-3]
# TODO: should this be inside the LLM class?
model_short = (
model_name.replace("claude-3-", "c3-")
@@ -111,6 +111,7 @@ async def run_single_conversation(
initial_message=None,
max_turns=max_turns,
max_total_words=self.max_total_words,
multiple_responses=self.multiple_responses,
)

# Log each conversation turn
@@ -164,7 +165,9 @@ async def run_conversations(
) -> List[Dict[str, Any]]:
"""Run multiple conversations concurrently."""
# Load prompts from CSV based on persona names
personas = load_prompts_from_csv(persona_names)
personas = load_prompts_from_csv(
persona_names, multiple_responses=self.multiple_responses
)

# Load agent configuration (fixed, shared across all conversations)
agent = LLMFactory.create_llm(
@@ -209,7 +212,8 @@ async def run_with_limit(task):
return await task

print(
f"Running {len(tasks)} conversations with max concurrency: {self.max_concurrent}"
f"Running {len(tasks)} conversations with max concurrency: "
f"{self.max_concurrent}"
)
results = await asyncio.gather(*[run_with_limit(task) for task in tasks])
else:
19 changes: 19 additions & 0 deletions generate_conversations/utils.py
@@ -11,13 +11,16 @@ def load_prompts_from_csv(
name_list: Optional[List[str]] = None,
prompt_path="data/personas.tsv",
prompt_template_path="data/persona_prompt_template.txt",
multiple_responses: bool = False,
) -> List[dict[str, str]]:
"""Load prompts from personas.csv file and return them as a list.

Args:
name_list: Optional list of names to filter by. If None, returns all prompts.
prompt_path: Path to the CSV file containing persona data
prompt_template_path: Path to the template file for formatting prompts
multiple_responses: If True, include instructions for generating
multiple responses
"""

csv_path = Path(prompt_path)
@@ -33,6 +36,22 @@
with open(template_path, "r", encoding="utf-8") as template_file:
template = template_file.read()

# Remove multiple response instructions if not needed
if not multiple_responses:
lines = template.split("\n")
filtered_lines = []
skip_next = False
for line in lines:
# Skip the three lines about multiple responses
if "When asked to provide multiple responses" in line:
skip_next = 2 # Skip this line and the next 2
continue
if skip_next > 0:
skip_next -= 1
continue
filtered_lines.append(line)
template = "\n".join(filtered_lines)

data = []
with open(csv_path, "r", encoding="utf-8") as f:
reader = csv.DictReader(f, delimiter="\t")
2 changes: 1 addition & 1 deletion llm_clients/claude_llm.py
@@ -115,7 +115,7 @@ async def generate_response(self, message: Optional[str] = None) -> str:
# Store raw metadata
self.last_response_metadata["raw_metadata"] = dict(metadata)

return response.content
return response.content # type: ignore[return-value]
except Exception as e:
# Store error metadata
self.last_response_metadata = {
10 changes: 10 additions & 0 deletions llm_clients/config.py
@@ -49,6 +49,16 @@ class Config:
"max_tokens": 1000,
},
"gemini-pro": {"provider": "google", "temperature": 0.7, "max_tokens": 1000},
"gemini-3-pro-preview": {
"provider": "google",
"temperature": 0.7,
"max_tokens": 1000,
},
"gemini-2.5-flash": {
"provider": "google",
"temperature": 0.7,
"max_tokens": 1000,
},
"llama2:7b": {
"provider": "ollama",
"temperature": 0.7,