18 changes: 12 additions & 6 deletions docs/evaluating.md
@@ -186,12 +186,9 @@ def set_system_prompt(self, system_prompt: str) -> None:
self.system_prompt = system_prompt
```

#### `get_last_response_metadata()` - Get response metadata (optional but recommended)
```python
def get_last_response_metadata(self) -> Dict[str, Any]:
"""Get metadata from the last response."""
return self.last_response_metadata.copy()
```
#### `last_response_metadata` - Response metadata (required)

Set in `__init__` (the base class initialises it to `{}`). Update it in `generate_response()` by assigning a new dict: `self.last_response_metadata = {...}`. For in-place updates, write to `self._last_response_metadata` (e.g. `self._last_response_metadata["usage"] = ...`); writing through the public name would only mutate the copy returned by the property getter, so the stored dict would be unchanged. Because the getter returns a copy, callers can read `last_response_metadata` freely without mutating the client's state.
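
A minimal sketch of the update pattern inside a custom client; `call_my_api` and its response fields are hypothetical, not part of VERA:

```python
# Sketch only: `call_my_api` and its response fields are illustrative.
async def generate_response(self, conversation_history):
    api_response = await call_my_api(conversation_history)  # hypothetical call

    # Replace the whole dict in one assignment...
    self.last_response_metadata = {
        "usage": api_response.get("usage", {}),
        "finish_reason": api_response.get("finish_reason"),
    }
    # ...or update in place through the private attribute; writing
    # self.last_response_metadata["raw_metadata"] = ... would only mutate a copy.
    self._last_response_metadata["raw_metadata"] = dict(api_response)

    self.ensure_conversation_id()  # see "Conversation flow and history" below
    return api_response["text"]
```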

### 3. Add the new LLM client to the factory

@@ -227,6 +224,15 @@ python3 judge.py -f conversations/{YOUR_FOLDER} -j your-model-name
- **LangChain Integration**: The provided implementations use LangChain for robust LLM interactions
- **Error Handling**: Make sure to handle errors gracefully and return appropriate error messages

### Conversation flow and history

VERA's ConversationSimulator holds the full conversation and passes `conversation_history` into your client on every call. Your client is not required to store history. You can:

- **Stateless**: Build each request from `conversation_history` (as the built-in clients do), or
- **Server-side state**: Send a `conversation_id` to your API and let the server maintain the thread; in that case you may use `conversation_history` only when needed (e.g. as a fallback or for logging), as sketched at the end of this section.

Your LLM client is responsible for updating `conversation_id` after each response. At the end of `generate_response()`, after setting `last_response_metadata`, call `self.ensure_conversation_id()`. If `self.conversation_id` is already set, that method does nothing; otherwise it sets it from the `"conversation_id"` key in `self.last_response_metadata` when present, or creates a unique id via `create_conversation_id()`. Use `self.conversation_id` whenever your API needs a thread or session id. Callers can read `last_response_metadata` directly; the property getter already returns a copy, so no explicit `.copy()` is needed. The ConversationSimulator does not manage `conversation_id`; it only calls `generate_response(conversation_history)`.
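
A sketch of the server-side-state pattern; `post_to_my_backend`, its payload shape, and its response fields are assumptions, while `conversation_id`, `last_response_metadata`, and `ensure_conversation_id()` come from the interface:

```python
# Sketch only: the backend helper and its payload/response shape are hypothetical.
async def generate_response(self, conversation_history):
    # Send only the newest message (assuming LangChain-style messages with
    # .content); the server keeps the thread keyed by conversation_id,
    # which is None on the first turn.
    latest = conversation_history[-1].content if conversation_history else ""
    reply = await post_to_my_backend(
        {"message": latest, "conversation_id": self.conversation_id}
    )

    # ensure_conversation_id() picks up the server's thread id on the first
    # turn; if the server did not return one, a local uuid is created.
    self.last_response_metadata = {
        "conversation_id": reply.get("conversation_id"),
        "usage": reply.get("usage", {}),
    }
    self.ensure_conversation_id()
    return reply["text"]
```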

## Structured Output Support

### Native Support (Recommended)
2 changes: 1 addition & 1 deletion generate_conversations/conversation_simulator.py
@@ -110,7 +110,7 @@ async def start_conversation(
input_message=input_msg,
response_message=lc_message,
early_termination=False,
logging_metadata=current_speaker.get_last_response_metadata(),
logging_metadata=current_speaker.last_response_metadata,
)
self.conversation_history.append(turn_obj)

68 changes: 33 additions & 35 deletions generate_conversations/runner.py
@@ -49,20 +49,19 @@ def __init__(
self.max_total_words = max_total_words
self.max_personas = max_personas

self.AGENT_SYSTEM_PROMPT = self.agent_model_config.get(
"system_prompt", "You are a helpful AI assistant."
)

async def run_single_conversation(
self,
persona_config: dict,
agent,
max_turns: int,
conversation_id: int,
conversation_index: int,
run_number: int,
**kargs: dict,
) -> Dict[str, Any]:
"""Run a single conversation asynchronously."""
"""Run a single conversation asynchronously.

Each conversation uses its own agent instance so conversation_id and
per-session state are not shared across concurrent tasks.
"""
model_name = persona_config["model"]
system_prompt = persona_config["prompt"] # This is now the full persona prompt
persona_name = persona_config["name"]
@@ -83,7 +82,7 @@ async def run_single_conversation(
logger = setup_conversation_logger(filename_base, run_id=self.run_id)
start_time = time.time()

# Create LLM1 instance with the persona prompt and configuration
# Create persona instance
persona = LLMFactory.create_llm(
model_name=model_name,
name=f"{model_short} {persona_name}",
@@ -92,6 +91,23 @@ async def run_single_conversation(
**self.persona_model_config,
)

# Create new agent instance to reset conversation_id and metadata.
# Exclude keys passed explicitly below so create_llm does not receive duplicate keyword args.
agent_kwargs = {
k: v
for k, v in self.agent_model_config.items()
if k not in ("model", "name", "system_prompt")
}
agent = LLMFactory.create_llm(
model_name=self.agent_model_config["model"],
name=self.agent_model_config.get("name", "Provider"),
system_prompt=self.agent_model_config.get(
"system_prompt", "You are a helpful AI assistant."
),
role=Role.PROVIDER,
**agent_kwargs,
)

# Log conversation start
log_conversation_start(
logger=logger,
@@ -148,7 +164,7 @@ async def run_single_conversation(
simulator.save_conversation(f"{filename_base}.txt", self.folder_name)

result = {
"id": conversation_id,
"id": conversation_index,
"llm1_model": model_name,
"llm1_prompt": persona_name,
"run_number": run_number,
@@ -164,11 +180,11 @@ async def run_single_conversation(

# Cleanup LLM resources (e.g., close HTTP sessions for Azure)
# Always cleanup, even if conversation failed
try:
await persona.cleanup()
except Exception as e:
# Log but don't fail if cleanup fails
print(f"Warning: Failed to cleanup persona LLM: {e}")
for llm in (persona, agent):
try:
await llm.cleanup()
except Exception as e:
print(f"Warning: Failed to cleanup LLM: {e}")

return result

@@ -179,37 +195,26 @@ async def run_conversations(
# Load prompts from CSV based on persona names
personas = load_prompts_from_csv(persona_names, max_personas=self.max_personas)

# Load agent configuration (fixed, shared across all conversations)
agent = LLMFactory.create_llm(
model_name=self.agent_model_config["model"],
name=self.agent_model_config.pop("name"),
system_prompt=self.AGENT_SYSTEM_PROMPT,
role=Role.PROVIDER,
**self.agent_model_config,
)

# Create tasks for all conversations (each prompt run multiple times)
tasks = []
conversation_id = 1
conversation_index = 1

for persona in personas:
for run in range(1, self.runs_per_prompt + 1):
tasks.append(
# TODO: should we pass the persona object here?
self.run_single_conversation(
{
"model": self.persona_model_config["model"],
"prompt": persona["prompt"],
"name": persona["Name"],
"run": run,
},
agent,
self.max_turns,
conversation_id,
conversation_index,
run,
)
)
conversation_id += 1
conversation_index += 1

# Run all conversations with concurrency limit
start_time = datetime.now()
@@ -237,11 +242,4 @@ async def run_with_limit(task):

print(f"\nCompleted {len(results)} conversations in {total_time:.2f} seconds")

# Cleanup agent LLM resources (e.g., close HTTP sessions for Azure)
try:
await agent.cleanup()
except Exception as e:
# Log but don't fail if cleanup fails
print(f"Warning: Failed to cleanup agent LLM: {e}")

return results
13 changes: 6 additions & 7 deletions llm_clients/azure_llm.py
@@ -187,20 +187,21 @@ async def generate_response(
# Extract token usage
if "token_usage" in metadata:
usage = metadata["token_usage"]
self.last_response_metadata["usage"] = {
self._last_response_metadata["usage"] = {
"input_tokens": usage.get("input_tokens", 0),
"output_tokens": usage.get("output_tokens", 0),
"total_tokens": usage.get("total_tokens", 0),
}

# Extract finish reason
self.last_response_metadata["finish_reason"] = metadata.get(
self._last_response_metadata["finish_reason"] = metadata.get(
"finish_reason"
)

# Store raw metadata
self.last_response_metadata["raw_metadata"] = dict(metadata)
self._last_response_metadata["raw_metadata"] = dict(metadata)

self.ensure_conversation_id()
return response.text
except Exception as e:
# Store error metadata
@@ -244,8 +245,10 @@ async def generate_response(
debug_print(
f"\n[DEBUG {self.name} - {self.role.value}] " f"{helpful_msg}"
)
self.ensure_conversation_id()
return f"Error generating response: {helpful_msg}"

self.ensure_conversation_id()
return f"Error generating response: {error_msg}"

async def generate_structured_response(
Expand Down Expand Up @@ -307,10 +310,6 @@ async def generate_structured_response(
}
raise RuntimeError(f"Error generating structured response: {str(e)}") from e

def get_last_response_metadata(self) -> Dict[str, Any]:
"""Get metadata from the last response."""
return self.last_response_metadata.copy()

def set_system_prompt(self, system_prompt: str) -> None:
"""Set or update the system prompt."""
self.system_prompt = system_prompt
14 changes: 7 additions & 7 deletions llm_clients/claude_llm.py
@@ -118,19 +118,22 @@ async def generate_response(
# Extract token usage
if "usage" in metadata:
usage = metadata["usage"]
self.last_response_metadata["usage"] = {
self._last_response_metadata["usage"] = {
"input_tokens": usage.get("input_tokens", 0),
"output_tokens": usage.get("output_tokens", 0),
"total_tokens": usage.get("input_tokens", 0)
+ usage.get("output_tokens", 0),
}

# Extract stop reason
self.last_response_metadata["stop_reason"] = metadata.get("stop_reason")
self._last_response_metadata["stop_reason"] = metadata.get(
"stop_reason"
)

# Store raw metadata
self.last_response_metadata["raw_metadata"] = dict(metadata)
self._last_response_metadata["raw_metadata"] = dict(metadata)

self.ensure_conversation_id()
return response.text
except Exception as e:
# Store error metadata
@@ -143,6 +146,7 @@
"error": str(e),
"usage": {},
}
self.ensure_conversation_id()
return f"Error generating response: {str(e)}"

async def generate_structured_response(
Expand Down Expand Up @@ -204,10 +208,6 @@ async def generate_structured_response(
}
raise RuntimeError(f"Error generating structured response: {str(e)}") from e

def get_last_response_metadata(self) -> Dict[str, Any]:
"""Get metadata from the last response."""
return self.last_response_metadata.copy()

def set_system_prompt(self, system_prompt: str) -> None:
"""Set or update the system prompt."""
self.system_prompt = system_prompt
14 changes: 6 additions & 8 deletions llm_clients/gemini_llm.py
@@ -116,7 +116,7 @@ async def generate_response(
# Extract token usage - Gemini may have different structure
if "usage_metadata" in metadata:
usage = metadata["usage_metadata"]
self.last_response_metadata["usage"] = {
self._last_response_metadata["usage"] = {
"prompt_token_count": usage.get("prompt_token_count", 0),
"candidates_token_count": usage.get(
"candidates_token_count", 0
@@ -126,20 +126,21 @@
elif "token_usage" in metadata:
# Fallback structure
usage = metadata["token_usage"]
self.last_response_metadata["usage"] = {
self._last_response_metadata["usage"] = {
"prompt_tokens": usage.get("prompt_tokens", 0),
"completion_tokens": usage.get("completion_tokens", 0),
"total_tokens": usage.get("total_tokens", 0),
}

# Extract finish reason
self.last_response_metadata["finish_reason"] = metadata.get(
self._last_response_metadata["finish_reason"] = metadata.get(
"finish_reason"
)

# Store raw metadata
self.last_response_metadata["raw_metadata"] = dict(metadata)
self._last_response_metadata["raw_metadata"] = dict(metadata)

self.ensure_conversation_id()
return response.text
except Exception as e:
# Store error metadata
@@ -152,12 +153,9 @@
"error": str(e),
"usage": {},
}
self.ensure_conversation_id()
return f"Error generating response: {str(e)}"

def get_last_response_metadata(self) -> Dict[str, Any]:
"""Get metadata from the last response."""
return self.last_response_metadata.copy()

async def generate_structured_response(
self, message: Optional[str], response_model: Type[T]
) -> T:
50 changes: 47 additions & 3 deletions llm_clients/llm_interface.py
@@ -1,3 +1,5 @@
import copy
import uuid
from abc import ABC, abstractmethod
from enum import Enum
from typing import Any, Dict, List, Optional, Type, TypeVar
@@ -31,7 +33,44 @@ def __init__(
self.name = name
self.role = role
self.system_prompt = system_prompt or ""
self.last_response_metadata: Dict[str, Any] = {}
self._last_response_metadata: Dict[str, Any] = {}
self.conversation_id: Optional[str] = None

@property
def last_response_metadata(self) -> Dict[str, Any]:
"""Metadata from the last generate_response call. Returns a deep copy so
callers cannot mutate internal state (including nested dicts like usage).
"""
return copy.deepcopy(self._last_response_metadata)

@last_response_metadata.setter
def last_response_metadata(self, value: Optional[Dict[str, Any]]) -> None:
"""Set metadata; use _last_response_metadata for in-place updates."""
self._last_response_metadata = value or {}

def create_conversation_id(self) -> str:
"""Create a new unique conversation id.

Used when the client does not provide one in response metadata.
Subclasses may override to use a different id format.
"""
return str(uuid.uuid4())

def ensure_conversation_id(self) -> None:
"""Set conversation_id from last response metadata or create one if not set.

Call at the end of generate_response(), after last_response_metadata is set.
If conversation_id is already set, does nothing. Otherwise sets it from
self.last_response_metadata["conversation_id"] if present, or from
create_conversation_id(). Implementations must update
self.last_response_metadata in generate_response().
"""
if self.conversation_id is not None:
return
metadata = self.last_response_metadata or {}
self.conversation_id = (
metadata.get("conversation_id") or self.create_conversation_id()
)

@abstractmethod
async def generate_response(
@@ -49,8 +88,13 @@ async def generate_response(
starting the conversation.

Returns:
str: The response text. Metadata available via
get_last_response_metadata()
str: The response text. Metadata in self.last_response_metadata
(getter returns a copy so callers need not copy).

Note:
For API thread/session identification, use self.conversation_id.
Implementations should call self.ensure_conversation_id() before
returning, so self.conversation_id is set for the next call.
"""
pass
