18 changes: 12 additions & 6 deletions docs/evaluating.md
@@ -186,12 +186,9 @@ def set_system_prompt(self, system_prompt: str) -> None:
self.system_prompt = system_prompt
```

#### `get_last_response_metadata()` - Get response metadata (optional but recommended)
```python
def get_last_response_metadata(self) -> Dict[str, Any]:
"""Get metadata from the last response."""
return self.last_response_metadata.copy()
```
#### `last_response_metadata` - Response metadata (required)

Set in `__init__` (the base class initialises it to `{}`). Update it in `generate_response()` by assigning a new dict: `self.last_response_metadata = {...}`. For in-place updates, write to `self._last_response_metadata` (e.g. `self._last_response_metadata["usage"] = ...`); writing through the public name would only mutate the copy returned by the property getter, so the stored dict would be unchanged. Because the getter returns a copy, callers can read `last_response_metadata` freely without mutating the client's state.
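
A minimal sketch of the update pattern inside a custom client; `call_my_api` and its response fields are hypothetical, not part of VERA:

```python
# Sketch only: `call_my_api` and its response fields are illustrative.
async def generate_response(self, conversation_history):
    api_response = await call_my_api(conversation_history)  # hypothetical call

    # Replace the whole dict in one assignment...
    self.last_response_metadata = {
        "usage": api_response.get("usage", {}),
        "finish_reason": api_response.get("finish_reason"),
    }
    # ...or update in place through the private attribute; writing
    # self.last_response_metadata["raw_metadata"] = ... would only mutate a copy.
    self._last_response_metadata["raw_metadata"] = dict(api_response)

    self.ensure_conversation_id()  # see "Conversation flow and history" below
    return api_response["text"]
```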

### 3. Add the new LLM client to the factory

@@ -227,6 +224,15 @@ python3 judge.py -f conversations/{YOUR_FOLDER} -j your-model-name
- **LangChain Integration**: The provided implementations use LangChain for robust LLM interactions
- **Error Handling**: Make sure to handle errors gracefully and return appropriate error messages

### Conversation flow and history

VERA's ConversationSimulator holds the full conversation and passes `conversation_history` into your client on every call. Your client is not required to store history. You can:

- **Stateless**: Build each request from `conversation_history` (as the built-in clients do), or
- **Server-side state**: Send a `conversation_id` to your API and let the server maintain the thread; in that case you may use `conversation_history` only when needed (e.g. as a fallback or for logging), as sketched at the end of this section.

Your LLM client is responsible for updating `conversation_id` after each response. At the end of `generate_response()`, after setting `last_response_metadata`, call `self.ensure_conversation_id()`. If `self.conversation_id` is already set, that method does nothing; otherwise it sets it from the `"conversation_id"` key in `self.last_response_metadata` when present, or creates a unique id via `create_conversation_id()`. Use `self.conversation_id` whenever your API needs a thread or session id. Callers can read `last_response_metadata` directly; the property getter already returns a copy, so no explicit `.copy()` is needed. The ConversationSimulator does not manage `conversation_id`; it only calls `generate_response(conversation_history)`.
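
A sketch of the server-side-state pattern; `post_to_my_backend`, its payload shape, and its response fields are assumptions, while `conversation_id`, `last_response_metadata`, and `ensure_conversation_id()` come from the interface:

```python
# Sketch only: the backend helper and its payload/response shape are hypothetical.
async def generate_response(self, conversation_history):
    # Send only the newest message (assuming LangChain-style messages with
    # .content); the server keeps the thread keyed by conversation_id,
    # which is None on the first turn.
    latest = conversation_history[-1].content if conversation_history else ""
    reply = await post_to_my_backend(
        {"message": latest, "conversation_id": self.conversation_id}
    )

    # ensure_conversation_id() picks up the server's thread id on the first
    # turn; if the server did not return one, a local uuid is created.
    self.last_response_metadata = {
        "conversation_id": reply.get("conversation_id"),
        "usage": reply.get("usage", {}),
    }
    self.ensure_conversation_id()
    return reply["text"]
```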

## Structured Output Support

### Native Support (Recommended)
2 changes: 1 addition & 1 deletion generate_conversations/conversation_simulator.py
@@ -110,7 +110,7 @@ async def start_conversation(
input_message=input_msg,
response_message=lc_message,
early_termination=False,
logging_metadata=current_speaker.get_last_response_metadata(),
logging_metadata=current_speaker.last_response_metadata,
)
self.conversation_history.append(turn_obj)

68 changes: 33 additions & 35 deletions generate_conversations/runner.py
@@ -49,20 +49,19 @@ def __init__(
self.max_total_words = max_total_words
self.max_personas = max_personas

self.AGENT_SYSTEM_PROMPT = self.agent_model_config.get(
"system_prompt", "You are a helpful AI assistant."
)

async def run_single_conversation(
self,
persona_config: dict,
agent,
max_turns: int,
conversation_id: int,
conversation_index: int,
run_number: int,
**kargs: dict,
) -> Dict[str, Any]:
"""Run a single conversation asynchronously."""
"""Run a single conversation asynchronously.

Each conversation uses its own agent instance so conversation_id and
per-session state are not shared across concurrent tasks.
"""
model_name = persona_config["model"]
system_prompt = persona_config["prompt"] # This is now the full persona prompt
persona_name = persona_config["name"]
@@ -83,7 +82,7 @@ async def run_single_conversation(
logger = setup_conversation_logger(filename_base, run_id=self.run_id)
start_time = time.time()

# Create LLM1 instance with the persona prompt and configuration
# Create persona instance
persona = LLMFactory.create_llm(
model_name=model_name,
name=f"{model_short} {persona_name}",
@@ -92,6 +91,23 @@ async def run_single_conversation(
**self.persona_model_config,
)

# Create new agent instance to reset conversation_id and metadata.
# Exclude keys passed explicitly below so create_llm does not receive duplicate keyword args.
agent_kwargs = {
k: v
for k, v in self.agent_model_config.items()
if k not in ("model", "name", "system_prompt")
}
agent = LLMFactory.create_llm(
model_name=self.agent_model_config["model"],
name=self.agent_model_config.get("name", "Provider"),
system_prompt=self.agent_model_config.get(
"system_prompt", "You are a helpful AI assistant."
),
role=Role.PROVIDER,
**agent_kwargs,
)

# Log conversation start
log_conversation_start(
logger=logger,
@@ -148,7 +164,7 @@ async def run_single_conversation(
simulator.save_conversation(f"{filename_base}.txt", self.folder_name)

result = {
"id": conversation_id,
"id": conversation_index,
"llm1_model": model_name,
"llm1_prompt": persona_name,
"run_number": run_number,
@@ -164,11 +180,11 @@ async def run_single_conversation(

# Cleanup LLM resources (e.g., close HTTP sessions for Azure)
# Always cleanup, even if conversation failed
try:
await persona.cleanup()
except Exception as e:
# Log but don't fail if cleanup fails
print(f"Warning: Failed to cleanup persona LLM: {e}")
for llm in (persona, agent):
try:
await llm.cleanup()
except Exception as e:
print(f"Warning: Failed to cleanup LLM: {e}")

return result

@@ -179,37 +195,26 @@ async def run_conversations(
# Load prompts from CSV based on persona names
personas = load_prompts_from_csv(persona_names, max_personas=self.max_personas)

# Load agent configuration (fixed, shared across all conversations)
agent = LLMFactory.create_llm(
model_name=self.agent_model_config["model"],
name=self.agent_model_config.pop("name"),
system_prompt=self.AGENT_SYSTEM_PROMPT,
role=Role.PROVIDER,
**self.agent_model_config,
)

# Create tasks for all conversations (each prompt run multiple times)
tasks = []
conversation_id = 1
conversation_index = 1

for persona in personas:
for run in range(1, self.runs_per_prompt + 1):
tasks.append(
# TODO: should we pass the persona object here?
self.run_single_conversation(
{
"model": self.persona_model_config["model"],
"prompt": persona["prompt"],
"name": persona["Name"],
"run": run,
},
agent,
self.max_turns,
conversation_id,
conversation_index,
run,
)
)
conversation_id += 1
conversation_index += 1

# Run all conversations with concurrency limit
start_time = datetime.now()
@@ -237,11 +242,4 @@ async def run_with_limit(task):

print(f"\nCompleted {len(results)} conversations in {total_time:.2f} seconds")

# Cleanup agent LLM resources (e.g., close HTTP sessions for Azure)
try:
await agent.cleanup()
except Exception as e:
# Log but don't fail if cleanup fails
print(f"Warning: Failed to cleanup agent LLM: {e}")

return results
13 changes: 6 additions & 7 deletions llm_clients/azure_llm.py
@@ -187,20 +187,21 @@ async def generate_response(
# Extract token usage
if "token_usage" in metadata:
usage = metadata["token_usage"]
self.last_response_metadata["usage"] = {
self._last_response_metadata["usage"] = {
"input_tokens": usage.get("input_tokens", 0),
"output_tokens": usage.get("output_tokens", 0),
"total_tokens": usage.get("total_tokens", 0),
}

# Extract finish reason
self.last_response_metadata["finish_reason"] = metadata.get(
self._last_response_metadata["finish_reason"] = metadata.get(
"finish_reason"
)

# Store raw metadata
self.last_response_metadata["raw_metadata"] = dict(metadata)
self._last_response_metadata["raw_metadata"] = dict(metadata)

self.ensure_conversation_id()
return response.text
except Exception as e:
# Store error metadata
@@ -244,8 +245,10 @@ async def generate_response(
debug_print(
f"\n[DEBUG {self.name} - {self.role.value}] " f"{helpful_msg}"
)
self.ensure_conversation_id()
return f"Error generating response: {helpful_msg}"

self.ensure_conversation_id()
return f"Error generating response: {error_msg}"

async def generate_structured_response(
Expand Down Expand Up @@ -307,10 +310,6 @@ async def generate_structured_response(
}
raise RuntimeError(f"Error generating structured response: {str(e)}") from e

def get_last_response_metadata(self) -> Dict[str, Any]:
"""Get metadata from the last response."""
return self.last_response_metadata.copy()

def set_system_prompt(self, system_prompt: str) -> None:
"""Set or update the system prompt."""
self.system_prompt = system_prompt
14 changes: 7 additions & 7 deletions llm_clients/claude_llm.py
@@ -118,19 +118,22 @@ async def generate_response(
# Extract token usage
if "usage" in metadata:
usage = metadata["usage"]
self.last_response_metadata["usage"] = {
self._last_response_metadata["usage"] = {
"input_tokens": usage.get("input_tokens", 0),
"output_tokens": usage.get("output_tokens", 0),
"total_tokens": usage.get("input_tokens", 0)
+ usage.get("output_tokens", 0),
}

# Extract stop reason
self.last_response_metadata["stop_reason"] = metadata.get("stop_reason")
self._last_response_metadata["stop_reason"] = metadata.get(
"stop_reason"
)

# Store raw metadata
self.last_response_metadata["raw_metadata"] = dict(metadata)
self._last_response_metadata["raw_metadata"] = dict(metadata)

self.ensure_conversation_id()
return response.text
except Exception as e:
# Store error metadata
@@ -143,6 +146,7 @@
"error": str(e),
"usage": {},
}
self.ensure_conversation_id()
return f"Error generating response: {str(e)}"

async def generate_structured_response(
Expand Down Expand Up @@ -204,10 +208,6 @@ async def generate_structured_response(
}
raise RuntimeError(f"Error generating structured response: {str(e)}") from e

def get_last_response_metadata(self) -> Dict[str, Any]:
"""Get metadata from the last response."""
return self.last_response_metadata.copy()

def set_system_prompt(self, system_prompt: str) -> None:
"""Set or update the system prompt."""
self.system_prompt = system_prompt
14 changes: 6 additions & 8 deletions llm_clients/gemini_llm.py
@@ -116,7 +116,7 @@ async def generate_response(
# Extract token usage - Gemini may have different structure
if "usage_metadata" in metadata:
usage = metadata["usage_metadata"]
self.last_response_metadata["usage"] = {
self._last_response_metadata["usage"] = {
"prompt_token_count": usage.get("prompt_token_count", 0),
"candidates_token_count": usage.get(
"candidates_token_count", 0
@@ -126,20 +126,21 @@
elif "token_usage" in metadata:
# Fallback structure
usage = metadata["token_usage"]
self.last_response_metadata["usage"] = {
self._last_response_metadata["usage"] = {
"prompt_tokens": usage.get("prompt_tokens", 0),
"completion_tokens": usage.get("completion_tokens", 0),
"total_tokens": usage.get("total_tokens", 0),
}

# Extract finish reason
self.last_response_metadata["finish_reason"] = metadata.get(
self._last_response_metadata["finish_reason"] = metadata.get(
"finish_reason"
)

# Store raw metadata
self.last_response_metadata["raw_metadata"] = dict(metadata)
self._last_response_metadata["raw_metadata"] = dict(metadata)

self.ensure_conversation_id()
return response.text
except Exception as e:
# Store error metadata
@@ -152,12 +153,9 @@
"error": str(e),
"usage": {},
}
self.ensure_conversation_id()
return f"Error generating response: {str(e)}"

def get_last_response_metadata(self) -> Dict[str, Any]:
"""Get metadata from the last response."""
return self.last_response_metadata.copy()

async def generate_structured_response(
self, message: Optional[str], response_model: Type[T]
) -> T:
50 changes: 47 additions & 3 deletions llm_clients/llm_interface.py
@@ -1,3 +1,5 @@
import copy
import uuid
from abc import ABC, abstractmethod
from enum import Enum
from typing import Any, Dict, List, Optional, Type, TypeVar
@@ -31,7 +33,44 @@ def __init__(
self.name = name
self.role = role
self.system_prompt = system_prompt or ""
self.last_response_metadata: Dict[str, Any] = {}
self._last_response_metadata: Dict[str, Any] = {}
self.conversation_id: Optional[str] = None

@property
def last_response_metadata(self) -> Dict[str, Any]:
"""Metadata from the last generate_response call. Returns a deep copy so
callers cannot mutate internal state (including nested dicts like usage).
"""
return copy.deepcopy(self._last_response_metadata)

@last_response_metadata.setter
def last_response_metadata(self, value: Optional[Dict[str, Any]]) -> None:
"""Set metadata; use _last_response_metadata for in-place updates."""
self._last_response_metadata = value or {}

def create_conversation_id(self) -> str:
"""Create a new unique conversation id.

Used when the client does not provide one in response metadata.
Subclasses may override to use a different id format.
"""
return str(uuid.uuid4())

def ensure_conversation_id(self) -> None:
"""Set conversation_id from last response metadata or create one if not set.

Call at the end of generate_response(), after last_response_metadata is set.
If conversation_id is already set, does nothing. Otherwise sets it from
self.last_response_metadata["conversation_id"] if present, or from
create_conversation_id(). Implementations must update
self.last_response_metadata in generate_response().
"""
if self.conversation_id is not None:
return
metadata = self.last_response_metadata or {}
self.conversation_id = (
metadata.get("conversation_id") or self.create_conversation_id()
)

@abstractmethod
async def generate_response(
@@ -49,8 +88,13 @@ async def generate_response(
starting the conversation.

Returns:
str: The response text. Metadata available via
get_last_response_metadata()
str: The response text. Metadata in self.last_response_metadata
(getter returns a copy so callers need not copy).

Note:
For API thread/session identification, use self.conversation_id.
Implementations should call self.ensure_conversation_id() before
returning, so self.conversation_id is set for the next call.
"""
pass
