21 changes: 13 additions & 8 deletions src/strands_evals/evaluators/goal_success_rate_evaluator.py
@@ -6,7 +6,7 @@
from typing_extensions import TypeVar, Union

from ..types.evaluation import EvaluationData, EvaluationOutput
-from ..types.trace import EvaluationLevel, SessionLevelInput
+from ..types.trace import EvaluationLevel
from .evaluator import Evaluator
from .prompt_templates.goal_success_rate import get_template

@@ -40,7 +40,7 @@ class GoalSuccessRateEvaluator(Evaluator[InputT, OutputT]):

    def __init__(
        self,
-        version: str = "v0",
+        version: str = "v1",
        model: Union[Model, str, None] = None,
        system_prompt: str | None = None,
    ):
@@ -50,8 +50,7 @@ def __init__(
        self.model = model

    def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
-        session_input = self._parse_trajectory(evaluation_case)
-        prompt = self._format_prompt(session_input)
+        prompt = self._format_prompt(evaluation_case)
        evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
        rating = evaluator_agent.structured_output(GoalSuccessRating, prompt)
        normalized_score = self._score_mapping[rating.score]
@@ -64,8 +63,7 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
        return [result]

    async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
-        session_input = self._parse_trajectory(evaluation_case)
-        prompt = self._format_prompt(session_input)
+        prompt = self._format_prompt(evaluation_case)
        evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
        rating = await evaluator_agent.structured_output_async(GoalSuccessRating, prompt)
        normalized_score = self._score_mapping[rating.score]
@@ -77,10 +75,17 @@ async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
        )
        return [result]

-    def _format_prompt(self, session_input: SessionLevelInput) -> str:
-        """Format evaluation prompt from session-level input."""
+    def _format_prompt(self, evaluation_case: EvaluationData[InputT, OutputT]) -> str:
+        """Format evaluation prompt from evaluation case."""
+
+        goal = evaluation_case.metadata.get("goal")
+        session_input = self._parse_trajectory(evaluation_case)
+
        parts = []

+        if goal:
+            parts.append(f"# User Goal\n{goal}")
+
        if session_input.available_tools:
            parts.append(f"# Available tools\n{self._format_tools(session_input.available_tools)}")

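A minimal usage sketch of the goal-aware flow introduced above: the evaluator now reads an optional goal from `evaluation_case.metadata`. The `EvaluationData` constructor arguments other than `metadata` are assumptions here, since its definition is not part of this diff; only the `metadata.get("goal")` lookup is shown in the change.

```python
from strands_evals.evaluators.goal_success_rate_evaluator import GoalSuccessRateEvaluator
from strands_evals.types.evaluation import EvaluationData

# Hypothetical case construction: field names besides `metadata` are assumptions,
# since EvaluationData's definition is not included in this diff.
case = EvaluationData(
    input="Book me a table for two at 7pm tonight.",
    output="Your reservation for two at 7pm is confirmed.",
    metadata={"goal": "Reserve a table for two people at 7pm today."},
)

evaluator = GoalSuccessRateEvaluator()  # now defaults to version="v1"
results = evaluator.evaluate(case)      # goal is read via evaluation_case.metadata.get("goal")
print(results[0])
```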
@@ -0,0 +1,26 @@
SYSTEM_PROMPT = """You are an objective judge evaluating whether, in a conversation between a User and an AI assistant, the assistant successfully completed all of the User's goals.

You will be provided with:
1. (Optional) A specific goal description that defines what the user is trying to achieve.
2. The list of available tools the AI assistant can use, with a description of when and how to use each tool.
3. The complete conversation record with multiple turns including:
- User messages (User:)
- Assistant responses (Assistant:)
- Tool selected by the assistant (Action:)
- Tool outputs (Tool:)

Your task is to carefully analyze the conversation and determine if all User goals were successfully achieved. Follow these steps:

1. First, identify the user's goals:
- If a specific goal description is provided under "# User Goal", use that as the primary goal to evaluate.
- If no explicit goal is provided, derive the user's goals from their messages in the conversation record.
2. Analyze the list of available tools and reason about what tools the AI assistant should use to achieve the goal(s).
3. Check the conversation record to decide whether the AI assistant:
- Used the appropriate tools to address the user's goal(s)
- Got the expected outputs from those tools
- Responded to the User appropriately about the outcome
4. Determine if the goal(s) were achieved based on whether the user's needs were satisfied.

# Evaluation Rubric
- Yes: All user goals were achieved. The agent successfully completed all requested tasks, provided accurate information, and the user received satisfactory outcomes.
- No: Not all user goals were achieved. The agent failed to complete one or more requested tasks, provided incomplete/incorrect information, or the user's needs were not fully met."""
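To make the section headers this prompt refers to concrete, here is an illustrative sketch of an evaluation prompt as `_format_prompt` might assemble it when a goal is supplied. Only the `# User Goal` and `# Available tools` headers appear in this diff; the tool description and the User:/Assistant:/Action:/Tool: conversation lines below are invented examples of the layout the prompt describes, not real template output.

```python
# Illustrative only: shows the shape of the input the judge receives.
example_prompt = """\
# User Goal
Reserve a table for two people at 7pm today.

# Available tools
- make_reservation: Book a restaurant table given a time and party size.

User: Book me a table for two at 7pm tonight.
Action: make_reservation(time="19:00", party_size=2)
Tool: {"status": "confirmed"}
Assistant: Your table for two at 7pm is confirmed.
"""
```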