From a33db7c4fdaa7cca1f04e135d1c4e998fe04dace Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dominik=20Berm=C3=BChler?= <31950929+dbermuehler@users.noreply.github.com>
Date: Wed, 17 Dec 2025 14:45:39 +0100
Subject: [PATCH] Introducing optional goal specification per evaluation case to GoalSuccessRateEvaluator

---
 .../evaluators/goal_success_rate_evaluator.py | 21 +++++++++------
 .../goal_success_rate/goal_success_rate_v1.py | 26 +++++++++++++++++++
 2 files changed, 39 insertions(+), 8 deletions(-)
 create mode 100644 src/strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_v1.py

diff --git a/src/strands_evals/evaluators/goal_success_rate_evaluator.py b/src/strands_evals/evaluators/goal_success_rate_evaluator.py
index 93a3bf5..2b7c78a 100644
--- a/src/strands_evals/evaluators/goal_success_rate_evaluator.py
+++ b/src/strands_evals/evaluators/goal_success_rate_evaluator.py
@@ -6,7 +6,7 @@
 from typing_extensions import TypeVar, Union
 
 from ..types.evaluation import EvaluationData, EvaluationOutput
-from ..types.trace import EvaluationLevel, SessionLevelInput
+from ..types.trace import EvaluationLevel
 from .evaluator import Evaluator
 from .prompt_templates.goal_success_rate import get_template
 
@@ -40,7 +40,7 @@ class GoalSuccessRateEvaluator(Evaluator[InputT, OutputT]):
 
     def __init__(
         self,
-        version: str = "v0",
+        version: str = "v1",
         model: Union[Model, str, None] = None,
         system_prompt: str | None = None,
     ):
@@ -50,8 +50,7 @@ def __init__(
         self.model = model
 
     def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
-        session_input = self._parse_trajectory(evaluation_case)
-        prompt = self._format_prompt(session_input)
+        prompt = self._format_prompt(evaluation_case)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
         rating = evaluator_agent.structured_output(GoalSuccessRating, prompt)
         normalized_score = self._score_mapping[rating.score]
@@ -64,8 +63,7 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva
         return [result]
 
     async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
-        session_input = self._parse_trajectory(evaluation_case)
-        prompt = self._format_prompt(session_input)
+        prompt = self._format_prompt(evaluation_case)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
         rating = await evaluator_agent.structured_output_async(GoalSuccessRating, prompt)
         normalized_score = self._score_mapping[rating.score]
@@ -77,10 +75,17 @@ async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT])
         )
         return [result]
 
-    def _format_prompt(self, session_input: SessionLevelInput) -> str:
-        """Format evaluation prompt from session-level input."""
+    def _format_prompt(self, evaluation_case: EvaluationData[InputT, OutputT]) -> str:
+        """Format evaluation prompt from evaluation case."""
+
+        goal = evaluation_case.metadata.get("goal")
+        session_input = self._parse_trajectory(evaluation_case)
+
         parts = []
 
+        if goal:
+            parts.append(f"# User Goal\n{goal}")
+
         if session_input.available_tools:
             parts.append(f"# Available tools\n{self._format_tools(session_input.available_tools)}")
 
diff --git a/src/strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_v1.py b/src/strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_v1.py
new file mode 100644
index 0000000..8764023
--- /dev/null
+++ b/src/strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_v1.py
@@ -0,0 +1,26 @@
+SYSTEM_PROMPT = """You are an objective judge evaluating whether a conversation between a User and an AI assistant successfully completed all User goals.
+
+You will be provided with:
+1. (Optional) A specific goal description that defines what the user is trying to achieve.
+2. The list of available tools the AI assistant can use, with descriptions for each tool about when to use it and how to use it.
+3. The complete conversation record with multiple turns including:
+   - User messages (User:)
+   - Assistant responses (Assistant:)
+   - Tool selected by the assistant (Action:)
+   - Tool outputs (Tool:)
+
+Your task is to carefully analyze the conversation and determine if all User goals were successfully achieved. Follow these steps:
+
+1. First, identify the user's goals:
+   - If a specific goal description is provided under "# User Goal", use that as the primary goal to evaluate.
+   - If no explicit goal is provided, derive the user's goals from their messages in the conversation record.
+2. Analyze the list of available tools and reason about what tools the AI assistant should use to achieve the goal(s).
+3. Check the conversation record to decide whether the AI assistant:
+   - Used the appropriate tools to address the user's goal(s)
+   - Got the expected outputs from those tools
+   - Responded to the User appropriately about the outcome
+4. Determine if the goal(s) were achieved based on whether the user's needs were satisfied.
+
+# Evaluation Rubric
+- Yes: All user goals were achieved. The agent successfully completed all requested tasks, provided accurate information, and the user received satisfactory outcomes.
+- No: Not all user goals were achieved. The agent failed to complete one or more requested tasks, provided incomplete/incorrect information, or the user's needs were not fully met."""
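Usage note (not part of the patch): a minimal sketch of how a caller could attach a per-case goal so that _format_prompt emits the "# User Goal" section. Only the metadata["goal"] lookup, the version/model/system_prompt parameters, and evaluate() returning a list of EvaluationOutput are established by this diff; the EvaluationData field names other than metadata, and all values below, are illustrative assumptions.

    from strands_evals.evaluators.goal_success_rate_evaluator import GoalSuccessRateEvaluator
    from strands_evals.types.evaluation import EvaluationData

    # Hypothetical case construction: `input`/`output` are assumed field names,
    # and `recorded_trajectory` is a placeholder for the captured session/trace
    # of the agent under test. Only the "goal" metadata key comes from the patch.
    case = EvaluationData(
        input="Book a table for two at an Italian restaurant tonight.",
        output=recorded_trajectory,
        metadata={"goal": "A reservation for two at an Italian restaurant is confirmed for tonight."},
    )

    # After this patch, version defaults to "v1"; model and system_prompt keep
    # their None defaults here.
    evaluator = GoalSuccessRateEvaluator()
    results = evaluator.evaluate(case)

    # Cases whose metadata has no "goal" key still work: the judge then derives
    # the user's goals from the conversation record itself.

Because the goal lives in per-case metadata rather than in the evaluator, a single evaluator instance can score a mixed dataset in which only some cases carry an explicit goal.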