From a33db7c4fdaa7cca1f04e135d1c4e998fe04dace Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dominik=20Berm=C3=BChler?= <31950929+dbermuehler@users.noreply.github.com>
Date: Wed, 17 Dec 2025 14:45:39 +0100
Subject: [PATCH] Introducing optional goal specification per evaluation case to GoalSuccessRateEvaluator

---
 .../evaluators/goal_success_rate_evaluator.py | 21 +++++++++------
 .../goal_success_rate/goal_success_rate_v1.py | 26 +++++++++++++++++++
 2 files changed, 39 insertions(+), 8 deletions(-)
 create mode 100644 src/strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_v1.py

diff --git a/src/strands_evals/evaluators/goal_success_rate_evaluator.py b/src/strands_evals/evaluators/goal_success_rate_evaluator.py
index 93a3bf5..2b7c78a 100644
--- a/src/strands_evals/evaluators/goal_success_rate_evaluator.py
+++ b/src/strands_evals/evaluators/goal_success_rate_evaluator.py
@@ -6,7 +6,7 @@
 from typing_extensions import TypeVar, Union
 
 from ..types.evaluation import EvaluationData, EvaluationOutput
-from ..types.trace import EvaluationLevel, SessionLevelInput
+from ..types.trace import EvaluationLevel
 from .evaluator import Evaluator
 from .prompt_templates.goal_success_rate import get_template
 
@@ -40,7 +40,7 @@ class GoalSuccessRateEvaluator(Evaluator[InputT, OutputT]):
 
     def __init__(
         self,
-        version: str = "v0",
+        version: str = "v1",
         model: Union[Model, str, None] = None,
         system_prompt: str | None = None,
     ):
@@ -50,8 +50,7 @@ def __init__(
         self.model = model
 
     def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
-        session_input = self._parse_trajectory(evaluation_case)
-        prompt = self._format_prompt(session_input)
+        prompt = self._format_prompt(evaluation_case)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
         rating = evaluator_agent.structured_output(GoalSuccessRating, prompt)
         normalized_score = self._score_mapping[rating.score]
@@ -64,8 +63,7 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva
         return [result]
 
     async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
-        session_input = self._parse_trajectory(evaluation_case)
-        prompt = self._format_prompt(session_input)
+        prompt = self._format_prompt(evaluation_case)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
         rating = await evaluator_agent.structured_output_async(GoalSuccessRating, prompt)
         normalized_score = self._score_mapping[rating.score]
@@ -77,10 +75,17 @@ async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT])
         )
         return [result]
 
-    def _format_prompt(self, session_input: SessionLevelInput) -> str:
-        """Format evaluation prompt from session-level input."""
+    def _format_prompt(self, evaluation_case: EvaluationData[InputT, OutputT]) -> str:
+        """Format evaluation prompt from evaluation case."""
+
+        goal = evaluation_case.metadata.get("goal")
+        session_input = self._parse_trajectory(evaluation_case)
+
         parts = []
 
+        if goal:
+            parts.append(f"# User Goal\n{goal}")
+
         if session_input.available_tools:
             parts.append(f"# Available tools\n{self._format_tools(session_input.available_tools)}")
 
diff --git a/src/strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_v1.py b/src/strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_v1.py
new file mode 100644
index 0000000..8764023
--- /dev/null
+++ b/src/strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_v1.py
@@ -0,0 +1,26 @@
+SYSTEM_PROMPT = """You are an objective judge evaluating whether a conversation between a User and an AI assistant successfully completed all User goals.
+
+You will be provided with:
+1. (Optional) A specific goal description that defines what the user is trying to achieve.
+2. The list of available tools the AI assistant can use, with descriptions for each tool about when to use it and how to use it.
+3. The complete conversation record with multiple turns including:
+   - User messages (User:)
+   - Assistant responses (Assistant:)
+   - Tool selected by the assistant (Action:)
+   - Tool outputs (Tool:)
+
+Your task is to carefully analyze the conversation and determine if all User goals were successfully achieved. Follow these steps:
+
+1. First, identify the user's goals:
+   - If a specific goal description is provided under "# User Goal", use that as the primary goal to evaluate.
+   - If no explicit goal is provided, derive the user's goals from their messages in the conversation record.
+2. Analyze the list of available tools and reason about what tools the AI assistant should use to achieve the goal(s).
+3. Check the conversation record to decide whether the AI assistant:
+   - Used the appropriate tools to address the user's goal(s)
+   - Got the expected outputs from those tools
+   - Responded to the User appropriately about the outcome
+4. Determine if the goal(s) were achieved based on whether the user's needs were satisfied.
+
+# Evaluation Rubric
+- Yes: All user goals were achieved. The agent successfully completed all requested tasks, provided accurate information, and the user received satisfactory outcomes.
+- No: Not all user goals were achieved. The agent failed to complete one or more requested tasks, provided incomplete/incorrect information, or the user's needs were not fully met."""
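Usage note (not part of the patch): a minimal sketch of how a caller could attach a per-case goal so that _format_prompt emits the "# User Goal" section. Only the metadata["goal"] lookup, the version/model/system_prompt parameters, and evaluate() returning a list of EvaluationOutput are established by this diff; the EvaluationData field names other than metadata, and all values below, are illustrative assumptions.

    from strands_evals.evaluators.goal_success_rate_evaluator import GoalSuccessRateEvaluator
    from strands_evals.types.evaluation import EvaluationData

    # Hypothetical case construction: `input`/`output` are assumed field names,
    # and `recorded_trajectory` is a placeholder for the captured session/trace
    # of the agent under test. Only the "goal" metadata key comes from the patch.
    case = EvaluationData(
        input="Book a table for two at an Italian restaurant tonight.",
        output=recorded_trajectory,
        metadata={"goal": "A reservation for two at an Italian restaurant is confirmed for tonight."},
    )

    # After this patch, version defaults to "v1"; model and system_prompt keep
    # their None defaults here.
    evaluator = GoalSuccessRateEvaluator()
    results = evaluator.evaluate(case)

    # Cases whose metadata has no "goal" key still work: the judge then derives
    # the user's goals from the conversation record itself.

Because the goal lives in per-case metadata rather than in the evaluator, a single evaluator instance can score a mixed dataset in which only some cases carry an explicit goal.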