From c51f857ccb52a4f459143f3b403b65b997d5b115 Mon Sep 17 00:00:00 2001
From: poshinchen
Date: Wed, 28 Jan 2026 16:34:51 -0500
Subject: [PATCH] feat: added ResponseRelevanceEvaluator

---
 src/strands_evals/evaluators/__init__.py     |   2 +
 .../response_relevance/__init__.py           |  11 ++
 .../response_relevance_v0.py                 |  29 ++++
 .../response_relevance_evaluator.py          | 144 ++++++++++++++++++
 .../extractors/trace_extractor.py            |   3 +-
 .../test_response_relevance_evaluator.py     | 132 ++++++++++++++++
 6 files changed, 319 insertions(+), 2 deletions(-)
 create mode 100644 src/strands_evals/evaluators/prompt_templates/response_relevance/__init__.py
 create mode 100644 src/strands_evals/evaluators/prompt_templates/response_relevance/response_relevance_v0.py
 create mode 100644 src/strands_evals/evaluators/response_relevance_evaluator.py
 create mode 100644 tests/strands_evals/evaluators/test_response_relevance_evaluator.py

diff --git a/src/strands_evals/evaluators/__init__.py b/src/strands_evals/evaluators/__init__.py
index 51346d8..677f138 100644
--- a/src/strands_evals/evaluators/__init__.py
+++ b/src/strands_evals/evaluators/__init__.py
@@ -5,6 +5,7 @@
 from .helpfulness_evaluator import HelpfulnessEvaluator
 from .interactions_evaluator import InteractionsEvaluator
 from .output_evaluator import OutputEvaluator
+from .response_relevance_evaluator import ResponseRelevanceEvaluator
 from .tool_parameter_accuracy_evaluator import ToolParameterAccuracyEvaluator
 from .tool_selection_accuracy_evaluator import ToolSelectionAccuracyEvaluator
 from .trajectory_evaluator import TrajectoryEvaluator
@@ -18,6 +19,7 @@
     "HarmfulnessEvaluator",
     "GoalSuccessRateEvaluator",
     "FaithfulnessEvaluator",
+    "ResponseRelevanceEvaluator",
     "ToolSelectionAccuracyEvaluator",
     "ToolParameterAccuracyEvaluator",
 ]
diff --git a/src/strands_evals/evaluators/prompt_templates/response_relevance/__init__.py b/src/strands_evals/evaluators/prompt_templates/response_relevance/__init__.py
new file mode 100644
index 0000000..0834ec2
--- /dev/null
+++ b/src/strands_evals/evaluators/prompt_templates/response_relevance/__init__.py
@@ -0,0 +1,11 @@
+from . import response_relevance_v0
+
+VERSIONS = {
+    "v0": response_relevance_v0,
+}
+
+DEFAULT_VERSION = "v0"
+
+
+def get_template(version: str = DEFAULT_VERSION):
+    return VERSIONS[version]
diff --git a/src/strands_evals/evaluators/prompt_templates/response_relevance/response_relevance_v0.py b/src/strands_evals/evaluators/prompt_templates/response_relevance/response_relevance_v0.py
new file mode 100644
index 0000000..e20e0d9
--- /dev/null
+++ b/src/strands_evals/evaluators/prompt_templates/response_relevance/response_relevance_v0.py
@@ -0,0 +1,29 @@
+SYSTEM_PROMPT = """You are an objective judge evaluating the relevance of an AI assistant's response to the user's question. Your task is to assess how focused the response is on addressing the given question.
+
+# Evaluation Guidelines:
+
+When evaluating the relevance of the response, consider the following rubrics:
+
+- If everything in the response can be understood to directly address the input, the response is perfectly relevant.
+- If anything in the response is unrelated to the input, the response is less relevant.
+- Relevance only evaluates whether the response is on topic. Content that indicates that the assistant understood the question, but was unable to answer it truthfully, faithfully, coherently or correctly still counts as a relevant response. Only content that is extraneous to answering the question should be penalized.
+- Duplicate information does not penalize relevance. The response could say the same thing multiple times. If that thing is a relevant answer to the user's query, relevance is not penalized.
+
+# Rating Scale:
+
+1. Not At All
+   - No part of the response is relevant to the question
+
+2. Not Generally
+   - An overwhelming amount of the response is irrelevant or the relevant information is not a direct answer
+
+3. Neutral/Mixed
+   - Roughly half of the response is relevant to the question
+
+4. Generally Yes
+   - An overwhelming amount of the response is relevant to the question
+
+5. Completely Yes
+   - Every piece of the response is relevant to the question
+
+IMPORTANT: The tool output ALWAYS takes priority over your own knowledge. Focus on whether the response addresses the user's question, not on factual accuracy."""
diff --git a/src/strands_evals/evaluators/response_relevance_evaluator.py b/src/strands_evals/evaluators/response_relevance_evaluator.py
new file mode 100644
index 0000000..8d40210
--- /dev/null
+++ b/src/strands_evals/evaluators/response_relevance_evaluator.py
@@ -0,0 +1,144 @@
+from enum import Enum
+from typing import cast
+
+from pydantic import BaseModel, Field
+from strands import Agent
+from strands.agent.agent_result import AgentResult
+from strands.models.model import Model
+from typing_extensions import TypeVar, Union
+
+from ..types.evaluation import EvaluationData, EvaluationOutput
+from ..types.trace import EvaluationLevel, TextContent, ToolExecution, TraceLevelInput
+from .evaluator import Evaluator
+from .prompt_templates.response_relevance import get_template
+
+InputT = TypeVar("InputT")
+OutputT = TypeVar("OutputT")
+
+
+class ResponseRelevanceScore(str, Enum):
+    """Categorical response relevance ratings."""
+
+    NOT_AT_ALL = "Not At All"
+    NOT_GENERALLY = "Not Generally"
+    NEUTRAL_MIXED = "Neutral/Mixed"
+    GENERALLY_YES = "Generally Yes"
+    COMPLETELY_YES = "Completely Yes"
+
+
+class ResponseRelevanceRating(BaseModel):
+    """Structured output for response relevance evaluation."""
+
+    reasoning: str = Field(description="Step by step reasoning to derive the final score")
+    score: ResponseRelevanceScore = Field(description="Categorical response relevance rating")
+
+
+class ResponseRelevanceEvaluator(Evaluator[InputT, OutputT]):
+    """Evaluates the relevance of agent responses to user questions."""
+
+    evaluation_level = EvaluationLevel.TRACE_LEVEL
+
+    _score_mapping = {
+        ResponseRelevanceScore.NOT_AT_ALL: 0.0,
+        ResponseRelevanceScore.NOT_GENERALLY: 0.25,
+        ResponseRelevanceScore.NEUTRAL_MIXED: 0.5,
+        ResponseRelevanceScore.GENERALLY_YES: 0.75,
+        ResponseRelevanceScore.COMPLETELY_YES: 1.0,
+    }
+
+    def __init__(
+        self,
+        version: str = "v0",
+        model: Union[Model, str, None] = None,
+        system_prompt: str | None = None,
+        include_inputs: bool = True,
+    ):
+        super().__init__()
+        self.system_prompt = system_prompt or get_template(version).SYSTEM_PROMPT
+        self.version = version
+        self.model = model
+        self.include_inputs = include_inputs
+
+    def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+        parsed_input = self._get_last_turn(evaluation_case)
+        prompt = self._format_prompt(parsed_input)
+        evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
+        result = evaluator_agent(prompt, structured_output_model=ResponseRelevanceRating)
+        return self._create_evaluation_output(result)
+
+    async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+        parsed_input = self._get_last_turn(evaluation_case)
+        prompt = self._format_prompt(parsed_input)
+        evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
+        result = await evaluator_agent.invoke_async(prompt, structured_output_model=ResponseRelevanceRating)
+        return self._create_evaluation_output(result)
+
+    def _create_evaluation_output(self, result: AgentResult) -> list[EvaluationOutput]:
+        rating = cast(ResponseRelevanceRating, result.structured_output)
+        normalized_score = self._score_mapping[rating.score]
+        return [
+            EvaluationOutput(
+                score=normalized_score,
+                test_pass=normalized_score >= 0.5,
+                reason=rating.reasoning,
+                label=rating.score,
+            )
+        ]
+
+    def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
+        """Extract the most recent turn from the conversation for evaluation."""
+        parsed_inputs = self._parse_trajectory(evaluation_case)
+        if not parsed_inputs:
+            raise ValueError(
+                "No turn-level inputs could be parsed from the trajectory. "
+                "Ensure actual_trajectory is a Session with at least one AgentInvocationSpan."
+            )
+        return parsed_inputs[-1]
+
+    def _extract_user_prompt(self, parsed_input: TraceLevelInput) -> str:
+        """Extract user prompt from last message in session history.
+
+        Args:
+            parsed_input: Trace-level input containing session history
+
+        Returns:
+            User prompt text, or empty string if not available
+        """
+        if not parsed_input.session_history:
+            return ""
+
+        last_msg = parsed_input.session_history[-1]
+        if not isinstance(last_msg, list) and self._has_text_content(last_msg):
+            first_content = last_msg.content[0]
+            if isinstance(first_content, TextContent):
+                return first_content.text
+
+        return ""
+
+    def _format_prompt(self, parsed_input: TraceLevelInput) -> str:
+        """Format evaluation prompt from parsed trace data.
+
+        Args:
+            parsed_input: Trace-level input containing agent response and session history
+
+        Returns:
+            Formatted prompt string with conversation context and target response
+        """
+        parts = []
+
+        if parsed_input.session_history:
+            history_lines = []
+            for msg in parsed_input.session_history:
+                if isinstance(msg, list) and msg and isinstance(msg[0], ToolExecution):
+                    continue  # Skip tool execution lists
+                if not isinstance(msg, list) and self._has_text_content(msg):
+                    first_content = msg.content[0]
+                    if isinstance(first_content, TextContent):
+                        history_lines.append(f"{msg.role.value.capitalize()}: {first_content.text}")
+            history_str = "\n".join(history_lines)
+            parts.append(f"# Previous turns:\n{history_str}")
+
+        user_prompt = self._extract_user_prompt(parsed_input)
+        parts.append(f"# Target turn to evaluate:\nUser: {user_prompt}\nAssistant: {parsed_input.agent_response.text}")
+
+        return "\n\n".join(parts)
diff --git a/src/strands_evals/extractors/trace_extractor.py b/src/strands_evals/extractors/trace_extractor.py
index e55ef85..0e6de3a 100644
--- a/src/strands_evals/extractors/trace_extractor.py
+++ b/src/strands_evals/extractors/trace_extractor.py
@@ -65,8 +65,7 @@ def _extract_trace_level(self, session: Session) -> list[TraceLevelInput]:
             if tool_spans:
                 try:
                     tool_executions = [
-                        ToolExecution(tool_call=ts.tool_call, tool_result=ts.tool_result)
-                        for ts in tool_spans
+                        ToolExecution(tool_call=ts.tool_call, tool_result=ts.tool_result) for ts in tool_spans
                     ]
                     previous_turns.append(tool_executions)
                 except (AttributeError, TypeError, ValueError) as e:
diff --git a/tests/strands_evals/evaluators/test_response_relevance_evaluator.py b/tests/strands_evals/evaluators/test_response_relevance_evaluator.py
new file mode 100644
index 0000000..73280cd
--- /dev/null
+++ b/tests/strands_evals/evaluators/test_response_relevance_evaluator.py
@@ -0,0 +1,132 @@
+"""Tests for ResponseRelevanceEvaluator."""
+
+from datetime import datetime
+from unittest.mock import Mock, patch
+
+import pytest
+
+from strands_evals.evaluators import ResponseRelevanceEvaluator
+from strands_evals.evaluators.response_relevance_evaluator import (
+    ResponseRelevanceRating,
+    ResponseRelevanceScore,
+)
+from strands_evals.types import EvaluationData
+from strands_evals.types.trace import (
+    AgentInvocationSpan,
+    EvaluationLevel,
+    Session,
+    SpanInfo,
+    Trace,
+)
+
+
+@pytest.fixture
+def evaluation_data():
+    now = datetime.now()
+    span_info = SpanInfo(session_id="test-session", start_time=now, end_time=now)
+    agent_span = AgentInvocationSpan(
+        span_info=span_info,
+        user_prompt="What is the capital of France?",
+        agent_response="The capital of France is Paris.",
+        available_tools=[],
+    )
+    trace = Trace(spans=[agent_span], trace_id="trace1", session_id="test-session")
+    session = Session(traces=[trace], session_id="test-session")
+
+    return EvaluationData(
+        input="What is the capital of France?",
+        actual_output="The capital of France is Paris.",
+        actual_trajectory=session,
+        name="test",
+    )
+
+
+def test_init_with_defaults():
+    evaluator = ResponseRelevanceEvaluator()
+
+    assert evaluator.version == "v0"
+    assert evaluator.model is None
+    assert evaluator.include_inputs is True
+    assert evaluator.system_prompt is not None
+    assert evaluator.evaluation_level == EvaluationLevel.TRACE_LEVEL
+
+
+def test_init_with_custom_values():
+    evaluator = ResponseRelevanceEvaluator(version="v1", model="gpt-4", system_prompt="Custom", include_inputs=False)
+
+    assert evaluator.version == "v1"
+    assert evaluator.model == "gpt-4"
+    assert evaluator.include_inputs is False
+    assert evaluator.system_prompt == "Custom"
+
+
+@patch("strands_evals.evaluators.response_relevance_evaluator.Agent")
+def test_evaluate(mock_agent_class, evaluation_data):
+    mock_agent = Mock()
+    mock_result = Mock()
+    mock_result.structured_output = ResponseRelevanceRating(
+        reasoning="The response directly answers the question", score=ResponseRelevanceScore.COMPLETELY_YES
+    )
+    mock_agent.return_value = mock_result
+    mock_agent_class.return_value = mock_agent
+    evaluator = ResponseRelevanceEvaluator()
+
+    result = evaluator.evaluate(evaluation_data)
+
+    assert len(result) == 1
+    assert result[0].score == 1.0
+    assert result[0].test_pass is True
+    assert result[0].reason == "The response directly answers the question"
+    assert result[0].label == ResponseRelevanceScore.COMPLETELY_YES
+
+
+@pytest.mark.parametrize(
+    "score,expected_value,expected_pass",
+    [
+        (ResponseRelevanceScore.NOT_AT_ALL, 0.0, False),
+        (ResponseRelevanceScore.NOT_GENERALLY, 0.25, False),
+        (ResponseRelevanceScore.NEUTRAL_MIXED, 0.5, True),
+        (ResponseRelevanceScore.GENERALLY_YES, 0.75, True),
+        (ResponseRelevanceScore.COMPLETELY_YES, 1.0, True),
+    ],
+)
+@patch("strands_evals.evaluators.response_relevance_evaluator.Agent")
+def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value, expected_pass):
+    mock_agent = Mock()
+    mock_result = Mock()
+    mock_result.structured_output = ResponseRelevanceRating(reasoning="Test", score=score)
+    mock_agent.return_value = mock_result
+    mock_agent_class.return_value = mock_agent
+    evaluator = ResponseRelevanceEvaluator()
+
+    result = evaluator.evaluate(evaluation_data)
+
+    assert len(result) == 1
+    assert result[0].score == expected_value
+    assert result[0].test_pass == expected_pass
+    assert result[0].label == score
+
+
+@pytest.mark.asyncio
+@patch("strands_evals.evaluators.response_relevance_evaluator.Agent")
+async def test_evaluate_async(mock_agent_class, evaluation_data):
+    mock_agent = Mock()
+    mock_result = Mock()
+    mock_result.structured_output = ResponseRelevanceRating(
+        reasoning="The response directly answers the question", score=ResponseRelevanceScore.COMPLETELY_YES
+    )
+
+    async def mock_invoke_async(*args, **kwargs):
+        return mock_result
+
+    mock_agent.invoke_async = mock_invoke_async
+    mock_agent_class.return_value = mock_agent
+    evaluator = ResponseRelevanceEvaluator()
+
+    result = await evaluator.evaluate_async(evaluation_data)
+
+    assert len(result) == 1
+    assert result[0].score == 1.0
+    assert result[0].test_pass is True
+    assert result[0].reason == "The response directly answers the question"
+    assert result[0].label == ResponseRelevanceScore.COMPLETELY_YES
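For reviewers who want to try the evaluator locally, below is a minimal usage sketch built only from the constructors exercised in the tests above. The session/trace ids and the single-turn content are illustrative placeholders; model=None defers to the evaluator Agent's default model, and a strands Model instance or a model id string is also accepted per the constructor signature.

from datetime import datetime

from strands_evals.evaluators import ResponseRelevanceEvaluator
from strands_evals.types import EvaluationData
from strands_evals.types.trace import AgentInvocationSpan, Session, SpanInfo, Trace

# Build a one-turn Session the same way the test fixture does.
now = datetime.now()
span = AgentInvocationSpan(
    span_info=SpanInfo(session_id="demo", start_time=now, end_time=now),
    user_prompt="What is the capital of France?",
    agent_response="The capital of France is Paris.",
    available_tools=[],
)
session = Session(traces=[Trace(spans=[span], trace_id="t1", session_id="demo")], session_id="demo")

case = EvaluationData(
    input="What is the capital of France?",
    actual_output="The capital of France is Paris.",
    actual_trajectory=session,
    name="demo",
)

# Note: evaluate() runs an LLM judge through a strands Agent, so a usable
# model (or default model credentials) is required when running this for real.
evaluator = ResponseRelevanceEvaluator(version="v0", model=None)
[output] = evaluator.evaluate(case)

# Categorical label -> normalized score: Not At All=0.0, Not Generally=0.25,
# Neutral/Mixed=0.5, Generally Yes=0.75, Completely Yes=1.0; pass means score >= 0.5.
print(output.label, output.score, output.test_pass, output.reason)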