2 changes: 2 additions & 0 deletions src/strands_evals/evaluators/__init__.py
@@ -5,6 +5,7 @@
from .helpfulness_evaluator import HelpfulnessEvaluator
from .interactions_evaluator import InteractionsEvaluator
from .output_evaluator import OutputEvaluator
from .response_relevance_evaluator import ResponseRelevanceEvaluator
from .tool_parameter_accuracy_evaluator import ToolParameterAccuracyEvaluator
from .tool_selection_accuracy_evaluator import ToolSelectionAccuracyEvaluator
from .trajectory_evaluator import TrajectoryEvaluator
@@ -18,6 +19,7 @@
"HarmfulnessEvaluator",
"GoalSuccessRateEvaluator",
"FaithfulnessEvaluator",
"ResponseRelevanceEvaluator",
"ToolSelectionAccuracyEvaluator",
"ToolParameterAccuracyEvaluator",
]
@@ -0,0 +1,11 @@
from . import response_relevance_v0

VERSIONS = {
"v0": response_relevance_v0,
}

DEFAULT_VERSION = "v0"


def get_template(version: str = DEFAULT_VERSION):
    """Return the prompt template module registered for the given version."""
    return VERSIONS[version]
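This registry is consumed by looking up a version and reading its SYSTEM_PROMPT, as ResponseRelevanceEvaluator.__init__ does below. A minimal sketch, assuming the absolute import path implied by the relative import in response_relevance_evaluator.py:

from strands_evals.evaluators.prompt_templates.response_relevance import get_template

# "v0" resolves to the response_relevance_v0 module; its SYSTEM_PROMPT becomes the judge's system prompt.
system_prompt = get_template("v0").SYSTEM_PROMPT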
@@ -0,0 +1,29 @@
SYSTEM_PROMPT = """You are an objective judge evaluating the relevance of an AI assistant's response to the user's question. Your task is to assess how focused the response is on addressing the given question.

# Evaluation Guidelines:

When evaluating the relevance of the response, consider the following rubrics:

- If everything in the response can be understood to directly address the input, the response is perfectly relevant.
- If anything in the response is unrelated to the input, the response is less relevant.
- Relevance only evaluates whether the response is on topic. Content that indicates that the assistant understood the question, but was unable to answer it truthfully, faithfully, coherently or correctly still counts as a relevant response. Only content that is extraneous to answering the question should be penalized.
- Duplicate information does not penalize relevance. The response could say the same thing multiple times. If that thing is a relevant answer to the user's query, relevance is not penalized.

# Rating Scale:

1. Not At All
- No part of the response is relevant to the question

2. Not Generally
- An overwhelming amount of the response is irrelevant or the relevant information is not a direct answer

3. Neutral/Mixed
- Roughly half of the response is relevant to the question

4. Generally Yes
- An overwhelming amount of the response is relevant to the question

5. Completely Yes
- Every piece of the response is relevant to the question

IMPORTANT: The tool output ALWAYS takes priority over your own knowledge. Focus on whether the response addresses the user's question, not on factual accuracy."""
144 changes: 144 additions & 0 deletions src/strands_evals/evaluators/response_relevance_evaluator.py
@@ -0,0 +1,144 @@
from enum import Enum
from typing import cast

from pydantic import BaseModel, Field
from strands import Agent
from strands.agent.agent_result import AgentResult
from strands.models.model import Model
from typing_extensions import TypeVar, Union

from ..types.evaluation import EvaluationData, EvaluationOutput
from ..types.trace import EvaluationLevel, TextContent, ToolExecution, TraceLevelInput
from .evaluator import Evaluator
from .prompt_templates.response_relevance import get_template

InputT = TypeVar("InputT")
OutputT = TypeVar("OutputT")


class ResponseRelevanceScore(str, Enum):
"""Categorical response relevance ratings."""

NOT_AT_ALL = "Not At All"
NOT_GENERALLY = "Not Generally"
NEUTRAL_MIXED = "Neutral/Mixed"
GENERALLY_YES = "Generally Yes"
COMPLETELY_YES = "Completely Yes"


class ResponseRelevanceRating(BaseModel):
"""Structured output for response relevance evaluation."""

reasoning: str = Field(description="Step by step reasoning to derive the final score")
score: ResponseRelevanceScore = Field(description="Categorical response relevance rating")


class ResponseRelevanceEvaluator(Evaluator[InputT, OutputT]):
"""Evaluates the relevance of agent responses to user questions."""

evaluation_level = EvaluationLevel.TRACE_LEVEL

_score_mapping = {
ResponseRelevanceScore.NOT_AT_ALL: 0.0,
ResponseRelevanceScore.NOT_GENERALLY: 0.25,
ResponseRelevanceScore.NEUTRAL_MIXED: 0.5,
ResponseRelevanceScore.GENERALLY_YES: 0.75,
ResponseRelevanceScore.COMPLETELY_YES: 1.0,
}

def __init__(
self,
version: str = "v0",
model: Union[Model, str, None] = None,
system_prompt: str | None = None,
include_inputs: bool = True,
):
super().__init__()
self.system_prompt = system_prompt or get_template(version).SYSTEM_PROMPT
self.version = version
self.model = model
self.include_inputs = include_inputs

def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
parsed_input = self._get_last_turn(evaluation_case)
prompt = self._format_prompt(parsed_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
result = evaluator_agent(prompt, structured_output_model=ResponseRelevanceRating)
return self._create_evaluation_output(result)

async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
parsed_input = self._get_last_turn(evaluation_case)
prompt = self._format_prompt(parsed_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
result = await evaluator_agent.invoke_async(prompt, structured_output_model=ResponseRelevanceRating)
return self._create_evaluation_output(result)

def _create_evaluation_output(self, result: AgentResult) -> list[EvaluationOutput]:
rating = cast(ResponseRelevanceRating, result.structured_output)
normalized_score = self._score_mapping[rating.score]
return [
EvaluationOutput(
score=normalized_score,
test_pass=normalized_score >= 0.5,
reason=rating.reasoning,
label=rating.score,
)
]

def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
"""Extract the most recent turn from the conversation for evaluation."""
parsed_inputs = self._parse_trajectory(evaluation_case)
if not parsed_inputs:
raise ValueError(
"No turn-level inputs could be parsed from the trajectory. "
"Ensure actual_trajectory is a Session with at least one AgentInvocationSpan."
)
return parsed_inputs[-1]

def _extract_user_prompt(self, parsed_input: TraceLevelInput) -> str:
"""Extract user prompt from last message in session history.

Args:
parsed_input: Trace-level input containing session history

Returns:
User prompt text, or empty string if not available
"""
if not parsed_input.session_history:
return ""

last_msg = parsed_input.session_history[-1]
if not isinstance(last_msg, list) and self._has_text_content(last_msg):
first_content = last_msg.content[0]
if isinstance(first_content, TextContent):
return first_content.text

return ""

def _format_prompt(self, parsed_input: TraceLevelInput) -> str:
"""Format evaluation prompt from parsed trace data.

Args:
parsed_input: Trace-level input containing agent response and session history

Returns:
Formatted prompt string with conversation context and target response
"""
parts = []

if parsed_input.session_history:
history_lines = []
for msg in parsed_input.session_history:
if isinstance(msg, list) and msg and isinstance(msg[0], ToolExecution):
continue # Skip tool execution lists
if not isinstance(msg, list) and self._has_text_content(msg):
first_content = msg.content[0]
if isinstance(first_content, TextContent):
history_lines.append(f"{msg.role.value.capitalize()}: {first_content.text}")
history_str = "\n".join(history_lines)
parts.append(f"# Previous turns:\n{history_str}")

user_prompt = self._extract_user_prompt(parsed_input)
parts.append(f"# Target turn to evaluate:\nUser: {user_prompt}\nAssistant: {parsed_input.agent_response.text}")

return "\n\n".join(parts)
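For reference, a minimal end-to-end sketch of driving the new evaluator, mirroring the test fixture further down. The session, trace, and prompt strings are illustrative, and an actual run needs a judge model available to the strands Agent (model may be a Model instance, a model id string, or None per the constructor signature):

from datetime import datetime

from strands_evals.evaluators import ResponseRelevanceEvaluator
from strands_evals.types import EvaluationData
from strands_evals.types.trace import AgentInvocationSpan, Session, SpanInfo, Trace

now = datetime.now()
span = AgentInvocationSpan(
    span_info=SpanInfo(session_id="demo-session", start_time=now, end_time=now),
    user_prompt="What is the capital of France?",
    agent_response="The capital of France is Paris.",
    available_tools=[],
)
session = Session(
    traces=[Trace(spans=[span], trace_id="trace-1", session_id="demo-session")],
    session_id="demo-session",
)

case = EvaluationData(
    input="What is the capital of France?",
    actual_output="The capital of France is Paris.",
    actual_trajectory=session,
    name="demo",
)

evaluator = ResponseRelevanceEvaluator(version="v0")
outputs = evaluator.evaluate(case)
# Each EvaluationOutput carries a normalized score in [0, 1], a pass flag (score >= 0.5),
# the judge's reasoning, and the categorical label.
print(outputs[0].score, outputs[0].test_pass)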
3 changes: 1 addition & 2 deletions src/strands_evals/extractors/trace_extractor.py
@@ -65,8 +65,7 @@ def _extract_trace_level(self, session: Session) -> list[TraceLevelInput]:
if tool_spans:
try:
tool_executions = [
ToolExecution(tool_call=ts.tool_call, tool_result=ts.tool_result)
for ts in tool_spans
ToolExecution(tool_call=ts.tool_call, tool_result=ts.tool_result) for ts in tool_spans
]
previous_turns.append(tool_executions)
except (AttributeError, TypeError, ValueError) as e:
132 changes: 132 additions & 0 deletions tests/strands_evals/evaluators/test_response_relevance_evaluator.py
@@ -0,0 +1,132 @@
"""Tests for ResponseRelevanceEvaluator."""

from datetime import datetime
from unittest.mock import Mock, patch

import pytest

from strands_evals.evaluators import ResponseRelevanceEvaluator
from strands_evals.evaluators.response_relevance_evaluator import (
ResponseRelevanceRating,
ResponseRelevanceScore,
)
from strands_evals.types import EvaluationData
from strands_evals.types.trace import (
AgentInvocationSpan,
EvaluationLevel,
Session,
SpanInfo,
Trace,
)


@pytest.fixture
def evaluation_data():
now = datetime.now()
span_info = SpanInfo(session_id="test-session", start_time=now, end_time=now)
agent_span = AgentInvocationSpan(
span_info=span_info,
user_prompt="What is the capital of France?",
agent_response="The capital of France is Paris.",
available_tools=[],
)
trace = Trace(spans=[agent_span], trace_id="trace1", session_id="test-session")
session = Session(traces=[trace], session_id="test-session")

return EvaluationData(
input="What is the capital of France?",
actual_output="The capital of France is Paris.",
actual_trajectory=session,
name="test",
)


def test_init_with_defaults():
evaluator = ResponseRelevanceEvaluator()

assert evaluator.version == "v0"
assert evaluator.model is None
assert evaluator.include_inputs is True
assert evaluator.system_prompt is not None
assert evaluator.evaluation_level == EvaluationLevel.TRACE_LEVEL


def test_init_with_custom_values():
evaluator = ResponseRelevanceEvaluator(version="v1", model="gpt-4", system_prompt="Custom", include_inputs=False)

assert evaluator.version == "v1"
assert evaluator.model == "gpt-4"
assert evaluator.include_inputs is False
assert evaluator.system_prompt == "Custom"


@patch("strands_evals.evaluators.response_relevance_evaluator.Agent")
def test_evaluate(mock_agent_class, evaluation_data):
mock_agent = Mock()
mock_result = Mock()
mock_result.structured_output = ResponseRelevanceRating(
reasoning="The response directly answers the question", score=ResponseRelevanceScore.COMPLETELY_YES
)
mock_agent.return_value = mock_result
mock_agent_class.return_value = mock_agent
evaluator = ResponseRelevanceEvaluator()

result = evaluator.evaluate(evaluation_data)

assert len(result) == 1
assert result[0].score == 1.0
assert result[0].test_pass is True
assert result[0].reason == "The response directly answers the question"
assert result[0].label == ResponseRelevanceScore.COMPLETELY_YES


@pytest.mark.parametrize(
"score,expected_value,expected_pass",
[
(ResponseRelevanceScore.NOT_AT_ALL, 0.0, False),
(ResponseRelevanceScore.NOT_GENERALLY, 0.25, False),
(ResponseRelevanceScore.NEUTRAL_MIXED, 0.5, True),
(ResponseRelevanceScore.GENERALLY_YES, 0.75, True),
(ResponseRelevanceScore.COMPLETELY_YES, 1.0, True),
],
)
@patch("strands_evals.evaluators.response_relevance_evaluator.Agent")
def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value, expected_pass):
mock_agent = Mock()
mock_result = Mock()
mock_result.structured_output = ResponseRelevanceRating(reasoning="Test", score=score)
mock_agent.return_value = mock_result
mock_agent_class.return_value = mock_agent
evaluator = ResponseRelevanceEvaluator()

result = evaluator.evaluate(evaluation_data)

assert len(result) == 1
assert result[0].score == expected_value
assert result[0].test_pass == expected_pass
assert result[0].label == score


@pytest.mark.asyncio
@patch("strands_evals.evaluators.response_relevance_evaluator.Agent")
async def test_evaluate_async(mock_agent_class, evaluation_data):
mock_agent = Mock()
mock_result = Mock()
mock_result.structured_output = ResponseRelevanceRating(
reasoning="The response directly answers the question", score=ResponseRelevanceScore.COMPLETELY_YES
)

async def mock_invoke_async(*args, **kwargs):
return mock_result

mock_agent.invoke_async = mock_invoke_async
mock_agent_class.return_value = mock_agent
evaluator = ResponseRelevanceEvaluator()

result = await evaluator.evaluate_async(evaluation_data)

assert len(result) == 1
assert result[0].score == 1.0
assert result[0].test_pass is True
assert result[0].reason == "The response directly answers the question"
assert result[0].label == ResponseRelevanceScore.COMPLETELY_YES