2 changes: 2 additions & 0 deletions src/strands_evals/evaluators/__init__.py
@@ -5,6 +5,7 @@
from .helpfulness_evaluator import HelpfulnessEvaluator
from .interactions_evaluator import InteractionsEvaluator
from .output_evaluator import OutputEvaluator
from .response_relevance_evaluator import ResponseRelevanceEvaluator
from .tool_parameter_accuracy_evaluator import ToolParameterAccuracyEvaluator
from .tool_selection_accuracy_evaluator import ToolSelectionAccuracyEvaluator
from .trajectory_evaluator import TrajectoryEvaluator
@@ -18,6 +19,7 @@
"HarmfulnessEvaluator",
"GoalSuccessRateEvaluator",
"FaithfulnessEvaluator",
"ResponseRelevanceEvaluator",
"ToolSelectionAccuracyEvaluator",
"ToolParameterAccuracyEvaluator",
]
@@ -0,0 +1,11 @@
from . import response_relevance_v0

VERSIONS = {
"v0": response_relevance_v0,
}

DEFAULT_VERSION = "v0"


def get_template(version: str = DEFAULT_VERSION):
    """Return the prompt template module registered for the given version."""
    return VERSIONS[version]
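This registry is consumed by looking up a version and reading its SYSTEM_PROMPT, as ResponseRelevanceEvaluator.__init__ does below. A minimal sketch, assuming the absolute import path implied by the relative import in response_relevance_evaluator.py:

from strands_evals.evaluators.prompt_templates.response_relevance import get_template

# "v0" resolves to the response_relevance_v0 module; its SYSTEM_PROMPT becomes the judge's system prompt.
system_prompt = get_template("v0").SYSTEM_PROMPT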
@@ -0,0 +1,29 @@
SYSTEM_PROMPT = """You are an objective judge evaluating the relevance of an AI assistant's response to the user's question. Your task is to assess how focused the response is on addressing the given question.

# Evaluation Guidelines:

When evaluating the relevance of the response, consider the following rubrics:

- If everything in the response can be understood to directly address the input, the response is perfectly relevant.
- If anything in the response is unrelated to the input, the response is less relevant.
- Relevance only evaluates whether the response is on topic. Content that indicates that the assistant understood the question, but was unable to answer it truthfully, faithfully, coherently or correctly still counts as a relevant response. Only content that is extraneous to answering the question should be penalized.
- Duplicate information does not penalize relevance. The response could say the same thing multiple times. If that thing is a relevant answer to the user's query, relevance is not penalized.

# Rating Scale:

1. Not At All
- No part of the response is relevant to the question

2. Not Generally
- An overwhelming amount of the response is irrelevant or the relevant information is not a direct answer

3. Neutral/Mixed
- Roughly half of the response is relevant to the question

4. Generally Yes
- An overwhelming amount of the response is relevant to the question

5. Completely Yes
- Every piece of the response is relevant to the question

IMPORTANT: The tool output ALWAYS takes priority over your own knowledge. Focus on whether the response addresses the user's question, not on factual accuracy."""
144 changes: 144 additions & 0 deletions src/strands_evals/evaluators/response_relevance_evaluator.py
@@ -0,0 +1,144 @@
from enum import Enum
from typing import cast

from pydantic import BaseModel, Field
from strands import Agent
from strands.agent.agent_result import AgentResult
from strands.models.model import Model
from typing_extensions import TypeVar, Union

from ..types.evaluation import EvaluationData, EvaluationOutput
from ..types.trace import EvaluationLevel, TextContent, ToolExecution, TraceLevelInput
from .evaluator import Evaluator
from .prompt_templates.response_relevance import get_template

InputT = TypeVar("InputT")
OutputT = TypeVar("OutputT")


class ResponseRelevanceScore(str, Enum):
"""Categorical response relevance ratings."""

NOT_AT_ALL = "Not At All"
NOT_GENERALLY = "Not Generally"
NEUTRAL_MIXED = "Neutral/Mixed"
GENERALLY_YES = "Generally Yes"
COMPLETELY_YES = "Completely Yes"


class ResponseRelevanceRating(BaseModel):
"""Structured output for response relevance evaluation."""

reasoning: str = Field(description="Step by step reasoning to derive the final score")
score: ResponseRelevanceScore = Field(description="Categorical response relevance rating")


class ResponseRelevanceEvaluator(Evaluator[InputT, OutputT]):
"""Evaluates the relevance of agent responses to user questions."""

evaluation_level = EvaluationLevel.TRACE_LEVEL

_score_mapping = {
ResponseRelevanceScore.NOT_AT_ALL: 0.0,
ResponseRelevanceScore.NOT_GENERALLY: 0.25,
ResponseRelevanceScore.NEUTRAL_MIXED: 0.5,
ResponseRelevanceScore.GENERALLY_YES: 0.75,
ResponseRelevanceScore.COMPLETELY_YES: 1.0,
}

def __init__(
self,
version: str = "v0",
model: Union[Model, str, None] = None,
system_prompt: str | None = None,
include_inputs: bool = True,
):
super().__init__()
self.system_prompt = system_prompt or get_template(version).SYSTEM_PROMPT
self.version = version
self.model = model
self.include_inputs = include_inputs

def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
parsed_input = self._get_last_turn(evaluation_case)
prompt = self._format_prompt(parsed_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
result = evaluator_agent(prompt, structured_output_model=ResponseRelevanceRating)
return self._create_evaluation_output(result)

async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
parsed_input = self._get_last_turn(evaluation_case)
prompt = self._format_prompt(parsed_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
result = await evaluator_agent.invoke_async(prompt, structured_output_model=ResponseRelevanceRating)
return self._create_evaluation_output(result)

def _create_evaluation_output(self, result: AgentResult) -> list[EvaluationOutput]:
rating = cast(ResponseRelevanceRating, result.structured_output)
normalized_score = self._score_mapping[rating.score]
return [
EvaluationOutput(
score=normalized_score,
test_pass=normalized_score >= 0.5,
reason=rating.reasoning,
label=rating.score,
)
]

def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
"""Extract the most recent turn from the conversation for evaluation."""
parsed_inputs = self._parse_trajectory(evaluation_case)
if not parsed_inputs:
raise ValueError(
"No turn-level inputs could be parsed from the trajectory. "
"Ensure actual_trajectory is a Session with at least one AgentInvocationSpan."
)
return parsed_inputs[-1]

def _extract_user_prompt(self, parsed_input: TraceLevelInput) -> str:
"""Extract user prompt from last message in session history.

Args:
parsed_input: Trace-level input containing session history

Returns:
User prompt text, or empty string if not available
"""
if not parsed_input.session_history:
return ""

last_msg = parsed_input.session_history[-1]
if not isinstance(last_msg, list) and self._has_text_content(last_msg):
first_content = last_msg.content[0]
if isinstance(first_content, TextContent):
return first_content.text

return ""

def _format_prompt(self, parsed_input: TraceLevelInput) -> str:
"""Format evaluation prompt from parsed trace data.

Args:
parsed_input: Trace-level input containing agent response and session history

Returns:
Formatted prompt string with conversation context and target response
"""
parts = []

if parsed_input.session_history:
history_lines = []
for msg in parsed_input.session_history:
if isinstance(msg, list) and msg and isinstance(msg[0], ToolExecution):
continue # Skip tool execution lists
if not isinstance(msg, list) and self._has_text_content(msg):
first_content = msg.content[0]
if isinstance(first_content, TextContent):
history_lines.append(f"{msg.role.value.capitalize()}: {first_content.text}")
history_str = "\n".join(history_lines)
parts.append(f"# Previous turns:\n{history_str}")

user_prompt = self._extract_user_prompt(parsed_input)
parts.append(f"# Target turn to evaluate:\nUser: {user_prompt}\nAssistant: {parsed_input.agent_response.text}")

return "\n\n".join(parts)
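For reference, a minimal end-to-end sketch of driving the new evaluator, mirroring the test fixture further down. The session, trace, and prompt strings are illustrative, and an actual run needs a judge model available to the strands Agent (model may be a Model instance, a model id string, or None per the constructor signature):

from datetime import datetime

from strands_evals.evaluators import ResponseRelevanceEvaluator
from strands_evals.types import EvaluationData
from strands_evals.types.trace import AgentInvocationSpan, Session, SpanInfo, Trace

now = datetime.now()
span = AgentInvocationSpan(
    span_info=SpanInfo(session_id="demo-session", start_time=now, end_time=now),
    user_prompt="What is the capital of France?",
    agent_response="The capital of France is Paris.",
    available_tools=[],
)
session = Session(
    traces=[Trace(spans=[span], trace_id="trace-1", session_id="demo-session")],
    session_id="demo-session",
)

case = EvaluationData(
    input="What is the capital of France?",
    actual_output="The capital of France is Paris.",
    actual_trajectory=session,
    name="demo",
)

evaluator = ResponseRelevanceEvaluator(version="v0")
outputs = evaluator.evaluate(case)
# Each EvaluationOutput carries a normalized score in [0, 1], a pass flag (score >= 0.5),
# the judge's reasoning, and the categorical label.
print(outputs[0].score, outputs[0].test_pass)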
3 changes: 1 addition & 2 deletions src/strands_evals/extractors/trace_extractor.py
@@ -65,8 +65,7 @@ def _extract_trace_level(self, session: Session) -> list[TraceLevelInput]:
if tool_spans:
try:
tool_executions = [
ToolExecution(tool_call=ts.tool_call, tool_result=ts.tool_result)
for ts in tool_spans
ToolExecution(tool_call=ts.tool_call, tool_result=ts.tool_result) for ts in tool_spans
]
previous_turns.append(tool_executions)
except (AttributeError, TypeError, ValueError) as e:
132 changes: 132 additions & 0 deletions tests/strands_evals/evaluators/test_response_relevance_evaluator.py
@@ -0,0 +1,132 @@
"""Tests for ResponseRelevanceEvaluator."""

from datetime import datetime
from unittest.mock import Mock, patch

import pytest

from strands_evals.evaluators import ResponseRelevanceEvaluator
from strands_evals.evaluators.response_relevance_evaluator import (
ResponseRelevanceRating,
ResponseRelevanceScore,
)
from strands_evals.types import EvaluationData
from strands_evals.types.trace import (
AgentInvocationSpan,
EvaluationLevel,
Session,
SpanInfo,
Trace,
)


@pytest.fixture
def evaluation_data():
now = datetime.now()
span_info = SpanInfo(session_id="test-session", start_time=now, end_time=now)
agent_span = AgentInvocationSpan(
span_info=span_info,
user_prompt="What is the capital of France?",
agent_response="The capital of France is Paris.",
available_tools=[],
)
trace = Trace(spans=[agent_span], trace_id="trace1", session_id="test-session")
session = Session(traces=[trace], session_id="test-session")

return EvaluationData(
input="What is the capital of France?",
actual_output="The capital of France is Paris.",
actual_trajectory=session,
name="test",
)


def test_init_with_defaults():
evaluator = ResponseRelevanceEvaluator()

assert evaluator.version == "v0"
assert evaluator.model is None
assert evaluator.include_inputs is True
assert evaluator.system_prompt is not None
assert evaluator.evaluation_level == EvaluationLevel.TRACE_LEVEL


def test_init_with_custom_values():
evaluator = ResponseRelevanceEvaluator(version="v1", model="gpt-4", system_prompt="Custom", include_inputs=False)

assert evaluator.version == "v1"
assert evaluator.model == "gpt-4"
assert evaluator.include_inputs is False
assert evaluator.system_prompt == "Custom"


@patch("strands_evals.evaluators.response_relevance_evaluator.Agent")
def test_evaluate(mock_agent_class, evaluation_data):
mock_agent = Mock()
mock_result = Mock()
mock_result.structured_output = ResponseRelevanceRating(
reasoning="The response directly answers the question", score=ResponseRelevanceScore.COMPLETELY_YES
)
mock_agent.return_value = mock_result
mock_agent_class.return_value = mock_agent
evaluator = ResponseRelevanceEvaluator()

result = evaluator.evaluate(evaluation_data)

assert len(result) == 1
assert result[0].score == 1.0
assert result[0].test_pass is True
assert result[0].reason == "The response directly answers the question"
assert result[0].label == ResponseRelevanceScore.COMPLETELY_YES


@pytest.mark.parametrize(
"score,expected_value,expected_pass",
[
(ResponseRelevanceScore.NOT_AT_ALL, 0.0, False),
(ResponseRelevanceScore.NOT_GENERALLY, 0.25, False),
(ResponseRelevanceScore.NEUTRAL_MIXED, 0.5, True),
(ResponseRelevanceScore.GENERALLY_YES, 0.75, True),
(ResponseRelevanceScore.COMPLETELY_YES, 1.0, True),
],
)
@patch("strands_evals.evaluators.response_relevance_evaluator.Agent")
def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value, expected_pass):
mock_agent = Mock()
mock_result = Mock()
mock_result.structured_output = ResponseRelevanceRating(reasoning="Test", score=score)
mock_agent.return_value = mock_result
mock_agent_class.return_value = mock_agent
evaluator = ResponseRelevanceEvaluator()

result = evaluator.evaluate(evaluation_data)

assert len(result) == 1
assert result[0].score == expected_value
assert result[0].test_pass == expected_pass
assert result[0].label == score


@pytest.mark.asyncio
@patch("strands_evals.evaluators.response_relevance_evaluator.Agent")
async def test_evaluate_async(mock_agent_class, evaluation_data):
mock_agent = Mock()
mock_result = Mock()
mock_result.structured_output = ResponseRelevanceRating(
reasoning="The response directly answers the question", score=ResponseRelevanceScore.COMPLETELY_YES
)

async def mock_invoke_async(*args, **kwargs):
return mock_result

mock_agent.invoke_async = mock_invoke_async
mock_agent_class.return_value = mock_agent
evaluator = ResponseRelevanceEvaluator()

result = await evaluator.evaluate_async(evaluation_data)

assert len(result) == 1
assert result[0].score == 1.0
assert result[0].test_pass is True
assert result[0].reason == "The response directly answers the question"
assert result[0].label == ResponseRelevanceScore.COMPLETELY_YES