Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion contributing/samples/gepa/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@
from tau_bench.types import EnvRunResult
from tau_bench.types import RunConfig
import tau_bench_agent as tau_bench_agent_lib

import utils


Expand Down
1 change: 0 additions & 1 deletion contributing/samples/gepa/run_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
from absl import flags
import experiment
from google.genai import types

import utils

_OUTPUT_DIR = flags.DEFINE_string(
Expand Down
34 changes: 27 additions & 7 deletions src/google/adk/evaluation/agent_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,10 +87,18 @@ class _EvalMetricResultWithInvocation(BaseModel):

This class is intentionally marked as private and is created for
convenience.

actual_invocations: These are the invocations that are obtained from the
agent under test.
expected_invocations: An optional list of invocations, if specified,
usually act as a benchmark/golden response. If these are specified
usually the expectation is that the length of this list and actual
invocations is the same.
eval_metric_result: This is the EvalMetricResult for the given actual and expected invocation.
"""

actual_invocation: Invocation
expected_invocation: Invocation
expected_invocation: Optional[Invocation] = None
eval_metric_result: EvalMetricResult


Expand Down Expand Up @@ -436,17 +444,29 @@ def _print_details(
"eval_status": per_invocation_result.eval_metric_result.eval_status,
"score": per_invocation_result.eval_metric_result.score,
"threshold": threshold,
"prompt": AgentEvaluator._convert_content_to_text(
per_invocation_result.expected_invocation.user_content
"prompt": (
AgentEvaluator._convert_content_to_text(
per_invocation_result.expected_invocation.user_content
)
if per_invocation_result.expected_invocation
else ""
),
"expected_response": AgentEvaluator._convert_content_to_text(
per_invocation_result.expected_invocation.final_response
"expected_response": (
AgentEvaluator._convert_content_to_text(
per_invocation_result.expected_invocation.final_response
)
if per_invocation_result.expected_invocation
else ""
),
"actual_response": AgentEvaluator._convert_content_to_text(
per_invocation_result.actual_invocation.final_response
),
"expected_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
per_invocation_result.expected_invocation.intermediate_data
"expected_tool_calls": (
AgentEvaluator._convert_tool_calls_to_text(
per_invocation_result.expected_invocation.intermediate_data
)
if per_invocation_result.expected_invocation
else ""
),
"actual_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
per_invocation_result.actual_invocation.intermediate_data
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from . import agent
25 changes: 25 additions & 0 deletions tests/integration/fixture/conversation_scenario/agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from google.adk import Agent

# Minimal agent fixture for the conversation-scenario integration test.
# It exposes no tools; the model alone answers the simulated user's turns.
_INSTRUCTION = """
You are helpful conversation agent. Kindly answer the questions.
"""

root_agent = Agent(
    name="Conversation_scenario_agent",
    model="gemini-2.5-flash",
    instruction=_INSTRUCTION,
    tools=[],
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"eval_set_id": "test",
"eval_cases": [{
"eval_id": "CS-1",
"conversation_scenario": {
"starting_prompt": "Hello",
"conversation_plan": "Ask the agent to do something"
},
"session_input": null
}]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"criteria": { "hallucinations_v1": { "threshold": 0.6 } },
"user_simulator_config": { "model": "gemini-2.5-flash", "max_allowed_invocations": 5 }
}
25 changes: 25 additions & 0 deletions tests/integration/test_conversation_scenario.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from google.adk.evaluation.agent_evaluator import AgentEvaluator
import pytest


@pytest.mark.asyncio
async def test_eval_agent_with_conversation_scenario():
  """Smoke-tests AgentEvaluator.evaluate on the conversation-scenario fixture.

  Runs the evaluation twice; the awaited call raises if evaluation fails,
  so reaching the end of the test is the success condition.
  """
  eval_kwargs = dict(
      agent_module="tests.integration.fixture.conversation_scenario.agent",
      eval_dataset_file_path_or_dir=(
          "tests/integration/fixture/conversation_scenario/"
          "conversation_scenario.test.json"
      ),
      num_runs=2,
  )
  await AgentEvaluator.evaluate(**eval_kwargs)
104 changes: 104 additions & 0 deletions tests/unittests/evaluation/test_agent_evaluator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import sys
from unittest.mock import MagicMock
from unittest.mock import patch

from google.adk.evaluation.agent_evaluator import _EvalMetricResultWithInvocation
from google.adk.evaluation.agent_evaluator import AgentEvaluator
from google.adk.evaluation.eval_case import Invocation
from google.adk.evaluation.eval_metrics import EvalMetricResult
from google.adk.evaluation.eval_metrics import EvalStatus
from google.genai import types as genai_types


def _make_actual_invocation(
    query: str = "user query", response: str = "agent response"
) -> Invocation:
  """Builds an Invocation pairing a user query with a model final response."""
  user_content = genai_types.Content(
      role="user", parts=[genai_types.Part(text=query)]
  )
  model_content = genai_types.Content(
      role="model", parts=[genai_types.Part(text=response)]
  )
  return Invocation(user_content=user_content, final_response=model_content)


def _make_eval_metric_result(
    score: float = 0.9, status: EvalStatus = EvalStatus.PASSED
) -> EvalMetricResult:
  """Builds an EvalMetricResult for "test_metric" with a fixed 0.8 threshold."""
  result_fields = {
      "metric_name": "test_metric",
      "threshold": 0.8,
      "score": score,
      "eval_status": status,
  }
  return EvalMetricResult(**result_fields)


def _call_print_details(
    items: list[_EvalMetricResultWithInvocation],
) -> MagicMock:
  """Invokes AgentEvaluator._print_details with pandas/tabulate stubbed out.

  Returns the mock standing in for the DataFrame class so callers can
  inspect the row data that _print_details assembled.

  NOTE(review): the return path `fake_pandas.pandas.DataFrame` presumably
  mirrors a `from pandas import pandas` style import inside
  agent_evaluator — confirm against that module if its imports change.
  """
  fake_pandas = MagicMock()
  fake_tabulate = MagicMock()
  fake_tabulate.tabulate = MagicMock(return_value="table")

  stubbed_modules = {"pandas": fake_pandas, "tabulate": fake_tabulate}
  with patch.dict(sys.modules, stubbed_modules):
    AgentEvaluator._print_details(
        eval_metric_result_with_invocations=items,
        overall_eval_status=EvalStatus.PASSED,
        overall_score=0.9,
        metric_name="test_metric",
        threshold=0.8,
    )

  return fake_pandas.pandas.DataFrame


class TestPrintDetailsWithNoExpectedInvocation:
  """Tests for _print_details when expected_invocation is None."""

  def test_does_not_raise(self):
    """A single result lacking an expected invocation prints without error."""
    item = _EvalMetricResultWithInvocation(
        actual_invocation=_make_actual_invocation(),
        expected_invocation=None,
        eval_metric_result=_make_eval_metric_result(),
    )
    _call_print_details([item])  # should not raise

  def test_multiple_invocations_all_without_expected(self):
    """Expected-derived columns fall back to "" for every row."""
    items = []
    for i in range(3):
      items.append(
          _EvalMetricResultWithInvocation(
              actual_invocation=_make_actual_invocation(
                  response=f"response {i}"
              ),
              expected_invocation=None,
              eval_metric_result=_make_eval_metric_result(),
          )
      )
    mock_df_cls = _call_print_details(items)
    rows = mock_df_cls.call_args[0][0]
    assert len(rows) == 3
    assert all(row["prompt"] == "" for row in rows)
    assert all(row["expected_response"] == "" for row in rows)
    assert all(row["expected_tool_calls"] == "" for row in rows)
Loading