From 9e55132fa1fd1c5d9e9406e82f589d3211e3f737 Mon Sep 17 00:00:00 2001 From: Morgan Roux Date: Sun, 1 Mar 2026 09:41:45 +0100 Subject: [PATCH 1/5] fix: handle expected_invocation gracefully --- src/google/adk/evaluation/agent_evaluator.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/google/adk/evaluation/agent_evaluator.py b/src/google/adk/evaluation/agent_evaluator.py index c0fc736340..665134c0c0 100644 --- a/src/google/adk/evaluation/agent_evaluator.py +++ b/src/google/adk/evaluation/agent_evaluator.py @@ -87,10 +87,18 @@ class _EvalMetricResultWithInvocation(BaseModel): This is class is intentionally marked as private and is created for convenience. + + actual_invocations: These are the invocations that are obtained from the + agent under test. + expected_invocations: An optional list of invocations, if specified, + usually act as a benchmark/golden response. If these are specified + usually the expectation is that the length of this list and actual + invocation is the same. + eval_metric_result: This is the EvalMetricResult for the given actual and expected invocation. 
""" actual_invocation: Invocation - expected_invocation: Invocation + expected_invocation: Optional[Invocation] = None eval_metric_result: EvalMetricResult @@ -438,19 +446,19 @@ def _print_details( "threshold": threshold, "prompt": AgentEvaluator._convert_content_to_text( per_invocation_result.expected_invocation.user_content - ), + ) if per_invocation_result.expected_invocation else None, "expected_response": AgentEvaluator._convert_content_to_text( per_invocation_result.expected_invocation.final_response - ), + ) if per_invocation_result.expected_invocation else None, "actual_response": AgentEvaluator._convert_content_to_text( per_invocation_result.actual_invocation.final_response ), "expected_tool_calls": AgentEvaluator._convert_tool_calls_to_text( per_invocation_result.expected_invocation.intermediate_data - ), + ) if per_invocation_result.expected_invocation else None, "actual_tool_calls": AgentEvaluator._convert_tool_calls_to_text( per_invocation_result.actual_invocation.intermediate_data - ), + ) }) print( From 6a1e88b5caeb92c4d655a52b95d20ca123873a7b Mon Sep 17 00:00:00 2001 From: Morgan Roux Date: Sun, 1 Mar 2026 11:49:25 +0100 Subject: [PATCH 2/5] chore: add integration test --- .../fixture/conversation_scenario/agent.py | 25 ++++++++++++++++++ .../conversation_scenario.test.json | 11 ++++++++ .../conversation_scenario/test_config.json | 4 +++ .../integration/test_conversation_scenario.py | 26 +++++++++++++++++++ 4 files changed, 66 insertions(+) create mode 100644 tests/integration/fixture/conversation_scenario/agent.py create mode 100644 tests/integration/fixture/conversation_scenario/conversation_scenario.test.json create mode 100644 tests/integration/fixture/conversation_scenario/test_config.json create mode 100644 tests/integration/test_conversation_scenario.py diff --git a/tests/integration/fixture/conversation_scenario/agent.py b/tests/integration/fixture/conversation_scenario/agent.py new file mode 100644 index 0000000000..344b506813 --- /dev/null 
+++ b/tests/integration/fixture/conversation_scenario/agent.py @@ -0,0 +1,25 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from google.adk import Agent + +root_agent = Agent( + model="gemini-2.5-flash", + name="Conversation_scenario_agent", + instruction=""" + You are helpful conversation agent. Kindly answer the questions. + """, + tools=[], +) diff --git a/tests/integration/fixture/conversation_scenario/conversation_scenario.test.json b/tests/integration/fixture/conversation_scenario/conversation_scenario.test.json new file mode 100644 index 0000000000..c22f0c90d0 --- /dev/null +++ b/tests/integration/fixture/conversation_scenario/conversation_scenario.test.json @@ -0,0 +1,11 @@ +{ + "eval_set_id": "test", + "eval_cases": [{ + "eval_id": "CS-1", + "conversation_scenario": { + "starting_prompt": "Hello", + "conversation_plan": "Ask the agent to do something" + }, + "session_input": null + }] +} diff --git a/tests/integration/fixture/conversation_scenario/test_config.json b/tests/integration/fixture/conversation_scenario/test_config.json new file mode 100644 index 0000000000..74c2870fe4 --- /dev/null +++ b/tests/integration/fixture/conversation_scenario/test_config.json @@ -0,0 +1,4 @@ +{ + "criteria": { "hallucinations_v1": { "threshold": 0.8 } }, + "user_simulator_config": { "model": "gemini-2.5-flash", "max_allowed_invocations": 10 } +} diff --git a/tests/integration/test_conversation_scenario.py 
b/tests/integration/test_conversation_scenario.py new file mode 100644 index 0000000000..106d2ab5ac --- /dev/null +++ b/tests/integration/test_conversation_scenario.py @@ -0,0 +1,26 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from google.adk.evaluation.agent_evaluator import AgentEvaluator +import pytest + + +@pytest.mark.asyncio +async def test_eval_agent_with_conversation_scenario(): + await AgentEvaluator.evaluate( + agent_module="tests.integration.fixture.conversation_scenario.agent", + eval_dataset_file_path_or_dir="tests/integration/fixture/conversation_scenario/conversation_scenario.test.json", + num_runs=2, + ) + From cadf758986a175632d7dcfec9ea958accb432bab Mon Sep 17 00:00:00 2001 From: Morgan Roux Date: Sun, 1 Mar 2026 16:24:10 +0100 Subject: [PATCH 3/5] chore: update code to test num_runs=2 --- src/google/adk/evaluation/agent_evaluator.py | 6 +++--- tests/integration/fixture/conversation_scenario/__init__.py | 1 + .../fixture/conversation_scenario/test_config.json | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) create mode 100644 tests/integration/fixture/conversation_scenario/__init__.py diff --git a/src/google/adk/evaluation/agent_evaluator.py b/src/google/adk/evaluation/agent_evaluator.py index 665134c0c0..4c9da7bd26 100644 --- a/src/google/adk/evaluation/agent_evaluator.py +++ b/src/google/adk/evaluation/agent_evaluator.py @@ -446,16 +446,16 @@ def _print_details( "threshold": threshold, "prompt": 
AgentEvaluator._convert_content_to_text( per_invocation_result.expected_invocation.user_content - ) if per_invocation_result.expected_invocation else None, + ) if per_invocation_result.expected_invocation else "", "expected_response": AgentEvaluator._convert_content_to_text( per_invocation_result.expected_invocation.final_response - ) if per_invocation_result.expected_invocation else None, + ) if per_invocation_result.expected_invocation else "", "actual_response": AgentEvaluator._convert_content_to_text( per_invocation_result.actual_invocation.final_response ), "expected_tool_calls": AgentEvaluator._convert_tool_calls_to_text( per_invocation_result.expected_invocation.intermediate_data - ) if per_invocation_result.expected_invocation else None, + ) if per_invocation_result.expected_invocation else "", "actual_tool_calls": AgentEvaluator._convert_tool_calls_to_text( per_invocation_result.actual_invocation.intermediate_data ) diff --git a/tests/integration/fixture/conversation_scenario/__init__.py b/tests/integration/fixture/conversation_scenario/__init__.py new file mode 100644 index 0000000000..02c597e11e --- /dev/null +++ b/tests/integration/fixture/conversation_scenario/__init__.py @@ -0,0 +1 @@ +from . 
import agent diff --git a/tests/integration/fixture/conversation_scenario/test_config.json b/tests/integration/fixture/conversation_scenario/test_config.json index 74c2870fe4..f0c1b8510a 100644 --- a/tests/integration/fixture/conversation_scenario/test_config.json +++ b/tests/integration/fixture/conversation_scenario/test_config.json @@ -1,4 +1,4 @@ { - "criteria": { "hallucinations_v1": { "threshold": 0.8 } }, - "user_simulator_config": { "model": "gemini-2.5-flash", "max_allowed_invocations": 10 } + "criteria": { "hallucinations_v1": { "threshold": 0.6 } }, + "user_simulator_config": { "model": "gemini-2.5-flash", "max_allowed_invocations": 5 } } From 1ec04ae025716a33d6702717e05e0a56e4dc317b Mon Sep 17 00:00:00 2001 From: Morgan Roux Date: Sun, 1 Mar 2026 16:41:44 +0100 Subject: [PATCH 4/5] test: add unittest --- .../evaluation/test_agent_evaluator.py | 152 ++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 tests/unittests/evaluation/test_agent_evaluator.py diff --git a/tests/unittests/evaluation/test_agent_evaluator.py b/tests/unittests/evaluation/test_agent_evaluator.py new file mode 100644 index 0000000000..22a64172e6 --- /dev/null +++ b/tests/unittests/evaluation/test_agent_evaluator.py @@ -0,0 +1,152 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import sys +from unittest.mock import MagicMock +from unittest.mock import patch + +from google.adk.evaluation.agent_evaluator import _EvalMetricResultWithInvocation +from google.adk.evaluation.agent_evaluator import AgentEvaluator +from google.adk.evaluation.eval_case import Invocation +from google.adk.evaluation.eval_metrics import EvalMetricResult +from google.adk.evaluation.eval_metrics import EvalStatus +from google.genai import types as genai_types + + +def _make_actual_invocation( + query: str = "user query", response: str = "agent response" +) -> Invocation: + return Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text=query)], role="user" + ), + final_response=genai_types.Content( + parts=[genai_types.Part(text=response)], role="model" + ), + ) + + +def _make_eval_metric_result( + score: float = 0.9, status: EvalStatus = EvalStatus.PASSED +) -> EvalMetricResult: + return EvalMetricResult( + metric_name="test_metric", + threshold=0.8, + score=score, + eval_status=status, + ) + + +def _call_print_details( + items: list[_EvalMetricResultWithInvocation], +) -> MagicMock: + """Calls _print_details with mocked pandas/tabulate, returns the mock DataFrame class.""" + mock_pandas = MagicMock() + mock_tabulate_module = MagicMock() + mock_tabulate_module.tabulate = MagicMock(return_value="table") + + with patch.dict( + sys.modules, + {"pandas": mock_pandas, "tabulate": mock_tabulate_module}, + ): + AgentEvaluator._print_details( + eval_metric_result_with_invocations=items, + overall_eval_status=EvalStatus.PASSED, + overall_score=0.9, + metric_name="test_metric", + threshold=0.8, + ) + + return mock_pandas.pandas.DataFrame + + +class TestPrintDetailsWithNoExpectedInvocation: + """Tests for _print_details when expected_invocation is None.""" + + def test_does_not_raise(self): + items = [ + _EvalMetricResultWithInvocation( + actual_invocation=_make_actual_invocation(), + expected_invocation=None, + 
eval_metric_result=_make_eval_metric_result(), + ) + ] + _call_print_details(items) # should not raise + + def test_prompt_is_empty_string(self): + items = [ + _EvalMetricResultWithInvocation( + actual_invocation=_make_actual_invocation(), + expected_invocation=None, + eval_metric_result=_make_eval_metric_result(), + ) + ] + mock_df_cls = _call_print_details(items) + data = mock_df_cls.call_args[0][0] + assert data[0]["prompt"] == "" + + def test_expected_response_is_empty_string(self): + items = [ + _EvalMetricResultWithInvocation( + actual_invocation=_make_actual_invocation(), + expected_invocation=None, + eval_metric_result=_make_eval_metric_result(), + ) + ] + mock_df_cls = _call_print_details(items) + data = mock_df_cls.call_args[0][0] + assert data[0]["expected_response"] == "" + + def test_expected_tool_calls_is_empty_string(self): + items = [ + _EvalMetricResultWithInvocation( + actual_invocation=_make_actual_invocation(), + expected_invocation=None, + eval_metric_result=_make_eval_metric_result(), + ) + ] + mock_df_cls = _call_print_details(items) + data = mock_df_cls.call_args[0][0] + assert data[0]["expected_tool_calls"] == "" + + def test_actual_response_is_populated(self): + items = [ + _EvalMetricResultWithInvocation( + actual_invocation=_make_actual_invocation(response="hello world"), + expected_invocation=None, + eval_metric_result=_make_eval_metric_result(), + ) + ] + mock_df_cls = _call_print_details(items) + data = mock_df_cls.call_args[0][0] + assert data[0]["actual_response"] == "hello world" + + def test_multiple_invocations_all_without_expected(self): + items = [ + _EvalMetricResultWithInvocation( + actual_invocation=_make_actual_invocation(response=f"response {i}"), + expected_invocation=None, + eval_metric_result=_make_eval_metric_result(), + ) + for i in range(3) + ] + mock_df_cls = _call_print_details(items) + data = mock_df_cls.call_args[0][0] + assert len(data) == 3 + for row in data: + assert row["prompt"] == "" + assert 
row["expected_response"] == "" + assert row["expected_tool_calls"] == "" From 57e968b4832320309f0efdfc9e04d17f21d9051a Mon Sep 17 00:00:00 2001 From: Morgan Roux Date: Sun, 1 Mar 2026 16:43:41 +0100 Subject: [PATCH 5/5] chore: reformat --- contributing/samples/gepa/experiment.py | 1 - contributing/samples/gepa/run_experiment.py | 1 - src/google/adk/evaluation/agent_evaluator.py | 32 +++++++++---- .../integration/test_conversation_scenario.py | 1 - .../evaluation/test_agent_evaluator.py | 48 ------------------- 5 files changed, 22 insertions(+), 61 deletions(-) diff --git a/contributing/samples/gepa/experiment.py b/contributing/samples/gepa/experiment.py index f3751206a8..2710c3894c 100644 --- a/contributing/samples/gepa/experiment.py +++ b/contributing/samples/gepa/experiment.py @@ -43,7 +43,6 @@ from tau_bench.types import EnvRunResult from tau_bench.types import RunConfig import tau_bench_agent as tau_bench_agent_lib - import utils diff --git a/contributing/samples/gepa/run_experiment.py b/contributing/samples/gepa/run_experiment.py index d857da9635..e31db15788 100644 --- a/contributing/samples/gepa/run_experiment.py +++ b/contributing/samples/gepa/run_experiment.py @@ -25,7 +25,6 @@ from absl import flags import experiment from google.genai import types - import utils _OUTPUT_DIR = flags.DEFINE_string( diff --git a/src/google/adk/evaluation/agent_evaluator.py b/src/google/adk/evaluation/agent_evaluator.py index 4c9da7bd26..d25a1f2735 100644 --- a/src/google/adk/evaluation/agent_evaluator.py +++ b/src/google/adk/evaluation/agent_evaluator.py @@ -444,21 +444,33 @@ def _print_details( "eval_status": per_invocation_result.eval_metric_result.eval_status, "score": per_invocation_result.eval_metric_result.score, "threshold": threshold, - "prompt": AgentEvaluator._convert_content_to_text( - per_invocation_result.expected_invocation.user_content - ) if per_invocation_result.expected_invocation else "", - "expected_response": AgentEvaluator._convert_content_to_text( - 
per_invocation_result.expected_invocation.final_response - ) if per_invocation_result.expected_invocation else "", + "prompt": ( + AgentEvaluator._convert_content_to_text( + per_invocation_result.expected_invocation.user_content + ) + if per_invocation_result.expected_invocation + else "" + ), + "expected_response": ( + AgentEvaluator._convert_content_to_text( + per_invocation_result.expected_invocation.final_response + ) + if per_invocation_result.expected_invocation + else "" + ), "actual_response": AgentEvaluator._convert_content_to_text( per_invocation_result.actual_invocation.final_response ), - "expected_tool_calls": AgentEvaluator._convert_tool_calls_to_text( - per_invocation_result.expected_invocation.intermediate_data - ) if per_invocation_result.expected_invocation else "", + "expected_tool_calls": ( + AgentEvaluator._convert_tool_calls_to_text( + per_invocation_result.expected_invocation.intermediate_data + ) + if per_invocation_result.expected_invocation + else "" + ), "actual_tool_calls": AgentEvaluator._convert_tool_calls_to_text( per_invocation_result.actual_invocation.intermediate_data - ) + ), }) print( diff --git a/tests/integration/test_conversation_scenario.py b/tests/integration/test_conversation_scenario.py index 106d2ab5ac..b7c39dc090 100644 --- a/tests/integration/test_conversation_scenario.py +++ b/tests/integration/test_conversation_scenario.py @@ -23,4 +23,3 @@ async def test_eval_agent_with_conversation_scenario(): eval_dataset_file_path_or_dir="tests/integration/fixture/conversation_scenario/conversation_scenario.test.json", num_runs=2, ) - diff --git a/tests/unittests/evaluation/test_agent_evaluator.py b/tests/unittests/evaluation/test_agent_evaluator.py index 22a64172e6..cf0fb4e880 100644 --- a/tests/unittests/evaluation/test_agent_evaluator.py +++ b/tests/unittests/evaluation/test_agent_evaluator.py @@ -86,54 +86,6 @@ def test_does_not_raise(self): ] _call_print_details(items) # should not raise - def 
test_prompt_is_empty_string(self): - items = [ - _EvalMetricResultWithInvocation( - actual_invocation=_make_actual_invocation(), - expected_invocation=None, - eval_metric_result=_make_eval_metric_result(), - ) - ] - mock_df_cls = _call_print_details(items) - data = mock_df_cls.call_args[0][0] - assert data[0]["prompt"] == "" - - def test_expected_response_is_empty_string(self): - items = [ - _EvalMetricResultWithInvocation( - actual_invocation=_make_actual_invocation(), - expected_invocation=None, - eval_metric_result=_make_eval_metric_result(), - ) - ] - mock_df_cls = _call_print_details(items) - data = mock_df_cls.call_args[0][0] - assert data[0]["expected_response"] == "" - - def test_expected_tool_calls_is_empty_string(self): - items = [ - _EvalMetricResultWithInvocation( - actual_invocation=_make_actual_invocation(), - expected_invocation=None, - eval_metric_result=_make_eval_metric_result(), - ) - ] - mock_df_cls = _call_print_details(items) - data = mock_df_cls.call_args[0][0] - assert data[0]["expected_tool_calls"] == "" - - def test_actual_response_is_populated(self): - items = [ - _EvalMetricResultWithInvocation( - actual_invocation=_make_actual_invocation(response="hello world"), - expected_invocation=None, - eval_metric_result=_make_eval_metric_result(), - ) - ] - mock_df_cls = _call_print_details(items) - data = mock_df_cls.call_args[0][0] - assert data[0]["actual_response"] == "hello world" - def test_multiple_invocations_all_without_expected(self): items = [ _EvalMetricResultWithInvocation(