diff --git a/contributing/samples/gepa/experiment.py b/contributing/samples/gepa/experiment.py index f3751206a8..2710c3894c 100644 --- a/contributing/samples/gepa/experiment.py +++ b/contributing/samples/gepa/experiment.py @@ -43,7 +43,6 @@ from tau_bench.types import EnvRunResult from tau_bench.types import RunConfig import tau_bench_agent as tau_bench_agent_lib - import utils diff --git a/contributing/samples/gepa/run_experiment.py b/contributing/samples/gepa/run_experiment.py index d857da9635..e31db15788 100644 --- a/contributing/samples/gepa/run_experiment.py +++ b/contributing/samples/gepa/run_experiment.py @@ -25,7 +25,6 @@ from absl import flags import experiment from google.genai import types - import utils _OUTPUT_DIR = flags.DEFINE_string( diff --git a/src/google/adk/evaluation/agent_evaluator.py b/src/google/adk/evaluation/agent_evaluator.py index c0fc736340..d25a1f2735 100644 --- a/src/google/adk/evaluation/agent_evaluator.py +++ b/src/google/adk/evaluation/agent_evaluator.py @@ -87,10 +87,18 @@ class _EvalMetricResultWithInvocation(BaseModel): This is class is intentionally marked as private and is created for convenience. + + actual_invocation: This is the invocation that is obtained from the + agent under test. + expected_invocation: An optional invocation that, if specified, + usually acts as a benchmark/golden response. If it is specified, + usually the expectation is that it corresponds to the actual + invocation. + eval_metric_result: This is the EvalMetricResult for the given actual and + expected invocation.
""" actual_invocation: Invocation - expected_invocation: Invocation + expected_invocation: Optional[Invocation] = None eval_metric_result: EvalMetricResult @@ -436,17 +444,29 @@ def _print_details( "eval_status": per_invocation_result.eval_metric_result.eval_status, "score": per_invocation_result.eval_metric_result.score, "threshold": threshold, - "prompt": AgentEvaluator._convert_content_to_text( - per_invocation_result.expected_invocation.user_content + "prompt": ( + AgentEvaluator._convert_content_to_text( + per_invocation_result.expected_invocation.user_content + ) + if per_invocation_result.expected_invocation + else "" ), - "expected_response": AgentEvaluator._convert_content_to_text( - per_invocation_result.expected_invocation.final_response + "expected_response": ( + AgentEvaluator._convert_content_to_text( + per_invocation_result.expected_invocation.final_response + ) + if per_invocation_result.expected_invocation + else "" ), "actual_response": AgentEvaluator._convert_content_to_text( per_invocation_result.actual_invocation.final_response ), - "expected_tool_calls": AgentEvaluator._convert_tool_calls_to_text( - per_invocation_result.expected_invocation.intermediate_data + "expected_tool_calls": ( + AgentEvaluator._convert_tool_calls_to_text( + per_invocation_result.expected_invocation.intermediate_data + ) + if per_invocation_result.expected_invocation + else "" ), "actual_tool_calls": AgentEvaluator._convert_tool_calls_to_text( per_invocation_result.actual_invocation.intermediate_data diff --git a/tests/integration/fixture/conversation_scenario/__init__.py b/tests/integration/fixture/conversation_scenario/__init__.py new file mode 100644 index 0000000000..02c597e11e --- /dev/null +++ b/tests/integration/fixture/conversation_scenario/__init__.py @@ -0,0 +1 @@ +from . 
import agent diff --git a/tests/integration/fixture/conversation_scenario/agent.py b/tests/integration/fixture/conversation_scenario/agent.py new file mode 100644 index 0000000000..344b506813 --- /dev/null +++ b/tests/integration/fixture/conversation_scenario/agent.py @@ -0,0 +1,25 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from google.adk import Agent + +root_agent = Agent( + model="gemini-2.5-flash", + name="Conversation_scenario_agent", + instruction=""" + You are helpful conversation agent. Kindly answer the questions. 
+ """, + tools=[], +) diff --git a/tests/integration/fixture/conversation_scenario/conversation_scenario.test.json b/tests/integration/fixture/conversation_scenario/conversation_scenario.test.json new file mode 100644 index 0000000000..c22f0c90d0 --- /dev/null +++ b/tests/integration/fixture/conversation_scenario/conversation_scenario.test.json @@ -0,0 +1,11 @@ +{ + "eval_set_id": "test", + "eval_cases": [{ + "eval_id": "CS-1", + "conversation_scenario": { + "starting_prompt": "Hello", + "conversation_plan": "Ask the agent to do something" + }, + "session_input": null + }] +} diff --git a/tests/integration/fixture/conversation_scenario/test_config.json b/tests/integration/fixture/conversation_scenario/test_config.json new file mode 100644 index 0000000000..f0c1b8510a --- /dev/null +++ b/tests/integration/fixture/conversation_scenario/test_config.json @@ -0,0 +1,4 @@ +{ + "criteria": { "hallucinations_v1": { "threshold": 0.6 } }, + "user_simulator_config": { "model": "gemini-2.5-flash", "max_allowed_invocations": 5 } +} diff --git a/tests/integration/test_conversation_scenario.py b/tests/integration/test_conversation_scenario.py new file mode 100644 index 0000000000..b7c39dc090 --- /dev/null +++ b/tests/integration/test_conversation_scenario.py @@ -0,0 +1,25 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from google.adk.evaluation.agent_evaluator import AgentEvaluator +import pytest + + +@pytest.mark.asyncio +async def test_eval_agent_with_conversation_scenario(): + await AgentEvaluator.evaluate( + agent_module="tests.integration.fixture.conversation_scenario.agent", + eval_dataset_file_path_or_dir="tests/integration/fixture/conversation_scenario/conversation_scenario.test.json", + num_runs=2, + ) diff --git a/tests/unittests/evaluation/test_agent_evaluator.py b/tests/unittests/evaluation/test_agent_evaluator.py new file mode 100644 index 0000000000..cf0fb4e880 --- /dev/null +++ b/tests/unittests/evaluation/test_agent_evaluator.py @@ -0,0 +1,104 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import sys +from unittest.mock import MagicMock +from unittest.mock import patch + +from google.adk.evaluation.agent_evaluator import _EvalMetricResultWithInvocation +from google.adk.evaluation.agent_evaluator import AgentEvaluator +from google.adk.evaluation.eval_case import Invocation +from google.adk.evaluation.eval_metrics import EvalMetricResult +from google.adk.evaluation.eval_metrics import EvalStatus +from google.genai import types as genai_types + + +def _make_actual_invocation( + query: str = "user query", response: str = "agent response" +) -> Invocation: + return Invocation( + user_content=genai_types.Content( + parts=[genai_types.Part(text=query)], role="user" + ), + final_response=genai_types.Content( + parts=[genai_types.Part(text=response)], role="model" + ), + ) + + +def _make_eval_metric_result( + score: float = 0.9, status: EvalStatus = EvalStatus.PASSED +) -> EvalMetricResult: + return EvalMetricResult( + metric_name="test_metric", + threshold=0.8, + score=score, + eval_status=status, + ) + + +def _call_print_details( + items: list[_EvalMetricResultWithInvocation], +) -> MagicMock: + """Calls _print_details with mocked pandas/tabulate, returns the mock DataFrame class.""" + mock_pandas = MagicMock() + mock_tabulate_module = MagicMock() + mock_tabulate_module.tabulate = MagicMock(return_value="table") + + with patch.dict( + sys.modules, + {"pandas": mock_pandas, "tabulate": mock_tabulate_module}, + ): + AgentEvaluator._print_details( + eval_metric_result_with_invocations=items, + overall_eval_status=EvalStatus.PASSED, + overall_score=0.9, + metric_name="test_metric", + threshold=0.8, + ) + + return mock_pandas.pandas.DataFrame + + +class TestPrintDetailsWithNoExpectedInvocation: + """Tests for _print_details when expected_invocation is None.""" + + def test_does_not_raise(self): + items = [ + _EvalMetricResultWithInvocation( + actual_invocation=_make_actual_invocation(), + expected_invocation=None, + 
eval_metric_result=_make_eval_metric_result(), + ) + ] + _call_print_details(items) # should not raise + + def test_multiple_invocations_all_without_expected(self): + items = [ + _EvalMetricResultWithInvocation( + actual_invocation=_make_actual_invocation(response=f"response {i}"), + expected_invocation=None, + eval_metric_result=_make_eval_metric_result(), + ) + for i in range(3) + ] + mock_df_cls = _call_print_details(items) + data = mock_df_cls.call_args[0][0] + assert len(data) == 3 + for row in data: + assert row["prompt"] == "" + assert row["expected_response"] == "" + assert row["expected_tool_calls"] == ""