From 9e55132fa1fd1c5d9e9406e82f589d3211e3f737 Mon Sep 17 00:00:00 2001
From: Morgan Roux <roux.morgan@gmail.com>
Date: Sun, 1 Mar 2026 09:41:45 +0100
Subject: [PATCH 1/5] fix: handle expected_invocation gracefully

---
 src/google/adk/evaluation/agent_evaluator.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/google/adk/evaluation/agent_evaluator.py b/src/google/adk/evaluation/agent_evaluator.py
index c0fc736340..665134c0c0 100644
--- a/src/google/adk/evaluation/agent_evaluator.py
+++ b/src/google/adk/evaluation/agent_evaluator.py
@@ -87,10 +87,18 @@ class _EvalMetricResultWithInvocation(BaseModel):
 
   This is class is intentionally marked as private and is created for
   convenience.
+
+  actual_invocation: The invocation that is obtained from the agent under
+    test.
+  expected_invocation: An optional invocation that, if specified, usually
+    acts as a benchmark/golden response for the actual invocation. It is
+    None when no expected invocation is available.
+  eval_metric_result: The EvalMetricResult for the given actual and
+    expected invocation.
   """
 
   actual_invocation: Invocation
-  expected_invocation: Invocation
+  expected_invocation: Optional[Invocation] = None
   eval_metric_result: EvalMetricResult
 
 
@@ -438,19 +446,19 @@ def _print_details(
           "threshold": threshold,
           "prompt": AgentEvaluator._convert_content_to_text(
               per_invocation_result.expected_invocation.user_content
-          ),
+          ) if per_invocation_result.expected_invocation else None,
           "expected_response": AgentEvaluator._convert_content_to_text(
               per_invocation_result.expected_invocation.final_response
-          ),
+          ) if per_invocation_result.expected_invocation else None,
           "actual_response": AgentEvaluator._convert_content_to_text(
               per_invocation_result.actual_invocation.final_response
           ),
           "expected_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
               per_invocation_result.expected_invocation.intermediate_data
-          ),
+          ) if per_invocation_result.expected_invocation else None,
           "actual_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
               per_invocation_result.actual_invocation.intermediate_data
-          ),
+          )
       })
 
     print(

From 6a1e88b5caeb92c4d655a52b95d20ca123873a7b Mon Sep 17 00:00:00 2001
From: Morgan Roux <roux.morgan@gmail.com>
Date: Sun, 1 Mar 2026 11:49:25 +0100
Subject: [PATCH 2/5] chore: add integration test

---
 .../fixture/conversation_scenario/agent.py    | 25 ++++++++++++++++++
 .../conversation_scenario.test.json           | 11 ++++++++
 .../conversation_scenario/test_config.json    |  4 +++
 .../integration/test_conversation_scenario.py | 26 +++++++++++++++++++
 4 files changed, 66 insertions(+)
 create mode 100644 tests/integration/fixture/conversation_scenario/agent.py
 create mode 100644 tests/integration/fixture/conversation_scenario/conversation_scenario.test.json
 create mode 100644 tests/integration/fixture/conversation_scenario/test_config.json
 create mode 100644 tests/integration/test_conversation_scenario.py

diff --git a/tests/integration/fixture/conversation_scenario/agent.py b/tests/integration/fixture/conversation_scenario/agent.py
new file mode 100644
index 0000000000..344b506813
--- /dev/null
+++ b/tests/integration/fixture/conversation_scenario/agent.py
@@ -0,0 +1,25 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from google.adk import Agent
+
+root_agent = Agent(
+    model="gemini-2.5-flash",
+    name="Conversation_scenario_agent",
+    instruction="""
+    You are helpful conversation agent. Kindly answer the questions.
+    """,
+    tools=[],
+)
diff --git a/tests/integration/fixture/conversation_scenario/conversation_scenario.test.json b/tests/integration/fixture/conversation_scenario/conversation_scenario.test.json
new file mode 100644
index 0000000000..c22f0c90d0
--- /dev/null
+++ b/tests/integration/fixture/conversation_scenario/conversation_scenario.test.json
@@ -0,0 +1,11 @@
+{
+  "eval_set_id": "test",
+  "eval_cases": [{
+    "eval_id": "CS-1",
+    "conversation_scenario": {
+      "starting_prompt": "Hello",
+      "conversation_plan": "Ask the agent to do something"
+    },
+    "session_input": null
+  }]
+}
diff --git a/tests/integration/fixture/conversation_scenario/test_config.json b/tests/integration/fixture/conversation_scenario/test_config.json
new file mode 100644
index 0000000000..74c2870fe4
--- /dev/null
+++ b/tests/integration/fixture/conversation_scenario/test_config.json
@@ -0,0 +1,4 @@
+{
+  "criteria": { "hallucinations_v1": { "threshold": 0.8 } },
+  "user_simulator_config": { "model": "gemini-2.5-flash", "max_allowed_invocations": 10 }
+}
diff --git a/tests/integration/test_conversation_scenario.py b/tests/integration/test_conversation_scenario.py
new file mode 100644
index 0000000000..106d2ab5ac
--- /dev/null
+++ b/tests/integration/test_conversation_scenario.py
@@ -0,0 +1,26 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from google.adk.evaluation.agent_evaluator import AgentEvaluator
+import pytest
+
+
+@pytest.mark.asyncio
+async def test_eval_agent_with_conversation_scenario():
+  await AgentEvaluator.evaluate(
+      agent_module="tests.integration.fixture.conversation_scenario.agent",
+      eval_dataset_file_path_or_dir="tests/integration/fixture/conversation_scenario/conversation_scenario.test.json",
+      num_runs=2,
+  )
+

From cadf758986a175632d7dcfec9ea958accb432bab Mon Sep 17 00:00:00 2001
From: Morgan Roux <roux.morgan@gmail.com>
Date: Sun, 1 Mar 2026 16:24:10 +0100
Subject: [PATCH 3/5] chore: update code to test num_runs=2

---
 src/google/adk/evaluation/agent_evaluator.py                | 6 +++---
 tests/integration/fixture/conversation_scenario/__init__.py | 1 +
 .../fixture/conversation_scenario/test_config.json          | 4 ++--
 3 files changed, 6 insertions(+), 5 deletions(-)
 create mode 100644 tests/integration/fixture/conversation_scenario/__init__.py

diff --git a/src/google/adk/evaluation/agent_evaluator.py b/src/google/adk/evaluation/agent_evaluator.py
index 665134c0c0..4c9da7bd26 100644
--- a/src/google/adk/evaluation/agent_evaluator.py
+++ b/src/google/adk/evaluation/agent_evaluator.py
@@ -446,16 +446,16 @@ def _print_details(
           "threshold": threshold,
           "prompt": AgentEvaluator._convert_content_to_text(
               per_invocation_result.expected_invocation.user_content
-          ) if per_invocation_result.expected_invocation else None,
+          ) if per_invocation_result.expected_invocation else "",
           "expected_response": AgentEvaluator._convert_content_to_text(
               per_invocation_result.expected_invocation.final_response
-          ) if per_invocation_result.expected_invocation else None,
+          ) if per_invocation_result.expected_invocation else "",
           "actual_response": AgentEvaluator._convert_content_to_text(
               per_invocation_result.actual_invocation.final_response
           ),
           "expected_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
               per_invocation_result.expected_invocation.intermediate_data
-          ) if per_invocation_result.expected_invocation else None,
+          ) if per_invocation_result.expected_invocation else "",
           "actual_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
               per_invocation_result.actual_invocation.intermediate_data
           )
diff --git a/tests/integration/fixture/conversation_scenario/__init__.py b/tests/integration/fixture/conversation_scenario/__init__.py
new file mode 100644
index 0000000000..02c597e11e
--- /dev/null
+++ b/tests/integration/fixture/conversation_scenario/__init__.py
@@ -0,0 +1 @@
+from . import agent
diff --git a/tests/integration/fixture/conversation_scenario/test_config.json b/tests/integration/fixture/conversation_scenario/test_config.json
index 74c2870fe4..f0c1b8510a 100644
--- a/tests/integration/fixture/conversation_scenario/test_config.json
+++ b/tests/integration/fixture/conversation_scenario/test_config.json
@@ -1,4 +1,4 @@
 {
-  "criteria": { "hallucinations_v1": { "threshold": 0.8 } },
-  "user_simulator_config": { "model": "gemini-2.5-flash", "max_allowed_invocations": 10 }
+  "criteria": { "hallucinations_v1": { "threshold": 0.6 } },
+  "user_simulator_config": { "model": "gemini-2.5-flash", "max_allowed_invocations": 5 }
 }

From 1ec04ae025716a33d6702717e05e0a56e4dc317b Mon Sep 17 00:00:00 2001
From: Morgan Roux <roux.morgan@gmail.com>
Date: Sun, 1 Mar 2026 16:41:44 +0100
Subject: [PATCH 4/5] test: add unittest

---
 .../evaluation/test_agent_evaluator.py        | 152 ++++++++++++++++++
 1 file changed, 152 insertions(+)
 create mode 100644 tests/unittests/evaluation/test_agent_evaluator.py

diff --git a/tests/unittests/evaluation/test_agent_evaluator.py b/tests/unittests/evaluation/test_agent_evaluator.py
new file mode 100644
index 0000000000..22a64172e6
--- /dev/null
+++ b/tests/unittests/evaluation/test_agent_evaluator.py
@@ -0,0 +1,152 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import sys
+from unittest.mock import MagicMock
+from unittest.mock import patch
+
+from google.adk.evaluation.agent_evaluator import _EvalMetricResultWithInvocation
+from google.adk.evaluation.agent_evaluator import AgentEvaluator
+from google.adk.evaluation.eval_case import Invocation
+from google.adk.evaluation.eval_metrics import EvalMetricResult
+from google.adk.evaluation.eval_metrics import EvalStatus
+from google.genai import types as genai_types
+
+
+def _make_actual_invocation(
+    query: str = "user query", response: str = "agent response"
+) -> Invocation:
+  return Invocation(
+      user_content=genai_types.Content(
+          parts=[genai_types.Part(text=query)], role="user"
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text=response)], role="model"
+      ),
+  )
+
+
+def _make_eval_metric_result(
+    score: float = 0.9, status: EvalStatus = EvalStatus.PASSED
+) -> EvalMetricResult:
+  return EvalMetricResult(
+      metric_name="test_metric",
+      threshold=0.8,
+      score=score,
+      eval_status=status,
+  )
+
+
+def _call_print_details(
+    items: list[_EvalMetricResultWithInvocation],
+) -> MagicMock:
+  """Calls _print_details with mocked pandas/tabulate, returns the mock DataFrame class."""
+  mock_pandas = MagicMock()
+  mock_tabulate_module = MagicMock()
+  mock_tabulate_module.tabulate = MagicMock(return_value="table")
+
+  with patch.dict(
+      sys.modules,
+      {"pandas": mock_pandas, "tabulate": mock_tabulate_module},
+  ):
+    AgentEvaluator._print_details(
+        eval_metric_result_with_invocations=items,
+        overall_eval_status=EvalStatus.PASSED,
+        overall_score=0.9,
+        metric_name="test_metric",
+        threshold=0.8,
+    )
+
+  return mock_pandas.pandas.DataFrame
+
+
+class TestPrintDetailsWithNoExpectedInvocation:
+  """Tests for _print_details when expected_invocation is None."""
+
+  def test_does_not_raise(self):
+    items = [
+        _EvalMetricResultWithInvocation(
+            actual_invocation=_make_actual_invocation(),
+            expected_invocation=None,
+            eval_metric_result=_make_eval_metric_result(),
+        )
+    ]
+    _call_print_details(items)  # should not raise
+
+  def test_prompt_is_empty_string(self):
+    items = [
+        _EvalMetricResultWithInvocation(
+            actual_invocation=_make_actual_invocation(),
+            expected_invocation=None,
+            eval_metric_result=_make_eval_metric_result(),
+        )
+    ]
+    mock_df_cls = _call_print_details(items)
+    data = mock_df_cls.call_args[0][0]
+    assert data[0]["prompt"] == ""
+
+  def test_expected_response_is_empty_string(self):
+    items = [
+        _EvalMetricResultWithInvocation(
+            actual_invocation=_make_actual_invocation(),
+            expected_invocation=None,
+            eval_metric_result=_make_eval_metric_result(),
+        )
+    ]
+    mock_df_cls = _call_print_details(items)
+    data = mock_df_cls.call_args[0][0]
+    assert data[0]["expected_response"] == ""
+
+  def test_expected_tool_calls_is_empty_string(self):
+    items = [
+        _EvalMetricResultWithInvocation(
+            actual_invocation=_make_actual_invocation(),
+            expected_invocation=None,
+            eval_metric_result=_make_eval_metric_result(),
+        )
+    ]
+    mock_df_cls = _call_print_details(items)
+    data = mock_df_cls.call_args[0][0]
+    assert data[0]["expected_tool_calls"] == ""
+
+  def test_actual_response_is_populated(self):
+    items = [
+        _EvalMetricResultWithInvocation(
+            actual_invocation=_make_actual_invocation(response="hello world"),
+            expected_invocation=None,
+            eval_metric_result=_make_eval_metric_result(),
+        )
+    ]
+    mock_df_cls = _call_print_details(items)
+    data = mock_df_cls.call_args[0][0]
+    assert data[0]["actual_response"] == "hello world"
+
+  def test_multiple_invocations_all_without_expected(self):
+    items = [
+        _EvalMetricResultWithInvocation(
+            actual_invocation=_make_actual_invocation(response=f"response {i}"),
+            expected_invocation=None,
+            eval_metric_result=_make_eval_metric_result(),
+        )
+        for i in range(3)
+    ]
+    mock_df_cls = _call_print_details(items)
+    data = mock_df_cls.call_args[0][0]
+    assert len(data) == 3
+    for row in data:
+      assert row["prompt"] == ""
+      assert row["expected_response"] == ""
+      assert row["expected_tool_calls"] == ""

From 57e968b4832320309f0efdfc9e04d17f21d9051a Mon Sep 17 00:00:00 2001
From: Morgan Roux <roux.morgan@gmail.com>
Date: Sun, 1 Mar 2026 16:43:41 +0100
Subject: [PATCH 5/5] chore: reformat and remove per-field unit tests

---
 contributing/samples/gepa/experiment.py       |  1 -
 contributing/samples/gepa/run_experiment.py   |  1 -
 src/google/adk/evaluation/agent_evaluator.py  | 32 +++++++++----
 .../integration/test_conversation_scenario.py |  1 -
 .../evaluation/test_agent_evaluator.py        | 48 -------------------
 5 files changed, 22 insertions(+), 61 deletions(-)

diff --git a/contributing/samples/gepa/experiment.py b/contributing/samples/gepa/experiment.py
index f3751206a8..2710c3894c 100644
--- a/contributing/samples/gepa/experiment.py
+++ b/contributing/samples/gepa/experiment.py
@@ -43,7 +43,6 @@
 from tau_bench.types import EnvRunResult
 from tau_bench.types import RunConfig
 import tau_bench_agent as tau_bench_agent_lib
-
 import utils
 
 
diff --git a/contributing/samples/gepa/run_experiment.py b/contributing/samples/gepa/run_experiment.py
index d857da9635..e31db15788 100644
--- a/contributing/samples/gepa/run_experiment.py
+++ b/contributing/samples/gepa/run_experiment.py
@@ -25,7 +25,6 @@
 from absl import flags
 import experiment
 from google.genai import types
-
 import utils
 
 _OUTPUT_DIR = flags.DEFINE_string(
diff --git a/src/google/adk/evaluation/agent_evaluator.py b/src/google/adk/evaluation/agent_evaluator.py
index 4c9da7bd26..d25a1f2735 100644
--- a/src/google/adk/evaluation/agent_evaluator.py
+++ b/src/google/adk/evaluation/agent_evaluator.py
@@ -444,21 +444,33 @@ def _print_details(
           "eval_status": per_invocation_result.eval_metric_result.eval_status,
           "score": per_invocation_result.eval_metric_result.score,
           "threshold": threshold,
-          "prompt": AgentEvaluator._convert_content_to_text(
-              per_invocation_result.expected_invocation.user_content
-          ) if per_invocation_result.expected_invocation else "",
-          "expected_response": AgentEvaluator._convert_content_to_text(
-              per_invocation_result.expected_invocation.final_response
-          ) if per_invocation_result.expected_invocation else "",
+          "prompt": (
+              AgentEvaluator._convert_content_to_text(
+                  per_invocation_result.expected_invocation.user_content
+              )
+              if per_invocation_result.expected_invocation
+              else ""
+          ),
+          "expected_response": (
+              AgentEvaluator._convert_content_to_text(
+                  per_invocation_result.expected_invocation.final_response
+              )
+              if per_invocation_result.expected_invocation
+              else ""
+          ),
           "actual_response": AgentEvaluator._convert_content_to_text(
               per_invocation_result.actual_invocation.final_response
           ),
-          "expected_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
-              per_invocation_result.expected_invocation.intermediate_data
-          ) if per_invocation_result.expected_invocation else "",
+          "expected_tool_calls": (
+              AgentEvaluator._convert_tool_calls_to_text(
+                  per_invocation_result.expected_invocation.intermediate_data
+              )
+              if per_invocation_result.expected_invocation
+              else ""
+          ),
           "actual_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
               per_invocation_result.actual_invocation.intermediate_data
-          )
+          ),
       })
 
     print(
diff --git a/tests/integration/test_conversation_scenario.py b/tests/integration/test_conversation_scenario.py
index 106d2ab5ac..b7c39dc090 100644
--- a/tests/integration/test_conversation_scenario.py
+++ b/tests/integration/test_conversation_scenario.py
@@ -23,4 +23,3 @@ async def test_eval_agent_with_conversation_scenario():
       eval_dataset_file_path_or_dir="tests/integration/fixture/conversation_scenario/conversation_scenario.test.json",
       num_runs=2,
   )
-
diff --git a/tests/unittests/evaluation/test_agent_evaluator.py b/tests/unittests/evaluation/test_agent_evaluator.py
index 22a64172e6..cf0fb4e880 100644
--- a/tests/unittests/evaluation/test_agent_evaluator.py
+++ b/tests/unittests/evaluation/test_agent_evaluator.py
@@ -86,54 +86,6 @@ def test_does_not_raise(self):
     ]
     _call_print_details(items)  # should not raise
 
-  def test_prompt_is_empty_string(self):
-    items = [
-        _EvalMetricResultWithInvocation(
-            actual_invocation=_make_actual_invocation(),
-            expected_invocation=None,
-            eval_metric_result=_make_eval_metric_result(),
-        )
-    ]
-    mock_df_cls = _call_print_details(items)
-    data = mock_df_cls.call_args[0][0]
-    assert data[0]["prompt"] == ""
-
-  def test_expected_response_is_empty_string(self):
-    items = [
-        _EvalMetricResultWithInvocation(
-            actual_invocation=_make_actual_invocation(),
-            expected_invocation=None,
-            eval_metric_result=_make_eval_metric_result(),
-        )
-    ]
-    mock_df_cls = _call_print_details(items)
-    data = mock_df_cls.call_args[0][0]
-    assert data[0]["expected_response"] == ""
-
-  def test_expected_tool_calls_is_empty_string(self):
-    items = [
-        _EvalMetricResultWithInvocation(
-            actual_invocation=_make_actual_invocation(),
-            expected_invocation=None,
-            eval_metric_result=_make_eval_metric_result(),
-        )
-    ]
-    mock_df_cls = _call_print_details(items)
-    data = mock_df_cls.call_args[0][0]
-    assert data[0]["expected_tool_calls"] == ""
-
-  def test_actual_response_is_populated(self):
-    items = [
-        _EvalMetricResultWithInvocation(
-            actual_invocation=_make_actual_invocation(response="hello world"),
-            expected_invocation=None,
-            eval_metric_result=_make_eval_metric_result(),
-        )
-    ]
-    mock_df_cls = _call_print_details(items)
-    data = mock_df_cls.call_args[0][0]
-    assert data[0]["actual_response"] == "hello world"
-
   def test_multiple_invocations_all_without_expected(self):
     items = [
         _EvalMetricResultWithInvocation(