Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion contributing/samples/gepa/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@
from tau_bench.types import EnvRunResult
from tau_bench.types import RunConfig
import tau_bench_agent as tau_bench_agent_lib

import utils


Expand Down
1 change: 0 additions & 1 deletion contributing/samples/gepa/run_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
from absl import flags
import experiment
from google.genai import types

import utils

_OUTPUT_DIR = flags.DEFINE_string(
Expand Down
34 changes: 27 additions & 7 deletions src/google/adk/evaluation/agent_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,10 +87,18 @@ class _EvalMetricResultWithInvocation(BaseModel):

This class is intentionally marked as private and is created for
convenience.

actual_invocations: These are the invocations that are obtained from the
agent under test.
expected_invocations: An optional list of invocations, if specified,
usually act as a benchmark/golden response. If these are specified
usually the expectation is that the length of this list and actual
invocations is the same.
eval_metric_result: This is the EvalMetricResult for the given actual and expected invocation.
"""

actual_invocation: Invocation
expected_invocation: Invocation
expected_invocation: Optional[Invocation] = None
eval_metric_result: EvalMetricResult


Expand Down Expand Up @@ -436,17 +444,29 @@ def _print_details(
"eval_status": per_invocation_result.eval_metric_result.eval_status,
"score": per_invocation_result.eval_metric_result.score,
"threshold": threshold,
"prompt": AgentEvaluator._convert_content_to_text(
per_invocation_result.expected_invocation.user_content
"prompt": (
AgentEvaluator._convert_content_to_text(
per_invocation_result.expected_invocation.user_content
)
if per_invocation_result.expected_invocation
else ""
),
"expected_response": AgentEvaluator._convert_content_to_text(
per_invocation_result.expected_invocation.final_response
"expected_response": (
AgentEvaluator._convert_content_to_text(
per_invocation_result.expected_invocation.final_response
)
if per_invocation_result.expected_invocation
else ""
),
"actual_response": AgentEvaluator._convert_content_to_text(
per_invocation_result.actual_invocation.final_response
),
"expected_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
per_invocation_result.expected_invocation.intermediate_data
"expected_tool_calls": (
AgentEvaluator._convert_tool_calls_to_text(
per_invocation_result.expected_invocation.intermediate_data
)
if per_invocation_result.expected_invocation
else ""
),
"actual_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
per_invocation_result.actual_invocation.intermediate_data
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from . import agent
25 changes: 25 additions & 0 deletions tests/integration/fixture/conversation_scenario/agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from google.adk import Agent

# Minimal agent fixture for the conversation-scenario integration test.
# It exposes no tools; the model alone answers the simulated user's turns.
_INSTRUCTION = """
You are helpful conversation agent. Kindly answer the questions.
"""

root_agent = Agent(
    name="Conversation_scenario_agent",
    model="gemini-2.5-flash",
    instruction=_INSTRUCTION,
    tools=[],
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"eval_set_id": "test",
"eval_cases": [{
"eval_id": "CS-1",
"conversation_scenario": {
"starting_prompt": "Hello",
"conversation_plan": "Ask the agent to do something"
},
"session_input": null
}]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"criteria": { "hallucinations_v1": { "threshold": 0.6 } },
"user_simulator_config": { "model": "gemini-2.5-flash", "max_allowed_invocations": 5 }
}
25 changes: 25 additions & 0 deletions tests/integration/test_conversation_scenario.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from google.adk.evaluation.agent_evaluator import AgentEvaluator
import pytest


@pytest.mark.asyncio
async def test_eval_agent_with_conversation_scenario():
  """Smoke-tests AgentEvaluator.evaluate on the conversation-scenario fixture.

  Runs the evaluation twice; the awaited call raises if evaluation fails,
  so reaching the end of the test is the success condition.
  """
  eval_kwargs = dict(
      agent_module="tests.integration.fixture.conversation_scenario.agent",
      eval_dataset_file_path_or_dir=(
          "tests/integration/fixture/conversation_scenario/"
          "conversation_scenario.test.json"
      ),
      num_runs=2,
  )
  await AgentEvaluator.evaluate(**eval_kwargs)
104 changes: 104 additions & 0 deletions tests/unittests/evaluation/test_agent_evaluator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import sys
from unittest.mock import MagicMock
from unittest.mock import patch

from google.adk.evaluation.agent_evaluator import _EvalMetricResultWithInvocation
from google.adk.evaluation.agent_evaluator import AgentEvaluator
from google.adk.evaluation.eval_case import Invocation
from google.adk.evaluation.eval_metrics import EvalMetricResult
from google.adk.evaluation.eval_metrics import EvalStatus
from google.genai import types as genai_types


def _make_actual_invocation(
    query: str = "user query", response: str = "agent response"
) -> Invocation:
  """Builds an Invocation pairing a user query with a model final response."""
  user_content = genai_types.Content(
      role="user", parts=[genai_types.Part(text=query)]
  )
  model_content = genai_types.Content(
      role="model", parts=[genai_types.Part(text=response)]
  )
  return Invocation(user_content=user_content, final_response=model_content)


def _make_eval_metric_result(
    score: float = 0.9, status: EvalStatus = EvalStatus.PASSED
) -> EvalMetricResult:
  """Builds an EvalMetricResult for "test_metric" with a fixed 0.8 threshold."""
  result_fields = {
      "metric_name": "test_metric",
      "threshold": 0.8,
      "score": score,
      "eval_status": status,
  }
  return EvalMetricResult(**result_fields)


def _call_print_details(
    items: list[_EvalMetricResultWithInvocation],
) -> MagicMock:
  """Invokes AgentEvaluator._print_details with pandas/tabulate stubbed out.

  Returns the mock standing in for the DataFrame class so callers can
  inspect the row data that _print_details assembled.

  NOTE(review): the return path `fake_pandas.pandas.DataFrame` presumably
  mirrors a `from pandas import pandas` style import inside
  agent_evaluator — confirm against that module if its imports change.
  """
  fake_pandas = MagicMock()
  fake_tabulate = MagicMock()
  fake_tabulate.tabulate = MagicMock(return_value="table")

  stubbed_modules = {"pandas": fake_pandas, "tabulate": fake_tabulate}
  with patch.dict(sys.modules, stubbed_modules):
    AgentEvaluator._print_details(
        eval_metric_result_with_invocations=items,
        overall_eval_status=EvalStatus.PASSED,
        overall_score=0.9,
        metric_name="test_metric",
        threshold=0.8,
    )

  return fake_pandas.pandas.DataFrame


class TestPrintDetailsWithNoExpectedInvocation:
  """Tests for _print_details when expected_invocation is None."""

  def test_does_not_raise(self):
    """A single result lacking an expected invocation prints without error."""
    item = _EvalMetricResultWithInvocation(
        actual_invocation=_make_actual_invocation(),
        expected_invocation=None,
        eval_metric_result=_make_eval_metric_result(),
    )
    _call_print_details([item])  # should not raise

  def test_multiple_invocations_all_without_expected(self):
    """Expected-derived columns fall back to "" for every row."""
    items = []
    for i in range(3):
      items.append(
          _EvalMetricResultWithInvocation(
              actual_invocation=_make_actual_invocation(
                  response=f"response {i}"
              ),
              expected_invocation=None,
              eval_metric_result=_make_eval_metric_result(),
          )
      )
    mock_df_cls = _call_print_details(items)
    rows = mock_df_cls.call_args[0][0]
    assert len(rows) == 3
    assert all(row["prompt"] == "" for row in rows)
    assert all(row["expected_response"] == "" for row in rows)
    assert all(row["expected_tool_calls"] == "" for row in rows)
Loading