diff --git a/src/google/adk/cli/cli_tools_click.py b/src/google/adk/cli/cli_tools_click.py index 66f4dbe455..353f60be8f 100644 --- a/src/google/adk/cli/cli_tools_click.py +++ b/src/google/adk/cli/cli_tools_click.py @@ -24,8 +24,13 @@ import os from pathlib import Path import tempfile -import textwrap from typing import Optional +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from ..apps.app import App + +import textwrap import click from click.core import ParameterSource @@ -36,6 +41,7 @@ from . import cli_deploy from .. import version from ..evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE +from ..utils.app_loader import load_app_from_module from .cli import run_cli from .fast_api import get_fast_api_app from .utils import envs @@ -733,10 +739,19 @@ def cli_eval( ) try: + # Try to load App if available (for plugin support like ReflectAndRetryToolPlugin) + app = load_app_from_module(agent_module_file_path) + + if app: + logger.info("Using App instance for evaluation (plugins will be applied)") + else: + logger.info("No App found, using root_agent directly") + eval_service = LocalEvalService( root_agent=root_agent, eval_sets_manager=eval_sets_manager, eval_set_results_manager=eval_set_results_manager, + app=app, # NEW: Pass app if available user_simulator_provider=user_simulator_provider, ) diff --git a/src/google/adk/evaluation/app_inference_adapter.py b/src/google/adk/evaluation/app_inference_adapter.py new file mode 100644 index 0000000000..f0682f3073 --- /dev/null +++ b/src/google/adk/evaluation/app_inference_adapter.py @@ -0,0 +1,92 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from typing import Optional
from typing import TYPE_CHECKING

from ..artifacts.base_artifact_service import BaseArtifactService
from ..memory.base_memory_service import BaseMemoryService
from ..runners import Runner
from ..sessions.base_session_service import BaseSessionService
from ._retry_options_utils import EnsureRetryOptionsPlugin
from .request_intercepter_plugin import _RequestIntercepterPlugin
from .simulation.user_simulator import UserSimulator

if TYPE_CHECKING:
  from .eval_case import SessionInput


class AppInferenceAdapter:
  """Runs eval inference through an App without importing anything from cli.*"""

  @staticmethod
  async def generate_inferences_from_app(
      app,
      user_simulator: UserSimulator,
      initial_session: Optional["SessionInput"],
      session_id: str,
      session_service: BaseSessionService,
      artifact_service: BaseArtifactService,
      memory_service: BaseMemoryService,
  ):
    """Drives the user-simulation loop against a copy of `app`.

    Args:
      app: The App whose agent and plugins should serve the conversation. It
        is deep-copied before the eval plugins are attached, so the caller's
        instance is not modified.
      user_simulator: Source of the user messages for the conversation.
      initial_session: Optional session seed. When given, its `user_id`,
        `app_name` and `state` are used; otherwise the user id is
        "test_user_id", the app name is `app.name` and the state is empty.
      session_id: Id of the session created for this run.
      session_service: Service the session is created in and the Runner uses.
      artifact_service: Artifact service handed to the Runner.
      memory_service: Memory service handed to the Runner.

    Returns:
      Whatever `EvaluationGenerator._run_user_simulation_loop` returns for the
      conversation.
    """
    # Imported here rather than at module level; presumably this avoids an
    # import cycle with evaluation_generator -- confirm before hoisting it.
    from .evaluation_generator import EvaluationGenerator

    if initial_session:
      user_id = initial_session.user_id
      app_name = initial_session.app_name
      session_state = initial_session.state
    else:
      user_id = "test_user_id"
      app_name = app.name
      session_state = {}

    await session_service.create_session(
        app_name=app_name,
        user_id=user_id,
        session_id=session_id,
        state=session_state,
    )

    # Plugins the evaluation pipeline needs on top of the app's own ones.
    interceptor = _RequestIntercepterPlugin(name="request_intercepter_plugin")
    retry_options = EnsureRetryOptionsPlugin(name="ensure_retry_options")

    # NOTE(review): the deep copy duplicates everything reachable from the
    # app; confirm that all attached objects can be deep-copied.
    eval_app = app.model_copy(deep=True)

    # Names are collected once, before anything is appended, so a plugin the
    # app already carries under one of these names is never added twice.
    taken_names = {plugin.name for plugin in eval_app.plugins}
    for eval_plugin in (interceptor, retry_options):
      if eval_plugin.name not in taken_names:
        eval_app.plugins.append(eval_plugin)

    runner_cm = Runner(
        app=eval_app,
        session_service=session_service,
        artifact_service=artifact_service,
        memory_service=memory_service,
    )
    async with runner_cm as runner:
      return await EvaluationGenerator._run_user_simulation_loop(
          runner=runner,
          user_id=user_id,
          session_id=session_id,
          user_simulator=user_simulator,
          request_intercepter_plugin=interceptor,
      )
list[Invocation]: + """Run the user simulation loop and return invocations. + + Args: + runner: Configured Runner instance + user_id: User identifier + session_id: Session identifier + user_simulator: User simulator to generate messages + request_intercepter_plugin: Plugin to intercept requests for app_details + + Returns: + List of Invocation objects from the simulation + """ + events = [] + + # Loop through user simulator messages (handles both static and dynamic) + while True: + next_user_message = await user_simulator.get_next_user_message( + copy.deepcopy(events) + ) + if next_user_message.status == UserSimulatorStatus.SUCCESS: + async for ( + event + ) in EvaluationGenerator._generate_inferences_for_single_user_invocation( + runner, user_id, session_id, next_user_message.user_message + ): + events.append(event) + else: # no more messages + break + + # Extract app details from intercepted requests + app_details_by_invocation_id = ( + EvaluationGenerator._get_app_details_by_invocation_id( + events, request_intercepter_plugin + ) + ) + + # Convert events to invocations + return EvaluationGenerator.convert_events_to_eval_invocations( + events, app_details_by_invocation_id + ) + @staticmethod async def _generate_inferences_for_single_user_invocation( runner: Runner, @@ -240,28 +295,12 @@ async def _generate_inferences_from_root_agent( memory_service=memory_service, plugins=[request_intercepter_plugin, ensure_retry_options_plugin], ) as runner: - events = [] - while True: - next_user_message = await user_simulator.get_next_user_message( - copy.deepcopy(events) - ) - if next_user_message.status == UserSimulatorStatus.SUCCESS: - async for ( - event - ) in EvaluationGenerator._generate_inferences_for_single_user_invocation( - runner, user_id, session_id, next_user_message.user_message - ): - events.append(event) - else: # no message generated - break - - app_details_by_invocation_id = ( - EvaluationGenerator._get_app_details_by_invocation_id( - events, 
@staticmethod
async def _generate_inferences_from_app(
    app: "App",
    user_simulator: "UserSimulator",
    initial_session: Optional["SessionInput"],
    session_id: str,
    session_service: "BaseSessionService",
    artifact_service: "BaseArtifactService",
    memory_service: "BaseMemoryService",
) -> list["Invocation"]:
  """Generate inferences by invoking through App (preserving plugins).

  Thin wrapper around `AppInferenceAdapter.generate_inferences_from_app`,
  which creates the session, attaches the eval plugins to a deep copy of the
  app and runs the user-simulation loop. Keeping a single implementation
  stops the two code paths from drifting apart.

  Args:
    app: App whose agent and plugins should serve the conversation.
    user_simulator: Source of the user messages for the conversation.
    initial_session: Optional session seed (user id, app name, state).
    session_id: Id of the session to create for this run.
    session_service: Session service used for the run.
    artifact_service: Artifact service used for the run.
    memory_service: Memory service used for the run.

  Returns:
    The invocations produced by the simulated conversation.
  """
  # Function-scope import: the adapter in turn imports this module inside its
  # own function body, so neither module needs the other at import time.
  from .app_inference_adapter import AppInferenceAdapter

  return await AppInferenceAdapter.generate_inferences_from_app(
      app=app,
      user_simulator=user_simulator,
      initial_session=initial_session,
      session_id=session_id,
      session_service=session_service,
      artifact_service=artifact_service,
      memory_service=memory_service,
  )
.eval_metrics import EvalMetricResult from .eval_metrics import EvalMetricResultDetails @@ -79,11 +86,13 @@ def __init__( artifact_service: Optional[BaseArtifactService] = None, eval_set_results_manager: Optional[EvalSetResultsManager] = None, session_id_supplier: Callable[[], str] = _get_session_id, + app: Optional['App'] = None, user_simulator_provider: UserSimulatorProvider = UserSimulatorProvider(), memory_service: Optional[BaseMemoryService] = None, ): self._root_agent = root_agent self._eval_sets_manager = eval_sets_manager + self._app = app metric_evaluator_registry = ( metric_evaluator_registry or DEFAULT_METRIC_EVALUATOR_REGISTRY ) @@ -196,6 +205,37 @@ async def _evaluate_single_inference_result( The EvalCaseResult contains scores for each metric per invocation and the overall score. """ + # Handle failed inferences early - skip evaluation + if ( + inference_result.status == InferenceStatus.FAILURE + or inference_result.inferences is None + ): + # We still need to fetch eval_case to get the correct user_id. 
+ eval_case = self._eval_sets_manager.get_eval_case( + app_name=inference_result.app_name, + eval_set_id=inference_result.eval_set_id, + eval_case_id=inference_result.eval_case_id, + ) + user_id = ( + eval_case.session_input.user_id + if eval_case + and eval_case.session_input + and eval_case.session_input.user_id + else 'test_user_id' + ) + eval_case_result = EvalCaseResult( + eval_set_file=inference_result.eval_set_id, + eval_set_id=inference_result.eval_set_id, + eval_id=inference_result.eval_case_id, + final_eval_status=EvalStatus.NOT_EVALUATED, + overall_eval_metric_results=[], + eval_metric_result_per_invocation=[], + session_id=inference_result.session_id, + session_details=None, + user_id=user_id, + ) + return (inference_result, eval_case_result) + eval_case = self._eval_sets_manager.get_eval_case( app_name=inference_result.app_name, eval_set_id=inference_result.eval_set_id, @@ -406,25 +446,37 @@ async def _perform_inference_single_eval_item( ) try: + # Use App if available (so plugins like ReflectAndRetryToolPlugin run) with client_label_context(EVAL_CLIENT_LABEL): - inferences = ( - await EvaluationGenerator._generate_inferences_from_root_agent( - root_agent=root_agent, - user_simulator=self._user_simulator_provider.provide(eval_case), - initial_session=initial_session, - session_id=session_id, - session_service=self._session_service, - artifact_service=self._artifact_service, - memory_service=self._memory_service, - ) - ) + # Extract common arguments to reduce duplication + common_args = { + 'user_simulator': self._user_simulator_provider.provide(eval_case), + 'initial_session': initial_session, + 'session_id': session_id, + 'session_service': self._session_service, + 'artifact_service': self._artifact_service, + 'memory_service': self._memory_service, + } + + if self._app is not None: + inferences = await AppInferenceAdapter.generate_inferences_from_app( + app=self._app, **common_args + ) + else: + # Fallback to direct root_agent usage (existing 
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Utilities for loading App instances from modules."""

from __future__ import annotations

import importlib
import inspect
import logging
from pathlib import Path
import sys
from types import ModuleType
from typing import Iterator
from typing import Optional
from typing import TYPE_CHECKING

if TYPE_CHECKING:
  from ..apps.app import App

logger = logging.getLogger("google_adk." + __name__)


def _resolve_module_name(module_path: str) -> str:
  """Turns a filesystem path into a name `importlib.import_module` accepts.

  A string that does not name an existing file or directory is assumed to be
  a dotted module name already and is returned unchanged. For an existing
  path, the parent directory is appended to `sys.path` (if missing) and the
  final path component, without extension, is returned.
  """
  location = Path(module_path)
  if not location.exists():
    return module_path

  location = location.resolve()
  search_dir = str(location.parent)
  # Appended rather than prepended so it cannot shadow installed modules.
  if search_dir not in sys.path:
    sys.path.append(search_dir)
  return location.stem


def _candidate_namespaces(module: ModuleType) -> Iterator[ModuleType]:
  """Yields the module itself, then its `agent` submodule when it has one."""
  yield module
  # NOTE(review): assumes the agent package exposes its definitions through
  # an `agent` submodule attribute; confirm against the root_agent loader.
  agent_module = getattr(module, "agent", None)
  if isinstance(agent_module, ModuleType):
    yield agent_module


def load_app_from_module(module_path: str) -> Optional["App"]:
  """Try to load an App instance from the agent module.

  This is a best-effort probe: any failure while importing or inspecting the
  module is logged and reported as "no App" so the caller can fall back to
  using the root agent directly.

  Args:
    module_path: Dotted module name (e.g., 'my_package.my_agent') or a
      filesystem path to the agent package/module. For a path, the parent
      directory is added to `sys.path` so the module can be imported.

  Returns:
    App instance if found, None otherwise. When a namespace holds several
    App instances, the one whose attribute name sorts first is returned
    (`inspect.getmembers` yields members sorted by name).
  """
  from ..apps.app import App

  if not module_path:
    # An empty string would resolve to the current directory below.
    return None

  try:
    module = importlib.import_module(_resolve_module_name(module_path))

    for namespace in _candidate_namespaces(module):
      for name, candidate in inspect.getmembers(namespace):
        if isinstance(candidate, App):
          logger.info("Loaded App instance '%s' from %s", name, module_path)
          return candidate

    logger.debug("No App instance found in %s", module_path)

  except Exception as e:  # pylint: disable=broad-except
    # Importing user code can raise anything (ImportError, TypeError for
    # relative names, errors in the module body); none of it may abort the
    # evaluation run, so it is logged and treated as "no App".
    logger.warning("Could not load App from module %s: %s", module_path, e)

  return None