diff --git a/src/google/adk/cli/cli_tools_click.py b/src/google/adk/cli/cli_tools_click.py
index 66f4dbe455..353f60be8f 100644
--- a/src/google/adk/cli/cli_tools_click.py
+++ b/src/google/adk/cli/cli_tools_click.py
@@ -24,8 +24,13 @@
 import os
 from pathlib import Path
 import tempfile
-import textwrap
 from typing import Optional
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+  from ..apps.app import App
+
+import textwrap
 
 import click
 from click.core import ParameterSource
@@ -36,6 +41,7 @@
 from . import cli_deploy
 from .. import version
 from ..evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
+from ..utils.app_loader import load_app_from_module
 from .cli import run_cli
 from .fast_api import get_fast_api_app
 from .utils import envs
@@ -733,10 +739,19 @@ def cli_eval(
   )
 
   try:
+    # Try to load App if available (for plugin support like ReflectAndRetryToolPlugin)
+    app = load_app_from_module(agent_module_file_path)
+
+    if app:
+      logger.info("Using App instance for evaluation (plugins will be applied)")
+    else:
+      logger.info("No App found, using root_agent directly")
+
     eval_service = LocalEvalService(
         root_agent=root_agent,
         eval_sets_manager=eval_sets_manager,
         eval_set_results_manager=eval_set_results_manager,
+        app=app,  # NEW: Pass app if available
         user_simulator_provider=user_simulator_provider,
     )
 
diff --git a/src/google/adk/evaluation/app_inference_adapter.py b/src/google/adk/evaluation/app_inference_adapter.py
new file mode 100644
index 0000000000..f0682f3073
--- /dev/null
+++ b/src/google/adk/evaluation/app_inference_adapter.py
@@ -0,0 +1,80 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from typing import Optional
+from typing import TYPE_CHECKING
+
+from ..artifacts.base_artifact_service import BaseArtifactService
+from ..memory.base_memory_service import BaseMemoryService
+from ..runners import Runner
+from ..sessions.base_session_service import BaseSessionService
+from ._retry_options_utils import EnsureRetryOptionsPlugin
+from .request_intercepter_plugin import _RequestIntercepterPlugin
+from .simulation.user_simulator import UserSimulator
+
+if TYPE_CHECKING:
+  from ..apps.app import App
+  from .eval_case import Invocation
+  from .eval_case import SessionInput
+
+
+class AppInferenceAdapter:
+  """Adapter to generate inferences from App without importing cli.*
+
+  The inference flow itself is implemented once, in
+  `EvaluationGenerator._generate_inferences_from_app`; this class only
+  forwards to it so the two entry points cannot drift apart.
+  """
+
+  @staticmethod
+  async def generate_inferences_from_app(
+      app: "App",
+      user_simulator: UserSimulator,
+      initial_session: Optional["SessionInput"],
+      session_id: str,
+      session_service: BaseSessionService,
+      artifact_service: BaseArtifactService,
+      memory_service: BaseMemoryService,
+  ) -> list["Invocation"]:
+    """Generates eval inferences by running `app` through a Runner.
+
+    Args:
+        app: App to evaluate; forwarded unchanged to the generator.
+        user_simulator: Source of the user messages for the conversation.
+        initial_session: Optional session seed (app name, user id, state).
+        session_id: Id of the session the run is recorded under.
+        session_service: Session backend used for the run.
+        artifact_service: Artifact backend handed to the Runner.
+        memory_service: Memory backend handed to the Runner.
+
+    Returns:
+        Whatever `EvaluationGenerator._generate_inferences_from_app` returns
+        for the same arguments (a list of `Invocation` objects).
+    """
+    # Function-scope import, kept from the original implementation.
+    # NOTE(review): presumably this avoids an import cycle with
+    # evaluation_generator - confirm before hoisting to module level.
+    from .evaluation_generator import EvaluationGenerator
+
+    return await EvaluationGenerator._generate_inferences_from_app(
+        app=app,
+        user_simulator=user_simulator,
+        initial_session=initial_session,
+        session_id=session_id,
+        session_service=session_service,
+        artifact_service=artifact_service,
+        memory_service=memory_service,
+    )
diff --git a/src/google/adk/evaluation/evaluation_generator.py b/src/google/adk/evaluation/evaluation_generator.py
index 5d8b48c150..178d06113c 100644
--- a/src/google/adk/evaluation/evaluation_generator.py
+++ b/src/google/adk/evaluation/evaluation_generator.py
@@ -19,6 +19,11 @@
 from typing import Any
 from typing import AsyncGenerator
 from typing import Optional
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+  from ..apps.app import App
+
 import uuid
 
 from google.genai.types import Content
@@ -39,6 +44,7 @@
 from .app_details import AgentDetails
 from .app_details import AppDetails
 from .eval_case import EvalCase
+from .eval_case import IntermediateData
 from .eval_case import Invocation
 from .eval_case import InvocationEvent
 from .eval_case import InvocationEvents
@@ -156,6 +162,55 @@ async def _process_query(
         initial_session=initial_session,
     )
 
+  @staticmethod
+  async def _run_user_simulation_loop(
+      runner: Runner,
+      user_id: str,
+      session_id: str,
+      user_simulator: UserSimulator,
+      request_intercepter_plugin: _RequestIntercepterPlugin,
+  ) -> list[Invocation]:
+    """Drives a simulated conversation and converts it to invocations.
+
+    Args:
+        runner: Runner that every user message is sent through.
+        user_id: User id forwarded with each user message.
+        session_id: Session id forwarded with each user message.
+        user_simulator: Supplies the next user message from the events so far.
+        request_intercepter_plugin: Plugin consulted for per-invocation app
+          details once the conversation has ended.
+
+    Returns:
+        The invocations built from all events collected during the loop.
+    """
+    collected_events = []
+
+    while True:
+      # The simulator works on a deep copy, not on the list being built here.
+      history_snapshot = copy.deepcopy(collected_events)
+      simulator_reply = await user_simulator.get_next_user_message(
+          history_snapshot
+      )
+      # Any status other than SUCCESS ends the conversation.
+      if simulator_reply.status != UserSimulatorStatus.SUCCESS:
+        break
+      invocation_events = (
+          EvaluationGenerator._generate_inferences_for_single_user_invocation(
+              runner, user_id, session_id, simulator_reply.user_message
+          )
+      )
+      async for event in invocation_events:
+        collected_events.append(event)
+
+    details_by_invocation = (
+        EvaluationGenerator._get_app_details_by_invocation_id(
+            collected_events, request_intercepter_plugin
+        )
+    )
+    return EvaluationGenerator.convert_events_to_eval_invocations(
+        collected_events, details_by_invocation
+    )
+
   @staticmethod
   async def _generate_inferences_for_single_user_invocation(
       runner: Runner,
@@ -240,28 +295,12 @@ async def _generate_inferences_from_root_agent(
         memory_service=memory_service,
         plugins=[request_intercepter_plugin, ensure_retry_options_plugin],
     ) as runner:
-      events = []
-      while True:
-        next_user_message = await user_simulator.get_next_user_message(
-            copy.deepcopy(events)
-        )
-        if next_user_message.status == UserSimulatorStatus.SUCCESS:
-          async for (
-              event
-          ) in EvaluationGenerator._generate_inferences_for_single_user_invocation(
-              runner, user_id, session_id, next_user_message.user_message
-          ):
-            events.append(event)
-        else:  # no message generated
-          break
-
-      app_details_by_invocation_id = (
-          EvaluationGenerator._get_app_details_by_invocation_id(
-              events, request_intercepter_plugin
-          )
-      )
-      return EvaluationGenerator.convert_events_to_eval_invocations(
-          events, app_details_by_invocation_id
+      return await EvaluationGenerator._run_user_simulation_loop(
+          runner=runner,
+          user_id=user_id,
+          session_id=session_id,
+          user_simulator=user_simulator,
+          request_intercepter_plugin=request_intercepter_plugin,
       )
 
   @staticmethod
@@ -326,6 +365,62 @@ def convert_events_to_eval_invocations(
 
     return invocations
 
+  @staticmethod
+  async def _generate_inferences_from_app(
+      app: "App",
+      user_simulator: "UserSimulator",
+      initial_session: Optional["SessionInput"],
+      session_id: str,
+      session_service: "BaseSessionService",
+      artifact_service: "BaseArtifactService",
+      memory_service: "BaseMemoryService",
+  ) -> list["Invocation"]:
+    """Runs the eval conversation through an App so its plugins stay active.
+
+    The eval-only plugins are appended to a deep copy of `app`; the instance
+    passed in by the caller is left untouched.
+    """
+    if initial_session:
+      user_id = initial_session.user_id
+      app_name = initial_session.app_name
+      session_state = initial_session.state
+    else:
+      user_id = "test_user_id"
+      app_name = app.name
+      session_state = {}
+
+    await session_service.create_session(
+        app_name=app_name,
+        user_id=user_id,
+        session_id=session_id,
+        state=session_state,
+    )
+
+    # The interceptor is reused below to recover per-invocation app details.
+    interceptor = _RequestIntercepterPlugin(name="request_intercepter_plugin")
+    retry_plugin = EnsureRetryOptionsPlugin(name="ensure_retry_options")
+
+    # Skip an eval plugin when the App already has a plugin with that name.
+    app_for_runner = app.model_copy(deep=True)
+    taken_names = {plugin.name for plugin in app_for_runner.plugins}
+    for eval_plugin in (interceptor, retry_plugin):
+      if eval_plugin.name not in taken_names:
+        app_for_runner.plugins.append(eval_plugin)
+
+    async with Runner(
+        app=app_for_runner,
+        session_service=session_service,
+        artifact_service=artifact_service,
+        memory_service=memory_service,
+    ) as runner:
+      return await EvaluationGenerator._run_user_simulation_loop(
+          runner=runner,
+          user_id=user_id,
+          session_id=session_id,
+          user_simulator=user_simulator,
+          request_intercepter_plugin=interceptor,
+      )
+
   @staticmethod
   def _get_app_details_by_invocation_id(
       events: list[Event], request_intercepter: _RequestIntercepterPlugin
diff --git a/src/google/adk/evaluation/local_eval_service.py b/src/google/adk/evaluation/local_eval_service.py
index f454266e00..b024fabb81 100644
--- a/src/google/adk/evaluation/local_eval_service.py
+++ b/src/google/adk/evaluation/local_eval_service.py
@@ -20,6 +20,11 @@
 from typing import AsyncGenerator
 from typing import Callable
 from typing import Optional
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+  from ..apps.app import App
+
 import uuid
 
 from typing_extensions import override
@@ -34,6 +39,7 @@
 from ..utils._client_labels_utils import client_label_context
 from ..utils._client_labels_utils import EVAL_CLIENT_LABEL
 from ..utils.feature_decorator import experimental
+from .app_inference_adapter import AppInferenceAdapter
 from .base_eval_service import BaseEvalService
 from .base_eval_service import EvaluateConfig
 from .base_eval_service import EvaluateRequest
@@ -41,6 +47,7 @@
 from .base_eval_service import InferenceResult
 from .base_eval_service import InferenceStatus
 from .eval_case import Invocation
+from .eval_case import SessionInput
 from .eval_metrics import EvalMetric
 from .eval_metrics import EvalMetricResult
 from .eval_metrics import EvalMetricResultDetails
@@ -79,11 +86,13 @@ def __init__(
       artifact_service: Optional[BaseArtifactService] = None,
       eval_set_results_manager: Optional[EvalSetResultsManager] = None,
       session_id_supplier: Callable[[], str] = _get_session_id,
+      app: Optional['App'] = None,
       user_simulator_provider: UserSimulatorProvider = UserSimulatorProvider(),
       memory_service: Optional[BaseMemoryService] = None,
   ):
     self._root_agent = root_agent
     self._eval_sets_manager = eval_sets_manager
+    self._app = app
     metric_evaluator_registry = (
         metric_evaluator_registry or DEFAULT_METRIC_EVALUATOR_REGISTRY
     )
@@ -196,6 +205,37 @@ async def _evaluate_single_inference_result(
     The EvalCaseResult contains scores for each metric per invocation and the
     overall score.
     """
+    # Handle failed inferences early - skip evaluation
+    if (
+        inference_result.status == InferenceStatus.FAILURE
+        or inference_result.inferences is None
+    ):
+      # We still need to fetch eval_case to get the correct user_id.
+      eval_case = self._eval_sets_manager.get_eval_case(
+          app_name=inference_result.app_name,
+          eval_set_id=inference_result.eval_set_id,
+          eval_case_id=inference_result.eval_case_id,
+      )
+      user_id = (
+          eval_case.session_input.user_id
+          if eval_case
+          and eval_case.session_input
+          and eval_case.session_input.user_id
+          else 'test_user_id'
+      )
+      eval_case_result = EvalCaseResult(
+          eval_set_file=inference_result.eval_set_id,
+          eval_set_id=inference_result.eval_set_id,
+          eval_id=inference_result.eval_case_id,
+          final_eval_status=EvalStatus.NOT_EVALUATED,
+          overall_eval_metric_results=[],
+          eval_metric_result_per_invocation=[],
+          session_id=inference_result.session_id,
+          session_details=None,
+          user_id=user_id,
+      )
+      return (inference_result, eval_case_result)
+
     eval_case = self._eval_sets_manager.get_eval_case(
         app_name=inference_result.app_name,
         eval_set_id=inference_result.eval_set_id,
@@ -406,25 +446,37 @@ async def _perform_inference_single_eval_item(
     )
 
     try:
+      # Use App if available (so plugins like ReflectAndRetryToolPlugin run)
       with client_label_context(EVAL_CLIENT_LABEL):
-        inferences = (
-            await EvaluationGenerator._generate_inferences_from_root_agent(
-                root_agent=root_agent,
-                user_simulator=self._user_simulator_provider.provide(eval_case),
-                initial_session=initial_session,
-                session_id=session_id,
-                session_service=self._session_service,
-                artifact_service=self._artifact_service,
-                memory_service=self._memory_service,
-            )
-        )
+        # Extract common arguments to reduce duplication
+        common_args = {
+            'user_simulator': self._user_simulator_provider.provide(eval_case),
+            'initial_session': initial_session,
+            'session_id': session_id,
+            'session_service': self._session_service,
+            'artifact_service': self._artifact_service,
+            'memory_service': self._memory_service,
+        }
+
+        if self._app is not None:
+          inferences = await AppInferenceAdapter.generate_inferences_from_app(
+              app=self._app, **common_args
+          )
+        else:
+          # Fallback to direct root_agent usage (existing behavior)
+          inferences = (
+              await EvaluationGenerator._generate_inferences_from_root_agent(
+                  root_agent=root_agent, **common_args
+              )
+          )
 
       inference_result.inferences = inferences
       inference_result.status = InferenceStatus.SUCCESS
 
       return inference_result
+
     except Exception as e:
-      # We intentionally catch the Exception as we don't failures to affect
+      # We intentionally catch the Exception as we don't want failures to affect
       # other inferences.
       logger.error(
           'Inference failed for eval case `%s` with error %s.',
diff --git a/src/google/adk/utils/app_loader.py b/src/google/adk/utils/app_loader.py
new file mode 100644
index 0000000000..c2a061d236
--- /dev/null
+++ b/src/google/adk/utils/app_loader.py
@@ -0,0 +1,56 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Utilities for loading App instances from modules."""
+
+from __future__ import annotations
+
+import importlib
+import inspect
+import logging
+from typing import Optional
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+  from ..apps.app import App
+
+logger = logging.getLogger("google_adk." + __name__)
+
+
+def load_app_from_module(module_path: str) -> Optional["App"]:
+  """Best-effort lookup of an `App` instance exposed by a module.
+
+  Args:
+      module_path: Dotted module name for `importlib.import_module` (e.g.,
+        'my_package.my_agent'); a filesystem path is not a module name.
+
+  Returns:
+      The first `App` instance yielded by `inspect.getmembers`, or None when
+      the module cannot be imported or exposes no `App`.
+  """
+  from ..apps.app import App
+
+  try:
+    module = importlib.import_module(module_path)
+    for name, candidate in inspect.getmembers(module):
+      if isinstance(candidate, App):
+        logger.info("Loaded App instance '%s' from %s", name, module_path)
+        return candidate
+    logger.debug("No App instance found in %s", module_path)
+  except (ImportError, AttributeError, TypeError, ValueError) as e:
+    # import_module raises TypeError for names starting with '.' (such as
+    # './agent') and ValueError for an empty name; both mean "no App" here.
+    logger.debug("Could not load App from module %s: %s", module_path, e)
+
+  return None