
Commit 741c6ad

vertex-sdk-bot authored and copybara-github committed
fix: GenAI Client(evals) - Support EvaluationDataset output from run_inference as input dataset in create_evaluation_run in Vertex AI GenAI SDK evals
PiperOrigin-RevId: 826069078
1 parent a018a19 commit 741c6ad
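
The change, in short: an EvaluationDataset produced by run_inference can be passed straight to create_evaluation_run as its dataset, metrics becomes a required argument, and run_inference no longer hard-codes candidate_name="agent" for agent runs. A minimal, hedged sketch of the intended flow follows; the project, bucket, reasoning-engine resource name, and DataFrame columns are placeholders, and the client/import setup is assumed rather than taken from this diff.

    import pandas as pd

    import vertexai
    from vertexai._genai import types  # assumed import path for the evals types

    # Placeholder project/location; client construction may differ in your setup.
    client = vertexai.Client(project="my-project", location="us-central1")

    prompts_df = pd.DataFrame({"prompt": ["What is the weather in Boston?"]})

    # run_inference against an agent returns an EvaluationDataset; per this commit
    # its candidate_name is left unset instead of being forced to "agent".
    inference_result = client.evals.run_inference(
        agent="projects/123/locations/us-central1/reasoningEngines/456",  # placeholder
        src=prompts_df,
    )

    # That EvaluationDataset is now accepted directly as the input dataset.
    evaluation_run = client.evals.create_evaluation_run(
        display_name="agent-eval",
        dataset=inference_result,
        dest="gs://my-bucket/eval_run_output",  # placeholder GCS output prefix
        metrics=[types.RubricMetric.FINAL_RESPONSE_QUALITY],  # metrics is now required
        agent_info=types.evals.AgentInfo(
            agent="projects/123/locations/us-central1/reasoningEngines/456",
            name="agent-1",
            instruction="agent-1 instruction",
        ),
    )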

File tree

8 files changed (+135, -101 lines)


tests/unit/vertexai/genai/replays/test_create_evaluation_run.py

Lines changed: 70 additions & 66 deletions
@@ -48,70 +48,69 @@
 )


-# TODO(b/431231205): Re-enable once Unified Metrics are in prod.
-# def test_create_eval_run_data_source_evaluation_set(client):
-#     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun."""
-#     client._api_client._http_options.api_version = "v1beta1"
-#     tool = genai_types.Tool(
-#         function_declarations=[
-#             genai_types.FunctionDeclaration(
-#                 name="get_weather",
-#                 description="Get weather in a location",
-#                 parameters={
-#                     "type": "object",
-#                     "properties": {"location": {"type": "string"}},
-#                 },
-#             )
-#         ]
-#     )
-#     evaluation_run = client.evals.create_evaluation_run(
-#         name="test4",
-#         display_name="test4",
-#         dataset=types.EvaluationRunDataSource(
-#             evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
-#         ),
-#         dest=GCS_DEST,
-#         metrics=[
-#             UNIVERSAL_AR_METRIC,
-#             types.RubricMetric.FINAL_RESPONSE_QUALITY,
-#             LLM_METRIC
-#         ],
-#         agent_info=types.AgentInfo(
-#             agent="project/123/locations/us-central1/reasoningEngines/456",
-#             name="agent-1",
-#             instruction="agent-1 instruction",
-#             tool_declarations=[tool],
-#         ),
-#         labels={"label1": "value1"},
-#     )
-#     assert isinstance(evaluation_run, types.EvaluationRun)
-#     assert evaluation_run.display_name == "test4"
-#     assert evaluation_run.state == types.EvaluationRunState.PENDING
-#     assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
-#     assert evaluation_run.data_source.evaluation_set == (
-#         "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
-#     )
-#     assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
-#         output_config=genai_types.OutputConfig(
-#             gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
-#         ),
-#         metrics=[UNIVERSAL_AR_METRIC, FINAL_RESPONSE_QUALITY_METRIC, LLM_METRIC],
-#     )
-#     assert evaluation_run.inference_configs[
-#         "agent-1"
-#     ] == types.EvaluationRunInferenceConfig(
-#         agent_config=types.EvaluationRunAgentConfig(
-#             developer_instruction=genai_types.Content(
-#                 parts=[genai_types.Part(text="agent-1 instruction")]
-#             ),
-#             tools=[tool],
-#         )
-#     )
-#     assert evaluation_run.labels == {
-#         "vertex-ai-evaluation-agent-engine-id": "456",
-#         "label1": "value1",
-#     }
-#     assert evaluation_run.error is None
+def test_create_eval_run_data_source_evaluation_set(client):
+    """Tests that create_evaluation_run() creates a correctly structured EvaluationRun."""
+    client._api_client._http_options.api_version = "v1beta1"
+    tool = genai_types.Tool(
+        function_declarations=[
+            genai_types.FunctionDeclaration(
+                name="get_weather",
+                description="Get weather in a location",
+                parameters={
+                    "type": "object",
+                    "properties": {"location": {"type": "string"}},
+                },
+            )
+        ]
+    )
+    evaluation_run = client.evals.create_evaluation_run(
+        name="test4",
+        display_name="test4",
+        dataset=types.EvaluationRunDataSource(
+            evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
+        ),
+        dest=GCS_DEST,
+        metrics=[
+            UNIVERSAL_AR_METRIC,
+            types.RubricMetric.FINAL_RESPONSE_QUALITY,
+            LLM_METRIC,
+        ],
+        agent_info=types.evals.AgentInfo(
+            agent="project/123/locations/us-central1/reasoningEngines/456",
+            name="agent-1",
+            instruction="agent-1 instruction",
+            tool_declarations=[tool],
+        ),
+        labels={"label1": "value1"},
+    )
+    assert isinstance(evaluation_run, types.EvaluationRun)
+    assert evaluation_run.display_name == "test4"
+    assert evaluation_run.state == types.EvaluationRunState.PENDING
+    assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
+    assert evaluation_run.data_source.evaluation_set == (
+        "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
+    )
+    assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
+        output_config=genai_types.OutputConfig(
+            gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
+        ),
+        metrics=[UNIVERSAL_AR_METRIC, FINAL_RESPONSE_QUALITY_METRIC, LLM_METRIC],
+    )
+    assert evaluation_run.inference_configs[
+        "agent-1"
+    ] == types.EvaluationRunInferenceConfig(
+        agent_config=types.EvaluationRunAgentConfig(
+            developer_instruction=genai_types.Content(
+                parts=[genai_types.Part(text="agent-1 instruction")]
+            ),
+            tools=[tool],
+        )
+    )
+    assert evaluation_run.labels == {
+        "vertex-ai-evaluation-agent-engine-id": "456",
+        "label1": "value1",
+    }
+    assert evaluation_run.error is None


 def test_create_eval_run_data_source_bigquery_request_set(client):
@@ -132,6 +131,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
         ),
         labels={"label1": "value1"},
         dest=GCS_DEST,
+        metrics=[UNIVERSAL_AR_METRIC],
     )
     assert isinstance(evaluation_run, types.EvaluationRun)
     assert evaluation_run.display_name == "test5"
@@ -152,6 +152,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
         output_config=genai_types.OutputConfig(
             gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
         ),
+        metrics=[UNIVERSAL_AR_METRIC],
     )
     assert evaluation_run.inference_configs is None
     assert evaluation_run.labels == {
@@ -160,7 +161,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
     assert evaluation_run.error is None


-# Test fails in replay mode because of the timestamp issue
+# Test fails in replay mode because of UUID generation mismatch.
 # def test_create_eval_run_data_source_evaluation_dataset(client):
 #     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with EvaluationDataset."""
 #     input_df = pd.DataFrame(
@@ -215,7 +216,8 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
 #             candidate_name="candidate_1",
 #             eval_dataset_df=input_df,
 #         ),
-#         dest="gs://lakeyk-limited-bucket/eval_run_output",
+#         dest=GCS_DEST,
+#         metrics=[UNIVERSAL_AR_METRIC],
 #     )
 #     assert isinstance(evaluation_run, types.EvaluationRun)
 #     assert evaluation_run.display_name == "test6"
@@ -276,6 +278,7 @@ async def test_create_eval_run_async(client):
             )
         ),
         dest=GCS_DEST,
+        metrics=[UNIVERSAL_AR_METRIC],
     )
     assert isinstance(evaluation_run, types.EvaluationRun)
     assert evaluation_run.display_name == "test8"
@@ -292,6 +295,7 @@ async def test_create_eval_run_async(client):
         output_config=genai_types.OutputConfig(
             gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
         ),
+        metrics=[UNIVERSAL_AR_METRIC],
     )
     assert evaluation_run.error is None
     assert evaluation_run.inference_configs is None

tests/unit/vertexai/genai/replays/test_evaluate_instances.py

Lines changed: 1 addition & 1 deletion
@@ -246,7 +246,7 @@ def test_run_inference_with_agent(client):
         agent="projects/977012026409/locations/us-central1/reasoningEngines/7188347537655332864",
         src=test_df,
     )
-    assert inference_result.candidate_name == "agent"
+    assert inference_result.candidate_name is None
     assert inference_result.gcs_source is None


tests/unit/vertexai/genai/test_evals.py

Lines changed: 2 additions & 2 deletions
@@ -1132,7 +1132,7 @@ def test_run_inference_with_agent_engine_and_session_inputs_dict(
             }
         ),
     )
-    assert inference_result.candidate_name == "agent"
+    assert inference_result.candidate_name is None
     assert inference_result.gcs_source is None

 @mock.patch.object(_evals_metric_loaders, "EvalDatasetLoader")
@@ -1211,7 +1211,7 @@ def test_run_inference_with_agent_engine_and_session_inputs_literal_string(
             }
         ),
     )
-    assert inference_result.candidate_name == "agent"
+    assert inference_result.candidate_name is None
     assert inference_result.gcs_source is None

 @mock.patch.object(_evals_utils, "EvalDatasetLoader")

vertexai/_genai/_evals_common.py

Lines changed: 0 additions & 1 deletion
@@ -798,7 +798,6 @@ def _execute_inference(

         evaluation_dataset = types.EvaluationDataset(
             eval_dataset_df=results_df,
-            candidate_name="agent",
         )
     else:
         raise ValueError("Either model or agent_engine must be provided.")

vertexai/_genai/_evals_data_converters.py

Lines changed: 0 additions & 4 deletions
@@ -366,10 +366,6 @@ def convert(self, raw_data: list[dict[str, Any]]) -> types.EvaluationDataset:

         intermediate_events: Optional[list[types.Event]] = None
         if intermediate_events_data:
-            logger.warning(
-                "intermediate_events attribute is experimental and may change in "
-                "future versions."
-            )
             if isinstance(intermediate_events_data, list):
                 intermediate_events = []
                 for event in intermediate_events_data:

vertexai/_genai/_evals_visualization.py

Lines changed: 34 additions & 1 deletion
@@ -280,7 +280,7 @@ def _get_evaluation_html(eval_result_json: str) -> str:

             // If we have agent info, render as trace
             if(agentInfo) {{
-                let traceHtml = `<div class="trace-event-row"><div class="name"><span class="icon">🏃</span>agent_run</div></div>`;
+                let traceHtml = `<div class="trace-event-row"><div class="name"><span class="icon">🤖</span>agent_run</div></div>`;
                 eventsArray.forEach(event => {{
                     if (event.content && event.content.parts && event.content.parts.length > 0) {{
                         event.content.parts.forEach(part => {{
@@ -1073,3 +1073,36 @@ def display_evaluation_dataset(eval_dataset_obj: types.EvaluationDataset) -> None:
     dataframe_json_string = json.dumps(processed_rows, ensure_ascii=False, default=str)
     html_content = _get_inference_html(dataframe_json_string)
     display.display(display.HTML(html_content))
+
+
+def _get_status_html(status: str, error_message: Optional[str] = None) -> str:
+    """Returns a simple HTML string for displaying a status and optional error."""
+    error_html = ""
+    if error_message:
+        error_html = f"""
+        <p>
+          <b>Error:</b>
+          <pre style="white-space: pre-wrap; word-wrap: break-word;">{error_message}</pre>
+        </p>
+        """
+
+    return f"""
+    <div>
+      <p><b>Status:</b> {status}</p>
+      {error_html}
+    </div>
+    """
+
+
+def display_evaluation_run_status(eval_run_obj: "types.EvaluationRun") -> None:
+    """Displays the status of an evaluation run in an IPython environment."""
+    if not _is_ipython_env():
+        logger.warning("Skipping display: not in an IPython environment.")
+        return
+    else:
+        from IPython import display
+
+    status = eval_run_obj.state.name if eval_run_obj.state else "UNKNOWN"
+    error_message = str(eval_run_obj.error) if eval_run_obj.error else None
+    html_content = _get_status_html(status, error_message)
+    display.display(display.HTML(html_content))
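
For context on the helpers added above, a tiny hedged usage sketch; the run object and error text are placeholders, and outside an IPython environment the display call simply logs a warning and returns.

    from vertexai._genai import _evals_visualization  # module patched above

    # In a notebook, render a compact status card for an evaluation run object
    # (eval_run is assumed to come from client.evals.create_evaluation_run(...)):
    # _evals_visualization.display_evaluation_run_status(eval_run)

    # _get_status_html can be exercised without IPython; the error text below is
    # a made-up placeholder, not output from the SDK.
    print(
        _evals_visualization._get_status_html(
            "FAILED", error_message="code: 3, message: invalid dataset"
        )
    )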

vertexai/_genai/evals.py

Lines changed: 26 additions & 22 deletions
@@ -1334,11 +1334,9 @@ def create_evaluation_run(
         *,
         dataset: Union[types.EvaluationRunDataSource, types.EvaluationDataset],
         dest: str,
+        metrics: list[types.EvaluationRunMetricOrDict],
         name: Optional[str] = None,
         display_name: Optional[str] = None,
-        metrics: Optional[
-            list[types.EvaluationRunMetricOrDict]
-        ] = None,  # TODO: Make required unified metrics available in prod.
         agent_info: Optional[types.evals.AgentInfoOrDict] = None,
         labels: Optional[dict[str, str]] = None,
         config: Optional[types.CreateEvaluationRunConfigOrDict] = None,
@@ -1348,25 +1346,32 @@
         Args:
             dataset: The dataset to evaluate. Either an EvaluationRunDataSource or an EvaluationDataset.
             dest: The GCS URI prefix to write the evaluation results to.
+            metrics: The list of metrics to evaluate.
             name: The name of the evaluation run.
             display_name: The display name of the evaluation run.
-            metrics: The list of metrics to evaluate.
             agent_info: The agent info to evaluate.
             labels: The labels to apply to the evaluation run.
             config: The configuration for the evaluation run.

         Returns:
             The created evaluation run.
         """
+        if agent_info and isinstance(agent_info, dict):
+            agent_info = types.evals.AgentInfo.model_validate(agent_info)
         if type(dataset).__name__ == "EvaluationDataset":
-            logger.warning(
-                "EvaluationDataset input is experimental and may change in future versions."
-            )
             if dataset.eval_dataset_df is None:
                 raise ValueError(
                     "EvaluationDataset must have eval_dataset_df populated."
                 )
-            if dataset.candidate_name is None and agent_info:
+            if (
+                dataset.candidate_name
+                and agent_info.name
+                and dataset.candidate_name != agent_info.name
+            ):
+                logger.warning(
+                    "Evaluation dataset candidate_name and agent_info.name are different. Please make sure this is intended."
+                )
+            elif dataset.candidate_name is None and agent_info:
                 dataset.candidate_name = agent_info.name
             eval_set = _evals_common._create_evaluation_set_from_dataframe(
                 self._api_client, dest, dataset.eval_dataset_df, dataset.candidate_name
@@ -1383,9 +1388,6 @@ def create_evaluation_run(
         )
         inference_configs = {}
         if agent_info:
-            logger.warning(
-                "The agent_info field is experimental and may change in future versions."
-            )
             if isinstance(agent_info, dict):
                 agent_info = types.evals.AgentInfo.model_validate(agent_info)
             if (
@@ -2187,11 +2189,9 @@ async def create_evaluation_run(
         *,
         dataset: Union[types.EvaluationRunDataSource, types.EvaluationDataset],
         dest: str,
+        metrics: list[types.EvaluationRunMetricOrDict],
         name: Optional[str] = None,
         display_name: Optional[str] = None,
-        metrics: Optional[
-            list[types.EvaluationRunMetricOrDict]
-        ] = None,  # TODO: Make required unified metrics available in prod.
         agent_info: Optional[types.evals.AgentInfo] = None,
         labels: Optional[dict[str, str]] = None,
         config: Optional[types.CreateEvaluationRunConfigOrDict] = None,
@@ -2201,25 +2201,32 @@
         Args:
             dataset: The dataset to evaluate. Either an EvaluationRunDataSource or an EvaluationDataset.
             dest: The GCS URI prefix to write the evaluation results to.
+            metrics: The list of metrics to evaluate.
             name: The name of the evaluation run.
             display_name: The display name of the evaluation run.
-            metrics: The list of metrics to evaluate.
             agent_info: The agent info to evaluate.
             labels: The labels to apply to the evaluation run.
             config: The configuration for the evaluation run.

         Returns:
             The created evaluation run.
         """
+        if agent_info and isinstance(agent_info, dict):
+            agent_info = types.evals.AgentInfo.model_validate(agent_info)
         if type(dataset).__name__ == "EvaluationDataset":
-            logger.warning(
-                "EvaluationDataset input is experimental and may change in future versions."
-            )
             if dataset.eval_dataset_df is None:
                 raise ValueError(
                     "EvaluationDataset must have eval_dataset_df populated."
                 )
-            if dataset.candidate_name is None and agent_info:
+            if (
+                dataset.candidate_name
+                and agent_info.name
+                and dataset.candidate_name != agent_info.name
+            ):
+                logger.warning(
+                    "Evaluation dataset candidate_name and agent_info.name are different. Please make sure this is intended."
+                )
+            elif dataset.candidate_name is None and agent_info:
                 dataset.candidate_name = agent_info.name
             eval_set = _evals_common._create_evaluation_set_from_dataframe(
                 self._api_client, dest, dataset.eval_dataset_df, dataset.candidate_name
@@ -2236,9 +2243,6 @@ async def create_evaluation_run(
         )
         inference_configs = {}
         if agent_info:
-            logger.warning(
-                "The agent_info field is experimental and may change in future versions."
-            )
             if isinstance(agent_info, dict):
                 agent_info = types.evals.AgentInfo.model_validate(agent_info)
             if (
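
To spell out the candidate_name reconciliation introduced above, here is a hedged sketch of the three paths the rewritten block takes when an EvaluationDataset is passed together with agent_info; the DataFrame columns, names, and import path are placeholders and assumptions, not values from this diff.

    import pandas as pd

    from vertexai._genai import types  # assumed import path

    df = pd.DataFrame({"prompt": ["hi"], "response": ["hello"]})  # placeholder data

    agent_info = types.evals.AgentInfo(
        agent="projects/123/locations/us-central1/reasoningEngines/456",  # placeholder
        name="agent-1",
        instruction="agent-1 instruction",
    )

    # 1) candidate_name unset: create_evaluation_run copies agent_info.name onto
    #    the dataset before building the evaluation set.
    ds_unset = types.EvaluationDataset(eval_dataset_df=df)

    # 2) candidate_name equal to agent_info.name: used as-is, no warning.
    ds_match = types.EvaluationDataset(eval_dataset_df=df, candidate_name="agent-1")

    # 3) candidate_name different from agent_info.name: kept as-is, but a warning
    #    is logged asking you to confirm the mismatch is intended.
    ds_mismatch = types.EvaluationDataset(eval_dataset_df=df, candidate_name="candidate_1")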
