#
# pylint: disable=protected-access,bad-continuation,missing-function-docstring

+import pandas as pd
+
from tests.unit.vertexai.genai.replays import pytest_helper
from vertexai import types
from google.genai import types as genai_types
        )
    ),
)
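+# Shared dataset used by the (currently disabled) DataFrame-based tests below and by
+# check_evaluation_run_data_source().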
+INPUT_DF = pd.DataFrame(
+    {
+        "prompt": ["prompt1", "prompt2"],
+        "reference": ["reference1", "reference2"],
+        "response": ["response1", "response2"],
+        "intermediate_events": [
+            [
+                {
+                    "content": {
+                        "parts": [
+                            {"text": "first user input"},
+                        ],
+                        "role": "user",
+                    },
+                },
+                {
+                    "content": {
+                        "parts": [
+                            {"text": "first model response"},
+                        ],
+                        "role": "model",
+                    },
+                },
+            ],
+            [
+                {
+                    "content": {
+                        "parts": [
+                            {"text": "second user input"},
+                        ],
+                        "role": "user",
+                    },
+                },
+                {
+                    "content": {
+                        "parts": [
+                            {"text": "second model response"},
+                        ],
+                        "role": "model",
+                    },
+                },
+            ],
+        ],
+    }
+)


-# TODO(b/431231205): Re-enable once Unified Metrics are in prod.
-# def test_create_eval_run_data_source_evaluation_set(client):
-#     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun."""
-#     client._api_client._http_options.api_version = "v1beta1"
-#     tool = genai_types.Tool(
-#         function_declarations=[
-#             genai_types.FunctionDeclaration(
-#                 name="get_weather",
-#                 description="Get weather in a location",
-#                 parameters={
-#                     "type": "object",
-#                     "properties": {"location": {"type": "string"}},
-#                 },
-#             )
-#         ]
-#     )
-#     evaluation_run = client.evals.create_evaluation_run(
-#         name="test4",
-#         display_name="test4",
-#         dataset=types.EvaluationRunDataSource(
-#             evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
-#         ),
-#         dest=GCS_DEST,
-#         metrics=[
-#             UNIVERSAL_AR_METRIC,
-#             types.RubricMetric.FINAL_RESPONSE_QUALITY,
-#             LLM_METRIC
-#         ],
-#         agent_info=types.AgentInfo(
-#             agent="project/123/locations/us-central1/reasoningEngines/456",
-#             name="agent-1",
-#             instruction="agent-1 instruction",
-#             tool_declarations=[tool],
-#         ),
-#         labels={"label1": "value1"},
-#     )
-#     assert isinstance(evaluation_run, types.EvaluationRun)
-#     assert evaluation_run.display_name == "test4"
-#     assert evaluation_run.state == types.EvaluationRunState.PENDING
-#     assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
-#     assert evaluation_run.data_source.evaluation_set == (
-#         "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
-#     )
-#     assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
-#         output_config=genai_types.OutputConfig(
-#             gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
-#         ),
-#         metrics=[UNIVERSAL_AR_METRIC, FINAL_RESPONSE_QUALITY_METRIC, LLM_METRIC],
-#     )
-#     assert evaluation_run.inference_configs[
-#         "agent-1"
-#     ] == types.EvaluationRunInferenceConfig(
-#         agent_config=types.EvaluationRunAgentConfig(
-#             developer_instruction=genai_types.Content(
-#                 parts=[genai_types.Part(text="agent-1 instruction")]
-#             ),
-#             tools=[tool],
-#         )
-#     )
-#     assert evaluation_run.labels == {
-#         "vertex-ai-evaluation-agent-engine-id": "456",
-#         "label1": "value1",
-#     }
-#     assert evaluation_run.error is None
+def test_create_eval_run_data_source_evaluation_set(client):
+    """Tests that create_evaluation_run() creates a correctly structured EvaluationRun."""
+    client._api_client._http_options.api_version = "v1beta1"
+    tool = genai_types.Tool(
+        function_declarations=[
+            genai_types.FunctionDeclaration(
+                name="get_weather",
+                description="Get weather in a location",
+                parameters={
+                    "type": "object",
+                    "properties": {"location": {"type": "string"}},
+                },
+            )
+        ]
+    )
+    evaluation_run = client.evals.create_evaluation_run(
+        name="test4",
+        display_name="test4",
+        dataset=types.EvaluationRunDataSource(
+            evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
+        ),
+        dest=GCS_DEST,
+        metrics=[
+            UNIVERSAL_AR_METRIC,
+            types.RubricMetric.FINAL_RESPONSE_QUALITY,
+            LLM_METRIC,
+        ],
+        agent_info=types.evals.AgentInfo(
+            agent="project/123/locations/us-central1/reasoningEngines/456",
+            name="agent-1",
+            instruction="agent-1 instruction",
+            tool_declarations=[tool],
+        ),
+        labels={"label1": "value1"},
+    )
+    assert isinstance(evaluation_run, types.EvaluationRun)
+    assert evaluation_run.display_name == "test4"
+    assert evaluation_run.state == types.EvaluationRunState.PENDING
+    assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
+    assert evaluation_run.data_source.evaluation_set == (
+        "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
+    )
+    assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
+        output_config=genai_types.OutputConfig(
+            gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
+        ),
+        metrics=[UNIVERSAL_AR_METRIC, FINAL_RESPONSE_QUALITY_METRIC, LLM_METRIC],
+    )
+    assert evaluation_run.inference_configs[
+        "agent-1"
+    ] == types.EvaluationRunInferenceConfig(
+        agent_config=types.EvaluationRunAgentConfig(
+            developer_instruction=genai_types.Content(
+                parts=[genai_types.Part(text="agent-1 instruction")]
+            ),
+            tools=[tool],
+        )
+    )
+    assert evaluation_run.labels == {
+        "vertex-ai-evaluation-agent-engine-id": "456",
+        "label1": "value1",
+    }
+    assert evaluation_run.error is None


def test_create_eval_run_data_source_bigquery_request_set(client):
@@ -132,6 +178,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
        ),
        labels={"label1": "value1"},
        dest=GCS_DEST,
+        metrics=[UNIVERSAL_AR_METRIC],
    )
    assert isinstance(evaluation_run, types.EvaluationRun)
    assert evaluation_run.display_name == "test5"
@@ -152,6 +199,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
        output_config=genai_types.OutputConfig(
            gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
        ),
+        metrics=[UNIVERSAL_AR_METRIC],
    )
    assert evaluation_run.inference_configs is None
    assert evaluation_run.labels == {
@@ -160,101 +208,92 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
    assert evaluation_run.error is None


-# Test fails in replay mode because of the timestamp issue
+# Test fails in replay mode because of UUID generation mismatch.
# def test_create_eval_run_data_source_evaluation_dataset(client):
#     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with EvaluationDataset."""
-#     input_df = pd.DataFrame(
-#         {
-#             "prompt": ["prompt1", "prompt2"],
-#             "reference": ["reference1", "reference2"],
-#             "response": ["response1", "response2"],
-#             "intermediate_events": [
-#                 [
-#                     {
-#                         "content": {
-#                             "parts": [
-#                                 {"text": "first user input"},
-#                             ],
-#                             "role": "user",
-#                         },
-#                     },
-#                     {
-#                         "content": {
-#                             "parts": [
-#                                 {"text": "first model response"},
-#                             ],
-#                             "role": "model",
-#                         },
-#                     },
-#                 ],
-#                 [
-#                     {
-#                         "content": {
-#                             "parts": [
-#                                 {"text": "second user input"},
-#                             ],
-#                             "role": "user",
-#                         },
-#                     },
-#                     {
-#                         "content": {
-#                             "parts": [
-#                                 {"text": "second model response"},
-#                             ],
-#                             "role": "model",
-#                         },
-#                     },
-#                 ],
-#             ],
-#         }
-#     )
#     evaluation_run = client.evals.create_evaluation_run(
#         name="test6",
#         display_name="test6",
#         dataset=types.EvaluationDataset(
#             candidate_name="candidate_1",
-#             eval_dataset_df=input_df,
+#             eval_dataset_df=INPUT_DF,
#         ),
-#         dest="gs://lakeyk-limited-bucket/eval_run_output",
+#         dest=GCS_DEST,
+#         metrics=[UNIVERSAL_AR_METRIC],
#     )
#     assert isinstance(evaluation_run, types.EvaluationRun)
#     assert evaluation_run.display_name == "test6"
#     assert evaluation_run.state == types.EvaluationRunState.PENDING
-#     assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
-#     # Check evaluation set
-#     assert evaluation_run.data_source.evaluation_set
-#     eval_set = client.evals.get_evaluation_set(
-#         name=evaluation_run.data_source.evaluation_set
+#     check_evaluation_run_data_source(client, evaluation_run)
+#     assert evaluation_run.error is None
+
+
+# Test fails in replay mode because of UUID generation mismatch.
+# def test_create_eval_run_data_source_pandas_dataframe(client):
233+ # """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with Pandas DataFrame."""
+#     evaluation_run = client.evals.create_evaluation_run(
+#         dataset=INPUT_DF,
+#         dest=GCS_DEST,
+#         metrics=[UNIVERSAL_AR_METRIC],
+#     )
+#     assert isinstance(evaluation_run, types.EvaluationRun)
+#     assert evaluation_run.state == types.EvaluationRunState.PENDING
+#     check_evaluation_run_data_source(client, evaluation_run)
+#     assert evaluation_run.error is None
+
+# Test fails in replay mode because of UUID generation mismatch.
+# def test_create_eval_run_data_source_evaluation_dataset_dict(client):
246+ # """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with Pandas DataFrame."""
+#     eval_dataset_dict = {
+#         "candidate_name": "candidate_1",
+#         "eval_dataset_df": INPUT_DF,
+#     }
+#     evaluation_run = client.evals.create_evaluation_run(
+#         dataset=eval_dataset_dict,
+#         dest=GCS_DEST,
+#         metrics=[UNIVERSAL_AR_METRIC],
#     )
-#     assert len(eval_set.evaluation_items) == 2
-#     # Check evaluation items
-#     for i, eval_item_name in enumerate(eval_set.evaluation_items):
-#         eval_item = client.evals.get_evaluation_item(name=eval_item_name)
-#         assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST
-#         assert eval_item.evaluation_request.prompt.text == input_df.iloc[i]["prompt"]
-#         assert (
-#             eval_item.evaluation_request.candidate_responses[0].text
-#             == input_df.iloc[i]["response"]
-#         )
-#         assert (
-#             eval_item.evaluation_request.candidate_responses[0].events[0].parts[0].text
-#             == input_df.iloc[i]["intermediate_events"][0]["content"]["parts"][0]["text"]
-#         )
-#         assert (
-#             eval_item.evaluation_request.candidate_responses[0].events[0].role
-#             == input_df.iloc[i]["intermediate_events"][0]["content"]["role"]
-#         )
-#         assert (
-#             eval_item.evaluation_request.candidate_responses[0].events[1].parts[0].text
-#             == input_df.iloc[i]["intermediate_events"][1]["content"]["parts"][0]["text"]
-#         )
-#         assert (
-#             eval_item.evaluation_request.candidate_responses[0].events[1].role
-#             == input_df.iloc[i]["intermediate_events"][1]["content"]["role"]
-#         )
+#     assert isinstance(evaluation_run, types.EvaluationRun)
+#     assert evaluation_run.state == types.EvaluationRunState.PENDING
+#     check_evaluation_run_data_source(client, evaluation_run)
#     assert evaluation_run.error is None


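# One possible way to re-enable the replay tests above (a sketch only, assuming the
# mismatch comes from uuid.uuid4-generated evaluation-set IDs; the SDK may derive the
# IDs elsewhere) is to pin UUID generation to a deterministic sequence with a pytest
# fixture, e.g.:
#
#     import itertools
#     import uuid
#
#     @pytest.fixture
#     def deterministic_uuid(monkeypatch):
#         counter = itertools.count()
#         monkeypatch.setattr(uuid, "uuid4", lambda: uuid.UUID(int=next(counter)))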
+def check_evaluation_run_data_source(client, evaluation_run):
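+    """Verifies the run's evaluation set and its evaluation items against INPUT_DF."""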
+    assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
+    # Check evaluation set
+    assert evaluation_run.data_source.evaluation_set
+    eval_set = client.evals.get_evaluation_set(
+        name=evaluation_run.data_source.evaluation_set
+    )
+    assert len(eval_set.evaluation_items) == 2
+    # Check evaluation items
+    for i, eval_item_name in enumerate(eval_set.evaluation_items):
+        eval_item = client.evals.get_evaluation_item(name=eval_item_name)
+        assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST
+        assert eval_item.evaluation_request.prompt.text == INPUT_DF.iloc[i]["prompt"]
+        assert (
+            eval_item.evaluation_request.candidate_responses[0].text
+            == INPUT_DF.iloc[i]["response"]
+        )
+        assert (
+            eval_item.evaluation_request.candidate_responses[0].events[0].parts[0].text
+            == INPUT_DF.iloc[i]["intermediate_events"][0]["content"]["parts"][0]["text"]
+        )
+        assert (
+            eval_item.evaluation_request.candidate_responses[0].events[0].role
+            == INPUT_DF.iloc[i]["intermediate_events"][0]["content"]["role"]
+        )
+        assert (
+            eval_item.evaluation_request.candidate_responses[0].events[1].parts[0].text
+            == INPUT_DF.iloc[i]["intermediate_events"][1]["content"]["parts"][0]["text"]
+        )
+        assert (
+            eval_item.evaluation_request.candidate_responses[0].events[1].role
+            == INPUT_DF.iloc[i]["intermediate_events"][1]["content"]["role"]
+        )
+
+
pytest_plugins = ("pytest_asyncio",)


@@ -276,6 +315,7 @@ async def test_create_eval_run_async(client):
            )
        ),
        dest=GCS_DEST,
+        metrics=[UNIVERSAL_AR_METRIC],
    )
    assert isinstance(evaluation_run, types.EvaluationRun)
    assert evaluation_run.display_name == "test8"
@@ -292,6 +332,7 @@ async def test_create_eval_run_async(client):
        output_config=genai_types.OutputConfig(
            gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
        ),
+        metrics=[UNIVERSAL_AR_METRIC],
    )
    assert evaluation_run.error is None
    assert evaluation_run.inference_configs is None