 )


-# TODO(b/431231205): Re-enable once Unified Metrics are in prod.
-# def test_create_eval_run_data_source_evaluation_set(client):
-#     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun."""
-#     client._api_client._http_options.api_version = "v1beta1"
-#     tool = genai_types.Tool(
-#         function_declarations=[
-#             genai_types.FunctionDeclaration(
-#                 name="get_weather",
-#                 description="Get weather in a location",
-#                 parameters={
-#                     "type": "object",
-#                     "properties": {"location": {"type": "string"}},
-#                 },
-#             )
-#         ]
-#     )
-#     evaluation_run = client.evals.create_evaluation_run(
-#         name="test4",
-#         display_name="test4",
-#         dataset=types.EvaluationRunDataSource(
-#             evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
-#         ),
-#         dest=GCS_DEST,
-#         metrics=[
-#             UNIVERSAL_AR_METRIC,
-#             types.RubricMetric.FINAL_RESPONSE_QUALITY,
-#             LLM_METRIC
-#         ],
-#         agent_info=types.AgentInfo(
-#             agent="project/123/locations/us-central1/reasoningEngines/456",
-#             name="agent-1",
-#             instruction="agent-1 instruction",
-#             tool_declarations=[tool],
-#         ),
-#         labels={"label1": "value1"},
-#     )
-#     assert isinstance(evaluation_run, types.EvaluationRun)
-#     assert evaluation_run.display_name == "test4"
-#     assert evaluation_run.state == types.EvaluationRunState.PENDING
-#     assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
-#     assert evaluation_run.data_source.evaluation_set == (
-#         "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
-#     )
-#     assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
-#         output_config=genai_types.OutputConfig(
-#             gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
-#         ),
-#         metrics=[UNIVERSAL_AR_METRIC, FINAL_RESPONSE_QUALITY_METRIC, LLM_METRIC],
-#     )
-#     assert evaluation_run.inference_configs[
-#         "agent-1"
-#     ] == types.EvaluationRunInferenceConfig(
-#         agent_config=types.EvaluationRunAgentConfig(
-#             developer_instruction=genai_types.Content(
-#                 parts=[genai_types.Part(text="agent-1 instruction")]
-#             ),
-#             tools=[tool],
-#         )
-#     )
-#     assert evaluation_run.labels == {
-#         "vertex-ai-evaluation-agent-engine-id": "456",
-#         "label1": "value1",
-#     }
-#     assert evaluation_run.error is None
+def test_create_eval_run_data_source_evaluation_set(client):
+    """Tests that create_evaluation_run() creates a correctly structured EvaluationRun."""
+    client._api_client._http_options.api_version = "v1beta1"
+    tool = genai_types.Tool(
+        function_declarations=[
+            genai_types.FunctionDeclaration(
+                name="get_weather",
+                description="Get weather in a location",
+                parameters={
+                    "type": "object",
+                    "properties": {"location": {"type": "string"}},
+                },
+            )
+        ]
+    )
+    evaluation_run = client.evals.create_evaluation_run(
+        name="test4",
+        display_name="test4",
+        dataset=types.EvaluationRunDataSource(
+            evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
+        ),
+        dest=GCS_DEST,
+        metrics=[
+            UNIVERSAL_AR_METRIC,
+            types.RubricMetric.FINAL_RESPONSE_QUALITY,
+            LLM_METRIC,
+        ],
+        agent_info=types.evals.AgentInfo(
+            agent="project/123/locations/us-central1/reasoningEngines/456",
+            name="agent-1",
+            instruction="agent-1 instruction",
+            tool_declarations=[tool],
+        ),
+        labels={"label1": "value1"},
+    )
+    assert isinstance(evaluation_run, types.EvaluationRun)
+    assert evaluation_run.display_name == "test4"
+    assert evaluation_run.state == types.EvaluationRunState.PENDING
+    assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
+    assert evaluation_run.data_source.evaluation_set == (
+        "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
+    )
+    assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
+        output_config=genai_types.OutputConfig(
+            gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
+        ),
+        metrics=[UNIVERSAL_AR_METRIC, FINAL_RESPONSE_QUALITY_METRIC, LLM_METRIC],
+    )
+    assert evaluation_run.inference_configs[
+        "agent-1"
+    ] == types.EvaluationRunInferenceConfig(
+        agent_config=types.EvaluationRunAgentConfig(
+            developer_instruction=genai_types.Content(
+                parts=[genai_types.Part(text="agent-1 instruction")]
+            ),
+            tools=[tool],
+        )
+    )
+    assert evaluation_run.labels == {
+        "vertex-ai-evaluation-agent-engine-id": "456",
+        "label1": "value1",
+    }
+    assert evaluation_run.error is None


 def test_create_eval_run_data_source_bigquery_request_set(client):
@@ -132,6 +131,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
         ),
         labels={"label1": "value1"},
         dest=GCS_DEST,
+        metrics=[UNIVERSAL_AR_METRIC],
     )
     assert isinstance(evaluation_run, types.EvaluationRun)
     assert evaluation_run.display_name == "test5"
@@ -152,6 +152,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
         output_config=genai_types.OutputConfig(
             gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
         ),
+        metrics=[UNIVERSAL_AR_METRIC],
     )
     assert evaluation_run.inference_configs is None
     assert evaluation_run.labels == {
@@ -160,7 +161,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
     assert evaluation_run.error is None


-# Test fails in replay mode because of the timestamp issue
+# Test fails in replay mode because of UUID generation mismatch.
 # def test_create_eval_run_data_source_evaluation_dataset(client):
 #     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with EvaluationDataset."""
 #     input_df = pd.DataFrame(
@@ -215,7 +216,8 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
 #             candidate_name="candidate_1",
 #             eval_dataset_df=input_df,
 #         ),
-#         dest="gs://lakeyk-limited-bucket/eval_run_output",
+#         dest=GCS_DEST,
+#         metrics=[UNIVERSAL_AR_METRIC],
 #     )
 #     assert isinstance(evaluation_run, types.EvaluationRun)
 #     assert evaluation_run.display_name == "test6"
@@ -276,6 +278,7 @@ async def test_create_eval_run_async(client):
             )
         ),
         dest=GCS_DEST,
+        metrics=[UNIVERSAL_AR_METRIC],
     )
     assert isinstance(evaluation_run, types.EvaluationRun)
     assert evaluation_run.display_name == "test8"
@@ -292,6 +295,7 @@ async def test_create_eval_run_async(client):
         output_config=genai_types.OutputConfig(
             gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
         ),
+        metrics=[UNIVERSAL_AR_METRIC],
     )
     assert evaluation_run.error is None
     assert evaluation_run.inference_configs is None