#
# pylint: disable=protected-access,bad-continuation,missing-function-docstring

+import pandas as pd
+
from tests.unit.vertexai.genai.replays import pytest_helper
from vertexai import types
from google.genai import types as genai_types
        )
    ),
)
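+# Shared dataset used by the (currently disabled) DataFrame-based tests below and by
+# check_evaluation_run_data_source().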
+INPUT_DF = pd.DataFrame(
+    {
+        "prompt": ["prompt1", "prompt2"],
+        "reference": ["reference1", "reference2"],
+        "response": ["response1", "response2"],
+        "intermediate_events": [
+            [
+                {
+                    "content": {
+                        "parts": [
+                            {"text": "first user input"},
+                        ],
+                        "role": "user",
+                    },
+                },
+                {
+                    "content": {
+                        "parts": [
+                            {"text": "first model response"},
+                        ],
+                        "role": "model",
+                    },
+                },
+            ],
+            [
+                {
+                    "content": {
+                        "parts": [
+                            {"text": "second user input"},
+                        ],
+                        "role": "user",
+                    },
+                },
+                {
+                    "content": {
+                        "parts": [
+                            {"text": "second model response"},
+                        ],
+                        "role": "model",
+                    },
+                },
+            ],
+        ],
+    }
+)


-# TODO(b/431231205): Re-enable once Unified Metrics are in prod.
-# def test_create_eval_run_data_source_evaluation_set(client):
-#     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun."""
-#     client._api_client._http_options.api_version = "v1beta1"
-#     tool = genai_types.Tool(
-#         function_declarations=[
-#             genai_types.FunctionDeclaration(
-#                 name="get_weather",
-#                 description="Get weather in a location",
-#                 parameters={
-#                     "type": "object",
-#                     "properties": {"location": {"type": "string"}},
-#                 },
-#             )
-#         ]
-#     )
-#     evaluation_run = client.evals.create_evaluation_run(
-#         name="test4",
-#         display_name="test4",
-#         dataset=types.EvaluationRunDataSource(
-#             evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
-#         ),
-#         dest=GCS_DEST,
-#         metrics=[
-#             UNIVERSAL_AR_METRIC,
-#             types.RubricMetric.FINAL_RESPONSE_QUALITY,
-#             LLM_METRIC
-#         ],
-#         agent_info=types.AgentInfo(
-#             agent="project/123/locations/us-central1/reasoningEngines/456",
-#             name="agent-1",
-#             instruction="agent-1 instruction",
-#             tool_declarations=[tool],
-#         ),
-#         labels={"label1": "value1"},
-#     )
-#     assert isinstance(evaluation_run, types.EvaluationRun)
-#     assert evaluation_run.display_name == "test4"
-#     assert evaluation_run.state == types.EvaluationRunState.PENDING
-#     assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
-#     assert evaluation_run.data_source.evaluation_set == (
-#         "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
-#     )
-#     assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
-#         output_config=genai_types.OutputConfig(
-#             gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
-#         ),
-#         metrics=[UNIVERSAL_AR_METRIC, FINAL_RESPONSE_QUALITY_METRIC, LLM_METRIC],
-#     )
-#     assert evaluation_run.inference_configs[
-#         "agent-1"
-#     ] == types.EvaluationRunInferenceConfig(
-#         agent_config=types.EvaluationRunAgentConfig(
-#             developer_instruction=genai_types.Content(
-#                 parts=[genai_types.Part(text="agent-1 instruction")]
-#             ),
-#             tools=[tool],
-#         )
-#     )
-#     assert evaluation_run.labels == {
-#         "vertex-ai-evaluation-agent-engine-id": "456",
-#         "label1": "value1",
-#     }
-#     assert evaluation_run.error is None
+def test_create_eval_run_data_source_evaluation_set(client):
+    """Tests that create_evaluation_run() creates a correctly structured EvaluationRun."""
+    client._api_client._http_options.api_version = "v1beta1"
+    tool = genai_types.Tool(
+        function_declarations=[
+            genai_types.FunctionDeclaration(
+                name="get_weather",
+                description="Get weather in a location",
+                parameters={
+                    "type": "object",
+                    "properties": {"location": {"type": "string"}},
+                },
+            )
+        ]
+    )
+    evaluation_run = client.evals.create_evaluation_run(
+        name="test4",
+        display_name="test4",
+        dataset=types.EvaluationRunDataSource(
+            evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
+        ),
+        dest=GCS_DEST,
+        metrics=[
+            UNIVERSAL_AR_METRIC,
+            types.RubricMetric.FINAL_RESPONSE_QUALITY,
+            LLM_METRIC,
+        ],
+        agent_info=types.evals.AgentInfo(
+            agent="project/123/locations/us-central1/reasoningEngines/456",
+            name="agent-1",
+            instruction="agent-1 instruction",
+            tool_declarations=[tool],
+        ),
+        labels={"label1": "value1"},
+    )
+    assert isinstance(evaluation_run, types.EvaluationRun)
+    assert evaluation_run.display_name == "test4"
+    assert evaluation_run.state == types.EvaluationRunState.PENDING
+    assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
+    assert evaluation_run.data_source.evaluation_set == (
+        "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
+    )
+    assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
+        output_config=genai_types.OutputConfig(
+            gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
+        ),
+        metrics=[UNIVERSAL_AR_METRIC, FINAL_RESPONSE_QUALITY_METRIC, LLM_METRIC],
+    )
+    assert evaluation_run.inference_configs[
+        "agent-1"
+    ] == types.EvaluationRunInferenceConfig(
+        agent_config=types.EvaluationRunAgentConfig(
+            developer_instruction=genai_types.Content(
+                parts=[genai_types.Part(text="agent-1 instruction")]
+            ),
+            tools=[tool],
+        )
+    )
+    assert evaluation_run.labels == {
+        "vertex-ai-evaluation-agent-engine-id": "456",
+        "label1": "value1",
+    }
+    assert evaluation_run.error is None


def test_create_eval_run_data_source_bigquery_request_set(client):
@@ -132,6 +178,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
        ),
        labels={"label1": "value1"},
        dest=GCS_DEST,
+        metrics=[UNIVERSAL_AR_METRIC],
    )
    assert isinstance(evaluation_run, types.EvaluationRun)
    assert evaluation_run.display_name == "test5"
@@ -152,6 +199,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
        output_config=genai_types.OutputConfig(
            gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
        ),
+        metrics=[UNIVERSAL_AR_METRIC],
    )
    assert evaluation_run.inference_configs is None
    assert evaluation_run.labels == {
@@ -160,101 +208,92 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
    assert evaluation_run.error is None


-# Test fails in replay mode because of the timestamp issue
+# Test fails in replay mode because of UUID generation mismatch.
# def test_create_eval_run_data_source_evaluation_dataset(client):
#     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with EvaluationDataset."""
-#     input_df = pd.DataFrame(
-#         {
-#             "prompt": ["prompt1", "prompt2"],
-#             "reference": ["reference1", "reference2"],
-#             "response": ["response1", "response2"],
-#             "intermediate_events": [
-#                 [
-#                     {
-#                         "content": {
-#                             "parts": [
-#                                 {"text": "first user input"},
-#                             ],
-#                             "role": "user",
-#                         },
-#                     },
-#                     {
-#                         "content": {
-#                             "parts": [
-#                                 {"text": "first model response"},
-#                             ],
-#                             "role": "model",
-#                         },
-#                     },
-#                 ],
-#                 [
-#                     {
-#                         "content": {
-#                             "parts": [
-#                                 {"text": "second user input"},
-#                             ],
-#                             "role": "user",
-#                         },
-#                     },
-#                     {
-#                         "content": {
-#                             "parts": [
-#                                 {"text": "second model response"},
-#                             ],
-#                             "role": "model",
-#                         },
-#                     },
-#                 ],
-#             ],
-#         }
-#     )
#     evaluation_run = client.evals.create_evaluation_run(
#         name="test6",
#         display_name="test6",
#         dataset=types.EvaluationDataset(
#             candidate_name="candidate_1",
-#             eval_dataset_df=input_df,
+#             eval_dataset_df=INPUT_DF,
#         ),
-#         dest="gs://lakeyk-limited-bucket/eval_run_output",
+#         dest=GCS_DEST,
+#         metrics=[UNIVERSAL_AR_METRIC],
#     )
#     assert isinstance(evaluation_run, types.EvaluationRun)
#     assert evaluation_run.display_name == "test6"
#     assert evaluation_run.state == types.EvaluationRunState.PENDING
-#     assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
-#     # Check evaluation set
-#     assert evaluation_run.data_source.evaluation_set
-#     eval_set = client.evals.get_evaluation_set(
-#         name=evaluation_run.data_source.evaluation_set
+#     check_evaluation_run_data_source(client, evaluation_run)
+#     assert evaluation_run.error is None
+
+
+# Test fails in replay mode because of UUID generation mismatch.
+# def test_create_eval_run_data_source_pandas_dataframe(client):
233+ # """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with Pandas DataFrame."""
+#     evaluation_run = client.evals.create_evaluation_run(
+#         dataset=INPUT_DF,
+#         dest=GCS_DEST,
+#         metrics=[UNIVERSAL_AR_METRIC],
+#     )
+#     assert isinstance(evaluation_run, types.EvaluationRun)
+#     assert evaluation_run.state == types.EvaluationRunState.PENDING
+#     check_evaluation_run_data_source(client, evaluation_run)
+#     assert evaluation_run.error is None
+
+# Test fails in replay mode because of UUID generation mismatch.
+# def test_create_eval_run_data_source_evaluation_dataset_dict(client):
246+ # """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with Pandas DataFrame."""
+#     eval_dataset_dict = {
+#         "candidate_name": "candidate_1",
+#         "eval_dataset_df": INPUT_DF,
+#     }
+#     evaluation_run = client.evals.create_evaluation_run(
+#         dataset=eval_dataset_dict,
+#         dest=GCS_DEST,
+#         metrics=[UNIVERSAL_AR_METRIC],
#     )
-#     assert len(eval_set.evaluation_items) == 2
-#     # Check evaluation items
-#     for i, eval_item_name in enumerate(eval_set.evaluation_items):
-#         eval_item = client.evals.get_evaluation_item(name=eval_item_name)
-#         assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST
-#         assert eval_item.evaluation_request.prompt.text == input_df.iloc[i]["prompt"]
-#         assert (
-#             eval_item.evaluation_request.candidate_responses[0].text
-#             == input_df.iloc[i]["response"]
-#         )
-#         assert (
-#             eval_item.evaluation_request.candidate_responses[0].events[0].parts[0].text
-#             == input_df.iloc[i]["intermediate_events"][0]["content"]["parts"][0]["text"]
-#         )
-#         assert (
-#             eval_item.evaluation_request.candidate_responses[0].events[0].role
-#             == input_df.iloc[i]["intermediate_events"][0]["content"]["role"]
-#         )
-#         assert (
-#             eval_item.evaluation_request.candidate_responses[0].events[1].parts[0].text
-#             == input_df.iloc[i]["intermediate_events"][1]["content"]["parts"][0]["text"]
-#         )
-#         assert (
-#             eval_item.evaluation_request.candidate_responses[0].events[1].role
-#             == input_df.iloc[i]["intermediate_events"][1]["content"]["role"]
-#         )
+#     assert isinstance(evaluation_run, types.EvaluationRun)
+#     assert evaluation_run.state == types.EvaluationRunState.PENDING
+#     check_evaluation_run_data_source(client, evaluation_run)
#     assert evaluation_run.error is None


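# One possible way to re-enable the replay tests above (a sketch only, assuming the
# mismatch comes from uuid.uuid4-generated evaluation-set IDs; the SDK may derive the
# IDs elsewhere) is to pin UUID generation to a deterministic sequence with a pytest
# fixture, e.g.:
#
#     import itertools
#     import uuid
#
#     @pytest.fixture
#     def deterministic_uuid(monkeypatch):
#         counter = itertools.count()
#         monkeypatch.setattr(uuid, "uuid4", lambda: uuid.UUID(int=next(counter)))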
+def check_evaluation_run_data_source(client, evaluation_run):
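+    """Verifies the run's evaluation set and its evaluation items against INPUT_DF."""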
+    assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
+    # Check evaluation set
+    assert evaluation_run.data_source.evaluation_set
+    eval_set = client.evals.get_evaluation_set(
+        name=evaluation_run.data_source.evaluation_set
+    )
+    assert len(eval_set.evaluation_items) == 2
+    # Check evaluation items
+    for i, eval_item_name in enumerate(eval_set.evaluation_items):
+        eval_item = client.evals.get_evaluation_item(name=eval_item_name)
+        assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST
+        assert eval_item.evaluation_request.prompt.text == INPUT_DF.iloc[i]["prompt"]
+        assert (
+            eval_item.evaluation_request.candidate_responses[0].text
+            == INPUT_DF.iloc[i]["response"]
+        )
+        assert (
+            eval_item.evaluation_request.candidate_responses[0].events[0].parts[0].text
+            == INPUT_DF.iloc[i]["intermediate_events"][0]["content"]["parts"][0]["text"]
+        )
+        assert (
+            eval_item.evaluation_request.candidate_responses[0].events[0].role
+            == INPUT_DF.iloc[i]["intermediate_events"][0]["content"]["role"]
+        )
+        assert (
+            eval_item.evaluation_request.candidate_responses[0].events[1].parts[0].text
+            == INPUT_DF.iloc[i]["intermediate_events"][1]["content"]["parts"][0]["text"]
+        )
+        assert (
+            eval_item.evaluation_request.candidate_responses[0].events[1].role
+            == INPUT_DF.iloc[i]["intermediate_events"][1]["content"]["role"]
+        )
+
+
pytest_plugins = ("pytest_asyncio",)


@@ -276,6 +315,7 @@ async def test_create_eval_run_async(client):
            )
        ),
        dest=GCS_DEST,
+        metrics=[UNIVERSAL_AR_METRIC],
    )
    assert isinstance(evaluation_run, types.EvaluationRun)
    assert evaluation_run.display_name == "test8"
@@ -292,6 +332,7 @@ async def test_create_eval_run_async(client):
        output_config=genai_types.OutputConfig(
            gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
        ),
+        metrics=[UNIVERSAL_AR_METRIC],
    )
    assert evaluation_run.error is None
    assert evaluation_run.inference_configs is None