
Commit 3b23d88

vertex-sdk-bot authored and copybara-github committed
feat: GenAI Client(evals) - Add pd.DataFrame as input for dataset in create_evaluation_run in Vertex AI GenAI SDK evals
PiperOrigin-RevId: 825658094
1 parent 35ac4c5 commit 3b23d88
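
Usage sketch (not part of the commit): based on the tests in this diff, the change lets a pandas DataFrame be passed directly as the `dataset` argument of `client.evals.create_evaluation_run`. The client construction, project, location, and GCS bucket below are assumed placeholders, not values from the commit.

# Illustrative sketch only; project, location, and bucket are assumed placeholders.
import pandas as pd
import vertexai
from vertexai import types

client = vertexai.Client(project="my-project", location="us-central1")  # assumed setup

# A plain DataFrame can now be passed directly as `dataset`.
input_df = pd.DataFrame(
    {
        "prompt": ["prompt1", "prompt2"],
        "response": ["response1", "response2"],
    }
)

evaluation_run = client.evals.create_evaluation_run(
    dataset=input_df,
    dest="gs://my-bucket/eval_run_output",  # assumed destination bucket
    metrics=[types.RubricMetric.FINAL_RESPONSE_QUALITY],
)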

File tree

6 files changed: +286 −219 lines changed


tests/unit/vertexai/genai/replays/test_create_evaluation_run.py

Lines changed: 184 additions & 143 deletions
@@ -14,6 +14,8 @@
 #
 # pylint: disable=protected-access,bad-continuation,missing-function-docstring

+import pandas as pd
+
 from tests.unit.vertexai.genai.replays import pytest_helper
 from vertexai import types
 from google.genai import types as genai_types
@@ -46,72 +48,116 @@
         )
     ),
 )
+INPUT_DF = pd.DataFrame(
+    {
+        "prompt": ["prompt1", "prompt2"],
+        "reference": ["reference1", "reference2"],
+        "response": ["response1", "response2"],
+        "intermediate_events": [
+            [
+                {
+                    "content": {
+                        "parts": [
+                            {"text": "first user input"},
+                        ],
+                        "role": "user",
+                    },
+                },
+                {
+                    "content": {
+                        "parts": [
+                            {"text": "first model response"},
+                        ],
+                        "role": "model",
+                    },
+                },
+            ],
+            [
+                {
+                    "content": {
+                        "parts": [
+                            {"text": "second user input"},
+                        ],
+                        "role": "user",
+                    },
+                },
+                {
+                    "content": {
+                        "parts": [
+                            {"text": "second model response"},
+                        ],
+                        "role": "model",
+                    },
+                },
+            ],
+        ],
+    }
+)


-# TODO(b/431231205): Re-enable once Unified Metrics are in prod.
-# def test_create_eval_run_data_source_evaluation_set(client):
-#     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun."""
-#     client._api_client._http_options.api_version = "v1beta1"
-#     tool = genai_types.Tool(
-#         function_declarations=[
-#             genai_types.FunctionDeclaration(
-#                 name="get_weather",
-#                 description="Get weather in a location",
-#                 parameters={
-#                     "type": "object",
-#                     "properties": {"location": {"type": "string"}},
-#                 },
-#             )
-#         ]
-#     )
-#     evaluation_run = client.evals.create_evaluation_run(
-#         name="test4",
-#         display_name="test4",
-#         dataset=types.EvaluationRunDataSource(
-#             evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
-#         ),
-#         dest=GCS_DEST,
-#         metrics=[
-#             UNIVERSAL_AR_METRIC,
-#             types.RubricMetric.FINAL_RESPONSE_QUALITY,
-#             LLM_METRIC
-#         ],
-#         agent_info=types.AgentInfo(
-#             agent="project/123/locations/us-central1/reasoningEngines/456",
-#             name="agent-1",
-#             instruction="agent-1 instruction",
-#             tool_declarations=[tool],
-#         ),
-#         labels={"label1": "value1"},
-#     )
-#     assert isinstance(evaluation_run, types.EvaluationRun)
-#     assert evaluation_run.display_name == "test4"
-#     assert evaluation_run.state == types.EvaluationRunState.PENDING
-#     assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
-#     assert evaluation_run.data_source.evaluation_set == (
-#         "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
-#     )
-#     assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
-#         output_config=genai_types.OutputConfig(
-#             gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
-#         ),
-#         metrics=[UNIVERSAL_AR_METRIC, FINAL_RESPONSE_QUALITY_METRIC, LLM_METRIC],
-#     )
-#     assert evaluation_run.inference_configs[
-#         "agent-1"
-#     ] == types.EvaluationRunInferenceConfig(
-#         agent_config=types.EvaluationRunAgentConfig(
-#             developer_instruction=genai_types.Content(
-#                 parts=[genai_types.Part(text="agent-1 instruction")]
-#             ),
-#             tools=[tool],
-#         )
-#     )
-#     assert evaluation_run.labels == {
-#         "vertex-ai-evaluation-agent-engine-id": "456",
-#         "label1": "value1",
-#     }
-#     assert evaluation_run.error is None
+def test_create_eval_run_data_source_evaluation_set(client):
+    """Tests that create_evaluation_run() creates a correctly structured EvaluationRun."""
+    client._api_client._http_options.api_version = "v1beta1"
+    tool = genai_types.Tool(
+        function_declarations=[
+            genai_types.FunctionDeclaration(
+                name="get_weather",
+                description="Get weather in a location",
+                parameters={
+                    "type": "object",
+                    "properties": {"location": {"type": "string"}},
+                },
+            )
+        ]
+    )
+    evaluation_run = client.evals.create_evaluation_run(
+        name="test4",
+        display_name="test4",
+        dataset=types.EvaluationRunDataSource(
+            evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
+        ),
+        dest=GCS_DEST,
+        metrics=[
+            UNIVERSAL_AR_METRIC,
+            types.RubricMetric.FINAL_RESPONSE_QUALITY,
+            LLM_METRIC,
+        ],
+        agent_info=types.evals.AgentInfo(
+            agent="project/123/locations/us-central1/reasoningEngines/456",
+            name="agent-1",
+            instruction="agent-1 instruction",
+            tool_declarations=[tool],
+        ),
+        labels={"label1": "value1"},
+    )
+    assert isinstance(evaluation_run, types.EvaluationRun)
+    assert evaluation_run.display_name == "test4"
+    assert evaluation_run.state == types.EvaluationRunState.PENDING
+    assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
+    assert evaluation_run.data_source.evaluation_set == (
+        "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
+    )
+    assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
+        output_config=genai_types.OutputConfig(
+            gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
+        ),
+        metrics=[UNIVERSAL_AR_METRIC, FINAL_RESPONSE_QUALITY_METRIC, LLM_METRIC],
+    )
+    assert evaluation_run.inference_configs[
+        "agent-1"
+    ] == types.EvaluationRunInferenceConfig(
+        agent_config=types.EvaluationRunAgentConfig(
+            developer_instruction=genai_types.Content(
+                parts=[genai_types.Part(text="agent-1 instruction")]
+            ),
+            tools=[tool],
+        )
+    )
+    assert evaluation_run.labels == {
+        "vertex-ai-evaluation-agent-engine-id": "456",
+        "label1": "value1",
+    }
+    assert evaluation_run.error is None


 def test_create_eval_run_data_source_bigquery_request_set(client):
@@ -132,6 +178,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
         ),
         labels={"label1": "value1"},
         dest=GCS_DEST,
+        metrics=[UNIVERSAL_AR_METRIC],
     )
     assert isinstance(evaluation_run, types.EvaluationRun)
     assert evaluation_run.display_name == "test5"
@@ -152,6 +199,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
         output_config=genai_types.OutputConfig(
            gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
         ),
+        metrics=[UNIVERSAL_AR_METRIC],
     )
     assert evaluation_run.inference_configs is None
     assert evaluation_run.labels == {
@@ -160,101 +208,92 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
     assert evaluation_run.error is None


-# Test fails in replay mode because of the timestamp issue
+# Test fails in replay mode because of UUID generation mismatch.
 # def test_create_eval_run_data_source_evaluation_dataset(client):
 #     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with EvaluationDataset."""
-#     input_df = pd.DataFrame(
-#         {
-#             "prompt": ["prompt1", "prompt2"],
-#             "reference": ["reference1", "reference2"],
-#             "response": ["response1", "response2"],
-#             "intermediate_events": [
-#                 [
-#                     {
-#                         "content": {
-#                             "parts": [
-#                                 {"text": "first user input"},
-#                             ],
-#                             "role": "user",
-#                         },
-#                     },
-#                     {
-#                         "content": {
-#                             "parts": [
-#                                 {"text": "first model response"},
-#                             ],
-#                             "role": "model",
-#                         },
-#                     },
-#                 ],
-#                 [
-#                     {
-#                         "content": {
-#                             "parts": [
-#                                 {"text": "second user input"},
-#                             ],
-#                             "role": "user",
-#                         },
-#                     },
-#                     {
-#                         "content": {
-#                             "parts": [
-#                                 {"text": "second model response"},
-#                             ],
-#                             "role": "model",
-#                         },
-#                     },
-#                 ],
-#             ],
-#         }
-#     )
 #     evaluation_run = client.evals.create_evaluation_run(
 #         name="test6",
 #         display_name="test6",
 #         dataset=types.EvaluationDataset(
 #             candidate_name="candidate_1",
-#             eval_dataset_df=input_df,
+#             eval_dataset_df=INPUT_DF,
 #         ),
-#         dest="gs://lakeyk-limited-bucket/eval_run_output",
+#         dest=GCS_DEST,
+#         metrics=[UNIVERSAL_AR_METRIC],
 #     )
 #     assert isinstance(evaluation_run, types.EvaluationRun)
 #     assert evaluation_run.display_name == "test6"
 #     assert evaluation_run.state == types.EvaluationRunState.PENDING
-#     assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
-#     # Check evaluation set
-#     assert evaluation_run.data_source.evaluation_set
-#     eval_set = client.evals.get_evaluation_set(
-#         name=evaluation_run.data_source.evaluation_set
+#     check_evaluation_run_data_source(client, evaluation_run)
+#     assert evaluation_run.error is None
+
+
+# Test fails in replay mode because of UUID generation mismatch.
+# def test_create_eval_run_data_source_pandas_dataframe(client):
+#     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with Pandas DataFrame."""
+#     evaluation_run = client.evals.create_evaluation_run(
+#         dataset=INPUT_DF,
+#         dest=GCS_DEST,
+#         metrics=[UNIVERSAL_AR_METRIC],
+#     )
+#     assert isinstance(evaluation_run, types.EvaluationRun)
+#     assert evaluation_run.state == types.EvaluationRunState.PENDING
+#     check_evaluation_run_data_source(client, evaluation_run)
+#     assert evaluation_run.error is None
+
+# Test fails in replay mode because of UUID generation mismatch.
+# def test_create_eval_run_data_source_evaluation_dataset_dict(client):
+#     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with Pandas DataFrame."""
+#     eval_dataset_dict = {
+#         "candidate_name": "candidate_1",
+#         "eval_dataset_df": INPUT_DF,
+#     }
+#     evaluation_run = client.evals.create_evaluation_run(
+#         dataset=eval_dataset_dict,
+#         dest=GCS_DEST,
+#         metrics=[UNIVERSAL_AR_METRIC],
 #     )
-#     assert len(eval_set.evaluation_items) == 2
-#     # Check evaluation items
-#     for i, eval_item_name in enumerate(eval_set.evaluation_items):
-#         eval_item = client.evals.get_evaluation_item(name=eval_item_name)
-#         assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST
-#         assert eval_item.evaluation_request.prompt.text == input_df.iloc[i]["prompt"]
-#         assert (
-#             eval_item.evaluation_request.candidate_responses[0].text
-#             == input_df.iloc[i]["response"]
-#         )
-#         assert (
-#             eval_item.evaluation_request.candidate_responses[0].events[0].parts[0].text
-#             == input_df.iloc[i]["intermediate_events"][0]["content"]["parts"][0]["text"]
-#         )
-#         assert (
-#             eval_item.evaluation_request.candidate_responses[0].events[0].role
-#             == input_df.iloc[i]["intermediate_events"][0]["content"]["role"]
-#         )
-#         assert (
-#             eval_item.evaluation_request.candidate_responses[0].events[1].parts[0].text
-#             == input_df.iloc[i]["intermediate_events"][1]["content"]["parts"][0]["text"]
-#         )
-#         assert (
-#             eval_item.evaluation_request.candidate_responses[0].events[1].role
-#             == input_df.iloc[i]["intermediate_events"][1]["content"]["role"]
-#         )
+#     assert isinstance(evaluation_run, types.EvaluationRun)
+#     assert evaluation_run.state == types.EvaluationRunState.PENDING
+#     check_evaluation_run_data_source(client, evaluation_run)
 #     assert evaluation_run.error is None


+def check_evaluation_run_data_source(client, evaluation_run):
+    assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
+    # Check evaluation set
+    assert evaluation_run.data_source.evaluation_set
+    eval_set = client.evals.get_evaluation_set(
+        name=evaluation_run.data_source.evaluation_set
+    )
+    assert len(eval_set.evaluation_items) == 2
+    # Check evaluation items
+    for i, eval_item_name in enumerate(eval_set.evaluation_items):
+        eval_item = client.evals.get_evaluation_item(name=eval_item_name)
+        assert eval_item.evaluation_item_type == types.EvaluationItemType.REQUEST
+        assert eval_item.evaluation_request.prompt.text == INPUT_DF.iloc[i]["prompt"]
+        assert (
+            eval_item.evaluation_request.candidate_responses[0].text
+            == INPUT_DF.iloc[i]["response"]
+        )
+        assert (
+            eval_item.evaluation_request.candidate_responses[0].events[0].parts[0].text
+            == INPUT_DF.iloc[i]["intermediate_events"][0]["content"]["parts"][0]["text"]
+        )
+        assert (
+            eval_item.evaluation_request.candidate_responses[0].events[0].role
+            == INPUT_DF.iloc[i]["intermediate_events"][0]["content"]["role"]
+        )
+        assert (
+            eval_item.evaluation_request.candidate_responses[0].events[1].parts[0].text
+            == INPUT_DF.iloc[i]["intermediate_events"][1]["content"]["parts"][0]["text"]
+        )
+        assert (
+            eval_item.evaluation_request.candidate_responses[0].events[1].role
+            == INPUT_DF.iloc[i]["intermediate_events"][1]["content"]["role"]
+        )
+
+
 pytest_plugins = ("pytest_asyncio",)


@@ -276,6 +315,7 @@ async def test_create_eval_run_async(client):
             )
         ),
         dest=GCS_DEST,
+        metrics=[UNIVERSAL_AR_METRIC],
     )
     assert isinstance(evaluation_run, types.EvaluationRun)
     assert evaluation_run.display_name == "test8"
@@ -292,6 +332,7 @@ async def test_create_eval_run_async(client):
         output_config=genai_types.OutputConfig(
            gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
         ),
+        metrics=[UNIVERSAL_AR_METRIC],
     )
     assert evaluation_run.error is None
     assert evaluation_run.inference_configs is None
