
Commit 741c6ad

vertex-sdk-bot authored and copybara-github committed
fix: GenAI Client(evals) - Support EvaluationDataset output from run_inference as input dataset in create_evaluation_run in Vertex AI GenAI SDK evals
PiperOrigin-RevId: 826069078
1 parent a018a19 commit 741c6ad
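
The change, in short: an EvaluationDataset produced by run_inference can be passed straight to create_evaluation_run as its dataset, metrics becomes a required argument, and run_inference no longer hard-codes candidate_name="agent" for agent runs. A minimal, hedged sketch of the intended flow follows; the project, bucket, reasoning-engine resource name, and DataFrame columns are placeholders, and the client/import setup is assumed rather than taken from this diff.

    import pandas as pd

    import vertexai
    from vertexai._genai import types  # assumed import path for the evals types

    # Placeholder project/location; client construction may differ in your setup.
    client = vertexai.Client(project="my-project", location="us-central1")

    prompts_df = pd.DataFrame({"prompt": ["What is the weather in Boston?"]})

    # run_inference against an agent returns an EvaluationDataset; per this commit
    # its candidate_name is left unset instead of being forced to "agent".
    inference_result = client.evals.run_inference(
        agent="projects/123/locations/us-central1/reasoningEngines/456",  # placeholder
        src=prompts_df,
    )

    # That EvaluationDataset is now accepted directly as the input dataset.
    evaluation_run = client.evals.create_evaluation_run(
        display_name="agent-eval",
        dataset=inference_result,
        dest="gs://my-bucket/eval_run_output",  # placeholder GCS output prefix
        metrics=[types.RubricMetric.FINAL_RESPONSE_QUALITY],  # metrics is now required
        agent_info=types.evals.AgentInfo(
            agent="projects/123/locations/us-central1/reasoningEngines/456",
            name="agent-1",
            instruction="agent-1 instruction",
        ),
    )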

File tree

8 files changed (+135, -101 lines)


tests/unit/vertexai/genai/replays/test_create_evaluation_run.py

Lines changed: 70 additions & 66 deletions
@@ -48,70 +48,69 @@
 )


-# TODO(b/431231205): Re-enable once Unified Metrics are in prod.
-# def test_create_eval_run_data_source_evaluation_set(client):
-#     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun."""
-#     client._api_client._http_options.api_version = "v1beta1"
-#     tool = genai_types.Tool(
-#         function_declarations=[
-#             genai_types.FunctionDeclaration(
-#                 name="get_weather",
-#                 description="Get weather in a location",
-#                 parameters={
-#                     "type": "object",
-#                     "properties": {"location": {"type": "string"}},
-#                 },
-#             )
-#         ]
-#     )
-#     evaluation_run = client.evals.create_evaluation_run(
-#         name="test4",
-#         display_name="test4",
-#         dataset=types.EvaluationRunDataSource(
-#             evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
-#         ),
-#         dest=GCS_DEST,
-#         metrics=[
-#             UNIVERSAL_AR_METRIC,
-#             types.RubricMetric.FINAL_RESPONSE_QUALITY,
-#             LLM_METRIC
-#         ],
-#         agent_info=types.AgentInfo(
-#             agent="project/123/locations/us-central1/reasoningEngines/456",
-#             name="agent-1",
-#             instruction="agent-1 instruction",
-#             tool_declarations=[tool],
-#         ),
-#         labels={"label1": "value1"},
-#     )
-#     assert isinstance(evaluation_run, types.EvaluationRun)
-#     assert evaluation_run.display_name == "test4"
-#     assert evaluation_run.state == types.EvaluationRunState.PENDING
-#     assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
-#     assert evaluation_run.data_source.evaluation_set == (
-#         "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
-#     )
-#     assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
-#         output_config=genai_types.OutputConfig(
-#             gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
-#         ),
-#         metrics=[UNIVERSAL_AR_METRIC, FINAL_RESPONSE_QUALITY_METRIC, LLM_METRIC],
-#     )
-#     assert evaluation_run.inference_configs[
-#         "agent-1"
-#     ] == types.EvaluationRunInferenceConfig(
-#         agent_config=types.EvaluationRunAgentConfig(
-#             developer_instruction=genai_types.Content(
-#                 parts=[genai_types.Part(text="agent-1 instruction")]
-#             ),
-#             tools=[tool],
-#         )
-#     )
-#     assert evaluation_run.labels == {
-#         "vertex-ai-evaluation-agent-engine-id": "456",
-#         "label1": "value1",
-#     }
-#     assert evaluation_run.error is None
+def test_create_eval_run_data_source_evaluation_set(client):
+    """Tests that create_evaluation_run() creates a correctly structured EvaluationRun."""
+    client._api_client._http_options.api_version = "v1beta1"
+    tool = genai_types.Tool(
+        function_declarations=[
+            genai_types.FunctionDeclaration(
+                name="get_weather",
+                description="Get weather in a location",
+                parameters={
+                    "type": "object",
+                    "properties": {"location": {"type": "string"}},
+                },
+            )
+        ]
+    )
+    evaluation_run = client.evals.create_evaluation_run(
+        name="test4",
+        display_name="test4",
+        dataset=types.EvaluationRunDataSource(
+            evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
+        ),
+        dest=GCS_DEST,
+        metrics=[
+            UNIVERSAL_AR_METRIC,
+            types.RubricMetric.FINAL_RESPONSE_QUALITY,
+            LLM_METRIC,
+        ],
+        agent_info=types.evals.AgentInfo(
+            agent="project/123/locations/us-central1/reasoningEngines/456",
+            name="agent-1",
+            instruction="agent-1 instruction",
+            tool_declarations=[tool],
+        ),
+        labels={"label1": "value1"},
+    )
+    assert isinstance(evaluation_run, types.EvaluationRun)
+    assert evaluation_run.display_name == "test4"
+    assert evaluation_run.state == types.EvaluationRunState.PENDING
+    assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
+    assert evaluation_run.data_source.evaluation_set == (
+        "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
+    )
+    assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
+        output_config=genai_types.OutputConfig(
+            gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
+        ),
+        metrics=[UNIVERSAL_AR_METRIC, FINAL_RESPONSE_QUALITY_METRIC, LLM_METRIC],
+    )
+    assert evaluation_run.inference_configs[
+        "agent-1"
+    ] == types.EvaluationRunInferenceConfig(
+        agent_config=types.EvaluationRunAgentConfig(
+            developer_instruction=genai_types.Content(
+                parts=[genai_types.Part(text="agent-1 instruction")]
+            ),
+            tools=[tool],
+        )
+    )
+    assert evaluation_run.labels == {
+        "vertex-ai-evaluation-agent-engine-id": "456",
+        "label1": "value1",
+    }
+    assert evaluation_run.error is None


 def test_create_eval_run_data_source_bigquery_request_set(client):
@@ -132,6 +131,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
         ),
         labels={"label1": "value1"},
         dest=GCS_DEST,
+        metrics=[UNIVERSAL_AR_METRIC],
     )
     assert isinstance(evaluation_run, types.EvaluationRun)
     assert evaluation_run.display_name == "test5"
@@ -152,6 +152,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
         output_config=genai_types.OutputConfig(
             gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
         ),
+        metrics=[UNIVERSAL_AR_METRIC],
     )
     assert evaluation_run.inference_configs is None
     assert evaluation_run.labels == {
@@ -160,7 +161,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
     assert evaluation_run.error is None


-# Test fails in replay mode because of the timestamp issue
+# Test fails in replay mode because of UUID generation mismatch.
 # def test_create_eval_run_data_source_evaluation_dataset(client):
 #     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with EvaluationDataset."""
 #     input_df = pd.DataFrame(
@@ -215,7 +216,8 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
 #             candidate_name="candidate_1",
 #             eval_dataset_df=input_df,
 #         ),
-#         dest="gs://lakeyk-limited-bucket/eval_run_output",
+#         dest=GCS_DEST,
+#         metrics=[UNIVERSAL_AR_METRIC],
 #     )
 #     assert isinstance(evaluation_run, types.EvaluationRun)
 #     assert evaluation_run.display_name == "test6"
@@ -276,6 +278,7 @@ async def test_create_eval_run_async(client):
             )
         ),
         dest=GCS_DEST,
+        metrics=[UNIVERSAL_AR_METRIC],
     )
     assert isinstance(evaluation_run, types.EvaluationRun)
     assert evaluation_run.display_name == "test8"
@@ -292,6 +295,7 @@ async def test_create_eval_run_async(client):
         output_config=genai_types.OutputConfig(
             gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
         ),
+        metrics=[UNIVERSAL_AR_METRIC],
     )
     assert evaluation_run.error is None
     assert evaluation_run.inference_configs is None

tests/unit/vertexai/genai/replays/test_evaluate_instances.py

Lines changed: 1 addition & 1 deletion
@@ -246,7 +246,7 @@ def test_run_inference_with_agent(client):
         agent="projects/977012026409/locations/us-central1/reasoningEngines/7188347537655332864",
         src=test_df,
     )
-    assert inference_result.candidate_name == "agent"
+    assert inference_result.candidate_name is None
     assert inference_result.gcs_source is None


tests/unit/vertexai/genai/test_evals.py

Lines changed: 2 additions & 2 deletions
@@ -1132,7 +1132,7 @@ def test_run_inference_with_agent_engine_and_session_inputs_dict(
             }
         ),
     )
-    assert inference_result.candidate_name == "agent"
+    assert inference_result.candidate_name is None
     assert inference_result.gcs_source is None

 @mock.patch.object(_evals_metric_loaders, "EvalDatasetLoader")
@@ -1211,7 +1211,7 @@ def test_run_inference_with_agent_engine_and_session_inputs_literal_string(
             }
         ),
     )
-    assert inference_result.candidate_name == "agent"
+    assert inference_result.candidate_name is None
     assert inference_result.gcs_source is None

 @mock.patch.object(_evals_utils, "EvalDatasetLoader")

vertexai/_genai/_evals_common.py

Lines changed: 0 additions & 1 deletion
@@ -798,7 +798,6 @@ def _execute_inference(

         evaluation_dataset = types.EvaluationDataset(
             eval_dataset_df=results_df,
-            candidate_name="agent",
         )
     else:
         raise ValueError("Either model or agent_engine must be provided.")

vertexai/_genai/_evals_data_converters.py

Lines changed: 0 additions & 4 deletions
@@ -366,10 +366,6 @@ def convert(self, raw_data: list[dict[str, Any]]) -> types.EvaluationDataset:

         intermediate_events: Optional[list[types.Event]] = None
         if intermediate_events_data:
-            logger.warning(
-                "intermediate_events attribute is experimental and may change in "
-                "future versions."
-            )
             if isinstance(intermediate_events_data, list):
                 intermediate_events = []
                 for event in intermediate_events_data:

vertexai/_genai/_evals_visualization.py

Lines changed: 34 additions & 1 deletion
@@ -280,7 +280,7 @@ def _get_evaluation_html(eval_result_json: str) -> str:

             // If we have agent info, render as trace
             if(agentInfo) {{
-                let traceHtml = `<div class="trace-event-row"><div class="name"><span class="icon">🏃</span>agent_run</div></div>`;
+                let traceHtml = `<div class="trace-event-row"><div class="name"><span class="icon">🤖</span>agent_run</div></div>`;
                 eventsArray.forEach(event => {{
                     if (event.content && event.content.parts && event.content.parts.length > 0) {{
                         event.content.parts.forEach(part => {{
@@ -1073,3 +1073,36 @@ def display_evaluation_dataset(eval_dataset_obj: types.EvaluationDataset) -> None:
     dataframe_json_string = json.dumps(processed_rows, ensure_ascii=False, default=str)
     html_content = _get_inference_html(dataframe_json_string)
     display.display(display.HTML(html_content))
+
+
+def _get_status_html(status: str, error_message: Optional[str] = None) -> str:
+    """Returns a simple HTML string for displaying a status and optional error."""
+    error_html = ""
+    if error_message:
+        error_html = f"""
+        <p>
+          <b>Error:</b>
+          <pre style="white-space: pre-wrap; word-wrap: break-word;">{error_message}</pre>
+        </p>
+        """
+
+    return f"""
+    <div>
+      <p><b>Status:</b> {status}</p>
+      {error_html}
+    </div>
+    """
+
+
+def display_evaluation_run_status(eval_run_obj: "types.EvaluationRun") -> None:
+    """Displays the status of an evaluation run in an IPython environment."""
+    if not _is_ipython_env():
+        logger.warning("Skipping display: not in an IPython environment.")
+        return
+    else:
+        from IPython import display
+
+    status = eval_run_obj.state.name if eval_run_obj.state else "UNKNOWN"
+    error_message = str(eval_run_obj.error) if eval_run_obj.error else None
+    html_content = _get_status_html(status, error_message)
+    display.display(display.HTML(html_content))
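
For context on the helpers added above, a tiny hedged usage sketch; the run object and error text are placeholders, and outside an IPython environment the display call simply logs a warning and returns.

    from vertexai._genai import _evals_visualization  # module patched above

    # In a notebook, render a compact status card for an evaluation run object
    # (eval_run is assumed to come from client.evals.create_evaluation_run(...)):
    # _evals_visualization.display_evaluation_run_status(eval_run)

    # _get_status_html can be exercised without IPython; the error text below is
    # a made-up placeholder, not output from the SDK.
    print(
        _evals_visualization._get_status_html(
            "FAILED", error_message="code: 3, message: invalid dataset"
        )
    )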

vertexai/_genai/evals.py

Lines changed: 26 additions & 22 deletions
@@ -1334,11 +1334,9 @@ def create_evaluation_run(
         *,
         dataset: Union[types.EvaluationRunDataSource, types.EvaluationDataset],
         dest: str,
+        metrics: list[types.EvaluationRunMetricOrDict],
         name: Optional[str] = None,
         display_name: Optional[str] = None,
-        metrics: Optional[
-            list[types.EvaluationRunMetricOrDict]
-        ] = None,  # TODO: Make required unified metrics available in prod.
         agent_info: Optional[types.evals.AgentInfoOrDict] = None,
         labels: Optional[dict[str, str]] = None,
         config: Optional[types.CreateEvaluationRunConfigOrDict] = None,
@@ -1348,25 +1346,32 @@
         Args:
             dataset: The dataset to evaluate. Either an EvaluationRunDataSource or an EvaluationDataset.
             dest: The GCS URI prefix to write the evaluation results to.
+            metrics: The list of metrics to evaluate.
             name: The name of the evaluation run.
             display_name: The display name of the evaluation run.
-            metrics: The list of metrics to evaluate.
             agent_info: The agent info to evaluate.
             labels: The labels to apply to the evaluation run.
             config: The configuration for the evaluation run.

         Returns:
             The created evaluation run.
         """
+        if agent_info and isinstance(agent_info, dict):
+            agent_info = types.evals.AgentInfo.model_validate(agent_info)
         if type(dataset).__name__ == "EvaluationDataset":
-            logger.warning(
-                "EvaluationDataset input is experimental and may change in future versions."
-            )
             if dataset.eval_dataset_df is None:
                 raise ValueError(
                     "EvaluationDataset must have eval_dataset_df populated."
                 )
-            if dataset.candidate_name is None and agent_info:
+            if (
+                dataset.candidate_name
+                and agent_info.name
+                and dataset.candidate_name != agent_info.name
+            ):
+                logger.warning(
+                    "Evaluation dataset candidate_name and agent_info.name are different. Please make sure this is intended."
+                )
+            elif dataset.candidate_name is None and agent_info:
                 dataset.candidate_name = agent_info.name
             eval_set = _evals_common._create_evaluation_set_from_dataframe(
                 self._api_client, dest, dataset.eval_dataset_df, dataset.candidate_name
@@ -1383,9 +1388,6 @@ def create_evaluation_run(
         )
         inference_configs = {}
         if agent_info:
-            logger.warning(
-                "The agent_info field is experimental and may change in future versions."
-            )
             if isinstance(agent_info, dict):
                 agent_info = types.evals.AgentInfo.model_validate(agent_info)
             if (
@@ -2187,11 +2189,9 @@ async def create_evaluation_run(
         *,
         dataset: Union[types.EvaluationRunDataSource, types.EvaluationDataset],
         dest: str,
+        metrics: list[types.EvaluationRunMetricOrDict],
         name: Optional[str] = None,
         display_name: Optional[str] = None,
-        metrics: Optional[
-            list[types.EvaluationRunMetricOrDict]
-        ] = None,  # TODO: Make required unified metrics available in prod.
         agent_info: Optional[types.evals.AgentInfo] = None,
         labels: Optional[dict[str, str]] = None,
         config: Optional[types.CreateEvaluationRunConfigOrDict] = None,
@@ -2201,25 +2201,32 @@
         Args:
             dataset: The dataset to evaluate. Either an EvaluationRunDataSource or an EvaluationDataset.
             dest: The GCS URI prefix to write the evaluation results to.
+            metrics: The list of metrics to evaluate.
             name: The name of the evaluation run.
             display_name: The display name of the evaluation run.
-            metrics: The list of metrics to evaluate.
             agent_info: The agent info to evaluate.
             labels: The labels to apply to the evaluation run.
             config: The configuration for the evaluation run.

         Returns:
             The created evaluation run.
         """
+        if agent_info and isinstance(agent_info, dict):
+            agent_info = types.evals.AgentInfo.model_validate(agent_info)
         if type(dataset).__name__ == "EvaluationDataset":
-            logger.warning(
-                "EvaluationDataset input is experimental and may change in future versions."
-            )
             if dataset.eval_dataset_df is None:
                 raise ValueError(
                     "EvaluationDataset must have eval_dataset_df populated."
                 )
-            if dataset.candidate_name is None and agent_info:
+            if (
+                dataset.candidate_name
+                and agent_info.name
+                and dataset.candidate_name != agent_info.name
+            ):
+                logger.warning(
+                    "Evaluation dataset candidate_name and agent_info.name are different. Please make sure this is intended."
+                )
+            elif dataset.candidate_name is None and agent_info:
                 dataset.candidate_name = agent_info.name
             eval_set = _evals_common._create_evaluation_set_from_dataframe(
                 self._api_client, dest, dataset.eval_dataset_df, dataset.candidate_name
@@ -2236,9 +2243,6 @@ async def create_evaluation_run(
         )
         inference_configs = {}
         if agent_info:
-            logger.warning(
-                "The agent_info field is experimental and may change in future versions."
-            )
             if isinstance(agent_info, dict):
                 agent_info = types.evals.AgentInfo.model_validate(agent_info)
             if (
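
To spell out the candidate_name reconciliation introduced above, here is a hedged sketch of the three paths the rewritten block takes when an EvaluationDataset is passed together with agent_info; the DataFrame columns, names, and import path are placeholders and assumptions, not values from this diff.

    import pandas as pd

    from vertexai._genai import types  # assumed import path

    df = pd.DataFrame({"prompt": ["hi"], "response": ["hello"]})  # placeholder data

    agent_info = types.evals.AgentInfo(
        agent="projects/123/locations/us-central1/reasoningEngines/456",  # placeholder
        name="agent-1",
        instruction="agent-1 instruction",
    )

    # 1) candidate_name unset: create_evaluation_run copies agent_info.name onto
    #    the dataset before building the evaluation set.
    ds_unset = types.EvaluationDataset(eval_dataset_df=df)

    # 2) candidate_name equal to agent_info.name: used as-is, no warning.
    ds_match = types.EvaluationDataset(eval_dataset_df=df, candidate_name="agent-1")

    # 3) candidate_name different from agent_info.name: kept as-is, but a warning
    #    is logged asking you to confirm the mismatch is intended.
    ds_mismatch = types.EvaluationDataset(eval_dataset_df=df, candidate_name="candidate_1")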
