Skip to content

Commit dbe80fd

Browse files
jsondai authored and copybara-github committed
chore: GenAI Client(evals) - Add class methods for parsing raw Agent Session history into the new AgentData structure. Add agent_resource_name attribute to AgentConfig and loading methods.
feat: GenAI Client(evals) - Add 3 new multi-turn predefined metrics for agent evaluation (`MULTI_TURN_TOOL_USE_QUALITY`, `MULTI_TURN_TRAJECTORY_QUALITY`, `MULTI_TURN_TASK_SUCCESS`).
chore: GenAI Client(evals) - Update evaluation data converters and metric handlers to natively support `AgentData` in `EvaluationDataset` and `EvalCase`.
chore: GenAI Client(evals) - Map `agent_data` to `agent_eval_data` in Vertex REST payload generation.
PiperOrigin-RevId: 869945268
1 parent e5f71de commit dbe80fd

File tree

9 files changed

+579
-86
lines changed

9 files changed

+579
-86
lines changed

tests/unit/vertexai/genai/replays/test_evaluate.py

Lines changed: 259 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
from tests.unit.vertexai.genai.replays import pytest_helper
1818
from vertexai._genai import types
19+
from google.genai import types as genai_types
1920
import pandas as pd
2021

2122

@@ -96,6 +97,264 @@ def test_evaluation_byor(client):
9697
assert case_result.response_candidate_results is not None
9798

9899

100+
def test_evaluation_agent_data(client):
    """Evaluates a hand-built multi-agent conversation supplied as AgentData.

    The case describes a coordinator agent plus a flight and a hotel
    specialist, exchanged over two conversation turns, and is scored with the
    three MULTI_TURN_* rubric metrics. Only the structure of the returned
    EvaluationResult is asserted, not the score values.
    """
    # Target the autopush sandbox endpoint on the v1beta1 API surface.
    http_options = client._api_client._http_options
    http_options.base_url = "https://autopush-aiplatform.sandbox.googleapis.com/"
    http_options.api_version = "v1beta1"

    def one_function_tool(name, description):
        """Returns a tool list with one Tool declaring a single function."""
        declaration = genai_types.FunctionDeclaration(
            name=name, description=description
        )
        return [genai_types.Tool(function_declarations=[declaration])]

    def single_part_event(author, role, **part_kwargs):
        """Returns an AgentEvent whose content holds exactly one Part."""
        content = genai_types.Content(
            role=role, parts=[genai_types.Part(**part_kwargs)]
        )
        return types.evals.AgentEvent(author=author, content=content)

    def text_event(author, role, text):
        """Returns a plain-text event."""
        return single_part_event(author, role, text=text)

    def call_event(author, name, args):
        """Returns a model-role event carrying one function call."""
        return single_part_event(
            author,
            "model",
            function_call=genai_types.FunctionCall(name=name, args=args),
        )

    def response_event(author, name, response):
        """Returns a tool-role event carrying one function response."""
        return single_part_event(
            author,
            "tool",
            function_response=genai_types.FunctionResponse(
                name=name, response=response
            ),
        )

    # Agent topology: a router that hands off to two specialists.
    coordinator = types.evals.AgentConfig(
        agent_id="coordinator",
        agent_type="RouterAgent",
        description="Root agent that delegates to specialists.",
        instruction=(
            "You are a travel coordinator. Delegate flight tasks to"
            " 'flight_bot' and hotel tasks to 'hotel_bot'."
        ),
        sub_agents=["flight_bot", "hotel_bot"],
        tools=one_function_tool(
            "delegate_to_agent", "Delegates conversation to a sub-agent."
        ),
    )
    flight_bot = types.evals.AgentConfig(
        agent_id="flight_bot",
        agent_type="SpecialistAgent",
        description="Handles flight searches.",
        instruction="Search for flights using the available tools.",
        tools=one_function_tool(
            "search_flights", "Finds flights based on origin and destination."
        ),
    )
    hotel_bot = types.evals.AgentConfig(
        agent_id="hotel_bot",
        agent_type="SpecialistAgent",
        description="Handles hotel searches.",
        instruction="Search for hotels using the available tools.",
        tools=one_function_tool(
            "search_hotels", "Finds hotels in a given location."
        ),
    )

    # Turn 0: the flight request is delegated to flight_bot and answered.
    flight_turn = types.evals.ConversationTurn(
        turn_index=0,
        events=[
            text_event(
                "user", "user", "I need to book a flight to NYC for next Monday."
            ),
            call_event(
                "coordinator", "delegate_to_agent", {"agent_name": "flight_bot"}
            ),
            call_event(
                "flight_bot",
                "search_flights",
                {"destination": "NYC", "date": "next Monday"},
            ),
            response_event(
                "flight_bot",
                "search_flights",
                {"flights": [{"id": "UA100", "price": "$300"}]},
            ),
            text_event(
                "flight_bot", "model", "I found flight UA100 to NYC for $300."
            ),
        ],
    )

    # Turn 1: the hotel follow-up is delegated to hotel_bot and answered.
    hotel_turn = types.evals.ConversationTurn(
        turn_index=1,
        events=[
            text_event(
                "user", "user", "Great, book that. I also need a hotel there."
            ),
            call_event(
                "coordinator", "delegate_to_agent", {"agent_name": "hotel_bot"}
            ),
            call_event("hotel_bot", "search_hotels", {"location": "NYC"}),
            response_event(
                "hotel_bot",
                "search_hotels",
                {"hotels": [{"name": "Central Park Hotel", "rating": 4.5}]},
            ),
            text_event(
                "hotel_bot", "model", "I recommend the Central Park Hotel."
            ),
        ],
    )

    agent_data = types.evals.AgentData(
        agents={
            "coordinator": coordinator,
            "flight_bot": flight_bot,
            "hotel_bot": hotel_bot,
        },
        turns=[flight_turn, hotel_turn],
    )

    # A single EvalCase wrapped in a dataset is all the evaluate call needs.
    eval_dataset = types.EvaluationDataset(
        eval_cases=[types.EvalCase(agent_data=agent_data)]
    )
    metrics = [
        types.RubricMetric.MULTI_TURN_TOOL_USE_QUALITY,
        types.RubricMetric.MULTI_TURN_TRAJECTORY_QUALITY,
        types.RubricMetric.MULTI_TURN_TASK_SUCCESS,
    ]

    evaluation_result = client.evals.evaluate(dataset=eval_dataset, metrics=metrics)

    assert isinstance(evaluation_result, types.EvaluationResult)

    # Aggregate section: present, non-empty, every entry named and scored.
    summaries = evaluation_result.summary_metrics
    assert summaries is not None
    assert len(summaries) > 0
    for aggregated in summaries:
        assert isinstance(aggregated, types.AggregatedMetricResult)
        assert aggregated.metric_name is not None
        assert aggregated.mean_score is not None

    # Per-case section: present, non-empty, every entry indexed with results.
    per_case = evaluation_result.eval_case_results
    assert per_case is not None
    assert len(per_case) > 0
    for case_result in per_case:
        assert isinstance(case_result, types.EvalCaseResult)
        assert case_result.eval_case_index is not None
        assert case_result.response_candidate_results is not None
357+
99358
pytestmark = pytest_helper.setup(
100359
file=__file__,
101360
globals_for_file=globals(),

tests/unit/vertexai/genai/replays/test_evaluate_instances.py

Lines changed: 0 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -105,39 +105,6 @@ def test_pointwise_metric(client):
105105
assert response.pointwise_metric_result.score is not None
106106

107107

108-
# def test_predefined_metric_with_agent_data(client):
109-
# """Tests the _evaluate_instances method with predefined metric and agent_data."""
110-
# agent_data = types.evals.AgentData(
111-
# agent_config=types.evals.AgentConfig(
112-
# tools=[
113-
# genai_types.Tool(
114-
# function_declarations=[
115-
# genai_types.FunctionDeclaration(name="search")
116-
# ]
117-
# )
118-
# ],
119-
# developer_instruction=types.evals.InstanceData(text="instruction"),
120-
# ),
121-
# events=types.evals.Events(
122-
# event=[genai_types.Content(parts=[genai_types.Part(text="hello")])]
123-
# ),
124-
# )
125-
# instance = types.EvaluationInstance(
126-
# prompt=types.evals.InstanceData(text="What is the capital of France?"),
127-
# response=types.evals.InstanceData(text="Paris"),
128-
# reference=types.evals.InstanceData(text="Paris"),
129-
# agent_data=agent_data,
130-
# )
131-
132-
# response = client.evals.evaluate_instances(
133-
# metric_config=types._EvaluateInstancesRequestParameters(
134-
# metrics=[types.Metric(name="general_quality_v1")],
135-
# instance=instance,
136-
# )
137-
# )
138-
# assert response.metric_results[0].score is not None
139-
140-
141108
def test_pairwise_metric_with_autorater(client):
142109
"""Tests the _evaluate_instances method with PairwiseMetricInput and AutoraterConfig."""
143110

vertexai/_genai/_evals_common.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1047,8 +1047,10 @@ def _resolve_dataset_inputs(
10471047
datasets_to_process = dataset
10481048
logger.info("Processing %s dataset(s).", num_response_candidates)
10491049

1050-
loaded_raw_datasets: list[list[dict[str, Any]]] = []
1051-
schemas_for_merge: list[str] = []
1050+
if len(datasets_to_process) == 1 and datasets_to_process[0].eval_cases:
1051+
return datasets_to_process[0], 1
1052+
1053+
parsed_evaluation_datasets: list[types.EvaluationDataset] = []
10521054

10531055
for i, ds_item in enumerate(datasets_to_process):
10541056
if not isinstance(ds_item, types.EvaluationDataset):
@@ -1062,17 +1064,20 @@ def _resolve_dataset_inputs(
10621064
f"Item at index {i} is not an EvaluationDataset: {type(ds_item)}"
10631065
)
10641066

1067+
if ds_item.eval_cases:
1068+
logger.info("Dataset %d already contains eval_cases.", i)
1069+
parsed_evaluation_datasets.append(ds_item)
1070+
continue
1071+
10651072
ds_source_for_loader = _get_dataset_source(ds_item)
10661073
current_loaded_data = loader.load(ds_source_for_loader)
1067-
loaded_raw_datasets.append(current_loaded_data)
10681074

10691075
if dataset_schema:
10701076
current_schema = _evals_data_converters.EvalDatasetSchema(dataset_schema)
10711077
else:
10721078
current_schema = _evals_data_converters.auto_detect_dataset_schema( # type: ignore[assignment]
10731079
current_loaded_data
10741080
)
1075-
schemas_for_merge.append(current_schema)
10761081

10771082
logger.info(
10781083
"Dataset %d: Schema: %s. Using %s converter.",
@@ -1082,13 +1087,12 @@ def _resolve_dataset_inputs(
10821087
current_schema
10831088
).__class__.__name__,
10841089
)
1090+
converter = _evals_data_converters.get_dataset_converter(current_schema)
1091+
parsed_evaluation_datasets.append(converter.convert(current_loaded_data))
10851092

1086-
processed_eval_dataset = (
1087-
_evals_data_converters.merge_response_datasets_into_canonical_format(
1088-
raw_datasets=loaded_raw_datasets,
1089-
schemas=schemas_for_merge,
1090-
agent_info=agent_info,
1091-
)
1093+
processed_eval_dataset = _evals_data_converters.merge_evaluation_datasets(
1094+
datasets=parsed_evaluation_datasets,
1095+
agent_info=agent_info,
10921096
)
10931097

10941098
if not processed_eval_dataset.eval_cases:

vertexai/_genai/_evals_constant.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@
2323
"safety_v1",
2424
"multi_turn_general_quality_v1",
2525
"multi_turn_text_quality_v1",
26+
"multi_turn_tool_use_quality_v1",
27+
"multi_turn_trajectory_quality_v1",
28+
"multi_turn_task_success_v1",
2629
"final_response_match_v2",
2730
"final_response_reference_free_v1",
2831
"final_response_quality_v1",

0 commit comments

Comments
 (0)