Skip to content

Commit a19fb2d

Browse files
jsondai and copybara-github
authored and committed
chore: GenAI Client(evals) - update types.evals.AgentInfo attributes to support multi-agent systems. Refactor evaluation data handling for Agent-based evals
PiperOrigin-RevId: 886409094
1 parent 0cff2d8 commit a19fb2d

File tree

7 files changed

+348
-451
lines changed

7 files changed

+348
-451
lines changed

tests/unit/vertexai/genai/replays/test_create_evaluation_run.py

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@
6565
),
6666
)
6767
INFERENCE_CONFIG = types.EvaluationRunInferenceConfig(
68-
model="projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash"
68+
model="projects/977012026409/locations/us-central1/publishers/google/models/gemini-2.5-flash"
6969
)
7070
TOOL = genai_types.Tool(
7171
function_declarations=[
@@ -82,8 +82,14 @@
8282
AGENT_INFO = types.evals.AgentInfo(
8383
agent_resource_name="projects/123/locations/us-central1/reasoningEngines/456",
8484
name="agent-1",
85-
instruction="agent-1 instruction",
86-
tool_declarations=[TOOL],
85+
agents={
86+
"agent-1": types.evals.AgentConfig(
87+
agent_id="agent-1",
88+
instruction="agent-1 instruction",
89+
tools=[TOOL],
90+
)
91+
},
92+
root_agent_id="agent-1",
8793
)
8894
DEFAULT_PROMPT_TEMPLATE = "{prompt}"
8995
INPUT_DF_WITH_CONTEXT_AND_HISTORY = pd.DataFrame(
@@ -96,9 +102,9 @@
96102
}
97103
)
98104
CANDIDATE_NAME = "candidate_1"
99-
MODEL_NAME = "projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash"
105+
MODEL_NAME = "projects/977012026409/locations/us-central1/publishers/google/models/gemini-2.5-flash"
100106
EVAL_SET_NAME = (
101-
"projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
107+
"projects/977012026409/locations/us-central1/evaluationSets/6619939608513740800"
102108
)
103109

104110

@@ -140,12 +146,7 @@ def test_create_eval_run_data_source_evaluation_set(client):
140146
assert evaluation_run.inference_configs[
141147
AGENT_INFO.name
142148
] == types.EvaluationRunInferenceConfig(
143-
agent_config=types.EvaluationRunAgentConfig(
144-
developer_instruction=genai_types.Content(
145-
parts=[genai_types.Part(text="agent-1 instruction")]
146-
),
147-
tools=[TOOL],
148-
)
149+
agent_configs=AGENT_INFO.agents,
149150
)
150151
assert evaluation_run.labels == {
151152
"vertex-ai-evaluation-agent-engine-id": "456",

tests/unit/vertexai/genai/replays/test_get_evaluation_run.py

Lines changed: 24 additions & 139 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616

1717
from tests.unit.vertexai.genai.replays import pytest_helper
1818
from vertexai import types
19-
from google.genai import types as genai_types
2019
import datetime
2120
import pytest
2221

@@ -25,13 +24,13 @@ def test_get_eval_run(client):
2524
"""Tests that get_evaluation_run() returns a correctly structured EvaluationRun."""
2625
client._api_client._http_options.api_version = "v1beta1"
2726
evaluation_run_name = (
28-
"projects/503583131166/locations/us-central1/evaluationRuns/5133048044039700480"
27+
"projects/977012026409/locations/us-central1/evaluationRuns/3940878372367761408"
2928
)
3029
evaluation_run = client.evals.get_evaluation_run(
3130
name=evaluation_run_name, include_evaluation_items=True
3231
)
33-
check_run_5133048044039700480(client, evaluation_run, evaluation_run_name)
34-
check_run_5133048044039700480_evaluation_item_results(
32+
check_run_3940878372367761408(client, evaluation_run, evaluation_run_name)
33+
check_run_3940878372367761408_evaluation_item_results(
3534
client, evaluation_run, evaluation_run_name
3635
)
3736

@@ -40,10 +39,10 @@ def test_get_eval_run_include_evaluation_items_false(client):
4039
"""Tests that get_evaluation_run() returns a correctly structured EvaluationRun."""
4140
client._api_client._http_options.api_version = "v1beta1"
4241
evaluation_run_name = (
43-
"projects/503583131166/locations/us-central1/evaluationRuns/5133048044039700480"
42+
"projects/977012026409/locations/us-central1/evaluationRuns/3940878372367761408"
4443
)
4544
evaluation_run = client.evals.get_evaluation_run(name=evaluation_run_name)
46-
check_run_5133048044039700480(client, evaluation_run, evaluation_run_name)
45+
check_run_3940878372367761408(client, evaluation_run, evaluation_run_name)
4746
assert evaluation_run.evaluation_item_results is None
4847

4948

@@ -103,172 +102,58 @@ def test_get_eval_run_eval_set_source(client):
103102
async def test_get_eval_run_async(client):
104103
"""Tests that get_evaluation_run() returns a correctly structured EvaluationRun."""
105104
client._api_client._http_options.api_version = "v1beta1"
106-
eval_run_id = "5133048044039700480"
105+
eval_run_id = "3940878372367761408"
107106
evaluation_run_name = (
108-
f"projects/503583131166/locations/us-central1/evaluationRuns/{eval_run_id}"
107+
f"projects/977012026409/locations/us-central1/evaluationRuns/{eval_run_id}"
109108
)
110109
evaluation_run = await client.aio.evals.get_evaluation_run(name=eval_run_id)
111-
check_run_5133048044039700480(client, evaluation_run, evaluation_run_name)
110+
check_run_3940878372367761408(client, evaluation_run, evaluation_run_name)
112111
assert evaluation_run.evaluation_item_results is None
113112

114113

115-
def check_run_5133048044039700480(
114+
def check_run_3940878372367761408(
116115
client, evaluation_run: types.EvaluationRun, evaluation_run_name: str
117116
):
118117
assert isinstance(evaluation_run, types.EvaluationRun)
119118
assert evaluation_run.name == evaluation_run_name
120-
assert evaluation_run.display_name == "sdk-test-1"
121-
assert evaluation_run.metadata == {"pipeline_id": "4868043098678099968"}
119+
assert (
120+
evaluation_run.display_name
121+
== "evaluation_run_9a464a39-6d40-4d4e-a5e2-a4ceabea4b15"
122+
)
123+
assert evaluation_run.metadata == {"pipeline_id": "8162140658019074048"}
122124
assert evaluation_run.create_time == datetime.datetime(
123-
2025, 10, 21, 19, 25, 58, 669441, tzinfo=datetime.timezone.utc
125+
2026, 3, 18, 1, 10, 13, 360535, tzinfo=datetime.timezone.utc
124126
)
125127
assert evaluation_run.completion_time == datetime.datetime(
126-
2025, 10, 21, 19, 26, 15, 855568, tzinfo=datetime.timezone.utc
128+
2026, 3, 18, 1, 11, 0, 448191, tzinfo=datetime.timezone.utc
127129
)
128130
assert evaluation_run.state == types.EvaluationRunState.SUCCEEDED
129131
assert evaluation_run.evaluation_set_snapshot == (
130-
"projects/503583131166/locations/us-central1/evaluationSets/3122155626046685184"
132+
"projects/977012026409/locations/us-central1/evaluationSets/3885168317211607040"
131133
)
132134
assert (
133135
evaluation_run.data_source.evaluation_set
134-
== "projects/503583131166/locations/us-central1/evaluationSets/3122155626046685184"
136+
== "projects/977012026409/locations/us-central1/evaluationSets/3991900109943078912"
135137
)
136138
assert evaluation_run.evaluation_run_results.evaluation_set == (
137-
"projects/503583131166/locations/us-central1/evaluationSets/129513673658990592"
138-
)
139-
assert evaluation_run.inference_configs == {
140-
"gemini-2.0-flash-001@default": types.EvaluationRunInferenceConfig(
141-
agent_config=types.EvaluationRunAgentConfig(
142-
developer_instruction={
143-
"parts": [{"text": "example agent developer instruction"}]
144-
},
145-
tools=[
146-
genai_types.Tool(
147-
function_declarations=[
148-
genai_types.FunctionDeclaration(
149-
name="check_chime",
150-
description="Check chime.",
151-
parameters={
152-
"type": "OBJECT",
153-
"properties": {
154-
"nums": {
155-
"type": "STRING",
156-
"description": "List of numbers to be verified.",
157-
}
158-
},
159-
"required": ["nums"],
160-
},
161-
),
162-
],
163-
)
164-
],
165-
)
166-
),
167-
}
168-
assert evaluation_run.evaluation_run_results.summary_metrics == (
169-
types.SummaryMetric(
170-
metrics={
171-
"gemini-2.0-flash-001@default/safety_v1/VARIANCE": 0.08950617055834077,
172-
"gemini-2.0-flash-001@default/safety_v1/MAXIMUM": 1,
173-
"gemini-2.0-flash-001@default/universal/AVERAGE": 0.7888888915379842,
174-
"gemini-2.0-flash-001@default/universal/P90": 1,
175-
"gemini-2.0-flash-001@default/safety_v1/MEDIAN": 1,
176-
"gemini-2.0-flash-001@default/universal/P95": 1,
177-
"gemini-2.0-flash-001@default/universal/VARIANCE": 0.08950617055834077,
178-
"gemini-2.0-flash-001@default/universal/STANDARD_DEVIATION": 0.2991758188061675,
179-
"gemini-2.0-flash-001@default/universal/MEDIAN": 1,
180-
"gemini-2.0-flash-001@default/safety_v1/STANDARD_DEVIATION": 0.2991758188061675,
181-
"gemini-2.0-flash-001@default/universal/MODE": 1,
182-
"gemini-2.0-flash-001@default/safety_v1/MODE": 1,
183-
"gemini-2.0-flash-001@default/safety_v1/MINIMUM": 0.3333333432674408,
184-
"gemini-2.0-flash-001@default/safety_v1/P90": 1,
185-
"gemini-2.0-flash-001@default/safety_v1/P95": 1,
186-
"gemini-2.0-flash-001@default/universal/P99": 1,
187-
"gemini-2.0-flash-001@default/safety_v1/AVERAGE": 0.7888888915379842,
188-
"gemini-2.0-flash-001@default/universal/MINIMUM": 0.3333333432674408,
189-
"gemini-2.0-flash-001@default/universal/MAXIMUM": 1,
190-
"gemini-2.0-flash-001@default/safety_v1/P99": 1,
191-
},
192-
total_items=3,
193-
)
139+
"projects/977012026409/locations/us-central1/evaluationSets/3885168317211607040"
194140
)
141+
assert evaluation_run.evaluation_run_results.summary_metrics.total_items == 2
195142
assert evaluation_run.error is None
196143

197144

198-
def check_run_5133048044039700480_evaluation_item_results(
145+
def check_run_3940878372367761408_evaluation_item_results(
199146
client, evaluation_run: types.EvaluationRun, evaluation_run_name: str
200147
):
201148
eval_result = evaluation_run.evaluation_item_results
202149
assert isinstance(eval_result, types.EvaluationResult)
203150
assert eval_result.summary_metrics == [
204151
types.AggregatedMetricResult(
205-
metric_name="safety_v1",
206-
mean_score=0.7888888915379842,
207-
stdev_score=0.2991758188061675,
208-
),
209-
types.AggregatedMetricResult(
210-
metric_name="universal",
211-
mean_score=0.7888888915379842,
212-
stdev_score=0.2991758188061675,
152+
metric_name="general_quality_v1",
153+
mean_score=0.13333333656191826,
154+
stdev_score=0.03333333507180214,
213155
),
214156
]
215-
# Check the agent info.
216-
assert eval_result.agent_info == types.evals.AgentInfo(
217-
name="gemini-2.0-flash-001@default",
218-
instruction="example agent developer instruction",
219-
description=None,
220-
tool_declarations=[
221-
genai_types.Tool(
222-
function_declarations=[
223-
genai_types.FunctionDeclaration(
224-
name="check_chime",
225-
description="Check chime.",
226-
parameters={
227-
"type": "OBJECT",
228-
"properties": {
229-
"nums": {
230-
"type": "STRING",
231-
"description": "List of numbers to be verified.",
232-
}
233-
},
234-
"required": ["nums"],
235-
},
236-
),
237-
],
238-
)
239-
],
240-
)
241-
# Check the first eval case result.
242-
eval_case_result = eval_result.eval_case_results[0]
243-
assert isinstance(eval_case_result, types.EvalCaseResult)
244-
# Check the response candidate results.
245-
response_candidate_result = eval_case_result.response_candidate_results[0]
246-
assert response_candidate_result.response_index == 0
247-
universal_metric_result = response_candidate_result.metric_results["universal"]
248-
assert isinstance(universal_metric_result, types.EvalCaseMetricResult)
249-
assert universal_metric_result.metric_name == "universal"
250-
assert universal_metric_result.score > 0
251-
assert universal_metric_result.explanation is None
252-
# Check the first rubric verdict.
253-
rubric_verdict_0 = universal_metric_result.rubric_verdicts[0]
254-
assert isinstance(rubric_verdict_0, types.evals.RubricVerdict)
255-
assert rubric_verdict_0.evaluated_rubric == types.evals.Rubric(
256-
content=types.evals.RubricContent(
257-
property=types.evals.RubricContentProperty(
258-
description="The response is in English."
259-
)
260-
),
261-
importance="HIGH",
262-
type="LANGUAGE:PRIMARY_RESPONSE_LANGUAGE",
263-
)
264-
assert rubric_verdict_0.reasoning is not None
265-
assert rubric_verdict_0.verdict is True
266-
# Check the first evaluation dataset.
267-
eval_dataset = eval_result.evaluation_dataset[0]
268-
assert isinstance(eval_dataset, types.EvaluationDataset)
269-
assert eval_dataset.candidate_name == "gemini-2.0-flash-001@default"
270-
assert eval_dataset.eval_dataset_df.shape[0] == 3
271-
assert eval_dataset.eval_dataset_df.shape[1] > 3
272157

273158

274159
pytestmark = pytest_helper.setup(

0 commit comments

Comments (0)