1616
1717from tests .unit .vertexai .genai .replays import pytest_helper
1818from vertexai import types
19- from google .genai import types as genai_types
2019import datetime
2120import pytest
2221
def test_get_eval_run(client):
    """Tests that get_evaluation_run() returns a correctly structured EvaluationRun.

    Fetches the run by its full resource name with ``include_evaluation_items=True``
    and checks both the run fields and the attached evaluation-item results
    against the values recorded in the replay.
    """
    # The replay fixture was recorded against the v1beta1 surface.
    client._api_client._http_options.api_version = "v1beta1"
    evaluation_run_name = (
        "projects/977012026409/locations/us-central1/evaluationRuns/3940878372367761408"
    )
    evaluation_run = client.evals.get_evaluation_run(
        name=evaluation_run_name, include_evaluation_items=True
    )
    check_run_3940878372367761408(client, evaluation_run, evaluation_run_name)
    check_run_3940878372367761408_evaluation_item_results(
        client, evaluation_run, evaluation_run_name
    )
3736
def test_get_eval_run_include_evaluation_items_false(client):
    """Tests that get_evaluation_run() returns a correctly structured EvaluationRun.

    Fetches the run without requesting evaluation items and verifies the run
    fields are populated while ``evaluation_item_results`` stays ``None``.
    """
    # The replay fixture was recorded against the v1beta1 surface.
    client._api_client._http_options.api_version = "v1beta1"
    evaluation_run_name = (
        "projects/977012026409/locations/us-central1/evaluationRuns/3940878372367761408"
    )
    evaluation_run = client.evals.get_evaluation_run(name=evaluation_run_name)
    check_run_3940878372367761408(client, evaluation_run, evaluation_run_name)
    # include_evaluation_items defaults to False, so no item results are attached.
    assert evaluation_run.evaluation_item_results is None
4847
4948
@@ -103,172 +102,58 @@ def test_get_eval_run_eval_set_source(client):
async def test_get_eval_run_async(client):
    """Tests that the async get_evaluation_run() returns a structured EvaluationRun.

    Passes only the short run ID (not the full resource name) to the async
    client and verifies the returned run matches the recorded replay values.
    """
    # The replay fixture was recorded against the v1beta1 surface.
    client._api_client._http_options.api_version = "v1beta1"
    eval_run_id = "3940878372367761408"
    evaluation_run_name = (
        f"projects/977012026409/locations/us-central1/evaluationRuns/{eval_run_id}"
    )
    evaluation_run = await client.aio.evals.get_evaluation_run(name=eval_run_id)
    check_run_3940878372367761408(client, evaluation_run, evaluation_run_name)
    # Items were not requested, so no item results are attached.
    assert evaluation_run.evaluation_item_results is None
113112
114113
def check_run_3940878372367761408(
    client, evaluation_run: types.EvaluationRun, evaluation_run_name: str
):
    """Asserts the run-level fields of replay run 3940878372367761408.

    Checks name, display name, metadata, timestamps, state, the evaluation-set
    resource names, the summary item count, and that no error is set — all
    against the values recorded in the replay fixture.
    """
    assert isinstance(evaluation_run, types.EvaluationRun)
    assert evaluation_run.name == evaluation_run_name
    assert (
        evaluation_run.display_name
        == "evaluation_run_9a464a39-6d40-4d4e-a5e2-a4ceabea4b15"
    )
    assert evaluation_run.metadata == {"pipeline_id": "8162140658019074048"}
    # Timestamps are parsed as timezone-aware UTC datetimes.
    assert evaluation_run.create_time == datetime.datetime(
        2026, 3, 18, 1, 10, 13, 360535, tzinfo=datetime.timezone.utc
    )
    assert evaluation_run.completion_time == datetime.datetime(
        2026, 3, 18, 1, 11, 0, 448191, tzinfo=datetime.timezone.utc
    )
    assert evaluation_run.state == types.EvaluationRunState.SUCCEEDED
    assert evaluation_run.evaluation_set_snapshot == (
        "projects/977012026409/locations/us-central1/evaluationSets/3885168317211607040"
    )
    # The input data source points at a different evaluation set than the
    # snapshot/results set.
    assert (
        evaluation_run.data_source.evaluation_set
        == "projects/977012026409/locations/us-central1/evaluationSets/3991900109943078912"
    )
    assert evaluation_run.evaluation_run_results.evaluation_set == (
        "projects/977012026409/locations/us-central1/evaluationSets/3885168317211607040"
    )
    assert evaluation_run.evaluation_run_results.summary_metrics.total_items == 2
    assert evaluation_run.error is None
197144
def check_run_3940878372367761408_evaluation_item_results(
    client, evaluation_run: types.EvaluationRun, evaluation_run_name: str
):
    """Asserts the evaluation-item results attached to run 3940878372367761408.

    Verifies the item results parse into an EvaluationResult whose aggregated
    summary metrics match the single recorded ``general_quality_v1`` metric.
    """
    eval_result = evaluation_run.evaluation_item_results
    assert isinstance(eval_result, types.EvaluationResult)
    assert eval_result.summary_metrics == [
        types.AggregatedMetricResult(
            metric_name="general_quality_v1",
            mean_score=0.13333333656191826,
            stdev_score=0.03333333507180214,
        ),
    ]
273158
274159pytestmark = pytest_helper .setup (
0 commit comments