
Commit 99a6e03

jsondai authored and copybara-github committed

chore: GenAI Client(evals) - Improve retry budget, add jitter, and expand retryable errors

PiperOrigin-RevId: 896691317
1 parent 0653ff9 commit 99a6e03
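
The tests added in this commit pin the new behavior down: the retry budget grows from three attempts to five, both 429 ClientError and 503 ServerError responses are retried, and non-retryable codes such as 400 are raised immediately. Only two of the six changed files appear in this excerpt, so the retry helper itself is not shown; the block below is a minimal sketch of a _call_with_retry consistent with those tests, assuming exponential backoff with full jitter. The backoff base and the jitter scheme are illustrative assumptions, not the shipped values.

# Minimal sketch only: the real _call_with_retry lives in
# vertexai/_genai/_evals_metric_handlers.py, which is not part of this
# excerpt. Backoff base and jitter scheme below are assumptions.
import random
import time

from google.genai import errors as genai_errors

_MAX_RETRIES = 5  # total attempts; matches fn.call_count == 5 in the tests
_BASE_DELAY_SECONDS = 1.0  # assumed base for exponential backoff
_RETRYABLE_CODES = (429, 503)  # resource exhausted / service unavailable


def _call_with_retry(fn, metric_name):
    """Calls fn(), retrying retryable API errors with jittered backoff."""
    # metric_name is presumably used for logging in the real helper.
    for attempt in range(_MAX_RETRIES):
        try:
            return fn()
        except genai_errors.APIError as e:
            retryable = e.code in _RETRYABLE_CODES or isinstance(
                e, genai_errors.ServerError
            )
            if not retryable or attempt == _MAX_RETRIES - 1:
                raise  # non-retryable (e.g. 400) or budget exhausted
            # Full jitter: sleep a random fraction of the exponential
            # delay so concurrent workers do not retry in lockstep.
            delay = _BASE_DELAY_SECONDS * (2**attempt)
            time.sleep(random.uniform(0.0, delay))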

6 files changed

Lines changed: 389 additions & 108 deletions

File tree

tests/unit/vertexai/genai/test_evals.py

Lines changed: 183 additions & 7 deletions
@@ -6103,6 +6103,8 @@ def test_predefined_metric_retry_fail_on_resource_exhausted(
             genai_errors.ClientError(code=429, response_json=error_response_json),
             genai_errors.ClientError(code=429, response_json=error_response_json),
             genai_errors.ClientError(code=429, response_json=error_response_json),
+            genai_errors.ClientError(code=429, response_json=error_response_json),
+            genai_errors.ClientError(code=429, response_json=error_response_json),
         ]
 
         result = _evals_common._execute_evaluation(
@@ -6111,18 +6113,13 @@
             metrics=[metric],
         )
 
-        assert mock_private_evaluate_instances.call_count == 3
-        assert mock_sleep.call_count == 2
+        assert mock_private_evaluate_instances.call_count == 5
+        assert mock_sleep.call_count == 4
         assert len(result.summary_metrics) == 1
         summary_metric = result.summary_metrics[0]
         assert summary_metric.metric_name == "summarization_quality"
         assert summary_metric.mean_score is None
         assert summary_metric.num_cases_error == 1
-        assert (
-            "Judge model resource exhausted after 3 retries"
-        ) in result.eval_case_results[0].response_candidate_results[0].metric_results[
-            "summarization_quality"
-        ].error_message
 
 
 class TestEvaluationDataset:
@@ -6643,3 +6640,182 @@ def test_create_evaluation_set_with_agent_data(
         candidate_response = candidate_responses[0]
         assert candidate_response["candidate"] == "test-candidate"
         assert candidate_response["agent_data"] == agent_data
+
+
+class TestRateLimiter:
+    """Tests for the RateLimiter class in _evals_utils."""
+
+    def test_rate_limiter_init(self):
+        """Tests that RateLimiter initializes correctly."""
+        limiter = _evals_utils.RateLimiter(rate=10.0)
+        assert limiter.seconds_per_event == pytest.approx(0.1)
+
+    def test_rate_limiter_invalid_rate(self):
+        """Tests that RateLimiter raises ValueError for non-positive rate."""
+        with pytest.raises(ValueError, match="Rate must be a positive number"):
+            _evals_utils.RateLimiter(rate=0)
+        with pytest.raises(ValueError, match="Rate must be a positive number"):
+            _evals_utils.RateLimiter(rate=-1)
+
+    @mock.patch("time.sleep", return_value=None)
+    @mock.patch("time.monotonic")
+    def test_rate_limiter_sleep_and_advance(self, mock_monotonic, mock_sleep):
+        """Tests that sleep_and_advance properly throttles calls."""
+        # With rate=10 (0.1s interval):
+        # - __init__ at t=0: _next_allowed = 0.0
+        # - first call at t=0: no delay, _next_allowed = 0.1
+        # - second call at t=0.01: delay = 0.1 - 0.01 = 0.09
+        mock_monotonic.side_effect = [
+            0.0,  # __init__: time.monotonic()
+            0.0,  # first sleep_and_advance: now
+            0.01,  # second sleep_and_advance: now
+        ]
+        limiter = _evals_utils.RateLimiter(rate=10.0)
+        limiter.sleep_and_advance()  # First call - should not sleep
+        limiter.sleep_and_advance()  # Second call - should sleep
+        assert mock_sleep.call_count == 1
+        # Verify sleep was called with approximately the right delay
+        sleep_delay = mock_sleep.call_args[0][0]
+        assert 0.08 < sleep_delay <= 0.1
+
+    def test_rate_limiter_no_sleep_when_enough_time_passed(self):
+        """Tests that no sleep occurs when enough time has passed."""
+        import time as real_time
+
+        limiter = _evals_utils.RateLimiter(rate=1000.0)  # Very high rate
+        # With rate=1000, interval is 0.001s - should not sleep
+        start = real_time.time()
+        for _ in range(5):
+            limiter.sleep_and_advance()
+        elapsed = real_time.time() - start
+        # 5 calls at 1000 QPS should take ~0.005s, certainly under 1s
+        assert elapsed < 1.0
+
+
+class TestCallWithRetry:
+    """Tests for the shared _call_with_retry helper."""
+
+    @mock.patch("time.sleep", return_value=None)
+    def test_call_with_retry_success_on_first_try(self, mock_sleep):
+        """Tests that _call_with_retry returns immediately on success."""
+        fn = mock.Mock(return_value="success")
+        result = _evals_metric_handlers._call_with_retry(fn, "test_metric")
+        assert result == "success"
+        assert fn.call_count == 1
+        assert mock_sleep.call_count == 0
+
+    @mock.patch("time.sleep", return_value=None)
+    def test_call_with_retry_success_after_retries(self, mock_sleep):
+        """Tests that _call_with_retry succeeds after transient failures."""
+        error_json = {"error": {"code": 429, "message": "exhausted"}}
+        fn = mock.Mock(
+            side_effect=[
+                genai_errors.ClientError(code=429, response_json=error_json),
+                genai_errors.ClientError(code=429, response_json=error_json),
+                "success",
+            ]
+        )
+        result = _evals_metric_handlers._call_with_retry(fn, "test_metric")
+        assert result == "success"
+        assert fn.call_count == 3
+        assert mock_sleep.call_count == 2
+
+    @mock.patch("time.sleep", return_value=None)
+    def test_call_with_retry_raises_after_max_retries(self, mock_sleep):
+        """Tests that _call_with_retry raises after exhausting retries."""
+        error_json = {"error": {"code": 429, "message": "exhausted"}}
+        fn = mock.Mock(
+            side_effect=genai_errors.ClientError(code=429, response_json=error_json)
+        )
+        with pytest.raises(genai_errors.ClientError):
+            _evals_metric_handlers._call_with_retry(fn, "test_metric")
+        assert fn.call_count == 5  # _MAX_RETRIES
+        assert mock_sleep.call_count == 4
+
+    @mock.patch("time.sleep", return_value=None)
+    def test_call_with_retry_retries_on_server_error(self, mock_sleep):
+        """Tests retry on 503 ServiceUnavailable (ServerError)."""
+        error_json = {"error": {"code": 503, "message": "unavailable"}}
+        fn = mock.Mock(
+            side_effect=[
+                genai_errors.ServerError(code=503, response_json=error_json),
+                "success",
+            ]
+        )
+        result = _evals_metric_handlers._call_with_retry(fn, "test_metric")
+        assert result == "success"
+        assert fn.call_count == 2
+
+    @mock.patch("time.sleep", return_value=None)
+    def test_call_with_retry_no_retry_on_non_retryable(self, mock_sleep):
+        """Tests that non-retryable errors are raised immediately."""
+        error_json = {"error": {"code": 400, "message": "bad request"}}
+        fn = mock.Mock(
+            side_effect=genai_errors.ClientError(code=400, response_json=error_json)
+        )
+        with pytest.raises(genai_errors.ClientError):
+            _evals_metric_handlers._call_with_retry(fn, "test_metric")
+        assert fn.call_count == 1
+        assert mock_sleep.call_count == 0
+
+
+class TestComputationMetricRetry:
+    """Tests for retry behavior in ComputationMetricHandler."""
+
+    @mock.patch.object(
+        _evals_metric_handlers.ComputationMetricHandler,
+        "SUPPORTED_COMPUTATION_METRICS",
+        frozenset(["bleu"]),
+    )
+    @mock.patch("time.sleep", return_value=None)
+    @mock.patch(
+        "vertexai._genai.evals.Evals.evaluate_instances"
+    )
+    def test_computation_metric_retry_on_resource_exhausted(
+        self,
+        mock_evaluate_instances,
+        mock_sleep,
+        mock_api_client_fixture,
+    ):
+        """Tests that ComputationMetricHandler retries on 429."""
+        dataset_df = pd.DataFrame(
+            [
+                {
+                    "prompt": "Test prompt",
+                    "response": "Test response",
+                    "reference": "Test reference",
+                }
+            ]
+        )
+        input_dataset = vertexai_genai_types.EvaluationDataset(
+            eval_dataset_df=dataset_df
+        )
+        metric = vertexai_genai_types.Metric(name="bleu")
+        error_response_json = {
+            "error": {
+                "code": 429,
+                "message": "Resource exhausted.",
+                "status": "RESOURCE_EXHAUSTED",
+            }
+        }
+        mock_bleu_result = mock.MagicMock()
+        mock_bleu_result.model_dump.return_value = {
+            "bleu_results": {"bleu_metric_values": [{"score": 0.85}]}
+        }
+        mock_evaluate_instances.side_effect = [
+            genai_errors.ClientError(code=429, response_json=error_response_json),
+            genai_errors.ClientError(code=429, response_json=error_response_json),
+            mock_bleu_result,
+        ]
+
+        result = _evals_common._execute_evaluation(
+            api_client=mock_api_client_fixture,
+            dataset=input_dataset,
+            metrics=[metric],
+        )
+
+        assert mock_evaluate_instances.call_count == 3
+        assert mock_sleep.call_count == 2
+        summary_metric = result.summary_metrics[0]
+        assert summary_metric.metric_name == "bleu"
+        assert summary_metric.mean_score == 0.85
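
For reference, a RateLimiter satisfying the assertions in TestRateLimiter above can be quite small. The real class lives in vertexai/_genai/_evals_utils.py, which this excerpt does not show, so apart from the asserted surface (the constructor validation, seconds_per_event, sleep_and_advance, and the _next_allowed bookkeeping the test comments describe) the details below are assumptions.

# Plausible RateLimiter matching the tests above; details are assumed,
# not copied from the real _evals_utils module.
import time


class RateLimiter:
    """Blocking rate limiter: allows at most `rate` events per second."""

    def __init__(self, rate: float):
        if rate is None or rate <= 0:
            raise ValueError("Rate must be a positive number.")
        self.seconds_per_event = 1.0 / rate
        # The first event is allowed immediately.
        self._next_allowed = time.monotonic()

    def sleep_and_advance(self):
        """Blocks until the next slot opens, then reserves it."""
        now = time.monotonic()
        delay = self._next_allowed - now
        if delay > 0:
            time.sleep(delay)
        # Advance the schedule whether or not we slept.
        self._next_allowed = max(now, self._next_allowed) + self.seconds_per_event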

vertexai/_genai/_evals_common.py

Lines changed: 6 additions & 1 deletion
@@ -1530,6 +1530,7 @@ def _execute_evaluation(  # type: ignore[no-untyped-def]
     dataset_schema: Optional[Literal["GEMINI", "FLATTEN", "OPENAI"]] = None,
     dest: Optional[str] = None,
     location: Optional[str] = None,
+    evaluation_service_qps: Optional[float] = None,
     **kwargs,
 ) -> types.EvaluationResult:
     """Evaluates a dataset using the provided metrics.
@@ -1542,6 +1543,9 @@ def _execute_evaluation(  # type: ignore[no-untyped-def]
         dest: The destination to save the evaluation results.
         location: The location to use for the evaluation. If not specified, the
             location configured in the client will be used.
+        evaluation_service_qps: The rate limit (queries per second) for calls
+            to the evaluation service. Defaults to 10. Increase this value if
+            your project has a higher EvaluateInstances API quota.
         **kwargs: Extra arguments to pass to evaluation, such as `agent_info`.
 
     Returns:
@@ -1617,7 +1621,8 @@ def _execute_evaluation(  # type: ignore[no-untyped-def]
     logger.info("Running Metric Computation...")
     t1 = time.perf_counter()
     evaluation_result = _evals_metric_handlers.compute_metrics_and_aggregate(
-        evaluation_run_config
+        evaluation_run_config,
+        evaluation_service_qps=evaluation_service_qps,
     )
     t2 = time.perf_counter()
     logger.info("Evaluation took: %f seconds", t2 - t1)