
Commit eea81cf

jsondai authored and copybara-github committed
chore: GenAI Client(evals) - Improve retry budget, add jitter, and expand retryable errors
PiperOrigin-RevId: 896691317
1 parent e2e81c9 commit eea81cf

6 files changed: +389 -108 lines changed


tests/unit/vertexai/genai/test_evals.py

Lines changed: 183 additions & 7 deletions
@@ -6194,6 +6194,8 @@ def test_predefined_metric_retry_fail_on_resource_exhausted(
             genai_errors.ClientError(code=429, response_json=error_response_json),
             genai_errors.ClientError(code=429, response_json=error_response_json),
             genai_errors.ClientError(code=429, response_json=error_response_json),
+            genai_errors.ClientError(code=429, response_json=error_response_json),
+            genai_errors.ClientError(code=429, response_json=error_response_json),
         ]

         result = _evals_common._execute_evaluation(
@@ -6202,18 +6204,13 @@ def test_predefined_metric_retry_fail_on_resource_exhausted(
             metrics=[metric],
         )

-        assert mock_private_evaluate_instances.call_count == 3
-        assert mock_sleep.call_count == 2
+        assert mock_private_evaluate_instances.call_count == 5
+        assert mock_sleep.call_count == 4
         assert len(result.summary_metrics) == 1
         summary_metric = result.summary_metrics[0]
         assert summary_metric.metric_name == "summarization_quality"
         assert summary_metric.mean_score is None
         assert summary_metric.num_cases_error == 1
-        assert (
-            "Judge model resource exhausted after 3 retries"
-        ) in result.eval_case_results[0].response_candidate_results[0].metric_results[
-            "summarization_quality"
-        ].error_message


 class TestEvaluationDataset:
@@ -6734,3 +6731,182 @@ def test_create_evaluation_set_with_agent_data(
         candidate_response = candidate_responses[0]
         assert candidate_response["candidate"] == "test-candidate"
         assert candidate_response["agent_data"] == agent_data
+
+
+class TestRateLimiter:
+    """Tests for the RateLimiter class in _evals_utils."""
+
+    def test_rate_limiter_init(self):
+        """Tests that RateLimiter initializes correctly."""
+        limiter = _evals_utils.RateLimiter(rate=10.0)
+        assert limiter.seconds_per_event == pytest.approx(0.1)
+
+    def test_rate_limiter_invalid_rate(self):
+        """Tests that RateLimiter raises ValueError for non-positive rate."""
+        with pytest.raises(ValueError, match="Rate must be a positive number"):
+            _evals_utils.RateLimiter(rate=0)
+        with pytest.raises(ValueError, match="Rate must be a positive number"):
+            _evals_utils.RateLimiter(rate=-1)
+
+    @mock.patch("time.sleep", return_value=None)
+    @mock.patch("time.monotonic")
+    def test_rate_limiter_sleep_and_advance(self, mock_monotonic, mock_sleep):
+        """Tests that sleep_and_advance properly throttles calls."""
+        # With rate=10 (0.1s interval):
+        # - __init__ at t=0: _next_allowed = 0.0
+        # - first call at t=0: no delay, _next_allowed = 0.1
+        # - second call at t=0.01: delay = 0.1 - 0.01 = 0.09
+        mock_monotonic.side_effect = [
+            0.0,  # __init__: time.monotonic()
+            0.0,  # first sleep_and_advance: now
+            0.01,  # second sleep_and_advance: now
+        ]
+        limiter = _evals_utils.RateLimiter(rate=10.0)
+        limiter.sleep_and_advance()  # First call - should not sleep
+        limiter.sleep_and_advance()  # Second call - should sleep
+        assert mock_sleep.call_count == 1
+        # Verify sleep was called with approximately the right delay
+        sleep_delay = mock_sleep.call_args[0][0]
+        assert 0.08 < sleep_delay <= 0.1
+
+    def test_rate_limiter_no_sleep_when_enough_time_passed(self):
+        """Tests that no sleep occurs when enough time has passed."""
+        import time as real_time
+
+        limiter = _evals_utils.RateLimiter(rate=1000.0)  # Very high rate
+        # With rate=1000, interval is 0.001s - should not sleep
+        start = real_time.time()
+        for _ in range(5):
+            limiter.sleep_and_advance()
+        elapsed = real_time.time() - start
+        # 5 calls at 1000 QPS should take ~0.005s, certainly under 1s
+        assert elapsed < 1.0
+
+
+class TestCallWithRetry:
+    """Tests for the shared _call_with_retry helper."""
+
+    @mock.patch("time.sleep", return_value=None)
+    def test_call_with_retry_success_on_first_try(self, mock_sleep):
+        """Tests that _call_with_retry returns immediately on success."""
+        fn = mock.Mock(return_value="success")
+        result = _evals_metric_handlers._call_with_retry(fn, "test_metric")
+        assert result == "success"
+        assert fn.call_count == 1
+        assert mock_sleep.call_count == 0
+
+    @mock.patch("time.sleep", return_value=None)
+    def test_call_with_retry_success_after_retries(self, mock_sleep):
+        """Tests that _call_with_retry succeeds after transient failures."""
+        error_json = {"error": {"code": 429, "message": "exhausted"}}
+        fn = mock.Mock(
+            side_effect=[
+                genai_errors.ClientError(code=429, response_json=error_json),
+                genai_errors.ClientError(code=429, response_json=error_json),
+                "success",
+            ]
+        )
+        result = _evals_metric_handlers._call_with_retry(fn, "test_metric")
+        assert result == "success"
+        assert fn.call_count == 3
+        assert mock_sleep.call_count == 2
+
+    @mock.patch("time.sleep", return_value=None)
+    def test_call_with_retry_raises_after_max_retries(self, mock_sleep):
+        """Tests that _call_with_retry raises after exhausting retries."""
+        error_json = {"error": {"code": 429, "message": "exhausted"}}
+        fn = mock.Mock(
+            side_effect=genai_errors.ClientError(code=429, response_json=error_json)
+        )
+        with pytest.raises(genai_errors.ClientError):
+            _evals_metric_handlers._call_with_retry(fn, "test_metric")
+        assert fn.call_count == 5  # _MAX_RETRIES
+        assert mock_sleep.call_count == 4
+
+    @mock.patch("time.sleep", return_value=None)
+    def test_call_with_retry_retries_on_server_error(self, mock_sleep):
+        """Tests retry on 503 ServiceUnavailable (ServerError)."""
+        error_json = {"error": {"code": 503, "message": "unavailable"}}
+        fn = mock.Mock(
+            side_effect=[
+                genai_errors.ServerError(code=503, response_json=error_json),
+                "success",
+            ]
+        )
+        result = _evals_metric_handlers._call_with_retry(fn, "test_metric")
+        assert result == "success"
+        assert fn.call_count == 2
+
+    @mock.patch("time.sleep", return_value=None)
+    def test_call_with_retry_no_retry_on_non_retryable(self, mock_sleep):
+        """Tests that non-retryable errors are raised immediately."""
+        error_json = {"error": {"code": 400, "message": "bad request"}}
+        fn = mock.Mock(
+            side_effect=genai_errors.ClientError(code=400, response_json=error_json)
+        )
+        with pytest.raises(genai_errors.ClientError):
+            _evals_metric_handlers._call_with_retry(fn, "test_metric")
+        assert fn.call_count == 1
+        assert mock_sleep.call_count == 0
+
+
+class TestComputationMetricRetry:
+    """Tests for retry behavior in ComputationMetricHandler."""
+
+    @mock.patch.object(
+        _evals_metric_handlers.ComputationMetricHandler,
+        "SUPPORTED_COMPUTATION_METRICS",
+        frozenset(["bleu"]),
+    )
+    @mock.patch("time.sleep", return_value=None)
+    @mock.patch(
+        "vertexai._genai.evals.Evals.evaluate_instances"
+    )
+    def test_computation_metric_retry_on_resource_exhausted(
+        self,
+        mock_evaluate_instances,
+        mock_sleep,
+        mock_api_client_fixture,
+    ):
+        """Tests that ComputationMetricHandler retries on 429."""
+        dataset_df = pd.DataFrame(
+            [
+                {
+                    "prompt": "Test prompt",
+                    "response": "Test response",
+                    "reference": "Test reference",
+                }
+            ]
+        )
+        input_dataset = vertexai_genai_types.EvaluationDataset(
+            eval_dataset_df=dataset_df
+        )
+        metric = vertexai_genai_types.Metric(name="bleu")
+        error_response_json = {
+            "error": {
+                "code": 429,
+                "message": "Resource exhausted.",
+                "status": "RESOURCE_EXHAUSTED",
+            }
+        }
+        mock_bleu_result = mock.MagicMock()
+        mock_bleu_result.model_dump.return_value = {
+            "bleu_results": {"bleu_metric_values": [{"score": 0.85}]}
+        }
+        mock_evaluate_instances.side_effect = [
+            genai_errors.ClientError(code=429, response_json=error_response_json),
+            genai_errors.ClientError(code=429, response_json=error_response_json),
+            mock_bleu_result,
+        ]
+
+        result = _evals_common._execute_evaluation(
+            api_client=mock_api_client_fixture,
+            dataset=input_dataset,
+            metrics=[metric],
+        )
+
+        assert mock_evaluate_instances.call_count == 3
+        assert mock_sleep.call_count == 2
+        summary_metric = result.summary_metrics[0]
+        assert summary_metric.metric_name == "bleu"
+        assert summary_metric.mean_score == 0.85

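The new TestCallWithRetry cases pin down the retry contract (a budget of five attempts, retries on 429 ClientError and on ServerError such as 503, immediate failure for other client errors) without showing the helper itself. A minimal sketch consistent with those assertions and with the jitter named in the commit title follows; _MAX_RETRIES and the _call_with_retry(fn, metric_name) signature come from the tests, while _BASE_DELAY_SECONDS, _MAX_DELAY_SECONDS, and the exact backoff/jitter formula are assumptions rather than the SDK's actual implementation.

import logging
import random
import time
from typing import Any, Callable

from google.genai import errors as genai_errors

logger = logging.getLogger(__name__)

_MAX_RETRIES = 5           # total attempts; the tests assert fn.call_count == 5
_BASE_DELAY_SECONDS = 1.0  # assumed backoff base (not shown in the diff)
_MAX_DELAY_SECONDS = 32.0  # assumed backoff cap (not shown in the diff)


def _is_retryable(exc: Exception) -> bool:
    """Retries server errors (e.g. 503) and 429 resource exhaustion, per the tests."""
    if isinstance(exc, genai_errors.ServerError):
        return True
    return isinstance(exc, genai_errors.ClientError) and exc.code == 429


def _call_with_retry(fn: Callable[[], Any], metric_name: str) -> Any:
    """Calls `fn`, retrying transient errors with jittered exponential backoff."""
    for attempt in range(_MAX_RETRIES):
        try:
            return fn()
        except Exception as exc:  # re-raised below when not retryable
            if not _is_retryable(exc) or attempt == _MAX_RETRIES - 1:
                raise
            logger.warning(
                "Retryable error for metric %s (attempt %d/%d): %s",
                metric_name, attempt + 1, _MAX_RETRIES, exc,
            )
            # Exponential backoff with jitter (assumed formula).
            delay = min(_BASE_DELAY_SECONDS * (2 ** attempt), _MAX_DELAY_SECONDS)
            time.sleep(delay * random.uniform(0.5, 1.0))

This sketch reproduces the counts the tests assert: success on the first call performs no sleep, two transient 429s give three calls and two sleeps, and a persistent 429 raises after five calls and four sleeps.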
vertexai/_genai/_evals_common.py

Lines changed: 6 additions & 1 deletion
@@ -1532,6 +1532,7 @@ def _execute_evaluation(  # type: ignore[no-untyped-def]
     dataset_schema: Optional[Literal["GEMINI", "FLATTEN", "OPENAI"]] = None,
     dest: Optional[str] = None,
     location: Optional[str] = None,
+    evaluation_service_qps: Optional[float] = None,
     **kwargs,
 ) -> types.EvaluationResult:
     """Evaluates a dataset using the provided metrics.
@@ -1544,6 +1545,9 @@ def _execute_evaluation(  # type: ignore[no-untyped-def]
         dest: The destination to save the evaluation results.
         location: The location to use for the evaluation. If not specified, the
             location configured in the client will be used.
+        evaluation_service_qps: The rate limit (queries per second) for calls
+            to the evaluation service. Defaults to 10. Increase this value if
+            your project has a higher EvaluateInstances API quota.
         **kwargs: Extra arguments to pass to evaluation, such as `agent_info`.

     Returns:
@@ -1619,7 +1623,8 @@ def _execute_evaluation(  # type: ignore[no-untyped-def]
     logger.info("Running Metric Computation...")
     t1 = time.perf_counter()
     evaluation_result = _evals_metric_handlers.compute_metrics_and_aggregate(
-        evaluation_run_config
+        evaluation_run_config,
+        evaluation_service_qps=evaluation_service_qps,
     )
     t2 = time.perf_counter()
     logger.info("Evaluation took: %f seconds", t2 - t1)

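The RateLimiter tests earlier in this commit, together with the new evaluation_service_qps parameter, describe a simple client-side QPS throttle: rate validation, a seconds_per_event interval, and monotonic-clock scheduling in sleep_and_advance. A sketch reproducing exactly the asserted behavior (an illustration, not the contents of _evals_utils) could be:

import time


class RateLimiter:
    """Simple client-side QPS limiter (sketch matching the behavior the new tests assert)."""

    def __init__(self, rate: float):
        if rate <= 0:
            raise ValueError("Rate must be a positive number.")
        self.seconds_per_event = 1.0 / rate
        # Next moment at which an event is allowed to start.
        self._next_allowed = time.monotonic()

    def sleep_and_advance(self) -> None:
        """Blocks until the current slot opens, then reserves the next slot."""
        now = time.monotonic()
        delay = self._next_allowed - now
        if delay > 0:
            time.sleep(delay)
        # Advance from the later of now / the previous slot so idle time does not build up credit.
        self._next_allowed = max(now, self._next_allowed) + self.seconds_per_event

Presumably compute_metrics_and_aggregate builds such a limiter from evaluation_service_qps (the docstring gives a default of 10 QPS), e.g. limiter = RateLimiter(rate=evaluation_service_qps or 10.0) with limiter.sleep_and_advance() before each EvaluateInstances call, but that wiring is not part of this diff.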