190 changes: 183 additions & 7 deletions tests/unit/vertexai/genai/test_evals.py
@@ -6194,6 +6194,8 @@ def test_predefined_metric_retry_fail_on_resource_exhausted(
genai_errors.ClientError(code=429, response_json=error_response_json),
genai_errors.ClientError(code=429, response_json=error_response_json),
genai_errors.ClientError(code=429, response_json=error_response_json),
genai_errors.ClientError(code=429, response_json=error_response_json),
genai_errors.ClientError(code=429, response_json=error_response_json),
]

result = _evals_common._execute_evaluation(
@@ -6202,18 +6204,13 @@
metrics=[metric],
)

assert mock_private_evaluate_instances.call_count == 3
assert mock_sleep.call_count == 2
assert mock_private_evaluate_instances.call_count == 5
assert mock_sleep.call_count == 4
assert len(result.summary_metrics) == 1
summary_metric = result.summary_metrics[0]
assert summary_metric.metric_name == "summarization_quality"
assert summary_metric.mean_score is None
assert summary_metric.num_cases_error == 1
assert (
"Judge model resource exhausted after 3 retries"
) in result.eval_case_results[0].response_candidate_results[0].metric_results[
"summarization_quality"
].error_message


class TestEvaluationDataset:
@@ -6734,3 +6731,182 @@ def test_create_evaluation_set_with_agent_data(
candidate_response = candidate_responses[0]
assert candidate_response["candidate"] == "test-candidate"
assert candidate_response["agent_data"] == agent_data


class TestRateLimiter:
"""Tests for the RateLimiter class in _evals_utils."""

def test_rate_limiter_init(self):
"""Tests that RateLimiter initializes correctly."""
limiter = _evals_utils.RateLimiter(rate=10.0)
assert limiter.seconds_per_event == pytest.approx(0.1)

def test_rate_limiter_invalid_rate(self):
"""Tests that RateLimiter raises ValueError for non-positive rate."""
with pytest.raises(ValueError, match="Rate must be a positive number"):
_evals_utils.RateLimiter(rate=0)
with pytest.raises(ValueError, match="Rate must be a positive number"):
_evals_utils.RateLimiter(rate=-1)

@mock.patch("time.sleep", return_value=None)
@mock.patch("time.monotonic")
def test_rate_limiter_sleep_and_advance(self, mock_monotonic, mock_sleep):
"""Tests that sleep_and_advance properly throttles calls."""
# With rate=10 (0.1s interval):
# - __init__ at t=0: _next_allowed = 0.0
# - first call at t=0: no delay, _next_allowed = 0.1
# - second call at t=0.01: delay = 0.1 - 0.01 = 0.09
mock_monotonic.side_effect = [
0.0, # __init__: time.monotonic()
0.0, # first sleep_and_advance: now
0.01, # second sleep_and_advance: now
]
limiter = _evals_utils.RateLimiter(rate=10.0)
limiter.sleep_and_advance() # First call - should not sleep
limiter.sleep_and_advance() # Second call - should sleep
assert mock_sleep.call_count == 1
# Verify sleep was called with approximately the right delay
sleep_delay = mock_sleep.call_args[0][0]
assert 0.08 < sleep_delay <= 0.1

def test_rate_limiter_no_sleep_when_enough_time_passed(self):
"""Tests that no sleep occurs when enough time has passed."""
import time as real_time

limiter = _evals_utils.RateLimiter(rate=1000.0) # Very high rate
# With rate=1000, interval is 0.001s - should not sleep
start = real_time.time()
for _ in range(5):
limiter.sleep_and_advance()
elapsed = real_time.time() - start
# 5 calls at 1000 QPS should take ~0.005s, certainly under 1s
assert elapsed < 1.0


class TestCallWithRetry:
"""Tests for the shared _call_with_retry helper."""

@mock.patch("time.sleep", return_value=None)
def test_call_with_retry_success_on_first_try(self, mock_sleep):
"""Tests that _call_with_retry returns immediately on success."""
fn = mock.Mock(return_value="success")
result = _evals_metric_handlers._call_with_retry(fn, "test_metric")
assert result == "success"
assert fn.call_count == 1
assert mock_sleep.call_count == 0

@mock.patch("time.sleep", return_value=None)
def test_call_with_retry_success_after_retries(self, mock_sleep):
"""Tests that _call_with_retry succeeds after transient failures."""
error_json = {"error": {"code": 429, "message": "exhausted"}}
fn = mock.Mock(
side_effect=[
genai_errors.ClientError(code=429, response_json=error_json),
genai_errors.ClientError(code=429, response_json=error_json),
"success",
]
)
result = _evals_metric_handlers._call_with_retry(fn, "test_metric")
assert result == "success"
assert fn.call_count == 3
assert mock_sleep.call_count == 2

@mock.patch("time.sleep", return_value=None)
def test_call_with_retry_raises_after_max_retries(self, mock_sleep):
"""Tests that _call_with_retry raises after exhausting retries."""
error_json = {"error": {"code": 429, "message": "exhausted"}}
fn = mock.Mock(
side_effect=genai_errors.ClientError(code=429, response_json=error_json)
)
with pytest.raises(genai_errors.ClientError):
_evals_metric_handlers._call_with_retry(fn, "test_metric")
assert fn.call_count == 5 # _MAX_RETRIES
assert mock_sleep.call_count == 4

@mock.patch("time.sleep", return_value=None)
def test_call_with_retry_retries_on_server_error(self, mock_sleep):
"""Tests retry on 503 ServiceUnavailable (ServerError)."""
error_json = {"error": {"code": 503, "message": "unavailable"}}
fn = mock.Mock(
side_effect=[
genai_errors.ServerError(code=503, response_json=error_json),
"success",
]
)
result = _evals_metric_handlers._call_with_retry(fn, "test_metric")
assert result == "success"
assert fn.call_count == 2

@mock.patch("time.sleep", return_value=None)
def test_call_with_retry_no_retry_on_non_retryable(self, mock_sleep):
"""Tests that non-retryable errors are raised immediately."""
error_json = {"error": {"code": 400, "message": "bad request"}}
fn = mock.Mock(
side_effect=genai_errors.ClientError(code=400, response_json=error_json)
)
with pytest.raises(genai_errors.ClientError):
_evals_metric_handlers._call_with_retry(fn, "test_metric")
assert fn.call_count == 1
assert mock_sleep.call_count == 0


class TestComputationMetricRetry:
"""Tests for retry behavior in ComputationMetricHandler."""

@mock.patch.object(
_evals_metric_handlers.ComputationMetricHandler,
"SUPPORTED_COMPUTATION_METRICS",
frozenset(["bleu"]),
)
@mock.patch("time.sleep", return_value=None)
@mock.patch(
"vertexai._genai.evals.Evals.evaluate_instances"
)
def test_computation_metric_retry_on_resource_exhausted(
self,
mock_evaluate_instances,
mock_sleep,
mock_api_client_fixture,
):
"""Tests that ComputationMetricHandler retries on 429."""
dataset_df = pd.DataFrame(
[
{
"prompt": "Test prompt",
"response": "Test response",
"reference": "Test reference",
}
]
)
input_dataset = vertexai_genai_types.EvaluationDataset(
eval_dataset_df=dataset_df
)
metric = vertexai_genai_types.Metric(name="bleu")
error_response_json = {
"error": {
"code": 429,
"message": "Resource exhausted.",
"status": "RESOURCE_EXHAUSTED",
}
}
mock_bleu_result = mock.MagicMock()
mock_bleu_result.model_dump.return_value = {
"bleu_results": {"bleu_metric_values": [{"score": 0.85}]}
}
mock_evaluate_instances.side_effect = [
genai_errors.ClientError(code=429, response_json=error_response_json),
genai_errors.ClientError(code=429, response_json=error_response_json),
mock_bleu_result,
]

result = _evals_common._execute_evaluation(
api_client=mock_api_client_fixture,
dataset=input_dataset,
metrics=[metric],
)

assert mock_evaluate_instances.call_count == 3
assert mock_sleep.call_count == 2
summary_metric = result.summary_metrics[0]
assert summary_metric.metric_name == "bleu"
assert summary_metric.mean_score == 0.85
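
For readers skimming the diff: the sketch below is not the code under review, but a minimal reconstruction of the two helpers these new tests exercise, assuming only what the assertions above pin down (a RateLimiter with seconds_per_event and sleep_and_advance, and _call_with_retry with five attempts and retryable 429/503 codes). The backoff schedule and the exact exception types caught are assumptions.

import time
from typing import Any, Callable

_MAX_RETRIES = 5  # tests expect 5 attempts before the error is re-raised
_RETRYABLE_CODES = {429, 503}  # resource exhausted / service unavailable


class RateLimiter:
    """Spaces calls at least 1/rate seconds apart (illustrative sketch)."""

    def __init__(self, rate: float):
        if rate <= 0:
            raise ValueError("Rate must be a positive number.")
        self.seconds_per_event = 1.0 / rate
        self._next_allowed = time.monotonic()

    def sleep_and_advance(self) -> None:
        now = time.monotonic()
        delay = self._next_allowed - now
        if delay > 0:
            time.sleep(delay)
        self._next_allowed = max(now, self._next_allowed) + self.seconds_per_event


def _call_with_retry(fn: Callable[[], Any], metric_name: str) -> Any:
    """Calls fn, retrying retryable API errors up to _MAX_RETRIES attempts."""
    # metric_name is presumably used for logging in the real helper.
    for attempt in range(_MAX_RETRIES):
        try:
            return fn()
        except Exception as e:  # the real helper catches the genai error types
            code = getattr(e, "code", None)
            if code not in _RETRYABLE_CODES or attempt == _MAX_RETRIES - 1:
                raise
            time.sleep(2**attempt)  # backoff schedule is an assumption

This matches the mocked-clock test above: the first sleep_and_advance call finds no pending delay, and the second sleeps for roughly seconds_per_event minus the elapsed time.
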
7 changes: 6 additions & 1 deletion vertexai/_genai/_evals_common.py
@@ -1532,6 +1532,7 @@ def _execute_evaluation( # type: ignore[no-untyped-def]
dataset_schema: Optional[Literal["GEMINI", "FLATTEN", "OPENAI"]] = None,
dest: Optional[str] = None,
location: Optional[str] = None,
evaluation_service_qps: Optional[float] = None,
**kwargs,
) -> types.EvaluationResult:
"""Evaluates a dataset using the provided metrics.
@@ -1544,6 +1545,9 @@ def _execute_evaluation( # type: ignore[no-untyped-def]
dest: The destination to save the evaluation results.
location: The location to use for the evaluation. If not specified, the
location configured in the client will be used.
evaluation_service_qps: The rate limit (queries per second) for calls
to the evaluation service. Defaults to 10. Increase this value if
your project has a higher EvaluateInstances API quota.
**kwargs: Extra arguments to pass to evaluation, such as `agent_info`.

Returns:
@@ -1619,7 +1623,8 @@ def _execute_evaluation( # type: ignore[no-untyped-def]
logger.info("Running Metric Computation...")
t1 = time.perf_counter()
evaluation_result = _evals_metric_handlers.compute_metrics_and_aggregate(
evaluation_run_config
evaluation_run_config,
evaluation_service_qps=evaluation_service_qps,
)
t2 = time.perf_counter()
logger.info("Evaluation took: %f seconds", t2 - t1)
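
The new evaluation_service_qps keyword flows from here into compute_metrics_and_aggregate, where the per-request throttling presumably happens. Below is a minimal caller-facing sketch, assuming the _evals_utils.RateLimiter API that the unit tests above exercise and the default of 10 QPS stated in the docstring; the _make_limiter helper is illustrative, not part of this change.

from vertexai._genai import _evals_utils

_DEFAULT_EVAL_SERVICE_QPS = 10.0  # default documented above


def _make_limiter(evaluation_service_qps=None):
    """Builds the rate limiter that throttles evaluation service calls (sketch)."""
    qps = evaluation_service_qps or _DEFAULT_EVAL_SERVICE_QPS
    return _evals_utils.RateLimiter(rate=qps)


limiter = _make_limiter(evaluation_service_qps=5.0)  # lower rate for small quotas
limiter.sleep_and_advance()  # call once before each evaluation service request

In the same spirit, a caller with a higher EvaluateInstances quota would pass a larger value, e.g. evaluation_service_qps=20.0, through _execute_evaluation.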