|
23 | 23 |
|
24 | 24 | from google import auth |
25 | 25 | from google.auth import credentials as auth_credentials |
| 26 | +from google import genai |
26 | 27 | from google.cloud import aiplatform |
27 | 28 | import vertexai |
28 | 29 | from google.cloud.aiplatform import initializer |
@@ -1025,6 +1026,62 @@ def test_compute_pointwise_metrics_metric_prompt_template_example( |
1025 | 1026 | "explanation", |
1026 | 1027 | ] |
1027 | 1028 |
|
@pytest.mark.parametrize("api_transport", ["grpc", "rest"])
def test_compute_pointwise_metrics_metric_prompt_template_example_string_model(
    self, api_transport
):
    """Run a pointwise evaluation with the model passed as a plain string.

    ``genai.Client`` is patched to return a stub client whose
    ``models.generate_content`` yields a canned response, and the evaluation
    service's ``evaluate_instances`` is patched to produce the mocked
    summarization-quality results. The test then checks the summary metrics
    and the per-row metrics table.
    """
    aiplatform.init(
        project=_TEST_PROJECT,
        location=_TEST_LOCATION,
        api_transport=api_transport,
    )

    # Stub GenAI client that answers every generate_content call identically.
    canned_response = mock.MagicMock()
    canned_response.text = "test_response"
    stub_client = mock.create_autospec(genai.Client, instance=True)
    stub_client.models.generate_content.return_value = canned_response

    eval_task = EvalTask(
        dataset=_TEST_EVAL_DATASET_WITHOUT_RESPONSE,
        metrics=[Pointwise.SUMMARIZATION_QUALITY],
    )

    patch_genai_client = mock.patch.object(
        genai, "Client", return_value=stub_client
    )
    patch_evaluate_instances = mock.patch.object(
        target=gapic_evaluation_services.EvaluationServiceClient,
        attribute="evaluate_instances",
        side_effect=_MOCK_SUMMARIZATION_QUALITY_RESULT,
    )
    with patch_genai_client, patch_evaluate_instances:
        result = eval_task.evaluate(
            model="gemini-1.5-pro",
            prompt_template="{instruction} test prompt template {context}",
        )

    # Aggregate metrics.
    summary = result.summary_metrics
    assert summary["row_count"] == 2
    assert summary["summarization_quality/mean"] == 4.5
    assert summary["summarization_quality/std"] == pytest.approx(0.7, 0.1)

    # Per-row metrics table: column set, then scores and explanations.
    table = result.metrics_table
    expected_columns = {
        "context",
        "instruction",
        "reference",
        "prompt",
        "response",
        "summarization_quality/score",
        "summarization_quality/explanation",
    }
    assert set(table.columns.values) == expected_columns
    assert list(table["summarization_quality/score"].values) == [5, 4]
    assert list(table["summarization_quality/explanation"].values) == [
        "explanation",
        "explanation",
    ]
| 1084 | + |
1028 | 1085 | @pytest.mark.parametrize("api_transport", ["grpc", "rest"]) |
1029 | 1086 | def test_compute_pointwise_metrics_without_model_inference(self, api_transport): |
1030 | 1087 | aiplatform.init( |
@@ -1401,13 +1458,13 @@ def test_compute_multiple_metrics(self, api_transport): |
1401 | 1458 | mock_baseline_model.generate_content.return_value = ( |
1402 | 1459 | _MOCK_MODEL_INFERENCE_RESPONSE |
1403 | 1460 | ) |
1404 | | - mock_baseline_model._model_name = "publishers/google/model/gemini-pro" |
| 1461 | + mock_baseline_model._model_name = "gemini-2.5-pro" |
1405 | 1462 | _TEST_PAIRWISE_METRIC._baseline_model = mock_baseline_model |
1406 | 1463 | mock_model = mock.create_autospec( |
1407 | 1464 | generative_models.GenerativeModel, instance=True |
1408 | 1465 | ) |
1409 | 1466 | mock_model.generate_content.return_value = _MOCK_MODEL_INFERENCE_RESPONSE |
1410 | | - mock_model._model_name = "publishers/google/model/gemini-pro" |
| 1467 | + mock_model._model_name = "gemini-2.5-flash" |
1411 | 1468 | test_metrics = [ |
1412 | 1469 | "exact_match", |
1413 | 1470 | Pointwise.SUMMARIZATION_QUALITY, |
@@ -2096,6 +2153,40 @@ def test_evaluate_invalid_metrics(self): |
2096 | 2153 | ) |
2097 | 2154 | test_eval_task.evaluate() |
2098 | 2155 |
|
def test_pairwise_metric_baseline_model_deprecation_warning(self):
    """Check that a GenerativeModel baseline triggers a DeprecationWarning.

    Builds a ``PairwiseMetric`` whose ``baseline_model`` is an autospec'd
    ``GenerativeModel`` instance and asserts that construction emits a
    ``DeprecationWarning`` carrying the expected message.
    """
    mock_baseline_model = mock.create_autospec(
        generative_models.GenerativeModel, instance=True
    )
    # ``match`` is a regular expression, so the periods are escaped to
    # require the literal message text rather than "any character".
    expected_message = (
        r"vertexai\.generative_models\.GenerativeModel is deprecated for "
        r"evaluation and will be removed in June 2026\. "
        r"Please pass a string model name instead\."
    )
    with pytest.warns(DeprecationWarning, match=expected_message):
        pairwise_metric.PairwiseMetric(
            metric="test_pairwise_metric",
            metric_prompt_template="abc",
            baseline_model=mock_baseline_model,
        )
| 2169 | + |
def test_evaluate_model_deprecation_warning(self):
    """Check that evaluating with a GenerativeModel emits a DeprecationWarning.

    Calls ``EvalTask.evaluate`` with an autospec'd ``GenerativeModel``
    instance while the evaluation service's ``evaluate_instances`` is
    patched, and asserts that a ``DeprecationWarning`` carrying the expected
    message is raised during the call.
    """
    mock_model = mock.create_autospec(
        generative_models.GenerativeModel, instance=True
    )
    mock_model._model_name = "publishers/google/model/gemini-pro"
    test_eval_task = EvalTask(
        dataset=_TEST_EVAL_DATASET_WITHOUT_RESPONSE,
        metrics=[_TEST_POINTWISE_METRIC],
    )
    # ``match`` is a regular expression, so the periods are escaped to
    # require the literal message text rather than "any character".
    expected_message = (
        r"vertexai\.generative_models\.GenerativeModel is deprecated for "
        r"evaluation and will be removed in June 2026\. "
        r"Please pass a string model name instead\."
    )
    with mock.patch.object(
        target=gapic_evaluation_services.EvaluationServiceClient,
        attribute="evaluate_instances",
        side_effect=_MOCK_POINTWISE_RESULT,
    ), pytest.warns(DeprecationWarning, match=expected_message):
        test_eval_task.evaluate(model=mock_model)
| 2189 | + |
2099 | 2190 | def test_evaluate_duplicate_string_metric(self): |
2100 | 2191 | metrics = [ |
2101 | 2192 | "exact_match", |
@@ -2654,6 +2745,31 @@ def test_default_rubrics_parser_with_invalid_json(self): |
2654 | 2745 | parsed_rubrics = utils_preview.parse_rubrics(_INVALID_UNPARSED_RUBRIC) |
2655 | 2746 | assert parsed_rubrics == {"questions": ""} |
2656 | 2747 |
|
def test_generate_responses_from_genai_model(self):
    """Check response generation when the model is given as a string name.

    ``genai.Client`` is patched to return a stub client whose
    ``models.generate_content`` always yields a response with text
    ``"test_response"``. After ``_generate_responses_from_genai_model`` runs,
    the run config's dataset must hold that text in its ``response`` column
    for both rows, and the stub must have been called twice.
    """
    canned_response = mock.MagicMock()
    canned_response.text = "test_response"
    stub_client = mock.create_autospec(genai.Client, instance=True)
    stub_client.models.generate_content.return_value = canned_response

    with mock.patch.object(genai, "Client", return_value=stub_client):
        run_config = eval_base.EvaluationRunConfig(
            # Work on a copy so the shared test dataset is left untouched.
            dataset=_TEST_EVAL_DATASET_WITHOUT_RESPONSE.copy(),
            metrics=[],
            metric_column_mapping={},
            client=mock.MagicMock(),
            evaluation_service_qps=1,
            retry_timeout=1,
        )
        _evaluation._generate_responses_from_genai_model(
            "gemini-1.5-pro", run_config
        )

    generated_responses = list(run_config.dataset["response"].values)
    assert generated_responses == ["test_response", "test_response"]
    assert stub_client.models.generate_content.call_count == 2
| 2772 | + |
2657 | 2773 | def test_generate_responses_from_gemini_model(self): |
2658 | 2774 | mock_model = mock.create_autospec( |
2659 | 2775 | generative_models.GenerativeModel, instance=True |
|
0 commit comments