diff --git a/tests/unit/vertexai/genai/replays/test_evaluation_metric.py b/tests/unit/vertexai/genai/replays/test_evaluation_metric.py
new file mode 100644
index 0000000000..6b1e9dc1b7
--- /dev/null
+++ b/tests/unit/vertexai/genai/replays/test_evaluation_metric.py
@@ -0,0 +1,63 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# pylint: disable=protected-access,bad-continuation,missing-function-docstring
+import re
+
+from tests.unit.vertexai.genai.replays import pytest_helper
+from vertexai._genai import types
+
+_TEST_PROJECT = "977012026409"
+_TEST_LOCATION = "us-central1"
+
+
+def test_create_and_get_evaluation_metric(client):
+    """Creates a metric, checks the returned resource name, then fetches it."""
+    client._api_client._http_options.api_version = "v1beta1"
+    client._api_client._http_options.base_url = (
+        "https://us-central1-staging-aiplatform.sandbox.googleapis.com/"
+    )
+    result = client.evals.create_evaluation_metric(
+        display_name="test_metric",
+        description="test_description",
+        metric=types.RubricMetric.GENERAL_QUALITY,
+    )
+    assert isinstance(result, str)
+    assert re.match(
+        r"^projects/[^/]+/locations/[^/]+/evaluationMetrics/[^/]+$",
+        result,
+    )
+    metric = client.evals.get_evaluation_metric(metric_resource_name=result)
+    assert isinstance(metric, types.EvaluationMetric)
+    assert metric.display_name == "test_metric"
+
+
+def test_list_evaluation_metrics(client):
+    """Lists metrics and checks the response type and its metrics field."""
+    client._api_client._http_options.api_version = "v1beta1"
+    client._api_client._http_options.base_url = (
+        "https://us-central1-staging-aiplatform.sandbox.googleapis.com/"
+    )
+    response = client.evals.list_evaluation_metrics()
+    assert isinstance(response, types.ListEvaluationMetricsResponse)
+    # `len(...) >= 0` holds for any sized object, so check the type instead.
+    assert isinstance(response.evaluation_metrics, list)
+
+
+# The setup function registers the module and method for the recorder
+pytestmark = pytest_helper.setup(
+    file=__file__,
+    globals_for_file=globals(),
+    test_method="evals.create_evaluation_metric",
+)
diff --git a/vertexai/_genai/_transformers.py b/vertexai/_genai/_transformers.py
index 246e9eb0d0..05e1460497 100644
--- a/vertexai/_genai/_transformers.py
+++ b/vertexai/_genai/_transformers.py
@@ -25,8 +25,15 @@
 _METRIC_RES_NAME_RE = r"^projects/[^/]+/locations/[^/]+/evaluationMetrics/[^/]+$"
 
 
+def t_metric(
+    metric: "types.MetricSubclass",
+) -> dict[str, Any]:
+    """Prepares the metric payload for a single metric."""
+    return t_metrics([metric])[0]
+
+
 def t_metrics(
-    metrics: list["types.MetricSubclass"],
+    metrics: "list[types.MetricSubclass]",
     set_default_aggregation_metrics: bool = False,
 ) -> list[dict[str, Any]]:
     """Prepares the metric payload for the evaluation request.
diff --git a/vertexai/_genai/evals.py b/vertexai/_genai/evals.py index a6f11fed9b..b9304fd1e6 100644 --- a/vertexai/_genai/evals.py +++ b/vertexai/_genai/evals.py @@ -17,7 +17,7 @@ import json import logging -from typing import Any, Callable, Optional, Union +from typing import Any, Callable, Optional, Union, cast from urllib.parse import urlencode import uuid @@ -67,6 +67,26 @@ def _CreateEvaluationItemParameters_to_vertex( return to_object +def _CreateEvaluationMetricParameters_to_vertex( + from_object: Union[dict[str, Any], object], + parent_object: Optional[dict[str, Any]] = None, +) -> dict[str, Any]: + to_object: dict[str, Any] = {} + if getv(from_object, ["display_name"]) is not None: + setv(to_object, ["displayName"], getv(from_object, ["display_name"])) + + if getv(from_object, ["description"]) is not None: + setv(to_object, ["description"], getv(from_object, ["description"])) + + if getv(from_object, ["metric"]) is not None: + setv(to_object, ["metric"], t.t_metric(getv(from_object, ["metric"]))) + + if getv(from_object, ["config"]) is not None: + setv(to_object, ["config"], getv(from_object, ["config"])) + + return to_object + + def _CreateEvaluationRunParameters_to_vertex( from_object: Union[dict[str, Any], object], parent_object: Optional[dict[str, Any]] = None, @@ -294,6 +314,30 @@ def _EvaluationInstance_to_vertex( return to_object +def _EvaluationMetric_from_vertex( + from_object: Union[dict[str, Any], object], + parent_object: Optional[dict[str, Any]] = None, +) -> dict[str, Any]: + to_object: dict[str, Any] = {} + if getv(from_object, ["name"]) is not None: + setv(to_object, ["name"], getv(from_object, ["name"])) + + if getv(from_object, ["displayName"]) is not None: + setv(to_object, ["display_name"], getv(from_object, ["displayName"])) + + if getv(from_object, ["description"]) is not None: + setv(to_object, ["description"], getv(from_object, ["description"])) + + if getv(from_object, ["metric"]) is not None: + setv( + to_object, + ["metric"], + 
_UnifiedMetric_from_vertex(getv(from_object, ["metric"]), to_object), + ) + + return to_object + + def _EvaluationRunConfig_from_vertex( from_object: Union[dict[str, Any], object], parent_object: Optional[dict[str, Any]] = None, @@ -590,6 +634,24 @@ def _GetEvaluationItemParameters_to_vertex( return to_object +def _GetEvaluationMetricParameters_to_vertex( + from_object: Union[dict[str, Any], object], + parent_object: Optional[dict[str, Any]] = None, +) -> dict[str, Any]: + to_object: dict[str, Any] = {} + if getv(from_object, ["metric_resource_name"]) is not None: + setv( + to_object, + ["_url", "evaluation_metric"], + getv(from_object, ["metric_resource_name"]), + ) + + if getv(from_object, ["config"]) is not None: + setv(to_object, ["config"], getv(from_object, ["config"])) + + return to_object + + def _GetEvaluationRunParameters_to_vertex( from_object: Union[dict[str, Any], object], parent_object: Optional[dict[str, Any]] = None, @@ -618,6 +680,41 @@ def _GetEvaluationSetParameters_to_vertex( return to_object +def _ListEvaluationMetricsParameters_to_vertex( + from_object: Union[dict[str, Any], object], + parent_object: Optional[dict[str, Any]] = None, +) -> dict[str, Any]: + to_object: dict[str, Any] = {} + if getv(from_object, ["config"]) is not None: + setv(to_object, ["config"], getv(from_object, ["config"])) + + return to_object + + +def _ListEvaluationMetricsResponse_from_vertex( + from_object: Union[dict[str, Any], object], + parent_object: Optional[dict[str, Any]] = None, +) -> dict[str, Any]: + to_object: dict[str, Any] = {} + if getv(from_object, ["sdkHttpResponse"]) is not None: + setv(to_object, ["sdk_http_response"], getv(from_object, ["sdkHttpResponse"])) + + if getv(from_object, ["nextPageToken"]) is not None: + setv(to_object, ["next_page_token"], getv(from_object, ["nextPageToken"])) + + if getv(from_object, ["evaluationMetrics"]) is not None: + setv( + to_object, + ["evaluation_metrics"], + [ + _EvaluationMetric_from_vertex(item, to_object) + 
for item in getv(from_object, ["evaluationMetrics"]) + ], + ) + + return to_object + + def _RubricBasedMetricInput_to_vertex( from_object: Union[dict[str, Any], object], parent_object: Optional[dict[str, Any]] = None, @@ -866,6 +963,66 @@ def _create_evaluation_item( self._api_client._verify_response(return_value) return return_value + def _create_evaluation_metric( + self, + *, + display_name: Optional[str] = None, + description: Optional[str] = None, + metric: Optional[types.MetricOrDict] = None, + config: Optional[types.CreateEvaluationMetricConfigOrDict] = None, + ) -> types.EvaluationMetric: + """ + Creates an EvaluationMetric. + """ + + parameter_model = types._CreateEvaluationMetricParameters( + display_name=display_name, + description=description, + metric=metric, + config=config, + ) + + request_url_dict: Optional[dict[str, str]] + if not self._api_client.vertexai: + raise ValueError("This method is only supported in the Vertex AI client.") + else: + request_dict = _CreateEvaluationMetricParameters_to_vertex(parameter_model) + request_url_dict = request_dict.get("_url") + if request_url_dict: + path = "evaluationMetrics".format_map(request_url_dict) + else: + path = "evaluationMetrics" + + query_params = request_dict.get("_query") + if query_params: + path = f"{path}?{urlencode(query_params)}" + # TODO: remove the hack that pops config. 
+ request_dict.pop("config", None) + + http_options: Optional[types.HttpOptions] = None + if ( + parameter_model.config is not None + and parameter_model.config.http_options is not None + ): + http_options = parameter_model.config.http_options + + request_dict = _common.convert_to_dict(request_dict) + request_dict = _common.encode_unserializable_types(request_dict) + + response = self._api_client.request("post", path, request_dict, http_options) + + response_dict = {} if not response.body else json.loads(response.body) + + if self._api_client.vertexai: + response_dict = _EvaluationMetric_from_vertex(response_dict) + + return_value = types.EvaluationMetric._from_response( + response=response_dict, kwargs=parameter_model.model_dump() + ) + + self._api_client._verify_response(return_value) + return return_value + def _create_evaluation_run( self, *, @@ -1196,6 +1353,62 @@ def _generate_user_scenarios( self._api_client._verify_response(return_value) return return_value + def _get_evaluation_metric( + self, + *, + metric_resource_name: str, + config: Optional[types.GetEvaluationMetricConfigOrDict] = None, + ) -> types.EvaluationMetric: + """ + Retrieves an EvaluationMetric from the resource name. + """ + + parameter_model = types._GetEvaluationMetricParameters( + metric_resource_name=metric_resource_name, + config=config, + ) + + request_url_dict: Optional[dict[str, str]] + if not self._api_client.vertexai: + raise ValueError("This method is only supported in the Vertex AI client.") + else: + request_dict = _GetEvaluationMetricParameters_to_vertex(parameter_model) + request_url_dict = request_dict.get("_url") + if request_url_dict: + path = "{evaluation_metric}".format_map(request_url_dict) + else: + path = "{evaluation_metric}" + + query_params = request_dict.get("_query") + if query_params: + path = f"{path}?{urlencode(query_params)}" + # TODO: remove the hack that pops config. 
+ request_dict.pop("config", None) + + http_options: Optional[types.HttpOptions] = None + if ( + parameter_model.config is not None + and parameter_model.config.http_options is not None + ): + http_options = parameter_model.config.http_options + + request_dict = _common.convert_to_dict(request_dict) + request_dict = _common.encode_unserializable_types(request_dict) + + response = self._api_client.request("get", path, request_dict, http_options) + + response_dict = {} if not response.body else json.loads(response.body) + + if self._api_client.vertexai: + response_dict = _EvaluationMetric_from_vertex(response_dict) + + return_value = types.EvaluationMetric._from_response( + response=response_dict, kwargs=parameter_model.model_dump() + ) + + self._api_client._verify_response(return_value) + return return_value + def _get_evaluation_run( self, *, name: str, config: Optional[types.GetEvaluationRunConfigOrDict] = None ) -> types.EvaluationRun: @@ -1349,6 +1562,58 @@ def _get_evaluation_item( self._api_client._verify_response(return_value) return return_value + def _list_evaluation_metrics( + self, *, config: Optional[types.ListEvaluationMetricsConfigOrDict] = None + ) -> types.ListEvaluationMetricsResponse: + """ + Lists EvaluationMetrics. + """ + + parameter_model = types._ListEvaluationMetricsParameters( + config=config, + ) + + request_url_dict: Optional[dict[str, str]] + if not self._api_client.vertexai: + raise ValueError("This method is only supported in the Vertex AI client.") + else: + request_dict = _ListEvaluationMetricsParameters_to_vertex(parameter_model) + request_url_dict = request_dict.get("_url") + if request_url_dict: + path = "evaluationMetrics".format_map(request_url_dict) + else: + path = "evaluationMetrics" + + query_params = request_dict.get("_query") + if query_params: + path = f"{path}?{urlencode(query_params)}" + # TODO: remove the hack that pops config. 
+ request_dict.pop("config", None) + + http_options: Optional[types.HttpOptions] = None + if ( + parameter_model.config is not None + and parameter_model.config.http_options is not None + ): + http_options = parameter_model.config.http_options + + request_dict = _common.convert_to_dict(request_dict) + request_dict = _common.encode_unserializable_types(request_dict) + + response = self._api_client.request("get", path, request_dict, http_options) + + response_dict = {} if not response.body else json.loads(response.body) + + if self._api_client.vertexai: + response_dict = _ListEvaluationMetricsResponse_from_vertex(response_dict) + + return_value = types.ListEvaluationMetricsResponse._from_response( + response=response_dict, kwargs=parameter_model.model_dump() + ) + + self._api_client._verify_response(return_value) + return return_value + def evaluate_instances( self, *, @@ -2090,6 +2355,66 @@ def generate_user_scenarios( ) return _evals_utils._postprocess_user_scenarios_response(response) + @_common.experimental_warning( + "The Vertex SDK GenAI evals.create_evaluation_metric method is experimental, " + "and may change in future versions." + ) + def create_evaluation_metric( + self, + *, + display_name: Optional[str] = None, + description: Optional[str] = None, + metric: Optional[types.MetricOrDict] = None, + config: Optional[types.CreateEvaluationMetricConfigOrDict] = None, + ) -> str: + """Creates an EvaluationMetric.""" + if metric and not isinstance(metric, dict): + # metric is now Metric | LazyLoadedPrebuiltMetric (RubricMetric) + # Mypy correctly narrows the type here, so cast is not needed. 
+ resolved_metrics = _evals_common._resolve_metrics( + [metric], self._api_client + ) + metric = resolved_metrics[0] + + result = self._create_evaluation_metric( + display_name=display_name, + description=description, + metric=metric, + config=config, + ) + # result.name is Optional[str], but we know it's always returned on creation + return cast(str, result.name) + + @_common.experimental_warning( + "The Vertex SDK GenAI evals.get_evaluation_metric module is experimental, " + "and may change in future versions." + ) + def get_evaluation_metric( + self, + *, + metric_resource_name: str, + config: Optional[types.GetEvaluationMetricConfigOrDict] = None, + ) -> types.EvaluationMetric: + """Retrieves an EvaluationMetric from the resource name.""" + return self._get_evaluation_metric( + metric_resource_name=metric_resource_name, + config=config, + ) + + @_common.experimental_warning( + "The Vertex SDK GenAI evals.list_evaluation_metrics module is experimental, " + "and may change in future versions." + ) + def list_evaluation_metrics( + self, + *, + config: Optional[types.ListEvaluationMetricsConfigOrDict] = None, + ) -> types.ListEvaluationMetricsResponse: + """Lists EvaluationMetrics.""" + return self._list_evaluation_metrics( + config=config, + ) + class AsyncEvals(_api_module.BaseModule): @@ -2152,6 +2477,68 @@ async def _create_evaluation_item( self._api_client._verify_response(return_value) return return_value + async def _create_evaluation_metric( + self, + *, + display_name: Optional[str] = None, + description: Optional[str] = None, + metric: Optional[types.MetricOrDict] = None, + config: Optional[types.CreateEvaluationMetricConfigOrDict] = None, + ) -> types.EvaluationMetric: + """ + Creates an EvaluationMetric. 
+ """ + + parameter_model = types._CreateEvaluationMetricParameters( + display_name=display_name, + description=description, + metric=metric, + config=config, + ) + + request_url_dict: Optional[dict[str, str]] + if not self._api_client.vertexai: + raise ValueError("This method is only supported in the Vertex AI client.") + else: + request_dict = _CreateEvaluationMetricParameters_to_vertex(parameter_model) + request_url_dict = request_dict.get("_url") + if request_url_dict: + path = "evaluationMetrics".format_map(request_url_dict) + else: + path = "evaluationMetrics" + + query_params = request_dict.get("_query") + if query_params: + path = f"{path}?{urlencode(query_params)}" + # TODO: remove the hack that pops config. + request_dict.pop("config", None) + + http_options: Optional[types.HttpOptions] = None + if ( + parameter_model.config is not None + and parameter_model.config.http_options is not None + ): + http_options = parameter_model.config.http_options + + request_dict = _common.convert_to_dict(request_dict) + request_dict = _common.encode_unserializable_types(request_dict) + + response = await self._api_client.async_request( + "post", path, request_dict, http_options + ) + + response_dict = {} if not response.body else json.loads(response.body) + + if self._api_client.vertexai: + response_dict = _EvaluationMetric_from_vertex(response_dict) + + return_value = types.EvaluationMetric._from_response( + response=response_dict, kwargs=parameter_model.model_dump() + ) + + self._api_client._verify_response(return_value) + return return_value + async def _create_evaluation_run( self, *, @@ -2492,6 +2879,64 @@ async def _generate_user_scenarios( self._api_client._verify_response(return_value) return return_value + async def _get_evaluation_metric( + self, + *, + metric_resource_name: str, + config: Optional[types.GetEvaluationMetricConfigOrDict] = None, + ) -> types.EvaluationMetric: + """ + Retrieves an EvaluationMetric from the resource name. 
+ """ + + parameter_model = types._GetEvaluationMetricParameters( + metric_resource_name=metric_resource_name, + config=config, + ) + + request_url_dict: Optional[dict[str, str]] + if not self._api_client.vertexai: + raise ValueError("This method is only supported in the Vertex AI client.") + else: + request_dict = _GetEvaluationMetricParameters_to_vertex(parameter_model) + request_url_dict = request_dict.get("_url") + if request_url_dict: + path = "{evaluation_metric}".format_map(request_url_dict) + else: + path = "{evaluation_metric}" + + query_params = request_dict.get("_query") + if query_params: + path = f"{path}?{urlencode(query_params)}" + # TODO: remove the hack that pops config. + request_dict.pop("config", None) + + http_options: Optional[types.HttpOptions] = None + if ( + parameter_model.config is not None + and parameter_model.config.http_options is not None + ): + http_options = parameter_model.config.http_options + + request_dict = _common.convert_to_dict(request_dict) + request_dict = _common.encode_unserializable_types(request_dict) + + response = await self._api_client.async_request( + "get", path, request_dict, http_options + ) + + response_dict = {} if not response.body else json.loads(response.body) + + if self._api_client.vertexai: + response_dict = _EvaluationMetric_from_vertex(response_dict) + + return_value = types.EvaluationMetric._from_response( + response=response_dict, kwargs=parameter_model.model_dump() + ) + + self._api_client._verify_response(return_value) + return return_value + async def _get_evaluation_run( self, *, name: str, config: Optional[types.GetEvaluationRunConfigOrDict] = None ) -> types.EvaluationRun: @@ -2651,6 +3096,60 @@ async def _get_evaluation_item( self._api_client._verify_response(return_value) return return_value + async def _list_evaluation_metrics( + self, *, config: Optional[types.ListEvaluationMetricsConfigOrDict] = None + ) -> types.ListEvaluationMetricsResponse: + """ + Lists EvaluationMetrics. 
+ """ + + parameter_model = types._ListEvaluationMetricsParameters( + config=config, + ) + + request_url_dict: Optional[dict[str, str]] + if not self._api_client.vertexai: + raise ValueError("This method is only supported in the Vertex AI client.") + else: + request_dict = _ListEvaluationMetricsParameters_to_vertex(parameter_model) + request_url_dict = request_dict.get("_url") + if request_url_dict: + path = "evaluationMetrics".format_map(request_url_dict) + else: + path = "evaluationMetrics" + + query_params = request_dict.get("_query") + if query_params: + path = f"{path}?{urlencode(query_params)}" + # TODO: remove the hack that pops config. + request_dict.pop("config", None) + + http_options: Optional[types.HttpOptions] = None + if ( + parameter_model.config is not None + and parameter_model.config.http_options is not None + ): + http_options = parameter_model.config.http_options + + request_dict = _common.convert_to_dict(request_dict) + request_dict = _common.encode_unserializable_types(request_dict) + + response = await self._api_client.async_request( + "get", path, request_dict, http_options + ) + + response_dict = {} if not response.body else json.loads(response.body) + + if self._api_client.vertexai: + response_dict = _ListEvaluationMetricsResponse_from_vertex(response_dict) + + return_value = types.ListEvaluationMetricsResponse._from_response( + response=response_dict, kwargs=parameter_model.model_dump() + ) + + self._api_client._verify_response(return_value) + return return_value + async def batch_evaluate( self, *, @@ -3030,3 +3529,60 @@ async def generate_user_scenarios( root_agent_id=root_agent_id, ) return _evals_utils._postprocess_user_scenarios_response(response) + + @_common.experimental_warning( + "The Vertex SDK GenAI evals.create_evaluation_metric module is experimental, " + "and may change in future versions." 
+ ) + async def create_evaluation_metric( + self, + *, + display_name: Optional[str] = None, + description: Optional[str] = None, + metric: Optional[types.MetricOrDict] = None, + config: Optional[types.CreateEvaluationMetricConfigOrDict] = None, + ) -> str: + """Creates an EvaluationMetric.""" + if metric and not isinstance(metric, dict): + resolved_metrics = _evals_common._resolve_metrics( + [metric], self._api_client + ) + metric = resolved_metrics[0] + + result = await self._create_evaluation_metric( + display_name=display_name, + description=description, + metric=metric, + config=config, + ) + return cast(str, result.name) + + @_common.experimental_warning( + "The Vertex SDK GenAI evals.get_evaluation_metric module is experimental, " + "and may change in future versions." + ) + async def get_evaluation_metric( + self, + *, + metric_resource_name: str, + config: Optional[types.GetEvaluationMetricConfigOrDict] = None, + ) -> types.EvaluationMetric: + """Retrieves an EvaluationMetric from the resource name.""" + return await self._get_evaluation_metric( + metric_resource_name=metric_resource_name, + config=config, + ) + + @_common.experimental_warning( + "The Vertex SDK GenAI evals.list_evaluation_metrics module is experimental, " + "and may change in future versions." 
+ ) + async def list_evaluation_metrics( + self, + *, + config: Optional[types.ListEvaluationMetricsConfigOrDict] = None, + ) -> types.ListEvaluationMetricsResponse: + """Lists EvaluationMetrics.""" + return await self._list_evaluation_metrics( + config=config, + ) diff --git a/vertexai/_genai/types/__init__.py b/vertexai/_genai/types/__init__.py index f0615c1fd7..d8982b15dc 100644 --- a/vertexai/_genai/types/__init__.py +++ b/vertexai/_genai/types/__init__.py @@ -34,6 +34,7 @@ from .common import _CreateDatasetParameters from .common import _CreateDatasetVersionParameters from .common import _CreateEvaluationItemParameters +from .common import _CreateEvaluationMetricParameters from .common import _CreateEvaluationRunParameters from .common import _CreateEvaluationSetParameters from .common import _CreateMultimodalDatasetParameters @@ -68,6 +69,7 @@ from .common import _GetDatasetParameters from .common import _GetDatasetVersionParameters from .common import _GetEvaluationItemParameters +from .common import _GetEvaluationMetricParameters from .common import _GetEvaluationRunParameters from .common import _GetEvaluationSetParameters from .common import _GetMultimodalDatasetOperationParameters @@ -82,6 +84,7 @@ from .common import _ListAgentEngineTasksRequestParameters from .common import _ListDatasetsRequestParameters from .common import _ListDatasetVersionsRequestParameters +from .common import _ListEvaluationMetricsParameters from .common import _ListMultimodalDatasetsRequestParameters from .common import _OptimizeRequestParameters from .common import _OptimizeRequestParameters @@ -239,6 +242,9 @@ from .common import CreateEvaluationItemConfig from .common import CreateEvaluationItemConfigDict from .common import CreateEvaluationItemConfigOrDict +from .common import CreateEvaluationMetricConfig +from .common import CreateEvaluationMetricConfigDict +from .common import CreateEvaluationMetricConfigOrDict from .common import CreateEvaluationRunConfig from .common 
import CreateEvaluationRunConfigDict from .common import CreateEvaluationRunConfigOrDict @@ -363,6 +369,9 @@ from .common import EvaluationItemResultDict from .common import EvaluationItemResultOrDict from .common import EvaluationItemType +from .common import EvaluationMetric +from .common import EvaluationMetricDict +from .common import EvaluationMetricOrDict from .common import EvaluationPrompt from .common import EvaluationPromptDict from .common import EvaluationPromptOrDict @@ -525,6 +534,9 @@ from .common import GetEvaluationItemConfig from .common import GetEvaluationItemConfigDict from .common import GetEvaluationItemConfigOrDict +from .common import GetEvaluationMetricConfig +from .common import GetEvaluationMetricConfigDict +from .common import GetEvaluationMetricConfigOrDict from .common import GetEvaluationRunConfig from .common import GetEvaluationRunConfigDict from .common import GetEvaluationRunConfigOrDict @@ -589,6 +601,12 @@ from .common import ListDatasetVersionsResponse from .common import ListDatasetVersionsResponseDict from .common import ListDatasetVersionsResponseOrDict +from .common import ListEvaluationMetricsConfig +from .common import ListEvaluationMetricsConfigDict +from .common import ListEvaluationMetricsConfigOrDict +from .common import ListEvaluationMetricsResponse +from .common import ListEvaluationMetricsResponseDict +from .common import ListEvaluationMetricsResponseOrDict from .common import ListMultimodalDatasetsConfig from .common import ListMultimodalDatasetsConfigDict from .common import ListMultimodalDatasetsConfigOrDict @@ -1313,15 +1331,12 @@ "EvaluationItem", "EvaluationItemDict", "EvaluationItemOrDict", - "SamplingConfig", - "SamplingConfigDict", - "SamplingConfigOrDict", - "BigQueryRequestSet", - "BigQueryRequestSetDict", - "BigQueryRequestSetOrDict", - "EvaluationRunDataSource", - "EvaluationRunDataSourceDict", - "EvaluationRunDataSourceOrDict", + "Metric", + "MetricDict", + "MetricOrDict", + 
"CreateEvaluationMetricConfig", + "CreateEvaluationMetricConfigDict", + "CreateEvaluationMetricConfigOrDict", "PredefinedMetricSpec", "PredefinedMetricSpecDict", "PredefinedMetricSpecOrDict", @@ -1340,6 +1355,18 @@ "UnifiedMetric", "UnifiedMetricDict", "UnifiedMetricOrDict", + "EvaluationMetric", + "EvaluationMetricDict", + "EvaluationMetricOrDict", + "SamplingConfig", + "SamplingConfigDict", + "SamplingConfigOrDict", + "BigQueryRequestSet", + "BigQueryRequestSetDict", + "BigQueryRequestSetOrDict", + "EvaluationRunDataSource", + "EvaluationRunDataSourceDict", + "EvaluationRunDataSourceOrDict", "EvaluationRunMetric", "EvaluationRunMetricDict", "EvaluationRunMetricOrDict", @@ -1505,9 +1532,6 @@ "RubricBasedMetricInput", "RubricBasedMetricInputDict", "RubricBasedMetricInputOrDict", - "Metric", - "MetricDict", - "MetricOrDict", "MetricSource", "MetricSourceDict", "MetricSourceOrDict", @@ -1592,6 +1616,9 @@ "GenerateUserScenariosResponse", "GenerateUserScenariosResponseDict", "GenerateUserScenariosResponseOrDict", + "GetEvaluationMetricConfig", + "GetEvaluationMetricConfigDict", + "GetEvaluationMetricConfigOrDict", "GetEvaluationRunConfig", "GetEvaluationRunConfigDict", "GetEvaluationRunConfigOrDict", @@ -1601,6 +1628,12 @@ "GetEvaluationItemConfig", "GetEvaluationItemConfigDict", "GetEvaluationItemConfigOrDict", + "ListEvaluationMetricsConfig", + "ListEvaluationMetricsConfigDict", + "ListEvaluationMetricsConfigOrDict", + "ListEvaluationMetricsResponse", + "ListEvaluationMetricsResponseDict", + "ListEvaluationMetricsResponseOrDict", "OptimizeConfig", "OptimizeConfigDict", "OptimizeConfigOrDict", @@ -2238,8 +2271,8 @@ "MachineConfig", "Framework", "EvaluationItemType", - "SamplingMethod", "RubricContentType", + "SamplingMethod", "EvaluationRunState", "OptimizeTarget", "MemoryMetadataMergeStrategy", @@ -2273,14 +2306,17 @@ "_AppendAgentEngineTaskEventRequestParameters", "_ListAgentEngineTaskEventsRequestParameters", "_CreateEvaluationItemParameters", + 
"_CreateEvaluationMetricParameters", "_CreateEvaluationRunParameters", "_CreateEvaluationSetParameters", "_EvaluateInstancesRequestParameters", "_GenerateInstanceRubricsRequest", "_GenerateUserScenariosParameters", + "_GetEvaluationMetricParameters", "_GetEvaluationRunParameters", "_GetEvaluationSetParameters", "_GetEvaluationItemParameters", + "_ListEvaluationMetricsParameters", "_OptimizeRequestParameters", "_CustomJobParameters", "_GetCustomJobParameters", diff --git a/vertexai/_genai/types/common.py b/vertexai/_genai/types/common.py index 3aef4b9cba..315bde2e64 100644 --- a/vertexai/_genai/types/common.py +++ b/vertexai/_genai/types/common.py @@ -402,15 +402,6 @@ class EvaluationItemType(_common.CaseInSensitiveEnum): """The EvaluationItem is the result of evaluation.""" -class SamplingMethod(_common.CaseInSensitiveEnum): - """Represents the sampling method for a BigQuery request set.""" - - UNSPECIFIED = "UNSPECIFIED" - """Sampling method is unspecified.""" - RANDOM = "RANDOM" - """Sampling method is random.""" - - class RubricContentType(_common.CaseInSensitiveEnum): """Specifies the type of rubric content to generate.""" @@ -422,6 +413,15 @@ class RubricContentType(_common.CaseInSensitiveEnum): """Generate rubrics in a unit test format.""" +class SamplingMethod(_common.CaseInSensitiveEnum): + """Represents the sampling method for a BigQuery request set.""" + + UNSPECIFIED = "UNSPECIFIED" + """Sampling method is unspecified.""" + RANDOM = "RANDOM" + """Sampling method is random.""" + + class EvaluationRunState(_common.CaseInSensitiveEnum): """Represents the state of an evaluation run.""" @@ -2137,143 +2137,382 @@ class EvaluationItemDict(TypedDict, total=False): EvaluationItemOrDict = Union[EvaluationItem, EvaluationItemDict] -class SamplingConfig(_common.BaseModel): - """Sampling config for a BigQuery request set.""" - - sampling_count: Optional[int] = Field(default=None, description="""""") - sampling_method: Optional[SamplingMethod] = Field(default=None, 
description="""""") - sampling_duration: Optional[str] = Field(default=None, description="""""") - - -class SamplingConfigDict(TypedDict, total=False): - """Sampling config for a BigQuery request set.""" - - sampling_count: Optional[int] - """""" - - sampling_method: Optional[SamplingMethod] - """""" - - sampling_duration: Optional[str] - """""" - - -SamplingConfigOrDict = Union[SamplingConfig, SamplingConfigDict] - - -class BigQueryRequestSet(_common.BaseModel): - """Represents a BigQuery request set.""" +class Metric(_common.BaseModel): + """The metric used for evaluation.""" - uri: Optional[str] = Field(default=None, description="""""") - prompt_column: Optional[str] = Field( + name: Optional[str] = Field(default=None, description="""The name of the metric.""") + custom_function: Optional[Callable[..., Any]] = Field( default=None, - description="""The column name of the prompt in the BigQuery table. Used for EvaluationRun only.""", + description="""The custom function that defines the end-to-end logic for metric computation.""", ) - rubrics_column: Optional[str] = Field( + prompt_template: Optional[str] = Field( + default=None, description="""The prompt template for the metric.""" + ) + judge_model_system_instruction: Optional[str] = Field( + default=None, description="""The system instruction for the judge model.""" + ) + return_raw_output: Optional[bool] = Field( default=None, - description="""The column name of the rubrics in the BigQuery table. Used for EvaluationRun only.""", + description="""Whether to return the raw output from the judge model.""", ) - candidate_response_columns: Optional[dict[str, str]] = Field( + parse_and_reduce_fn: Optional[Callable[..., Any]] = Field( default=None, - description="""The column name of the response candidates in the BigQuery table. 
Used for EvaluationRun only.""", + description="""The parse and reduce function for the judge model.""", ) - sampling_config: Optional[SamplingConfig] = Field( + aggregate_summary_fn: Optional[Callable[..., Any]] = Field( default=None, - description="""The sampling config for the BigQuery request set. Used for EvaluationRun only.""", + description="""The aggregate summary function for the judge model.""", + ) + remote_custom_function: Optional[str] = Field( + default=None, + description="""The evaluation function for the custom code execution metric. This custom code is run remotely in the evaluation service.""", + ) + judge_model: Optional[str] = Field( + default=None, description="""The judge model for the metric.""" + ) + judge_model_generation_config: Optional[genai_types.GenerationConfig] = Field( + default=None, + description="""The generation config for the judge LLM (temperature, top_k, top_p, etc).""", + ) + judge_model_sampling_count: Optional[int] = Field( + default=None, description="""The sampling count for the judge model.""" + ) + rubric_group_name: Optional[str] = Field( + default=None, + description="""The rubric group name for the rubric-based metric.""", + ) + metric_spec_parameters: Optional[dict[str, Any]] = Field( + default=None, + description="""Optional steering instruction parameters for the automated predefined metric.""", + ) + metric_resource_name: Optional[str] = Field( + default=None, + description="""The resource name of the metric definition. Example: projects/{project}/locations/{location}/evaluationMetrics/{evaluation_metric_id}""", ) + # Allow extra fields to support metric-specific config fields. + model_config = ConfigDict(extra="allow") -class BigQueryRequestSetDict(TypedDict, total=False): - """Represents a BigQuery request set.""" - - uri: Optional[str] - """""" - - prompt_column: Optional[str] - """The column name of the prompt in the BigQuery table. 
Used for EvaluationRun only.""" - - rubrics_column: Optional[str] - """The column name of the rubrics in the BigQuery table. Used for EvaluationRun only.""" - - candidate_response_columns: Optional[dict[str, str]] - """The column name of the response candidates in the BigQuery table. Used for EvaluationRun only.""" - - sampling_config: Optional[SamplingConfigDict] - """The sampling config for the BigQuery request set. Used for EvaluationRun only.""" - + _is_predefined: bool = PrivateAttr(default=False) + """A boolean indicating whether the metric is predefined.""" -BigQueryRequestSetOrDict = Union[BigQueryRequestSet, BigQueryRequestSetDict] + _config_source: Optional[str] = PrivateAttr(default=None) + """An optional string indicating the source of the metric configuration.""" + _version: Optional[str] = PrivateAttr(default=None) + """An optional string indicating the version of the metric.""" -class EvaluationRunDataSource(_common.BaseModel): - """Represents an evaluation run data source.""" + @model_validator(mode="after") + @classmethod + def validate_name(cls, model: "Metric") -> "Metric": + if not model.name: + raise ValueError("Metric name cannot be empty.") + model.name = model.name.lower() + return model - evaluation_set: Optional[str] = Field(default=None, description="""""") - bigquery_request_set: Optional[BigQueryRequestSet] = Field( - default=None, description="""""" - ) + def to_yaml_file(self, file_path: str, version: Optional[str] = None) -> None: + """Dumps the metric object to a YAML file. + Args: + file_path: The path to the YAML file. + version: Optional version string to include in the YAML output. -class EvaluationRunDataSourceDict(TypedDict, total=False): - """Represents an evaluation run data source.""" + Raises: + ImportError: If the pyyaml library is not installed. + """ + if yaml is None: + raise ImportError( + "YAML serialization requires the pyyaml library. Please install" + " it using 'pip install google-cloud-aiplatform[evaluation]'." 
+ ) - evaluation_set: Optional[str] - """""" + fields_to_exclude = { + field_name + for field_name, field_info in self.model_fields.items() + if self.__getattribute__(field_name) is not None + and isinstance(self.__getattribute__(field_name), Callable) + } - bigquery_request_set: Optional[BigQueryRequestSetDict] - """""" + data_to_dump = self.model_dump( + exclude_unset=True, + exclude_none=True, + mode="json", + exclude=fields_to_exclude if fields_to_exclude else None, + ) + if version: + data_to_dump["version"] = version -EvaluationRunDataSourceOrDict = Union[ - EvaluationRunDataSource, EvaluationRunDataSourceDict -] + with open(file_path, "w", encoding="utf-8") as f: + yaml.dump(data_to_dump, f, sort_keys=False, allow_unicode=True) -class PredefinedMetricSpec(_common.BaseModel): - """Spec for predefined metric.""" +class LLMMetric(Metric): + """A metric that uses LLM-as-a-judge for evaluation.""" - metric_spec_name: Optional[str] = Field( - default=None, - description="""The name of a pre-defined metric, such as "instruction_following_v1" or - "text_quality_v1".""", - ) - metric_spec_parameters: Optional[dict[str, Any]] = Field( + rubric_group_name: Optional[str] = Field( default=None, - description="""The parameters needed to run the pre-defined metric.""", + description="""Optional. 
The name of the column in the EvaluationDataset containing the list of rubrics to use for this metric.""", ) + @field_validator("prompt_template", mode="before") + @classmethod + def validate_prompt_template(cls, value: Union[str, "MetricPromptBuilder"]) -> str: + """Validates prompt template to be a non-empty string.""" + if value is None: + raise ValueError("Prompt template cannot be empty.") + if isinstance(value, MetricPromptBuilder): + value = str(value) + if not value.strip(): + raise ValueError("Prompt template cannot be an empty string.") + return value -class PredefinedMetricSpecDict(TypedDict, total=False): - """Spec for predefined metric.""" + @field_validator("judge_model_sampling_count") + @classmethod + def validate_judge_model_sampling_count(cls, value: Optional[int]) -> Optional[int]: + """Validates judge_model_sampling_count to be between 1 and 32.""" + if value is not None and (value < 1 or value > 32): + raise ValueError("judge_model_sampling_count must be between 1 and 32.") + return value - metric_spec_name: Optional[str] - """The name of a pre-defined metric, such as "instruction_following_v1" or - "text_quality_v1".""" + @classmethod + def load(cls, config_path: str, client: Optional[Any] = None) -> "LLMMetric": + """Loads a metric configuration from a YAML or JSON file. - metric_spec_parameters: Optional[dict[str, Any]] - """The parameters needed to run the pre-defined metric.""" + This method allows for the creation of an LLMMetric instance from a + local file path or a Google Cloud Storage (GCS) URI. It will automatically + detect the file type (.yaml, .yml, or .json) and parse it accordingly. + Args: + config_path: The local path or GCS URI (e.g., 'gs://bucket/metric.yaml') + to the metric configuration file. + client: Optional. The Vertex AI client instance to use for authentication. + If not provided, Application Default Credentials (ADC) will be used. 
-PredefinedMetricSpecOrDict = Union[PredefinedMetricSpec, PredefinedMetricSpecDict] + Returns: + An instance of LLMMetric configured with the loaded data. + Raises: + ValueError: If the file path is invalid or the file content cannot be parsed. + ImportError: If a required library like 'PyYAML' or 'google-cloud-storage' is not installed. + IOError: If the file cannot be read from the specified path. + """ + file_extension = os.path.splitext(config_path)[1].lower() + if file_extension not in [".yaml", ".yml", ".json"]: + raise ValueError( + "Unsupported file extension for metric config. Must be .yaml, .yml, or .json" + ) -class RubricGenerationSpec(_common.BaseModel): - """Spec for generating rubrics.""" + content_str: str + if config_path.startswith("gs://"): + try: + from google.cloud import storage # type: ignore[attr-defined] - prompt_template: Optional[str] = Field( - default=None, - description="""Template for the prompt used to generate rubrics. - The details should be updated based on the most-recent recipe requirements.""", - ) - rubric_content_type: Optional[RubricContentType] = Field( - default=None, description="""The type of rubric content to be generated.""" - ) - rubric_type_ontology: Optional[list[str]] = Field( - default=None, - description="""An optional, pre-defined list of allowed types for generated rubrics. - If this field is provided, it implies `include_rubric_type` should be true, - and the generated rubric types should be chosen from this ontology.""", - ) + storage_client = storage.Client( + credentials=client._api_client._credentials if client else None + ) + path_without_prefix = config_path[len("gs://") :] + bucket_name, blob_path = path_without_prefix.split("/", 1) + + bucket = storage_client.bucket(bucket_name) + blob = bucket.blob(blob_path) + content_str = blob.download_as_bytes().decode("utf-8") + except ImportError as e: + raise ImportError( + "Reading from GCS requires the 'google-cloud-storage' library. 
Please install it with 'pip install google-cloud-aiplatform[evaluation]'." + ) from e + except Exception as e: + raise IOError(f"Failed to read from GCS path {config_path}: {e}") from e + else: + try: + with open(config_path, "r", encoding="utf-8") as f: + content_str = f.read() + except FileNotFoundError: + raise FileNotFoundError( + f"Local configuration file not found at: {config_path}" + ) + except Exception as e: + raise IOError(f"Failed to read local file {config_path}: {e}") from e + + data: Dict[str, Any] + + if file_extension in [".yaml", ".yml"]: + if yaml is None: + raise ImportError( + "YAML parsing requires the pyyaml library. Please install it with 'pip install google-cloud-aiplatform[evaluation]'." + ) + data = yaml.safe_load(content_str) + elif file_extension == ".json": + data = json.loads(content_str) + + if not isinstance(data, dict): + raise ValueError("Metric config content did not parse into a dictionary.") + + return cls.model_validate(data) + + +class MetricDict(TypedDict, total=False): + """The metric used for evaluation.""" + + name: Optional[str] + """The name of the metric.""" + + custom_function: Optional[Callable[..., Any]] + """The custom function that defines the end-to-end logic for metric computation.""" + + prompt_template: Optional[str] + """The prompt template for the metric.""" + + judge_model_system_instruction: Optional[str] + """The system instruction for the judge model.""" + + return_raw_output: Optional[bool] + """Whether to return the raw output from the judge model.""" + + parse_and_reduce_fn: Optional[Callable[..., Any]] + """The parse and reduce function for the judge model.""" + + aggregate_summary_fn: Optional[Callable[..., Any]] + """The aggregate summary function for the judge model.""" + + remote_custom_function: Optional[str] + """The evaluation function for the custom code execution metric. 
This custom code is run remotely in the evaluation service.""" + + judge_model: Optional[str] + """The judge model for the metric.""" + + judge_model_generation_config: Optional[genai_types.GenerationConfigDict] + """The generation config for the judge LLM (temperature, top_k, top_p, etc).""" + + judge_model_sampling_count: Optional[int] + """The sampling count for the judge model.""" + + rubric_group_name: Optional[str] + """The rubric group name for the rubric-based metric.""" + + metric_spec_parameters: Optional[dict[str, Any]] + """Optional steering instruction parameters for the automated predefined metric.""" + + metric_resource_name: Optional[str] + """The resource name of the metric definition. Example: projects/{project}/locations/{location}/evaluationMetrics/{evaluation_metric_id}""" + + +MetricOrDict = Union[Metric, MetricDict] + + +class CreateEvaluationMetricConfig(_common.BaseModel): + """Config for creating an evaluation metric.""" + + http_options: Optional[genai_types.HttpOptions] = Field( + default=None, description="""Used to override HTTP request options.""" + ) + + +class CreateEvaluationMetricConfigDict(TypedDict, total=False): + """Config for creating an evaluation metric.""" + + http_options: Optional[genai_types.HttpOptionsDict] + """Used to override HTTP request options.""" + + +CreateEvaluationMetricConfigOrDict = Union[ + CreateEvaluationMetricConfig, CreateEvaluationMetricConfigDict +] + + +class _CreateEvaluationMetricParameters(_common.BaseModel): + """Parameters for creating an evaluation metric.""" + + display_name: Optional[str] = Field( + default=None, + description="""The user-defined name of the evaluation metric. + + The display name can be up to 128 characters long and can comprise any + UTF-8 characters. 
+ """, + ) + description: Optional[str] = Field( + default=None, description="""The description of the evaluation metric.""" + ) + metric: Optional[Metric] = Field( + default=None, + description="""The metric configuration of the evaluation metric.""", + ) + config: Optional[CreateEvaluationMetricConfig] = Field( + default=None, description="""""" + ) + + +class _CreateEvaluationMetricParametersDict(TypedDict, total=False): + """Parameters for creating an evaluation metric.""" + + display_name: Optional[str] + """The user-defined name of the evaluation metric. + + The display name can be up to 128 characters long and can comprise any + UTF-8 characters. + """ + + description: Optional[str] + """The description of the evaluation metric.""" + + metric: Optional[MetricDict] + """The metric configuration of the evaluation metric.""" + + config: Optional[CreateEvaluationMetricConfigDict] + """""" + + +_CreateEvaluationMetricParametersOrDict = Union[ + _CreateEvaluationMetricParameters, _CreateEvaluationMetricParametersDict +] + + +class PredefinedMetricSpec(_common.BaseModel): + """Spec for predefined metric.""" + + metric_spec_name: Optional[str] = Field( + default=None, + description="""The name of a pre-defined metric, such as "instruction_following_v1" or + "text_quality_v1".""", + ) + metric_spec_parameters: Optional[dict[str, Any]] = Field( + default=None, + description="""The parameters needed to run the pre-defined metric.""", + ) + + +class PredefinedMetricSpecDict(TypedDict, total=False): + """Spec for predefined metric.""" + + metric_spec_name: Optional[str] + """The name of a pre-defined metric, such as "instruction_following_v1" or + "text_quality_v1".""" + + metric_spec_parameters: Optional[dict[str, Any]] + """The parameters needed to run the pre-defined metric.""" + + +PredefinedMetricSpecOrDict = Union[PredefinedMetricSpec, PredefinedMetricSpecDict] + + +class RubricGenerationSpec(_common.BaseModel): + """Spec for generating rubrics.""" + + 
prompt_template: Optional[str] = Field( + default=None, + description="""Template for the prompt used to generate rubrics. + The details should be updated based on the most-recent recipe requirements.""", + ) + rubric_content_type: Optional[RubricContentType] = Field( + default=None, description="""The type of rubric content to be generated.""" + ) + rubric_type_ontology: Optional[list[str]] = Field( + default=None, + description="""An optional, pre-defined list of allowed types for generated rubrics. + If this field is provided, it implies `include_rubric_type` should be true, + and the generated rubric types should be chosen from this ontology.""", + ) generator_model_config: Optional[genai_types.AutoraterConfig] = Field( default=None, description="""Configuration for the model used in rubric generation. @@ -2485,6 +2724,136 @@ class UnifiedMetricDict(TypedDict, total=False): UnifiedMetricOrDict = Union[UnifiedMetric, UnifiedMetricDict] +class EvaluationMetric(_common.BaseModel): + """Represents an evaluation metric.""" + + name: Optional[str] = Field( + default=None, description="""The resource name of the evaluation metric.""" + ) + display_name: Optional[str] = Field( + default=None, + description="""The user-friendly display name for the EvaluationMetric.""", + ) + description: Optional[str] = Field( + default=None, description="""The description of the EvaluationMetric.""" + ) + metric: Optional[UnifiedMetric] = Field( + default=None, + description="""The metric configuration of the evaluation metric.""", + ) + + +class EvaluationMetricDict(TypedDict, total=False): + """Represents an evaluation metric.""" + + name: Optional[str] + """The resource name of the evaluation metric.""" + + display_name: Optional[str] + """The user-friendly display name for the EvaluationMetric.""" + + description: Optional[str] + """The description of the EvaluationMetric.""" + + metric: Optional[UnifiedMetricDict] + """The metric configuration of the evaluation metric.""" + + 
+EvaluationMetricOrDict = Union[EvaluationMetric, EvaluationMetricDict] + + +class SamplingConfig(_common.BaseModel): + """Sampling config for a BigQuery request set.""" + + sampling_count: Optional[int] = Field(default=None, description="""""") + sampling_method: Optional[SamplingMethod] = Field(default=None, description="""""") + sampling_duration: Optional[str] = Field(default=None, description="""""") + + +class SamplingConfigDict(TypedDict, total=False): + """Sampling config for a BigQuery request set.""" + + sampling_count: Optional[int] + """""" + + sampling_method: Optional[SamplingMethod] + """""" + + sampling_duration: Optional[str] + """""" + + +SamplingConfigOrDict = Union[SamplingConfig, SamplingConfigDict] + + +class BigQueryRequestSet(_common.BaseModel): + """Represents a BigQuery request set.""" + + uri: Optional[str] = Field(default=None, description="""""") + prompt_column: Optional[str] = Field( + default=None, + description="""The column name of the prompt in the BigQuery table. Used for EvaluationRun only.""", + ) + rubrics_column: Optional[str] = Field( + default=None, + description="""The column name of the rubrics in the BigQuery table. Used for EvaluationRun only.""", + ) + candidate_response_columns: Optional[dict[str, str]] = Field( + default=None, + description="""The column name of the response candidates in the BigQuery table. Used for EvaluationRun only.""", + ) + sampling_config: Optional[SamplingConfig] = Field( + default=None, + description="""The sampling config for the BigQuery request set. Used for EvaluationRun only.""", + ) + + +class BigQueryRequestSetDict(TypedDict, total=False): + """Represents a BigQuery request set.""" + + uri: Optional[str] + """""" + + prompt_column: Optional[str] + """The column name of the prompt in the BigQuery table. Used for EvaluationRun only.""" + + rubrics_column: Optional[str] + """The column name of the rubrics in the BigQuery table. 
Used for EvaluationRun only.""" + + candidate_response_columns: Optional[dict[str, str]] + """The column name of the response candidates in the BigQuery table. Used for EvaluationRun only.""" + + sampling_config: Optional[SamplingConfigDict] + """The sampling config for the BigQuery request set. Used for EvaluationRun only.""" + + +BigQueryRequestSetOrDict = Union[BigQueryRequestSet, BigQueryRequestSetDict] + + +class EvaluationRunDataSource(_common.BaseModel): + """Represents an evaluation run data source.""" + + evaluation_set: Optional[str] = Field(default=None, description="""""") + bigquery_request_set: Optional[BigQueryRequestSet] = Field( + default=None, description="""""" + ) + + +class EvaluationRunDataSourceDict(TypedDict, total=False): + """Represents an evaluation run data source.""" + + evaluation_set: Optional[str] + """""" + + bigquery_request_set: Optional[BigQueryRequestSetDict] + """""" + + +EvaluationRunDataSourceOrDict = Union[ + EvaluationRunDataSource, EvaluationRunDataSourceDict +] + + class EvaluationRunMetric(_common.BaseModel): """The metric used for evaluation run.""" @@ -4300,383 +4669,120 @@ class RubricBasedMetricSpecDict(TypedDict, total=False): """Dynamically generate rubrics for evaluation using this specification.""" -RubricBasedMetricSpecOrDict = Union[RubricBasedMetricSpec, RubricBasedMetricSpecDict] - - -class RubricEnhancedContents(_common.BaseModel): - """Rubric-enhanced contents for evaluation.""" - - prompt: Optional[list[genai_types.Content]] = Field( - default=None, - description="""User prompt, using the standard Content type from the Gen AI SDK.""", - ) - rubric_groups: Optional[dict[str, "RubricGroup"]] = Field( - default=None, - description="""Named groups of rubrics associated with this prompt. 
- The key is a user-defined name for the rubric group.""", - ) - response: Optional[list[genai_types.Content]] = Field( - default=None, - description="""Response, using the standard Content type from the Gen AI SDK.""", - ) - other_content: Optional[ContentMap] = Field( - default=None, - description="""Other contents needed for the metric. - For example, if `reference` is needed for the metric, it can be provided - here.""", - ) - - -class RubricEnhancedContentsDict(TypedDict, total=False): - """Rubric-enhanced contents for evaluation.""" - - prompt: Optional[list[genai_types.ContentDict]] - """User prompt, using the standard Content type from the Gen AI SDK.""" - - rubric_groups: Optional[dict[str, "RubricGroup"]] - """Named groups of rubrics associated with this prompt. - The key is a user-defined name for the rubric group.""" - - response: Optional[list[genai_types.ContentDict]] - """Response, using the standard Content type from the Gen AI SDK.""" - - other_content: Optional[ContentMapDict] - """Other contents needed for the metric. - For example, if `reference` is needed for the metric, it can be provided - here.""" - - -RubricEnhancedContentsOrDict = Union[RubricEnhancedContents, RubricEnhancedContentsDict] - - -class RubricBasedMetricInstance(_common.BaseModel): - """Defines an instance for Rubric-based metrics. - - This class allows various input formats. 
- """ - - json_instance: Optional[str] = Field( - default=None, - description="""Specify evaluation fields and their string values in JSON format.""", - ) - content_map_instance: Optional[ContentMap] = Field( - default=None, - description="""Specify evaluation fields and their content values using a ContentMap.""", - ) - rubric_enhanced_contents: Optional[RubricEnhancedContents] = Field( - default=None, - description="""Provide input as Gemini Content along with one or more - associated rubric groups.""", - ) - - -class RubricBasedMetricInstanceDict(TypedDict, total=False): - """Defines an instance for Rubric-based metrics. - - This class allows various input formats. - """ - - json_instance: Optional[str] - """Specify evaluation fields and their string values in JSON format.""" - - content_map_instance: Optional[ContentMapDict] - """Specify evaluation fields and their content values using a ContentMap.""" - - rubric_enhanced_contents: Optional[RubricEnhancedContentsDict] - """Provide input as Gemini Content along with one or more - associated rubric groups.""" - - -RubricBasedMetricInstanceOrDict = Union[ - RubricBasedMetricInstance, RubricBasedMetricInstanceDict -] - - -class RubricBasedMetricInput(_common.BaseModel): - """Input for a rubric-based metrics.""" - - metric_spec: Optional[RubricBasedMetricSpec] = Field( - default=None, description="""Specification for the rubric-based metric.""" - ) - instance: Optional[RubricBasedMetricInstance] = Field( - default=None, description="""The instance to be evaluated.""" - ) - - -class RubricBasedMetricInputDict(TypedDict, total=False): - """Input for a rubric-based metrics.""" - - metric_spec: Optional[RubricBasedMetricSpecDict] - """Specification for the rubric-based metric.""" - - instance: Optional[RubricBasedMetricInstanceDict] - """The instance to be evaluated.""" - - -RubricBasedMetricInputOrDict = Union[RubricBasedMetricInput, RubricBasedMetricInputDict] - - -class Metric(_common.BaseModel): - """The metric used 
for evaluation.""" - - name: Optional[str] = Field(default=None, description="""The name of the metric.""") - custom_function: Optional[Callable[..., Any]] = Field( - default=None, - description="""The custom function that defines the end-to-end logic for metric computation.""", - ) - prompt_template: Optional[str] = Field( - default=None, description="""The prompt template for the metric.""" - ) - judge_model_system_instruction: Optional[str] = Field( - default=None, description="""The system instruction for the judge model.""" - ) - return_raw_output: Optional[bool] = Field( - default=None, - description="""Whether to return the raw output from the judge model.""", - ) - parse_and_reduce_fn: Optional[Callable[..., Any]] = Field( - default=None, - description="""The parse and reduce function for the judge model.""", - ) - aggregate_summary_fn: Optional[Callable[..., Any]] = Field( - default=None, - description="""The aggregate summary function for the judge model.""", - ) - remote_custom_function: Optional[str] = Field( - default=None, - description="""The evaluation function for the custom code execution metric. 
This custom code is run remotely in the evaluation service.""", - ) - judge_model: Optional[str] = Field( - default=None, description="""The judge model for the metric.""" - ) - judge_model_generation_config: Optional[genai_types.GenerationConfig] = Field( - default=None, - description="""The generation config for the judge LLM (temperature, top_k, top_p, etc).""", - ) - judge_model_sampling_count: Optional[int] = Field( - default=None, description="""The sampling count for the judge model.""" - ) - rubric_group_name: Optional[str] = Field( - default=None, - description="""The rubric group name for the rubric-based metric.""", - ) - metric_spec_parameters: Optional[dict[str, Any]] = Field( - default=None, - description="""Optional steering instruction parameters for the automated predefined metric.""", - ) - metric_resource_name: Optional[str] = Field( - default=None, - description="""The resource name of the metric definition. Example: projects/{project}/locations/{location}/evaluationMetrics/{evaluation_metric_id}""", - ) - - # Allow extra fields to support metric-specific config fields. - model_config = ConfigDict(extra="allow") - - _is_predefined: bool = PrivateAttr(default=False) - """A boolean indicating whether the metric is predefined.""" - - _config_source: Optional[str] = PrivateAttr(default=None) - """An optional string indicating the source of the metric configuration.""" - - _version: Optional[str] = PrivateAttr(default=None) - """An optional string indicating the version of the metric.""" - - @model_validator(mode="after") - @classmethod - def validate_name(cls, model: "Metric") -> "Metric": - if not model.name: - raise ValueError("Metric name cannot be empty.") - model.name = model.name.lower() - return model - - def to_yaml_file(self, file_path: str, version: Optional[str] = None) -> None: - """Dumps the metric object to a YAML file. - - Args: - file_path: The path to the YAML file. - version: Optional version string to include in the YAML output. 
- - Raises: - ImportError: If the pyyaml library is not installed. - """ - if yaml is None: - raise ImportError( - "YAML serialization requires the pyyaml library. Please install" - " it using 'pip install google-cloud-aiplatform[evaluation]'." - ) - - fields_to_exclude = { - field_name - for field_name, field_info in self.model_fields.items() - if self.__getattribute__(field_name) is not None - and isinstance(self.__getattribute__(field_name), Callable) - } - - data_to_dump = self.model_dump( - exclude_unset=True, - exclude_none=True, - mode="json", - exclude=fields_to_exclude if fields_to_exclude else None, - ) - - if version: - data_to_dump["version"] = version - - with open(file_path, "w", encoding="utf-8") as f: - yaml.dump(data_to_dump, f, sort_keys=False, allow_unicode=True) - - -class LLMMetric(Metric): - """A metric that uses LLM-as-a-judge for evaluation.""" - - rubric_group_name: Optional[str] = Field( - default=None, - description="""Optional. The name of the column in the EvaluationDataset containing the list of rubrics to use for this metric.""", - ) - - @field_validator("prompt_template", mode="before") - @classmethod - def validate_prompt_template(cls, value: Union[str, "MetricPromptBuilder"]) -> str: - """Validates prompt template to be a non-empty string.""" - if value is None: - raise ValueError("Prompt template cannot be empty.") - if isinstance(value, MetricPromptBuilder): - value = str(value) - if not value.strip(): - raise ValueError("Prompt template cannot be an empty string.") - return value - - @field_validator("judge_model_sampling_count") - @classmethod - def validate_judge_model_sampling_count(cls, value: Optional[int]) -> Optional[int]: - """Validates judge_model_sampling_count to be between 1 and 32.""" - if value is not None and (value < 1 or value > 32): - raise ValueError("judge_model_sampling_count must be between 1 and 32.") - return value +RubricBasedMetricSpecOrDict = Union[RubricBasedMetricSpec, RubricBasedMetricSpecDict] - 
@classmethod - def load(cls, config_path: str, client: Optional[Any] = None) -> "LLMMetric": - """Loads a metric configuration from a YAML or JSON file. - This method allows for the creation of an LLMMetric instance from a - local file path or a Google Cloud Storage (GCS) URI. It will automatically - detect the file type (.yaml, .yml, or .json) and parse it accordingly. +class RubricEnhancedContents(_common.BaseModel): + """Rubric-enhanced contents for evaluation.""" - Args: - config_path: The local path or GCS URI (e.g., 'gs://bucket/metric.yaml') - to the metric configuration file. - client: Optional. The Vertex AI client instance to use for authentication. - If not provided, Application Default Credentials (ADC) will be used. + prompt: Optional[list[genai_types.Content]] = Field( + default=None, + description="""User prompt, using the standard Content type from the Gen AI SDK.""", + ) + rubric_groups: Optional[dict[str, "RubricGroup"]] = Field( + default=None, + description="""Named groups of rubrics associated with this prompt. + The key is a user-defined name for the rubric group.""", + ) + response: Optional[list[genai_types.Content]] = Field( + default=None, + description="""Response, using the standard Content type from the Gen AI SDK.""", + ) + other_content: Optional[ContentMap] = Field( + default=None, + description="""Other contents needed for the metric. + For example, if `reference` is needed for the metric, it can be provided + here.""", + ) - Returns: - An instance of LLMMetric configured with the loaded data. - Raises: - ValueError: If the file path is invalid or the file content cannot be parsed. - ImportError: If a required library like 'PyYAML' or 'google-cloud-storage' is not installed. - IOError: If the file cannot be read from the specified path. - """ - file_extension = os.path.splitext(config_path)[1].lower() - if file_extension not in [".yaml", ".yml", ".json"]: - raise ValueError( - "Unsupported file extension for metric config. 
Must be .yaml, .yml, or .json" - ) +class RubricEnhancedContentsDict(TypedDict, total=False): + """Rubric-enhanced contents for evaluation.""" - content_str: str - if config_path.startswith("gs://"): - try: - from google.cloud import storage # type: ignore[attr-defined] + prompt: Optional[list[genai_types.ContentDict]] + """User prompt, using the standard Content type from the Gen AI SDK.""" - storage_client = storage.Client( - credentials=client._api_client._credentials if client else None - ) - path_without_prefix = config_path[len("gs://") :] - bucket_name, blob_path = path_without_prefix.split("/", 1) + rubric_groups: Optional[dict[str, "RubricGroup"]] + """Named groups of rubrics associated with this prompt. + The key is a user-defined name for the rubric group.""" - bucket = storage_client.bucket(bucket_name) - blob = bucket.blob(blob_path) - content_str = blob.download_as_bytes().decode("utf-8") - except ImportError as e: - raise ImportError( - "Reading from GCS requires the 'google-cloud-storage' library. Please install it with 'pip install google-cloud-aiplatform[evaluation]'." - ) from e - except Exception as e: - raise IOError(f"Failed to read from GCS path {config_path}: {e}") from e - else: - try: - with open(config_path, "r", encoding="utf-8") as f: - content_str = f.read() - except FileNotFoundError: - raise FileNotFoundError( - f"Local configuration file not found at: {config_path}" - ) - except Exception as e: - raise IOError(f"Failed to read local file {config_path}: {e}") from e + response: Optional[list[genai_types.ContentDict]] + """Response, using the standard Content type from the Gen AI SDK.""" - data: Dict[str, Any] + other_content: Optional[ContentMapDict] + """Other contents needed for the metric. + For example, if `reference` is needed for the metric, it can be provided + here.""" - if file_extension in [".yaml", ".yml"]: - if yaml is None: - raise ImportError( - "YAML parsing requires the pyyaml library. 
Please install it with 'pip install google-cloud-aiplatform[evaluation]'." - ) - data = yaml.safe_load(content_str) - elif file_extension == ".json": - data = json.loads(content_str) - if not isinstance(data, dict): - raise ValueError("Metric config content did not parse into a dictionary.") +RubricEnhancedContentsOrDict = Union[RubricEnhancedContents, RubricEnhancedContentsDict] - return cls.model_validate(data) +class RubricBasedMetricInstance(_common.BaseModel): + """Defines an instance for Rubric-based metrics. -class MetricDict(TypedDict, total=False): - """The metric used for evaluation.""" + This class allows various input formats. + """ - name: Optional[str] - """The name of the metric.""" + json_instance: Optional[str] = Field( + default=None, + description="""Specify evaluation fields and their string values in JSON format.""", + ) + content_map_instance: Optional[ContentMap] = Field( + default=None, + description="""Specify evaluation fields and their content values using a ContentMap.""", + ) + rubric_enhanced_contents: Optional[RubricEnhancedContents] = Field( + default=None, + description="""Provide input as Gemini Content along with one or more + associated rubric groups.""", + ) - custom_function: Optional[Callable[..., Any]] - """The custom function that defines the end-to-end logic for metric computation.""" - prompt_template: Optional[str] - """The prompt template for the metric.""" +class RubricBasedMetricInstanceDict(TypedDict, total=False): + """Defines an instance for Rubric-based metrics. - judge_model_system_instruction: Optional[str] - """The system instruction for the judge model.""" + This class allows various input formats. 
+ """ - return_raw_output: Optional[bool] - """Whether to return the raw output from the judge model.""" + json_instance: Optional[str] + """Specify evaluation fields and their string values in JSON format.""" - parse_and_reduce_fn: Optional[Callable[..., Any]] - """The parse and reduce function for the judge model.""" + content_map_instance: Optional[ContentMapDict] + """Specify evaluation fields and their content values using a ContentMap.""" - aggregate_summary_fn: Optional[Callable[..., Any]] - """The aggregate summary function for the judge model.""" + rubric_enhanced_contents: Optional[RubricEnhancedContentsDict] + """Provide input as Gemini Content along with one or more + associated rubric groups.""" - remote_custom_function: Optional[str] - """The evaluation function for the custom code execution metric. This custom code is run remotely in the evaluation service.""" - judge_model: Optional[str] - """The judge model for the metric.""" +RubricBasedMetricInstanceOrDict = Union[ + RubricBasedMetricInstance, RubricBasedMetricInstanceDict +] - judge_model_generation_config: Optional[genai_types.GenerationConfigDict] - """The generation config for the judge LLM (temperature, top_k, top_p, etc).""" - judge_model_sampling_count: Optional[int] - """The sampling count for the judge model.""" +class RubricBasedMetricInput(_common.BaseModel): + """Input for a rubric-based metrics.""" - rubric_group_name: Optional[str] - """The rubric group name for the rubric-based metric.""" + metric_spec: Optional[RubricBasedMetricSpec] = Field( + default=None, description="""Specification for the rubric-based metric.""" + ) + instance: Optional[RubricBasedMetricInstance] = Field( + default=None, description="""The instance to be evaluated.""" + ) - metric_spec_parameters: Optional[dict[str, Any]] - """Optional steering instruction parameters for the automated predefined metric.""" - metric_resource_name: Optional[str] - """The resource name of the metric definition. 
Example: projects/{project}/locations/{location}/evaluationMetrics/{evaluation_metric_id}""" +class RubricBasedMetricInputDict(TypedDict, total=False): + """Input for a rubric-based metrics.""" + + metric_spec: Optional[RubricBasedMetricSpecDict] + """Specification for the rubric-based metric.""" + instance: Optional[RubricBasedMetricInstanceDict] + """The instance to be evaluated.""" -MetricOrDict = Union[Metric, MetricDict] + +RubricBasedMetricInputOrDict = Union[RubricBasedMetricInput, RubricBasedMetricInputDict] class MetricSource(_common.BaseModel): @@ -5553,6 +5659,50 @@ class GenerateUserScenariosResponseDict(TypedDict, total=False): ] +class GetEvaluationMetricConfig(_common.BaseModel): + """Config for getting an evaluation metric.""" + + http_options: Optional[genai_types.HttpOptions] = Field( + default=None, description="""Used to override HTTP request options.""" + ) + + +class GetEvaluationMetricConfigDict(TypedDict, total=False): + """Config for getting an evaluation metric.""" + + http_options: Optional[genai_types.HttpOptionsDict] + """Used to override HTTP request options.""" + + +GetEvaluationMetricConfigOrDict = Union[ + GetEvaluationMetricConfig, GetEvaluationMetricConfigDict +] + + +class _GetEvaluationMetricParameters(_common.BaseModel): + """Parameters for getting an evaluation metric.""" + + metric_resource_name: Optional[str] = Field(default=None, description="""""") + config: Optional[GetEvaluationMetricConfig] = Field( + default=None, description="""""" + ) + + +class _GetEvaluationMetricParametersDict(TypedDict, total=False): + """Parameters for getting an evaluation metric.""" + + metric_resource_name: Optional[str] + """""" + + config: Optional[GetEvaluationMetricConfigDict] + """""" + + +_GetEvaluationMetricParametersOrDict = Union[ + _GetEvaluationMetricParameters, _GetEvaluationMetricParametersDict +] + + class GetEvaluationRunConfig(_common.BaseModel): """Config for get evaluation run.""" @@ -5675,6 +5825,79 @@ class 
_GetEvaluationItemParametersDict(TypedDict, total=False): ] +class ListEvaluationMetricsConfig(_common.BaseModel): + """Config for listing evaluation metrics.""" + + http_options: Optional[genai_types.HttpOptions] = Field( + default=None, description="""Used to override HTTP request options.""" + ) + + +class ListEvaluationMetricsConfigDict(TypedDict, total=False): + """Config for listing evaluation metrics.""" + + http_options: Optional[genai_types.HttpOptionsDict] + """Used to override HTTP request options.""" + + +ListEvaluationMetricsConfigOrDict = Union[ + ListEvaluationMetricsConfig, ListEvaluationMetricsConfigDict +] + + +class _ListEvaluationMetricsParameters(_common.BaseModel): + """Parameters for listing evaluation metrics.""" + + config: Optional[ListEvaluationMetricsConfig] = Field( + default=None, description="""""" + ) + + +class _ListEvaluationMetricsParametersDict(TypedDict, total=False): + """Parameters for listing evaluation metrics.""" + + config: Optional[ListEvaluationMetricsConfigDict] + """""" + + +_ListEvaluationMetricsParametersOrDict = Union[ + _ListEvaluationMetricsParameters, _ListEvaluationMetricsParametersDict +] + + +class ListEvaluationMetricsResponse(_common.BaseModel): + """Response for listing evaluation metrics.""" + + sdk_http_response: Optional[genai_types.HttpResponse] = Field( + default=None, description="""Used to retain the full HTTP response.""" + ) + next_page_token: Optional[str] = Field(default=None, description="""""") + evaluation_metrics: Optional[list[EvaluationMetric]] = Field( + default=None, + description="""List of evaluation metrics. + """, + ) + + +class ListEvaluationMetricsResponseDict(TypedDict, total=False): + """Response for listing evaluation metrics.""" + + sdk_http_response: Optional[genai_types.HttpResponseDict] + """Used to retain the full HTTP response.""" + + next_page_token: Optional[str] + """""" + + evaluation_metrics: Optional[list[EvaluationMetricDict]] + """List of evaluation metrics. 
+ """ + + +ListEvaluationMetricsResponseOrDict = Union[ + ListEvaluationMetricsResponse, ListEvaluationMetricsResponseDict +] + + class OptimizeConfig(_common.BaseModel): """Config for Prompt Optimizer."""