From 99af46187b96244f105a9c356a35947afe7e1615 Mon Sep 17 00:00:00 2001
From: Jason Dai
Date: Tue, 17 Mar 2026 16:58:04 -0700
Subject: [PATCH] chore: GenAI Client(evals) - fix single turn metrics on
 Agent Scraping CUJ

PiperOrigin-RevId: 885280870
---
 tests/unit/vertexai/genai/test_evals.py   | 157 +++++++++++++-
 vertexai/_genai/_evals_common.py          | 212 +++++++++++-------
 vertexai/_genai/_evals_metric_handlers.py | 250 +++++++++++-----------
 3 files changed, 417 insertions(+), 202 deletions(-)

diff --git a/tests/unit/vertexai/genai/test_evals.py b/tests/unit/vertexai/genai/test_evals.py
index 2b534d4740..705960c0b0 100644
--- a/tests/unit/vertexai/genai/test_evals.py
+++ b/tests/unit/vertexai/genai/test_evals.py
@@ -1313,6 +1313,31 @@ def test_run_inference_with_agent_engine_and_session_inputs_dict(
                     ]
                 ],
                 "response": ["agent response"],
+                "agent_data": [
+                    {
+                        "agents": None,
+                        "turns": [
+                            {
+                                "events": [
+                                    {
+                                        "author": "model",
+                                        "content": {
+                                            "parts": [{"text": "intermediate1"}]
+                                        },
+                                    },
+                                    {
+                                        "author": "model",
+                                        "content": {
+                                            "parts": [{"text": "agent response"}]
+                                        },
+                                    },
+                                ],
+                                "turn_id": "turn_0",
+                                "turn_index": 0,
+                            }
+                        ],
+                    }
+                ],
             }
         ),
     )
@@ -1392,6 +1417,31 @@ def test_run_inference_with_agent_engine_and_session_inputs_literal_string(
                     ]
                 ],
                 "response": ["agent response"],
+                "agent_data": [
+                    {
+                        "agents": None,
+                        "turns": [
+                            {
+                                "events": [
+                                    {
+                                        "author": "model",
+                                        "content": {
+                                            "parts": [{"text": "intermediate1"}]
+                                        },
+                                    },
+                                    {
+                                        "author": "model",
+                                        "content": {
+                                            "parts": [{"text": "agent response"}]
+                                        },
+                                    },
+                                ],
+                                "turn_id": "turn_0",
+                                "turn_index": 0,
+                            }
+                        ],
+                    }
+                ],
             }
         ),
     )
@@ -1571,6 +1621,72 @@ def run_async_side_effect(*args, **kwargs):
                 ],
             ],
             "response": ["agent response", "agent response 2"],
+            "agent_data": [
+                {
+                    "agents": {
+                        "mock_agent": {
+                            "agent_id": "mock_agent",
+                            "agent_resource_name": None,
+                            "agent_type": "Mock",
+                            "instruction": "mock instruction",
+                            "description": "mock description",
+                            "tools": [],
+                        }
+                    },
+                    "turns": [
+                        {
+                            "events": [
+                                {
+                                    "author": "model",
+                                    "content": {
+                                        "parts": [{"text": "intermediate1"}]
+                                    },
+                                },
+                                {
+                                    "author": "model",
+                                    "content": {
+                                        "parts": [{"text": "agent response"}]
+                                    },
+                                },
+                            ],
+                            "turn_id": "turn_0",
+                            "turn_index": 0,
+                        }
+                    ],
+                },
+                {
+                    "agents": {
+                        "mock_agent": {
+                            "agent_id": "mock_agent",
+                            "agent_resource_name": None,
+                            "agent_type": "Mock",
+                            "instruction": "mock instruction",
+                            "description": "mock description",
+                            "tools": [],
+                        }
+                    },
+                    "turns": [
+                        {
+                            "events": [
+                                {
+                                    "author": "model",
+                                    "content": {
+                                        "parts": [{"text": "intermediate2"}]
+                                    },
+                                },
+                                {
+                                    "author": "model",
+                                    "content": {
+                                        "parts": [{"text": "agent response 2"}]
+                                    },
+                                },
+                            ],
+                            "turn_id": "turn_0",
+                            "turn_index": 0,
+                        }
+                    ],
+                },
+            ],
         }
     )
     pd.testing.assert_frame_equal(
@@ -1952,6 +2068,31 @@ def test_run_agent_internal_success(self, mock_run_agent):
                     ]
                 ],
                 "response": ["final response"],
+                "agent_data": [
+                    {
+                        "agents": None,
+                        "turns": [
+                            {
+                                "events": [
+                                    {
+                                        "author": "model",
+                                        "content": {
+                                            "parts": [{"text": "intermediate1"}]
+                                        },
+                                    },
+                                    {
+                                        "author": "model",
+                                        "content": {
+                                            "parts": [{"text": "final response"}]
+                                        },
+                                    },
+                                ],
+                                "turn_id": "turn_0",
+                                "turn_index": 0,
+                            }
+                        ],
+                    }
+                ],
             }
         )
         pd.testing.assert_frame_equal(result_df, expected_df)
@@ -2144,24 +2285,24 @@ def test_run_agent_internal_malformed_event(self, mock_run_agent):
         assert not result_df["intermediate_events"][0]
 
 
-class TestIsMultiTurnAgentRun:
-    """Unit tests for the _is_multi_turn_agent_run function."""
+class TestIsMultiTurnAgentSimulation:
+    """Unit tests for the _is_multi_turn_agent_simulation function."""
 
-    def test_is_multi_turn_agent_run_with_config(self):
+    def test_is_multi_turn_agent_simulation_with_config(self):
         config = vertexai_genai_types.evals.UserSimulatorConfig(model_name="gemini-pro")
-        assert _evals_common._is_multi_turn_agent_run(
+        assert _evals_common._is_multi_turn_agent_simulation(
            user_simulator_config=config, prompt_dataset=pd.DataFrame()
         )
 
-    def test_is_multi_turn_agent_run_with_conversation_plan(self):
+    def test_is_multi_turn_agent_simulation_with_conversation_plan(self):
         prompt_dataset = pd.DataFrame({"conversation_plan": ["plan"]})
-        assert _evals_common._is_multi_turn_agent_run(
+        assert _evals_common._is_multi_turn_agent_simulation(
             user_simulator_config=None, prompt_dataset=prompt_dataset
         )
 
-    def test_is_multi_turn_agent_run_false(self):
+    def test_is_multi_turn_agent_simulation_false(self):
         prompt_dataset = pd.DataFrame({"prompt": ["prompt"]})
-        assert not _evals_common._is_multi_turn_agent_run(
+        assert not _evals_common._is_multi_turn_agent_simulation(
             user_simulator_config=None, prompt_dataset=prompt_dataset
         )
diff --git a/vertexai/_genai/_evals_common.py b/vertexai/_genai/_evals_common.py
index 9fe93230eb..0b02354018 100644
--- a/vertexai/_genai/_evals_common.py
+++ b/vertexai/_genai/_evals_common.py
@@ -1604,93 +1604,111 @@ def _get_session_inputs(row: pd.Series) -> types.evals.SessionInput:
     )
 
 
-def _is_multi_turn_agent_run(
+def _is_multi_turn_agent_simulation(
     user_simulator_config: Optional[types.evals.UserSimulatorConfig] = None,
     prompt_dataset: pd.DataFrame = None,
 ) -> bool:
-    """Checks if the agent run is multi-turn."""
+    """Checks if the agent run is a multi-turn user simulation."""
     return (
         user_simulator_config is not None
         or "conversation_plan" in prompt_dataset.columns
     )
 
 
-def _run_agent_internal(
-    api_client: BaseApiClient,
-    agent_engine: Optional[Union[str, types.AgentEngine]],
-    agent: Optional[LlmAgent],
-    prompt_dataset: pd.DataFrame,
-    user_simulator_config: Optional[types.evals.UserSimulatorConfig] = None,
-) -> pd.DataFrame:
-    """Runs an agent."""
-    raw_responses = _run_agent(
-        api_client=api_client,
-        agent_engine=agent_engine,
-        agent=agent,
-        prompt_dataset=prompt_dataset,
-        user_simulator_config=user_simulator_config,
-    )
-    processed_intermediate_events = []
-    processed_responses = []
-    processed_agent_data = []
-    agent_data_agents = None
-    if agent:
-        agent_data_agents = types.evals.AgentData._get_agents_map(agent)
+def _process_multi_turn_agent_response(
+    resp_item: Any,
+    agent_data_agents: Optional[dict[str, Any]],
+) -> Optional[Union[str, dict[str, Any]]]:
+    """Processes a multi-turn agent response."""
+    if isinstance(resp_item, dict) and "error" in resp_item:
+        return json.dumps(resp_item)
+    return types.evals.AgentData(
+        turns=resp_item,
+        agents=agent_data_agents,
+    ).model_dump(exclude_unset=True)
+
+
+def _process_single_turn_agent_response(
+    resp_item: Any,
+    agent_data_agents: Optional[dict[str, Any]],
+) -> tuple[
+    Optional[Union[str, dict[str, Any]]],
+    list[dict[str, Any]],
+    Optional[Union[str, dict[str, Any]]],
+]:
+    """Processes a single-turn agent response."""
+    intermediate_events_row: list[dict[str, Any]] = []
+    response_row: Optional[Union[str, dict[str, Any]]] = None
+    agent_data_row: Optional[Union[str, dict[str, Any]]] = None
 
-    for resp_item in raw_responses:
-        intermediate_events_row: list[dict[str, Any]] = []
-        response_row: Optional[Union[str, dict[str, Any]]] = None
-        agent_data_row: Optional[Union[str, dict[str, Any]]] = None
+    if isinstance(resp_item, list):
+        try:
+            response_row = resp_item[-1]["content"]["parts"][0]["text"]
+            for intermediate_event in resp_item[:-1]:
+                intermediate_events_row.append(
+                    {
+                        "event_id": intermediate_event.get("id"),
+                        "content": intermediate_event.get("content"),
+                        "creation_timestamp": intermediate_event.get("timestamp"),
+                        "author": intermediate_event.get("author"),
+                    }
+                )
+            # Construct AgentData natively for single-turn runs
+            agent_events = []
+            for event_dict in resp_item:
+                content_dict = event_dict.get("content")
+                content_obj = None
+                if content_dict:
+                    content_obj = genai_types.Content.model_validate(content_dict)
+
+                agent_events.append(
+                    types.evals.AgentEvent(
+                        author=event_dict.get("author", "model"),
+                        content=content_obj,
+                    )
+                )
 
-        if _is_multi_turn_agent_run(user_simulator_config, prompt_dataset):
-            if isinstance(resp_item, dict) and "error" in resp_item:
-                agent_data_row = json.dumps(resp_item)
-            else:
-                # TODO: Migrate single turn agent run result to AgentData.
-                agent_data_row = types.evals.AgentData(
-                    turns=resp_item,
-                    agents=agent_data_agents,
-                ).model_dump()
+            turn = types.evals.ConversationTurn(
+                turn_index=0,
+                turn_id="turn_0",
+                events=agent_events,
+            )
+            agent_data_row = types.evals.AgentData(
+                turns=[turn],
+                agents=agent_data_agents,
+            ).model_dump(exclude_unset=True)
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            error_payload = {
+                "error": (
+                    f"Failed to parse agent run response {str(resp_item)} to "
+                    f"agent data: {e}"
+                ),
+            }
+            response_row = json.dumps(error_payload)
+            agent_data_row = json.dumps(error_payload)
+    elif isinstance(resp_item, dict) and "error" in resp_item:
+        response_row = json.dumps(resp_item)
+    else:
+        error_payload = {
+            "error": "Unexpected response type from agent run",
+            "response_type": str(type(resp_item)),
+            "details": str(resp_item),
+        }
+        response_row = json.dumps(error_payload)
 
-        else:
-            if isinstance(resp_item, list):
-                try:
-                    response_row = resp_item[-1]["content"]["parts"][0]["text"]
-                    for intermediate_event in resp_item[:-1]:
-                        intermediate_events_row.append(
-                            {
-                                "event_id": intermediate_event.get("id"),
-                                "content": intermediate_event.get("content"),
-                                "creation_timestamp": intermediate_event.get(
-                                    "timestamp"
-                                ),
-                                "author": intermediate_event.get("author"),
-                            }
-                        )
-                except Exception as e:  # pylint: disable=broad-exception-caught
-                    error_payload = {
-                        "error": (
-                            f"Failed to parse agent run response {str(resp_item)} to "
-                            f"agent data: {e}"
-                        ),
-                    }
-                    response_row = json.dumps(error_payload)
-            elif isinstance(resp_item, dict) and "error" in resp_item:
-                response_row = json.dumps(resp_item)
-            else:
-                error_payload = {
-                    "error": "Unexpected response type from agent run",
-                    "response_type": str(type(resp_item)),
-                    "details": str(resp_item),
-                }
-                response_row = json.dumps(error_payload)
+    return response_row, intermediate_events_row, agent_data_row
 
-        processed_intermediate_events.append(intermediate_events_row)
-        processed_responses.append(response_row)
-        processed_agent_data.append(agent_data_row)
 
+def _create_agent_results_dataframe(
+    prompt_dataset: pd.DataFrame,
+    processed_responses: list[Any],
+    processed_intermediate_events: list[Any],
+    processed_agent_data: list[Any],
+    is_user_simulation: bool,
+) -> pd.DataFrame:
+    """Creates a DataFrame from the processed agent responses."""
     df_dict: dict[str, Any] = {}
-    if _is_multi_turn_agent_run(user_simulator_config, prompt_dataset):
+    if is_user_simulation:
         df_dict[AGENT_DATA] = processed_agent_data
         if len(processed_agent_data) != len(prompt_dataset):
             raise RuntimeError(
@@ -1705,6 +1723,7 @@ def _run_agent_internal(
     else:
         df_dict[_evals_constant.INTERMEDIATE_EVENTS] = processed_intermediate_events
         df_dict[_evals_constant.RESPONSE] = processed_responses
+        df_dict[AGENT_DATA] = processed_agent_data
         if len(processed_responses) != len(prompt_dataset) or len(
             processed_responses
         ) != len(processed_intermediate_events):
@@ -1730,6 +1749,55 @@ def _run_agent_internal(
     return results_df
 
 
+def _run_agent_internal(
+    api_client: BaseApiClient,
+    agent_engine: Optional[Union[str, types.AgentEngine]],
+    agent: Optional[LlmAgent],
+    prompt_dataset: pd.DataFrame,
+    user_simulator_config: Optional[types.evals.UserSimulatorConfig] = None,
+) -> pd.DataFrame:
+    """Runs an agent."""
+    raw_responses = _run_agent(
+        api_client=api_client,
+        agent_engine=agent_engine,
+        agent=agent,
+        prompt_dataset=prompt_dataset,
+        user_simulator_config=user_simulator_config,
+    )
+    processed_intermediate_events = []
+    processed_responses = []
+    processed_agent_data = []
+    agent_data_agents = None
+    if agent:
+        agent_data_agents = types.evals.AgentData._get_agents_map(agent)
+
+    is_user_simulation = _is_multi_turn_agent_simulation(
+        user_simulator_config, prompt_dataset
+    )
+
+    for resp_item in raw_responses:
+        if is_user_simulation:
+            agent_data_row = _process_multi_turn_agent_response(
+                resp_item, agent_data_agents
+            )
+            processed_agent_data.append(agent_data_row)
+        else:
+            response_row, intermediate_events_row, agent_data_row = (
+                _process_single_turn_agent_response(resp_item, agent_data_agents)
+            )
+            processed_responses.append(response_row)
+            processed_intermediate_events.append(intermediate_events_row)
+            processed_agent_data.append(agent_data_row)
+
+    return _create_agent_results_dataframe(
+        prompt_dataset,
+        processed_responses,
+        processed_intermediate_events,
+        processed_agent_data,
+        is_user_simulation,
+    )
+
+
 def _run_agent(
     api_client: BaseApiClient,
     agent_engine: Optional[Union[str, types.AgentEngine]],
diff --git a/vertexai/_genai/_evals_metric_handlers.py b/vertexai/_genai/_evals_metric_handlers.py
index 47e543fa10..10e07fe07a 100644
--- a/vertexai/_genai/_evals_metric_handlers.py
+++ b/vertexai/_genai/_evals_metric_handlers.py
@@ -93,6 +93,72 @@ def _extract_text_from_content(
     return text_accumulator if any_text_part_found else None
 
 
+def _get_prompt_from_eval_case(
+    eval_case: types.EvalCase,
+) -> Optional[genai_types.Content]:
+    """Extracts prompt content from eval_case.prompt or starting_prompt."""
+    if eval_case.prompt:
+        return eval_case.prompt
+
+    user_scenario = getattr(eval_case, "user_scenario", None)
+    if user_scenario and user_scenario.starting_prompt:
+        return genai_types.Content(
+            parts=[genai_types.Part(text=user_scenario.starting_prompt)]
+        )
+
+    return None
+
+
+def _get_response_from_eval_case(
+    eval_case: types.EvalCase, response_index: int, metric_name: str
+) -> Optional[genai_types.Content]:
+    """Extracts response content from eval_case.responses."""
+    response_content = None
+    if eval_case.responses and response_index < len(eval_case.responses):
+        response_content = eval_case.responses[response_index].response
+
+    return response_content
+
+
+def _value_to_content_list(value: Any) -> list[genai_types.Content]:
+    """Converts a value to a list of Content objects."""
+    if isinstance(value, genai_types.Content):
+        return [value]
+    if isinstance(value, types.ResponseCandidate):
+        return [value.response] if value.response else []
+    if isinstance(value, list) and value:
+        if isinstance(value[0], genai_types.Content):
+            return value
+        if isinstance(value[0], types.evals.Message):
+            history_texts = []
+            for msg_obj in value:
+                msg_text = _extract_text_from_content(msg_obj.content)
+                if msg_text:
+                    role = msg_obj.content.role or msg_obj.author or "user"
+                    history_texts.append(f"{role}: {msg_text}")
+            return [
+                genai_types.Content(
+                    parts=[genai_types.Part(text="\n".join(history_texts))]
+                )
+            ]
+        return [genai_types.Content(parts=[genai_types.Part(text=json.dumps(value))])]
+    if isinstance(value, dict):
+        return [genai_types.Content(parts=[genai_types.Part(text=json.dumps(value))])]
+    return [genai_types.Content(parts=[genai_types.Part(text=str(value))])]
+
+
+def _get_autorater_config(metric: types.Metric) -> dict[str, Any]:
+    """Extracts autorater config settings from a metric."""
+    autorater_config: dict[str, Any] = {}
+    if metric.judge_model:
+        autorater_config["autorater_model"] = metric.judge_model
+    if metric.judge_model_generation_config:
+        autorater_config["generation_config"] = metric.judge_model_generation_config
+    if metric.judge_model_sampling_count:
+        autorater_config["sampling_count"] = metric.judge_model_sampling_count
+    return autorater_config
+
+
 def _default_aggregate_scores(
     metric_name: str,
     eval_case_metric_results: list[types.EvalCaseMetricResult],
@@ -213,17 +279,13 @@ def _build_request_payload(
     ) -> dict[str, Any]:
         """Builds the request parameters for evaluate instances."""
         request_payload = {}
-        if response_index >= len(eval_case.responses):
-            raise IndexError(
-                f"response_index {response_index} out of bounds for eval_case with"
-                f" {len(eval_case.responses)} responses."
-            )
-        if eval_case.responses is None:
-            raise ValueError(
-                f"No responses found for eval_case with ID {eval_case.eval_case_id}."
-            )
-        current_response_candidate = eval_case.responses[response_index]
-        if _extract_text_from_content(current_response_candidate.response) is None:
+
+        response_content = _get_response_from_eval_case(
+            eval_case, response_index, self.metric.name
+        )
+        prediction_text = _extract_text_from_content(response_content)
+
+        if prediction_text is None:
             raise ValueError(
                 f"Response text missing for candidate {response_index} in eval_case"
                 f" {eval_case.eval_case_id or 'Unknown ID'}."
@@ -248,9 +310,7 @@
             },
             "instances": [
                 {
-                    "prediction": _extract_text_from_content(
-                        current_response_candidate.response
-                    ),
+                    "prediction": prediction_text,
                     "reference": _extract_text_from_content(
                         eval_case.reference.response
                     ),
@@ -262,9 +322,7 @@
             "metric_spec": {},
             "instances": [
                 {
-                    "prediction": _extract_text_from_content(
-                        current_response_candidate.response
-                    ),
+                    "prediction": prediction_text,
                     "reference": _extract_text_from_content(
                         eval_case.reference.response
                     ),
@@ -350,18 +408,13 @@
 
         if hasattr(self.metric, "target_language"):
             target_language = self.metric.target_language
-        if response_index >= len(eval_case.responses):
-            raise IndexError(
-                f"response_index {response_index} out of bounds for eval_case with"
-                f" {len(eval_case.responses)} responses."
-            )
+        response_content = _get_response_from_eval_case(
+            eval_case, response_index, self.metric.name
+        )
+        prediction_text = _extract_text_from_content(response_content)
+        prompt_text = _extract_text_from_content(_get_prompt_from_eval_case(eval_case))
 
-        if eval_case.responses is None:
-            raise ValueError(
-                f"No responses found for eval_case with ID {eval_case.eval_case_id}."
-            )
-        current_response_candidate = eval_case.responses[response_index]
-        if _extract_text_from_content(current_response_candidate.response) is None:
+        if prediction_text is None:
             raise ValueError(
                 f"Response text missing for candidate {response_index} in eval_case"
                 f" {eval_case.eval_case_id or 'Unknown ID'}."
@@ -375,7 +428,7 @@
                 "Reference text missing for eval_case"
                 f" {eval_case.eval_case_id or 'Unknown ID'}."
             )
-        if _extract_text_from_content(eval_case.prompt) is None:
+        if prompt_text is None:
             raise ValueError(
                 "Prompt text (source for translation) missing for eval_case"
                 f" {eval_case.eval_case_id or 'Unknown ID'}."
@@ -388,11 +441,9 @@
                 "target_language": target_language,
             },
             "instance": {
-                "prediction": _extract_text_from_content(
-                    current_response_candidate.response
-                ),
+                "prediction": prediction_text,
                 "reference": _extract_text_from_content(eval_case.reference.response),
-                "source": _extract_text_from_content(eval_case.prompt),
+                "source": prompt_text,
             },
         }
         return request_payload
@@ -528,10 +579,11 @@ def _build_rubric_based_input(
             rubrics_list = []
 
         parsed_rubrics = [types.evals.Rubric(**r) for r in rubrics_list]
+        extracted_prompt = _get_prompt_from_eval_case(eval_case)
         rubric_enhanced_contents = {
             "prompt": (
-                [eval_case.prompt.model_dump(mode="json", exclude_none=True)]
-                if eval_case.prompt
+                [extracted_prompt.model_dump(mode="json", exclude_none=True)]
+                if extracted_prompt
                 else None
             ),
             "response": [response_content.model_dump(mode="json", exclude_none=True)],
@@ -561,8 +613,9 @@ def _build_pointwise_input(
         self, eval_case: types.EvalCase, response_content: genai_types.Content
     ) -> dict[str, Any]:
         """Builds the payload for a standard pointwise LLM metric."""
+        extracted_prompt = _get_prompt_from_eval_case(eval_case)
         instance_data = {
-            "prompt": eval_case.prompt,
+            "prompt": extracted_prompt,
             "response": response_content,
         }
         template_obj = types.PromptTemplate(text=self.metric.prompt_template)
@@ -573,46 +626,8 @@ def _build_pointwise_input(
 
         content_map_values = {}
         for key, value in instance_data.items():
-            content_list_to_serialize = []
-            if isinstance(value, genai_types.Content):
-                content_list_to_serialize = [value]
-            elif isinstance(value, types.ResponseCandidate):
-                if value.response:  # pytype: disable=attribute-error
-                    content_list_to_serialize = [value.response]
-            elif isinstance(value, list) and value:
-                if isinstance(value[0], genai_types.Content):
-                    content_list_to_serialize = value
-                elif isinstance(value[0], types.evals.Message):
-                    history_texts = []
-                    for msg_obj in value:
-                        msg_text = _extract_text_from_content(msg_obj.content)
-                        if msg_text:
-                            role = msg_obj.content.role or msg_obj.author or "user"
-                            history_texts.append(f"{role}: {msg_text}")
-                    content_list_to_serialize = [
-                        genai_types.Content(
-                            parts=[genai_types.Part(text="\n".join(history_texts))]
-                        )
-                    ]
-                else:
-                    content_list_to_serialize = [
-                        genai_types.Content(
-                            parts=[genai_types.Part(text=json.dumps(value))]
-                        )
-                    ]
-            elif isinstance(value, dict):
-                content_list_to_serialize = [
-                    genai_types.Content(
-                        parts=[genai_types.Part(text=json.dumps(value))]
-                    )
-                ]
-            else:
-                content_list_to_serialize = [
-                    genai_types.Content(parts=[genai_types.Part(text=str(value))])
-                ]
-
             content_map_values[key] = types.ContentMapContents(
-                contents=content_list_to_serialize
+                contents=_value_to_content_list(value)
             )
 
         instance_payload = types.PointwiseMetricInstance(
@@ -638,15 +653,7 @@
 
     def _add_autorater_config(self, payload: dict[str, Any]) -> None:
         """Adds autorater config to the request payload if specified."""
-        autorater_config: dict[str, Any] = {}
-        if self.metric.judge_model:
-            autorater_config["autorater_model"] = self.metric.judge_model
-        if self.metric.judge_model_generation_config:
-            autorater_config["generation_config"] = (
-                self.metric.judge_model_generation_config
-            )
-        if self.metric.judge_model_sampling_count:
-            autorater_config["sampling_count"] = self.metric.judge_model_sampling_count
+        autorater_config = _get_autorater_config(self.metric)
 
         if not autorater_config:
             return
@@ -663,10 +670,10 @@ def _build_request_payload(
         self, eval_case: types.EvalCase, response_index: int
     ) -> dict[str, Any]:
         """Builds the request parameters for evaluate instances request."""
-        if not eval_case.responses or response_index >= len(eval_case.responses):
-            raise IndexError(f"response_index {response_index} is out of bounds.")
+        response_content = _get_response_from_eval_case(
+            eval_case, response_index, self.metric.name
+        )
 
-        response_content = eval_case.responses[response_index].response
         if not response_content:
             raise ValueError(
                 f"Response content missing for candidate {response_index}."
@@ -804,26 +811,36 @@ def get_metric_result(
             eval_case.model_dump(exclude_none=True),
         )
 
-        if response_index >= len(eval_case.responses):
+        try:
+            response_content = _get_response_from_eval_case(
+                eval_case, response_index, metric_name
+            )
+        except ValueError as e:
+            return types.EvalCaseMetricResult(
+                metric_name=metric_name,
+                error_message=str(e),
+            )
+
+        if not response_content:
             return types.EvalCaseMetricResult(
-                metric_name=self.metric.name,
+                metric_name=metric_name,
                 error_message=(
-                    f"response_index {response_index} out of bounds for EvalCase"
-                    f" {eval_case.eval_case_id or 'Unknown ID'}."
+                    f"No response found for candidate {response_index} in EvalCase"
+                    f" {eval_case.eval_case_id}."
                 ),
             )
 
-        if not eval_case.responses:
-            raise ValueError(f"EvalCase {eval_case.eval_case_id} has no responses.")
-
-        current_response_candidate = eval_case.responses[response_index]
-
         instance_for_custom_fn = eval_case.model_dump(
             exclude={"responses"}, mode="json", exclude_none=True
         )
-        instance_for_custom_fn["response"] = current_response_candidate.model_dump(
+        instance_for_custom_fn["response"] = response_content.model_dump(
             mode="json", exclude_none=True
-        ).get("response")
+        )
+        extracted_prompt = _get_prompt_from_eval_case(eval_case)
+        if extracted_prompt:
+            instance_for_custom_fn["prompt"] = extracted_prompt.model_dump(
+                mode="json", exclude_none=True
+            )
 
         error_msg = None
         score = None
@@ -952,14 +969,9 @@ def _build_request_payload(
         self, eval_case: types.EvalCase, response_index: int
     ) -> dict[str, Any]:
         """Builds the request parameters for evaluate instances request."""
-        if (
-            not eval_case.responses or response_index >= len(eval_case.responses)
-        ) and not getattr(eval_case, "agent_data", None):
-            raise IndexError(f"response_index {response_index} is out of bounds.")
-
-        response_content = None
-        if eval_case.responses and response_index < len(eval_case.responses):
-            response_content = eval_case.responses[response_index].response
+        response_content = _get_response_from_eval_case(
+            eval_case, response_index, self.metric.name
+        )
 
         if not response_content and not getattr(eval_case, "agent_data", None):
             raise ValueError(
@@ -980,21 +992,22 @@
                 eval_case.reference.response
             )
 
+        extracted_prompt = _get_prompt_from_eval_case(eval_case)
         prompt_instance_data = None
         if self.metric.name is not None and self.metric.name.startswith("multi_turn"):
             prompt_contents = []
             if eval_case.conversation_history:
                 for message in eval_case.conversation_history:
                     prompt_contents.append(message.content)
-            if eval_case.prompt:
-                prompt_contents.append(eval_case.prompt)
+            if extracted_prompt:
+                prompt_contents.append(extracted_prompt)
 
             prompt_instance_data = types.evals.InstanceData(
                 contents=types.evals.InstanceDataContents(contents=prompt_contents)
             )
         else:
             prompt_instance_data = PredefinedMetricHandler._content_to_instance_data(
-                eval_case.prompt
+                extracted_prompt
             )
 
         other_data_map: dict[str, Any] = {}
@@ -1030,15 +1043,7 @@
             "instance": instance_payload,
         }
 
-        autorater_config: dict[str, Any] = {}
-        if self.metric.judge_model:
-            autorater_config["autorater_model"] = self.metric.judge_model
-        if self.metric.judge_model_generation_config:
-            autorater_config["generation_config"] = (
-                self.metric.judge_model_generation_config
-            )
-        if self.metric.judge_model_sampling_count:
-            autorater_config["sampling_count"] = self.metric.judge_model_sampling_count
+        autorater_config = _get_autorater_config(self.metric)
         if autorater_config:
             request_payload["autorater_config"] = genai_types.AutoraterConfig(
                 **autorater_config
@@ -1155,10 +1160,10 @@ def _build_request_payload(
         self, eval_case: types.EvalCase, response_index: int
    ) -> dict[str, Any]:
         """Builds the request parameters for evaluate instances request."""
-        if not eval_case.responses or response_index >= len(eval_case.responses):
-            raise IndexError(f"response_index {response_index} is out of bounds.")
+        response_content = _get_response_from_eval_case(
+            eval_case, response_index, self.metric.name
+        )
 
-        response_content = eval_case.responses[response_index].response
         if not response_content:
             raise ValueError(
                 f"Response content missing for candidate {response_index}."
@@ -1170,8 +1175,9 @@
                 eval_case.reference.response
             )
 
+        extracted_prompt = _get_prompt_from_eval_case(eval_case)
         prompt_instance_data = PredefinedMetricHandler._content_to_instance_data(
-            eval_case.prompt
+            extracted_prompt
         )
 
         instance_payload = types.EvaluationInstance(