Skip to content

Commit 565482a

Browse files
jsondai authored and copybara-github committed
feat: GenAI Client(evals) - convert raw Agent Session history into the new AgentData structure
PiperOrigin-RevId: 869945268
1 parent 89d5723 commit 565482a

6 files changed

Lines changed: 547 additions & 159 deletions

File tree

tests/unit/vertexai/genai/replays/test_evaluate_instances.py

Lines changed: 31 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -81,13 +81,19 @@ def test_rouge_metric(client):
8181

8282
def test_pointwise_metric(client):
8383
"""Tests the _evaluate_instances method with PointwiseMetricInput."""
84-
instance_dict = {"prompt": "What is the capital of France?", "response": "Paris"}
84+
instance_dict = {
85+
"prompt": "What is the capital of France?",
86+
"response": "Paris",
87+
}
8588
json_instance = json.dumps(instance_dict)
8689

8790
test_input = types.PointwiseMetricInput(
8891
instance=types.PointwiseMetricInstance(json_instance=json_instance),
8992
metric_spec=genai_types.PointwiseMetricSpec(
90-
metric_prompt_template="Evaluate if the response '{response}' correctly answers the prompt '{prompt}'."
93+
metric_prompt_template=(
94+
"Evaluate if the response '{response}' correctly answers the"
95+
" prompt '{prompt}'."
96+
)
9197
),
9298
)
9399
response = client.evals.evaluate_instances(
@@ -101,19 +107,20 @@ def test_pointwise_metric(client):
101107

102108
def test_pointwise_metric_with_agent_data(client):
103109
"""Tests the _evaluate_instances method with PointwiseMetricInput and agent_data."""
104-
instance_dict = {"prompt": "What is the capital of France?", "response": "Paris"}
110+
instance_dict = {
111+
"prompt": "What is the capital of France?",
112+
"response": "Paris",
113+
}
105114
json_instance = json.dumps(instance_dict)
106115
agent_data = types.evals.AgentData(
107116
agent_config=types.evals.AgentConfig(
108-
tools=types.evals.Tools(
109-
tool=[
110-
genai_types.Tool(
111-
function_declarations=[
112-
genai_types.FunctionDeclaration(name="search")
113-
]
114-
)
115-
]
116-
),
117+
tools=[
118+
genai_types.Tool(
119+
function_declarations=[
120+
genai_types.FunctionDeclaration(name="search")
121+
]
122+
)
123+
],
117124
developer_instruction=types.evals.InstanceData(text="instruction"),
118125
),
119126
events=types.evals.Events(
@@ -129,7 +136,10 @@ def test_pointwise_metric_with_agent_data(client):
129136
test_input = types.PointwiseMetricInput(
130137
instance=types.PointwiseMetricInstance(json_instance=json_instance),
131138
metric_spec=genai_types.PointwiseMetricSpec(
132-
metric_prompt_template="Evaluate if the response '{response}' correctly answers the prompt '{prompt}'."
139+
metric_prompt_template=(
140+
"Evaluate if the response '{response}' correctly answers the"
141+
" prompt '{prompt}'."
142+
)
133143
),
134144
)
135145
response = client.evals.evaluate_instances(
@@ -189,7 +199,10 @@ def test_pairwise_metric_with_autorater(client):
189199
test_input = types.PairwiseMetricInput(
190200
instance=types.PairwiseMetricInstance(json_instance=json_instance),
191201
metric_spec=genai_types.PairwiseMetricSpec(
192-
metric_prompt_template="Which response is a better summary? Baseline: '{baseline_response}' or Candidate: '{candidate_response}'"
202+
metric_prompt_template=(
203+
"Which response is a better summary? Baseline:"
204+
" '{baseline_response}' or Candidate: '{candidate_response}'"
205+
)
193206
),
194207
)
195208
autorater_config = genai_types.AutoraterConfig(sampling_count=2)
@@ -240,7 +253,10 @@ def test_inference_with_prompt_template(client):
240253

241254
def test_run_inference_with_agent(client):
242255
test_df = pd.DataFrame(
243-
{"prompt": ["agent prompt"], "session_inputs": ['{"user_id": "user_123"}']}
256+
{
257+
"prompt": ["agent prompt"],
258+
"session_inputs": ['{"user_id": "user_123"}'],
259+
}
244260
)
245261
inference_result = client.evals.run_inference(
246262
agent="projects/977012026409/locations/us-central1/reasoningEngines/7188347537655332864",

tests/unit/vertexai/genai/test_evals.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4056,7 +4056,7 @@ def test_eval_case_to_agent_data(self):
40564056
)
40574057

40584058
assert agent_data.agent_config.developer_instruction.text == "instruction1"
4059-
assert agent_data.agent_config.tools.tool == [tool]
4059+
assert agent_data.agent_config.legacy_tools.tool == [tool]
40604060
assert agent_data.events.event[0].parts[0].text == "intermediate event"
40614061

40624062
def test_eval_case_to_agent_data_events_only(self):
@@ -4164,7 +4164,7 @@ def test_eval_case_to_agent_data_agent_info_empty_tools(self):
41644164
)
41654165

41664166
assert agent_data.agent_config.developer_instruction.text == "instruction1"
4167-
assert not agent_data.agent_config.tools.tool
4167+
assert not agent_data.agent_config.legacy_tools.tool
41684168

41694169
def test_eval_case_to_agent_data_agent_info_empty(self):
41704170
intermediate_events = [

vertexai/_genai/_evals_common.py

Lines changed: 49 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1299,21 +1299,36 @@ def _run_agent_internal(
12991299
agent=agent,
13001300
prompt_dataset=prompt_dataset,
13011301
)
1302+
1303+
agent_obj = agent_engine if agent_engine else agent
1304+
13021305
processed_intermediate_events = []
13031306
processed_responses = []
1304-
for resp_item in raw_responses:
1307+
processed_agent_data = [] # New column for AgentData
1308+
1309+
for i, resp_item in enumerate(raw_responses):
13051310
intermediate_events_row: list[dict[str, Any]] = []
13061311
response_row = None
1312+
1313+
# --- Legacy Logic: Intermediate Events & Response ---
13071314
if isinstance(resp_item, list):
13081315
try:
1309-
response_row = resp_item[-1]["content"]["parts"][0]["text"]
1316+
# Attempt to extract final response text
1317+
if resp_item and "content" in resp_item[-1]:
1318+
# Basic extraction, assumes last message is model response
1319+
final_content = resp_item[-1]["content"]
1320+
if isinstance(final_content, dict) and "parts" in final_content:
1321+
response_row = final_content["parts"][0].get("text", "")
1322+
elif hasattr(final_content, "parts"):
1323+
response_row = final_content.parts[0].text
1324+
13101325
for intermediate_event in resp_item[:-1]:
13111326
intermediate_events_row.append(
13121327
{
1313-
"event_id": intermediate_event["id"],
1314-
"content": intermediate_event["content"],
1315-
"creation_timestamp": intermediate_event["timestamp"],
1316-
"author": intermediate_event["author"],
1328+
"event_id": intermediate_event.get("id"),
1329+
"content": intermediate_event.get("content"),
1330+
"creation_timestamp": intermediate_event.get("timestamp"),
1331+
"author": intermediate_event.get("author"),
13171332
}
13181333
)
13191334
except Exception as e: # pylint: disable=broad-exception-caught
@@ -1335,6 +1350,33 @@ def _run_agent_internal(
13351350
processed_intermediate_events.append(intermediate_events_row)
13361351
processed_responses.append(response_row)
13371352

1353+
# --- New Logic: AgentData ---
1354+
agent_data_obj = None
1355+
try:
1356+
# 1. Get User Prompt for the current row
1357+
primary_prompt_column = (
1358+
"request" if "request" in prompt_dataset.columns else "prompt"
1359+
)
1360+
user_prompt_val = prompt_dataset.iloc[i][primary_prompt_column]
1361+
1362+
# 2. Construct Full Session History (User Prompt + Agent Events)
1363+
# Normalize user prompt into a message dict structure
1364+
user_event = {"role": "user", "content": user_prompt_val}
1365+
1366+
full_session_history = [user_event]
1367+
if isinstance(resp_item, list):
1368+
full_session_history.extend(resp_item)
1369+
1370+
# 3. Create AgentData using the new factory method
1371+
agent_data_obj = types.evals.AgentData.from_session(
1372+
agent_obj, full_session_history
1373+
)
1374+
except Exception as e:
1375+
logger.warning("Failed to adapt AgentData for row %d: %s", i, e)
1376+
# Proceed without AgentData; backend will fallback to legacy fields
1377+
1378+
processed_agent_data.append(agent_data_obj)
1379+
13381380
if len(processed_responses) != len(prompt_dataset) or len(
13391381
processed_responses
13401382
) != len(processed_intermediate_events):
@@ -1353,6 +1395,7 @@ def _run_agent_internal(
13531395
{
13541396
_evals_constant.INTERMEDIATE_EVENTS: processed_intermediate_events,
13551397
_evals_constant.RESPONSE: processed_responses,
1398+
"agent_data": processed_agent_data, # Populate agent_data
13561399
}
13571400
)
13581401

vertexai/_genai/_evals_metric_handlers.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -879,6 +879,11 @@ def _eval_case_to_agent_data(
879879
eval_case: types.EvalCase,
880880
) -> Optional[types.evals.AgentData]:
881881
"""Converts an EvalCase object to an AgentData object."""
882+
# --- NEW LOGIC: Use the structured agent_data if present ---
883+
if getattr(eval_case, "agent_data", None):
884+
return eval_case.agent_data
885+
886+
# --- LEGACY LOGIC: Fallback for older dataframes ---
882887
if not eval_case.agent_info and not eval_case.intermediate_events:
883888
return None
884889
tools = None
@@ -899,7 +904,7 @@ def _eval_case_to_agent_data(
899904

900905
if tools or developer_instruction:
901906
agent_config = types.evals.AgentConfig(
902-
tools=tools,
907+
legacy_tools=tools,
903908
developer_instruction=developer_instruction,
904909
)
905910

vertexai/_genai/types/common.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1492,6 +1492,10 @@ class EvalCase(_common.BaseModel):
14921492
default=None,
14931493
description="""This field is experimental and may change in future versions. The agent info of the agent under evaluation. This can be extended for multi-agent evaluation.""",
14941494
)
1495+
agent_data: Optional[evals_types.AgentData] = Field(
1496+
default=None,
1497+
description="""This field is experimental and may change in future versions. The agent data of the agent under evaluation.""",
1498+
)
14951499
# Allow extra fields to support custom metric prompts and stay backward compatible.
14961500
model_config = ConfigDict(frozen=True, extra="allow")
14971501

@@ -1526,6 +1530,9 @@ class EvalCaseDict(TypedDict, total=False):
15261530
agent_info: Optional[evals_types.AgentInfo]
15271531
"""This field is experimental and may change in future versions. The agent info of the agent under evaluation. This can be extended for multi-agent evaluation."""
15281532

1533+
agent_data: Optional[evals_types.AgentData]
1534+
"""This field is experimental and may change in future versions. The agent data of the agent under evaluation."""
1535+
15291536

15301537
EvalCaseOrDict = Union[EvalCase, EvalCaseDict]
15311538

0 commit comments

Comments (0)