From bbb449c8db03a1aff22009d122cae1122bbf80d7 Mon Sep 17 00:00:00 2001
From: Jason Dai <jsndai@google.com>
Date: Tue, 17 Mar 2026 12:45:50 -0700
Subject: [PATCH] chore: GenAI Client(evals) - refactor evaluation data
 handling for Agent-based evals

PiperOrigin-RevId: 885160026
---
 tests/unit/vertexai/genai/test_evals.py   | 296 +++++++++++++++----
 vertexai/_genai/_evals_common.py          | 234 +++++++++------
 vertexai/_genai/_evals_data_converters.py |   4 +-
 vertexai/_genai/_evals_metric_handlers.py | 327 ++++++++++----------
 vertexai/_genai/types/evals.py            | 344 +++++++++-------------
 5 files changed, 710 insertions(+), 495 deletions(-)

diff --git a/tests/unit/vertexai/genai/test_evals.py b/tests/unit/vertexai/genai/test_evals.py
index 2b534d4740..517a0ab33a 100644
--- a/tests/unit/vertexai/genai/test_evals.py
+++ b/tests/unit/vertexai/genai/test_evals.py
@@ -258,7 +258,10 @@ def test_eval_evaluate_with_agent_info(self, mock_execute_evaluation):
         dataset = vertexai_genai_types.EvaluationDataset(
             eval_dataset_df=pd.DataFrame([{"prompt": "p1", "response": "r1"}])
         )
-        agent_info = {"agent1": {"name": "agent1", "instruction": "instruction1"}}
+        agent_info = {
+            "name": "agent_system",
+            "agents": {"agent1": {"agent_id": "agent1", "instruction": "instruction1"}},
+        }
         self.client.evals.evaluate(
             dataset=dataset,
             metrics=[vertexai_genai_types.Metric(name="exact_match")],
@@ -1313,6 +1316,31 @@ def test_run_inference_with_agent_engine_and_session_inputs_dict(
                         ]
                     ],
                     "response": ["agent response"],
+                    "agent_data": [
+                        {
+                            "agents": None,
+                            "turns": [
+                                {
+                                    "events": [
+                                        {
+                                            "author": "model",
+                                            "content": {
+                                                "parts": [{"text": "intermediate1"}]
+                                            },
+                                        },
+                                        {
+                                            "author": "model",
+                                            "content": {
+                                                "parts": [{"text": "agent response"}]
+                                            },
+                                        },
+                                    ],
+                                    "turn_id": "turn_0",
+                                    "turn_index": 0,
+                                }
+                            ],
+                        }
+                    ],
                 }
             ),
         )
@@ -1392,6 +1420,31 @@ def test_run_inference_with_agent_engine_and_session_inputs_literal_string(
                         ]
                     ],
                     "response": ["agent response"],
+                    "agent_data": [
+                        {
+                            "agents": None,
+                            "turns": [
+                                {
+                                    "events": [
+                                        {
+                                            "author": "model",
+                                            "content": {
+                                                "parts": [{"text": "intermediate1"}]
+                                            },
+                                        },
+                                        {
+                                            "author": "model",
+                                            "content": {
+                                                "parts": [{"text": "agent response"}]
+                                            },
+                                        },
+                                    ],
+                                    "turn_id": "turn_0",
+                                    "turn_index": 0,
+                                }
+                            ],
+                        }
+                    ],
                 }
             ),
         )
@@ -1571,6 +1624,72 @@ def run_async_side_effect(*args, **kwargs):
                     ],
                 ],
                 "response": ["agent response", "agent response 2"],
+                "agent_data": [
+                    {
+                        "agents": {
+                            "mock_agent": {
+                                "agent_id": "mock_agent",
+                                "agent_type": "Mock",
+                                "instruction": "mock instruction",
+                                "description": "mock description",
+                                "tools": [],
+                                "sub_agents": [],
+                            }
+                        },
+                        "turns": [
+                            {
+                                "events": [
+                                    {
+                                        "author": "model",
+                                        "content": {
+                                            "parts": [{"text": "intermediate1"}]
+                                        },
+                                    },
+                                    {
+                                        "author": "model",
+                                        "content": {
+                                            "parts": [{"text": "agent response"}]
+                                        },
+                                    },
+                                ],
+                                "turn_id": "turn_0",
+                                "turn_index": 0,
+                            }
+                        ],
+                    },
+                    {
+                        "agents": {
+                            "mock_agent": {
+                                "agent_id": "mock_agent",
+                                "agent_type": "Mock",
+                                "instruction": "mock instruction",
+                                "description": "mock description",
+                                "tools": [],
+                                "sub_agents": [],
+                            }
+                        },
+                        "turns": [
+                            {
+                                "events": [
+                                    {
+                                        "author": "model",
+                                        "content": {
+                                            "parts": [{"text": "intermediate2"}]
+                                        },
+                                    },
+                                    {
+                                        "author": "model",
+                                        "content": {
+                                            "parts": [{"text": "agent response 2"}]
+                                        },
+                                    },
+                                ],
+                                "turn_id": "turn_0",
+                                "turn_index": 0,
+                            }
+                        ],
+                    },
+                ],
             }
         )
         pd.testing.assert_frame_equal(
@@ -1952,6 +2071,31 @@ def test_run_agent_internal_success(self, mock_run_agent):
                     ]
                 ],
                 "response": ["final response"],
+                "agent_data": [
+                    {
+                        "agents": None,
+                        "turns": [
+                            {
+                                "events": [
+                                    {
+                                        "author": "model",
+                                        "content": {
+                                            "parts": [{"text": "intermediate1"}]
+                                        },
+                                    },
+                                    {
+                                        "author": "model",
+                                        "content": {
+                                            "parts": [{"text": "final response"}]
+                                        },
+                                    },
+                                ],
+                                "turn_id": "turn_0",
+                                "turn_index": 0,
+                            }
+                        ],
+                    }
+                ],
             }
         )
         pd.testing.assert_frame_equal(result_df, expected_df)
@@ -2144,24 +2288,24 @@ def test_run_agent_internal_malformed_event(self, mock_run_agent):
         assert not result_df["intermediate_events"][0]
 
 
-class TestIsMultiTurnAgentRun:
-    """Unit tests for the _is_multi_turn_agent_run function."""
+class TestIsMultiTurnAgentSimulation:
+    """Unit tests for the _is_multi_turn_agent_simulation function."""
 
-    def test_is_multi_turn_agent_run_with_config(self):
+    def test_is_multi_turn_agent_simulation_with_config(self):
         config = vertexai_genai_types.evals.UserSimulatorConfig(model_name="gemini-pro")
-        assert _evals_common._is_multi_turn_agent_run(
+        assert _evals_common._is_multi_turn_agent_simulation(
             user_simulator_config=config, prompt_dataset=pd.DataFrame()
         )
 
-    def test_is_multi_turn_agent_run_with_conversation_plan(self):
+    def test_is_multi_turn_agent_simulation_with_conversation_plan(self):
         prompt_dataset = pd.DataFrame({"conversation_plan": ["plan"]})
-        assert _evals_common._is_multi_turn_agent_run(
+        assert _evals_common._is_multi_turn_agent_simulation(
             user_simulator_config=None, prompt_dataset=prompt_dataset
         )
 
-    def test_is_multi_turn_agent_run_false(self):
+    def test_is_multi_turn_agent_simulation_false(self):
         prompt_dataset = pd.DataFrame({"prompt": ["prompt"]})
-        assert not _evals_common._is_multi_turn_agent_run(
+        assert not _evals_common._is_multi_turn_agent_simulation(
             user_simulator_config=None, prompt_dataset=prompt_dataset
         )
 
@@ -3327,15 +3471,21 @@ def test_agent_info_creation(self):
             ]
         )
         agent_info = vertexai_genai_types.evals.AgentInfo(
-            name="agent1",
-            instruction="instruction1",
-            description="description1",
-            tool_declarations=[tool],
+            name="agent_system",
+            agents={
+                "agent1": vertexai_genai_types.evals.AgentConfig(
+                    agent_id="agent1",
+                    instruction="instruction1",
+                    description="description1",
+                    tools=[tool],
+                )
+            },
         )
-        assert agent_info.name == "agent1"
-        assert agent_info.instruction == "instruction1"
-        assert agent_info.description == "description1"
-        assert agent_info.tool_declarations == [tool]
+        assert agent_info.name == "agent_system"
+        assert "agent1" in agent_info.agents
+        assert agent_info.agents["agent1"].instruction == "instruction1"
+        assert agent_info.agents["agent1"].description == "description1"
+        assert agent_info.agents["agent1"].tools == [tool]
 
     @mock.patch.object(genai_types.FunctionDeclaration, "from_callable_with_api_option")
     def test_load_from_agent(self, mock_from_callable):
@@ -3351,6 +3501,7 @@ def my_search_tool(query: str) -> str:
         mock_agent.instruction = "mock instruction"
         mock_agent.description = "mock description"
         mock_agent.tools = [my_search_tool]
+        mock_agent.sub_agents = []
 
         agent_info = vertexai_genai_types.evals.AgentInfo.load_from_agent(
             agent=mock_agent,
@@ -3358,15 +3509,15 @@ def my_search_tool(query: str) -> str:
         )
 
         assert agent_info.name == "mock_agent"
-        assert agent_info.instruction == "mock instruction"
-        assert agent_info.description == "mock description"
+        assert agent_info.agents["mock_agent"].instruction == "mock instruction"
+        assert agent_info.agents["mock_agent"].description == "mock description"
         assert (
             agent_info.agent_resource_name
             == "projects/123/locations/abc/reasoningEngines/456"
         )
-        assert len(agent_info.tool_declarations) == 1
-        assert isinstance(agent_info.tool_declarations[0], genai_types.Tool)
-        assert agent_info.tool_declarations[0].function_declarations == [
+        assert len(agent_info.agents["mock_agent"].tools) == 1
+        assert isinstance(agent_info.agents["mock_agent"].tools[0], genai_types.Tool)
+        assert agent_info.agents["mock_agent"].tools[0].function_declarations == [
             mock_function_declaration
         ]
         mock_from_callable.assert_called_once_with(callable=my_search_tool)
@@ -3482,7 +3633,9 @@ def test_no_conflict_with_inference_configs(self):
         dataset = vertexai_genai_types.EvaluationDataset(
             eval_dataset_df=pd.DataFrame([{"agent_data": {"turns": []}}])
         )
-        inference_configs = {"cand1": {"agent_configs": {"agent1": {"name": "agent1"}}}}
+        inference_configs = {
+            "cand1": {"agent_configs": {"agent1": {"agent_id": "agent1"}}}
+        }
         _evals_utils._validate_dataset_agent_data(dataset, inference_configs)
 
     def test_no_conflict_if_inference_configs_has_no_agent_configs(self):
@@ -3535,9 +3688,14 @@ def test_eval_case_with_agent_eval_fields(self):
             ]
         )
         agent_info = vertexai_genai_types.evals.AgentInfo(
-            name="agent1",
-            instruction="instruction1",
-            tool_declarations=[tool],
+            name="agent_system",
+            agents={
+                "agent1": vertexai_genai_types.evals.AgentConfig(
+                    agent_id="agent1",
+                    instruction="instruction1",
+                    tools=[tool],
+                )
+            },
         )
         intermediate_events = [
             vertexai_genai_types.evals.Event(
@@ -4407,9 +4565,14 @@ def test_eval_case_to_agent_data(self):
             ]
         )
         agent_info = vertexai_genai_types.evals.AgentInfo(
-            name="agent1",
-            instruction="instruction1",
-            tool_declarations=[tool],
+            name="agent_system",
+            agents={
+                "agent1": vertexai_genai_types.evals.AgentConfig(
+                    agent_id="agent1",
+                    instruction="instruction1",
+                    tools=[tool],
+                )
+            },
         )
         intermediate_events = [
             vertexai_genai_types.evals.Event(
@@ -4417,6 +4580,7 @@ def test_eval_case_to_agent_data(self):
                 content=genai_types.Content(
                     parts=[genai_types.Part(text="intermediate event")]
                 ),
+                author="agent1",
             )
         ]
         eval_case = vertexai_genai_types.EvalCase(
@@ -4432,13 +4596,19 @@ def test_eval_case_to_agent_data(self):
 
         agent_data = (
             _evals_metric_handlers.PredefinedMetricHandler._eval_case_to_agent_data(
-                eval_case
+                eval_case,
+                eval_case.prompt,
+                eval_case.responses[0].response,
             )
         )
 
-        assert agent_data.agent_config.developer_instruction.text == "instruction1"
-        assert agent_data.agent_config.legacy_tools.tool == [tool]
-        assert agent_data.events.event[0].parts[0].text == "intermediate event"
+        assert "agent1" in agent_data.agents
+        assert agent_data.agents["agent1"].instruction == "instruction1"
+        assert agent_data.agents["agent1"].tools == [tool]
+        assert len(agent_data.turns[0].events) == 3
+        assert (
+            agent_data.turns[0].events[1].content.parts[0].text == "intermediate event"
+        )
 
     def test_eval_case_to_agent_data_events_only(self):
         intermediate_events = [
@@ -4466,8 +4636,10 @@ def test_eval_case_to_agent_data_events_only(self):
             )
         )
 
-        assert agent_data.agent_config is None
-        assert agent_data.events.event[0].parts[0].text == "intermediate event"
+        assert agent_data.agents is None
+        assert (
+            agent_data.turns[0].events[0].content.parts[0].text == "intermediate event"
+        )
 
     def test_eval_case_to_agent_data_empty_event_content(self):
         intermediate_events = [
@@ -4493,14 +4665,19 @@ def test_eval_case_to_agent_data_empty_event_content(self):
             )
         )
 
-        assert agent_data.agent_config is None
-        assert not agent_data.events.event
+        assert agent_data.agents is None
+        assert agent_data.turns[0].events[0].content is None
 
     def test_eval_case_to_agent_data_empty_intermediate_events_list(self):
         agent_info = vertexai_genai_types.evals.AgentInfo(
-            name="agent1",
-            instruction="instruction1",
-            tool_declarations=[],
+            name="agent_system",
+            agents={
+                "agent1": vertexai_genai_types.evals.AgentConfig(
+                    agent_id="agent1",
+                    instruction="instruction1",
+                    tools=[],
+                )
+            },
         )
 
         eval_case = vertexai_genai_types.EvalCase(
@@ -4519,13 +4696,18 @@ def test_eval_case_to_agent_data_empty_intermediate_events_list(self):
             )
         )
 
-        assert not agent_data.events.event
+        assert agent_data.turns is None
 
     def test_eval_case_to_agent_data_agent_info_empty_tools(self):
         agent_info = vertexai_genai_types.evals.AgentInfo(
-            name="agent1",
-            instruction="instruction1",
-            tool_declarations=[],
+            name="agent_system",
+            agents={
+                "agent1": vertexai_genai_types.evals.AgentConfig(
+                    agent_id="agent1",
+                    instruction="instruction1",
+                    tools=[],
+                )
+            },
         )
         eval_case = vertexai_genai_types.EvalCase(
             prompt=genai_types.Content(parts=[genai_types.Part(text="Hello")]),
@@ -4544,8 +4726,8 @@ def test_eval_case_to_agent_data_agent_info_empty_tools(self):
             )
         )
 
-        assert agent_data.agent_config.developer_instruction.text == "instruction1"
-        assert not agent_data.agent_config.legacy_tools.tool
+        assert agent_data.agents["agent1"].instruction == "instruction1"
+        assert not agent_data.agents["agent1"].tools
 
     def test_eval_case_to_agent_data_agent_info_empty(self):
         intermediate_events = [
@@ -4573,7 +4755,7 @@ def test_eval_case_to_agent_data_agent_info_empty(self):
             )
         )
 
-        assert agent_data.agent_config is None
+        assert agent_data.agents is None
 
     @mock.patch.object(_evals_metric_handlers.logger, "warning")
     def test_tool_use_quality_metric_no_tool_call_logs_warning(
@@ -5102,10 +5284,15 @@ def test_execute_evaluation_with_agent_info(
             ]
         }
         agent_info = {
-            "name": "agent1",
-            "instruction": "instruction1",
-            "description": "description1",
-            "tool_declarations": [tool],
+            "name": "agent_system",
+            "agents": {
+                "agent1": {
+                    "agent_id": "agent1",
+                    "instruction": "instruction1",
+                    "description": "description1",
+                    "tools": [tool],
+                }
+            },
         }
 
         result = _evals_common._execute_evaluation(
@@ -5117,9 +5304,10 @@ def test_execute_evaluation_with_agent_info(
 
         assert isinstance(result, vertexai_genai_types.EvaluationResult)
         assert len(result.eval_case_results) == 1
-        assert result.agent_info.name == "agent1"
-        assert result.agent_info.instruction == "instruction1"
-        assert result.agent_info.tool_declarations == [
+        assert result.agent_info.name == "agent_system"
+        assert "agent1" in result.agent_info.agents
+        assert result.agent_info.agents["agent1"].instruction == "instruction1"
+        assert result.agent_info.agents["agent1"].tools == [
             genai_types.Tool(
                 function_declarations=[
                     genai_types.FunctionDeclaration(
diff --git a/vertexai/_genai/_evals_common.py b/vertexai/_genai/_evals_common.py
index 9fe93230eb..5cef72e641 100644
--- a/vertexai/_genai/_evals_common.py
+++ b/vertexai/_genai/_evals_common.py
@@ -346,14 +346,7 @@ def _resolve_inference_configs(
     if agent_info_pydantic and agent_info_pydantic.name:
         inference_configs = {}
         inference_configs[agent_info_pydantic.name] = (
-            types.EvaluationRunInferenceConfig(
-                agent_config=types.EvaluationRunAgentConfig(
-                    developer_instruction=genai_types.Content(
-                        parts=[genai_types.Part(text=agent_info_pydantic.instruction)]
-                    ),
-                    tools=agent_info_pydantic.tool_declarations,
-                )
-            )
+            types.EvaluationRunInferenceConfig(agent_configs=agent_info_pydantic.agents)
         )
     # Resolve prompt template data
     if inference_configs:
@@ -1604,93 +1597,156 @@ def _get_session_inputs(row: pd.Series) -> types.evals.SessionInput:
         )
 
 
-def _is_multi_turn_agent_run(
+def _is_multi_turn_agent_simulation(
     user_simulator_config: Optional[types.evals.UserSimulatorConfig] = None,
     prompt_dataset: pd.DataFrame = None,
 ) -> bool:
-    """Checks if the agent run is multi-turn."""
+    """Checks if the agent run is a multi-turn user simulation.
+
+    Returns True when `user_simulator_config` is not None or when
+    `prompt_dataset` has a "conversation_plan" column.
+
+    NOTE(review): `prompt_dataset` defaults to None, but `.columns` is read
+    whenever `user_simulator_config` is None, so a DataFrame must be passed
+    in that case.
+    """
     return (
         user_simulator_config is not None
         or "conversation_plan" in prompt_dataset.columns
     )
 
 
-def _run_agent_internal(
-    api_client: BaseApiClient,
-    agent_engine: Optional[Union[str, types.AgentEngine]],
-    agent: Optional[LlmAgent],
-    prompt_dataset: pd.DataFrame,
-    user_simulator_config: Optional[types.evals.UserSimulatorConfig] = None,
-) -> pd.DataFrame:
-    """Runs an agent."""
-    raw_responses = _run_agent(
-        api_client=api_client,
-        agent_engine=agent_engine,
-        agent=agent,
-        prompt_dataset=prompt_dataset,
-        user_simulator_config=user_simulator_config,
-    )
-    processed_intermediate_events = []
-    processed_responses = []
-    processed_agent_data = []
-    agent_data_agents = None
-    if agent:
-        agent_data_agents = types.evals.AgentData._get_agents_map(agent)
+def _process_multi_turn_agent_response(
+    resp_item: Any,
+    agent_data_agents: Optional[dict[str, Any]],
+) -> Optional[Union[str, dict[str, Any]]]:
+    """Converts one multi-turn agent response into an agent data value.
+
+    Args:
+        resp_item: A single raw response. A dict containing an "error" key is
+            treated as a failure; anything else is passed to `AgentData` as
+            its `turns`.
+        agent_data_agents: Value for the `agents` field of `AgentData`; may be
+            None.
+
+    Returns:
+        The JSON-serialized `resp_item` for an error response, otherwise
+        `AgentData(...).model_dump(exclude_unset=True)`.
+    """
+    if isinstance(resp_item, dict) and "error" in resp_item:
+        return json.dumps(resp_item)
+    return types.evals.AgentData(
+        turns=resp_item,
+        agents=agent_data_agents,
+    ).model_dump(exclude_unset=True)
+
+
+def _process_single_turn_agent_response(
+    resp_item: Any,
+    agent_data_agents: Optional[dict[str, Any]],
+) -> tuple[
+    Optional[Union[str, dict[str, Any]]],
+    list[dict[str, Any]],
+    Optional[Union[str, dict[str, Any]]],
+]:
+    """Converts one single-turn agent response into result-row values.
+
+    Args:
+        resp_item: A single raw response. A list is read as a sequence of
+            event dicts, a dict containing an "error" key as a failure, and
+            anything else as an unexpected response.
+        agent_data_agents: Value for the `agents` field of `AgentData`; may be
+            None.
+
+    Returns:
+        A `(response, intermediate_events, agent_data)` tuple:
+          * response: text of the first part of the last event, or a JSON
+            string describing the error.
+          * intermediate_events: one dict per event before the last one, with
+            keys "event_id", "content", "creation_timestamp" and "author".
+          * agent_data: a dumped `AgentData` holding every event in a single
+            turn ("turn_0"), the same JSON error string when parsing fails,
+            or None when `resp_item` is not a list.
+    """
+    intermediate_events_row: list[dict[str, Any]] = []
+    response_row: Optional[Union[str, dict[str, Any]]] = None
+    agent_data_row: Optional[Union[str, dict[str, Any]]] = None
 
-    for resp_item in raw_responses:
-        intermediate_events_row: list[dict[str, Any]] = []
-        response_row: Optional[Union[str, dict[str, Any]]] = None
-        agent_data_row: Optional[Union[str, dict[str, Any]]] = None
+    if isinstance(resp_item, list):
+        try:
+            # The last event holds the final response text; every earlier
+            # event is recorded as an intermediate event.
+            response_row = resp_item[-1]["content"]["parts"][0]["text"]
+            for intermediate_event in resp_item[:-1]:
+                intermediate_events_row.append(
+                    {
+                        "event_id": intermediate_event.get("id"),
+                        "content": intermediate_event.get("content"),
+                        "creation_timestamp": intermediate_event.get("timestamp"),
+                        "author": intermediate_event.get("author"),
+                    }
+                )
+            # Construct AgentData natively for single-turn runs
+            agent_events = []
+            for event_dict in resp_item:
+                content_dict = event_dict.get("content")
+                content_obj = None
+                if content_dict:
+                    content_obj = genai_types.Content.model_validate(content_dict)
+
+                # Events without an "author" key are attributed to "model".
+                agent_events.append(
+                    types.evals.AgentEvent(
+                        author=event_dict.get("author", "model"),
+                        content=content_obj,
+                    )
+                )
 
-        if _is_multi_turn_agent_run(user_simulator_config, prompt_dataset):
-            if isinstance(resp_item, dict) and "error" in resp_item:
-                agent_data_row = json.dumps(resp_item)
-            else:
-                # TODO: Migrate single turn agent run result to AgentData.
-                agent_data_row = types.evals.AgentData(
-                    turns=resp_item,
-                    agents=agent_data_agents,
-                ).model_dump()
+            turn = types.evals.ConversationTurn(
+                turn_index=0,
+                turn_id="turn_0",
+                events=agent_events,
+            )
+            agent_data_row = types.evals.AgentData(
+                turns=[turn],
+                agents=agent_data_agents,
+            ).model_dump(exclude_unset=True)
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            # Any failure above is surfaced as the same JSON error string in
+            # both the response and the agent_data values.
+            error_payload = {
+                "error": (
+                    f"Failed to parse agent run response {str(resp_item)} to "
+                    f"agent data: {e}"
+                ),
+            }
+            response_row = json.dumps(error_payload)
+            agent_data_row = json.dumps(error_payload)
+    elif isinstance(resp_item, dict) and "error" in resp_item:
+        # A dict carrying an "error" key is serialized as-is; agent_data_row
+        # stays None.
+        response_row = json.dumps(resp_item)
+    else:
+        error_payload = {
+            "error": "Unexpected response type from agent run",
+            "response_type": str(type(resp_item)),
+            "details": str(resp_item),
+        }
+        response_row = json.dumps(error_payload)
 
-        else:
-            if isinstance(resp_item, list):
-                try:
-                    response_row = resp_item[-1]["content"]["parts"][0]["text"]
-                    for intermediate_event in resp_item[:-1]:
-                        intermediate_events_row.append(
-                            {
-                                "event_id": intermediate_event.get("id"),
-                                "content": intermediate_event.get("content"),
-                                "creation_timestamp": intermediate_event.get(
-                                    "timestamp"
-                                ),
-                                "author": intermediate_event.get("author"),
-                            }
-                        )
-                except Exception as e:  # pylint: disable=broad-exception-caught
-                    error_payload = {
-                        "error": (
-                            f"Failed to parse agent run response {str(resp_item)} to "
-                            f"agent data: {e}"
-                        ),
-                    }
-                    response_row = json.dumps(error_payload)
-            elif isinstance(resp_item, dict) and "error" in resp_item:
-                response_row = json.dumps(resp_item)
-            else:
-                error_payload = {
-                    "error": "Unexpected response type from agent run",
-                    "response_type": str(type(resp_item)),
-                    "details": str(resp_item),
-                }
-                response_row = json.dumps(error_payload)
+    return response_row, intermediate_events_row, agent_data_row
 
-        processed_intermediate_events.append(intermediate_events_row)
-        processed_responses.append(response_row)
-        processed_agent_data.append(agent_data_row)
 
+def _create_agent_results_dataframe(
+    prompt_dataset: pd.DataFrame,
+    processed_responses: list[Any],
+    processed_intermediate_events: list[Any],
+    processed_agent_data: list[Any],
+    is_user_simulation: bool,
+) -> pd.DataFrame:
+    """Creates a DataFrame from the processed agent responses."""
     df_dict: dict[str, Any] = {}
-    if _is_multi_turn_agent_run(user_simulator_config, prompt_dataset):
+    if is_user_simulation:
         df_dict[AGENT_DATA] = processed_agent_data
         if len(processed_agent_data) != len(prompt_dataset):
             raise RuntimeError(
@@ -1705,6 +1716,7 @@ def _run_agent_internal(
     else:
         df_dict[_evals_constant.INTERMEDIATE_EVENTS] = processed_intermediate_events
         df_dict[_evals_constant.RESPONSE] = processed_responses
+        df_dict[AGENT_DATA] = processed_agent_data
         if len(processed_responses) != len(prompt_dataset) or len(
             processed_responses
         ) != len(processed_intermediate_events):
@@ -1730,6 +1742,77 @@ def _run_agent_internal(
     return results_df
 
 
+def _run_agent_internal(
+    api_client: BaseApiClient,
+    agent_engine: Optional[Union[str, types.AgentEngine]],
+    agent: Optional[LlmAgent],
+    prompt_dataset: pd.DataFrame,
+    user_simulator_config: Optional[types.evals.UserSimulatorConfig] = None,
+) -> pd.DataFrame:
+    """Runs an agent over the prompt dataset and tabulates its responses.
+
+    Raw responses come from `_run_agent`. When the run is a user simulation
+    (see `_is_multi_turn_agent_simulation`), each response yields an agent
+    data row only; otherwise each response yields a response, its
+    intermediate events, and an agent data row. The rows are then assembled
+    by `_create_agent_results_dataframe`.
+
+    Args:
+        api_client: Passed through to `_run_agent`.
+        agent_engine: Passed through to `_run_agent`.
+        agent: Passed through to `_run_agent`. When truthy, its agents map
+            (`AgentData.get_agents_map`) is handed to the response processors.
+        prompt_dataset: Passed through to `_run_agent`; also used to detect a
+            user simulation and to build the results DataFrame.
+        user_simulator_config: Passed through to `_run_agent`; a non-None
+            value marks the run as a multi-turn user simulation.
+
+    Returns:
+        The DataFrame produced by `_create_agent_results_dataframe`.
+    """
+    raw_responses = _run_agent(
+        api_client=api_client,
+        agent_engine=agent_engine,
+        agent=agent,
+        prompt_dataset=prompt_dataset,
+        user_simulator_config=user_simulator_config,
+    )
+    processed_intermediate_events = []
+    processed_responses = []
+    processed_agent_data = []
+    agent_data_agents = None
+    if agent:
+        agent_data_agents = types.evals.AgentData.get_agents_map(agent)
+
+    # Decided once for the whole dataset; every response takes the same branch.
+    is_user_simulation = _is_multi_turn_agent_simulation(
+        user_simulator_config, prompt_dataset
+    )
+
+    for resp_item in raw_responses:
+        if is_user_simulation:
+            # User simulations only collect agent data rows.
+            agent_data_row = _process_multi_turn_agent_response(
+                resp_item, agent_data_agents
+            )
+            processed_agent_data.append(agent_data_row)
+        else:
+            response_row, intermediate_events_row, agent_data_row = (
+                _process_single_turn_agent_response(resp_item, agent_data_agents)
+            )
+            processed_responses.append(response_row)
+            processed_intermediate_events.append(intermediate_events_row)
+            processed_agent_data.append(agent_data_row)
+
+    return _create_agent_results_dataframe(
+        prompt_dataset,
+        processed_responses,
+        processed_intermediate_events,
+        processed_agent_data,
+        is_user_simulation,
+    )
+
+
 def _run_agent(
     api_client: BaseApiClient,
     agent_engine: Optional[Union[str, types.AgentEngine]],
@@ -2163,12 +2224,17 @@ def _get_agent_info_from_inference_configs(
         else None
     )
     instruction = di.parts[0].text if di and di.parts and di.parts[0].text else None
+    tools = agent_config.tools if agent_config and agent_config.tools else None
+
     return types.evals.AgentInfo(
         name=candidate_names[0],
-        instruction=instruction,
-        tool_declarations=(
-            agent_config.tools if agent_config and agent_config.tools else None
-        ),
+        agents={
+            "agent_0": types.evals.AgentConfig(
+                instruction=instruction,
+                tools=tools,
+            )
+        },
+        root_agent_id="agent_0",
     )
 
 
diff --git a/vertexai/_genai/_evals_data_converters.py b/vertexai/_genai/_evals_data_converters.py
index 89b3cb9852..33f18b1519 100644
--- a/vertexai/_genai/_evals_data_converters.py
+++ b/vertexai/_genai/_evals_data_converters.py
@@ -810,6 +810,7 @@ def merge_evaluation_datasets(
                 "conversation_history",
                 "intermediate_events",
                 "agent_data",
+                "agent_info",
             },
             exclude_none=True,
         )
@@ -834,6 +835,7 @@ def merge_evaluation_datasets(
                     "conversation_history",
                     "intermediate_events",
                     "agent_data",
+                    "agent_info",
                 },
                 exclude_none=True,
             )
@@ -865,7 +867,7 @@ def merge_evaluation_datasets(
             reference=base_eval_case.reference,
             system_instruction=base_eval_case.system_instruction,
             conversation_history=base_eval_case.conversation_history,
-            agent_info=agent_info,
+            agent_info=agent_info or base_eval_case.agent_info,
             agent_data=base_eval_case.agent_data,
             intermediate_events=base_eval_case.intermediate_events,
             **eval_case_custom_columns,
diff --git a/vertexai/_genai/_evals_metric_handlers.py b/vertexai/_genai/_evals_metric_handlers.py
index 47e543fa10..b7dc349e68 100644
--- a/vertexai/_genai/_evals_metric_handlers.py
+++ b/vertexai/_genai/_evals_metric_handlers.py
@@ -93,6 +93,72 @@ def _extract_text_from_content(
     return text_accumulator if any_text_part_found else None
 
 
+def _get_prompt_from_eval_case(
+    eval_case: types.EvalCase,
+) -> Optional[genai_types.Content]:
+    """Returns eval_case.prompt, else the user scenario's starting prompt."""
+    if eval_case.prompt:
+        return eval_case.prompt
+
+    # Fall back to the scenario's starting prompt, wrapped as text Content.
+    scenario = getattr(eval_case, "user_scenario", None)
+    starting_prompt = scenario.starting_prompt if scenario else None
+    if not starting_prompt:
+        return None
+    text_part = genai_types.Part(text=starting_prompt)
+    return genai_types.Content(parts=[text_part])
+
+
+def _get_response_from_eval_case(
+    eval_case: types.EvalCase, response_index: int, metric_name: str
+) -> Optional[genai_types.Content]:
+    """Returns the response content at response_index, or None if absent."""
+    # metric_name is not read by this helper.
+    candidates = eval_case.responses
+    if not candidates or response_index >= len(candidates):
+        return None
+    return candidates[response_index].response
+
+
+def _value_to_content_list(value: Any) -> list[genai_types.Content]:
+    """Converts a value to Content objects; Message lists become one transcript."""
+
+    def _as_text_content(text: str) -> list[genai_types.Content]:
+        return [genai_types.Content(parts=[genai_types.Part(text=text)])]
+
+    if isinstance(value, genai_types.Content):
+        return [value]
+    if isinstance(value, types.ResponseCandidate):
+        return [value.response] if value.response else []
+    if isinstance(value, dict):
+        return _as_text_content(json.dumps(value))
+    if not isinstance(value, list) or not value:
+        return _as_text_content(str(value))
+    if isinstance(value[0], genai_types.Content):
+        return value
+    if not isinstance(value[0], types.evals.Message):
+        return _as_text_content(json.dumps(value))
+    transcript = []
+    for message in value:
+        text = _extract_text_from_content(message.content)
+        if text:
+            role = message.content.role or message.author or "user"
+            transcript.append(f"{role}: {text}")
+    return _as_text_content("\n".join(transcript))
+
+
+def _get_autorater_config(metric: types.Metric) -> dict[str, Any]:
+    """Builds the autorater config from the judge-model settings of a metric."""
+    judge_settings = {
+        "autorater_model": metric.judge_model,
+        "generation_config": metric.judge_model_generation_config,
+        "sampling_count": metric.judge_model_sampling_count,
+    }
+    # Falsy settings are left out, so an unconfigured metric yields {}.
+    configured = {name: val for name, val in judge_settings.items() if val}
+    return configured
+
+
 def _default_aggregate_scores(
     metric_name: str,
     eval_case_metric_results: list[types.EvalCaseMetricResult],
@@ -213,17 +279,13 @@ def _build_request_payload(
     ) -> dict[str, Any]:
         """Builds the request parameters for evaluate instances."""
         request_payload = {}
-        if response_index >= len(eval_case.responses):
-            raise IndexError(
-                f"response_index {response_index} out of bounds for eval_case with"
-                f" {len(eval_case.responses)} responses."
-            )
-        if eval_case.responses is None:
-            raise ValueError(
-                f"No responses found for eval_case with ID {eval_case.eval_case_id}."
-            )
-        current_response_candidate = eval_case.responses[response_index]
-        if _extract_text_from_content(current_response_candidate.response) is None:
+
+        response_content = _get_response_from_eval_case(
+            eval_case, response_index, self.metric.name
+        )
+        prediction_text = _extract_text_from_content(response_content)
+
+        if prediction_text is None:
             raise ValueError(
                 f"Response text missing for candidate {response_index} in eval_case"
                 f" {eval_case.eval_case_id or 'Unknown ID'}."
@@ -248,9 +310,7 @@ def _build_request_payload(
                 },
                 "instances": [
                     {
-                        "prediction": _extract_text_from_content(
-                            current_response_candidate.response
-                        ),
+                        "prediction": prediction_text,
                         "reference": _extract_text_from_content(
                             eval_case.reference.response
                         ),
@@ -262,9 +322,7 @@ def _build_request_payload(
                 "metric_spec": {},
                 "instances": [
                     {
-                        "prediction": _extract_text_from_content(
-                            current_response_candidate.response
-                        ),
+                        "prediction": prediction_text,
                         "reference": _extract_text_from_content(
                             eval_case.reference.response
                         ),
@@ -350,18 +408,13 @@ def _build_request_payload(
         if hasattr(self.metric, "target_language"):
             target_language = self.metric.target_language
 
-        if response_index >= len(eval_case.responses):
-            raise IndexError(
-                f"response_index {response_index} out of bounds for eval_case with"
-                f" {len(eval_case.responses)} responses."
-            )
+        response_content = _get_response_from_eval_case(
+            eval_case, response_index, self.metric.name
+        )
+        prediction_text = _extract_text_from_content(response_content)
+        prompt_text = _extract_text_from_content(_get_prompt_from_eval_case(eval_case))
 
-        if eval_case.responses is None:
-            raise ValueError(
-                f"No responses found for eval_case with ID {eval_case.eval_case_id}."
-            )
-        current_response_candidate = eval_case.responses[response_index]
-        if _extract_text_from_content(current_response_candidate.response) is None:
+        if prediction_text is None:
             raise ValueError(
                 f"Response text missing for candidate {response_index} in eval_case"
                 f" {eval_case.eval_case_id or 'Unknown ID'}."
@@ -375,7 +428,7 @@ def _build_request_payload(
                 "Reference text missing for eval_case"
                 f" {eval_case.eval_case_id or 'Unknown ID'}."
             )
-        if _extract_text_from_content(eval_case.prompt) is None:
+        if prompt_text is None:
             raise ValueError(
                 "Prompt text (source for translation) missing for eval_case"
                 f" {eval_case.eval_case_id or 'Unknown ID'}."
@@ -388,11 +441,9 @@ def _build_request_payload(
                 "target_language": target_language,
             },
             "instance": {
-                "prediction": _extract_text_from_content(
-                    current_response_candidate.response
-                ),
+                "prediction": prediction_text,
                 "reference": _extract_text_from_content(eval_case.reference.response),
-                "source": _extract_text_from_content(eval_case.prompt),
+                "source": prompt_text,
             },
         }
         return request_payload
@@ -528,10 +579,11 @@ def _build_rubric_based_input(
             rubrics_list = []
 
         parsed_rubrics = [types.evals.Rubric(**r) for r in rubrics_list]
+        extracted_prompt = _get_prompt_from_eval_case(eval_case)
         rubric_enhanced_contents = {
             "prompt": (
-                [eval_case.prompt.model_dump(mode="json", exclude_none=True)]
-                if eval_case.prompt
+                [extracted_prompt.model_dump(mode="json", exclude_none=True)]
+                if extracted_prompt
                 else None
             ),
             "response": [response_content.model_dump(mode="json", exclude_none=True)],
@@ -561,8 +613,9 @@ def _build_pointwise_input(
         self, eval_case: types.EvalCase, response_content: genai_types.Content
     ) -> dict[str, Any]:
         """Builds the payload for a standard pointwise LLM metric."""
+        extracted_prompt = _get_prompt_from_eval_case(eval_case)
         instance_data = {
-            "prompt": eval_case.prompt,
+            "prompt": extracted_prompt,
             "response": response_content,
         }
         template_obj = types.PromptTemplate(text=self.metric.prompt_template)
@@ -573,46 +626,8 @@ def _build_pointwise_input(
 
         content_map_values = {}
         for key, value in instance_data.items():
-            content_list_to_serialize = []
-            if isinstance(value, genai_types.Content):
-                content_list_to_serialize = [value]
-            elif isinstance(value, types.ResponseCandidate):
-                if value.response:  # pytype: disable=attribute-error
-                    content_list_to_serialize = [value.response]
-            elif isinstance(value, list) and value:
-                if isinstance(value[0], genai_types.Content):
-                    content_list_to_serialize = value
-                elif isinstance(value[0], types.evals.Message):
-                    history_texts = []
-                    for msg_obj in value:
-                        msg_text = _extract_text_from_content(msg_obj.content)
-                        if msg_text:
-                            role = msg_obj.content.role or msg_obj.author or "user"
-                            history_texts.append(f"{role}: {msg_text}")
-                    content_list_to_serialize = [
-                        genai_types.Content(
-                            parts=[genai_types.Part(text="\n".join(history_texts))]
-                        )
-                    ]
-                else:
-                    content_list_to_serialize = [
-                        genai_types.Content(
-                            parts=[genai_types.Part(text=json.dumps(value))]
-                        )
-                    ]
-            elif isinstance(value, dict):
-                content_list_to_serialize = [
-                    genai_types.Content(
-                        parts=[genai_types.Part(text=json.dumps(value))]
-                    )
-                ]
-            else:
-                content_list_to_serialize = [
-                    genai_types.Content(parts=[genai_types.Part(text=str(value))])
-                ]
-
             content_map_values[key] = types.ContentMapContents(
-                contents=content_list_to_serialize
+                contents=_value_to_content_list(value)
             )
 
         instance_payload = types.PointwiseMetricInstance(
@@ -638,15 +653,7 @@ def _build_pointwise_input(
 
     def _add_autorater_config(self, payload: dict[str, Any]) -> None:
         """Adds autorater config to the request payload if specified."""
-        autorater_config: dict[str, Any] = {}
-        if self.metric.judge_model:
-            autorater_config["autorater_model"] = self.metric.judge_model
-        if self.metric.judge_model_generation_config:
-            autorater_config["generation_config"] = (
-                self.metric.judge_model_generation_config
-            )
-        if self.metric.judge_model_sampling_count:
-            autorater_config["sampling_count"] = self.metric.judge_model_sampling_count
+        autorater_config = _get_autorater_config(self.metric)
 
         if not autorater_config:
             return
@@ -663,10 +670,10 @@ def _build_request_payload(
         self, eval_case: types.EvalCase, response_index: int
     ) -> dict[str, Any]:
         """Builds the request parameters for evaluate instances request."""
-        if not eval_case.responses or response_index >= len(eval_case.responses):
-            raise IndexError(f"response_index {response_index} is out of bounds.")
+        response_content = _get_response_from_eval_case(
+            eval_case, response_index, self.metric.name
+        )
 
-        response_content = eval_case.responses[response_index].response
         if not response_content:
             raise ValueError(
                 f"Response content missing for candidate {response_index}."
@@ -804,26 +811,36 @@ def get_metric_result(
             eval_case.model_dump(exclude_none=True),
         )
 
-        if response_index >= len(eval_case.responses):
+        try:
+            response_content = _get_response_from_eval_case(
+                eval_case, response_index, metric_name
+            )
+        except ValueError as e:
+            return types.EvalCaseMetricResult(
+                metric_name=metric_name,
+                error_message=str(e),
+            )
+
+        if not response_content:
             return types.EvalCaseMetricResult(
-                metric_name=self.metric.name,
+                metric_name=metric_name,
                 error_message=(
-                    f"response_index {response_index} out of bounds for EvalCase"
-                    f" {eval_case.eval_case_id or 'Unknown ID'}."
+                    f"No response found for candidate {response_index} in EvalCase"
+                    f" {eval_case.eval_case_id}."
                 ),
             )
 
-        if not eval_case.responses:
-            raise ValueError(f"EvalCase {eval_case.eval_case_id} has no responses.")
-
-        current_response_candidate = eval_case.responses[response_index]
-
         instance_for_custom_fn = eval_case.model_dump(
             exclude={"responses"}, mode="json", exclude_none=True
         )
-        instance_for_custom_fn["response"] = current_response_candidate.model_dump(
+        instance_for_custom_fn["response"] = response_content.model_dump(
             mode="json", exclude_none=True
-        ).get("response")
+        )
+        extracted_prompt = _get_prompt_from_eval_case(eval_case)
+        if extracted_prompt:
+            instance_for_custom_fn["prompt"] = extracted_prompt.model_dump(
+                mode="json", exclude_none=True
+            )
 
         error_msg = None
         score = None
@@ -906,60 +923,74 @@ def _content_to_instance_data(
     @staticmethod
     def _eval_case_to_agent_data(
         eval_case: types.EvalCase,
+        prompt_content: Optional[genai_types.Content] = None,
+        response_content: Optional[genai_types.Content] = None,
     ) -> Optional[types.evals.AgentData]:
-        """Converts an EvalCase object to an AgentData object."""
+        """Converts an EvalCase object to a single turn AgentData object."""
         if getattr(eval_case, "agent_data", None):
             return eval_case.agent_data
 
-        if not eval_case.agent_info and not eval_case.intermediate_events:
+        if (
+            not eval_case.agent_info
+            and not eval_case.intermediate_events
+            and not prompt_content
+            and not response_content
+        ):
             return None
-        tools = None
-        developer_instruction = None
-        agent_config = None
-        tool_declarations = []
-        event_contents = []
 
+        agents_map = None
         if eval_case.agent_info:
-            agent_info = eval_case.agent_info
-            if agent_info.instruction:
-                developer_instruction = types.evals.InstanceData(
-                    text=agent_info.instruction
-                )
-            if agent_info.tool_declarations:
-                tool_declarations = agent_info.tool_declarations
-            tools = types.evals.Tools(tool=tool_declarations)
-
-            if tools or developer_instruction:
-                agent_config = types.evals.AgentConfig(
-                    legacy_tools=tools,
-                    developer_instruction=developer_instruction,
+            agents_map = eval_case.agent_info.agents
+
+        events = []
+        if prompt_content:
+            events.append(
+                types.evals.AgentEvent(
+                    author="user",
+                    content=prompt_content,
                 )
+            )
 
         if eval_case.intermediate_events:
-            event_contents = [
-                event.content
-                for event in eval_case.intermediate_events
-                if event.content
+            for event in eval_case.intermediate_events:
+                events.append(
+                    types.evals.AgentEvent(
+                        author=event.author,
+                        content=event.content,
+                        event_time=event.creation_timestamp,
+                    )
+                )
+
+        if response_content:
+            events.append(
+                types.evals.AgentEvent(
+                    author="model",
+                    content=response_content,
+                )
+            )
+
+        turns = None
+        if events:
+            turns = [
+                types.evals.ConversationTurn(
+                    turn_index=0,
+                    turn_id="turn_0",
+                    events=events,
+                )
             ]
-        events = types.evals.Events(event=event_contents)
 
         return types.evals.AgentData(
-            agent_config=agent_config,
-            events=events,
+            agents=agents_map,
+            turns=turns,
         )
 
     def _build_request_payload(
         self, eval_case: types.EvalCase, response_index: int
     ) -> dict[str, Any]:
         """Builds the request parameters for evaluate instances request."""
-        if (
-            not eval_case.responses or response_index >= len(eval_case.responses)
-        ) and not getattr(eval_case, "agent_data", None):
-            raise IndexError(f"response_index {response_index} is out of bounds.")
-
-        response_content = None
-        if eval_case.responses and response_index < len(eval_case.responses):
-            response_content = eval_case.responses[response_index].response
+        response_content = _get_response_from_eval_case(
+            eval_case, response_index, self.metric.name
+        )
 
         if not response_content and not getattr(eval_case, "agent_data", None):
             raise ValueError(
@@ -980,21 +1011,22 @@ def _build_request_payload(
                 eval_case.reference.response
             )
 
+        extracted_prompt = _get_prompt_from_eval_case(eval_case)
         prompt_instance_data = None
         if self.metric.name is not None and self.metric.name.startswith("multi_turn"):
             prompt_contents = []
             if eval_case.conversation_history:
                 for message in eval_case.conversation_history:
                     prompt_contents.append(message.content)
-            if eval_case.prompt:
-                prompt_contents.append(eval_case.prompt)
+            if extracted_prompt:
+                prompt_contents.append(extracted_prompt)
 
             prompt_instance_data = types.evals.InstanceData(
                 contents=types.evals.InstanceDataContents(contents=prompt_contents)
             )
         else:
             prompt_instance_data = PredefinedMetricHandler._content_to_instance_data(
-                eval_case.prompt
+                extracted_prompt
             )
 
         other_data_map: dict[str, Any] = {}
@@ -1023,22 +1055,16 @@ def _build_request_payload(
                 if other_data_map
                 else None
             ),
-            agent_data=PredefinedMetricHandler._eval_case_to_agent_data(eval_case),
+            agent_data=PredefinedMetricHandler._eval_case_to_agent_data(
+                eval_case, extracted_prompt, response_content
+            ),
         )
 
         request_payload: dict[str, Any] = {
             "instance": instance_payload,
         }
 
-        autorater_config: dict[str, Any] = {}
-        if self.metric.judge_model:
-            autorater_config["autorater_model"] = self.metric.judge_model
-        if self.metric.judge_model_generation_config:
-            autorater_config["generation_config"] = (
-                self.metric.judge_model_generation_config
-            )
-        if self.metric.judge_model_sampling_count:
-            autorater_config["sampling_count"] = self.metric.judge_model_sampling_count
+        autorater_config = _get_autorater_config(self.metric)
         if autorater_config:
             request_payload["autorater_config"] = genai_types.AutoraterConfig(
                 **autorater_config
@@ -1155,10 +1181,10 @@ def _build_request_payload(
         self, eval_case: types.EvalCase, response_index: int
     ) -> dict[str, Any]:
         """Builds the request parameters for evaluate instances request."""
-        if not eval_case.responses or response_index >= len(eval_case.responses):
-            raise IndexError(f"response_index {response_index} is out of bounds.")
+        response_content = _get_response_from_eval_case(
+            eval_case, response_index, self.metric.name
+        )
 
-        response_content = eval_case.responses[response_index].response
         if not response_content:
             raise ValueError(
                 f"Response content missing for candidate {response_index}."
@@ -1170,8 +1196,9 @@ def _build_request_payload(
                 eval_case.reference.response
             )
 
+        extracted_prompt = _get_prompt_from_eval_case(eval_case)
         prompt_instance_data = PredefinedMetricHandler._content_to_instance_data(
-            eval_case.prompt
+            extracted_prompt
         )
 
         instance_payload = types.EvaluationInstance(
diff --git a/vertexai/_genai/types/evals.py b/vertexai/_genai/types/evals.py
index b95b4e320d..03271f755d 100644
--- a/vertexai/_genai/types/evals.py
+++ b/vertexai/_genai/types/evals.py
@@ -36,83 +36,6 @@ class Importance(_common.CaseInSensitiveEnum):
     """Low importance."""
 
 
-class Tools(_common.BaseModel):
-    """This field is experimental and will be removed in future versions.
-
-    Represents a list of tools for an agent.
-    """
-
-    tool: Optional[list[genai_types.Tool]] = Field(
-        default=None,
-        description="""List of tools: each tool can have multiple function declarations.""",
-    )
-
-
-class ToolsDict(TypedDict, total=False):
-    """This field is experimental and will be removed in future versions.
-
-    Represents a list of tools for an agent.
-    """
-
-    tool: Optional[list[genai_types.ToolDict]]
-    """List of tools: each tool can have multiple function declarations."""
-
-
-ToolsOrDict = Union[Tools, ToolsDict]
-
-
-class InstanceDataContents(_common.BaseModel):
-    """This field is experimental and will be removed in future versions.
-
-    List of standard Content messages from Gemini API.
-    """
-
-    contents: Optional[list[genai_types.Content]] = Field(
-        default=None, description="""Repeated contents."""
-    )
-
-
-class InstanceDataContentsDict(TypedDict, total=False):
-    """This field is experimental and will be removed in future versions.
-
-    List of standard Content messages from Gemini API.
-    """
-
-    contents: Optional[list[genai_types.ContentDict]]
-    """Repeated contents."""
-
-
-InstanceDataContentsOrDict = Union[InstanceDataContents, InstanceDataContentsDict]
-
-
-class InstanceData(_common.BaseModel):
-    """This field is experimental and will be removed in future versions.
-
-    Instance data used to populate placeholders in a metric prompt template.
-    """
-
-    text: Optional[str] = Field(default=None, description="""Text data.""")
-    contents: Optional[InstanceDataContents] = Field(
-        default=None, description="""List of Gemini content data."""
-    )
-
-
-class InstanceDataDict(TypedDict, total=False):
-    """This field is experimental and will be removed in future versions.
-
-    Instance data used to populate placeholders in a metric prompt template.
-    """
-
-    text: Optional[str]
-    """Text data."""
-
-    contents: Optional[InstanceDataContentsDict]
-    """List of Gemini content data."""
-
-
-InstanceDataOrDict = Union[InstanceData, InstanceDataDict]
-
-
 class AgentConfig(_common.BaseModel):
     """Represents configuration for an Agent."""
 
@@ -122,10 +45,6 @@ class AgentConfig(_common.BaseModel):
       This ID is used to refer to this agent, e.g., in AgentEvent.author, or in
       the `sub_agents` field. It must be unique within the `agents` map.""",
     )
-    agent_resource_name: Optional[str] = Field(
-        default=None,
-        description="""The Agent Engine resource name, formatted as `projects/{project}/locations/{location}/reasoningEngines/{reasoning_engine_id}`.""",
-    )
     agent_type: Optional[str] = Field(
         default=None,
         description="""The type or class of the agent (e.g., "LlmAgent", "RouterAgent",
@@ -152,24 +71,13 @@ class AgentConfig(_common.BaseModel):
         description="""The list of valid agent IDs that this agent can delegate to.
       This defines the directed edges in the multi-agent system graph topology.""",
     )
-    tools_text: Optional[str] = Field(
-        default=None,
-        description="""A JSON string containing a list of tools available to an agent.""",
-    )
-    legacy_tools: Optional[Tools] = Field(
-        default=None, description="""List of tools."""
-    )
-    developer_instruction: Optional[InstanceData] = Field(
-        default=None,
-        description="""A field containing instructions from the developer for the agent.""",
-    )
 
     @staticmethod
     def _get_tool_declarations_from_agent(agent: Any) -> genai_types.ToolListUnion:
         """Gets tool declarations from an agent.
 
         Args:
-          agent: The agent to get the tool declarations from. Data type is google.adk.agents.LLMAgent type, use Any to avoid dependency on ADK.
+          agent: The agent to get the tool declarations from; expected to be a google.adk.agents.LlmAgent.
 
         Returns:
           The tool declarations of the agent.
@@ -188,25 +96,26 @@ def _get_tool_declarations_from_agent(agent: Any) -> genai_types.ToolListUnion:
         return tool_declarations
 
     @classmethod
-    def from_agent(
-        cls, agent: Any, agent_resource_name: Optional[str] = None
-    ) -> "AgentConfig":
-        """Creates an AgentConfig from an ADK agent object.
+    def from_agent(cls, agent: Any) -> "AgentConfig":
+        """Creates an AgentConfig from an ADK agent.
 
         Args:
-          agent: The agent to get the agent info from, data type is google.adk.agents.LLMAgent type, use Any to avoid dependency on ADK.
-          agent_resource_name: Optional. The agent engine resource name.
+          agent: The agent to get the agent info from; expected to be a
+            google.adk.agents.LlmAgent. Sub-agents are recorded by name only.
 
         Returns:
-            An AgentConfig object populated with the agent's metadata.
+            An AgentConfig populated with the agent's metadata for evaluation.
         """
         return cls(  # pytype: disable=missing-parameter
-            agent_id=getattr(agent, "name", "agent_0") or "agent_0",
-            agent_resource_name=agent_resource_name,
+            agent_id=getattr(agent, "name", None),
             agent_type=agent.__class__.__name__,
             description=getattr(agent, "description", None),
             instruction=getattr(agent, "instruction", None),
             tools=AgentConfig._get_tool_declarations_from_agent(agent),
+            sub_agents=[
+                getattr(sub_agent, "name", None)
+                for sub_agent in getattr(agent, "sub_agents", [])
+            ],
         )
 
 
@@ -218,9 +127,6 @@ class AgentConfigDict(TypedDict, total=False):
       This ID is used to refer to this agent, e.g., in AgentEvent.author, or in
       the `sub_agents` field. It must be unique within the `agents` map."""
 
-    agent_resource_name: Optional[str]
-    """The Agent Engine resource name, formatted as `projects/{project}/locations/{location}/reasoningEngines/{reasoning_engine_id}`."""
-
     agent_type: Optional[str]
     """The type or class of the agent (e.g., "LlmAgent", "RouterAgent",
       "ToolUseAgent"). Useful for the autorater to understand the expected
@@ -243,15 +149,6 @@ class AgentConfigDict(TypedDict, total=False):
     """The list of valid agent IDs that this agent can delegate to.
       This defines the directed edges in the multi-agent system graph topology."""
 
-    tools_text: Optional[str]
-    """A JSON string containing a list of tools available to an agent."""
-
-    legacy_tools: Optional[ToolsDict]
-    """List of tools."""
-
-    developer_instruction: Optional[InstanceDataDict]
-    """A field containing instructions from the developer for the agent."""
-
 
 AgentConfigOrDict = Union[AgentConfig, AgentConfigDict]
 
@@ -339,30 +236,6 @@ class ConversationTurnDict(TypedDict, total=False):
 ConversationTurnOrDict = Union[ConversationTurn, ConversationTurnDict]
 
 
-class Events(_common.BaseModel):
-    """This field is experimental and will be removed in future versions.
-
-    Represents a list of events for an agent.
-    """
-
-    event: Optional[list[genai_types.Content]] = Field(
-        default=None, description="""A list of events."""
-    )
-
-
-class EventsDict(TypedDict, total=False):
-    """This field is experimental and will be removed in future versions.
-
-    Represents a list of events for an agent.
-    """
-
-    event: Optional[list[genai_types.ContentDict]]
-    """A list of events."""
-
-
-EventsOrDict = Union[Events, EventsDict]
-
-
 class AgentData(_common.BaseModel):
     """Represents data specific to multi-turn agent evaluations."""
 
@@ -378,20 +251,13 @@ class AgentData(_common.BaseModel):
       Each turn represents a logical execution cycle (e.g., User Input -> Agent
       Response).""",
     )
-    agent_config: Optional[AgentConfig] = Field(
-        default=None, description="""Agent configuration."""
-    )
-    events_text: Optional[str] = Field(
-        default=None, description="""A JSON string containing a sequence of events."""
-    )
-    events: Optional[Events] = Field(default=None, description="""A list of events.""")
 
     @classmethod
-    def _get_agents_map(cls, agent: Any) -> dict[str, AgentConfig]:
+    def get_agents_map(cls, agent: Any) -> dict[str, AgentConfig]:
         """Recursively gets all agent configs from an agent and its sub-agents.
 
         Args:
-          agent: The agent to get the agent info from.
+          agent: The agent to get the agent info from. Expected to be a google.adk.agents.LlmAgent; typed as Any to avoid a dependency on ADK.
 
         Returns:
           A dict mapping agent_id to AgentConfig.
@@ -401,7 +267,7 @@ def _get_agents_map(cls, agent: Any) -> dict[str, AgentConfig]:
         agents_map = {agent_id: agent_config}
 
         for sub_agent in getattr(agent, "sub_agents", []):
-            agents_map.update(cls._get_agents_map(sub_agent))
+            agents_map.update(cls.get_agents_map(sub_agent))
 
         return agents_map
 
@@ -419,7 +285,7 @@ def from_session(cls, agent: Any, session_history: list[Any]) -> "AgentData":
         Returns:
             An AgentData object containing the segmented history and agent config.
         """
-        agents_map = cls._get_agents_map(agent)
+        agents_map = cls.get_agents_map(agent)
         agent_id = getattr(agent, "name", "agent_0") or "agent_0"
 
         turns: list[ConversationTurn] = []
@@ -510,21 +376,12 @@ class AgentDataDict(TypedDict, total=False):
       Each turn represents a logical execution cycle (e.g., User Input -> Agent
       Response)."""
 
-    agent_config: Optional[AgentConfigDict]
-    """Agent configuration."""
-
-    events_text: Optional[str]
-    """A JSON string containing a sequence of events."""
-
-    events: Optional[EventsDict]
-    """A list of events."""
-
 
 AgentDataOrDict = Union[AgentData, AgentDataDict]
 
 
 class AgentInfo(_common.BaseModel):
-    """The agent info of an agent, used for agent eval."""
+    """The agent info of an agent system, used for agent evaluation."""
 
     agent_resource_name: Optional[str] = Field(
         default=None,
@@ -532,53 +389,30 @@ class AgentInfo(_common.BaseModel):
             `projects/{project}/locations/{location}/reasoningEngines/{reasoning_engine_id}`.""",
     )
     name: Optional[str] = Field(
-        default=None, description="""Agent name, used as an identifier."""
+        default=None, description="""Agent candidate name, used as an identifier."""
     )
-    instruction: Optional[str] = Field(
-        default=None, description="""Agent developer instruction."""
-    )
-    description: Optional[str] = Field(
-        default=None, description="""Agent description."""
+    agents: Optional[dict[str, AgentConfig]] = Field(
+        default=None,
+        description="""A map containing the static configurations for each agent in the system.
+      Key: agent_id (matches the `author` field in events).
+      Value: The static configuration of the agent.""",
     )
-    tool_declarations: Optional[genai_types.ToolListUnion] = Field(
-        default=None, description="""List of tools used by the Agent."""
+    root_agent_id: Optional[str] = Field(
+        default=None, description="""The agent ID of the root agent."""
     )
 
-    @staticmethod
-    def _get_tool_declarations_from_agent(agent: Any) -> genai_types.ToolListUnion:
-        """Gets tool declarations from an agent.
-
-        Args:
-          agent: The agent to get the tool declarations from. Data type is google.adk.agents.LLMAgent type, use Any to avoid dependency on ADK.
-
-        Returns:
-          The tool declarations of the agent.
-        """
-        tool_declarations: genai_types.ToolListUnion = []
-        for tool in agent.tools:
-            tool_declarations.append(
-                {
-                    "function_declarations": [
-                        genai_types.FunctionDeclaration.from_callable_with_api_option(
-                            callable=tool
-                        )
-                    ]
-                }
-            )
-        return tool_declarations
-
     @classmethod
     def load_from_agent(
         cls, agent: Any, agent_resource_name: Optional[str] = None
     ) -> "AgentInfo":
-        """Loads agent info from an agent.
+        """Loads agent info from an ADK agent.
 
         Args:
-          agent: The agent to get the agent info from, data type is google.adk.agents.LLMAgent type, use Any to avoid dependency on ADK.
-          agent_resource_name: Optional. The agent engine resource name.
+          agent: The agent to get the agent info from. Expected to be a google.adk.agents.LlmAgent; typed as Any to avoid a dependency on ADK.
+          agent_resource_name: Optional. The agent engine resource name for the deployed agent.
 
         Returns:
-          The agent info of the agent.
+          The agent info of the agent system.
 
         Example:
         ```
@@ -595,30 +429,27 @@ def load_from_agent(
         return cls(  # pytype: disable=missing-parameter
             name=agent.name,
             agent_resource_name=agent_resource_name,
-            instruction=agent.instruction,
-            description=agent.description,
-            tool_declarations=AgentInfo._get_tool_declarations_from_agent(agent),
+            agents=AgentData.get_agents_map(agent),
         )
 
 
 class AgentInfoDict(TypedDict, total=False):
-    """The agent info of an agent, used for agent eval."""
+    """The agent info of an agent system, used for agent evaluation."""
 
     agent_resource_name: Optional[str]
     """The agent engine used to run agent. Agent engine resource name in str type, with format
             `projects/{project}/locations/{location}/reasoningEngines/{reasoning_engine_id}`."""
 
     name: Optional[str]
-    """Agent name, used as an identifier."""
-
-    instruction: Optional[str]
-    """Agent developer instruction."""
+    """Agent candidate name, used as an identifier."""
 
-    description: Optional[str]
-    """Agent description."""
+    agents: Optional[dict[str, AgentConfigDict]]
+    """A map containing the static configurations for each agent in the system.
+      Key: agent_id (matches the `author` field in events).
+      Value: The static configuration of the agent."""
 
-    tool_declarations: Optional[genai_types.ToolListUnionDict]
-    """List of tools used by the Agent."""
+    root_agent_id: Optional[str]
+    """The agent ID of the root agent."""
 
 
 AgentInfoOrDict = Union[AgentInfo, AgentInfoDict]
@@ -854,6 +685,107 @@ class MessageDict(TypedDict, total=False):
 MessageOrDict = Union[Message, MessageDict]
 
 
+class Events(_common.BaseModel):
+    """This field is experimental and will be removed in future versions.
+
+    Represents a list of events for an agent.
+    """
+
+    event: Optional[list[genai_types.Content]] = Field(
+        default=None, description="""A list of events."""
+    )
+
+
+class EventsDict(TypedDict, total=False):
+    """This field is experimental and will be removed in future versions.
+
+    Represents a list of events for an agent.
+    """
+
+    event: Optional[list[genai_types.ContentDict]]
+    """A list of events."""
+
+
+EventsOrDict = Union[Events, EventsDict]
+
+
+class InstanceDataContents(_common.BaseModel):
+    """This field is experimental and will be removed in future versions.
+
+    List of standard Content messages from Gemini API.
+    """
+
+    contents: Optional[list[genai_types.Content]] = Field(
+        default=None, description="""Repeated contents."""
+    )
+
+
+class InstanceDataContentsDict(TypedDict, total=False):
+    """This field is experimental and will be removed in future versions.
+
+    List of standard Content messages from Gemini API.
+    """
+
+    contents: Optional[list[genai_types.ContentDict]]
+    """Repeated contents."""
+
+
+InstanceDataContentsOrDict = Union[InstanceDataContents, InstanceDataContentsDict]
+
+
+class InstanceData(_common.BaseModel):
+    """This field is experimental and will be removed in future versions.
+
+    Instance data used to populate placeholders in a metric prompt template.
+    """
+
+    text: Optional[str] = Field(default=None, description="""Text data.""")
+    contents: Optional[InstanceDataContents] = Field(
+        default=None, description="""List of Gemini content data."""
+    )
+
+
+class InstanceDataDict(TypedDict, total=False):
+    """This field is experimental and will be removed in future versions.
+
+    Instance data used to populate placeholders in a metric prompt template.
+    """
+
+    text: Optional[str]
+    """Text data."""
+
+    contents: Optional[InstanceDataContentsDict]
+    """List of Gemini content data."""
+
+
+InstanceDataOrDict = Union[InstanceData, InstanceDataDict]
+
+
+class Tools(_common.BaseModel):
+    """This field is experimental and will be removed in future versions.
+
+    Represents a list of tools for an agent.
+    """
+
+    tool: Optional[list[genai_types.Tool]] = Field(
+        default=None,
+        description="""List of tools: each tool can have multiple function declarations.""",
+    )
+
+
+class ToolsDict(TypedDict, total=False):
+    """This field is experimental and will be removed in future versions.
+
+    Represents a list of tools for an agent.
+    """
+
+    tool: Optional[list[genai_types.ToolDict]]
+    """List of tools: each tool can have multiple function declarations."""
+
+
+ToolsOrDict = Union[Tools, ToolsDict]
+
+
 class RubricContentProperty(_common.BaseModel):
     """Defines criteria based on a specific property."""