Skip to content

Commit 565482a

Browse files
jsondai authored and copybara-github committed
feat: GenAI Client(evals) - convert raw Agent Session history into the new AgentData structure
PiperOrigin-RevId: 869945268
1 parent 89d5723 commit 565482a

6 files changed

Lines changed: 547 additions & 159 deletions

File tree

tests/unit/vertexai/genai/replays/test_evaluate_instances.py

Lines changed: 31 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -81,13 +81,19 @@ def test_rouge_metric(client):
8181

8282
def test_pointwise_metric(client):
8383
"""Tests the _evaluate_instances method with PointwiseMetricInput."""
84-
instance_dict = {"prompt": "What is the capital of France?", "response": "Paris"}
84+
instance_dict = {
85+
"prompt": "What is the capital of France?",
86+
"response": "Paris",
87+
}
8588
json_instance = json.dumps(instance_dict)
8689

8790
test_input = types.PointwiseMetricInput(
8891
instance=types.PointwiseMetricInstance(json_instance=json_instance),
8992
metric_spec=genai_types.PointwiseMetricSpec(
90-
metric_prompt_template="Evaluate if the response '{response}' correctly answers the prompt '{prompt}'."
93+
metric_prompt_template=(
94+
"Evaluate if the response '{response}' correctly answers the"
95+
" prompt '{prompt}'."
96+
)
9197
),
9298
)
9399
response = client.evals.evaluate_instances(
@@ -101,19 +107,20 @@ def test_pointwise_metric(client):
101107

102108
def test_pointwise_metric_with_agent_data(client):
103109
"""Tests the _evaluate_instances method with PointwiseMetricInput and agent_data."""
104-
instance_dict = {"prompt": "What is the capital of France?", "response": "Paris"}
110+
instance_dict = {
111+
"prompt": "What is the capital of France?",
112+
"response": "Paris",
113+
}
105114
json_instance = json.dumps(instance_dict)
106115
agent_data = types.evals.AgentData(
107116
agent_config=types.evals.AgentConfig(
108-
tools=types.evals.Tools(
109-
tool=[
110-
genai_types.Tool(
111-
function_declarations=[
112-
genai_types.FunctionDeclaration(name="search")
113-
]
114-
)
115-
]
116-
),
117+
tools=[
118+
genai_types.Tool(
119+
function_declarations=[
120+
genai_types.FunctionDeclaration(name="search")
121+
]
122+
)
123+
],
117124
developer_instruction=types.evals.InstanceData(text="instruction"),
118125
),
119126
events=types.evals.Events(
@@ -129,7 +136,10 @@ def test_pointwise_metric_with_agent_data(client):
129136
test_input = types.PointwiseMetricInput(
130137
instance=types.PointwiseMetricInstance(json_instance=json_instance),
131138
metric_spec=genai_types.PointwiseMetricSpec(
132-
metric_prompt_template="Evaluate if the response '{response}' correctly answers the prompt '{prompt}'."
139+
metric_prompt_template=(
140+
"Evaluate if the response '{response}' correctly answers the"
141+
" prompt '{prompt}'."
142+
)
133143
),
134144
)
135145
response = client.evals.evaluate_instances(
@@ -189,7 +199,10 @@ def test_pairwise_metric_with_autorater(client):
189199
test_input = types.PairwiseMetricInput(
190200
instance=types.PairwiseMetricInstance(json_instance=json_instance),
191201
metric_spec=genai_types.PairwiseMetricSpec(
192-
metric_prompt_template="Which response is a better summary? Baseline: '{baseline_response}' or Candidate: '{candidate_response}'"
202+
metric_prompt_template=(
203+
"Which response is a better summary? Baseline:"
204+
" '{baseline_response}' or Candidate: '{candidate_response}'"
205+
)
193206
),
194207
)
195208
autorater_config = genai_types.AutoraterConfig(sampling_count=2)
@@ -240,7 +253,10 @@ def test_inference_with_prompt_template(client):
240253

241254
def test_run_inference_with_agent(client):
242255
test_df = pd.DataFrame(
243-
{"prompt": ["agent prompt"], "session_inputs": ['{"user_id": "user_123"}']}
256+
{
257+
"prompt": ["agent prompt"],
258+
"session_inputs": ['{"user_id": "user_123"}'],
259+
}
244260
)
245261
inference_result = client.evals.run_inference(
246262
agent="projects/977012026409/locations/us-central1/reasoningEngines/7188347537655332864",

tests/unit/vertexai/genai/test_evals.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4056,7 +4056,7 @@ def test_eval_case_to_agent_data(self):
40564056
)
40574057

40584058
assert agent_data.agent_config.developer_instruction.text == "instruction1"
4059-
assert agent_data.agent_config.tools.tool == [tool]
4059+
assert agent_data.agent_config.legacy_tools.tool == [tool]
40604060
assert agent_data.events.event[0].parts[0].text == "intermediate event"
40614061

40624062
def test_eval_case_to_agent_data_events_only(self):
@@ -4164,7 +4164,7 @@ def test_eval_case_to_agent_data_agent_info_empty_tools(self):
41644164
)
41654165

41664166
assert agent_data.agent_config.developer_instruction.text == "instruction1"
4167-
assert not agent_data.agent_config.tools.tool
4167+
assert not agent_data.agent_config.legacy_tools.tool
41684168

41694169
def test_eval_case_to_agent_data_agent_info_empty(self):
41704170
intermediate_events = [

vertexai/_genai/_evals_common.py

Lines changed: 49 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1299,21 +1299,36 @@ def _run_agent_internal(
12991299
agent=agent,
13001300
prompt_dataset=prompt_dataset,
13011301
)
1302+
1303+
agent_obj = agent_engine if agent_engine else agent
1304+
13021305
processed_intermediate_events = []
13031306
processed_responses = []
1304-
for resp_item in raw_responses:
1307+
processed_agent_data = [] # New column for AgentData
1308+
1309+
for i, resp_item in enumerate(raw_responses):
13051310
intermediate_events_row: list[dict[str, Any]] = []
13061311
response_row = None
1312+
1313+
# --- Legacy Logic: Intermediate Events & Response ---
13071314
if isinstance(resp_item, list):
13081315
try:
1309-
response_row = resp_item[-1]["content"]["parts"][0]["text"]
1316+
# Attempt to extract final response text
1317+
if resp_item and "content" in resp_item[-1]:
1318+
# Basic extraction, assumes last message is model response
1319+
final_content = resp_item[-1]["content"]
1320+
if isinstance(final_content, dict) and "parts" in final_content:
1321+
response_row = final_content["parts"][0].get("text", "")
1322+
elif hasattr(final_content, "parts"):
1323+
response_row = final_content.parts[0].text
1324+
13101325
for intermediate_event in resp_item[:-1]:
13111326
intermediate_events_row.append(
13121327
{
1313-
"event_id": intermediate_event["id"],
1314-
"content": intermediate_event["content"],
1315-
"creation_timestamp": intermediate_event["timestamp"],
1316-
"author": intermediate_event["author"],
1328+
"event_id": intermediate_event.get("id"),
1329+
"content": intermediate_event.get("content"),
1330+
"creation_timestamp": intermediate_event.get("timestamp"),
1331+
"author": intermediate_event.get("author"),
13171332
}
13181333
)
13191334
except Exception as e: # pylint: disable=broad-exception-caught
@@ -1335,6 +1350,33 @@ def _run_agent_internal(
13351350
processed_intermediate_events.append(intermediate_events_row)
13361351
processed_responses.append(response_row)
13371352

1353+
# --- New Logic: AgentData ---
1354+
agent_data_obj = None
1355+
try:
1356+
# 1. Get User Prompt for the current row
1357+
primary_prompt_column = (
1358+
"request" if "request" in prompt_dataset.columns else "prompt"
1359+
)
1360+
user_prompt_val = prompt_dataset.iloc[i][primary_prompt_column]
1361+
1362+
# 2. Construct Full Session History (User Prompt + Agent Events)
1363+
# Normalize user prompt into a message dict structure
1364+
user_event = {"role": "user", "content": user_prompt_val}
1365+
1366+
full_session_history = [user_event]
1367+
if isinstance(resp_item, list):
1368+
full_session_history.extend(resp_item)
1369+
1370+
# 3. Create AgentData using the new factory method
1371+
agent_data_obj = types.evals.AgentData.from_session(
1372+
agent_obj, full_session_history
1373+
)
1374+
except Exception as e:
1375+
logger.warning("Failed to adapt AgentData for row %d: %s", i, e)
1376+
# Proceed without AgentData; backend will fallback to legacy fields
1377+
1378+
processed_agent_data.append(agent_data_obj)
1379+
13381380
if len(processed_responses) != len(prompt_dataset) or len(
13391381
processed_responses
13401382
) != len(processed_intermediate_events):
@@ -1353,6 +1395,7 @@ def _run_agent_internal(
13531395
{
13541396
_evals_constant.INTERMEDIATE_EVENTS: processed_intermediate_events,
13551397
_evals_constant.RESPONSE: processed_responses,
1398+
"agent_data": processed_agent_data, # Populate agent_data
13561399
}
13571400
)
13581401

vertexai/_genai/_evals_metric_handlers.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -879,6 +879,11 @@ def _eval_case_to_agent_data(
879879
eval_case: types.EvalCase,
880880
) -> Optional[types.evals.AgentData]:
881881
"""Converts an EvalCase object to an AgentData object."""
882+
# --- NEW LOGIC: Use the structured agent_data if present ---
883+
if getattr(eval_case, "agent_data", None):
884+
return eval_case.agent_data
885+
886+
# --- LEGACY LOGIC: Fallback for older dataframes ---
882887
if not eval_case.agent_info and not eval_case.intermediate_events:
883888
return None
884889
tools = None
@@ -899,7 +904,7 @@ def _eval_case_to_agent_data(
899904

900905
if tools or developer_instruction:
901906
agent_config = types.evals.AgentConfig(
902-
tools=tools,
907+
legacy_tools=tools,
903908
developer_instruction=developer_instruction,
904909
)
905910

vertexai/_genai/types/common.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1492,6 +1492,10 @@ class EvalCase(_common.BaseModel):
14921492
default=None,
14931493
description="""This field is experimental and may change in future versions. The agent info of the agent under evaluation. This can be extended for multi-agent evaluation.""",
14941494
)
1495+
agent_data: Optional[evals_types.AgentData] = Field(
1496+
default=None,
1497+
description="""This field is experimental and may change in future versions. The agent data of the agent under evaluation.""",
1498+
)
14951499
# Allow extra fields to support custom metric prompts and stay backward compatible.
14961500
model_config = ConfigDict(frozen=True, extra="allow")
14971501

@@ -1526,6 +1530,9 @@ class EvalCaseDict(TypedDict, total=False):
15261530
agent_info: Optional[evals_types.AgentInfo]
15271531
"""This field is experimental and may change in future versions. The agent info of the agent under evaluation. This can be extended for multi-agent evaluation."""
15281532

1533+
agent_data: Optional[evals_types.AgentData]
1534+
"""This field is experimental and may change in future versions. The agent data of the agent under evaluation."""
1535+
15291536

15301537
EvalCaseOrDict = Union[EvalCase, EvalCaseDict]
15311538

0 commit comments

Comments (0)