Skip to content

Commit ea664d8

Browse files
vertex-sdk-bot authored and copybara-github committed
feat: Refactor evaluation instance building and update LLM metric handler
PiperOrigin-RevId: 888230786
1 parent c4beca0 commit ea664d8

File tree

4 files changed

+512
-534
lines changed

4 files changed

+512
-534
lines changed

tests/unit/vertexai/genai/replays/test_evaluate.py

Lines changed: 72 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -357,22 +357,88 @@ def test_evaluation_metric_resource_name(client):
357357
"""Tests with a metric resource name in types.Metric."""
358358
client._api_client._http_options.api_version = "v1beta1"
359359
client._api_client._http_options.base_url = (
360-
"https://us-central1-staging-aiplatform.sandbox.googleapis.com/"
360+
"https://us-central1-autopush-aiplatform.sandbox.googleapis.com/"
361+
)
362+
tone_check_metric = types.LLMMetric(
363+
name="tone_check",
364+
prompt_template="""
365+
# Instruction
366+
You are a professional writing evaluator. Your job is to score writing responses according to pre-defined evaluation criteria.
367+
368+
# Criteria
369+
Analyze the tone of the response based on these two criteria:
370+
1. Professionalism: The response should use appropriate language and maintain a business-like demeanor.
371+
2. Empathy: The response should acknowledge the user's feelings and show understanding.
372+
373+
# Input
374+
Prompt: {agent_data.turns[0].events[0]}
375+
Response: {agent_data.turns[0].events[1]}
376+
377+
# Output Format
378+
Respond in a JSON format with the following schema:
379+
{
380+
"type": "OBJECT",
381+
"properties": {
382+
"score": {"type": "NUMBER"},
383+
"explanation": {"type": "STRING"},
384+
},
385+
"required": ["score", "explanation"],
386+
}
387+
Return the JSON format output in a string representation of a Python dictionary directly, without strings like '```json' or '```'.
388+
389+
The output would include the following fields:
390+
score: based on your evaluation, the score should be a number based on the rating rubrics.
391+
explanation: your explanation for the score rating, in one line.
392+
393+
## Example Output Format:
394+
{"score" : -1, "explanation": "Here is the reason that the response is given a score of -1 based on the rating rubric."}
395+
{"score" : 3, "explanation": "Here is the reason that the response is given a score of 3 based on the rating rubric."}
396+
{"score" : 0, "explanation": "Here is the reason that the response is given a score of 0 based on the rating rubric."}
397+
{"score" : 5, "explanation": "Here is the reason that the response is given a score of 5 based on the rating rubric."}
398+
""",
361399
)
362400
metric_resource_name = client.evals.create_evaluation_metric(
363-
display_name="test_metric",
364-
description="test_description",
365-
metric=types.RubricMetric.GENERAL_QUALITY,
401+
metric=tone_check_metric,
366402
)
367403
assert isinstance(metric_resource_name, str)
368404
assert re.match(
369405
r"^projects/[^/]+/locations/[^/]+/evaluationMetrics/[^/]+$",
370406
metric_resource_name,
371407
)
408+
agent_data = types.evals.AgentData(
409+
turns=[
410+
types.evals.ConversationTurn(
411+
turn_index=0,
412+
events=[
413+
types.evals.AgentEvent(
414+
author="user",
415+
content=genai_types.Content(
416+
role="user",
417+
parts=[
418+
genai_types.Part(
419+
text=("Write a simple story about a dinosaur")
420+
)
421+
],
422+
),
423+
),
424+
types.evals.AgentEvent(
425+
author="model",
426+
content=genai_types.Content(
427+
role="model",
428+
parts=[
429+
genai_types.Part(
430+
text="Once upon a time, there was a T-Rex named Rexy."
431+
)
432+
],
433+
),
434+
),
435+
],
436+
),
437+
],
438+
)
372439
byor_df = pd.DataFrame(
373440
{
374-
"prompt": ["Write a simple story about a dinosaur"],
375-
"response": ["Once upon a time, there was a T-Rex named Rexy."],
441+
"agent_data": [agent_data],
376442
}
377443
)
378444
metric = types.Metric(

0 commit comments

Comments
 (0)