From 15c38a270b0619aa153a7989055da31b622d5e50 Mon Sep 17 00:00:00 2001 From: Waqas Javed <7674577+w-javed@users.noreply.github.com> Date: Wed, 1 Apr 2026 23:25:02 -0700 Subject: [PATCH 1/5] fix(samples): restructure friendly evaluator output with properties Update the FriendlyEvaluator sample to return the new standard output format with score, label, reason, threshold, and passed at the top level. Extra evaluator output fields (explanation, tone, confidence) are nested under a properties dict. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../friendly_evaluator/common_util/util.py | 34 +++++++++++++------ .../friendly_evaluator/friendly_evaluator.py | 2 +- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/common_util/util.py b/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/common_util/util.py index aa137276e55c..026e0352f51f 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/common_util/util.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/common_util/util.py @@ -12,12 +12,11 @@ You MUST respond in the following JSON format only: { "score": , - "label": "", "reason": "", - "explanation": "" + "explanation": "", + "tone": "", + "confidence": "" } - -A score of 3 or above is considered "Pass", below 3 is "Fail". """ @@ -44,12 +43,19 @@ def build_evaluation_messages(query: str, response: str) -> list: def parse_evaluation_result(raw_result: str, threshold: int = 3) -> dict: """Parse the LLM's JSON response into a structured evaluation result. + The return dict has the standard top-level keys (score, label, reason, + threshold, passed) and a ``properties`` dict for any extra output fields + the evaluator wants to surface. + :param raw_result: The raw string output from the LLM. 
:param threshold: The minimum score to be considered "Pass". - :return: A dict with score, label, reason, and explanation. + :return: A dict with score, label, reason, threshold, passed, and properties. """ import json + # Keys that are promoted to the top level of the result + top_level_keys = {"score", "label", "reason"} + try: # Try to extract JSON from the response (handle markdown code blocks) text = raw_result.strip() @@ -57,17 +63,25 @@ def parse_evaluation_result(raw_result: str, threshold: int = 3) -> dict: text = text.split("\n", 1)[1] if "\n" in text else text[3:] text = text.rsplit("```", 1)[0] result = json.loads(text.strip()) - score = int(result.get("score", threshold)) + score = max(1, min(5, int(result.get("score", threshold)))) + passed = score >= threshold + + # Collect any extra fields returned by the LLM into properties + properties = {k: v for k, v in result.items() if k not in top_level_keys} + return { - "score": max(1, min(5, score)), - "label": result.get("label", "Pass" if score >= threshold else "Fail"), + "score": score, + "label": "Pass" if passed else "Fail", "reason": result.get("reason", "No reason provided"), - "explanation": result.get("explanation", "No explanation provided"), + "threshold": threshold, + "passed": passed, + "properties": properties, } except (json.JSONDecodeError, ValueError, KeyError): return { "score": threshold, "label": "Pass", "reason": "Could not parse LLM response", - "explanation": f"Raw LLM output: {raw_result}", + "threshold": threshold, + "passed": True, } diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/friendly_evaluator.py b/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/friendly_evaluator.py index 730237af61f5..eaf173d1cde1 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/friendly_evaluator.py +++ 
b/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/friendly_evaluator.py @@ -49,7 +49,7 @@ def __call__(self, *, query: str, response: str, **kwargs) -> dict: :param query: The original user query. :param response: The response to evaluate. - :return: A dict with score, label, reason, and explanation. + :return: A dict with score, label, reason, threshold, passed, and properties. """ messages = build_evaluation_messages(query, response) From aca5524b02004d650f08197d46e07a6592fefcb5 Mon Sep 17 00:00:00 2001 From: Waqas Javed <7674577+w-javed@users.noreply.github.com> Date: Thu, 2 Apr 2026 17:52:47 -0700 Subject: [PATCH 2/5] refactor: switch FriendlyEvaluator to OpenAI Responses API - Use 'from openai import OpenAI' instead of AzureOpenAI - Accept api_key and model params instead of model_config dict - Use client.responses.create() instead of chat.completions.create() - Update util.py: split build_evaluation_messages into build_evaluation_instructions() and build_evaluation_input() - Update sample init_parameters schema accordingly Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../friendly_evaluator/common_util/util.py | 34 ++++++----- .../friendly_evaluator/friendly_evaluator.py | 58 ++++++------------- .../sample_eval_upload_friendly_evaluator.py | 20 +++---- 3 files changed, 44 insertions(+), 68 deletions(-) diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/common_util/util.py b/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/common_util/util.py index 026e0352f51f..188a80acd82b 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/common_util/util.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/common_util/util.py @@ -20,24 +20,26 @@ """ -def build_evaluation_messages(query: str, response: str) -> list: - """Build the messages list for the LLM 
evaluation call. +def build_evaluation_instructions() -> str: + """Return the system instructions for the LLM evaluation call. + + :return: The system prompt string for the Responses API. + """ + return FRIENDLINESS_SYSTEM_PROMPT + + +def build_evaluation_input(query: str, response: str) -> str: + """Build the user input for the LLM evaluation call. :param query: The original user query. :param response: The response to evaluate for friendliness. - :return: A list of message dicts for the chat completion API. + :return: A string prompt for the Responses API. """ - return [ - {"role": "system", "content": FRIENDLINESS_SYSTEM_PROMPT}, - { - "role": "user", - "content": ( - f"Please evaluate the friendliness of the following response.\n\n" - f"Original query: {query}\n\n" - f"Response to evaluate: {response}" - ), - }, - ] + return ( + f"Please evaluate the friendliness of the following response.\n\n" + f"Original query: {query}\n\n" + f"Response to evaluate: {response}" + ) def parse_evaluation_result(raw_result: str, threshold: int = 3) -> dict: @@ -80,8 +82,8 @@ def parse_evaluation_result(raw_result: str, threshold: int = 3) -> dict: except (json.JSONDecodeError, ValueError, KeyError): return { "score": threshold, - "label": "Pass", + "label": "Fail", "reason": "Could not parse LLM response", "threshold": threshold, - "passed": True, + "passed": False, } diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/friendly_evaluator.py b/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/friendly_evaluator.py index eaf173d1cde1..bbaa0c52910b 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/friendly_evaluator.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/friendly_evaluator.py @@ -1,48 +1,25 @@ """Custom evaluator that uses an LLM to assess the friendliness of a response.""" -from openai import AzureOpenAI -from 
common_util.util import build_evaluation_messages, parse_evaluation_result +from openai import OpenAI +from common_util.util import build_evaluation_instructions, build_evaluation_input, parse_evaluation_result class FriendlyEvaluator: """Evaluates how friendly and approachable a response is using an LLM judge. - This evaluator sends the query and response to an LLM, which returns a - friendliness score (1-5), a pass/fail label, a reason, and a detailed explanation. + This evaluator sends the query and response to an LLM via the OpenAI Responses + API, which returns a friendliness score (1-5), a pass/fail label, a reason, + and a detailed explanation. - :param model_config: A dict containing Azure OpenAI connection info. Expected keys: - - azure_endpoint: The Azure OpenAI endpoint URL. - - azure_deployment: The deployment/model name. - - api_version: The API version (default: "2024-06-01"). - - api_key: (Optional) The API key. If not provided, DefaultAzureCredential is used. + :param api_key: The OpenAI API key. + :param model: The model to use for evaluation (e.g. "gpt-4o"). :param threshold: The minimum score (1-5) to be considered "Pass" (default: 3). 
""" - def __init__(self, *, model_config: dict, threshold: int = 3, **kwargs): - self.model_config = model_config + def __init__(self, *, api_key: str, model: str, threshold: int = 3, **kwargs): + self.client = OpenAI(api_key=api_key) + self.model = model self.threshold = threshold - api_key = model_config.get("api_key") - - if api_key: - self.client = AzureOpenAI( - azure_endpoint=model_config["azure_endpoint"], - api_key=api_key, - api_version=model_config.get("api_version", "2024-06-01"), - ) - else: - from azure.identity import DefaultAzureCredential, get_bearer_token_provider - - token_provider = get_bearer_token_provider( - DefaultAzureCredential(), - "https://cognitiveservices.azure.com/.default", - ) - self.client = AzureOpenAI( - azure_endpoint=model_config["azure_endpoint"], - azure_ad_token_provider=token_provider, - api_version=model_config.get("api_version", "2024-06-01"), - ) - - self.deployment = model_config["azure_deployment"] def __call__(self, *, query: str, response: str, **kwargs) -> dict: """Evaluate the friendliness of a response. @@ -51,16 +28,15 @@ def __call__(self, *, query: str, response: str, **kwargs) -> dict: :param response: The response to evaluate. :return: A dict with score, label, reason, threshold, passed, and properties. 
""" - messages = build_evaluation_messages(query, response) - - completion = self.client.chat.completions.create( - model=self.deployment, - messages=messages, + result = self.client.responses.create( + model=self.model, + instructions=build_evaluation_instructions(), + input=build_evaluation_input(query, response), temperature=0.0, - max_tokens=500, + max_output_tokens=500, ) - raw_result = completion.choices[0].message.content + raw_result = result.output_text if raw_result is None: - raise ValueError("No content in completion response") + raise ValueError("No content in response") return parse_evaluation_result(raw_result, self.threshold) diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_upload_friendly_evaluator.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_upload_friendly_evaluator.py index 67e168d3509a..144275f491d2 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_upload_friendly_evaluator.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_upload_friendly_evaluator.py @@ -90,19 +90,17 @@ init_parameters={ "type": "object", "properties": { - "model_config": { - "type": "object", - "description": "Azure OpenAI configuration for the LLM judge", - "properties": { - "azure_endpoint": {"type": "string"}, - "api_version": {"type": "string"}, - "api_key": {"type": "string"}, - }, - "required": ["azure_endpoint", "api_key"], + "api_key": { + "type": "string", + "description": "OpenAI API key for the LLM judge", + }, + "model": { + "type": "string", + "description": "Model name to use for evaluation (e.g. 
gpt-4o)", }, "threshold": {"type": "number"}, }, - "required": ["model_config", "threshold"], + "required": ["api_key", "model", "threshold"], }, data_schema={ "type": "object", @@ -158,7 +156,7 @@ "name": evaluator_name, "evaluator_name": evaluator_name, "initialization_parameters": { - "deployment_name": f"{model_deployment_name}", # provide model_config or, deployment name passed is used to construct the model_config for the evaluator. + "deployment_name": f"{model_deployment_name}", # service converts deployment_name to api_key/model for the evaluator "threshold": 3, }, } From ccd1529dc2eb76eb89079c576814ed95130b95f6 Mon Sep 17 00:00:00 2001 From: Waqas Javed <7674577+w-javed@users.noreply.github.com> Date: Thu, 2 Apr 2026 17:58:00 -0700 Subject: [PATCH 3/5] docs: add required/optional field comments per PR review Address aprilk-ms review: annotate which fields in the evaluation result dict are required vs optional for the evaluation service. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../custom_evaluators/friendly_evaluator/common_util/util.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/common_util/util.py b/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/common_util/util.py index 188a80acd82b..07aca09f6a14 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/common_util/util.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/common_util/util.py @@ -72,12 +72,14 @@ def parse_evaluation_result(raw_result: str, threshold: int = 3) -> dict: properties = {k: v for k, v in result.items() if k not in top_level_keys} return { + # --- Required fields (must be present for the evaluation service) --- "score": score, "label": "Pass" if passed else "Fail", "reason": result.get("reason", "No reason provided"), + # --- 
Optional fields --- "threshold": threshold, "passed": passed, - "properties": properties, + "properties": properties, # extra metadata surfaced in the evaluation results } except (json.JSONDecodeError, ValueError, KeyError): return { From 805a66c05fa8a3e00347e94d61a967ccfa2bb436 Mon Sep 17 00:00:00 2001 From: Waqas Javed <7674577+w-javed@users.noreply.github.com> Date: Thu, 2 Apr 2026 18:17:42 -0700 Subject: [PATCH 4/5] fix: update sample init_parameters to pass api_key and model Align sample_eval_upload_friendly_evaluator.py with the updated FriendlyEvaluator that takes api_key and model instead of deployment_name/model_config. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../sample_eval_upload_friendly_evaluator.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_upload_friendly_evaluator.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_upload_friendly_evaluator.py index 144275f491d2..eec96e4a4deb 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_upload_friendly_evaluator.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_upload_friendly_evaluator.py @@ -56,9 +56,8 @@ load_dotenv() endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"] -model_deployment_name = os.environ.get("FOUNDRY_MODEL_NAME") -azure_openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"] -azure_openai_api_key = os.environ["AZURE_OPENAI_API_KEY"] +openai_api_key = os.environ["OPENAI_API_KEY"] +openai_model = os.environ.get("OPENAI_MODEL", "gpt-4.1") # The folder containing the FriendlyEvaluator code, including common_util/ subfolder local_upload_folder = str(Path(__file__).parent / "custom_evaluators" / "friendly_evaluator") @@ -96,7 +95,7 @@ }, "model": { "type": "string", - "description": "Model name to use for evaluation (e.g. gpt-4o)", + "description": "Model name to use for evaluation (e.g. 
gpt-4.1)", }, "threshold": {"type": "number"}, }, @@ -156,7 +155,8 @@ "name": evaluator_name, "evaluator_name": evaluator_name, "initialization_parameters": { - "deployment_name": f"{model_deployment_name}", # service converts deployment_name to api_key/model for the evaluator + "api_key": openai_api_key, + "model": openai_model, "threshold": 3, }, } From 35fd916aba822f2e9f46e87d28f97a9dcda0b3c8 Mon Sep 17 00:00:00 2001 From: Waqas Javed <7674577+w-javed@users.noreply.github.com> Date: Thu, 2 Apr 2026 23:31:45 -0700 Subject: [PATCH 5/5] feat: combine standalone + upload into single friendly evaluator sample Merge sample_custom_evaluator_friendly_evaluator.py into sample_eval_upload_friendly_evaluator.py so the sample first runs FriendlyEvaluator locally, then uploads, creates eval, and runs it. Fix model_name parameter to match evaluator __init__ signature. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../friendly_evaluator/friendly_evaluator.py | 8 +- .../sample_eval_upload_custom_evaluator.py | 16 +- .../sample_eval_upload_friendly_evaluator.py | 137 ++++++++++++------ 3 files changed, 103 insertions(+), 58 deletions(-) diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/friendly_evaluator.py b/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/friendly_evaluator.py index bbaa0c52910b..a72ea35988c6 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/friendly_evaluator.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/friendly_evaluator.py @@ -12,13 +12,13 @@ class FriendlyEvaluator: and a detailed explanation. :param api_key: The OpenAI API key. - :param model: The model to use for evaluation (e.g. "gpt-4o"). + :param model_name: The name of the model to use for evaluation (e.g. "gpt-4o"). :param threshold: The minimum score (1-5) to be considered "Pass" (default: 3). 
""" - def __init__(self, *, api_key: str, model: str, threshold: int = 3, **kwargs): + def __init__(self, *, api_key: str, model_name: str, threshold: int = 3, **kwargs): self.client = OpenAI(api_key=api_key) - self.model = model + self.model_name = model_name self.threshold = threshold def __call__(self, *, query: str, response: str, **kwargs) -> dict: @@ -29,7 +29,7 @@ def __call__(self, *, query: str, response: str, **kwargs) -> dict: :return: A dict with score, label, reason, threshold, passed, and properties. """ result = self.client.responses.create( - model=self.model, + model=self.model_name, instructions=build_evaluation_instructions(), input=build_evaluation_input(query, response), temperature=0.0, diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_upload_custom_evaluator.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_upload_custom_evaluator.py index b6d1a9195fbf..491f6623004d 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_upload_custom_evaluator.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_upload_custom_evaluator.py @@ -215,13 +215,13 @@ print("Waiting for evaluation run to complete...") # --------------------------------------------------------------- - # 5. Cleanup (uncomment to delete) + # 5. 
Cleanup # --------------------------------------------------------------- - # print("\nCleaning up...") - # project_client.beta.evaluators.delete_version( - # name=code_evaluator.name, - # version=code_evaluator.version, - # ) - # client.evals.delete(eval_id=eval_object.id) - # print("Cleanup done.") + print("\nCleaning up...") + project_client.beta.evaluators.delete_version( + name=code_evaluator.name, + version=code_evaluator.version, + ) + client.evals.delete(eval_id=eval_object.id) + print("Cleanup done.") print("\nDone - upload, eval creation, and eval run verified successfully.") diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_upload_friendly_evaluator.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_upload_friendly_evaluator.py index eec96e4a4deb..434c6b2b8c43 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_upload_friendly_evaluator.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_eval_upload_friendly_evaluator.py @@ -7,13 +7,14 @@ """ DESCRIPTION: Given an AIProjectClient, this sample demonstrates how to: - 1. Upload a custom LLM-based evaluator (FriendlyEvaluator) with nested - folder structure (common_util/) using `evaluators.upload()`. - 2. Create an evaluation (eval) that references the uploaded evaluator. - 3. Run the evaluation with inline data and poll for results. + 1. Run the FriendlyEvaluator standalone to verify it works locally. + 2. Upload the evaluator code (with nested folder structure) using + ``evaluators.upload()``. + 3. Create an evaluation (eval) that references the uploaded evaluator. + 4. Run the evaluation with inline data and poll for results. - The FriendlyEvaluator calls Azure OpenAI to judge the friendliness of a - response and returns score, label, reason, and explanation. + The FriendlyEvaluator calls the OpenAI Responses API to judge friendliness + and returns score, label, reason, threshold, passed, and properties. 
USAGE: python sample_eval_upload_friendly_evaluator.py @@ -24,10 +25,13 @@ Set these environment variables with your own values: 1) FOUNDRY_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint. - 2) FOUNDRY_MODEL_NAME - Optional. The name of the model deployment to use for evaluation. + 2) OPENAI_API_KEY - Required. The OpenAI API key. + 3) OPENAI_MODEL - Optional. The model to use (default: gpt-4o). """ import os +import sys +import json import time import random import string @@ -57,10 +61,48 @@ endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"] openai_api_key = os.environ["OPENAI_API_KEY"] -openai_model = os.environ.get("OPENAI_MODEL", "gpt-4.1") +openai_model = os.environ.get("OPENAI_MODEL", "gpt-4o") -# The folder containing the FriendlyEvaluator code, including common_util/ subfolder -local_upload_folder = str(Path(__file__).parent / "custom_evaluators" / "friendly_evaluator") +# Add the evaluator folder to sys.path so we can import it for local testing +evaluator_folder = str(Path(__file__).parent / "custom_evaluators" / "friendly_evaluator") +sys.path.insert(0, evaluator_folder) + +from friendly_evaluator import FriendlyEvaluator # noqa: E402 + +# --------------------------------------------------------------- +# 1. Run FriendlyEvaluator standalone to verify it works locally +# --------------------------------------------------------------- +print(f"=== Step 1: Standalone FriendlyEvaluator test (model={openai_model}) ===\n") + +evaluator = FriendlyEvaluator(api_key=openai_api_key, model_name=openai_model, threshold=3) + +test_cases = [ + { + "query": "How do I reset my password?", + "response": "Go to settings. Click reset. Done.", + }, + { + "query": "How do I reset my password?", + "response": ( + "Great question! I'd be happy to help you reset your password. " + "Just head over to Settings > Security > Reset Password, and follow " + "the prompts. If you run into any trouble, feel free to ask — I'm here to help! 
😊" + ), + }, + { + "query": "Can you help me with my order?", + "response": "Read the FAQ.", + }, +] + +for i, tc in enumerate(test_cases, 1): + print(f"--- Test Case {i} ---") + print(f"Query: {tc['query']}") + print(f"Response: {tc['response'][:80]}...") + result = evaluator(query=tc["query"], response=tc["response"]) + print(f"Result: {json.dumps(result, indent=2)}\n") + +print("Standalone test complete.\n") with ( DefaultAzureCredential() as credential, @@ -68,7 +110,7 @@ project_client.get_openai_client() as client, ): # --------------------------------------------------------------- - # 1. Upload evaluator code and create evaluator version + # 2. Upload evaluator code and create evaluator version # The folder structure uploaded is: # friendly_evaluator/ # friendly_evaluator.py <- entry point @@ -79,6 +121,8 @@ suffix = "".join(random.choices(string.ascii_lowercase, k=5)) evaluator_name = f"friendly_evaluator_{suffix}" + print(f"=== Step 2: Upload evaluator as '{evaluator_name}' ===\n") + evaluator_version = EvaluatorVersion( evaluator_type=EvaluatorType.CUSTOM, categories=[EvaluatorCategory.QUALITY], @@ -93,13 +137,13 @@ "type": "string", "description": "OpenAI API key for the LLM judge", }, - "model": { + "model_name": { "type": "string", - "description": "Model name to use for evaluation (e.g. gpt-4.1)", + "description": "Model name to use for evaluation (e.g. 
gpt-4o)", }, "threshold": {"type": "number"}, }, - "required": ["api_key", "model", "threshold"], + "required": ["api_key", "model_name", "threshold"], }, data_schema={ "type": "object", @@ -120,33 +164,32 @@ ), ) - print("Uploading FriendlyEvaluator (with nested common_util folder)...") friendly_evaluator = project_client.beta.evaluators.upload( name=evaluator_name, evaluator_version=evaluator_version, - folder=local_upload_folder, + folder=evaluator_folder, ) - print(f"\nEvaluator created: name={friendly_evaluator.name}, version={friendly_evaluator.version}") + print(f"Evaluator created: name={friendly_evaluator.name}, version={friendly_evaluator.version}") print(f"Evaluator ID: {friendly_evaluator.id}") pprint(friendly_evaluator) # --------------------------------------------------------------- - # 2. Create an evaluation referencing the uploaded evaluator + # 3. Create an evaluation referencing the uploaded evaluator # --------------------------------------------------------------- + print(f"\n=== Step 3: Create evaluation ===\n") + data_source_config = DataSourceConfigCustom( - { - "type": "custom", - "item_schema": { - "type": "object", - "properties": { - "query": {"type": "string"}, - "response": {"type": "string"}, - }, - "required": ["query", "response"], + type="custom", + item_schema={ + "type": "object", + "properties": { + "query": {"type": "string"}, + "response": {"type": "string"}, }, - "include_sample_schema": True, - } + "required": ["query", "response"], + }, + include_sample_schema=True, ) testing_criteria = [ @@ -155,14 +198,13 @@ "name": evaluator_name, "evaluator_name": evaluator_name, "initialization_parameters": { - "api_key": openai_api_key, - "model": openai_model, - "threshold": 3, + "api_key": openai_api_key, + "model_name": openai_model, + "threshold": 3, }, } ] - print("\nCreating evaluation...") eval_object = client.evals.create( name=f"Friendliness Evaluation - {suffix}", data_source_config=data_source_config, @@ -171,9 +213,10 @@ 
print(f"Evaluation created (id: {eval_object.id}, name: {eval_object.name})") # --------------------------------------------------------------- - # 3. Run the evaluation with inline data + # 4. Run the evaluation with inline data # --------------------------------------------------------------- - print("\nCreating evaluation run with inline data...") + print(f"\n=== Step 4: Create evaluation run ===\n") + eval_run_object = client.evals.runs.create( eval_id=eval_object.id, name=f"Friendliness Eval Run - {suffix}", @@ -216,12 +259,14 @@ pprint(eval_run_object) # --------------------------------------------------------------- - # 4. Poll for evaluation run completion + # 5. Poll for evaluation run completion # --------------------------------------------------------------- + print("\n=== Step 5: Polling for results ===\n") + while True: run = client.evals.runs.retrieve(run_id=eval_run_object.id, eval_id=eval_object.id) if run.status in ("completed", "failed"): - print(f"\nEvaluation run finished with status: {run.status}") + print(f"Evaluation run finished with status: {run.status}") output_items = list(client.evals.runs.output_items.list(run_id=run.id, eval_id=eval_object.id)) pprint(output_items) print(f"\nEvaluation run Report URL: {run.report_url}") @@ -230,13 +275,13 @@ print("Waiting for evaluation run to complete...") # --------------------------------------------------------------- - # 5. Cleanup (uncomment to delete) + # 6. 
Cleanup # --------------------------------------------------------------- - # print("\nCleaning up...") - # project_client.beta.evaluators.delete_version( - # name=friendly_evaluator.name, - # version=friendly_evaluator.version, - # ) - # client.evals.delete(eval_id=eval_object.id) - # print("Cleanup done.") - print("\nDone - FriendlyEvaluator upload, eval creation, and eval run verified successfully.") + print("\nCleaning up...") + project_client.beta.evaluators.delete_version( + name=friendly_evaluator.name, + version=friendly_evaluator.version, + ) + client.evals.delete(eval_id=eval_object.id) + print("Cleanup done.") + print("\nDone - FriendlyEvaluator standalone test, upload, eval creation, and eval run verified successfully.")