diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/common_util/util.py b/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/common_util/util.py index aa137276e55c..026e0352f51f 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/common_util/util.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/common_util/util.py @@ -12,12 +12,11 @@ You MUST respond in the following JSON format only: { "score": , - "label": "", "reason": "", - "explanation": "" + "explanation": "", + "tone": "", + "confidence": "" } - -A score of 3 or above is considered "Pass", below 3 is "Fail". """ @@ -44,12 +43,19 @@ def build_evaluation_messages(query: str, response: str) -> list: def parse_evaluation_result(raw_result: str, threshold: int = 3) -> dict: """Parse the LLM's JSON response into a structured evaluation result. + The return dict has the standard top-level keys (score, label, reason, + threshold, passed) and a ``properties`` dict for any extra output fields + the evaluator wants to surface. + :param raw_result: The raw string output from the LLM. :param threshold: The minimum score to be considered "Pass". - :return: A dict with score, label, reason, and explanation. + :return: A dict with score, label, reason, threshold, passed, and properties. """ import json + # Keys that are promoted to the top level of the result + top_level_keys = {"score", "label", "reason"} + try: # Try to extract JSON from the response (handle markdown code blocks) text = raw_result.strip() @@ -57,17 +63,25 @@ def parse_evaluation_result(raw_result: str, threshold: int = 3) -> dict: text = text.split("\n", 1)[1] if "\n" in text else text[3:] text = text.rsplit("```", 1)[0] result = json.loads(text.strip()) - score = int(result.get("score", threshold)) + score = max(1, min(5, int(result.get("score", threshold)))) + passed = score >= threshold + + # Collect any extra fields returned by the LLM into properties + properties = {k: v for k, v in result.items() if k not in top_level_keys} + return { - "score": max(1, min(5, score)), - "label": result.get("label", "Pass" if score >= threshold else "Fail"), + "score": score, + "label": "Pass" if passed else "Fail", "reason": result.get("reason", "No reason provided"), - "explanation": result.get("explanation", "No explanation provided"), + "threshold": threshold, + "passed": passed, + "properties": properties, } except (json.JSONDecodeError, ValueError, KeyError): return { "score": threshold, "label": "Pass", "reason": "Could not parse LLM response", - "explanation": f"Raw LLM output: {raw_result}", + "threshold": threshold, + "passed": True, } diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/friendly_evaluator.py b/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/friendly_evaluator.py index 730237af61f5..eaf173d1cde1 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/friendly_evaluator.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/custom_evaluators/friendly_evaluator/friendly_evaluator.py @@ -49,7 +49,7 @@ def __call__(self, *, query: str, response: str, **kwargs) -> dict: :param query: The original user query. :param response: The response to evaluate. - :return: A dict with score, label, reason, and explanation. + :return: A dict with score, label, reason, threshold, passed, and properties. """ messages = build_evaluation_messages(query, response)