Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,11 @@
You MUST respond in the following JSON format only:
{
"score": <integer 1-5>,
"label": "<Pass or Fail>",
"reason": "<brief reason for the score>",
"explanation": "<detailed explanation of why the response received this score>"
"explanation": "<detailed explanation of why the response received this score>",
"tone": "<the overall tone detected, e.g. warm, neutral, dismissive>",
"confidence": "<high, medium, or low confidence in the assessment>"
}

A score of 3 or above is considered "Pass", below 3 is "Fail".
"""


Expand All @@ -44,30 +43,45 @@ def build_evaluation_messages(query: str, response: str) -> list:
def parse_evaluation_result(raw_result: str, threshold: int = 3) -> dict:
    """Parse the LLM's JSON response into a structured evaluation result.

    The return dict has the standard top-level keys (score, label, reason,
    threshold, passed) and a ``properties`` dict for any extra output fields
    the evaluator wants to surface.

    :param raw_result: The raw string output from the LLM.
    :param threshold: The minimum score to be considered "Pass".
    :return: A dict with score, label, reason, threshold, passed, and properties.
    """
    import json

    # Keys that are promoted to the top level of the result
    top_level_keys = {"score", "label", "reason"}

    try:
        # Try to extract JSON from the response (handle markdown code blocks)
        text = raw_result.strip()
        if text.startswith("```"):
            # Drop the opening fence (and any language tag on the same line),
            # then everything from the closing fence onward.
            text = text.split("\n", 1)[1] if "\n" in text else text[3:]
            text = text.rsplit("```", 1)[0]
        result = json.loads(text.strip())
        # json.loads can legally return a list/str/number; treat anything
        # that isn't an object as unparseable instead of crashing on .get
        # below with an uncaught AttributeError.
        if not isinstance(result, dict):
            raise ValueError("LLM output is valid JSON but not an object")
        # Clamp to the 1-5 scale before comparing against the threshold
        score = max(1, min(5, int(result.get("score", threshold))))
        passed = score >= threshold

        # Collect any extra fields returned by the LLM into properties
        properties = {k: v for k, v in result.items() if k not in top_level_keys}

        return {
            "score": score,
            "label": "Pass" if passed else "Fail",
            "reason": result.get("reason", "No reason provided"),
            "explanation": result.get("explanation", "No explanation provided"),
            "threshold": threshold,
            "passed": passed,
            "properties": properties,
        }
    # TypeError added: int() on a non-numeric score (e.g. a list or None)
    # raises TypeError, which must also fall back rather than propagate.
    except (json.JSONDecodeError, ValueError, KeyError, TypeError):
        # Fail-open: an unparseable response defaults to a passing score.
        # NOTE(review): confirm fail-open is the intended policy here — a
        # malformed LLM reply currently counts as a Pass.
        return {
            "score": threshold,
            "label": "Pass",
            "reason": "Could not parse LLM response",
            "explanation": f"Raw LLM output: {raw_result}",
            "threshold": threshold,
            "passed": True,
            # Bug fix: this key was missing on the fallback path, so callers
            # reading result["properties"] crashed; keep the shape identical
            # to the success path.
            "properties": {},
        }
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def __call__(self, *, query: str, response: str, **kwargs) -> dict:
:param query: The original user query.
:param response: The response to evaluate.
:return: A dict with score, label, reason, and explanation.
:return: A dict with score, label, reason, threshold, passed, and properties.
"""
messages = build_evaluation_messages(query, response)

Expand Down