Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,11 @@
You MUST respond in the following JSON format only:
{
"score": <integer 1-5>,
"label": "<Pass or Fail>",
"reason": "<brief reason for the score>",
"explanation": "<detailed explanation of why the response received this score>"
"explanation": "<detailed explanation of why the response received this score>",
"tone": "<the overall tone detected, e.g. warm, neutral, dismissive>",
"confidence": "<high, medium, or low confidence in the assessment>"
}

A score of 3 or above is considered "Pass", below 3 is "Fail".
"""


Expand All @@ -44,30 +43,45 @@ def build_evaluation_messages(query: str, response: str) -> list:
def parse_evaluation_result(raw_result: str, threshold: int = 3) -> dict:
    """Parse the LLM's JSON response into a structured evaluation result.

    The return dict has the standard top-level keys (score, label, reason,
    threshold, passed) and a ``properties`` dict for any extra output fields
    the evaluator wants to surface.

    :param raw_result: The raw string output from the LLM.
    :param threshold: The minimum score to be considered "Pass".
    :return: A dict with score, label, reason, threshold, passed, and properties.
    """
    import json

    # Keys that are promoted to the top level of the result
    top_level_keys = {"score", "label", "reason"}

    try:
        # Try to extract JSON from the response (handle markdown code blocks)
        text = raw_result.strip()
        if text.startswith("```"):
            # Drop the opening fence (and any language tag on the same line),
            # then everything from the closing fence onward.
            text = text.split("\n", 1)[1] if "\n" in text else text[3:]
            text = text.rsplit("```", 1)[0]
        result = json.loads(text.strip())
        # json.loads can legally return a list/str/number; treat anything
        # that isn't an object as unparseable instead of crashing on .get
        # below with an uncaught AttributeError.
        if not isinstance(result, dict):
            raise ValueError("LLM output is valid JSON but not an object")
        # Clamp to the 1-5 scale before comparing against the threshold
        score = max(1, min(5, int(result.get("score", threshold))))
        passed = score >= threshold

        # Collect any extra fields returned by the LLM into properties
        properties = {k: v for k, v in result.items() if k not in top_level_keys}

        return {
            "score": score,
            "label": "Pass" if passed else "Fail",
            "reason": result.get("reason", "No reason provided"),
            "explanation": result.get("explanation", "No explanation provided"),
            "threshold": threshold,
            "passed": passed,
            "properties": properties,
        }
    # TypeError added: int() on a non-numeric score (e.g. a list or None)
    # raises TypeError, which must also fall back rather than propagate.
    except (json.JSONDecodeError, ValueError, KeyError, TypeError):
        # Fail-open: an unparseable response defaults to a passing score.
        # NOTE(review): confirm fail-open is the intended policy here — a
        # malformed LLM reply currently counts as a Pass.
        return {
            "score": threshold,
            "label": "Pass",
            "reason": "Could not parse LLM response",
            "explanation": f"Raw LLM output: {raw_result}",
            "threshold": threshold,
            "passed": True,
            # Bug fix: this key was missing on the fallback path, so callers
            # reading result["properties"] crashed; keep the shape identical
            # to the success path.
            "properties": {},
        }
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def __call__(self, *, query: str, response: str, **kwargs) -> dict:
:param query: The original user query.
:param response: The response to evaluate.
:return: A dict with score, label, reason, and explanation.
:return: A dict with score, label, reason, threshold, passed, and properties.
"""
messages = build_evaluation_messages(query, response)

Expand Down