Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,18 @@
# Release History

## 1.16.3 (Unreleased)
## 1.16.4 (Unreleased)

### Features Added

- Added support for evaluator `properties` passthrough in AOAI evaluation results. When an evaluator returns a `properties` dict, it is included alongside `score`, `label`, `reason`, `threshold`, and `passed` in the result object.

### Breaking Changes

### Bugs Fixed

### Other Changes

## 1.16.3 (2026-04-01)

### Features Added

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2576,7 +2576,8 @@ def _extract_metric_values(
"score": 4.5,
"coherence_reason": "Good flow",
"threshold": 3.0,
"sample": {...}
"sample": {...},
"properties": {"explanation": "Detailed analysis...", "confidence": 0.95}
}
expected_metrics = ["score"]

Expand All @@ -2586,13 +2587,22 @@ def _extract_metric_values(
"score": 4.5,
"reason": "Good flow",
"threshold": 3.0,
"sample": {...}
"sample": {...},
"properties": {"explanation": "Detailed analysis...", "confidence": 0.95}
}
}

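Note: If a ``properties`` key is present in the metrics dict and its value is a dict,
it is extracted (not treated as a metric) and attached to every non-empty per-metric
result entry. A ``properties`` value that is not a dict is not special-cased and is
processed like any other metric key. This allows evaluators to return additional
output fields alongside standard score/reason/threshold values.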
"""
result_per_metric = {}
properties = None

for metric_key, metric_value in metrics.items():
if metric_key == "properties" and isinstance(metric_value, dict):
properties = metric_value
continue
metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
temp_result_per_metric = {}
if metric not in result_per_metric:
Expand All @@ -2612,6 +2622,11 @@ def _extract_metric_values(
if result_name == "label" and criteria_type == "azure_ai_evaluator" and derived_passed is not None:
_append_indirect_attachments_to_results(result_per_metric, "passed", metric, derived_passed, None, None)

if properties is not None:
for metric_dict in result_per_metric.values():
if metric_dict is not None and len(metric_dict) > 0:
metric_dict["properties"] = properties

empty_metrics = []
empty_metrics.extend(
metric for metric, metric_dict in result_per_metric.items() if metric_dict is None or len(metric_dict) == 0
Expand Down Expand Up @@ -2869,8 +2884,13 @@ def _create_result_object(
"reason": "Good logical flow",
"threshold": 3.0,
"passed": None,
"sample": {"input": "...", "output": "..."}
"sample": {"input": "...", "output": "..."},
"properties": {"explanation": "...", "confidence": 0.95}
}

Note: The ``properties`` field is included only when the evaluator returned a
properties dict. It carries additional output fields beyond the standard
score/label/reason/threshold/passed values.
"""
# Extract values
score = metric_values.get("score")
Expand All @@ -2879,6 +2899,7 @@ def _create_result_object(
threshold = metric_values.get("threshold")
passed = metric_values.get("passed")
sample = metric_values.get("sample")
properties = metric_values.get("properties")

# Handle decrease boolean metrics
if is_inverse:
Expand All @@ -2898,6 +2919,8 @@ def _create_result_object(

if sample is not None:
result_obj["sample"] = sample
if properties is not None:
result_obj["properties"] = properties

return result_obj

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
# ---------------------------------------------------------
# represents upcoming version

VERSION = "1.16.3"
VERSION = "1.16.4"
Original file line number Diff line number Diff line change
Expand Up @@ -1353,6 +1353,55 @@ def run_test():
assert len(empty_converted["_evaluation_results_list"]) == 0
assert empty_converted["_evaluation_summary"]["result_counts"]["total"] == 0

# Test properties passthrough for custom evaluators
property_results = {
"metrics": {},
"rows": [
{
"inputs.query": "test query",
"outputs.friendly_eval.score": 4.5,
"outputs.friendly_eval.score_threshold": 3,
"outputs.friendly_eval.score_result": "Pass",
"outputs.friendly_eval.score_reason": "The response was warm",
"outputs.friendly_eval.properties": {
"explanation": "Detailed reasoning about friendliness",
"tone": "warm",
"confidence": "high",
},
}
],
"studio_url": None,
}

_convert_results_to_aoai_evaluation_results(
results=property_results,
logger=logger,
eval_run_id=eval_run_id,
eval_id=eval_id,
evaluators={"friendly_eval": lambda **kwargs: {"score": 1}},
eval_meta_data={
"testing_criteria": [
{
"name": "friendly_eval",
"type": "quality",
"metrics": ["score"],
}
]
},
)

property_result = property_results["_evaluation_results_list"][0]["results"][0]
assert property_result["score"] == 4.5
assert property_result["label"] == "Pass"
assert property_result["reason"] == "The response was warm"
assert property_result["threshold"] == 3
assert property_result["properties"] == {
"explanation": "Detailed reasoning about friendliness",
"tone": "warm",
"confidence": "high",
}
assert "explanation" not in property_result

@patch(
"azure.ai.evaluation._evaluate._evaluate._map_names_to_builtins",
return_value={},
Expand Down
Loading