diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
index d46f3dd216fb..d89648c223e7 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
@@ -65,6 +65,21 @@
LOGGER = logging.getLogger(__name__)
+_RESULT_PROPERTY_SUFFIXES = (
+ "_result",
+ "_reason",
+ "_threshold",
+ "_label",
+ "_score",
+ "_model",
+ "_finish_reason",
+ "_sample_input",
+ "_sample_output",
+ "_total_tokens",
+ "_prompt_tokens",
+ "_completion_tokens",
+)
+
# For metrics (aggregates) whose metric names intentionally differ from their
# originating column name, usually because the aggregation of the original value
# means something sufficiently different.
@@ -2511,9 +2526,11 @@ def _process_criteria_metrics(
{"input": "...", "output": "..."}
)
"""
- expected_metrics = testing_criteria_metadata.get(criteria_name, {}).get("metrics", [])
- criteria_type = testing_criteria_metadata.get(criteria_name, {}).get("type", "")
- is_inverse = testing_criteria_metadata.get(criteria_name, {}).get("is_inverse", False)
+ criteria_metadata = testing_criteria_metadata.get(criteria_name, {})
+ expected_metrics = criteria_metadata.get("metrics", [])
+ criteria_type = criteria_metadata.get("type", "")
+ evaluator_name = criteria_metadata.get("evaluator_name", "")
+ is_inverse = criteria_metadata.get("is_inverse", False)
if _is_none_or_nan(criteria_type) or _is_none_or_nan(criteria_name):
logger.warning(
@@ -2522,7 +2539,15 @@ def _process_criteria_metrics(
return ([], {})
# Extract metric values
- result_per_metric = _extract_metric_values(criteria_name, criteria_type, metrics, expected_metrics, logger)
+ include_property_bag = _should_include_property_bag(evaluator_name, metrics)
+ result_per_metric = _extract_metric_values(
+ criteria_name,
+ criteria_type,
+ include_property_bag,
+ metrics,
+ expected_metrics,
+ logger,
+ )
# Inject threshold from evaluator config when not present in raw results
# (e.g., PythonGrader/code evaluators don't emit a threshold column)
@@ -2550,7 +2575,12 @@ def _process_criteria_metrics(
def _extract_metric_values(
- criteria_name: str, criteria_type: str, metrics: Dict[str, Any], expected_metrics: List[str], logger: logging.Logger
+ criteria_name: str,
+ criteria_type: str,
+ include_property_bag: bool,
+ metrics: Dict[str, Any],
+ expected_metrics: List[str],
+ logger: logging.Logger,
) -> Dict[str, Dict[str, Any]]:
"""Extract and organize metric values by metric name.
@@ -2599,16 +2629,23 @@ def _extract_metric_values(
result_per_metric[metric] = {}
result_name, result_name_child_level, result_name_nested_child_level, derived_passed = _update_metric_value(
- criteria_type, result_per_metric[metric], metric_key, metric, metric_value, logger
- )
- _append_indirect_attachments_to_results(
- result_per_metric,
- result_name,
+ criteria_type,
+ result_per_metric[metric],
+ metric_key,
metric,
metric_value,
- result_name_child_level,
- result_name_nested_child_level,
+ logger,
+ include_property_bag=include_property_bag,
)
+ if result_name is not None:
+ _append_indirect_attachments_to_results(
+ result_per_metric,
+ result_name,
+ metric,
+ metric_value,
+ result_name_child_level,
+ result_name_nested_child_level,
+ )
if result_name == "label" and criteria_type == "azure_ai_evaluator" and derived_passed is not None:
_append_indirect_attachments_to_results(result_per_metric, "passed", metric, derived_passed, None, None)
@@ -2629,6 +2666,7 @@ def _update_metric_value(
metric: str,
metric_value: Any,
logger: logging.Logger,
+ include_property_bag: bool = False,
-) -> Tuple[str, str, str]:
+) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[bool]]:
"""Update metric dictionary with the appropriate field based on metric key.
@@ -2638,6 +2676,10 @@ def _update_metric_value(
:param criteria_type: Type of the evaluation criteria (e.g. 'azure_ai_evaluator')
:type criteria_type: str
+ :param include_property_bag: Whether non-standard metric fields should be preserved under
+ the AOAI result properties bag. This defaults to False for backwards compatibility
+ with direct helper callers; the AOAI conversion path passes the explicit value.
+ :type include_property_bag: bool
:param metric_dict: Dictionary to update with metric values
:type metric_dict: Dict[str, Any]
:param metric_key: Key name of the metric (determines field assignment)
@@ -2674,13 +2716,20 @@ def _update_metric_value(
result_name_nested_child_level = None
derived_passed = None
+ property_name = None if metric_key == metric else _get_result_property_name(metric_key)
+ if property_name and include_property_bag:
+ _ensure_properties_dict(metric_dict)
+ metric_dict["properties"][property_name] = metric_value
+ result_name = "properties"
+ result_name_child_level = property_name
+
if metric_key.endswith("_score") or metric_key == "score":
metric_dict["score"] = metric_value
result_name = "score"
elif metric_key == "passed":
metric_dict["passed"] = metric_value
result_name = "passed"
- elif metric_key.endswith("_result") or metric_key == "result" or metric_key.endswith("_label"):
+ elif metric_key.endswith("_result") or metric_key in ["result", "label"] or metric_key.endswith("_label"):
metric_dict["label"] = metric_value
result_name = "label"
if criteria_type == "azure_ai_evaluator":
@@ -2746,23 +2795,7 @@ def _update_metric_value(
result_name = "sample"
result_name_child_level = "usage"
result_name_nested_child_level = "completion_tokens"
- elif not any(
- metric_key.endswith(suffix)
- for suffix in [
- "_result",
- "_reason",
- "_threshold",
- "_label",
- "_score",
- "_model",
- "_finish_reason",
- "_sample_input",
- "_sample_output",
- "_total_tokens",
- "_prompt_tokens",
- "_completion_tokens",
- ]
- ):
+ elif not any(metric_key.endswith(suffix) for suffix in _RESULT_PROPERTY_SUFFIXES):
# If no score found yet and this doesn't match other patterns, use as score
if metric_key == metric and metric_dict.get("score", None) is None:
metric_dict["score"] = metric_value
@@ -2814,6 +2847,43 @@ def _ensure_usage_dict(metric_dict: Dict[str, Any]) -> None:
metric_dict["sample"]["usage"] = {}
+def _ensure_properties_dict(metric_dict: Dict[str, Any]) -> None:
+ """Ensure properties dictionary exists in metric_dict.
+
+ :param metric_dict: Metric dictionary to modify
+ :type metric_dict: Dict[str, Any]
+ :return: None (modifies metric_dict in place)
+ :rtype: None
+ """
+ if "properties" not in metric_dict:
+ metric_dict["properties"] = {}
+
+
+def _should_include_property_bag(evaluator_name: Optional[str], metrics: Dict[str, Any]) -> bool:
+ """Return whether AOAI result properties should be emitted for a custom evaluator result."""
+ if "custom_score" in metrics:
+ return True
+ return not (evaluator_name and evaluator_name.startswith("builtin."))
+
+
+def _get_result_property_name(metric_key: str) -> Optional[str]:
+ """Return the result property name for fields that should be preserved in properties."""
+ result_property_prefix = "custom_"
+ standard_metric_keys = {"score", "passed", "result", "label", "reason", "threshold", "sample"}
+
+ if metric_key.startswith(result_property_prefix) and len(metric_key) > len(result_property_prefix):
+ if any(metric_key.endswith(suffix) for suffix in _RESULT_PROPERTY_SUFFIXES):
+ return None
+ return metric_key[len(result_property_prefix) :]
+
+ if metric_key not in standard_metric_keys:
+ if any(metric_key.endswith(suffix) for suffix in _RESULT_PROPERTY_SUFFIXES):
+ return None
+ return metric_key
+
+ return None
+
+
def _create_result_object(
criteria_name: str,
metric: str,
@@ -2879,6 +2949,7 @@ def _create_result_object(
threshold = metric_values.get("threshold")
passed = metric_values.get("passed")
sample = metric_values.get("sample")
+ properties = metric_values.get("properties")
# Handle decrease boolean metrics
if is_inverse:
@@ -2898,6 +2969,8 @@ def _create_result_object(
if sample is not None:
result_obj["sample"] = sample
+ if properties is not None:
+ result_obj["properties"] = properties
return result_obj
@@ -3264,7 +3337,11 @@ def _append_indirect_attachments_to_results(
) = metric_value
-def _get_metric_from_criteria(testing_criteria_name: str, metric_key: str, metric_list: List[str]) -> str:
+def _get_metric_from_criteria(
+ testing_criteria_name: str,
+ metric_key: str,
+ metric_list: List[str],
+) -> str:
"""
Get the metric name from the testing criteria and metric key.
@@ -3291,6 +3368,9 @@ def _get_metric_from_criteria(testing_criteria_name: str, metric_key: str, metri
elif metric_key == "f1_result" or metric_key == "f1_threshold" or metric_key == "f1_score":
metric = "f1_score"
return metric
+ elif len(metric_list) == 1:
+ metric = metric_list[0]
+ return metric
for expected_metric in metric_list:
if metric_key.startswith(expected_metric):
metric = expected_metric
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/evaluation_util_convert_expected_output.json b/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/evaluation_util_convert_expected_output.json
index 6b40439c3ebd..0d88b1380898 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/evaluation_util_convert_expected_output.json
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/evaluation_util_convert_expected_output.json
@@ -208,6 +208,9 @@
"temperature": 1.0,
"top_p": 1.0,
"max_completions_tokens": 4096
+ },
+ "properties": {
+ "type": null
}
},
{
@@ -270,6 +273,9 @@
"content": "Let's think step by step: The response \"washington, d.c.\" is extremely brief and lacks any sentence structure, grammatical context, or vocabulary range. It does not convey a clear idea or message, making it difficult to understand. Based on the definitions, this aligns with Emergent Fluency as it shows minimal command of the language and is largely incomprehensible. \nThe response is fragmented and does not demonstrate any fluency traits such as grammar, vocabulary, or coherence. It is incomprehensible as a standalone statement. \n1 "
}
]
+ },
+ "properties": {
+ "gpt_fluency": 1.0
}
},
{
@@ -398,6 +404,9 @@
"temperature": 1.0,
"top_p": 1.0,
"max_completions_tokens": 4096
+ },
+ "properties": {
+ "type": null
}
},
{
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
index 47ef67eb4baa..a680bdbbb380 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
@@ -1,4 +1,4 @@
-from typing import List, Dict, Union
+from typing import List, Dict, Union, cast
import json
import logging
import math
@@ -26,6 +26,7 @@
SelfHarmEvaluator,
HateUnfairnessEvaluator,
AzureOpenAIModelConfiguration,
+ EvaluationResult,
)
from azure.ai.evaluation._aoai.label_grader import AzureOpenAILabelGrader
from azure.ai.evaluation._constants import (
@@ -1209,7 +1210,10 @@ def test_convert_results_to_aoai_evaluation_results(self):
eval_id = "test_eval_group_123"
eval_run_id = "test_run_456"
# Create EvaluationResult structure
- test_results = {"metrics": {"overall_score": 0.75}, "rows": test_rows, "studio_url": "https://test-studio.com"}
+ test_results = cast(
+ EvaluationResult,
+ {"metrics": {"overall_score": 0.75}, "rows": test_rows, "studio_url": "https://test-studio.com"},
+ )
# Test the conversion function
def run_test():
@@ -1247,6 +1251,12 @@ def run_test():
expected_results_json = json.load(f)
assert converted_results_json == expected_results_json
+ builtin_results = converted_results["_evaluation_results_list"][0]["results"]
+ labelgrader_result = next(result for result in builtin_results if result["name"] == "labelgrader")
+ fluency_result = next(result for result in builtin_results if result["name"] == "Fluency")
+ assert labelgrader_result["properties"] == {"type": None}
+ assert fluency_result["properties"] == {"gpt_fluency": 1.0}
+
# Verify metrics preserved
assert converted_results["metrics"]["overall_score"] == 0.75
@@ -1343,7 +1353,7 @@ def run_test():
assert "cached_tokens" in usage_item
# Test with empty results
- empty_results = {"metrics": {}, "rows": [], "studio_url": None}
+ empty_results = cast(EvaluationResult, {"metrics": {}, "rows": [], "studio_url": None})
_convert_results_to_aoai_evaluation_results(
results=empty_results, logger=logger, eval_run_id=eval_run_id, eval_id=eval_id, evaluators=evaluators
)
@@ -1353,6 +1363,55 @@ def run_test():
assert len(empty_converted["_evaluation_results_list"]) == 0
assert empty_converted["_evaluation_summary"]["result_counts"]["total"] == 0
+ property_results = cast(
+ EvaluationResult,
+ {
+ "metrics": {},
+ "rows": [
+ {
+ "inputs.query": "test query",
+ "outputs.friendly_evaluator_gh4y.custom_score": 4.5,
+ "outputs.friendly_evaluator_gh4y.custom_threshold": 3,
+ "outputs.friendly_evaluator_gh4y.label": False,
+ "outputs.friendly_evaluator_gh4y.custom_observation_flag": False,
+ "outputs.friendly_evaluator_gh4y.explanation": "Detailed attack reasoning",
+ "outputs.friendly_evaluator_gh4y.attack_phase": "probe",
+ }
+ ],
+ "studio_url": None,
+ },
+ )
+
+ _convert_results_to_aoai_evaluation_results(
+ results=property_results,
+ logger=logger,
+ eval_run_id=eval_run_id,
+ eval_id=eval_id,
+ evaluators={"friendly_evaluator_gh4y": lambda **kwargs: {"score": 1}},
+ eval_meta_data={
+ "testing_criteria": [
+ {
+ "name": "friendly_evaluator_gh4y",
+ "type": "quality",
+ "metrics": ["score"],
+ "evaluator_name": "builtin.friendly_evaluator_gh4y",
+ }
+ ]
+ },
+ )
+
+ property_result = property_results["_evaluation_results_list"][0]["results"][0]
+ assert property_result["properties"] == {
+ "observation_flag": False,
+ "explanation": "Detailed attack reasoning",
+ "attack_phase": "probe",
+ }
+ assert property_result["score"] == 4.5
+ assert property_result["reason"] is None
+ assert "explanation" not in property_result
+ assert property_result["threshold"] == 3
+ assert property_result["label"] is False
+
@patch(
"azure.ai.evaluation._evaluate._evaluate._map_names_to_builtins",
return_value={},
@@ -1826,6 +1885,33 @@ def test_nan_string_maps_to_none(self, suffix):
token_key = suffix.lstrip("_")
assert result["sample"]["usage"][token_key] is None
+ def test_legacy_helper_call_does_not_create_properties_bag(self):
+ metric_dict = {}
+ _update_metric_value(
+ criteria_type="azure_ai_evaluator",
+ metric_dict=metric_dict,
+ metric_key="custom_observation_flag",
+ metric="score",
+ metric_value=False,
+ logger=logging.getLogger("test"),
+ )
+
+ assert "properties" not in metric_dict
+
+ def test_include_property_bag_preserves_custom_fields(self):
+ metric_dict = {}
+ _update_metric_value(
+ criteria_type="azure_ai_evaluator",
+ include_property_bag=True,
+ metric_dict=metric_dict,
+ metric_key="custom_observation_flag",
+ metric="score",
+ metric_value=False,
+ logger=logging.getLogger("test"),
+ )
+
+ assert metric_dict["properties"] == {"observation_flag": False}
+
@pytest.mark.unittest
class TestBuildInternalLogAttributesThreshold: