Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,21 @@

LOGGER = logging.getLogger(__name__)

# Column-name suffixes that mark standard AOAI result fields (label/score/
# reason/threshold, model metadata, and sample/usage token counters).
# Keys ending in one of these are mapped to their dedicated result fields and
# are deliberately excluded from the generic "properties" bag.
_RESULT_PROPERTY_SUFFIXES = (
    "_result",
    "_reason",
    "_threshold",
    "_label",
    "_score",
    "_model",
    "_finish_reason",
    "_sample_input",
    "_sample_output",
    "_total_tokens",
    "_prompt_tokens",
    "_completion_tokens",
)

# For metrics (aggregates) whose metric names intentionally differ from their
# originating column name, usually because the aggregation of the original value
# means something sufficiently different.
Expand Down Expand Up @@ -2511,9 +2526,11 @@ def _process_criteria_metrics(
{"input": "...", "output": "..."}
)
"""
expected_metrics = testing_criteria_metadata.get(criteria_name, {}).get("metrics", [])
criteria_type = testing_criteria_metadata.get(criteria_name, {}).get("type", "")
is_inverse = testing_criteria_metadata.get(criteria_name, {}).get("is_inverse", False)
criteria_metadata = testing_criteria_metadata.get(criteria_name, {})
expected_metrics = criteria_metadata.get("metrics", [])
criteria_type = criteria_metadata.get("type", "")
evaluator_name = criteria_metadata.get("evaluator_name", "")
is_inverse = criteria_metadata.get("is_inverse", False)

if _is_none_or_nan(criteria_type) or _is_none_or_nan(criteria_name):
logger.warning(
Expand All @@ -2522,7 +2539,15 @@ def _process_criteria_metrics(
return ([], {})

# Extract metric values
result_per_metric = _extract_metric_values(criteria_name, criteria_type, metrics, expected_metrics, logger)
include_property_bag = _should_include_property_bag(evaluator_name, metrics)
result_per_metric = _extract_metric_values(
criteria_name,
criteria_type,
include_property_bag,
metrics,
expected_metrics,
logger,
)

# Inject threshold from evaluator config when not present in raw results
# (e.g., PythonGrader/code evaluators don't emit a threshold column)
Expand Down Expand Up @@ -2550,7 +2575,12 @@ def _process_criteria_metrics(


def _extract_metric_values(
criteria_name: str, criteria_type: str, metrics: Dict[str, Any], expected_metrics: List[str], logger: logging.Logger
criteria_name: str,
criteria_type: str,
include_property_bag: bool,
metrics: Dict[str, Any],
expected_metrics: List[str],
logger: logging.Logger,
) -> Dict[str, Dict[str, Any]]:
"""Extract and organize metric values by metric name.

Expand Down Expand Up @@ -2599,16 +2629,23 @@ def _extract_metric_values(
result_per_metric[metric] = {}

result_name, result_name_child_level, result_name_nested_child_level, derived_passed = _update_metric_value(
criteria_type, result_per_metric[metric], metric_key, metric, metric_value, logger
)
_append_indirect_attachments_to_results(
result_per_metric,
result_name,
criteria_type,
result_per_metric[metric],
metric_key,
metric,
metric_value,
result_name_child_level,
result_name_nested_child_level,
logger,
include_property_bag=include_property_bag,
)
if result_name is not None:
_append_indirect_attachments_to_results(
result_per_metric,
result_name,
metric,
metric_value,
result_name_child_level,
result_name_nested_child_level,
)
if result_name == "label" and criteria_type == "azure_ai_evaluator" and derived_passed is not None:
_append_indirect_attachments_to_results(result_per_metric, "passed", metric, derived_passed, None, None)

Expand All @@ -2629,6 +2666,7 @@ def _update_metric_value(
metric: str,
metric_value: Any,
logger: logging.Logger,
include_property_bag: bool = False,
) -> Tuple[str, str, str]:
"""Update metric dictionary with the appropriate field based on metric key.

Expand All @@ -2638,6 +2676,10 @@ def _update_metric_value(

:param criteria_type: Type of the evaluation criteria (e.g. 'azure_ai_evaluator')
:type criteria_type: str
:param include_property_bag: Whether non-standard metric fields should be preserved under
the AOAI result properties bag. This defaults to False for backwards compatibility
with direct helper callers; the AOAI conversion path passes the explicit value.
:type include_property_bag: bool
:param metric_dict: Dictionary to update with metric values
:type metric_dict: Dict[str, Any]
:param metric_key: Key name of the metric (determines field assignment)
Expand Down Expand Up @@ -2674,13 +2716,20 @@ def _update_metric_value(
result_name_nested_child_level = None
derived_passed = None

property_name = None if metric_key == metric else _get_result_property_name(metric_key)
if property_name and include_property_bag:
_ensure_properties_dict(metric_dict)
metric_dict["properties"][property_name] = metric_value
result_name = "properties"
result_name_child_level = property_name

if metric_key.endswith("_score") or metric_key == "score":
metric_dict["score"] = metric_value
result_name = "score"
elif metric_key == "passed":
metric_dict["passed"] = metric_value
result_name = "passed"
elif metric_key.endswith("_result") or metric_key == "result" or metric_key.endswith("_label"):
elif metric_key.endswith("_result") or metric_key in ["result", "label"] or metric_key.endswith("_label"):
metric_dict["label"] = metric_value
result_name = "label"
if criteria_type == "azure_ai_evaluator":
Expand Down Expand Up @@ -2746,23 +2795,7 @@ def _update_metric_value(
result_name = "sample"
result_name_child_level = "usage"
result_name_nested_child_level = "completion_tokens"
elif not any(
metric_key.endswith(suffix)
for suffix in [
"_result",
"_reason",
"_threshold",
"_label",
"_score",
"_model",
"_finish_reason",
"_sample_input",
"_sample_output",
"_total_tokens",
"_prompt_tokens",
"_completion_tokens",
]
):
elif not any(metric_key.endswith(suffix) for suffix in _RESULT_PROPERTY_SUFFIXES):
# If no score found yet and this doesn't match other patterns, use as score
if metric_key == metric and metric_dict.get("score", None) is None:
metric_dict["score"] = metric_value
Expand Down Expand Up @@ -2814,6 +2847,43 @@ def _ensure_usage_dict(metric_dict: Dict[str, Any]) -> None:
metric_dict["sample"]["usage"] = {}


def _ensure_properties_dict(metric_dict: Dict[str, Any]) -> None:
"""Ensure properties dictionary exists in metric_dict.

:param metric_dict: Metric dictionary to modify
:type metric_dict: Dict[str, Any]
:return: None (modifies metric_dict in place)
:rtype: None
"""
if "properties" not in metric_dict:
metric_dict["properties"] = {}


def _should_include_property_bag(evaluator_name: Optional[str], metrics: Dict[str, Any]) -> bool:
"""Return whether AOAI result properties should be emitted for a custom evaluator result."""
if "custom_score" in metrics:
return True
return not (evaluator_name and evaluator_name.startswith("builtin."))


def _get_result_property_name(metric_key: str) -> Optional[str]:
    """Return the result property name for fields that should be preserved in properties.

    Keys are excluded (``None`` returned) when they are one of the standard
    result fields or end in a standard-field suffix; a ``custom_`` prefix is
    stripped from the returned property name.

    :param metric_key: Raw metric key from the evaluation results.
    :type metric_key: str
    :return: Property name to store the value under, or ``None`` when the key
        maps to a standard result field instead of the properties bag.
    :rtype: Optional[str]
    """
    result_property_prefix = "custom_"
    standard_metric_keys = {"score", "passed", "result", "label", "reason", "threshold", "sample"}

    # Standard top-level fields never go into the properties bag.
    if metric_key in standard_metric_keys:
        return None
    # str.endswith accepts a tuple, so no any(...) loop is needed. Keys with a
    # standard-field suffix are routed to dedicated result fields elsewhere.
    if metric_key.endswith(_RESULT_PROPERTY_SUFFIXES):
        return None
    # Strip a non-empty "custom_" prefix; a bare "custom_" key is kept as-is.
    if metric_key.startswith(result_property_prefix) and len(metric_key) > len(result_property_prefix):
        return metric_key[len(result_property_prefix) :]
    return metric_key


def _create_result_object(
criteria_name: str,
metric: str,
Expand Down Expand Up @@ -2879,6 +2949,7 @@ def _create_result_object(
threshold = metric_values.get("threshold")
passed = metric_values.get("passed")
sample = metric_values.get("sample")
properties = metric_values.get("properties")

# Handle decrease boolean metrics
if is_inverse:
Expand All @@ -2898,6 +2969,8 @@ def _create_result_object(

if sample is not None:
result_obj["sample"] = sample
if properties is not None:
result_obj["properties"] = properties

return result_obj

Expand Down Expand Up @@ -3264,7 +3337,11 @@ def _append_indirect_attachments_to_results(
) = metric_value


def _get_metric_from_criteria(testing_criteria_name: str, metric_key: str, metric_list: List[str]) -> str:
def _get_metric_from_criteria(
testing_criteria_name: str,
metric_key: str,
metric_list: List[str],
) -> str:
"""
Get the metric name from the testing criteria and metric key.

Expand All @@ -3291,6 +3368,9 @@ def _get_metric_from_criteria(testing_criteria_name: str, metric_key: str, metri
elif metric_key == "f1_result" or metric_key == "f1_threshold" or metric_key == "f1_score":
metric = "f1_score"
return metric
elif len(metric_list) == 1:
metric = metric_list[0]
return metric
for expected_metric in metric_list:
if metric_key.startswith(expected_metric):
metric = expected_metric
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,9 @@
"temperature": 1.0,
"top_p": 1.0,
"max_completions_tokens": 4096
},
"properties": {
"type": null
}
},
{
Expand Down Expand Up @@ -270,6 +273,9 @@
"content": "<S0>Let's think step by step: The response \"washington, d.c.\" is extremely brief and lacks any sentence structure, grammatical context, or vocabulary range. It does not convey a clear idea or message, making it difficult to understand. Based on the definitions, this aligns with Emergent Fluency as it shows minimal command of the language and is largely incomprehensible.</S0> \n<S1>The response is fragmented and does not demonstrate any fluency traits such as grammar, vocabulary, or coherence. It is incomprehensible as a standalone statement.</S1> \n<S2>1</S2> "
}
]
},
"properties": {
"gpt_fluency": 1.0
}
},
{
Expand Down Expand Up @@ -398,6 +404,9 @@
"temperature": 1.0,
"top_p": 1.0,
"max_completions_tokens": 4096
},
"properties": {
"type": null
}
},
{
Expand Down
Loading