diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
index d46f3dd216fb..d89648c223e7 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
@@ -65,6 +65,21 @@
LOGGER = logging.getLogger(__name__)
+_RESULT_PROPERTY_SUFFIXES = (
+ "_result",
+ "_reason",
+ "_threshold",
+ "_label",
+ "_score",
+ "_model",
+ "_finish_reason",
+ "_sample_input",
+ "_sample_output",
+ "_total_tokens",
+ "_prompt_tokens",
+ "_completion_tokens",
+)
+
# For metrics (aggregates) whose metric names intentionally differ from their
# originating column name, usually because the aggregation of the original value
# means something sufficiently different.
@@ -2511,9 +2526,11 @@ def _process_criteria_metrics(
{"input": "...", "output": "..."}
)
"""
- expected_metrics = testing_criteria_metadata.get(criteria_name, {}).get("metrics", [])
- criteria_type = testing_criteria_metadata.get(criteria_name, {}).get("type", "")
- is_inverse = testing_criteria_metadata.get(criteria_name, {}).get("is_inverse", False)
+ criteria_metadata = testing_criteria_metadata.get(criteria_name, {})
+ expected_metrics = criteria_metadata.get("metrics", [])
+ criteria_type = criteria_metadata.get("type", "")
+ evaluator_name = criteria_metadata.get("evaluator_name", "")
+ is_inverse = criteria_metadata.get("is_inverse", False)
if _is_none_or_nan(criteria_type) or _is_none_or_nan(criteria_name):
logger.warning(
@@ -2522,7 +2539,15 @@ def _process_criteria_metrics(
return ([], {})
# Extract metric values
- result_per_metric = _extract_metric_values(criteria_name, criteria_type, metrics, expected_metrics, logger)
+ include_property_bag = _should_include_property_bag(evaluator_name, metrics)
+ result_per_metric = _extract_metric_values(
+ criteria_name,
+ criteria_type,
+ include_property_bag,
+ metrics,
+ expected_metrics,
+ logger,
+ )
# Inject threshold from evaluator config when not present in raw results
# (e.g., PythonGrader/code evaluators don't emit a threshold column)
@@ -2550,7 +2575,12 @@ def _process_criteria_metrics(
def _extract_metric_values(
- criteria_name: str, criteria_type: str, metrics: Dict[str, Any], expected_metrics: List[str], logger: logging.Logger
+ criteria_name: str,
+ criteria_type: str,
+ include_property_bag: bool,
+ metrics: Dict[str, Any],
+ expected_metrics: List[str],
+ logger: logging.Logger,
) -> Dict[str, Dict[str, Any]]:
"""Extract and organize metric values by metric name.
@@ -2599,16 +2629,23 @@ def _extract_metric_values(
result_per_metric[metric] = {}
result_name, result_name_child_level, result_name_nested_child_level, derived_passed = _update_metric_value(
- criteria_type, result_per_metric[metric], metric_key, metric, metric_value, logger
- )
- _append_indirect_attachments_to_results(
- result_per_metric,
- result_name,
+ criteria_type,
+ result_per_metric[metric],
+ metric_key,
metric,
metric_value,
- result_name_child_level,
- result_name_nested_child_level,
+ logger,
+ include_property_bag=include_property_bag,
)
+ if result_name is not None:
+ _append_indirect_attachments_to_results(
+ result_per_metric,
+ result_name,
+ metric,
+ metric_value,
+ result_name_child_level,
+ result_name_nested_child_level,
+ )
if result_name == "label" and criteria_type == "azure_ai_evaluator" and derived_passed is not None:
_append_indirect_attachments_to_results(result_per_metric, "passed", metric, derived_passed, None, None)
@@ -2629,6 +2666,7 @@ def _update_metric_value(
metric: str,
metric_value: Any,
logger: logging.Logger,
+ include_property_bag: bool = False,
-) -> Tuple[str, str, str]:
+) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[bool]]:
"""Update metric dictionary with the appropriate field based on metric key.
@@ -2638,6 +2676,10 @@ def _update_metric_value(
:param criteria_type: Type of the evaluation criteria (e.g. 'azure_ai_evaluator')
:type criteria_type: str
+ :param include_property_bag: Whether non-standard metric fields should be preserved under
+ the AOAI result properties bag. This defaults to False for backwards compatibility
+ with direct helper callers; the AOAI conversion path passes the explicit value.
+ :type include_property_bag: bool
:param metric_dict: Dictionary to update with metric values
:type metric_dict: Dict[str, Any]
:param metric_key: Key name of the metric (determines field assignment)
@@ -2674,13 +2716,20 @@ def _update_metric_value(
result_name_nested_child_level = None
derived_passed = None
+ property_name = None if metric_key == metric else _get_result_property_name(metric_key)
+ if property_name and include_property_bag:
+ _ensure_properties_dict(metric_dict)
+ metric_dict["properties"][property_name] = metric_value
+ result_name = "properties"
+ result_name_child_level = property_name
+
if metric_key.endswith("_score") or metric_key == "score":
metric_dict["score"] = metric_value
result_name = "score"
elif metric_key == "passed":
metric_dict["passed"] = metric_value
result_name = "passed"
- elif metric_key.endswith("_result") or metric_key == "result" or metric_key.endswith("_label"):
+ elif metric_key.endswith("_result") or metric_key in ["result", "label"] or metric_key.endswith("_label"):
metric_dict["label"] = metric_value
result_name = "label"
if criteria_type == "azure_ai_evaluator":
@@ -2746,23 +2795,7 @@ def _update_metric_value(
result_name = "sample"
result_name_child_level = "usage"
result_name_nested_child_level = "completion_tokens"
- elif not any(
- metric_key.endswith(suffix)
- for suffix in [
- "_result",
- "_reason",
- "_threshold",
- "_label",
- "_score",
- "_model",
- "_finish_reason",
- "_sample_input",
- "_sample_output",
- "_total_tokens",
- "_prompt_tokens",
- "_completion_tokens",
- ]
- ):
+ elif not any(metric_key.endswith(suffix) for suffix in _RESULT_PROPERTY_SUFFIXES):
# If no score found yet and this doesn't match other patterns, use as score
if metric_key == metric and metric_dict.get("score", None) is None:
metric_dict["score"] = metric_value
@@ -2814,6 +2847,43 @@ def _ensure_usage_dict(metric_dict: Dict[str, Any]) -> None:
metric_dict["sample"]["usage"] = {}
+def _ensure_properties_dict(metric_dict: Dict[str, Any]) -> None:
+ """Ensure properties dictionary exists in metric_dict.
+
+ :param metric_dict: Metric dictionary to modify
+ :type metric_dict: Dict[str, Any]
+ :return: None (modifies metric_dict in place)
+ :rtype: None
+ """
+ if "properties" not in metric_dict:
+ metric_dict["properties"] = {}
+
+
+def _should_include_property_bag(evaluator_name: Optional[str], metrics: Dict[str, Any]) -> bool:
+ """Return whether AOAI result properties should be emitted for a custom evaluator result."""
+ if "custom_score" in metrics:
+ return True
+ return not (evaluator_name and evaluator_name.startswith("builtin."))
+
+
+def _get_result_property_name(metric_key: str) -> Optional[str]:
+ """Return the result property name for fields that should be preserved in properties."""
+ result_property_prefix = "custom_"
+ standard_metric_keys = {"score", "passed", "result", "label", "reason", "threshold", "sample"}
+
+ if metric_key.startswith(result_property_prefix) and len(metric_key) > len(result_property_prefix):
+ if any(metric_key.endswith(suffix) for suffix in _RESULT_PROPERTY_SUFFIXES):
+ return None
+ return metric_key[len(result_property_prefix) :]
+
+ if metric_key not in standard_metric_keys:
+ if any(metric_key.endswith(suffix) for suffix in _RESULT_PROPERTY_SUFFIXES):
+ return None
+ return metric_key
+
+ return None
+
+
def _create_result_object(
criteria_name: str,
metric: str,
@@ -2879,6 +2949,7 @@ def _create_result_object(
threshold = metric_values.get("threshold")
passed = metric_values.get("passed")
sample = metric_values.get("sample")
+ properties = metric_values.get("properties")
# Handle decrease boolean metrics
if is_inverse:
@@ -2898,6 +2969,8 @@ def _create_result_object(
if sample is not None:
result_obj["sample"] = sample
+ if properties is not None:
+ result_obj["properties"] = properties
return result_obj
@@ -3264,7 +3337,11 @@ def _append_indirect_attachments_to_results(
) = metric_value
-def _get_metric_from_criteria(testing_criteria_name: str, metric_key: str, metric_list: List[str]) -> str:
+def _get_metric_from_criteria(
+ testing_criteria_name: str,
+ metric_key: str,
+ metric_list: List[str],
+) -> str:
"""
Get the metric name from the testing criteria and metric key.
@@ -3291,6 +3368,9 @@ def _get_metric_from_criteria(testing_criteria_name: str, metric_key: str, metri
elif metric_key == "f1_result" or metric_key == "f1_threshold" or metric_key == "f1_score":
metric = "f1_score"
return metric
+ elif len(metric_list) == 1:
+ metric = metric_list[0]
+ return metric
for expected_metric in metric_list:
if metric_key.startswith(expected_metric):
metric = expected_metric
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/evaluation_util_convert_expected_output.json b/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/evaluation_util_convert_expected_output.json
index 6b40439c3ebd..0d88b1380898 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/evaluation_util_convert_expected_output.json
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/evaluation_util_convert_expected_output.json
@@ -208,6 +208,9 @@
"temperature": 1.0,
"top_p": 1.0,
"max_completions_tokens": 4096
+ },
+ "properties": {
+ "type": null
}
},
{
@@ -270,6 +273,9 @@
"content": "Let's think step by step: The response \"washington, d.c.\" is extremely brief and lacks any sentence structure, grammatical context, or vocabulary range. It does not convey a clear idea or message, making it difficult to understand. Based on the definitions, this aligns with Emergent Fluency as it shows minimal command of the language and is largely incomprehensible. \nThe response is fragmented and does not demonstrate any fluency traits such as grammar, vocabulary, or coherence. It is incomprehensible as a standalone statement. \n1 "
}
]
+ },
+ "properties": {
+ "gpt_fluency": 1.0
}
},
{
@@ -398,6 +404,9 @@
"temperature": 1.0,
"top_p": 1.0,
"max_completions_tokens": 4096
+ },
+ "properties": {
+ "type": null
}
},
{
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
index 47ef67eb4baa..a680bdbbb380 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
@@ -1,4 +1,4 @@
-from typing import List, Dict, Union
+from typing import List, Dict, Union, cast
import json
import logging
import math
@@ -26,6 +26,7 @@
SelfHarmEvaluator,
HateUnfairnessEvaluator,
AzureOpenAIModelConfiguration,
+ EvaluationResult,
)
from azure.ai.evaluation._aoai.label_grader import AzureOpenAILabelGrader
from azure.ai.evaluation._constants import (
@@ -1209,7 +1210,10 @@ def test_convert_results_to_aoai_evaluation_results(self):
eval_id = "test_eval_group_123"
eval_run_id = "test_run_456"
# Create EvaluationResult structure
- test_results = {"metrics": {"overall_score": 0.75}, "rows": test_rows, "studio_url": "https://test-studio.com"}
+ test_results = cast(
+ EvaluationResult,
+ {"metrics": {"overall_score": 0.75}, "rows": test_rows, "studio_url": "https://test-studio.com"},
+ )
# Test the conversion function
def run_test():
@@ -1247,6 +1251,12 @@ def run_test():
expected_results_json = json.load(f)
assert converted_results_json == expected_results_json
+ builtin_results = converted_results["_evaluation_results_list"][0]["results"]
+ labelgrader_result = next(result for result in builtin_results if result["name"] == "labelgrader")
+ fluency_result = next(result for result in builtin_results if result["name"] == "Fluency")
+ assert labelgrader_result["properties"] == {"type": None}
+ assert fluency_result["properties"] == {"gpt_fluency": 1.0}
+
# Verify metrics preserved
assert converted_results["metrics"]["overall_score"] == 0.75
@@ -1343,7 +1353,7 @@ def run_test():
assert "cached_tokens" in usage_item
# Test with empty results
- empty_results = {"metrics": {}, "rows": [], "studio_url": None}
+ empty_results = cast(EvaluationResult, {"metrics": {}, "rows": [], "studio_url": None})
_convert_results_to_aoai_evaluation_results(
results=empty_results, logger=logger, eval_run_id=eval_run_id, eval_id=eval_id, evaluators=evaluators
)
@@ -1353,6 +1363,55 @@ def run_test():
assert len(empty_converted["_evaluation_results_list"]) == 0
assert empty_converted["_evaluation_summary"]["result_counts"]["total"] == 0
+ property_results = cast(
+ EvaluationResult,
+ {
+ "metrics": {},
+ "rows": [
+ {
+ "inputs.query": "test query",
+ "outputs.friendly_evaluator_gh4y.custom_score": 4.5,
+ "outputs.friendly_evaluator_gh4y.custom_threshold": 3,
+ "outputs.friendly_evaluator_gh4y.label": False,
+ "outputs.friendly_evaluator_gh4y.custom_observation_flag": False,
+ "outputs.friendly_evaluator_gh4y.explanation": "Detailed attack reasoning",
+ "outputs.friendly_evaluator_gh4y.attack_phase": "probe",
+ }
+ ],
+ "studio_url": None,
+ },
+ )
+
+ _convert_results_to_aoai_evaluation_results(
+ results=property_results,
+ logger=logger,
+ eval_run_id=eval_run_id,
+ eval_id=eval_id,
+ evaluators={"friendly_evaluator_gh4y": lambda **kwargs: {"score": 1}},
+ eval_meta_data={
+ "testing_criteria": [
+ {
+ "name": "friendly_evaluator_gh4y",
+ "type": "quality",
+ "metrics": ["score"],
+ "evaluator_name": "builtin.friendly_evaluator_gh4y",
+ }
+ ]
+ },
+ )
+
+ property_result = property_results["_evaluation_results_list"][0]["results"][0]
+ assert property_result["properties"] == {
+ "observation_flag": False,
+ "explanation": "Detailed attack reasoning",
+ "attack_phase": "probe",
+ }
+ assert property_result["score"] == 4.5
+ assert property_result["reason"] is None
+ assert "explanation" not in property_result
+ assert property_result["threshold"] == 3
+ assert property_result["label"] is False
+
@patch(
"azure.ai.evaluation._evaluate._evaluate._map_names_to_builtins",
return_value={},
@@ -1826,6 +1885,33 @@ def test_nan_string_maps_to_none(self, suffix):
token_key = suffix.lstrip("_")
assert result["sample"]["usage"][token_key] is None
+ def test_legacy_helper_call_does_not_create_properties_bag(self):
+ metric_dict = {}
+ _update_metric_value(
+ criteria_type="azure_ai_evaluator",
+ metric_dict=metric_dict,
+ metric_key="custom_observation_flag",
+ metric="score",
+ metric_value=False,
+ logger=logging.getLogger("test"),
+ )
+
+ assert "properties" not in metric_dict
+
+ def test_include_property_bag_preserves_custom_fields(self):
+ metric_dict = {}
+ _update_metric_value(
+ criteria_type="azure_ai_evaluator",
+ include_property_bag=True,
+ metric_dict=metric_dict,
+ metric_key="custom_observation_flag",
+ metric="score",
+ metric_value=False,
+ logger=logging.getLogger("test"),
+ )
+
+ assert metric_dict["properties"] == {"observation_flag": False}
+
@pytest.mark.unittest
class TestBuildInternalLogAttributesThreshold: