Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,21 @@

LOGGER = logging.getLogger(__name__)

# Column-name suffixes that mark standard AOAI result fields (label/score/
# reason/threshold, model metadata, and sample/usage token counters).
# Keys ending in one of these are mapped to their dedicated result fields and
# are deliberately excluded from the generic "properties" bag.
_RESULT_PROPERTY_SUFFIXES = (
    "_result",
    "_reason",
    "_threshold",
    "_label",
    "_score",
    "_model",
    "_finish_reason",
    "_sample_input",
    "_sample_output",
    "_total_tokens",
    "_prompt_tokens",
    "_completion_tokens",
)

# For metrics (aggregates) whose metric names intentionally differ from their
# originating column name, usually because the aggregation of the original value
# means something sufficiently different.
Expand Down Expand Up @@ -2511,9 +2526,11 @@ def _process_criteria_metrics(
{"input": "...", "output": "..."}
)
"""
expected_metrics = testing_criteria_metadata.get(criteria_name, {}).get("metrics", [])
criteria_type = testing_criteria_metadata.get(criteria_name, {}).get("type", "")
is_inverse = testing_criteria_metadata.get(criteria_name, {}).get("is_inverse", False)
criteria_metadata = testing_criteria_metadata.get(criteria_name, {})
expected_metrics = criteria_metadata.get("metrics", [])
criteria_type = criteria_metadata.get("type", "")
evaluator_name = criteria_metadata.get("evaluator_name", "")
is_inverse = criteria_metadata.get("is_inverse", False)

if _is_none_or_nan(criteria_type) or _is_none_or_nan(criteria_name):
logger.warning(
Expand All @@ -2522,7 +2539,15 @@ def _process_criteria_metrics(
return ([], {})

# Extract metric values
result_per_metric = _extract_metric_values(criteria_name, criteria_type, metrics, expected_metrics, logger)
include_property_bag = _should_include_property_bag(evaluator_name, metrics)
result_per_metric = _extract_metric_values(
criteria_name,
criteria_type,
include_property_bag,
metrics,
expected_metrics,
logger,
)

# Inject threshold from evaluator config when not present in raw results
# (e.g., PythonGrader/code evaluators don't emit a threshold column)
Expand Down Expand Up @@ -2550,7 +2575,12 @@ def _process_criteria_metrics(


def _extract_metric_values(
criteria_name: str, criteria_type: str, metrics: Dict[str, Any], expected_metrics: List[str], logger: logging.Logger
criteria_name: str,
criteria_type: str,
include_property_bag: bool,
metrics: Dict[str, Any],
expected_metrics: List[str],
logger: logging.Logger,
) -> Dict[str, Dict[str, Any]]:
"""Extract and organize metric values by metric name.

Expand Down Expand Up @@ -2599,16 +2629,23 @@ def _extract_metric_values(
result_per_metric[metric] = {}

result_name, result_name_child_level, result_name_nested_child_level, derived_passed = _update_metric_value(
criteria_type, result_per_metric[metric], metric_key, metric, metric_value, logger
)
_append_indirect_attachments_to_results(
result_per_metric,
result_name,
criteria_type,
result_per_metric[metric],
metric_key,
metric,
metric_value,
result_name_child_level,
result_name_nested_child_level,
logger,
include_property_bag=include_property_bag,
)
if result_name is not None:
_append_indirect_attachments_to_results(
result_per_metric,
result_name,
metric,
metric_value,
result_name_child_level,
result_name_nested_child_level,
)
if result_name == "label" and criteria_type == "azure_ai_evaluator" and derived_passed is not None:
_append_indirect_attachments_to_results(result_per_metric, "passed", metric, derived_passed, None, None)

Expand All @@ -2629,6 +2666,7 @@ def _update_metric_value(
metric: str,
metric_value: Any,
logger: logging.Logger,
include_property_bag: bool = False,
) -> Tuple[str, str, str]:
"""Update metric dictionary with the appropriate field based on metric key.

Expand All @@ -2638,6 +2676,10 @@ def _update_metric_value(

:param criteria_type: Type of the evaluation criteria (e.g. 'azure_ai_evaluator')
:type criteria_type: str
:param include_property_bag: Whether non-standard metric fields should be preserved under
the AOAI result properties bag. This defaults to False for backwards compatibility
with direct helper callers; the AOAI conversion path passes the explicit value.
:type include_property_bag: bool
:param metric_dict: Dictionary to update with metric values
:type metric_dict: Dict[str, Any]
:param metric_key: Key name of the metric (determines field assignment)
Expand Down Expand Up @@ -2674,13 +2716,20 @@ def _update_metric_value(
result_name_nested_child_level = None
derived_passed = None

property_name = None if metric_key == metric else _get_result_property_name(metric_key)
if property_name and include_property_bag:
_ensure_properties_dict(metric_dict)
metric_dict["properties"][property_name] = metric_value
result_name = "properties"
result_name_child_level = property_name

if metric_key.endswith("_score") or metric_key == "score":
metric_dict["score"] = metric_value
result_name = "score"
elif metric_key == "passed":
metric_dict["passed"] = metric_value
result_name = "passed"
elif metric_key.endswith("_result") or metric_key == "result" or metric_key.endswith("_label"):
elif metric_key.endswith("_result") or metric_key in ["result", "label"] or metric_key.endswith("_label"):
metric_dict["label"] = metric_value
result_name = "label"
if criteria_type == "azure_ai_evaluator":
Expand Down Expand Up @@ -2746,23 +2795,7 @@ def _update_metric_value(
result_name = "sample"
result_name_child_level = "usage"
result_name_nested_child_level = "completion_tokens"
elif not any(
metric_key.endswith(suffix)
for suffix in [
"_result",
"_reason",
"_threshold",
"_label",
"_score",
"_model",
"_finish_reason",
"_sample_input",
"_sample_output",
"_total_tokens",
"_prompt_tokens",
"_completion_tokens",
]
):
elif not any(metric_key.endswith(suffix) for suffix in _RESULT_PROPERTY_SUFFIXES):
# If no score found yet and this doesn't match other patterns, use as score
if metric_key == metric and metric_dict.get("score", None) is None:
metric_dict["score"] = metric_value
Expand Down Expand Up @@ -2814,6 +2847,43 @@ def _ensure_usage_dict(metric_dict: Dict[str, Any]) -> None:
metric_dict["sample"]["usage"] = {}


def _ensure_properties_dict(metric_dict: Dict[str, Any]) -> None:
"""Ensure properties dictionary exists in metric_dict.

:param metric_dict: Metric dictionary to modify
:type metric_dict: Dict[str, Any]
:return: None (modifies metric_dict in place)
:rtype: None
"""
if "properties" not in metric_dict:
metric_dict["properties"] = {}


def _should_include_property_bag(evaluator_name: Optional[str], metrics: Dict[str, Any]) -> bool:
"""Return whether AOAI result properties should be emitted for a custom evaluator result."""
if "custom_score" in metrics:
return True
return not (evaluator_name and evaluator_name.startswith("builtin."))


def _get_result_property_name(metric_key: str) -> Optional[str]:
    """Return the result property name for fields that should be preserved in properties.

    Keys are excluded (``None`` returned) when they are one of the standard
    result fields or end in a standard-field suffix; a ``custom_`` prefix is
    stripped from the returned property name.

    :param metric_key: Raw metric key from the evaluation results.
    :type metric_key: str
    :return: Property name to store the value under, or ``None`` when the key
        maps to a standard result field instead of the properties bag.
    :rtype: Optional[str]
    """
    result_property_prefix = "custom_"
    standard_metric_keys = {"score", "passed", "result", "label", "reason", "threshold", "sample"}

    # Standard top-level fields never go into the properties bag.
    if metric_key in standard_metric_keys:
        return None
    # str.endswith accepts a tuple, so no any(...) loop is needed. Keys with a
    # standard-field suffix are routed to dedicated result fields elsewhere.
    if metric_key.endswith(_RESULT_PROPERTY_SUFFIXES):
        return None
    # Strip a non-empty "custom_" prefix; a bare "custom_" key is kept as-is.
    if metric_key.startswith(result_property_prefix) and len(metric_key) > len(result_property_prefix):
        return metric_key[len(result_property_prefix) :]
    return metric_key


def _create_result_object(
criteria_name: str,
metric: str,
Expand Down Expand Up @@ -2879,6 +2949,7 @@ def _create_result_object(
threshold = metric_values.get("threshold")
passed = metric_values.get("passed")
sample = metric_values.get("sample")
properties = metric_values.get("properties")

# Handle decrease boolean metrics
if is_inverse:
Expand All @@ -2898,6 +2969,8 @@ def _create_result_object(

if sample is not None:
result_obj["sample"] = sample
if properties is not None:
result_obj["properties"] = properties

return result_obj

Expand Down Expand Up @@ -3264,7 +3337,11 @@ def _append_indirect_attachments_to_results(
) = metric_value


def _get_metric_from_criteria(testing_criteria_name: str, metric_key: str, metric_list: List[str]) -> str:
def _get_metric_from_criteria(
testing_criteria_name: str,
metric_key: str,
metric_list: List[str],
) -> str:
"""
Get the metric name from the testing criteria and metric key.

Expand All @@ -3291,6 +3368,9 @@ def _get_metric_from_criteria(testing_criteria_name: str, metric_key: str, metri
elif metric_key == "f1_result" or metric_key == "f1_threshold" or metric_key == "f1_score":
metric = "f1_score"
return metric
elif len(metric_list) == 1:
metric = metric_list[0]
return metric
for expected_metric in metric_list:
if metric_key.startswith(expected_metric):
metric = expected_metric
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,9 @@
"temperature": 1.0,
"top_p": 1.0,
"max_completions_tokens": 4096
},
"properties": {
"type": null
}
},
{
Expand Down Expand Up @@ -270,6 +273,9 @@
"content": "<S0>Let's think step by step: The response \"washington, d.c.\" is extremely brief and lacks any sentence structure, grammatical context, or vocabulary range. It does not convey a clear idea or message, making it difficult to understand. Based on the definitions, this aligns with Emergent Fluency as it shows minimal command of the language and is largely incomprehensible.</S0> \n<S1>The response is fragmented and does not demonstrate any fluency traits such as grammar, vocabulary, or coherence. It is incomprehensible as a standalone statement.</S1> \n<S2>1</S2> "
}
]
},
"properties": {
"gpt_fluency": 1.0
}
},
{
Expand Down Expand Up @@ -398,6 +404,9 @@
"temperature": 1.0,
"top_p": 1.0,
"max_completions_tokens": 4096
},
"properties": {
"type": null
}
},
{
Expand Down
Loading