Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions src/uipath/eval/_helpers/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,33 @@
from ..models import ErrorEvaluationResult, EvaluationResult


def is_empty_value(value: Any) -> bool:
    """Return True when *value* is empty or holds only blank strings.

    Cases treated as empty:
    - ``None``
    - a string that is blank or whitespace-only
    - an empty dict, or a dict whose values are all blank strings
    - an empty list

    Anything else (numbers, booleans, lists with elements, dicts with a
    non-string or non-blank value) is considered non-empty.
    """
    if value is None:
        return True

    if isinstance(value, str):
        return value.strip() == ""

    if isinstance(value, dict):
        # An empty dict is empty outright; otherwise every value must be
        # a blank string for the whole dict to count as empty.
        return not value or all(
            isinstance(item, str) and item.strip() == "" for item in value.values()
        )

    if isinstance(value, list):
        return not value

    return False


def auto_discover_entrypoint() -> str:
"""Auto-discover entrypoint from config file.

Expand Down
25 changes: 24 additions & 1 deletion src/uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""LLM-as-a-judge evaluator for subjective quality assessment of agent outputs."""

import logging
from typing import Any, Optional

from pydantic import field_validator
Expand All @@ -9,14 +10,23 @@
from ..._utils.constants import COMMUNITY_agents_SUFFIX
from ...platform.chat import UiPathLlmChatService
from ...platform.chat.llm_gateway import RequiredToolChoice
from ..models.models import AgentExecution, EvaluationResult, LLMResponse
from .._helpers.helpers import is_empty_value
from ..models.models import (
AgentExecution,
EvaluationResult,
LLMResponse,
UiPathEvaluationError,
UiPathEvaluationErrorCategory,
)
from .base_legacy_evaluator import (
BaseLegacyEvaluator,
LegacyEvaluationCriteria,
LegacyEvaluatorConfig,
)
from .legacy_llm_helpers import create_evaluation_tool, extract_tool_call_response

logger = logging.getLogger(__name__)


class LegacyLlmAsAJudgeEvaluatorConfig(LegacyEvaluatorConfig):
"""Configuration for legacy LLM-as-a-judge evaluators."""
Expand Down Expand Up @@ -124,6 +134,19 @@ def _create_evaluation_prompt(
self, expected_output: Any, actual_output: Any
) -> str:
"""Create the evaluation prompt for the LLM."""
# Validate that expected output is not empty
if is_empty_value(expected_output):
logger.error(
"❌ EMPTY_EXPECTED_OUTPUT: Expected output is empty or contains only empty values. "
f"Received: {repr(expected_output)}"
)
raise UiPathEvaluationError(
code="EMPTY_EXPECTED_OUTPUT",
title="Expected output cannot be empty",
detail="The evaluation criteria must contain a non-empty expected output.",
category=UiPathEvaluationErrorCategory.USER,
)

formatted_prompt = self.prompt.replace(
self.actual_output_placeholder,
str(actual_output),
Expand Down
19 changes: 19 additions & 0 deletions src/uipath/eval/evaluators/legacy_trajectory_evaluator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Trajectory evaluator for analyzing execution paths and decision sequences."""

import logging
from typing import Any, Optional

from opentelemetry.sdk.trace import ReadableSpan
Expand All @@ -10,11 +11,14 @@
from ..._utils.constants import COMMUNITY_agents_SUFFIX
from ...platform.chat import UiPathLlmChatService
from ...platform.chat.llm_gateway import RequiredToolChoice
from .._helpers.helpers import is_empty_value
from ..models.models import (
AgentExecution,
LLMResponse,
NumericEvaluationResult,
TrajectoryEvaluationTrace,
UiPathEvaluationError,
UiPathEvaluationErrorCategory,
)
from .base_legacy_evaluator import (
BaseLegacyEvaluator,
Expand All @@ -23,6 +27,8 @@
)
from .legacy_llm_helpers import create_evaluation_tool, extract_tool_call_response

logger = logging.getLogger(__name__)


class LegacyTrajectoryEvaluatorConfig(LegacyEvaluatorConfig):
"""Configuration for legacy trajectory evaluators."""
Expand Down Expand Up @@ -103,6 +109,19 @@ def _create_evaluation_prompt(
agent_run_history: Any,
) -> str:
"""Create the evaluation prompt for the LLM."""
# Validate that expected agent behavior is not empty
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add tests

if is_empty_value(expected_agent_behavior):
logger.error(
"❌ EMPTY_EXPECTED_AGENT_BEHAVIOR: Expected agent behavior is empty or contains only empty values. "
f"Received: {repr(expected_agent_behavior)}"
)
raise UiPathEvaluationError(
code="EMPTY_EXPECTED_AGENT_BEHAVIOR",
title="Expected agent behavior cannot be empty",
detail="The evaluation criteria must contain a non-empty expected agent behavior.",
category=UiPathEvaluationErrorCategory.USER,
)

formatted_prompt = self.prompt.replace(
self.expected_agent_behavior_placeholder,
str(expected_agent_behavior),
Expand Down
41 changes: 0 additions & 41 deletions src/uipath/eval/evaluators/llm_as_judge_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,34 +136,6 @@ def _get_expected_output(self, evaluation_criteria: T) -> Any:
"""Get the expected output from the evaluation criteria. Must be implemented by concrete evaluator classes."""
pass

def _is_empty_expected_output(self, expected_output: Any) -> bool:
"""Check if the expected output is empty or contains only empty values.

Handles multiple cases:
- None or empty string
- String with only whitespace
- Dict where all values are empty strings or whitespace
- Empty list or dict
"""
if expected_output is None:
return True

if isinstance(expected_output, str):
return not expected_output.strip()

if isinstance(expected_output, dict):
if not expected_output: # Empty dict
return True
# Check if all values are empty strings
return all(
isinstance(v, str) and not v.strip() for v in expected_output.values()
)

if isinstance(expected_output, list):
return len(expected_output) == 0

return False

async def evaluate(
self,
agent_execution: AgentExecution,
Expand Down Expand Up @@ -193,19 +165,6 @@ def _create_evaluation_prompt(
"""Create the evaluation prompt for the LLM."""
expected_output = self._get_expected_output(evaluation_criteria)

# Validate that expected output is not empty
if self._is_empty_expected_output(expected_output):
logger.error(
"❌ EMPTY_EXPECTED_OUTPUT: Expected output is empty or contains only empty values. "
f"Received: {repr(expected_output)}"
)
raise UiPathEvaluationError(
code="EMPTY_EXPECTED_OUTPUT",
title="Expected output cannot be empty",
detail="The evaluation criteria must contain a non-empty expected output or expected agent behavior.",
category=UiPathEvaluationErrorCategory.USER,
)

formatted_prompt = self.evaluator_config.prompt.replace(
self.actual_output_placeholder,
str(self._get_actual_output(agent_execution)),
Expand Down
109 changes: 109 additions & 0 deletions tests/evaluators/test_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
"""Test module for helper functions in uipath.eval._helpers.helpers."""

from uipath.eval._helpers.helpers import is_empty_value


class TestIsEmptyValue:
    """Tests for the is_empty_value helper.

    Inputs mirror realistic evaluation criteria shapes:
    - expectedOutput: usually a dict such as {"content": "..."} or an empty {}
    - expectedAgentBehavior: usually a free-form string describing behavior
    """

    # --- Empty expectedAgentBehavior (string) cases ---

    def test_empty_string_expected_agent_behavior(self) -> None:
        """An empty expectedAgentBehavior string counts as empty."""
        assert is_empty_value("") is True

    def test_whitespace_only_expected_agent_behavior(self) -> None:
        """Whitespace-only expectedAgentBehavior strings count as empty."""
        for blank in (" ", "  ", "\t", "\n", " \t\n "):
            assert is_empty_value(blank) is True

    def test_valid_expected_agent_behavior(self) -> None:
        """Strings with real content are not empty."""
        for behavior in (
            "The agent should search for weather",
            "Call the get_user tool with id=123",
            " valid behavior ",
        ):
            assert is_empty_value(behavior) is False

    # --- Empty expectedOutput (dict) cases ---

    def test_empty_dict_expected_output(self) -> None:
        """An empty dict expectedOutput counts as empty."""
        # trajectory evaluator shape: {"expectedOutput": {}}
        assert is_empty_value({}) is True

    def test_dict_with_empty_content_field(self) -> None:
        """A dict whose only field is an empty string counts as empty."""
        # llm-as-a-judge shape: {"expectedOutput": {"content": ""}}
        assert is_empty_value({"content": ""}) is True

    def test_dict_with_whitespace_content_field(self) -> None:
        """A dict whose content field is only whitespace counts as empty."""
        for blank in (" ", "\t\n"):
            assert is_empty_value({"content": blank}) is True

    def test_dict_with_multiple_empty_string_fields(self) -> None:
        """A dict where every value is a blank string counts as empty."""
        for criteria in (
            {"content": "", "reasoning": ""},
            {"content": " ", "reasoning": "\t"},
        ):
            assert is_empty_value(criteria) is True

    def test_dict_with_valid_content_field(self) -> None:
        """A dict with a non-blank content field is not empty."""
        for criteria in (
            {"content": "Expected response"},
            {"content": "The answer is 42"},
        ):
            assert is_empty_value(criteria) is False

    def test_dict_with_mixed_empty_and_non_empty_fields(self) -> None:
        """One non-blank value makes the whole dict non-empty."""
        for criteria in (
            {"content": "value", "reasoning": ""},
            {"content": "", "reasoning": "some reason"},
        ):
            assert is_empty_value(criteria) is False

    # --- None case ---

    def test_none_is_empty(self) -> None:
        """None counts as empty."""
        assert is_empty_value(None) is True

    # --- Empty list case ---

    def test_empty_list_is_empty(self) -> None:
        """An empty list counts as empty."""
        assert is_empty_value([]) is True

    def test_non_empty_list_is_not_empty(self) -> None:
        """Any list with elements is not empty, regardless of contents."""
        for items in (["step1", "step2"], [{"content": ""}]):
            assert is_empty_value(items) is False

    # --- Edge cases with non-string dict values ---

    def test_dict_with_non_string_values_is_not_empty(self) -> None:
        """Dicts holding any non-string value are not empty.

        Only all-blank-string values make a dict empty; ints, bools,
        None, lists, and nested dicts all keep it non-empty.
        """
        for inner in (0, False, None, [], {}):
            assert is_empty_value({"content": inner}) is False

    # --- Other types (not typical in eval criteria but handled) ---

    def test_numeric_values_are_not_empty(self) -> None:
        """Numbers — including zero — are never empty."""
        for number in (0, 42, 0.0):
            assert is_empty_value(number) is False

    def test_boolean_values_are_not_empty(self) -> None:
        """Booleans — including False — are never empty."""
        for flag in (False, True):
            assert is_empty_value(flag) is False