diff --git a/src/uipath/eval/_helpers/helpers.py b/src/uipath/eval/_helpers/helpers.py
index 9318ca45b..f00f5f5e5 100644
--- a/src/uipath/eval/_helpers/helpers.py
+++ b/src/uipath/eval/_helpers/helpers.py
@@ -10,6 +10,33 @@
 from ..models import ErrorEvaluationResult, EvaluationResult
 
 
+def is_empty_value(value: Any) -> bool:
+    """Check if a value is empty or contains only empty values.
+
+    Handles multiple cases:
+    - None or empty string
+    - String with only whitespace
+    - Dict where all values are empty strings or whitespace
+    - Empty list or dict
+    """
+    if value is None:
+        return True
+
+    if isinstance(value, str):
+        return not value.strip()
+
+    if isinstance(value, dict):
+        if not value:  # Empty dict
+            return True
+        # Check if all values are empty strings
+        return all(isinstance(v, str) and not v.strip() for v in value.values())
+
+    if isinstance(value, list):
+        return len(value) == 0
+
+    return False
+
+
 def auto_discover_entrypoint() -> str:
     """Auto-discover entrypoint from config file.
 
diff --git a/src/uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py b/src/uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py
index 3dde8c6e1..6100f8355 100644
--- a/src/uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py
+++ b/src/uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py
@@ -1,5 +1,6 @@
 """LLM-as-a-judge evaluator for subjective quality assessment of agent outputs."""
 
+import logging
 from typing import Any, Optional
 
 from pydantic import field_validator
@@ -9,7 +10,14 @@
 from ..._utils.constants import COMMUNITY_agents_SUFFIX
 from ...platform.chat import UiPathLlmChatService
 from ...platform.chat.llm_gateway import RequiredToolChoice
-from ..models.models import AgentExecution, EvaluationResult, LLMResponse
+from .._helpers.helpers import is_empty_value
+from ..models.models import (
+    AgentExecution,
+    EvaluationResult,
+    LLMResponse,
+    UiPathEvaluationError,
+    UiPathEvaluationErrorCategory,
+)
 from .base_legacy_evaluator import (
     BaseLegacyEvaluator,
     LegacyEvaluationCriteria,
@@ -17,6 +25,8 @@
 )
 from .legacy_llm_helpers import create_evaluation_tool, extract_tool_call_response
 
+logger = logging.getLogger(__name__)
+
 
 class LegacyLlmAsAJudgeEvaluatorConfig(LegacyEvaluatorConfig):
     """Configuration for legacy LLM-as-a-judge evaluators."""
@@ -124,6 +134,19 @@ def _create_evaluation_prompt(
         self, expected_output: Any, actual_output: Any
     ) -> str:
         """Create the evaluation prompt for the LLM."""
+        # Validate that expected output is not empty
+        if is_empty_value(expected_output):
+            logger.error(
+                "❌ EMPTY_EXPECTED_OUTPUT: Expected output is empty or contains only empty values. "
+                f"Received: {repr(expected_output)}"
+            )
+            raise UiPathEvaluationError(
+                code="EMPTY_EXPECTED_OUTPUT",
+                title="Expected output cannot be empty",
+                detail="The evaluation criteria must contain a non-empty expected output.",
+                category=UiPathEvaluationErrorCategory.USER,
+            )
+
         formatted_prompt = self.prompt.replace(
             self.actual_output_placeholder,
             str(actual_output),
diff --git a/src/uipath/eval/evaluators/legacy_trajectory_evaluator.py b/src/uipath/eval/evaluators/legacy_trajectory_evaluator.py
index bdd37b79a..c0c1a3412 100644
--- a/src/uipath/eval/evaluators/legacy_trajectory_evaluator.py
+++ b/src/uipath/eval/evaluators/legacy_trajectory_evaluator.py
@@ -1,5 +1,6 @@
 """Trajectory evaluator for analyzing execution paths and decision sequences."""
 
+import logging
 from typing import Any, Optional
 
 from opentelemetry.sdk.trace import ReadableSpan
@@ -10,11 +11,14 @@
 from ..._utils.constants import COMMUNITY_agents_SUFFIX
 from ...platform.chat import UiPathLlmChatService
 from ...platform.chat.llm_gateway import RequiredToolChoice
+from .._helpers.helpers import is_empty_value
 from ..models.models import (
     AgentExecution,
     LLMResponse,
     NumericEvaluationResult,
     TrajectoryEvaluationTrace,
+    UiPathEvaluationError,
+    UiPathEvaluationErrorCategory,
 )
 from .base_legacy_evaluator import (
     BaseLegacyEvaluator,
@@ -23,6 +27,8 @@
 )
 from .legacy_llm_helpers import create_evaluation_tool, extract_tool_call_response
 
+logger = logging.getLogger(__name__)
+
 
 class LegacyTrajectoryEvaluatorConfig(LegacyEvaluatorConfig):
     """Configuration for legacy trajectory evaluators."""
@@ -103,6 +109,19 @@ def _create_evaluation_prompt(
         agent_run_history: Any,
     ) -> str:
         """Create the evaluation prompt for the LLM."""
+        # Validate that expected agent behavior is not empty
+        if is_empty_value(expected_agent_behavior):
+            logger.error(
+                "❌ EMPTY_EXPECTED_AGENT_BEHAVIOR: Expected agent behavior is empty or contains only empty values. "
+                f"Received: {repr(expected_agent_behavior)}"
+            )
+            raise UiPathEvaluationError(
+                code="EMPTY_EXPECTED_AGENT_BEHAVIOR",
+                title="Expected agent behavior cannot be empty",
+                detail="The evaluation criteria must contain a non-empty expected agent behavior.",
+                category=UiPathEvaluationErrorCategory.USER,
+            )
+
         formatted_prompt = self.prompt.replace(
             self.expected_agent_behavior_placeholder,
             str(expected_agent_behavior),
diff --git a/src/uipath/eval/evaluators/llm_as_judge_evaluator.py b/src/uipath/eval/evaluators/llm_as_judge_evaluator.py
index b734f3de1..e4c9bffd8 100644
--- a/src/uipath/eval/evaluators/llm_as_judge_evaluator.py
+++ b/src/uipath/eval/evaluators/llm_as_judge_evaluator.py
@@ -136,34 +136,6 @@ def _get_expected_output(self, evaluation_criteria: T) -> Any:
         """Get the expected output from the evaluation criteria. Must be implemented by concrete evaluator classes."""
         pass
 
-    def _is_empty_expected_output(self, expected_output: Any) -> bool:
-        """Check if the expected output is empty or contains only empty values.
-
-        Handles multiple cases:
-        - None or empty string
-        - String with only whitespace
-        - Dict where all values are empty strings or whitespace
-        - Empty list or dict
-        """
-        if expected_output is None:
-            return True
-
-        if isinstance(expected_output, str):
-            return not expected_output.strip()
-
-        if isinstance(expected_output, dict):
-            if not expected_output:  # Empty dict
-                return True
-            # Check if all values are empty strings
-            return all(
-                isinstance(v, str) and not v.strip() for v in expected_output.values()
-            )
-
-        if isinstance(expected_output, list):
-            return len(expected_output) == 0
-
-        return False
-
     async def evaluate(
         self,
         agent_execution: AgentExecution,
@@ -193,19 +165,6 @@ def _create_evaluation_prompt(
         """Create the evaluation prompt for the LLM."""
         expected_output = self._get_expected_output(evaluation_criteria)
 
-        # Validate that expected output is not empty
-        if self._is_empty_expected_output(expected_output):
-            logger.error(
-                "❌ EMPTY_EXPECTED_OUTPUT: Expected output is empty or contains only empty values. "
-                f"Received: {repr(expected_output)}"
-            )
-            raise UiPathEvaluationError(
-                code="EMPTY_EXPECTED_OUTPUT",
-                title="Expected output cannot be empty",
-                detail="The evaluation criteria must contain a non-empty expected output or expected agent behavior.",
-                category=UiPathEvaluationErrorCategory.USER,
-            )
-
         formatted_prompt = self.evaluator_config.prompt.replace(
             self.actual_output_placeholder,
             str(self._get_actual_output(agent_execution)),
diff --git a/tests/evaluators/test_helpers.py b/tests/evaluators/test_helpers.py
new file mode 100644
index 000000000..cb376fcbb
--- /dev/null
+++ b/tests/evaluators/test_helpers.py
@@ -0,0 +1,109 @@
+"""Test module for helper functions in uipath.eval._helpers.helpers."""
+
+from uipath.eval._helpers.helpers import is_empty_value
+
+
+class TestIsEmptyValue:
+    """Test is_empty_value helper function.
+
+    These tests are based on realistic evaluation criteria structures:
+    - expectedOutput: typically a dict like {"content": "..."} or empty dict {}
+    - expectedAgentBehavior: typically a string describing expected behavior
+    """
+
+    # --- Empty expectedAgentBehavior (string) cases ---
+
+    def test_empty_string_expected_agent_behavior(self) -> None:
+        """Test empty string expectedAgentBehavior is considered empty."""
+        assert is_empty_value("") is True
+
+    def test_whitespace_only_expected_agent_behavior(self) -> None:
+        """Test whitespace-only expectedAgentBehavior is considered empty."""
+        assert is_empty_value(" ") is True
+        assert is_empty_value("  ") is True
+        assert is_empty_value("\t") is True
+        assert is_empty_value("\n") is True
+        assert is_empty_value(" \t\n ") is True
+
+    def test_valid_expected_agent_behavior(self) -> None:
+        """Test non-empty expectedAgentBehavior strings are not empty."""
+        assert is_empty_value("The agent should search for weather") is False
+        assert is_empty_value("Call the get_user tool with id=123") is False
+        assert is_empty_value(" valid behavior ") is False
+
+    # --- Empty expectedOutput (dict) cases ---
+
+    def test_empty_dict_expected_output(self) -> None:
+        """Test empty dict expectedOutput is considered empty."""
+        # trajectory evaluator: {"expectedOutput": {}}
+        assert is_empty_value({}) is True
+
+    def test_dict_with_empty_content_field(self) -> None:
+        """Test dict with empty content field is considered empty."""
+        # llm-as-a-judge: {"expectedOutput": {"content": ""}}
+        assert is_empty_value({"content": ""}) is True
+
+    def test_dict_with_whitespace_content_field(self) -> None:
+        """Test dict with whitespace-only content field is considered empty."""
+        assert is_empty_value({"content": " "}) is True
+        assert is_empty_value({"content": "\t\n"}) is True
+
+    def test_dict_with_multiple_empty_string_fields(self) -> None:
+        """Test dict where all values are empty strings is considered empty."""
+        assert is_empty_value({"content": "", "reasoning": ""}) is True
+        assert is_empty_value({"content": " ", "reasoning": "\t"}) is True
+
+    def test_dict_with_valid_content_field(self) -> None:
+        """Test dict with non-empty content field is not empty."""
+        assert is_empty_value({"content": "Expected response"}) is False
+        assert is_empty_value({"content": "The answer is 42"}) is False
+
+    def test_dict_with_mixed_empty_and_non_empty_fields(self) -> None:
+        """Test dict with at least one non-empty value is not empty."""
+        # If any value is non-empty, the whole dict is not empty
+        assert is_empty_value({"content": "value", "reasoning": ""}) is False
+        assert is_empty_value({"content": "", "reasoning": "some reason"}) is False
+
+    # --- None case ---
+
+    def test_none_is_empty(self) -> None:
+        """Test that None is considered empty."""
+        assert is_empty_value(None) is True
+
+    # --- Empty list case ---
+
+    def test_empty_list_is_empty(self) -> None:
+        """Test that empty list is considered empty."""
+        assert is_empty_value([]) is True
+
+    def test_non_empty_list_is_not_empty(self) -> None:
+        """Test that non-empty lists are not considered empty."""
+        assert is_empty_value(["step1", "step2"]) is False
+        assert is_empty_value([{"content": ""}]) is False
+
+    # --- Edge cases with non-string dict values ---
+
+    def test_dict_with_non_string_values_is_not_empty(self) -> None:
+        """Test that dict with non-string values is not considered empty.
+
+        The function only checks if all values are empty strings.
+        Non-string values (int, bool, None, list, dict) make it non-empty.
+        """
+        assert is_empty_value({"content": 0}) is False
+        assert is_empty_value({"content": False}) is False
+        assert is_empty_value({"content": None}) is False
+        assert is_empty_value({"content": []}) is False
+        assert is_empty_value({"content": {}}) is False
+
+    # --- Other types (not typical in eval criteria but handled) ---
+
+    def test_numeric_values_are_not_empty(self) -> None:
+        """Test that numeric values are not considered empty."""
+        assert is_empty_value(0) is False
+        assert is_empty_value(42) is False
+        assert is_empty_value(0.0) is False
+
+    def test_boolean_values_are_not_empty(self) -> None:
+        """Test that boolean values are not considered empty."""
+        assert is_empty_value(False) is False
+        assert is_empty_value(True) is False
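Usage sketch (illustrative, not part of the patch): a minimal example of how the shared is_empty_value helper guards prompt construction in the evaluators above. The build_prompt function and the {{ExpectedOutput}}/{{ActualOutput}} placeholder strings are hypothetical stand-ins; only is_empty_value, UiPathEvaluationError, and UiPathEvaluationErrorCategory come from this diff.

from uipath.eval._helpers.helpers import is_empty_value
from uipath.eval.models.models import (
    UiPathEvaluationError,
    UiPathEvaluationErrorCategory,
)


def build_prompt(template: str, expected_output: object, actual_output: object) -> str:
    """Hypothetical helper mirroring the guard added in _create_evaluation_prompt."""
    # Reject empty evaluation criteria before building the LLM prompt, matching
    # the EMPTY_EXPECTED_OUTPUT error the evaluators now raise.
    if is_empty_value(expected_output):
        raise UiPathEvaluationError(
            code="EMPTY_EXPECTED_OUTPUT",
            title="Expected output cannot be empty",
            detail="The evaluation criteria must contain a non-empty expected output.",
            category=UiPathEvaluationErrorCategory.USER,
        )
    # Placeholder names are illustrative; the real evaluators read them from config.
    return template.replace("{{ExpectedOutput}}", str(expected_output)).replace(
        "{{ActualOutput}}", str(actual_output)
    )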