feat: add support for line-by-line evaluation

AAgnihotry · claude · AAgnihotry · commit 8750e34ee311 · 2026-03-23T10:18:06.000-07:00
This commit adds line-by-line evaluation capability to output evaluators, allowing them to evaluate multi-line outputs on a per-line basis and provide granular feedback with partial credit scoring.

Key changes:
- Added lineByLineEvaluator config flag to OutputEvaluatorConfig
- Added lineDelimiter config to customize split behavior (default: "\n")
- Implemented _evaluate_line_by_line() method in BaseOutputEvaluator
- Fixed runtime aggregation to handle line-by-line sub-results
- Fixed targetOutputKey wrapping for individual line evaluations
- Added sample agent demonstrating the feature (samples/line_by_line_test)

Benefits:
- Provides partial credit (e.g., 2/3 lines correct = 0.67 score)
- More granular feedback with per-line results
- Useful for evaluating structured multi-line outputs

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
diff --git a/packages/uipath/samples/line_by_line_test/bindings.json b/packages/uipath/samples/line_by_line_test/bindings.json
@@ -0,0 +1,4 @@
+{
+    "version": "2.0",
+    "resources": []
+}
diff --git a/packages/uipath/samples/line_by_line_test/entry-points.json b/packages/uipath/samples/line_by_line_test/entry-points.json
@@ -0,0 +1,5 @@
+{
+    "$schema": "https://cloud.uipath.com/draft/2024-12/entry-point",
+    "$id": "entry-points.json",
+    "entryPoints": []
+}
diff --git a/packages/uipath/samples/line_by_line_test/evaluations/eval-sets/default.json b/packages/uipath/samples/line_by_line_test/evaluations/eval-sets/default.json
@@ -0,0 +1,68 @@
+{
+  "version": "1.0",
+  "id": "line-by-line-test",
+  "name": "Line-by-Line Evaluation Test",
+  "evaluatorRefs": [
+    "LineByLineExactMatch",
+    "RegularExactMatch"
+  ],
+  "evaluations": [
+    {
+      "id": "test-all-lines-match",
+      "name": "Test all lines match exactly",
+      "inputs": {
+        "items": ["apple", "banana", "cherry"]
+      },
+      "evaluationCriterias": {
+        "LineByLineExactMatch": {
+          "expectedOutput": {
+            "result": "Item: apple\nItem: banana\nItem: cherry"
+          }
+        },
+        "RegularExactMatch": {
+          "expectedOutput": {
+            "result": "Item: apple\nItem: banana\nItem: cherry"
+          }
+        }
+      }
+    },
+    {
+      "id": "test-partial-line-mismatch",
+      "name": "Test when one line doesn't match",
+      "inputs": {
+        "items": ["apple", "banana", "cherry"]
+      },
+      "evaluationCriterias": {
+        "LineByLineExactMatch": {
+          "expectedOutput": {
+            "result": "Item: apple\nItem: WRONG\nItem: cherry"
+          }
+        },
+        "RegularExactMatch": {
+          "expectedOutput": {
+            "result": "Item: apple\nItem: WRONG\nItem: cherry"
+          }
+        }
+      }
+    },
+    {
+      "id": "test-single-item",
+      "name": "Test with single item",
+      "inputs": {
+        "items": ["orange"]
+      },
+      "evaluationCriterias": {
+        "LineByLineExactMatch": {
+          "expectedOutput": {
+            "result": "Item: orange"
+          }
+        },
+        "RegularExactMatch": {
+          "expectedOutput": {
+            "result": "Item: orange"
+          }
+        }
+      }
+    }
+  ]
+}
diff --git a/packages/uipath/samples/line_by_line_test/evaluations/eval-sets/simple_test.json b/packages/uipath/samples/line_by_line_test/evaluations/eval-sets/simple_test.json
@@ -0,0 +1,22 @@
+{
+  "version": "1.0",
+  "id": "simple-test",
+  "name": "Simple Test",
+  "evaluatorRefs": ["LineByLineExactMatch"],
+  "evaluations": [
+    {
+      "id": "test-1",
+      "name": "Single test",
+      "inputs": {
+        "items": ["apple"]
+      },
+      "evaluationCriterias": {
+        "LineByLineExactMatch": {
+          "expectedOutput": {
+            "result": "Item: apple"
+          }
+        }
+      }
+    }
+  ]
+}
diff --git a/packages/uipath/samples/line_by_line_test/evaluations/evaluators/line-by-line-exact-match.json b/packages/uipath/samples/line_by_line_test/evaluations/evaluators/line-by-line-exact-match.json
@@ -0,0 +1,14 @@
+{
+  "version": "1.0",
+  "id": "LineByLineExactMatch",
+  "description": "Evaluates each line of output separately using exact match",
+  "evaluatorTypeId": "uipath-exact-match",
+  "evaluatorConfig": {
+    "name": "LineByLineExactMatch",
+    "targetOutputKey": "result",
+    "lineByLineEvaluator": true,
+    "lineDelimiter": "\n",
+    "ignoreCase": false,
+    "negated": false
+  }
+}
diff --git a/packages/uipath/samples/line_by_line_test/evaluations/evaluators/regular-exact-match.json b/packages/uipath/samples/line_by_line_test/evaluations/evaluators/regular-exact-match.json
@@ -0,0 +1,13 @@
+{
+  "version": "1.0",
+  "id": "RegularExactMatch",
+  "description": "Regular exact match evaluator (for comparison)",
+  "evaluatorTypeId": "uipath-exact-match",
+  "evaluatorConfig": {
+    "name": "RegularExactMatch",
+    "targetOutputKey": "result",
+    "lineByLineEvaluator": false,
+    "ignoreCase": false,
+    "negated": false
+  }
+}
diff --git a/packages/uipath/samples/line_by_line_test/main.py b/packages/uipath/samples/line_by_line_test/main.py
@@ -0,0 +1,34 @@
+"""Simple agent to test line-by-line evaluation.
+
+This agent takes a list of items and outputs one result per line.
+"""
+
+from pydantic import BaseModel
+
+
+class Input(BaseModel):
+    """Input schema: the list of items the agent turns into output lines."""
+
+    items: list[str]
+
+
+class Output(BaseModel):
+    """Output schema: all processed items in one string, one item per line."""
+
+    result: str
+
+
+def main(input_data: Input) -> Output:
+    """Format every input item as its own line of the result string.
+
+    Args:
+        input_data: Input whose ``items`` are formatted in order
+
+    Returns:
+        Output whose ``result`` joins one "Item: <item>" line per item
+    """
+    formatted_lines = [
+        f"Item: {entry}" for entry in input_data.items
+    ]
+
+    return Output(result="\n".join(formatted_lines))
diff --git a/packages/uipath/samples/line_by_line_test/uipath.json b/packages/uipath/samples/line_by_line_test/uipath.json
@@ -0,0 +1,17 @@
+{
+  "$schema": "https://cloud.uipath.com/draft/2024-12/uipath",
+  "runtimeOptions": {
+    "isConversational": false
+  },
+  "packOptions": {
+    "fileExtensionsIncluded": [],
+    "filesIncluded": [],
+    "filesExcluded": [],
+    "directoriesExcluded": [],
+    "includeUvLock": true
+  },
+  "functions": {
+    "main": "main.py:main"
+  },
+  "agents": {}
+}
diff --git a/packages/uipath/src/uipath/eval/evaluators/output_evaluator.py b/packages/uipath/src/uipath/eval/evaluators/output_evaluator.py
diff --git a/packages/uipath/src/uipath/eval/runtime/runtime.py b/packages/uipath/src/uipath/eval/runtime/runtime.py
diff --git a/packages/uipath/tests/evaluators/test_evaluator_methods.py b/packages/uipath/tests/evaluators/test_evaluator_methods.py

-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +{
 +    "version": "2.0",
 +    "resources": []
 +}