Skip to content

Commit 8750e34

Browse files
AAgnihotry and claude committed
feat: add support for line-by-line evaluation
This commit adds line-by-line evaluation capability to output evaluators, allowing them to evaluate multi-line outputs on a per-line basis and provide granular feedback with partial credit scoring.

Key changes:
- Added lineByLineEvaluator config flag to OutputEvaluatorConfig
- Added lineDelimiter config to customize split behavior (default: "\n")
- Implemented _evaluate_line_by_line() method in BaseOutputEvaluator
- Fixed runtime aggregation to handle line-by-line sub-results
- Fixed targetOutputKey wrapping for individual line evaluations
- Added sample agent demonstrating the feature (samples/line_by_line_test)

Benefits:
- Provides partial credit (e.g., 2/3 lines correct = 0.67 score)
- More granular feedback with per-line results
- Useful for evaluating structured multi-line outputs

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
1 parent e395d37 commit 8750e34

11 files changed

Lines changed: 584 additions & 2 deletions

File tree

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"version": "2.0",
3+
"resources": []
4+
}
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{
2+
"$schema": "https://cloud.uipath.com/draft/2024-12/entry-point",
3+
"$id": "entry-points.json",
4+
"entryPoints": []
5+
}
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
{
2+
"version": "1.0",
3+
"id": "line-by-line-test",
4+
"name": "Line-by-Line Evaluation Test",
5+
"evaluatorRefs": [
6+
"LineByLineExactMatch",
7+
"RegularExactMatch"
8+
],
9+
"evaluations": [
10+
{
11+
"id": "test-all-lines-match",
12+
"name": "Test all lines match exactly",
13+
"inputs": {
14+
"items": ["apple", "banana", "cherry"]
15+
},
16+
"evaluationCriterias": {
17+
"LineByLineExactMatch": {
18+
"expectedOutput": {
19+
"result": "Item: apple\nItem: banana\nItem: cherry"
20+
}
21+
},
22+
"RegularExactMatch": {
23+
"expectedOutput": {
24+
"result": "Item: apple\nItem: banana\nItem: cherry"
25+
}
26+
}
27+
}
28+
},
29+
{
30+
"id": "test-partial-line-mismatch",
31+
"name": "Test when one line doesn't match",
32+
"inputs": {
33+
"items": ["apple", "banana", "cherry"]
34+
},
35+
"evaluationCriterias": {
36+
"LineByLineExactMatch": {
37+
"expectedOutput": {
38+
"result": "Item: apple\nItem: WRONG\nItem: cherry"
39+
}
40+
},
41+
"RegularExactMatch": {
42+
"expectedOutput": {
43+
"result": "Item: apple\nItem: WRONG\nItem: cherry"
44+
}
45+
}
46+
}
47+
},
48+
{
49+
"id": "test-single-item",
50+
"name": "Test with single item",
51+
"inputs": {
52+
"items": ["orange"]
53+
},
54+
"evaluationCriterias": {
55+
"LineByLineExactMatch": {
56+
"expectedOutput": {
57+
"result": "Item: orange"
58+
}
59+
},
60+
"RegularExactMatch": {
61+
"expectedOutput": {
62+
"result": "Item: orange"
63+
}
64+
}
65+
}
66+
}
67+
]
68+
}
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
{
2+
"version": "1.0",
3+
"id": "simple-test",
4+
"name": "Simple Test",
5+
"evaluatorRefs": ["LineByLineExactMatch"],
6+
"evaluations": [
7+
{
8+
"id": "test-1",
9+
"name": "Single test",
10+
"inputs": {
11+
"items": ["apple"]
12+
},
13+
"evaluationCriterias": {
14+
"LineByLineExactMatch": {
15+
"expectedOutput": {
16+
"result": "Item: apple"
17+
}
18+
}
19+
}
20+
}
21+
]
22+
}
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
{
2+
"version": "1.0",
3+
"id": "LineByLineExactMatch",
4+
"description": "Evaluates each line of output separately using exact match",
5+
"evaluatorTypeId": "uipath-exact-match",
6+
"evaluatorConfig": {
7+
"name": "LineByLineExactMatch",
8+
"targetOutputKey": "result",
9+
"lineByLineEvaluator": true,
10+
"lineDelimiter": "\n",
11+
"ignoreCase": false,
12+
"negated": false
13+
}
14+
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
{
2+
"version": "1.0",
3+
"id": "RegularExactMatch",
4+
"description": "Regular exact match evaluator (for comparison)",
5+
"evaluatorTypeId": "uipath-exact-match",
6+
"evaluatorConfig": {
7+
"name": "RegularExactMatch",
8+
"targetOutputKey": "result",
9+
"lineByLineEvaluator": false,
10+
"ignoreCase": false,
11+
"negated": false
12+
}
13+
}
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
"""Simple agent to test line-by-line evaluation.
2+
3+
This agent takes a list of items and outputs one result per line.
4+
"""
5+
6+
from pydantic import BaseModel
7+
8+
9+
class Input(BaseModel):
    """Input schema."""

    # Strings to be formatted by the agent, kept in the order supplied.
    # NOTE(review): the class docstring is left verbatim because pydantic can
    # surface it as the model's schema description - confirm before rewording.
    items: list[str]
13+
14+
15+
class Output(BaseModel):
    """Output schema."""

    # Single string carrying the agent's whole response.
    # NOTE(review): the class docstring is left verbatim because pydantic can
    # surface it as the model's schema description - confirm before rewording.
    result: str
19+
20+
21+
def main(input_data: Input) -> Output:
    """Format every input item as an ``Item: <value>`` line.

    Args:
        input_data: Input whose ``items`` are formatted in the order given.

    Returns:
        Output whose ``result`` is the formatted lines joined with newline
        characters; an empty ``items`` list yields an empty string.
    """
    formatted_lines = [f"Item: {entry}" for entry in input_data.items]
    return Output(result="\n".join(formatted_lines))
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
{
2+
"$schema": "https://cloud.uipath.com/draft/2024-12/uipath",
3+
"runtimeOptions": {
4+
"isConversational": false
5+
},
6+
"packOptions": {
7+
"fileExtensionsIncluded": [],
8+
"filesIncluded": [],
9+
"filesExcluded": [],
10+
"directoriesExcluded": [],
11+
"includeUvLock": true
12+
},
13+
"functions": {
14+
"main": "main.py:main"
15+
},
16+
"agents": {}
17+
}

0 commit comments

Comments
 (0)