diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts index 5df5ee42..95ee3ac8 100644 --- a/apps/cli/src/commands/eval/commands/run.ts +++ b/apps/cli/src/commands/eval/commands/run.ts @@ -178,7 +178,8 @@ export const evalRunCommand = command({ threshold: option({ type: optional(number), long: 'threshold', - description: 'Suite-level quality gate: exit 1 if mean score falls below this value (0-1)', + description: + 'Per-test score threshold (0-1, default 0.8). Exit 1 if any test scores below this value', }), }, handler: async (args) => { diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 32d3318f..ff53b8b5 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -46,7 +46,6 @@ import { calculateEvaluationSummary, formatEvaluationSummary, formatMatrixSummary, - formatThresholdSummary, } from './statistics.js'; import { type TargetSelection, selectMultipleTargets, selectTarget } from './targets.js'; @@ -568,6 +567,7 @@ async function runSingleEvalFile(params: { readonly matrixMode?: boolean; readonly totalBudgetUsd?: number; readonly failOnError?: FailOnError; + readonly threshold?: number; }): Promise<{ results: EvaluationResult[] }> { const { testFilePath, @@ -685,6 +685,7 @@ async function runSingleEvalFile(params: { failOnError, graderTarget: options.graderTarget, model: options.model, + threshold: options.threshold, streamCallbacks: streamingObserver?.getStreamCallbacks(), onResult: async (result: EvaluationResult) => { ( @@ -1162,6 +1163,7 @@ export async function runEvalCommand( matrixMode: targetPrep.selections.length > 1, totalBudgetUsd: targetPrep.totalBudgetUsd, failOnError: targetPrep.failOnError, + threshold: resolvedThreshold, }); return result.results; @@ -1185,16 +1187,13 @@ export async function runEvalCommand( ); } - const summary = calculateEvaluationSummary(allResults); - 
console.log(formatEvaluationSummary(summary)); + const thresholdOpts = + resolvedThreshold !== undefined ? { threshold: resolvedThreshold } : undefined; + const summary = calculateEvaluationSummary(allResults, thresholdOpts); + console.log(formatEvaluationSummary(summary, thresholdOpts)); - // Threshold quality gate check - let thresholdFailed = false; - if (resolvedThreshold !== undefined) { - const thresholdResult = formatThresholdSummary(summary.mean, resolvedThreshold); - console.log(`\n${thresholdResult.message}`); - thresholdFailed = !thresholdResult.passed; - } + // Exit code matches RESULT verdict: fail if any test scored below threshold. + const thresholdFailed = resolvedThreshold !== undefined && summary.qualityFailureCount > 0; // Print matrix summary when multiple targets were evaluated if (isMatrixMode && allResults.length > 0) { diff --git a/apps/cli/src/commands/eval/statistics.ts b/apps/cli/src/commands/eval/statistics.ts index 910052d2..38aa4c50 100644 --- a/apps/cli/src/commands/eval/statistics.ts +++ b/apps/cli/src/commands/eval/statistics.ts @@ -84,6 +84,7 @@ function buildHistogram(values: readonly number[]): readonly HistogramBin[] { export function calculateEvaluationSummary( results: readonly EvaluationResult[], + options?: { threshold?: number }, ): EvaluationSummary { const total = results.length; @@ -132,10 +133,19 @@ export function calculateEvaluationSummary( const topResults = sortedResults.slice(0, Math.min(3, sortedResults.length)); const bottomResults = sortedResults.slice(-Math.min(3, sortedResults.length)); - // Count by execution status + // Count by execution status. When a custom threshold is provided, + // recompute passed/failed from raw scores instead of executionStatus + // (which uses the hardcoded PASS_THRESHOLD of 0.8). 
const executionErrorCount = executionErrors.length; - const qualityFailureCount = results.filter((r) => r.executionStatus === 'quality_failure').length; - const passedCount = results.filter((r) => r.executionStatus === 'ok').length; + const scoreThreshold = options?.threshold; + const passedCount = + scoreThreshold !== undefined + ? qualityResults.filter((r) => r.score >= scoreThreshold).length + : results.filter((r) => r.executionStatus === 'ok').length; + const qualityFailureCount = + scoreThreshold !== undefined + ? qualityResults.filter((r) => r.score < scoreThreshold).length + : results.filter((r) => r.executionStatus === 'quality_failure').length; // Aggregate by failure stage and reason (execution errors only) const byFailureStage: Record = {}; @@ -174,7 +184,10 @@ function formatScore(value: number): string { return value.toFixed(3); } -export function formatEvaluationSummary(summary: EvaluationSummary): string { +export function formatEvaluationSummary( + summary: EvaluationSummary, + options?: { threshold?: number }, +): string { if (summary.total === 0) { return '\nNo results to summarize'; } @@ -193,14 +206,16 @@ export function formatEvaluationSummary(summary: EvaluationSummary): string { lines.push(''); } - // Overall verdict line + // Overall verdict: all non-error cases must score >= per-test threshold. + const gradedCount = summary.total - summary.executionErrorCount; + const threshold = options?.threshold ?? 0.8; const overallPassed = - summary.passedCount === summary.total - summary.executionErrorCount || + summary.passedCount === gradedCount || (summary.qualityFailureCount === 0 && summary.executionErrorCount === 0); const overallVerdict = overallPassed ? 'PASS' : 'FAIL'; const useColor = !(process.env.NO_COLOR !== undefined) && (process.stdout.isTTY ?? false); const verdictColor = overallPassed ? 
'\x1b[32m' : '\x1b[31m'; - const verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${summary.total} passed, mean score: ${formatScore(summary.mean)})`; + const verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${gradedCount} scored >= ${threshold}, mean: ${formatScore(summary.mean)})`; lines.push('\n=================================================='); if (useColor) { @@ -334,17 +349,3 @@ export function formatMatrixSummary(results: readonly EvaluationResult[]): strin return lines.join('\n'); } - -/** - * Format a threshold check summary line. - * Returns whether the threshold was met and the formatted message. - */ -export function formatThresholdSummary( - meanScore: number, - threshold: number, -): { passed: boolean; message: string } { - const passed = meanScore >= threshold; - const verdict = passed ? 'PASS' : 'FAIL'; - const message = `Suite score: ${meanScore.toFixed(2)} (threshold: ${threshold.toFixed(2)}) — ${verdict}`; - return { passed, message }; -} diff --git a/apps/cli/test/commands/eval/threshold.test.ts b/apps/cli/test/commands/eval/threshold.test.ts index 65c05916..0d729ecf 100644 --- a/apps/cli/test/commands/eval/threshold.test.ts +++ b/apps/cli/test/commands/eval/threshold.test.ts @@ -1,31 +1,49 @@ import { describe, expect, it } from 'bun:test'; -import { formatThresholdSummary } from '../../../src/commands/eval/statistics.js'; - -describe('formatThresholdSummary', () => { - it('returns PASS when mean score meets threshold', () => { - const result = formatThresholdSummary(0.85, 0.6); - expect(result.passed).toBe(true); - expect(result.message).toContain('0.85'); - expect(result.message).toContain('0.60'); - expect(result.message).toContain('PASS'); +import type { EvaluationResult } from '@agentv/core'; + +import { calculateEvaluationSummary } from '../../../src/commands/eval/statistics.js'; + +function makeResult(testId: string, score: number): EvaluationResult { + return { + testId, + score, + executionStatus: 
score >= 0.8 ? 'ok' : 'quality_failure', + } as EvaluationResult; +} + +describe('calculateEvaluationSummary with threshold', () => { + const results: EvaluationResult[] = [ + makeResult('test-1', 1.0), + makeResult('test-2', 0.6), + makeResult('test-3', 0.9), + makeResult('test-4', 0.4), + ]; + + it('uses default 0.8 threshold when no threshold provided', () => { + const summary = calculateEvaluationSummary(results); + // test-1 (1.0) and test-3 (0.9) pass at 0.8 + expect(summary.passedCount).toBe(2); + expect(summary.qualityFailureCount).toBe(2); }); - it('returns FAIL when mean score is below threshold', () => { - const result = formatThresholdSummary(0.53, 0.6); - expect(result.passed).toBe(false); - expect(result.message).toContain('0.53'); - expect(result.message).toContain('0.60'); - expect(result.message).toContain('FAIL'); + it('recomputes passed/failed with custom threshold', () => { + const summary = calculateEvaluationSummary(results, { threshold: 0.5 }); + // test-1 (1.0), test-2 (0.6), test-3 (0.9) pass at 0.5 + expect(summary.passedCount).toBe(3); + expect(summary.qualityFailureCount).toBe(1); }); - it('returns PASS when mean score exactly equals threshold', () => { - const result = formatThresholdSummary(0.6, 0.6); - expect(result.passed).toBe(true); + it('stricter threshold reduces pass count', () => { + const summary = calculateEvaluationSummary(results, { threshold: 0.95 }); + // only test-1 (1.0) passes at 0.95 + expect(summary.passedCount).toBe(1); + expect(summary.qualityFailureCount).toBe(3); }); - it('returns PASS for threshold 0 with any score', () => { - const result = formatThresholdSummary(0, 0); - expect(result.passed).toBe(true); + it('threshold 0 passes everything', () => { + const summary = calculateEvaluationSummary(results, { threshold: 0 }); + expect(summary.passedCount).toBe(4); + expect(summary.qualityFailureCount).toBe(0); }); }); diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx 
b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx index 0388223d..71ce71a8 100644 --- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx +++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx @@ -242,7 +242,7 @@ When halted, remaining tests are recorded with `failureReasonCode: 'error_thresh ### Suite-Level Quality Threshold -Set a minimum mean score for the eval suite. If the mean quality score falls below the threshold, the CLI exits with code 1 — useful for CI/CD quality gates. +Set a per-test score threshold for the eval suite. Each test case must score at or above this value to pass. If any test scores below the threshold, the CLI exits with code 1 — useful for CI/CD quality gates. **CLI flag:** @@ -257,12 +257,12 @@ execution: threshold: 0.8 ``` -The CLI `--threshold` flag overrides the YAML value. The threshold is a number between 0 and 1. Mean score is computed from quality results only (execution errors are excluded). +The CLI `--threshold` flag overrides the YAML value. The threshold is a number between 0 and 1 (default: 0.8). Execution errors are excluded from the count. -When active, a summary line is printed after the eval results: +When active, the summary line shows how many tests met the threshold: ``` -Suite score: 0.85 (threshold: 0.80) — PASS +RESULT: PASS (28/31 scored >= 0.8, mean: 0.927) ``` The threshold also controls JUnit XML pass/fail: tests with scores below the threshold are marked as `<failure>` in JUnit output. When no threshold is set, JUnit defaults to 0.5. 
diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 27e1ce6f..521c5659 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -77,8 +77,8 @@ import { type PromptInputs, buildPromptInputs, loadTests } from './yaml-parser.j type MaybePromise = T | Promise; -function classifyQualityStatus(score: number): ExecutionStatus { - return score >= PASS_THRESHOLD ? 'ok' : 'quality_failure'; +function classifyQualityStatus(score: number, threshold = PASS_THRESHOLD): ExecutionStatus { + return score >= threshold ? 'ok' : 'quality_failure'; } function buildSkippedEvaluatorError( @@ -194,6 +194,8 @@ export interface RunEvalCaseOptions { readonly evalDir?: string; /** Include verbose request details in results (e.g. agent input text) */ readonly verbose?: boolean; + /** Per-test score threshold for pass/fail (default: 0.8) */ + readonly threshold?: number; } export interface ProgressEvent { @@ -261,6 +263,8 @@ export interface RunEvaluationOptions { readonly graderTarget?: string; /** CLI override: model for grader target (e.g., "openai:gpt-5-mini") */ readonly model?: string; + /** Per-test score threshold for pass/fail (default: 0.8) */ + readonly threshold?: number; } export async function runEvaluation( @@ -299,6 +303,7 @@ export async function runEvaluation( retainOnFailure, graderTarget: cliGraderTarget, model: cliModel, + threshold: scoreThreshold, } = options; // Disable cache when trials > 1 (cache makes trials deterministic = pointless) @@ -475,6 +480,7 @@ export async function runEvaluation( agentTimeoutMs, targetResolver, availableTargets, + threshold: scoreThreshold, }); } catch (error) { if (verbose) { @@ -933,6 +939,7 @@ export async function runEvaluation( repoManager, evalDir, verbose, + threshold: scoreThreshold, }; let result = trials && trials.count > 1 @@ -1123,6 +1130,7 @@ async function runBatchEvaluation(options: { readonly agentTimeoutMs?: 
number; readonly targetResolver?: (name: string) => Provider | undefined; readonly availableTargets?: readonly string[]; + readonly threshold?: number; }): Promise { const { evalCases, @@ -1138,6 +1146,7 @@ async function runBatchEvaluation(options: { agentTimeoutMs, targetResolver, availableTargets, + threshold: batchThreshold, } = options; // Prepare prompt inputs up front so we can reuse them for grading. @@ -1246,6 +1255,7 @@ async function runBatchEvaluation(options: { targetResolver, availableTargets, verbose, + threshold: batchThreshold, }); if (providerError) { @@ -1337,6 +1347,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise { const { evalCase, @@ -2041,6 +2054,7 @@ async function evaluateCandidate(options: { availableTargets, fileChanges, workspacePath, + threshold: evalThreshold, } = options; const gradeTimestamp = nowFn(); @@ -2124,7 +2138,7 @@ async function evaluateCandidate(options: { scores: scores, trace: trace, fileChanges, - executionStatus: classifyQualityStatus(score.score), + executionStatus: classifyQualityStatus(score.score, evalThreshold), }; }