Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions apps/cli/src/commands/eval/commands/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,16 @@ export const evalRunCommand = command({
description:
'Per-test score threshold (0-1, default 0.8). Exit 1 if any test scores below this value',
}),
tag: multioption({
type: array(string),
long: 'tag',
description: 'Only run eval files that have this tag (repeatable, AND logic)',
}),
excludeTag: multioption({
type: array(string),
long: 'exclude-tag',
description: 'Skip eval files that have this tag (repeatable, file skipped if any match)',
}),
},
handler: async (args) => {
// Launch interactive wizard when no eval paths and stdin is a TTY
Expand Down Expand Up @@ -224,6 +234,8 @@ export const evalRunCommand = command({
model: args.model,
outputMessages: args.outputMessages,
threshold: args.threshold,
tag: args.tag,
excludeTag: args.excludeTag,
};
const result = await runEvalCommand({ testFiles: resolvedPaths, rawOptions });
if (result?.thresholdFailed) {
Expand Down
78 changes: 73 additions & 5 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@ interface NormalizedOptions {
readonly model?: string;
readonly outputMessages: number | 'all';
readonly threshold?: number;
readonly tags: readonly string[];
readonly excludeTags: readonly string[];
}

function normalizeBoolean(value: unknown): boolean {
Expand Down Expand Up @@ -140,6 +142,43 @@ function normalizeWorkspaceMode(value: unknown): 'pooled' | 'temp' | 'static' |
return value === 'pooled' || value === 'temp' || value === 'static' ? value : undefined;
}

/**
 * Normalize a raw option value into a list of non-empty strings.
 *
 * - Anything that is not an array yields an empty list.
 * - Non-string entries and blank / whitespace-only strings are dropped.
 * - Surviving entries are returned trimmed, so a padded value such as
 *   `" agent "` compares equal to the tag `agent` instead of being kept
 *   verbatim (the emptiness check already ignored the padding).
 *
 * @param value - Raw, untyped option value (e.g. from a repeatable CLI flag).
 * @returns The cleaned entries in their original order.
 */
function normalizeStringArray(value: unknown): readonly string[] {
  if (!Array.isArray(value)) {
    return [];
  }
  const cleaned: string[] = [];
  for (const entry of value) {
    if (typeof entry !== 'string') continue;
    const trimmed = entry.trim();
    // Keep the trimmed form: emptiness is judged on it, so it is also what we return.
    if (trimmed.length > 0) cleaned.push(trimmed);
  }
  return cleaned;
}

/**
 * Decide whether an eval file passes the `--tag` / `--exclude-tag` filters.
 *
 * A file passes when BOTH of the following hold:
 * - it carries every tag listed in `includeTags` (all required tags present), and
 * - it carries none of the tags listed in `excludeTags` (a single match rejects it).
 *
 * A file with no tags (`undefined` or empty) therefore fails as soon as any
 * include tag is requested, but passes when only exclude tags are given.
 * With both filter lists empty every file passes.
 *
 * @param fileTags - Tags declared by the eval file, if any.
 * @param includeTags - Tags that must all be present on the file.
 * @param excludeTags - Tags of which none may be present on the file.
 * @returns `true` when the file should be kept, `false` when it should be skipped.
 */
export function matchesTagFilters(
  fileTags: readonly string[] | undefined,
  includeTags: readonly string[],
  excludeTags: readonly string[],
): boolean {
  const present = new Set(fileTags ?? []);
  const hasTag = (tag: string): boolean => present.has(tag);

  // `every` is vacuously true for an empty include list, so no length guard is needed.
  const hasAllRequired = includeTags.every(hasTag);
  const hasAnyExcluded = excludeTags.some(hasTag);

  return hasAllRequired && !hasAnyExcluded;
}

/**
* Normalize --output-messages value. Accepts a number (>= 1) or "all".
* Defaults to 1 (last assistant message only).
Expand Down Expand Up @@ -304,6 +343,8 @@ function normalizeOptions(
model: normalizeString(rawOptions.model),
outputMessages: normalizeOutputMessages(normalizeString(rawOptions.outputMessages)),
threshold: normalizeOptionalNumber(rawOptions.threshold),
tags: normalizeStringArray(rawOptions.tag),
excludeTags: normalizeStringArray(rawOptions.excludeTag),
} satisfies NormalizedOptions;
}

Expand Down Expand Up @@ -434,6 +475,7 @@ async function prepareFileMetadata(params: {
readonly totalBudgetUsd?: number;
readonly failOnError?: FailOnError;
readonly threshold?: number;
readonly tags?: readonly string[];
}> {
const { testFilePath, repoRoot, cwd, options } = params;

Expand Down Expand Up @@ -524,6 +566,7 @@ async function prepareFileMetadata(params: {
totalBudgetUsd: suite.totalBudgetUsd,
failOnError: suite.failOnError,
threshold: suite.threshold,
tags: suite.metadata?.tags,
};
}

Expand Down Expand Up @@ -970,6 +1013,7 @@ export async function runEvalCommand(
readonly totalBudgetUsd?: number;
readonly failOnError?: FailOnError;
readonly threshold?: number;
readonly tags?: readonly string[];
}
>();
// Separate TypeScript/JS eval files from YAML files.
Expand Down Expand Up @@ -1006,6 +1050,27 @@ export async function runEvalCommand(
fileMetadata.set(testFilePath, meta);
}

// Apply --tag / --exclude-tag filtering at the eval-file level
const hasTagFilters = options.tags.length > 0 || options.excludeTags.length > 0;
if (hasTagFilters) {
const skippedFiles: string[] = [];
for (const [testFilePath, meta] of fileMetadata.entries()) {
if (!matchesTagFilters(meta.tags, options.tags, options.excludeTags)) {
fileMetadata.delete(testFilePath);
skippedFiles.push(path.relative(cwd, testFilePath));
}
}
if (skippedFiles.length > 0 && options.verbose) {
console.log(
`Skipped ${skippedFiles.length} eval file(s) by tag filter: ${skippedFiles.join(', ')}`,
);
}
if (fileMetadata.size === 0) {
console.log('No eval files matched the tag filters. Nothing to run.');
return;
}
}

// Resolve cache: combine CLI flags with YAML config
// Use first file's YAML config for cache settings (consistent across a run)
const firstMeta = fileMetadata.values().next().value;
Expand Down Expand Up @@ -1116,8 +1181,11 @@ export async function runEvalCommand(
}
}

// Use only files that survived tag filtering (fileMetadata keys)
const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f));

try {
await runWithLimit(resolvedTestFiles, fileConcurrency, async (testFilePath) => {
await runWithLimit(activeTestFiles, fileConcurrency, async (testFilePath) => {
const targetPrep = fileMetadata.get(testFilePath);
if (!targetPrep) {
throw new Error(`Missing metadata for ${testFilePath}`);
Expand Down Expand Up @@ -1208,7 +1276,7 @@ export async function runEvalCommand(
}

if (usesDefaultArtifactWorkspace) {
const evalFile = resolvedTestFiles.length === 1 ? resolvedTestFiles[0] : '';
const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : '';
const workspaceDir = path.dirname(outputPath);
const {
testArtifactDir,
Expand All @@ -1230,7 +1298,7 @@ export async function runEvalCommand(
// Write companion artifacts (grading, timing, benchmark) if requested
if (options.artifacts) {
const artifactsDir = path.resolve(options.artifacts);
const evalFile = resolvedTestFiles.length === 1 ? resolvedTestFiles[0] : '';
const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : '';
const {
testArtifactDir,
indexPath,
Expand Down Expand Up @@ -1275,7 +1343,7 @@ export async function runEvalCommand(

// Suggest retry-errors command when execution errors are detected
if (summary.executionErrorCount > 0 && !options.retryErrors) {
const evalFileArgs = resolvedTestFiles.map((f) => path.relative(cwd, f)).join(' ');
const evalFileArgs = activeTestFiles.map((f) => path.relative(cwd, f)).join(' ');
const targetFlag = options.target ? ` --target ${options.target}` : '';
const relativeOutputPath = path.relative(cwd, outputPath);
console.log(
Expand All @@ -1287,7 +1355,7 @@ export async function runEvalCommand(
return {
executionErrorCount: summary.executionErrorCount,
outputPath,
testFiles: resolvedTestFiles,
testFiles: activeTestFiles,
target: options.target,
thresholdFailed,
};
Expand Down
73 changes: 73 additions & 0 deletions apps/cli/test/commands/eval/tag-filtering.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import { describe, expect, it } from 'bun:test';

import { matchesTagFilters } from '../../../src/commands/eval/run-eval.js';

// Unit tests for matchesTagFilters(fileTags, includeTags, excludeTags).
// Argument order in every call below: the file's own tags, then the --tag
// (include) list, then the --exclude-tag list.
describe('matchesTagFilters', () => {
// With both filter lists empty, every file is accepted regardless of its tags.
describe('no filters', () => {
it('accepts files with tags', () => {
expect(matchesTagFilters(['agent', 'slow'], [], [])).toBe(true);
});

it('accepts files without tags', () => {
expect(matchesTagFilters(undefined, [], [])).toBe(true);
});

it('accepts files with empty tags', () => {
expect(matchesTagFilters([], [], [])).toBe(true);
});
});

// Include filter: the file must carry every requested tag.
describe('--tag (include)', () => {
it('accepts file with matching tag', () => {
expect(matchesTagFilters(['agent', 'fast'], ['agent'], [])).toBe(true);
});

it('rejects file without matching tag', () => {
expect(matchesTagFilters(['slow', 'multi-provider'], ['agent'], [])).toBe(false);
});

it('requires all specified tags (AND logic)', () => {
expect(matchesTagFilters(['agent', 'fast'], ['agent', 'fast'], [])).toBe(true);
// Only one of the two required tags is present, so the file is rejected.
expect(matchesTagFilters(['agent'], ['agent', 'fast'], [])).toBe(false);
});

it('rejects files with no tags when --tag is specified', () => {
// Both undefined and an empty list count as "no tags".
expect(matchesTagFilters(undefined, ['agent'], [])).toBe(false);
expect(matchesTagFilters([], ['agent'], [])).toBe(false);
});
});

// Exclude filter: a single excluded tag on the file is enough to reject it.
describe('--exclude-tag', () => {
it('accepts file without excluded tag', () => {
expect(matchesTagFilters(['agent', 'fast'], [], ['slow'])).toBe(true);
});

it('rejects file with excluded tag', () => {
expect(matchesTagFilters(['agent', 'slow'], [], ['slow'])).toBe(false);
});

// NOTE(review): the "(AND logic)" in this test name reads as "none of the
// excluded tags may be present"; both cases show that one match rejects the file.
it('rejects file if any excluded tag is present (AND logic)', () => {
expect(matchesTagFilters(['agent', 'slow'], [], ['slow', 'flaky'])).toBe(false);
expect(matchesTagFilters(['agent', 'flaky'], [], ['slow', 'flaky'])).toBe(false);
});

it('accepts files with no tags when only --exclude-tag is specified', () => {
expect(matchesTagFilters(undefined, [], ['slow'])).toBe(true);
expect(matchesTagFilters([], [], ['slow'])).toBe(true);
});
});

// Both filters together: the include condition and the exclude condition must both hold.
describe('combined --tag and --exclude-tag', () => {
it('accepts file matching include and not matching exclude', () => {
expect(matchesTagFilters(['agent', 'fast'], ['agent'], ['slow'])).toBe(true);
});

it('rejects file matching include but also matching exclude', () => {
expect(matchesTagFilters(['agent', 'slow'], ['agent'], ['slow'])).toBe(false);
});

it('rejects file not matching include even if not matching exclude', () => {
expect(matchesTagFilters(['fast'], ['agent'], ['slow'])).toBe(false);
});
});
});
Loading