Skip to content

Commit 146eddf

Browse files
jahooma and claude committed
evalbuff: use Codebuff SDK, direct LLM API, and improve quality
Replace CLI spawning with Codebuff SDK for agent execution and Vercel AI SDK for LLM calls (5x faster prompt generation). Add base2-free-evals agent with noAskUser. Use local git clones with hardlinks for near-instant repo setup. Filter trivial commits, use average reviewer scores, inline traces into doc writer prompts, and add adaptive improvement thresholds. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 694ae0b commit 146eddf

File tree

12 files changed

+433
-133
lines changed

12 files changed

+433
-133
lines changed

agents/base2/base2-free-evals.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import { createBase2 } from './base2'
2+
/**
 * Agent definition registered under the id `base2-free-evals`.
 *
 * Built from `createBase2('free', { noAskUser: true })` and then relabelled.
 * NOTE(review): presumably `noAskUser` stops the agent from prompting a human
 * during unattended eval runs — confirm against `createBase2`.
 */
const freeEvalsBase = createBase2('free', { noAskUser: true })

const definition = {
  ...freeEvalsBase,
  // Listed after the spread so these values win over any `id` / `displayName`
  // that the base definition supplies.
  id: 'base2-free-evals',
  displayName: 'Buffy the Free Evals Orchestrator',
}

export default definition

bun.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

evalbuff/package.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,10 @@
1414
"run": "bun run src/run-evalbuff.ts"
1515
},
1616
"dependencies": {
17+
"@ai-sdk/anthropic": "^2.0.50",
1718
"@codebuff/common": "workspace:*",
1819
"@codebuff/sdk": "workspace:*",
20+
"ai": "^5.0.0",
1921
"zod": "^4.2.1"
2022
}
2123
}

evalbuff/src/__tests__/e2e.test.ts

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -40,14 +40,25 @@ mock.module('../test-repo-utils', () => ({
4040
},
4141
}))
4242

43-
mock.module('../cli-runner', () => ({
44-
runCliAgent: async () => ({
45-
diff: 'mock diff content',
46-
durationMs: 1000,
47-
exitCode: 0,
48-
stdout: 'mock stdout',
49-
stderr: '',
50-
}),
43+
mock.module('../runners/codebuff', () => ({
44+
CodebuffRunner: class {
45+
constructor() {}
46+
async run() {
47+
return {
48+
steps: [{ type: 'text', content: 'mock trace' }],
49+
totalCostUsd: 0.01,
50+
diff: 'mock diff content',
51+
}
52+
}
53+
},
54+
}))
55+
56+
mock.module('@codebuff/sdk', () => ({
57+
CodebuffClient: class {
58+
constructor() {}
59+
async run() { return { output: { type: 'success' }, sessionState: null } }
60+
},
61+
loadLocalAgents: async () => ({}),
5162
}))
5263

5364
// Judge returns alternating scores: low (triggers doc edit), then higher (confirms improvement)
@@ -126,7 +137,7 @@ describe('evalbuff E2E', () => {
126137
await runLearnMode({
127138
mode: 'learn',
128139
repoPath: repoDir,
129-
agentCommand: 'echo',
140+
agentId: 'base2-free-evals',
130141
parallelism: 1,
131142
maxCostUsd: 50,
132143
agentTimeoutMs: 10_000,

evalbuff/src/__tests__/loop.integration.test.ts

Lines changed: 28 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -32,20 +32,30 @@ mock.module('../test-repo-utils', () => ({
3232
},
3333
}))
3434

35-
// Mock CLI runner to return a fake result
36-
mock.module('../cli-runner', () => ({
37-
runCliAgent: async () => {
38-
cliRunnerCallCount++
39-
return {
40-
diff: 'mock diff content',
41-
durationMs: 1000,
42-
exitCode: 0,
43-
stdout: 'mock stdout',
44-
stderr: '',
35+
// Mock CodebuffRunner to return a fake result
36+
mock.module('../runners/codebuff', () => ({
37+
CodebuffRunner: class {
38+
constructor() {}
39+
async run() {
40+
cliRunnerCallCount++
41+
return {
42+
steps: [{ type: 'text', content: 'mock trace' }],
43+
totalCostUsd: 0.01,
44+
diff: 'mock diff content',
45+
}
4546
}
4647
},
4748
}))
4849

50+
// Mock SDK client and loadLocalAgents
51+
mock.module('@codebuff/sdk', () => ({
52+
CodebuffClient: class {
53+
constructor() {}
54+
async run() { return { output: { type: 'success' }, sessionState: null } }
55+
},
56+
loadLocalAgents: async () => ({}),
57+
}))
58+
4959
// Mock judge to return configurable scores
5060
mock.module('../judge', () => ({
5161
judgeTaskResult: async () => {
@@ -144,7 +154,7 @@ describe('runLearnMode integration', () => {
144154
await runLearnMode({
145155
mode: 'learn',
146156
repoPath: repoDir,
147-
agentCommand: 'echo',
157+
agentId: 'base2-free-evals',
148158
parallelism: 1,
149159
maxCostUsd: 100,
150160
agentTimeoutMs: 10_000,
@@ -190,7 +200,7 @@ describe('runLearnMode integration', () => {
190200
await runLearnMode({
191201
mode: 'learn',
192202
repoPath: repoDir,
193-
agentCommand: 'echo',
203+
agentId: 'base2-free-evals',
194204
parallelism: 1,
195205
maxCostUsd: 100,
196206
agentTimeoutMs: 10_000,
@@ -233,7 +243,7 @@ describe('runLearnMode integration', () => {
233243
await runLearnMode({
234244
mode: 'learn',
235245
repoPath: repoDir,
236-
agentCommand: 'echo',
246+
agentId: 'base2-free-evals',
237247
parallelism: 1,
238248
maxCostUsd: 100,
239249
agentTimeoutMs: 10_000,
@@ -245,10 +255,10 @@ describe('runLearnMode integration', () => {
245255
expect(fs.existsSync(logPath)).toBe(false)
246256
})
247257

248-
it('rejects doc edit when score does not improve', async () => {
249-
// Commit1: baseline 4.0, rerun 3.0 (worse) — doc rejected, loop stops.
258+
it('rejects doc edit when score drops significantly', async () => {
259+
// Commit1: baseline 5.0, rerun 2.0 (3-point drop, past 1.5 threshold) — doc rejected.
250260
// Commit2: baseline 8.0, analyze returns null. Commit3: baseline 8.0, null.
251-
judgeScores = [4.0, 3.0, 8.0, 8.0]
261+
judgeScores = [5.0, 2.0, 8.0, 8.0]
252262
analyzeFailureResults = [
253263
{
254264
reasoning: 'Tried to help',
@@ -262,7 +272,7 @@ describe('runLearnMode integration', () => {
262272
await runLearnMode({
263273
mode: 'learn',
264274
repoPath: repoDir,
265-
agentCommand: 'echo',
275+
agentId: 'base2-free-evals',
266276
parallelism: 1,
267277
maxCostUsd: 100,
268278
agentTimeoutMs: 10_000,
@@ -290,7 +300,7 @@ describe('runPromptMode integration', () => {
290300
await runPromptMode({
291301
mode: 'prompt',
292302
repoPath: repoDir,
293-
agentCommand: 'echo',
303+
agentId: 'base2-free-evals',
294304
parallelism: 1,
295305
maxCostUsd: 100,
296306
agentTimeoutMs: 10_000,

evalbuff/src/commit-task-generator.ts

Lines changed: 60 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
import { execSync } from 'child_process'
22
import fs from 'fs'
3-
import os from 'os'
43
import path from 'path'
54

5+
import { generatePrompt } from './llm'
6+
67
export interface CommitTask {
78
sha: string
89
parentSha: string
@@ -14,6 +15,55 @@ export interface CommitTask {
1415

1516
const MAX_DIFF_CHARS = 200_000
1617

/**
 * Commit message patterns that indicate trivial/automated commits not worth
 * running agents on. Saves ~10 agent+judge invocations per skipped commit.
 *
 * Each pattern is tested against the trimmed first line of the commit message.
 */
const TRIVIAL_COMMIT_PATTERNS: readonly RegExp[] = [
  /^bump\b.*\bversion\b/i,
  /^v?\d+\.\d+\.\d+$/, // version-only messages like "1.0.635"
  /^release\s+v?\d+/i,
  /^chore\(release\)/i,
  /^update\s+(change|changelog)/i,
  /^merge\s+(branch|pull request)/i,
]

/**
 * Returns true if a commit is trivial and should be skipped.
 *
 * A commit is trivial when either:
 *  1. the first line of its message matches one of TRIVIAL_COMMIT_PATTERNS, or
 *  2. it touches exactly one file, that file is named `package.json` (at any
 *     directory depth), the diff is shorter than 1000 characters, and every
 *     added/removed line in the diff is a `"version"` field.
 *
 * @param message - Full commit message; only its first line is inspected.
 * @param filesChanged - Paths of the files changed by the commit.
 * @param diff - Unified diff text of the commit.
 * @returns Whether the commit should be skipped as trivial.
 */
function isTrivialCommit(
  message: string,
  filesChanged: string[],
  diff: string,
): boolean {
  // `split` always yields at least one element, but guard the index so this
  // also type-checks under `noUncheckedIndexedAccess`.
  const firstLine = (message.split('\n')[0] ?? '').trim()

  // Check message patterns
  if (TRIVIAL_COMMIT_PATTERNS.some((p) => p.test(firstLine))) return true

  // Single package.json change that only touches "version" field
  const [onlyFile] = filesChanged
  if (filesChanged.length !== 1 || onlyFile === undefined) return false
  if (diff.length >= 1000) return false

  // Match on the file NAME rather than a raw suffix, so that e.g.
  // `mypackage.json` is not mistaken for a package manifest.
  if (!/(^|\/)package\.json$/.test(onlyFile)) return false

  const versionLine = /^\s*[+-]\s*"version"/

  // Added/removed content lines only; `+++` / `---` are the diff's file headers.
  const changedLines = diff
    .split('\n')
    .filter(
      (l) =>
        (l.startsWith('+') && !l.startsWith('+++')) ||
        (l.startsWith('-') && !l.startsWith('---')),
    )

  // Note: a diff with no content changes at all (e.g. a mode-only change)
  // also counts as trivial, because `every` is true for an empty list.
  return changedLines.every((l) => versionLine.test(l))
}
66+
1767
/**
1868
* Files that add noise to diffs without useful signal.
1969
* Lockfiles are huge and auto-generated — agents shouldn't replicate them.
@@ -231,31 +281,14 @@ ${filesSection}## Diff
231281
${diff}
232282
\`\`\``
233283

234-
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-promptgen-'))
235-
const promptFile = path.join(tmpDir, 'PROMPT_GEN.md')
236-
237284
try {
238-
fs.writeFileSync(promptFile, `${PROMPT_GEN_SYSTEM}\n\n---\n\n${userPrompt}`)
239-
240-
// IMPORTANT: Run in tmpDir to avoid Claude reading the repo's CLAUDE.md/AGENTS.md,
241-
// which can confuse prompt generation (e.g., generating prompts about evalbuff itself).
242-
const output = execSync(
243-
`claude --dangerously-skip-permissions -p "Read ${promptFile} and follow all instructions. Respond with ONLY the task prompt text."`,
244-
{
245-
cwd: tmpDir,
246-
encoding: 'utf-8',
247-
timeout: 2 * 60 * 1000,
248-
stdio: ['ignore', 'pipe', 'pipe'],
249-
maxBuffer: 10 * 1024 * 1024,
250-
},
251-
).trim()
252-
285+
// Use API directly — faster than spawning Claude CLI (~3s vs ~15s)
286+
// and avoids CLAUDE.md/AGENTS.md context pollution
287+
const output = await generatePrompt(PROMPT_GEN_SYSTEM, userPrompt)
253288
return output || message
254289
} catch {
255290
// Fallback to the commit message itself
256291
return message
257-
} finally {
258-
fs.rmSync(tmpDir, { recursive: true, force: true })
259292
}
260293
}
261294

@@ -270,6 +303,12 @@ export async function buildCommitTask(
270303
const info = getCommitInfo(repoPath, sha)
271304
if (!info) return null
272305

306+
// Skip trivial/automated commits (version bumps, releases, etc.)
307+
if (isTrivialCommit(info.message, info.filesChanged, info.diff)) {
308+
console.log(`Skipping ${sha.slice(0, 8)}: trivial commit (${info.message.split('\n')[0].slice(0, 50)})`)
309+
return null
310+
}
311+
273312
// Skip commits with diffs that exceed our limit
274313
if (info.diff.length > MAX_DIFF_CHARS) {
275314
console.log(`Skipping ${sha.slice(0, 8)}: diff too large (${info.diff.length} chars)`)

0 commit comments

Comments
 (0)