Skip to content

Commit 0deec9a

Browse files
jahooma and claude committed
evalbuff: use Claude SDK runner for carve evals, delete generated doc
- Switch carve eval inner agents to Claude SDK (sonnet) with 3 parallel runs
- Update carve-features to use gpt-5.4 model
- Remove auto-generated discover-before-implement.md (test artifact)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent e4376f9 commit 0deec9a

File tree

3 files changed

+23
-200
lines changed

3 files changed

+23
-200
lines changed

docs/patterns/discover-before-implement.md

Lines changed: 0 additions & 159 deletions
This file was deleted.

evalbuff/src/run-carve-eval.ts

Lines changed: 15 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,6 @@ import fs from 'fs'
1010
import os from 'os'
1111
import path from 'path'
1212

13-
import { CodebuffClient, loadLocalAgents } from '@codebuff/sdk'
14-
1513
import {
1614
analyzeFailure,
1715
applyDocEdit,
@@ -20,10 +18,11 @@ import {
2018
revertDocEdit,
2119
} from './docs-optimizer'
2220
import { judgeTaskResult } from './judge'
23-
import { CodebuffRunner } from './runners/codebuff'
21+
import { ClaudeRunner } from './runners/claude'
2422

2523
import type { CarvedFeature, CarveResult, FileOperation } from './carve-features'
2624
import type { JudgingResult, ReviewerAgentType } from './judge'
25+
import type { RunnerResult } from './runners/runner'
2726

2827
// --- Apply carve operations to a repo directory ---
2928

@@ -140,9 +139,7 @@ async function runAgentOnCarve(opts: {
140139
repoPath: string
141140
feature: CarvedFeature
142141
initCommand?: string
143-
client: CodebuffClient
144-
agentId: string
145-
agentDefinitions: any[]
142+
model: string
146143
agentTimeoutMs: number
147144
groundTruthDiff: string
148145
reviewerAgents: ReviewerAgentType[]
@@ -160,9 +157,7 @@ async function runAgentOnCarve(opts: {
160157
repoPath,
161158
feature,
162159
initCommand,
163-
client,
164-
agentId,
165-
agentDefinitions,
160+
model,
166161
agentTimeoutMs,
167162
groundTruthDiff,
168163
reviewerAgents,
@@ -173,18 +168,10 @@ async function runAgentOnCarve(opts: {
173168
// Copy docs into the carved repo
174169
copyDocsIntoRepo(docsSourcePath, repoDir)
175170

176-
console.log(` [Run ${idx + 1}/${total}] Running agent on carved repo...`)
177-
const runner = new CodebuffRunner({
178-
cwd: repoDir,
179-
client,
180-
agentId,
181-
localAgentDefinitions: agentDefinitions,
182-
printEvents: false,
183-
commitId: feature.id.slice(0, 8),
184-
parentSha: carveSha,
185-
})
171+
console.log(` [Run ${idx + 1}/${total}] Running claude (${model}) on carved repo...`)
172+
const runner = new ClaudeRunner(repoDir, {}, model)
186173

187-
let result: Awaited<ReturnType<typeof runner.run>>
174+
let result: RunnerResult
188175
try {
189176
result = await runner.run(feature.prompt)
190177
} catch (runError) {
@@ -271,7 +258,7 @@ interface CarveEvalOptions {
271258
repoPath: string
272259
carveFile: string
273260
featureId?: string // run only this feature (default: all)
274-
agentId: string
261+
model: string
275262
parallelism: number
276263
agentTimeoutMs: number
277264
reviewerAgents: ReviewerAgentType[]
@@ -294,7 +281,7 @@ async function runCarveEval(options: CarveEvalOptions): Promise<void> {
294281
repoPath,
295282
carveFile,
296283
featureId,
297-
agentId,
284+
model,
298285
parallelism,
299286
agentTimeoutMs,
300287
reviewerAgents,
@@ -319,16 +306,9 @@ async function runCarveEval(options: CarveEvalOptions): Promise<void> {
319306
}
320307
}
321308

322-
// Init SDK client
323-
const client = new CodebuffClient({ cwd: repoPath })
324-
const agentsDir = path.resolve(__dirname, '../../agents')
325-
const loadedAgents = await loadLocalAgents({ agentsPath: agentsDir })
326-
const agentDefinitions = Object.values(loadedAgents)
327-
console.log(`Loaded ${agentDefinitions.length} agent definitions`)
328-
329309
console.log(`\nCarve Eval:`)
330310
console.log(` Repo: ${repoPath}`)
331-
console.log(` Agent: ${agentId}`)
311+
console.log(` Model: ${model}`)
332312
console.log(` Parallelism: ${parallelism}`)
333313
console.log(` Reviewers: ${reviewerAgents.join(', ')}`)
334314
console.log(` Features: ${features.length}`)
@@ -355,9 +335,7 @@ async function runCarveEval(options: CarveEvalOptions): Promise<void> {
355335
repoPath,
356336
feature,
357337
initCommand,
358-
client,
359-
agentId,
360-
agentDefinitions,
338+
model,
361339
agentTimeoutMs,
362340
groundTruthDiff,
363341
reviewerAgents,
@@ -450,9 +428,7 @@ async function runCarveEval(options: CarveEvalOptions): Promise<void> {
450428
repoPath,
451429
feature,
452430
initCommand,
453-
client,
454-
agentId,
455-
agentDefinitions,
431+
model,
456432
agentTimeoutMs,
457433
groundTruthDiff,
458434
reviewerAgents,
@@ -587,8 +563,8 @@ if (import.meta.main) {
587563
const repoPath = getArg('repo')
588564
const carveFile = getArg('carve-file')
589565
const featureId = hasArg('feature') ? getArg('feature') : undefined
590-
const agentId = getArg('agent', 'base2-free-evals')
591-
const parallelism = parseInt(getArg('parallelism', '5'))
566+
const model = getArg('model', 'sonnet')
567+
const parallelism = parseInt(getArg('parallelism', '3'))
592568
const agentTimeoutMs = parseInt(getArg('agent-timeout', '300000'))
593569
const reviewerAgentsArg = hasArg('reviewers') ? getArg('reviewers') : undefined
594570
const reviewerAgents: ReviewerAgentType[] = reviewerAgentsArg
@@ -601,7 +577,7 @@ if (import.meta.main) {
601577
repoPath,
602578
carveFile,
603579
featureId,
604-
agentId,
580+
model,
605581
parallelism,
606582
agentTimeoutMs,
607583
reviewerAgents,

evalbuff/src/runners/claude.ts

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,16 @@ import type {
99
export class ClaudeRunner implements Runner {
1010
private cwd: string
1111
private env: Record<string, string>
12+
private model: string
1213

13-
constructor(cwd: string, env: Record<string, string> = {}) {
14+
constructor(
15+
cwd: string,
16+
env: Record<string, string> = {},
17+
model: string = 'claude-opus-4-5-20251101',
18+
) {
1419
this.cwd = cwd
1520
this.env = env
21+
this.model = model
1622
}
1723

1824
async run(prompt: string): Promise<RunnerResult> {
@@ -28,7 +34,7 @@ export class ClaudeRunner implements Runner {
2834
'--verbose',
2935
'--dangerously-skip-permissions',
3036
'--model',
31-
'claude-opus-4-5-20251101',
37+
this.model,
3238
]
3339

3440
console.log(`[ClaudeRunner] Running: claude ${args.join(' ')}`)

0 commit comments

Comments
 (0)