CodebuffAI
diff --git a/‎agents/base2/base2-free-evals.ts‎
Lines changed: 8 additions & 0 deletions b/‎agents/base2/base2-free-evals.ts‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎bun.lock‎
Lines changed: 2 additions & 0 deletions b/‎bun.lock‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎evalbuff/package.json‎
Lines changed: 2 additions & 0 deletions b/‎evalbuff/package.json‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎evalbuff/src/__tests__/e2e.test.ts‎
Lines changed: 20 additions & 9 deletions b/‎evalbuff/src/__tests__/e2e.test.ts‎
Lines changed: 20 additions & 9 deletions
diff --git a/‎evalbuff/src/__tests__/loop.integration.test.ts‎
Lines changed: 28 additions & 18 deletions b/‎evalbuff/src/__tests__/loop.integration.test.ts‎
Lines changed: 28 additions & 18 deletions
diff --git a/‎evalbuff/src/commit-task-generator.ts‎
Lines changed: 60 additions & 21 deletions b/‎evalbuff/src/commit-task-generator.ts‎
Lines changed: 60 additions & 21 deletions
@@ -0,0 +1,8 @@
+import { createBase2 } from './base2'
+
+// Free-mode base2 config (created with noAskUser: true) under its own id and display name.
+const freeBase = createBase2('free', { noAskUser: true })
+const id = 'base2-free-evals'
+const displayName = 'Buffy the Free Evals Orchestrator'
+const definition = { ...freeBase, id, displayName }
+export default definition
@@ -14,8 +14,10 @@
     "run": "bun run src/run-evalbuff.ts"
   },
   "dependencies": {
+    "@ai-sdk/anthropic": "^2.0.50",
     "@codebuff/common": "workspace:*",
     "@codebuff/sdk": "workspace:*",
+    "ai": "^5.0.0",
     "zod": "^4.2.1"
   }
 }
@@ -40,14 +40,25 @@ mock.module('../test-repo-utils', () => ({
   },
 }))
 
-mock.module('../cli-runner', () => ({
-  runCliAgent: async () => ({
-    diff: 'mock diff content',
-    durationMs: 1000,
-    exitCode: 0,
-    stdout: 'mock stdout',
-    stderr: '',
-  }),
+mock.module('../runners/codebuff', () => {
+  // Stand-in runner: each run() resolves to the same canned trace, cost and diff.
+  class CodebuffRunner {
+    async run() {
+      const steps = [{ type: 'text', content: 'mock trace' }]
+      const totalCostUsd = 0.01
+      const diff = 'mock diff content'
+      return { steps, totalCostUsd, diff }
+    }
+  }
+  return { CodebuffRunner }
+})
+
+mock.module('@codebuff/sdk', () => ({
+  // SDK stand-in: run() always resolves to a success output with a null session.
+  CodebuffClient: class {
+    async run() { return { output: { type: 'success' }, sessionState: null } }
+  },
+  loadLocalAgents: async () => ({}),
 }))
 
 // Judge returns alternating scores: low (triggers doc edit), then higher (confirms improvement)
@@ -126,7 +137,7 @@ describe('evalbuff E2E', () => {
     await runLearnMode({
       mode: 'learn',
       repoPath: repoDir,
-      agentCommand: 'echo',
+      agentId: 'base2-free-evals',
       parallelism: 1,
       maxCostUsd: 50,
       agentTimeoutMs: 10_000,
 
@@ -32,20 +32,30 @@ mock.module('../test-repo-utils', () => ({
   },
 }))
 
-// Mock CLI runner to return a fake result
-mock.module('../cli-runner', () => ({
-  runCliAgent: async () => {
-    cliRunnerCallCount++
-    return {
-      diff: 'mock diff content',
-      durationMs: 1000,
-      exitCode: 0,
-      stdout: 'mock stdout',
-      stderr: '',
+// Mock CodebuffRunner: bump the call counter, then return a canned result
+mock.module('../runners/codebuff', () => ({
+  CodebuffRunner: class {
+    async run() {
+      cliRunnerCallCount += 1
+
+      const steps = [{ type: 'text', content: 'mock trace' }]
+      const totalCostUsd = 0.01
+      const diff = 'mock diff content'
+
+      return { steps, totalCostUsd, diff }
     }
   },
 }))
 
+// Mock the SDK: runs always succeed; loadLocalAgents resolves to an empty object
+mock.module('@codebuff/sdk', () => {
+  class CodebuffClient {
+    async run() { return { output: { type: 'success' }, sessionState: null } }
+  }
+  const loadLocalAgents = async () => ({})
+  return { CodebuffClient, loadLocalAgents }
+})
+
 // Mock judge to return configurable scores
 mock.module('../judge', () => ({
   judgeTaskResult: async () => {
@@ -144,7 +154,7 @@ describe('runLearnMode integration', () => {
     await runLearnMode({
       mode: 'learn',
       repoPath: repoDir,
-      agentCommand: 'echo',
+      agentId: 'base2-free-evals',
       parallelism: 1,
       maxCostUsd: 100,
       agentTimeoutMs: 10_000,
@@ -190,7 +200,7 @@ describe('runLearnMode integration', () => {
     await runLearnMode({
       mode: 'learn',
       repoPath: repoDir,
-      agentCommand: 'echo',
+      agentId: 'base2-free-evals',
       parallelism: 1,
       maxCostUsd: 100,
       agentTimeoutMs: 10_000,
@@ -233,7 +243,7 @@ describe('runLearnMode integration', () => {
     await runLearnMode({
       mode: 'learn',
       repoPath: repoDir,
-      agentCommand: 'echo',
+      agentId: 'base2-free-evals',
       parallelism: 1,
       maxCostUsd: 100,
       agentTimeoutMs: 10_000,
@@ -245,10 +255,10 @@ describe('runLearnMode integration', () => {
     expect(fs.existsSync(logPath)).toBe(false)
   })
 
-  it('rejects doc edit when score does not improve', async () => {
-    // Commit1: baseline 4.0, rerun 3.0 (worse) — doc rejected, loop stops.
+  it('rejects doc edit when score drops significantly', async () => {
+    // Commit1: baseline 5.0, rerun 2.0 (3-point drop, past 1.5 threshold) — doc rejected.
     // Commit2: baseline 8.0, analyze returns null. Commit3: baseline 8.0, null.
-    judgeScores = [4.0, 3.0, 8.0, 8.0]
+    judgeScores = [5.0, 2.0, 8.0, 8.0]
     analyzeFailureResults = [
       {
         reasoning: 'Tried to help',
@@ -262,7 +272,7 @@ describe('runLearnMode integration', () => {
     await runLearnMode({
       mode: 'learn',
       repoPath: repoDir,
-      agentCommand: 'echo',
+      agentId: 'base2-free-evals',
       parallelism: 1,
       maxCostUsd: 100,
       agentTimeoutMs: 10_000,
@@ -290,7 +300,7 @@ describe('runPromptMode integration', () => {
     await runPromptMode({
       mode: 'prompt',
       repoPath: repoDir,
-      agentCommand: 'echo',
+      agentId: 'base2-free-evals',
       parallelism: 1,
       maxCostUsd: 100,
       agentTimeoutMs: 10_000,
 
@@ -1,8 +1,9 @@
 import { execSync } from 'child_process'
 import fs from 'fs'
-import os from 'os'
 import path from 'path'
 
+import { generatePrompt } from './llm'
+
 export interface CommitTask {
   sha: string
   parentSha: string
@@ -14,6 +15,55 @@ export interface CommitTask {
 
 const MAX_DIFF_CHARS = 200_000
 
+/**
+ * Commit-subject patterns marking trivial/automated commits not worth running
+ * agents on (saves ~10 agent+judge invocations per skipped commit).
+ */
+const TRIVIAL_COMMIT_PATTERNS = [
+  /^bump\b.*\bversion\b/i,       // starts with "bump" and later mentions "version"
+  /^v?\d+\.\d+\.\d+$/,           // version-only messages like "1.0.635"
+  /^release\s+v?\d+/i,           // "release 1…" / "Release v2…"
+  /^chore\(release\)/i,          // subjects starting with "chore(release)"
+  /^update\s+(change|changelog)/i, // NOTE(review): "change" already covers "changelog", so any "update change…" subject matches — confirm intended
+  /^merge\s+(branch|pull request)/i, // subjects starting "Merge branch" / "Merge pull request"
+]
+
+/**
+ * Returns true if a commit is trivial and should be skipped: either its subject
+ * line matches TRIVIAL_COMMIT_PATTERNS, or the commit touches exactly one
+ * package.json and every added/removed line of its small (< 1000 chars) diff is
+ * a `"version"` entry. A diff with no added/removed lines is not a version bump.
+ *
+ * @param message - Full commit message; only its trimmed first line is matched.
+ * @param filesChanged - Paths of the files touched by the commit.
+ * @param diff - Diff text of the commit, with "+"/"-" prefixed change lines.
+ * @returns Whether the commit should be skipped as trivial.
+ */
+function isTrivialCommit(
+  message: string,
+  filesChanged: string[],
+  diff: string,
+): boolean {
+  const subject = message.split('\n')[0].trim()
+  if (TRIVIAL_COMMIT_PATTERNS.some((p) => p.test(subject))) return true
+
+  // Compare the file name itself so e.g. "my-package.json" does not qualify.
+  const isLonePackageJson =
+    filesChanged.length === 1 && /(^|\/)package\.json$/.test(filesChanged[0])
+  if (!isLonePackageJson || diff.length >= 1000) return false
+
+  // Keep changed content lines only; "+++"/"---" are the diff's file headers.
+  const changedLines = diff
+    .split('\n')
+    .filter((l) => /^(\+(?!\+\+)|-(?!--))/.test(l))
+
+  // every() is vacuously true for an empty list, so demand at least one change.
+  return (
+    changedLines.length > 0 &&
+    changedLines.every((l) => /^\s*[+-]\s*"version"/.test(l))
+  )
+}
+
 /**
  * Files that add noise to diffs without useful signal.
  * Lockfiles are huge and auto-generated — agents shouldn't replicate them.
@@ -231,31 +281,14 @@ ${filesSection}## Diff
 ${diff}
 \`\`\``
 
-  const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-promptgen-'))
-  const promptFile = path.join(tmpDir, 'PROMPT_GEN.md')
-
   try {
-    fs.writeFileSync(promptFile, `${PROMPT_GEN_SYSTEM}\n\n---\n\n${userPrompt}`)
-
-    // IMPORTANT: Run in tmpDir to avoid Claude reading the repo's CLAUDE.md/AGENTS.md,
-    // which can confuse prompt generation (e.g., generating prompts about evalbuff itself).
-    const output = execSync(
-      `claude --dangerously-skip-permissions -p "Read ${promptFile} and follow all instructions. Respond with ONLY the task prompt text."`,
-      {
-        cwd: tmpDir,
-        encoding: 'utf-8',
-        timeout: 2 * 60 * 1000,
-        stdio: ['ignore', 'pipe', 'pipe'],
-        maxBuffer: 10 * 1024 * 1024,
-      },
-    ).trim()
-
+    // Use API directly — faster than spawning Claude CLI (~3s vs ~15s)
+    // and avoids CLAUDE.md/AGENTS.md context pollution
+    const output = await generatePrompt(PROMPT_GEN_SYSTEM, userPrompt)
     return output || message
   } catch {
     // Fallback to the commit message itself
     return message
-  } finally {
-    fs.rmSync(tmpDir, { recursive: true, force: true })
   }
 }
 
@@ -270,6 +303,12 @@ export async function buildCommitTask(
   const info = getCommitInfo(repoPath, sha)
   if (!info) return null
 
+  // Skip trivial/automated commits (version bumps, releases, etc.)
+  if (isTrivialCommit(info.message, info.filesChanged, info.diff)) {
+    console.log(`Skipping ${sha.slice(0, 8)}: trivial commit (${info.message.split('\n')[0].slice(0, 50)})`)
+    return null
+  }
+
   // Skip commits with diffs that exceed our limit
   if (info.diff.length > MAX_DIFF_CHARS) {
     console.log(`Skipping ${sha.slice(0, 8)}: diff too large (${info.diff.length} chars)`)
Original file line number	Diff line number	Diff line change
`@@ -14,8 +14,10 @@`
`14`	`14`	`"run": "bun run src/run-evalbuff.ts"`
`15`	`15`	`},`
`16`	`16`	`"dependencies": {`
	`17`	`+ "@ai-sdk/anthropic": "^2.0.50",`
`17`	`18`	`"@codebuff/common": "workspace:*",`
`18`	`19`	`"@codebuff/sdk": "workspace:*",`
	`20`	`+ "ai": "^5.0.0",`
`19`	`21`	`"zod": "^4.2.1"`
`20`	`22`	`}`
`21`	`23`	`}`