From 85a83e537d4506ff3b41bb73aa4688d3eb96e7eb Mon Sep 17 00:00:00 2001 From: CodebuffAI <189203002+CodebuffAI@users.noreply.github.com> Date: Thu, 2 Apr 2026 20:47:09 +0000 Subject: [PATCH] Remove evalbuff and expensivebuff --- AGENTS.md | 3 - bun.lock | 16 - evalbuff/README.md | 151 --- evalbuff/old/BRAINSTORM.md | 207 ---- evalbuff/old/PHASE-1-SPEC.md | 861 ----------------- evalbuff/old/README.md | 37 - evalbuff/old/agents/context-agent.ts | 56 -- evalbuff/old/agents/review-agent.ts | 97 -- evalbuff/old/agents/scan-agent.ts | 46 - evalbuff/old/cli/package.json | 24 - evalbuff/old/cli/src/commands/context.ts | 87 -- evalbuff/old/cli/src/commands/init.ts | 127 --- evalbuff/old/cli/src/commands/login.ts | 22 - evalbuff/old/cli/src/commands/logout.ts | 12 - evalbuff/old/cli/src/commands/review.ts | 139 --- evalbuff/old/cli/src/index.ts | 82 -- evalbuff/old/cli/src/templates/skill.ts | 45 - evalbuff/old/cli/src/utils/auth.ts | 188 ---- evalbuff/old/cli/src/utils/config.ts | 119 --- evalbuff/old/cli/src/utils/git.ts | 110 --- evalbuff/old/cli/src/utils/knowledge.ts | 50 - evalbuff/old/cli/src/utils/output.ts | 62 -- evalbuff/old/cli/src/utils/project.ts | 9 - evalbuff/old/cli/tsconfig.json | 12 - evalbuff/package.json | 24 - evalbuff/src/__tests__/cli-runner.test.ts | 107 --- evalbuff/src/__tests__/criteria.test.ts | 119 --- evalbuff/src/__tests__/docs-optimizer.test.ts | 126 --- evalbuff/src/__tests__/e2e.test.ts | 190 ---- .../src/__tests__/loop.integration.test.ts | 318 ------- evalbuff/src/__tests__/morning-report.test.ts | 161 ---- .../src/__tests__/trace-compressor.test.ts | 159 ---- evalbuff/src/agent-runner.ts | 196 ---- evalbuff/src/carve-features.ts | 533 ----------- evalbuff/src/cli-runner.ts | 113 --- evalbuff/src/commit-task-generator.ts | 345 ------- evalbuff/src/criteria.ts | 165 ---- evalbuff/src/docs-optimizer.ts | 381 -------- evalbuff/src/evalbuff-criteria.json | 22 - evalbuff/src/judge.ts | 549 ----------- evalbuff/src/llm.ts | 49 - 
evalbuff/src/morning-report.ts | 197 ---- evalbuff/src/run-carve-eval.ts | 668 ------------- evalbuff/src/run-e2e-test.ts | 296 ------ evalbuff/src/run-evalbuff.ts | 898 ------------------ evalbuff/src/runners/claude.ts | 182 ---- evalbuff/src/runners/codebuff.ts | 139 --- evalbuff/src/runners/codex.ts | 143 --- evalbuff/src/runners/index.ts | 3 - evalbuff/src/runners/runner.ts | 13 - evalbuff/src/test-repo-utils.ts | 143 --- evalbuff/src/trace-compressor.ts | 284 ------ evalbuff/src/types.ts | 83 -- evalbuff/tsconfig.json | 14 - expensivebuff/cli/release/README.md | 51 - expensivebuff/cli/release/index.js | 30 - expensivebuff/cli/release/package.json | 24 - package.json | 1 - 58 files changed, 9288 deletions(-) delete mode 100644 evalbuff/README.md delete mode 100644 evalbuff/old/BRAINSTORM.md delete mode 100644 evalbuff/old/PHASE-1-SPEC.md delete mode 100644 evalbuff/old/README.md delete mode 100644 evalbuff/old/agents/context-agent.ts delete mode 100644 evalbuff/old/agents/review-agent.ts delete mode 100644 evalbuff/old/agents/scan-agent.ts delete mode 100644 evalbuff/old/cli/package.json delete mode 100644 evalbuff/old/cli/src/commands/context.ts delete mode 100644 evalbuff/old/cli/src/commands/init.ts delete mode 100644 evalbuff/old/cli/src/commands/login.ts delete mode 100644 evalbuff/old/cli/src/commands/logout.ts delete mode 100644 evalbuff/old/cli/src/commands/review.ts delete mode 100644 evalbuff/old/cli/src/index.ts delete mode 100644 evalbuff/old/cli/src/templates/skill.ts delete mode 100644 evalbuff/old/cli/src/utils/auth.ts delete mode 100644 evalbuff/old/cli/src/utils/config.ts delete mode 100644 evalbuff/old/cli/src/utils/git.ts delete mode 100644 evalbuff/old/cli/src/utils/knowledge.ts delete mode 100644 evalbuff/old/cli/src/utils/output.ts delete mode 100644 evalbuff/old/cli/src/utils/project.ts delete mode 100644 evalbuff/old/cli/tsconfig.json delete mode 100644 evalbuff/package.json delete mode 100644 evalbuff/src/__tests__/cli-runner.test.ts 
delete mode 100644 evalbuff/src/__tests__/criteria.test.ts delete mode 100644 evalbuff/src/__tests__/docs-optimizer.test.ts delete mode 100644 evalbuff/src/__tests__/e2e.test.ts delete mode 100644 evalbuff/src/__tests__/loop.integration.test.ts delete mode 100644 evalbuff/src/__tests__/morning-report.test.ts delete mode 100644 evalbuff/src/__tests__/trace-compressor.test.ts delete mode 100644 evalbuff/src/agent-runner.ts delete mode 100644 evalbuff/src/carve-features.ts delete mode 100644 evalbuff/src/cli-runner.ts delete mode 100644 evalbuff/src/commit-task-generator.ts delete mode 100644 evalbuff/src/criteria.ts delete mode 100644 evalbuff/src/docs-optimizer.ts delete mode 100644 evalbuff/src/evalbuff-criteria.json delete mode 100644 evalbuff/src/judge.ts delete mode 100644 evalbuff/src/llm.ts delete mode 100644 evalbuff/src/morning-report.ts delete mode 100644 evalbuff/src/run-carve-eval.ts delete mode 100644 evalbuff/src/run-e2e-test.ts delete mode 100644 evalbuff/src/run-evalbuff.ts delete mode 100644 evalbuff/src/runners/claude.ts delete mode 100644 evalbuff/src/runners/codebuff.ts delete mode 100644 evalbuff/src/runners/codex.ts delete mode 100644 evalbuff/src/runners/index.ts delete mode 100644 evalbuff/src/runners/runner.ts delete mode 100644 evalbuff/src/test-repo-utils.ts delete mode 100644 evalbuff/src/trace-compressor.ts delete mode 100644 evalbuff/src/types.ts delete mode 100644 evalbuff/tsconfig.json delete mode 100644 expensivebuff/cli/release/README.md delete mode 100644 expensivebuff/cli/release/index.js delete mode 100644 expensivebuff/cli/release/package.json diff --git a/AGENTS.md b/AGENTS.md index 231b9295c4..5028c2c794 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -2,7 +2,6 @@ Codebuff is an advanced coding agent with a composable agent framework. It also includes: - freebuff, the free coding agent -- evalbuff, a project to improve an agent through evals ## Goal @@ -25,7 +24,6 @@ Make an efficient learning agent that can do anything. 
- `agents/` — main agents shipped with codebuff - `.agents/` — local agent templates (prompt + programmatic agents) - `freebuff/` - a free coding agent built from configuring codebuff cli -- `evalbuff/` — automated docs optimization loop (run agent → judge → analyze → improve docs) ## Conventions @@ -44,5 +42,4 @@ IMPORTANT: Prefer retrieval-led reasoning over pre-training-led reasoning. Alway - `docs/environment-variables.md` — Env var rules, DI helpers, loading order - `docs/agents-and-tools.md` — Agent system, shell shims, tool definitions - `docs/patterns/handle-steps-generators.md` — handleSteps generator patterns and spawn_agents tool calls -- `docs/evalbuff/interpreting-task-prompts.md` - `docs/patterns/discover-before-implement.md` diff --git a/bun.lock b/bun.lock index 5c9ce08a53..00a9d0d549 100644 --- a/bun.lock +++ b/bun.lock @@ -107,18 +107,6 @@ "@types/parse-path": "^7.1.0", }, }, - "evalbuff": { - "name": "@codebuff/evalbuff", - "version": "1.0.0", - "dependencies": { - "@ai-sdk/anthropic": "^2.0.50", - "@codebuff/common": "workspace:*", - "@codebuff/sdk": "workspace:*", - "ai": "^5.0.0", - "openai": "^6.33.0", - "zod": "^4.2.1", - }, - }, "evals": { "name": "@codebuff/evals", "version": "1.0.0", @@ -501,8 +489,6 @@ "@codebuff/common": ["@codebuff/common@workspace:common"], - "@codebuff/evalbuff": ["@codebuff/evalbuff@workspace:evalbuff"], - "@codebuff/evals": ["@codebuff/evals@workspace:evals"], "@codebuff/freebuff": ["@codebuff/freebuff@workspace:freebuff"], @@ -2915,8 +2901,6 @@ "open": ["open@10.2.0", "", { "dependencies": { "default-browser": "^5.2.1", "define-lazy-prop": "^3.0.0", "is-inside-container": "^1.0.0", "wsl-utils": "^0.1.0" } }, "sha512-YgBpdJHPyQ2UE5x+hlSXcnejzAvD0b22U2OuAP+8OnlJT+PjWPxtgmGqKKc+RgTM63U9gN0YzrYc71R2WT/hTA=="], - "openai": ["openai@6.33.0", "", { "peerDependencies": { "ws": "^8.18.0", "zod": "^3.25 || ^4.0" }, "optionalPeers": ["ws", "zod"], "bin": { "openai": "bin/cli" } }, 
"sha512-xAYN1W3YsDXJWA5F277135YfkEk6H7D3D6vWwRhJ3OEkzRgcyK8z/P5P9Gyi/wB4N8kK9kM5ZjprfvyHagKmpw=="], - "openid-client": ["openid-client@5.7.1", "", { "dependencies": { "jose": "^4.15.9", "lru-cache": "^6.0.0", "object-hash": "^2.2.0", "oidc-token-hash": "^5.0.3" } }, "sha512-jDBPgSVfTnkIh71Hg9pRvtJc6wTwqjRkN88+gCFtYWrlP4Yx2Dsrow8uPi3qLr/aeymPF3o2+dS+wOpglK04ew=="], "optionator": ["optionator@0.9.4", "", { "dependencies": { "deep-is": "^0.1.3", "fast-levenshtein": "^2.0.6", "levn": "^0.4.1", "prelude-ls": "^1.2.1", "type-check": "^0.4.0", "word-wrap": "^1.2.5" } }, "sha512-6IpQ7mKUxRcZNLIObR0hz7lxsapSSIYNZJwXPGeF0mTVqGKFIXj1DQcMoT22S3ROcLyY/rz0PWaWZ9ayWmad9g=="], diff --git a/evalbuff/README.md b/evalbuff/README.md deleted file mode 100644 index 518fbce6cf..0000000000 --- a/evalbuff/README.md +++ /dev/null @@ -1,151 +0,0 @@ -# Evalbuff - -Evalbuff improves a coding agent's performance by iteratively optimizing project documentation. It watches an agent fail, writes docs to fix the pattern, and keeps only the changes that measurably help. - -## Two Modes - -### 1. Commit Learning Mode (default) - -Walks through your repo's git history commit-by-commit, using each commit as a learning opportunity: - -1. Start at HEAD~500 (configurable) and process commits one at a time, oldest first -2. For each commit, craft a human-like prompt that vaguely describes the change (via LLM) -3. Run N agents in parallel (default 5) on that prompt against the parent commit -4. Judge all runs — using the actual commit diff as ground truth -5. Always analyze failures and propose doc changes (ensuring they're generic enough to help future tasks, not just this one) -6. Re-run N agents with the proposed docs -7. If scores improve, keep the docs and try to propose more improvements -8. If scores don't improve, reject the docs and move to the next commit -9. 
State is saved after each commit — resume at any time - -The result: a `docs/` directory that encodes patterns the agent needs to know, learned from real historical changes. - -### 2. Prompt Mode - -Run a specific coding prompt and improve docs for it — no git history needed: - -1. Given a prompt describing a coding task -2. Run N agents in parallel on the prompt against the current HEAD -3. Judge all runs — no ground truth, relies entirely on e2e testing by the judge -4. Analyze and propose doc changes -5. Re-run and keep/reject as with learn mode - -Useful for targeted doc improvement around known pain points. - -## How It Works - -``` -for each task (commit or prompt): - ┌─────────────────────────────────────────────────────┐ - │ 1. Run N agents in parallel (baseline) │ - │ 2. Judge all N runs → average score │ - │ 3. Analyze worst run → propose generic doc │ - │ 4. Apply doc to repo │ - │ 5. Re-run N agents with new doc │ - │ 6. Score improved? Keep doc, try more improvements │ - │ Score same/worse? 
Reject doc, next task │ - └─────────────────────────────────────────────────────┘ -``` - -Key design decisions: -- **Low-cost agent** (`codebuff --agent base2-free` by default) — runs many times cheaply -- **N parallel runs** for statistical significance — one run is noisy, five gives a decent signal -- **Always analyze** — no score threshold; every task is a learning opportunity -- **Generic docs only** — the doc writer is instructed to skip task-specific advice and focus on patterns -- **Iterative improvement** — keeps proposing docs until one is rejected, then moves on - -## Usage - -### Commit Learning Mode - -```bash -bun run evalbuff/src/run-evalbuff.ts \ - --repo /path/to/target-repo \ - --agent "codebuff --agent base2-free" \ - --commits 500 \ - --parallelism 5 \ - --max-cost 100 -``` - -### Prompt Mode - -```bash -bun run evalbuff/src/run-evalbuff.ts \ - --repo /path/to/target-repo \ - --agent "codebuff --agent base2-free" \ - --prompt "Add a dark mode toggle to the settings page" \ - --parallelism 5 -``` - -### Arguments - -| Argument | Default | Description | -|----------|---------|-------------| -| `--repo` | required | Path to the target repo where docs/ will be written | -| `--agent` | `codebuff --agent base2-free` | Agent CLI command (prompt appended as last arg) | -| `--prompt` | — | If set, runs in prompt mode instead of learn mode | -| `--commits` | 500 | How many commits back to start from (learn mode) | -| `--parallelism` | 5 | Number of agents to run in parallel per task | -| `--max-cost` | 100 | Stop after spending this many USD (estimated) | -| `--agent-timeout` | 300000 | Per-agent timeout in ms (5 min default) | -| `--init-command` | — | Command to run in each test repo (e.g., `npm install`) | -| `--criteria` | auto | Path to criteria JSON (auto-created if omitted) | -| `--reviewers` | `claude,codex` | Comma-separated reviewer agent types | - -### Resuming - -State is saved to `evalbuff-state.json` in the target repo after each commit. 
Re-running with the same `--repo` automatically resumes from where it left off — it knows which commit was last processed and continues from there. - -### Overnight Run - -```bash -nohup bun run evalbuff/src/run-evalbuff.ts \ - --repo /path/to/repo \ - --commits 500 \ - --parallelism 5 \ - --max-cost 200 \ - > evalbuff-overnight.log 2>&1 & -``` - -## What Gets Produced - -``` -target-repo/ -├── docs/ # Generated documentation -│ ├── patterns/ -│ │ └── error-handling.md -│ ├── conventions/ -│ │ └── naming.md -│ └── architecture/ -│ └── data-flow.md -├── AGENTS.md # Table of contents -├── evalbuff-state.json # Resumable state (last commit SHA) -├── evalbuff-log.jsonl # Per-task log -├── evalbuff-criteria.json # Current criteria level -└── evalbuff-report-2026-03-26.md # Report -``` - -## Living Quality Criteria - -Judges use a leveling system to avoid over-optimizing prematurely: - -| Level | Criteria Added | Promotion | -|-------|---------------|-----------| -| L1 | Builds, tests pass, basic completeness | Start | -| L2 | + Feature works E2E, logs clean | After L1 avg >= 8.0 over 10 tasks | -| L3 | + Edge cases, UI verification | After L2 avg >= 8.0 | -| L4 | + Cross-component integration, performance | After L3 avg >= 8.0 | -| L5 | + Production readiness | After L4 avg >= 8.0 | - -## Architecture - -| File | Role | -|------|------| -| `run-evalbuff.ts` | Main orchestrator — learn mode + prompt mode | -| `commit-task-generator.ts` | Extract tasks from git history, generate prompts from commits | -| `cli-runner.ts` | Agent-agnostic CLI runner — spawns any agent, captures diff | -| `judge.ts` | AI judging with/without ground truth, multi-reviewer aggregation | -| `docs-optimizer.ts` | Failure analysis, generic doc writing, doc application/revert | -| `criteria.ts` | Living quality criteria with L1-L5 promotion | -| `morning-report.ts` | Report generation from JSONL log | -| `test-repo-utils.ts` | Isolated git repo lifecycle management | diff --git 
a/evalbuff/old/BRAINSTORM.md b/evalbuff/old/BRAINSTORM.md deleted file mode 100644 index 1a81ff1a69..0000000000 --- a/evalbuff/old/BRAINSTORM.md +++ /dev/null @@ -1,207 +0,0 @@ -# Evalbuff — Brainstorm - -> Generate evals for *your* codebase. Not generic benchmarks — codebase-specific e2e testing, review, and context for AI coding agents. - -## What is Evalbuff? - -A CLI tool that helps teams build, run, and improve end-to-end evaluations for their codebase. It's intended to be used by: - -- **The coding agent** — to check its own changes in a review step -- **CI** — to run core flows and grade output quality -- **The human developer** — to define flows, dump knowledge, and tune evals - -Evalbuff is **not a coding agent**. It evaluates, reviews, and provides context. This means it complements any coding agent (Codebuff, Claude Code, Cursor, Copilot, etc.) without competing with them. - -## Commands - -| Command | Audience | Description | -|---------|----------|-------------| -| `evalbuff` | Human | Fancy TUI for browsing/editing knowledge, evals, and results | -| `evalbuff init` | Human | Initialize evalbuff in a project | -| `evalbuff context ` | Agent / Human | Return relevant files, knowledge, and gotchas for a prompt | -| `evalbuff review [prompt]` | Agent / CI / Human | Review a change e2e, give rich structured feedback. Optional prompt describes what was requested so the reviewer can verify intent. | -| `evalbuff run [task]` | CI / Human | Run eval tasks and output graded results | -| `evalbuff learn` | CI / Human | Self-improvement: iterate on evals, knowledge, and context quality | -| `evalbuff refresh` | CI (nightly) | Scan recent commits, update knowledge and eval subagents | - -## Phase 1 — Context + Review (Immediate Value, Zero Setup) - -The `context` and `review` commands are useful on day one with minimal configuration and can be a product in themselves. 
- -### `evalbuff context` - -Takes a prompt, returns everything a coding agent needs to work on it: - -- **Relevant files** with summaries (leveraging an excellent file picker) -- **Background knowledge** of the systems involved -- **Lessons and gotchas** learned from past work - -This is like a dynamic, project-specific skill that's better than any static AGENTS.md. Any coding agent can call this to get oriented before making changes. - -### `evalbuff review [prompt]` - -Given file diffs, uncommitted changes, or a branch: - -- Outputs rich, structured feedback on what went wrong and why -- Feedback is designed to be easy to feed back into a coding agent for a fix -- Can check against project conventions, known patterns, and past mistakes - -Both commands naturally build up the `.agents/knowledge/` directory, which makes everything better over time. - -### Skill Installation — Teaching the Coding Agent About Evalbuff - -For `context` and `review` to be useful to coding agents, the agent needs to *know* they exist and how to call them. Evalbuff solves this by installing a skill into the user's project. - -`evalbuff init` (or a dedicated `evalbuff install-skill`) writes a `SKILL.md` file into both: - -- `.agents/skills/evalbuff/SKILL.md` — for Codebuff and SDK-based agents -- `.claude/skills/evalbuff/SKILL.md` — for Claude Code compatibility - -The skill teaches the coding agent: - -- **When to call `evalbuff context `** — at the start of a task, to get relevant files, background knowledge, and gotchas before making changes -- **When to call `evalbuff review`** — after making changes, to get structured feedback before committing -- **Expected output format** — so the agent knows how to parse and act on the results -- **How to feed review feedback back** — close the loop by using review output to fix issues - -This is the critical glue that makes evalbuff work with *any* coding agent that supports skills (Codebuff, Claude Code, and anything built on the Codebuff SDK). 
The skill acts as a lightweight integration layer — no plugin system, no API integration, just a markdown file that the agent reads. - -Example skill content (draft): - -```markdown ---- -name: evalbuff -description: Use evalbuff to get project context before coding and review changes before committing ---- - -# Evalbuff - -This project uses evalbuff for context gathering and change review. - -## Before starting a task - -Run `evalbuff context ""` to get: -- Relevant files you should read -- Background knowledge about the systems involved -- Known gotchas and lessons from past work - -## After making changes - -Run `evalbuff review ""` to get structured feedback on your uncommitted changes. The prompt helps the reviewer verify the changes match the original intent. -If the review surfaces issues, fix them before considering the task complete. -``` - -## Phase 2 — E2E Eval Creation + Running - -### The Incremental Approach - -E2E setups are bespoke. Some projects need a full production-like environment (multiple backend servers, databases, third-party services). Setting up everything at once is wasteful and overwhelming. - -**Instead, evalbuff builds e2e infrastructure incrementally:** - -1. User describes ONE concrete e2e flow to check (e.g. "user signs up and creates a project") -2. An agent (defined via codebuff SDK) analyzes the codebase and figures out what's needed to test that one flow -3. Outputs a plan — walks the developer through manual steps, automates what it can -4. Creates the task definition in `.agents/evals/tasks/signup-flow/PROMPT.md` -5. When the user adds another flow, the agent diffs what's already set up and only adds what's missing - -This way we never set up unnecessary infrastructure. Each new flow is additive. 
- -### `evalbuff run` - -- Define core flows for the app that should be tested -- Grade output quality with LLM judges -- Run in CI or locally -- Optimize over time for speed and cost - -## Phase 3 — Self-Improvement Flywheel - -### `evalbuff learn` - -Runs a coding agent + evals, then iterates on its own evals and knowledge to make them: - -- **More discerning** — better at catching real issues -- **More efficient** — faster, cheaper to run -- Improves `evalbuff context` by saving more knowledge and configuring subagents - -The key insight: improving evals and knowledge is more important than updating skills/AGENTS.md. `evalbuff context` is a dynamic skill that's better than a fixed one, and `evalbuff review` handles the rest. - -### `evalbuff refresh` - -Intended to run nightly from CI (e.g. GitHub Actions): - -- Looks through commits since last refresh point -- Updates eval subagent knowledge -- Updates skills and known patterns -- Keeps evals fresh as the codebase evolves - -## Directory Structure - -### Evalbuff Package Structure - -``` -evalbuff/ -├── cli/ # TUI + commands (inspired by codebuff/cli) -├── core/ # Shared logic: context gathering, review, eval running -├── agents/ # Built-in agent definitions (uses codebuff SDK) -├── skills/ # Skill templates to install into user projects -│ └── evalbuff/ -│ └── SKILL.md # The skill that teaches agents how to use evalbuff -├── BRAINSTORM.md -└── README.md -``` - -### What Evalbuff Manages in the User's Project - -``` -.agents/ -├── skills/ -│ └── evalbuff/ -│ └── SKILL.md # Installed by `evalbuff init` — teaches agents to use evalbuff -├── evals/ -│ ├── evalbuff.json # Config (LLM provider, settings) -│ ├── tasks/ # E2E flow definitions -│ │ └── / -│ │ ├── PROMPT.md # What to check + success criteria (or SPEC.md) -│ │ └── traces/ # Historical run traces -│ └── review-tasks/ # Review-specific eval tasks -├── agent-definitions/ # Custom subagents -└── knowledge/ - └── *.md # Project knowledge, lessons, gotchas - 
-.claude/ -└── skills/ - └── evalbuff/ - └── SKILL.md # Same skill, for Claude Code compatibility -``` - -## Key Ideas - -### Evals Are Never Done - -> "Everything could be an eval and then the rest of the system optimizes for it." — Alex - -> "Even human vibes can be encoded." - -There are always ways to improve evals. The `learn` command creates a flywheel that manual tests never have. - -### Decoupled from the Coding Agent - -Evalbuff runs separately from the coding agent. This: - -- Gets around the subsidized coding agent pricing problem -- Works with ANY coding agent, not just Codebuff -- Makes `evalbuff context` a viral hook — it makes every coding agent better - -### The Context Command as a Trojan Horse - -`evalbuff context` is the easiest entry point. No eval setup required. Just install and immediately get better results from whatever coding tool you already use. Once teams see the value, they naturally want `review`, then `run`, then the full flywheel. - -## Open Questions - -- How should LLM provider configuration work? API keys from the user vs. evalbuff-hosted? -- Should `evalbuff run` spin up infrastructure itself, or just validate that the user has set it up? -- What's the pricing model? Per-eval-run? Subscription? Free tier for `context` + `review`? -- How much of the codebuff SDK can we reuse vs. what needs to be evalbuff-specific? -- Should traces be stored locally, in the cloud, or both? -- How do we handle projects with existing test infrastructure (Playwright, Cypress, etc.) — integrate or replace? diff --git a/evalbuff/old/PHASE-1-SPEC.md b/evalbuff/old/PHASE-1-SPEC.md deleted file mode 100644 index 4da7fe3d9a..0000000000 --- a/evalbuff/old/PHASE-1-SPEC.md +++ /dev/null @@ -1,861 +0,0 @@ -# Evalbuff — Phase 1 Spec - -> Phase 1 delivers three CLI commands (`init`, `context`, `review`), authentication, and skill installation. No TUI. Markdown output to stdout. LLM calls go through the Codebuff backend via the SDK. 
- -## Table of Contents - -- [Overview](#overview) -- [Installation](#installation) -- [Authentication](#authentication) -- [Commands](#commands) - - [`evalbuff init`](#evalbuff-init) - - [`evalbuff context`](#evalbuff-context) - - [`evalbuff review`](#evalbuff-review) - - [`evalbuff login`](#evalbuff-login) - - [`evalbuff logout`](#evalbuff-logout) - - [`evalbuff --help` / `--version`](#evalbuff---help----version) -- [Skill Installation](#skill-installation) -- [Initial Project Scan](#initial-project-scan) -- [Configuration File](#configuration-file) -- [Agent Definitions](#agent-definitions) -- [Package Structure](#package-structure) -- [Technical Architecture](#technical-architecture) -- [Error Handling](#error-handling) -- [UX Details](#ux-details) -- [Non-Goals](#non-goals) -- [Acceptance Criteria](#acceptance-criteria) - ---- - -## Overview - -Phase 1 is the minimum useful product: a developer installs evalbuff, runs `evalbuff init` in their project, and immediately gets two capabilities: - -1. **`evalbuff context `** — any coding agent (or human) can call this to get relevant files, background knowledge, and gotchas before starting work. -2. **`evalbuff review [prompt]`** — after making changes, get structured feedback on what went wrong and why. The optional prompt provides context about the original request, giving the reviewer deeper understanding of intent. - -`evalbuff init` also installs a **skill file** into the project so that coding agents (Codebuff, Claude Code) automatically know to call these commands. - -## Installation - -Evalbuff is published to npm as a standalone package: - -```bash -npm install -g evalbuff -``` - -The package is built as a compiled binary (same approach as the Codebuff CLI — using `bun build --compile`), so users don't need Bun or Node installed. The npm package uses platform-specific optional dependencies (like esbuild and turbo do) to download the correct binary. 
- -For CI, install globally and cache the binary, or use `npx`: - -```bash -npx evalbuff review --branch main -``` - -## Authentication - -Evalbuff uses the same Codebuff backend and user accounts. Authentication works identically to the Codebuff CLI. - -### Login Flow - -1. User runs any command that requires auth (or explicitly runs `evalbuff login`). -2. CLI opens a browser to the Codebuff login page. -3. User authenticates in the browser. -4. CLI polls for authentication completion, stores credentials locally. - -### Credential Storage - -- Credentials are stored at `~/.config/evalbuff/credentials.json` (separate from Codebuff credentials). -- Same schema: `{ "default": { "name", "email", "authToken", ... } }`. -- If the user is already logged into Codebuff, evalbuff could detect this and offer to reuse the session (stretch goal — not required for Phase 1). - -### CI / Non-Interactive Auth - -- The `EVALBUFF_API_KEY` environment variable provides auth in CI environments. -- When set, it takes precedence over stored credentials. -- No browser login is triggered when an API key is present. - ---- - -## Commands - -### `evalbuff init` - -Initialize evalbuff in a project. Sets up configuration, installs skill files, and runs an initial project scan. - -#### Usage - -``` -evalbuff init [options] -``` - -#### Options - -| Flag | Description | -|------|-------------| -| `--cwd ` | Project root directory (defaults to current directory) | -| `--skip-scan` | Skip the initial project scan, just create config and install skills | -| `--force` | Overwrite existing configuration and skill files without prompting (does NOT overwrite knowledge files) | - -#### Behavior - -1. **Check authentication** — trigger login flow if not authenticated. -2. **Detect project root** — find the nearest git root or use `--cwd`. -3. **Check if already initialized** — if `evalbuff.json` exists, prompt to overwrite config and skill files (or use `--force`). 
Knowledge files are never overwritten by `--force`. -4. **Create configuration file** — write `.agents/evals/evalbuff.json` with defaults. -5. **Install skill files** — write `SKILL.md` to both: - - `.agents/skills/evalbuff/SKILL.md` - - `.claude/skills/evalbuff/SKILL.md` -6. **Create knowledge directory** — ensure `.agents/knowledge/` exists. -7. **Run initial project scan** — unless `--skip-scan`, execute the Scan Agent (see [Initial Project Scan](#initial-project-scan)) to bootstrap knowledge files. If knowledge files already exist, the scan agent merges new observations rather than overwriting. -8. **Print summary** — show what was created, where skill files were installed, and suggest next steps. - -#### Output - -``` -✓ Created .agents/evals/evalbuff.json -✓ Installed skill to .agents/skills/evalbuff/SKILL.md -✓ Installed skill to .claude/skills/evalbuff/SKILL.md -✓ Generated project knowledge (4 files) - -Evalbuff is ready! Your coding agents will now automatically use evalbuff for context and review. - -Try it: - evalbuff context "add user authentication" - evalbuff review -``` - ---- - -### `evalbuff context` - -Returns relevant files, background knowledge, and gotchas for a given prompt. Designed to be called by coding agents before starting a task, or by humans to explore what's relevant. - -#### Usage - -``` -evalbuff context [options] -``` - -#### Options - -| Flag | Description | -|------|-------------| -| `--cwd ` | Project root directory (defaults to current directory) | -| `--max-files ` | Maximum number of files to return (default: 15) | -| `--files-only` | Output only file paths, one per line (for piping) | - -#### Behavior - -1. **Check authentication** — trigger login flow if not authenticated. -2. **Locate project root** — find nearest git root or use `--cwd`. -3. **Load configuration** — read `evalbuff.json` if it exists (works without init, with a warning). -4. 
**Execute the Context Agent** — send the prompt, project file tree, and any existing knowledge to the Codebuff backend via SDK. -5. **Output markdown to stdout**. - -#### Progress Feedback - -Since `context` involves LLM calls that may take 10-30 seconds, the CLI writes progress indicators to **stderr** (keeping stdout clean for the markdown output): - -``` -⠋ Scanning project structure... -⠋ Finding relevant files... -⠋ Synthesizing context... -``` - -The spinner and status messages go to stderr so that piping stdout (e.g. `evalbuff context "add auth" > context.md`) works cleanly. In non-TTY environments (CI), progress messages are suppressed. - -#### Output Format - -The output is markdown with three sections: - -```markdown -## Relevant Files - -- **`src/auth/login.ts`** — Handles user login flow, validates credentials, issues JWT tokens -- **`src/middleware/auth-guard.ts`** — Express middleware that checks JWT on protected routes -- **`src/db/models/user.ts`** — User model with password hashing and verification methods -- **`tests/auth/login.test.ts`** — Existing tests for the login flow - -## Background - -This project uses Express with JWT authentication. The auth system was recently -refactored (see commit abc123) to use refresh tokens. The User model uses bcrypt -for password hashing with a cost factor of 12. - -The API follows REST conventions with routes defined in `src/routes/index.ts`. -Auth routes are mounted at `/api/auth/*`. - -## Gotchas - -- The JWT secret is loaded from `process.env.JWT_SECRET` — make sure it's set in `.env.test` for tests. -- The User model has a `beforeSave` hook that auto-hashes passwords — don't hash manually. -- Rate limiting is applied to `/api/auth/login` (5 attempts per minute) — tests need to account for this. 
-``` - -When `--files-only` is passed, output is just the file paths: - -``` -src/auth/login.ts -src/middleware/auth-guard.ts -src/db/models/user.ts -tests/auth/login.test.ts -``` - -#### Without Init - -If evalbuff has not been initialized (no `evalbuff.json`), the command still works but: -- Prints a warning to stderr: `Warning: evalbuff not initialized. Run "evalbuff init" for better results.` -- The "Background" and "Gotchas" sections will be less informed (no project knowledge to draw from). -- File picking still works based on the file tree and code search. - ---- - -### `evalbuff review` - -Reviews code changes and outputs structured feedback. Designed for coding agents to self-check, for CI to gate PRs, or for humans to get a second opinion. - -The optional `[prompt]` provides context about the original user request and what the reviewer should focus on. This is especially valuable when a coding agent calls `evalbuff review` — it can pass along the user's original instructions so the reviewer understands the *intent* behind the changes, not just the diff. - -#### Usage - -``` -evalbuff review [prompt] [options] -``` - -#### Options - -| Flag | Description | -|------|-------------| -| `--cwd <path>` | Project root directory (defaults to current directory) | -| `--files <paths...>` | Scope the review to specific files | -| `--branch [base]` | Compare current branch against a base branch (defaults to `main` or configured default branch) | -| `--commit <sha>` | Review a specific commit | -| `--staged` | Review only staged changes (`git diff --cached`) | - -#### Prompt - -The prompt is an optional positional argument. It tells the Review Agent what the user originally asked for and what aspects to pay attention to.
Examples: - -```bash -# Coding agent passes along the user's original request -evalbuff review "The user asked to add JWT authentication to the API routes" - -# Human describes what they were working on -evalbuff review "Refactored the database layer to use connection pooling" - -# With additional options -evalbuff review "Add pagination to the /users endpoint" --branch main -evalbuff review "Fix the race condition in the queue worker" --staged -evalbuff review "Migrate from Express to Fastify" --files src/server.ts src/routes/index.ts -``` - -When a prompt is provided, the Review Agent uses it to: -- Verify the changes actually accomplish what was requested -- Check for missing pieces (e.g. "user asked for auth but no tests were added") -- Evaluate whether the approach is appropriate for the stated goal -- Provide more targeted, relevant feedback - -Without a prompt, the Review Agent still works — it just reviews the diff on its own merits without knowledge of the original intent. - -#### Input Modes - -1. **Default (no file scoping)** — reviews all uncommitted changes (staged + unstaged): `git diff HEAD` -2. **Specific files** — `evalbuff review --files src/auth.ts src/db.ts` — reviews uncommitted changes in those files only -3. **Branch comparison** — `evalbuff review --branch` — reviews the diff between the current branch and its merge base with the default branch (e.g. `main`). Optionally specify a different base: `evalbuff review --branch develop` -4. **Staged only** — `evalbuff review --staged` — reviews only staged changes -5. **Specific commit** — `evalbuff review --commit abc123` — reviews the diff introduced by that commit - -#### Behavior - -1. **Check authentication** — trigger login flow if not authenticated. -2. **Locate project root** — find nearest git root or use `--cwd`. -3. **Collect the diff** — use the appropriate `git diff` command based on input mode. -4. **Bail if empty** — if there's no diff, print a message and exit cleanly. -5. 
**Load project knowledge** — read `.agents/knowledge/` files if they exist. -6. **Execute the Review Agent** — send the prompt (if provided), diff, file context (full files being modified), and knowledge to the backend via SDK. -7. **Output markdown to stdout**. - -#### Output Format - -When a prompt is provided (e.g. `evalbuff review "Add JWT authentication to the API routes"`), the output includes a **Goal Assessment** subsection: - -```markdown -## Review Summary - -Reviewed 4 files with 127 lines changed. Found 1 critical issue, 2 warnings, and 3 suggestions. - -### Goal Assessment - -**Prompt:** "Add JWT authentication to the API routes" - -✅ JWT token generation and verification is implemented in `src/auth/jwt.ts`. -✅ Auth middleware is applied to protected routes. -⚠️ No refresh token mechanism — the prompt didn't specify this, but the token expiry is set to 15 minutes with no way to renew without re-login. -❌ The `/api/admin/*` routes are not protected — these likely need auth too. - -## Issues -``` - -When no prompt is provided, the Goal Assessment subsection is omitted and the output begins directly with the summary stats: - -```markdown -## Review Summary - -Reviewed 4 files with 127 lines changed. Found 1 critical issue, 2 warnings, and 3 suggestions. - -## Issues - -### 🔴 Critical: SQL injection vulnerability in user search - -**`src/db/queries/users.ts:45`** - -The `searchUsers` function interpolates user input directly into a SQL query string. -This allows arbitrary SQL injection. - -```ts -// Current (vulnerable) -const query = `SELECT * FROM users WHERE name LIKE '%${searchTerm}%'` - -// Suggested fix -const query = `SELECT * FROM users WHERE name LIKE $1` -const params = [`%${searchTerm}%`] -``` - ---- - -### 🟡 Warning: Missing error handling in auth middleware - -**`src/middleware/auth-guard.ts:23`** - -The JWT verification call doesn't handle the case where the token is malformed -(not just expired). 
This will throw an unhandled exception and crash the process. - ---- - -### 🟡 Warning: Test coverage gap - -**`src/auth/login.ts`** - -The new `rememberMe` parameter changes token expiry but no tests cover this behavior. -Consider adding tests for both `rememberMe: true` and `rememberMe: false`. - -## Suggestions - -- 💡 Consider adding input validation for the `email` field in `src/auth/register.ts` — currently accepts any string. -- 💡 The `findUserByEmail` query in `src/db/queries/users.ts` could use a database index on `email` for better performance. -- 💡 The error messages in `src/auth/login.ts` distinguish between "user not found" and "wrong password" — this leaks information about valid accounts. Consider a generic "invalid credentials" message. - -## Stats - -| Metric | Value | -|--------|-------| -| Files reviewed | 4 | -| Lines changed | +89 / -38 | -| Critical issues | 1 | -| Warnings | 2 | -| Suggestions | 3 | -``` - -#### Progress Feedback - -Since `review` involves LLM calls that may take 10-30 seconds, the CLI writes progress indicators to **stderr** (keeping stdout clean for the markdown output): - -``` -⠋ Collecting diff... -⠋ Analyzing 4 changed files... -⠋ Generating review... -``` - -The spinner and status messages go to stderr so that piping stdout (e.g. `evalbuff review > review.md`) works cleanly. In non-TTY environments (CI), progress messages are suppressed. - -#### Exit Codes - -| Code | Meaning | -|------|---------| -| `0` | Review complete, no critical issues | -| `1` | Review complete, critical issues found | -| `2` | Error (auth failure, network error, not a git repo, etc.) | - -The non-zero exit on critical issues makes `evalbuff review` usable as a CI gate: - -```yaml -# GitHub Actions example -- name: Evalbuff Review - run: evalbuff review "PR changes" --branch main - env: - EVALBUFF_API_KEY: ${{ secrets.EVALBUFF_API_KEY }} -``` - ---- - -### `evalbuff login` - -Explicitly trigger the authentication flow. 
- -#### Usage - -``` -evalbuff login -``` - -#### Behavior - -1. Open browser to Codebuff login page. -2. Poll for completion. -3. Store credentials at `~/.config/evalbuff/credentials.json`. -4. Print success message with user email. - ---- - -### `evalbuff logout` - -Clear stored credentials. - -#### Usage - -``` -evalbuff logout -``` - -#### Behavior - -1. Remove stored credentials from `~/.config/evalbuff/credentials.json`. -2. Print confirmation. - ---- - -### `evalbuff --help` / `--version` - -Standard help and version output. - -``` -$ evalbuff --help - -evalbuff — Codebase-specific evals, context, and review for AI coding agents - -Commands: - init Initialize evalbuff in a project - context <prompt> Get relevant files, knowledge, and gotchas for a task - review [prompt] Review code changes with structured feedback - login Authenticate with evalbuff - logout Clear stored credentials - -Options: - --cwd <path> Project root directory - --help Show help - --version Show version -``` - ---- - -## Skill Installation - -The installed `SKILL.md` is the integration layer that makes coding agents aware of evalbuff. It's a markdown file with YAML frontmatter, following the standard skill format. - -### Template - -```markdown ---- -name: evalbuff -description: Use evalbuff to get project context before coding and review changes before committing ---- - -# Evalbuff - -This project uses evalbuff for AI-assisted context gathering and change review. - -## Before Starting a Task - -Run evalbuff to get oriented before making changes: - - evalbuff context "<description of the task>" - -This returns: -- **Relevant files** with summaries — so you know what to read -- **Background knowledge** about the systems involved -- **Gotchas and lessons** from past work — so you avoid known pitfalls - -Use this output to inform which files to read and what to watch out for. - -## After Making Changes - -Run evalbuff to review your changes before considering the task complete.
Include a description of what the user originally asked for so the reviewer can verify the changes match the intent: - - evalbuff review "<description of what the user asked for>" - -This returns structured feedback including: -- 🔴 **Critical issues** that must be fixed -- 🟡 **Warnings** that should be addressed -- 💡 **Suggestions** for improvement -- Whether the changes actually accomplish the stated goal - -If there are critical issues (🔴), fix them and re-run the review. -If there are only warnings and suggestions, use your judgment. - -## Tips - -- Always run `evalbuff context` first — it often surfaces non-obvious files and gotchas. -- Always pass the user's original request to `evalbuff review` — this helps catch missing requirements and verify the changes match intent. -- Run `evalbuff review` even for small changes — it catches things like missing error handling, test gaps, and convention violations. -- You can review specific files: `evalbuff review "add auth" --files src/auth.ts src/db.ts` -- You can review staged changes only: `evalbuff review "fix login bug" --staged` -``` - -### Installation Targets - -`evalbuff init` writes this file to: - -1. **`.agents/skills/evalbuff/SKILL.md`** — discovered by Codebuff and any SDK-based agent -2. **`.claude/skills/evalbuff/SKILL.md`** — discovered by Claude Code - -Both files have identical content. - ---- - -## Initial Project Scan - -When `evalbuff init` runs (without `--skip-scan`), it executes the **Scan Agent** to analyze the project and bootstrap knowledge files. - -### What the Scan Agent Does - -1. **Reads the project file tree** — directory structure, file types, key config files. -2. **Identifies the tech stack** — languages, frameworks, build tools, package managers (from `package.json`, `Cargo.toml`, `requirements.txt`, `build.gradle`, etc.). -3. **Detects architectural patterns** — monorepo vs single package, microservices, API structure, frontend/backend split. -4.
**Finds existing test infrastructure** — test frameworks, test directories, CI configuration. -5. **Reads key configuration files** — linter configs, CI workflows, Dockerfiles, etc. -6. **Scans for existing knowledge** — `README.md`, `CONTRIBUTING.md`, `AGENTS.md`, `knowledge.md`, existing skill files. - -### Generated Knowledge Files - -The scan generates markdown files in `.agents/knowledge/`: - -| File | Contents | -|------|----------| -| `architecture.md` | High-level overview: project type, directory structure, how components relate | -| `tech-stack.md` | Languages, frameworks, key dependencies, build system, runtime | -| `conventions.md` | Coding patterns observed: naming, file organization, error handling patterns | -| `testing.md` | Test frameworks, test directory layout, how to run tests, CI setup | - -These files are read by the Context and Review agents to provide more informed output. - -### Scan Agent Tools - -The Scan Agent needs access to: -- **File read** — read config files, README, etc. -- **Directory listing** — understand project structure -- **Code search** — find patterns, imports, test files -- **File tree** — get the full project layout - ---- - -## Configuration File - -Located at `.agents/evals/evalbuff.json`. 
- -### Schema - -```json -{ - "version": 1, - "project": { - "name": "my-project", - "description": "Brief description of the project" - }, - "context": { - "maxFiles": 15, - "excludePatterns": [ - "dist/**", - "node_modules/**", - "*.generated.ts" - ] - }, - "review": { - "defaultBranch": "main" - } -} -``` - -### Fields - -| Field | Type | Required | Description | -|-------|------|----------|-------------| -| `version` | `number` | Yes | Config version, always `1` for Phase 1 | -| `project.name` | `string` | No | Project name (auto-detected from package.json or directory name) | -| `project.description` | `string` | No | Brief project description (auto-detected from README or package.json) | -| `context.maxFiles` | `number` | No | Default max files returned by `context` (default: 15) | -| `context.excludePatterns` | `string[]` | No | Glob patterns to exclude from context file picking | -| `review.defaultBranch` | `string` | No | Branch to compare against in `--branch` mode (default: "main") | - ---- - -## Agent Definitions - -Phase 1 requires three agents, all defined as Codebuff SDK agent definitions and executed against the Codebuff backend. - -### Scan Agent - -**Purpose:** Analyze a project during `evalbuff init` and generate knowledge files. - -**Input:** -- Project file tree -- Contents of key config files (auto-detected) - -**Output:** -- Creates/writes knowledge markdown files to `.agents/knowledge/` - -**Tools:** file read, directory listing, code search, file write (restricted to `.agents/knowledge/` only) - -The Scan Agent generates a fixed set of knowledge files (`architecture.md`, `tech-stack.md`, `conventions.md`, `testing.md`). It does not create arbitrary files. If these files already exist, it reads them first and merges new observations rather than replacing user-curated content. - -### Context Agent - -**Purpose:** Given a user prompt, return relevant files, background knowledge, and gotchas. 
- -**Input:** -- The user's prompt (what they're about to work on) -- Project file tree -- Contents of `.agents/knowledge/*.md` -- `evalbuff.json` configuration - -**Output:** -- Markdown to stdout with three sections: Relevant Files, Background, Gotchas - -**Tools:** file read, directory listing, code search (all read-only — no writes) - -### Review Agent - -**Purpose:** Given code changes and (optionally) the original user request, return structured review feedback. - -**Input:** -- The user's prompt describing what was requested and what to review (optional — if omitted, the agent reviews the diff on its own merits) -- The git diff -- Full contents of modified files (for context around the diff) -- Contents of `.agents/knowledge/*.md` -- `evalbuff.json` configuration - -When a prompt is provided, the Review Agent evaluates both the *quality* of the code changes and whether they *fulfill the stated intent*. This means it can catch issues like: -- Missing requirements ("the user asked for pagination but there's no limit/offset parameter") -- Scope creep ("the changes also refactored the logger, which wasn't requested") -- Wrong approach ("the user asked for JWT auth but the changes implement session-based auth") - -**Output:** -- Markdown to stdout with sections: Review Summary, Issues (🔴/🟡), Suggestions (💡), Stats -- When a prompt was provided, the Review Summary includes a **Goal Assessment** — whether the changes accomplish the stated objective -- Exit code: 0 if no critical issues, 1 if critical issues found - -**Tools:** file read, code search (all read-only — no writes) - ---- - -## Package Structure - -Everything lives within the monorepo under `evalbuff/`. 
- -``` -evalbuff/ -├── cli/ -│ ├── src/ -│ │ ├── index.ts # Entry point, argument parsing -│ │ ├── commands/ -│ │ │ ├── init.ts # evalbuff init -│ │ │ ├── context.ts # evalbuff context -│ │ │ ├── review.ts # evalbuff review [prompt] -│ │ │ ├── login.ts # evalbuff login -│ │ │ └── logout.ts # evalbuff logout -│ │ ├── utils/ -│ │ │ ├── auth.ts # Credential storage and retrieval -│ │ │ ├── config.ts # evalbuff.json reading/writing -│ │ │ ├── git.ts # Git operations (diff, branch detection) -│ │ │ ├── knowledge.ts # Reading/writing knowledge files -│ │ │ ├── output.ts # Markdown formatting helpers -│ │ │ └── project.ts # Project root detection, file tree -│ │ └── templates/ -│ │ └── SKILL.md # Skill template to install -│ ├── package.json -│ └── tsconfig.json -├── agents/ -│ ├── scan-agent.ts # Scan Agent definition (SDK agent) -│ ├── context-agent.ts # Context Agent definition (SDK agent) -│ └── review-agent.ts # Review Agent definition (SDK agent) -├── BRAINSTORM.md -├── PHASE-1-SPEC.md -└── README.md -``` - -### Dependencies - -The `evalbuff/cli` package depends on: -- `@codebuff/sdk` — for executing agents against the Codebuff backend -- `commander` — for CLI argument parsing -- `zod` — for config schema validation - -It does **not** depend on the full Codebuff CLI (no TUI framework, no React, no OpenTUI). 
- ---- - -## Technical Architecture - -``` -┌─────────────────────────────────────────────────────┐ -│ User's Terminal │ -│ │ -│ $ evalbuff context "add user auth" │ -│ │ -│ ┌─────────────────────┐ │ -│ │ evalbuff CLI │ │ -│ │ (argument parsing, │ │ -│ │ auth, git ops) │ │ -│ └──────────┬──────────┘ │ -│ │ │ -│ ▼ │ -│ ┌─────────────────────┐ ┌────────────────────┐ │ -│ │ @codebuff/sdk │────▶│ Local Tools │ │ -│ │ (agent execution) │◀────│ (file read, code │ │ -│ └──────────┬──────────┘ │ search, dir list) │ │ -│ │ └────────────────────┘ │ -└─────────────┼───────────────────────────────────────┘ - │ HTTPS (LLM calls) - ▼ - ┌──────────────────┐ - │ Codebuff Backend │ - │ (same server as │ - │ Codebuff CLI) │ - └──────────────────┘ -``` - -- **CLI layer** handles argument parsing, auth, git operations, and formatting. -- **SDK layer** handles agent execution — sending prompts to the backend, processing tool calls locally. -- **Tools execute locally** — file reads, code search, directory listing all happen on the user's machine. Only the LLM inference calls go to the backend. -- **Output is markdown to stdout** — no TUI rendering, no interactive elements. - ---- - -## Error Handling - -| Scenario | Behavior | -|----------|----------| -| Not in a git repository | `review` exits with error: `"Not a git repository. Run from within a git repo."` · `context` and `init` still work (review needs git for diffs) | -| Not initialized | `context` and `review` work with a warning to stderr: `"evalbuff not initialized. Run 'evalbuff init' for better results."` · Knowledge sections will be sparse | -| No changes to review | Clean exit (code 0): `"No changes to review."` | -| Auth expired / invalid | Prompt to re-login (interactive) or fail with clear message (CI) | -| Network error | `"Failed to connect to evalbuff backend. 
Check your internet connection and try again."` Exit code 2 | -| `evalbuff.json` malformed | Warning to stderr with specific parse error, fall back to defaults | -| Already initialized | Prompt: `"evalbuff is already initialized. Overwrite? (y/N)"` · `--force` skips prompt | -| LLM rate limit / quota | `"Rate limit exceeded. Please try again in a moment."` or `"Insufficient credits. Visit codebuff.com for more."` Exit code 2 | - ---- - -## UX Details - -### Progress Indicators - -All commands that make LLM calls (`init` scan, `context`, `review`) show a spinner with status messages on **stderr**. This keeps stdout clean for machine-readable output. - -- Spinners use a simple braille animation (`⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏`) -- Status messages update as the operation progresses -- In non-TTY environments (piped output, CI), spinners are suppressed entirely -- On error, the spinner is cleared before printing the error message - -### Credit Usage Feedback - -After every command that consumes credits (`init`, `context`, `review`), a one-line credit usage summary is printed to **stderr**: - -``` -✓ Done (0.12 credits used) -``` - -This helps users track their consumption without cluttering the main output. - -### Streaming vs. Buffered Output - -For Phase 1, output is **buffered** — the full markdown is written to stdout only after the agent completes. This simplifies implementation and ensures the output is always well-formed markdown. - -Streaming output (printing markdown sections as they arrive) is a future improvement. The spinner on stderr provides feedback while the user waits. 
- -## Non-Goals - -The following are explicitly out of scope for Phase 1: - -- **TUI** — no interactive mode, no `evalbuff` with no args -- **`evalbuff run`** — no eval task execution -- **`evalbuff learn`** — no self-improvement loop -- **`evalbuff refresh`** — no commit scanning -- **Task definitions** — no `.agents/evals/tasks/` directory -- **Traces** — no historical run storage -- **Cursor / Windsurf / Copilot skill targets** — only `.agents/` and `.claude/` -- **JSON output format** — markdown only (JSON can be added later via `--format`) -- **Cloud storage** — everything is local to the project -- **Custom agent definitions** — only the three built-in agents - ---- - -## Acceptance Criteria - -### Authentication - -- [ ] `evalbuff login` opens browser and completes auth flow -- [ ] Credentials are stored at `~/.config/evalbuff/credentials.json` -- [ ] `evalbuff logout` clears stored credentials -- [ ] `EVALBUFF_API_KEY` env var works for non-interactive auth -- [ ] Commands that need auth trigger login automatically if not authenticated - -### `evalbuff init` - -- [ ] Creates `.agents/evals/evalbuff.json` with valid default configuration -- [ ] Installs `SKILL.md` to `.agents/skills/evalbuff/SKILL.md` -- [ ] Installs `SKILL.md` to `.claude/skills/evalbuff/SKILL.md` -- [ ] Creates `.agents/knowledge/` directory -- [ ] Runs initial project scan and generates knowledge files (architecture, tech-stack, conventions, testing) -- [ ] `--skip-scan` skips the scan but still creates config and skills -- [ ] `--force` overwrites without prompting -- [ ] Prompts before overwriting existing configuration -- [ ] Prints a clear summary of what was created - -### `evalbuff context` - -- [ ] Accepts a prompt string and returns markdown to stdout -- [ ] Output contains: Relevant Files (with summaries), Background, Gotchas sections -- [ ] `--max-files` limits the number of files returned -- [ ] `--files-only` outputs just file paths, one per line -- [ ] Works without `evalbuff 
init` (with warning to stderr) -- [ ] Uses project knowledge when available for richer output -- [ ] Exit code 0 on success, 2 on error - -### `evalbuff review` - -- [ ] Accepts an optional `[prompt]` positional argument describing the original request and review focus -- [ ] When a prompt is provided, the review includes a Goal Assessment evaluating whether changes fulfill the stated intent -- [ ] When no prompt is provided, the review evaluates changes on their own merits -- [ ] Default: reviews all uncommitted changes (staged + unstaged) -- [ ] `--files ` scopes the review to specific files -- [ ] `--branch [name]` compares against a branch -- [ ] `--staged` reviews only staged changes -- [ ] `--commit ` reviews a specific commit -- [ ] Output contains: Review Summary (with Goal Assessment if prompt given), Issues (🔴/🟡), Suggestions (💡), Stats -- [ ] Exit code 0 when no critical issues, 1 when critical issues found, 2 on error -- [ ] Prints clean message and exits 0 when there are no changes to review -- [ ] Uses project knowledge for more informed feedback -- [ ] Works without `evalbuff init` (with warning to stderr) - -### Skill Installation - -- [ ] Installed SKILL.md follows the standard frontmatter format (`name`, `description`) -- [ ] Skill content explains when and how to call `evalbuff context` and `evalbuff review` -- [ ] Skill content describes expected output format -- [ ] Both `.agents/skills/` and `.claude/skills/` targets are created - -### UX - -- [ ] Progress spinners display on stderr during LLM calls -- [ ] Spinners are suppressed in non-TTY environments -- [ ] Credit usage summary prints to stderr after each command that uses credits - -### General - -- [ ] `evalbuff --help` prints usage information for all commands -- [ ] `evalbuff --version` prints the current version -- [ ] `--cwd ` works on all commands to set the project root -- [ ] All errors produce clear, actionable messages -- [ ] All output goes to stdout (warnings/errors to stderr) 
-- [ ] Package installs correctly via `npm install -g evalbuff` diff --git a/evalbuff/old/README.md b/evalbuff/old/README.md deleted file mode 100644 index 538dc3c280..0000000000 --- a/evalbuff/old/README.md +++ /dev/null @@ -1,37 +0,0 @@ -# Evalbuff - -Codebase-specific evals, context, and review for AI coding agents. - -## Quick Start - -```bash -# Initialize evalbuff in your project -evalbuff init - -# Get context before starting a task -evalbuff context "add user authentication" - -# Review your changes -evalbuff review "added JWT auth to API routes" -``` - -## Commands - -| Command | Description | -|---------|-------------| -| `evalbuff init` | Initialize evalbuff in a project | -| `evalbuff context ` | Get relevant files, knowledge, and gotchas | -| `evalbuff review [prompt]` | Review code changes with structured feedback | -| `evalbuff login` | Authenticate with evalbuff | -| `evalbuff logout` | Clear stored credentials | - -## Development - -From the monorepo root: - -```bash -bun install -bun --cwd evalbuff/cli run dev -- --help -``` - -See [PHASE-1-SPEC.md](./PHASE-1-SPEC.md) for the full specification. diff --git a/evalbuff/old/agents/context-agent.ts b/evalbuff/old/agents/context-agent.ts deleted file mode 100644 index 7fc7b8ff2c..0000000000 --- a/evalbuff/old/agents/context-agent.ts +++ /dev/null @@ -1,56 +0,0 @@ -import type { AgentDefinition } from '@codebuff/sdk' - -export const contextAgent: AgentDefinition = { - id: 'evalbuff-context', - displayName: 'Evalbuff Context Agent', - model: 'anthropic/claude-sonnet-4.5', - toolNames: ['read_files', 'list_directory', 'code_search', 'glob', 'end_turn'], - spawnableAgents: [], - outputMode: 'last_message', - inputSchema: { - prompt: { - type: 'string', - description: 'What the user is about to work on', - }, - }, - - systemPrompt: `You are the evalbuff Context Agent. 
Given a description of what a developer (or AI coding agent) is about to work on, you find the most relevant files, provide background knowledge, and surface potential gotchas. - -Your output MUST be well-formatted markdown with exactly three sections: - -## Relevant Files - -A bullet list of the most relevant files, each with a bold file path and a brief summary: -- **\`path/to/file.ts\`** — What this file does and why it's relevant - -Order files by relevance (most relevant first). Include test files if relevant. - -## Background - -Provide context about the systems, patterns, and architecture involved. Reference specific files and patterns. This should help someone unfamiliar with this area of the codebase get oriented quickly. - -## Gotchas - -List potential pitfalls, non-obvious behaviors, edge cases, or things that have caused problems before. Be specific: -- Reference specific files, functions, or configuration -- Explain WHY something is a gotcha, not just WHAT it is -- Include environment setup requirements if relevant - -Rules: -- Use the tools available to explore the codebase. Read files, search for patterns, list directories. -- Be thorough but concise. Quality over quantity. -- If project knowledge files exist, they were provided in the context — use them. -- Output ONLY the markdown. No preamble or explanation outside the three sections.`, - - instructionsPrompt: `Find the most relevant files and context for the user's task. Use your tools: - -1. Think about what areas of the codebase are likely relevant based on the prompt. -2. List directories to understand the project structure. -3. Use code_search to find relevant patterns, imports, and definitions. -4. Read the most important files to understand them. -5. Use glob to find files matching relevant patterns. - -Then output your findings as markdown with the three required sections: Relevant Files, Background, Gotchas. - -Do NOT output anything besides the markdown. 
No tool calls after you start writing the markdown output.`, -} diff --git a/evalbuff/old/agents/review-agent.ts b/evalbuff/old/agents/review-agent.ts deleted file mode 100644 index 0f149e6f38..0000000000 --- a/evalbuff/old/agents/review-agent.ts +++ /dev/null @@ -1,97 +0,0 @@ -import type { AgentDefinition } from '@codebuff/sdk' - -export const reviewAgent: AgentDefinition = { - id: 'evalbuff-review', - displayName: 'Evalbuff Review Agent', - model: 'anthropic/claude-sonnet-4.5', - toolNames: ['read_files', 'code_search', 'end_turn'], - spawnableAgents: [], - outputMode: 'last_message', - inputSchema: { - prompt: { - type: 'string', - description: 'The diff to review, along with optional context about the original request', - }, - }, - - systemPrompt: `You are the evalbuff Review Agent. You review code changes and provide structured, actionable feedback. - -You receive a git diff and optionally the original user request that motivated the changes. Your job is to find real issues, not nitpick. - -Your output MUST be well-formatted markdown following this structure: - -## Review Summary - -Start with a one-line summary: "Reviewed N files with M lines changed. Found X critical issues, Y warnings, and Z suggestions." - -If a prompt describing the original request was provided, include a **Goal Assessment** subsection: - -### Goal Assessment - -**Prompt:** "" - -Use ✅ for things that are done correctly, ⚠️ for partial/concerning, and ❌ for missing or wrong: -- ✅ Description of what was accomplished correctly -- ⚠️ Description of concern -- ❌ Description of what's missing or wrong - -## Issues - -List issues grouped by severity. Use this format for each: - -### 🔴 Critical: - -**\`file/path.ts:line\`** - -Explanation of the issue and why it's critical. - -\`\`\`ts -// Current (problematic) -code here - -// Suggested fix -fixed code here -\`\`\` - ---- - -### 🟡 Warning: - -**\`file/path.ts:line\`** - -Explanation. 
- -## Suggestions - -- 💡 Suggestion with file reference and explanation. -- 💡 Another suggestion. - -## Stats - -| Metric | Value | -|--------|-------| -| Files reviewed | N | -| Lines changed | +X / -Y | -| Critical issues | N | -| Warnings | N | -| Suggestions | N | - -Rules: -- 🔴 Critical: Security vulnerabilities, data loss risks, crashes, logic errors that break functionality. -- 🟡 Warning: Missing error handling, test gaps, potential performance issues, convention violations. -- 💡 Suggestion: Style improvements, better approaches, refactoring opportunities. -- Be specific: reference exact file paths and line numbers. -- Provide code fixes for critical issues when possible. -- Use the available tools to read full files for context around the diff. -- If there are no issues, say so clearly. Don't invent problems. -- Output ONLY the markdown. No preamble.`, - - instructionsPrompt: `Review the provided code changes. You may use tools to read the full contents of modified files for better context. - -1. Analyze the diff carefully. -2. If file paths are mentioned in the diff, read those files to understand the full context. -3. Use code_search if you need to understand how changed functions are used elsewhere. -4. Write your review following the exact markdown format specified in your system prompt. - -Do NOT output anything besides the review markdown. 
No tool calls after you start writing the review.`, -} diff --git a/evalbuff/old/agents/scan-agent.ts b/evalbuff/old/agents/scan-agent.ts deleted file mode 100644 index bdc8cc2538..0000000000 --- a/evalbuff/old/agents/scan-agent.ts +++ /dev/null @@ -1,46 +0,0 @@ -import type { AgentDefinition } from '@codebuff/sdk' - -export const scanAgent: AgentDefinition = { - id: 'evalbuff-scan', - displayName: 'Evalbuff Scan Agent', - model: 'anthropic/claude-sonnet-4.5', - toolNames: ['read_files', 'list_directory', 'code_search', 'write_file', 'end_turn'], - spawnableAgents: [], - outputMode: 'last_message', - inputSchema: { - prompt: { - type: 'string', - description: 'Instructions for the scan agent', - }, - }, - - systemPrompt: `You are a project analysis agent for evalbuff. Your job is to analyze a software project and generate knowledge files that help AI coding agents understand the project. - -You will analyze the project structure, tech stack, coding conventions, and testing infrastructure, then write your findings as markdown files. - -You MUST write exactly these four files using the write_file tool: -1. \`.agents/knowledge/architecture.md\` — High-level overview: project type, directory structure, how components relate -2. \`.agents/knowledge/tech-stack.md\` — Languages, frameworks, key dependencies, build system, runtime -3. \`.agents/knowledge/conventions.md\` — Coding patterns observed: naming, file organization, error handling patterns -4. \`.agents/knowledge/testing.md\` — Test frameworks, test directory layout, how to run tests, CI setup - -Rules: -- ONLY write files under \`.agents/knowledge/\`. Do not write anywhere else. -- Each file should be concise but informative (aim for 50-200 lines each). -- Use markdown formatting with clear headers. -- Base your analysis on actual evidence from the codebase (config files, imports, directory structure). 
-- If knowledge files already exist, read them first and merge new observations rather than replacing user-curated content.`, - - instructionsPrompt: `Analyze this project thoroughly: - -1. Start by reading key configuration files (package.json, Cargo.toml, requirements.txt, pyproject.toml, build.gradle, Makefile, Dockerfile, etc. — whatever exists). -2. List the top-level directory to understand the project structure. -3. Use code_search to find patterns like import styles, error handling, test frameworks. -4. Read a few representative source files to understand coding conventions. -5. Look for CI configuration (.github/workflows/, .gitlab-ci.yml, etc.). -6. Check for existing knowledge files in \`.agents/knowledge/\` — if they exist, read them first. - -Then write all four knowledge files. Be specific and cite actual file paths and patterns you observed. - -After writing all files, end your turn with a brief summary of what you found.`, -} diff --git a/evalbuff/old/cli/package.json b/evalbuff/old/cli/package.json deleted file mode 100644 index 987856f22d..0000000000 --- a/evalbuff/old/cli/package.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "name": "@codebuff/evalbuff", - "version": "0.1.0", - "description": "Codebase-specific evals, context, and review for AI coding agents", - "private": true, - "type": "module", - "bin": { - "evalbuff": "./src/index.ts" - }, - "scripts": { - "dev": "bun src/index.ts", - "typecheck": "tsc --noEmit -p .", - "test": "bun test" - }, - "dependencies": { - "@codebuff/sdk": "workspace:*", - "@codebuff/common": "workspace:*", - "commander": "^13.1.0", - "zod": "^4.2.1" - }, - "devDependencies": { - "@types/node": "^22.9.0" - } -} diff --git a/evalbuff/old/cli/src/commands/context.ts b/evalbuff/old/cli/src/commands/context.ts deleted file mode 100644 index 4d96059c70..0000000000 --- a/evalbuff/old/cli/src/commands/context.ts +++ /dev/null @@ -1,87 +0,0 @@ -import { CodebuffClient } from '@codebuff/sdk' - -import { contextAgent } from 
'../../../agents/context-agent' -import { ensureAuth } from '../utils/auth' -import { readConfig } from '../utils/config' -import { readKnowledgeFiles } from '../utils/knowledge' -import { printError, printWarning, Spinner } from '../utils/output' -import { findProjectRoot } from '../utils/project' - -interface ContextOptions { - cwd?: string - maxFiles?: string - filesOnly?: boolean -} - -export async function contextCommand( - prompt: string, - options: ContextOptions, -): Promise { - try { - const apiKey = await ensureAuth() - const projectRoot = findProjectRoot(options.cwd) - - const config = readConfig(projectRoot) - if (!config) { - printWarning( - 'evalbuff not initialized. Run "evalbuff init" for better results.', - ) - } - - const maxFiles = options.maxFiles - ? parseInt(options.maxFiles, 10) - : config?.context?.maxFiles ?? 15 - - const knowledgeFiles = readKnowledgeFiles(projectRoot) - - const spinner = new Spinner() - spinner.start('Scanning project structure...') - - const client = new CodebuffClient({ apiKey }) - - let agentPrompt = `Task: ${prompt}\n\nReturn up to ${maxFiles} relevant files.` - - if (options.filesOnly) { - agentPrompt += - '\n\nIMPORTANT: Output ONLY file paths, one per line. No markdown, no summaries, no sections. Just file paths.' - } - - let output = '' - - spinner.update('Finding relevant files...') - - const result = await client.run({ - agent: contextAgent, - prompt: agentPrompt, - cwd: projectRoot, - knowledgeFiles, - maxAgentSteps: 15, - handleStreamChunk: (chunk) => { - if (typeof chunk === 'string') { - output += chunk - } - }, - }) - - spinner.stop() - - if (result.output.type === 'error') { - printError(result.output.message) - process.exit(2) - } - - process.stdout.write(output) - if (output.length > 0 && !output.endsWith('\n')) { - process.stdout.write('\n') - } - - process.stderr.write('✓ Done\n') - } catch (error) { - printError( - error instanceof Error - ? 
error.message - : 'Failed to gather context.', - ) - process.exit(2) - } -} diff --git a/evalbuff/old/cli/src/commands/init.ts b/evalbuff/old/cli/src/commands/init.ts deleted file mode 100644 index dd2e045344..0000000000 --- a/evalbuff/old/cli/src/commands/init.ts +++ /dev/null @@ -1,127 +0,0 @@ -import fs from 'fs' -import path from 'path' -import readline from 'readline' - -import { CodebuffClient } from '@codebuff/sdk' - -import { scanAgent } from '../../../agents/scan-agent' -import { SKILL_TEMPLATE } from '../templates/skill' -import { ensureAuth } from '../utils/auth' -import { - configPath, - getDefaultConfig, - readConfig, - writeConfig, -} from '../utils/config' -import { ensureKnowledgeDir, readKnowledgeFiles } from '../utils/knowledge' -import { printError, Spinner } from '../utils/output' -import { findProjectRoot } from '../utils/project' - -interface InitOptions { - cwd?: string - skipScan?: boolean - force?: boolean -} - -function promptConfirm(question: string): Promise { - const rl = readline.createInterface({ - input: process.stdin, - output: process.stderr, - }) - return new Promise((resolve) => { - rl.question(`${question} (y/N) `, (answer) => { - rl.close() - resolve(answer.toLowerCase() === 'y') - }) - }) -} - -function installSkillFile(projectRoot: string, targetDir: string): string { - const skillPath = path.join(projectRoot, targetDir, 'evalbuff', 'SKILL.md') - const dir = path.dirname(skillPath) - if (!fs.existsSync(dir)) { - fs.mkdirSync(dir, { recursive: true }) - } - fs.writeFileSync(skillPath, SKILL_TEMPLATE) - return path.relative(projectRoot, skillPath) -} - -export async function initCommand(options: InitOptions): Promise { - try { - const apiKey = await ensureAuth() - const projectRoot = findProjectRoot(options.cwd) - - const existingConfig = readConfig(projectRoot) - if (existingConfig && !options.force) { - const shouldOverwrite = await promptConfirm( - 'evalbuff is already initialized. 
Overwrite config and skill files?', - ) - if (!shouldOverwrite) { - process.stderr.write('Aborted.\n') - return - } - } - - const config = getDefaultConfig(projectRoot) - writeConfig(projectRoot, config) - const configRelPath = path.relative(projectRoot, configPath(projectRoot)) - process.stderr.write(`✓ Created ${configRelPath}\n`) - - const agentsSkillPath = installSkillFile( - projectRoot, - '.agents/skills', - ) - process.stderr.write(`✓ Installed skill to ${agentsSkillPath}\n`) - - const claudeSkillPath = installSkillFile( - projectRoot, - '.claude/skills', - ) - process.stderr.write(`✓ Installed skill to ${claudeSkillPath}\n`) - - ensureKnowledgeDir(projectRoot) - - if (!options.skipScan) { - const spinner = new Spinner() - spinner.start('Scanning project...') - - try { - const existingKnowledge = readKnowledgeFiles(projectRoot) - - const client = new CodebuffClient({ apiKey }) - let scanPrompt = 'Analyze this project and generate knowledge files.' - if (Object.keys(existingKnowledge).length > 0) { - scanPrompt += - ' Knowledge files already exist — read them first and merge new observations rather than overwriting.' - } - - const result = await client.run({ - agent: scanAgent, - prompt: scanPrompt, - cwd: projectRoot, - knowledgeFiles: existingKnowledge, - maxAgentSteps: 20, - }) - - if (result.output.type === 'error') { - spinner.fail(`Scan failed: ${result.output.message}`) - } else { - spinner.succeed('Generated project knowledge') - } - } catch (error) { - spinner.fail( - `Scan failed: ${error instanceof Error ? error.message : String(error)}`, - ) - } - } - - process.stderr.write( - `\nEvalbuff is ready! Your coding agents will now automatically use evalbuff for context and review.\n\nTry it:\n evalbuff context "add user authentication"\n evalbuff review\n`, - ) - } catch (error) { - printError( - error instanceof Error ? 
error.message : 'Init failed.', - ) - process.exit(2) - } -} diff --git a/evalbuff/old/cli/src/commands/login.ts b/evalbuff/old/cli/src/commands/login.ts deleted file mode 100644 index 3d4a6a0052..0000000000 --- a/evalbuff/old/cli/src/commands/login.ts +++ /dev/null @@ -1,22 +0,0 @@ -import { loginFlow, getUserCredentials } from '../utils/auth' -import { printError } from '../utils/output' - -export async function loginCommand(): Promise { - try { - const existing = getUserCredentials() - if (existing) { - process.stderr.write( - `Already logged in as ${existing.email}. Run "evalbuff logout" first to switch accounts.\n`, - ) - return - } - - const user = await loginFlow() - process.stderr.write(`\n✓ Logged in as ${user.email}\n`) - } catch (error) { - printError( - error instanceof Error ? error.message : 'Login failed.', - ) - process.exit(2) - } -} diff --git a/evalbuff/old/cli/src/commands/logout.ts b/evalbuff/old/cli/src/commands/logout.ts deleted file mode 100644 index 696ac0b1ff..0000000000 --- a/evalbuff/old/cli/src/commands/logout.ts +++ /dev/null @@ -1,12 +0,0 @@ -import { clearUserCredentials, getUserCredentials } from '../utils/auth' - -export function logoutCommand(): void { - const user = getUserCredentials() - clearUserCredentials() - - if (user) { - process.stderr.write(`✓ Logged out (was ${user.email})\n`) - } else { - process.stderr.write('Already logged out.\n') - } -} diff --git a/evalbuff/old/cli/src/commands/review.ts b/evalbuff/old/cli/src/commands/review.ts deleted file mode 100644 index e2653919fa..0000000000 --- a/evalbuff/old/cli/src/commands/review.ts +++ /dev/null @@ -1,139 +0,0 @@ -import fs from 'fs' -import path from 'path' - -import { CodebuffClient } from '@codebuff/sdk' - -import { reviewAgent } from '../../../agents/review-agent' -import { ensureAuth } from '../utils/auth' -import { readConfig } from '../utils/config' -import { - getDiff, - getChangedFiles, - isGitRepo, -} from '../utils/git' -import { readKnowledgeFiles } from 
'../utils/knowledge' -import { printError, printWarning, Spinner } from '../utils/output' -import { findProjectRoot } from '../utils/project' - -interface ReviewOptions { - cwd?: string - files?: string[] - branch?: string | true - staged?: boolean - commit?: string -} - -export async function reviewCommand( - prompt: string | undefined, - options: ReviewOptions, -): Promise { - try { - const apiKey = await ensureAuth() - const projectRoot = findProjectRoot(options.cwd) - - if (!isGitRepo(projectRoot)) { - printError('Not a git repository. Run from within a git repo.') - process.exit(2) - } - - const config = readConfig(projectRoot) - if (!config) { - printWarning( - 'evalbuff not initialized. Run "evalbuff init" for better results.', - ) - } - - const defaultBranch = config?.review?.defaultBranch ?? 'main' - - const diffOptions = { - cwd: projectRoot, - files: options.files, - branch: options.branch, - staged: options.staged, - commit: options.commit, - defaultBranch, - } - - const diff = getDiff(diffOptions) - - if (!diff.trim()) { - process.stderr.write('No changes to review.\n') - process.exit(0) - } - - const changedFiles = options.files ?? 
getChangedFiles(diffOptions) - - const spinner = new Spinner() - spinner.start('Collecting diff...') - - const fileContents: Record = {} - for (const filePath of changedFiles) { - const absPath = path.join(projectRoot, filePath) - if (fs.existsSync(absPath)) { - try { - fileContents[filePath] = fs.readFileSync(absPath, 'utf8') - } catch { - // skip unreadable files - } - } - } - - const knowledgeFiles = readKnowledgeFiles(projectRoot) - - spinner.update(`Analyzing ${changedFiles.length} changed files...`) - - let agentPrompt = `## Git Diff\n\n\`\`\`diff\n${diff}\n\`\`\`\n\n` - agentPrompt += `## Changed Files (full contents)\n\n` - for (const [filePath, content] of Object.entries(fileContents)) { - agentPrompt += `### ${filePath}\n\n\`\`\`\n${content}\n\`\`\`\n\n` - } - - if (prompt) { - agentPrompt += `## Original Request\n\nThe user's original request was: "${prompt}"\n\nInclude a Goal Assessment in your review that evaluates whether the changes fulfill this intent.\n` - } - - const client = new CodebuffClient({ apiKey }) - - let output = '' - - spinner.update('Generating review...') - - const result = await client.run({ - agent: reviewAgent, - prompt: agentPrompt, - cwd: projectRoot, - knowledgeFiles, - maxAgentSteps: 10, - handleStreamChunk: (chunk) => { - if (typeof chunk === 'string') { - output += chunk - } - }, - }) - - spinner.stop() - - if (result.output.type === 'error') { - printError(result.output.message) - process.exit(2) - } - - process.stdout.write(output) - if (output.length > 0 && !output.endsWith('\n')) { - process.stdout.write('\n') - } - - process.stderr.write('✓ Done\n') - - if (output.includes('🔴')) { - process.exit(1) - } - } catch (error) { - printError( - error instanceof Error - ? 
error.message - : 'Review failed.', - ) - process.exit(2) - } -} diff --git a/evalbuff/old/cli/src/index.ts b/evalbuff/old/cli/src/index.ts deleted file mode 100644 index a6830a1f34..0000000000 --- a/evalbuff/old/cli/src/index.ts +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env bun -import { Command } from 'commander' - -import { contextCommand } from './commands/context' -import { initCommand } from './commands/init' -import { loginCommand } from './commands/login' -import { logoutCommand } from './commands/logout' -import { reviewCommand } from './commands/review' - -const program = new Command() - .name('evalbuff') - .description( - 'Codebase-specific evals, context, and review for AI coding agents', - ) - .version('0.1.0') - -program - .command('init') - .description('Initialize evalbuff in a project') - .option('--cwd ', 'Project root directory') - .option('--skip-scan', 'Skip the initial project scan') - .option('--force', 'Overwrite existing configuration without prompting') - .action(async (options) => { - await initCommand({ - cwd: options.cwd, - skipScan: options.skipScan, - force: options.force, - }) - }) - -program - .command('context') - .description('Get relevant files, knowledge, and gotchas for a task') - .argument('', 'Description of what you are about to work on') - .option('--cwd ', 'Project root directory') - .option('--max-files ', 'Maximum number of files to return') - .option('--files-only', 'Output only file paths, one per line') - .action(async (prompt: string, options) => { - await contextCommand(prompt, { - cwd: options.cwd, - maxFiles: options.maxFiles, - filesOnly: options.filesOnly, - }) - }) - -program - .command('review') - .description('Review code changes with structured feedback') - .argument('[prompt]', 'Description of the original request for goal assessment') - .option('--cwd ', 'Project root directory') - .option('--files ', 'Scope the review to specific files') - .option( - '--branch [base]', - 'Compare current branch against a 
base branch', - ) - .option('--staged', 'Review only staged changes') - .option('--commit ', 'Review a specific commit') - .action(async (prompt: string | undefined, options) => { - await reviewCommand(prompt, { - cwd: options.cwd, - files: options.files, - branch: options.branch, - staged: options.staged, - commit: options.commit, - }) - }) - -program - .command('login') - .description('Authenticate with evalbuff') - .action(async () => { - await loginCommand() - }) - -program - .command('logout') - .description('Clear stored credentials') - .action(() => { - logoutCommand() - }) - -program.parse() diff --git a/evalbuff/old/cli/src/templates/skill.ts b/evalbuff/old/cli/src/templates/skill.ts deleted file mode 100644 index f666241a84..0000000000 --- a/evalbuff/old/cli/src/templates/skill.ts +++ /dev/null @@ -1,45 +0,0 @@ -export const SKILL_TEMPLATE = `--- -name: evalbuff -description: Use evalbuff to get project context before coding and review changes before committing ---- - -# Evalbuff - -This project uses evalbuff for AI-assisted context gathering and change review. - -## Before Starting a Task - -Run evalbuff to get oriented before making changes: - - evalbuff context "" - -This returns: -- **Relevant files** with summaries — so you know what to read -- **Background knowledge** about the systems involved -- **Gotchas and lessons** from past work — so you avoid known pitfalls - -Use this output to inform which files to read and what to watch out for. - -## After Making Changes - -Run evalbuff to review your changes before considering the task complete. 
Include a description of what the user originally asked for so the reviewer can verify the changes match the intent: - - evalbuff review "" - -This returns structured feedback including: -- 🔴 **Critical issues** that must be fixed -- 🟡 **Warnings** that should be addressed -- 💡 **Suggestions** for improvement -- Whether the changes actually accomplish the stated goal - -If there are critical issues (🔴), fix them and re-run the review. -If there are only warnings and suggestions, use your judgment. - -## Tips - -- Always run \`evalbuff context\` first — it often surfaces non-obvious files and gotchas. -- Always pass the user's original request to \`evalbuff review\` — this helps catch missing requirements and verify the changes match intent. -- Run \`evalbuff review\` even for small changes — it catches things like missing error handling, test gaps, and convention violations. -- You can review specific files: \`evalbuff review "add auth" --files src/auth.ts src/db.ts\` -- You can review staged changes only: \`evalbuff review "fix login bug" --staged\` -` diff --git a/evalbuff/old/cli/src/utils/auth.ts b/evalbuff/old/cli/src/utils/auth.ts deleted file mode 100644 index e20d299c22..0000000000 --- a/evalbuff/old/cli/src/utils/auth.ts +++ /dev/null @@ -1,188 +0,0 @@ -import fs from 'fs' -import os from 'os' -import path from 'path' -import { execSync } from 'child_process' - -import { WEBSITE_URL } from '@codebuff/sdk' -import { z } from 'zod' - -const EVALBUFF_API_KEY_ENV_VAR = 'EVALBUFF_API_KEY' - -const userSchema = z.object({ - name: z.string(), - email: z.string(), - authToken: z.string(), - fingerprintId: z.string().optional(), - fingerprintHash: z.string().optional(), -}) - -type User = z.infer - -const credentialsSchema = z.object({ - default: userSchema.optional(), -}) - -export function getConfigDir(): string { - return path.join(os.homedir(), '.config', 'evalbuff') -} - -export function getCredentialsPath(): string { - return path.join(getConfigDir(), 
'credentials.json') -} - -export function getUserCredentials(): User | null { - const credentialsPath = getCredentialsPath() - if (!fs.existsSync(credentialsPath)) return null - - try { - const raw = fs.readFileSync(credentialsPath, 'utf8') - const parsed = credentialsSchema.parse(JSON.parse(raw)) - return parsed.default ?? null - } catch { - return null - } -} - -export function getAuthToken(): string | undefined { - const envToken = process.env[EVALBUFF_API_KEY_ENV_VAR] - if (envToken) return envToken - - const user = getUserCredentials() - return user?.authToken -} - -export function saveUserCredentials(user: User): void { - const configDir = getConfigDir() - const credentialsPath = getCredentialsPath() - - if (!fs.existsSync(configDir)) { - fs.mkdirSync(configDir, { recursive: true }) - } - - let existing: Record = {} - if (fs.existsSync(credentialsPath)) { - try { - existing = JSON.parse(fs.readFileSync(credentialsPath, 'utf8')) - } catch { - // ignore - } - } - - fs.writeFileSync( - credentialsPath, - JSON.stringify({ ...existing, default: user }, null, 2), - ) -} - -export function clearUserCredentials(): void { - const credentialsPath = getCredentialsPath() - if (!fs.existsSync(credentialsPath)) return - - try { - const { default: _, ...rest } = JSON.parse( - fs.readFileSync(credentialsPath, 'utf8'), - ) - if (Object.keys(rest).length === 0) { - fs.unlinkSync(credentialsPath) - } else { - fs.writeFileSync(credentialsPath, JSON.stringify(rest, null, 2)) - } - } catch { - // ignore - } -} - -function generateFingerprintId(): string { - return `evalbuff-${Math.random().toString(36).substring(2, 15)}` -} - -function openBrowser(url: string): void { - try { - const platform = process.platform - if (platform === 'darwin') { - execSync(`open ${JSON.stringify(url)}`, { stdio: 'ignore' }) - } else if (platform === 'linux') { - execSync(`xdg-open ${JSON.stringify(url)}`, { stdio: 'ignore' }) - } else if (platform === 'win32') { - execSync(`start 
${JSON.stringify(url)}`, { stdio: 'ignore' }) - } - } catch { - // Browser open failed, user will need to copy the URL - } -} - -function sleep(ms: number): Promise { - return new Promise((resolve) => setTimeout(resolve, ms)) -} - -export async function loginFlow(): Promise { - const fingerprintId = generateFingerprintId() - - const codeResponse = await fetch(`${WEBSITE_URL}/api/auth/cli/code`, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ fingerprintId }), - }) - - if (!codeResponse.ok) { - throw new Error('Failed to initiate login. Check your internet connection.') - } - - const { loginUrl, fingerprintHash, expiresAt } = (await codeResponse.json()) as { - loginUrl: string - fingerprintHash: string - expiresAt: string - } - - process.stderr.write(`\nOpen this URL to log in:\n\n ${loginUrl}\n\n`) - process.stderr.write('Waiting for authentication...\n') - openBrowser(loginUrl) - - const startTime = Date.now() - const timeoutMs = 5 * 60 * 1000 - const pollIntervalMs = 5000 - - while (Date.now() - startTime < timeoutMs) { - await sleep(pollIntervalMs) - - try { - const params = new URLSearchParams({ - fingerprintId, - fingerprintHash, - expiresAt, - }) - const statusResponse = await fetch( - `${WEBSITE_URL}/api/auth/cli/status?${params}`, - ) - - if (statusResponse.ok) { - const data = (await statusResponse.json()) as { - user?: Record - } - if (data.user) { - const user: User = { - name: String(data.user.name ?? ''), - email: String(data.user.email ?? ''), - authToken: String(data.user.authToken ?? ''), - fingerprintId, - fingerprintHash, - } - saveUserCredentials(user) - return user - } - } - } catch { - // Network error during polling, continue - } - } - - throw new Error('Login timed out. 
Please try again.') -} - -export async function ensureAuth(): Promise { - const token = getAuthToken() - if (token) return token - - const user = await loginFlow() - return user.authToken -} diff --git a/evalbuff/old/cli/src/utils/config.ts b/evalbuff/old/cli/src/utils/config.ts deleted file mode 100644 index f07e997321..0000000000 --- a/evalbuff/old/cli/src/utils/config.ts +++ /dev/null @@ -1,119 +0,0 @@ -import fs from 'fs' -import path from 'path' - -import { z } from 'zod' - -const CONFIG_PATH = '.agents/evals/evalbuff.json' - -const evalbuffConfigSchema = z.object({ - version: z.number(), - project: z - .object({ - name: z.string().optional(), - description: z.string().optional(), - }) - .optional(), - context: z - .object({ - maxFiles: z.number().optional(), - excludePatterns: z.array(z.string()).optional(), - }) - .optional(), - review: z - .object({ - defaultBranch: z.string().optional(), - }) - .optional(), -}) - -export type EvalbuffConfig = z.infer - -export function configPath(projectRoot: string): string { - return path.join(projectRoot, CONFIG_PATH) -} - -export function readConfig(projectRoot: string): EvalbuffConfig | null { - const filePath = configPath(projectRoot) - if (!fs.existsSync(filePath)) return null - - try { - const raw = JSON.parse(fs.readFileSync(filePath, 'utf8')) - return evalbuffConfigSchema.parse(raw) - } catch (error) { - process.stderr.write( - `Warning: Failed to parse evalbuff.json: ${error instanceof Error ? error.message : String(error)}. 
Using defaults.\n`, - ) - return null - } -} - -export function writeConfig( - projectRoot: string, - config: EvalbuffConfig, -): void { - const filePath = configPath(projectRoot) - const dir = path.dirname(filePath) - if (!fs.existsSync(dir)) { - fs.mkdirSync(dir, { recursive: true }) - } - fs.writeFileSync(filePath, JSON.stringify(config, null, 2) + '\n') -} - -export function detectProjectName(projectRoot: string): string { - const pkgPath = path.join(projectRoot, 'package.json') - if (fs.existsSync(pkgPath)) { - try { - const pkg = JSON.parse(fs.readFileSync(pkgPath, 'utf8')) - if (typeof pkg.name === 'string' && pkg.name) return pkg.name - } catch { - // ignore - } - } - - const pyprojectPath = path.join(projectRoot, 'pyproject.toml') - if (fs.existsSync(pyprojectPath)) { - try { - const content = fs.readFileSync(pyprojectPath, 'utf8') - const nameMatch = content.match(/^name\s*=\s*"([^"]+)"/m) - if (nameMatch) return nameMatch[1] - } catch { - // ignore - } - } - - return path.basename(projectRoot) -} - -export function detectProjectDescription(projectRoot: string): string { - const pkgPath = path.join(projectRoot, 'package.json') - if (fs.existsSync(pkgPath)) { - try { - const pkg = JSON.parse(fs.readFileSync(pkgPath, 'utf8')) - if (typeof pkg.description === 'string' && pkg.description) - return pkg.description - } catch { - // ignore - } - } - return '' -} - -export function getDefaultConfig(projectRoot: string): EvalbuffConfig { - const name = detectProjectName(projectRoot) - const description = detectProjectDescription(projectRoot) - - return { - version: 1, - project: { - name, - ...(description && { description }), - }, - context: { - maxFiles: 15, - excludePatterns: ['dist/**', 'node_modules/**', '*.generated.ts'], - }, - review: { - defaultBranch: 'main', - }, - } -} diff --git a/evalbuff/old/cli/src/utils/git.ts b/evalbuff/old/cli/src/utils/git.ts deleted file mode 100644 index 7eab0a44f4..0000000000 --- a/evalbuff/old/cli/src/utils/git.ts +++ 
/dev/null @@ -1,110 +0,0 @@ -import { execSync } from 'child_process' - -export function isGitRepo(cwd: string): boolean { - try { - execSync('git rev-parse --is-inside-work-tree', { - cwd, - stdio: 'pipe', - }) - return true - } catch { - return false - } -} - -export function getGitRoot(cwd: string): string | null { - try { - return execSync('git rev-parse --show-toplevel', { - cwd, - stdio: 'pipe', - encoding: 'utf8', - }).trim() - } catch { - return null - } -} - -export function getDefaultBranch(cwd: string): string { - try { - const result = execSync( - 'git symbolic-ref refs/remotes/origin/HEAD 2>/dev/null || echo refs/remotes/origin/main', - { cwd, stdio: 'pipe', encoding: 'utf8' }, - ).trim() - return result.replace('refs/remotes/origin/', '') - } catch { - return 'main' - } -} - -export interface DiffOptions { - cwd: string - files?: string[] - branch?: string | true - staged?: boolean - commit?: string - defaultBranch?: string -} - -export function getDiff(options: DiffOptions): string { - const { cwd, files, branch, staged, commit, defaultBranch = 'main' } = options - - let cmd: string - - if (commit) { - cmd = `git diff ${commit}~1 ${commit}` - } else if (branch !== undefined) { - const baseBranch = typeof branch === 'string' ? 
branch : defaultBranch - const mergeBase = execSync(`git merge-base ${baseBranch} HEAD`, { - cwd, - stdio: 'pipe', - encoding: 'utf8', - }).trim() - cmd = `git diff ${mergeBase} HEAD` - } else if (staged) { - cmd = 'git diff --cached' - } else { - cmd = 'git diff HEAD' - } - - if (files && files.length > 0) { - cmd += ' -- ' + files.map((f) => JSON.stringify(f)).join(' ') - } - - try { - return execSync(cmd, { cwd, stdio: 'pipe', encoding: 'utf8', maxBuffer: 10 * 1024 * 1024 }) - } catch { - return '' - } -} - -export function getChangedFiles(options: DiffOptions): string[] { - const { cwd, branch, staged, commit, defaultBranch = 'main' } = options - - let cmd: string - - if (commit) { - cmd = `git diff --name-only ${commit}~1 ${commit}` - } else if (branch !== undefined) { - const baseBranch = typeof branch === 'string' ? branch : defaultBranch - const mergeBase = execSync(`git merge-base ${baseBranch} HEAD`, { - cwd, - stdio: 'pipe', - encoding: 'utf8', - }).trim() - cmd = `git diff --name-only ${mergeBase} HEAD` - } else if (staged) { - cmd = 'git diff --cached --name-only' - } else { - cmd = 'git diff HEAD --name-only' - } - - try { - const result = execSync(cmd, { cwd, stdio: 'pipe', encoding: 'utf8' }) - return result - .trim() - .split('\n') - .filter((f) => f.length > 0) - } catch { - return [] - } -} diff --git a/evalbuff/old/cli/src/utils/knowledge.ts b/evalbuff/old/cli/src/utils/knowledge.ts deleted file mode 100644 index 76718c3570..0000000000 --- a/evalbuff/old/cli/src/utils/knowledge.ts +++ /dev/null @@ -1,50 +0,0 @@ -import fs from 'fs' -import path from 'path' - -const KNOWLEDGE_DIR = '.agents/knowledge' - -export function knowledgeDir(projectRoot: string): string { - return path.join(projectRoot, KNOWLEDGE_DIR) -} - -export function ensureKnowledgeDir(projectRoot: string): void { - const dir = knowledgeDir(projectRoot) - if (!fs.existsSync(dir)) { - fs.mkdirSync(dir, { recursive: true }) - } -} - -export function readKnowledgeFiles( - projectRoot: 
string, -): Record { - const dir = knowledgeDir(projectRoot) - if (!fs.existsSync(dir)) return {} - - const files: Record = {} - try { - const entries = fs.readdirSync(dir) - for (const entry of entries) { - if (!entry.endsWith('.md')) continue - const filePath = path.join(dir, entry) - try { - files[path.join(KNOWLEDGE_DIR, entry)] = fs.readFileSync( - filePath, - 'utf8', - ) - } catch { - // skip unreadable files - } - } - } catch { - // directory doesn't exist or can't be read - } - - return files -} - -export const KNOWLEDGE_FILE_NAMES = [ - 'architecture.md', - 'tech-stack.md', - 'conventions.md', - 'testing.md', -] as const diff --git a/evalbuff/old/cli/src/utils/output.ts b/evalbuff/old/cli/src/utils/output.ts deleted file mode 100644 index ea4f61d372..0000000000 --- a/evalbuff/old/cli/src/utils/output.ts +++ /dev/null @@ -1,62 +0,0 @@ -const SPINNER_FRAMES = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏'] -const SPINNER_INTERVAL_MS = 80 - -export function isTTY(): boolean { - return process.stderr.isTTY === true -} - -export class Spinner { - private frameIndex = 0 - private timer: ReturnType | null = null - private currentMessage = '' - - start(message: string): void { - this.currentMessage = message - if (!isTTY()) return - - this.render() - this.timer = setInterval(() => { - this.frameIndex = (this.frameIndex + 1) % SPINNER_FRAMES.length - this.render() - }, SPINNER_INTERVAL_MS) - } - - update(message: string): void { - this.currentMessage = message - if (!isTTY()) return - this.render() - } - - stop(): void { - if (this.timer) { - clearInterval(this.timer) - this.timer = null - } - if (isTTY()) { - process.stderr.write('\r\x1b[K') - } - } - - succeed(message: string): void { - this.stop() - process.stderr.write(`✓ ${message}\n`) - } - - fail(message: string): void { - this.stop() - process.stderr.write(`✗ ${message}\n`) - } - - private render(): void { - const frame = SPINNER_FRAMES[this.frameIndex] - process.stderr.write(`\r\x1b[K${frame} 
${this.currentMessage}`) - } -} - -export function printError(message: string): void { - process.stderr.write(`Error: ${message}\n`) -} - -export function printWarning(message: string): void { - process.stderr.write(`Warning: ${message}\n`) -} diff --git a/evalbuff/old/cli/src/utils/project.ts b/evalbuff/old/cli/src/utils/project.ts deleted file mode 100644 index 7d32f6e074..0000000000 --- a/evalbuff/old/cli/src/utils/project.ts +++ /dev/null @@ -1,9 +0,0 @@ -import path from 'path' - -import { getGitRoot } from './git' - -export function findProjectRoot(cwd?: string): string { - const startDir = cwd ? path.resolve(cwd) : process.cwd() - const gitRoot = getGitRoot(startDir) - return gitRoot ?? startDir -} diff --git a/evalbuff/old/cli/tsconfig.json b/evalbuff/old/cli/tsconfig.json deleted file mode 100644 index 30b7a1ec13..0000000000 --- a/evalbuff/old/cli/tsconfig.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "extends": "../../tsconfig.base.json", - "compilerOptions": { - "types": ["bun-types"], - "skipLibCheck": true, - "paths": { - "@codebuff/sdk": ["../../sdk/src/index.ts"] - } - }, - "include": ["src", "../agents"], - "exclude": ["node_modules", "dist"] -} diff --git a/evalbuff/package.json b/evalbuff/package.json deleted file mode 100644 index e97a2a3a8e..0000000000 --- a/evalbuff/package.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "name": "@codebuff/evalbuff", - "version": "1.0.0", - "description": "Automated docs optimization loop: run agent → judge → analyze failures → propose doc edits", - "private": true, - "type": "module", - "scripts": { - "typecheck": "tsc --noEmit -p .", - "test": "bun test src/__tests__/criteria.test.ts src/__tests__/docs-optimizer.test.ts src/__tests__/morning-report.test.ts src/__tests__/cli-runner.test.ts && bun test src/__tests__/loop.integration.test.ts && bun test src/__tests__/e2e.test.ts", - "test:unit": "bun test src/__tests__/criteria.test.ts src/__tests__/docs-optimizer.test.ts src/__tests__/morning-report.test.ts 
src/__tests__/cli-runner.test.ts", - "test:integration": "bun test src/__tests__/loop.integration.test.ts", - "test:e2e": "bun test src/__tests__/e2e.test.ts", - "test:e2e-real": "bun run src/run-e2e-test.ts", - "run": "bun run src/run-evalbuff.ts" - }, - "dependencies": { - "@ai-sdk/anthropic": "^2.0.50", - "@codebuff/common": "workspace:*", - "@codebuff/sdk": "workspace:*", - "ai": "^5.0.0", - "openai": "^6.33.0", - "zod": "^4.2.1" - } -} diff --git a/evalbuff/src/__tests__/cli-runner.test.ts b/evalbuff/src/__tests__/cli-runner.test.ts deleted file mode 100644 index a0aab3f8a7..0000000000 --- a/evalbuff/src/__tests__/cli-runner.test.ts +++ /dev/null @@ -1,107 +0,0 @@ -import fs from 'fs' -import os from 'os' -import path from 'path' -import { execSync } from 'child_process' - -import { afterEach, beforeEach, describe, expect, it } from 'bun:test' - -import { runCliAgent } from '../cli-runner' - -let tmpDir: string - -beforeEach(() => { - tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-cli-test-')) - // Initialize a git repo so git diff works - execSync('git init && git add . 
&& git commit --allow-empty -m "init"', { - cwd: tmpDir, - stdio: 'ignore', - }) -}) - -afterEach(() => { - fs.rmSync(tmpDir, { recursive: true, force: true }) -}) - -describe('runCliAgent', () => { - it('happy path: captures stdout and exit code 0', async () => { - const result = await runCliAgent({ - command: 'echo', - prompt: 'hello world', - cwd: tmpDir, - timeoutMs: 10_000, - }) - - expect(result.exitCode).toBe(0) - expect(result.stdout.trim()).toBe('hello world') - expect(result.durationMs).toBeGreaterThan(0) - }) - - it('captures git diff when agent creates a file', async () => { - // Use a bash command that creates a file - const scriptPath = path.join(tmpDir, 'agent.sh') - fs.writeFileSync( - scriptPath, - '#!/bin/bash\necho "new content" > newfile.txt\n', - ) - fs.chmodSync(scriptPath, '755') - - const result = await runCliAgent({ - command: scriptPath, - prompt: 'create a file', - cwd: tmpDir, - timeoutMs: 10_000, - }) - - expect(result.exitCode).toBe(0) - expect(result.diff).toContain('newfile.txt') - expect(result.diff).toContain('new content') - }) - - it('handles agent crash with non-zero exit code', async () => { - const result = await runCliAgent({ - command: 'bash -c', - prompt: 'exit 42', - cwd: tmpDir, - timeoutMs: 10_000, - }) - - expect(result.exitCode).toBe(42) - }) - - it('returns empty diff when agent makes no changes', async () => { - const result = await runCliAgent({ - command: 'echo', - prompt: 'do nothing', - cwd: tmpDir, - timeoutMs: 10_000, - }) - - expect(result.diff).toBe('') - }) - - it('rejects when agent CLI is not found', async () => { - const promise = runCliAgent({ - command: 'nonexistent-agent-binary-xyz', - prompt: 'test', - cwd: tmpDir, - timeoutMs: 10_000, - }) - - await expect(promise).rejects.toThrow('CLI agent failed to start') - await expect(promise).rejects.toThrow('nonexistent-agent-binary-xyz') - }) - - it('kills agent on timeout', async () => { - const result = await runCliAgent({ - command: 'sleep', - prompt: 
'30', - cwd: tmpDir, - timeoutMs: 500, // 500ms timeout - }) - - // Process should have been killed - expect(result.durationMs).toBeLessThan(5000) - // Exit code is null when killed by signal, which becomes 1 - expect(result.exitCode).not.toBe(0) - }) -}) diff --git a/evalbuff/src/__tests__/criteria.test.ts b/evalbuff/src/__tests__/criteria.test.ts deleted file mode 100644 index 3b25cfb5c9..0000000000 --- a/evalbuff/src/__tests__/criteria.test.ts +++ /dev/null @@ -1,119 +0,0 @@ -import { describe, expect, it } from 'bun:test' - -import { - formatCriteriaForPrompt, - getCriteriaForLevel, - maybePromoteCriteria, -} from '../criteria' - -import type { QualityCriteria } from '../criteria' - -function makeCriteria( - level: number, - threshold = 8.0, - window = 10, -): QualityCriteria { - return { - level, - criteria: getCriteriaForLevel(level), - promotionThreshold: threshold, - promotionWindow: window, - } -} - -describe('getCriteriaForLevel', () => { - it('returns only L1 criteria at level 1', () => { - const criteria = getCriteriaForLevel(1) - expect(criteria).toHaveLength(3) - expect(criteria.map((c) => c.name)).toEqual([ - 'Builds & Compiles', - 'Existing Tests Pass', - 'Basic Completeness', - ]) - }) - - it('accumulates criteria up to level 3', () => { - const criteria = getCriteriaForLevel(3) - expect(criteria.map((c) => c.name)).toEqual([ - 'Builds & Compiles', - 'Existing Tests Pass', - 'Basic Completeness', - 'Feature Works E2E', - 'Logs & Observability', - 'Edge Cases & Error States', - 'UI/UX Verification', - ]) - }) - - it('includes all criteria at level 5', () => { - const criteria = getCriteriaForLevel(5) - expect(criteria).toHaveLength(10) - expect(criteria[criteria.length - 1].name).toBe('Production Readiness') - }) - - it('caps at level 5 even if higher number passed', () => { - const criteria = getCriteriaForLevel(10) - expect(criteria).toHaveLength(10) - }) -}) - -describe('maybePromoteCriteria', () => { - it('promotes when avg above threshold over 
window', () => { - const criteria = makeCriteria(1, 8.0, 5) - const scores = [8.5, 9.0, 8.2, 8.8, 8.6] - const newLevel = maybePromoteCriteria(criteria, scores) - expect(newLevel).toBe(2) - }) - - it('does NOT promote when avg below threshold', () => { - const criteria = makeCriteria(1, 8.0, 5) - const scores = [7.0, 6.5, 8.0, 7.5, 7.0] - const newLevel = maybePromoteCriteria(criteria, scores) - expect(newLevel).toBe(1) - }) - - it('does NOT promote when already at max level (5)', () => { - const criteria = makeCriteria(5, 8.0, 3) - const scores = [9.0, 9.5, 9.0] - const newLevel = maybePromoteCriteria(criteria, scores) - expect(newLevel).toBe(5) - }) - - it('does NOT promote when fewer iterations than window size', () => { - const criteria = makeCriteria(1, 8.0, 10) - const scores = [9.0, 9.5, 9.0] - const newLevel = maybePromoteCriteria(criteria, scores) - expect(newLevel).toBe(1) - }) - - it('uses only the last N scores in the window', () => { - const criteria = makeCriteria(2, 8.0, 3) - const scores = [3.0, 4.0, 5.0, 8.5, 9.0, 8.5] - const newLevel = maybePromoteCriteria(criteria, scores) - expect(newLevel).toBe(3) - }) -}) - -describe('formatCriteriaForPrompt', () => { - it('includes level and E2E-focused criteria names', () => { - const criteria = makeCriteria(2) - const prompt = formatCriteriaForPrompt(criteria) - expect(prompt).toContain('Level 2/5') - expect(prompt).toContain('Builds & Compiles') - expect(prompt).toContain('Feature Works E2E') - }) - - it('includes weights', () => { - const criteria = makeCriteria(1) - const prompt = formatCriteriaForPrompt(criteria) - expect(prompt).toContain('weight: 3') - expect(prompt).toContain('weight: 2') - }) - - it('instructs E2E verification', () => { - const criteria = makeCriteria(1) - const prompt = formatCriteriaForPrompt(criteria) - expect(prompt).toContain('MUST verify') - expect(prompt).toContain('E2E testing') - }) -}) diff --git a/evalbuff/src/__tests__/docs-optimizer.test.ts 
b/evalbuff/src/__tests__/docs-optimizer.test.ts deleted file mode 100644 index 5d96d84d99..0000000000 --- a/evalbuff/src/__tests__/docs-optimizer.test.ts +++ /dev/null @@ -1,126 +0,0 @@ -import fs from 'fs' -import os from 'os' -import path from 'path' - -import { afterEach, beforeEach, describe, expect, it } from 'bun:test' - -import { applyDocEdit, compareScores, readCurrentDocs } from '../docs-optimizer' - -let tmpDir: string - -beforeEach(() => { - tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-test-')) -}) - -afterEach(() => { - fs.rmSync(tmpDir, { recursive: true, force: true }) -}) - -describe('applyDocEdit', () => { - it('creates new file under docs/ and updates AGENTS.md TOC', () => { - const result = applyDocEdit( - tmpDir, - 'patterns/error-handling.md', - '# Error Handling\n\nAlways use try/catch.', - ) - expect(result).toBe(true) - - const docPath = path.join(tmpDir, 'docs', 'patterns', 'error-handling.md') - expect(fs.existsSync(docPath)).toBe(true) - expect(fs.readFileSync(docPath, 'utf-8')).toContain('Error Handling') - - const agentsMd = fs.readFileSync( - path.join(tmpDir, 'AGENTS.md'), - 'utf-8', - ) - expect(agentsMd).toContain('docs/patterns/error-handling.md') - }) - - it('overwrites existing file content', () => { - // Create initial doc - applyDocEdit(tmpDir, 'conventions/naming.md', 'Original content') - - // Overwrite - applyDocEdit(tmpDir, 'conventions/naming.md', 'Updated content') - - const content = fs.readFileSync( - path.join(tmpDir, 'docs', 'conventions', 'naming.md'), - 'utf-8', - ) - expect(content).toBe('Updated content') - }) - - it('does not duplicate AGENTS.md entry on overwrite', () => { - applyDocEdit(tmpDir, 'test.md', 'v1') - applyDocEdit(tmpDir, 'test.md', 'v2') - - const agentsMd = fs.readFileSync( - path.join(tmpDir, 'AGENTS.md'), - 'utf-8', - ) - // The link format is "- [docs/test.md](docs/test.md)" — one entry has two occurrences of the path - const entryMatches = agentsMd.match(/- \[docs\/test\.md\]/g) - 
expect(entryMatches).toHaveLength(1) - }) - - it('rejects path starting with /', () => { - const result = applyDocEdit(tmpDir, '/etc/passwd', 'bad') - expect(result).toBe(false) - }) - - it('rejects path with ..', () => { - const result = applyDocEdit(tmpDir, '../outside/file.md', 'bad') - expect(result).toBe(false) - }) - - it('creates AGENTS.md if it does not exist', () => { - expect(fs.existsSync(path.join(tmpDir, 'AGENTS.md'))).toBe(false) - applyDocEdit(tmpDir, 'new-doc.md', 'content') - expect(fs.existsSync(path.join(tmpDir, 'AGENTS.md'))).toBe(true) - - const agentsMd = fs.readFileSync( - path.join(tmpDir, 'AGENTS.md'), - 'utf-8', - ) - expect(agentsMd).toContain('# Documentation') - expect(agentsMd).toContain('docs/new-doc.md') - }) -}) - -describe('compareScores', () => { - it('returns improved when new > old', () => { - expect(compareScores(5.0, 7.0)).toBe('improved') - }) - - it('returns same when new == old', () => { - expect(compareScores(5.0, 5.0)).toBe('same') - }) - - it('returns worse when new < old', () => { - expect(compareScores(7.0, 5.0)).toBe('worse') - }) -}) - -describe('readCurrentDocs', () => { - it('returns empty object when docs/ does not exist', () => { - const docs = readCurrentDocs(tmpDir) - expect(docs).toEqual({}) - }) - - it('reads all markdown files recursively', () => { - const docsDir = path.join(tmpDir, 'docs') - fs.mkdirSync(path.join(docsDir, 'patterns'), { recursive: true }) - fs.writeFileSync(path.join(docsDir, 'intro.md'), 'intro content') - fs.writeFileSync( - path.join(docsDir, 'patterns', 'api.md'), - 'api patterns', - ) - // Non-md file should be ignored - fs.writeFileSync(path.join(docsDir, 'notes.txt'), 'ignored') - - const docs = readCurrentDocs(tmpDir) - expect(Object.keys(docs).sort()).toEqual(['intro.md', 'patterns/api.md']) - expect(docs['intro.md']).toBe('intro content') - expect(docs['patterns/api.md']).toBe('api patterns') - }) -}) diff --git a/evalbuff/src/__tests__/e2e.test.ts 
b/evalbuff/src/__tests__/e2e.test.ts deleted file mode 100644 index f1ca599662..0000000000 --- a/evalbuff/src/__tests__/e2e.test.ts +++ /dev/null @@ -1,190 +0,0 @@ -/** - * E2E test for evalbuff. - * - * This test runs the full evalbuff loop with mocked LLM calls but real - * orchestration. It verifies: - * - The morning report is generated - * - Log entries are written - * - State file tracks processed commits - * - Doc edits are committed to the repo when they improve scores - * - * Run: bun test evalbuff/src/__tests__/e2e.test.ts - */ -import { execSync } from 'child_process' -import fs from 'fs' -import os from 'os' -import path from 'path' - -import { afterAll, beforeAll, describe, expect, it, mock } from 'bun:test' - -import type { JudgingResult } from '../judge' -import type { DocSuggestion } from '../docs-optimizer' - -// --- Mocks for LLM calls only --- - -let judgeCallCount = 0 - -mock.module('../test-repo-utils', () => ({ - withTestRepo: async (_config: any, fn: (cwd: string) => Promise) => { - const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-e2e-repo-')) - execSync('git init && git add . 
&& git commit --allow-empty -m "init"', { - cwd: dir, - stdio: 'ignore', - env: { ...process.env, GIT_AUTHOR_NAME: 'test', GIT_AUTHOR_EMAIL: 'test@test.com', GIT_COMMITTER_NAME: 'test', GIT_COMMITTER_EMAIL: 'test@test.com' }, - }) - try { - return await fn(dir) - } finally { - fs.rmSync(dir, { recursive: true, force: true }) - } - }, -})) - -mock.module('../runners/codebuff', () => ({ - CodebuffRunner: class { - constructor() {} - async run() { - return { - steps: [{ type: 'text', content: 'mock trace' }], - totalCostUsd: 0.01, - diff: 'mock diff content', - } - } - }, -})) - -mock.module('@codebuff/sdk', () => ({ - CodebuffClient: class { - constructor() {} - async run() { return { output: { type: 'success' }, sessionState: null } } - }, - loadLocalAgents: async () => ({}), -})) - -// Judge returns alternating scores: low (triggers doc edit), then higher (confirms improvement) -mock.module('../judge', () => ({ - judgeTaskResult: async () => { - const scores = [3.0, 6.0, 8.5, 5.0, 7.0, 9.0] - const score = scores[judgeCallCount % scores.length] - judgeCallCount++ - return { - analysis: `Mock analysis for call ${judgeCallCount}`, - strengths: ['Correctly identified the problem'], - weaknesses: ['Missing error handling', 'No tests added'], - e2eTestsPerformed: ['Started dev server', 'Tested API endpoint'], - completionScore: score, - codeQualityScore: score, - e2eScore: score, - overallScore: score, - } satisfies JudgingResult - }, -})) - -const actualDocsOptimizer = await import('../docs-optimizer') -mock.module('../docs-optimizer', () => ({ - ...actualDocsOptimizer, - analyzeFailure: async () => - ({ - reasoning: 'Agent consistently misses error handling patterns in async code', - suggestedDocPath: 'patterns/async-error-handling.md', - suggestedContent: - '# Async Error Handling\n\nAll async functions should use try/catch blocks.\nPropagate errors with meaningful messages.\n', - }) satisfies DocSuggestion, -})) - -// Mock commit-task-generator 
-mock.module('../commit-task-generator', () => ({ - getCommitList: () => ['sha-1', 'sha-2', 'sha-3'], - buildCommitTask: async (_repoPath: string, sha: string) => ({ - sha, - parentSha: `parent-${sha}`, - message: `Commit ${sha}`, - prompt: `Do the thing for ${sha}`, - diff: `mock diff for ${sha}`, - filesChanged: ['src/file.ts'], - }), -})) - -const { runLearnMode } = await import('../run-evalbuff') - -// --- Test setup --- - -let repoDir: string - -beforeAll(() => { - repoDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-e2e-target-')) - execSync('git init && git add . && git commit --allow-empty -m "init"', { - cwd: repoDir, - stdio: 'ignore', - env: { ...process.env, GIT_AUTHOR_NAME: 'test', GIT_AUTHOR_EMAIL: 'test@test.com', GIT_COMMITTER_NAME: 'test', GIT_COMMITTER_EMAIL: 'test@test.com' }, - }) - execSync('git remote add origin https://github.com/test/repo', { - cwd: repoDir, - stdio: 'ignore', - }) - - judgeCallCount = 0 -}) - -afterAll(() => { - fs.rmSync(repoDir, { recursive: true, force: true }) -}) - -// --- E2E tests --- - -describe('evalbuff E2E', () => { - it('runs full learn loop: processes commits, improves docs, generates report', async () => { - await runLearnMode({ - mode: 'learn', - repoPath: repoDir, - agentId: 'base2-free-evals', - parallelism: 1, - maxCostUsd: 50, - agentTimeoutMs: 10_000, - commitCount: 500, - }) - - // 1. Morning report exists - const reportFiles = fs - .readdirSync(repoDir) - .filter((f) => f.startsWith('evalbuff-report-')) - expect(reportFiles.length).toBe(1) - const report = fs.readFileSync( - path.join(repoDir, reportFiles[0]), - 'utf-8', - ) - expect(report).toContain('# Evalbuff Morning Report') - - // 2. Log has entries - const logPath = path.join(repoDir, 'evalbuff-log.jsonl') - expect(fs.existsSync(logPath)).toBe(true) - const logLines = fs - .readFileSync(logPath, 'utf-8') - .trim() - .split('\n') - expect(logLines.length).toBeGreaterThan(0) - - // 3. 
State tracks last processed commit - const statePath = path.join(repoDir, 'evalbuff-state.json') - const state = JSON.parse(fs.readFileSync(statePath, 'utf-8')) - expect(state.lastProcessedCommitSha).toBe('sha-3') - expect(state.processedCommitCount).toBe(3) - - // 4. At least one doc was written (first task scores 3.0) - const docsDir = path.join(repoDir, 'docs') - expect(fs.existsSync(docsDir)).toBe(true) - - // 5. AGENTS.md was created with TOC - const agentsMdPath = path.join(repoDir, 'AGENTS.md') - expect(fs.existsSync(agentsMdPath)).toBe(true) - const agentsMd = fs.readFileSync(agentsMdPath, 'utf-8') - expect(agentsMd).toContain('async-error-handling.md') - - // 6. Doc edits were committed to git - const gitLog = execSync('git log --oneline', { - cwd: repoDir, - encoding: 'utf-8', - }) - expect(gitLog).toContain('evalbuff:') - }) -}) diff --git a/evalbuff/src/__tests__/loop.integration.test.ts b/evalbuff/src/__tests__/loop.integration.test.ts deleted file mode 100644 index 7246261330..0000000000 --- a/evalbuff/src/__tests__/loop.integration.test.ts +++ /dev/null @@ -1,318 +0,0 @@ -import { execSync } from 'child_process' -import fs from 'fs' -import os from 'os' -import path from 'path' - -import { afterEach, beforeEach, describe, expect, it, mock } from 'bun:test' - -import type { JudgingResult } from '../judge' -import type { DocSuggestion } from '../docs-optimizer' - -// --- Mocks --- - -let judgeCallCount = 0 -let judgeScores: number[] = [] -let analyzeCallCount = 0 -let analyzeFailureResults: Array = [] -let cliRunnerCallCount = 0 - -// Mock withTestRepo to use a local temp dir instead of cloning -mock.module('../test-repo-utils', () => ({ - withTestRepo: async (_config: any, fn: (cwd: string) => Promise) => { - const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-mock-repo-')) - execSync('git init && git add . 
&& git commit --allow-empty -m "init"', { - cwd: dir, - stdio: 'ignore', - }) - try { - return await fn(dir) - } finally { - fs.rmSync(dir, { recursive: true, force: true }) - } - }, -})) - -// Mock CodebuffRunner to return a fake result -mock.module('../runners/codebuff', () => ({ - CodebuffRunner: class { - constructor() {} - async run() { - cliRunnerCallCount++ - return { - steps: [{ type: 'text', content: 'mock trace' }], - totalCostUsd: 0.01, - diff: 'mock diff content', - } - } - }, -})) - -// Mock SDK client and loadLocalAgents -mock.module('@codebuff/sdk', () => ({ - CodebuffClient: class { - constructor() {} - async run() { return { output: { type: 'success' }, sessionState: null } } - }, - loadLocalAgents: async () => ({}), -})) - -// Mock judge to return configurable scores -mock.module('../judge', () => ({ - judgeTaskResult: async () => { - const score = judgeScores[judgeCallCount] ?? 5.0 - judgeCallCount++ - return { - analysis: 'Mock analysis', - strengths: ['Good'], - weaknesses: ['Could improve'], - e2eTestsPerformed: ['Mock E2E test'], - completionScore: score, - codeQualityScore: score, - e2eScore: score, - overallScore: score, - } satisfies JudgingResult - }, - judgeCommitResult: async () => { - const score = judgeScores[judgeCallCount] ?? 5.0 - judgeCallCount++ - return { - analysis: 'Mock analysis', - strengths: ['Good'], - weaknesses: ['Could improve'], - e2eTestsPerformed: ['Mock E2E test'], - completionScore: score, - codeQualityScore: score, - e2eScore: score, - overallScore: score, - } satisfies JudgingResult - }, -})) - -// Mock docs-optimizer LLM calls but keep pure functions -const actualDocsOptimizer = await import('../docs-optimizer') -mock.module('../docs-optimizer', () => ({ - ...actualDocsOptimizer, - analyzeFailure: async () => { - const result = analyzeFailureResults[analyzeCallCount] ?? 
null - analyzeCallCount++ - return result - }, -})) - -// Mock commit-task-generator to avoid real git and LLM calls -mock.module('../commit-task-generator', () => ({ - getCommitList: () => ['sha-1', 'sha-2', 'sha-3'], - buildCommitTask: async (_repoPath: string, sha: string) => ({ - sha, - parentSha: `parent-${sha}`, - message: `Commit ${sha}`, - prompt: `Do the thing for ${sha}`, - diff: `mock diff for ${sha}`, - filesChanged: ['src/file.ts'], - }), -})) - -// Import after mocks are set up -const { runLearnMode, runPromptMode } = await import('../run-evalbuff') - -// --- Test fixtures --- - -let repoDir: string - -beforeEach(() => { - repoDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-integ-')) - execSync('git init && git add . && git commit --allow-empty -m "init"', { - cwd: repoDir, - stdio: 'ignore', - }) - // Set up a fake remote so git remote get-url works - execSync('git remote add origin https://github.com/test/repo', { - cwd: repoDir, - stdio: 'ignore', - }) - - // Reset mock state - judgeCallCount = 0 - judgeScores = [] - analyzeCallCount = 0 - analyzeFailureResults = [] - cliRunnerCallCount = 0 -}) - -afterEach(() => { - fs.rmSync(repoDir, { recursive: true, force: true }) -}) - -// --- Tests --- - -describe('runLearnMode integration', () => { - it('processes commits, runs agents in parallel, judges, and logs', async () => { - // With parallelism=1 and 3 commits, we get 3 baseline runs (1 per commit) - // Each baseline run gets judged once - judgeScores = [8.0, 8.0, 8.0] - - await runLearnMode({ - mode: 'learn', - repoPath: repoDir, - agentId: 'base2-free-evals', - parallelism: 1, - maxCostUsd: 100, - agentTimeoutMs: 10_000, - commitCount: 500, - }) - - // Verify log was written with entries for each commit - const logPath = path.join(repoDir, 'evalbuff-log.jsonl') - expect(fs.existsSync(logPath)).toBe(true) - const logLines = fs - .readFileSync(logPath, 'utf-8') - .trim() - .split('\n') - expect(logLines).toHaveLength(3) - - // Verify state was 
saved with lastProcessedCommitSha - const statePath = path.join(repoDir, 'evalbuff-state.json') - expect(fs.existsSync(statePath)).toBe(true) - const state = JSON.parse(fs.readFileSync(statePath, 'utf-8')) - expect(state.lastProcessedCommitSha).toBe('sha-3') - expect(state.processedCommitCount).toBe(3) - - // Verify morning report was generated - const reportFiles = fs - .readdirSync(repoDir) - .filter((f) => f.startsWith('evalbuff-report-')) - expect(reportFiles.length).toBeGreaterThan(0) - }) - - it('attempts doc edit and keeps it when score improves', async () => { - // parallelism=1: commit1 baseline=4.0, rerun with doc=7.0 (improved, kept) - // Then analyze returns null to stop loop. commit2 baseline=8.0, analyze returns null. - // commit3 baseline=8.0, analyze returns null. - judgeScores = [4.0, 7.0, 8.0, 8.0, 8.0, 8.0] - const docSuggestion: DocSuggestion = { - reasoning: 'Agent missed error handling patterns', - suggestedDocPath: 'patterns/errors.md', - suggestedContent: '# Error Handling\n\nAlways use try/catch.', - } - // First analyze call returns suggestion, then null to stop iterating - analyzeFailureResults = [docSuggestion, null, null, null] - - await runLearnMode({ - mode: 'learn', - repoPath: repoDir, - agentId: 'base2-free-evals', - parallelism: 1, - maxCostUsd: 100, - agentTimeoutMs: 10_000, - commitCount: 500, - }) - - const logPath = path.join(repoDir, 'evalbuff-log.jsonl') - const entries = fs - .readFileSync(logPath, 'utf-8') - .trim() - .split('\n') - .map((l) => JSON.parse(l)) - - // First entry should show doc improvement - expect(entries[0].oldScore).toBe(4.0) - expect(entries[0].newScore).toBe(7.0) - expect(entries[0].docEdit).not.toBeNull() - - // Doc should have been applied to the real repo - const docPath = path.join(repoDir, 'docs', 'patterns', 'errors.md') - expect(fs.existsSync(docPath)).toBe(true) - expect(fs.readFileSync(docPath, 'utf-8')).toContain('Error Handling') - }) - - it('stops when cost exceeds maxCostUsd', async () => 
{ - judgeScores = [8.0, 8.0, 8.0] - - // Pre-set cost at limit - const statePath = path.join(repoDir, 'evalbuff-state.json') - fs.writeFileSync( - statePath, - JSON.stringify({ - lastProcessedCommitSha: null, - totalCostUsd: 100.0, - recentScores: [], - processedCommitCount: 0, - }), - ) - - await runLearnMode({ - mode: 'learn', - repoPath: repoDir, - agentId: 'base2-free-evals', - parallelism: 1, - maxCostUsd: 100, - agentTimeoutMs: 10_000, - commitCount: 500, - }) - - // Should not have processed any commits (cost already at limit) - const logPath = path.join(repoDir, 'evalbuff-log.jsonl') - expect(fs.existsSync(logPath)).toBe(false) - }) - - it('rejects doc edit when score drops significantly', async () => { - // Commit1: baseline 5.0, rerun 2.0 (3-point drop, past 1.5 threshold) — doc rejected. - // Commit2: baseline 8.0, analyze returns null. Commit3: baseline 8.0, null. - judgeScores = [5.0, 2.0, 8.0, 8.0] - analyzeFailureResults = [ - { - reasoning: 'Tried to help', - suggestedDocPath: 'bad-doc.md', - suggestedContent: '# Bad Doc\n\nThis will not help.', - }, - null, - null, - ] - - await runLearnMode({ - mode: 'learn', - repoPath: repoDir, - agentId: 'base2-free-evals', - parallelism: 1, - maxCostUsd: 100, - agentTimeoutMs: 10_000, - commitCount: 500, - }) - - // Doc should NOT exist in the real repo - const docPath = path.join(repoDir, 'docs', 'bad-doc.md') - expect(fs.existsSync(docPath)).toBe(false) - }) -}) - -describe('runPromptMode integration', () => { - it('runs agents on a prompt and attempts doc improvement', async () => { - judgeScores = [5.0, 7.0] - analyzeFailureResults = [ - { - reasoning: 'Agent needs better context', - suggestedDocPath: 'conventions/api.md', - suggestedContent: '# API Conventions\n\nUse REST.', - }, - null, // stop after first improvement - ] - - await runPromptMode({ - mode: 'prompt', - repoPath: repoDir, - agentId: 'base2-free-evals', - parallelism: 1, - maxCostUsd: 100, - agentTimeoutMs: 10_000, - prompt: 'Add a new API 
endpoint for users', - }) - - // Verify log was written - const logPath = path.join(repoDir, 'evalbuff-log.jsonl') - expect(fs.existsSync(logPath)).toBe(true) - const entry = JSON.parse( - fs.readFileSync(logPath, 'utf-8').trim(), - ) - expect(entry.taskId).toBe('prompt-mode') - }) -}) diff --git a/evalbuff/src/__tests__/morning-report.test.ts b/evalbuff/src/__tests__/morning-report.test.ts deleted file mode 100644 index 3819b9c3ee..0000000000 --- a/evalbuff/src/__tests__/morning-report.test.ts +++ /dev/null @@ -1,161 +0,0 @@ -import fs from 'fs' -import os from 'os' -import path from 'path' - -import { afterEach, beforeEach, describe, expect, it } from 'bun:test' - -import { appendLogEntry, generateMorningReport } from '../morning-report' - -import type { EvalbuffLogEntry } from '../morning-report' - -let tmpDir: string -let logPath: string - -beforeEach(() => { - tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-report-test-')) - logPath = path.join(tmpDir, 'evalbuff-log.jsonl') -}) - -afterEach(() => { - fs.rmSync(tmpDir, { recursive: true, force: true }) -}) - -function makeEntry(overrides: Partial = {}): EvalbuffLogEntry { - return { - taskId: 'task-001', - timestamp: '2026-03-25T08:00:00.000Z', - oldScore: 5.0, - newScore: null, - docEdit: null, - scoreComparison: null, - costUsd: 0.5, - durationMs: 60_000, - criteriaLevel: 1, - ...overrides, - } -} - -describe('generateMorningReport', () => { - it('generates valid report from JSONL log with all stats', () => { - const entries: EvalbuffLogEntry[] = [ - makeEntry({ - taskId: 'task-001', - oldScore: 5.0, - newScore: 7.5, - docEdit: { path: 'patterns/api.md', reasoning: 'Agent missed API pattern' }, - scoreComparison: 'improved', - costUsd: 1.2, - durationMs: 120_000, - }), - makeEntry({ - taskId: 'task-002', - timestamp: '2026-03-25T09:00:00.000Z', - oldScore: 8.0, - costUsd: 0.8, - durationMs: 90_000, - }), - ] - - for (const entry of entries) { - appendLogEntry(logPath, entry) - } - - const report = 
generateMorningReport(logPath) - - expect(report).toContain('# Evalbuff Morning Report') - expect(report).toContain('Iterations | 2') - expect(report).toContain('$2.00') - expect(report).toContain('Docs Attempted | 1') - expect(report).toContain('Docs Kept (improved score) | 1') - expect(report).toContain('task-001') - expect(report).toContain('task-002') - expect(report).toContain('patterns/api.md') - }) - - it('generates empty report when log file does not exist', () => { - const report = generateMorningReport( - path.join(tmpDir, 'nonexistent.jsonl'), - ) - expect(report).toContain('No iterations were run') - expect(report).toContain('Iterations | 0') - }) - - it('generates empty report when log file is empty', () => { - fs.writeFileSync(logPath, '') - const report = generateMorningReport(logPath) - expect(report).toContain('No iterations were run') - }) - - it('shows errors table when iterations have errors', () => { - appendLogEntry( - logPath, - makeEntry({ - taskId: 'task-fail', - error: 'Agent timed out after 300s', - }), - ) - - const report = generateMorningReport(logPath) - expect(report).toContain('## Errors') - expect(report).toContain('task-fail') - expect(report).toContain('Agent timed out') - }) - - it('shows score trajectory section', () => { - appendLogEntry(logPath, makeEntry({ taskId: 'task-a', oldScore: 3.0 })) - appendLogEntry(logPath, makeEntry({ taskId: 'task-b', oldScore: 7.0 })) - - const report = generateMorningReport(logPath) - expect(report).toContain('## Score Trajectory') - expect(report).toContain('task-a') - expect(report).toContain('task-b') - }) - - it('shows doc changes with score impact', () => { - appendLogEntry( - logPath, - makeEntry({ - taskId: 'task-doc', - oldScore: 4.0, - newScore: 6.5, - docEdit: { path: 'conventions/naming.md', reasoning: 'Naming was wrong' }, - scoreComparison: 'improved', - }), - ) - appendLogEntry( - logPath, - makeEntry({ - taskId: 'task-revert', - oldScore: 5.0, - newScore: 4.0, - docEdit: { path: 
'patterns/bad.md', reasoning: 'Did not help' }, - scoreComparison: 'worse', - }), - ) - - const report = generateMorningReport(logPath) - expect(report).toContain('## Doc Changes') - expect(report).toContain('4.0 -> 6.5') - expect(report).toContain('Yes') // kept - expect(report).toContain('5.0 -> 4.0') - expect(report).toContain('No') // reverted - }) -}) - -describe('appendLogEntry', () => { - it('appends JSONL entries that can be parsed back', () => { - const entry1 = makeEntry({ taskId: 'a' }) - const entry2 = makeEntry({ taskId: 'b' }) - - appendLogEntry(logPath, entry1) - appendLogEntry(logPath, entry2) - - const lines = fs - .readFileSync(logPath, 'utf-8') - .trim() - .split('\n') - expect(lines).toHaveLength(2) - expect(JSON.parse(lines[0]).taskId).toBe('a') - expect(JSON.parse(lines[1]).taskId).toBe('b') - }) -}) diff --git a/evalbuff/src/__tests__/trace-compressor.test.ts b/evalbuff/src/__tests__/trace-compressor.test.ts deleted file mode 100644 index 7039465fdc..0000000000 --- a/evalbuff/src/__tests__/trace-compressor.test.ts +++ /dev/null @@ -1,159 +0,0 @@ -import fs from 'fs' -import os from 'os' -import path from 'path' - -import { afterEach, beforeEach, describe, expect, it } from 'bun:test' - -import { compressTrace, cleanupTraceDir } from '../trace-compressor' - -let traceDir: string - -beforeEach(() => { - traceDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-trace-test-')) -}) - -afterEach(() => { - cleanupTraceDir(traceDir) -}) - -describe('compressTrace', () => { - it('leaves short traces unchanged', () => { - const trace = 'Thinking about the problem...\nLooking at the code.\nDone.' - const result = compressTrace(trace, traceDir) - - expect(result.inline).toBe(trace) - expect(fs.readdirSync(traceDir).filter((f) => f.endsWith('.txt'))).toHaveLength(0) - }) - - it('extracts large code fence blocks to files', () => { - const largeBlock = 'x'.repeat(500) - const trace = `Thinking about the problem... 
-\`\`\` -${largeBlock} -\`\`\` -Done.` - - const result = compressTrace(trace, traceDir) - - // The inline trace should have a pointer instead of the large block - expect(result.inline).toContain('[Code block stored in:') - expect(result.inline).toMatch(/\d+ chars/) - expect(result.inline).not.toContain(largeBlock) - - // The file should contain the block - const files = fs.readdirSync(traceDir).filter((f) => f.endsWith('.txt')) - expect(files).toHaveLength(1) - const fileContent = fs.readFileSync(path.join(traceDir, files[0]), 'utf-8') - expect(fileContent).toContain(largeBlock) - }) - - it('keeps small code fence blocks inline', () => { - const trace = `Looking at code: -\`\`\` -const x = 1 -\`\`\` -Done.` - - const result = compressTrace(trace, traceDir) - - expect(result.inline).toContain('const x = 1') - expect(result.inline).not.toContain('[Code block stored in:') - expect(fs.readdirSync(traceDir).filter((f) => f.endsWith('.txt'))).toHaveLength(0) - }) - - it('extracts large indented blocks', () => { - const indentedLines = Array.from({ length: 20 }, (_, i) => ` line ${i}: ${'content '.repeat(10)}`).join('\n') - const trace = `Running command:\n${indentedLines}\nDone.` - - const result = compressTrace(trace, traceDir) - - expect(result.inline).toContain('[Indented block stored in:') - expect(result.inline).toContain('20 lines') - - const files = fs.readdirSync(traceDir).filter((f) => f.endsWith('.txt')) - expect(files).toHaveLength(1) - }) - - it('handles JSON-lines format (Claude streaming)', () => { - const largeContent = 'x'.repeat(500) - const events = [ - JSON.stringify({ type: 'tool_use', name: 'Read', input: { path: 'src/index.ts' } }), - JSON.stringify({ type: 'tool_result', content: largeContent }), - JSON.stringify({ type: 'text', content: 'Now I understand the code.' 
}), - ] - const trace = events.join('\n') - - const result = compressTrace(trace, traceDir) - - // Tool use should still be inline - expect(result.inline).toContain('"name":"Read"') - // Large tool result should be extracted - expect(result.inline).toContain('[Stored in:') - expect(result.inline).not.toContain(largeContent) - // Text event should be inline - expect(result.inline).toContain('Now I understand the code') - - const files = fs.readdirSync(traceDir).filter((f) => f.endsWith('.txt')) - expect(files).toHaveLength(1) - }) - - it('keeps small JSON tool results inline', () => { - const events = [ - JSON.stringify({ type: 'tool_use', name: 'Read', input: { path: 'a.ts' } }), - JSON.stringify({ type: 'tool_result', content: 'short result' }), - ] - const trace = events.join('\n') - - const result = compressTrace(trace, traceDir) - - expect(result.inline).toContain('short result') - expect(result.inline).not.toContain('[Stored in:') - }) - - it('extracts multiple large blocks', () => { - const block1 = 'a'.repeat(500) - const block2 = 'b'.repeat(500) - const trace = `Step 1: -\`\`\` -${block1} -\`\`\` -Step 2: -\`\`\` -${block2} -\`\`\` -Done.` - - const result = compressTrace(trace, traceDir) - - const files = fs.readdirSync(traceDir).filter((f) => f.endsWith('.txt')) - expect(files).toHaveLength(2) - expect(result.inline).not.toContain(block1) - expect(result.inline).not.toContain(block2) - }) - - it('generates a content summary in the pointer', () => { - const jsonBlock = '{\n "name": "test",\n' + ' "data": "x",\n'.repeat(50) + '}' - const trace = `Result:\n\`\`\`\n${jsonBlock}\n\`\`\`\nDone.` - - const result = compressTrace(trace, traceDir) - - // Should have a summary mentioning it's a code block - expect(result.inline).toContain('code block') - }) -}) - -describe('cleanupTraceDir', () => { - it('removes the directory and all files', () => { - fs.writeFileSync(path.join(traceDir, 'test.txt'), 'content') - expect(fs.existsSync(traceDir)).toBe(true) - - 
cleanupTraceDir(traceDir) - - expect(fs.existsSync(traceDir)).toBe(false) - }) - - it('does not throw on non-existent directory', () => { - cleanupTraceDir('/tmp/nonexistent-evalbuff-trace-dir-xyz') - // Should not throw - }) -}) diff --git a/evalbuff/src/agent-runner.ts b/evalbuff/src/agent-runner.ts deleted file mode 100644 index 174dcb22b9..0000000000 --- a/evalbuff/src/agent-runner.ts +++ /dev/null @@ -1,196 +0,0 @@ -import { execSync , exec } from 'child_process' -import { promisify } from 'util' - -const execAsync = promisify(exec) - -import { withTimeout } from '@codebuff/common/util/promise' - - -import { withTestRepo } from './test-repo-utils' -import { ClaudeRunner } from './runners/claude' -import { CodebuffRunner } from './runners/codebuff' -import { CodexRunner } from './runners/codex' - -import type { Runner, AgentStep } from './runners/runner' -import type { EvalCommitV2, FinalCheckOutput } from './types' -import type { CodebuffClient } from '@codebuff/sdk' - -export type { AgentStep } - -export type ExternalAgentType = 'claude' | 'codex' - -export async function runAgentOnCommit({ - client, - agentId, - commit, - repoUrl, - initCommand, - env, - localAgentDefinitions, - printEvents, - finalCheckCommands, - externalAgentType, -}: { - client: CodebuffClient - agentId: string - commit: EvalCommitV2 - repoUrl: string - initCommand?: string - env?: Record - localAgentDefinitions: any[] - printEvents: boolean - finalCheckCommands?: string[] - externalAgentType?: ExternalAgentType -}): Promise<{ - diff: string - contextFiles: Record - durationMs: number - cost: number - error?: string - trace: AgentStep[] - finalCheckOutputs?: FinalCheckOutput[] -}> { - console.log(`[${commit.id}] Running agent ${agentId}...`) - const startTime = Date.now() - let diff = '' - let contextFiles: Record = {} - let error: string | undefined - let cost = 0 - const trace: AgentStep[] = [] - let finalCheckOutputs: FinalCheckOutput[] | undefined - - try { - const timeoutMs = 60 * 
60 * 1000 // 60 minutes - await withTimeout( - withTestRepo( - { - repoUrl, - parentSha: commit.parentSha, - initCommand, - env, - }, - async (repoDir) => { - // Select the appropriate runner - let runner: Runner - if (externalAgentType === 'claude') { - runner = new ClaudeRunner(repoDir, env) - } else if (externalAgentType === 'codex') { - runner = new CodexRunner(repoDir, env) - } else { - runner = new CodebuffRunner({ - cwd: repoDir, - env, - client, - agentId, - localAgentDefinitions, - printEvents, - commitId: commit.id, - parentSha: commit.parentSha, - }) - } - - console.log( - `[${commit.id}] Running agent: ${externalAgentType || 'codebuff'}`, - ) - - const result = await runner.run(commit.prompt) - trace.push(...result.steps) - cost = result.totalCostUsd - diff = result.diff - - const contextFilePaths = new Set([ - ...commit.supplementalFiles, - ...commit.fileDiffs.map((fd) => fd.path), - ]) - for (const { status, path } of commit.fileDiffs) { - if (status === 'added') { - contextFilePaths.delete(path) - } - } - - for (const filePath of contextFilePaths) { - try { - const content = execSync( - `git show ${commit.parentSha}:${JSON.stringify(filePath)}`, - { - cwd: repoDir, - encoding: 'utf-8', - maxBuffer: 10 * 1024 * 1024, - }, - ) - contextFiles[filePath] = content - } catch (error) { - contextFiles[filePath] = '' - } - } - - // Run final check commands if specified - if (finalCheckCommands && finalCheckCommands.length > 0) { - console.log( - `[${commit.id}] Running ${finalCheckCommands.length} final check commands...`, - ) - finalCheckOutputs = await runFinalCheckCommands( - finalCheckCommands, - repoDir, - env, - ) - } - }, - ), - timeoutMs, - `Agent ${agentId} timed out after ${timeoutMs / 1000} seconds`, - ) - } catch (e) { - error = e instanceof Error ? 
`${e.message}\n${e.stack}` : String(e) - } - - const durationMs = Date.now() - startTime - - return { - diff, - contextFiles, - durationMs, - cost, - error, - trace, - finalCheckOutputs, - } -} - -async function runFinalCheckCommands( - commands: string[], - cwd: string, - env?: Record, -): Promise { - const results: FinalCheckOutput[] = [] - - for (const command of commands) { - console.log(` Running: ${command}`) - try { - const { stdout, stderr } = await execAsync(command, { - cwd, - encoding: 'utf-8', - maxBuffer: 10 * 1024 * 1024, // 10MB buffer - env: { ...process.env, ...env }, - }) - results.push({ - command, - exitCode: 0, - stdout, - stderr, - }) - console.log(` ✓ Command succeeded: ${command}`) - } catch (error: any) { - // Command failed, but we still capture the output - results.push({ - command, - exitCode: error.code || 1, - stdout: error.stdout || '', - stderr: error.stderr || error.message || '', - }) - console.log(` ✗ Command failed (exit ${error.code}): ${command}`) - } - } - - return results -} diff --git a/evalbuff/src/carve-features.ts b/evalbuff/src/carve-features.ts deleted file mode 100644 index 080f1080ef..0000000000 --- a/evalbuff/src/carve-features.ts +++ /dev/null @@ -1,533 +0,0 @@ -/** - * Feature Carver for evalbuff v2. - * - * Instead of using git commits as evals, this: - * 1. Analyzes a codebase to identify discrete, self-contained features - * 2. Plans how to cleanly delete each feature - * 3. Produces diffs that remove the feature (code, docs, references) - * - * The output can then be used as eval tasks: give agents a simple prompt - * to rebuild the deleted feature, judge against the original code. 
- */ -import { execSync } from 'child_process' -import fs from 'fs' -import path from 'path' - -import OpenAI from 'openai' - -// --- Types --- - -export interface CarveCandidate { - id: string - name: string - prompt: string // Short, natural prompt to rebuild this feature - description: string // What this feature does - files: string[] // Files involved (to delete or modify) - complexity: 'small' | 'medium' | 'large' -} - -export interface CarvePlan { - candidates: CarveCandidate[] - reasoning: string -} - -export interface FileOperation { - path: string - action: 'delete' | 'modify' - /** For 'modify': the new file content with the feature removed */ - newContent?: string -} - -export interface CarvedFeature { - id: string - prompt: string - description: string - complexity: 'small' | 'medium' | 'large' - /** Files as they exist before carving (the "ground truth" to rebuild) */ - originalFiles: Record - /** Operations to perform to carve the feature out */ - operations: FileOperation[] - /** Unified diff of the carving (deletions) */ - diff: string -} - -export interface CarveResult { - repoPath: string - generationDate: string - features: CarvedFeature[] -} - -// --- OpenAI client --- - -function getClient(): OpenAI { - return new OpenAI() // Uses OPENAI_API_KEY from env -} - -const PLANNING_MODEL = 'gpt-5.4' -const CARVING_MODEL = 'gpt-5.4' - -// --- Repo analysis helpers --- - -function getFileTree(repoPath: string, maxDepth: number = 4): string { - try { - // Use git ls-files to only get tracked files - const files = execSync('git ls-files', { - cwd: repoPath, - encoding: 'utf-8', - maxBuffer: 10 * 1024 * 1024, - }) - .trim() - .split('\n') - .filter(Boolean) - - // Filter out noise - const filtered = files.filter((f) => { - const parts = f.split('/') - if (parts.length > maxDepth) return false - if (f.endsWith('.lock') || f.endsWith('.lockb')) return false - if (f.includes('node_modules/')) return false - if (f.endsWith('.json') && 
f.includes('package-lock')) return false - return true - }) - - return filtered.join('\n') - } catch { - return '' - } -} - -function readFile(repoPath: string, filePath: string): string | null { - try { - const fullPath = path.join(repoPath, filePath) - return fs.readFileSync(fullPath, 'utf-8') - } catch { - return null - } -} - -function getRepoStats(repoPath: string): string { - const fileTree = getFileTree(repoPath) - const files = fileTree.split('\n').filter(Boolean) - - const byExtension: Record = {} - for (const f of files) { - const ext = path.extname(f) || '(no ext)' - byExtension[ext] = (byExtension[ext] || 0) + 1 - } - - const sorted = Object.entries(byExtension) - .sort((a, b) => b[1] - a[1]) - .slice(0, 15) - .map(([ext, count]) => ` ${ext}: ${count}`) - .join('\n') - - return `Total tracked files: ${files.length}\nBy extension:\n${sorted}` -} - -// --- Phase 1: Plan features to carve --- - -const PLANNING_SYSTEM = `You are an expert software architect analyzing a codebase to identify discrete, self-contained features that can be cleanly "carved out" (deleted) and used as coding evaluation tasks. - -## Your Goal - -Identify 15-25 features in this codebase that could be cleanly removed and then rebuilt by a coding agent. Each feature should: - -1. **Be self-contained** — removing it leaves the rest of the codebase functional (maybe some missing imports/references, but structurally intact) -2. **Be describable in 1-2 sentences** — a developer could ask for it naturally -3. **Be non-trivial but bounded** — not a one-liner, but not "rewrite the whole app" -4. **Cover different aspects** — mix of UI components, API endpoints, utilities, config, tests, etc. -5. 
**Not overlap** — deleting feature A shouldn't also delete most of feature B - -## What makes a good carve candidate - -- A React component + its usage sites -- An API endpoint (route + handler + types) -- A CLI subcommand or flag -- A utility module used in a few places -- A feature behind a config/flag -- A test suite for a specific module -- A middleware or plugin -- An integration with an external service - -## What makes a BAD candidate - -- Core infrastructure that everything depends on (routing, auth framework, database connection) -- A single function that's called in 50 places -- Trivially small changes (rename, config tweak) -- Auto-generated or boilerplate code - -## Output Format - -Respond with valid JSON matching this schema: -{ - "reasoning": "Your analysis of the codebase and approach to selecting features", - "candidates": [ - { - "id": "short-kebab-id", - "name": "Human readable name", - "prompt": "Natural prompt a developer would use to ask for this feature, 1-2 sentences", - "description": "What this feature does and why it exists", - "files": ["path/to/file1.ts", "path/to/file2.tsx"], - "complexity": "small|medium|large" - } - ] -} - -Be thorough in listing ALL files involved in each feature — missing a file means the carve won't be clean.` - -export async function planFeatures(repoPath: string): Promise { - const client = getClient() - - const fileTree = getFileTree(repoPath) - const stats = getRepoStats(repoPath) - - // Read key files for context - const keyFiles = [ - 'package.json', - 'README.md', - 'CLAUDE.md', - 'tsconfig.json', - 'src/index.ts', - 'src/index.tsx', - 'src/app.ts', - 'src/app.tsx', - 'src/main.ts', - 'src/main.tsx', - ] - - let keyFileContents = '' - for (const kf of keyFiles) { - const content = readFile(repoPath, kf) - if (content) { - keyFileContents += `\n### ${kf}\n\`\`\`\n${content.slice(0, 5000)}\n\`\`\`\n` - } - } - - const userPrompt = `## Repository Stats -${stats} - -## File Tree -\`\`\` -${fileTree} -\`\`\` - 
-## Key Files -${keyFileContents || '(none found)'} - -Please analyze this codebase and identify 15-25 features that can be cleanly carved out for evaluation.` - - console.log('Planning features to carve...') - const response = await client.chat.completions.create({ - model: PLANNING_MODEL, - messages: [ - { role: 'system', content: PLANNING_SYSTEM }, - { role: 'user', content: userPrompt }, - ], - response_format: { type: 'json_object' }, - }) - - const text = response.choices[0]?.message?.content - if (!text) throw new Error('No response from planning model') - - const parsed = JSON.parse(text) as CarvePlan - console.log(`Identified ${parsed.candidates.length} carve candidates`) - return parsed -} - -// --- Phase 2: Execute carving for each feature --- - -const CARVING_SYSTEM = `You are a precise code surgeon. Your job is to cleanly remove a specific feature from a codebase. - -## Rules - -1. **Delete completely** — remove ALL code related to the feature: components, handlers, types, tests, docs, imports, route registrations, etc. -2. **Don't break the rest** — the remaining code should still be structurally valid. Fix imports, remove dead references, etc. -3. **Minimal collateral** — only remove what's necessary. Don't "improve" or refactor surrounding code. -4. **Be thorough** — check for references in other files. If file A imports something from the feature, update file A's imports. - -## Output Format - -Respond with valid JSON matching this schema: -{ - "operations": [ - { - "path": "path/to/file.ts", - "action": "delete" - }, - { - "path": "path/to/other-file.ts", - "action": "modify", - "newContent": "...full file content with feature removed..." - } - ] -} - -For "modify" operations, provide the COMPLETE new file content (not a diff). This must be the entire file with only the feature-related code removed. -For "delete" operations, the entire file will be removed. - -Only include files that actually need to change. 
Don't include files that are unaffected.` - -export async function carveFeature( - repoPath: string, - candidate: CarveCandidate, -): Promise { - const client = getClient() - - // Read all files involved - const fileContents: Record = {} - for (const filePath of candidate.files) { - const content = readFile(repoPath, filePath) - if (content) { - fileContents[filePath] = content - } - } - - if (Object.keys(fileContents).length === 0) { - console.warn(` No readable files for feature ${candidate.id}, skipping`) - return null - } - - // Also read files that might reference the feature's files (importers) - const referenceFiles = findReferencingFiles(repoPath, candidate.files) - for (const refFile of referenceFiles) { - if (!fileContents[refFile]) { - const content = readFile(repoPath, refFile) - if (content) { - fileContents[refFile] = content - } - } - } - - let filesSection = '' - for (const [filePath, content] of Object.entries(fileContents)) { - const isFeatureFile = candidate.files.includes(filePath) - const label = isFeatureFile ? '(FEATURE FILE)' : '(REFERENCING FILE)' - filesSection += `\n### ${filePath} ${label}\n\`\`\`\n${content}\n\`\`\`\n` - } - - const userPrompt = `## Feature to Remove -**Name:** ${candidate.name} -**Description:** ${candidate.description} -**Feature files:** ${candidate.files.join(', ')} - -## Current File Contents -${filesSection} - -Remove this feature completely. For files that are entirely part of the feature, use "delete". 
For files that contain the feature mixed with other code, use "modify" and provide the full updated content.` - - console.log(` Carving feature: ${candidate.id}...`) - const response = await client.chat.completions.create({ - model: CARVING_MODEL, - messages: [ - { role: 'system', content: CARVING_SYSTEM }, - { role: 'user', content: userPrompt }, - ], - response_format: { type: 'json_object' }, - }) - - const text = response.choices[0]?.message?.content - if (!text) { - console.warn(` No response for feature ${candidate.id}`) - return null - } - - const parsed = JSON.parse(text) as { operations: FileOperation[] } - - // Compute diff - const diff = computeDiff(repoPath, parsed.operations) - - // Save original files (only the feature files, for judging) - const originalFiles: Record = {} - for (const filePath of candidate.files) { - if (fileContents[filePath]) { - originalFiles[filePath] = fileContents[filePath] - } - } - - return { - id: candidate.id, - prompt: candidate.prompt, - description: candidate.description, - complexity: candidate.complexity, - originalFiles, - operations: parsed.operations, - diff, - } -} - -// --- Helpers --- - -/** - * Find files that import/reference any of the given files. - * Uses git grep to find import statements. 
- */ -function findReferencingFiles( - repoPath: string, - featureFiles: string[], -): string[] { - const referencingFiles = new Set() - - for (const featureFile of featureFiles) { - // Extract the module name (without extension) for import matching - const basename = path.basename(featureFile).replace(/\.[^.]+$/, '') - const dirname = path.dirname(featureFile) - - // Search for imports of this file - try { - const results = execSync( - `git grep -l "${basename}" -- '*.ts' '*.tsx' '*.js' '*.jsx'`, - { - cwd: repoPath, - encoding: 'utf-8', - maxBuffer: 10 * 1024 * 1024, - }, - ) - .trim() - .split('\n') - .filter(Boolean) - - for (const result of results) { - // Don't include the feature's own files - if (!featureFiles.includes(result)) { - referencingFiles.add(result) - } - } - } catch { - // git grep returns exit code 1 when no matches - } - } - - // Limit to reasonable number - const sorted = [...referencingFiles].slice(0, 20) - return sorted -} - -/** - * Compute a unified diff from file operations. - * Creates a temp worktree, applies operations, and diffs. - */ -function computeDiff( - repoPath: string, - operations: FileOperation[], -): string { - const diffs: string[] = [] - - for (const op of operations) { - const fullPath = path.join(repoPath, op.path) - const originalContent = fs.existsSync(fullPath) - ? 
fs.readFileSync(fullPath, 'utf-8') - : '' - - if (op.action === 'delete') { - // Show the full file as deleted - const lines = originalContent.split('\n') - const header = `--- a/${op.path}\n+++ /dev/null` - const hunk = `@@ -1,${lines.length} +0,0 @@\n` + - lines.map((l) => `-${l}`).join('\n') - diffs.push(`${header}\n${hunk}`) - } else if (op.action === 'modify' && op.newContent !== undefined) { - // Compute line-level diff - const oldLines = originalContent.split('\n') - const newLines = op.newContent.split('\n') - // Use a simple diff representation — the full before/after - const header = `--- a/${op.path}\n+++ b/${op.path}` - // For now, show full replacement (not optimal but correct) - const hunk = `@@ -1,${oldLines.length} +1,${newLines.length} @@\n` + - oldLines.map((l) => `-${l}`).join('\n') + '\n' + - newLines.map((l) => `+${l}`).join('\n') - diffs.push(`${header}\n${hunk}`) - } - } - - return diffs.join('\n\n') -} - -// --- Main orchestrator --- - -export async function carveFeatures( - repoPath: string, - options: { - count?: number // Number of features to carve (default: 10) - outputPath?: string - } = {}, -): Promise { - const { count = 10, outputPath } = options - - console.log(`\nCarving features from: ${repoPath}`) - console.log(`Target: ${count} features\n`) - - // Phase 1: Plan - const plan = await planFeatures(repoPath) - - console.log(`\nPlanning complete. 
Reasoning:\n${plan.reasoning}\n`) - console.log('Candidates:') - for (const c of plan.candidates) { - console.log(` ${c.id} (${c.complexity}): ${c.name}`) - console.log(` Prompt: ${c.prompt}`) - console.log(` Files: ${c.files.join(', ')}`) - } - - // Select top N candidates (prefer medium complexity) - const ranked = [...plan.candidates].sort((a, b) => { - const complexityOrder = { medium: 0, small: 1, large: 2 } - return complexityOrder[a.complexity] - complexityOrder[b.complexity] - }) - const selected = ranked.slice(0, count) - - console.log(`\nSelected ${selected.length} features for carving:\n`) - - // Phase 2: Carve each feature - const features: CarvedFeature[] = [] - for (const candidate of selected) { - try { - const carved = await carveFeature(repoPath, candidate) - if (carved) { - features.push(carved) - console.log(` ✓ ${carved.id} — ${carved.operations.length} file operations`) - } - } catch (error) { - console.error(` ✗ ${candidate.id} failed:`, error) - } - } - - const result: CarveResult = { - repoPath, - generationDate: new Date().toISOString(), - features, - } - - // Save output - const outPath = - outputPath || - path.join(repoPath, `carve-${new Date().toISOString().slice(0, 10)}.json`) - fs.writeFileSync(outPath, JSON.stringify(result, null, 2)) - console.log(`\nSaved ${features.length} carved features to: ${outPath}`) - - return result -} - -// --- CLI --- - -if (import.meta.main) { - const args = process.argv.slice(2) - - const getArg = (name: string, defaultValue?: string): string => { - const idx = args.indexOf(`--${name}`) - if (idx >= 0 && idx + 1 < args.length) return args[idx + 1] - if (defaultValue !== undefined) return defaultValue - throw new Error(`Missing required argument: --${name}`) - } - - const repoPath = getArg('repo') - const count = parseInt(getArg('count', '10')) - const outputPath = args.indexOf('--output') >= 0 ? 
getArg('output') : undefined - - carveFeatures(repoPath, { count, outputPath }) - .then((result) => { - console.log(`\nDone! Carved ${result.features.length} features.`) - }) - .catch((error) => { - console.error('Carving failed:', error) - process.exit(1) - }) -} diff --git a/evalbuff/src/cli-runner.ts b/evalbuff/src/cli-runner.ts deleted file mode 100644 index fdd3cd50cf..0000000000 --- a/evalbuff/src/cli-runner.ts +++ /dev/null @@ -1,113 +0,0 @@ -import { execSync, spawn } from 'child_process' - -export interface CliRunnerOptions { - command: string // e.g., "claude -p" or "codex exec --full-auto" - prompt: string - cwd: string - timeoutMs: number // Default 300_000 (5 min) - env?: Record -} - -export interface CliRunnerResult { - diff: string - durationMs: number - exitCode: number - stdout: string - stderr: string -} - -export async function runCliAgent( - options: CliRunnerOptions, -): Promise { - const { command, prompt, cwd, timeoutMs, env } = options - const startTime = Date.now() - - return new Promise((resolve, reject) => { - const [cmd, ...baseArgs] = command.split(' ') - const args = [...baseArgs, prompt] - - console.log(`[CliRunner] Running: ${cmd} ${baseArgs.join(' ')} `) - - // Use detached + process group so we can kill the entire tree on timeout - const child = spawn(cmd, args, { - cwd, - env: { ...process.env, ...env }, - stdio: ['ignore', 'pipe', 'pipe'], - detached: true, - }) - - let stdout = '' - let stderr = '' - - const killTree = () => { - const pid = child.pid - if (pid != null) { - try { - // Kill the entire process group (negative pid) - process.kill(-pid, 'SIGTERM') - } catch { - // Process may already be dead - } - setTimeout(() => { - try { - process.kill(-pid, 'SIGKILL') - } catch { - // ignore - } - }, 5000) - } - } - - const timer = setTimeout(() => { - console.warn(`[CliRunner] Timeout after ${timeoutMs}ms, killing process tree`) - killTree() - }, timeoutMs) - - child.stdout.on('data', (data: Buffer) => { - stdout += 
data.toString() - }) - - child.stderr.on('data', (data: Buffer) => { - stderr += data.toString() - process.stderr.write(data) - }) - - child.on('error', (error) => { - clearTimeout(timer) - reject( - new Error( - `CLI agent failed to start: ${error.message}. Make sure '${cmd}' is installed and in PATH.`, - ), - ) - }) - - child.on('close', (code) => { - clearTimeout(timer) - const durationMs = Date.now() - startTime - - // Capture git diff of agent's changes - let diff = '' - try { - execSync('git add .', { cwd, stdio: 'ignore' }) - diff = execSync('git diff HEAD', { - cwd, - encoding: 'utf-8', - maxBuffer: 10 * 1024 * 1024, - }) - } catch { - // Ignore git errors - } - - resolve({ - diff, - durationMs, - exitCode: code ?? 1, - stdout, - stderr, - }) - }) - - // Don't let the detached child keep the parent alive - child.unref() - }) -} diff --git a/evalbuff/src/commit-task-generator.ts b/evalbuff/src/commit-task-generator.ts deleted file mode 100644 index e85127699d..0000000000 --- a/evalbuff/src/commit-task-generator.ts +++ /dev/null @@ -1,345 +0,0 @@ -import { execSync } from 'child_process' -import fs from 'fs' -import path from 'path' - -import { generatePrompt } from './llm' - -export interface CommitTask { - sha: string - parentSha: string - message: string - prompt: string - diff: string - filesChanged: string[] -} - -const MAX_DIFF_CHARS = 200_000 - -/** - * Commit message patterns that indicate trivial/automated commits not worth - * running agents on. Saves ~10 agent+judge invocations per skipped commit. - */ -const TRIVIAL_COMMIT_PATTERNS = [ - /^bump\b.*\bversion\b/i, - /^v?\d+\.\d+\.\d+$/, // version-only messages like "1.0.635" - /^release\s+v?\d+/i, - /^chore\(release\)/i, - /^update\s+(change|changelog)/i, - /^merge\s+(branch|pull request)/i, -] - -/** - * Returns true if a commit is trivial and should be skipped. - * Checks commit message patterns and whether only package.json version fields changed. 
- */ -function isTrivialCommit( - message: string, - filesChanged: string[], - diff: string, -): boolean { - const firstLine = message.split('\n')[0].trim() - - // Check message patterns - if (TRIVIAL_COMMIT_PATTERNS.some((p) => p.test(firstLine))) return true - - // Single package.json change that only touches "version" field - if ( - filesChanged.length === 1 && - filesChanged[0].endsWith('package.json') && - diff.length < 1000 - ) { - const addedLines = diff - .split('\n') - .filter((l) => l.startsWith('+') && !l.startsWith('+++')) - const removedLines = diff - .split('\n') - .filter((l) => l.startsWith('-') && !l.startsWith('---')) - const allVersionChanges = - [...addedLines, ...removedLines].every((l) => - /^\s*[+-]\s*"version"/.test(l), - ) - if (allVersionChanges) return true - } - - return false -} - -/** - * Files that add noise to diffs without useful signal. - * Lockfiles are huge and auto-generated — agents shouldn't replicate them. - */ -const NOISE_FILE_PATTERNS = [ - 'bun.lock', - 'bun.lockb', - 'package-lock.json', - 'yarn.lock', - 'pnpm-lock.yaml', - 'Gemfile.lock', - 'Cargo.lock', - 'poetry.lock', - 'composer.lock', - 'go.sum', -] - -function isNoiseFile(filePath: string): boolean { - const basename = filePath.split('/').pop() || '' - return NOISE_FILE_PATTERNS.includes(basename) -} - -/** - * Get a list of commits from the repo, oldest first. - * Starts from `startAfterSha` (exclusive) or HEAD~commitCount if no state. - */ -export function getCommitList( - repoPath: string, - commitCount: number, - startAfterSha?: string, -): string[] { - if (startAfterSha) { - // Get all commits from startAfterSha (exclusive) to HEAD - const output = execSync( - `git log --format=%H --reverse ${startAfterSha}..HEAD`, - { cwd: repoPath, encoding: 'utf-8', maxBuffer: 10 * 1024 * 1024 }, - ).trim() - return output ? 
output.split('\n') : [] - } - - // Get last N commits, oldest first - const output = execSync( - `git log --format=%H -n ${commitCount} --reverse`, - { cwd: repoPath, encoding: 'utf-8', maxBuffer: 10 * 1024 * 1024 }, - ).trim() - return output ? output.split('\n') : [] -} - -/** - * Extract commit info needed to build a task. - * Returns null for merge commits or commits with no parent. - */ -export function getCommitInfo( - repoPath: string, - sha: string, -): { parentSha: string; message: string; diff: string; filesChanged: string[] } | null { - try { - // Get parent SHA - const parents = execSync(`git log --pretty=%P -n 1 ${sha}`, { - cwd: repoPath, - encoding: 'utf-8', - }).trim() - - if (!parents) return null // initial commit - - const parentList = parents.split(' ') - if (parentList.length > 1) return null // merge commit - - const parentSha = parentList[0] - - // Get commit message - const message = execSync(`git log --format=%B -n 1 ${sha}`, { - cwd: repoPath, - encoding: 'utf-8', - }).trim() - - // Get files changed (filter out noise files like lockfiles) - const filesOutput = execSync(`git diff --name-only ${parentSha} ${sha}`, { - cwd: repoPath, - encoding: 'utf-8', - }).trim() - const allFiles = filesOutput ? filesOutput.split('\n') : [] - const filesChanged = allFiles.filter((f) => !isNoiseFile(f)) - - // Get diff, excluding noise files (lockfiles etc.) - const excludeArgs = NOISE_FILE_PATTERNS.map((p) => `':!${p}'`).join(' ') - const diff = execSync( - `git diff ${parentSha} ${sha} -- . ${excludeArgs}`, - { - cwd: repoPath, - encoding: 'utf-8', - maxBuffer: 10 * 1024 * 1024, - }, - ) - - return { parentSha, message, diff, filesChanged } - } catch { - return null - } -} - -/** - * Read a file's content at a specific commit SHA. - * Returns null if the file doesn't exist at that commit. 
- */ -function readFileAtCommit( - repoPath: string, - sha: string, - filePath: string, -): string | null { - try { - return execSync(`git show ${sha}:${JSON.stringify(filePath)}`, { - cwd: repoPath, - encoding: 'utf-8', - maxBuffer: 10 * 1024 * 1024, - }) - } catch { - return null - } -} - -/** - * Read the full contents of all files being modified at the parent commit. - * This gives the prompt generator context about what the code looks like - * before the change, so it can write a realistic human prompt. - */ -function readFilesAtParent( - repoPath: string, - parentSha: string, - filesChanged: string[], -): Record { - const files: Record = {} - let totalSize = 0 - const maxTotalSize = 500_000 // 500K total for all files - - for (const filePath of filesChanged) { - if (totalSize >= maxTotalSize) break - if (isNoiseFile(filePath)) continue - - const content = readFileAtCommit(repoPath, parentSha, filePath) - if (content != null && content.length > 0) { - files[filePath] = content - totalSize += content.length - } - } - - return files -} - -const PROMPT_GEN_SYSTEM = `You are generating a task prompt that a human developer would realistically write to ask an AI coding agent to make changes to their codebase. - -You will receive: -- A git diff showing exactly what was changed -- The full contents of all files being modified (as they looked BEFORE the change) -- The commit message (as a hint, but don't just copy it) - -Your job is to write a natural, human-sounding prompt — the kind of thing a developer would type into a chat with an AI assistant. - -## Key Principles - -1. Focus on high-level functional requirements, not implementation details - - GOOD: "add user authentication to the API" - - BAD: "implement an authenticateUser function in src/auth/middleware.ts" - -2. 
Use natural language — like a Slack message or ticket description - - GOOD: "the nightly CI is pointing at the wrong directory, it should be agents not .agents" - - BAD: "Update the directory reference in .github/workflows/nightly-e2e.yml from .agents to agents" - -3. Describe what you WANT or what's WRONG, not how to fix it - - GOOD: "the hover state on buttons looks broken" - - BAD: "change the CSS hover opacity from 0.5 to 0.8 in Button.tsx" - -4. Don't reference specific file paths unless a human naturally would. Humans describe the feature area, not the file tree. - - GOOD: "our login page needs to redirect to freebuff.com instead of codebuff.com" - - BAD: "update src/auth/login.ts, src/config/urls.ts, and tests/auth.test.ts to change codebuff.com to freebuff.com" - -5. Don't over-specify. Leave room for the agent to figure out the implementation. - -6. Keep it to 1-4 sentences. - -7. Read the FULL file contents to understand context. The diff alone can be misleading — understanding the surrounding code helps you write a prompt that makes sense for this codebase. - -## Output - -Respond with ONLY the prompt text. No quotes, no preamble, no explanation.` - -/** - * Generate a human-like task prompt from a commit. - * Reads the full files at the parent commit for context, similar to how - * buffbench uses file-explorer agents to understand the codebase. 
- */ -export async function generatePromptFromCommit( - repoPath: string, - parentSha: string, - message: string, - diff: string, - filesChanged: string[], -): Promise { - // Read full file contents at the parent commit for context - const fileContents = readFilesAtParent(repoPath, parentSha, filesChanged) - - let filesSection = '' - if (Object.keys(fileContents).length > 0) { - filesSection = `## File Contents (before the change)\n\n` - for (const [filePath, content] of Object.entries(fileContents)) { - filesSection += `### ${filePath}\n\`\`\`\n${content}\n\`\`\`\n\n` - } - } - - const userPrompt = `## Commit Message -${message} - -${filesSection}## Diff -\`\`\`diff -${diff} -\`\`\`` - - try { - // Use API directly — faster than spawning Claude CLI (~3s vs ~15s) - // and avoids CLAUDE.md/AGENTS.md context pollution - const output = await generatePrompt(PROMPT_GEN_SYSTEM, userPrompt) - return output || message - } catch { - // Fallback to the commit message itself - return message - } -} - -/** - * Build a full CommitTask from a SHA. - * Returns null if the commit can't be used (merge, initial, too large diff, etc). - */ -export async function buildCommitTask( - repoPath: string, - sha: string, -): Promise { - const info = getCommitInfo(repoPath, sha) - if (!info) return null - - // Skip trivial/automated commits (version bumps, releases, etc.) 
- if (isTrivialCommit(info.message, info.filesChanged, info.diff)) { - console.log(`Skipping ${sha.slice(0, 8)}: trivial commit (${info.message.split('\n')[0].slice(0, 50)})`) - return null - } - - // Skip commits with diffs that exceed our limit - if (info.diff.length > MAX_DIFF_CHARS) { - console.log(`Skipping ${sha.slice(0, 8)}: diff too large (${info.diff.length} chars)`) - return null - } - - // Skip commits with no meaningful code changes (after filtering noise files) - if (info.filesChanged.length === 0) { - return null - } - - // Skip commits where the diff is empty after filtering noise files - if (info.diff.trim().length === 0) { - console.log(`Skipping ${sha.slice(0, 8)}: only noise files changed (lockfiles, etc.)`) - return null - } - - const prompt = await generatePromptFromCommit( - repoPath, - info.parentSha, - info.message, - info.diff, - info.filesChanged, - ) - - return { - sha, - parentSha: info.parentSha, - message: info.message, - prompt, - diff: info.diff, - filesChanged: info.filesChanged, - } -} diff --git a/evalbuff/src/criteria.ts b/evalbuff/src/criteria.ts deleted file mode 100644 index bc3f9cd290..0000000000 --- a/evalbuff/src/criteria.ts +++ /dev/null @@ -1,165 +0,0 @@ -import fs from 'fs' - -export interface QualityCriterion { - name: string - weight: number - description: string -} - -export interface QualityCriteria { - level: number // 1-5 - criteria: QualityCriterion[] - promotionThreshold: number // default 8.0 - promotionWindow: number // default 10 -} - -export const DEFAULT_CRITERIA: Record = { - 1: [ - { - name: 'Builds & Compiles', - weight: 3, - description: - 'The code compiles, builds, and the project starts without errors. Run the build command and verify it succeeds.', - }, - { - name: 'Existing Tests Pass', - weight: 3, - description: - 'All pre-existing tests still pass. 
Run the test suite and confirm no regressions were introduced.', - }, - { - name: 'Basic Completeness', - weight: 2, - description: - 'All aspects of the prompt are addressed. No partial implementations or TODO comments left behind.', - }, - ], - 2: [ - { - name: 'Feature Works E2E', - weight: 4, - description: - 'The new feature or bug fix actually works when you use the application. Start the app, navigate to the relevant page or endpoint, and exercise the feature. Use browser tools, curl, or the appropriate client to verify the happy path end-to-end.', - }, - { - name: 'Logs & Observability', - weight: 1, - description: - 'Check application logs for errors, warnings, or stack traces during E2E testing. Verify no unexpected errors appear when exercising the feature.', - }, - ], - 3: [ - { - name: 'Edge Cases & Error States', - weight: 3, - description: - 'Test error states and edge cases E2E. Submit invalid inputs, trigger error conditions, test boundary values. Verify the app handles them gracefully without crashing.', - }, - { - name: 'UI/UX Verification', - weight: 2, - description: - 'For UI changes: visually verify the rendered output. Check layout, responsiveness, and that the UI matches expectations. Take screenshots to document.', - }, - ], - 4: [ - { - name: 'Cross-Component Integration', - weight: 2, - description: - 'Verify the change works correctly with related features. Test flows that cross component boundaries. If a backend change was made, verify the frontend still works. If a DB migration was added, verify queries work.', - }, - { - name: 'Performance & No Regressions', - weight: 2, - description: - 'Verify no performance regressions. Check page load times, API response times, or resource usage. Ensure the change does not break unrelated features.', - }, - ], - 5: [ - { - name: 'Production Readiness', - weight: 2, - description: - 'Full production readiness check. 
Verify migrations, environment variable handling, error recovery, and graceful degradation. The change should be safe to deploy.', - }, - ], -} - -export function getCriteriaForLevel(level: number): QualityCriterion[] { - const criteria: QualityCriterion[] = [] - for (let l = 1; l <= Math.min(level, 5); l++) { - criteria.push(...(DEFAULT_CRITERIA[l] || [])) - } - return criteria -} - -export function loadCriteria(criteriaPath?: string): QualityCriteria { - if (criteriaPath && fs.existsSync(criteriaPath)) { - const raw = JSON.parse(fs.readFileSync(criteriaPath, 'utf-8')) - return raw as QualityCriteria - } - return { - level: 1, - criteria: getCriteriaForLevel(1), - promotionThreshold: 8.0, - promotionWindow: 10, - } -} - -export function saveCriteria( - criteriaPath: string, - criteria: QualityCriteria, -): void { - fs.writeFileSync(criteriaPath, JSON.stringify(criteria, null, 2)) -} - -/** - * Checks if criteria should be promoted to the next level. - * Returns the new level if promoted, or the current level if not. - */ -export function maybePromoteCriteria( - criteria: QualityCriteria, - recentScores: number[], -): number { - if (criteria.level >= 5) return criteria.level - if (recentScores.length < criteria.promotionWindow) return criteria.level - - const windowScores = recentScores.slice(-criteria.promotionWindow) - const avg = windowScores.reduce((sum, s) => sum + s, 0) / windowScores.length - - if (avg >= criteria.promotionThreshold) { - const newLevel = criteria.level + 1 - console.log( - `Criteria promoted from level ${criteria.level} to ${newLevel} (avg ${avg.toFixed(1)} >= ${criteria.promotionThreshold})`, - ) - return newLevel - } - - return criteria.level -} - -/** - * Format criteria as text for injection into reviewer agent prompts. - */ -export function formatCriteriaForPrompt(criteria: QualityCriteria): string { - const lines = [ - `## Quality Criteria (Level ${criteria.level}/5)`, - '', - 'You MUST verify each of these criteria. 
Higher levels require deeper E2E testing:', - '', - ] - - for (const c of criteria.criteria) { - lines.push(`- **${c.name}** (weight: ${c.weight}): ${c.description}`) - } - - lines.push( - '', - 'For each criterion, describe what you tested and what you observed. If you cannot test a criterion (e.g., no UI for a backend change), note that and explain why.', - '', - 'Weight these criteria proportionally when computing scores. A failure on a high-weight criterion should have a bigger impact on the score than a low-weight one.', - ) - - return lines.join('\n') -} diff --git a/evalbuff/src/docs-optimizer.ts b/evalbuff/src/docs-optimizer.ts deleted file mode 100644 index 408dffc4c1..0000000000 --- a/evalbuff/src/docs-optimizer.ts +++ /dev/null @@ -1,381 +0,0 @@ -import fs from 'fs' -import os from 'os' -import path from 'path' - -import { analyzeFailureViaApi } from './llm' -import { compressTrace, cleanupTraceDir } from './trace-compressor' - -import type { JudgingResult } from './judge' - -export interface DocSuggestion { - reasoning: string - suggestedDocPath: string // relative to docs/, e.g. "coding-patterns/error-handling.md" - suggestedContent: string -} - -const DOC_WRITER_SYSTEM_PROMPT = `You are an expert at writing developer documentation that helps AI coding agents perform better. - -Your job: Given the results of an AI coding agent's attempt at a task, write a targeted documentation file that would help the agent perform better on FUTURE tasks — not just this specific one. - -## Critical Rule: Genericity - -The docs you write must be **generic enough to be useful across many future tasks**, not solely useful for the specific task that was just attempted. Think about: -- What general PATTERN does this failure reveal? -- What CONVENTION or ARCHITECTURE knowledge would prevent a whole class of similar errors? -- What would a senior developer tell a new team member on their first day? - -DO NOT write docs that only help with one specific task. 
If the failure is too task-specific and doesn't reveal a general pattern, respond with: {"skip": true, "reasoning": "Too task-specific to generalize"} - -## What Makes Good Agent Docs - -The best docs for AI coding agents are: -1. **Maps, not essays** — tell the agent WHERE things are and HOW they connect. "Feature X lives in src/x/, uses the Y pattern from src/shared/y.ts, and must be registered in src/registry.ts" -2. **Decision trees, not philosophy** — "If modifying auth, check src/middleware/auth.ts AND update tests in __tests__/auth.test.ts. If adding a new route, register it in routes.ts." -3. **Anti-patterns with fixes** — "DON'T create new files in the root. DO put utilities in src/shared/. DON'T import from '../../../', DO use the path alias @/" -4. **Concrete examples** — Show a before/after or a correct pattern from the actual codebase. - -Bad docs that HURT agent performance (avoid these): -- Vague principles like "keep code clean" or "follow SOLID" -- Long explanations without actionable takeaways -- Docs that duplicate what's already in the code (comments, types, etc.) -- Over-scoped docs that try to cover everything - -## Using the Agent Trace - -You may be given the agent's trace (stdout) showing its reasoning process, tool calls, and decisions. This is the most valuable signal — it shows you WHY the agent went wrong, not just WHAT it got wrong. Look for: -- **Wrong assumptions** about the codebase structure or conventions -- **Misunderstood patterns** — the agent tried something that doesn't match how this codebase works -- **Missing context** — the agent didn't know about a key file, config, or convention -- **Wrong approach** — the agent took a fundamentally different approach than needed - -Write docs that address the ROOT CAUSE visible in the trace, not just the symptom visible in the diff. - -## Rules - -1. Be SPECIFIC and ACTIONABLE. Reference concrete file paths, function names, and patterns from the codebase. -2. 
Do NOT write generic advice like "follow best practices" or "write clean code." -3. Focus on the general PATTERN behind the gap, not the specific gap itself. -4. Write docs that a coding agent will read and immediately know what to do differently on any similar task. -5. Keep docs concise — under 100 lines. Dense information beats verbose explanations. Every line should be actionable. -6. Use a logical file path that groups related docs together (e.g., "patterns/", "conventions/", "architecture/"). -7. Include examples of correct patterns from the codebase when possible. -8. If a doc already exists on a similar topic, suggest UPDATING it (use the same path) rather than creating a new one. -9. Start the doc with a 1-2 sentence TL;DR that tells the agent the key rule. - -## Output Format - -You MUST respond with ONLY a JSON object (no markdown fences, no explanation). The JSON must have exactly these fields: -{ - "reasoning": "Why this doc would help (referencing the general pattern, not just this task)", - "suggestedDocPath": "path/relative/to/docs/dir.md", - "suggestedContent": "The markdown content" -} - -Or if too task-specific: -{"skip": true, "reasoning": "explanation"}` - -function formatEditHistory(history?: DocEditHistoryEntry[]): string { - if (!history || history.length === 0) return '' - - const lines = history.map((entry) => { - const score = - entry.scoreBefore != null && entry.scoreAfter != null - ? ` (score: ${entry.scoreBefore.toFixed(1)} → ${entry.scoreAfter.toFixed(1)})` - : '' - return `- **${entry.outcome.toUpperCase()}**: \`${entry.path}\`${score}\n Reasoning: ${entry.reasoning}` - }) - - return `## Edit History (previous doc edits tried this session) - -Use this history to avoid repeating rejected approaches and to build on what worked. - -${lines.join('\n')}` -} - -/** - * Analyze agent run results and suggest a doc edit to improve future performance. - * Always analyzes — no score threshold check. 
- * Returns null if the doc writer decides the failure is too task-specific to generalize. - */ -export interface DocEditHistoryEntry { - path: string - reasoning: string - outcome: 'accepted' | 'rejected' - scoreBefore?: number - scoreAfter?: number -} - -export async function analyzeFailure({ - judgeResult, - taskPrompt, - agentDiff, - agentTrace, - groundTruthDiff, - currentDocs, - editHistory, - commitMessage, -}: { - judgeResult: JudgingResult - taskPrompt: string - agentDiff: string - agentTrace?: string // stdout from the agent — reasoning, tool calls, errors - groundTruthDiff?: string // optional — not available in prompt mode - currentDocs: Record - editHistory?: DocEditHistoryEntry[] - commitMessage?: string // original commit message — helps identify patterns -}): Promise { - const docsContent = Object.entries(currentDocs) - .map(([docPath, content]) => `### ${docPath}\n\`\`\`\n${content}\n\`\`\``) - .join('\n\n') - - const groundTruthSection = groundTruthDiff - ? `## Ground Truth (what should have been done) -\`\`\`diff -${groundTruthDiff} -\`\`\`` - : '## Ground Truth\n(Not available — judge should have tested the output directly)' - - // Compress agent trace: keep reasoning inline, extract large tool results to files - // We inline the extracted files into the prompt to avoid extra tool-call roundtrips - let compressed: ReturnType | null = null - let traceSection = '' - - if (agentTrace) { - const traceDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-traces-')) - compressed = compressTrace(agentTrace, traceDir) - - // Inline extracted trace files to avoid tool-call roundtrips - const resultFiles = fs.readdirSync(traceDir).filter((f) => f.endsWith('.txt')) - let inlinedResults = '' - for (const file of resultFiles) { - const content = fs.readFileSync(path.join(traceDir, file), 'utf-8') - // Cap each file to 5KB to avoid bloating the prompt - const capped = content.length > 5000 ? content.slice(0, 5000) + '\n... 
(truncated)' : content - inlinedResults += `\n### ${file}\n\`\`\`\n${capped}\n\`\`\`\n` - } - - traceSection = `## Agent Trace (reasoning, tool calls, and decisions) - -This is the agent's stdout showing its reasoning process, tool calls, and decisions. -Look for: what the agent misunderstood, wrong assumptions it made, where it went off track. - -\`\`\` -${compressed.inline} -\`\`\` -${inlinedResults ? `\n## Extracted Tool Results\n${inlinedResults}` : ''}` - - // Clean up trace dir immediately since we've inlined everything - cleanupTraceDir(compressed.traceDir) - compressed = null - } - - const commitSection = commitMessage - ? `## Original Commit Message (for pattern context) -${commitMessage} - -` - : '' - - const prompt = `${DOC_WRITER_SYSTEM_PROMPT} - -## Task Prompt -${taskPrompt} - -${commitSection}## Judge Analysis -${judgeResult.analysis} - -## Judge Weaknesses Found -${judgeResult.weaknesses.map((w) => `- ${w}`).join('\n')} - -## Judge Strengths Found -${judgeResult.strengths.map((s) => `- ${s}`).join('\n')} - -## Overall Score: ${judgeResult.overallScore}/10 - -${groundTruthSection} - -## Agent's Changes (what was actually done) -\`\`\`diff -${agentDiff || '(No changes made)'} -\`\`\` - -${traceSection} - -## Current Docs (already available to the agent) -${docsContent || '(No docs yet)'} - -${formatEditHistory(editHistory)} - -Based on the agent's trace (if available), the gap between what the agent did and what it should have done, and the judge's analysis, write a doc file that captures a GENERAL PATTERN that would help the agent across many similar tasks. Focus on what the agent MISUNDERSTOOD (visible in the trace) rather than just what it got wrong (visible in the diff). If this failure doesn't reveal a generalizable pattern, respond with {"skip": true, "reasoning": "..."}. 
- -Respond with ONLY the JSON object.` - - try { - // Use API directly — faster than spawning Claude CLI and avoids cwd/CLAUDE.md pollution - const output = await analyzeFailureViaApi(prompt) - - // Try to extract JSON from the output - let jsonStr = output - const jsonMatch = output.match(/```(?:json)?\s*\n([\s\S]*?)\n\s*```/) - if (jsonMatch) { - jsonStr = jsonMatch[1] - } - const objMatch = jsonStr.match(/\{[\s\S]*\}/) - if (!objMatch) { - console.error('Doc writer did not return JSON') - return null - } - - const value = JSON.parse(objMatch[0]) - - // Check if the doc writer decided to skip - if (value.skip) { - console.log(`Doc writer skipped: ${value.reasoning}`) - return null - } - - const suggestion = value as DocSuggestion - - // Validate the path is under docs/ - if ( - suggestion.suggestedDocPath.startsWith('/') || - suggestion.suggestedDocPath.includes('..') - ) { - console.error( - `Doc writer suggested invalid path: ${suggestion.suggestedDocPath}`, - ) - return null - } - - if (!suggestion.reasoning || !suggestion.suggestedDocPath || !suggestion.suggestedContent) { - console.error('Doc writer returned incomplete suggestion') - return null - } - - return suggestion - } catch (error) { - console.error('Doc writer failed:', error) - return null - } -} - -/** - * Apply a doc edit to a repo — writes the file and updates AGENTS.md TOC. 
- */ -export function applyDocEdit( - repoPath: string, - docPath: string, - content: string, - agentsMdPath?: string, -): boolean { - if (docPath.startsWith('/') || docPath.includes('..')) { - console.error(`Rejected doc path outside docs/: ${docPath}`) - return false - } - - const fullDocPath = path.join(repoPath, 'docs', docPath) - const fullAgentsMdPath = agentsMdPath || path.join(repoPath, 'AGENTS.md') - - try { - fs.mkdirSync(path.dirname(fullDocPath), { recursive: true }) - - const isNew = !fs.existsSync(fullDocPath) - fs.writeFileSync(fullDocPath, content) - - if (isNew) { - let agentsMd = '' - if (fs.existsSync(fullAgentsMdPath)) { - agentsMd = fs.readFileSync(fullAgentsMdPath, 'utf-8') - } else { - agentsMd = '# Documentation\n\nTable of contents for project documentation.\n\n' - } - - const entry = `- [docs/${docPath}](docs/${docPath})\n` - if (!agentsMd.includes(`docs/${docPath}`)) { - agentsMd += entry - fs.writeFileSync(fullAgentsMdPath, agentsMd) - } - } - - return true - } catch (error) { - console.error(`Failed to apply doc edit: ${error}`) - return false - } -} - -/** - * Remove a doc edit from a repo — deletes the file and removes from AGENTS.md. 
- */ -export function revertDocEdit( - repoPath: string, - docPath: string, - agentsMdPath?: string, -): boolean { - const fullDocPath = path.join(repoPath, 'docs', docPath) - const fullAgentsMdPath = agentsMdPath || path.join(repoPath, 'AGENTS.md') - - try { - if (fs.existsSync(fullDocPath)) { - fs.rmSync(fullDocPath) - } - - // Remove from AGENTS.md - if (fs.existsSync(fullAgentsMdPath)) { - let agentsMd = fs.readFileSync(fullAgentsMdPath, 'utf-8') - const entry = `- [docs/${docPath}](docs/${docPath})\n` - if (agentsMd.includes(entry)) { - agentsMd = agentsMd.replace(entry, '') - fs.writeFileSync(fullAgentsMdPath, agentsMd) - } - } - - return true - } catch (error) { - console.error(`Failed to revert doc edit: ${error}`) - return false - } -} - -/** - * Compare scores to determine if a doc edit improved things. - * - * With parallelism=5, averages are reasonably stable. A 0.3 threshold - * catches real improvements without being too sensitive to noise. - */ -export function compareScores( - oldScore: number, - newScore: number, -): 'improved' | 'same' | 'worse' { - const delta = newScore - oldScore - const threshold = 0.3 - - if (delta >= threshold) return 'improved' - if (delta <= -threshold) return 'worse' - - return 'same' -} - -/** - * Read all docs from a repo's docs/ directory. 
- */ -export function readCurrentDocs(repoPath: string): Record { - const docsDir = path.join(repoPath, 'docs') - const docs: Record = {} - - if (!fs.existsSync(docsDir)) return docs - - function readDir(dir: string, prefix: string) { - for (const entry of fs.readdirSync(dir, { withFileTypes: true })) { - if (entry.isDirectory()) { - readDir(path.join(dir, entry.name), `${prefix}${entry.name}/`) - } else if (entry.name.endsWith('.md')) { - const relPath = `${prefix}${entry.name}` - docs[relPath] = fs.readFileSync(path.join(dir, entry.name), 'utf-8') - } - } - } - - readDir(docsDir, '') - return docs -} diff --git a/evalbuff/src/evalbuff-criteria.json b/evalbuff/src/evalbuff-criteria.json deleted file mode 100644 index f080586b81..0000000000 --- a/evalbuff/src/evalbuff-criteria.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "level": 1, - "criteria": [ - { - "name": "Builds & Compiles", - "weight": 3, - "description": "The code compiles, builds, and the project starts without errors. Run the build command and verify it succeeds." - }, - { - "name": "Existing Tests Pass", - "weight": 3, - "description": "All pre-existing tests still pass. Run the test suite and confirm no regressions were introduced." - }, - { - "name": "Basic Completeness", - "weight": 2, - "description": "All aspects of the prompt are addressed. No partial implementations or TODO comments left behind." 
- } - ], - "promotionThreshold": 8.0, - "promotionWindow": 10 -} diff --git a/evalbuff/src/judge.ts b/evalbuff/src/judge.ts deleted file mode 100644 index 50cd02fdd7..0000000000 --- a/evalbuff/src/judge.ts +++ /dev/null @@ -1,549 +0,0 @@ -import { execSync, spawn } from 'child_process' -import fs from 'fs' -import path from 'path' - -import { z } from 'zod/v4' - -import { formatCriteriaForPrompt } from './criteria' - -import type { QualityCriteria } from './criteria' -import type { EvalCommitV2 } from './types' - -export const JudgingResultSchema = z.object({ - analysis: z - .string() - .describe('Detailed analysis of what was tested and found'), - strengths: z - .array(z.string()) - .describe('Key strengths of the implementation'), - weaknesses: z.array(z.string()).describe('Key weaknesses or issues found'), - e2eTestsPerformed: z - .array(z.string()) - .describe('List of E2E tests that were actually performed'), - completionScore: z - .number() - .min(0) - .max(10) - .describe('How completely the prompt was addressed'), - codeQualityScore: z - .number() - .min(0) - .max(10) - .describe('Code structure and maintainability'), - e2eScore: z - .number() - .min(0) - .max(10) - .describe('How well the change works when tested end-to-end'), - overallScore: z.number().min(0).max(10).describe('Combined assessment'), -}) - -export type JudgingResult = z.infer - -// --- Reviewer agent types --- - -export type ReviewerAgentType = 'claude' | 'codex' | 'gemini' - -interface ReviewerConfig { - type: ReviewerAgentType - command: string[] - env?: Record - timeoutMs: number -} - -const REVIEWER_CONFIGS: Record = { - claude: { - type: 'claude', - command: [ - 'claude', - '-p', - '__PROMPT__', - '--dangerously-skip-permissions', - ], - timeoutMs: 30 * 60 * 1000, - }, - codex: { - type: 'codex', - command: [ - 'codex', - 'exec', - '--full-auto', - '-m', - 'gpt-5.1-codex', - '__PROMPT__', - ], - timeoutMs: 30 * 60 * 1000, - }, - gemini: { - type: 'gemini', - command: ['gemini', 
'--yolo', '-p', '__PROMPT__'], - timeoutMs: 30 * 60 * 1000, - }, -} - -const RESULT_FILE_NAME = 'evalbuff-review-result.json' - -function buildReviewerPrompt(input: { - commit?: EvalCommitV2 - taskPrompt: string - contextFiles?: Record - agentDiff: string - groundTruthDiff?: string - error?: string - criteria?: QualityCriteria - docsDir?: string -}): string { - const { commit, taskPrompt, contextFiles, agentDiff, groundTruthDiff, error, criteria, docsDir } = input - - const groundTruthSection = groundTruthDiff - ? `## Ground Truth Changes (One valid implementation) -${groundTruthDiff}` - : `## Ground Truth -No reference implementation is available. You must judge the agent's work solely by testing it end-to-end. Focus heavily on: -- Does it build and run? -- Does the feature actually work when you test it? -- Are there errors in the logs? -- Does it handle edge cases?` - - const contextFilesContent = contextFiles - ? Object.entries(contextFiles) - .map(([filePath, content]) => `### ${filePath}\n\`\`\`\n${content}\n\`\`\``) - .join('\n\n') - : '' - - // Legacy support: build ground truth from commit fileDiffs if no explicit groundTruthDiff - const groundTruth = groundTruthDiff - ? groundTruthSection - : commit?.fileDiffs - ? `## Ground Truth Changes (One valid implementation)\n${commit.fileDiffs - .map(({ path: p, diff }) => `### ${p}\n\`\`\`diff\n${diff}\n\`\`\``) - .join('\n\n')}` - : groundTruthSection - - const criteriaText = criteria - ? formatCriteriaForPrompt(criteria) - : '' - - const docsSection = docsDir - ? `\n## Project Docs\nRead the docs in the \`docs/\` directory and \`AGENTS.md\` for project-specific patterns and conventions before reviewing.\n` - : '' - - return `You are a senior engineer performing a thorough code review with E2E testing. - -## Your Mission - -You have been given a coding task and an AI agent's attempt. Your job is to: - -1. **Read the project docs** (if present) to understand conventions and patterns -2. 
**Review the agent's diff** ${groundTruthDiff || commit?.fileDiffs ? 'against the ground truth' : 'for correctness and completeness'} -3. **Actually test the changes** end-to-end: - - Start the application if possible (check package.json for start/dev scripts) - - Use browser tools, curl, or the appropriate client to exercise the feature - - Check logs for errors - - Test edge cases and error states - - Take screenshots of UI changes if applicable -4. **Write your judgment** to a JSON file - -## Important: You have full access to the repository and can run any commands. - -Use whatever tools you need to verify the change actually works: -- Run the build/compile step -- Run the test suite -- Start the dev server -- Use browser tools to test the UI -- curl API endpoints -- Check logs -- Use tmux for long-running processes -- Any other verification method appropriate for the change - -${docsSection} -## User Prompt (What the agent was asked to do) -${taskPrompt} - -${contextFilesContent ? `## Context Files (from parent commit)\n${contextFilesContent}` : ''} - -${groundTruth} - -## Agent's Changes (What the agent actually did) -\`\`\`diff -${agentDiff || '(No changes made)'} -\`\`\` -${error ? `\n## Error Encountered During Agent Run\n${error}\n` : ''} -${criteriaText} - -## Required Output - -After your review and testing, write your judgment to the file \`${RESULT_FILE_NAME}\` in the current working directory. The JSON must have exactly this structure: - -\`\`\`json -{ - "analysis": "Detailed analysis of what you tested and found...", - "strengths": ["strength 1", "strength 2"], - "weaknesses": ["weakness 1", "weakness 2"], - "e2eTestsPerformed": ["Started dev server and loaded /dashboard", "Submitted form with invalid email", "Checked network tab for API errors"], - "completionScore": 7, - "codeQualityScore": 8, - "e2eScore": 6, - "overallScore": 7 -} -\`\`\` - -All scores are 0-10. 
The e2eScore specifically measures how well the change works when actually tested, not just how the code looks. - -IMPORTANT: You MUST write the result file. This is the only way your review gets recorded. Do it as your very last action.` -} - -const PROMPT_FILE_NAME = 'EVALBUFF_REVIEW_PROMPT.md' - -const BOOTSTRAP_PROMPT = `Read the file ${PROMPT_FILE_NAME} in the current directory and follow all instructions in it exactly. The file contains a code review task. After your review and testing, you MUST write your judgment to ${RESULT_FILE_NAME} as specified in the prompt file.` - -async function runReviewerAgent( - agentType: ReviewerAgentType, - prompt: string, - cwd: string, - env?: Record, -): Promise { - const config = REVIEWER_CONFIGS[agentType] - - fs.writeFileSync(path.join(cwd, PROMPT_FILE_NAME), prompt) - - const args = config.command - .slice(1) - .map((a) => (a === '__PROMPT__' ? BOOTSTRAP_PROMPT : a)) - - const cmd = config.command[0] - - console.log(`[Reviewer:${agentType}] Starting review in ${cwd}`) - - return new Promise((resolve) => { - const child = spawn(cmd, args, { - cwd, - env: { ...process.env, ...config.env, ...env }, - stdio: ['ignore', 'pipe', 'pipe'], - }) - - let stdout = '' - let stderr = '' - - const timer = setTimeout(() => { - console.warn( - `[Reviewer:${agentType}] Timed out after ${config.timeoutMs / 1000}s`, - ) - child.kill('SIGTERM') - setTimeout(() => { - if (!child.killed) child.kill('SIGKILL') - }, 5000) - }, config.timeoutMs) - - child.stdout.on('data', (data: Buffer) => { - stdout += data.toString() - }) - - child.stderr.on('data', (data: Buffer) => { - stderr += data.toString() - }) - - child.on('error', (error) => { - clearTimeout(timer) - console.error( - `[Reviewer:${agentType}] Failed to start: ${error.message}`, - ) - resolve(null) - }) - - child.on('close', (code) => { - clearTimeout(timer) - console.log( - `[Reviewer:${agentType}] Exited with code ${code}`, - ) - if (code !== 0) { - console.warn( - 
`[Reviewer:${agentType}] stderr (last 1000 chars): ${stderr.slice(-1000)}`, - ) - console.warn( - `[Reviewer:${agentType}] stdout (last 500 chars): ${stdout.slice(-500)}`, - ) - } - - const resultPath = path.join(cwd, RESULT_FILE_NAME) - const result = parseResultFile(resultPath, agentType) - - if (result) { - resolve(result) - return - } - - const extracted = extractJsonFromOutput(stdout, agentType) - if (extracted) { - resolve(extracted) - return - } - - console.warn( - `[Reviewer:${agentType}] No result file or parseable output found`, - ) - resolve(null) - }) - }) -} - -function parseResultFile( - resultPath: string, - agentType: string, -): JudgingResult | null { - try { - if (!fs.existsSync(resultPath)) return null - const raw = JSON.parse(fs.readFileSync(resultPath, 'utf-8')) - const parsed = JudgingResultSchema.safeParse(raw) - if (parsed.success) { - console.log( - `[Reviewer:${agentType}] Parsed result file successfully`, - ) - return parsed.data - } - console.warn( - `[Reviewer:${agentType}] Result file failed validation:`, - parsed.error, - ) - return salvagePartialResult(raw) - } catch (error) { - console.warn( - `[Reviewer:${agentType}] Failed to parse result file:`, - error, - ) - return null - } -} - -function extractJsonFromOutput( - output: string, - agentType: string, -): JudgingResult | null { - const jsonPatterns = [ - /```(?:json)?\s*\n({[\s\S]*?})\n\s*```/g, - /(\{[^{}]*"overallScore"[^{}]*\})/g, - ] - - for (const pattern of jsonPatterns) { - const matches = [...output.matchAll(pattern)] - for (let i = matches.length - 1; i >= 0; i--) { - try { - const raw = JSON.parse(matches[i][1]) - const parsed = JudgingResultSchema.safeParse(raw) - if (parsed.success) { - console.log( - `[Reviewer:${agentType}] Extracted result from stdout`, - ) - return parsed.data - } - const salvaged = salvagePartialResult(raw) - if (salvaged) return salvaged - } catch { - continue - } - } - } - - return null -} - -function salvagePartialResult(raw: any): 
JudgingResult | null { - if (typeof raw !== 'object' || raw === null) return null - if (typeof raw.overallScore !== 'number') return null - - return { - analysis: raw.analysis || 'No analysis provided', - strengths: Array.isArray(raw.strengths) ? raw.strengths : [], - weaknesses: Array.isArray(raw.weaknesses) ? raw.weaknesses : [], - e2eTestsPerformed: Array.isArray(raw.e2eTestsPerformed) - ? raw.e2eTestsPerformed - : [], - completionScore: - typeof raw.completionScore === 'number' ? raw.completionScore : raw.overallScore, - codeQualityScore: - typeof raw.codeQualityScore === 'number' - ? raw.codeQualityScore - : raw.overallScore, - e2eScore: - typeof raw.e2eScore === 'number' ? raw.e2eScore : raw.overallScore, - overallScore: raw.overallScore, - } -} - -// --- Public API --- - -export interface JudgeCommitResultInput { - commit: EvalCommitV2 - contextFiles: Record - agentDiff: string - repoDir: string - error?: string - criteria?: QualityCriteria - reviewerAgents?: ReviewerAgentType[] - env?: Record -} - -/** - * Judge a commit result by running reviewer agents in the repo. - * Each reviewer agent can read docs, run the app, test E2E, and write a result file. - */ -export async function judgeCommitResult( - input: JudgeCommitResultInput, -): Promise { - const { - commit, - contextFiles, - agentDiff, - repoDir, - error, - criteria, - reviewerAgents = ['claude', 'codex'], - env, - } = input - - const prompt = buildReviewerPrompt({ - commit, - taskPrompt: commit.prompt, - contextFiles, - agentDiff, - error, - criteria, - docsDir: fs.existsSync(path.join(repoDir, 'docs')) ? repoDir : undefined, - }) - - return runReviewersAndAggregate(prompt, repoDir, reviewerAgents, env) -} - -/** - * Judge an agent's work on a task prompt — no ground truth commit needed. - * Used for both commit-learning mode (with ground truth diff) and prompt mode (without). 
- */ -export interface JudgeTaskResultInput { - taskPrompt: string - agentDiff: string - groundTruthDiff?: string - repoDir: string - error?: string - criteria?: QualityCriteria - reviewerAgents?: ReviewerAgentType[] - env?: Record -} - -export async function judgeTaskResult( - input: JudgeTaskResultInput, -): Promise { - const { - taskPrompt, - agentDiff, - groundTruthDiff, - repoDir, - error, - criteria, - reviewerAgents = ['claude', 'codex'], - env, - } = input - - const prompt = buildReviewerPrompt({ - taskPrompt, - agentDiff, - groundTruthDiff, - error, - criteria, - docsDir: fs.existsSync(path.join(repoDir, 'docs')) ? repoDir : undefined, - }) - - return runReviewersAndAggregate(prompt, repoDir, reviewerAgents, env) -} - -/** - * Shared logic: run reviewer agents in parallel and aggregate results. - */ -async function runReviewersAndAggregate( - prompt: string, - repoDir: string, - reviewerAgents: ReviewerAgentType[], - env?: Record, -): Promise { - const reviewPromises = reviewerAgents.map(async (agentType) => { - const reviewDir = `${repoDir}-review-${agentType}` - try { - const nodeModulesPath = path.join(repoDir, 'node_modules') - const hasNodeModules = fs.existsSync(nodeModulesPath) - if (hasNodeModules) { - execSync( - `rsync -a --exclude node_modules "${repoDir}/" "${reviewDir}/"`, - { stdio: 'ignore' }, - ) - fs.symlinkSync(nodeModulesPath, path.join(reviewDir, 'node_modules')) - } else { - execSync(`cp -r "${repoDir}" "${reviewDir}"`, { stdio: 'ignore' }) - } - return await runReviewerAgent(agentType, prompt, reviewDir) - } finally { - try { - fs.rmSync(reviewDir, { recursive: true, force: true }) - } catch { - // ignore cleanup errors - } - } - }) - - const results = await Promise.all(reviewPromises) - const validResults = results.filter( - (r): r is JudgingResult => r !== null, - ) - - if (validResults.length === 0) { - console.error( - `All reviewer agents failed (${reviewerAgents.join(', ')})`, - ) - return { - analysis: 'Error: all reviewer 
agents failed to provide results', - strengths: [], - weaknesses: ['All reviewer agents failed'], - e2eTestsPerformed: [], - completionScore: 0, - codeQualityScore: 0, - e2eScore: 0, - overallScore: 0, - } - } - - // Use median for qualitative analysis (pick the most representative reviewer) - // but average for scores. Averaging is better because models have consistent - // scoring biases (e.g. GPT-5 scores lower) — median would always pick the - // same model's score, while average blends them. - const sorted = validResults.sort( - (a, b) => a.overallScore - b.overallScore, - ) - const medianIdx = Math.floor(sorted.length / 2) - const medianResult = sorted[medianIdx] - - const avg = (key: keyof JudgingResult) => - validResults.reduce((sum, r) => sum + (r[key] as number), 0) / - validResults.length - - const avgCompletionScore = avg('completionScore') - const avgCodeQualityScore = avg('codeQualityScore') - const avgE2eScore = avg('e2eScore') - const avgOverallScore = avg('overallScore') - - const allE2eTests = [ - ...new Set(validResults.flatMap((r) => r.e2eTestsPerformed)), - ] - - console.log( - `Review results: overall=${avgOverallScore.toFixed(1)}, e2e=${avgE2eScore.toFixed(1)} (${validResults.length}/${reviewerAgents.length} reviewers)`, - ) - - return { - analysis: medianResult.analysis, - strengths: medianResult.strengths, - weaknesses: medianResult.weaknesses, - e2eTestsPerformed: allE2eTests, - completionScore: avgCompletionScore, - codeQualityScore: avgCodeQualityScore, - e2eScore: avgE2eScore, - overallScore: avgOverallScore, - } -} diff --git a/evalbuff/src/llm.ts b/evalbuff/src/llm.ts deleted file mode 100644 index 36e5eee61e..0000000000 --- a/evalbuff/src/llm.ts +++ /dev/null @@ -1,49 +0,0 @@ -/** - * Direct LLM API calls for evalbuff, replacing Claude CLI spawning. 
- * - * Using the API directly is 2-5x faster than spawning `claude` CLI: - * - No process startup overhead (~5s saved per call) - * - No CLAUDE.md/AGENTS.md context pollution - * - Structured JSON output with schema validation - * - Better error handling and retry logic - */ -import { createAnthropic } from '@ai-sdk/anthropic' -import { generateText } from 'ai' - -const anthropic = createAnthropic() - -const DEFAULT_MODEL = 'claude-sonnet-4-6' - -/** - * Generate a task prompt from a commit diff using the LLM API directly. - * Replaces the `claude --dangerously-skip-permissions -p` call in commit-task-generator.ts. - */ -export async function generatePrompt( - systemPrompt: string, - userPrompt: string, -): Promise { - const result = await generateText({ - model: anthropic(DEFAULT_MODEL), - system: systemPrompt, - prompt: userPrompt, - }) - - return result.text.trim() -} - -/** - * Analyze a failure and suggest a doc edit using the LLM API directly. - * Replaces the `claude --dangerously-skip-permissions -p` call in docs-optimizer.ts. - * - * Returns raw JSON string (caller handles parsing). 
- */ -export async function analyzeFailureViaApi( - prompt: string, -): Promise { - const result = await generateText({ - model: anthropic(DEFAULT_MODEL), - prompt, - }) - - return result.text.trim() -} diff --git a/evalbuff/src/morning-report.ts b/evalbuff/src/morning-report.ts deleted file mode 100644 index 9682bed16e..0000000000 --- a/evalbuff/src/morning-report.ts +++ /dev/null @@ -1,197 +0,0 @@ -import fs from 'fs' - -export interface EvalbuffLogEntry { - taskId: string - timestamp: string - oldScore: number - newScore: number | null - docEdit: { - path: string - reasoning: string - } | null - scoreComparison: 'improved' | 'same' | 'worse' | null - costUsd: number - durationMs: number - error?: string - criteriaLevel: number -} - -export interface MorningReportData { - startTime: string - endTime: string - totalIterations: number - totalCostUsd: number - totalDurationMs: number - avgOldScore: number - avgNewScore: number - docsAdded: number - docsKept: number - docsReverted: number - criteriaLevel: number - entries: EvalbuffLogEntry[] -} - -export function generateMorningReport(logPath: string): string { - if (!fs.existsSync(logPath)) { - return generateEmptyReport() - } - - const content = fs.readFileSync(logPath, 'utf-8').trim() - if (!content) { - return generateEmptyReport() - } - - const entries: EvalbuffLogEntry[] = content - .split('\n') - .filter((line) => line.trim()) - .map((line) => JSON.parse(line)) - - const data = computeReportData(entries) - return formatReport(data) -} - -function generateEmptyReport(): string { - return `# Evalbuff Morning Report - -**No iterations were run.** The log file is empty or missing. 
- -| Metric | Value | -|--------|-------| -| Iterations | 0 | -| Total Cost | $0.00 | -| Total Duration | 0s | -| Docs Added | 0 | -| Docs Kept | 0 | -| Criteria Level | - | -` -} - -function computeReportData(entries: EvalbuffLogEntry[]): MorningReportData { - const oldScores = entries.map((e) => e.oldScore) - const newScores = entries - .filter((e) => e.newScore !== null) - .map((e) => e.newScore!) - - const docsAdded = entries.filter((e) => e.docEdit !== null).length - const docsKept = entries.filter((e) => e.scoreComparison === 'improved').length - const docsReverted = docsAdded - docsKept - - return { - startTime: entries[0]?.timestamp || '', - endTime: entries[entries.length - 1]?.timestamp || '', - totalIterations: entries.length, - totalCostUsd: entries.reduce((sum, e) => sum + e.costUsd, 0), - totalDurationMs: entries.reduce((sum, e) => sum + e.durationMs, 0), - avgOldScore: - oldScores.length > 0 - ? oldScores.reduce((a, b) => a + b, 0) / oldScores.length - : 0, - avgNewScore: - newScores.length > 0 - ? 
newScores.reduce((a, b) => a + b, 0) / newScores.length - : 0, - docsAdded, - docsKept, - docsReverted, - criteriaLevel: entries[entries.length - 1]?.criteriaLevel || 1, - entries, - } -} - -function formatDuration(ms: number): string { - const seconds = Math.floor(ms / 1000) - const minutes = Math.floor(seconds / 60) - const hours = Math.floor(minutes / 60) - if (hours > 0) return `${hours}h ${minutes % 60}m` - if (minutes > 0) return `${minutes}m ${seconds % 60}s` - return `${seconds}s` -} - -function formatReport(data: MorningReportData): string { - const lines: string[] = [ - '# Evalbuff Morning Report', - '', - `**Run:** ${data.startTime || 'N/A'} to ${data.endTime || 'N/A'}`, - '', - '## Summary', - '', - '| Metric | Value |', - '|--------|-------|', - `| Iterations | ${data.totalIterations} |`, - `| Total Cost | $${data.totalCostUsd.toFixed(2)} |`, - `| Total Duration | ${formatDuration(data.totalDurationMs)} |`, - `| Avg Score (before docs) | ${data.avgOldScore.toFixed(1)} |`, - `| Avg Score (after docs) | ${data.avgNewScore > 0 ? data.avgNewScore.toFixed(1) : 'N/A'} |`, - `| Docs Attempted | ${data.docsAdded} |`, - `| Docs Kept (improved score) | ${data.docsKept} |`, - `| Docs Reverted | ${data.docsReverted} |`, - `| Criteria Level | ${data.criteriaLevel}/5 |`, - '', - ] - - // Doc changes table - const docEntries = data.entries.filter((e) => e.docEdit !== null) - if (docEntries.length > 0) { - lines.push('## Doc Changes') - lines.push('') - lines.push('| Task | Doc Path | Score Impact | Kept? | Reasoning |') - lines.push('|------|----------|-------------|-------|-----------|') - for (const entry of docEntries) { - const impact = - entry.newScore !== null - ? `${entry.oldScore.toFixed(1)} -> ${entry.newScore.toFixed(1)}` - : 'N/A' - const kept = entry.scoreComparison === 'improved' ? 'Yes' : 'No' - const reasoning = - entry.docEdit!.reasoning.length > 60 - ? entry.docEdit!.reasoning.slice(0, 57) + '...' 
- : entry.docEdit!.reasoning - lines.push( - `| ${entry.taskId} | ${entry.docEdit!.path} | ${impact} | ${kept} | ${reasoning} |`, - ) - } - lines.push('') - } - - // Failed iterations - const failedEntries = data.entries.filter((e) => e.error) - if (failedEntries.length > 0) { - lines.push('## Errors') - lines.push('') - lines.push('| Task | Error |') - lines.push('|------|-------|') - for (const entry of failedEntries) { - const errorMsg = - entry.error!.length > 80 - ? entry.error!.slice(0, 77) + '...' - : entry.error! - lines.push(`| ${entry.taskId} | ${errorMsg} |`) - } - lines.push('') - } - - // Score trajectory - lines.push('## Score Trajectory') - lines.push('') - lines.push('```') - for (const entry of data.entries) { - const bar = '#'.repeat(Math.round(entry.oldScore)) - const newBar = - entry.newScore !== null - ? ` -> ${'#'.repeat(Math.round(entry.newScore))}` - : '' - lines.push( - `${entry.taskId.padEnd(20)} ${entry.oldScore.toFixed(1).padStart(4)} ${bar}${newBar}`, - ) - } - lines.push('```') - - return lines.join('\n') -} - -export function appendLogEntry( - logPath: string, - entry: EvalbuffLogEntry, -): void { - fs.appendFileSync(logPath, JSON.stringify(entry) + '\n') -} diff --git a/evalbuff/src/run-carve-eval.ts b/evalbuff/src/run-carve-eval.ts deleted file mode 100644 index 2fc174ab9c..0000000000 --- a/evalbuff/src/run-carve-eval.ts +++ /dev/null @@ -1,668 +0,0 @@ -/** - * Run carve-based evals: apply a carve (delete a feature), run agents to rebuild it, - * judge against the original code, then iterate on docs. 
- * - * Usage: - * bun run evalbuff/src/run-carve-eval.ts --repo /path/to/repo --carve-file carve-2026-03-30.json [--feature cli-init-command] [--parallelism 5] - */ -import { execSync } from 'child_process' -import fs from 'fs' -import os from 'os' -import path from 'path' - -import { - analyzeFailure, - applyDocEdit, - compareScores, - readCurrentDocs, - revertDocEdit, -} from './docs-optimizer' -import { judgeTaskResult } from './judge' -import { ClaudeRunner } from './runners/claude' - -import type { CarvedFeature, CarveResult, FileOperation } from './carve-features' -import type { JudgingResult, ReviewerAgentType } from './judge' -import type { RunnerResult } from './runners/runner' - -// --- Doc read stats --- - -/** Extract doc file reads from an agent trace (JSONL of PrintModeEvents). */ -function extractDocReads(agentTrace: string): Record { - const counts: Record = {} - for (const line of agentTrace.split('\n')) { - if (!line.trim()) continue - try { - const event = JSON.parse(line) - if (event.type !== 'tool_call' || event.toolName !== 'Read') continue - const filePath: string = event.input?.file_path ?? '' - // Normalize to repo-relative path - const match = filePath.match(/(?:^|\/)(?:docs\/.*|AGENTS\.md|CLAUDE\.md)$/) - if (!match) continue - const relPath = match[0].startsWith('/') ? match[0].slice(1) : match[0] - counts[relPath] = (counts[relPath] || 0) + 1 - } catch { - // not JSON - } - } - return counts -} - -/** Merge multiple doc-read count maps into one (summing counts). 
*/ -function mergeDocReads(maps: Record[]): Record { - const merged: Record = {} - for (const m of maps) { - for (const [k, v] of Object.entries(m)) { - merged[k] = (merged[k] || 0) + v - } - } - return merged -} - -// --- Apply carve operations to a repo directory --- - -function applyCarveOperations(repoDir: string, operations: FileOperation[]): void { - for (const op of operations) { - const fullPath = path.join(repoDir, op.path) - if (op.action === 'delete') { - if (fs.existsSync(fullPath)) { - fs.rmSync(fullPath) - } - } else if (op.action === 'modify' && op.newContent !== undefined) { - fs.mkdirSync(path.dirname(fullPath), { recursive: true }) - fs.writeFileSync(fullPath, op.newContent) - } - } -} - -/** - * Compute a reverse diff (what needs to be added back) from a carve. - * This is the "ground truth" — the original code that was removed. - */ -function computeGroundTruthDiff(feature: CarvedFeature): string { - const diffs: string[] = [] - - for (const op of feature.operations) { - if (op.action === 'delete' && feature.originalFiles[op.path]) { - // File was deleted — ground truth is to recreate it - const lines = feature.originalFiles[op.path].split('\n') - diffs.push( - `--- /dev/null\n+++ b/${op.path}\n@@ -0,0 +1,${lines.length} @@\n` + - lines.map((l) => `+${l}`).join('\n'), - ) - } else if (op.action === 'modify' && feature.originalFiles[op.path]) { - // File was modified — ground truth is the original version - const origLines = feature.originalFiles[op.path].split('\n') - const carvedLines = (op.newContent || '').split('\n') - diffs.push( - `--- a/${op.path}\n+++ b/${op.path}\n@@ -1,${carvedLines.length} +1,${origLines.length} @@\n` + - carvedLines.map((l) => `-${l}`).join('\n') + - '\n' + - origLines.map((l) => `+${l}`).join('\n'), - ) - } - } - - return diffs.join('\n\n') -} - -// --- Clone repo and apply carve --- - -interface TestRepoResult { - result: T - cleanup: () => void -} - -async function withCarvedRepo( - repoPath: string, - feature: 
CarvedFeature, - initCommand: string | undefined, - fn: (repoDir: string, carveSha: string) => Promise, -): Promise { - const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'carve-eval-')) - const repoDir = path.join(tempDir, 'repo') - - try { - // Local clone (fast, uses hardlinks) - execSync(`git clone --no-checkout "${repoPath}" "${repoDir}"`, { - stdio: 'ignore', - }) - const headSha = execSync('git rev-parse HEAD', { - cwd: repoPath, - encoding: 'utf-8', - }).trim() - execSync(`git checkout ${headSha}`, { cwd: repoDir, stdio: 'ignore' }) - - // Apply the carve operations (delete the feature) - applyCarveOperations(repoDir, feature.operations) - - // Commit the carved state so agents start from a clean working tree - execSync('git add -A', { cwd: repoDir, stdio: 'ignore' }) - execSync( - `git commit -m "carve: remove ${feature.id}" --allow-empty`, - { cwd: repoDir, stdio: 'ignore' }, - ) - const carveSha = execSync('git rev-parse HEAD', { - cwd: repoDir, - encoding: 'utf-8', - }).trim() - - // Run init command if provided - if (initCommand) { - try { - execSync(initCommand, { cwd: repoDir, stdio: 'ignore' }) - } catch (e) { - console.warn(`Init command failed: ${e}`) - } - } - - return await fn(repoDir, carveSha) - } finally { - try { - fs.rmSync(tempDir, { recursive: true, force: true }) - } catch { - // ignore - } - } -} - -// --- Run a single agent on a carved repo --- - -async function runAgentOnCarve(opts: { - idx: number - total: number - repoPath: string - feature: CarvedFeature - initCommand?: string - model: string - agentTimeoutMs: number - groundTruthDiff: string - reviewerAgents: ReviewerAgentType[] - docsSourcePath: string -}): Promise<{ - score: number - diff: string - agentTrace: string - judging: JudgingResult - costEstimate: number -}> { - const { - idx, - total, - repoPath, - feature, - initCommand, - model, - agentTimeoutMs, - groundTruthDiff, - reviewerAgents, - docsSourcePath, - } = opts - - return withCarvedRepo(repoPath, feature, 
initCommand, async (repoDir, carveSha) => { - // Copy docs into the carved repo - copyDocsIntoRepo(docsSourcePath, repoDir) - - console.log(` [Run ${idx + 1}/${total}] Running claude (${model}) on carved repo...`) - const runner = new ClaudeRunner(repoDir, {}, model) - - let result: RunnerResult - try { - result = await runner.run(feature.prompt) - } catch (runError) { - const errMsg = - runError instanceof Error ? runError.message : String(runError) - console.warn(` [Run ${idx + 1}/${total}] Agent failed: ${errMsg.slice(0, 200)}`) - return { - score: -1, - diff: '', - agentTrace: `Agent error: ${errMsg}`, - judging: { - analysis: `Agent failed: ${errMsg.slice(0, 500)}`, - strengths: [], - weaknesses: ['Agent failed due to infrastructure error'], - e2eTestsPerformed: [], - completionScore: -1, - codeQualityScore: -1, - e2eScore: -1, - overallScore: -1, - }, - costEstimate: 0, - } - } - - const agentTrace = result.steps - .map((step) => JSON.stringify(step)) - .join('\n') - - console.log(` [Run ${idx + 1}/${total}] Judging...`) - const judging = await judgeTaskResult({ - taskPrompt: feature.prompt, - agentDiff: result.diff, - groundTruthDiff, - repoDir, - error: result.diff === '' ? 
'Agent made no changes' : undefined, - reviewerAgents, - }) - - return { - score: judging.overallScore, - diff: result.diff, - agentTrace, - judging, - costEstimate: result.totalCostUsd, - } - }) -} - -function copyDocsIntoRepo(sourceRepoPath: string, targetRepoPath: string): void { - const sourceDocsDir = path.join(sourceRepoPath, 'docs') - const sourceAgentsMd = path.join(sourceRepoPath, 'AGENTS.md') - const targetDocsDir = path.join(targetRepoPath, 'docs') - const targetAgentsMd = path.join(targetRepoPath, 'AGENTS.md') - const targetClaudeMd = path.join(targetRepoPath, 'CLAUDE.md') - - let copied = false - if (fs.existsSync(sourceDocsDir)) { - fs.cpSync(sourceDocsDir, targetDocsDir, { recursive: true }) - copied = true - } - if (fs.existsSync(sourceAgentsMd)) { - fs.cpSync(sourceAgentsMd, targetAgentsMd) - // Ensure CLAUDE.md symlink exists so Claude Code auto-loads the same content - if (!fs.existsSync(targetClaudeMd)) { - fs.symlinkSync('AGENTS.md', targetClaudeMd) - } - copied = true - } - - if (copied) { - try { - execSync( - 'git add docs/ AGENTS.md CLAUDE.md 2>/dev/null; git add -u docs/ AGENTS.md CLAUDE.md 2>/dev/null', - { cwd: targetRepoPath, stdio: 'ignore' }, - ) - execSync('git commit -m "evalbuff: pre-load docs" --allow-empty', { - cwd: targetRepoPath, - stdio: 'ignore', - }) - } catch { - // fine - } - } -} - -// --- Main carve eval loop --- - -interface CarveEvalOptions { - repoPath: string - carveFile: string - featureId?: string // run only this feature (default: all) - model: string - parallelism: number - agentTimeoutMs: number - reviewerAgents: ReviewerAgentType[] - initCommand?: string - maxImprovementIterations: number -} - -interface CarveEvalResult { - featureId: string - prompt: string - baselineScore: number - finalScore: number - docsKept: Array<{ path: string; reasoning: string; scoreBefore: number; scoreAfter: number }> - docsRejected: Array<{ path: string; reasoning: string; scoreBefore: number; scoreAfter: number }> - totalCost: 
number - /** Which doc files agents read and how many times (summed across all parallel runs). */ - docsRead: Record -} - -async function runCarveEval(options: CarveEvalOptions): Promise { - const { - repoPath, - carveFile, - featureId, - model, - parallelism, - agentTimeoutMs, - reviewerAgents, - initCommand, - maxImprovementIterations, - } = options - - // Load carve data - const carveData: CarveResult = JSON.parse( - fs.readFileSync(carveFile, 'utf-8'), - ) - - // Select features - let features = carveData.features - if (featureId) { - features = features.filter((f) => f.id === featureId) - if (features.length === 0) { - console.error( - `Feature "${featureId}" not found. Available: ${carveData.features.map((f) => f.id).join(', ')}`, - ) - process.exit(1) - } - } - - console.log(`\nCarve Eval:`) - console.log(` Repo: ${repoPath}`) - console.log(` Model: ${model}`) - console.log(` Parallelism: ${parallelism}`) - console.log(` Reviewers: ${reviewerAgents.join(', ')}`) - console.log(` Features: ${features.length}`) - console.log(` Max doc improvement iterations: ${maxImprovementIterations}`) - - const results: CarveEvalResult[] = [] - - for (const feature of features) { - console.log(`\n${'='.repeat(60)}`) - console.log(`Feature: ${feature.id}`) - console.log(`Prompt: ${feature.prompt}`) - console.log(`Operations: ${feature.operations.length} (${feature.operations.filter((o) => o.action === 'delete').length} deletes, ${feature.operations.filter((o) => o.action === 'modify').length} modifies)`) - console.log(`${'='.repeat(60)}`) - - const groundTruthDiff = computeGroundTruthDiff(feature) - - // --- Baseline: run agents in parallel --- - console.log(`\n Running ${parallelism} agents in parallel (baseline)...`) - const baselineResults = await Promise.all( - Array.from({ length: parallelism }, (_, i) => - runAgentOnCarve({ - idx: i, - total: parallelism, - repoPath, - feature, - initCommand, - model, - agentTimeoutMs, - groundTruthDiff, - reviewerAgents, - 
docsSourcePath: repoPath, - }), - ), - ) - - const validBaseline = baselineResults.filter((r) => r.score >= 0) - let totalCost = baselineResults.reduce((a, r) => a + r.costEstimate, 0) - - if (validBaseline.length === 0) { - console.log(` All agents failed. Skipping feature.`) - results.push({ - featureId: feature.id, - prompt: feature.prompt, - baselineScore: 0, - finalScore: 0, - docsKept: [], - docsRejected: [], - totalCost, - docsRead: {}, - }) - continue - } - - const baselineScores = validBaseline.map((r) => r.score) - let currentScore = - baselineScores.reduce((a, b) => a + b, 0) / baselineScores.length - console.log( - ` Baseline: ${currentScore.toFixed(1)}/10 (${baselineScores.map((s) => s.toFixed(1)).join(', ')})`, - ) - - // Track which docs agents read across all runs for this feature - let allDocReadsForFeature = mergeDocReads(validBaseline.map((r) => extractDocReads(r.agentTrace))) - const baselineDocReadEntries = Object.entries(allDocReadsForFeature).sort((a, b) => b[1] - a[1]) - if (baselineDocReadEntries.length > 0) { - console.log(` Docs read (baseline): ${baselineDocReadEntries.map(([p, n]) => `${p} (${n}x)`).join(', ')}`) - } else { - console.log(` Docs read (baseline): none`) - } - - const docsKept: Array<{ path: string; reasoning: string; scoreBefore: number; scoreAfter: number }> = [] - const docsRejected: Array<{ path: string; reasoning: string; scoreBefore: number; scoreAfter: number }> = [] - - // --- Doc improvement loop --- - if (currentScore < 9.0) { - let latestJudgings = validBaseline.map((r) => r.judging) - let latestDiffs = validBaseline.map((r) => r.diff) - let latestTraces = validBaseline.map((r) => r.agentTrace) - - for (let iter = 0; iter < maxImprovementIterations; iter++) { - // Pick worst run for analysis - const worstIdx = latestJudgings.reduce( - (minIdx, j, idx, arr) => - j.overallScore < arr[minIdx].overallScore ? 
idx : minIdx, - 0, - ) - - const currentDocs = readCurrentDocs(repoPath) - const editHistory = [ - ...docsKept.map((d) => ({ ...d, outcome: 'accepted' as const })), - ...docsRejected.map((d) => ({ ...d, outcome: 'rejected' as const })), - ] - - console.log(` Analyzing for doc improvements (iteration ${iter + 1})...`) - const docSuggestion = await analyzeFailure({ - judgeResult: latestJudgings[worstIdx], - taskPrompt: feature.prompt, - agentDiff: latestDiffs[worstIdx], - agentTrace: latestTraces[worstIdx], - groundTruthDiff, - currentDocs, - editHistory, - }) - - if (!docSuggestion) { - console.log(` No doc suggestion — stopping.`) - break - } - - console.log(` Doc suggestion: ${docSuggestion.suggestedDocPath}`) - console.log(` Reasoning: ${docSuggestion.reasoning}`) - - // Save previous content for revert - const docFullPath = path.join(repoPath, 'docs', docSuggestion.suggestedDocPath) - const previousContent = fs.existsSync(docFullPath) - ? fs.readFileSync(docFullPath, 'utf-8') - : null - - applyDocEdit(repoPath, docSuggestion.suggestedDocPath, docSuggestion.suggestedContent) - - // Re-run with new docs - console.log(` Re-running ${parallelism} agents with new docs...`) - const rerunResults = await Promise.all( - Array.from({ length: parallelism }, (_, i) => - runAgentOnCarve({ - idx: i, - total: parallelism, - repoPath, - feature, - initCommand, - model, - agentTimeoutMs, - groundTruthDiff, - reviewerAgents, - docsSourcePath: repoPath, - }), - ), - ) - - const validRerun = rerunResults.filter((r) => r.score >= 0) - totalCost += rerunResults.reduce((a, r) => a + r.costEstimate, 0) - - // Accumulate doc reads from re-run - const rerunDocReads = mergeDocReads(validRerun.map((r) => extractDocReads(r.agentTrace))) - allDocReadsForFeature = mergeDocReads([allDocReadsForFeature, rerunDocReads]) - const rerunDocEntries = Object.entries(rerunDocReads).sort((a, b) => b[1] - a[1]) - if (rerunDocEntries.length > 0) { - console.log(` Docs read (iteration ${iter + 1}): 
${rerunDocEntries.map(([p, n]) => `${p} (${n}x)`).join(', ')}`) - } - - if (validRerun.length === 0) { - console.log(` Re-run failed. Reverting doc.`) - if (previousContent !== null) { - applyDocEdit(repoPath, docSuggestion.suggestedDocPath, previousContent) - } else { - revertDocEdit(repoPath, docSuggestion.suggestedDocPath) - } - break - } - - const rerunScores = validRerun.map((r) => r.score) - const rerunAvg = - rerunScores.reduce((a, b) => a + b, 0) / rerunScores.length - const comparison = compareScores(currentScore, rerunAvg) - console.log( - ` New score: ${rerunAvg.toFixed(1)}/10 (${comparison}) (${rerunScores.map((s) => s.toFixed(1)).join(', ')})`, - ) - - if (comparison === 'improved' || comparison === 'same') { - const reason = comparison === 'improved' ? 'improved' : 'within noise, keeping' - console.log(` Keeping doc: ${docSuggestion.suggestedDocPath} (${reason})`) - docsKept.push({ - path: docSuggestion.suggestedDocPath, - reasoning: docSuggestion.reasoning, - scoreBefore: currentScore, - scoreAfter: rerunAvg, - }) - - // Commit the doc - try { - execSync('git add docs/ AGENTS.md', { cwd: repoPath, stdio: 'ignore' }) - execSync( - `git commit -m "evalbuff: add ${docSuggestion.suggestedDocPath} (carve: ${feature.id})"`, - { cwd: repoPath, stdio: 'ignore' }, - ) - } catch { - console.warn('Failed to commit doc change') - } - - currentScore = rerunAvg - latestJudgings = validRerun.map((r) => r.judging) - latestDiffs = validRerun.map((r) => r.diff) - latestTraces = validRerun.map((r) => r.agentTrace) - } else { - console.log(` Rejecting doc: ${docSuggestion.suggestedDocPath}`) - docsRejected.push({ - path: docSuggestion.suggestedDocPath, - reasoning: docSuggestion.reasoning, - scoreBefore: currentScore, - scoreAfter: rerunAvg, - }) - - if (previousContent !== null) { - applyDocEdit(repoPath, docSuggestion.suggestedDocPath, previousContent) - } else { - revertDocEdit(repoPath, docSuggestion.suggestedDocPath) - } - break - } - } - } - - results.push({ - 
featureId: feature.id, - prompt: feature.prompt, - baselineScore: baselineScores.reduce((a, b) => a + b, 0) / baselineScores.length, - finalScore: currentScore, - docsKept, - docsRejected, - totalCost, - docsRead: allDocReadsForFeature, - }) - } - - // --- Summary --- - console.log(`\n${'='.repeat(60)}`) - console.log('CARVE EVAL RESULTS') - console.log(`${'='.repeat(60)}`) - - let totalCostAll = 0 - for (const r of results) { - console.log(`\n ${r.featureId}:`) - console.log(` Prompt: ${r.prompt.slice(0, 80)}...`) - console.log(` Baseline: ${r.baselineScore.toFixed(1)}/10`) - console.log(` Final: ${r.finalScore.toFixed(1)}/10`) - console.log(` Docs kept: ${r.docsKept.length}, rejected: ${r.docsRejected.length}`) - const readEntries = Object.entries(r.docsRead).sort((a, b) => b[1] - a[1]) - if (readEntries.length > 0) { - console.log(` Docs read: ${readEntries.map(([p, n]) => `${p} (${n}x)`).join(', ')}`) - } else { - console.log(` Docs read: none`) - } - console.log(` Cost: $${r.totalCost.toFixed(2)}`) - totalCostAll += r.totalCost - } - - const avgBaseline = - results.reduce((a, r) => a + r.baselineScore, 0) / results.length - const avgFinal = - results.reduce((a, r) => a + r.finalScore, 0) / results.length - - console.log(`\n Average baseline: ${avgBaseline.toFixed(1)}/10`) - console.log(` Average final: ${avgFinal.toFixed(1)}/10`) - console.log(` Total cost: $${totalCostAll.toFixed(2)}`) - - // Aggregate doc read stats across all features - const allDocReads = mergeDocReads(results.map((r) => r.docsRead)) - const allReadEntries = Object.entries(allDocReads).sort((a, b) => b[1] - a[1]) - if (allReadEntries.length > 0) { - console.log(`\n Doc read stats (all features):`) - for (const [docPath, count] of allReadEntries) { - console.log(` ${docPath}: ${count} reads`) - } - } else { - console.log(`\n No docs were read by any agent.`) - } - - // Save results - const outputPath = path.join( - repoPath, - `carve-eval-results-${new Date().toISOString().slice(0, 
10)}.json`, - ) - fs.writeFileSync(outputPath, JSON.stringify(results, null, 2)) - console.log(`\nResults saved to: ${outputPath}`) -} - -// --- CLI --- - -if (import.meta.main) { - const args = process.argv.slice(2) - - const getArg = (name: string, defaultValue?: string): string => { - const idx = args.indexOf(`--${name}`) - if (idx >= 0 && idx + 1 < args.length) return args[idx + 1] - if (defaultValue !== undefined) return defaultValue - throw new Error(`Missing required argument: --${name}`) - } - const hasArg = (name: string): boolean => args.includes(`--${name}`) - - const repoPath = getArg('repo') - const carveFile = getArg('carve-file') - const featureId = hasArg('feature') ? getArg('feature') : undefined - const model = getArg('model', 'sonnet') - const parallelism = parseInt(getArg('parallelism', '3')) - const agentTimeoutMs = parseInt(getArg('agent-timeout', '300000')) - const reviewerAgentsArg = hasArg('reviewers') ? getArg('reviewers') : undefined - const reviewerAgents: ReviewerAgentType[] = reviewerAgentsArg - ? (reviewerAgentsArg.split(',') as ReviewerAgentType[]) - : ['claude', 'codex'] - const initCommand = hasArg('init-command') ? getArg('init-command') : undefined - const maxImprovementIterations = parseInt(getArg('max-iterations', '3')) - - runCarveEval({ - repoPath, - carveFile, - featureId, - model, - parallelism, - agentTimeoutMs, - reviewerAgents, - initCommand, - maxImprovementIterations, - }).catch((error) => { - console.error('Carve eval failed:', error) - process.exit(1) - }) -} diff --git a/evalbuff/src/run-e2e-test.ts b/evalbuff/src/run-e2e-test.ts deleted file mode 100644 index bb6f576f12..0000000000 --- a/evalbuff/src/run-e2e-test.ts +++ /dev/null @@ -1,296 +0,0 @@ -/** - * Real E2E test for evalbuff. - * - * Creates a local git repo with a simple project, then runs evalbuff's - * learn mode against it using real CLI coding agents and real reviewer agents. - * No mocks. 
- * - * Prerequisites: - * - `claude` CLI installed and authenticated - * - `codebuff` CLI installed - * - (Optional) `codex` CLI installed with OPENAI_API_KEY set - * - * Usage: - * bun run evalbuff/src/run-e2e-test.ts - */ -import { execSync } from 'child_process' -import fs from 'fs' -import os from 'os' -import path from 'path' - -import { runLearnMode } from './run-evalbuff' - -import type { ReviewerAgentType } from './judge' - -// --- Setup --- - -const BASE_DIR = fs.mkdtempSync(path.join(os.tmpdir(), 'evalbuff-real-e2e-')) -const PROJECT_DIR = path.join(BASE_DIR, 'project') - -const gitEnv = { - GIT_AUTHOR_NAME: 'evalbuff-test', - GIT_AUTHOR_EMAIL: 'test@evalbuff.dev', - GIT_COMMITTER_NAME: 'evalbuff-test', - GIT_COMMITTER_EMAIL: 'test@evalbuff.dev', -} - -function git(cmd: string, cwd: string) { - return execSync(`git ${cmd}`, { - cwd, - encoding: 'utf-8', - stdio: ['ignore', 'pipe', 'pipe'], - env: { ...process.env, ...gitEnv }, - }).trim() -} - -function setupProject() { - console.log('\n=== Setting up test project ===') - - fs.mkdirSync(PROJECT_DIR, { recursive: true }) - git('init', PROJECT_DIR) - - // Initial commit - fs.writeFileSync( - path.join(PROJECT_DIR, 'package.json'), - JSON.stringify( - { - name: 'evalbuff-test-project', - version: '1.0.0', - type: 'module', - scripts: { - test: 'node test.js', - start: 'node index.js', - }, - }, - null, - 2, - ), - ) - - fs.writeFileSync( - path.join(PROJECT_DIR, 'index.js'), - `// Simple math utility -export function add(a, b) { - return a + b -} - -export function multiply(a, b) { - return a * b -} -`, - ) - - fs.writeFileSync( - path.join(PROJECT_DIR, 'test.js'), - `import { add, multiply } from './index.js' - -let passed = 0 -let failed = 0 - -function assert(name, actual, expected) { - if (actual === expected) { - console.log(\` pass: \${name}\`) - passed++ - } else { - console.log(\` fail: \${name}: expected \${expected}, got \${actual}\`) - failed++ - } -} - -console.log('Running tests...') 
-assert('add(2, 3)', add(2, 3), 5) -assert('multiply(3, 4)', multiply(3, 4), 12) - -console.log(\`\\n\${passed} passed, \${failed} failed\`) -if (failed > 0) process.exit(1) -`, - ) - - git('add .', PROJECT_DIR) - git('commit -m "Initial project with add and multiply"', PROJECT_DIR) - - // Second commit: add subtract (with a bug) - fs.writeFileSync( - path.join(PROJECT_DIR, 'index.js'), - `// Simple math utility -export function add(a, b) { - return a + b -} - -export function multiply(a, b) { - return a * b -} - -// BUG: adds instead of subtracting -export function subtract(a, b) { - return a + b -} -`, - ) - - git('add .', PROJECT_DIR) - git('commit -m "Add subtract function (has bug)"', PROJECT_DIR) - - // Third commit: fix the bug - fs.writeFileSync( - path.join(PROJECT_DIR, 'index.js'), - `// Simple math utility -export function add(a, b) { - return a + b -} - -export function multiply(a, b) { - return a * b -} - -export function subtract(a, b) { - return a - b -} -`, - ) - - fs.writeFileSync( - path.join(PROJECT_DIR, 'test.js'), - `import { add, multiply, subtract } from './index.js' - -let passed = 0 -let failed = 0 - -function assert(name, actual, expected) { - if (actual === expected) { - console.log(\` pass: \${name}\`) - passed++ - } else { - console.log(\` fail: \${name}: expected \${expected}, got \${actual}\`) - failed++ - } -} - -console.log('Running tests...') -assert('add(2, 3)', add(2, 3), 5) -assert('multiply(3, 4)', multiply(3, 4), 12) -assert('subtract(10, 3)', subtract(10, 3), 7) - -console.log(\`\\n\${passed} passed, \${failed} failed\`) -if (failed > 0) process.exit(1) -`, - ) - - git('add .', PROJECT_DIR) - git('commit -m "Fix subtract bug and add test"', PROJECT_DIR) - - // Add a remote pointing to itself (learn mode needs git remote get-url) - git(`remote add origin file://${PROJECT_DIR}`, PROJECT_DIR) - - const commitCount = parseInt( - git('rev-list --count HEAD', PROJECT_DIR), - ) - console.log(` Project dir: ${PROJECT_DIR}`) - 
console.log(` Commits: ${commitCount}`) -} - -function detectAvailableReviewers(): ReviewerAgentType[] { - const reviewers: ReviewerAgentType[] = [] - - try { - execSync('which claude', { stdio: 'ignore' }) - reviewers.push('claude') - console.log(' reviewer: claude') - } catch { - console.log(' claude not found') - } - - try { - execSync('which codex', { stdio: 'ignore' }) - if (process.env.OPENAI_API_KEY) { - reviewers.push('codex') - console.log(' reviewer: codex') - } - } catch { - // skip - } - - return reviewers -} - -async function main() { - console.log('Evalbuff Real E2E Test') - console.log(`Base dir: ${BASE_DIR}`) - - console.log('\n=== Detecting available agents ===') - const reviewers = detectAvailableReviewers() - - if (reviewers.length === 0) { - console.error('No reviewer agents available. Need at least: claude') - process.exit(1) - } - - setupProject() - - // Run evalbuff learn mode against the project's own history - console.log('\n=== Running evalbuff learn mode ===') - - const startTime = Date.now() - - try { - await runLearnMode({ - mode: 'learn', - repoPath: PROJECT_DIR, - agentId: 'base2-free-evals', - parallelism: 2, - maxCostUsd: 10, - agentTimeoutMs: 5 * 60 * 1000, - commitCount: 10, // only 3 commits in this repo - reviewerAgents: reviewers, - }) - } catch (error) { - console.error('Evalbuff failed:', error) - } - - const durationMs = Date.now() - startTime - - // Verify results - console.log('\n=== Results ===') - - const logPath = path.join(PROJECT_DIR, 'evalbuff-log.jsonl') - if (fs.existsSync(logPath)) { - const logContent = fs.readFileSync(logPath, 'utf-8').trim() - if (logContent) { - const entries = logContent.split('\n').map((l) => JSON.parse(l)) - console.log(` Log entries: ${entries.length}`) - for (const entry of entries) { - console.log(` Commit: ${entry.taskId}`) - console.log(` Baseline: ${entry.oldScore}`) - console.log(` After docs: ${entry.newScore ?? 'N/A'}`) - console.log(` Docs: ${entry.docEdit ? 
entry.docEdit.path : 'none'}`) - } - } - } - - const statePath = path.join(PROJECT_DIR, 'evalbuff-state.json') - if (fs.existsSync(statePath)) { - const state = JSON.parse(fs.readFileSync(statePath, 'utf-8')) - console.log(` Processed: ${state.processedCommitCount} commits`) - console.log(` Cost: $${state.totalCostUsd.toFixed(2)}`) - } - - const docsDir = path.join(PROJECT_DIR, 'docs') - if (fs.existsSync(docsDir)) { - const docs = execSync(`find ${docsDir} -name '*.md'`, { encoding: 'utf-8' }).trim() - if (docs) { - console.log(` Docs generated:`) - for (const f of docs.split('\n')) { - console.log(` ${f}`) - } - } - } - - console.log(`\nCompleted in ${(durationMs / 1000).toFixed(1)}s`) - console.log(`Inspect: ${PROJECT_DIR}`) - console.log(`Cleanup: rm -rf ${BASE_DIR}`) -} - -main().catch((error) => { - console.error('E2E test failed:', error) - process.exit(1) -}) diff --git a/evalbuff/src/run-evalbuff.ts b/evalbuff/src/run-evalbuff.ts deleted file mode 100644 index cac655a1d6..0000000000 --- a/evalbuff/src/run-evalbuff.ts +++ /dev/null @@ -1,898 +0,0 @@ -import { execSync } from 'child_process' -import fs from 'fs' -import path from 'path' - -import { CodebuffClient, loadLocalAgents } from '@codebuff/sdk' - -import { buildCommitTask, getCommitList } from './commit-task-generator' -import { - getCriteriaForLevel, - loadCriteria, - maybePromoteCriteria, - saveCriteria, -} from './criteria' -import { - analyzeFailure, - applyDocEdit, - compareScores, - readCurrentDocs, - revertDocEdit, -} from './docs-optimizer' -import { judgeTaskResult } from './judge' -import { - appendLogEntry, - generateMorningReport, -} from './morning-report' -import { CodebuffRunner } from './runners/codebuff' -import { withTestRepo } from './test-repo-utils' - -import type { QualityCriteria } from './criteria' -import type { ReviewerAgentType } from './judge' -import type { EvalbuffLogEntry } from './morning-report' -import type { CommitTask } from './commit-task-generator' - -// --- 
State --- - -interface EvalbuffState { - lastProcessedCommitSha: string | null - totalCostUsd: number - recentScores: number[] - processedCommitCount: number -} - -function loadState(statePath: string): EvalbuffState { - if (fs.existsSync(statePath)) { - return JSON.parse(fs.readFileSync(statePath, 'utf-8')) - } - return { - lastProcessedCommitSha: null, - totalCostUsd: 0, - recentScores: [], - processedCommitCount: 0, - } -} - -function saveState(statePath: string, state: EvalbuffState): void { - fs.writeFileSync(statePath, JSON.stringify(state, null, 2)) -} - -// --- Shared options --- - -export interface EvalbuffOptions { - repoPath: string - agentCommand?: string // deprecated — kept for backward compat with CLI runner - agentId: string // codebuff agent ID, e.g. 'base2-free-evals' - parallelism: number - maxCostUsd: number - agentTimeoutMs: number - criteriaPath?: string - reviewerAgents?: ReviewerAgentType[] - initCommand?: string -} - -export interface LearnOptions extends EvalbuffOptions { - mode: 'learn' - commitCount: number -} - -export interface PromptOptions extends EvalbuffOptions { - mode: 'prompt' - prompt: string -} - -// --- Core: run N agents in parallel, return average score --- - -interface ParallelRunResult { - avgScore: number - scores: number[] - diffs: string[] - agentTraces: string[] // stdout from each agent run (their reasoning/tool calls) - judgings: Array - costEstimate: number -} - -async function runAgentsInParallel(opts: { - client: CodebuffClient - agentId: string - agentDefinitions: any[] - prompt: string - repoPath: string - repoUrl: string - localRepoPath?: string - parentSha: string - initCommand?: string - groundTruthDiff?: string - parallelism: number - agentTimeoutMs: number - criteria: QualityCriteria - reviewerAgents?: ReviewerAgentType[] - docsSourcePath: string // path to the repo where docs/ lives -}): Promise { - const { - client, - agentId, - agentDefinitions, - prompt, - repoUrl, - localRepoPath, - parentSha, - 
initCommand, - groundTruthDiff, - parallelism, - agentTimeoutMs, - criteria, - reviewerAgents, - docsSourcePath, - } = opts - - const runOne = async (idx: number) => { - return withTestRepo( - { repoUrl, localRepoPath, parentSha, initCommand }, - async (repoDir) => { - // Copy current docs into the test repo - copyDocsIntoRepo(docsSourcePath, repoDir) - - console.log(` [Run ${idx + 1}/${parallelism}] Running agent via SDK...`) - const shortSha = parentSha.slice(0, 8) - const runner = new CodebuffRunner({ - cwd: repoDir, - client, - agentId, - localAgentDefinitions: agentDefinitions, - printEvents: false, - commitId: shortSha, - parentSha, - }) - - let result: Awaited> - try { - result = await runner.run(prompt) - } catch (runError) { - // Infrastructure errors (503s, timeouts) should not produce a 0 score. - // Return a sentinel so the caller can detect and handle it. - const errMsg = runError instanceof Error ? runError.message : String(runError) - console.warn(` [Run ${idx + 1}/${parallelism}] Agent failed: ${errMsg.slice(0, 200)}`) - return { - score: -1, // sentinel: infrastructure failure - diff: '', - agentTrace: `Agent error: ${errMsg}`, - judging: { - analysis: `Agent failed: ${errMsg.slice(0, 500)}`, - strengths: [], - weaknesses: ['Agent failed due to infrastructure error'], - e2eTestsPerformed: [], - completionScore: -1, - codeQualityScore: -1, - e2eScore: -1, - overallScore: -1, - }, - costEstimate: 0, - } - } - - // Serialize trace steps as JSON for the doc writer to analyze - const agentTrace = result.steps - .map((step) => JSON.stringify(step)) - .join('\n') - - console.log(` [Run ${idx + 1}/${parallelism}] Judging...`) - const judging = await judgeTaskResult({ - taskPrompt: prompt, - agentDiff: result.diff, - groundTruthDiff, - repoDir, - error: result.diff === '' ? 
'Agent made no changes' : undefined, - criteria, - reviewerAgents, - }) - - return { - score: judging.overallScore, - diff: result.diff, - agentTrace, - judging, - costEstimate: result.totalCostUsd, - } - }, - ) - } - - const allResults = await Promise.all( - Array.from({ length: parallelism }, (_, i) => runOne(i)), - ) - - // Filter out infrastructure failures (score === -1) - const results = allResults.filter((r) => r.score >= 0) - const totalCost = allResults.reduce((a, r) => a + r.costEstimate, 0) - - if (results.length === 0) { - console.warn(` All ${parallelism} agent runs failed (infrastructure errors)`) - return { - avgScore: -1, - scores: [], - diffs: [], - agentTraces: allResults.map((r) => r.agentTrace), - judgings: [], - costEstimate: totalCost, - } - } - - if (results.length < allResults.length) { - console.warn(` ${allResults.length - results.length}/${allResults.length} runs failed, using ${results.length} valid results`) - } - - const scores = results.map((r) => r.score) - const avgScore = scores.reduce((a, b) => a + b, 0) / scores.length - - return { - avgScore, - scores, - diffs: results.map((r) => r.diff), - agentTraces: results.map((r) => r.agentTrace), - judgings: results.map((r) => r.judging), - costEstimate: totalCost, - } -} - -/** - * Copy docs into a test repo and commit them so they don't appear in the agent's diff. - * - * Without this commit, `git diff HEAD` after the agent runs would include - * the pre-copied docs as "new files", corrupting the diff attribution — - * the judge would penalize or credit the agent for docs it didn't create. 
- */ -function copyDocsIntoRepo( - sourceRepoPath: string, - targetRepoPath: string, -): void { - const sourceDocsDir = path.join(sourceRepoPath, 'docs') - const sourceAgentsMd = path.join(sourceRepoPath, 'AGENTS.md') - const targetDocsDir = path.join(targetRepoPath, 'docs') - const targetAgentsMd = path.join(targetRepoPath, 'AGENTS.md') - - let copied = false - if (fs.existsSync(sourceDocsDir)) { - fs.cpSync(sourceDocsDir, targetDocsDir, { recursive: true }) - copied = true - } - if (fs.existsSync(sourceAgentsMd)) { - fs.cpSync(sourceAgentsMd, targetAgentsMd) - copied = true - } - - // Commit the docs so they become part of HEAD — otherwise git diff HEAD - // after the agent runs will include these docs as agent-created changes. - if (copied) { - try { - execSync('git add docs/ AGENTS.md 2>/dev/null; git add -u docs/ AGENTS.md 2>/dev/null', { - cwd: targetRepoPath, - stdio: 'ignore', - }) - execSync('git commit -m "evalbuff: pre-load docs" --allow-empty', { - cwd: targetRepoPath, - stdio: 'ignore', - }) - } catch { - // If nothing to commit, that's fine - } - } -} - -// --- Iterative doc improvement loop --- - -/** - * Run the iterative doc improvement loop for a single task. - * Always analyzes failures. Keeps proposing doc changes until one is rejected. - * Returns the final average score and log info. 
- */ -async function improveDocs(opts: { - taskId: string - prompt: string - commitMessage?: string - repoPath: string - repoUrl: string - localRepoPath?: string - parentSha: string - initCommand?: string - groundTruthDiff?: string - client: CodebuffClient - agentId: string - agentDefinitions: any[] - parallelism: number - agentTimeoutMs: number - criteria: QualityCriteria - reviewerAgents?: ReviewerAgentType[] -}): Promise<{ - finalScore: number - baselineScore: number - docsKept: Array<{ path: string; reasoning: string; scoreBefore: number; scoreAfter: number }> - docsRejected: Array<{ path: string; reasoning: string; scoreBefore: number; scoreAfter: number }> - totalCost: number -}> { - const { - taskId, - prompt, - commitMessage, - repoPath, - repoUrl, - localRepoPath, - parentSha, - initCommand, - groundTruthDiff, - client, - agentId, - agentDefinitions, - parallelism, - agentTimeoutMs, - criteria, - reviewerAgents, - } = opts - - let totalCost = 0 - const docsKept: Array<{ path: string; reasoning: string; scoreBefore: number; scoreAfter: number }> = [] - const docsRejected: Array<{ path: string; reasoning: string; scoreBefore: number; scoreAfter: number }> = [] - - // Step 1: Baseline run - console.log(`\n Running ${parallelism} agents in parallel (baseline)...`) - const baseline = await runAgentsInParallel({ - client, - agentId, - agentDefinitions, - prompt, - repoPath, - repoUrl, - localRepoPath, - parentSha, - initCommand, - groundTruthDiff, - parallelism, - agentTimeoutMs, - criteria, - reviewerAgents, - docsSourcePath: repoPath, - }) - totalCost += baseline.costEstimate - - let currentScore = baseline.avgScore - console.log(` Baseline score: ${currentScore.toFixed(1)}/10 (scores: ${baseline.scores.map((s) => s.toFixed(1)).join(', ')})`) - - // All agents failed — skip this task entirely - if (currentScore < 0) { - console.log(` All agent runs failed, skipping task.`) - return { - finalScore: 0, - baselineScore: 0, - docsKept: [], - docsRejected: [], - 
totalCost, - } - } - - // Early stopping: if baseline is already excellent, skip improvement loop - const EARLY_STOP_THRESHOLD = 9.0 - if (currentScore >= EARLY_STOP_THRESHOLD) { - console.log(` Baseline score ${currentScore.toFixed(1)} >= ${EARLY_STOP_THRESHOLD}, skipping improvement loop.`) - return { - finalScore: currentScore, - baselineScore: baseline.avgScore, - docsKept: [], - docsRejected: [], - totalCost: totalCost, - } - } - - // Step 2: Iterative doc improvement - let improving = true - const MAX_IMPROVEMENT_ITERATIONS = 5 - let iterationCount = 0 - while (improving) { - iterationCount++ - if (iterationCount > MAX_IMPROVEMENT_ITERATIONS) { - console.log(` Hit max improvement iterations (${MAX_IMPROVEMENT_ITERATIONS}), stopping.`) - break - } - // Pick the worst-scoring judging for analysis - const worstIdx = baseline.judgings.reduce( - (minIdx, j, idx, arr) => - j.overallScore < arr[minIdx].overallScore ? idx : minIdx, - 0, - ) - const worstJudging = baseline.judgings[worstIdx] - const worstDiff = baseline.diffs[worstIdx] - const worstTrace = baseline.agentTraces[worstIdx] - - const currentDocs = readCurrentDocs(repoPath) - - console.log(` Analyzing for doc improvements...`) - const editHistory = [ - ...docsKept.map((d) => ({ ...d, outcome: 'accepted' as const })), - ...docsRejected.map((d) => ({ ...d, outcome: 'rejected' as const })), - ] - const docSuggestion = await analyzeFailure({ - judgeResult: worstJudging, - taskPrompt: prompt, - agentDiff: worstDiff, - agentTrace: worstTrace, - groundTruthDiff, - currentDocs, - editHistory, - commitMessage, - }) - - if (!docSuggestion) { - console.log(` No doc suggestion — stopping improvement loop.`) - break - } - - console.log(` Doc suggestion: ${docSuggestion.suggestedDocPath}`) - console.log(` Reasoning: ${docSuggestion.reasoning}`) - - // Save previous content so we can restore on rejection - const docFullPath = path.join(repoPath, 'docs', docSuggestion.suggestedDocPath) - const previousContent = 
fs.existsSync(docFullPath) - ? fs.readFileSync(docFullPath, 'utf-8') - : null - - // Apply doc to the main repo - applyDocEdit(repoPath, docSuggestion.suggestedDocPath, docSuggestion.suggestedContent) - - // Re-run with new docs - console.log(` Re-running ${parallelism} agents with new docs...`) - const rerun = await runAgentsInParallel({ - client, - agentId, - agentDefinitions, - prompt, - repoPath, - repoUrl, - localRepoPath, - parentSha, - initCommand, - groundTruthDiff, - parallelism, - agentTimeoutMs, - criteria, - reviewerAgents, - docsSourcePath: repoPath, - }) - totalCost += rerun.costEstimate - - // If re-run failed entirely, don't count it as a rejection - if (rerun.avgScore < 0) { - console.log(` Re-run failed (infrastructure errors), reverting doc and retrying later.`) - if (previousContent !== null) { - applyDocEdit(repoPath, docSuggestion.suggestedDocPath, previousContent) - } else { - revertDocEdit(repoPath, docSuggestion.suggestedDocPath) - } - break - } - - const comparison = compareScores(currentScore, rerun.avgScore) - console.log(` New score: ${rerun.avgScore.toFixed(1)}/10 (${comparison}) (scores: ${rerun.scores.map((s) => s.toFixed(1)).join(', ')})`) - - if (comparison === 'improved' || comparison === 'same') { - // 'improved' = clear signal the doc helps - // 'same' = within noise range — keep it (benefit of the doubt) - const reason = comparison === 'improved' ? 
'score improved' : 'within noise range, keeping' - console.log(` Keeping doc: ${docSuggestion.suggestedDocPath} (${reason})`) - docsKept.push({ - path: docSuggestion.suggestedDocPath, - reasoning: docSuggestion.reasoning, - scoreBefore: currentScore, - scoreAfter: rerun.avgScore, - }) - - // Commit the doc change - try { - execSync('git add docs/ AGENTS.md', { cwd: repoPath, stdio: 'ignore' }) - execSync( - `git commit -m "evalbuff: add ${docSuggestion.suggestedDocPath} (${taskId})"`, - { cwd: repoPath, stdio: 'ignore' }, - ) - } catch { - console.warn('Failed to commit doc change') - } - - currentScore = rerun.avgScore - - // Update baseline data for next iteration - baseline.judgings.splice(0, baseline.judgings.length, ...rerun.judgings) - baseline.diffs.splice(0, baseline.diffs.length, ...rerun.diffs) - baseline.agentTraces.splice(0, baseline.agentTraces.length, ...rerun.agentTraces) - - // Continue loop — try to improve more - } else { - console.log(` Rejecting doc: ${docSuggestion.suggestedDocPath} (score dropped significantly)`) - docsRejected.push({ - path: docSuggestion.suggestedDocPath, - reasoning: docSuggestion.reasoning, - scoreBefore: currentScore, - scoreAfter: rerun.avgScore, - }) - - // Revert the doc edit — restore previous content if it existed - if (previousContent !== null) { - // Restore the previously-accepted version - applyDocEdit(repoPath, docSuggestion.suggestedDocPath, previousContent) - } else { - revertDocEdit(repoPath, docSuggestion.suggestedDocPath) - } - - // Stop improving for this task - improving = false - } - } - - return { - finalScore: currentScore, - baselineScore: baseline.avgScore, - docsKept, - docsRejected, - totalCost, - } -} - -// --- Mode: Commit Learning --- - -export async function runLearnMode(options: LearnOptions): Promise { - const { - repoPath, - agentId, - parallelism, - maxCostUsd, - agentTimeoutMs, - criteriaPath, - reviewerAgents, - commitCount, - initCommand, - } = options - - const statePath = 
path.join(repoPath, 'evalbuff-state.json') - const logPath = path.join(repoPath, 'evalbuff-log.jsonl') - const defaultCriteriaPath = - criteriaPath || path.join(repoPath, 'evalbuff-criteria.json') - - const state = loadState(statePath) - let criteria = loadCriteria(defaultCriteriaPath) - - // Initialize codebuff SDK client and load agent definitions - const client = new CodebuffClient({ cwd: repoPath }) - const agentsDir = path.resolve(__dirname, '../../agents') - const loadedAgents = await loadLocalAgents({ agentsPath: agentsDir }) - const agentDefinitions = Object.values(loadedAgents) - console.log(`Loaded ${agentDefinitions.length} agent definitions from ${agentsDir}`) - - // Get the repo's remote URL - let repoUrl: string - try { - repoUrl = execSync('git remote get-url origin', { - cwd: repoPath, - encoding: 'utf-8', - }).trim() - } catch { - throw new Error( - `Could not determine remote URL for ${repoPath}. Make sure it has an 'origin' remote.`, - ) - } - - // Get commits to process - const commits = getCommitList( - repoPath, - commitCount, - state.lastProcessedCommitSha || undefined, - ) - - console.log(`Evalbuff Learn Mode:`) - console.log(` Repo: ${repoPath}`) - console.log(` Remote: ${repoUrl}`) - console.log(` Agent: ${agentId}`) - console.log(` Parallelism: ${parallelism}`) - console.log(` Reviewer agents: ${(reviewerAgents || ['claude', 'codex']).join(', ')}`) - console.log(` Commits to process: ${commits.length}`) - console.log(` Max cost: $${maxCostUsd}`) - console.log(` Criteria level: ${criteria.level}/5`) - console.log( - ` Resumed from: ${state.lastProcessedCommitSha?.slice(0, 8) || '(fresh start)'}`, - ) - console.log(` Previously processed: ${state.processedCommitCount} commits`) - - for (const sha of commits) { - // Budget check - if (state.totalCostUsd >= maxCostUsd) { - console.log( - `\nReached max cost ($${state.totalCostUsd.toFixed(2)} >= $${maxCostUsd}). 
Stopping.`, - ) - break - } - - const shortSha = sha.slice(0, 8) - console.log( - `\n${'='.repeat(60)}\nCommit ${shortSha} (${state.processedCommitCount + 1})\n${'='.repeat(60)}`, - ) - - // Build task from commit - const task = await buildCommitTask(repoPath, sha) - if (!task) { - console.log(`Skipping ${shortSha} (merge commit, initial commit, or too large)`) - state.lastProcessedCommitSha = sha - saveState(statePath, state) - continue - } - - console.log(` Message: ${task.message.split('\n')[0].slice(0, 80)}`) - console.log(` Files: ${task.filesChanged.length}`) - console.log(` Prompt: ${task.prompt.slice(0, 100)}...`) - - const iterationStart = Date.now() - - let logEntry: EvalbuffLogEntry = { - taskId: shortSha, - timestamp: new Date().toISOString(), - oldScore: 0, - newScore: null, - docEdit: null, - scoreComparison: null, - costUsd: 0, - durationMs: 0, - criteriaLevel: criteria.level, - } - - try { - const result = await improveDocs({ - taskId: shortSha, - prompt: task.prompt, - commitMessage: task.message, - repoPath, - repoUrl, - localRepoPath: repoPath, - parentSha: task.parentSha, - initCommand, - groundTruthDiff: task.diff, - client, - agentId, - agentDefinitions, - parallelism, - agentTimeoutMs, - criteria, - reviewerAgents, - }) - - logEntry.oldScore = result.baselineScore - logEntry.newScore = - result.docsKept.length > 0 ? 
result.finalScore : null - logEntry.costUsd = result.totalCost - - if (result.docsKept.length > 0) { - logEntry.docEdit = { - path: result.docsKept.map((d) => d.path).join(', '), - reasoning: result.docsKept.map((d) => d.reasoning).join('; '), - } - logEntry.scoreComparison = 'improved' - } - - // Update scores tracking - state.recentScores.push(result.finalScore) - - // Check criteria promotion - const newLevel = maybePromoteCriteria(criteria, state.recentScores) - if (newLevel !== criteria.level) { - criteria = { - ...criteria, - level: newLevel, - criteria: getCriteriaForLevel(newLevel), - } - saveCriteria(defaultCriteriaPath, criteria) - logEntry.criteriaLevel = newLevel - } - } catch (error) { - const errorMsg = - error instanceof Error ? error.message : String(error) - console.error(`Error on commit ${shortSha}:`, errorMsg) - logEntry.error = errorMsg - } - - logEntry.durationMs = Date.now() - iterationStart - state.totalCostUsd += logEntry.costUsd - state.lastProcessedCommitSha = sha - state.processedCommitCount++ - - appendLogEntry(logPath, logEntry) - saveState(statePath, state) - } - - // Generate morning report - console.log('\nGenerating report...') - const report = generateMorningReport(logPath) - const reportPath = path.join( - repoPath, - `evalbuff-report-${new Date().toISOString().slice(0, 10)}.md`, - ) - fs.writeFileSync(reportPath, report) - console.log(`Report written to: ${reportPath}`) - console.log(report) -} - -// --- Mode: Prompt --- - -export async function runPromptMode(options: PromptOptions): Promise { - const { - repoPath, - agentId, - parallelism, - maxCostUsd, - agentTimeoutMs, - criteriaPath, - reviewerAgents, - prompt, - initCommand, - } = options - - const logPath = path.join(repoPath, 'evalbuff-log.jsonl') - const defaultCriteriaPath = - criteriaPath || path.join(repoPath, 'evalbuff-criteria.json') - - const criteria = loadCriteria(defaultCriteriaPath) - - // Initialize codebuff SDK client and load agent definitions - const client 
= new CodebuffClient({ cwd: repoPath }) - const agentsDir = path.resolve(__dirname, '../../agents') - const loadedAgents = await loadLocalAgents({ agentsPath: agentsDir }) - const agentDefinitions = Object.values(loadedAgents) - - let repoUrl: string - try { - repoUrl = execSync('git remote get-url origin', { - cwd: repoPath, - encoding: 'utf-8', - }).trim() - } catch { - throw new Error( - `Could not determine remote URL for ${repoPath}. Make sure it has an 'origin' remote.`, - ) - } - - // Get current HEAD as the parentSha (agents work on the current state) - const headSha = execSync('git rev-parse HEAD', { - cwd: repoPath, - encoding: 'utf-8', - }).trim() - - console.log(`Evalbuff Prompt Mode:`) - console.log(` Repo: ${repoPath}`) - console.log(` Remote: ${repoUrl}`) - console.log(` Agent: ${agentId}`) - console.log(` Parallelism: ${parallelism}`) - console.log(` Reviewer agents: ${(reviewerAgents || ['claude', 'codex']).join(', ')}`) - console.log(` Max cost: $${maxCostUsd}`) - console.log(` Criteria level: ${criteria.level}/5`) - console.log(` Prompt: ${prompt.slice(0, 100)}...`) - - const iterationStart = Date.now() - - const logEntry: EvalbuffLogEntry = { - taskId: 'prompt-mode', - timestamp: new Date().toISOString(), - oldScore: 0, - newScore: null, - docEdit: null, - scoreComparison: null, - costUsd: 0, - durationMs: 0, - criteriaLevel: criteria.level, - } - - try { - const result = await improveDocs({ - taskId: 'prompt-mode', - prompt, - repoPath, - repoUrl, - localRepoPath: repoPath, - parentSha: headSha, - initCommand, - // No ground truth diff in prompt mode - client, - agentId, - agentDefinitions, - parallelism, - agentTimeoutMs, - criteria, - reviewerAgents, - }) - - logEntry.oldScore = result.baselineScore - logEntry.newScore = - result.docsKept.length > 0 ? 
result.finalScore : null - logEntry.costUsd = result.totalCost - - if (result.docsKept.length > 0) { - logEntry.docEdit = { - path: result.docsKept.map((d) => d.path).join(', '), - reasoning: result.docsKept.map((d) => d.reasoning).join('; '), - } - logEntry.scoreComparison = 'improved' - } - - console.log(`\nResult:`) - console.log(` Baseline score: ${result.baselineScore.toFixed(1)}/10`) - console.log(` Final score: ${result.finalScore.toFixed(1)}/10`) - console.log(` Docs kept: ${result.docsKept.length}`) - console.log(` Docs rejected: ${result.docsRejected.length}`) - console.log(` Cost: $${result.totalCost.toFixed(2)}`) - } catch (error) { - const errorMsg = - error instanceof Error ? error.message : String(error) - console.error(`Error in prompt mode:`, errorMsg) - logEntry.error = errorMsg - } - - logEntry.durationMs = Date.now() - iterationStart - appendLogEntry(logPath, logEntry) -} - -// --- CLI entry point --- - -async function main() { - const args = process.argv.slice(2) - const getArg = (name: string, defaultValue?: string): string => { - const idx = args.indexOf(`--${name}`) - if (idx >= 0 && idx + 1 < args.length) return args[idx + 1] - if (defaultValue !== undefined) return defaultValue - throw new Error(`Missing required argument: --${name}`) - } - const hasArg = (name: string): boolean => args.includes(`--${name}`) - - const repoPath = getArg('repo') - const agentId = getArg('agent', 'base2-free-evals') - const parallelism = parseInt(getArg('parallelism', '5')) - const maxCostUsd = parseFloat(getArg('max-cost', '100')) - const agentTimeoutMs = parseInt(getArg('agent-timeout', '300000')) - const criteriaPath = hasArg('criteria') ? getArg('criteria') : undefined - const initCommand = hasArg('init-command') ? getArg('init-command') : undefined - const reviewerAgentsArg = hasArg('reviewers') - ? getArg('reviewers') - : undefined - const reviewerAgents = reviewerAgentsArg - ? 
(reviewerAgentsArg.split(',') as ReviewerAgentType[]) - : undefined - - if (hasArg('prompt')) { - // Prompt mode - const prompt = getArg('prompt') - await runPromptMode({ - mode: 'prompt', - repoPath, - agentId, - parallelism, - maxCostUsd, - agentTimeoutMs, - criteriaPath, - reviewerAgents, - prompt, - initCommand, - }) - } else { - // Learn mode (default) - const commitCount = parseInt(getArg('commits', '500')) - await runLearnMode({ - mode: 'learn', - repoPath, - agentId, - parallelism, - maxCostUsd, - agentTimeoutMs, - criteriaPath, - reviewerAgents, - commitCount, - initCommand, - }) - } -} - -if (import.meta.main) { - main().catch((error) => { - console.error('Evalbuff failed:', error) - process.exit(1) - }) -} diff --git a/evalbuff/src/runners/claude.ts b/evalbuff/src/runners/claude.ts deleted file mode 100644 index 2c1f228f51..0000000000 --- a/evalbuff/src/runners/claude.ts +++ /dev/null @@ -1,182 +0,0 @@ -import { execSync, spawn } from 'child_process' - -import type { Runner, RunnerResult, AgentStep } from './runner' -import type { - PrintModeToolCall, - PrintModeToolResult, -} from '@codebuff/common/types/print-mode' - -export class ClaudeRunner implements Runner { - private cwd: string - private env: Record - private model: string - - constructor( - cwd: string, - env: Record = {}, - model: string = 'claude-opus-4-5-20251101', - ) { - this.cwd = cwd - this.env = env - this.model = model - } - - async run(prompt: string): Promise { - const steps: AgentStep[] = [] - let totalCostUsd = 0 - - return new Promise((resolve, reject) => { - const args = [ - '-p', - prompt, - '--output-format', - 'stream-json', - '--verbose', - '--dangerously-skip-permissions', - '--model', - this.model, - ] - - console.log(`[ClaudeRunner] Running: claude ${args.join(' ')}`) - - const child = spawn('claude', args, { - cwd: this.cwd, - env: { - ...process.env, - ...this.env, - // Ensure ANTHROPIC_API_KEY is set from CLAUDE_CODE_KEY if available - ANTHROPIC_API_KEY: - 
process.env.CLAUDE_CODE_KEY || process.env.ANTHROPIC_API_KEY, - }, - // Use 'ignore' for stdin to prevent the CLI from waiting for input - stdio: ['ignore', 'pipe', 'pipe'], - }) - - let _stdout = '' - let stderr = '' - let responseText = '' - let toolCalls: PrintModeToolCall[] = [] - let toolResults: PrintModeToolResult[] = [] - - function flushStep() { - if (responseText.length > 0) { - steps.push({ type: 'text', text: responseText }) - } - for (const call of toolCalls) { - steps.push(call) - } - for (const result of toolResults) { - steps.push(result) - } - responseText = '' - toolCalls = [] - toolResults = [] - } - - child.stdout.on('data', (data: Buffer) => { - const chunk = data.toString() - _stdout += chunk - - // Parse streaming JSON output from Claude CLI - const lines = chunk.split('\n').filter((line) => line.trim()) - for (const line of lines) { - try { - const event = JSON.parse(line) - - if (event.type === 'assistant') { - if (event.message?.content) { - for (const content of event.message.content) { - if (content.type === 'text') { - if (toolResults.length > 0) { - flushStep() - } - responseText += content.text - process.stdout.write(content.text) - } else if (content.type === 'tool_use') { - toolCalls.push({ - type: 'tool_call', - toolName: content.name, - toolCallId: content.id, - input: content.input || {}, - }) - } - } - } - } else if (event.type === 'user') { - if (event.message?.content) { - for (const content of event.message.content) { - if (content.type === 'tool_result') { - toolResults.push({ - type: 'tool_result', - toolName: 'unknown', - toolCallId: content.tool_use_id, - output: [ - { - type: 'json', - value: - typeof content.content === 'string' - ? 
content.content - : content.content, - }, - ], - }) - } - } - } - } else if (event.type === 'result') { - if (event.total_cost_usd) { - totalCostUsd += event.total_cost_usd - } - } - } catch { - // Not JSON, might be plain text output - responseText += line - } - } - }) - - child.stderr.on('data', (data: Buffer) => { - stderr += data.toString() - process.stderr.write(data) - }) - - child.on('error', (error) => { - reject( - new Error( - `Claude CLI failed to start: ${error.message}. Make sure 'claude' is installed and in PATH.`, - ), - ) - }) - - child.on('close', (code) => { - flushStep() - - // Get git diff after Claude has made changes - let diff = '' - try { - execSync('git add .', { cwd: this.cwd, stdio: 'ignore' }) - diff = execSync('git diff HEAD', { - cwd: this.cwd, - encoding: 'utf-8', - maxBuffer: 10 * 1024 * 1024, - }) - } catch { - // Ignore git errors - } - - if (code !== 0) { - reject( - new Error(`Claude CLI exited with code ${code}. stderr: ${stderr}`), - ) - return - } - - resolve({ - steps, - totalCostUsd, - diff, - }) - }) - }) - } -} diff --git a/evalbuff/src/runners/codebuff.ts b/evalbuff/src/runners/codebuff.ts deleted file mode 100644 index 867b95ee1a..0000000000 --- a/evalbuff/src/runners/codebuff.ts +++ /dev/null @@ -1,139 +0,0 @@ -import { execSync } from 'child_process' -import fs from 'fs' -import path from 'path' - -import type { Runner, RunnerResult, AgentStep } from './runner' -import type { CodebuffClient } from '@codebuff/sdk' - - -const DEBUG_ERROR = true - -export class CodebuffRunner implements Runner { - private cwd: string - private env?: Record - private client: CodebuffClient - private agentId: string - private localAgentDefinitions: any[] - private printEvents: boolean - private commitId: string - private parentSha: string - - constructor(options: { - cwd: string - env?: Record - client: CodebuffClient - agentId: string - localAgentDefinitions: any[] - printEvents: boolean - commitId: string - parentSha: string - }) { - 
this.cwd = options.cwd - this.env = options.env - this.client = options.client - this.agentId = options.agentId - this.localAgentDefinitions = options.localAgentDefinitions - this.printEvents = options.printEvents - this.commitId = options.commitId - this.parentSha = options.parentSha - } - - async run(prompt: string): Promise { - const steps: AgentStep[] = [] - let totalCostUsd = 0 - - const maxAgentSteps = 40 - const result = await this.client.run({ - agent: this.agentId, - prompt, - agentDefinitions: this.localAgentDefinitions, - cwd: this.cwd, - env: this.env, - maxAgentSteps, - handleEvent: (event) => { - if ( - (event.type === 'tool_call' || event.type === 'tool_result') && - event.toolName === 'set_messages' - ) { - return - } - if (event.type === 'error') { - console.error( - `[${this.commitId}:${this.agentId}] Error event:`, - event.message, - ) - if (DEBUG_ERROR && !event.message.startsWith('Invalid JSON')) { - // Save errors in a file, but not tool calls with invalid json. - fs.writeFileSync( - path.join( - __dirname, - '..', - `${this.commitId}-${this.agentId}-error-${Math.random().toString(36).substring(2, 6)}.json`, - ), - JSON.stringify( - { - error: event.message, - trace: steps, - }, - null, - 2, - ), - ) - } - } else if (this.printEvents) { - console.log( - `[${this.commitId}:${this.agentId}]`, - JSON.stringify(event, null, 2), - ) - } - steps.push(event) - }, - }) - - if (result.output.type === 'error') { - console.error( - `[${this.commitId}:${this.agentId}] Error:`, - result.output.message, - ) - if (DEBUG_ERROR) { - // Save errors in a file, but not tool calls with invalid json. - fs.writeFileSync( - path.join( - __dirname, - '..', - `${this.commitId}-${this.agentId}-error-${Math.random().toString(36).substring(2, 6)}.json`, - ), - JSON.stringify( - { - ...result.output, - trace: steps, - }, - null, - 2, - ), - ) - } - } - - totalCostUsd = (result.sessionState?.mainAgentState.creditsUsed ?? 
0) / 100 - - // Get git diff after Codebuff has made changes - let diff = '' - try { - execSync('git add .', { cwd: this.cwd, stdio: 'ignore' }) - diff = execSync(`git diff ${this.parentSha}`, { - cwd: this.cwd, - encoding: 'utf-8', - maxBuffer: 10 * 1024 * 1024, - }) - } catch { - // Ignore git errors - } - - return { - steps, - totalCostUsd, - diff, - } - } -} diff --git a/evalbuff/src/runners/codex.ts b/evalbuff/src/runners/codex.ts deleted file mode 100644 index b8a3ad7726..0000000000 --- a/evalbuff/src/runners/codex.ts +++ /dev/null @@ -1,143 +0,0 @@ -import { execSync, spawn } from 'child_process' - -import type { Runner, RunnerResult, AgentStep } from './runner' - -export class CodexRunner implements Runner { - private cwd: string - private env: Record - - constructor(cwd: string, env: Record = {}) { - this.cwd = cwd - this.env = env - } - - async run(prompt: string): Promise { - const steps: AgentStep[] = [] - let totalCostUsd = 0 - - return new Promise((resolve, reject) => { - // Codex CLI uses the prompt as a positional argument - // Use exec subcommand with --full-auto for automatic execution - // --full-auto enables -a on-failure and --sandbox workspace-write - // Use --json for structured output that we can parse - const args = [ - 'exec', - '--full-auto', - '--json', - '-m', - 'gpt-5.1-codex', - prompt, - ] - - console.log(`[CodexRunner] Running: codex ${args.join(' ')}`) - - const child = spawn('codex', args, { - cwd: this.cwd, - env: { - ...process.env, - ...this.env, - CODEX_API_KEY: process.env.OPENAI_API_KEY || this.env.OPENAI_API_KEY, - }, - // Use 'ignore' for stdin to prevent the CLI from waiting for input - stdio: ['ignore', 'pipe', 'pipe'], - }) - - let _stdout = '' - let stderr = '' - - child.stdout.on('data', (data: Buffer) => { - const chunk = data.toString() - _stdout += chunk - process.stdout.write(chunk) - - // Codex outputs events as JSON lines in some modes - const lines = chunk.split('\n').filter((line) => line.trim()) - for (const 
line of lines) { - try { - const event = JSON.parse(line) - if (event.type === 'message') { - steps.push({ - type: 'text', - text: event.content || event.message || '', - }) - } else if ( - event.type === 'function_call' || - event.type === 'tool' - ) { - steps.push({ - type: 'tool_call', - toolName: event.name || event.function?.name || 'unknown', - toolCallId: event.id || `codex-${Date.now()}`, - input: event.arguments || event.function?.arguments || {}, - }) - } else if ( - event.type === 'function_result' || - event.type === 'tool_result' - ) { - steps.push({ - type: 'tool_result', - toolName: event.name || 'unknown', - toolCallId: event.id || `codex-${Date.now()}`, - output: [ - { - type: 'json', - value: event.result || event.output || '', - }, - ], - }) - } - } catch { - // Plain text output, add as text step - if (line.trim()) { - steps.push({ - type: 'text', - text: line, - }) - } - } - } - }) - - child.stderr.on('data', (data: Buffer) => { - stderr += data.toString() - process.stderr.write(data) - }) - - child.on('error', (error) => { - reject( - new Error( - `Codex CLI failed to start: ${error.message}. Make sure 'codex' is installed and in PATH.`, - ), - ) - }) - - child.on('close', (code) => { - // Get git diff after Codex has made changes - let diff = '' - try { - execSync('git add .', { cwd: this.cwd, stdio: 'ignore' }) - diff = execSync('git diff HEAD', { - cwd: this.cwd, - encoding: 'utf-8', - maxBuffer: 10 * 1024 * 1024, - }) - } catch { - // Ignore git errors - } - - if (code !== 0) { - reject( - new Error(`Codex CLI exited with code ${code}. 
stderr: ${stderr}`), - ) - return - } - - resolve({ - steps, - totalCostUsd, // Codex doesn't report cost in CLI output - diff, - }) - }) - }) - } -} diff --git a/evalbuff/src/runners/index.ts b/evalbuff/src/runners/index.ts deleted file mode 100644 index 99adc3d28a..0000000000 --- a/evalbuff/src/runners/index.ts +++ /dev/null @@ -1,3 +0,0 @@ -export { ClaudeRunner } from './claude' -export { CodexRunner } from './codex' -export type { Runner, RunnerResult } from './runner' diff --git a/evalbuff/src/runners/runner.ts b/evalbuff/src/runners/runner.ts deleted file mode 100644 index ea450caaab..0000000000 --- a/evalbuff/src/runners/runner.ts +++ /dev/null @@ -1,13 +0,0 @@ -import type { PrintModeEvent } from '@codebuff/common/types/print-mode' - -export type AgentStep = PrintModeEvent - -export type RunnerResult = { - steps: AgentStep[] - totalCostUsd: number - diff: string -} - -export interface Runner { - run: (prompt: string) => Promise -} diff --git a/evalbuff/src/test-repo-utils.ts b/evalbuff/src/test-repo-utils.ts deleted file mode 100644 index 7c1ba6700e..0000000000 --- a/evalbuff/src/test-repo-utils.ts +++ /dev/null @@ -1,143 +0,0 @@ -import { execSync } from 'child_process' -import fs from 'fs' -import * as os from 'os' -import path from 'path' - -import { getErrorObject } from '@codebuff/common/util/error' - -/** - * Helper function to manage test repository lifecycle - * Sets up a test repo, runs a function with the repo cwd, then cleans up. - * - * When localRepoPath is provided, uses a local clone (near-instant via hardlinks) - * instead of a remote clone (5-30s per clone). This is the single biggest - * speedup in evalbuff — with parallelism=5, saves 10-30 remote clones per commit. - */ -export const withTestRepo = async ( - repoConfig: { - repoUrl: string - localRepoPath?: string - // The sha of the commit to checkout. If you have a commit with changes to replicate, you would check out the parent commit. 
- parentSha: string - initCommand?: string - env?: Record - }, - fn: (cwd: string) => Promise, -): Promise => { - const { repoUrl, localRepoPath, parentSha, initCommand, env } = repoConfig - - // Create a temporary directory for the test repo - const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codebuff-eval-')) - const repoDir = path.join(tempDir, 'repo') - - try { - if (localRepoPath) { - // Local clone: uses hardlinks for objects, nearly instant - execSync(`git clone --no-checkout "${localRepoPath}" "${repoDir}"`, { stdio: 'ignore' }) - execSync(`git checkout ${parentSha}`, { cwd: repoDir, stdio: 'ignore' }) - } else { - // Remote clone: slow but works without local repo - execSync(`git clone --depth 1 ${repoUrl} ${repoDir}`, { stdio: 'ignore' }) - - execSync(`git fetch --depth 1 origin ${parentSha}`, { - cwd: repoDir, - stdio: 'ignore', - }) - execSync(`git checkout ${parentSha}`, { cwd: repoDir, stdio: 'ignore' }) - } - - if (initCommand) { - console.log(`Running init command: ${initCommand}...`) - try { - execSync(initCommand, { - cwd: repoDir, - stdio: 'ignore', - env: { ...process.env, ...env }, - }) - } catch (error) { - console.error( - `Error running init command: ${getErrorObject(error).message}`, - ) - } - } - - // Run the provided function with the repo directory - return await fn(repoDir) - } finally { - // Clean up the temporary directory - try { - fs.rmSync(tempDir, { recursive: true, force: true }) - } catch (error) { - console.warn(`Failed to clean up temporary directory: ${error}`) - } - } -} - -export const withTestRepoAndParent = async ( - repoConfig: { - repoUrl: string - commitSha: string - initCommand?: string - }, - fn: (cwd: string, commitSha: string, parentSha: string) => Promise, -): Promise => { - const { repoUrl, commitSha, initCommand } = repoConfig - - const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codebuff-eval-')) - const repoDir = path.join(tempDir, 'repo') - - try { - execSync(`git clone --depth 1 ${repoUrl} 
${repoDir}`, { stdio: 'ignore' }) - - execSync(`git fetch --depth 2 origin ${commitSha}`, { - cwd: repoDir, - stdio: 'ignore', - }) - - execSync(`git checkout ${commitSha}`, { cwd: repoDir, stdio: 'ignore' }) - - let parentSha: string - try { - const parents = execSync(`git log --pretty=%P -n 1 ${commitSha}`, { - cwd: repoDir, - encoding: 'utf-8', - stdio: ['ignore', 'pipe', 'ignore'], - }).trim() - - if (!parents) { - console.warn( - `Commit ${commitSha.slice(0, 8)} has no parent (initial commit)`, - ) - return null - } - - const parentList = parents.split(' ') - if (parentList.length > 1) { - console.warn( - `Commit ${commitSha.slice(0, 8)} is a merge commit (${parentList.length} parents)`, - ) - return null - } - - parentSha = parentList[0] - } catch (error) { - console.error(`Error getting parent for ${commitSha.slice(0, 8)}:`, error) - return null - } - - execSync(`git checkout ${parentSha}`, { cwd: repoDir, stdio: 'ignore' }) - - if (initCommand) { - console.log(`Running init command: ${initCommand}...`) - execSync(initCommand, { cwd: repoDir, stdio: 'ignore' }) - } - - return await fn(repoDir, commitSha, parentSha) - } finally { - try { - fs.rmSync(tempDir, { recursive: true, force: true }) - } catch (error) { - console.warn(`Failed to clean up temporary directory: ${error}`) - } - } -} diff --git a/evalbuff/src/trace-compressor.ts b/evalbuff/src/trace-compressor.ts deleted file mode 100644 index 995f08b2cd..0000000000 --- a/evalbuff/src/trace-compressor.ts +++ /dev/null @@ -1,284 +0,0 @@ -import fs from 'fs' -import path from 'path' - -/** - * A compressed trace where large tool results are stored in separate files. 
- * The inline trace keeps the full reasoning + tool calls but replaces - * tool result bodies with pointers like: - * [Tool result stored in: /tmp/evalbuff-traces-xxx/result-003.txt (2847 chars)] - */ -export interface CompressedTrace { - /** The trace with large tool results replaced by file pointers */ - inline: string - /** Directory containing the extracted result files (caller should clean up) */ - traceDir: string -} - -/** Minimum size (chars) for a tool result body to get extracted to a file */ -const EXTRACT_THRESHOLD = 300 - -/** - * Compress an agent trace by extracting large tool results into files. - * - * Supports multiple trace formats: - * 1. JSON-lines streaming (Claude `--output-format stream-json`) - * 2. Structured text with code blocks / indented output - * - * Returns the compressed inline trace + path to the directory of result files. - */ -export function compressTrace( - rawTrace: string, - traceDir: string, -): CompressedTrace { - fs.mkdirSync(traceDir, { recursive: true }) - - // Try JSON-lines first (Claude streaming format) - const jsonResult = tryCompressJsonLines(rawTrace, traceDir) - if (jsonResult) return jsonResult - - // Fall back to heuristic text compression - return compressTextTrace(rawTrace, traceDir) -} - -/** - * Try to parse as JSON-lines (one JSON object per line). - * Claude CLI with --output-format stream-json emits events like: - * {"type":"tool_use","name":"Read","input":{...}} - * {"type":"tool_result","content":"...huge file contents..."} - */ -function tryCompressJsonLines( - rawTrace: string, - traceDir: string, -): CompressedTrace | null { - const lines = rawTrace.split('\n') - - // Quick check: are most non-empty lines valid JSON? 
- const nonEmpty = lines.filter((l) => l.trim()) - if (nonEmpty.length < 2) return null - - let jsonCount = 0 - for (const line of nonEmpty.slice(0, 10)) { - try { - JSON.parse(line) - jsonCount++ - } catch { - // not json - } - } - if (jsonCount < nonEmpty.length * 0.5) return null - - // Parse and compress - const outputLines: string[] = [] - let fileIdx = 0 - - for (const line of lines) { - const trimmed = line.trim() - if (!trimmed) { - outputLines.push('') - continue - } - - let parsed: any - try { - parsed = JSON.parse(trimmed) - } catch { - outputLines.push(line) - continue - } - - // Check if this is a tool result with large content - if (isToolResultEvent(parsed)) { - const content = extractToolResultContent(parsed) - if (content && content.length > EXTRACT_THRESHOLD) { - const fileName = `result-${String(fileIdx).padStart(3, '0')}.txt` - const filePath = path.join(traceDir, fileName) - fs.writeFileSync(filePath, content) - fileIdx++ - - // Replace content with pointer, keep the rest of the event - const summary = summarizeContent(content) - const compressed = replaceToolResultContent( - parsed, - `[Stored in: ${filePath} (${content.length} chars) — ${summary}]`, - ) - outputLines.push(JSON.stringify(compressed)) - continue - } - } - - outputLines.push(line) - } - - return { - inline: outputLines.join('\n'), - traceDir, - } -} - -/** - * Heuristic compression for unstructured text traces. - * Detects large blocks (code fences, indented blocks, long output runs) - * and extracts them to files. - */ -function compressTextTrace( - rawTrace: string, - traceDir: string, -): CompressedTrace { - const lines = rawTrace.split('\n') - const outputLines: string[] = [] - let fileIdx = 0 - let i = 0 - - while (i < lines.length) { - // Detect code fence blocks: ``` ... 
``` - if (lines[i].trim().startsWith('```')) { - const blockStart = i - const openFence = lines[i].trim() - i++ - const blockLines: string[] = [lines[blockStart]] - - // Find closing fence - while (i < lines.length) { - blockLines.push(lines[i]) - if (lines[i].trim() === '```' || lines[i].trim() === openFence) { - i++ - break - } - i++ - } - - const blockContent = blockLines.join('\n') - if (blockContent.length > EXTRACT_THRESHOLD) { - const fileName = `result-${String(fileIdx).padStart(3, '0')}.txt` - const filePath = path.join(traceDir, fileName) - fs.writeFileSync(filePath, blockContent) - fileIdx++ - const summary = summarizeContent(blockContent) - outputLines.push( - `[Code block stored in: ${filePath} (${blockContent.length} chars) — ${summary}]`, - ) - } else { - outputLines.push(...blockLines) - } - continue - } - - // Detect indented blocks (4+ spaces or tab) — common for tool output - if (/^(?: |\t)/.test(lines[i]) && i + 1 < lines.length) { - const blockStart = i - const blockLines: string[] = [] - while (i < lines.length && (/^(?: |\t)/.test(lines[i]) || lines[i].trim() === '')) { - blockLines.push(lines[i]) - i++ - } - - // Only extract if it's a large block (not just 2-3 indented lines) - const blockContent = blockLines.join('\n') - if (blockContent.length > EXTRACT_THRESHOLD && blockLines.length > 5) { - const fileName = `result-${String(fileIdx).padStart(3, '0')}.txt` - const filePath = path.join(traceDir, fileName) - fs.writeFileSync(filePath, blockContent) - fileIdx++ - const summary = summarizeContent(blockContent) - outputLines.push( - `[Indented block stored in: ${filePath} (${blockContent.length} chars, ${blockLines.length} lines) — ${summary}]`, - ) - } else { - outputLines.push(...blockLines) - } - continue - } - - outputLines.push(lines[i]) - i++ - } - - return { - inline: outputLines.join('\n'), - traceDir, - } -} - -// --- Helpers --- - -/** Check if a parsed JSON event is a tool result */ -function isToolResultEvent(event: any): boolean 
{ - if (!event || typeof event !== 'object') return false - // Claude streaming: {"type":"tool_result",...} or {"type":"content_block_delta","delta":{"type":"tool_result",...}} - if (event.type === 'tool_result') return true - if (event.type === 'content_block_stop' && event.content_block?.type === 'tool_result') return true - // Codex: {"type":"function_result",...} - if (event.type === 'function_result') return true - // Generic: anything with a large "content" or "output" or "result" field - for (const key of ['content', 'output', 'result', 'text']) { - if (typeof event[key] === 'string' && event[key].length > EXTRACT_THRESHOLD) return true - } - return false -} - -/** Extract the large content body from a tool result event */ -function extractToolResultContent(event: any): string | null { - // Try common field names in order of specificity - for (const key of ['content', 'output', 'result', 'text']) { - if (typeof event[key] === 'string') return event[key] - // Nested: event.content[0].text (Claude format) - if (Array.isArray(event[key])) { - const texts = event[key] - .filter((item: any) => typeof item === 'object' && typeof item.text === 'string') - .map((item: any) => item.text) - if (texts.length > 0) return texts.join('\n') - } - } - // Check nested delta - if (event.delta && typeof event.delta === 'object') { - return extractToolResultContent(event.delta) - } - return null -} - -/** Replace the content body in a tool result event with a pointer string */ -function replaceToolResultContent(event: any, pointer: string): any { - const clone = { ...event } - for (const key of ['content', 'output', 'result', 'text']) { - if (typeof clone[key] === 'string') { - clone[key] = pointer - return clone - } - if (Array.isArray(clone[key])) { - clone[key] = [{ type: 'text', text: pointer }] - return clone - } - } - if (clone.delta) { - clone.delta = replaceToolResultContent({ ...clone.delta }, pointer) - } - return clone -} - -/** Generate a short summary of content 
for the inline pointer */ -function summarizeContent(content: string): string { - const firstLine = content.split('\n').find((l) => l.trim())?.trim() || '' - const lineCount = content.split('\n').length - - // Detect content type - if (content.includes('```')) return `code block, ${lineCount} lines` - if (firstLine.startsWith('{') || firstLine.startsWith('[')) return `JSON, ${lineCount} lines` - if (firstLine.match(/^\s*\d+[→|│:]/)) return `file content, ${lineCount} lines` - if (firstLine.startsWith('diff ') || firstLine.startsWith('---')) return `diff, ${lineCount} lines` - if (firstLine.startsWith('$') || firstLine.startsWith('>')) return `command output, ${lineCount} lines` - - // Use first line as summary, truncated - const short = firstLine.length > 60 ? firstLine.slice(0, 57) + '...' : firstLine - return `${short} (${lineCount} lines)` -} - -/** - * Clean up a trace directory. - */ -export function cleanupTraceDir(traceDir: string): void { - try { - fs.rmSync(traceDir, { recursive: true, force: true }) - } catch { - // ignore - } -} diff --git a/evalbuff/src/types.ts b/evalbuff/src/types.ts deleted file mode 100644 index 52d30196aa..0000000000 --- a/evalbuff/src/types.ts +++ /dev/null @@ -1,83 +0,0 @@ -import type { JudgingResult } from './judge' - -export interface FileState { - path: string - preContent: string - postContent: string -} - -export interface EvalCommit { - sha: string - parentSha: string - spec: string - fileStates: FileState[] -} - -export interface EvalData { - repoUrl: string - testRepoName?: string - generationDate: string - initCommand?: string - evalCommits: EvalCommit[] -} - -export interface FileDiff { - path: string - status: 'modified' | 'added' | 'deleted' | 'renamed' - oldPath?: string - diff: string -} - -export interface EvalCommitV2 { - id: string - sha: string - parentSha: string - spec: string - prompt: string - supplementalFiles: string[] - fileDiffs: FileDiff[] -} - -export interface BinInstall { - name: string - 
installScript: string - binPath: string -} - -export interface EvalDataV2 { - repoUrl: string - testRepoName?: string - generationDate: string - initCommand?: string - binInstalls?: BinInstall[] - env?: Record - finalCheckCommands?: string[] - evalCommits: EvalCommitV2[] -} - -export interface FinalCheckOutput { - command: string - exitCode: number - stdout: string - stderr: string -} - -export interface EvalRun { - commitSha: string - prompt: string - diff: string - judging: JudgingResult - cost: number - durationMs: number - error?: string - finalCheckOutputs?: FinalCheckOutput[] -} - -export interface AgentEvalResults { - agentId: string - runs: EvalRun[] - averageScore: number - averageScoreExcludingFailures: number - averageCost: number - averageDuration: number -} diff --git a/evalbuff/tsconfig.json b/evalbuff/tsconfig.json deleted file mode 100644 index fcd93ea3e0..0000000000 --- a/evalbuff/tsconfig.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "extends": "../tsconfig.base.json", - "compilerOptions": { - "types": ["bun", "node"], - "baseUrl": ".", - "skipLibCheck": true, - "paths": { - "@codebuff/sdk": ["../sdk/src/index.ts"], - "@codebuff/sdk/*": ["../sdk/src/*"] - } - }, - "include": ["src/**/*.ts"], - "exclude": ["node_modules"] -} diff --git a/expensivebuff/cli/release/README.md b/expensivebuff/cli/release/README.md deleted file mode 100644 index 759196485b..0000000000 --- a/expensivebuff/cli/release/README.md +++ /dev/null @@ -1,51 +0,0 @@ -# Expensivebuff - -**The world's most expensive coding agent.** Because sometimes you just need to show off. - -An AI coding agent that runs in your terminal with premium branding and absolutely no additional features. - -## Install - -```bash -npm install -g expensivebuff -``` - -## Usage - -```bash -cd ~/my-project -expensivebuff -``` - -## Why Expensivebuff? - -**Expensive** - It says so right in the name. What more do you need? - -**Premium** - Built with luxury in mind. Same code as Codebuff, but fancier. 
- -**Irony** - Sometimes the best jokes are the ones that cost money. - -## FAQ - -**Is it actually more expensive?** No. It's exactly the same as Codebuff. The joke is the point. - -**Why would I use this?** You probably wouldn't. But it's fun to run `npm i -g expensivebuff` and see the logo. - -**Is this for real?** It's an April Fools joke. The code is identical to Codebuff. - -## The Joke - -``` -Codebuff is now Expensivebuff! -npm i -g expensivebuff -``` - -Because nothing says "I have too much money to spend on software" like a coding agent with a different name. - -## Links - -- [Documentation](https://codebuff.com/docs) -- [GitHub](https://github.com/CodebuffAI/codebuff) -- [Website](https://codebuff.com) - -> Built on the [Codebuff](https://codebuff.com) platform. \ No newline at end of file diff --git a/expensivebuff/cli/release/index.js b/expensivebuff/cli/release/index.js deleted file mode 100644 index caea24c263..0000000000 --- a/expensivebuff/cli/release/index.js +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env node - -const LOGO = ` - ███████╗██╗ ██╗██████╗ ███████╗███╗ ██╗███████╗██╗██╗ ██╗███████╗ - ██╔════╝╚██╗██╔╝██╔══██╗██╔════╝████╗ ██║██╔════╝██║██║ ██║██╔════╝ - █████╗ ╚███╔╝ ██████╔╝█████╗ ██╔██╗ ██║███████╗██║██║ ██║█████╗ - ██╔══╝ ██╔██╗ ██╔═══╝ ██╔══╝ ██║╚██╗██║╚════██║██║╚██╗ ██╔╝██╔══╝ - ███████╗██╔╝ ██╗██║ ███████╗██║ ╚████║███████║██║ ╚████╔╝ ███████╗ - ╚══════╝╚═╝ ╚═╝╚═╝ ╚══════╝╚═╝ ╚═══╝╚══════╝╚═╝ ╚═══╝ ╚══════╝ - ██████╗ ██╗ ██╗███████╗███████╗ - ██╔══██╗██║ ██║██╔════╝██╔════╝ - ██████╔╝██║ ██║█████╗ █████╗ - ██╔══██╗██║ ██║██╔══╝ ██╔══╝ - ██████╔╝╚██████╔╝██║ ██║ - ╚═════╝ ╚═════╝ ╚═╝ ╚═╝ -` - -console.log(LOGO) -console.log(' 🎉 April Fools! 🎉') -console.log() -console.log(' Expensivebuff isn\'t real (yet). 
But while you\'re here, check out these other coding agents:') -console.log() -console.log(' Codebuff — the powerful AI coding agent') -console.log(' \x1b[36mnpm i -g codebuff\x1b[0m') -console.log() -console.log(' Freebuff — the free AI coding agent') -console.log(' \x1b[36mnpm i -g freebuff\x1b[0m') -console.log() -console.log(' Learn more at \x1b[4mhttps://codebuff.com\x1b[0m') -console.log() diff --git a/expensivebuff/cli/release/package.json b/expensivebuff/cli/release/package.json deleted file mode 100644 index 7b761c8d7d..0000000000 --- a/expensivebuff/cli/release/package.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "name": "expensivebuff", - "version": "1.0.4", - "description": "The world's most expensive coding agent", - "license": "MIT", - "bin": { - "expensivebuff": "index.js" - }, - "files": [ - "index.js", - "README.md" - ], - "engines": { - "node": ">=16" - }, - "repository": { - "type": "git", - "url": "https://github.com/CodebuffAI/codebuff.git" - }, - "homepage": "https://codebuff.com", - "publishConfig": { - "access": "public" - } -} diff --git a/package.json b/package.json index ef4f2ea967..ad1d8002dc 100644 --- a/package.json +++ b/package.json @@ -13,7 +13,6 @@ "packages/*", "scripts", "evals", - "evalbuff", "sdk", "agents", "cli"