garrytan · garrytan · Mar 16, 2026 · Mar 16, 2026 · Mar 16, 2026 · Mar 16, 2026
diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
@@ -120,7 +120,7 @@ Refs (`@e1`, `@e2`, `@c1`) are how the agent addresses page elements without wri
 2. Server calls Playwright's page.accessibility.snapshot()
 3. Parser walks the ARIA tree, assigns sequential refs: @e1, @e2, @e3...
 4. For each ref, builds a Playwright Locator: getByRole(role, { name }).nth(index)
-5. Stores Map<string, Locator> on the BrowserManager instance
+5. Stores Map<string, RefEntry> on the BrowserManager instance (role + name + Locator)
 6. Returns the annotated tree as plain text
 
 Later:
@@ -142,6 +142,19 @@ Playwright Locators are external to the DOM. They use the accessibility tree (wh
 
 Refs are cleared on navigation (the `framenavigated` event on the main frame). This is correct — after navigation, all locators are stale. The agent must run `snapshot` again to get fresh refs. This is by design: stale refs should fail loudly, not click the wrong element.
 
+### Ref staleness detection
+
+SPAs can mutate the DOM without triggering `framenavigated` (e.g. React router transitions, tab switches, modal opens). This makes refs stale even though the page URL didn't change. To catch this, `resolveRef()` performs an async `count()` check before using any ref:
+
+```
+resolveRef(@e3) → entry = refMap.get("e3")
+                → count = await entry.locator.count()
+                → if count === 0: throw `Ref @e3 (<role> "<name>") is stale — element no longer exists. Run 'snapshot' for fresh refs.`
+                → if count > 0: return { locator }
+```
+
+This fails fast (~5ms overhead) instead of letting Playwright's 30-second action timeout expire on a missing element. The `RefEntry` stores `role` and `name` metadata alongside the Locator so the error message can tell the agent what the element was.
+
 ### Cursor-interactive refs (@c)
 
 The `-C` flag finds elements that are clickable but not in the ARIA tree — things styled with `cursor: pointer`, elements with `onclick` attributes, or custom `tabindex`. These get `@c1`, `@c2` refs in a separate namespace. This catches custom components that frameworks render as `<div>` but are actually buttons.

diff --git a/BROWSER.md b/BROWSER.md
@@ -87,6 +87,8 @@ The browser's key innovation is ref-based element selection, built on Playwright
 
 No DOM mutation. No injected scripts. Just Playwright's native accessibility API.
 
+**Ref staleness detection:** SPAs can mutate the DOM without navigation (React router, tab switches, modals). When this happens, refs collected from a previous `snapshot` may point to elements that no longer exist. To handle this, `resolveRef()` runs an async `count()` check before using any ref — if the element count is 0, it throws immediately with a message telling the agent to re-run `snapshot`. This fails fast (~5ms) instead of waiting for Playwright's 30-second action timeout.
+
 **Extended snapshot features:**
 - `--diff` (`-D`): Stores each snapshot as a baseline. On the next `-D` call, returns a unified diff showing what changed. Use this to verify that an action (click, fill, etc.) actually worked.
 - `--annotate` (`-a`): Injects temporary overlay divs at each ref's bounding box, takes a screenshot with ref labels visible, then removes the overlays. Use `-o <path>` to control the output path.

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,30 @@
 # Changelog
 
+## 0.4.0 — 2026-03-16
+
+### Added
+- **QA-only skill** (`/qa-only`) — report-only QA mode that finds and documents bugs without making fixes. Hand off a clean bug report to your team without the agent touching your code.
+- **QA fix loop** — `/qa` now runs a find-fix-verify cycle: discover bugs, fix them, commit, re-navigate to confirm the fix took. One command to go from broken to shipped.
+- **Plan-to-QA artifact flow** — `/plan-eng-review` writes test-plan artifacts that `/qa` picks up automatically. Your engineering review now feeds directly into QA testing with no manual copy-paste.
+- **`{{QA_METHODOLOGY}}` DRY placeholder** — shared QA methodology block injected into both `/qa` and `/qa-only` templates. Keeps both skills in sync when you update testing standards.
+- **Eval efficiency metrics** — turns, duration, and cost now displayed across all eval surfaces with natural-language **Takeaway** commentary. See at a glance whether your prompt changes made the agent faster or slower.
+- **`generateCommentary()` engine** — interprets comparison deltas so you don't have to: flags regressions, notes improvements, and produces an overall efficiency summary.
+- **Eval list columns** — `bun run eval:list` now shows Turns and Duration per run. Spot expensive or slow runs instantly.
+- **Eval summary per-test efficiency** — `bun run eval:summary` shows average turns/duration/cost per test across runs. Identify which tests are costing you the most over time.
+- **`judgePassed()` unit tests** — extracted and tested the pass/fail judgment logic.
+- **3 new E2E tests** — qa-only no-fix guardrail, qa fix loop with commit verification, plan-eng-review test-plan artifact.
+- **Browser ref staleness detection** — `resolveRef()` now checks element count to detect stale refs after page mutations. SPA navigation no longer causes 30-second timeouts on missing elements.
+- 3 new snapshot tests for ref staleness.
+
+### Changed
+- QA skill prompt restructured with explicit two-cycle workflow (find → fix → verify).
+- `formatComparison()` now shows per-test turns and duration deltas alongside cost.
+- `printSummary()` shows turns and duration columns.
+- `eval-store.test.ts` fixed pre-existing `_partial` file assertion bug.
+
+### Fixed
+- Browser ref staleness — refs collected before a page mutation (e.g. SPA navigation) are now detected at use time and fail fast with an error telling the agent to re-run `snapshot`. Eliminates a class of flaky QA failures on dynamic sites.
+
 ## 0.3.9 — 2026-03-15
 
 ### Added

diff --git a/CLAUDE.md b/CLAUDE.md
@@ -42,6 +42,7 @@ gstack/
 │   ├── gen-skill-docs.test.ts    # Tier 1: generator quality (free, <1s)
 │   ├── skill-llm-eval.test.ts   # Tier 3: LLM-as-judge (~$0.15/run)
 │   └── skill-e2e.test.ts         # Tier 2: E2E via claude -p (~$3.85/run)
+├── qa-only/         # /qa-only skill (report-only QA, no fixes)
 ├── ship/            # Ship workflow skill
 ├── review/          # PR review skill
 ├── plan-ceo-review/ # /plan-ceo-review skill

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -131,11 +131,13 @@ When E2E tests run, they produce machine-readable artifacts in `~/.gstack-dev/`:
 **Eval history tools:**
 
 ```bash
-bun run eval:list            # list all eval runs
-bun run eval:compare         # compare two runs (auto-picks most recent)
-bun run eval:summary         # aggregate stats across all runs
+bun run eval:list            # list all eval runs (turns, duration, cost per run)
+bun run eval:compare         # compare two runs — shows per-test deltas + Takeaway commentary
+bun run eval:summary         # aggregate stats + per-test efficiency averages across runs
 ```
 
+**Eval comparison commentary:** `eval:compare` generates natural-language Takeaway sections interpreting what changed between runs — flagging regressions, noting improvements, calling out efficiency gains (fewer turns, faster, cheaper), and producing an overall summary. This is driven by `generateCommentary()` in `eval-store.ts`.
+
 Artifacts are never cleaned up — they accumulate in `~/.gstack-dev/` for post-mortem debugging and trend analysis.
 
 ### Tier 3: LLM-as-judge (~$0.15/run)

diff --git a/README.md b/README.md
@@ -2,7 +2,7 @@
 
 **gstack turns Claude Code from one generic assistant into a team of specialists you can summon on demand.**
 
-Eight opinionated workflow skills for [Claude Code](https://docs.anthropic.com/en/docs/claude-code). Plan review, code review, one-command shipping, browser automation, QA testing, and engineering retrospectives — all as slash commands.
+Nine opinionated workflow skills for [Claude Code](https://docs.anthropic.com/en/docs/claude-code). Plan review, code review, one-command shipping, browser automation, QA testing, and engineering retrospectives — all as slash commands.
 
 ### Without gstack
 
@@ -22,7 +22,8 @@ Eight opinionated workflow skills for [Claude Code](https://docs.anthropic.com/e
 | `/review` | Paranoid staff engineer | Find the bugs that pass CI but blow up in production. Triages Greptile review comments. |
 | `/ship` | Release engineer | Sync main, run tests, resolve Greptile reviews, push, open PR. For a ready branch, not for deciding what to build. |
 | `/browse` | QA engineer | Give the agent eyes. It logs in, clicks through your app, takes screenshots, catches breakage. Full QA pass in 60 seconds. |
-| `/qa` | QA lead | Systematic QA testing. On a feature branch, auto-analyzes your diff, identifies affected pages, and tests them. Also: full exploration, quick smoke test, regression mode. |
+| `/qa` | QA + fix engineer | Test app, find bugs, fix them with atomic commits, re-verify. Before/after health scores and ship-readiness summary. Three tiers: Quick, Standard, Exhaustive. |
+| `/qa-only` | QA reporter | Report-only QA testing. Same methodology as /qa but never fixes anything. Use when you want a pure bug report without code changes. |
 | `/setup-browser-cookies` | Session manager | Import cookies from your real browser (Comet, Chrome, Arc, Brave, Edge) into the headless session. Test authenticated pages without logging in manually. |
 | `/retro` | Engineering manager | Team-aware retro: your deep-dive + per-person praise and growth opportunities for every contributor. |
 
@@ -103,7 +104,7 @@ This is the setup I use. One person, ten parallel agents, each with the right co
 
 Open Claude Code and paste this. Claude will do the rest.
 
-> Install gstack: run `git clone https://github.com/garrytan/gstack.git ~/.claude/skills/gstack && cd ~/.claude/skills/gstack && ./setup` then add a "gstack" section to CLAUDE.md that says to use the /browse skill from gstack for all web browsing, never use mcp\_\_claude-in-chrome\_\_\* tools, and lists the available skills: /plan-ceo-review, /plan-eng-review, /review, /ship, /browse, /qa, /setup-browser-cookies, /retro. Then ask the user if they also want to add gstack to the current project so teammates get it.
+> Install gstack: run `git clone https://github.com/garrytan/gstack.git ~/.claude/skills/gstack && cd ~/.claude/skills/gstack && ./setup` then add a "gstack" section to CLAUDE.md that says to use the /browse skill from gstack for all web browsing, never use mcp\_\_claude-in-chrome\_\_\* tools, and lists the available skills: /plan-ceo-review, /plan-eng-review, /review, /ship, /browse, /qa, /qa-only, /setup-browser-cookies, /retro. Then ask the user if they also want to add gstack to the current project so teammates get it.
 
 ### Step 2: Add to your repo so teammates get it (optional)
 
@@ -613,7 +614,7 @@ Or set `auto_upgrade: true` in `~/.gstack/config.yaml` to upgrade automatically
 
 Paste this into Claude Code:
 
-> Uninstall gstack: remove the skill symlinks by running `for s in browse plan-ceo-review plan-eng-review review ship retro qa setup-browser-cookies; do rm -f ~/.claude/skills/$s; done` then run `rm -rf ~/.claude/skills/gstack` and remove the gstack section from CLAUDE.md. If this project also has gstack at .claude/skills/gstack, remove it by running `for s in browse plan-ceo-review plan-eng-review review ship retro qa setup-browser-cookies; do rm -f .claude/skills/$s; done && rm -rf .claude/skills/gstack` and remove the gstack section from the project CLAUDE.md too.
+> Uninstall gstack: remove the skill symlinks by running `for s in browse plan-ceo-review plan-eng-review review ship retro qa qa-only setup-browser-cookies; do rm -f ~/.claude/skills/$s; done` then run `rm -rf ~/.claude/skills/gstack` and remove the gstack section from CLAUDE.md. If this project also has gstack at .claude/skills/gstack, remove it by running `for s in browse plan-ceo-review plan-eng-review review ship retro qa qa-only setup-browser-cookies; do rm -f .claude/skills/$s; done && rm -rf .claude/skills/gstack` and remove the gstack section from the project CLAUDE.md too.
 
 ## Development
 

diff --git a/TODOS.md b/TODOS.md
@@ -350,6 +350,30 @@
 **Priority:** P3
 **Depends on:** Eval persistence (shipped in v0.3.6)
 
+### CI/CD QA quality gate
+
+**What:** Run `/qa` as a GitHub Action step, fail PR if health score drops below threshold.
+
+**Why:** Automated quality gate catches regressions before merge. Currently QA is manual — CI integration makes it part of the standard workflow.
+
+**Context:** Requires headless browse binary available in CI. The `/qa` skill already produces `baseline.json` with health scores — CI step would compare against the main branch baseline and fail if score drops. Would need `ANTHROPIC_API_KEY` in CI secrets since `/qa` uses Claude.
+
+**Effort:** M
+**Priority:** P2
+**Depends on:** None
+
+### CDP-based DOM mutation detection for ref staleness
+
+**What:** Use Chrome DevTools Protocol `DOM.documentUpdated` events and/or a page-side MutationObserver to proactively invalidate stale refs when the DOM changes, without requiring an explicit `snapshot` call.
+
+**Why:** Current ref staleness detection (async count() check) only catches stale refs at action time. CDP mutation detection would proactively warn as soon as refs become stale, preventing stale-ref failures and action timeouts entirely for SPA re-renders.
+
+**Context:** Parts 1+2 of ref staleness fix (RefEntry metadata + eager validation via count()) are shipped. This is Part 3 — the most ambitious piece. Requires CDP session alongside Playwright, MutationObserver bridge, and careful performance tuning to avoid overhead on every DOM change.
+
+**Effort:** L
+**Priority:** P3
+**Depends on:** Ref staleness Parts 1+2 (shipped)
+
 ## Completed
 
 ### Phase 1: Foundations (v0.2.0)

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-0.3.9
+0.4.0
diff --git a/browse/src/browser-manager.ts b/browse/src/browser-manager.ts
@@ -18,6 +18,12 @@
 import { chromium, type Browser, type BrowserContext, type Page, type Locator } from 'playwright';
 import { addConsoleEntry, addNetworkEntry, addDialogEntry, networkBuffer, type DialogEntry } from './buffers';
 
+export interface RefEntry { // one snapshot ref: its locator plus metadata used in the stale-ref error message
+  locator: Locator; // Playwright locator built for this ref when the snapshot was taken
+  role: string; // role of the snapshot node, or 'cursor-interactive' for @c refs
+  name: string; // name of the snapshot node ('' when absent), or the element text for @c refs
+}
+
 export class BrowserManager {
   private browser: Browser | null = null;
   private context: BrowserContext | null = null;
@@ -31,7 +37,7 @@ export class BrowserManager {
   public serverPort: number = 0;
 
   // ─── Ref Map (snapshot → @e1, @e2, @c1, @c2, ...) ────────
-  private refMap: Map<string, Locator> = new Map();
+  private refMap: Map<string, RefEntry> = new Map();
 
   // ─── Snapshot Diffing ─────────────────────────────────────
   // NOT cleared on navigation — it's a text baseline for diffing
@@ -169,7 +175,7 @@ export class BrowserManager {
   }
 
   // ─── Ref Map ──────────────────────────────────────────────
-  setRefMap(refs: Map<string, Locator>) {
+  setRefMap(refs: Map<string, RefEntry>) {
     this.refMap = refs;
   }
 
@@ -181,16 +187,23 @@ export class BrowserManager {
    * Resolve a selector that may be a @ref (e.g., "@e3", "@c1") or a CSS selector.
    * Returns { locator } for refs or { selector } for CSS selectors.
    */
-  resolveRef(selector: string): { locator: Locator } | { selector: string } {
+  async resolveRef(selector: string): Promise<{ locator: Locator } | { selector: string }> {
     if (selector.startsWith('@e') || selector.startsWith('@c')) {
       const ref = selector.slice(1); // "e3" or "c1"
-      const locator = this.refMap.get(ref);
-      if (!locator) {
+      const entry = this.refMap.get(ref);
+      if (!entry) {
+        throw new Error(
+          `Ref ${selector} not found. Run 'snapshot' to get fresh refs.`
+        );
+      }
+      const count = await entry.locator.count();
+      if (count === 0) {
         throw new Error(
-          `Ref ${selector} not found. Page may have changed — run 'snapshot' to get fresh refs.`
+          `Ref ${selector} (${entry.role} "${entry.name}") is stale — element no longer exists. ` +
+          `Run 'snapshot' for fresh refs.`
         );
       }
-      return { locator };
+      return { locator: entry.locator };
     }
     return { selector };
   }

diff --git a/browse/src/meta-commands.ts b/browse/src/meta-commands.ts
@@ -150,7 +150,7 @@ export async function handleMetaCommand(
       }
 
       if (targetSelector) {
-        const resolved = bm.resolveRef(targetSelector);
+        const resolved = await bm.resolveRef(targetSelector);
         const locator = 'locator' in resolved ? resolved.locator : page.locator(resolved.selector);
         await locator.screenshot({ path: outputPath, timeout: 5000 });
         return `Screenshot saved (element): ${outputPath}`;

diff --git a/browse/src/read-commands.ts b/browse/src/read-commands.ts
@@ -61,7 +61,7 @@ export async function handleReadCommand(
     case 'html': {
       const selector = args[0];
       if (selector) {
-        const resolved = bm.resolveRef(selector);
+        const resolved = await bm.resolveRef(selector);
         if ('locator' in resolved) {
           return await resolved.locator.innerHTML({ timeout: 5000 });
         }
@@ -135,7 +135,7 @@ export async function handleReadCommand(
     case 'css': {
       const [selector, property] = args;
       if (!selector || !property) throw new Error('Usage: browse css <selector> <property>');
-      const resolved = bm.resolveRef(selector);
+      const resolved = await bm.resolveRef(selector);
       if ('locator' in resolved) {
         const value = await resolved.locator.evaluate(
           (el, prop) => getComputedStyle(el).getPropertyValue(prop),
@@ -157,7 +157,7 @@ export async function handleReadCommand(
     case 'attrs': {
       const selector = args[0];
       if (!selector) throw new Error('Usage: browse attrs <selector>');
-      const resolved = bm.resolveRef(selector);
+      const resolved = await bm.resolveRef(selector);
       if ('locator' in resolved) {
         const attrs = await resolved.locator.evaluate((el) => {
           const result: Record<string, string> = {};
@@ -221,7 +221,7 @@ export async function handleReadCommand(
       const selector = args[1];
       if (!property || !selector) throw new Error('Usage: browse is <property> <selector>\nProperties: visible, hidden, enabled, disabled, checked, editable, focused');
 
-      const resolved = bm.resolveRef(selector);
+      const resolved = await bm.resolveRef(selector);
       let locator;
       if ('locator' in resolved) {
         locator = resolved.locator;

diff --git a/browse/src/snapshot.ts b/browse/src/snapshot.ts
@@ -18,7 +18,7 @@
  */
 
 import type { Page, Locator } from 'playwright';
-import type { BrowserManager } from './browser-manager';
+import type { BrowserManager, RefEntry } from './browser-manager';
 import * as Diff from 'diff';
 
 // Roles considered "interactive" for the -i flag
@@ -154,7 +154,7 @@ export async function handleSnapshot(
 
   // Parse the ariaSnapshot output
   const lines = ariaText.split('\n');
-  const refMap = new Map<string, Locator>();
+  const refMap = new Map<string, RefEntry>();
   const output: string[] = [];
   let refCounter = 1;
 
@@ -218,7 +218,7 @@ export async function handleSnapshot(
       locator = locator.nth(seenIndex);
     }
 
-    refMap.set(ref, locator);
+    refMap.set(ref, { locator, role: node.role, name: node.name || '' });
 
     // Format output line
     let outputLine = `${indent}@${ref} [${node.role}]`;
@@ -287,7 +287,7 @@ export async function handleSnapshot(
         for (const elem of cursorElements) {
           const ref = `c${cRefCounter++}`;
           const locator = page.locator(elem.selector);
-          refMap.set(ref, locator);
+          refMap.set(ref, { locator, role: 'cursor-interactive', name: elem.text });
           output.push(`@${ref} [${elem.reason}] "${elem.text}"`);
         }
       }
@@ -318,9 +318,9 @@ export async function handleSnapshot(
     try {
       // Inject overlay divs at each ref's bounding box
       const boxes: Array<{ ref: string; box: { x: number; y: number; width: number; height: number } }> = [];
-      for (const [ref, locator] of refMap) {
+      for (const [ref, entry] of refMap) {
         try {
-          const box = await locator.boundingBox({ timeout: 1000 });
+          const box = await entry.locator.boundingBox({ timeout: 1000 });
           if (box) {
             boxes.push({ ref: `@${ref}`, box });
           }