diff --git a/cloudflare-gastown/AGENTS.md b/cloudflare-gastown/AGENTS.md index f06633c06..28af50abb 100644 --- a/cloudflare-gastown/AGENTS.md +++ b/cloudflare-gastown/AGENTS.md @@ -58,6 +58,7 @@ - `${beads}` → bare table name. Use for `FROM`, `INSERT INTO`, `DELETE FROM`. - `${beads.columns.status}` → bare column name. Use for `SET` clauses and `INSERT` column lists where the table is already implied. - `${beads.status}` → qualified `table.column`. Use for `SELECT`, `WHERE`, `JOIN ON`, `ORDER BY`, and anywhere a column could be ambiguous. +- **Do not alias tables in SQL queries.** Always use the full table name and the qualified `${table.column}` interpolator. Aliases like `FROM beads b` combined with the qualified interpolator produce double-qualified names (`b.beads.bead_id`) that SQLite rejects. If a self-join requires disambiguation, give only the second copy of the table a raw string alias and reference its columns by manually prefixing that alias to the bare `${table.columns.col}` interpolator (e.g. `parent.${beads.columns.bead_id}`). - Prefer static queries over dynamically constructed ones. Move conditional logic into the query itself using SQL constructs like `COALESCE`, `CASE`, `NULLIF`, or `WHERE (? IS NULL OR col = ?)` patterns so the full query is always visible as a single readable string. - Always parse query results with the Zod `Record` schemas from `db/tables/*.table.ts`. Never use ad-hoc `as Record` casts or `String(row.col)` to extract fields — use `.pick()` for partial selects and `.array()` for lists, e.g. `BeadRecord.pick({ bead_id: true }).array().parse(rows)`. This keeps row parsing type-safe and co-located with the schema definition. - When a column has a SQL `CHECK` constraint that restricts it to a set of values (i.e. an enum), mirror that in the Record schema using `z.enum()` rather than `z.string()`, e.g. `role: z.enum(['polecat', 'refinery', 'mayor', 'witness'])`. 
diff --git a/cloudflare-gastown/container/src/completion-reporter.ts b/cloudflare-gastown/container/src/completion-reporter.ts
index 8ad4be23b..7a7766e61 100644
--- a/cloudflare-gastown/container/src/completion-reporter.ts
+++ b/cloudflare-gastown/container/src/completion-reporter.ts
@@ -7,6 +7,50 @@
 
 import type { ManagedAgent } from './types';
 
+/**
+ * Notify the TownDO that the mayor has finished processing a prompt and
+ * is now waiting for user input. This lets the TownDO transition the
+ * mayor from "working" to "waiting", which drops the alarm to the idle
+ * cadence and stops health-check pings that reset the container's
+ * sleepAfter timer.
+ *
+ * Best-effort: errors are logged but do not propagate.
+ */
+export async function reportMayorWaiting(agent: ManagedAgent): Promise<void> {
+  const apiUrl = agent.gastownApiUrl;
+  const authToken =
+    process.env.GASTOWN_CONTAINER_TOKEN ?? agent.gastownContainerToken ?? agent.gastownSessionToken;
+  if (!apiUrl || !authToken) {
+    console.warn(
+      `Cannot report mayor ${agent.agentId} waiting: no API credentials on agent record`
+    );
+    return;
+  }
+
+  const url = `${apiUrl}/api/towns/${agent.townId}/rigs/${agent.rigId}/agents/${agent.agentId}/waiting`;
+  try {
+    const response = await fetch(url, {
+      method: 'POST',
+      headers: {
+        'Content-Type': 'application/json',
+        Authorization: `Bearer ${authToken}`,
+      },
+      body: JSON.stringify({ agentId: agent.agentId, firedAt: Date.now() }),
+      // Callers fire this without awaiting it; bound the request so a
+      // stalled connection cannot leave it pending forever.
+      signal: AbortSignal.timeout(10_000),
+    });
+
+    if (!response.ok) {
+      console.warn(
+        `Failed to report mayor ${agent.agentId} waiting: ${response.status} ${response.statusText}`
+      );
+    }
+  } catch (err) {
+    console.warn(`Error reporting mayor ${agent.agentId} waiting:`, err);
+  }
+}
+
 /**
  * Notify the Rig DO that an agent session has completed or failed.
  * Best-effort: errors are logged but do not propagate.
diff --git a/cloudflare-gastown/container/src/control-server.ts b/cloudflare-gastown/container/src/control-server.ts
index 70ba58c62..72e9dcbb7 100644
--- a/cloudflare-gastown/container/src/control-server.ts
+++ b/cloudflare-gastown/container/src/control-server.ts
@@ -10,10 +10,12 @@ import {
   activeServerCount,
   getUptime,
   stopAll,
+  drainAll,
+  isDraining,
   getAgentEvents,
   registerEventSink,
 } from './process-manager';
-import { startHeartbeat, stopHeartbeat } from './heartbeat';
+import { startHeartbeat, stopHeartbeat, notifyContainerReady } from './heartbeat';
 import { pushContext as pushDashboardContext } from './dashboard-context';
 import { mergeBranch, setupRigBrowseWorktree } from './git-manager';
 import {
@@ -46,6 +48,53 @@ export function getCurrentTownConfig(): Record<string, unknown> | null {
   return lastKnownTownConfig;
 }
 
+/**
+ * Sync config-derived env vars from the last-known town config into
+ * process.env. Safe to call at any time — no-ops when no config is cached.
+ */
+function syncTownConfigToProcessEnv(): void {
+  const cfg = getCurrentTownConfig();
+  if (!cfg) return;
+
+  const CONFIG_ENV_MAP: Array<[string, string]> = [
+    ['github_cli_pat', 'GITHUB_CLI_PAT'],
+    ['git_author_name', 'GASTOWN_GIT_AUTHOR_NAME'],
+    ['git_author_email', 'GASTOWN_GIT_AUTHOR_EMAIL'],
+    ['kilocode_token', 'KILOCODE_TOKEN'],
+  ];
+  for (const [cfgKey, envKey] of CONFIG_ENV_MAP) {
+    const val = cfg[cfgKey];
+    if (typeof val === 'string' && val) {
+      process.env[envKey] = val;
+    } else {
+      delete process.env[envKey];
+    }
+  }
+
+  const gitAuth = cfg.git_auth;
+  if (typeof gitAuth === 'object' && gitAuth !== null) {
+    const auth = gitAuth as Record<string, unknown>;
+    for (const [authKey, envKey] of [
+      ['github_token', 'GIT_TOKEN'],
+      ['gitlab_token', 'GITLAB_TOKEN'],
+      ['gitlab_instance_url', 'GITLAB_INSTANCE_URL'],
+    ] as const) {
+      const val = auth[authKey];
+      if (typeof val === 'string' && val) {
+        process.env[envKey] = val;
+      } else {
+        delete process.env[envKey];
+      }
+    }
+  }
+
+  if (cfg.disable_ai_coauthor) {
+    process.env.GASTOWN_DISABLE_AI_COAUTHOR = '1';
+  } else {
+    delete process.env.GASTOWN_DISABLE_AI_COAUTHOR;
+  }
+}
+
 export const app = new Hono();
 
 // Parse and validate town config from X-Town-Config header (sent by TownDO on
@@ -92,11 +141,21 @@ app.use('*', async (c, next) => {
 
 // GET /health
 app.get('/health', c => {
+  // When the TownDO is draining, it passes the drain nonce and town
+  // ID via headers so idle containers (no running agents) can
+  // acknowledge readiness and clear the drain flag.
+  const drainNonce = c.req.header('X-Drain-Nonce');
+  const townId = c.req.header('X-Town-Id');
+  if (drainNonce && townId) {
+    void notifyContainerReady(townId, drainNonce);
+  }
+
   const response: HealthResponse = {
     status: 'ok',
     agents: activeAgentCount(),
     servers: activeServerCount(),
     uptime: getUptime(),
+    draining: isDraining() || undefined,
   };
   return c.json(response);
 });
@@ -133,8 +192,23 @@ app.post('/refresh-token', async c => {
   return c.json({ refreshed: true });
 });
 
+// POST /sync-config
+// Push config-derived env vars from X-Town-Config into process.env on
+// the running container. Called by TownDO.syncConfigToContainer() after
+// persisting env vars to DO storage, so the live process picks up
+// changes (e.g. refreshed KILOCODE_TOKEN) without a container restart.
+app.post('/sync-config', async c => {
+  syncTownConfigToProcessEnv();
+  return c.json({ synced: true });
+});
+
 // POST /agents/start
 app.post('/agents/start', async c => {
+  if (isDraining()) {
+    console.warn('[control-server] /agents/start: rejected — container is draining');
+    return c.json({ error: 'Container is draining, cannot start new agents' }, 503);
+  }
+
   const body: unknown = await c.req.json().catch(() => null);
   const parsed = StartAgentRequest.safeParse(body);
   if (!parsed.success) {
@@ -214,45 +288,7 @@ app.patch('/agents/:agentId/model', async c => {
   // Sync config-derived env vars from X-Town-Config into process.env so
   // the SDK server restart picks up fresh tokens and git identity.
   // The middleware already parsed the header into lastKnownTownConfig.
-  const cfg = getCurrentTownConfig();
-  if (cfg) {
-    const CONFIG_ENV_MAP: Array<[string, string]> = [
-      ['github_cli_pat', 'GITHUB_CLI_PAT'],
-      ['git_author_name', 'GASTOWN_GIT_AUTHOR_NAME'],
-      ['git_author_email', 'GASTOWN_GIT_AUTHOR_EMAIL'],
-    ];
-    for (const [cfgKey, envKey] of CONFIG_ENV_MAP) {
-      const val = cfg[cfgKey];
-      if (typeof val === 'string' && val) {
-        process.env[envKey] = val;
-      } else {
-        delete process.env[envKey];
-      }
-    }
-    // git_auth tokens
-    const gitAuth = cfg.git_auth;
-    if (typeof gitAuth === 'object' && gitAuth !== null) {
-      const auth = gitAuth as Record<string, unknown>;
-      for (const [authKey, envKey] of [
-        ['github_token', 'GIT_TOKEN'],
-        ['gitlab_token', 'GITLAB_TOKEN'],
-        ['gitlab_instance_url', 'GITLAB_INSTANCE_URL'],
-      ] as const) {
-        const val = auth[authKey];
-        if (typeof val === 'string' && val) {
-          process.env[envKey] = val;
-        } else {
-          delete process.env[envKey];
-        }
-      }
-    }
-    // disable_ai_coauthor
-    if (cfg.disable_ai_coauthor) {
-      process.env.GASTOWN_DISABLE_AI_COAUTHOR = '1';
-    } else {
-      delete process.env.GASTOWN_DISABLE_AI_COAUTHOR;
-    }
-  }
+  syncTownConfigToProcessEnv();
 
   await updateAgentModel(
     agentId,
@@ -723,7 +759,7 @@ export function startControlServer(): void {
startHeartbeat(apiUrl, authToken); } - // Handle graceful shutdown + // Handle graceful shutdown (immediate, no drain — used by SIGINT for dev) const shutdown = async () => { console.log('Shutting down control server...'); stopHeartbeat(); @@ -731,7 +767,18 @@ export function startControlServer(): void { process.exit(0); }; - process.on('SIGTERM', () => void shutdown()); + process.on( + 'SIGTERM', + () => + void (async () => { + console.log('[control-server] SIGTERM received — starting graceful drain...'); + stopHeartbeat(); + await drainAll(); + await stopAll(); + process.exit(0); + })() + ); + process.on('SIGINT', () => void shutdown()); // Track connected WebSocket clients with optional agent filter diff --git a/cloudflare-gastown/container/src/git-manager.ts b/cloudflare-gastown/container/src/git-manager.ts index a65d34370..5be0b134a 100644 --- a/cloudflare-gastown/container/src/git-manager.ts +++ b/cloudflare-gastown/container/src/git-manager.ts @@ -275,9 +275,45 @@ async function cloneRepoInner( `Cloning repo for rig ${options.rigId}: hasAuth=${hasAuth} envKeys=[${Object.keys(options.envVars ?? {}).join(',')}]` ); + // Omit --branch: on empty repos (no commits) the default branch doesn't + // exist yet, so `git clone --branch ` would fail with + // "Remote branch not found in upstream origin". await mkdir(dir, { recursive: true }); - await exec('git', ['clone', '--no-checkout', '--branch', options.defaultBranch, authUrl, dir]); + await exec('git', ['clone', '--no-checkout', authUrl, dir]); await configureRepoCredentials(dir, options.gitUrl, options.envVars); + + // Detect empty repo: git rev-parse HEAD fails when there are no commits. + const isEmpty = await exec('git', ['rev-parse', 'HEAD'], dir) + .then(() => false) + .catch(() => true); + + if (isEmpty) { + console.log(`Detected empty repo for rig ${options.rigId}, creating initial commit`); + // Create an initial empty commit so branches/worktrees can be created. 
+ // Use -c flags for user identity (the repo has no config yet and the + // container may not have GIT_AUTHOR_NAME set). + await exec( + 'git', + [ + '-c', + 'user.name=Gastown', + '-c', + 'user.email=gastown@kilo.ai', + 'commit', + '--allow-empty', + '-m', + 'Initial commit', + ], + dir + ); + await exec('git', ['push', 'origin', `HEAD:${options.defaultBranch}`], dir); + // Best-effort: set remote HEAD so future operations know the default branch + await exec('git', ['remote', 'set-head', 'origin', options.defaultBranch], dir).catch(() => {}); + // Fetch so origin/ ref is available locally + await exec('git', ['fetch', 'origin'], dir); + console.log(`Created initial commit on empty repo for rig ${options.rigId}`); + } + console.log(`Cloned repo for rig ${options.rigId}`); return dir; } @@ -303,6 +339,18 @@ async function createWorktreeInner(options: WorktreeOptions): Promise { return dir; } + // Verify the repo has at least one commit. If cloneRepoInner's initial + // commit push failed, there's no HEAD and we can't create branches. + const hasHead = await exec('git', ['rev-parse', '--verify', 'HEAD'], repo) + .then(() => true) + .catch(() => false); + + if (!hasHead) { + throw new Error( + `Cannot create worktree: repo has no commits. Push an initial commit first or re-connect the rig.` + ); + } + // When a startPoint is provided (e.g. a convoy feature branch), create // the new branch from that ref so the agent begins with the latest // merged work from upstream. Without a startPoint, try to track the @@ -398,6 +446,24 @@ async function setupBrowseWorktreeInner(rigId: string, defaultBranch: string): P return browseDir; } + // Check whether origin/ exists. On a repo that was just + // initialized with an empty commit in cloneRepoInner the ref should + // exist, but if the push failed (network, permissions) it may not. 
+ const hasRemoteBranch = await exec( + 'git', + ['rev-parse', '--verify', `origin/${defaultBranch}`], + repo + ) + .then(() => true) + .catch(() => false); + + if (!hasRemoteBranch) { + console.log( + `Skipping browse worktree for rig ${rigId}: origin/${defaultBranch} not found (repo may be empty), will create on next fetch` + ); + return browseDir; + } + // Create a worktree on the default branch for browsing. // Force-create (or reset) the tracking branch to origin/ // so a recreated browse worktree always starts from the latest remote diff --git a/cloudflare-gastown/container/src/heartbeat.ts b/cloudflare-gastown/container/src/heartbeat.ts index bd9dd8db3..ea7fd4e3e 100644 --- a/cloudflare-gastown/container/src/heartbeat.ts +++ b/cloudflare-gastown/container/src/heartbeat.ts @@ -6,6 +6,15 @@ const HEARTBEAT_INTERVAL_MS = 30_000; let heartbeatTimer: ReturnType | null = null; let gastownApiUrl: string | null = null; let sessionToken: string | null = null; +/** Set once we've successfully acknowledged container-ready. */ +let containerReadyAcknowledged = false; + +/** + * Unique ID for this container instance. Generated once at import time. + * Sent with every heartbeat so the TownDO can detect container restarts + * (new instance ID ≠ old one → clear drain flag). + */ +const CONTAINER_INSTANCE_ID = crypto.randomUUID(); /** * Configure and start the heartbeat reporter. @@ -38,6 +47,49 @@ export function stopHeartbeat(): void { console.log('Heartbeat reporter stopped'); } +/** + * Notify the TownDO that the replacement container is ready. + * Exported so the health endpoint can trigger it when the TownDO + * passes the drain nonce via headers (handles idle containers that + * have no running agents and thus no per-agent heartbeats). 
+ */
+export async function notifyContainerReady(townId: string, drainNonce: string): Promise<void> {
+  if (containerReadyAcknowledged) return;
+  await acknowledgeContainerReady(townId, drainNonce);
+}
+
+/**
+ * Call POST /container-ready to acknowledge that this is a fresh
+ * container replacing an evicted one. Clears the TownDO drain flag
+ * so the reconciler can resume dispatching.
+ */
+async function acknowledgeContainerReady(townId: string, drainNonce: string): Promise<void> {
+  const apiUrl = gastownApiUrl ?? process.env.GASTOWN_API_URL;
+  const currentToken = process.env.GASTOWN_CONTAINER_TOKEN ?? sessionToken;
+  if (!apiUrl || !currentToken) return;
+
+  try {
+    const response = await fetch(`${apiUrl}/api/towns/${townId}/container-ready`, {
+      method: 'POST',
+      headers: {
+        'Content-Type': 'application/json',
+        Authorization: `Bearer ${currentToken}`,
+      },
+      body: JSON.stringify({ nonce: drainNonce }),
+    });
+    if (response.ok) {
+      containerReadyAcknowledged = true;
+      console.log(`[heartbeat] container-ready acknowledged for town=${townId}`);
+    } else {
+      console.warn(
+        `[heartbeat] container-ready failed for town=${townId}: ${response.status} ${response.statusText}`
+      );
+    }
+  } catch (err) {
+    console.warn(`[heartbeat] container-ready error for town=${townId}:`, err);
+  }
+}
+
 async function sendHeartbeats(): Promise<void> {
   // Prefer the live container token (refreshed via POST /refresh-token)
   // over the token captured at startHeartbeat() time.
@@ -46,6 +98,12 @@ async function sendHeartbeats(): Promise<void> {
 
   const active = listAgents().filter(a => a.status === 'running' || a.status === 'starting');
 
+  // When no agents are active, the per-agent heartbeat loop has
+  // nothing to send. Idle container drain acknowledgment is handled
+  // by the /health endpoint instead (the TownDO passes the nonce via
+  // X-Drain-Nonce headers in ensureContainerReady).
+ if (active.length === 0) return; + for (const agent of active) { const payload: HeartbeatPayload = { agentId: agent.agentId, @@ -57,6 +115,7 @@ async function sendHeartbeats(): Promise { lastEventAt: agent.lastEventAt ?? null, activeTools: agent.activeTools ?? [], messageCount: agent.messageCount ?? 0, + containerInstanceId: CONTAINER_INSTANCE_ID, }; try { @@ -77,6 +136,18 @@ async function sendHeartbeats(): Promise { console.warn( `Heartbeat failed for agent ${agent.agentId}: ${response.status} ${response.statusText}` ); + } else if (!containerReadyAcknowledged) { + // If the TownDO is draining, the heartbeat response includes a + // drainNonce. Use it to call /container-ready and clear drain. + try { + const body = (await response.json()) as { data?: { drainNonce?: string } }; + const nonce = body?.data?.drainNonce; + if (nonce) { + void acknowledgeContainerReady(agent.townId, nonce); + } + } catch { + // Non-JSON or unexpected shape — ignore + } } } catch (err) { console.warn(`Heartbeat error for agent ${agent.agentId}:`, err); diff --git a/cloudflare-gastown/container/src/main.ts b/cloudflare-gastown/container/src/main.ts index fa03c4e31..8b2215019 100644 --- a/cloudflare-gastown/container/src/main.ts +++ b/cloudflare-gastown/container/src/main.ts @@ -8,4 +8,8 @@ process.on('uncaughtException', err => { process.exit(1); }); +process.on('SIGTERM', () => { + console.log('SIGTERM received — starting graceful drain...'); +}); + startControlServer(); diff --git a/cloudflare-gastown/container/src/process-manager.ts b/cloudflare-gastown/container/src/process-manager.ts index f0008e320..26f49b550 100644 --- a/cloudflare-gastown/container/src/process-manager.ts +++ b/cloudflare-gastown/container/src/process-manager.ts @@ -9,7 +9,7 @@ import { createKilo, type KiloClient } from '@kilocode/sdk'; import { z } from 'zod'; import type { ManagedAgent, StartAgentRequest } from './types'; -import { reportAgentCompleted } from './completion-reporter'; +import { 
reportAgentCompleted, reportMayorWaiting } from './completion-reporter'; import { buildKiloConfigContent } from './agent-runner'; import { log } from './logger'; @@ -35,9 +35,27 @@ const eventSinks = new Set<(agentId: string, event: string, data: unknown) => vo // Per-agent idle timers — fires exit when no nudges arrive const idleTimers = new Map>(); +// Server-level lifecycle events that should NOT cancel an agent's idle +// timer. These fire periodically (heartbeat) or on connect and don't +// represent actual agent work. Includes runtime-only types that aren't +// in the SDK's TS union (e.g. 'server.heartbeat'). +const IDLE_TIMER_IGNORE_EVENTS = new Set([ + 'server.heartbeat', + 'server.connected', + 'server.instance.disposed', +]); + let nextPort = 4096; const startTime = Date.now(); +// Set to true when drainAll() starts — prevents new agent starts and +// lets the drain loop nudge agents that transition to running mid-drain. +let _draining = false; + +export function isDraining(): boolean { + return _draining; +} + // Mutex for ensureSDKServer — createKilo() reads process.cwd() and // process.env during startup, so concurrent calls with different workdirs // would corrupt each other's globals. This serializes server creation only; @@ -263,7 +281,7 @@ async function fetchPendingNudges( }; const resp = await fetch( `${agent.gastownApiUrl}/api/towns/${agent.townId}/rigs/${agent.rigId}/agents/${agent.agentId}/pending-nudges`, - { headers } + { headers, signal: AbortSignal.timeout(10_000) } ); if (!resp.ok) { console.warn( @@ -315,6 +333,48 @@ async function markNudgeDelivered(agent: ManagedAgent, nudgeId: string): Promise } } +/** + * Write eviction context on the agent's bead so the next agent dispatched + * to it knows there is WIP code pushed to a branch. Appends a note to the + * bead's body via the Gastown API. + * Best-effort: errors are logged but never propagated. 
+ */
+async function writeEvictionCheckpoint(
+  agent: ManagedAgent,
+  context: { branch: string; agent_name: string; saved_at: string }
+): Promise<void> {
+  const authToken =
+    process.env.GASTOWN_CONTAINER_TOKEN ?? agent.gastownContainerToken ?? agent.gastownSessionToken;
+  if (!agent.gastownApiUrl || !authToken || !agent.townId || !agent.rigId) {
+    console.warn(
+      `${MANAGER_LOG} writeEvictionCheckpoint: missing API credentials for ${agent.agentId}`
+    );
+    return;
+  }
+
+  try {
+    const resp = await fetch(
+      `${agent.gastownApiUrl}/api/towns/${agent.townId}/rigs/${agent.rigId}/agents/${agent.agentId}/eviction-context`,
+      {
+        method: 'POST',
+        headers: {
+          'Content-Type': 'application/json',
+          Authorization: `Bearer ${authToken}`,
+          'X-Gastown-Agent-Id': agent.agentId,
+          'X-Gastown-Rig-Id': agent.rigId,
+        },
+        body: JSON.stringify(context),
+        signal: AbortSignal.timeout(5_000),
+      }
+    );
+    if (!resp.ok) {
+      console.warn(`${MANAGER_LOG} writeEvictionCheckpoint: ${resp.status} for ${agent.agentId}`);
+    }
+  } catch (err) {
+    console.warn(`${MANAGER_LOG} writeEvictionCheckpoint: error for ${agent.agentId}:`, err);
+  }
+}
+
 /**
  * Clear the idle timer for an agent (if any).
  */
@@ -340,7 +400,10 @@ async function handleIdleEvent(agent: ManagedAgent, onExit: () => void): Promise
   const agentId = agent.agentId;
   console.log(`${MANAGER_LOG} handleIdleEvent: checking nudges for agent ${agentId}`);
 
-  const nudges = await fetchPendingNudges(agent);
+  // During drain, skip the nudge fetch — it can hang if the container
+  // runtime's outbound networking is degraded after SIGTERM. The agent
+  // finished its work; just start the idle timer so it exits promptly.
+  const nudges = _draining ? null : await fetchPendingNudges(agent);
 
   if (nudges === null) {
     // Error fetching — treat as no nudges, start idle timer
@@ -370,19 +433,23 @@ async function handleIdleEvent(agent: ManagedAgent, onExit: () => void): Promise
   }
 
   // No nudges (or fetch error) — (re)start the idle timeout.
- // Refineries get a longer timeout because their workflow is multi-step - // (diff → analyze → decide → merge/rework). The 2-min default kills the - // session between LLM turns when the refinery responds with text before - // issuing a tool call. See #1342. + // During drain, use a short idle timeout. Agents aren't nudged — they + // complete naturally — so this idle means the agent is done with its + // current work and can exit promptly. clearIdleTimer(agentId); - const timeoutMs = - agent.role === 'refinery' - ? process.env.REFINERY_IDLE_TIMEOUT_MS !== undefined - ? Number(process.env.REFINERY_IDLE_TIMEOUT_MS) - : 600_000 - : process.env.AGENT_IDLE_TIMEOUT_MS !== undefined - ? Number(process.env.AGENT_IDLE_TIMEOUT_MS) - : 120_000; + let timeoutMs: number; + if (_draining) { + timeoutMs = 10_000; + } else { + timeoutMs = + agent.role === 'refinery' + ? process.env.REFINERY_IDLE_TIMEOUT_MS !== undefined + ? Number(process.env.REFINERY_IDLE_TIMEOUT_MS) + : 600_000 + : process.env.AGENT_IDLE_TIMEOUT_MS !== undefined + ? Number(process.env.AGENT_IDLE_TIMEOUT_MS) + : 120_000; + } console.log( `${MANAGER_LOG} handleIdleEvent: no nudges for ${agentId}, idle timeout in ${timeoutMs}ms` @@ -485,14 +552,21 @@ async function subscribeToEvents( if (event.type === 'session.idle') { if (request.role === 'mayor') { // Mayor agents are persistent — session.idle means "turn done", not exit. + // Notify the TownDO so it can transition the mayor to "waiting" + // (alive in container, not doing LLM work). This lets the alarm + // drop to the idle cadence and stops health-check pings that + // would reset the container's sleepAfter timer. + void reportMayorWaiting(agent); continue; } // Non-mayor: check for pending nudges before deciding to exit. // handleIdleEvent is async; we run it in the background so the event // loop continues. The exitAgent callback will abort the stream if needed. 
void handleIdleEvent(agent, exitAgent); - } else { - // Non-idle event means the agent resumed work — cancel any pending idle timer. + } else if (!IDLE_TIMER_IGNORE_EVENTS.has(event.type ?? '')) { + // Non-idle event means the agent resumed work — cancel any pending + // idle timer. But skip server-level lifecycle events (heartbeats, + // connections) that don't represent actual agent activity. clearIdleTimer(agent.agentId); } @@ -613,6 +687,15 @@ export async function startAgent( // 3. Subscribe to events (async, runs in background) void subscribeToEvents(client, agent, request); + // Mark as running BEFORE the initial prompt. The event subscription + // is already active and events may be flowing (the agent is + // functionally running). session.prompt() can block if the SDK + // server is busy, which would leave the agent stuck in 'starting' + // despite being active — causing the drain to wait indefinitely. + if (agent.status === 'starting') { + agent.status = 'running'; + } + // 4. Send the initial prompt // The model string is an OpenRouter-style ID like "anthropic/claude-sonnet-4.6". // The kilo provider (which wraps OpenRouter) takes the FULL model string as modelID. @@ -631,9 +714,15 @@ export async function startAgent( }, }); - if (agent.status === 'starting') { - agent.status = 'running'; + // If the event stream errored while we were awaiting the prompt, + // the stream-error handler already set the agent to 'failed', + // reported completion, and decremented sessionCount. Mark + // sessionCounted false so the catch block doesn't double-decrement. 
/**
 * Gracefully drain all running agents before container eviction.
 *
 * 3-phase sequence:
 *   1. Notify TownDO of the eviction (blocks new dispatch)
 *   2. Wait up to 5 min for non-mayor agents to finish naturally
 *   3. Force-save any stragglers:
 *        3a. freeze (cancel idle timer, abort event stream + SDK session)
 *        3b. snapshot (WIP git commit, plus push when an origin remote exists)
 *        3c. write eviction context for polecats whose push succeeded
 *        3d. report non-mayor/non-triage agents as completed
 *
 * No nudging — agents complete their current work via gt_done and
 * exit through the normal idle timeout path. The TownDO's draining
 * flag prevents new work from being dispatched.
 *
 * Never throws — all errors are logged and swallowed so the caller
 * can always proceed to stopAll() + process.exit().
 */
export async function drainAll(): Promise<void> {
  const DRAIN_LOG = '[drain]';
  _draining = true;

  // ── Phase 1: Notify TownDO ──────────────────────────────────────────
  try {
    const apiUrl = process.env.GASTOWN_API_URL;
    const token = process.env.GASTOWN_CONTAINER_TOKEN;
    // Grab townId from any registered agent — all agents in a container
    // belong to the same town.
    const anyAgent = [...agents.values()][0];
    const townId = anyAgent?.townId;

    if (apiUrl && token && townId) {
      console.log(`${DRAIN_LOG} Phase 1: notifying TownDO of container eviction`);
      const resp = await fetch(`${apiUrl}/api/towns/${townId}/container-eviction`, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          Authorization: `Bearer ${token}`,
        },
        signal: AbortSignal.timeout(10_000),
      });
      console.log(`${DRAIN_LOG} Phase 1: TownDO responded ${resp.status}`);
    } else {
      console.warn(
        `${DRAIN_LOG} Phase 1: skipping TownDO notification (missing apiUrl=${!!apiUrl} token=${!!token} townId=${!!townId})`
      );
    }
  } catch (err) {
    console.warn(`${DRAIN_LOG} Phase 1: TownDO notification failed, continuing:`, err);
  }

  // ── Phase 2: Wait for agents to finish their current work ───────────
  // No nudging — agents complete naturally (call gt_done, go idle, etc.).
  // The TownDO's draining flag blocks new dispatch so no new work starts.
  // We just give them time to wrap up, then Phase 3 force-saves stragglers.
  const DRAIN_WAIT_MS = 5 * 60 * 1000;
  const pollInterval = 5000;
  const start = Date.now();

  const allAgents = [...agents.values()];
  console.log(
    `${DRAIN_LOG} Phase 2: waiting up to ${DRAIN_WAIT_MS / 1000}s for non-mayor agents to finish. ` +
      `Statuses: ${allAgents.map(a => `${a.role}:${a.agentId.slice(0, 8)}=${a.status}`).join(', ')}`
  );

  while (Date.now() - start < DRAIN_WAIT_MS) {
    const active = [...agents.values()].filter(
      a => (a.status === 'running' || a.status === 'starting') && a.role !== 'mayor'
    );
    if (active.length === 0) break;

    // If every active agent already has an idle timer running, they've
    // finished their work and are just waiting for the drain-mode idle
    // timer to fire via the normal completion path. Poll more frequently
    // so we notice the exit promptly, but don't break to Phase 3 — that
    // would force-save WIP commits on agents that are about to exit cleanly.
    if (active.every(a => idleTimers.has(a.agentId))) {
      console.log(
        `${DRAIN_LOG} All ${active.length} non-mayor agents are idle (timers pending), waiting for clean exit`
      );
      await new Promise(r => setTimeout(r, 1000));
      continue;
    }

    console.log(
      `${DRAIN_LOG} Waiting for ${active.length} non-mayor agents: ` +
        active.map(a => `${a.role}:${a.agentId.slice(0, 8)}=${a.status}`).join(', ')
    );
    await new Promise(r => setTimeout(r, pollInterval));
  }

  // ── Phase 3: Force-save remaining agents ────────────────────────────
  // Two sub-steps: first freeze all stragglers (cancel idle timers,
  // abort event subscriptions and SDK sessions), then snapshot each
  // worktree. Freezing first prevents the normal completion path
  // (idle timer → onExit → bead completion) from racing with the WIP
  // git save, and avoids .git/index.lock collisions with agent git ops.
  // Note: unlike Phase 2, this filter does not exclude the mayor.
  const stragglers = [...agents.values()].filter(
    a => a.status === 'running' || a.status === 'starting'
  );
  if (stragglers.length > 0) {
    console.log(`${DRAIN_LOG} Phase 3: freezing ${stragglers.length} straggler(s)`);
  } else {
    console.log(`${DRAIN_LOG} Phase 3: all agents finished, no force-save needed`);
  }

  // 3a: Freeze — cancel idle timers and abort sessions so no
  // completion/exit callbacks can fire during the git snapshot.
  // Only agents that freeze successfully are safe to snapshot.
  const frozen: typeof stragglers = [];
  for (const agent of stragglers) {
    try {
      // Cancel idle timer FIRST — prevents the timer from firing and
      // marking the agent as completed via onExit() while we abort.
      clearIdleTimer(agent.agentId);

      // Abort event subscription
      const controller = eventAbortControllers.get(agent.agentId);
      if (controller) {
        controller.abort();
        eventAbortControllers.delete(agent.agentId);
      }

      // Abort the SDK session
      const instance = sdkInstances.get(agent.workdir);
      if (instance) {
        await instance.client.session.abort({
          path: { id: agent.sessionId },
        });
      }

      agent.status = 'exited';
      agent.exitReason = 'container eviction';
      frozen.push(agent);
      console.log(`${DRAIN_LOG} Phase 3: froze agent ${agent.agentId}`);
    } catch (err) {
      // Freeze failed — the session may still be writing to the
      // worktree. Skip this agent in 3b to avoid .git/index.lock
      // races and partial snapshots.
      console.warn(
        `${DRAIN_LOG} Phase 3: failed to freeze agent ${agent.agentId}, skipping snapshot:`,
        err
      );
    }
  }

  // 3b: Snapshot — git add/commit/push each worktree now that
  // all sessions are frozen. Only iterate agents that froze
  // successfully; unfrozen agents are skipped to avoid racing
  // with a still-active SDK session.
  for (const agent of frozen) {
    try {
      console.log(`${DRAIN_LOG} Phase 3: force-saving agent ${agent.agentId} in ${agent.workdir}`);

      // Check whether a remote named "origin" exists. Lightweight
      // workspaces (mayor/triage) are created with `git init` and
      // never add a remote, so pushing would fail with
      // "fatal: 'origin' does not appear to be a git repository".
      const remoteCheck = Bun.spawn(['git', 'remote', 'get-url', 'origin'], {
        cwd: agent.workdir,
        stdout: 'pipe',
        stderr: 'pipe',
      });
      const hasOrigin = (await remoteCheck.exited) === 0;

      const gitCmd = hasOrigin
        ? "git add -A && git commit --allow-empty -m 'WIP: container eviction save' && git push --set-upstream origin HEAD"
        : "git add -A && git commit --allow-empty -m 'WIP: container eviction save'";

      if (!hasOrigin && agent.role !== 'mayor' && agent.role !== 'triage') {
        console.warn(
          `${DRAIN_LOG} Phase 3: no origin remote for ${agent.role} agent ${agent.agentId}, committing locally only (push skipped)`
        );
      }

      // Use the agent's startup env for git author/committer identity.
      const gitEnv: Record<string, string | undefined> = { ...process.env };
      const authorName =
        agent.startupEnv?.GIT_AUTHOR_NAME ?? process.env.GASTOWN_GIT_AUTHOR_NAME ?? 'Gastown';
      const authorEmail =
        agent.startupEnv?.GIT_AUTHOR_EMAIL ??
        process.env.GASTOWN_GIT_AUTHOR_EMAIL ??
        'gastown@kilo.ai';
      gitEnv.GIT_AUTHOR_NAME = authorName;
      gitEnv.GIT_COMMITTER_NAME = authorName;
      gitEnv.GIT_AUTHOR_EMAIL = authorEmail;
      gitEnv.GIT_COMMITTER_EMAIL = authorEmail;

      const proc = Bun.spawn(['bash', '-c', gitCmd], {
        cwd: agent.workdir,
        stdout: 'pipe',
        stderr: 'pipe',
        env: gitEnv,
      });
      // Consume both pipes WHILE waiting for exit. Waiting for exit first
      // leaves the pipes unread, so a chatty git (e.g. push progress on
      // stderr) could block on a full pipe and never exit.
      const [exitCode, stdout, stderr] = await Promise.all([
        proc.exited,
        new Response(proc.stdout).text(),
        new Response(proc.stderr).text(),
      ]);
      console.log(
        `${DRAIN_LOG} Phase 3: agent ${agent.agentId} git save exited ${exitCode}` +
          (stdout ? ` stdout=${stdout.trim()}` : '') +
          (stderr ? ` stderr=${stderr.trim()}` : '')
      );

      // 3c: Write eviction context on the bead so the next agent
      // dispatched to it knows there is WIP code on the branch.
      // Must happen BEFORE reportAgentCompleted (which unhooks the agent).
      if (hasOrigin && exitCode === 0 && agent.role === 'polecat') {
        const branchProc = Bun.spawn(['git', 'rev-parse', '--abbrev-ref', 'HEAD'], {
          cwd: agent.workdir,
          stdout: 'pipe',
          stderr: 'pipe',
        });
        const [branchExit, branchOut] = await Promise.all([
          branchProc.exited,
          new Response(branchProc.stdout).text(),
        ]);
        const branchName = branchOut.trim();

        // Only record a checkpoint when the branch name was actually
        // resolved; an empty name would send the next agent to a
        // branch that does not exist.
        if (branchExit === 0 && branchName) {
          console.log(
            `${DRAIN_LOG} Phase 3: writing eviction context for agent ${agent.agentId}: branch=${branchName}`
          );
          await writeEvictionCheckpoint(agent, {
            branch: branchName,
            agent_name: agent.name,
            saved_at: new Date().toISOString(),
          });
        } else {
          console.warn(
            `${DRAIN_LOG} Phase 3: could not resolve branch for agent ${agent.agentId} (rev-parse exited ${branchExit}), skipping eviction context`
          );
        }
      }

      // 3d: Report the agent as completed so the TownDO can unhook it
      // and transition the bead. Without this, the bead stays in_progress
      // and the agent stays working until stale-bead recovery kicks in.
      if (agent.role !== 'mayor' && agent.role !== 'triage') {
        await reportAgentCompleted(agent, 'completed', 'container eviction');
      }
    } catch (err) {
      console.warn(`${DRAIN_LOG} Phase 3: force-save failed for agent ${agent.agentId}:`, err);
    }
  }

  console.log(`${DRAIN_LOG} Drain complete`);
}
/**
 * SQL statements that add the dispatch-tracking columns
 * (`dispatch_attempts`, `last_dispatch_attempt_at`) to the `beads` table.
 *
 * The statements are returned in the order they should be executed; this
 * function does not run them.
 *
 * NOTE(review): `ADD COLUMN` has no `IF NOT EXISTS` guard here, so the
 * caller presumably tolerates a "duplicate column" failure on databases
 * that already have these columns — confirm against the caller.
 */
export function migrateBeads(): string[] {
  const addDispatchAttempts = `ALTER TABLE beads ADD COLUMN dispatch_attempts integer not null default 0`;
  const addLastDispatchAttemptAt = `ALTER TABLE beads ADD COLUMN last_dispatch_attempt_at text`;

  const statements: string[] = [];
  statements.push(addDispatchAttempts);
  statements.push(addLastDispatchAttemptAt);
  return statements;
}
ACTIVE_ALARM_INTERVAL_MS = 5_000; // 5s when agents are active -const IDLE_ALARM_INTERVAL_MS = 1 * 60_000; // 1m when idle +const IDLE_ALARM_INTERVAL_MS = 5 * 60_000; // 5m when idle (no working agents) // Escalation constants const STALE_ESCALATION_THRESHOLD_MS = 4 * 60 * 60 * 1000; @@ -504,6 +509,11 @@ export class TownDO extends DurableObject { const townConfig = await config.getTownConfig(this.ctx.storage); this._ownerUserId = townConfig.owner_user_id; + // Load persisted draining flag, nonce, and start time + this._draining = (await this.ctx.storage.get('town:draining')) ?? false; + this._drainNonce = (await this.ctx.storage.get('town:drainNonce')) ?? null; + this._drainStartedAt = (await this.ctx.storage.get('town:drainStartedAt')) ?? null; + // All tables are now initialized via beads.initBeadTables(): // beads, bead_events, bead_dependencies, agent_metadata, review_metadata, // escalation_metadata, convoy_metadata @@ -537,6 +547,15 @@ export class TownDO extends DurableObject { private _townId: string | null = null; private _lastReconcilerMetrics: reconciler.ReconcilerMetrics | null = null; private _dashboardContext: string | null = null; + /** Monotonic timestamp of the last working → transition for the mayor. + * Used to reject stale session.idle callbacks that arrive after a new + * prompt has already re-activated the mayor. */ + private _mayorWorkingSince = 0; + private _draining = false; + private _drainNonce: string | null = null; + private _drainStartedAt: number | null = null; + /** Instance UUID of the current container, set by the first heartbeat. */ + private _containerInstanceId: string | null = null; private get townId(): string { return this._townId ?? this.ctx.id.name ?? 
// ══════════════════════════════════════════════════════════════════
// Container Eviction (graceful drain)
// ══════════════════════════════════════════════════════════════════

/**
 * Record a container eviction event and set the draining flag.
 * Called by the container when it receives SIGTERM. While draining,
 * the reconciler skips dispatch to prevent new work from starting.
 *
 * Inserts a `container_eviction` town event, updates the in-memory
 * drain state, and persists the flag, nonce and start time under the
 * `town:draining`, `town:drainNonce` and `town:drainStartedAt` keys.
 *
 * @returns A freshly generated drain nonce that must be presented via
 *   `acknowledgeContainerReady()` to clear the drain flag. This
 *   prevents stale heartbeats from the dying container from
 *   prematurely re-enabling dispatch.
 */
async recordContainerEviction(): Promise<string> {
  events.insertEvent(this.sql, 'container_eviction', {});

  const nonce = crypto.randomUUID();
  const startedAt = Date.now();
  this._draining = true;
  this._drainNonce = nonce;
  this._drainStartedAt = startedAt;

  // Persist all three keys in ONE put so they land atomically. With
  // separately awaited puts, a crash in between could leave
  // `town:draining = true` stored without its nonce; after a restart
  // no caller could then present a matching nonce.
  await this.ctx.storage.put({
    'town:draining': true,
    'town:drainNonce': nonce,
    'town:drainStartedAt': startedAt,
  });

  console.log(`${TOWN_LOG} recordContainerEviction: draining flag set, nonce=${nonce}`);
  return nonce;
}
/**
 * Acknowledge that the replacement container is ready. Clears the
 * draining flag only if the provided nonce matches the one generated
 * during `recordContainerEviction()`, so a stale caller holding no
 * nonce (or an old one) cannot re-enable dispatch.
 *
 * @param nonce - Drain nonce previously returned by
 *   `recordContainerEviction()`.
 * @returns `true` when the town is not draining (no-op) or the drain
 *   was cleared; `false` when the nonce does not match.
 */
async acknowledgeContainerReady(nonce: string): Promise<boolean> {
  if (!this._draining) {
    console.log(`${TOWN_LOG} acknowledgeContainerReady: not draining, noop`);
    return true;
  }
  if (nonce !== this._drainNonce) {
    console.warn(
      `${TOWN_LOG} acknowledgeContainerReady: nonce mismatch (got=${nonce}, expected=${this._drainNonce})`
    );
    return false;
  }

  this._draining = false;
  this._drainNonce = null;
  this._drainStartedAt = null;

  // Issue both writes before awaiting either so the storage layer can
  // submit them as one atomic batch. Awaiting each write separately
  // could persist `draining = false` while a stale nonce / start time
  // is still stored (or the reverse) if the object dies in between.
  await Promise.all([
    this.ctx.storage.put('town:draining', false),
    this.ctx.storage.delete(['town:drainNonce', 'town:drainStartedAt']),
  ]);

  console.log(`${TOWN_LOG} acknowledgeContainerReady: draining flag cleared`);
  return true;
}

/** Whether the town is in draining mode (container eviction in progress). */
async isDraining(): Promise<boolean> {
  return this._draining;
}

/** The current drain nonce (null when not draining). */
async getDrainNonce(): Promise<string | null> {
  return this._drainNonce;
}

/** When the drain started (epoch ms), or null when not draining. */
async getDrainStartedAt(): Promise<number | null> {
  return this._drainStartedAt;
}
POST /sync-config — hot-swaps process.env on the running container */ async syncConfigToContainer(): Promise { const townId = this.townId; @@ -606,8 +700,7 @@ export class TownDO extends DurableObject { const townConfig = await this.getTownConfig(); const container = getTownContainerStub(this.env, townId); - // Map config fields to their container env var equivalents. - // When a value is set, push it; when cleared, remove it. + // Phase 1: Persist to DO storage for next boot. const envMapping: Array<[string, string | undefined]> = [ ['GIT_TOKEN', townConfig.git_auth?.github_token], ['GITLAB_TOKEN', townConfig.git_auth?.gitlab_token], @@ -616,6 +709,7 @@ export class TownDO extends DurableObject { ['GASTOWN_GIT_AUTHOR_NAME', townConfig.git_author_name], ['GASTOWN_GIT_AUTHOR_EMAIL', townConfig.git_author_email], ['GASTOWN_DISABLE_AI_COAUTHOR', townConfig.disable_ai_coauthor ? '1' : undefined], + ['KILOCODE_TOKEN', townConfig.kilocode_token], ]; for (const [key, value] of envMapping) { @@ -629,6 +723,26 @@ export class TownDO extends DurableObject { console.warn(`[Town.do] syncConfigToContainer: ${key} sync failed:`, err); } } + + // Phase 2: Push to the running container's process.env via the + // /sync-config endpoint. The X-Town-Config header delivers the + // full config; the endpoint applies CONFIG_ENV_MAP to process.env. + try { + const containerConfig = await config.buildContainerConfig(this.ctx.storage, this.env); + await container.fetch('http://container/sync-config', { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + 'X-Town-Config': JSON.stringify(containerConfig), + }, + }); + } catch (err) { + // Best-effort — container may not be running yet. + console.warn( + `[Town.do] syncConfigToContainer: /sync-config push failed:`, + err instanceof Error ? 
err.message : err + ); + } } // ══════════════════════════════════════════════════════════════════ @@ -812,7 +926,12 @@ export class TownDO extends DurableObject { return beadOps.listBeads(this.sql, filter); } - async updateBeadStatus(beadId: string, status: string, agentId: string): Promise { + async updateBeadStatus( + beadId: string, + status: string, + agentId: string, + failureReason?: FailureReason + ): Promise { // Record terminal transitions as bead_cancelled events for the reconciler. // Non-terminal transitions are normal lifecycle changes, not cancellations. if (status === 'closed' || status === 'failed') { @@ -824,7 +943,7 @@ export class TownDO extends DurableObject { // Convoy progress is updated automatically inside beadOps.updateBeadStatus // when the bead reaches a terminal status (closed/failed). - const bead = beadOps.updateBeadStatus(this.sql, beadId, status, agentId); + const bead = beadOps.updateBeadStatus(this.sql, beadId, status, agentId, failureReason); if (status === 'closed') { const durationMs = Date.now() - new Date(bead.created_at).getTime(); @@ -915,6 +1034,15 @@ export class TownDO extends DurableObject { }>, actorId: string ): Promise { + // Record terminal transitions as bead_cancelled events for the reconciler, + // matching the behaviour of updateBeadStatus (the dedicated status method). + if (fields.status === 'closed' || fields.status === 'failed') { + events.insertEvent(this.sql, 'bead_cancelled', { + bead_id: beadId, + payload: { cancel_status: fields.status }, + }); + } + const bead = beadOps.updateBeadFields(this.sql, beadId, fields, actorId); // When a bead closes via field update, check for newly unblocked beads @@ -1103,18 +1231,84 @@ export class TownDO extends DurableObject { return agents.readCheckpoint(this.sql, agentId); } + /** + * Append eviction context to a bead's body so the next agent dispatched + * to it knows there is WIP code on a branch. 
/**
 * Append eviction context to a bead's body so the next agent dispatched
 * to it knows there is WIP code on a branch. Called by the container's
 * Phase 3 force-save (see `drainAll`) after pushing the WIP commit.
 *
 * No-op when the agent is unknown, has no hooked bead, or the hooked
 * bead cannot be loaded.
 *
 * @param agentId - Agent whose currently hooked bead receives the note.
 * @param context - Branch name, agent display name and save timestamp
 *   that are interpolated into the note.
 */
async writeBeadEvictionContext(
  agentId: string,
  context: { branch: string; agent_name: string; saved_at: string }
): Promise<void> {
  const agent = agents.getAgent(this.sql, agentId);
  if (!agent?.current_hook_bead_id) return;
  const bead = beadOps.getBead(this.sql, agent.current_hook_bead_id);
  if (!bead) return;
  const evictionNote =
    `\n\n---\n**Container eviction note:** ${context.agent_name} pushed WIP progress ` +
    `to branch \`${context.branch}\` before container eviction at ${context.saved_at}. ` +
    `Pick up from where they left off — pull the branch and continue the work.`;
  // The note is appended, never replaced, so earlier body text survives.
  const updatedBody = (bead.body ?? '') + evictionNote;
  beadOps.updateBeadFields(this.sql, bead.bead_id, { body: updatedBody }, 'system');
}

// ── Heartbeat ─────────────────────────────────────────────────────

/**
 * Update an agent's heartbeat timestamp. Returns the current drain
 * nonce (if draining) so the caller can include it in the HTTP
 * response without a second RPC — preventing a TOCTOU race where
 * an in-flight heartbeat from the old container could observe a
 * nonce generated between two separate DO calls.
 *
 * When the heartbeat carries a `containerInstanceId` that differs from
 * the last one recorded, the new ID is persisted; if the town was
 * draining and a previous ID was known, the drain state is cleared.
 *
 * @param agentId - Agent whose heartbeat is being recorded.
 * @param watermark - Optional activity watermark forwarded to
 *   `agents.touchAgent`, plus the reporting container's instance ID.
 * @returns The drain nonce as of the end of this call (null when not
 *   draining).
 */
async touchAgentHeartbeat(
  agentId: string,
  watermark?: {
    lastEventType?: string | null;
    lastEventAt?: string | null;
    activeTools?: string[];
    containerInstanceId?: string;
  }
): Promise<{ drainNonce: string | null }> {
  agents.touchAgent(this.sql, agentId, watermark);
  await this.armAlarmIfNeeded();

  // Detect container restarts via instance ID change. The instance ID
  // is persisted so it survives DO restarts (unlike in-memory only).
  if (watermark?.containerInstanceId) {
    // Hydrate from storage on first access after DO restart
    if (this._containerInstanceId === null) {
      this._containerInstanceId =
        (await this.ctx.storage.get<string>('town:containerInstanceId')) ?? null;
    }

    if (
      this._draining &&
      this._containerInstanceId &&
      watermark.containerInstanceId !== this._containerInstanceId
    ) {
      // New container started — clear drain flag. This supplements the
      // nonce handshake (acknowledgeContainerReady) as a faster path
      // that does not require an explicit /container-ready call.
      this._draining = false;
      this._drainNonce = null;
      this._drainStartedAt = null;
      // Issue both writes before awaiting so they are submitted as one
      // atomic batch; separately awaited writes could persist a cleared
      // flag alongside a stale nonce if the object dies in between.
      await Promise.all([
        this.ctx.storage.put('town:draining', false),
        this.ctx.storage.delete(['town:drainNonce', 'town:drainStartedAt']),
      ]);
      console.log(
        `${TOWN_LOG} heartbeat: new container instance ${watermark.containerInstanceId} (was ${this._containerInstanceId}), clearing drain flag`
      );
    }

    if (watermark.containerInstanceId !== this._containerInstanceId) {
      this._containerInstanceId = watermark.containerInstanceId;
      await this.ctx.storage.put('town:containerInstanceId', watermark.containerInstanceId);
    }
  }

  return { drainNonce: this._drainNonce };
}
/**
 * Move the mayor from "working" to "waiting" after its session goes
 * idle (turn finished, awaiting user input).
 *
 * The call is a no-op when no mayor can be resolved, when the resolved
 * agent is not a mayor, when the mayor is not currently "working", or
 * when the callback is stale (see `firedAt`).
 *
 * @param agentId - Mayor agent ID. When omitted/empty, the first agent
 *   returned by `agents.listAgents` for role `mayor` is used.
 * @param firedAt - Timestamp (ms) at which the container fired this
 *   callback. A truthy value older than the last recorded working
 *   transition (`_mayorWorkingSince`) marks the callback as belonging
 *   to a previous turn, and it is ignored.
 */
async mayorWaiting(agentId?: string, firedAt?: number): Promise<void> {
  // Resolve the target: explicit ID first, otherwise the first listed mayor.
  const targetId = agentId || agents.listAgents(this.sql, { role: 'mayor' })[0]?.id;
  if (!targetId) return;

  const record = agents.getAgent(this.sql, targetId);
  if (!record || record.role !== 'mayor') return;

  // Only working → waiting is allowed; idle/stalled/dead set by another
  // path must not be overwritten.
  if (record.status !== 'working') return;

  // The container reports idle fire-and-forget, so a callback from an
  // earlier turn can arrive after a new prompt re-activated the mayor.
  if (firedAt && firedAt < this._mayorWorkingSince) return;

  agents.updateAgentStatus(this.sql, targetId, 'waiting');
}
now() : null; - query( - this.sql, - /* sql */ ` - UPDATE ${agent_metadata} - SET ${agent_metadata.columns.status} = 'idle', - ${agent_metadata.columns.last_activity_at} = ? - WHERE ${agent_metadata.bead_id} = ? - `, - [activityAt, targetAgentId] - ); + // Use the bead captured in the triage snapshot (not the agent's + // current hook, which may have changed since the triage request + // was created). Fall back to current hook for backward compat. + const restartBeadId = snapshotHookedBeadId ?? targetAgent.current_hook_bead_id; + + // Only stop the agent if it's still working on the snapshot bead. + // If it has moved on, stopping it would abort unrelated work. + const agentStillOnBead = + restartBeadId && targetAgent.current_hook_bead_id === restartBeadId; + if ( + agentStillOnBead && + (targetAgent.status === 'working' || targetAgent.status === 'stalled') + ) { + dispatch.stopAgentInContainer(this.env, this.townId, targetAgentId).catch(() => {}); + } + + // Check if the hooked bead has exhausted its dispatch cap. + // If so, fail it immediately instead of letting the reconciler + // re-dispatch indefinitely (#1653). + if (restartBeadId) { + const hookedBead = beadOps.getBead(this.sql, restartBeadId); + if (hookedBead && hookedBead.dispatch_attempts >= scheduling.MAX_DISPATCH_ATTEMPTS) { + beadOps.updateBeadStatus(this.sql, restartBeadId, 'failed', 'system', { + code: 'max_dispatch_attempts', + message: `Dispatch attempts exhausted (${hookedBead.dispatch_attempts})`, + source: 'triage', + }); + agents.unhookBead(this.sql, targetAgentId); + break; + } + } + // Only reset agent state if it's still on the snapshot bead. + // If it moved on, let it continue its current work. + if (agentStillOnBead) { + // RESTART clears last_activity_at so the scheduler picks it + // up immediately. RESTART_WITH_BACKOFF sets it to now() so + // the dispatch cooldown (DISPATCH_COOLDOWN_MS) delays the + // next attempt, preventing immediate restart of crash loops. 
+ const activityAt = action === 'RESTART_WITH_BACKOFF' ? now() : null; + query( + this.sql, + /* sql */ ` + UPDATE ${agent_metadata} + SET ${agent_metadata.columns.status} = 'idle', + ${agent_metadata.columns.last_activity_at} = ? + WHERE ${agent_metadata.bead_id} = ? + `, + [activityAt, targetAgentId] + ); + } + // Stamp the bead's last_dispatch_attempt_at regardless — even + // if the agent moved on, the backoff gate should still fire + // on the snapshot bead to prevent immediate redispatch. + if (action === 'RESTART_WITH_BACKOFF' && restartBeadId) { + query( + this.sql, + /* sql */ ` + UPDATE ${beads} + SET ${beads.columns.last_dispatch_attempt_at} = ? + WHERE ${beads.bead_id} = ? + `, + [now(), restartBeadId] + ); + } } break; } @@ -1584,7 +1859,11 @@ export class TownDO extends DurableObject { // created (not the agent's current hook, which may differ). const beadToClose = snapshotHookedBeadId ?? targetAgent?.current_hook_bead_id; if (beadToClose) { - beadOps.updateBeadStatus(this.sql, beadToClose, 'failed', input.agent_id); + beadOps.updateBeadStatus(this.sql, beadToClose, 'failed', input.agent_id, { + code: 'triage_close', + message: input.resolution_notes || 'Closed via triage', + source: 'triage', + }); // Only stop and unhook if the agent is still working on this // specific bead. If the agent has moved on, stopping it would // abort unrelated work. @@ -1643,20 +1922,35 @@ export class TownDO extends DurableObject { } agents.unhookBead(this.sql, targetAgentId); } - // Reset the bead to open so the scheduler can re-assign it - query( - this.sql, - /* sql */ ` - UPDATE ${beads} - SET ${beads.columns.assignee_agent_bead_id} = NULL, - ${beads.columns.status} = 'open', - ${beads.columns.updated_at} = ? - WHERE ${beads.bead_id} = ? - AND ${beads.status} != 'closed' - AND ${beads.status} != 'failed' - `, - [now(), beadToReassign] - ); + // Check the bead's dispatch_attempts before resetting to open. 
+ // If the bead exhausted its dispatch cap, fail it instead of + // re-entering the infinite retry loop (#1653). + const reassignBead = beadOps.getBead(this.sql, beadToReassign); + if ( + reassignBead && + reassignBead.dispatch_attempts >= scheduling.MAX_DISPATCH_ATTEMPTS + ) { + beadOps.updateBeadStatus(this.sql, beadToReassign, 'failed', input.agent_id, { + code: 'max_dispatch_attempts', + message: `Dispatch attempts exhausted during reassign (${reassignBead.dispatch_attempts})`, + source: 'triage', + }); + } else { + // Reset the bead to open so the scheduler can re-assign it + query( + this.sql, + /* sql */ ` + UPDATE ${beads} + SET ${beads.columns.assignee_agent_bead_id} = NULL, + ${beads.columns.status} = 'open', + ${beads.columns.updated_at} = ? + WHERE ${beads.bead_id} = ? + AND ${beads.status} != 'closed' + AND ${beads.status} != 'failed' + `, + [now(), beadToReassign] + ); + } } break; } @@ -1892,7 +2186,23 @@ export class TownDO extends DurableObject { if (isAlive) { const sent = await dispatch.sendMessageToAgent(this.env, townId, mayor.id, combinedMessage); - sessionStatus = sent ? 'active' : 'idle'; + if (sent) { + // Transition waiting → working so the alarm runs at the active cadence + // while the mayor processes this prompt. Also reschedule the alarm + // immediately — the idle alarm may be up to 5 min away, and we need + // the reconciler/health-check loop to resume promptly. + // Always refresh the watermark so a stale mayorWaiting callback + // from a previous turn can't flip the mayor back to waiting + // while a queued prompt is being processed. 
+ this._mayorWorkingSince = Date.now(); + if (mayor.status === 'waiting') { + agents.updateAgentStatus(this.sql, mayor.id, 'working'); + await this.ctx.storage.setAlarm(Date.now() + ACTIVE_ALARM_INTERVAL_MS); + } + sessionStatus = 'active'; + } else { + sessionStatus = 'idle'; + } } else { const townConfig = await this.getTownConfig(); const rigConfig = await this.getMayorRigConfig(); @@ -1938,6 +2248,7 @@ export class TownDO extends DurableObject { if (started) { agents.updateAgentStatus(this.sql, mayor.id, 'working'); + this._mayorWorkingSince = Date.now(); sessionStatus = 'starting'; } else { sessionStatus = 'idle'; @@ -1986,8 +2297,9 @@ export class TownDO extends DurableObject { const isAlive = containerStatus.status === 'running' || containerStatus.status === 'starting'; if (isAlive) { - const status = mayor.status === 'working' || mayor.status === 'stalled' ? 'active' : 'idle'; - return { agentId: mayor.id, sessionStatus: status }; + const isActive = + mayor.status === 'working' || mayor.status === 'stalled' || mayor.status === 'waiting'; + return { agentId: mayor.id, sessionStatus: isActive ? 
'active' : 'idle' }; } // Start the container with an idle mayor (no initial prompt) @@ -2039,6 +2351,7 @@ export class TownDO extends DurableObject { if (started) { agents.updateAgentStatus(this.sql, mayor.id, 'working'); + this._mayorWorkingSince = Date.now(); return { agentId: mayor.id, sessionStatus: 'starting' }; } @@ -2103,7 +2416,7 @@ export class TownDO extends DurableObject { const mapStatus = (agentStatus: string): 'idle' | 'active' | 'starting' => { switch (agentStatus) { case 'working': - return 'active'; + case 'waiting': case 'stalled': return 'active'; default: @@ -2991,10 +3304,13 @@ export class TownDO extends DurableObject { logger.setTags({ townId }); logger.info('alarm: fired'); - const hasRigs = rigs.listRigs(this.sql).length > 0; + // Call once per tick — threaded to ensureContainerReady, maybeDispatchTriageAgent, and getAlarmStatus + const rigList = rigs.listRigs(this.sql); + const hasRigs = rigList.length > 0; + if (hasRigs) { try { - await this.ensureContainerReady(); + await this.ensureContainerReady(rigList); } catch (err) { logger.warn('alarm: container health check failed', { error: err instanceof Error ? err.message : String(err), @@ -3014,6 +3330,16 @@ export class TownDO extends DurableObject { error: err instanceof Error ? err.message : String(err), }); } + + // Proactively remint KILOCODE_TOKEN before it expires (30-day + // expiry, checked daily, refreshed within 7 days of expiry). + try { + await this.refreshKilocodeTokenIfExpiring(); + } catch (err) { + logger.warn('alarm: refreshKilocodeTokenIfExpiring failed', { + error: err instanceof Error ? err.message : String(err), + }); + } } // ── Pre-phase: Observe container status for working agents ──────── @@ -3113,10 +3439,27 @@ export class TownDO extends DurableObject { Sentry.captureException(err); } + // Safety-net: auto-clear drain flag if it has been active too long. 
+ // The primary clear mechanism is the heartbeat instance ID check + // (see recordHeartbeat), but this catches edge cases where no + // heartbeat arrives (e.g. container failed to start). + if (this._draining && this._drainStartedAt) { + const DRAIN_TIMEOUT_MS = 7 * 60 * 1000; + if (Date.now() - this._drainStartedAt > DRAIN_TIMEOUT_MS) { + this._draining = false; + this._drainNonce = null; + this._drainStartedAt = null; + await this.ctx.storage.put('town:draining', false); + await this.ctx.storage.delete('town:drainNonce'); + await this.ctx.storage.delete('town:drainStartedAt'); + logger.info('reconciler: drain timeout exceeded, auto-clearing draining flag'); + } + } + // Phase 1: Reconcile — compute desired state vs actual state const sideEffects: Array<() => Promise> = []; try { - const actions = reconciler.reconcile(this.sql); + const actions = reconciler.reconcile(this.sql, { draining: this._draining }); metrics.actionsEmitted = actions.length; for (const a of actions) { metrics.actionsByType[a.type] = (metrics.actionsByType[a.type] ?? 0) + 1; @@ -3228,7 +3571,16 @@ export class TownDO extends DurableObject { label: JSON.stringify(metrics.actionsByType), }); + // ── Post-reconciliation: cache activity snapshot ──────────────── + // Computed after Phases 0-2 so re-arm and getAlarmStatus reflect + // any work created during reconciliation (hooks, dispatches, triage). + const activeWork = this.hasActiveWork(); + // ── Phase 3: Housekeeping (independent, all parallelizable) ──── + + // Call once per tick — threaded to maybeDispatchTriageAgent and getAlarmStatus + const cachedTriageCount = patrol.countPendingTriageRequests(this.sql); + await Promise.allSettled([ this.deliverPendingMail().catch(err => logger.warn('alarm: deliverPendingMail failed', { @@ -3245,7 +3597,7 @@ export class TownDO extends DurableObject { error: err instanceof Error ? 
err.message : String(err), }) ), - this.maybeDispatchTriageAgent().catch(err => + this.maybeDispatchTriageAgent(cachedTriageCount, rigList).catch(err => logger.warn('alarm: maybeDispatchTriageAgent failed', { error: err instanceof Error ? err.message : String(err), }) @@ -3261,19 +3613,25 @@ export class TownDO extends DurableObject { } }), ]); + // Re-arm: fast when active, slow when idle - const active = this.hasActiveWork(); - const interval = active ? ACTIVE_ALARM_INTERVAL_MS : IDLE_ALARM_INTERVAL_MS; + const interval = activeWork ? ACTIVE_ALARM_INTERVAL_MS : IDLE_ALARM_INTERVAL_MS; await this.ctx.storage.setAlarm(Date.now() + interval); - // Broadcast status snapshot to connected WebSocket clients - try { - const snapshot = await this.getAlarmStatus(); - this.broadcastAlarmStatus(snapshot); - } catch (err) { - logger.warn('alarm: status broadcast failed', { - error: err instanceof Error ? err.message : String(err), - }); + // Broadcast status snapshot to connected WebSocket clients (skip if nobody is listening) + const statusClients = this.ctx.getWebSockets('status'); + if (statusClients.length > 0) { + try { + const snapshot = await this.getAlarmStatus({ + activeWork, + triageCount: cachedTriageCount, + }); + this.broadcastAlarmStatus(snapshot); + } catch (err) { + logger.warn('alarm: status broadcast failed', { + error: err instanceof Error ? err.message : String(err), + }); + } } } @@ -3282,12 +3640,17 @@ export class TownDO extends DurableObject { * from the alarm handler, throttled to once per hour (tokens have * 8h expiry). The TownContainerDO stores it as an env var so it's * available to all agents in the container. + * + * The throttle timestamp is persisted in ctx.storage so it survives + * DO eviction. Without persistence, eviction resets the throttle to 0 + * and the refresh fires immediately on the next alarm tick, sending + * requests that reset the container's sleepAfter timer (#1409). 
*/ - private lastContainerTokenRefreshAt = 0; private async refreshContainerToken(): Promise { const TOKEN_REFRESH_INTERVAL_MS = 60 * 60_000; // 1 hour const now = Date.now(); - if (now - this.lastContainerTokenRefreshAt < TOKEN_REFRESH_INTERVAL_MS) return; + const lastRefresh = (await this.ctx.storage.get('container:lastTokenRefreshAt')) ?? 0; + if (now - lastRefresh < TOKEN_REFRESH_INTERVAL_MS) return; const townId = this.townId; if (!townId) return; @@ -3296,7 +3659,85 @@ export class TownDO extends DurableObject { await dispatch.refreshContainerToken(this.env, townId, userId); // Only mark as refreshed after success — failed refreshes should // be retried on the next alarm tick, not throttled for an hour. - this.lastContainerTokenRefreshAt = now; + await this.ctx.storage.put('container:lastTokenRefreshAt', now); + } + + /** + * Proactively remint KILOCODE_TOKEN when it's approaching expiry. + * Throttled to once per day — the 30-day token is refreshed when + * within 7 days of expiry, providing ample safety margin. + * + * Verifies the existing token's signature before trusting its claims, + * preventing a forged near-expiry token from being re-signed with + * real credentials. 
+ */ + private lastKilocodeTokenCheckAt = 0; + private async refreshKilocodeTokenIfExpiring(): Promise { + const CHECK_INTERVAL_MS = 24 * 60 * 60_000; // once per day + const REFRESH_WINDOW_SECONDS = 7 * 24 * 60 * 60; // 7 days + const now = Date.now(); + if (now - this.lastKilocodeTokenCheckAt < CHECK_INTERVAL_MS) return; + this.lastKilocodeTokenCheckAt = now; + + const townConfig = await this.getTownConfig(); + const token = townConfig.kilocode_token; + if (!token) return; + + if (!this.env.NEXTAUTH_SECRET) { + logger.warn('refreshKilocodeTokenIfExpiring: NEXTAUTH_SECRET not configured'); + return; + } + const secret = await resolveSecret(this.env.NEXTAUTH_SECRET); + if (!secret) { + logger.warn('refreshKilocodeTokenIfExpiring: failed to resolve NEXTAUTH_SECRET'); + return; + } + + // Verify the existing token's signature before trusting its claims. + // This prevents a forged token from being re-signed with real credentials. + // Use a very large clockTolerance so that already-expired (but validly + // signed) tokens are still accepted — this alarm is the recovery path + // for expired tokens, so rejecting them on exp would leave the town + // permanently stuck if it missed the 7-day refresh window. + let payload: { kiloUserId: string; apiTokenPepper?: string | null; exp?: number }; + try { + const TEN_YEARS_SECONDS = 10 * 365 * 24 * 60 * 60; + const { payload: raw } = await jwtVerify(token, new TextEncoder().encode(secret), { + algorithms: ['HS256'], + clockTolerance: TEN_YEARS_SECONDS, + }); + const parsed = kiloTokenPayload.safeParse(raw); + if (!parsed.success) { + logger.warn('refreshKilocodeTokenIfExpiring: token payload failed schema validation'); + return; + } + payload = parsed.data; + } catch { + // Signature invalid or token malformed — don't remint from untrusted claims. 
+ logger.warn('refreshKilocodeTokenIfExpiring: existing token failed signature verification'); + return; + } + + const exp = payload.exp; + if (!exp) return; + + const nowSeconds = Math.floor(now / 1000); + if (exp - nowSeconds > REFRESH_WINDOW_SECONDS) return; + + // Token expires within 7 days — remint it + const userId = payload.kiloUserId; + if (!userId) return; + + const newToken = await generateKiloApiToken( + { id: userId, api_token_pepper: payload.apiTokenPepper ?? null }, + secret + ); + await this.updateTownConfig({ kilocode_token: newToken }); + await this.syncConfigToContainer(); + logger.info('refreshKilocodeTokenIfExpiring: reminted KILOCODE_TOKEN proactively', { + userId, + oldExp: new Date(exp * 1000).toISOString(), + }); } private hasActiveWork(): boolean { @@ -3324,8 +3765,11 @@ export class TownDO extends DurableObject { * * Skips dispatch if a triage agent is already working. */ - private async maybeDispatchTriageAgent(): Promise { - const pendingCount = patrol.countPendingTriageRequests(this.sql); + private async maybeDispatchTriageAgent( + cachedTriageCount?: number, + cachedRigList?: rigs.RigRecord[] + ): Promise { + const pendingCount = cachedTriageCount ?? patrol.countPendingTriageRequests(this.sql); if (pendingCount === 0) return; // Check if a triage batch bead is already in progress (meaning a @@ -3362,7 +3806,7 @@ export class TownDO extends DurableObject { // Validate preconditions before creating any beads to avoid // leaked phantom issue beads on early-return paths. - const rigList = rigs.listRigs(this.sql); + const rigList = cachedRigList ?? rigs.listRigs(this.sql); if (rigList.length === 0) { console.warn(`${TOWN_LOG} maybeDispatchTriageAgent: no rigs available, skipping`); return; @@ -3430,7 +3874,11 @@ export class TownDO extends DurableObject { // Failing the batch bead triggers cooldown: the guard at the top of // this method skips dispatch while a failed batch bead's updated_at // is within DISPATCH_COOLDOWN_MS. 
- beadOps.updateBeadStatus(this.sql, triageBead.bead_id, 'failed', triageAgent.id); + beadOps.updateBeadStatus(this.sql, triageBead.bead_id, 'failed', triageAgent.id, { + code: 'container_start_failed', + message: 'Triage agent failed to start in container', + source: 'container', + }); console.error(`${TOWN_LOG} maybeDispatchTriageAgent: triage agent failed to start`); } } @@ -3643,13 +4091,15 @@ export class TownDO extends DurableObject { } } - private async ensureContainerReady(): Promise { - const hasRigs = rigs.listRigs(this.sql).length > 0; - if (!hasRigs) return; + private async ensureContainerReady( + cachedRigList?: rigs.RigRecord[], + cachedActiveWork?: boolean + ): Promise { + const rigList = cachedRigList ?? rigs.listRigs(this.sql); + if (rigList.length === 0) return; - const hasWork = this.hasActiveWork(); - if (!hasWork) { - const rigList = rigs.listRigs(this.sql); + const hasWork = cachedActiveWork ?? this.hasActiveWork(); + if (!hasWork && !this._draining) { const newestRigAge = rigList.reduce((min, r) => { const age = Date.now() - new Date(r.created_at).getTime(); return Math.min(min, age); @@ -3663,8 +4113,27 @@ export class TownDO extends DurableObject { try { const container = getTownContainerStub(this.env, townId); + const headers: Record = {}; + // When draining AND enough time has passed for the old container + // to have exited (drainAll waits up to 10 min + exit), pass the + // nonce so the replacement container can acknowledge readiness. + // We only send the nonce after 11 minutes to avoid the old + // (still-draining) container receiving it and clearing drain + // prematurely — the health check goes to whichever container is + // currently serving this town. 
+ const DRAIN_HANDOFF_DELAY_MS = 11 * 60 * 1000; + if ( + this._draining && + this._drainNonce && + this._drainStartedAt && + Date.now() - this._drainStartedAt > DRAIN_HANDOFF_DELAY_MS + ) { + headers['X-Drain-Nonce'] = this._drainNonce; + headers['X-Town-Id'] = townId; + } await container.fetch('http://container/health', { signal: AbortSignal.timeout(5_000), + headers, }); } catch { // Container is starting up or unavailable — alarm will retry @@ -3739,7 +4208,7 @@ export class TownDO extends DurableObject { * Return a structured snapshot of the alarm loop and patrol state * for the dashboard Status tab. */ - async getAlarmStatus(): Promise<{ + async getAlarmStatus(cached?: { activeWork?: boolean; triageCount?: number }): Promise<{ alarm: { nextFireAt: string | null; intervalMs: number; @@ -3747,6 +4216,7 @@ export class TownDO extends DurableObject { }; agents: { working: number; + waiting: number; idle: number; stalled: number; dead: number; @@ -3771,9 +4241,11 @@ export class TownDO extends DurableObject { type: string; message: string; }>; + draining?: boolean; + drainStartedAt?: string; }> { const currentAlarm = await this.ctx.storage.getAlarm(); - const active = this.hasActiveWork(); + const active = cached?.activeWork ?? this.hasActiveWork(); const intervalMs = active ? 
ACTIVE_ALARM_INTERVAL_MS : IDLE_ALARM_INTERVAL_MS; // Agent counts by status @@ -3788,7 +4260,7 @@ export class TownDO extends DurableObject { [] ), ]; - const agentCounts = { working: 0, idle: 0, stalled: 0, dead: 0, total: 0 }; + const agentCounts = { working: 0, waiting: 0, idle: 0, stalled: 0, dead: 0, total: 0 }; for (const row of agentRows) { const s = `${row.status as string}`; const c = Number(row.cnt); @@ -3826,38 +4298,26 @@ export class TownDO extends DurableObject { } // Triage request count (issue beads with gt:triage-request label) - beadCounts.triageRequests = patrol.countPendingTriageRequests(this.sql); - - // Patrol indicators — count active warnings/issues - const guppWarnings = Number( - [ - ...query( - this.sql, - /* sql */ ` - SELECT COUNT(*) AS cnt FROM ${beads} - WHERE ${beads.type} = 'message' - AND ${beads.title} = 'GUPP_CHECK' - AND ${beads.status} = 'open' - `, - [] - ), - ][0]?.cnt ?? 0 - ); + beadCounts.triageRequests = cached?.triageCount ?? patrol.countPendingTriageRequests(this.sql); - const guppEscalations = Number( - [ - ...query( - this.sql, - /* sql */ ` - SELECT COUNT(*) AS cnt FROM ${beads} - WHERE ${beads.type} = 'message' - AND ${beads.title} = 'GUPP_ESCALATION' - AND ${beads.status} = 'open' - `, - [] - ), - ][0]?.cnt ?? 0 - ); + // Patrol indicators — count active GUPP warnings + escalations in one query + const guppRows = [ + ...query( + this.sql, + /* sql */ ` + SELECT + SUM(CASE WHEN ${beads.title} = 'GUPP_CHECK' THEN 1 ELSE 0 END) AS warnings, + SUM(CASE WHEN ${beads.title} = 'GUPP_ESCALATION' THEN 1 ELSE 0 END) AS escalations + FROM ${beads} + WHERE ${beads.type} = 'message' + AND ${beads.title} IN ('GUPP_CHECK', 'GUPP_ESCALATION') + AND ${beads.status} = 'open' + `, + [] + ), + ]; + const guppWarnings = Number(guppRows[0]?.warnings ?? 0); + const guppEscalations = Number(guppRows[0]?.escalations ?? 
0); const stalledAgents = agentCounts.stalled; @@ -3909,7 +4369,7 @@ export class TownDO extends DurableObject { alarm: { nextFireAt: currentAlarm ? new Date(Number(currentAlarm)).toISOString() : null, intervalMs, - intervalLabel: active ? 'active (5s)' : 'idle (60s)', + intervalLabel: active ? 'active (5s)' : 'idle (5m)', }, agents: agentCounts, beads: beadCounts, @@ -3921,6 +4381,10 @@ export class TownDO extends DurableObject { }, reconciler: this._lastReconcilerMetrics, recentEvents, + draining: this._draining || undefined, + drainStartedAt: this._drainStartedAt + ? new Date(this._drainStartedAt).toISOString() + : undefined, }; } @@ -4096,6 +4560,30 @@ export class TownDO extends DurableObject { } // DEBUG: raw agent_metadata dump — remove after debugging + async debugPendingNudges(): Promise { + return [ + ...query( + this.sql, + /* sql */ ` + SELECT ${agent_nudges.nudge_id}, + ${agent_nudges.agent_bead_id}, + ${agent_nudges.message}, + ${agent_nudges.mode}, + ${agent_nudges.priority}, + ${agent_nudges.source}, + ${agent_nudges.created_at}, + ${agent_nudges.delivered_at}, + ${agent_nudges.expires_at} + FROM ${agent_nudges} + WHERE ${agent_nudges.delivered_at} IS NULL + ORDER BY ${agent_nudges.created_at} DESC + LIMIT 20 + `, + [] + ), + ]; + } + async debugAgentMetadata(): Promise { return [ ...query( diff --git a/cloudflare-gastown/src/dos/town/actions.ts b/cloudflare-gastown/src/dos/town/actions.ts index 5f799ec90..d9bbac5c3 100644 --- a/cloudflare-gastown/src/dos/town/actions.ts +++ b/cloudflare-gastown/src/dos/town/actions.ts @@ -278,7 +278,11 @@ export function applyAction(ctx: ApplyActionContext, action: Action): (() => Pro case 'transition_bead': { try { - beadOps.updateBeadStatus(sql, action.bead_id, action.to, action.actor); + const failureReason = + action.to === 'failed' + ? 
{ code: 'reconciler', message: action.reason, source: 'scheduler' } + : undefined; + beadOps.updateBeadStatus(sql, action.bead_id, action.to, action.actor, failureReason); } catch (err) { console.warn(`${LOG} transition_bead failed: bead=${action.bead_id} to=${action.to}`, err); } @@ -506,17 +510,10 @@ export function applyAction(ctx: ApplyActionContext, action: Action): (() => Pro } } - // Set agent to working and bead to in_progress synchronously + // Set agent to working and bead to in_progress synchronously. + // dispatch_attempts are NOT incremented here — scheduling.dispatchAgent() + // is the single source of truth for both agent_metadata and bead counters. agentOps.updateAgentStatus(sql, agentId, 'working'); - query( - sql, - /* sql */ ` - UPDATE ${agent_metadata} - SET ${agent_metadata.columns.dispatch_attempts} = ${agent_metadata.columns.dispatch_attempts} + 1 - WHERE ${agent_metadata.bead_id} = ? - `, - [agentId] - ); beadOps.updateBeadStatus(sql, beadId, 'in_progress', agentId); const capturedAgentId = agentId; diff --git a/cloudflare-gastown/src/dos/town/agents.ts b/cloudflare-gastown/src/dos/town/agents.ts index c208a3834..a723ce6d9 100644 --- a/cloudflare-gastown/src/dos/town/agents.ts +++ b/cloudflare-gastown/src/dos/town/agents.ts @@ -288,7 +288,6 @@ export function hookBead(sql: SqlStorage, agentId: string, beadId: string): void UPDATE ${agent_metadata} SET ${agent_metadata.columns.current_hook_bead_id} = ?, ${agent_metadata.columns.status} = 'idle', - ${agent_metadata.columns.dispatch_attempts} = 0, ${agent_metadata.columns.last_activity_at} = ?, ${agent_metadata.columns.agent_status_message} = NULL, ${agent_metadata.columns.agent_status_updated_at} = NULL @@ -325,12 +324,15 @@ export function unhookBead(sql: SqlStorage, agentId: string): void { const beadId = agent.current_hook_bead_id; + // Clear checkpoint when unhooking — the agent is done with this bead + // and the checkpoint (if any) should not leak into the next dispatch. 
query( sql, /* sql */ ` UPDATE ${agent_metadata} SET ${agent_metadata.columns.current_hook_bead_id} = NULL, - ${agent_metadata.columns.status} = 'idle' + ${agent_metadata.columns.status} = 'idle', + ${agent_metadata.columns.checkpoint} = NULL WHERE ${agent_metadata.bead_id} = ? `, [agentId] diff --git a/cloudflare-gastown/src/dos/town/beads.ts b/cloudflare-gastown/src/dos/town/beads.ts index 5fd9463fa..c2e865d55 100644 --- a/cloudflare-gastown/src/dos/town/beads.ts +++ b/cloudflare-gastown/src/dos/town/beads.ts @@ -4,7 +4,13 @@ */ import { z } from 'zod'; -import { beads, BeadRecord, createTableBeads, getIndexesBeads } from '../../db/tables/beads.table'; +import { + beads, + BeadRecord, + createTableBeads, + getIndexesBeads, + migrateBeads, +} from '../../db/tables/beads.table'; import { bead_events, BeadEventRecord, @@ -41,6 +47,7 @@ import type { BeadType, } from '../../types'; import type { BeadEventType } from '../../db/tables/bead-events.table'; +import type { FailureReason } from './types'; function generateId(): string { return crypto.randomUUID(); @@ -65,7 +72,7 @@ export function initBeadTables(sql: SqlStorage): void { dropCheckConstraints(sql); // Migrations: add columns to existing tables (idempotent) - for (const stmt of [...migrateConvoyMetadata(), ...migrateAgentMetadata()]) { + for (const stmt of [...migrateBeads(), ...migrateConvoyMetadata(), ...migrateAgentMetadata()]) { try { query(sql, stmt, []); } catch { @@ -250,7 +257,8 @@ export function updateBeadStatus( sql: SqlStorage, beadId: string, status: string, - agentId: string | null + agentId: string | null, + failureReason?: FailureReason ): Bead { const bead = getBead(sql, beadId); if (!bead) throw new Error(`Bead ${beadId} not found`); @@ -291,6 +299,7 @@ export function updateBeadStatus( eventType: 'status_changed', oldValue: oldStatus, newValue: status, + metadata: failureReason && status === 'failed' ? 
{ failure_reason: failureReason } : {}, }); // If the bead reached a terminal status and is tracked by a convoy, @@ -565,6 +574,14 @@ export function updateBeadFields( const bead = getBead(sql, beadId); if (!bead) throw new Error(`Bead ${beadId} not found`); + // Delegate status changes to updateBeadStatus so they produce a + // status_changed event (with old/new values), respect the terminal + // state guard, and carry structured failureReason metadata. + if (fields.status !== undefined && fields.status !== bead.status) { + updateBeadStatus(sql, beadId, fields.status, actorId); + } + + // Build the SQL update for non-status fields only. const timestamp = now(); const setClauses: string[] = []; const values: unknown[] = []; @@ -585,19 +602,6 @@ export function updateBeadFields( setClauses.push(`${beads.columns.labels} = ?`); values.push(JSON.stringify(fields.labels)); } - if (fields.status !== undefined) { - setClauses.push(`${beads.columns.status} = ?`); - values.push(fields.status); - if (fields.status === 'closed') { - // Set closed_at when transitioning to closed (preserve existing if already set) - setClauses.push(`${beads.columns.closed_at} = ?`); - values.push(bead.closed_at ?? timestamp); - } else if (bead.closed_at) { - // Clear closed_at when reopening a previously-closed bead - setClauses.push(`${beads.columns.closed_at} = ?`); - values.push(null); - } - } if (fields.metadata !== undefined) { setClauses.push(`${beads.columns.metadata} = ?`); values.push(JSON.stringify(fields.metadata)); @@ -615,31 +619,27 @@ export function updateBeadFields( values.push(fields.parent_bead_id); } - if (setClauses.length === 0) return bead; - - setClauses.push(`${beads.columns.updated_at} = ?`); - values.push(timestamp); - values.push(beadId); - - // Dynamic SET clause — query() can't statically verify param count here, - // so use sql.exec() directly. The early return above guarantees values is non-empty. 
- sql.exec( - /* sql */ `UPDATE ${beads} SET ${setClauses.join(', ')} WHERE ${beads.bead_id} = ?`, - ...values - ); + if (setClauses.length > 0) { + setClauses.push(`${beads.columns.updated_at} = ?`); + values.push(timestamp); + values.push(beadId); - const changedFields = Object.keys(fields); - logBeadEvent(sql, { - beadId, - agentId: actorId, - eventType: 'fields_updated', - newValue: changedFields.join(','), - metadata: { changed: changedFields, actor: actorId }, - }); + sql.exec( + /* sql */ `UPDATE ${beads} SET ${setClauses.join(', ')} WHERE ${beads.bead_id} = ?`, + ...values + ); - // If status was updated to a terminal value, run convoy progress logic - if (fields.status === 'closed' || fields.status === 'failed') { - updateConvoyProgress(sql, beadId, timestamp); + // Log fields_updated only for the non-status fields that were changed. + const nonStatusFields = Object.keys(fields).filter(k => k !== 'status'); + if (nonStatusFields.length > 0) { + logBeadEvent(sql, { + beadId, + agentId: actorId, + eventType: 'fields_updated', + newValue: nonStatusFields.join(','), + metadata: { changed: nonStatusFields, actor: actorId }, + }); + } } const updated = getBead(sql, beadId); diff --git a/cloudflare-gastown/src/dos/town/reconciler.ts b/cloudflare-gastown/src/dos/town/reconciler.ts index e9fef8d21..e3129896e 100644 --- a/cloudflare-gastown/src/dos/town/reconciler.ts +++ b/cloudflare-gastown/src/dos/town/reconciler.ts @@ -35,6 +35,54 @@ import type { TownEventRecord } from '../../db/tables/town-events.table'; const LOG = '[reconciler]'; +// ── Circuit breaker ───────────────────────────────────────────────── + +/** Number of dispatch failures in a 30-min window to trip the town-level breaker. */ +const CIRCUIT_BREAKER_FAILURE_THRESHOLD = 20; +/** Window in minutes for counting dispatch failures. */ +const CIRCUIT_BREAKER_WINDOW_MINUTES = 30; + +/** + * Town-level dispatch circuit breaker. 
Counts beads with at least one + * dispatch attempt in the recent window that have not yet closed + * successfully. This captures beads in active retry loops (in_progress + * after a failed container start), beads that have been explicitly + * failed, and beads that exhausted all attempts — while excluding + * beads that eventually succeeded (status = 'closed'). + */ +function checkDispatchCircuitBreaker(sql: SqlStorage): Action[] { + const rows = z + .object({ failure_count: z.number() }) + .array() + .parse([ + ...query( + sql, + /* sql */ ` + SELECT count(*) as failure_count + FROM ${beads} + WHERE ${beads.last_dispatch_attempt_at} > strftime('%Y-%m-%dT%H:%M:%fZ', 'now', '-${CIRCUIT_BREAKER_WINDOW_MINUTES} minutes') + AND ${beads.dispatch_attempts} > 0 + AND ${beads.status} != 'closed' + `, + [] + ), + ]); + + const failureCount = rows[0]?.failure_count ?? 0; + if (failureCount >= CIRCUIT_BREAKER_FAILURE_THRESHOLD) { + console.warn( + `${LOG} circuit breaker OPEN: ${failureCount} dispatch failures in last ${CIRCUIT_BREAKER_WINDOW_MINUTES}min (threshold=${CIRCUIT_BREAKER_FAILURE_THRESHOLD})` + ); + return [ + { + type: 'notify_mayor', + message: `Dispatch circuit breaker is OPEN: ${failureCount} dispatch failures in the last ${CIRCUIT_BREAKER_WINDOW_MINUTES} minutes. All dispatch actions are paused until failures clear.`, + }, + ]; + } + return []; +} + // ── Timeouts (from spec §7) ───────────────────────────────────────── /** Reset non-PR MR beads stuck in_progress with no working agent */ @@ -61,6 +109,21 @@ function staleMs(timestamp: string | null, thresholdMs: number): boolean { return Date.now() - new Date(timestamp).getTime() > thresholdMs; } +/** + * Compute the dispatch cooldown for a bead based on its attempt count. 
+ * Implements exponential backoff: + * attempts 1-2: 2 min (DISPATCH_COOLDOWN_MS) + * attempt 3: 5 min + * attempt 4: 10 min + * attempt 5+: 30 min + */ +function getDispatchCooldownMs(dispatchAttempts: number): number { + if (dispatchAttempts <= 2) return DISPATCH_COOLDOWN_MS; // 2 min + if (dispatchAttempts === 3) return 5 * 60_000; // 5 min + if (dispatchAttempts === 4) return 10 * 60_000; // 10 min + return 30 * 60_000; // 30 min +} + // ── Row schemas for queries ───────────────────────────────────────── // Derived from table record schemas for traceability back to table defs. @@ -89,6 +152,8 @@ const BeadRow = BeadRecord.pick({ updated_at: true, labels: true, created_by: true, + dispatch_attempts: true, + last_dispatch_attempt_at: true, }); type BeadRow = z.infer; @@ -290,6 +355,13 @@ export function applyEvent(sql: SqlStorage, event: TownEventRecord): void { return; } + case 'container_eviction': { + // Draining flag is managed by the TownDO via KV storage. + // The reconciler reads it from there; no SQL state change needed here. + // The event is recorded for audit trail. + return; + } + case 'nudge_timeout': { // GUPP violations are handled by reconcileGUPP on the next pass. // The event just records the fact for audit trail. @@ -306,13 +378,14 @@ export function applyEvent(sql: SqlStorage, event: TownEventRecord): void { // Top-level reconcile // ════════════════════════════════════════════════════════════════════ -export function reconcile(sql: SqlStorage): Action[] { +export function reconcile(sql: SqlStorage, opts?: { draining?: boolean }): Action[] { + const draining = opts?.draining ?? 
false; const actions: Action[] = []; - actions.push(...reconcileAgents(sql)); - actions.push(...reconcileBeads(sql)); - actions.push(...reconcileReviewQueue(sql)); + actions.push(...reconcileAgents(sql, { draining })); + actions.push(...reconcileBeads(sql, { draining })); + actions.push(...reconcileReviewQueue(sql, { draining })); actions.push(...reconcileConvoys(sql)); - actions.push(...reconcileGUPP(sql)); + actions.push(...reconcileGUPP(sql, { draining })); actions.push(...reconcileGC(sql)); return actions; } @@ -322,7 +395,7 @@ export function reconcile(sql: SqlStorage): Action[] { // idle agents with stale hooks to terminal beads // ════════════════════════════════════════════════════════════════════ -export function reconcileAgents(sql: SqlStorage): Action[] { +export function reconcileAgents(sql: SqlStorage, opts?: { draining?: boolean }): Action[] { const actions: Action[] = []; // Working agents with stale or missing heartbeat — container probably dead. @@ -353,6 +426,11 @@ export function reconcileAgents(sql: SqlStorage): Action[] { // Mayors are always working with no hook — skip them if (agent.role === 'mayor') continue; + // During container drain the heartbeat reporter is stopped, so + // last_activity_at freezes. Skip stale-heartbeat checks to avoid + // false-positive idle transitions while agents are still working. + if (opts?.draining) continue; + if (!agent.last_activity_at) { // No heartbeat ever received — container may have failed to start actions.push({ @@ -457,9 +535,15 @@ export function reconcileAgents(sql: SqlStorage): Action[] { // reconcileBeads — handle unassigned beads, lost agents, stale reviews // ════════════════════════════════════════════════════════════════════ -export function reconcileBeads(sql: SqlStorage): Action[] { +export function reconcileBeads(sql: SqlStorage, opts?: { draining?: boolean }): Action[] { + const draining = opts?.draining ?? 
false; const actions: Action[] = []; + // Town-level circuit breaker: if too many dispatch failures in the + // window, skip all dispatch_agent actions and escalate to mayor. + const circuitBreakerActions = checkDispatchCircuitBreaker(sql); + const circuitBreakerOpen = circuitBreakerActions.length > 0; + // Rule 1: Open issue beads with no assignee, no blockers, not staged, not triage const unassigned = BeadRow.array().parse([ ...query( @@ -470,7 +554,9 @@ export function reconcileBeads(sql: SqlStorage): Action[] { b.${beads.columns.assignee_agent_bead_id}, b.${beads.columns.updated_at}, b.${beads.columns.labels}, - b.${beads.columns.created_by} + b.${beads.columns.created_by}, + b.${beads.columns.dispatch_attempts}, + b.${beads.columns.last_dispatch_attempt_at} FROM ${beads} b WHERE b.${beads.columns.type} = 'issue' AND b.${beads.columns.status} = 'open' @@ -498,9 +584,31 @@ export function reconcileBeads(sql: SqlStorage): Action[] { for (const bead of unassigned) { if (!bead.rig_id) continue; - // In shadow mode we can't call getOrCreateAgent, so we just note - // that a hook_agent + dispatch_agent is needed. - // The action includes rig_id so Phase 3's applyAction can resolve the agent. 
+ if (draining) { + console.log(`${LOG} Town is draining, skipping dispatch for bead ${bead.bead_id}`); + continue; + } + + // Per-bead dispatch cap: fail the bead if it exhausted all attempts + if (bead.dispatch_attempts >= MAX_DISPATCH_ATTEMPTS) { + actions.push({ + type: 'transition_bead', + bead_id: bead.bead_id, + from: 'open', + to: 'failed', + reason: `max dispatch attempts exceeded (${bead.dispatch_attempts})`, + actor: 'system', + }); + continue; + } + + // Exponential backoff: skip if last dispatch attempt was too recent + const cooldownMs = getDispatchCooldownMs(bead.dispatch_attempts); + if (!staleMs(bead.last_dispatch_attempt_at, cooldownMs)) continue; + + // Town-level circuit breaker suppresses dispatch + if (circuitBreakerOpen) continue; + actions.push({ type: 'dispatch_agent', agent_id: '', // resolved at apply time @@ -509,6 +617,55 @@ export function reconcileBeads(sql: SqlStorage): Action[] { }); } + // Rule 1b: Open issue beads with a stale assignee (agent exists but is not + // hooked to this bead). This happens when a container restart causes the + // agent to be unhooked while the bead is reset to open (e.g. by the mayor). + // Clear the assignee so Rule 1 can pick it up on the next reconciler tick. 
+ const staleAssigned = BeadRow.array().parse([ + ...query( + sql, + /* sql */ ` + SELECT ${beads.bead_id}, ${beads.type}, + ${beads.status}, ${beads.rig_id}, + ${beads.assignee_agent_bead_id}, + ${beads.updated_at}, + ${beads.labels}, + ${beads.created_by}, + ${beads.dispatch_attempts}, + ${beads.last_dispatch_attempt_at} + FROM ${beads} + WHERE ${beads.type} = 'issue' + AND ${beads.status} = 'open' + AND ${beads.assignee_agent_bead_id} IS NOT NULL + AND ${beads.rig_id} IS NOT NULL + AND NOT EXISTS ( + SELECT 1 FROM ${agent_metadata} + WHERE ${agent_metadata.bead_id} = ${beads.assignee_agent_bead_id} + AND ${agent_metadata.current_hook_bead_id} = ${beads.bead_id} + ) + `, + [] + ), + ]); + + for (const bead of staleAssigned) { + // Skip system-assigned beads (escalations, rework requests) — those + // are handled by other subsystems and don't need dispatch. + if (bead.assignee_agent_bead_id === 'system') continue; + + // Skip triage-request beads — patrol.createTriageRequest() sets + // assignee_agent_bead_id to route the request to a specific agent, + // but hookBead() intentionally refuses to hook triage-request beads. + // Without this skip, the reconciler would clear the assignee on + // every tick because the hook will never exist. 
+ if (bead.labels.includes('gt:triage-request')) continue; + + actions.push({ + type: 'clear_bead_assignee', + bead_id: bead.bead_id, + }); + } + // Rule 2: Idle agents with hooks need dispatch (schedulePendingWork equivalent) const idleHooked = AgentRow.array().parse([ ...query( @@ -532,36 +689,22 @@ export function reconcileBeads(sql: SqlStorage): Action[] { for (const agent of idleHooked) { if (!agent.current_hook_bead_id) continue; - // Check dispatch cooldown - if (!staleMs(agent.last_activity_at, DISPATCH_COOLDOWN_MS)) continue; - - // Check max dispatch attempts - if (agent.dispatch_attempts >= MAX_DISPATCH_ATTEMPTS) { - actions.push({ - type: 'transition_bead', - bead_id: agent.current_hook_bead_id, - from: null, - to: 'failed', - reason: 'max dispatch attempts exceeded', - actor: 'system', - }); - actions.push({ - type: 'unhook_agent', - agent_id: agent.bead_id, - reason: 'max dispatch attempts', - }); - continue; - } - - // Check if the hooked bead is open and unblocked + // Check if the hooked bead is open and unblocked, and read its + // dispatch_attempts for the per-bead circuit breaker. const hookedRows = z - .object({ status: z.string(), rig_id: z.string().nullable() }) + .object({ + status: z.string(), + rig_id: z.string().nullable(), + dispatch_attempts: z.number(), + last_dispatch_attempt_at: z.string().nullable(), + }) .array() .parse([ ...query( sql, /* sql */ ` - SELECT ${beads.status}, ${beads.rig_id} + SELECT ${beads.status}, ${beads.rig_id}, + ${beads.dispatch_attempts}, ${beads.last_dispatch_attempt_at} FROM ${beads} WHERE ${beads.bead_id} = ? 
`, @@ -573,6 +716,28 @@ export function reconcileBeads(sql: SqlStorage): Action[] { const hooked = hookedRows[0]; if (hooked.status !== 'open') continue; + // Per-bead dispatch cap (uses bead counter, not agent counter) + if (hooked.dispatch_attempts >= MAX_DISPATCH_ATTEMPTS) { + actions.push({ + type: 'transition_bead', + bead_id: agent.current_hook_bead_id, + from: null, + to: 'failed', + reason: `max dispatch attempts exceeded (${hooked.dispatch_attempts})`, + actor: 'system', + }); + actions.push({ + type: 'unhook_agent', + agent_id: agent.bead_id, + reason: 'max dispatch attempts', + }); + continue; + } + + // Exponential backoff using bead's last_dispatch_attempt_at + const cooldownMs = getDispatchCooldownMs(hooked.dispatch_attempts); + if (!staleMs(hooked.last_dispatch_attempt_at, cooldownMs)) continue; + // Check blockers const blockerCount = z .object({ cnt: z.number() }) @@ -594,6 +759,16 @@ export function reconcileBeads(sql: SqlStorage): Action[] { if (blockerCount[0]?.cnt > 0) continue; + if (draining) { + console.log( + `${LOG} Town is draining, skipping dispatch for bead ${agent.current_hook_bead_id}` + ); + continue; + } + + // Town-level circuit breaker suppresses dispatch + if (circuitBreakerOpen) continue; + actions.push({ type: 'dispatch_agent', agent_id: agent.bead_id, @@ -612,7 +787,9 @@ export function reconcileBeads(sql: SqlStorage): Action[] { b.${beads.columns.assignee_agent_bead_id}, b.${beads.columns.updated_at}, b.${beads.columns.labels}, - b.${beads.columns.created_by} + b.${beads.columns.created_by}, + b.${beads.columns.dispatch_attempts}, + b.${beads.columns.last_dispatch_attempt_at} FROM ${beads} b WHERE b.${beads.columns.type} = 'issue' AND b.${beads.columns.status} = 'in_progress' @@ -649,6 +826,24 @@ export function reconcileBeads(sql: SqlStorage): Action[] { if (hookedAgent.length > 0) continue; + // If the bead has exhausted its dispatch attempts, fail it instead + // of resetting to open (which would cause an infinite retry 
loop). + if (bead.dispatch_attempts >= MAX_DISPATCH_ATTEMPTS) { + actions.push({ + type: 'transition_bead', + bead_id: bead.bead_id, + from: 'in_progress', + to: 'failed', + reason: `agent lost, max dispatch attempts exhausted (${bead.dispatch_attempts})`, + actor: 'system', + }); + actions.push({ + type: 'clear_bead_assignee', + bead_id: bead.bead_id, + }); + continue; + } + actions.push({ type: 'transition_bead', bead_id: bead.bead_id, @@ -735,6 +930,11 @@ export function reconcileBeads(sql: SqlStorage): Action[] { } } + // Emit circuit breaker notification (once per reconcile pass) + if (circuitBreakerOpen) { + actions.push(...circuitBreakerActions); + } + return actions; } @@ -743,9 +943,13 @@ export function reconcileBeads(sql: SqlStorage): Action[] { // refinery dispatch // ════════════════════════════════════════════════════════════════════ -export function reconcileReviewQueue(sql: SqlStorage): Action[] { +export function reconcileReviewQueue(sql: SqlStorage, opts?: { draining?: boolean }): Action[] { + const draining = opts?.draining ?? false; const actions: Action[] = []; + // Town-level circuit breaker + const circuitBreakerOpen = checkDispatchCircuitBreaker(sql).length > 0; + // Get all MR beads that need attention const mrBeads = MrBeadRow.array().parse([ ...query( @@ -933,6 +1137,15 @@ export function reconcileReviewQueue(sql: SqlStorage): Action[] { if (oldestMr.length === 0) continue; + // Skip dispatch if the town is draining (container eviction in progress) + if (draining) { + console.log(`${LOG} Town is draining, skipping dispatch for bead ${oldestMr[0].bead_id}`); + continue; + } + + // Town-level circuit breaker suppresses dispatch + if (circuitBreakerOpen) continue; + // If no refinery exists or it's busy, emit a dispatch_agent with empty // agent_id — applyAction will create the refinery via getOrCreateAgent. 
if (refinery.length === 0) { @@ -1000,39 +1213,22 @@ export function reconcileReviewQueue(sql: SqlStorage): Action[] { for (const ref of idleRefineries) { if (!ref.current_hook_bead_id) continue; - // Cooldown: skip if last activity is too recent (#1342) - if (!staleMs(ref.last_activity_at, DISPATCH_COOLDOWN_MS)) continue; - - // Circuit-breaker: fail the MR bead after too many attempts (#1342) - if (ref.dispatch_attempts >= MAX_DISPATCH_ATTEMPTS) { - actions.push({ - type: 'transition_bead', - bead_id: ref.current_hook_bead_id, - from: null, - to: 'failed', - reason: 'refinery max dispatch attempts exceeded', - actor: 'system', - }); - actions.push({ - type: 'unhook_agent', - agent_id: ref.bead_id, - reason: 'max dispatch attempts', - }); - continue; - } - + // Read the bead's dispatch_attempts for the per-bead circuit breaker const mrRows = z .object({ status: z.string(), type: z.string(), rig_id: z.string().nullable(), + dispatch_attempts: z.number(), + last_dispatch_attempt_at: z.string().nullable(), }) .array() .parse([ ...query( sql, /* sql */ ` - SELECT ${beads.status}, ${beads.type}, ${beads.rig_id} + SELECT ${beads.status}, ${beads.type}, ${beads.rig_id}, + ${beads.dispatch_attempts}, ${beads.last_dispatch_attempt_at} FROM ${beads} WHERE ${beads.bead_id} = ? `, @@ -1044,6 +1240,39 @@ export function reconcileReviewQueue(sql: SqlStorage): Action[] { const mr = mrRows[0]; if (mr.type !== 'merge_request' || mr.status !== 'in_progress') continue; + if (draining) { + console.log( + `${LOG} Town is draining, skipping dispatch for bead ${ref.current_hook_bead_id}` + ); + continue; + } + + // Per-bead dispatch cap — check before cooldown so max-attempt MR + // beads are failed immediately rather than waiting for the cooldown. 
+ if (mr.dispatch_attempts >= MAX_DISPATCH_ATTEMPTS) { + actions.push({ + type: 'transition_bead', + bead_id: ref.current_hook_bead_id, + from: null, + to: 'failed', + reason: `refinery max dispatch attempts exceeded (${mr.dispatch_attempts})`, + actor: 'system', + }); + actions.push({ + type: 'unhook_agent', + agent_id: ref.bead_id, + reason: 'max dispatch attempts', + }); + continue; + } + + // Exponential backoff using bead's last_dispatch_attempt_at + const cooldownMs = getDispatchCooldownMs(mr.dispatch_attempts); + if (!staleMs(mr.last_dispatch_attempt_at, cooldownMs)) continue; + + // Town-level circuit breaker suppresses dispatch + if (circuitBreakerOpen) continue; + // Container status is checked at apply time (async). In shadow mode, // we just note that a dispatch is needed. actions.push({ @@ -1248,7 +1477,12 @@ export function reconcileConvoys(sql: SqlStorage): Action[] { // reconcileGUPP — detect agents exceeding activity thresholds // ════════════════════════════════════════════════════════════════════ -export function reconcileGUPP(sql: SqlStorage): Action[] { +export function reconcileGUPP(sql: SqlStorage, opts?: { draining?: boolean }): Action[] { + // During container drain the heartbeat reporter is stopped, so + // last_event_at freezes. Skip GUPP checks entirely to avoid + // false-positive "idle for 15 minutes" nudges while agents are + // still actively working in the draining container. + if (opts?.draining) return []; const actions: Action[] = []; const workingAgents = AgentRow.array().parse([ diff --git a/cloudflare-gastown/src/dos/town/review-queue.ts b/cloudflare-gastown/src/dos/town/review-queue.ts index e1a1e19d6..44c7e26bf 100644 --- a/cloudflare-gastown/src/dos/town/review-queue.ts +++ b/cloudflare-gastown/src/dos/town/review-queue.ts @@ -37,6 +37,29 @@ function now(): string { return new Date().toISOString(); } +/** + * Extract the human-readable failure message from a bead event's metadata. 
+ * + * Two sources: + * - status_changed events store it at `metadata.failure_reason.message` + * - review_completed / pr_creation_failed events store it at `metadata.message` + */ +function extractFailureMessage( + status: string, + metadata: Record<string, unknown> | null | undefined +): string | null { + if (status !== 'failed' || !metadata) return null; + // Structured failure_reason (from status_changed events via updateBeadStatus) + const fr = metadata.failure_reason; + if (typeof fr === 'object' && fr !== null && 'message' in fr) { + const msg = (fr as Record<string, unknown>).message; + if (typeof msg === 'string') return msg; + } + // Top-level message (from review_completed / pr_creation_failed events) + if (typeof metadata.message === 'string') return metadata.message; + return null; +} + export function initReviewQueueTables(_sql: SqlStorage): void { // Review queue and molecule tables are now part of beads + satellite tables. // Initialization happens in beads.initBeadTables(). @@ -235,29 +258,11 @@ export function completeReview( entryId: string, status: 'merged' | 'failed' ): void { - // Guard: don't overwrite terminal states (closed MR bead that was - // already merged should never be set to 'failed' by a stale call) - const current = getBead(sql, entryId); - if (current && (current.status === 'closed' || current.status === 'failed')) { - console.warn( - `[review-queue] completeReview: bead ${entryId} already ${current.status}, skipping` - ); - return; - } - const beadStatus = status === 'merged' ? 'closed' : 'failed'; - const timestamp = now(); - query( - sql, - /* sql */ ` - UPDATE ${beads} - SET ${beads.columns.status} = ?, - ${beads.columns.updated_at} = ?, - ${beads.columns.closed_at} = ? - WHERE ${beads.bead_id} = ? - `, - [beadStatus, timestamp, beadStatus === 'closed' ? timestamp : null, entryId] - ); + // Delegate to updateBeadStatus so a status_changed event is recorded + // on the event timeline. 
It also handles terminal-state guards, + // closed_at timestamps, and convoy progress updates. + updateBeadStatus(sql, entryId, beadStatus, 'system'); } /** @@ -705,14 +710,38 @@ export function agentCompleted( // Rule 3 will reset it to open after the staleness timeout. const hookedBead = getBead(sql, agent.current_hook_bead_id); if (input.status === 'failed') { - updateBeadStatus(sql, agent.current_hook_bead_id, 'failed', agentId); + updateBeadStatus(sql, agent.current_hook_bead_id, 'failed', agentId, { + code: 'agent_failed', + message: 'Agent exited with failed status', + source: 'container', + }); } else if (hookedBead && hookedBead.status === 'in_progress') { - // Agent exited 'completed' but bead is still in_progress — gt_done was never called. - // Don't close the bead. Rule 3 will handle rework. - console.log( - `[review-queue] agentCompleted: polecat ${agentId} exited without gt_done — ` + - `bead ${agent.current_hook_bead_id} stays in_progress (Rule 3 will recover)` - ); + if (input.reason === 'container eviction') { + // Container eviction: WIP was force-pushed and eviction context + // was written on the bead body. Reset to open and clear the + // stale assignee so the reconciler can re-dispatch immediately. + console.log( + `[review-queue] agentCompleted: polecat ${agentId} evicted — ` + + `resetting bead ${agent.current_hook_bead_id} to open` + ); + updateBeadStatus(sql, agent.current_hook_bead_id, 'open', agentId); + query( + sql, + /* sql */ ` + UPDATE ${beads} + SET ${beads.columns.assignee_agent_bead_id} = NULL + WHERE ${beads.bead_id} = ? + `, + [agent.current_hook_bead_id] + ); + } else { + // Agent exited 'completed' but bead is still in_progress — gt_done was never called. + // Don't close the bead. Rule 3 will handle rework. 
+ console.log( + `[review-queue] agentCompleted: polecat ${agentId} exited without gt_done — ` + + `bead ${agent.current_hook_bead_id} stays in_progress (Rule 3 will recover)` + ); + } } else if (hookedBead && hookedBead.status === 'open') { // Bead is open (wasn't dispatched yet or was already reset). No-op. } else { @@ -1197,12 +1226,7 @@ function mrBeadRowToItem(row: z.output): MergeQueueItem { : null, rigName: row.rig_name, staleSince: null, - failureReason: - row.status === 'failed' && row.failure_event_metadata - ? typeof row.failure_event_metadata.message === 'string' - ? row.failure_event_metadata.message - : null - : null, + failureReason: extractFailureMessage(row.status, row.failure_event_metadata), }; } diff --git a/cloudflare-gastown/src/dos/town/scheduling.ts b/cloudflare-gastown/src/dos/town/scheduling.ts index 2c48347b5..80dc7a1fb 100644 --- a/cloudflare-gastown/src/dos/town/scheduling.ts +++ b/cloudflare-gastown/src/dos/town/scheduling.ts @@ -23,7 +23,7 @@ const LOG = '[scheduling]'; // ── Constants ────────────────────────────────────────────────────────── export const DISPATCH_COOLDOWN_MS = 2 * 60_000; // 2 min -export const MAX_DISPATCH_ATTEMPTS = 20; +export const MAX_DISPATCH_ATTEMPTS = 5; // ── Context passed by the Town DO ────────────────────────────────────── @@ -109,6 +109,18 @@ export async function dispatchAgent( `, [timestamp, agent.id] ); + // Track dispatch attempts on the bead itself so the counter + // survives agent re-creation and hookBead cycles. + query( + ctx.sql, + /* sql */ ` + UPDATE ${beads} + SET ${beads.columns.dispatch_attempts} = ${beads.columns.dispatch_attempts} + 1, + ${beads.columns.last_dispatch_attempt_at} = ? + WHERE ${beads.bead_id} = ? 
+ `, + [timestamp, bead.bead_id] + ); const started = await dispatch.startAgentInContainer(ctx.env, ctx.storage, { townId: ctx.townId, @@ -170,6 +182,7 @@ export async function dispatchAgent( agentId: agent.id, beadId: bead.bead_id, role: agent.role, + reason: 'container returned false', }); } return started; @@ -199,6 +212,7 @@ export async function dispatchAgent( agentId: agent.id, beadId: bead.bead_id, role: agent.role, + reason: err instanceof Error ? err.message : String(err), }); return false; } diff --git a/cloudflare-gastown/src/dos/town/types.ts b/cloudflare-gastown/src/dos/town/types.ts new file mode 100644 index 000000000..2120be142 --- /dev/null +++ b/cloudflare-gastown/src/dos/town/types.ts @@ -0,0 +1,14 @@ +/** + * Shared types for Town DO operations. + */ + +export type FailureReason = { + /** Machine-readable failure code. */ + code: string; + /** Human-readable summary of what went wrong. */ + message: string; + /** Optional detail: stack trace, error output, container logs, etc. 
*/ + details?: string; + /** What triggered the failure: 'scheduler' | 'patrol' | 'refinery' | 'triage' | 'admin' | 'container' */ + source: string; +}; diff --git a/cloudflare-gastown/src/gastown.worker.ts b/cloudflare-gastown/src/gastown.worker.ts index 32d676e0a..13e88dc2a 100644 --- a/cloudflare-gastown/src/gastown.worker.ts +++ b/cloudflare-gastown/src/gastown.worker.ts @@ -36,7 +36,9 @@ import { handleAgentDone, handleRequestChanges, handleAgentCompleted, + handleAgentWaiting, handleWriteCheckpoint, + handleWriteEvictionContext, handleCheckMail, handleHeartbeat, handleGetOrCreateAgent, @@ -134,6 +136,11 @@ import { handleListEscalations, handleAcknowledgeEscalation, } from './handlers/town-escalations.handler'; +import { + handleContainerEviction, + handleContainerReady, + handleDrainStatus, +} from './handlers/town-eviction.handler'; export { GastownUserDO } from './dos/GastownUser.do'; export { GastownOrgDO } from './dos/GastownOrg.do'; @@ -256,6 +263,45 @@ app.post('/debug/towns/:townId/replay-events', async c => { return c.json(result); }); +app.get('/debug/towns/:townId/drain-status', async c => { + const townId = c.req.param('townId'); + const town = getTownDOStub(c.env, townId); + // eslint-disable-next-line @typescript-eslint/await-thenable -- DO RPC returns promise at runtime + const draining = await town.isDraining(); + // eslint-disable-next-line @typescript-eslint/await-thenable + const drainNonce = await town.getDrainNonce(); + return c.json({ draining, drainNonce }); +}); + +app.get('/debug/towns/:townId/nudges', async c => { + const townId = c.req.param('townId'); + const town = getTownDOStub(c.env, townId); + // eslint-disable-next-line @typescript-eslint/await-thenable -- DO RPC returns promise at runtime + const nudges = await town.debugPendingNudges(); + return c.json({ nudges }); +}); + +app.post('/debug/towns/:townId/send-message', async c => { + if (c.env.ENVIRONMENT !== 'development') return c.json({ error: 'dev only' }, 403); + const 
townId = c.req.param('townId'); + const body: { message: string; model?: string } = await c.req.json(); + const town = getTownDOStub(c.env, townId); + // eslint-disable-next-line @typescript-eslint/await-thenable + const result = await town.sendMayorMessage( + body.message, + body.model ?? 'anthropic/claude-sonnet-4.6' + ); + return c.json(result); +}); + +app.post('/debug/towns/:townId/graceful-stop', async c => { + if (c.env.ENVIRONMENT !== 'development') return c.json({ error: 'dev only' }, 403); + const townId = c.req.param('townId'); + const containerStub = getTownContainerStub(c.env, townId); + await containerStub.stop(); + return c.json({ stopped: true }); +}); + // ── Town ID + Auth ────────────────────────────────────────────────────── // All rig routes live under /api/towns/:townId/rigs/:rigId so the townId // is always available from the URL path. @@ -377,11 +423,21 @@ app.post('/api/towns/:townId/rigs/:rigId/agents/:agentId/completed', c => handleAgentCompleted(c, c.req.param()) ) ); +app.post('/api/towns/:townId/rigs/:rigId/agents/:agentId/waiting', c => + instrumented(c, 'POST /api/towns/:townId/rigs/:rigId/agents/:agentId/waiting', () => + handleAgentWaiting(c, c.req.param()) + ) +); app.post('/api/towns/:townId/rigs/:rigId/agents/:agentId/checkpoint', c => instrumented(c, 'POST /api/towns/:townId/rigs/:rigId/agents/:agentId/checkpoint', () => handleWriteCheckpoint(c, c.req.param()) ) ); +app.post('/api/towns/:townId/rigs/:rigId/agents/:agentId/eviction-context', c => + instrumented(c, 'POST /api/towns/:townId/rigs/:rigId/agents/:agentId/eviction-context', () => + handleWriteEvictionContext(c, c.req.param()) + ) +); app.get('/api/towns/:townId/rigs/:rigId/agents/:agentId/mail', c => instrumented(c, 'GET /api/towns/:townId/rigs/:rigId/agents/:agentId/mail', () => handleCheckMail(c, c.req.param()) @@ -478,6 +534,27 @@ app.post('/api/towns/:townId/rigs/:rigId/triage/resolve', c => ) ); +// ── Container Eviction 
────────────────────────────────────────────────── +// Container lifecycle endpoints: eviction (on SIGTERM), readiness, and drain +// polling. They use container JWT auth (not kilo user auth), so they must +// be registered before the kiloAuthMiddleware wildcard below. + +app.post('/api/towns/:townId/container-eviction', c => + instrumented(c, 'POST /api/towns/:townId/container-eviction', () => + handleContainerEviction(c, c.req.param()) + ) +); + +app.post('/api/towns/:townId/container-ready', c => + instrumented(c, 'POST /api/towns/:townId/container-ready', () => + handleContainerReady(c, c.req.param()) + ) +); + +app.get('/api/towns/:townId/drain-status', c => + instrumented(c, 'GET /api/towns/:townId/drain-status', () => handleDrainStatus(c, c.req.param())) +); + // ── Kilo User Auth ────────────────────────────────────────────────────── // Validate Kilo user JWT (signed with NEXTAUTH_SECRET) for dashboard/user // routes. Container→worker routes use the agent JWT middleware instead @@ -604,6 +681,32 @@ app.patch('/api/towns/:townId/config', c => instrumented(c, 'PATCH /api/towns/:townId/config', () => handleUpdateTownConfig(c, c.req.param())) ); +// ── Cloudflare Debug ──────────────────────────────────────────────── +// Returns the Town DO ID and the container DO ID for constructing Cloudflare dashboard URLs. +// containerDoId is only returned when the container is actually running, +// so the UI correctly shows a disabled state when the container is stopped. + +app.get('/api/towns/:townId/cloudflare-debug', async c => { + const townId = c.req.param('townId'); + const townDoId = c.env.TOWN.idFromName(townId).toString(); + + // Check actual container runtime state before returning the DO ID. + // idFromName() is deterministic and always returns an ID even when + // no container instance is running — we need to gate on getState(). 
+ const containerStub = getTownContainerStub(c.env, townId); + const containerState = await containerStub.getState(); + const containerRunning = + containerState.status === 'running' || containerState.status === 'healthy'; + const containerDoId = containerRunning + ? c.env.TOWN_CONTAINER.idFromName(townId).toString() + : null; + + return c.json({ + success: true, + data: { townDoId, containerDoId }, + }); +}); + // ── Town Events ───────────────────────────────────────────────────────── app.use('/api/users/:userId/towns/:townId/events', async (c: Context, next) => diff --git a/cloudflare-gastown/src/handlers/rig-agents.handler.ts b/cloudflare-gastown/src/handlers/rig-agents.handler.ts index 24c429a4c..c8d5a7367 100644 --- a/cloudflare-gastown/src/handlers/rig-agents.handler.ts +++ b/cloudflare-gastown/src/handlers/rig-agents.handler.ts @@ -162,6 +162,24 @@ export async function handleAgentCompleted( return c.json(resSuccess({ completed: true })); } +/** + * Called by the container when the mayor's session goes idle (turn done, + * waiting for user input). Transitions the mayor from "working" to + * "waiting" so the alarm drops to the idle cadence and health-check + * pings stop resetting the container's sleepAfter timer. + */ +export async function handleAgentWaiting( + c: Context, + params: { rigId: string; agentId: string } +) { + const body = (await parseJsonBody(c)) as Record; + const firedAt = typeof body?.firedAt === 'number' ? 
body.firedAt : undefined; + const townId = c.get('townId'); + const town = getTownDOStub(c.env, townId); + await town.mayorWaiting(params.agentId, firedAt); + return c.json(resSuccess({ acknowledged: true })); +} + export async function handleWriteCheckpoint( c: Context, params: { rigId: string; agentId: string } @@ -179,6 +197,29 @@ export async function handleWriteCheckpoint( return c.json(resSuccess({ written: true })); } +const EvictionContextBody = z.object({ + branch: z.string(), + agent_name: z.string(), + saved_at: z.string(), +}); + +export async function handleWriteEvictionContext( + c: Context, + params: { rigId: string; agentId: string } +) { + const parsed = EvictionContextBody.safeParse(await parseJsonBody(c)); + if (!parsed.success) { + return c.json( + { success: false, error: 'Invalid request body', issues: parsed.error.issues }, + 400 + ); + } + const townId = c.get('townId'); + const town = getTownDOStub(c.env, townId); + await town.writeBeadEvictionContext(params.agentId, parsed.data); + return c.json(resSuccess({ written: true })); +} + export async function handleCheckMail( c: Context, params: { rigId: string; agentId: string } @@ -194,6 +235,7 @@ const HeartbeatWatermark = z lastEventType: z.string().nullable().optional(), lastEventAt: z.string().nullable().optional(), activeTools: z.array(z.string()).optional(), + containerInstanceId: z.string().optional(), }) .passthrough(); @@ -221,18 +263,23 @@ export async function handleHeartbeat( // No body or invalid JSON — old container format, just touch } - await town.touchAgentHeartbeat( + // touchAgentHeartbeat returns the drain nonce atomically — no + // second RPC needed, which prevents a TOCTOU race where an + // in-flight heartbeat from the old container could observe a nonce + // generated between two separate DO calls. + const { drainNonce } = await town.touchAgentHeartbeat( params.agentId, watermark ? { lastEventType: watermark.lastEventType ?? null, lastEventAt: watermark.lastEventAt ?? 
null, activeTools: watermark.activeTools, + containerInstanceId: watermark.containerInstanceId, } : undefined ); - return c.json(resSuccess({ heartbeat: true })); + return c.json(resSuccess({ heartbeat: true, ...(drainNonce ? { drainNonce } : {}) })); } const GetOrCreateAgentBody = z.object({ diff --git a/cloudflare-gastown/src/handlers/town-eviction.handler.ts b/cloudflare-gastown/src/handlers/town-eviction.handler.ts new file mode 100644 index 000000000..36d7a8e14 --- /dev/null +++ b/cloudflare-gastown/src/handlers/town-eviction.handler.ts @@ -0,0 +1,152 @@ +import type { Context } from 'hono'; +import { extractBearerToken } from '@kilocode/worker-utils'; +import type { GastownEnv } from '../gastown.worker'; +import { getTownDOStub } from '../dos/Town.do'; +import { verifyContainerJWT } from '../util/jwt.util'; +import { resolveSecret } from '../util/secret.util'; +import { resSuccess, resError } from '../util/res.util'; + +/** + * POST /api/towns/:townId/container-eviction + * + * Called by the container's process-manager when the container receives + * SIGTERM. Inserts a `container_eviction` event and sets the draining + * flag so the reconciler stops dispatching new work. + * + * Returns a `drainNonce` that must be presented via `/container-ready` + * to clear the drain flag. This prevents stale heartbeats from the + * dying container from prematurely re-enabling dispatch. + * + * Authenticated with the container-scoped JWT (same token used for all + * container→worker calls). 
+ */ +export async function handleContainerEviction( + c: Context<GastownEnv>, + params: { townId: string } +): Promise<Response> { + // Authenticate with container JWT + const token = extractBearerToken(c.req.header('Authorization')); + if (!token) { + return c.json(resError('Authentication required'), 401); + } + + const secret = await resolveSecret(c.env.GASTOWN_JWT_SECRET); + if (!secret) { + console.error('[town-eviction] failed to resolve GASTOWN_JWT_SECRET'); + return c.json(resError('Internal server error'), 500); + } + + const result = verifyContainerJWT(token, secret); + if (!result.success) { + return c.json(resError(result.error), 401); + } + + // Cross-town guard + if (result.payload.townId !== params.townId) { + return c.json(resError('Cross-town access denied'), 403); + } + + const town = getTownDOStub(c.env, params.townId); + const drainNonce = await town.recordContainerEviction(); + + console.log(`[town-eviction] container eviction recorded for town=${params.townId}`); + return c.json(resSuccess({ acknowledged: true, drainNonce }), 200); +} + +/** + * GET /api/towns/:townId/drain-status + * + * Lightweight endpoint for the container to poll drain state. Used by + * the heartbeat module when no agents are running — the per-agent + * heartbeat loop has nothing to iterate, so a separate check is needed + * to discover the drain nonce and call /container-ready. + * + * Authenticated with the container-scoped JWT. 
+ */ +export async function handleDrainStatus( + c: Context, + params: { townId: string } +): Promise { + const token = extractBearerToken(c.req.header('Authorization')); + if (!token) { + return c.json(resError('Authentication required'), 401); + } + + const secret = await resolveSecret(c.env.GASTOWN_JWT_SECRET); + if (!secret) { + return c.json(resError('Internal server error'), 500); + } + + const result = verifyContainerJWT(token, secret); + if (!result.success) { + return c.json(resError(result.error), 401); + } + + if (result.payload.townId !== params.townId) { + return c.json(resError('Cross-town access denied'), 403); + } + + const town = getTownDOStub(c.env, params.townId); + const [draining, drainNonce] = await Promise.all([town.isDraining(), town.getDrainNonce()]); + + return c.json(resSuccess({ draining, drainNonce }), 200); +} + +/** + * POST /api/towns/:townId/container-ready + * + * Called by the replacement container on startup to signal readiness. + * Clears the draining flag only if the provided `drainNonce` matches + * the nonce generated during the eviction that triggered the drain. + * + * Authenticated with the container-scoped JWT. 
+ */ +export async function handleContainerReady( + c: Context, + params: { townId: string } +): Promise { + const token = extractBearerToken(c.req.header('Authorization')); + if (!token) { + return c.json(resError('Authentication required'), 401); + } + + const secret = await resolveSecret(c.env.GASTOWN_JWT_SECRET); + if (!secret) { + console.error('[container-ready] failed to resolve GASTOWN_JWT_SECRET'); + return c.json(resError('Internal server error'), 500); + } + + const result = verifyContainerJWT(token, secret); + if (!result.success) { + return c.json(resError(result.error), 401); + } + + if (result.payload.townId !== params.townId) { + return c.json(resError('Cross-town access denied'), 403); + } + + let nonce: string | undefined; + try { + const body: unknown = await c.req.json(); + if ( + body && + typeof body === 'object' && + 'nonce' in body && + typeof (body as { nonce: unknown }).nonce === 'string' + ) { + nonce = (body as { nonce: string }).nonce; + } + } catch { + // No body or invalid JSON + } + + if (!nonce) { + return c.json(resError('Missing required field: nonce'), 400); + } + + const town = getTownDOStub(c.env, params.townId); + const cleared = await town.acknowledgeContainerReady(nonce); + + console.log(`[container-ready] town=${params.townId} nonce=${nonce} cleared=${cleared}`); + return c.json(resSuccess({ cleared }), 200); +} diff --git a/cloudflare-gastown/src/trpc/router.ts b/cloudflare-gastown/src/trpc/router.ts index 8169c0d7c..be1d5feaf 100644 --- a/cloudflare-gastown/src/trpc/router.ts +++ b/cloudflare-gastown/src/trpc/router.ts @@ -363,6 +363,22 @@ export const gastownRouter = router({ return verifyTownOwnership(ctx.env, ctx, input.townId); }), + getDrainStatus: gastownProcedure + .input(z.object({ townId: z.string().uuid() })) + .output(z.object({ draining: z.boolean(), drainStartedAt: z.string().nullable() })) + .query(async ({ ctx, input }) => { + await verifyTownOwnership(ctx.env, ctx, input.townId); + const town = 
getTownDOStub(ctx.env, input.townId); + const [draining, startedAt] = await Promise.all([ + town.isDraining() as Promise, + town.getDrainStartedAt() as Promise, + ]); + return { + draining, + drainStartedAt: startedAt ? new Date(startedAt).toISOString() : null, + }; + }), + /** * Check whether the current user is an admin viewing a town they don't own. * Used by the frontend to show an admin banner. @@ -1018,6 +1034,105 @@ export const gastownRouter = router({ } const townStub = getTownDOStub(ctx.env, input.townId); await townStub.forceRefreshContainerToken(); + + // Also remint and push KILOCODE_TOKEN — this is what actually + // authenticates GT tool calls and is the main reason users hit 401s. + // For personal towns the caller IS the owner; for org towns we must + // use the town owner's identity (not the caller's) so that + // git-credentials and other owner-scoped APIs continue to work. + let tokenUser: { id: string; api_token_pepper: string | null }; + if (ownership.type === 'user') { + tokenUser = userFromCtx(ctx); + } else { + // Org town: resolve the owner from the town config + const config = await townStub.getTownConfig(); + const ownerId = config.owner_user_id ?? 
config.created_by_user_id; + if (ownerId && ownerId === ctx.userId) { + // Caller happens to be the owner — use their live context + tokenUser = userFromCtx(ctx); + } else if (ownerId) { + // Different org member — look up the owner's pepper from the DB + if (!ctx.env.HYPERDRIVE) { + throw new TRPCError({ + code: 'INTERNAL_SERVER_ERROR', + message: 'HYPERDRIVE binding not configured — cannot resolve town owner', + }); + } + const { findUserById } = await import('../util/user-db.util'); + const ownerUser = await findUserById(ctx.env.HYPERDRIVE.connectionString, ownerId); + if (!ownerUser) { + throw new TRPCError({ + code: 'INTERNAL_SERVER_ERROR', + message: 'Town owner not found — cannot refresh KILOCODE_TOKEN', + }); + } + tokenUser = { id: ownerUser.id, api_token_pepper: ownerUser.api_token_pepper }; + } else { + // No owner recorded — fall back to caller + tokenUser = userFromCtx(ctx); + } + } + const newKilocodeToken = await mintKilocodeToken(ctx.env, tokenUser); + await townStub.updateTownConfig({ kilocode_token: newKilocodeToken }); + await townStub.syncConfigToContainer(); + }), + + forceRestartContainer: gastownProcedure + .input(z.object({ townId: z.string().uuid() })) + .mutation(async ({ ctx, input }) => { + const ownership = await resolveTownOwnership(ctx.env, ctx, input.townId); + if (ownership.type === 'admin') { + throw new TRPCError({ + code: 'FORBIDDEN', + message: 'Admins cannot restart containers for towns they do not own', + }); + } + if (ownership.type === 'org') { + const townStub = getTownDOStub(ctx.env, input.townId); + const config = await townStub.getTownConfig(); + const membership = getOrgMembership(ctx.orgMemberships, ownership.orgId); + const isOrgOwner = membership?.role === 'owner'; + const isTownCreator = ctx.userId === config.created_by_user_id; + if (!isOrgOwner && !isTownCreator) { + throw new TRPCError({ + code: 'FORBIDDEN', + message: 'Only town creators and org owners can restart containers', + }); + } + } + // stop() sends 
SIGTERM so the container's drain handler can run + // drainAll() — nudging agents to commit/push WIP before exiting. + const containerStub = getTownContainerStub(ctx.env, input.townId); + await containerStub.stop(); + }), + + destroyContainer: gastownProcedure + .input(z.object({ townId: z.string().uuid() })) + .mutation(async ({ ctx, input }) => { + const ownership = await resolveTownOwnership(ctx.env, ctx, input.townId); + if (ownership.type === 'admin') { + throw new TRPCError({ + code: 'FORBIDDEN', + message: 'Admins cannot destroy containers for towns they do not own', + }); + } + if (ownership.type === 'org') { + const townStub = getTownDOStub(ctx.env, input.townId); + const config = await townStub.getTownConfig(); + const membership = getOrgMembership(ctx.orgMemberships, ownership.orgId); + const isOrgOwner = membership?.role === 'owner'; + const isTownCreator = ctx.userId === config.created_by_user_id; + if (!isOrgOwner && !isTownCreator) { + throw new TRPCError({ + code: 'FORBIDDEN', + message: 'Only town creators and org owners can destroy containers', + }); + } + } + // destroy() sends SIGKILL — the container dies immediately with + // no graceful drain. Use when the container is stuck or unresponsive. 
+ const containerStub = getTownContainerStub(ctx.env, input.townId); + await containerStub.destroy(); }), // ── Events ────────────────────────────────────────────────────────── @@ -1367,7 +1482,11 @@ export const gastownRouter = router({ .output(RpcBeadOutput) .mutation(async ({ ctx, input }) => { const townStub = getTownDOStub(ctx.env, input.townId); - return townStub.updateBeadStatus(input.beadId, 'failed', 'admin'); + return townStub.updateBeadStatus(input.beadId, 'failed', 'admin', { + code: 'admin_force_fail', + message: 'Manually failed by admin', + source: 'admin', + }); }), adminGetAlarmStatus: adminProcedure diff --git a/cloudflare-gastown/src/types.ts b/cloudflare-gastown/src/types.ts index f59b3d216..245097035 100644 --- a/cloudflare-gastown/src/types.ts +++ b/cloudflare-gastown/src/types.ts @@ -51,7 +51,7 @@ export type BeadFilter = { export const AgentRole = z.enum(['polecat', 'refinery', 'mayor']); export type AgentRole = z.infer; -export const AgentStatus = z.enum(['idle', 'working', 'stalled', 'dead']); +export const AgentStatus = z.enum(['idle', 'working', 'waiting', 'stalled', 'dead']); export type AgentStatus = z.infer; /** diff --git a/cloudflare-gastown/src/util/analytics.util.ts b/cloudflare-gastown/src/util/analytics.util.ts index 99d1e54c0..af5676b32 100644 --- a/cloudflare-gastown/src/util/analytics.util.ts +++ b/cloudflare-gastown/src/util/analytics.util.ts @@ -38,6 +38,7 @@ export type GastownEventData = { beadId?: string; convoyId?: string; role?: string; // 'polecat' | 'refinery' | 'mayor' + reason?: string; // dispatch failure reason, triage action, etc. beadType?: string; durationMs?: number; value?: number; @@ -79,6 +80,7 @@ export function writeEvent( data.convoyId ?? '', // blob11 data.role ?? '', // blob12 data.beadType ?? '', // blob13 + data.reason ?? '', // blob14 ], doubles: [ data.durationMs ?? 
0, // double1 diff --git a/cloudflare-gastown/test/integration/mayor-idle.test.ts b/cloudflare-gastown/test/integration/mayor-idle.test.ts new file mode 100644 index 000000000..c12527a4a --- /dev/null +++ b/cloudflare-gastown/test/integration/mayor-idle.test.ts @@ -0,0 +1,153 @@ +/** + * Integration tests for the mayor idle (waiting) lifecycle. + * + * Verifies that: + * 1. The "waiting" agent status exists and can be set + * 2. hasActiveWork() returns false when the only agent is a waiting mayor + * 3. The alarm interval drops to idle cadence when the mayor is waiting + * 4. mayorWaiting() transitions a working mayor to waiting + * 5. sendMayorMessage transitions a waiting mayor back to working (when container is alive) + * 6. Token refresh throttle persists across DO eviction (ctx.storage) + */ + +import { env, runDurableObjectAlarm } from 'cloudflare:test'; +import { describe, it, expect, beforeEach } from 'vitest'; + +function getTownStub(name = 'test-town') { + const id = env.TOWN.idFromName(name); + return env.TOWN.get(id); +} + +describe('Mayor idle lifecycle', () => { + let town: ReturnType; + let townName: string; + + beforeEach(async () => { + townName = `mayor-idle-${crypto.randomUUID()}`; + town = getTownStub(townName); + await town.setTownId(townName); + await town.addRig({ + rigId: 'rig-1', + name: 'main-rig', + gitUrl: 'https://github.com/test/repo.git', + defaultBranch: 'main', + }); + }); + + // ── waiting status ────────────────────────────────────────────────── + + describe('waiting status', () => { + it('should allow setting an agent to waiting', async () => { + // Register a mayor agent directly + const agentsBefore = await town.listAgents({ role: 'mayor' }); + expect(agentsBefore.length).toBe(0); + + // ensureMayor creates the agent (won't start container in test env) + const result = await town.ensureMayor(); + expect(result.agentId).toBeTruthy(); + + // Set the agent to working first, then waiting + await 
town.updateAgentStatus(result.agentId, 'working'); + const workingAgent = await town.getAgentAsync(result.agentId); + expect(workingAgent?.status).toBe('working'); + + // mayorWaiting should transition working → waiting + await town.mayorWaiting(result.agentId); + const waitingAgent = await town.getAgentAsync(result.agentId); + expect(waitingAgent?.status).toBe('waiting'); + }); + + it('should not transition non-working agents to waiting', async () => { + const result = await town.ensureMayor(); + + // Agent starts as idle (container not running in test env) + const agent = await town.getAgentAsync(result.agentId); + expect(agent?.status).toBe('idle'); + + // mayorWaiting should NOT change idle to waiting + await town.mayorWaiting(result.agentId); + const afterAgent = await town.getAgentAsync(result.agentId); + expect(afterAgent?.status).toBe('idle'); + }); + + it('should resolve empty agentId to the mayor', async () => { + const result = await town.ensureMayor(); + await town.updateAgentStatus(result.agentId, 'working'); + + // Call with undefined agentId — should resolve to mayor + await town.mayorWaiting(); + const agent = await town.getAgentAsync(result.agentId); + expect(agent?.status).toBe('waiting'); + }); + }); + + // ── hasActiveWork / alarm interval ────────────────────────────────── + + describe('alarm interval with waiting mayor', () => { + it('should use idle alarm interval when mayor is waiting', async () => { + const result = await town.ensureMayor(); + + // Set mayor to working → alarm should be active (5s) + await town.updateAgentStatus(result.agentId, 'working'); + const activeStatus = await town.getAlarmStatus(); + expect(activeStatus.alarm.intervalMs).toBe(5_000); + + // Set mayor to waiting → alarm should drop to idle (5 min) + await town.updateAgentStatus(result.agentId, 'waiting'); + const idleStatus = await town.getAlarmStatus(); + expect(idleStatus.alarm.intervalMs).toBe(5 * 60_000); + }); + + it('should use active alarm interval when a 
polecat is working alongside a waiting mayor', async () => { + const result = await town.ensureMayor(); + await town.updateAgentStatus(result.agentId, 'waiting'); + + // Create a convoy to get a working polecat + const convoy = await town.slingConvoy({ + rigId: 'rig-1', + convoyTitle: 'Test', + tasks: [{ title: 'Task 1' }], + }); + + // Run alarm to assign and dispatch the polecat + await runDurableObjectAlarm(town); + + const bead = await town.getBeadAsync(convoy.beads[0].bead.bead_id); + expect(bead?.assignee_agent_bead_id).toBeTruthy(); + + // Set the polecat to working + if (bead?.assignee_agent_bead_id) { + await town.updateAgentStatus(bead.assignee_agent_bead_id, 'working'); + } + + // Now alarm should be active (polecat is working) + const status = await town.getAlarmStatus(); + expect(status.alarm.intervalMs).toBe(5_000); + }); + }); + + // ── getMayorStatus mapping ───────────────────────────────────────── + + describe('getMayorStatus', () => { + it('should report waiting mayor as active', async () => { + const result = await town.ensureMayor(); + await town.updateAgentStatus(result.agentId, 'waiting'); + + const status = await town.getMayorStatus(); + expect(status.session?.status).toBe('active'); + }); + }); + + // ── getAlarmStatus agent counts ──────────────────────────────────── + + describe('getAlarmStatus agent counts', () => { + it('should include waiting in agent counts', async () => { + const result = await town.ensureMayor(); + await town.updateAgentStatus(result.agentId, 'waiting'); + + const status = await town.getAlarmStatus(); + expect(status.agents.waiting).toBe(1); + expect(status.agents.working).toBe(0); + }); + }); +}); diff --git a/cloudflare-gastown/wrangler.jsonc b/cloudflare-gastown/wrangler.jsonc index 440cf8367..f045ffd72 100644 --- a/cloudflare-gastown/wrangler.jsonc +++ b/cloudflare-gastown/wrangler.jsonc @@ -36,7 +36,7 @@ "class_name": "TownContainerDO", "image": "./container/Dockerfile", "instance_type": "standard-4", - 
"max_instances": 500, + "max_instances": 700, }, ], @@ -117,7 +117,7 @@ // Desktop VM host gateway IP directly so containers can reach // the host's dev servers. "KILO_API_URL": "http://192.168.65.254:3000", - "GASTOWN_API_URL": "http://192.168.65.254:8787", + "GASTOWN_API_URL": "http://192.168.65.254:8803", }, "containers": [ { diff --git a/src/app/(app)/gastown/[townId]/TownOverviewPageClient.tsx b/src/app/(app)/gastown/[townId]/TownOverviewPageClient.tsx index a75006bfb..c27c34e89 100644 --- a/src/app/(app)/gastown/[townId]/TownOverviewPageClient.tsx +++ b/src/app/(app)/gastown/[townId]/TownOverviewPageClient.tsx @@ -36,6 +36,7 @@ import { AreaChart, Area, ResponsiveContainer, Tooltip, XAxis, YAxis } from 'rec import { motion, AnimatePresence } from 'motion/react'; import type { GastownOutputs } from '@/lib/gastown/trpc'; import { AdminViewingBanner } from '@/components/gastown/AdminViewingBanner'; +import { DrainStatusBanner } from '@/components/gastown/DrainStatusBanner'; type Agent = GastownOutputs['gastown']['listAgents'][number]; @@ -208,6 +209,9 @@ export function TownOverviewPageClient({ return (
+
+ +
{/* Top bar — sticky */}
diff --git a/src/app/(app)/gastown/[townId]/settings/TownSettingsPageClient.tsx b/src/app/(app)/gastown/[townId]/settings/TownSettingsPageClient.tsx index 19fe2fade..07bd79ae6 100644 --- a/src/app/(app)/gastown/[townId]/settings/TownSettingsPageClient.tsx +++ b/src/app/(app)/gastown/[townId]/settings/TownSettingsPageClient.tsx @@ -26,6 +26,8 @@ import { Variable, Layers, RefreshCw, + RotateCcw, + Power, Container, User, Key, @@ -238,6 +240,21 @@ export function TownSettingsPageClient({ townId, readOnly = false, organizationI }) ); + const restartContainer = useMutation( + trpc.gastown.forceRestartContainer.mutationOptions({ + onSuccess: () => + toast.success('Container stopping gracefully — agents will save work before exiting'), + onError: err => toast.error(`Container restart failed: ${err.message}`), + }) + ); + + const destroyContainer = useMutation( + trpc.gastown.destroyContainer.mutationOptions({ + onSuccess: () => toast.success('Container destroyed — it will restart on next dispatch'), + onError: err => toast.error(`Container destroy failed: ${err.message}`), + }) + ); + // Local state for form fields const [envVars, setEnvVars] = useState([]); const [githubToken, setGithubToken] = useState(''); @@ -815,7 +832,7 @@ export function TownSettingsPageClient({ townId, readOnly = false, organizationI
+
+
+

Graceful Stop

+

+ Sends SIGTERM — agents save their work before the container exits. It will + restart on the next dispatch cycle. +

+
+ +
+
+
+

Destroy Container

+

+ Sends SIGKILL — the container dies immediately with no graceful drain. Use + when the container is stuck or unresponsive. +

+
+ +
@@ -852,7 +911,7 @@ export function TownSettingsPageClient({ townId, readOnly = false, organizationI disabled={ deleteTown.isPending || deleteOrgTown.isPending || effectiveReadOnly } - variant="destructive" + variant="secondary" size="sm" className="ml-4 shrink-0 gap-1.5" > diff --git a/src/app/admin/gastown/towns/[townId]/ContainerTab.tsx b/src/app/admin/gastown/towns/[townId]/ContainerTab.tsx index 6b7532b72..4f3eaeb53 100644 --- a/src/app/admin/gastown/towns/[townId]/ContainerTab.tsx +++ b/src/app/admin/gastown/towns/[townId]/ContainerTab.tsx @@ -15,8 +15,45 @@ import { DialogTitle, } from '@/components/ui/dialog'; import { Badge } from '@/components/ui/badge'; +import { Tooltip, TooltipContent, TooltipTrigger } from '@/components/ui/tooltip'; +import { ExternalLink } from 'lucide-react'; import { formatDistanceToNow } from 'date-fns'; +function CFLink({ + href, + label, + disabledTooltip, +}: { + href: string | null | undefined; + label: string; + disabledTooltip?: string; +}) { + if (!href) { + return ( + + + + + + + {disabledTooltip && {disabledTooltip}} + + ); + } + + return ( + + ); +} + export function ContainerTab({ townId }: { townId: string }) { const trpc = useTRPC(); @@ -25,6 +62,7 @@ export function ContainerTab({ townId }: { townId: string }) { const healthQuery = useQuery(trpc.admin.gastown.getTownHealth.queryOptions({ townId })); const eventsQuery = useQuery(trpc.admin.gastown.listContainerEvents.queryOptions({ townId })); const configQuery = useQuery(trpc.admin.gastown.getTownConfig.queryOptions({ townId })); + const cfLinksQuery = useQuery(trpc.admin.gastown.getCloudflareLinks.queryOptions({ townId })); const forceRestartMutation = useMutation( trpc.admin.gastown.forceRestartContainer.mutationOptions({ @@ -63,8 +101,47 @@ export function ContainerTab({ townId }: { townId: string }) { ? 'bg-red-500/10 text-red-400 border-red-500/20' : 'bg-gray-500/10 text-gray-400 border-gray-500/20'; + const cfLinks = cfLinksQuery.data; + return (
+ {/* Cloudflare Dashboard */} + + + Cloudflare Dashboard + + + {cfLinksQuery.isLoading && ( +

Loading links…

+ )} + {cfLinksQuery.isError && ( +

+ Failed to load Cloudflare links: {cfLinksQuery.error.message} +

+ )} + {!cfLinksQuery.isLoading && !cfLinksQuery.isError && ( +
+ + + + +
+ )} +
+
+ {/* Health & Actions */} diff --git a/src/components/gastown/ActivityFeed.tsx b/src/components/gastown/ActivityFeed.tsx index f198df655..61dee9017 100644 --- a/src/components/gastown/ActivityFeed.tsx +++ b/src/components/gastown/ActivityFeed.tsx @@ -67,8 +67,17 @@ function eventDescription(event: { return `${rigPrefix}Agent hooked to bead`; case 'unhooked': return `${rigPrefix}Agent unhooked from bead`; - case 'status_changed': - return `${rigPrefix}Status: ${event.old_value ?? '?'} → ${event.new_value ?? '?'}`; + case 'status_changed': { + const desc = `${rigPrefix}Status: ${event.old_value ?? '?'} → ${event.new_value ?? '?'}`; + if (event.new_value === 'failed') { + const fr = event.metadata?.failure_reason; + if (typeof fr === 'object' && fr !== null && 'message' in fr) { + const msg = (fr as Record).message; + if (typeof msg === 'string') return `${desc} — ${msg}`; + } + } + return desc; + } case 'closed': return `${rigPrefix}Bead closed`; case 'escalated': diff --git a/src/components/gastown/DrainStatusBanner.tsx b/src/components/gastown/DrainStatusBanner.tsx new file mode 100644 index 000000000..ccc5cce60 --- /dev/null +++ b/src/components/gastown/DrainStatusBanner.tsx @@ -0,0 +1,83 @@ +'use client'; + +import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query'; +import { useGastownTRPC } from '@/lib/gastown/trpc'; +import { Banner } from '@/components/shared/Banner'; +import { Button } from '@/components/ui/button'; +import { AlertTriangle, Loader2 } from 'lucide-react'; +import { toast } from 'sonner'; + +/** + * Banner displayed when the town's container is draining (graceful restart + * in progress). Shows how long the drain has been running and provides a + * force-shutdown button to expedite the process. + * + * Polls every 5s so it appears/disappears promptly. 
+ */ +export function DrainStatusBanner({ townId }: { townId: string }) { + const trpc = useGastownTRPC(); + const queryClient = useQueryClient(); + + const { data } = useQuery({ + ...trpc.gastown.getDrainStatus.queryOptions({ townId }), + refetchInterval: 5_000, + }); + + const { data: adminAccess } = useQuery(trpc.gastown.checkAdminAccess.queryOptions({ townId })); + const isReadOnly = adminAccess?.isAdminViewing === true; + + const destroyContainer = useMutation( + trpc.gastown.destroyContainer.mutationOptions({ + onSuccess: () => { + toast.success('Container destroyed — it will restart on next dispatch'); + void queryClient.invalidateQueries({ + queryKey: trpc.gastown.getDrainStatus.queryKey({ townId }), + }); + }, + onError: err => toast.error(`Force shutdown failed: ${err.message}`), + }) + ); + + if (!data?.draining) return null; + + const elapsed = data.drainStartedAt + ? Math.round((Date.now() - new Date(data.drainStartedAt).getTime()) / 1000) + : null; + const elapsedLabel = + elapsed !== null + ? elapsed < 60 + ? `${elapsed}s ago` + : `${Math.floor(elapsed / 60)}m ${elapsed % 60}s ago` + : null; + + return ( + + + + + + Container restart in progress + + A graceful shutdown was initiated{elapsedLabel ? ` ${elapsedLabel}` : ''}. Agents are + finishing their current work — no new tasks will be dispatched until the restart + completes. 
+ + + {!isReadOnly && ( + + + + )} + + ); +} diff --git a/src/components/gastown/TerminalBar.module.css b/src/components/gastown/TerminalBar.module.css new file mode 100644 index 000000000..db2e57584 --- /dev/null +++ b/src/components/gastown/TerminalBar.module.css @@ -0,0 +1,12 @@ +.fullscreen { + position: fixed; + inset: 0; + z-index: 100; + padding: 0; + margin: 0; + overflow: hidden; +} + +.fullscreenTransition { + transition: all 0.3s ease-in-out; +} diff --git a/src/components/gastown/TerminalBar.tsx b/src/components/gastown/TerminalBar.tsx index a17242549..ad2af05ca 100644 --- a/src/components/gastown/TerminalBar.tsx +++ b/src/components/gastown/TerminalBar.tsx @@ -33,6 +33,7 @@ import { MessageCircle, } from 'lucide-react'; import { motion, AnimatePresence } from 'motion/react'; +import styles from './TerminalBar.module.css'; type TerminalBarProps = { townId: string; @@ -146,13 +147,60 @@ export function TerminalBar({ townId, basePath: basePathOverride }: TerminalBarP const effectiveActiveId = activeTabId ?? 'mayor'; const activeTab = allTabs.find(t => t.id === effectiveActiveId) ?? 
allTabs[0]; + // ── Fullscreen state (purely local — toggled via double-click / Escape) ── + const [isFullscreen, setLocalFullscreen] = useState(false); + const previousSizeRef = useRef(size); + + const enterFullscreen = useCallback(() => { + previousSizeRef.current = size; + setLocalFullscreen(true); + }, [size]); + + const exitFullscreen = useCallback(() => { + setSize(previousSizeRef.current); + setLocalFullscreen(false); + }, [setSize]); + + const toggleFullscreen = useCallback(() => { + if (isFullscreen) { + exitFullscreen(); + } else { + enterFullscreen(); + } + }, [isFullscreen, enterFullscreen, exitFullscreen]); + + // Escape key exits fullscreen + useEffect(() => { + if (!isFullscreen) return; + const handleKeyDown = (e: KeyboardEvent) => { + if (e.key === 'Escape') { + exitFullscreen(); + } + }; + document.addEventListener('keydown', handleKeyDown); + return () => document.removeEventListener('keydown', handleKeyDown); + }, [isFullscreen, exitFullscreen]); + // ── Resize drag logic ────────────────────────────────────────────── const isDragging = useRef(false); const startPos = useRef(0); const startSize = useRef(0); + const lastClickTime = useRef(0); const onResizePointerDown = useCallback( (e: React.PointerEvent) => { + // Prevent drag on double-click (detected by < 300ms between clicks) + const now = Date.now(); + if (now - lastClickTime.current < 300) { + return; + } + lastClickTime.current = now; + + if (isFullscreen) { + exitFullscreen(); + return; + } + e.preventDefault(); isDragging.current = true; startSize.current = size; @@ -184,9 +232,16 @@ export function TerminalBar({ townId, basePath: basePathOverride }: TerminalBarP document.addEventListener('pointermove', onPointerMove); document.addEventListener('pointerup', onPointerUp); }, - [size, position, horizontal, setSize] + [size, position, horizontal, setSize, isFullscreen, exitFullscreen] ); + // Double-click handler for resize bar + const onResizeDoubleClick = useCallback(() => { + if 
(!collapsed) { + toggleFullscreen(); + } + }, [collapsed, toggleFullscreen]); + // ── Compute container styles ─────────────────────────────────────── const totalSize = collapsed ? COLLAPSED_SIZE : COLLAPSED_SIZE + size; @@ -270,14 +325,18 @@ export function TerminalBar({ townId, basePath: basePathOverride }: TerminalBarP return (
{position === 'bottom' && ( <> {!collapsed && ( -
+
)} @@ -300,6 +359,7 @@ export function TerminalBar({ townId, basePath: basePathOverride }: TerminalBarP size={size} townId={townId} alarmWs={alarmWs} + fullscreen={isFullscreen} /> )} @@ -312,6 +372,7 @@ export function TerminalBar({ townId, basePath: basePathOverride }: TerminalBarP size={size} townId={townId} alarmWs={alarmWs} + fullscreen={isFullscreen} /> {!collapsed && ( -
+
)} @@ -335,7 +400,11 @@ export function TerminalBar({ townId, basePath: basePathOverride }: TerminalBarP {position === 'right' && ( <> {!collapsed && ( -
+
)} @@ -358,6 +427,7 @@ export function TerminalBar({ townId, basePath: basePathOverride }: TerminalBarP size={size} townId={townId} alarmWs={alarmWs} + fullscreen={isFullscreen} /> )} @@ -382,9 +452,14 @@ export function TerminalBar({ townId, basePath: basePathOverride }: TerminalBarP size={size} townId={townId} alarmWs={alarmWs} + fullscreen={isFullscreen} /> {!collapsed && ( -
+
)} @@ -759,6 +834,7 @@ function TerminalContent({ size, townId, alarmWs, + fullscreen, }: { activeTab: TabDef; collapsed: boolean; @@ -766,6 +842,7 @@ function TerminalContent({ size: number; townId: string; alarmWs: AlarmWsResult; + fullscreen?: boolean; }) { if (collapsed) return null; @@ -777,8 +854,8 @@ function TerminalContent({ animate={{ opacity: 1 }} exit={{ opacity: 0 }} transition={{ duration: 0.15 }} - style={horizontal ? { height: size } : { width: size }} - className={`overflow-hidden ${horizontal ? '' : 'h-full'}`} + style={fullscreen ? {} : horizontal ? { height: size } : { width: size }} + className={`overflow-hidden ${horizontal ? '' : 'h-full'} ${fullscreen ? 'h-full' : ''}`} > {activeTab.kind === 'mayor' ? ( @@ -811,6 +888,8 @@ type AlarmStatus = { orphanedHooks: number; }; recentEvents: Array<{ time: string; type: string; message: string }>; + draining?: boolean; + drainStartedAt?: string; }; type AgentStatusEvent = { diff --git a/src/components/gastown/drawer-panels/EventPanel.tsx b/src/components/gastown/drawer-panels/EventPanel.tsx index 247f895fe..bc3c65f9a 100644 --- a/src/components/gastown/drawer-panels/EventPanel.tsx +++ b/src/components/gastown/drawer-panels/EventPanel.tsx @@ -143,6 +143,17 @@ export function EventPanel({ const mailSubject = typeof meta.subject === 'string' ? meta.subject : null; const mailTo = typeof meta.to === 'string' ? meta.to : null; + // Extract structured failure reason from status_changed → failed events + const failureReason = + typeof meta.failure_reason === 'object' && meta.failure_reason !== null + ? 
(meta.failure_reason as { + code?: string; + message?: string; + details?: string; + source?: string; + }) + : null; + // Metadata entries excluding the ones we render in context sections const contextKeys = new Set([ 'title', @@ -153,6 +164,7 @@ export function EventPanel({ 'completedBy', 'subject', 'to', + 'failure_reason', ]); const extraMetadata = Object.entries(meta).filter( ([k, v]) => !contextKeys.has(k) && v !== null && v !== undefined && v !== '' @@ -222,6 +234,19 @@ export function EventPanel({ {event.new_value ?? '—'}
+ {event.new_value === 'failed' && failureReason && ( +
+

{failureReason.message}

+ {failureReason.details && ( +

+ {failureReason.details} +

+ )} +

+ {failureReason.source} / {failureReason.code} +

+
+ )} )} diff --git a/src/lib/config.server.ts b/src/lib/config.server.ts index 20db96d6a..dfd78d00e 100644 --- a/src/lib/config.server.ts +++ b/src/lib/config.server.ts @@ -176,6 +176,13 @@ if (process.env.NODE_ENV === 'production') { } } +// Cloudflare dashboard link construction (admin town inspector) +export const CLOUDFLARE_ACCOUNT_ID = getEnvVariable('CLOUDFLARE_ACCOUNT_ID'); +export const CLOUDFLARE_TOWN_DO_NAMESPACE_ID = getEnvVariable('CLOUDFLARE_TOWN_DO_NAMESPACE_ID'); +export const CLOUDFLARE_CONTAINER_DO_NAMESPACE_ID = getEnvVariable( + 'CLOUDFLARE_CONTAINER_DO_NAMESPACE_ID' +); + // KiloClaw Worker export const KILOCLAW_API_URL = getEnvVariable('KILOCLAW_API_URL') || ''; export const KILOCLAW_INTERNAL_API_SECRET = getEnvVariable('KILOCLAW_INTERNAL_API_SECRET') || ''; diff --git a/src/lib/gastown/types/router.d.ts b/src/lib/gastown/types/router.d.ts index b8160316c..ce9ece484 100644 --- a/src/lib/gastown/types/router.d.ts +++ b/src/lib/gastown/types/router.d.ts @@ -59,6 +59,16 @@ export declare const gastownRouter: import('@trpc/server').TRPCBuiltRouter< }; meta: object; }>; + getDrainStatus: import('@trpc/server').TRPCQueryProcedure<{ + input: { + townId: string; + }; + output: { + draining: boolean; + drainStartedAt: string | null; + }; + meta: object; + }>; deleteTown: import('@trpc/server').TRPCMutationProcedure<{ input: { townId: string; @@ -597,6 +607,20 @@ export declare const gastownRouter: import('@trpc/server').TRPCBuiltRouter< output: void; meta: object; }>; + forceRestartContainer: import('@trpc/server').TRPCMutationProcedure<{ + input: { + townId: string; + }; + output: void; + meta: object; + }>; + destroyContainer: import('@trpc/server').TRPCMutationProcedure<{ + input: { + townId: string; + }; + output: void; + meta: object; + }>; getBeadEvents: import('@trpc/server').TRPCQueryProcedure<{ input: { rigId: string; @@ -1331,6 +1355,16 @@ export declare const wrappedGastownRouter: import('@trpc/server').TRPCBuiltRoute }; meta: object; }>; + 
getDrainStatus: import('@trpc/server').TRPCQueryProcedure<{ + input: { + townId: string; + }; + output: { + draining: boolean; + drainStartedAt: string | null; + }; + meta: object; + }>; deleteTown: import('@trpc/server').TRPCMutationProcedure<{ input: { townId: string; @@ -1869,6 +1903,20 @@ export declare const wrappedGastownRouter: import('@trpc/server').TRPCBuiltRoute output: void; meta: object; }>; + forceRestartContainer: import('@trpc/server').TRPCMutationProcedure<{ + input: { + townId: string; + }; + output: void; + meta: object; + }>; + destroyContainer: import('@trpc/server').TRPCMutationProcedure<{ + input: { + townId: string; + }; + output: void; + meta: object; + }>; getBeadEvents: import('@trpc/server').TRPCQueryProcedure<{ input: { rigId: string; diff --git a/src/lib/gastown/types/schemas.d.ts b/src/lib/gastown/types/schemas.d.ts index d56dbe856..5ccdf157d 100644 --- a/src/lib/gastown/types/schemas.d.ts +++ b/src/lib/gastown/types/schemas.d.ts @@ -1,4 +1,4 @@ -import type { z } from 'zod'; +import { z } from 'zod'; export declare const TownOutput: z.ZodObject< { id: z.ZodString; diff --git a/src/routers/admin/gastown-router.ts b/src/routers/admin/gastown-router.ts index c3dc6925d..3e8284ad5 100644 --- a/src/routers/admin/gastown-router.ts +++ b/src/routers/admin/gastown-router.ts @@ -7,6 +7,9 @@ import { GASTOWN_SERVICE_URL, GASTOWN_CF_ACCESS_CLIENT_ID, GASTOWN_CF_ACCESS_CLIENT_SECRET, + CLOUDFLARE_ACCOUNT_ID, + CLOUDFLARE_TOWN_DO_NAMESPACE_ID, + CLOUDFLARE_CONTAINER_DO_NAMESPACE_ID, } from '@/lib/config.server'; import { generateApiToken } from '@/lib/tokens'; import type { User } from '@kilocode/db/schema'; @@ -429,6 +432,59 @@ export const adminGastownRouter = createTRPCRouter({ ); }), + /** + * Get Cloudflare dashboard links for a town. + * Fetches DO IDs from the gastown worker and constructs CF dashboard URLs. + * Gracefully degrades when env vars are not configured. 
+ */ + getCloudflareLinks: adminProcedure + .input(z.object({ townId: z.string().uuid() })) + .output( /* response contract: workerLogsUrl is always a string; the other three links are nullable */ + z.object({ + workerLogsUrl: z.string(), + containerInstanceUrl: z.string().nullable(), + townDoLogsUrl: z.string().nullable(), + containerDoLogsUrl: z.string().nullable(), + }) + ) + .query(async ({ input, ctx }) => { + const accountId = CLOUDFLARE_ACCOUNT_ID; + if (!accountId) { /* no account ID configured: return early, before gastownGet is called, with a worker logs URL that has no account segment and null for every other link */ + return { + workerLogsUrl: + 'https://dash.cloudflare.com/workers/services/view/gastown/production/logs/live', + containerInstanceUrl: null, + townDoLogsUrl: null, + containerDoLogsUrl: null, + }; + } + + const debugInfo = await gastownGet( + ctx.user, + `/api/towns/${input.townId}/cloudflare-debug`, + z.object({ containerDoId: z.string().nullable(), townDoId: z.string() }) + ).catch(() => null); /* any rejection is swallowed: debugInfo becomes null, so the three DO-derived links below resolve to null while workerLogsUrl is still returned */ + + const townDoNamespaceId = CLOUDFLARE_TOWN_DO_NAMESPACE_ID; + const containerDoNamespaceId = CLOUDFLARE_CONTAINER_DO_NAMESPACE_ID; + + return { + workerLogsUrl: `https://dash.cloudflare.com/${accountId}/workers/services/view/gastown/production/logs/live`, + // containerDoId is only non-null when the container is actually running + containerInstanceUrl: debugInfo?.containerDoId + ? `https://dash.cloudflare.com/${accountId}/workers/containers/app-gastown/instances/${debugInfo.containerDoId}` + : null, + townDoLogsUrl: + townDoNamespaceId && debugInfo /* requires both the town DO namespace ID and a successful debug fetch */ + ? `https://dash.cloudflare.com/${accountId}/workers/durable-objects/view/${townDoNamespaceId}/${debugInfo.townDoId}/logs` + : null, + containerDoLogsUrl: + containerDoNamespaceId && debugInfo?.containerDoId /* requires the container DO namespace ID and a non-empty containerDoId from the debug fetch */ + ? `https://dash.cloudflare.com/${accountId}/workers/durable-objects/view/${containerDoNamespaceId}/${debugInfo.containerDoId}/logs` + : null, + }; + }), + /** * List all beads in a town, with optional filters. * The user-facing tRPC listBeads requires a rigId and verifies ownership.