From 5f86bba79e88ff1ac928a5b7f77c3916cfddbb5c Mon Sep 17 00:00:00 2001 From: syn Date: Fri, 3 Apr 2026 17:13:05 -0500 Subject: [PATCH 1/3] feat(kiloclaw): add stop recovery admin tooling --- kiloclaw/src/config.ts | 7 + .../durable-objects/kiloclaw-instance.test.ts | 321 +++++++++++++- .../kiloclaw-instance/fly-machines.ts | 60 ++- .../kiloclaw-instance/index.ts | 114 ++++- .../durable-objects/kiloclaw-instance/log.ts | 3 + .../kiloclaw-instance/reconcile.ts | 105 +++-- .../kiloclaw-instance/recovery.ts | 400 ++++++++++++++++++ .../kiloclaw-instance/state.ts | 18 + .../kiloclaw-instance/types.ts | 6 + kiloclaw/src/index.test.ts | 78 ++++ kiloclaw/src/index.ts | 23 + kiloclaw/src/routes/platform.ts | 21 + kiloclaw/src/schemas/instance-config.ts | 8 + kiloclaw/src/utils/analytics.ts | 3 + packages/db/src/schema-types.ts | 1 + .../claw/components/InstanceControls.tsx | 21 +- src/app/(app)/claw/components/SettingsTab.tsx | 9 +- src/app/(app)/claw/components/claw.types.ts | 4 + .../KiloclawInstanceDetail.tsx | 192 ++++++++- src/lib/kiloclaw/kiloclaw-internal-client.ts | 16 + src/lib/kiloclaw/types.ts | 12 + .../admin-kiloclaw-instances-router.ts | 37 ++ 22 files changed, 1396 insertions(+), 63 deletions(-) create mode 100644 kiloclaw/src/durable-objects/kiloclaw-instance/recovery.ts diff --git a/kiloclaw/src/config.ts b/kiloclaw/src/config.ts index 24f9ee194..9daf1f426 100644 --- a/kiloclaw/src/config.ts +++ b/kiloclaw/src/config.ts @@ -56,10 +56,14 @@ export const ALARM_INTERVAL_RUNNING_MS = 5 * 60 * 1000; // 5 min export const ALARM_INTERVAL_STARTING_MS = 60 * 1000; // 1 min /** Restarting: wait for restartMachine() background work and reconcile quickly */ export const ALARM_INTERVAL_RESTARTING_MS = 60 * 1000; // 1 min +/** Recovering: relocate onto a new volume/host and reconcile quickly */ +export const ALARM_INTERVAL_RECOVERING_MS = 60 * 1000; // 1 min /** Maximum time to stay in 'starting' before falling back to 'stopped' */ export const STARTING_TIMEOUT_MS = 5 * 60 * 1000; // 5 min /** Maximum time to stay in 'restarting' before surfacing a timeout */ export const RESTARTING_TIMEOUT_MS = 5 * 60 * 1000; // 5 min +/** Maximum time to stay in 'recovering' before surfacing a timeout */ +export const RECOVERING_TIMEOUT_MS = 10 * 60 * 1000; // 10 min /** Destroying: retry pending deletes quickly */ export const ALARM_INTERVAL_DESTROYING_MS = 60 * 1000; // 1 min /** Provisioned/stopped: slow drift detection */ @@ -70,6 +74,9 @@ export const ALARM_JITTER_MS = 60 * 1000; // 0-60s /** Consecutive failed health checks before marking a running instance as stopped */ export const SELF_HEAL_THRESHOLD = 5; +/** Retain a replaced volume for rollback/debug only when snapshots exist. */ +export const PREVIOUS_VOLUME_RETENTION_MS = 7 * 24 * 60 * 60 * 1000; // 7 days + /** Minimum interval between live Fly API checks in getStatus() (30 seconds). * At 10s UI poll interval, only ~1 in 3 polls will hit Fly. 
*/ export const LIVE_CHECK_THROTTLE_MS = 30 * 1000; diff --git a/kiloclaw/src/durable-objects/kiloclaw-instance.test.ts b/kiloclaw/src/durable-objects/kiloclaw-instance.test.ts index 11093d0c2..57c2d0566 100644 --- a/kiloclaw/src/durable-objects/kiloclaw-instance.test.ts +++ b/kiloclaw/src/durable-objects/kiloclaw-instance.test.ts @@ -119,6 +119,7 @@ import { SELF_HEAL_THRESHOLD, STARTING_TIMEOUT_MS, RESTARTING_TIMEOUT_MS, + RECOVERING_TIMEOUT_MS, STALE_PROVISION_THRESHOLD_MS, } from '../config'; @@ -313,6 +314,18 @@ async function seedRestarting( }); } +async function seedRecovering( + storage: ReturnType, + overrides: Record = {} +) { + await seedProvisioned(storage, { + status: 'recovering', + flyMachineId: null, + recoveryStartedAt: Date.now(), + ...overrides, + }); +} + // ============================================================================ // Tests // ============================================================================ @@ -778,7 +791,7 @@ describe('destroy error tracking', () => { }); describe('reconciliation: machine status sync', () => { - it('syncs DO status from running to stopped after threshold failures', async () => { + it('transitions running to recovering after threshold failures and launches recovery once', async () => { const { storage } = createInstance(); await seedRunning(storage); @@ -788,12 +801,41 @@ describe('reconciliation: machine status sync', () => { // Need SELF_HEAL_THRESHOLD consecutive alarms for (let i = 0; i < SELF_HEAL_THRESHOLD; i++) { - const { instance: inst } = createInstance(storage); + const { instance: inst, waitUntilPromises } = createInstance(storage); await inst.alarm(); + if (i === SELF_HEAL_THRESHOLD - 1) { + expect(waitUntilPromises).toHaveLength(1); + } } - expect(storage._store.get('status')).toBe('stopped'); + expect(storage._store.get('status')).toBe('recovering'); expect(storage._store.get('healthCheckFailCount')).toBe(0); + expect(storage._store.get('recoveryStartedAt')).not.toBeNull(); + }); + + it('does not relaunch automatic recovery on subsequent alarms while already recovering', async () => { + const { instance, storage, waitUntilPromises } = createInstance(); + await seedRecovering(storage, { + flyMachineId: 'machine-new', + }); + + await instance.alarm(); + + expect(waitUntilPromises).toHaveLength(0); + }); + + it('does not clean up a pending recovery volume while recovery is still in progress', async () => { + const { instance, storage } = createInstance(); + await seedRecovering(storage, { + pendingRecoveryVolumeId: 'vol-recovery', + recoveryStartedAt: Date.now(), + }); + + await instance.alarm(); + + expect(flyClient.getVolume).not.toHaveBeenCalledWith(expect.anything(), 'vol-recovery'); + expect(flyClient.deleteVolume).not.toHaveBeenCalledWith(expect.anything(), 'vol-recovery'); + expect(storage._store.get('pendingRecoveryVolumeId')).toBe('vol-recovery'); }); it('resets fail count when machine is healthy', async () => { @@ -812,6 +854,279 @@ describe('reconciliation: machine status sync', () => { }); }); +describe('unexpected stop recovery', () => { + it('relocates to a different region, preserves the old volume when snapshots exist, and returns to running', async () => { + const env = createFakeEnv(); + env.FLY_REGION = 'iad,ord,cdg'; + const { storage } = createInstance(undefined, env); + await seedRunning(storage); + + (flyClient.getMachine as Mock).mockResolvedValue({ state: 'stopped', config: {} }); + (flyClient.getVolume as Mock).mockImplementation(async (_config: unknown, volumeId: string) => { + if 
(volumeId === 'vol-1') { + return { + id: 'vol-1', + name: 'sandbox-1', + state: 'detached', + size_gb: 10, + region: 'iad', + attached_machine_id: null, + created_at: new Date().toISOString(), + }; + } + if (volumeId === 'vol-recovery') { + return { + id: 'vol-recovery', + name: 'sandbox-1', + state: 'detached', + size_gb: 10, + region: 'ord', + attached_machine_id: null, + created_at: new Date().toISOString(), + }; + } + throw new Error(`unexpected volume lookup ${volumeId}`); + }); + (flyClient.createVolumeWithFallback as Mock).mockResolvedValue({ + id: 'vol-recovery', + region: 'ord', + }); + (flyClient.createMachine as Mock).mockResolvedValue({ + id: 'machine-recovery', + region: 'ord', + }); + (flyClient.waitForState as Mock).mockResolvedValue(undefined); + (flyClient.destroyMachine as Mock).mockResolvedValue(undefined); + (flyClient.listVolumeSnapshots as Mock).mockResolvedValue([ + { id: 'snap-1', created_at: new Date().toISOString() }, + ]); + + let finalWaitUntilPromises: Promise[] = []; + for (let i = 0; i < SELF_HEAL_THRESHOLD; i++) { + const { instance, waitUntilPromises } = createInstance(storage, env); + await instance.alarm(); + finalWaitUntilPromises = waitUntilPromises; + } + + await Promise.all(finalWaitUntilPromises); + + expect((flyClient.createVolumeWithFallback as Mock).mock.calls[0][2]).toEqual( + expect.arrayContaining(['ord', 'cdg']) + ); + expect((flyClient.createVolumeWithFallback as Mock).mock.calls[0][2]).not.toContain('iad'); + expect(storage._store.get('status')).toBe('running'); + expect(storage._store.get('flyMachineId')).toBe('machine-recovery'); + expect(storage._store.get('flyVolumeId')).toBe('vol-recovery'); + expect(storage._store.get('flyRegion')).toBe('ord'); + expect(storage._store.get('pendingRecoveryVolumeId')).toBeNull(); + expect(storage._store.get('recoveryPreviousVolumeId')).toBe('vol-1'); + expect(storage._store.get('recoveryPreviousVolumeCleanupAfter')).toBeGreaterThan(Date.now()); + expect(flyClient.deleteVolume).not.toHaveBeenCalledWith(expect.anything(), 'vol-1'); + }); + + it('deletes the old volume immediately when it has no snapshots, force-destroying any attached machine first', async () => { + const { instance, storage } = createInstance(); + await seedRecovering(storage, { + flyMachineId: 'machine-1', + flyVolumeId: 'vol-1', + flyRegion: 'iad', + }); + + (flyClient.getVolume as Mock) + .mockResolvedValueOnce({ + id: 'vol-1', + name: 'sandbox-1', + state: 'detached', + size_gb: 10, + region: 'iad', + attached_machine_id: null, + created_at: new Date().toISOString(), + }) + .mockResolvedValueOnce({ + id: 'vol-1', + name: 'sandbox-1', + state: 'attached', + size_gb: 10, + region: 'iad', + attached_machine_id: 'machine-attached', + created_at: new Date().toISOString(), + }); + (flyClient.createVolumeWithFallback as Mock).mockResolvedValue({ + id: 'vol-recovery', + region: 'ord', + }); + (flyClient.createMachine as Mock).mockResolvedValue({ + id: 'machine-recovery', + region: 'ord', + }); + (flyClient.waitForState as Mock).mockResolvedValue(undefined); + (flyClient.destroyMachine as Mock).mockResolvedValue(undefined); + (flyClient.getMachine as Mock).mockResolvedValue({ + config: { metadata: { kiloclaw_sandbox_id: 'sandbox-1' } }, + }); + (flyClient.listVolumeSnapshots as Mock).mockResolvedValue([]); + (flyClient.deleteVolume as Mock).mockResolvedValue(undefined); + + await ( + instance as unknown as { recoverUnexpectedStopInBackground: () => Promise } + ).recoverUnexpectedStopInBackground(); + + 
expect(flyClient.destroyMachine).toHaveBeenCalledWith(expect.anything(), 'machine-1', true); + expect(flyClient.destroyMachine).toHaveBeenCalledWith( + expect.anything(), + 'machine-attached', + true + ); + expect(flyClient.deleteVolume).toHaveBeenCalledWith(expect.anything(), 'vol-1'); + expect(storage._store.get('status')).toBe('running'); + expect(storage._store.get('recoveryPreviousVolumeId')).toBeNull(); + expect(storage._store.get('recoveryPreviousVolumeCleanupAfter')).toBeNull(); + }); + + it('cleans up retained recovery volumes after the TTL, force-destroying any attached machine first', async () => { + const { instance, storage } = createInstance(); + await seedProvisioned(storage, { + status: 'stopped', + flyMachineId: null, + flyVolumeId: 'vol-current', + recoveryPreviousVolumeId: 'vol-old', + recoveryPreviousVolumeCleanupAfter: Date.now() - 1_000, + }); + + (flyClient.getVolume as Mock).mockImplementation(async (_config: unknown, volumeId: string) => { + if (volumeId === 'vol-old') { + return { + id: 'vol-old', + name: 'sandbox-1', + state: 'attached', + size_gb: 10, + region: 'iad', + attached_machine_id: 'machine-old', + created_at: new Date().toISOString(), + }; + } + return { + id: 'vol-current', + name: 'sandbox-1', + state: 'detached', + size_gb: 10, + region: 'iad', + attached_machine_id: null, + created_at: new Date().toISOString(), + }; + }); + (flyClient.destroyMachine as Mock).mockResolvedValue(undefined); + (flyClient.getMachine as Mock).mockResolvedValue({ + config: { metadata: { kiloclaw_sandbox_id: 'sandbox-1' } }, + }); + (flyClient.deleteVolume as Mock).mockResolvedValue(undefined); + + await instance.alarm(); + + expect(flyClient.destroyMachine).toHaveBeenCalledWith(expect.anything(), 'machine-old', true); + expect(flyClient.deleteVolume).toHaveBeenCalledWith(expect.anything(), 'vol-old'); + expect(storage._store.get('recoveryPreviousVolumeId')).toBeNull(); + expect(storage._store.get('recoveryPreviousVolumeCleanupAfter')).toBeNull(); + }); + + it('allows admin cleanup of a retained recovery volume and clears the retention fields', async () => { + const { instance, storage } = createInstance(); + await seedProvisioned(storage, { + status: 'running', + recoveryPreviousVolumeId: 'vol-old', + recoveryPreviousVolumeCleanupAfter: Date.now() + 60_000, + }); + + (flyClient.getVolume as Mock).mockResolvedValue({ + id: 'vol-old', + name: 'sandbox-1', + state: 'attached', + size_gb: 10, + region: 'iad', + attached_machine_id: 'machine-old', + created_at: new Date().toISOString(), + }); + (flyClient.destroyMachine as Mock).mockResolvedValue(undefined); + (flyClient.getMachine as Mock).mockResolvedValue({ + config: { metadata: { kiloclaw_sandbox_id: 'sandbox-1' } }, + }); + (flyClient.deleteVolume as Mock).mockResolvedValue(undefined); + + const result = await instance.cleanupRecoveryPreviousVolume(); + + expect(result).toEqual({ ok: true, deletedVolumeId: 'vol-old' }); + expect(flyClient.destroyMachine).toHaveBeenCalledWith(expect.anything(), 'machine-old', true); + expect(flyClient.deleteVolume).toHaveBeenCalledWith(expect.anything(), 'vol-old'); + expect(storage._store.get('recoveryPreviousVolumeId')).toBeNull(); + expect(storage._store.get('recoveryPreviousVolumeCleanupAfter')).toBeNull(); + }); + + it('times out recovering instances through the shared failure cleanup path', async () => { + const { instance, storage } = createInstance(); + await seedRecovering(storage, { + flyMachineId: 'machine-recovery', + pendingRecoveryVolumeId: 'vol-recovery', + 
recoveryStartedAt: Date.now() - RECOVERING_TIMEOUT_MS - 1_000, + }); + + (flyClient.destroyMachine as Mock).mockResolvedValue(undefined); + (flyClient.getVolume as Mock).mockResolvedValue({ + id: 'vol-recovery', + name: 'sandbox-1', + state: 'detached', + size_gb: 10, + region: 'ord', + attached_machine_id: null, + created_at: new Date().toISOString(), + }); + (flyClient.deleteVolume as Mock).mockResolvedValue(undefined); + + await instance.alarm(); + + expect(flyClient.destroyMachine).toHaveBeenCalledWith( + expect.anything(), + 'machine-recovery', + true + ); + expect(flyClient.deleteVolume).toHaveBeenCalledWith(expect.anything(), 'vol-recovery'); + expect(storage._store.get('status')).toBe('stopped'); + expect(storage._store.get('pendingRecoveryVolumeId')).toBeNull(); + expect(storage._store.get('lastRecoveryErrorMessage')).toBe( + 'unexpected stop recovery timed out' + ); + }); + + it('refuses retained volume cleanup when the attached machine belongs to a different sandbox', async () => { + const { instance, storage } = createInstance(); + await seedProvisioned(storage, { + status: 'running', + recoveryPreviousVolumeId: 'vol-old', + recoveryPreviousVolumeCleanupAfter: Date.now() + 60_000, + }); + + (flyClient.getVolume as Mock).mockResolvedValue({ + id: 'vol-old', + name: 'sandbox-1', + state: 'attached', + size_gb: 10, + region: 'iad', + attached_machine_id: 'machine-other', + created_at: new Date().toISOString(), + }); + (flyClient.getMachine as Mock).mockResolvedValue({ + config: { metadata: { kiloclaw_sandbox_id: 'sandbox-other' } }, + }); + + await expect(instance.cleanupRecoveryPreviousVolume()).rejects.toThrow( + 'Refusing to destroy attached machine machine-other' + ); + expect(flyClient.destroyMachine).not.toHaveBeenCalled(); + expect(flyClient.deleteVolume).not.toHaveBeenCalled(); + expect(storage._store.get('recoveryPreviousVolumeId')).toBe('vol-old'); + expect(storage._store.get('recoveryPreviousVolumeCleanupAfter')).not.toBeNull(); + }); +}); + describe('reconciliation: Fly failed state', () => { it("immediately transitions running → stopped on Fly 'failed' (no threshold)", async () => { const { instance, storage } = createInstance(); diff --git a/kiloclaw/src/durable-objects/kiloclaw-instance/fly-machines.ts b/kiloclaw/src/durable-objects/kiloclaw-instance/fly-machines.ts index 459819f7b..ca1b4bbe2 100644 --- a/kiloclaw/src/durable-objects/kiloclaw-instance/fly-machines.ts +++ b/kiloclaw/src/durable-objects/kiloclaw-instance/fly-machines.ts @@ -13,7 +13,7 @@ import { resolveRegions, evictCapacityRegionFromKV, } from '../regions'; -import { guestFromSize, volumeNameFromSandboxId } from '../machine-config'; +import { guestFromSize, volumeNameFromSandboxId, METADATA_KEY_SANDBOX_ID } from '../machine-config'; import type { InstanceMutableState } from './types'; import { storageUpdate } from './state'; import { reconcileLog, doError, doWarn, toLoggable } from './log'; @@ -263,6 +263,64 @@ export async function createNewMachine( console.log('[DO] Machine started'); } +/** + * Delete a volume, force-destroying any attached machine first. 
+ */ +export async function deleteVolumeAndAttachedMachine( + flyConfig: FlyClientConfig, + volumeId: string, + reason: string, + expectedSandboxId?: string +): Promise { + let attachedMachineId: string | null = null; + + try { + const volume = await fly.getVolume(flyConfig, volumeId); + attachedMachineId = volume.attached_machine_id; + } catch (err) { + if (fly.isFlyNotFound(err)) return; + throw err; + } + + if (attachedMachineId) { + if (expectedSandboxId) { + try { + const attachedMachine = await fly.getMachine(flyConfig, attachedMachineId); + const attachedSandboxId = attachedMachine.config.metadata?.[METADATA_KEY_SANDBOX_ID]; + if (attachedSandboxId !== expectedSandboxId) { + throw new Error( + `Refusing to destroy attached machine ${attachedMachineId} for volume ${volumeId}: expected sandbox ${expectedSandboxId}, found ${attachedSandboxId ?? 'unknown'}` + ); + } + } catch (err) { + if (!fly.isFlyNotFound(err)) throw err; + } + } + + try { + await fly.destroyMachine(flyConfig, attachedMachineId, true); + reconcileLog(reason, 'destroy_machine_for_volume_cleanup', { + fly_app_name: flyConfig.appName, + machine_id: attachedMachineId, + volume_id: volumeId, + }); + } catch (err) { + if (!fly.isFlyNotFound(err)) throw err; + } + } + + try { + await fly.deleteVolume(flyConfig, volumeId); + reconcileLog(reason, 'delete_volume', { + fly_app_name: flyConfig.appName, + volume_id: volumeId, + }); + } catch (err) { + if (fly.isFlyNotFound(err)) return; + throw err; + } +} + /** * Returns the age in ms if this instance is a stale abandoned provision, or null. */ diff --git a/kiloclaw/src/durable-objects/kiloclaw-instance/index.ts b/kiloclaw/src/durable-objects/kiloclaw-instance/index.ts index ae691b1b1..fd2473427 100644 --- a/kiloclaw/src/durable-objects/kiloclaw-instance/index.ts +++ b/kiloclaw/src/durable-objects/kiloclaw-instance/index.ts @@ -43,12 +43,8 @@ import { MAX_CUSTOM_SECRETS, type SecretFieldKey, } from '@kilocode/kiloclaw-secret-catalog'; -import { - parseRegions, - prepareRegions, - resolveRegions, - evictCapacityRegionFromKV, -} from '../regions'; +import { parseRegions, prepareRegions, resolveRegions } from '../regions'; +import * as regionHelpers from '../regions'; import { buildMachineConfig, guestFromSize, volumeNameFromSandboxId } from '../machine-config'; import type { GatewayProcessStatus } from '../gateway-controller-types'; @@ -73,6 +69,15 @@ import { markRestartSuccessful, } from './reconcile'; import { restoreFromPostgres, markDestroyedInPostgresHelper } from './postgres'; +import { + beginUnexpectedStopRecovery, + cleanupPendingRecoveryVolumeIfNeeded, + cleanupRecoveryPreviousVolume, + cleanupRetainedRecoveryVolumeIfDue, + failUnexpectedStopRecovery, + runUnexpectedStopRecoveryInBackground, + type RecoveryRuntime, +} from './recovery'; import { setupDefaultStreamChatChannel, createShortLivedUserToken, @@ -116,6 +121,18 @@ export class KiloClawInstance extends DurableObject { await this.ctx.storage.setAlarm(nextAlarmTime(this.s.status)); } + private recoveryRuntime(): RecoveryRuntime { + return { + env: this.env, + ctx: this.ctx, + state: this.s, + loadState: () => this.loadState(), + persist: patch => this.persist(patch), + scheduleAlarm: () => this.scheduleAlarm(), + emitEvent: data => this.emitEvent(data), + }; + } + /** * Exposed as a private method so tests that cast to access internals * can still call `instance.buildUserEnvVars()`. 
@@ -220,6 +237,9 @@ export class KiloClawInstance extends DurableObject { if (this.s.status === 'destroying') { throw new Error('Cannot provision: instance is being destroyed'); } + if (this.s.status === 'recovering') { + throw new Error('Cannot provision: instance is recovering from an unexpected stop'); + } if (this.s.status === 'restoring') { throw new Error('Cannot provision: instance is restoring from snapshot'); } @@ -261,7 +281,11 @@ export class KiloClawInstance extends DurableObject { regions, { onCapacityError: failedRegion => { - void evictCapacityRegionFromKV(this.env.KV_CLAW_CACHE, this.env, failedRegion); + void regionHelpers.evictCapacityRegionFromKV( + this.env.KV_CLAW_CACHE, + this.env, + failedRegion + ); }, } ); @@ -923,6 +947,9 @@ export class KiloClawInstance extends DurableObject { if (this.s.status === 'destroying') { throw new Error('Cannot start: instance is being destroyed'); } + if (this.s.status === 'recovering') { + throw new Error('Cannot start: instance is recovering from an unexpected stop'); + } if (this.s.status === 'restoring') { throw new Error('Cannot start: instance is restoring from snapshot'); } @@ -1081,7 +1108,11 @@ export class KiloClawInstance extends DurableObject { // createVolumeWithFallback already evicts on volume-creation failures, // but machine-creation 403s bypass that path. if (code === 403 && this.s.flyRegion) { - await evictCapacityRegionFromKV(this.env.KV_CLAW_CACHE, this.env, this.s.flyRegion); + await regionHelpers.evictCapacityRegionFromKV( + this.env.KV_CLAW_CACHE, + this.env, + this.s.flyRegion + ); } await flyMachines.replaceStrandedVolume( @@ -1162,6 +1193,9 @@ export class KiloClawInstance extends DurableObject { if (this.s.status === 'destroying') { throw new Error('Cannot start: instance is being destroyed'); } + if (this.s.status === 'recovering') { + throw new Error('Cannot start: instance is recovering from an unexpected stop'); + } if (this.s.status === 'restarting') { throw new Error('Cannot start: instance is restarting'); } @@ -1225,6 +1259,7 @@ export class KiloClawInstance extends DurableObject { this.s.status === 'provisioned' || this.s.status === 'starting' || this.s.status === 'restarting' || + this.s.status === 'recovering' || this.s.status === 'destroying' || this.s.status === 'restoring' ) { @@ -1271,6 +1306,9 @@ export class KiloClawInstance extends DurableObject { if (this.s.status === 'restoring') { throw new Error('Cannot destroy: instance is restoring from snapshot'); } + if (this.s.status === 'recovering') { + throw new Error('Cannot destroy: instance is recovering from an unexpected stop'); + } const machineUptimeMs = this.s.lastStartedAt ? 
Date.now() - this.s.lastStartedAt : 0; @@ -1523,6 +1561,12 @@ export class KiloClawInstance extends DurableObject { lastStartErrorAt: number | null; lastRestartErrorMessage: string | null; lastRestartErrorAt: number | null; + recoveryStartedAt: number | null; + pendingRecoveryVolumeId: string | null; + recoveryPreviousVolumeId: string | null; + recoveryPreviousVolumeCleanupAfter: number | null; + lastRecoveryErrorMessage: string | null; + lastRecoveryErrorAt: number | null; previousVolumeId: string | null; restoreStartedAt: string | null; pendingRestoreVolumeId: string | null; @@ -1569,6 +1613,12 @@ export class KiloClawInstance extends DurableObject { lastStartErrorAt: this.s.lastStartErrorAt, lastRestartErrorMessage: this.s.lastRestartErrorMessage, lastRestartErrorAt: this.s.lastRestartErrorAt, + recoveryStartedAt: this.s.recoveryStartedAt, + pendingRecoveryVolumeId: this.s.pendingRecoveryVolumeId, + recoveryPreviousVolumeId: this.s.recoveryPreviousVolumeId, + recoveryPreviousVolumeCleanupAfter: this.s.recoveryPreviousVolumeCleanupAfter, + lastRecoveryErrorMessage: this.s.lastRecoveryErrorMessage, + lastRecoveryErrorAt: this.s.lastRecoveryErrorAt, previousVolumeId: this.s.previousVolumeId, restoreStartedAt: this.s.restoreStartedAt, pendingRestoreVolumeId: this.s.pendingRestoreVolumeId, @@ -1619,6 +1669,11 @@ export class KiloClawInstance extends DurableObject { return fly.listVolumeSnapshots(flyConfig, this.s.flyVolumeId); } + async cleanupRecoveryPreviousVolume(): Promise<{ ok: true; deletedVolumeId: string | null }> { + await this.loadState(); + return cleanupRecoveryPreviousVolume(this.recoveryRuntime()); + } + // ── Volume reassociation (admin) ─────────────────────────────────── async listCandidateVolumes(): Promise<{ @@ -1653,6 +1708,9 @@ export class KiloClawInstance extends DurableObject { if (this.s.status === 'restoring') { throw new Error('Cannot reassociate: instance is restoring from snapshot'); } + if (this.s.status === 'recovering') { + throw new Error('Cannot reassociate: instance is recovering from an unexpected stop'); + } if (this.s.status !== 'stopped') { throw new Error('Instance must be stopped before reassociating volume'); } @@ -1730,6 +1788,9 @@ export class KiloClawInstance extends DurableObject { if (this.s.status === 'destroying') { throw new Error('Cannot restore: instance is being destroyed'); } + if (this.s.status === 'recovering') { + throw new Error('Cannot restore: instance is recovering from an unexpected stop'); + } if (this.s.status === 'restoring') { throw new Error('Cannot restore: instance is already restoring'); } @@ -2004,6 +2065,7 @@ export class KiloClawInstance extends DurableObject { this.s.status === 'destroying' || this.s.status === 'starting' || this.s.status === 'restarting' || + this.s.status === 'recovering' || this.s.status === 'restoring' ) { return { success: false, error: 'Instance is busy' }; @@ -2251,6 +2313,10 @@ export class KiloClawInstance extends DurableObject { } } + private async recoverUnexpectedStopInBackground(): Promise { + await runUnexpectedStopRecoveryInBackground(this.recoveryRuntime()); + } + // ======================================================================== // Alarm (reconciliation loop) // ======================================================================== @@ -2281,9 +2347,20 @@ export class KiloClawInstance extends DurableObject { return; } + if (this.s.status !== 'recovering') { + await cleanupPendingRecoveryVolumeIfNeeded( + this.recoveryRuntime(), + 'alarm_pending_recovery_cleanup' + ); + } + 
await cleanupRetainedRecoveryVolumeIfDue( + this.recoveryRuntime(), + 'alarm_retained_recovery_cleanup' + ); + try { const flyConfig = getFlyConfig(this.env, this.s); - await reconcileWithFly( + const reconcileResult = await reconcileWithFly( flyConfig, this.ctx, this.s, @@ -2293,6 +2370,25 @@ export class KiloClawInstance extends DurableObject { (userId, sandboxId) => markDestroyedInPostgresHelper(this.env, this.ctx, this.s, userId, sandboxId) ); + + if (reconcileResult.beginUnexpectedStopRecovery && this.s.status === 'running') { + await beginUnexpectedStopRecovery( + this.recoveryRuntime(), + reconcileResult.beginUnexpectedStopRecovery + ); + this.ctx.waitUntil(this.recoverUnexpectedStopInBackground()); + return; + } + + if (reconcileResult.timedOutUnexpectedStopRecovery && this.s.status === 'recovering') { + await failUnexpectedStopRecovery( + this.recoveryRuntime(), + reconcileResult.timedOutUnexpectedStopRecovery.errorMessage, + 'alarm_timeout' + ); + await this.scheduleAlarm(); + return; + } } catch (err) { doError(this.s, 'reconcileWithFly failed', { error: toLoggable(err), diff --git a/kiloclaw/src/durable-objects/kiloclaw-instance/log.ts b/kiloclaw/src/durable-objects/kiloclaw-instance/log.ts index d816126a8..0d2a6effa 100644 --- a/kiloclaw/src/durable-objects/kiloclaw-instance/log.ts +++ b/kiloclaw/src/durable-objects/kiloclaw-instance/log.ts @@ -3,6 +3,7 @@ import { ALARM_INTERVAL_RUNNING_MS, ALARM_INTERVAL_STARTING_MS, ALARM_INTERVAL_RESTARTING_MS, + ALARM_INTERVAL_RECOVERING_MS, ALARM_INTERVAL_DESTROYING_MS, ALARM_INTERVAL_IDLE_MS, ALARM_JITTER_MS, @@ -214,6 +215,8 @@ export function alarmIntervalForStatus(status: InstanceStatus): number { return ALARM_INTERVAL_STARTING_MS; case 'restarting': return ALARM_INTERVAL_RESTARTING_MS; + case 'recovering': + return ALARM_INTERVAL_RECOVERING_MS; case 'destroying': return ALARM_INTERVAL_DESTROYING_MS; case 'restoring': diff --git a/kiloclaw/src/durable-objects/kiloclaw-instance/reconcile.ts b/kiloclaw/src/durable-objects/kiloclaw-instance/reconcile.ts index 5d490b2aa..b4594ac5a 100644 --- a/kiloclaw/src/durable-objects/kiloclaw-instance/reconcile.ts +++ b/kiloclaw/src/durable-objects/kiloclaw-instance/reconcile.ts @@ -8,6 +8,7 @@ import { STARTUP_TIMEOUT_SECONDS, STARTING_TIMEOUT_MS, RESTARTING_TIMEOUT_MS, + RECOVERING_TIMEOUT_MS, getProactiveRefreshThresholdMs, } from '../../config'; import { ENCRYPTED_ENV_PREFIX, encryptEnvValue } from '../../utils/env-encryption'; @@ -29,6 +30,17 @@ import { mintFreshApiKey } from './config'; import * as gateway from './gateway'; import { writeEvent, eventContextFromState } from '../../utils/analytics'; +export type ReconcileWithFlyResult = { + beginUnexpectedStopRecovery?: { + flyState: 'stopped' | 'created'; + failCount: number; + }; + timedOutUnexpectedStopRecovery?: { + errorMessage: string; + durationMs?: number; + }; +}; + function emitStartFailedEvent( env: { KILOCLAW_AE?: AnalyticsEngineDataset }, state: InstanceMutableState, @@ -59,25 +71,34 @@ export async function reconcileWithFly( triggerDestroy: () => Promise, /** Callback for marking Postgres row destroyed during finalization. 
*/ markDestroyedInPostgres?: (userId: string, sandboxId: string) => Promise -): Promise { +): Promise { const rctx = createReconcileContext(state, env, reason); if (state.status === 'destroying') { await retryPendingDestroy(flyConfig, ctx, state, rctx, markDestroyedInPostgres); - return; + return {}; } if (state.status === 'starting') { await reconcileStarting(flyConfig, ctx, state, env, rctx); - return; + return {}; } if (state.status === 'restarting') { await reconcileRestarting(flyConfig, ctx, state, env, rctx); - return; + return {}; } - const machineReconciled = await reconcileMachine(flyConfig, ctx, state, rctx); + if (state.status === 'recovering') { + return reconcileRecovering(state, rctx); + } + + const { reconciled: machineReconciled, result } = await reconcileMachine( + flyConfig, + ctx, + state, + rctx + ); // Auto-destroy stale provisioned instances const staleAge = staleProvisionAgeMs(state); @@ -91,11 +112,12 @@ export async function reconcileWithFly( state.pendingPostgresMarkOnFinalize = true; await ctx.storage.put(storageUpdate({ pendingPostgresMarkOnFinalize: true })); await triggerDestroy(); - return; + return {}; } await reconcileVolume(flyConfig, ctx, state, env, rctx); await reconcileApiKeyExpiry(flyConfig, ctx, state, env, rctx); + return result; } // ---- API key proactive refresh ---- @@ -617,22 +639,22 @@ async function reconcileMachine( ctx: DurableObjectState, state: InstanceMutableState, rctx: ReconcileContext -): Promise { +): Promise<{ reconciled: boolean; result: ReconcileWithFlyResult }> { if (!state.flyMachineId) { - return attemptMetadataRecovery(flyConfig, ctx, state, rctx); + return { reconciled: await attemptMetadataRecovery(flyConfig, ctx, state, rctx), result: {} }; } try { const machine = await fly.getMachine(flyConfig, state.flyMachineId); - await syncStatusWithFly(ctx, state, machine.state, rctx); + const result = await syncStatusWithFly(ctx, state, machine.state, rctx); await reconcileMachineMount(flyConfig, ctx, state, machine, rctx); - return true; + return { reconciled: true, result }; } catch (err) { if (fly.isFlyNotFound(err)) { await handleMachineGone(ctx, state, rctx); - return true; + return { reconciled: true, result: {} }; } - return false; + return { reconciled: false, result: {} }; } } @@ -744,7 +766,7 @@ export async function syncStatusWithFly( state: InstanceMutableState, flyState: string, rctx: ReconcileContext -): Promise { +): Promise { if (flyState === 'started' && state.status !== 'running') { rctx.log('sync_status', { old_state: state.status, @@ -769,7 +791,7 @@ export async function syncStatusWithFly( lastStartedAt: state.lastStartedAt, }) ); - return; + return {}; } if (flyState === 'started' && state.status === 'running') { @@ -777,7 +799,7 @@ export async function syncStatusWithFly( state.healthCheckFailCount = 0; await ctx.storage.put(storageUpdate({ healthCheckFailCount: 0 })); } - return; + return {}; } // destroyed means the Fly machine is gone — clear the stale ID immediately @@ -801,7 +823,7 @@ export async function syncStatusWithFly( healthCheckFailCount: 0, }) ); - return; + return {}; } // failed is definitively terminal — transition immediately without waiting for @@ -828,7 +850,7 @@ export async function syncStatusWithFly( if (wasStarting) { emitStartFailedEvent(rctx.env, state, 'fly_failed_state', 'fly machine entered failed state'); } - return; + return {}; } if ((flyState === 'stopped' || flyState === 'created') && state.status === 'running') { @@ -836,25 +858,50 @@ export async function syncStatusWithFly( 
await ctx.storage.put(storageUpdate({ healthCheckFailCount: state.healthCheckFailCount })); if (state.healthCheckFailCount >= SELF_HEAL_THRESHOLD) { - rctx.log('mark_stopped', { + rctx.log('unexpected_stop_recovery_trigger', { old_state: 'running', - new_state: 'stopped', + new_state: 'recovering', fly_state: flyState, fail_count: state.healthCheckFailCount, value: SELF_HEAL_THRESHOLD, }); - state.status = 'stopped'; - state.lastStoppedAt = Date.now(); - state.healthCheckFailCount = 0; - await ctx.storage.put( - storageUpdate({ - status: 'stopped', - lastStoppedAt: state.lastStoppedAt, - healthCheckFailCount: 0, - }) - ); + return { + beginUnexpectedStopRecovery: { + flyState, + failCount: state.healthCheckFailCount, + }, + }; } } + + return {}; +} + +async function reconcileRecovering( + state: InstanceMutableState, + rctx: ReconcileContext +): Promise { + const recoveryStartedAt = state.recoveryStartedAt; + const isTimedOut = + recoveryStartedAt !== null && Date.now() - recoveryStartedAt > RECOVERING_TIMEOUT_MS; + + if (!isTimedOut) return {}; + + const errorMessage = 'unexpected stop recovery timed out'; + const durationMs = recoveryStartedAt ? Date.now() - recoveryStartedAt : undefined; + rctx.log('unexpected_stop_recovery_timeout', { + old_state: 'recovering', + new_state: 'stopped', + durationMs, + error: errorMessage, + }); + + return { + timedOutUnexpectedStopRecovery: { + errorMessage, + durationMs, + }, + }; } export async function markRestartSuccessful( diff --git a/kiloclaw/src/durable-objects/kiloclaw-instance/recovery.ts b/kiloclaw/src/durable-objects/kiloclaw-instance/recovery.ts new file mode 100644 index 000000000..73e7ba6b5 --- /dev/null +++ b/kiloclaw/src/durable-objects/kiloclaw-instance/recovery.ts @@ -0,0 +1,400 @@ +import type { PersistedState } from '../../schemas/instance-config'; +import type { KiloClawEnv } from '../../types'; +import * as fly from '../../fly/client'; +import { PREVIOUS_VOLUME_RETENTION_MS } from '../../config'; +import * as regionHelpers from '../regions'; +import { buildMachineConfig, guestFromSize, volumeNameFromSandboxId } from '../machine-config'; +import type { InstanceMutableState } from './types'; +import { getFlyConfig } from './types'; +import { resolveImageTag, getRegistryApp, buildUserEnvVars } from './config'; +import * as gateway from './gateway'; +import * as flyMachines from './fly-machines'; +import { doError, doWarn, toLoggable } from './log'; +import type { KiloClawEventData, KiloClawEventName } from '../../utils/analytics'; + +export type InstanceEventInput = Omit< + KiloClawEventData, + | 'userId' + | 'sandboxId' + | 'delivery' + | 'flyAppName' + | 'flyMachineId' + | 'openclawVersion' + | 'imageTag' + | 'flyRegion' +> & { event: KiloClawEventName }; + +export type RecoveryRuntime = { + env: KiloClawEnv; + ctx: DurableObjectState; + state: InstanceMutableState; + loadState: () => Promise; + persist: (patch: Partial) => Promise; + scheduleAlarm: () => Promise; + emitEvent: (data: InstanceEventInput) => void; +}; + +export async function cleanupRecoveryPreviousVolume( + runtime: RecoveryRuntime +): Promise<{ ok: true; deletedVolumeId: string | null }> { + const { state } = runtime; + const volumeId = state.recoveryPreviousVolumeId; + if (!volumeId) { + return { ok: true, deletedVolumeId: null }; + } + + const flyConfig = getFlyConfig(runtime.env, state); + await flyMachines.deleteVolumeAndAttachedMachine( + flyConfig, + volumeId, + 'admin_recovery_previous_volume_cleanup', + state.sandboxId ?? 
undefined + ); + + state.recoveryPreviousVolumeId = null; + state.recoveryPreviousVolumeCleanupAfter = null; + await runtime.persist({ + recoveryPreviousVolumeId: null, + recoveryPreviousVolumeCleanupAfter: null, + }); + + return { ok: true, deletedVolumeId: volumeId }; +} + +export async function cleanupPendingRecoveryVolumeIfNeeded( + runtime: RecoveryRuntime, + reason: string +): Promise { + const { state } = runtime; + if (!state.pendingRecoveryVolumeId) return; + + const volumeId = state.pendingRecoveryVolumeId; + try { + const flyConfig = getFlyConfig(runtime.env, state); + await flyMachines.deleteVolumeAndAttachedMachine( + flyConfig, + volumeId, + reason, + state.sandboxId ?? undefined + ); + if (state.pendingRecoveryVolumeId === volumeId) { + state.pendingRecoveryVolumeId = null; + await runtime.persist({ pendingRecoveryVolumeId: null }); + } + } catch (err) { + doWarn(state, 'pending recovery volume cleanup failed', { + volumeId, + error: toLoggable(err), + }); + } +} + +export async function cleanupRetainedRecoveryVolumeIfDue( + runtime: RecoveryRuntime, + reason: string +): Promise { + const { state } = runtime; + if (!state.recoveryPreviousVolumeId || state.recoveryPreviousVolumeCleanupAfter === null) { + return; + } + if (Date.now() < state.recoveryPreviousVolumeCleanupAfter) return; + + const volumeId = state.recoveryPreviousVolumeId; + try { + const flyConfig = getFlyConfig(runtime.env, state); + await flyMachines.deleteVolumeAndAttachedMachine( + flyConfig, + volumeId, + reason, + state.sandboxId ?? undefined + ); + if (state.recoveryPreviousVolumeId === volumeId) { + state.recoveryPreviousVolumeId = null; + state.recoveryPreviousVolumeCleanupAfter = null; + await runtime.persist({ + recoveryPreviousVolumeId: null, + recoveryPreviousVolumeCleanupAfter: null, + }); + } + } catch (err) { + doWarn(state, 'retained recovery volume cleanup failed', { + volumeId, + error: toLoggable(err), + }); + } +} + +export async function beginUnexpectedStopRecovery( + runtime: RecoveryRuntime, + trigger: { flyState: 'stopped' | 'created'; failCount: number } +): Promise { + const { state } = runtime; + const recoveryStartedAt = Date.now(); + state.status = 'recovering'; + state.recoveryStartedAt = recoveryStartedAt; + state.healthCheckFailCount = 0; + state.pendingRecoveryVolumeId = null; + state.lastRecoveryErrorMessage = null; + state.lastRecoveryErrorAt = null; + await runtime.persist({ + status: 'recovering', + recoveryStartedAt, + healthCheckFailCount: 0, + pendingRecoveryVolumeId: null, + lastRecoveryErrorMessage: null, + lastRecoveryErrorAt: null, + }); + await runtime.scheduleAlarm(); + runtime.emitEvent({ + event: 'instance.unexpected_stop_recovery_started', + status: 'recovering', + label: `alarm_${trigger.flyState}`, + value: trigger.failCount, + }); +} + +export async function failUnexpectedStopRecovery( + runtime: RecoveryRuntime, + message: string, + label: string +): Promise { + const { state, ctx } = runtime; + const currentStatus = await ctx.storage.get('status'); + if (currentStatus !== 'recovering') return; + + if (state.flyMachineId) { + try { + const flyConfig = getFlyConfig(runtime.env, state); + await fly.destroyMachine(flyConfig, state.flyMachineId, true); + state.flyMachineId = null; + await runtime.persist({ flyMachineId: null }); + } catch (err) { + if (!fly.isFlyNotFound(err)) { + doWarn(state, 'failed to destroy in-progress recovery machine', { + machineId: state.flyMachineId, + error: toLoggable(err), + }); + } + } + } + + await 
cleanupPendingRecoveryVolumeIfNeeded(runtime, 'unexpected_stop_recovery_failed_cleanup'); + + const durationMs = state.recoveryStartedAt ? Date.now() - state.recoveryStartedAt : undefined; + const now = Date.now(); + state.status = 'stopped'; + state.recoveryStartedAt = null; + state.healthCheckFailCount = 0; + state.lastStoppedAt = now; + state.lastRecoveryErrorMessage = message; + state.lastRecoveryErrorAt = now; + await runtime.persist({ + status: 'stopped', + recoveryStartedAt: null, + healthCheckFailCount: 0, + lastStoppedAt: now, + lastRecoveryErrorMessage: message, + lastRecoveryErrorAt: now, + }); + + runtime.emitEvent({ + event: 'instance.unexpected_stop_recovery_failed', + status: 'stopped', + label, + error: message, + durationMs, + }); +} + +export async function runUnexpectedStopRecoveryInBackground( + runtime: RecoveryRuntime +): Promise { + const { state, ctx, env } = runtime; + + try { + await runtime.loadState(); + + const currentStatus = await ctx.storage.get('status'); + if (currentStatus !== 'recovering') return; + + if (!state.userId || !state.sandboxId || !state.flyVolumeId) { + throw new Error('Cannot recover unexpected stop: missing user, sandbox, or volume'); + } + + const flyConfig = getFlyConfig(env, state); + const oldVolumeId = state.flyVolumeId; + let oldVolumeRegion = state.flyRegion; + + try { + const sourceVolume = await fly.getVolume(flyConfig, oldVolumeId); + oldVolumeRegion = sourceVolume.region; + } catch (err) { + if (fly.isFlyNotFound(err)) { + throw new Error(`Cannot recover unexpected stop: source volume ${oldVolumeId} missing`); + } + throw err; + } + + if (state.flyMachineId) { + try { + await fly.destroyMachine(flyConfig, state.flyMachineId, true); + } catch (err) { + if (!fly.isFlyNotFound(err)) throw err; + } + state.flyMachineId = null; + await runtime.persist({ flyMachineId: null }); + } + + let recoveryVolumeId = state.pendingRecoveryVolumeId; + let recoveryVolumeRegion: string | null = null; + + if (recoveryVolumeId) { + try { + const existingRecoveryVolume = await fly.getVolume(flyConfig, recoveryVolumeId); + recoveryVolumeRegion = existingRecoveryVolume.region; + } catch (err) { + if (!fly.isFlyNotFound(err)) throw err; + recoveryVolumeId = null; + recoveryVolumeRegion = null; + state.pendingRecoveryVolumeId = null; + await runtime.persist({ pendingRecoveryVolumeId: null }); + } + } + + if (!recoveryVolumeId) { + const regions = regionHelpers.deprioritizeRegion( + await regionHelpers.resolveRegions(env.KV_CLAW_CACHE, env.FLY_REGION), + oldVolumeRegion + ); + const recoveryVolume = await fly.createVolumeWithFallback( + flyConfig, + { + name: volumeNameFromSandboxId(state.sandboxId), + source_volume_id: oldVolumeId, + compute: guestFromSize(state.machineSize), + }, + regions, + { + onCapacityError: failedRegion => { + void regionHelpers.evictCapacityRegionFromKV(env.KV_CLAW_CACHE, env, failedRegion); + }, + } + ); + recoveryVolumeId = recoveryVolume.id; + recoveryVolumeRegion = recoveryVolume.region; + state.pendingRecoveryVolumeId = recoveryVolumeId; + await runtime.persist({ pendingRecoveryVolumeId: recoveryVolumeId }); + } + + const { envVars, minSecretsVersion } = await buildUserEnvVars(env, ctx, state); + const imageTag = resolveImageTag(state, env); + const identity = { + userId: state.userId, + sandboxId: state.sandboxId, + orgId: state.orgId, + openclawVersion: state.openclawVersion, + imageVariant: state.imageVariant, + }; + const machineConfig = buildMachineConfig( + getRegistryApp(env), + imageTag, + envVars, + 
guestFromSize(state.machineSize), + recoveryVolumeId, + identity + ); + + const previousRegion = state.flyRegion; + state.flyRegion = recoveryVolumeRegion ?? oldVolumeRegion ?? previousRegion; + await flyMachines.createNewMachine( + flyConfig, + ctx, + state, + machineConfig, + minSecretsVersion, + env.FLY_REGION + ); + if (!state.flyMachineId) { + throw new Error('Unexpected stop recovery created no machine'); + } + await gateway.waitForHealthy(state, env, flyConfig.appName, state.flyMachineId); + + let retainedRecoveryVolumeId: string | null = null; + let retainedRecoveryVolumeCleanupAfter: number | null = null; + try { + const snapshots = await fly.listVolumeSnapshots(flyConfig, oldVolumeId); + if (snapshots.length > 0) { + retainedRecoveryVolumeId = oldVolumeId; + retainedRecoveryVolumeCleanupAfter = Date.now() + PREVIOUS_VOLUME_RETENTION_MS; + } else { + try { + await flyMachines.deleteVolumeAndAttachedMachine( + flyConfig, + oldVolumeId, + 'unexpected_stop_recovery_immediate_cleanup', + state.sandboxId ?? undefined + ); + } catch (cleanupErr) { + doWarn(state, 'old recovery source volume cleanup failed; retrying via alarm', { + volumeId: oldVolumeId, + error: toLoggable(cleanupErr), + }); + retainedRecoveryVolumeId = oldVolumeId; + retainedRecoveryVolumeCleanupAfter = Date.now(); + } + } + } catch (snapshotErr) { + doWarn(state, 'failed to inspect old volume snapshots; retaining for TTL cleanup', { + volumeId: oldVolumeId, + error: toLoggable(snapshotErr), + }); + retainedRecoveryVolumeId = oldVolumeId; + retainedRecoveryVolumeCleanupAfter = Date.now() + PREVIOUS_VOLUME_RETENTION_MS; + } + + const postStatus = await ctx.storage.get('status'); + if (postStatus !== 'recovering') return; + + const now = Date.now(); + const durationMs = state.recoveryStartedAt ? now - state.recoveryStartedAt : undefined; + state.status = 'running'; + state.flyVolumeId = recoveryVolumeId; + state.flyRegion = recoveryVolumeRegion ?? state.flyRegion; + state.recoveryStartedAt = null; + state.pendingRecoveryVolumeId = null; + state.recoveryPreviousVolumeId = retainedRecoveryVolumeId; + state.recoveryPreviousVolumeCleanupAfter = retainedRecoveryVolumeCleanupAfter; + state.healthCheckFailCount = 0; + state.lastStartedAt = now; + state.lastRecoveryErrorMessage = null; + state.lastRecoveryErrorAt = null; + await runtime.persist({ + status: 'running', + flyMachineId: state.flyMachineId, + flyVolumeId: recoveryVolumeId, + flyRegion: state.flyRegion, + recoveryStartedAt: null, + pendingRecoveryVolumeId: null, + recoveryPreviousVolumeId: retainedRecoveryVolumeId, + recoveryPreviousVolumeCleanupAfter: retainedRecoveryVolumeCleanupAfter, + healthCheckFailCount: 0, + lastStartedAt: now, + lastRecoveryErrorMessage: null, + lastRecoveryErrorAt: null, + }); + + runtime.emitEvent({ + event: 'instance.unexpected_stop_recovery_succeeded', + status: 'running', + label: 'alarm_relocated', + durationMs, + }); + await runtime.scheduleAlarm(); + } catch (err) { + doError(state, 'unexpected stop recovery failed', { + error: toLoggable(err), + }); + const errorMessage = err instanceof Error ? 
err.message : String(err); + await failUnexpectedStopRecovery(runtime, errorMessage, 'alarm_relocated'); + } +} diff --git a/kiloclaw/src/durable-objects/kiloclaw-instance/state.ts b/kiloclaw/src/durable-objects/kiloclaw-instance/state.ts index d5480c321..8ff30660d 100644 --- a/kiloclaw/src/durable-objects/kiloclaw-instance/state.ts +++ b/kiloclaw/src/durable-objects/kiloclaw-instance/state.ts @@ -41,6 +41,7 @@ export async function loadState(ctx: DurableObjectState, s: InstanceMutableState s.provisionedAt = d.provisionedAt; s.startingAt = d.startingAt; s.restartingAt = d.restartingAt; + s.recoveryStartedAt = d.recoveryStartedAt; s.restartUpdateSent = d.restartUpdateSent; s.lastStartedAt = d.lastStartedAt; s.lastStoppedAt = d.lastStoppedAt; @@ -66,6 +67,11 @@ export async function loadState(ctx: DurableObjectState, s: InstanceMutableState s.lastStartErrorAt = d.lastStartErrorAt; s.lastRestartErrorMessage = d.lastRestartErrorMessage; s.lastRestartErrorAt = d.lastRestartErrorAt; + s.pendingRecoveryVolumeId = d.pendingRecoveryVolumeId; + s.recoveryPreviousVolumeId = d.recoveryPreviousVolumeId; + s.recoveryPreviousVolumeCleanupAfter = d.recoveryPreviousVolumeCleanupAfter; + s.lastRecoveryErrorMessage = d.lastRecoveryErrorMessage; + s.lastRecoveryErrorAt = d.lastRecoveryErrorAt; s.lastBoundMachineRecoveryAt = d.lastBoundMachineRecoveryAt; s.instanceFeatures = d.instanceFeatures; s.gmailNotificationsEnabled = d.gmailNotificationsEnabled; @@ -116,6 +122,7 @@ export function resetMutableState(s: InstanceMutableState): void { s.provisionedAt = null; s.startingAt = null; s.restartingAt = null; + s.recoveryStartedAt = null; s.restartUpdateSent = false; s.lastStartedAt = null; s.lastStoppedAt = null; @@ -141,6 +148,11 @@ export function resetMutableState(s: InstanceMutableState): void { s.lastStartErrorAt = null; s.lastRestartErrorMessage = null; s.lastRestartErrorAt = null; + s.pendingRecoveryVolumeId = null; + s.recoveryPreviousVolumeId = null; + s.recoveryPreviousVolumeCleanupAfter = null; + s.lastRecoveryErrorMessage = null; + s.lastRecoveryErrorAt = null; s.lastBoundMachineRecoveryAt = null; s.instanceFeatures = []; s.gmailNotificationsEnabled = false; @@ -182,6 +194,7 @@ export function createMutableState(): InstanceMutableState { provisionedAt: null, startingAt: null, restartingAt: null, + recoveryStartedAt: null, restartUpdateSent: false, lastStartedAt: null, lastStoppedAt: null, @@ -207,6 +220,11 @@ export function createMutableState(): InstanceMutableState { lastStartErrorAt: null, lastRestartErrorMessage: null, lastRestartErrorAt: null, + pendingRecoveryVolumeId: null, + recoveryPreviousVolumeId: null, + recoveryPreviousVolumeCleanupAfter: null, + lastRecoveryErrorMessage: null, + lastRecoveryErrorAt: null, lastBoundMachineRecoveryAt: null, instanceFeatures: [], gmailNotificationsEnabled: false, diff --git a/kiloclaw/src/durable-objects/kiloclaw-instance/types.ts b/kiloclaw/src/durable-objects/kiloclaw-instance/types.ts index e3deadd71..de5fccc1f 100644 --- a/kiloclaw/src/durable-objects/kiloclaw-instance/types.ts +++ b/kiloclaw/src/durable-objects/kiloclaw-instance/types.ts @@ -57,6 +57,7 @@ export type InstanceMutableState = { provisionedAt: number | null; startingAt: number | null; restartingAt: number | null; + recoveryStartedAt: number | null; restartUpdateSent: boolean; lastStartedAt: number | null; lastStoppedAt: number | null; @@ -82,6 +83,11 @@ export type InstanceMutableState = { lastStartErrorAt: number | null; lastRestartErrorMessage: string | null; lastRestartErrorAt: number | 
null; + pendingRecoveryVolumeId: string | null; + recoveryPreviousVolumeId: string | null; + recoveryPreviousVolumeCleanupAfter: number | null; + lastRecoveryErrorMessage: string | null; + lastRecoveryErrorAt: number | null; lastBoundMachineRecoveryAt: number | null; instanceFeatures: string[]; gmailNotificationsEnabled: boolean; diff --git a/kiloclaw/src/index.test.ts b/kiloclaw/src/index.test.ts index 453ff8c29..209cf4b3d 100644 --- a/kiloclaw/src/index.test.ts +++ b/kiloclaw/src/index.test.ts @@ -4,6 +4,34 @@ vi.mock('cloudflare:workers', () => ({ DurableObject: class FakeDurableObject {}, })); +vi.mock('./routes', async () => { + const { Hono } = await import('hono'); + const empty = new Hono(); + return { + accessGatewayRoutes: empty, + publicRoutes: empty, + api: empty, + kiloclaw: empty, + platform: empty, + controller: empty, + }; +}); + +vi.mock('./auth', () => ({ + authMiddleware: async ( + c: { set: (key: string, value: string) => void }, + next: () => Promise + ) => { + c.set('userId', 'user-1'); + await next(); + }, + internalApiMiddleware: async (_c: unknown, next: () => Promise) => next(), +})); + +vi.mock('./middleware/analytics', () => ({ + timingMiddleware: async (_c: unknown, next: () => Promise) => next(), +})); + vi.mock('./lib/image-version', async () => { const actual = await vi.importActual('./lib/image-version'); return { @@ -46,3 +74,53 @@ describe('platform route env validation', () => { ); }); }); + +describe('proxy recovering state', () => { + it('returns 409 while the instance is recovering', async () => { + const registryStub = { + listInstances: vi.fn().mockResolvedValue([ + { + doKey: 'user-1', + instanceId: '', + assignedUserId: 'user-1', + createdAt: new Date().toISOString(), + destroyedAt: null, + }, + ]), + }; + const instanceStub = { + getStatus: vi.fn().mockResolvedValue({ + userId: 'user-1', + sandboxId: 'sandbox-1', + status: 'recovering', + flyMachineId: 'machine-1', + flyAppName: 'test-app', + }), + }; + + const response = await worker.fetch( + new Request('https://example.com/'), + { + NEXTAUTH_SECRET: 'nextauth-secret', + GATEWAY_TOKEN_SECRET: 'gateway-secret', + FLY_API_TOKEN: 'fly-token', + FLY_APP_NAME: 'test-app', + KILOCLAW_REGISTRY: { + idFromName: vi.fn().mockReturnValue('registry-id'), + get: vi.fn().mockReturnValue(registryStub), + }, + KILOCLAW_INSTANCE: { + idFromName: vi.fn().mockReturnValue('instance-id'), + get: vi.fn().mockReturnValue(instanceStub), + }, + } as never, + { waitUntil: vi.fn() } as never + ); + + expect(response.status).toBe(409); + await expect(response.json()).resolves.toEqual({ + error: 'Instance is recovering', + hint: 'Your instance is being recovered after an unexpected stop. 
Please wait.', + }); + }); +}); diff --git a/kiloclaw/src/index.ts b/kiloclaw/src/index.ts index 8f1b159ee..a0a36c533 100644 --- a/kiloclaw/src/index.ts +++ b/kiloclaw/src/index.ts @@ -219,6 +219,9 @@ app.all('/i/:instanceId/*', async c => { if (status.status === 'restoring') { return c.json({ error: 'Instance is restoring from a snapshot' }, 409); } + if (status.status === 'recovering') { + return c.json({ error: 'Instance is recovering from an unexpected stop' }, 409); + } if (!status.flyMachineId) { return c.json({ error: 'Instance not provisioned' }, 404); } @@ -439,6 +442,8 @@ async function resolveInstance(c: Context): Promise<{ return { machineId: null, flyAppName: null, sandboxId: null, status: 'destroying' }; if (s.status === 'restoring') return { machineId: null, flyAppName: null, sandboxId: null, status: 'restoring' }; + if (s.status === 'recovering') + return { machineId: null, flyAppName: null, sandboxId: null, status: 'recovering' }; return { machineId: s.flyMachineId, @@ -494,6 +499,15 @@ app.all('*', async c => { 409 ); } + if (instanceStatus.status === 'recovering') { + return c.json( + { + error: 'Instance is recovering', + hint: 'This instance is being recovered after an unexpected stop. Please wait.', + }, + 409 + ); + } if (!instanceStatus.flyMachineId) { return c.json( { error: 'Instance not provisioned', hint: 'The instance has no running machine.' }, @@ -600,6 +614,15 @@ app.all('*', async c => { 409 ); } + if (status === 'recovering') { + return c.json( + { + error: 'Instance is recovering', + hint: 'Your instance is being recovered after an unexpected stop. Please wait.', + }, + 409 + ); + } if (!machineId) { return c.json( { diff --git a/kiloclaw/src/routes/platform.ts b/kiloclaw/src/routes/platform.ts index dd9901cc5..b6770aa6b 100644 --- a/kiloclaw/src/routes/platform.ts +++ b/kiloclaw/src/routes/platform.ts @@ -1252,6 +1252,27 @@ platform.post('/force-retry-recovery', async c => { } }); +// POST /api/platform/cleanup-recovery-previous-volume +platform.post('/cleanup-recovery-previous-volume', async c => { + const result = await parseBody(c, UserIdRequestSchema); + if ('error' in result) return result.error; + + const iidResult = parseInstanceIdQuery(c); + if ('error' in iidResult) return iidResult.error; + + try { + const response = await withDORetry( + instanceStubFactory(c.env, result.data.userId, iidResult.instanceId), + stub => stub.cleanupRecoveryPreviousVolume(), + 'cleanupRecoveryPreviousVolume' + ); + return c.json(response); + } catch (err) { + const { message, status } = sanitizeError(err, 'cleanup-recovery-previous-volume'); + return jsonError(message, status); + } +}); + // POST /api/platform/stop platform.post('/stop', async c => { const result = await parseBody(c, UserIdRequestSchema); diff --git a/kiloclaw/src/schemas/instance-config.ts b/kiloclaw/src/schemas/instance-config.ts index 37a6663e4..25818e1bf 100644 --- a/kiloclaw/src/schemas/instance-config.ts +++ b/kiloclaw/src/schemas/instance-config.ts @@ -159,6 +159,7 @@ export const PersistedStateSchema = z.object({ 'provisioned', 'starting', 'restarting', + 'recovering', 'running', 'stopped', 'destroying', @@ -183,6 +184,7 @@ export const PersistedStateSchema = z.object({ provisionedAt: z.number().nullable().default(null), startingAt: z.number().nullable().default(null), restartingAt: z.number().nullable().default(null), + recoveryStartedAt: z.number().nullable().default(null), restartUpdateSent: z.boolean().default(false), lastStartedAt: z.number().nullable().default(null), lastStoppedAt: 
z.number().nullable().default(null), @@ -218,6 +220,11 @@ export const PersistedStateSchema = z.object({ lastStartErrorAt: z.number().nullable().default(null), lastRestartErrorMessage: z.string().nullable().default(null), lastRestartErrorAt: z.number().nullable().default(null), + pendingRecoveryVolumeId: z.string().nullable().default(null), + recoveryPreviousVolumeId: z.string().nullable().default(null), + recoveryPreviousVolumeCleanupAfter: z.number().nullable().default(null), + lastRecoveryErrorMessage: z.string().nullable().default(null), + lastRecoveryErrorAt: z.number().nullable().default(null), // Cooldown for bound-machine recovery during destroy: avoids repeated getVolume // calls when the volume consistently reports no attached machine. lastBoundMachineRecoveryAt: z.number().nullable().default(null), @@ -246,6 +253,7 @@ export const PersistedStateSchema = z.object({ 'provisioned', 'starting', 'restarting', + 'recovering', 'running', 'stopped', 'destroying', diff --git a/kiloclaw/src/utils/analytics.ts b/kiloclaw/src/utils/analytics.ts index 43fbba4f6..06787bcdb 100644 --- a/kiloclaw/src/utils/analytics.ts +++ b/kiloclaw/src/utils/analytics.ts @@ -40,6 +40,9 @@ export type KiloClawEventName = | 'instance.manual_start_failed' | 'instance.crash_recovery_succeeded' | 'instance.crash_recovery_failed' + | 'instance.unexpected_stop_recovery_started' + | 'instance.unexpected_stop_recovery_succeeded' + | 'instance.unexpected_stop_recovery_failed' | 'instance.stopped' | 'instance.restarting' | 'instance.destroy_started' diff --git a/packages/db/src/schema-types.ts b/packages/db/src/schema-types.ts index b254263c9..579053304 100644 --- a/packages/db/src/schema-types.ts +++ b/packages/db/src/schema-types.ts @@ -166,6 +166,7 @@ export type AffiliateProvider = (typeof AffiliateProvider)[keyof typeof Affiliat export const KiloClawAdminAuditAction = z.enum([ 'kiloclaw.volume.reassociate', 'kiloclaw.snapshot.restore', + 'kiloclaw.recovery.cleanup_retained_volume', 'kiloclaw.subscription.update_trial_end', 'kiloclaw.subscription.reset_trial', 'kiloclaw.machine.start', diff --git a/src/app/(app)/claw/components/InstanceControls.tsx b/src/app/(app)/claw/components/InstanceControls.tsx index d2ffcddca..ff5a6ddc6 100644 --- a/src/app/(app)/claw/components/InstanceControls.tsx +++ b/src/app/(app)/claw/components/InstanceControls.tsx @@ -67,6 +67,7 @@ export function InstanceControls({ const isProvisioned = status.status === 'provisioned'; const isStarting = status.status === 'starting'; const isRestarting = status.status === 'restarting'; + const isRecovering = status.status === 'recovering'; const isStopped = status.status === 'stopped'; const isStartable = isStopped || isProvisioned; const isDestroying = status.status === 'destroying'; @@ -249,7 +250,8 @@ export function InstanceControls({ isAutoStarting || isDestroying || isStarting || - isRestarting + isRestarting || + isRecovering } onClick={() => { posthog?.capture('claw_start_instance_clicked', { instance_status: status.status }); @@ -277,7 +279,8 @@ export function InstanceControls({ mutations.restartOpenClaw.isPending || isDestroying || isStarting || - isRestarting + isRestarting || + isRecovering } onClick={() => { posthog?.capture('claw_restart_openclaw_prompted', { @@ -299,7 +302,8 @@ export function InstanceControls({ mutations.restartMachine.isPending || isDestroying || isStarting || - isRestarting + isRestarting || + isRecovering } onClick={() => { posthog?.capture('claw_redeploy_prompted', { instance_status: status.status }); @@ -319,7 
+323,8 @@ export function InstanceControls({ mutations.runDoctor.isPending || isDestroying || isStarting || - isRestarting + isRestarting || + isRecovering } onClick={() => { posthog?.capture('claw_doctor_clicked', { instance_status: status.status }); @@ -333,7 +338,7 @@ export function InstanceControls({ size="sm" variant="outline" className="border-emerald-500/30 text-emerald-400 hover:bg-emerald-500/10 hover:text-emerald-300" - disabled={!isRunning || isDestroying || isStarting || isRestarting} + disabled={!isRunning || isDestroying || isStarting || isRestarting || isRecovering} onClick={() => { posthog?.capture('claw_kilo_run_clicked', { instance_status: status.status }); setKiloRunOpen(true); @@ -439,16 +444,16 @@ export function InstanceControls({ onError: err => toast.error(err.message, { duration: 10000 }), }); }} - disabled={mutations.restartMachine.isPending || isRestarting} + disabled={mutations.restartMachine.isPending || isRestarting || isRecovering} > {mutations.restartMachine.isPending ? ( <> {redeployMode === 'redeploy' ? 'Redeploying' : 'Upgrading'} - ) : isRestarting ? ( + ) : isRestarting || isRecovering ? ( <> - Restarting + {isRecovering ? 'Recovering' : 'Restarting'} ) : ( diff --git a/src/app/(app)/claw/components/SettingsTab.tsx b/src/app/(app)/claw/components/SettingsTab.tsx index e3f005d01..9b74573b0 100644 --- a/src/app/(app)/claw/components/SettingsTab.tsx +++ b/src/app/(app)/claw/components/SettingsTab.tsx @@ -590,6 +590,7 @@ export function SettingsTab({ const isSaving = mutations.patchConfig.isPending; const isStarting = status.status === 'starting'; const isRestarting = status.status === 'restarting'; + const isRecovering = status.status === 'recovering'; const isDestroying = status.status === 'destroying'; const supportsConfigRestore = calverAtLeast( cleanVersion(controllerVersion?.version), @@ -978,7 +979,8 @@ export function SettingsTab({ !isRunning || mutations.restoreConfig.isPending || isDestroying || - isRestarting + isRestarting || + isRecovering } onClick={() => { posthog?.capture('claw_restore_config_clicked', { @@ -1000,7 +1002,7 @@ export function SettingsTab({ )} @@ -1970,6 +2005,95 @@ export function KiloclawInstanceDetail({ instanceId }: { instanceId: string }) { + {data.workerStatus && ( + + +
+
+ Unexpected Stop Recovery + + Alarm-driven relocation state and retained recovery volume cleanup + +
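+ {/* Cleanup action renders only while a previous recovery volume is retained */}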
+ {data.workerStatus.recoveryPreviousVolumeId && ( + + )} +
+
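+ {/* Live banner shown while an alarm-driven relocation is in flight */}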
+ + {isRecovering && ( + + + + The instance is currently relocating after an unexpected Fly stop. + {data.workerStatus.recoveryStartedAt !== null && + ` Recovery began ${formatEpochRelativeTime( + data.workerStatus.recoveryStartedAt + )}.`} + + + )} + +
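+ {/* Persisted recovery fields: started-at, pending/retained volume ids, cleanup TTL, last error */}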
+ + {isRecovering ? : 'Idle'} + + + {formatEpochTime(data.workerStatus.recoveryStartedAt)} + + + + {data.workerStatus.pendingRecoveryVolumeId ?? '—'} + + + + + {data.workerStatus.recoveryPreviousVolumeId ?? '—'} + + + + {data.workerStatus.recoveryPreviousVolumeCleanupAfter !== null ? ( + + {formatEpochRelativeTime( + data.workerStatus.recoveryPreviousVolumeCleanupAfter + )} + + ) : ( + '—' + )} + + + {data.workerStatus.lastRecoveryErrorMessage ? ( + + {data.workerStatus.lastRecoveryErrorMessage} +
+ + {formatEpochTime(data.workerStatus.lastRecoveryErrorAt)} + +
+ ) : ( + '—' + )} +
+
+
+
+ )} + {/* Events */} {data.sandbox_id && ( { setRestoreSnapshotId(snap.id); @@ -2593,6 +2718,53 @@ export function KiloclawInstanceDetail({ instanceId }: { instanceId: string }) { + {} : setCleanupRecoveryVolumeDialogOpen} + > + + + + + Delete Retained Recovery Volume + + + This will permanently delete the retained old recovery volume. + + Volume:{' '} + + {data?.workerStatus?.recoveryPreviousVolumeId ?? '—'} + + + + If Fly still reports an attached machine on that volume, the cleanup will + force-destroy that machine first. + + + + + + + + + + + + {/* Run Doctor Dialog */} { + const params = instanceId ? `?instanceId=${encodeURIComponent(instanceId)}` : ''; + return this.request( + `/api/platform/cleanup-recovery-previous-volume${params}`, + { + method: 'POST', + body: JSON.stringify({ userId }), + }, + { userId } + ); + } + async listCandidateVolumes( userId: string, instanceId?: string diff --git a/src/lib/kiloclaw/types.ts b/src/lib/kiloclaw/types.ts index bf58c35ce..94fd2ab26 100644 --- a/src/lib/kiloclaw/types.ts +++ b/src/lib/kiloclaw/types.ts @@ -130,6 +130,7 @@ export type PlatformStatusResponse = { | 'provisioned' | 'starting' | 'restarting' + | 'recovering' | 'running' | 'stopped' | 'destroying' @@ -189,12 +190,23 @@ export type PlatformDebugStatusResponse = PlatformStatusResponse & { lastDestroyErrorAt: number | null; lastRestartErrorMessage: string | null; lastRestartErrorAt: number | null; + recoveryStartedAt: number | null; + pendingRecoveryVolumeId: string | null; + recoveryPreviousVolumeId: string | null; + recoveryPreviousVolumeCleanupAfter: number | null; + lastRecoveryErrorMessage: string | null; + lastRecoveryErrorAt: number | null; previousVolumeId: string | null; restoreStartedAt: string | null; pendingRestoreVolumeId: string | null; instanceReadyEmailSent: boolean; }; +export type CleanupRecoveryPreviousVolumeResponse = { + ok: true; + deletedVolumeId: string | null; +}; + /** A Fly volume snapshot. 
*/ export type VolumeSnapshot = { id: string; diff --git a/src/routers/admin-kiloclaw-instances-router.ts b/src/routers/admin-kiloclaw-instances-router.ts index 2980b975d..92725c143 100644 --- a/src/routers/admin-kiloclaw-instances-router.ts +++ b/src/routers/admin-kiloclaw-instances-router.ts @@ -760,6 +760,43 @@ export const adminKiloclawInstancesRouter = createTRPCRouter({ } }), + cleanupRecoveryPreviousVolume: adminProcedure + .input(GatewayProcessSchema) + .mutation(async ({ input, ctx }) => { + const fallbackMessage = 'Failed to clean up retained recovery volume'; + try { + const instance = await resolveInstance(input.userId, input.instanceId); + const client = new KiloClawInternalClient(); + const result = await client.cleanupRecoveryPreviousVolume( + input.userId, + workerInstanceId(instance) + ); + + if (result.deletedVolumeId) { + try { + await createKiloClawAdminAuditLog({ + action: 'kiloclaw.recovery.cleanup_retained_volume', + actor_id: ctx.user.id, + actor_email: ctx.user.google_user_email, + actor_name: ctx.user.google_user_name, + target_user_id: input.userId, + message: `Retained recovery volume deleted: ${result.deletedVolumeId}`, + metadata: { + deletedVolumeId: result.deletedVolumeId, + }, + }); + } catch (auditErr) { + console.error('Failed to write audit log for cleanupRecoveryPreviousVolume:', auditErr); + } + } + + return result; + } catch (err) { + console.error('Failed to clean up retained recovery volume for user:', input.userId, err); + throwKiloclawAdminError(err, fallbackMessage); + } + }), + machineStop: adminProcedure.input(GatewayProcessSchema).mutation(async ({ input }) => { const fallbackMessage = 'Failed to stop machine'; try { From 494724cb2a498ab75e0cbfd34a2cae4c85dfc480 Mon Sep 17 00:00:00 2001 From: syn Date: Sun, 5 Apr 2026 18:06:33 -0500 Subject: [PATCH 2/3] fix(kiloclaw): trigger recovery on first stopped alarm --- kiloclaw/src/config.ts | 4 ++-- .../src/durable-objects/kiloclaw-instance.test.ts | 15 +++++++++++++++ .../kiloclaw-instance/reconcile.ts | 6 +++--- .../durable-objects/kiloclaw-instance/recovery.ts | 2 +- 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/kiloclaw/src/config.ts b/kiloclaw/src/config.ts index 9daf1f426..efcac5a92 100644 --- a/kiloclaw/src/config.ts +++ b/kiloclaw/src/config.ts @@ -71,8 +71,8 @@ export const ALARM_INTERVAL_IDLE_MS = 30 * 60 * 1000; // 30 min /** Random jitter added to alarm scheduling to prevent Fly API bursts */ export const ALARM_JITTER_MS = 60 * 1000; // 0-60s -/** Consecutive failed health checks before marking a running instance as stopped */ -export const SELF_HEAL_THRESHOLD = 5; +/** Consecutive Fly `stopped` confirmations before triggering unexpected-stop recovery */ +export const SELF_HEAL_THRESHOLD = 1; /** Retain a replaced volume for rollback/debug only when snapshots exist. 
*/ export const PREVIOUS_VOLUME_RETENTION_MS = 7 * 24 * 60 * 60 * 1000; // 7 days diff --git a/kiloclaw/src/durable-objects/kiloclaw-instance.test.ts b/kiloclaw/src/durable-objects/kiloclaw-instance.test.ts index 57c2d0566..dcb44ac6d 100644 --- a/kiloclaw/src/durable-objects/kiloclaw-instance.test.ts +++ b/kiloclaw/src/durable-objects/kiloclaw-instance.test.ts @@ -813,6 +813,21 @@ describe('reconciliation: machine status sync', () => { expect(storage._store.get('recoveryStartedAt')).not.toBeNull(); }); + it("does not trigger unexpected-stop recovery when Fly reports 'created'", async () => { + const { instance, storage, waitUntilPromises } = createInstance(); + await seedRunning(storage); + + (flyClient.getMachine as Mock).mockResolvedValue({ state: 'created', config: {} }); + (flyClient.getVolume as Mock).mockResolvedValue({ id: 'vol-1' }); + + await instance.alarm(); + + expect(waitUntilPromises).toHaveLength(0); + expect(storage._store.get('status')).toBe('running'); + expect(storage._store.get('healthCheckFailCount')).toBe(0); + expect(storage._store.get('recoveryStartedAt')).toBeUndefined(); + }); + it('does not relaunch automatic recovery on subsequent alarms while already recovering', async () => { const { instance, storage, waitUntilPromises } = createInstance(); await seedRecovering(storage, { diff --git a/kiloclaw/src/durable-objects/kiloclaw-instance/reconcile.ts b/kiloclaw/src/durable-objects/kiloclaw-instance/reconcile.ts index b4594ac5a..cb7670f19 100644 --- a/kiloclaw/src/durable-objects/kiloclaw-instance/reconcile.ts +++ b/kiloclaw/src/durable-objects/kiloclaw-instance/reconcile.ts @@ -32,7 +32,7 @@ import { writeEvent, eventContextFromState } from '../../utils/analytics'; export type ReconcileWithFlyResult = { beginUnexpectedStopRecovery?: { - flyState: 'stopped' | 'created'; + flyState: 'stopped'; failCount: number; }; timedOutUnexpectedStopRecovery?: { @@ -827,7 +827,7 @@ export async function syncStatusWithFly( } // failed is definitively terminal — transition immediately without waiting for - // SELF_HEAL_THRESHOLD consecutive checks like we do for stopped/created. + // the unexpected-stop recovery confirmation path used for stopped. 
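+ // With SELF_HEAL_THRESHOLD now 1, a single confirmed 'stopped' reading starts recovery.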
if (flyState === 'failed' && state.status !== 'stopped') { const wasStarting = state.status === 'starting'; rctx.log('sync_status_failed', { @@ -853,7 +853,7 @@ export async function syncStatusWithFly( return {}; } - if ((flyState === 'stopped' || flyState === 'created') && state.status === 'running') { + if (flyState === 'stopped' && state.status === 'running') { state.healthCheckFailCount++; await ctx.storage.put(storageUpdate({ healthCheckFailCount: state.healthCheckFailCount })); diff --git a/kiloclaw/src/durable-objects/kiloclaw-instance/recovery.ts b/kiloclaw/src/durable-objects/kiloclaw-instance/recovery.ts index 73e7ba6b5..b5e0255dd 100644 --- a/kiloclaw/src/durable-objects/kiloclaw-instance/recovery.ts +++ b/kiloclaw/src/durable-objects/kiloclaw-instance/recovery.ts @@ -126,7 +126,7 @@ export async function cleanupRetainedRecoveryVolumeIfDue( export async function beginUnexpectedStopRecovery( runtime: RecoveryRuntime, - trigger: { flyState: 'stopped' | 'created'; failCount: number } + trigger: { flyState: 'stopped'; failCount: number } ): Promise { const { state } = runtime; const recoveryStartedAt = Date.now(); From 7fd48eb0e9ec79dd89446a1f71a576634b3fafc3 Mon Sep 17 00:00:00 2001 From: syn Date: Sun, 5 Apr 2026 18:53:19 -0500 Subject: [PATCH 3/3] fix(kiloclaw): hand off recovery startup timeouts --- .../durable-objects/kiloclaw-instance.test.ts | 206 ++++++++++++++++-- .../kiloclaw-instance/index.ts | 28 +++ .../kiloclaw-instance/reconcile.ts | 69 +++++- .../kiloclaw-instance/recovery.ts | 204 ++++++++++------- 4 files changed, 406 insertions(+), 101 deletions(-) diff --git a/kiloclaw/src/durable-objects/kiloclaw-instance.test.ts b/kiloclaw/src/durable-objects/kiloclaw-instance.test.ts index dcb44ac6d..a338f6e00 100644 --- a/kiloclaw/src/durable-objects/kiloclaw-instance.test.ts +++ b/kiloclaw/src/durable-objects/kiloclaw-instance.test.ts @@ -939,6 +939,168 @@ describe('unexpected stop recovery', () => { expect(flyClient.deleteVolume).not.toHaveBeenCalledWith(expect.anything(), 'vol-1'); }); + it('hands off to recovering reconcile when replacement machine startup times out', async () => { + const { instance, storage } = createInstance(); + await seedRecovering(storage, { + flyMachineId: 'machine-old', + flyVolumeId: 'vol-1', + flyRegion: 'iad', + }); + + (flyClient.getVolume as Mock).mockImplementation(async (_config: unknown, volumeId: string) => { + if (volumeId === 'vol-1') { + return { + id: 'vol-1', + name: 'sandbox-1', + state: 'detached', + size_gb: 10, + region: 'iad', + attached_machine_id: null, + created_at: new Date().toISOString(), + }; + } + if (volumeId === 'vol-recovery') { + return { + id: 'vol-recovery', + name: 'sandbox-1', + state: 'detached', + size_gb: 10, + region: 'ord', + attached_machine_id: null, + created_at: new Date().toISOString(), + }; + } + throw new Error(`unexpected volume lookup ${volumeId}`); + }); + (flyClient.createVolumeWithFallback as Mock).mockResolvedValue({ + id: 'vol-recovery', + region: 'ord', + }); + (flyClient.createMachine as Mock).mockResolvedValue({ + id: 'machine-recovery', + region: 'ord', + }); + (flyClient.waitForState as Mock).mockRejectedValue( + new FlyApiError( + 'Fly API waitForState(started) failed (408): {"error":"deadline_exceeded"}', + 408, + '{"error":"deadline_exceeded"}' + ) + ); + (flyClient.destroyMachine as Mock).mockResolvedValue(undefined); + + await ( + instance as unknown as { recoverUnexpectedStopInBackground: () => Promise } + ).recoverUnexpectedStopInBackground(); + + 
expect(storage._store.get('status')).toBe('recovering'); + expect(storage._store.get('flyMachineId')).toBe('machine-recovery'); + expect(storage._store.get('flyVolumeId')).toBe('vol-1'); + expect(storage._store.get('pendingRecoveryVolumeId')).toBe('vol-recovery'); + expect(storage._store.get('lastRecoveryErrorMessage')).toBeUndefined(); + expect(flyClient.deleteVolume).not.toHaveBeenCalled(); + }); + + it('stays recovering when reconcile sees the replacement machine still in created state', async () => { + const { instance, storage } = createInstance(); + await seedRecovering(storage, { + flyMachineId: 'machine-recovery', + flyVolumeId: 'vol-1', + pendingRecoveryVolumeId: 'vol-recovery', + recoveryStartedAt: Date.now(), + }); + + (flyClient.getMachine as Mock).mockResolvedValue({ state: 'created', config: {} }); + + await instance.alarm(); + + expect(storage._store.get('status')).toBe('recovering'); + expect(storage._store.get('pendingRecoveryVolumeId')).toBe('vol-recovery'); + expect(flyClient.deleteVolume).not.toHaveBeenCalled(); + }); + + it('completes recovery from alarm reconcile once the replacement machine reaches started', async () => { + const { instance, storage } = createInstance(); + await seedRecovering(storage, { + flyMachineId: 'machine-recovery', + flyVolumeId: 'vol-1', + pendingRecoveryVolumeId: 'vol-recovery', + recoveryStartedAt: Date.now(), + flyRegion: 'iad', + }); + + (flyClient.getMachine as Mock).mockResolvedValue({ state: 'started', config: {} }); + (flyClient.getVolume as Mock).mockImplementation(async (_config: unknown, volumeId: string) => { + if (volumeId === 'vol-recovery') { + return { + id: 'vol-recovery', + name: 'sandbox-1', + state: 'detached', + size_gb: 10, + region: 'ord', + attached_machine_id: null, + created_at: new Date().toISOString(), + }; + } + if (volumeId === 'vol-1') { + return { + id: 'vol-1', + name: 'sandbox-1', + state: 'detached', + size_gb: 10, + region: 'iad', + attached_machine_id: null, + created_at: new Date().toISOString(), + }; + } + throw new Error(`unexpected volume lookup ${volumeId}`); + }); + (flyClient.listVolumeSnapshots as Mock).mockResolvedValue([]); + (flyClient.deleteVolume as Mock).mockResolvedValue(undefined); + + await instance.alarm(); + + expect(storage._store.get('status')).toBe('running'); + expect(storage._store.get('flyMachineId')).toBe('machine-recovery'); + expect(storage._store.get('flyVolumeId')).toBe('vol-recovery'); + expect(storage._store.get('flyRegion')).toBe('ord'); + expect(storage._store.get('pendingRecoveryVolumeId')).toBeNull(); + expect(storage._store.get('recoveryPreviousVolumeId')).toBeNull(); + expect(flyClient.deleteVolume).toHaveBeenCalledWith(expect.anything(), 'vol-1'); + }); + + it('fails recovery through shared cleanup when reconcile sees the replacement machine is gone', async () => { + const { instance, storage } = createInstance(); + await seedRecovering(storage, { + flyMachineId: 'machine-recovery', + flyVolumeId: 'vol-1', + pendingRecoveryVolumeId: 'vol-recovery', + recoveryStartedAt: Date.now(), + }); + + (flyClient.getMachine as Mock).mockRejectedValue(new FlyApiError('not found', 404, '{}')); + (flyClient.destroyMachine as Mock).mockResolvedValue(undefined); + (flyClient.getVolume as Mock).mockResolvedValue({ + id: 'vol-recovery', + name: 'sandbox-1', + state: 'detached', + size_gb: 10, + region: 'ord', + attached_machine_id: null, + created_at: new Date().toISOString(), + }); + (flyClient.deleteVolume as Mock).mockResolvedValue(undefined); + + await instance.alarm(); + + 
expect(storage._store.get('status')).toBe('stopped'); + expect(storage._store.get('pendingRecoveryVolumeId')).toBeNull(); + expect(storage._store.get('lastRecoveryErrorMessage')).toBe( + 'unexpected stop recovery replacement machine disappeared' + ); + expect(flyClient.deleteVolume).toHaveBeenCalledWith(expect.anything(), 'vol-recovery'); + }); + it('deletes the old volume immediately when it has no snapshots, force-destroying any attached machine first', async () => { const { instance, storage } = createInstance(); await seedRecovering(storage, { @@ -947,25 +1109,31 @@ describe('unexpected stop recovery', () => { flyRegion: 'iad', }); - (flyClient.getVolume as Mock) - .mockResolvedValueOnce({ - id: 'vol-1', - name: 'sandbox-1', - state: 'detached', - size_gb: 10, - region: 'iad', - attached_machine_id: null, - created_at: new Date().toISOString(), - }) - .mockResolvedValueOnce({ - id: 'vol-1', - name: 'sandbox-1', - state: 'attached', - size_gb: 10, - region: 'iad', - attached_machine_id: 'machine-attached', - created_at: new Date().toISOString(), - }); + (flyClient.getVolume as Mock).mockImplementation(async (_config: unknown, volumeId: string) => { + if (volumeId === 'vol-1') { + return { + id: 'vol-1', + name: 'sandbox-1', + state: 'attached', + size_gb: 10, + region: 'iad', + attached_machine_id: 'machine-attached', + created_at: new Date().toISOString(), + }; + } + if (volumeId === 'vol-recovery') { + return { + id: 'vol-recovery', + name: 'sandbox-1', + state: 'detached', + size_gb: 10, + region: 'ord', + attached_machine_id: null, + created_at: new Date().toISOString(), + }; + } + throw new Error(`unexpected volume lookup ${volumeId}`); + }); (flyClient.createVolumeWithFallback as Mock).mockResolvedValue({ id: 'vol-recovery', region: 'ord', diff --git a/kiloclaw/src/durable-objects/kiloclaw-instance/index.ts b/kiloclaw/src/durable-objects/kiloclaw-instance/index.ts index fd2473427..bca78c36f 100644 --- a/kiloclaw/src/durable-objects/kiloclaw-instance/index.ts +++ b/kiloclaw/src/durable-objects/kiloclaw-instance/index.ts @@ -72,6 +72,7 @@ import { restoreFromPostgres, markDestroyedInPostgresHelper } from './postgres'; import { beginUnexpectedStopRecovery, cleanupPendingRecoveryVolumeIfNeeded, + completeUnexpectedStopRecovery, cleanupRecoveryPreviousVolume, cleanupRetainedRecoveryVolumeIfDue, failUnexpectedStopRecovery, @@ -2380,6 +2381,33 @@ export class KiloClawInstance extends DurableObject { return; } + if (reconcileResult.completeUnexpectedStopRecovery && this.s.status === 'recovering') { + try { + await completeUnexpectedStopRecovery(this.recoveryRuntime()); + } catch (err) { + doError(this.s, 'completeUnexpectedStopRecovery failed during alarm reconcile', { + error: toLoggable(err), + }); + const errorMessage = err instanceof Error ? 
err.message : String(err); + await failUnexpectedStopRecovery( + this.recoveryRuntime(), + errorMessage, + 'alarm_reconcile_complete' + ); + } + return; + } + + if (reconcileResult.failedUnexpectedStopRecovery && this.s.status === 'recovering') { + await failUnexpectedStopRecovery( + this.recoveryRuntime(), + reconcileResult.failedUnexpectedStopRecovery.errorMessage, + reconcileResult.failedUnexpectedStopRecovery.label + ); + await this.scheduleAlarm(); + return; + } + if (reconcileResult.timedOutUnexpectedStopRecovery && this.s.status === 'recovering') { await failUnexpectedStopRecovery( this.recoveryRuntime(), diff --git a/kiloclaw/src/durable-objects/kiloclaw-instance/reconcile.ts b/kiloclaw/src/durable-objects/kiloclaw-instance/reconcile.ts index cb7670f19..5eaef0830 100644 --- a/kiloclaw/src/durable-objects/kiloclaw-instance/reconcile.ts +++ b/kiloclaw/src/durable-objects/kiloclaw-instance/reconcile.ts @@ -35,6 +35,11 @@ export type ReconcileWithFlyResult = { flyState: 'stopped'; failCount: number; }; + completeUnexpectedStopRecovery?: true; + failedUnexpectedStopRecovery?: { + errorMessage: string; + label: string; + }; timedOutUnexpectedStopRecovery?: { errorMessage: string; durationMs?: number; @@ -90,7 +95,7 @@ export async function reconcileWithFly( } if (state.status === 'recovering') { - return reconcileRecovering(state, rctx); + return reconcileRecovering(flyConfig, state, rctx); } const { reconciled: machineReconciled, result } = await reconcileMachine( @@ -878,6 +883,7 @@ export async function syncStatusWithFly( } async function reconcileRecovering( + flyConfig: FlyClientConfig, state: InstanceMutableState, rctx: ReconcileContext ): Promise { @@ -885,6 +891,67 @@ async function reconcileRecovering( const isTimedOut = recoveryStartedAt !== null && Date.now() - recoveryStartedAt > RECOVERING_TIMEOUT_MS; + if (state.flyMachineId) { + try { + const machine = await fly.getMachine(flyConfig, state.flyMachineId); + + if (machine.state === 'started') { + rctx.log('unexpected_stop_recovery_machine_started', { + machine_id: state.flyMachineId, + old_state: 'recovering', + new_state: 'running', + }); + return { completeUnexpectedStopRecovery: true }; + } + + if ( + machine.state === 'stopped' || + machine.state === 'failed' || + machine.state === 'destroyed' + ) { + const errorMessage = `unexpected stop recovery replacement machine entered ${machine.state}`; + rctx.log('unexpected_stop_recovery_terminal_machine_state', { + machine_id: state.flyMachineId, + fly_state: machine.state, + error: errorMessage, + old_state: 'recovering', + new_state: 'stopped', + }); + return { + failedUnexpectedStopRecovery: { + errorMessage, + label: `alarm_${machine.state}`, + }, + }; + } + + rctx.log('unexpected_stop_recovery_waiting_for_start', { + machine_id: state.flyMachineId, + fly_state: machine.state, + }); + } catch (err) { + if (fly.isFlyNotFound(err)) { + const errorMessage = 'unexpected stop recovery replacement machine disappeared'; + rctx.log('unexpected_stop_recovery_machine_gone', { + machine_id: state.flyMachineId, + error: errorMessage, + old_state: 'recovering', + new_state: 'stopped', + }); + return { + failedUnexpectedStopRecovery: { + errorMessage, + label: 'alarm_machine_gone', + }, + }; + } + + doError(state, 'reconcileRecovering: transient error checking replacement machine', { + error: toLoggable(err), + }); + } + } + if (!isTimedOut) return {}; const errorMessage = 'unexpected stop recovery timed out'; diff --git a/kiloclaw/src/durable-objects/kiloclaw-instance/recovery.ts 
b/kiloclaw/src/durable-objects/kiloclaw-instance/recovery.ts index b5e0255dd..7d2eff667 100644 --- a/kiloclaw/src/durable-objects/kiloclaw-instance/recovery.ts +++ b/kiloclaw/src/durable-objects/kiloclaw-instance/recovery.ts @@ -206,6 +206,101 @@ export async function failUnexpectedStopRecovery( }); } +export async function completeUnexpectedStopRecovery(runtime: RecoveryRuntime): Promise { + const { state, ctx, env } = runtime; + + if (!state.flyMachineId) { + throw new Error('Cannot complete unexpected stop recovery: missing replacement machine'); + } + if (!state.flyVolumeId) { + throw new Error('Cannot complete unexpected stop recovery: missing source volume'); + } + if (!state.pendingRecoveryVolumeId) { + throw new Error('Cannot complete unexpected stop recovery: missing replacement volume'); + } + + const flyConfig = getFlyConfig(env, state); + const oldVolumeId = state.flyVolumeId; + const recoveryVolumeId = state.pendingRecoveryVolumeId; + + const recoveryVolume = await fly.getVolume(flyConfig, recoveryVolumeId); + const recoveryVolumeRegion = recoveryVolume.region; + + await gateway.waitForHealthy(state, env, flyConfig.appName, state.flyMachineId); + + let retainedRecoveryVolumeId: string | null = null; + let retainedRecoveryVolumeCleanupAfter: number | null = null; + try { + const snapshots = await fly.listVolumeSnapshots(flyConfig, oldVolumeId); + if (snapshots.length > 0) { + retainedRecoveryVolumeId = oldVolumeId; + retainedRecoveryVolumeCleanupAfter = Date.now() + PREVIOUS_VOLUME_RETENTION_MS; + } else { + try { + await flyMachines.deleteVolumeAndAttachedMachine( + flyConfig, + oldVolumeId, + 'unexpected_stop_recovery_immediate_cleanup', + state.sandboxId ?? undefined + ); + } catch (cleanupErr) { + doWarn(state, 'old recovery source volume cleanup failed; retrying via alarm', { + volumeId: oldVolumeId, + error: toLoggable(cleanupErr), + }); + retainedRecoveryVolumeId = oldVolumeId; + retainedRecoveryVolumeCleanupAfter = Date.now(); + } + } + } catch (snapshotErr) { + doWarn(state, 'failed to inspect old volume snapshots; retaining for TTL cleanup', { + volumeId: oldVolumeId, + error: toLoggable(snapshotErr), + }); + retainedRecoveryVolumeId = oldVolumeId; + retainedRecoveryVolumeCleanupAfter = Date.now() + PREVIOUS_VOLUME_RETENTION_MS; + } + + const postStatus = await ctx.storage.get('status'); + if (postStatus !== 'recovering') return; + + const now = Date.now(); + const durationMs = state.recoveryStartedAt ? now - state.recoveryStartedAt : undefined; + state.status = 'running'; + state.flyVolumeId = recoveryVolumeId; + state.flyRegion = recoveryVolumeRegion ?? 
state.flyRegion; + state.recoveryStartedAt = null; + state.pendingRecoveryVolumeId = null; + state.recoveryPreviousVolumeId = retainedRecoveryVolumeId; + state.recoveryPreviousVolumeCleanupAfter = retainedRecoveryVolumeCleanupAfter; + state.healthCheckFailCount = 0; + state.lastStartedAt = now; + state.lastRecoveryErrorMessage = null; + state.lastRecoveryErrorAt = null; + await runtime.persist({ + status: 'running', + flyMachineId: state.flyMachineId, + flyVolumeId: recoveryVolumeId, + flyRegion: state.flyRegion, + recoveryStartedAt: null, + pendingRecoveryVolumeId: null, + recoveryPreviousVolumeId: retainedRecoveryVolumeId, + recoveryPreviousVolumeCleanupAfter: retainedRecoveryVolumeCleanupAfter, + healthCheckFailCount: 0, + lastStartedAt: now, + lastRecoveryErrorMessage: null, + lastRecoveryErrorAt: null, + }); + + runtime.emitEvent({ + event: 'instance.unexpected_stop_recovery_succeeded', + status: 'running', + label: 'alarm_relocated', + durationMs, + }); + await runtime.scheduleAlarm(); +} + export async function runUnexpectedStopRecoveryInBackground( runtime: RecoveryRuntime ): Promise { @@ -306,90 +401,37 @@ export async function runUnexpectedStopRecoveryInBackground( const previousRegion = state.flyRegion; state.flyRegion = recoveryVolumeRegion ?? oldVolumeRegion ?? previousRegion; - await flyMachines.createNewMachine( - flyConfig, - ctx, - state, - machineConfig, - minSecretsVersion, - env.FLY_REGION - ); - if (!state.flyMachineId) { - throw new Error('Unexpected stop recovery created no machine'); - } - await gateway.waitForHealthy(state, env, flyConfig.appName, state.flyMachineId); - - let retainedRecoveryVolumeId: string | null = null; - let retainedRecoveryVolumeCleanupAfter: number | null = null; try { - const snapshots = await fly.listVolumeSnapshots(flyConfig, oldVolumeId); - if (snapshots.length > 0) { - retainedRecoveryVolumeId = oldVolumeId; - retainedRecoveryVolumeCleanupAfter = Date.now() + PREVIOUS_VOLUME_RETENTION_MS; - } else { - try { - await flyMachines.deleteVolumeAndAttachedMachine( - flyConfig, - oldVolumeId, - 'unexpected_stop_recovery_immediate_cleanup', - state.sandboxId ?? undefined - ); - } catch (cleanupErr) { - doWarn(state, 'old recovery source volume cleanup failed; retrying via alarm', { - volumeId: oldVolumeId, - error: toLoggable(cleanupErr), - }); - retainedRecoveryVolumeId = oldVolumeId; - retainedRecoveryVolumeCleanupAfter = Date.now(); - } + await flyMachines.createNewMachine( + flyConfig, + ctx, + state, + machineConfig, + minSecretsVersion, + env.FLY_REGION + ); + } catch (err) { + const isStartupTimeout = err instanceof fly.FlyApiError && err.status === 408; + if (!isStartupTimeout || !state.flyMachineId) { + throw err; } - } catch (snapshotErr) { - doWarn(state, 'failed to inspect old volume snapshots; retaining for TTL cleanup', { - volumeId: oldVolumeId, - error: toLoggable(snapshotErr), - }); - retainedRecoveryVolumeId = oldVolumeId; - retainedRecoveryVolumeCleanupAfter = Date.now() + PREVIOUS_VOLUME_RETENTION_MS; - } - - const postStatus = await ctx.storage.get('status'); - if (postStatus !== 'recovering') return; - - const now = Date.now(); - const durationMs = state.recoveryStartedAt ? now - state.recoveryStartedAt : undefined; - state.status = 'running'; - state.flyVolumeId = recoveryVolumeId; - state.flyRegion = recoveryVolumeRegion ?? 
state.flyRegion; - state.recoveryStartedAt = null; - state.pendingRecoveryVolumeId = null; - state.recoveryPreviousVolumeId = retainedRecoveryVolumeId; - state.recoveryPreviousVolumeCleanupAfter = retainedRecoveryVolumeCleanupAfter; - state.healthCheckFailCount = 0; - state.lastStartedAt = now; - state.lastRecoveryErrorMessage = null; - state.lastRecoveryErrorAt = null; - await runtime.persist({ - status: 'running', - flyMachineId: state.flyMachineId, - flyVolumeId: recoveryVolumeId, - flyRegion: state.flyRegion, - recoveryStartedAt: null, - pendingRecoveryVolumeId: null, - recoveryPreviousVolumeId: retainedRecoveryVolumeId, - recoveryPreviousVolumeCleanupAfter: retainedRecoveryVolumeCleanupAfter, - healthCheckFailCount: 0, - lastStartedAt: now, - lastRecoveryErrorMessage: null, - lastRecoveryErrorAt: null, - }); - runtime.emitEvent({ - event: 'instance.unexpected_stop_recovery_succeeded', - status: 'running', - label: 'alarm_relocated', - durationMs, - }); - await runtime.scheduleAlarm(); + doWarn( + state, + 'unexpected stop recovery timed out waiting for replacement machine startup; reconcile will continue', + { + error: toLoggable(err), + flyMachineId: state.flyMachineId, + pendingRecoveryVolumeId: recoveryVolumeId, + } + ); + await runtime.scheduleAlarm(); + return; + } + if (!state.flyMachineId) { + throw new Error('Unexpected stop recovery created no machine'); + } + await completeUnexpectedStopRecovery(runtime); } catch (err) { doError(state, 'unexpected stop recovery failed', { error: toLoggable(err),
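// ---------------------------------------------------------------------------
// Reviewer sketch (not part of the patch): the alarm-side decision table that
// patch 3 adds in reconcileRecovering, reduced to a standalone function. The
// names below (decideRecovering, getMachineState, RecoverDecision) are
// hypothetical stand-ins; the real code also logs each branch, treats
// transient Fly API errors as "keep waiting", and routes failures through the
// shared failUnexpectedStopRecovery cleanup.
// ---------------------------------------------------------------------------
type RecoverDecision =
  | { kind: 'complete' }
  | { kind: 'fail'; errorMessage: string; label: string }
  | { kind: 'wait' };

// Placeholder value; the real constant lives in kiloclaw/src/config.ts.
const RECOVERING_TIMEOUT_MS = 10 * 60 * 1000;

async function decideRecovering(
  machineId: string | null,
  recoveryStartedAt: number | null,
  // Hypothetical stub for fly.getMachine: resolves the machine state string,
  // or null when Fly reports the machine as not found (404).
  getMachineState: (id: string) => Promise<string | null>
): Promise<RecoverDecision> {
  if (machineId) {
    const state = await getMachineState(machineId);
    if (state === null) {
      // Replacement machine disappeared: fail via the shared cleanup path.
      return {
        kind: 'fail',
        errorMessage: 'unexpected stop recovery replacement machine disappeared',
        label: 'alarm_machine_gone',
      };
    }
    if (state === 'started') return { kind: 'complete' };
    if (state === 'stopped' || state === 'failed' || state === 'destroyed') {
      return {
        kind: 'fail',
        errorMessage: `unexpected stop recovery replacement machine entered ${state}`,
        label: `alarm_${state}`,
      };
    }
    // 'created', 'starting', etc.: keep waiting; the timeout check below still applies.
  }
  const timedOut =
    recoveryStartedAt !== null && Date.now() - recoveryStartedAt > RECOVERING_TIMEOUT_MS;
  return timedOut
    ? {
        kind: 'fail',
        errorMessage: 'unexpected stop recovery timed out',
        label: 'alarm_timeout', // label here is illustrative
      }
    : { kind: 'wait' };
}

// Design note: returning 'wait' instead of blocking on waitForState keeps the
// DO alarm loop responsive, which is what lets a 408 startup timeout in the
// background recovery resolve on a later alarm instead of failing outright.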