From 23d3f3051835d58981af2ac5674b218daeb1ff28 Mon Sep 17 00:00:00 2001 From: reisepass <1474408+reisepass@users.noreply.github.com> Date: Tue, 17 Feb 2026 00:31:01 +0100 Subject: [PATCH 1/3] Add PGlite crash safety tests that expose corruption bugs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tests that FAIL without the fix — reproducing real corruption: - Overlapping instances: triple, staggered, DDL writer, rapid cycling - HMR double-instance: lock-based blocking and rapid swap corruption - WAL bloat burst mode: rapid kills without checkpoint cause corruption Co-Authored-By: Matthaus Wolff <8714327+WolffM@users.noreply.github.com> --- packages/pglite/tests/crash-safety/README.md | 133 ++++ packages/pglite/tests/crash-safety/harness.js | 188 +++++ .../crash-safety/hmr-double-instance.test.js | 425 ++++++++++++ .../overlapping-instances.test.js | 486 +++++++++++++ .../wal-bloat-no-checkpoint.test.js | 651 ++++++++++++++++++ .../workers/hmr-double-instance.js | 88 +++ .../workers/overlapping-ddl-writer.js | 116 ++++ .../workers/overlapping-rapid-cycling.js | 91 +++ .../workers/overlapping-staggered.js | 101 +++ .../workers/overlapping-three-instances.js | 104 +++ .../workers/wal-bloat-no-checkpoint.js | 174 +++++ 11 files changed, 2557 insertions(+) create mode 100644 packages/pglite/tests/crash-safety/README.md create mode 100644 packages/pglite/tests/crash-safety/harness.js create mode 100644 packages/pglite/tests/crash-safety/hmr-double-instance.test.js create mode 100644 packages/pglite/tests/crash-safety/overlapping-instances.test.js create mode 100644 packages/pglite/tests/crash-safety/wal-bloat-no-checkpoint.test.js create mode 100644 packages/pglite/tests/crash-safety/workers/hmr-double-instance.js create mode 100644 packages/pglite/tests/crash-safety/workers/overlapping-ddl-writer.js create mode 100644 packages/pglite/tests/crash-safety/workers/overlapping-rapid-cycling.js create mode 100644 packages/pglite/tests/crash-safety/workers/overlapping-staggered.js create mode 100644 packages/pglite/tests/crash-safety/workers/overlapping-three-instances.js create mode 100644 packages/pglite/tests/crash-safety/workers/wal-bloat-no-checkpoint.js diff --git a/packages/pglite/tests/crash-safety/README.md b/packages/pglite/tests/crash-safety/README.md new file mode 100644 index 000000000..14a7879a7 --- /dev/null +++ b/packages/pglite/tests/crash-safety/README.md @@ -0,0 +1,133 @@ +# PGlite Crash Safety Test Suite + +Tests that reproduce real corruption bugs in PGlite — specifically the issues fixed by the PID file lock and partial initdb detection in `nodefs.ts`. Every test here **fails without the fix** and passes with it. + +Single-instance WAL recovery tests (kill-during-insert, kill-during-transaction, etc.) were intentionally excluded because PostgreSQL handles those correctly without any code changes. Those tests are preserved in the `archive/all-crash-safety-tests` branch. + +## Running the Tests + +```bash +# Run all crash safety tests +pnpm vitest run tests/crash-safety/ --reporter=verbose + +# Run a single scenario +pnpm vitest run tests/crash-safety/overlapping-instances.test.js + +# Keep data directories for debugging (not cleaned up after test) +RETAIN_DATA=1 pnpm vitest run tests/crash-safety/ +``` + +> **Note:** Do not use `--no-file-parallelism` — PGlite's WASM module conflicts with vitest's single-worker mode. + +## Architecture + +``` +tests/crash-safety/ +├── harness.js # Shared test infrastructure +├── README.md # This file +├── CRASH-SAFETY.md # Detailed failure mode documentation +├── RESULTS.md # Test results log +├── hmr-double-instance.test.js # HMR double-instance lock test +├── overlapping-instances.test.js # Overlapping instance corruption test +├── wal-bloat-no-checkpoint.test.js # WAL bloat burst mode corruption test +├── partial-init-backup.test.js # Partial initdb backup behavior test +└── workers/ + ├── hmr-double-instance.js + ├── overlapping-three-instances.js + ├── overlapping-staggered.js + ├── overlapping-ddl-writer.js + ├── overlapping-rapid-cycling.js + └── wal-bloat-no-checkpoint.js +``` + +### How It Works + +Each test follows the same pattern: + +1. **Worker script** — A standalone Node.js script that creates a PGlite instance on a data directory (passed via `PGLITE_DATA_DIR` env var), performs database operations, and sends IPC messages to the parent via `process.send()` to signal progress. + +2. **Test file** — Uses vitest. Calls `crashTest()` from the harness to spawn the worker as a child process via `fork()`. The harness kills the child with `SIGKILL` either after a timer or when a specific IPC message is received. + +3. **Verification** — After the kill, the test reopens PGlite on the same data directory and checks: + + - The database opens without error (no PANIC, no hang) + - Basic queries succeed (`SELECT 1`) + - All user tables are scannable + - Data is consistent (committed rows present, uncommitted rows absent) + +4. **Cleanup** — Each test uses a unique `/tmp/pglite-crash-*` directory and removes it in `afterAll`, unless `RETAIN_DATA=1` is set. + +## Test Scenarios + +### 1. Overlapping Instances + +**File:** `overlapping-instances.test.js` (4 tests) + +Multiple PGlite instances opening the same data directory concurrently. Without the PID file lock, this causes silent corruption (`Aborted()` on next open). + +- **Triple instances** — three instances open simultaneously +- **Staggered** — second instance opens while first is mid-write +- **DDL writer** — overlapping DDL operations +- **Rapid cycling** — rapid open/kill/reopen cycles with overlapping lifetimes + +### 2. HMR Double-Instance + +**File:** `hmr-double-instance.test.js` (2 tests) + +Simulates hot module replacement (HMR) in dev servers where a new PGlite instance is created before the old one is closed. + +- **Lock blocking** — verifies instance B is blocked by the lock while instance A is alive +- **Rapid HMR cycles** — fast instance swaps that corrupt without the lock + +### 3. WAL Bloat Burst Mode + +**File:** `wal-bloat-no-checkpoint.test.js` (1 failing test) + +15 extremely rapid kill cycles with no delay, accumulating WAL without checkpointing. Without partial initdb detection, interrupted initializations leave corrupt state that causes `Aborted()`. + +### 4. Partial Init Backup + +**File:** `partial-init-backup.test.js` (3 tests) + +Directly tests the partial initdb detection and backup behavior in `nodefs.ts`: + +- Partial dir (no `PG_VERSION`) → moved to `.corrupt-` backup +- Partial dir (`PG_VERSION` but incomplete `base/`) → moved to backup +- Fully initialized dir → NOT moved (no false positives) + +## Harness API (`harness.js`) + +### `crashTest(options)` + +Spawns a child process and kills it. + +| Option | Type | Default | Description | +| --------------- | ------ | ----------- | -------------------------------------------------------------- | +| `dataDir` | string | required | Path to PGlite data directory | +| `workerScript` | string | required | Path to the worker `.js` file | +| `killAfterMs` | number | `500` | Delay before sending kill signal | +| `signal` | string | `'SIGKILL'` | Signal to send (usually SIGKILL) | +| `killOnMessage` | string | `null` | Kill when worker sends this IPC message instead of using timer | +| `env` | object | `{}` | Extra environment variables for the child | + +Returns: `{ workerKilled, workerError, workerMessages, workerExitCode, workerSignal, stdout, stderr }` + +### `tryOpen(dataDir, timeoutMs?)` + +Attempts to open a PGlite instance on a potentially corrupted data directory. Includes a timeout (default 15s) to handle cases where a corrupted database hangs forever during initialization. + +Returns: `{ success, db, error }` + +### `verifyIntegrity(db)` + +Runs integrity checks against an open PGlite instance: basic query, table scan, index scan. + +Returns: `{ intact, issues }` + +### `cleanupDataDir(dataDir)` + +Removes a test data directory and its sibling `.lock` file. + +### `testDataDir(scenarioName)` + +Generates a unique `/tmp/pglite-crash---` path. diff --git a/packages/pglite/tests/crash-safety/harness.js b/packages/pglite/tests/crash-safety/harness.js new file mode 100644 index 000000000..cc8702dd0 --- /dev/null +++ b/packages/pglite/tests/crash-safety/harness.js @@ -0,0 +1,188 @@ +// Crash safety test harness: spawn a PGlite worker, kill it, verify recovery. + +import { fork } from 'node:child_process' +import { existsSync, rmSync, mkdirSync } from 'node:fs' +import { resolve } from 'node:path' + +/** + * Spawn a worker, optionally kill it on a message or timer, return results. + */ +export async function crashTest(options) { + const { + dataDir, + workerScript, + killAfterMs = 500, + signal = 'SIGKILL', + env = {}, + killOnMessage = null, + } = options + + const parentDir = resolve(dataDir, '..') + if (!existsSync(parentDir)) { + mkdirSync(parentDir, { recursive: true }) + } + + // eslint-disable-next-line @typescript-eslint/no-unused-vars + return new Promise((resolvePromise, rejectPromise) => { + const messages = [] + let workerError = null + let killed = false + let killTimer = null + + const child = fork(workerScript, [], { + env: { + ...process.env, + PGLITE_DATA_DIR: dataDir, + ...env, + }, + stdio: ['pipe', 'pipe', 'pipe', 'ipc'], + }) + + let stdout = '' + let stderr = '' + child.stdout.on('data', (d) => { + stdout += d.toString() + }) + child.stderr.on('data', (d) => { + stderr += d.toString() + }) + + child.on('message', (msg) => { + messages.push(msg) + + if (killOnMessage && msg === killOnMessage && !killed) { + killed = true + if (killTimer) clearTimeout(killTimer) + child.kill(signal) + } + }) + + child.on('error', (err) => { + workerError = err.message + }) + + child.on('exit', (code, sig) => { + if (killTimer) clearTimeout(killTimer) + resolvePromise({ + workerKilled: sig === signal || killed, + workerError, + workerMessages: messages, + workerExitCode: code, + workerSignal: sig, + stdout, + stderr, + }) + }) + + if (!killOnMessage) { + killTimer = setTimeout(() => { + if (!killed) { + killed = true + child.kill(signal) + } + }, killAfterMs) + } + + // Safety timeout + setTimeout(() => { + if (!killed) { + killed = true + child.kill('SIGKILL') + } + }, 30000) + }) +} + +/** Try to open PGlite on a possibly-corrupted data dir. Caller must close db on success. */ +export async function tryOpen(dataDir, timeoutMs = 15000) { + const { PGlite } = await import('../../dist/index.js') + + try { + const db = new PGlite(dataDir) + + await Promise.race([ + db.waitReady, + new Promise((_, reject) => + setTimeout( + () => + reject( + new Error( + `PGlite open timed out after ${timeoutMs}ms (likely corrupted)`, + ), + ), + timeoutMs, + ), + ), + ]) + + return { success: true, db, error: null } + } catch (err) { + return { success: false, db: null, error: err } + } +} + +/** Run basic integrity checks: health query, table scans, index scans. */ +export async function verifyIntegrity(db) { + const issues = [] + + try { + await db.query('SELECT 1 as health_check') + } catch (err) { + issues.push(`Basic query failed: ${err.message}`) + return { intact: false, issues } + } + + try { + const tables = await db.query(` + SELECT tablename FROM pg_tables + WHERE schemaname = 'public' + ORDER BY tablename + `) + + for (const row of tables.rows) { + try { + await db.query(`SELECT count(*) FROM "${row.tablename}"`) + } catch (err) { + issues.push(`Count on ${row.tablename} failed: ${err.message}`) + } + } + } catch (err) { + issues.push(`Table listing failed: ${err.message}`) + } + + try { + const indexes = await db.query(` + SELECT indexname, tablename FROM pg_indexes + WHERE schemaname = 'public' + ORDER BY indexname + `) + + for (const row of indexes.rows) { + try { + await db.query(`SELECT count(*) FROM "${row.tablename}"`) + } catch (err) { + issues.push(`Index check on ${row.indexname} failed: ${err.message}`) + } + } + } catch (err) { + issues.push(`Index listing failed: ${err.message}`) + } + + return { intact: issues.length === 0, issues } +} + +export function cleanupDataDir(dataDir) { + if (existsSync(dataDir)) { + rmSync(dataDir, { recursive: true, force: true }) + } + const lockFile = dataDir + '.lock' + if (existsSync(lockFile)) { + rmSync(lockFile, { force: true }) + } +} + +export function testDataDir(scenarioName) { + const timestamp = Date.now() + const rand = Math.random().toString(36).slice(2, 8) + return `/tmp/pglite-crash-${scenarioName}-${timestamp}-${rand}` +} diff --git a/packages/pglite/tests/crash-safety/hmr-double-instance.test.js b/packages/pglite/tests/crash-safety/hmr-double-instance.test.js new file mode 100644 index 000000000..1942c19c0 --- /dev/null +++ b/packages/pglite/tests/crash-safety/hmr-double-instance.test.js @@ -0,0 +1,425 @@ +/** + * Crash Safety Test: HMR (Hot Module Reload) Double Instance + * + * Tests the file lock mechanism that prevents overlapping PGlite instances + * from corrupting the database during HMR in dev servers like Vite/Next.js. + * + * With the lock file implementation: + * 1. Instance A opens and acquires the lock + * 2. Instance B attempts to open the SAME data dir and is BLOCKED by the lock + * 3. Only instance A writes data + * 4. When killed with SIGKILL, the stale lock is detected on next open + * + * This prevents the corruption that previously occurred when two WASM heaps + * both accessed the same PostgreSQL data directory simultaneously. + */ + +import { describe, it, expect, afterAll } from 'vitest' +import { fork } from 'node:child_process' +import { existsSync, rmSync } from 'node:fs' + +const dataDir = `/tmp/pglite-crash-hmr-double-${Date.now()}` + +afterAll(async () => { + if (!process.env.RETAIN_DATA) { + if (existsSync(dataDir)) { + rmSync(dataDir, { recursive: true, force: true }) + } + // Clean up sibling lock files + const lockFile = dataDir + '.lock' + if (existsSync(lockFile)) { + rmSync(lockFile, { force: true }) + } + } +}) + +describe('crash safety: HMR double-instance corruption', () => { + it( + 'should survive multiple HMR-style instance replacement cycles with SIGKILL', + async () => { + const { PGlite } = await import('../../dist/index.js') + const workerPath = new URL( + './workers/hmr-double-instance.js', + import.meta.url, + ).pathname + + const NUM_CYCLES = 7 + const cycleResults = [] + + for (let cycle = 0; cycle < NUM_CYCLES; cycle++) { + const overlapOps = cycle < 2 ? 10 : cycle < 5 ? 25 : 40 + + const result = await new Promise((resolve) => { + const messages = [] + let killed = false + + const child = fork(workerPath, [], { + env: { + ...process.env, + PGLITE_DATA_DIR: dataDir, + CYCLE: String(cycle), + OVERLAP_OPS: String(overlapOps), + }, + stdio: ['pipe', 'pipe', 'pipe', 'ipc'], + }) + + let stderr = '' + child.stderr.on('data', (d) => { + stderr += d.toString() + }) + child.stdout.on('data', () => {}) // drain + + child.on('message', (msg) => { + messages.push(msg) + + // Kill after all operations are done (instance A has written data) + if (msg === 'all-operations-done' && !killed) { + killed = true + child.kill('SIGKILL') + } + }) + + child.on('exit', (code, sig) => { + resolve({ + cycle, + killed: killed || sig === 'SIGKILL', + messages, + exitCode: code, + signal: sig, + stderr, + }) + }) + + // Fallback timer: kill after 15s if worker hasn't finished + setTimeout(() => { + if (!killed) { + killed = true + child.kill('SIGKILL') + } + }, 15000) + }) + + cycleResults.push(result) + + // The worker should either be killed by us or exit cleanly + // (it's killed after 'all-operations-done') + expect(result.killed).toBe(true) + + // Instance A should have been ready in every cycle + expect(result.messages).toContain('instance-a-ready') + + // Instance B should have been BLOCKED by the lock + expect(result.messages).toContain('instance-b-blocked') + expect(result.messages).not.toContain('instance-b-ready') + + // On the first cycle, schema should have been created + if (cycle === 0) { + expect(result.messages).toContain('schema-created') + } + + // Log cycle details for debugging + console.log( + `Cycle ${cycle}: messages=[${result.messages.join(', ')}], ` + + `exit=${result.exitCode}, signal=${result.signal}`, + ) + + // ---- Intermediate recovery check every 2 cycles ---- + if (cycle % 2 === 1 || cycle === NUM_CYCLES - 1) { + let db = null + let openSuccess = false + try { + db = new PGlite(dataDir) + await Promise.race([ + db.waitReady, + new Promise((_, reject) => + setTimeout( + () => + reject(new Error(`Open timed out after cycle ${cycle}`)), + 20000, + ), + ), + ]) + openSuccess = true + } catch (err) { + console.log( + `CORRUPTION DETECTED after cycle ${cycle}: ${err.message}`, + ) + openSuccess = false + } + + if (openSuccess && db) { + try { + // Basic health check + await db.query('SELECT 1 as ok') + + // Table should exist after cycle 0 + const tableCheck = await db.query(` + SELECT tablename FROM pg_tables + WHERE schemaname = 'public' AND tablename = 'hmr_data' + `) + expect(tableCheck.rows.length).toBe(1) + + // Count rows - with the lock, only instance A writes + const countResult = await db.query( + 'SELECT count(*)::int as cnt FROM hmr_data', + ) + const rowCount = countResult.rows[0].cnt + console.log(`After cycle ${cycle}: ${rowCount} rows in hmr_data`) + expect(rowCount).toBeGreaterThan(0) + + // All rows should be from instance A (B was blocked) + const instanceCheck = await db.query( + `SELECT DISTINCT instance FROM hmr_data`, + ) + const instances = instanceCheck.rows.map((r) => r.instance) + expect(instances).toContain('A') + expect(instances).not.toContain('B') + + // Verify index is usable + const indexScan = await db.query( + `SELECT count(*)::int as cnt FROM hmr_data WHERE cycle = $1`, + [0], + ) + expect(indexScan.rows[0].cnt).toBeGreaterThanOrEqual(0) + + // Verify we can do a full sequential scan without errors + const allRows = await db.query( + 'SELECT id, cycle, instance, phase, seq FROM hmr_data ORDER BY id', + ) + expect(allRows.rows.length).toBe(rowCount) + + // Check for data consistency + for (const row of allRows.rows) { + expect(row.id).toBeGreaterThan(0) + expect(row.cycle).toBeGreaterThanOrEqual(0) + expect(row.cycle).toBeLessThanOrEqual(cycle) + expect(row.instance).toBe('A') + expect(typeof row.phase).toBe('string') + expect(row.seq).toBeGreaterThanOrEqual(0) + } + } finally { + await db.close() + } + } + + // The DB MUST be openable + expect(openSuccess).toBe(true) + } + + // Small delay between cycles + await new Promise((r) => setTimeout(r, 200)) + } + + // ---- Final comprehensive verification ---- + console.log('\n--- Final verification after all HMR cycles ---') + + let finalDb = null + try { + finalDb = new PGlite(dataDir) + await Promise.race([ + finalDb.waitReady, + new Promise((_, reject) => + setTimeout(() => reject(new Error('Final open timed out')), 30000), + ), + ]) + } catch (err) { + console.log(`FINAL CORRUPTION: ${err.message}`) + expect.fail( + `Database corrupted after ${NUM_CYCLES} HMR cycles: ${err.message}`, + ) + } + + try { + // Full integrity check + const tables = await finalDb.query(` + SELECT tablename FROM pg_tables WHERE schemaname = 'public' + `) + expect(tables.rows.length).toBeGreaterThanOrEqual(1) + + // Check all indexes are usable + const indexes = await finalDb.query(` + SELECT indexname, tablename FROM pg_indexes + WHERE schemaname = 'public' + `) + for (const idx of indexes.rows) { + await finalDb.query(`SELECT count(*) FROM "${idx.tablename}"`) + } + + // Row count audit + const finalCount = await finalDb.query( + 'SELECT count(*)::int as cnt FROM hmr_data', + ) + console.log(`Final row count: ${finalCount.rows[0].cnt}`) + + // Per-cycle breakdown (all should be instance A only) + const cycleBreakdown = await finalDb.query(` + SELECT cycle, instance, count(*)::int as cnt + FROM hmr_data + GROUP BY cycle, instance + ORDER BY cycle, instance + `) + console.log('Per-cycle breakdown:') + for (const row of cycleBreakdown.rows) { + console.log( + ` cycle=${row.cycle} instance=${row.instance} count=${row.cnt}`, + ) + // With the lock, all rows should be from instance A + expect(row.instance).toBe('A') + } + + // Verify we can still write to the database + await finalDb.query( + `INSERT INTO hmr_data (cycle, instance, phase, seq, payload) + VALUES ($1, $2, $3, $4, $5)`, + [999, 'verify', 'final', 0, 'final-verification-row'], + ) + const verifyRow = await finalDb.query( + `SELECT * FROM hmr_data WHERE cycle = 999`, + ) + expect(verifyRow.rows.length).toBe(1) + } finally { + await finalDb.close() + } + }, + { timeout: 300000 }, + ) + + it( + 'should survive rapid HMR cycles with minimal delay between instance swaps', + async () => { + const { PGlite } = await import('../../dist/index.js') + + // Use a separate data dir for this sub-test + const rapidDataDir = `${dataDir}-rapid` + + const workerPath = new URL( + './workers/hmr-double-instance.js', + import.meta.url, + ).pathname + + const RAPID_CYCLES = 5 + + for (let cycle = 0; cycle < RAPID_CYCLES; cycle++) { + const result = await new Promise((resolve) => { + const messages = [] + let killed = false + + const child = fork(workerPath, [], { + env: { + ...process.env, + PGLITE_DATA_DIR: rapidDataDir, + CYCLE: String(cycle), + OVERLAP_OPS: '50', + }, + stdio: ['pipe', 'pipe', 'pipe', 'ipc'], + }) + + let stderr = '' + child.stderr.on('data', (d) => { + stderr += d.toString() + }) + child.stdout.on('data', () => {}) + + child.on('message', (msg) => { + messages.push(msg) + + // Kill after instance B is blocked (or after instance A continues) + if ( + (msg === 'instance-b-blocked' || + msg === 'instance-a-continued') && + !killed + ) { + // Give a tiny window then kill + setTimeout(() => { + if (!killed) { + killed = true + child.kill('SIGKILL') + } + }, 50) + } + }) + + child.on('exit', (code, sig) => { + resolve({ + cycle, + killed: killed || sig === 'SIGKILL', + messages, + stderr, + }) + }) + + setTimeout(() => { + if (!killed) { + killed = true + child.kill('SIGKILL') + } + }, 15000) + }) + + // Worker should be either killed by us or crashed due to lock error + expect(result.killed || result.exitCode !== 0).toBe(true) + } + + // Verify the DB is still usable after rapid HMR cycles + let db = null + try { + db = new PGlite(rapidDataDir) + await Promise.race([ + db.waitReady, + new Promise((_, reject) => + setTimeout( + () => reject(new Error('Rapid HMR final open timed out')), + 20000, + ), + ), + ]) + + await db.query('SELECT 1') + + const tableExists = await db.query(` + SELECT tablename FROM pg_tables + WHERE schemaname = 'public' AND tablename = 'hmr_data' + `) + if (tableExists.rows.length > 0) { + const count = await db.query( + 'SELECT count(*)::int as cnt FROM hmr_data', + ) + console.log(`Rapid HMR test: ${count.rows[0].cnt} rows survived`) + expect(count.rows[0].cnt).toBeGreaterThanOrEqual(0) + + // All surviving rows should be from instance A + const instanceCheck = await db.query( + `SELECT DISTINCT instance FROM hmr_data`, + ) + if (instanceCheck.rows.length > 0) { + for (const row of instanceCheck.rows) { + expect(row.instance).toBe('A') + } + } + } + + await db.close() + } catch (err) { + console.log(`Rapid HMR CORRUPTION: ${err.message}`) + if (db) + try { + await db.close() + } catch (_) { + /* ignore */ + } + expect.fail(`Rapid HMR corrupted DB: ${err.message}`) + } finally { + if (!process.env.RETAIN_DATA) { + if (existsSync(rapidDataDir)) { + rmSync(rapidDataDir, { recursive: true, force: true }) + } + const lockFile = rapidDataDir + '.lock' + if (existsSync(lockFile)) { + rmSync(lockFile, { force: true }) + } + } + } + }, + { timeout: 180000 }, + ) +}) diff --git a/packages/pglite/tests/crash-safety/overlapping-instances.test.js b/packages/pglite/tests/crash-safety/overlapping-instances.test.js new file mode 100644 index 000000000..a642841b2 --- /dev/null +++ b/packages/pglite/tests/crash-safety/overlapping-instances.test.js @@ -0,0 +1,486 @@ +/** + * Crash Safety Test: Overlapping Instances (with File Locking) + * + * PGlite now implements file locking for NodeFS. Multiple instances cannot + * open the same data directory simultaneously — the second instance will + * receive a lock error, preventing the corruption that previously occurred. + * + * These tests verify that: + * 1. The lock prevents multiple instances from opening the same data dir + * 2. Workers that can't acquire the lock crash with an error (expected behavior) + * 3. The database remains intact and recoverable after all scenarios + * 4. Stale locks from killed processes are detected and overridden + */ + +import { describe, it, expect, afterAll } from 'vitest' +import { fork } from 'node:child_process' +import { existsSync, rmSync } from 'node:fs' + +const baseDir = `/tmp/pglite-crash-overlapping-${Date.now()}` + +afterAll(async () => { + if (!process.env.RETAIN_DATA) { + for (const suffix of ['triple', 'stagger', 'ddl', 'rapid']) { + const dir = `${baseDir}-${suffix}` + if (existsSync(dir)) { + rmSync(dir, { recursive: true, force: true }) + } + const lockFile = dir + '.lock' + if (existsSync(lockFile)) { + rmSync(lockFile, { force: true }) + } + } + } +}) + +/** + * Helper: fork a worker, collect messages, kill on trigger or timer. + */ +function runWorker({ + workerPath, + dataDir, + env = {}, + killOnMessage = null, + killAfterMs = null, + killDelayAfterMsg = 0, +}) { + return new Promise((resolve) => { + const messages = [] + let killed = false + let killTimer = null + + const child = fork(workerPath, [], { + env: { ...process.env, PGLITE_DATA_DIR: dataDir, ...env }, + stdio: ['pipe', 'pipe', 'pipe', 'ipc'], + }) + + let stderr = '' + child.stderr.on('data', (d) => { + stderr += d.toString() + }) + child.stdout.on('data', () => {}) + + child.on('message', (msg) => { + messages.push(msg) + + if (killOnMessage && msg === killOnMessage && !killed) { + if (killDelayAfterMsg > 0) { + setTimeout(() => { + if (!killed) { + killed = true + child.kill('SIGKILL') + } + }, killDelayAfterMsg) + } else { + killed = true + child.kill('SIGKILL') + } + } + }) + + child.on('exit', (code, sig) => { + if (killTimer) clearTimeout(killTimer) + const wasKilled = killed || sig === 'SIGKILL' + const crashed = !wasKilled && code !== 0 + resolve({ + killed: wasKilled, + crashed, + terminated: wasKilled || crashed, + messages, + exitCode: code, + signal: sig, + stderr, + }) + }) + + if (killAfterMs) { + killTimer = setTimeout(() => { + if (!killed) { + killed = true + child.kill('SIGKILL') + } + }, killAfterMs) + } + + // Safety timeout + setTimeout(() => { + if (!killed) { + killed = true + child.kill('SIGKILL') + } + }, 30000) + }) +} + +/** + * Try to open a PGlite instance and run basic checks. + * Returns { success, error, rowCount }. + */ +async function tryOpenAndVerify(PGlite, dataDir, tableName, timeoutMs = 20000) { + let db = null + try { + db = new PGlite(dataDir) + await Promise.race([ + db.waitReady, + new Promise((_, reject) => + setTimeout( + () => reject(new Error(`Open timed out after ${timeoutMs}ms`)), + timeoutMs, + ), + ), + ]) + + // Basic health + await db.query('SELECT 1 as ok') + + // Check table exists + const tableCheck = await db.query( + `SELECT tablename FROM pg_tables WHERE schemaname = 'public' AND tablename = $1`, + [tableName], + ) + if (tableCheck.rows.length === 0) { + await db.close() + return { success: true, rowCount: 0 } + } + + // Count rows + const count = await db.query( + `SELECT count(*)::int as cnt FROM "${tableName}"`, + ) + const rowCount = count.rows[0].cnt + + // Full sequential scan + const allRows = await db.query(`SELECT * FROM "${tableName}" ORDER BY id`) + if (allRows.rows.length !== rowCount) { + await db.close() + return { + success: false, + error: new Error( + `Row count mismatch: count=${rowCount} scan=${allRows.rows.length}`, + ), + } + } + + await db.close() + return { success: true, rowCount } + } catch (err) { + if (db) + try { + await db.close() + } catch (_) { + /* ignore */ + } + return { success: false, error: err } + } +} + +describe('crash safety: overlapping instances (lock prevents corruption)', () => { + // ======================================================================== + // Scenario 1: Three simultaneous instances — lock blocks instances B and C + // ======================================================================== + it( + 'should block second and third instances via lock, preserving data integrity', + async () => { + const { PGlite } = await import('../../dist/index.js') + const dataDir = `${baseDir}-triple` + const workerPath = new URL( + './workers/overlapping-three-instances.js', + import.meta.url, + ).pathname + + const NUM_CYCLES = 5 + + for (let cycle = 0; cycle < NUM_CYCLES; cycle++) { + const killMsg = cycle < 2 ? 'concurrent-writes-done' : 'all-done' + + const result = await runWorker({ + workerPath, + dataDir, + env: { CYCLE: String(cycle) }, + killOnMessage: killMsg, + killAfterMs: 15000, // fallback if message never arrives (lock blocks worker) + }) + + console.log( + `Triple cycle ${cycle}: terminated=${result.terminated} killed=${result.killed} crashed=${result.crashed} ` + + `messages=[${result.messages.join(', ')}]`, + ) + + // Worker should have terminated (killed by us or crashed due to lock error) + expect(result.terminated).toBe(true) + + // If worker crashed, it's likely due to lock error — this is expected behavior. + // The important thing is that the DB remains intact. + if (result.crashed) { + console.log( + ` Triple cycle ${cycle}: worker crashed (expected — lock prevented second instance)`, + ) + // Check that the crash was due to a lock error + const lockRelatedCrash = + result.stderr.includes('locked by') || + result.messages.some( + (m) => typeof m === 'string' && m.includes('locked by'), + ) + if (lockRelatedCrash) { + console.log( + ` Confirmed: lock error caused the crash (correct behavior)`, + ) + } + } + + // Verify recovery — DB should ALWAYS be openable + const verify = await tryOpenAndVerify(PGlite, dataDir, 'triple_data') + if (!verify.success) { + console.log( + `UNEXPECTED CORRUPTION after triple cycle ${cycle}: ${verify.error.message}`, + ) + } + // With the lock, the DB should always be intact + expect(verify.success).toBe(true) + console.log(` Triple cycle ${cycle}: ${verify.rowCount} rows, DB OK`) + + await new Promise((r) => setTimeout(r, 200)) + } + }, + { timeout: 300000 }, + ) + + // ======================================================================== + // Scenario 2: Staggered overlap — lock blocks instance B + // ======================================================================== + it( + 'should block staggered second instance via lock, preserving data integrity', + async () => { + const { PGlite } = await import('../../dist/index.js') + const dataDir = `${baseDir}-stagger` + const workerPath = new URL( + './workers/overlapping-staggered.js', + import.meta.url, + ).pathname + + const NUM_CYCLES = 7 + + for (let cycle = 0; cycle < NUM_CYCLES; cycle++) { + const staggerMs = cycle < 3 ? 100 : cycle < 5 ? 300 : 500 + const killMsg = cycle < 3 ? 'overlap-writes-done' : 'all-done' + + const result = await runWorker({ + workerPath, + dataDir, + env: { CYCLE: String(cycle), STAGGER_MS: String(staggerMs) }, + killOnMessage: killMsg, + killAfterMs: 15000, // fallback + }) + + console.log( + `Stagger cycle ${cycle} (${staggerMs}ms): terminated=${result.terminated} ` + + `messages=[${result.messages.join(', ')}]`, + ) + + expect(result.terminated).toBe(true) + + // Check recovery every 2 cycles and at the end + if (cycle % 2 === 1 || cycle === NUM_CYCLES - 1) { + const verify = await tryOpenAndVerify(PGlite, dataDir, 'stagger_data') + expect(verify.success).toBe(true) + console.log( + ` Stagger cycle ${cycle}: ${verify.rowCount} rows, DB OK`, + ) + } + + await new Promise((r) => setTimeout(r, 200)) + } + }, + { timeout: 300000 }, + ) + + // ======================================================================== + // Scenario 3: DDL collision — lock blocks second process + // ======================================================================== + it( + 'should block DDL/DML collision between two processes via lock', + async () => { + const { PGlite } = await import('../../dist/index.js') + const dataDir = `${baseDir}-ddl` + const ddlWorkerPath = new URL( + './workers/overlapping-ddl-writer.js', + import.meta.url, + ).pathname + + const NUM_CYCLES = 5 + + for (let cycle = 0; cycle < NUM_CYCLES; cycle++) { + // Spawn TWO separate child processes on the SAME data dir. + // With the lock, only one will acquire the lock; the other crashes. + const [ddlResult, dmlResult] = await Promise.all([ + runWorker({ + workerPath: ddlWorkerPath, + dataDir, + env: { CYCLE: String(cycle), WRITER_MODE: 'ddl' }, + killAfterMs: cycle < 2 ? 3000 : 5000, + }), + runWorker({ + workerPath: ddlWorkerPath, + dataDir, + env: { CYCLE: String(cycle), WRITER_MODE: 'dml' }, + killAfterMs: cycle < 2 ? 3000 : 5000, + }), + ]) + + console.log( + `DDL cycle ${cycle}: DDL terminated=${ddlResult.terminated} crashed=${ddlResult.crashed} ` + + `messages=[${ddlResult.messages.join(', ')}]`, + ) + console.log( + `DDL cycle ${cycle}: DML terminated=${dmlResult.terminated} crashed=${dmlResult.crashed} ` + + `messages=[${dmlResult.messages.join(', ')}]`, + ) + + // At least one should have been blocked by the lock + const bothCrashed = ddlResult.crashed && dmlResult.crashed + if (bothCrashed) { + // Both crashed — possibly the first one held the lock briefly + // and the second couldn't get it. This is acceptable. + console.log( + ` DDL cycle ${cycle}: both workers crashed (lock prevented simultaneous access)`, + ) + } + + // Verify recovery + await new Promise((r) => setTimeout(r, 300)) + const verify = await tryOpenAndVerify(PGlite, dataDir, 'ddl_base') + expect(verify.success).toBe(true) + console.log( + ` DDL cycle ${cycle}: ddl_base has ${verify.rowCount} rows, DB OK`, + ) + + await new Promise((r) => setTimeout(r, 200)) + } + }, + { timeout: 300000 }, + ) + + // ======================================================================== + // Scenario 4: Rapid instance cycling — lock blocks all after first + // ======================================================================== + it( + 'should block rapid instance cycling via lock, preserving data integrity', + async () => { + const { PGlite } = await import('../../dist/index.js') + const dataDir = `${baseDir}-rapid` + const workerPath = new URL( + './workers/overlapping-rapid-cycling.js', + import.meta.url, + ).pathname + + const NUM_RUNS = 3 + + for (let run = 0; run < NUM_RUNS; run++) { + const numInstances = run === 0 ? 8 : run === 1 ? 10 : 12 + const killMsg = run < 2 ? 'all-instances-created' : 'all-done' + + const result = await runWorker({ + workerPath, + dataDir, + env: { NUM_INSTANCES: String(numInstances) }, + killOnMessage: killMsg, + killAfterMs: 15000, // fallback + }) + + const instWrote = result.messages.filter((m) => + m.endsWith('-wrote'), + ).length + console.log( + `Rapid run ${run} (${numInstances} instances): terminated=${result.terminated} ` + + `${instWrote} wrote, messages=[${result.messages.join(', ')}]`, + ) + + expect(result.terminated).toBe(true) + + // With the lock, only the first instance should open successfully. + // The worker will crash when trying to open the second instance. + if (result.crashed) { + console.log( + ` Rapid run ${run}: worker crashed (expected — lock blocked subsequent instances)`, + ) + } + + // Verify recovery — DB should always be intact + const verify = await tryOpenAndVerify(PGlite, dataDir, 'rapid_data') + expect(verify.success).toBe(true) + console.log(` Rapid run ${run}: ${verify.rowCount} rows, DB OK`) + + await new Promise((r) => setTimeout(r, 300)) + } + }, + { timeout: 300000 }, + ) + + // ======================================================================== + // Scenario 5: Kill-during-recovery overlap — lock prevents simultaneous recovery + // ======================================================================== + it( + 'should prevent two instances from racing to recover the same data directory', + async () => { + const { PGlite } = await import('../../dist/index.js') + const dataDir = `${baseDir}-triple` // Reuse from scenario 1 + + const workerPath = new URL( + './workers/overlapping-three-instances.js', + import.meta.url, + ).pathname + + // Step 1: Create dirty state by killing a worker mid-operation + const seedResult = await runWorker({ + workerPath, + dataDir, + env: { CYCLE: '99' }, + killAfterMs: 3000, // Kill after 3 seconds regardless + }) + + console.log( + `Recovery-overlap seed: terminated=${seedResult.terminated} crashed=${seedResult.crashed}`, + ) + + // Step 2: Spawn TWO workers simultaneously on the dirty data dir. + // With the lock, only the first one should open; the second gets blocked. + const [recoverA, recoverB] = await Promise.all([ + runWorker({ + workerPath, + dataDir, + env: { CYCLE: '100' }, + killAfterMs: 5000, + }), + new Promise((resolve) => setTimeout(resolve, 200)).then(() => + runWorker({ + workerPath, + dataDir, + env: { CYCLE: '101' }, + killAfterMs: 5000, + }), + ), + ]) + + console.log( + `Recovery A: terminated=${recoverA.terminated} crashed=${recoverA.crashed} messages=[${recoverA.messages.join(', ')}]`, + ) + console.log( + `Recovery B: terminated=${recoverB.terminated} crashed=${recoverB.crashed} messages=[${recoverB.messages.join(', ')}]`, + ) + + // At least one should have been blocked by the lock + // (B starts 200ms later, so A likely holds the lock) + if (recoverB.crashed) { + console.log(' Recovery B was blocked by lock (correct behavior)') + } + + // Step 3: Verify the DB is still intact after the dual-recovery attempt + await new Promise((r) => setTimeout(r, 500)) + + const verify = await tryOpenAndVerify(PGlite, dataDir, 'triple_data') + expect(verify.success).toBe(true) + console.log(` After recovery overlap: ${verify.rowCount} rows, DB OK`) + }, + { timeout: 180000 }, + ) +}) diff --git a/packages/pglite/tests/crash-safety/wal-bloat-no-checkpoint.test.js b/packages/pglite/tests/crash-safety/wal-bloat-no-checkpoint.test.js new file mode 100644 index 000000000..934234e9d --- /dev/null +++ b/packages/pglite/tests/crash-safety/wal-bloat-no-checkpoint.test.js @@ -0,0 +1,651 @@ +/** + * Crash Safety Test: WAL Bloat Without Checkpoint + * + * This test targets the most insidious corruption vector: accumulated WAL + * entries that never get checkpointed because close() is never called. + * + * In real dev usage, developers: + * 1. Start their app (PGlite opens, creates tables, inserts data) + * 2. Kill the process (Ctrl+C, crash, OOM, etc.) -- no close(), no checkpoint + * 3. Restart, repeat -- each time more WAL accumulates + * 4. After many such cycles, the WAL is enormous and recovery becomes fragile + * + * PostgreSQL's WAL recovery is designed to replay from the last checkpoint. + * But when there's NEVER a checkpoint (because close() never runs and + * _pgl_shutdown() never fires), the WAL grows unbounded and recovery must + * replay from the very beginning. + * + * This test runs 30+ kill cycles, each adding heavy mixed DML (INSERT, UPDATE, + * DELETE) with ~1KB rows, index modifications, and schema changes. The worker + * is SIGKILL'd every time before close() can run. + * + * After all cycles, we verify the database can still open and is consistent. + */ + +import { describe, it, expect, afterAll } from 'vitest' +import { fork } from 'node:child_process' +import { existsSync, rmSync, statSync, readdirSync } from 'node:fs' +import { join } from 'node:path' + +const dataDir = `/tmp/pglite-crash-wal-bloat-${Date.now()}` + +afterAll(async () => { + if (!process.env.RETAIN_DATA) { + if (existsSync(dataDir)) { + rmSync(dataDir, { recursive: true, force: true }) + } + } +}) + +/** + * Recursively compute the total size of a directory in bytes. + */ +function dirSizeBytes(dir) { + let total = 0 + try { + const entries = readdirSync(dir, { withFileTypes: true }) + for (const entry of entries) { + const fullPath = join(dir, entry.name) + if (entry.isDirectory()) { + total += dirSizeBytes(fullPath) + } else if (entry.isFile()) { + total += statSync(fullPath).size + } + } + } catch { + // directory may not exist yet + } + return total +} + +/** + * Spawn worker, let it run inner cycles, then SIGKILL. + */ +function runWorkerAndKill(workerPath, cycle, innerCycles, killStrategy) { + return new Promise((resolve) => { + const messages = [] + let killed = false + let killTimer = null + + const child = fork(workerPath, [], { + env: { + ...process.env, + PGLITE_DATA_DIR: dataDir, + INNER_CYCLES: String(innerCycles), + START_CYCLE: String(cycle), + }, + stdio: ['pipe', 'pipe', 'pipe', 'ipc'], + }) + + let stderr = '' + child.stderr.on('data', (d) => { + stderr += d.toString() + }) + child.stdout.on('data', () => {}) // drain + + child.on('message', (msg) => { + messages.push(msg) + + if ( + killStrategy.onMessage && + msg.startsWith(killStrategy.onMessage) && + !killed + ) { + killed = true + if (killTimer) clearTimeout(killTimer) + child.kill('SIGKILL') + } + }) + + child.on('exit', (code, sig) => { + if (killTimer) clearTimeout(killTimer) + resolve({ + cycle, + killed: killed || sig === 'SIGKILL', + messages, + exitCode: code, + signal: sig, + stderr, + }) + }) + + // Timer-based kill if no message trigger + if (killStrategy.afterMs && !killStrategy.onMessage) { + killTimer = setTimeout(() => { + if (!killed) { + killed = true + child.kill('SIGKILL') + } + }, killStrategy.afterMs) + } + + // Safety timeout + setTimeout(() => { + if (!killed) { + killed = true + child.kill('SIGKILL') + } + }, 60000) + }) +} + +describe('crash safety: WAL bloat without checkpoint (30+ kill cycles)', () => { + it( + 'should survive 35 kill cycles of heavy DML without any checkpoint', + async () => { + const { PGlite } = await import('../../dist/index.js') + const workerPath = new URL( + './workers/wal-bloat-no-checkpoint.js', + import.meta.url, + ).pathname + + const TOTAL_OUTER_CYCLES = 35 + const allResults = [] + + // Track data dir size growth over time + const sizeHistory = [] + + // Track the cumulative start cycle for worker numbering + let nextStartCycle = 0 + + for (let outerCycle = 0; outerCycle < TOTAL_OUTER_CYCLES; outerCycle++) { + // Vary the kill strategy to hit different points in the DML cycle: + // - Early cycles: kill after schema + first inserts (mid-DML) + // - Middle cycles: kill after inserts but before updates complete + // - Late cycles: kill after full cycles complete but before close() + // - Some cycles: kill on a timer for unpredictable timing + + let innerCycles + let killStrategy + const startCycle = nextStartCycle + + if (outerCycle === 0) { + // First cycle: let schema creation + first batch complete + innerCycles = 2 + killStrategy = { onMessage: `cycle-done:${startCycle + 1}` } + } else if (outerCycle < 5) { + // Kill mid-insert (after first inner cycle's inserts) + innerCycles = 3 + killStrategy = { onMessage: `inserts-done:${startCycle + 1}` } + } else if (outerCycle < 10) { + // Kill mid-update + innerCycles = 2 + killStrategy = { onMessage: `updates-done:${startCycle}` } + } else if (outerCycle < 15) { + // Kill mid-delete + innerCycles = 2 + killStrategy = { onMessage: `deletes-done:${startCycle}` } + } else if (outerCycle < 20) { + // Kill after a full cycle completes (dirty but "consistent" state) + innerCycles = 3 + killStrategy = { onMessage: `cycle-done:${startCycle}` } + } else if (outerCycle < 25) { + // Timer-based kill for unpredictable timing (500ms-2000ms) + innerCycles = 4 + killStrategy = { afterMs: 500 + outerCycle * 60 } + } else if (outerCycle < 30) { + // Let many cycles run, kill after all are done (max WAL, no checkpoint) + innerCycles = 5 + killStrategy = { onMessage: 'all-cycles-done' } + } else { + // Final stretch: aggressive timer kills (200ms) + innerCycles = 3 + killStrategy = { afterMs: 200 } + } + + nextStartCycle += innerCycles + + const result = await runWorkerAndKill( + workerPath, + startCycle, + innerCycles, + killStrategy, + ) + + allResults.push(result) + + // Track directory size + const currentSize = dirSizeBytes(dataDir) + sizeHistory.push({ outerCycle, sizeBytes: currentSize }) + + // Log progress every 5 cycles + if (outerCycle % 5 === 0 || outerCycle === TOTAL_OUTER_CYCLES - 1) { + const sizeMB = (currentSize / 1024 / 1024).toFixed(2) + const messagesPreview = result.messages.slice(0, 5).join(', ') + console.log( + `Outer cycle ${outerCycle}/${TOTAL_OUTER_CYCLES}: ` + + `size=${sizeMB}MB, killed=${result.killed}, ` + + `messages=[${messagesPreview}...]`, + ) + } + + // The worker should have been killed OR crashed (crashing on open + // is itself evidence of corruption from previous cycles) + const workerCrashedOrKilled = result.killed || result.exitCode !== 0 + if (!result.killed && result.exitCode !== 0) { + console.log( + ` Worker CRASHED on its own at outer cycle ${outerCycle} (exit=${result.exitCode}): ${result.stderr.slice(0, 200)}`, + ) + // If the worker can't even open the DB, that's corruption — skip remaining cycles + if ( + result.messages.length === 0 || + !result.messages.includes('ready') + ) { + console.log( + ` CORRUPTION: Worker couldn't open DB at outer cycle ${outerCycle}`, + ) + break + } + } + expect(workerCrashedOrKilled).toBe(true) + + // For message-based kills, the worker MUST have started (it sent the trigger) + // For timer-based kills, the worker may not have had time to open the DB + // (especially after WAL bloat makes recovery slow) — that's OK + if (result.killed && !killStrategy.afterMs) { + const started = + result.messages.includes('ready') || + result.messages.includes('schema-created') || + result.messages.some((m) => m.startsWith('cycle-start:')) + expect(started).toBe(true) + } else if ( + result.killed && + killStrategy.afterMs && + result.messages.length === 0 + ) { + // Worker couldn't even open before timer — WAL bloat is severe + console.log( + ` WAL BLOAT: Worker couldn't open DB within ${killStrategy.afterMs}ms at outer cycle ${outerCycle}`, + ) + } + + // Intermediate verification every 10 cycles to catch progressive corruption + if (outerCycle > 0 && outerCycle % 10 === 0) { + console.log( + `\n--- Intermediate check after outer cycle ${outerCycle} ---`, + ) + let checkDb = null + try { + checkDb = new PGlite(dataDir) + await Promise.race([ + checkDb.waitReady, + new Promise((_, reject) => + setTimeout( + () => + reject( + new Error( + `Intermediate open timed out at cycle ${outerCycle}`, + ), + ), + 30000, + ), + ), + ]) + + // Basic health + await checkDb.query('SELECT 1') + + // Count rows + const count = await checkDb.query( + 'SELECT count(*)::int as cnt FROM wal_stress', + ) + console.log(` wal_stress rows: ${count.rows[0].cnt}`) + + const logCount = await checkDb.query( + 'SELECT count(*)::int as cnt FROM wal_stress_log', + ) + console.log(` wal_stress_log rows: ${logCount.rows[0].cnt}`) + + // Verify indexes work + await checkDb.query( + `SELECT count(*) FROM wal_stress WHERE kind = 'alpha'`, + ) + await checkDb.query( + `SELECT count(*) FROM wal_stress WHERE cycle = 0`, + ) + + // Full sequential scan + const allRows = await checkDb.query( + 'SELECT id, cycle, batch, kind FROM wal_stress ORDER BY id', + ) + expect(allRows.rows.length).toBe(count.rows[0].cnt) + + await checkDb.close() + + // IMPORTANT: After this close(), a checkpoint DOES run. + // This resets the WAL accumulation. For the purest test of + // "never checkpointed" WAL, we only do this check occasionally. + console.log(` (checkpoint occurred due to close() -- WAL reset)`) + } catch (err) { + console.log(` CORRUPTION at cycle ${outerCycle}: ${err.message}`) + if (checkDb) + try { + await checkDb.close() + } catch (_) { + /* ignore */ + } + expect.fail( + `DB corrupted after ${outerCycle} kill cycles: ${err.message}`, + ) + } + } + + // No delay between cycles - maximum aggression + } + + // ---- Final comprehensive verification ---- + console.log('\n====== FINAL VERIFICATION ======') + console.log(`Completed ${TOTAL_OUTER_CYCLES} outer kill cycles`) + console.log(`Data dir size history:`) + for (const s of sizeHistory.filter( + (_, i) => i % 5 === 0 || i === sizeHistory.length - 1, + )) { + console.log( + ` Cycle ${s.outerCycle}: ${(s.sizeBytes / 1024 / 1024).toFixed(2)} MB`, + ) + } + + const finalSize = dirSizeBytes(dataDir) + console.log( + `Final data dir size: ${(finalSize / 1024 / 1024).toFixed(2)} MB`, + ) + + let finalDb = null + try { + finalDb = new PGlite(dataDir) + await Promise.race([ + finalDb.waitReady, + new Promise((_, reject) => + setTimeout( + () => reject(new Error('Final open timed out (WAL bloat)')), + 60000, + ), + ), + ]) + } catch (err) { + console.log( + `FINAL CORRUPTION after ${TOTAL_OUTER_CYCLES} cycles: ${err.message}`, + ) + expect.fail(`DB corrupted after all WAL bloat cycles: ${err.message}`) + } + + try { + // Health check + await finalDb.query('SELECT 1 as ok') + + // Verify tables exist + const tables = await finalDb.query(` + SELECT tablename FROM pg_tables + WHERE schemaname = 'public' + ORDER BY tablename + `) + const tableNames = tables.rows.map((r) => r.tablename) + expect(tableNames).toContain('wal_stress') + expect(tableNames).toContain('wal_stress_log') + console.log(`Tables: ${tableNames.join(', ')}`) + + // Row counts + const stressCount = await finalDb.query( + 'SELECT count(*)::int as cnt FROM wal_stress', + ) + const logCount = await finalDb.query( + 'SELECT count(*)::int as cnt FROM wal_stress_log', + ) + console.log(`wal_stress rows: ${stressCount.rows[0].cnt}`) + console.log(`wal_stress_log rows: ${logCount.rows[0].cnt}`) + expect(stressCount.rows[0].cnt).toBeGreaterThan(0) + + // Index verification + const indexes = await finalDb.query(` + SELECT indexname, tablename FROM pg_indexes + WHERE schemaname = 'public' + ORDER BY indexname + `) + console.log( + `Indexes: ${indexes.rows.map((r) => r.indexname).join(', ')}`, + ) + for (const idx of indexes.rows) { + await finalDb.query(`SELECT count(*) FROM "${idx.tablename}"`) + } + + // Full sequential scan of wal_stress + const allRows = await finalDb.query( + 'SELECT id, cycle, batch, kind, counter FROM wal_stress ORDER BY id', + ) + expect(allRows.rows.length).toBe(stressCount.rows[0].cnt) + + // Verify data integrity: no NULLs in NOT NULL columns + for (const row of allRows.rows) { + expect(row.id).not.toBeNull() + expect(row.cycle).not.toBeNull() + expect(row.batch).not.toBeNull() + expect(row.kind).not.toBeNull() + expect(['alpha', 'beta', 'gamma', 'delta']).toContain(row.kind) + } + + // Aggregate checks + const kindCounts = await finalDb.query(` + SELECT kind, count(*)::int as cnt, avg(counter)::int as avg_counter + FROM wal_stress + GROUP BY kind + ORDER BY kind + `) + console.log('Kind distribution:') + for (const row of kindCounts.rows) { + console.log( + ` ${row.kind}: ${row.cnt} rows, avg_counter=${row.avg_counter}`, + ) + } + + // Verify cycle distribution (should span multiple cycles) + const cycleDist = await finalDb.query(` + SELECT min(cycle)::int as min_cycle, max(cycle)::int as max_cycle, + count(DISTINCT cycle)::int as distinct_cycles + FROM wal_stress + `) + console.log( + `Cycle range: ${cycleDist.rows[0].min_cycle} - ${cycleDist.rows[0].max_cycle}, ` + + `${cycleDist.rows[0].distinct_cycles} distinct cycles`, + ) + expect(cycleDist.rows[0].distinct_cycles).toBeGreaterThan(1) + + // Verify log table integrity + const logRows = await finalDb.query( + 'SELECT id, cycle, operation, row_count FROM wal_stress_log ORDER BY id', + ) + expect(logRows.rows.length).toBe(logCount.rows[0].cnt) + for (const row of logRows.rows) { + expect(row.operation).toBe('full-cycle') + expect(row.row_count).toBeGreaterThan(0) + } + + // Verify we can still write after recovery + await finalDb.query( + `INSERT INTO wal_stress (cycle, batch, kind, value, counter) + VALUES ($1, $2, $3, $4, $5)`, + [99999, 0, 'alpha', 'final-verification-row', 0], + ) + const verifyRow = await finalDb.query( + `SELECT * FROM wal_stress WHERE cycle = 99999`, + ) + expect(verifyRow.rows.length).toBe(1) + + // Verify we can still UPDATE after recovery + await finalDb.query( + `UPDATE wal_stress SET counter = -1 WHERE cycle = 99999`, + ) + const updatedRow = await finalDb.query( + `SELECT counter FROM wal_stress WHERE cycle = 99999`, + ) + expect(updatedRow.rows[0].counter).toBe(-1) + + // Verify we can still DELETE after recovery + await finalDb.query(`DELETE FROM wal_stress WHERE cycle = 99999`) + const deletedRow = await finalDb.query( + `SELECT count(*)::int as cnt FROM wal_stress WHERE cycle = 99999`, + ) + expect(deletedRow.rows[0].cnt).toBe(0) + } finally { + await finalDb.close() + } + }, + { timeout: 300000 }, + ) + + it( + 'should survive burst-mode: 15 extremely rapid kill cycles with no delay', + async () => { + const { PGlite } = await import('../../dist/index.js') + const burstDataDir = `${dataDir}-burst` + const workerPath = new URL( + './workers/wal-bloat-no-checkpoint.js', + import.meta.url, + ).pathname + + const BURST_CYCLES = 15 + + for (let i = 0; i < BURST_CYCLES; i++) { + // Each burst cycle: open, do 1-2 inner cycles of heavy DML, SIGKILL + // Kill very aggressively: 300ms timer (may kill mid-INSERT) + const burstResult = await new Promise((resolve) => { + const messages = [] + let killed = false + + const child = fork(workerPath, [], { + env: { + ...process.env, + PGLITE_DATA_DIR: burstDataDir, + INNER_CYCLES: '2', + START_CYCLE: String(i * 2), + }, + stdio: ['pipe', 'pipe', 'pipe', 'ipc'], + }) + + child.stderr.on('data', () => {}) + child.stdout.on('data', () => {}) + + child.on('message', (msg) => { + messages.push(msg) + }) + + child.on('exit', (code, sig) => { + resolve({ + killed: killed || sig === 'SIGKILL', + exitCode: code, + messages, + }) + }) + + // Very aggressive: kill after 300ms no matter what + setTimeout(() => { + if (!killed) { + killed = true + child.kill('SIGKILL') + } + }, 300) + + // Safety + setTimeout(() => { + if (!killed) { + killed = true + child.kill('SIGKILL') + } + }, 30000) + }) + + // Worker should either be killed by timer, crash on open, or crash during DML + const workerKilled = burstResult.killed + const workerCrashed = + burstResult.exitCode !== 0 && burstResult.exitCode !== null + const workerCouldntOpen = !burstResult.messages.includes('ready') + + if (workerCouldntOpen && !workerKilled) { + console.log( + ` Burst cycle ${i}: Worker couldn't open DB (corruption from previous cycles)`, + ) + break // DB is corrupted, no point continuing + } + + // The worker must have been killed OR crashed (either before or after opening) + expect(workerKilled || workerCrashed || workerCouldntOpen).toBe(true) + } + + // Verify after burst + console.log('\n--- Burst mode verification ---') + const burstSize = dirSizeBytes(burstDataDir) + console.log( + `Burst data dir size: ${(burstSize / 1024 / 1024).toFixed(2)} MB`, + ) + + let db = null + try { + db = new PGlite(burstDataDir) + await Promise.race([ + db.waitReady, + new Promise((_, reject) => + setTimeout( + () => reject(new Error('Burst mode final open timed out')), + 30000, + ), + ), + ]) + + await db.query('SELECT 1') + + // The table may or may not exist depending on whether the first + // cycle's schema creation completed before the 300ms kill + const tableCheck = await db.query(` + SELECT tablename FROM pg_tables + WHERE schemaname = 'public' AND tablename = 'wal_stress' + `) + + if (tableCheck.rows.length > 0) { + const count = await db.query( + 'SELECT count(*)::int as cnt FROM wal_stress', + ) + console.log(`Burst mode rows: ${count.rows[0].cnt}`) + + // Full scan + const rows = await db.query( + 'SELECT id, cycle, kind FROM wal_stress ORDER BY id', + ) + expect(rows.rows.length).toBe(count.rows[0].cnt) + } else { + // Table doesn't exist = schema creation was killed every time. + // This is a valid outcome, not corruption. + console.log('Burst mode: schema never completed (killed too fast)') + } + + // Verify DB is writable + await db.query(` + CREATE TABLE IF NOT EXISTS burst_verify (id SERIAL PRIMARY KEY, ok BOOLEAN) + `) + await db.query(`INSERT INTO burst_verify (ok) VALUES (true)`) + const verify = await db.query('SELECT ok FROM burst_verify') + expect(verify.rows[0].ok).toBe(true) + + await db.close() + } catch (err) { + console.log(`Burst mode CORRUPTION: ${err.message}`) + if (db) + try { + await db.close() + } catch (_) { + /* ignore */ + } + expect.fail(`Burst mode corrupted DB: ${err.message}`) + } finally { + if (!process.env.RETAIN_DATA) { + if (existsSync(burstDataDir)) { + rmSync(burstDataDir, { recursive: true, force: true }) + } + const lockFile = burstDataDir + '.lock' + if (existsSync(lockFile)) { + rmSync(lockFile, { force: true }) + } + } + } + }, + { timeout: 180000 }, + ) +}) diff --git a/packages/pglite/tests/crash-safety/workers/hmr-double-instance.js b/packages/pglite/tests/crash-safety/workers/hmr-double-instance.js new file mode 100644 index 000000000..edee7ba82 --- /dev/null +++ b/packages/pglite/tests/crash-safety/workers/hmr-double-instance.js @@ -0,0 +1,88 @@ +// HMR double-instance: opens instance A, then tries to open instance B on the +// same data dir without closing A. Instance B should be blocked by file locking. +// Process stays alive for external SIGKILL. + +import { PGlite } from '../../../dist/index.js' + +const dataDir = process.env.PGLITE_DATA_DIR +const cycle = parseInt(process.env.CYCLE || '0', 10) +const overlapOps = parseInt(process.env.OVERLAP_OPS || '20', 10) + +async function run() { + const instanceA = new PGlite(dataDir) + await instanceA.waitReady + process.send('instance-a-ready') + + if (cycle === 0) { + await instanceA.query(` + CREATE TABLE IF NOT EXISTS hmr_data ( + id SERIAL PRIMARY KEY, + cycle INTEGER NOT NULL, + instance TEXT NOT NULL, + phase TEXT NOT NULL, + seq INTEGER NOT NULL, + payload TEXT NOT NULL, + created_at TIMESTAMP DEFAULT NOW() + ) + `) + await instanceA.query(` + CREATE INDEX IF NOT EXISTS idx_hmr_data_cycle ON hmr_data (cycle) + `) + await instanceA.query(` + CREATE INDEX IF NOT EXISTS idx_hmr_data_instance ON hmr_data (instance) + `) + process.send('schema-created') + } + + const padding = 'A'.repeat(500) + for (let i = 0; i < 10; i++) { + await instanceA.query( + `INSERT INTO hmr_data (cycle, instance, phase, seq, payload) + VALUES ($1, $2, $3, $4, $5)`, + [cycle, 'A', 'pre-hmr', i, `cycle${cycle}-A-pre-${i}-${padding}`], + ) + } + process.send('instance-a-wrote') + + // Instance B should fail to open due to file lock + try { + const instanceB = new PGlite(dataDir) + await instanceB.waitReady + process.send('instance-b-ready') + + for (let i = 0; i < overlapOps; i++) { + await instanceB.query( + `INSERT INTO hmr_data (cycle, instance, phase, seq, payload) + VALUES ($1, $2, $3, $4, $5)`, + [cycle, 'B', 'post-hmr', i, `cycle${cycle}-B-post-${i}-${padding}`], + ) + } + process.send('overlap-done') + } catch (err) { + process.send('instance-b-blocked') + process.send(`lock-error:${err.message}`) + } + + for (let i = 10; i < 20; i++) { + await instanceA.query( + `INSERT INTO hmr_data (cycle, instance, phase, seq, payload) + VALUES ($1, $2, $3, $4, $5)`, + [cycle, 'A', 'post-hmr', i, `cycle${cycle}-A-post-${i}-${padding}`], + ) + } + process.send('instance-a-continued') + + process.send('all-operations-done') + + await new Promise(() => {}) +} + +run().catch((err) => { + console.error(`HMR worker cycle ${cycle} error:`, err) + try { + process.send(`fatal:${err.message}`) + } catch (_) { + /* ignore */ + } + process.exit(1) +}) diff --git a/packages/pglite/tests/crash-safety/workers/overlapping-ddl-writer.js b/packages/pglite/tests/crash-safety/workers/overlapping-ddl-writer.js new file mode 100644 index 000000000..12cce03dc --- /dev/null +++ b/packages/pglite/tests/crash-safety/workers/overlapping-ddl-writer.js @@ -0,0 +1,116 @@ +// DDL/DML writer: runs DDL or DML ops based on WRITER_MODE env var. +// Killed externally by parent to simulate crash. + +import { PGlite } from '../../../dist/index.js' + +const dataDir = process.env.PGLITE_DATA_DIR +const mode = process.env.WRITER_MODE || 'dml' +const cycle = parseInt(process.env.CYCLE || '0', 10) + +async function run() { + const pad = 'D'.repeat(300) + + const db = new PGlite(dataDir) + await db.waitReady + process.send('ready') + + if (mode === 'ddl') { + try { + await db.query(` + CREATE TABLE IF NOT EXISTS ddl_base ( + id SERIAL PRIMARY KEY, + cycle INTEGER NOT NULL, + data TEXT NOT NULL + ) + `) + process.send('base-table-ready') + + const tbl = `ddl_cycle_${cycle}` + await db.query(` + CREATE TABLE IF NOT EXISTS ${tbl} ( + id SERIAL PRIMARY KEY, + val TEXT NOT NULL, + num INTEGER DEFAULT 0 + ) + `) + process.send('new-table-created') + + await db.query( + `ALTER TABLE ${tbl} ADD COLUMN IF NOT EXISTS extra TEXT DEFAULT 'none'`, + ) + process.send('alter-done') + + await db.query( + `CREATE INDEX IF NOT EXISTS idx_${tbl}_val ON ${tbl} (val)`, + ) + await db.query( + `CREATE INDEX IF NOT EXISTS idx_${tbl}_num ON ${tbl} (num)`, + ) + process.send('indexes-created') + + for (let i = 0; i < 30; i++) { + await db.query( + `INSERT INTO ${tbl} (val, num, extra) VALUES ($1, $2, $3)`, + [`val-${i}`, i * 10, `extra-${i}-${pad}`], + ) + } + process.send('ddl-inserts-done') + + for (let i = 0; i < 20; i++) { + await db.query(`INSERT INTO ddl_base (cycle, data) VALUES ($1, $2)`, [ + cycle, + `ddl-writer-${i}-${pad}`, + ]) + } + process.send('ddl-base-inserts-done') + } catch (err) { + process.send(`ddl-error:${err.message}`) + } + } else { + try { + await db.query(` + CREATE TABLE IF NOT EXISTS ddl_base ( + id SERIAL PRIMARY KEY, + cycle INTEGER NOT NULL, + data TEXT NOT NULL + ) + `) + process.send('base-table-ready') + + for (let i = 0; i < 50; i++) { + await db.query(`INSERT INTO ddl_base (cycle, data) VALUES ($1, $2)`, [ + cycle, + `dml-writer-${i}-${pad}`, + ]) + } + process.send('dml-inserts-done') + + await db.query( + `UPDATE ddl_base SET data = data || '-updated' WHERE cycle = $1`, + [cycle], + ) + process.send('dml-updates-done') + + await db.query(`DELETE FROM ddl_base WHERE cycle = $1 AND id % 3 = 0`, [ + cycle, + ]) + process.send('dml-deletes-done') + } catch (err) { + process.send(`dml-error:${err.message}`) + } + } + + process.send('all-done') + + await new Promise(() => {}) +} + +run().catch((err) => { + console.error(`DDL writer (${mode}) cycle ${cycle} error:`, err) + try { + process.send(`fatal:${err.message}`) + } catch (_) { + /* ignore */ + } + process.exit(1) +}) diff --git a/packages/pglite/tests/crash-safety/workers/overlapping-rapid-cycling.js b/packages/pglite/tests/crash-safety/workers/overlapping-rapid-cycling.js new file mode 100644 index 000000000..f15597619 --- /dev/null +++ b/packages/pglite/tests/crash-safety/workers/overlapping-rapid-cycling.js @@ -0,0 +1,91 @@ +// Rapid instance cycling: opens N instances on the same data dir without +// closing any. All hold stale WASM heaps simultaneously. Simulates worst-case +// dev server rapid reload. + +import { PGlite } from '../../../dist/index.js' + +const dataDir = process.env.PGLITE_DATA_DIR +const numInstances = parseInt(process.env.NUM_INSTANCES || '10', 10) + +async function run() { + const instances = [] + const pad = 'R'.repeat(500) + + const first = new PGlite(dataDir) + await first.waitReady + instances.push(first) + + await first.query(` + CREATE TABLE IF NOT EXISTS rapid_data ( + id SERIAL PRIMARY KEY, + instance_num INTEGER NOT NULL, + seq INTEGER NOT NULL, + payload TEXT NOT NULL + ) + `) + await first.query( + `CREATE INDEX IF NOT EXISTS idx_rapid_inst ON rapid_data (instance_num)`, + ) + process.send('schema-created') + + await first.query( + `INSERT INTO rapid_data (instance_num, seq, payload) VALUES ($1, $2, $3)`, + [0, 0, `inst-0-row-0-${pad}`], + ) + process.send('instance-0-wrote') + + for (let i = 1; i < numInstances; i++) { + const inst = new PGlite(dataDir) + await inst.waitReady + instances.push(inst) + + await inst.query( + `INSERT INTO rapid_data (instance_num, seq, payload) VALUES ($1, $2, $3)`, + [i, 0, `inst-${i}-row-0-${pad}`], + ) + process.send(`instance-${i}-wrote`) + + // Stale cache: also write from a random older instance + if (instances.length > 2) { + const oldIdx = Math.floor(Math.random() * (instances.length - 1)) + try { + await instances[oldIdx].query( + `INSERT INTO rapid_data (instance_num, seq, payload) VALUES ($1, $2, $3)`, + [oldIdx, i, `inst-${oldIdx}-stale-write-from-cycle-${i}-${pad}`], + ) + } catch (err) { + process.send(`stale-write-error-${oldIdx}:${err.message}`) + } + } + } + + process.send('all-instances-created') + + try { + await Promise.all( + instances.map((inst, idx) => + inst.query( + `INSERT INTO rapid_data (instance_num, seq, payload) VALUES ($1, $2, $3)`, + [idx, 999, `inst-${idx}-final-burst-${pad}`], + ), + ), + ) + process.send('final-burst-done') + } catch (err) { + process.send(`final-burst-error:${err.message}`) + } + + process.send('all-done') + + await new Promise(() => {}) +} + +run().catch((err) => { + console.error(`Rapid cycling worker error:`, err) + try { + process.send(`fatal:${err.message}`) + } catch (_) { + /* ignore */ + } + process.exit(1) +}) diff --git a/packages/pglite/tests/crash-safety/workers/overlapping-staggered.js b/packages/pglite/tests/crash-safety/workers/overlapping-staggered.js new file mode 100644 index 000000000..ca6cdc359 --- /dev/null +++ b/packages/pglite/tests/crash-safety/workers/overlapping-staggered.js @@ -0,0 +1,101 @@ +// Staggered overlap: instance A starts writing, instance B opens after a delay +// while A is still active. Both write concurrently to the same data dir. + +import { PGlite } from '../../../dist/index.js' + +const dataDir = process.env.PGLITE_DATA_DIR +const cycle = parseInt(process.env.CYCLE || '0', 10) +const staggerMs = parseInt(process.env.STAGGER_MS || '500', 10) + +async function run() { + const pad = 'S'.repeat(300) + + const a = new PGlite(dataDir) + await a.waitReady + process.send('a-ready') + + if (cycle === 0) { + await a.query(` + CREATE TABLE IF NOT EXISTS stagger_data ( + id SERIAL PRIMARY KEY, + cycle INTEGER NOT NULL, + instance TEXT NOT NULL, + seq INTEGER NOT NULL, + payload TEXT NOT NULL + ) + `) + await a.query( + `CREATE INDEX IF NOT EXISTS idx_stagger_cycle ON stagger_data (cycle)`, + ) + process.send('schema-created') + } + + const aWritePromise = (async () => { + for (let i = 0; i < 30; i++) { + await a.query( + `INSERT INTO stagger_data (cycle, instance, seq, payload) VALUES ($1, $2, $3, $4)`, + [cycle, 'A', i, `c${cycle}-A-${i}-${pad}`], + ) + if (i % 5 === 0) { + await new Promise((r) => setTimeout(r, 10)) + } + } + await a.query( + `UPDATE stagger_data SET payload = payload || '-a-updated' WHERE instance = 'A' AND cycle = $1 AND seq < 10`, + [cycle], + ) + })() + + await new Promise((r) => setTimeout(r, staggerMs)) + + const b = new PGlite(dataDir) + await b.waitReady + process.send('b-ready') + + const bWritePromise = (async () => { + for (let i = 0; i < 30; i++) { + await b.query( + `INSERT INTO stagger_data (cycle, instance, seq, payload) VALUES ($1, $2, $3, $4)`, + [cycle, 'B', i, `c${cycle}-B-${i}-${pad}`], + ) + } + // Deletes touch pages A may have cached + try { + await b.query( + `DELETE FROM stagger_data WHERE instance = 'A' AND cycle = $1 AND seq > 20`, + [cycle], + ) + } catch (_) { + /* ignore */ + } + })() + + await Promise.allSettled([aWritePromise, bWritePromise]) + process.send('overlap-writes-done') + + // A continues with stale cache after B has mutated pages + try { + for (let i = 50; i < 60; i++) { + await a.query( + `INSERT INTO stagger_data (cycle, instance, seq, payload) VALUES ($1, $2, $3, $4)`, + [cycle, 'A', i, `c${cycle}-A-stale-${i}-${pad}`], + ) + } + process.send('a-stale-writes-done') + } catch (err) { + process.send(`a-stale-error:${err.message}`) + } + + process.send('all-done') + await new Promise(() => {}) +} + +run().catch((err) => { + console.error(`Staggered worker cycle ${cycle} error:`, err) + try { + process.send(`fatal:${err.message}`) + } catch (_) { + /* ignore */ + } + process.exit(1) +}) diff --git a/packages/pglite/tests/crash-safety/workers/overlapping-three-instances.js b/packages/pglite/tests/crash-safety/workers/overlapping-three-instances.js new file mode 100644 index 000000000..51196a2c2 --- /dev/null +++ b/packages/pglite/tests/crash-safety/workers/overlapping-three-instances.js @@ -0,0 +1,104 @@ +// Three simultaneous instances on the same data dir, none closed. +// All three write concurrently with stale WASM heaps. + +import { PGlite } from '../../../dist/index.js' + +const dataDir = process.env.PGLITE_DATA_DIR +const cycle = parseInt(process.env.CYCLE || '0', 10) + +async function run() { + const a = new PGlite(dataDir) + await a.waitReady + process.send('instance-a-ready') + + if (cycle === 0) { + await a.query(` + CREATE TABLE IF NOT EXISTS triple_data ( + id SERIAL PRIMARY KEY, + cycle INTEGER NOT NULL, + instance TEXT NOT NULL, + seq INTEGER NOT NULL, + payload TEXT NOT NULL + ) + `) + await a.query( + `CREATE INDEX IF NOT EXISTS idx_triple_cycle ON triple_data (cycle)`, + ) + process.send('schema-created') + } + + const pad = 'X'.repeat(400) + for (let i = 0; i < 10; i++) { + await a.query( + `INSERT INTO triple_data (cycle, instance, seq, payload) VALUES ($1, $2, $3, $4)`, + [cycle, 'A', i, `c${cycle}-A-${i}-${pad}`], + ) + } + process.send('a-wrote') + + const b = new PGlite(dataDir) + await b.waitReady + process.send('instance-b-ready') + + for (let i = 0; i < 10; i++) { + await b.query( + `INSERT INTO triple_data (cycle, instance, seq, payload) VALUES ($1, $2, $3, $4)`, + [cycle, 'B', i, `c${cycle}-B-${i}-${pad}`], + ) + } + process.send('b-wrote') + + const c = new PGlite(dataDir) + await c.waitReady + process.send('instance-c-ready') + + const writeAll = async (inst, name, start, count) => { + for (let i = start; i < start + count; i++) { + await inst.query( + `INSERT INTO triple_data (cycle, instance, seq, payload) VALUES ($1, $2, $3, $4)`, + [cycle, name, i, `c${cycle}-${name}-concurrent-${i}-${pad}`], + ) + } + } + + await Promise.all([ + writeAll(a, 'A', 100, 20), + writeAll(b, 'B', 100, 20), + writeAll(c, 'C', 100, 20), + ]) + process.send('concurrent-writes-done') + + // Mixed ops: A updates while B and C insert (page conflicts) + try { + await Promise.all([ + a.query( + `UPDATE triple_data SET payload = 'A-updated' WHERE instance = 'A' AND cycle = $1 AND seq < 5`, + [cycle], + ), + b.query( + `INSERT INTO triple_data (cycle, instance, seq, payload) VALUES ($1, 'B', 200, 'b-extra')`, + [cycle], + ), + c.query( + `INSERT INTO triple_data (cycle, instance, seq, payload) VALUES ($1, 'C', 200, 'c-extra')`, + [cycle], + ), + ]) + } catch (err) { + process.send(`mixed-ops-error:${err.message}`) + } + + process.send('all-done') + + await new Promise(() => {}) +} + +run().catch((err) => { + console.error(`Three-instance worker cycle ${cycle} error:`, err) + try { + process.send(`fatal:${err.message}`) + } catch (_) { + /* ignore */ + } + process.exit(1) +}) diff --git a/packages/pglite/tests/crash-safety/workers/wal-bloat-no-checkpoint.js b/packages/pglite/tests/crash-safety/workers/wal-bloat-no-checkpoint.js new file mode 100644 index 000000000..c128b2867 --- /dev/null +++ b/packages/pglite/tests/crash-safety/workers/wal-bloat-no-checkpoint.js @@ -0,0 +1,174 @@ +// WAL bloat: heavy INSERT/UPDATE/DELETE without ever calling close() or CHECKPOINT. +// Parent SIGKILLs at random points. Tests recovery under extreme WAL pressure +// with complex entries (DDL, partial indexes, scatter updates). + +import { PGlite } from '../../../dist/index.js' + +const dataDir = process.env.PGLITE_DATA_DIR +const innerCycles = parseInt(process.env.INNER_CYCLES || '10', 10) +const startCycle = parseInt(process.env.START_CYCLE || '0', 10) + +let seed = startCycle * 1000 + 42 +function nextRand() { + seed = (seed * 1103515245 + 12345) & 0x7fffffff + return seed +} + +async function run() { + const db = new PGlite(dataDir) + await db.waitReady + process.send('ready') + + // Idempotent — prior cycle may have been killed mid-schema-creation + await db.query(` + CREATE TABLE IF NOT EXISTS wal_stress ( + id SERIAL PRIMARY KEY, + cycle INTEGER NOT NULL, + batch INTEGER NOT NULL, + kind TEXT NOT NULL, + value TEXT NOT NULL, + counter INTEGER DEFAULT 0, + updated_at TIMESTAMP DEFAULT NOW() + ) + `) + await db.query(` + CREATE INDEX IF NOT EXISTS idx_wal_stress_cycle ON wal_stress (cycle) + `) + await db.query(` + CREATE INDEX IF NOT EXISTS idx_wal_stress_kind ON wal_stress (kind) + `) + await db.query(` + CREATE TABLE IF NOT EXISTS wal_stress_log ( + id SERIAL PRIMARY KEY, + cycle INTEGER NOT NULL, + operation TEXT NOT NULL, + row_count INTEGER, + logged_at TIMESTAMP DEFAULT NOW() + ) + `) + process.send('schema-created') + + for (let c = 0; c < innerCycles; c++) { + const cycleNum = startCycle + c + process.send(`cycle-start:${cycleNum}`) + + const padding = 'W'.repeat(800) + for (let i = 0; i < 50; i++) { + const kind = ['alpha', 'beta', 'gamma', 'delta'][nextRand() % 4] + await db.query( + `INSERT INTO wal_stress (cycle, batch, kind, value, counter) + VALUES ($1, $2, $3, $4, $5)`, + [ + cycleNum, + i, + kind, + `c${cycleNum}-b${i}-${kind}-${padding}`, + nextRand() % 10000, + ], + ) + } + process.send(`inserts-done:${cycleNum}`) + + const kinds = ['alpha', 'beta', 'gamma', 'delta'] + const targetKind = kinds[nextRand() % 4] + await db.query( + `UPDATE wal_stress + SET counter = counter + $1, value = value || $2, updated_at = NOW() + WHERE kind = $3`, + [nextRand() % 100, `-upd${cycleNum}`, targetKind], + ) + + const updateCycle = nextRand() % (cycleNum + 1) + await db.query( + `UPDATE wal_stress + SET counter = counter + 1 + WHERE cycle = $1 AND batch < 25`, + [updateCycle], + ) + + const step = (nextRand() % 5) + 2 + await db.query( + `UPDATE wal_stress + SET value = LEFT(value, 200) || $1 + WHERE id % $2 = 0`, + [`-scatter${cycleNum}`, step], + ) + + process.send(`updates-done:${cycleNum}`) + + if (cycleNum > 2) { + const deleteCycle = nextRand() % Math.max(1, cycleNum - 1) + const deleteLimit = (nextRand() % 15) + 5 + await db.query( + `DELETE FROM wal_stress + WHERE id IN ( + SELECT id FROM wal_stress + WHERE cycle = $1 AND batch >= $2 + ORDER BY id + LIMIT $3 + )`, + [deleteCycle, 30, deleteLimit], + ) + } + + if (cycleNum % 3 === 0) { + await db.query( + `DELETE FROM wal_stress + WHERE kind = $1 AND cycle < $2 AND batch > 40`, + [kinds[nextRand() % 4], cycleNum], + ) + } + + process.send(`deletes-done:${cycleNum}`) + + await db.query( + `INSERT INTO wal_stress_log (cycle, operation, row_count) + VALUES ($1, $2, (SELECT count(*) FROM wal_stress))`, + [cycleNum, 'full-cycle'], + ) + + // Occasional DDL to diversify WAL entries + if (cycleNum % 5 === 0 && cycleNum > 0) { + const colName = `extra_${cycleNum}` + await db.query(` + ALTER TABLE wal_stress ADD COLUMN IF NOT EXISTS ${colName} TEXT DEFAULT NULL + `) + + await db.query(` + CREATE INDEX IF NOT EXISTS idx_wal_stress_c${cycleNum} + ON wal_stress (counter) + WHERE cycle = ${cycleNum} + `) + + process.send(`ddl-done:${cycleNum}`) + } + + if (cycleNum % 4 === 0) { + await db.query( + `UPDATE wal_stress + SET counter = counter + $1 + WHERE cycle >= $2`, + [1, Math.max(0, cycleNum - 3)], + ) + } + + process.send(`cycle-done:${cycleNum}`) + } + + process.send('all-cycles-done') + + // No close() or CHECKPOINT — keep alive for SIGKILL. + // setInterval needed because a bare Promise doesn't keep Node alive. + setInterval(() => {}, 60000) + await new Promise(() => {}) +} + +run().catch((err) => { + console.error(`WAL bloat worker error:`, err) + try { + process.send(`fatal:${err.message}`) + } catch (_) { + /* ignore */ + } + process.exit(1) +}) From 92653eb2709fb43ba5b15fb8918e923e4199d7a9 Mon Sep 17 00:00:00 2001 From: reisepass <1474408+reisepass@users.noreply.github.com> Date: Tue, 17 Feb 2026 00:31:07 +0100 Subject: [PATCH 2/3] Fix PGlite crash safety: PID file lock and partial initdb detection - Add PID-based file lock to NodeFS to prevent overlapping instance corruption - Detect partially-initialized data dirs and move to .corrupt- backup - Add tests verifying partial init backup behavior --- .changeset/add-nodefs-lock-file.md | 5 + .gitignore | 4 + packages/pglite/src/fs/nodefs.ts | 105 +++++++++++++ .../crash-safety/partial-init-backup.test.js | 146 ++++++++++++++++++ 4 files changed, 260 insertions(+) create mode 100644 .changeset/add-nodefs-lock-file.md create mode 100644 packages/pglite/tests/crash-safety/partial-init-backup.test.js diff --git a/.changeset/add-nodefs-lock-file.md b/.changeset/add-nodefs-lock-file.md new file mode 100644 index 000000000..c2bdbe9f5 --- /dev/null +++ b/.changeset/add-nodefs-lock-file.md @@ -0,0 +1,5 @@ +--- +'@electric-sql/pglite': patch +--- + +Add data directory locking and partial initdb recovery to NodeFS diff --git a/.gitignore b/.gitignore index 22bb9a307..370b92fd8 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,7 @@ node_modules docs/.vitepress/dist docs/.vitepress/cache + +# Local test logs +packages/pglite/tests/crash-safety/*.log +prompt_history.txt diff --git a/packages/pglite/src/fs/nodefs.ts b/packages/pglite/src/fs/nodefs.ts index fb411aab5..3383274ad 100644 --- a/packages/pglite/src/fs/nodefs.ts +++ b/packages/pglite/src/fs/nodefs.ts @@ -4,8 +4,11 @@ import { EmscriptenBuiltinFilesystem, PGDATA } from './base.js' import type { PostgresMod } from '../postgresMod.js' import { PGlite } from '../pglite.js' +// TODO: Add locking for browser backends via Web Locks API + export class NodeFS extends EmscriptenBuiltinFilesystem { protected rootDir: string + #lockFd: number | null = null constructor(dataDir: string) { super(dataDir) @@ -17,6 +20,10 @@ export class NodeFS extends EmscriptenBuiltinFilesystem { async init(pg: PGlite, opts: Partial) { this.pg = pg + + this.#acquireLock() + this.#cleanPartialInit() + const options: Partial = { ...opts, preRun: [ @@ -31,7 +38,105 @@ export class NodeFS extends EmscriptenBuiltinFilesystem { return { emscriptenOpts: options } } + // Lock file is a sibling (mydb.lock) to avoid polluting the PG data dir + #acquireLock() { + const lockPath = this.rootDir + '.lock' + + if (fs.existsSync(lockPath)) { + try { + const content = fs.readFileSync(lockPath, 'utf-8').trim() + const lines = content.split('\n') + const pid = parseInt(lines[0], 10) + + if (pid && !isNaN(pid) && this.#isProcessAlive(pid)) { + throw new Error( + `Data directory "${this.rootDir}" is locked by another PGlite instance ` + + `(PID ${pid}). Close the other instance first, or delete ` + + `"${lockPath}" if the process is no longer running.`, + ) + } + // Stale lock from a dead process — safe to take over + } catch (e) { + // Re-throw lock errors, ignore parse errors (corrupt lock file = stale) + if (e instanceof Error && e.message.includes('is locked by')) { + throw e + } + } + } + + // Write our PID to the lock file and keep the fd open + this.#lockFd = fs.openSync(lockPath, 'w') + fs.writeSync(this.#lockFd, `${process.pid}\n${Date.now()}\n`) + } + + #releaseLock() { + if (this.#lockFd !== null) { + try { + fs.closeSync(this.#lockFd) + } catch { + // Ignore errors on close + } + this.#lockFd = null + + const lockPath = this.rootDir + '.lock' + try { + fs.unlinkSync(lockPath) + } catch { + // Ignore errors on unlink (dir may already be cleaned up) + } + } + } + + // If initdb was killed mid-way, the data dir is incomplete and unrecoverable. + // A fully initialized PG always has 3+ databases in base/ (template0, template1, postgres). + #cleanPartialInit() { + try { + const entries = fs.readdirSync(this.rootDir) + if (entries.length === 0) return + + const pgVersionPath = path.join(this.rootDir, 'PG_VERSION') + if (!fs.existsSync(pgVersionPath)) { + this.#moveDataDirToBackup() + return + } + + const basePath = path.join(this.rootDir, 'base') + if (fs.existsSync(basePath)) { + const databases = fs.readdirSync(basePath) + if (databases.length < 3) { + this.#moveDataDirToBackup() + return + } + } else { + this.#moveDataDirToBackup() + } + } catch { + // If we can't read the directory, let PostgreSQL handle the error + } + } + + #moveDataDirToBackup() { + const timestamp = new Date().toISOString().replace(/[:.]/g, '-') + const backupPath = `${this.rootDir}.corrupt-${timestamp}` + fs.renameSync(this.rootDir, backupPath) + fs.mkdirSync(this.rootDir) + console.warn( + `PGlite: Detected partially-initialized data directory. ` + + `Moved to "${backupPath}" for recovery. A fresh database will be created.`, + ) + } + + #isProcessAlive(pid: number): boolean { + try { + process.kill(pid, 0) // signal 0 = check if process exists + return true + } catch { + return false // ESRCH = process doesn't exist + } + } + async closeFs(): Promise { + this.#releaseLock() this.pg!.Module.FS.quit() } } diff --git a/packages/pglite/tests/crash-safety/partial-init-backup.test.js b/packages/pglite/tests/crash-safety/partial-init-backup.test.js new file mode 100644 index 000000000..1ef545dab --- /dev/null +++ b/packages/pglite/tests/crash-safety/partial-init-backup.test.js @@ -0,0 +1,146 @@ +import { describe, it, expect, afterAll } from 'vitest' +import { + existsSync, + mkdirSync, + writeFileSync, + readdirSync, + rmSync, +} from 'node:fs' +import { join, dirname } from 'node:path' +import { tryOpen, cleanupDataDir, testDataDir } from './harness.js' + +const dataDir = testDataDir('partial-init-backup') +const parentDir = dirname(dataDir) + +afterAll(async () => { + if (!process.env.RETAIN_DATA) { + cleanupDataDir(dataDir) + // Clean up any .corrupt-* backup directories + if (existsSync(parentDir)) { + const entries = readdirSync(parentDir) + for (const entry of entries) { + if (entry.startsWith(dataDir.split('/').pop() + '.corrupt-')) { + rmSync(join(parentDir, entry), { recursive: true, force: true }) + } + } + } + } +}) + +describe('partial init detection: backup instead of wipe', () => { + it( + 'should move a partially-initialized data dir to a .corrupt-* backup', + async () => { + // Create a data directory that looks like a partial initdb: + // has some files but no PG_VERSION (very early interruption) + mkdirSync(dataDir, { recursive: true }) + writeFileSync(join(dataDir, 'postgresql.conf'), '# partial config') + mkdirSync(join(dataDir, 'base'), { recursive: true }) + + // Open PGlite — it should detect partial init and move to backup + const opened = await tryOpen(dataDir) + expect(opened.success).toBe(true) + + // Verify a .corrupt-* backup was created as a sibling + const siblings = readdirSync(parentDir) + const baseName = dataDir.split('/').pop() + const backups = siblings.filter((s) => + s.startsWith(baseName + '.corrupt-'), + ) + expect(backups.length).toBeGreaterThanOrEqual(1) + + // Verify the backup contains our original files + const backupPath = join(parentDir, backups[0]) + const backupContents = readdirSync(backupPath) + expect(backupContents).toContain('postgresql.conf') + expect(backupContents).toContain('base') + + if (opened.db) { + await opened.db.close() + } + }, + { timeout: 60000 }, + ) + + it( + 'should move a data dir with PG_VERSION but incomplete base/ databases to backup', + async () => { + // Clean from previous test + cleanupDataDir(dataDir) + mkdirSync(dataDir, { recursive: true }) + + // Simulate a later-stage partial initdb: PG_VERSION exists + // but base/ has fewer than 3 database directories + writeFileSync(join(dataDir, 'PG_VERSION'), '16') + mkdirSync(join(dataDir, 'base', '1'), { recursive: true }) + writeFileSync( + join(dataDir, 'base', '1', 'pg_filenode.map'), + 'fake catalog data', + ) + mkdirSync(join(dataDir, 'global'), { recursive: true }) + + const opened = await tryOpen(dataDir) + expect(opened.success).toBe(true) + + // Verify backup was created + const siblings = readdirSync(parentDir) + const baseName = dataDir.split('/').pop() + const backups = siblings.filter((s) => + s.startsWith(baseName + '.corrupt-'), + ) + expect(backups.length).toBeGreaterThanOrEqual(2) // one from each test + + // Find the latest backup + const latestBackup = join(parentDir, backups.sort().pop()) + const backupContents = readdirSync(latestBackup) + expect(backupContents).toContain('PG_VERSION') + expect(backupContents).toContain('base') + expect(backupContents).toContain('global') + + if (opened.db) { + await opened.db.close() + } + }, + { timeout: 60000 }, + ) + + it( + 'should NOT move a fully-initialized data directory', + async () => { + // Clean from previous tests + cleanupDataDir(dataDir) + + // Let PGlite do a real fresh init + const opened = await tryOpen(dataDir) + expect(opened.success).toBe(true) + + // Count backups before closing and reopening + const siblingsBefore = readdirSync(parentDir) + const baseName = dataDir.split('/').pop() + const backupsBefore = siblingsBefore.filter((s) => + s.startsWith(baseName + '.corrupt-'), + ) + + if (opened.db) { + await opened.db.close() + } + + // Reopen — should NOT create a backup since the dir is fully initialized + const reopened = await tryOpen(dataDir) + expect(reopened.success).toBe(true) + + const siblingsAfter = readdirSync(parentDir) + const backupsAfter = siblingsAfter.filter((s) => + s.startsWith(baseName + '.corrupt-'), + ) + + // No new backups should have been created + expect(backupsAfter.length).toBe(backupsBefore.length) + + if (reopened.db) { + await reopened.db.close() + } + }, + { timeout: 60000 }, + ) +}) From cb58e9ae8f917d6a4131ab531b82d5ad0db59c86 Mon Sep 17 00:00:00 2001 From: reisepass <1474408+reisepass@users.noreply.github.com> Date: Fri, 20 Feb 2026 10:39:39 +0100 Subject: [PATCH 3/3] Simplify to single lock test, improve error message, drop extra crash-safety tests - Replace verbose crash-safety test suite with one focused nodefs-lock test - Update error message to clearly state data dir is "already in use" with actionable guidance (close other instance, use different dir, or delete lock) - Remove partial initdb detection (out of scope for this PR) - Clean up .gitignore and changeset description --- .changeset/add-nodefs-lock-file.md | 2 +- .gitignore | 4 - packages/pglite/src/fs/nodefs.ts | 48 +- packages/pglite/tests/crash-safety/README.md | 133 ---- packages/pglite/tests/crash-safety/harness.js | 188 ----- .../crash-safety/hmr-double-instance.test.js | 425 ------------ .../overlapping-instances.test.js | 486 ------------- .../crash-safety/partial-init-backup.test.js | 146 ---- .../wal-bloat-no-checkpoint.test.js | 651 ------------------ .../workers/hmr-double-instance.js | 88 --- .../workers/overlapping-ddl-writer.js | 116 ---- .../workers/overlapping-rapid-cycling.js | 91 --- .../workers/overlapping-staggered.js | 101 --- .../workers/overlapping-three-instances.js | 104 --- .../workers/wal-bloat-no-checkpoint.js | 174 ----- packages/pglite/tests/nodefs-lock.test.js | 71 ++ 16 files changed, 76 insertions(+), 2752 deletions(-) delete mode 100644 packages/pglite/tests/crash-safety/README.md delete mode 100644 packages/pglite/tests/crash-safety/harness.js delete mode 100644 packages/pglite/tests/crash-safety/hmr-double-instance.test.js delete mode 100644 packages/pglite/tests/crash-safety/overlapping-instances.test.js delete mode 100644 packages/pglite/tests/crash-safety/partial-init-backup.test.js delete mode 100644 packages/pglite/tests/crash-safety/wal-bloat-no-checkpoint.test.js delete mode 100644 packages/pglite/tests/crash-safety/workers/hmr-double-instance.js delete mode 100644 packages/pglite/tests/crash-safety/workers/overlapping-ddl-writer.js delete mode 100644 packages/pglite/tests/crash-safety/workers/overlapping-rapid-cycling.js delete mode 100644 packages/pglite/tests/crash-safety/workers/overlapping-staggered.js delete mode 100644 packages/pglite/tests/crash-safety/workers/overlapping-three-instances.js delete mode 100644 packages/pglite/tests/crash-safety/workers/wal-bloat-no-checkpoint.js create mode 100644 packages/pglite/tests/nodefs-lock.test.js diff --git a/.changeset/add-nodefs-lock-file.md b/.changeset/add-nodefs-lock-file.md index c2bdbe9f5..3f7d1a882 100644 --- a/.changeset/add-nodefs-lock-file.md +++ b/.changeset/add-nodefs-lock-file.md @@ -2,4 +2,4 @@ '@electric-sql/pglite': patch --- -Add data directory locking and partial initdb recovery to NodeFS +Add data directory locking to NodeFS to prevent multi-process corruption diff --git a/.gitignore b/.gitignore index 370b92fd8..22bb9a307 100644 --- a/.gitignore +++ b/.gitignore @@ -16,7 +16,3 @@ node_modules docs/.vitepress/dist docs/.vitepress/cache - -# Local test logs -packages/pglite/tests/crash-safety/*.log -prompt_history.txt diff --git a/packages/pglite/src/fs/nodefs.ts b/packages/pglite/src/fs/nodefs.ts index 3383274ad..1038c9035 100644 --- a/packages/pglite/src/fs/nodefs.ts +++ b/packages/pglite/src/fs/nodefs.ts @@ -22,7 +22,6 @@ export class NodeFS extends EmscriptenBuiltinFilesystem { this.pg = pg this.#acquireLock() - this.#cleanPartialInit() const options: Partial = { ...opts, @@ -50,15 +49,15 @@ export class NodeFS extends EmscriptenBuiltinFilesystem { if (pid && !isNaN(pid) && this.#isProcessAlive(pid)) { throw new Error( - `Data directory "${this.rootDir}" is locked by another PGlite instance ` + - `(PID ${pid}). Close the other instance first, or delete ` + - `"${lockPath}" if the process is no longer running.`, + `PGlite data directory "${this.rootDir}" is already in use by another instance (PID ${pid}). ` + + `Close the other instance or use a different data directory. ` + + `Delete "${lockPath}" if PID ${pid} is no longer running.`, ) } // Stale lock from a dead process — safe to take over } catch (e) { // Re-throw lock errors, ignore parse errors (corrupt lock file = stale) - if (e instanceof Error && e.message.includes('is locked by')) { + if (e instanceof Error && e.message.includes('already in use')) { throw e } } @@ -87,45 +86,6 @@ export class NodeFS extends EmscriptenBuiltinFilesystem { } } - // If initdb was killed mid-way, the data dir is incomplete and unrecoverable. - // A fully initialized PG always has 3+ databases in base/ (template0, template1, postgres). - #cleanPartialInit() { - try { - const entries = fs.readdirSync(this.rootDir) - if (entries.length === 0) return - - const pgVersionPath = path.join(this.rootDir, 'PG_VERSION') - if (!fs.existsSync(pgVersionPath)) { - this.#moveDataDirToBackup() - return - } - - const basePath = path.join(this.rootDir, 'base') - if (fs.existsSync(basePath)) { - const databases = fs.readdirSync(basePath) - if (databases.length < 3) { - this.#moveDataDirToBackup() - return - } - } else { - this.#moveDataDirToBackup() - } - } catch { - // If we can't read the directory, let PostgreSQL handle the error - } - } - - #moveDataDirToBackup() { - const timestamp = new Date().toISOString().replace(/[:.]/g, '-') - const backupPath = `${this.rootDir}.corrupt-${timestamp}` - fs.renameSync(this.rootDir, backupPath) - fs.mkdirSync(this.rootDir) - console.warn( - `PGlite: Detected partially-initialized data directory. ` + - `Moved to "${backupPath}" for recovery. A fresh database will be created.`, - ) - } - #isProcessAlive(pid: number): boolean { try { process.kill(pid, 0) // signal 0 = check if process exists diff --git a/packages/pglite/tests/crash-safety/README.md b/packages/pglite/tests/crash-safety/README.md deleted file mode 100644 index 14a7879a7..000000000 --- a/packages/pglite/tests/crash-safety/README.md +++ /dev/null @@ -1,133 +0,0 @@ -# PGlite Crash Safety Test Suite - -Tests that reproduce real corruption bugs in PGlite — specifically the issues fixed by the PID file lock and partial initdb detection in `nodefs.ts`. Every test here **fails without the fix** and passes with it. - -Single-instance WAL recovery tests (kill-during-insert, kill-during-transaction, etc.) were intentionally excluded because PostgreSQL handles those correctly without any code changes. Those tests are preserved in the `archive/all-crash-safety-tests` branch. - -## Running the Tests - -```bash -# Run all crash safety tests -pnpm vitest run tests/crash-safety/ --reporter=verbose - -# Run a single scenario -pnpm vitest run tests/crash-safety/overlapping-instances.test.js - -# Keep data directories for debugging (not cleaned up after test) -RETAIN_DATA=1 pnpm vitest run tests/crash-safety/ -``` - -> **Note:** Do not use `--no-file-parallelism` — PGlite's WASM module conflicts with vitest's single-worker mode. - -## Architecture - -``` -tests/crash-safety/ -├── harness.js # Shared test infrastructure -├── README.md # This file -├── CRASH-SAFETY.md # Detailed failure mode documentation -├── RESULTS.md # Test results log -├── hmr-double-instance.test.js # HMR double-instance lock test -├── overlapping-instances.test.js # Overlapping instance corruption test -├── wal-bloat-no-checkpoint.test.js # WAL bloat burst mode corruption test -├── partial-init-backup.test.js # Partial initdb backup behavior test -└── workers/ - ├── hmr-double-instance.js - ├── overlapping-three-instances.js - ├── overlapping-staggered.js - ├── overlapping-ddl-writer.js - ├── overlapping-rapid-cycling.js - └── wal-bloat-no-checkpoint.js -``` - -### How It Works - -Each test follows the same pattern: - -1. **Worker script** — A standalone Node.js script that creates a PGlite instance on a data directory (passed via `PGLITE_DATA_DIR` env var), performs database operations, and sends IPC messages to the parent via `process.send()` to signal progress. - -2. **Test file** — Uses vitest. Calls `crashTest()` from the harness to spawn the worker as a child process via `fork()`. The harness kills the child with `SIGKILL` either after a timer or when a specific IPC message is received. - -3. **Verification** — After the kill, the test reopens PGlite on the same data directory and checks: - - - The database opens without error (no PANIC, no hang) - - Basic queries succeed (`SELECT 1`) - - All user tables are scannable - - Data is consistent (committed rows present, uncommitted rows absent) - -4. **Cleanup** — Each test uses a unique `/tmp/pglite-crash-*` directory and removes it in `afterAll`, unless `RETAIN_DATA=1` is set. - -## Test Scenarios - -### 1. Overlapping Instances - -**File:** `overlapping-instances.test.js` (4 tests) - -Multiple PGlite instances opening the same data directory concurrently. Without the PID file lock, this causes silent corruption (`Aborted()` on next open). - -- **Triple instances** — three instances open simultaneously -- **Staggered** — second instance opens while first is mid-write -- **DDL writer** — overlapping DDL operations -- **Rapid cycling** — rapid open/kill/reopen cycles with overlapping lifetimes - -### 2. HMR Double-Instance - -**File:** `hmr-double-instance.test.js` (2 tests) - -Simulates hot module replacement (HMR) in dev servers where a new PGlite instance is created before the old one is closed. - -- **Lock blocking** — verifies instance B is blocked by the lock while instance A is alive -- **Rapid HMR cycles** — fast instance swaps that corrupt without the lock - -### 3. WAL Bloat Burst Mode - -**File:** `wal-bloat-no-checkpoint.test.js` (1 failing test) - -15 extremely rapid kill cycles with no delay, accumulating WAL without checkpointing. Without partial initdb detection, interrupted initializations leave corrupt state that causes `Aborted()`. - -### 4. Partial Init Backup - -**File:** `partial-init-backup.test.js` (3 tests) - -Directly tests the partial initdb detection and backup behavior in `nodefs.ts`: - -- Partial dir (no `PG_VERSION`) → moved to `.corrupt-` backup -- Partial dir (`PG_VERSION` but incomplete `base/`) → moved to backup -- Fully initialized dir → NOT moved (no false positives) - -## Harness API (`harness.js`) - -### `crashTest(options)` - -Spawns a child process and kills it. - -| Option | Type | Default | Description | -| --------------- | ------ | ----------- | -------------------------------------------------------------- | -| `dataDir` | string | required | Path to PGlite data directory | -| `workerScript` | string | required | Path to the worker `.js` file | -| `killAfterMs` | number | `500` | Delay before sending kill signal | -| `signal` | string | `'SIGKILL'` | Signal to send (usually SIGKILL) | -| `killOnMessage` | string | `null` | Kill when worker sends this IPC message instead of using timer | -| `env` | object | `{}` | Extra environment variables for the child | - -Returns: `{ workerKilled, workerError, workerMessages, workerExitCode, workerSignal, stdout, stderr }` - -### `tryOpen(dataDir, timeoutMs?)` - -Attempts to open a PGlite instance on a potentially corrupted data directory. Includes a timeout (default 15s) to handle cases where a corrupted database hangs forever during initialization. - -Returns: `{ success, db, error }` - -### `verifyIntegrity(db)` - -Runs integrity checks against an open PGlite instance: basic query, table scan, index scan. - -Returns: `{ intact, issues }` - -### `cleanupDataDir(dataDir)` - -Removes a test data directory and its sibling `.lock` file. - -### `testDataDir(scenarioName)` - -Generates a unique `/tmp/pglite-crash---` path. diff --git a/packages/pglite/tests/crash-safety/harness.js b/packages/pglite/tests/crash-safety/harness.js deleted file mode 100644 index cc8702dd0..000000000 --- a/packages/pglite/tests/crash-safety/harness.js +++ /dev/null @@ -1,188 +0,0 @@ -// Crash safety test harness: spawn a PGlite worker, kill it, verify recovery. - -import { fork } from 'node:child_process' -import { existsSync, rmSync, mkdirSync } from 'node:fs' -import { resolve } from 'node:path' - -/** - * Spawn a worker, optionally kill it on a message or timer, return results. - */ -export async function crashTest(options) { - const { - dataDir, - workerScript, - killAfterMs = 500, - signal = 'SIGKILL', - env = {}, - killOnMessage = null, - } = options - - const parentDir = resolve(dataDir, '..') - if (!existsSync(parentDir)) { - mkdirSync(parentDir, { recursive: true }) - } - - // eslint-disable-next-line @typescript-eslint/no-unused-vars - return new Promise((resolvePromise, rejectPromise) => { - const messages = [] - let workerError = null - let killed = false - let killTimer = null - - const child = fork(workerScript, [], { - env: { - ...process.env, - PGLITE_DATA_DIR: dataDir, - ...env, - }, - stdio: ['pipe', 'pipe', 'pipe', 'ipc'], - }) - - let stdout = '' - let stderr = '' - child.stdout.on('data', (d) => { - stdout += d.toString() - }) - child.stderr.on('data', (d) => { - stderr += d.toString() - }) - - child.on('message', (msg) => { - messages.push(msg) - - if (killOnMessage && msg === killOnMessage && !killed) { - killed = true - if (killTimer) clearTimeout(killTimer) - child.kill(signal) - } - }) - - child.on('error', (err) => { - workerError = err.message - }) - - child.on('exit', (code, sig) => { - if (killTimer) clearTimeout(killTimer) - resolvePromise({ - workerKilled: sig === signal || killed, - workerError, - workerMessages: messages, - workerExitCode: code, - workerSignal: sig, - stdout, - stderr, - }) - }) - - if (!killOnMessage) { - killTimer = setTimeout(() => { - if (!killed) { - killed = true - child.kill(signal) - } - }, killAfterMs) - } - - // Safety timeout - setTimeout(() => { - if (!killed) { - killed = true - child.kill('SIGKILL') - } - }, 30000) - }) -} - -/** Try to open PGlite on a possibly-corrupted data dir. Caller must close db on success. */ -export async function tryOpen(dataDir, timeoutMs = 15000) { - const { PGlite } = await import('../../dist/index.js') - - try { - const db = new PGlite(dataDir) - - await Promise.race([ - db.waitReady, - new Promise((_, reject) => - setTimeout( - () => - reject( - new Error( - `PGlite open timed out after ${timeoutMs}ms (likely corrupted)`, - ), - ), - timeoutMs, - ), - ), - ]) - - return { success: true, db, error: null } - } catch (err) { - return { success: false, db: null, error: err } - } -} - -/** Run basic integrity checks: health query, table scans, index scans. */ -export async function verifyIntegrity(db) { - const issues = [] - - try { - await db.query('SELECT 1 as health_check') - } catch (err) { - issues.push(`Basic query failed: ${err.message}`) - return { intact: false, issues } - } - - try { - const tables = await db.query(` - SELECT tablename FROM pg_tables - WHERE schemaname = 'public' - ORDER BY tablename - `) - - for (const row of tables.rows) { - try { - await db.query(`SELECT count(*) FROM "${row.tablename}"`) - } catch (err) { - issues.push(`Count on ${row.tablename} failed: ${err.message}`) - } - } - } catch (err) { - issues.push(`Table listing failed: ${err.message}`) - } - - try { - const indexes = await db.query(` - SELECT indexname, tablename FROM pg_indexes - WHERE schemaname = 'public' - ORDER BY indexname - `) - - for (const row of indexes.rows) { - try { - await db.query(`SELECT count(*) FROM "${row.tablename}"`) - } catch (err) { - issues.push(`Index check on ${row.indexname} failed: ${err.message}`) - } - } - } catch (err) { - issues.push(`Index listing failed: ${err.message}`) - } - - return { intact: issues.length === 0, issues } -} - -export function cleanupDataDir(dataDir) { - if (existsSync(dataDir)) { - rmSync(dataDir, { recursive: true, force: true }) - } - const lockFile = dataDir + '.lock' - if (existsSync(lockFile)) { - rmSync(lockFile, { force: true }) - } -} - -export function testDataDir(scenarioName) { - const timestamp = Date.now() - const rand = Math.random().toString(36).slice(2, 8) - return `/tmp/pglite-crash-${scenarioName}-${timestamp}-${rand}` -} diff --git a/packages/pglite/tests/crash-safety/hmr-double-instance.test.js b/packages/pglite/tests/crash-safety/hmr-double-instance.test.js deleted file mode 100644 index 1942c19c0..000000000 --- a/packages/pglite/tests/crash-safety/hmr-double-instance.test.js +++ /dev/null @@ -1,425 +0,0 @@ -/** - * Crash Safety Test: HMR (Hot Module Reload) Double Instance - * - * Tests the file lock mechanism that prevents overlapping PGlite instances - * from corrupting the database during HMR in dev servers like Vite/Next.js. - * - * With the lock file implementation: - * 1. Instance A opens and acquires the lock - * 2. Instance B attempts to open the SAME data dir and is BLOCKED by the lock - * 3. Only instance A writes data - * 4. When killed with SIGKILL, the stale lock is detected on next open - * - * This prevents the corruption that previously occurred when two WASM heaps - * both accessed the same PostgreSQL data directory simultaneously. - */ - -import { describe, it, expect, afterAll } from 'vitest' -import { fork } from 'node:child_process' -import { existsSync, rmSync } from 'node:fs' - -const dataDir = `/tmp/pglite-crash-hmr-double-${Date.now()}` - -afterAll(async () => { - if (!process.env.RETAIN_DATA) { - if (existsSync(dataDir)) { - rmSync(dataDir, { recursive: true, force: true }) - } - // Clean up sibling lock files - const lockFile = dataDir + '.lock' - if (existsSync(lockFile)) { - rmSync(lockFile, { force: true }) - } - } -}) - -describe('crash safety: HMR double-instance corruption', () => { - it( - 'should survive multiple HMR-style instance replacement cycles with SIGKILL', - async () => { - const { PGlite } = await import('../../dist/index.js') - const workerPath = new URL( - './workers/hmr-double-instance.js', - import.meta.url, - ).pathname - - const NUM_CYCLES = 7 - const cycleResults = [] - - for (let cycle = 0; cycle < NUM_CYCLES; cycle++) { - const overlapOps = cycle < 2 ? 10 : cycle < 5 ? 25 : 40 - - const result = await new Promise((resolve) => { - const messages = [] - let killed = false - - const child = fork(workerPath, [], { - env: { - ...process.env, - PGLITE_DATA_DIR: dataDir, - CYCLE: String(cycle), - OVERLAP_OPS: String(overlapOps), - }, - stdio: ['pipe', 'pipe', 'pipe', 'ipc'], - }) - - let stderr = '' - child.stderr.on('data', (d) => { - stderr += d.toString() - }) - child.stdout.on('data', () => {}) // drain - - child.on('message', (msg) => { - messages.push(msg) - - // Kill after all operations are done (instance A has written data) - if (msg === 'all-operations-done' && !killed) { - killed = true - child.kill('SIGKILL') - } - }) - - child.on('exit', (code, sig) => { - resolve({ - cycle, - killed: killed || sig === 'SIGKILL', - messages, - exitCode: code, - signal: sig, - stderr, - }) - }) - - // Fallback timer: kill after 15s if worker hasn't finished - setTimeout(() => { - if (!killed) { - killed = true - child.kill('SIGKILL') - } - }, 15000) - }) - - cycleResults.push(result) - - // The worker should either be killed by us or exit cleanly - // (it's killed after 'all-operations-done') - expect(result.killed).toBe(true) - - // Instance A should have been ready in every cycle - expect(result.messages).toContain('instance-a-ready') - - // Instance B should have been BLOCKED by the lock - expect(result.messages).toContain('instance-b-blocked') - expect(result.messages).not.toContain('instance-b-ready') - - // On the first cycle, schema should have been created - if (cycle === 0) { - expect(result.messages).toContain('schema-created') - } - - // Log cycle details for debugging - console.log( - `Cycle ${cycle}: messages=[${result.messages.join(', ')}], ` + - `exit=${result.exitCode}, signal=${result.signal}`, - ) - - // ---- Intermediate recovery check every 2 cycles ---- - if (cycle % 2 === 1 || cycle === NUM_CYCLES - 1) { - let db = null - let openSuccess = false - try { - db = new PGlite(dataDir) - await Promise.race([ - db.waitReady, - new Promise((_, reject) => - setTimeout( - () => - reject(new Error(`Open timed out after cycle ${cycle}`)), - 20000, - ), - ), - ]) - openSuccess = true - } catch (err) { - console.log( - `CORRUPTION DETECTED after cycle ${cycle}: ${err.message}`, - ) - openSuccess = false - } - - if (openSuccess && db) { - try { - // Basic health check - await db.query('SELECT 1 as ok') - - // Table should exist after cycle 0 - const tableCheck = await db.query(` - SELECT tablename FROM pg_tables - WHERE schemaname = 'public' AND tablename = 'hmr_data' - `) - expect(tableCheck.rows.length).toBe(1) - - // Count rows - with the lock, only instance A writes - const countResult = await db.query( - 'SELECT count(*)::int as cnt FROM hmr_data', - ) - const rowCount = countResult.rows[0].cnt - console.log(`After cycle ${cycle}: ${rowCount} rows in hmr_data`) - expect(rowCount).toBeGreaterThan(0) - - // All rows should be from instance A (B was blocked) - const instanceCheck = await db.query( - `SELECT DISTINCT instance FROM hmr_data`, - ) - const instances = instanceCheck.rows.map((r) => r.instance) - expect(instances).toContain('A') - expect(instances).not.toContain('B') - - // Verify index is usable - const indexScan = await db.query( - `SELECT count(*)::int as cnt FROM hmr_data WHERE cycle = $1`, - [0], - ) - expect(indexScan.rows[0].cnt).toBeGreaterThanOrEqual(0) - - // Verify we can do a full sequential scan without errors - const allRows = await db.query( - 'SELECT id, cycle, instance, phase, seq FROM hmr_data ORDER BY id', - ) - expect(allRows.rows.length).toBe(rowCount) - - // Check for data consistency - for (const row of allRows.rows) { - expect(row.id).toBeGreaterThan(0) - expect(row.cycle).toBeGreaterThanOrEqual(0) - expect(row.cycle).toBeLessThanOrEqual(cycle) - expect(row.instance).toBe('A') - expect(typeof row.phase).toBe('string') - expect(row.seq).toBeGreaterThanOrEqual(0) - } - } finally { - await db.close() - } - } - - // The DB MUST be openable - expect(openSuccess).toBe(true) - } - - // Small delay between cycles - await new Promise((r) => setTimeout(r, 200)) - } - - // ---- Final comprehensive verification ---- - console.log('\n--- Final verification after all HMR cycles ---') - - let finalDb = null - try { - finalDb = new PGlite(dataDir) - await Promise.race([ - finalDb.waitReady, - new Promise((_, reject) => - setTimeout(() => reject(new Error('Final open timed out')), 30000), - ), - ]) - } catch (err) { - console.log(`FINAL CORRUPTION: ${err.message}`) - expect.fail( - `Database corrupted after ${NUM_CYCLES} HMR cycles: ${err.message}`, - ) - } - - try { - // Full integrity check - const tables = await finalDb.query(` - SELECT tablename FROM pg_tables WHERE schemaname = 'public' - `) - expect(tables.rows.length).toBeGreaterThanOrEqual(1) - - // Check all indexes are usable - const indexes = await finalDb.query(` - SELECT indexname, tablename FROM pg_indexes - WHERE schemaname = 'public' - `) - for (const idx of indexes.rows) { - await finalDb.query(`SELECT count(*) FROM "${idx.tablename}"`) - } - - // Row count audit - const finalCount = await finalDb.query( - 'SELECT count(*)::int as cnt FROM hmr_data', - ) - console.log(`Final row count: ${finalCount.rows[0].cnt}`) - - // Per-cycle breakdown (all should be instance A only) - const cycleBreakdown = await finalDb.query(` - SELECT cycle, instance, count(*)::int as cnt - FROM hmr_data - GROUP BY cycle, instance - ORDER BY cycle, instance - `) - console.log('Per-cycle breakdown:') - for (const row of cycleBreakdown.rows) { - console.log( - ` cycle=${row.cycle} instance=${row.instance} count=${row.cnt}`, - ) - // With the lock, all rows should be from instance A - expect(row.instance).toBe('A') - } - - // Verify we can still write to the database - await finalDb.query( - `INSERT INTO hmr_data (cycle, instance, phase, seq, payload) - VALUES ($1, $2, $3, $4, $5)`, - [999, 'verify', 'final', 0, 'final-verification-row'], - ) - const verifyRow = await finalDb.query( - `SELECT * FROM hmr_data WHERE cycle = 999`, - ) - expect(verifyRow.rows.length).toBe(1) - } finally { - await finalDb.close() - } - }, - { timeout: 300000 }, - ) - - it( - 'should survive rapid HMR cycles with minimal delay between instance swaps', - async () => { - const { PGlite } = await import('../../dist/index.js') - - // Use a separate data dir for this sub-test - const rapidDataDir = `${dataDir}-rapid` - - const workerPath = new URL( - './workers/hmr-double-instance.js', - import.meta.url, - ).pathname - - const RAPID_CYCLES = 5 - - for (let cycle = 0; cycle < RAPID_CYCLES; cycle++) { - const result = await new Promise((resolve) => { - const messages = [] - let killed = false - - const child = fork(workerPath, [], { - env: { - ...process.env, - PGLITE_DATA_DIR: rapidDataDir, - CYCLE: String(cycle), - OVERLAP_OPS: '50', - }, - stdio: ['pipe', 'pipe', 'pipe', 'ipc'], - }) - - let stderr = '' - child.stderr.on('data', (d) => { - stderr += d.toString() - }) - child.stdout.on('data', () => {}) - - child.on('message', (msg) => { - messages.push(msg) - - // Kill after instance B is blocked (or after instance A continues) - if ( - (msg === 'instance-b-blocked' || - msg === 'instance-a-continued') && - !killed - ) { - // Give a tiny window then kill - setTimeout(() => { - if (!killed) { - killed = true - child.kill('SIGKILL') - } - }, 50) - } - }) - - child.on('exit', (code, sig) => { - resolve({ - cycle, - killed: killed || sig === 'SIGKILL', - messages, - stderr, - }) - }) - - setTimeout(() => { - if (!killed) { - killed = true - child.kill('SIGKILL') - } - }, 15000) - }) - - // Worker should be either killed by us or crashed due to lock error - expect(result.killed || result.exitCode !== 0).toBe(true) - } - - // Verify the DB is still usable after rapid HMR cycles - let db = null - try { - db = new PGlite(rapidDataDir) - await Promise.race([ - db.waitReady, - new Promise((_, reject) => - setTimeout( - () => reject(new Error('Rapid HMR final open timed out')), - 20000, - ), - ), - ]) - - await db.query('SELECT 1') - - const tableExists = await db.query(` - SELECT tablename FROM pg_tables - WHERE schemaname = 'public' AND tablename = 'hmr_data' - `) - if (tableExists.rows.length > 0) { - const count = await db.query( - 'SELECT count(*)::int as cnt FROM hmr_data', - ) - console.log(`Rapid HMR test: ${count.rows[0].cnt} rows survived`) - expect(count.rows[0].cnt).toBeGreaterThanOrEqual(0) - - // All surviving rows should be from instance A - const instanceCheck = await db.query( - `SELECT DISTINCT instance FROM hmr_data`, - ) - if (instanceCheck.rows.length > 0) { - for (const row of instanceCheck.rows) { - expect(row.instance).toBe('A') - } - } - } - - await db.close() - } catch (err) { - console.log(`Rapid HMR CORRUPTION: ${err.message}`) - if (db) - try { - await db.close() - } catch (_) { - /* ignore */ - } - expect.fail(`Rapid HMR corrupted DB: ${err.message}`) - } finally { - if (!process.env.RETAIN_DATA) { - if (existsSync(rapidDataDir)) { - rmSync(rapidDataDir, { recursive: true, force: true }) - } - const lockFile = rapidDataDir + '.lock' - if (existsSync(lockFile)) { - rmSync(lockFile, { force: true }) - } - } - } - }, - { timeout: 180000 }, - ) -}) diff --git a/packages/pglite/tests/crash-safety/overlapping-instances.test.js b/packages/pglite/tests/crash-safety/overlapping-instances.test.js deleted file mode 100644 index a642841b2..000000000 --- a/packages/pglite/tests/crash-safety/overlapping-instances.test.js +++ /dev/null @@ -1,486 +0,0 @@ -/** - * Crash Safety Test: Overlapping Instances (with File Locking) - * - * PGlite now implements file locking for NodeFS. Multiple instances cannot - * open the same data directory simultaneously — the second instance will - * receive a lock error, preventing the corruption that previously occurred. - * - * These tests verify that: - * 1. The lock prevents multiple instances from opening the same data dir - * 2. Workers that can't acquire the lock crash with an error (expected behavior) - * 3. The database remains intact and recoverable after all scenarios - * 4. Stale locks from killed processes are detected and overridden - */ - -import { describe, it, expect, afterAll } from 'vitest' -import { fork } from 'node:child_process' -import { existsSync, rmSync } from 'node:fs' - -const baseDir = `/tmp/pglite-crash-overlapping-${Date.now()}` - -afterAll(async () => { - if (!process.env.RETAIN_DATA) { - for (const suffix of ['triple', 'stagger', 'ddl', 'rapid']) { - const dir = `${baseDir}-${suffix}` - if (existsSync(dir)) { - rmSync(dir, { recursive: true, force: true }) - } - const lockFile = dir + '.lock' - if (existsSync(lockFile)) { - rmSync(lockFile, { force: true }) - } - } - } -}) - -/** - * Helper: fork a worker, collect messages, kill on trigger or timer. - */ -function runWorker({ - workerPath, - dataDir, - env = {}, - killOnMessage = null, - killAfterMs = null, - killDelayAfterMsg = 0, -}) { - return new Promise((resolve) => { - const messages = [] - let killed = false - let killTimer = null - - const child = fork(workerPath, [], { - env: { ...process.env, PGLITE_DATA_DIR: dataDir, ...env }, - stdio: ['pipe', 'pipe', 'pipe', 'ipc'], - }) - - let stderr = '' - child.stderr.on('data', (d) => { - stderr += d.toString() - }) - child.stdout.on('data', () => {}) - - child.on('message', (msg) => { - messages.push(msg) - - if (killOnMessage && msg === killOnMessage && !killed) { - if (killDelayAfterMsg > 0) { - setTimeout(() => { - if (!killed) { - killed = true - child.kill('SIGKILL') - } - }, killDelayAfterMsg) - } else { - killed = true - child.kill('SIGKILL') - } - } - }) - - child.on('exit', (code, sig) => { - if (killTimer) clearTimeout(killTimer) - const wasKilled = killed || sig === 'SIGKILL' - const crashed = !wasKilled && code !== 0 - resolve({ - killed: wasKilled, - crashed, - terminated: wasKilled || crashed, - messages, - exitCode: code, - signal: sig, - stderr, - }) - }) - - if (killAfterMs) { - killTimer = setTimeout(() => { - if (!killed) { - killed = true - child.kill('SIGKILL') - } - }, killAfterMs) - } - - // Safety timeout - setTimeout(() => { - if (!killed) { - killed = true - child.kill('SIGKILL') - } - }, 30000) - }) -} - -/** - * Try to open a PGlite instance and run basic checks. - * Returns { success, error, rowCount }. - */ -async function tryOpenAndVerify(PGlite, dataDir, tableName, timeoutMs = 20000) { - let db = null - try { - db = new PGlite(dataDir) - await Promise.race([ - db.waitReady, - new Promise((_, reject) => - setTimeout( - () => reject(new Error(`Open timed out after ${timeoutMs}ms`)), - timeoutMs, - ), - ), - ]) - - // Basic health - await db.query('SELECT 1 as ok') - - // Check table exists - const tableCheck = await db.query( - `SELECT tablename FROM pg_tables WHERE schemaname = 'public' AND tablename = $1`, - [tableName], - ) - if (tableCheck.rows.length === 0) { - await db.close() - return { success: true, rowCount: 0 } - } - - // Count rows - const count = await db.query( - `SELECT count(*)::int as cnt FROM "${tableName}"`, - ) - const rowCount = count.rows[0].cnt - - // Full sequential scan - const allRows = await db.query(`SELECT * FROM "${tableName}" ORDER BY id`) - if (allRows.rows.length !== rowCount) { - await db.close() - return { - success: false, - error: new Error( - `Row count mismatch: count=${rowCount} scan=${allRows.rows.length}`, - ), - } - } - - await db.close() - return { success: true, rowCount } - } catch (err) { - if (db) - try { - await db.close() - } catch (_) { - /* ignore */ - } - return { success: false, error: err } - } -} - -describe('crash safety: overlapping instances (lock prevents corruption)', () => { - // ======================================================================== - // Scenario 1: Three simultaneous instances — lock blocks instances B and C - // ======================================================================== - it( - 'should block second and third instances via lock, preserving data integrity', - async () => { - const { PGlite } = await import('../../dist/index.js') - const dataDir = `${baseDir}-triple` - const workerPath = new URL( - './workers/overlapping-three-instances.js', - import.meta.url, - ).pathname - - const NUM_CYCLES = 5 - - for (let cycle = 0; cycle < NUM_CYCLES; cycle++) { - const killMsg = cycle < 2 ? 'concurrent-writes-done' : 'all-done' - - const result = await runWorker({ - workerPath, - dataDir, - env: { CYCLE: String(cycle) }, - killOnMessage: killMsg, - killAfterMs: 15000, // fallback if message never arrives (lock blocks worker) - }) - - console.log( - `Triple cycle ${cycle}: terminated=${result.terminated} killed=${result.killed} crashed=${result.crashed} ` + - `messages=[${result.messages.join(', ')}]`, - ) - - // Worker should have terminated (killed by us or crashed due to lock error) - expect(result.terminated).toBe(true) - - // If worker crashed, it's likely due to lock error — this is expected behavior. - // The important thing is that the DB remains intact. - if (result.crashed) { - console.log( - ` Triple cycle ${cycle}: worker crashed (expected — lock prevented second instance)`, - ) - // Check that the crash was due to a lock error - const lockRelatedCrash = - result.stderr.includes('locked by') || - result.messages.some( - (m) => typeof m === 'string' && m.includes('locked by'), - ) - if (lockRelatedCrash) { - console.log( - ` Confirmed: lock error caused the crash (correct behavior)`, - ) - } - } - - // Verify recovery — DB should ALWAYS be openable - const verify = await tryOpenAndVerify(PGlite, dataDir, 'triple_data') - if (!verify.success) { - console.log( - `UNEXPECTED CORRUPTION after triple cycle ${cycle}: ${verify.error.message}`, - ) - } - // With the lock, the DB should always be intact - expect(verify.success).toBe(true) - console.log(` Triple cycle ${cycle}: ${verify.rowCount} rows, DB OK`) - - await new Promise((r) => setTimeout(r, 200)) - } - }, - { timeout: 300000 }, - ) - - // ======================================================================== - // Scenario 2: Staggered overlap — lock blocks instance B - // ======================================================================== - it( - 'should block staggered second instance via lock, preserving data integrity', - async () => { - const { PGlite } = await import('../../dist/index.js') - const dataDir = `${baseDir}-stagger` - const workerPath = new URL( - './workers/overlapping-staggered.js', - import.meta.url, - ).pathname - - const NUM_CYCLES = 7 - - for (let cycle = 0; cycle < NUM_CYCLES; cycle++) { - const staggerMs = cycle < 3 ? 100 : cycle < 5 ? 300 : 500 - const killMsg = cycle < 3 ? 'overlap-writes-done' : 'all-done' - - const result = await runWorker({ - workerPath, - dataDir, - env: { CYCLE: String(cycle), STAGGER_MS: String(staggerMs) }, - killOnMessage: killMsg, - killAfterMs: 15000, // fallback - }) - - console.log( - `Stagger cycle ${cycle} (${staggerMs}ms): terminated=${result.terminated} ` + - `messages=[${result.messages.join(', ')}]`, - ) - - expect(result.terminated).toBe(true) - - // Check recovery every 2 cycles and at the end - if (cycle % 2 === 1 || cycle === NUM_CYCLES - 1) { - const verify = await tryOpenAndVerify(PGlite, dataDir, 'stagger_data') - expect(verify.success).toBe(true) - console.log( - ` Stagger cycle ${cycle}: ${verify.rowCount} rows, DB OK`, - ) - } - - await new Promise((r) => setTimeout(r, 200)) - } - }, - { timeout: 300000 }, - ) - - // ======================================================================== - // Scenario 3: DDL collision — lock blocks second process - // ======================================================================== - it( - 'should block DDL/DML collision between two processes via lock', - async () => { - const { PGlite } = await import('../../dist/index.js') - const dataDir = `${baseDir}-ddl` - const ddlWorkerPath = new URL( - './workers/overlapping-ddl-writer.js', - import.meta.url, - ).pathname - - const NUM_CYCLES = 5 - - for (let cycle = 0; cycle < NUM_CYCLES; cycle++) { - // Spawn TWO separate child processes on the SAME data dir. - // With the lock, only one will acquire the lock; the other crashes. - const [ddlResult, dmlResult] = await Promise.all([ - runWorker({ - workerPath: ddlWorkerPath, - dataDir, - env: { CYCLE: String(cycle), WRITER_MODE: 'ddl' }, - killAfterMs: cycle < 2 ? 3000 : 5000, - }), - runWorker({ - workerPath: ddlWorkerPath, - dataDir, - env: { CYCLE: String(cycle), WRITER_MODE: 'dml' }, - killAfterMs: cycle < 2 ? 3000 : 5000, - }), - ]) - - console.log( - `DDL cycle ${cycle}: DDL terminated=${ddlResult.terminated} crashed=${ddlResult.crashed} ` + - `messages=[${ddlResult.messages.join(', ')}]`, - ) - console.log( - `DDL cycle ${cycle}: DML terminated=${dmlResult.terminated} crashed=${dmlResult.crashed} ` + - `messages=[${dmlResult.messages.join(', ')}]`, - ) - - // At least one should have been blocked by the lock - const bothCrashed = ddlResult.crashed && dmlResult.crashed - if (bothCrashed) { - // Both crashed — possibly the first one held the lock briefly - // and the second couldn't get it. This is acceptable. - console.log( - ` DDL cycle ${cycle}: both workers crashed (lock prevented simultaneous access)`, - ) - } - - // Verify recovery - await new Promise((r) => setTimeout(r, 300)) - const verify = await tryOpenAndVerify(PGlite, dataDir, 'ddl_base') - expect(verify.success).toBe(true) - console.log( - ` DDL cycle ${cycle}: ddl_base has ${verify.rowCount} rows, DB OK`, - ) - - await new Promise((r) => setTimeout(r, 200)) - } - }, - { timeout: 300000 }, - ) - - // ======================================================================== - // Scenario 4: Rapid instance cycling — lock blocks all after first - // ======================================================================== - it( - 'should block rapid instance cycling via lock, preserving data integrity', - async () => { - const { PGlite } = await import('../../dist/index.js') - const dataDir = `${baseDir}-rapid` - const workerPath = new URL( - './workers/overlapping-rapid-cycling.js', - import.meta.url, - ).pathname - - const NUM_RUNS = 3 - - for (let run = 0; run < NUM_RUNS; run++) { - const numInstances = run === 0 ? 8 : run === 1 ? 10 : 12 - const killMsg = run < 2 ? 'all-instances-created' : 'all-done' - - const result = await runWorker({ - workerPath, - dataDir, - env: { NUM_INSTANCES: String(numInstances) }, - killOnMessage: killMsg, - killAfterMs: 15000, // fallback - }) - - const instWrote = result.messages.filter((m) => - m.endsWith('-wrote'), - ).length - console.log( - `Rapid run ${run} (${numInstances} instances): terminated=${result.terminated} ` + - `${instWrote} wrote, messages=[${result.messages.join(', ')}]`, - ) - - expect(result.terminated).toBe(true) - - // With the lock, only the first instance should open successfully. - // The worker will crash when trying to open the second instance. - if (result.crashed) { - console.log( - ` Rapid run ${run}: worker crashed (expected — lock blocked subsequent instances)`, - ) - } - - // Verify recovery — DB should always be intact - const verify = await tryOpenAndVerify(PGlite, dataDir, 'rapid_data') - expect(verify.success).toBe(true) - console.log(` Rapid run ${run}: ${verify.rowCount} rows, DB OK`) - - await new Promise((r) => setTimeout(r, 300)) - } - }, - { timeout: 300000 }, - ) - - // ======================================================================== - // Scenario 5: Kill-during-recovery overlap — lock prevents simultaneous recovery - // ======================================================================== - it( - 'should prevent two instances from racing to recover the same data directory', - async () => { - const { PGlite } = await import('../../dist/index.js') - const dataDir = `${baseDir}-triple` // Reuse from scenario 1 - - const workerPath = new URL( - './workers/overlapping-three-instances.js', - import.meta.url, - ).pathname - - // Step 1: Create dirty state by killing a worker mid-operation - const seedResult = await runWorker({ - workerPath, - dataDir, - env: { CYCLE: '99' }, - killAfterMs: 3000, // Kill after 3 seconds regardless - }) - - console.log( - `Recovery-overlap seed: terminated=${seedResult.terminated} crashed=${seedResult.crashed}`, - ) - - // Step 2: Spawn TWO workers simultaneously on the dirty data dir. - // With the lock, only the first one should open; the second gets blocked. - const [recoverA, recoverB] = await Promise.all([ - runWorker({ - workerPath, - dataDir, - env: { CYCLE: '100' }, - killAfterMs: 5000, - }), - new Promise((resolve) => setTimeout(resolve, 200)).then(() => - runWorker({ - workerPath, - dataDir, - env: { CYCLE: '101' }, - killAfterMs: 5000, - }), - ), - ]) - - console.log( - `Recovery A: terminated=${recoverA.terminated} crashed=${recoverA.crashed} messages=[${recoverA.messages.join(', ')}]`, - ) - console.log( - `Recovery B: terminated=${recoverB.terminated} crashed=${recoverB.crashed} messages=[${recoverB.messages.join(', ')}]`, - ) - - // At least one should have been blocked by the lock - // (B starts 200ms later, so A likely holds the lock) - if (recoverB.crashed) { - console.log(' Recovery B was blocked by lock (correct behavior)') - } - - // Step 3: Verify the DB is still intact after the dual-recovery attempt - await new Promise((r) => setTimeout(r, 500)) - - const verify = await tryOpenAndVerify(PGlite, dataDir, 'triple_data') - expect(verify.success).toBe(true) - console.log(` After recovery overlap: ${verify.rowCount} rows, DB OK`) - }, - { timeout: 180000 }, - ) -}) diff --git a/packages/pglite/tests/crash-safety/partial-init-backup.test.js b/packages/pglite/tests/crash-safety/partial-init-backup.test.js deleted file mode 100644 index 1ef545dab..000000000 --- a/packages/pglite/tests/crash-safety/partial-init-backup.test.js +++ /dev/null @@ -1,146 +0,0 @@ -import { describe, it, expect, afterAll } from 'vitest' -import { - existsSync, - mkdirSync, - writeFileSync, - readdirSync, - rmSync, -} from 'node:fs' -import { join, dirname } from 'node:path' -import { tryOpen, cleanupDataDir, testDataDir } from './harness.js' - -const dataDir = testDataDir('partial-init-backup') -const parentDir = dirname(dataDir) - -afterAll(async () => { - if (!process.env.RETAIN_DATA) { - cleanupDataDir(dataDir) - // Clean up any .corrupt-* backup directories - if (existsSync(parentDir)) { - const entries = readdirSync(parentDir) - for (const entry of entries) { - if (entry.startsWith(dataDir.split('/').pop() + '.corrupt-')) { - rmSync(join(parentDir, entry), { recursive: true, force: true }) - } - } - } - } -}) - -describe('partial init detection: backup instead of wipe', () => { - it( - 'should move a partially-initialized data dir to a .corrupt-* backup', - async () => { - // Create a data directory that looks like a partial initdb: - // has some files but no PG_VERSION (very early interruption) - mkdirSync(dataDir, { recursive: true }) - writeFileSync(join(dataDir, 'postgresql.conf'), '# partial config') - mkdirSync(join(dataDir, 'base'), { recursive: true }) - - // Open PGlite — it should detect partial init and move to backup - const opened = await tryOpen(dataDir) - expect(opened.success).toBe(true) - - // Verify a .corrupt-* backup was created as a sibling - const siblings = readdirSync(parentDir) - const baseName = dataDir.split('/').pop() - const backups = siblings.filter((s) => - s.startsWith(baseName + '.corrupt-'), - ) - expect(backups.length).toBeGreaterThanOrEqual(1) - - // Verify the backup contains our original files - const backupPath = join(parentDir, backups[0]) - const backupContents = readdirSync(backupPath) - expect(backupContents).toContain('postgresql.conf') - expect(backupContents).toContain('base') - - if (opened.db) { - await opened.db.close() - } - }, - { timeout: 60000 }, - ) - - it( - 'should move a data dir with PG_VERSION but incomplete base/ databases to backup', - async () => { - // Clean from previous test - cleanupDataDir(dataDir) - mkdirSync(dataDir, { recursive: true }) - - // Simulate a later-stage partial initdb: PG_VERSION exists - // but base/ has fewer than 3 database directories - writeFileSync(join(dataDir, 'PG_VERSION'), '16') - mkdirSync(join(dataDir, 'base', '1'), { recursive: true }) - writeFileSync( - join(dataDir, 'base', '1', 'pg_filenode.map'), - 'fake catalog data', - ) - mkdirSync(join(dataDir, 'global'), { recursive: true }) - - const opened = await tryOpen(dataDir) - expect(opened.success).toBe(true) - - // Verify backup was created - const siblings = readdirSync(parentDir) - const baseName = dataDir.split('/').pop() - const backups = siblings.filter((s) => - s.startsWith(baseName + '.corrupt-'), - ) - expect(backups.length).toBeGreaterThanOrEqual(2) // one from each test - - // Find the latest backup - const latestBackup = join(parentDir, backups.sort().pop()) - const backupContents = readdirSync(latestBackup) - expect(backupContents).toContain('PG_VERSION') - expect(backupContents).toContain('base') - expect(backupContents).toContain('global') - - if (opened.db) { - await opened.db.close() - } - }, - { timeout: 60000 }, - ) - - it( - 'should NOT move a fully-initialized data directory', - async () => { - // Clean from previous tests - cleanupDataDir(dataDir) - - // Let PGlite do a real fresh init - const opened = await tryOpen(dataDir) - expect(opened.success).toBe(true) - - // Count backups before closing and reopening - const siblingsBefore = readdirSync(parentDir) - const baseName = dataDir.split('/').pop() - const backupsBefore = siblingsBefore.filter((s) => - s.startsWith(baseName + '.corrupt-'), - ) - - if (opened.db) { - await opened.db.close() - } - - // Reopen — should NOT create a backup since the dir is fully initialized - const reopened = await tryOpen(dataDir) - expect(reopened.success).toBe(true) - - const siblingsAfter = readdirSync(parentDir) - const backupsAfter = siblingsAfter.filter((s) => - s.startsWith(baseName + '.corrupt-'), - ) - - // No new backups should have been created - expect(backupsAfter.length).toBe(backupsBefore.length) - - if (reopened.db) { - await reopened.db.close() - } - }, - { timeout: 60000 }, - ) -}) diff --git a/packages/pglite/tests/crash-safety/wal-bloat-no-checkpoint.test.js b/packages/pglite/tests/crash-safety/wal-bloat-no-checkpoint.test.js deleted file mode 100644 index 934234e9d..000000000 --- a/packages/pglite/tests/crash-safety/wal-bloat-no-checkpoint.test.js +++ /dev/null @@ -1,651 +0,0 @@ -/** - * Crash Safety Test: WAL Bloat Without Checkpoint - * - * This test targets the most insidious corruption vector: accumulated WAL - * entries that never get checkpointed because close() is never called. - * - * In real dev usage, developers: - * 1. Start their app (PGlite opens, creates tables, inserts data) - * 2. Kill the process (Ctrl+C, crash, OOM, etc.) -- no close(), no checkpoint - * 3. Restart, repeat -- each time more WAL accumulates - * 4. After many such cycles, the WAL is enormous and recovery becomes fragile - * - * PostgreSQL's WAL recovery is designed to replay from the last checkpoint. - * But when there's NEVER a checkpoint (because close() never runs and - * _pgl_shutdown() never fires), the WAL grows unbounded and recovery must - * replay from the very beginning. - * - * This test runs 30+ kill cycles, each adding heavy mixed DML (INSERT, UPDATE, - * DELETE) with ~1KB rows, index modifications, and schema changes. The worker - * is SIGKILL'd every time before close() can run. - * - * After all cycles, we verify the database can still open and is consistent. - */ - -import { describe, it, expect, afterAll } from 'vitest' -import { fork } from 'node:child_process' -import { existsSync, rmSync, statSync, readdirSync } from 'node:fs' -import { join } from 'node:path' - -const dataDir = `/tmp/pglite-crash-wal-bloat-${Date.now()}` - -afterAll(async () => { - if (!process.env.RETAIN_DATA) { - if (existsSync(dataDir)) { - rmSync(dataDir, { recursive: true, force: true }) - } - } -}) - -/** - * Recursively compute the total size of a directory in bytes. - */ -function dirSizeBytes(dir) { - let total = 0 - try { - const entries = readdirSync(dir, { withFileTypes: true }) - for (const entry of entries) { - const fullPath = join(dir, entry.name) - if (entry.isDirectory()) { - total += dirSizeBytes(fullPath) - } else if (entry.isFile()) { - total += statSync(fullPath).size - } - } - } catch { - // directory may not exist yet - } - return total -} - -/** - * Spawn worker, let it run inner cycles, then SIGKILL. - */ -function runWorkerAndKill(workerPath, cycle, innerCycles, killStrategy) { - return new Promise((resolve) => { - const messages = [] - let killed = false - let killTimer = null - - const child = fork(workerPath, [], { - env: { - ...process.env, - PGLITE_DATA_DIR: dataDir, - INNER_CYCLES: String(innerCycles), - START_CYCLE: String(cycle), - }, - stdio: ['pipe', 'pipe', 'pipe', 'ipc'], - }) - - let stderr = '' - child.stderr.on('data', (d) => { - stderr += d.toString() - }) - child.stdout.on('data', () => {}) // drain - - child.on('message', (msg) => { - messages.push(msg) - - if ( - killStrategy.onMessage && - msg.startsWith(killStrategy.onMessage) && - !killed - ) { - killed = true - if (killTimer) clearTimeout(killTimer) - child.kill('SIGKILL') - } - }) - - child.on('exit', (code, sig) => { - if (killTimer) clearTimeout(killTimer) - resolve({ - cycle, - killed: killed || sig === 'SIGKILL', - messages, - exitCode: code, - signal: sig, - stderr, - }) - }) - - // Timer-based kill if no message trigger - if (killStrategy.afterMs && !killStrategy.onMessage) { - killTimer = setTimeout(() => { - if (!killed) { - killed = true - child.kill('SIGKILL') - } - }, killStrategy.afterMs) - } - - // Safety timeout - setTimeout(() => { - if (!killed) { - killed = true - child.kill('SIGKILL') - } - }, 60000) - }) -} - -describe('crash safety: WAL bloat without checkpoint (30+ kill cycles)', () => { - it( - 'should survive 35 kill cycles of heavy DML without any checkpoint', - async () => { - const { PGlite } = await import('../../dist/index.js') - const workerPath = new URL( - './workers/wal-bloat-no-checkpoint.js', - import.meta.url, - ).pathname - - const TOTAL_OUTER_CYCLES = 35 - const allResults = [] - - // Track data dir size growth over time - const sizeHistory = [] - - // Track the cumulative start cycle for worker numbering - let nextStartCycle = 0 - - for (let outerCycle = 0; outerCycle < TOTAL_OUTER_CYCLES; outerCycle++) { - // Vary the kill strategy to hit different points in the DML cycle: - // - Early cycles: kill after schema + first inserts (mid-DML) - // - Middle cycles: kill after inserts but before updates complete - // - Late cycles: kill after full cycles complete but before close() - // - Some cycles: kill on a timer for unpredictable timing - - let innerCycles - let killStrategy - const startCycle = nextStartCycle - - if (outerCycle === 0) { - // First cycle: let schema creation + first batch complete - innerCycles = 2 - killStrategy = { onMessage: `cycle-done:${startCycle + 1}` } - } else if (outerCycle < 5) { - // Kill mid-insert (after first inner cycle's inserts) - innerCycles = 3 - killStrategy = { onMessage: `inserts-done:${startCycle + 1}` } - } else if (outerCycle < 10) { - // Kill mid-update - innerCycles = 2 - killStrategy = { onMessage: `updates-done:${startCycle}` } - } else if (outerCycle < 15) { - // Kill mid-delete - innerCycles = 2 - killStrategy = { onMessage: `deletes-done:${startCycle}` } - } else if (outerCycle < 20) { - // Kill after a full cycle completes (dirty but "consistent" state) - innerCycles = 3 - killStrategy = { onMessage: `cycle-done:${startCycle}` } - } else if (outerCycle < 25) { - // Timer-based kill for unpredictable timing (500ms-2000ms) - innerCycles = 4 - killStrategy = { afterMs: 500 + outerCycle * 60 } - } else if (outerCycle < 30) { - // Let many cycles run, kill after all are done (max WAL, no checkpoint) - innerCycles = 5 - killStrategy = { onMessage: 'all-cycles-done' } - } else { - // Final stretch: aggressive timer kills (200ms) - innerCycles = 3 - killStrategy = { afterMs: 200 } - } - - nextStartCycle += innerCycles - - const result = await runWorkerAndKill( - workerPath, - startCycle, - innerCycles, - killStrategy, - ) - - allResults.push(result) - - // Track directory size - const currentSize = dirSizeBytes(dataDir) - sizeHistory.push({ outerCycle, sizeBytes: currentSize }) - - // Log progress every 5 cycles - if (outerCycle % 5 === 0 || outerCycle === TOTAL_OUTER_CYCLES - 1) { - const sizeMB = (currentSize / 1024 / 1024).toFixed(2) - const messagesPreview = result.messages.slice(0, 5).join(', ') - console.log( - `Outer cycle ${outerCycle}/${TOTAL_OUTER_CYCLES}: ` + - `size=${sizeMB}MB, killed=${result.killed}, ` + - `messages=[${messagesPreview}...]`, - ) - } - - // The worker should have been killed OR crashed (crashing on open - // is itself evidence of corruption from previous cycles) - const workerCrashedOrKilled = result.killed || result.exitCode !== 0 - if (!result.killed && result.exitCode !== 0) { - console.log( - ` Worker CRASHED on its own at outer cycle ${outerCycle} (exit=${result.exitCode}): ${result.stderr.slice(0, 200)}`, - ) - // If the worker can't even open the DB, that's corruption — skip remaining cycles - if ( - result.messages.length === 0 || - !result.messages.includes('ready') - ) { - console.log( - ` CORRUPTION: Worker couldn't open DB at outer cycle ${outerCycle}`, - ) - break - } - } - expect(workerCrashedOrKilled).toBe(true) - - // For message-based kills, the worker MUST have started (it sent the trigger) - // For timer-based kills, the worker may not have had time to open the DB - // (especially after WAL bloat makes recovery slow) — that's OK - if (result.killed && !killStrategy.afterMs) { - const started = - result.messages.includes('ready') || - result.messages.includes('schema-created') || - result.messages.some((m) => m.startsWith('cycle-start:')) - expect(started).toBe(true) - } else if ( - result.killed && - killStrategy.afterMs && - result.messages.length === 0 - ) { - // Worker couldn't even open before timer — WAL bloat is severe - console.log( - ` WAL BLOAT: Worker couldn't open DB within ${killStrategy.afterMs}ms at outer cycle ${outerCycle}`, - ) - } - - // Intermediate verification every 10 cycles to catch progressive corruption - if (outerCycle > 0 && outerCycle % 10 === 0) { - console.log( - `\n--- Intermediate check after outer cycle ${outerCycle} ---`, - ) - let checkDb = null - try { - checkDb = new PGlite(dataDir) - await Promise.race([ - checkDb.waitReady, - new Promise((_, reject) => - setTimeout( - () => - reject( - new Error( - `Intermediate open timed out at cycle ${outerCycle}`, - ), - ), - 30000, - ), - ), - ]) - - // Basic health - await checkDb.query('SELECT 1') - - // Count rows - const count = await checkDb.query( - 'SELECT count(*)::int as cnt FROM wal_stress', - ) - console.log(` wal_stress rows: ${count.rows[0].cnt}`) - - const logCount = await checkDb.query( - 'SELECT count(*)::int as cnt FROM wal_stress_log', - ) - console.log(` wal_stress_log rows: ${logCount.rows[0].cnt}`) - - // Verify indexes work - await checkDb.query( - `SELECT count(*) FROM wal_stress WHERE kind = 'alpha'`, - ) - await checkDb.query( - `SELECT count(*) FROM wal_stress WHERE cycle = 0`, - ) - - // Full sequential scan - const allRows = await checkDb.query( - 'SELECT id, cycle, batch, kind FROM wal_stress ORDER BY id', - ) - expect(allRows.rows.length).toBe(count.rows[0].cnt) - - await checkDb.close() - - // IMPORTANT: After this close(), a checkpoint DOES run. - // This resets the WAL accumulation. For the purest test of - // "never checkpointed" WAL, we only do this check occasionally. - console.log(` (checkpoint occurred due to close() -- WAL reset)`) - } catch (err) { - console.log(` CORRUPTION at cycle ${outerCycle}: ${err.message}`) - if (checkDb) - try { - await checkDb.close() - } catch (_) { - /* ignore */ - } - expect.fail( - `DB corrupted after ${outerCycle} kill cycles: ${err.message}`, - ) - } - } - - // No delay between cycles - maximum aggression - } - - // ---- Final comprehensive verification ---- - console.log('\n====== FINAL VERIFICATION ======') - console.log(`Completed ${TOTAL_OUTER_CYCLES} outer kill cycles`) - console.log(`Data dir size history:`) - for (const s of sizeHistory.filter( - (_, i) => i % 5 === 0 || i === sizeHistory.length - 1, - )) { - console.log( - ` Cycle ${s.outerCycle}: ${(s.sizeBytes / 1024 / 1024).toFixed(2)} MB`, - ) - } - - const finalSize = dirSizeBytes(dataDir) - console.log( - `Final data dir size: ${(finalSize / 1024 / 1024).toFixed(2)} MB`, - ) - - let finalDb = null - try { - finalDb = new PGlite(dataDir) - await Promise.race([ - finalDb.waitReady, - new Promise((_, reject) => - setTimeout( - () => reject(new Error('Final open timed out (WAL bloat)')), - 60000, - ), - ), - ]) - } catch (err) { - console.log( - `FINAL CORRUPTION after ${TOTAL_OUTER_CYCLES} cycles: ${err.message}`, - ) - expect.fail(`DB corrupted after all WAL bloat cycles: ${err.message}`) - } - - try { - // Health check - await finalDb.query('SELECT 1 as ok') - - // Verify tables exist - const tables = await finalDb.query(` - SELECT tablename FROM pg_tables - WHERE schemaname = 'public' - ORDER BY tablename - `) - const tableNames = tables.rows.map((r) => r.tablename) - expect(tableNames).toContain('wal_stress') - expect(tableNames).toContain('wal_stress_log') - console.log(`Tables: ${tableNames.join(', ')}`) - - // Row counts - const stressCount = await finalDb.query( - 'SELECT count(*)::int as cnt FROM wal_stress', - ) - const logCount = await finalDb.query( - 'SELECT count(*)::int as cnt FROM wal_stress_log', - ) - console.log(`wal_stress rows: ${stressCount.rows[0].cnt}`) - console.log(`wal_stress_log rows: ${logCount.rows[0].cnt}`) - expect(stressCount.rows[0].cnt).toBeGreaterThan(0) - - // Index verification - const indexes = await finalDb.query(` - SELECT indexname, tablename FROM pg_indexes - WHERE schemaname = 'public' - ORDER BY indexname - `) - console.log( - `Indexes: ${indexes.rows.map((r) => r.indexname).join(', ')}`, - ) - for (const idx of indexes.rows) { - await finalDb.query(`SELECT count(*) FROM "${idx.tablename}"`) - } - - // Full sequential scan of wal_stress - const allRows = await finalDb.query( - 'SELECT id, cycle, batch, kind, counter FROM wal_stress ORDER BY id', - ) - expect(allRows.rows.length).toBe(stressCount.rows[0].cnt) - - // Verify data integrity: no NULLs in NOT NULL columns - for (const row of allRows.rows) { - expect(row.id).not.toBeNull() - expect(row.cycle).not.toBeNull() - expect(row.batch).not.toBeNull() - expect(row.kind).not.toBeNull() - expect(['alpha', 'beta', 'gamma', 'delta']).toContain(row.kind) - } - - // Aggregate checks - const kindCounts = await finalDb.query(` - SELECT kind, count(*)::int as cnt, avg(counter)::int as avg_counter - FROM wal_stress - GROUP BY kind - ORDER BY kind - `) - console.log('Kind distribution:') - for (const row of kindCounts.rows) { - console.log( - ` ${row.kind}: ${row.cnt} rows, avg_counter=${row.avg_counter}`, - ) - } - - // Verify cycle distribution (should span multiple cycles) - const cycleDist = await finalDb.query(` - SELECT min(cycle)::int as min_cycle, max(cycle)::int as max_cycle, - count(DISTINCT cycle)::int as distinct_cycles - FROM wal_stress - `) - console.log( - `Cycle range: ${cycleDist.rows[0].min_cycle} - ${cycleDist.rows[0].max_cycle}, ` + - `${cycleDist.rows[0].distinct_cycles} distinct cycles`, - ) - expect(cycleDist.rows[0].distinct_cycles).toBeGreaterThan(1) - - // Verify log table integrity - const logRows = await finalDb.query( - 'SELECT id, cycle, operation, row_count FROM wal_stress_log ORDER BY id', - ) - expect(logRows.rows.length).toBe(logCount.rows[0].cnt) - for (const row of logRows.rows) { - expect(row.operation).toBe('full-cycle') - expect(row.row_count).toBeGreaterThan(0) - } - - // Verify we can still write after recovery - await finalDb.query( - `INSERT INTO wal_stress (cycle, batch, kind, value, counter) - VALUES ($1, $2, $3, $4, $5)`, - [99999, 0, 'alpha', 'final-verification-row', 0], - ) - const verifyRow = await finalDb.query( - `SELECT * FROM wal_stress WHERE cycle = 99999`, - ) - expect(verifyRow.rows.length).toBe(1) - - // Verify we can still UPDATE after recovery - await finalDb.query( - `UPDATE wal_stress SET counter = -1 WHERE cycle = 99999`, - ) - const updatedRow = await finalDb.query( - `SELECT counter FROM wal_stress WHERE cycle = 99999`, - ) - expect(updatedRow.rows[0].counter).toBe(-1) - - // Verify we can still DELETE after recovery - await finalDb.query(`DELETE FROM wal_stress WHERE cycle = 99999`) - const deletedRow = await finalDb.query( - `SELECT count(*)::int as cnt FROM wal_stress WHERE cycle = 99999`, - ) - expect(deletedRow.rows[0].cnt).toBe(0) - } finally { - await finalDb.close() - } - }, - { timeout: 300000 }, - ) - - it( - 'should survive burst-mode: 15 extremely rapid kill cycles with no delay', - async () => { - const { PGlite } = await import('../../dist/index.js') - const burstDataDir = `${dataDir}-burst` - const workerPath = new URL( - './workers/wal-bloat-no-checkpoint.js', - import.meta.url, - ).pathname - - const BURST_CYCLES = 15 - - for (let i = 0; i < BURST_CYCLES; i++) { - // Each burst cycle: open, do 1-2 inner cycles of heavy DML, SIGKILL - // Kill very aggressively: 300ms timer (may kill mid-INSERT) - const burstResult = await new Promise((resolve) => { - const messages = [] - let killed = false - - const child = fork(workerPath, [], { - env: { - ...process.env, - PGLITE_DATA_DIR: burstDataDir, - INNER_CYCLES: '2', - START_CYCLE: String(i * 2), - }, - stdio: ['pipe', 'pipe', 'pipe', 'ipc'], - }) - - child.stderr.on('data', () => {}) - child.stdout.on('data', () => {}) - - child.on('message', (msg) => { - messages.push(msg) - }) - - child.on('exit', (code, sig) => { - resolve({ - killed: killed || sig === 'SIGKILL', - exitCode: code, - messages, - }) - }) - - // Very aggressive: kill after 300ms no matter what - setTimeout(() => { - if (!killed) { - killed = true - child.kill('SIGKILL') - } - }, 300) - - // Safety - setTimeout(() => { - if (!killed) { - killed = true - child.kill('SIGKILL') - } - }, 30000) - }) - - // Worker should either be killed by timer, crash on open, or crash during DML - const workerKilled = burstResult.killed - const workerCrashed = - burstResult.exitCode !== 0 && burstResult.exitCode !== null - const workerCouldntOpen = !burstResult.messages.includes('ready') - - if (workerCouldntOpen && !workerKilled) { - console.log( - ` Burst cycle ${i}: Worker couldn't open DB (corruption from previous cycles)`, - ) - break // DB is corrupted, no point continuing - } - - // The worker must have been killed OR crashed (either before or after opening) - expect(workerKilled || workerCrashed || workerCouldntOpen).toBe(true) - } - - // Verify after burst - console.log('\n--- Burst mode verification ---') - const burstSize = dirSizeBytes(burstDataDir) - console.log( - `Burst data dir size: ${(burstSize / 1024 / 1024).toFixed(2)} MB`, - ) - - let db = null - try { - db = new PGlite(burstDataDir) - await Promise.race([ - db.waitReady, - new Promise((_, reject) => - setTimeout( - () => reject(new Error('Burst mode final open timed out')), - 30000, - ), - ), - ]) - - await db.query('SELECT 1') - - // The table may or may not exist depending on whether the first - // cycle's schema creation completed before the 300ms kill - const tableCheck = await db.query(` - SELECT tablename FROM pg_tables - WHERE schemaname = 'public' AND tablename = 'wal_stress' - `) - - if (tableCheck.rows.length > 0) { - const count = await db.query( - 'SELECT count(*)::int as cnt FROM wal_stress', - ) - console.log(`Burst mode rows: ${count.rows[0].cnt}`) - - // Full scan - const rows = await db.query( - 'SELECT id, cycle, kind FROM wal_stress ORDER BY id', - ) - expect(rows.rows.length).toBe(count.rows[0].cnt) - } else { - // Table doesn't exist = schema creation was killed every time. - // This is a valid outcome, not corruption. - console.log('Burst mode: schema never completed (killed too fast)') - } - - // Verify DB is writable - await db.query(` - CREATE TABLE IF NOT EXISTS burst_verify (id SERIAL PRIMARY KEY, ok BOOLEAN) - `) - await db.query(`INSERT INTO burst_verify (ok) VALUES (true)`) - const verify = await db.query('SELECT ok FROM burst_verify') - expect(verify.rows[0].ok).toBe(true) - - await db.close() - } catch (err) { - console.log(`Burst mode CORRUPTION: ${err.message}`) - if (db) - try { - await db.close() - } catch (_) { - /* ignore */ - } - expect.fail(`Burst mode corrupted DB: ${err.message}`) - } finally { - if (!process.env.RETAIN_DATA) { - if (existsSync(burstDataDir)) { - rmSync(burstDataDir, { recursive: true, force: true }) - } - const lockFile = burstDataDir + '.lock' - if (existsSync(lockFile)) { - rmSync(lockFile, { force: true }) - } - } - } - }, - { timeout: 180000 }, - ) -}) diff --git a/packages/pglite/tests/crash-safety/workers/hmr-double-instance.js b/packages/pglite/tests/crash-safety/workers/hmr-double-instance.js deleted file mode 100644 index edee7ba82..000000000 --- a/packages/pglite/tests/crash-safety/workers/hmr-double-instance.js +++ /dev/null @@ -1,88 +0,0 @@ -// HMR double-instance: opens instance A, then tries to open instance B on the -// same data dir without closing A. Instance B should be blocked by file locking. -// Process stays alive for external SIGKILL. - -import { PGlite } from '../../../dist/index.js' - -const dataDir = process.env.PGLITE_DATA_DIR -const cycle = parseInt(process.env.CYCLE || '0', 10) -const overlapOps = parseInt(process.env.OVERLAP_OPS || '20', 10) - -async function run() { - const instanceA = new PGlite(dataDir) - await instanceA.waitReady - process.send('instance-a-ready') - - if (cycle === 0) { - await instanceA.query(` - CREATE TABLE IF NOT EXISTS hmr_data ( - id SERIAL PRIMARY KEY, - cycle INTEGER NOT NULL, - instance TEXT NOT NULL, - phase TEXT NOT NULL, - seq INTEGER NOT NULL, - payload TEXT NOT NULL, - created_at TIMESTAMP DEFAULT NOW() - ) - `) - await instanceA.query(` - CREATE INDEX IF NOT EXISTS idx_hmr_data_cycle ON hmr_data (cycle) - `) - await instanceA.query(` - CREATE INDEX IF NOT EXISTS idx_hmr_data_instance ON hmr_data (instance) - `) - process.send('schema-created') - } - - const padding = 'A'.repeat(500) - for (let i = 0; i < 10; i++) { - await instanceA.query( - `INSERT INTO hmr_data (cycle, instance, phase, seq, payload) - VALUES ($1, $2, $3, $4, $5)`, - [cycle, 'A', 'pre-hmr', i, `cycle${cycle}-A-pre-${i}-${padding}`], - ) - } - process.send('instance-a-wrote') - - // Instance B should fail to open due to file lock - try { - const instanceB = new PGlite(dataDir) - await instanceB.waitReady - process.send('instance-b-ready') - - for (let i = 0; i < overlapOps; i++) { - await instanceB.query( - `INSERT INTO hmr_data (cycle, instance, phase, seq, payload) - VALUES ($1, $2, $3, $4, $5)`, - [cycle, 'B', 'post-hmr', i, `cycle${cycle}-B-post-${i}-${padding}`], - ) - } - process.send('overlap-done') - } catch (err) { - process.send('instance-b-blocked') - process.send(`lock-error:${err.message}`) - } - - for (let i = 10; i < 20; i++) { - await instanceA.query( - `INSERT INTO hmr_data (cycle, instance, phase, seq, payload) - VALUES ($1, $2, $3, $4, $5)`, - [cycle, 'A', 'post-hmr', i, `cycle${cycle}-A-post-${i}-${padding}`], - ) - } - process.send('instance-a-continued') - - process.send('all-operations-done') - - await new Promise(() => {}) -} - -run().catch((err) => { - console.error(`HMR worker cycle ${cycle} error:`, err) - try { - process.send(`fatal:${err.message}`) - } catch (_) { - /* ignore */ - } - process.exit(1) -}) diff --git a/packages/pglite/tests/crash-safety/workers/overlapping-ddl-writer.js b/packages/pglite/tests/crash-safety/workers/overlapping-ddl-writer.js deleted file mode 100644 index 12cce03dc..000000000 --- a/packages/pglite/tests/crash-safety/workers/overlapping-ddl-writer.js +++ /dev/null @@ -1,116 +0,0 @@ -// DDL/DML writer: runs DDL or DML ops based on WRITER_MODE env var. -// Killed externally by parent to simulate crash. - -import { PGlite } from '../../../dist/index.js' - -const dataDir = process.env.PGLITE_DATA_DIR -const mode = process.env.WRITER_MODE || 'dml' -const cycle = parseInt(process.env.CYCLE || '0', 10) - -async function run() { - const pad = 'D'.repeat(300) - - const db = new PGlite(dataDir) - await db.waitReady - process.send('ready') - - if (mode === 'ddl') { - try { - await db.query(` - CREATE TABLE IF NOT EXISTS ddl_base ( - id SERIAL PRIMARY KEY, - cycle INTEGER NOT NULL, - data TEXT NOT NULL - ) - `) - process.send('base-table-ready') - - const tbl = `ddl_cycle_${cycle}` - await db.query(` - CREATE TABLE IF NOT EXISTS ${tbl} ( - id SERIAL PRIMARY KEY, - val TEXT NOT NULL, - num INTEGER DEFAULT 0 - ) - `) - process.send('new-table-created') - - await db.query( - `ALTER TABLE ${tbl} ADD COLUMN IF NOT EXISTS extra TEXT DEFAULT 'none'`, - ) - process.send('alter-done') - - await db.query( - `CREATE INDEX IF NOT EXISTS idx_${tbl}_val ON ${tbl} (val)`, - ) - await db.query( - `CREATE INDEX IF NOT EXISTS idx_${tbl}_num ON ${tbl} (num)`, - ) - process.send('indexes-created') - - for (let i = 0; i < 30; i++) { - await db.query( - `INSERT INTO ${tbl} (val, num, extra) VALUES ($1, $2, $3)`, - [`val-${i}`, i * 10, `extra-${i}-${pad}`], - ) - } - process.send('ddl-inserts-done') - - for (let i = 0; i < 20; i++) { - await db.query(`INSERT INTO ddl_base (cycle, data) VALUES ($1, $2)`, [ - cycle, - `ddl-writer-${i}-${pad}`, - ]) - } - process.send('ddl-base-inserts-done') - } catch (err) { - process.send(`ddl-error:${err.message}`) - } - } else { - try { - await db.query(` - CREATE TABLE IF NOT EXISTS ddl_base ( - id SERIAL PRIMARY KEY, - cycle INTEGER NOT NULL, - data TEXT NOT NULL - ) - `) - process.send('base-table-ready') - - for (let i = 0; i < 50; i++) { - await db.query(`INSERT INTO ddl_base (cycle, data) VALUES ($1, $2)`, [ - cycle, - `dml-writer-${i}-${pad}`, - ]) - } - process.send('dml-inserts-done') - - await db.query( - `UPDATE ddl_base SET data = data || '-updated' WHERE cycle = $1`, - [cycle], - ) - process.send('dml-updates-done') - - await db.query(`DELETE FROM ddl_base WHERE cycle = $1 AND id % 3 = 0`, [ - cycle, - ]) - process.send('dml-deletes-done') - } catch (err) { - process.send(`dml-error:${err.message}`) - } - } - - process.send('all-done') - - await new Promise(() => {}) -} - -run().catch((err) => { - console.error(`DDL writer (${mode}) cycle ${cycle} error:`, err) - try { - process.send(`fatal:${err.message}`) - } catch (_) { - /* ignore */ - } - process.exit(1) -}) diff --git a/packages/pglite/tests/crash-safety/workers/overlapping-rapid-cycling.js b/packages/pglite/tests/crash-safety/workers/overlapping-rapid-cycling.js deleted file mode 100644 index f15597619..000000000 --- a/packages/pglite/tests/crash-safety/workers/overlapping-rapid-cycling.js +++ /dev/null @@ -1,91 +0,0 @@ -// Rapid instance cycling: opens N instances on the same data dir without -// closing any. All hold stale WASM heaps simultaneously. Simulates worst-case -// dev server rapid reload. - -import { PGlite } from '../../../dist/index.js' - -const dataDir = process.env.PGLITE_DATA_DIR -const numInstances = parseInt(process.env.NUM_INSTANCES || '10', 10) - -async function run() { - const instances = [] - const pad = 'R'.repeat(500) - - const first = new PGlite(dataDir) - await first.waitReady - instances.push(first) - - await first.query(` - CREATE TABLE IF NOT EXISTS rapid_data ( - id SERIAL PRIMARY KEY, - instance_num INTEGER NOT NULL, - seq INTEGER NOT NULL, - payload TEXT NOT NULL - ) - `) - await first.query( - `CREATE INDEX IF NOT EXISTS idx_rapid_inst ON rapid_data (instance_num)`, - ) - process.send('schema-created') - - await first.query( - `INSERT INTO rapid_data (instance_num, seq, payload) VALUES ($1, $2, $3)`, - [0, 0, `inst-0-row-0-${pad}`], - ) - process.send('instance-0-wrote') - - for (let i = 1; i < numInstances; i++) { - const inst = new PGlite(dataDir) - await inst.waitReady - instances.push(inst) - - await inst.query( - `INSERT INTO rapid_data (instance_num, seq, payload) VALUES ($1, $2, $3)`, - [i, 0, `inst-${i}-row-0-${pad}`], - ) - process.send(`instance-${i}-wrote`) - - // Stale cache: also write from a random older instance - if (instances.length > 2) { - const oldIdx = Math.floor(Math.random() * (instances.length - 1)) - try { - await instances[oldIdx].query( - `INSERT INTO rapid_data (instance_num, seq, payload) VALUES ($1, $2, $3)`, - [oldIdx, i, `inst-${oldIdx}-stale-write-from-cycle-${i}-${pad}`], - ) - } catch (err) { - process.send(`stale-write-error-${oldIdx}:${err.message}`) - } - } - } - - process.send('all-instances-created') - - try { - await Promise.all( - instances.map((inst, idx) => - inst.query( - `INSERT INTO rapid_data (instance_num, seq, payload) VALUES ($1, $2, $3)`, - [idx, 999, `inst-${idx}-final-burst-${pad}`], - ), - ), - ) - process.send('final-burst-done') - } catch (err) { - process.send(`final-burst-error:${err.message}`) - } - - process.send('all-done') - - await new Promise(() => {}) -} - -run().catch((err) => { - console.error(`Rapid cycling worker error:`, err) - try { - process.send(`fatal:${err.message}`) - } catch (_) { - /* ignore */ - } - process.exit(1) -}) diff --git a/packages/pglite/tests/crash-safety/workers/overlapping-staggered.js b/packages/pglite/tests/crash-safety/workers/overlapping-staggered.js deleted file mode 100644 index ca6cdc359..000000000 --- a/packages/pglite/tests/crash-safety/workers/overlapping-staggered.js +++ /dev/null @@ -1,101 +0,0 @@ -// Staggered overlap: instance A starts writing, instance B opens after a delay -// while A is still active. Both write concurrently to the same data dir. - -import { PGlite } from '../../../dist/index.js' - -const dataDir = process.env.PGLITE_DATA_DIR -const cycle = parseInt(process.env.CYCLE || '0', 10) -const staggerMs = parseInt(process.env.STAGGER_MS || '500', 10) - -async function run() { - const pad = 'S'.repeat(300) - - const a = new PGlite(dataDir) - await a.waitReady - process.send('a-ready') - - if (cycle === 0) { - await a.query(` - CREATE TABLE IF NOT EXISTS stagger_data ( - id SERIAL PRIMARY KEY, - cycle INTEGER NOT NULL, - instance TEXT NOT NULL, - seq INTEGER NOT NULL, - payload TEXT NOT NULL - ) - `) - await a.query( - `CREATE INDEX IF NOT EXISTS idx_stagger_cycle ON stagger_data (cycle)`, - ) - process.send('schema-created') - } - - const aWritePromise = (async () => { - for (let i = 0; i < 30; i++) { - await a.query( - `INSERT INTO stagger_data (cycle, instance, seq, payload) VALUES ($1, $2, $3, $4)`, - [cycle, 'A', i, `c${cycle}-A-${i}-${pad}`], - ) - if (i % 5 === 0) { - await new Promise((r) => setTimeout(r, 10)) - } - } - await a.query( - `UPDATE stagger_data SET payload = payload || '-a-updated' WHERE instance = 'A' AND cycle = $1 AND seq < 10`, - [cycle], - ) - })() - - await new Promise((r) => setTimeout(r, staggerMs)) - - const b = new PGlite(dataDir) - await b.waitReady - process.send('b-ready') - - const bWritePromise = (async () => { - for (let i = 0; i < 30; i++) { - await b.query( - `INSERT INTO stagger_data (cycle, instance, seq, payload) VALUES ($1, $2, $3, $4)`, - [cycle, 'B', i, `c${cycle}-B-${i}-${pad}`], - ) - } - // Deletes touch pages A may have cached - try { - await b.query( - `DELETE FROM stagger_data WHERE instance = 'A' AND cycle = $1 AND seq > 20`, - [cycle], - ) - } catch (_) { - /* ignore */ - } - })() - - await Promise.allSettled([aWritePromise, bWritePromise]) - process.send('overlap-writes-done') - - // A continues with stale cache after B has mutated pages - try { - for (let i = 50; i < 60; i++) { - await a.query( - `INSERT INTO stagger_data (cycle, instance, seq, payload) VALUES ($1, $2, $3, $4)`, - [cycle, 'A', i, `c${cycle}-A-stale-${i}-${pad}`], - ) - } - process.send('a-stale-writes-done') - } catch (err) { - process.send(`a-stale-error:${err.message}`) - } - - process.send('all-done') - await new Promise(() => {}) -} - -run().catch((err) => { - console.error(`Staggered worker cycle ${cycle} error:`, err) - try { - process.send(`fatal:${err.message}`) - } catch (_) { - /* ignore */ - } - process.exit(1) -}) diff --git a/packages/pglite/tests/crash-safety/workers/overlapping-three-instances.js b/packages/pglite/tests/crash-safety/workers/overlapping-three-instances.js deleted file mode 100644 index 51196a2c2..000000000 --- a/packages/pglite/tests/crash-safety/workers/overlapping-three-instances.js +++ /dev/null @@ -1,104 +0,0 @@ -// Three simultaneous instances on the same data dir, none closed. -// All three write concurrently with stale WASM heaps. - -import { PGlite } from '../../../dist/index.js' - -const dataDir = process.env.PGLITE_DATA_DIR -const cycle = parseInt(process.env.CYCLE || '0', 10) - -async function run() { - const a = new PGlite(dataDir) - await a.waitReady - process.send('instance-a-ready') - - if (cycle === 0) { - await a.query(` - CREATE TABLE IF NOT EXISTS triple_data ( - id SERIAL PRIMARY KEY, - cycle INTEGER NOT NULL, - instance TEXT NOT NULL, - seq INTEGER NOT NULL, - payload TEXT NOT NULL - ) - `) - await a.query( - `CREATE INDEX IF NOT EXISTS idx_triple_cycle ON triple_data (cycle)`, - ) - process.send('schema-created') - } - - const pad = 'X'.repeat(400) - for (let i = 0; i < 10; i++) { - await a.query( - `INSERT INTO triple_data (cycle, instance, seq, payload) VALUES ($1, $2, $3, $4)`, - [cycle, 'A', i, `c${cycle}-A-${i}-${pad}`], - ) - } - process.send('a-wrote') - - const b = new PGlite(dataDir) - await b.waitReady - process.send('instance-b-ready') - - for (let i = 0; i < 10; i++) { - await b.query( - `INSERT INTO triple_data (cycle, instance, seq, payload) VALUES ($1, $2, $3, $4)`, - [cycle, 'B', i, `c${cycle}-B-${i}-${pad}`], - ) - } - process.send('b-wrote') - - const c = new PGlite(dataDir) - await c.waitReady - process.send('instance-c-ready') - - const writeAll = async (inst, name, start, count) => { - for (let i = start; i < start + count; i++) { - await inst.query( - `INSERT INTO triple_data (cycle, instance, seq, payload) VALUES ($1, $2, $3, $4)`, - [cycle, name, i, `c${cycle}-${name}-concurrent-${i}-${pad}`], - ) - } - } - - await Promise.all([ - writeAll(a, 'A', 100, 20), - writeAll(b, 'B', 100, 20), - writeAll(c, 'C', 100, 20), - ]) - process.send('concurrent-writes-done') - - // Mixed ops: A updates while B and C insert (page conflicts) - try { - await Promise.all([ - a.query( - `UPDATE triple_data SET payload = 'A-updated' WHERE instance = 'A' AND cycle = $1 AND seq < 5`, - [cycle], - ), - b.query( - `INSERT INTO triple_data (cycle, instance, seq, payload) VALUES ($1, 'B', 200, 'b-extra')`, - [cycle], - ), - c.query( - `INSERT INTO triple_data (cycle, instance, seq, payload) VALUES ($1, 'C', 200, 'c-extra')`, - [cycle], - ), - ]) - } catch (err) { - process.send(`mixed-ops-error:${err.message}`) - } - - process.send('all-done') - - await new Promise(() => {}) -} - -run().catch((err) => { - console.error(`Three-instance worker cycle ${cycle} error:`, err) - try { - process.send(`fatal:${err.message}`) - } catch (_) { - /* ignore */ - } - process.exit(1) -}) diff --git a/packages/pglite/tests/crash-safety/workers/wal-bloat-no-checkpoint.js b/packages/pglite/tests/crash-safety/workers/wal-bloat-no-checkpoint.js deleted file mode 100644 index c128b2867..000000000 --- a/packages/pglite/tests/crash-safety/workers/wal-bloat-no-checkpoint.js +++ /dev/null @@ -1,174 +0,0 @@ -// WAL bloat: heavy INSERT/UPDATE/DELETE without ever calling close() or CHECKPOINT. -// Parent SIGKILLs at random points. Tests recovery under extreme WAL pressure -// with complex entries (DDL, partial indexes, scatter updates). - -import { PGlite } from '../../../dist/index.js' - -const dataDir = process.env.PGLITE_DATA_DIR -const innerCycles = parseInt(process.env.INNER_CYCLES || '10', 10) -const startCycle = parseInt(process.env.START_CYCLE || '0', 10) - -let seed = startCycle * 1000 + 42 -function nextRand() { - seed = (seed * 1103515245 + 12345) & 0x7fffffff - return seed -} - -async function run() { - const db = new PGlite(dataDir) - await db.waitReady - process.send('ready') - - // Idempotent — prior cycle may have been killed mid-schema-creation - await db.query(` - CREATE TABLE IF NOT EXISTS wal_stress ( - id SERIAL PRIMARY KEY, - cycle INTEGER NOT NULL, - batch INTEGER NOT NULL, - kind TEXT NOT NULL, - value TEXT NOT NULL, - counter INTEGER DEFAULT 0, - updated_at TIMESTAMP DEFAULT NOW() - ) - `) - await db.query(` - CREATE INDEX IF NOT EXISTS idx_wal_stress_cycle ON wal_stress (cycle) - `) - await db.query(` - CREATE INDEX IF NOT EXISTS idx_wal_stress_kind ON wal_stress (kind) - `) - await db.query(` - CREATE TABLE IF NOT EXISTS wal_stress_log ( - id SERIAL PRIMARY KEY, - cycle INTEGER NOT NULL, - operation TEXT NOT NULL, - row_count INTEGER, - logged_at TIMESTAMP DEFAULT NOW() - ) - `) - process.send('schema-created') - - for (let c = 0; c < innerCycles; c++) { - const cycleNum = startCycle + c - process.send(`cycle-start:${cycleNum}`) - - const padding = 'W'.repeat(800) - for (let i = 0; i < 50; i++) { - const kind = ['alpha', 'beta', 'gamma', 'delta'][nextRand() % 4] - await db.query( - `INSERT INTO wal_stress (cycle, batch, kind, value, counter) - VALUES ($1, $2, $3, $4, $5)`, - [ - cycleNum, - i, - kind, - `c${cycleNum}-b${i}-${kind}-${padding}`, - nextRand() % 10000, - ], - ) - } - process.send(`inserts-done:${cycleNum}`) - - const kinds = ['alpha', 'beta', 'gamma', 'delta'] - const targetKind = kinds[nextRand() % 4] - await db.query( - `UPDATE wal_stress - SET counter = counter + $1, value = value || $2, updated_at = NOW() - WHERE kind = $3`, - [nextRand() % 100, `-upd${cycleNum}`, targetKind], - ) - - const updateCycle = nextRand() % (cycleNum + 1) - await db.query( - `UPDATE wal_stress - SET counter = counter + 1 - WHERE cycle = $1 AND batch < 25`, - [updateCycle], - ) - - const step = (nextRand() % 5) + 2 - await db.query( - `UPDATE wal_stress - SET value = LEFT(value, 200) || $1 - WHERE id % $2 = 0`, - [`-scatter${cycleNum}`, step], - ) - - process.send(`updates-done:${cycleNum}`) - - if (cycleNum > 2) { - const deleteCycle = nextRand() % Math.max(1, cycleNum - 1) - const deleteLimit = (nextRand() % 15) + 5 - await db.query( - `DELETE FROM wal_stress - WHERE id IN ( - SELECT id FROM wal_stress - WHERE cycle = $1 AND batch >= $2 - ORDER BY id - LIMIT $3 - )`, - [deleteCycle, 30, deleteLimit], - ) - } - - if (cycleNum % 3 === 0) { - await db.query( - `DELETE FROM wal_stress - WHERE kind = $1 AND cycle < $2 AND batch > 40`, - [kinds[nextRand() % 4], cycleNum], - ) - } - - process.send(`deletes-done:${cycleNum}`) - - await db.query( - `INSERT INTO wal_stress_log (cycle, operation, row_count) - VALUES ($1, $2, (SELECT count(*) FROM wal_stress))`, - [cycleNum, 'full-cycle'], - ) - - // Occasional DDL to diversify WAL entries - if (cycleNum % 5 === 0 && cycleNum > 0) { - const colName = `extra_${cycleNum}` - await db.query(` - ALTER TABLE wal_stress ADD COLUMN IF NOT EXISTS ${colName} TEXT DEFAULT NULL - `) - - await db.query(` - CREATE INDEX IF NOT EXISTS idx_wal_stress_c${cycleNum} - ON wal_stress (counter) - WHERE cycle = ${cycleNum} - `) - - process.send(`ddl-done:${cycleNum}`) - } - - if (cycleNum % 4 === 0) { - await db.query( - `UPDATE wal_stress - SET counter = counter + $1 - WHERE cycle >= $2`, - [1, Math.max(0, cycleNum - 3)], - ) - } - - process.send(`cycle-done:${cycleNum}`) - } - - process.send('all-cycles-done') - - // No close() or CHECKPOINT — keep alive for SIGKILL. - // setInterval needed because a bare Promise doesn't keep Node alive. - setInterval(() => {}, 60000) - await new Promise(() => {}) -} - -run().catch((err) => { - console.error(`WAL bloat worker error:`, err) - try { - process.send(`fatal:${err.message}`) - } catch (_) { - /* ignore */ - } - process.exit(1) -}) diff --git a/packages/pglite/tests/nodefs-lock.test.js b/packages/pglite/tests/nodefs-lock.test.js new file mode 100644 index 000000000..95ab21472 --- /dev/null +++ b/packages/pglite/tests/nodefs-lock.test.js @@ -0,0 +1,71 @@ +import { describe, it, expect, afterAll } from 'vitest' +import { existsSync, writeFileSync, rmSync } from 'node:fs' + +const dataDir = `/tmp/pglite-lock-test-${Date.now()}` + +afterAll(async () => { + if (!process.env.RETAIN_DATA) { + for (const p of [dataDir, dataDir + '.lock']) { + if (existsSync(p)) rmSync(p, { recursive: true, force: true }) + } + } +}) + +describe('NodeFS data directory locking', () => { + it('should block a second instance from opening the same data directory', async () => { + const { PGlite } = await import('../dist/index.js') + + const db1 = new PGlite(dataDir) + await db1.waitReady + + // Lock file should exist while db1 is open + expect(existsSync(dataDir + '.lock')).toBe(true) + + // Second instance on same dir must throw + let lockError = null + try { + const db2 = new PGlite(dataDir) + await db2.waitReady + await db2.close() + } catch (err) { + lockError = err + } + + expect(lockError).not.toBeNull() + expect(lockError.message).toContain('already in use') + expect(lockError.message).toContain(String(process.pid)) + + // First instance should still work fine + const result = await db1.query('SELECT 1 as ok') + expect(result.rows[0].ok).toBe(1) + + await db1.close() + }, 30000) + + it('should allow reopening after the first instance is closed', async () => { + const { PGlite } = await import('../dist/index.js') + + // Lock file should be cleaned up after close + expect(existsSync(dataDir + '.lock')).toBe(false) + + const db = new PGlite(dataDir) + await db.waitReady + const result = await db.query('SELECT 1 as ok') + expect(result.rows[0].ok).toBe(1) + await db.close() + }, 30000) + + it('should override a stale lock from a dead process', async () => { + const { PGlite } = await import('../dist/index.js') + + // Write a fake lock file with a PID that doesn't exist + writeFileSync(dataDir + '.lock', '999999\n0\n') + + // Should succeed — stale lock gets overridden + const db = new PGlite(dataDir) + await db.waitReady + const result = await db.query('SELECT 1 as ok') + expect(result.rows[0].ok).toBe(1) + await db.close() + }, 30000) +})