From 5544e9cb965faf4fea06b38d77a84354abd34c10 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Tue, 17 Mar 2026 15:22:07 -0400 Subject: [PATCH 01/16] initial local + pg stubs Signed-off-by: nathancolosimo --- packages/core/src/index.ts | 6 ++ packages/core/src/lock.ts | 36 +++++++++ packages/core/src/workflow/index.ts | 6 ++ packages/world-local/README.md | 3 +- packages/world-local/src/index.ts | 2 + packages/world-local/src/limits.ts | 15 ++++ packages/world-postgres/src/index.ts | 3 + packages/world-postgres/src/limits.ts | 20 +++++ packages/world-vercel/src/index.ts | 2 + packages/world-vercel/src/limits.ts | 16 ++++ packages/world/package.json | 4 +- packages/world/src/index.ts | 18 +++++ packages/world/src/interfaces.ts | 3 + packages/world/src/limits.ts | 105 ++++++++++++++++++++++++++ 14 files changed, 237 insertions(+), 2 deletions(-) create mode 100644 packages/core/src/lock.ts create mode 100644 packages/world-local/src/limits.ts create mode 100644 packages/world-postgres/src/limits.ts create mode 100644 packages/world-vercel/src/limits.ts create mode 100644 packages/world/src/limits.ts diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 1d969aeaa6..413f87fa74 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -25,6 +25,12 @@ export { type WebhookOptions, } from './create-hook.js'; export { defineHook, type TypedHook } from './define-hook.js'; +export { + lock, + type LockHandle, + type LockOptions, + LIMITS_NOT_IMPLEMENTED_MESSAGE, +} from './lock.js'; export { sleep } from './sleep.js'; export { getStepMetadata, diff --git a/packages/core/src/lock.ts b/packages/core/src/lock.ts new file mode 100644 index 0000000000..fc9a848875 --- /dev/null +++ b/packages/core/src/lock.ts @@ -0,0 +1,36 @@ +import { + createLimitsNotImplementedError, + type LimitDefinition, + type LimitKey, + type LimitLease, +} from '@workflow/world'; + +export { LIMITS_NOT_IMPLEMENTED_MESSAGE } from '@workflow/world'; + +/** + * 
Reserved first-pass user-facing API for future flow concurrency and rate + * limiting inside workflow functions. + */ +export interface LockOptions extends LimitDefinition { + key: LimitKey; + leaseTtlMs?: number; +} + +/** + * Reserved handle shape for future lock acquisition. + */ +export interface LockHandle + extends Pick { + release(): Promise; + heartbeat(ttlMs?: number): Promise; +} + +/** + * Reserved workflow API for future concurrency and rate limiting. + * + * This placeholder intentionally throws until the runtime and world + * implementations gain real support. + */ +export async function lock(_options: LockOptions): Promise { + throw createLimitsNotImplementedError(); +} diff --git a/packages/core/src/workflow/index.ts b/packages/core/src/workflow/index.ts index 61cc317491..86807ed04b 100644 --- a/packages/core/src/workflow/index.ts +++ b/packages/core/src/workflow/index.ts @@ -6,6 +6,12 @@ export { type RetryableErrorOptions, } from '@workflow/errors'; export type { Hook, HookOptions } from '../create-hook.js'; +export { + lock, + type LockHandle, + type LockOptions, + LIMITS_NOT_IMPLEMENTED_MESSAGE, +} from '../lock.js'; export { sleep } from '../sleep.js'; export { createHook, createWebhook } from './create-hook.js'; export { defineHook } from './define-hook.js'; diff --git a/packages/world-local/README.md b/packages/world-local/README.md index 9e3f0d95cc..cff6a3354a 100644 --- a/packages/world-local/README.md +++ b/packages/world-local/README.md @@ -4,5 +4,6 @@ Filesystem-based workflow backend for local development and testing. Stores workflow data as JSON files on disk and provides in-memory queuing. Automatically detects development server port for queue transport. -Used by default on `next dev` and `next start`. +The `limits` namespace is exposed as part of the shared world contract, but flow concurrency and rate limiting are not implemented in this package yet. +Used by default on `next dev` and `next start`. 
diff --git a/packages/world-local/src/index.ts b/packages/world-local/src/index.ts index 6ec4800c8e..96f03efa57 100644 --- a/packages/world-local/src/index.ts +++ b/packages/world-local/src/index.ts @@ -12,6 +12,7 @@ import { readJSON, } from './fs.js'; import { initDataDir } from './init.js'; +import { createLimits } from './limits.js'; import { createQueue, type DirectHandler } from './queue.js'; import { createStorage } from './storage.js'; import { hashToken } from './storage/helpers.js'; @@ -61,6 +62,7 @@ export function createLocalWorld(args?: Partial): LocalWorld { const tag = mergedConfig.tag; const queue = createQueue(mergedConfig); return { + limits: createLimits(mergedConfig.dataDir, tag), ...queue, ...createStorage(mergedConfig.dataDir, tag), ...createStreamer(mergedConfig.dataDir, tag), diff --git a/packages/world-local/src/limits.ts b/packages/world-local/src/limits.ts new file mode 100644 index 0000000000..5e2f249449 --- /dev/null +++ b/packages/world-local/src/limits.ts @@ -0,0 +1,15 @@ +import { createLimitsNotImplementedError, type Limits } from '@workflow/world'; + +export function createLimits(dataDir: string, tag?: string): Limits { + return { + async acquire() { + throw createLimitsNotImplementedError(); + }, + async release() { + throw createLimitsNotImplementedError(); + }, + async heartbeat() { + throw createLimitsNotImplementedError(); + }, + }; +} diff --git a/packages/world-postgres/src/index.ts b/packages/world-postgres/src/index.ts index 6f2993e3db..ad1a4c0028 100644 --- a/packages/world-postgres/src/index.ts +++ b/packages/world-postgres/src/index.ts @@ -3,6 +3,7 @@ import type { Storage, World } from '@workflow/world'; import createPostgres from 'postgres'; import type { PostgresWorldConfig } from './config.js'; import { createClient, type Drizzle } from './drizzle/index.js'; +import { createLimits } from './limits.js'; import { createQueue } from './queue.js'; import { createEventsStorage, @@ -37,8 +38,10 @@ export function 
createWorld( const queue = createQueue(config, postgres); const storage = createStorage(drizzle); const streamer = createStreamer(postgres, drizzle); + const limits = createLimits(config, drizzle); return { + limits, ...storage, ...streamer, ...queue, diff --git a/packages/world-postgres/src/limits.ts b/packages/world-postgres/src/limits.ts new file mode 100644 index 0000000000..7294a90c3b --- /dev/null +++ b/packages/world-postgres/src/limits.ts @@ -0,0 +1,20 @@ +import { createLimitsNotImplementedError, type Limits } from '@workflow/world'; +import type { PostgresWorldConfig } from './config.js'; +import type { Drizzle } from './drizzle/index.js'; + +export function createLimits( + config: PostgresWorldConfig, + drizzle: Drizzle +): Limits { + return { + async acquire() { + throw createLimitsNotImplementedError(); + }, + async release() { + throw createLimitsNotImplementedError(); + }, + async heartbeat() { + throw createLimitsNotImplementedError(); + }, + }; +} diff --git a/packages/world-vercel/src/index.ts b/packages/world-vercel/src/index.ts index 975dc49863..ec7b9bdb1b 100644 --- a/packages/world-vercel/src/index.ts +++ b/packages/world-vercel/src/index.ts @@ -1,5 +1,6 @@ import type { World } from '@workflow/world'; import { createGetEncryptionKeyForRun } from './encryption.js'; +import { createLimits } from './limits.js'; import { createQueue } from './queue.js'; import { createResolveLatestDeploymentId } from './resolve-latest-deployment.js'; import { createStorage } from './storage.js'; @@ -23,6 +24,7 @@ export function createVercelWorld(config?: APIConfig): World { config?.projectConfig?.projectId || process.env.VERCEL_PROJECT_ID; return { + limits: createLimits(config), ...createQueue(config), ...createStorage(config), ...createStreamer(config), diff --git a/packages/world-vercel/src/limits.ts b/packages/world-vercel/src/limits.ts new file mode 100644 index 0000000000..bff6c07ac2 --- /dev/null +++ b/packages/world-vercel/src/limits.ts @@ -0,0 +1,16 @@ 
+import { createLimitsNotImplementedError, type Limits } from '@workflow/world'; +import type { APIConfig } from './utils.js'; + +export function createLimits(config?: APIConfig): Limits { + return { + async acquire() { + throw createLimitsNotImplementedError(); + }, + async release() { + throw createLimitsNotImplementedError(); + }, + async heartbeat() { + throw createLimitsNotImplementedError(); + }, + }; +} diff --git a/packages/world/package.json b/packages/world/package.json index e250e45412..57546d8b1c 100644 --- a/packages/world/package.json +++ b/packages/world/package.json @@ -20,7 +20,8 @@ "scripts": { "build": "tsc", "dev": "tsc --watch", - "clean": "tsc --build --clean && rm -rf dist" + "clean": "tsc --build --clean && rm -rf dist", + "test": "vitest run src" }, "dependencies": { "ulid": "catalog:" @@ -30,6 +31,7 @@ }, "devDependencies": { "@types/node": "catalog:", + "vitest": "catalog:", "zod": "catalog:", "@workflow/tsconfig": "workspace:*" }, diff --git a/packages/world/src/index.ts b/packages/world/src/index.ts index 3e7ed1c4fb..fd12d63d94 100644 --- a/packages/world/src/index.ts +++ b/packages/world/src/index.ts @@ -10,6 +10,24 @@ export { export type * from './hooks.js'; export { HookSchema } from './hooks.js'; export type * from './interfaces.js'; +export type * from './limits.js'; +export { + createLimitsNotImplementedError, + LimitAcquireAcquiredResultSchema, + LimitAcquireBlockedResultSchema, + LimitAcquireRequestSchema, + LimitAcquireResultSchema, + LimitAcquireStatusSchema, + LimitBlockedReasonSchema, + LimitConcurrencySchema, + LimitDefinitionSchema, + LimitHeartbeatRequestSchema, + LimitKeySchema, + LimitLeaseSchema, + LimitRateSchema, + LimitReleaseRequestSchema, + LIMITS_NOT_IMPLEMENTED_MESSAGE, +} from './limits.js'; export type * from './queue.js'; export { HealthCheckPayloadSchema, diff --git a/packages/world/src/interfaces.ts b/packages/world/src/interfaces.ts index d53fd96d14..87c57c0c8f 100644 --- 
a/packages/world/src/interfaces.ts +++ b/packages/world/src/interfaces.ts @@ -9,6 +9,7 @@ import type { RunCreatedEventRequest, } from './events.js'; import type { GetHookParams, Hook, ListHooksParams } from './hooks.js'; +import type { Limits } from './limits.js'; import type { Queue } from './queue.js'; import type { GetWorkflowRunParams, @@ -179,6 +180,8 @@ export interface Storage { * The "World" interface represents how Workflows are able to communicate with the outside world. */ export interface World extends Queue, Storage, Streamer { + limits: Limits; + /** * A function that will be called to start any background tasks needed by the World implementation. * For example, in the case of a queue backed World, this would start the queue processing. diff --git a/packages/world/src/limits.ts b/packages/world/src/limits.ts new file mode 100644 index 0000000000..ec155b2d8d --- /dev/null +++ b/packages/world/src/limits.ts @@ -0,0 +1,105 @@ +import { z } from 'zod'; + +export const LIMITS_NOT_IMPLEMENTED_MESSAGE = + 'Flow limits are reserved for future support and are not implemented yet.'; + +export function createLimitsNotImplementedError(): Error { + return new Error(LIMITS_NOT_IMPLEMENTED_MESSAGE); +} + +export const LimitKeySchema = z.string().min(1); +export type LimitKey = z.infer; + +export const LimitConcurrencySchema = z.object({ + max: z.number().int().positive(), +}); +export type LimitConcurrency = z.infer; + +export const LimitRateSchema = z.object({ + count: z.number().int().positive(), + periodMs: z.number().int().positive(), +}); +export type LimitRate = z.infer; + +export const LimitDefinitionSchema = z + .object({ + concurrency: LimitConcurrencySchema.optional(), + rate: LimitRateSchema.optional(), + }) + .refine( + (value) => value.concurrency !== undefined || value.rate !== undefined, + { + message: 'At least one limit must be configured', + } + ); +export type LimitDefinition = z.infer; + +export const LimitLeaseSchema = z.object({ + leaseId: 
z.string().min(1), + key: LimitKeySchema, + holderId: z.string().min(1), + acquiredAt: z.coerce.date(), + expiresAt: z.coerce.date().optional(), + definition: LimitDefinitionSchema, +}); +export type LimitLease = z.infer; + +export const LimitAcquireRequestSchema = z.object({ + key: LimitKeySchema, + holderId: z.string().min(1), + definition: LimitDefinitionSchema, + leaseTtlMs: z.number().int().positive().optional(), +}); +export type LimitAcquireRequest = z.infer; + +export const LimitBlockedReasonSchema = z.enum([ + 'concurrency', + 'rate', + 'concurrency_and_rate', +]); +export type LimitBlockedReason = z.infer; + +export const LimitAcquireStatusSchema = z.enum(['acquired', 'blocked']); +export type LimitAcquireStatus = z.infer; + +export const LimitAcquireAcquiredResultSchema = z.object({ + status: z.literal(LimitAcquireStatusSchema.enum.acquired), + lease: LimitLeaseSchema, +}); +export type LimitAcquireAcquiredResult = z.infer< + typeof LimitAcquireAcquiredResultSchema +>; + +export const LimitAcquireBlockedResultSchema = z.object({ + status: z.literal(LimitAcquireStatusSchema.enum.blocked), + reason: LimitBlockedReasonSchema, + retryAfterMs: z.number().int().nonnegative().optional(), +}); +export type LimitAcquireBlockedResult = z.infer< + typeof LimitAcquireBlockedResultSchema +>; + +export const LimitAcquireResultSchema = z.discriminatedUnion('status', [ + LimitAcquireAcquiredResultSchema, + LimitAcquireBlockedResultSchema, +]); +export type LimitAcquireResult = z.infer; + +export const LimitReleaseRequestSchema = z.object({ + leaseId: z.string().min(1), + key: LimitKeySchema.optional(), + holderId: z.string().min(1).optional(), +}); +export type LimitReleaseRequest = z.infer; + +export const LimitHeartbeatRequestSchema = z.object({ + leaseId: z.string().min(1), + ttlMs: z.number().int().positive().optional(), +}); +export type LimitHeartbeatRequest = z.infer; + +export interface Limits { + acquire(request: LimitAcquireRequest): Promise; + 
release(request: LimitReleaseRequest): Promise; + heartbeat(request: LimitHeartbeatRequest): Promise; +} From 03b016c77b67f21d3999ec08b37e1f36b86ed6f2 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Tue, 17 Mar 2026 15:31:51 -0400 Subject: [PATCH 02/16] added test stubs Signed-off-by: nathancolosimo --- packages/core/src/lock.test.ts | 19 +++++++++++++++++++ packages/world-local/src/limits.test.ts | 19 +++++++++++++++++++ packages/world-local/src/limits.ts | 2 +- packages/world-postgres/src/limits.test.ts | 19 +++++++++++++++++++ packages/world-postgres/src/limits.ts | 4 ++-- packages/world-vercel/src/limits.test.ts | 19 +++++++++++++++++++ packages/world-vercel/src/limits.ts | 2 +- packages/world/src/limits.test.ts | 19 +++++++++++++++++++ 8 files changed, 99 insertions(+), 4 deletions(-) create mode 100644 packages/core/src/lock.test.ts create mode 100644 packages/world-local/src/limits.test.ts create mode 100644 packages/world-postgres/src/limits.test.ts create mode 100644 packages/world-vercel/src/limits.test.ts create mode 100644 packages/world/src/limits.test.ts diff --git a/packages/core/src/lock.test.ts b/packages/core/src/lock.test.ts new file mode 100644 index 0000000000..3c1177b07c --- /dev/null +++ b/packages/core/src/lock.test.ts @@ -0,0 +1,19 @@ +import { describe, it } from 'vitest'; + +describe('lock', () => { + it.fails('is only callable inside workflow execution context', () => { + throw new Error('TODO: implement'); + }); + + it.fails('returns a handle with release and heartbeat behavior', () => { + throw new Error('TODO: implement'); + }); + + it.fails('allows multiple holders for one key up to the concurrency max', () => { + throw new Error('TODO: implement'); + }); + + it.fails('blocks rate-only locks until the rate window advances', () => { + throw new Error('TODO: implement'); + }); +}); diff --git a/packages/world-local/src/limits.test.ts b/packages/world-local/src/limits.test.ts new file mode 100644 index 0000000000..16ce754f8a --- 
/dev/null +++ b/packages/world-local/src/limits.test.ts @@ -0,0 +1,19 @@ +import { describe, it } from 'vitest'; + +describe('local world limits', () => { + it.fails('exposes the required limits namespace', () => { + throw new Error('TODO: implement'); + }); + + it.fails('enforces per-key concurrency limits', () => { + throw new Error('TODO: implement'); + }); + + it.fails('returns a retry path when rate limits block acquisition', () => { + throw new Error('TODO: implement'); + }); + + it.fails('restores capacity when a lease is released or expires', () => { + throw new Error('TODO: implement'); + }); +}); diff --git a/packages/world-local/src/limits.ts b/packages/world-local/src/limits.ts index 5e2f249449..68de99ccbb 100644 --- a/packages/world-local/src/limits.ts +++ b/packages/world-local/src/limits.ts @@ -1,6 +1,6 @@ import { createLimitsNotImplementedError, type Limits } from '@workflow/world'; -export function createLimits(dataDir: string, tag?: string): Limits { +export function createLimits(_dataDir: string, _tag?: string): Limits { return { async acquire() { throw createLimitsNotImplementedError(); diff --git a/packages/world-postgres/src/limits.test.ts b/packages/world-postgres/src/limits.test.ts new file mode 100644 index 0000000000..2c43f08584 --- /dev/null +++ b/packages/world-postgres/src/limits.test.ts @@ -0,0 +1,19 @@ +import { describe, it } from 'vitest'; + +describe('postgres world limits', () => { + it.fails('exposes the required limits namespace', () => { + throw new Error('TODO: implement'); + }); + + it.fails('respects the concurrency cap across concurrent acquires', () => { + throw new Error('TODO: implement'); + }); + + it.fails('wakes waiters in deterministic order when a lease is released', () => { + throw new Error('TODO: implement'); + }); + + it.fails('reclaims stale leases after worker or process death', () => { + throw new Error('TODO: implement'); + }); +}); diff --git a/packages/world-postgres/src/limits.ts 
b/packages/world-postgres/src/limits.ts index 7294a90c3b..01e8184c79 100644 --- a/packages/world-postgres/src/limits.ts +++ b/packages/world-postgres/src/limits.ts @@ -3,8 +3,8 @@ import type { PostgresWorldConfig } from './config.js'; import type { Drizzle } from './drizzle/index.js'; export function createLimits( - config: PostgresWorldConfig, - drizzle: Drizzle + _config: PostgresWorldConfig, + _drizzle: Drizzle ): Limits { return { async acquire() { diff --git a/packages/world-vercel/src/limits.test.ts b/packages/world-vercel/src/limits.test.ts new file mode 100644 index 0000000000..2afdf8af80 --- /dev/null +++ b/packages/world-vercel/src/limits.test.ts @@ -0,0 +1,19 @@ +import { describe, it } from 'vitest'; + +describe('vercel world limits', () => { + it.fails('exposes the required limits namespace', () => { + throw new Error('TODO: implement'); + }); + + it.fails('enforces per-key concurrency limits', () => { + throw new Error('TODO: implement'); + }); + + it.fails('returns a retry path when rate limits block acquisition', () => { + throw new Error('TODO: implement'); + }); + + it.fails('restores capacity when a lease is released or expires', () => { + throw new Error('TODO: implement'); + }); +}); diff --git a/packages/world-vercel/src/limits.ts b/packages/world-vercel/src/limits.ts index bff6c07ac2..785fa4886e 100644 --- a/packages/world-vercel/src/limits.ts +++ b/packages/world-vercel/src/limits.ts @@ -1,7 +1,7 @@ import { createLimitsNotImplementedError, type Limits } from '@workflow/world'; import type { APIConfig } from './utils.js'; -export function createLimits(config?: APIConfig): Limits { +export function createLimits(_config?: APIConfig): Limits { return { async acquire() { throw createLimitsNotImplementedError(); diff --git a/packages/world/src/limits.test.ts b/packages/world/src/limits.test.ts new file mode 100644 index 0000000000..8796d636ad --- /dev/null +++ b/packages/world/src/limits.test.ts @@ -0,0 +1,19 @@ +import { describe, it } from 
'vitest'; + +describe('limits schemas', () => { + it.fails('accepts concurrency-only, rate-only, and combined limit definitions', () => { + throw new Error('TODO: implement'); + }); + + it.fails('rejects invalid or empty limit definitions', () => { + throw new Error('TODO: implement'); + }); + + it.fails('discriminates acquired and blocked acquire results', () => { + throw new Error('TODO: implement'); + }); + + it.fails('keeps lease, release, and heartbeat request shapes stable', () => { + throw new Error('TODO: implement'); + }); +}); From 4b918ca431dd22a7343e067f4b2e64f3b0442c1d Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Tue, 17 Mar 2026 17:46:54 -0400 Subject: [PATCH 03/16] add e2e examples --- packages/core/e2e/e2e.test.ts | 43 ++++++++++++++++++++++++ packages/core/src/lock.test.ts | 2 +- packages/core/src/lock.ts | 3 +- workbench/example/workflows/99_e2e.ts | 48 +++++++++++++++++++++++++++ 4 files changed, 94 insertions(+), 2 deletions(-) diff --git a/packages/core/e2e/e2e.test.ts b/packages/core/e2e/e2e.test.ts index 1c9eeb8451..f24042d2a7 100644 --- a/packages/core/e2e/e2e.test.ts +++ b/packages/core/e2e/e2e.test.ts @@ -220,6 +220,9 @@ describe('e2e', () => { const isNext = process.env.APP_NAME?.includes('nextjs'); const isLocal = deploymentUrl.includes('localhost'); + const isPostgresWorld = + process.env.WORKFLOW_TARGET_WORLD === '@workflow/world-postgres'; + const isLocalWorld = isLocalDeployment() && !isPostgresWorld; // only works with framework that transpiles react and // doesn't work on Vercel due to eval hack so react isn't // bundled in function @@ -544,6 +547,46 @@ describe('e2e', () => { expect(elapsed).toBeLessThan(25_000); }); + if (isLocalWorld) { + test.fails( + 'workflowWithWorkflowAndStepLocks demonstrates workflow and step limits on local world', + { timeout: 60_000 }, + async () => { + const run = await start(await e2e('workflowWithWorkflowAndStepLocks'), [ + 'local-world', + ]); + const returnValue = await run.returnValue; + + 
expect(returnValue).toMatchObject({ + workflowKey: 'workflow:user:local-world', + dbKey: 'step:db:cheap', + aiKey: 'step:provider:openai', + summary: 'summary:profile:local-world', + }); + } + ); + } + + if (isPostgresWorld) { + test.fails( + 'workflowWithWorkflowAndStepLocks demonstrates workflow and step limits on postgres world', + { timeout: 60_000 }, + async () => { + const run = await start(await e2e('workflowWithWorkflowAndStepLocks'), [ + 'postgres-world', + ]); + const returnValue = await run.returnValue; + + expect(returnValue).toMatchObject({ + workflowKey: 'workflow:user:postgres-world', + dbKey: 'step:db:cheap', + aiKey: 'step:provider:openai', + summary: 'summary:profile:postgres-world', + }); + } + ); + } + test('nullByteWorkflow', { timeout: 60_000 }, async () => { const run = await start(await e2e('nullByteWorkflow'), []); const returnValue = await run.returnValue; diff --git a/packages/core/src/lock.test.ts b/packages/core/src/lock.test.ts index 3c1177b07c..58419ae49e 100644 --- a/packages/core/src/lock.test.ts +++ b/packages/core/src/lock.test.ts @@ -5,7 +5,7 @@ describe('lock', () => { throw new Error('TODO: implement'); }); - it.fails('returns a handle with release and heartbeat behavior', () => { + it.fails('returns a handle with dispose and heartbeat behavior', () => { throw new Error('TODO: implement'); }); diff --git a/packages/core/src/lock.ts b/packages/core/src/lock.ts index fc9a848875..b21fc6834d 100644 --- a/packages/core/src/lock.ts +++ b/packages/core/src/lock.ts @@ -21,8 +21,9 @@ export interface LockOptions extends LimitDefinition { */ export interface LockHandle extends Pick { - release(): Promise; + dispose(): Promise; heartbeat(ttlMs?: number): Promise; + [Symbol.asyncDispose](): Promise; } /** diff --git a/workbench/example/workflows/99_e2e.ts b/workbench/example/workflows/99_e2e.ts index d6caf9dcc8..13dcd7cb0b 100644 --- a/workbench/example/workflows/99_e2e.ts +++ b/workbench/example/workflows/99_e2e.ts @@ -9,6 +9,7 @@ import 
{ getStepMetadata, getWorkflowMetadata, getWritable, + lock, type RequestWithResponse, RetryableError, sleep, @@ -213,6 +214,53 @@ export async function parallelSleepWorkflow() { return { startTime, endTime }; } +async function cheapDbStep(userId: string) { + 'use step'; + + await using _dbLimit = await lock({ + key: 'step:db:cheap', + concurrency: { max: 20 }, + leaseTtlMs: 30_000, + }); + + return { + userId, + prompt: `profile:${userId}`, + }; +} + +async function expensiveAIStep(prompt: string) { + 'use step'; + + await using _aiLimit = await lock({ + key: 'step:provider:openai', + rate: { count: 10, periodMs: 60_000 }, + leaseTtlMs: 30_000, + }); + + return `summary:${prompt}`; +} + +export async function workflowWithWorkflowAndStepLocks(userId = 'user-123') { + 'use workflow'; + + await using userLimit = await lock({ + key: `workflow:user:${userId}`, + concurrency: { max: 2 }, + leaseTtlMs: 30_000, + }); + + const row = await cheapDbStep(userId); + const summary = await expensiveAIStep(row.prompt); + + return { + workflowKey: userLimit.key, + dbKey: 'step:db:cheap', + aiKey: 'step:provider:openai', + summary, + }; +} + ////////////////////////////////////////////////////////// async function nullByteStep() { From 4b9c8b675e3a31901a0ea271fa7f64b6f18b2300 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Tue, 17 Mar 2026 22:34:23 -0400 Subject: [PATCH 04/16] DCO Remediation Commit for nathancolosimo I, nathancolosimo , hereby add my Signed-off-by to this commit: b0e2f2a37bc813ec244991353f58e5885a5e8540 Signed-off-by: nathancolosimo --- packages/core/e2e/e2e.test.ts | 2 +- packages/core/src/lock.ts | 22 +- packages/core/src/runtime/step-handler.ts | 62 +++-- packages/core/src/step/context-storage.ts | 1 + packages/core/src/step/lock.ts | 78 ++++++ packages/core/src/symbols.ts | 2 + packages/core/src/workflow.ts | 5 + packages/core/src/workflow/lock.ts | 130 ++++++++++ packages/world-local/src/index.ts | 5 +- packages/world-local/src/limits.test.ts | 148 
++++++++++- packages/world-local/src/limits.ts | 301 +++++++++++++++++++++- packages/world/FLOW_LIMITS.md | 280 ++++++++++++++++++++ pnpm-lock.yaml | 13 +- workbench/nextjs-turbopack/next.config.ts | 4 +- 14 files changed, 995 insertions(+), 58 deletions(-) create mode 100644 packages/core/src/step/lock.ts create mode 100644 packages/core/src/workflow/lock.ts create mode 100644 packages/world/FLOW_LIMITS.md diff --git a/packages/core/e2e/e2e.test.ts b/packages/core/e2e/e2e.test.ts index f24042d2a7..86325b1c58 100644 --- a/packages/core/e2e/e2e.test.ts +++ b/packages/core/e2e/e2e.test.ts @@ -548,7 +548,7 @@ describe('e2e', () => { }); if (isLocalWorld) { - test.fails( + test( 'workflowWithWorkflowAndStepLocks demonstrates workflow and step limits on local world', { timeout: 60_000 }, async () => { diff --git a/packages/core/src/lock.ts b/packages/core/src/lock.ts index b21fc6834d..11829957d0 100644 --- a/packages/core/src/lock.ts +++ b/packages/core/src/lock.ts @@ -4,6 +4,7 @@ import { type LimitKey, type LimitLease, } from '@workflow/world'; +import { STEP_LOCK, WORKFLOW_LOCK } from './symbols.js'; export { LIMITS_NOT_IMPLEMENTED_MESSAGE } from '@workflow/world'; @@ -28,10 +29,23 @@ export interface LockHandle /** * Reserved workflow API for future concurrency and rate limiting. - * - * This placeholder intentionally throws until the runtime and world - * implementations gain real support. 
*/ -export async function lock(_options: LockOptions): Promise { +export async function lock(options: LockOptions): Promise { + const workflowLock = (globalThis as any)[WORKFLOW_LOCK] as + | ((options: LockOptions) => Promise) + | undefined; + + if (workflowLock) { + return workflowLock(options); + } + + const stepLock = (globalThis as any)[STEP_LOCK] as + | ((options: LockOptions) => Promise) + | undefined; + + if (stepLock) { + return stepLock(options); + } + throw createLimitsNotImplementedError(); } diff --git a/packages/core/src/runtime/step-handler.ts b/packages/core/src/runtime/step-handler.ts index 8e7f01b983..44c585315a 100644 --- a/packages/core/src/runtime/step-handler.ts +++ b/packages/core/src/runtime/step-handler.ts @@ -16,6 +16,8 @@ import { hydrateStepArguments, } from '../serialization.js'; import { contextStorage } from '../step/context-storage.js'; +import { createStepLock } from '../step/lock.js'; +import { STEP_LOCK } from '../symbols.js'; import * as Attribute from '../telemetry/semantic-conventions.js'; import { getSpanKind, @@ -117,7 +119,7 @@ const stepHandler = getWorldHandlers().createQueueHandler( // - Step not in terminal state (returns 409) // - retryAfter timestamp reached (returns 425 with Retry-After header) // - Workflow still active (returns 410 if completed) - let step; + let step: Awaited>; try { const startResult = await world.events.create( workflowRunId, @@ -384,31 +386,43 @@ const stepHandler = getWorldHandlers().createQueueHandler( const executionStartTime = Date.now(); try { + const previousStepLock = (globalThis as any)[STEP_LOCK]; + (globalThis as any)[STEP_LOCK] = createStepLock(world); + result = await trace('step.execute', {}, async () => { - return await contextStorage.run( - { - stepMetadata: { - stepName, - stepId, - stepStartedAt: new Date(+stepStartedAt), - attempt, - }, - workflowMetadata: { - workflowName, - workflowRunId, - workflowStartedAt: new Date(+workflowStartedAt), - // TODO: there should be a getUrl 
method on the world interface itself. This - // solution only works for vercel + local worlds. - url: isVercel - ? `https://${process.env.VERCEL_URL}` - : `http://localhost:${port ?? 3000}`, + try { + return await contextStorage.run( + { + stepMetadata: { + stepName, + stepId, + stepStartedAt: new Date(+stepStartedAt), + attempt, + }, + workflowMetadata: { + workflowName, + workflowRunId, + workflowStartedAt: new Date(+workflowStartedAt), + // TODO: there should be a getUrl method on the world interface itself. This + // solution only works for vercel + local worlds. + url: isVercel + ? `https://${process.env.VERCEL_URL}` + : `http://localhost:${port ?? 3000}`, + }, + ops, + closureVars: hydratedInput.closureVars, + encryptionKey, + lockCounter: 0, }, - ops, - closureVars: hydratedInput.closureVars, - encryptionKey, - }, - () => stepFn.apply(thisVal, args) - ); + () => stepFn.apply(thisVal, args) + ); + } finally { + if (previousStepLock === undefined) { + delete (globalThis as any)[STEP_LOCK]; + } else { + (globalThis as any)[STEP_LOCK] = previousStepLock; + } + } }); } catch (err) { userCodeError = err; diff --git a/packages/core/src/step/context-storage.ts b/packages/core/src/step/context-storage.ts index 2a9aa8b7e1..dadb25b132 100644 --- a/packages/core/src/step/context-storage.ts +++ b/packages/core/src/step/context-storage.ts @@ -9,4 +9,5 @@ export const contextStorage = /* @__PURE__ */ new AsyncLocalStorage<{ ops: Promise[]; closureVars?: Record; encryptionKey?: CryptoKey; + lockCounter: number; }>(); diff --git a/packages/core/src/step/lock.ts b/packages/core/src/step/lock.ts new file mode 100644 index 0000000000..7451b4e712 --- /dev/null +++ b/packages/core/src/step/lock.ts @@ -0,0 +1,78 @@ +import type { LimitLease, World } from '@workflow/world'; +import type { LockHandle, LockOptions } from '../lock.js'; +import { contextStorage } from './context-storage.js'; + +function createStepLockHandle(lease: LimitLease, world: World): LockHandle { + let 
currentLease = lease; + let disposed = false; + + const dispose = async () => { + if (disposed) return; + disposed = true; + await world.limits.release({ + leaseId: currentLease.leaseId, + key: currentLease.key, + holderId: currentLease.holderId, + }); + }; + + const heartbeat = async (ttlMs?: number) => { + currentLease = await world.limits.heartbeat({ + leaseId: currentLease.leaseId, + ttlMs, + }); + }; + + return { + get leaseId() { + return currentLease.leaseId; + }, + get key() { + return currentLease.key; + }, + get holderId() { + return currentLease.holderId; + }, + get expiresAt() { + return currentLease.expiresAt; + }, + dispose, + heartbeat, + [Symbol.asyncDispose]: dispose, + }; +} + +export function createStepLock(world: World) { + return async function lockInStep(options: LockOptions): Promise { + const store = contextStorage.getStore(); + if (!store) { + throw new Error( + '`lock()` can only be called inside a workflow or step function' + ); + } + + const lockIndex = store.lockCounter++; + const holderId = `stplock_${store.workflowMetadata.workflowRunId}:${store.stepMetadata.stepId}:${lockIndex}`; + const definition = { + concurrency: options.concurrency, + rate: options.rate, + }; + + while (true) { + const result = await world.limits.acquire({ + key: options.key, + holderId, + definition, + leaseTtlMs: options.leaseTtlMs, + }); + + if (result.status === 'acquired') { + return createStepLockHandle(result.lease, world); + } + + await new Promise((resolve) => + setTimeout(resolve, result.retryAfterMs || 1000) + ); + } + }; +} diff --git a/packages/core/src/symbols.ts b/packages/core/src/symbols.ts index 92df4058db..cd9616b17e 100644 --- a/packages/core/src/symbols.ts +++ b/packages/core/src/symbols.ts @@ -1,6 +1,8 @@ export const WORKFLOW_USE_STEP = Symbol.for('WORKFLOW_USE_STEP'); export const WORKFLOW_CREATE_HOOK = Symbol.for('WORKFLOW_CREATE_HOOK'); export const WORKFLOW_SLEEP = Symbol.for('WORKFLOW_SLEEP'); +export const WORKFLOW_LOCK = 
Symbol.for('WORKFLOW_LOCK'); +export const STEP_LOCK = Symbol.for('STEP_LOCK'); export const WORKFLOW_CONTEXT = Symbol.for('WORKFLOW_CONTEXT'); export const WORKFLOW_GET_STREAM_ID = Symbol.for('WORKFLOW_GET_STREAM_ID'); export const STABLE_ULID = Symbol.for('WORKFLOW_STABLE_ULID'); diff --git a/packages/core/src/workflow.ts b/packages/core/src/workflow.ts index 5d18c085b4..ece1823196 100644 --- a/packages/core/src/workflow.ts +++ b/packages/core/src/workflow.ts @@ -22,6 +22,7 @@ import { STABLE_ULID, WORKFLOW_CREATE_HOOK, WORKFLOW_GET_STREAM_ID, + WORKFLOW_LOCK, WORKFLOW_SLEEP, WORKFLOW_USE_STEP, } from './symbols.js'; @@ -32,6 +33,7 @@ import { createContext } from './vm/index.js'; import type { WorkflowMetadata } from './workflow/get-workflow-metadata.js'; import { WORKFLOW_CONTEXT_SYMBOL } from './workflow/get-workflow-metadata.js'; import { createCreateHook } from './workflow/hook.js'; +import { createLock } from './workflow/lock.js'; import { createSleep } from './workflow/sleep.js'; /** @@ -184,6 +186,7 @@ export async function runWorkflow( const useStep = createUseStep(workflowContext); const createHook = createCreateHook(workflowContext); + const lock = createLock(workflowContext); const sleep = createSleep(workflowContext); // @ts-expect-error - `@types/node` says symbol is not valid, but it does work @@ -191,6 +194,8 @@ export async function runWorkflow( // @ts-expect-error - `@types/node` says symbol is not valid, but it does work vmGlobalThis[WORKFLOW_CREATE_HOOK] = createHook; // @ts-expect-error - `@types/node` says symbol is not valid, but it does work + vmGlobalThis[WORKFLOW_LOCK] = lock; + // @ts-expect-error - `@types/node` says symbol is not valid, but it does work vmGlobalThis[WORKFLOW_SLEEP] = sleep; // @ts-expect-error - `@types/node` says symbol is not valid, but it does work vmGlobalThis[WORKFLOW_GET_STREAM_ID] = (namespace?: string) => diff --git a/packages/core/src/workflow/lock.ts b/packages/core/src/workflow/lock.ts new file mode 100644 
index 0000000000..21db74d825 --- /dev/null +++ b/packages/core/src/workflow/lock.ts @@ -0,0 +1,130 @@ +import { EventConsumerResult } from '../events-consumer.js'; +import { WorkflowSuspension } from '../global.js'; +import type { LockHandle, LockOptions } from '../lock.js'; +import { + scheduleWhenIdle, + type WorkflowOrchestratorContext, +} from '../private.js'; +import { getWorld } from '../runtime/world.js'; + +function createLockHandle( + lease: { + leaseId: string; + key: string; + holderId: string; + expiresAt?: Date; + }, + ctx: WorkflowOrchestratorContext +): LockHandle { + let currentLease = lease; + let disposed = false; + + const dispose = async () => { + if (disposed) return; + disposed = true; + await getWorld().limits.release({ + leaseId: currentLease.leaseId, + key: currentLease.key, + holderId: currentLease.holderId, + }); + }; + + const heartbeat = async (ttlMs?: number) => { + currentLease = await getWorld().limits.heartbeat({ + leaseId: currentLease.leaseId, + ttlMs, + }); + }; + + const handle: LockHandle = { + get leaseId() { + return currentLease.leaseId; + }, + get key() { + return currentLease.key; + }, + get holderId() { + return currentLease.holderId; + }, + get expiresAt() { + return currentLease.expiresAt; + }, + dispose, + heartbeat, + [Symbol.asyncDispose]: dispose, + }; + + const vmAsyncDispose = ctx.globalThis.Symbol.asyncDispose; + if (vmAsyncDispose && vmAsyncDispose !== Symbol.asyncDispose) { + (handle as any)[vmAsyncDispose] = dispose; + } + + return handle; +} + +export function createLock(ctx: WorkflowOrchestratorContext) { + return async function lockImpl(options: LockOptions): Promise { + const holderId = `wflock_${ctx.generateUlid()}`; + const definition = { + concurrency: options.concurrency, + rate: options.rate, + }; + + while (true) { + const result = await getWorld().limits.acquire({ + key: options.key, + holderId, + definition, + leaseTtlMs: options.leaseTtlMs, + }); + + if (result.status === 'acquired') { + return 
createLockHandle(result.lease, ctx); + } + + const correlationId = `wflock_wait_${ctx.generateUlid()}`; + const resumeAt = new Date(Date.now() + (result.retryAfterMs || 1000)); + ctx.invocationsQueue.set(correlationId, { + type: 'wait', + correlationId, + resumeAt, + }); + + await new Promise((resolve) => { + ctx.eventsConsumer.subscribe((event) => { + if (!event) { + scheduleWhenIdle(ctx, () => { + ctx.onWorkflowError( + new WorkflowSuspension(ctx.invocationsQueue, ctx.globalThis) + ); + }); + return EventConsumerResult.NotConsumed; + } + + if (event.correlationId !== correlationId) { + return EventConsumerResult.NotConsumed; + } + + if (event.eventType === 'wait_created') { + const queueItem = ctx.invocationsQueue.get(correlationId); + if (queueItem && queueItem.type === 'wait') { + queueItem.hasCreatedEvent = true; + queueItem.resumeAt = event.eventData.resumeAt; + } + return EventConsumerResult.Consumed; + } + + if (event.eventType === 'wait_completed') { + ctx.invocationsQueue.delete(correlationId); + ctx.promiseQueue = ctx.promiseQueue.then(() => { + resolve(); + }); + return EventConsumerResult.Finished; + } + + return EventConsumerResult.NotConsumed; + }); + }); + } + }; +} diff --git a/packages/world-local/src/index.ts b/packages/world-local/src/index.ts index 96f03efa57..029154649d 100644 --- a/packages/world-local/src/index.ts +++ b/packages/world-local/src/index.ts @@ -14,8 +14,8 @@ import { import { initDataDir } from './init.js'; import { createLimits } from './limits.js'; import { createQueue, type DirectHandler } from './queue.js'; -import { createStorage } from './storage.js'; import { hashToken } from './storage/helpers.js'; +import { createStorage } from './storage.js'; import { createStreamer } from './streamer.js'; // Re-export init types and utilities for consumers @@ -28,7 +28,7 @@ export { parseVersion, } from './init.js'; -export { type DirectHandler } from './queue.js'; +export type { DirectHandler } from './queue.js'; export type 
LocalWorld = World & { /** Register a direct in-process handler for a queue prefix, bypassing HTTP. */ @@ -104,6 +104,7 @@ export function createLocalWorld(args?: Partial): LocalWorld { 'steps', 'events', 'hooks', + 'limits', 'waits', 'streams/runs', ]; diff --git a/packages/world-local/src/limits.test.ts b/packages/world-local/src/limits.test.ts index 16ce754f8a..1db72676af 100644 --- a/packages/world-local/src/limits.test.ts +++ b/packages/world-local/src/limits.test.ts @@ -1,19 +1,149 @@ -import { describe, it } from 'vitest'; +import { mkdtemp, rm } from 'node:fs/promises'; +import os from 'node:os'; +import path from 'node:path'; +import { setTimeout as sleep } from 'node:timers/promises'; +import { describe, expect, it } from 'vitest'; +import { createLocalWorld } from './index.js'; +import { createLimits } from './limits.js'; + +async function withTempDir(fn: (dir: string) => Promise): Promise { + const dir = await mkdtemp(path.join(os.tmpdir(), 'workflow-limits-')); + try { + return await fn(dir); + } finally { + await rm(dir, { recursive: true, force: true }); + } +} describe('local world limits', () => { - it.fails('exposes the required limits namespace', () => { - throw new Error('TODO: implement'); + it('exposes the required limits namespace', async () => { + await withTempDir(async (dir) => { + const world = createLocalWorld({ dataDir: dir }); + expect(world.limits).toBeDefined(); + expect(typeof world.limits.acquire).toBe('function'); + expect(typeof world.limits.release).toBe('function'); + expect(typeof world.limits.heartbeat).toBe('function'); + await world.close?.(); + }); }); - it.fails('enforces per-key concurrency limits', () => { - throw new Error('TODO: implement'); + it('enforces per-key concurrency limits', async () => { + await withTempDir(async (dir) => { + const limits = createLimits(dir); + + const first = await limits.acquire({ + key: 'step:db:cheap', + holderId: 'holder-a', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 
1_000, + }); + + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') { + throw new Error('expected first lease to be acquired'); + } + + const second = await limits.acquire({ + key: 'step:db:cheap', + holderId: 'holder-b', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + + expect(second).toMatchObject({ + status: 'blocked', + reason: 'concurrency', + }); + + await limits.release({ + leaseId: first.lease.leaseId, + key: first.lease.key, + holderId: first.lease.holderId, + }); + + const third = await limits.acquire({ + key: 'step:db:cheap', + holderId: 'holder-b', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + + expect(third.status).toBe('acquired'); + }); }); - it.fails('returns a retry path when rate limits block acquisition', () => { - throw new Error('TODO: implement'); + it('returns a retry path when rate limits block acquisition', async () => { + await withTempDir(async (dir) => { + const limits = createLimits(dir); + + const first = await limits.acquire({ + key: 'step:provider:openai', + holderId: 'holder-a', + definition: { rate: { count: 1, periodMs: 100 } }, + leaseTtlMs: 1_000, + }); + + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') { + throw new Error('expected first lease to be acquired'); + } + + await limits.release({ + leaseId: first.lease.leaseId, + key: first.lease.key, + holderId: first.lease.holderId, + }); + + const second = await limits.acquire({ + key: 'step:provider:openai', + holderId: 'holder-b', + definition: { rate: { count: 1, periodMs: 100 } }, + leaseTtlMs: 1_000, + }); + + expect(second.status).toBe('blocked'); + if (second.status !== 'blocked') { + throw new Error('expected second acquire to be blocked'); + } + expect(second.reason).toBe('rate'); + expect(second.retryAfterMs).toBeGreaterThanOrEqual(0); + }); }); - it.fails('restores capacity when a lease is released or expires', () => { - throw new Error('TODO: implement'); + it('restores 
capacity when a lease is released or expires', async () => { + await withTempDir(async (dir) => { + const limits = createLimits(dir); + + const first = await limits.acquire({ + key: 'workflow:user:123', + holderId: 'holder-a', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 25, + }); + + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') { + throw new Error('expected first lease to be acquired'); + } + + const heartbeat = await limits.heartbeat({ + leaseId: first.lease.leaseId, + ttlMs: 50, + }); + expect(heartbeat.expiresAt?.getTime()).toBeGreaterThan( + first.lease.expiresAt?.getTime() ?? 0 + ); + + await sleep(60); + + const second = await limits.acquire({ + key: 'workflow:user:123', + holderId: 'holder-b', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + + expect(second.status).toBe('acquired'); + }); }); }); diff --git a/packages/world-local/src/limits.ts b/packages/world-local/src/limits.ts index 68de99ccbb..9dfac5d931 100644 --- a/packages/world-local/src/limits.ts +++ b/packages/world-local/src/limits.ts @@ -1,15 +1,300 @@ -import { createLimitsNotImplementedError, type Limits } from '@workflow/world'; +import path from 'node:path'; +import { WorkflowAPIError } from '@workflow/errors'; +import { + LimitAcquireRequestSchema, + type LimitAcquireResult, + LimitHeartbeatRequestSchema, + type LimitLease, + LimitLeaseSchema, + LimitReleaseRequestSchema, + type Limits, +} from '@workflow/world'; +import { z } from 'zod'; +import { readJSON, writeJSON } from './fs.js'; +import { monotonicUlid } from './storage/helpers.js'; + +const LimitTokenSchema = z.object({ + tokenId: z.string(), + holderId: z.string(), + acquiredAt: z.coerce.date(), + expiresAt: z.coerce.date(), +}); + +const KeyStateSchema = z.object({ + key: z.string(), + leases: z.array(LimitLeaseSchema), + tokens: z.array(LimitTokenSchema), +}); + +const LimitsStateSchema = z.object({ + version: z.literal(1), + keys: z.record(z.string(), 
KeyStateSchema), +}); + +type LimitToken = z.infer; +type KeyState = z.infer; +type LimitsState = z.infer; + +const EMPTY_STATE: LimitsState = { + version: 1, + keys: {}, +}; + +function getStatePath(dataDir: string, tag?: string): string { + return path.join(dataDir, 'limits', tag ? `state.${tag}.json` : 'state.json'); +} + +function cloneToken(token: LimitToken): LimitToken { + return { ...token }; +} + +function cloneState(state: LimitsState): LimitsState { + return { + version: 1, + keys: Object.fromEntries( + Object.entries(state.keys).map(([key, keyState]) => [ + key, + { + key: keyState.key, + leases: keyState.leases.map((lease) => ({ ...lease })), + tokens: keyState.tokens.map(cloneToken), + }, + ]) + ), + }; +} + +function pruneKeyState(keyState: KeyState, now = Date.now()): KeyState { + return { + key: keyState.key, + leases: keyState.leases.filter( + (lease) => + lease.expiresAt === undefined || lease.expiresAt.getTime() > now + ), + tokens: keyState.tokens.filter((token) => token.expiresAt.getTime() > now), + }; +} + +function getBlockedReason( + concurrencyBlocked: boolean, + rateBlocked: boolean +): 'concurrency' | 'rate' | 'concurrency_and_rate' { + if (concurrencyBlocked && rateBlocked) return 'concurrency_and_rate'; + if (concurrencyBlocked) return 'concurrency'; + return 'rate'; +} + +function getRetryAfterMs( + keyState: KeyState, + now: number, + concurrencyBlocked: boolean, + rateBlocked: boolean +): number | undefined { + const candidates: number[] = []; + + if (concurrencyBlocked) { + for (const lease of keyState.leases) { + if (lease.expiresAt) { + candidates.push(Math.max(0, lease.expiresAt.getTime() - now)); + } + } + } + + if (rateBlocked) { + for (const token of keyState.tokens) { + candidates.push(Math.max(0, token.expiresAt.getTime() - now)); + } + } + + if (candidates.length === 0) { + return undefined; + } + + return Math.min(...candidates); +} + +export function createLimits(dataDir: string, tag?: string): Limits { + const statePath 
= getStatePath(dataDir, tag); + let stateOp = Promise.resolve(); + + // This block is an in-process async mutex / operation queue. + // stateOp starts as an already-resolved promise. + // Each call to withStateLock() chains a new operation onto the tail of that promise. + // Because every new operation waits for the previous one, reads/modifies/writes to the limits state file happen serially. + const withStateLock = async (fn: () => Promise): Promise => { + const run = stateOp.then(fn, fn); + stateOp = run.then( + () => undefined, + () => undefined + ); + return run; + }; + + const readState = async (): Promise => { + return ( + (await readJSON(statePath, LimitsStateSchema)) ?? cloneState(EMPTY_STATE) + ); + }; + + const writeState = async (state: LimitsState): Promise => { + await writeJSON(statePath, state, { overwrite: true }); + }; -export function createLimits(_dataDir: string, _tag?: string): Limits { return { - async acquire() { - throw createLimitsNotImplementedError(); + async acquire(request) { + const parsed = LimitAcquireRequestSchema.parse(request); + + return withStateLock(async (): Promise => { + const state = cloneState(await readState()); + const now = new Date(); + const nowMs = now.getTime(); + const keyState = pruneKeyState( + state.keys[parsed.key] ?? 
{ + key: parsed.key, + leases: [], + tokens: [], + }, + nowMs + ); + + const existingLease = keyState.leases.find( + (lease) => lease.holderId === parsed.holderId + ); + if (existingLease) { + state.keys[parsed.key] = keyState; + await writeState(state); + return { + status: 'acquired', + lease: existingLease, + }; + } + + const concurrencyBlocked = + parsed.definition.concurrency !== undefined && + keyState.leases.length >= parsed.definition.concurrency.max; + const rateBlocked = + parsed.definition.rate !== undefined && + keyState.tokens.length >= parsed.definition.rate.count; + + if (concurrencyBlocked || rateBlocked) { + state.keys[parsed.key] = keyState; + await writeState(state); + return { + status: 'blocked', + reason: getBlockedReason(concurrencyBlocked, rateBlocked), + retryAfterMs: getRetryAfterMs( + keyState, + nowMs, + concurrencyBlocked, + rateBlocked + ), + }; + } + + const lease: LimitLease = { + leaseId: `lmt_${monotonicUlid()}`, + key: parsed.key, + holderId: parsed.holderId, + acquiredAt: now, + expiresAt: + parsed.leaseTtlMs !== undefined + ? 
new Date(nowMs + parsed.leaseTtlMs) + : undefined, + definition: parsed.definition, + }; + + keyState.leases.push(lease); + + if (parsed.definition.rate) { + keyState.tokens.push({ + tokenId: `lmttok_${monotonicUlid()}`, + holderId: parsed.holderId, + acquiredAt: now, + expiresAt: new Date(nowMs + parsed.definition.rate.periodMs), + }); + } + + state.keys[parsed.key] = keyState; + await writeState(state); + + return { + status: 'acquired', + lease, + }; + }); }, - async release() { - throw createLimitsNotImplementedError(); + + async release(request) { + const parsed = LimitReleaseRequestSchema.parse(request); + + await withStateLock(async () => { + const state = cloneState(await readState()); + + for (const [key, keyStateValue] of Object.entries(state.keys)) { + const keyState = pruneKeyState(keyStateValue); + const nextLeases = keyState.leases.filter((lease) => { + if (lease.leaseId !== parsed.leaseId) return true; + if (parsed.key && lease.key !== parsed.key) return true; + if (parsed.holderId && lease.holderId !== parsed.holderId) { + return true; + } + return false; + }); + + state.keys[key] = { + ...keyState, + leases: nextLeases, + }; + + if ( + state.keys[key].leases.length === 0 && + state.keys[key].tokens.length === 0 + ) { + delete state.keys[key]; + } + } + + await writeState(state); + }); }, - async heartbeat() { - throw createLimitsNotImplementedError(); + + async heartbeat(request) { + const parsed = LimitHeartbeatRequestSchema.parse(request); + + return withStateLock(async () => { + const state = cloneState(await readState()); + const now = Date.now(); + + for (const [key, keyStateValue] of Object.entries(state.keys)) { + const keyState = pruneKeyState(keyStateValue, now); + const leaseIndex = keyState.leases.findIndex( + (lease) => lease.leaseId === parsed.leaseId + ); + + if (leaseIndex === -1) { + state.keys[key] = keyState; + continue; + } + + const lease = keyState.leases[leaseIndex]; + const currentExpiry = lease.expiresAt?.getTime(); + const 
ttlMs = + parsed.ttlMs ?? (currentExpiry ? currentExpiry - now : 30_000); + const updatedLease: LimitLease = { + ...lease, + expiresAt: new Date(now + Math.max(1, ttlMs)), + }; + + keyState.leases[leaseIndex] = updatedLease; + state.keys[key] = keyState; + await writeState(state); + return updatedLease; + } + + throw new WorkflowAPIError(`Lease "${parsed.leaseId}" not found`, { + status: 404, + }); + }); }, }; } diff --git a/packages/world/FLOW_LIMITS.md b/packages/world/FLOW_LIMITS.md new file mode 100644 index 0000000000..df5d3275c8 --- /dev/null +++ b/packages/world/FLOW_LIMITS.md @@ -0,0 +1,280 @@ +# Planning to delete after PR is implemented / ready to merge + +# Flow Limits Design Notes + +This note summarizes the current direction for flow concurrency and rate limiting +across `@workflow/core`, `@workflow/world`, and concrete world implementations. + +## Status + +- The shared `limits` interface and `lock()` API surface now exist. +- Local world has an initial working implementation for acquire/release/heartbeat. +- Postgres and Vercel worlds still expose `limits` as stubs. +- There is a real local E2E example for workflow and step locks in the Next.js Turbopack workbench. + +## Goals + +- Support keyed concurrency limits. +- Support keyed rate limits. +- Allow concurrency and rate to be colocated in one interface. +- Support workflow-scoped limits and step-scoped limits. +- Make crash recovery possible through leases with TTL/expiry. +- Keep worker throughput controls separate from business-level flow limits. + +## Core Terms + +- `worker concurrency`: backend throughput setting for queue/job processing. +- `workflow limit`: admission control for workflow runs that share a key. +- `step limit`: execution control for a specific step/resource key. +- `lease`: durable record that a workflow or step currently occupies capacity for a key. + +## Decisions So Far + +### 1. 
Use one shared limits model + +The shared world interface uses a single `limits` namespace and a single limit +definition shape that can contain either or both: + +- `concurrency` +- `rate` + +This allows one key to express: + +- concurrency only +- rate only +- both together + +### 2. Use leases, not plain mutexes + +Limits are modeled as leases with TTL/expiry so capacity can be recovered after: + +- worker crashes +- process death +- machine shutdown +- lost retries + +Normal completion should dispose/release the lease explicitly. Crash recovery +comes from lease expiry plus future reclaim logic. + +### 3. Keep worker concurrency separate from flow limits + +Current world-level concurrency settings are infrastructure controls, not +business-level locking: + +- local world: `WORKFLOW_LOCAL_QUEUE_CONCURRENCY` +- postgres world: `WORKFLOW_POSTGRES_WORKER_CONCURRENCY` + +These control how many queue jobs can be processed at once. They should remain +independent from flow limits like: + +- `workflow:user:123` +- `step:db:cheap` +- `step:provider:openai` + +### 4. Use a sliding-window model for rate limits in v1 + +The current rate-limit model is a sliding-window log model, not a token bucket. + +For a limit like: + +- `rate: { count: 10, periodMs: 60_000 }` + +the intended semantics are: + +- allow at most 10 successful acquires in the last 60 seconds +- each successful acquire records a timestamped rate usage entry +- rate capacity returns only when that entry ages out of the window + +This is simpler than a token bucket and matches the current local-world +implementation direction well. + +Important distinction: + +- `lease`: active occupancy / ownership for a holder +- `token`: internal rate-usage record that remains until the rate window expires + +Releasing a lease should free concurrency capacity immediately, but it should +not restore rate capacity until the associated rate usage entry expires. + +### 5. 
Use one `lock()` API in both workflows and steps + +We want one user-facing primitive: + +```ts +await using lease = await lock({ ... }); +``` + +But the runtime meaning differs by context. + +#### In workflows + +`lock()` means workflow admission / workflow-scope ownership. + +If placed at the top of a workflow, it should hold the lease across the logical +workflow scope, even though the workflow may suspend and resume many times. + +#### In steps + +`lock()` should act like a step gate. + +The intended long-term behavior is: + +- declare the limit at the top of the step +- runtime/compiler hoists or interprets it as a pre-step requirement +- the step should not occupy a worker just waiting for capacity +- lease is disposed automatically when the step attempt completes + +This means step `lock()` is conceptually the same API, but not a literal +"block inside already-running user step code" implementation. + +### 6. `await using` is the preferred user-facing shape + +The preferred API is explicit resource management: + +```ts +await using lease = await lock({ ... }); +``` + +This gives automatic cleanup on scope exit and reads well for both workflow +scopes and step scopes. + +For manual early cleanup, the user-facing `LockHandle` should expose: + +- `dispose()` +- `[Symbol.asyncDispose]()` + +The backend-facing world contract can continue to use `release(...)` internally. + +### 7. Workflow-scoped locks are logical-scope locks, not request-lifetime locks + +For workflows, `await using` must be tied to the logical workflow scope across: + +- step round trips +- queue turns +- sleeps +- hooks +- replay/resume + +The lease must not be disposed merely because one host process invocation ends. + +### 8. 
Prefer one-way lock ordering (Option B) for deadlock avoidance + +Current preferred model: + +- workflow-level limits may be held by a run +- step-level limits are acquired only at step boundaries +- step-level limits are short-lived +- step code should not acquire additional locks dynamically +- step execution should not wait on workflow-level locks + +This keeps the dependency direction one-way: + +- workflow admission -> step admission -> step execution + +That avoids the classic cycle where one workflow holds a workflow lock and +another holds a step lock and each waits on the other. + +### 9. V1 semantics are intentionally opinionated + +For v1, the intended semantics are: + +- workflow locks count admitted, in-flight workflows for a key +- step locks count or rate-limit specific step execution categories +- worker concurrency remains a separate infrastructure throttle + +More concretely: + +- if a workflow acquires a workflow-scoped lock and then sleeps for 10 minutes, + it still counts as active for that workflow key during the sleep +- if a workflow is parked waiting for a step-level limit, it still counts as + active for its workflow-level lock +- a step-level lock should conceptually be an admission gate for the step + attempt, not a second workflow-level lock +- step-level rate limits should consume rate capacity when the step starts, and + that rate usage should remain counted until the window expires even if the + step releases its lease quickly + +For the current local implementation specifically: + +- workflow locks already behave like durable logical-scope leases +- step locks currently use in-process retry polling once the step is already + executing, which is acceptable for local v1 but not the ideal long-term + admission model + +This means the current v1 interpretation of a workflow lock is: + +- "How many workflows for this key are admitted and in flight at all?" + +not: + +- "How many workflows are actively burning CPU right this instant?" 
+ +## Current Example Shape + +The current placeholder E2E example models: + +- workflow-level user concurrency: + - `workflow:user:${userId}` +- step-level DB concurrency: + - `step:db:cheap` +- step-level AI rate limit: + - `step:provider:openai` + +With intended usage like: + +```ts +async function cheapDbStep(userId: string) { + 'use step'; + await using _dbLimit = await lock({ + key: 'step:db:cheap', + concurrency: { max: 20 }, + }); + return { userId, prompt: `profile:${userId}` }; +} + +async function expensiveAIStep(prompt: string) { + 'use step'; + await using _aiLimit = await lock({ + key: 'step:provider:openai', + rate: { count: 10, periodMs: 60_000 }, + }); + return `summary:${prompt}`; +} + +export async function workflowWithWorkflowAndStepLocks(userId: string) { + 'use workflow'; + await using userLimit = await lock({ + key: `workflow:user:${userId}`, + concurrency: { max: 2 }, + }); + + const row = await cheapDbStep(userId); + const summary = await expensiveAIStep(row.prompt); + return { row, summary }; +} +``` + +## Important Clarification + +Flow limits and worker concurrency are different layers. + +For example: + +- a cheap DB step may continue making progress even while an expensive AI step + is rate-limited +- the main shared coupling between them is the worker pool +- if workers are available, unrelated step categories should continue + +So overall system throughput is not one simple global minimum. Different +workflow paths may be bottlenecked by different limits at different times. + +## Open Questions + +- Exact runtime/compiler behavior for step-scoped `lock()` hoisting. +- Whether workflow-level locks should always be whole-run admission locks or + also support narrower workflow-scoped blocks. +- Whether `heartbeat()` should remain user-visible or become mostly internal. +- Whether step limits should only be expressed through `lock()` or also through + step metadata/config sugar. 
+- Fairness/wake-up policy for waiters per key in local and Postgres worlds. +- Exact event-log representation for acquire/block/dispose transitions. diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 1e644f7416..5ec582352c 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1284,6 +1284,9 @@ importers: '@workflow/tsconfig': specifier: workspace:* version: link:../tsconfig + vitest: + specifier: 'catalog:' + version: 4.0.18(@opentelemetry/api@1.9.0)(@types/node@22.19.0)(jiti@2.6.1)(jsdom@26.1.0)(lightningcss@1.30.2)(terser@5.44.0)(tsx@4.20.6)(yaml@2.8.1) zod: specifier: 'catalog:' version: 4.3.6 @@ -23456,14 +23459,6 @@ snapshots: optionalDependencies: vite: 7.1.12(@types/node@22.19.0)(jiti@2.6.1)(lightningcss@1.30.2)(terser@5.44.0)(tsx@4.20.6)(yaml@2.8.1) - '@vitest/mocker@4.0.18(vite@7.1.12(@types/node@22.19.0)(jiti@2.6.1)(lightningcss@1.30.2)(terser@5.44.0)(tsx@4.20.6)(yaml@2.8.1))': - dependencies: - '@vitest/spy': 4.0.18 - estree-walker: 3.0.3 - magic-string: 0.30.21 - optionalDependencies: - vite: 7.1.12(@types/node@22.19.0)(jiti@2.6.1)(lightningcss@1.30.2)(terser@5.44.0)(tsx@4.20.6)(yaml@2.8.1) - '@vitest/mocker@4.0.18(vite@7.1.12(@types/node@24.6.2)(jiti@2.6.1)(lightningcss@1.30.2)(terser@5.44.0)(tsx@4.20.6)(yaml@2.8.1))': dependencies: '@vitest/spy': 4.0.18 @@ -32855,7 +32850,7 @@ snapshots: vitest@4.0.18(@opentelemetry/api@1.9.0)(@types/node@22.19.0)(jiti@2.6.1)(jsdom@26.1.0)(lightningcss@1.30.2)(terser@5.44.0)(tsx@4.20.6)(yaml@2.8.1): dependencies: '@vitest/expect': 4.0.18 - '@vitest/mocker': 4.0.18(vite@7.1.12(@types/node@22.19.0)(jiti@2.6.1)(lightningcss@1.30.2)(terser@5.44.0)(tsx@4.20.6)(yaml@2.8.1)) + '@vitest/mocker': 4.0.18(vite@7.1.12(@types/node@24.6.2)(jiti@2.6.1)(lightningcss@1.30.2)(terser@5.44.0)(tsx@4.20.6)(yaml@2.8.1)) '@vitest/pretty-format': 4.0.18 '@vitest/runner': 4.0.18 '@vitest/snapshot': 4.0.18 diff --git a/workbench/nextjs-turbopack/next.config.ts b/workbench/nextjs-turbopack/next.config.ts index 78df6b2090..5d1a204118 100644 --- 
a/workbench/nextjs-turbopack/next.config.ts +++ b/workbench/nextjs-turbopack/next.config.ts @@ -1,7 +1,9 @@ -import type { NextConfig } from 'next'; import path from 'node:path'; +import type { NextConfig } from 'next'; import { withWorkflow } from 'workflow/next'; +process.env.WORKFLOW_PUBLIC_MANIFEST ??= '1'; + const turbopackRoot = path.resolve(process.cwd(), '../..'); const nextConfig: NextConfig = { From 49ae775d9acf5b42069b39c9d8faafa271f16dd5 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Wed, 18 Mar 2026 15:51:31 -0400 Subject: [PATCH 05/16] add pg limit tests, lock tests, schema, migration Signed-off-by: nathancolosimo --- packages/core/e2e/e2e.test.ts | 27 +- packages/core/src/global.ts | 13 +- packages/core/src/lock.test.ts | 49 +- .../core/src/runtime/step-handler.test.ts | 23 + packages/core/src/runtime/step-handler.ts | 18 +- .../core/src/runtime/suspension-handler.ts | 32 + packages/core/src/step/lock.ts | 38 +- packages/core/src/workflow/lock.ts | 50 +- packages/workflow/src/internal/builtins.ts | 3 + packages/world-local/src/limits.test.ts | 152 +-- packages/world-postgres/README.md | 3 + .../migrations/0010_add_flow_limits.sql | 35 + .../migrations/meta/0010_snapshot.json | 973 ++++++++++++++++++ .../src/drizzle/migrations/meta/_journal.json | 7 + packages/world-postgres/src/drizzle/schema.ts | 49 + packages/world-postgres/src/limits.test.ts | 114 +- packages/world-postgres/test/test-db.ts | 59 ++ packages/world-testing/src/limits-contract.ts | 191 ++++ packages/world/FLOW_LIMITS.md | 97 +- workbench/example/workflows/99_e2e.ts | 47 + 20 files changed, 1740 insertions(+), 240 deletions(-) create mode 100644 packages/world-postgres/src/drizzle/migrations/0010_add_flow_limits.sql create mode 100644 packages/world-postgres/src/drizzle/migrations/meta/0010_snapshot.json create mode 100644 packages/world-postgres/test/test-db.ts create mode 100644 packages/world-testing/src/limits-contract.ts diff --git a/packages/core/e2e/e2e.test.ts 
b/packages/core/e2e/e2e.test.ts index 86325b1c58..e4d4379259 100644 --- a/packages/core/e2e/e2e.test.ts +++ b/packages/core/e2e/e2e.test.ts @@ -568,7 +568,7 @@ describe('e2e', () => { } if (isPostgresWorld) { - test.fails( + test( 'workflowWithWorkflowAndStepLocks demonstrates workflow and step limits on postgres world', { timeout: 60_000 }, async () => { @@ -587,6 +587,31 @@ describe('e2e', () => { ); } + if (isPostgresWorld) { + test( + 'workflowLockContentionWorkflow serializes workflow and step locks under contention', + { timeout: 60_000 }, + async () => { + const workflow = await e2e('workflowLockContentionWorkflow'); + const runA = await start(workflow, ['shared-user', 750]); + await sleep(100); + const runB = await start(workflow, ['shared-user', 750]); + + const [resultA, resultB] = await Promise.all([ + runA.returnValue, + runB.returnValue, + ]); + + expect(resultB.workflowLockAcquiredAt).toBeGreaterThanOrEqual( + resultA.workflowLockReleasedAt + ); + expect(resultB.stepLockAcquiredAt).toBeGreaterThanOrEqual( + resultA.stepLockReleasedAt + ); + } + ); + } + test('nullByteWorkflow', { timeout: 60_000 }, async () => { const run = await start(await e2e('nullByteWorkflow'), []); const returnValue = await run.returnValue; diff --git a/packages/core/src/global.ts b/packages/core/src/global.ts index 3dd5c52ac8..6891e0a761 100644 --- a/packages/core/src/global.ts +++ b/packages/core/src/global.ts @@ -28,10 +28,17 @@ export interface WaitInvocationQueueItem { hasCreatedEvent?: boolean; } +export interface LimitWaitInvocationQueueItem { + type: 'limit_wait'; + correlationId: string; + resumeAt: Date; +} + export type QueueItem = | StepInvocationQueueItem | HookInvocationQueueItem - | WaitInvocationQueueItem; + | WaitInvocationQueueItem + | LimitWaitInvocationQueueItem; /** * An error that is thrown when one or more operations (steps/hooks/etc.) 
are called but do @@ -61,7 +68,9 @@ export class WorkflowSuspension extends Error { else if (item.type === 'hook') { if (item.disposed) hookDisposedCount++; else hookCount++; - } else if (item.type === 'wait') waitCount++; + } else if (item.type === 'wait' || item.type === 'limit_wait') { + waitCount++; + } } // Build description parts diff --git a/packages/core/src/lock.test.ts b/packages/core/src/lock.test.ts index 58419ae49e..c9237066e3 100644 --- a/packages/core/src/lock.test.ts +++ b/packages/core/src/lock.test.ts @@ -1,19 +1,48 @@ -import { describe, it } from 'vitest'; +import { afterEach, describe, expect, it, vi } from 'vitest'; +import { lock, LIMITS_NOT_IMPLEMENTED_MESSAGE } from './lock.js'; +import { STEP_LOCK, WORKFLOW_LOCK } from './symbols.js'; + +afterEach(() => { + delete (globalThis as any)[WORKFLOW_LOCK]; + delete (globalThis as any)[STEP_LOCK]; +}); describe('lock', () => { - it.fails('is only callable inside workflow execution context', () => { - throw new Error('TODO: implement'); + it('throws when called outside workflow or step execution context', async () => { + await expect( + lock({ + key: 'workflow:user:test', + concurrency: { max: 1 }, + }) + ).rejects.toThrow(LIMITS_NOT_IMPLEMENTED_MESSAGE); }); - it.fails('returns a handle with dispose and heartbeat behavior', () => { - throw new Error('TODO: implement'); - }); + it('prefers the workflow runtime lock when both runtimes are present', async () => { + const workflowHandle = { leaseId: 'lease_workflow' }; + const workflowLock = vi.fn().mockResolvedValue(workflowHandle); + const stepLock = vi.fn().mockResolvedValue({ leaseId: 'lease_step' }); + (globalThis as any)[WORKFLOW_LOCK] = workflowLock; + (globalThis as any)[STEP_LOCK] = stepLock; + const options = { + key: 'workflow:user:test', + concurrency: { max: 1 }, + }; - it.fails('allows multiple holders for one key up to the concurrency max', () => { - throw new Error('TODO: implement'); + await 
expect(lock(options)).resolves.toBe(workflowHandle); + expect(workflowLock).toHaveBeenCalledWith(options); + expect(stepLock).not.toHaveBeenCalled(); }); - it.fails('blocks rate-only locks until the rate window advances', () => { - throw new Error('TODO: implement'); + it('falls back to the step runtime lock when no workflow runtime is present', async () => { + const handle = { leaseId: 'lease_step' }; + const stepLock = vi.fn().mockResolvedValue(handle); + (globalThis as any)[STEP_LOCK] = stepLock; + const options = { + key: 'step:db:cheap', + concurrency: { max: 2 }, + }; + + await expect(lock(options)).resolves.toBe(handle); + expect(stepLock).toHaveBeenCalledWith(options); }); }); diff --git a/packages/core/src/runtime/step-handler.test.ts b/packages/core/src/runtime/step-handler.test.ts index e99290661f..ee4df5ea88 100644 --- a/packages/core/src/runtime/step-handler.test.ts +++ b/packages/core/src/runtime/step-handler.test.ts @@ -1,5 +1,6 @@ import { WorkflowAPIError } from '@workflow/errors'; import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; +import { StepLockBlockedError } from '../step/lock.js'; // Use vi.hoisted so these are available in mock factories const { @@ -223,6 +224,28 @@ describe('step-handler 409 handling', () => { mockStepFn.mockResolvedValue('step-result'); }); + it('returns a timeout when a step lock is blocked before user code can proceed', async () => { + mockEventsCreate.mockResolvedValue({ + step: { + stepId: 'step_abc', + status: 'running', + attempt: 1, + startedAt: new Date(), + input: [], + }, + }); + mockStepFn.mockRejectedValue(new StepLockBlockedError(2_500)); + + const result = await capturedHandler( + createMessage(), + createMetadata('myStep') + ); + + expect(result).toEqual({ timeoutSeconds: 3 }); + expect(mockQueueMessage).not.toHaveBeenCalled(); + expect(mockEventsCreate).toHaveBeenCalledTimes(1); + }); + afterEach(() => { vi.restoreAllMocks(); }); diff --git a/packages/core/src/runtime/step-handler.ts 
b/packages/core/src/runtime/step-handler.ts index 44c585315a..fd3c1292dc 100644 --- a/packages/core/src/runtime/step-handler.ts +++ b/packages/core/src/runtime/step-handler.ts @@ -16,7 +16,7 @@ import { hydrateStepArguments, } from '../serialization.js'; import { contextStorage } from '../step/context-storage.js'; -import { createStepLock } from '../step/lock.js'; +import { createStepLock, StepLockBlockedError } from '../step/lock.js'; import { STEP_LOCK } from '../symbols.js'; import * as Attribute from '../telemetry/semantic-conventions.js'; import { @@ -438,6 +438,22 @@ const stepHandler = getWorldHandlers().createQueueHandler( if (userCodeFailed) { const err = userCodeError; + if (StepLockBlockedError.is(err)) { + const timeoutSeconds = Math.max( + 1, + Math.ceil((err.retryAfterMs ?? 1000) / 1000) + ); + span?.setAttributes({ + ...Attribute.StepRetryTimeoutSeconds(timeoutSeconds), + }); + span?.addEvent?.('step.lock_blocked', { + 'retry.timeout_seconds': timeoutSeconds, + 'step.id': stepId, + 'step.name': stepName, + }); + return { timeoutSeconds }; + } + // Infrastructure errors that somehow surfaced through user code // should propagate to the queue handler for retry, not consume // step attempts. 
diff --git a/packages/core/src/runtime/suspension-handler.ts b/packages/core/src/runtime/suspension-handler.ts index caa45da753..dea2a50b5f 100644 --- a/packages/core/src/runtime/suspension-handler.ts +++ b/packages/core/src/runtime/suspension-handler.ts @@ -11,6 +11,7 @@ import { import { importKey } from '../encryption.js'; import type { HookInvocationQueueItem, + LimitWaitInvocationQueueItem, StepInvocationQueueItem, WaitInvocationQueueItem, WorkflowSuspension, @@ -79,6 +80,9 @@ export async function handleSuspension({ const waitItems = suspension.steps.filter( (item): item is WaitInvocationQueueItem => item.type === 'wait' ); + const limitWaitItems = suspension.steps.filter( + (item): item is LimitWaitInvocationQueueItem => item.type === 'limit_wait' + ); // Split hooks by what actions they need const hooksNeedingCreation = allHookItems.filter( @@ -307,6 +311,34 @@ export async function handleSuspension({ } } + // Lock waits: schedule a delayed workflow replay keyed by correlationId so a + // later immediate wake-up can replace it. 
+ for (const queueItem of limitWaitItems) { + ops.push( + (async () => { + const delayMs = Math.max( + 1000, + queueItem.resumeAt.getTime() - Date.now() + ); + const traceCarrier = await serializeTraceCarrier(); + await queueMessage( + world, + `__wkf_workflow_${workflowName}`, + { + runId, + traceCarrier, + requestedAt: new Date(), + }, + { + delaySeconds: Math.ceil(delayMs / 1000), + idempotencyKey: queueItem.correlationId, + headers: extractTraceHeaders(traceCarrier), + } + ); + })() + ); + } + // Wait for all step and wait operations to complete waitUntil( Promise.all(ops).catch((opErr) => { diff --git a/packages/core/src/step/lock.ts b/packages/core/src/step/lock.ts index 7451b4e712..fc3901f986 100644 --- a/packages/core/src/step/lock.ts +++ b/packages/core/src/step/lock.ts @@ -2,6 +2,20 @@ import type { LimitLease, World } from '@workflow/world'; import type { LockHandle, LockOptions } from '../lock.js'; import { contextStorage } from './context-storage.js'; +export class StepLockBlockedError extends Error { + retryAfterMs?: number; + + constructor(retryAfterMs?: number) { + super('Step lock blocked'); + this.name = 'StepLockBlockedError'; + this.retryAfterMs = retryAfterMs; + } + + static is(value: unknown): value is StepLockBlockedError { + return value instanceof StepLockBlockedError; + } +} + function createStepLockHandle(lease: LimitLease, world: World): LockHandle { let currentLease = lease; let disposed = false; @@ -58,21 +72,17 @@ export function createStepLock(world: World) { rate: options.rate, }; - while (true) { - const result = await world.limits.acquire({ - key: options.key, - holderId, - definition, - leaseTtlMs: options.leaseTtlMs, - }); - - if (result.status === 'acquired') { - return createStepLockHandle(result.lease, world); - } + const result = await world.limits.acquire({ + key: options.key, + holderId, + definition, + leaseTtlMs: options.leaseTtlMs, + }); - await new Promise((resolve) => - setTimeout(resolve, result.retryAfterMs || 1000) 
- ); + if (result.status === 'acquired') { + return createStepLockHandle(result.lease, world); } + + throw new StepLockBlockedError(result.retryAfterMs); }; } diff --git a/packages/core/src/workflow/lock.ts b/packages/core/src/workflow/lock.ts index 21db74d825..8f284d1003 100644 --- a/packages/core/src/workflow/lock.ts +++ b/packages/core/src/workflow/lock.ts @@ -1,4 +1,3 @@ -import { EventConsumerResult } from '../events-consumer.js'; import { WorkflowSuspension } from '../global.js'; import type { LockHandle, LockOptions } from '../lock.js'; import { @@ -64,7 +63,8 @@ function createLockHandle( export function createLock(ctx: WorkflowOrchestratorContext) { return async function lockImpl(options: LockOptions): Promise { - const holderId = `wflock_${ctx.generateUlid()}`; + const correlationId = `wflock_wait_${ctx.generateUlid()}`; + const holderId = `wflock_${ctx.runId}:${correlationId}:${ctx.generateUlid()}`; const definition = { concurrency: options.concurrency, rate: options.rate, @@ -82,49 +82,19 @@ export function createLock(ctx: WorkflowOrchestratorContext) { return createLockHandle(result.lease, ctx); } - const correlationId = `wflock_wait_${ctx.generateUlid()}`; - const resumeAt = new Date(Date.now() + (result.retryAfterMs || 1000)); ctx.invocationsQueue.set(correlationId, { - type: 'wait', + type: 'limit_wait', correlationId, - resumeAt, + resumeAt: new Date(Date.now() + (result.retryAfterMs || 1000)), }); - await new Promise((resolve) => { - ctx.eventsConsumer.subscribe((event) => { - if (!event) { - scheduleWhenIdle(ctx, () => { - ctx.onWorkflowError( - new WorkflowSuspension(ctx.invocationsQueue, ctx.globalThis) - ); - }); - return EventConsumerResult.NotConsumed; - } - - if (event.correlationId !== correlationId) { - return EventConsumerResult.NotConsumed; - } - - if (event.eventType === 'wait_created') { - const queueItem = ctx.invocationsQueue.get(correlationId); - if (queueItem && queueItem.type === 'wait') { - queueItem.hasCreatedEvent = true; - 
queueItem.resumeAt = event.eventData.resumeAt; - } - return EventConsumerResult.Consumed; - } - - if (event.eventType === 'wait_completed') { - ctx.invocationsQueue.delete(correlationId); - ctx.promiseQueue = ctx.promiseQueue.then(() => { - resolve(); - }); - return EventConsumerResult.Finished; - } - - return EventConsumerResult.NotConsumed; - }); + scheduleWhenIdle(ctx, () => { + ctx.onWorkflowError( + new WorkflowSuspension(ctx.invocationsQueue, ctx.globalThis) + ); }); + + await new Promise(() => {}); } }; } diff --git a/packages/workflow/src/internal/builtins.ts b/packages/workflow/src/internal/builtins.ts index 886686e50e..624ebbaebd 100644 --- a/packages/workflow/src/internal/builtins.ts +++ b/packages/workflow/src/internal/builtins.ts @@ -2,6 +2,9 @@ * These are the built-in steps that are "automatically available" in the workflow scope. They are * similar to "stdlib" except that are not meant to be imported by users, but are instead "just available" * alongside user defined steps. They are used internally by the runtime + * + * These helpers intentionally rely on the method receiver (`this`) so workflow + * objects like `Request` and `Response` can round-trip through step execution. 
*/ export async function __builtin_response_array_buffer( diff --git a/packages/world-local/src/limits.test.ts b/packages/world-local/src/limits.test.ts index 1db72676af..3f8351f99b 100644 --- a/packages/world-local/src/limits.test.ts +++ b/packages/world-local/src/limits.test.ts @@ -1,149 +1,19 @@ +import { createLimitsContractSuite } from '../../world-testing/src/limits-contract.js'; +import { createLocalWorld } from './index.js'; +import { createLimits } from './limits.js'; import { mkdtemp, rm } from 'node:fs/promises'; import os from 'node:os'; import path from 'node:path'; -import { setTimeout as sleep } from 'node:timers/promises'; -import { describe, expect, it } from 'vitest'; -import { createLocalWorld } from './index.js'; -import { createLimits } from './limits.js'; -async function withTempDir(fn: (dir: string) => Promise): Promise { +createLimitsContractSuite('local world limits', async () => { const dir = await mkdtemp(path.join(os.tmpdir(), 'workflow-limits-')); - try { - return await fn(dir); - } finally { - await rm(dir, { recursive: true, force: true }); - } -} + const world = createLocalWorld({ dataDir: dir }); -describe('local world limits', () => { - it('exposes the required limits namespace', async () => { - await withTempDir(async (dir) => { - const world = createLocalWorld({ dataDir: dir }); - expect(world.limits).toBeDefined(); - expect(typeof world.limits.acquire).toBe('function'); - expect(typeof world.limits.release).toBe('function'); - expect(typeof world.limits.heartbeat).toBe('function'); + return { + limits: createLimits(dir), + close: async () => { await world.close?.(); - }); - }); - - it('enforces per-key concurrency limits', async () => { - await withTempDir(async (dir) => { - const limits = createLimits(dir); - - const first = await limits.acquire({ - key: 'step:db:cheap', - holderId: 'holder-a', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); - - expect(first.status).toBe('acquired'); - if (first.status !== 
'acquired') { - throw new Error('expected first lease to be acquired'); - } - - const second = await limits.acquire({ - key: 'step:db:cheap', - holderId: 'holder-b', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); - - expect(second).toMatchObject({ - status: 'blocked', - reason: 'concurrency', - }); - - await limits.release({ - leaseId: first.lease.leaseId, - key: first.lease.key, - holderId: first.lease.holderId, - }); - - const third = await limits.acquire({ - key: 'step:db:cheap', - holderId: 'holder-b', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); - - expect(third.status).toBe('acquired'); - }); - }); - - it('returns a retry path when rate limits block acquisition', async () => { - await withTempDir(async (dir) => { - const limits = createLimits(dir); - - const first = await limits.acquire({ - key: 'step:provider:openai', - holderId: 'holder-a', - definition: { rate: { count: 1, periodMs: 100 } }, - leaseTtlMs: 1_000, - }); - - expect(first.status).toBe('acquired'); - if (first.status !== 'acquired') { - throw new Error('expected first lease to be acquired'); - } - - await limits.release({ - leaseId: first.lease.leaseId, - key: first.lease.key, - holderId: first.lease.holderId, - }); - - const second = await limits.acquire({ - key: 'step:provider:openai', - holderId: 'holder-b', - definition: { rate: { count: 1, periodMs: 100 } }, - leaseTtlMs: 1_000, - }); - - expect(second.status).toBe('blocked'); - if (second.status !== 'blocked') { - throw new Error('expected second acquire to be blocked'); - } - expect(second.reason).toBe('rate'); - expect(second.retryAfterMs).toBeGreaterThanOrEqual(0); - }); - }); - - it('restores capacity when a lease is released or expires', async () => { - await withTempDir(async (dir) => { - const limits = createLimits(dir); - - const first = await limits.acquire({ - key: 'workflow:user:123', - holderId: 'holder-a', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 25, - }); - - 
expect(first.status).toBe('acquired'); - if (first.status !== 'acquired') { - throw new Error('expected first lease to be acquired'); - } - - const heartbeat = await limits.heartbeat({ - leaseId: first.lease.leaseId, - ttlMs: 50, - }); - expect(heartbeat.expiresAt?.getTime()).toBeGreaterThan( - first.lease.expiresAt?.getTime() ?? 0 - ); - - await sleep(60); - - const second = await limits.acquire({ - key: 'workflow:user:123', - holderId: 'holder-b', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); - - expect(second.status).toBe('acquired'); - }); - }); + await rm(dir, { recursive: true, force: true }); + }, + }; }); diff --git a/packages/world-postgres/README.md b/packages/world-postgres/README.md index bfb617c9b6..7e2888f69f 100644 --- a/packages/world-postgres/README.md +++ b/packages/world-postgres/README.md @@ -117,6 +117,7 @@ Make sure your PostgreSQL database is accessible and the user has sufficient per - **Durable Storage**: Stores workflow runs, events, steps, hooks, and webhooks in PostgreSQL - **Queue Processing**: Uses graphile-worker as the durable queue and executes jobs over the workflow HTTP routes - **Durable Delays**: Re-schedules waits and retries in PostgreSQL +- **Flow Limits**: Enforces durable concurrency/rate limits with PostgreSQL-backed leases, rate tokens, and waiter promotion - **Streaming**: Real-time event streaming capabilities - **Health Checks**: Built-in connection health monitoring - **Configurable Concurrency**: Adjustable worker concurrency for queue processing @@ -127,6 +128,8 @@ Make sure your PostgreSQL database is accessible and the user has sufficient per - Graphile jobs are acknowledged only after the workflow or step execution finishes, or after the worker durably schedules a delayed follow-up job - Backlog stays in PostgreSQL when all execution slots are busy - Retry and sleep-style delays use Graphile `runAt` scheduling +- Flow-limit waiters are stored durably in PostgreSQL and promoted in FIFO 
order per key +- Blocked steps are re-queued instead of holding a worker slot while waiting for a lease - Workflow and step execution is sent through `/.well-known/workflow/v1/flow` and `/.well-known/workflow/v1/step` ## Development diff --git a/packages/world-postgres/src/drizzle/migrations/0010_add_flow_limits.sql b/packages/world-postgres/src/drizzle/migrations/0010_add_flow_limits.sql new file mode 100644 index 0000000000..01892d0bfe --- /dev/null +++ b/packages/world-postgres/src/drizzle/migrations/0010_add_flow_limits.sql @@ -0,0 +1,35 @@ +CREATE TABLE "workflow"."workflow_limit_leases" ( + "lease_id" varchar PRIMARY KEY NOT NULL, + "limit_key" varchar NOT NULL, + "holder_id" varchar NOT NULL, + "acquired_at" timestamp DEFAULT now() NOT NULL, + "expires_at" timestamp, + "concurrency_max" integer, + "rate_count" integer, + "rate_period_ms" integer +); +--> statement-breakpoint +CREATE TABLE "workflow"."workflow_limit_tokens" ( + "token_id" varchar PRIMARY KEY NOT NULL, + "limit_key" varchar NOT NULL, + "holder_id" varchar NOT NULL, + "acquired_at" timestamp DEFAULT now() NOT NULL, + "expires_at" timestamp NOT NULL +); +--> statement-breakpoint +CREATE TABLE "workflow"."workflow_limit_waiters" ( + "waiter_id" varchar PRIMARY KEY NOT NULL, + "limit_key" varchar NOT NULL, + "holder_id" varchar NOT NULL, + "created_at" timestamp DEFAULT now() NOT NULL, + "lease_ttl_ms" integer, + "concurrency_max" integer, + "rate_count" integer, + "rate_period_ms" integer +); +--> statement-breakpoint +CREATE UNIQUE INDEX "workflow_limit_leases_limit_key_holder_id_index" ON "workflow"."workflow_limit_leases" USING btree ("limit_key","holder_id");--> statement-breakpoint +CREATE INDEX "workflow_limit_leases_limit_key_expires_at_index" ON "workflow"."workflow_limit_leases" USING btree ("limit_key","expires_at");--> statement-breakpoint +CREATE INDEX "workflow_limit_tokens_limit_key_expires_at_index" ON "workflow"."workflow_limit_tokens" USING btree ("limit_key","expires_at");--> 
statement-breakpoint +CREATE UNIQUE INDEX "workflow_limit_waiters_limit_key_holder_id_index" ON "workflow"."workflow_limit_waiters" USING btree ("limit_key","holder_id");--> statement-breakpoint +CREATE INDEX "workflow_limit_waiters_limit_key_created_at_index" ON "workflow"."workflow_limit_waiters" USING btree ("limit_key","created_at");--> statement-breakpoint diff --git a/packages/world-postgres/src/drizzle/migrations/meta/0010_snapshot.json b/packages/world-postgres/src/drizzle/migrations/meta/0010_snapshot.json new file mode 100644 index 0000000000..97ddba3774 --- /dev/null +++ b/packages/world-postgres/src/drizzle/migrations/meta/0010_snapshot.json @@ -0,0 +1,973 @@ +{ + "id": "c4af56df-d588-4810-a8b4-f4eb68b270b2", + "prevId": "7adbbd35-ca90-4353-bb34-3d1b2435a027", + "version": "7", + "dialect": "postgresql", + "tables": { + "workflow.workflow_events": { + "name": "workflow_events", + "schema": "workflow", + "columns": { + "id": { + "name": "id", + "type": "varchar", + "primaryKey": true, + "notNull": true + }, + "type": { + "name": "type", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "correlation_id": { + "name": "correlation_id", + "type": "varchar", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "run_id": { + "name": "run_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "payload": { + "name": "payload", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "payload_cbor": { + "name": "payload_cbor", + "type": "bytea", + "primaryKey": false, + "notNull": false + }, + "spec_version": { + "name": "spec_version", + "type": "integer", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "workflow_events_run_id_index": { + "name": "workflow_events_run_id_index", + "columns": [ + { + "expression": "run_id", + "isExpression": false, + "asc": true, + 
"nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "workflow_events_correlation_id_index": { + "name": "workflow_events_correlation_id_index", + "columns": [ + { + "expression": "correlation_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "workflow.workflow_hooks": { + "name": "workflow_hooks", + "schema": "workflow", + "columns": { + "run_id": { + "name": "run_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "hook_id": { + "name": "hook_id", + "type": "varchar", + "primaryKey": true, + "notNull": true + }, + "token": { + "name": "token", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "owner_id": { + "name": "owner_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "project_id": { + "name": "project_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "environment": { + "name": "environment", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "metadata": { + "name": "metadata", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "metadata_cbor": { + "name": "metadata_cbor", + "type": "bytea", + "primaryKey": false, + "notNull": false + }, + "spec_version": { + "name": "spec_version", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "is_webhook": { + "name": "is_webhook", + "type": "boolean", + "primaryKey": false, + "notNull": false, + "default": true + } + }, + "indexes": { + "workflow_hooks_run_id_index": { + "name": "workflow_hooks_run_id_index", + "columns": [ + { + 
"expression": "run_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "workflow_hooks_token_index": { + "name": "workflow_hooks_token_index", + "columns": [ + { + "expression": "token", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "workflow.workflow_limit_leases": { + "name": "workflow_limit_leases", + "schema": "workflow", + "columns": { + "lease_id": { + "name": "lease_id", + "type": "varchar", + "primaryKey": true, + "notNull": true + }, + "limit_key": { + "name": "limit_key", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "holder_id": { + "name": "holder_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "acquired_at": { + "name": "acquired_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "expires_at": { + "name": "expires_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "concurrency_max": { + "name": "concurrency_max", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "rate_count": { + "name": "rate_count", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "rate_period_ms": { + "name": "rate_period_ms", + "type": "integer", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "workflow_limit_leases_limit_key_holder_id_index": { + "name": "workflow_limit_leases_limit_key_holder_id_index", + "columns": [ + { + "expression": "limit_key", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "holder_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + 
"concurrently": false, + "method": "btree", + "with": {} + }, + "workflow_limit_leases_limit_key_expires_at_index": { + "name": "workflow_limit_leases_limit_key_expires_at_index", + "columns": [ + { + "expression": "limit_key", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "expires_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "workflow.workflow_limit_tokens": { + "name": "workflow_limit_tokens", + "schema": "workflow", + "columns": { + "token_id": { + "name": "token_id", + "type": "varchar", + "primaryKey": true, + "notNull": true + }, + "limit_key": { + "name": "limit_key", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "holder_id": { + "name": "holder_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "acquired_at": { + "name": "acquired_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "expires_at": { + "name": "expires_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "workflow_limit_tokens_limit_key_expires_at_index": { + "name": "workflow_limit_tokens_limit_key_expires_at_index", + "columns": [ + { + "expression": "limit_key", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "expires_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "workflow.workflow_limit_waiters": { + "name": "workflow_limit_waiters", + "schema": "workflow", + 
"columns": { + "waiter_id": { + "name": "waiter_id", + "type": "varchar", + "primaryKey": true, + "notNull": true + }, + "limit_key": { + "name": "limit_key", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "holder_id": { + "name": "holder_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "lease_ttl_ms": { + "name": "lease_ttl_ms", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "concurrency_max": { + "name": "concurrency_max", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "rate_count": { + "name": "rate_count", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "rate_period_ms": { + "name": "rate_period_ms", + "type": "integer", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "workflow_limit_waiters_limit_key_holder_id_index": { + "name": "workflow_limit_waiters_limit_key_holder_id_index", + "columns": [ + { + "expression": "limit_key", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "holder_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": true, + "concurrently": false, + "method": "btree", + "with": {} + }, + "workflow_limit_waiters_limit_key_created_at_index": { + "name": "workflow_limit_waiters_limit_key_created_at_index", + "columns": [ + { + "expression": "limit_key", + "isExpression": false, + "asc": true, + "nulls": "last" + }, + { + "expression": "created_at", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "workflow.workflow_runs": { + "name": "workflow_runs", + 
"schema": "workflow", + "columns": { + "id": { + "name": "id", + "type": "varchar", + "primaryKey": true, + "notNull": true + }, + "output": { + "name": "output", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "output_cbor": { + "name": "output_cbor", + "type": "bytea", + "primaryKey": false, + "notNull": false + }, + "deployment_id": { + "name": "deployment_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "status": { + "name": "status", + "type": "status", + "typeSchema": "public", + "primaryKey": false, + "notNull": true + }, + "name": { + "name": "name", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "spec_version": { + "name": "spec_version", + "type": "integer", + "primaryKey": false, + "notNull": false + }, + "execution_context": { + "name": "execution_context", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "execution_context_cbor": { + "name": "execution_context_cbor", + "type": "bytea", + "primaryKey": false, + "notNull": false + }, + "input": { + "name": "input", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "input_cbor": { + "name": "input_cbor", + "type": "bytea", + "primaryKey": false, + "notNull": false + }, + "error": { + "name": "error", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "error_cbor": { + "name": "error_cbor", + "type": "bytea", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "completed_at": { + "name": "completed_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "started_at": { + "name": "started_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "expired_at": { + "name": "expired_at", + 
"type": "timestamp", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "workflow_runs_name_index": { + "name": "workflow_runs_name_index", + "columns": [ + { + "expression": "name", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "workflow_runs_status_index": { + "name": "workflow_runs_status_index", + "columns": [ + { + "expression": "status", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "workflow.workflow_steps": { + "name": "workflow_steps", + "schema": "workflow", + "columns": { + "run_id": { + "name": "run_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "step_id": { + "name": "step_id", + "type": "varchar", + "primaryKey": true, + "notNull": true + }, + "step_name": { + "name": "step_name", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "status": { + "name": "status", + "type": "step_status", + "typeSchema": "public", + "primaryKey": false, + "notNull": true + }, + "input": { + "name": "input", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "input_cbor": { + "name": "input_cbor", + "type": "bytea", + "primaryKey": false, + "notNull": false + }, + "output": { + "name": "output", + "type": "jsonb", + "primaryKey": false, + "notNull": false + }, + "output_cbor": { + "name": "output_cbor", + "type": "bytea", + "primaryKey": false, + "notNull": false + }, + "error": { + "name": "error", + "type": "text", + "primaryKey": false, + "notNull": false + }, + "error_cbor": { + "name": "error_cbor", + "type": "bytea", + "primaryKey": false, + "notNull": false + }, + "attempt": { + "name": "attempt", + "type": "integer", + 
"primaryKey": false, + "notNull": true + }, + "started_at": { + "name": "started_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "completed_at": { + "name": "completed_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "retry_after": { + "name": "retry_after", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "spec_version": { + "name": "spec_version", + "type": "integer", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "workflow_steps_run_id_index": { + "name": "workflow_steps_run_id_index", + "columns": [ + { + "expression": "run_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + }, + "workflow_steps_status_index": { + "name": "workflow_steps_status_index", + "columns": [ + { + "expression": "status", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "workflow.workflow_stream_chunks": { + "name": "workflow_stream_chunks", + "schema": "workflow", + "columns": { + "id": { + "name": "id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "stream_id": { + "name": "stream_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "run_id": { + "name": "run_id", + "type": "varchar", + "primaryKey": false, + "notNull": false + }, + "data": { + "name": "data", + "type": "bytea", + "primaryKey": false, + "notNull": true + }, + 
"created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "eof": { + "name": "eof", + "type": "boolean", + "primaryKey": false, + "notNull": true + } + }, + "indexes": { + "workflow_stream_chunks_run_id_index": { + "name": "workflow_stream_chunks_run_id_index", + "columns": [ + { + "expression": "run_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": { + "workflow_stream_chunks_stream_id_id_pk": { + "name": "workflow_stream_chunks_stream_id_id_pk", + "columns": ["stream_id", "id"] + } + }, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + }, + "workflow.workflow_waits": { + "name": "workflow_waits", + "schema": "workflow", + "columns": { + "wait_id": { + "name": "wait_id", + "type": "varchar", + "primaryKey": true, + "notNull": true + }, + "run_id": { + "name": "run_id", + "type": "varchar", + "primaryKey": false, + "notNull": true + }, + "status": { + "name": "status", + "type": "wait_status", + "typeSchema": "public", + "primaryKey": false, + "notNull": true + }, + "resume_at": { + "name": "resume_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "completed_at": { + "name": "completed_at", + "type": "timestamp", + "primaryKey": false, + "notNull": false + }, + "created_at": { + "name": "created_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "updated_at": { + "name": "updated_at", + "type": "timestamp", + "primaryKey": false, + "notNull": true, + "default": "now()" + }, + "spec_version": { + "name": "spec_version", + "type": "integer", + "primaryKey": false, + "notNull": false + } + }, + "indexes": { + "workflow_waits_run_id_index": { + "name": "workflow_waits_run_id_index", + "columns": [ + { + "expression": 
"run_id", + "isExpression": false, + "asc": true, + "nulls": "last" + } + ], + "isUnique": false, + "concurrently": false, + "method": "btree", + "with": {} + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "policies": {}, + "checkConstraints": {}, + "isRLSEnabled": false + } + }, + "enums": { + "public.step_status": { + "name": "step_status", + "schema": "public", + "values": ["pending", "running", "completed", "failed", "cancelled"] + }, + "public.wait_status": { + "name": "wait_status", + "schema": "public", + "values": ["waiting", "completed"] + }, + "public.status": { + "name": "status", + "schema": "public", + "values": ["pending", "running", "completed", "failed", "cancelled"] + } + }, + "schemas": { + "workflow": "workflow" + }, + "sequences": {}, + "roles": {}, + "policies": {}, + "views": {}, + "_meta": { + "columns": {}, + "schemas": {}, + "tables": {} + } +} diff --git a/packages/world-postgres/src/drizzle/migrations/meta/_journal.json b/packages/world-postgres/src/drizzle/migrations/meta/_journal.json index f4956666fc..e98c400c01 100644 --- a/packages/world-postgres/src/drizzle/migrations/meta/_journal.json +++ b/packages/world-postgres/src/drizzle/migrations/meta/_journal.json @@ -71,6 +71,13 @@ "when": 1770500000000, "tag": "0009_add_is_webhook", "breakpoints": true + }, + { + "idx": 10, + "version": "7", + "when": 1773863098757, + "tag": "0010_add_flow_limits", + "breakpoints": true } ] } diff --git a/packages/world-postgres/src/drizzle/schema.ts b/packages/world-postgres/src/drizzle/schema.ts index f353ef8ca1..b6e8205237 100644 --- a/packages/world-postgres/src/drizzle/schema.ts +++ b/packages/world-postgres/src/drizzle/schema.ts @@ -21,6 +21,7 @@ import { primaryKey, text, timestamp, + uniqueIndex, varchar, } from 'drizzle-orm/pg-core'; import { Cbor, type Cborized } from './cbor.js'; @@ -192,6 +193,54 @@ export const waits = schema.table( (tb) => [index().on(tb.runId)] ); +export const limitLeases = 
schema.table( + 'workflow_limit_leases', + { + leaseId: varchar('lease_id').primaryKey(), + limitKey: varchar('limit_key').notNull(), + holderId: varchar('holder_id').notNull(), + acquiredAt: timestamp('acquired_at').defaultNow().notNull(), + expiresAt: timestamp('expires_at'), + concurrencyMax: integer('concurrency_max'), + rateCount: integer('rate_count'), + ratePeriodMs: integer('rate_period_ms'), + }, + (tb) => [ + uniqueIndex().on(tb.limitKey, tb.holderId), + index().on(tb.limitKey, tb.expiresAt), + ] +); + +export const limitTokens = schema.table( + 'workflow_limit_tokens', + { + tokenId: varchar('token_id').primaryKey(), + limitKey: varchar('limit_key').notNull(), + holderId: varchar('holder_id').notNull(), + acquiredAt: timestamp('acquired_at').defaultNow().notNull(), + expiresAt: timestamp('expires_at').notNull(), + }, + (tb) => [index().on(tb.limitKey, tb.expiresAt)] +); + +export const limitWaiters = schema.table( + 'workflow_limit_waiters', + { + waiterId: varchar('waiter_id').primaryKey(), + limitKey: varchar('limit_key').notNull(), + holderId: varchar('holder_id').notNull(), + createdAt: timestamp('created_at').defaultNow().notNull(), + leaseTtlMs: integer('lease_ttl_ms'), + concurrencyMax: integer('concurrency_max'), + rateCount: integer('rate_count'), + ratePeriodMs: integer('rate_period_ms'), + }, + (tb) => [ + uniqueIndex().on(tb.limitKey, tb.holderId), + index().on(tb.limitKey, tb.createdAt), + ] +); + const bytea = customType<{ data: Buffer; notNull: false; default: false }>({ dataType() { return 'bytea'; diff --git a/packages/world-postgres/src/limits.test.ts b/packages/world-postgres/src/limits.test.ts index 2c43f08584..bf6ae15e23 100644 --- a/packages/world-postgres/src/limits.test.ts +++ b/packages/world-postgres/src/limits.test.ts @@ -1,19 +1,111 @@ -import { describe, it } from 'vitest'; +import { asc, eq } from 'drizzle-orm'; +import { + afterAll, + beforeAll, + beforeEach, + describe, + expect, + it, + test, +} from 'vitest'; +import { 
createLimitsContractSuite } from '../../world-testing/src/limits-contract.js'; +import * as Schema from './drizzle/schema.js'; +import { createLimits } from './limits.js'; -describe('postgres world limits', () => { - it.fails('exposes the required limits namespace', () => { - throw new Error('TODO: implement'); +if (process.platform === 'win32') { + test.skip('skipped on Windows since it relies on a docker container', () => {}); +} else { + let db: Awaited< + ReturnType + >; + + beforeAll(async () => { + const { createPostgresTestDb } = await import('../test/test-db.js'); + db = await createPostgresTestDb(); + }, 120_000); + + beforeEach(async () => { + await db.truncateLimits(); }); - it.fails('respects the concurrency cap across concurrent acquires', () => { - throw new Error('TODO: implement'); + afterAll(async () => { + await db.close(); }); - it.fails('wakes waiters in deterministic order when a lease is released', () => { - throw new Error('TODO: implement'); + createLimitsContractSuite('postgres world limits', async () => { + return { + limits: createLimits( + { connectionString: db.connectionString, queueConcurrency: 1 }, + db.drizzle + ), + }; }); - it.fails('reclaims stale leases after worker or process death', () => { - throw new Error('TODO: implement'); + describe('postgres waiter promotion', () => { + it('promotes the earliest waiter on release', async () => { + const limits = createLimits( + { connectionString: db.connectionString, queueConcurrency: 1 }, + db.drizzle + ); + + const first = await limits.acquire({ + key: 'workflow:user:ordered', + holderId: 'holder-a', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') throw new Error('expected acquisition'); + + const second = await limits.acquire({ + key: 'workflow:user:ordered', + holderId: 'holder-b', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + const third = await 
limits.acquire({ + key: 'workflow:user:ordered', + holderId: 'holder-c', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + + expect(second.status).toBe('blocked'); + expect(third.status).toBe('blocked'); + + await limits.release({ + leaseId: first.lease.leaseId, + holderId: first.lease.holderId, + key: first.lease.key, + }); + + const leases = await db.drizzle + .select({ holderId: Schema.limitLeases.holderId }) + .from(Schema.limitLeases) + .where(eq(Schema.limitLeases.limitKey, first.lease.key)) + .orderBy( + asc(Schema.limitLeases.acquiredAt), + asc(Schema.limitLeases.leaseId) + ); + const waiters = await db.drizzle + .select({ holderId: Schema.limitWaiters.holderId }) + .from(Schema.limitWaiters) + .where(eq(Schema.limitWaiters.limitKey, first.lease.key)) + .orderBy( + asc(Schema.limitWaiters.createdAt), + asc(Schema.limitWaiters.waiterId) + ); + + expect(leases).toEqual([{ holderId: 'holder-b' }]); + expect(waiters).toEqual([{ holderId: 'holder-c' }]); + + const stillWaiting = await limits.acquire({ + key: 'workflow:user:ordered', + holderId: 'holder-c', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + expect(stillWaiting.status).toBe('blocked'); + }); }); -}); +} diff --git a/packages/world-postgres/test/test-db.ts b/packages/world-postgres/test/test-db.ts new file mode 100644 index 0000000000..2bb21aa380 --- /dev/null +++ b/packages/world-postgres/test/test-db.ts @@ -0,0 +1,59 @@ +import { execSync } from 'node:child_process'; +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { PostgreSqlContainer } from '@testcontainers/postgresql'; +import postgres from 'postgres'; +import { createClient } from '../src/drizzle/index.js'; + +const packageDir = path.resolve( + path.dirname(fileURLToPath(import.meta.url)), + '..' 
+); + +export interface PostgresTestDb { + container: Awaited>; + sql: ReturnType; + drizzle: ReturnType; + connectionString: string; + truncateLimits(): Promise; + close(): Promise; +} + +export async function createPostgresTestDb(): Promise { + const container = await new PostgreSqlContainer('postgres:15-alpine').start(); + const connectionString = container.getConnectionUri(); + process.env.DATABASE_URL = connectionString; + process.env.WORKFLOW_POSTGRES_URL = connectionString; + + execSync('pnpm db:push', { + stdio: 'inherit', + cwd: packageDir, + env: process.env, + }); + + const sql = postgres(connectionString, { max: 1 }); + const drizzle = createClient(sql); + + return { + container, + sql, + drizzle, + connectionString, + async truncateLimits() { + await sql` + truncate table + workflow.workflow_limit_waiters, + workflow.workflow_limit_tokens, + workflow.workflow_limit_leases, + workflow.workflow_steps, + workflow.workflow_events, + workflow.workflow_runs + restart identity cascade + `; + }, + async close() { + await sql.end(); + await container.stop(); + }, + }; +} diff --git a/packages/world-testing/src/limits-contract.ts b/packages/world-testing/src/limits-contract.ts new file mode 100644 index 0000000000..2acfbc1d72 --- /dev/null +++ b/packages/world-testing/src/limits-contract.ts @@ -0,0 +1,191 @@ +import { setTimeout as sleep } from 'node:timers/promises'; +import type { Limits } from '@workflow/world'; +import { describe, expect, it } from 'vitest'; + +export interface LimitsHarness { + limits: Limits; + close?: () => Promise; +} + +export function createLimitsContractSuite( + name: string, + createHarness: () => Promise +) { + describe(name, () => { + it('enforces per-key concurrency limits', async () => { + const harness = await createHarness(); + try { + const first = await harness.limits.acquire({ + key: 'step:db:cheap', + holderId: 'holder-a', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + 
expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') + throw new Error('expected acquisition'); + + const second = await harness.limits.acquire({ + key: 'step:db:cheap', + holderId: 'holder-b', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + expect(second).toMatchObject({ + status: 'blocked', + reason: 'concurrency', + }); + + await harness.limits.release({ + leaseId: first.lease.leaseId, + key: first.lease.key, + holderId: first.lease.holderId, + }); + + const third = await harness.limits.acquire({ + key: 'step:db:cheap', + holderId: 'holder-b', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + expect(third.status).toBe('acquired'); + } finally { + await harness.close?.(); + } + }); + + it('returns a retry path when rate limits block acquisition', async () => { + const harness = await createHarness(); + try { + const first = await harness.limits.acquire({ + key: 'step:provider:openai', + holderId: 'holder-a', + definition: { rate: { count: 1, periodMs: 100 } }, + leaseTtlMs: 1_000, + }); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') + throw new Error('expected acquisition'); + + await harness.limits.release({ + leaseId: first.lease.leaseId, + key: first.lease.key, + holderId: first.lease.holderId, + }); + + const second = await harness.limits.acquire({ + key: 'step:provider:openai', + holderId: 'holder-b', + definition: { rate: { count: 1, periodMs: 100 } }, + leaseTtlMs: 1_000, + }); + expect(second.status).toBe('blocked'); + if (second.status !== 'blocked') throw new Error('expected blocked'); + expect(second.reason).toBe('rate'); + expect(second.retryAfterMs).toBeGreaterThanOrEqual(0); + } finally { + await harness.close?.(); + } + }); + + it('returns a combined blocked reason when both limits are saturated', async () => { + const harness = await createHarness(); + try { + const first = await harness.limits.acquire({ + key: 'step:mixed', + holderId: 'holder-a', + 
definition: { + concurrency: { max: 1 }, + rate: { count: 1, periodMs: 1_000 }, + }, + leaseTtlMs: 1_000, + }); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') + throw new Error('expected acquisition'); + + const second = await harness.limits.acquire({ + key: 'step:mixed', + holderId: 'holder-b', + definition: { + concurrency: { max: 1 }, + rate: { count: 1, periodMs: 1_000 }, + }, + leaseTtlMs: 1_000, + }); + expect(second).toMatchObject({ + status: 'blocked', + reason: 'concurrency_and_rate', + }); + } finally { + await harness.close?.(); + } + }); + + it('restores capacity when a lease is released or expires', async () => { + const harness = await createHarness(); + try { + const first = await harness.limits.acquire({ + key: 'workflow:user:123', + holderId: 'holder-a', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 100, + }); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') + throw new Error('expected acquisition'); + + const heartbeat = await harness.limits.heartbeat({ + leaseId: first.lease.leaseId, + ttlMs: 200, + }); + expect(heartbeat.expiresAt?.getTime()).toBeGreaterThan( + first.lease.expiresAt?.getTime() ?? 
0 + ); + + await sleep(250); + + const second = await harness.limits.acquire({ + key: 'workflow:user:123', + holderId: 'holder-b', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + expect(second.status).toBe('acquired'); + } finally { + await harness.close?.(); + } + }); + + it('reuses an existing lease for the same holder', async () => { + const harness = await createHarness(); + try { + const first = await harness.limits.acquire({ + key: 'workflow:user:reacquire', + holderId: 'holder-a', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') + throw new Error('expected acquisition'); + + const second = await harness.limits.acquire({ + key: 'workflow:user:reacquire', + holderId: 'holder-a', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + expect(second).toMatchObject({ + status: 'acquired', + lease: { + leaseId: first.lease.leaseId, + holderId: first.lease.holderId, + }, + }); + } finally { + await harness.close?.(); + } + }); + }); +} diff --git a/packages/world/FLOW_LIMITS.md b/packages/world/FLOW_LIMITS.md index df5d3275c8..769f78ba74 100644 --- a/packages/world/FLOW_LIMITS.md +++ b/packages/world/FLOW_LIMITS.md @@ -1,16 +1,18 @@ -# Planning to delete after PR is implemented / ready to merge - # Flow Limits Design Notes -This note summarizes the current direction for flow concurrency and rate limiting -across `@workflow/core`, `@workflow/world`, and concrete world implementations. +This note summarizes the implemented direction for flow concurrency and rate +limiting across `@workflow/core`, `@workflow/world`, and concrete world +implementations. ## Status - The shared `limits` interface and `lock()` API surface now exist. -- Local world has an initial working implementation for acquire/release/heartbeat. -- Postgres and Vercel worlds still expose `limits` as stubs. 
-- There is a real local E2E example for workflow and step locks in the Next.js Turbopack workbench. +- Local world has a working lease-based implementation for + acquire/release/heartbeat. +- Postgres now has a PostgreSQL-backed implementation with leases, rate tokens, + and durable waiters. +- Vercel still exposes `limits` as a stub. +- The Next.js Turbopack workbench has E2E coverage for workflow and step locks. ## Goals @@ -115,17 +117,19 @@ workflow scope, even though the workflow may suspend and resume many times. #### In steps -`lock()` should act like a step gate. +`lock()` acts like a step gate. -The intended long-term behavior is: +The current behavior is: - declare the limit at the top of the step -- runtime/compiler hoists or interprets it as a pre-step requirement -- the step should not occupy a worker just waiting for capacity +- the runtime treats a blocked acquisition as a step-boundary admission failure +- the step does not keep executing user code while waiting for capacity +- the step is re-queued and retried after promotion or timeout - lease is disposed automatically when the step attempt completes -This means step `lock()` is conceptually the same API, but not a literal -"block inside already-running user step code" implementation. +This means step `lock()` is conceptually the same API, but it is not a literal +"spin inside already-running user step code until capacity appears" +implementation. ### 6. `await using` is the preferred user-facing shape @@ -157,7 +161,7 @@ For workflows, `await using` must be tied to the logical workflow scope across: The lease must not be disposed merely because one host process invocation ends. -### 8. 
Prefer step-boundary admission for deadlock avoidance Current preferred model: @@ -174,7 +178,55 @@ This keeps the dependency direction one-way: That avoids the classic cycle where one workflow holds a workflow lock and another holds a step lock and each waits on the other. -### 9. V1 semantics are intentionally opinionated +### 9. Waiters are FIFO per key + +The PostgreSQL implementation uses a durable waiter queue and promotes waiters +in FIFO order for a single limit key. + +Important details: + +- FIFO is per key, not global across all limit keys +- promotion order is based on waiter creation order +- a waiter may be skipped if it is no longer eligible when promotion runs +- releasing a lease or reclaiming an expired lease can both trigger promotion +- rate-window expiry can also make the head waiter eligible again + +This gives deterministic and inspectable fairness for a key without requiring a +global scheduler. + +### 10. Blocked limits do not consume worker concurrency + +Blocked flow limits and worker concurrency are intentionally separate. + +In the PostgreSQL world: + +- blocked workflows are suspended and re-queued, not left running on a worker +- blocked steps exit the current attempt and are re-queued instead of polling in + a live worker slot +- backlog remains durable in PostgreSQL while worker slots are free to service + unrelated work + +This is the main practical difference between a durable waiter model and a pure +polling loop. + +### 11. Wake-up is prompt, with a delayed fallback + +The PostgreSQL world uses Graphile for wake-up delivery, but PostgreSQL tables +remain the source of truth for limit state. 
+ +Current behavior: + +- leases, rate tokens, and waiters live in PostgreSQL tables +- promotion decisions are made from SQL state +- when a waiter is promoted, the runtime is woken by enqueuing the appropriate + workflow or step job +- workflows also keep a delayed replay fallback so progress is still possible if + an immediate wake-up is missed + +This means Graphile is used to resume work quickly, not to decide fairness or +capacity ownership. + +### 12. V1 semantics are intentionally opinionated + For v1, the intended semantics are: @@ -197,9 +249,8 @@ More concretely: For the current local implementation specifically: - workflow locks already behave like durable logical-scope leases -- step locks currently use in-process retry polling once the step is already - executing, which is acceptable for local v1 but not the ideal long-term - admission model +- step locks are still simpler than in the Postgres world and do not provide + the same durable waiter/wake-up behavior This means the current v1 interpretation of a workflow lock is: @@ -268,13 +319,19 @@ For example: So overall system throughput is not one simple global minimum. Different workflow paths may be bottlenecked by different limits at different times. +Two more practical clarifications: + +- a blocked workflow lock should not monopolize + `WORKFLOW_POSTGRES_WORKER_CONCURRENCY` or + `WORKFLOW_LOCAL_QUEUE_CONCURRENCY` just because it is waiting +- a released concurrency lease frees concurrency immediately, but associated + rate usage remains counted until its token ages out of the rate window + ## Open Questions -- Exact runtime/compiler behavior for step-scoped `lock()` hoisting. - Whether workflow-level locks should always be whole-run admission locks or also support narrower workflow-scoped blocks. - Whether `heartbeat()` should remain user-visible or become mostly internal. - Whether step limits should only be expressed through `lock()` or also through step metadata/config sugar. 
-- Fairness/wake-up policy for waiters per key in local and Postgres worlds. - Exact event-log representation for acquire/block/dispose transitions. diff --git a/workbench/example/workflows/99_e2e.ts b/workbench/example/workflows/99_e2e.ts index 13dcd7cb0b..985e9331e4 100644 --- a/workbench/example/workflows/99_e2e.ts +++ b/workbench/example/workflows/99_e2e.ts @@ -261,6 +261,53 @@ export async function workflowWithWorkflowAndStepLocks(userId = 'user-123') { }; } +async function serializedLimitStep(label: string, holdMs: number) { + 'use step'; + + const stepLock = await lock({ + key: 'step:db:serialized', + concurrency: { max: 1 }, + leaseTtlMs: holdMs + 5_000, + }); + + const acquiredAt = Date.now(); + await new Promise((resolve) => setTimeout(resolve, holdMs)); + await stepLock.dispose(); + const releasedAt = Date.now(); + + return { + label, + acquiredAt, + releasedAt, + }; +} + +export async function workflowLockContentionWorkflow( + userId = 'user-123', + holdMs = 750 +) { + 'use workflow'; + + const workflowLock = await lock({ + key: `workflow:user:${userId}`, + concurrency: { max: 1 }, + leaseTtlMs: holdMs + 5_000, + }); + + const workflowLockAcquiredAt = Date.now(); + const step = await serializedLimitStep(userId, holdMs); + await workflowLock.dispose(); + const workflowLockReleasedAt = Date.now(); + + return { + userId, + workflowLockAcquiredAt, + workflowLockReleasedAt, + stepLockAcquiredAt: step.acquiredAt, + stepLockReleasedAt: step.releasedAt, + }; +} + ////////////////////////////////////////////////////////// async function nullByteStep() { From 1677f3dcf229a6125455a19f307eaae9da81baf6 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Wed, 18 Mar 2026 16:47:14 -0400 Subject: [PATCH 06/16] pg limits implementation Signed-off-by: nathancolosimo --- .../core/src/runtime/suspension-handler.ts | 4 + packages/core/src/step/lock.ts | 4 + packages/core/src/workflow/lock.ts | 5 + packages/world-postgres/src/limits.ts | 604 +++++++++++++++++- 4 files 
changed, 608 insertions(+), 9 deletions(-) diff --git a/packages/core/src/runtime/suspension-handler.ts b/packages/core/src/runtime/suspension-handler.ts index dea2a50b5f..eee6439556 100644 --- a/packages/core/src/runtime/suspension-handler.ts +++ b/packages/core/src/runtime/suspension-handler.ts @@ -316,6 +316,10 @@ export async function handleSuspension({ for (const queueItem of limitWaitItems) { ops.push( (async () => { + /* + Lock waits are runtime control flow, not user-visible wait events. + We only enqueue a fallback replay here; promoted waiters can replace it. + */ const delayMs = Math.max( 1000, queueItem.resumeAt.getTime() - Date.now() diff --git a/packages/core/src/step/lock.ts b/packages/core/src/step/lock.ts index fc3901f986..6aa59132a5 100644 --- a/packages/core/src/step/lock.ts +++ b/packages/core/src/step/lock.ts @@ -83,6 +83,10 @@ export function createStepLock(world: World) { return createStepLockHandle(result.lease, world); } + /* + Steps do not sit inside user code polling for a lease. + The runtime catches this and re-queues the step attempt at the boundary. + */ throw new StepLockBlockedError(result.retryAfterMs); }; } diff --git a/packages/core/src/workflow/lock.ts b/packages/core/src/workflow/lock.ts index 8f284d1003..f0905e06e9 100644 --- a/packages/core/src/workflow/lock.ts +++ b/packages/core/src/workflow/lock.ts @@ -63,6 +63,11 @@ function createLockHandle( export function createLock(ctx: WorkflowOrchestratorContext) { return async function lockImpl(options: LockOptions): Promise { + /* + Blocked workflow locks suspend the workflow turn instead of creating a real + wait event. Postgres can wake this correlation id early when the waiter is + promoted, and the delayed replay is just a fallback. 
+ */ const correlationId = `wflock_wait_${ctx.generateUlid()}`; const holderId = `wflock_${ctx.runId}:${correlationId}:${ctx.generateUlid()}`; const definition = { diff --git a/packages/world-postgres/src/limits.ts b/packages/world-postgres/src/limits.ts index 01e8184c79..037b57d66a 100644 --- a/packages/world-postgres/src/limits.ts +++ b/packages/world-postgres/src/limits.ts @@ -1,20 +1,606 @@ -import { createLimitsNotImplementedError, type Limits } from '@workflow/world'; +import { JsonTransport } from '@vercel/queue'; +import { and, asc, eq, isNotNull, lte, sql } from 'drizzle-orm'; +import { WorkflowAPIError } from '@workflow/errors'; +import { + LimitAcquireRequestSchema, + type LimitAcquireResult, + LimitHeartbeatRequestSchema, + type LimitLease, + LimitReleaseRequestSchema, + type Limits, + MessageId, +} from '@workflow/world'; +import { monotonicFactory } from 'ulid'; import type { PostgresWorldConfig } from './config.js'; import type { Drizzle } from './drizzle/index.js'; +import * as Schema from './drizzle/schema.js'; +import { MessageData } from './message.js'; + +type LeaseRow = typeof Schema.limitLeases.$inferSelect; +type TokenRow = typeof Schema.limitTokens.$inferSelect; +type WaiterRow = typeof Schema.limitWaiters.$inferSelect; +type RunRow = Pick< + typeof Schema.runs.$inferSelect, + 'workflowName' | 'startedAt' | 'status' +>; +type StepRow = Pick; +type Tx = Parameters[0]>[0]; +type Db = Drizzle | Tx; + +type HolderTarget = + | { + kind: 'workflow'; + runId: string; + correlationId: string; + } + | { + kind: 'step'; + runId: string; + stepId: string; + } + | { + kind: 'opaque'; + }; + +const transport = new JsonTransport(); +const generateId = monotonicFactory(); + +function getQueues(config: PostgresWorldConfig) { + const prefix = config.jobPrefix || 'workflow_'; + return { + workflow: `${prefix}flows`, + step: `${prefix}steps`, + } as const; +} + +function nowPlus(ms?: number): Date | undefined { + if (ms === undefined) return undefined; + 
return new Date(Date.now() + ms); +} + +function toDate(value: Date | string | null | undefined): Date | undefined { + if (value === null || value === undefined) return undefined; + return value instanceof Date ? value : new Date(value); +} + +function toMillis(value: Date | string | null | undefined): number | undefined { + const date = toDate(value); + return date ? date.getTime() : undefined; +} + +/* +Holder ids double as wake-up hints. +When a waiter is promoted, we decode the holder id to decide which queue to poke. +*/ +function parseHolderId(holderId: string): HolderTarget { + if (holderId.startsWith('wflock_')) { + const [runId, correlationId] = holderId.slice('wflock_'.length).split(':'); + if (runId && correlationId) { + return { kind: 'workflow', runId, correlationId }; + } + } + + if (holderId.startsWith('stplock_')) { + const [runId, stepId] = holderId.slice('stplock_'.length).split(':'); + if (runId && stepId) { + return { kind: 'step', runId, stepId }; + } + } + + return { kind: 'opaque' }; +} + +function toLease(row: LeaseRow): LimitLease { + return { + leaseId: row.leaseId, + key: row.limitKey, + holderId: row.holderId, + acquiredAt: toDate(row.acquiredAt)!, + expiresAt: toDate(row.expiresAt), + definition: { + concurrency: + row.concurrencyMax !== null ? { max: row.concurrencyMax } : undefined, + rate: + row.rateCount !== null && row.ratePeriodMs !== null + ? { + count: row.rateCount, + periodMs: row.ratePeriodMs, + } + : undefined, + }, + }; +} + +function getBlockedReason( + concurrencyBlocked: boolean, + rateBlocked: boolean +): 'concurrency' | 'rate' | 'concurrency_and_rate' { + if (concurrencyBlocked && rateBlocked) return 'concurrency_and_rate'; + if (concurrencyBlocked) return 'concurrency'; + return 'rate'; +} + +/* +When a workflow or step is blocked, we need to calculate the retry after time. +We do this by finding the earliest expiration time for any leases or tokens. 
+*/ +function getRetryAfterMs( + leases: LeaseRow[], + tokens: TokenRow[], + now: number, + concurrencyBlocked: boolean, + rateBlocked: boolean +): number | undefined { + const candidates: number[] = []; + + if (concurrencyBlocked) { + for (const lease of leases) { + if (lease.expiresAt) { + candidates.push(Math.max(0, toMillis(lease.expiresAt)! - now)); + } + } + } + + if (rateBlocked) { + for (const token of tokens) { + candidates.push(Math.max(0, toMillis(token.expiresAt)! - now)); + } + } + + if (candidates.length === 0) return undefined; + return Math.min(...candidates); +} + +async function queueWorkflowWake( + tx: Db, + config: PostgresWorldConfig, + runId: string, + workflowName: string, + idempotencyKey: string +) { + const messageId = MessageId.parse(`msg_${generateId()}`); + const payload = MessageData.encode({ + id: workflowName, + data: Buffer.from( + transport.serialize({ + runId, + requestedAt: new Date(), + }) + ), + attempt: 1, + idempotencyKey, + messageId, + }); + + await tx.execute(sql` + select graphile_worker.add_job( + ${getQueues(config).workflow}::text, + payload := ${JSON.stringify(payload)}::json, + max_attempts := 3, + job_key := ${idempotencyKey}::text, + job_key_mode := 'replace' + ) + `); +} + +async function queueStepWake( + tx: Db, + config: PostgresWorldConfig, + step: { + stepId: string; + stepName: string; + workflowName: string; + workflowStartedAt: number; + workflowRunId: string; + } +) { + const messageId = MessageId.parse(`msg_${generateId()}`); + const payload = MessageData.encode({ + id: step.stepName, + data: Buffer.from( + transport.serialize({ + workflowName: step.workflowName, + workflowRunId: step.workflowRunId, + workflowStartedAt: step.workflowStartedAt, + stepId: step.stepId, + requestedAt: new Date(), + }) + ), + attempt: 1, + idempotencyKey: step.stepId, + messageId, + }); + + await tx.execute(sql` + select graphile_worker.add_job( + ${getQueues(config).step}::text, + payload := ${JSON.stringify(payload)}::json, 
+ max_attempts := 3, + job_key := ${step.stepId}::text, + job_key_mode := 'replace' + ) + `); +} + +async function queueWakeForHolder( + tx: Db, + config: PostgresWorldConfig, + holderId: string +) { + /* + Limit state is durable in Postgres, but wake-ups still need a runtime target. + If the run or step is already terminal, there is nothing left to resume. + */ + const target = parseHolderId(holderId); + if (target.kind === 'opaque') { + return; + } + + if (target.kind === 'workflow') { + const [run] = (await tx + .select({ + workflowName: Schema.runs.workflowName, + startedAt: Schema.runs.startedAt, + status: Schema.runs.status, + }) + .from(Schema.runs) + .where(eq(Schema.runs.runId, target.runId)) + .limit(1)) as RunRow[]; + + if (!run || ['completed', 'failed', 'cancelled'].includes(run.status)) { + return; + } + + await queueWorkflowWake( + tx, + config, + target.runId, + run.workflowName, + target.correlationId + ); + return; + } + + const [step] = (await tx + .select({ + stepName: Schema.steps.stepName, + status: Schema.steps.status, + }) + .from(Schema.steps) + .where(eq(Schema.steps.stepId, target.stepId)) + .limit(1)) as StepRow[]; + if (!step || ['completed', 'failed'].includes(step.status)) { + return; + } + + const [run] = (await tx + .select({ + workflowName: Schema.runs.workflowName, + startedAt: Schema.runs.startedAt, + status: Schema.runs.status, + }) + .from(Schema.runs) + .where(eq(Schema.runs.runId, target.runId)) + .limit(1)) as RunRow[]; + if (!run || ['completed', 'failed', 'cancelled'].includes(run.status)) { + return; + } + + await queueStepWake(tx, config, { + stepId: target.stepId, + stepName: step.stepName, + workflowName: run.workflowName, + workflowStartedAt: toMillis(run.startedAt) ?? Date.now(), + workflowRunId: target.runId, + }); +} + +async function pruneExpired(tx: Db, key: string): Promise { + /* + Capacity is reclaimed opportunistically whenever a key is touched. 
+ This keeps v1 simple and avoids needing a separate cleanup worker. + */ + const now = new Date(); + + await tx + .delete(Schema.limitTokens) + .where( + and( + eq(Schema.limitTokens.limitKey, key), + lte(Schema.limitTokens.expiresAt, now) + ) + ); + + await tx + .delete(Schema.limitLeases) + .where( + and( + eq(Schema.limitLeases.limitKey, key), + isNotNull(Schema.limitLeases.expiresAt), + lte(Schema.limitLeases.expiresAt, now) + ) + ); +} + +async function getActiveState( + tx: Db, + key: string +): Promise<{ + leases: LeaseRow[]; + tokens: TokenRow[]; + waiters: WaiterRow[]; +}> { + const [leases, tokens, waiters] = await Promise.all([ + tx + .select() + .from(Schema.limitLeases) + .where(eq(Schema.limitLeases.limitKey, key)) + .orderBy( + asc(Schema.limitLeases.acquiredAt), + asc(Schema.limitLeases.leaseId) + ), + tx + .select() + .from(Schema.limitTokens) + .where(eq(Schema.limitTokens.limitKey, key)) + .orderBy(asc(Schema.limitTokens.expiresAt)), + tx + .select() + .from(Schema.limitWaiters) + .where(eq(Schema.limitWaiters.limitKey, key)) + .orderBy( + asc(Schema.limitWaiters.createdAt), + asc(Schema.limitWaiters.waiterId) + ), + ]); + + return { leases, tokens, waiters }; +} + +async function promoteWaiters( + tx: Db, + config: PostgresWorldConfig, + key: string +): Promise { + /* + We walk waiters in FIFO order and stop at the first waiter that is still blocked. + Later waiters cannot jump ahead of an earlier waiter for the same key. 
(getActiveState returns waiters in FIFO order) + */ + const state = await getActiveState(tx, key); + let activeLeases = state.leases.length; + let activeTokens = state.tokens.length; + + for (const waiter of state.waiters) { + const concurrencyBlocked = + waiter.concurrencyMax !== null && activeLeases >= waiter.concurrencyMax; + const rateBlocked = + waiter.rateCount !== null && activeTokens >= waiter.rateCount; + + if (concurrencyBlocked || rateBlocked) { + break; + } + + const leaseId = `lmt_${generateId()}`; + const expiresAt = nowPlus(waiter.leaseTtlMs ?? undefined); + const [lease] = await tx + .insert(Schema.limitLeases) + .values({ + leaseId, + limitKey: key, + holderId: waiter.holderId, + acquiredAt: new Date(), + expiresAt, + concurrencyMax: waiter.concurrencyMax, + rateCount: waiter.rateCount, + ratePeriodMs: waiter.ratePeriodMs, + }) + .onConflictDoNothing() + .returning(); + + const acquiredLease = + lease ?? + (await tx.query.limitLeases.findFirst({ + where: and( + eq(Schema.limitLeases.limitKey, key), + eq(Schema.limitLeases.holderId, waiter.holderId) + ), + })); + + if (!acquiredLease) { + continue; + } + + if (waiter.rateCount !== null && waiter.ratePeriodMs !== null) { + await tx.insert(Schema.limitTokens).values({ + tokenId: `lmttok_${generateId()}`, + limitKey: key, + holderId: waiter.holderId, + acquiredAt: new Date(), + expiresAt: new Date(Date.now() + waiter.ratePeriodMs), + }); + activeTokens += 1; + } + + await tx + .delete(Schema.limitWaiters) + .where(eq(Schema.limitWaiters.waiterId, waiter.waiterId)); + + activeLeases += 1; + await queueWakeForHolder(tx, config, acquiredLease.holderId); + } +} export function createLimits( - _config: PostgresWorldConfig, - _drizzle: Drizzle + config: PostgresWorldConfig, + drizzle: Drizzle ): Limits { return { - async acquire() { - throw createLimitsNotImplementedError(); + async acquire(request) { + const parsed = LimitAcquireRequestSchema.parse(request); + + return drizzle.transaction(async (tx) => { + 
// Prune expired leases and tokens, promote pre-existing waiters before attempting to acquire a new lease or token. + await pruneExpired(tx, parsed.key); + await promoteWaiters(tx, config, parsed.key); + + const state = await getActiveState(tx, parsed.key); + const existingLease = state.leases.find( + (lease) => lease.holderId === parsed.holderId + ); + if (existingLease) { + return { + status: 'acquired', + lease: toLease(existingLease), + } satisfies LimitAcquireResult; + } + + const existingWaiter = state.waiters.find( + (waiter) => waiter.holderId === parsed.holderId + ); + // If there are already waiters for this key and holder no need to queue a new waiter. + if (existingWaiter) { + const now = Date.now(); + return { + status: 'blocked', + reason: getBlockedReason( + parsed.definition.concurrency !== undefined, + parsed.definition.rate !== undefined + ), + retryAfterMs: + getRetryAfterMs( + state.leases, + state.tokens, + now, + parsed.definition.concurrency !== undefined, + parsed.definition.rate !== undefined + ) ?? 1000, + } satisfies LimitAcquireResult; + } + + const concurrencyBlocked = + parsed.definition.concurrency !== undefined && + state.leases.length >= parsed.definition.concurrency.max; + const rateBlocked = + parsed.definition.rate !== undefined && + state.tokens.length >= parsed.definition.rate.count; + + // If we are not blocked, and there are no waiters for this key and holder, we can acquire a new lease or token. + if (!concurrencyBlocked && !rateBlocked && state.waiters.length === 0) { + const expiresAt = nowPlus(parsed.leaseTtlMs); + const [lease] = await tx + .insert(Schema.limitLeases) + .values({ + leaseId: `lmt_${generateId()}`, + limitKey: parsed.key, + holderId: parsed.holderId, + acquiredAt: new Date(), + expiresAt, + concurrencyMax: parsed.definition.concurrency?.max ?? null, + rateCount: parsed.definition.rate?.count ?? null, + ratePeriodMs: parsed.definition.rate?.periodMs ?? 
null, + }) + .returning(); + + if (parsed.definition.rate) { + await tx.insert(Schema.limitTokens).values({ + tokenId: `lmttok_${generateId()}`, + limitKey: parsed.key, + holderId: parsed.holderId, + acquiredAt: new Date(), + expiresAt: new Date(Date.now() + parsed.definition.rate.periodMs), + }); + } + + return { + status: 'acquired', + lease: toLease(lease), + } satisfies LimitAcquireResult; + } + + // If we are blocked, we need to queue a waiter. + await tx + .insert(Schema.limitWaiters) + .values({ + waiterId: `lmtwait_${generateId()}`, + limitKey: parsed.key, + holderId: parsed.holderId, + createdAt: new Date(), + leaseTtlMs: parsed.leaseTtlMs ?? null, + concurrencyMax: parsed.definition.concurrency?.max ?? null, + rateCount: parsed.definition.rate?.count ?? null, + ratePeriodMs: parsed.definition.rate?.periodMs ?? null, + }) + .onConflictDoNothing(); + + const now = Date.now(); + return { + status: 'blocked', + reason: getBlockedReason(concurrencyBlocked, rateBlocked), + retryAfterMs: + getRetryAfterMs( + state.leases, + state.tokens, + now, + parsed.definition.concurrency !== undefined, + parsed.definition.rate !== undefined + ) ?? 
1000, + } satisfies LimitAcquireResult; + }); }, - async release() { - throw createLimitsNotImplementedError(); + + async release(request) { + const parsed = LimitReleaseRequestSchema.parse(request); + + await drizzle.transaction(async (tx) => { + let where = eq(Schema.limitLeases.leaseId, parsed.leaseId); + if (parsed.key) { + where = and(where, eq(Schema.limitLeases.limitKey, parsed.key))!; + } + if (parsed.holderId) { + where = and(where, eq(Schema.limitLeases.holderId, parsed.holderId))!; + } + + const [deleted] = await tx + .delete(Schema.limitLeases) + .where(where) + .returning({ limitKey: Schema.limitLeases.limitKey }); + + if (deleted?.limitKey) { + await pruneExpired(tx, deleted.limitKey); + await promoteWaiters(tx, config, deleted.limitKey); + } + }); }, - async heartbeat() { - throw createLimitsNotImplementedError(); + + async heartbeat(request) { + const parsed = LimitHeartbeatRequestSchema.parse(request); + + // Heartbeat a lease to extend its expiry. + return drizzle.transaction(async (tx) => { + const existing = await tx.query.limitLeases.findFirst({ + where: eq(Schema.limitLeases.leaseId, parsed.leaseId), + }); + + if (!existing) { + throw new WorkflowAPIError(`Lease "${parsed.leaseId}" not found`, { + status: 404, + }); + } + + const now = Date.now(); + const currentExpiry = toMillis(existing.expiresAt); + const ttlMs = + parsed.ttlMs ?? (currentExpiry ? 
currentExpiry - now : 30_000); + const expiresAt = new Date(now + Math.max(1, ttlMs)); + + const [updated] = await tx + .update(Schema.limitLeases) + .set({ expiresAt }) + .where(eq(Schema.limitLeases.leaseId, parsed.leaseId)) + .returning(); + + return toLease(updated); + }); }, }; } From 45cd62bbf49c9815ad89a46a322a01de0ef23ba6 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Wed, 18 Mar 2026 17:56:57 -0400 Subject: [PATCH 07/16] DCO Remediation Commit for nathancolosimo I, nathancolosimo , hereby add my Signed-off-by to this commit: 4b918ca431dd22a7343e067f4b2e64f3b0442c1d Signed-off-by: nathancolosimo --- packages/world-postgres/src/limits.test.ts | 36 ++++++++++++++++++++++ packages/world-postgres/src/limits.ts | 26 ++++++++++++++++ packages/world-postgres/test/test-db.ts | 2 +- 3 files changed, 63 insertions(+), 1 deletion(-) diff --git a/packages/world-postgres/src/limits.test.ts b/packages/world-postgres/src/limits.test.ts index bf6ae15e23..01d0605c9b 100644 --- a/packages/world-postgres/src/limits.test.ts +++ b/packages/world-postgres/src/limits.test.ts @@ -42,6 +42,42 @@ if (process.platform === 'win32') { }); describe('postgres waiter promotion', () => { + it('serializes concurrent acquires for the same key', async () => { + const limits = createLimits( + { connectionString: db.connectionString, queueConcurrency: 1 }, + db.drizzle + ); + + const results = await Promise.all( + Array.from({ length: 12 }, (_, index) => + limits.acquire({ + key: 'workflow:user:concurrent', + holderId: `holder-${index}`, + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }) + ) + ); + + const acquired = results.filter((result) => result.status === 'acquired'); + const blocked = results.filter((result) => result.status === 'blocked'); + + expect(acquired).toHaveLength(1); + expect(blocked).toHaveLength(11); + + const leases = await db.drizzle + .select({ holderId: Schema.limitLeases.holderId }) + .from(Schema.limitLeases) + 
.where(eq(Schema.limitLeases.limitKey, 'workflow:user:concurrent')); + const waiters = await db.drizzle + .select({ holderId: Schema.limitWaiters.holderId }) + .from(Schema.limitWaiters) + .where(eq(Schema.limitWaiters.limitKey, 'workflow:user:concurrent')); + + expect(leases).toHaveLength(1); + expect(waiters).toHaveLength(11); + }); + it('promotes the earliest waiter on release', async () => { const limits = createLimits( { connectionString: db.connectionString, queueConcurrency: 1 }, diff --git a/packages/world-postgres/src/limits.ts b/packages/world-postgres/src/limits.ts index 037b57d66a..a3892f72aa 100644 --- a/packages/world-postgres/src/limits.ts +++ b/packages/world-postgres/src/limits.ts @@ -357,6 +357,16 @@ async function getActiveState( return { leases, tokens, waiters }; } +/* +We serialize limit mutations per key inside the transaction so concurrent +acquire/release flows cannot both observe the same free capacity. +*/ +async function lockLimitKey(tx: Db, key: string): Promise { + await tx.execute( + sql`select pg_advisory_xact_lock(hashtextextended(${key}, 0))` + ); +} + async function promoteWaiters( tx: Db, config: PostgresWorldConfig, @@ -439,6 +449,7 @@ export function createLimits( const parsed = LimitAcquireRequestSchema.parse(request); return drizzle.transaction(async (tx) => { + await lockLimitKey(tx, parsed.key); // Prune expired leases and tokens, promote pre-existing waiters before attempting to acquire a new lease or token. await pruneExpired(tx, parsed.key); await promoteWaiters(tx, config, parsed.key); @@ -552,6 +563,19 @@ export function createLimits( const parsed = LimitReleaseRequestSchema.parse(request); await drizzle.transaction(async (tx) => { + const key = + parsed.key ?? 
+ ( + await tx.query.limitLeases.findFirst({ + columns: { limitKey: true }, + where: eq(Schema.limitLeases.leaseId, parsed.leaseId), + }) + )?.limitKey; + + if (key) { + await lockLimitKey(tx, key); + } + let where = eq(Schema.limitLeases.leaseId, parsed.leaseId); if (parsed.key) { where = and(where, eq(Schema.limitLeases.limitKey, parsed.key))!; @@ -587,6 +611,8 @@ export function createLimits( }); } + await lockLimitKey(tx, existing.limitKey); + const now = Date.now(); const currentExpiry = toMillis(existing.expiresAt); const ttlMs = diff --git a/packages/world-postgres/test/test-db.ts b/packages/world-postgres/test/test-db.ts index 2bb21aa380..ef27f70052 100644 --- a/packages/world-postgres/test/test-db.ts +++ b/packages/world-postgres/test/test-db.ts @@ -31,7 +31,7 @@ export async function createPostgresTestDb(): Promise { env: process.env, }); - const sql = postgres(connectionString, { max: 1 }); + const sql = postgres(connectionString, { max: 10 }); const drizzle = createClient(sql); return { From dc85a46607b57a390d97ebb534cbb0d062320373 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Wed, 18 Mar 2026 22:16:06 -0400 Subject: [PATCH 08/16] Add in-step locking support - doesn't hang the step though Signed-off-by: nathancolosimo --- packages/core/e2e/e2e.test.ts | 55 ++++++ .../core/src/runtime/step-handler.test.ts | 155 ++++++++++++++++- packages/core/src/runtime/step-handler.ts | 159 +++++++++++++++++- packages/core/src/step.test.ts | 36 ++++ packages/core/src/step.ts | 7 +- packages/core/src/step/context-storage.ts | 2 + packages/core/src/step/lock.ts | 23 ++- packages/world-local/src/storage.test.ts | 77 +++++++++ .../world-local/src/storage/events-storage.ts | 32 +++- packages/world-postgres/src/limits.test.ts | 158 +++++++++++++++++ packages/world-postgres/src/limits.ts | 47 ++++++ packages/world-postgres/src/storage.ts | 61 ++++++- packages/world-postgres/test/storage.test.ts | 77 +++++++++ packages/world/FLOW_LIMITS.md | 8 +- 
packages/world/src/events.ts | 17 ++ workbench/example/workflows/99_e2e.ts | 32 ++++ 16 files changed, 926 insertions(+), 20 deletions(-) diff --git a/packages/core/e2e/e2e.test.ts b/packages/core/e2e/e2e.test.ts index e4d4379259..a1979f1bc6 100644 --- a/packages/core/e2e/e2e.test.ts +++ b/packages/core/e2e/e2e.test.ts @@ -612,6 +612,61 @@ describe('e2e', () => { ); } + test( + 'stepLockNoRetriesContentionWorkflow does not consume retries while blocked on a step lock', + { timeout: 60_000 }, + async () => { + const workflow = await e2e('stepLockNoRetriesContentionWorkflow'); + const runA = await start(workflow, ['shared-user', 750]); + await sleep(100); + const runB = await start(workflow, ['shared-user', 750]); + + const [resultA, resultB] = await Promise.all([ + runA.returnValue, + runB.returnValue, + ]); + const [firstResult, secondResult] = [resultA, resultB].sort( + (left, right) => left.acquiredAt - right.acquiredAt + ); + + expect(resultA.attempt).toBe(1); + expect(resultB.attempt).toBe(1); + expect(secondResult.acquiredAt).toBeGreaterThanOrEqual( + firstResult.releasedAt + ); + } + ); + + if (isPostgresWorld) { + test( + 'cancelled workflow waiters are skipped before the next waiter is promoted', + { timeout: 60_000 }, + async () => { + const workflow = await e2e('workflowLockContentionWorkflow'); + const runA = await start(workflow, ['shared-user', 500]); + await sleep(100); + const runB = await start(workflow, ['shared-user', 500]); + await sleep(200); + await cliCancel(runB.runId); + const cancelledError = await runB.returnValue.catch((error) => error); + const runC = await start(workflow, ['shared-user', 500]); + + const [resultA, resultC] = await Promise.all([ + runA.returnValue, + runC.returnValue, + ]); + + expect(cancelledError).toBeInstanceOf(WorkflowRunCancelledError); + expect(resultC.workflowLockAcquiredAt).toBeGreaterThanOrEqual( + resultA.workflowLockReleasedAt + ); + expect( + resultC.workflowLockAcquiredAt - resultA.workflowLockReleasedAt + 
).toBeLessThan(4_000); + } + ); + } + test('nullByteWorkflow', { timeout: 60_000 }, async () => { const run = await start(await e2e('nullByteWorkflow'), []); const returnValue = await run.returnValue; diff --git a/packages/core/src/runtime/step-handler.test.ts b/packages/core/src/runtime/step-handler.test.ts index ee4df5ea88..78050bc819 100644 --- a/packages/core/src/runtime/step-handler.test.ts +++ b/packages/core/src/runtime/step-handler.test.ts @@ -6,9 +6,14 @@ import { StepLockBlockedError } from '../step/lock.js'; const { capturedHandlerRef, mockEventsCreate, + mockEventsListByCorrelationId, + mockLimitsAcquire, + mockLimitsHeartbeat, + mockLimitsRelease, mockQueue, mockRuntimeLogger, mockStepLogger, + mockStepGet, mockQueueMessage, mockStepFn, } = vi.hoisted(() => { @@ -20,6 +25,14 @@ const { current: null as null | ((...args: unknown[]) => Promise), }, mockEventsCreate: vi.fn(), + mockEventsListByCorrelationId: vi.fn().mockResolvedValue({ + data: [], + cursor: null, + hasMore: false, + }), + mockLimitsAcquire: vi.fn(), + mockLimitsHeartbeat: vi.fn(), + mockLimitsRelease: vi.fn().mockResolvedValue(undefined), mockQueue: vi.fn().mockResolvedValue({ messageId: 'msg_test' }), mockRuntimeLogger: { warn: vi.fn(), @@ -34,6 +47,16 @@ const { error: vi.fn(), }, mockQueueMessage: vi.fn().mockResolvedValue(undefined), + mockStepGet: vi.fn().mockResolvedValue({ + stepId: 'step_abc', + runId: 'wrun_test123', + stepName: 'myStep', + status: 'pending', + input: [], + attempt: 0, + createdAt: new Date(), + updatedAt: new Date(), + }), mockStepFn, }; }); @@ -49,7 +72,18 @@ vi.mock('@vercel/functions', () => ({ // Mock the world module - createQueueHandler captures the handler vi.mock('./world.js', () => ({ getWorld: vi.fn(() => ({ - events: { create: mockEventsCreate }, + events: { + create: mockEventsCreate, + listByCorrelationId: mockEventsListByCorrelationId, + }, + limits: { + acquire: mockLimitsAcquire, + heartbeat: mockLimitsHeartbeat, + release: mockLimitsRelease, + 
}, + steps: { + get: mockStepGet, + }, queue: mockQueue, getEncryptionKeyForRun: vi.fn().mockResolvedValue(undefined), })), @@ -204,9 +238,38 @@ describe('step-handler 409 handling', () => { mockStepFn.mockReset().mockResolvedValue('step-result'); mockStepFn.maxRetries = 3; mockQueueMessage.mockResolvedValue(undefined); + mockEventsListByCorrelationId.mockReset().mockResolvedValue({ + data: [], + cursor: null, + hasMore: false, + }); + mockLimitsAcquire.mockReset(); + mockLimitsHeartbeat.mockReset(); + mockLimitsRelease.mockReset().mockResolvedValue(undefined); + mockStepGet.mockReset().mockResolvedValue({ + stepId: 'step_abc', + runId: 'wrun_test123', + stepName: 'myStep', + status: 'pending', + input: [], + attempt: 0, + createdAt: new Date(), + updatedAt: new Date(), + }); // Re-set getWorld mock since clearAllMocks resets it vi.mocked(getWorld).mockReturnValue({ - events: { create: mockEventsCreate }, + events: { + create: mockEventsCreate, + listByCorrelationId: mockEventsListByCorrelationId, + }, + limits: { + acquire: mockLimitsAcquire, + heartbeat: mockLimitsHeartbeat, + release: mockLimitsRelease, + }, + steps: { + get: mockStepGet, + }, queue: mockQueue, getEncryptionKeyForRun: vi.fn().mockResolvedValue(undefined), } as any); @@ -234,7 +297,17 @@ describe('step-handler 409 handling', () => { input: [], }, }); - mockStepFn.mockRejectedValue(new StepLockBlockedError(2_500)); + mockStepFn.mockRejectedValue( + new StepLockBlockedError( + { + key: 'step:db:no-retries', + holderId: 'stplock_wrun_test123:step_abc:0', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 5_000, + }, + 2_500 + ) + ); const result = await capturedHandler( createMessage(), @@ -243,7 +316,83 @@ describe('step-handler 409 handling', () => { expect(result).toEqual({ timeoutSeconds: 3 }); expect(mockQueueMessage).not.toHaveBeenCalled(); + expect(mockEventsCreate).toHaveBeenCalledTimes(2); + expect(mockEventsCreate).toHaveBeenNthCalledWith( + 1, + 'wrun_test123', + 
expect.objectContaining({ + eventType: 'step_started', + }), + expect.anything() + ); + expect(mockEventsCreate).toHaveBeenNthCalledWith( + 2, + 'wrun_test123', + expect.objectContaining({ + eventType: 'step_deferred', + correlationId: 'step_abc', + eventData: { + retryAfter: expect.any(Date), + lockRequest: expect.objectContaining({ + key: expect.any(String), + holderId: 'stplock_wrun_test123:step_abc:0', + }), + }, + }), + expect.anything() + ); + }); + + it('rechecks a deferred lock before step_started and re-defers without running user code', async () => { + mockEventsListByCorrelationId.mockResolvedValue({ + data: [ + { + eventId: 'evnt_1', + runId: 'wrun_test123', + eventType: 'step_deferred', + correlationId: 'step_abc', + eventData: { + retryAfter: new Date(Date.now() - 1_000), + lockRequest: { + key: 'step:db:no-retries', + holderId: 'stplock_wrun_test123:step_abc:0', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 5_000, + }, + }, + createdAt: new Date(), + }, + ], + cursor: null, + hasMore: false, + }); + mockLimitsAcquire.mockResolvedValue({ + status: 'blocked', + reason: 'concurrency', + retryAfterMs: 2_500, + }); + + const result = await capturedHandler( + createMessage(), + createMetadata('myStep') + ); + + expect(result).toEqual({ timeoutSeconds: 3 }); + expect(mockStepFn).not.toHaveBeenCalled(); + expect(mockLimitsAcquire).toHaveBeenCalledWith({ + key: 'step:db:no-retries', + holderId: 'stplock_wrun_test123:step_abc:0', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 5_000, + }); expect(mockEventsCreate).toHaveBeenCalledTimes(1); + expect(mockEventsCreate).toHaveBeenCalledWith( + 'wrun_test123', + expect.objectContaining({ + eventType: 'step_deferred', + }), + expect.anything() + ); }); afterEach(() => { diff --git a/packages/core/src/runtime/step-handler.ts b/packages/core/src/runtime/step-handler.ts index fd3c1292dc..1b2257f256 100644 --- a/packages/core/src/runtime/step-handler.ts +++ b/packages/core/src/runtime/step-handler.ts 
@@ -7,7 +7,12 @@ import { } from '@workflow/errors'; import { pluralize } from '@workflow/utils'; import { getPort } from '@workflow/utils/get-port'; -import { SPEC_VERSION_CURRENT, StepInvokePayloadSchema } from '@workflow/world'; +import { + LimitAcquireRequestSchema, + SPEC_VERSION_CURRENT, + StepInvokePayloadSchema, + type LimitLease, +} from '@workflow/world'; import { importKey } from '../encryption.js'; import { runtimeLogger, stepLogger } from '../logger.js'; import { getStepFunction } from '../private.js'; @@ -43,6 +48,65 @@ import { getWorld, getWorldHandlers } from './world.js'; const DEFAULT_STEP_MAX_RETRIES = 3; +async function getDeferredStepLock( + world: ReturnType, + workflowRunId: string, + stepId: string +) { + let step: Awaited>; + try { + step = await world.steps.get(workflowRunId, stepId); + } catch (error) { + if (WorkflowAPIError.is(error) && error.status === 404) { + return null; + } + throw error; + } + if (step.status !== 'pending') { + return null; + } + + const result = await world.events.listByCorrelationId({ + correlationId: stepId, + pagination: { + limit: 1, + sortOrder: 'desc', + }, + }); + const latestEvent = result.data[0]; + + if ( + !latestEvent || + latestEvent.runId !== workflowRunId || + latestEvent.eventType !== 'step_deferred' || + !latestEvent.eventData.lockRequest + ) { + return null; + } + + return { + step, + lockRequest: LimitAcquireRequestSchema.parse( + latestEvent.eventData.lockRequest + ), + }; +} + +async function releaseUnusedPreAcquiredLocks( + world: ReturnType, + preAcquiredLocks: Record +) { + await Promise.all( + Object.values(preAcquiredLocks).map((lease) => + world.limits.release({ + leaseId: lease.leaseId, + key: lease.key, + holderId: lease.holderId, + }) + ) + ); +} + const stepHandler = getWorldHandlers().createQueueHandler( '__wkf_step_', async (message_, metadata) => { @@ -114,6 +178,56 @@ const stepHandler = getWorldHandlers().createQueueHandler( ...Attribute.StepTracePropagated(!!traceContext), 
}); + const preAcquiredLocks: Record = {}; + const deferredStepLock = await getDeferredStepLock( + world, + workflowRunId, + stepId + ); + if (deferredStepLock) { + const retryAfter = deferredStepLock.step.retryAfter; + if (retryAfter && retryAfter.getTime() > Date.now()) { + const timeoutSeconds = Math.max( + 1, + Math.ceil((retryAfter.getTime() - Date.now()) / 1000) + ); + span?.setAttributes({ + ...Attribute.StepRetryTimeoutSeconds(timeoutSeconds), + }); + return { timeoutSeconds }; + } + + const lockResult = await world.limits.acquire( + deferredStepLock.lockRequest + ); + if (lockResult.status === 'blocked') { + const retryAfterMs = Math.max(1, lockResult.retryAfterMs ?? 1000); + const timeoutSeconds = Math.max( + 1, + Math.ceil(retryAfterMs / 1000) + ); + await world.events.create( + workflowRunId, + { + eventType: 'step_deferred', + specVersion: SPEC_VERSION_CURRENT, + correlationId: stepId, + eventData: { + retryAfter: new Date(Date.now() + retryAfterMs), + lockRequest: deferredStepLock.lockRequest, + }, + }, + { requestId } + ); + span?.setAttributes({ + ...Attribute.StepRetryTimeoutSeconds(timeoutSeconds), + }); + return { timeoutSeconds }; + } + + preAcquiredLocks[lockResult.lease.holderId] = lockResult.lease; + } + // step_started validates state and returns the step entity, so no separate // world.steps.get() call is needed. The server checks: // - Step not in terminal state (returns 409) @@ -140,6 +254,7 @@ const stepHandler = getWorldHandlers().createQueueHandler( } catch (err) { if (WorkflowAPIError.is(err)) { if (err.status === 429) { + await releaseUnusedPreAcquiredLocks(world, preAcquiredLocks); const retryRetryAfter = Math.max( 1, typeof err.retryAfter === 'number' ? 
err.retryAfter : 1 @@ -154,6 +269,7 @@ const stepHandler = getWorldHandlers().createQueueHandler( } // 410 Gone: Workflow has already completed if (err.status === 410) { + await releaseUnusedPreAcquiredLocks(world, preAcquiredLocks); runtimeLogger.info( `Workflow run "${workflowRunId}" has already completed, skipping step "${stepId}": ${err.message}` ); @@ -163,6 +279,7 @@ const stepHandler = getWorldHandlers().createQueueHandler( // 409 Conflict: Step in terminal state (completed/failed/cancelled) // Re-enqueue the workflow to continue processing if (err.status === 409) { + await releaseUnusedPreAcquiredLocks(world, preAcquiredLocks); runtimeLogger.debug( 'Step in terminal state, re-enqueuing workflow', { @@ -194,6 +311,7 @@ const stepHandler = getWorldHandlers().createQueueHandler( // 425 Too Early: retryAfter timestamp not reached yet // Return timeout to queue so it retries later if (err.status === 425) { + await releaseUnusedPreAcquiredLocks(world, preAcquiredLocks); // Parse retryAfter from error response meta const retryAfterStr = (err as any).meta?.retryAfter; const retryAfter = retryAfterStr @@ -413,6 +531,7 @@ const stepHandler = getWorldHandlers().createQueueHandler( closureVars: hydratedInput.closureVars, encryptionKey, lockCounter: 0, + preAcquiredLocks, }, () => stepFn.apply(thisVal, args) ); @@ -427,6 +546,8 @@ const stepHandler = getWorldHandlers().createQueueHandler( } catch (err) { userCodeError = err; userCodeFailed = true; + } finally { + await releaseUnusedPreAcquiredLocks(world, preAcquiredLocks); } const executionTimeMs = Date.now() - executionStartTime; @@ -439,10 +560,12 @@ const stepHandler = getWorldHandlers().createQueueHandler( const err = userCodeError; if (StepLockBlockedError.is(err)) { + const retryAfterMs = Math.max(1, err.retryAfterMs ?? 1000); const timeoutSeconds = Math.max( 1, - Math.ceil((err.retryAfterMs ?? 
1000) / 1000) + Math.ceil(retryAfterMs / 1000) ); + const retryAfter = new Date(Date.now() + retryAfterMs); span?.setAttributes({ ...Attribute.StepRetryTimeoutSeconds(timeoutSeconds), }); @@ -451,6 +574,38 @@ const stepHandler = getWorldHandlers().createQueueHandler( 'step.id': stepId, 'step.name': stepName, }); + try { + await world.events.create( + workflowRunId, + { + eventType: 'step_deferred', + specVersion: SPEC_VERSION_CURRENT, + correlationId: stepId, + eventData: { + retryAfter, + lockRequest: err.request, + }, + }, + { requestId } + ); + } catch (stepDeferredErr) { + if ( + WorkflowAPIError.is(stepDeferredErr) && + stepDeferredErr.status === 409 + ) { + runtimeLogger.info( + 'Tried deferring step, but step has already finished.', + { + workflowRunId, + stepId, + stepName, + message: stepDeferredErr.message, + } + ); + return; + } + throw stepDeferredErr; + } return { timeoutSeconds }; } diff --git a/packages/core/src/step.test.ts b/packages/core/src/step.test.ts index a8f080e0b7..5a0e47af56 100644 --- a/packages/core/src/step.test.ts +++ b/packages/core/src/step.test.ts @@ -412,6 +412,42 @@ describe('createUseStep', () => { expect(ctx.invocationsQueue.size).toBe(1); }); + it('should consume step_deferred event and continue waiting', async () => { + const ctx = setupWorkflowContext([ + { + eventId: 'evnt_0', + runId: 'wrun_123', + eventType: 'step_deferred', + correlationId: 'step_01K11TFZ62YS0YYFDQ3E8B9YCV', + eventData: { + retryAfter: new Date(), + }, + createdAt: new Date(), + }, + ]); + + let workflowErrorReject: (err: Error) => void; + const workflowErrorPromise = new Promise((_, reject) => { + workflowErrorReject = reject; + }); + ctx.onWorkflowError = (err) => { + workflowErrorReject(err); + }; + + const useStep = createUseStep(ctx); + const add = useStep('add'); + + let error: Error | undefined; + try { + await Promise.race([add(1, 2), workflowErrorPromise]); + } catch (err_) { + error = err_ as Error; + } + + 
expect(error).toBeInstanceOf(WorkflowSuspension); + expect(ctx.invocationsQueue.size).toBe(1); + }); + it('should remove queue item when step_completed (terminal state)', async () => { const ctx = setupWorkflowContext([ { diff --git a/packages/core/src/step.ts b/packages/core/src/step.ts index bd45c3008c..3cc9e59ce4 100644 --- a/packages/core/src/step.ts +++ b/packages/core/src/step.ts @@ -96,7 +96,7 @@ export function createUseStep(ctx: WorkflowOrchestratorContext) { return EventConsumerResult.Finished; } queueItem.hasCreatedEvent = true; - // Continue waiting for step_started/step_completed/step_failed events + // Continue waiting for later step lifecycle events. return EventConsumerResult.Consumed; } @@ -112,6 +112,11 @@ export function createUseStep(ctx: WorkflowOrchestratorContext) { return EventConsumerResult.Consumed; } + if (event.eventType === 'step_deferred') { + // Admission was blocked before user work could proceed, so keep waiting. + return EventConsumerResult.Consumed; + } + if (event.eventType === 'step_failed') { // Terminal state - we can remove the invocationQueue item ctx.invocationsQueue.delete(event.correlationId); diff --git a/packages/core/src/step/context-storage.ts b/packages/core/src/step/context-storage.ts index dadb25b132..b63329dd20 100644 --- a/packages/core/src/step/context-storage.ts +++ b/packages/core/src/step/context-storage.ts @@ -1,4 +1,5 @@ import { AsyncLocalStorage } from 'node:async_hooks'; +import type { LimitLease } from '@workflow/world'; import type { CryptoKey } from '../encryption.js'; import type { WorkflowMetadata } from '../workflow/get-workflow-metadata.js'; import type { StepMetadata } from './get-step-metadata.js'; @@ -10,4 +11,5 @@ export const contextStorage = /* @__PURE__ */ new AsyncLocalStorage<{ closureVars?: Record; encryptionKey?: CryptoKey; lockCounter: number; + preAcquiredLocks?: Record; }>(); diff --git a/packages/core/src/step/lock.ts b/packages/core/src/step/lock.ts index 6aa59132a5..b537cc7503 
100644 --- a/packages/core/src/step/lock.ts +++ b/packages/core/src/step/lock.ts @@ -1,14 +1,16 @@ -import type { LimitLease, World } from '@workflow/world'; +import type { LimitAcquireRequest, LimitLease, World } from '@workflow/world'; import type { LockHandle, LockOptions } from '../lock.js'; import { contextStorage } from './context-storage.js'; export class StepLockBlockedError extends Error { retryAfterMs?: number; + request: LimitAcquireRequest; - constructor(retryAfterMs?: number) { + constructor(request: LimitAcquireRequest, retryAfterMs?: number) { super('Step lock blocked'); this.name = 'StepLockBlockedError'; this.retryAfterMs = retryAfterMs; + this.request = request; } static is(value: unknown): value is StepLockBlockedError { @@ -71,13 +73,22 @@ export function createStepLock(world: World) { concurrency: options.concurrency, rate: options.rate, }; - - const result = await world.limits.acquire({ + const request = { key: options.key, holderId, definition, leaseTtlMs: options.leaseTtlMs, - }); + } satisfies LimitAcquireRequest; + + const preAcquiredLease = store.preAcquiredLocks?.[holderId]; + if (preAcquiredLease) { + if (store.preAcquiredLocks) { + delete store.preAcquiredLocks[holderId]; + } + return createStepLockHandle(preAcquiredLease, world); + } + + const result = await world.limits.acquire(request); if (result.status === 'acquired') { return createStepLockHandle(result.lease, world); @@ -87,6 +98,6 @@ export function createStepLock(world: World) { Steps do not sit inside user code polling for a lease. The runtime catches this and re-queues the step attempt at the boundary. 
*/ - throw new StepLockBlockedError(result.retryAfterMs); + throw new StepLockBlockedError(request, result.retryAfterMs); }; } diff --git a/packages/world-local/src/storage.test.ts b/packages/world-local/src/storage.test.ts index 89600b7fa3..b1abcdbce3 100644 --- a/packages/world-local/src/storage.test.ts +++ b/packages/world-local/src/storage.test.ts @@ -2452,6 +2452,83 @@ describe('Storage', () => { }); }); + describe('step_deferred event handling', () => { + let testRunId: string; + + beforeEach(async () => { + const run = await createRun(storage, { + deploymentId: 'deployment-123', + workflowName: 'test-workflow', + input: new Uint8Array(), + }); + testRunId = run.runId; + }); + + it('should roll back the first blocked attempt without recording an error', async () => { + await createStep(storage, testRunId, { + stepId: 'step_deferred_1', + stepName: 'test-step', + input: new Uint8Array(), + }); + await updateStep(storage, testRunId, 'step_deferred_1', 'step_started'); + + const retryAfter = new Date(Date.now() + 5_000); + const result = await storage.events.create(testRunId, { + eventType: 'step_deferred', + correlationId: 'step_deferred_1', + eventData: { + retryAfter, + }, + }); + + expect(result.step).toMatchObject({ + status: 'pending', + attempt: 0, + startedAt: undefined, + retryAfter, + error: undefined, + }); + }); + + it('should preserve the original startedAt after a prior real attempt', async () => { + await createStep(storage, testRunId, { + stepId: 'step_deferred_2', + stepName: 'test-step', + input: new Uint8Array(), + }); + + const started1 = await updateStep( + storage, + testRunId, + 'step_deferred_2', + 'step_started' + ); + await storage.events.create(testRunId, { + eventType: 'step_retrying', + correlationId: 'step_deferred_2', + eventData: { error: 'Temporary failure' }, + }); + await updateStep(storage, testRunId, 'step_deferred_2', 'step_started'); + + const retryAfter = new Date(Date.now() + 5_000); + const result = await 
storage.events.create(testRunId, { + eventType: 'step_deferred', + correlationId: 'step_deferred_2', + eventData: { + retryAfter, + }, + }); + + expect(result.step).toMatchObject({ + status: 'pending', + attempt: 1, + retryAfter, + error: undefined, + }); + expect(result.step?.startedAt).toEqual(started1.startedAt); + }); + }); + describe('run cancellation with in-flight entities', () => { it('should allow in-progress step to complete after run cancelled', async () => { const run = await createRun(storage, { diff --git a/packages/world-local/src/storage/events-storage.ts b/packages/world-local/src/storage/events-storage.ts index 93e176030a..0f741e018b 100644 --- a/packages/world-local/src/storage/events-storage.ts +++ b/packages/world-local/src/storage/events-storage.ts @@ -101,11 +101,15 @@ export function createEventsStorage( ['completed', 'failed', 'cancelled'].includes(status); // Get current run state for validation (if not creating a new run) - // Skip run validation for step_completed and step_retrying - they only operate + // Skip run validation for step_completed, step_deferred, and step_retrying - they only operate // on running steps, and running steps are always allowed to modify regardless // of run state. This optimization saves filesystem reads per step event. let currentRun: WorkflowRun | null = null; - const skipRunValidationEvents = ['step_completed', 'step_retrying']; + const skipRunValidationEvents = [ + 'step_completed', + 'step_deferred', + 'step_retrying', + ]; if ( data.eventType !== 'run_created' && !skipRunValidationEvents.includes(data.eventType) @@ -123,7 +127,7 @@ export function createEventsStorage( // VERSION COMPATIBILITY: Check run spec version // ============================================================ // For events that have fetched the run, check version compatibility. - // Skip for run_created (no existing run) and runtime events (step_completed, step_retrying). 
+ // Skip for run_created (no existing run) and runtime events (step_completed, step_deferred, step_retrying). if (currentRun) { // Check if run requires a newer world version if (requiresNewerWorld(currentRun.specVersion)) { @@ -214,6 +218,7 @@ export function createEventsStorage( 'step_started', 'step_completed', 'step_failed', + 'step_deferred', 'step_retrying', ]; if (stepEvents.includes(data.eventType) && data.correlationId) { @@ -606,6 +611,27 @@ export function createEventsStorage( { overwrite: true } ); } + } else if (data.eventType === 'step_deferred' && 'eventData' in data) { + // step_deferred: returns the step to pending without recording a failure + if (validatedStep) { + const stepCompositeKey = `${effectiveRunId}-${data.correlationId}`; + const rolledBackAttempt = Math.max(0, validatedStep.attempt - 1); + step = { + ...validatedStep, + status: 'pending', + attempt: rolledBackAttempt, + startedAt: + rolledBackAttempt === 0 ? undefined : validatedStep.startedAt, + error: undefined, + retryAfter: data.eventData.retryAfter, + updatedAt: now, + }; + await writeJSON( + taggedPath(basedir, 'steps', stepCompositeKey, tag), + step, + { overwrite: true } + ); + } } else if (data.eventType === 'step_retrying' && 'eventData' in data) { // step_retrying: Sets status back to 'pending', records error // Reuse validatedStep from validation (already read above) diff --git a/packages/world-postgres/src/limits.test.ts b/packages/world-postgres/src/limits.test.ts index 01d0605c9b..e54b9e8010 100644 --- a/packages/world-postgres/src/limits.test.ts +++ b/packages/world-postgres/src/limits.test.ts @@ -143,5 +143,163 @@ if (process.platform === 'win32') { }); expect(stillWaiting.status).toBe('blocked'); }); + + it('skips cancelled workflow waiters before promotion', async () => { + const limits = createLimits( + { connectionString: db.connectionString, queueConcurrency: 1 }, + db.drizzle + ); + + await db.drizzle.insert(Schema.runs).values([ + { + runId: 
'wrun_dead_workflow', + deploymentId: 'deployment-123', + workflowName: 'test-workflow', + status: 'cancelled', + }, + ]); + + const first = await limits.acquire({ + key: 'workflow:user:skip-dead-workflow', + holderId: 'holder-a', + definition: { + concurrency: { max: 1 }, + rate: { count: 2, periodMs: 5_000 }, + }, + leaseTtlMs: 5_000, + }); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') throw new Error('expected acquisition'); + + await limits.acquire({ + key: 'workflow:user:skip-dead-workflow', + holderId: 'wflock_wrun_dead_workflow:limitwait_dead', + definition: { + concurrency: { max: 1 }, + rate: { count: 2, periodMs: 5_000 }, + }, + leaseTtlMs: 5_000, + }); + await limits.acquire({ + key: 'workflow:user:skip-dead-workflow', + holderId: 'holder-live', + definition: { + concurrency: { max: 1 }, + rate: { count: 2, periodMs: 5_000 }, + }, + leaseTtlMs: 5_000, + }); + + await limits.release({ + leaseId: first.lease.leaseId, + holderId: first.lease.holderId, + key: first.lease.key, + }); + + const leases = await db.drizzle + .select({ holderId: Schema.limitLeases.holderId }) + .from(Schema.limitLeases) + .where(eq(Schema.limitLeases.limitKey, first.lease.key)) + .orderBy(asc(Schema.limitLeases.acquiredAt)); + const tokens = await db.drizzle + .select({ holderId: Schema.limitTokens.holderId }) + .from(Schema.limitTokens) + .where(eq(Schema.limitTokens.limitKey, first.lease.key)) + .orderBy(asc(Schema.limitTokens.acquiredAt)); + const waiters = await db.drizzle + .select({ holderId: Schema.limitWaiters.holderId }) + .from(Schema.limitWaiters) + .where(eq(Schema.limitWaiters.limitKey, first.lease.key)) + .orderBy(asc(Schema.limitWaiters.createdAt)); + + expect(leases).toEqual([{ holderId: 'holder-live' }]); + expect(tokens).toEqual([ + { holderId: first.lease.holderId }, + { holderId: 'holder-live' }, + ]); + expect(waiters).toEqual([]); + }); + + it('skips failed step waiters before promotion', async () => { + const limits = 
createLimits( + { connectionString: db.connectionString, queueConcurrency: 1 }, + db.drizzle + ); + + await db.drizzle.insert(Schema.runs).values([ + { + runId: 'wrun_dead_step', + deploymentId: 'deployment-123', + workflowName: 'test-workflow', + status: 'running', + startedAt: new Date(), + }, + { + runId: 'wrun_live_step', + deploymentId: 'deployment-123', + workflowName: 'test-workflow', + status: 'running', + startedAt: new Date(), + }, + ]); + await db.drizzle.insert(Schema.steps).values([ + { + runId: 'wrun_dead_step', + stepId: 'step_dead', + stepName: 'test-step', + status: 'failed', + attempt: 1, + }, + { + runId: 'wrun_live_step', + stepId: 'step_live', + stepName: 'test-step', + status: 'pending', + attempt: 0, + }, + ]); + + const first = await limits.acquire({ + key: 'workflow:user:skip-dead-step', + holderId: 'holder-a', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 5_000, + }); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') throw new Error('expected acquisition'); + + await limits.acquire({ + key: 'workflow:user:skip-dead-step', + holderId: 'stplock_wrun_dead_step:step_dead:0', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 5_000, + }); + await limits.acquire({ + key: 'workflow:user:skip-dead-step', + holderId: 'holder-live', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 5_000, + }); + + await limits.release({ + leaseId: first.lease.leaseId, + holderId: first.lease.holderId, + key: first.lease.key, + }); + + const leases = await db.drizzle + .select({ holderId: Schema.limitLeases.holderId }) + .from(Schema.limitLeases) + .where(eq(Schema.limitLeases.limitKey, first.lease.key)) + .orderBy(asc(Schema.limitLeases.acquiredAt)); + const waiters = await db.drizzle + .select({ holderId: Schema.limitWaiters.holderId }) + .from(Schema.limitWaiters) + .where(eq(Schema.limitWaiters.limitKey, first.lease.key)) + .orderBy(asc(Schema.limitWaiters.createdAt)); + + expect(leases).toEqual([{ holderId: 
'holder-live' }]); + expect(waiters).toEqual([]); + }); }); } diff --git a/packages/world-postgres/src/limits.ts b/packages/world-postgres/src/limits.ts index a3892f72aa..d36be6c695 100644 --- a/packages/world-postgres/src/limits.ts +++ b/packages/world-postgres/src/limits.ts @@ -367,6 +367,46 @@ async function lockLimitKey(tx: Db, key: string): Promise { ); } +async function isHolderLive(tx: Db, holderId: string): Promise { + const target = parseHolderId(holderId); + if (target.kind === 'opaque') { + return true; + } + + if (target.kind === 'workflow') { + const [run] = (await tx + .select({ + status: Schema.runs.status, + }) + .from(Schema.runs) + .where(eq(Schema.runs.runId, target.runId)) + .limit(1)) as Pick[]; + + return !!run && !['completed', 'failed', 'cancelled'].includes(run.status); + } + + const [step] = (await tx + .select({ + status: Schema.steps.status, + }) + .from(Schema.steps) + .where(eq(Schema.steps.stepId, target.stepId)) + .limit(1)) as Pick[]; + if (!step || ['completed', 'failed'].includes(step.status)) { + return false; + } + + const [run] = (await tx + .select({ + status: Schema.runs.status, + }) + .from(Schema.runs) + .where(eq(Schema.runs.runId, target.runId)) + .limit(1)) as Pick[]; + + return !!run && !['completed', 'failed', 'cancelled'].includes(run.status); +} + async function promoteWaiters( tx: Db, config: PostgresWorldConfig, @@ -381,6 +421,13 @@ async function promoteWaiters( let activeTokens = state.tokens.length; for (const waiter of state.waiters) { + if (!(await isHolderLive(tx, waiter.holderId))) { + await tx + .delete(Schema.limitWaiters) + .where(eq(Schema.limitWaiters.waiterId, waiter.waiterId)); + continue; + } + const concurrencyBlocked = waiter.concurrencyMax !== null && activeLeases >= waiter.concurrencyMax; const rateBlocked = diff --git a/packages/world-postgres/src/storage.ts b/packages/world-postgres/src/storage.ts index 8d59f929af..cdffba87a9 100644 --- a/packages/world-postgres/src/storage.ts +++ 
b/packages/world-postgres/src/storage.ts @@ -354,12 +354,16 @@ export function createEventsStorage(drizzle: Drizzle): Storage['events'] { // ============================================================ // Get current run state for validation (if not creating a new run) - // Skip run validation for step_completed and step_retrying - they only operate + // Skip run validation for step_completed, step_deferred, and step_retrying - they only operate // on running steps, and running steps are always allowed to modify regardless // of run state. This optimization saves database queries per step event. let currentRun: { status: string; specVersion: number | null } | null = null; - const skipRunValidationEvents = ['step_completed', 'step_retrying']; + const skipRunValidationEvents = [ + 'step_completed', + 'step_deferred', + 'step_retrying', + ]; if ( data.eventType !== 'run_created' && !skipRunValidationEvents.includes(data.eventType) @@ -375,7 +379,7 @@ export function createEventsStorage(drizzle: Drizzle): Storage['events'] { // VERSION COMPATIBILITY: Check run spec version // ============================================================ // For events that have fetched the run, check version compatibility. - // Skip for run_created (no existing run) and runtime events (step_completed, step_retrying). + // Skip for run_created (no existing run) and runtime events (step_completed, step_deferred, step_retrying). 
if (currentRun) { // Check if run requires a newer world version if (requiresNewerWorld(currentRun.specVersion)) { @@ -472,7 +476,11 @@ export function createEventsStorage(drizzle: Drizzle): Storage['events'] { startedAt: Date | null; retryAfter: Date | null; } | null = null; - const stepEventsNeedingValidation = ['step_started', 'step_retrying']; + const stepEventsNeedingValidation = [ + 'step_started', + 'step_deferred', + 'step_retrying', + ]; if ( stepEventsNeedingValidation.includes(data.eventType) && data.correlationId @@ -928,6 +936,51 @@ export function createEventsStorage(drizzle: Drizzle): Storage['events'] { } } + // Handle step_deferred event: returns the step to pending without recording a failure + if (data.eventType === 'step_deferred') { + const eventData = (data as any).eventData as { + retryAfter?: Date; + }; + + const [stepValue] = await drizzle + .update(Schema.steps) + .set({ + status: 'pending', + attempt: sql`GREATEST(${Schema.steps.attempt} - 1, 0)`, + startedAt: sql`CASE WHEN ${Schema.steps.attempt} <= 1 THEN NULL ELSE ${Schema.steps.startedAt} END`, + error: null, + retryAfter: eventData.retryAfter, + }) + .where( + and( + eq(Schema.steps.runId, effectiveRunId), + eq(Schema.steps.stepId, data.correlationId!), + notInArray(Schema.steps.status, terminalStepStatuses) + ) + ) + .returning(); + if (stepValue) { + step = deserializeStepError(compact(stepValue)); + } else { + const [existing] = await getStepForValidation.execute({ + runId: effectiveRunId, + stepId: data.correlationId!, + }); + if (!existing) { + throw new WorkflowAPIError( + `Step "${data.correlationId}" not found`, + { status: 404 } + ); + } + if (isStepTerminal(existing.status)) { + throw new WorkflowAPIError( + `Cannot modify step in terminal state "${existing.status}"`, + { status: 409 } + ); + } + } + } + // Handle step_retrying event: sets status back to 'pending', records error // Uses conditional UPDATE to prevent retrying an already-terminal step. 
if (data.eventType === 'step_retrying') { diff --git a/packages/world-postgres/test/storage.test.ts b/packages/world-postgres/test/storage.test.ts index 5a59b99cde..30d9e7cbb2 100644 --- a/packages/world-postgres/test/storage.test.ts +++ b/packages/world-postgres/test/storage.test.ts @@ -1807,6 +1807,83 @@ describe('Storage (Postgres integration)', () => { }); }); + describe('step_deferred event handling', () => { + let testRunId: string; + + beforeEach(async () => { + const run = await createRun(events, { + deploymentId: 'deployment-123', + workflowName: 'test-workflow', + input: new Uint8Array(), + }); + testRunId = run.runId; + }); + + it('should roll back the first blocked attempt without recording an error', async () => { + await createStep(events, testRunId, { + stepId: 'step_deferred_1', + stepName: 'test-step', + input: new Uint8Array(), + }); + await updateStep(events, testRunId, 'step_deferred_1', 'step_started'); + + const retryAfter = new Date(Date.now() + 5_000); + const result = await events.create(testRunId, { + eventType: 'step_deferred', + correlationId: 'step_deferred_1', + eventData: { + retryAfter, + }, + }); + + expect(result.step).toMatchObject({ + status: 'pending', + attempt: 0, + startedAt: undefined, + retryAfter, + error: undefined, + }); + }); + + it('should preserve the original startedAt after a prior real attempt', async () => { + await createStep(events, testRunId, { + stepId: 'step_deferred_2', + stepName: 'test-step', + input: new Uint8Array(), + }); + + const started1 = await updateStep( + events, + testRunId, + 'step_deferred_2', + 'step_started' + ); + await events.create(testRunId, { + eventType: 'step_retrying', + correlationId: 'step_deferred_2', + eventData: { error: 'Temporary failure' }, + }); + await updateStep(events, testRunId, 'step_deferred_2', 'step_started'); + + const retryAfter = new Date(Date.now() + 5_000); + const result = await events.create(testRunId, { + eventType: 'step_deferred', + correlationId: 
'step_deferred_2', + eventData: { + retryAfter, + }, + }); + + expect(result.step).toMatchObject({ + status: 'pending', + attempt: 1, + retryAfter, + error: undefined, + }); + expect(result.step?.startedAt).toEqual(started1.startedAt); + }); + }); + describe('run cancellation with in-flight entities', () => { it('should allow in-progress step to complete after run cancelled', async () => { const run = await createRun(events, { diff --git a/packages/world/FLOW_LIMITS.md b/packages/world/FLOW_LIMITS.md index 769f78ba74..8306576d8a 100644 --- a/packages/world/FLOW_LIMITS.md +++ b/packages/world/FLOW_LIMITS.md @@ -127,6 +127,11 @@ The current behavior is: - the step is re-queued and retried after promotion or timeout - lease is disposed automatically when the step attempt completes +Important caveat: + +- zero-attempt semantics are only guaranteed when `lock()` is used as a top-of-step admission gate +- calling `lock()` after side effects or meaningful user work is unsupported/best-effort + This means step `lock()` is conceptually the same API, but it is not a literal "spin inside already-running user step code until capacity appears" implementation. 
@@ -187,7 +192,8 @@ Important details: - FIFO is per key, not global across all limit keys - promotion order is based on waiter creation order -- a waiter may be skipped if it is no longer eligible when promotion runs +- dead or terminal waiters are pruned before promotion +- a live waiter may still be skipped if it is no longer eligible when promotion runs - releasing a lease or reclaiming an expired lease can both trigger promotion - rate-window expiry can also make the head waiter eligible again diff --git a/packages/world/src/events.ts b/packages/world/src/events.ts index 2965906f7b..eac141c1f7 100644 --- a/packages/world/src/events.ts +++ b/packages/world/src/events.ts @@ -1,4 +1,5 @@ import { z } from 'zod'; +import { LimitAcquireRequestSchema } from './limits.js'; import { SerializedDataSchema } from './serialization.js'; import type { PaginationOptions, ResolveData } from './shared.js'; @@ -64,6 +65,7 @@ export const EventTypeSchema = z.enum([ 'step_created', 'step_completed', 'step_failed', + 'step_deferred', 'step_retrying', 'step_started', // Hook lifecycle events @@ -109,6 +111,19 @@ const StepFailedEventSchema = BaseEventSchema.extend({ }), }); +/** + * Event created when a step is blocked on admission and should be retried + * without counting the blocked attempt against maxRetries. + */ +const StepDeferredEventSchema = BaseEventSchema.extend({ + eventType: z.literal('step_deferred'), + correlationId: z.string(), + eventData: z.object({ + retryAfter: z.coerce.date().optional(), + lockRequest: LimitAcquireRequestSchema.optional(), + }), +}); + /** * Event created when a step fails and will be retried. * Sets the step status back to 'pending' and records the error. 
@@ -272,6 +287,7 @@ export const CreateEventSchema = z.discriminatedUnion('eventType', [ StepCreatedEventSchema, StepCompletedEventSchema, StepFailedEventSchema, + StepDeferredEventSchema, StepRetryingEventSchema, StepStartedEventSchema, // Hook lifecycle events @@ -296,6 +312,7 @@ const AllEventsSchema = z.discriminatedUnion('eventType', [ StepCreatedEventSchema, StepCompletedEventSchema, StepFailedEventSchema, + StepDeferredEventSchema, StepRetryingEventSchema, StepStartedEventSchema, // Hook lifecycle events diff --git a/workbench/example/workflows/99_e2e.ts b/workbench/example/workflows/99_e2e.ts index 985e9331e4..f7d43aab9f 100644 --- a/workbench/example/workflows/99_e2e.ts +++ b/workbench/example/workflows/99_e2e.ts @@ -308,6 +308,38 @@ export async function workflowLockContentionWorkflow( }; } +async function stepLockNoRetriesStep(label: string, holdMs: number) { + 'use step'; + + await using _stepLock = await lock({ + key: 'step:db:no-retries', + concurrency: { max: 1 }, + leaseTtlMs: holdMs + 5_000, + }); + + const metadata = getStepMetadata(); + const acquiredAt = Date.now(); + await new Promise((resolve) => setTimeout(resolve, holdMs)); + const releasedAt = Date.now(); + + return { + label, + attempt: metadata.attempt, + acquiredAt, + releasedAt, + }; +} +stepLockNoRetriesStep.maxRetries = 0; + +export async function stepLockNoRetriesContentionWorkflow( + userId = 'user-123', + holdMs = 750 +) { + 'use workflow'; + + return await stepLockNoRetriesStep(userId, holdMs); +} + ////////////////////////////////////////////////////////// async function nullByteStep() { From 27486dc2539e72e07091a01b34690daf9d094c59 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Wed, 18 Mar 2026 23:49:42 -0400 Subject: [PATCH 09/16] add new errors Signed-off-by: nathancolosimo --- packages/world-local/src/limits.test.ts | 19 ++++++++++ packages/world-local/src/limits.ts | 6 +-- packages/world-postgres/src/limits.test.ts | 14 +++++++ packages/world-postgres/src/limits.ts | 6 
+-- packages/world-postgres/src/storage.ts | 10 ++--- packages/world-postgres/test/storage.test.ts | 40 ++++++++++++++++++++ 6 files changed, 81 insertions(+), 14 deletions(-) diff --git a/packages/world-local/src/limits.test.ts b/packages/world-local/src/limits.test.ts index 3f8351f99b..d6bde93f45 100644 --- a/packages/world-local/src/limits.test.ts +++ b/packages/world-local/src/limits.test.ts @@ -1,4 +1,6 @@ +import { WorkflowWorldError } from '@workflow/errors'; import { createLimitsContractSuite } from '../../world-testing/src/limits-contract.js'; +import { describe, expect, it } from 'vitest'; import { createLocalWorld } from './index.js'; import { createLimits } from './limits.js'; import { mkdtemp, rm } from 'node:fs/promises'; @@ -17,3 +19,20 @@ createLimitsContractSuite('local world limits', async () => { }, }; }); + +describe('local limits', () => { + it('throws WorkflowWorldError when heartbeating a missing lease', async () => { + const dir = await mkdtemp(path.join(os.tmpdir(), 'workflow-limits-')); + const limits = createLimits(dir); + + try { + await expect( + limits.heartbeat({ + leaseId: 'lmt_missing', + }) + ).rejects.toBeInstanceOf(WorkflowWorldError); + } finally { + await rm(dir, { recursive: true, force: true }); + } + }); +}); diff --git a/packages/world-local/src/limits.ts b/packages/world-local/src/limits.ts index 9dfac5d931..e577e3bfea 100644 --- a/packages/world-local/src/limits.ts +++ b/packages/world-local/src/limits.ts @@ -1,5 +1,5 @@ import path from 'node:path'; -import { WorkflowAPIError } from '@workflow/errors'; +import { WorkflowWorldError } from '@workflow/errors'; import { LimitAcquireRequestSchema, type LimitAcquireResult, @@ -291,9 +291,7 @@ export function createLimits(dataDir: string, tag?: string): Limits { return updatedLease; } - throw new WorkflowAPIError(`Lease "${parsed.leaseId}" not found`, { - status: 404, - }); + throw new WorkflowWorldError(`Lease "${parsed.leaseId}" not found`); }); }, }; diff --git 
a/packages/world-postgres/src/limits.test.ts b/packages/world-postgres/src/limits.test.ts index e54b9e8010..35358b9f15 100644 --- a/packages/world-postgres/src/limits.test.ts +++ b/packages/world-postgres/src/limits.test.ts @@ -1,4 +1,5 @@ import { asc, eq } from 'drizzle-orm'; +import { WorkflowWorldError } from '@workflow/errors'; import { afterAll, beforeAll, @@ -42,6 +43,19 @@ if (process.platform === 'win32') { }); describe('postgres waiter promotion', () => { + it('throws WorkflowWorldError when heartbeating a missing lease', async () => { + const limits = createLimits( + { connectionString: db.connectionString, queueConcurrency: 1 }, + db.drizzle + ); + + await expect( + limits.heartbeat({ + leaseId: 'lmt_missing', + }) + ).rejects.toBeInstanceOf(WorkflowWorldError); + }); + it('serializes concurrent acquires for the same key', async () => { const limits = createLimits( { connectionString: db.connectionString, queueConcurrency: 1 }, diff --git a/packages/world-postgres/src/limits.ts b/packages/world-postgres/src/limits.ts index d36be6c695..7e58f682f6 100644 --- a/packages/world-postgres/src/limits.ts +++ b/packages/world-postgres/src/limits.ts @@ -1,6 +1,6 @@ import { JsonTransport } from '@vercel/queue'; import { and, asc, eq, isNotNull, lte, sql } from 'drizzle-orm'; -import { WorkflowAPIError } from '@workflow/errors'; +import { WorkflowWorldError } from '@workflow/errors'; import { LimitAcquireRequestSchema, type LimitAcquireResult, @@ -653,9 +653,7 @@ export function createLimits( }); if (!existing) { - throw new WorkflowAPIError(`Lease "${parsed.leaseId}" not found`, { - status: 404, - }); + throw new WorkflowWorldError(`Lease "${parsed.leaseId}" not found`); } await lockLimitKey(tx, existing.limitKey); diff --git a/packages/world-postgres/src/storage.ts b/packages/world-postgres/src/storage.ts index 1afcbc6c2f..5fa5adac83 100644 --- a/packages/world-postgres/src/storage.ts +++ b/packages/world-postgres/src/storage.ts @@ -944,15 +944,13 @@ export 
function createEventsStorage(drizzle: Drizzle): Storage['events'] { stepId: data.correlationId!, }); if (!existing) { - throw new WorkflowAPIError( - `Step "${data.correlationId}" not found`, - { status: 404 } + throw new WorkflowWorldError( + `Step "${data.correlationId}" not found` ); } if (isStepTerminal(existing.status)) { - throw new WorkflowAPIError( - `Cannot modify step in terminal state "${existing.status}"`, - { status: 409 } + throw new EntityConflictError( + `Cannot modify step in terminal state "${existing.status}"` ); } } diff --git a/packages/world-postgres/test/storage.test.ts b/packages/world-postgres/test/storage.test.ts index ded60138f0..8b2328c4c1 100644 --- a/packages/world-postgres/test/storage.test.ts +++ b/packages/world-postgres/test/storage.test.ts @@ -1,5 +1,6 @@ import { execSync } from 'node:child_process'; import { PostgreSqlContainer } from '@testcontainers/postgresql'; +import { EntityConflictError, WorkflowWorldError } from '@workflow/errors'; import type { Hook, Step, WorkflowRun } from '@workflow/world'; import { encode } from 'cbor-x'; import postgres from 'postgres'; @@ -1882,6 +1883,45 @@ describe('Storage (Postgres integration)', () => { }); expect(result.step?.startedAt).toEqual(started1.startedAt); }); + + it('throws WorkflowWorldError when step_deferred targets a missing step', async () => { + await expect( + events.create(testRunId, { + eventType: 'step_deferred', + correlationId: 'step_missing_deferred', + eventData: { + retryAfter: new Date(Date.now() + 5_000), + }, + }) + ).rejects.toBeInstanceOf(WorkflowWorldError); + }); + + it('throws EntityConflictError when step_deferred targets a terminal step', async () => { + await createStep(events, testRunId, { + stepId: 'step_deferred_terminal', + stepName: 'test-step', + input: new Uint8Array(), + }); + await updateStep( + events, + testRunId, + 'step_deferred_terminal', + 'step_failed', + { + error: 'already failed', + } + ); + + await expect( + events.create(testRunId, { + 
eventType: 'step_deferred', + correlationId: 'step_deferred_terminal', + eventData: { + retryAfter: new Date(Date.now() + 5_000), + }, + }) + ).rejects.toBeInstanceOf(EntityConflictError); + }); }); describe('run cancellation with in-flight entities', () => { From 8c683a313abae7835d6b9a7a9fedf89b24891c3c Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Thu, 19 Mar 2026 00:49:17 -0400 Subject: [PATCH 10/16] Increase ttl times for flaky tests on slow runners Signed-off-by: nathancolosimo --- packages/world-testing/src/limits-contract.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/world-testing/src/limits-contract.ts b/packages/world-testing/src/limits-contract.ts index 2acfbc1d72..5037039e83 100644 --- a/packages/world-testing/src/limits-contract.ts +++ b/packages/world-testing/src/limits-contract.ts @@ -129,7 +129,7 @@ export function createLimitsContractSuite( key: 'workflow:user:123', holderId: 'holder-a', definition: { concurrency: { max: 1 } }, - leaseTtlMs: 100, + leaseTtlMs: 500, }); expect(first.status).toBe('acquired'); if (first.status !== 'acquired') @@ -137,13 +137,13 @@ export function createLimitsContractSuite( const heartbeat = await harness.limits.heartbeat({ leaseId: first.lease.leaseId, - ttlMs: 200, + ttlMs: 1_000, }); expect(heartbeat.expiresAt?.getTime()).toBeGreaterThan( first.lease.expiresAt?.getTime() ?? 
0 ); - await sleep(250); + await sleep(1_100); const second = await harness.limits.acquire({ key: 'workflow:user:123', From 71de1c584fa00f0af8e9e1e5f2c5d0447f71eac1 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Thu, 19 Mar 2026 09:37:04 -0400 Subject: [PATCH 11/16] fix e2e test Signed-off-by: nathancolosimo --- packages/core/e2e/e2e.test.ts | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/packages/core/e2e/e2e.test.ts b/packages/core/e2e/e2e.test.ts index a1979f1bc6..f6eb8d1d45 100644 --- a/packages/core/e2e/e2e.test.ts +++ b/packages/core/e2e/e2e.test.ts @@ -16,6 +16,7 @@ import { } from 'vitest'; import type { Run } from '../src/runtime'; import { + cancelRun, getHookByToken, getRun, getWorld, @@ -643,13 +644,13 @@ describe('e2e', () => { { timeout: 60_000 }, async () => { const workflow = await e2e('workflowLockContentionWorkflow'); - const runA = await start(workflow, ['shared-user', 500]); + const runA = await start(workflow, ['shared-user', 1_500]); await sleep(100); - const runB = await start(workflow, ['shared-user', 500]); - await sleep(200); - await cliCancel(runB.runId); + const runB = await start(workflow, ['shared-user', 1_500]); + await sleep(100); + await cancelRun(getWorld(), runB.runId); const cancelledError = await runB.returnValue.catch((error) => error); - const runC = await start(workflow, ['shared-user', 500]); + const runC = await start(workflow, ['shared-user', 1_500]); const [resultA, resultC] = await Promise.all([ runA.returnValue, From 4ed44fc9cfa93a222cd6b16f2823d97e08d9d519 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Thu, 19 Mar 2026 13:45:15 -0400 Subject: [PATCH 12/16] Add FIFO to local and group e2e and contract tests Signed-off-by: nathancolosimo --- packages/core/e2e/e2e.test.ts | 234 +++++----- packages/world-local/README.md | 9 +- packages/world-local/src/index.ts | 9 +- packages/world-local/src/limits.test.ts | 27 +- packages/world-local/src/limits.ts | 419 +++++++++++++++--- 
packages/world-local/src/queue.test.ts | 74 ++-- packages/world-local/src/queue.ts | 360 +++++++++------ packages/world-postgres/README.md | 5 +- packages/world-postgres/src/limits.test.ts | 308 +------------ packages/world-testing/src/index.mts | 2 + packages/world-testing/src/limits-contract.ts | 334 +++++++++++++- packages/world-testing/src/limits-runtime.ts | 218 +++++++++ packages/world/FLOW_LIMITS.md | 76 +++- workbench/example/tsconfig.json | 1 + workbench/example/workflows/99_e2e.ts | 67 ++- workbench/example/workflows/serde-steps.ts | 2 +- 16 files changed, 1450 insertions(+), 695 deletions(-) create mode 100644 packages/world-testing/src/limits-runtime.ts diff --git a/packages/core/e2e/e2e.test.ts b/packages/core/e2e/e2e.test.ts index f6eb8d1d45..5f7aefc97e 100644 --- a/packages/core/e2e/e2e.test.ts +++ b/packages/core/e2e/e2e.test.ts @@ -14,7 +14,8 @@ import { expect, test, } from 'vitest'; -import type { Run } from '../src/runtime'; +import { createLimitsRuntimeSuite } from '../../world-testing/src/limits-runtime.js'; +import type { Run, StartOptions } from '../src/runtime.js'; import { cancelRun, getHookByToken, @@ -23,7 +24,7 @@ import { healthCheck, start as rawStart, resumeHook, -} from '../src/runtime'; +} from '../src/runtime.js'; import { cliCancel, cliHealthJson, @@ -50,10 +51,25 @@ if (!deploymentUrl) { * Tracked wrapper around start() that automatically registers runs * for diagnostics on test failure and observability metadata collection. 
*/ -async function start( - ...args: Parameters> -): Promise> { - const run = await rawStart(...args); +type E2EWorkflowMetadata = Awaited>; + +async function start( + workflow: E2EWorkflowMetadata, + options?: StartOptions +): Promise>; +async function start( + workflow: E2EWorkflowMetadata, + args: TArgs, + options?: StartOptions +): Promise>; +async function start( + workflow: E2EWorkflowMetadata, + argsOrOptions?: unknown[] | StartOptions, + options?: StartOptions +): Promise> { + const run = Array.isArray(argsOrOptions) + ? await rawStart(workflow, argsOrOptions, options) + : await rawStart(workflow, argsOrOptions); trackRun(run); return run; } @@ -229,6 +245,90 @@ describe('e2e', () => { // bundled in function const shouldSkipReactRenderTest = !(isNext && isLocal); + if (isLocalWorld || isPostgresWorld) { + createLimitsRuntimeSuite( + `limits runtime (${isPostgresWorld ? 'postgres' : 'local'})`, + async () => ({ + async runWorkflowWithWorkflowAndStepLocks(userId) { + const run = await start( + await e2e('workflowWithWorkflowAndStepLocks'), + [userId] + ); + return await run.returnValue; + }, + async runWorkflowLockContention(userId, holdMs) { + const workflow = await e2e('workflowLockContentionWorkflow'); + const runA = await start(workflow, [userId, holdMs]); + await sleep(100); + const runB = await start(workflow, [userId, holdMs]); + return await Promise.all([runA.returnValue, runB.returnValue]); + }, + async runStepLockNoRetriesContention(userId, holdMs) { + const workflow = await e2e('stepLockNoRetriesContentionWorkflow'); + const runA = await start(workflow, [userId, holdMs, 'A']); + await sleep(100); + const runB = await start(workflow, [userId, holdMs, 'B']); + await sleep(100); + const runC = await start(workflow, [userId, holdMs, 'C']); + return await Promise.all([ + runA.returnValue, + runB.returnValue, + runC.returnValue, + ]); + }, + async runWorkflowLockAcrossSuspension(userId, holdMs) { + const workflow = await 
e2e('workflowOnlyLockContentionWorkflow'); + const runA = await start(workflow, [userId, holdMs, 'A']); + await sleep(100); + const runB = await start(workflow, [userId, holdMs, 'B']); + return await Promise.all([runA.returnValue, runB.returnValue]); + }, + async runWorkflowRateLimitContention(userId, holdMs, periodMs) { + const workflow = await e2e('workflowRateLimitContentionWorkflow'); + const runA = await start(workflow, [userId, holdMs, periodMs, 'A']); + await sleep(100); + const runB = await start(workflow, [userId, holdMs, periodMs, 'B']); + return await Promise.all([runA.returnValue, runB.returnValue]); + }, + async runWorkflowFifoThreeWaiters(userId, holdMs) { + const workflow = await e2e('workflowOnlyLockContentionWorkflow'); + const runA = await start(workflow, [userId, holdMs, 'A']); + await sleep(100); + const runB = await start(workflow, [userId, holdMs, 'B']); + await sleep(100); + const runC = await start(workflow, [userId, holdMs, 'C']); + return await Promise.all([ + runA.returnValue, + runB.returnValue, + runC.returnValue, + ]); + }, + async runCancelledWorkflowWaiter(userId, holdMs) { + const workflow = await e2e('workflowOnlyLockContentionWorkflow'); + const runA = await start(workflow, [userId, holdMs, 'A']); + await sleep(100); + const runB = await start(workflow, [userId, holdMs, 'B']); + await sleep(100); + await cancelRun(getWorld(), runB.runId); + const cancelledError = await runB.returnValue.catch((error) => error); + const runC = await start(workflow, [userId, holdMs, 'C']); + const [resultA, resultC] = await Promise.all([ + runA.returnValue, + runC.returnValue, + ]); + return { cancelledError, resultA, resultC }; + }, + async runIndependentWorkflowKeys(holdMs) { + const workflow = await e2e('workflowOnlyLockContentionWorkflow'); + const runA = await start(workflow, ['user-a', holdMs]); + await sleep(100); + const runB = await start(workflow, ['user-b', holdMs]); + return await Promise.all([runA.returnValue, runB.returnValue]); + }, + 
}) + ); + } + test.skipIf(shouldSkipReactRenderTest)( 'should work with react rendering in step', async () => { @@ -548,126 +648,6 @@ describe('e2e', () => { expect(elapsed).toBeLessThan(25_000); }); - if (isLocalWorld) { - test( - 'workflowWithWorkflowAndStepLocks demonstrates workflow and step limits on local world', - { timeout: 60_000 }, - async () => { - const run = await start(await e2e('workflowWithWorkflowAndStepLocks'), [ - 'local-world', - ]); - const returnValue = await run.returnValue; - - expect(returnValue).toMatchObject({ - workflowKey: 'workflow:user:local-world', - dbKey: 'step:db:cheap', - aiKey: 'step:provider:openai', - summary: 'summary:profile:local-world', - }); - } - ); - } - - if (isPostgresWorld) { - test( - 'workflowWithWorkflowAndStepLocks demonstrates workflow and step limits on postgres world', - { timeout: 60_000 }, - async () => { - const run = await start(await e2e('workflowWithWorkflowAndStepLocks'), [ - 'postgres-world', - ]); - const returnValue = await run.returnValue; - - expect(returnValue).toMatchObject({ - workflowKey: 'workflow:user:postgres-world', - dbKey: 'step:db:cheap', - aiKey: 'step:provider:openai', - summary: 'summary:profile:postgres-world', - }); - } - ); - } - - if (isPostgresWorld) { - test( - 'workflowLockContentionWorkflow serializes workflow and step locks under contention', - { timeout: 60_000 }, - async () => { - const workflow = await e2e('workflowLockContentionWorkflow'); - const runA = await start(workflow, ['shared-user', 750]); - await sleep(100); - const runB = await start(workflow, ['shared-user', 750]); - - const [resultA, resultB] = await Promise.all([ - runA.returnValue, - runB.returnValue, - ]); - - expect(resultB.workflowLockAcquiredAt).toBeGreaterThanOrEqual( - resultA.workflowLockReleasedAt - ); - expect(resultB.stepLockAcquiredAt).toBeGreaterThanOrEqual( - resultA.stepLockReleasedAt - ); - } - ); - } - - test( - 'stepLockNoRetriesContentionWorkflow does not consume retries while blocked on a 
step lock', - { timeout: 60_000 }, - async () => { - const workflow = await e2e('stepLockNoRetriesContentionWorkflow'); - const runA = await start(workflow, ['shared-user', 750]); - await sleep(100); - const runB = await start(workflow, ['shared-user', 750]); - - const [resultA, resultB] = await Promise.all([ - runA.returnValue, - runB.returnValue, - ]); - const [firstResult, secondResult] = [resultA, resultB].sort( - (left, right) => left.acquiredAt - right.acquiredAt - ); - - expect(resultA.attempt).toBe(1); - expect(resultB.attempt).toBe(1); - expect(secondResult.acquiredAt).toBeGreaterThanOrEqual( - firstResult.releasedAt - ); - } - ); - - if (isPostgresWorld) { - test( - 'cancelled workflow waiters are skipped before the next waiter is promoted', - { timeout: 60_000 }, - async () => { - const workflow = await e2e('workflowLockContentionWorkflow'); - const runA = await start(workflow, ['shared-user', 1_500]); - await sleep(100); - const runB = await start(workflow, ['shared-user', 1_500]); - await sleep(100); - await cancelRun(getWorld(), runB.runId); - const cancelledError = await runB.returnValue.catch((error) => error); - const runC = await start(workflow, ['shared-user', 1_500]); - - const [resultA, resultC] = await Promise.all([ - runA.returnValue, - runC.returnValue, - ]); - - expect(cancelledError).toBeInstanceOf(WorkflowRunCancelledError); - expect(resultC.workflowLockAcquiredAt).toBeGreaterThanOrEqual( - resultA.workflowLockReleasedAt - ); - expect( - resultC.workflowLockAcquiredAt - resultA.workflowLockReleasedAt - ).toBeLessThan(4_000); - } - ); - } - test('nullByteWorkflow', { timeout: 60_000 }, async () => { const run = await start(await e2e('nullByteWorkflow'), []); const returnValue = await run.returnValue; @@ -1900,7 +1880,7 @@ describe('e2e', () => { // Cancel the run using the core runtime cancelRun function. // This exercises the same cancelRun code path that the CLI uses // (the CLI delegates directly to this function). 
- const { cancelRun } = await import('../src/runtime'); + const { cancelRun } = await import('../src/runtime.js'); await cancelRun(getWorld(), run.runId); // Verify the run was cancelled - returnValue should throw WorkflowRunCancelledError diff --git a/packages/world-local/README.md b/packages/world-local/README.md index cff6a3354a..fccc554eac 100644 --- a/packages/world-local/README.md +++ b/packages/world-local/README.md @@ -4,6 +4,13 @@ Filesystem-based workflow backend for local development and testing. Stores workflow data as JSON files on disk and provides in-memory queuing. Automatically detects development server port for queue transport. -The `limits` namespace is exposed as part of the shared world contract, but flow concurrency and rate limiting are not implemented in this package yet. +The `limits` namespace implements the shared flow-limits contract for local development: + +- keyed concurrency and rate limits +- FIFO waiter promotion per key +- cancelled workflow / failed step waiter pruning +- prompt wake-ups with delayed fallback retries + +Limit state is persisted on disk, but queue delivery is still in-memory. That means local world matches the same live-process lock semantics as other implemented worlds, while crash-survival and durable backlog behavior remain a PostgreSQL-only advantage today. Used by default on `next dev` and `next start`. 
diff --git a/packages/world-local/src/index.ts b/packages/world-local/src/index.ts index 029154649d..142fe26ccf 100644 --- a/packages/world-local/src/index.ts +++ b/packages/world-local/src/index.ts @@ -61,10 +61,15 @@ export function createLocalWorld(args?: Partial): LocalWorld { const mergedConfig = { ...config.value, ...definedArgs }; const tag = mergedConfig.tag; const queue = createQueue(mergedConfig); + const storage = createStorage(mergedConfig.dataDir, tag); return { - limits: createLimits(mergedConfig.dataDir, tag), + limits: createLimits(mergedConfig.dataDir, { + tag, + queue, + storage, + }), ...queue, - ...createStorage(mergedConfig.dataDir, tag), + ...storage, ...createStreamer(mergedConfig.dataDir, tag), async start() { await initDataDir(mergedConfig.dataDir); diff --git a/packages/world-local/src/limits.test.ts b/packages/world-local/src/limits.test.ts index d6bde93f45..2e248bb516 100644 --- a/packages/world-local/src/limits.test.ts +++ b/packages/world-local/src/limits.test.ts @@ -1,8 +1,5 @@ -import { WorkflowWorldError } from '@workflow/errors'; import { createLimitsContractSuite } from '../../world-testing/src/limits-contract.js'; -import { describe, expect, it } from 'vitest'; import { createLocalWorld } from './index.js'; -import { createLimits } from './limits.js'; import { mkdtemp, rm } from 'node:fs/promises'; import os from 'node:os'; import path from 'node:path'; @@ -10,29 +7,17 @@ import path from 'node:path'; createLimitsContractSuite('local world limits', async () => { const dir = await mkdtemp(path.join(os.tmpdir(), 'workflow-limits-')); const world = createLocalWorld({ dataDir: dir }); + world.registerHandler('__wkf_step_', async () => Response.json({ ok: true })); + world.registerHandler('__wkf_workflow_', async () => + Response.json({ ok: true }) + ); return { - limits: createLimits(dir), + limits: world.limits, + storage: world, close: async () => { await world.close?.(); await rm(dir, { recursive: true, force: true }); }, }; }); - 
-describe('local limits', () => { - it('throws WorkflowWorldError when heartbeating a missing lease', async () => { - const dir = await mkdtemp(path.join(os.tmpdir(), 'workflow-limits-')); - const limits = createLimits(dir); - - try { - await expect( - limits.heartbeat({ - leaseId: 'lmt_missing', - }) - ).rejects.toBeInstanceOf(WorkflowWorldError); - } finally { - await rm(dir, { recursive: true, force: true }); - } - }); -}); diff --git a/packages/world-local/src/limits.ts b/packages/world-local/src/limits.ts index e577e3bfea..081b95f63e 100644 --- a/packages/world-local/src/limits.ts +++ b/packages/world-local/src/limits.ts @@ -1,5 +1,11 @@ import path from 'node:path'; import { WorkflowWorldError } from '@workflow/errors'; +import type { + Queue, + Storage, + WorkflowRunWithoutData, + StepWithoutData, +} from '@workflow/world'; import { LimitAcquireRequestSchema, type LimitAcquireResult, @@ -20,23 +26,56 @@ const LimitTokenSchema = z.object({ expiresAt: z.coerce.date(), }); +const LimitWaiterSchema = z.object({ + waiterId: z.string(), + holderId: z.string(), + createdAt: z.coerce.date(), + leaseTtlMs: z.number().int().positive().optional(), + concurrencyMax: z.number().int().positive().nullable(), + rateCount: z.number().int().positive().nullable(), + ratePeriodMs: z.number().int().positive().nullable(), +}); + const KeyStateSchema = z.object({ key: z.string(), leases: z.array(LimitLeaseSchema), tokens: z.array(LimitTokenSchema), + waiters: z.array(LimitWaiterSchema), }); const LimitsStateSchema = z.object({ - version: z.literal(1), + version: z.literal(2), keys: z.record(z.string(), KeyStateSchema), }); type LimitToken = z.infer; +type LimitWaiter = z.infer; type KeyState = z.infer; type LimitsState = z.infer; +type HolderTarget = + | { + kind: 'workflow'; + runId: string; + correlationId: string; + } + | { + kind: 'step'; + runId: string; + stepId: string; + } + | { + kind: 'opaque'; + }; + +export interface LocalLimitsOptions { + tag?: string; + queue?: Pick; 
+ storage?: Pick; +} + const EMPTY_STATE: LimitsState = { - version: 1, + version: 2, keys: {}, }; @@ -48,17 +87,26 @@ function cloneToken(token: LimitToken): LimitToken { return { ...token }; } +function cloneWaiter(waiter: LimitWaiter): LimitWaiter { + return { ...waiter }; +} + +function normalizeKeyState(keyState: KeyState): KeyState { + return { + key: keyState.key, + leases: keyState.leases.map((lease) => ({ ...lease })), + tokens: keyState.tokens.map(cloneToken), + waiters: keyState.waiters.map(cloneWaiter), + }; +} + function cloneState(state: LimitsState): LimitsState { return { - version: 1, + version: 2, keys: Object.fromEntries( Object.entries(state.keys).map(([key, keyState]) => [ key, - { - key: keyState.key, - leases: keyState.leases.map((lease) => ({ ...lease })), - tokens: keyState.tokens.map(cloneToken), - }, + normalizeKeyState(keyState), ]) ), }; @@ -72,6 +120,7 @@ function pruneKeyState(keyState: KeyState, now = Date.now()): KeyState { lease.expiresAt === undefined || lease.expiresAt.getTime() > now ), tokens: keyState.tokens.filter((token) => token.expiresAt.getTime() > now), + waiters: keyState.waiters.map(cloneWaiter), }; } @@ -113,14 +162,91 @@ function getRetryAfterMs( return Math.min(...candidates); } -export function createLimits(dataDir: string, tag?: string): Limits { - const statePath = getStatePath(dataDir, tag); +function createLease( + key: string, + holderId: string, + definition: LimitLease['definition'], + acquiredAt: Date, + leaseTtlMs?: number +): LimitLease { + return { + leaseId: `lmt_${monotonicUlid()}`, + key, + holderId, + acquiredAt, + expiresAt: + leaseTtlMs !== undefined + ? 
new Date(acquiredAt.getTime() + leaseTtlMs) + : undefined, + definition, + }; +} + +function insertToken( + keyState: KeyState, + holderId: string, + acquiredAt: Date, + periodMs: number +) { + keyState.tokens.push({ + tokenId: `lmttok_${monotonicUlid()}`, + holderId, + acquiredAt, + expiresAt: new Date(acquiredAt.getTime() + periodMs), + }); +} + +function parseHolderId(holderId: string): HolderTarget { + if (holderId.startsWith('wflock_')) { + const [runId, correlationId] = holderId.slice('wflock_'.length).split(':'); + if (runId && correlationId) { + return { kind: 'workflow', runId, correlationId }; + } + } + + if (holderId.startsWith('stplock_')) { + const [runId, stepId] = holderId.slice('stplock_'.length).split(':'); + if (runId && stepId) { + return { kind: 'step', runId, stepId }; + } + } + + return { kind: 'opaque' }; +} + +function isTerminalRun(run: WorkflowRunWithoutData | undefined) { + return !run || ['completed', 'failed', 'cancelled'].includes(run.status); +} + +function isTerminalStep(step: StepWithoutData | undefined) { + return !step || ['completed', 'failed', 'cancelled'].includes(step.status); +} + +function toMillis(value: Date | undefined): number | undefined { + return value ? value.getTime() : undefined; +} + +function deleteEmptyKey(state: LimitsState, key: string) { + const keyState = state.keys[key]; + if (!keyState) return; + if ( + keyState.leases.length === 0 && + keyState.tokens.length === 0 && + keyState.waiters.length === 0 + ) { + delete state.keys[key]; + } +} + +export function createLimits( + dataDir: string, + tagOrOptions?: string | LocalLimitsOptions +): Limits { + const options = + typeof tagOrOptions === 'string' ? { tag: tagOrOptions } : tagOrOptions; + const statePath = getStatePath(dataDir, options?.tag); let stateOp = Promise.resolve(); - // This block is an in-process async mutex / operation queue. - // stateOp starts as an already-resolved promise. 
- // Each call to withStateLock() chains a new operation onto the tail of that promise. - // Because every new operation waits for the previous one, reads/modifies/writes to the limits state file happen serially. const withStateLock = async (fn: () => Promise): Promise => { const run = stateOp.then(fn, fn); stateOp = run.then( @@ -131,38 +257,205 @@ export function createLimits(dataDir: string, tag?: string): Limits { }; const readState = async (): Promise => { - return ( - (await readJSON(statePath, LimitsStateSchema)) ?? cloneState(EMPTY_STATE) - ); + const raw = + (await readJSON(statePath, LimitsStateSchema)) ?? cloneState(EMPTY_STATE); + + return cloneState(raw); }; const writeState = async (state: LimitsState): Promise => { await writeJSON(statePath, state, { overwrite: true }); }; + const getRun = async ( + runId: string + ): Promise => { + try { + return await options?.storage?.runs.get(runId, { resolveData: 'none' }); + } catch { + return undefined; + } + }; + + const getStep = async ( + runId: string, + stepId: string + ): Promise => { + try { + return await options?.storage?.steps.get(runId, stepId, { + resolveData: 'none', + }); + } catch { + return undefined; + } + }; + + const isHolderLive = async (holderId: string): Promise => { + const target = parseHolderId(holderId); + if (target.kind === 'opaque' || !options?.storage) { + return true; + } + + if (target.kind === 'workflow') { + const run = await getRun(target.runId); + return !isTerminalRun(run); + } + + const [run, step] = await Promise.all([ + getRun(target.runId), + getStep(target.runId, target.stepId), + ]); + return !isTerminalRun(run) && !isTerminalStep(step); + }; + + const queueWakeForHolder = async (holderId: string): Promise => { + const target = parseHolderId(holderId); + if (target.kind === 'opaque' || !options?.queue || !options?.storage) { + return; + } + + try { + if (target.kind === 'workflow') { + const run = await getRun(target.runId); + if (isTerminalRun(run) || !run) return; + 
+ await options.queue.queue( + `__wkf_workflow_${run.workflowName}`, + { + runId: target.runId, + requestedAt: new Date(), + }, + { + idempotencyKey: target.correlationId, + } + ); + return; + } + + const [run, step] = await Promise.all([ + getRun(target.runId), + getStep(target.runId, target.stepId), + ]); + if (isTerminalRun(run) || isTerminalStep(step) || !run || !step) return; + + await options.queue.queue( + `__wkf_step_${step.stepName}`, + { + workflowName: run.workflowName, + workflowRunId: target.runId, + workflowStartedAt: toMillis(run.startedAt) ?? Date.now(), + stepId: target.stepId, + requestedAt: new Date(), + }, + { + idempotencyKey: target.stepId, + } + ); + } catch (error) { + console.warn('[world-local] Failed to queue lock wake-up', error); + } + }; + + const promoteWaiters = async ( + key: string, + keyState: KeyState + ): Promise<{ keyState: KeyState; wakeHolders: string[] }> => { + const wakeHolders: string[] = []; + const promotedKeyState = pruneKeyState(keyState); + const remainingWaiters: LimitWaiter[] = []; + let activeLeases = promotedKeyState.leases.length; + let activeTokens = promotedKeyState.tokens.length; + + for (let index = 0; index < promotedKeyState.waiters.length; index++) { + const waiter = promotedKeyState.waiters[index]; + + if (!(await isHolderLive(waiter.holderId))) { + continue; + } + + const concurrencyBlocked = + waiter.concurrencyMax !== null && activeLeases >= waiter.concurrencyMax; + const rateBlocked = + waiter.rateCount !== null && activeTokens >= waiter.rateCount; + + if (concurrencyBlocked || rateBlocked) { + remainingWaiters.push( + waiter, + ...promotedKeyState.waiters.slice(index + 1) + ); + promotedKeyState.waiters = remainingWaiters; + return { keyState: promotedKeyState, wakeHolders }; + } + + const acquiredAt = new Date(); + const definition = { + concurrency: + waiter.concurrencyMax !== null + ? 
{ max: waiter.concurrencyMax } + : undefined, + rate: + waiter.rateCount !== null && waiter.ratePeriodMs !== null + ? { + count: waiter.rateCount, + periodMs: waiter.ratePeriodMs, + } + : undefined, + }; + + promotedKeyState.leases.push( + createLease( + key, + waiter.holderId, + definition, + acquiredAt, + waiter.leaseTtlMs + ) + ); + activeLeases += 1; + + if (waiter.rateCount !== null && waiter.ratePeriodMs !== null) { + insertToken( + promotedKeyState, + waiter.holderId, + acquiredAt, + waiter.ratePeriodMs + ); + activeTokens += 1; + } + + wakeHolders.push(waiter.holderId); + } + + promotedKeyState.waiters = remainingWaiters; + return { keyState: promotedKeyState, wakeHolders }; + }; + return { async acquire(request) { const parsed = LimitAcquireRequestSchema.parse(request); return withStateLock(async (): Promise => { const state = cloneState(await readState()); - const now = new Date(); - const nowMs = now.getTime(); - const keyState = pruneKeyState( + const baseKeyState = pruneKeyState( state.keys[parsed.key] ?? 
{ key: parsed.key, leases: [], tokens: [], - }, - nowMs + waiters: [], + } + ); + const { keyState, wakeHolders } = await promoteWaiters( + parsed.key, + baseKeyState ); + state.keys[parsed.key] = keyState; const existingLease = keyState.leases.find( (lease) => lease.holderId === parsed.holderId ); if (existingLease) { - state.keys[parsed.key] = keyState; await writeState(state); + await Promise.all(wakeHolders.map(queueWakeForHolder)); return { status: 'acquired', lease: existingLease, @@ -175,47 +468,66 @@ export function createLimits(dataDir: string, tag?: string): Limits { const rateBlocked = parsed.definition.rate !== undefined && keyState.tokens.length >= parsed.definition.rate.count; + const existingWaiter = keyState.waiters.find( + (waiter) => waiter.holderId === parsed.holderId + ); + + if ( + existingWaiter || + concurrencyBlocked || + rateBlocked || + keyState.waiters.length > 0 + ) { + if (!existingWaiter) { + keyState.waiters.push({ + waiterId: `lmtwait_${monotonicUlid()}`, + holderId: parsed.holderId, + createdAt: new Date(), + leaseTtlMs: parsed.leaseTtlMs, + concurrencyMax: parsed.definition.concurrency?.max ?? null, + rateCount: parsed.definition.rate?.count ?? null, + ratePeriodMs: parsed.definition.rate?.periodMs ?? null, + }); + } - if (concurrencyBlocked || rateBlocked) { state.keys[parsed.key] = keyState; await writeState(state); + await Promise.all(wakeHolders.map(queueWakeForHolder)); return { status: 'blocked', reason: getBlockedReason(concurrencyBlocked, rateBlocked), retryAfterMs: getRetryAfterMs( keyState, - nowMs, + Date.now(), concurrencyBlocked, rateBlocked ), }; } - const lease: LimitLease = { - leaseId: `lmt_${monotonicUlid()}`, - key: parsed.key, - holderId: parsed.holderId, - acquiredAt: now, - expiresAt: - parsed.leaseTtlMs !== undefined - ? 
new Date(nowMs + parsed.leaseTtlMs) - : undefined, - definition: parsed.definition, - }; + const acquiredAt = new Date(); + const lease = createLease( + parsed.key, + parsed.holderId, + parsed.definition, + acquiredAt, + parsed.leaseTtlMs + ); keyState.leases.push(lease); if (parsed.definition.rate) { - keyState.tokens.push({ - tokenId: `lmttok_${monotonicUlid()}`, - holderId: parsed.holderId, - acquiredAt: now, - expiresAt: new Date(nowMs + parsed.definition.rate.periodMs), - }); + insertToken( + keyState, + parsed.holderId, + acquiredAt, + parsed.definition.rate.periodMs + ); } state.keys[parsed.key] = keyState; await writeState(state); + await Promise.all(wakeHolders.map(queueWakeForHolder)); return { status: 'acquired', @@ -229,10 +541,12 @@ export function createLimits(dataDir: string, tag?: string): Limits { await withStateLock(async () => { const state = cloneState(await readState()); + const wakeHolders: string[] = []; for (const [key, keyStateValue] of Object.entries(state.keys)) { const keyState = pruneKeyState(keyStateValue); - const nextLeases = keyState.leases.filter((lease) => { + const beforeLeases = keyState.leases.length; + keyState.leases = keyState.leases.filter((lease) => { if (lease.leaseId !== parsed.leaseId) return true; if (parsed.key && lease.key !== parsed.key) return true; if (parsed.holderId && lease.holderId !== parsed.holderId) { @@ -241,20 +555,19 @@ export function createLimits(dataDir: string, tag?: string): Limits { return false; }); - state.keys[key] = { - ...keyState, - leases: nextLeases, - }; - - if ( - state.keys[key].leases.length === 0 && - state.keys[key].tokens.length === 0 - ) { - delete state.keys[key]; + if (keyState.leases.length !== beforeLeases) { + const promoted = await promoteWaiters(key, keyState); + state.keys[key] = promoted.keyState; + wakeHolders.push(...promoted.wakeHolders); + } else { + state.keys[key] = keyState; } + + deleteEmptyKey(state, key); } await writeState(state); + await 
Promise.all(wakeHolders.map(queueWakeForHolder)); }); }, diff --git a/packages/world-local/src/queue.test.ts b/packages/world-local/src/queue.test.ts index 32c8d1f834..f07677fe49 100644 --- a/packages/world-local/src/queue.test.ts +++ b/packages/world-local/src/queue.test.ts @@ -2,11 +2,6 @@ import type { StepInvokePayload } from '@workflow/world'; import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; import { createQueue } from './queue'; -// Mock node:timers/promises so setTimeout resolves immediately -vi.mock('node:timers/promises', () => ({ - setTimeout: vi.fn().mockResolvedValue(undefined), -})); - const stepPayload: StepInvokePayload = { workflowName: 'test-workflow', workflowRunId: 'run_01ABC', @@ -18,11 +13,13 @@ describe('queue timeout re-enqueue', () => { let localQueue: ReturnType; beforeEach(() => { + vi.useFakeTimers(); localQueue = createQueue({ baseUrl: 'http://localhost:3000' }); }); afterEach(async () => { await localQueue.close(); + vi.useRealTimers(); }); it('createQueueHandler returns 200 with timeoutSeconds in the body', async () => { @@ -72,29 +69,6 @@ describe('queue timeout re-enqueue', () => { expect(body).toEqual({ ok: true }); }); - it('createQueueHandler returns 200 with timeoutSeconds: 0', async () => { - const handler = localQueue.createQueueHandler('__wkf_step_', async () => ({ - timeoutSeconds: 0, - })); - - const req = new Request('http://localhost/step', { - method: 'POST', - headers: { - 'content-type': 'application/json', - 'x-vqs-queue-name': '__wkf_step_test', - 'x-vqs-message-id': 'msg_01ABC', - 'x-vqs-message-attempt': '1', - }, - body: JSON.stringify(stepPayload), - }); - - const response = await handler(req); - expect(response.status).toBe(200); - - const body = await response.json(); - expect(body).toEqual({ timeoutSeconds: 0 }); - }); - it('queue retries when handler returns timeoutSeconds > 0', async () => { let callCount = 0; const handler = localQueue.createQueueHandler('__wkf_step_', async () => { 
@@ -102,25 +76,18 @@ describe('queue timeout re-enqueue', () => { if (callCount < 3) { return { timeoutSeconds: 5 }; } - // Third call succeeds normally return undefined; }); localQueue.registerHandler('__wkf_step_', handler); await localQueue.queue('__wkf_step_test' as any, stepPayload); + await vi.runAllTimersAsync(); - // Wait for the async queue processing to complete - // The queue fires off processing asynchronously, so we need to wait - await vi.waitFor(() => { - expect(callCount).toBe(3); - }); + expect(callCount).toBe(3); }); it('queue retries immediately when handler returns timeoutSeconds: 0', async () => { - const { setTimeout: mockSetTimeout } = await import('node:timers/promises'); - vi.mocked(mockSetTimeout).mockClear(); - let callCount = 0; const handler = localQueue.createQueueHandler('__wkf_step_', async () => { callCount++; @@ -133,12 +100,37 @@ describe('queue timeout re-enqueue', () => { localQueue.registerHandler('__wkf_step_', handler); await localQueue.queue('__wkf_step_test' as any, stepPayload); + await vi.runAllTimersAsync(); - await vi.waitFor(() => { - expect(callCount).toBe(3); + expect(callCount).toBe(3); + }); + + it('replaces delayed idempotent deliveries with an immediate wake-up', async () => { + const seenStepIds: string[] = []; + const handler = localQueue.createQueueHandler( + '__wkf_step_', + async (body) => { + seenStepIds.push((body as StepInvokePayload).stepId); + return undefined; + } + ); + + localQueue.registerHandler('__wkf_step_', handler); + + await localQueue.queue('__wkf_step_test' as any, stepPayload, { + idempotencyKey: 'step_01ABC', + delaySeconds: 30, }); + await localQueue.queue( + '__wkf_step_test' as any, + { ...stepPayload, stepId: 'step_replacement' }, + { + idempotencyKey: 'step_01ABC', + } + ); + + await vi.runAllTimersAsync(); - // setTimeout should NOT have been called for timeoutSeconds: 0 - expect(mockSetTimeout).not.toHaveBeenCalled(); + expect(seenStepIds).toEqual(['step_replacement']); }); }); diff 
--git a/packages/world-local/src/queue.ts b/packages/world-local/src/queue.ts index fd3b511509..c356730daf 100644 --- a/packages/world-local/src/queue.ts +++ b/packages/world-local/src/queue.ts @@ -1,4 +1,3 @@ -import { setTimeout } from 'node:timers/promises'; import { JsonTransport } from '@vercel/queue'; import { MessageId, type Queue, ValidQueueName } from '@workflow/world'; import { Sema } from 'async-sema'; @@ -9,20 +8,10 @@ import type { Config } from './config.js'; import { resolveBaseUrl } from './config.js'; import { getPackageInfo } from './init.js'; -// For local queue, there is no technical limit on the message visibility lifespan, -// but the environment variable can be used for testing purposes to set a max visibility limit. const LOCAL_QUEUE_MAX_VISIBILITY = parseInt(process.env.WORKFLOW_LOCAL_QUEUE_MAX_VISIBILITY ?? '0', 10) || Infinity; -// Maximum safe delay for setTimeout in Node.js (2^31 - 1 milliseconds ≈ 24.85 days) -// Larger values cause "TimeoutOverflowWarning: X does not fit into a 32-bit signed integer" -// When the clamped timeout fires, the handler will recalculate remaining time from -// persistent state and return another timeoutSeconds if needed. -const MAX_SAFE_TIMEOUT_MS = 2147483647; - -// The local workers share the same Node.js process and event loop, -// so we need to limit concurrency to avoid overwhelming the system. const DEFAULT_CONCURRENCY_LIMIT = 1000; const WORKFLOW_LOCAL_QUEUE_CONCURRENCY = parseInt(process.env.WORKFLOW_LOCAL_QUEUE_CONCURRENCY ?? '0', 10) || @@ -31,15 +20,27 @@ const WORKFLOW_LOCAL_QUEUE_CONCURRENCY = export type DirectHandler = (req: Request) => Promise; export type LocalQueue = Queue & { - /** Close the HTTP agent and release resources. */ close(): Promise; - /** Register a direct in-process handler for a queue prefix, bypassing HTTP. 
*/ registerHandler( prefix: '__wkf_step_' | '__wkf_workflow_', handler: DirectHandler ): void; }; +type ScheduledMessage = { + attempt: number; + body: Uint8Array; + headers?: Record; + idempotencyKey?: string; + messageId: MessageId; + pendingExecution: boolean; + queueName: ValidQueueName; + remainingServerRetries: number; + running: boolean; + timer?: ReturnType; + version: number; +}; + function getQueueRoute(queueName: ValidQueueName): { pathname: 'flow' | 'step'; prefix: '__wkf_step_' | '__wkf_workflow_'; @@ -54,11 +55,6 @@ function getQueueRoute(queueName: ValidQueueName): { } export function createQueue(config: Partial): LocalQueue { - // Create a custom agent optimized for high-concurrency local workflows: - // - headersTimeout: 0 allows long-running steps - // - connections: 1000 allows many parallel connections to the same host - // - pipelining: 1 (default) for HTTP/1.1 compatibility - // - keepAliveTimeout: 30s keeps connections warm for rapid step execution const httpAgent = new Agent({ headersTimeout: 0, connections: 1000, @@ -67,139 +63,240 @@ export function createQueue(config: Partial): LocalQueue { const transport = new JsonTransport(); const generateId = monotonicFactory(); const semaphore = new Sema(WORKFLOW_LOCAL_QUEUE_CONCURRENCY); - - /** - * holds inflight messages by idempotency key to ensure - * that we don't queue the same message multiple times - */ - const inflightMessages = new Map(); - /** Direct in-process handlers by queue prefix, bypassing HTTP when set. 
*/ + const scheduledMessages = new Map(); const directHandlers = new Map(); + let closed = false; - const queue: Queue['queue'] = async (queueName, message, opts) => { - const cleanup = [] as (() => void)[]; + const cleanupMessage = (message: ScheduledMessage) => { + if (message.timer) { + clearTimeout(message.timer); + message.timer = undefined; + } + if (message.idempotencyKey) { + scheduledMessages.delete(message.idempotencyKey); + } + }; - if (opts?.idempotencyKey) { - const existing = inflightMessages.get(opts.idempotencyKey); - if (existing) { - return { messageId: existing }; - } + const scheduleExecution = (message: ScheduledMessage, delayMs: number) => { + if (closed) { + cleanupMessage(message); + return; } - const body = transport.serialize(message); - const { pathname, prefix } = getQueueRoute(queueName); - const messageId = MessageId.parse(`msg_${generateId()}`); + if (message.timer) { + clearTimeout(message.timer); + message.timer = undefined; + } - if (opts?.idempotencyKey) { - const key = opts.idempotencyKey; - inflightMessages.set(key, messageId); - cleanup.push(() => { - inflightMessages.delete(key); - }); + const version = ++message.version; + const enqueueRun = () => { + message.pendingExecution = true; + if (!message.running) { + void executeMessage(message); + } + }; + + if (delayMs <= 0) { + enqueueRun(); + return; } - (async () => { - const token = semaphore.tryAcquire(); - if (!token) { - console.warn( - `[world-local]: concurrency limit (${WORKFLOW_LOCAL_QUEUE_CONCURRENCY}) reached, waiting for queue to free up` - ); - await semaphore.acquire(); + message.timer = globalThis.setTimeout(() => { + if (message.version !== version || closed) { + return; } + message.timer = undefined; + enqueueRun(); + }, delayMs); + }; + + const deliverMessage = async ( + message: ScheduledMessage + ): Promise< + | { kind: 'success' } + | { kind: 'timeout'; delayMs: number } + | { kind: 'server_error'; status: number; text: string } + > => { + const { pathname, 
prefix } = getQueueRoute(message.queueName); + const headers: Record = { + ...message.headers, + 'content-type': 'application/json', + 'x-vqs-queue-name': message.queueName, + 'x-vqs-message-id': message.messageId, + 'x-vqs-message-attempt': String(message.attempt + 1), + }; + const directHandler = directHandlers.get(prefix); + let response: Response; + + if (directHandler) { + const req = new Request( + `http://localhost/.well-known/workflow/v1/${pathname}`, + { + method: 'POST', + headers, + body: message.body, + } + ); + response = await directHandler(req); + } else { + const baseUrl = await resolveBaseUrl(config); + response = await fetch(`${baseUrl}/.well-known/workflow/v1/${pathname}`, { + method: 'POST', + duplex: 'half', + dispatcher: httpAgent, + headers, + body: message.body, + } as any); + } + + const text = await response.text(); + + if (response.ok) { try { - const maxAttempts = 3; - let defaultRetriesLeft = maxAttempts; - for (let attempt = 0; defaultRetriesLeft > 0; attempt++) { - defaultRetriesLeft--; - - const headers: Record = { - ...opts?.headers, - 'content-type': 'application/json', - 'x-vqs-queue-name': queueName, - 'x-vqs-message-id': messageId, - 'x-vqs-message-attempt': String(attempt + 1), + const timeoutSeconds = Number(JSON.parse(text).timeoutSeconds); + if (Number.isFinite(timeoutSeconds) && timeoutSeconds >= 0) { + return { + kind: 'timeout', + delayMs: timeoutSeconds > 0 ? 
timeoutSeconds * 1000 : 0, }; - const directHandler = directHandlers.get(prefix); - let response: Response; - - if (directHandler) { - const req = new Request( - `http://localhost/.well-known/workflow/v1/${pathname}`, - { - method: 'POST', - headers, - body, - } - ); - response = await directHandler(req); - } else { - const baseUrl = await resolveBaseUrl(config); - // eslint-disable-next-line @typescript-eslint/no-explicit-any -- undici v7 dispatcher types don't match @types/node's RequestInit - response = await fetch( - `${baseUrl}/.well-known/workflow/v1/${pathname}`, - { - method: 'POST', - duplex: 'half', - dispatcher: httpAgent, - headers, - body, - } as any - ); + } + } catch {} + + return { kind: 'success' }; + } + + return { + kind: 'server_error', + status: response.status, + text, + }; + }; + + const executeMessage = async (message: ScheduledMessage): Promise => { + if (closed || message.running) { + return; + } + + message.running = true; + + try { + while (message.pendingExecution && !closed) { + message.pendingExecution = false; + const version = message.version; + const token = semaphore.tryAcquire(); + if (!token) { + console.warn( + `[world-local]: concurrency limit (${WORKFLOW_LOCAL_QUEUE_CONCURRENCY}) reached, waiting for queue to free up` + ); + await semaphore.acquire(); + } + + try { + if (closed) { + cleanupMessage(message); + return; + } + + if (version !== message.version) { + continue; } - const text = await response.text(); - - if (response.ok) { - try { - const timeoutSeconds = Number(JSON.parse(text).timeoutSeconds); - if (Number.isFinite(timeoutSeconds) && timeoutSeconds >= 0) { - // Clamp to MAX_SAFE_TIMEOUT_MS to avoid Node.js setTimeout overflow warning. - // When this fires early, the handler recalculates remaining time from - // persistent state and returns another timeoutSeconds if needed. 
- if (timeoutSeconds > 0) { - const timeoutMs = Math.min( - timeoutSeconds * 1000, - MAX_SAFE_TIMEOUT_MS - ); - await setTimeout(timeoutMs); - } - defaultRetriesLeft++; - continue; - } - } catch {} + const result = await deliverMessage(message); + + if (result.kind === 'success') { + cleanupMessage(message); return; } + if (result.kind === 'timeout') { + message.attempt += 1; + scheduleExecution( + message, + result.delayMs === 0 + ? 0 + : Math.min(result.delayMs, LOCAL_QUEUE_MAX_VISIBILITY * 1000) + ); + continue; + } + console.error( - `[world-local] Queue message failed (attempt ${attempt + 1}/${maxAttempts}, status ${response.status}): ${text}`, - { queueName, messageId } + `[world-local] Queue message failed (attempt ${ + message.attempt + 1 + }/3, status ${result.status}): ${result.text}`, + { queueName: message.queueName, messageId: message.messageId } ); + + message.attempt += 1; + message.remainingServerRetries -= 1; + if (message.remainingServerRetries > 0) { + scheduleExecution(message, 0); + continue; + } + + console.error(`[world-local] Queue message exhausted all retries`, { + queueName: message.queueName, + messageId: message.messageId, + }); + cleanupMessage(message); + return; + } finally { + semaphore.release(); } + } + } catch (err) { + const queueError = err as { name?: string }; + const isAbortError = + queueError.name === 'AbortError' || + queueError.name === 'ResponseAborted'; + if (!isAbortError) { + console.error('[local world] Queue operation failed:', err); + } + cleanupMessage(message); + } finally { + message.running = false; + if (message.pendingExecution && !closed) { + void executeMessage(message); + } + } + }; - console.error(`[world-local] Queue message exhausted all retries`, { - queueName, - messageId, - }); - } finally { - semaphore.release(); + const queue: Queue['queue'] = async (queueName, message, opts) => { + const body = transport.serialize(message); + const delayMs = + typeof opts?.delaySeconds === 'number' && 
opts.delaySeconds > 0 + ? opts.delaySeconds * 1000 + : 0; + + if (opts?.idempotencyKey) { + const existing = scheduledMessages.get(opts.idempotencyKey); + if (existing) { + existing.queueName = queueName; + existing.body = body; + existing.headers = opts.headers; + scheduleExecution(existing, delayMs); + return { messageId: existing.messageId }; } - })() - .catch((err) => { - // Silently ignore client disconnect errors (e.g., browser refresh during streaming) - // These are expected and should not cause unhandled rejection warnings - const isAbortError = - err?.name === 'AbortError' || err?.name === 'ResponseAborted'; - if (!isAbortError) { - console.error('[local world] Queue operation failed:', err); - } - }) - .finally(() => { - for (const fn of cleanup) { - fn(); - } - }); + } + + const scheduledMessage: ScheduledMessage = { + attempt: 0, + body, + headers: opts?.headers, + idempotencyKey: opts?.idempotencyKey, + messageId: MessageId.parse(`msg_${generateId()}`), + pendingExecution: false, + queueName, + remainingServerRetries: 3, + running: false, + version: 0, + }; - return { messageId }; + if (opts?.idempotencyKey) { + scheduledMessages.set(opts.idempotencyKey, scheduledMessage); + } + + scheduleExecution(scheduledMessage, delayMs); + return { messageId: scheduledMessage.messageId }; }; const HeaderParser = z.object({ @@ -270,6 +367,11 @@ export function createQueue(config: Partial): LocalQueue { directHandlers.set(prefix, handler); }, async close() { + closed = true; + for (const message of scheduledMessages.values()) { + cleanupMessage(message); + } + scheduledMessages.clear(); await httpAgent.close(); }, }; diff --git a/packages/world-postgres/README.md b/packages/world-postgres/README.md index 7e2888f69f..a96cf3b680 100644 --- a/packages/world-postgres/README.md +++ b/packages/world-postgres/README.md @@ -117,7 +117,7 @@ Make sure your PostgreSQL database is accessible and the user has sufficient per - **Durable Storage**: Stores workflow runs, events, 
steps, hooks, and webhooks in PostgreSQL - **Queue Processing**: Uses graphile-worker as the durable queue and executes jobs over the workflow HTTP routes - **Durable Delays**: Re-schedules waits and retries in PostgreSQL -- **Flow Limits**: Enforces durable concurrency/rate limits with PostgreSQL-backed leases, rate tokens, and waiter promotion +- **Flow Limits**: Implements the shared concurrency/rate-limit contract with PostgreSQL-backed leases, rate tokens, FIFO waiters, and prompt wake-ups - **Streaming**: Real-time event streaming capabilities - **Health Checks**: Built-in connection health monitoring - **Configurable Concurrency**: Adjustable worker concurrency for queue processing @@ -129,9 +129,12 @@ Make sure your PostgreSQL database is accessible and the user has sufficient per - Backlog stays in PostgreSQL when all execution slots are busy - Retry and sleep-style delays use Graphile `runAt` scheduling - Flow-limit waiters are stored durably in PostgreSQL and promoted in FIFO order per key +- Cancelled workflow and failed/completed step waiters are pruned before promotion - Blocked steps are re-queued instead of holding a worker slot while waiting for a lease - Workflow and step execution is sent through `/.well-known/workflow/v1/flow` and `/.well-known/workflow/v1/step` +PostgreSQL's main advantage over the local world is durability of the queue/backlog itself across host or process loss. The flow-limit behavior is intended to match other implemented worlds while the process is alive. 
+ ## Development For local development, you can use the included Docker Compose configuration: diff --git a/packages/world-postgres/src/limits.test.ts b/packages/world-postgres/src/limits.test.ts index 35358b9f15..5d8e1a74f6 100644 --- a/packages/world-postgres/src/limits.test.ts +++ b/packages/world-postgres/src/limits.test.ts @@ -1,17 +1,12 @@ -import { asc, eq } from 'drizzle-orm'; -import { WorkflowWorldError } from '@workflow/errors'; -import { - afterAll, - beforeAll, - beforeEach, - describe, - expect, - it, - test, -} from 'vitest'; +import { afterAll, beforeAll, beforeEach, test } from 'vitest'; import { createLimitsContractSuite } from '../../world-testing/src/limits-contract.js'; -import * as Schema from './drizzle/schema.js'; import { createLimits } from './limits.js'; +import { + createEventsStorage, + createRunsStorage, + createStepsStorage, +} from './storage.js'; +import { createQueue } from './queue.js'; if (process.platform === 'win32') { test.skip('skipped on Windows since it relies on a docker container', () => {}); @@ -19,10 +14,16 @@ if (process.platform === 'win32') { let db: Awaited< ReturnType >; + let queue: ReturnType; beforeAll(async () => { const { createPostgresTestDb } = await import('../test/test-db.js'); db = await createPostgresTestDb(); + queue = createQueue( + { connectionString: db.connectionString, queueConcurrency: 1 }, + db.sql + ); + await queue.start(); }, 120_000); beforeEach(async () => { @@ -30,7 +31,8 @@ if (process.platform === 'win32') { }); afterAll(async () => { - await db.close(); + await queue?.close(); + await db?.close(); }); createLimitsContractSuite('postgres world limits', async () => { @@ -39,281 +41,11 @@ if (process.platform === 'win32') { { connectionString: db.connectionString, queueConcurrency: 1 }, db.drizzle ), + storage: { + runs: createRunsStorage(db.drizzle), + steps: createStepsStorage(db.drizzle), + events: createEventsStorage(db.drizzle), + }, }; }); - - describe('postgres waiter promotion', () 
=> { - it('throws WorkflowWorldError when heartbeating a missing lease', async () => { - const limits = createLimits( - { connectionString: db.connectionString, queueConcurrency: 1 }, - db.drizzle - ); - - await expect( - limits.heartbeat({ - leaseId: 'lmt_missing', - }) - ).rejects.toBeInstanceOf(WorkflowWorldError); - }); - - it('serializes concurrent acquires for the same key', async () => { - const limits = createLimits( - { connectionString: db.connectionString, queueConcurrency: 1 }, - db.drizzle - ); - - const results = await Promise.all( - Array.from({ length: 12 }, (_, index) => - limits.acquire({ - key: 'workflow:user:concurrent', - holderId: `holder-${index}`, - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }) - ) - ); - - const acquired = results.filter((result) => result.status === 'acquired'); - const blocked = results.filter((result) => result.status === 'blocked'); - - expect(acquired).toHaveLength(1); - expect(blocked).toHaveLength(11); - - const leases = await db.drizzle - .select({ holderId: Schema.limitLeases.holderId }) - .from(Schema.limitLeases) - .where(eq(Schema.limitLeases.limitKey, 'workflow:user:concurrent')); - const waiters = await db.drizzle - .select({ holderId: Schema.limitWaiters.holderId }) - .from(Schema.limitWaiters) - .where(eq(Schema.limitWaiters.limitKey, 'workflow:user:concurrent')); - - expect(leases).toHaveLength(1); - expect(waiters).toHaveLength(11); - }); - - it('promotes the earliest waiter on release', async () => { - const limits = createLimits( - { connectionString: db.connectionString, queueConcurrency: 1 }, - db.drizzle - ); - - const first = await limits.acquire({ - key: 'workflow:user:ordered', - holderId: 'holder-a', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); - expect(first.status).toBe('acquired'); - if (first.status !== 'acquired') throw new Error('expected acquisition'); - - const second = await limits.acquire({ - key: 'workflow:user:ordered', - holderId: 
'holder-b', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); - const third = await limits.acquire({ - key: 'workflow:user:ordered', - holderId: 'holder-c', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); - - expect(second.status).toBe('blocked'); - expect(third.status).toBe('blocked'); - - await limits.release({ - leaseId: first.lease.leaseId, - holderId: first.lease.holderId, - key: first.lease.key, - }); - - const leases = await db.drizzle - .select({ holderId: Schema.limitLeases.holderId }) - .from(Schema.limitLeases) - .where(eq(Schema.limitLeases.limitKey, first.lease.key)) - .orderBy( - asc(Schema.limitLeases.acquiredAt), - asc(Schema.limitLeases.leaseId) - ); - const waiters = await db.drizzle - .select({ holderId: Schema.limitWaiters.holderId }) - .from(Schema.limitWaiters) - .where(eq(Schema.limitWaiters.limitKey, first.lease.key)) - .orderBy( - asc(Schema.limitWaiters.createdAt), - asc(Schema.limitWaiters.waiterId) - ); - - expect(leases).toEqual([{ holderId: 'holder-b' }]); - expect(waiters).toEqual([{ holderId: 'holder-c' }]); - - const stillWaiting = await limits.acquire({ - key: 'workflow:user:ordered', - holderId: 'holder-c', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); - expect(stillWaiting.status).toBe('blocked'); - }); - - it('skips cancelled workflow waiters before promotion', async () => { - const limits = createLimits( - { connectionString: db.connectionString, queueConcurrency: 1 }, - db.drizzle - ); - - await db.drizzle.insert(Schema.runs).values([ - { - runId: 'wrun_dead_workflow', - deploymentId: 'deployment-123', - workflowName: 'test-workflow', - status: 'cancelled', - }, - ]); - - const first = await limits.acquire({ - key: 'workflow:user:skip-dead-workflow', - holderId: 'holder-a', - definition: { - concurrency: { max: 1 }, - rate: { count: 2, periodMs: 5_000 }, - }, - leaseTtlMs: 5_000, - }); - expect(first.status).toBe('acquired'); - if (first.status !== 'acquired') 
throw new Error('expected acquisition'); - - await limits.acquire({ - key: 'workflow:user:skip-dead-workflow', - holderId: 'wflock_wrun_dead_workflow:limitwait_dead', - definition: { - concurrency: { max: 1 }, - rate: { count: 2, periodMs: 5_000 }, - }, - leaseTtlMs: 5_000, - }); - await limits.acquire({ - key: 'workflow:user:skip-dead-workflow', - holderId: 'holder-live', - definition: { - concurrency: { max: 1 }, - rate: { count: 2, periodMs: 5_000 }, - }, - leaseTtlMs: 5_000, - }); - - await limits.release({ - leaseId: first.lease.leaseId, - holderId: first.lease.holderId, - key: first.lease.key, - }); - - const leases = await db.drizzle - .select({ holderId: Schema.limitLeases.holderId }) - .from(Schema.limitLeases) - .where(eq(Schema.limitLeases.limitKey, first.lease.key)) - .orderBy(asc(Schema.limitLeases.acquiredAt)); - const tokens = await db.drizzle - .select({ holderId: Schema.limitTokens.holderId }) - .from(Schema.limitTokens) - .where(eq(Schema.limitTokens.limitKey, first.lease.key)) - .orderBy(asc(Schema.limitTokens.acquiredAt)); - const waiters = await db.drizzle - .select({ holderId: Schema.limitWaiters.holderId }) - .from(Schema.limitWaiters) - .where(eq(Schema.limitWaiters.limitKey, first.lease.key)) - .orderBy(asc(Schema.limitWaiters.createdAt)); - - expect(leases).toEqual([{ holderId: 'holder-live' }]); - expect(tokens).toEqual([ - { holderId: first.lease.holderId }, - { holderId: 'holder-live' }, - ]); - expect(waiters).toEqual([]); - }); - - it('skips failed step waiters before promotion', async () => { - const limits = createLimits( - { connectionString: db.connectionString, queueConcurrency: 1 }, - db.drizzle - ); - - await db.drizzle.insert(Schema.runs).values([ - { - runId: 'wrun_dead_step', - deploymentId: 'deployment-123', - workflowName: 'test-workflow', - status: 'running', - startedAt: new Date(), - }, - { - runId: 'wrun_live_step', - deploymentId: 'deployment-123', - workflowName: 'test-workflow', - status: 'running', - startedAt: new 
Date(), - }, - ]); - await db.drizzle.insert(Schema.steps).values([ - { - runId: 'wrun_dead_step', - stepId: 'step_dead', - stepName: 'test-step', - status: 'failed', - attempt: 1, - }, - { - runId: 'wrun_live_step', - stepId: 'step_live', - stepName: 'test-step', - status: 'pending', - attempt: 0, - }, - ]); - - const first = await limits.acquire({ - key: 'workflow:user:skip-dead-step', - holderId: 'holder-a', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 5_000, - }); - expect(first.status).toBe('acquired'); - if (first.status !== 'acquired') throw new Error('expected acquisition'); - - await limits.acquire({ - key: 'workflow:user:skip-dead-step', - holderId: 'stplock_wrun_dead_step:step_dead:0', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 5_000, - }); - await limits.acquire({ - key: 'workflow:user:skip-dead-step', - holderId: 'holder-live', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 5_000, - }); - - await limits.release({ - leaseId: first.lease.leaseId, - holderId: first.lease.holderId, - key: first.lease.key, - }); - - const leases = await db.drizzle - .select({ holderId: Schema.limitLeases.holderId }) - .from(Schema.limitLeases) - .where(eq(Schema.limitLeases.limitKey, first.lease.key)) - .orderBy(asc(Schema.limitLeases.acquiredAt)); - const waiters = await db.drizzle - .select({ holderId: Schema.limitWaiters.holderId }) - .from(Schema.limitWaiters) - .where(eq(Schema.limitWaiters.limitKey, first.lease.key)) - .orderBy(asc(Schema.limitWaiters.createdAt)); - - expect(leases).toEqual([{ holderId: 'holder-live' }]); - expect(waiters).toEqual([]); - }); - }); } diff --git a/packages/world-testing/src/index.mts b/packages/world-testing/src/index.mts index 4b59e15267..db42585942 100644 --- a/packages/world-testing/src/index.mts +++ b/packages/world-testing/src/index.mts @@ -2,6 +2,8 @@ import { addition } from './addition.mjs'; import { errors } from './errors.mjs'; import { hooks } from './hooks.mjs'; import { idempotency } from 
'./idempotency.mjs'; +export { createLimitsContractSuite } from './limits-contract.js'; +export { createLimitsRuntimeSuite } from './limits-runtime.js'; import { nullByte } from './null-byte.mjs'; export function createTestSuite(pkgName: string) { diff --git a/packages/world-testing/src/limits-contract.ts b/packages/world-testing/src/limits-contract.ts index 5037039e83..2a65750181 100644 --- a/packages/world-testing/src/limits-contract.ts +++ b/packages/world-testing/src/limits-contract.ts @@ -1,17 +1,77 @@ import { setTimeout as sleep } from 'node:timers/promises'; -import type { Limits } from '@workflow/world'; +import { + SPEC_VERSION_CURRENT, + type Limits, + type Storage, +} from '@workflow/world'; import { describe, expect, it } from 'vitest'; export interface LimitsHarness { limits: Limits; + storage?: Pick; close?: () => Promise; } +async function createRun( + storage: Pick, + workflowName: string +) { + const result = await storage.events.create(null, { + eventType: 'run_created', + specVersion: SPEC_VERSION_CURRENT, + eventData: { + deploymentId: 'deployment-123', + workflowName, + input: [], + }, + }); + if (!result.run) { + throw new Error('expected run'); + } + return result.run; +} + +async function createStep( + storage: Pick, + runId: string, + stepId: string +) { + const result = await storage.events.create(runId, { + eventType: 'step_created', + specVersion: SPEC_VERSION_CURRENT, + correlationId: stepId, + eventData: { + stepName: 'test-step', + input: [], + }, + }); + if (!result.step) { + throw new Error('expected step'); + } + return result.step; +} + export function createLimitsContractSuite( name: string, createHarness: () => Promise ) { describe(name, () => { + it('throws a workflow world error when heartbeating a missing lease', async () => { + const harness = await createHarness(); + try { + await expect( + harness.limits.heartbeat({ + leaseId: 'lmt_missing', + }) + ).rejects.toMatchObject({ + name: 'WorkflowWorldError', + message: 
expect.stringContaining('not found'), + }); + } finally { + await harness.close?.(); + } + }); + it('enforces per-key concurrency limits', async () => { const harness = await createHarness(); try { @@ -54,13 +114,40 @@ export function createLimitsContractSuite( } }); - it('returns a retry path when rate limits block acquisition', async () => { + it('serializes concurrent acquires for the same key', async () => { const harness = await createHarness(); try { + const results = await Promise.all( + Array.from({ length: 12 }, (_, index) => + harness.limits.acquire({ + key: 'workflow:user:concurrent', + holderId: `holder-${index}`, + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }) + ) + ); + + const acquired = results.filter( + (result) => result.status === 'acquired' + ); + const blocked = results.filter((result) => result.status === 'blocked'); + + expect(acquired).toHaveLength(1); + expect(blocked).toHaveLength(11); + } finally { + await harness.close?.(); + } + }); + + it('keeps rate capacity consumed until the window expires', async () => { + const harness = await createHarness(); + try { + const periodMs = 200; const first = await harness.limits.acquire({ key: 'step:provider:openai', holderId: 'holder-a', - definition: { rate: { count: 1, periodMs: 100 } }, + definition: { rate: { count: 1, periodMs } }, leaseTtlMs: 1_000, }); expect(first.status).toBe('acquired'); @@ -76,13 +163,31 @@ export function createLimitsContractSuite( const second = await harness.limits.acquire({ key: 'step:provider:openai', holderId: 'holder-b', - definition: { rate: { count: 1, periodMs: 100 } }, + definition: { rate: { count: 1, periodMs } }, leaseTtlMs: 1_000, }); expect(second.status).toBe('blocked'); if (second.status !== 'blocked') throw new Error('expected blocked'); expect(second.reason).toBe('rate'); expect(second.retryAfterMs).toBeGreaterThanOrEqual(0); + + let third = await harness.limits.acquire({ + key: 'step:provider:openai', + holderId: 'holder-c', + 
definition: { rate: { count: 1, periodMs } }, + leaseTtlMs: 1_000, + }); + const deadline = Date.now() + periodMs + 1_000; + while (third.status === 'blocked' && Date.now() < deadline) { + await sleep(Math.max(25, third.retryAfterMs) + 50); + third = await harness.limits.acquire({ + key: 'step:provider:openai', + holderId: 'holder-c', + definition: { rate: { count: 1, periodMs } }, + leaseTtlMs: 1_000, + }); + } + expect(third.status).toBe('acquired'); } finally { await harness.close?.(); } @@ -187,5 +292,226 @@ export function createLimitsContractSuite( await harness.close?.(); } }); + + it('promotes waiters in FIFO order per key', async () => { + const harness = await createHarness(); + try { + const first = await harness.limits.acquire({ + key: 'workflow:user:ordered', + holderId: 'holder-a', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') + throw new Error('expected acquisition'); + + const second = await harness.limits.acquire({ + key: 'workflow:user:ordered', + holderId: 'holder-b', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + const third = await harness.limits.acquire({ + key: 'workflow:user:ordered', + holderId: 'holder-c', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + + expect(second.status).toBe('blocked'); + expect(third.status).toBe('blocked'); + + await harness.limits.release({ + leaseId: first.lease.leaseId, + holderId: first.lease.holderId, + key: first.lease.key, + }); + + const promoted = await harness.limits.acquire({ + key: 'workflow:user:ordered', + holderId: 'holder-b', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + const stillWaiting = await harness.limits.acquire({ + key: 'workflow:user:ordered', + holderId: 'holder-c', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + + expect(promoted.status).toBe('acquired'); + 
expect(stillWaiting.status).toBe('blocked'); + if (promoted.status !== 'acquired') + throw new Error('expected waiter-b promotion'); + + await harness.limits.release({ + leaseId: promoted.lease.leaseId, + holderId: promoted.lease.holderId, + key: promoted.lease.key, + }); + + const thirdPromoted = await harness.limits.acquire({ + key: 'workflow:user:ordered', + holderId: 'holder-c', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + + expect(thirdPromoted.status).toBe('acquired'); + } finally { + await harness.close?.(); + } + }); + + it('skips cancelled workflow waiters before promotion', async () => { + const harness = await createHarness(); + try { + if (!harness.storage) { + throw new Error('storage is required for workflow waiter liveness'); + } + + const deadRun = await createRun(harness.storage, 'dead-workflow'); + await harness.storage.events.create(deadRun.runId, { + eventType: 'run_started', + specVersion: SPEC_VERSION_CURRENT, + }); + await harness.storage.events.create(deadRun.runId, { + eventType: 'run_cancelled', + specVersion: SPEC_VERSION_CURRENT, + }); + + const liveRun = await createRun(harness.storage, 'live-workflow'); + await harness.storage.events.create(liveRun.runId, { + eventType: 'run_started', + specVersion: SPEC_VERSION_CURRENT, + }); + + const first = await harness.limits.acquire({ + key: 'workflow:user:skip-dead-workflow', + holderId: 'holder-a', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 5_000, + }); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') + throw new Error('expected acquisition'); + + await harness.limits.acquire({ + key: 'workflow:user:skip-dead-workflow', + holderId: `wflock_${deadRun.runId}:limitwait_dead`, + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 5_000, + }); + await harness.limits.acquire({ + key: 'workflow:user:skip-dead-workflow', + holderId: `wflock_${liveRun.runId}:limitwait_live`, + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 
5_000, + }); + + await harness.limits.release({ + leaseId: first.lease.leaseId, + holderId: first.lease.holderId, + key: first.lease.key, + }); + + const promoted = await harness.limits.acquire({ + key: 'workflow:user:skip-dead-workflow', + holderId: `wflock_${liveRun.runId}:limitwait_live`, + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 5_000, + }); + + expect(promoted.status).toBe('acquired'); + } finally { + await harness.close?.(); + } + }); + + it('skips failed step waiters before promotion', async () => { + const harness = await createHarness(); + try { + if (!harness.storage) { + throw new Error('storage is required for step waiter liveness'); + } + + const deadRun = await createRun(harness.storage, 'dead-step-workflow'); + await harness.storage.events.create(deadRun.runId, { + eventType: 'run_started', + specVersion: SPEC_VERSION_CURRENT, + }); + const deadStep = await createStep( + harness.storage, + deadRun.runId, + 'step-dead' + ); + await harness.storage.events.create(deadRun.runId, { + eventType: 'step_started', + specVersion: SPEC_VERSION_CURRENT, + correlationId: deadStep.stepId, + }); + await harness.storage.events.create(deadRun.runId, { + eventType: 'step_failed', + specVersion: SPEC_VERSION_CURRENT, + correlationId: deadStep.stepId, + eventData: { + error: { name: 'Error', message: 'failed waiter' }, + }, + } as any); + + const liveRun = await createRun(harness.storage, 'live-step-workflow'); + await harness.storage.events.create(liveRun.runId, { + eventType: 'run_started', + specVersion: SPEC_VERSION_CURRENT, + }); + const liveStep = await createStep( + harness.storage, + liveRun.runId, + 'step-live' + ); + + const first = await harness.limits.acquire({ + key: 'step:skip-dead-step', + holderId: 'holder-a', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 5_000, + }); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') + throw new Error('expected acquisition'); + + await harness.limits.acquire({ + key: 
'step:skip-dead-step', + holderId: `stplock_${deadRun.runId}:${deadStep.stepId}:0`, + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 5_000, + }); + await harness.limits.acquire({ + key: 'step:skip-dead-step', + holderId: `stplock_${liveRun.runId}:${liveStep.stepId}:0`, + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 5_000, + }); + + await harness.limits.release({ + leaseId: first.lease.leaseId, + holderId: first.lease.holderId, + key: first.lease.key, + }); + + const promoted = await harness.limits.acquire({ + key: 'step:skip-dead-step', + holderId: `stplock_${liveRun.runId}:${liveStep.stepId}:0`, + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 5_000, + }); + + expect(promoted.status).toBe('acquired'); + } finally { + await harness.close?.(); + } + }); }); } diff --git a/packages/world-testing/src/limits-runtime.ts b/packages/world-testing/src/limits-runtime.ts new file mode 100644 index 0000000000..60bfe5789d --- /dev/null +++ b/packages/world-testing/src/limits-runtime.ts @@ -0,0 +1,218 @@ +import { describe, expect, it } from 'vitest'; + +type WorkflowLockContentionResult = { + workflowLockAcquiredAt: number; + workflowLockReleasedAt: number; + stepLockAcquiredAt: number; + stepLockReleasedAt: number; +}; + +type StepLockNoRetriesResult = { + label: string; + attempt: number; + acquiredAt: number; + releasedAt: number; +}; + +type WorkflowOnlyLockResult = { + label: string; + workflowLockAcquiredAt: number; + workflowLockReleasedAt: number; +}; + +type WorkflowRateLimitResult = { + label: string; + workflowRateAcquiredAt: number; + workflowRateReleasedAt: number; + periodMs: number; +}; + +export interface LimitsRuntimeHarness { + runWorkflowWithWorkflowAndStepLocks(userId: string): Promise<{ + workflowKey: string; + dbKey: string; + aiKey: string; + summary: string; + }>; + runWorkflowLockContention( + userId: string, + holdMs: number + ): Promise<[WorkflowLockContentionResult, WorkflowLockContentionResult]>; + 
runStepLockNoRetriesContention( + userId: string, + holdMs: number + ): Promise< + [StepLockNoRetriesResult, StepLockNoRetriesResult, StepLockNoRetriesResult] + >; + runWorkflowLockAcrossSuspension( + userId: string, + holdMs: number + ): Promise<[WorkflowOnlyLockResult, WorkflowOnlyLockResult]>; + runWorkflowRateLimitContention( + userId: string, + holdMs: number, + periodMs: number + ): Promise<[WorkflowRateLimitResult, WorkflowRateLimitResult]>; + runWorkflowFifoThreeWaiters( + userId: string, + holdMs: number + ): Promise< + [WorkflowOnlyLockResult, WorkflowOnlyLockResult, WorkflowOnlyLockResult] + >; + runCancelledWorkflowWaiter( + userId: string, + holdMs: number + ): Promise<{ + cancelledError: unknown; + resultA: WorkflowOnlyLockResult; + resultC: WorkflowOnlyLockResult; + }>; + runIndependentWorkflowKeys( + holdMs: number + ): Promise<[WorkflowOnlyLockResult, WorkflowOnlyLockResult]>; +} + +export function createLimitsRuntimeSuite( + name: string, + createHarness: () => Promise +) { + describe(name, () => { + it('runs workflow and step locks end-to-end', async () => { + const harness = await createHarness(); + const userId = 'shared-user'; + const result = await harness.runWorkflowWithWorkflowAndStepLocks(userId); + + expect(result).toMatchObject({ + workflowKey: `workflow:user:${userId}`, + dbKey: 'step:db:cheap', + aiKey: 'step:provider:openai', + summary: `summary:profile:${userId}`, + }); + }); + + it('serializes workflow and step admission under contention', async () => { + const harness = await createHarness(); + const [resultA, resultB] = await harness.runWorkflowLockContention( + 'shared-user', + 750 + ); + + expect(resultB.workflowLockAcquiredAt).toBeGreaterThanOrEqual( + resultA.workflowLockReleasedAt + ); + expect(resultB.stepLockAcquiredAt).toBeGreaterThanOrEqual( + resultA.stepLockReleasedAt + ); + }); + + it('wakes promoted workflow and step waiters promptly', async () => { + const harness = await createHarness(); + const [resultA, resultB] = 
await harness.runWorkflowLockContention( + 'shared-user', + 1_500 + ); + + expect( + resultB.workflowLockAcquiredAt - resultA.workflowLockReleasedAt + ).toBeLessThan(4_000); + expect( + resultB.stepLockAcquiredAt - resultA.stepLockReleasedAt + ).toBeLessThan(4_000); + }); + + it('does not consume retries while blocked on a top-of-step lock', async () => { + const harness = await createHarness(); + const [resultA, resultB, resultC] = + await harness.runStepLockNoRetriesContention('shared-user', 750); + const [firstResult, secondResult, thirdResult] = [ + resultA, + resultB, + resultC, + ].sort((left, right) => left.acquiredAt - right.acquiredAt); + + expect(resultA.attempt).toBe(1); + expect(resultB.attempt).toBe(1); + expect(resultC.attempt).toBe(1); + expect(secondResult.acquiredAt).toBeGreaterThanOrEqual( + firstResult.releasedAt + ); + expect(thirdResult.acquiredAt).toBeGreaterThanOrEqual( + secondResult.releasedAt + ); + }); + + it('keeps workflow locks held across suspension until the workflow finishes', async () => { + const harness = await createHarness(); + const [resultA, resultB] = await harness.runWorkflowLockAcrossSuspension( + 'shared-user', + 1_500 + ); + + expect(resultB.workflowLockAcquiredAt).toBeGreaterThanOrEqual( + resultA.workflowLockReleasedAt + ); + expect( + resultB.workflowLockAcquiredAt - resultA.workflowLockReleasedAt + ).toBeLessThan(4_000); + }); + + it('wakes rate-limited waiters only after the rate window expires', async () => { + const harness = await createHarness(); + const holdMs = 250; + const periodMs = 1_500; + const [resultA, resultB] = await harness.runWorkflowRateLimitContention( + 'shared-user', + holdMs, + periodMs + ); + + expect( + resultB.workflowRateAcquiredAt - resultA.workflowRateAcquiredAt + ).toBeGreaterThanOrEqual(periodMs - 100); + + const remainingWindowAfterRelease = + periodMs - + (resultA.workflowRateReleasedAt - resultA.workflowRateAcquiredAt); + expect( + resultB.workflowRateAcquiredAt - 
resultA.workflowRateReleasedAt + ).toBeGreaterThanOrEqual(Math.max(0, remainingWindowAfterRelease - 100)); + }); + + it('promotes 3 workflow waiters in FIFO order', async () => { + const harness = await createHarness(); + const [resultA, resultB, resultC] = + await harness.runWorkflowFifoThreeWaiters('shared-user', 750); + + expect(resultB.workflowLockAcquiredAt).toBeGreaterThanOrEqual( + resultA.workflowLockReleasedAt + ); + expect(resultC.workflowLockAcquiredAt).toBeGreaterThanOrEqual( + resultB.workflowLockReleasedAt + ); + }); + + it('skips cancelled workflow waiters before promoting the next run', async () => { + const harness = await createHarness(); + const { cancelledError, resultA, resultC } = + await harness.runCancelledWorkflowWaiter('shared-user', 1_500); + + expect(cancelledError).toBeTruthy(); + expect(resultC.workflowLockAcquiredAt).toBeGreaterThanOrEqual( + resultA.workflowLockReleasedAt + ); + expect( + resultC.workflowLockAcquiredAt - resultA.workflowLockReleasedAt + ).toBeLessThan(4_000); + }); + + it('does not block unrelated workflow keys', async () => { + const harness = await createHarness(); + const [resultA, resultB] = + await harness.runIndependentWorkflowKeys(1_000); + + expect(resultB.workflowLockAcquiredAt).toBeLessThan( + resultA.workflowLockReleasedAt + ); + }); + }); +} diff --git a/packages/world/FLOW_LIMITS.md b/packages/world/FLOW_LIMITS.md index 8306576d8a..565e258d71 100644 --- a/packages/world/FLOW_LIMITS.md +++ b/packages/world/FLOW_LIMITS.md @@ -7,12 +7,13 @@ implementations. ## Status - The shared `limits` interface and `lock()` API surface now exist. -- Local world has a working lease-based implementation for - acquire/release/heartbeat. -- Postgres now has a PostgreSQL-backed implementation with leases, rate tokens, - and durable waiters. +- Local world now implements the shared live-process limits semantics with + leases, rate tokens, FIFO waiters, and prompt wake-up with delayed fallback. 
+- Postgres implements the same limits semantics with PostgreSQL-backed leases, + rate tokens, durable waiters, and durable queue wake-up. - Vercel still exposes `limits` as a stub. -- The Next.js Turbopack workbench has E2E coverage for workflow and step locks. +- The Next.js Turbopack workbench has shared E2E coverage for workflow and step + locks on implemented worlds. ## Goals @@ -30,6 +31,34 @@ implementations. - `step limit`: execution control for a specific step/resource key. - `lease`: durable record that a workflow or step currently occupies capacity for a key. +## Shared Contract vs World-Specific Behavior + +The limits contract is intended to describe one shared set of observable +semantics across implemented worlds. That shared contract includes: + +- `acquire()`, `release()`, and `heartbeat()` surface behavior +- `WorkflowWorldError` when heartbeating a missing lease +- per-key concurrency and rate limiting outcomes +- same-holder lease reuse +- serialization of concurrent acquires for a single key +- FIFO waiter promotion per key +- pruning cancelled workflow waiters and failed/completed step waiters +- blocked acquisitions not consuming execution concurrency +- prompt wake-up with delayed fallback replay + +World-specific behavior should be limited to implementation mechanics and +durability characteristics, for example: + +- how waiter state is stored internally +- how per-key mutations are serialized internally +- how prompt wake-up is delivered +- whether queued wake-ups survive process or host loss +- backend-specific observability or debugging surfaces + +That means SQL row layout, advisory locks, and Graphile jobs are PostgreSQL +implementation details, while FIFO fairness and waiter skipping are contract +behavior that local and Postgres should both exhibit. + ## Decisions So Far ### 1. Use one shared limits model @@ -185,8 +214,8 @@ another holds a step lock and each waits on the other. ### 9. 
Waiters are FIFO per key -The PostgreSQL implementation uses a durable waiter queue and promotes waiters -in FIFO order for a single limit key. +Implemented worlds use a waiter queue and promote waiters in FIFO order for a +single limit key. Important details: @@ -204,33 +233,37 @@ global scheduler. Blocked flow limits and worker concurrency are intentionally separate. -In the PostgreSQL world: +For implemented worlds: - blocked workflows are suspended and re-queued, not left running on a worker - blocked steps exit the current attempt and are re-queued instead of polling in a live worker slot -- backlog remains durable in PostgreSQL while worker slots are free to service - unrelated work +- worker slots are free to service unrelated work while the blocked execution is + waiting to be retried or promoted -This is the main practical difference between a durable waiter model and a pure -polling loop. +PostgreSQL additionally keeps that backlog durable in the database. The local +world keeps queue delivery in-memory, so cross-process crash recovery for the +backlog is explicitly outside the shared limits contract today. ### 11. Wake-up is prompt, with a delayed fallback -The PostgreSQL world uses Graphile for wake-up delivery, but PostgreSQL tables -remain the source of truth for limit state. +Implemented worlds use the world-owned limit state as the source of truth and +try to resume promoted waiters promptly, with a delayed fallback still in place +so progress is possible if an immediate wake-up is missed. 
Current behavior: -- leases, rate tokens, and waiters live in PostgreSQL tables -- promotion decisions are made from SQL state +- leases, rate tokens, and waiters live in world-owned limit state +- promotion decisions are made from that limit state - when a waiter is promoted, the runtime is woken by enqueuing the appropriate workflow or step job - workflows also keep a delayed replay fallback so progress is still possible if an immediate wake-up is missed -This means Graphile is used to resume work quickly, not to decide fairness or -capacity ownership. +PostgreSQL uses Graphile jobs for that wake-up path and keeps the backlog +durable across host/process failure. The local world uses an in-memory queue, so +prompt wake behavior matches while the process is alive, but durable backlog +survival is not guaranteed after process loss. ### 12. V1 semantics are intentionally opinionated @@ -254,9 +287,10 @@ More concretely: For the current local implementation specifically: -- workflow locks already behave like durable logical-scope leases -- step locks are still simpler than Postgres and do not provide the same durable - waiter/wake-up behavior +- workflow and step locks now follow the same live-process waiter/fairness + semantics as Postgres +- the queue remains in-memory, so queued wake-ups are not durable across process + loss This means the current v1 interpretation of a workflow lock is: diff --git a/workbench/example/tsconfig.json b/workbench/example/tsconfig.json index 39c2f1ea68..4e131954f0 100644 --- a/workbench/example/tsconfig.json +++ b/workbench/example/tsconfig.json @@ -3,6 +3,7 @@ "target": "es2022", "module": "NodeNext", "lib": ["dom", "dom.iterable", "esnext"], + "baseUrl": ".", "allowJs": true, "skipLibCheck": true, "strict": true, diff --git a/workbench/example/workflows/99_e2e.ts b/workbench/example/workflows/99_e2e.ts index f7d43aab9f..0d3db80e83 100644 --- a/workbench/example/workflows/99_e2e.ts +++ b/workbench/example/workflows/99_e2e.ts @@ -1,6 
+1,6 @@ // Test path alias resolution - imports a helper from outside the workbench directory /** biome-ignore-all lint/complexity/noStaticOnlyClass: */ -import { pathsAliasHelper } from '@repo/lib/steps/paths-alias-test'; +import { pathsAliasHelper } from '@repo/lib/steps/paths-alias-test.js'; import { createHook, createWebhook, @@ -15,8 +15,8 @@ import { sleep, } from 'workflow'; import { getRun, start } from 'workflow/api'; -import { importedStepOnly } from './_imported_step_only'; -import { callThrower, stepThatThrowsFromHelper } from './helpers'; +import { importedStepOnly } from './_imported_step_only.js'; +import { callThrower, stepThatThrowsFromHelper } from './helpers.js'; ////////////////////////////////////////////////////////// @@ -333,11 +333,66 @@ stepLockNoRetriesStep.maxRetries = 0; export async function stepLockNoRetriesContentionWorkflow( userId = 'user-123', - holdMs = 750 + holdMs = 750, + label = userId +) { + 'use workflow'; + + return await stepLockNoRetriesStep(label, holdMs); +} + +////////////////////////////////////////////////////////// + +export async function workflowOnlyLockContentionWorkflow( + userId = 'user-123', + holdMs = 750, + label = userId +) { + 'use workflow'; + + await using _workflowLock = await lock({ + key: `workflow:user:${userId}`, + concurrency: { max: 1 }, + leaseTtlMs: holdMs + 5_000, + }); + + const workflowLockAcquiredAt = Date.now(); + await sleep(holdMs); + const workflowLockReleasedAt = Date.now(); + + return { + label, + userId, + workflowLockAcquiredAt, + workflowLockReleasedAt, + }; +} + +export async function workflowRateLimitContentionWorkflow( + userId = 'user-123', + holdMs = 250, + periodMs = 1_500, + label = userId ) { 'use workflow'; - return await stepLockNoRetriesStep(userId, holdMs); + await using _workflowRateLimit = await lock({ + key: `workflow:rate:${userId}`, + rate: { count: 1, periodMs }, + leaseTtlMs: periodMs + 5_000, + }); + + const workflowRateAcquiredAt = Date.now(); + await 
sleep(holdMs); + const workflowRateReleasedAt = Date.now(); + + return { + label, + userId, + periodMs, + workflowRateAcquiredAt, + workflowRateReleasedAt, + }; } ////////////////////////////////////////////////////////// @@ -1277,7 +1332,7 @@ import { createVector, scaleVector, sumVectors, -} from './serde-steps'; +} from './serde-steps.js'; /** * Workflow that tests cross-context class registration. diff --git a/workbench/example/workflows/serde-steps.ts b/workbench/example/workflows/serde-steps.ts index 227de88399..9726bbe6c0 100644 --- a/workbench/example/workflows/serde-steps.ts +++ b/workbench/example/workflows/serde-steps.ts @@ -6,7 +6,7 @@ * step calls. This tests cross-context class registration. */ -import { Vector } from './serde-models'; +import { Vector } from './serde-models.js'; /** * Step that receives a Vector and scales it. From 39efdb38bffc6cac8cadeca4a6f63d6f9c068940 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Thu, 19 Mar 2026 15:42:38 -0400 Subject: [PATCH 13/16] add more e2e test cases Signed-off-by: nathancolosimo --- .github/workflows/tests.yml | 13 + packages/core/e2e/e2e.test.ts | 104 +++++++- packages/world-local/src/limits.test.ts | 36 ++- packages/world-postgres/src/limits.test.ts | 28 ++ packages/world-postgres/src/limits.ts | 15 +- packages/world-testing/src/limits-contract.ts | 249 +++++++++++++++++- packages/world-testing/src/limits-runtime.ts | 123 ++++++++- packages/world/FLOW_LIMITS.md | 22 +- workbench/example/workflows/99_e2e.ts | 214 ++++++++++++++- 9 files changed, 770 insertions(+), 34 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 768e857a2f..4f6d697db7 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -565,6 +565,19 @@ jobs: DEPLOYMENT_URL: "http://localhost:${{ matrix.app.name == 'sveltekit' && '4173' || (matrix.app.name == 'astro' && '4321' || '3000') }}" NEXT_CANARY: ${{ matrix.app.canary && '1' || '' }} + - name: Run Low-Concurrency 
Worker-Slot Test + if: ${{ !matrix.app.canary && matrix.app.name == 'nextjs-turbopack' }} + run: | + cd "${{ steps.prepare-workbench.outputs.workbench_app_path }}" && PORT=3001 WORKFLOW_POSTGRES_WORKER_CONCURRENCY=1 pnpm start & + echo "starting low-concurrency tests in 10 seconds" && sleep 10 + pnpm vitest run packages/core/e2e/e2e.test.ts -t "frees worker slots for unrelated workflows while a waiter is blocked" + env: + NODE_OPTIONS: "--enable-source-maps" + APP_NAME: ${{ matrix.app.name }} + WORKBENCH_APP_PATH: ${{ steps.prepare-workbench.outputs.workbench_app_path }} + DEPLOYMENT_URL: "http://localhost:3001" + WORKFLOW_LIMITS_LOW_CONCURRENCY: "1" + - name: Generate E2E summary if: always() run: node .github/scripts/aggregate-e2e-results.js . --job-name "E2E Local Postgres (${{ matrix.app.name }})" >> $GITHUB_STEP_SUMMARY || true diff --git a/packages/core/e2e/e2e.test.ts b/packages/core/e2e/e2e.test.ts index 5f7aefc97e..d20f73e037 100644 --- a/packages/core/e2e/e2e.test.ts +++ b/packages/core/e2e/e2e.test.ts @@ -283,8 +283,40 @@ describe('e2e', () => { const runB = await start(workflow, [userId, holdMs, 'B']); return await Promise.all([runA.returnValue, runB.returnValue]); }, - async runWorkflowRateLimitContention(userId, holdMs, periodMs) { - const workflow = await e2e('workflowRateLimitContentionWorkflow'); + async runWorkflowExpiredLeaseRecovery(userId, leaseTtlMs) { + const leakedWorkflow = await e2e('workflowLeakedLockWorkflow'); + const waiterWorkflow = await e2e( + 'workflowOnlyLockContentionWorkflow' + ); + const leakedRun = await start(leakedWorkflow, [ + userId, + leaseTtlMs, + 'A', + ]); + const leakedResult = await leakedRun.returnValue; + const waiterRun = await start(waiterWorkflow, [userId, 0, 'B']); + const waiterResult = await waiterRun.returnValue; + return [leakedResult, waiterResult]; + }, + async runStepExpiredLeaseRecovery(userId, leaseTtlMs) { + const leakedWorkflow = await e2e('stepLeakedLockWorkflow'); + const waiterWorkflow = await 
e2e('stepKeyLockContentionWorkflow'); + const leakedRun = await start(leakedWorkflow, [ + userId, + leaseTtlMs, + 'A', + ]); + const leakedResult = await leakedRun.returnValue; + const waiterRun = await start(waiterWorkflow, [ + leakedResult.key, + 0, + 'B', + ]); + const waiterResult = await waiterRun.returnValue; + return [leakedResult, waiterResult]; + }, + async runWorkflowMixedLimitContention(userId, holdMs, periodMs) { + const workflow = await e2e('workflowMixedLimitContentionWorkflow'); const runA = await start(workflow, [userId, holdMs, periodMs, 'A']); await sleep(100); const runB = await start(workflow, [userId, holdMs, periodMs, 'B']); @@ -325,6 +357,74 @@ describe('e2e', () => { const runB = await start(workflow, ['user-b', holdMs]); return await Promise.all([runA.returnValue, runB.returnValue]); }, + async runIndependentStepKeys(holdMs) { + const workflow = await e2e('stepKeyLockContentionWorkflow'); + const runA = await start(workflow, [ + 'step:db:isolation:a', + holdMs, + 'A', + ]); + await sleep(100); + const runB = await start(workflow, [ + 'step:db:isolation:b', + holdMs, + 'B', + ]); + return await Promise.all([runA.returnValue, runB.returnValue]); + }, + async runBlockedWaiterWithUnrelatedWorkflow(holdMs) { + const workflow = await e2e('workflowOnlyLockContentionWorkflow'); + const runA = await start(workflow, [ + 'worker-slot-shared', + holdMs, + 'A', + ]); + await sleep(100); + const runB = await start(workflow, [ + 'worker-slot-shared', + holdMs, + 'B', + ]); + await sleep(100); + const runC = await start(workflow, [ + 'worker-slot-unrelated', + Math.max(100, Math.floor(holdMs / 4)), + 'C', + ]); + + const [holder, waiter, unrelated] = await Promise.all([ + runA.returnValue, + runB.returnValue, + runC.returnValue, + ]); + return { holder, waiter, unrelated }; + }, + async runMidStepLockContract(holdMs) { + const holderWorkflow = await e2e('stepKeyLockContentionWorkflow'); + const waiterWorkflow = await e2e('midStepLockContentionWorkflow'); + 
const traceToken = `mid-step-${Date.now()}-${Math.random() + .toString(36) + .slice(2)}`; + const key = `step:db:mid-step:${traceToken}`; + + const holderRun = await start(holderWorkflow, [ + key, + holdMs, + 'holder', + ]); + await sleep(100); + const waiterRun = await start(waiterWorkflow, [ + key, + traceToken, + 'waiter', + ]); + + const [holder, waiter] = await Promise.all([ + holderRun.returnValue, + waiterRun.returnValue, + ]); + return { holder, waiter }; + }, }) ); } diff --git a/packages/world-local/src/limits.test.ts b/packages/world-local/src/limits.test.ts index 2e248bb516..6428422dbb 100644 --- a/packages/world-local/src/limits.test.ts +++ b/packages/world-local/src/limits.test.ts @@ -1,6 +1,6 @@ import { createLimitsContractSuite } from '../../world-testing/src/limits-contract.js'; import { createLocalWorld } from './index.js'; -import { mkdtemp, rm } from 'node:fs/promises'; +import { mkdtemp, readFile, rm } from 'node:fs/promises'; import os from 'node:os'; import path from 'node:path'; @@ -15,6 +15,40 @@ createLimitsContractSuite('local world limits', async () => { return { limits: world.limits, storage: world, + inspectKeyState: async (key) => { + const statePath = path.join(dir, 'limits', 'state.json'); + let raw: { + keys?: Record< + string, + { + leases?: { holderId: string }[]; + waiters?: { holderId: string }[]; + tokens?: { holderId: string }[]; + } + >; + }; + try { + raw = JSON.parse(await readFile(statePath, 'utf8')); + } catch (error) { + const code = (error as NodeJS.ErrnoException).code; + if (code === 'ENOENT') { + return { + leaseHolderIds: [], + waiterHolderIds: [], + tokenHolderIds: [], + }; + } + throw error; + } + + const keyState = raw.keys?.[key]; + return { + leaseHolderIds: keyState?.leases?.map((lease) => lease.holderId) ?? [], + waiterHolderIds: + keyState?.waiters?.map((waiter) => waiter.holderId) ?? [], + tokenHolderIds: keyState?.tokens?.map((token) => token.holderId) ?? 
[], + }; + }, close: async () => { await world.close?.(); await rm(dir, { recursive: true, force: true }); diff --git a/packages/world-postgres/src/limits.test.ts b/packages/world-postgres/src/limits.test.ts index 5d8e1a74f6..e7c8193788 100644 --- a/packages/world-postgres/src/limits.test.ts +++ b/packages/world-postgres/src/limits.test.ts @@ -46,6 +46,34 @@ if (process.platform === 'win32') { steps: createStepsStorage(db.drizzle), events: createEventsStorage(db.drizzle), }, + inspectKeyState: async (key) => { + const [leases, waiters, tokens] = await Promise.all([ + db.sql<{ holderId: string }[]>` + select holder_id as "holderId" + from workflow.workflow_limit_leases + where limit_key = ${key} + order by holder_id asc + `, + db.sql<{ holderId: string }[]>` + select holder_id as "holderId" + from workflow.workflow_limit_waiters + where limit_key = ${key} + order by created_at asc, holder_id asc + `, + db.sql<{ holderId: string }[]>` + select holder_id as "holderId" + from workflow.workflow_limit_tokens + where limit_key = ${key} + order by acquired_at asc, holder_id asc + `, + ]); + + return { + leaseHolderIds: leases.map((row) => row.holderId), + waiterHolderIds: waiters.map((row) => row.holderId), + tokenHolderIds: tokens.map((row) => row.holderId), + }; + }, }; }); } diff --git a/packages/world-postgres/src/limits.ts b/packages/world-postgres/src/limits.ts index 7e58f682f6..b83680a2f1 100644 --- a/packages/world-postgres/src/limits.ts +++ b/packages/world-postgres/src/limits.ts @@ -518,19 +518,22 @@ export function createLimits( // If there are already waiters for this key and holder no need to queue a new waiter. 
if (existingWaiter) { const now = Date.now(); + const concurrencyBlocked = + parsed.definition.concurrency !== undefined && + state.leases.length >= parsed.definition.concurrency.max; + const rateBlocked = + parsed.definition.rate !== undefined && + state.tokens.length >= parsed.definition.rate.count; return { status: 'blocked', - reason: getBlockedReason( - parsed.definition.concurrency !== undefined, - parsed.definition.rate !== undefined - ), + reason: getBlockedReason(concurrencyBlocked, rateBlocked), retryAfterMs: getRetryAfterMs( state.leases, state.tokens, now, - parsed.definition.concurrency !== undefined, - parsed.definition.rate !== undefined + concurrencyBlocked, + rateBlocked ) ?? 1000, } satisfies LimitAcquireResult; } diff --git a/packages/world-testing/src/limits-contract.ts b/packages/world-testing/src/limits-contract.ts index 2a65750181..dba6f7d291 100644 --- a/packages/world-testing/src/limits-contract.ts +++ b/packages/world-testing/src/limits-contract.ts @@ -9,6 +9,11 @@ import { describe, expect, it } from 'vitest'; export interface LimitsHarness { limits: Limits; storage?: Pick; + inspectKeyState: (key: string) => Promise<{ + leaseHolderIds: string[]; + waiterHolderIds: string[]; + tokenHolderIds: string[]; + }>; close?: () => Promise; } @@ -114,6 +119,31 @@ export function createLimitsContractSuite( } }); + it('isolates unrelated keys at the raw limits layer', async () => { + const harness = await createHarness(); + try { + const [first, second] = await Promise.all([ + harness.limits.acquire({ + key: 'workflow:user:a', + holderId: 'holder-a', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }), + harness.limits.acquire({ + key: 'workflow:user:b', + holderId: 'holder-b', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }), + ]); + + expect(first.status).toBe('acquired'); + expect(second.status).toBe('acquired'); + } finally { + await harness.close?.(); + } + }); + it('serializes concurrent acquires for the 
same key', async () => { const harness = await createHarness(); try { @@ -196,12 +226,13 @@ export function createLimitsContractSuite( it('returns a combined blocked reason when both limits are saturated', async () => { const harness = await createHarness(); try { + const periodMs = 300; const first = await harness.limits.acquire({ key: 'step:mixed', holderId: 'holder-a', definition: { concurrency: { max: 1 }, - rate: { count: 1, periodMs: 1_000 }, + rate: { count: 1, periodMs }, }, leaseTtlMs: 1_000, }); @@ -214,7 +245,7 @@ export function createLimitsContractSuite( holderId: 'holder-b', definition: { concurrency: { max: 1 }, - rate: { count: 1, periodMs: 1_000 }, + rate: { count: 1, periodMs }, }, leaseTtlMs: 1_000, }); @@ -222,19 +253,96 @@ export function createLimitsContractSuite( status: 'blocked', reason: 'concurrency_and_rate', }); + if (second.status !== 'blocked') throw new Error('expected blocked'); + + await harness.limits.release({ + leaseId: first.lease.leaseId, + key: first.lease.key, + holderId: first.lease.holderId, + }); + + const third = await harness.limits.acquire({ + key: 'step:mixed', + holderId: 'holder-b', + definition: { + concurrency: { max: 1 }, + rate: { count: 1, periodMs }, + }, + leaseTtlMs: 1_000, + }); + expect(third).toMatchObject({ + status: 'blocked', + reason: 'rate', + }); + + let fourth = third; + const deadline = Date.now() + periodMs + 1_000; + while (fourth.status === 'blocked' && Date.now() < deadline) { + await sleep(Math.max(25, fourth.retryAfterMs ?? 
0) + 50); + fourth = await harness.limits.acquire({ + key: 'step:mixed', + holderId: 'holder-b', + definition: { + concurrency: { max: 1 }, + rate: { count: 1, periodMs }, + }, + leaseTtlMs: 1_000, + }); + } + + expect(fourth.status).toBe('acquired'); } finally { await harness.close?.(); } }); - it('restores capacity when a lease is released or expires', async () => { + it('restores capacity immediately when a lease is released', async () => { const harness = await createHarness(); try { const first = await harness.limits.acquire({ key: 'workflow:user:123', holderId: 'holder-a', definition: { concurrency: { max: 1 } }, - leaseTtlMs: 500, + leaseTtlMs: 1_000, + }); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') + throw new Error('expected acquisition'); + + const second = await harness.limits.acquire({ + key: 'workflow:user:123', + holderId: 'holder-b', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + expect(second.status).toBe('blocked'); + + await harness.limits.release({ + leaseId: first.lease.leaseId, + key: first.lease.key, + holderId: first.lease.holderId, + }); + + const third = await harness.limits.acquire({ + key: 'workflow:user:123', + holderId: 'holder-b', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + expect(third.status).toBe('acquired'); + } finally { + await harness.close?.(); + } + }); + + it('extends lease expiry when heartbeated', async () => { + const harness = await createHarness(); + try { + const first = await harness.limits.acquire({ + key: 'workflow:user:heartbeat', + holderId: 'holder-a', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 200, }); expect(first.status).toBe('acquired'); if (first.status !== 'acquired') @@ -242,21 +350,55 @@ export function createLimitsContractSuite( const heartbeat = await harness.limits.heartbeat({ leaseId: first.lease.leaseId, - ttlMs: 1_000, + ttlMs: 600, }); + expect(heartbeat.expiresAt?.getTime()).toBeGreaterThan( 
first.lease.expiresAt?.getTime() ?? 0 ); - await sleep(1_100); + const second = await harness.limits.acquire({ + key: 'workflow:user:heartbeat', + holderId: 'holder-b', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + expect(second.status).toBe('blocked'); + } finally { + await harness.close?.(); + } + }); + + it('reclaims expired leases without manual cleanup', async () => { + const harness = await createHarness(); + try { + const first = await harness.limits.acquire({ + key: 'workflow:user:expired', + holderId: 'holder-a', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 250, + }); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') + throw new Error('expected acquisition'); const second = await harness.limits.acquire({ - key: 'workflow:user:123', + key: 'workflow:user:expired', holderId: 'holder-b', definition: { concurrency: { max: 1 } }, leaseTtlMs: 1_000, }); - expect(second.status).toBe('acquired'); + expect(second.status).toBe('blocked'); + + await sleep(400); + + const third = await harness.limits.acquire({ + key: 'workflow:user:expired', + holderId: 'holder-b', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + expect(third.status).toBe('acquired'); } finally { await harness.close?.(); } @@ -288,6 +430,21 @@ export function createLimitsContractSuite( holderId: first.lease.holderId, }, }); + + if (!harness.inspectKeyState) { + throw new Error( + 'inspectKeyState is required for duplicate lease checks' + ); + } + const keyState = await harness.inspectKeyState( + 'workflow:user:reacquire' + ); + expect( + keyState.leaseHolderIds.filter((holderId) => holderId === 'holder-a') + ).toHaveLength(1); + expect( + keyState.waiterHolderIds.filter((holderId) => holderId === 'holder-a') + ).toHaveLength(0); } finally { await harness.close?.(); } @@ -513,5 +670,81 @@ export function createLimitsContractSuite( await harness.close?.(); } }); + + it('does not duplicate a replayed blocked holder 
waiter or lease', async () => { + const harness = await createHarness(); + try { + const key = 'workflow:user:replay'; + const blockedHolderId = 'wflock_wrun_replay:corr_replay:holder_replay'; + + const first = await harness.limits.acquire({ + key, + holderId: 'holder-a', + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + expect(first.status).toBe('acquired'); + if (first.status !== 'acquired') + throw new Error('expected acquisition'); + + const blockedA = await harness.limits.acquire({ + key, + holderId: blockedHolderId, + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + const blockedB = await harness.limits.acquire({ + key, + holderId: blockedHolderId, + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + + expect(blockedA.status).toBe('blocked'); + expect(blockedB.status).toBe('blocked'); + + const blockedState = await harness.inspectKeyState(key); + expect( + blockedState.waiterHolderIds.filter( + (holderId) => holderId === blockedHolderId + ) + ).toHaveLength(1); + expect( + blockedState.leaseHolderIds.filter( + (holderId) => holderId === blockedHolderId + ) + ).toHaveLength(0); + + await harness.limits.release({ + leaseId: first.lease.leaseId, + holderId: first.lease.holderId, + key: first.lease.key, + }); + + const acquired = await harness.limits.acquire({ + key, + holderId: blockedHolderId, + definition: { concurrency: { max: 1 } }, + leaseTtlMs: 1_000, + }); + expect(acquired.status).toBe('acquired'); + if (acquired.status !== 'acquired') + throw new Error('expected replayed holder acquisition'); + + const acquiredState = await harness.inspectKeyState(key); + expect( + acquiredState.waiterHolderIds.filter( + (holderId) => holderId === blockedHolderId + ) + ).toHaveLength(0); + expect( + acquiredState.leaseHolderIds.filter( + (holderId) => holderId === blockedHolderId + ) + ).toHaveLength(1); + } finally { + await harness.close?.(); + } + }); }); } diff --git 
a/packages/world-testing/src/limits-runtime.ts b/packages/world-testing/src/limits-runtime.ts index 60bfe5789d..4627023ba9 100644 --- a/packages/world-testing/src/limits-runtime.ts +++ b/packages/world-testing/src/limits-runtime.ts @@ -9,6 +9,7 @@ type WorkflowLockContentionResult = { type StepLockNoRetriesResult = { label: string; + key?: string; attempt: number; acquiredAt: number; releasedAt: number; @@ -27,6 +28,32 @@ type WorkflowRateLimitResult = { periodMs: number; }; +type WorkflowLeakedLockResult = { + label: string; + key: string; + leaseTtlMs: number; + workflowLockAcquiredAt: number; + workflowCompletedAt: number; +}; + +type StepLeakedLockResult = { + label: string; + key: string; + leaseTtlMs: number; + stepLockAcquiredAt: number; + workflowCompletedAt: number; +}; + +type MidStepLockResult = { + label: string; + key: string; + attempt: number; + lockAcquiredAt: number; + preLockEffects: number; + postLockEffects: number; + trace: string[]; +}; + export interface LimitsRuntimeHarness { runWorkflowWithWorkflowAndStepLocks(userId: string): Promise<{ workflowKey: string; @@ -48,7 +75,15 @@ export interface LimitsRuntimeHarness { userId: string, holdMs: number ): Promise<[WorkflowOnlyLockResult, WorkflowOnlyLockResult]>; - runWorkflowRateLimitContention( + runWorkflowExpiredLeaseRecovery( + userId: string, + leaseTtlMs: number + ): Promise<[WorkflowLeakedLockResult, WorkflowOnlyLockResult]>; + runStepExpiredLeaseRecovery( + userId: string, + leaseTtlMs: number + ): Promise<[StepLeakedLockResult, StepLockNoRetriesResult]>; + runWorkflowMixedLimitContention( userId: string, holdMs: number, periodMs: number @@ -70,6 +105,18 @@ export interface LimitsRuntimeHarness { runIndependentWorkflowKeys( holdMs: number ): Promise<[WorkflowOnlyLockResult, WorkflowOnlyLockResult]>; + runIndependentStepKeys( + holdMs: number + ): Promise<[StepLockNoRetriesResult, StepLockNoRetriesResult]>; + runBlockedWaiterWithUnrelatedWorkflow(holdMs: number): Promise<{ + holder: 
WorkflowOnlyLockResult; + waiter: WorkflowOnlyLockResult; + unrelated: WorkflowOnlyLockResult; + }>; + runMidStepLockContract(holdMs: number): Promise<{ + holder: StepLockNoRetriesResult; + waiter: MidStepLockResult; + }>; } export function createLimitsRuntimeSuite( @@ -156,11 +203,43 @@ export function createLimitsRuntimeSuite( ).toBeLessThan(4_000); }); - it('wakes rate-limited waiters only after the rate window expires', async () => { + it('reclaims expired leaked workflow leases without manual cleanup', async () => { + const harness = await createHarness(); + const leaseTtlMs = 1_250; + const [resultA, resultB] = await harness.runWorkflowExpiredLeaseRecovery( + 'expired-workflow-user', + leaseTtlMs + ); + + expect(resultB.workflowLockAcquiredAt).toBeGreaterThanOrEqual( + resultA.workflowCompletedAt + ); + expect( + resultB.workflowLockAcquiredAt - resultA.workflowLockAcquiredAt + ).toBeGreaterThanOrEqual(leaseTtlMs - 100); + }); + + it('reclaims expired leaked step leases without manual cleanup', async () => { + const harness = await createHarness(); + const leaseTtlMs = 1_250; + const [resultA, resultB] = await harness.runStepExpiredLeaseRecovery( + 'expired-step-user', + leaseTtlMs + ); + + expect(resultB.acquiredAt).toBeGreaterThanOrEqual( + resultA.workflowCompletedAt + ); + expect( + resultB.acquiredAt - resultA.stepLockAcquiredAt + ).toBeGreaterThanOrEqual(leaseTtlMs - 100); + }); + + it('keeps mixed concurrency and rate waiters blocked until the rate window expires', async () => { const harness = await createHarness(); const holdMs = 250; const periodMs = 1_500; - const [resultA, resultB] = await harness.runWorkflowRateLimitContention( + const [resultA, resultB] = await harness.runWorkflowMixedLimitContention( 'shared-user', holdMs, periodMs @@ -214,5 +293,43 @@ export function createLimitsRuntimeSuite( resultA.workflowLockReleasedAt ); }); + + it('does not block unrelated step keys', async () => { + const harness = await createHarness(); + const 
[resultA, resultB] = await harness.runIndependentStepKeys(1_000); + + expect(resultB.acquiredAt).toBeLessThan(resultA.releasedAt); + }); + + it.skipIf(process.env.WORKFLOW_LIMITS_LOW_CONCURRENCY !== '1')( + 'frees worker slots for unrelated workflows while a waiter is blocked', + async () => { + const harness = await createHarness(); + const { holder, waiter, unrelated } = + await harness.runBlockedWaiterWithUnrelatedWorkflow(1_500); + + expect(waiter.workflowLockAcquiredAt).toBeGreaterThanOrEqual( + holder.workflowLockReleasedAt + ); + expect(unrelated.workflowLockReleasedAt).toBeLessThan( + waiter.workflowLockAcquiredAt + ); + } + ); + + it('replays a mid-step lock at the acquire boundary without duplicating post-lock effects', async () => { + const harness = await createHarness(); + const { holder, waiter } = await harness.runMidStepLockContract(1_500); + + expect(waiter.lockAcquiredAt).toBeGreaterThanOrEqual(holder.releasedAt); + expect(waiter.preLockEffects).toBe(2); + expect(waiter.postLockEffects).toBe(1); + expect(waiter.trace.map((event) => event.split(':')[0])).toEqual([ + 'pre', + 'pre', + 'lock', + 'post', + ]); + }); }); } diff --git a/packages/world/FLOW_LIMITS.md b/packages/world/FLOW_LIMITS.md index 565e258d71..07cc69168f 100644 --- a/packages/world/FLOW_LIMITS.md +++ b/packages/world/FLOW_LIMITS.md @@ -150,20 +150,22 @@ workflow scope, even though the workflow may suspend and resume many times. 
The current behavior is: -- declare the limit at the top of the step +- declare the limit at the top of the step when possible - the runtime treats a blocked acquisition as step-boundary admission failure - the step does not keep executing user code while waiting for capacity - the step is re-queued and retried after promotion or timeout - lease is disposed automatically when the step attempt completes -Important caveat: +If `lock()` is called in the middle of a step, the intended contract is: -- zero-attempt semantics are only guaranteed when `lock()` is used as a top-of-step admission gate -- calling `lock()` after side effects or meaningful user work is unsupported/best-effort +- the current attempt stops at the blocked `lock()` call +- the step is deferred and re-queued rather than polling in-process +- code before the blocked `lock()` may replay on the next attempt +- code after the `lock()` runs only after the lock is actually acquired -This means step `lock()` is conceptually the same API, but it is not a literal -"spin inside already-running user step code until capacity appears" -implementation. +This means zero-attempt semantics are still strongest when `lock()` is used as +a top-of-step admission gate, but mid-step `lock()` is now part of the shared +runtime contract rather than unsupported behavior. ### 6. `await using` is the preferred user-facing shape @@ -200,9 +202,8 @@ The lease must not be disposed merely because one host process invocation ends. 
Current preferred model: - workflow-level limits may be held by a run -- step-level limits are acquired only at step boundaries +- blocked step-level limits return control to the runtime at the step boundary - step-level limits are short-lived -- step code should not acquire additional locks dynamically - step execution should not wait on workflow-level locks This keeps the dependency direction one-way: @@ -280,7 +281,8 @@ More concretely: - if a workflow is parked waiting for a step-level limit, it still counts as active for its workflow-level lock - a step-level lock should conceptually be an admission gate for the step - attempt, not a second workflow-level lock + attempt, not a second workflow-level lock, even when the `lock()` call + appears in the middle of user code - step-level rate limits should consume rate capacity when the step starts, and that rate usage should remain counted until the window expires even if the step releases its lease quickly diff --git a/workbench/example/workflows/99_e2e.ts b/workbench/example/workflows/99_e2e.ts index 0d3db80e83..338dc863f3 100644 --- a/workbench/example/workflows/99_e2e.ts +++ b/workbench/example/workflows/99_e2e.ts @@ -261,15 +261,75 @@ export async function workflowWithWorkflowAndStepLocks(userId = 'user-123') { }; } -async function serializedLimitStep(label: string, holdMs: number) { +type LimitTraceState = { + events: string[]; +}; + +function sanitizeLimitTraceToken(traceToken: string) { + return traceToken.replace(/[^a-zA-Z0-9_-]/g, '_'); +} + +async function getLimitTracePath(traceToken: string) { + const path = await import('node:path'); + return path.join( + process.cwd(), + '.workflow-e2e', + `limits-${sanitizeLimitTraceToken(traceToken)}.json` + ); +} + +async function readLimitTraceState( + traceToken: string +): Promise { + const { mkdir, readFile } = await import('node:fs/promises'); + const path = await import('node:path'); + const tracePath = await getLimitTracePath(traceToken); + await 
mkdir(path.dirname(tracePath), { recursive: true }); + + try { + return JSON.parse(await readFile(tracePath, 'utf8')) as LimitTraceState; + } catch (error) { + if ((error as NodeJS.ErrnoException).code === 'ENOENT') { + return { events: [] }; + } + throw error; + } +} + +async function writeLimitTraceState( + traceToken: string, + state: LimitTraceState +) { + const { mkdir, writeFile } = await import('node:fs/promises'); + const path = await import('node:path'); + const tracePath = await getLimitTracePath(traceToken); + await mkdir(path.dirname(tracePath), { recursive: true }); + await writeFile(tracePath, JSON.stringify(state), 'utf8'); +} + +async function appendLimitTraceEvent(traceToken: string, event: string) { + const state = await readLimitTraceState(traceToken); + const nextState = { + events: [...state.events, event], + }; + await writeLimitTraceState(traceToken, nextState); + return nextState.events; +} + +async function serializedLimitStep( + label: string, + holdMs: number, + key = 'step:db:serialized' +) { 'use step'; const stepLock = await lock({ - key: 'step:db:serialized', + key, concurrency: { max: 1 }, leaseTtlMs: holdMs + 5_000, }); + const metadata = getStepMetadata(); const acquiredAt = Date.now(); await new Promise((resolve) => setTimeout(resolve, holdMs)); await stepLock.dispose(); @@ -277,6 +337,8 @@ async function serializedLimitStep(label: string, holdMs: number) { return { label, + key, + attempt: metadata.attempt, acquiredAt, releasedAt, }; @@ -308,11 +370,15 @@ export async function workflowLockContentionWorkflow( }; } -async function stepLockNoRetriesStep(label: string, holdMs: number) { +async function stepLockNoRetriesStep( + label: string, + holdMs: number, + key = 'step:db:no-retries' +) { 'use step'; await using _stepLock = await lock({ - key: 'step:db:no-retries', + key, concurrency: { max: 1 }, leaseTtlMs: holdMs + 5_000, }); @@ -324,6 +390,7 @@ async function stepLockNoRetriesStep(label: string, holdMs: number) { return { 
label, + key, attempt: metadata.attempt, acquiredAt, releasedAt, @@ -341,6 +408,16 @@ export async function stepLockNoRetriesContentionWorkflow( return await stepLockNoRetriesStep(label, holdMs); } +export async function stepKeyLockContentionWorkflow( + key = 'step:db:key-contention', + holdMs = 750, + label = key +) { + 'use workflow'; + + return await stepLockNoRetriesStep(label, holdMs, key); +} + ////////////////////////////////////////////////////////// export async function workflowOnlyLockContentionWorkflow( @@ -368,6 +445,69 @@ export async function workflowOnlyLockContentionWorkflow( }; } +export async function workflowLeakedLockWorkflow( + userId = 'user-123', + leaseTtlMs = 1_250, + label = userId +) { + 'use workflow'; + + const leakedWorkflowLock = await lock({ + key: `workflow:user:${userId}`, + concurrency: { max: 1 }, + leaseTtlMs, + }); + + const workflowLockAcquiredAt = Date.now(); + + return { + label, + userId, + key: leakedWorkflowLock.key, + leaseTtlMs, + leakedLeaseId: leakedWorkflowLock.leaseId, + workflowLockAcquiredAt, + workflowCompletedAt: Date.now(), + }; +} + +async function leakedStepLockStep( + key: string, + leaseTtlMs: number, + label: string +) { + 'use step'; + + const leakedStepLock = await lock({ + key, + concurrency: { max: 1 }, + leaseTtlMs, + }); + + return { + label, + key, + leaseTtlMs, + leakedLeaseId: leakedStepLock.leaseId, + stepLockAcquiredAt: Date.now(), + workflowCompletedAt: Date.now(), + }; +} + +export async function stepLeakedLockWorkflow( + userId = 'user-123', + leaseTtlMs = 1_250, + label = userId +) { + 'use workflow'; + + return await leakedStepLockStep( + `step:db:expired:${userId}`, + leaseTtlMs, + label + ); +} + export async function workflowRateLimitContentionWorkflow( userId = 'user-123', holdMs = 250, @@ -395,6 +535,72 @@ export async function workflowRateLimitContentionWorkflow( }; } +export async function workflowMixedLimitContentionWorkflow( + userId = 'user-123', + holdMs = 250, + periodMs = 
1_500, + label = userId +) { + 'use workflow'; + + await using _mixedLimit = await lock({ + key: `workflow:mixed:${userId}`, + concurrency: { max: 1 }, + rate: { count: 1, periodMs }, + leaseTtlMs: periodMs + 5_000, + }); + + const workflowRateAcquiredAt = Date.now(); + await sleep(holdMs); + const workflowRateReleasedAt = Date.now(); + + return { + label, + userId, + periodMs, + workflowRateAcquiredAt, + workflowRateReleasedAt, + }; +} + +async function midStepLockStep(key: string, traceToken: string, label: string) { + 'use step'; + + const { attempt } = getStepMetadata(); + await appendLimitTraceEvent(traceToken, `pre:${attempt}`); + + await using _midStepLock = await lock({ + key, + concurrency: { max: 1 }, + leaseTtlMs: 5_000, + }); + + const lockAcquiredAt = Date.now(); + await appendLimitTraceEvent(traceToken, `lock:${attempt}`); + const trace = await appendLimitTraceEvent(traceToken, `post:${attempt}`); + + return { + label, + key, + attempt, + lockAcquiredAt, + preLockEffects: trace.filter((event) => event.startsWith('pre:')).length, + postLockEffects: trace.filter((event) => event.startsWith('post:')).length, + trace, + }; +} +midStepLockStep.maxRetries = 0; + +export async function midStepLockContentionWorkflow( + key = 'step:db:mid-step', + traceToken = 'mid-step', + label = key +) { + 'use workflow'; + + return await midStepLockStep(key, traceToken, label); +} + ////////////////////////////////////////////////////////// async function nullByteStep() { From eabe5ef14e264419107565fcca1ad220e06628cc Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Thu, 19 Mar 2026 16:15:07 -0400 Subject: [PATCH 14/16] fixed type error Signed-off-by: nathancolosimo --- packages/world-testing/src/limits-contract.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/world-testing/src/limits-contract.ts b/packages/world-testing/src/limits-contract.ts index dba6f7d291..515bbf7fc4 100644 --- a/packages/world-testing/src/limits-contract.ts +++ 
b/packages/world-testing/src/limits-contract.ts @@ -209,7 +209,7 @@ export function createLimitsContractSuite( }); const deadline = Date.now() + periodMs + 1_000; while (third.status === 'blocked' && Date.now() < deadline) { - await sleep(Math.max(25, third.retryAfterMs) + 50); + await sleep(Math.max(25, third.retryAfterMs ?? 0) + 50); third = await harness.limits.acquire({ key: 'step:provider:openai', holderId: 'holder-c', From b8480d394d9f7d2883f380969feb488fcad16d7e Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Thu, 19 Mar 2026 17:45:21 -0400 Subject: [PATCH 15/16] fix ci issues Signed-off-by: nathancolosimo --- workbench/example/tsconfig.json | 4 ++-- workbench/example/workflows/99_e2e.ts | 8 ++++---- workbench/example/workflows/serde-steps.ts | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/workbench/example/tsconfig.json b/workbench/example/tsconfig.json index 4e131954f0..58fb97394f 100644 --- a/workbench/example/tsconfig.json +++ b/workbench/example/tsconfig.json @@ -1,7 +1,7 @@ { "compilerOptions": { "target": "es2022", - "module": "NodeNext", + "module": "esnext", "lib": ["dom", "dom.iterable", "esnext"], "baseUrl": ".", "allowJs": true, @@ -9,7 +9,7 @@ "strict": true, "noEmit": true, "esModuleInterop": true, - "moduleResolution": "NodeNext", + "moduleResolution": "bundler", "resolveJsonModule": true, "isolatedModules": true, "jsx": "preserve", diff --git a/workbench/example/workflows/99_e2e.ts b/workbench/example/workflows/99_e2e.ts index 338dc863f3..b85e49cf3e 100644 --- a/workbench/example/workflows/99_e2e.ts +++ b/workbench/example/workflows/99_e2e.ts @@ -1,6 +1,6 @@ // Test path alias resolution - imports a helper from outside the workbench directory /** biome-ignore-all lint/complexity/noStaticOnlyClass: */ -import { pathsAliasHelper } from '@repo/lib/steps/paths-alias-test.js'; +import { pathsAliasHelper } from '@repo/lib/steps/paths-alias-test'; import { createHook, createWebhook, @@ -15,8 +15,8 @@ import { sleep, } from 
'workflow'; import { getRun, start } from 'workflow/api'; -import { importedStepOnly } from './_imported_step_only.js'; -import { callThrower, stepThatThrowsFromHelper } from './helpers.js'; +import { importedStepOnly } from './_imported_step_only'; +import { callThrower, stepThatThrowsFromHelper } from './helpers'; ////////////////////////////////////////////////////////// @@ -1538,7 +1538,7 @@ import { createVector, scaleVector, sumVectors, -} from './serde-steps.js'; +} from './serde-steps'; /** * Workflow that tests cross-context class registration. diff --git a/workbench/example/workflows/serde-steps.ts b/workbench/example/workflows/serde-steps.ts index 9726bbe6c0..227de88399 100644 --- a/workbench/example/workflows/serde-steps.ts +++ b/workbench/example/workflows/serde-steps.ts @@ -6,7 +6,7 @@ * step calls. This tests cross-context class registration. */ -import { Vector } from './serde-models.js'; +import { Vector } from './serde-models'; /** * Step that receives a Vector and scales it. 
From a6b603a0d940adffe20c332619779a9b38f49870 Mon Sep 17 00:00:00 2001 From: nathancolosimo Date: Fri, 20 Mar 2026 19:37:16 -0400 Subject: [PATCH 16/16] Removed step lock and added lock index Signed-off-by: nathancolosimo --- packages/core/e2e/e2e.test.ts | 67 +- .../async-deserialization-ordering.test.ts | 1 + .../core/src/hook-sleep-interaction.test.ts | 1 + packages/core/src/lock.test.ts | 40 +- packages/core/src/lock.ts | 19 +- packages/core/src/private.ts | 1 + .../core/src/runtime/step-handler.test.ts | 115 +-- packages/core/src/runtime/step-handler.ts | 234 +----- packages/core/src/step.test.ts | 37 +- packages/core/src/step.ts | 5 - packages/core/src/step/context-storage.ts | 3 - packages/core/src/step/lock.ts | 103 --- packages/core/src/symbols.ts | 1 - packages/core/src/workflow.ts | 1 + packages/core/src/workflow/hook.test.ts | 1 + packages/core/src/workflow/lock.ts | 34 +- packages/core/src/workflow/sleep.test.ts | 1 + packages/world-local/src/limits.test.ts | 12 +- packages/world-local/src/limits.ts | 151 ++-- packages/world-local/src/storage.test.ts | 77 -- .../world-local/src/storage/events-storage.ts | 32 +- packages/world-postgres/README.md | 2 +- packages/world-postgres/src/limits.test.ts | 23 +- packages/world-postgres/src/limits.ts | 171 +--- packages/world-postgres/src/storage.ts | 59 +- packages/world-postgres/test/storage.test.ts | 116 --- packages/world-testing/src/limits-contract.ts | 795 +++++++++--------- packages/world-testing/src/limits-runtime.ts | 129 ++- packages/world-vercel/src/limits.test.ts | 44 +- packages/world/FLOW_LIMITS.md | 136 ++- packages/world/src/events.ts | 17 - packages/world/src/index.ts | 4 + packages/world/src/limits.ts | 41 +- workbench/example/workflows/99_e2e.ts | 257 ++---- 34 files changed, 904 insertions(+), 1826 deletions(-) delete mode 100644 packages/core/src/step/lock.ts diff --git a/packages/core/e2e/e2e.test.ts b/packages/core/e2e/e2e.test.ts index d20f73e037..77a5960231 100644 --- 
a/packages/core/e2e/e2e.test.ts +++ b/packages/core/e2e/e2e.test.ts @@ -249,11 +249,10 @@ describe('e2e', () => { createLimitsRuntimeSuite( `limits runtime (${isPostgresWorld ? 'postgres' : 'local'})`, async () => ({ - async runWorkflowWithWorkflowAndStepLocks(userId) { - const run = await start( - await e2e('workflowWithWorkflowAndStepLocks'), - [userId] - ); + async runWorkflowWithScopedLocks(userId) { + const run = await start(await e2e('workflowWithScopedLocks'), [ + userId, + ]); return await run.returnValue; }, async runWorkflowLockContention(userId, holdMs) { @@ -263,18 +262,17 @@ describe('e2e', () => { const runB = await start(workflow, [userId, holdMs]); return await Promise.all([runA.returnValue, runB.returnValue]); }, - async runStepLockNoRetriesContention(userId, holdMs) { - const workflow = await e2e('stepLockNoRetriesContentionWorkflow'); - const runA = await start(workflow, [userId, holdMs, 'A']); - await sleep(100); - const runB = await start(workflow, [userId, holdMs, 'B']); + async runLockedStepCallContention( + key, + holdMs, + labelA = 'A', + labelB = 'B' + ) { + const workflow = await e2e('lockedStepCallContentionWorkflow'); + const runA = await start(workflow, [key, holdMs, labelA]); await sleep(100); - const runC = await start(workflow, [userId, holdMs, 'C']); - return await Promise.all([ - runA.returnValue, - runB.returnValue, - runC.returnValue, - ]); + const runB = await start(workflow, [key, holdMs, labelB]); + return await Promise.all([runA.returnValue, runB.returnValue]); }, async runWorkflowLockAcrossSuspension(userId, holdMs) { const workflow = await e2e('workflowOnlyLockContentionWorkflow'); @@ -298,9 +296,9 @@ describe('e2e', () => { const waiterResult = await waiterRun.returnValue; return [leakedResult, waiterResult]; }, - async runStepExpiredLeaseRecovery(userId, leaseTtlMs) { - const leakedWorkflow = await e2e('stepLeakedLockWorkflow'); - const waiterWorkflow = await e2e('stepKeyLockContentionWorkflow'); + async 
runLeakedKeyExpiredLeaseRecovery(userId, leaseTtlMs) { + const leakedWorkflow = await e2e('leakedKeyLockWorkflow'); + const waiterWorkflow = await e2e('lockedStepCallContentionWorkflow'); const leakedRun = await start(leakedWorkflow, [ userId, leaseTtlMs, @@ -358,7 +356,7 @@ describe('e2e', () => { return await Promise.all([runA.returnValue, runB.returnValue]); }, async runIndependentStepKeys(holdMs) { - const workflow = await e2e('stepKeyLockContentionWorkflow'); + const workflow = await e2e('lockedStepCallContentionWorkflow'); const runA = await start(workflow, [ 'step:db:isolation:a', holdMs, @@ -399,31 +397,10 @@ describe('e2e', () => { ]); return { holder, waiter, unrelated }; }, - async runMidStepLockContract(holdMs) { - const holderWorkflow = await e2e('stepKeyLockContentionWorkflow'); - const waiterWorkflow = await e2e('midStepLockContentionWorkflow'); - const traceToken = `mid-step-${Date.now()}-${Math.random() - .toString(36) - .slice(2)}`; - const key = `step:db:mid-step:${traceToken}`; - - const holderRun = await start(holderWorkflow, [ - key, - holdMs, - 'holder', - ]); - await sleep(100); - const waiterRun = await start(waiterWorkflow, [ - key, - traceToken, - 'waiter', - ]); - - const [holder, waiter] = await Promise.all([ - holderRun.returnValue, - waiterRun.returnValue, - ]); - return { holder, waiter }; + async runWorkflowSingleLockAcrossMultipleSteps(holdMs) { + const workflow = await e2e('singleLockAcrossMultipleStepsWorkflow'); + const run = await start(workflow, ['step:db:batch', holdMs]); + return await run.returnValue; }, }) ); diff --git a/packages/core/src/async-deserialization-ordering.test.ts b/packages/core/src/async-deserialization-ordering.test.ts index 0774b7d9d8..463a661ec0 100644 --- a/packages/core/src/async-deserialization-ordering.test.ts +++ b/packages/core/src/async-deserialization-ordering.test.ts @@ -36,6 +36,7 @@ function setupWorkflowContext(events: Event[]): WorkflowOrchestratorContext { onUnconsumedEvent: () => {}, 
getPromiseQueue: () => Promise.resolve(), }), + nextLockIndex: 0, invocationsQueue: new Map(), generateUlid: () => ulid(workflowStartedAt), generateNanoid: nanoid.customRandom(nanoid.urlAlphabet, 21, (size) => diff --git a/packages/core/src/hook-sleep-interaction.test.ts b/packages/core/src/hook-sleep-interaction.test.ts index a706628b81..9ec1bca88d 100644 --- a/packages/core/src/hook-sleep-interaction.test.ts +++ b/packages/core/src/hook-sleep-interaction.test.ts @@ -42,6 +42,7 @@ function setupWorkflowContext(events: Event[]): WorkflowOrchestratorContext { onUnconsumedEvent: () => {}, getPromiseQueue: () => promiseQueueHolder.current, }), + nextLockIndex: 0, invocationsQueue: new Map(), generateUlid: () => ulid(workflowStartedAt), generateNanoid: nanoid.customRandom(nanoid.urlAlphabet, 21, (size) => diff --git a/packages/core/src/lock.test.ts b/packages/core/src/lock.test.ts index c9237066e3..9cc1e2dcee 100644 --- a/packages/core/src/lock.test.ts +++ b/packages/core/src/lock.test.ts @@ -1,10 +1,14 @@ import { afterEach, describe, expect, it, vi } from 'vitest'; -import { lock, LIMITS_NOT_IMPLEMENTED_MESSAGE } from './lock.js'; -import { STEP_LOCK, WORKFLOW_LOCK } from './symbols.js'; +import { + lock, + LIMITS_NOT_IMPLEMENTED_MESSAGE, + LOCK_WORKFLOW_ONLY_MESSAGE, +} from './lock.js'; +import { contextStorage } from './step/context-storage.js'; +import { WORKFLOW_LOCK } from './symbols.js'; afterEach(() => { delete (globalThis as any)[WORKFLOW_LOCK]; - delete (globalThis as any)[STEP_LOCK]; }); describe('lock', () => { @@ -20,9 +24,7 @@ describe('lock', () => { it('prefers the workflow runtime lock when both runtimes are present', async () => { const workflowHandle = { leaseId: 'lease_workflow' }; const workflowLock = vi.fn().mockResolvedValue(workflowHandle); - const stepLock = vi.fn().mockResolvedValue({ leaseId: 'lease_step' }); (globalThis as any)[WORKFLOW_LOCK] = workflowLock; - (globalThis as any)[STEP_LOCK] = stepLock; const options = { key: 
'workflow:user:test', concurrency: { max: 1 }, @@ -30,19 +32,33 @@ describe('lock', () => { await expect(lock(options)).resolves.toBe(workflowHandle); expect(workflowLock).toHaveBeenCalledWith(options); - expect(stepLock).not.toHaveBeenCalled(); }); - it('falls back to the step runtime lock when no workflow runtime is present', async () => { - const handle = { leaseId: 'lease_step' }; - const stepLock = vi.fn().mockResolvedValue(handle); - (globalThis as any)[STEP_LOCK] = stepLock; + it('throws a workflow-only error when called inside a step context', async () => { const options = { key: 'step:db:cheap', concurrency: { max: 2 }, }; - await expect(lock(options)).resolves.toBe(handle); - expect(stepLock).toHaveBeenCalledWith(options); + await expect( + contextStorage.run( + { + stepMetadata: { + stepId: 'step_test', + stepName: 'testStep', + stepStartedAt: new Date(), + attempt: 1, + }, + workflowMetadata: { + workflowName: 'testWorkflow', + workflowRunId: 'wrun_test', + workflowStartedAt: new Date(), + url: 'http://localhost:3000', + }, + ops: [], + }, + () => lock(options) + ) + ).rejects.toThrow(LOCK_WORKFLOW_ONLY_MESSAGE); }); }); diff --git a/packages/core/src/lock.ts b/packages/core/src/lock.ts index 11829957d0..9791c39e13 100644 --- a/packages/core/src/lock.ts +++ b/packages/core/src/lock.ts @@ -4,10 +4,14 @@ import { type LimitKey, type LimitLease, } from '@workflow/world'; -import { STEP_LOCK, WORKFLOW_LOCK } from './symbols.js'; +import { contextStorage } from './step/context-storage.js'; +import { WORKFLOW_LOCK } from './symbols.js'; export { LIMITS_NOT_IMPLEMENTED_MESSAGE } from '@workflow/world'; +export const LOCK_WORKFLOW_ONLY_MESSAGE = + '`lock()` is only supported in workflow functions. Wrap the step call with `await using` in workflow code.'; + /** * Reserved first-pass user-facing API for future flow concurrency and rate * limiting inside workflow functions. 
@@ -21,7 +25,10 @@ export interface LockOptions extends LimitDefinition { * Reserved handle shape for future lock acquisition. */ export interface LockHandle - extends Pick { + extends Pick< + LimitLease, + 'leaseId' | 'key' | 'lockId' | 'runId' | 'lockIndex' | 'expiresAt' + > { dispose(): Promise; heartbeat(ttlMs?: number): Promise; [Symbol.asyncDispose](): Promise; @@ -39,12 +46,8 @@ export async function lock(options: LockOptions): Promise { return workflowLock(options); } - const stepLock = (globalThis as any)[STEP_LOCK] as - | ((options: LockOptions) => Promise) - | undefined; - - if (stepLock) { - return stepLock(options); + if (contextStorage.getStore()) { + throw new Error(LOCK_WORKFLOW_ONLY_MESSAGE); } throw createLimitsNotImplementedError(); diff --git a/packages/core/src/private.ts b/packages/core/src/private.ts index 0eabc7b70f..ac827aae05 100644 --- a/packages/core/src/private.ts +++ b/packages/core/src/private.ts @@ -93,6 +93,7 @@ export interface WorkflowOrchestratorContext { encryptionKey: CryptoKey | undefined; globalThis: typeof globalThis; eventsConsumer: EventsConsumer; + nextLockIndex: number; /** * Map of pending invocations keyed by correlationId. * Using Map instead of Array for O(1) lookup/delete operations. 
diff --git a/packages/core/src/runtime/step-handler.test.ts b/packages/core/src/runtime/step-handler.test.ts index 1951e7a162..3c7aae614f 100644 --- a/packages/core/src/runtime/step-handler.test.ts +++ b/packages/core/src/runtime/step-handler.test.ts @@ -1,6 +1,5 @@ import { EntityConflictError, WorkflowWorldError } from '@workflow/errors'; import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; -import { StepLockBlockedError } from '../step/lock.js'; // Use vi.hoisted so these are available in mock factories const { @@ -287,116 +286,16 @@ describe('step-handler 409 handling', () => { mockStepFn.mockResolvedValue('step-result'); }); - it('returns a timeout when a step lock is blocked before user code can proceed', async () => { - mockEventsCreate.mockResolvedValue({ - step: { - stepId: 'step_abc', - status: 'running', - attempt: 1, - startedAt: new Date(), - input: [], - }, - }); - mockStepFn.mockRejectedValue( - new StepLockBlockedError( - { - key: 'step:db:no-retries', - holderId: 'stplock_wrun_test123:step_abc:0', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 5_000, - }, - 2_500 - ) - ); - - const result = await capturedHandler( - createMessage(), - createMetadata('myStep') - ); - - expect(result).toEqual({ timeoutSeconds: 3 }); - expect(mockQueueMessage).not.toHaveBeenCalled(); - expect(mockEventsCreate).toHaveBeenCalledTimes(2); - expect(mockEventsCreate).toHaveBeenNthCalledWith( - 1, - 'wrun_test123', - expect.objectContaining({ - eventType: 'step_started', - }), - expect.anything() - ); - expect(mockEventsCreate).toHaveBeenNthCalledWith( - 2, - 'wrun_test123', - expect.objectContaining({ - eventType: 'step_deferred', - correlationId: 'step_abc', - eventData: { - retryAfter: expect.any(Date), - lockRequest: expect.objectContaining({ - key: expect.any(String), - holderId: 'stplock_wrun_test123:step_abc:0', - }), - }, - }), - expect.anything() - ); + afterEach(() => { + vi.restoreAllMocks(); }); - it('rechecks a deferred lock before 
step_started and re-defers without running user code', async () => { - mockEventsListByCorrelationId.mockResolvedValue({ - data: [ - { - eventId: 'evnt_1', - runId: 'wrun_test123', - eventType: 'step_deferred', - correlationId: 'step_abc', - eventData: { - retryAfter: new Date(Date.now() - 1_000), - lockRequest: { - key: 'step:db:no-retries', - holderId: 'stplock_wrun_test123:step_abc:0', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 5_000, - }, - }, - createdAt: new Date(), - }, - ], - cursor: null, - hasMore: false, - }); - mockLimitsAcquire.mockResolvedValue({ - status: 'blocked', - reason: 'concurrency', - retryAfterMs: 2_500, - }); - - const result = await capturedHandler( - createMessage(), - createMetadata('myStep') - ); + it('does not call limits for ordinary step execution without lock()', async () => { + await capturedHandler(createMessage(), createMetadata('myStep')); - expect(result).toEqual({ timeoutSeconds: 3 }); - expect(mockStepFn).not.toHaveBeenCalled(); - expect(mockLimitsAcquire).toHaveBeenCalledWith({ - key: 'step:db:no-retries', - holderId: 'stplock_wrun_test123:step_abc:0', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 5_000, - }); - expect(mockEventsCreate).toHaveBeenCalledTimes(1); - expect(mockEventsCreate).toHaveBeenCalledWith( - 'wrun_test123', - expect.objectContaining({ - eventType: 'step_deferred', - }), - expect.anything() - ); - }); - - afterEach(() => { - vi.restoreAllMocks(); + expect(mockLimitsAcquire).not.toHaveBeenCalled(); + expect(mockLimitsHeartbeat).not.toHaveBeenCalled(); + expect(mockLimitsRelease).not.toHaveBeenCalled(); }); describe('step_completed 409', () => { diff --git a/packages/core/src/runtime/step-handler.ts b/packages/core/src/runtime/step-handler.ts index c0620e31d3..ec60d06b04 100644 --- a/packages/core/src/runtime/step-handler.ts +++ b/packages/core/src/runtime/step-handler.ts @@ -11,12 +11,7 @@ import { } from '@workflow/errors'; import { pluralize } from '@workflow/utils'; import { 
getPort } from '@workflow/utils/get-port'; -import { - LimitAcquireRequestSchema, - SPEC_VERSION_CURRENT, - StepInvokePayloadSchema, - type LimitLease, -} from '@workflow/world'; +import { SPEC_VERSION_CURRENT, StepInvokePayloadSchema } from '@workflow/world'; import { importKey } from '../encryption.js'; import { runtimeLogger, stepLogger } from '../logger.js'; import { getStepFunction } from '../private.js'; @@ -25,8 +20,6 @@ import { hydrateStepArguments, } from '../serialization.js'; import { contextStorage } from '../step/context-storage.js'; -import { createStepLock, StepLockBlockedError } from '../step/lock.js'; -import { STEP_LOCK } from '../symbols.js'; import * as Attribute from '../telemetry/semantic-conventions.js'; import { getSpanKind, @@ -52,68 +45,6 @@ import { getWorld, getWorldHandlers } from './world.js'; const DEFAULT_STEP_MAX_RETRIES = 3; -async function getDeferredStepLock( - world: ReturnType, - workflowRunId: string, - stepId: string -) { - let step: Awaited>; - try { - step = await world.steps.get(workflowRunId, stepId); - } catch (error) { - if ( - WorkflowWorldError.is(error) && - (error.status === 404 || error.message === `Step not found: ${stepId}`) - ) { - return null; - } - throw error; - } - if (step.status !== 'pending') { - return null; - } - - const result = await world.events.listByCorrelationId({ - correlationId: stepId, - pagination: { - limit: 1, - sortOrder: 'desc', - }, - }); - const latestEvent = result.data[0]; - - if ( - !latestEvent || - latestEvent.runId !== workflowRunId || - latestEvent.eventType !== 'step_deferred' || - !latestEvent.eventData.lockRequest - ) { - return null; - } - - return { - step, - lockRequest: LimitAcquireRequestSchema.parse( - latestEvent.eventData.lockRequest - ), - }; -} - -async function releaseUnusedPreAcquiredLocks( - world: ReturnType, - preAcquiredLocks: Record -) { - await Promise.all( - Object.values(preAcquiredLocks).map((lease) => - world.limits.release({ - leaseId: lease.leaseId, - 
key: lease.key, - holderId: lease.holderId, - }) - ) - ); -} - const stepHandler = getWorldHandlers().createQueueHandler( '__wkf_step_', async (message_, metadata) => { @@ -185,56 +116,6 @@ const stepHandler = getWorldHandlers().createQueueHandler( ...Attribute.StepTracePropagated(!!traceContext), }); - const preAcquiredLocks: Record = {}; - const deferredStepLock = await getDeferredStepLock( - world, - workflowRunId, - stepId - ); - if (deferredStepLock) { - const retryAfter = deferredStepLock.step.retryAfter; - if (retryAfter && retryAfter.getTime() > Date.now()) { - const timeoutSeconds = Math.max( - 1, - Math.ceil((retryAfter.getTime() - Date.now()) / 1000) - ); - span?.setAttributes({ - ...Attribute.StepRetryTimeoutSeconds(timeoutSeconds), - }); - return { timeoutSeconds }; - } - - const lockResult = await world.limits.acquire( - deferredStepLock.lockRequest - ); - if (lockResult.status === 'blocked') { - const retryAfterMs = Math.max(1, lockResult.retryAfterMs ?? 1000); - const timeoutSeconds = Math.max( - 1, - Math.ceil(retryAfterMs / 1000) - ); - await world.events.create( - workflowRunId, - { - eventType: 'step_deferred', - specVersion: SPEC_VERSION_CURRENT, - correlationId: stepId, - eventData: { - retryAfter: new Date(Date.now() + retryAfterMs), - lockRequest: deferredStepLock.lockRequest, - }, - }, - { requestId } - ); - span?.setAttributes({ - ...Attribute.StepRetryTimeoutSeconds(timeoutSeconds), - }); - return { timeoutSeconds }; - } - - preAcquiredLocks[lockResult.lease.holderId] = lockResult.lease; - } - // step_started validates state and returns the step entity, so no separate // world.steps.get() call is needed. 
The server checks: // - Step not in terminal state (returns 409) @@ -260,7 +141,6 @@ const stepHandler = getWorldHandlers().createQueueHandler( step = startResult.step; } catch (err) { if (ThrottleError.is(err)) { - await releaseUnusedPreAcquiredLocks(world, preAcquiredLocks); const retryRetryAfter = Math.max( 1, typeof err.retryAfter === 'number' ? err.retryAfter : 1 @@ -274,14 +154,12 @@ const stepHandler = getWorldHandlers().createQueueHandler( return { timeoutSeconds: retryRetryAfter }; } if (RunExpiredError.is(err)) { - await releaseUnusedPreAcquiredLocks(world, preAcquiredLocks); runtimeLogger.info( `Workflow run "${workflowRunId}" has already completed, skipping step "${stepId}": ${err.message}` ); return; } if (EntityConflictError.is(err)) { - await releaseUnusedPreAcquiredLocks(world, preAcquiredLocks); runtimeLogger.debug( 'Step in terminal state, re-enqueuing workflow', { @@ -311,7 +189,6 @@ const stepHandler = getWorldHandlers().createQueueHandler( // Too early: retryAfter timestamp not reached yet // Return timeout to queue so it retries later if (TooEarlyError.is(err)) { - await releaseUnusedPreAcquiredLocks(world, preAcquiredLocks); const retryAfter = err.retryAfter ?? 
new Date(Date.now() + 1000); const timeoutSeconds = Math.max( 1, @@ -334,7 +211,6 @@ const stepHandler = getWorldHandlers().createQueueHandler( }); return { timeoutSeconds }; } - await releaseUnusedPreAcquiredLocks(world, preAcquiredLocks); // Re-throw other errors throw err; } @@ -497,50 +373,35 @@ const stepHandler = getWorldHandlers().createQueueHandler( const executionStartTime = Date.now(); try { - const previousStepLock = (globalThis as any)[STEP_LOCK]; - (globalThis as any)[STEP_LOCK] = createStepLock(world); - result = await trace('step.execute', {}, async () => { - try { - return await contextStorage.run( - { - stepMetadata: { - stepName, - stepId, - stepStartedAt: new Date(+stepStartedAt), - attempt, - }, - workflowMetadata: { - workflowName, - workflowRunId, - workflowStartedAt: new Date(+workflowStartedAt), - // TODO: there should be a getUrl method on the world interface itself. This - // solution only works for vercel + local worlds. - url: isVercel - ? `https://${process.env.VERCEL_URL}` - : `http://localhost:${port ?? 3000}`, - }, - ops, - closureVars: hydratedInput.closureVars, - encryptionKey, - lockCounter: 0, - preAcquiredLocks, + return await contextStorage.run( + { + stepMetadata: { + stepName, + stepId, + stepStartedAt: new Date(+stepStartedAt), + attempt, }, - () => stepFn.apply(thisVal, args) - ); - } finally { - if (previousStepLock === undefined) { - delete (globalThis as any)[STEP_LOCK]; - } else { - (globalThis as any)[STEP_LOCK] = previousStepLock; - } - } + workflowMetadata: { + workflowName, + workflowRunId, + workflowStartedAt: new Date(+workflowStartedAt), + // TODO: there should be a getUrl method on the world interface itself. This + // solution only works for vercel + local worlds. + url: isVercel + ? `https://${process.env.VERCEL_URL}` + : `http://localhost:${port ?? 
3000}`, + }, + ops, + closureVars: hydratedInput.closureVars, + encryptionKey, + }, + () => stepFn.apply(thisVal, args) + ); }); } catch (err) { userCodeError = err; userCodeFailed = true; - } finally { - await releaseUnusedPreAcquiredLocks(world, preAcquiredLocks); } const executionTimeMs = Date.now() - executionStartTime; @@ -552,53 +413,6 @@ const stepHandler = getWorldHandlers().createQueueHandler( if (userCodeFailed) { const err = userCodeError; - if (StepLockBlockedError.is(err)) { - const retryAfterMs = Math.max(1, err.retryAfterMs ?? 1000); - const timeoutSeconds = Math.max( - 1, - Math.ceil(retryAfterMs / 1000) - ); - const retryAfter = new Date(Date.now() + retryAfterMs); - span?.setAttributes({ - ...Attribute.StepRetryTimeoutSeconds(timeoutSeconds), - }); - span?.addEvent?.('step.lock_blocked', { - 'retry.timeout_seconds': timeoutSeconds, - 'step.id': stepId, - 'step.name': stepName, - }); - try { - await world.events.create( - workflowRunId, - { - eventType: 'step_deferred', - specVersion: SPEC_VERSION_CURRENT, - correlationId: stepId, - eventData: { - retryAfter, - lockRequest: err.request, - }, - }, - { requestId } - ); - } catch (stepDeferredErr) { - if (EntityConflictError.is(stepDeferredErr)) { - runtimeLogger.info( - 'Tried deferring step, but step has already finished.', - { - workflowRunId, - stepId, - stepName, - message: stepDeferredErr.message, - } - ); - return; - } - throw stepDeferredErr; - } - return { timeoutSeconds }; - } - // Infrastructure errors that somehow surfaced through user code // should propagate to the queue handler for retry, not consume // step attempts. 
diff --git a/packages/core/src/step.test.ts b/packages/core/src/step.test.ts index 5a0e47af56..506def4361 100644 --- a/packages/core/src/step.test.ts +++ b/packages/core/src/step.test.ts @@ -26,6 +26,7 @@ function setupWorkflowContext(events: Event[]): WorkflowOrchestratorContext { onUnconsumedEvent: () => {}, getPromiseQueue: () => Promise.resolve(), }), + nextLockIndex: 0, invocationsQueue: new Map(), generateUlid: () => ulid(workflowStartedAt), // All generated ulids use the workflow's started at time generateNanoid: nanoid.customRandom(nanoid.urlAlphabet, 21, (size) => @@ -412,42 +413,6 @@ describe('createUseStep', () => { expect(ctx.invocationsQueue.size).toBe(1); }); - it('should consume step_deferred event and continue waiting', async () => { - const ctx = setupWorkflowContext([ - { - eventId: 'evnt_0', - runId: 'wrun_123', - eventType: 'step_deferred', - correlationId: 'step_01K11TFZ62YS0YYFDQ3E8B9YCV', - eventData: { - retryAfter: new Date(), - }, - createdAt: new Date(), - }, - ]); - - let workflowErrorReject: (err: Error) => void; - const workflowErrorPromise = new Promise((_, reject) => { - workflowErrorReject = reject; - }); - ctx.onWorkflowError = (err) => { - workflowErrorReject(err); - }; - - const useStep = createUseStep(ctx); - const add = useStep('add'); - - let error: Error | undefined; - try { - await Promise.race([add(1, 2), workflowErrorPromise]); - } catch (err_) { - error = err_ as Error; - } - - expect(error).toBeInstanceOf(WorkflowSuspension); - expect(ctx.invocationsQueue.size).toBe(1); - }); - it('should remove queue item when step_completed (terminal state)', async () => { const ctx = setupWorkflowContext([ { diff --git a/packages/core/src/step.ts b/packages/core/src/step.ts index 3cc9e59ce4..33e544d19e 100644 --- a/packages/core/src/step.ts +++ b/packages/core/src/step.ts @@ -112,11 +112,6 @@ export function createUseStep(ctx: WorkflowOrchestratorContext) { return EventConsumerResult.Consumed; } - if (event.eventType === 
'step_deferred') { - // Admission was blocked before user work could proceed, so keep waiting. - return EventConsumerResult.Consumed; - } - if (event.eventType === 'step_failed') { // Terminal state - we can remove the invocationQueue item ctx.invocationsQueue.delete(event.correlationId); diff --git a/packages/core/src/step/context-storage.ts b/packages/core/src/step/context-storage.ts index b63329dd20..2a9aa8b7e1 100644 --- a/packages/core/src/step/context-storage.ts +++ b/packages/core/src/step/context-storage.ts @@ -1,5 +1,4 @@ import { AsyncLocalStorage } from 'node:async_hooks'; -import type { LimitLease } from '@workflow/world'; import type { CryptoKey } from '../encryption.js'; import type { WorkflowMetadata } from '../workflow/get-workflow-metadata.js'; import type { StepMetadata } from './get-step-metadata.js'; @@ -10,6 +9,4 @@ export const contextStorage = /* @__PURE__ */ new AsyncLocalStorage<{ ops: Promise[]; closureVars?: Record; encryptionKey?: CryptoKey; - lockCounter: number; - preAcquiredLocks?: Record; }>(); diff --git a/packages/core/src/step/lock.ts b/packages/core/src/step/lock.ts deleted file mode 100644 index b537cc7503..0000000000 --- a/packages/core/src/step/lock.ts +++ /dev/null @@ -1,103 +0,0 @@ -import type { LimitAcquireRequest, LimitLease, World } from '@workflow/world'; -import type { LockHandle, LockOptions } from '../lock.js'; -import { contextStorage } from './context-storage.js'; - -export class StepLockBlockedError extends Error { - retryAfterMs?: number; - request: LimitAcquireRequest; - - constructor(request: LimitAcquireRequest, retryAfterMs?: number) { - super('Step lock blocked'); - this.name = 'StepLockBlockedError'; - this.retryAfterMs = retryAfterMs; - this.request = request; - } - - static is(value: unknown): value is StepLockBlockedError { - return value instanceof StepLockBlockedError; - } -} - -function createStepLockHandle(lease: LimitLease, world: World): LockHandle { - let currentLease = lease; - let disposed = 
false; - - const dispose = async () => { - if (disposed) return; - disposed = true; - await world.limits.release({ - leaseId: currentLease.leaseId, - key: currentLease.key, - holderId: currentLease.holderId, - }); - }; - - const heartbeat = async (ttlMs?: number) => { - currentLease = await world.limits.heartbeat({ - leaseId: currentLease.leaseId, - ttlMs, - }); - }; - - return { - get leaseId() { - return currentLease.leaseId; - }, - get key() { - return currentLease.key; - }, - get holderId() { - return currentLease.holderId; - }, - get expiresAt() { - return currentLease.expiresAt; - }, - dispose, - heartbeat, - [Symbol.asyncDispose]: dispose, - }; -} - -export function createStepLock(world: World) { - return async function lockInStep(options: LockOptions): Promise { - const store = contextStorage.getStore(); - if (!store) { - throw new Error( - '`lock()` can only be called inside a workflow or step function' - ); - } - - const lockIndex = store.lockCounter++; - const holderId = `stplock_${store.workflowMetadata.workflowRunId}:${store.stepMetadata.stepId}:${lockIndex}`; - const definition = { - concurrency: options.concurrency, - rate: options.rate, - }; - const request = { - key: options.key, - holderId, - definition, - leaseTtlMs: options.leaseTtlMs, - } satisfies LimitAcquireRequest; - - const preAcquiredLease = store.preAcquiredLocks?.[holderId]; - if (preAcquiredLease) { - if (store.preAcquiredLocks) { - delete store.preAcquiredLocks[holderId]; - } - return createStepLockHandle(preAcquiredLease, world); - } - - const result = await world.limits.acquire(request); - - if (result.status === 'acquired') { - return createStepLockHandle(result.lease, world); - } - - /* - Steps do not sit inside user code polling for a lease. - The runtime catches this and re-queues the step attempt at the boundary. 
- */ - throw new StepLockBlockedError(request, result.retryAfterMs); - }; -} diff --git a/packages/core/src/symbols.ts b/packages/core/src/symbols.ts index cd9616b17e..790f2fe46f 100644 --- a/packages/core/src/symbols.ts +++ b/packages/core/src/symbols.ts @@ -2,7 +2,6 @@ export const WORKFLOW_USE_STEP = Symbol.for('WORKFLOW_USE_STEP'); export const WORKFLOW_CREATE_HOOK = Symbol.for('WORKFLOW_CREATE_HOOK'); export const WORKFLOW_SLEEP = Symbol.for('WORKFLOW_SLEEP'); export const WORKFLOW_LOCK = Symbol.for('WORKFLOW_LOCK'); -export const STEP_LOCK = Symbol.for('STEP_LOCK'); export const WORKFLOW_CONTEXT = Symbol.for('WORKFLOW_CONTEXT'); export const WORKFLOW_GET_STREAM_ID = Symbol.for('WORKFLOW_GET_STREAM_ID'); export const STABLE_ULID = Symbol.for('WORKFLOW_STABLE_ULID'); diff --git a/packages/core/src/workflow.ts b/packages/core/src/workflow.ts index ece1823196..01883a0fee 100644 --- a/packages/core/src/workflow.ts +++ b/packages/core/src/workflow.ts @@ -139,6 +139,7 @@ export async function runWorkflow( globalThis: vmGlobalThis, onWorkflowError: workflowDiscontinuation.reject, eventsConsumer, + nextLockIndex: 0, generateUlid: () => ulid(+startedAt), generateNanoid, invocationsQueue: new Map(), diff --git a/packages/core/src/workflow/hook.test.ts b/packages/core/src/workflow/hook.test.ts index baa108cb03..ead1169ea3 100644 --- a/packages/core/src/workflow/hook.test.ts +++ b/packages/core/src/workflow/hook.test.ts @@ -28,6 +28,7 @@ function setupWorkflowContext(events: Event[]): WorkflowOrchestratorContext { onUnconsumedEvent: () => {}, getPromiseQueue: () => Promise.resolve(), }), + nextLockIndex: 0, invocationsQueue: new Map(), generateUlid: () => ulid(workflowStartedAt), generateNanoid: nanoid.customRandom(nanoid.urlAlphabet, 21, (size) => diff --git a/packages/core/src/workflow/lock.ts b/packages/core/src/workflow/lock.ts index f0905e06e9..2010c1fd61 100644 --- a/packages/core/src/workflow/lock.ts +++ b/packages/core/src/workflow/lock.ts @@ -1,18 +1,19 @@ import 
{ WorkflowSuspension } from '../global.js'; import type { LockHandle, LockOptions } from '../lock.js'; +import { createLockWakeCorrelationId, type LimitLease } from '@workflow/world'; import { scheduleWhenIdle, type WorkflowOrchestratorContext, } from '../private.js'; import { getWorld } from '../runtime/world.js'; +const DEFAULT_LOCK_LEASE_TTL_MS = 24 * 60 * 60 * 1000; + function createLockHandle( - lease: { - leaseId: string; - key: string; - holderId: string; - expiresAt?: Date; - }, + lease: Pick< + LimitLease, + 'leaseId' | 'key' | 'lockId' | 'runId' | 'lockIndex' | 'expiresAt' + >, ctx: WorkflowOrchestratorContext ): LockHandle { let currentLease = lease; @@ -24,7 +25,7 @@ function createLockHandle( await getWorld().limits.release({ leaseId: currentLease.leaseId, key: currentLease.key, - holderId: currentLease.holderId, + lockId: currentLease.lockId, }); }; @@ -42,8 +43,14 @@ function createLockHandle( get key() { return currentLease.key; }, - get holderId() { - return currentLease.holderId; + get lockId() { + return currentLease.lockId; + }, + get runId() { + return currentLease.runId; + }, + get lockIndex() { + return currentLease.lockIndex; }, get expiresAt() { return currentLease.expiresAt; @@ -68,8 +75,8 @@ export function createLock(ctx: WorkflowOrchestratorContext) { wait event. Postgres can wake this correlation id early when the waiter is promoted, and the delayed replay is just a fallback. 
*/ - const correlationId = `wflock_wait_${ctx.generateUlid()}`; - const holderId = `wflock_${ctx.runId}:${correlationId}:${ctx.generateUlid()}`; + const lockIndex = ctx.nextLockIndex++; + const correlationId = createLockWakeCorrelationId(ctx.runId, lockIndex); const definition = { concurrency: options.concurrency, rate: options.rate, @@ -78,9 +85,10 @@ export function createLock(ctx: WorkflowOrchestratorContext) { while (true) { const result = await getWorld().limits.acquire({ key: options.key, - holderId, + runId: ctx.runId, + lockIndex, definition, - leaseTtlMs: options.leaseTtlMs, + leaseTtlMs: options.leaseTtlMs ?? DEFAULT_LOCK_LEASE_TTL_MS, }); if (result.status === 'acquired') { diff --git a/packages/core/src/workflow/sleep.test.ts b/packages/core/src/workflow/sleep.test.ts index 8b77ca2c76..b6853c4405 100644 --- a/packages/core/src/workflow/sleep.test.ts +++ b/packages/core/src/workflow/sleep.test.ts @@ -32,6 +32,7 @@ function setupWorkflowContext(events: Event[]): WorkflowOrchestratorContext { }, getPromiseQueue: () => Promise.resolve(), }), + nextLockIndex: 0, invocationsQueue: new Map(), generateUlid: () => ulid(workflowStartedAt), generateNanoid: nanoid.customRandom(nanoid.urlAlphabet, 21, (size) => diff --git a/packages/world-local/src/limits.test.ts b/packages/world-local/src/limits.test.ts index 6428422dbb..8b301c2d00 100644 --- a/packages/world-local/src/limits.test.ts +++ b/packages/world-local/src/limits.test.ts @@ -21,9 +21,9 @@ createLimitsContractSuite('local world limits', async () => { keys?: Record< string, { - leases?: { holderId: string }[]; - waiters?: { holderId: string }[]; - tokens?: { holderId: string }[]; + leases?: { lockId: string }[]; + waiters?: { lockId: string }[]; + tokens?: { lockId: string }[]; } >; }; @@ -43,10 +43,10 @@ createLimitsContractSuite('local world limits', async () => { const keyState = raw.keys?.[key]; return { - leaseHolderIds: keyState?.leases?.map((lease) => lease.holderId) ?? 
[], + leaseHolderIds: keyState?.leases?.map((lease) => lease.lockId) ?? [], waiterHolderIds: - keyState?.waiters?.map((waiter) => waiter.holderId) ?? [], - tokenHolderIds: keyState?.tokens?.map((token) => token.holderId) ?? [], + keyState?.waiters?.map((waiter) => waiter.lockId) ?? [], + tokenHolderIds: keyState?.tokens?.map((token) => token.lockId) ?? [], }; }, close: async () => { diff --git a/packages/world-local/src/limits.ts b/packages/world-local/src/limits.ts index 081b95f63e..896b9ad3d6 100644 --- a/packages/world-local/src/limits.ts +++ b/packages/world-local/src/limits.ts @@ -1,12 +1,9 @@ import path from 'node:path'; import { WorkflowWorldError } from '@workflow/errors'; -import type { - Queue, - Storage, - WorkflowRunWithoutData, - StepWithoutData, -} from '@workflow/world'; +import type { Queue, Storage, WorkflowRunWithoutData } from '@workflow/world'; import { + createLockId, + createLockWakeCorrelationId, LimitAcquireRequestSchema, type LimitAcquireResult, LimitHeartbeatRequestSchema, @@ -14,6 +11,7 @@ import { LimitLeaseSchema, LimitReleaseRequestSchema, type Limits, + parseLockId, } from '@workflow/world'; import { z } from 'zod'; import { readJSON, writeJSON } from './fs.js'; @@ -21,14 +19,16 @@ import { monotonicUlid } from './storage/helpers.js'; const LimitTokenSchema = z.object({ tokenId: z.string(), - holderId: z.string(), + lockId: z.string(), acquiredAt: z.coerce.date(), expiresAt: z.coerce.date(), }); const LimitWaiterSchema = z.object({ waiterId: z.string(), - holderId: z.string(), + lockId: z.string(), + runId: z.string(), + lockIndex: z.number().int().nonnegative(), createdAt: z.coerce.date(), leaseTtlMs: z.number().int().positive().optional(), concurrencyMax: z.number().int().positive().nullable(), @@ -55,15 +55,10 @@ type LimitsState = z.infer; type HolderTarget = | { - kind: 'workflow'; + kind: 'lock'; runId: string; correlationId: string; } - | { - kind: 'step'; - runId: string; - stepId: string; - } | { kind: 'opaque'; }; @@ -71,7 
+66,7 @@ type HolderTarget = export interface LocalLimitsOptions { tag?: string; queue?: Pick; - storage?: Pick; + storage?: Pick; } const EMPTY_STATE: LimitsState = { @@ -164,7 +159,8 @@ function getRetryAfterMs( function createLease( key: string, - holderId: string, + runId: string, + lockIndex: number, definition: LimitLease['definition'], acquiredAt: Date, leaseTtlMs?: number @@ -172,7 +168,9 @@ function createLease( return { leaseId: `lmt_${monotonicUlid()}`, key, - holderId, + lockId: createLockId(runId, lockIndex), + runId, + lockIndex, acquiredAt, expiresAt: leaseTtlMs !== undefined @@ -184,31 +182,29 @@ function createLease( function insertToken( keyState: KeyState, - holderId: string, + lockId: string, acquiredAt: Date, periodMs: number ) { keyState.tokens.push({ tokenId: `lmttok_${monotonicUlid()}`, - holderId, + lockId, acquiredAt, expiresAt: new Date(acquiredAt.getTime() + periodMs), }); } -function parseHolderId(holderId: string): HolderTarget { - if (holderId.startsWith('wflock_')) { - const [runId, correlationId] = holderId.slice('wflock_'.length).split(':'); - if (runId && correlationId) { - return { kind: 'workflow', runId, correlationId }; - } - } - - if (holderId.startsWith('stplock_')) { - const [runId, stepId] = holderId.slice('stplock_'.length).split(':'); - if (runId && stepId) { - return { kind: 'step', runId, stepId }; - } +function parseHolderId(lockId: string): HolderTarget { + const parsedLockId = parseLockId(lockId); + if (parsedLockId) { + return { + kind: 'lock', + runId: parsedLockId.runId, + correlationId: createLockWakeCorrelationId( + parsedLockId.runId, + parsedLockId.lockIndex + ), + }; } return { kind: 'opaque' }; @@ -218,14 +214,6 @@ function isTerminalRun(run: WorkflowRunWithoutData | undefined) { return !run || ['completed', 'failed', 'cancelled'].includes(run.status); } -function isTerminalStep(step: StepWithoutData | undefined) { - return !step || ['completed', 'failed', 'cancelled'].includes(step.status); -} - -function 
toMillis(value: Date | undefined): number | undefined { - return value ? value.getTime() : undefined; -} - function deleteEmptyKey(state: LimitsState, key: string) { const keyState = state.keys[key]; if (!keyState) return; @@ -277,35 +265,14 @@ export function createLimits( } }; - const getStep = async ( - runId: string, - stepId: string - ): Promise => { - try { - return await options?.storage?.steps.get(runId, stepId, { - resolveData: 'none', - }); - } catch { - return undefined; - } - }; - const isHolderLive = async (holderId: string): Promise => { const target = parseHolderId(holderId); if (target.kind === 'opaque' || !options?.storage) { return true; } - if (target.kind === 'workflow') { - const run = await getRun(target.runId); - return !isTerminalRun(run); - } - - const [run, step] = await Promise.all([ - getRun(target.runId), - getStep(target.runId, target.stepId), - ]); - return !isTerminalRun(run) && !isTerminalStep(step); + const run = await getRun(target.runId); + return !isTerminalRun(run); }; const queueWakeForHolder = async (holderId: string): Promise => { @@ -315,40 +282,17 @@ export function createLimits( } try { - if (target.kind === 'workflow') { - const run = await getRun(target.runId); - if (isTerminalRun(run) || !run) return; - - await options.queue.queue( - `__wkf_workflow_${run.workflowName}`, - { - runId: target.runId, - requestedAt: new Date(), - }, - { - idempotencyKey: target.correlationId, - } - ); - return; - } - - const [run, step] = await Promise.all([ - getRun(target.runId), - getStep(target.runId, target.stepId), - ]); - if (isTerminalRun(run) || isTerminalStep(step) || !run || !step) return; + const run = await getRun(target.runId); + if (isTerminalRun(run) || !run) return; await options.queue.queue( - `__wkf_step_${step.stepName}`, + `__wkf_workflow_${run.workflowName}`, { - workflowName: run.workflowName, - workflowRunId: target.runId, - workflowStartedAt: toMillis(run.startedAt) ?? 
Date.now(), - stepId: target.stepId, + runId: target.runId, requestedAt: new Date(), }, { - idempotencyKey: target.stepId, + idempotencyKey: target.correlationId, } ); } catch (error) { @@ -369,7 +313,7 @@ export function createLimits( for (let index = 0; index < promotedKeyState.waiters.length; index++) { const waiter = promotedKeyState.waiters[index]; - if (!(await isHolderLive(waiter.holderId))) { + if (!(await isHolderLive(waiter.lockId))) { continue; } @@ -405,7 +349,8 @@ export function createLimits( promotedKeyState.leases.push( createLease( key, - waiter.holderId, + waiter.runId, + waiter.lockIndex, definition, acquiredAt, waiter.leaseTtlMs @@ -416,14 +361,14 @@ export function createLimits( if (waiter.rateCount !== null && waiter.ratePeriodMs !== null) { insertToken( promotedKeyState, - waiter.holderId, + waiter.lockId, acquiredAt, waiter.ratePeriodMs ); activeTokens += 1; } - wakeHolders.push(waiter.holderId); + wakeHolders.push(waiter.lockId); } promotedKeyState.waiters = remainingWaiters; @@ -433,6 +378,7 @@ export function createLimits( return { async acquire(request) { const parsed = LimitAcquireRequestSchema.parse(request); + const lockId = createLockId(parsed.runId, parsed.lockIndex); return withStateLock(async (): Promise => { const state = cloneState(await readState()); @@ -451,7 +397,7 @@ export function createLimits( state.keys[parsed.key] = keyState; const existingLease = keyState.leases.find( - (lease) => lease.holderId === parsed.holderId + (lease) => lease.lockId === lockId ); if (existingLease) { await writeState(state); @@ -469,7 +415,7 @@ export function createLimits( parsed.definition.rate !== undefined && keyState.tokens.length >= parsed.definition.rate.count; const existingWaiter = keyState.waiters.find( - (waiter) => waiter.holderId === parsed.holderId + (waiter) => waiter.lockId === lockId ); if ( @@ -481,7 +427,9 @@ export function createLimits( if (!existingWaiter) { keyState.waiters.push({ waiterId: `lmtwait_${monotonicUlid()}`, - 
holderId: parsed.holderId, + lockId, + runId: parsed.runId, + lockIndex: parsed.lockIndex, createdAt: new Date(), leaseTtlMs: parsed.leaseTtlMs, concurrencyMax: parsed.definition.concurrency?.max ?? null, @@ -508,7 +456,8 @@ export function createLimits( const acquiredAt = new Date(); const lease = createLease( parsed.key, - parsed.holderId, + parsed.runId, + parsed.lockIndex, parsed.definition, acquiredAt, parsed.leaseTtlMs @@ -519,7 +468,7 @@ export function createLimits( if (parsed.definition.rate) { insertToken( keyState, - parsed.holderId, + lockId, acquiredAt, parsed.definition.rate.periodMs ); @@ -549,7 +498,7 @@ export function createLimits( keyState.leases = keyState.leases.filter((lease) => { if (lease.leaseId !== parsed.leaseId) return true; if (parsed.key && lease.key !== parsed.key) return true; - if (parsed.holderId && lease.holderId !== parsed.holderId) { + if (parsed.lockId && lease.lockId !== parsed.lockId) { return true; } return false; diff --git a/packages/world-local/src/storage.test.ts b/packages/world-local/src/storage.test.ts index 6bfef563e6..7f36478179 100644 --- a/packages/world-local/src/storage.test.ts +++ b/packages/world-local/src/storage.test.ts @@ -2449,83 +2449,6 @@ describe('Storage', () => { }); }); - describe('step_deferred event handling', () => { - let testRunId: string; - - beforeEach(async () => { - const run = await createRun(storage, { - deploymentId: 'deployment-123', - workflowName: 'test-workflow', - input: new Uint8Array(), - }); - testRunId = run.runId; - }); - - it('should roll back the first blocked attempt without recording an error', async () => { - await createStep(storage, testRunId, { - stepId: 'step_deferred_1', - stepName: 'test-step', - input: new Uint8Array(), - }); - await updateStep(storage, testRunId, 'step_deferred_1', 'step_started'); - - const retryAfter = new Date(Date.now() + 5_000); - const result = await storage.events.create(testRunId, { - eventType: 'step_deferred', - correlationId: 
'step_deferred_1', - eventData: { - retryAfter, - }, - }); - - expect(result.step).toMatchObject({ - status: 'pending', - attempt: 0, - startedAt: undefined, - retryAfter, - error: undefined, - }); - }); - - it('should preserve the original startedAt after a prior real attempt', async () => { - await createStep(storage, testRunId, { - stepId: 'step_deferred_2', - stepName: 'test-step', - input: new Uint8Array(), - }); - - const started1 = await updateStep( - storage, - testRunId, - 'step_deferred_2', - 'step_started' - ); - await storage.events.create(testRunId, { - eventType: 'step_retrying', - correlationId: 'step_deferred_2', - eventData: { error: 'Temporary failure' }, - }); - await updateStep(storage, testRunId, 'step_deferred_2', 'step_started'); - - const retryAfter = new Date(Date.now() + 5_000); - const result = await storage.events.create(testRunId, { - eventType: 'step_deferred', - correlationId: 'step_deferred_2', - eventData: { - retryAfter, - }, - }); - - expect(result.step).toMatchObject({ - status: 'pending', - attempt: 1, - retryAfter, - error: undefined, - }); - expect(result.step?.startedAt).toEqual(started1.startedAt); - }); - }); - describe('run cancellation with in-flight entities', () => { it('should allow in-progress step to complete after run cancelled', async () => { const run = await createRun(storage, { diff --git a/packages/world-local/src/storage/events-storage.ts b/packages/world-local/src/storage/events-storage.ts index 2bd5025696..c4d0497e83 100644 --- a/packages/world-local/src/storage/events-storage.ts +++ b/packages/world-local/src/storage/events-storage.ts @@ -108,15 +108,11 @@ export function createEventsStorage( ['completed', 'failed', 'cancelled'].includes(status); // Get current run state for validation (if not creating a new run) - // Skip run validation for step_completed, step_deferred, and step_retrying - they only operate + // Skip run validation for step_completed and step_retrying - they only operate // on running 
steps, and running steps are always allowed to modify regardless // of run state. This optimization saves filesystem reads per step event. let currentRun: WorkflowRun | null = null; - const skipRunValidationEvents = [ - 'step_completed', - 'step_deferred', - 'step_retrying', - ]; + const skipRunValidationEvents = ['step_completed', 'step_retrying']; if ( data.eventType !== 'run_created' && !skipRunValidationEvents.includes(data.eventType) @@ -134,7 +130,7 @@ export function createEventsStorage( // VERSION COMPATIBILITY: Check run spec version // ============================================================ // For events that have fetched the run, check version compatibility. - // Skip for run_created (no existing run) and runtime events (step_completed, step_deferred, step_retrying). + // Skip for run_created (no existing run) and runtime events (step_completed, step_retrying). if (currentRun) { // Check if run requires a newer world version if (requiresNewerWorld(currentRun.specVersion)) { @@ -223,7 +219,6 @@ export function createEventsStorage( 'step_started', 'step_completed', 'step_failed', - 'step_deferred', 'step_retrying', ]; if (stepEvents.includes(data.eventType) && data.correlationId) { @@ -605,27 +600,6 @@ export function createEventsStorage( { overwrite: true } ); } - } else if (data.eventType === 'step_deferred' && 'eventData' in data) { - // step_deferred: returns the step to pending without recording a failure - if (validatedStep) { - const stepCompositeKey = `${effectiveRunId}-${data.correlationId}`; - const rolledBackAttempt = Math.max(0, validatedStep.attempt - 1); - step = { - ...validatedStep, - status: 'pending', - attempt: rolledBackAttempt, - startedAt: - rolledBackAttempt === 0 ? 
undefined : validatedStep.startedAt, - error: undefined, - retryAfter: data.eventData.retryAfter, - updatedAt: now, - }; - await writeJSON( - taggedPath(basedir, 'steps', stepCompositeKey, tag), - step, - { overwrite: true } - ); - } } else if (data.eventType === 'step_retrying' && 'eventData' in data) { // step_retrying: Sets status back to 'pending', records error // Reuse validatedStep from validation (already read above) diff --git a/packages/world-postgres/README.md b/packages/world-postgres/README.md index a96cf3b680..1b48974de8 100644 --- a/packages/world-postgres/README.md +++ b/packages/world-postgres/README.md @@ -129,7 +129,7 @@ Make sure your PostgreSQL database is accessible and the user has sufficient per - Backlog stays in PostgreSQL when all execution slots are busy - Retry and sleep-style delays use Graphile `runAt` scheduling - Flow-limit waiters are stored durably in PostgreSQL and promoted in FIFO order per key -- Cancelled workflow and failed/completed step waiters are pruned before promotion +- Cancelled workflow waiters are pruned before promotion - Blocked steps are re-queued instead of holding a worker slot while waiting for a lease - Workflow and step execution is sent through `/.well-known/workflow/v1/flow` and `/.well-known/workflow/v1/step` diff --git a/packages/world-postgres/src/limits.test.ts b/packages/world-postgres/src/limits.test.ts index e7c8193788..44ab39f16e 100644 --- a/packages/world-postgres/src/limits.test.ts +++ b/packages/world-postgres/src/limits.test.ts @@ -14,16 +14,16 @@ if (process.platform === 'win32') { let db: Awaited< ReturnType >; - let queue: ReturnType; beforeAll(async () => { const { createPostgresTestDb } = await import('../test/test-db.js'); db = await createPostgresTestDb(); - queue = createQueue( + const queue = createQueue( { connectionString: db.connectionString, queueConcurrency: 1 }, db.sql ); await queue.start(); + await queue.close(); }, 120_000); beforeEach(async () => { @@ -31,7 +31,6 @@ if 
(process.platform === 'win32') { }); afterAll(async () => { - await queue?.close(); await db?.close(); }); @@ -48,20 +47,20 @@ if (process.platform === 'win32') { }, inspectKeyState: async (key) => { const [leases, waiters, tokens] = await Promise.all([ - db.sql<{ holderId: string }[]>` - select holder_id as "holderId" + db.sql<{ lockId: string }[]>` + select holder_id as "lockId" from workflow.workflow_limit_leases where limit_key = ${key} order by holder_id asc `, - db.sql<{ holderId: string }[]>` - select holder_id as "holderId" + db.sql<{ lockId: string }[]>` + select holder_id as "lockId" from workflow.workflow_limit_waiters where limit_key = ${key} order by created_at asc, holder_id asc `, - db.sql<{ holderId: string }[]>` - select holder_id as "holderId" + db.sql<{ lockId: string }[]>` + select holder_id as "lockId" from workflow.workflow_limit_tokens where limit_key = ${key} order by acquired_at asc, holder_id asc @@ -69,9 +68,9 @@ if (process.platform === 'win32') { ]); return { - leaseHolderIds: leases.map((row) => row.holderId), - waiterHolderIds: waiters.map((row) => row.holderId), - tokenHolderIds: tokens.map((row) => row.holderId), + leaseHolderIds: leases.map((row) => row.lockId), + waiterHolderIds: waiters.map((row) => row.lockId), + tokenHolderIds: tokens.map((row) => row.lockId), }; }, }; diff --git a/packages/world-postgres/src/limits.ts b/packages/world-postgres/src/limits.ts index b83680a2f1..22220ff8e3 100644 --- a/packages/world-postgres/src/limits.ts +++ b/packages/world-postgres/src/limits.ts @@ -2,6 +2,8 @@ import { JsonTransport } from '@vercel/queue'; import { and, asc, eq, isNotNull, lte, sql } from 'drizzle-orm'; import { WorkflowWorldError } from '@workflow/errors'; import { + createLockId, + createLockWakeCorrelationId, LimitAcquireRequestSchema, type LimitAcquireResult, LimitHeartbeatRequestSchema, @@ -9,6 +11,7 @@ import { LimitReleaseRequestSchema, type Limits, MessageId, + parseLockId, } from '@workflow/world'; import { 
monotonicFactory } from 'ulid'; import type { PostgresWorldConfig } from './config.js'; @@ -23,21 +26,15 @@ type RunRow = Pick< typeof Schema.runs.$inferSelect, 'workflowName' | 'startedAt' | 'status' >; -type StepRow = Pick; type Tx = Parameters[0]>[0]; type Db = Drizzle | Tx; type HolderTarget = | { - kind: 'workflow'; + kind: 'lock'; runId: string; correlationId: string; } - | { - kind: 'step'; - runId: string; - stepId: string; - } | { kind: 'opaque'; }; @@ -49,7 +46,6 @@ function getQueues(config: PostgresWorldConfig) { const prefix = config.jobPrefix || 'workflow_'; return { workflow: `${prefix}flows`, - step: `${prefix}steps`, } as const; } @@ -72,29 +68,30 @@ function toMillis(value: Date | string | null | undefined): number | undefined { Holder ids double as wake-up hints. When a waiter is promoted, we decode the holder id to decide which queue to poke. */ -function parseHolderId(holderId: string): HolderTarget { - if (holderId.startsWith('wflock_')) { - const [runId, correlationId] = holderId.slice('wflock_'.length).split(':'); - if (runId && correlationId) { - return { kind: 'workflow', runId, correlationId }; - } - } - - if (holderId.startsWith('stplock_')) { - const [runId, stepId] = holderId.slice('stplock_'.length).split(':'); - if (runId && stepId) { - return { kind: 'step', runId, stepId }; - } +function parseHolderId(lockId: string): HolderTarget { + const parsedLockId = parseLockId(lockId); + if (parsedLockId) { + return { + kind: 'lock', + runId: parsedLockId.runId, + correlationId: createLockWakeCorrelationId( + parsedLockId.runId, + parsedLockId.lockIndex + ), + }; } return { kind: 'opaque' }; } function toLease(row: LeaseRow): LimitLease { + const parsedLockId = parseLockId(row.holderId); return { leaseId: row.leaseId, key: row.limitKey, - holderId: row.holderId, + lockId: row.holderId, + runId: parsedLockId?.runId ?? row.holderId, + lockIndex: parsedLockId?.lockIndex ?? 
0, acquiredAt: toDate(row.acquiredAt)!, expiresAt: toDate(row.expiresAt), definition: { @@ -183,45 +180,6 @@ async function queueWorkflowWake( `); } -async function queueStepWake( - tx: Db, - config: PostgresWorldConfig, - step: { - stepId: string; - stepName: string; - workflowName: string; - workflowStartedAt: number; - workflowRunId: string; - } -) { - const messageId = MessageId.parse(`msg_${generateId()}`); - const payload = MessageData.encode({ - id: step.stepName, - data: Buffer.from( - transport.serialize({ - workflowName: step.workflowName, - workflowRunId: step.workflowRunId, - workflowStartedAt: step.workflowStartedAt, - stepId: step.stepId, - requestedAt: new Date(), - }) - ), - attempt: 1, - idempotencyKey: step.stepId, - messageId, - }); - - await tx.execute(sql` - select graphile_worker.add_job( - ${getQueues(config).step}::text, - payload := ${JSON.stringify(payload)}::json, - max_attempts := 3, - job_key := ${step.stepId}::text, - job_key_mode := 'replace' - ) - `); -} - async function queueWakeForHolder( tx: Db, config: PostgresWorldConfig, @@ -229,50 +187,13 @@ async function queueWakeForHolder( ) { /* Limit state is durable in Postgres, but wake-ups still need a runtime target. - If the run or step is already terminal, there is nothing left to resume. + If the workflow is already terminal, there is nothing left to resume. 
*/ const target = parseHolderId(holderId); if (target.kind === 'opaque') { return; } - if (target.kind === 'workflow') { - const [run] = (await tx - .select({ - workflowName: Schema.runs.workflowName, - startedAt: Schema.runs.startedAt, - status: Schema.runs.status, - }) - .from(Schema.runs) - .where(eq(Schema.runs.runId, target.runId)) - .limit(1)) as RunRow[]; - - if (!run || ['completed', 'failed', 'cancelled'].includes(run.status)) { - return; - } - - await queueWorkflowWake( - tx, - config, - target.runId, - run.workflowName, - target.correlationId - ); - return; - } - - const [step] = (await tx - .select({ - stepName: Schema.steps.stepName, - status: Schema.steps.status, - }) - .from(Schema.steps) - .where(eq(Schema.steps.stepId, target.stepId)) - .limit(1)) as StepRow[]; - if (!step || ['completed', 'failed'].includes(step.status)) { - return; - } - const [run] = (await tx .select({ workflowName: Schema.runs.workflowName, @@ -286,13 +207,13 @@ async function queueWakeForHolder( return; } - await queueStepWake(tx, config, { - stepId: target.stepId, - stepName: step.stepName, - workflowName: run.workflowName, - workflowStartedAt: toMillis(run.startedAt) ?? 
Date.now(), - workflowRunId: target.runId, - }); + await queueWorkflowWake( + tx, + config, + target.runId, + run.workflowName, + target.correlationId + ); } async function pruneExpired(tx: Db, key: string): Promise { @@ -373,29 +294,6 @@ async function isHolderLive(tx: Db, holderId: string): Promise { return true; } - if (target.kind === 'workflow') { - const [run] = (await tx - .select({ - status: Schema.runs.status, - }) - .from(Schema.runs) - .where(eq(Schema.runs.runId, target.runId)) - .limit(1)) as Pick[]; - - return !!run && !['completed', 'failed', 'cancelled'].includes(run.status); - } - - const [step] = (await tx - .select({ - status: Schema.steps.status, - }) - .from(Schema.steps) - .where(eq(Schema.steps.stepId, target.stepId)) - .limit(1)) as Pick[]; - if (!step || ['completed', 'failed'].includes(step.status)) { - return false; - } - const [run] = (await tx .select({ status: Schema.runs.status, @@ -502,8 +400,9 @@ export function createLimits( await promoteWaiters(tx, config, parsed.key); const state = await getActiveState(tx, parsed.key); + const lockId = createLockId(parsed.runId, parsed.lockIndex); const existingLease = state.leases.find( - (lease) => lease.holderId === parsed.holderId + (lease) => lease.holderId === lockId ); if (existingLease) { return { @@ -513,7 +412,7 @@ export function createLimits( } const existingWaiter = state.waiters.find( - (waiter) => waiter.holderId === parsed.holderId + (waiter) => waiter.holderId === lockId ); // If there are already waiters for this key and holder no need to queue a new waiter. if (existingWaiter) { @@ -553,7 +452,7 @@ export function createLimits( .values({ leaseId: `lmt_${generateId()}`, limitKey: parsed.key, - holderId: parsed.holderId, + holderId: lockId, acquiredAt: new Date(), expiresAt, concurrencyMax: parsed.definition.concurrency?.max ?? 
null, @@ -566,7 +465,7 @@ export function createLimits( await tx.insert(Schema.limitTokens).values({ tokenId: `lmttok_${generateId()}`, limitKey: parsed.key, - holderId: parsed.holderId, + holderId: lockId, acquiredAt: new Date(), expiresAt: new Date(Date.now() + parsed.definition.rate.periodMs), }); @@ -584,7 +483,7 @@ export function createLimits( .values({ waiterId: `lmtwait_${generateId()}`, limitKey: parsed.key, - holderId: parsed.holderId, + holderId: lockId, createdAt: new Date(), leaseTtlMs: parsed.leaseTtlMs ?? null, concurrencyMax: parsed.definition.concurrency?.max ?? null, @@ -630,8 +529,8 @@ export function createLimits( if (parsed.key) { where = and(where, eq(Schema.limitLeases.limitKey, parsed.key))!; } - if (parsed.holderId) { - where = and(where, eq(Schema.limitLeases.holderId, parsed.holderId))!; + if (parsed.lockId) { + where = and(where, eq(Schema.limitLeases.holderId, parsed.lockId))!; } const [deleted] = await tx diff --git a/packages/world-postgres/src/storage.ts b/packages/world-postgres/src/storage.ts index 5fa5adac83..882f7ec7d8 100644 --- a/packages/world-postgres/src/storage.ts +++ b/packages/world-postgres/src/storage.ts @@ -358,16 +358,12 @@ export function createEventsStorage(drizzle: Drizzle): Storage['events'] { // ============================================================ // Get current run state for validation (if not creating a new run) - // Skip run validation for step_completed, step_deferred, and step_retrying - they only operate + // Skip run validation for step_completed and step_retrying - they only operate // on running steps, and running steps are always allowed to modify regardless // of run state. This optimization saves database queries per step event. 
let currentRun: { status: string; specVersion: number | null } | null = null; - const skipRunValidationEvents = [ - 'step_completed', - 'step_deferred', - 'step_retrying', - ]; + const skipRunValidationEvents = ['step_completed', 'step_retrying']; if ( data.eventType !== 'run_created' && !skipRunValidationEvents.includes(data.eventType) @@ -383,7 +379,7 @@ export function createEventsStorage(drizzle: Drizzle): Storage['events'] { // VERSION COMPATIBILITY: Check run spec version // ============================================================ // For events that have fetched the run, check version compatibility. - // Skip for run_created (no existing run) and runtime events (step_completed, step_deferred, step_retrying). + // Skip for run_created (no existing run) and runtime events (step_completed, step_retrying). if (currentRun) { // Check if run requires a newer world version if (requiresNewerWorld(currentRun.specVersion)) { @@ -478,11 +474,7 @@ export function createEventsStorage(drizzle: Drizzle): Storage['events'] { startedAt: Date | null; retryAfter: Date | null; } | null = null; - const stepEventsNeedingValidation = [ - 'step_started', - 'step_deferred', - 'step_retrying', - ]; + const stepEventsNeedingValidation = ['step_started', 'step_retrying']; if ( stepEventsNeedingValidation.includes(data.eventType) && data.correlationId @@ -913,49 +905,6 @@ export function createEventsStorage(drizzle: Drizzle): Storage['events'] { } } - // Handle step_deferred event: returns the step to pending without recording a failure - if (data.eventType === 'step_deferred') { - const eventData = (data as any).eventData as { - retryAfter?: Date; - }; - - const [stepValue] = await drizzle - .update(Schema.steps) - .set({ - status: 'pending', - attempt: sql`GREATEST(${Schema.steps.attempt} - 1, 0)`, - startedAt: sql`CASE WHEN ${Schema.steps.attempt} <= 1 THEN NULL ELSE ${Schema.steps.startedAt} END`, - error: null, - retryAfter: eventData.retryAfter, - }) - .where( - and( - 
eq(Schema.steps.runId, effectiveRunId), - eq(Schema.steps.stepId, data.correlationId!), - notInArray(Schema.steps.status, terminalStepStatuses) - ) - ) - .returning(); - if (stepValue) { - step = deserializeStepError(compact(stepValue)); - } else { - const [existing] = await getStepForValidation.execute({ - runId: effectiveRunId, - stepId: data.correlationId!, - }); - if (!existing) { - throw new WorkflowWorldError( - `Step "${data.correlationId}" not found` - ); - } - if (isStepTerminal(existing.status)) { - throw new EntityConflictError( - `Cannot modify step in terminal state "${existing.status}"` - ); - } - } - } - // Handle step_retrying event: sets status back to 'pending', records error // Uses conditional UPDATE to prevent retrying an already-terminal step. if (data.eventType === 'step_retrying') { diff --git a/packages/world-postgres/test/storage.test.ts b/packages/world-postgres/test/storage.test.ts index 8b2328c4c1..3023790d65 100644 --- a/packages/world-postgres/test/storage.test.ts +++ b/packages/world-postgres/test/storage.test.ts @@ -1808,122 +1808,6 @@ describe('Storage (Postgres integration)', () => { }); }); - describe('step_deferred event handling', () => { - let testRunId: string; - - beforeEach(async () => { - const run = await createRun(events, { - deploymentId: 'deployment-123', - workflowName: 'test-workflow', - input: new Uint8Array(), - }); - testRunId = run.runId; - }); - - it('should roll back the first blocked attempt without recording an error', async () => { - await createStep(events, testRunId, { - stepId: 'step_deferred_1', - stepName: 'test-step', - input: new Uint8Array(), - }); - await updateStep(events, testRunId, 'step_deferred_1', 'step_started'); - - const retryAfter = new Date(Date.now() + 5_000); - const result = await events.create(testRunId, { - eventType: 'step_deferred', - correlationId: 'step_deferred_1', - eventData: { - retryAfter, - }, - }); - - expect(result.step).toMatchObject({ - status: 'pending', - attempt: 0, 
- startedAt: undefined, - retryAfter, - error: undefined, - }); - }); - - it('should preserve the original startedAt after a prior real attempt', async () => { - await createStep(events, testRunId, { - stepId: 'step_deferred_2', - stepName: 'test-step', - input: new Uint8Array(), - }); - - const started1 = await updateStep( - events, - testRunId, - 'step_deferred_2', - 'step_started' - ); - await events.create(testRunId, { - eventType: 'step_retrying', - correlationId: 'step_deferred_2', - eventData: { error: 'Temporary failure' }, - }); - await updateStep(events, testRunId, 'step_deferred_2', 'step_started'); - - const retryAfter = new Date(Date.now() + 5_000); - const result = await events.create(testRunId, { - eventType: 'step_deferred', - correlationId: 'step_deferred_2', - eventData: { - retryAfter, - }, - }); - - expect(result.step).toMatchObject({ - status: 'pending', - attempt: 1, - retryAfter, - error: undefined, - }); - expect(result.step?.startedAt).toEqual(started1.startedAt); - }); - - it('throws WorkflowWorldError when step_deferred targets a missing step', async () => { - await expect( - events.create(testRunId, { - eventType: 'step_deferred', - correlationId: 'step_missing_deferred', - eventData: { - retryAfter: new Date(Date.now() + 5_000), - }, - }) - ).rejects.toBeInstanceOf(WorkflowWorldError); - }); - - it('throws EntityConflictError when step_deferred targets a terminal step', async () => { - await createStep(events, testRunId, { - stepId: 'step_deferred_terminal', - stepName: 'test-step', - input: new Uint8Array(), - }); - await updateStep( - events, - testRunId, - 'step_deferred_terminal', - 'step_failed', - { - error: 'already failed', - } - ); - - await expect( - events.create(testRunId, { - eventType: 'step_deferred', - correlationId: 'step_deferred_terminal', - eventData: { - retryAfter: new Date(Date.now() + 5_000), - }, - }) - ).rejects.toBeInstanceOf(EntityConflictError); - }); - }); - describe('run cancellation with in-flight 
entities', () => { it('should allow in-progress step to complete after run cancelled', async () => { const run = await createRun(events, { diff --git a/packages/world-testing/src/limits-contract.ts b/packages/world-testing/src/limits-contract.ts index 515bbf7fc4..f36c33c410 100644 --- a/packages/world-testing/src/limits-contract.ts +++ b/packages/world-testing/src/limits-contract.ts @@ -1,6 +1,8 @@ import { setTimeout as sleep } from 'node:timers/promises'; import { SPEC_VERSION_CURRENT, + type LimitDefinition, + type LimitLease, type Limits, type Storage, } from '@workflow/world'; @@ -17,6 +19,16 @@ export interface LimitsHarness { close?: () => Promise; } +interface LockOwner { + lockId: string; + runId: string; + lockIndex: number; +} + +function createTestLockId(runId: string, lockIndex: number) { + return `${runId}:${lockIndex}`; +} + async function createRun( storage: Pick, workflowName: string @@ -36,24 +48,49 @@ async function createRun( return result.run; } -async function createStep( - storage: Pick, - runId: string, - stepId: string -) { - const result = await storage.events.create(runId, { - eventType: 'step_created', - specVersion: SPEC_VERSION_CURRENT, - correlationId: stepId, - eventData: { - stepName: 'test-step', - input: [], - }, - }); - if (!result.step) { - throw new Error('expected step'); +function requireEventsStorage( + storage: LimitsHarness['storage'] +): Pick { + if (!storage) { + throw new Error('storage.events is required for limits tests'); } - return result.step; + return storage; +} + +async function createLockOwner( + storage: LimitsHarness['storage'], + workflowName: string, + lockIndex = 0 +): Promise { + const run = await createRun(requireEventsStorage(storage), workflowName); + return { + lockId: createTestLockId(run.runId, lockIndex), + runId: run.runId, + lockIndex, + }; +} + +function acquireRequest( + owner: LockOwner, + key: string, + definition: LimitDefinition, + leaseTtlMs?: number +) { + return { + key, + runId: 
owner.runId, + lockIndex: owner.lockIndex, + definition, + ...(leaseTtlMs !== undefined ? { leaseTtlMs } : {}), + }; +} + +function releaseRequest(lease: LimitLease) { + return { + leaseId: lease.leaseId, + key: lease.key, + lockId: lease.lockId, + }; } export function createLimitsContractSuite( @@ -80,39 +117,43 @@ export function createLimitsContractSuite( it('enforces per-key concurrency limits', async () => { const harness = await createHarness(); try { - const first = await harness.limits.acquire({ - key: 'step:db:cheap', - holderId: 'holder-a', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const ownerA = await createLockOwner(harness.storage, 'holder-a'); + const ownerB = await createLockOwner(harness.storage, 'holder-b'); + const first = await harness.limits.acquire( + acquireRequest( + ownerA, + 'step:db:cheap', + { concurrency: { max: 1 } }, + 1_000 + ) + ); expect(first.status).toBe('acquired'); if (first.status !== 'acquired') throw new Error('expected acquisition'); - const second = await harness.limits.acquire({ - key: 'step:db:cheap', - holderId: 'holder-b', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const second = await harness.limits.acquire( + acquireRequest( + ownerB, + 'step:db:cheap', + { concurrency: { max: 1 } }, + 1_000 + ) + ); expect(second).toMatchObject({ status: 'blocked', reason: 'concurrency', }); - await harness.limits.release({ - leaseId: first.lease.leaseId, - key: first.lease.key, - holderId: first.lease.holderId, - }); + await harness.limits.release(releaseRequest(first.lease)); - const third = await harness.limits.acquire({ - key: 'step:db:cheap', - holderId: 'holder-b', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const third = await harness.limits.acquire( + acquireRequest( + ownerB, + 'step:db:cheap', + { concurrency: { max: 1 } }, + 1_000 + ) + ); expect(third.status).toBe('acquired'); } finally { await harness.close?.(); @@ -122,19 +163,25 @@ 
export function createLimitsContractSuite( it('isolates unrelated keys at the raw limits layer', async () => { const harness = await createHarness(); try { + const ownerA = await createLockOwner(harness.storage, 'holder-a'); + const ownerB = await createLockOwner(harness.storage, 'holder-b'); const [first, second] = await Promise.all([ - harness.limits.acquire({ - key: 'workflow:user:a', - holderId: 'holder-a', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }), - harness.limits.acquire({ - key: 'workflow:user:b', - holderId: 'holder-b', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }), + harness.limits.acquire( + acquireRequest( + ownerA, + 'workflow:user:a', + { concurrency: { max: 1 } }, + 1_000 + ) + ), + harness.limits.acquire( + acquireRequest( + ownerB, + 'workflow:user:b', + { concurrency: { max: 1 } }, + 1_000 + ) + ), ]); expect(first.status).toBe('acquired'); @@ -147,14 +194,21 @@ export function createLimitsContractSuite( it('serializes concurrent acquires for the same key', async () => { const harness = await createHarness(); try { - const results = await Promise.all( + const owners = await Promise.all( Array.from({ length: 12 }, (_, index) => - harness.limits.acquire({ - key: 'workflow:user:concurrent', - holderId: `holder-${index}`, - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }) + createLockOwner(harness.storage, `holder-${index}`) + ) + ); + const results = await Promise.all( + owners.map((owner) => + harness.limits.acquire( + acquireRequest( + owner, + 'workflow:user:concurrent', + { concurrency: { max: 1 } }, + 1_000 + ) + ) ) ); @@ -174,48 +228,55 @@ export function createLimitsContractSuite( const harness = await createHarness(); try { const periodMs = 200; - const first = await harness.limits.acquire({ - key: 'step:provider:openai', - holderId: 'holder-a', - definition: { rate: { count: 1, periodMs } }, - leaseTtlMs: 1_000, - }); + const ownerA = await createLockOwner(harness.storage, 
'holder-a'); + const ownerB = await createLockOwner(harness.storage, 'holder-b'); + const ownerC = await createLockOwner(harness.storage, 'holder-c'); + const first = await harness.limits.acquire( + acquireRequest( + ownerA, + 'step:provider:openai', + { rate: { count: 1, periodMs } }, + 1_000 + ) + ); expect(first.status).toBe('acquired'); if (first.status !== 'acquired') throw new Error('expected acquisition'); - await harness.limits.release({ - leaseId: first.lease.leaseId, - key: first.lease.key, - holderId: first.lease.holderId, - }); + await harness.limits.release(releaseRequest(first.lease)); - const second = await harness.limits.acquire({ - key: 'step:provider:openai', - holderId: 'holder-b', - definition: { rate: { count: 1, periodMs } }, - leaseTtlMs: 1_000, - }); + const second = await harness.limits.acquire( + acquireRequest( + ownerB, + 'step:provider:openai', + { rate: { count: 1, periodMs } }, + 1_000 + ) + ); expect(second.status).toBe('blocked'); if (second.status !== 'blocked') throw new Error('expected blocked'); expect(second.reason).toBe('rate'); expect(second.retryAfterMs).toBeGreaterThanOrEqual(0); - let third = await harness.limits.acquire({ - key: 'step:provider:openai', - holderId: 'holder-c', - definition: { rate: { count: 1, periodMs } }, - leaseTtlMs: 1_000, - }); + let third = await harness.limits.acquire( + acquireRequest( + ownerC, + 'step:provider:openai', + { rate: { count: 1, periodMs } }, + 1_000 + ) + ); const deadline = Date.now() + periodMs + 1_000; while (third.status === 'blocked' && Date.now() < deadline) { await sleep(Math.max(25, third.retryAfterMs ?? 
0) + 50); - third = await harness.limits.acquire({ - key: 'step:provider:openai', - holderId: 'holder-c', - definition: { rate: { count: 1, periodMs } }, - leaseTtlMs: 1_000, - }); + third = await harness.limits.acquire( + acquireRequest( + ownerC, + 'step:provider:openai', + { rate: { count: 1, periodMs } }, + 1_000 + ) + ); } expect(third.status).toBe('acquired'); } finally { @@ -227,49 +288,53 @@ export function createLimitsContractSuite( const harness = await createHarness(); try { const periodMs = 300; - const first = await harness.limits.acquire({ - key: 'step:mixed', - holderId: 'holder-a', - definition: { - concurrency: { max: 1 }, - rate: { count: 1, periodMs }, - }, - leaseTtlMs: 1_000, - }); + const ownerA = await createLockOwner(harness.storage, 'holder-a'); + const ownerB = await createLockOwner(harness.storage, 'holder-b'); + const first = await harness.limits.acquire( + acquireRequest( + ownerA, + 'step:mixed', + { + concurrency: { max: 1 }, + rate: { count: 1, periodMs }, + }, + 1_000 + ) + ); expect(first.status).toBe('acquired'); if (first.status !== 'acquired') throw new Error('expected acquisition'); - const second = await harness.limits.acquire({ - key: 'step:mixed', - holderId: 'holder-b', - definition: { - concurrency: { max: 1 }, - rate: { count: 1, periodMs }, - }, - leaseTtlMs: 1_000, - }); + const second = await harness.limits.acquire( + acquireRequest( + ownerB, + 'step:mixed', + { + concurrency: { max: 1 }, + rate: { count: 1, periodMs }, + }, + 1_000 + ) + ); expect(second).toMatchObject({ status: 'blocked', reason: 'concurrency_and_rate', }); if (second.status !== 'blocked') throw new Error('expected blocked'); - await harness.limits.release({ - leaseId: first.lease.leaseId, - key: first.lease.key, - holderId: first.lease.holderId, - }); + await harness.limits.release(releaseRequest(first.lease)); - const third = await harness.limits.acquire({ - key: 'step:mixed', - holderId: 'holder-b', - definition: { - concurrency: { max: 1 }, - 
rate: { count: 1, periodMs }, - }, - leaseTtlMs: 1_000, - }); + const third = await harness.limits.acquire( + acquireRequest( + ownerB, + 'step:mixed', + { + concurrency: { max: 1 }, + rate: { count: 1, periodMs }, + }, + 1_000 + ) + ); expect(third).toMatchObject({ status: 'blocked', reason: 'rate', @@ -279,15 +344,17 @@ export function createLimitsContractSuite( const deadline = Date.now() + periodMs + 1_000; while (fourth.status === 'blocked' && Date.now() < deadline) { await sleep(Math.max(25, fourth.retryAfterMs ?? 0) + 50); - fourth = await harness.limits.acquire({ - key: 'step:mixed', - holderId: 'holder-b', - definition: { - concurrency: { max: 1 }, - rate: { count: 1, periodMs }, - }, - leaseTtlMs: 1_000, - }); + fourth = await harness.limits.acquire( + acquireRequest( + ownerB, + 'step:mixed', + { + concurrency: { max: 1 }, + rate: { count: 1, periodMs }, + }, + 1_000 + ) + ); } expect(fourth.status).toBe('acquired'); @@ -299,36 +366,40 @@ export function createLimitsContractSuite( it('restores capacity immediately when a lease is released', async () => { const harness = await createHarness(); try { - const first = await harness.limits.acquire({ - key: 'workflow:user:123', - holderId: 'holder-a', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const ownerA = await createLockOwner(harness.storage, 'holder-a'); + const ownerB = await createLockOwner(harness.storage, 'holder-b'); + const first = await harness.limits.acquire( + acquireRequest( + ownerA, + 'workflow:user:123', + { concurrency: { max: 1 } }, + 1_000 + ) + ); expect(first.status).toBe('acquired'); if (first.status !== 'acquired') throw new Error('expected acquisition'); - const second = await harness.limits.acquire({ - key: 'workflow:user:123', - holderId: 'holder-b', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const second = await harness.limits.acquire( + acquireRequest( + ownerB, + 'workflow:user:123', + { concurrency: { max: 1 } }, + 1_000 
+ ) + ); expect(second.status).toBe('blocked'); - await harness.limits.release({ - leaseId: first.lease.leaseId, - key: first.lease.key, - holderId: first.lease.holderId, - }); + await harness.limits.release(releaseRequest(first.lease)); - const third = await harness.limits.acquire({ - key: 'workflow:user:123', - holderId: 'holder-b', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const third = await harness.limits.acquire( + acquireRequest( + ownerB, + 'workflow:user:123', + { concurrency: { max: 1 } }, + 1_000 + ) + ); expect(third.status).toBe('acquired'); } finally { await harness.close?.(); @@ -338,12 +409,16 @@ export function createLimitsContractSuite( it('extends lease expiry when heartbeated', async () => { const harness = await createHarness(); try { - const first = await harness.limits.acquire({ - key: 'workflow:user:heartbeat', - holderId: 'holder-a', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 200, - }); + const ownerA = await createLockOwner(harness.storage, 'holder-a'); + const ownerB = await createLockOwner(harness.storage, 'holder-b'); + const first = await harness.limits.acquire( + acquireRequest( + ownerA, + 'workflow:user:heartbeat', + { concurrency: { max: 1 } }, + 200 + ) + ); expect(first.status).toBe('acquired'); if (first.status !== 'acquired') throw new Error('expected acquisition'); @@ -357,12 +432,14 @@ export function createLimitsContractSuite( first.lease.expiresAt?.getTime() ?? 
0 ); - const second = await harness.limits.acquire({ - key: 'workflow:user:heartbeat', - holderId: 'holder-b', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const second = await harness.limits.acquire( + acquireRequest( + ownerB, + 'workflow:user:heartbeat', + { concurrency: { max: 1 } }, + 1_000 + ) + ); expect(second.status).toBe('blocked'); } finally { await harness.close?.(); @@ -372,32 +449,40 @@ export function createLimitsContractSuite( it('reclaims expired leases without manual cleanup', async () => { const harness = await createHarness(); try { - const first = await harness.limits.acquire({ - key: 'workflow:user:expired', - holderId: 'holder-a', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 250, - }); + const ownerA = await createLockOwner(harness.storage, 'holder-a'); + const ownerB = await createLockOwner(harness.storage, 'holder-b'); + const first = await harness.limits.acquire( + acquireRequest( + ownerA, + 'workflow:user:expired', + { concurrency: { max: 1 } }, + 250 + ) + ); expect(first.status).toBe('acquired'); if (first.status !== 'acquired') throw new Error('expected acquisition'); - const second = await harness.limits.acquire({ - key: 'workflow:user:expired', - holderId: 'holder-b', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const second = await harness.limits.acquire( + acquireRequest( + ownerB, + 'workflow:user:expired', + { concurrency: { max: 1 } }, + 1_000 + ) + ); expect(second.status).toBe('blocked'); await sleep(400); - const third = await harness.limits.acquire({ - key: 'workflow:user:expired', - holderId: 'holder-b', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const third = await harness.limits.acquire( + acquireRequest( + ownerB, + 'workflow:user:expired', + { concurrency: { max: 1 } }, + 1_000 + ) + ); expect(third.status).toBe('acquired'); } finally { await harness.close?.(); @@ -407,27 +492,32 @@ export function createLimitsContractSuite( 
it('reuses an existing lease for the same holder', async () => { const harness = await createHarness(); try { - const first = await harness.limits.acquire({ - key: 'workflow:user:reacquire', - holderId: 'holder-a', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const ownerA = await createLockOwner(harness.storage, 'holder-a'); + const first = await harness.limits.acquire( + acquireRequest( + ownerA, + 'workflow:user:reacquire', + { concurrency: { max: 1 } }, + 1_000 + ) + ); expect(first.status).toBe('acquired'); if (first.status !== 'acquired') throw new Error('expected acquisition'); - const second = await harness.limits.acquire({ - key: 'workflow:user:reacquire', - holderId: 'holder-a', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const second = await harness.limits.acquire( + acquireRequest( + ownerA, + 'workflow:user:reacquire', + { concurrency: { max: 1 } }, + 1_000 + ) + ); expect(second).toMatchObject({ status: 'acquired', lease: { leaseId: first.lease.leaseId, - holderId: first.lease.holderId, + lockId: first.lease.lockId, }, }); @@ -440,10 +530,10 @@ export function createLimitsContractSuite( 'workflow:user:reacquire' ); expect( - keyState.leaseHolderIds.filter((holderId) => holderId === 'holder-a') + keyState.leaseHolderIds.filter((lockId) => lockId === ownerA.lockId) ).toHaveLength(1); expect( - keyState.waiterHolderIds.filter((holderId) => holderId === 'holder-a') + keyState.waiterHolderIds.filter((lockId) => lockId === ownerA.lockId) ).toHaveLength(0); } finally { await harness.close?.(); @@ -453,68 +543,75 @@ export function createLimitsContractSuite( it('promotes waiters in FIFO order per key', async () => { const harness = await createHarness(); try { - const first = await harness.limits.acquire({ - key: 'workflow:user:ordered', - holderId: 'holder-a', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const ownerA = await createLockOwner(harness.storage, 'holder-a'); + const 
ownerB = await createLockOwner(harness.storage, 'holder-b'); + const ownerC = await createLockOwner(harness.storage, 'holder-c'); + const first = await harness.limits.acquire( + acquireRequest( + ownerA, + 'workflow:user:ordered', + { concurrency: { max: 1 } }, + 1_000 + ) + ); expect(first.status).toBe('acquired'); if (first.status !== 'acquired') throw new Error('expected acquisition'); - const second = await harness.limits.acquire({ - key: 'workflow:user:ordered', - holderId: 'holder-b', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); - const third = await harness.limits.acquire({ - key: 'workflow:user:ordered', - holderId: 'holder-c', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const second = await harness.limits.acquire( + acquireRequest( + ownerB, + 'workflow:user:ordered', + { concurrency: { max: 1 } }, + 1_000 + ) + ); + const third = await harness.limits.acquire( + acquireRequest( + ownerC, + 'workflow:user:ordered', + { concurrency: { max: 1 } }, + 1_000 + ) + ); expect(second.status).toBe('blocked'); expect(third.status).toBe('blocked'); - await harness.limits.release({ - leaseId: first.lease.leaseId, - holderId: first.lease.holderId, - key: first.lease.key, - }); + await harness.limits.release(releaseRequest(first.lease)); - const promoted = await harness.limits.acquire({ - key: 'workflow:user:ordered', - holderId: 'holder-b', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); - const stillWaiting = await harness.limits.acquire({ - key: 'workflow:user:ordered', - holderId: 'holder-c', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const promoted = await harness.limits.acquire( + acquireRequest( + ownerB, + 'workflow:user:ordered', + { concurrency: { max: 1 } }, + 1_000 + ) + ); + const stillWaiting = await harness.limits.acquire( + acquireRequest( + ownerC, + 'workflow:user:ordered', + { concurrency: { max: 1 } }, + 1_000 + ) + ); 
expect(promoted.status).toBe('acquired'); expect(stillWaiting.status).toBe('blocked'); if (promoted.status !== 'acquired') throw new Error('expected waiter-b promotion'); - await harness.limits.release({ - leaseId: promoted.lease.leaseId, - holderId: promoted.lease.holderId, - key: promoted.lease.key, - }); + await harness.limits.release(releaseRequest(promoted.lease)); - const thirdPromoted = await harness.limits.acquire({ - key: 'workflow:user:ordered', - holderId: 'holder-c', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const thirdPromoted = await harness.limits.acquire( + acquireRequest( + ownerC, + 'workflow:user:ordered', + { concurrency: { max: 1 } }, + 1_000 + ) + ); expect(thirdPromoted.status).toBe('acquired'); } finally { @@ -544,126 +641,57 @@ export function createLimitsContractSuite( eventType: 'run_started', specVersion: SPEC_VERSION_CURRENT, }); - - const first = await harness.limits.acquire({ - key: 'workflow:user:skip-dead-workflow', - holderId: 'holder-a', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 5_000, - }); + const liveOwner = { + lockId: createTestLockId(liveRun.runId, 0), + runId: liveRun.runId, + lockIndex: 0, + }; + const deadOwner = { + lockId: createTestLockId(deadRun.runId, 0), + runId: deadRun.runId, + lockIndex: 0, + }; + const ownerA = await createLockOwner(harness.storage, 'holder-a'); + + const first = await harness.limits.acquire( + acquireRequest( + ownerA, + 'workflow:user:skip-dead-workflow', + { concurrency: { max: 1 } }, + 5_000 + ) + ); expect(first.status).toBe('acquired'); if (first.status !== 'acquired') throw new Error('expected acquisition'); - await harness.limits.acquire({ - key: 'workflow:user:skip-dead-workflow', - holderId: `wflock_${deadRun.runId}:limitwait_dead`, - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 5_000, - }); - await harness.limits.acquire({ - key: 'workflow:user:skip-dead-workflow', - holderId: `wflock_${liveRun.runId}:limitwait_live`, - 
definition: { concurrency: { max: 1 } }, - leaseTtlMs: 5_000, - }); - - await harness.limits.release({ - leaseId: first.lease.leaseId, - holderId: first.lease.holderId, - key: first.lease.key, - }); - - const promoted = await harness.limits.acquire({ - key: 'workflow:user:skip-dead-workflow', - holderId: `wflock_${liveRun.runId}:limitwait_live`, - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 5_000, - }); - - expect(promoted.status).toBe('acquired'); - } finally { - await harness.close?.(); - } - }); - - it('skips failed step waiters before promotion', async () => { - const harness = await createHarness(); - try { - if (!harness.storage) { - throw new Error('storage is required for step waiter liveness'); - } - - const deadRun = await createRun(harness.storage, 'dead-step-workflow'); - await harness.storage.events.create(deadRun.runId, { - eventType: 'run_started', - specVersion: SPEC_VERSION_CURRENT, - }); - const deadStep = await createStep( - harness.storage, - deadRun.runId, - 'step-dead' + await harness.limits.acquire( + acquireRequest( + deadOwner, + 'workflow:user:skip-dead-workflow', + { concurrency: { max: 1 } }, + 5_000 + ) ); - await harness.storage.events.create(deadRun.runId, { - eventType: 'step_started', - specVersion: SPEC_VERSION_CURRENT, - correlationId: deadStep.stepId, - }); - await harness.storage.events.create(deadRun.runId, { - eventType: 'step_failed', - specVersion: SPEC_VERSION_CURRENT, - correlationId: deadStep.stepId, - eventData: { - error: { name: 'Error', message: 'failed waiter' }, - }, - } as any); - - const liveRun = await createRun(harness.storage, 'live-step-workflow'); - await harness.storage.events.create(liveRun.runId, { - eventType: 'run_started', - specVersion: SPEC_VERSION_CURRENT, - }); - const liveStep = await createStep( - harness.storage, - liveRun.runId, - 'step-live' + await harness.limits.acquire( + acquireRequest( + liveOwner, + 'workflow:user:skip-dead-workflow', + { concurrency: { max: 1 } }, + 5_000 + ) 
); - const first = await harness.limits.acquire({ - key: 'step:skip-dead-step', - holderId: 'holder-a', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 5_000, - }); - expect(first.status).toBe('acquired'); - if (first.status !== 'acquired') - throw new Error('expected acquisition'); - - await harness.limits.acquire({ - key: 'step:skip-dead-step', - holderId: `stplock_${deadRun.runId}:${deadStep.stepId}:0`, - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 5_000, - }); - await harness.limits.acquire({ - key: 'step:skip-dead-step', - holderId: `stplock_${liveRun.runId}:${liveStep.stepId}:0`, - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 5_000, - }); - - await harness.limits.release({ - leaseId: first.lease.leaseId, - holderId: first.lease.holderId, - key: first.lease.key, - }); + await harness.limits.release(releaseRequest(first.lease)); - const promoted = await harness.limits.acquire({ - key: 'step:skip-dead-step', - holderId: `stplock_${liveRun.runId}:${liveStep.stepId}:0`, - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 5_000, - }); + const promoted = await harness.limits.acquire( + acquireRequest( + liveOwner, + 'workflow:user:skip-dead-workflow', + { concurrency: { max: 1 } }, + 5_000 + ) + ); expect(promoted.status).toBe('acquired'); } finally { @@ -675,30 +703,26 @@ export function createLimitsContractSuite( const harness = await createHarness(); try { const key = 'workflow:user:replay'; - const blockedHolderId = 'wflock_wrun_replay:corr_replay:holder_replay'; + const ownerA = await createLockOwner(harness.storage, 'holder-a'); + const replayOwner = await createLockOwner( + harness.storage, + 'holder-replay' + ); + const blockedLockId = replayOwner.lockId; - const first = await harness.limits.acquire({ - key, - holderId: 'holder-a', - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const first = await harness.limits.acquire( + acquireRequest(ownerA, key, { concurrency: { max: 1 } }, 1_000) + ); 
expect(first.status).toBe('acquired'); if (first.status !== 'acquired') throw new Error('expected acquisition'); - const blockedA = await harness.limits.acquire({ - key, - holderId: blockedHolderId, - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); - const blockedB = await harness.limits.acquire({ - key, - holderId: blockedHolderId, - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const blockedA = await harness.limits.acquire( + acquireRequest(replayOwner, key, { concurrency: { max: 1 } }, 1_000) + ); + const blockedB = await harness.limits.acquire( + acquireRequest(replayOwner, key, { concurrency: { max: 1 } }, 1_000) + ); expect(blockedA.status).toBe('blocked'); expect(blockedB.status).toBe('blocked'); @@ -706,27 +730,20 @@ export function createLimitsContractSuite( const blockedState = await harness.inspectKeyState(key); expect( blockedState.waiterHolderIds.filter( - (holderId) => holderId === blockedHolderId + (lockId) => lockId === blockedLockId ) ).toHaveLength(1); expect( blockedState.leaseHolderIds.filter( - (holderId) => holderId === blockedHolderId + (lockId) => lockId === blockedLockId ) ).toHaveLength(0); - await harness.limits.release({ - leaseId: first.lease.leaseId, - holderId: first.lease.holderId, - key: first.lease.key, - }); + await harness.limits.release(releaseRequest(first.lease)); - const acquired = await harness.limits.acquire({ - key, - holderId: blockedHolderId, - definition: { concurrency: { max: 1 } }, - leaseTtlMs: 1_000, - }); + const acquired = await harness.limits.acquire( + acquireRequest(replayOwner, key, { concurrency: { max: 1 } }, 1_000) + ); expect(acquired.status).toBe('acquired'); if (acquired.status !== 'acquired') throw new Error('expected replayed holder acquisition'); @@ -734,12 +751,12 @@ export function createLimitsContractSuite( const acquiredState = await harness.inspectKeyState(key); expect( acquiredState.waiterHolderIds.filter( - (holderId) => holderId === blockedHolderId + 
(lockId) => lockId === blockedLockId ) ).toHaveLength(0); expect( acquiredState.leaseHolderIds.filter( - (holderId) => holderId === blockedHolderId + (lockId) => lockId === blockedLockId ) ).toHaveLength(1); } finally { diff --git a/packages/world-testing/src/limits-runtime.ts b/packages/world-testing/src/limits-runtime.ts index 4627023ba9..807033e712 100644 --- a/packages/world-testing/src/limits-runtime.ts +++ b/packages/world-testing/src/limits-runtime.ts @@ -3,11 +3,11 @@ import { describe, expect, it } from 'vitest'; type WorkflowLockContentionResult = { workflowLockAcquiredAt: number; workflowLockReleasedAt: number; - stepLockAcquiredAt: number; - stepLockReleasedAt: number; + stepCallLockAcquiredAt: number; + stepCallLockReleasedAt: number; }; -type StepLockNoRetriesResult = { +type LockedStepCallResult = { label: string; key?: string; attempt: number; @@ -28,34 +28,24 @@ type WorkflowRateLimitResult = { periodMs: number; }; -type WorkflowLeakedLockResult = { +type LeakedLockResult = { label: string; key: string; leaseTtlMs: number; - workflowLockAcquiredAt: number; - workflowCompletedAt: number; -}; - -type StepLeakedLockResult = { - label: string; - key: string; - leaseTtlMs: number; - stepLockAcquiredAt: number; + lockAcquiredAt: number; workflowCompletedAt: number; }; -type MidStepLockResult = { - label: string; +type WorkflowMultiStepScopeResult = { key: string; - attempt: number; - lockAcquiredAt: number; - preLockEffects: number; - postLockEffects: number; - trace: string[]; + workflowLockAcquiredAt: number; + firstStepCompletedAt: number; + secondStepCompletedAt: number; + workflowLockReleasedAt: number; }; export interface LimitsRuntimeHarness { - runWorkflowWithWorkflowAndStepLocks(userId: string): Promise<{ + runWorkflowWithScopedLocks(userId: string): Promise<{ workflowKey: string; dbKey: string; aiKey: string; @@ -65,12 +55,12 @@ export interface LimitsRuntimeHarness { userId: string, holdMs: number ): Promise<[WorkflowLockContentionResult, 
WorkflowLockContentionResult]>; - runStepLockNoRetriesContention( - userId: string, - holdMs: number - ): Promise< - [StepLockNoRetriesResult, StepLockNoRetriesResult, StepLockNoRetriesResult] - >; + runLockedStepCallContention( + key: string, + holdMs: number, + labelA?: string, + labelB?: string + ): Promise<[LockedStepCallResult, LockedStepCallResult]>; runWorkflowLockAcrossSuspension( userId: string, holdMs: number @@ -78,11 +68,11 @@ export interface LimitsRuntimeHarness { runWorkflowExpiredLeaseRecovery( userId: string, leaseTtlMs: number - ): Promise<[WorkflowLeakedLockResult, WorkflowOnlyLockResult]>; - runStepExpiredLeaseRecovery( + ): Promise<[LeakedLockResult, WorkflowOnlyLockResult]>; + runLeakedKeyExpiredLeaseRecovery( userId: string, leaseTtlMs: number - ): Promise<[StepLeakedLockResult, StepLockNoRetriesResult]>; + ): Promise<[LeakedLockResult, LockedStepCallResult]>; runWorkflowMixedLimitContention( userId: string, holdMs: number, @@ -107,16 +97,15 @@ export interface LimitsRuntimeHarness { ): Promise<[WorkflowOnlyLockResult, WorkflowOnlyLockResult]>; runIndependentStepKeys( holdMs: number - ): Promise<[StepLockNoRetriesResult, StepLockNoRetriesResult]>; + ): Promise<[LockedStepCallResult, LockedStepCallResult]>; runBlockedWaiterWithUnrelatedWorkflow(holdMs: number): Promise<{ holder: WorkflowOnlyLockResult; waiter: WorkflowOnlyLockResult; unrelated: WorkflowOnlyLockResult; }>; - runMidStepLockContract(holdMs: number): Promise<{ - holder: StepLockNoRetriesResult; - waiter: MidStepLockResult; - }>; + runWorkflowSingleLockAcrossMultipleSteps( + holdMs: number + ): Promise; } export function createLimitsRuntimeSuite( @@ -124,10 +113,10 @@ export function createLimitsRuntimeSuite( createHarness: () => Promise ) { describe(name, () => { - it('runs workflow and step locks end-to-end', async () => { + it('runs locks around individual step calls end-to-end', async () => { const harness = await createHarness(); const userId = 'shared-user'; - const result = 
await harness.runWorkflowWithWorkflowAndStepLocks(userId); + const result = await harness.runWorkflowWithScopedLocks(userId); expect(result).toMatchObject({ workflowKey: `workflow:user:${userId}`, @@ -137,7 +126,7 @@ export function createLimitsRuntimeSuite( }); }); - it('serializes workflow and step admission under contention', async () => { + it('serializes workflow locks and locks around step calls under contention', async () => { const harness = await createHarness(); const [resultA, resultB] = await harness.runWorkflowLockContention( 'shared-user', @@ -147,12 +136,12 @@ export function createLimitsRuntimeSuite( expect(resultB.workflowLockAcquiredAt).toBeGreaterThanOrEqual( resultA.workflowLockReleasedAt ); - expect(resultB.stepLockAcquiredAt).toBeGreaterThanOrEqual( - resultA.stepLockReleasedAt + expect(resultB.stepCallLockAcquiredAt).toBeGreaterThanOrEqual( + resultA.stepCallLockReleasedAt ); }); - it('wakes promoted workflow and step waiters promptly', async () => { + it('wakes promoted workflow and step-call lock waiters promptly', async () => { const harness = await createHarness(); const [resultA, resultB] = await harness.runWorkflowLockContention( 'shared-user', @@ -163,28 +152,23 @@ export function createLimitsRuntimeSuite( resultB.workflowLockAcquiredAt - resultA.workflowLockReleasedAt ).toBeLessThan(4_000); expect( - resultB.stepLockAcquiredAt - resultA.stepLockReleasedAt + resultB.stepCallLockAcquiredAt - resultA.stepCallLockReleasedAt ).toBeLessThan(4_000); }); - it('does not consume retries while blocked on a top-of-step lock', async () => { + it('can hold one workflow lock across multiple steps in the same scope', async () => { const harness = await createHarness(); - const [resultA, resultB, resultC] = - await harness.runStepLockNoRetriesContention('shared-user', 750); - const [firstResult, secondResult, thirdResult] = [ - resultA, - resultB, - resultC, - ].sort((left, right) => left.acquiredAt - right.acquiredAt); - - 
expect(resultA.attempt).toBe(1); - expect(resultB.attempt).toBe(1); - expect(resultC.attempt).toBe(1); - expect(secondResult.acquiredAt).toBeGreaterThanOrEqual( - firstResult.releasedAt + const result = + await harness.runWorkflowSingleLockAcrossMultipleSteps(400); + + expect(result.firstStepCompletedAt).toBeGreaterThanOrEqual( + result.workflowLockAcquiredAt + ); + expect(result.secondStepCompletedAt).toBeGreaterThanOrEqual( + result.firstStepCompletedAt ); - expect(thirdResult.acquiredAt).toBeGreaterThanOrEqual( - secondResult.releasedAt + expect(result.workflowLockReleasedAt).toBeGreaterThanOrEqual( + result.secondStepCompletedAt ); }); @@ -203,7 +187,7 @@ export function createLimitsRuntimeSuite( ).toBeLessThan(4_000); }); - it('reclaims expired leaked workflow leases without manual cleanup', async () => { + it('reclaims expired leaked workflow locks without manual cleanup', async () => { const harness = await createHarness(); const leaseTtlMs = 1_250; const [resultA, resultB] = await harness.runWorkflowExpiredLeaseRecovery( @@ -215,15 +199,15 @@ export function createLimitsRuntimeSuite( resultA.workflowCompletedAt ); expect( - resultB.workflowLockAcquiredAt - resultA.workflowLockAcquiredAt + resultB.workflowLockAcquiredAt - resultA.lockAcquiredAt ).toBeGreaterThanOrEqual(leaseTtlMs - 100); }); - it('reclaims expired leaked step leases without manual cleanup', async () => { + it('reclaims expired leaked locks on arbitrary keys without manual cleanup', async () => { const harness = await createHarness(); const leaseTtlMs = 1_250; - const [resultA, resultB] = await harness.runStepExpiredLeaseRecovery( - 'expired-step-user', + const [resultA, resultB] = await harness.runLeakedKeyExpiredLeaseRecovery( + 'expired-key-user', leaseTtlMs ); @@ -231,7 +215,7 @@ export function createLimitsRuntimeSuite( resultA.workflowCompletedAt ); expect( - resultB.acquiredAt - resultA.stepLockAcquiredAt + resultB.acquiredAt - resultA.lockAcquiredAt ).toBeGreaterThanOrEqual(leaseTtlMs 
- 100); }); @@ -294,7 +278,7 @@ export function createLimitsRuntimeSuite( ); }); - it('does not block unrelated step keys', async () => { + it('does not block unrelated step-like keys', async () => { const harness = await createHarness(); const [resultA, resultB] = await harness.runIndependentStepKeys(1_000); @@ -316,20 +300,5 @@ export function createLimitsRuntimeSuite( ); } ); - - it('replays a mid-step lock at the acquire boundary without duplicating post-lock effects', async () => { - const harness = await createHarness(); - const { holder, waiter } = await harness.runMidStepLockContract(1_500); - - expect(waiter.lockAcquiredAt).toBeGreaterThanOrEqual(holder.releasedAt); - expect(waiter.preLockEffects).toBe(2); - expect(waiter.postLockEffects).toBe(1); - expect(waiter.trace.map((event) => event.split(':')[0])).toEqual([ - 'pre', - 'pre', - 'lock', - 'post', - ]); - }); }); } diff --git a/packages/world-vercel/src/limits.test.ts b/packages/world-vercel/src/limits.test.ts index 2afdf8af80..ff6bf0151a 100644 --- a/packages/world-vercel/src/limits.test.ts +++ b/packages/world-vercel/src/limits.test.ts @@ -1,19 +1,41 @@ -import { describe, it } from 'vitest'; +import { describe, expect, it } from 'vitest'; +import { LIMITS_NOT_IMPLEMENTED_MESSAGE } from '@workflow/world'; +import { createVercelWorld } from './index.js'; +import { createLimits } from './limits.js'; describe('vercel world limits', () => { - it.fails('exposes the required limits namespace', () => { - throw new Error('TODO: implement'); - }); + it('exposes the required limits namespace', () => { + const limits = createLimits(); - it.fails('enforces per-key concurrency limits', () => { - throw new Error('TODO: implement'); + expect(limits).toMatchObject({ + acquire: expect.any(Function), + release: expect.any(Function), + heartbeat: expect.any(Function), + }); }); - it.fails('returns a retry path when rate limits block acquisition', () => { - throw new Error('TODO: implement'); - }); + it('keeps limits 
unimplemented until lock support exists', async () => { + const world = createVercelWorld(); + + await expect( + world.limits.acquire({ + key: 'workflow:user:test', + runId: 'wrun_test', + lockIndex: 0, + definition: { concurrency: { max: 1 } }, + }) + ).rejects.toThrow(LIMITS_NOT_IMPLEMENTED_MESSAGE); + + await expect( + world.limits.release({ + leaseId: 'lease_test', + }) + ).rejects.toThrow(LIMITS_NOT_IMPLEMENTED_MESSAGE); - it.fails('restores capacity when a lease is released or expires', () => { - throw new Error('TODO: implement'); + await expect( + world.limits.heartbeat({ + leaseId: 'lease_test', + }) + ).rejects.toThrow(LIMITS_NOT_IMPLEMENTED_MESSAGE); }); }); diff --git a/packages/world/FLOW_LIMITS.md b/packages/world/FLOW_LIMITS.md index 07cc69168f..b2d30b6376 100644 --- a/packages/world/FLOW_LIMITS.md +++ b/packages/world/FLOW_LIMITS.md @@ -12,15 +12,16 @@ implementations. - Postgres implements the same limits semantics with PostgreSQL-backed leases, rate tokens, durable waiters, and durable queue wake-up. - Vercel still exposes `limits` as a stub. -- The Next.js Turbopack workbench has shared E2E coverage for workflow and step - locks on implemented worlds. +- The Next.js Turbopack workbench has shared E2E coverage for `lock()` used + with `await using`, including locks that wrap individual step calls or + groups of steps. ## Goals - Support keyed concurrency limits. - Support keyed rate limits. - Allow concurrency and rate to be colocated in one interface. -- Support workflow-scoped limits and step-scoped limits. +- Support locks whose lifetime follows normal `await using` lexical scope. - Make crash recovery possible through leases with TTL/expiry. - Keep worker throughput controls separate from business-level flow limits. @@ -28,8 +29,10 @@ implementations. - `worker concurrency`: backend throughput setting for queue/job processing. - `workflow limit`: admission control for workflow runs that share a key. 
-- `step limit`: execution control for a specific step/resource key. -- `lease`: durable record that a workflow or step currently occupies capacity for a key. +- `scoped resource key`: any user-defined key acquired from workflow scope to + protect one step call, multiple step calls, or a whole workflow section. +- `lease`: durable record that a workflow currently occupies capacity for a + key. ## Shared Contract vs World-Specific Behavior @@ -42,7 +45,7 @@ semantics across implemented worlds. That shared contract includes: - same-holder lease reuse - serialization of concurrent acquires for a single key - FIFO waiter promotion per key -- pruning cancelled workflow waiters and failed/completed step waiters +- pruning cancelled workflow waiters - blocked acquisitions not consuming execution concurrency - prompt wake-up with delayed fallback replay @@ -87,6 +90,10 @@ Limits are modeled as leases with TTL/expiry so capacity can be recovered after: Normal completion should dispose/release the lease explicitly. Crash recovery comes from lease expiry plus future reclaim logic. +The default workflow lock TTL should be high enough to cover normal suspended +execution without making users tune it eagerly. The current runtime default is +24 hours unless the caller overrides `leaseTtlMs`. + ### 3. Keep worker concurrency separate from flow limits Current world-level concurrency settings are infrastructure controls, not @@ -127,7 +134,7 @@ Important distinction: Releasing a lease should free concurrency capacity immediately, but it should not restore rate capacity until the associated rate usage entry expires. -### 5. Use one `lock()` API in both workflows and steps +### 5. Use one `lock()` API from workflow scope We want one user-facing primitive: @@ -135,37 +142,14 @@ We want one user-facing primitive: await using lease = await lock({ ... }); ``` -But the runtime meaning differs by context. - -#### In workflows - -`lock()` means workflow admission / workflow-scope ownership. 
+`lock()` means workflow code acquires ownership of a keyed lease. If placed at the top of a workflow, it should hold the lease across the logical workflow scope, even though the workflow may suspend and resume many times. -#### In steps - -`lock()` acts like a step gate. - -The current behavior is: - -- declare the limit at the top of the step when possible -- the runtime treats a blocked acquisition as step-boundary admission failure -- the step does not keep executing user code while waiting for capacity -- the step is re-queued and retried after promotion or timeout -- lease is disposed automatically when the step attempt completes - -If `lock()` is called in the middle of a step, the intended contract is: - -- the current attempt stops at the blocked `lock()` call -- the step is deferred and re-queued rather than polling in-process -- code before the blocked `lock()` may replay on the next attempt -- code after the `lock()` runs only after the lock is actually acquired - -This means zero-attempt semantics are still strongest when `lock()` is used as -a top-of-step admission gate, but mid-step `lock()` is now part of the shared -runtime contract rather than unsupported behavior. +Steps themselves do not acquire locks directly. To limit one step category or a +group of steps, the workflow acquires the lock and then calls those steps while +the lease is held. ### 6. `await using` is the preferred user-facing shape @@ -175,8 +159,8 @@ The preferred API is explicit resource management: await using lease = await lock({ ... }); ``` -This gives automatic cleanup on scope exit and reads well for both workflow -scopes and step scopes. +This gives automatic cleanup on scope exit and reads well for critical sections +that may include one or many step calls. 
For manual early cleanup, the user-facing `LockHandle` should expose: @@ -185,7 +169,7 @@ For manual early cleanup, the user-facing `LockHandle` should expose: The backend-facing world contract can continue to use `release(...)` internally. -### 7. Workflow-scoped locks are logical-scope locks, not request-lifetime locks +### 7. Locks follow logical scope, not request lifetime For workflows, `await using` must be tied to the logical workflow scope across: @@ -197,21 +181,19 @@ For workflows, `await using` must be tied to the logical workflow scope across: The lease must not be disposed merely because one host process invocation ends. -### 8. Prefer step-boundary admission for deadlock avoidance +### 8. Keep admission decisions in workflow code Current preferred model: -- workflow-level limits may be held by a run -- blocked step-level limits return control to the runtime at the step boundary -- step-level limits are short-lived -- step execution should not wait on workflow-level locks +- workflow code acquires and releases limits +- steps execute inside whatever critical section the workflow establishes +- step code never waits on a separate lock of its own -This keeps the dependency direction one-way: +This keeps the dependency direction simple: -- workflow admission -> step admission -> step execution +- workflow admission / critical section -> step execution -That avoids the classic cycle where one workflow holds a workflow lock and -another holds a step lock and each waits on the other. +That avoids needing separate workflow-lock and step-lock runtime semantics. ### 9. Waiters are FIFO per key @@ -237,8 +219,6 @@ Blocked flow limits and worker concurrency are intentionally separate. 
For implemented worlds: - blocked workflows are suspended and re-queued, not left running on a worker -- blocked steps exit the current attempt and are re-queued instead of polling in - a live worker slot - worker slots are free to service unrelated work while the blocked execution is waiting to be retried or promoted @@ -256,8 +236,7 @@ Current behavior: - leases, rate tokens, and waiters live in world-owned limit state - promotion decisions are made from that limit state -- when a waiter is promoted, the runtime is woken by enqueuing the appropriate - workflow or step job +- when a waiter is promoted, the runtime is woken by enqueuing the workflow job - workflows also keep a delayed replay fallback so progress is still possible if an immediate wake-up is missed @@ -271,26 +250,24 @@ survival is not guaranteed after process loss. For v1, the intended semantics are: - workflow locks count admitted, in-flight workflows for a key -- step locks count or rate-limit specific step execution categories +- workflow-held keys may be used to serialize or rate-limit specific step categories - worker concurrency remains a separate infrastructure throttle More concretely: -- if a workflow acquires a workflow-scoped lock and then sleeps for 10 minutes, +- if a workflow acquires a lock and then sleeps for 10 minutes, it still counts as active for that workflow key during the sleep -- if a workflow is parked waiting for a step-level limit, it still counts as - active for its workflow-level lock -- a step-level lock should conceptually be an admission gate for the step - attempt, not a second workflow-level lock, even when the `lock()` call - appears in the middle of user code -- step-level rate limits should consume rate capacity when the step starts, and - that rate usage should remain counted until the window expires even if the - step releases its lease quickly +- if a workflow acquires a lock for a step-like key such as `step:db:cheap`, + that key remains occupied until the 
workflow releases it, even if the + protected work is just one step call or a small group of step calls +- rate-limited step-like keys still consume rate capacity when the workflow + acquires that key, and that usage remains counted until the window expires + even if the workflow releases the lease quickly For the current local implementation specifically: -- workflow and step locks now follow the same live-process waiter/fairness - semantics as Postgres +- workflow locks now follow the same live-process waiter/fairness semantics as + Postgres - the queue remains in-memory, so queued wake-ups are not durable across process loss @@ -318,31 +295,38 @@ With intended usage like: ```ts async function cheapDbStep(userId: string) { 'use step'; - await using _dbLimit = await lock({ - key: 'step:db:cheap', - concurrency: { max: 20 }, - }); return { userId, prompt: `profile:${userId}` }; } async function expensiveAIStep(prompt: string) { 'use step'; - await using _aiLimit = await lock({ - key: 'step:provider:openai', - rate: { count: 10, periodMs: 60_000 }, - }); return `summary:${prompt}`; } -export async function workflowWithWorkflowAndStepLocks(userId: string) { +export async function workflowWithScopedLocks(userId: string) { 'use workflow'; await using userLimit = await lock({ key: `workflow:user:${userId}`, concurrency: { max: 2 }, }); - const row = await cheapDbStep(userId); - const summary = await expensiveAIStep(row.prompt); + let row: Awaited>; + { + await using _dbLimit = await lock({ + key: 'step:db:cheap', + concurrency: { max: 20 }, + }); + row = await cheapDbStep(userId); + } + + let summary: Awaited>; + { + await using _aiLimit = await lock({ + key: 'step:provider:openai', + rate: { count: 10, periodMs: 60_000 }, + }); + summary = await expensiveAIStep(row.prompt); + } return { row, summary }; } ``` @@ -372,8 +356,8 @@ Two more practical clarifications: ## Open Questions - Whether workflow-level locks should always be whole-run admission locks or - also 
support narrower workflow-scoped blocks. + also support narrower lexical scopes within workflow code. - Whether `heartbeat()` should remain user-visible or become mostly internal. -- Whether step limits should only be expressed through `lock()` or also through - step metadata/config sugar. +- Whether `lock()` should eventually grow optional metadata or + config sugar for common per-step resource keys. - Exact event-log representation for acquire/block/dispose transitions. diff --git a/packages/world/src/events.ts b/packages/world/src/events.ts index eac141c1f7..2965906f7b 100644 --- a/packages/world/src/events.ts +++ b/packages/world/src/events.ts @@ -1,5 +1,4 @@ import { z } from 'zod'; -import { LimitAcquireRequestSchema } from './limits.js'; import { SerializedDataSchema } from './serialization.js'; import type { PaginationOptions, ResolveData } from './shared.js'; @@ -65,7 +64,6 @@ export const EventTypeSchema = z.enum([ 'step_created', 'step_completed', 'step_failed', - 'step_deferred', 'step_retrying', 'step_started', // Hook lifecycle events @@ -111,19 +109,6 @@ const StepFailedEventSchema = BaseEventSchema.extend({ }), }); -/** - * Event created when a step is blocked on admission and should be retried - * without counting the blocked attempt against maxRetries. - */ -const StepDeferredEventSchema = BaseEventSchema.extend({ - eventType: z.literal('step_deferred'), - correlationId: z.string(), - eventData: z.object({ - retryAfter: z.coerce.date().optional(), - lockRequest: LimitAcquireRequestSchema.optional(), - }), -}); - /** * Event created when a step fails and will be retried. * Sets the step status back to 'pending' and records the error. 
@@ -287,7 +272,6 @@ export const CreateEventSchema = z.discriminatedUnion('eventType', [ StepCreatedEventSchema, StepCompletedEventSchema, StepFailedEventSchema, - StepDeferredEventSchema, StepRetryingEventSchema, StepStartedEventSchema, // Hook lifecycle events @@ -312,7 +296,6 @@ const AllEventsSchema = z.discriminatedUnion('eventType', [ StepCreatedEventSchema, StepCompletedEventSchema, StepFailedEventSchema, - StepDeferredEventSchema, StepRetryingEventSchema, StepStartedEventSchema, // Hook lifecycle events diff --git a/packages/world/src/index.ts b/packages/world/src/index.ts index fd12d63d94..5e8f73d111 100644 --- a/packages/world/src/index.ts +++ b/packages/world/src/index.ts @@ -12,6 +12,8 @@ export { HookSchema } from './hooks.js'; export type * from './interfaces.js'; export type * from './limits.js'; export { + createLockId, + createLockWakeCorrelationId, createLimitsNotImplementedError, LimitAcquireAcquiredResultSchema, LimitAcquireBlockedResultSchema, @@ -24,9 +26,11 @@ export { LimitHeartbeatRequestSchema, LimitKeySchema, LimitLeaseSchema, + LimitLockIdSchema, LimitRateSchema, LimitReleaseRequestSchema, LIMITS_NOT_IMPLEMENTED_MESSAGE, + parseLockId, } from './limits.js'; export type * from './queue.js'; export { diff --git a/packages/world/src/limits.ts b/packages/world/src/limits.ts index ec155b2d8d..495f29a84f 100644 --- a/packages/world/src/limits.ts +++ b/packages/world/src/limits.ts @@ -34,10 +34,44 @@ export const LimitDefinitionSchema = z ); export type LimitDefinition = z.infer; +export const LimitLockIdSchema = z.string().min(1); +export type LimitLockId = z.infer; + +export function createLockId(runId: string, lockIndex: number): LimitLockId { + return `${runId}:${lockIndex}`; +} + +export function parseLockId( + lockId: string +): { runId: string; lockIndex: number } | null { + const separatorIndex = lockId.lastIndexOf(':'); + if (separatorIndex <= 0 || separatorIndex === lockId.length - 1) { + return null; + } + + const runId = 
lockId.slice(0, separatorIndex); + const rawLockIndex = lockId.slice(separatorIndex + 1); + const lockIndex = Number.parseInt(rawLockIndex, 10); + if (!Number.isInteger(lockIndex) || lockIndex < 0) { + return null; + } + + return { runId, lockIndex }; +} + +export function createLockWakeCorrelationId( + runId: string, + lockIndex: number +): string { + return `wflock_wait_${runId}:${lockIndex}`; +} + export const LimitLeaseSchema = z.object({ leaseId: z.string().min(1), key: LimitKeySchema, - holderId: z.string().min(1), + lockId: LimitLockIdSchema, + runId: z.string().min(1), + lockIndex: z.number().int().nonnegative(), acquiredAt: z.coerce.date(), expiresAt: z.coerce.date().optional(), definition: LimitDefinitionSchema, @@ -46,7 +80,8 @@ export type LimitLease = z.infer; export const LimitAcquireRequestSchema = z.object({ key: LimitKeySchema, - holderId: z.string().min(1), + runId: z.string().min(1), + lockIndex: z.number().int().nonnegative(), definition: LimitDefinitionSchema, leaseTtlMs: z.number().int().positive().optional(), }); @@ -88,7 +123,7 @@ export type LimitAcquireResult = z.infer; export const LimitReleaseRequestSchema = z.object({ leaseId: z.string().min(1), key: LimitKeySchema.optional(), - holderId: z.string().min(1).optional(), + lockId: LimitLockIdSchema.optional(), }); export type LimitReleaseRequest = z.infer; diff --git a/workbench/example/workflows/99_e2e.ts b/workbench/example/workflows/99_e2e.ts index b85e49cf3e..1c9bd2ca0a 100644 --- a/workbench/example/workflows/99_e2e.ts +++ b/workbench/example/workflows/99_e2e.ts @@ -216,13 +216,6 @@ export async function parallelSleepWorkflow() { async function cheapDbStep(userId: string) { 'use step'; - - await using _dbLimit = await lock({ - key: 'step:db:cheap', - concurrency: { max: 20 }, - leaseTtlMs: 30_000, - }); - return { userId, prompt: `profile:${userId}`, @@ -231,17 +224,10 @@ async function cheapDbStep(userId: string) { async function expensiveAIStep(prompt: string) { 'use step'; - - 
await using _aiLimit = await lock({ - key: 'step:provider:openai', - rate: { count: 10, periodMs: 60_000 }, - leaseTtlMs: 30_000, - }); - return `summary:${prompt}`; } -export async function workflowWithWorkflowAndStepLocks(userId = 'user-123') { +export async function workflowWithScopedLocks(userId = 'user-123') { 'use workflow'; await using userLimit = await lock({ @@ -250,8 +236,25 @@ export async function workflowWithWorkflowAndStepLocks(userId = 'user-123') { leaseTtlMs: 30_000, }); - const row = await cheapDbStep(userId); - const summary = await expensiveAIStep(row.prompt); + let row: Awaited>; + { + await using _dbLimit = await lock({ + key: 'step:db:cheap', + concurrency: { max: 20 }, + leaseTtlMs: 30_000, + }); + row = await cheapDbStep(userId); + } + + let summary: Awaited>; + { + await using _aiLimit = await lock({ + key: 'step:provider:openai', + rate: { count: 10, periodMs: 60_000 }, + leaseTtlMs: 30_000, + }); + summary = await expensiveAIStep(row.prompt); + } return { workflowKey: userLimit.key, @@ -261,61 +264,6 @@ export async function workflowWithWorkflowAndStepLocks(userId = 'user-123') { }; } -type LimitTraceState = { - events: string[]; -}; - -function sanitizeLimitTraceToken(traceToken: string) { - return traceToken.replace(/[^a-zA-Z0-9_-]/g, '_'); -} - -async function getLimitTracePath(traceToken: string) { - const path = await import('node:path'); - return path.join( - process.cwd(), - '.workflow-e2e', - `limits-${sanitizeLimitTraceToken(traceToken)}.json` - ); -} - -async function readLimitTraceState( - traceToken: string -): Promise { - const { mkdir, readFile } = await import('node:fs/promises'); - const path = await import('node:path'); - const tracePath = await getLimitTracePath(traceToken); - await mkdir(path.dirname(tracePath), { recursive: true }); - - try { - return JSON.parse(await readFile(tracePath, 'utf8')) as LimitTraceState; - } catch (error) { - if ((error as NodeJS.ErrnoException).code === 'ENOENT') { - return { events: [] 
}; - } - throw error; - } -} - -async function writeLimitTraceState( - traceToken: string, - state: LimitTraceState -) { - const { mkdir, writeFile } = await import('node:fs/promises'); - const path = await import('node:path'); - const tracePath = await getLimitTracePath(traceToken); - await mkdir(path.dirname(tracePath), { recursive: true }); - await writeFile(tracePath, JSON.stringify(state), 'utf8'); -} - -async function appendLimitTraceEvent(traceToken: string, event: string) { - const state = await readLimitTraceState(traceToken); - const nextState = { - events: [...state.events, event], - }; - await writeLimitTraceState(traceToken, nextState); - return nextState.events; -} - async function serializedLimitStep( label: string, holdMs: number, @@ -323,16 +271,9 @@ async function serializedLimitStep( ) { 'use step'; - const stepLock = await lock({ - key, - concurrency: { max: 1 }, - leaseTtlMs: holdMs + 5_000, - }); - const metadata = getStepMetadata(); const acquiredAt = Date.now(); await new Promise((resolve) => setTimeout(resolve, holdMs)); - await stepLock.dispose(); const releasedAt = Date.now(); return { @@ -357,7 +298,15 @@ export async function workflowLockContentionWorkflow( }); const workflowLockAcquiredAt = Date.now(); - const step = await serializedLimitStep(userId, holdMs); + let step: Awaited>; + { + await using _nestedLock = await lock({ + key: 'step:db:serialized', + concurrency: { max: 1 }, + leaseTtlMs: holdMs + 5_000, + }); + step = await serializedLimitStep(userId, holdMs); + } await workflowLock.dispose(); const workflowLockReleasedAt = Date.now(); @@ -365,57 +314,27 @@ export async function workflowLockContentionWorkflow( userId, workflowLockAcquiredAt, workflowLockReleasedAt, - stepLockAcquiredAt: step.acquiredAt, - stepLockReleasedAt: step.releasedAt, - }; -} - -async function stepLockNoRetriesStep( - label: string, - holdMs: number, - key = 'step:db:no-retries' -) { - 'use step'; - - await using _stepLock = await lock({ - key, - 
concurrency: { max: 1 }, - leaseTtlMs: holdMs + 5_000, - }); - - const metadata = getStepMetadata(); - const acquiredAt = Date.now(); - await new Promise((resolve) => setTimeout(resolve, holdMs)); - const releasedAt = Date.now(); - - return { - label, - key, - attempt: metadata.attempt, - acquiredAt, - releasedAt, + stepCallLockAcquiredAt: step.acquiredAt, + stepCallLockReleasedAt: step.releasedAt, }; } -stepLockNoRetriesStep.maxRetries = 0; -export async function stepLockNoRetriesContentionWorkflow( - userId = 'user-123', - holdMs = 750, - label = userId -) { - 'use workflow'; - - return await stepLockNoRetriesStep(label, holdMs); -} - -export async function stepKeyLockContentionWorkflow( +export async function lockedStepCallContentionWorkflow( key = 'step:db:key-contention', holdMs = 750, label = key ) { 'use workflow'; - return await stepLockNoRetriesStep(label, holdMs, key); + { + await using _lock = await lock({ + key, + concurrency: { max: 1 }, + leaseTtlMs: holdMs + 5_000, + }); + + return await serializedLimitStep(label, holdMs, key); + } } ////////////////////////////////////////////////////////// @@ -466,48 +385,34 @@ export async function workflowLeakedLockWorkflow( key: leakedWorkflowLock.key, leaseTtlMs, leakedLeaseId: leakedWorkflowLock.leaseId, - workflowLockAcquiredAt, + lockAcquiredAt: workflowLockAcquiredAt, workflowCompletedAt: Date.now(), }; } -async function leakedStepLockStep( - key: string, - leaseTtlMs: number, - label: string +export async function leakedKeyLockWorkflow( + userId = 'user-123', + leaseTtlMs = 1_250, + label = userId ) { - 'use step'; + 'use workflow'; - const leakedStepLock = await lock({ - key, + const leakedLock = await lock({ + key: `workflow:key:expired:${userId}`, concurrency: { max: 1 }, leaseTtlMs, }); return { label, - key, + key: leakedLock.key, leaseTtlMs, - leakedLeaseId: leakedStepLock.leaseId, - stepLockAcquiredAt: Date.now(), + leakedLeaseId: leakedLock.leaseId, + lockAcquiredAt: Date.now(), 
workflowCompletedAt: Date.now(), }; } -export async function stepLeakedLockWorkflow( - userId = 'user-123', - leaseTtlMs = 1_250, - label = userId -) { - 'use workflow'; - - return await leakedStepLockStep( - `step:db:expired:${userId}`, - leaseTtlMs, - label - ); -} - export async function workflowRateLimitContentionWorkflow( userId = 'user-123', holdMs = 250, @@ -563,42 +468,48 @@ export async function workflowMixedLimitContentionWorkflow( }; } -async function midStepLockStep(key: string, traceToken: string, label: string) { +async function scopedMultiStepStep(label: string, holdMs: number) { 'use step'; - const { attempt } = getStepMetadata(); - await appendLimitTraceEvent(traceToken, `pre:${attempt}`); - - await using _midStepLock = await lock({ - key, - concurrency: { max: 1 }, - leaseTtlMs: 5_000, - }); - - const lockAcquiredAt = Date.now(); - await appendLimitTraceEvent(traceToken, `lock:${attempt}`); - const trace = await appendLimitTraceEvent(traceToken, `post:${attempt}`); - + const metadata = getStepMetadata(); + await new Promise((resolve) => setTimeout(resolve, holdMs)); return { label, - key, - attempt, - lockAcquiredAt, - preLockEffects: trace.filter((event) => event.startsWith('pre:')).length, - postLockEffects: trace.filter((event) => event.startsWith('post:')).length, - trace, + attempt: metadata.attempt, + completedAt: Date.now(), }; } -midStepLockStep.maxRetries = 0; -export async function midStepLockContentionWorkflow( - key = 'step:db:mid-step', - traceToken = 'mid-step', - label = key +export async function singleLockAcrossMultipleStepsWorkflow( + key = 'step:db:batch', + holdMs = 400 ) { 'use workflow'; - return await midStepLockStep(key, traceToken, label); + let workflowLockAcquiredAt: number; + let first: Awaited>; + let second: Awaited>; + let workflowLockReleasedAt: number; + { + await using _lock = await lock({ + key, + concurrency: { max: 1 }, + leaseTtlMs: holdMs * 2 + 5_000, + }); + + workflowLockAcquiredAt = Date.now(); + first = 
await scopedMultiStepStep('first', holdMs); + second = await scopedMultiStepStep('second', holdMs); + workflowLockReleasedAt = Date.now(); + } + + return { + key, + workflowLockAcquiredAt, + firstStepCompletedAt: first.completedAt, + secondStepCompletedAt: second.completedAt, + workflowLockReleasedAt, + }; } //////////////////////////////////////////////////////////