diff --git a/.changeset/config.json b/.changeset/config.json index 3045b28..b0b05fb 100644 --- a/.changeset/config.json +++ b/.changeset/config.json @@ -6,7 +6,7 @@ ], "commit": false, "fixed": [ - ["payloadcms-vectorize", "@payloadcms-vectorize/pg"] + ["payloadcms-vectorize", "@payloadcms-vectorize/pg", "@payloadcms-vectorize/cf"] ], "access": "public", "baseBranch": "main", diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 0000000..5785860 --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,11 @@ +{ + "permissions": { + "allow": [ + "WebFetch(domain:developers.cloudflare.com)" + ], + "additionalDirectories": [ + "/Users/juandominguez/development/payloadcms-vectorize/adapters/cf", + "/Users/juandominguez/development/payloadcms-vectorize/docs" + ] + } +} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a2aef6f..84532f8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -4,7 +4,7 @@ on: push: branches: [main, develop] pull_request: - branches: [main] + branches: [main, split_db_adapter] jobs: typecheck: @@ -119,6 +119,44 @@ jobs: IVFFLATLISTS: 1 TEST_ENV: 1 + test_adapters_cf: + runs-on: ubuntu-latest + + services: + postgres: + image: pgvector/pgvector:pg15 + env: + POSTGRES_PASSWORD: password + POSTGRES_DB: payload_test + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 5433:5432 + + steps: + - uses: actions/checkout@v4 + + - name: Install pnpm + uses: pnpm/action-setup@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: '20' + cache: 'pnpm' + + - name: Install dependencies + run: pnpm install + + - name: Run cf adapter tests + run: pnpm test:adapters:cf + env: + PAYLOAD_SECRET: test-secret-key + TEST_ENV: 1 + test_e2e: runs-on: ubuntu-latest @@ -170,7 +208,7 @@ jobs: test: runs-on: ubuntu-latest - needs: [typecheck, test_int, test_adapters_pg, test_e2e] + needs: [typecheck, test_int, test_adapters_pg, test_adapters_cf, test_e2e] if: always() steps: - name: Check required jobs @@ -178,6 +216,7 @@ jobs: if [ "${{ needs.typecheck.result }}" != "success" ] || \ [ "${{ needs.test_int.result }}" != "success" ] || \ [ "${{ needs.test_adapters_pg.result }}" != "success" ] || \ + [ "${{ needs.test_adapters_cf.result }}" != "success" ] || \ [ "${{ needs.test_e2e.result }}" != "success" ]; then echo "One or more required jobs failed" exit 1 diff --git a/.gitignore b/.gitignore index 7665261..4e8441e 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,7 @@ node_modules/ /build /dist /adapters/pg/dist +/adapters/cf/dist # misc .DS_Store @@ -53,6 +54,5 @@ yarn-error.log* # Secrets */secret -# Agent -.cursor/ -.claude/ \ No newline at end of file +# Cursor +.cursor/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 7e7ab8c..8d30092 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,19 +2,33 @@ All notable changes to this project will be documented in this file. -## 0.7.0 +## 0.7.0-beta.1 - 2026-03-06 ### Breaking Changes -- **Database Adapter Architecture**: The plugin now uses a pluggable database adapter system. You must install a database adapter package (e.g., `@payloadcms-vectorize/pg`) separately from the core plugin. -- **`dbAdapter` option required**: The `payloadcmsVectorize()` plugin now requires a `dbAdapter` option pointing to your adapter's implementation. -- **`similarity` renamed to `score`**: The `VectorSearchResult.similarity` field has been renamed to `score` to be more generic across different distance metrics. +- **`DbAdapter` interface redesigned**: `storeEmbedding` and `deleteEmbeddings` replaced with `storeChunk`, `deleteChunks`, and `hasEmbeddingVersion`. Adapters now own all chunk storage, deletion, and version checking — the core plugin no longer calls `payload.create()` or `payload.delete()` directly for embeddings. +- **New `StoreChunkData` type**: Adapters receive a single data object containing `sourceCollection`, `docId`, `chunkIndex`, `chunkText`, `embeddingVersion`, `embedding`, and `extensionFields`. -### Added +### Improved -- **`@payloadcms-vectorize/pg` package**: PostgreSQL adapter for pgvector, extracted from the core plugin. -- **`DbAdapter` interface**: New interface for implementing custom database adapters. See `adapters/README.md`. -- **`deleteEmbeddings` on `DbAdapter`**: Adapters can now delete vectors when a document is deleted or re-indexed. +- **CF adapter: native Vectorize metadata filtering**: Search now uses Cloudflare Vectorize's native `filter` parameter (applied before topK) for `equals`, `not_equals`, `in`, `notIn`, `greater_than`, `greater_than_equal`, `less_than`, `less_than_equal`. Operators `like`, `contains`, `exists`, and `or` clauses are post-filtered. +- **CF adapter: deterministic vector IDs**: Vectors are now stored with deterministic IDs (`poolName:collection:docId:chunkIndex`), enabling reliable upserts and deletions. +- **CF adapter: metadata on vectors**: All chunk metadata (including extension fields) is stored directly on Vectorize vectors, enabling filtered search without a separate metadata collection. + +### Migration + +Custom `DbAdapter` implementations must update to the new interface: + +```typescript +// Before +storeEmbedding(payload, poolName, collection, docId, embeddingId, embedding) +deleteEmbeddings(payload, poolName, collection, docId) + +// After +storeChunk(payload, poolName, data: StoreChunkData) +deleteChunks(payload, poolName, sourceCollection, docId) +hasEmbeddingVersion(payload, poolName, sourceCollection, docId, embeddingVersion) +``` ## 0.6.0-beta.5 - 2026-02-25 @@ -36,8 +50,9 @@ All notable changes to this project will be documented in this file. ### Added - **`@payloadcms-vectorize/pg` package**: PostgreSQL adapter for pgvector, extracted from the core plugin. +- **`@payloadcms-vectorize/cf` package**: Cloudflare Vectorize adapter for edge-native vector search. - **`DbAdapter` interface**: New interface for implementing custom database adapters. See `adapters/README.md`. -- **`deleteEmbeddings` on `DbAdapter`**: Adapters can now delete vectors when a document is deleted or re-indexed. +- **`deleteEmbeddings` on `DbAdapter`**: Adapters can now delete vectors when a document is deleted or re-indexed. Implemented in both the `pg` and `cf` adapters. - **Adapter documentation**: Added `adapters/README.md` explaining how to create custom adapters. ### Migration diff --git a/adapters/cf/README.md b/adapters/cf/README.md new file mode 100644 index 0000000..08820f0 --- /dev/null +++ b/adapters/cf/README.md @@ -0,0 +1,238 @@ +# @payloadcms-vectorize/cf + +Cloudflare Vectorize adapter for [payloadcms-vectorize](https://github.com/techiejd/payloadcms-vectorize). Enables vector search capabilities using Cloudflare Vectorize. + +## Prerequisites + +- Cloudflare account with Vectorize index configured +- Payload CMS 3.x with any supported database adapter +- Node.js 18+ + +## Installation + +```bash +pnpm add @payloadcms-vectorize/cf payloadcms-vectorize +``` + +## Quick Start + +### 1. Create Vectorize Index + +Create a Vectorize index in your Cloudflare dashboard or via Wrangler: + +```bash +wrangler vectorize create my-vectorize-index --dimensions=384 --metric=cosine +``` + +### 2. Configure the Plugin + +```typescript +import { buildConfig } from 'payload' +import { postgresAdapter } from '@payloadcms/db-postgres' +import { createCloudflareVectorizeIntegration } from '@payloadcms-vectorize/cf' +import payloadcmsVectorize from 'payloadcms-vectorize' + +// Create the integration +const integration = createCloudflareVectorizeIntegration({ + config: { + default: { + dims: 384, // Vector dimensions (must match your embedding model and Vectorize index) + }, + }, + binding: env.VECTORIZE, // Cloudflare Vectorize binding +}) + +export default buildConfig({ + // ... your existing config + db: postgresAdapter({ + pool: { + connectionString: process.env.DATABASE_URL, + }, + }), + plugins: [ + payloadcmsVectorize({ + dbAdapter: integration.adapter, + knowledgePools: { + default: { + collections: { + posts: { + toKnowledgePool: async (doc) => [{ chunk: doc.title || '' }], + }, + }, + embeddingConfig: { + version: 'v1.0.0', + queryFn: embedQuery, + realTimeIngestionFn: embedDocs, + }, + }, + }, + }), + ], +}) +``` + +## Configuration + +The `createCloudflareVectorizeIntegration` function accepts a configuration object with `config` and `binding` properties: + +```typescript +const integration = createCloudflareVectorizeIntegration({ + config: { + poolName: { + dims: number, // Required: Vector dimensions + }, + // ... additional pools + }, + binding: vectorizeBinding, // Required: Cloudflare Vectorize binding +}) +``` + +### Configuration Options + +| Option | Type | Required | Description | +| ------ | -------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------- | +| `dims` | `number` | Yes | Vector dimensions for the Vectorize index. Must match your embedding model's output dimensions and your Cloudflare Vectorize index configuration. | + +### Cloudflare Bindings + +| Property | Type | Required | Description | +| ----------- | ---------------- | -------- | ------------------------------------------------------------------------------------------------- | +| `vectorize` | `VectorizeIndex` | Yes | Cloudflare Vectorize binding for vector storage. Configured in `wrangler.toml` for Workers/Pages. | + +## Integration Return Value + +`createCloudflareVectorizeIntegration` returns an object with: + +| Property | Type | Description | +| --------- | ----------- | ------------------------------------------------------------------------- | +| `adapter` | `DbAdapter` | The database adapter to pass to `payloadcmsVectorize({ dbAdapter: ... })` | + +## Multiple Knowledge Pools + +You can configure multiple knowledge pools with different dimensions: + +```typescript +const integration = createCloudflareVectorizeIntegration({ + config: { + documents: { + dims: 1536, + }, + images: { + dims: 512, + }, + }, + binding: env.VECTORIZE, +}) + +export default buildConfig({ + // ... + plugins: [ + payloadcmsVectorize({ + dbAdapter: integration.adapter, + knowledgePools: { + documents: { + collections: { + /* ... */ + }, + embeddingConfig: { + /* ... */ + }, + }, + images: { + collections: { + /* ... */ + }, + embeddingConfig: { + /* ... */ + }, + }, + }, + }), + ], +}) +``` + +**Note:** Each knowledge pool requires a separate Vectorize index with matching dimensions. + +## Using with Cloudflare AI + +```typescript +export const embedDocs = async (texts: string[]): Promise => { + const results = await Promise.all( + texts.map((text) => + env.AI.run('@cf/baai/bge-small-en-v1.5', { + text, + }), + ), + ) + return results.map((r) => r.data[0]) +} + +export const embedQuery = async (text: string): Promise => { + const result = await env.AI.run('@cf/baai/bge-small-en-v1.5', { + text, + }) + return result.data[0] +} +``` + +## Using with Voyage AI + +```typescript +import { embed, embedMany } from 'ai' +import { voyage } from 'voyage-ai-provider' + +export const embedDocs = async (texts: string[]): Promise => { + const embedResult = await embedMany({ + model: voyage.textEmbeddingModel('voyage-3.5-lite'), + values: texts, + providerOptions: { + voyage: { inputType: 'document' }, + }, + }) + return embedResult.embeddings +} + +export const embedQuery = async (text: string): Promise => { + const embedResult = await embed({ + model: voyage.textEmbeddingModel('voyage-3.5-lite'), + value: text, + providerOptions: { + voyage: { inputType: 'query' }, + }, + }) + return embedResult.embedding +} +``` + +## Known Limitations + +### Metadata Filtering + +The CF adapter uses Cloudflare Vectorize's native metadata filtering, which applies filters **before** the topK selection. This means filtering works correctly with the result limit for most operators. + +**Natively supported operators** (applied before topK — correct result counts): +- `equals`, `not_equals`, `in`, `notIn` +- `greater_than`, `greater_than_equal`, `less_than`, `less_than_equal` + +**Post-filtered operators** (applied after topK — may return fewer results than requested): +- `like`, `contains`, `exists` + +### Vectorize Constraints + +| Constraint | Limit | +|---|---| +| topK maximum | 100 (or 20 when returning metadata) | +| String metadata indexing | First 64 bytes only (truncated at UTF-8 boundaries) | +| Filter object size | Under 2048 bytes JSON-encoded | +| Range query accuracy | May be reduced on ~10M+ vectors | + +Metadata indexes must exist before vectors are inserted for filtering to work. + +### OR Queries + +Cloudflare Vectorize does not support OR at the filter level. All `or` clauses are evaluated as post-filters, subject to the topK constraint. + +## License + +MIT diff --git a/adapters/cf/dev/specs/adapter.spec.ts b/adapters/cf/dev/specs/adapter.spec.ts new file mode 100644 index 0000000..72d0df3 --- /dev/null +++ b/adapters/cf/dev/specs/adapter.spec.ts @@ -0,0 +1,335 @@ +/** + * Unit tests for the Cloudflare Vectorize adapter. + * + * These tests verify adapter functionality using mocked Cloudflare bindings + * without requiring a real Payload instance. + */ +import { describe, expect, test, vi } from 'vitest' +import { createCloudflareVectorizeIntegration } from '../../src/index.js' + +const DIMS = 8 + +// Mock Cloudflare binding +function createMockCloudflareBinding() { + const storage = new Map() + + return { + query: vi.fn(async (queryVector: number[], options: any) => { + const { topK = 10, returnMetadata = false, where } = options + + // Simple in-memory search using cosine similarity + const results = Array.from(storage.values()) + .filter((item) => { + // Basic metadata filtering + if (where?.and) { + return where.and.every((condition: any) => { + const key = condition.key + const value = condition.value + return item.metadata?.[key] === value + }) + } + return true + }) + .map((item) => { + // Calculate cosine similarity + const dotProduct = item.values.reduce((sum, v, i) => sum + v * queryVector[i], 0) + const normA = Math.sqrt(queryVector.reduce((sum, v) => sum + v * v, 0)) + const normB = Math.sqrt(item.values.reduce((sum, v) => sum + v * v, 0)) + const score = normA === 0 || normB === 0 ? 0 : dotProduct / (normA * normB) + + return { + id: item.id, + score, + metadata: returnMetadata ? item.metadata : undefined, + } + }) + .sort((a, b) => b.score - a.score) + .slice(0, topK) + + return { matches: results } + }), + + upsert: vi.fn(async (vectors: any[]) => { + for (const vector of vectors) { + storage.set(vector.id, { + id: vector.id, + values: vector.values, + metadata: vector.metadata || {}, + }) + } + }), + + deleteByIds: vi.fn(async (ids: string[]) => { + for (const id of ids) { + storage.delete(id) + } + }), + + list: vi.fn(async (options: any) => { + const vectors = Array.from(storage.values()).map((item) => ({ + id: item.id, + values: item.values, + metadata: options?.returnMetadata ? item.metadata : undefined, + })) + return { vectors } + }), + + // Helper to get storage for assertions + __getStorage: () => storage, + } +} + +function createMockPayloadForEmbed(mockBinding: any) { + return { + config: { + custom: { + createVectorizedPayloadObject: () => ({ + getDbAdapterCustom: () => ({ _vectorizeBinding: mockBinding }), + }), + }, + }, + create: vi.fn().mockResolvedValue({ id: 'mapping-1' }), + logger: { error: vi.fn() }, + } as any +} + +describe('createCloudflareVectorizeIntegration', () => { + describe('validation', () => { + test('should throw if vectorize binding is missing', () => { + expect(() => { + createCloudflareVectorizeIntegration({ + config: { default: { dims: 384 } }, + binding: undefined as any, + }) + }).toThrow('Cloudflare Vectorize binding is required') + }) + + test('should create integration with valid config', () => { + const mockVectorize = { query: vi.fn(), upsert: vi.fn(), deleteByIds: vi.fn() } + + const integration = createCloudflareVectorizeIntegration({ + config: { default: { dims: 384 } }, + binding: mockVectorize, + }) + + expect(integration).toBeDefined() + expect(integration.adapter).toBeDefined() + expect(integration.adapter.storeEmbedding).toBeDefined() + expect(integration.adapter.search).toBeDefined() + expect(integration.adapter.deleteEmbeddings).toBeDefined() + expect(integration.adapter.getConfigExtension).toBeDefined() + }) + }) + + describe('getConfigExtension', () => { + test('should return config with pool configurations', () => { + const poolConfigs = { mainPool: { dims: 384 }, secondaryPool: { dims: 768 } } + const mockVectorize = { query: vi.fn(), upsert: vi.fn(), deleteByIds: vi.fn() } + + const { adapter } = createCloudflareVectorizeIntegration({ + config: poolConfigs, + binding: mockVectorize, + }) + const extension = adapter.getConfigExtension({} as any) + + expect(extension.custom?._poolConfigs).toEqual(poolConfigs) + }) + + test('should return collections with cfMappings', () => { + const mockVectorize = { query: vi.fn(), upsert: vi.fn(), deleteByIds: vi.fn() } + + const { adapter } = createCloudflareVectorizeIntegration({ + config: { default: { dims: 384 } }, + binding: mockVectorize, + }) + const extension = adapter.getConfigExtension({} as any) + + expect(extension.collections).toBeDefined() + expect(extension.collections!['vector-cf-mappings']).toBeDefined() + expect(extension.collections!['vector-cf-mappings'].slug).toBe('vector-cf-mappings') + }) + }) + + describe('storeEmbedding', () => { + test('should convert Float32Array to regular array', async () => { + const mockBinding = createMockCloudflareBinding() + const { adapter } = createCloudflareVectorizeIntegration({ + config: { default: { dims: 8 } }, + binding: mockBinding as any, + }) + + const embedding = new Float32Array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]) + const mockPayload = createMockPayloadForEmbed(mockBinding) + + await adapter.storeEmbedding( + mockPayload, + 'default', + 'test-collection', + 'doc-1', + 'test-id', + embedding, + ) + + expect(mockBinding.upsert).toHaveBeenCalledWith([ + { + id: 'test-id', + values: Array.from(embedding), + }, + ]) + }) + + test('should create a mapping row', async () => { + const mockBinding = createMockCloudflareBinding() + const { adapter } = createCloudflareVectorizeIntegration({ + config: { default: { dims: 8 } }, + binding: mockBinding as any, + }) + + const mockPayload = createMockPayloadForEmbed(mockBinding) + const embedding = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8] + + await adapter.storeEmbedding( + mockPayload, + 'default', + 'test-collection', + 'doc-1', + 'test-id', + embedding, + ) + + expect(mockPayload.create).toHaveBeenCalledWith({ + collection: 'vector-cf-mappings', + data: { + vectorId: 'test-id', + poolName: 'default', + sourceCollection: 'test-collection', + docId: 'doc-1', + }, + }) + }) + }) + + describe('deleteEmbeddings', () => { + test('should look up mappings with correct where clause', async () => { + const mockBinding = createMockCloudflareBinding() + const { adapter } = createCloudflareVectorizeIntegration({ + config: { default: { dims: 8 } }, + binding: mockBinding as any, + }) + + const mockPayload = { + find: vi.fn().mockResolvedValue({ docs: [], hasNextPage: false }), + delete: vi.fn().mockResolvedValue({}), + logger: { error: vi.fn() }, + } as any + + await adapter.deleteEmbeddings?.(mockPayload, 'default', 'test-collection', 'doc-123') + + expect(mockPayload.find).toHaveBeenCalledWith( + expect.objectContaining({ + collection: 'vector-cf-mappings', + where: { + and: [ + { poolName: { equals: 'default' } }, + { sourceCollection: { equals: 'test-collection' } }, + { docId: { equals: 'doc-123' } }, + ], + }, + }), + ) + }) + + test('should delete matching vectors via mappings', async () => { + const mockBinding = createMockCloudflareBinding() + const { adapter } = createCloudflareVectorizeIntegration({ + config: { default: { dims: 8 } }, + binding: mockBinding as any, + }) + + const mockPayload = { + find: vi.fn().mockResolvedValue({ + docs: [ + { id: 'map-1', vectorId: 'vec-1' }, + { id: 'map-2', vectorId: 'vec-2' }, + ], + hasNextPage: false, + }), + delete: vi.fn().mockResolvedValue({}), + logger: { error: vi.fn() }, + } as any + + await adapter.deleteEmbeddings?.(mockPayload, 'default', 'test-collection', 'doc-123') + + expect(mockBinding.deleteByIds).toHaveBeenCalledWith(['vec-1', 'vec-2']) + }) + + test('should clean up mapping rows after deleting vectors', async () => { + const mockBinding = createMockCloudflareBinding() + const { adapter } = createCloudflareVectorizeIntegration({ + config: { default: { dims: 8 } }, + binding: mockBinding as any, + }) + + const mockPayload = { + find: vi.fn().mockResolvedValue({ + docs: [{ id: 'map-1', vectorId: 'vec-1' }], + hasNextPage: false, + }), + delete: vi.fn().mockResolvedValue({}), + logger: { error: vi.fn() }, + } as any + + await adapter.deleteEmbeddings?.(mockPayload, 'default', 'test-collection', 'doc-123') + + expect(mockPayload.delete).toHaveBeenCalledWith( + expect.objectContaining({ + collection: 'vector-cf-mappings', + where: { + and: [ + { poolName: { equals: 'default' } }, + { sourceCollection: { equals: 'test-collection' } }, + { docId: { equals: 'doc-123' } }, + ], + }, + }), + ) + }) + + test('should handle empty results gracefully', async () => { + const mockBinding = createMockCloudflareBinding() + const { adapter } = createCloudflareVectorizeIntegration({ + config: { default: { dims: 8 } }, + binding: mockBinding as any, + }) + + const mockPayload = { + find: vi.fn().mockResolvedValue({ docs: [], hasNextPage: false }), + delete: vi.fn().mockResolvedValue({}), + logger: { error: vi.fn() }, + } as any + + await adapter.deleteEmbeddings?.(mockPayload, 'default', 'test-collection', 'doc-123') + + expect(mockBinding.deleteByIds).not.toHaveBeenCalled() + }) + + test('should handle errors', async () => { + const mockBinding = createMockCloudflareBinding() + const { adapter } = createCloudflareVectorizeIntegration({ + config: { default: { dims: 8 } }, + binding: mockBinding as any, + }) + + const mockPayload = { + find: vi.fn().mockRejectedValue(new Error('Query failed')), + logger: { error: vi.fn() }, + } as any + + await expect( + adapter.deleteEmbeddings?.(mockPayload, 'default', 'test-collection', 'doc-123'), + ).rejects.toThrow('Failed to delete embeddings') + + expect(mockPayload.logger.error).toHaveBeenCalled() + }) + }) +}) diff --git a/adapters/cf/dev/specs/compliance.spec.ts b/adapters/cf/dev/specs/compliance.spec.ts new file mode 100644 index 0000000..2970bca --- /dev/null +++ b/adapters/cf/dev/specs/compliance.spec.ts @@ -0,0 +1,431 @@ +/** + * Adapter compliance tests for the Cloudflare Vectorize adapter. + * + * These tests verify that the Cloudflare adapter correctly implements + * the DbAdapter interface as defined in payloadcms-vectorize. + * + * Note: Uses mocked Cloudflare bindings since there's no local Vectorize emulator. + */ +import { beforeAll, afterAll, describe, expect, test, vi } from 'vitest' +import type { Payload, SanitizedConfig } from 'payload' +import { buildConfig, getPayload } from 'payload' +import { postgresAdapter } from '@payloadcms/db-postgres' +import { lexicalEditor } from '@payloadcms/richtext-lexical' +import { Client } from 'pg' +import { createCloudflareVectorizeIntegration } from '../../src/index.js' +import payloadcmsVectorize from 'payloadcms-vectorize' +import type { DbAdapter } from 'payloadcms-vectorize' + +const DIMS = 8 +const dbName = `cf_compliance_test_${Date.now()}` + +// Mock Cloudflare Vectorize binding +function createMockVectorizeBinding() { + const storage = new Map< + string, + { id: string; values: number[]; metadata?: Record } + >() + + return { + query: vi.fn(async (vector: number[], options?: any) => { + const topK = options?.topK || 10 + const allVectors = Array.from(storage.values()) + + // Apply WHERE filter if present + let filtered = allVectors + if (options?.where?.and) { + filtered = allVectors.filter((vec) => { + return options.where.and.every((condition: any) => { + return vec.metadata?.[condition.key] === condition.value + }) + }) + } + + // Calculate cosine similarity + const results = filtered.map((vec) => { + let dotProduct = 0 + let normA = 0 + let normB = 0 + + for (let i = 0; i < vector.length; i++) { + dotProduct += vector[i] * vec.values[i] + normA += vector[i] * vector[i] + normB += vec.values[i] * vec.values[i] + } + + const similarity = dotProduct / (Math.sqrt(normA) * Math.sqrt(normB)) + + return { + id: vec.id, + score: similarity, + metadata: vec.metadata, + } + }) + + // Sort by score descending and limit + results.sort((a, b) => b.score - a.score) + return { matches: results.slice(0, topK) } + }), + + upsert: vi.fn(async (vectors: Array<{ id: string; values: number[]; metadata?: any }>) => { + for (const vec of vectors) { + storage.set(vec.id, vec) + } + }), + + deleteByIds: vi.fn(async (ids: string[]) => { + for (const id of ids) { + storage.delete(id) + } + }), + + list: vi.fn(async () => { + return Array.from(storage.values()) + }), + + // Test helper + __getStorage: () => storage, + } +} + +// Helper to create test database +async function createTestDb(name: string) { + const adminUri = + process.env.DATABASE_ADMIN_URI || 'postgresql://postgres:password@localhost:5433/postgres' + const client = new Client({ connectionString: adminUri }) + await client.connect() + + const exists = await client.query('SELECT 1 FROM pg_database WHERE datname = $1', [name]) + if (exists.rowCount === 0) { + await client.query(`CREATE DATABASE ${name}`) + } + await client.end() +} + +describe('Cloudflare Adapter Compliance Tests', () => { + let adapter: DbAdapter + let payload: Payload + let config: SanitizedConfig + let mockVectorize: ReturnType + + beforeAll(async () => { + await createTestDb(dbName) + + mockVectorize = createMockVectorizeBinding() + + const { adapter: cfAdapter } = createCloudflareVectorizeIntegration({ + config: { + default: { + dims: DIMS, + }, + }, + binding: mockVectorize as any, + }) + adapter = cfAdapter + + config = await buildConfig({ + secret: 'test-secret', + editor: lexicalEditor(), + collections: [], + db: postgresAdapter({ + pool: { + connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, + }, + }), + plugins: [ + payloadcmsVectorize({ + dbAdapter: adapter, + knowledgePools: { + default: { + collections: {}, + embeddingConfig: { + version: 'test-v1', + queryFn: async () => Array(DIMS).fill(0.5), + realTimeIngestionFn: async (texts) => texts.map(() => Array(DIMS).fill(0.5)), + }, + }, + }, + }), + ], + }) + + payload = await getPayload({ + config, + key: `cf-compliance-${Date.now()}`, + cron: false, + }) + }) + + afterAll(async () => { + // Cleanup is handled by test isolation + }) + + describe('getConfigExtension()', () => { + test('returns a valid config extension object', () => { + const extension = adapter.getConfigExtension({} as any) + + expect(extension).toBeDefined() + expect(typeof extension).toBe('object') + }) + + test('custom property contains adapter metadata', () => { + const extension = adapter.getConfigExtension({} as any) + + expect(extension.custom).toBeDefined() + expect(extension.custom!._poolConfigs).toBeDefined() + expect(extension.custom!._poolConfigs.default).toBeDefined() + expect(extension.custom!._poolConfigs.default.dims).toBe(DIMS) + }) + + test('collections property contains cfMappings collection', () => { + const extension = adapter.getConfigExtension({} as any) + + expect(extension.collections).toBeDefined() + expect(extension.collections!['vector-cf-mappings']).toBeDefined() + expect(extension.collections!['vector-cf-mappings'].slug).toBe('vector-cf-mappings') + }) + }) + + describe('storeEmbedding()', () => { + test('persists embedding without error (number[])', async () => { + const embedding = Array(DIMS) + .fill(0) + .map(() => Math.random()) + + const sourceDocId = `test-embed-1-${Date.now()}` + + // Create a document first + const doc = await payload.create({ + collection: 'default' as any, + data: { + sourceCollection: 'test-collection', + docId: sourceDocId, + chunkIndex: 0, + chunkText: 'test text for embedding', + embeddingVersion: 'v1-test', + }, + }) + + await expect( + adapter.storeEmbedding( + payload, + 'default', + 'test-collection', + sourceDocId, + String(doc.id), + embedding, + ), + ).resolves.not.toThrow() + + expect(mockVectorize.upsert).toHaveBeenCalled() + }) + + test('persists embedding without error (Float32Array)', async () => { + const embedding = new Float32Array( + Array(DIMS) + .fill(0) + .map(() => Math.random()), + ) + + const sourceDocId = `test-embed-2-${Date.now()}` + + const doc = await payload.create({ + collection: 'default' as any, + data: { + sourceCollection: 'test-collection', + docId: sourceDocId, + chunkIndex: 0, + chunkText: 'test text for Float32Array', + embeddingVersion: 'v1-test', + }, + }) + + await expect( + adapter.storeEmbedding( + payload, + 'default', + 'test-collection', + sourceDocId, + String(doc.id), + embedding, + ), + ).resolves.not.toThrow() + + expect(mockVectorize.upsert).toHaveBeenCalled() + }) + + test('stores embedding in Vectorize with correct ID', async () => { + const embedding = Array(DIMS).fill(0.5) + + const sourceDocId = `test-embed-id-${Date.now()}` + + const doc = await payload.create({ + collection: 'default' as any, + data: { + sourceCollection: 'test-collection', + docId: sourceDocId, + chunkIndex: 0, + chunkText: 'test text', + embeddingVersion: 'v1-test', + }, + }) + + const embeddingId = String(doc.id) + await adapter.storeEmbedding( + payload, + 'default', + 'test-collection', + sourceDocId, + embeddingId, + embedding, + ) + + const storage = mockVectorize.__getStorage() + expect(storage.has(embeddingId)).toBe(true) + expect(storage.get(embeddingId)?.values).toEqual(embedding) + }) + }) + + describe('search()', () => { + let targetEmbedding: number[] + let similarDocId: string + + beforeAll(async () => { + // Create test documents with known embeddings + targetEmbedding = Array(DIMS).fill(0.5) + const similarEmbedding = Array(DIMS) + .fill(0.5) + .map((v) => v + Math.random() * 0.05) + + const sourceDocId = `test-search-similar-${Date.now()}` + + // Create and embed a document + const similarDoc = await payload.create({ + collection: 'default' as any, + data: { + sourceCollection: 'test-collection', + docId: sourceDocId, + chunkIndex: 0, + chunkText: 'similar document for search test', + embeddingVersion: 'v1-test', + }, + }) + similarDocId = String(similarDoc.id) + await adapter.storeEmbedding( + payload, + 'default', + 'test-collection', + sourceDocId, + similarDocId, + similarEmbedding, + ) + }) + + test('returns an array of results', async () => { + const results = await adapter.search(payload, targetEmbedding, 'default') + + expect(Array.isArray(results)).toBe(true) + }) + + test('results contain required fields', async () => { + const results = await adapter.search(payload, targetEmbedding, 'default') + + for (const result of results) { + expect(result).toHaveProperty('id') + expect(result).toHaveProperty('score') + expect(result).toHaveProperty('sourceCollection') + expect(result).toHaveProperty('docId') + expect(result).toHaveProperty('chunkIndex') + expect(result).toHaveProperty('chunkText') + expect(result).toHaveProperty('embeddingVersion') + + expect(typeof result.id).toBe('string') + expect(typeof result.score).toBe('number') + expect(typeof result.sourceCollection).toBe('string') + expect(typeof result.docId).toBe('string') + expect(typeof result.chunkIndex).toBe('number') + expect(typeof result.chunkText).toBe('string') + expect(typeof result.embeddingVersion).toBe('string') + } + }) + + test('results are ordered by score (highest first)', async () => { + const results = await adapter.search(payload, targetEmbedding, 'default', 10) + + for (let i = 1; i < results.length; i++) { + expect(results[i - 1].score).toBeGreaterThanOrEqual(results[i].score) + } + }) + + test('respects limit parameter', async () => { + const results = await adapter.search(payload, targetEmbedding, 'default', 1) + + expect(results.length).toBeLessThanOrEqual(1) + }) + + test('calls Vectorize query with correct parameters', async () => { + await adapter.search(payload, targetEmbedding, 'default', 5) + + expect(mockVectorize.query).toHaveBeenCalledWith(targetEmbedding, expect.any(Object)) + }) + }) + + describe('deleteEmbeddings()', () => { + test('removes embeddings from Vectorize via mapping', async () => { + const embedding = Array(DIMS).fill(0.7) + + const sourceDocId = `doc-to-delete-${Date.now()}` + + // Create and embed a document + const doc = await payload.create({ + collection: 'default' as any, + data: { + sourceCollection: 'delete-test', + docId: sourceDocId, + chunkIndex: 0, + chunkText: 'document to delete', + embeddingVersion: 'v1-test', + }, + }) + + const embeddingId = String(doc.id) + await adapter.storeEmbedding( + payload, + 'default', + 'delete-test', + sourceDocId, + embeddingId, + embedding, + ) + + // Verify it's stored in Vectorize + const storage = mockVectorize.__getStorage() + expect(storage.has(embeddingId)).toBe(true) + + // Delete it + await adapter.deleteEmbeddings?.(payload, 'default', 'delete-test', sourceDocId) + + // Verify deleteByIds was called with the correct vector ID + expect(mockVectorize.deleteByIds).toHaveBeenCalledWith([embeddingId]) + + // Verify mapping rows are cleaned up + const remainingMappings = await payload.find({ + collection: 'vector-cf-mappings' as any, + where: { + and: [ + { poolName: { equals: 'default' } }, + { sourceCollection: { equals: 'delete-test' } }, + { docId: { equals: sourceDocId } }, + ], + }, + }) + expect(remainingMappings.totalDocs).toBe(0) + }) + + test('handles non-existent embeddings gracefully', async () => { + await expect( + adapter.deleteEmbeddings?.(payload, 'default', 'non-existent', 'fake-id'), + ).resolves.not.toThrow() + }) + }) +}) diff --git a/adapters/cf/dev/specs/where.spec.ts b/adapters/cf/dev/specs/where.spec.ts new file mode 100644 index 0000000..1a65dea --- /dev/null +++ b/adapters/cf/dev/specs/where.spec.ts @@ -0,0 +1,397 @@ +import { describe, expect, test } from 'vitest' +import { splitWhere, matchesPostFilter } from '../../src/search.js' +import type { Where } from 'payload' + +describe('CF adapter - splitWhere', () => { + describe('simple field conditions', () => { + test('equals maps to $eq natively', () => { + const result = splitWhere({ status: { equals: 'published' } }) + expect(result.nativeFilter).toEqual({ status: { $eq: 'published' } }) + expect(result.postFilter).toBeNull() + }) + + test('not_equals maps to $ne natively', () => { + const result = splitWhere({ status: { not_equals: 'draft' } }) + expect(result.nativeFilter).toEqual({ status: { $ne: 'draft' } }) + expect(result.postFilter).toBeNull() + }) + + test('notEquals maps to $ne natively', () => { + const result = splitWhere({ status: { notEquals: 'draft' } }) + expect(result.nativeFilter).toEqual({ status: { $ne: 'draft' } }) + expect(result.postFilter).toBeNull() + }) + + test('in maps to $in natively', () => { + const result = splitWhere({ status: { in: ['published', 'draft'] } }) + expect(result.nativeFilter).toEqual({ status: { $in: ['published', 'draft'] } }) + expect(result.postFilter).toBeNull() + }) + + test('not_in maps to $nin natively', () => { + const result = splitWhere({ status: { not_in: ['draft'] } }) + expect(result.nativeFilter).toEqual({ status: { $nin: ['draft'] } }) + expect(result.postFilter).toBeNull() + }) + + test('notIn maps to $nin natively', () => { + const result = splitWhere({ status: { notIn: ['draft'] } }) + expect(result.nativeFilter).toEqual({ status: { $nin: ['draft'] } }) + expect(result.postFilter).toBeNull() + }) + + test('greater_than maps to $gt natively', () => { + const result = splitWhere({ views: { greater_than: 100 } }) + expect(result.nativeFilter).toEqual({ views: { $gt: 100 } }) + expect(result.postFilter).toBeNull() + }) + + test('greaterThan maps to $gt natively', () => { + const result = splitWhere({ views: { greaterThan: 100 } }) + expect(result.nativeFilter).toEqual({ views: { $gt: 100 } }) + expect(result.postFilter).toBeNull() + }) + + test('greater_than_equal maps to $gte natively', () => { + const result = splitWhere({ views: { greater_than_equal: 100 } }) + expect(result.nativeFilter).toEqual({ views: { $gte: 100 } }) + expect(result.postFilter).toBeNull() + }) + + test('less_than maps to $lt natively', () => { + const result = splitWhere({ views: { less_than: 100 } }) + expect(result.nativeFilter).toEqual({ views: { $lt: 100 } }) + expect(result.postFilter).toBeNull() + }) + + test('lessThan maps to $lt natively', () => { + const result = splitWhere({ views: { lessThan: 100 } }) + expect(result.nativeFilter).toEqual({ views: { $lt: 100 } }) + expect(result.postFilter).toBeNull() + }) + + test('less_than_equal maps to $lte natively', () => { + const result = splitWhere({ views: { less_than_equal: 100 } }) + expect(result.nativeFilter).toEqual({ views: { $lte: 100 } }) + expect(result.postFilter).toBeNull() + }) + }) + + describe('non-native operators go to postFilter', () => { + test('like goes to postFilter', () => { + const result = splitWhere({ tags: { like: '%javascript%' } }) + expect(result.nativeFilter).toBeNull() + expect(result.postFilter).toEqual({ tags: { like: '%javascript%' } }) + }) + + test('contains goes to postFilter', () => { + const result = splitWhere({ category: { contains: 'tech' } }) + expect(result.nativeFilter).toBeNull() + expect(result.postFilter).toEqual({ category: { contains: 'tech' } }) + }) + + test('exists goes to postFilter', () => { + const result = splitWhere({ category: { exists: true } }) + expect(result.nativeFilter).toBeNull() + expect(result.postFilter).toEqual({ category: { exists: true } }) + }) + }) + + describe('and operator', () => { + test('splits and conditions into native and post', () => { + const result = splitWhere({ + and: [ + { status: { equals: 'published' } }, + { tags: { like: '%javascript%' } }, + ], + }) + expect(result.nativeFilter).toEqual({ status: { $eq: 'published' } }) + expect(result.postFilter).toEqual({ tags: { like: '%javascript%' } }) + }) + + test('all native conditions stay native', () => { + const result = splitWhere({ + and: [ + { status: { equals: 'published' } }, + { views: { greater_than: 100 } }, + ], + }) + expect(result.nativeFilter).toEqual({ + status: { $eq: 'published' }, + views: { $gt: 100 }, + }) + expect(result.postFilter).toBeNull() + }) + + test('all post conditions stay post', () => { + const result = splitWhere({ + and: [ + { tags: { like: '%js%' } }, + { category: { contains: 'tech' } }, + ], + }) + expect(result.nativeFilter).toBeNull() + expect(result.postFilter).toEqual({ + and: [ + { tags: { like: '%js%' } }, + { category: { contains: 'tech' } }, + ], + }) + }) + }) + + describe('or operator', () => { + test('entire or goes to postFilter (Vectorize does not support native or)', () => { + const where: Where = { + or: [ + { status: { equals: 'draft' } }, + { status: { equals: 'archived' } }, + ], + } + const result = splitWhere(where) + expect(result.nativeFilter).toBeNull() + expect(result.postFilter).toEqual(where) + }) + }) + + describe('mixed conditions', () => { + test('multiple field conditions split correctly', () => { + const result = splitWhere({ + status: { equals: 'published' }, + tags: { contains: 'javascript' }, + } as Where) + expect(result.nativeFilter).toEqual({ status: { $eq: 'published' } }) + expect(result.postFilter).toEqual({ tags: { contains: 'javascript' } }) + }) + }) +}) + +describe('CF adapter - matchesPostFilter', () => { + const doc = { + status: 'published', + category: 'tech', + views: 150, + rating: 4.5, + tags: 'javascript,nodejs,programming', + published: true, + } + + describe('equals / not_equals', () => { + test('equals matches', () => { + expect(matchesPostFilter(doc, { status: { equals: 'published' } })).toBe(true) + }) + + test('equals rejects', () => { + expect(matchesPostFilter(doc, { status: { equals: 'draft' } })).toBe(false) + }) + + test('not_equals matches', () => { + expect(matchesPostFilter(doc, { status: { not_equals: 'draft' } })).toBe(true) + }) + + test('not_equals rejects', () => { + expect(matchesPostFilter(doc, { status: { not_equals: 'published' } })).toBe(false) + }) + + test('notEquals matches', () => { + expect(matchesPostFilter(doc, { status: { notEquals: 'draft' } })).toBe(true) + }) + }) + + describe('in / notIn', () => { + test('in matches', () => { + expect(matchesPostFilter(doc, { status: { in: ['published', 'draft'] } })).toBe(true) + }) + + test('in rejects', () => { + expect(matchesPostFilter(doc, { status: { in: ['draft', 'archived'] } })).toBe(false) + }) + + test('not_in matches', () => { + expect(matchesPostFilter(doc, { status: { not_in: ['draft', 'archived'] } })).toBe(true) + }) + + test('not_in rejects', () => { + expect(matchesPostFilter(doc, { status: { not_in: ['published'] } })).toBe(false) + }) + + test('notIn matches', () => { + expect(matchesPostFilter(doc, { status: { notIn: ['draft'] } })).toBe(true) + }) + }) + + describe('like / contains', () => { + test('like with wildcards matches', () => { + expect(matchesPostFilter(doc, { tags: { like: '%javascript%' } })).toBe(true) + }) + + test('like rejects non-matching', () => { + expect(matchesPostFilter(doc, { tags: { like: '%python%' } })).toBe(false) + }) + + test('like is case insensitive', () => { + expect(matchesPostFilter(doc, { tags: { like: '%JavaScript%' } })).toBe(true) + }) + + test('contains matches substring', () => { + expect(matchesPostFilter(doc, { category: { contains: 'tech' } })).toBe(true) + }) + + test('contains rejects non-matching', () => { + expect(matchesPostFilter(doc, { category: { contains: 'design' } })).toBe(false) + }) + + test('contains is case insensitive', () => { + expect(matchesPostFilter(doc, { category: { contains: 'Tech' } })).toBe(true) + }) + }) + + describe('comparison operators', () => { + test('greater_than matches', () => { + expect(matchesPostFilter(doc, { views: { greater_than: 100 } })).toBe(true) + }) + + test('greater_than rejects', () => { + expect(matchesPostFilter(doc, { views: { greater_than: 200 } })).toBe(false) + }) + + test('greaterThan matches', () => { + expect(matchesPostFilter(doc, { views: { greaterThan: 100 } })).toBe(true) + }) + + test('greater_than_equal matches on boundary', () => { + expect(matchesPostFilter(doc, { views: { greater_than_equal: 150 } })).toBe(true) + }) + + test('less_than matches', () => { + expect(matchesPostFilter(doc, { views: { less_than: 200 } })).toBe(true) + }) + + test('less_than rejects', () => { + expect(matchesPostFilter(doc, { views: { less_than: 100 } })).toBe(false) + }) + + test('lessThan matches', () => { + expect(matchesPostFilter(doc, { rating: { lessThan: 4.6 } })).toBe(true) + }) + + test('less_than_equal matches on boundary', () => { + expect(matchesPostFilter(doc, { views: { less_than_equal: 150 } })).toBe(true) + }) + }) + + describe('exists operator', () => { + test('exists true matches present field', () => { + expect(matchesPostFilter(doc, { category: { exists: true } })).toBe(true) + }) + + test('exists true rejects null field', () => { + expect(matchesPostFilter({ ...doc, category: null }, { category: { exists: true } })).toBe(false) + }) + + test('exists false matches null field', () => { + expect(matchesPostFilter({ ...doc, category: null }, { category: { exists: false } })).toBe(true) + }) + + test('exists false matches undefined field', () => { + const { category, ...noCategory } = doc + expect(matchesPostFilter(noCategory, { category: { exists: false } })).toBe(true) + }) + + test('exists false rejects present field', () => { + expect(matchesPostFilter(doc, { category: { exists: false } })).toBe(false) + }) + }) + + describe('and / or', () => { + test('and: all conditions must match', () => { + expect(matchesPostFilter(doc, { + and: [ + { status: { equals: 'published' } }, + { category: { equals: 'tech' } }, + ], + })).toBe(true) + }) + + test('and: fails if any condition fails', () => { + expect(matchesPostFilter(doc, { + and: [ + { status: { equals: 'published' } }, + { category: { equals: 'design' } }, + ], + })).toBe(false) + }) + + test('or: matches if any condition matches', () => { + expect(matchesPostFilter(doc, { + or: [ + { status: { equals: 'draft' } }, + { category: { equals: 'tech' } }, + ], + })).toBe(true) + }) + + test('or: fails if no condition matches', () => { + expect(matchesPostFilter(doc, { + or: [ + { status: { equals: 'draft' } }, + { category: { equals: 'design' } }, + ], + })).toBe(false) + }) + }) + + describe('nested logic', () => { + test('and/or combination: (published AND tech) OR draft', () => { + expect(matchesPostFilter(doc, { + or: [ + { + and: [ + { status: { equals: 'published' } }, + { category: { equals: 'tech' } }, + ], + }, + { status: { equals: 'draft' } }, + ], + })).toBe(true) + }) + + test('or within and', () => { + expect(matchesPostFilter(doc, { + and: [ + { + or: [ + { status: { equals: 'published' } }, + { status: { equals: 'draft' } }, + ], + }, + { views: { greater_than: 100 } }, + ], + })).toBe(true) + }) + + test('nested fails correctly', () => { + expect(matchesPostFilter(doc, { + and: [ + { + or: [ + { status: { equals: 'draft' } }, + { status: { equals: 'archived' } }, + ], + }, + { views: { greater_than: 100 } }, + ], + })).toBe(false) + }) + }) + + describe('edge cases', () => { + test('empty where matches everything', () => { + expect(matchesPostFilter(doc, {} as Where)).toBe(true) + }) + + test('undefined field returns false for equals', () => { + expect(matchesPostFilter(doc, { nonExistent: { equals: 'value' } })).toBe(false) + }) + }) +}) diff --git a/adapters/cf/package.json b/adapters/cf/package.json new file mode 100644 index 0000000..336f3f4 --- /dev/null +++ b/adapters/cf/package.json @@ -0,0 +1,27 @@ +{ + "name": "@payloadcms-vectorize/cf", + "version": "0.7.0-beta.1", + "description": "Cloudflare Vectorize adapter for payloadcms-vectorize", + "license": "MIT", + "type": "module", + "files": [ + "dist" + ], + "main": "./dist/index.js", + "types": "./dist/index.d.ts", + "peerDependencies": { + "payload": ">=3.0.0 <4.0.0", + "payloadcms-vectorize": ">=0.7.0-beta.1 <1.0.0" + }, + "devDependencies": { + "payloadcms-vectorize": "workspace:*" + }, + "engines": { + "node": "^18.20.2 || >=20.9.0", + "pnpm": "^9 || ^10" + }, + "publishConfig": { + "main": "./dist/index.js", + "types": "./dist/index.d.ts" + } +} diff --git a/adapters/cf/src/collections/cfMappings.ts b/adapters/cf/src/collections/cfMappings.ts new file mode 100644 index 0000000..ea9743b --- /dev/null +++ b/adapters/cf/src/collections/cfMappings.ts @@ -0,0 +1,49 @@ +import type { CollectionConfig } from 'payload' + +export const CF_MAPPINGS_SLUG = 'vector-cf-mappings' + +// This collection maps Cloudflare Vectorize vector IDs to source documents, +// so we can find and delete vectors when the source document is deleted. +const CFMappingsCollection: CollectionConfig = { + slug: CF_MAPPINGS_SLUG, + admin: { + hidden: true, + description: + 'Maps Cloudflare Vectorize vector IDs to source documents. Managed by the CF adapter.', + }, + access: { + read: () => true, + create: ({ req }) => req?.payloadAPI === 'local', + update: ({ req }) => req?.payloadAPI === 'local', + delete: ({ req }) => req?.payloadAPI === 'local', + }, + fields: [ + { + name: 'vectorId', + type: 'text', + required: true, + index: true, + }, + { + name: 'poolName', + type: 'text', + required: true, + index: true, + }, + { + name: 'sourceCollection', + type: 'text', + required: true, + index: true, + }, + { + name: 'docId', + type: 'text', + required: true, + index: true, + }, + ], + timestamps: true, +} + +export default CFMappingsCollection diff --git a/adapters/cf/src/embed.ts b/adapters/cf/src/embed.ts new file mode 100644 index 0000000..66dfae4 --- /dev/null +++ b/adapters/cf/src/embed.ts @@ -0,0 +1,46 @@ +import { CollectionSlug, Payload } from 'payload' +import { getVectorizeBinding } from './types.js' +import { CF_MAPPINGS_SLUG } from './collections/cfMappings.js' +import type { StoreChunkData } from 'payloadcms-vectorize' + +export default async ( + payload: Payload, + poolName: string, + data: StoreChunkData, +) => { + const vectorizeBinding = getVectorizeBinding(payload) + + try { + const vector = Array.isArray(data.embedding) ? data.embedding : Array.from(data.embedding) + const id = `${poolName}:${data.sourceCollection}:${data.docId}:${data.chunkIndex}` + + await vectorizeBinding.upsert([ + { + id, + values: vector, + metadata: { + sourceCollection: data.sourceCollection, + docId: data.docId, + chunkIndex: data.chunkIndex, + chunkText: data.chunkText, + embeddingVersion: data.embeddingVersion, + ...data.extensionFields, + }, + }, + ]) + + await payload.create({ + collection: CF_MAPPINGS_SLUG as CollectionSlug, + data: { + vectorId: id, + poolName, + sourceCollection: data.sourceCollection, + docId: data.docId, + }, + }) + } catch (e) { + const errorMessage = e instanceof Error ? e.message : String(e) + payload.logger.error(`[@payloadcms-vectorize/cf] Failed to store embedding: ${errorMessage}`) + throw new Error(`[@payloadcms-vectorize/cf] Failed to store embedding: ${errorMessage}`) + } +} diff --git a/adapters/cf/src/index.ts b/adapters/cf/src/index.ts new file mode 100644 index 0000000..1dfda07 --- /dev/null +++ b/adapters/cf/src/index.ts @@ -0,0 +1,141 @@ +import type { CollectionSlug } from 'payload' +import type { DbAdapter } from 'payloadcms-vectorize' +import { getVectorizeBinding } from './types.js' +import type { CloudflareVectorizeBinding, KnowledgePoolsConfig } from './types.js' +import cfMappingsCollection, { CF_MAPPINGS_SLUG } from './collections/cfMappings.js' +import embed from './embed.js' +import search from './search.js' + +/** + * Configuration for Cloudflare Vectorize integration + */ +interface CloudflareVectorizeConfig { + /** Knowledge pools configuration with their dimensions */ + config: KnowledgePoolsConfig + /** Cloudflare Vectorize binding for vector storage */ + binding: CloudflareVectorizeBinding +} + +/** + * Create a Cloudflare Vectorize integration for payloadcms-vectorize + * + * @param options Configuration object with knowledge pools and Vectorize binding + * @returns Object containing the DbAdapter instance + * + * @example + * ```typescript + * import { createCloudflareVectorizeIntegration } from '@payloadcms-vectorize/cf' + * + * const { adapter } = createCloudflareVectorizeIntegration({ + * config: { + * default: { + * dims: 384, + * }, + * }, + * binding: env.VECTORIZE, + * }) + * ``` + */ +export const createCloudflareVectorizeIntegration = ( + options: CloudflareVectorizeConfig, +): { adapter: DbAdapter } => { + if (!options.binding) { + throw new Error('[@payloadcms-vectorize/cf] Cloudflare Vectorize binding is required') + } + + const poolConfig = options.config + + const adapter: DbAdapter = { + getConfigExtension: () => { + return { + collections: { + [CF_MAPPINGS_SLUG]: cfMappingsCollection, + }, + custom: { + _poolConfigs: poolConfig, + _vectorizeBinding: options.binding, + }, + } + }, + + search, + + storeChunk: embed, + + deleteChunks: async (payload, poolName, sourceCollection, docId) => { + const vectorizeBinding = getVectorizeBinding(payload) + + try { + // Paginate through all mapping rows for this document+pool + const allVectorIds: string[] = [] + let page = 1 + let hasNextPage = true + + while (hasNextPage) { + const mappings = await payload.find({ + collection: CF_MAPPINGS_SLUG as CollectionSlug, + where: { + and: [ + { poolName: { equals: poolName } }, + { sourceCollection: { equals: sourceCollection } }, + { docId: { equals: docId } }, + ], + }, + page, + }) + + for (const mapping of mappings.docs) { + allVectorIds.push((mapping as Record).vectorId as string) + } + + hasNextPage = mappings.hasNextPage + page++ + } + + if (allVectorIds.length === 0) { + return + } + // Delete vectors from Cloudflare Vectorize + await vectorizeBinding.deleteByIds(allVectorIds) + // Delete mapping rows + await payload.delete({ + collection: CF_MAPPINGS_SLUG as CollectionSlug, + where: { + and: [ + { poolName: { equals: poolName } }, + { sourceCollection: { equals: sourceCollection } }, + { docId: { equals: docId } }, + ], + }, + }) + } catch (error) { + const errorMessage = error instanceof Error ? error.message : String(error) + payload.logger.error( + `[@payloadcms-vectorize/cf] Failed to delete embeddings: ${errorMessage}`, + ) + throw new Error(`[@payloadcms-vectorize/cf] Failed to delete embeddings: ${errorMessage}`) + } + }, + + hasEmbeddingVersion: async (payload, poolName, sourceCollection, docId, _embeddingVersion) => { + const result = await payload.find({ + collection: CF_MAPPINGS_SLUG as CollectionSlug, + where: { + and: [ + { poolName: { equals: poolName } }, + { sourceCollection: { equals: sourceCollection } }, + { docId: { equals: docId } }, + ], + }, + limit: 1, + }) + return result.totalDocs > 0 + }, + } + + return { adapter } +} + +export { CF_MAPPINGS_SLUG } from './collections/cfMappings.js' +export type { CloudflareVectorizeBinding, KnowledgePoolsConfig } +export type { KnowledgePoolsConfig as KnowledgePoolConfig } diff --git a/adapters/cf/src/search.ts b/adapters/cf/src/search.ts new file mode 100644 index 0000000..f0fcaf7 --- /dev/null +++ b/adapters/cf/src/search.ts @@ -0,0 +1,194 @@ +import { BasePayload, Where } from 'payload' +import { KnowledgePoolName, VectorSearchResult } from 'payloadcms-vectorize' +import { getVectorizeBinding } from './types.js' + +export default async ( + payload: BasePayload, + queryEmbedding: number[], + poolName: KnowledgePoolName, + limit: number = 10, + where?: Where, +): Promise> => { + const vectorizeBinding = getVectorizeBinding(payload) + + try { + const queryOptions: Record = { + topK: limit, + returnMetadata: 'all' as const, + } + + let postFilter: Where | null = null + + if (where) { + const split = splitWhere(where) + if (split.nativeFilter && Object.keys(split.nativeFilter).length > 0) { + queryOptions.filter = split.nativeFilter + } + postFilter = split.postFilter + } + + const results = await vectorizeBinding.query(queryEmbedding, queryOptions) + + if (!results.matches) { + return [] + } + + const RESERVED_METADATA = ['sourceCollection', 'docId', 'chunkIndex', 'chunkText', 'embeddingVersion'] + + let searchResults: VectorSearchResult[] = results.matches.map((match) => { + const metadata = match.metadata || {} + const extensionFields = Object.fromEntries( + Object.entries(metadata).filter(([k]) => !RESERVED_METADATA.includes(k)) + ) + return { + id: match.id, + score: match.score || 0, + sourceCollection: String(metadata.sourceCollection || ''), + docId: String(metadata.docId || ''), + chunkIndex: typeof metadata.chunkIndex === 'number' ? metadata.chunkIndex : parseInt(String(metadata.chunkIndex || '0'), 10), + chunkText: String(metadata.chunkText || ''), + embeddingVersion: String(metadata.embeddingVersion || ''), + ...extensionFields, + } + }) + + if (postFilter) { + searchResults = searchResults.filter((r) => matchesPostFilter(r, postFilter!)) + } + + return searchResults + } catch (e) { + const errorMessage = e instanceof Error ? e.message : String(e) + payload.logger.error(`[@payloadcms-vectorize/cf] Search failed: ${errorMessage}`) + throw new Error(`[@payloadcms-vectorize/cf] Search failed: ${errorMessage}`) + } +} + +export type VectorizeFilter = Record> + +export interface FilterSplit { + nativeFilter: VectorizeFilter | null + postFilter: Where | null +} + +const NATIVE_OPERATOR_MAP: Record = { + equals: '$eq', + not_equals: '$ne', + notEquals: '$ne', + in: '$in', + not_in: '$nin', + notIn: '$nin', + greater_than: '$gt', + greaterThan: '$gt', + greater_than_equal: '$gte', + greaterThanEqual: '$gte', + less_than: '$lt', + lessThan: '$lt', + less_than_equal: '$lte', + lessThanEqual: '$lte', +} + +export function splitWhere(where: Where): FilterSplit { + const nativeFilter: VectorizeFilter = {} + const postFilterClauses: Where[] = [] + + if ('and' in where && Array.isArray(where.and)) { + for (const clause of where.and) { + const split = splitWhere(clause) + if (split.nativeFilter) { + Object.assign(nativeFilter, split.nativeFilter) + } + if (split.postFilter) { + postFilterClauses.push(split.postFilter) + } + } + return { + nativeFilter: Object.keys(nativeFilter).length > 0 ? nativeFilter : null, + postFilter: postFilterClauses.length > 0 + ? (postFilterClauses.length === 1 ? postFilterClauses[0] : { and: postFilterClauses }) + : null, + } + } + + if ('or' in where && Array.isArray(where.or)) { + return { nativeFilter: null, postFilter: where } + } + + for (const [fieldName, condition] of Object.entries(where)) { + if (fieldName === 'and' || fieldName === 'or') continue + if (typeof condition !== 'object' || condition === null || Array.isArray(condition)) continue + + const cond = condition as Record + let handled = false + + for (const [payloadOp, cfOp] of Object.entries(NATIVE_OPERATOR_MAP)) { + if (payloadOp in cond) { + nativeFilter[fieldName] = { [cfOp]: cond[payloadOp] } + handled = true + break + } + } + + if (!handled) { + postFilterClauses.push({ [fieldName]: condition } as Where) + } + } + + return { + nativeFilter: Object.keys(nativeFilter).length > 0 ? nativeFilter : null, + postFilter: postFilterClauses.length > 0 + ? (postFilterClauses.length === 1 ? postFilterClauses[0] : { and: postFilterClauses }) + : null, + } +} + +export function matchesPostFilter(doc: Record, where: Where): boolean { + if (!where || Object.keys(where).length === 0) return true + + if ('and' in where && Array.isArray(where.and)) { + return where.and.every((clause: Where) => matchesPostFilter(doc, clause)) + } + + if ('or' in where && Array.isArray(where.or)) { + return where.or.some((clause: Where) => matchesPostFilter(doc, clause)) + } + + for (const [field, condition] of Object.entries(where)) { + if (field === 'and' || field === 'or') continue + if (typeof condition !== 'object' || condition === null) continue + + const value = doc[field] + const cond = condition as Record + + if ('like' in cond && typeof cond.like === 'string') { + const pattern = String(cond.like).replace(/%/g, '.*') + if (!new RegExp(`^${pattern}$`, 'i').test(String(value ?? ''))) return false + } + + if ('contains' in cond && typeof cond.contains === 'string') { + if (!String(value ?? '').toLowerCase().includes(String(cond.contains).toLowerCase())) return false + } + + if ('exists' in cond && typeof cond.exists === 'boolean') { + const exists = value !== undefined && value !== null + if (cond.exists !== exists) return false + } + + if ('equals' in cond && value !== cond.equals) return false + if ('not_equals' in cond && value === cond.not_equals) return false + if ('notEquals' in cond && value === cond.notEquals) return false + if ('in' in cond && Array.isArray(cond.in) && !cond.in.includes(value)) return false + if ('not_in' in cond && Array.isArray(cond.not_in) && cond.not_in.includes(value)) return false + if ('notIn' in cond && Array.isArray(cond.notIn) && (cond.notIn as any[]).includes(value)) return false + if ('greater_than' in cond && !(value > (cond.greater_than as any))) return false + if ('greaterThan' in cond && !(value > (cond.greaterThan as any))) return false + if ('greater_than_equal' in cond && !(value >= (cond.greater_than_equal as any))) return false + if ('greaterThanEqual' in cond && !(value >= (cond.greaterThanEqual as any))) return false + if ('less_than' in cond && !(value < (cond.less_than as any))) return false + if ('lessThan' in cond && !(value < (cond.lessThan as any))) return false + if ('less_than_equal' in cond && !(value <= (cond.less_than_equal as any))) return false + if ('lessThanEqual' in cond && !(value <= (cond.lessThanEqual as any))) return false + } + + return true +} diff --git a/adapters/cf/src/types.ts b/adapters/cf/src/types.ts new file mode 100644 index 0000000..92786d4 --- /dev/null +++ b/adapters/cf/src/types.ts @@ -0,0 +1,65 @@ +import type { BasePayload } from 'payload' +import { getVectorizedPayload } from 'payloadcms-vectorize' + +/** + * Retrieve the Cloudflare Vectorize binding from a Payload instance. + * Throws if the binding is not found. + */ +export function getVectorizeBinding(payload: BasePayload): CloudflareVectorizeBinding { + const binding = getVectorizedPayload(payload)?.getDbAdapterCustom() + ?._vectorizeBinding as CloudflareVectorizeBinding | undefined + if (!binding) { + throw new Error('[@payloadcms-vectorize/cf] Cloudflare Vectorize binding not found') + } + return binding +} + +/** + * Configuration for a knowledge pool in Cloudflare Vectorize + */ +export interface CloudflareVectorizePoolConfig { + /** Vector dimensions for this pool (must match embedding model output) */ + dims: number +} + +/** + * All knowledge pools configuration for Cloudflare Vectorize + */ +export type KnowledgePoolsConfig = Record + +/** A single vector match returned by a Vectorize query */ +export interface VectorizeMatch { + id: string + score?: number + metadata?: Record +} + +/** Result of a Vectorize query */ +export interface VectorizeQueryResult { + matches: VectorizeMatch[] + count: number +} + +/** Vector to upsert into Vectorize */ +export interface VectorizeVector { + id: string + values: number[] + metadata?: Record +} + +/** + * Cloudflare Vectorize binding interface. + * Mirrors the subset of the Vectorize API we use. + * For the full type, install `@cloudflare/workers-types`. + */ +export interface CloudflareVectorizeBinding { + query(vector: number[], options?: { + topK?: number + returnMetadata?: boolean | 'indexed' | 'all' + filter?: Record + /** Vectorize metadata filtering */ + where?: Record + }): Promise + upsert(vectors: VectorizeVector[]): Promise + deleteByIds(ids: string[]): Promise +} diff --git a/adapters/cf/vitest.config.ts b/adapters/cf/vitest.config.ts new file mode 100644 index 0000000..8baf261 --- /dev/null +++ b/adapters/cf/vitest.config.ts @@ -0,0 +1,38 @@ +import path from 'path' +import { loadEnv } from 'payload/node' +import { fileURLToPath } from 'url' +import tsconfigPaths from 'vite-tsconfig-paths' +import { defineConfig } from 'vitest/config' + +const filename = fileURLToPath(import.meta.url) +const dirname = path.dirname(filename) + +export default defineConfig(() => { + loadEnv(path.resolve(dirname, '../../dev')) + + return { + plugins: [ + tsconfigPaths({ + ignoreConfigErrors: true, + }), + ], + resolve: { + alias: { + '@shared-test/utils': path.resolve(dirname, '../../dev/specs/utils.ts'), + '@shared-test/helpers/chunkers': path.resolve(dirname, '../../dev/helpers/chunkers.ts'), + '@shared-test/helpers/embed': path.resolve(dirname, '../../dev/helpers/embed.ts'), + '@shared-test/constants': path.resolve(dirname, '../../dev/specs/constants.ts'), + }, + }, + test: { + environment: 'node', + hookTimeout: 30_000, + testTimeout: 30_000, + include: ['dev/specs/**/*.spec.ts'], + exclude: ['**/e2e.spec.{ts,js}', '**/node_modules/**'], + // Run test files sequentially to avoid global state interference + // (embeddingsTables map and Payload instance caching) + fileParallelism: false, + }, + } +}) diff --git a/adapters/pg/dev/specs/vectorSearchWhere.spec.ts b/adapters/pg/dev/specs/vectorSearchWhere.spec.ts new file mode 100644 index 0000000..c019df3 --- /dev/null +++ b/adapters/pg/dev/specs/vectorSearchWhere.spec.ts @@ -0,0 +1,526 @@ +import type { Payload } from 'payload' +import { afterAll, beforeAll, describe, expect, test } from 'vitest' +import { postgresAdapter } from '@payloadcms/db-postgres' +import { buildDummyConfig, DIMS, integration, plugin } from './constants.js' +import { createTestDb, destroyPayload, waitForVectorizationJobs } from './utils.js' +import { getPayload } from 'payload' +import { makeDummyEmbedDocs, makeDummyEmbedQuery, testEmbeddingVersion } from '@shared-test/helpers/embed' +import { createVectorSearchHandlers } from '@shared-test/endpoints/vectorSearch' +import type { KnowledgePoolDynamicConfig, VectorSearchResult } from 'payloadcms-vectorize' + +async function performVectorSearch( + payload: Payload, + query: string, + knowledgePool: string = 'default', + where?: any, + limit: number = 100, +): Promise { + const knowledgePools: Record = { + default: { + collections: {}, + embeddingConfig: { + version: testEmbeddingVersion, + queryFn: makeDummyEmbedQuery(DIMS), + realTimeIngestionFn: makeDummyEmbedDocs(DIMS), + }, + }, + } + const searchHandler = createVectorSearchHandlers(knowledgePools, integration.adapter).requestHandler + + const mockRequest = { + json: async () => ({ + query, + knowledgePool, + ...(where ? { where } : {}), + limit, + }), + payload, + } as any + + const response = await searchHandler(mockRequest) + const json = await response.json() + + if (response.status !== 200) { + throw new Error(`Search failed: ${json.error}`) + } + + return json.results +} + +describe('PG adapter - WHERE clause operators', () => { + let payload: Payload + const dbName = 'pg_where_clause_test' + + beforeAll(async () => { + await createTestDb({ dbName }) + + const config = await buildDummyConfig({ + jobs: { + tasks: [], + autoRun: [{ cron: '*/5 * * * * *', limit: 10 }], + }, + collections: [ + { + slug: 'articles', + fields: [ + { name: 'title', type: 'text' }, + { name: 'status', type: 'text' }, + { name: 'category', type: 'text' }, + { name: 'views', type: 'number' }, + { name: 'rating', type: 'number' }, + { name: 'published', type: 'checkbox' }, + { name: 'tags', type: 'text' }, + ], + }, + ], + db: postgresAdapter({ + extensions: ['vector'], + afterSchemaInit: [integration.afterSchemaInitHook], + pool: { + connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, + }, + }), + plugins: [ + plugin({ + knowledgePools: { + default: { + collections: { + articles: { + toKnowledgePool: async (doc) => { + if (!doc.title) return [] + return [{ + chunk: doc.title, + status: doc.status || 'draft', + category: doc.category || 'general', + views: doc.views ?? 0, + rating: doc.rating ?? 0, + published: doc.published ?? false, + tags: doc.tags || 'none', + }] + }, + }, + }, + extensionFields: [ + { name: 'status', type: 'text' }, + { name: 'category', type: 'text' }, + { name: 'views', type: 'number' }, + { name: 'rating', type: 'number' }, + { name: 'published', type: 'checkbox' }, + { name: 'tags', type: 'text' }, + ], + embeddingConfig: { + version: testEmbeddingVersion, + queryFn: makeDummyEmbedQuery(DIMS), + realTimeIngestionFn: makeDummyEmbedDocs(DIMS), + }, + }, + }, + }), + ], + }) + + payload = await getPayload({ + config, + key: `pg-where-test-${Date.now()}`, + cron: true, + }) + + await payload.create({ + collection: 'articles', + data: { + title: 'Published Tech Article', + status: 'published', + category: 'tech', + views: 150, + rating: 4.5, + published: true, + tags: 'javascript,nodejs,programming', + }, + }) + await payload.create({ + collection: 'articles', + data: { + title: 'Draft Tech Article', + status: 'draft', + category: 'tech', + views: 0, + rating: 0, + published: false, + tags: 'javascript', + }, + }) + await payload.create({ + collection: 'articles', + data: { + title: 'Published Design Article', + status: 'published', + category: 'design', + views: 300, + rating: 4.8, + published: true, + tags: 'ui,design,ux', + }, + }) + await payload.create({ + collection: 'articles', + data: { + title: 'Archived Tech Article', + status: 'archived', + category: 'tech', + views: 50, + rating: 3.5, + published: false, + tags: 'python,legacy', + }, + }) + + await waitForVectorizationJobs(payload) + }) + + afterAll(async () => { + await destroyPayload(payload) + }) + + describe('equals operator', () => { + test('filters by exact text match', async () => { + const results = await performVectorSearch(payload, 'Article', 'default', { + status: { equals: 'published' }, + }) + expect(results.length).toBeGreaterThan(0) + results.forEach((r) => expect(r.status).toBe('published')) + }) + + test('returns empty results when no match', async () => { + const results = await performVectorSearch(payload, 'Article', 'default', { + status: { equals: 'nonexistent' }, + }) + expect(results).toEqual([]) + }) + }) + + describe('not_equals / notEquals operator', () => { + test('filters by non-equal text match', async () => { + const results = await performVectorSearch(payload, 'Article', 'default', { + status: { not_equals: 'draft' }, + }) + expect(results.length).toBeGreaterThan(0) + results.forEach((r) => expect(r.status).not.toBe('draft')) + }) + + test('notEquals variant works', async () => { + const results = await performVectorSearch(payload, 'Article', 'default', { + status: { notEquals: 'archived' }, + }) + expect(results.length).toBeGreaterThan(0) + results.forEach((r) => expect(r.status).not.toBe('archived')) + }) + }) + + describe('in / notIn operators', () => { + test('filters by multiple allowed values', async () => { + const results = await performVectorSearch(payload, 'Article', 'default', { + status: { in: ['published', 'draft'] }, + }) + expect(results.length).toBeGreaterThan(0) + results.forEach((r) => expect(['published', 'draft']).toContain(r.status)) + }) + + test('filters by excluded values', async () => { + const results = await performVectorSearch(payload, 'Article', 'default', { + status: { not_in: ['draft', 'archived'] }, + }) + expect(results.length).toBeGreaterThan(0) + results.forEach((r) => expect(['draft', 'archived']).not.toContain(r.status)) + }) + + test('notIn variant works', async () => { + const results = await performVectorSearch(payload, 'Article', 'default', { + status: { notIn: ['archived'] }, + }) + expect(results.length).toBeGreaterThan(0) + results.forEach((r) => expect(r.status).not.toBe('archived')) + }) + }) + + describe('like / contains operators', () => { + test('filters by substring match with like', async () => { + const results = await performVectorSearch(payload, 'Article', 'default', { + tags: { like: '%javascript%' }, + }) + expect(results.length).toBeGreaterThan(0) + results.forEach((r) => expect(r.tags).toContain('javascript')) + }) + + test('filters by substring match with contains', async () => { + const results = await performVectorSearch(payload, 'Article', 'default', { + category: { contains: 'tech' }, + }) + expect(results.length).toBeGreaterThan(0) + results.forEach((r) => expect(r.category).toContain('tech')) + }) + }) + + describe('comparison operators (numbers)', () => { + test('greater_than filters numeric fields', async () => { + const results = await performVectorSearch(payload, 'Article', 'default', { + views: { greater_than: 100 }, + }) + expect(results.length).toBeGreaterThan(0) + results.forEach((r) => expect(r.views).toBeGreaterThan(100)) + }) + + test('greaterThan variant works', async () => { + const results = await performVectorSearch(payload, 'Article', 'default', { + views: { greaterThan: 100 }, + }) + expect(results.length).toBeGreaterThan(0) + results.forEach((r) => expect(r.views).toBeGreaterThan(100)) + }) + + test('greater_than_equal filters inclusive', async () => { + const results = await performVectorSearch(payload, 'Article', 'default', { + views: { greater_than_equal: 150 }, + }) + expect(results.length).toBeGreaterThan(0) + results.forEach((r) => expect(r.views).toBeGreaterThanOrEqual(150)) + }) + + test('less_than filters numeric fields', async () => { + const results = await performVectorSearch(payload, 'Article', 'default', { + views: { less_than: 200 }, + }) + expect(results.length).toBeGreaterThan(0) + results.forEach((r) => expect(r.views).toBeLessThan(200)) + }) + + test('less_than_equal filters inclusive', async () => { + const results = await performVectorSearch(payload, 'Article', 'default', { + views: { less_than_equal: 150 }, + }) + expect(results.length).toBeGreaterThan(0) + results.forEach((r) => expect(r.views).toBeLessThanOrEqual(150)) + }) + + test('lessThan variant works', async () => { + const results = await performVectorSearch(payload, 'Article', 'default', { + rating: { lessThan: 4.6 }, + }) + expect(results.length).toBeGreaterThan(0) + results.forEach((r) => expect(r.rating).toBeLessThan(4.6)) + }) + + test('range query combining greater and less than', async () => { + const results = await performVectorSearch(payload, 'Article', 'default', { + and: [ + { views: { greater_than: 50 } }, + { views: { less_than: 200 } }, + ], + }) + expect(results.length).toBeGreaterThan(0) + results.forEach((r) => { + expect(r.views).toBeGreaterThan(50) + expect(r.views).toBeLessThan(200) + }) + }) + }) + + describe('exists operator (null checks)', () => { + test('exists true filters non-null values', async () => { + const results = await performVectorSearch(payload, 'Article', 'default', { + category: { exists: true }, + }) + expect(results.length).toBeGreaterThan(0) + results.forEach((r) => { + expect(r.category).toBeDefined() + expect(r.category).not.toBeNull() + }) + }) + + test('exists false filters null values', async () => { + const results = await performVectorSearch(payload, 'Article', 'default', { + category: { exists: false }, + }) + results.forEach((r) => { + expect(r.category === null || r.category === undefined).toBe(true) + }) + }) + }) + + describe('AND operator', () => { + test('combines multiple text conditions', async () => { + const results = await performVectorSearch(payload, 'Article', 'default', { + and: [ + { status: { equals: 'published' } }, + { category: { equals: 'tech' } }, + ], + }) + expect(results.length).toBeGreaterThan(0) + results.forEach((r) => { + expect(r.status).toBe('published') + expect(r.category).toBe('tech') + }) + }) + + test('combines text and numeric conditions', async () => { + const results = await performVectorSearch(payload, 'Article', 'default', { + and: [ + { status: { equals: 'published' } }, + { views: { greater_than: 100 } }, + ], + }) + expect(results.length).toBeGreaterThan(0) + results.forEach((r) => { + expect(r.status).toBe('published') + expect(r.views).toBeGreaterThan(100) + }) + }) + + test('and with single condition', async () => { + const results = await performVectorSearch(payload, 'Article', 'default', { + and: [{ status: { equals: 'published' } }], + }) + expect(results.length).toBeGreaterThan(0) + results.forEach((r) => expect(r.status).toBe('published')) + }) + }) + + describe('OR operator', () => { + test('returns results matching any condition', async () => { + const results = await performVectorSearch(payload, 'Article', 'default', { + or: [ + { status: { equals: 'draft' } }, + { status: { equals: 'archived' } }, + ], + }) + expect(results.length).toBeGreaterThan(0) + results.forEach((r) => expect(['draft', 'archived']).toContain(r.status)) + }) + + test('or with numeric conditions', async () => { + const results = await performVectorSearch(payload, 'Article', 'default', { + or: [ + { views: { greater_than: 200 } }, + { rating: { greater_than: 4.7 } }, + ], + }) + expect(results.length).toBeGreaterThan(0) + results.forEach((r) => { + const matchesViews = r.views > 200 + const matchesRating = r.rating > 4.7 + expect(matchesViews || matchesRating).toBe(true) + }) + }) + + test('or with single condition', async () => { + const results = await performVectorSearch(payload, 'Article', 'default', { + or: [{ status: { equals: 'published' } }], + }) + expect(results.length).toBeGreaterThan(0) + results.forEach((r) => expect(r.status).toBe('published')) + }) + }) + + describe('complex nested logic', () => { + test('and/or combination: (published tech) OR (archived)', async () => { + const results = await performVectorSearch(payload, 'Article', 'default', { + or: [ + { + and: [ + { status: { equals: 'published' } }, + { category: { equals: 'tech' } }, + ], + }, + { status: { equals: 'archived' } }, + ], + }) + expect(results.length).toBeGreaterThan(0) + results.forEach((r) => { + const isPublishedTech = r.status === 'published' && r.category === 'tech' + const isArchived = r.status === 'archived' + expect(isPublishedTech || isArchived).toBe(true) + }) + }) + + test('multiple and conditions with negation', async () => { + const results = await performVectorSearch(payload, 'Article', 'default', { + and: [ + { status: { not_equals: 'draft' } }, + { category: { equals: 'tech' } }, + { views: { greater_than_equal: 0 } }, + ], + }) + expect(results.length).toBeGreaterThan(0) + results.forEach((r) => { + expect(r.status).not.toBe('draft') + expect(r.category).toBe('tech') + expect(r.views).toBeGreaterThanOrEqual(0) + }) + }) + + test('nested or within and', async () => { + const results = await performVectorSearch(payload, 'Article', 'default', { + and: [ + { + or: [ + { status: { equals: 'published' } }, + { status: { equals: 'draft' } }, + ], + }, + { views: { greater_than: 0 } }, + ], + }) + expect(results.length).toBeGreaterThan(0) + results.forEach((r) => { + expect(['published', 'draft']).toContain(r.status) + expect(r.views).toBeGreaterThan(0) + }) + }) + }) + + describe('edge cases', () => { + test('filter by docId (reserved field)', async () => { + const allResults = await performVectorSearch(payload, 'Article', 'default', {}) + expect(allResults.length).toBeGreaterThan(0) + + const targetDocId = allResults[0].docId + const results = await performVectorSearch(payload, 'Article', 'default', { + docId: { equals: targetDocId }, + }) + expect(results.length).toBeGreaterThan(0) + results.forEach((r) => expect(r.docId).toBe(targetDocId)) + }) + + test('filter by sourceCollection', async () => { + const results = await performVectorSearch(payload, 'Article', 'default', { + sourceCollection: { equals: 'articles' }, + }) + expect(results.length).toBeGreaterThan(0) + results.forEach((r) => expect(r.sourceCollection).toBe('articles')) + }) + }) + + describe('integration with limit and ordering', () => { + test('where clause combined with limit', async () => { + const results = await performVectorSearch( + payload, + 'Article', + 'default', + { status: { equals: 'published' } }, + 5, + ) + expect(results.length).toBeLessThanOrEqual(5) + results.forEach((r) => expect(r.status).toBe('published')) + }) + + test('where results are still ordered by relevance score', async () => { + const results = await performVectorSearch(payload, 'Article', 'default', { + category: { equals: 'tech' }, + }) + if (results.length > 1) { + for (let i = 0; i < results.length - 1; i++) { + expect(results[i].score).toBeGreaterThanOrEqual(results[i + 1].score) + } + } + }) + }) +}) diff --git a/adapters/pg/package.json b/adapters/pg/package.json index 4932f29..0c1c249 100644 --- a/adapters/pg/package.json +++ b/adapters/pg/package.json @@ -1,6 +1,6 @@ { "name": "@payloadcms-vectorize/pg", - "version": "0.7.0", + "version": "0.7.0-beta.1", "description": "PostgreSQL adapter for payloadcms-vectorize", "license": "MIT", "type": "module", @@ -11,7 +11,7 @@ "types": "./dist/index.d.ts", "peerDependencies": { "payload": ">=3.0.0 <4.0.0", - "payloadcms-vectorize": ">=0.7.0 <1.0.0", + "payloadcms-vectorize": ">=0.7.0-beta.1 <1.0.0", "@payloadcms/db-postgres": ">=3.0.0 <4.0.0" }, "devDependencies": { diff --git a/adapters/pg/src/index.ts b/adapters/pg/src/index.ts index 8b4242a..1f8b234 100644 --- a/adapters/pg/src/index.ts +++ b/adapters/pg/src/index.ts @@ -86,7 +86,52 @@ export const createPostgresVectorIntegration = ( } }, search, - storeEmbedding: embed, + + storeChunk: async (payload, poolName, data) => { + const embeddingArray = Array.isArray(data.embedding) ? data.embedding : Array.from(data.embedding) + + const created = await payload.create({ + collection: poolName as any, + data: { + sourceCollection: data.sourceCollection, + docId: data.docId, + chunkIndex: data.chunkIndex, + chunkText: data.chunkText, + embeddingVersion: data.embeddingVersion, + ...data.extensionFields, + embedding: embeddingArray, + }, + }) + + await embed(payload, poolName, data.sourceCollection, data.docId, String(created.id), embeddingArray) + }, + + deleteChunks: async (payload, poolName, sourceCollection, docId) => { + await payload.delete({ + collection: poolName as any, + where: { + and: [ + { sourceCollection: { equals: sourceCollection } }, + { docId: { equals: String(docId) } }, + ], + }, + }) + }, + + hasEmbeddingVersion: async (payload, poolName, sourceCollection, docId, embeddingVersion) => { + const existing = await payload.find({ + collection: poolName as any, + where: { + and: [ + { sourceCollection: { equals: sourceCollection } }, + { docId: { equals: String(docId) } }, + { embeddingVersion: { equals: embeddingVersion } }, + ], + }, + limit: 1, + }) + return existing.totalDocs > 0 + }, } return { afterSchemaInitHook, adapter } diff --git a/adapters/pg/vitest.config.js b/adapters/pg/vitest.config.js index 8baf261..383cb07 100644 --- a/adapters/pg/vitest.config.js +++ b/adapters/pg/vitest.config.js @@ -22,6 +22,7 @@ export default defineConfig(() => { '@shared-test/helpers/chunkers': path.resolve(dirname, '../../dev/helpers/chunkers.ts'), '@shared-test/helpers/embed': path.resolve(dirname, '../../dev/helpers/embed.ts'), '@shared-test/constants': path.resolve(dirname, '../../dev/specs/constants.ts'), + '@shared-test/endpoints/vectorSearch': path.resolve(dirname, '../../src/endpoints/vectorSearch.ts'), }, }, test: { diff --git a/dev/helpers/mockAdapter.ts b/dev/helpers/mockAdapter.ts index 3499457..c1dba0d 100644 --- a/dev/helpers/mockAdapter.ts +++ b/dev/helpers/mockAdapter.ts @@ -1,5 +1,5 @@ -import type { DbAdapter, KnowledgePoolName, VectorSearchResult } from 'payloadcms-vectorize' -import type { Payload, BasePayload, Where, Config } from 'payload' +import type { DbAdapter, KnowledgePoolName, StoreChunkData, VectorSearchResult } from 'payloadcms-vectorize' +import type { CollectionSlug, Payload, BasePayload, Where, Config } from 'payload' type StoredEmbedding = { poolName: string @@ -48,24 +48,86 @@ export const createMockAdapter = (options: MockAdapterOptions = {}): DbAdapter = custom: { _isMockAdapter: true, ...custom }, }), - storeEmbedding: async ( - _payload: Payload, + storeChunk: async ( + payload: Payload, poolName: KnowledgePoolName, - _sourceCollection: string, - _sourceDocId: string, - id: string, - embedding: number[] | Float32Array, + data: StoreChunkData, ): Promise => { - const key = `${poolName}:${id}` - const embeddingArray = Array.isArray(embedding) ? embedding : Array.from(embedding) + const embeddingArray = Array.isArray(data.embedding) ? data.embedding : Array.from(data.embedding) + + const created = await payload.create({ + collection: poolName as CollectionSlug, + data: { + sourceCollection: data.sourceCollection, + docId: data.docId, + chunkIndex: data.chunkIndex, + chunkText: data.chunkText, + embeddingVersion: data.embeddingVersion, + embedding: embeddingArray, + ...data.extensionFields, + }, + }) + const key = `${poolName}:${created.id}` storage.set(key, { poolName, - id, + id: String(created.id), embedding: embeddingArray, }) }, + deleteChunks: async ( + payload: Payload, + poolName: KnowledgePoolName, + sourceCollection: string, + docId: string, + ): Promise => { + for (const [key, stored] of storage) { + if (stored.poolName === poolName) { + try { + const doc = await payload.findByID({ + collection: poolName as CollectionSlug, + id: stored.id, + }) + if (doc && (doc as any).sourceCollection === sourceCollection && (doc as any).docId === docId) { + storage.delete(key) + } + } catch (_e) {} + } + } + + await payload.delete({ + collection: poolName as CollectionSlug, + where: { + and: [ + { sourceCollection: { equals: sourceCollection } }, + { docId: { equals: docId } }, + ], + }, + }) + }, + + hasEmbeddingVersion: async ( + payload: Payload, + poolName: KnowledgePoolName, + sourceCollection: string, + docId: string, + embeddingVersion: string, + ): Promise => { + const result = await payload.find({ + collection: poolName as CollectionSlug, + where: { + and: [ + { sourceCollection: { equals: sourceCollection } }, + { docId: { equals: docId } }, + { embeddingVersion: { equals: embeddingVersion } }, + ], + }, + limit: 1, + }) + return result.totalDocs > 0 + }, + search: async ( payload: BasePayload, queryEmbedding: number[], @@ -125,40 +187,50 @@ export const createMockAdapter = (options: MockAdapterOptions = {}): DbAdapter = } } -/** - * Simple WHERE clause matcher for basic filtering - * Supports: equals, in, exists, and, or - */ function matchesWhere(doc: Record, where: Where): boolean { if (!where || Object.keys(where).length === 0) return true - // Handle 'and' operator if ('and' in where && Array.isArray(where.and)) { return where.and.every((clause: Where) => matchesWhere(doc, clause)) } - // Handle 'or' operator if ('or' in where && Array.isArray(where.or)) { return where.or.some((clause: Where) => matchesWhere(doc, clause)) } - // Handle field-level conditions for (const [field, condition] of Object.entries(where)) { if (field === 'and' || field === 'or') continue + if (typeof condition !== 'object' || condition === null || Array.isArray(condition)) continue const value = doc[field] + const cond = condition as Record - if (typeof condition === 'object' && condition !== null) { - if ('equals' in condition && value !== condition.equals) { - return false - } - if ('in' in condition && Array.isArray(condition.in) && !condition.in.includes(value)) { - return false - } - if ('exists' in condition) { - const exists = value !== undefined && value !== null - if (condition.exists !== exists) return false - } + if ('equals' in cond && value !== cond.equals) return false + if ('not_equals' in cond && value === cond.not_equals) return false + if ('notEquals' in cond && value === cond.notEquals) return false + if ('in' in cond && Array.isArray(cond.in)) { + if (cond.in.length === 0 || !cond.in.includes(value)) return false + } + if ('not_in' in cond && Array.isArray(cond.not_in) && cond.not_in.includes(value)) return false + if ('notIn' in cond && Array.isArray(cond.notIn) && (cond.notIn as any[]).includes(value)) return false + if ('like' in cond && typeof cond.like === 'string') { + const pattern = String(cond.like).replace(/%/g, '.*') + if (!new RegExp(`^${pattern}$`).test(String(value ?? ''))) return false + } + if ('contains' in cond && typeof cond.contains === 'string') { + if (!String(value ?? '').includes(String(cond.contains))) return false + } + if ('greater_than' in cond && !(value > (cond.greater_than as any))) return false + if ('greaterThan' in cond && !(value > (cond.greaterThan as any))) return false + if ('greater_than_equal' in cond && !(value >= (cond.greater_than_equal as any))) return false + if ('greaterThanEqual' in cond && !(value >= (cond.greaterThanEqual as any))) return false + if ('less_than' in cond && !(value < (cond.less_than as any))) return false + if ('lessThan' in cond && !(value < (cond.lessThan as any))) return false + if ('less_than_equal' in cond && !(value <= (cond.less_than_equal as any))) return false + if ('lessThanEqual' in cond && !(value <= (cond.lessThanEqual as any))) return false + if ('exists' in cond && typeof cond.exists === 'boolean') { + const exists = value !== undefined && value !== null + if (cond.exists !== exists) return false } } diff --git a/docs/plans/2026-03-06-adapter-owns-storage-design.md b/docs/plans/2026-03-06-adapter-owns-storage-design.md new file mode 100644 index 0000000..3eead0b --- /dev/null +++ b/docs/plans/2026-03-06-adapter-owns-storage-design.md @@ -0,0 +1,114 @@ +# Design: Adapter Owns Storage + +## Problem + +The main plugin assumes all adapters use a PayloadCMS collection for embedding storage. It calls `payload.create()` and `payload.delete()` on the embeddings collection directly, then calls `adapter.storeEmbedding()` as a second step. This leaks storage concerns into adapter-agnostic code. + +For the PG adapter this works — it just updates a vector column on the row the main plugin created. For the CF adapter, the PayloadCMS row is dead weight — CF stores vectors in Cloudflare Vectorize, a separate service. The embeddings collection row is never read by CF search. + +Additionally, the CF adapter's search has broken metadata filtering: +- Only supports 3 of 11 WHERE operators (equals, in, exists) +- Post-filters after topK instead of using Vectorize's native filter parameter +- topK is capped at 100 (or 20 with metadata), making post-filtering unviable +- Fetches from the main collection instead of embeddings-specific data + +## Solution + +Make the adapter responsible for all chunk storage, deletion, and version checking. The main plugin delegates these operations entirely. + +## New DbAdapter API + +```ts +type DbAdapter = { + getConfigExtension: (config: Config) => { + bins?: { key: string; scriptPath: string }[] + custom?: Record + collections?: Record + } + + storeChunk: ( + payload: Payload, + poolName: string, + data: { + sourceCollection: string + docId: string + chunkIndex: number + chunkText: string + embeddingVersion: string + embedding: number[] | Float32Array + extensionFields: Record + }, + ) => Promise + + deleteChunks: ( + payload: Payload, + poolName: string, + sourceCollection: string, + docId: string, + ) => Promise + + hasEmbeddingVersion: ( + payload: Payload, + poolName: string, + sourceCollection: string, + docId: string, + embeddingVersion: string, + ) => Promise + + search: ( + payload: BasePayload, + queryEmbedding: number[], + poolName: KnowledgePoolName, + limit?: number, + where?: Where, + ) => Promise> +} +``` + +Removed: `storeEmbedding`, `deleteEmbeddings` + +## PG Adapter Changes + +- `storeChunk`: `payload.create()` on embeddings collection + UPDATE vector column (combines current two-step flow) +- `deleteChunks`: `payload.delete()` on embeddings collection (moved from main plugin's `deleteDocumentEmbeddings`) +- `hasEmbeddingVersion`: `payload.find()` on embeddings collection (moved from main plugin's `docHasEmbeddingVersion`) +- `search`: unchanged + +## CF Adapter Changes + +- `storeChunk`: upserts to Vectorize with metadata on the vector + creates cfMapping row. No embeddings collection. +- `deleteChunks`: existing flow — query cfMappings, `deleteByIds` from Vectorize, delete cfMappings +- `hasEmbeddingVersion`: queries Vectorize by metadata filter for docId + embeddingVersion +- `search`: converts Payload `where` to Vectorize `filter` format, queries with `returnMetadata: "all"`, builds results from vector metadata. Post-filters only for `like`, `contains`, `exists` (no native CF equivalent). + +## Main Plugin Changes + +- `src/tasks/vectorize.ts`: remove `payload.create()`, call `adapter.storeChunk()` +- `src/tasks/bulkEmbedAll.ts`: replace `payload.create()` + `adapter.storeEmbedding()` with `adapter.storeChunk()`. Replace `docHasEmbeddingVersion()` with `adapter.hasEmbeddingVersion()` +- `src/utils/deleteDocumentEmbeddings.ts`: replace both steps with `adapter.deleteChunks()` +- Embeddings collection creation moves to adapter's `getConfigExtension` (PG provides it, CF doesn't need it) + +## CF Vectorize Filter Mapping + +| Payload Operator | Vectorize Filter | Implementation | +|---|---|---| +| equals | $eq | Native filter | +| not_equals | $ne | Native filter | +| in | $in | Native filter | +| notIn | $nin | Native filter | +| greater_than | $gt | Native filter | +| greater_than_equal | $gte | Native filter | +| less_than | $lt | Native filter | +| less_than_equal | $lte | Native filter | +| like | — | JS post-filter | +| contains | — | JS post-filter | +| exists | — | JS post-filter | + +## CF Adapter Limitations (for README) + +- String metadata indexed only for first 64 bytes (truncated at UTF-8 boundaries) +- `like`, `contains`, `exists` operators applied post-query, constrained by topK +- topK max 20 when returning metadata, 100 without +- Range queries on ~10M+ vectors may have reduced accuracy +- Filter objects must be under 2048 bytes JSON-encoded +- Metadata indexes must exist before vectors are inserted diff --git a/docs/plans/2026-03-06-adapter-owns-storage.md b/docs/plans/2026-03-06-adapter-owns-storage.md new file mode 100644 index 0000000..ada420d --- /dev/null +++ b/docs/plans/2026-03-06-adapter-owns-storage.md @@ -0,0 +1,1023 @@ +# Adapter Owns Storage — Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Move chunk storage, deletion, and version checking from the main plugin into the DbAdapter interface. Fix CF adapter's broken metadata filtering by using Vectorize's native filter parameter. + +**Architecture:** The `DbAdapter` interface gains three new methods (`storeChunk`, `deleteChunks`, `hasEmbeddingVersion`) and loses two (`storeEmbedding`, `deleteEmbeddings`). The main plugin stops calling `payload.create()` / `payload.delete()` on the embeddings collection directly — adapters own that. The embeddings collection creation moves from the main plugin into PG adapter's `getConfigExtension`. CF adapter stores metadata on Vectorize vectors and uses native filtering. + +**Tech Stack:** TypeScript, Payload CMS 3.x, Drizzle ORM (PG), Cloudflare Vectorize API + +--- + +### Task 1: Update DbAdapter type in src/types.ts + +**Files:** +- Modify: `src/types.ts:374-406` + +**Step 1: Update the DbAdapter type** + +Replace the current `storeEmbedding` and `deleteEmbeddings` with: + +```typescript +export type StoreChunkData = { + sourceCollection: string + docId: string + chunkIndex: number + chunkText: string + embeddingVersion: string + embedding: number[] | Float32Array + extensionFields: Record +} + +export type DbAdapter = { + getConfigExtension: (payloadCmsConfig: Config) => { + bins?: { key: string; scriptPath: string }[] + custom?: Record + collections?: Record + } + storeChunk: ( + payload: Payload, + poolName: KnowledgePoolName, + data: StoreChunkData, + ) => Promise + deleteChunks: ( + payload: Payload, + poolName: KnowledgePoolName, + sourceCollection: string, + docId: string, + ) => Promise + hasEmbeddingVersion: ( + payload: Payload, + poolName: KnowledgePoolName, + sourceCollection: string, + docId: string, + embeddingVersion: string, + ) => Promise + search: ( + payload: BasePayload, + queryEmbedding: number[], + poolName: KnowledgePoolName, + limit?: number, + where?: Where, + ) => Promise> +} +``` + +**Step 2: Verify types compile** + +Run: `pnpm build:types:all` +Expected: Type errors in files that still reference `storeEmbedding` / `deleteEmbeddings` — this is expected and will be fixed in subsequent tasks. + +**Step 3: Commit** + +```bash +git add src/types.ts +git commit -m "refactor: update DbAdapter type — storeChunk, deleteChunks, hasEmbeddingVersion" +``` + +--- + +### Task 2: Update mock adapter for tests + +**Files:** +- Modify: `dev/helpers/mockAdapter.ts` + +**Step 1: Update createMockAdapter to implement new interface** + +Replace `storeEmbedding` with `storeChunk`, add `deleteChunks` and `hasEmbeddingVersion`: + +```typescript +import type { DbAdapter, KnowledgePoolName, VectorSearchResult, StoreChunkData } from 'payloadcms-vectorize' +import type { Payload, BasePayload, CollectionSlug, Where, Config } from 'payload' + +type StoredEmbedding = { + poolName: string + id: string + embedding: number[] +} + +type MockAdapterOptions = { + bins?: { key: string; scriptPath: string }[] + custom?: Record +} + +function cosineSimilarity(a: number[], b: number[]): number { + if (a.length !== b.length) { + throw new Error(`Vector dimension mismatch: ${a.length} vs ${b.length}`) + } + let dot = 0 + let normA = 0 + let normB = 0 + for (let i = 0; i < a.length; i++) { + dot += a[i] * b[i] + normA += a[i] * a[i] + normB += b[i] * b[i] + } + if (normA === 0 || normB === 0) return 0 + return dot / (Math.sqrt(normA) * Math.sqrt(normB)) +} + +export const createMockAdapter = (options: MockAdapterOptions = {}): DbAdapter => { + const { bins = [], custom = {} } = options + const storage = new Map() + + return { + getConfigExtension: (_config: Config) => ({ + bins, + custom: { _isMockAdapter: true, ...custom }, + }), + + storeChunk: async ( + payload: Payload, + poolName: KnowledgePoolName, + data: StoreChunkData, + ): Promise => { + const embeddingArray = Array.isArray(data.embedding) ? data.embedding : Array.from(data.embedding) + + const created = await payload.create({ + collection: poolName as CollectionSlug, + data: { + sourceCollection: data.sourceCollection, + docId: data.docId, + chunkIndex: data.chunkIndex, + chunkText: data.chunkText, + embeddingVersion: data.embeddingVersion, + ...data.extensionFields, + embedding: embeddingArray, + }, + }) + + const key = `${poolName}:${String(created.id)}` + storage.set(key, { + poolName, + id: String(created.id), + embedding: embeddingArray, + }) + }, + + deleteChunks: async ( + payload: Payload, + poolName: KnowledgePoolName, + sourceCollection: string, + docId: string, + ): Promise => { + const existing = await payload.find({ + collection: poolName as CollectionSlug, + where: { + and: [ + { sourceCollection: { equals: sourceCollection } }, + { docId: { equals: String(docId) } }, + ], + }, + limit: 1000, + }) + for (const doc of existing.docs) { + storage.delete(`${poolName}:${String(doc.id)}`) + } + await payload.delete({ + collection: poolName as CollectionSlug, + where: { + and: [ + { sourceCollection: { equals: sourceCollection } }, + { docId: { equals: String(docId) } }, + ], + }, + }) + }, + + hasEmbeddingVersion: async ( + payload: Payload, + poolName: KnowledgePoolName, + sourceCollection: string, + docId: string, + embeddingVersion: string, + ): Promise => { + const existing = await payload.find({ + collection: poolName as CollectionSlug, + where: { + and: [ + { sourceCollection: { equals: sourceCollection } }, + { docId: { equals: String(docId) } }, + { embeddingVersion: { equals: embeddingVersion } }, + ], + }, + limit: 1, + }) + return existing.totalDocs > 0 + }, + + search: async ( + payload: BasePayload, + queryEmbedding: number[], + poolName: string, + limit: number = 10, + where?: Where, + ): Promise => { + const results: Array = [] + + for (const [_key, stored] of storage) { + if (stored.poolName !== poolName) continue + + const score = cosineSimilarity(queryEmbedding, stored.embedding) + + try { + const doc = await payload.findByID({ + collection: poolName as any, + id: stored.id, + }) + + if (doc) { + if (where && !matchesWhere(doc, where)) { + continue + } + + const { + id: _id, + createdAt: _createdAt, + updatedAt: _updatedAt, + embedding: _embedding, + ...docFields + } = doc as any + + results.push({ + id: stored.id, + score, + _score: score, + ...docFields, + }) + } + } catch (_e) { + // Document not found, skip + } + } + + return results + .sort((a, b) => b._score - a._score) + .slice(0, limit) + .map(({ _score, ...rest }) => rest) + }, + } +} + +function matchesWhere(doc: Record, where: Where): boolean { + if (!where || Object.keys(where).length === 0) return true + + if ('and' in where && Array.isArray(where.and)) { + return where.and.every((clause: Where) => matchesWhere(doc, clause)) + } + + if ('or' in where && Array.isArray(where.or)) { + return where.or.some((clause: Where) => matchesWhere(doc, clause)) + } + + for (const [field, condition] of Object.entries(where)) { + if (field === 'and' || field === 'or') continue + + const value = doc[field] + + if (typeof condition === 'object' && condition !== null) { + if ('equals' in condition && value !== condition.equals) { + return false + } + if ('in' in condition && Array.isArray(condition.in) && !condition.in.includes(value)) { + return false + } + if ('exists' in condition) { + const exists = value !== undefined && value !== null + if (condition.exists !== exists) return false + } + } + } + + return true +} +``` + +**Step 2: Commit** + +```bash +git add dev/helpers/mockAdapter.ts +git commit -m "refactor: update mock adapter to new DbAdapter interface" +``` + +--- + +### Task 3: Update main plugin — vectorize.ts + +**Files:** +- Modify: `src/tasks/vectorize.ts` + +**Step 1: Replace payload.create() + adapter.storeEmbedding() with adapter.storeChunk()** + +Replace lines 96-137 in `runVectorizeTask` with: + +```typescript + await adapter.deleteChunks(payload, poolName, collection, String(sourceDoc.id)) + + const chunkData = await toKnowledgePoolFn(sourceDoc, payload) + + validateChunkData(chunkData, String(sourceDoc.id), collection) + + const chunkTexts = chunkData.map((item) => item.chunk) + const vectors = await dynamicConfig.embeddingConfig.realTimeIngestionFn!(chunkTexts) + + await Promise.all( + vectors.map(async (vector, index) => { + const { chunk, ...extensionFields } = chunkData[index] + await adapter.storeChunk(payload, poolName, { + sourceCollection: collection, + docId: String(sourceDoc.id), + chunkIndex: index, + chunkText: chunk, + embeddingVersion, + embedding: vector, + extensionFields, + }) + }), + ) +``` + +Remove the import of `deleteDocumentEmbeddings`. + +**Step 2: Commit** + +```bash +git add src/tasks/vectorize.ts +git commit -m "refactor: vectorize task uses adapter.storeChunk and adapter.deleteChunks" +``` + +--- + +### Task 4: Update main plugin — deleteDocumentEmbeddings.ts + +**Files:** +- Modify: `src/utils/deleteDocumentEmbeddings.ts` + +**Step 1: Replace two-step deletion with adapter.deleteChunks()** + +```typescript +import type { Payload } from 'payload' +import type { DbAdapter, KnowledgePoolName } from '../types.js' + +export async function deleteDocumentEmbeddings(args: { + payload: Payload + poolName: KnowledgePoolName + collection: string + docId: string + adapter: DbAdapter +}): Promise { + const { payload, poolName, collection, docId, adapter } = args + await adapter.deleteChunks(payload, poolName, collection, String(docId)) +} +``` + +**Step 2: Commit** + +```bash +git add src/utils/deleteDocumentEmbeddings.ts +git commit -m "refactor: deleteDocumentEmbeddings delegates to adapter.deleteChunks" +``` + +--- + +### Task 5: Update main plugin — bulkEmbedAll.ts + +**Files:** +- Modify: `src/tasks/bulkEmbedAll.ts` + +**Step 1: Replace docHasEmbeddingVersion with adapter.hasEmbeddingVersion** + +In `streamAndBatchDocs` (around line 622), replace: + +```typescript + const hasCurrentEmbedding = await docHasEmbeddingVersion({ + payload, + poolName, + sourceCollection: collectionSlug, + docId: String(doc.id), + embeddingVersion, + }) +``` + +with: + +```typescript + const hasCurrentEmbedding = await adapter.hasEmbeddingVersion( + payload, + poolName, + collectionSlug, + String(doc.id), + embeddingVersion, + ) +``` + +This requires adding `adapter` to the `streamAndBatchDocs` args and passing it through from the task handler. + +**Step 2: Replace docHasEmbeddingVersion in pollAndCompleteSingleBatch (around line 882)** + +Same replacement pattern: + +```typescript + const hasCurrentEmbedding = await adapter.hasEmbeddingVersion( + payload, + poolName, + meta.sourceCollection, + meta.docId, + meta.embeddingVersion, + ) +``` + +**Step 3: Replace payload.create() + adapter.storeEmbedding() in pollAndCompleteSingleBatch (around line 907-927)** + +Replace: + +```typescript + const created = await payload.create({ + collection: poolName as CollectionSlug, + data: { + sourceCollection: meta.sourceCollection, + docId: String(meta.docId), + chunkIndex: meta.chunkIndex, + chunkText: meta.text, + embeddingVersion: meta.embeddingVersion, + ...(meta.extensionFields || {}), + embedding: embeddingArray, + }, + }) + + await adapter.storeEmbedding( + payload, + poolName, + meta.sourceCollection, + String(meta.docId), + String(created.id), + embeddingArray, + ) +``` + +with: + +```typescript + await adapter.storeChunk(payload, poolName, { + sourceCollection: meta.sourceCollection, + docId: String(meta.docId), + chunkIndex: meta.chunkIndex, + chunkText: meta.text, + embeddingVersion: meta.embeddingVersion, + embedding: embeddingArray, + extensionFields: (meta.extensionFields || {}) as Record, + }) +``` + +**Step 4: Remove the local docHasEmbeddingVersion function** (lines 942-962) + +**Step 5: Thread `adapter` through to `streamAndBatchDocs`** + +Add `adapter: DbAdapter` to the `streamAndBatchDocs` args type and pass it from `createPrepareBulkEmbeddingTask`. The prepare task needs `adapter` added to its factory args (same as `createPollOrCompleteSingleBatchTask` already has). + +Update `createPrepareBulkEmbeddingTask` signature: + +```typescript +export const createPrepareBulkEmbeddingTask = ({ + knowledgePools, + pollOrCompleteQueueName, + prepareBulkEmbedQueueName, + adapter, +}: { + knowledgePools: Record + pollOrCompleteQueueName?: string + prepareBulkEmbedQueueName?: string + adapter: DbAdapter +}): TaskConfig => { +``` + +**Step 6: Commit** + +```bash +git add src/tasks/bulkEmbedAll.ts +git commit -m "refactor: bulk embed uses adapter.storeChunk, deleteChunks, hasEmbeddingVersion" +``` + +--- + +### Task 6: Update main plugin — src/index.ts (pass adapter to prepare task, move embeddings collection) + +**Files:** +- Modify: `src/index.ts` + +**Step 1: Pass adapter to createPrepareBulkEmbeddingTask** + +Around line 174, add `adapter: pluginOptions.dbAdapter`: + +```typescript + const prepareBulkEmbedTask = createPrepareBulkEmbeddingTask({ + knowledgePools: pluginOptions.knowledgePools, + pollOrCompleteQueueName: pluginOptions.bulkQueueNames?.pollOrCompleteQueueName, + prepareBulkEmbedQueueName: pluginOptions.bulkQueueNames?.prepareBulkEmbedQueueName, + adapter: pluginOptions.dbAdapter, + }) +``` + +**Step 2: Commit** + +```bash +git add src/index.ts +git commit -m "refactor: pass adapter to prepare bulk embedding task" +``` + +--- + +### Task 7: Run existing tests to verify no regressions + +**Step 1: Build** + +Run: `pnpm build` +Expected: PASS (no type errors) + +**Step 2: Run core tests** + +Run: `pnpm test` (or the project's test command) +Expected: All existing tests pass. The mock adapter now handles `payload.create()` / `payload.delete()` internally, so test behavior should be identical. + +**Step 3: Commit if any fixes needed** + +--- + +### Task 8: Update PG adapter + +**Files:** +- Modify: `adapters/pg/src/index.ts` +- Modify: `adapters/pg/src/embed.ts` + +**Step 1: Add storeChunk, deleteChunks, hasEmbeddingVersion to PG adapter** + +In `adapters/pg/src/index.ts`, replace: + +```typescript + search, + storeEmbedding: embed, +``` + +with: + +```typescript + search, + + storeChunk: async (payload, poolName, data) => { + const embeddingArray = Array.isArray(data.embedding) ? data.embedding : Array.from(data.embedding) + + const created = await payload.create({ + collection: poolName as any, + data: { + sourceCollection: data.sourceCollection, + docId: data.docId, + chunkIndex: data.chunkIndex, + chunkText: data.chunkText, + embeddingVersion: data.embeddingVersion, + ...data.extensionFields, + embedding: embeddingArray, + }, + }) + + await embed(payload, poolName, data.sourceCollection, data.docId, String(created.id), embeddingArray) + }, + + deleteChunks: async (payload, poolName, sourceCollection, docId) => { + await payload.delete({ + collection: poolName as any, + where: { + and: [ + { sourceCollection: { equals: sourceCollection } }, + { docId: { equals: String(docId) } }, + ], + }, + }) + }, + + hasEmbeddingVersion: async (payload, poolName, sourceCollection, docId, embeddingVersion) => { + const existing = await payload.find({ + collection: poolName as any, + where: { + and: [ + { sourceCollection: { equals: sourceCollection } }, + { docId: { equals: String(docId) } }, + { embeddingVersion: { equals: embeddingVersion } }, + ], + }, + limit: 1, + }) + return existing.totalDocs > 0 + }, +``` + +**Step 2: Commit** + +```bash +git add adapters/pg/src/index.ts +git commit -m "refactor: PG adapter implements storeChunk, deleteChunks, hasEmbeddingVersion" +``` + +--- + +### Task 9: Update CF adapter — storeChunk with metadata + +**Files:** +- Modify: `adapters/cf/src/index.ts` +- Modify: `adapters/cf/src/embed.ts` + +**Step 1: Update embed.ts to accept and store metadata** + +```typescript +import { CollectionSlug, Payload } from 'payload' +import { getVectorizeBinding } from './types.js' +import { CF_MAPPINGS_SLUG } from './collections/cfMappings.js' +import type { StoreChunkData } from 'payloadcms-vectorize' + +export default async ( + payload: Payload, + poolName: string, + data: StoreChunkData, +) => { + const vectorizeBinding = getVectorizeBinding(payload) + + try { + const vector = Array.isArray(data.embedding) ? data.embedding : Array.from(data.embedding) + const id = `${poolName}:${data.sourceCollection}:${data.docId}:${data.chunkIndex}` + + await vectorizeBinding.upsert([ + { + id, + values: vector, + metadata: { + sourceCollection: data.sourceCollection, + docId: data.docId, + chunkIndex: data.chunkIndex, + chunkText: data.chunkText, + embeddingVersion: data.embeddingVersion, + ...data.extensionFields, + }, + }, + ]) + + await payload.create({ + collection: CF_MAPPINGS_SLUG as CollectionSlug, + data: { + vectorId: id, + poolName, + sourceCollection: data.sourceCollection, + docId: data.docId, + }, + }) + } catch (e) { + const errorMessage = e instanceof Error ? e.message : String(e) + payload.logger.error(`[@payloadcms-vectorize/cf] Failed to store embedding: ${errorMessage}`) + throw new Error(`[@payloadcms-vectorize/cf] Failed to store embedding: ${errorMessage}`) + } +} +``` + +**Step 2: Update index.ts — replace storeEmbedding, add deleteChunks and hasEmbeddingVersion** + +Replace `storeEmbedding: embed` with `storeChunk: embed` (embed now has the new signature). + +Replace `deleteEmbeddings` with `deleteChunks` (same logic, renamed). + +Add `hasEmbeddingVersion`: + +```typescript + hasEmbeddingVersion: async (payload, poolName, sourceCollection, docId, embeddingVersion) => { + const vectorizeBinding = getVectorizeBinding(payload) + const dummyVector = new Array(Object.values(poolConfig)[0]?.dims || 384).fill(0) + + const results = await vectorizeBinding.query(dummyVector, { + topK: 1, + returnMetadata: 'all', + filter: { + docId: { $eq: docId }, + sourceCollection: { $eq: sourceCollection }, + embeddingVersion: { $eq: embeddingVersion }, + }, + }) + + return (results.matches?.length ?? 0) > 0 + }, +``` + +**Step 3: Commit** + +```bash +git add adapters/cf/src/index.ts adapters/cf/src/embed.ts +git commit -m "refactor: CF adapter implements storeChunk with metadata, deleteChunks, hasEmbeddingVersion" +``` + +--- + +### Task 10: Update CF adapter — search with native filtering + +**Files:** +- Modify: `adapters/cf/src/search.ts` + +**Step 1: Rewrite search to use native Vectorize filtering and metadata** + +```typescript +import { BasePayload, Where } from 'payload' +import { KnowledgePoolName, VectorSearchResult } from 'payloadcms-vectorize' +import { getVectorizeBinding } from './types.js' + +export default async ( + payload: BasePayload, + queryEmbedding: number[], + poolName: KnowledgePoolName, + limit: number = 10, + where?: Where, +): Promise> => { + const vectorizeBinding = getVectorizeBinding(payload) + + try { + const queryOptions: Record = { + topK: limit, + returnMetadata: 'all' as const, + } + + if (where) { + const { nativeFilter, postFilter } = convertWhereToVectorizeFilter(where) + if (nativeFilter && Object.keys(nativeFilter).length > 0) { + queryOptions.filter = nativeFilter + } + if (postFilter) { + // Will apply after query + } + } + + const results = await vectorizeBinding.query(queryEmbedding, queryOptions) + + if (!results.matches) { + return [] + } + + let searchResults: VectorSearchResult[] = results.matches.map((match) => { + const metadata = match.metadata || {} + return { + id: match.id, + score: match.score || 0, + sourceCollection: String(metadata.sourceCollection || ''), + docId: String(metadata.docId || ''), + chunkIndex: typeof metadata.chunkIndex === 'number' ? metadata.chunkIndex : parseInt(String(metadata.chunkIndex || '0'), 10), + chunkText: String(metadata.chunkText || ''), + embeddingVersion: String(metadata.embeddingVersion || ''), + ...Object.fromEntries( + Object.entries(metadata).filter(([k]) => + !['sourceCollection', 'docId', 'chunkIndex', 'chunkText', 'embeddingVersion'].includes(k) + ) + ), + } + }) + + if (where) { + const { postFilter } = convertWhereToVectorizeFilter(where) + if (postFilter) { + searchResults = searchResults.filter((r) => matchesPostFilter(r, postFilter)) + } + } + + return searchResults + } catch (e) { + const errorMessage = e instanceof Error ? e.message : String(e) + payload.logger.error(`[@payloadcms-vectorize/cf] Search failed: ${errorMessage}`) + throw new Error(`[@payloadcms-vectorize/cf] Search failed: ${errorMessage}`) + } +} + +type VectorizeFilter = Record> +type PostFilterClause = Where + +interface FilterSplit { + nativeFilter: VectorizeFilter | null + postFilter: PostFilterClause | null +} + +const NATIVE_OPERATOR_MAP: Record = { + equals: '$eq', + not_equals: '$ne', + notEquals: '$ne', + in: '$in', + not_in: '$nin', + notIn: '$nin', + greater_than: '$gt', + greaterThan: '$gt', + greater_than_equal: '$gte', + greaterThanEqual: '$gte', + less_than: '$lt', + lessThan: '$lt', + less_than_equal: '$lte', + lessThanEqual: '$lte', +} + +function convertWhereToVectorizeFilter(where: Where): FilterSplit { + const nativeFilter: VectorizeFilter = {} + const postFilterClauses: Where[] = [] + + if ('and' in where && Array.isArray(where.and)) { + for (const clause of where.and) { + const split = convertWhereToVectorizeFilter(clause) + if (split.nativeFilter) { + Object.assign(nativeFilter, split.nativeFilter) + } + if (split.postFilter) { + postFilterClauses.push(split.postFilter) + } + } + return { + nativeFilter: Object.keys(nativeFilter).length > 0 ? nativeFilter : null, + postFilter: postFilterClauses.length > 0 ? { and: postFilterClauses } : null, + } + } + + if ('or' in where && Array.isArray(where.or)) { + // OR cannot be split — entire OR goes to post-filter if any clause is non-native + const allNative = where.or.every((clause) => { + const split = convertWhereToVectorizeFilter(clause) + return split.postFilter === null + }) + if (allNative) { + // Vectorize doesn't support OR at top level, so post-filter + return { nativeFilter: null, postFilter: where } + } + return { nativeFilter: null, postFilter: where } + } + + for (const [fieldName, condition] of Object.entries(where)) { + if (fieldName === 'and' || fieldName === 'or') continue + if (typeof condition !== 'object' || condition === null || Array.isArray(condition)) continue + + const cond = condition as Record + let handled = false + + for (const [payloadOp, cfOp] of Object.entries(NATIVE_OPERATOR_MAP)) { + if (payloadOp in cond) { + nativeFilter[fieldName] = { [cfOp]: cond[payloadOp] } + handled = true + break + } + } + + if (!handled) { + // like, contains, exists → post-filter + postFilterClauses.push({ [fieldName]: condition } as Where) + } + } + + return { + nativeFilter: Object.keys(nativeFilter).length > 0 ? nativeFilter : null, + postFilter: postFilterClauses.length > 0 + ? (postFilterClauses.length === 1 ? postFilterClauses[0] : { and: postFilterClauses }) + : null, + } +} + +function matchesPostFilter(doc: Record, where: Where): boolean { + if (!where || Object.keys(where).length === 0) return true + + if ('and' in where && Array.isArray(where.and)) { + return where.and.every((clause: Where) => matchesPostFilter(doc, clause)) + } + + if ('or' in where && Array.isArray(where.or)) { + return where.or.some((clause: Where) => matchesPostFilter(doc, clause)) + } + + for (const [field, condition] of Object.entries(where)) { + if (field === 'and' || field === 'or') continue + if (typeof condition !== 'object' || condition === null) continue + + const value = doc[field] + const cond = condition as Record + + if ('like' in cond && typeof cond.like === 'string') { + const pattern = String(cond.like).replace(/%/g, '.*') + if (!new RegExp(`^${pattern}$`, 'i').test(String(value ?? ''))) return false + } + + if ('contains' in cond && typeof cond.contains === 'string') { + if (!String(value ?? '').toLowerCase().includes(String(cond.contains).toLowerCase())) return false + } + + if ('exists' in cond && typeof cond.exists === 'boolean') { + const exists = value !== undefined && value !== null + if (cond.exists !== exists) return false + } + + // Also handle native ops in post-filter for OR clauses + if ('equals' in cond && value !== cond.equals) return false + if ('not_equals' in cond && value === cond.not_equals) return false + if ('notEquals' in cond && value === cond.notEquals) return false + if ('in' in cond && Array.isArray(cond.in) && !cond.in.includes(value)) return false + if ('not_in' in cond && Array.isArray(cond.not_in) && cond.not_in.includes(value)) return false + if ('notIn' in cond && Array.isArray(cond.notIn) && (cond.notIn as any[]).includes(value)) return false + if ('greater_than' in cond && !(value > (cond.greater_than as any))) return false + if ('greaterThan' in cond && !(value > (cond.greaterThan as any))) return false + if ('greater_than_equal' in cond && !(value >= (cond.greater_than_equal as any))) return false + if ('greaterThanEqual' in cond && !(value >= (cond.greaterThanEqual as any))) return false + if ('less_than' in cond && !(value < (cond.less_than as any))) return false + if ('lessThan' in cond && !(value < (cond.lessThan as any))) return false + if ('less_than_equal' in cond && !(value <= (cond.less_than_equal as any))) return false + if ('lessThanEqual' in cond && !(value <= (cond.lessThanEqual as any))) return false + } + + return true +} +``` + +**Step 2: Commit** + +```bash +git add adapters/cf/src/search.ts +git commit -m "feat: CF search uses native Vectorize filtering + metadata" +``` + +--- + +### Task 11: Update CF README with limitations + +**Files:** +- Modify: `adapters/cf/README.md` + +**Step 1: Replace the "Known Limitations" section** + +Replace from `## Known Limitations` to end of file with: + +```markdown +## Known Limitations + +### Metadata Filtering + +The CF adapter uses Cloudflare Vectorize's native metadata filtering, which applies filters **before** the topK selection. This means filtering works correctly with the result limit for most operators. + +**Natively supported operators** (applied before topK — correct result counts): +- `equals`, `not_equals`, `in`, `notIn` +- `greater_than`, `greater_than_equal`, `less_than`, `less_than_equal` + +**Post-filtered operators** (applied after topK — may return fewer results than requested): +- `like`, `contains`, `exists` + +### Vectorize Constraints + +| Constraint | Limit | +|---|---| +| topK maximum | 100 (or 20 when returning metadata) | +| String metadata indexing | First 64 bytes only (truncated at UTF-8 boundaries) | +| Filter object size | Under 2048 bytes JSON-encoded | +| Range query accuracy | May be reduced on ~10M+ vectors | + +Metadata indexes must exist before vectors are inserted for filtering to work. + +### OR Queries + +Cloudflare Vectorize does not support OR at the filter level. All `or` clauses are evaluated as post-filters, subject to the topK constraint. + +## License + +MIT +``` + +**Step 2: Commit** + +```bash +git add adapters/cf/README.md +git commit -m "docs: update CF README with metadata filtering limitations" +``` + +--- + +### Task 12: Build and verify + +**Step 1: Build everything** + +Run: `pnpm build` +Expected: PASS + +**Step 2: Run all tests** + +Run: `pnpm test` +Expected: All tests pass + +**Step 3: Final commit if any fixes** + +--- + +### Task 13: Clean up — remove deleteDocumentEmbeddings export if now trivial + +**Files:** +- Modify: `src/index.ts` (line 85) + +**Step 1: Check if deleteDocumentEmbeddings is still exported and used externally** + +If it's only used internally now and is just a pass-through to `adapter.deleteChunks`, consider whether the export is still needed. If it's in the public API, keep it. If not, remove the export from `src/index.ts` line 85. + +**Step 2: Commit if changed** + +```bash +git add src/index.ts +git commit -m "chore: clean up deleteDocumentEmbeddings export" +``` diff --git a/package.json b/package.json index e65497a..78260bc 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "payloadcms-vectorize", - "version": "0.7.0", + "version": "0.7.0-beta.1", "description": "A plugin to vectorize collections for RAG in Payload 3.0", "license": "MIT", "type": "module", @@ -24,11 +24,12 @@ "scripts": { "build": "pnpm copyfiles && pnpm build:types && pnpm build:swc && pnpm build:adapters", "build:swc": "swc ./src -d ./dist --config-file .swcrc --strip-leading-paths", - "build:adapters": "pnpm build:adapters:pg", + "build:adapters": "pnpm build:adapters:pg && pnpm build:adapters:cf", "build:adapters:pg": "cd ./adapters/pg && swc ./src -d ./dist --config-file ../../.swcrc --strip-leading-paths", + "build:adapters:cf": "cd ./adapters/cf && swc ./src -d ./dist --config-file ../../.swcrc --strip-leading-paths", "build:types": "tsc -p tsconfig.build.json --outDir dist --rootDir ./src", "build:types:all": "pnpm build:types && tsc --noEmit", - "clean": "rimraf {dist,*.tsbuildinfo,adapters/pg/dist}", + "clean": "rimraf {dist,*.tsbuildinfo,adapters/pg/dist,adapters/cf/dist}", "copyfiles": "copyfiles -u 1 \"src/**/*.{html,css,scss,ttf,woff,woff2,eot,svg,jpg,png,json}\" dist/", "dev": "cross-env DOTENV_CONFIG_PATH=dev/.env.development NODE_OPTIONS=--require=dotenv/config next dev dev --turbo", "dev:generate-importmap": "pnpm dev:payload generate:importmap", @@ -47,7 +48,8 @@ "test": "pnpm test:int && pnpm test:e2e", "test:e2e": "playwright test", "test:int": "cross-env DOTENV_CONFIG_PATH=dev/.env.test NODE_OPTIONS='--require=dotenv/config --import=tsx --max-old-space-size=8192' vitest", - "test:adapters:pg": "cross-env DOTENV_CONFIG_PATH=dev/.env.test NODE_OPTIONS='--require=dotenv/config --import=tsx --max-old-space-size=8192' vitest --config adapters/pg/vitest.config.js" + "test:adapters:pg": "cross-env DOTENV_CONFIG_PATH=dev/.env.test NODE_OPTIONS='--require=dotenv/config --import=tsx --max-old-space-size=8192' vitest --config adapters/pg/vitest.config.js", + "test:adapters:cf": "cross-env DOTENV_CONFIG_PATH=dev/.env.test NODE_OPTIONS='--require=dotenv/config --import=tsx --max-old-space-size=8192' vitest --config adapters/cf/vitest.config.js" }, "devDependencies": { "@changesets/changelog-github": "^0.5.2", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index f793dad..cc55b9e 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -154,6 +154,16 @@ importers: specifier: ^0.0.8 version: 0.0.8 + adapters/cf: + dependencies: + payload: + specifier: '>=3.0.0 <4.0.0' + version: 3.69.0(graphql@16.12.0)(typescript@5.7.3) + devDependencies: + payloadcms-vectorize: + specifier: workspace:* + version: link:../.. + adapters/pg: dependencies: '@payloadcms/db-postgres': diff --git a/src/index.ts b/src/index.ts index 44aca25..d61852f 100644 --- a/src/index.ts +++ b/src/index.ts @@ -69,6 +69,7 @@ export type { BulkEmbeddingRunStatus, VectorizedPayload, DbAdapter, + StoreChunkData, // For adapters VectorSearchResult, @@ -174,6 +175,7 @@ export default (pluginOptions: PayloadcmsVectorizeConfig) => const prepareBulkEmbedTask = createPrepareBulkEmbeddingTask({ knowledgePools: pluginOptions.knowledgePools, pollOrCompleteQueueName: pluginOptions.bulkQueueNames?.pollOrCompleteQueueName, + adapter: pluginOptions.dbAdapter, }) tasks.push(prepareBulkEmbedTask) diff --git a/src/tasks/bulkEmbedAll.ts b/src/tasks/bulkEmbedAll.ts index 4f77883..4dee361 100644 --- a/src/tasks/bulkEmbedAll.ts +++ b/src/tasks/bulkEmbedAll.ts @@ -245,10 +245,12 @@ export const createPrepareBulkEmbeddingTask = ({ knowledgePools, pollOrCompleteQueueName, prepareBulkEmbedQueueName, + adapter, }: { knowledgePools: Record pollOrCompleteQueueName?: string prepareBulkEmbedQueueName?: string + adapter: DbAdapter }): TaskConfig => { const task: TaskConfig = { slug: TASK_SLUG_PREPARE_BULK_EMBEDDING, @@ -399,6 +401,7 @@ export const createPrepareBulkEmbeddingTask = ({ includeAll, lastCompletedAtDate, addChunk: callbacks.addChunk, + adapter, }) } catch (error) { // Ingestion failed - mark run as failed @@ -597,6 +600,7 @@ async function streamAndBatchDocs(args: { chunk: BulkEmbeddingInput isLastChunk: boolean }) => Promise + adapter: DbAdapter }): Promise<{ batchCount: number; totalInputs: number; batchIds: (string | number)[] }> { const { payload, @@ -609,6 +613,7 @@ async function streamAndBatchDocs(args: { includeAll, lastCompletedAtDate, addChunk, + adapter, } = args // Async generator that yields chunks one at a time from pre-fetched docs @@ -619,13 +624,9 @@ async function streamAndBatchDocs(args: { // If !includeAll, we still need to check if document has current embedding // (can't filter this in the where clause since it's a cross-collection check) if (!includeAll && !lastCompletedAtDate) { - const hasCurrentEmbedding = await docHasEmbeddingVersion({ - payload, - poolName, - sourceCollection: collectionSlug, - docId: String(doc.id), - embeddingVersion, - }) + const hasCurrentEmbedding = await adapter.hasEmbeddingVersion( + payload, poolName, collectionSlug, String(doc.id), embeddingVersion, + ) if (hasCurrentEmbedding) continue } @@ -879,13 +880,9 @@ async function pollAndCompleteSingleBatch(args: { if (isFirstChunkForDoc) { processedDocs.add(docKey) // Check if embeddings already exist for this document+version (from a previous batch) - const hasCurrentEmbedding = await docHasEmbeddingVersion({ - payload, - poolName, - sourceCollection: meta.sourceCollection, - docId: meta.docId, - embeddingVersion: meta.embeddingVersion, - }) + const hasCurrentEmbedding = await adapter.hasEmbeddingVersion( + payload, poolName, meta.sourceCollection, meta.docId, meta.embeddingVersion, + ) // Only delete if no embeddings exist for this version (they're from an old version) if (!hasCurrentEmbedding) { @@ -904,28 +901,16 @@ async function pollAndCompleteSingleBatch(args: { ? output.embedding : Array.from(output.embedding) - const created = await payload.create({ - collection: poolName as CollectionSlug, - data: { - sourceCollection: meta.sourceCollection, - docId: String(meta.docId), - chunkIndex: meta.chunkIndex, - chunkText: meta.text, - embeddingVersion: meta.embeddingVersion, - ...(meta.extensionFields || {}), - embedding: embeddingArray, - }, + await adapter.storeChunk(payload, poolName, { + sourceCollection: meta.sourceCollection, + docId: String(meta.docId), + chunkIndex: meta.chunkIndex, + chunkText: meta.text, + embeddingVersion: meta.embeddingVersion, + embedding: embeddingArray, + extensionFields: (meta.extensionFields || {}) as Record, }) - await adapter.storeEmbedding( - payload, - poolName, - meta.sourceCollection, - String(meta.docId), - String(created.id), - embeddingArray, - ) - succeededCount++ }, }) @@ -939,28 +924,6 @@ async function pollAndCompleteSingleBatch(args: { } } -async function docHasEmbeddingVersion(args: { - payload: Payload - poolName: KnowledgePoolName - sourceCollection: string - docId: string - embeddingVersion: string -}): Promise { - const { payload, poolName, sourceCollection, docId, embeddingVersion } = args - const existing = await payload.find({ - collection: poolName as CollectionSlug, - where: { - and: [ - { sourceCollection: { equals: sourceCollection } }, - { docId: { equals: String(docId) } }, - { embeddingVersion: { equals: embeddingVersion } }, - ], - }, - limit: 1, - }) - return existing.totalDocs > 0 -} - /** * Lookup metadata for a single input by runId + inputId. * Uses the composite index ['run', 'inputId'] for O(1) lookup. diff --git a/src/tasks/vectorize.ts b/src/tasks/vectorize.ts index 802c362..24542c3 100644 --- a/src/tasks/vectorize.ts +++ b/src/tasks/vectorize.ts @@ -8,7 +8,6 @@ import type { } from '../types.js' import { TASK_SLUG_VECTORIZE } from '../constants.js' import { validateChunkData } from '../utils/validateChunkData.js' -import { deleteDocumentEmbeddings } from '../utils/deleteDocumentEmbeddings.js' type VectorizeTaskInput = { collection: string @@ -96,13 +95,7 @@ async function runVectorizeTask(args: { // Delete all existing embeddings for this document before creating new ones // This ensures we replace old embeddings (potentially with a different embeddingVersion) // and prevents duplicates when a document is updated - await deleteDocumentEmbeddings({ - payload, - poolName, - collection, - docId: String(sourceDoc.id), - adapter, - }) + await adapter.deleteChunks(payload, poolName, collection, String(sourceDoc.id)) // Get chunks from toKnowledgePoolFn const chunkData = await toKnowledgePoolFn(sourceDoc, payload) @@ -117,22 +110,15 @@ async function runVectorizeTask(args: { await Promise.all( vectors.map(async (vector, index) => { const { chunk, ...extensionFields } = chunkData[index] - const created = await payload.create({ - collection: poolName, - data: { - chunkIndex: index, - chunkText: chunk, - docId: String(sourceDoc.id), - embeddingVersion, - sourceCollection: collection, - ...extensionFields, - embedding: Array.isArray(vector) ? vector : Array.from(vector), - }, + await adapter.storeChunk(payload, poolName, { + sourceCollection: collection, + docId: String(sourceDoc.id), + chunkIndex: index, + chunkText: chunk, + embeddingVersion, + embedding: vector, + extensionFields, }) - - const id = String(created.id) - - await adapter.storeEmbedding(payload, poolName, collection, String(sourceDoc.id), id, vector) }), ) } diff --git a/src/types.ts b/src/types.ts index 0e41c98..31601e1 100644 --- a/src/types.ts +++ b/src/types.ts @@ -371,36 +371,45 @@ export interface BulkEmbeddingInputMetadataDoc extends TypeWithID { updatedAt: string } +export type StoreChunkData = { + sourceCollection: string + docId: string + chunkIndex: number + chunkText: string + embeddingVersion: string + embedding: number[] | Float32Array + extensionFields: Record +} + export type DbAdapter = { getConfigExtension: (payloadCmsConfig: Config) => { bins?: { key: string; scriptPath: string }[] custom?: Record collections?: Record } - search: ( - payload: BasePayload, - queryEmbedding: number[], + storeChunk: ( + payload: Payload, poolName: KnowledgePoolName, - limit?: number, - where?: Where, - ) => Promise> - storeEmbedding: ( + data: StoreChunkData, + ) => Promise + deleteChunks: ( payload: Payload, poolName: KnowledgePoolName, sourceCollection: string, - sourceDocId: string, - embeddingId: string, - embedding: number[] | Float32Array, + docId: string, ) => Promise - /** - * Delete embeddings for a source document - * Called when a document is deleted or re-indexed - * The adapter should delete all vectors associated with this document - */ - deleteEmbeddings?: ( + hasEmbeddingVersion: ( payload: Payload, poolName: KnowledgePoolName, sourceCollection: string, docId: string, - ) => Promise + embeddingVersion: string, + ) => Promise + search: ( + payload: BasePayload, + queryEmbedding: number[], + poolName: KnowledgePoolName, + limit?: number, + where?: Where, + ) => Promise> } diff --git a/src/utils/deleteDocumentEmbeddings.ts b/src/utils/deleteDocumentEmbeddings.ts index 413879b..6fee659 100644 --- a/src/utils/deleteDocumentEmbeddings.ts +++ b/src/utils/deleteDocumentEmbeddings.ts @@ -1,10 +1,6 @@ -import type { CollectionSlug, Payload } from 'payload' +import type { Payload } from 'payload' import type { DbAdapter, KnowledgePoolName } from '../types.js' -/** - * Two-step deletion: removes embeddings from the Payload collection - * and then from the adapter's storage (for adapters that store vectors separately). - */ export async function deleteDocumentEmbeddings(args: { payload: Payload poolName: KnowledgePoolName @@ -13,18 +9,5 @@ export async function deleteDocumentEmbeddings(args: { adapter: DbAdapter }): Promise { const { payload, poolName, collection, docId, adapter } = args - - await payload.delete({ - collection: poolName as CollectionSlug, - where: { - and: [ - { sourceCollection: { equals: collection } }, - { docId: { equals: String(docId) } }, - ], - }, - }) - - if (adapter.deleteEmbeddings) { - await adapter.deleteEmbeddings(payload, poolName, collection, String(docId)) - } + await adapter.deleteChunks(payload, poolName, collection, String(docId)) }