From ce3fdec86d93c161b696fdef3349a2146a05da47 Mon Sep 17 00:00:00 2001
From: Alex Garcia
Date: Tue, 3 Mar 2026 06:43:19 -0800
Subject: [PATCH] Add vec0 optimize command: compact sparse chunks after deletions

Implements FTS5-style INSERT INTO v(v) VALUES ('optimize') command that
packs live entries from newer/sparser chunks into free slots of older
chunks, then deletes emptied chunks. Adds hidden command column to vtab
schema, command dispatcher in xUpdate, and two-pointer compaction
algorithm that handles vectors, all metadata types, and partitioned
tables. Includes 16 Python tests, 7 C unit tests, and a libFuzzer target.

Co-Authored-By: Claude Opus 4.6
---
 sqlite-vec.c               | 512 ++++++++++++++++++++++++++++++++++++-
 tests/fuzz/Makefile        |   5 +-
 tests/fuzz/vec0-optimize.c | 140 ++++++++++
 tests/test-optimize.py     | 450 ++++++++++++++++++++++++++++++++
 tests/test-unit.c          | 253 ++++++++++++++++++
 5 files changed, 1358 insertions(+), 2 deletions(-)
 create mode 100644 tests/fuzz/vec0-optimize.c
 create mode 100644 tests/test-optimize.py

diff --git a/sqlite-vec.c b/sqlite-vec.c
index a4b9c11..9ec986a 100644
--- a/sqlite-vec.c
+++ b/sqlite-vec.c
@@ -3409,6 +3409,7 @@ static sqlite3_module vec_npy_eachModule = {
 #define VEC0_COLUMN_USERN_START 1
 #define VEC0_COLUMN_OFFSET_DISTANCE 1
 #define VEC0_COLUMN_OFFSET_K 2
+#define VEC0_COLUMN_OFFSET_CMD 3
 
 #define VEC0_SHADOW_INFO_NAME "\"%w\".\"%w_info\""
 
@@ -3685,6 +3686,16 @@ int vec0_column_k_idx(vec0_vtab *p) {
          VEC0_COLUMN_OFFSET_K;
 }
 
+/**
+ * @brief Column index of the hidden command column, which is declared with
+ * the same name as the table itself. Writing text to it runs an FTS5-style
+ * command, e.g.: INSERT INTO t(t) VALUES ('optimize');
+ */
+int vec0_column_cmd_idx(vec0_vtab *p) {
+  int last_user_idx = VEC0_COLUMN_USERN_START + (vec0_num_defined_user_columns(p) - 1);
+  return last_user_idx + VEC0_COLUMN_OFFSET_CMD;
+}
+
 /**
  * Returns 1 if the given column-based index is a valid vector column,
  * 0 otherwise.
@@ -4961,7 +4972,7 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv,
     }
   }
 
-  sqlite3_str_appendall(createStr, " distance hidden, k hidden) ");
+  sqlite3_str_appendf(createStr, " distance hidden, k hidden, \"%w\" hidden) ", argv[2]);
   if (pkColumnName) {
     sqlite3_str_appendall(createStr, "without rowid ");
   }
@@ -8305,11 +8316,47 @@ int vec0_write_metadata_value(vec0_vtab *p, int metadata_column_idx, i64 rowid,
  *
  * @return int SQLITE_OK on success, otherwise error code on failure
  */
+static int vec0_optimize(vec0_vtab *p);
+
+/**
+ * Dispatches an FTS5-style special command: the text written to the hidden
+ * column named after the table, e.g. INSERT INTO t(t) VALUES ('optimize').
+ * Command names are matched case-insensitively.
+ *
+ * @param pVTab the vec0 virtual table the command was issued against
+ * @param cmdValue value holding the command name (caller checked SQLITE_TEXT)
+ * @return the command's own result code, SQLITE_NOMEM if the command text
+ *         could not be obtained, or SQLITE_ERROR (with the vtab error message
+ *         set) when the command is not recognized
+ */
+static int vec0Update_InsertCommand(sqlite3_vtab *pVTab, sqlite3_value *cmdValue) {
+  const char *zCmd = (const char *)sqlite3_value_text(cmdValue);
+  if (!zCmd) {
+    // For a TEXT value, a NULL pointer means the text could not be allocated
+    return SQLITE_NOMEM;
+  }
+  if (sqlite3_stricmp(zCmd, "optimize") == 0) {
+    return vec0_optimize((vec0_vtab *)pVTab);
+  }
+  vtab_set_error(pVTab, "Unknown vec0 command: \"%s\"", zCmd);
+  return SQLITE_ERROR;
+}
+
 int vec0Update_Insert(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv,
                       sqlite_int64 *pRowid) {
   UNUSED_PARAMETER(argc);
   vec0_vtab *p = (vec0_vtab *)pVTab;
   int rc;
+
+  // Check for FTS5-style insert commands: INSERT INTO t(t) VALUES ('cmd')
+  {
+    int cmd_argv_idx = 2 + vec0_column_cmd_idx(p);
+    if (cmd_argv_idx < argc &&
+        sqlite3_value_type(argv[cmd_argv_idx]) == SQLITE_TEXT) {
+      return vec0Update_InsertCommand(pVTab, argv[cmd_argv_idx]);
+    }
+  }
+
   // Rowid for the inserted row, deterimined by the inserted ID + _rowids shadow
   // table
   i64 rowid;
@@ -9008,6 +9055,484 @@ int vec0Update_Delete(sqlite3_vtab *pVTab, sqlite3_value *idValue) {
   return SQLITE_OK;
 }
 
+// ============================================================
+// vec0 optimize: pack live entries into older chunks, delete empty ones
+// ============================================================
+
+/**
+ * Information about a single chunk loaded during optimize.
+ */
+struct vec0_optimize_chunk {
+  i64 chunk_id;
+  int validity_size;        // bytes in validity bitmap
+  unsigned char *validity;  // in-memory validity bitmap (owned)
+  int rowids_size;          // bytes in rowids blob
+  i64 *rowids;              // in-memory rowids array (owned)
+  int modified;             // 1 if validity/rowids were changed and need flush
+};
+
+/**
+ * Moves one fixed-size slot of a per-chunk blob from
+ * (src_chunk_id, src_offset) to (dst_chunk_id, dst_offset) and zeroes the
+ * vacated source slot. The blob is addressed as a flat array of
+ * slot_size-byte entries.
+ *
+ * @param p the vec0 table being optimized
+ * @param zTable shadow table holding one blob row per chunk
+ * @param zColumn name of the blob column in zTable
+ * @param slot_size bytes per entry, must be > 0
+ * @return SQLITE_OK, SQLITE_NOMEM, or the failing sqlite3_blob_* result code
+ */
+static int vec0_optimize_move_slot(vec0_vtab *p, const char *zTable,
+                                   const char *zColumn, i64 src_chunk_id,
+                                   i64 src_offset, i64 dst_chunk_id,
+                                   i64 dst_offset, int slot_size) {
+  sqlite3_blob *blob = NULL;
+  int rc;
+
+  // One allocation: the first half carries the entry, the second half is the
+  // zero fill written over the source slot.
+  unsigned char *buf = sqlite3_malloc(2 * slot_size);
+  if (!buf) return SQLITE_NOMEM;
+  unsigned char *zeros = buf + slot_size;
+  memset(zeros, 0, slot_size);
+
+  // Read the entry out of the source chunk, then blank its old slot
+  rc = sqlite3_blob_open(p->db, p->schemaName, zTable, zColumn, src_chunk_id,
+                         1, &blob);
+  if (rc != SQLITE_OK) goto done;
+  rc = sqlite3_blob_read(blob, buf, slot_size, (int)(src_offset * slot_size));
+  if (rc == SQLITE_OK) {
+    rc = sqlite3_blob_write(blob, zeros, slot_size,
+                            (int)(src_offset * slot_size));
+  }
+  sqlite3_blob_close(blob);
+  if (rc != SQLITE_OK) goto done;
+
+  // Write the entry into its new slot in the destination chunk
+  rc = sqlite3_blob_open(p->db, p->schemaName, zTable, zColumn, dst_chunk_id,
+                         1, &blob);
+  if (rc != SQLITE_OK) goto done;
+  rc = sqlite3_blob_write(blob, buf, slot_size, (int)(dst_offset * slot_size));
+  sqlite3_blob_close(blob);
+
+done:
+  sqlite3_free(buf);
+  return rc;
+}
+
+/**
+ * Rewrites a single bit of the "data" bitmap blob stored for chunk_id in
+ * zTable: loads the whole bitmap, sets bit `offset` to `value`, writes it back.
+ *
+ * @param pPrev if not NULL, receives the bit's value before the update
+ * @return SQLITE_OK, SQLITE_NOMEM, SQLITE_ERROR when `offset` lies outside the
+ *         stored bitmap, or the failing sqlite3_blob_* result code
+ */
+static int vec0_optimize_update_bit(vec0_vtab *p, const char *zTable,
+                                    i64 chunk_id, i64 offset, int value,
+                                    int *pPrev) {
+  sqlite3_blob *blob = NULL;
+  int rc = sqlite3_blob_open(p->db, p->schemaName, zTable, "data", chunk_id, 1,
+                             &blob);
+  if (rc != SQLITE_OK) return rc;
+
+  int nBytes = sqlite3_blob_bytes(blob);
+  // Never index past the bitmap that is actually stored for this chunk
+  if (offset < 0 || offset >= (i64)nBytes * 8) {
+    sqlite3_blob_close(blob);
+    return SQLITE_ERROR;
+  }
+
+  unsigned char *bits = sqlite3_malloc(nBytes);
+  if (!bits) {
+    sqlite3_blob_close(blob);
+    return SQLITE_NOMEM;
+  }
+
+  rc = sqlite3_blob_read(blob, bits, nBytes, 0);
+  if (rc == SQLITE_OK) {
+    if (pPrev) *pPrev = bitmap_get(bits, offset);
+    bitmap_set(bits, offset, value);
+    rc = sqlite3_blob_write(blob, bits, nBytes, 0);
+  }
+
+  sqlite3_free(bits);
+  sqlite3_blob_close(blob);
+  return rc;
+}
+
+/**
+ * Move one entry from (src_chunk, src_offset) to (dst_chunk, dst_offset).
+ *
+ * In order: relocates the entry's slot in every vector column and every
+ * metadata column (zeroing the source slot), updates the in-memory validity
+ * bitmaps and rowid arrays of both chunks and marks them modified, and finally
+ * points the entry's _rowids row at its new position. The in-memory chunk
+ * state is not written back here; the caller flushes chunks marked modified.
+ *
+ * @return SQLITE_OK on success, otherwise the first error code encountered.
+ *         Shadow-table writes made before a failure are not undone here.
+ */
+static int vec0_optimize_move_entry(
+    vec0_vtab *p,
+    struct vec0_optimize_chunk *src, i64 src_offset,
+    struct vec0_optimize_chunk *dst, i64 dst_offset) {
+  int rc;
+  i64 rowid = src->rowids[src_offset];
+
+  // 1. Move vector data for each vector column
+  for (int i = 0; i < p->numVectorColumns; i++) {
+    int vec_size = (int)vector_column_byte_size(p->vector_columns[i]);
+    rc = vec0_optimize_move_slot(p, p->shadowVectorChunksNames[i], "vectors",
+                                 src->chunk_id, src_offset,
+                                 dst->chunk_id, dst_offset, vec_size);
+    if (rc != SQLITE_OK) return rc;
+  }
+
+  // 2. Move metadata for each metadata column
+  for (int i = 0; i < p->numMetadataColumns; i++) {
+    vec0_metadata_column_kind kind = p->metadata_columns[i].kind;
+    int slot_size;
+
+    switch (kind) {
+    case VEC0_METADATA_COLUMN_KIND_BOOLEAN: {
+      // Booleans are packed one bit per entry: clear the source bit, then
+      // store its old value in the destination bit.
+      int srcBit = 0;
+      rc = vec0_optimize_update_bit(p, p->shadowMetadataChunksNames[i],
+                                    src->chunk_id, src_offset, 0, &srcBit);
+      if (rc != SQLITE_OK) return rc;
+      rc = vec0_optimize_update_bit(p, p->shadowMetadataChunksNames[i],
+                                    dst->chunk_id, dst_offset, srcBit, NULL);
+      if (rc != SQLITE_OK) return rc;
+      continue;
+    }
+    // Integer, float, text view: fixed-size per slot
+    case VEC0_METADATA_COLUMN_KIND_INTEGER: slot_size = sizeof(i64); break;
+    case VEC0_METADATA_COLUMN_KIND_FLOAT: slot_size = sizeof(double); break;
+    case VEC0_METADATA_COLUMN_KIND_TEXT: slot_size = VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH; break;
+    default: return SQLITE_ERROR;
+    }
+
+    rc = vec0_optimize_move_slot(p, p->shadowMetadataChunksNames[i], "data",
+                                 src->chunk_id, src_offset,
+                                 dst->chunk_id, dst_offset, slot_size);
+    if (rc != SQLITE_OK) return rc;
+  }
+
+  // 3. Update in-memory validity and rowids
+  bitmap_set(src->validity, src_offset, 0);
+  bitmap_set(dst->validity, dst_offset, 1);
+  src->rowids[src_offset] = 0;
+  dst->rowids[dst_offset] = rowid;
+  src->modified = 1;
+  dst->modified = 1;
+
+  // 4. Update _rowids table position
+  rc = vec0_rowids_update_position(p, rowid, dst->chunk_id, dst_offset);
+  return rc;
+}
+
+/**
+ * Delete a chunk and all its associated shadow table data.
+ * Does NOT check if it's empty — caller must ensure that.
+ */
+static int vec0_optimize_delete_chunk(vec0_vtab *p, i64 chunk_id) {
+  // One DELETE per shadow table that stores a row for this chunk:
+  //   pass 0                      -> _chunks
+  //   next numVectorColumns       -> each _vector_chunksNN
+  //   next numMetadataColumns     -> each _metadatachunksNN
+  int nPasses = 1 + p->numVectorColumns + p->numMetadataColumns;
+
+  for (int pass = 0; pass < nPasses; pass++) {
+    char *zSql;
+    if (pass == 0) {
+      zSql = sqlite3_mprintf(
+          "DELETE FROM " VEC0_SHADOW_CHUNKS_NAME " WHERE chunk_id = ?",
+          p->schemaName, p->tableName);
+    } else if (pass <= p->numVectorColumns) {
+      zSql = sqlite3_mprintf(
+          "DELETE FROM " VEC0_SHADOW_VECTOR_N_NAME " WHERE rowid = ?",
+          p->schemaName, p->tableName, pass - 1);
+    } else {
+      zSql = sqlite3_mprintf(
+          "DELETE FROM " VEC0_SHADOW_METADATA_N_NAME " WHERE rowid = ?",
+          p->schemaName, p->tableName, pass - 1 - p->numVectorColumns);
+    }
+    if (!zSql) return SQLITE_NOMEM;
+
+    sqlite3_stmt *stmt = NULL;
+    int rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL);
+    sqlite3_free(zSql);
+    if (rc != SQLITE_OK) return rc;
+
+    sqlite3_bind_int64(stmt, 1, chunk_id);
+    int stepRc = sqlite3_step(stmt);
+    // For a statement whose last step failed, sqlite3_finalize() returns the
+    // specific error code; pass that on instead of a generic SQLITE_ERROR.
+    rc = sqlite3_finalize(stmt);
+    if (stepRc != SQLITE_DONE) {
+      return rc != SQLITE_OK ? rc : SQLITE_ERROR;
+    }
+  }
+
+  return SQLITE_OK;
+}
+
+/**
+ * Flush modified in-memory validity and rowids blobs back to the DB.
+ */
+static int vec0_optimize_flush_chunk(vec0_vtab *p, struct vec0_optimize_chunk *c) {
+  int rc;
+  sqlite3_blob *blob = NULL;
+
+  rc = sqlite3_blob_open(p->db, p->schemaName, p->shadowChunksName, "validity",
+                         c->chunk_id, 1, &blob);
+  if (rc != SQLITE_OK) return rc;
+  rc = sqlite3_blob_write(blob, c->validity, c->validity_size, 0);
+  sqlite3_blob_close(blob);
+  if (rc != SQLITE_OK) return rc;
+
+  rc = sqlite3_blob_open(p->db, p->schemaName, p->shadowChunksName, "rowids",
+                         c->chunk_id, 1, &blob);
+  if (rc != SQLITE_OK) return rc;
+  rc = sqlite3_blob_write(blob, c->rowids, c->rowids_size, 0);
+  sqlite3_blob_close(blob);
+  return rc;
+}
+
+/**
+ * Returns the index of the first unset validity bit in [from, chunk_size),
+ * or -1 if every slot in that range is occupied.
+ */
+static int vec0_optimize_next_free(unsigned char *validity, int from,
+                                   int chunk_size) {
+  for (int i = from; i < chunk_size; i++) {
+    if (!bitmap_get(validity, i)) return i;
+  }
+  return -1;
+}
+
+/**
+ * Returns the index of the last set validity bit in [0, from], or -1 if no
+ * slot in that range is occupied.
+ */
+static int vec0_optimize_prev_live(unsigned char *validity, int from) {
+  for (int i = from; i >= 0; i--) {
+    if (bitmap_get(validity, i)) return i;
+  }
+  return -1;
+}
+
+/**
+ * Optimize one partition: compact live entries from newer chunks into
+ * older chunks, then delete any emptied chunks.
+ *
+ * @param p the vec0 table being optimized
+ * @param stmtChunks statement, not yet stepped, yielding
+ *        (chunk_id, validity, rowids) for every chunk of the partition in
+ *        ascending chunk_id order. It is stepped to completion here; the
+ *        caller keeps ownership and must reset or finalize it.
+ * @return SQLITE_OK on success; SQLITE_ERROR (with the vtab error message set)
+ *         if a chunk's validity/rowids blob is too small for chunk_size;
+ *         otherwise the first error code encountered.
+ */
+static int vec0_optimize_one_partition(vec0_vtab *p, sqlite3_stmt *stmtChunks) {
+  int rc = SQLITE_OK;
+  int nChunks = 0;
+  int nAlloced = 0;
+  struct vec0_optimize_chunk *chunks = NULL;
+
+  // The scans below read chunk_size validity bits and chunk_size rowids per
+  // chunk, so every loaded blob has to be at least this large.
+  const int minValidityBytes = (p->chunk_size + 7) / 8;
+  const i64 minRowidsBytes = (i64)p->chunk_size * (i64)sizeof(i64);
+
+  // Step 1: Load all chunks for this partition into memory
+  while ((rc = sqlite3_step(stmtChunks)) == SQLITE_ROW) {
+    if (nChunks >= nAlloced) {
+      int nNew = nAlloced ? nAlloced * 2 : 8;
+      struct vec0_optimize_chunk *tmp = sqlite3_realloc(chunks, nNew * sizeof(*chunks));
+      if (!tmp) { rc = SQLITE_NOMEM; goto cleanup; }
+      chunks = tmp;
+      nAlloced = nNew;
+    }
+
+    struct vec0_optimize_chunk *c = &chunks[nChunks];
+    memset(c, 0, sizeof(*c));
+    // Count the chunk before allocating its buffers, so that cleanup also
+    // frees a chunk whose loading stops half-way.
+    nChunks++;
+    c->chunk_id = sqlite3_column_int64(stmtChunks, 0);
+
+    // Read validity blob
+    const void *vBlob = sqlite3_column_blob(stmtChunks, 1);
+    c->validity_size = sqlite3_column_bytes(stmtChunks, 1);
+    if (!vBlob || c->validity_size < minValidityBytes) {
+      vtab_set_error((sqlite3_vtab *)p,
+                     "optimize: validity blob of chunk %lld is too small",
+                     (long long)c->chunk_id);
+      rc = SQLITE_ERROR;
+      goto cleanup;
+    }
+    c->validity = sqlite3_malloc(c->validity_size);
+    if (!c->validity) { rc = SQLITE_NOMEM; goto cleanup; }
+    memcpy(c->validity, vBlob, c->validity_size);
+
+    // Read rowids blob
+    const void *rBlob = sqlite3_column_blob(stmtChunks, 2);
+    c->rowids_size = sqlite3_column_bytes(stmtChunks, 2);
+    if (!rBlob || c->rowids_size < minRowidsBytes) {
+      vtab_set_error((sqlite3_vtab *)p,
+                     "optimize: rowids blob of chunk %lld is too small",
+                     (long long)c->chunk_id);
+      rc = SQLITE_ERROR;
+      goto cleanup;
+    }
+    c->rowids = sqlite3_malloc(c->rowids_size);
+    if (!c->rowids) { rc = SQLITE_NOMEM; goto cleanup; }
+    memcpy(c->rowids, rBlob, c->rowids_size);
+  }
+  if (rc != SQLITE_DONE) goto cleanup;
+  rc = SQLITE_OK;
+
+  // Nothing to compact with 0 or 1 chunks
+  if (nChunks <= 1) goto cleanup;
+
+  // Step 2: Two-pointer compaction. `left` walks forward over the oldest
+  // chunks looking for free slots, `right` walks backward over the newest
+  // chunks looking for live entries; entries move right -> left until the two
+  // cursors meet.
+  {
+    int left = 0;
+    int right = nChunks - 1;
+    int left_free = vec0_optimize_next_free(chunks[left].validity, 0, p->chunk_size);
+    int right_live = vec0_optimize_prev_live(chunks[right].validity, p->chunk_size - 1);
+
+    while (left < right) {
+      if (left_free < 0) {
+        // Target chunk is full: advance to the next older chunk
+        left++;
+        if (left < right) {
+          left_free = vec0_optimize_next_free(chunks[left].validity, 0, p->chunk_size);
+        }
+        continue;
+      }
+      if (right_live < 0) {
+        // Source chunk is drained: retreat to the previous newer chunk
+        right--;
+        if (left < right) {
+          right_live = vec0_optimize_prev_live(chunks[right].validity, p->chunk_size - 1);
+        }
+        continue;
+      }
+
+      rc = vec0_optimize_move_entry(p,
+                                    &chunks[right], right_live,
+                                    &chunks[left], left_free);
+      if (rc != SQLITE_OK) goto cleanup;
+
+      // Slots before left_free are occupied and slots after right_live are
+      // empty, so both scans resume from where they stopped.
+      left_free = vec0_optimize_next_free(chunks[left].validity, left_free + 1, p->chunk_size);
+      right_live = vec0_optimize_prev_live(chunks[right].validity, right_live - 1);
+    }
+  }
+
+  // Step 3: Flush modified chunks, delete empty ones
+  for (int i = 0; i < nChunks; i++) {
+    // Check if chunk is now empty
+    int allZero = 1;
+    for (int j = 0; j < chunks[i].validity_size; j++) {
+      if (chunks[i].validity[j] != 0) { allZero = 0; break; }
+    }
+
+    if (allZero) {
+      rc = vec0_optimize_delete_chunk(p, chunks[i].chunk_id);
+      if (rc != SQLITE_OK) goto cleanup;
+    } else if (chunks[i].modified) {
+      rc = vec0_optimize_flush_chunk(p, &chunks[i]);
+      if (rc != SQLITE_OK) goto cleanup;
+    }
+  }
+
+cleanup:
+  if (chunks) {
+    for (int i = 0; i < nChunks; i++) {
+      sqlite3_free(chunks[i].validity);
+      sqlite3_free(chunks[i].rowids);
+    }
+    sqlite3_free(chunks);
+  }
+  return rc;
+}
+
+/**
+ * Top-level optimize: compacts the whole table when it has no partition
+ * columns, otherwise compacts each distinct partition on its own.
+ *
+ * This function does not open a savepoint or transaction itself: when it
+ * fails part-way, shadow-table writes already made are only undone if the
+ * enclosing statement/transaction is rolled back.
+ *
+ * @return SQLITE_OK on success, otherwise the first error code encountered
+ */
+static int vec0_optimize(vec0_vtab *p) {
+  int rc;
+  char *zSql;
+  sqlite3_stmt *stmt = NULL;            // chunk query of the current pass
+  sqlite3_stmt *stmtPartitions = NULL;  // distinct partition key values
+
+  // Free cached statements that may hold references to shadow tables
+  if (p->stmtLatestChunk) {
+    sqlite3_finalize(p->stmtLatestChunk);
+    p->stmtLatestChunk = NULL;
+  }
+  if (p->stmtRowidsUpdatePosition) {
+    sqlite3_finalize(p->stmtRowidsUpdatePosition);
+    p->stmtRowidsUpdatePosition = NULL;
+  }
+
+  if (p->numPartitionColumns == 0) {
+    // No partitions: single pass over all chunks
+    zSql = sqlite3_mprintf(
+        "SELECT chunk_id, validity, rowids FROM " VEC0_SHADOW_CHUNKS_NAME
+        " ORDER BY chunk_id ASC",
+        p->schemaName, p->tableName);
+    if (!zSql) { rc = SQLITE_NOMEM; goto done; }
+    rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL);
+    sqlite3_free(zSql);
+    if (rc != SQLITE_OK) goto done;
+
+    rc = vec0_optimize_one_partition(p, stmt);
+    goto done;
+  }
+
+  // Partitioned: get distinct partition values, then optimize each
+  {
+    sqlite3_str *s = sqlite3_str_new(NULL);
+    sqlite3_str_appendf(s, "SELECT DISTINCT ");
+    for (int i = 0; i < p->numPartitionColumns; i++) {
+      if (i > 0) sqlite3_str_appendall(s, ", ");
+      sqlite3_str_appendf(s, "partition%02d", i);
+    }
+    sqlite3_str_appendf(s, " FROM " VEC0_SHADOW_CHUNKS_NAME,
+                        p->schemaName, p->tableName);
+    zSql = sqlite3_str_finish(s);
+  }
+  if (!zSql) { rc = SQLITE_NOMEM; goto done; }
+  rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmtPartitions, NULL);
+  sqlite3_free(zSql);
+  if (rc != SQLITE_OK) goto done;
+
+  // The per-partition chunk query is the same for every partition, so it is
+  // prepared once and re-bound below. IS (rather than =) is used so that a
+  // NULL partition value selects the chunks stored under NULL.
+  {
+    sqlite3_str *cs = sqlite3_str_new(NULL);
+    sqlite3_str_appendf(cs,
+        "SELECT chunk_id, validity, rowids FROM " VEC0_SHADOW_CHUNKS_NAME
+        " WHERE ",
+        p->schemaName, p->tableName);
+    for (int i = 0; i < p->numPartitionColumns; i++) {
+      if (i > 0) sqlite3_str_appendall(cs, " AND ");
+      sqlite3_str_appendf(cs, "partition%02d IS ?", i);
+    }
+    sqlite3_str_appendall(cs, " ORDER BY chunk_id ASC");
+    zSql = sqlite3_str_finish(cs);
+  }
+  if (!zSql) { rc = SQLITE_NOMEM; goto done; }
+  rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL);
+  sqlite3_free(zSql);
+  if (rc != SQLITE_OK) goto done;
+
+  // NOTE(review): chunks are deleted from the _chunks table while this
+  // DISTINCT scan over it is still active. Only chunks of the partition being
+  // processed are removed, but confirm this is acceptable.
+  while ((rc = sqlite3_step(stmtPartitions)) == SQLITE_ROW) {
+    for (int i = 0; i < p->numPartitionColumns; i++) {
+      rc = sqlite3_bind_value(stmt, i + 1, sqlite3_column_value(stmtPartitions, i));
+      if (rc != SQLITE_OK) goto done;
+    }
+
+    rc = vec0_optimize_one_partition(p, stmt);
+    // Rewind the chunk query for the next partition; its own status was
+    // already captured in rc.
+    sqlite3_reset(stmt);
+    if (rc != SQLITE_OK) goto done;
+  }
+  if (rc == SQLITE_DONE) rc = SQLITE_OK;
+
+done:
+  // sqlite3_finalize(NULL) is a harmless no-op
+  sqlite3_finalize(stmt);
+  sqlite3_finalize(stmtPartitions);
+
+  // Invalidate stmtLatestChunk since chunks may have been deleted
+  if (p->stmtLatestChunk) {
+    sqlite3_finalize(p->stmtLatestChunk);
+    p->stmtLatestChunk = NULL;
+  }
+
+  return rc;
+}
+
 int vec0Update_UpdateAuxColumn(vec0_vtab *p, int auxiliary_column_idx, sqlite3_value * value, i64 rowid) {
   int rc;
   sqlite3_stmt *stmt;
diff --git a/tests/fuzz/Makefile b/tests/fuzz/Makefile
index 21629ef..f1d6192 100644
--- a/tests/fuzz/Makefile
+++ b/tests/fuzz/Makefile
@@ -72,10 +72,13 @@ $(TARGET_DIR)/vec_mismatch: vec-mismatch.c $(FUZZ_SRCS) | $(TARGET_DIR)
 $(TARGET_DIR)/vec0_delete_completeness: vec0-delete-completeness.c $(FUZZ_SRCS) | $(TARGET_DIR)
 	$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
 
+$(TARGET_DIR)/vec0_optimize: vec0-optimize.c $(FUZZ_SRCS) | $(TARGET_DIR)
+	$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
+
 FUZZ_TARGETS = vec0_create exec json numpy \
                shadow_corrupt vec0_operations scalar_functions \
                vec0_create_full metadata_columns vec_each vec_mismatch \
-               vec0_delete_completeness
+
vec0_delete_completeness vec0_optimize
 
 all: $(addprefix $(TARGET_DIR)/,$(FUZZ_TARGETS))
 
diff --git a/tests/fuzz/vec0-optimize.c b/tests/fuzz/vec0-optimize.c
new file mode 100644
index 0000000..27be601
--- /dev/null
+++ b/tests/fuzz/vec0-optimize.c
@@ -0,0 +1,140 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "sqlite-vec.h"
+#include "sqlite3.h"
+#include <assert.h>
+
+/*
+ * Fuzz target for the vec0 optimize command.
+ * The input is decoded as a stream of INSERT / DELETE / full-scan operations
+ * on rowids 1..16; afterwards optimize is run once and every row that should
+ * be live must still be counted, readable, and present in v_rowids.
+ */
+int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+  if (size < 4) return 0;
+
+  int rc;
+  sqlite3 *db;
+  sqlite3_stmt *stmtInsert = NULL;
+  sqlite3_stmt *stmtDelete = NULL;
+  sqlite3_stmt *stmtScan = NULL;
+
+  rc = sqlite3_open(":memory:", &db);
+  assert(rc == SQLITE_OK);
+  rc = sqlite3_vec_init(db, NULL, NULL);
+  assert(rc == SQLITE_OK);
+
+  rc = sqlite3_exec(db,
+    "CREATE VIRTUAL TABLE v USING vec0(emb float[4], chunk_size=4)",
+    NULL, NULL, NULL);
+  if (rc != SQLITE_OK) { sqlite3_close(db); return 0; }
+
+  sqlite3_prepare_v2(db,
+    "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmtInsert, NULL);
+  sqlite3_prepare_v2(db,
+    "DELETE FROM v WHERE rowid = ?", -1, &stmtDelete, NULL);
+  sqlite3_prepare_v2(db,
+    "SELECT rowid, emb FROM v", -1, &stmtScan, NULL);
+
+  if (!stmtInsert || !stmtDelete || !stmtScan) goto cleanup;
+
+  /* live[r - 1] is 1 while rowid r is expected to exist in the table */
+  uint8_t live[16];
+  memset(live, 0, sizeof(live));
+
+  size_t i = 0;
+  while (i + 2 <= size - 2) { /* the final 2 input bytes are never consumed */
+    uint8_t op = data[i++] % 3;
+    uint8_t rowid_byte = data[i++];
+    int64_t rowid = (int64_t)(rowid_byte % 16) + 1;
+
+    switch (op) {
+    case 0: {
+      /* INSERT: up to 4 more bytes become the vector, missing ones stay 0 */
+      float vec[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+      for (int j = 0; j < 4 && i < size - 2; j++, i++) {
+        vec[j] = (float)((int8_t)data[i]) / 10.0f;
+      }
+      sqlite3_reset(stmtInsert);
+      sqlite3_bind_int64(stmtInsert, 1, rowid);
+      sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT);
+      rc = sqlite3_step(stmtInsert);
+      if (rc == SQLITE_DONE) {
+        live[rowid - 1] = 1;
+      }
+      break;
+    }
+    case 1: {
+      /* DELETE */
+      sqlite3_reset(stmtDelete);
+      sqlite3_bind_int64(stmtDelete, 1, rowid);
+      rc = sqlite3_step(stmtDelete);
+      if (rc == SQLITE_DONE) {
+        live[rowid - 1] = 0;
+      }
+      break;
+    }
+    case 2: {
+      /* Full scan */
+      sqlite3_reset(stmtScan);
+      while (sqlite3_step(stmtScan) == SQLITE_ROW) {}
+      break;
+    }
+    }
+  }
+
+  /* Run optimize */
+  rc = sqlite3_exec(db, "INSERT INTO v(v) VALUES ('optimize')", NULL, NULL, NULL);
+  assert(rc == SQLITE_OK);
+
+  /* Verify: the table holds exactly as many rows as are tracked live */
+  int expected_count = 0;
+  for (int j = 0; j < 16; j++) {
+    if (live[j]) expected_count++;
+  }
+
+  sqlite3_stmt *stmtCount = NULL;
+  sqlite3_prepare_v2(db, "SELECT count(*) FROM v", -1, &stmtCount, NULL);
+  if (stmtCount) {
+    rc = sqlite3_step(stmtCount);
+    assert(rc == SQLITE_ROW);
+    int actual_count = sqlite3_column_int(stmtCount, 0);
+    assert(actual_count == expected_count);
+    sqlite3_finalize(stmtCount);
+  }
+
+  /* Verify each live row is accessible via point query */
+  sqlite3_stmt *stmtPoint = NULL;
+  sqlite3_prepare_v2(db, "SELECT emb FROM v WHERE rowid = ?", -1, &stmtPoint, NULL);
+  if (stmtPoint) {
+    for (int j = 0; j < 16; j++) {
+      if (!live[j]) continue;
+      sqlite3_reset(stmtPoint);
+      sqlite3_bind_int64(stmtPoint, 1, j + 1);
+      rc = sqlite3_step(stmtPoint);
+      assert(rc == SQLITE_ROW);
+      assert(sqlite3_column_bytes(stmtPoint, 0) == 16);
+    }
+    sqlite3_finalize(stmtPoint);
+  }
+
+  /* Verify shadow table consistency: _rowids count matches live count */
+  sqlite3_stmt *stmtRowids = NULL;
+  sqlite3_prepare_v2(db, "SELECT count(*) FROM v_rowids", -1, &stmtRowids, NULL);
+  if (stmtRowids) {
+    rc = sqlite3_step(stmtRowids);
+    assert(rc == SQLITE_ROW);
+    assert(sqlite3_column_int(stmtRowids, 0) == expected_count);
+    sqlite3_finalize(stmtRowids);
+  }
+
+cleanup:
+  sqlite3_finalize(stmtInsert);
+  sqlite3_finalize(stmtDelete);
+  sqlite3_finalize(stmtScan);
+  sqlite3_close(db);
+  return 0;
+}
diff --git a/tests/test-optimize.py b/tests/test-optimize.py
new file mode 100644
index 0000000..67179e0
--- /dev/null
+++ b/tests/test-optimize.py
@@ -0,0 +1,450 @@
+import sqlite3
+import struct
+import pytest
+from helpers import _f32, _i64, _int8, exec
+
+
+def test_optimize_basic(db):
+    """Insert 16 rows (2 chunks of 8), delete 6 from chunk 1, optimize → still 2 chunks."""
+    db.execute("create virtual table v using vec0(emb float[4], chunk_size=8)")
+
+    for i in range(1, 17):
+        db.execute(
+            "insert into v(rowid, emb) values (?, ?)",
+            [i, _f32([float(i)] * 4)],
+        )
+
+    assert db.execute("select count(*) from v_chunks").fetchone()[0] == 2
+
+    # Delete 6 from chunk 1 (rows 1-6), leaving 2 live in chunk 1
+    for i in range(1, 7):
+        db.execute("delete from v where rowid = ?", [i])
+
+    # 10 live rows: 2 in chunk 1, 8 in chunk 2
+    assert db.execute("select count(*) from v").fetchone()[0] == 10
+
+    db.execute("insert into v(v) values ('optimize')")
+
+    # After optimize: 10 entries should fit in 2 chunks (8+2)
+    # but the 8 from chunk 2 can't all be moved into 6 free slots of chunk 1,
+    # so we should still have at least 2 chunks.
+    # Actually: left=chunk1(6 free), right=chunk2(8 live)
+    # Move 6 entries from chunk2 → chunk1, chunk2 still has 2 live → 2 chunks remain
+    assert db.execute("select count(*) from v_chunks").fetchone()[0] == 2
+
+    # All 10 rows still queryable
+    rows = db.execute("select rowid from v order by rowid").fetchall()
+    assert [r[0] for r in rows] == list(range(7, 17))
+
+    for i in range(7, 17):
+        row = db.execute("select emb from v where rowid = ?", [i]).fetchone()
+        assert row[0] == _f32([float(i)] * 4)
+
+
+def test_optimize_full_compaction(db):
+    """Three full chunks collapse into one once only four rows are left alive."""
+    db.execute("create virtual table v using vec0(emb float[4], chunk_size=8)")
+
+    # 24 rows at chunk_size=8 fill exactly three chunks
+    for rid in range(1, 25):
+        vector = _f32([float(rid)] * 4)
+        db.execute("insert into v(rowid, emb) values (?, ?)", [rid, vector])
+
+    (n_chunks,) = db.execute("select count(*) from v_chunks").fetchone()
+    assert n_chunks == 3
+
+    # Rows 1-4 (all in the first chunk) survive; rows 5-24 are deleted
+    for rid in range(5, 25):
+        db.execute("delete from v where rowid = ?", [rid])
+
+    assert db.execute("select count(*) from v").fetchone()[0] == 4
+
+    db.execute("insert into v(v) values ('optimize')")
+
+    # The emptied chunks are gone from both chunked shadow tables
+    assert db.execute("select count(*) from v_chunks").fetchone()[0] == 1
+    assert db.execute("select count(*) from v_vector_chunks00").fetchone()[0] == 1
+
+    # The survivors keep the vectors they were inserted with
+    for rid in range(1, 5):
+        (emb,) = db.execute("select emb from v where rowid = ?", [rid]).fetchone()
+        assert emb == _f32([float(rid)] * 4)
+
+
+def test_optimize_noop_clean_table(db):
+    """Insert exactly 8 rows (1 full chunk), optimize is a no-op."""
+    db.execute("create virtual table v using vec0(emb float[4], chunk_size=8)")
+
+    for i in range(1, 9):
+        db.execute(
+            "insert into v(rowid, emb) values (?, ?)",
+            [i, _f32([float(i)] * 4)],
+        )
+
+    db.execute("insert into v(v) values ('optimize')")
+
+    assert db.execute("select count(*) from v_chunks").fetchone()[0] == 1
+    for i in range(1, 9):
+        row = db.execute("select
emb from v where rowid = ?", [i]).fetchone()
+        assert row[0] == _f32([float(i)] * 4)
+
+
+def test_optimize_empty_table(db):
+    """Optimizing a table that never held a row leaves it without any chunk."""
+    db.execute("create virtual table v using vec0(emb float[4], chunk_size=8)")
+    db.execute("insert into v(v) values ('optimize')")
+    assert db.execute("select count(*) from v_chunks").fetchone()[0] == 0
+
+
+def test_optimize_knn_still_works(db):
+    """A KNN query issued after optimize still finds the right neighbour."""
+    db.execute("create virtual table v using vec0(emb float[4], chunk_size=8)")
+
+    # Rowid n stores the vector [n, n, n, n]
+    for rid in range(1, 17):
+        vector = _f32([float(rid)] * 4)
+        db.execute("insert into v(rowid, emb) values (?, ?)", [rid, vector])
+
+    # Rows 1-6 go away, so rowid 7 holds the smallest surviving vector
+    for rid in range(1, 7):
+        db.execute("delete from v where rowid = ?", [rid])
+
+    db.execute("insert into v(v) values ('optimize')")
+
+    # The single nearest neighbour of [7, 7, 7, 7] must be rowid 7 itself
+    query = _f32([7.0, 7.0, 7.0, 7.0])
+    nearest = db.execute(
+        "select rowid, distance from v where emb match ? and k = 1",
+        [query],
+    ).fetchall()
+    assert len(nearest) == 1
+    assert nearest[0][0] == 7
+
+
+def test_optimize_fullscan_still_works(db):
+    """A full table scan after optimize returns every surviving row intact."""
+    db.execute("create virtual table v using vec0(emb float[4], chunk_size=8)")
+
+    for rid in range(1, 17):
+        vector = _f32([float(rid)] * 4)
+        db.execute("insert into v(rowid, emb) values (?, ?)", [rid, vector])
+
+    # Punch six holes into the first chunk
+    for rid in range(1, 7):
+        db.execute("delete from v where rowid = ?", [rid])
+
+    db.execute("insert into v(v) values ('optimize')")
+
+    rows = db.execute("select rowid, emb from v order by rowid").fetchall()
+    assert len(rows) == 10
+    # Each row must still carry the vector derived from its own rowid
+    for rowid, emb in rows:
+        assert emb == _f32([float(rowid)] * 4)
+
+
+def test_optimize_partitioned(db):
+    """Two fragmented partitions are each compacted on their own."""
+    db.execute(
+        "create virtual table v using vec0("
+        "part text partition key, emb float[4], chunk_size=8"
+        ")"
+    )
+
+    # Partition A: rowids 1-16 (2 chunks)
+    for rid in range(1, 17):
+        vector = _f32([float(rid)] * 4)
+        db.execute(
+            "insert into v(rowid, part, emb) values (?, 'A', ?)", [rid, vector]
+        )
+
+    # Partition B: rowids 17-32 (2 chunks)
+    for rid in range(17, 33):
+        vector = _f32([float(rid)] * 4)
+        db.execute(
+            "insert into v(rowid, part, emb) values (?, 'B', ?)", [rid, vector]
+        )
+
+    assert db.execute("select count(*) from v_chunks").fetchone()[0] == 4
+
+    # Remove the first 7 rows of each partition (all from its first chunk)
+    for rid in range(1, 8):
+        db.execute("delete from v where rowid = ?", [rid])
+    for rid in range(17, 24):
+        db.execute("delete from v where rowid = ?", [rid])
+
+    db.execute("insert into v(v) values ('optimize')")
+
+    # 9 live rows per partition need 2 chunks of 8, so 4 chunks remain:
+    # 7 entries move into the first chunk and 1 stays behind in the second.
+    assert db.execute("select count(*) from v_chunks").fetchone()[0] == 4
+
+    # Every surviving row of both partitions is still readable
+    for rid in range(8, 17):
+        (emb,) = db.execute("select emb from v where rowid = ?", [rid]).fetchone()
+        assert emb == _f32([float(rid)] * 4)
+    for rid in range(24, 33):
+        (emb,) = db.execute("select emb from v where rowid = ?", [rid]).fetchone()
+        assert emb == _f32([float(rid)] * 4)
+
+
+def test_optimize_with_metadata(db):
+    """Optimize with integer, float, boolean, and short text metadata."""
+    db.execute(
+        "create virtual table v using vec0("
+        "emb float[4], "
+        "m_bool boolean, "
+        "m_int integer, "
+        "m_float float, "
+        "m_text text, "
+        "chunk_size=8"
+        ")"
+    )
+
+    for i in range(1, 17):
+        db.execute(
+            "insert into v(rowid, emb, m_bool, m_int, m_float, m_text) "
+            "values (?, ?, ?, ?, ?, ?)",
+            [i, _f32([float(i)] * 4), i % 2 == 0, i * 10, float(i) / 2.0, f"t{i}"],
+        )
+
+    for i in range(1, 7):
+        db.execute("delete from v where rowid = ?", [i])
+
+    db.execute("insert into v(v) values ('optimize')")
+
+    # Verify metadata preserved
+    for i in range(7, 17):
+        row = db.execute(
+            "select m_bool, m_int, m_float, m_text from v where rowid = ?", [i]
+        ).fetchone()
+        assert row[0] == (1 if i % 2 == 0 else 0), f"bool mismatch at rowid {i}"
+        assert row[1] == i * 10, f"int
mismatch at rowid {i}"
+        assert abs(row[2] - float(i) / 2.0) < 1e-6, f"float mismatch at rowid {i}"
+        assert row[3] == f"t{i}", f"text mismatch at rowid {i}"
+
+
+def test_optimize_with_auxiliary(db):
+    """Auxiliary column values stay readable by rowid after optimize."""
+    db.execute(
+        "create virtual table v using vec0("
+        "emb float[4], +aux_text text, chunk_size=8"
+        ")"
+    )
+
+    # Rowid n carries the auxiliary text "aux_n"
+    for rid in range(1, 17):
+        params = [rid, _f32([float(rid)] * 4), f"aux_{rid}"]
+        db.execute("insert into v(rowid, emb, aux_text) values (?, ?, ?)", params)
+
+    for rid in range(1, 7):
+        db.execute("delete from v where rowid = ?", [rid])
+
+    db.execute("insert into v(v) values ('optimize')")
+
+    # Each surviving row must still return its own auxiliary text
+    lookup = "select aux_text from v where rowid = ?"
+    for rid in range(7, 17):
+        (aux_text,) = db.execute(lookup, [rid]).fetchone()
+        assert aux_text == f"aux_{rid}"
+
+
+def test_optimize_text_pk(db):
+    """Rows of a text-primary-key table are still found by id after optimize."""
+    db.execute(
+        "create virtual table v using vec0("
+        "id text primary key, emb float[4], chunk_size=8"
+        ")"
+    )
+
+    # The document "doc_n" stores the vector [n, n, n, n]
+    for n in range(1, 17):
+        params = [f"doc_{n}", _f32([float(n)] * 4)]
+        db.execute("insert into v(id, emb) values (?, ?)", params)
+
+    for n in range(1, 7):
+        db.execute("delete from v where id = ?", [f"doc_{n}"])
+
+    db.execute("insert into v(v) values ('optimize')")
+
+    # Look every surviving document up through its text key
+    lookup = "select emb from v where id = ?"
+    for n in range(7, 17):
+        (emb,) = db.execute(lookup, [f"doc_{n}"]).fetchone()
+        assert emb == _f32([float(n)] * 4)
+
+
+def _file_db(tmp_path):
+    """Return a connection to an on-disk DB under tmp_path with vec0 loaded."""
+    conn = sqlite3.connect(str(tmp_path / "test.db"))
+    conn.row_factory = sqlite3.Row
+    conn.enable_load_extension(True)
+    conn.load_extension("dist/vec0")
+    conn.enable_load_extension(False)
+    return conn
+
+
+def test_optimize_disk_space_reclaimed(tmp_path):
+    """PRAGMA page_count decreases after optimize + VACUUM."""
+    dims = 256
+    db = _file_db(tmp_path)
+    db.execute(f"create virtual table v using vec0(emb float[{dims}], chunk_size=8)")
+
+    for i in
range(1, 25): # 3 full chunks of 8 + db.execute( + "insert into v(rowid, emb) values (?, ?)", + [i, _f32([float(i)] * dims)], + ) + db.commit() + pages_before = db.execute("pragma page_count").fetchone()[0] + + # Delete 20 of 24 rows (leaving 4 live) + for i in range(5, 25): + db.execute("delete from v where rowid = ?", [i]) + + db.execute("insert into v(v) values ('optimize')") + db.commit() + + db.execute("vacuum") + pages_after = db.execute("pragma page_count").fetchone()[0] + assert pages_after < pages_before, ( + f"page_count should shrink after optimize+vacuum: " + f"{pages_before} -> {pages_after}" + ) + + # Remaining rows still work + for i in range(1, 5): + row = db.execute("select emb from v where rowid = ?", [i]).fetchone() + assert row[0] == _f32([float(i)] * dims) + db.close() + + +def test_optimize_unknown_command(db): + """Unknown command gives SQLITE_ERROR with message.""" + result = exec(db, "insert into v(v) values ('bogus')") + # We need a table first + db.execute("create virtual table v2 using vec0(emb float[4], chunk_size=8)") + result = exec(db, "insert into v2(v2) values ('bogus')") + assert "error" in result + assert "Unknown" in result["message"] or "unknown" in result["message"] + + +def test_optimize_insert_after(db): + """Inserting new rows after optimize still works correctly.""" + db.execute("create virtual table v using vec0(emb float[4], chunk_size=8)") + + for i in range(1, 17): + db.execute( + "insert into v(rowid, emb) values (?, ?)", + [i, _f32([float(i)] * 4)], + ) + + for i in range(1, 7): + db.execute("delete from v where rowid = ?", [i]) + + db.execute("insert into v(v) values ('optimize')") + + # Insert new rows after optimize + for i in range(100, 108): + db.execute( + "insert into v(rowid, emb) values (?, ?)", + [i, _f32([float(i)] * 4)], + ) + + # Both old and new rows queryable + for i in range(7, 17): + row = db.execute("select emb from v where rowid = ?", [i]).fetchone() + assert row[0] == _f32([float(i)] * 4) + for i 
in range(100, 108): + row = db.execute("select emb from v where rowid = ?", [i]).fetchone() + assert row[0] == _f32([float(i)] * 4) + + +def test_optimize_multiple_moves_from_same_chunk(db): + """Ensure multiple live entries in the same source chunk are all moved.""" + db.execute("create virtual table v using vec0(emb float[4], chunk_size=8)") + + # 24 rows = 3 chunks of 8 + for i in range(1, 25): + db.execute( + "insert into v(rowid, emb) values (?, ?)", + [i, _f32([float(i)] * 4)], + ) + + # Delete all of chunk 1 (1-8) — leaves 8 free slots + for i in range(1, 9): + db.execute("delete from v where rowid = ?", [i]) + + # Delete half of chunk 2 (9-12) — leaves 4 live in chunk 2, 8 live in chunk 3 + for i in range(9, 13): + db.execute("delete from v where rowid = ?", [i]) + + # 12 live rows total: 4 in chunk 2 (offsets 4-7), 8 in chunk 3 (offsets 0-7) + assert db.execute("select count(*) from v").fetchone()[0] == 12 + + db.execute("insert into v(v) values ('optimize')") + + # After optimize: all 12 should fit in 2 chunks, chunk 3 should be emptied + assert db.execute("select count(*) from v_chunks").fetchone()[0] == 2 + + # All remaining rows still queryable with correct vectors + for i in range(13, 25): + row = db.execute("select emb from v where rowid = ?", [i]).fetchone() + assert row[0] == _f32([float(i)] * 4) + + +def test_optimize_scattered_deletes(db): + """Delete every other row to create scattered free slots across chunks.""" + db.execute("create virtual table v using vec0(emb float[4], chunk_size=8)") + + for i in range(1, 25): + db.execute( + "insert into v(rowid, emb) values (?, ?)", + [i, _f32([float(i)] * 4)], + ) + + # Delete even rows: 2,4,6,8,10,12,14,16,18,20,22,24 + for i in range(2, 25, 2): + db.execute("delete from v where rowid = ?", [i]) + + # 12 live rows scattered across 3 chunks + assert db.execute("select count(*) from v").fetchone()[0] == 12 + + db.execute("insert into v(v) values ('optimize')") + + # After optimize: 12 rows should fit in 
2 chunks + assert db.execute("select count(*) from v_chunks").fetchone()[0] == 2 + + # All remaining odd rows still queryable + for i in range(1, 25, 2): + row = db.execute("select emb from v where rowid = ?", [i]).fetchone() + assert row[0] == _f32([float(i)] * 4) + + +def test_optimize_with_long_text_metadata(db): + """Long text metadata (overflow) preserved after optimize.""" + db.execute( + "create virtual table v using vec0(" + "emb float[4], m_text text, chunk_size=8" + ")" + ) + + long_text = "x" * 100 # >12 chars, stored in overflow table + + for i in range(1, 17): + db.execute( + "insert into v(rowid, emb, m_text) values (?, ?, ?)", + [i, _f32([float(i)] * 4), f"{long_text}_{i}"], + ) + + for i in range(1, 7): + db.execute("delete from v where rowid = ?", [i]) + + db.execute("insert into v(v) values ('optimize')") + + for i in range(7, 17): + row = db.execute( + "select m_text from v where rowid = ?", [i] + ).fetchone() + assert row[0] == f"{long_text}_{i}" diff --git a/tests/test-unit.c b/tests/test-unit.c index 269a990..7199dba 100644 --- a/tests/test-unit.c +++ b/tests/test-unit.c @@ -659,6 +659,252 @@ void test_distance_hamming() { printf(" All distance_hamming tests passed.\n"); } +// Helper: create an in-memory DB with vec0 loaded +static sqlite3 *test_db_open(void) { + sqlite3 *db; + int rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + return db; +} + +// Helper: execute SQL, assert success +static void test_exec(sqlite3 *db, const char *sql) { + char *errmsg = NULL; + int rc = sqlite3_exec(db, sql, NULL, NULL, &errmsg); + if (rc != SQLITE_OK) { + fprintf(stderr, "SQL error: %s\n SQL: %s\n", errmsg ? 
errmsg : "(null)", sql); + sqlite3_free(errmsg); + assert(0); + } +} + +// Helper: execute SQL, return integer from first column of first row +static int test_exec_int(sqlite3 *db, const char *sql) { + sqlite3_stmt *stmt; + int rc = sqlite3_prepare_v2(db, sql, -1, &stmt, NULL); + assert(rc == SQLITE_OK); + rc = sqlite3_step(stmt); + assert(rc == SQLITE_ROW); + int val = sqlite3_column_int(stmt, 0); + sqlite3_finalize(stmt); + return val; +} + +// Helper: insert a float[4] vector with given rowid +static void test_insert_f4(sqlite3 *db, int64_t rowid, float v0, float v1, float v2, float v3) { + sqlite3_stmt *stmt; + int rc = sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmt, NULL); + assert(rc == SQLITE_OK); + float vec[4] = {v0, v1, v2, v3}; + sqlite3_bind_int64(stmt, 1, rowid); + sqlite3_bind_blob(stmt, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + rc = sqlite3_step(stmt); + assert(rc == SQLITE_DONE); + sqlite3_finalize(stmt); +} + +// Helper: verify a float[4] vector at given rowid +static void test_verify_f4(sqlite3 *db, int64_t rowid, float v0, float v1, float v2, float v3) { + sqlite3_stmt *stmt; + int rc = sqlite3_prepare_v2(db, + "SELECT emb FROM v WHERE rowid = ?", -1, &stmt, NULL); + assert(rc == SQLITE_OK); + sqlite3_bind_int64(stmt, 1, rowid); + rc = sqlite3_step(stmt); + assert(rc == SQLITE_ROW); + const float *blob = sqlite3_column_blob(stmt, 0); + assert(blob != NULL); + assert(sqlite3_column_bytes(stmt, 0) == 16); + float eps = 1e-6f; + assert(fabsf(blob[0] - v0) < eps); + assert(fabsf(blob[1] - v1) < eps); + assert(fabsf(blob[2] - v2) < eps); + assert(fabsf(blob[3] - v3) < eps); + sqlite3_finalize(stmt); +} + +void test_optimize_basic(void) { + printf("Starting %s...\n", __func__); + sqlite3 *db = test_db_open(); + test_exec(db, "CREATE VIRTUAL TABLE v USING vec0(emb float[4], chunk_size=8)"); + + // Insert 16 rows (2 chunks) + for (int i = 1; i <= 16; i++) { + test_insert_f4(db, i, (float)i, (float)i, (float)i, (float)i); + 
} + assert(test_exec_int(db, "SELECT count(*) FROM v_chunks") == 2); + + // Delete first 6 rows + for (int i = 1; i <= 6; i++) { + char sql[64]; + snprintf(sql, sizeof(sql), "DELETE FROM v WHERE rowid = %d", i); + test_exec(db, sql); + } + assert(test_exec_int(db, "SELECT count(*) FROM v") == 10); + + // Optimize + test_exec(db, "INSERT INTO v(v) VALUES ('optimize')"); + + // All remaining rows still queryable + for (int i = 7; i <= 16; i++) { + test_verify_f4(db, i, (float)i, (float)i, (float)i, (float)i); + } + + sqlite3_close(db); + printf(" Passed.\n"); +} + +void test_optimize_full_compaction(void) { + printf("Starting %s...\n", __func__); + sqlite3 *db = test_db_open(); + test_exec(db, "CREATE VIRTUAL TABLE v USING vec0(emb float[4], chunk_size=8)"); + + for (int i = 1; i <= 24; i++) { + test_insert_f4(db, i, (float)i, (float)i, (float)i, (float)i); + } + assert(test_exec_int(db, "SELECT count(*) FROM v_chunks") == 3); + + // Keep 1-4, delete 5-24 + for (int i = 5; i <= 24; i++) { + char sql[64]; + snprintf(sql, sizeof(sql), "DELETE FROM v WHERE rowid = %d", i); + test_exec(db, sql); + } + + test_exec(db, "INSERT INTO v(v) VALUES ('optimize')"); + + // Should compact to 1 chunk + assert(test_exec_int(db, "SELECT count(*) FROM v_chunks") == 1); + assert(test_exec_int(db, "SELECT count(*) FROM v_vector_chunks00") == 1); + + for (int i = 1; i <= 4; i++) { + test_verify_f4(db, i, (float)i, (float)i, (float)i, (float)i); + } + + sqlite3_close(db); + printf(" Passed.\n"); +} + +void test_optimize_empty_table(void) { + printf("Starting %s...\n", __func__); + sqlite3 *db = test_db_open(); + test_exec(db, "CREATE VIRTUAL TABLE v USING vec0(emb float[4], chunk_size=8)"); + + // Optimize on empty table — should be no-op + test_exec(db, "INSERT INTO v(v) VALUES ('optimize')"); + assert(test_exec_int(db, "SELECT count(*) FROM v_chunks") == 0); + + sqlite3_close(db); + printf(" Passed.\n"); +} + +void test_optimize_noop_full_chunk(void) { + printf("Starting %s...\n", 
__func__); + sqlite3 *db = test_db_open(); + test_exec(db, "CREATE VIRTUAL TABLE v USING vec0(emb float[4], chunk_size=8)"); + + for (int i = 1; i <= 8; i++) { + test_insert_f4(db, i, (float)i, (float)i, (float)i, (float)i); + } + + // Single full chunk — optimize is no-op + test_exec(db, "INSERT INTO v(v) VALUES ('optimize')"); + assert(test_exec_int(db, "SELECT count(*) FROM v_chunks") == 1); + + for (int i = 1; i <= 8; i++) { + test_verify_f4(db, i, (float)i, (float)i, (float)i, (float)i); + } + + sqlite3_close(db); + printf(" Passed.\n"); +} + +void test_optimize_knn_after(void) { + printf("Starting %s...\n", __func__); + sqlite3 *db = test_db_open(); + test_exec(db, "CREATE VIRTUAL TABLE v USING vec0(emb float[4], chunk_size=8)"); + + for (int i = 1; i <= 16; i++) { + test_insert_f4(db, i, (float)i, 0, 0, 0); + } + + for (int i = 1; i <= 6; i++) { + char sql[64]; + snprintf(sql, sizeof(sql), "DELETE FROM v WHERE rowid = %d", i); + test_exec(db, sql); + } + + test_exec(db, "INSERT INTO v(v) VALUES ('optimize')"); + + // KNN: find vector closest to [7,0,0,0] + sqlite3_stmt *stmt; + float query[4] = {7.0f, 0.0f, 0.0f, 0.0f}; + int rc = sqlite3_prepare_v2(db, + "SELECT rowid FROM v WHERE emb MATCH ? 
AND k = 1", -1, &stmt, NULL); + assert(rc == SQLITE_OK); + sqlite3_bind_blob(stmt, 1, query, sizeof(query), SQLITE_TRANSIENT); + rc = sqlite3_step(stmt); + assert(rc == SQLITE_ROW); + assert(sqlite3_column_int64(stmt, 0) == 7); + sqlite3_finalize(stmt); + + sqlite3_close(db); + printf(" Passed.\n"); +} + +void test_optimize_insert_after(void) { + printf("Starting %s...\n", __func__); + sqlite3 *db = test_db_open(); + test_exec(db, "CREATE VIRTUAL TABLE v USING vec0(emb float[4], chunk_size=8)"); + + for (int i = 1; i <= 16; i++) { + test_insert_f4(db, i, (float)i, (float)i, (float)i, (float)i); + } + + for (int i = 1; i <= 6; i++) { + char sql[64]; + snprintf(sql, sizeof(sql), "DELETE FROM v WHERE rowid = %d", i); + test_exec(db, sql); + } + + test_exec(db, "INSERT INTO v(v) VALUES ('optimize')"); + + // Insert new rows after optimize + for (int i = 100; i < 108; i++) { + test_insert_f4(db, i, (float)i, (float)i, (float)i, (float)i); + } + + // Both old and new rows queryable + for (int i = 7; i <= 16; i++) { + test_verify_f4(db, i, (float)i, (float)i, (float)i, (float)i); + } + for (int i = 100; i < 108; i++) { + test_verify_f4(db, i, (float)i, (float)i, (float)i, (float)i); + } + + sqlite3_close(db); + printf(" Passed.\n"); +} + +void test_optimize_unknown_command(void) { + printf("Starting %s...\n", __func__); + sqlite3 *db = test_db_open(); + test_exec(db, "CREATE VIRTUAL TABLE v USING vec0(emb float[4], chunk_size=8)"); + + char *errmsg = NULL; + int rc = sqlite3_exec(db, "INSERT INTO v(v) VALUES ('bogus')", NULL, NULL, &errmsg); + assert(rc != SQLITE_OK); + assert(errmsg != NULL); + assert(strstr(errmsg, "nknown") != NULL || strstr(errmsg, "unknown") != NULL); + sqlite3_free(errmsg); + + sqlite3_close(db); + printf(" Passed.\n"); +} + int main() { printf("Starting unit tests...\n"); #ifdef SQLITE_VEC_ENABLE_AVX @@ -677,5 +923,12 @@ int main() { test_distance_l2_sqr_float(); test_distance_cosine_float(); test_distance_hamming(); + test_optimize_basic(); + 
test_optimize_full_compaction(); + test_optimize_empty_table(); + test_optimize_noop_full_chunk(); + test_optimize_knn_after(); + test_optimize_insert_after(); + test_optimize_unknown_command(); printf("All unit tests passed.\n"); }