diff --git a/sqlite-vec.c b/sqlite-vec.c index cc09078..a4b9c11 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -3447,6 +3447,13 @@ static sqlite3_module vec_npy_eachModule = { #define VEC0_SHADOW_VECTOR_N_NAME "\"%w\".\"%w_vector_chunks%02d\"" /// 1) schema, 2) original vtab table name +// +// IMPORTANT: "rowid" is declared as PRIMARY KEY but WITHOUT the INTEGER type. +// This means it is NOT a true SQLite rowid alias — the user-defined "rowid" +// column and the internal SQLite rowid (_rowid_) are two separate values. +// When inserting, both must be set explicitly to keep them in sync. See the +// _rowid_ bindings in vec0_new_chunk() and the explanation in +// SHADOW_TABLE_ROWID_QUIRK below. #define VEC0_SHADOW_VECTOR_N_CREATE \ "CREATE TABLE " VEC0_SHADOW_VECTOR_N_NAME "(" \ "rowid PRIMARY KEY," \ @@ -4506,6 +4513,20 @@ int vec0_new_chunk(vec0_vtab *p, sqlite3_value ** partitionKeyValues, i64 *chunk // Step 2: Create new vector chunks for each vector column, with // that new chunk_rowid. + // + // SHADOW_TABLE_ROWID_QUIRK: The _vector_chunksNN and _metadatachunksNN + // shadow tables declare "rowid PRIMARY KEY" without the INTEGER type, so + // the user-defined "rowid" column is NOT an alias for the internal SQLite + // rowid (_rowid_). When only appending rows these two happen to stay in + // sync, but after a chunk is deleted (vec0Update_Delete_DeleteChunkIfEmpty) + // and a new one is created, the auto-assigned _rowid_ can diverge from the + // user "rowid" value. Since sqlite3_blob_open() addresses rows by internal + // _rowid_, we must explicitly set BOTH _rowid_ and "rowid" to the same + // value so that later blob operations can find the row. + // + // The correct long-term fix is changing the schema to + // "rowid INTEGER PRIMARY KEY" + // which makes it a true alias, but that would break existing databases. 
for (int i = 0; i < vec0_num_defined_user_columns(p); i++) { if(p->user_column_kinds[i] != SQLITE_VEC0_USER_COLUMN_KIND_VECTOR) { @@ -4515,9 +4536,10 @@ int vec0_new_chunk(vec0_vtab *p, sqlite3_value ** partitionKeyValues, i64 *chunk i64 vectorsSize = p->chunk_size * vector_column_byte_size(p->vector_columns[vector_column_idx]); + // See SHADOW_TABLE_ROWID_QUIRK above for why _rowid_ and rowid are both set. zSql = sqlite3_mprintf("INSERT INTO " VEC0_SHADOW_VECTOR_N_NAME - "(rowid, vectors)" - "VALUES (?, ?)", + "(_rowid_, rowid, vectors)" + "VALUES (?, ?, ?)", p->schemaName, p->tableName, vector_column_idx); if (!zSql) { return SQLITE_NOMEM; @@ -4530,8 +4552,9 @@ int vec0_new_chunk(vec0_vtab *p, sqlite3_value ** partitionKeyValues, i64 *chunk return rc; } - sqlite3_bind_int64(stmt, 1, rowid); - sqlite3_bind_zeroblob64(stmt, 2, vectorsSize); + sqlite3_bind_int64(stmt, 1, rowid); // _rowid_ (internal SQLite rowid) + sqlite3_bind_int64(stmt, 2, rowid); // rowid (user-defined column) + sqlite3_bind_zeroblob64(stmt, 3, vectorsSize); rc = sqlite3_step(stmt); sqlite3_finalize(stmt); @@ -4546,9 +4569,10 @@ int vec0_new_chunk(vec0_vtab *p, sqlite3_value ** partitionKeyValues, i64 *chunk continue; } int metadata_column_idx = p->user_column_idxs[i]; + // See SHADOW_TABLE_ROWID_QUIRK above for why _rowid_ and rowid are both set. 
zSql = sqlite3_mprintf("INSERT INTO " VEC0_SHADOW_METADATA_N_NAME - "(rowid, data)" - "VALUES (?, ?)", + "(_rowid_, rowid, data)" + "VALUES (?, ?, ?)", p->schemaName, p->tableName, metadata_column_idx); if (!zSql) { return SQLITE_NOMEM; @@ -4561,8 +4585,9 @@ int vec0_new_chunk(vec0_vtab *p, sqlite3_value ** partitionKeyValues, i64 *chunk return rc; } - sqlite3_bind_int64(stmt, 1, rowid); - sqlite3_bind_zeroblob64(stmt, 2, vec0_metadata_chunk_size(p->metadata_columns[metadata_column_idx].kind, p->chunk_size)); + sqlite3_bind_int64(stmt, 1, rowid); // _rowid_ (internal SQLite rowid) + sqlite3_bind_int64(stmt, 2, rowid); // rowid (user-defined column) + sqlite3_bind_zeroblob64(stmt, 3, vec0_metadata_chunk_size(p->metadata_columns[metadata_column_idx].kind, p->chunk_size)); rc = sqlite3_step(stmt); sqlite3_finalize(stmt); @@ -5126,6 +5151,8 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, sqlite3_finalize(stmt); } + // See SHADOW_TABLE_ROWID_QUIRK in vec0_new_chunk() — same "rowid PRIMARY KEY" + // without INTEGER type issue applies here. 
for (int i = 0; i < pNew->numMetadataColumns; i++) { char *zSql = sqlite3_mprintf("CREATE TABLE " VEC0_SHADOW_METADATA_N_NAME "(rowid PRIMARY KEY, data BLOB NOT NULL);", pNew->schemaName, pNew->tableName, i); @@ -8574,6 +8601,200 @@ int vec0Update_Delete_ClearValidity(vec0_vtab *p, i64 chunk_id, return SQLITE_OK; } +int vec0Update_Delete_ClearRowid(vec0_vtab *p, i64 chunk_id, + u64 chunk_offset) { + int rc, brc; + sqlite3_blob *blobChunksRowids = NULL; + i64 zero = 0; + + rc = sqlite3_blob_open(p->db, p->schemaName, p->shadowChunksName, "rowids", + chunk_id, 1, &blobChunksRowids); + if (rc != SQLITE_OK) { + vtab_set_error(&p->base, "could not open rowids blob for %s.%s.%lld", + p->schemaName, p->shadowChunksName, chunk_id); + return SQLITE_ERROR; + } + + rc = sqlite3_blob_write(blobChunksRowids, &zero, sizeof(zero), + chunk_offset * sizeof(i64)); + if (rc != SQLITE_OK) { + vtab_set_error(&p->base, + "could not write to rowids blob for %s.%s.%lld at %llu", + p->schemaName, p->shadowChunksName, chunk_id, chunk_offset); + } + + brc = sqlite3_blob_close(blobChunksRowids); + if (rc != SQLITE_OK) + return rc; + if (brc != SQLITE_OK) { + vtab_set_error(&p->base, + "vec0 deletion error: Error commiting rowids blob " + "transaction on %s.%s.%lld at %llu", + p->schemaName, p->shadowChunksName, chunk_id, chunk_offset); + return brc; + } + return SQLITE_OK; +} + +int vec0Update_Delete_ClearVectors(vec0_vtab *p, i64 chunk_id, + u64 chunk_offset) { + int rc, brc; + for (int i = 0; i < p->numVectorColumns; i++) { + sqlite3_blob *blobVectors = NULL; + size_t n = vector_column_byte_size(p->vector_columns[i]); + + rc = sqlite3_blob_open(p->db, p->schemaName, + p->shadowVectorChunksNames[i], "vectors", + chunk_id, 1, &blobVectors); + if (rc != SQLITE_OK) { + vtab_set_error(&p->base, + "could not open vector blob for %s.%s.%lld column %d", + p->schemaName, p->shadowVectorChunksNames[i], chunk_id, i); + return SQLITE_ERROR; + } + + void *zeroBuf = sqlite3_malloc(n); + if (!zeroBuf) { + 
sqlite3_blob_close(blobVectors); + return SQLITE_NOMEM; + } + memset(zeroBuf, 0, n); + + rc = sqlite3_blob_write(blobVectors, zeroBuf, n, chunk_offset * n); + sqlite3_free(zeroBuf); + if (rc != SQLITE_OK) { + vtab_set_error( + &p->base, + "could not write to vector blob for %s.%s.%lld at %llu column %d", + p->schemaName, p->shadowVectorChunksNames[i], chunk_id, + chunk_offset, i); + } + + brc = sqlite3_blob_close(blobVectors); + if (rc != SQLITE_OK) + return rc; + if (brc != SQLITE_OK) { + vtab_set_error(&p->base, + "vec0 deletion error: Error commiting vector blob " + "transaction on %s.%s.%lld column %d", + p->schemaName, p->shadowVectorChunksNames[i], chunk_id, i); + return brc; + } + } + return SQLITE_OK; +} + +int vec0Update_Delete_DeleteChunkIfEmpty(vec0_vtab *p, i64 chunk_id, + int *deleted) { + int rc, brc; + sqlite3_blob *blobValidity = NULL; + *deleted = 0; + + rc = sqlite3_blob_open(p->db, p->schemaName, p->shadowChunksName, "validity", + chunk_id, 0, &blobValidity); + if (rc != SQLITE_OK) { + vtab_set_error(&p->base, + "could not open validity blob for chunk %lld", chunk_id); + return SQLITE_ERROR; + } + + int validitySize = sqlite3_blob_bytes(blobValidity); + unsigned char *validityBuf = sqlite3_malloc(validitySize); + if (!validityBuf) { + sqlite3_blob_close(blobValidity); + return SQLITE_NOMEM; + } + + rc = sqlite3_blob_read(blobValidity, validityBuf, validitySize, 0); + brc = sqlite3_blob_close(blobValidity); + if (rc != SQLITE_OK) { + sqlite3_free(validityBuf); + return rc; + } + if (brc != SQLITE_OK) { + sqlite3_free(validityBuf); + return brc; + } + + int allZero = 1; + for (int i = 0; i < validitySize; i++) { + if (validityBuf[i] != 0) { + allZero = 0; + break; + } + } + sqlite3_free(validityBuf); + + if (!allZero) { + return SQLITE_OK; + } + + // All validity bits are zero — delete this chunk and its associated data + char *zSql; + sqlite3_stmt *stmt; + + // Delete from _chunks + zSql = sqlite3_mprintf( + "DELETE FROM " VEC0_SHADOW_CHUNKS_NAME 
" WHERE rowid = ?", + p->schemaName, p->tableName); + if (!zSql) + return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) + return rc; + sqlite3_bind_int64(stmt, 1, chunk_id); + rc = sqlite3_step(stmt); + sqlite3_finalize(stmt); + if (rc != SQLITE_DONE) + return SQLITE_ERROR; + + // Delete from each _vector_chunksNN + for (int i = 0; i < p->numVectorColumns; i++) { + zSql = sqlite3_mprintf( + "DELETE FROM " VEC0_SHADOW_VECTOR_N_NAME " WHERE rowid = ?", + p->schemaName, p->tableName, i); + if (!zSql) + return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) + return rc; + sqlite3_bind_int64(stmt, 1, chunk_id); + rc = sqlite3_step(stmt); + sqlite3_finalize(stmt); + if (rc != SQLITE_DONE) + return SQLITE_ERROR; + } + + // Delete from each _metadatachunksNN + for (int i = 0; i < p->numMetadataColumns; i++) { + zSql = sqlite3_mprintf( + "DELETE FROM " VEC0_SHADOW_METADATA_N_NAME " WHERE rowid = ?", + p->schemaName, p->tableName, i); + if (!zSql) + return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) + return rc; + sqlite3_bind_int64(stmt, 1, chunk_id); + rc = sqlite3_step(stmt); + sqlite3_finalize(stmt); + if (rc != SQLITE_DONE) + return SQLITE_ERROR; + } + + // Invalidate cached stmtLatestChunk so it gets re-prepared on next insert + if (p->stmtLatestChunk) { + sqlite3_finalize(p->stmtLatestChunk); + p->stmtLatestChunk = NULL; + } + + *deleted = 1; + return SQLITE_OK; +} + int vec0Update_Delete_DeleteRowids(vec0_vtab *p, i64 rowid) { int rc; sqlite3_stmt *stmt = NULL; @@ -8735,16 +8956,23 @@ int vec0Update_Delete(sqlite3_vtab *pVTab, sqlite3_value *idValue) { return rc; } + // 2. clear validity bit rc = vec0Update_Delete_ClearValidity(p, chunk_id, chunk_offset); if (rc != SQLITE_OK) { return rc; } // 3. 
zero out rowid in chunks.rowids - // https://github.com/asg017/sqlite-vec/issues/54 + rc = vec0Update_Delete_ClearRowid(p, chunk_id, chunk_offset); + if (rc != SQLITE_OK) { + return rc; + } // 4. zero out any data in vector chunks tables - // https://github.com/asg017/sqlite-vec/issues/54 + rc = vec0Update_Delete_ClearVectors(p, chunk_id, chunk_offset); + if (rc != SQLITE_OK) { + return rc; + } // 5. delete from _rowids table rc = vec0Update_Delete_DeleteRowids(p, rowid); @@ -8760,9 +8988,21 @@ int vec0Update_Delete(sqlite3_vtab *pVTab, sqlite3_value *idValue) { } } - // 6. delete metadata + // 7. delete metadata for(int i = 0; i < p->numMetadataColumns; i++) { rc = vec0Update_Delete_ClearMetadata(p, i, rowid, chunk_id, chunk_offset); + if (rc != SQLITE_OK) { + return rc; + } + } + + // 8. reclaim chunk if fully empty + { + int chunkDeleted; + rc = vec0Update_Delete_DeleteChunkIfEmpty(p, chunk_id, &chunkDeleted); + if (rc != SQLITE_OK) { + return rc; + } } return SQLITE_OK; diff --git a/tests/__snapshots__/test-auxiliary.ambr b/tests/__snapshots__/test-auxiliary.ambr index bfe3d2c..66a3ef3 100644 --- a/tests/__snapshots__/test-auxiliary.ambr +++ b/tests/__snapshots__/test-auxiliary.ambr @@ -137,7 +137,7 @@ 'chunk_id': 1, 'size': 8, 'validity': b'\x06', - 'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + 'rowids': b'\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', }), ]), }), @@ -163,7 +163,7 @@ 'rows': list([ OrderedDict({ 'rowid': 1, - 'vectors': 
b'\x00\x00\x80?\x00\x00\x00@\x00\x00@@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + 'vectors': b'\x00\x00\x00\x00\x00\x00\x00@\x00\x00@@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', }), ]), }), diff --git a/tests/__snapshots__/test-metadata.ambr b/tests/__snapshots__/test-metadata.ambr index 12212ff..ff7b112 100644 --- a/tests/__snapshots__/test-metadata.ambr +++ b/tests/__snapshots__/test-metadata.ambr @@ -27,8 +27,8 @@ OrderedDict({ 'chunk_id': 1, 'size': 8, - 'validity': b'\x02', - 'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + 'validity': b'\x06', + 'rowids': b'\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', }), ]), }), @@ -37,7 +37,7 @@ 'rows': list([ OrderedDict({ 'rowid': 1, - 'data': b'\x02', + 'data': b'\x06', }), ]), }), @@ -46,7 +46,7 @@ 'rows': list([ OrderedDict({ 'rowid': 1, - 'data': b'\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + 'data': b'\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', }), ]), }), @@ -55,7 +55,7 @@ 'rows': list([ OrderedDict({ 'rowid': 1, - 'data': 
b'\x00\x00\x00\x00\x00\x00\x00\x00\x9a\x99\x99\x99\x99\x99\x01@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + 'data': b'\x00\x00\x00\x00\x00\x00\x00\x00\x9a\x99\x99\x99\x99\x99\x01@ffffff\n@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', }), ]), }), @@ -64,13 +64,17 @@ 'rows': list([ OrderedDict({ 'rowid': 1, - 'data': b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00test2\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + 'data': b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00test2\x00\x00\x00\x00\x00\x00\x00\r\x00\x00\x00123456789012\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', }), ]), }), 'v_metadatatext03': OrderedDict({ 'sql': 'select * from v_metadatatext03', 'rows': list([ + OrderedDict({ + 'rowid': 3, + 'data': '1234567890123', + }), ]), }), 'v_rowids': OrderedDict({ @@ -82,6 +86,12 @@ 'chunk_id': 1, 'chunk_offset': 1, }), + OrderedDict({ + 'rowid': 3, + 'id': None, + 'chunk_id': 1, + 'chunk_offset': 2, + }), ]), }), 'v_vector_chunks00': OrderedDict({ @@ -89,7 +99,7 @@ 'rows': list([ OrderedDict({ 
'rowid': 1, - 'vectors': b'\x11\x11\x11\x11""""3333\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + 'vectors': b'\x00\x00\x00\x00""""3333\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', }), ]), }), @@ -264,7 +274,7 @@ 'chunk_id': 1, 'size': 8, 'validity': b'\x06', - 'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + 'rowids': b'\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', }), ]), }), @@ -335,7 +345,7 @@ 'rows': list([ OrderedDict({ 'rowid': 1, - 'vectors': b'\x11\x11\x11\x11""""3333\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + 'vectors': b'\x00\x00\x00\x00""""3333\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', }), ]), }), @@ -360,6 +370,14 @@ 'f': 2.2, 't': 'test2', }), + OrderedDict({ + 'rowid': 3, + 'vector': b'3333', + 'b': 1, + 'n': 3, + 'f': 3.3, + 't': '1234567890123', + }), ]), }) # --- diff --git a/tests/fuzz/Makefile b/tests/fuzz/Makefile index 0f1e5ba..21629ef 100644 --- a/tests/fuzz/Makefile +++ b/tests/fuzz/Makefile @@ -69,9 +69,13 @@ $(TARGET_DIR)/vec_each: vec-each.c $(FUZZ_SRCS) | $(TARGET_DIR) $(TARGET_DIR)/vec_mismatch: vec-mismatch.c $(FUZZ_SRCS) | $(TARGET_DIR) $(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@ +$(TARGET_DIR)/vec0_delete_completeness: vec0-delete-completeness.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@ + FUZZ_TARGETS = vec0_create exec json numpy \ shadow_corrupt vec0_operations scalar_functions \ - vec0_create_full 
metadata_columns vec_each vec_mismatch + vec0_create_full metadata_columns vec_each vec_mismatch \ + vec0_delete_completeness all: $(addprefix $(TARGET_DIR)/,$(FUZZ_TARGETS)) diff --git a/tests/fuzz/vec0-delete-completeness.c b/tests/fuzz/vec0-delete-completeness.c new file mode 100644 index 0000000..8603b71 --- /dev/null +++ b/tests/fuzz/vec0-delete-completeness.c @@ -0,0 +1,114 @@ +#include <stdint.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "sqlite-vec.h" +#include "sqlite3.h" +#include <assert.h> + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 6) return 0; + + int rc; + sqlite3 *db; + sqlite3_stmt *stmtInsert = NULL; + sqlite3_stmt *stmtDelete = NULL; + sqlite3_stmt *stmtScan = NULL; + sqlite3_stmt *stmtCount = NULL; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + rc = sqlite3_exec(db, + "CREATE VIRTUAL TABLE v USING vec0(emb float[4], chunk_size=4)", + NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmtInsert, NULL); + sqlite3_prepare_v2(db, + "DELETE FROM v WHERE rowid = ?", -1, &stmtDelete, NULL); + sqlite3_prepare_v2(db, + "SELECT rowid FROM v", -1, &stmtScan, NULL); + + if (!stmtInsert || !stmtDelete || !stmtScan) goto cleanup; + + size_t i = 0; + while (i + 2 <= size) { + uint8_t op = data[i++] % 3; + uint8_t rowid_byte = data[i++]; + int64_t rowid = (int64_t)(rowid_byte % 16) + 1; + + switch (op) { + case 0: { + // INSERT + float vec[4] = {0.0f, 0.0f, 0.0f, 0.0f}; + for (int j = 0; j < 4 && i < size; j++, i++) { + vec[j] = (float)((int8_t)data[i]) / 10.0f; + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, rowid); + sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + break; + } + case 1: { + // DELETE + sqlite3_reset(stmtDelete); + sqlite3_bind_int64(stmtDelete, 1, rowid); + 
sqlite3_step(stmtDelete); + break; + } + case 2: { + // Full scan + sqlite3_reset(stmtScan); + while (sqlite3_step(stmtScan) == SQLITE_ROW) {} + break; + } + } + } + + // Delete all remaining rows + sqlite3_exec(db, "DELETE FROM v", NULL, NULL, NULL); + + // Assert all shadow tables are empty after full deletion + sqlite3_prepare_v2(db, + "SELECT count(*) FROM v_rowids", -1, &stmtCount, NULL); + if (stmtCount) { + rc = sqlite3_step(stmtCount); + assert(rc == SQLITE_ROW); + assert(sqlite3_column_int(stmtCount, 0) == 0); + sqlite3_finalize(stmtCount); + stmtCount = NULL; + } + + sqlite3_prepare_v2(db, + "SELECT count(*) FROM v_chunks", -1, &stmtCount, NULL); + if (stmtCount) { + rc = sqlite3_step(stmtCount); + assert(rc == SQLITE_ROW); + assert(sqlite3_column_int(stmtCount, 0) == 0); + sqlite3_finalize(stmtCount); + stmtCount = NULL; + } + + sqlite3_prepare_v2(db, + "SELECT count(*) FROM v_vector_chunks00", -1, &stmtCount, NULL); + if (stmtCount) { + rc = sqlite3_step(stmtCount); + assert(rc == SQLITE_ROW); + assert(sqlite3_column_int(stmtCount, 0) == 0); + sqlite3_finalize(stmtCount); + stmtCount = NULL; + } + +cleanup: + sqlite3_finalize(stmtInsert); + sqlite3_finalize(stmtDelete); + sqlite3_finalize(stmtScan); + sqlite3_close(db); + return 0; +} diff --git a/tests/test-insert-delete.py b/tests/test-insert-delete.py index 343c231..cc1697b 100644 --- a/tests/test-insert-delete.py +++ b/tests/test-insert-delete.py @@ -1,7 +1,7 @@ import sqlite3 import struct import pytest -from helpers import _f32, exec +from helpers import _f32, _i64, _int8, exec def test_insert_creates_chunks_and_vectors(db, snapshot): @@ -147,3 +147,335 @@ def test_insert_validates_type(db): def test_info_table_contents(db, snapshot): db.execute("create virtual table v using vec0(emb float[4], chunk_size=8)") assert exec(db, "select key, value from v_info order by key") == snapshot() + + +def test_delete_zeroes_rowid_blob(db): + db.execute("create virtual table v using vec0(emb float[4], 
chunk_size=8)") + + for i in range(1, 4): + db.execute( + "insert into v(rowid, emb) values (?, ?)", + [i, _f32([float(i)] * 4)], + ) + + db.execute("delete from v where rowid = 2") + + blob = db.execute("select rowids from v_chunks where rowid = 1").fetchone()[0] + rowids = struct.unpack("<8q", blob) + assert rowids[0] == 1 # slot 0 intact + assert rowids[1] == 0 # slot 1 zeroed (was rowid 2) + assert rowids[2] == 3 # slot 2 intact + + +def test_delete_zeroes_vector_blob(db): + db.execute("create virtual table v using vec0(emb float[4], chunk_size=8)") + + db.execute( + "insert into v(rowid, emb) values (1, ?)", [_f32([1.0, 2.0, 3.0, 4.0])] + ) + db.execute( + "insert into v(rowid, emb) values (2, ?)", [_f32([5.0, 6.0, 7.0, 8.0])] + ) + + db.execute("delete from v where rowid = 1") + + blob = db.execute( + "select vectors from v_vector_chunks00 where rowid = 1" + ).fetchone()[0] + # First slot (4 floats = 16 bytes) should be zeroed + first_slot = struct.unpack("<4f", blob[:16]) + assert first_slot == (0.0, 0.0, 0.0, 0.0) + # Second slot should be unchanged + second_slot = struct.unpack("<4f", blob[16:32]) + assert second_slot == (5.0, 6.0, 7.0, 8.0) + + +def test_delete_all_rows_deletes_chunk(db): + db.execute("create virtual table v using vec0(emb float[4], chunk_size=8)") + + for i in range(1, 9): + db.execute( + "insert into v(rowid, emb) values (?, ?)", + [i, _f32([float(i)] * 4)], + ) + + for i in range(1, 9): + db.execute("delete from v where rowid = ?", [i]) + + assert ( + db.execute("select count(*) from v_chunks").fetchone()[0] == 0 + ) + assert ( + db.execute("select count(*) from v_vector_chunks00").fetchone()[0] == 0 + ) + + # Inserting after full deletion still works + db.execute( + "insert into v(rowid, emb) values (100, ?)", [_f32([9.0, 9.0, 9.0, 9.0])] + ) + row = db.execute("select emb from v where rowid = 100").fetchone() + assert row[0] == _f32([9.0, 9.0, 9.0, 9.0]) + + +def test_delete_chunk_multiple_chunks(db): + db.execute("create virtual 
table v using vec0(emb float[4], chunk_size=8)") + + for i in range(1, 17): + db.execute( + "insert into v(rowid, emb) values (?, ?)", + [i, _f32([float(i)] * 4)], + ) + + # Delete all rows from the first chunk (rows 1-8) + for i in range(1, 9): + db.execute("delete from v where rowid = ?", [i]) + + # Only 1 chunk should remain + assert db.execute("select count(*) from v_chunks").fetchone()[0] == 1 + + # Rows 9-16 still queryable + for i in range(9, 17): + row = db.execute("select emb from v where rowid = ?", [i]).fetchone() + assert row[0] == _f32([float(i)] * 4) + + +def test_delete_with_metadata_columns(db): + db.execute( + "create virtual table v using vec0(" + "emb float[4], " + "m_bool boolean, " + "m_int integer, " + "m_float float, " + "m_text text, " + "chunk_size=8" + ")" + ) + + for i in range(1, 9): + db.execute( + "insert into v(rowid, emb, m_bool, m_int, m_float, m_text) " + "values (?, ?, ?, ?, ?, ?)", + [i, _f32([float(i)] * 4), i % 2 == 0, i * 10, float(i) / 2.0, f"text_{i}"], + ) + + for i in range(1, 9): + db.execute("delete from v where rowid = ?", [i]) + + assert db.execute("select count(*) from v_chunks").fetchone()[0] == 0 + assert db.execute("select count(*) from v_vector_chunks00").fetchone()[0] == 0 + assert db.execute("select count(*) from v_metadatachunks00").fetchone()[0] == 0 + assert db.execute("select count(*) from v_metadatachunks01").fetchone()[0] == 0 + assert db.execute("select count(*) from v_metadatachunks02").fetchone()[0] == 0 + assert db.execute("select count(*) from v_metadatachunks03").fetchone()[0] == 0 + + +def test_delete_with_auxiliary_columns(db): + db.execute( + "create virtual table v using vec0(" + "emb float[4], " + "+aux_text text, " + "chunk_size=8" + ")" + ) + + for i in range(1, 9): + db.execute( + "insert into v(rowid, emb, aux_text) values (?, ?, ?)", + [i, _f32([float(i)] * 4), f"aux_{i}"], + ) + + for i in range(1, 9): + db.execute("delete from v where rowid = ?", [i]) + + assert db.execute("select 
count(*) from v_chunks").fetchone()[0] == 0 + assert db.execute("select count(*) from v_auxiliary").fetchone()[0] == 0 + + +def test_delete_with_text_primary_key(db): + db.execute( + "create virtual table v using vec0(" + "id text primary key, emb float[4], chunk_size=8" + ")" + ) + + db.execute( + "insert into v(id, emb) values ('a', ?)", [_f32([1.0, 2.0, 3.0, 4.0])] + ) + db.execute( + "insert into v(id, emb) values ('b', ?)", [_f32([5.0, 6.0, 7.0, 8.0])] + ) + + db.execute("delete from v where id = 'a'") + + # Vector blob slot 0 should be zeroed + blob = db.execute( + "select vectors from v_vector_chunks00 where rowid = 1" + ).fetchone()[0] + first_slot = struct.unpack("<4f", blob[:16]) + assert first_slot == (0.0, 0.0, 0.0, 0.0) + + # Remaining row still queryable + row = db.execute("select emb from v where id = 'b'").fetchone() + assert row[0] == _f32([5.0, 6.0, 7.0, 8.0]) + + +def test_delete_with_partition_keys(db): + db.execute( + "create virtual table v using vec0(" + "part text partition key, emb float[4], chunk_size=8" + ")" + ) + + for i in range(1, 9): + db.execute( + "insert into v(rowid, part, emb) values (?, 'A', ?)", + [i, _f32([float(i)] * 4)], + ) + for i in range(9, 17): + db.execute( + "insert into v(rowid, part, emb) values (?, 'B', ?)", + [i, _f32([float(i)] * 4)], + ) + + # Delete all from partition A + for i in range(1, 9): + db.execute("delete from v where rowid = ?", [i]) + + # 1 chunk should remain (partition B's) + assert db.execute("select count(*) from v_chunks").fetchone()[0] == 1 + + # Partition B rows intact + for i in range(9, 17): + row = db.execute("select emb from v where rowid = ?", [i]).fetchone() + assert row[0] == _f32([float(i)] * 4) + + # Re-insert into partition A works + db.execute( + "insert into v(rowid, part, emb) values (100, 'A', ?)", + [_f32([99.0, 99.0, 99.0, 99.0])], + ) + row = db.execute("select emb from v where rowid = 100").fetchone() + assert row[0] == _f32([99.0, 99.0, 99.0, 99.0]) + + +def 
test_delete_int8_vectors(db): + db.execute("create virtual table v using vec0(emb int8[4], chunk_size=8)") + + db.execute( + "insert into v(rowid, emb) values (1, vec_int8(?))", + [_int8([1, 2, 3, 4])], + ) + db.execute( + "insert into v(rowid, emb) values (2, vec_int8(?))", + [_int8([5, 6, 7, 8])], + ) + + db.execute("delete from v where rowid = 1") + + blob = db.execute( + "select vectors from v_vector_chunks00 where rowid = 1" + ).fetchone()[0] + # int8[4] = 4 bytes per slot + first_slot = struct.unpack("<4b", blob[:4]) + assert first_slot == (0, 0, 0, 0) + second_slot = struct.unpack("<4b", blob[4:8]) + assert second_slot == (5, 6, 7, 8) + + +def test_delete_bit_vectors(db): + db.execute("create virtual table v using vec0(emb bit[8], chunk_size=8)") + + db.execute( + "insert into v(rowid, emb) values (1, vec_bit(?))", + [bytes([0xFF])], + ) + db.execute( + "insert into v(rowid, emb) values (2, vec_bit(?))", + [bytes([0xAA])], + ) + + db.execute("delete from v where rowid = 1") + + blob = db.execute( + "select vectors from v_vector_chunks00 where rowid = 1" + ).fetchone()[0] + # bit[8] = 1 byte per slot + assert blob[0:1] == bytes([0x00]) + assert blob[1:2] == bytes([0xAA]) + + +def _file_db(tmp_path): + """Open a file-backed DB (required for page_count to shrink after VACUUM).""" + db = sqlite3.connect(str(tmp_path / "test.db")) + db.row_factory = sqlite3.Row + db.enable_load_extension(True) + db.load_extension("dist/vec0") + db.enable_load_extension(False) + return db + + +def test_delete_chunk_shrinks_pages(tmp_path): + """Use large vectors (float[256]) so each chunk blob spans multiple pages, + making the page_count difference measurable after VACUUM.""" + dims = 256 + db = _file_db(tmp_path) + db.execute(f"create virtual table v using vec0(emb float[{dims}], chunk_size=8)") + + for i in range(1, 25): # 3 full chunks of 8 + db.execute( + "insert into v(rowid, emb) values (?, ?)", + [i, _f32([float(i)] * dims)], + ) + db.commit() + pages_before = 
db.execute("pragma page_count").fetchone()[0] + + # Delete all rows + for i in range(1, 25): + db.execute("delete from v where rowid = ?", [i]) + db.commit() + + assert db.execute("select count(*) from v_chunks").fetchone()[0] == 0 + + db.execute("vacuum") + pages_after = db.execute("pragma page_count").fetchone()[0] + assert pages_after < pages_before, ( + f"page_count should shrink after deleting all chunks and vacuum: " + f"{pages_before} -> {pages_after}" + ) + db.close() + + +def test_delete_one_chunk_of_two_shrinks_pages(tmp_path): + """Use large vectors (float[256]) so each chunk blob spans multiple pages, + making the page_count difference measurable after VACUUM.""" + dims = 256 + db = _file_db(tmp_path) + db.execute(f"create virtual table v using vec0(emb float[{dims}], chunk_size=8)") + + for i in range(1, 17): # 2 full chunks of 8 + db.execute( + "insert into v(rowid, emb) values (?, ?)", + [i, _f32([float(i)] * dims)], + ) + db.commit() + pages_before = db.execute("pragma page_count").fetchone()[0] + + # Delete all rows from the first chunk (rows 1-8) + for i in range(1, 9): + db.execute("delete from v where rowid = ?", [i]) + db.commit() + + assert db.execute("select count(*) from v_chunks").fetchone()[0] == 1 + + db.execute("vacuum") + pages_after = db.execute("pragma page_count").fetchone()[0] + assert pages_after < pages_before, ( + f"page_count should shrink after deleting one chunk and vacuum: " + f"{pages_before} -> {pages_after}" + ) + + # Remaining rows still queryable after vacuum + for i in range(9, 17): + row = db.execute("select emb from v where rowid = ?", [i]).fetchone() + assert row[0] == _f32([float(i)] * dims) + db.close()