From 4feb19ec74b60a7308f995fbfe3dd654cf325ac4 Mon Sep 17 00:00:00 2001 From: sampathrg Date: Thu, 26 Mar 2026 19:07:18 +0530 Subject: [PATCH 1/5] rfc for filters save and load --- docs/rfcs/document_provider_serialization.md | 296 +++++++++++++++++++ 1 file changed, 296 insertions(+) create mode 100644 docs/rfcs/document_provider_serialization.md diff --git a/docs/rfcs/document_provider_serialization.md b/docs/rfcs/document_provider_serialization.md new file mode 100644 index 000000000..8433405a4 --- /dev/null +++ b/docs/rfcs/document_provider_serialization.md @@ -0,0 +1,296 @@ +# RFC: SaveWith and LoadWith for DocumentProvider + +**Status**: Draft +**Crate(s) affected**: `diskann-label-filter`, `diskann-providers` + +--- + +## Table of Contents + +1. [Summary](#summary) +2. [Background](#background) +3. [Existing Serialization Patterns](#existing-serialization-patterns) + - [SaveWith and LoadWith Traits](#savewith-and-loadwith-traits) + - [BfTreeProvider](#bftreeprovider) + - [DefaultProvider](#defaultprovider) + - [DiskProvider](#diskprovider) +4. [Proposed Design](#proposed-design) + - [Crate Dependency](#crate-dependency) + - [Attribute Store Serialization Interface](#attribute-store-serialization-interface) + - [Label File Format](#label-file-format) + - [DocumentProvider SaveWith Implementation](#documentprovider-savewith-implementation) + - [DocumentProvider LoadWith Implementation](#documentprovider-loadwith-implementation) +5. [Open Questions](#open-questions) + +--- + +## Summary + +This RFC proposes implementing the `SaveWith` and `LoadWith` serialization traits for `DocumentProvider`. `DocumentProvider` wraps an inner `DataProvider` (e.g. `DefaultProvider` or `BfTreeProvider`) and an `AttributeStore` that holds the per-vector label data. Serialization must handle both: delegating to the inner provider's own save/load and persisting the attribute store to a separate binary file. 
+ +--- + +## Background + +`DocumentProvider` (in `diskann-label-filter`) is a wrapper that combines a `DataProvider` with an `AttributeStore`. It is used when the ANN index must support filtered search: every indexed vector carries a set of typed key–value attributes (e.g. `"category" = "electronics"`, `"price" = 299.99`) and queries can include attribute-based predicates. + +```rust +pub struct DocumentProvider { + inner_provider: DP, + attribute_store: AS, +} +``` + +The concrete attribute store in use today is `RoaringAttributeStore`, which maintains: + +- An `AttributeEncoder` mapping each distinct `Attribute` (field name + typed value) to a compact `u64` ID. +- A forward index (`RoaringTreemapSetProvider`) mapping each node's internal ID to the set of its attribute IDs. +- An inverted index (`RoaringTreemapSetProvider`) mapping each attribute ID to the set of node IDs that carry it. The inverted index can be rebuilt from the forward index and so **does not need to be persisted**. + +At present, `DocumentProvider` implements `DataProvider`, `SetElement`, and `Delete` but has no serialization support. This RFC proposes filling that gap. + +--- + +## Existing Serialization Patterns + +### SaveWith and LoadWith Traits + +Defined in `diskann-providers::storage`: + +```rust +pub trait SaveWith { + type Ok: Send; + type Error: std::error::Error + Send; + + fn save_with

<P>(&self, provider: &P, auxiliary: &T) + -> impl Future<Output = Result<Self::Ok, Self::Error>> + Send + where + P: StorageWriteProvider; +} + +pub trait LoadWith<T>: Sized { + type Error: std::error::Error + Send; + + fn load_with

(provider: &P, auxiliary: &T) + -> impl Future> + Send + where + P: StorageReadProvider; +} +``` + +The generic `T` parameter carries the context needed to derive file paths. In practice this is either a file-path prefix `String`, a structured `AsyncIndexMetadata` (wrapping a prefix string), or a tuple thereof. + +### BfTreeProvider + +| Trait | Auxiliary type | Description | +|---|---|---| +| `SaveWith` | `String` | The `String` is the file-path prefix | +| `LoadWith` | `String` | Same | + +**Files written on save:** + +- `{prefix}_params.json` — JSON configuration blob (`SavedParams`): dimension, metric, `max_degree`, BfTree configuration parameters, quantization parameters (if PQ is enabled), and graph hyperparameters. +- `{prefix}_vectors.bftree` — BfTree snapshot of full-precision vector data (copy of the in-memory BfTree snapshot). +- `{prefix}_neighbors.bftree` — BfTree snapshot of the graph adjacency lists. +- `{prefix}_quant.bftree` — BfTree snapshot of quantized vectors *(PQ variant only)*. +- `{prefix}_pq_pivots.bin` — PQ pivot table and centroids *(PQ variant only)*. +- `{prefix}_deleted.bin` — Serialized delete bitmap. + +**On load:** + +1. Reads `_params.json` to reconstruct the `BfTree` configs and index parameters. +2. Opens BfTree snapshots (using `BfTree::new_from_snapshot`) for full vectors, neighbors, and quant vectors. +3. Loads the PQ pivot table from `_pq_pivots.bin` *(PQ variant only)*. +4. Loads the delete bitmap from `_deleted.bin`, or creates an empty bitmap if the file does not exist. 
+ +### DefaultProvider + +| Trait | Auxiliary type | Description | +|---|---|---| +| `SaveWith` | `(u32, AsyncIndexMetadata)` | `u32` is the start-point ID; `AsyncIndexMetadata` wraps the file-path prefix | +| `SaveWith` | `(u32, u32, DiskGraphOnly)` | Graph-only save for disk-index construction | +| `LoadWith` | `AsyncQuantLoadContext` | Prefix + frozen-point count + metric + prefetch hints | + +**Files written on save (`(u32, AsyncIndexMetadata)` variant):** + +- `{prefix}.data` — Full-precision vectors in the standard `.bin` format (via `MemoryVectorProviderAsync` / `FastMemoryVectorProviderAsync`). +- `{prefix}_build_pq_compressed.bin` / `{prefix}_sq_compressed.bin` — Quantized vectors *(if a quant store is present)*. +- `{prefix}` (raw prefix, no extension) — Graph adjacency list in `.bin` graph format (via `SimpleNeighborProviderAsync::save_direct()`). + +> **Note:** The delete store (`TableDeleteProviderAsync`) is **not** persisted; it is reconstructed empty via `LoadWith`. + +**On load (`AsyncQuantLoadContext`):** + +1. Loads full-precision vectors from `{prefix}.data`. +2. Loads quant vectors from the compressed bin file *(if present)*. +3. Loads the graph from the raw prefix file via `SimpleNeighborProviderAsync::load_direct()`. +4. Constructs an empty delete store from the point count. + +### DiskProvider + +`DiskProvider` implements `LoadWith` but **has no `SaveWith` implementation**. Disk-index creation is handled externally by `DiskIndexWriter::create_disk_layout()`, which interleaves the vector data, neighbor lists, and associated data into a sector-aligned binary file. The `DiskIndexWriter` is not integrated with the `SaveWith`/`LoadWith` trait family. 
+ +--- + +## Proposed Design + +### Attribute Store Serialization Interface + +Rather than extending the `AttributeStore` trait (which would impose serialization concerns on all implementations), we propose adding `SaveWith` and `LoadWith` directly on `RoaringAttributeStore` for the relevant auxiliary types. The attribute store is responsible for extracting whatever path information it needs from `T`. + +The `DocumentProvider` impls will then require the attribute store bound: + +```rust +impl SaveWith for DocumentProvider +where + DP: SaveWith, + AS: SaveWith, + ... +``` + +```rust +impl LoadWith for DocumentProvider +where + DP: LoadWith, + AS: LoadWith, + ... +``` + +This is deliberately narrow: concrete implementations are only required for `RoaringAttributeStore` for now, keeping the door open for future implementations. + +### Label File Format + +The label data is persisted to a single binary file at `{prefix}.labels.bin`. `RoaringAttributeStore` is responsible for deriving the path from the auxiliary type `T` passed to its `SaveWith` / `LoadWith` implementation. The format uses little-endian byte order throughout and is designed for straightforward sequential writes and reads. 
+ +```text +┌────────────────────────────────────────────────────────────────────┐ +│ Header (16 bytes) │ +│ [u64: num_attribute_entries] │ +│ [u64: forward_index_offset] (byte offset from file start to │ +│ Section 2) │ +├────────────────────────────────────────────────────────────────────┤ +│ Section 1: Attribute Dictionary │ +│ Repeated `num_attribute_entries` times: │ +│ │ +│ [u64: attribute_id] │ +│ [u32: field_name_byte_len] │ +│ [u8 * field_name_byte_len: UTF-8 field name] │ +│ [u8: type_tag] │ +│ 0 = Bool │ +│ 1 = Integer │ +│ 2 = Real │ +│ 3 = String │ +│ 4 = Empty │ +│ [value bytes, depends on type_tag]: │ +│ Bool: 1 byte (0 = false, 1 = true) │ +│ Integer: 8 bytes (i64, little-endian) │ +│ Real: 8 bytes (f64, little-endian) │ +│ String: [u32: byte_len] + [u8 * byte_len: UTF-8 value] │ +│ Empty: 0 bytes │ +├────────────────────────────────────────────────────────────────────┤ +│ Section 2: Forward Index │ +│ [u64: num_nodes_with_labels] │ +│ Repeated `num_nodes_with_labels` times: │ +│ │ +│ [u32: node_internal_id] │ +│ [u32: num_attribute_ids_for_this_node] │ +│ [u64 * num_attribute_ids: attribute IDs (sorted ascending)] │ +└────────────────────────────────────────────────────────────────────┘ +``` + +**Notes:** + +- The inverted index is **not persisted**; it is rebuilt from the forward index during `load_with`. +- `node_internal_id` is currently `u32` (matching `VectorId = u32` throughout the system). If the type is generalised in the future, this field and the file format version must be updated. +- The attribute dictionary is written first so that a reader can build the reverse mapping (`u64` ID → `Attribute`) before scanning the forward index. + +### DocumentProvider SaveWith Implementation + +```rust +impl SaveWith for DocumentProvider +where + DP: DataProvider + SaveWith, + AS: AttributeStore + SaveWith + AsyncFriendly, + ANNError: From, +{ + type Ok = (); + type Error = ANNError; + + async fn save_with

( + &self, + provider: &P, + auxiliary: &T, + ) -> Result<(), ANNError> + where + P: StorageWriteProvider, + { + // 1. Delegate to inner provider. + self.inner_provider + .save_with(provider, auxiliary) + .await?; + + // 2. Persist the attribute store. + self.attribute_store + .save_with(provider, auxiliary) + .await?; + + Ok(()) + } +} +``` + +The `RoaringAttributeStore::save_with` implementation must: + +1. Acquire read locks on `attribute_map` and `index`. +2. Write the header. +3. Iterate over all `(InternalAttribute, u64)` pairs in `AttributeEncoder` via `AttributeEncoder::for_each` (already present, currently marked `dead_code`) and write the dictionary section. +4. Iterate over all `(node_id, attribute_id_set)` pairs in the forward index and write the forward index section. + +### DocumentProvider LoadWith Implementation + +```rust +impl LoadWith for DocumentProvider +where + DP: DataProvider + LoadWith, + AS: AttributeStore + LoadWith + AsyncFriendly, + ANNError: From, +{ + type Error = ANNError; + + async fn load_with

<P>( + provider: &P, + auxiliary: &T, + ) -> Result<Self, ANNError> + where + P: StorageReadProvider, + { + // 1. Load the inner provider. + let inner_provider = DP::load_with(provider, auxiliary).await?; + + // 2. Load the attribute store. + let attribute_store = AS::load_with(provider, auxiliary).await?; + + Ok(Self { + inner_provider, + attribute_store, + }) + } +} +``` + +The `RoaringAttributeStore::load_with` implementation must: + +1. Read the header to obtain `num_attribute_entries` and `forward_index_offset`. +2. Decode the attribute dictionary, inserting each `(u64 id, Attribute)` pair into the `AttributeEncoder`. +3. Seek to `forward_index_offset` and decode the forward index, inserting into the `RoaringTreemapSetProvider`. +4. Rebuild the inverted index from the forward index (iterate node → attribute-IDs, invert to attribute-ID → nodes). + +--- + +## Open Questions + +### DiskProvider Compatibility + +`DiskProvider` uses `DiskIndexWriter::create_disk_layout()` for serialization rather than the `SaveWith` trait. There is therefore no `SaveWith` implementation through which label data could be co-written. Consequently, `DocumentProvider<DiskProvider<…>, AS>` would only support `LoadWith`, not `SaveWith`. + +For `LoadWith` to work, the label file must have been written separately (e.g. by calling `attribute_store.save_with(...)` directly during the disk-index build pipeline before creating the disk layout). `DiskIndexWriter` would need to be extended or wrapped to also write the label file at `{prefix}.labels.bin` as part of `create_disk_layout` or a new `create_disk_layout_with_labels` variant. 
From a8966c1478abefa47e560cfef3358523d0c51ab3 Mon Sep 17 00:00:00 2001 From: sampathrg Date: Thu, 26 Mar 2026 19:39:56 +0530 Subject: [PATCH 2/5] Move RFC + edit to adhere to template --- .../document_provider_serialization.md | 145 ++++++---------- temp.json | 158 ++++++++++++++++++ 2 files changed, 211 insertions(+), 92 deletions(-) rename {docs/rfcs => rfcs}/document_provider_serialization.md (61%) create mode 100644 temp.json diff --git a/docs/rfcs/document_provider_serialization.md b/rfcs/document_provider_serialization.md similarity index 61% rename from docs/rfcs/document_provider_serialization.md rename to rfcs/document_provider_serialization.md index 8433405a4..bd12a271a 100644 --- a/docs/rfcs/document_provider_serialization.md +++ b/rfcs/document_provider_serialization.md @@ -1,36 +1,19 @@ # RFC: SaveWith and LoadWith for DocumentProvider -**Status**: Draft -**Crate(s) affected**: `diskann-label-filter`, `diskann-providers` - ---- - -## Table of Contents - -1. [Summary](#summary) -2. [Background](#background) -3. [Existing Serialization Patterns](#existing-serialization-patterns) - - [SaveWith and LoadWith Traits](#savewith-and-loadwith-traits) - - [BfTreeProvider](#bftreeprovider) - - [DefaultProvider](#defaultprovider) - - [DiskProvider](#diskprovider) -4. [Proposed Design](#proposed-design) - - [Crate Dependency](#crate-dependency) - - [Attribute Store Serialization Interface](#attribute-store-serialization-interface) - - [Label File Format](#label-file-format) - - [DocumentProvider SaveWith Implementation](#documentprovider-savewith-implementation) - - [DocumentProvider LoadWith Implementation](#documentprovider-loadwith-implementation) -5. 
[Open Questions](#open-questions) - ---- +| | | +| ---------------- | ---------------- | +| **Authors** | Sampath Rajendra | +| **Contributors** | Sampath Rajendra | +| **Created** | 2026-03-26 | +| **Updated** | 2026-03-26 | ## Summary This RFC proposes implementing the `SaveWith` and `LoadWith` serialization traits for `DocumentProvider`. `DocumentProvider` wraps an inner `DataProvider` (e.g. `DefaultProvider` or `BfTreeProvider`) and an `AttributeStore` that holds the per-vector label data. Serialization must handle both: delegating to the inner provider's own save/load and persisting the attribute store to a separate binary file. ---- +## Motivation -## Background +### Background `DocumentProvider` (in `diskann-label-filter`) is a wrapper that combines a `DataProvider` with an `AttributeStore`. It is used when the ANN index must support filtered search: every indexed vector carries a set of typed key–value attributes (e.g. `"category" = "electronics"`, `"price" = 299.99`) and queries can include attribute-based predicates. @@ -47,15 +30,7 @@ The concrete attribute store in use today is `RoaringAttributeStore`, which - A forward index (`RoaringTreemapSetProvider`) mapping each node's internal ID to the set of its attribute IDs. - An inverted index (`RoaringTreemapSetProvider`) mapping each attribute ID to the set of node IDs that carry it. The inverted index can be rebuilt from the forward index and so **does not need to be persisted**. -At present, `DocumentProvider` implements `DataProvider`, `SetElement`, and `Delete` but has no serialization support. This RFC proposes filling that gap. - ---- - -## Existing Serialization Patterns - -### SaveWith and LoadWith Traits - -Defined in `diskann-providers::storage`: +The `SaveWith` and `LoadWith` traits are defined in `diskann-providers::storage`: ```rust pub trait SaveWith { @@ -78,82 +53,52 @@ pub trait LoadWith: Sized { } ``` -The generic `T` parameter carries the context needed to derive file paths. 
In practice this is either a file-path prefix `String`, a structured `AsyncIndexMetadata` (wrapping a prefix string), or a tuple thereof. - -### BfTreeProvider - -| Trait | Auxiliary type | Description | -|---|---|---| -| `SaveWith` | `String` | The `String` is the file-path prefix | -| `LoadWith` | `String` | Same | +The generic `T` parameter carries the context needed to derive file paths — in practice a file-path prefix `String`, a structured `AsyncIndexMetadata` (wrapping a prefix string), or a tuple thereof. -**Files written on save:** +Existing serialization patterns for the inner provider types: -- `{prefix}_params.json` — JSON configuration blob (`SavedParams`): dimension, metric, `max_degree`, BfTree configuration parameters, quantization parameters (if PQ is enabled), and graph hyperparameters. -- `{prefix}_vectors.bftree` — BfTree snapshot of full-precision vector data (copy of the in-memory BfTree snapshot). -- `{prefix}_neighbors.bftree` — BfTree snapshot of the graph adjacency lists. -- `{prefix}_quant.bftree` — BfTree snapshot of quantized vectors *(PQ variant only)*. -- `{prefix}_pq_pivots.bin` — PQ pivot table and centroids *(PQ variant only)*. -- `{prefix}_deleted.bin` — Serialized delete bitmap. +**`BfTreeProvider`** (`SaveWith` / `LoadWith`): writes `_params.json`, `_vectors.bftree`, `_neighbors.bftree`, `_deleted.bin`, and PQ files when applicable. -**On load:** +**`DefaultProvider`** (`SaveWith<(u32, AsyncIndexMetadata)>` / `LoadWith`): writes `{prefix}.data`, compressed vector files, and the raw graph file. The delete store is not persisted and is reconstructed empty on load. -1. Reads `_params.json` to reconstruct the `BfTree` configs and index parameters. -2. Opens BfTree snapshots (using `BfTree::new_from_snapshot`) for full vectors, neighbors, and quant vectors. -3. Loads the PQ pivot table from `_pq_pivots.bin` *(PQ variant only)*. -4. Loads the delete bitmap from `_deleted.bin`, or creates an empty bitmap if the file does not exist. 
+**`DiskProvider`** (`LoadWith` only): has no `SaveWith` implementation. Disk-index creation is handled externally by `DiskIndexWriter::create_disk_layout()`, which is not integrated with the `SaveWith`/`LoadWith` trait family. -### DefaultProvider +### Problem Statement -| Trait | Auxiliary type | Description | -|---|---|---| -| `SaveWith` | `(u32, AsyncIndexMetadata)` | `u32` is the start-point ID; `AsyncIndexMetadata` wraps the file-path prefix | -| `SaveWith` | `(u32, u32, DiskGraphOnly)` | Graph-only save for disk-index construction | -| `LoadWith` | `AsyncQuantLoadContext` | Prefix + frozen-point count + metric + prefetch hints | +`DocumentProvider` implements `DataProvider`, `SetElement`, and `Delete` but has no serialization support. An index built with `DocumentProvider` cannot be persisted and reloaded; every restart requires rebuilding the index from scratch, including re-encoding all attribute data. -**Files written on save (`(u32, AsyncIndexMetadata)` variant):** +### Goals -- `{prefix}.data` — Full-precision vectors in the standard `.bin` format (via `MemoryVectorProviderAsync` / `FastMemoryVectorProviderAsync`). -- `{prefix}_build_pq_compressed.bin` / `{prefix}_sq_compressed.bin` — Quantized vectors *(if a quant store is present)*. -- `{prefix}` (raw prefix, no extension) — Graph adjacency list in `.bin` graph format (via `SimpleNeighborProviderAsync::save_direct()`). +1. Implement `SaveWith` and `LoadWith` for `DocumentProvider`, delegating to the inner provider and the attribute store respectively. +2. Define a stable binary file format (`{prefix}.labels.bin`) for persisting `RoaringAttributeStore` label data. +3. Implement `SaveWith` and `LoadWith` directly on `RoaringAttributeStore` without widening the `AttributeStore` trait. -> **Note:** The delete store (`TableDeleteProviderAsync`) is **not** persisted; it is reconstructed empty via `LoadWith`. +## Proposal -**On load (`AsyncQuantLoadContext`):** - -1. 
Loads full-precision vectors from `{prefix}.data`. -2. Loads quant vectors from the compressed bin file *(if present)*. -3. Loads the graph from the raw prefix file via `SimpleNeighborProviderAsync::load_direct()`. -4. Constructs an empty delete store from the point count. - -### DiskProvider - -`DiskProvider` implements `LoadWith` but **has no `SaveWith` implementation**. Disk-index creation is handled externally by `DiskIndexWriter::create_disk_layout()`, which interleaves the vector data, neighbor lists, and associated data into a sector-aligned binary file. The `DiskIndexWriter` is not integrated with the `SaveWith`/`LoadWith` trait family. - ---- - -## Proposed Design +The `diskann-label-filter` crate takes a new dependency on `diskann-providers` (for the `SaveWith`/`LoadWith` trait definitions and the `StorageReadProvider`/`StorageWriteProvider` abstractions). ### Attribute Store Serialization Interface -Rather than extending the `AttributeStore` trait (which would impose serialization concerns on all implementations), we propose adding `SaveWith` and `LoadWith` directly on `RoaringAttributeStore` for the relevant auxiliary types. The attribute store is responsible for extracting whatever path information it needs from `T`. +Rather than extending the `AttributeStore` trait (which would impose serialization concerns on all implementations), `SaveWith` and `LoadWith` are added directly on `RoaringAttributeStore` for the relevant auxiliary types. The attribute store is responsible for extracting whatever path information it needs from `T`. -The `DocumentProvider` impls will then require the attribute store bound: +The `DocumentProvider` impls require the attribute store bound: ```rust impl SaveWith for DocumentProvider where - DP: SaveWith, - AS: SaveWith, - ... + DP: DataProvider + SaveWith, + AS: AttributeStore + SaveWith + AsyncFriendly, + ANNError: From, +{ ... } ``` ```rust impl LoadWith for DocumentProvider where - DP: LoadWith, - AS: LoadWith, - ... 
+ DP: DataProvider + LoadWith, + AS: AttributeStore + LoadWith + AsyncFriendly, + ANNError: From, +{ ... } ``` This is deliberately narrow: concrete implementations are only required for `RoaringAttributeStore` for now, keeping the door open for future implementations. @@ -285,12 +230,28 @@ The `RoaringAttributeStore::load_with` implementation must: 3. Seek to `forward_index_offset` and decode the forward index, inserting into the `RoaringTreemapSetProvider`. 4. Rebuild the inverted index from the forward index (iterate node → attribute-IDs, invert to attribute-ID → nodes). ---- +## Trade-offs + +### Keeping serialization off the `AttributeStore` trait + +Adding `SaveWith`/`LoadWith` directly to `RoaringAttributeStore` rather than to the `AttributeStore` trait avoids imposing a serialization requirement on every future `AttributeStore` implementation. The cost is that `DocumentProvider` can only be saved or loaded when `AS` satisfies the respective bound — callers using a hypothetical `AS` that does not implement `SaveWith` would need to handle serialization manually or outside of this trait pair. + +### DiskProvider compatibility + +`DiskProvider` has no `SaveWith` implementation; disk-index creation is handled by `DiskIndexWriter::create_disk_layout()`, which is not integrated with the `SaveWith`/`LoadWith` trait family. Consequently, `DocumentProvider, AS>` supports `LoadWith` only. + +For `LoadWith` to work, the label file (`{prefix}.labels.bin`) must have been written separately — e.g. by calling `attribute_store.save_with(...)` directly during the disk-index build pipeline before the disk layout is finalized. The alternative of integrating label-file writing into `DiskIndexWriter` is deferred to future work (see below). + +## Benchmark Results + +Not applicable for this RFC. 
-## Open Questions +## Future Work -### DiskProvider Compatibility +- [ ] Extend `DiskIndexWriter` (or introduce a `create_disk_layout_with_labels` variant) to co-write `{prefix}.labels.bin` during disk-index construction, enabling a full `SaveWith`/`LoadWith` round-trip for `DocumentProvider, AS>`. +- [ ] If `VectorId` is ever generalised beyond `u32`, update the `node_internal_id` field width and introduce a file format version field. -`DiskProvider` uses `DiskIndexWriter::create_disk_layout()` for serialization rather than the `SaveWith` trait. There is therefore no `SaveWith` implementation through which label data could be co-written. Consequently, `DocumentProvider, AS>` would only support `LoadWith`, not `SaveWith`. +## References -For `LoadWith` to work, the label file must have been written separately (e.g. by calling `attribute_store.save_with(...)` directly during the disk-index build pipeline before creating the disk layout). `DiskIndexWriter` would need to be extended or wrapped to also write the label file at `{prefix}.labels.bin` as part of `create_disk_layout` or a new `create_disk_layout_with_labels` variant. +1. [`diskann-providers` — `SaveWith` / `LoadWith` trait definitions](../diskann-providers/src/) +2. 
[`diskann-label-filter` — `DocumentProvider`, `RoaringAttributeStore`](../diskann-label-filter/src/) diff --git a/temp.json b/temp.json new file mode 100644 index 000000000..bd8bc0b56 --- /dev/null +++ b/temp.json @@ -0,0 +1,158 @@ +[ + { + "input": { + "content": { + "build": { + "alpha": 1.2000000476837158, + "data": "test_data/disk_index_search\\disk_index_siftsmall_learn_256pts_data.fbin", + "data_labels": "test_data/disk_index_search\\data.256.label.jsonl", + "data_type": "float32", + "distance": "squared_l2", + "l_build": 50, + "max_degree": 32, + "num_threads": 4 + }, + "search": { + "beta": 0.5, + "groundtruth": "test_data/disk_index_search\\disk_index_10pts_idx_uint32_truth_search_filter_res.bin", + "num_threads": [ + 1 + ], + "queries": "test_data/disk_index_search\\disk_index_sample_query_10pts.fbin", + "query_predicates": "test_data/disk_index_search\\query.10.label.jsonl", + "reps": 5, + "runs": [ + { + "recall_k": 10, + "search_l": [ + 20, + 30, + 40 + ], + "search_n": 20 + } + ] + } + }, + "type": "document-index-build" + }, + "results": { + "build_time": 94624, + "data_load_time": 3259, + "dim": 128, + "insert_latencies": { + "mean": 1170.5625, + "median": 1094.0, + "p90": 1709, + "p99": 2519 + }, + "label_count": 256, + "label_load_time": 4117, + "num_vectors": 256, + "search": [ + { + "mean_cmps": 145.5, + "mean_hops": 21.600000381469727, + "mean_latency": 793.92, + "num_queries": 10, + "num_threads": 1, + "p90_latency": 1463, + "p99_latency": 1950, + "per_query_details": [], + "qps": [ + 1488.095238095238, + 1394.700139470014, + 1643.115346697338, + 1142.204454597373, + 859.9191675982457 + ], + "recall": { + "average": 1.0, + "maximum": 10, + "minimum": 10, + "num_queries": 10, + "recall_k": 10, + "recall_n": 20 + }, + "search_l": 20, + "search_n": 20, + "wall_clock_time": [ + 6720, + 7170, + 6086, + 8755, + 11629 + ] + }, + { + "mean_cmps": 159.1999969482422, + "mean_hops": 31.5, + "mean_latency": 1549.42, + "num_queries": 10, + "num_threads": 
1, + "p90_latency": 2275, + "p99_latency": 2661, + "per_query_details": [], + "qps": [ + 463.32761895936613, + 641.5191172696947, + 489.6440287910689, + 840.6186953597847, + 1106.1946902654868 + ], + "recall": { + "average": 1.0, + "maximum": 10, + "minimum": 10, + "num_queries": 10, + "recall_k": 10, + "recall_n": 20 + }, + "search_l": 30, + "search_n": 20, + "wall_clock_time": [ + 21583, + 15588, + 20423, + 11896, + 9040 + ] + }, + { + "mean_cmps": 165.1999969482422, + "mean_hops": 41.5, + "mean_latency": 1056.92, + "num_queries": 10, + "num_threads": 1, + "p90_latency": 1364, + "p99_latency": 1517, + "per_query_details": [], + "qps": [ + 801.8603159329646, + 817.99591002045, + 967.8668215253582, + 1054.1851149061777, + 1075.7314974182443 + ], + "recall": { + "average": 1.0, + "maximum": 10, + "minimum": 10, + "num_queries": 10, + "recall_k": 10, + "recall_n": 20 + }, + "search_l": 40, + "search_n": 20, + "wall_clock_time": [ + 12471, + 12225, + 10332, + 9486, + 9296 + ] + } + ] + } + } +] \ No newline at end of file From 4c00a9b67d5705276a89c688bd1b59cc61b0deea Mon Sep 17 00:00:00 2001 From: sampathrg Date: Thu, 26 Mar 2026 21:11:21 +0530 Subject: [PATCH 3/5] Add use case based info --- rfcs/document_provider_serialization.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rfcs/document_provider_serialization.md b/rfcs/document_provider_serialization.md index bd12a271a..966097555 100644 --- a/rfcs/document_provider_serialization.md +++ b/rfcs/document_provider_serialization.md @@ -9,7 +9,7 @@ ## Summary -This RFC proposes implementing the `SaveWith` and `LoadWith` serialization traits for `DocumentProvider`. `DocumentProvider` wraps an inner `DataProvider` (e.g. `DefaultProvider` or `BfTreeProvider`) and an `AttributeStore` that holds the per-vector label data. Serialization must handle both: delegating to the inner provider's own save/load and persisting the attribute store to a separate binary file. 
+This RFC addresses the use case of saving and loading of label information along with the index information that is already saved and loaded by the `DataProvider`. The RFC proposes to do this by implementing the `SaveWith` and `LoadWith` serialization traits for `DocumentProvider`. `DocumentProvider` wraps an inner `DataProvider` (e.g. `DefaultProvider` or `BfTreeProvider`) and an `AttributeStore` that holds the per-vector label data. Serialization must handle both: delegating to the inner provider's own save/load and persisting the attribute store to a separate binary file. ## Motivation From 1bfe082c26156028367f0dfe607036506903547c Mon Sep 17 00:00:00 2001 From: sampathrg Date: Mon, 30 Mar 2026 19:02:27 +0530 Subject: [PATCH 4/5] Modify save format to account for differnt VectorId types --- rfcs/document_provider_serialization.md | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/rfcs/document_provider_serialization.md b/rfcs/document_provider_serialization.md index 966097555..4a3811c92 100644 --- a/rfcs/document_provider_serialization.md +++ b/rfcs/document_provider_serialization.md @@ -53,7 +53,7 @@ pub trait LoadWith: Sized { } ``` -The generic `T` parameter carries the context needed to derive file paths — in practice a file-path prefix `String`, a structured `AsyncIndexMetadata` (wrapping a prefix string), or a tuple thereof. +The generic `T` parameter carries the context needed to derive file paths — examples are a file-path prefix `String`, a structured `AsyncIndexMetadata` (wrapping a prefix string). Existing serialization patterns for the inner provider types: @@ -70,7 +70,7 @@ Existing serialization patterns for the inner provider types: ### Goals 1. Implement `SaveWith` and `LoadWith` for `DocumentProvider`, delegating to the inner provider and the attribute store respectively. -2. Define a stable binary file format (`{prefix}.labels.bin`) for persisting `RoaringAttributeStore` label data. +2. 
Define a binary file format (`{prefix}.labels.bin`) for persisting `RoaringAttributeStore` label data. 3. Implement `SaveWith` and `LoadWith` directly on `RoaringAttributeStore` without widening the `AttributeStore` trait. ## Proposal @@ -101,18 +101,19 @@ where { ... } ``` -This is deliberately narrow: concrete implementations are only required for `RoaringAttributeStore` for now, keeping the door open for future implementations. - ### Label File Format The label data is persisted to a single binary file at `{prefix}.labels.bin`. `RoaringAttributeStore` is responsible for deriving the path from the auxiliary type `T` passed to its `SaveWith` / `LoadWith` implementation. The format uses little-endian byte order throughout and is designed for straightforward sequential writes and reads. ```text ┌────────────────────────────────────────────────────────────────────┐ -│ Header (16 bytes) │ -│ [u64: num_attribute_entries] │ +│ Header (17 bytes) │ +│ [u64: num_attribute_entries] │ │ [u64: forward_index_offset] (byte offset from file start to │ │ Section 2) │ +│ [u8: vector_id_type_tag] │ +│ 0 = u32 │ +│ 1 = u64 │ ├────────────────────────────────────────────────────────────────────┤ │ Section 1: Attribute Dictionary │ │ Repeated `num_attribute_entries` times: │ @@ -137,7 +138,7 @@ The label data is persisted to a single binary file at `{prefix}.labels.bin`. `R │ [u64: num_nodes_with_labels] │ │ Repeated `num_nodes_with_labels` times: │ │ │ -│ [u32: node_internal_id] │ +│ [N bytes: node_internal_id (width per vector_id_type_tag)] │ │ [u32: num_attribute_ids_for_this_node] │ │ [u64 * num_attribute_ids: attribute IDs (sorted ascending)] │ └────────────────────────────────────────────────────────────────────┘ @@ -146,7 +147,6 @@ The label data is persisted to a single binary file at `{prefix}.labels.bin`. `R **Notes:** - The inverted index is **not persisted**; it is rebuilt from the forward index during `load_with`. 
-- `node_internal_id` is currently `u32` (matching `VectorId = u32` throughout the system). If the type is generalised in the future, this field and the file format version must be updated. - The attribute dictionary is written first so that a reader can build the reverse mapping (`u64` ID → `Attribute`) before scanning the forward index. ### DocumentProvider SaveWith Implementation @@ -242,14 +242,17 @@ Adding `SaveWith`/`LoadWith` directly to `RoaringAttributeStore` rather than to For `LoadWith` to work, the label file (`{prefix}.labels.bin`) must have been written separately — e.g. by calling `attribute_store.save_with(...)` directly during the disk-index build pipeline before the disk layout is finalized. The alternative of integrating label-file writing into `DiskIndexWriter` is deferred to future work (see below). +### Save format + +The save format could instead have been JSON. That would improve readability and debuggability, but at the cost of more storage space and possibly longer load times. + ## Benchmark Results -Not applicable for this RFC. +TODO ## Future Work - [ ] Extend `DiskIndexWriter` (or introduce a `create_disk_layout_with_labels` variant) to co-write `{prefix}.labels.bin` during disk-index construction, enabling a full `SaveWith`/`LoadWith` round-trip for `DocumentProvider, AS>`. -- [ ] If `VectorId` is ever generalised beyond `u32`, update the `node_internal_id` field width and introduce a file format version field. 
## References From ba15b338bf5a44a19674524cba8521471389fe61 Mon Sep 17 00:00:00 2001 From: sampathrg Date: Mon, 30 Mar 2026 19:06:17 +0530 Subject: [PATCH 5/5] delete temp.json --- temp.json | 158 ------------------------------------------------------ 1 file changed, 158 deletions(-) delete mode 100644 temp.json diff --git a/temp.json b/temp.json deleted file mode 100644 index bd8bc0b56..000000000 --- a/temp.json +++ /dev/null @@ -1,158 +0,0 @@ -[ - { - "input": { - "content": { - "build": { - "alpha": 1.2000000476837158, - "data": "test_data/disk_index_search\\disk_index_siftsmall_learn_256pts_data.fbin", - "data_labels": "test_data/disk_index_search\\data.256.label.jsonl", - "data_type": "float32", - "distance": "squared_l2", - "l_build": 50, - "max_degree": 32, - "num_threads": 4 - }, - "search": { - "beta": 0.5, - "groundtruth": "test_data/disk_index_search\\disk_index_10pts_idx_uint32_truth_search_filter_res.bin", - "num_threads": [ - 1 - ], - "queries": "test_data/disk_index_search\\disk_index_sample_query_10pts.fbin", - "query_predicates": "test_data/disk_index_search\\query.10.label.jsonl", - "reps": 5, - "runs": [ - { - "recall_k": 10, - "search_l": [ - 20, - 30, - 40 - ], - "search_n": 20 - } - ] - } - }, - "type": "document-index-build" - }, - "results": { - "build_time": 94624, - "data_load_time": 3259, - "dim": 128, - "insert_latencies": { - "mean": 1170.5625, - "median": 1094.0, - "p90": 1709, - "p99": 2519 - }, - "label_count": 256, - "label_load_time": 4117, - "num_vectors": 256, - "search": [ - { - "mean_cmps": 145.5, - "mean_hops": 21.600000381469727, - "mean_latency": 793.92, - "num_queries": 10, - "num_threads": 1, - "p90_latency": 1463, - "p99_latency": 1950, - "per_query_details": [], - "qps": [ - 1488.095238095238, - 1394.700139470014, - 1643.115346697338, - 1142.204454597373, - 859.9191675982457 - ], - "recall": { - "average": 1.0, - "maximum": 10, - "minimum": 10, - "num_queries": 10, - "recall_k": 10, - "recall_n": 20 - }, - "search_l": 
20, - "search_n": 20, - "wall_clock_time": [ - 6720, - 7170, - 6086, - 8755, - 11629 - ] - }, - { - "mean_cmps": 159.1999969482422, - "mean_hops": 31.5, - "mean_latency": 1549.42, - "num_queries": 10, - "num_threads": 1, - "p90_latency": 2275, - "p99_latency": 2661, - "per_query_details": [], - "qps": [ - 463.32761895936613, - 641.5191172696947, - 489.6440287910689, - 840.6186953597847, - 1106.1946902654868 - ], - "recall": { - "average": 1.0, - "maximum": 10, - "minimum": 10, - "num_queries": 10, - "recall_k": 10, - "recall_n": 20 - }, - "search_l": 30, - "search_n": 20, - "wall_clock_time": [ - 21583, - 15588, - 20423, - 11896, - 9040 - ] - }, - { - "mean_cmps": 165.1999969482422, - "mean_hops": 41.5, - "mean_latency": 1056.92, - "num_queries": 10, - "num_threads": 1, - "p90_latency": 1364, - "p99_latency": 1517, - "per_query_details": [], - "qps": [ - 801.8603159329646, - 817.99591002045, - 967.8668215253582, - 1054.1851149061777, - 1075.7314974182443 - ], - "recall": { - "average": 1.0, - "maximum": 10, - "minimum": 10, - "num_queries": 10, - "recall_k": 10, - "recall_n": 20 - }, - "search_l": 40, - "search_n": 20, - "wall_clock_time": [ - 12471, - 12225, - 10332, - 9486, - 9296 - ] - } - ] - } - } -] \ No newline at end of file