From e3553e7fdce4254b66ff345151e56f1885e9f5b8 Mon Sep 17 00:00:00 2001 From: "Varada Thiruvillamalai (from Dev Box)" Date: Thu, 2 Apr 2026 08:36:35 -0700 Subject: [PATCH] Extract diskann-storage crate from diskann-providers Create new Tier 2 crate diskann-storage with zero diskann-* dependencies, containing: - StorageReadProvider / StorageWriteProvider traits - FileStorageProvider (filesystem-backed) - VirtualStorageProvider (in-memory/overlay VFS, feature-gated) - DynWriteProvider / WriteProviderWrapper / WriteSeek adapters - Path utility functions for index artifact naming - DatasetDto transfer type - Generic protobuf load/save via prost diskann-providers now depends on diskann-storage and re-exports all moved types to preserve backward compatibility. The virtual_storage feature is properly wired through diskann-providers' own virtual_storage feature. Consumer crates (diskann-disk, diskann-tools, diskann-benchmark) updated to import storage types directly from diskann-storage. The virtual_storage feature is only in dev-dependencies for these crates. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Cargo.lock | 14 ++ Cargo.toml | 4 + diskann-benchmark/Cargo.toml | 1 + .../src/backend/disk_index/benchmarks.rs | 2 +- .../src/backend/disk_index/build.rs | 2 +- .../src/backend/disk_index/search.rs | 10 +- diskann-benchmark/src/inputs/disk.rs | 2 +- diskann-benchmark/src/inputs/save_and_load.rs | 2 +- diskann-benchmark/src/main.rs | 2 +- diskann-benchmark/src/utils/datafiles.rs | 5 +- diskann-disk/Cargo.toml | 2 + diskann-disk/src/build/builder/build.rs | 4 +- diskann-disk/src/build/builder/core.rs | 7 +- .../src/build/builder/inmem_builder.rs | 2 +- diskann-disk/src/build/builder/quantizer.rs | 2 +- .../checkpoint_record_manager_with_file.rs | 3 +- .../src/search/provider/disk_provider.rs | 9 +- .../search/provider/disk_vertex_provider.rs | 7 +- diskann-disk/src/storage/cached_reader.rs | 4 +- diskann-disk/src/storage/cached_writer.rs | 4 +- diskann-disk/src/storage/disk_index_reader.rs | 4 +- diskann-disk/src/storage/disk_index_writer.rs | 7 +- diskann-disk/src/storage/quant/generator.rs | 4 +- .../src/storage/quant/pq/pq_generation.rs | 7 +- .../aligned_file_reader_factory.rs | 2 +- .../storage_provider_aligned_file_reader.rs | 4 +- .../virtual_aligned_reader_factory.rs | 2 +- diskann-disk/src/utils/partition.rs | 4 +- diskann-providers/Cargo.toml | 3 +- .../benchmarks/copy_aligned_data_bench.rs | 4 - .../copy_aligned_data_bench_iai.rs | 4 - .../graph/provider/async_/bf_tree/provider.rs | 2 +- diskann-providers/src/storage/mod.rs | 30 ++- diskann-providers/src/storage/path_utility.rs | 70 ------- .../src/storage/storage_provider.rs | 123 ------------ diskann-providers/src/utils/mod.rs | 4 +- diskann-providers/src/utils/utils.rs | 15 -- diskann-storage/Cargo.toml | 32 +++ diskann-storage/src/dataset_dto.rs | 54 +++++ .../src}/file_storage_provider.rs | 88 +++++++-- diskann-storage/src/lib.rs | 49 +++++ diskann-storage/src/path_utility.rs | 151 ++++++++++++++ diskann-storage/src/proto_storage.rs | 171 ++++++++++++++++ diskann-storage/src/storage_provider.rs | 186 ++++++++++++++++++ .../src}/virtual_storage_provider.rs | 121 +++++++++--- diskann-tools/Cargo.toml | 2 + .../src/bin/gen_associated_data_from_range.rs | 2 +- diskann-tools/src/bin/generate_minmax.rs | 4 +- diskann-tools/src/bin/generate_pq.rs | 2 +- .../src/bin/random_data_generator.rs | 2 +- diskann-tools/src/bin/relative_contrast.rs | 2 +- diskann-tools/src/bin/subsample_bin.rs | 3 +- diskann-tools/src/utils/build_disk_index.rs | 4 +- diskann-tools/src/utils/build_pq.rs | 10 +- .../utils/gen_associated_data_from_range.rs | 4 +- .../utils/generate_synthetic_labels_utils.rs | 6 +- diskann-tools/src/utils/ground_truth.rs | 2 +- .../src/utils/random_data_generator.rs | 5 +- diskann-tools/src/utils/relative_contrast.rs | 4 +- diskann-tools/src/utils/search_disk_index.rs | 5 +- diskann-tools/src/utils/search_index_utils.rs | 2 +- 61 files changed, 925 insertions(+), 363 deletions(-) delete mode 100644 diskann-providers/src/storage/path_utility.rs delete mode 100644 diskann-providers/src/storage/storage_provider.rs delete mode 100644 diskann-providers/src/utils/utils.rs create mode 100644 diskann-storage/Cargo.toml create mode 100644 diskann-storage/src/dataset_dto.rs rename {diskann-providers/src/storage => diskann-storage/src}/file_storage_provider.rs (63%) create mode 100644 diskann-storage/src/lib.rs create mode 100644 diskann-storage/src/path_utility.rs create mode 100644 diskann-storage/src/proto_storage.rs create mode 100644 diskann-storage/src/storage_provider.rs rename {diskann-providers/src/storage => diskann-storage/src}/virtual_storage_provider.rs (63%) diff --git a/Cargo.lock b/Cargo.lock index 2383e9968..762eb306a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -660,6 +660,7 @@ dependencies = [ "diskann-label-filter", "diskann-providers", "diskann-quantization", + "diskann-storage", "diskann-tools", "diskann-utils", "diskann-vector", @@ -743,6 +744,7 @@ dependencies = [ "diskann-platform", "diskann-providers", "diskann-quantization", + "diskann-storage", "diskann-utils", "diskann-vector", "futures-util", @@ -844,6 +846,7 @@ dependencies = [ "diskann-linalg", "diskann-platform", "diskann-quantization", + "diskann-storage", "diskann-utils", "diskann-vector", "diskann-wide", @@ -892,6 +895,16 @@ dependencies = [ "trybuild", ] +[[package]] +name = "diskann-storage" +version = "0.50.0" +dependencies = [ + "prost", + "tempfile", + "thiserror 2.0.17", + "vfs", +] + [[package]] name = "diskann-tools" version = "0.50.0" @@ -907,6 +920,7 @@ dependencies = [ "diskann-label-filter", "diskann-providers", "diskann-quantization", + "diskann-storage", "diskann-utils", "diskann-vector", "half", diff --git a/Cargo.toml b/Cargo.toml index 4e0cda86a..7d64e8045 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,6 +12,8 @@ members = [ "diskann-platform", # Algorithm "diskann", + # Storage + "diskann-storage", # Providers "diskann-providers", "diskann-disk", @@ -56,6 +58,8 @@ diskann-quantization = { path = "diskann-quantization", default-features = false diskann-platform = { path = "diskann-platform", version = "0.50.0" } # Algorithm diskann = { path = "diskann", version = "0.50.0" } +# Storage +diskann-storage = { path = "diskann-storage", default-features = false, version = "0.50.0" } # Providers diskann-providers = { path = "diskann-providers", default-features = false, version = "0.50.0" } diskann-disk = { path = "diskann-disk", version = "0.50.0" } diff --git a/diskann-benchmark/Cargo.toml b/diskann-benchmark/Cargo.toml index bebaf4b8e..8c574e54c 100644 --- a/diskann-benchmark/Cargo.toml +++ b/diskann-benchmark/Cargo.toml @@ -15,6 +15,7 @@ bytemuck.workspace = true roaring.workspace = true clap = { workspace = true, features = ["derive"] } diskann-providers = { workspace = true } +diskann-storage = { workspace = true } diskann.workspace = true diskann-utils.workspace = true half = { workspace = true, features = ["rand_distr", "num-traits"] } diff --git a/diskann-benchmark/src/backend/disk_index/benchmarks.rs b/diskann-benchmark/src/backend/disk_index/benchmarks.rs index 71c89f846..13437d628 100644 --- a/diskann-benchmark/src/backend/disk_index/benchmarks.rs +++ b/diskann-benchmark/src/backend/disk_index/benchmarks.rs @@ -13,7 +13,7 @@ use diskann_benchmark_runner::{ utils::datatype::{DataType, Type}, Benchmark, Checkpoint, }; -use diskann_providers::storage::FileStorageProvider; +use diskann_storage::FileStorageProvider; use half::f16; use crate::{ diff --git a/diskann-benchmark/src/backend/disk_index/build.rs b/diskann-benchmark/src/backend/disk_index/build.rs index b6ebf3b83..7313fa9f6 100644 --- a/diskann-benchmark/src/backend/disk_index/build.rs +++ b/diskann-benchmark/src/backend/disk_index/build.rs @@ -18,8 +18,8 @@ use diskann_disk::{ }, storage::DiskIndexWriter, }; -use diskann_providers::storage::{StorageReadProvider, StorageWriteProvider}; use diskann_providers::{model::IndexConfiguration, utils::load_metadata_from_file}; +use diskann_storage::{StorageReadProvider, StorageWriteProvider}; use diskann_vector::distance::Metric; use opentelemetry::global; use opentelemetry::trace::Tracer; diff --git a/diskann-benchmark/src/backend/disk_index/search.rs b/diskann-benchmark/src/backend/disk_index/search.rs index 65e5804a7..c417f1de0 100644 --- a/diskann-benchmark/src/backend/disk_index/search.rs +++ b/diskann-benchmark/src/backend/disk_index/search.rs @@ -19,12 +19,10 @@ use diskann_disk::{ storage::disk_index_reader::DiskIndexReader, utils::{instrumentation::PerfLogger, statistics, AlignedFileReaderFactory, QueryStatistics}, }; -use diskann_providers::storage::StorageReadProvider; -use diskann_providers::{ - storage::{ - get_compressed_pq_file, get_disk_index_file, get_pq_pivot_file, FileStorageProvider, - }, - utils::{create_thread_pool, ParallelIteratorInPool}, +use diskann_providers::utils::{create_thread_pool, ParallelIteratorInPool}; +use diskann_storage::{ + get_compressed_pq_file, get_disk_index_file, get_pq_pivot_file, FileStorageProvider, + StorageReadProvider, }; use diskann_tools::utils::{search_index_utils, KRecallAtN}; use diskann_utils::views::Matrix; diff --git a/diskann-benchmark/src/inputs/disk.rs b/diskann-benchmark/src/inputs/disk.rs index 2951d1fe4..641accd91 100644 --- a/diskann-benchmark/src/inputs/disk.rs +++ b/diskann-benchmark/src/inputs/disk.rs @@ -11,7 +11,7 @@ use diskann_benchmark_runner::{ }; #[cfg(feature = "disk-index")] use diskann_disk::QuantizationType; -use diskann_providers::storage::{get_compressed_pq_file, get_disk_index_file, get_pq_pivot_file}; +use diskann_storage::{get_compressed_pq_file, get_disk_index_file, get_pq_pivot_file}; use serde::{Deserialize, Serialize}; use crate::{ diff --git a/diskann-benchmark/src/inputs/save_and_load.rs b/diskann-benchmark/src/inputs/save_and_load.rs index 3866ac894..d8a607284 100644 --- a/diskann-benchmark/src/inputs/save_and_load.rs +++ b/diskann-benchmark/src/inputs/save_and_load.rs @@ -5,7 +5,7 @@ use std::{io::Read, mem::size_of, num::NonZeroUsize}; use diskann::{ANNError, ANNResult}; -use diskann_providers::storage::StorageReadProvider; +use diskann_storage::StorageReadProvider; pub fn get_graph_num_frozen_points( storage_provider: &impl StorageReadProvider, diff --git a/diskann-benchmark/src/main.rs b/diskann-benchmark/src/main.rs index b3de5901e..455b5349f 100644 --- a/diskann-benchmark/src/main.rs +++ b/diskann-benchmark/src/main.rs @@ -131,7 +131,7 @@ mod tests { use super::*; use diskann_benchmark_runner::{app::Commands, output::Memory}; - use diskann_providers::storage::FileStorageProvider; + use diskann_storage::FileStorageProvider; use diskann_tools::utils::{compute_ground_truth_from_datafiles, GraphDataF32Vector}; use diskann_vector::distance::Metric; diff --git a/diskann-benchmark/src/utils/datafiles.rs b/diskann-benchmark/src/utils/datafiles.rs index 9c5057488..b945a670b 100644 --- a/diskann-benchmark/src/utils/datafiles.rs +++ b/diskann-benchmark/src/utils/datafiles.rs @@ -9,7 +9,7 @@ use anyhow::Context; use bit_set::BitSet; use diskann::utils::IntoUsize; use diskann_benchmark_runner::utils::datatype::DataType; -use diskann_providers::storage::StorageReadProvider; +use diskann_storage::StorageReadProvider; use diskann_utils::views::Matrix; use serde::{Deserialize, Serialize}; @@ -23,8 +23,7 @@ where T: Copy + bytemuck::Pod, { let data = diskann_utils::io::read_bin::( - &mut diskann_providers::storage::FileStorageProvider - .open_reader(&path.0.to_string_lossy())?, + &mut diskann_storage::FileStorageProvider.open_reader(&path.0.to_string_lossy())?, )?; Ok(data) } diff --git a/diskann-disk/Cargo.toml b/diskann-disk/Cargo.toml index c68d65769..4c8686620 100644 --- a/diskann-disk/Cargo.toml +++ b/diskann-disk/Cargo.toml @@ -21,6 +21,7 @@ targets = [ diskann = { workspace = true } diskann-utils = { workspace = true } diskann-providers = { workspace = true } +diskann-storage = { workspace = true } diskann-vector = { workspace = true } diskann-linalg = { workspace = true } diskann-quantization = { workspace = true, features = ["rayon"] } @@ -58,6 +59,7 @@ diskann-providers = { workspace = true, default-features = false, features = [ "testing", "virtual_storage", ] } +diskann-storage = { workspace = true, features = ["virtual_storage"] } diskann-utils = { workspace = true, features = ["testing"] } criterion.workspace = true iai-callgrind.workspace = true diff --git a/diskann-disk/src/build/builder/build.rs b/diskann-disk/src/build/builder/build.rs index 8eabad038..4c5cd8897 100644 --- a/diskann-disk/src/build/builder/build.rs +++ b/diskann-disk/src/build/builder/build.rs @@ -15,7 +15,6 @@ use diskann::{ utils::{async_tools, vecid_from_usize, TryIntoVectorId, VectorRepr, ONE}, ANNError, ANNErrorKind, ANNResult, }; -use diskann_providers::storage::{StorageReadProvider, StorageWriteProvider}; use diskann_providers::{ model::{ graph::{ @@ -30,6 +29,7 @@ use diskann_providers::{ MAX_MEDOID_SAMPLE_SIZE, }, }; +use diskann_storage::{StorageReadProvider, StorageWriteProvider}; use diskann_utils::io::{read_bin, write_bin}; use diskann_utils::views::MatrixView; use tokio::task::JoinSet; @@ -913,7 +913,7 @@ impl StartPoint { mod start_point_tests { use std::io::Write; - use diskann_providers::storage::VirtualStorageProvider; + use diskann_storage::VirtualStorageProvider; use diskann_utils::io::Metadata; use super::*; diff --git a/diskann-disk/src/build/builder/core.rs b/diskann-disk/src/build/builder/core.rs index c7f21b682..50f4391c8 100644 --- a/diskann-disk/src/build/builder/core.rs +++ b/diskann-disk/src/build/builder/core.rs @@ -5,7 +5,6 @@ use std::mem::{self, size_of}; use diskann::ANNResult; -use diskann_providers::storage::{StorageReadProvider, StorageWriteProvider}; use diskann_providers::{ model::{ graph::traits::GraphDataType, IndexConfiguration, GRAPH_SLACK_FACTOR, @@ -17,6 +16,7 @@ use diskann_providers::{ READ_WRITE_BLOCK_SIZE, }, }; +use diskann_storage::{StorageReadProvider, StorageWriteProvider}; use diskann_utils::io::read_bin; use rand::{seq::SliceRandom, Rng}; use tracing::info; @@ -633,15 +633,16 @@ pub(crate) mod disk_index_builder_tests { utils::{IntoUsize, VectorRepr, ONE}, ANNResult, }; - use diskann_providers::storage::VirtualStorageProvider; use diskann_providers::{ common::AlignedBoxWithSlice, - storage::{get_compressed_pq_file, get_disk_index_file, get_pq_pivot_file}, test_utils::graph_data_type_utils::{ GraphDataF32VectorU32Data, GraphDataF32VectorUnitData, }, utils::Timer, }; + use diskann_storage::{ + get_compressed_pq_file, get_disk_index_file, get_pq_pivot_file, VirtualStorageProvider, + }; use diskann_utils::test_data_root; use diskann_vector::{ distance::Metric::{self, L2}, diff --git a/diskann-disk/src/build/builder/inmem_builder.rs b/diskann-disk/src/build/builder/inmem_builder.rs index ab86b912f..18c11c8db 100644 --- a/diskann-disk/src/build/builder/inmem_builder.rs +++ b/diskann-disk/src/build/builder/inmem_builder.rs @@ -14,7 +14,6 @@ use diskann::{ utils::VectorRepr, ANNError, ANNResult, }; -use diskann_providers::storage::{DynWriteProvider, StorageReadProvider, WriteProviderWrapper}; use diskann_providers::{ index::diskann_async, model::{ @@ -31,6 +30,7 @@ use diskann_providers::{ index_storage::load_index, load_fp_index, AsyncIndexMetadata, DiskGraphOnly, SaveWith, }, }; +use diskann_storage::{DynWriteProvider, StorageReadProvider, WriteProviderWrapper}; use diskann_utils::future::{AsyncFriendly, SendFuture}; use super::quantizer::BuildQuantizer; diff --git a/diskann-disk/src/build/builder/quantizer.rs b/diskann-disk/src/build/builder/quantizer.rs index 179610f77..f208487be 100644 --- a/diskann-disk/src/build/builder/quantizer.rs +++ b/diskann-disk/src/build/builder/quantizer.rs @@ -4,7 +4,6 @@ */ //! Disk index quantizer implementation. use diskann::{ANNError, ANNResult}; -use diskann_providers::storage::{StorageReadProvider, StorageWriteProvider}; use diskann_providers::{ index::diskann_async::train_pq, model::{ @@ -18,6 +17,7 @@ use diskann_providers::{ utils::{BridgeErr, PQPathNames}, }; use diskann_quantization::scalar::train::ScalarQuantizationParameters; +use diskann_storage::{StorageReadProvider, StorageWriteProvider}; use diskann_utils::views::MatrixView; use tracing::info; diff --git a/diskann-disk/src/build/chunking/checkpoint/checkpoint_record_manager_with_file.rs b/diskann-disk/src/build/chunking/checkpoint/checkpoint_record_manager_with_file.rs index 774788d3b..ec80efdbb 100644 --- a/diskann-disk/src/build/chunking/checkpoint/checkpoint_record_manager_with_file.rs +++ b/diskann-disk/src/build/chunking/checkpoint/checkpoint_record_manager_with_file.rs @@ -9,7 +9,8 @@ use std::{ }; use diskann::{ANNError, ANNResult}; -use diskann_providers::{storage::FileStorageProvider, utils::file_exists}; +use diskann_providers::utils::file_exists; +use diskann_storage::FileStorageProvider; use super::{CheckpointManager, CheckpointRecord, Progress, WorkStage}; diff --git a/diskann-disk/src/search/provider/disk_provider.rs b/diskann-disk/src/search/provider/disk_provider.rs index e00186a49..d1a97143e 100644 --- a/diskann-disk/src/search/provider/disk_provider.rs +++ b/diskann-disk/src/search/provider/disk_provider.rs @@ -36,14 +36,15 @@ use diskann::{ }, ANNError, ANNResult, }; -use diskann_providers::storage::StorageReadProvider; use diskann_providers::{ model::{ compute_pq_distance, compute_pq_distance_for_pq_coordinates, graph::traits::GraphDataType, pq::quantizer_preprocess, PQData, PQScratch, }, - storage::{get_compressed_pq_file, get_disk_index_file, get_pq_pivot_file, LoadWith}, + storage::LoadWith, }; +use diskann_storage::StorageReadProvider; +use diskann_storage::{get_compressed_pq_file, get_disk_index_file, get_pq_pivot_file}; use diskann_vector::{distance::Metric, DistanceFunction, PreprocessedDistanceFunction}; use futures_util::future; use tokio::runtime::Runtime; @@ -1060,9 +1061,6 @@ mod disk_provider_tests { utils::IntoUsize, ANNErrorKind, }; - use diskann_providers::storage::{ - DynWriteProvider, StorageReadProvider, VirtualStorageProvider, - }; use diskann_providers::{ common::AlignedBoxWithSlice, test_utils::graph_data_type_utils::{ @@ -1070,6 +1068,7 @@ mod disk_provider_tests { }, utils::{create_thread_pool, load_aligned_bin, PQPathNames, ParallelIteratorInPool}, }; + use diskann_storage::{DynWriteProvider, StorageReadProvider, VirtualStorageProvider}; use diskann_utils::{io::read_bin, test_data_root}; use diskann_vector::distance::Metric; use rayon::prelude::{IndexedParallelIterator, IntoParallelRefIterator}; diff --git a/diskann-disk/src/search/provider/disk_vertex_provider.rs b/diskann-disk/src/search/provider/disk_vertex_provider.rs index d59fd7760..ecc77061d 100644 --- a/diskann-disk/src/search/provider/disk_vertex_provider.rs +++ b/diskann-disk/src/search/provider/disk_vertex_provider.rs @@ -264,15 +264,14 @@ mod disk_vertex_provider_tests { use std::sync::Arc; use diskann::{graph::config, utils::ONE}; - use diskann_providers::storage::{ - StorageReadProvider, StorageWriteProvider, VirtualStorageProvider, - }; use diskann_providers::{ model::{graph::traits::GraphDataType, IndexConfiguration}, - storage::get_disk_index_file, test_utils::graph_data_type_utils::GraphDataF32VectorU32Data, utils::load_metadata_from_file, }; + use diskann_storage::{ + get_disk_index_file, StorageReadProvider, StorageWriteProvider, VirtualStorageProvider, + }; use diskann_utils::test_data_root; use vfs::OverlayFS; diff --git a/diskann-disk/src/storage/cached_reader.rs b/diskann-disk/src/storage/cached_reader.rs index 3326d7607..1a8e89587 100644 --- a/diskann-disk/src/storage/cached_reader.rs +++ b/diskann-disk/src/storage/cached_reader.rs @@ -5,7 +5,7 @@ use std::io::{Read, Seek}; use diskann::{ANNError, ANNResult}; -use diskann_providers::storage::StorageReadProvider; +use diskann_storage::StorageReadProvider; use tracing::info; /// Sequential cached reads with a generic storage provider with read access. @@ -114,7 +114,7 @@ where #[cfg(test)] mod cached_reader_test { - use diskann_providers::storage::{StorageWriteProvider, VirtualStorageProvider}; + use diskann_storage::{StorageWriteProvider, VirtualStorageProvider}; use vfs::MemoryFS; use super::*; diff --git a/diskann-disk/src/storage/cached_writer.rs b/diskann-disk/src/storage/cached_writer.rs index 50d28a766..4f71ee8bb 100644 --- a/diskann-disk/src/storage/cached_writer.rs +++ b/diskann-disk/src/storage/cached_writer.rs @@ -4,7 +4,7 @@ */ use std::io::{Seek, SeekFrom, Write}; -use diskann_providers::storage::StorageWriteProvider; +use diskann_storage::StorageWriteProvider; use tracing::info; /// Sequential cached writes with a generic storage provider with write access. @@ -114,7 +114,7 @@ where #[cfg(test)] mod cached_writer_test { - use diskann_providers::storage::VirtualStorageProvider; + use diskann_storage::VirtualStorageProvider; use vfs::OverlayFS; use super::*; diff --git a/diskann-disk/src/storage/disk_index_reader.rs b/diskann-disk/src/storage/disk_index_reader.rs index 319207a2c..7d1045a4c 100644 --- a/diskann-disk/src/storage/disk_index_reader.rs +++ b/diskann-disk/src/storage/disk_index_reader.rs @@ -5,8 +5,8 @@ use std::{marker::PhantomData, sync::Arc}; use diskann::ANNResult; -use diskann_providers::storage::StorageReadProvider; use diskann_providers::{model::pq::PQData, storage::PQStorage, utils::load_metadata_from_file}; +use diskann_storage::StorageReadProvider; use tracing::info; /// This struct is used by the DiskIndexSearcher to read the index data from storage. Noted that the index data here is different from index graph, @@ -69,7 +69,7 @@ impl DiskIndexReader { #[cfg(test)] mod disk_index_storage_test { use diskann::ANNErrorKind; - use diskann_providers::storage::VirtualStorageProvider; + use diskann_storage::VirtualStorageProvider; use diskann_utils::test_data_root; use vfs::OverlayFS; diff --git a/diskann-disk/src/storage/disk_index_writer.rs b/diskann-disk/src/storage/disk_index_writer.rs index e494eb866..a2c9c3b88 100644 --- a/diskann-disk/src/storage/disk_index_writer.rs +++ b/diskann-disk/src/storage/disk_index_writer.rs @@ -8,12 +8,13 @@ use std::{ use byteorder::{ByteOrder, LittleEndian, ReadBytesExt}; use diskann::{ANNError, ANNResult}; -use diskann_providers::storage::{StorageReadProvider, StorageWriteProvider}; use diskann_providers::{ model::graph::traits::GraphDataType, - storage::{get_mem_index_file, path_utility::*}, utils::{save_bytes, READ_WRITE_BLOCK_SIZE}, }; +use diskann_storage::{ + get_mem_index_file, path_utility::*, StorageReadProvider, StorageWriteProvider, +}; use tracing::info; use crate::{ @@ -597,8 +598,8 @@ impl DiskIndexWriter { #[cfg(test)] mod disk_index_storage_test { - use diskann_providers::storage::VirtualStorageProvider; use diskann_providers::test_utils::graph_data_type_utils::GraphDataF32VectorU32Data; + use diskann_storage::VirtualStorageProvider; use diskann_utils::test_data_root; use vfs::OverlayFS; diff --git a/diskann-disk/src/storage/quant/generator.rs b/diskann-disk/src/storage/quant/generator.rs index 8e5006d6f..b2ca75b17 100644 --- a/diskann-disk/src/storage/quant/generator.rs +++ b/diskann-disk/src/storage/quant/generator.rs @@ -9,11 +9,11 @@ use std::{ }; use diskann::{error::IntoANNResult, utils::VectorRepr, ANNError, ANNResult}; -use diskann_providers::storage::{StorageReadProvider, StorageWriteProvider}; use diskann_providers::{ forward_threadpool, utils::{load_metadata_from_file, AsThreadPool, BridgeErr, ParallelIteratorInPool, Timer}, }; +use diskann_storage::{StorageReadProvider, StorageWriteProvider}; use diskann_utils::{io::Metadata, views}; use rayon::iter::IndexedParallelIterator; use tracing::info; @@ -275,8 +275,8 @@ mod generator_tests { }; use diskann::utils::read_exact_into; - use diskann_providers::storage::VirtualStorageProvider; use diskann_providers::utils::{create_thread_pool_for_test, save_bytes}; + use diskann_storage::VirtualStorageProvider; use diskann_utils::{ io::{write_bin, Metadata}, views::MatrixView, diff --git a/diskann-disk/src/storage/quant/pq/pq_generation.rs b/diskann-disk/src/storage/quant/pq/pq_generation.rs index 32c97becc..d89c710e5 100644 --- a/diskann-disk/src/storage/quant/pq/pq_generation.rs +++ b/diskann-disk/src/storage/quant/pq/pq_generation.rs @@ -6,7 +6,6 @@ use std::marker::PhantomData; use diskann::{utils::VectorRepr, ANNError}; -use diskann_providers::storage::{StorageReadProvider, StorageWriteProvider}; use diskann_providers::{ forward_threadpool, model::{ @@ -17,6 +16,7 @@ use diskann_providers::{ utils::{AsThreadPool, BridgeErr, Timer}, }; use diskann_quantization::{product::TransposedTable, CompressInto}; +use diskann_storage::{StorageReadProvider, StorageWriteProvider}; use diskann_utils::views::MatrixBase; use diskann_vector::distance::Metric; use tracing::info; @@ -186,10 +186,9 @@ mod pq_generation_tests { use diskann::ANNError; use diskann_providers::model::pq::generate_pq_pivots; use diskann_providers::model::GeneratePivotArguments; - use diskann_providers::storage::{ - PQStorage, StorageReadProvider, StorageWriteProvider, VirtualStorageProvider, - }; + use diskann_providers::storage::PQStorage; use diskann_providers::utils::{create_thread_pool_for_test, AsThreadPool}; + use diskann_storage::{StorageReadProvider, StorageWriteProvider, VirtualStorageProvider}; use diskann_utils::{ io::{read_bin, write_bin}, test_data_root, diff --git a/diskann-disk/src/utils/aligned_file_reader/aligned_file_reader_factory.rs b/diskann-disk/src/utils/aligned_file_reader/aligned_file_reader_factory.rs index 3674765df..5f1bdb84b 100644 --- a/diskann-disk/src/utils/aligned_file_reader/aligned_file_reader_factory.rs +++ b/diskann-disk/src/utils/aligned_file_reader/aligned_file_reader_factory.rs @@ -14,7 +14,7 @@ use super::WindowsAlignedFileReader; use crate::utils::aligned_file_reader::traits::AlignedReaderFactory; #[cfg(any(miri, target_os = "macos"))] -use diskann_providers::storage::FileStorageProvider; +use diskann_storage::FileStorageProvider; pub struct AlignedFileReaderFactory { pub file_path: String, diff --git a/diskann-disk/src/utils/aligned_file_reader/storage_provider_aligned_file_reader.rs b/diskann-disk/src/utils/aligned_file_reader/storage_provider_aligned_file_reader.rs index 2e0b88253..a1d563fab 100644 --- a/diskann-disk/src/utils/aligned_file_reader/storage_provider_aligned_file_reader.rs +++ b/diskann-disk/src/utils/aligned_file_reader/storage_provider_aligned_file_reader.rs @@ -6,7 +6,7 @@ use std::io::Read; use diskann::ANNResult; -use diskann_providers::storage::StorageReadProvider; +use diskann_storage::StorageReadProvider; use tracing::info; use super::traits::AlignedFileReader; @@ -50,7 +50,7 @@ impl AlignedFileReader for StorageProviderAlignedFileReader { mod tests { use std::io::{Seek, SeekFrom}; - use diskann_providers::storage::VirtualStorageProvider; + use diskann_storage::VirtualStorageProvider; use diskann_utils::test_data_root; use super::*; diff --git a/diskann-disk/src/utils/aligned_file_reader/virtual_aligned_reader_factory.rs b/diskann-disk/src/utils/aligned_file_reader/virtual_aligned_reader_factory.rs index 53c049348..1444089dc 100644 --- a/diskann-disk/src/utils/aligned_file_reader/virtual_aligned_reader_factory.rs +++ b/diskann-disk/src/utils/aligned_file_reader/virtual_aligned_reader_factory.rs @@ -6,7 +6,7 @@ use std::sync::Arc; use diskann::ANNResult; -use diskann_providers::storage::VirtualStorageProvider; +use diskann_storage::VirtualStorageProvider; use vfs::{FileSystem, MemoryFS}; use super::{traits::AlignedReaderFactory, StorageProviderAlignedFileReader}; diff --git a/diskann-disk/src/utils/partition.rs b/diskann-disk/src/utils/partition.rs index 67a88ef54..0fa84b801 100644 --- a/diskann-disk/src/utils/partition.rs +++ b/diskann-disk/src/utils/partition.rs @@ -3,7 +3,6 @@ * Licensed under the MIT license. */ use diskann::{error::IntoANNResult, utils::VectorRepr, ANNError, ANNResult}; -use diskann_providers::storage::{StorageReadProvider, StorageWriteProvider}; use diskann_providers::{ forward_threadpool, utils::{ @@ -11,6 +10,7 @@ use diskann_providers::{ AsThreadPool, RayonThreadPool, READ_WRITE_BLOCK_SIZE, }, }; +use diskann_storage::{StorageReadProvider, StorageWriteProvider}; use rand::Rng; use tracing::info; @@ -425,8 +425,8 @@ fn estimate_cluster_sizes( mod partition_test { use std::io::Read; - use diskann_providers::storage::VirtualStorageProvider; use diskann_providers::utils::create_thread_pool_for_test; + use diskann_storage::VirtualStorageProvider; use diskann_utils::test_data_root; use vfs::{MemoryFS, OverlayFS}; diff --git a/diskann-providers/Cargo.toml b/diskann-providers/Cargo.toml index 65a7b0e0b..661cfaee1 100644 --- a/diskann-providers/Cargo.toml +++ b/diskann-providers/Cargo.toml @@ -33,6 +33,7 @@ tracing.workspace = true diskann-linalg = { workspace = true } diskann = { workspace = true } diskann-utils = { workspace = true } +diskann-storage = { workspace = true } diskann-quantization = { workspace = true, features = ["rayon"] } tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tempfile = { workspace = true, optional = true } @@ -81,7 +82,7 @@ perf_test = ["dep:opentelemetry"] testing = ["dep:tempfile"] bf_tree = ["dep:bf-tree", "dep:polonius-the-crab", "dep:serde_json"] experimental_diversity_search = ["diskann/experimental_diversity_search"] -virtual_storage = ["dep:vfs"] +virtual_storage = ["dep:vfs", "diskann-storage/virtual_storage"] # Some 'cfg's in the source tree will be flagged by `cargo clippy -j 2 --workspace --no-deps --all-targets -- -D warnings` [lints.rust] diff --git a/diskann-providers/benches/benchmarks/copy_aligned_data_bench.rs b/diskann-providers/benches/benchmarks/copy_aligned_data_bench.rs index e8ffae784..d10b959ab 100644 --- a/diskann-providers/benches/benchmarks/copy_aligned_data_bench.rs +++ b/diskann-providers/benches/benchmarks/copy_aligned_data_bench.rs @@ -21,10 +21,6 @@ pub const BENCHMARK_ID: &str = "copy_aligned_data"; pub fn benchmark_copy_aligned_data(c: &mut Criterion) { let tmp_dir = TempDir::with_prefix(BENCHMARK_ID).expect("Failed to create temporary directory"); - #[expect( - clippy::disallowed_methods, - reason = "Use physical file system rather than memory for testing the actual disk read/write" - )] let storage_provider = VirtualStorageProvider::new_physical(tmp_dir.path()); let num_points = 1_000_000; diff --git a/diskann-providers/benches/benchmarks_iai/copy_aligned_data_bench_iai.rs b/diskann-providers/benches/benchmarks_iai/copy_aligned_data_bench_iai.rs index 4d85f2b4b..68108e239 100644 --- a/diskann-providers/benches/benchmarks_iai/copy_aligned_data_bench_iai.rs +++ b/diskann-providers/benches/benchmarks_iai/copy_aligned_data_bench_iai.rs @@ -27,10 +27,6 @@ iai_callgrind::library_benchmark_group!( #[iai_callgrind::library_benchmark] pub fn benchmark_copy_aligned_data_iai() { let tmp_dir = TempDir::with_prefix(BENCHMARK_ID).expect("Failed to create temporary directory"); - #[expect( - clippy::disallowed_methods, - reason = "Use physical file system rather than memory for testing the actual disk read/write" - )] let storage_provider = VirtualStorageProvider::new_physical(tmp_dir.path()); let num_points = 1_000_000; diff --git a/diskann-providers/src/model/graph/provider/async_/bf_tree/provider.rs b/diskann-providers/src/model/graph/provider/async_/bf_tree/provider.rs index 907f43770..1bd2b5da4 100644 --- a/diskann-providers/src/model/graph/provider/async_/bf_tree/provider.rs +++ b/diskann-providers/src/model/graph/provider/async_/bf_tree/provider.rs @@ -2227,7 +2227,7 @@ where mod tests { use super::*; use crate::model::graph::provider::async_::common::TableBasedDeletes; - use crate::storage::file_storage_provider::FileStorageProvider; + use crate::storage::FileStorageProvider; #[tokio::test] async fn test_data_provider_and_delete_interface() { diff --git a/diskann-providers/src/storage/mod.rs b/diskann-providers/src/storage/mod.rs index 1233b11f6..43284f0c5 100644 --- a/diskann-providers/src/storage/mod.rs +++ b/diskann-providers/src/storage/mod.rs @@ -3,26 +3,27 @@ * Licensed under the MIT license. */ -mod storage_provider; -pub use storage_provider::{ - DynWriteProvider, StorageReadProvider, StorageWriteProvider, WriteProviderWrapper, WriteSeek, +// Core storage traits and implementations — re-exported from diskann-storage. +pub use diskann_storage::{ + DynWriteProvider, FileStorageProvider, StorageReadProvider, StorageWriteProvider, + WriteProviderWrapper, WriteSeek, }; #[cfg(any(test, feature = "virtual_storage"))] -mod virtual_storage_provider; -#[cfg(any(test, feature = "virtual_storage"))] -pub use virtual_storage_provider::VirtualStorageProvider; +pub use diskann_storage::VirtualStorageProvider; + +pub use diskann_storage::path_utility; +pub use diskann_storage::path_utility::{ + get_compressed_pq_file, get_disk_index_compressed_pq_file, get_disk_index_file, + get_disk_index_pq_pivot_file, get_label_file, get_label_medoids_file, get_mem_index_data_file, + get_mem_index_file, get_pq_pivot_file, get_universal_label_file, +}; mod api; pub use api::{AsyncIndexMetadata, AsyncQuantLoadContext, DiskGraphOnly, LoadWith, SaveWith}; pub(crate) mod bin; -pub(crate) mod file_storage_provider; -// Use VirtualStorageProvider in tests to avoid filesystem side-effects -#[cfg(not(test))] -pub use file_storage_provider::FileStorageProvider; - mod pq_storage; pub use pq_storage::PQStorage; @@ -31,13 +32,6 @@ pub use sq_storage::SQStorage; pub mod protos; -pub mod path_utility; -pub use path_utility::{ - get_compressed_pq_file, get_disk_index_compressed_pq_file, get_disk_index_file, - get_disk_index_pq_pivot_file, get_label_file, get_label_medoids_file, get_mem_index_data_file, - get_mem_index_file, get_pq_pivot_file, get_universal_label_file, -}; - pub mod index_storage; pub use index_storage::{ create_load_context, load_fp_index, load_index_with_deletes, load_pq_index, diff --git a/diskann-providers/src/storage/path_utility.rs b/diskann-providers/src/storage/path_utility.rs deleted file mode 100644 index 10f7b047f..000000000 --- a/diskann-providers/src/storage/path_utility.rs +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. - * Licensed under the MIT license. - */ - -pub fn get_mem_index_file(index_path_prefix: &str) -> String { - index_path_prefix.to_string() + "_mem.index" -} - -pub fn get_mem_index_data_file(mem_index_path: &str) -> String { - format!("{}.data", mem_index_path) -} - -pub fn get_disk_index_file(index_path_prefix: &str) -> String { - index_path_prefix.to_string() + "_disk.index" -} - -pub fn get_pq_pivot_file(index_path_prefix: &str) -> String { - index_path_prefix.to_string() + "_pq_pivots.bin" -} - -pub fn get_compressed_pq_file(index_path_prefix: &str) -> String { - index_path_prefix.to_string() + "_pq_compressed.bin" -} - -pub fn get_disk_index_pq_pivot_file(index_path_prefix: &str) -> String { - index_path_prefix.to_string() + "_disk.index_pq_pivots.bin" -} - -pub fn get_disk_index_compressed_pq_file(index_path_prefix: &str) -> String { - index_path_prefix.to_string() + "_disk.index_pq_compressed.bin" -} - -pub fn get_label_file(index_path_prefix: &str) -> String { - index_path_prefix.to_string() + "_labels.txt" -} - -pub fn get_label_medoids_file(index_path_prefix: &str) -> String { - index_path_prefix.to_string() + "_labels_to_medoids.txt" -} - -pub fn get_universal_label_file(index_path_prefix: &str) -> String { - index_path_prefix.to_string() + "_universal_label.txt" -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_get_label_file() { - let prefix = "test_prefix"; - let result = get_label_file(prefix); - assert_eq!(result, "test_prefix_labels.txt"); - } - - #[test] - fn test_get_label_medoids_file() { - let prefix = "test_prefix"; - let result = get_label_medoids_file(prefix); - assert_eq!(result, "test_prefix_labels_to_medoids.txt"); - } - - #[test] - fn test_get_universal_label_file() { - let prefix = "test_prefix"; - let result = get_universal_label_file(prefix); - assert_eq!(result, "test_prefix_universal_label.txt"); - } -} diff --git a/diskann-providers/src/storage/storage_provider.rs b/diskann-providers/src/storage/storage_provider.rs deleted file mode 100644 index 1bb6d0cc0..000000000 --- a/diskann-providers/src/storage/storage_provider.rs +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. - * Licensed under the MIT license. - */ -use std::io::{Read, Result, Seek, Write}; - -/// This module provides traits and implementations to access a storage system, it could be -/// a file system or an exchange store. -/// -/// The `StorageReadProvider` trait is an abstraction for accessing a storage system in a read only manner. It could be -/// implemented with a file system or an exchange store. It provides methods to open a storage -/// for reading, get the length of the storage and check if a storage with a given -/// identifier exists. -/// -/// The method is defined in a way that it can work with both file system and BigStorageShim. -/// -/// Implementations of this trait will be used by DiskIndexReader to access the storage system. -/// -/// The internal Reader associated type of the `StorageReadProvider` trait is defined by the `StorageReader` trait. -pub trait StorageReadProvider: Sync { - type Reader: Read + Seek; - - /// Open a storage with the given identifier for read. - fn open_reader(&self, item_identifier: &str) -> Result; - - /// Get the length of the storage with the given identifier. - fn get_length(&self, item_identifier: &str) -> Result; - - /// Check if the storage with the given identifier exists. - fn exists(&self, item_identifier: &str) -> bool; -} - -/// `StorageWriteProvider` is a trait that abstracts over the ability to write to a storage. Since the ANN algorithm only writes into file system, -/// currently we only have one implementation for this trait based on file system. -pub trait StorageWriteProvider: Sync { - type Writer: WriteSeek; - - /// Open a storage with the given identifier for write. - fn open_writer(&self, item_identifier: &str) -> Result; - - /// Create a storage with the given identifier for write. - fn create_for_write(&self, item_identifier: &str) -> Result; - - // Deletes a storage item with the given identifier. - fn delete(&self, item_identifier: &str) -> Result<()>; -} - -/// Trait alias for types that implement both `Write` and `Seek`. -/// -/// Use this when an API needs a writer that can also move the cursor. -/// Implemented for any type that implements `Write` and `Seek`. -pub trait WriteSeek: Write + Seek {} -impl WriteSeek for T where T: Write + Seek {} - -/// Object safe interface for opening and creating writers without exposing a concrete provider type. -/// -/// This is useful when passing a writer provider through trait objects or other dynamic -/// boundaries. Methods return boxed writers so callers can use a single uniform interface. -pub trait DynWriteProvider: Sync { - /// Open an existing item for writing. - /// - /// Returns a boxed writer positioned by the provider. Fails if the item does not exist. - fn open_writer(&self, item_identifier: &str) -> std::io::Result>; - - /// Create a new item for writing. - /// - /// Returns a boxed writer for a new item. Behavior if the item already exists depends on the provider. - fn create_for_write(&self, item_identifier: &str) -> std::io::Result>; - - /// Delete an item identified by `item_identifier`. - fn delete(&self, item_identifier: &str) -> std::io::Result<()>; -} - -impl DynWriteProvider for T -where - T: StorageWriteProvider, -{ - fn open_writer(&self, item_identifier: &str) -> std::io::Result> { - self.open_writer(item_identifier) - .map(|w| Box::new(w) as Box) - } - - fn create_for_write(&self, item_identifier: &str) -> std::io::Result> { - self.create_for_write(item_identifier) - .map(|w| Box::new(w) as Box) - } - - fn delete(&self, item_identifier: &str) -> std::io::Result<()> { - self.delete(item_identifier) - } -} - -/// Adapter that exposes a `&dyn DynWriteProvider` as a `StorageWriteProvider`. -/// -/// Useful when an API is generic over `StorageWriteProvider` but the caller only has a dynamic -/// provider. The wrapper forwards all calls to the inner provider and returns boxed writers -/// tied to the wrapper lifetime `'a`. -pub struct WriteProviderWrapper<'a> { - inner: &'a dyn DynWriteProvider, -} - -impl<'a> WriteProviderWrapper<'a> { - /// Construct a new wrapper around the given dynamic provider reference. - pub const fn new(inner: &'a dyn DynWriteProvider) -> Self { - Self { inner } - } -} - -impl<'a> StorageWriteProvider for WriteProviderWrapper<'a> { - type Writer = Box; - - fn open_writer(&self, item_identifier: &str) -> std::io::Result { - self.inner.open_writer(item_identifier) - } - - fn create_for_write(&self, item_identifier: &str) -> std::io::Result { - self.inner.create_for_write(item_identifier) - } - - fn delete(&self, item_identifier: &str) -> std::io::Result<()> { - self.inner.delete(item_identifier) - } -} diff --git a/diskann-providers/src/utils/mod.rs b/diskann-providers/src/utils/mod.rs index 87ff9df0b..36a35f085 100644 --- a/diskann-providers/src/utils/mod.rs +++ b/diskann-providers/src/utils/mod.rs @@ -14,9 +14,7 @@ pub use normalizing_util::{ normalize_data_file, normalize_data_internal, normalize_data_internal_no_cblas, }; -#[allow(clippy::module_inception)] -mod utils; -pub use utils::DatasetDto; +pub use diskann_storage::DatasetDto; mod bridge_error; pub use bridge_error::{Bridge, BridgeErr}; diff --git a/diskann-providers/src/utils/utils.rs b/diskann-providers/src/utils/utils.rs deleted file mode 100644 index 93aee9259..000000000 --- a/diskann-providers/src/utils/utils.rs +++ /dev/null @@ -1,15 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. - * Licensed under the MIT license. - */ - -/// Dataset dto used for other layer, such as storage -/// N is the aligned dimension -#[derive(Debug)] -pub struct DatasetDto<'a, T> { - /// data slice borrow from dataset - pub data: &'a mut [T], - - /// rounded dimension - pub rounded_dim: usize, -} diff --git a/diskann-storage/Cargo.toml b/diskann-storage/Cargo.toml new file mode 100644 index 000000000..82e5e8971 --- /dev/null +++ b/diskann-storage/Cargo.toml @@ -0,0 +1,32 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. +[package] +name = "diskann-storage" +version.workspace = true +authors.workspace = true +description.workspace = true +documentation.workspace = true +license.workspace = true +edition = "2024" + +[dependencies] +prost = "0.14.1" +thiserror.workspace = true +vfs = { workspace = true, optional = true } + +[dev-dependencies] +tempfile.workspace = true +vfs.workspace = true + +[features] +default = [] +virtual_storage = ["dep:vfs"] + +[package.metadata.docs.rs] +default-target = "x86_64-pc-windows-msvc" +targets = [ + "x86_64-unknown-linux-gnu", + "aarch64-pc-windows-msvc", + "i686-pc-windows-msvc", + "x86_64-pc-windows-msvc", +] diff --git a/diskann-storage/src/dataset_dto.rs b/diskann-storage/src/dataset_dto.rs new file mode 100644 index 000000000..faa9e75ef --- /dev/null +++ b/diskann-storage/src/dataset_dto.rs @@ -0,0 +1,54 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +//! Lightweight data-transfer object for aligned vector datasets. +//! +//! [`DatasetDto`] is a view type that pairs a mutable slice of vector data with +//! the rounded (padded) dimension used for aligned storage layouts. + +/// Dataset DTO used to pass aligned vector data between layers. +/// +/// `T` is the element type (e.g. `f32`, `u8`), and `rounded_dim` records the +/// padded dimension that may be larger than the original vector dimension due +/// to alignment requirements. +#[derive(Debug)] +pub struct DatasetDto<'a, T> { + /// Mutable borrow of the underlying data slice. + pub data: &'a mut [T], + + /// The rounded (padded) dimension of each vector in `data`. + pub rounded_dim: usize, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn basic_construction() { + let mut data = vec![1.0f32, 2.0, 3.0, 0.0]; // dim=3, rounded_dim=4 + let dto = DatasetDto { + data: &mut data, + rounded_dim: 4, + }; + assert_eq!(dto.rounded_dim, 4); + assert_eq!(dto.data.len(), 4); + } + + #[test] + fn mutation_through_dto() { + let mut data = vec![0u8; 8]; + { + let dto = DatasetDto { + data: &mut data, + rounded_dim: 4, + }; + dto.data[0] = 42; + dto.data[7] = 255; + } + assert_eq!(data[0], 42); + assert_eq!(data[7], 255); + } +} diff --git a/diskann-providers/src/storage/file_storage_provider.rs b/diskann-storage/src/file_storage_provider.rs similarity index 63% rename from diskann-providers/src/storage/file_storage_provider.rs rename to diskann-storage/src/file_storage_provider.rs index 9d4d351a4..fc0a1c68b 100644 --- a/diskann-providers/src/storage/file_storage_provider.rs +++ b/diskann-storage/src/file_storage_provider.rs @@ -2,14 +2,23 @@ * Copyright (c) Microsoft Corporation. * Licensed under the MIT license. */ + +//! Filesystem-backed storage provider. +//! +//! [`FileStorageProvider`] implements both [`StorageReadProvider`] and +//! [`StorageWriteProvider`] using the local filesystem via [`std::fs`]. + use std::{ fs::{self, File, OpenOptions}, io::{BufReader, BufWriter, Result}, }; -use super::{StorageReadProvider, StorageWriteProvider}; +use crate::{StorageReadProvider, StorageWriteProvider}; -/// FileStorage implements both StorageReadProvider and StorageWriteProvider. +/// Storage provider backed by the local filesystem. +/// +/// Each `item_identifier` is interpreted as a filesystem path. Readers are +/// buffered via [`BufReader`] and writers via [`BufWriter`]. #[derive(Default)] pub struct FileStorageProvider; @@ -62,8 +71,7 @@ mod tests { use super::*; #[test] - fn test_file_reader() { - // Use TempDir for automatic deleting when going out of scope + fn read_after_write() { let tmp_dir = TempDir::with_prefix("test_file_reader").expect("Failed to create temporary directory"); let file_path = tmp_dir.path().join("test_file_reader.txt"); @@ -76,19 +84,18 @@ mod tests { let mut buffer = [0; 5]; reader.seek(SeekFrom::Start(0)).unwrap(); - reader.read(&mut buffer).unwrap(); + reader.read_exact(&mut buffer).unwrap(); assert_eq!(&buffer, b"Hello"); reader.seek(SeekFrom::Start(5)).unwrap(); - reader.read(&mut buffer).unwrap(); + reader.read_exact(&mut buffer).unwrap(); assert_eq!(&buffer, b", wor"); } #[test] - fn test_file_create_write() { + fn create_write_and_append() { let storage_provider = FileStorageProvider; - // Use TempDir for automatic deleting when going out of scope let tmp_dir = TempDir::with_prefix("test_file_create_write") .expect("Failed to create temporary directory"); let file_path = tmp_dir.path().join("test_file_create_write.txt"); @@ -126,8 +133,7 @@ mod tests { } #[test] - fn test_file_storage_exists() { - // Use TempDir for automatic deleting when going out of scope + fn exists_check() { let tmp_dir = TempDir::with_prefix("test_file_storage_exists") .expect("Failed to create temporary directory"); let file_path = tmp_dir.path().join("test_file_storage_exists.txt"); @@ -139,8 +145,7 @@ mod tests { } #[test] - fn test_file_storage_get_length() { - // Use TempDir for automatic deleting when going out of scope + fn get_length() { let tmp_dir = TempDir::with_prefix("test_file_storage_get_length") .expect("Failed to create temporary directory"); let file_path = tmp_dir.path().join("test_file_storage_get_length.txt"); @@ -150,6 +155,63 @@ mod tests { file.write_all(b"Hello, world!").unwrap(); assert_eq!(FileStorageProvider.get_length(file_name).unwrap(), 13); - fs::remove_file(file_name).unwrap(); + } + + #[test] + fn delete_removes_file() { + let tmp_dir = TempDir::with_prefix("test_file_storage_delete") + .expect("Failed to create temporary directory"); + let file_path = tmp_dir.path().join("to_delete.txt"); + let file_name = file_path.to_str().unwrap(); + + File::create(file_name).unwrap(); + assert!(FileStorageProvider.exists(file_name)); + + FileStorageProvider.delete(file_name).unwrap(); + assert!(!FileStorageProvider.exists(file_name)); + } + + #[test] + fn create_for_write_truncates_existing() { + let tmp_dir = + TempDir::with_prefix("test_truncate").expect("Failed to create temporary directory"); + let file_path = tmp_dir.path().join("truncate.txt"); + let file_name = file_path.to_str().unwrap(); + + // Write long content. + { + let mut w = FileStorageProvider.create_for_write(file_name).unwrap(); + w.write_all(b"long initial content that should be truncated") + .unwrap(); + w.flush().unwrap(); + } + + // Overwrite with short content — must truncate. + { + let mut w = FileStorageProvider.create_for_write(file_name).unwrap(); + w.write_all(b"short").unwrap(); + w.flush().unwrap(); + } + + let len = FileStorageProvider.get_length(file_name).unwrap(); + assert_eq!(len, 5, "create_for_write should truncate the file"); + } + + #[test] + fn open_reader_nonexistent_returns_error() { + let err = FileStorageProvider.open_reader("/nonexistent/path/file.bin"); + assert!(err.is_err()); + } + + #[test] + fn open_writer_nonexistent_returns_error() { + let err = FileStorageProvider.open_writer("/nonexistent/path/file.bin"); + assert!(err.is_err()); + } + + #[test] + fn delete_nonexistent_returns_error() { + let err = FileStorageProvider.delete("/nonexistent/path/file.bin"); + assert!(err.is_err()); } } diff --git a/diskann-storage/src/lib.rs b/diskann-storage/src/lib.rs new file mode 100644 index 000000000..ccb3b62dd --- /dev/null +++ b/diskann-storage/src/lib.rs @@ -0,0 +1,49 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +//! Storage abstraction layer for DiskANN. +//! +//! This crate provides traits and implementations for reading from and writing +//! to storage backends. The [`StorageReadProvider`] and [`StorageWriteProvider`] +//! traits abstract over concrete storage systems so that the same code can +//! operate against a local filesystem, an in-memory virtual filesystem, or any +//! other backend. +//! +//! # Crate features +//! +//! - **`virtual_storage`** — enables [`VirtualStorageProvider`], an in-memory +//! or overlay filesystem useful for testing without touching the real +//! filesystem. + +#![cfg_attr( + not(test), + warn(clippy::panic, clippy::unwrap_used, clippy::expect_used) +)] + +mod storage_provider; +pub use storage_provider::{ + DynWriteProvider, StorageReadProvider, StorageWriteProvider, WriteProviderWrapper, WriteSeek, +}; + +mod file_storage_provider; +pub use file_storage_provider::FileStorageProvider; + +#[cfg(any(test, feature = "virtual_storage"))] +mod virtual_storage_provider; +#[cfg(any(test, feature = "virtual_storage"))] +pub use virtual_storage_provider::VirtualStorageProvider; + +pub mod path_utility; +pub use path_utility::{ + get_compressed_pq_file, get_disk_index_compressed_pq_file, get_disk_index_file, + get_disk_index_pq_pivot_file, get_label_file, get_label_medoids_file, get_mem_index_data_file, + get_mem_index_file, get_pq_pivot_file, get_universal_label_file, +}; + +mod dataset_dto; +pub use dataset_dto::DatasetDto; + +pub mod proto_storage; +pub use proto_storage::{ProtoStorageError, load_proto, save_proto}; diff --git a/diskann-storage/src/path_utility.rs b/diskann-storage/src/path_utility.rs new file mode 100644 index 000000000..0254250e5 --- /dev/null +++ b/diskann-storage/src/path_utility.rs @@ -0,0 +1,151 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +//! Canonical file-path naming conventions for DiskANN index artifacts. +//! +//! Every function in this module takes a path prefix (e.g. `"/index/my_index"`) +//! and appends the appropriate suffix for the artifact type. This ensures that +//! all components in the system agree on file naming. + +/// Return the path for the in-memory graph index file. +pub fn get_mem_index_file(index_path_prefix: &str) -> String { + format!("{index_path_prefix}_mem.index") +} + +/// Return the path for the full-precision vector data file. +pub fn get_mem_index_data_file(mem_index_path: &str) -> String { + format!("{mem_index_path}.data") +} + +/// Return the path for the disk-based graph index file. +pub fn get_disk_index_file(index_path_prefix: &str) -> String { + format!("{index_path_prefix}_disk.index") +} + +/// Return the path for the PQ pivot table file. +pub fn get_pq_pivot_file(index_path_prefix: &str) -> String { + format!("{index_path_prefix}_pq_pivots.bin") +} + +/// Return the path for the PQ compressed data file. +pub fn get_compressed_pq_file(index_path_prefix: &str) -> String { + format!("{index_path_prefix}_pq_compressed.bin") +} + +/// Return the path for the disk-index PQ pivot table file. +pub fn get_disk_index_pq_pivot_file(index_path_prefix: &str) -> String { + format!("{index_path_prefix}_disk.index_pq_pivots.bin") +} + +/// Return the path for the disk-index PQ compressed data file. +pub fn get_disk_index_compressed_pq_file(index_path_prefix: &str) -> String { + format!("{index_path_prefix}_disk.index_pq_compressed.bin") +} + +/// Return the path for the label file. +pub fn get_label_file(index_path_prefix: &str) -> String { + format!("{index_path_prefix}_labels.txt") +} + +/// Return the path for the label-to-medoid mapping file. +pub fn get_label_medoids_file(index_path_prefix: &str) -> String { + format!("{index_path_prefix}_labels_to_medoids.txt") +} + +/// Return the path for the universal label file. +pub fn get_universal_label_file(index_path_prefix: &str) -> String { + format!("{index_path_prefix}_universal_label.txt") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn mem_index_file() { + assert_eq!(get_mem_index_file("test_prefix"), "test_prefix_mem.index"); + } + + #[test] + fn mem_index_data_file() { + assert_eq!(get_mem_index_data_file("test_prefix"), "test_prefix.data"); + } + + #[test] + fn disk_index_file() { + assert_eq!(get_disk_index_file("test_prefix"), "test_prefix_disk.index"); + } + + #[test] + fn pq_pivot_file() { + assert_eq!( + get_pq_pivot_file("test_prefix"), + "test_prefix_pq_pivots.bin" + ); + } + + #[test] + fn compressed_pq_file() { + assert_eq!( + get_compressed_pq_file("test_prefix"), + "test_prefix_pq_compressed.bin" + ); + } + + #[test] + fn disk_index_pq_pivot_file() { + assert_eq!( + get_disk_index_pq_pivot_file("test_prefix"), + "test_prefix_disk.index_pq_pivots.bin" + ); + } + + #[test] + fn disk_index_compressed_pq_file() { + assert_eq!( + get_disk_index_compressed_pq_file("test_prefix"), + "test_prefix_disk.index_pq_compressed.bin" + ); + } + + #[test] + fn label_file() { + assert_eq!(get_label_file("test_prefix"), "test_prefix_labels.txt"); + } + + #[test] + fn label_medoids_file() { + assert_eq!( + get_label_medoids_file("test_prefix"), + "test_prefix_labels_to_medoids.txt" + ); + } + + #[test] + fn universal_label_file() { + assert_eq!( + get_universal_label_file("test_prefix"), + "test_prefix_universal_label.txt" + ); + } + + #[test] + fn empty_prefix() { + assert_eq!(get_mem_index_file(""), "_mem.index"); + assert_eq!(get_label_file(""), "_labels.txt"); + } + + #[test] + fn prefix_with_path_separators() { + assert_eq!( + get_mem_index_file("/data/index/sift"), + "/data/index/sift_mem.index" + ); + assert_eq!( + get_pq_pivot_file("C:\\data\\index"), + "C:\\data\\index_pq_pivots.bin" + ); + } +} diff --git a/diskann-storage/src/proto_storage.rs b/diskann-storage/src/proto_storage.rs new file mode 100644 index 000000000..e70f9b9ca --- /dev/null +++ b/diskann-storage/src/proto_storage.rs @@ -0,0 +1,171 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +//! Generic protobuf I/O backed by [`StorageReadProvider`] / [`StorageWriteProvider`]. +//! +//! Provides [`load_proto`] and [`save_proto`] for reading and writing any +//! [`prost::Message`] type through the storage abstraction layer. + +use std::io::{Read, Result, Write}; + +use crate::{StorageReadProvider, StorageWriteProvider}; +use thiserror::Error; + +/// Load a protobuf message of type `S` from the item at `path`. +/// +/// The item is read in its entirety and then decoded via [`prost::Message::decode`]. +/// +/// # Errors +/// +/// Returns [`ProtoStorageError::IoError`] if reading fails, or +/// [`ProtoStorageError::DecodeError`] if the bytes cannot be decoded. +pub fn load_proto(read_provider: &P, path: &str) -> std::result::Result +where + P: StorageReadProvider, + S: prost::Message + Default, +{ + let mut reader = read_provider.open_reader(path)?; + let mut raw_buffer = Vec::new(); + reader.read_to_end(&mut raw_buffer)?; + Ok(S::decode(&*raw_buffer)?) +} + +/// Save a protobuf message to the item at `path`. +/// +/// The message is encoded via [`prost::Message::encode_to_vec`] and written +/// through the provider. Returns the number of bytes written on success. +/// +/// # Errors +/// +/// Propagates any I/O errors encountered during creation, writing, or flushing. +pub fn save_proto(proto_struct: S, write_provider: &P, path: &str) -> Result +where + P: StorageWriteProvider, + S: prost::Message, +{ + let mut writer = write_provider.create_for_write(path)?; + let encoded_proto = proto_struct.encode_to_vec(); + writer.write_all(&encoded_proto)?; + writer.flush()?; + Ok(encoded_proto.len()) +} + +/// Errors that can occur when reading from or writing to protobuf storage. +#[derive(Debug, Error)] +pub enum ProtoStorageError { + /// An I/O error occurred while accessing the underlying storage. + #[error("I/O error while accessing protobuf storage: {0}")] + IoError(#[from] std::io::Error), + + /// The bytes read from storage could not be decoded as the expected + /// protobuf message type. + #[error("failed to decode protobuf message: {0}")] + DecodeError(#[from] prost::DecodeError), +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::VirtualStorageProvider; + use prost::Message; + + /// A minimal protobuf message for testing round-trips. + /// + /// Generated-style struct (in production these come from `prost-build`). + #[derive(Clone, PartialEq, Message)] + struct TestMessage { + #[prost(string, tag = "1")] + name: String, + #[prost(uint32, tag = "2")] + value: u32, + } + + #[test] + fn save_and_load_round_trip() { + let storage = VirtualStorageProvider::new_memory(); + let original = TestMessage { + name: "hello".to_string(), + value: 42, + }; + + let bytes_written = save_proto(original.clone(), &storage, "/test.bin").unwrap(); + assert_eq!(bytes_written, original.encode_to_vec().len()); + + let loaded: TestMessage = load_proto(&storage, "/test.bin").unwrap(); + assert_eq!(loaded, original); + } + + #[test] + fn load_invalid_data_returns_decode_error() { + let storage = VirtualStorageProvider::new_memory(); + { + let mut writer = storage.create_for_write("/bad.bin").unwrap(); + // Write garbage bytes that don't form a valid protobuf. + writer.write_all(&[0xFF, 0xFF, 0xFF, 0xFF]).unwrap(); + writer.flush().unwrap(); + } + + let err = load_proto::<_, TestMessage>(&storage, "/bad.bin").unwrap_err(); + assert!( + matches!(err, ProtoStorageError::DecodeError(_)), + "expected DecodeError, got: {err:?}" + ); + } + + #[test] + fn load_missing_file_returns_io_error() { + let storage = VirtualStorageProvider::new_memory(); + let err = load_proto::<_, TestMessage>(&storage, "/missing.bin").unwrap_err(); + assert!( + matches!(err, ProtoStorageError::IoError(_)), + "expected IoError, got: {err:?}" + ); + } + + #[test] + fn save_returns_correct_byte_count() { + let storage = VirtualStorageProvider::new_memory(); + let msg = TestMessage { + name: "count_test".to_string(), + value: 99, + }; + let expected_len = msg.encode_to_vec().len(); + let actual = save_proto(msg, &storage, "/count.bin").unwrap(); + assert_eq!(actual, expected_len); + } + + #[test] + fn save_overwrites_existing_item() { + let storage = VirtualStorageProvider::new_memory(); + + let msg1 = TestMessage { + name: "first".to_string(), + value: 1, + }; + save_proto(msg1, &storage, "/overwrite.bin").unwrap(); + + let msg2 = TestMessage { + name: "second".to_string(), + value: 2, + }; + save_proto(msg2.clone(), &storage, "/overwrite.bin").unwrap(); + + let loaded: TestMessage = load_proto(&storage, "/overwrite.bin").unwrap(); + assert_eq!(loaded, msg2); + } + + #[test] + fn empty_message_round_trip() { + let storage = VirtualStorageProvider::new_memory(); + let empty = TestMessage { + name: String::new(), + value: 0, + }; + + save_proto(empty.clone(), &storage, "/empty.bin").unwrap(); + let loaded: TestMessage = load_proto(&storage, "/empty.bin").unwrap(); + assert_eq!(loaded, empty); + } +} diff --git a/diskann-storage/src/storage_provider.rs b/diskann-storage/src/storage_provider.rs new file mode 100644 index 000000000..29ea0d3f2 --- /dev/null +++ b/diskann-storage/src/storage_provider.rs @@ -0,0 +1,186 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +//! Core storage provider traits. +//! +//! These traits abstract over concrete I/O backends so that the rest of the +//! codebase can read and write index data without knowing whether the +//! underlying storage is a local filesystem, an in-memory virtual filesystem, +//! or a remote object store. + +use std::io::{Read, Result, Seek, Write}; + +/// Abstraction for read-only access to a storage backend. +/// +/// Implementations may target a local filesystem, an in-memory filesystem, or a +/// remote object store. The associated [`Reader`](Self::Reader) type must +/// support both sequential reads and random seeks. +/// +/// # Thread Safety +/// +/// Providers are required to be [`Sync`] so that they can be shared across +/// threads. Individual readers are *not* required to be thread-safe. +pub trait StorageReadProvider: Sync { + /// The reader type returned when opening an item. + type Reader: Read + Seek; + + /// Open a storage item identified by `item_identifier` for reading. + fn open_reader(&self, item_identifier: &str) -> Result; + + /// Return the size, in bytes, of the item identified by `item_identifier`. + fn get_length(&self, item_identifier: &str) -> Result; + + /// Return `true` if an item with the given identifier exists in storage. + fn exists(&self, item_identifier: &str) -> bool; +} + +/// Abstraction for write access to a storage backend. +/// +/// The associated [`Writer`](Self::Writer) type supports both sequential writes +/// and random seeks. +/// +/// # Thread Safety +/// +/// Providers are required to be [`Sync`]; individual writers are not. +pub trait StorageWriteProvider: Sync { + /// The writer type returned when opening or creating an item. + type Writer: WriteSeek; + + /// Open an existing storage item for writing. + fn open_writer(&self, item_identifier: &str) -> Result; + + /// Create a new storage item (or truncate an existing one) for writing. + fn create_for_write(&self, item_identifier: &str) -> Result; + + /// Delete the storage item identified by `item_identifier`. + fn delete(&self, item_identifier: &str) -> Result<()>; +} + +/// Trait alias for types that implement both [`Write`] and [`Seek`]. +/// +/// Automatically implemented for every type satisfying both bounds. +pub trait WriteSeek: Write + Seek {} +impl WriteSeek for T where T: Write + Seek {} + +/// Object-safe interface for opening and creating writers without exposing a +/// concrete provider type. +/// +/// This is useful when passing a writer provider through trait objects or other +/// dynamic boundaries. Methods return boxed writers so callers can use a single +/// uniform interface. +pub trait DynWriteProvider: Sync { + /// Open an existing item for writing, returning a boxed writer. + fn open_writer(&self, item_identifier: &str) -> std::io::Result>; + + /// Create a new item for writing, returning a boxed writer. + fn create_for_write(&self, item_identifier: &str) -> std::io::Result>; + + /// Delete the item identified by `item_identifier`. + fn delete(&self, item_identifier: &str) -> std::io::Result<()>; +} + +impl DynWriteProvider for T +where + T: StorageWriteProvider, +{ + fn open_writer(&self, item_identifier: &str) -> std::io::Result> { + self.open_writer(item_identifier) + .map(|w| Box::new(w) as Box) + } + + fn create_for_write(&self, item_identifier: &str) -> std::io::Result> { + self.create_for_write(item_identifier) + .map(|w| Box::new(w) as Box) + } + + fn delete(&self, item_identifier: &str) -> std::io::Result<()> { + self.delete(item_identifier) + } +} + +/// Adapter that exposes a `&dyn DynWriteProvider` as a [`StorageWriteProvider`]. +/// +/// Useful when an API is generic over [`StorageWriteProvider`] but the caller +/// only has a dynamic provider. The wrapper forwards all calls to the inner +/// provider and returns boxed writers tied to the wrapper lifetime `'a`. +pub struct WriteProviderWrapper<'a> { + inner: &'a dyn DynWriteProvider, +} + +impl<'a> WriteProviderWrapper<'a> { + /// Construct a new wrapper around the given dynamic provider reference. + pub const fn new(inner: &'a dyn DynWriteProvider) -> Self { + Self { inner } + } +} + +impl<'a> StorageWriteProvider for WriteProviderWrapper<'a> { + type Writer = Box; + + fn open_writer(&self, item_identifier: &str) -> std::io::Result { + self.inner.open_writer(item_identifier) + } + + fn create_for_write(&self, item_identifier: &str) -> std::io::Result { + self.inner.create_for_write(item_identifier) + } + + fn delete(&self, item_identifier: &str) -> std::io::Result<()> { + self.inner.delete(item_identifier) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Verify that `WriteProviderWrapper` delegates correctly to the wrapped + /// `DynWriteProvider` by exercising a round-trip through a concrete + /// provider. + #[test] + fn write_provider_wrapper_round_trip() { + use crate::VirtualStorageProvider; + use std::io::{Read, Seek, SeekFrom, Write}; + + let vsp = VirtualStorageProvider::new_memory(); + let dyn_provider: &dyn DynWriteProvider = &vsp; + let wrapper = WriteProviderWrapper::new(dyn_provider); + + // Write via the wrapper using the StorageWriteProvider trait. + { + let mut writer = + StorageWriteProvider::create_for_write(&wrapper, "/round_trip.bin").unwrap(); + writer.write_all(b"hello wrapper").unwrap(); + writer.flush().unwrap(); + } + + // Read back via the original provider. + let mut reader = vsp.open_reader("/round_trip.bin").unwrap(); + let mut buf = String::new(); + reader.seek(SeekFrom::Start(0)).unwrap(); + reader.read_to_string(&mut buf).unwrap(); + assert_eq!(buf, "hello wrapper"); + } + + /// Verify that `DynWriteProvider::delete` propagates correctly. + #[test] + fn dyn_write_provider_delete() { + use crate::VirtualStorageProvider; + use std::io::Write; + + let vsp = VirtualStorageProvider::new_memory(); + let dyn_provider: &dyn DynWriteProvider = &vsp; + + // Create and then delete. + { + let mut w = dyn_provider.create_for_write("/to_delete.bin").unwrap(); + w.write_all(b"tmp").unwrap(); + w.flush().unwrap(); + } + assert!(vsp.exists("/to_delete.bin")); + dyn_provider.delete("/to_delete.bin").unwrap(); + assert!(!vsp.exists("/to_delete.bin")); + } +} diff --git a/diskann-providers/src/storage/virtual_storage_provider.rs b/diskann-storage/src/virtual_storage_provider.rs similarity index 63% rename from diskann-providers/src/storage/virtual_storage_provider.rs rename to diskann-storage/src/virtual_storage_provider.rs index 8b99a6e35..5dbc86145 100644 --- a/diskann-providers/src/storage/virtual_storage_provider.rs +++ b/diskann-storage/src/virtual_storage_provider.rs @@ -5,37 +5,40 @@ //! Virtual storage providers for testing. //! -//! This module provides test utilities that allow using in-memory or overlay filesystems -//! instead of the real filesystem. This is useful for keeping test data in-memory and -//! avoiding filesystem side-effects during testing. +//! [`VirtualStorageProvider`] wraps a [`vfs::FileSystem`] implementation so +//! that tests can run entirely in memory or against an overlay filesystem +//! without touching the real disk. use std::{io, io::Result}; use vfs::{MemoryFS, OverlayFS, PhysicalFS, SeekAndRead, SeekAndWrite, filesystem::FileSystem}; -use super::{StorageReadProvider, StorageWriteProvider}; +use crate::{StorageReadProvider, StorageWriteProvider}; -/// VirtualStorageProvider implements both StorageReadProvider and StorageWriteProvider. -/// This is a test utility and is not intended for use in production code. This allows us to -/// specify alternate filesystems to make testing easier. +/// Storage provider backed by an arbitrary [`vfs::FileSystem`]. /// -/// # Examples: +/// Use the factory methods [`new_memory`](VirtualStorageProvider::new_memory), +/// [`new_overlay`](VirtualStorageProvider::new_overlay), or +/// [`new_physical`](VirtualStorageProvider::new_physical) to construct +/// instances for different backends. +/// +/// # Examples /// -/// Use an in-memory filesystem instead of the normal filesystem to keep all test data in-memory /// ``` /// use std::io::Write; /// use vfs::FileSystem; -/// use diskann_providers::storage::VirtualStorageProvider; +/// use diskann_storage::VirtualStorageProvider; /// /// let storage_provider = VirtualStorageProvider::new_memory(); /// -/// // Create the root directory +/// // Create the root directory. /// storage_provider.filesystem().create_dir("/test_root").expect("Could not create test directory"); /// /// { -/// // Write test data to the in-memory filesystem inside a scope block so that the writer -/// // is flushed and disposed before using the storage_provider. -/// let mut file = storage_provider.filesystem().create_file("/test_root/input_data.bin").expect("Could not create test file"); +/// let mut file = storage_provider +/// .filesystem() +/// .create_file("/test_root/input_data.bin") +/// .expect("Could not create test file"); /// file.write_all(b"This is test data").expect("Unable to write test data"); /// } /// ``` @@ -48,6 +51,7 @@ impl VirtualStorageProvider { VirtualStorageProvider { filesystem } } + /// Return `true` if the item identified by `item_identifier` exists. pub fn exists(&self, item_identifier: &str) -> bool { self.filesystem.metadata(item_identifier).is_ok() } @@ -57,7 +61,7 @@ impl VirtualStorageProvider { &self.filesystem } - /// Consume the storage provider, returning the underlying filesystem. + /// Consume the provider, returning the underlying filesystem. pub fn take(self) -> FileSystemType { self.filesystem } @@ -117,8 +121,8 @@ impl StorageWriteProvider for VirtualStorageProvider } impl VirtualStorageProvider { - /// Create a two-layer overlay filesystem with an in-memory filesystem for writes - /// on top of the physical filesystem for reads. + /// Create a two-layer overlay filesystem with an in-memory layer for writes + /// on top of a physical filesystem for reads. pub fn new_overlay>(path: P) -> Self { #[allow(clippy::disallowed_methods)] let base_filesystem = PhysicalFS::new(path); @@ -131,21 +135,19 @@ impl VirtualStorageProvider { } impl VirtualStorageProvider { - /// Create a storage provider that uses an in-memory filesystem. + /// Create a storage provider backed by a pure in-memory filesystem. pub fn new_memory() -> Self { let memory_filesystem = MemoryFS::new(); - VirtualStorageProvider::new(memory_filesystem) } } impl VirtualStorageProvider { - /// Create a storage provider that uses the physical filesystem with a custom root path. - /// This prevents operations from writing outside of the specified sandbox. + /// Create a storage provider that sandboxes physical filesystem access + /// to the given root path. pub fn new_physical>(path: P) -> Self { #[allow(clippy::disallowed_methods)] let physical_filesystem = PhysicalFS::new(path); - VirtualStorageProvider::new(physical_filesystem) } } @@ -157,7 +159,7 @@ mod tests { use super::*; #[test] - fn test_file_reader() { + fn read_after_write_in_memory() { let file_name = "/test_file_reader.txt"; let storage_provider = VirtualStorageProvider::new_memory(); @@ -173,17 +175,16 @@ mod tests { let mut buffer = [0; 5]; reader.seek(SeekFrom::Start(0)).unwrap(); - reader.read(&mut buffer).unwrap(); + reader.read_exact(&mut buffer).unwrap(); assert_eq!(&buffer, b"Hello"); reader.seek(SeekFrom::Start(5)).unwrap(); - reader.read(&mut buffer).unwrap(); + reader.read_exact(&mut buffer).unwrap(); assert_eq!(&buffer, b", wor"); - storage_provider.take().remove_file(file_name).unwrap(); } #[test] - fn test_file_storage_exists() { + fn exists_reports_correctly() { let storage_provider = VirtualStorageProvider::new_memory(); let file_name = "/test_file_storage_exists.txt"; @@ -199,7 +200,6 @@ mod tests { .write_all(b"This is the text") .expect("Write did not succeed"); - // Make sure the file exists assert!( storage_provider.exists(file_name), "New file does not exist" @@ -216,7 +216,7 @@ mod tests { } #[test] - fn test_file_storage_get_length() { + fn get_length_returns_byte_count() { let file_name = "/test_file_storage_get_length.txt"; let storage_provider = VirtualStorageProvider::new_memory(); @@ -228,6 +228,67 @@ mod tests { .expect("Write did not succeed"); assert_eq!(storage_provider.get_length(file_name).unwrap(), 13); - storage_provider.take().remove_file(file_name).unwrap(); + } + + #[test] + fn create_for_write_replaces_existing() { + let storage_provider = VirtualStorageProvider::new_memory(); + let path = "/replace_me.bin"; + + { + let mut w = storage_provider.create_for_write(path).unwrap(); + w.write_all(b"original content").unwrap(); + w.flush().unwrap(); + } + + { + let mut w = storage_provider.create_for_write(path).unwrap(); + w.write_all(b"new").unwrap(); + w.flush().unwrap(); + } + + assert_eq!(storage_provider.get_length(path).unwrap(), 3); + } + + #[test] + fn delete_removes_item() { + let storage_provider = VirtualStorageProvider::new_memory(); + let path = "/to_delete.bin"; + + { + let mut w = storage_provider.create_for_write(path).unwrap(); + w.write_all(b"data").unwrap(); + w.flush().unwrap(); + } + assert!(storage_provider.exists(path)); + + storage_provider.delete(path).unwrap(); + assert!(!storage_provider.exists(path)); + } + + #[test] + fn open_reader_missing_file_returns_error() { + let storage_provider = VirtualStorageProvider::new_memory(); + let result = storage_provider.open_reader("/does_not_exist.bin"); + assert!(result.is_err()); + } + + #[test] + fn trait_based_read_provider() { + let vsp = VirtualStorageProvider::new_memory(); + { + let mut w = vsp.create_for_write("/trait_test.bin").unwrap(); + w.write_all(b"via trait").unwrap(); + w.flush().unwrap(); + } + + fn read_via_trait(provider: &dyn StorageReadProvider) { + assert!(provider.exists("/trait_test.bin")); + assert_eq!(provider.get_length("/trait_test.bin").unwrap(), 9); + } + + // We can't use the dyn version with associated types directly, but we + // can verify the impl works through a generic function. + read_via_trait(&vsp); } } diff --git a/diskann-tools/Cargo.toml b/diskann-tools/Cargo.toml index ae987dca9..2ad50fa9d 100644 --- a/diskann-tools/Cargo.toml +++ b/diskann-tools/Cargo.toml @@ -13,6 +13,7 @@ license.workspace = true byteorder.workspace = true clap = { workspace = true, features = ["derive"] } diskann-providers = { workspace = true, default-features = false } # see `linalg/Cargo.toml` +diskann-storage = { workspace = true } diskann-vector = { workspace = true } diskann-disk = { workspace = true } diskann-utils = { workspace = true } @@ -41,6 +42,7 @@ vfs = { workspace = true } diskann-providers = { workspace = true, default-features = false, features = [ "virtual_storage", ] } +diskann-storage = { workspace = true, features = ["virtual_storage"] } diskann-utils = { workspace = true, features = ["testing"] } [features] diff --git a/diskann-tools/src/bin/gen_associated_data_from_range.rs b/diskann-tools/src/bin/gen_associated_data_from_range.rs index 708dc5004..f0f1ec165 100644 --- a/diskann-tools/src/bin/gen_associated_data_from_range.rs +++ b/diskann-tools/src/bin/gen_associated_data_from_range.rs @@ -4,7 +4,7 @@ */ use clap::Parser; -use diskann_providers::storage::FileStorageProvider; +use diskann_storage::FileStorageProvider; use diskann_tools::utils::{gen_associated_data_from_range, CMDResult}; fn main() -> CMDResult<()> { diff --git a/diskann-tools/src/bin/generate_minmax.rs b/diskann-tools/src/bin/generate_minmax.rs index 7ced9e2ef..74e903e50 100644 --- a/diskann-tools/src/bin/generate_minmax.rs +++ b/diskann-tools/src/bin/generate_minmax.rs @@ -11,7 +11,6 @@ use std::{ use anyhow::{Context, Result}; use clap::Parser; -use diskann_providers::storage::StorageReadProvider; use diskann_quantization::{ algorithms::transforms::{DoubleHadamard, TargetDim}, alloc::GlobalAllocator, @@ -19,6 +18,7 @@ use diskann_quantization::{ num::Positive, CompressInto, }; +use diskann_storage::StorageReadProvider; use diskann_utils::io::Metadata; use half::f16; use rand::{rngs::StdRng, SeedableRng}; @@ -103,7 +103,7 @@ where { // Load input data let input_data = diskann_utils::io::read_bin::( - &mut diskann_providers::storage::FileStorageProvider + &mut diskann_storage::FileStorageProvider .open_reader(input_path) .with_context(|| format!("Failed to open {}", input_path))?, ) diff --git a/diskann-tools/src/bin/generate_pq.rs b/diskann-tools/src/bin/generate_pq.rs index 740ab9aa1..170259a71 100644 --- a/diskann-tools/src/bin/generate_pq.rs +++ b/diskann-tools/src/bin/generate_pq.rs @@ -4,7 +4,7 @@ */ use clap::Parser; -use diskann_providers::storage::FileStorageProvider; +use diskann_storage::FileStorageProvider; use diskann_tools::utils::{ build_pq, get_num_threads, init_subscriber, BuildPQParameters, CMDToolError, DataType, GraphDataF32Vector, GraphDataHalfVector, GraphDataInt8Vector, GraphDataU8Vector, diff --git a/diskann-tools/src/bin/random_data_generator.rs b/diskann-tools/src/bin/random_data_generator.rs index 3fc258dc9..8233e6037 100644 --- a/diskann-tools/src/bin/random_data_generator.rs +++ b/diskann-tools/src/bin/random_data_generator.rs @@ -4,7 +4,7 @@ */ use clap::Parser; -use diskann_providers::storage::FileStorageProvider; +use diskann_storage::FileStorageProvider; use diskann_tools::utils::{write_random_data, CMDResult, CMDToolError, DataType}; #[derive(Debug, Parser)] diff --git a/diskann-tools/src/bin/relative_contrast.rs b/diskann-tools/src/bin/relative_contrast.rs index 994b1ff6f..59746ff6e 100644 --- a/diskann-tools/src/bin/relative_contrast.rs +++ b/diskann-tools/src/bin/relative_contrast.rs @@ -4,8 +4,8 @@ */ use clap::Parser; -use diskann_providers::storage::FileStorageProvider; use diskann_providers::utils::random; +use diskann_storage::FileStorageProvider; use diskann_tools::utils::{ init_subscriber, relative_contrast::compute_relative_contrast, CMDResult, DataType, GraphDataF32Vector, GraphDataHalfVector, GraphDataInt8Vector, GraphDataU8Vector, diff --git a/diskann-tools/src/bin/subsample_bin.rs b/diskann-tools/src/bin/subsample_bin.rs index 6612ea91b..490ac791e 100644 --- a/diskann-tools/src/bin/subsample_bin.rs +++ b/diskann-tools/src/bin/subsample_bin.rs @@ -13,9 +13,8 @@ use rand::rngs::StdRng; use rand_distr::{Distribution, StandardUniform}; use diskann::utils::VectorRepr; -use diskann_providers::storage::FileStorageProvider; -use diskann_providers::storage::StorageWriteProvider; use diskann_providers::utils::{random, SampleVectorReader, SamplingDensity}; +use diskann_storage::{FileStorageProvider, StorageWriteProvider}; use diskann_tools::utils::DataType; use diskann_utils::io::Metadata; diff --git a/diskann-tools/src/utils/build_disk_index.rs b/diskann-tools/src/utils/build_disk_index.rs index 916c24c8a..856164c61 100644 --- a/diskann-tools/src/utils/build_disk_index.rs +++ b/diskann-tools/src/utils/build_disk_index.rs @@ -19,11 +19,11 @@ use diskann_disk::{ storage::DiskIndexWriter, QuantizationType, }; -use diskann_providers::storage::{StorageReadProvider, StorageWriteProvider}; use diskann_providers::{ model::{graph::traits::GraphDataType, IndexConfiguration}, utils::{load_metadata_from_file, Timer}, }; +use diskann_storage::{StorageReadProvider, StorageWriteProvider}; use diskann_vector::distance::Metric; use opentelemetry::global::BoxedSpan; #[cfg(feature = "perf_test")] @@ -187,7 +187,7 @@ where #[cfg(test)] mod tests { use diskann::ANNErrorKind; - use diskann_providers::storage::VirtualStorageProvider; + use diskann_storage::VirtualStorageProvider; use vfs::MemoryFS; use super::*; diff --git a/diskann-tools/src/utils/build_pq.rs b/diskann-tools/src/utils/build_pq.rs index becdeb1d6..da43de3ee 100644 --- a/diskann-tools/src/utils/build_pq.rs +++ b/diskann-tools/src/utils/build_pq.rs @@ -4,18 +4,18 @@ */ use diskann::ANNResult; -use diskann_providers::storage::StorageReadProvider; use diskann_providers::{ model::{ graph::traits::GraphDataType, GeneratePivotArguments, MAX_PQ_TRAINING_SET_SIZE, NUM_KMEANS_REPS_PQ, NUM_PQ_CENTROIDS, }, - storage::{ - get_disk_index_compressed_pq_file, get_disk_index_pq_pivot_file, FileStorageProvider, - PQStorage, - }, + storage::PQStorage, utils::{load_metadata_from_file, Timer}, }; +use diskann_storage::{ + get_disk_index_compressed_pq_file, get_disk_index_pq_pivot_file, FileStorageProvider, + StorageReadProvider, +}; use diskann_vector::distance::Metric; use tracing::info; diff --git a/diskann-tools/src/utils/gen_associated_data_from_range.rs b/diskann-tools/src/utils/gen_associated_data_from_range.rs index 7d91ec865..5dc66708e 100644 --- a/diskann-tools/src/utils/gen_associated_data_from_range.rs +++ b/diskann-tools/src/utils/gen_associated_data_from_range.rs @@ -5,7 +5,7 @@ use std::io::Write; -use diskann_providers::storage::StorageWriteProvider; +use diskann_storage::StorageWriteProvider; use diskann_utils::io::Metadata; use super::CMDResult; @@ -37,7 +37,7 @@ pub fn gen_associated_data_from_range( mod tests { use super::*; use byteorder::{LittleEndian, ReadBytesExt}; - use diskann_providers::storage::{StorageReadProvider, VirtualStorageProvider}; + use diskann_storage::{StorageReadProvider, VirtualStorageProvider}; #[test] fn test_gen_associated_data_from_range() { diff --git a/diskann-tools/src/utils/generate_synthetic_labels_utils.rs b/diskann-tools/src/utils/generate_synthetic_labels_utils.rs index 856243128..c5a9577d5 100644 --- a/diskann-tools/src/utils/generate_synthetic_labels_utils.rs +++ b/diskann-tools/src/utils/generate_synthetic_labels_utils.rs @@ -6,7 +6,7 @@ use std::{collections::HashMap, io::Write}; use diskann::ANNResult; -use diskann_providers::storage::DynWriteProvider; +use diskann_storage::DynWriteProvider; use rand::{ distr::{Bernoulli, Distribution}, Rng, @@ -132,9 +132,7 @@ pub fn generate_labels( mod test { use std::io::BufRead; - use diskann_providers::storage::{ - StorageReadProvider, StorageWriteProvider, VirtualStorageProvider, - }; + use diskann_storage::{StorageReadProvider, StorageWriteProvider, VirtualStorageProvider}; use super::generate_labels; diff --git a/diskann-tools/src/utils/ground_truth.rs b/diskann-tools/src/utils/ground_truth.rs index 883f0c2ec..08040d498 100644 --- a/diskann-tools/src/utils/ground_truth.rs +++ b/diskann-tools/src/utils/ground_truth.rs @@ -13,12 +13,12 @@ use diskann::{ neighbor::{Neighbor, NeighborPriorityQueue}, utils::VectorRepr, }; -use diskann_providers::storage::{StorageReadProvider, StorageWriteProvider}; use diskann_providers::{ common::AlignedBoxWithSlice, model::graph::traits::GraphDataType, utils::{create_thread_pool, file_util, ParallelIteratorInPool, VectorDataIterator}, }; +use diskann_storage::{StorageReadProvider, StorageWriteProvider}; use diskann_utils::{ io::{read_bin, Metadata}, views::Matrix, diff --git a/diskann-tools/src/utils/random_data_generator.rs b/diskann-tools/src/utils/random_data_generator.rs index 811e6434b..544d0636b 100644 --- a/diskann-tools/src/utils/random_data_generator.rs +++ b/diskann-tools/src/utils/random_data_generator.rs @@ -6,7 +6,8 @@ use std::io::{BufWriter, Write}; use byteorder::{LittleEndian, WriteBytesExt}; -use diskann_providers::{storage::StorageWriteProvider, utils::math_util}; +use diskann_providers::utils::math_util; +use diskann_storage::StorageWriteProvider; use diskann_utils::io::Metadata; use diskann_vector::Half; @@ -197,7 +198,7 @@ fn write_random_vector_block< #[cfg(test)] mod tests { - use diskann_providers::storage::VirtualStorageProvider; + use diskann_storage::VirtualStorageProvider; use rstest::rstest; use super::*; diff --git a/diskann-tools/src/utils/relative_contrast.rs b/diskann-tools/src/utils/relative_contrast.rs index 3b521e4e6..bc75d4e0f 100644 --- a/diskann-tools/src/utils/relative_contrast.rs +++ b/diskann-tools/src/utils/relative_contrast.rs @@ -5,7 +5,7 @@ use diskann::{utils::VectorRepr, ANNError}; use diskann_providers::model::graph::traits::GraphDataType; -use diskann_providers::storage::StorageReadProvider; +use diskann_storage::StorageReadProvider; use diskann_utils::io::read_bin; use rand::Rng; @@ -116,8 +116,8 @@ pub fn compute_relative_contrast< #[cfg(test)] mod relative_contrast_tests { - use diskann_providers::storage::{StorageWriteProvider, VirtualStorageProvider}; use diskann_providers::utils::random; + use diskann_storage::{StorageWriteProvider, VirtualStorageProvider}; use diskann_utils::io::Metadata; use diskann_vector::distance::Metric; use half::f16; diff --git a/diskann-tools/src/utils/search_disk_index.rs b/diskann-tools/src/utils/search_disk_index.rs index 65bb8ffb5..3e429760d 100644 --- a/diskann-tools/src/utils/search_disk_index.rs +++ b/diskann-tools/src/utils/search_disk_index.rs @@ -17,12 +17,13 @@ use diskann_disk::{ QueryStatistics, }, }; -use diskann_providers::storage::{StorageReadProvider, StorageWriteProvider}; use diskann_providers::{ model::graph::traits::GraphDataType, - storage::{get_compressed_pq_file, get_pq_pivot_file}, utils::{create_thread_pool, load_aligned_bin, ParallelIteratorInPool}, }; +use diskann_storage::{ + get_compressed_pq_file, get_pq_pivot_file, StorageReadProvider, StorageWriteProvider, +}; use diskann_utils::{io::write_bin, views::MatrixView}; use diskann_vector::distance::Metric; use opentelemetry::global::BoxedSpan; diff --git a/diskann-tools/src/utils/search_index_utils.rs b/diskann-tools/src/utils/search_index_utils.rs index 176f2e1ec..1d4475d69 100644 --- a/diskann-tools/src/utils/search_index_utils.rs +++ b/diskann-tools/src/utils/search_index_utils.rs @@ -6,7 +6,7 @@ use std::{collections::HashSet, fmt, hash::Hash, io::Read, mem::size_of}; use bytemuck::cast_slice; use diskann::{ANNError, ANNResult}; -use diskann_providers::storage::StorageReadProvider; +use diskann_storage::StorageReadProvider; use diskann_utils::io::Metadata; use tracing::info;