From a36ed5ad28928899e88e7a1f04c848cfd1500271 Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Fri, 20 Mar 2026 16:05:24 -0400 Subject: [PATCH 1/2] add `vortex-compressor` Signed-off-by: Connor Tsui --- Cargo.lock | 20 +- Cargo.toml | 2 + fuzz/src/array/mod.rs | 15 +- vortex-array/public-api.lock | 8 + vortex-array/src/executor.rs | 1 + vortex-compressor/Cargo.toml | 34 + vortex-compressor/public-api.lock | 865 ++++++++++++++++++ vortex-compressor/src/builtins/constant.rs | 216 +++++ vortex-compressor/src/builtins/dict/float.rs | 152 +++ .../src/builtins/dict/integer.rs | 166 ++++ vortex-compressor/src/builtins/dict/mod.rs | 316 +++++++ vortex-compressor/src/builtins/mod.rs | 45 + vortex-compressor/src/compressor.rs | 525 +++++++++++ vortex-compressor/src/ctx.rs | 109 +++ vortex-compressor/src/lib.rs | 27 + vortex-compressor/src/sample.rs | 161 ++++ vortex-compressor/src/scheme.rs | 286 ++++++ vortex-compressor/src/stats/cache.rs | 133 +++ vortex-compressor/src/stats/float.rs | 315 +++++++ vortex-compressor/src/stats/integer.rs | 622 +++++++++++++ vortex-compressor/src/stats/mod.rs | 22 + vortex-compressor/src/stats/options.rs | 26 + vortex-compressor/src/stats/string.rs | 102 +++ vortex-file/src/strategy.rs | 42 +- vortex-layout/src/layouts/compressed.rs | 5 +- vortex/public-api.lock | 6 +- vortex/src/lib.rs | 5 +- 27 files changed, 4193 insertions(+), 33 deletions(-) create mode 100644 vortex-compressor/Cargo.toml create mode 100644 vortex-compressor/public-api.lock create mode 100644 vortex-compressor/src/builtins/constant.rs create mode 100644 vortex-compressor/src/builtins/dict/float.rs create mode 100644 vortex-compressor/src/builtins/dict/integer.rs create mode 100644 vortex-compressor/src/builtins/dict/mod.rs create mode 100644 vortex-compressor/src/builtins/mod.rs create mode 100644 vortex-compressor/src/compressor.rs create mode 100644 vortex-compressor/src/ctx.rs create mode 100644 vortex-compressor/src/lib.rs create mode 100644 
vortex-compressor/src/sample.rs create mode 100644 vortex-compressor/src/scheme.rs create mode 100644 vortex-compressor/src/stats/cache.rs create mode 100644 vortex-compressor/src/stats/float.rs create mode 100644 vortex-compressor/src/stats/integer.rs create mode 100644 vortex-compressor/src/stats/mod.rs create mode 100644 vortex-compressor/src/stats/options.rs create mode 100644 vortex-compressor/src/stats/string.rs diff --git a/Cargo.lock b/Cargo.lock index 54014adfb1d..8fc152107c7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9860,7 +9860,6 @@ name = "vortex-btrblocks" version = "0.1.0" dependencies = [ "codspeed-divan-compat", - "enum-iterator", "getrandom 0.4.2", "itertools 0.14.0", "num-traits", @@ -9873,6 +9872,7 @@ dependencies = [ "vortex-alp", "vortex-array", "vortex-buffer", + "vortex-compressor", "vortex-datetime-parts", "vortex-decimal-byte-parts", "vortex-error", @@ -9943,6 +9943,24 @@ dependencies = [ "vortex-session", ] +[[package]] +name = "vortex-compressor" +version = "0.1.0" +dependencies = [ + "itertools 0.14.0", + "num-traits", + "parking_lot", + "rand 0.10.0", + "rstest", + "rustc-hash", + "tracing", + "vortex-array", + "vortex-buffer", + "vortex-error", + "vortex-mask", + "vortex-utils", +] + [[package]] name = "vortex-cub" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 0d6853627a8..4a374e1e034 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,6 +12,7 @@ members = [ "vortex-proto", "vortex-array", "vortex-tensor", + "vortex-compressor", "vortex-btrblocks", "vortex-layout", "vortex-scan", @@ -259,6 +260,7 @@ vortex-array = { version = "0.1.0", path = "./vortex-array", default-features = vortex-btrblocks = { version = "0.1.0", path = "./vortex-btrblocks", default-features = false } vortex-buffer = { version = "0.1.0", path = "./vortex-buffer", default-features = false } vortex-bytebool = { version = "0.1.0", path = "./encodings/bytebool", default-features = false } +vortex-compressor = { version = "0.1.0", path = 
"./vortex-compressor", default-features = false } vortex-datafusion = { version = "0.1.0", path = "./vortex-datafusion", default-features = false } vortex-datetime-parts = { version = "0.1.0", path = "./encodings/datetime-parts", default-features = false } vortex-decimal-byte-parts = { version = "0.1.0", path = "encodings/decimal-byte-parts", default-features = false } diff --git a/fuzz/src/array/mod.rs b/fuzz/src/array/mod.rs index 0b101b91d8f..70094fed072 100644 --- a/fuzz/src/array/mod.rs +++ b/fuzz/src/array/mod.rs @@ -61,9 +61,10 @@ use vortex_array::search_sorted::SearchSorted; use vortex_array::search_sorted::SearchSortedSide; use vortex_btrblocks::BtrBlocksCompressor; use vortex_btrblocks::BtrBlocksCompressorBuilder; -use vortex_btrblocks::FloatCode; -use vortex_btrblocks::IntCode; -use vortex_btrblocks::StringCode; +use vortex_btrblocks::SchemeExt; +use vortex_btrblocks::schemes::float; +use vortex_btrblocks::schemes::integer; +use vortex_btrblocks::schemes::string; use vortex_error::VortexExpect; use vortex_error::vortex_panic; use vortex_mask::Mask; @@ -546,9 +547,11 @@ pub fn compress_array(array: &ArrayRef, strategy: CompressorStrategy) -> ArrayRe .compress(array) .vortex_expect("BtrBlocksCompressor compress should succeed in fuzz test"), CompressorStrategy::Compact => BtrBlocksCompressorBuilder::default() - .include_string([StringCode::Zstd]) - .include_int([IntCode::Pco]) - .include_float([FloatCode::Pco]) + .include([ + string::ZstdScheme.id(), + integer::PcoScheme.id(), + float::PcoScheme.id(), + ]) .build() .compress(array) .vortex_expect("Compact compress should succeed in fuzz test"), diff --git a/vortex-array/public-api.lock b/vortex-array/public-api.lock index c53d9a893cf..b47484e1adf 100644 --- a/vortex-array/public-api.lock +++ b/vortex-array/public-api.lock @@ -22266,6 +22266,14 @@ pub fn vortex_array::ExecutionCtx::new(session: vortex_session::VortexSession) - pub fn vortex_array::ExecutionCtx::session(&self) -> 
&vortex_session::VortexSession +impl core::clone::Clone for vortex_array::ExecutionCtx + +pub fn vortex_array::ExecutionCtx::clone(&self) -> vortex_array::ExecutionCtx + +impl core::fmt::Debug for vortex_array::ExecutionCtx + +pub fn vortex_array::ExecutionCtx::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + impl core::fmt::Display for vortex_array::ExecutionCtx pub fn vortex_array::ExecutionCtx::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result diff --git a/vortex-array/src/executor.rs b/vortex-array/src/executor.rs index da05450f8de..114adf355c3 100644 --- a/vortex-array/src/executor.rs +++ b/vortex-array/src/executor.rs @@ -172,6 +172,7 @@ impl dyn DynArray + '_ { /// /// Accumulates a trace of execution steps. Individual steps are logged at TRACE level for /// real-time following, and the full trace is dumped at DEBUG level when the context is dropped. +#[derive(Debug, Clone)] pub struct ExecutionCtx { id: usize, session: VortexSession, diff --git a/vortex-compressor/Cargo.toml b/vortex-compressor/Cargo.toml new file mode 100644 index 00000000000..260c9c531f5 --- /dev/null +++ b/vortex-compressor/Cargo.toml @@ -0,0 +1,34 @@ +[package] +name = "vortex-compressor" +authors = { workspace = true } +categories = { workspace = true } +description = "Encoding-agnostic compression framework for Vortex arrays" +edition = { workspace = true } +homepage = { workspace = true } +include = { workspace = true } +keywords = { workspace = true } +license = { workspace = true } +readme = { workspace = true } +repository = { workspace = true } +rust-version = { workspace = true } +version = { workspace = true } + +[dependencies] +itertools = { workspace = true } +num-traits = { workspace = true } +parking_lot = { workspace = true } +rand = { workspace = true } +rustc-hash = { workspace = true } +tracing = { workspace = true } +vortex-array = { workspace = true } +vortex-buffer = { workspace = true } +vortex-error = { workspace = true } 
+vortex-mask = { workspace = true } +vortex-utils = { workspace = true } + +[dev-dependencies] +rstest = { workspace = true } +vortex-array = { workspace = true, features = ["_test-harness"] } + +[lints] +workspace = true diff --git a/vortex-compressor/public-api.lock b/vortex-compressor/public-api.lock new file mode 100644 index 00000000000..3fbc28076eb --- /dev/null +++ b/vortex-compressor/public-api.lock @@ -0,0 +1,865 @@ +pub mod vortex_compressor + +pub mod vortex_compressor::builtins + +pub struct vortex_compressor::builtins::FloatConstantScheme + +impl core::clone::Clone for vortex_compressor::builtins::FloatConstantScheme + +pub fn vortex_compressor::builtins::FloatConstantScheme::clone(&self) -> vortex_compressor::builtins::FloatConstantScheme + +impl core::cmp::Eq for vortex_compressor::builtins::FloatConstantScheme + +impl core::cmp::PartialEq for vortex_compressor::builtins::FloatConstantScheme + +pub fn vortex_compressor::builtins::FloatConstantScheme::eq(&self, other: &vortex_compressor::builtins::FloatConstantScheme) -> bool + +impl core::fmt::Debug for vortex_compressor::builtins::FloatConstantScheme + +pub fn vortex_compressor::builtins::FloatConstantScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::builtins::FloatConstantScheme + +impl core::marker::StructuralPartialEq for vortex_compressor::builtins::FloatConstantScheme + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::FloatConstantScheme + +pub fn vortex_compressor::builtins::FloatConstantScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::FloatConstantScheme::compress(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::FloatConstantScheme::descendant_exclusions(&self) -> 
alloc::vec::Vec + +pub fn vortex_compressor::builtins::FloatConstantScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::FloatConstantScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::FloatConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::FloatConstantScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::FloatConstantScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::FloatConstantScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +pub struct vortex_compressor::builtins::FloatDictScheme + +impl core::clone::Clone for vortex_compressor::builtins::FloatDictScheme + +pub fn vortex_compressor::builtins::FloatDictScheme::clone(&self) -> vortex_compressor::builtins::FloatDictScheme + +impl core::cmp::Eq for vortex_compressor::builtins::FloatDictScheme + +impl core::cmp::PartialEq for vortex_compressor::builtins::FloatDictScheme + +pub fn vortex_compressor::builtins::FloatDictScheme::eq(&self, other: &vortex_compressor::builtins::FloatDictScheme) -> bool + +impl core::fmt::Debug for vortex_compressor::builtins::FloatDictScheme + +pub fn vortex_compressor::builtins::FloatDictScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::builtins::FloatDictScheme + +impl core::marker::StructuralPartialEq for vortex_compressor::builtins::FloatDictScheme + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::FloatDictScheme + +pub fn vortex_compressor::builtins::FloatDictScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::FloatDictScheme::compress(&self, 
compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::FloatDictScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::FloatDictScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::FloatDictScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::FloatDictScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::FloatDictScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::FloatDictScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::FloatDictScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +pub struct vortex_compressor::builtins::IntConstantScheme + +impl core::clone::Clone for vortex_compressor::builtins::IntConstantScheme + +pub fn vortex_compressor::builtins::IntConstantScheme::clone(&self) -> vortex_compressor::builtins::IntConstantScheme + +impl core::cmp::Eq for vortex_compressor::builtins::IntConstantScheme + +impl core::cmp::PartialEq for vortex_compressor::builtins::IntConstantScheme + +pub fn vortex_compressor::builtins::IntConstantScheme::eq(&self, other: &vortex_compressor::builtins::IntConstantScheme) -> bool + +impl core::fmt::Debug for vortex_compressor::builtins::IntConstantScheme + +pub fn vortex_compressor::builtins::IntConstantScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::builtins::IntConstantScheme + +impl core::marker::StructuralPartialEq for vortex_compressor::builtins::IntConstantScheme + 
+impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::IntConstantScheme + +pub fn vortex_compressor::builtins::IntConstantScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::IntConstantScheme::compress(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::IntConstantScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::IntConstantScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::IntConstantScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::IntConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::IntConstantScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::IntConstantScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::IntConstantScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +pub struct vortex_compressor::builtins::IntDictScheme + +impl core::clone::Clone for vortex_compressor::builtins::IntDictScheme + +pub fn vortex_compressor::builtins::IntDictScheme::clone(&self) -> vortex_compressor::builtins::IntDictScheme + +impl core::cmp::Eq for vortex_compressor::builtins::IntDictScheme + +impl core::cmp::PartialEq for vortex_compressor::builtins::IntDictScheme + +pub fn vortex_compressor::builtins::IntDictScheme::eq(&self, other: &vortex_compressor::builtins::IntDictScheme) -> bool + +impl core::fmt::Debug for vortex_compressor::builtins::IntDictScheme + +pub fn 
vortex_compressor::builtins::IntDictScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::builtins::IntDictScheme + +impl core::marker::StructuralPartialEq for vortex_compressor::builtins::IntDictScheme + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::IntDictScheme + +pub fn vortex_compressor::builtins::IntDictScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::IntDictScheme::compress(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::IntDictScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::IntDictScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::IntDictScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::IntDictScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::IntDictScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::IntDictScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::IntDictScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +pub struct vortex_compressor::builtins::StringConstantScheme + +impl core::clone::Clone for vortex_compressor::builtins::StringConstantScheme + +pub fn vortex_compressor::builtins::StringConstantScheme::clone(&self) -> vortex_compressor::builtins::StringConstantScheme + +impl core::cmp::Eq for vortex_compressor::builtins::StringConstantScheme + +impl core::cmp::PartialEq for 
vortex_compressor::builtins::StringConstantScheme + +pub fn vortex_compressor::builtins::StringConstantScheme::eq(&self, other: &vortex_compressor::builtins::StringConstantScheme) -> bool + +impl core::fmt::Debug for vortex_compressor::builtins::StringConstantScheme + +pub fn vortex_compressor::builtins::StringConstantScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::builtins::StringConstantScheme + +impl core::marker::StructuralPartialEq for vortex_compressor::builtins::StringConstantScheme + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::StringConstantScheme + +pub fn vortex_compressor::builtins::StringConstantScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::StringConstantScheme::compress(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::StringConstantScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::StringConstantScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::StringConstantScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::StringConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::StringConstantScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::StringConstantScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::StringConstantScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +pub struct 
vortex_compressor::builtins::StringDictScheme + +impl core::clone::Clone for vortex_compressor::builtins::StringDictScheme + +pub fn vortex_compressor::builtins::StringDictScheme::clone(&self) -> vortex_compressor::builtins::StringDictScheme + +impl core::cmp::Eq for vortex_compressor::builtins::StringDictScheme + +impl core::cmp::PartialEq for vortex_compressor::builtins::StringDictScheme + +pub fn vortex_compressor::builtins::StringDictScheme::eq(&self, other: &vortex_compressor::builtins::StringDictScheme) -> bool + +impl core::fmt::Debug for vortex_compressor::builtins::StringDictScheme + +pub fn vortex_compressor::builtins::StringDictScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::builtins::StringDictScheme + +impl core::marker::StructuralPartialEq for vortex_compressor::builtins::StringDictScheme + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::StringDictScheme + +pub fn vortex_compressor::builtins::StringDictScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::StringDictScheme::compress(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::StringDictScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::StringDictScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::StringDictScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::StringDictScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn 
vortex_compressor::builtins::StringDictScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::StringDictScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::StringDictScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +pub fn vortex_compressor::builtins::float_dictionary_encode(stats: &vortex_compressor::stats::FloatStats) -> vortex_array::arrays::dict::array::DictArray + +pub fn vortex_compressor::builtins::integer_dictionary_encode(stats: &vortex_compressor::stats::IntegerStats) -> vortex_array::arrays::dict::array::DictArray + +pub fn vortex_compressor::builtins::is_float_primitive(canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::is_integer_primitive(canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::is_utf8_string(canonical: &vortex_array::canonical::Canonical) -> bool + +pub mod vortex_compressor::ctx + +pub struct vortex_compressor::ctx::CompressorContext + +impl vortex_compressor::ctx::CompressorContext + +pub fn vortex_compressor::ctx::CompressorContext::as_leaf(self) -> Self + +pub fn vortex_compressor::ctx::CompressorContext::as_sample(self) -> Self + +pub fn vortex_compressor::ctx::CompressorContext::cascade_history(&self) -> &[(vortex_compressor::scheme::SchemeId, usize)] + +pub fn vortex_compressor::ctx::CompressorContext::finished_cascading(&self) -> bool + +pub fn vortex_compressor::ctx::CompressorContext::is_sample(&self) -> bool + +pub fn vortex_compressor::ctx::CompressorContext::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +pub fn vortex_compressor::ctx::CompressorContext::with_stats_options(self, opts: vortex_compressor::stats::GenerateStatsOptions) -> Self + +impl core::clone::Clone for vortex_compressor::ctx::CompressorContext + +pub fn vortex_compressor::ctx::CompressorContext::clone(&self) -> vortex_compressor::ctx::CompressorContext + +impl 
core::fmt::Debug for vortex_compressor::ctx::CompressorContext + +pub fn vortex_compressor::ctx::CompressorContext::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub const vortex_compressor::ctx::MAX_CASCADE: usize + +pub mod vortex_compressor::scheme + +pub enum vortex_compressor::scheme::ChildSelection + +pub vortex_compressor::scheme::ChildSelection::All + +pub vortex_compressor::scheme::ChildSelection::Many(&'static [usize]) + +pub vortex_compressor::scheme::ChildSelection::One(usize) + +impl vortex_compressor::scheme::ChildSelection + +pub fn vortex_compressor::scheme::ChildSelection::contains(&self, child_index: usize) -> bool + +impl core::clone::Clone for vortex_compressor::scheme::ChildSelection + +pub fn vortex_compressor::scheme::ChildSelection::clone(&self) -> vortex_compressor::scheme::ChildSelection + +impl core::fmt::Debug for vortex_compressor::scheme::ChildSelection + +pub fn vortex_compressor::scheme::ChildSelection::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::scheme::ChildSelection + +pub struct vortex_compressor::scheme::AncestorExclusion + +pub vortex_compressor::scheme::AncestorExclusion::ancestor: vortex_compressor::scheme::SchemeId + +pub vortex_compressor::scheme::AncestorExclusion::children: vortex_compressor::scheme::ChildSelection + +impl core::clone::Clone for vortex_compressor::scheme::AncestorExclusion + +pub fn vortex_compressor::scheme::AncestorExclusion::clone(&self) -> vortex_compressor::scheme::AncestorExclusion + +impl core::fmt::Debug for vortex_compressor::scheme::AncestorExclusion + +pub fn vortex_compressor::scheme::AncestorExclusion::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::scheme::AncestorExclusion + +pub struct vortex_compressor::scheme::DescendantExclusion + +pub vortex_compressor::scheme::DescendantExclusion::children: 
vortex_compressor::scheme::ChildSelection + +pub vortex_compressor::scheme::DescendantExclusion::excluded: vortex_compressor::scheme::SchemeId + +impl core::clone::Clone for vortex_compressor::scheme::DescendantExclusion + +pub fn vortex_compressor::scheme::DescendantExclusion::clone(&self) -> vortex_compressor::scheme::DescendantExclusion + +impl core::fmt::Debug for vortex_compressor::scheme::DescendantExclusion + +pub fn vortex_compressor::scheme::DescendantExclusion::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::scheme::DescendantExclusion + +pub struct vortex_compressor::scheme::SchemeId + +impl core::clone::Clone for vortex_compressor::scheme::SchemeId + +pub fn vortex_compressor::scheme::SchemeId::clone(&self) -> vortex_compressor::scheme::SchemeId + +impl core::cmp::Eq for vortex_compressor::scheme::SchemeId + +impl core::cmp::PartialEq for vortex_compressor::scheme::SchemeId + +pub fn vortex_compressor::scheme::SchemeId::eq(&self, other: &vortex_compressor::scheme::SchemeId) -> bool + +impl core::fmt::Debug for vortex_compressor::scheme::SchemeId + +pub fn vortex_compressor::scheme::SchemeId::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::fmt::Display for vortex_compressor::scheme::SchemeId + +pub fn vortex_compressor::scheme::SchemeId::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::hash::Hash for vortex_compressor::scheme::SchemeId + +pub fn vortex_compressor::scheme::SchemeId::hash<__H: core::hash::Hasher>(&self, state: &mut __H) + +impl core::marker::Copy for vortex_compressor::scheme::SchemeId + +impl core::marker::StructuralPartialEq for vortex_compressor::scheme::SchemeId + +pub trait vortex_compressor::scheme::Scheme: core::fmt::Debug + core::marker::Send + core::marker::Sync + +pub fn vortex_compressor::scheme::Scheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn 
vortex_compressor::scheme::Scheme::compress(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::scheme::Scheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::scheme::Scheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::scheme::Scheme::expected_compression_ratio(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::scheme::Scheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::scheme::Scheme::num_children(&self) -> usize + +pub fn vortex_compressor::scheme::Scheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::scheme::Scheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::FloatConstantScheme + +pub fn vortex_compressor::builtins::FloatConstantScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::FloatConstantScheme::compress(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::FloatConstantScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::FloatConstantScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::FloatConstantScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub 
fn vortex_compressor::builtins::FloatConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::FloatConstantScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::FloatConstantScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::FloatConstantScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::FloatDictScheme + +pub fn vortex_compressor::builtins::FloatDictScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::FloatDictScheme::compress(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::FloatDictScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::FloatDictScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::FloatDictScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::FloatDictScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::FloatDictScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::FloatDictScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::FloatDictScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::IntConstantScheme + +pub fn vortex_compressor::builtins::IntConstantScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn 
vortex_compressor::builtins::IntConstantScheme::compress(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::IntConstantScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::IntConstantScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::IntConstantScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::IntConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::IntConstantScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::IntConstantScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::IntConstantScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::IntDictScheme + +pub fn vortex_compressor::builtins::IntDictScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::IntDictScheme::compress(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::IntDictScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::IntDictScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::IntDictScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: 
vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::IntDictScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::IntDictScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::IntDictScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::IntDictScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::StringConstantScheme + +pub fn vortex_compressor::builtins::StringConstantScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::StringConstantScheme::compress(&self, _compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::StringConstantScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::StringConstantScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::StringConstantScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::StringConstantScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::StringConstantScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::StringConstantScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::StringConstantScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +impl vortex_compressor::scheme::Scheme for vortex_compressor::builtins::StringDictScheme + +pub fn 
vortex_compressor::builtins::StringDictScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::StringDictScheme::compress(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::StringDictScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_compressor::builtins::StringDictScheme::detects_constant(&self) -> bool + +pub fn vortex_compressor::builtins::StringDictScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::CascadingCompressor, data: &mut vortex_compressor::stats::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_compressor::builtins::StringDictScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_compressor::builtins::StringDictScheme::num_children(&self) -> usize + +pub fn vortex_compressor::builtins::StringDictScheme::scheme_name(&self) -> &'static str + +pub fn vortex_compressor::builtins::StringDictScheme::stats_options(&self) -> vortex_compressor::stats::GenerateStatsOptions + +pub trait vortex_compressor::scheme::SchemeExt: vortex_compressor::scheme::Scheme + +pub fn vortex_compressor::scheme::SchemeExt::id(&self) -> vortex_compressor::scheme::SchemeId + +impl vortex_compressor::scheme::SchemeExt for T + +pub fn T::id(&self) -> vortex_compressor::scheme::SchemeId + +pub fn vortex_compressor::scheme::estimate_compression_ratio_with_sampling(scheme: &S, compressor: &vortex_compressor::CascadingCompressor, array: &vortex_array::array::ArrayRef, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub mod vortex_compressor::stats + +pub enum vortex_compressor::stats::FloatErasedStats + +pub vortex_compressor::stats::FloatErasedStats::F16(vortex_compressor::stats::FloatTypedStats) + 
+pub vortex_compressor::stats::FloatErasedStats::F32(vortex_compressor::stats::FloatTypedStats) + +pub vortex_compressor::stats::FloatErasedStats::F64(vortex_compressor::stats::FloatTypedStats) + +impl core::clone::Clone for vortex_compressor::stats::FloatErasedStats + +pub fn vortex_compressor::stats::FloatErasedStats::clone(&self) -> vortex_compressor::stats::FloatErasedStats + +impl core::convert::From> for vortex_compressor::stats::FloatErasedStats + +pub fn vortex_compressor::stats::FloatErasedStats::from(typed: vortex_compressor::stats::FloatTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::FloatErasedStats + +pub fn vortex_compressor::stats::FloatErasedStats::from(typed: vortex_compressor::stats::FloatTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::FloatErasedStats + +pub fn vortex_compressor::stats::FloatErasedStats::from(typed: vortex_compressor::stats::FloatTypedStats) -> Self + +impl core::fmt::Debug for vortex_compressor::stats::FloatErasedStats + +pub fn vortex_compressor::stats::FloatErasedStats::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub enum vortex_compressor::stats::IntegerErasedStats + +pub vortex_compressor::stats::IntegerErasedStats::I16(vortex_compressor::stats::IntegerTypedStats) + +pub vortex_compressor::stats::IntegerErasedStats::I32(vortex_compressor::stats::IntegerTypedStats) + +pub vortex_compressor::stats::IntegerErasedStats::I64(vortex_compressor::stats::IntegerTypedStats) + +pub vortex_compressor::stats::IntegerErasedStats::I8(vortex_compressor::stats::IntegerTypedStats) + +pub vortex_compressor::stats::IntegerErasedStats::U16(vortex_compressor::stats::IntegerTypedStats) + +pub vortex_compressor::stats::IntegerErasedStats::U32(vortex_compressor::stats::IntegerTypedStats) + +pub vortex_compressor::stats::IntegerErasedStats::U64(vortex_compressor::stats::IntegerTypedStats) + +pub 
vortex_compressor::stats::IntegerErasedStats::U8(vortex_compressor::stats::IntegerTypedStats) + +impl vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::distinct_count(&self) -> core::option::Option + +pub fn vortex_compressor::stats::IntegerErasedStats::max_ilog2(&self) -> core::option::Option + +pub fn vortex_compressor::stats::IntegerErasedStats::max_minus_min(&self) -> u64 + +pub fn vortex_compressor::stats::IntegerErasedStats::min_is_negative(&self) -> bool + +pub fn vortex_compressor::stats::IntegerErasedStats::min_is_zero(&self) -> bool + +pub fn vortex_compressor::stats::IntegerErasedStats::most_frequent_value_and_count(&self) -> core::option::Option<(vortex_array::scalar::typed_view::primitive::pvalue::PValue, u32)> + +impl core::clone::Clone for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::clone(&self) -> vortex_compressor::stats::IntegerErasedStats + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl 
core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::fmt::Debug for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub struct vortex_compressor::stats::ArrayAndStats + +impl vortex_compressor::stats::ArrayAndStats + +pub fn vortex_compressor::stats::ArrayAndStats::array(&self) -> &vortex_array::array::ArrayRef + +pub fn vortex_compressor::stats::ArrayAndStats::float_stats(&mut self) -> &vortex_compressor::stats::FloatStats + +pub fn vortex_compressor::stats::ArrayAndStats::get_or_insert_with(&mut self, f: impl core::ops::function::FnOnce() -> T) -> &T + +pub fn vortex_compressor::stats::ArrayAndStats::integer_stats(&mut self) -> &vortex_compressor::stats::IntegerStats + +pub fn vortex_compressor::stats::ArrayAndStats::into_array(self) -> vortex_array::array::ArrayRef + +pub fn vortex_compressor::stats::ArrayAndStats::new(array: vortex_array::array::ArrayRef, opts: vortex_compressor::stats::GenerateStatsOptions) -> Self + +pub fn vortex_compressor::stats::ArrayAndStats::string_stats(&mut self) -> &vortex_compressor::stats::StringStats + +pub struct vortex_compressor::stats::FloatDistinctInfo + +impl vortex_compressor::stats::FloatDistinctInfo + +pub fn vortex_compressor::stats::FloatDistinctInfo::distinct_values(&self) -> &vortex_utils::aliases::hash_set::HashSet, rustc_hash::FxBuildHasher> + +impl 
core::clone::Clone for vortex_compressor::stats::FloatDistinctInfo + +pub fn vortex_compressor::stats::FloatDistinctInfo::clone(&self) -> vortex_compressor::stats::FloatDistinctInfo + +impl core::fmt::Debug for vortex_compressor::stats::FloatDistinctInfo + +pub fn vortex_compressor::stats::FloatDistinctInfo::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub struct vortex_compressor::stats::FloatStats + +impl vortex_compressor::stats::FloatStats + +pub fn vortex_compressor::stats::FloatStats::average_run_length(&self) -> u32 + +pub fn vortex_compressor::stats::FloatStats::erased(&self) -> &vortex_compressor::stats::FloatErasedStats + +pub fn vortex_compressor::stats::FloatStats::generate(input: &vortex_array::arrays::primitive::array::PrimitiveArray) -> Self + +pub fn vortex_compressor::stats::FloatStats::generate_opts(input: &vortex_array::arrays::primitive::array::PrimitiveArray, opts: vortex_compressor::stats::GenerateStatsOptions) -> Self + +pub fn vortex_compressor::stats::FloatStats::null_count(&self) -> u32 + +pub fn vortex_compressor::stats::FloatStats::source(&self) -> &vortex_array::arrays::primitive::array::PrimitiveArray + +pub fn vortex_compressor::stats::FloatStats::value_count(&self) -> u32 + +impl vortex_compressor::stats::FloatStats + +pub fn vortex_compressor::stats::FloatStats::distinct_count(&self) -> core::option::Option + +impl core::clone::Clone for vortex_compressor::stats::FloatStats + +pub fn vortex_compressor::stats::FloatStats::clone(&self) -> vortex_compressor::stats::FloatStats + +impl core::fmt::Debug for vortex_compressor::stats::FloatStats + +pub fn vortex_compressor::stats::FloatStats::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub struct vortex_compressor::stats::FloatTypedStats + +impl vortex_compressor::stats::FloatTypedStats + +pub fn vortex_compressor::stats::FloatTypedStats::distinct(&self) -> core::option::Option<&vortex_compressor::stats::FloatDistinctInfo> + +impl 
core::convert::From> for vortex_compressor::stats::FloatErasedStats + +pub fn vortex_compressor::stats::FloatErasedStats::from(typed: vortex_compressor::stats::FloatTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::FloatErasedStats + +pub fn vortex_compressor::stats::FloatErasedStats::from(typed: vortex_compressor::stats::FloatTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::FloatErasedStats + +pub fn vortex_compressor::stats::FloatErasedStats::from(typed: vortex_compressor::stats::FloatTypedStats) -> Self + +impl core::clone::Clone for vortex_compressor::stats::FloatTypedStats + +pub fn vortex_compressor::stats::FloatTypedStats::clone(&self) -> vortex_compressor::stats::FloatTypedStats + +impl core::fmt::Debug for vortex_compressor::stats::FloatTypedStats + +pub fn vortex_compressor::stats::FloatTypedStats::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub struct vortex_compressor::stats::GenerateStatsOptions + +pub vortex_compressor::stats::GenerateStatsOptions::count_distinct_values: bool + +impl vortex_compressor::stats::GenerateStatsOptions + +pub fn vortex_compressor::stats::GenerateStatsOptions::merge(self, other: Self) -> Self + +impl core::clone::Clone for vortex_compressor::stats::GenerateStatsOptions + +pub fn vortex_compressor::stats::GenerateStatsOptions::clone(&self) -> vortex_compressor::stats::GenerateStatsOptions + +impl core::default::Default for vortex_compressor::stats::GenerateStatsOptions + +pub fn vortex_compressor::stats::GenerateStatsOptions::default() -> vortex_compressor::stats::GenerateStatsOptions + +impl core::fmt::Debug for vortex_compressor::stats::GenerateStatsOptions + +pub fn vortex_compressor::stats::GenerateStatsOptions::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_compressor::stats::GenerateStatsOptions + +pub struct vortex_compressor::stats::IntegerDistinctInfo + +impl 
vortex_compressor::stats::IntegerDistinctInfo + +pub fn vortex_compressor::stats::IntegerDistinctInfo::distinct_values(&self) -> &vortex_utils::aliases::hash_map::HashMap, u32, rustc_hash::FxBuildHasher> + +impl core::clone::Clone for vortex_compressor::stats::IntegerDistinctInfo + +pub fn vortex_compressor::stats::IntegerDistinctInfo::clone(&self) -> vortex_compressor::stats::IntegerDistinctInfo + +impl core::fmt::Debug for vortex_compressor::stats::IntegerDistinctInfo + +pub fn vortex_compressor::stats::IntegerDistinctInfo::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub struct vortex_compressor::stats::IntegerStats + +impl vortex_compressor::stats::IntegerStats + +pub fn vortex_compressor::stats::IntegerStats::average_run_length(&self) -> u32 + +pub fn vortex_compressor::stats::IntegerStats::erased(&self) -> &vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerStats::generate(input: &vortex_array::arrays::primitive::array::PrimitiveArray) -> Self + +pub fn vortex_compressor::stats::IntegerStats::generate_opts(input: &vortex_array::arrays::primitive::array::PrimitiveArray, opts: vortex_compressor::stats::GenerateStatsOptions) -> Self + +pub fn vortex_compressor::stats::IntegerStats::null_count(&self) -> u32 + +pub fn vortex_compressor::stats::IntegerStats::source(&self) -> &vortex_array::arrays::primitive::array::PrimitiveArray + +pub fn vortex_compressor::stats::IntegerStats::value_count(&self) -> u32 + +impl vortex_compressor::stats::IntegerStats + +pub fn vortex_compressor::stats::IntegerStats::distinct_count(&self) -> core::option::Option + +pub fn vortex_compressor::stats::IntegerStats::most_frequent_value_and_count(&self) -> core::option::Option<(vortex_array::scalar::typed_view::primitive::pvalue::PValue, u32)> + +impl core::clone::Clone for vortex_compressor::stats::IntegerStats + +pub fn vortex_compressor::stats::IntegerStats::clone(&self) -> vortex_compressor::stats::IntegerStats + +impl 
core::fmt::Debug for vortex_compressor::stats::IntegerStats + +pub fn vortex_compressor::stats::IntegerStats::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub struct vortex_compressor::stats::IntegerTypedStats + +impl vortex_compressor::stats::IntegerTypedStats + +pub fn vortex_compressor::stats::IntegerTypedStats::distinct(&self) -> core::option::Option<&vortex_compressor::stats::IntegerDistinctInfo> + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::convert::From> for vortex_compressor::stats::IntegerErasedStats + +pub fn vortex_compressor::stats::IntegerErasedStats::from(typed: 
vortex_compressor::stats::IntegerTypedStats) -> Self + +impl core::clone::Clone for vortex_compressor::stats::IntegerTypedStats + +pub fn vortex_compressor::stats::IntegerTypedStats::clone(&self) -> vortex_compressor::stats::IntegerTypedStats + +impl core::fmt::Debug for vortex_compressor::stats::IntegerTypedStats + +pub fn vortex_compressor::stats::IntegerTypedStats::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub struct vortex_compressor::stats::StringStats + +impl vortex_compressor::stats::StringStats + +pub fn vortex_compressor::stats::StringStats::estimated_distinct_count(&self) -> core::option::Option + +pub fn vortex_compressor::stats::StringStats::generate(input: &vortex_array::arrays::varbinview::array::VarBinViewArray) -> Self + +pub fn vortex_compressor::stats::StringStats::generate_opts(input: &vortex_array::arrays::varbinview::array::VarBinViewArray, opts: vortex_compressor::stats::GenerateStatsOptions) -> Self + +pub fn vortex_compressor::stats::StringStats::null_count(&self) -> u32 + +pub fn vortex_compressor::stats::StringStats::source(&self) -> &vortex_array::arrays::varbinview::array::VarBinViewArray + +pub fn vortex_compressor::stats::StringStats::value_count(&self) -> u32 + +impl core::clone::Clone for vortex_compressor::stats::StringStats + +pub fn vortex_compressor::stats::StringStats::clone(&self) -> vortex_compressor::stats::StringStats + +impl core::fmt::Debug for vortex_compressor::stats::StringStats + +pub fn vortex_compressor::stats::StringStats::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +pub struct vortex_compressor::CascadingCompressor + +impl vortex_compressor::CascadingCompressor + +pub fn vortex_compressor::CascadingCompressor::compress(&self, array: &vortex_array::array::ArrayRef) -> vortex_error::VortexResult + +pub fn vortex_compressor::CascadingCompressor::compress_child(&self, child: &vortex_array::array::ArrayRef, parent_ctx: &vortex_compressor::ctx::CompressorContext, 
parent_id: vortex_compressor::scheme::SchemeId, child_index: usize) -> vortex_error::VortexResult + +pub fn vortex_compressor::CascadingCompressor::execution_ctx(&self) -> parking_lot::mutex::MutexGuard<'_, vortex_array::executor::ExecutionCtx> + +pub fn vortex_compressor::CascadingCompressor::new(schemes: alloc::vec::Vec<&'static dyn vortex_compressor::scheme::Scheme>) -> Self + +impl core::clone::Clone for vortex_compressor::CascadingCompressor + +pub fn vortex_compressor::CascadingCompressor::clone(&self) -> vortex_compressor::CascadingCompressor + +impl core::fmt::Debug for vortex_compressor::CascadingCompressor + +pub fn vortex_compressor::CascadingCompressor::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result diff --git a/vortex-compressor/src/builtins/constant.rs b/vortex-compressor/src/builtins/constant.rs new file mode 100644 index 00000000000..178f67e3e9d --- /dev/null +++ b/vortex-compressor/src/builtins/constant.rs @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Constant encoding schemes for integer, float, and string arrays. + +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_array::IntoArray; +use vortex_array::aggregate_fn::fns::is_constant::is_constant; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::MaskedArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::scalar::Scalar; +use vortex_array::vtable::ValidityHelper; +use vortex_error::VortexResult; + +use super::is_float_primitive; +use super::is_integer_primitive; +use super::is_utf8_string; +use crate::CascadingCompressor; +use crate::ctx::CompressorContext; +use crate::scheme::Scheme; +use crate::stats::ArrayAndStats; + +/// Constant encoding for integer arrays with a single distinct value. 
+#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct IntConstantScheme; + +impl Scheme for IntConstantScheme { + fn scheme_name(&self) -> &'static str { + "vortex.int.constant" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_integer_primitive(canonical) + } + + fn detects_constant(&self) -> bool { + true + } + + fn expected_compression_ratio( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + if ctx.is_sample() { + return Ok(0.0); + } + + let stats = data.integer_stats(); + + if stats.distinct_count().is_none_or(|count| count > 1) { + return Ok(0.0); + } + + Ok(stats.value_count() as f64) + } + + fn compress( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + let source = data.integer_stats().source().clone(); + compress_constant_primitive(&source) + } +} + +/// Constant encoding for float arrays with a single distinct value. 
+#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct FloatConstantScheme; + +impl Scheme for FloatConstantScheme { + fn scheme_name(&self) -> &'static str { + "vortex.float.constant" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_float_primitive(canonical) + } + + fn detects_constant(&self) -> bool { + true + } + + fn expected_compression_ratio( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + if ctx.is_sample() { + return Ok(0.0); + } + + let stats = data.float_stats(); + + if stats.null_count() as usize == stats.source().len() || stats.value_count() == 0 { + return Ok(0.0); + } + + if stats.distinct_count().is_some_and(|count| count == 1) { + return Ok(stats.value_count() as f64); + } + + Ok(0.0) + } + + fn compress( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + let source = data.float_stats().source().clone(); + compress_constant_primitive(&source) + } +} + +/// Constant encoding for string arrays with a single distinct value. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct StringConstantScheme; + +impl Scheme for StringConstantScheme { + fn scheme_name(&self) -> &'static str { + "vortex.string.constant" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_utf8_string(canonical) + } + + fn detects_constant(&self) -> bool { + true + } + + fn expected_compression_ratio( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + if ctx.is_sample() { + return Ok(0.0); + } + + let stats = data.string_stats(); + + if stats.estimated_distinct_count().is_none_or(|c| c > 1) + || !is_constant( + &stats.source().clone().into_array(), + &mut compressor.execution_ctx(), + )? + { + return Ok(0.0); + } + + // Force constant in these cases. 
+ Ok(f64::MAX) + } + + fn compress( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + let stats = data.string_stats(); + + let scalar_idx = + (0..stats.source().len()).position(|idx| stats.source().is_valid(idx).unwrap_or(false)); + + match scalar_idx { + Some(idx) => { + let scalar = stats.source().scalar_at(idx)?; + let const_arr = ConstantArray::new(scalar, stats.source().len()).into_array(); + if !stats.source().all_valid()? { + Ok( + MaskedArray::try_new(const_arr, stats.source().validity().clone())? + .into_array(), + ) + } else { + Ok(const_arr) + } + } + None => Ok(ConstantArray::new( + Scalar::null(stats.source().dtype().clone()), + stats.source().len(), + ) + .into_array()), + } + } +} + +/// Shared helper for compressing a constant primitive array (int or float). +fn compress_constant_primitive(source: &PrimitiveArray) -> VortexResult { + let scalar_idx = (0..source.len()).position(|idx| source.is_valid(idx).unwrap_or(false)); + + match scalar_idx { + Some(idx) => { + let scalar = source.scalar_at(idx)?; + let const_arr = ConstantArray::new(scalar, source.len()).into_array(); + if !source.all_valid()? { + Ok(MaskedArray::try_new(const_arr, source.validity().clone())?.into_array()) + } else { + Ok(const_arr) + } + } + None => { + Ok(ConstantArray::new(Scalar::null(source.dtype().clone()), source.len()).into_array()) + } + } +} diff --git a/vortex-compressor/src/builtins/dict/float.rs b/vortex-compressor/src/builtins/dict/float.rs new file mode 100644 index 00000000000..d9a7af35e16 --- /dev/null +++ b/vortex-compressor/src/builtins/dict/float.rs @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Float-specific dictionary encoding implementation. +//! +//! Vortex encoders must always produce unsigned integer codes; signed codes are only accepted for +//! external compatibility. 
+ +use vortex_array::IntoArray; +use vortex_array::arrays::DictArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::dtype::half::f16; +use vortex_array::validity::Validity; +use vortex_array::vtable::ValidityHelper; +use vortex_buffer::Buffer; +use vortex_error::VortexExpect; + +use crate::stats::FloatErasedStats; +use crate::stats::FloatStats; + +/// Encodes a typed float array into a [`DictArray`] using the pre-computed distinct values. +macro_rules! typed_encode { + ($stats:ident, $typed:ident, $validity:ident, $typ:ty) => {{ + let distinct = $typed.distinct().vortex_expect( + "this must be present since `DictScheme` declared that we need distinct values", + ); + + let values: Buffer<$typ> = distinct.distinct_values().iter().map(|x| x.0).collect(); + + let max_code = values.len(); + let codes = if max_code <= u8::MAX as usize { + let buf = >::encode( + &values, + $stats.source().as_slice::<$typ>(), + ); + PrimitiveArray::new(buf, $validity.clone()).into_array() + } else if max_code <= u16::MAX as usize { + let buf = >::encode( + &values, + $stats.source().as_slice::<$typ>(), + ); + PrimitiveArray::new(buf, $validity.clone()).into_array() + } else { + let buf = >::encode( + &values, + $stats.source().as_slice::<$typ>(), + ); + PrimitiveArray::new(buf, $validity.clone()).into_array() + }; + + let values_validity = match $validity { + Validity::NonNullable => Validity::NonNullable, + _ => Validity::AllValid, + }; + let values = PrimitiveArray::new(values, values_validity).into_array(); + + // SAFETY: enforced by the DictEncoder. + unsafe { DictArray::new_unchecked(codes, values).set_all_values_referenced(true) } + }}; +} + +/// Compresses a floating-point array into a dictionary array according to attached stats. 
+pub fn dictionary_encode(stats: &FloatStats) -> DictArray { + let validity = stats.source().validity(); + match stats.erased() { + FloatErasedStats::F16(typed) => typed_encode!(stats, typed, validity, f16), + FloatErasedStats::F32(typed) => typed_encode!(stats, typed, validity, f32), + FloatErasedStats::F64(typed) => typed_encode!(stats, typed, validity, f64), + } +} + +/// Stateless encoder that maps values to dictionary codes via a `HashMap`. +struct DictEncoder; + +/// Trait for encoding values of type `T` into codes of type `I`. +trait Encode { + /// Using the distinct value set, turn the values into a set of codes. + fn encode(distinct: &[T], values: &[T]) -> Buffer; +} + +/// Implements [`Encode`] for a float type using its bit representation as the hash key. +macro_rules! impl_encode { + ($typ:ty, $utyp:ty) => { impl_encode!($typ, $utyp, u8, u16, u32); }; + ($typ:ty, $utyp:ty, $($ityp:ty),+) => { + $( + impl Encode<$typ, $ityp> for DictEncoder { + #[allow(clippy::cast_possible_truncation)] + fn encode(distinct: &[$typ], values: &[$typ]) -> Buffer<$ityp> { + let mut codes = + vortex_utils::aliases::hash_map::HashMap::<$utyp, $ityp>::with_capacity( + distinct.len(), + ); + for (code, &value) in distinct.iter().enumerate() { + codes.insert(value.to_bits(), code as $ityp); + } + + let mut output = vortex_buffer::BufferMut::with_capacity(values.len()); + for value in values { + // Any code lookups which fail are for nulls, so their value does not matter. 
+ output.push(codes.get(&value.to_bits()).copied().unwrap_or_default()); + } + + output.freeze() + } + } + )* + }; +} + +impl_encode!(f16, u16); +impl_encode!(f32, u32); +impl_encode!(f64, u64); + +#[cfg(test)] +mod tests { + use vortex_array::DynArray; + use vortex_array::IntoArray; + use vortex_array::arrays::BoolArray; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::assert_arrays_eq; + use vortex_array::validity::Validity; + use vortex_buffer::buffer; + + use super::dictionary_encode; + use crate::stats::FloatStats; + use crate::stats::GenerateStatsOptions; + + #[test] + fn test_float_dict_encode() { + let values = buffer![1f32, 2f32, 2f32, 0f32, 1f32]; + let validity = + Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()); + let array = PrimitiveArray::new(values, validity); + + let stats = FloatStats::generate_opts( + &array, + GenerateStatsOptions { + count_distinct_values: true, + }, + ); + let dict_array = dictionary_encode(&stats); + assert_eq!(dict_array.values().len(), 2); + assert_eq!(dict_array.codes().len(), 5); + + let expected = PrimitiveArray::new( + buffer![1f32, 2f32, 2f32, 1f32, 1f32], + Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()), + ) + .into_array(); + assert_arrays_eq!(dict_array.as_ref(), expected.as_ref()); + } +} diff --git a/vortex-compressor/src/builtins/dict/integer.rs b/vortex-compressor/src/builtins/dict/integer.rs new file mode 100644 index 00000000000..00ec39ae1a9 --- /dev/null +++ b/vortex-compressor/src/builtins/dict/integer.rs @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Dictionary compressor that reuses the unique values in the [`IntegerStats`]. +//! +//! Vortex encoders must always produce unsigned integer codes; signed codes are only accepted +//! for external compatibility. 
+ +use vortex_array::IntoArray; +use vortex_array::arrays::DictArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::validity::Validity; +use vortex_array::vtable::ValidityHelper; +use vortex_buffer::Buffer; +use vortex_error::VortexExpect; + +use crate::stats::IntegerErasedStats; +use crate::stats::IntegerStats; + +/// Encodes a typed integer array into a [`DictArray`] using the pre-computed distinct values. +macro_rules! typed_encode { + ($stats:ident, $typed:ident, $validity:ident, $typ:ty) => {{ + let distinct = $typed.distinct().vortex_expect( + "this must be present since `DictScheme` declared that we need distinct values", + ); + + let values: Buffer<$typ> = distinct.distinct_values().keys().map(|x| x.0).collect(); + + let max_code = values.len(); + let codes = if max_code <= u8::MAX as usize { + let buf = >::encode( + &values, + $stats.source().as_slice::<$typ>(), + ); + PrimitiveArray::new(buf, $validity.clone()).into_array() + } else if max_code <= u16::MAX as usize { + let buf = >::encode( + &values, + $stats.source().as_slice::<$typ>(), + ); + PrimitiveArray::new(buf, $validity.clone()).into_array() + } else { + let buf = >::encode( + &values, + $stats.source().as_slice::<$typ>(), + ); + PrimitiveArray::new(buf, $validity.clone()).into_array() + }; + + let values_validity = match $validity { + Validity::NonNullable => Validity::NonNullable, + _ => Validity::AllValid, + }; + + let values = PrimitiveArray::new(values, values_validity).into_array(); + // SAFETY: invariants enforced in DictEncoder. + unsafe { DictArray::new_unchecked(codes, values).set_all_values_referenced(true) } + }}; +} + +/// Compresses an integer array into a dictionary array according to attached stats. 
+#[expect( + clippy::cognitive_complexity, + reason = "complexity from match on all integer types" +)] +pub fn dictionary_encode(stats: &IntegerStats) -> DictArray { + let src_validity = stats.source().validity(); + + match stats.erased() { + IntegerErasedStats::U8(typed) => typed_encode!(stats, typed, src_validity, u8), + IntegerErasedStats::U16(typed) => typed_encode!(stats, typed, src_validity, u16), + IntegerErasedStats::U32(typed) => typed_encode!(stats, typed, src_validity, u32), + IntegerErasedStats::U64(typed) => typed_encode!(stats, typed, src_validity, u64), + IntegerErasedStats::I8(typed) => typed_encode!(stats, typed, src_validity, i8), + IntegerErasedStats::I16(typed) => typed_encode!(stats, typed, src_validity, i16), + IntegerErasedStats::I32(typed) => typed_encode!(stats, typed, src_validity, i32), + IntegerErasedStats::I64(typed) => typed_encode!(stats, typed, src_validity, i64), + } +} + +/// Stateless encoder that maps values to dictionary codes via a `HashMap`. +struct DictEncoder; + +/// Trait for encoding values of type `T` into codes of type `I`. +trait Encode { + /// Using the distinct value set, turn the values into a set of codes. + fn encode(distinct: &[T], values: &[T]) -> Buffer; +} + +/// Implements [`Encode`] for an integer type with all code width variants (u8, u16, u32). +macro_rules! impl_encode { + ($typ:ty) => { impl_encode!($typ, u8, u16, u32); }; + ($typ:ty, $($ityp:ty),+) => { + $( + impl Encode<$typ, $ityp> for DictEncoder { + #[allow(clippy::cast_possible_truncation)] + fn encode(distinct: &[$typ], values: &[$typ]) -> Buffer<$ityp> { + let mut codes = + vortex_utils::aliases::hash_map::HashMap::<$typ, $ityp>::with_capacity( + distinct.len(), + ); + for (code, &value) in distinct.iter().enumerate() { + codes.insert(value, code as $ityp); + } + + let mut output = vortex_buffer::BufferMut::with_capacity(values.len()); + for value in values { + // Any code lookups which fail are for nulls, so their value does not matter. 
+ // SAFETY: we have exactly sized output to be as large as values. + unsafe { output.push_unchecked(codes.get(value).copied().unwrap_or_default()) }; + } + + output.freeze() + } + } + )* + }; +} + +impl_encode!(u8); +impl_encode!(u16); +impl_encode!(u32); +impl_encode!(u64); +impl_encode!(i8); +impl_encode!(i16); +impl_encode!(i32); +impl_encode!(i64); + +#[cfg(test)] +mod tests { + use vortex_array::DynArray; + use vortex_array::IntoArray; + use vortex_array::arrays::BoolArray; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::assert_arrays_eq; + use vortex_array::validity::Validity; + use vortex_buffer::buffer; + + use super::dictionary_encode; + use crate::stats::IntegerStats; + + #[test] + fn test_dict_encode_integer_stats() { + let data = buffer![100i32, 200, 100, 0, 100]; + let validity = + Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()); + let array = PrimitiveArray::new(data, validity); + + let stats = IntegerStats::generate_opts( + &array, + crate::stats::GenerateStatsOptions { + count_distinct_values: true, + }, + ); + let dict_array = dictionary_encode(&stats); + assert_eq!(dict_array.values().len(), 2); + assert_eq!(dict_array.codes().len(), 5); + + let expected = PrimitiveArray::new( + buffer![100i32, 200, 100, 100, 100], + Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()), + ) + .into_array(); + assert_arrays_eq!(dict_array.as_ref(), expected.as_ref()); + } +} diff --git a/vortex-compressor/src/builtins/dict/mod.rs b/vortex-compressor/src/builtins/dict/mod.rs new file mode 100644 index 00000000000..c8d38dcf56c --- /dev/null +++ b/vortex-compressor/src/builtins/dict/mod.rs @@ -0,0 +1,316 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Dictionary encoding schemes for integer, float, and string arrays. 
+ +pub mod float; +pub mod integer; + +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_array::IntoArray; +use vortex_array::ToCanonical; +use vortex_array::arrays::DictArray; +use vortex_array::builders::dict::dict_encode; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; + +use super::is_float_primitive; +use super::is_integer_primitive; +use super::is_utf8_string; +use crate::CascadingCompressor; +use crate::ctx::CompressorContext; +use crate::scheme::ChildSelection; +use crate::scheme::DescendantExclusion; +use crate::scheme::Scheme; +use crate::scheme::SchemeExt; +use crate::scheme::estimate_compression_ratio_with_sampling; +use crate::stats::ArrayAndStats; +use crate::stats::GenerateStatsOptions; + +/// Dictionary encoding for low-cardinality integer values. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct IntDictScheme; + +impl Scheme for IntDictScheme { + fn scheme_name(&self) -> &'static str { + "vortex.int.dict" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_integer_primitive(canonical) + } + + fn stats_options(&self) -> GenerateStatsOptions { + GenerateStatsOptions { + count_distinct_values: true, + } + } + + /// Children: values=0, codes=1. + fn num_children(&self) -> usize { + 2 + } + + fn expected_compression_ratio( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + let stats = data.integer_stats(); + + if stats.value_count() == 0 { + return Ok(0.0); + } + + let distinct_values_count = stats.distinct_count().vortex_expect( + "this must be present since `DictScheme` declared that we need distinct values", + ); + + // If > 50% of the values are distinct, skip dict. + if distinct_values_count > stats.value_count() / 2 { + return Ok(0.0); + } + + // Ignore nulls encoding for the estimate. We only focus on values. 
+ let values_size = stats.source().ptype().bit_width() * distinct_values_count as usize; + + // Assume codes are compressed RLE + BitPacking. + let codes_bw = usize::BITS - distinct_values_count.leading_zeros(); + + let n_runs = (stats.value_count() / stats.average_run_length()) as usize; + + // Assume that codes will either be BitPack or RLE-BitPack. + let codes_size_bp = (codes_bw * stats.value_count()) as usize; + let codes_size_rle_bp = usize::checked_mul((codes_bw + 32) as usize, n_runs); + + let codes_size = usize::min(codes_size_bp, codes_size_rle_bp.unwrap_or(usize::MAX)); + + let before = stats.value_count() as usize * stats.source().ptype().bit_width(); + + Ok(before as f64 / (values_size + codes_size) as f64) + } + + fn compress( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + let stats = data.integer_stats(); + + let dict = integer::dictionary_encode(stats); + + // Values = child 0. + let compressed_values = compressor.compress_child(dict.values(), &ctx, self.id(), 0)?; + + // Codes = child 1. + let compressed_codes = compressor.compress_child( + &dict.codes().to_primitive().narrow()?.into_array(), + &ctx, + self.id(), + 1, + )?; + + // SAFETY: compressing codes does not change their values. + unsafe { + Ok( + DictArray::new_unchecked(compressed_codes, compressed_values) + .set_all_values_referenced(dict.has_all_values_referenced()) + .into_array(), + ) + } + } +} + +/// Dictionary encoding for low-cardinality float values. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct FloatDictScheme; + +impl Scheme for FloatDictScheme { + fn scheme_name(&self) -> &'static str { + "vortex.float.dict" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_float_primitive(canonical) + } + + fn stats_options(&self) -> GenerateStatsOptions { + GenerateStatsOptions { + count_distinct_values: true, + } + } + + /// Children: values=0, codes=1. 
+    fn num_children(&self) -> usize {
+        2
+    }
+
+    /// Float dict codes (child 1) are compact unsigned integers that should not be
+    /// dict-encoded again. Float dict values (child 0) flow through ALP into integer-land,
+    /// where integer dict encoding is redundant since the values are already deduplicated at
+    /// the float level.
+    ///
+    /// Additional exclusions for codes (IntSequenceScheme, IntRunEndScheme, FoRScheme,
+    /// ZigZagScheme, SparseScheme, RLE) are expressed as pull rules on those schemes in
+    /// vortex-btrblocks.
+    fn descendant_exclusions(&self) -> Vec<DescendantExclusion> {
+        vec![
+            DescendantExclusion {
+                excluded: IntDictScheme.id(),
+                children: ChildSelection::One(1),
+            },
+            DescendantExclusion {
+                excluded: IntDictScheme.id(),
+                children: ChildSelection::One(0),
+            },
+        ]
+    }
+
+    fn expected_compression_ratio(
+        &self,
+        compressor: &CascadingCompressor,
+        data: &mut ArrayAndStats,
+        ctx: CompressorContext,
+    ) -> VortexResult<f64> {
+        let stats = data.float_stats();
+
+        if stats.value_count() == 0 {
+            return Ok(0.0);
+        }
+
+        if stats
+            .distinct_count()
+            .is_some_and(|count| count <= stats.value_count() / 2)
+        {
+            return estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx);
+        }
+
+        Ok(0.0)
+    }
+
+    fn compress(
+        &self,
+        compressor: &CascadingCompressor,
+        data: &mut ArrayAndStats,
+        ctx: CompressorContext,
+    ) -> VortexResult<ArrayRef> {
+        let stats = data.float_stats();
+
+        let dict = float::dictionary_encode(stats);
+        let has_all_values_referenced = dict.has_all_values_referenced();
+        // Cascade-compress the dictionary's two children independently below.
+
+        // Values = child 0.
+        let compressed_values = compressor.compress_child(dict.values(), &ctx, self.id(), 0)?;
+
+        // Codes = child 1.
+        let compressed_codes = compressor.compress_child(
+            &dict.codes().to_primitive().narrow()?.into_array(),
+            &ctx,
+            self.id(),
+            1,
+        )?;
+
+        // SAFETY: compressing codes or values does not alter the invariants.
+ unsafe { + Ok( + DictArray::new_unchecked(compressed_codes, compressed_values) + .set_all_values_referenced(has_all_values_referenced) + .into_array(), + ) + } + } +} + +/// Dictionary encoding for low-cardinality string values. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct StringDictScheme; + +impl Scheme for StringDictScheme { + fn scheme_name(&self) -> &'static str { + "vortex.string.dict" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_utf8_string(canonical) + } + + fn stats_options(&self) -> GenerateStatsOptions { + GenerateStatsOptions { + count_distinct_values: true, + } + } + + /// Children: values=0, codes=1. + fn num_children(&self) -> usize { + 2 + } + + /// String dict codes (child 1) are compact unsigned integers that should not be dict-encoded + /// again. + /// + /// Additional exclusions for codes (IntSequenceScheme, FoRScheme, ZigZagScheme, SparseScheme, + /// RunEndScheme, RLE, etc.) are expressed as pull rules on those schemes in `vortex-btrblocks`. + fn descendant_exclusions(&self) -> Vec { + vec![DescendantExclusion { + excluded: IntDictScheme.id(), + children: ChildSelection::One(1), + }] + } + + fn expected_compression_ratio( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + let stats = data.string_stats(); + + if stats + .estimated_distinct_count() + .is_none_or(|c| c > stats.value_count() / 2) + { + return Ok(0.0); + } + + if stats.value_count() == 0 { + return Ok(0.0); + } + + estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) + } + + fn compress( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + let stats = data.string_stats(); + + let dict = dict_encode(&stats.source().clone().into_array())?; + + // Values = child 0. + let compressed_values = compressor.compress_child(dict.values(), &ctx, self.id(), 0)?; + + // Codes = child 1. 
+ let compressed_codes = compressor.compress_child( + &dict.codes().to_primitive().narrow()?.into_array(), + &ctx, + self.id(), + 1, + )?; + + // SAFETY: compressing codes or values does not alter the invariants. + unsafe { + Ok( + DictArray::new_unchecked(compressed_codes, compressed_values) + .set_all_values_referenced(dict.has_all_values_referenced()) + .into_array(), + ) + } + } +} diff --git a/vortex-compressor/src/builtins/mod.rs b/vortex-compressor/src/builtins/mod.rs new file mode 100644 index 00000000000..704453fb40b --- /dev/null +++ b/vortex-compressor/src/builtins/mod.rs @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Built-in compression schemes that use only `vortex-array` encodings. +//! +//! These schemes produce arrays using types already in `vortex-array` ([`ConstantArray`], +//! [`DictArray`], [`MaskedArray`], etc.) and have no external encoding crate dependencies. +//! +//! [`ConstantArray`]: vortex_array::arrays::ConstantArray +//! [`DictArray`]: vortex_array::arrays::DictArray +//! [`MaskedArray`]: vortex_array::arrays::MaskedArray + +pub use constant::FloatConstantScheme; +pub use constant::IntConstantScheme; +pub use constant::StringConstantScheme; +pub use dict::FloatDictScheme; +pub use dict::IntDictScheme; +pub use dict::StringDictScheme; +pub use dict::float::dictionary_encode as float_dictionary_encode; +pub use dict::integer::dictionary_encode as integer_dictionary_encode; + +mod constant; +mod dict; + +use vortex_array::Canonical; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; + +/// Returns `true` if the canonical array is a primitive with an integer ptype. +pub fn is_integer_primitive(canonical: &Canonical) -> bool { + matches!(canonical, Canonical::Primitive(p) if p.ptype().is_int()) +} + +/// Returns `true` if the canonical form represents a floating-point primitive. 
+pub fn is_float_primitive(canonical: &Canonical) -> bool { + matches!(canonical, Canonical::Primitive(p) if !p.ptype().is_int()) +} + +/// Returns `true` if the canonical array is a UTF-8 string type. +pub fn is_utf8_string(canonical: &Canonical) -> bool { + matches!(canonical, + Canonical::VarBinView(v) if + v.dtype().eq_ignore_nullability(&DType::Utf8(Nullability::NonNullable)) + ) +} diff --git a/vortex-compressor/src/compressor.rs b/vortex-compressor/src/compressor.rs new file mode 100644 index 00000000000..aad6cc7e33e --- /dev/null +++ b/vortex-compressor/src/compressor.rs @@ -0,0 +1,525 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Cascading array compression implementation. + +use std::sync::Arc; + +use parking_lot::Mutex; +use parking_lot::MutexGuard; +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_array::CanonicalValidity; +use vortex_array::DynArray; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::LEGACY_SESSION; +use vortex_array::ToCanonical; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::ExtensionArray; +use vortex_array::arrays::FixedSizeListArray; +use vortex_array::arrays::ListArray; +use vortex_array::arrays::ListViewArray; +use vortex_array::arrays::StructArray; +use vortex_array::arrays::listview::list_from_list_view; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::scalar::Scalar; +use vortex_array::vtable::ValidityHelper; +use vortex_error::VortexResult; + +use crate::builtins::IntDictScheme; +use crate::ctx::CompressorContext; +use crate::scheme::ChildSelection; +use crate::scheme::DescendantExclusion; +use crate::scheme::Scheme; +use crate::scheme::SchemeExt; +use crate::scheme::SchemeId; +use crate::stats::ArrayAndStats; +use crate::stats::GenerateStatsOptions; + +/// The implicit root scheme ID for the compressor's 
own cascading (e.g. list offset compression).
+///
+/// This is the **only** [`SchemeId`] that is not auto-provided via [`SchemeExt`].
+const ROOT_SCHEME_ID: SchemeId = SchemeId {
+    name: "vortex.compressor.root",
+};
+
+/// Child indices for the compressor's list/listview compression.
+mod root_list_children {
+    /// List/ListView offsets child.
+    pub const OFFSETS: usize = 1;
+    /// ListView sizes child.
+    pub const SIZES: usize = 2;
+}
+
+/// The main compressor type implementing cascading adaptive compression.
+///
+/// This compressor applies adaptive compression [`Scheme`]s to arrays based on their data types and
+/// characteristics. It recursively compresses nested structures like structs and lists, and chooses
+/// optimal compression schemes for leaf types.
+///
+/// The compressor works by:
+/// 1. Canonicalizing input arrays to a standard representation.
+/// 2. Pre-filtering schemes by [`Scheme::matches`] and exclusion rules.
+/// 3. Evaluating each matching scheme's compression ratio on a sample.
+/// 4. Compressing with the best scheme and verifying the result is smaller.
+///
+/// No scheme may appear twice in a cascade chain. The compressor enforces this automatically
+/// along with push/pull exclusion rules declared by each scheme.
+#[derive(Debug, Clone)]
+pub struct CascadingCompressor {
+    /// The enabled compression schemes.
+    schemes: Vec<&'static dyn Scheme>,
+
+    /// Descendant exclusion rules for the compressor's own cascading (e.g. excluding Dict from
+    /// list offsets).
+    root_exclusions: Vec<DescendantExclusion>,
+
+    /// Shared execution context for array operations during compression.
+    ///
+    /// This should have low contention as we only execute arrays one at a time during compression.
+    ctx: Arc<Mutex<ExecutionCtx>>,
+}
+
+impl CascadingCompressor {
+    /// Creates a new compressor with the given schemes.
+    ///
+    /// Root-level exclusion rules (e.g. excluding Dict from list offsets) are built
+    /// automatically.
+ pub fn new(schemes: Vec<&'static dyn Scheme>) -> Self { + // Root exclusion: exclude IntDict from list/listview offsets (monotonically + // increasing data where dictionary encoding is wasteful). + let root_exclusions = vec![DescendantExclusion { + excluded: IntDictScheme.id(), + children: ChildSelection::One(root_list_children::OFFSETS), + }]; + Self { + schemes, + root_exclusions, + // TODO(connor): The caller should probably pass this in. + ctx: Arc::new(Mutex::new(LEGACY_SESSION.create_execution_ctx())), + } + } + + /// Returns a mutable borrow of the execution context. + pub fn execution_ctx(&self) -> MutexGuard<'_, ExecutionCtx> { + self.ctx.lock() + } + + /// Compresses a child array produced by a cascading scheme. + /// + /// If the cascade budget is exhausted, the canonical array is returned as-is. Otherwise, + /// the child context is created by descending and recording the parent scheme + child + /// index, and compression proceeds normally. + /// + /// # Errors + /// + /// Returns an error if compression fails. + pub fn compress_child( + &self, + child: &ArrayRef, + parent_ctx: &CompressorContext, + parent_id: SchemeId, + child_index: usize, + ) -> VortexResult { + if parent_ctx.finished_cascading() { + return Ok(child.clone()); + } + + let canonical = child + .clone() + .execute::(&mut self.execution_ctx())? + .0; + let compact = canonical.compact()?; + + let child_ctx = parent_ctx + .clone() + .descend_with_scheme(parent_id, child_index); + self.compress_canonical(compact, child_ctx) + } + + /// Compresses an array using cascading adaptive compression. + /// + /// First canonicalizes and compacts the array, then applies optimal compression schemes. + /// + /// # Errors + /// + /// Returns an error if canonicalization or compression fails. + pub fn compress(&self, array: &ArrayRef) -> VortexResult { + let canonical = array + .clone() + .execute::(&mut self.execution_ctx())? 
+ .0; + + // Compact it, removing any wasted space before we attempt to compress it. + let compact = canonical.compact()?; + + self.compress_canonical(compact, CompressorContext::new()) + } + + /// Compresses a canonical array by dispatching to type-specific logic. + /// + /// # Errors + /// + /// Returns an error if compression of any sub-array fails. + fn compress_canonical( + &self, + array: Canonical, + ctx: CompressorContext, + ) -> VortexResult { + match array { + Canonical::Null(null_array) => Ok(null_array.into_array()), + Canonical::Bool(bool_array) => Ok(bool_array.into_array()), + Canonical::Primitive(primitive) => { + self.choose_and_compress(Canonical::Primitive(primitive), ctx) + } + Canonical::Decimal(decimal) => { + self.choose_and_compress(Canonical::Decimal(decimal), ctx) + } + Canonical::Struct(struct_array) => { + let fields = struct_array + .unmasked_fields() + .iter() + .map(|field| self.compress(field)) + .collect::, _>>()?; + + Ok(StructArray::try_new( + struct_array.names().clone(), + fields, + struct_array.len(), + struct_array.validity().clone(), + )? + .into_array()) + } + Canonical::List(list_view_array) => { + if list_view_array.is_zero_copy_to_list() || list_view_array.elements().is_empty() { + let list_array = list_from_list_view(list_view_array)?; + self.compress_list_array(list_array, ctx) + } else { + self.compress_list_view_array(list_view_array, ctx) + } + } + Canonical::FixedSizeList(fsl_array) => { + let compressed_elems = self.compress(fsl_array.elements())?; + + Ok(FixedSizeListArray::try_new( + compressed_elems, + fsl_array.list_size(), + fsl_array.validity().clone(), + fsl_array.len(), + )? + .into_array()) + } + Canonical::VarBinView(strings) => { + if strings + .dtype() + .eq_ignore_nullability(&DType::Utf8(Nullability::NonNullable)) + { + self.choose_and_compress(Canonical::VarBinView(strings), ctx) + } else { + // We do not compress binary arrays. 
+ Ok(strings.into_array()) + } + } + Canonical::Extension(ext_array) => { + let before_nbytes = ext_array.as_ref().nbytes(); + + // Try scheme-based compression first. + let result = + self.choose_and_compress(Canonical::Extension(ext_array.clone()), ctx)?; + if result.nbytes() < before_nbytes { + return Ok(result); + } + + // Otherwise, fall back to compressing the underlying storage array. + let compressed_storage = self.compress(ext_array.storage_array())?; + + Ok( + ExtensionArray::new(ext_array.ext_dtype().clone(), compressed_storage) + .into_array(), + ) + } + } + } + + /// The main scheme-selection entry point for a single leaf array. + /// + /// Filters allowed schemes by [`matches`] and exclusion rules, merges their [`stats_options`] + /// into a single [`GenerateStatsOptions`], then delegates to [`choose_scheme`] to pick the + /// winner by estimated compression ratio. + /// + /// If a winner is found and its compressed output is actually smaller, that output is returned. + /// Otherwise, the original array is returned unchanged. + /// + /// Empty and all-null arrays are short-circuited before any scheme evaluation. + /// + /// [`matches`]: Scheme::matches + /// [`stats_options`]: Scheme::stats_options + /// [`choose_scheme`]: Self::choose_scheme + fn choose_and_compress( + &self, + canonical: Canonical, + ctx: CompressorContext, + ) -> VortexResult { + let eligible_schemes: Vec<&'static dyn Scheme> = self + .schemes + .iter() + .copied() + .filter(|s| s.matches(&canonical) && !self.is_excluded(*s, &ctx)) + .collect(); + + let array: ArrayRef = canonical.into(); + + // If there are no schemes that we can compress into, then just return it uncompressed. + if eligible_schemes.is_empty() { + return Ok(array); + } + + // Nothing to compress if empty or all-null. + if array.is_empty() { + return Ok(array); + } + + if array.all_invalid()? 
{ + return Ok( + ConstantArray::new(Scalar::null(array.dtype().clone()), array.len()).into_array(), + ); + } + + let before_nbytes = array.nbytes(); + let merged_opts = eligible_schemes + .iter() + .fold(GenerateStatsOptions::default(), |acc, s| { + acc.merge(s.stats_options()) + }); + + let ctx = ctx.with_stats_options(merged_opts); + + let mut data = ArrayAndStats::new(array, merged_opts); + + if let Some(winner) = self.choose_scheme(&eligible_schemes, &mut data, ctx.clone())? { + let compressed = winner.compress(self, &mut data, ctx)?; + if compressed.nbytes() < before_nbytes { + return Ok(compressed); + } + } + + // No scheme improved on the original. + Ok(data.into_array()) + } + + /// Calls [`expected_compression_ratio`] on each candidate and returns the scheme with the + /// highest ratio, or `None` if no scheme exceeds 1.0. Ties are broken by registration order + /// (earlier in the list wins). + /// + /// [`expected_compression_ratio`]: Scheme::expected_compression_ratio + fn choose_scheme( + &self, + schemes: &[&'static dyn Scheme], + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult> { + let mut best: Option<(&'static dyn Scheme, f64)> = None; + + for &scheme in schemes { + // Constant detection on a sample is a false positive: the sample being constant + // does not mean the full array is constant. + if ctx.is_sample() && scheme.detects_constant() { + continue; + } + + let ratio = scheme.expected_compression_ratio(self, data, ctx.clone())?; + + tracing::debug!(scheme = %scheme.id(), ratio, "evaluated compression ratio"); + + if is_better_ratio(ratio, &best) { + best = Some((scheme, ratio)); + + // Schemes that return f64::MAX (like Constant) cannot be beat, so stop early. + if ratio == f64::MAX { + break; + } + } + } + + Ok(best.map(|(s, _)| s)) + } + + // TODO(connor): Lots of room for optimization here. + /// Returns `true` if the candidate scheme should be excluded based on the cascade history and + /// exclusion rules. 
+ fn is_excluded(&self, candidate: &dyn Scheme, ctx: &CompressorContext) -> bool { + let id = candidate.id(); + let history = ctx.cascade_history(); + + // Self-exclusion: no scheme appears twice in any chain. + if history.iter().any(|&(sid, _)| sid == id) { + return true; + } + + let mut iter = history.iter().copied().peekable(); + + // The root entry is always first in the history (if present). Check if the root has + // excluded us. + if let Some((_, child_idx)) = iter.next_if(|&(sid, _)| sid == ROOT_SCHEME_ID) + && self + .root_exclusions + .iter() + .any(|rule| rule.excluded == id && rule.children.contains(child_idx)) + { + return true; + } + + // Push rules: Check if any of our ancestors have excluded us. + for (ancestor_id, child_idx) in iter { + if let Some(ancestor) = self.schemes.iter().find(|s| s.id() == ancestor_id) + && ancestor + .descendant_exclusions() + .iter() + .any(|rule| rule.excluded == id && rule.children.contains(child_idx)) + { + return true; + } + } + + // Pull rules: Check if we have excluded ourselves because of our ancestors. + for rule in candidate.ancestor_exclusions() { + if history + .iter() + .any(|(sid, cidx)| *sid == rule.ancestor && rule.children.contains(*cidx)) + { + return true; + } + } + + false + } + + /// Compresses a [`ListArray`] by narrowing offsets and recursively compressing elements. + fn compress_list_array( + &self, + list_array: ListArray, + ctx: CompressorContext, + ) -> VortexResult { + let list_array = list_array.reset_offsets(true)?; + + let compressed_elems = self.compress(list_array.elements())?; + + // Record the root scheme with the offsets child index so root exclusion rules apply. 
+ let offset_ctx = ctx.descend_with_scheme(ROOT_SCHEME_ID, root_list_children::OFFSETS); + let compressed_offsets = self.compress_canonical( + Canonical::Primitive(list_array.offsets().to_primitive().narrow()?), + offset_ctx, + )?; + + Ok(ListArray::try_new( + compressed_elems, + compressed_offsets, + list_array.validity().clone(), + )? + .into_array()) + } + + /// Compresses a [`ListViewArray`] by narrowing offsets/sizes and recursively compressing + /// elements. + fn compress_list_view_array( + &self, + list_view: ListViewArray, + ctx: CompressorContext, + ) -> VortexResult { + let compressed_elems = self.compress(list_view.elements())?; + + let offset_ctx = ctx + .clone() + .descend_with_scheme(ROOT_SCHEME_ID, root_list_children::OFFSETS); + let compressed_offsets = self.compress_canonical( + Canonical::Primitive(list_view.offsets().to_primitive().narrow()?), + offset_ctx, + )?; + + let sizes_ctx = ctx.descend_with_scheme(ROOT_SCHEME_ID, root_list_children::SIZES); + let compressed_sizes = self.compress_canonical( + Canonical::Primitive(list_view.sizes().to_primitive().narrow()?), + sizes_ctx, + )?; + + Ok(ListViewArray::try_new( + compressed_elems, + compressed_offsets, + compressed_sizes, + list_view.validity().clone(), + )? + .into_array()) + } +} + +/// Returns `true` if `ratio` is a valid compression ratio (> 1.0, finite, not subnormal) that +/// beats the current best. 
+fn is_better_ratio(ratio: f64, best: &Option<(&'static dyn Scheme, f64)>) -> bool { + ratio.is_finite() && !ratio.is_subnormal() && ratio > 1.0 && best.is_none_or(|(_, r)| ratio > r) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::builtins::FloatDictScheme; + use crate::builtins::IntDictScheme; + use crate::builtins::StringDictScheme; + use crate::ctx::CompressorContext; + use crate::scheme::SchemeExt; + + fn compressor() -> CascadingCompressor { + CascadingCompressor::new(vec![&IntDictScheme, &FloatDictScheme, &StringDictScheme]) + } + + #[test] + fn test_self_exclusion() { + let c = compressor(); + let ctx = CompressorContext::default().descend_with_scheme(IntDictScheme.id(), 0); + + // IntDictScheme is in the history, so it should be excluded. + assert!(c.is_excluded(&IntDictScheme, &ctx)); + } + + #[test] + fn test_root_exclusion_list_offsets() { + let c = compressor(); + let ctx = CompressorContext::default() + .descend_with_scheme(ROOT_SCHEME_ID, root_list_children::OFFSETS); + + // IntDict should be excluded for list offsets. + assert!(c.is_excluded(&IntDictScheme, &ctx)); + } + + #[test] + fn test_push_rule_float_dict_excludes_int_dict_from_codes() { + let c = compressor(); + // FloatDict cascading through codes (child 1). + let ctx = CompressorContext::default().descend_with_scheme(FloatDictScheme.id(), 1); + + // IntDict should be excluded from FloatDict's codes child. + assert!(c.is_excluded(&IntDictScheme, &ctx)); + } + + #[test] + fn test_push_rule_float_dict_excludes_int_dict_from_values() { + let c = compressor(); + // FloatDict cascading through values (child 0). + let ctx = CompressorContext::default().descend_with_scheme(FloatDictScheme.id(), 0); + + // IntDict should also be excluded from FloatDict's values child (ALP propagation + // replacement). 
+ assert!(c.is_excluded(&IntDictScheme, &ctx)); + } + + #[test] + fn test_no_exclusion_without_history() { + let c = compressor(); + let ctx = CompressorContext::default(); + + // No history means no exclusions. + assert!(!c.is_excluded(&IntDictScheme, &ctx)); + } +} diff --git a/vortex-compressor/src/ctx.rs b/vortex-compressor/src/ctx.rs new file mode 100644 index 00000000000..465a7398350 --- /dev/null +++ b/vortex-compressor/src/ctx.rs @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Compression context for recursive compression. + +use vortex_error::VortexExpect; + +use crate::scheme::SchemeId; +use crate::stats::GenerateStatsOptions; + +// TODO(connor): Why is this 3??? This doesn't seem smart or adaptive. +/// Maximum cascade depth for compression. +pub const MAX_CASCADE: usize = 3; + +/// Context passed through recursive compression calls. +/// +/// Tracks the cascade history (which schemes and child indices have been applied in the current +/// chain) so the compressor can enforce exclusion rules and prevent cycles. +#[derive(Debug, Clone)] +pub struct CompressorContext { + /// Whether we're compressing a sample (for ratio estimation). + is_sample: bool, + /// Remaining cascade depth allowed. + allowed_cascading: usize, + /// Merged stats options from all eligible schemes at this compression site. + stats_options: GenerateStatsOptions, + /// The cascade chain: `(scheme_id, child_index)` pairs from root to current depth. + /// Used for self-exclusion, push rules ([`descendant_exclusions`]), and pull rules + /// ([`ancestor_exclusions`]). + /// + /// [`descendant_exclusions`]: crate::scheme::Scheme::descendant_exclusions + /// [`ancestor_exclusions`]: crate::scheme::Scheme::ancestor_exclusions + cascade_history: Vec<(SchemeId, usize)>, +} + +impl CompressorContext { + /// Creates a new `CompressorContext`. + /// + /// This should **only** be created by the compressor. 
+ pub(super) fn new() -> Self { + Self { + is_sample: false, + allowed_cascading: MAX_CASCADE, + stats_options: GenerateStatsOptions::default(), + cascade_history: Vec::new(), + } + } +} + +#[cfg(test)] +impl Default for CompressorContext { + fn default() -> Self { + Self::new() + } +} + +impl CompressorContext { + /// Whether this context is for sample compression (ratio estimation). + pub fn is_sample(&self) -> bool { + self.is_sample + } + + /// Whether cascading is exhausted (no further cascade levels allowed). + pub fn finished_cascading(&self) -> bool { + self.allowed_cascading == 0 + } + + /// Returns the merged stats generation options for this compression site. + pub fn stats_options(&self) -> GenerateStatsOptions { + self.stats_options + } + + /// Returns a context with the given stats options. + pub fn with_stats_options(mut self, opts: GenerateStatsOptions) -> Self { + self.stats_options = opts; + self + } + + /// Returns a context marked as sample compression. + pub fn as_sample(mut self) -> Self { + self.is_sample = true; + self + } + + /// Returns a context that disallows further cascading. + pub fn as_leaf(mut self) -> Self { + self.allowed_cascading = 0; + self + } + + /// Descends one level in the cascade, recording the current scheme and which child is + /// being compressed. + /// + /// The `child_index` identifies which child of the scheme is being compressed (e.g. for + /// Dict: values=0, codes=1). + pub(crate) fn descend_with_scheme(mut self, id: SchemeId, child_index: usize) -> Self { + self.allowed_cascading = self + .allowed_cascading + .checked_sub(1) + .vortex_expect("cannot descend: cascade depth exhausted"); + self.cascade_history.push((id, child_index)); + self + } + + /// Returns the cascade chain of `(scheme_id, child_index)` pairs. 
+ pub fn cascade_history(&self) -> &[(SchemeId, usize)] { + &self.cascade_history + } +} diff --git a/vortex-compressor/src/lib.rs b/vortex-compressor/src/lib.rs new file mode 100644 index 00000000000..683bea4f8aa --- /dev/null +++ b/vortex-compressor/src/lib.rs @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +#![deny(missing_docs)] +#![warn(clippy::missing_docs_in_private_items)] +#![warn(clippy::missing_errors_doc)] +#![warn(clippy::missing_panics_doc)] +#![warn(clippy::missing_safety_doc)] + +//! Encoding-agnostic compression framework for Vortex arrays. +//! +//! This crate provides the core compression engine: the [`Scheme`](scheme::Scheme) trait, +//! sampling-based ratio estimation, cascaded compression, and statistics infrastructure for +//! deciding the best encoding scheme for an array. +//! +//! This crate contains no encoding dependencies. Batteries-included compressors are provided by +//! downstream crates like `vortex-btrblocks`, which register different encodings to the compressor. + +pub mod builtins; +pub mod ctx; +pub mod scheme; +pub mod stats; + +mod sample; + +mod compressor; +pub use compressor::CascadingCompressor; diff --git a/vortex-compressor/src/sample.rs b/vortex-compressor/src/sample.rs new file mode 100644 index 00000000000..fe6cd5078a9 --- /dev/null +++ b/vortex-compressor/src/sample.rs @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Sampling utilities for compression ratio estimation. + +use rand::RngExt; +use rand::SeedableRng; +use rand::prelude::StdRng; +use vortex_array::ArrayRef; +use vortex_array::DynArray; +use vortex_array::IntoArray; +use vortex_array::arrays::ChunkedArray; +use vortex_error::VortexExpect; + +/// The size of each sampled run. +pub const SAMPLE_SIZE: u32 = 64; + +/// The number of sampled runs. 
+/// +/// # Warning +/// +/// The product of `SAMPLE_SIZE` and `SAMPLE_COUNT` should be (roughly) a multiple of 1024 so that +/// fastlanes bitpacking of sampled vectors does not introduce (large amounts of) padding. +pub const SAMPLE_COUNT: u32 = 16; + +/// Fixed seed for the sampling RNG, ensuring deterministic compression output. +const SAMPLE_SEED: u64 = 1234567890; + +/// Samples approximately 1% of the input array for compression ratio estimation. +pub(crate) fn sample(input: &ArrayRef, sample_size: u32, sample_count: u32) -> ArrayRef { + if input.len() <= (sample_size as usize) * (sample_count as usize) { + return input.to_array(); + } + + let slices = stratified_slices( + input.len(), + sample_size, + sample_count, + &mut StdRng::seed_from_u64(SAMPLE_SEED), + ); + + // For every slice, grab the relevant slice and repack the slices into a new ChunkedArray. + let chunks: Vec<_> = slices + .into_iter() + .map(|(start, end)| { + input + .slice(start..end) + .vortex_expect("slice should succeed") + }) + .collect(); + // SAFETY: all chunks are slices of `input`, so they share its dtype. + unsafe { ChunkedArray::new_unchecked(chunks, input.dtype().clone()) }.into_array() +} + +/// Computes the number of sample chunks to cover approximately 1% of `len` elements, +/// with a minimum of `SAMPLE_SIZE * SAMPLE_COUNT` (1024) values. +pub(crate) fn sample_count_approx_one_percent(len: usize) -> u32 { + let approximately_one_percent = + (len / 100) / usize::try_from(SAMPLE_SIZE).vortex_expect("SAMPLE_SIZE must fit in usize"); + u32::max( + u32::next_multiple_of( + approximately_one_percent + .try_into() + .vortex_expect("sample count must fit in u32"), + 16, + ), + SAMPLE_COUNT, + ) +} + +/// Divides an array into `sample_count` equal partitions and picks one random contiguous +/// slice of `sample_size` elements from each partition.
+/// +/// This is a stratified sampling strategy: instead of drawing all samples from one region, +/// it spreads them evenly across the array so that every part of the data is represented. +/// Each returned `(start, end)` pair is a half-open range into the original array. +/// +/// If the total number of requested samples (`sample_size * sample_count`) is greater than or +/// equal to `length`, a single slice spanning the whole array is returned. +fn stratified_slices( + length: usize, + sample_size: u32, + sample_count: u32, + rng: &mut StdRng, +) -> Vec<(usize, usize)> { + let total_num_samples: usize = (sample_count as usize) * (sample_size as usize); + if total_num_samples >= length { + return vec![(0usize, length)]; + } + + let partitions = partition_indices(length, sample_count); + let num_samples_per_partition: Vec = partition_indices(total_num_samples, sample_count) + .into_iter() + .map(|(start, stop)| stop - start) + .collect(); + + partitions + .into_iter() + .zip(num_samples_per_partition) + .map(|((start, stop), size)| { + assert!( + stop - start >= size, + "Slices must be bigger than their sampled size" + ); + let random_start = rng.random_range(start..=(stop - size)); + (random_start, random_start + size) + }) + .collect() +} + +/// Splits `[0, length)` into `num_partitions` contiguous, non-overlapping slices of +/// approximately equal size. +/// +/// If `length` is not evenly divisible by `num_partitions`, the first +/// `length % num_partitions` slices get one extra element. Each returned `(start, end)` pair +/// is a half-open range. 
+fn partition_indices(length: usize, num_partitions: u32) -> Vec<(usize, usize)> { + let num_long_parts = length % num_partitions as usize; + let short_step = length / num_partitions as usize; + let long_step = short_step + 1; + let long_stop = num_long_parts * long_step; + + (0..long_stop) + .step_by(long_step) + .map(|off| (off, off + long_step)) + .chain( + (long_stop..length) + .step_by(short_step) + .map(|off| (off, off + short_step)), + ) + .collect() +} + +#[cfg(test)] +mod tests { + use vortex_array::IntoArray; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::assert_arrays_eq; + use vortex_array::validity::Validity; + use vortex_buffer::Buffer; + use vortex_error::VortexResult; + + use super::*; + + #[test] + fn sample_is_deterministic() -> VortexResult<()> { + // Create a deterministic array with linear-with-noise pattern + let values: Vec = (0i64..100_000).map(|i| i + (i * 7 + 3) % 11).collect(); + + let array = + PrimitiveArray::new(Buffer::from_iter(values), Validity::NonNullable).into_array(); + + let first = sample(&array, SAMPLE_SIZE, SAMPLE_COUNT); + for _ in 0..10 { + let again = sample(&array, SAMPLE_SIZE, SAMPLE_COUNT); + assert_eq!(first.nbytes(), again.nbytes()); + assert_arrays_eq!(&first, &again); + } + Ok(()) + } +} diff --git a/vortex-compressor/src/scheme.rs b/vortex-compressor/src/scheme.rs new file mode 100644 index 00000000000..dab34a778eb --- /dev/null +++ b/vortex-compressor/src/scheme.rs @@ -0,0 +1,286 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Unified compression scheme trait and exclusion rules. 
+ +use std::fmt; +use std::fmt::Debug; +use std::hash::Hash; +use std::hash::Hasher; + +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_error::VortexResult; + +use crate::CascadingCompressor; +use crate::ctx::CompressorContext; +use crate::sample::SAMPLE_SIZE; +use crate::sample::sample; +use crate::sample::sample_count_approx_one_percent; +use crate::stats::ArrayAndStats; +use crate::stats::GenerateStatsOptions; + +/// Unique identifier for a compression scheme. +/// +/// The only way to obtain a [`SchemeId`] is through [`SchemeExt::id()`], which is +/// auto-implemented for all [`Scheme`] types. There is no public constructor. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct SchemeId { + /// Only constructable within `vortex-compressor`. + /// + /// The only public way to obtain a [`SchemeId`] is through [`SchemeExt::id()`]. + pub(super) name: &'static str, +} + +impl fmt::Display for SchemeId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(self.name) + } +} + +/// Selects which children of a cascading scheme a rule applies to. +#[derive(Debug, Clone, Copy)] +pub enum ChildSelection { + /// Rule applies to all children. + All, + /// Rule applies to a single child. + One(usize), + /// Rule applies to multiple specific children. + Many(&'static [usize]), +} + +impl ChildSelection { + /// Returns `true` if this selection includes the given child index. + pub fn contains(&self, child_index: usize) -> bool { + match self { + ChildSelection::All => true, + ChildSelection::One(idx) => *idx == child_index, + ChildSelection::Many(indices) => indices.contains(&child_index), + } + } +} + +/// Push rule: declared by a cascading scheme to exclude another scheme from the subtree +/// rooted at the specified children. +/// +/// Use this when the declaring scheme (the ancestor) knows about the excluded scheme. For example, +/// `ZigZag` excludes `Dict` from all its children. 
+#[derive(Debug, Clone, Copy)] +pub struct DescendantExclusion { + /// The scheme to exclude from descendants. + pub excluded: SchemeId, + /// Which children of the declaring scheme this rule applies to. + pub children: ChildSelection, +} + +/// Pull rule: declared by a scheme to exclude itself when the specified ancestor is in the +/// cascade chain. +/// +/// Use this when the excluded scheme (the descendant) knows about the ancestor. For example, +/// `Sequence` excludes itself when `IntDict` is an ancestor on its codes child. +#[derive(Debug, Clone, Copy)] +pub struct AncestorExclusion { + /// The ancestor scheme that makes the declaring scheme ineligible. + pub ancestor: SchemeId, + /// Which children of the ancestor this rule applies to. + pub children: ChildSelection, +} + +/// A single compression encoding that the [`CascadingCompressor`] can select from. +/// +/// The compressor evaluates every registered scheme whose [`matches`] returns `true` for a +/// given array, picks the one with the highest [`expected_compression_ratio`], and calls +/// [`compress`] on the winner. +/// +/// One of the key features of this compressor is that schemes may "cascade": a scheme's +/// [`compress`] can call back into the compressor via [`CascadingCompressor::compress_child`] to +/// compress child or transformed arrays, building up multiple encoding layers (e.g. +/// frame-of-reference and then bit-packing). +/// +/// # Identity +/// +/// Every scheme has a globally unique name returned by [`scheme_name`]. The [`SchemeExt::id`] +/// method (auto-implemented, cannot be overridden) wraps that name in an opaque [`SchemeId`] used +/// for equality, hashing, and exclusion rules. +/// +/// # Cascading and children +/// +/// Schemes that produce child arrays for further compression declare [`num_children`] > 0. Each +/// child is identified by index. 
Cascading schemes should use +/// [`CascadingCompressor::compress_child`] to compress each child array, which handles cascade +/// level / budget tracking and context management automatically. +/// +/// No scheme may appear twice in a cascade chain (enforced by the compressor). This keeps the +/// search space a tree. +/// +/// # Exclusion rules +/// +/// Schemes declare exclusion rules to prevent incompatible scheme combinations in the cascade +/// chain: +/// +/// - [`descendant_exclusions`] (push): "exclude scheme X from my child Y's subtree." Used when the +/// declaring scheme knows about the excluded scheme. +/// - [`ancestor_exclusions`] (pull): "exclude me if ancestor X's child Y is above me." Used when +/// the declaring scheme knows about the ancestor. +/// +/// # Implementing a scheme +/// +/// At a minimum, implementors must provide [`scheme_name`], [`matches`], and [`compress`]. +/// +/// The default [`expected_compression_ratio`] estimates the ratio by compressing a small sample. +/// Implementors should only override this method when a cheaper heuristic is available (e.g. +/// returning `f64::MAX` for constant detection or `0.0` for early rejection based on stats). +/// +/// Schemes that need statistics that may be expensive to compute should override [`stats_options`] +/// to declare what they require. The compressor merges all eligible schemes' options before +/// generating stats, so each stat is always computed at most once for a given array. +/// +/// [`scheme_name`]: Scheme::scheme_name +/// [`matches`]: Scheme::matches +/// [`compress`]: Scheme::compress +/// [`expected_compression_ratio`]: Scheme::expected_compression_ratio +/// [`stats_options`]: Scheme::stats_options +/// [`num_children`]: Scheme::num_children +/// [`descendant_exclusions`]: Scheme::descendant_exclusions +/// [`ancestor_exclusions`]: Scheme::ancestor_exclusions +pub trait Scheme: Debug + Send + Sync { + /// The globally unique name for this scheme (e.g. 
`"vortex.int.bitpacking"`). + fn scheme_name(&self) -> &'static str; + + /// Whether this scheme can compress the given canonical array. + fn matches(&self, canonical: &Canonical) -> bool; + + /// True if this scheme detects constant arrays. + fn detects_constant(&self) -> bool { + false + } + + /// Returns the stats generation options this scheme requires. The compressor merges all + /// eligible schemes' options before generating stats so that a single stats pass satisfies + /// every scheme. + fn stats_options(&self) -> GenerateStatsOptions { + GenerateStatsOptions::default() + } + + /// The number of child arrays this scheme produces when cascading. Returns 0 for leaf + /// schemes that produce a final encoded array. + fn num_children(&self) -> usize { + 0 + } + + /// Schemes to exclude from specific children's subtrees (push direction). + /// + /// Each rule says: "when I cascade through child Y, do not use scheme X anywhere in that + /// subtree." Only meaningful when [`num_children`](Scheme::num_children) > 0. + fn descendant_exclusions(&self) -> Vec { + Vec::new() + } + + /// Ancestors that make this scheme ineligible (pull direction). + /// + /// Each rule says: "if ancestor X cascaded through child Y somewhere above me in the chain, do + /// not try me." + fn ancestor_exclusions(&self) -> Vec { + Vec::new() + } + + // TODO(connor): It would be nice if we returned a more useful type that said "choose me no + // matter what" instead of `f64::MAX`. + /// Estimate the compression ratio for this scheme on the given array. + /// + /// # Errors + /// + /// Returns an error if compression of the sample fails. + fn expected_compression_ratio( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) + } + + /// Compress the array using this scheme. + /// + /// # Errors + /// + /// Returns an error if compression fails. 
+ fn compress( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult; +} + +impl PartialEq for dyn Scheme { + fn eq(&self, other: &Self) -> bool { + self.id() == other.id() + } +} + +impl Eq for dyn Scheme {} + +impl Hash for dyn Scheme { + fn hash(&self, state: &mut H) { + self.id().hash(state); + } +} + +/// Extension trait providing [`id`](SchemeExt::id) for all [`Scheme`] implementors. +/// +/// This trait is automatically implemented for every type that implements [`Scheme`]. Because the +/// blanket implementation covers all types, external crates cannot override `id()`. +pub trait SchemeExt: Scheme { + /// Unique identifier derived from [`scheme_name`](Scheme::scheme_name). + fn id(&self) -> SchemeId { + SchemeId { + name: self.scheme_name(), + } + } +} + +impl SchemeExt for T {} + +/// Estimates compression ratio by compressing a ~1% sample of the data. +/// +/// Creates a new [`ArrayAndStats`] for the sample so that stats are generated from the sample, not +/// the full array. +/// +/// # Errors +/// +/// Returns an error if sample compression fails. +pub fn estimate_compression_ratio_with_sampling( + scheme: &S, + compressor: &CascadingCompressor, + array: &ArrayRef, + ctx: CompressorContext, +) -> VortexResult { + let sample_array = if ctx.is_sample() { + array.clone() + } else { + let source_len = array.len(); + let sample_count = sample_count_approx_one_percent(source_len); + + tracing::trace!( + "Sampling {} values out of {}", + SAMPLE_SIZE as u64 * sample_count as u64, + source_len + ); + + sample(array, SAMPLE_SIZE, sample_count) + }; + + let mut sample_data = ArrayAndStats::new(sample_array, ctx.stats_options()); + let sample_ctx = ctx.as_sample(); + + let after = scheme + .compress(compressor, &mut sample_data, sample_ctx)? 
+ .nbytes(); + let before = sample_data.array().nbytes(); + let ratio = before as f64 / after as f64; + + tracing::debug!("estimate_compression_ratio_with_sampling(compressor={scheme:#?}) = {ratio}",); + + Ok(ratio) +} diff --git a/vortex-compressor/src/stats/cache.rs b/vortex-compressor/src/stats/cache.rs new file mode 100644 index 00000000000..bbb6522337f --- /dev/null +++ b/vortex-compressor/src/stats/cache.rs @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Per-compression-site statistics cache and the [`ArrayAndStats`] bundle. + +use std::any::Any; +use std::any::TypeId; + +use vortex_array::ArrayRef; +use vortex_array::ToCanonical; +use vortex_error::VortexExpect; + +use super::FloatStats; +use super::GenerateStatsOptions; +use super::IntegerStats; +use super::StringStats; + +/// Cache for compression statistics, keyed by concrete type. +struct StatsCache { + // TODO(connor): We could further optimize this with a `SmallVec` here. + /// The cache entries, keyed by [`TypeId`]. + /// + /// The total number of statistics types in this stats should be relatively small, so we use a + /// vector instead of a hash map. + entries: Vec<(TypeId, Box)>, +} + +impl StatsCache { + /// Creates a new empty cache. + fn new() -> Self { + Self { + entries: Vec::new(), + } + } + + /// Returns a cached value, computing it on first access. + fn get_or_insert_with(&mut self, f: impl FnOnce() -> T) -> &T { + let type_id = TypeId::of::(); + let pos = self.entries.iter().position(|(id, _)| *id == type_id); + + if let Some(pos) = pos { + self.entries[pos] + .1 + .downcast_ref::() + .vortex_expect("we just checked the TypeID") + } else { + self.entries.push((type_id, Box::new(f()))); + self.entries + .last() + .vortex_expect("just pushed") + .1 + .downcast_ref::() + .vortex_expect("we just checked the TypeID") + } + } +} + +/// An array bundled with its lazily-computed statistics cache. 
+/// +/// The cache is guaranteed to correspond to the array. When a scheme creates a derived array (e.g. +/// FoR bias subtraction), it must create a new [`ArrayAndStats`] so that stale stats from the +/// original array are not reused. +/// +/// Built-in stats are accessed via typed methods (`integer_stats`, `float_stats`, `string_stats`) +/// which generate stats lazily on first access using the stored [`GenerateStatsOptions`]. +/// +/// Extension schemes can use `get_or_insert_with` for custom stats types. +pub struct ArrayAndStats { + /// The array. + array: ArrayRef, + /// The stats cache. + cache: StatsCache, + /// The stats generation options. + opts: GenerateStatsOptions, +} + +impl ArrayAndStats { + /// Creates a new bundle with the given stats generation options. + /// + /// Stats are generated lazily on first access via the typed accessor methods. + pub fn new(array: ArrayRef, opts: GenerateStatsOptions) -> Self { + Self { + array, + cache: StatsCache::new(), + opts, + } + } + + /// Returns a reference to the array. + pub fn array(&self) -> &ArrayRef { + &self.array + } + + /// Consumes the bundle and returns the array. + pub fn into_array(self) -> ArrayRef { + self.array + } + + /// Returns integer stats, generating them lazily on first access. + pub fn integer_stats(&mut self) -> &IntegerStats { + let array = self.array.clone(); + let opts = self.opts; + + self.cache.get_or_insert_with::(|| { + IntegerStats::generate_opts(&array.to_primitive(), opts) + }) + } + + /// Returns float stats, generating them lazily on first access. + pub fn float_stats(&mut self) -> &FloatStats { + let array = self.array.clone(); + let opts = self.opts; + + self.cache.get_or_insert_with::(|| { + FloatStats::generate_opts(&array.to_primitive(), opts) + }) + } + + /// Returns string stats, generating them lazily on first access. 
+ pub fn string_stats(&mut self) -> &StringStats { + let array = self.array.clone(); + let opts = self.opts; + + self.cache.get_or_insert_with::(|| { + StringStats::generate_opts(&array.to_varbinview(), opts) + }) + } + + /// For extension schemes with custom stats types. + pub fn get_or_insert_with(&mut self, f: impl FnOnce() -> T) -> &T { + self.cache.get_or_insert_with::(f) + } +} diff --git a/vortex-compressor/src/stats/float.rs b/vortex-compressor/src/stats/float.rs new file mode 100644 index 00000000000..67877d7796c --- /dev/null +++ b/vortex-compressor/src/stats/float.rs @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Float compression statistics. + +use std::hash::Hash; + +use itertools::Itertools; +use num_traits::Float; +use rustc_hash::FxBuildHasher; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::primitive::NativeValue; +use vortex_array::dtype::NativePType; +use vortex_array::dtype::PType; +use vortex_array::dtype::half::f16; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; +use vortex_error::vortex_err; +use vortex_error::vortex_panic; +use vortex_mask::AllOr; +use vortex_utils::aliases::hash_set::HashSet; + +use super::GenerateStatsOptions; + +/// Information about the distinct values in a float array. +#[derive(Debug, Clone)] +pub struct DistinctInfo { + /// The set of distinct float values. + distinct_values: HashSet, FxBuildHasher>, + /// The count of unique values. + distinct_count: u32, +} + +impl DistinctInfo { + /// Returns a reference to the distinct values set. + pub fn distinct_values(&self) -> &HashSet, FxBuildHasher> { + &self.distinct_values + } +} + +/// Typed statistics for a specific float type. +#[derive(Debug, Clone)] +pub struct TypedStats { + /// Distinct value information, or `None` if not computed. + distinct: Option>, +} + +impl TypedStats { + /// Returns the distinct value information, if computed. 
+ pub fn distinct(&self) -> Option<&DistinctInfo> { + self.distinct.as_ref() + } +} + +/// Type-erased container for one of the [`TypedStats`] variants. +#[derive(Debug, Clone)] +pub enum ErasedStats { + /// Stats for `f16` arrays. + F16(TypedStats), + /// Stats for `f32` arrays. + F32(TypedStats), + /// Stats for `f64` arrays. + F64(TypedStats), +} + +impl ErasedStats { + /// Get the count of distinct values, if we have computed it already. + fn distinct_count(&self) -> Option { + match self { + ErasedStats::F16(x) => x.distinct.as_ref().map(|d| d.distinct_count), + ErasedStats::F32(x) => x.distinct.as_ref().map(|d| d.distinct_count), + ErasedStats::F64(x) => x.distinct.as_ref().map(|d| d.distinct_count), + } + } +} + +/// Implements `From>` for [`ErasedStats`]. +macro_rules! impl_from_typed { + ($T:ty, $variant:path) => { + impl From> for ErasedStats { + fn from(typed: TypedStats<$T>) -> Self { + $variant(typed) + } + } + }; +} + +impl_from_typed!(f16, ErasedStats::F16); +impl_from_typed!(f32, ErasedStats::F32); +impl_from_typed!(f64, ErasedStats::F64); + +/// Array of floating-point numbers and relevant stats for compression. +#[derive(Debug, Clone)] +pub struct FloatStats { + /// The underlying source array. + src: PrimitiveArray, + /// Cache for `validity.false_count()`. + null_count: u32, + /// Cache for `validity.true_count()`. + value_count: u32, + /// The average run length. + average_run_length: u32, + /// Type-erased typed statistics. + erased: ErasedStats, +} + +impl FloatStats { + /// Generates stats, returning an error on failure. 
+ fn generate_opts_fallible( + input: &PrimitiveArray, + opts: GenerateStatsOptions, + ) -> VortexResult { + match input.ptype() { + PType::F16 => typed_float_stats::(input, opts.count_distinct_values), + PType::F32 => typed_float_stats::(input, opts.count_distinct_values), + PType::F64 => typed_float_stats::(input, opts.count_distinct_values), + _ => vortex_panic!("cannot generate FloatStats from ptype {}", input.ptype()), + } + } + + /// Get the count of distinct values, if we have computed it already. + pub fn distinct_count(&self) -> Option { + self.erased.distinct_count() + } +} + +impl FloatStats { + /// Generates stats with default options. + pub fn generate(input: &PrimitiveArray) -> Self { + Self::generate_opts(input, GenerateStatsOptions::default()) + } + + /// Generates stats with provided options. + pub fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self { + Self::generate_opts_fallible(input, opts) + .vortex_expect("FloatStats::generate_opts should not fail") + } + + /// Returns the underlying source array. + pub fn source(&self) -> &PrimitiveArray { + &self.src + } + + /// Returns the number of null values. + pub fn null_count(&self) -> u32 { + self.null_count + } + + /// Returns the number of non-null values. + pub fn value_count(&self) -> u32 { + self.value_count + } + + /// Returns the average run length. + pub fn average_run_length(&self) -> u32 { + self.average_run_length + } + + /// Returns the type-erased typed statistics. + pub fn erased(&self) -> &ErasedStats { + &self.erased + } +} + +/// Computes typed float statistics for a specific float type. +fn typed_float_stats( + array: &PrimitiveArray, + count_distinct_values: bool, +) -> VortexResult +where + NativeValue: Hash + Eq, + TypedStats: Into, +{ + // Special case: empty array. 
+    if array.is_empty() {
+        return Ok(FloatStats {
+            src: array.clone(),
+            null_count: 0,
+            value_count: 0,
+            average_run_length: 0,
+            erased: TypedStats { distinct: None }.into(),
+        });
+    } else if array.all_invalid()? {
+        return Ok(FloatStats {
+            src: array.clone(),
+            null_count: u32::try_from(array.len())?,
+            value_count: 0,
+            average_run_length: 0,
+            erased: TypedStats { distinct: None }.into(),
+        });
+    }
+
+    let null_count = array
+        .statistics()
+        .compute_null_count()
+        .ok_or_else(|| vortex_err!("Failed to compute null_count"))?;
+    let value_count = array.len() - null_count;
+
+    // Keep a HashSet of NativeValue<T>, since the native value is so much more
+    // efficient to hash and search for than a type-erased scalar.
+    let mut distinct_values = if count_distinct_values {
+        HashSet::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
+    } else {
+        HashSet::with_hasher(FxBuildHasher)
+    };
+
+    let validity = array.validity_mask()?;
+
+    let mut runs = 1;
+    let head_idx = validity
+        .first()
+        .vortex_expect("All null masks have been handled before");
+    let buff = array.to_buffer::<T>();
+    let mut prev = buff[head_idx];
+
+    let first_valid_buff = buff.slice(head_idx..array.len());
+    match validity.bit_buffer() {
+        AllOr::All => {
+            for value in first_valid_buff {
+                if count_distinct_values {
+                    distinct_values.insert(NativeValue(value));
+                }
+
+                if value != prev {
+                    prev = value;
+                    runs += 1;
+                }
+            }
+        }
+        AllOr::None => unreachable!("All invalid arrays have been handled earlier"),
+        AllOr::Some(v) => {
+            for (&value, valid) in first_valid_buff
+                .iter()
+                .zip_eq(v.slice(head_idx..array.len()).iter())
+            {
+                if valid {
+                    if count_distinct_values {
+                        distinct_values.insert(NativeValue(value));
+                    }
+
+                    if value != prev {
+                        prev = value;
+                        runs += 1;
+                    }
+                }
+            }
+        }
+    }
+
+    let null_count = u32::try_from(null_count)?;
+    let value_count = u32::try_from(value_count)?;
+
+    let distinct = count_distinct_values.then(|| DistinctInfo {
+        distinct_count:
u32::try_from(distinct_values.len()) + .vortex_expect("more than u32::MAX distinct values"), + distinct_values, + }); + + Ok(FloatStats { + null_count, + value_count, + src: array.clone(), + average_run_length: value_count / runs, + erased: TypedStats { distinct }.into(), + }) +} + +#[cfg(test)] +mod tests { + use vortex_array::IntoArray; + use vortex_array::ToCanonical; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::validity::Validity; + use vortex_buffer::buffer; + + use super::FloatStats; + + #[test] + fn test_float_stats() { + let floats = buffer![0.0f32, 1.0f32, 2.0f32].into_array(); + let floats = floats.to_primitive(); + + let stats = FloatStats::generate_opts( + &floats, + crate::stats::GenerateStatsOptions { + count_distinct_values: true, + }, + ); + + assert_eq!(stats.value_count, 3); + assert_eq!(stats.null_count, 0); + assert_eq!(stats.average_run_length, 1); + assert_eq!(stats.distinct_count().unwrap(), 3); + } + + #[test] + fn test_float_stats_leading_nulls() { + let floats = PrimitiveArray::new( + buffer![0.0f32, 1.0f32, 2.0f32], + Validity::from_iter([false, true, true]), + ); + + let stats = FloatStats::generate_opts( + &floats, + crate::stats::GenerateStatsOptions { + count_distinct_values: true, + }, + ); + + assert_eq!(stats.value_count, 2); + assert_eq!(stats.null_count, 1); + assert_eq!(stats.average_run_length, 1); + assert_eq!(stats.distinct_count().unwrap(), 2); + } +} diff --git a/vortex-compressor/src/stats/integer.rs b/vortex-compressor/src/stats/integer.rs new file mode 100644 index 00000000000..1f13118584b --- /dev/null +++ b/vortex-compressor/src/stats/integer.rs @@ -0,0 +1,622 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Integer compression statistics. 
+ +use std::hash::Hash; + +use num_traits::PrimInt; +use rustc_hash::FxBuildHasher; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::primitive::NativeValue; +use vortex_array::dtype::IntegerPType; +use vortex_array::expr::stats::Stat; +use vortex_array::match_each_integer_ptype; +use vortex_array::scalar::PValue; +use vortex_array::scalar::Scalar; +use vortex_buffer::BitBuffer; +use vortex_error::VortexError; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; +use vortex_mask::AllOr; +use vortex_utils::aliases::hash_map::HashMap; + +use super::GenerateStatsOptions; + +/// Information about the distinct values in an integer array. +#[derive(Debug, Clone)] +pub struct DistinctInfo { + /// The unique values and their occurrences. + distinct_values: HashMap, u32, FxBuildHasher>, + /// The count of unique values. + distinct_count: u32, + /// The most frequent value. + most_frequent_value: T, + /// The number of times the most frequent value occurs. + top_frequency: u32, +} + +impl DistinctInfo { + /// Returns a reference to the distinct values map. + pub fn distinct_values(&self) -> &HashMap, u32, FxBuildHasher> { + &self.distinct_values + } +} + +/// Typed statistics for a specific integer type. +#[derive(Debug, Clone)] +pub struct TypedStats { + /// The minimum value. + min: T, + /// The maximum value. + max: T, + /// Distinct value information, or `None` if not computed. + distinct: Option>, +} + +impl TypedStats { + /// Returns the distinct value information, if computed. + pub fn distinct(&self) -> Option<&DistinctInfo> { + self.distinct.as_ref() + } +} + +impl TypedStats { + /// Get the count of distinct values, if we have computed it already. + fn distinct_count(&self) -> Option { + Some(self.distinct.as_ref()?.distinct_count) + } + + /// Get the most commonly occurring value and its count, if we have computed it already. 
+ fn most_frequent_value_and_count(&self) -> Option<(&T, u32)> { + let distinct = self.distinct.as_ref()?; + Some((&distinct.most_frequent_value, distinct.top_frequency)) + } +} + +/// Type-erased container for one of the [`TypedStats`] variants. +/// +/// Building the `TypedStats` is considerably faster and cheaper than building a type-erased +/// set of stats. We then perform a variety of access methods on them. +#[derive(Clone, Debug)] +pub enum ErasedStats { + /// Stats for `u8` arrays. + U8(TypedStats), + /// Stats for `u16` arrays. + U16(TypedStats), + /// Stats for `u32` arrays. + U32(TypedStats), + /// Stats for `u64` arrays. + U64(TypedStats), + /// Stats for `i8` arrays. + I8(TypedStats), + /// Stats for `i16` arrays. + I16(TypedStats), + /// Stats for `i32` arrays. + I32(TypedStats), + /// Stats for `i64` arrays. + I64(TypedStats), +} + +impl ErasedStats { + /// Returns `true` if the minimum value is zero. + pub fn min_is_zero(&self) -> bool { + match &self { + ErasedStats::U8(x) => x.min == 0, + ErasedStats::U16(x) => x.min == 0, + ErasedStats::U32(x) => x.min == 0, + ErasedStats::U64(x) => x.min == 0, + ErasedStats::I8(x) => x.min == 0, + ErasedStats::I16(x) => x.min == 0, + ErasedStats::I32(x) => x.min == 0, + ErasedStats::I64(x) => x.min == 0, + } + } + + /// Returns `true` if the minimum value is negative. + pub fn min_is_negative(&self) -> bool { + match &self { + ErasedStats::U8(_) + | ErasedStats::U16(_) + | ErasedStats::U32(_) + | ErasedStats::U64(_) => false, + ErasedStats::I8(x) => x.min < 0, + ErasedStats::I16(x) => x.min < 0, + ErasedStats::I32(x) => x.min < 0, + ErasedStats::I64(x) => x.min < 0, + } + } + + /// Difference between max and min. 
+ pub fn max_minus_min(&self) -> u64 { + match &self { + ErasedStats::U8(x) => (x.max - x.min) as u64, + ErasedStats::U16(x) => (x.max - x.min) as u64, + ErasedStats::U32(x) => (x.max - x.min) as u64, + ErasedStats::U64(x) => x.max - x.min, + ErasedStats::I8(x) => (x.max as i16 - x.min as i16) as u64, + ErasedStats::I16(x) => (x.max as i32 - x.min as i32) as u64, + ErasedStats::I32(x) => (x.max as i64 - x.min as i64) as u64, + ErasedStats::I64(x) => u64::try_from(x.max as i128 - x.min as i128) + .vortex_expect("max minus min result bigger than u64"), + } + } + + /// Returns the ilog2 of the max value when transmuted to unsigned, or `None` if zero. + /// + /// This matches how BitPacking computes bit width: it reinterprets signed values as + /// unsigned (preserving bit pattern) and uses `leading_zeros`. For non-negative signed + /// values, the transmuted value equals the original value. + /// + /// This is used to determine if FOR encoding would reduce bit width compared to + /// direct BitPacking. If `max_ilog2() == max_minus_min_ilog2()`, FOR doesn't help. + pub fn max_ilog2(&self) -> Option { + match &self { + ErasedStats::U8(x) => x.max.checked_ilog2(), + ErasedStats::U16(x) => x.max.checked_ilog2(), + ErasedStats::U32(x) => x.max.checked_ilog2(), + ErasedStats::U64(x) => x.max.checked_ilog2(), + // Transmute signed to unsigned (bit pattern preserved) to match BitPacking behavior. + ErasedStats::I8(x) => (x.max as u8).checked_ilog2(), + ErasedStats::I16(x) => (x.max as u16).checked_ilog2(), + ErasedStats::I32(x) => (x.max as u32).checked_ilog2(), + ErasedStats::I64(x) => (x.max as u64).checked_ilog2(), + } + } + + /// Get the count of distinct values, if we have computed it already. 
+ pub fn distinct_count(&self) -> Option { + match &self { + ErasedStats::U8(x) => x.distinct_count(), + ErasedStats::U16(x) => x.distinct_count(), + ErasedStats::U32(x) => x.distinct_count(), + ErasedStats::U64(x) => x.distinct_count(), + ErasedStats::I8(x) => x.distinct_count(), + ErasedStats::I16(x) => x.distinct_count(), + ErasedStats::I32(x) => x.distinct_count(), + ErasedStats::I64(x) => x.distinct_count(), + } + } + + /// Get the most commonly occurring value and its count. + pub fn most_frequent_value_and_count(&self) -> Option<(PValue, u32)> { + match &self { + ErasedStats::U8(x) => { + let (top_value, count) = x.most_frequent_value_and_count()?; + Some(((*top_value).into(), count)) + } + ErasedStats::U16(x) => { + let (top_value, count) = x.most_frequent_value_and_count()?; + Some(((*top_value).into(), count)) + } + ErasedStats::U32(x) => { + let (top_value, count) = x.most_frequent_value_and_count()?; + Some(((*top_value).into(), count)) + } + ErasedStats::U64(x) => { + let (top_value, count) = x.most_frequent_value_and_count()?; + Some(((*top_value).into(), count)) + } + ErasedStats::I8(x) => { + let (top_value, count) = x.most_frequent_value_and_count()?; + Some(((*top_value).into(), count)) + } + ErasedStats::I16(x) => { + let (top_value, count) = x.most_frequent_value_and_count()?; + Some(((*top_value).into(), count)) + } + ErasedStats::I32(x) => { + let (top_value, count) = x.most_frequent_value_and_count()?; + Some(((*top_value).into(), count)) + } + ErasedStats::I64(x) => { + let (top_value, count) = x.most_frequent_value_and_count()?; + Some(((*top_value).into(), count)) + } + } + } +} + +/// Implements `From>` for [`ErasedStats`]. +macro_rules! 
impl_from_typed { + ($T:ty, $variant:path) => { + impl From> for ErasedStats { + fn from(typed: TypedStats<$T>) -> Self { + $variant(typed) + } + } + }; +} + +impl_from_typed!(u8, ErasedStats::U8); +impl_from_typed!(u16, ErasedStats::U16); +impl_from_typed!(u32, ErasedStats::U32); +impl_from_typed!(u64, ErasedStats::U64); +impl_from_typed!(i8, ErasedStats::I8); +impl_from_typed!(i16, ErasedStats::I16); +impl_from_typed!(i32, ErasedStats::I32); +impl_from_typed!(i64, ErasedStats::I64); + +/// Array of integers and relevant stats for compression. +#[derive(Clone, Debug)] +pub struct IntegerStats { + /// The underlying source array. + src: PrimitiveArray, + /// Cache for `validity.false_count()`. + null_count: u32, + /// Cache for `validity.true_count()`. + value_count: u32, + /// The average run length. + average_run_length: u32, + /// Type-erased typed statistics. + erased: ErasedStats, +} + +impl IntegerStats { + /// Generates stats, returning an error on failure. + fn generate_opts_fallible( + input: &PrimitiveArray, + opts: GenerateStatsOptions, + ) -> VortexResult { + match_each_integer_ptype!(input.ptype(), |T| { + typed_int_stats::(input, opts.count_distinct_values) + }) + } + + /// Get the count of distinct values, if we have computed it already. + pub fn distinct_count(&self) -> Option { + self.erased.distinct_count() + } + + /// Get the most commonly occurring value and its count, if we have computed it already. + pub fn most_frequent_value_and_count(&self) -> Option<(PValue, u32)> { + self.erased.most_frequent_value_and_count() + } +} + +impl IntegerStats { + /// Generates stats with default options. + pub fn generate(input: &PrimitiveArray) -> Self { + Self::generate_opts(input, GenerateStatsOptions::default()) + } + + /// Generates stats with provided options. 
+ pub fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self { + Self::generate_opts_fallible(input, opts) + .vortex_expect("IntegerStats::generate_opts should not fail") + } + + /// Returns the underlying source array. + pub fn source(&self) -> &PrimitiveArray { + &self.src + } + + /// Returns the number of null values. + pub fn null_count(&self) -> u32 { + self.null_count + } + + /// Returns the number of non-null values. + pub fn value_count(&self) -> u32 { + self.value_count + } + + /// Returns the average run length. + pub fn average_run_length(&self) -> u32 { + self.average_run_length + } + + /// Returns the type-erased typed statistics. + pub fn erased(&self) -> &ErasedStats { + &self.erased + } +} + +/// Computes typed integer statistics for a specific integer type. +fn typed_int_stats( + array: &PrimitiveArray, + count_distinct_values: bool, +) -> VortexResult +where + T: IntegerPType + PrimInt + for<'a> TryFrom<&'a Scalar, Error = VortexError>, + TypedStats: Into, + NativeValue: Eq + Hash, +{ + // Special case: empty array. + if array.is_empty() { + return Ok(IntegerStats { + src: array.clone(), + null_count: 0, + value_count: 0, + average_run_length: 0, + erased: TypedStats { + min: T::max_value(), + max: T::min_value(), + distinct: None, + } + .into(), + }); + } else if array.all_invalid()? { + return Ok(IntegerStats { + src: array.clone(), + null_count: u32::try_from(array.len())?, + value_count: 0, + average_run_length: 0, + erased: TypedStats { + min: T::max_value(), + max: T::min_value(), + distinct: None, + } + .into(), + }); + } + + let validity = array.validity_mask()?; + let null_count = validity.false_count(); + let value_count = validity.true_count(); + + // Initialize loop state. 
+ let head_idx = validity + .first() + .vortex_expect("All null masks have been handled before"); + let buffer = array.to_buffer::(); + let head = buffer[head_idx]; + + let mut loop_state = LoopState { + distinct_values: if count_distinct_values { + HashMap::with_capacity_and_hasher(array.len() / 2, FxBuildHasher) + } else { + HashMap::with_hasher(FxBuildHasher) + }, + prev: head, + runs: 1, + }; + + let sliced = buffer.slice(head_idx..array.len()); + let mut chunks = sliced.as_slice().chunks_exact(64); + match validity.bit_buffer() { + AllOr::All => { + for chunk in &mut chunks { + inner_loop_nonnull( + chunk.try_into().ok().vortex_expect("chunk size must be 64"), + count_distinct_values, + &mut loop_state, + ) + } + let remainder = chunks.remainder(); + inner_loop_naive( + remainder, + count_distinct_values, + &BitBuffer::new_set(remainder.len()), + &mut loop_state, + ); + } + AllOr::None => unreachable!("All invalid arrays have been handled before"), + AllOr::Some(v) => { + let mask = v.slice(head_idx..array.len()); + let mut offset = 0; + for chunk in &mut chunks { + let validity = mask.slice(offset..(offset + 64)); + offset += 64; + + match validity.true_count() { + // All nulls -> no stats to update. + 0 => continue, + // Inner loop for when validity check can be elided. + 64 => inner_loop_nonnull( + chunk.try_into().ok().vortex_expect("chunk size must be 64"), + count_distinct_values, + &mut loop_state, + ), + // Inner loop for when we need to check validity. + _ => inner_loop_nullable( + chunk.try_into().ok().vortex_expect("chunk size must be 64"), + count_distinct_values, + &validity, + &mut loop_state, + ), + } + } + // Final iteration, run naive loop. 
+ let remainder = chunks.remainder(); + inner_loop_naive( + remainder, + count_distinct_values, + &mask.slice(offset..(offset + remainder.len())), + &mut loop_state, + ); + } + } + + let runs = loop_state.runs; + + let min = array + .statistics() + .compute_as::(Stat::Min) + .vortex_expect("min should be computed"); + + let max = array + .statistics() + .compute_as::(Stat::Max) + .vortex_expect("max should be computed"); + + let distinct = count_distinct_values.then(|| { + let (&top_value, &top_count) = loop_state + .distinct_values + .iter() + .max_by_key(|&(_, &count)| count) + .vortex_expect("we know this is non-empty"); + + DistinctInfo { + distinct_count: u32::try_from(loop_state.distinct_values.len()) + .vortex_expect("there are more than `u32::MAX` distinct values"), + most_frequent_value: top_value.0, + top_frequency: top_count, + distinct_values: loop_state.distinct_values, + } + }); + + let typed = TypedStats { min, max, distinct }; + + let null_count = u32::try_from(null_count)?; + let value_count = u32::try_from(value_count)?; + + Ok(IntegerStats { + src: array.clone(), + null_count, + value_count, + average_run_length: value_count / runs, + erased: typed.into(), + }) +} + +/// Internal loop state for integer stats computation. +struct LoopState { + /// The previous value seen. + prev: T, + /// The run count. + runs: u32, + /// The distinct values map. + distinct_values: HashMap, u32, FxBuildHasher>, +} + +/// Inner loop for non-null chunks of 64 values. +#[inline(always)] +fn inner_loop_nonnull( + values: &[T; 64], + count_distinct_values: bool, + state: &mut LoopState, +) where + NativeValue: Eq + Hash, +{ + for &value in values { + if count_distinct_values { + *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1; + } + + if value != state.prev { + state.prev = value; + state.runs += 1; + } + } +} + +/// Inner loop for nullable chunks of 64 values. 
+#[inline(always)] +fn inner_loop_nullable( + values: &[T; 64], + count_distinct_values: bool, + is_valid: &BitBuffer, + state: &mut LoopState, +) where + NativeValue: Eq + Hash, +{ + for (idx, &value) in values.iter().enumerate() { + if is_valid.value(idx) { + if count_distinct_values { + *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1; + } + + if value != state.prev { + state.prev = value; + state.runs += 1; + } + } + } +} + +/// Fallback inner loop for remainder values. +#[inline(always)] +fn inner_loop_naive( + values: &[T], + count_distinct_values: bool, + is_valid: &BitBuffer, + state: &mut LoopState, +) where + NativeValue: Eq + Hash, +{ + for (idx, &value) in values.iter().enumerate() { + if is_valid.value(idx) { + if count_distinct_values { + *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1; + } + + if value != state.prev { + state.prev = value; + state.runs += 1; + } + } + } +} + +#[cfg(test)] +mod tests { + use std::iter; + + use vortex_array::arrays::PrimitiveArray; + use vortex_array::validity::Validity; + use vortex_buffer::BitBuffer; + use vortex_buffer::Buffer; + use vortex_buffer::buffer; + use vortex_error::VortexResult; + + use super::IntegerStats; + use super::typed_int_stats; + + #[test] + fn test_naive_count_distinct_values() -> VortexResult<()> { + let array = PrimitiveArray::new(buffer![217u8, 0], Validity::NonNullable); + let stats = typed_int_stats::(&array, true)?; + assert_eq!(stats.distinct_count().unwrap(), 2); + Ok(()) + } + + #[test] + fn test_naive_count_distinct_values_nullable() -> VortexResult<()> { + let array = PrimitiveArray::new( + buffer![217u8, 0], + Validity::from(BitBuffer::from(vec![true, false])), + ); + let stats = typed_int_stats::(&array, true)?; + assert_eq!(stats.distinct_count().unwrap(), 1); + Ok(()) + } + + #[test] + fn test_count_distinct_values() -> VortexResult<()> { + let array = PrimitiveArray::new((0..128u8).collect::>(), Validity::NonNullable); + let stats = 
typed_int_stats::(&array, true)?; + assert_eq!(stats.distinct_count().unwrap(), 128); + Ok(()) + } + + #[test] + fn test_count_distinct_values_nullable() -> VortexResult<()> { + let array = PrimitiveArray::new( + (0..128u8).collect::>(), + Validity::from(BitBuffer::from_iter( + iter::repeat_n(vec![true, false], 64).flatten(), + )), + ); + let stats = typed_int_stats::(&array, true)?; + assert_eq!(stats.distinct_count().unwrap(), 64); + Ok(()) + } + + #[test] + fn test_integer_stats_leading_nulls() { + let ints = PrimitiveArray::new(buffer![0, 1, 2], Validity::from_iter([false, true, true])); + + let stats = IntegerStats::generate_opts( + &ints, + crate::stats::GenerateStatsOptions { + count_distinct_values: true, + }, + ); + + assert_eq!(stats.value_count, 2); + assert_eq!(stats.null_count, 1); + assert_eq!(stats.average_run_length, 1); + assert_eq!(stats.distinct_count().unwrap(), 2); + } +} diff --git a/vortex-compressor/src/stats/mod.rs b/vortex-compressor/src/stats/mod.rs new file mode 100644 index 00000000000..e4417b66b3d --- /dev/null +++ b/vortex-compressor/src/stats/mod.rs @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Compression statistics types and caching. 
+ +mod cache; +mod float; +mod integer; +mod options; +mod string; + +pub use cache::ArrayAndStats; +pub use float::DistinctInfo as FloatDistinctInfo; +pub use float::ErasedStats as FloatErasedStats; +pub use float::FloatStats; +pub use float::TypedStats as FloatTypedStats; +pub use integer::DistinctInfo as IntegerDistinctInfo; +pub use integer::ErasedStats as IntegerErasedStats; +pub use integer::IntegerStats; +pub use integer::TypedStats as IntegerTypedStats; +pub use options::GenerateStatsOptions; +pub use string::StringStats; diff --git a/vortex-compressor/src/stats/options.rs b/vortex-compressor/src/stats/options.rs new file mode 100644 index 00000000000..d53b69d748a --- /dev/null +++ b/vortex-compressor/src/stats/options.rs @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Compression statistics types. + +/// Configures how stats are generated. +/// +/// Each scheme declares its required options via [`Scheme::stats_options`]. The compressor +/// merges all eligible schemes' options before generating stats, so that a single stats pass +/// satisfies every scheme. +/// +/// [`Scheme::stats_options`]: crate::scheme::Scheme::stats_options +#[derive(Debug, Default, Clone, Copy)] +pub struct GenerateStatsOptions { + /// Whether distinct values should be counted during stats generation. + pub count_distinct_values: bool, +} + +impl GenerateStatsOptions { + /// Merges two options by OR-ing each field. The result enables a stat if either input does. 
+ pub fn merge(self, other: Self) -> Self { + Self { + count_distinct_values: self.count_distinct_values || other.count_distinct_values, + } + } +} diff --git a/vortex-compressor/src/stats/string.rs b/vortex-compressor/src/stats/string.rs new file mode 100644 index 00000000000..f8db9d0c4f2 --- /dev/null +++ b/vortex-compressor/src/stats/string.rs @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! String compression statistics. + +use vortex_array::arrays::VarBinViewArray; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; +use vortex_error::vortex_err; +use vortex_utils::aliases::hash_set::HashSet; + +use super::GenerateStatsOptions; + +/// Array of variable-length byte arrays, and relevant stats for compression. +#[derive(Clone, Debug)] +pub struct StringStats { + /// The underlying source array. + src: VarBinViewArray, + /// The estimated number of distinct strings, or `None` if not computed. + estimated_distinct_count: Option, + /// The number of non-null values. + value_count: u32, + /// The number of null values. + null_count: u32, +} + +/// Estimate the number of distinct strings in the var bin view array. +fn estimate_distinct_count(strings: &VarBinViewArray) -> VortexResult { + let views = strings.views(); + // Iterate the views. Two strings which are equal must have the same first 8-bytes. + // NOTE: there are cases where this performs pessimally, e.g. when we have strings that all + // share a 4-byte prefix and have the same length. + let mut distinct = HashSet::with_capacity(views.len() / 2); + views.iter().for_each(|&view| { + #[expect( + clippy::cast_possible_truncation, + reason = "approximate uniqueness with view prefix" + )] + let len_and_prefix = view.as_u128() as u64; + distinct.insert(len_and_prefix); + }); + + Ok(u32::try_from(distinct.len())?) +} + +impl StringStats { + /// Generates stats, returning an error on failure. 
+ fn generate_opts_fallible( + input: &VarBinViewArray, + opts: GenerateStatsOptions, + ) -> VortexResult { + let null_count = input + .statistics() + .compute_null_count() + .ok_or_else(|| vortex_err!("Failed to compute null_count"))?; + let value_count = input.len() - null_count; + let estimated_distinct_count = opts + .count_distinct_values + .then(|| estimate_distinct_count(input)) + .transpose()?; + + Ok(Self { + src: input.clone(), + value_count: u32::try_from(value_count)?, + null_count: u32::try_from(null_count)?, + estimated_distinct_count, + }) + } +} + +impl StringStats { + /// Generates stats with default options. + pub fn generate(input: &VarBinViewArray) -> Self { + Self::generate_opts(input, GenerateStatsOptions::default()) + } + + /// Generates stats with provided options. + pub fn generate_opts(input: &VarBinViewArray, opts: GenerateStatsOptions) -> Self { + Self::generate_opts_fallible(input, opts) + .vortex_expect("StringStats::generate_opts should not fail") + } + + /// Returns the underlying source array. + pub fn source(&self) -> &VarBinViewArray { + &self.src + } + + /// Returns the estimated number of distinct strings, or `None` if not computed. + pub fn estimated_distinct_count(&self) -> Option { + self.estimated_distinct_count + } + + /// Returns the number of non-null values. + pub fn value_count(&self) -> u32 { + self.value_count + } + + /// Returns the number of null values. 
+ pub fn null_count(&self) -> u32 { + self.null_count + } +} diff --git a/vortex-file/src/strategy.rs b/vortex-file/src/strategy.rs index 4d6031a220c..efd693c5ca1 100644 --- a/vortex-file/src/strategy.rs +++ b/vortex-file/src/strategy.rs @@ -28,14 +28,6 @@ use vortex_array::arrays::VarBinView; use vortex_array::dtype::FieldPath; use vortex_array::session::ArrayRegistry; use vortex_array::session::ArraySession; -#[cfg(feature = "zstd")] -use vortex_btrblocks::BtrBlocksCompressorBuilder; -#[cfg(feature = "zstd")] -use vortex_btrblocks::FloatCode; -#[cfg(feature = "zstd")] -use vortex_btrblocks::IntCode; -#[cfg(feature = "zstd")] -use vortex_btrblocks::StringCode; use vortex_bytebool::ByteBool; use vortex_datetime_parts::DateTimeParts; use vortex_decimal_byte_parts::DecimalByteParts; @@ -63,6 +55,16 @@ use vortex_sequence::Sequence; use vortex_sparse::Sparse; use vortex_utils::aliases::hash_map::HashMap; use vortex_zigzag::ZigZag; + +#[rustfmt::skip] +#[cfg(feature = "zstd")] +use vortex_btrblocks::{ + BtrBlocksCompressorBuilder, + SchemeExt, + schemes::float, + schemes::integer, + schemes::string, +}; #[cfg(feature = "zstd")] use vortex_zstd::Zstd; #[cfg(all(feature = "zstd", feature = "unstable_encodings"))] @@ -196,18 +198,22 @@ impl WriteStrategyBuilder { /// GPU decompression. Without it, strings use interleaved Zstd compression. 
#[cfg(feature = "zstd")] pub fn with_cuda_compatible_encodings(mut self) -> Self { - let mut builder = BtrBlocksCompressorBuilder::default() - .exclude_int([IntCode::Sparse, IntCode::Rle]) - .exclude_float([FloatCode::Rle, FloatCode::Sparse]) - .exclude_string([StringCode::Dict, StringCode::Fsst]); + let mut builder = BtrBlocksCompressorBuilder::default().exclude([ + integer::SparseScheme.id(), + integer::RLE_INTEGER_SCHEME.id(), + float::RLE_FLOAT_SCHEME.id(), + float::NullDominatedSparseScheme.id(), + string::StringDictScheme.id(), + string::FSSTScheme.id(), + ]); #[cfg(feature = "unstable_encodings")] { - builder = builder.include_string([StringCode::ZstdBuffers]); + builder = builder.include([string::ZstdBuffersScheme.id()]); } #[cfg(not(feature = "unstable_encodings"))] { - builder = builder.include_string([StringCode::Zstd]); + builder = builder.include([string::ZstdScheme.id()]); } self.compressor = Some(Arc::new(builder.build())); @@ -222,9 +228,11 @@ impl WriteStrategyBuilder { #[cfg(feature = "zstd")] pub fn with_compact_encodings(mut self) -> Self { let btrblocks = BtrBlocksCompressorBuilder::default() - .include_string([StringCode::Zstd]) - .include_int([IntCode::Pco]) - .include_float([FloatCode::Pco]) + .include([ + string::ZstdScheme.id(), + integer::PcoScheme.id(), + float::PcoScheme.id(), + ]) .build(); self.compressor = Some(Arc::new(btrblocks)); diff --git a/vortex-layout/src/layouts/compressed.rs b/vortex-layout/src/layouts/compressed.rs index 58ba381d415..603b2360e0d 100644 --- a/vortex-layout/src/layouts/compressed.rs +++ b/vortex-layout/src/layouts/compressed.rs @@ -11,7 +11,8 @@ use vortex_array::DynArray; use vortex_array::expr::stats::Stat; use vortex_btrblocks::BtrBlocksCompressor; use vortex_btrblocks::BtrBlocksCompressorBuilder; -use vortex_btrblocks::IntCode; +use vortex_btrblocks::SchemeExt; +use vortex_btrblocks::schemes::integer::IntDictScheme; use vortex_error::VortexResult; use vortex_io::runtime::Handle; @@ -69,7 +70,7 @@ impl 
CompressingStrategy { pub fn new_btrblocks(child: S, exclude_int_dict_encoding: bool) -> Self { let compressor = if exclude_int_dict_encoding { BtrBlocksCompressorBuilder::default() - .exclude_int([IntCode::Dict]) + .exclude([IntDictScheme.id()]) .build() } else { BtrBlocksCompressor::default() diff --git a/vortex/public-api.lock b/vortex/public-api.lock index 0c8ce9d0cd9..7be026902db 100644 --- a/vortex/public-api.lock +++ b/vortex/public-api.lock @@ -22,11 +22,9 @@ pub use vortex::compressor::BtrBlocksCompressor pub use vortex::compressor::BtrBlocksCompressorBuilder -pub use vortex::compressor::FloatCode +pub use vortex::compressor::Scheme -pub use vortex::compressor::IntCode - -pub use vortex::compressor::StringCode +pub use vortex::compressor::SchemeId pub mod vortex::dtype diff --git a/vortex/src/lib.rs b/vortex/src/lib.rs index a532fc1adad..ab22ea36f4e 100644 --- a/vortex/src/lib.rs +++ b/vortex/src/lib.rs @@ -36,9 +36,8 @@ pub mod buffer { pub mod compressor { pub use vortex_btrblocks::BtrBlocksCompressor; pub use vortex_btrblocks::BtrBlocksCompressorBuilder; - pub use vortex_btrblocks::FloatCode; - pub use vortex_btrblocks::IntCode; - pub use vortex_btrblocks::StringCode; + pub use vortex_btrblocks::Scheme; + pub use vortex_btrblocks::SchemeId; } pub mod dtype { From ed4cd525cfd15170c73ba096a97b7b418537f44a Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Fri, 20 Mar 2026 16:05:51 -0400 Subject: [PATCH 2/2] replace compressor in `vortex-btrblocks` Signed-off-by: Connor Tsui --- vortex-btrblocks/Cargo.toml | 2 +- vortex-btrblocks/benches/dict_encode.rs | 1 - vortex-btrblocks/benches/stats_calc.rs | 1 - vortex-btrblocks/public-api.lock | 630 ++++++--- vortex-btrblocks/src/builder.rs | 219 +-- vortex-btrblocks/src/canonical_compressor.rs | 307 +---- vortex-btrblocks/src/compressor/decimal.rs | 43 - .../src/compressor/float/dictionary.rs | 137 -- vortex-btrblocks/src/compressor/float/mod.rs | 746 ---------- .../src/compressor/float/stats.rs | 268 ---- 
.../src/compressor/integer/dictionary.rs | 153 --- .../src/compressor/integer/mod.rs | 1223 ----------------- .../src/compressor/integer/stats.rs | 519 ------- vortex-btrblocks/src/compressor/mod.rs | 178 --- vortex-btrblocks/src/compressor/rle.rs | 192 --- vortex-btrblocks/src/compressor/string.rs | 687 --------- vortex-btrblocks/src/compressor/temporal.rs | 52 - vortex-btrblocks/src/ctx.rs | 136 -- vortex-btrblocks/src/lib.rs | 82 +- vortex-btrblocks/src/sample.rs | 135 -- vortex-btrblocks/src/scheme.rs | 135 -- vortex-btrblocks/src/schemes/decimal.rs | 77 ++ vortex-btrblocks/src/schemes/float.rs | 422 ++++++ vortex-btrblocks/src/schemes/integer.rs | 983 +++++++++++++ vortex-btrblocks/src/schemes/mod.rs | 14 + .../src/{compressor => schemes}/patches.rs | 0 vortex-btrblocks/src/schemes/rle.rs | 311 +++++ vortex-btrblocks/src/schemes/string.rs | 363 +++++ vortex-btrblocks/src/schemes/temporal.rs | 123 ++ vortex-btrblocks/src/stats.rs | 63 - 30 files changed, 2939 insertions(+), 5263 deletions(-) delete mode 100644 vortex-btrblocks/src/compressor/decimal.rs delete mode 100644 vortex-btrblocks/src/compressor/float/dictionary.rs delete mode 100644 vortex-btrblocks/src/compressor/float/mod.rs delete mode 100644 vortex-btrblocks/src/compressor/float/stats.rs delete mode 100644 vortex-btrblocks/src/compressor/integer/dictionary.rs delete mode 100644 vortex-btrblocks/src/compressor/integer/mod.rs delete mode 100644 vortex-btrblocks/src/compressor/integer/stats.rs delete mode 100644 vortex-btrblocks/src/compressor/mod.rs delete mode 100644 vortex-btrblocks/src/compressor/rle.rs delete mode 100644 vortex-btrblocks/src/compressor/string.rs delete mode 100644 vortex-btrblocks/src/compressor/temporal.rs delete mode 100644 vortex-btrblocks/src/ctx.rs delete mode 100644 vortex-btrblocks/src/sample.rs delete mode 100644 vortex-btrblocks/src/scheme.rs create mode 100644 vortex-btrblocks/src/schemes/decimal.rs create mode 100644 vortex-btrblocks/src/schemes/float.rs create mode 
100644 vortex-btrblocks/src/schemes/integer.rs create mode 100644 vortex-btrblocks/src/schemes/mod.rs rename vortex-btrblocks/src/{compressor => schemes}/patches.rs (100%) create mode 100644 vortex-btrblocks/src/schemes/rle.rs create mode 100644 vortex-btrblocks/src/schemes/string.rs create mode 100644 vortex-btrblocks/src/schemes/temporal.rs delete mode 100644 vortex-btrblocks/src/stats.rs diff --git a/vortex-btrblocks/Cargo.toml b/vortex-btrblocks/Cargo.toml index 1c745306c4a..9bbd2430f09 100644 --- a/vortex-btrblocks/Cargo.toml +++ b/vortex-btrblocks/Cargo.toml @@ -14,7 +14,6 @@ rust-version = { workspace = true } version = { workspace = true } [dependencies] -enum-iterator = { workspace = true } getrandom_v03 = { workspace = true } itertools = { workspace = true } num-traits = { workspace = true } @@ -25,6 +24,7 @@ tracing = { workspace = true } vortex-alp = { workspace = true } vortex-array = { workspace = true } vortex-buffer = { workspace = true } +vortex-compressor = { workspace = true } vortex-datetime-parts = { workspace = true } vortex-decimal-byte-parts = { workspace = true } vortex-error = { workspace = true } diff --git a/vortex-btrblocks/benches/dict_encode.rs b/vortex-btrblocks/benches/dict_encode.rs index 9bed0f11936..8d7c6fc6297 100644 --- a/vortex-btrblocks/benches/dict_encode.rs +++ b/vortex-btrblocks/benches/dict_encode.rs @@ -9,7 +9,6 @@ use vortex_array::arrays::BoolArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::builders::dict::dict_encode; use vortex_array::validity::Validity; -use vortex_btrblocks::CompressorStats; use vortex_btrblocks::IntegerStats; use vortex_btrblocks::integer_dictionary_encode; use vortex_buffer::BufferMut; diff --git a/vortex-btrblocks/benches/stats_calc.rs b/vortex-btrblocks/benches/stats_calc.rs index a272c16210c..b3070598d6b 100644 --- a/vortex-btrblocks/benches/stats_calc.rs +++ b/vortex-btrblocks/benches/stats_calc.rs @@ -10,7 +10,6 @@ mod benchmarks { use divan::Bencher; use 
vortex_array::arrays::PrimitiveArray; use vortex_array::validity::Validity; - use vortex_btrblocks::CompressorStats; use vortex_btrblocks::GenerateStatsOptions; use vortex_btrblocks::IntegerStats; use vortex_buffer::Buffer; diff --git a/vortex-btrblocks/public-api.lock b/vortex-btrblocks/public-api.lock index 55d23a96a26..562c60da670 100644 --- a/vortex-btrblocks/public-api.lock +++ b/vortex-btrblocks/public-api.lock @@ -1,331 +1,631 @@ pub mod vortex_btrblocks -pub enum vortex_btrblocks::FloatCode +pub use vortex_btrblocks::ArrayAndStats -pub vortex_btrblocks::FloatCode::Alp +pub use vortex_btrblocks::CascadingCompressor -pub vortex_btrblocks::FloatCode::AlpRd +pub use vortex_btrblocks::CompressorContext -pub vortex_btrblocks::FloatCode::Constant +pub use vortex_btrblocks::FloatStats -pub vortex_btrblocks::FloatCode::Dict +pub use vortex_btrblocks::GenerateStatsOptions -pub vortex_btrblocks::FloatCode::Pco +pub use vortex_btrblocks::IntegerStats -pub vortex_btrblocks::FloatCode::Rle +pub use vortex_btrblocks::MAX_CASCADE -pub vortex_btrblocks::FloatCode::RunEnd +pub use vortex_btrblocks::Scheme -pub vortex_btrblocks::FloatCode::Sparse +pub use vortex_btrblocks::SchemeExt -pub vortex_btrblocks::FloatCode::Uncompressed +pub use vortex_btrblocks::SchemeId -impl core::clone::Clone for vortex_btrblocks::FloatCode +pub use vortex_btrblocks::StringStats -pub fn vortex_btrblocks::FloatCode::clone(&self) -> vortex_btrblocks::FloatCode +pub use vortex_btrblocks::estimate_compression_ratio_with_sampling -impl core::cmp::Eq for vortex_btrblocks::FloatCode +pub use vortex_btrblocks::integer_dictionary_encode -impl core::cmp::Ord for vortex_btrblocks::FloatCode +pub mod vortex_btrblocks::schemes -pub fn vortex_btrblocks::FloatCode::cmp(&self, other: &vortex_btrblocks::FloatCode) -> core::cmp::Ordering +pub mod vortex_btrblocks::schemes::decimal -impl core::cmp::PartialEq for vortex_btrblocks::FloatCode +pub struct vortex_btrblocks::schemes::decimal::DecimalScheme -pub fn 
vortex_btrblocks::FloatCode::eq(&self, other: &vortex_btrblocks::FloatCode) -> bool +impl core::clone::Clone for vortex_btrblocks::schemes::decimal::DecimalScheme -impl core::cmp::PartialOrd for vortex_btrblocks::FloatCode +pub fn vortex_btrblocks::schemes::decimal::DecimalScheme::clone(&self) -> vortex_btrblocks::schemes::decimal::DecimalScheme -pub fn vortex_btrblocks::FloatCode::partial_cmp(&self, other: &vortex_btrblocks::FloatCode) -> core::option::Option +impl core::cmp::Eq for vortex_btrblocks::schemes::decimal::DecimalScheme -impl core::fmt::Debug for vortex_btrblocks::FloatCode +impl core::cmp::PartialEq for vortex_btrblocks::schemes::decimal::DecimalScheme -pub fn vortex_btrblocks::FloatCode::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub fn vortex_btrblocks::schemes::decimal::DecimalScheme::eq(&self, other: &vortex_btrblocks::schemes::decimal::DecimalScheme) -> bool -impl core::hash::Hash for vortex_btrblocks::FloatCode +impl core::fmt::Debug for vortex_btrblocks::schemes::decimal::DecimalScheme -pub fn vortex_btrblocks::FloatCode::hash<__H: core::hash::Hasher>(&self, state: &mut __H) +pub fn vortex_btrblocks::schemes::decimal::DecimalScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -impl core::marker::Copy for vortex_btrblocks::FloatCode +impl core::marker::Copy for vortex_btrblocks::schemes::decimal::DecimalScheme -impl core::marker::StructuralPartialEq for vortex_btrblocks::FloatCode +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::decimal::DecimalScheme -impl enum_iterator::Sequence for vortex_btrblocks::FloatCode +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::decimal::DecimalScheme -pub const vortex_btrblocks::FloatCode::CARDINALITY: usize +pub fn vortex_btrblocks::schemes::decimal::DecimalScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: 
vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub fn vortex_btrblocks::FloatCode::first() -> core::option::Option +pub fn vortex_btrblocks::schemes::decimal::DecimalScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, _data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub fn vortex_btrblocks::FloatCode::last() -> core::option::Option +pub fn vortex_btrblocks::schemes::decimal::DecimalScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool -pub fn vortex_btrblocks::FloatCode::next(&self) -> core::option::Option +pub fn vortex_btrblocks::schemes::decimal::DecimalScheme::num_children(&self) -> usize -pub fn vortex_btrblocks::FloatCode::previous(&self) -> core::option::Option +pub fn vortex_btrblocks::schemes::decimal::DecimalScheme::scheme_name(&self) -> &'static str -pub enum vortex_btrblocks::IntCode +pub mod vortex_btrblocks::schemes::float -pub vortex_btrblocks::IntCode::BitPacking +pub use vortex_btrblocks::schemes::float::FloatConstantScheme -pub vortex_btrblocks::IntCode::Constant +pub use vortex_btrblocks::schemes::float::FloatDictScheme -pub vortex_btrblocks::IntCode::Dict +pub use vortex_btrblocks::schemes::float::FloatStats -pub vortex_btrblocks::IntCode::For +pub use vortex_btrblocks::schemes::float::is_float_primitive -pub vortex_btrblocks::IntCode::Pco +pub struct vortex_btrblocks::schemes::float::ALPRDScheme -pub vortex_btrblocks::IntCode::Rle +impl core::clone::Clone for vortex_btrblocks::schemes::float::ALPRDScheme -pub vortex_btrblocks::IntCode::RunEnd +pub fn vortex_btrblocks::schemes::float::ALPRDScheme::clone(&self) -> vortex_btrblocks::schemes::float::ALPRDScheme -pub vortex_btrblocks::IntCode::Sequence +impl core::cmp::Eq for vortex_btrblocks::schemes::float::ALPRDScheme -pub vortex_btrblocks::IntCode::Sparse +impl core::cmp::PartialEq for 
vortex_btrblocks::schemes::float::ALPRDScheme -pub vortex_btrblocks::IntCode::Uncompressed +pub fn vortex_btrblocks::schemes::float::ALPRDScheme::eq(&self, other: &vortex_btrblocks::schemes::float::ALPRDScheme) -> bool -pub vortex_btrblocks::IntCode::ZigZag +impl core::fmt::Debug for vortex_btrblocks::schemes::float::ALPRDScheme -impl core::clone::Clone for vortex_btrblocks::IntCode +pub fn vortex_btrblocks::schemes::float::ALPRDScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -pub fn vortex_btrblocks::IntCode::clone(&self) -> vortex_btrblocks::IntCode +impl core::marker::Copy for vortex_btrblocks::schemes::float::ALPRDScheme -impl core::cmp::Eq for vortex_btrblocks::IntCode +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::float::ALPRDScheme -impl core::cmp::Ord for vortex_btrblocks::IntCode +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::float::ALPRDScheme -pub fn vortex_btrblocks::IntCode::cmp(&self, other: &vortex_btrblocks::IntCode) -> core::cmp::Ordering +pub fn vortex_btrblocks::schemes::float::ALPRDScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -impl core::cmp::PartialEq for vortex_btrblocks::IntCode +pub fn vortex_btrblocks::schemes::float::ALPRDScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub fn vortex_btrblocks::IntCode::eq(&self, other: &vortex_btrblocks::IntCode) -> bool +pub fn vortex_btrblocks::schemes::float::ALPRDScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool -impl core::cmp::PartialOrd for vortex_btrblocks::IntCode +pub fn 
vortex_btrblocks::schemes::float::ALPRDScheme::scheme_name(&self) -> &'static str -pub fn vortex_btrblocks::IntCode::partial_cmp(&self, other: &vortex_btrblocks::IntCode) -> core::option::Option +pub struct vortex_btrblocks::schemes::float::ALPScheme -impl core::fmt::Debug for vortex_btrblocks::IntCode +impl core::clone::Clone for vortex_btrblocks::schemes::float::ALPScheme -pub fn vortex_btrblocks::IntCode::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub fn vortex_btrblocks::schemes::float::ALPScheme::clone(&self) -> vortex_btrblocks::schemes::float::ALPScheme -impl core::hash::Hash for vortex_btrblocks::IntCode +impl core::cmp::Eq for vortex_btrblocks::schemes::float::ALPScheme -pub fn vortex_btrblocks::IntCode::hash<__H: core::hash::Hasher>(&self, state: &mut __H) +impl core::cmp::PartialEq for vortex_btrblocks::schemes::float::ALPScheme -impl core::marker::Copy for vortex_btrblocks::IntCode +pub fn vortex_btrblocks::schemes::float::ALPScheme::eq(&self, other: &vortex_btrblocks::schemes::float::ALPScheme) -> bool -impl core::marker::StructuralPartialEq for vortex_btrblocks::IntCode +impl core::fmt::Debug for vortex_btrblocks::schemes::float::ALPScheme -impl enum_iterator::Sequence for vortex_btrblocks::IntCode +pub fn vortex_btrblocks::schemes::float::ALPScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -pub const vortex_btrblocks::IntCode::CARDINALITY: usize +impl core::marker::Copy for vortex_btrblocks::schemes::float::ALPScheme -pub fn vortex_btrblocks::IntCode::first() -> core::option::Option +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::float::ALPScheme -pub fn vortex_btrblocks::IntCode::last() -> core::option::Option +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::float::ALPScheme -pub fn vortex_btrblocks::IntCode::next(&self) -> core::option::Option +pub fn vortex_btrblocks::schemes::float::ALPScheme::compress(&self, compressor: 
&vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub fn vortex_btrblocks::IntCode::previous(&self) -> core::option::Option +pub fn vortex_btrblocks::schemes::float::ALPScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub enum vortex_btrblocks::StringCode +pub fn vortex_btrblocks::schemes::float::ALPScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool -pub vortex_btrblocks::StringCode::Constant +pub fn vortex_btrblocks::schemes::float::ALPScheme::num_children(&self) -> usize -pub vortex_btrblocks::StringCode::Dict +pub fn vortex_btrblocks::schemes::float::ALPScheme::scheme_name(&self) -> &'static str -pub vortex_btrblocks::StringCode::Fsst +pub struct vortex_btrblocks::schemes::float::NullDominatedSparseScheme -pub vortex_btrblocks::StringCode::Sparse +impl core::clone::Clone for vortex_btrblocks::schemes::float::NullDominatedSparseScheme -pub vortex_btrblocks::StringCode::Uncompressed +pub fn vortex_btrblocks::schemes::float::NullDominatedSparseScheme::clone(&self) -> vortex_btrblocks::schemes::float::NullDominatedSparseScheme -pub vortex_btrblocks::StringCode::Zstd +impl core::cmp::Eq for vortex_btrblocks::schemes::float::NullDominatedSparseScheme -pub vortex_btrblocks::StringCode::ZstdBuffers +impl core::cmp::PartialEq for vortex_btrblocks::schemes::float::NullDominatedSparseScheme -impl core::clone::Clone for vortex_btrblocks::StringCode +pub fn vortex_btrblocks::schemes::float::NullDominatedSparseScheme::eq(&self, other: &vortex_btrblocks::schemes::float::NullDominatedSparseScheme) -> bool -pub fn vortex_btrblocks::StringCode::clone(&self) -> vortex_btrblocks::StringCode +impl core::fmt::Debug for 
vortex_btrblocks::schemes::float::NullDominatedSparseScheme -impl core::cmp::Eq for vortex_btrblocks::StringCode +pub fn vortex_btrblocks::schemes::float::NullDominatedSparseScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -impl core::cmp::Ord for vortex_btrblocks::StringCode +impl core::marker::Copy for vortex_btrblocks::schemes::float::NullDominatedSparseScheme -pub fn vortex_btrblocks::StringCode::cmp(&self, other: &vortex_btrblocks::StringCode) -> core::cmp::Ordering +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::float::NullDominatedSparseScheme -impl core::cmp::PartialEq for vortex_btrblocks::StringCode +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::float::NullDominatedSparseScheme -pub fn vortex_btrblocks::StringCode::eq(&self, other: &vortex_btrblocks::StringCode) -> bool +pub fn vortex_btrblocks::schemes::float::NullDominatedSparseScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -impl core::cmp::PartialOrd for vortex_btrblocks::StringCode +pub fn vortex_btrblocks::schemes::float::NullDominatedSparseScheme::descendant_exclusions(&self) -> alloc::vec::Vec -pub fn vortex_btrblocks::StringCode::partial_cmp(&self, other: &vortex_btrblocks::StringCode) -> core::option::Option +pub fn vortex_btrblocks::schemes::float::NullDominatedSparseScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -impl core::fmt::Debug for vortex_btrblocks::StringCode +pub fn vortex_btrblocks::schemes::float::NullDominatedSparseScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool -pub fn vortex_btrblocks::StringCode::fmt(&self, 
f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub fn vortex_btrblocks::schemes::float::NullDominatedSparseScheme::num_children(&self) -> usize -impl core::hash::Hash for vortex_btrblocks::StringCode +pub fn vortex_btrblocks::schemes::float::NullDominatedSparseScheme::scheme_name(&self) -> &'static str -pub fn vortex_btrblocks::StringCode::hash<__H: core::hash::Hasher>(&self, state: &mut __H) +pub struct vortex_btrblocks::schemes::float::PcoScheme -impl core::marker::Copy for vortex_btrblocks::StringCode +impl core::clone::Clone for vortex_btrblocks::schemes::float::PcoScheme -impl core::marker::StructuralPartialEq for vortex_btrblocks::StringCode +pub fn vortex_btrblocks::schemes::float::PcoScheme::clone(&self) -> vortex_btrblocks::schemes::float::PcoScheme -impl enum_iterator::Sequence for vortex_btrblocks::StringCode +impl core::cmp::Eq for vortex_btrblocks::schemes::float::PcoScheme -pub const vortex_btrblocks::StringCode::CARDINALITY: usize +impl core::cmp::PartialEq for vortex_btrblocks::schemes::float::PcoScheme -pub fn vortex_btrblocks::StringCode::first() -> core::option::Option +pub fn vortex_btrblocks::schemes::float::PcoScheme::eq(&self, other: &vortex_btrblocks::schemes::float::PcoScheme) -> bool -pub fn vortex_btrblocks::StringCode::last() -> core::option::Option +impl core::fmt::Debug for vortex_btrblocks::schemes::float::PcoScheme -pub fn vortex_btrblocks::StringCode::next(&self) -> core::option::Option +pub fn vortex_btrblocks::schemes::float::PcoScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -pub fn vortex_btrblocks::StringCode::previous(&self) -> core::option::Option +impl core::marker::Copy for vortex_btrblocks::schemes::float::PcoScheme -pub struct vortex_btrblocks::BtrBlocksCompressor +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::float::PcoScheme -pub vortex_btrblocks::BtrBlocksCompressor::float_schemes: alloc::vec::Vec<&'static dyn vortex_btrblocks::compressor::float::FloatScheme> 
+impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::float::PcoScheme -pub vortex_btrblocks::BtrBlocksCompressor::int_schemes: alloc::vec::Vec<&'static dyn vortex_btrblocks::compressor::integer::IntegerScheme> +pub fn vortex_btrblocks::schemes::float::PcoScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub vortex_btrblocks::BtrBlocksCompressor::string_schemes: alloc::vec::Vec<&'static dyn vortex_btrblocks::compressor::string::StringScheme> +pub fn vortex_btrblocks::schemes::float::PcoScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool -impl vortex_btrblocks::BtrBlocksCompressor +pub fn vortex_btrblocks::schemes::float::PcoScheme::scheme_name(&self) -> &'static str -pub fn vortex_btrblocks::BtrBlocksCompressor::compress(&self, array: &vortex_array::array::ArrayRef) -> vortex_error::VortexResult +pub const vortex_btrblocks::schemes::float::RLE_FLOAT_SCHEME: vortex_btrblocks::schemes::rle::RLEScheme -impl core::clone::Clone for vortex_btrblocks::BtrBlocksCompressor +pub mod vortex_btrblocks::schemes::integer -pub fn vortex_btrblocks::BtrBlocksCompressor::clone(&self) -> vortex_btrblocks::BtrBlocksCompressor +pub use vortex_btrblocks::schemes::integer::IntConstantScheme -impl core::default::Default for vortex_btrblocks::BtrBlocksCompressor +pub use vortex_btrblocks::schemes::integer::IntDictScheme -pub fn vortex_btrblocks::BtrBlocksCompressor::default() -> Self +pub use vortex_btrblocks::schemes::integer::IntegerStats -impl vortex_btrblocks::CanonicalCompressor for vortex_btrblocks::BtrBlocksCompressor +pub use vortex_btrblocks::schemes::integer::is_integer_primitive -pub fn vortex_btrblocks::BtrBlocksCompressor::compress_canonical(&self, array: vortex_array::canonical::Canonical, ctx: vortex_btrblocks::ctx::CompressorContext, excludes: 
vortex_btrblocks::ctx::Excludes<'_>) -> vortex_error::VortexResult +pub struct vortex_btrblocks::schemes::integer::BitPackingScheme -pub fn vortex_btrblocks::BtrBlocksCompressor::float_schemes(&self) -> &[&'static dyn vortex_btrblocks::compressor::float::FloatScheme] +impl core::clone::Clone for vortex_btrblocks::schemes::integer::BitPackingScheme -pub fn vortex_btrblocks::BtrBlocksCompressor::int_schemes(&self) -> &[&'static dyn vortex_btrblocks::compressor::integer::IntegerScheme] +pub fn vortex_btrblocks::schemes::integer::BitPackingScheme::clone(&self) -> vortex_btrblocks::schemes::integer::BitPackingScheme -pub fn vortex_btrblocks::BtrBlocksCompressor::string_schemes(&self) -> &[&'static dyn vortex_btrblocks::compressor::string::StringScheme] +impl core::cmp::Eq for vortex_btrblocks::schemes::integer::BitPackingScheme -pub struct vortex_btrblocks::BtrBlocksCompressorBuilder +impl core::cmp::PartialEq for vortex_btrblocks::schemes::integer::BitPackingScheme -impl vortex_btrblocks::BtrBlocksCompressorBuilder +pub fn vortex_btrblocks::schemes::integer::BitPackingScheme::eq(&self, other: &vortex_btrblocks::schemes::integer::BitPackingScheme) -> bool -pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::build(self) -> vortex_btrblocks::BtrBlocksCompressor +impl core::fmt::Debug for vortex_btrblocks::schemes::integer::BitPackingScheme -pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::empty() -> Self +pub fn vortex_btrblocks::schemes::integer::BitPackingScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::exclude_float(self, codes: impl core::iter::traits::collect::IntoIterator) -> Self +impl core::marker::Copy for vortex_btrblocks::schemes::integer::BitPackingScheme -pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::exclude_int(self, codes: impl core::iter::traits::collect::IntoIterator) -> Self +impl core::marker::StructuralPartialEq for 
vortex_btrblocks::schemes::integer::BitPackingScheme -pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::exclude_string(self, codes: impl core::iter::traits::collect::IntoIterator) -> Self +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::integer::BitPackingScheme -pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::include_float(self, codes: impl core::iter::traits::collect::IntoIterator) -> Self +pub fn vortex_btrblocks::schemes::integer::BitPackingScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::include_int(self, codes: impl core::iter::traits::collect::IntoIterator) -> Self +pub fn vortex_btrblocks::schemes::integer::BitPackingScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::include_string(self, codes: impl core::iter::traits::collect::IntoIterator) -> Self +pub fn vortex_btrblocks::schemes::integer::BitPackingScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool -impl core::clone::Clone for vortex_btrblocks::BtrBlocksCompressorBuilder +pub fn vortex_btrblocks::schemes::integer::BitPackingScheme::scheme_name(&self) -> &'static str -pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::clone(&self) -> vortex_btrblocks::BtrBlocksCompressorBuilder +pub struct vortex_btrblocks::schemes::integer::FoRScheme -impl core::default::Default for vortex_btrblocks::BtrBlocksCompressorBuilder +impl core::clone::Clone for vortex_btrblocks::schemes::integer::FoRScheme -pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::default() -> Self +pub fn 
vortex_btrblocks::schemes::integer::FoRScheme::clone(&self) -> vortex_btrblocks::schemes::integer::FoRScheme -impl core::fmt::Debug for vortex_btrblocks::BtrBlocksCompressorBuilder +impl core::cmp::Eq for vortex_btrblocks::schemes::integer::FoRScheme -pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +impl core::cmp::PartialEq for vortex_btrblocks::schemes::integer::FoRScheme + +pub fn vortex_btrblocks::schemes::integer::FoRScheme::eq(&self, other: &vortex_btrblocks::schemes::integer::FoRScheme) -> bool + +impl core::fmt::Debug for vortex_btrblocks::schemes::integer::FoRScheme + +pub fn vortex_btrblocks::schemes::integer::FoRScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_btrblocks::schemes::integer::FoRScheme + +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::integer::FoRScheme + +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::integer::FoRScheme + +pub fn vortex_btrblocks::schemes::integer::FoRScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_btrblocks::schemes::integer::FoRScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::schemes::integer::FoRScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::schemes::integer::FoRScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_btrblocks::schemes::integer::FoRScheme::scheme_name(&self) -> &'static str + +pub struct vortex_btrblocks::schemes::integer::PcoScheme + +impl 
core::clone::Clone for vortex_btrblocks::schemes::integer::PcoScheme + +pub fn vortex_btrblocks::schemes::integer::PcoScheme::clone(&self) -> vortex_btrblocks::schemes::integer::PcoScheme + +impl core::cmp::Eq for vortex_btrblocks::schemes::integer::PcoScheme + +impl core::cmp::PartialEq for vortex_btrblocks::schemes::integer::PcoScheme + +pub fn vortex_btrblocks::schemes::integer::PcoScheme::eq(&self, other: &vortex_btrblocks::schemes::integer::PcoScheme) -> bool + +impl core::fmt::Debug for vortex_btrblocks::schemes::integer::PcoScheme + +pub fn vortex_btrblocks::schemes::integer::PcoScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_btrblocks::schemes::integer::PcoScheme + +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::integer::PcoScheme + +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::integer::PcoScheme + +pub fn vortex_btrblocks::schemes::integer::PcoScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::schemes::integer::PcoScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::schemes::integer::PcoScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_btrblocks::schemes::integer::PcoScheme::scheme_name(&self) -> &'static str + +pub struct vortex_btrblocks::schemes::integer::RunEndScheme + +impl core::clone::Clone for vortex_btrblocks::schemes::integer::RunEndScheme + +pub fn vortex_btrblocks::schemes::integer::RunEndScheme::clone(&self) -> vortex_btrblocks::schemes::integer::RunEndScheme + +impl 
core::cmp::Eq for vortex_btrblocks::schemes::integer::RunEndScheme + +impl core::cmp::PartialEq for vortex_btrblocks::schemes::integer::RunEndScheme + +pub fn vortex_btrblocks::schemes::integer::RunEndScheme::eq(&self, other: &vortex_btrblocks::schemes::integer::RunEndScheme) -> bool + +impl core::fmt::Debug for vortex_btrblocks::schemes::integer::RunEndScheme + +pub fn vortex_btrblocks::schemes::integer::RunEndScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_btrblocks::schemes::integer::RunEndScheme + +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::integer::RunEndScheme + +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::integer::RunEndScheme + +pub fn vortex_btrblocks::schemes::integer::RunEndScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_btrblocks::schemes::integer::RunEndScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::schemes::integer::RunEndScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_btrblocks::schemes::integer::RunEndScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::schemes::integer::RunEndScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_btrblocks::schemes::integer::RunEndScheme::num_children(&self) -> usize + +pub fn vortex_btrblocks::schemes::integer::RunEndScheme::scheme_name(&self) -> &'static str + +pub struct vortex_btrblocks::schemes::integer::SequenceScheme + +impl core::clone::Clone for 
vortex_btrblocks::schemes::integer::SequenceScheme + +pub fn vortex_btrblocks::schemes::integer::SequenceScheme::clone(&self) -> vortex_btrblocks::schemes::integer::SequenceScheme + +impl core::cmp::Eq for vortex_btrblocks::schemes::integer::SequenceScheme + +impl core::cmp::PartialEq for vortex_btrblocks::schemes::integer::SequenceScheme + +pub fn vortex_btrblocks::schemes::integer::SequenceScheme::eq(&self, other: &vortex_btrblocks::schemes::integer::SequenceScheme) -> bool + +impl core::fmt::Debug for vortex_btrblocks::schemes::integer::SequenceScheme + +pub fn vortex_btrblocks::schemes::integer::SequenceScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_btrblocks::schemes::integer::SequenceScheme + +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::integer::SequenceScheme + +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::integer::SequenceScheme + +pub fn vortex_btrblocks::schemes::integer::SequenceScheme::ancestor_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_btrblocks::schemes::integer::SequenceScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::schemes::integer::SequenceScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::schemes::integer::SequenceScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_btrblocks::schemes::integer::SequenceScheme::scheme_name(&self) -> &'static str + +pub struct vortex_btrblocks::schemes::integer::SparseScheme + +impl core::clone::Clone for 
vortex_btrblocks::schemes::integer::SparseScheme + +pub fn vortex_btrblocks::schemes::integer::SparseScheme::clone(&self) -> vortex_btrblocks::schemes::integer::SparseScheme + +impl core::cmp::Eq for vortex_btrblocks::schemes::integer::SparseScheme + +impl core::cmp::PartialEq for vortex_btrblocks::schemes::integer::SparseScheme + +pub fn vortex_btrblocks::schemes::integer::SparseScheme::eq(&self, other: &vortex_btrblocks::schemes::integer::SparseScheme) -> bool + +impl core::fmt::Debug for vortex_btrblocks::schemes::integer::SparseScheme + +pub fn vortex_btrblocks::schemes::integer::SparseScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_btrblocks::schemes::integer::SparseScheme + +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::integer::SparseScheme + +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::integer::SparseScheme + +pub fn vortex_btrblocks::schemes::integer::SparseScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::schemes::integer::SparseScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn vortex_btrblocks::schemes::integer::SparseScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::schemes::integer::SparseScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool -pub struct vortex_btrblocks::GenerateStatsOptions +pub fn vortex_btrblocks::schemes::integer::SparseScheme::num_children(&self) -> usize -pub vortex_btrblocks::GenerateStatsOptions::count_distinct_values: bool +pub fn 
vortex_btrblocks::schemes::integer::SparseScheme::scheme_name(&self) -> &'static str -impl core::default::Default for vortex_btrblocks::GenerateStatsOptions +pub fn vortex_btrblocks::schemes::integer::SparseScheme::stats_options(&self) -> vortex_compressor::stats::options::GenerateStatsOptions -pub fn vortex_btrblocks::GenerateStatsOptions::default() -> Self +pub struct vortex_btrblocks::schemes::integer::ZigZagScheme -pub struct vortex_btrblocks::IntegerStats +impl core::clone::Clone for vortex_btrblocks::schemes::integer::ZigZagScheme -impl core::clone::Clone for vortex_btrblocks::IntegerStats +pub fn vortex_btrblocks::schemes::integer::ZigZagScheme::clone(&self) -> vortex_btrblocks::schemes::integer::ZigZagScheme -pub fn vortex_btrblocks::IntegerStats::clone(&self) -> vortex_btrblocks::IntegerStats +impl core::cmp::Eq for vortex_btrblocks::schemes::integer::ZigZagScheme -impl core::fmt::Debug for vortex_btrblocks::IntegerStats +impl core::cmp::PartialEq for vortex_btrblocks::schemes::integer::ZigZagScheme -pub fn vortex_btrblocks::IntegerStats::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub fn vortex_btrblocks::schemes::integer::ZigZagScheme::eq(&self, other: &vortex_btrblocks::schemes::integer::ZigZagScheme) -> bool -impl vortex_btrblocks::CompressorStats for vortex_btrblocks::IntegerStats +impl core::fmt::Debug for vortex_btrblocks::schemes::integer::ZigZagScheme -pub type vortex_btrblocks::IntegerStats::ArrayVTable = vortex_array::arrays::primitive::vtable::Primitive +pub fn vortex_btrblocks::schemes::integer::ZigZagScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -pub fn vortex_btrblocks::IntegerStats::generate(input: &::Array) -> Self +impl core::marker::Copy for vortex_btrblocks::schemes::integer::ZigZagScheme -pub fn vortex_btrblocks::IntegerStats::generate_opts(input: &vortex_array::arrays::primitive::array::PrimitiveArray, opts: vortex_btrblocks::GenerateStatsOptions) -> Self +impl 
core::marker::StructuralPartialEq for vortex_btrblocks::schemes::integer::ZigZagScheme -pub fn vortex_btrblocks::IntegerStats::sample(&self, sample_size: u32, sample_count: u32) -> Self +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::integer::ZigZagScheme -pub fn vortex_btrblocks::IntegerStats::sample_opts(&self, sample_size: u32, sample_count: u32, opts: vortex_btrblocks::GenerateStatsOptions) -> Self +pub fn vortex_btrblocks::schemes::integer::ZigZagScheme::ancestor_exclusions(&self) -> alloc::vec::Vec -pub fn vortex_btrblocks::IntegerStats::source(&self) -> &vortex_array::arrays::primitive::array::PrimitiveArray +pub fn vortex_btrblocks::schemes::integer::ZigZagScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub trait vortex_btrblocks::CanonicalCompressor +pub fn vortex_btrblocks::schemes::integer::ZigZagScheme::descendant_exclusions(&self) -> alloc::vec::Vec -pub fn vortex_btrblocks::CanonicalCompressor::compress_canonical(&self, array: vortex_array::canonical::Canonical, ctx: vortex_btrblocks::ctx::CompressorContext, excludes: vortex_btrblocks::ctx::Excludes<'_>) -> vortex_error::VortexResult +pub fn vortex_btrblocks::schemes::integer::ZigZagScheme::expected_compression_ratio(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult -pub fn vortex_btrblocks::CanonicalCompressor::float_schemes(&self) -> &[&'static dyn vortex_btrblocks::compressor::float::FloatScheme] +pub fn vortex_btrblocks::schemes::integer::ZigZagScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool -pub fn vortex_btrblocks::CanonicalCompressor::int_schemes(&self) -> &[&'static dyn 
vortex_btrblocks::compressor::integer::IntegerScheme] +pub fn vortex_btrblocks::schemes::integer::ZigZagScheme::num_children(&self) -> usize -pub fn vortex_btrblocks::CanonicalCompressor::string_schemes(&self) -> &[&'static dyn vortex_btrblocks::compressor::string::StringScheme] +pub fn vortex_btrblocks::schemes::integer::ZigZagScheme::scheme_name(&self) -> &'static str -impl vortex_btrblocks::CanonicalCompressor for vortex_btrblocks::BtrBlocksCompressor +pub const vortex_btrblocks::schemes::integer::RLE_INTEGER_SCHEME: vortex_btrblocks::schemes::rle::RLEScheme -pub fn vortex_btrblocks::BtrBlocksCompressor::compress_canonical(&self, array: vortex_array::canonical::Canonical, ctx: vortex_btrblocks::ctx::CompressorContext, excludes: vortex_btrblocks::ctx::Excludes<'_>) -> vortex_error::VortexResult +pub mod vortex_btrblocks::schemes::string -pub fn vortex_btrblocks::BtrBlocksCompressor::float_schemes(&self) -> &[&'static dyn vortex_btrblocks::compressor::float::FloatScheme] +pub use vortex_btrblocks::schemes::string::StringConstantScheme -pub fn vortex_btrblocks::BtrBlocksCompressor::int_schemes(&self) -> &[&'static dyn vortex_btrblocks::compressor::integer::IntegerScheme] +pub use vortex_btrblocks::schemes::string::StringDictScheme -pub fn vortex_btrblocks::BtrBlocksCompressor::string_schemes(&self) -> &[&'static dyn vortex_btrblocks::compressor::string::StringScheme] +pub use vortex_btrblocks::schemes::string::StringStats -pub trait vortex_btrblocks::CompressorStats: core::fmt::Debug + core::clone::Clone +pub use vortex_btrblocks::schemes::string::is_utf8_string -pub type vortex_btrblocks::CompressorStats::ArrayVTable: vortex_array::vtable::VTable +pub struct vortex_btrblocks::schemes::string::FSSTScheme -pub fn vortex_btrblocks::CompressorStats::generate(input: &::Array) -> Self +impl core::clone::Clone for vortex_btrblocks::schemes::string::FSSTScheme -pub fn vortex_btrblocks::CompressorStats::generate_opts(input: &::Array, opts: 
vortex_btrblocks::GenerateStatsOptions) -> Self +pub fn vortex_btrblocks::schemes::string::FSSTScheme::clone(&self) -> vortex_btrblocks::schemes::string::FSSTScheme -pub fn vortex_btrblocks::CompressorStats::sample(&self, sample_size: u32, sample_count: u32) -> Self +impl core::cmp::Eq for vortex_btrblocks::schemes::string::FSSTScheme -pub fn vortex_btrblocks::CompressorStats::sample_opts(&self, sample_size: u32, sample_count: u32, opts: vortex_btrblocks::GenerateStatsOptions) -> Self +impl core::cmp::PartialEq for vortex_btrblocks::schemes::string::FSSTScheme -pub fn vortex_btrblocks::CompressorStats::source(&self) -> &::Array +pub fn vortex_btrblocks::schemes::string::FSSTScheme::eq(&self, other: &vortex_btrblocks::schemes::string::FSSTScheme) -> bool -impl vortex_btrblocks::CompressorStats for vortex_btrblocks::IntegerStats +impl core::fmt::Debug for vortex_btrblocks::schemes::string::FSSTScheme -pub type vortex_btrblocks::IntegerStats::ArrayVTable = vortex_array::arrays::primitive::vtable::Primitive +pub fn vortex_btrblocks::schemes::string::FSSTScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -pub fn vortex_btrblocks::IntegerStats::generate(input: &::Array) -> Self +impl core::marker::Copy for vortex_btrblocks::schemes::string::FSSTScheme -pub fn vortex_btrblocks::IntegerStats::generate_opts(input: &vortex_array::arrays::primitive::array::PrimitiveArray, opts: vortex_btrblocks::GenerateStatsOptions) -> Self +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::string::FSSTScheme -pub fn vortex_btrblocks::IntegerStats::sample(&self, sample_size: u32, sample_count: u32) -> Self +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::string::FSSTScheme + +pub fn vortex_btrblocks::schemes::string::FSSTScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> 
vortex_error::VortexResult + +pub fn vortex_btrblocks::schemes::string::FSSTScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_btrblocks::schemes::string::FSSTScheme::num_children(&self) -> usize + +pub fn vortex_btrblocks::schemes::string::FSSTScheme::scheme_name(&self) -> &'static str + +pub struct vortex_btrblocks::schemes::string::NullDominatedSparseScheme + +impl core::clone::Clone for vortex_btrblocks::schemes::string::NullDominatedSparseScheme + +pub fn vortex_btrblocks::schemes::string::NullDominatedSparseScheme::clone(&self) -> vortex_btrblocks::schemes::string::NullDominatedSparseScheme + +impl core::cmp::Eq for vortex_btrblocks::schemes::string::NullDominatedSparseScheme + +impl core::cmp::PartialEq for vortex_btrblocks::schemes::string::NullDominatedSparseScheme + +pub fn vortex_btrblocks::schemes::string::NullDominatedSparseScheme::eq(&self, other: &vortex_btrblocks::schemes::string::NullDominatedSparseScheme) -> bool + +impl core::fmt::Debug for vortex_btrblocks::schemes::string::NullDominatedSparseScheme + +pub fn vortex_btrblocks::schemes::string::NullDominatedSparseScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_btrblocks::schemes::string::NullDominatedSparseScheme + +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::string::NullDominatedSparseScheme + +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::string::NullDominatedSparseScheme + +pub fn vortex_btrblocks::schemes::string::NullDominatedSparseScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::schemes::string::NullDominatedSparseScheme::descendant_exclusions(&self) -> alloc::vec::Vec + +pub fn 
vortex_btrblocks::schemes::string::NullDominatedSparseScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::schemes::string::NullDominatedSparseScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_btrblocks::schemes::string::NullDominatedSparseScheme::num_children(&self) -> usize + +pub fn vortex_btrblocks::schemes::string::NullDominatedSparseScheme::scheme_name(&self) -> &'static str + +pub struct vortex_btrblocks::schemes::string::ZstdScheme + +impl core::clone::Clone for vortex_btrblocks::schemes::string::ZstdScheme + +pub fn vortex_btrblocks::schemes::string::ZstdScheme::clone(&self) -> vortex_btrblocks::schemes::string::ZstdScheme + +impl core::cmp::Eq for vortex_btrblocks::schemes::string::ZstdScheme + +impl core::cmp::PartialEq for vortex_btrblocks::schemes::string::ZstdScheme + +pub fn vortex_btrblocks::schemes::string::ZstdScheme::eq(&self, other: &vortex_btrblocks::schemes::string::ZstdScheme) -> bool + +impl core::fmt::Debug for vortex_btrblocks::schemes::string::ZstdScheme + +pub fn vortex_btrblocks::schemes::string::ZstdScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_btrblocks::schemes::string::ZstdScheme + +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::string::ZstdScheme + +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::string::ZstdScheme + +pub fn vortex_btrblocks::schemes::string::ZstdScheme::compress(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::schemes::string::ZstdScheme::matches(&self, 
canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_btrblocks::schemes::string::ZstdScheme::scheme_name(&self) -> &'static str + +pub mod vortex_btrblocks::schemes::temporal + +pub struct vortex_btrblocks::schemes::temporal::TemporalScheme + +impl core::clone::Clone for vortex_btrblocks::schemes::temporal::TemporalScheme + +pub fn vortex_btrblocks::schemes::temporal::TemporalScheme::clone(&self) -> vortex_btrblocks::schemes::temporal::TemporalScheme + +impl core::cmp::Eq for vortex_btrblocks::schemes::temporal::TemporalScheme + +impl core::cmp::PartialEq for vortex_btrblocks::schemes::temporal::TemporalScheme + +pub fn vortex_btrblocks::schemes::temporal::TemporalScheme::eq(&self, other: &vortex_btrblocks::schemes::temporal::TemporalScheme) -> bool + +impl core::fmt::Debug for vortex_btrblocks::schemes::temporal::TemporalScheme + +pub fn vortex_btrblocks::schemes::temporal::TemporalScheme::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_btrblocks::schemes::temporal::TemporalScheme + +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::temporal::TemporalScheme + +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::temporal::TemporalScheme + +pub fn vortex_btrblocks::schemes::temporal::TemporalScheme::compress(&self, compressor: &vortex_compressor::compressor::CascadingCompressor, data: &mut vortex_compressor::stats::cache::ArrayAndStats, ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::schemes::temporal::TemporalScheme::detects_constant(&self) -> bool + +pub fn vortex_btrblocks::schemes::temporal::TemporalScheme::expected_compression_ratio(&self, _compressor: &vortex_compressor::compressor::CascadingCompressor, _data: &mut vortex_compressor::stats::cache::ArrayAndStats, _ctx: vortex_compressor::ctx::CompressorContext) -> vortex_error::VortexResult + +pub fn 
vortex_btrblocks::schemes::temporal::TemporalScheme::matches(&self, canonical: &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_btrblocks::schemes::temporal::TemporalScheme::num_children(&self) -> usize + +pub fn vortex_btrblocks::schemes::temporal::TemporalScheme::scheme_name(&self) -> &'static str + +pub struct vortex_btrblocks::BtrBlocksCompressor(pub vortex_compressor::compressor::CascadingCompressor) + +impl vortex_btrblocks::BtrBlocksCompressor + +pub fn vortex_btrblocks::BtrBlocksCompressor::compress(&self, array: &vortex_array::array::ArrayRef) -> vortex_error::VortexResult + +impl core::clone::Clone for vortex_btrblocks::BtrBlocksCompressor + +pub fn vortex_btrblocks::BtrBlocksCompressor::clone(&self) -> vortex_btrblocks::BtrBlocksCompressor + +impl core::default::Default for vortex_btrblocks::BtrBlocksCompressor + +pub fn vortex_btrblocks::BtrBlocksCompressor::default() -> Self + +impl core::ops::deref::Deref for vortex_btrblocks::BtrBlocksCompressor + +pub type vortex_btrblocks::BtrBlocksCompressor::Target = vortex_compressor::compressor::CascadingCompressor + +pub fn vortex_btrblocks::BtrBlocksCompressor::deref(&self) -> &vortex_compressor::compressor::CascadingCompressor + +pub struct vortex_btrblocks::BtrBlocksCompressorBuilder + +impl vortex_btrblocks::BtrBlocksCompressorBuilder + +pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::build(self) -> vortex_btrblocks::BtrBlocksCompressor + +pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::exclude(self, ids: impl core::iter::traits::collect::IntoIterator) -> Self + +pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::include(self, ids: impl core::iter::traits::collect::IntoIterator) -> Self + +pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::with_scheme(self, scheme: &'static dyn vortex_compressor::scheme::Scheme) -> Self + +impl core::clone::Clone for vortex_btrblocks::BtrBlocksCompressorBuilder + +pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::clone(&self) -> 
vortex_btrblocks::BtrBlocksCompressorBuilder + +impl core::default::Default for vortex_btrblocks::BtrBlocksCompressorBuilder + +pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::default() -> Self + +impl core::fmt::Debug for vortex_btrblocks::BtrBlocksCompressorBuilder + +pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result -pub fn vortex_btrblocks::IntegerStats::sample_opts(&self, sample_size: u32, sample_count: u32, opts: vortex_btrblocks::GenerateStatsOptions) -> Self +pub const vortex_btrblocks::ALL_SCHEMES: &[&dyn vortex_compressor::scheme::Scheme] -pub fn vortex_btrblocks::IntegerStats::source(&self) -> &vortex_array::arrays::primitive::array::PrimitiveArray +pub fn vortex_btrblocks::compress_patches(patches: &vortex_array::patches::Patches) -> vortex_error::VortexResult -pub fn vortex_btrblocks::integer_dictionary_encode(stats: &vortex_btrblocks::IntegerStats) -> vortex_array::arrays::dict::array::DictArray +pub fn vortex_btrblocks::default_excluded() -> vortex_utils::aliases::hash_set::HashSet diff --git a/vortex-btrblocks/src/builder.rs b/vortex-btrblocks/src/builder.rs index d329ec8c139..a0a3e12ab26 100644 --- a/vortex-btrblocks/src/builder.rs +++ b/vortex-btrblocks/src/builder.rs @@ -3,156 +3,165 @@ //! Builder for configuring `BtrBlocksCompressor` instances. 
-use itertools::Itertools; use vortex_utils::aliases::hash_set::HashSet; use crate::BtrBlocksCompressor; -use crate::FloatCode; -use crate::IntCode; -use crate::StringCode; -use crate::compressor::float::ALL_FLOAT_SCHEMES; -use crate::compressor::float::FloatScheme; -use crate::compressor::integer::ALL_INT_SCHEMES; -use crate::compressor::integer::IntegerScheme; -use crate::compressor::string::ALL_STRING_SCHEMES; -use crate::compressor::string::StringScheme; +use crate::CascadingCompressor; +use crate::Scheme; +use crate::SchemeExt; +use crate::SchemeId; +use crate::schemes::decimal; +use crate::schemes::float; +use crate::schemes::integer; +use crate::schemes::rle; +use crate::schemes::string; +use crate::schemes::temporal; + +/// All available compression schemes. +/// +/// This list is order-sensitive: the builder preserves this order when constructing +/// the final scheme list, so that tie-breaking is deterministic. +pub const ALL_SCHEMES: &[&dyn Scheme] = &[ + //////////////////////////////////////////////////////////////////////////////////////////////// + // Integer schemes. + //////////////////////////////////////////////////////////////////////////////////////////////// + &integer::IntConstantScheme, + // NOTE: FoR must precede BitPacking to avoid unnecessary patches. + &integer::FoRScheme, + // NOTE: ZigZag should precede BitPacking because we don't want negative numbers. + &integer::ZigZagScheme, + &integer::BitPackingScheme, + &integer::SparseScheme, + &integer::IntDictScheme, + &integer::RunEndScheme, + &integer::SequenceScheme, + &rle::RLE_INTEGER_SCHEME, + #[cfg(feature = "pco")] + &integer::PcoScheme, + //////////////////////////////////////////////////////////////////////////////////////////////// + // Float schemes. 
+ //////////////////////////////////////////////////////////////////////////////////////////////// + &float::FloatConstantScheme, + &float::ALPScheme, + &float::ALPRDScheme, + &float::FloatDictScheme, + &float::NullDominatedSparseScheme, + &rle::RLE_FLOAT_SCHEME, + #[cfg(feature = "pco")] + &float::PcoScheme, + //////////////////////////////////////////////////////////////////////////////////////////////// + // String schemes. + //////////////////////////////////////////////////////////////////////////////////////////////// + &string::StringDictScheme, + &string::FSSTScheme, + &string::StringConstantScheme, + &string::NullDominatedSparseScheme, + #[cfg(feature = "zstd")] + &string::ZstdScheme, + #[cfg(all(feature = "zstd", feature = "unstable_encodings"))] + &string::ZstdBuffersScheme, + // Decimal schemes. + &decimal::DecimalScheme, + // Temporal schemes. + &temporal::TemporalScheme, +]; + +/// Returns the set of scheme IDs excluded by default (behind feature gates or known-expensive). +pub fn default_excluded() -> HashSet { + #[allow(unused_mut, reason = "depends on enabled feature flags")] + let mut excluded = HashSet::new(); + #[cfg(feature = "pco")] + { + excluded.insert(integer::PcoScheme.id()); + excluded.insert(float::PcoScheme.id()); + } + #[cfg(feature = "zstd")] + excluded.insert(string::ZstdScheme.id()); + #[cfg(all(feature = "zstd", feature = "unstable_encodings"))] + excluded.insert(string::ZstdBuffersScheme.id()); + excluded +} /// Builder for creating configured [`BtrBlocksCompressor`] instances. /// -/// Use this builder to configure which compression schemes are allowed for each data type. -/// By default, all schemes are enabled. +/// Use this builder to configure which compression schemes are allowed. +/// By default, all schemes are enabled except those in [`default_excluded`]. 
/// /// # Examples /// /// ```rust -/// use vortex_btrblocks::{BtrBlocksCompressorBuilder, IntCode, FloatCode}; +/// use vortex_btrblocks::{BtrBlocksCompressorBuilder, Scheme, SchemeExt}; +/// use vortex_btrblocks::schemes::integer::IntDictScheme; /// -/// // Default compressor - all schemes allowed +/// // Default compressor - all non-excluded schemes allowed. /// let compressor = BtrBlocksCompressorBuilder::default().build(); /// -/// // Exclude specific schemes +/// // Exclude specific schemes. /// let compressor = BtrBlocksCompressorBuilder::default() -/// .exclude_int([IntCode::Dict]) +/// .exclude([IntDictScheme.id()]) /// .build(); /// -/// // Exclude then re-include +/// // Exclude then re-include. /// let compressor = BtrBlocksCompressorBuilder::default() -/// .exclude_int([IntCode::Dict, IntCode::Rle]) -/// .include_int([IntCode::Dict]) +/// .exclude([IntDictScheme.id()]) +/// .include([IntDictScheme.id()]) /// .build(); /// ``` #[derive(Debug, Clone)] pub struct BtrBlocksCompressorBuilder { - int_schemes: HashSet<&'static dyn IntegerScheme>, - float_schemes: HashSet<&'static dyn FloatScheme>, - string_schemes: HashSet<&'static dyn StringScheme>, + schemes: HashSet<&'static dyn Scheme>, } impl Default for BtrBlocksCompressorBuilder { fn default() -> Self { + let excluded = default_excluded(); Self { - int_schemes: ALL_INT_SCHEMES - .iter() - .copied() - .filter(|s| s.code() != IntCode::Pco) - .collect(), - float_schemes: ALL_FLOAT_SCHEMES + schemes: ALL_SCHEMES .iter() .copied() - .filter(|s| s.code() != FloatCode::Pco) - .collect(), - string_schemes: ALL_STRING_SCHEMES - .iter() - .copied() - .filter(|s| s.code() != StringCode::Zstd && s.code() != StringCode::ZstdBuffers) + .filter(|s| !excluded.contains(&s.id())) .collect(), } } } impl BtrBlocksCompressorBuilder { - /// Create a new builder with no encodings enabled. 
- pub fn empty() -> Self { - Self { - int_schemes: Default::default(), - float_schemes: Default::default(), - string_schemes: Default::default(), - } - } - - /// Excludes the specified integer compression schemes. - pub fn exclude_int(mut self, codes: impl IntoIterator) -> Self { - let codes: HashSet<_> = codes.into_iter().collect(); - self.int_schemes.retain(|s| !codes.contains(&s.code())); + /// Excludes the specified compression schemes by their [`SchemeId`]. + pub fn exclude(mut self, ids: impl IntoIterator) -> Self { + let ids: HashSet<_> = ids.into_iter().collect(); + self.schemes.retain(|s| !ids.contains(&s.id())); self } - /// Excludes the specified float compression schemes. - pub fn exclude_float(mut self, codes: impl IntoIterator) -> Self { - let codes: HashSet<_> = codes.into_iter().collect(); - self.float_schemes.retain(|s| !codes.contains(&s.code())); - self - } - - /// Excludes the specified string compression schemes. - pub fn exclude_string(mut self, codes: impl IntoIterator) -> Self { - let codes: HashSet<_> = codes.into_iter().collect(); - self.string_schemes.retain(|s| !codes.contains(&s.code())); - self - } - - /// Includes the specified integer compression schemes. - pub fn include_int(mut self, codes: impl IntoIterator) -> Self { - let codes: HashSet<_> = codes.into_iter().collect(); - for scheme in ALL_INT_SCHEMES { - if codes.contains(&scheme.code()) { - self.int_schemes.insert(*scheme); - } - } - self - } - - /// Includes the specified float compression schemes. - pub fn include_float(mut self, codes: impl IntoIterator) -> Self { - let codes: HashSet<_> = codes.into_iter().collect(); - for scheme in ALL_FLOAT_SCHEMES { - if codes.contains(&scheme.code()) { - self.float_schemes.insert(*scheme); + /// Includes the specified compression schemes by their [`SchemeId`]. + /// + /// Only schemes present in [`ALL_SCHEMES`] can be included. 
+ pub fn include(mut self, ids: impl IntoIterator) -> Self { + let ids: HashSet<_> = ids.into_iter().collect(); + for scheme in ALL_SCHEMES { + if ids.contains(&scheme.id()) { + self.schemes.insert(*scheme); } } self } - /// Includes the specified string compression schemes. - pub fn include_string(mut self, codes: impl IntoIterator) -> Self { - let codes: HashSet<_> = codes.into_iter().collect(); - for scheme in ALL_STRING_SCHEMES { - if codes.contains(&scheme.code()) { - self.string_schemes.insert(*scheme); - } - } + /// Adds a single scheme to the builder. + pub fn with_scheme(mut self, scheme: &'static dyn Scheme) -> Self { + self.schemes.insert(scheme); self } - /// Builds the configured `BtrBlocksCompressor`. + /// Builds the configured [`BtrBlocksCompressor`]. + /// + /// The resulting scheme list preserves the order of [`ALL_SCHEMES`] for deterministic + /// tie-breaking. pub fn build(self) -> BtrBlocksCompressor { - // Note we should apply the schemes in the same order, in case try conflict. - BtrBlocksCompressor { - int_schemes: self - .int_schemes - .into_iter() - .sorted_by_key(|s| s.code()) - .collect_vec(), - float_schemes: self - .float_schemes - .into_iter() - .sorted_by_key(|s| s.code()) - .collect_vec(), - string_schemes: self - .string_schemes - .into_iter() - .sorted_by_key(|s| s.code()) - .collect_vec(), - } + let schemes = ALL_SCHEMES + .iter() + .copied() + .filter(|s| self.schemes.contains(s)) + .collect(); + BtrBlocksCompressor(CascadingCompressor::new(schemes)) } } diff --git a/vortex-btrblocks/src/canonical_compressor.rs b/vortex-btrblocks/src/canonical_compressor.rs index 410dda0b599..4ba118defc9 100644 --- a/vortex-btrblocks/src/canonical_compressor.rs +++ b/vortex-btrblocks/src/canonical_compressor.rs @@ -1,316 +1,59 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Canonical array compression implementation. +//! 
BtrBlocks-specific compressor wrapping the generic [`CascadingCompressor`]. + +use std::ops::Deref; use vortex_array::ArrayRef; -use vortex_array::Canonical; -use vortex_array::CanonicalValidity; -use vortex_array::DynArray; -use vortex_array::IntoArray; -use vortex_array::LEGACY_SESSION; -use vortex_array::ToCanonical; -use vortex_array::VortexSessionExecute; -use vortex_array::aggregate_fn::fns::is_constant::is_constant; -use vortex_array::arrays::ConstantArray; -use vortex_array::arrays::ExtensionArray; -use vortex_array::arrays::FixedSizeListArray; -use vortex_array::arrays::ListArray; -use vortex_array::arrays::ListViewArray; -use vortex_array::arrays::StructArray; -use vortex_array::arrays::TemporalArray; -use vortex_array::arrays::listview::list_from_list_view; -use vortex_array::dtype::DType; -use vortex_array::dtype::Nullability; -use vortex_array::extension::datetime::TemporalMetadata; -use vortex_array::vtable::ValidityHelper; use vortex_error::VortexResult; use crate::BtrBlocksCompressorBuilder; -use crate::CompressorContext; -use crate::CompressorExt; -use crate::Excludes; -use crate::FloatCompressor; -use crate::IntCode; -use crate::IntCompressor; -use crate::StringCompressor; -use crate::compressor::decimal::compress_decimal; -use crate::compressor::float::FloatScheme; -use crate::compressor::integer::IntegerScheme; -use crate::compressor::string::StringScheme; -use crate::compressor::temporal::compress_temporal; - -/// Trait for compressors that can compress canonical arrays. -/// -/// Provides access to configured compression schemes and the ability to -/// compress canonical arrays recursively. -pub trait CanonicalCompressor { - /// Compresses a canonical array with the specified options. - fn compress_canonical( - &self, - array: Canonical, - ctx: CompressorContext, - excludes: Excludes, - ) -> VortexResult; - - /// Returns the enabled integer compression schemes. 
- fn int_schemes(&self) -> &[&'static dyn IntegerScheme]; - - /// Returns the enabled float compression schemes. - fn float_schemes(&self) -> &[&'static dyn FloatScheme]; - - /// Returns the enabled string compression schemes. - fn string_schemes(&self) -> &[&'static dyn StringScheme]; -} +use crate::CascadingCompressor; -/// The main compressor type implementing BtrBlocks-inspired compression. -/// -/// This compressor applies adaptive compression schemes to arrays based on their data types -/// and characteristics. It recursively compresses nested structures like structs and lists, -/// and chooses optimal compression schemes for primitive types. -/// -/// The compressor works by: -/// 1. Canonicalizing input arrays to a standard representation -/// 2. Analyzing data characteristics to choose optimal compression schemes -/// 3. Recursively compressing nested structures -/// 4. Applying type-specific compression for primitives, strings, and temporal data +/// The BtrBlocks-style compressor with all built-in schemes pre-registered. /// -/// Use [`BtrBlocksCompressorBuilder`] to configure which compression schemes are enabled. +/// This is a thin wrapper around [`CascadingCompressor`] that provides a default set of +/// compression schemes via [`BtrBlocksCompressorBuilder`]. /// /// # Examples /// /// ```rust -/// use vortex_btrblocks::{BtrBlocksCompressor, BtrBlocksCompressorBuilder, IntCode}; +/// use vortex_btrblocks::{BtrBlocksCompressor, BtrBlocksCompressorBuilder, Scheme, SchemeExt}; +/// use vortex_btrblocks::schemes::integer::IntDictScheme; /// -/// // Default compressor - all schemes allowed +/// // Default compressor - all schemes allowed. /// let compressor = BtrBlocksCompressor::default(); /// -/// // Exclude specific schemes using the builder +/// // Exclude specific schemes using the builder. 
/// let compressor = BtrBlocksCompressorBuilder::default() -/// .exclude_int([IntCode::Dict]) +/// .exclude([IntDictScheme.id()]) /// .build(); /// ``` #[derive(Clone)] -pub struct BtrBlocksCompressor { - /// Integer compressor with configured schemes. - pub int_schemes: Vec<&'static dyn IntegerScheme>, - - /// Float compressor with configured schemes. - pub float_schemes: Vec<&'static dyn FloatScheme>, - - /// String compressor with configured schemes. - pub string_schemes: Vec<&'static dyn StringScheme>, -} - -impl Default for BtrBlocksCompressor { - fn default() -> Self { - BtrBlocksCompressorBuilder::default().build() - } -} +pub struct BtrBlocksCompressor( + /// The underlying cascading compressor. + pub CascadingCompressor, +); impl BtrBlocksCompressor { /// Compresses an array using BtrBlocks-inspired compression. - /// - /// First canonicalizes and compacts the array, then applies optimal compression schemes. pub fn compress(&self, array: &ArrayRef) -> VortexResult { - // Canonicalize the array - // TODO(joe): receive `ctx` and use it. - let canonical = array - .clone() - .execute::(&mut LEGACY_SESSION.create_execution_ctx())? - .0; - - // Compact it, removing any wasted space before we attempt to compress it - let compact = canonical.compact()?; - - self.compress_canonical(compact, CompressorContext::default(), Excludes::none()) - } - - pub(crate) fn integer_compressor(&self) -> IntCompressor<'_> { - IntCompressor { - btr_blocks_compressor: self, - } - } - - pub(crate) fn float_compressor(&self) -> FloatCompressor<'_> { - FloatCompressor { - btr_blocks_compressor: self, - } - } - - pub(crate) fn string_compressor(&self) -> StringCompressor<'_> { - StringCompressor { - btr_blocks_compressor: self, - } - } - - /// Compresses a [`ListArray`] by narrowing offsets and recursively compressing elements. 
- fn compress_list_array( - &self, - list_array: ListArray, - ctx: CompressorContext, - ) -> VortexResult { - // Reset the offsets to remove garbage data that might prevent us from narrowing our - // offsets (there could be a large amount of trailing garbage data that the current - // views do not reference at all). - let list_array = list_array.reset_offsets(true)?; - - let compressed_elems = self.compress(list_array.elements())?; - - // Note that since the type of our offsets are not encoded in our `DType`, and since - // we guarantee above that all elements are referenced by offsets, we may narrow the - // widths. - let compressed_offsets = self.compress_canonical( - Canonical::Primitive(list_array.offsets().to_primitive().narrow()?), - ctx, - Excludes::from(&[IntCode::Dict]), - )?; - - Ok(ListArray::try_new( - compressed_elems, - compressed_offsets, - list_array.validity().clone(), - )? - .into_array()) - } - - /// Compresses a [`ListViewArray`] by narrowing offsets/sizes and recursively compressing - /// elements. - fn compress_list_view_array( - &self, - list_view: ListViewArray, - ctx: CompressorContext, - ) -> VortexResult { - let compressed_elems = self.compress(list_view.elements())?; - let compressed_offsets = self.compress_canonical( - Canonical::Primitive(list_view.offsets().to_primitive().narrow()?), - ctx, - Excludes::none(), - )?; - let compressed_sizes = self.compress_canonical( - Canonical::Primitive(list_view.sizes().to_primitive().narrow()?), - ctx, - Excludes::none(), - )?; - Ok(ListViewArray::try_new( - compressed_elems, - compressed_offsets, - compressed_sizes, - list_view.validity().clone(), - )? - .into_array()) + self.0.compress(array) } } -impl CanonicalCompressor for BtrBlocksCompressor { - /// Compresses a canonical array by dispatching to type-specific compressors. - /// - /// Recursively compresses nested structures and applies optimal schemes for each data type. 
- fn compress_canonical( - &self, - array: Canonical, - ctx: CompressorContext, - excludes: Excludes, - ) -> VortexResult { - match array { - Canonical::Null(null_array) => Ok(null_array.into_array()), - // TODO(aduffy): Sparse, other bool compressors. - Canonical::Bool(bool_array) => Ok(bool_array.into_array()), - Canonical::Primitive(primitive) => { - if primitive.ptype().is_int() { - self.integer_compressor() - .compress(self, &primitive, ctx, excludes.int) - } else { - self.float_compressor() - .compress(self, &primitive, ctx, excludes.float) - } - } - Canonical::Decimal(decimal) => compress_decimal(self, &decimal), - Canonical::Struct(struct_array) => { - let fields = struct_array - .unmasked_fields() - .iter() - .map(|field| self.compress(field)) - .collect::, _>>()?; +impl Deref for BtrBlocksCompressor { + type Target = CascadingCompressor; - Ok(StructArray::try_new( - struct_array.names().clone(), - fields, - struct_array.len(), - struct_array.validity().clone(), - )? - .into_array()) - } - Canonical::List(list_view_array) => { - if list_view_array.is_zero_copy_to_list() || list_view_array.elements().is_empty() { - // Offsets are already monotonic and non-overlapping, so we - // can drop the sizes array and compress as a ListArray. - let list_array = list_from_list_view(list_view_array)?; - self.compress_list_array(list_array, ctx) - } else { - self.compress_list_view_array(list_view_array, ctx) - } - } - Canonical::FixedSizeList(fsl_array) => { - let compressed_elems = self.compress(fsl_array.elements())?; - - Ok(FixedSizeListArray::try_new( - compressed_elems, - fsl_array.list_size(), - fsl_array.validity().clone(), - fsl_array.len(), - )? 
- .into_array()) - } - Canonical::VarBinView(strings) => { - if strings - .dtype() - .eq_ignore_nullability(&DType::Utf8(Nullability::NonNullable)) - { - self.string_compressor() - .compress(self, &strings, ctx, excludes.string) - } else { - // Binary arrays do not compress - Ok(strings.into_array()) - } - } - Canonical::Extension(ext_array) => { - // We compress Timestamp-level arrays with DateTimeParts compression - if let Ok(temporal_array) = TemporalArray::try_from(ext_array.clone().into_array()) - && let TemporalMetadata::Timestamp(..) = temporal_array.temporal_metadata() - { - let mut ctx = LEGACY_SESSION.create_execution_ctx(); - if is_constant(&ext_array.clone().into_array(), &mut ctx)? { - return Ok(ConstantArray::new( - temporal_array.as_ref().scalar_at(0)?, - ext_array.len(), - ) - .into_array()); - } - return compress_temporal(self, temporal_array); - } - - // Compress the underlying storage array. - let compressed_storage = self.compress(ext_array.storage_array())?; - - Ok( - ExtensionArray::new(ext_array.ext_dtype().clone(), compressed_storage) - .into_array(), - ) - } - } - } - - fn int_schemes(&self) -> &[&'static dyn IntegerScheme] { - &self.int_schemes - } - - fn float_schemes(&self) -> &[&'static dyn FloatScheme] { - &self.float_schemes + fn deref(&self) -> &CascadingCompressor { + &self.0 } +} - fn string_schemes(&self) -> &[&'static dyn StringScheme] { - &self.string_schemes +impl Default for BtrBlocksCompressor { + fn default() -> Self { + BtrBlocksCompressorBuilder::default().build() } } diff --git a/vortex-btrblocks/src/compressor/decimal.rs b/vortex-btrblocks/src/compressor/decimal.rs deleted file mode 100644 index bf738a72839..00000000000 --- a/vortex-btrblocks/src/compressor/decimal.rs +++ /dev/null @@ -1,43 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -use vortex_array::ArrayRef; -use vortex_array::Canonical; -use vortex_array::IntoArray; -use 
vortex_array::arrays::DecimalArray; -use vortex_array::arrays::PrimitiveArray; -use vortex_array::arrays::decimal::narrowed_decimal; -use vortex_array::dtype::DecimalType; -use vortex_array::vtable::ValidityHelper; -use vortex_decimal_byte_parts::DecimalBytePartsArray; -use vortex_error::VortexResult; - -use crate::BtrBlocksCompressor; -use crate::CanonicalCompressor; -use crate::CompressorContext; -use crate::Excludes; - -// TODO(joe): add support splitting i128/256 buffers into chunks primitive values for compression. -// 2 for i128 and 4 for i256 -pub fn compress_decimal( - compressor: &BtrBlocksCompressor, - decimal: &DecimalArray, -) -> VortexResult { - let decimal = narrowed_decimal(decimal.clone()); - let validity = decimal.validity(); - let prim = match decimal.values_type() { - DecimalType::I8 => PrimitiveArray::new(decimal.buffer::(), validity.clone()), - DecimalType::I16 => PrimitiveArray::new(decimal.buffer::(), validity.clone()), - DecimalType::I32 => PrimitiveArray::new(decimal.buffer::(), validity.clone()), - DecimalType::I64 => PrimitiveArray::new(decimal.buffer::(), validity.clone()), - _ => return Ok(decimal.into_array()), - }; - - let compressed = compressor.compress_canonical( - Canonical::Primitive(prim), - CompressorContext::default(), - Excludes::none(), - )?; - - DecimalBytePartsArray::try_new(compressed, decimal.decimal_dtype()).map(|d| d.into_array()) -} diff --git a/vortex-btrblocks/src/compressor/float/dictionary.rs b/vortex-btrblocks/src/compressor/float/dictionary.rs deleted file mode 100644 index 33c024af4b3..00000000000 --- a/vortex-btrblocks/src/compressor/float/dictionary.rs +++ /dev/null @@ -1,137 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Float-specific dictionary encoding implementation. -//! -//! Vortex encoders must always produce unsigned integer codes; signed codes are only accepted for external compatibility. 
- -use vortex_array::IntoArray; -use vortex_array::arrays::DictArray; -use vortex_array::arrays::PrimitiveArray; -use vortex_array::dtype::half::f16; -use vortex_array::validity::Validity; -use vortex_array::vtable::ValidityHelper; -use vortex_buffer::Buffer; - -use super::stats::ErasedDistinctValues; -use super::stats::FloatStats; - -macro_rules! typed_encode { - ($stats:ident, $typed:ident, $validity:ident, $typ:ty) => {{ - let values: Buffer<$typ> = $typed.values.iter().map(|x| x.0).collect(); - - let max_code = values.len(); - let codes = if max_code <= u8::MAX as usize { - let buf = - >::encode(&values, $stats.src.as_slice::<$typ>()); - PrimitiveArray::new(buf, $validity.clone()).into_array() - } else if max_code <= u16::MAX as usize { - let buf = - >::encode(&values, $stats.src.as_slice::<$typ>()); - PrimitiveArray::new(buf, $validity.clone()).into_array() - } else { - let buf = - >::encode(&values, $stats.src.as_slice::<$typ>()); - PrimitiveArray::new(buf, $validity.clone()).into_array() - }; - - let values_validity = match $validity { - Validity::NonNullable => Validity::NonNullable, - _ => Validity::AllValid, - }; - let values = PrimitiveArray::new(values, values_validity).into_array(); - - // SAFETY: enforced by the DictEncoder - unsafe { DictArray::new_unchecked(codes, values).set_all_values_referenced(true) } - }}; -} - -/// Compresses a floating-point array into a dictionary arrays according to attached stats. -pub fn dictionary_encode(stats: &FloatStats) -> DictArray { - let validity = stats.src.validity(); - match &stats.distinct_values { - ErasedDistinctValues::F16(typed) => typed_encode!(stats, typed, validity, f16), - ErasedDistinctValues::F32(typed) => typed_encode!(stats, typed, validity, f32), - ErasedDistinctValues::F64(typed) => typed_encode!(stats, typed, validity, f64), - } -} - -struct DictEncoder; - -trait Encode { - /// Using the distinct value set, turn the values into a set of codes. 
- fn encode(distinct: &[T], values: &[T]) -> Buffer; -} - -macro_rules! impl_encode { - ($typ:ty, $utyp:ty) => { impl_encode!($typ, $utyp, u8, u16, u32); }; - ($typ:ty, $utyp:ty, $($ityp:ty),+) => { - $( - impl Encode<$typ, $ityp> for DictEncoder { - #[allow(clippy::cast_possible_truncation)] - fn encode(distinct: &[$typ], values: &[$typ]) -> Buffer<$ityp> { - let mut codes = - vortex_utils::aliases::hash_map::HashMap::<$utyp, $ityp>::with_capacity( - distinct.len(), - ); - for (code, &value) in distinct.iter().enumerate() { - codes.insert(value.to_bits(), code as $ityp); - } - - let mut output = vortex_buffer::BufferMut::with_capacity(values.len()); - for value in values { - // Any code lookups which fail are for nulls, so their value - // does not matter. - output.push(codes.get(&value.to_bits()).copied().unwrap_or_default()); - } - - return output.freeze(); - } - } - )* - }; -} - -impl_encode!(f16, u16); -impl_encode!(f32, u32); -impl_encode!(f64, u64); - -#[cfg(test)] -mod tests { - use vortex_array::DynArray; - use vortex_array::IntoArray; - use vortex_array::arrays::BoolArray; - use vortex_array::arrays::PrimitiveArray; - use vortex_array::assert_arrays_eq; - use vortex_array::validity::Validity; - use vortex_buffer::buffer; - - use super::super::FloatStats; - use crate::CompressorStats; - use crate::compressor::float::dictionary::dictionary_encode; - - #[test] - fn test_float_dict_encode() { - // Create an array that has some nulls - let values = buffer![1f32, 2f32, 2f32, 0f32, 1f32]; - let validity = - Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()); - let array = PrimitiveArray::new(values, validity); - - let stats = FloatStats::generate(&array); - let dict_array = dictionary_encode(&stats); - assert_eq!(dict_array.values().len(), 2); - assert_eq!(dict_array.codes().len(), 5); - - let undict = dict_array; - - // We just use code zero but it doesn't really matter. 
- // We can just shove a whole validity buffer in there instead. - let expected = PrimitiveArray::new( - buffer![1f32, 2f32, 2f32, 1f32, 1f32], - Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()), - ) - .into_array(); - assert_arrays_eq!(undict.as_ref(), expected.as_ref()); - } -} diff --git a/vortex-btrblocks/src/compressor/float/mod.rs b/vortex-btrblocks/src/compressor/float/mod.rs deleted file mode 100644 index 57bb4dc65f3..00000000000 --- a/vortex-btrblocks/src/compressor/float/mod.rs +++ /dev/null @@ -1,746 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -pub(crate) mod dictionary; -pub(super) mod stats; - -use std::hash::Hash; -use std::hash::Hasher; - -use enum_iterator::Sequence; -use vortex_alp::ALP; -use vortex_alp::ALPArray; -use vortex_alp::RDEncoder; -use vortex_alp::alp_encode; -use vortex_array::ArrayRef; -use vortex_array::Canonical; -use vortex_array::IntoArray; -use vortex_array::ToCanonical; -use vortex_array::arrays::ConstantArray; -use vortex_array::arrays::DictArray; -use vortex_array::arrays::MaskedArray; -use vortex_array::arrays::Primitive; -use vortex_array::arrays::dict::DictArrayParts; -use vortex_array::dtype::PType; -use vortex_array::scalar::Scalar; -use vortex_array::vtable::VTable; -use vortex_array::vtable::ValidityHelper; -use vortex_error::VortexResult; -use vortex_error::vortex_panic; -use vortex_sparse::Sparse; -use vortex_sparse::SparseArray; - -use self::dictionary::dictionary_encode; -pub use self::stats::FloatStats; -use super::integer::DictScheme as IntDictScheme; -use super::integer::RunEndScheme as IntRunEndScheme; -use super::integer::SparseScheme as IntSparseScheme; -use crate::BtrBlocksCompressor; -use crate::CanonicalCompressor; -use crate::Compressor; -use crate::CompressorContext; -use crate::CompressorStats; -use crate::Excludes; -use crate::GenerateStatsOptions; -use crate::IntCode; -use crate::Scheme; -use 
crate::SchemeExt; -use crate::compressor::patches::compress_patches; -use crate::compressor::rle; -use crate::compressor::rle::RLEScheme; - -pub trait FloatScheme: Scheme + Send + Sync {} - -impl FloatScheme for T where T: Scheme + Send + Sync -{} - -impl PartialEq for dyn FloatScheme { - fn eq(&self, other: &Self) -> bool { - self.code() == other.code() - } -} - -impl Eq for dyn FloatScheme {} - -impl Hash for dyn FloatScheme { - fn hash(&self, state: &mut H) { - self.code().hash(state) - } -} - -/// All available float compression schemes. -pub const ALL_FLOAT_SCHEMES: &[&dyn FloatScheme] = &[ - &UncompressedScheme, - &ConstantScheme, - &ALPScheme, - &ALPRDScheme, - &DictScheme, - &NullDominated, - &RLE_FLOAT_SCHEME, - #[cfg(feature = "pco")] - &PcoScheme, -]; - -/// [`Compressor`] for floating-point numbers. -#[derive(Clone, Copy)] -pub struct FloatCompressor<'a> { - /// Reference to the parent compressor. - pub btr_blocks_compressor: &'a dyn CanonicalCompressor, -} - -impl<'a> Compressor for FloatCompressor<'a> { - type ArrayVTable = Primitive; - type SchemeType = dyn FloatScheme; - type StatsType = FloatStats; - - fn gen_stats(&self, array: &::Array) -> Self::StatsType { - if self - .btr_blocks_compressor - .float_schemes() - .iter() - .any(|s| s.code() == DictScheme.code()) - { - FloatStats::generate_opts( - array, - GenerateStatsOptions { - count_distinct_values: true, - }, - ) - } else { - FloatStats::generate_opts( - array, - GenerateStatsOptions { - count_distinct_values: false, - }, - ) - } - } - - fn schemes(&self) -> &[&'static dyn FloatScheme] { - self.btr_blocks_compressor.float_schemes() - } - - fn default_scheme(&self) -> &'static Self::SchemeType { - &UncompressedScheme - } -} - -/// Unique identifier for float compression schemes. -#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash, Sequence, Ord, PartialOrd)] -pub enum FloatCode { - /// No compression applied. - Uncompressed, - /// Constant encoding for arrays with a single distinct value. 
- Constant, - /// ALP (Adaptive Lossless floating-Point) encoding. - Alp, - /// ALPRD (ALP with Right Division) encoding variant. - AlpRd, - /// Dictionary encoding for low-cardinality float values. - Dict, - /// Run-end encoding. - RunEnd, - /// RLE encoding - generic run-length encoding. - Rle, - /// Sparse encoding for null-dominated arrays. - Sparse, - /// Pco (pcodec) compression for floats. - Pco, -} - -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -struct UncompressedScheme; - -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -struct ConstantScheme; - -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -struct ALPScheme; - -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -struct ALPRDScheme; - -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -struct DictScheme; - -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct NullDominated; - -/// Pco (pcodec) compression for floats. -#[cfg(feature = "pco")] -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct PcoScheme; - -/// Configuration for float RLE compression. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct FloatRLEConfig; - -impl rle::RLEConfig for FloatRLEConfig { - type Stats = FloatStats; - type Code = FloatCode; - - const CODE: FloatCode = FloatCode::Rle; - - fn compress_values( - compressor: &BtrBlocksCompressor, - values: &vortex_array::arrays::PrimitiveArray, - ctx: CompressorContext, - excludes: &[FloatCode], - ) -> VortexResult { - compressor.compress_canonical(Canonical::Primitive(values.clone()), ctx, excludes.into()) - } -} - -/// RLE scheme for float compression. 
-pub const RLE_FLOAT_SCHEME: RLEScheme = RLEScheme::new(); - -impl Scheme for UncompressedScheme { - type StatsType = FloatStats; - type CodeType = FloatCode; - - fn code(&self) -> FloatCode { - FloatCode::Uncompressed - } - - fn expected_compression_ratio( - &self, - _compressor: &BtrBlocksCompressor, - _stats: &Self::StatsType, - _ctx: CompressorContext, - _excludes: &[FloatCode], - ) -> VortexResult { - Ok(1.0) - } - - fn compress( - &self, - _btr_blocks_compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - _ctx: CompressorContext, - _excludes: &[FloatCode], - ) -> VortexResult { - Ok(stats.source().clone().into_array()) - } -} - -impl Scheme for ConstantScheme { - type StatsType = FloatStats; - type CodeType = FloatCode; - - fn code(&self) -> FloatCode { - FloatCode::Constant - } - - fn expected_compression_ratio( - &self, - _btr_blocks_compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - ctx: CompressorContext, - _excludes: &[FloatCode], - ) -> VortexResult { - // Never select Constant when sampling - if ctx.is_sample { - return Ok(0.0); - } - - if stats.null_count as usize == stats.src.len() || stats.value_count == 0 { - return Ok(0.0); - } - - // Can only have 1 distinct value - if stats.distinct_values_count != 1 { - return Ok(0.0); - } - - Ok(stats.value_count as f64) - } - - fn compress( - &self, - _btr_blocks_compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - _ctx: CompressorContext, - _excludes: &[FloatCode], - ) -> VortexResult { - let scalar_idx = - (0..stats.source().len()).position(|idx| stats.source().is_valid(idx).unwrap_or(false)); - - match scalar_idx { - Some(idx) => { - let scalar = stats.source().scalar_at(idx)?; - let const_arr = ConstantArray::new(scalar, stats.src.len()).into_array(); - if !stats.source().all_valid()? 
{ - Ok(MaskedArray::try_new(const_arr, stats.src.validity().clone())?.into_array()) - } else { - Ok(const_arr) - } - } - None => Ok(ConstantArray::new( - Scalar::null(stats.src.dtype().clone()), - stats.src.len(), - ) - .into_array()), - } - } -} - -impl Scheme for ALPScheme { - type StatsType = FloatStats; - type CodeType = FloatCode; - - fn code(&self) -> FloatCode { - FloatCode::Alp - } - - fn expected_compression_ratio( - &self, - compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - ctx: CompressorContext, - excludes: &[FloatCode], - ) -> VortexResult { - // We don't support ALP for f16 - if stats.source().ptype() == PType::F16 { - return Ok(0.0); - } - - if ctx.allowed_cascading == 0 { - // ALP does not compress on its own, we need to be able to cascade it with - // an integer compressor. - return Ok(0.0); - } - - self.estimate_compression_ratio_with_sampling(compressor, stats, ctx, excludes) - } - - fn compress( - &self, - compressor: &BtrBlocksCompressor, - stats: &FloatStats, - ctx: CompressorContext, - excludes: &[FloatCode], - ) -> VortexResult { - let alp_encoded = alp_encode(&stats.source().to_primitive(), None)?; - let alp = alp_encoded.as_::(); - let alp_ints = alp.encoded().to_primitive(); - - // Compress the ALP ints. - // Patches are not compressed. They should be infrequent, and if they are not then we want - // to keep them linear for easy indexing. 
- let mut int_excludes = Vec::new(); - if excludes.contains(&FloatCode::Dict) { - int_excludes.push(IntDictScheme.code()); - } - if excludes.contains(&FloatCode::RunEnd) { - int_excludes.push(IntRunEndScheme.code()); - } - - let compressed_alp_ints = compressor.compress_canonical( - Canonical::Primitive(alp_ints), - ctx.descend(), - Excludes::int_only(&int_excludes), - )?; - - let patches = alp.patches().map(compress_patches).transpose()?; - - Ok(ALPArray::new(compressed_alp_ints, alp.exponents(), patches).into_array()) - } -} - -impl Scheme for ALPRDScheme { - type StatsType = FloatStats; - type CodeType = FloatCode; - - fn code(&self) -> FloatCode { - FloatCode::AlpRd - } - - fn expected_compression_ratio( - &self, - compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - ctx: CompressorContext, - excludes: &[FloatCode], - ) -> VortexResult { - if stats.source().ptype() == PType::F16 { - return Ok(0.0); - } - - self.estimate_compression_ratio_with_sampling(compressor, stats, ctx, excludes) - } - - fn compress( - &self, - _compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - _ctx: CompressorContext, - _excludes: &[FloatCode], - ) -> VortexResult { - let encoder = match stats.source().ptype() { - PType::F32 => RDEncoder::new(stats.source().as_slice::()), - PType::F64 => RDEncoder::new(stats.source().as_slice::()), - ptype => vortex_panic!("cannot ALPRD compress ptype {ptype}"), - }; - - let mut alp_rd = encoder.encode(stats.source()); - - let patches = alp_rd - .left_parts_patches() - .map(compress_patches) - .transpose()?; - alp_rd.replace_left_parts_patches(patches); - - Ok(alp_rd.into_array()) - } -} - -impl Scheme for DictScheme { - type StatsType = FloatStats; - type CodeType = FloatCode; - - fn code(&self) -> FloatCode { - FloatCode::Dict - } - - fn expected_compression_ratio( - &self, - compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - ctx: CompressorContext, - excludes: &[FloatCode], - ) -> VortexResult { - if stats.value_count 
== 0 { - return Ok(0.0); - } - - // If the array is high cardinality (>50% unique values) skip. - if stats.distinct_values_count > stats.value_count / 2 { - return Ok(0.0); - } - - // Take a sample and run compression on the sample to determine before/after size. - self.estimate_compression_ratio_with_sampling(compressor, stats, ctx, excludes) - } - - fn compress( - &self, - compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - ctx: CompressorContext, - _excludes: &[Self::CodeType], - ) -> VortexResult { - let dict = dictionary_encode(stats); - let has_all_values_referenced = dict.has_all_values_referenced(); - let DictArrayParts { codes, values, .. } = dict.into_parts(); - - let compressed_codes = compressor.compress_canonical( - Canonical::Primitive(codes.to_primitive()), - ctx.descend(), - Excludes::int_only(&[IntCode::Dict, IntCode::Sequence]), - )?; - - assert!(values.is_canonical()); - let compressed_values = compressor.compress_canonical( - Canonical::Primitive(values.to_primitive()), - ctx.descend(), - Excludes::from(&[FloatCode::Dict]), - )?; - - // SAFETY: compressing codes or values does not alter the invariants - unsafe { - Ok( - DictArray::new_unchecked(compressed_codes, compressed_values) - .set_all_values_referenced(has_all_values_referenced) - .into_array(), - ) - } - } -} - -impl Scheme for NullDominated { - type StatsType = FloatStats; - type CodeType = FloatCode; - - fn code(&self) -> Self::CodeType { - FloatCode::Sparse - } - - fn expected_compression_ratio( - &self, - _compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - ctx: CompressorContext, - _excludes: &[Self::CodeType], - ) -> VortexResult { - // Only use `SparseScheme` if we can cascade. - if ctx.allowed_cascading == 0 { - return Ok(0.0); - } - - if stats.value_count == 0 { - // All nulls should use ConstantScheme - return Ok(0.0); - } - - // If the majority is null, will compress well. 
- if stats.null_count as f64 / stats.src.len() as f64 > 0.9 { - return Ok(stats.src.len() as f64 / stats.value_count as f64); - } - - // Otherwise we don't go this route - Ok(0.0) - } - - fn compress( - &self, - compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - ctx: CompressorContext, - _excludes: &[Self::CodeType], - ) -> VortexResult { - assert!(ctx.allowed_cascading > 0); - - // We pass None as we only run this pathway for NULL-dominated float arrays - let sparse_encoded = SparseArray::encode(&stats.src.clone().into_array(), None)?; - - if let Some(sparse) = sparse_encoded.as_opt::() { - // Compress the values - let new_excludes = [IntSparseScheme.code()]; - - // Don't attempt to compress the non-null values - - let indices = sparse.patches().indices().to_primitive().narrow()?; - let compressed_indices = compressor.compress_canonical( - Canonical::Primitive(indices.to_primitive()), - ctx.descend(), - Excludes::int_only(&new_excludes), - )?; - - SparseArray::try_new( - compressed_indices, - sparse.patches().values().clone(), - sparse.len(), - sparse.fill_scalar().clone(), - ) - .map(|a| a.into_array()) - } else { - Ok(sparse_encoded) - } - } -} - -#[cfg(feature = "pco")] -impl Scheme for PcoScheme { - type StatsType = FloatStats; - type CodeType = FloatCode; - - fn code(&self) -> FloatCode { - FloatCode::Pco - } - - fn compress( - &self, - _compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - _ctx: CompressorContext, - _excludes: &[FloatCode], - ) -> VortexResult { - Ok(vortex_pco::PcoArray::from_primitive( - stats.source(), - pco::DEFAULT_COMPRESSION_LEVEL, - 8192, - )? 
- .into_array()) - } -} - -#[cfg(test)] -mod tests { - - use std::iter; - - use vortex_array::DynArray; - use vortex_array::IntoArray; - use vortex_array::ToCanonical; - use vortex_array::arrays::PrimitiveArray; - use vortex_array::assert_arrays_eq; - use vortex_array::builders::ArrayBuilder; - use vortex_array::builders::PrimitiveBuilder; - use vortex_array::display::DisplayOptions; - use vortex_array::dtype::Nullability; - use vortex_array::validity::Validity; - use vortex_buffer::Buffer; - use vortex_buffer::buffer_mut; - use vortex_error::VortexResult; - - use super::RLE_FLOAT_SCHEME; - use crate::BtrBlocksCompressor; - use crate::CompressorContext; - use crate::CompressorExt; - use crate::CompressorStats; - use crate::Scheme; - - #[test] - fn test_empty() -> VortexResult<()> { - // Make sure empty array compression does not fail - let btr = BtrBlocksCompressor::default(); - let result = btr.float_compressor().compress( - &btr, - &PrimitiveArray::new(Buffer::::empty(), Validity::NonNullable), - CompressorContext::default(), - &[], - )?; - - assert!(result.is_empty()); - Ok(()) - } - - #[test] - fn test_compress() -> VortexResult<()> { - let mut values = buffer_mut![1.0f32; 1024]; - // Sprinkle some other values in. - for i in 0..1024 { - // Insert 2.0 at all odd positions. - // This should force dictionary encoding and exclude run-end due to the - // average run length being 1. 
- values[i] = (i % 50) as f32; - } - - let floats = values.into_array().to_primitive(); - let btr = BtrBlocksCompressor::default(); - let compressed = - btr.float_compressor() - .compress(&btr, &floats, CompressorContext::default(), &[])?; - assert_eq!(compressed.len(), 1024); - - let display = compressed - .display_as(DisplayOptions::MetadataOnly) - .to_string() - .to_lowercase(); - assert_eq!(display, "vortex.dict(f32, len=1024)"); - - Ok(()) - } - - #[test] - fn test_rle_compression() -> VortexResult<()> { - let mut values = Vec::new(); - values.extend(iter::repeat_n(1.5f32, 100)); - values.extend(iter::repeat_n(2.7f32, 200)); - values.extend(iter::repeat_n(3.15f32, 150)); - - let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); - let stats = super::FloatStats::generate(&array); - let btr = BtrBlocksCompressor::default(); - let compressed = - RLE_FLOAT_SCHEME.compress(&btr, &stats, CompressorContext::default(), &[])?; - - let decoded = compressed; - let expected = Buffer::copy_from(&values).into_array(); - assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); - Ok(()) - } - - #[test] - fn test_sparse_compression() -> VortexResult<()> { - let mut array = PrimitiveBuilder::::with_capacity(Nullability::Nullable, 100); - array.append_value(f32::NAN); - array.append_value(-f32::NAN); - array.append_value(f32::INFINITY); - array.append_value(-f32::INFINITY); - array.append_value(0.0f32); - array.append_value(-0.0f32); - array.append_nulls(90); - - let floats = array.finish_into_primitive(); - let btr = BtrBlocksCompressor::default(); - let compressed = - btr.float_compressor() - .compress(&btr, &floats, CompressorContext::default(), &[])?; - assert_eq!(compressed.len(), 96); - - let display = compressed - .display_as(DisplayOptions::MetadataOnly) - .to_string() - .to_lowercase(); - assert_eq!(display, "vortex.sparse(f32?, len=96)"); - - Ok(()) - } -} - -/// Tests to verify that each float compression scheme produces the expected 
encoding. -#[cfg(test)] -mod scheme_selection_tests { - - use vortex_alp::ALP; - use vortex_array::arrays::Constant; - use vortex_array::arrays::Dict; - use vortex_array::arrays::PrimitiveArray; - use vortex_array::builders::ArrayBuilder; - use vortex_array::builders::PrimitiveBuilder; - use vortex_array::dtype::Nullability; - use vortex_array::validity::Validity; - use vortex_buffer::Buffer; - use vortex_error::VortexResult; - - use crate::BtrBlocksCompressor; - use crate::CompressorContext; - use crate::CompressorExt; - - #[test] - fn test_constant_compressed() -> VortexResult<()> { - let values: Vec = vec![42.5; 100]; - let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); - let btr = BtrBlocksCompressor::default(); - let compressed = - btr.float_compressor() - .compress(&btr, &array, CompressorContext::default(), &[])?; - assert!(compressed.is::()); - Ok(()) - } - - #[test] - fn test_alp_compressed() -> VortexResult<()> { - let values: Vec = (0..1000).map(|i| (i as f64) * 0.01).collect(); - let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); - let btr = BtrBlocksCompressor::default(); - let compressed = - btr.float_compressor() - .compress(&btr, &array, CompressorContext::default(), &[])?; - assert!(compressed.is::()); - Ok(()) - } - - #[test] - fn test_dict_compressed() -> VortexResult<()> { - let distinct_values = [1.1, 2.2, 3.3, 4.4, 5.5]; - let values: Vec = (0..1000) - .map(|i| distinct_values[i % distinct_values.len()]) - .collect(); - let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); - let btr = BtrBlocksCompressor::default(); - let compressed = - btr.float_compressor() - .compress(&btr, &array, CompressorContext::default(), &[])?; - assert!(compressed.is::()); - Ok(()) - } - - #[test] - fn test_null_dominated_compressed() -> VortexResult<()> { - let mut builder = PrimitiveBuilder::::with_capacity(Nullability::Nullable, 100); - for i in 0..5 { - 
builder.append_value(i as f64); - } - builder.append_nulls(95); - let array = builder.finish_into_primitive(); - let btr = BtrBlocksCompressor::default(); - let compressed = - btr.float_compressor() - .compress(&btr, &array, CompressorContext::default(), &[])?; - // Verify the compressed array preserves values. - assert_eq!(compressed.len(), 100); - Ok(()) - } -} diff --git a/vortex-btrblocks/src/compressor/float/stats.rs b/vortex-btrblocks/src/compressor/float/stats.rs deleted file mode 100644 index 818ba40d2cf..00000000000 --- a/vortex-btrblocks/src/compressor/float/stats.rs +++ /dev/null @@ -1,268 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -use std::hash::Hash; - -use itertools::Itertools; -use num_traits::Float; -use rustc_hash::FxBuildHasher; -use vortex_array::IntoArray; -use vortex_array::ToCanonical; -use vortex_array::arrays::Primitive; -use vortex_array::arrays::PrimitiveArray; -use vortex_array::arrays::primitive::NativeValue; -use vortex_array::dtype::NativePType; -use vortex_array::dtype::PType; -use vortex_array::dtype::half::f16; -use vortex_error::VortexExpect; -use vortex_error::VortexResult; -use vortex_error::vortex_err; -use vortex_error::vortex_panic; -use vortex_mask::AllOr; -use vortex_utils::aliases::hash_set::HashSet; - -use crate::CompressorStats; -use crate::GenerateStatsOptions; -use crate::compressor::rle::RLEStats; -use crate::sample::sample; - -#[derive(Debug, Clone)] -pub struct DistinctValues { - pub values: HashSet, FxBuildHasher>, -} - -#[derive(Debug, Clone)] -pub enum ErasedDistinctValues { - F16(DistinctValues), - F32(DistinctValues), - F64(DistinctValues), -} - -macro_rules! 
impl_from_typed { - ($typ:ty, $variant:path) => { - impl From> for ErasedDistinctValues { - fn from(value: DistinctValues<$typ>) -> Self { - $variant(value) - } - } - }; -} - -impl_from_typed!(f16, ErasedDistinctValues::F16); -impl_from_typed!(f32, ErasedDistinctValues::F32); -impl_from_typed!(f64, ErasedDistinctValues::F64); - -/// Array of floating-point numbers and relevant stats for compression. -#[derive(Debug, Clone)] -pub struct FloatStats { - pub(crate) src: PrimitiveArray, - // cache for validity.false_count() - pub(crate) null_count: u32, - // cache for validity.true_count() - pub(crate) value_count: u32, - #[allow(dead_code)] - pub(crate) average_run_length: u32, - pub(crate) distinct_values: ErasedDistinctValues, - pub(crate) distinct_values_count: u32, -} - -impl FloatStats { - fn generate_opts_fallible( - input: &PrimitiveArray, - opts: GenerateStatsOptions, - ) -> VortexResult { - match input.ptype() { - PType::F16 => typed_float_stats::(input, opts.count_distinct_values), - PType::F32 => typed_float_stats::(input, opts.count_distinct_values), - PType::F64 => typed_float_stats::(input, opts.count_distinct_values), - _ => vortex_panic!("cannot generate FloatStats from ptype {}", input.ptype()), - } - } -} - -impl CompressorStats for FloatStats { - type ArrayVTable = Primitive; - - fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self { - Self::generate_opts_fallible(input, opts) - .vortex_expect("FloatStats::generate_opts should not fail") - } - - fn source(&self) -> &PrimitiveArray { - &self.src - } - - fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self { - let sampled = - sample(&self.src.clone().into_array(), sample_size, sample_count).to_primitive(); - - Self::generate_opts(&sampled, opts) - } -} - -impl RLEStats for FloatStats { - fn value_count(&self) -> u32 { - self.value_count - } - - fn average_run_length(&self) -> u32 { - self.average_run_length - } - - fn source(&self) -> 
&PrimitiveArray { - &self.src - } -} - -fn typed_float_stats( - array: &PrimitiveArray, - count_distinct_values: bool, -) -> VortexResult -where - DistinctValues: Into, - NativeValue: Hash + Eq, -{ - // Special case: empty array - if array.is_empty() { - return Ok(FloatStats { - src: array.clone(), - null_count: 0, - value_count: 0, - average_run_length: 0, - distinct_values_count: 0, - distinct_values: DistinctValues { - values: HashSet::, FxBuildHasher>::with_hasher(FxBuildHasher), - } - .into(), - }); - } else if array.all_invalid()? { - return Ok(FloatStats { - src: array.clone(), - null_count: u32::try_from(array.len())?, - value_count: 0, - average_run_length: 0, - distinct_values_count: 0, - distinct_values: DistinctValues { - values: HashSet::, FxBuildHasher>::with_hasher(FxBuildHasher), - } - .into(), - }); - } - - let null_count = array - .statistics() - .compute_null_count() - .ok_or_else(|| vortex_err!("Failed to compute null_count"))?; - let value_count = array.len() - null_count; - - // Keep a HashMap of T, then convert the keys into PValue afterward since value is - // so much more efficient to hash and search for. 
- let mut distinct_values = if count_distinct_values { - HashSet::with_capacity_and_hasher(array.len() / 2, FxBuildHasher) - } else { - HashSet::with_hasher(FxBuildHasher) - }; - - let validity = array.validity_mask()?; - - let mut runs = 1; - let head_idx = validity - .first() - .vortex_expect("All null masks have been handled before"); - let buff = array.to_buffer::(); - let mut prev = buff[head_idx]; - - let first_valid_buff = buff.slice(head_idx..array.len()); - match validity.bit_buffer() { - AllOr::All => { - for value in first_valid_buff { - if count_distinct_values { - distinct_values.insert(NativeValue(value)); - } - - if value != prev { - prev = value; - runs += 1; - } - } - } - AllOr::None => unreachable!("All invalid arrays have been handled earlier"), - AllOr::Some(v) => { - for (&value, valid) in first_valid_buff - .iter() - .zip_eq(v.slice(head_idx..array.len()).iter()) - { - if valid { - if count_distinct_values { - distinct_values.insert(NativeValue(value)); - } - - if value != prev { - prev = value; - runs += 1; - } - } - } - } - } - - let null_count = u32::try_from(null_count)?; - let value_count = u32::try_from(value_count)?; - let distinct_values_count = if count_distinct_values { - u32::try_from(distinct_values.len())? 
- } else { - u32::MAX - }; - - Ok(FloatStats { - null_count, - value_count, - distinct_values_count, - src: array.clone(), - average_run_length: value_count / runs, - distinct_values: DistinctValues { - values: distinct_values, - } - .into(), - }) -} - -#[cfg(test)] -mod tests { - use vortex_array::IntoArray; - use vortex_array::ToCanonical; - use vortex_array::arrays::PrimitiveArray; - use vortex_array::validity::Validity; - use vortex_buffer::buffer; - - use super::FloatStats; - use crate::CompressorStats; - - #[test] - fn test_float_stats() { - let floats = buffer![0.0f32, 1.0f32, 2.0f32].into_array(); - let floats = floats.to_primitive(); - - let stats = FloatStats::generate(&floats); - - assert_eq!(stats.value_count, 3); - assert_eq!(stats.null_count, 0); - assert_eq!(stats.average_run_length, 1); - assert_eq!(stats.distinct_values_count, 3); - } - - #[test] - fn test_float_stats_leading_nulls() { - let floats = PrimitiveArray::new( - buffer![0.0f32, 1.0f32, 2.0f32], - Validity::from_iter([false, true, true]), - ); - - let stats = FloatStats::generate(&floats); - - assert_eq!(stats.value_count, 2); - assert_eq!(stats.null_count, 1); - assert_eq!(stats.average_run_length, 1); - assert_eq!(stats.distinct_values_count, 2); - } -} diff --git a/vortex-btrblocks/src/compressor/integer/dictionary.rs b/vortex-btrblocks/src/compressor/integer/dictionary.rs deleted file mode 100644 index 70a29aaeedd..00000000000 --- a/vortex-btrblocks/src/compressor/integer/dictionary.rs +++ /dev/null @@ -1,153 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Dictionary compressor that reuses the unique values in the `IntegerStats`. -//! -//! Vortex encoders must always produce unsigned integer codes; signed codes are only accepted for external compatibility. 
- -use vortex_array::IntoArray; -use vortex_array::arrays::DictArray; -use vortex_array::arrays::PrimitiveArray; -use vortex_array::validity::Validity; -use vortex_array::vtable::ValidityHelper; -use vortex_buffer::Buffer; - -use super::IntegerStats; -use super::stats::ErasedStats; - -macro_rules! typed_encode { - ($stats:ident, $typed:ident, $validity:ident, $typ:ty) => {{ - let values: Buffer<$typ> = $typed.distinct_values.keys().map(|x| x.0).collect(); - - let max_code = values.len(); - let codes = if max_code <= u8::MAX as usize { - let buf = - >::encode(&values, $stats.src.as_slice::<$typ>()); - PrimitiveArray::new(buf, $validity.clone()).into_array() - } else if max_code <= u16::MAX as usize { - let buf = - >::encode(&values, $stats.src.as_slice::<$typ>()); - PrimitiveArray::new(buf, $validity.clone()).into_array() - } else { - let buf = - >::encode(&values, $stats.src.as_slice::<$typ>()); - PrimitiveArray::new(buf, $validity.clone()).into_array() - }; - - let values_validity = match $validity { - Validity::NonNullable => Validity::NonNullable, - _ => Validity::AllValid, - }; - - let values = PrimitiveArray::new(values, values_validity).into_array(); - // SAFETY: invariants enforced in DictEncoder - unsafe { DictArray::new_unchecked(codes, values).set_all_values_referenced(true) } - }}; -} - -/// Compresses an integer array into a dictionary arrays according to attached stats. 
-#[expect( - clippy::cognitive_complexity, - reason = "complexity from match on all integer types" -)] -pub fn dictionary_encode(stats: &IntegerStats) -> DictArray { - // We need to preserve the nullability somehow from the original - let src_validity = stats.src.validity(); - - match &stats.typed { - ErasedStats::U8(typed) => typed_encode!(stats, typed, src_validity, u8), - ErasedStats::U16(typed) => typed_encode!(stats, typed, src_validity, u16), - ErasedStats::U32(typed) => typed_encode!(stats, typed, src_validity, u32), - ErasedStats::U64(typed) => typed_encode!(stats, typed, src_validity, u64), - ErasedStats::I8(typed) => typed_encode!(stats, typed, src_validity, i8), - ErasedStats::I16(typed) => typed_encode!(stats, typed, src_validity, i16), - ErasedStats::I32(typed) => typed_encode!(stats, typed, src_validity, i32), - ErasedStats::I64(typed) => typed_encode!(stats, typed, src_validity, i64), - } -} - -struct DictEncoder; - -trait Encode { - /// Using the distinct value set, turn the values into a set of codes. - fn encode(distinct: &[T], values: &[T]) -> Buffer; -} - -macro_rules! impl_encode { - ($typ:ty) => { impl_encode!($typ, u8, u16, u32); }; - ($typ:ty, $($ityp:ty),+) => { - $( - impl Encode<$typ, $ityp> for DictEncoder { - #[allow(clippy::cast_possible_truncation)] - fn encode(distinct: &[$typ], values: &[$typ]) -> Buffer<$ityp> { - let mut codes = - vortex_utils::aliases::hash_map::HashMap::<$typ, $ityp>::with_capacity( - distinct.len(), - ); - for (code, &value) in distinct.iter().enumerate() { - codes.insert(value, code as $ityp); - } - - let mut output = vortex_buffer::BufferMut::with_capacity(values.len()); - for value in values { - // Any code lookups which fail are for nulls, so their value - // does not matter. - // SAFETY: we have exactly sized output to be as large as values. 
- unsafe { output.push_unchecked(codes.get(value).copied().unwrap_or_default()) }; - } - - return output.freeze(); - } - } - )* - }; -} - -impl_encode!(u8); -impl_encode!(u16); -impl_encode!(u32); -impl_encode!(u64); -impl_encode!(i8); -impl_encode!(i16); -impl_encode!(i32); -impl_encode!(i64); - -#[cfg(test)] -mod tests { - use vortex_array::DynArray; - use vortex_array::IntoArray; - use vortex_array::arrays::BoolArray; - use vortex_array::arrays::PrimitiveArray; - use vortex_array::assert_arrays_eq; - use vortex_array::validity::Validity; - use vortex_buffer::buffer; - - use super::IntegerStats; - use super::dictionary_encode; - use crate::CompressorStats; - - #[test] - fn test_dict_encode_integer_stats() { - // Create an array that has some nulls - let data = buffer![100i32, 200, 100, 0, 100]; - let validity = - Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()); - let array = PrimitiveArray::new(data, validity); - - let stats = IntegerStats::generate(&array); - let dict_array = dictionary_encode(&stats); - assert_eq!(dict_array.values().len(), 2); - assert_eq!(dict_array.codes().len(), 5); - - let undict = dict_array; - - // We just use code zero, but it doesn't really matter. - // We can just shove a whole validity buffer in there instead. 
- let expected = PrimitiveArray::new( - buffer![100i32, 200, 100, 100, 100], - Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()), - ) - .into_array(); - assert_arrays_eq!(undict.as_ref(), expected.as_ref()); - } -} diff --git a/vortex-btrblocks/src/compressor/integer/mod.rs b/vortex-btrblocks/src/compressor/integer/mod.rs deleted file mode 100644 index 58ee4f62e76..00000000000 --- a/vortex-btrblocks/src/compressor/integer/mod.rs +++ /dev/null @@ -1,1223 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -pub(crate) mod dictionary; -pub(super) mod stats; - -use std::hash::Hash; -use std::hash::Hasher; - -use enum_iterator::Sequence; -pub use stats::IntegerStats; -use vortex_array::ArrayRef; -use vortex_array::Canonical; -use vortex_array::IntoArray; -use vortex_array::ToCanonical; -use vortex_array::arrays::ConstantArray; -use vortex_array::arrays::DictArray; -use vortex_array::arrays::MaskedArray; -use vortex_array::arrays::Primitive; -use vortex_array::arrays::PrimitiveArray; -use vortex_array::scalar::Scalar; -use vortex_array::vtable::VTable; -use vortex_array::vtable::ValidityHelper; -use vortex_error::VortexExpect; -use vortex_error::VortexResult; -use vortex_error::vortex_bail; -use vortex_error::vortex_err; -use vortex_fastlanes::FoRArray; -use vortex_fastlanes::bitpack_compress::bit_width_histogram; -use vortex_fastlanes::bitpack_compress::bitpack_encode; -use vortex_fastlanes::bitpack_compress::find_best_bit_width; -use vortex_runend::RunEndArray; -use vortex_runend::compress::runend_encode; -use vortex_sequence::sequence_encode; -use vortex_sparse::Sparse; -use vortex_sparse::SparseArray; -use vortex_zigzag::ZigZagArray; -use vortex_zigzag::zigzag_encode; - -use self::dictionary::dictionary_encode; -use crate::BtrBlocksCompressor; -use crate::CanonicalCompressor; -use crate::Compressor; -use crate::CompressorContext; -use crate::CompressorStats; -use 
crate::Excludes; -use crate::GenerateStatsOptions; -use crate::Scheme; -use crate::SchemeExt; -use crate::compressor::patches::compress_patches; -use crate::compressor::rle; -use crate::compressor::rle::RLEScheme; - -/// All available integer compression schemes. -pub const ALL_INT_SCHEMES: &[&dyn IntegerScheme] = &[ - &ConstantScheme, - &FORScheme, - &ZigZagScheme, - &BitPackingScheme, - &SparseScheme, - &DictScheme, - &RunEndScheme, - &SequenceScheme, - &RLE_INTEGER_SCHEME, - #[cfg(feature = "pco")] - &PcoScheme, -]; - -/// [`Compressor`] for signed and unsigned integers. -#[derive(Clone, Copy)] -pub struct IntCompressor<'a> { - /// Reference to the parent compressor. - pub btr_blocks_compressor: &'a dyn CanonicalCompressor, -} - -impl<'a> Compressor for IntCompressor<'a> { - type ArrayVTable = Primitive; - type SchemeType = dyn IntegerScheme; - type StatsType = IntegerStats; - - fn schemes(&self) -> &[&'static dyn IntegerScheme] { - self.btr_blocks_compressor.int_schemes() - } - - fn default_scheme(&self) -> &'static Self::SchemeType { - &UncompressedScheme - } - - fn gen_stats(&self, array: &::Array) -> Self::StatsType { - if self - .btr_blocks_compressor - .int_schemes() - .iter() - .any(|s| s.code() == IntCode::Dict) - { - IntegerStats::generate_opts( - array, - GenerateStatsOptions { - count_distinct_values: true, - }, - ) - } else { - IntegerStats::generate_opts( - array, - GenerateStatsOptions { - count_distinct_values: false, - }, - ) - } - } -} - -pub trait IntegerScheme: - Scheme + Send + Sync -{ -} - -// Auto-impl -impl IntegerScheme for T where - T: Scheme + Send + Sync -{ -} - -impl PartialEq for dyn IntegerScheme { - fn eq(&self, other: &Self) -> bool { - self.code() == other.code() - } -} - -impl Eq for dyn IntegerScheme {} - -impl Hash for dyn IntegerScheme { - fn hash(&self, state: &mut H) { - self.code().hash(state) - } -} - -/// Unique identifier for integer compression schemes. 
-/// -/// NOTE: Variant order matters for tie-breaking; `For` must precede `BitPacking` to avoid unnecessary patches. -#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash, Sequence, Ord, PartialOrd)] -pub enum IntCode { - /// No compression applied. - Uncompressed, - /// Constant encoding for arrays with a single distinct value. - Constant, - /// Frame of Reference encoding - subtracts minimum value then bitpacks. - For, - /// BitPacking encoding - compresses non-negative integers by reducing bit width. - BitPacking, - /// ZigZag encoding - transforms negative integers to positive for better bitpacking. - ZigZag, - /// Sparse encoding - optimizes null-dominated or single-value-dominated arrays. - Sparse, - /// Dictionary encoding - creates a dictionary of unique values. - Dict, - /// Run-end encoding - run-length encoding with end positions. - RunEnd, - /// Sequence encoding - detects sequential patterns. - Sequence, - /// RLE encoding - generic run-length encoding. - Rle, - /// Pco (pcodec) compression for integers. - Pco, -} - -#[derive(Debug, Copy, Clone, PartialEq, Eq)] - -pub struct UncompressedScheme; - -#[derive(Debug, Copy, Clone, PartialEq, Eq)] - -pub struct ConstantScheme; - -#[derive(Debug, Copy, Clone, PartialEq, Eq)] - -pub struct FORScheme; - -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct ZigZagScheme; - -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct BitPackingScheme; - -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct SparseScheme; - -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct DictScheme; - -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct RunEndScheme; - -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct SequenceScheme; - -/// Pco (pcodec) compression for integers. -#[cfg(feature = "pco")] -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct PcoScheme; - -/// Threshold for the average run length in an array before we consider run-end encoding. 
-const RUN_END_THRESHOLD: u32 = 4; - -/// Configuration for integer RLE compression. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct IntRLEConfig; - -impl rle::RLEConfig for IntRLEConfig { - type Stats = IntegerStats; - type Code = IntCode; - - const CODE: IntCode = IntCode::Rle; - - fn compress_values( - compressor: &BtrBlocksCompressor, - values: &PrimitiveArray, - ctx: CompressorContext, - excludes: &[IntCode], - ) -> VortexResult { - compressor.compress_canonical(Canonical::Primitive(values.clone()), ctx, excludes.into()) - } -} - -/// RLE scheme for integer compression. -pub const RLE_INTEGER_SCHEME: RLEScheme = RLEScheme::new(); - -impl Scheme for UncompressedScheme { - type StatsType = IntegerStats; - type CodeType = IntCode; - - fn code(&self) -> IntCode { - IntCode::Uncompressed - } - - fn expected_compression_ratio( - &self, - _compressor: &BtrBlocksCompressor, - _stats: &IntegerStats, - _ctx: CompressorContext, - _excludes: &[IntCode], - ) -> VortexResult { - // no compression - Ok(1.0) - } - - fn compress( - &self, - _compressor: &BtrBlocksCompressor, - stats: &IntegerStats, - _ctx: CompressorContext, - _excludes: &[IntCode], - ) -> VortexResult { - Ok(stats.source().clone().into_array()) - } -} - -impl Scheme for ConstantScheme { - type StatsType = IntegerStats; - type CodeType = IntCode; - - fn code(&self) -> IntCode { - IntCode::Constant - } - - fn is_constant(&self) -> bool { - true - } - - fn expected_compression_ratio( - &self, - _compressor: &BtrBlocksCompressor, - stats: &IntegerStats, - ctx: CompressorContext, - _excludes: &[IntCode], - ) -> VortexResult { - // Never yield ConstantScheme for a sample, it could be a false-positive. - if ctx.is_sample { - return Ok(0.0); - } - - // Only arrays with one distinct values can be constant compressed. 
- if stats.distinct_values_count > 1 { - return Ok(0.0); - } - - Ok(stats.value_count as f64) - } - - fn compress( - &self, - _compressor: &BtrBlocksCompressor, - stats: &IntegerStats, - _ctx: CompressorContext, - _excludes: &[IntCode], - ) -> VortexResult { - let scalar_idx = - (0..stats.source().len()).position(|idx| stats.source().is_valid(idx).unwrap_or(false)); - - match scalar_idx { - Some(idx) => { - let scalar = stats.source().scalar_at(idx)?; - let const_arr = ConstantArray::new(scalar, stats.src.len()).into_array(); - if !stats.source().all_valid()? { - Ok(MaskedArray::try_new(const_arr, stats.src.validity().clone())?.into_array()) - } else { - Ok(const_arr) - } - } - None => Ok(ConstantArray::new( - Scalar::null(stats.src.dtype().clone()), - stats.src.len(), - ) - .into_array()), - } - } -} - -impl Scheme for FORScheme { - type StatsType = IntegerStats; - type CodeType = IntCode; - - fn code(&self) -> IntCode { - IntCode::For - } - - fn expected_compression_ratio( - &self, - _compressor: &BtrBlocksCompressor, - stats: &IntegerStats, - ctx: CompressorContext, - _excludes: &[IntCode], - ) -> VortexResult { - // Only apply if we are not at the leaf - if ctx.allowed_cascading == 0 { - return Ok(0.0); - } - - // All-null cannot be FOR compressed. - if stats.value_count == 0 { - return Ok(0.0); - } - - // Only apply when the min is not already zero. - if stats.typed.min_is_zero() { - return Ok(0.0); - } - - // Difference between max and min - let full_width: u32 = stats - .src - .ptype() - .bit_width() - .try_into() - .vortex_expect("bit width must fit in u32"); - let for_bw = match stats.typed.max_minus_min().checked_ilog2() { - Some(l) => l + 1, - // If max-min == 0, it we should use a different compression scheme - // as we don't want to bitpack down to 0 bits. 
- None => return Ok(0.0), - }; - - // If BitPacking could apply (non-negative values) and FOR doesn't reduce bit width - // compared to BitPacking, don't use FOR since it has overhead (storing reference). - // Only skip FOR when min >= 0, otherwise BitPacking can't apply directly. - if let Some(max_log) = stats - .typed - .max_ilog2() - .filter(|_| !stats.typed.min_is_negative()) - { - let bitpack_bw = max_log + 1; - if for_bw >= bitpack_bw { - return Ok(0.0); - } - } - - Ok(full_width as f64 / for_bw as f64) - } - - fn compress( - &self, - compressor: &BtrBlocksCompressor, - stats: &IntegerStats, - ctx: CompressorContext, - excludes: &[IntCode], - ) -> VortexResult { - let for_array = FoRArray::encode(stats.src.clone())?; - let biased = for_array.encoded().to_primitive(); - let biased_stats = IntegerStats::generate_opts( - &biased, - GenerateStatsOptions { - count_distinct_values: false, - }, - ); - - // Immediately bitpack. If any other scheme was preferable, it would be chosen instead - // of bitpacking. - // NOTE: we could delegate in the future if we had another downstream codec that performs - // as well. 
- let leaf_ctx = CompressorContext { - is_sample: ctx.is_sample, - allowed_cascading: 0, - }; - let compressed = - BitPackingScheme.compress(compressor, &biased_stats, leaf_ctx, excludes)?; - - let for_compressed = FoRArray::try_new(compressed, for_array.reference_scalar().clone())?; - for_compressed - .as_ref() - .statistics() - .inherit_from(for_array.as_ref().statistics()); - Ok(for_compressed.into_array()) - } -} - -impl Scheme for ZigZagScheme { - type StatsType = IntegerStats; - type CodeType = IntCode; - - fn code(&self) -> IntCode { - IntCode::ZigZag - } - - fn expected_compression_ratio( - &self, - compressor: &BtrBlocksCompressor, - stats: &IntegerStats, - ctx: CompressorContext, - excludes: &[IntCode], - ) -> VortexResult { - // ZigZag is only useful when we cascade it with another encoding - if ctx.allowed_cascading == 0 { - return Ok(0.0); - } - - // Don't try and compress all-null arrays - if stats.value_count == 0 { - return Ok(0.0); - } - - // ZigZag is only useful when there are negative values. - if !stats.typed.min_is_negative() { - return Ok(0.0); - } - - // Run compression on a sample to see how it performs. - self.estimate_compression_ratio_with_sampling(compressor, stats, ctx, excludes) - } - - fn compress( - &self, - compressor: &BtrBlocksCompressor, - stats: &IntegerStats, - ctx: CompressorContext, - excludes: &[IntCode], - ) -> VortexResult { - // Zigzag encode the values, then recursively compress the inner values. - let zag = zigzag_encode(stats.src.clone())?; - let encoded = zag.encoded().to_primitive(); - - // ZigZag should be after Dict, RunEnd or Sparse. - // We should only do these "container" style compressors once. 
- let mut new_excludes = vec![ - ZigZagScheme.code(), - DictScheme.code(), - RunEndScheme.code(), - SparseScheme.code(), - ]; - new_excludes.extend_from_slice(excludes); - - let compressed = compressor.compress_canonical( - Canonical::Primitive(encoded), - ctx.descend(), - Excludes::int_only(&new_excludes), - )?; - - tracing::debug!("zigzag output: {}", compressed.encoding_id()); - - Ok(ZigZagArray::try_new(compressed)?.into_array()) - } -} - -impl Scheme for BitPackingScheme { - type StatsType = IntegerStats; - type CodeType = IntCode; - - fn code(&self) -> IntCode { - IntCode::BitPacking - } - - fn expected_compression_ratio( - &self, - compressor: &BtrBlocksCompressor, - stats: &IntegerStats, - ctx: CompressorContext, - excludes: &[IntCode], - ) -> VortexResult { - // BitPacking only works for non-negative values - if stats.typed.min_is_negative() { - return Ok(0.0); - } - - // Don't compress all-null arrays - if stats.value_count == 0 { - return Ok(0.0); - } - - self.estimate_compression_ratio_with_sampling(compressor, stats, ctx, excludes) - } - - fn compress( - &self, - _compressor: &BtrBlocksCompressor, - stats: &IntegerStats, - _ctx: CompressorContext, - _excludes: &[IntCode], - ) -> VortexResult { - let histogram = bit_width_histogram(stats.source())?; - let bw = find_best_bit_width(stats.source().ptype(), &histogram)?; - // If best bw is determined to be the current bit-width, return the original array. - if bw as usize == stats.source().ptype().bit_width() { - return Ok(stats.source().clone().into_array()); - } - let mut packed = bitpack_encode(stats.source(), bw, Some(&histogram))?; - - let patches = packed.patches().map(compress_patches).transpose()?; - packed.replace_patches(patches); - - Ok(packed.into_array()) - } -} - -impl Scheme for SparseScheme { - type StatsType = IntegerStats; - type CodeType = IntCode; - - fn code(&self) -> IntCode { - IntCode::Sparse - } - - // We can avoid asserting the encoding tree instead. 
- fn expected_compression_ratio( - &self, - _compressor: &BtrBlocksCompressor, - stats: &IntegerStats, - ctx: CompressorContext, - _excludes: &[IntCode], - ) -> VortexResult { - // Only use `SparseScheme` if we can cascade. - if ctx.allowed_cascading == 0 { - return Ok(0.0); - } - - if stats.value_count == 0 { - // All nulls should use ConstantScheme - return Ok(0.0); - } - - // If the majority is null, will compress well. - if stats.null_count as f64 / stats.src.len() as f64 > 0.9 { - return Ok(stats.src.len() as f64 / stats.value_count as f64); - } - - // See if the top value accounts for >= 90% of the set values. - let (_, top_count) = stats.typed.top_value_and_count(); - - if top_count == stats.value_count { - // top_value is the only value, should use ConstantScheme instead - return Ok(0.0); - } - - let freq = top_count as f64 / stats.value_count as f64; - if freq >= 0.9 { - // We only store the positions of the non-top values. - return Ok(stats.value_count as f64 / (stats.value_count - top_count) as f64); - } - - Ok(0.0) - } - - fn compress( - &self, - compressor: &BtrBlocksCompressor, - stats: &IntegerStats, - ctx: CompressorContext, - excludes: &[IntCode], - ) -> VortexResult { - assert!(ctx.allowed_cascading > 0); - let (top_pvalue, top_count) = stats.typed.top_value_and_count(); - if top_count as usize == stats.src.len() { - // top_value is the only value, use ConstantScheme - return Ok(ConstantArray::new( - Scalar::primitive_value( - top_pvalue, - top_pvalue.ptype(), - stats.src.dtype().nullability(), - ), - stats.src.len(), - ) - .into_array()); - } - - let sparse_encoded = SparseArray::encode( - &stats.src.clone().into_array(), - Some(Scalar::primitive_value( - top_pvalue, - top_pvalue.ptype(), - stats.src.dtype().nullability(), - )), - )?; - - if let Some(sparse) = sparse_encoded.as_opt::() { - // Compress the values - let mut new_excludes = vec![SparseScheme.code(), IntCode::Dict]; - new_excludes.extend_from_slice(excludes); - - let compressed_values 
= compressor.compress_canonical( - Canonical::Primitive(sparse.patches().values().to_primitive()), - ctx.descend(), - Excludes::int_only(&new_excludes), - )?; - - let indices = sparse.patches().indices().to_primitive().narrow()?; - - let compressed_indices = compressor.compress_canonical( - Canonical::Primitive(indices), - ctx.descend(), - Excludes::int_only(&new_excludes), - )?; - - SparseArray::try_new( - compressed_indices, - compressed_values, - sparse.len(), - sparse.fill_scalar().clone(), - ) - .map(|a| a.into_array()) - } else { - Ok(sparse_encoded) - } - } -} - -impl Scheme for DictScheme { - type StatsType = IntegerStats; - type CodeType = IntCode; - - fn code(&self) -> IntCode { - IntCode::Dict - } - - fn expected_compression_ratio( - &self, - _compressor: &BtrBlocksCompressor, - stats: &IntegerStats, - ctx: CompressorContext, - _excludes: &[IntCode], - ) -> VortexResult { - // Dict should not be terminal. - if ctx.allowed_cascading == 0 { - return Ok(0.0); - } - - if stats.value_count == 0 { - return Ok(0.0); - } - - // If > 50% of the values are distinct, skip dict. - if stats.distinct_values_count > stats.value_count / 2 { - return Ok(0.0); - } - - // Ignore nulls encoding for the estimate. We only focus on values. - let values_size = stats.source().ptype().bit_width() * stats.distinct_values_count as usize; - - // Assume codes are compressed RLE + BitPacking. 
- let codes_bw = usize::BITS - stats.distinct_values_count.leading_zeros(); - - let n_runs = (stats.value_count / stats.average_run_length) as usize; - - // Assume that codes will either be BitPack or RLE-BitPack - let codes_size_bp = (codes_bw * stats.value_count) as usize; - let codes_size_rle_bp = usize::checked_mul((codes_bw + 32) as usize, n_runs); - - let codes_size = usize::min(codes_size_bp, codes_size_rle_bp.unwrap_or(usize::MAX)); - - let before = stats.value_count as usize * stats.source().ptype().bit_width(); - - Ok(before as f64 / (values_size + codes_size) as f64) - } - - fn compress( - &self, - compressor: &BtrBlocksCompressor, - stats: &IntegerStats, - ctx: CompressorContext, - excludes: &[IntCode], - ) -> VortexResult { - assert!(ctx.allowed_cascading > 0); - - // TODO(aduffy): we can be more prescriptive: we know that codes will EITHER be - // RLE or FOR + BP. Cascading probably wastes some time here. - - let dict = dictionary_encode(stats); - - // Cascade the codes child - // Don't allow SequenceArray as the codes child as it merely adds extra indirection without actually compressing data. 
- let mut new_excludes = vec![IntCode::Dict, IntCode::Sequence]; - new_excludes.extend_from_slice(excludes); - - let compressed_codes = compressor.compress_canonical( - Canonical::Primitive(dict.codes().to_primitive().narrow()?), - ctx.descend(), - Excludes::int_only(&new_excludes), - )?; - - // SAFETY: compressing codes does not change their values - unsafe { - Ok( - DictArray::new_unchecked(compressed_codes, dict.values().clone()) - .set_all_values_referenced(dict.has_all_values_referenced()) - .into_array(), - ) - } - } -} - -impl Scheme for RunEndScheme { - type StatsType = IntegerStats; - type CodeType = IntCode; - - fn code(&self) -> IntCode { - IntCode::RunEnd - } - - fn expected_compression_ratio( - &self, - compressor: &BtrBlocksCompressor, - stats: &IntegerStats, - ctx: CompressorContext, - excludes: &[IntCode], - ) -> VortexResult { - // If the run length is below the threshold, drop it. - if stats.average_run_length < RUN_END_THRESHOLD { - return Ok(0.0); - } - - if ctx.allowed_cascading == 0 { - return Ok(0.0); - } - - // Run compression on a sample, see how it performs. 
- self.estimate_compression_ratio_with_sampling(compressor, stats, ctx, excludes) - } - - fn compress( - &self, - compressor: &BtrBlocksCompressor, - stats: &IntegerStats, - ctx: CompressorContext, - excludes: &[IntCode], - ) -> VortexResult { - assert!(ctx.allowed_cascading > 0); - - // run-end encode the ends - let (ends, values) = runend_encode(&stats.src); - - let mut new_excludes = vec![RunEndScheme.code(), DictScheme.code()]; - new_excludes.extend_from_slice(excludes); - - let compressed_ends = compressor.compress_canonical( - Canonical::Primitive(ends.to_primitive()), - ctx.descend(), - Excludes::int_only(&new_excludes), - )?; - - let compressed_values = compressor.compress_canonical( - Canonical::Primitive(values.to_primitive()), - ctx.descend(), - Excludes::int_only(&new_excludes), - )?; - - // SAFETY: compression doesn't affect invariants - unsafe { - Ok( - RunEndArray::new_unchecked(compressed_ends, compressed_values, 0, stats.src.len()) - .into_array(), - ) - } - } -} - -impl Scheme for SequenceScheme { - type StatsType = IntegerStats; - type CodeType = IntCode; - - fn code(&self) -> Self::CodeType { - IntCode::Sequence - } - - fn expected_compression_ratio( - &self, - _compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - _ctx: CompressorContext, - _excludes: &[Self::CodeType], - ) -> VortexResult { - if stats.null_count > 0 { - return Ok(0.0); - } - - // If the distinct_values_count was computed (!= u32::MAX) - // Then all values in a sequence must be unique. - if stats.distinct_values_count != u32::MAX - && stats.distinct_values_count as usize != stats.src.len() - { - return Ok(0.0); - } - - // Since two values are required to store base and multiplier the - // compression ratio is divided by 2. - Ok(sequence_encode(&stats.src)? 
- .map(|_| stats.src.len() as f64 / 2.0) - .unwrap_or(0.0)) - } - - fn compress( - &self, - _compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - _ctx: CompressorContext, - _excludes: &[Self::CodeType], - ) -> VortexResult { - if stats.null_count > 0 { - vortex_bail!("sequence encoding does not support nulls"); - } - sequence_encode(&stats.src)?.ok_or_else(|| vortex_err!("cannot sequence encode array")) - } -} - -#[cfg(feature = "pco")] -impl Scheme for PcoScheme { - type StatsType = IntegerStats; - type CodeType = IntCode; - - fn code(&self) -> IntCode { - IntCode::Pco - } - - fn expected_compression_ratio( - &self, - compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - ctx: CompressorContext, - excludes: &[IntCode], - ) -> VortexResult { - // Pco does not support I8 or U8. - if matches!( - stats.src.ptype(), - vortex_array::dtype::PType::I8 | vortex_array::dtype::PType::U8 - ) { - return Ok(0.0); - } - - self.estimate_compression_ratio_with_sampling(compressor, stats, ctx, excludes) - } - - fn compress( - &self, - _compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - _ctx: CompressorContext, - _excludes: &[IntCode], - ) -> VortexResult { - Ok(vortex_pco::PcoArray::from_primitive( - stats.source(), - pco::DEFAULT_COMPRESSION_LEVEL, - 8192, - )? 
- .into_array()) - } -} - -#[cfg(test)] -mod tests { - use std::iter; - - use itertools::Itertools; - use rand::Rng; - use rand::SeedableRng; - use rand::rngs::StdRng; - use vortex_array::DynArray; - use vortex_array::IntoArray; - use vortex_array::ToCanonical; - use vortex_array::arrays::Dict; - use vortex_array::arrays::PrimitiveArray; - use vortex_array::assert_arrays_eq; - use vortex_array::validity::Validity; - use vortex_array::vtable::ValidityHelper; - use vortex_buffer::Buffer; - use vortex_buffer::BufferMut; - use vortex_buffer::buffer; - use vortex_error::VortexResult; - use vortex_sequence::Sequence; - use vortex_sparse::Sparse; - - use super::IntegerStats; - use super::RLE_INTEGER_SCHEME; - use super::SequenceScheme; - use super::SparseScheme; - use crate::BtrBlocksCompressor; - use crate::CompressorContext; - use crate::CompressorExt; - use crate::CompressorStats; - use crate::Scheme; - - #[test] - fn test_empty() -> VortexResult<()> { - // Make sure empty array compression does not fail - let btr = BtrBlocksCompressor::default(); - let result = btr.integer_compressor().compress( - &btr, - &PrimitiveArray::new(Buffer::::empty(), Validity::NonNullable), - CompressorContext::default(), - &[], - )?; - - assert!(result.is_empty()); - Ok(()) - } - - #[test] - fn test_dict_encodable() -> VortexResult<()> { - let mut codes = BufferMut::::with_capacity(65_535); - // Write some runs of length 3 of a handful of different values. Interrupted by some - // one-off values. 
- - let numbers = [0, 10, 50, 100, 1000, 3000] - .into_iter() - .map(|i| 12340 * i) // must be big enough to not prefer fastlanes.bitpacked - .collect_vec(); - - let mut rng = StdRng::seed_from_u64(1u64); - while codes.len() < 64000 { - let run_length = rng.next_u32() % 5; - let value = numbers[rng.next_u32() as usize % numbers.len()]; - for _ in 0..run_length { - codes.push(value); - } - } - - let primitive = codes.freeze().into_array().to_primitive(); - let btr = BtrBlocksCompressor::default(); - let compressed = btr.integer_compressor().compress( - &btr, - &primitive, - CompressorContext::default(), - &[], - )?; - assert!(compressed.is::()); - Ok(()) - } - - #[test] - fn sparse_with_nulls() -> VortexResult<()> { - let array = PrimitiveArray::new( - buffer![189u8, 189, 189, 0, 46], - Validity::from_iter(vec![true, true, true, true, false]), - ); - let btr = BtrBlocksCompressor::default(); - let compressed = SparseScheme.compress( - &btr, - &IntegerStats::generate(&array), - CompressorContext::default(), - &[], - )?; - assert!(compressed.is::()); - let decoded = compressed.clone(); - let expected = - PrimitiveArray::new(buffer![189u8, 189, 189, 0, 0], array.validity().clone()) - .into_array(); - assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); - Ok(()) - } - - #[test] - fn sparse_mostly_nulls() -> VortexResult<()> { - let array = PrimitiveArray::new( - buffer![189u8, 189, 189, 189, 189, 189, 189, 189, 189, 0, 46], - Validity::from_iter(vec![ - false, false, false, false, false, false, false, false, false, false, true, - ]), - ); - let btr = BtrBlocksCompressor::default(); - let compressed = SparseScheme.compress( - &btr, - &IntegerStats::generate(&array), - CompressorContext::default(), - &[], - )?; - assert!(compressed.is::()); - let decoded = compressed.clone(); - let expected = PrimitiveArray::new( - buffer![0u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 46], - array.validity().clone(), - ) - .into_array(); - assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); - 
Ok(()) - } - - #[test] - fn nullable_sequence() -> VortexResult<()> { - let values = (0i32..20).step_by(7).collect_vec(); - let array = PrimitiveArray::from_option_iter(values.clone().into_iter().map(Some)); - let btr = BtrBlocksCompressor::default(); - let compressed = SequenceScheme.compress( - &btr, - &IntegerStats::generate(&array), - CompressorContext::default(), - &[], - )?; - assert!(compressed.is::()); - let decoded = compressed; - let expected = PrimitiveArray::from_option_iter(values.into_iter().map(Some)).into_array(); - assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); - Ok(()) - } - - #[test] - fn test_rle_compression() -> VortexResult<()> { - let mut values = Vec::new(); - values.extend(iter::repeat_n(42i32, 100)); - values.extend(iter::repeat_n(123i32, 200)); - values.extend(iter::repeat_n(987i32, 150)); - - let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); - let btr = BtrBlocksCompressor::default(); - let compressed = RLE_INTEGER_SCHEME.compress( - &btr, - &IntegerStats::generate(&array), - CompressorContext::default(), - &[], - )?; - - let decoded = compressed; - let expected = Buffer::copy_from(&values).into_array(); - assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); - Ok(()) - } - - #[test_with::env(CI)] - #[test_with::no_env(VORTEX_SKIP_SLOW_TESTS)] - fn compress_large_int() -> VortexResult<()> { - const NUM_LISTS: usize = 10_000; - const ELEMENTS_PER_LIST: usize = 5_000; - - let prim = (0..NUM_LISTS) - .flat_map(|list_idx| { - (0..ELEMENTS_PER_LIST).map(move |elem_idx| (list_idx * 1000 + elem_idx) as f64) - }) - .collect::() - .into_array(); - - let btr = BtrBlocksCompressor::default(); - drop(btr.compress(&prim)?); - - Ok(()) - } -} - -/// Tests to verify that each integer compression scheme produces the expected encoding. 
-#[cfg(test)] -mod scheme_selection_tests { - use std::iter; - - use rand::Rng; - use rand::SeedableRng; - use rand::rngs::StdRng; - use vortex_array::arrays::Constant; - use vortex_array::arrays::Dict; - use vortex_array::arrays::PrimitiveArray; - use vortex_array::validity::Validity; - use vortex_buffer::Buffer; - use vortex_error::VortexResult; - use vortex_fastlanes::BitPacked; - use vortex_fastlanes::FoR; - use vortex_fastlanes::RLE; - use vortex_runend::RunEnd; - use vortex_sequence::Sequence; - use vortex_sparse::Sparse; - - use crate::BtrBlocksCompressor; - use crate::CompressorContext; - use crate::CompressorExt; - use crate::IntCode; - - #[test] - fn test_constant_compressed() -> VortexResult<()> { - let values: Vec = iter::repeat_n(42, 100).collect(); - let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); - let btr = BtrBlocksCompressor::default(); - let compressed = - btr.integer_compressor() - .compress(&btr, &array, CompressorContext::default(), &[])?; - assert!(compressed.is::()); - Ok(()) - } - - #[test] - fn test_for_compressed() -> VortexResult<()> { - let values: Vec = (0..1000).map(|i| 1_000_000 + ((i * 37) % 100)).collect(); - let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); - let btr = BtrBlocksCompressor::default(); - let compressed = - btr.integer_compressor() - .compress(&btr, &array, CompressorContext::default(), &[])?; - assert!(compressed.is::()); - Ok(()) - } - - #[test] - fn test_bitpacking_compressed() -> VortexResult<()> { - let values: Vec = (0..1000).map(|i| i % 16).collect(); - let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); - let btr = BtrBlocksCompressor::default(); - let compressed = - btr.integer_compressor() - .compress(&btr, &array, CompressorContext::default(), &[])?; - assert!(compressed.is::()); - Ok(()) - } - - #[test] - fn test_sparse_compressed() -> VortexResult<()> { - let mut values: Vec = Vec::new(); - for i in 
0..1000 { - if i % 20 == 0 { - values.push(2_000_000 + (i * 7) % 1000); - } else { - values.push(1_000_000); - } - } - let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); - let btr = BtrBlocksCompressor::default(); - let compressed = - btr.integer_compressor() - .compress(&btr, &array, CompressorContext::default(), &[])?; - assert!(compressed.is::()); - Ok(()) - } - - #[test] - fn test_dict_compressed() -> VortexResult<()> { - let mut codes = Vec::with_capacity(65_535); - let numbers: Vec = [0, 10, 50, 100, 1000, 3000] - .into_iter() - .map(|i| 12340 * i) // must be big enough to not prefer fastlanes.bitpacked - .collect(); - - let mut rng = StdRng::seed_from_u64(1u64); - while codes.len() < 64000 { - let run_length = rng.next_u32() % 5; - let value = numbers[rng.next_u32() as usize % numbers.len()]; - for _ in 0..run_length { - codes.push(value); - } - } - - let array = PrimitiveArray::new(Buffer::copy_from(&codes), Validity::NonNullable); - let btr = BtrBlocksCompressor::default(); - let compressed = - btr.integer_compressor() - .compress(&btr, &array, CompressorContext::default(), &[])?; - assert!(compressed.is::()); - Ok(()) - } - - #[test] - fn test_runend_compressed() -> VortexResult<()> { - let mut values: Vec = Vec::new(); - for i in 0..100 { - values.extend(iter::repeat_n((i32::MAX - 50).wrapping_add(i), 10)); - } - let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); - let btr = BtrBlocksCompressor::default(); - let compressed = - btr.integer_compressor() - .compress(&btr, &array, CompressorContext::default(), &[])?; - assert!(compressed.is::()); - Ok(()) - } - - #[test] - fn test_sequence_compressed() -> VortexResult<()> { - let values: Vec = (0..1000).map(|i| i * 7).collect(); - let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); - let btr = BtrBlocksCompressor::default(); - let compressed = - btr.integer_compressor() - .compress(&btr, &array, 
CompressorContext::default(), &[])?; - assert!(compressed.is::()); - Ok(()) - } - - #[test] - fn test_rle_compressed() -> VortexResult<()> { - let mut values: Vec = Vec::new(); - for i in 0..1024 { - values.extend(iter::repeat_n(i, 10)); - } - let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); - let btr = BtrBlocksCompressor::default(); - let compressed = btr.integer_compressor().compress( - &btr, - &array, - CompressorContext::default(), - &[IntCode::RunEnd], - )?; - assert!(compressed.is::()); - Ok(()) - } -} diff --git a/vortex-btrblocks/src/compressor/integer/stats.rs b/vortex-btrblocks/src/compressor/integer/stats.rs deleted file mode 100644 index 111a1b7a155..00000000000 --- a/vortex-btrblocks/src/compressor/integer/stats.rs +++ /dev/null @@ -1,519 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -use std::hash::Hash; - -use num_traits::PrimInt; -use rustc_hash::FxBuildHasher; -use vortex_array::IntoArray; -use vortex_array::ToCanonical; -use vortex_array::arrays::Primitive; -use vortex_array::arrays::PrimitiveArray; -use vortex_array::arrays::primitive::NativeValue; -use vortex_array::dtype::IntegerPType; -use vortex_array::expr::stats::Stat; -use vortex_array::match_each_integer_ptype; -use vortex_array::scalar::PValue; -use vortex_array::scalar::Scalar; -use vortex_buffer::BitBuffer; -use vortex_error::VortexError; -use vortex_error::VortexExpect; -use vortex_error::VortexResult; -use vortex_mask::AllOr; -use vortex_utils::aliases::hash_map::HashMap; - -use crate::CompressorStats; -use crate::GenerateStatsOptions; -use crate::compressor::rle::RLEStats; -use crate::sample::sample; - -#[derive(Clone, Debug)] -pub struct TypedStats { - pub min: T, - pub max: T, - pub top_value: T, - pub top_count: u32, - pub distinct_values: HashMap, u32, FxBuildHasher>, -} - -/// Type-erased container for one of the [TypedStats] variants. 
-/// -/// Building the `TypedStats` is considerably faster and cheaper than building a type-erased -/// set of stats. We then perform a variety of access methods on them. -#[derive(Clone, Debug)] -pub enum ErasedStats { - U8(TypedStats), - U16(TypedStats), - U32(TypedStats), - U64(TypedStats), - I8(TypedStats), - I16(TypedStats), - I32(TypedStats), - I64(TypedStats), -} - -impl ErasedStats { - pub fn min_is_zero(&self) -> bool { - match &self { - ErasedStats::U8(x) => x.min == 0, - ErasedStats::U16(x) => x.min == 0, - ErasedStats::U32(x) => x.min == 0, - ErasedStats::U64(x) => x.min == 0, - ErasedStats::I8(x) => x.min == 0, - ErasedStats::I16(x) => x.min == 0, - ErasedStats::I32(x) => x.min == 0, - ErasedStats::I64(x) => x.min == 0, - } - } - - pub fn min_is_negative(&self) -> bool { - match &self { - ErasedStats::U8(_) - | ErasedStats::U16(_) - | ErasedStats::U32(_) - | ErasedStats::U64(_) => false, - ErasedStats::I8(x) => x.min < 0, - ErasedStats::I16(x) => x.min < 0, - ErasedStats::I32(x) => x.min < 0, - ErasedStats::I64(x) => x.min < 0, - } - } - - // Difference between max and min. - pub fn max_minus_min(&self) -> u64 { - match &self { - ErasedStats::U8(x) => (x.max - x.min) as u64, - ErasedStats::U16(x) => (x.max - x.min) as u64, - ErasedStats::U32(x) => (x.max - x.min) as u64, - ErasedStats::U64(x) => x.max - x.min, - ErasedStats::I8(x) => (x.max as i16 - x.min as i16) as u64, - ErasedStats::I16(x) => (x.max as i32 - x.min as i32) as u64, - ErasedStats::I32(x) => (x.max as i64 - x.min as i64) as u64, - ErasedStats::I64(x) => u64::try_from(x.max as i128 - x.min as i128) - .vortex_expect("max minus min result bigger than u64"), - } - } - - /// Returns the ilog2 of the max value when transmuted to unsigned, or None if zero. - /// - /// This matches how BitPacking computes bit width: it reinterprets signed values as - /// unsigned (preserving bit pattern) and uses leading_zeros. 
For non-negative signed - /// values, the transmuted value equals the original value. - /// - /// This is used to determine if FOR encoding would reduce bit width compared to - /// direct BitPacking. If `max_ilog2() == max_minus_min_ilog2()`, FOR doesn't help. - pub fn max_ilog2(&self) -> Option { - match &self { - ErasedStats::U8(x) => x.max.checked_ilog2(), - ErasedStats::U16(x) => x.max.checked_ilog2(), - ErasedStats::U32(x) => x.max.checked_ilog2(), - ErasedStats::U64(x) => x.max.checked_ilog2(), - // Transmute signed to unsigned (bit pattern preserved) to match BitPacking behavior - ErasedStats::I8(x) => (x.max as u8).checked_ilog2(), - ErasedStats::I16(x) => (x.max as u16).checked_ilog2(), - ErasedStats::I32(x) => (x.max as u32).checked_ilog2(), - ErasedStats::I64(x) => (x.max as u64).checked_ilog2(), - } - } - - /// Get the most commonly occurring value and its count - pub fn top_value_and_count(&self) -> (PValue, u32) { - match &self { - ErasedStats::U8(x) => (x.top_value.into(), x.top_count), - ErasedStats::U16(x) => (x.top_value.into(), x.top_count), - ErasedStats::U32(x) => (x.top_value.into(), x.top_count), - ErasedStats::U64(x) => (x.top_value.into(), x.top_count), - ErasedStats::I8(x) => (x.top_value.into(), x.top_count), - ErasedStats::I16(x) => (x.top_value.into(), x.top_count), - ErasedStats::I32(x) => (x.top_value.into(), x.top_count), - ErasedStats::I64(x) => (x.top_value.into(), x.top_count), - } - } -} - -macro_rules! 
impl_from_typed { - ($T:ty, $variant:path) => { - impl From> for ErasedStats { - fn from(typed: TypedStats<$T>) -> Self { - $variant(typed) - } - } - }; -} - -impl_from_typed!(u8, ErasedStats::U8); -impl_from_typed!(u16, ErasedStats::U16); -impl_from_typed!(u32, ErasedStats::U32); -impl_from_typed!(u64, ErasedStats::U64); -impl_from_typed!(i8, ErasedStats::I8); -impl_from_typed!(i16, ErasedStats::I16); -impl_from_typed!(i32, ErasedStats::I32); -impl_from_typed!(i64, ErasedStats::I64); - -/// Array of integers and relevant stats for compression. -#[derive(Clone, Debug)] -pub struct IntegerStats { - pub(super) src: PrimitiveArray, - // cache for validity.false_count() - pub(super) null_count: u32, - // cache for validity.true_count() - pub(super) value_count: u32, - pub(super) average_run_length: u32, - pub(super) distinct_values_count: u32, - pub(crate) typed: ErasedStats, -} - -impl IntegerStats { - fn generate_opts_fallible( - input: &PrimitiveArray, - opts: GenerateStatsOptions, - ) -> VortexResult { - match_each_integer_ptype!(input.ptype(), |T| { - typed_int_stats::(input, opts.count_distinct_values) - }) - } -} - -impl CompressorStats for IntegerStats { - type ArrayVTable = Primitive; - - fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self { - Self::generate_opts_fallible(input, opts) - .vortex_expect("IntegerStats::generate_opts should not fail") - } - - fn source(&self) -> &PrimitiveArray { - &self.src - } - - fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self { - let sampled = - sample(&self.src.clone().into_array(), sample_size, sample_count).to_primitive(); - - Self::generate_opts(&sampled, opts) - } -} - -impl RLEStats for IntegerStats { - fn value_count(&self) -> u32 { - self.value_count - } - - fn average_run_length(&self) -> u32 { - self.average_run_length - } - - fn source(&self) -> &PrimitiveArray { - &self.src - } -} - -fn typed_int_stats( - array: &PrimitiveArray, - 
count_distinct_values: bool, -) -> VortexResult -where - T: IntegerPType + PrimInt + for<'a> TryFrom<&'a Scalar, Error = VortexError>, - TypedStats: Into, - NativeValue: Eq + Hash, -{ - // Special case: empty array - if array.is_empty() { - return Ok(IntegerStats { - src: array.clone(), - null_count: 0, - value_count: 0, - average_run_length: 0, - distinct_values_count: 0, - typed: TypedStats { - min: T::max_value(), - max: T::min_value(), - top_value: T::default(), - top_count: 0, - distinct_values: HashMap::with_hasher(FxBuildHasher), - } - .into(), - }); - } else if array.all_invalid()? { - return Ok(IntegerStats { - src: array.clone(), - null_count: u32::try_from(array.len())?, - value_count: 0, - average_run_length: 0, - distinct_values_count: 0, - typed: TypedStats { - min: T::max_value(), - max: T::min_value(), - top_value: T::default(), - top_count: 0, - distinct_values: HashMap::with_hasher(FxBuildHasher), - } - .into(), - }); - } - - let validity = array.validity_mask()?; - let null_count = validity.false_count(); - let value_count = validity.true_count(); - - // Initialize loop state - let head_idx = validity - .first() - .vortex_expect("All null masks have been handled before"); - let buffer = array.to_buffer::(); - let head = buffer[head_idx]; - - let mut loop_state = LoopState { - distinct_values: if count_distinct_values { - HashMap::with_capacity_and_hasher(array.len() / 2, FxBuildHasher) - } else { - HashMap::with_hasher(FxBuildHasher) - }, - prev: head, - runs: 1, - }; - - let sliced = buffer.slice(head_idx..array.len()); - let mut chunks = sliced.as_slice().chunks_exact(64); - match validity.bit_buffer() { - AllOr::All => { - for chunk in &mut chunks { - inner_loop_nonnull( - chunk.try_into().ok().vortex_expect("chunk size must be 64"), - count_distinct_values, - &mut loop_state, - ) - } - let remainder = chunks.remainder(); - inner_loop_naive( - remainder, - count_distinct_values, - &BitBuffer::new_set(remainder.len()), - &mut loop_state, - ); - 
} - AllOr::None => unreachable!("All invalid arrays have been handled before"), - AllOr::Some(v) => { - let mask = v.slice(head_idx..array.len()); - let mut offset = 0; - for chunk in &mut chunks { - let validity = mask.slice(offset..(offset + 64)); - offset += 64; - - match validity.true_count() { - // All nulls -> no stats to update - 0 => continue, - // Inner loop for when validity check can be elided - 64 => inner_loop_nonnull( - chunk.try_into().ok().vortex_expect("chunk size must be 64"), - count_distinct_values, - &mut loop_state, - ), - // Inner loop for when we need to check validity - _ => inner_loop_nullable( - chunk.try_into().ok().vortex_expect("chunk size must be 64"), - count_distinct_values, - &validity, - &mut loop_state, - ), - } - } - // Final iteration, run naive loop - let remainder = chunks.remainder(); - inner_loop_naive( - remainder, - count_distinct_values, - &mask.slice(offset..(offset + remainder.len())), - &mut loop_state, - ); - } - } - - let (top_value, top_count) = if count_distinct_values { - let (&top_value, &top_count) = loop_state - .distinct_values - .iter() - .max_by_key(|&(_, &count)| count) - .vortex_expect("non-empty"); - (top_value.0, top_count) - } else { - (T::default(), 0) - }; - - let runs = loop_state.runs; - let distinct_values_count = if count_distinct_values { - u32::try_from(loop_state.distinct_values.len())? 
- } else { - u32::MAX - }; - - let min = array - .statistics() - .compute_as::(Stat::Min) - .vortex_expect("min should be computed"); - - let max = array - .statistics() - .compute_as::(Stat::Max) - .vortex_expect("max should be computed"); - - let typed = TypedStats { - min, - max, - distinct_values: loop_state.distinct_values, - top_value, - top_count, - }; - - let null_count = u32::try_from(null_count)?; - let value_count = u32::try_from(value_count)?; - - Ok(IntegerStats { - src: array.clone(), - null_count, - value_count, - average_run_length: value_count / runs, - distinct_values_count, - typed: typed.into(), - }) -} - -struct LoopState { - prev: T, - runs: u32, - distinct_values: HashMap, u32, FxBuildHasher>, -} - -#[inline(always)] -fn inner_loop_nonnull( - values: &[T; 64], - count_distinct_values: bool, - state: &mut LoopState, -) where - NativeValue: Eq + Hash, -{ - for &value in values { - if count_distinct_values { - *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1; - } - - if value != state.prev { - state.prev = value; - state.runs += 1; - } - } -} - -#[inline(always)] -fn inner_loop_nullable( - values: &[T; 64], - count_distinct_values: bool, - is_valid: &BitBuffer, - state: &mut LoopState, -) where - NativeValue: Eq + Hash, -{ - for (idx, &value) in values.iter().enumerate() { - if is_valid.value(idx) { - if count_distinct_values { - *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1; - } - - if value != state.prev { - state.prev = value; - state.runs += 1; - } - } - } -} - -#[inline(always)] -fn inner_loop_naive( - values: &[T], - count_distinct_values: bool, - is_valid: &BitBuffer, - state: &mut LoopState, -) where - NativeValue: Eq + Hash, -{ - for (idx, &value) in values.iter().enumerate() { - if is_valid.value(idx) { - if count_distinct_values { - *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1; - } - - if value != state.prev { - state.prev = value; - state.runs += 1; - } - } - } -} - 
-#[cfg(test)] -mod tests { - use std::iter; - - use vortex_array::arrays::PrimitiveArray; - use vortex_array::validity::Validity; - use vortex_buffer::BitBuffer; - use vortex_buffer::Buffer; - use vortex_buffer::buffer; - use vortex_error::VortexResult; - - use super::IntegerStats; - use super::typed_int_stats; - use crate::CompressorStats; - - #[test] - fn test_naive_count_distinct_values() -> VortexResult<()> { - let array = PrimitiveArray::new(buffer![217u8, 0], Validity::NonNullable); - let stats = typed_int_stats::(&array, true)?; - assert_eq!(stats.distinct_values_count, 2); - Ok(()) - } - - #[test] - fn test_naive_count_distinct_values_nullable() -> VortexResult<()> { - let array = PrimitiveArray::new( - buffer![217u8, 0], - Validity::from(BitBuffer::from(vec![true, false])), - ); - let stats = typed_int_stats::(&array, true)?; - assert_eq!(stats.distinct_values_count, 1); - Ok(()) - } - - #[test] - fn test_count_distinct_values() -> VortexResult<()> { - let array = PrimitiveArray::new((0..128u8).collect::>(), Validity::NonNullable); - let stats = typed_int_stats::(&array, true)?; - assert_eq!(stats.distinct_values_count, 128); - Ok(()) - } - - #[test] - fn test_count_distinct_values_nullable() -> VortexResult<()> { - let array = PrimitiveArray::new( - (0..128u8).collect::>(), - Validity::from(BitBuffer::from_iter( - iter::repeat_n(vec![true, false], 64).flatten(), - )), - ); - let stats = typed_int_stats::(&array, true)?; - assert_eq!(stats.distinct_values_count, 64); - Ok(()) - } - - #[test] - fn test_integer_stats_leading_nulls() { - let ints = PrimitiveArray::new(buffer![0, 1, 2], Validity::from_iter([false, true, true])); - - let stats = IntegerStats::generate(&ints); - - assert_eq!(stats.value_count, 2); - assert_eq!(stats.null_count, 1); - assert_eq!(stats.average_run_length, 1); - assert_eq!(stats.distinct_values_count, 2); - } -} diff --git a/vortex-btrblocks/src/compressor/mod.rs b/vortex-btrblocks/src/compressor/mod.rs deleted file mode 100644 
index 5c3a31271cd..00000000000 --- a/vortex-btrblocks/src/compressor/mod.rs +++ /dev/null @@ -1,178 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Type-specific compressor traits that drive scheme selection and compression. -//! -//! [`Compressor`] defines the interface: generate statistics for an array via -//! [`Compressor::gen_stats`], and provide available [`Scheme`]s via [`Compressor::schemes`]. -//! -//! [`CompressorExt`] is blanket-implemented for all `Compressor`s and adds the core logic: -//! -//! - [`CompressorExt::choose_scheme`] iterates all schemes, skips excluded ones, and calls -//! [`Scheme::expected_compression_ratio`] on each. It returns the scheme with the highest ratio -//! above 1.0, or falls back to the default. See the [`scheme`](crate::scheme) module for how -//! ratio estimation works. -//! - [`CompressorExt::compress`] generates stats, calls `choose_scheme()`, and applies the -//! result. If compression did not shrink the array, the original is returned. - -use vortex_array::ArrayRef; -use vortex_array::IntoArray; -use vortex_array::arrays::ConstantArray; -use vortex_array::scalar::Scalar; -use vortex_array::vtable::VTable; -use vortex_error::VortexResult; - -use crate::BtrBlocksCompressor; -use crate::CompressorContext; -use crate::CompressorStats; -use crate::Scheme; - -pub(crate) mod decimal; -pub(crate) mod float; -pub(crate) mod integer; -mod patches; -mod rle; -pub(crate) mod string; -pub(crate) mod temporal; - -/// Maximum cascade depth for compression. -pub(crate) const MAX_CASCADE: usize = 3; - -/// A compressor for a particular input type. -/// -/// This trait defines the interface for type-specific compressors that can adaptively -/// choose and apply compression schemes based on data characteristics. Compressors -/// analyze input arrays, select optimal compression schemes, and handle cascading -/// compression with multiple encoding layers. 
-/// -/// The compressor works by generating statistics on the input data, evaluating -/// available compression schemes, and selecting the one with the best compression ratio. -pub trait Compressor { - /// The VTable type for arrays this compressor operates on. - type ArrayVTable: VTable; - /// The compression scheme type used by this compressor. - type SchemeType: Scheme + ?Sized; - /// The statistics type used to analyze arrays for compression. - type StatsType: CompressorStats; - - /// Generates statistics for the given array to guide compression scheme selection. - fn gen_stats(&self, array: &::Array) -> Self::StatsType; - - /// Returns all available compression schemes for this compressor. - fn schemes(&self) -> &[&'static Self::SchemeType]; - /// Returns the default fallback compression scheme. - fn default_scheme(&self) -> &'static Self::SchemeType; -} - -/// Extension trait providing scheme selection and compression for compressors. -pub trait CompressorExt: Compressor -where - Self::SchemeType: 'static, -{ - /// Selects the best compression scheme based on expected compression ratios. - /// - /// Evaluates all available schemes against the provided statistics and returns - /// the one with the highest compression ratio. Falls back to the default scheme - /// if no scheme provides compression benefits. 
- #[allow(clippy::cognitive_complexity)] - fn choose_scheme( - &self, - compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - ctx: CompressorContext, - excludes: &[::CodeType], - ) -> VortexResult<&'static Self::SchemeType> { - let mut best_ratio = 1.0; - let mut best_scheme: Option<&'static Self::SchemeType> = None; - - // logging helpers - let depth = MAX_CASCADE - ctx.allowed_cascading; - - for scheme in self.schemes().iter() { - // Skip excluded schemes - if excludes.contains(&scheme.code()) { - continue; - } - - // We never choose Constant for a sample - if ctx.is_sample && scheme.is_constant() { - continue; - } - - tracing::trace!( - is_sample = ctx.is_sample, - depth, - is_constant = scheme.is_constant(), - ?scheme, - "Trying compression scheme" - ); - - let ratio = scheme.expected_compression_ratio(compressor, stats, ctx, excludes)?; - tracing::trace!( - is_sample = ctx.is_sample, - depth, - ratio, - ?scheme, - "Expected compression result" - ); - - if !(ratio.is_subnormal() || ratio.is_infinite() || ratio.is_nan()) { - if ratio > best_ratio { - best_ratio = ratio; - best_scheme = Some(*scheme); - } - } else { - tracing::trace!( - "Calculated invalid compression ratio {ratio} for scheme: {scheme:?}. Must not be sub-normal, infinite or nan." - ); - } - } - - tracing::trace!(depth, scheme = ?best_scheme, ratio = best_ratio, "best scheme found"); - - if let Some(best) = best_scheme { - Ok(best) - } else { - Ok(self.default_scheme()) - } - } - - /// Compresses an array using this compressor. - /// - /// Generates statistics on the input array, selects the best compression scheme, - /// and applies it. Returns the original array if compression would increase size. - fn compress( - &self, - btr_blocks_compressor: &BtrBlocksCompressor, - array: &<::ArrayVTable as VTable>::Array, - ctx: CompressorContext, - excludes: &[::CodeType], - ) -> VortexResult { - // Avoid compressing empty arrays. 
- if array.is_empty() { - return Ok(array.to_array()); - } - - // Avoid compressing all-null arrays. - if array.all_invalid()? { - return Ok( - ConstantArray::new(Scalar::null(array.dtype().clone()), array.len()).into_array(), - ); - } - - // Generate stats on the array directly. - let stats = self.gen_stats(array); - let best_scheme = self.choose_scheme(btr_blocks_compressor, &stats, ctx, excludes)?; - - let output = best_scheme.compress(btr_blocks_compressor, &stats, ctx, excludes)?; - if output.nbytes() < array.nbytes() { - Ok(output) - } else { - tracing::debug!("resulting tree too large: {}", output.encoding_id()); - Ok(array.to_array()) - } - } -} - -// Blanket implementation for all Compressor types with 'static SchemeType -impl CompressorExt for T where T::SchemeType: 'static {} diff --git a/vortex-btrblocks/src/compressor/rle.rs b/vortex-btrblocks/src/compressor/rle.rs deleted file mode 100644 index ef4b3fcb048..00000000000 --- a/vortex-btrblocks/src/compressor/rle.rs +++ /dev/null @@ -1,192 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -use std::fmt::Debug; -use std::hash::Hash; -use std::marker::PhantomData; - -use vortex_array::ArrayRef; -use vortex_array::Canonical; -use vortex_array::IntoArray; -use vortex_array::ToCanonical; -use vortex_array::arrays::PrimitiveArray; -use vortex_error::VortexResult; -use vortex_fastlanes::RLEArray; - -use crate::BtrBlocksCompressor; -use crate::CanonicalCompressor; -use crate::CompressorContext; -use crate::CompressorStats; -use crate::Excludes; -use crate::IntCode; -use crate::Scheme; -use crate::SchemeExt; - -/// Threshold for the average run length in an array before we consider run-length encoding. -pub const RUN_LENGTH_THRESHOLD: u32 = 4; - -/// Trait for accessing RLE-specific statistics. 
-pub trait RLEStats { - fn value_count(&self) -> u32; - fn average_run_length(&self) -> u32; - fn source(&self) -> &PrimitiveArray; -} - -/// Configuration trait for RLE schemes. -/// -/// Implement this trait to define the behavior of an RLE scheme for a specific -/// stats and code type combination. -pub trait RLEConfig: Debug + Send + Sync + 'static { - /// The statistics type used by this RLE scheme. - type Stats: RLEStats + CompressorStats; - /// The code type used to identify schemes. - type Code: Copy + Clone + Debug + Hash + Eq + Ord; - - /// The unique code identifying this RLE scheme. - const CODE: Self::Code; - - /// Compress the values array after RLE encoding. - fn compress_values( - compressor: &BtrBlocksCompressor, - values: &PrimitiveArray, - ctx: CompressorContext, - excludes: &[Self::Code], - ) -> VortexResult; -} - -/// RLE scheme that is generic over a configuration type. -/// -/// This is a ZST (zero-sized type) - all behavior is defined by the `RLEConfig` trait. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub struct RLEScheme(PhantomData); - -impl RLEScheme { - /// Creates a new RLE scheme. - pub const fn new() -> Self { - Self(PhantomData) - } -} - -impl Default for RLEScheme { - fn default() -> Self { - Self::new() - } -} - -impl Scheme for RLEScheme { - type StatsType = C::Stats; - type CodeType = C::Code; - - fn code(&self) -> C::Code { - C::CODE - } - - fn expected_compression_ratio( - &self, - compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - ctx: CompressorContext, - excludes: &[C::Code], - ) -> VortexResult { - // RLE is only useful when we cascade it with another encoding. - if ctx.allowed_cascading == 0 { - return Ok(0.0); - } - - // Don't compress all-null or empty arrays. - if stats.value_count() == 0 { - return Ok(0.0); - } - - // Check whether RLE is a good fit, based on the average run length. 
- if stats.average_run_length() < RUN_LENGTH_THRESHOLD { - return Ok(0.0); - } - - // Run compression on a sample to see how it performs. - self.estimate_compression_ratio_with_sampling(compressor, stats, ctx, excludes) - } - - fn compress( - &self, - compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - ctx: CompressorContext, - excludes: &[C::Code], - ) -> VortexResult { - let rle_array = RLEArray::encode(RLEStats::source(stats))?; - - if ctx.allowed_cascading == 0 { - return Ok(rle_array.into_array()); - } - - // Prevent RLE recursion. - let mut new_excludes = vec![self.code()]; - new_excludes.extend_from_slice(excludes); - - let compressed_values = C::compress_values( - compressor, - &rle_array.values().to_primitive(), - ctx.descend(), - &new_excludes, - )?; - - // Delta in an unstable encoding, once we deem it stable we can switch over to this always. - #[cfg(feature = "unstable_encodings")] - let compressed_indices = try_compress_delta( - &rle_array.indices().to_primitive().narrow()?, - compressor, - ctx.descend(), - Excludes::from(&[IntCode::Dict]), - )?; - - #[cfg(not(feature = "unstable_encodings"))] - let compressed_indices = compressor.compress_canonical( - Canonical::Primitive(rle_array.indices().to_primitive().narrow()?), - ctx.descend(), - Excludes::from(&[IntCode::Dict]), - )?; - - let compressed_offsets = compressor.compress_canonical( - Canonical::Primitive(rle_array.values_idx_offsets().to_primitive().narrow()?), - ctx.descend(), - Excludes::from(&[IntCode::Dict]), - )?; - - // SAFETY: Recursive compression doesn't affect the invariants. 
- unsafe { - Ok(RLEArray::new_unchecked( - compressed_values, - compressed_indices, - compressed_offsets, - rle_array.dtype().clone(), - rle_array.offset(), - rle_array.len(), - ) - .into_array()) - } - } -} - -#[cfg(feature = "unstable_encodings")] -fn try_compress_delta( - primitive_array: &PrimitiveArray, - compressor: &BtrBlocksCompressor, - ctx: CompressorContext, - excludes: Excludes, -) -> VortexResult { - use vortex_array::VortexSessionExecute; - - let (bases, deltas) = vortex_fastlanes::delta_compress( - primitive_array, - &mut vortex_array::LEGACY_SESSION.create_execution_ctx(), - )?; - - let compressed_bases = - compressor.compress_canonical(Canonical::Primitive(bases), ctx, excludes)?; - let compressed_deltas = - compressor.compress_canonical(Canonical::Primitive(deltas), ctx, excludes)?; - - vortex_fastlanes::DeltaArray::try_from_delta_compress_parts(compressed_bases, compressed_deltas) - .map(vortex_fastlanes::DeltaArray::into_array) -} diff --git a/vortex-btrblocks/src/compressor/string.rs b/vortex-btrblocks/src/compressor/string.rs deleted file mode 100644 index cac9cf969cd..00000000000 --- a/vortex-btrblocks/src/compressor/string.rs +++ /dev/null @@ -1,687 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -use std::hash::Hash; -use std::hash::Hasher; - -use enum_iterator::Sequence; -use vortex_array::ArrayRef; -use vortex_array::Canonical; -use vortex_array::IntoArray; -use vortex_array::LEGACY_SESSION; -use vortex_array::ToCanonical; -use vortex_array::VortexSessionExecute; -use vortex_array::aggregate_fn::fns::is_constant::is_constant; -use vortex_array::arrays::ConstantArray; -use vortex_array::arrays::DictArray; -use vortex_array::arrays::MaskedArray; -use vortex_array::arrays::VarBinArray; -use vortex_array::arrays::VarBinView; -use vortex_array::arrays::VarBinViewArray; -use vortex_array::builders::dict::dict_encode; -use vortex_array::scalar::Scalar; -use 
vortex_array::vtable::VTable; -use vortex_array::vtable::ValidityHelper; -use vortex_error::VortexExpect; -use vortex_error::VortexResult; -use vortex_error::vortex_err; -use vortex_fsst::FSSTArray; -use vortex_fsst::fsst_compress; -use vortex_fsst::fsst_train_compressor; -use vortex_sparse::Sparse; -use vortex_sparse::SparseArray; -use vortex_utils::aliases::hash_set::HashSet; - -use super::integer::DictScheme as IntDictScheme; -use super::integer::SequenceScheme as IntSequenceScheme; -use super::integer::SparseScheme as IntSparseScheme; -use crate::BtrBlocksCompressor; -use crate::CanonicalCompressor; -use crate::Compressor; -use crate::CompressorContext; -use crate::CompressorStats; -use crate::Excludes; -use crate::GenerateStatsOptions; -use crate::IntCode; -use crate::Scheme; -use crate::SchemeExt; -use crate::sample::sample; - -/// Array of variable-length byte arrays, and relevant stats for compression. -#[derive(Clone, Debug)] -pub struct StringStats { - src: VarBinViewArray, - estimated_distinct_count: u32, - value_count: u32, - null_count: u32, -} - -/// Estimate the number of distinct strings in the var bin view array. -fn estimate_distinct_count(strings: &VarBinViewArray) -> VortexResult { - let views = strings.views(); - // Iterate the views. Two strings which are equal must have the same first 8-bytes. - // NOTE: there are cases where this performs pessimally, e.g. when we have strings that all - // share a 4-byte prefix and have the same length. - let mut distinct = HashSet::with_capacity(views.len() / 2); - views.iter().for_each(|&view| { - #[expect( - clippy::cast_possible_truncation, - reason = "approximate uniqueness with view prefix" - )] - let len_and_prefix = view.as_u128() as u64; - distinct.insert(len_and_prefix); - }); - - Ok(u32::try_from(distinct.len())?) 
-} - -impl StringStats { - fn generate_opts_fallible( - input: &VarBinViewArray, - opts: GenerateStatsOptions, - ) -> VortexResult { - let null_count = input - .statistics() - .compute_null_count() - .ok_or_else(|| vortex_err!("Failed to compute null_count"))?; - let value_count = input.len() - null_count; - let estimated_distinct = if opts.count_distinct_values { - estimate_distinct_count(input)? - } else { - u32::MAX - }; - - Ok(Self { - src: input.clone(), - value_count: u32::try_from(value_count)?, - null_count: u32::try_from(null_count)?, - estimated_distinct_count: estimated_distinct, - }) - } -} - -impl CompressorStats for StringStats { - type ArrayVTable = VarBinView; - - fn generate_opts(input: &VarBinViewArray, opts: GenerateStatsOptions) -> Self { - Self::generate_opts_fallible(input, opts) - .vortex_expect("StringStats::generate_opts should not fail") - } - - fn source(&self) -> &VarBinViewArray { - &self.src - } - - fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self { - let sampled = - sample(&self.src.clone().into_array(), sample_size, sample_count).to_varbinview(); - - Self::generate_opts(&sampled, opts) - } -} - -/// All available string compression schemes. -pub const ALL_STRING_SCHEMES: &[&dyn StringScheme] = &[ - &UncompressedScheme, - &DictScheme, - &FSSTScheme, - &ConstantScheme, - &NullDominated, - #[cfg(feature = "zstd")] - &ZstdScheme, - #[cfg(all(feature = "zstd", feature = "unstable_encodings"))] - &ZstdBuffersScheme, -]; - -/// [`Compressor`] for strings. -#[derive(Clone, Copy)] -pub struct StringCompressor<'a> { - /// Reference to the parent compressor. 
- pub btr_blocks_compressor: &'a dyn CanonicalCompressor, -} - -impl<'a> Compressor for StringCompressor<'a> { - type ArrayVTable = VarBinView; - type SchemeType = dyn StringScheme; - type StatsType = StringStats; - - fn gen_stats(&self, array: &::Array) -> Self::StatsType { - if self - .btr_blocks_compressor - .string_schemes() - .iter() - .any(|s| s.code() == DictScheme.code()) - { - StringStats::generate_opts( - array, - GenerateStatsOptions { - count_distinct_values: true, - }, - ) - } else { - StringStats::generate_opts( - array, - GenerateStatsOptions { - count_distinct_values: false, - }, - ) - } - } - - fn schemes(&self) -> &[&'static dyn StringScheme] { - self.btr_blocks_compressor.string_schemes() - } - - fn default_scheme(&self) -> &'static Self::SchemeType { - &UncompressedScheme - } -} - -pub trait StringScheme: - Scheme + Send + Sync -{ -} - -impl StringScheme for T where - T: Scheme + Send + Sync -{ -} - -impl PartialEq for dyn StringScheme { - fn eq(&self, other: &Self) -> bool { - self.code() == other.code() - } -} - -impl Eq for dyn StringScheme {} - -impl Hash for dyn StringScheme { - fn hash(&self, state: &mut H) { - self.code().hash(state) - } -} - -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct UncompressedScheme; - -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct DictScheme; - -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct FSSTScheme; - -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct ConstantScheme; - -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct NullDominated; - -/// Zstd compression without dictionaries (nvCOMP compatible). -#[cfg(feature = "zstd")] -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct ZstdScheme; - -/// Zstd buffer-level compression preserving array layout for GPU decompression. 
-#[cfg(all(feature = "zstd", feature = "unstable_encodings"))] -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -pub struct ZstdBuffersScheme; - -/// Unique identifier for string compression schemes. -#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash, Sequence, Ord, PartialOrd)] -pub enum StringCode { - /// No compression applied. - Uncompressed, - /// Dictionary encoding for low-cardinality strings. - Dict, - /// FSST (Fast Static Symbol Table) compression. - Fsst, - /// Constant encoding for arrays with a single distinct value. - Constant, - /// Sparse encoding for null-dominated arrays. - Sparse, - /// Zstd compression without dictionaries. - Zstd, - /// Zstd buffer-level compression preserving array layout. - ZstdBuffers, -} - -impl Scheme for UncompressedScheme { - type StatsType = StringStats; - type CodeType = StringCode; - - fn code(&self) -> StringCode { - StringCode::Uncompressed - } - - fn expected_compression_ratio( - &self, - _compressor: &BtrBlocksCompressor, - _stats: &Self::StatsType, - _ctx: CompressorContext, - _excludes: &[StringCode], - ) -> VortexResult { - Ok(1.0) - } - - fn compress( - &self, - _compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - _ctx: CompressorContext, - _excludes: &[StringCode], - ) -> VortexResult { - Ok(stats.source().clone().into_array()) - } -} - -impl Scheme for DictScheme { - type StatsType = StringStats; - type CodeType = StringCode; - - fn code(&self) -> StringCode { - StringCode::Dict - } - - fn expected_compression_ratio( - &self, - compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - ctx: CompressorContext, - excludes: &[StringCode], - ) -> VortexResult { - // If we don't have a sufficiently high number of distinct values, do not attempt Dict. - if stats.estimated_distinct_count > stats.value_count / 2 { - return Ok(0.0); - } - - // If array is all null, do not attempt dict. 
- if stats.value_count == 0 { - return Ok(0.0); - } - - self.estimate_compression_ratio_with_sampling(compressor, stats, ctx, excludes) - } - - fn compress( - &self, - compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - ctx: CompressorContext, - _excludes: &[StringCode], - ) -> VortexResult { - let dict = dict_encode(&stats.source().clone().into_array())?; - - // If we are not allowed to cascade, do not attempt codes or values compression. - if ctx.allowed_cascading == 0 { - return Ok(dict.into_array()); - } - - // Find best compressor for codes and values separately - let compressed_codes = compressor.compress_canonical( - Canonical::Primitive(dict.codes().to_primitive()), - ctx.descend(), - Excludes::from(&[IntDictScheme.code(), IntSequenceScheme.code()]), - )?; - - // Attempt to compress the values with non-Dict compression. - // Currently this will only be FSST. - let compressed_values = compressor.compress_canonical( - Canonical::VarBinView(dict.values().to_varbinview()), - ctx.descend(), - Excludes::from(&[DictScheme.code()]), - )?; - - // SAFETY: compressing codes or values does not alter the invariants - unsafe { - Ok( - DictArray::new_unchecked(compressed_codes, compressed_values) - .set_all_values_referenced(dict.has_all_values_referenced()) - .into_array(), - ) - } - } -} - -impl Scheme for FSSTScheme { - type StatsType = StringStats; - type CodeType = StringCode; - - fn code(&self) -> StringCode { - StringCode::Fsst - } - - fn compress( - &self, - compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - ctx: CompressorContext, - _excludes: &[StringCode], - ) -> VortexResult { - let fsst = { - let compressor = fsst_train_compressor(&stats.src); - fsst_compress(&stats.src, &compressor) - }; - - let compressed_original_lengths = compressor.compress_canonical( - Canonical::Primitive(fsst.uncompressed_lengths().to_primitive().narrow()?), - ctx, - Excludes::none(), - )?; - - let compressed_codes_offsets = compressor.compress_canonical( - 
Canonical::Primitive(fsst.codes().offsets().to_primitive().narrow()?), - ctx, - Excludes::none(), - )?; - let compressed_codes = VarBinArray::try_new( - compressed_codes_offsets, - fsst.codes().bytes().clone(), - fsst.codes().dtype().clone(), - fsst.codes().validity().clone(), - )?; - - let fsst = FSSTArray::try_new( - fsst.dtype().clone(), - fsst.symbols().clone(), - fsst.symbol_lengths().clone(), - compressed_codes, - compressed_original_lengths, - )?; - - Ok(fsst.into_array()) - } -} - -impl Scheme for ConstantScheme { - type StatsType = StringStats; - type CodeType = StringCode; - - fn code(&self) -> Self::CodeType { - StringCode::Constant - } - - fn is_constant(&self) -> bool { - true - } - - fn expected_compression_ratio( - &self, - _compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - ctx: CompressorContext, - _excludes: &[Self::CodeType], - ) -> VortexResult { - if ctx.is_sample { - return Ok(0.0); - } - - let mut ctx = LEGACY_SESSION.create_execution_ctx(); - if stats.estimated_distinct_count > 1 - || !is_constant(&stats.src.clone().into_array(), &mut ctx)? - { - return Ok(0.0); - } - - // Force constant is these cases - Ok(f64::MAX) - } - - fn compress( - &self, - _compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - _ctx: CompressorContext, - _excludes: &[Self::CodeType], - ) -> VortexResult { - let scalar_idx = - (0..stats.source().len()).position(|idx| stats.source().is_valid(idx).unwrap_or(false)); - - match scalar_idx { - Some(idx) => { - let scalar = stats.source().scalar_at(idx)?; - let const_arr = ConstantArray::new(scalar, stats.src.len()).into_array(); - if !stats.source().all_valid()? 
{ - Ok(MaskedArray::try_new(const_arr, stats.src.validity().clone())?.into_array()) - } else { - Ok(const_arr) - } - } - None => Ok(ConstantArray::new( - Scalar::null(stats.src.dtype().clone()), - stats.src.len(), - ) - .into_array()), - } - } -} - -impl Scheme for NullDominated { - type StatsType = StringStats; - type CodeType = StringCode; - - fn code(&self) -> Self::CodeType { - StringCode::Sparse - } - - fn expected_compression_ratio( - &self, - _compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - ctx: CompressorContext, - _excludes: &[Self::CodeType], - ) -> VortexResult { - // Only use `SparseScheme` if we can cascade. - if ctx.allowed_cascading == 0 { - return Ok(0.0); - } - - if stats.value_count == 0 { - // All nulls should use ConstantScheme - return Ok(0.0); - } - - // If the majority is null, will compress well. - if stats.null_count as f64 / stats.src.len() as f64 > 0.9 { - return Ok(stats.src.len() as f64 / stats.value_count as f64); - } - - // Otherwise we don't go this route - Ok(0.0) - } - - fn compress( - &self, - compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - ctx: CompressorContext, - _excludes: &[Self::CodeType], - ) -> VortexResult { - assert!(ctx.allowed_cascading > 0); - - // We pass None as we only run this pathway for NULL-dominated string arrays - let sparse_encoded = SparseArray::encode(&stats.src.clone().into_array(), None)?; - - if let Some(sparse) = sparse_encoded.as_opt::() { - // Compress the indices only (not the values for strings) - let new_excludes = vec![IntSparseScheme.code(), IntCode::Dict]; - - let indices = sparse.patches().indices().to_primitive().narrow()?; - let compressed_indices = compressor.compress_canonical( - Canonical::Primitive(indices), - ctx.descend(), - Excludes::int_only(&new_excludes), - )?; - - SparseArray::try_new( - compressed_indices, - sparse.patches().values().clone(), - sparse.len(), - sparse.fill_scalar().clone(), - ) - .map(|a| a.into_array()) - } else { - 
Ok(sparse_encoded) - } - } -} - -#[cfg(feature = "zstd")] -impl Scheme for ZstdScheme { - type StatsType = StringStats; - type CodeType = StringCode; - - fn code(&self) -> StringCode { - StringCode::Zstd - } - - fn compress( - &self, - _compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - _ctx: CompressorContext, - _excludes: &[StringCode], - ) -> VortexResult { - let compacted = stats.source().compact_buffers()?; - Ok( - vortex_zstd::ZstdArray::from_var_bin_view_without_dict(&compacted, 3, 8192)? - .into_array(), - ) - } -} - -#[cfg(all(feature = "zstd", feature = "unstable_encodings"))] -impl Scheme for ZstdBuffersScheme { - type StatsType = StringStats; - type CodeType = StringCode; - - fn code(&self) -> StringCode { - StringCode::ZstdBuffers - } - - fn compress( - &self, - _compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - _ctx: CompressorContext, - _excludes: &[StringCode], - ) -> VortexResult { - Ok( - vortex_zstd::ZstdBuffersArray::compress(&stats.source().clone().into_array(), 3)? 
- .into_array(), - ) - } -} - -#[cfg(test)] -mod tests { - use vortex_array::IntoArray; - use vortex_array::arrays::VarBinViewArray; - use vortex_array::builders::ArrayBuilder; - use vortex_array::builders::VarBinViewBuilder; - use vortex_array::display::DisplayOptions; - use vortex_array::dtype::DType; - use vortex_array::dtype::Nullability; - use vortex_error::VortexResult; - - use crate::BtrBlocksCompressor; - - #[test] - fn test_strings() -> VortexResult<()> { - let mut strings = Vec::new(); - for _ in 0..1024 { - strings.push(Some("hello-world-1234")); - } - for _ in 0..1024 { - strings.push(Some("hello-world-56789")); - } - let strings = VarBinViewArray::from_iter(strings, DType::Utf8(Nullability::NonNullable)); - - let array_ref = strings.into_array(); - let compressed = BtrBlocksCompressor::default().compress(&array_ref)?; - assert_eq!(compressed.len(), 2048); - - let display = compressed - .display_as(DisplayOptions::MetadataOnly) - .to_string() - .to_lowercase(); - assert_eq!(display, "vortex.dict(utf8, len=2048)"); - - Ok(()) - } - - #[test] - fn test_sparse_nulls() -> VortexResult<()> { - let mut strings = VarBinViewBuilder::with_capacity(DType::Utf8(Nullability::Nullable), 100); - strings.append_nulls(99); - - strings.append_value("one little string"); - - let strings = strings.finish_into_varbinview(); - - let array_ref = strings.into_array(); - let compressed = BtrBlocksCompressor::default().compress(&array_ref)?; - assert_eq!(compressed.len(), 100); - - let display = compressed - .display_as(DisplayOptions::MetadataOnly) - .to_string() - .to_lowercase(); - assert_eq!(display, "vortex.sparse(utf8?, len=100)"); - - Ok(()) - } -} - -/// Tests to verify that each string compression scheme produces the expected encoding. 
-#[cfg(test)] -mod scheme_selection_tests { - use vortex_array::IntoArray; - use vortex_array::arrays::Constant; - use vortex_array::arrays::Dict; - use vortex_array::arrays::VarBinViewArray; - use vortex_array::dtype::DType; - use vortex_array::dtype::Nullability; - use vortex_error::VortexResult; - use vortex_fsst::FSST; - - use crate::BtrBlocksCompressor; - - #[test] - fn test_constant_compressed() -> VortexResult<()> { - let strings: Vec> = vec![Some("constant_value"); 100]; - let array = VarBinViewArray::from_iter(strings, DType::Utf8(Nullability::NonNullable)); - let array_ref = array.into_array(); - let compressed = BtrBlocksCompressor::default().compress(&array_ref)?; - assert!(compressed.is::()); - Ok(()) - } - - #[test] - fn test_dict_compressed() -> VortexResult<()> { - let distinct_values = ["apple", "banana", "cherry"]; - let mut strings = Vec::with_capacity(1000); - for i in 0..1000 { - strings.push(Some(distinct_values[i % 3])); - } - let array = VarBinViewArray::from_iter(strings, DType::Utf8(Nullability::NonNullable)); - let array_ref = array.into_array(); - let compressed = BtrBlocksCompressor::default().compress(&array_ref)?; - assert!(compressed.is::()); - Ok(()) - } - - #[test] - fn test_fsst_compressed() -> VortexResult<()> { - let mut strings = Vec::with_capacity(1000); - for i in 0..1000 { - strings.push(Some(format!( - "this_is_a_common_prefix_with_some_variation_{i}_and_a_common_suffix_pattern" - ))); - } - let array = VarBinViewArray::from_iter(strings, DType::Utf8(Nullability::NonNullable)); - let array_ref = array.into_array(); - let compressed = BtrBlocksCompressor::default().compress(&array_ref)?; - assert!(compressed.is::()); - Ok(()) - } -} diff --git a/vortex-btrblocks/src/compressor/temporal.rs b/vortex-btrblocks/src/compressor/temporal.rs deleted file mode 100644 index 6fb917be58d..00000000000 --- a/vortex-btrblocks/src/compressor/temporal.rs +++ /dev/null @@ -1,52 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// 
SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Specialized compressor for DateTimeParts metadata. - -use vortex_array::ArrayRef; -use vortex_array::Canonical; -use vortex_array::IntoArray; -use vortex_array::ToCanonical; -use vortex_array::arrays::TemporalArray; -use vortex_datetime_parts::DateTimePartsArray; -use vortex_datetime_parts::TemporalParts; -use vortex_datetime_parts::split_temporal; -use vortex_error::VortexResult; - -use crate::BtrBlocksCompressor; -use crate::CanonicalCompressor; -use crate::CompressorContext; -use crate::Excludes; - -/// Compress a temporal array into a `DateTimePartsArray`. -pub fn compress_temporal( - compressor: &BtrBlocksCompressor, - array: TemporalArray, -) -> VortexResult { - let dtype = array.dtype().clone(); - let TemporalParts { - days, - seconds, - subseconds, - } = split_temporal(array)?; - - let ctx = CompressorContext::default().descend(); - - let days = compressor.compress_canonical( - Canonical::Primitive(days.to_primitive().narrow()?), - ctx, - Excludes::none(), - )?; - let seconds = compressor.compress_canonical( - Canonical::Primitive(seconds.to_primitive().narrow()?), - ctx, - Excludes::none(), - )?; - let subseconds = compressor.compress_canonical( - Canonical::Primitive(subseconds.to_primitive().narrow()?), - ctx, - Excludes::none(), - )?; - - Ok(DateTimePartsArray::try_new(dtype, days, seconds, subseconds)?.into_array()) -} diff --git a/vortex-btrblocks/src/ctx.rs b/vortex-btrblocks/src/ctx.rs deleted file mode 100644 index f2cb6a37102..00000000000 --- a/vortex-btrblocks/src/ctx.rs +++ /dev/null @@ -1,136 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Compression context types for recursive compression. - -use crate::FloatCode; -use crate::IntCode; -use crate::MAX_CASCADE; -use crate::StringCode; - -/// Holds references to exclude lists for each compression code type. 
-/// -/// This struct is passed through recursive compression calls to specify -/// which schemes should be excluded at each level. -#[derive(Debug, Clone, Copy, Default)] -pub struct Excludes<'a> { - /// Integer schemes to exclude. - pub int: &'a [IntCode], - /// Float schemes to exclude. - pub float: &'a [FloatCode], - /// String schemes to exclude. - pub string: &'a [StringCode], -} - -impl<'a> Excludes<'a> { - /// Creates an empty excludes (no exclusions). - pub const fn none() -> Self { - Self { - int: &[], - float: &[], - string: &[], - } - } - - /// Creates excludes with only integer exclusions. - pub const fn int_only(int: &'a [IntCode]) -> Self { - Self { - int, - float: &[], - string: &[], - } - } - - /// Creates excludes with only float exclusions. - pub const fn float_only(float: &'a [FloatCode]) -> Self { - Self { - int: &[], - float, - string: &[], - } - } - - /// Creates excludes with only string exclusions. - pub const fn string_only(string: &'a [StringCode]) -> Self { - Self { - int: &[], - float: &[], - string, - } - } -} - -impl<'a> From<&'a [IntCode]> for Excludes<'a> { - fn from(int: &'a [IntCode]) -> Self { - Self::int_only(int) - } -} - -impl<'a, const N: usize> From<&'a [IntCode; N]> for Excludes<'a> { - fn from(int: &'a [IntCode; N]) -> Self { - Self::int_only(int) - } -} - -impl<'a> From<&'a [FloatCode]> for Excludes<'a> { - fn from(float: &'a [FloatCode]) -> Self { - Self::float_only(float) - } -} - -impl<'a, const N: usize> From<&'a [FloatCode; N]> for Excludes<'a> { - fn from(float: &'a [FloatCode; N]) -> Self { - Self::float_only(float) - } -} - -impl<'a> From<&'a [StringCode]> for Excludes<'a> { - fn from(string: &'a [StringCode]) -> Self { - Self::string_only(string) - } -} - -impl<'a, const N: usize> From<&'a [StringCode; N]> for Excludes<'a> { - fn from(string: &'a [StringCode; N]) -> Self { - Self::string_only(string) - } -} - -/// Context passed through recursive compression calls. 
-/// -/// Bundles `is_sample` and `allowed_cascading` which always travel together. -/// Excludes are passed separately since they're type-specific. -#[derive(Debug, Clone, Copy)] -pub struct CompressorContext { - /// Whether we're compressing a sample (for ratio estimation). - pub is_sample: bool, - /// Remaining cascade depth allowed. - pub allowed_cascading: usize, -} - -impl Default for CompressorContext { - fn default() -> Self { - Self { - is_sample: false, - allowed_cascading: MAX_CASCADE, - } - } -} - -impl CompressorContext { - /// Descend one level in the cascade (decrements `allowed_cascading`). - pub fn descend(self) -> Self { - Self { - allowed_cascading: self.allowed_cascading.saturating_sub(1), - ..self - } - } - - /// Returns a context marked as sample compression (for ratio estimation). - pub fn as_sample(self) -> Self { - Self { - is_sample: true, - ..self - } - } -} diff --git a/vortex-btrblocks/src/lib.rs b/vortex-btrblocks/src/lib.rs index 28e4eeb8dfa..26dc56b0d8f 100644 --- a/vortex-btrblocks/src/lib.rs +++ b/vortex-btrblocks/src/lib.rs @@ -12,71 +12,71 @@ //! //! # Key Features //! -//! - **Adaptive Compression**: Automatically selects the best compression scheme based on data patterns -//! - **Type-Specific Compressors**: Specialized compression for integers, floats, strings, and temporal data -//! - **Cascaded Encoding**: Multiple compression layers can be applied for optimal results -//! - **Statistical Analysis**: Uses data sampling and statistics to predict compression ratios -//! - **Recursive Structure Handling**: Compresses nested structures like structs and lists +//! - **Adaptive Compression**: Automatically selects the best compression scheme based on data +//! patterns. +//! - **Unified Scheme Trait**: A single [`Scheme`] trait covers all data types (integers, floats, +//! strings, etc.) with a [`SchemeId`] for identity. +//! - **Cascaded Encoding**: Multiple compression layers can be applied for optimal results. +//! 
- **Statistical Analysis**: Uses data sampling and statistics to predict compression ratios. +//! - **Recursive Structure Handling**: Compresses nested structures like structs and lists. //! //! # How It Works //! //! [`BtrBlocksCompressor::compress()`] takes an `&ArrayRef` and returns an `ArrayRef` that may //! use a different encoding. It first canonicalizes the input, then dispatches by type. -//! Primitives go to a type-specific `Compressor` (integer, float, or string). Compound types -//! like structs and lists recurse into their fields and elements. +//! Primitives and strings go through `choose_and_compress`, which evaluates every enabled +//! [`Scheme`] and picks the one with the best compression ratio. Compound types like structs +//! and lists recurse into their fields and elements. //! -//! Each type-specific compressor holds a static list of `Scheme` implementations (e.g. -//! BitPacking, ALP, Dict). There is no dynamic registry. The compressor evaluates each scheme by -//! compressing a ~1% sample and measuring the ratio, then picks the best. See `SchemeExt` for -//! details on how sampling works. +//! Each `Scheme` implementation declares whether it [`matches`](Scheme::matches) a given +//! canonical form and, if so, estimates the compression ratio (often by compressing a ~1% +//! sample). There is no dynamic registry — the set of schemes is fixed at build time via +//! [`ALL_SCHEMES`]. //! //! Schemes can produce arrays that are themselves further compressed (e.g. FoR then BitPacking), -//! up to `MAX_CASCADE` (3) layers deep. An `Excludes` set prevents the same scheme from being -//! applied twice in a chain. +//! up to [`MAX_CASCADE`] (3) layers deep. An excludes slice of [`SchemeId`] prevents the same +//! scheme from being applied twice in a chain. //! //! # Example //! //! ```rust -//! use vortex_btrblocks::{BtrBlocksCompressor, BtrBlocksCompressorBuilder, IntCode}; +//! 
use vortex_btrblocks::{BtrBlocksCompressor, BtrBlocksCompressorBuilder, Scheme, SchemeExt}; +//! use vortex_btrblocks::schemes::integer::IntDictScheme; //! use vortex_array::DynArray; //! -//! // Default compressor with all schemes enabled +//! // Default compressor with all schemes enabled. //! let compressor = BtrBlocksCompressor::default(); //! -//! // Configure with builder to exclude specific schemes +//! // Configure with builder to exclude specific schemes. //! let compressor = BtrBlocksCompressorBuilder::default() -//! .exclude_int([IntCode::Dict]) +//! .exclude([IntDictScheme.id()]) //! .build(); //! ``` //! //! [BtrBlocks]: https://www.cs.cit.tum.de/fileadmin/w00cfj/dis/papers/btrblocks.pdf -pub use compressor::float::FloatCode; -use compressor::float::FloatCompressor; -pub use compressor::integer::IntCode; -use compressor::integer::IntCompressor; -pub use compressor::string::StringCode; -use compressor::string::StringCompressor; - mod builder; mod canonical_compressor; -mod compressor; -mod ctx; -mod sample; -mod scheme; -mod stats; +/// Compression scheme implementations. +pub mod schemes; +// Re-export framework types from vortex-compressor for backwards compatibility. +// Btrblocks-specific exports. 
+pub use builder::ALL_SCHEMES; pub use builder::BtrBlocksCompressorBuilder; +pub use builder::default_excluded; pub use canonical_compressor::BtrBlocksCompressor; -pub use canonical_compressor::CanonicalCompressor; -use compressor::Compressor; -use compressor::CompressorExt; -use compressor::MAX_CASCADE; -pub use compressor::integer::IntegerStats; -pub use compressor::integer::dictionary::dictionary_encode as integer_dictionary_encode; -use ctx::CompressorContext; -use ctx::Excludes; -use scheme::Scheme; -use scheme::SchemeExt; -pub use stats::CompressorStats; -pub use stats::GenerateStatsOptions; +pub use schemes::patches::compress_patches; +pub use vortex_compressor::CascadingCompressor; +pub use vortex_compressor::builtins::integer_dictionary_encode; +pub use vortex_compressor::ctx::CompressorContext; +pub use vortex_compressor::ctx::MAX_CASCADE; +pub use vortex_compressor::scheme::Scheme; +pub use vortex_compressor::scheme::SchemeExt; +pub use vortex_compressor::scheme::SchemeId; +pub use vortex_compressor::scheme::estimate_compression_ratio_with_sampling; +pub use vortex_compressor::stats::ArrayAndStats; +pub use vortex_compressor::stats::FloatStats; +pub use vortex_compressor::stats::GenerateStatsOptions; +pub use vortex_compressor::stats::IntegerStats; +pub use vortex_compressor::stats::StringStats; diff --git a/vortex-btrblocks/src/sample.rs b/vortex-btrblocks/src/sample.rs deleted file mode 100644 index 4967f45f9c9..00000000000 --- a/vortex-btrblocks/src/sample.rs +++ /dev/null @@ -1,135 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -use rand::RngExt; -use rand::SeedableRng; -use rand::prelude::StdRng; -use vortex_array::ArrayRef; -use vortex_array::DynArray; -use vortex_array::IntoArray; -use vortex_array::arrays::ChunkedArray; -use vortex_error::VortexExpect; - -use crate::stats::SAMPLE_COUNT; -use crate::stats::SAMPLE_SIZE; - -pub(crate) fn sample(input: &ArrayRef, sample_size: u32, 
sample_count: u32) -> ArrayRef { - if input.len() <= (sample_size as usize) * (sample_count as usize) { - return input.to_array(); - } - - let slices = stratified_slices( - input.len(), - sample_size, - sample_count, - &mut StdRng::seed_from_u64(1234567890u64), - ); - - // For every slice, grab the relevant slice and repack into a new PrimitiveArray. - let chunks: Vec<_> = slices - .into_iter() - .map(|(start, end)| { - input - .slice(start..end) - .vortex_expect("slice should succeed") - }) - .collect(); - ChunkedArray::try_new(chunks, input.dtype().clone()) - .vortex_expect("sample slices should form valid chunked array") - .into_array() -} - -/// Computes the number of sample chunks to cover approximately 1% of `len` elements, -/// with a minimum of `SAMPLE_SIZE * SAMPLE_COUNT` (1024) values. -pub(crate) fn sample_count_approx_one_percent(len: usize) -> u32 { - let approximately_one_percent = - (len / 100) / usize::try_from(SAMPLE_SIZE).vortex_expect("SAMPLE_SIZE must fit in usize"); - u32::max( - u32::next_multiple_of( - approximately_one_percent - .try_into() - .vortex_expect("sample count must fit in u32"), - 16, - ), - SAMPLE_COUNT, - ) -} - -fn stratified_slices( - length: usize, - sample_size: u32, - sample_count: u32, - rng: &mut StdRng, -) -> Vec<(usize, usize)> { - let total_num_samples: usize = (sample_count as usize) * (sample_size as usize); - if total_num_samples >= length { - return vec![(0usize, length)]; - } - - let partitions = partition_indices(length, sample_count); - let num_samples_per_partition: Vec = partition_indices(total_num_samples, sample_count) - .into_iter() - .map(|(start, stop)| stop - start) - .collect(); - - partitions - .into_iter() - .zip(num_samples_per_partition) - .map(|((start, stop), size)| { - assert!( - stop - start >= size, - "Slices must be bigger than their sampled size" - ); - let random_start = rng.random_range(start..=(stop - size)); - (random_start, random_start + size) - }) - .collect() -} - -/// Split a range 
of array indices into as-equal-as-possible slices. If the provided `num_partitions` doesn't -/// evenly divide into `length`, then the first `(length % num_partitions)` slices will have an extra element. -fn partition_indices(length: usize, num_partitions: u32) -> Vec<(usize, usize)> { - let num_long_parts = length % num_partitions as usize; - let short_step = length / num_partitions as usize; - let long_step = short_step + 1; - let long_stop = num_long_parts * long_step; - - (0..long_stop) - .step_by(long_step) - .map(|off| (off, off + long_step)) - .chain( - (long_stop..length) - .step_by(short_step) - .map(|off| (off, off + short_step)), - ) - .collect() -} - -#[cfg(test)] -mod tests { - use vortex_array::IntoArray; - use vortex_array::arrays::PrimitiveArray; - use vortex_array::assert_arrays_eq; - use vortex_array::validity::Validity; - use vortex_buffer::Buffer; - use vortex_error::VortexResult; - - use super::*; - - #[test] - fn sample_is_deterministic() -> VortexResult<()> { - // Create a deterministic array with linear-with-noise pattern - let values: Vec = (0i64..100_000).map(|i| i + (i * 7 + 3) % 11).collect(); - - let array = - PrimitiveArray::new(Buffer::from_iter(values), Validity::NonNullable).into_array(); - - let first = sample(&array, SAMPLE_SIZE, SAMPLE_COUNT); - for _ in 0..10 { - let again = sample(&array, SAMPLE_SIZE, SAMPLE_COUNT); - assert_eq!(first.nbytes(), again.nbytes()); - assert_arrays_eq!(&first, &again); - } - Ok(()) - } -} diff --git a/vortex-btrblocks/src/scheme.rs b/vortex-btrblocks/src/scheme.rs deleted file mode 100644 index 1b12a5930e5..00000000000 --- a/vortex-btrblocks/src/scheme.rs +++ /dev/null @@ -1,135 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Compression scheme traits. This is the interface each encoding implements to participate in -//! compression. -//! -//! [`Scheme`] is the core trait. Each encoding (e.g. 
BitPacking, ALP, Dict) implements it with -//! two key methods: [`Scheme::expected_compression_ratio`] to estimate how well it compresses -//! the data, and [`Scheme::compress`] to apply the encoding. Type-specific sub-traits -//! ([`IntegerScheme`], [`FloatScheme`], [`StringScheme`]) bind schemes to the appropriate stats -//! and code types. -//! -//! [`SchemeExt`] provides the default ratio estimation strategy. It samples ~1% of the array -//! (minimum [`SAMPLE_SIZE`] values), compresses the sample, and returns the before/after byte -//! ratio. Schemes can override [`Scheme::expected_compression_ratio`] if they have a cheaper -//! heuristic. -//! -//! [`IntegerScheme`]: crate::compressor::integer::IntegerScheme -//! [`FloatScheme`]: crate::compressor::float::FloatScheme -//! [`StringScheme`]: crate::compressor::string::StringScheme -//! [`SAMPLE_SIZE`]: crate::stats::SAMPLE_SIZE - -use std::fmt::Debug; -use std::hash::Hash; -use std::hash::Hasher; - -use vortex_array::ArrayRef; -use vortex_error::VortexResult; - -use crate::BtrBlocksCompressor; -use crate::CompressorContext; -use crate::CompressorStats; -use crate::sample::sample_count_approx_one_percent; -use crate::stats::SAMPLE_SIZE; - -/// Top-level compression scheme trait. -/// -/// Variants are specialized for each data type, e.g. see `IntegerScheme`, `FloatScheme`, etc. -pub trait Scheme: Debug { - /// Type of the stats generated by the compression scheme. - type StatsType: CompressorStats; - /// Type of the code used to uniquely identify the compression scheme. - type CodeType: Copy + Eq + Hash + Ord; - - /// Scheme unique identifier. - fn code(&self) -> Self::CodeType; - - /// True if this is the singular Constant scheme for this data type. - fn is_constant(&self) -> bool { - false - } - - /// Estimate the compression ratio for running this scheme (and its children) - /// for the given input. - /// - /// Depth is the depth in the encoding tree we've already reached before considering this - /// scheme. 
- /// - /// Returns the estimated compression ratio as well as the tree of compressors to use. - fn expected_compression_ratio( - &self, - compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - ctx: CompressorContext, - excludes: &[Self::CodeType], - ) -> VortexResult { - self.estimate_compression_ratio_with_sampling(compressor, stats, ctx, excludes) - } - - /// Compress the input with this scheme, yielding a new array. - fn compress( - &self, - compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - ctx: CompressorContext, - excludes: &[Self::CodeType], - ) -> VortexResult; -} - -impl PartialEq for dyn Scheme { - fn eq(&self, other: &Self) -> bool { - self.code() == other.code() - } -} -impl Eq for dyn Scheme {} -impl Hash for dyn Scheme { - fn hash(&self, state: &mut H) { - self.code().hash(state) - } -} - -/// Extension trait providing sampling-based compression ratio estimation for schemes. -pub trait SchemeExt: Scheme { - /// Estimates compression ratio by compressing a sample of the data. - /// - /// This method samples approximately 1% of the data (with a minimum of 1024 values) - /// and compresses it to estimate the overall compression ratio. - fn estimate_compression_ratio_with_sampling( - &self, - btr_blocks_compressor: &BtrBlocksCompressor, - stats: &Self::StatsType, - ctx: CompressorContext, - excludes: &[Self::CodeType], - ) -> VortexResult { - let sample = if ctx.is_sample { - stats.clone() - } else { - let source_len = stats.source().len(); - let sample_count = sample_count_approx_one_percent(source_len); - - tracing::trace!( - "Sampling {} values out of {}", - SAMPLE_SIZE as u64 * sample_count as u64, - source_len - ); - - stats.sample(SAMPLE_SIZE, sample_count) - }; - - let after = self - .compress(btr_blocks_compressor, &sample, ctx.as_sample(), excludes)? 
- .nbytes(); - let before = sample.source().nbytes(); - - tracing::debug!( - "estimate_compression_ratio_with_sampling(compressor={self:#?} ctx={ctx:?}) = {}", - before as f64 / after as f64 - ); - - Ok(before as f64 / after as f64) - } -} - -// Blanket implementation for all Scheme types -impl SchemeExt for T {} diff --git a/vortex-btrblocks/src/schemes/decimal.rs b/vortex-btrblocks/src/schemes/decimal.rs new file mode 100644 index 00000000000..dcbf74c6f10 --- /dev/null +++ b/vortex-btrblocks/src/schemes/decimal.rs @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Decimal compression scheme using byte-part decomposition. + +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_array::IntoArray; +use vortex_array::ToCanonical; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::decimal::narrowed_decimal; +use vortex_array::dtype::DecimalType; +use vortex_array::vtable::ValidityHelper; +use vortex_decimal_byte_parts::DecimalBytePartsArray; +use vortex_error::VortexResult; + +use crate::ArrayAndStats; +use crate::CascadingCompressor; +use crate::CompressorContext; +use crate::Scheme; +use crate::SchemeExt; + +/// Compression scheme for decimal arrays via byte-part decomposition. +/// +/// Narrows the decimal to the smallest integer type, compresses the underlying primitive, and wraps +/// the result in a [`DecimalBytePartsArray`]. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct DecimalScheme; + +impl Scheme for DecimalScheme { + fn scheme_name(&self) -> &'static str { + "vortex.decimal.byte_parts" + } + + fn matches(&self, canonical: &Canonical) -> bool { + matches!(canonical, Canonical::Decimal(_)) + } + + /// Children: primitive=0. 
+ fn num_children(&self) -> usize { + 1 + } + + fn expected_compression_ratio( + &self, + _compressor: &CascadingCompressor, + _data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + // Decimal compression is almost always beneficial (narrowing + primitive compression). + Ok(f64::MAX) + } + + fn compress( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + // TODO(joe): add support splitting i128/256 buffers into chunks of primitive values + // for compression. 2 for i128 and 4 for i256. + let decimal = data.array().clone().to_decimal(); + let decimal = narrowed_decimal(decimal); + let validity = decimal.validity(); + let prim = match decimal.values_type() { + DecimalType::I8 => PrimitiveArray::new(decimal.buffer::(), validity.clone()), + DecimalType::I16 => PrimitiveArray::new(decimal.buffer::(), validity.clone()), + DecimalType::I32 => PrimitiveArray::new(decimal.buffer::(), validity.clone()), + DecimalType::I64 => PrimitiveArray::new(decimal.buffer::(), validity.clone()), + _ => return Ok(decimal.into_array()), + }; + + let compressed = compressor.compress_child(&prim.into_array(), &ctx, self.id(), 0)?; + + DecimalBytePartsArray::try_new(compressed, decimal.decimal_dtype()).map(|d| d.into_array()) + } +} diff --git a/vortex-btrblocks/src/schemes/float.rs b/vortex-btrblocks/src/schemes/float.rs new file mode 100644 index 00000000000..0f5622cea3f --- /dev/null +++ b/vortex-btrblocks/src/schemes/float.rs @@ -0,0 +1,422 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Float compression schemes. 
+ +use vortex_alp::ALPArray; +use vortex_alp::RDEncoder; +use vortex_alp::alp_encode; +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_array::IntoArray; +use vortex_array::ToCanonical; +use vortex_array::dtype::PType; +use vortex_compressor::scheme::ChildSelection; +use vortex_compressor::scheme::DescendantExclusion; +use vortex_error::VortexResult; +use vortex_error::vortex_panic; +use vortex_sparse::Sparse; +use vortex_sparse::SparseArray; + +use super::integer::SparseScheme as IntSparseScheme; +use crate::ArrayAndStats; +use crate::CascadingCompressor; +use crate::CompressorContext; +use crate::Scheme; +use crate::SchemeExt; +use crate::compress_patches; +use crate::estimate_compression_ratio_with_sampling; + +/// ALP (Adaptive Lossless floating-Point) encoding. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct ALPScheme; + +/// ALPRD (ALP with Real Double) encoding variant. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct ALPRDScheme; + +/// Sparse encoding for null-dominated float arrays. +/// +/// This is the same as the integer `SparseScheme`, but we only use this for null-dominated arrays. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct NullDominatedSparseScheme; + +/// Pco (pcodec) compression for floats. +#[cfg(feature = "pco")] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct PcoScheme; + +// Re-export builtin schemes from vortex-compressor. +pub use vortex_compressor::builtins::FloatConstantScheme; +pub use vortex_compressor::builtins::FloatDictScheme; +pub use vortex_compressor::builtins::is_float_primitive; +pub use vortex_compressor::stats::FloatStats; + +pub use crate::schemes::rle::RLE_FLOAT_SCHEME; + +impl Scheme for ALPScheme { + fn scheme_name(&self) -> &'static str { + "vortex.float.alp" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_float_primitive(canonical) + } + + /// Children: encoded_ints=0. 
+ fn num_children(&self) -> usize { + 1 + } + + fn expected_compression_ratio( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + // ALP encodes floats as integers. Without integer compression afterward, the encoded ints + // are the same size. + if ctx.finished_cascading() { + return Ok(0.0); + } + + // We don't support ALP for f16. + if data.float_stats().source().ptype() == PType::F16 { + return Ok(0.0); + } + + estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) + } + + fn compress( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + let stats = data.float_stats(); + + let alp_encoded = alp_encode(&stats.source().to_primitive(), None)?; + + // Compress the ALP ints. + let compressed_alp_ints = + compressor.compress_child(alp_encoded.encoded(), &ctx, self.id(), 0)?; + + // Patches are not compressed. They should be infrequent, and if they are not then we want + // to keep them linear for easy indexing. 
+ let patches = alp_encoded.patches().map(compress_patches).transpose()?; + + Ok(ALPArray::new(compressed_alp_ints, alp_encoded.exponents(), patches).into_array()) + } +} + +impl Scheme for ALPRDScheme { + fn scheme_name(&self) -> &'static str { + "vortex.float.alprd" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_float_primitive(canonical) + } + + fn expected_compression_ratio( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + if data.float_stats().source().ptype() == PType::F16 { + return Ok(0.0); + } + + estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) + } + + fn compress( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + let stats = data.float_stats(); + + let encoder = match stats.source().ptype() { + PType::F32 => RDEncoder::new(stats.source().as_slice::()), + PType::F64 => RDEncoder::new(stats.source().as_slice::()), + ptype => vortex_panic!("cannot ALPRD compress ptype {ptype}"), + }; + + let mut alp_rd = encoder.encode(stats.source()); + + let patches = alp_rd + .left_parts_patches() + .map(compress_patches) + .transpose()?; + alp_rd.replace_left_parts_patches(patches); + + Ok(alp_rd.into_array()) + } +} + +impl Scheme for NullDominatedSparseScheme { + fn scheme_name(&self) -> &'static str { + "vortex.float.sparse" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_float_primitive(canonical) + } + + /// Children: indices=0. + fn num_children(&self) -> usize { + 1 + } + + /// The indices of a null-dominated sparse array should not be sparse-encoded again. 
+ fn descendant_exclusions(&self) -> Vec { + vec![DescendantExclusion { + excluded: IntSparseScheme.id(), + children: ChildSelection::All, + }] + } + + fn expected_compression_ratio( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + let stats = data.float_stats(); + + if stats.value_count() == 0 { + // All nulls should use ConstantScheme instead of this. + return Ok(0.0); + } + + // If the majority (90%) of values is null, this will compress well. + if stats.null_count() as f64 / stats.source().len() as f64 > 0.9 { + return Ok(stats.source().len() as f64 / stats.value_count() as f64); + } + + // Otherwise we don't go this route. + Ok(0.0) + } + + fn compress( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + let stats = data.float_stats(); + + // We pass None as we only run this pathway for NULL-dominated float arrays. + let sparse_encoded = SparseArray::encode(&stats.source().clone().into_array(), None)?; + + if let Some(sparse) = sparse_encoded.as_opt::() { + let indices = sparse.patches().indices().to_primitive().narrow()?; + let compressed_indices = + compressor.compress_child(&indices.into_array(), &ctx, self.id(), 0)?; + + SparseArray::try_new( + compressed_indices, + sparse.patches().values().clone(), + sparse.len(), + sparse.fill_scalar().clone(), + ) + .map(|a| a.into_array()) + } else { + Ok(sparse_encoded) + } + } +} + +#[cfg(feature = "pco")] +impl Scheme for PcoScheme { + fn scheme_name(&self) -> &'static str { + "vortex.float.pco" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_float_primitive(canonical) + } + + fn compress( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + let stats = data.float_stats(); + Ok(vortex_pco::PcoArray::from_primitive( + stats.source(), + pco::DEFAULT_COMPRESSION_LEVEL, + 8192, + )? 
+ .into_array()) + } +} + +#[cfg(test)] +mod tests { + use std::iter; + + use vortex_array::DynArray; + use vortex_array::IntoArray; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::assert_arrays_eq; + use vortex_array::builders::ArrayBuilder; + use vortex_array::builders::PrimitiveBuilder; + use vortex_array::display::DisplayOptions; + use vortex_array::dtype::Nullability; + use vortex_array::validity::Validity; + use vortex_buffer::Buffer; + use vortex_buffer::buffer_mut; + use vortex_compressor::CascadingCompressor; + use vortex_error::VortexResult; + use vortex_fastlanes::RLE; + + use crate::BtrBlocksCompressor; + use crate::schemes::rle::RLE_FLOAT_SCHEME; + + #[test] + fn test_empty() -> VortexResult<()> { + let btr = BtrBlocksCompressor::default(); + let array = PrimitiveArray::new(Buffer::::empty(), Validity::NonNullable).into_array(); + let result = btr.compress(&array)?; + + assert!(result.is_empty()); + Ok(()) + } + + #[test] + fn test_compress() -> VortexResult<()> { + let mut values = buffer_mut![1.0f32; 1024]; + for i in 0..1024 { + values[i] = (i % 50) as f32; + } + + let array = values.into_array(); + let btr = BtrBlocksCompressor::default(); + let compressed = btr.compress(&array)?; + assert_eq!(compressed.len(), 1024); + + let display = compressed + .display_as(DisplayOptions::MetadataOnly) + .to_string() + .to_lowercase(); + assert_eq!(display, "vortex.dict(f32, len=1024)"); + + Ok(()) + } + + #[test] + fn test_rle_compression() -> VortexResult<()> { + let mut values = Vec::new(); + values.extend(iter::repeat_n(1.5f32, 100)); + values.extend(iter::repeat_n(2.7f32, 200)); + values.extend(iter::repeat_n(3.15f32, 150)); + + let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); + + let compressor = CascadingCompressor::new(vec![&RLE_FLOAT_SCHEME]); + let compressed = compressor.compress(&array.into_array())?; + assert!(compressed.is::()); + + let expected = Buffer::copy_from(&values).into_array(); + 
assert_arrays_eq!(compressed.as_ref(), expected.as_ref()); + Ok(()) + } + + #[test] + fn test_sparse_compression() -> VortexResult<()> { + let mut array = PrimitiveBuilder::::with_capacity(Nullability::Nullable, 100); + array.append_value(f32::NAN); + array.append_value(-f32::NAN); + array.append_value(f32::INFINITY); + array.append_value(-f32::INFINITY); + array.append_value(0.0f32); + array.append_value(-0.0f32); + array.append_nulls(90); + + let array = array.finish_into_primitive().into_array(); + let btr = BtrBlocksCompressor::default(); + let compressed = btr.compress(&array)?; + assert_eq!(compressed.len(), 96); + + let display = compressed + .display_as(DisplayOptions::MetadataOnly) + .to_string() + .to_lowercase(); + assert_eq!(display, "vortex.sparse(f32?, len=96)"); + + Ok(()) + } +} + +/// Tests to verify that each float compression scheme produces the expected encoding. +#[cfg(test)] +mod scheme_selection_tests { + use vortex_alp::ALP; + use vortex_array::DynArray; + use vortex_array::IntoArray; + use vortex_array::arrays::Constant; + use vortex_array::arrays::Dict; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::builders::ArrayBuilder; + use vortex_array::builders::PrimitiveBuilder; + use vortex_array::dtype::Nullability; + use vortex_array::validity::Validity; + use vortex_buffer::Buffer; + use vortex_error::VortexResult; + + use crate::BtrBlocksCompressor; + + #[test] + fn test_constant_compressed() -> VortexResult<()> { + let values: Vec = vec![42.5; 100]; + let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); + let btr = BtrBlocksCompressor::default(); + let compressed = btr.compress(&array.into_array())?; + assert!(compressed.is::()); + Ok(()) + } + + #[test] + fn test_alp_compressed() -> VortexResult<()> { + let values: Vec = (0..1000).map(|i| (i as f64) * 0.01).collect(); + let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); + let btr = 
BtrBlocksCompressor::default(); + let compressed = btr.compress(&array.into_array())?; + assert!(compressed.is::()); + Ok(()) + } + + #[test] + fn test_dict_compressed() -> VortexResult<()> { + let distinct_values = [1.1, 2.2, 3.3, 4.4, 5.5]; + let values: Vec = (0..1000) + .map(|i| distinct_values[i % distinct_values.len()]) + .collect(); + let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); + let btr = BtrBlocksCompressor::default(); + let compressed = btr.compress(&array.into_array())?; + assert!(compressed.is::()); + Ok(()) + } + + #[test] + fn test_null_dominated_compressed() -> VortexResult<()> { + let mut builder = PrimitiveBuilder::::with_capacity(Nullability::Nullable, 100); + for i in 0..5 { + builder.append_value(i as f64); + } + builder.append_nulls(95); + let array = builder.finish_into_primitive(); + let btr = BtrBlocksCompressor::default(); + let compressed = btr.compress(&array.into_array())?; + // Verify the compressed array preserves values. + assert_eq!(compressed.len(), 100); + Ok(()) + } +} diff --git a/vortex-btrblocks/src/schemes/integer.rs b/vortex-btrblocks/src/schemes/integer.rs new file mode 100644 index 00000000000..df1bd0081b7 --- /dev/null +++ b/vortex-btrblocks/src/schemes/integer.rs @@ -0,0 +1,983 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Integer compression schemes. 
+ +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_array::IntoArray; +use vortex_array::ToCanonical; +use vortex_array::arrays::ConstantArray; +use vortex_array::scalar::Scalar; +use vortex_compressor::builtins::FloatDictScheme; +use vortex_compressor::builtins::StringDictScheme; +use vortex_compressor::scheme::AncestorExclusion; +use vortex_compressor::scheme::ChildSelection; +use vortex_compressor::scheme::DescendantExclusion; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_err; +use vortex_fastlanes::FoRArray; +use vortex_fastlanes::bitpack_compress::bit_width_histogram; +use vortex_fastlanes::bitpack_compress::bitpack_encode; +use vortex_fastlanes::bitpack_compress::find_best_bit_width; +use vortex_runend::RunEndArray; +use vortex_runend::compress::runend_encode; +use vortex_sequence::sequence_encode; +use vortex_sparse::Sparse; +use vortex_sparse::SparseArray; +use vortex_zigzag::ZigZagArray; +use vortex_zigzag::zigzag_encode; + +use crate::ArrayAndStats; +use crate::CascadingCompressor; +use crate::CompressorContext; +use crate::GenerateStatsOptions; +use crate::Scheme; +use crate::SchemeExt; +use crate::compress_patches; +use crate::estimate_compression_ratio_with_sampling; + +/// Frame of Reference encoding. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct FoRScheme; + +/// ZigZag encoding for negative integers. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct ZigZagScheme; + +/// BitPacking encoding for non-negative integers. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct BitPackingScheme; + +/// Sparse encoding for single-value-dominated arrays. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct SparseScheme; + +/// Run-end encoding with end positions. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct RunEndScheme; + +/// Sequence encoding for sequential patterns. 
+#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct SequenceScheme; + +/// Pco (pcodec) compression for integers. +#[cfg(feature = "pco")] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct PcoScheme; + +// Re-export builtin schemes from vortex-compressor. +pub use vortex_compressor::builtins::IntConstantScheme; +pub use vortex_compressor::builtins::IntDictScheme; +pub use vortex_compressor::builtins::is_integer_primitive; +pub use vortex_compressor::stats::IntegerStats; + +pub use crate::schemes::rle::RLE_INTEGER_SCHEME; + +/// Threshold for the average run length in an array before we consider run-end encoding. +const RUN_END_THRESHOLD: u32 = 4; + +impl Scheme for FoRScheme { + fn scheme_name(&self) -> &'static str { + "vortex.int.for" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_integer_primitive(canonical) + } + + /// Dict codes always start at 0, so FoR (which subtracts the min) is a no-op. + fn ancestor_exclusions(&self) -> Vec { + vec![ + AncestorExclusion { + ancestor: IntDictScheme.id(), + children: ChildSelection::One(1), + }, + AncestorExclusion { + ancestor: FloatDictScheme.id(), + children: ChildSelection::One(1), + }, + AncestorExclusion { + ancestor: StringDictScheme.id(), + children: ChildSelection::One(1), + }, + ] + } + + fn expected_compression_ratio( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + // FoR only subtracts the min. Without further compression (e.g. BitPacking), the output is + // the same size. + if ctx.finished_cascading() { + return Ok(0.0); + } + + let stats = data.integer_stats(); + + // All-null cannot be FOR compressed. + if stats.value_count() == 0 { + return Ok(0.0); + } + + // Only apply when the min is not already zero. + if stats.erased().min_is_zero() { + return Ok(0.0); + } + + // Difference between max and min. 
+ let for_bitwidth = match stats.erased().max_minus_min().checked_ilog2() { + Some(l) => l + 1, + // If max-min == 0, the we should compress as a constant array. + None => return Ok(0.0), + }; + + // If BitPacking can be applied (only non-negative values) and FoR doesn't reduce bit width + // compared to BitPacking, don't use FoR since it has a small amount of overhead (storing + // the reference) for effectively no benefits. + if let Some(max_log) = stats + .erased() + .max_ilog2() + // Only skip FoR when min >= 0, otherwise BitPacking can't be applied without ZigZag. + .filter(|_| !stats.erased().min_is_negative()) + { + let bitpack_bitwidth = max_log + 1; + if for_bitwidth >= bitpack_bitwidth { + return Ok(0.0); + } + } + + let full_width: u32 = stats + .source() + .ptype() + .bit_width() + .try_into() + .vortex_expect("bit width must fit in u32"); + + Ok(full_width as f64 / for_bitwidth as f64) + } + + fn compress( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + let primitive = data.array().to_primitive(); + let for_array = FoRArray::encode(primitive)?; + let biased = for_array.encoded().to_primitive(); + + // Immediately bitpack. If any other scheme was preferable, it would be chosen instead + // of bitpacking. + // NOTE: we could delegate in the future if we had another downstream codec that performs + // as well. + let leaf_ctx = ctx.clone().as_leaf(); + let mut biased_data = ArrayAndStats::new(biased.into_array(), ctx.stats_options()); + let compressed = BitPackingScheme.compress(compressor, &mut biased_data, leaf_ctx)?; + + // TODO(connor): This should really be `new_unchecked`. 
+ let for_compressed = FoRArray::try_new(compressed, for_array.reference_scalar().clone())?; + for_compressed + .as_ref() + .statistics() + .inherit_from(for_array.as_ref().statistics()); + + Ok(for_compressed.into_array()) + } +} + +impl Scheme for ZigZagScheme { + fn scheme_name(&self) -> &'static str { + "vortex.int.zigzag" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_integer_primitive(canonical) + } + + /// Children: encoded=0. + fn num_children(&self) -> usize { + 1 + } + + /// ZigZag is a bijective value transform that preserves cardinality, run patterns, and value + /// dominance. If Dict, RunEnd, or Sparse lost on the original array, they will lose on ZigZag's + /// output too, so we skip evaluating them. + fn descendant_exclusions(&self) -> Vec { + vec![ + DescendantExclusion { + excluded: IntDictScheme.id(), + children: ChildSelection::All, + }, + DescendantExclusion { + excluded: RunEndScheme.id(), + children: ChildSelection::All, + }, + DescendantExclusion { + excluded: SparseScheme.id(), + children: ChildSelection::All, + }, + ] + } + + /// Dict codes are unsigned integers (0..cardinality). ZigZag only helps negatives. + fn ancestor_exclusions(&self) -> Vec { + vec![ + AncestorExclusion { + ancestor: IntDictScheme.id(), + children: ChildSelection::One(1), + }, + AncestorExclusion { + ancestor: FloatDictScheme.id(), + children: ChildSelection::One(1), + }, + AncestorExclusion { + ancestor: StringDictScheme.id(), + children: ChildSelection::One(1), + }, + ] + } + + fn expected_compression_ratio( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + // ZigZag only transforms negative values to positive. Without further compression, + // the output is the same size. + if ctx.finished_cascading() { + return Ok(0.0); + } + + let stats = data.integer_stats(); + + // Don't try and compress all-null arrays. 
+ if stats.value_count() == 0 { + return Ok(0.0); + } + + // ZigZag is only useful when there are negative values. + if !stats.erased().min_is_negative() { + return Ok(0.0); + } + + // Run compression on a sample to see how it performs. + estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) + } + + fn compress( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + let stats = data.integer_stats(); + + // Zigzag encode the values, then recursively compress the inner values. + let zag = zigzag_encode(stats.source().clone())?; + let encoded = zag.encoded().to_primitive(); + + let compressed = compressor.compress_child(&encoded.into_array(), &ctx, self.id(), 0)?; + + tracing::debug!("zigzag output: {}", compressed.encoding_id()); + + Ok(ZigZagArray::try_new(compressed)?.into_array()) + } +} + +impl Scheme for BitPackingScheme { + fn scheme_name(&self) -> &'static str { + "vortex.int.bitpacking" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_integer_primitive(canonical) + } + + fn expected_compression_ratio( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + let stats = data.integer_stats(); + + // BitPacking only works for non-negative values. + if stats.erased().min_is_negative() { + return Ok(0.0); + } + + // Don't compress all-null arrays. + if stats.value_count() == 0 { + return Ok(0.0); + } + + estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) + } + + fn compress( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + let stats = data.integer_stats(); + + let histogram = bit_width_histogram(stats.source())?; + let bw = find_best_bit_width(stats.source().ptype(), &histogram)?; + // If best bw is determined to be the current bit-width, return the original array. 
+ if bw as usize == stats.source().ptype().bit_width() { + return Ok(stats.source().clone().into_array()); + } + let mut packed = bitpack_encode(stats.source(), bw, Some(&histogram))?; + + let patches = packed.patches().map(compress_patches).transpose()?; + packed.replace_patches(patches); + + Ok(packed.into_array()) + } +} + +impl Scheme for SparseScheme { + fn scheme_name(&self) -> &'static str { + "vortex.int.sparse" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_integer_primitive(canonical) + } + + fn stats_options(&self) -> GenerateStatsOptions { + GenerateStatsOptions { + count_distinct_values: true, + } + } + + /// Children: values=0, indices=1. + fn num_children(&self) -> usize { + 2 + } + + /// Sparse indices (child 1) are monotonically increasing positions with all unique values. + /// Dict, RunEnd, RLE, and Sparse are all pointless on such data. + fn descendant_exclusions(&self) -> Vec { + vec![ + DescendantExclusion { + excluded: IntDictScheme.id(), + children: ChildSelection::One(1), + }, + DescendantExclusion { + excluded: RunEndScheme.id(), + children: ChildSelection::One(1), + }, + DescendantExclusion { + excluded: RLE_INTEGER_SCHEME.id(), + children: ChildSelection::One(1), + }, + DescendantExclusion { + excluded: SparseScheme.id(), + children: ChildSelection::One(1), + }, + ] + } + + fn expected_compression_ratio( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + let stats = data.integer_stats(); + + if stats.value_count() == 0 { + // All nulls should use ConstantScheme. + return Ok(0.0); + } + + // If the majority is null, will compress well. + if stats.null_count() as f64 / stats.source().len() as f64 > 0.9 { + return Ok(stats.source().len() as f64 / stats.value_count() as f64); + } + + // See if the top value accounts for >= 90% of the set values. 
+ let (_, top_count) = stats + .erased() + .most_frequent_value_and_count() + .vortex_expect( + "this must be present since `SparseScheme` declared that we need distinct values", + ); + + if top_count == stats.value_count() { + // top_value is the only value, should use ConstantScheme instead. + return Ok(0.0); + } + + let freq = top_count as f64 / stats.value_count() as f64; + if freq >= 0.9 { + // We only store the positions of the non-top values. + return Ok(stats.value_count() as f64 / (stats.value_count() - top_count) as f64); + } + + Ok(0.0) + } + + fn compress( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + let stats = data.integer_stats(); + + let (top_pvalue, top_count) = stats + .erased() + .most_frequent_value_and_count() + .vortex_expect( + "this must be present since `SparseScheme` declared that we need distinct values", + ); + if top_count as usize == stats.source().len() { + // top_value is the only value, use ConstantScheme. 
+ return Ok(ConstantArray::new( + Scalar::primitive_value( + top_pvalue, + top_pvalue.ptype(), + stats.source().dtype().nullability(), + ), + stats.source().len(), + ) + .into_array()); + } + + let sparse_encoded = SparseArray::encode( + &stats.source().clone().into_array(), + Some(Scalar::primitive_value( + top_pvalue, + top_pvalue.ptype(), + stats.source().dtype().nullability(), + )), + )?; + + if let Some(sparse) = sparse_encoded.as_opt::() { + let compressed_values = compressor.compress_child( + &sparse.patches().values().to_primitive().into_array(), + &ctx, + self.id(), + 0, + )?; + + let indices = sparse.patches().indices().to_primitive().narrow()?; + + let compressed_indices = + compressor.compress_child(&indices.into_array(), &ctx, self.id(), 1)?; + + SparseArray::try_new( + compressed_indices, + compressed_values, + sparse.len(), + sparse.fill_scalar().clone(), + ) + .map(|a| a.into_array()) + } else { + Ok(sparse_encoded) + } + } +} + +impl Scheme for RunEndScheme { + fn scheme_name(&self) -> &'static str { + "vortex.int.runend" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_integer_primitive(canonical) + } + + /// Children: values=0, ends=1. + fn num_children(&self) -> usize { + 2 + } + + /// RunEnd ends (child 1) are monotonically increasing positions with all unique values. + /// Dict, RunEnd, RLE, and Sparse are all pointless on such data. + fn descendant_exclusions(&self) -> Vec { + vec![ + DescendantExclusion { + excluded: IntDictScheme.id(), + children: ChildSelection::One(1), + }, + DescendantExclusion { + excluded: RunEndScheme.id(), + children: ChildSelection::One(1), + }, + DescendantExclusion { + excluded: RLE_INTEGER_SCHEME.id(), + children: ChildSelection::One(1), + }, + DescendantExclusion { + excluded: SparseScheme.id(), + children: ChildSelection::One(1), + }, + ] + } + + /// Dict values (child 0) are all unique by definition, so run-end encoding them is + /// pointless. 
Codes (child 1) can have runs and may benefit from RunEnd. + fn ancestor_exclusions(&self) -> Vec { + vec![ + AncestorExclusion { + ancestor: IntDictScheme.id(), + children: ChildSelection::One(0), + }, + AncestorExclusion { + ancestor: FloatDictScheme.id(), + children: ChildSelection::One(0), + }, + AncestorExclusion { + ancestor: StringDictScheme.id(), + children: ChildSelection::One(0), + }, + ] + } + + fn expected_compression_ratio( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + let stats = data.integer_stats(); + + // If the run length is below the threshold, drop it. + if stats.average_run_length() < RUN_END_THRESHOLD { + return Ok(0.0); + } + + // Run compression on a sample, see how it performs. + estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) + } + + fn compress( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + let stats = data.integer_stats(); + + // Run-end encode the ends. + let (ends, values) = runend_encode(stats.source()); + + let compressed_values = + compressor.compress_child(&values.to_primitive().into_array(), &ctx, self.id(), 0)?; + + let compressed_ends = + compressor.compress_child(&ends.to_primitive().into_array(), &ctx, self.id(), 1)?; + + // SAFETY: compression doesn't affect invariants. + unsafe { + Ok(RunEndArray::new_unchecked( + compressed_ends, + compressed_values, + 0, + stats.source().len(), + ) + .into_array()) + } + } +} + +impl Scheme for SequenceScheme { + fn scheme_name(&self) -> &'static str { + "vortex.int.sequence" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_integer_primitive(canonical) + } + + /// Sequence encoding on dictionary codes just adds a layer of indirection without compressing + /// the data. Dict codes are compact integers that benefit from BitPacking or FoR, not from + /// sequence detection. 
+ fn ancestor_exclusions(&self) -> Vec { + vec![ + AncestorExclusion { + ancestor: IntDictScheme.id(), + children: ChildSelection::One(1), + }, + AncestorExclusion { + ancestor: FloatDictScheme.id(), + children: ChildSelection::One(1), + }, + AncestorExclusion { + ancestor: StringDictScheme.id(), + children: ChildSelection::One(1), + }, + ] + } + + fn expected_compression_ratio( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + let stats = data.integer_stats(); + + if stats.null_count() > 0 { + return Ok(0.0); + } + + // TODO(connor): Why do we sequence encode the whole thing and then throw it away? And then + // why do we divide the ratio by 2??? + + // If the distinct_values_count was computed, and not all values are unique, then this + // cannot be encoded as a sequence array. + if stats + .distinct_count() + // TODO(connor): Shouldn't this be `is_none_or`??? Why do things fail if not this? + .is_some_and(|count| count as usize != stats.source().len()) + { + return Ok(0.0); + } + + // Since two values are required to store base and multiplier the compression ratio is + // divided by 2. + Ok(sequence_encode(stats.source())? 
+ .map(|_| stats.source().len() as f64 / 2.0) + .unwrap_or(0.0)) + } + + fn compress( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + let stats = data.integer_stats(); + + if stats.null_count() > 0 { + vortex_bail!("sequence encoding does not support nulls"); + } + sequence_encode(stats.source())?.ok_or_else(|| vortex_err!("cannot sequence encode array")) + } +} + +#[cfg(feature = "pco")] +impl Scheme for PcoScheme { + fn scheme_name(&self) -> &'static str { + "vortex.int.pco" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_integer_primitive(canonical) + } + + fn expected_compression_ratio( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + let stats = data.integer_stats(); + + // Pco does not support I8 or U8. + if matches!( + stats.source().ptype(), + vortex_array::dtype::PType::I8 | vortex_array::dtype::PType::U8 + ) { + return Ok(0.0); + } + + estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx) + } + + fn compress( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + let stats = data.integer_stats(); + + Ok(vortex_pco::PcoArray::from_primitive( + stats.source(), + pco::DEFAULT_COMPRESSION_LEVEL, + 8192, + )? 
+ .into_array()) + } +} + +#[cfg(test)] +mod tests { + use std::iter; + + use itertools::Itertools; + use rand::Rng; + use rand::SeedableRng; + use rand::rngs::StdRng; + use vortex_array::DynArray; + use vortex_array::IntoArray; + use vortex_array::arrays::Dict; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::assert_arrays_eq; + use vortex_array::validity::Validity; + use vortex_buffer::Buffer; + use vortex_buffer::BufferMut; + use vortex_buffer::buffer; + use vortex_compressor::CascadingCompressor; + use vortex_error::VortexResult; + use vortex_fastlanes::RLE; + use vortex_sequence::Sequence; + use vortex_sparse::Sparse; + + use crate::BtrBlocksCompressor; + use crate::schemes::rle::RLE_INTEGER_SCHEME; + + #[test] + fn test_empty() -> VortexResult<()> { + // Make sure empty array compression does not fail. + let btr = BtrBlocksCompressor::default(); + let array = PrimitiveArray::new(Buffer::::empty(), Validity::NonNullable); + let result = btr.compress(&array.into_array())?; + + assert!(result.is_empty()); + Ok(()) + } + + #[test] + fn test_dict_encodable() -> VortexResult<()> { + let mut codes = BufferMut::::with_capacity(65_535); + // Write some runs of length 3 of a handful of different values. Interrupted by some + // one-off values. 
+ + let numbers = [0, 10, 50, 100, 1000, 3000] + .into_iter() + .map(|i| 12340 * i) // must be big enough to not prefer fastlanes.bitpacked + .collect_vec(); + + let mut rng = StdRng::seed_from_u64(1u64); + while codes.len() < 64000 { + let run_length = rng.next_u32() % 5; + let value = numbers[rng.next_u32() as usize % numbers.len()]; + for _ in 0..run_length { + codes.push(value); + } + } + + let btr = BtrBlocksCompressor::default(); + let compressed = btr.compress(&codes.freeze().into_array())?; + assert!(compressed.is::()); + Ok(()) + } + + #[test] + fn sparse_mostly_nulls() -> VortexResult<()> { + let array = PrimitiveArray::new( + buffer![189u8, 189, 189, 189, 189, 189, 189, 189, 189, 0, 46], + Validity::from_iter(vec![ + false, false, false, false, false, false, false, false, false, false, true, + ]), + ); + let validity = array.validity()?; + + let btr = BtrBlocksCompressor::default(); + let compressed = btr.compress(&array.into_array())?; + assert!(compressed.is::()); + + let decoded = compressed.clone(); + let expected = + PrimitiveArray::new(buffer![0u8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 46], validity).into_array(); + assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); + Ok(()) + } + + #[test] + fn nullable_sequence() -> VortexResult<()> { + let values = (0i32..20).step_by(7).collect_vec(); + let array = PrimitiveArray::from_option_iter(values.clone().into_iter().map(Some)); + + let btr = BtrBlocksCompressor::default(); + let compressed = btr.compress(&array.into_array())?; + assert!(compressed.is::()); + + let decoded = compressed; + let expected = PrimitiveArray::from_option_iter(values.into_iter().map(Some)).into_array(); + assert_arrays_eq!(decoded.as_ref(), expected.as_ref()); + Ok(()) + } + + #[test] + fn test_rle_compression() -> VortexResult<()> { + let mut values = Vec::new(); + values.extend(iter::repeat_n(42i32, 100)); + values.extend(iter::repeat_n(123i32, 200)); + values.extend(iter::repeat_n(987i32, 150)); + + let array = 
PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); + let compressor = CascadingCompressor::new(vec![&RLE_INTEGER_SCHEME]); + let compressed = compressor.compress(&array.into_array())?; + assert!(compressed.is::()); + + let expected = Buffer::copy_from(&values).into_array(); + assert_arrays_eq!(compressed.as_ref(), expected.as_ref()); + Ok(()) + } + + #[test_with::env(CI)] + #[test_with::no_env(VORTEX_SKIP_SLOW_TESTS)] + fn compress_large_int() -> VortexResult<()> { + const NUM_LISTS: usize = 10_000; + const ELEMENTS_PER_LIST: usize = 5_000; + + let prim = (0..NUM_LISTS) + .flat_map(|list_idx| { + (0..ELEMENTS_PER_LIST).map(move |elem_idx| (list_idx * 1000 + elem_idx) as f64) + }) + .collect::() + .into_array(); + + let btr = BtrBlocksCompressor::default(); + drop(btr.compress(&prim)?); + + Ok(()) + } +} + +/// Tests to verify that each integer compression scheme produces the expected encoding. +#[cfg(test)] +mod scheme_selection_tests { + use std::iter; + + use rand::Rng; + use rand::SeedableRng; + use rand::rngs::StdRng; + use vortex_array::IntoArray; + use vortex_array::arrays::Constant; + use vortex_array::arrays::Dict; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::validity::Validity; + use vortex_buffer::Buffer; + use vortex_error::VortexResult; + use vortex_fastlanes::BitPacked; + use vortex_fastlanes::FoR; + use vortex_runend::RunEnd; + use vortex_sequence::Sequence; + use vortex_sparse::Sparse; + + use crate::BtrBlocksCompressor; + + #[test] + fn test_constant_compressed() -> VortexResult<()> { + let values: Vec = iter::repeat_n(42, 100).collect(); + let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); + let btr = BtrBlocksCompressor::default(); + let compressed = btr.compress(&array.into_array())?; + assert!(compressed.is::()); + Ok(()) + } + + #[test] + fn test_for_compressed() -> VortexResult<()> { + let values: Vec = (0..1000).map(|i| 1_000_000 + ((i * 37) % 100)).collect(); + let 
array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); + let btr = BtrBlocksCompressor::default(); + let compressed = btr.compress(&array.into_array())?; + assert!(compressed.is::()); + Ok(()) + } + + #[test] + fn test_bitpacking_compressed() -> VortexResult<()> { + let values: Vec = (0..1000).map(|i| i % 16).collect(); + let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); + let btr = BtrBlocksCompressor::default(); + let compressed = btr.compress(&array.into_array())?; + assert!(compressed.is::()); + Ok(()) + } + + #[test] + fn test_sparse_compressed() -> VortexResult<()> { + let mut values: Vec = Vec::new(); + for i in 0..1000 { + if i % 20 == 0 { + values.push(2_000_000 + (i * 7) % 1000); + } else { + values.push(1_000_000); + } + } + let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); + let btr = BtrBlocksCompressor::default(); + let compressed = btr.compress(&array.into_array())?; + assert!(compressed.is::()); + Ok(()) + } + + #[test] + fn test_dict_compressed() -> VortexResult<()> { + let mut codes = Vec::with_capacity(65_535); + let numbers: Vec = [0, 10, 50, 100, 1000, 3000] + .into_iter() + .map(|i| 12340 * i) // must be big enough to not prefer fastlanes.bitpacked + .collect(); + + let mut rng = StdRng::seed_from_u64(1u64); + while codes.len() < 64000 { + let run_length = rng.next_u32() % 5; + let value = numbers[rng.next_u32() as usize % numbers.len()]; + for _ in 0..run_length { + codes.push(value); + } + } + + let array = PrimitiveArray::new(Buffer::copy_from(&codes), Validity::NonNullable); + let btr = BtrBlocksCompressor::default(); + let compressed = btr.compress(&array.into_array())?; + assert!(compressed.is::()); + Ok(()) + } + + #[test] + fn test_runend_compressed() -> VortexResult<()> { + let mut values: Vec = Vec::new(); + for i in 0..100 { + values.extend(iter::repeat_n((i32::MAX - 50).wrapping_add(i), 10)); + } + let array = 
PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); + let btr = BtrBlocksCompressor::default(); + let compressed = btr.compress(&array.into_array())?; + assert!(compressed.is::()); + Ok(()) + } + + #[test] + fn test_sequence_compressed() -> VortexResult<()> { + let values: Vec = (0..1000).map(|i| i * 7).collect(); + let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); + let btr = BtrBlocksCompressor::default(); + let compressed = btr.compress(&array.into_array())?; + assert!(compressed.is::()); + Ok(()) + } + + #[test] + fn test_rle_compressed() -> VortexResult<()> { + let mut values: Vec = Vec::new(); + for i in 0..1024 { + values.extend(iter::repeat_n(i, 10)); + } + let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable); + let btr = BtrBlocksCompressor::default(); + let compressed = btr.compress(&array.into_array())?; + eprintln!("{}", compressed.display_tree()); + assert!(compressed.is::()); + Ok(()) + } +} diff --git a/vortex-btrblocks/src/schemes/mod.rs b/vortex-btrblocks/src/schemes/mod.rs new file mode 100644 index 00000000000..13f1bfecd25 --- /dev/null +++ b/vortex-btrblocks/src/schemes/mod.rs @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Compression scheme implementations. 
+ +pub mod float; +pub mod integer; +pub mod string; + +pub mod decimal; +pub mod temporal; + +pub(crate) mod patches; +pub(crate) mod rle; diff --git a/vortex-btrblocks/src/compressor/patches.rs b/vortex-btrblocks/src/schemes/patches.rs similarity index 100% rename from vortex-btrblocks/src/compressor/patches.rs rename to vortex-btrblocks/src/schemes/patches.rs diff --git a/vortex-btrblocks/src/schemes/rle.rs b/vortex-btrblocks/src/schemes/rle.rs new file mode 100644 index 00000000000..b301098eb4c --- /dev/null +++ b/vortex-btrblocks/src/schemes/rle.rs @@ -0,0 +1,311 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::fmt::Debug; +use std::marker::PhantomData; + +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_array::IntoArray; +use vortex_array::ToCanonical; +use vortex_array::arrays::PrimitiveArray; +use vortex_compressor::builtins::FloatDictScheme; +use vortex_compressor::builtins::StringDictScheme; +use vortex_compressor::builtins::is_float_primitive; +use vortex_compressor::builtins::is_integer_primitive; +use vortex_compressor::scheme::AncestorExclusion; +use vortex_compressor::scheme::ChildSelection; +use vortex_compressor::scheme::DescendantExclusion; +#[cfg(feature = "unstable_encodings")] +use vortex_compressor::scheme::SchemeId; +use vortex_compressor::stats::FloatStats; +use vortex_compressor::stats::IntegerStats; +use vortex_error::VortexResult; +use vortex_fastlanes::RLEArray; + +use crate::ArrayAndStats; +use crate::CascadingCompressor; +use crate::CompressorContext; +use crate::Scheme; +use crate::SchemeExt; +use crate::estimate_compression_ratio_with_sampling; +use crate::schemes::integer::IntDictScheme; +use crate::schemes::integer::SparseScheme; + +/// Threshold for the average run length in an array before we consider run-length encoding. +pub const RUN_LENGTH_THRESHOLD: u32 = 4; + +/// RLE scheme for integer compression. 
+pub const RLE_INTEGER_SCHEME: RLEScheme = RLEScheme::new(); + +/// RLE scheme for float compression. +pub const RLE_FLOAT_SCHEME: RLEScheme = RLEScheme::new(); + +/// Configuration for integer RLE compression. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct IntRLEConfig; + +/// Configuration for float RLE compression. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct FloatRLEConfig; + +/// Configuration trait for RLE schemes. +/// +/// Implement this trait to define the behavior of an RLE scheme for a specific +/// stats type. +pub trait RLEConfig: Debug + Send + Sync + 'static { + /// The statistics type used by this RLE scheme. + type Stats: RLEStats + 'static; + + /// The globally unique name for this RLE scheme. + const SCHEME_NAME: &'static str; + + /// Whether this scheme can compress the given canonical array. + fn matches(canonical: &Canonical) -> bool; + + /// Generates statistics for the given array. + fn generate_stats(array: &ArrayRef) -> Self::Stats; +} + +impl RLEConfig for IntRLEConfig { + type Stats = IntegerStats; + + const SCHEME_NAME: &'static str = "vortex.int.rle"; + + fn matches(canonical: &Canonical) -> bool { + is_integer_primitive(canonical) + } + + fn generate_stats(array: &ArrayRef) -> IntegerStats { + IntegerStats::generate(&array.to_primitive()) + } +} + +impl RLEConfig for FloatRLEConfig { + type Stats = FloatStats; + + const SCHEME_NAME: &'static str = "vortex.float.rle"; + + fn matches(canonical: &Canonical) -> bool { + is_float_primitive(canonical) + } + + fn generate_stats(array: &ArrayRef) -> FloatStats { + FloatStats::generate(&array.to_primitive()) + } +} + +/// Trait for accessing RLE-specific statistics. +pub trait RLEStats { + /// Returns the number of non-null values. + fn value_count(&self) -> u32; + /// Returns the average run length. + fn average_run_length(&self) -> u32; + /// Returns the underlying source array. 
+ fn source(&self) -> &PrimitiveArray; +} + +impl RLEStats for IntegerStats { + fn value_count(&self) -> u32 { + self.value_count() + } + + fn average_run_length(&self) -> u32 { + self.average_run_length() + } + + fn source(&self) -> &PrimitiveArray { + self.source() + } +} + +impl RLEStats for FloatStats { + fn value_count(&self) -> u32 { + FloatStats::value_count(self) + } + + fn average_run_length(&self) -> u32 { + FloatStats::average_run_length(self) + } + + fn source(&self) -> &PrimitiveArray { + FloatStats::source(self) + } +} + +/// RLE scheme that is generic over a configuration type. +/// +/// This is a ZST (zero-sized type) - all behavior is defined by the `RLEConfig` trait. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct RLEScheme(PhantomData); + +impl RLEScheme { + /// Creates a new RLE scheme. + pub const fn new() -> Self { + Self(PhantomData) + } +} + +impl Default for RLEScheme { + fn default() -> Self { + Self::new() + } +} + +impl Scheme for RLEScheme { + fn scheme_name(&self) -> &'static str { + C::SCHEME_NAME + } + + fn matches(&self, canonical: &Canonical) -> bool { + C::matches(canonical) + } + + /// Children: values=0, indices=1, offsets=2. + fn num_children(&self) -> usize { + 3 + } + + /// RLE indices (child 1) and offsets (child 2) are monotonically increasing positions + /// with all unique values. Dict, RunEnd, and Sparse are all pointless on such data. + /// Self-exclusion already prevents RLE on RLE children. + fn descendant_exclusions(&self) -> Vec { + vec![ + DescendantExclusion { + excluded: IntDictScheme.id(), + children: ChildSelection::Many(&[1, 2]), + }, + // TODO(connor): This is wrong for some reason? + // DescendantExclusion { + // excluded: RunEndScheme.id(), + // children: ChildSelection::Many(&[1, 2]), + // }, + DescendantExclusion { + excluded: SparseScheme.id(), + children: ChildSelection::Many(&[1, 2]), + }, + ] + } + + /// Dict values (child 0) are all unique by definition, so RLE is pointless on them. 
+ fn ancestor_exclusions(&self) -> Vec {
+ vec![
+ AncestorExclusion {
+ ancestor: IntDictScheme.id(),
+ children: ChildSelection::One(0),
+ },
+ AncestorExclusion {
+ ancestor: FloatDictScheme.id(),
+ children: ChildSelection::One(0),
+ },
+ AncestorExclusion {
+ ancestor: StringDictScheme.id(),
+ children: ChildSelection::One(0),
+ },
+ ]
+ }
+
+ fn expected_compression_ratio(
+ &self,
+ compressor: &CascadingCompressor,
+ data: &mut ArrayAndStats,
+ ctx: CompressorContext,
+ ) -> VortexResult {
+ // RLE is only useful when we cascade it with another encoding.
+ let array = data.array().clone();
+ let stats = data.get_or_insert_with::(|| C::generate_stats(&array));
+
+ // Don't compress all-null or empty arrays.
+ if stats.value_count() == 0 {
+ return Ok(0.0);
+ }
+
+ // Check whether RLE is a good fit, based on the average run length.
+ if stats.average_run_length() < RUN_LENGTH_THRESHOLD {
+ return Ok(0.0);
+ }
+
+ // Run compression on a sample to see how it performs.
+ estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx)
+ }
+
+ fn compress(
+ &self,
+ compressor: &CascadingCompressor,
+ data: &mut ArrayAndStats,
+ ctx: CompressorContext,
+ ) -> VortexResult {
+ let array = data.array().clone();
+ let stats = data.get_or_insert_with::(|| C::generate_stats(&array));
+ let rle_array = RLEArray::encode(RLEStats::source(stats))?;
+
+ let compressed_values = compressor.compress_child(
+ &rle_array.values().to_primitive().into_array(),
+ &ctx,
+ self.id(),
+ 0,
+ )?;
+
+ // Delta is an unstable encoding, once we deem it stable we can switch over to this always. 
+ #[cfg(feature = "unstable_encodings")] + let compressed_indices = try_compress_delta( + compressor, + &rle_array.indices().to_primitive().narrow()?.into_array(), + &ctx, + self.id(), + 1, + )?; + + #[cfg(not(feature = "unstable_encodings"))] + let compressed_indices = compressor.compress_child( + &rle_array.indices().to_primitive().narrow()?.into_array(), + &ctx, + self.id(), + 1, + )?; + + let compressed_offsets = compressor.compress_child( + &rle_array + .values_idx_offsets() + .to_primitive() + .narrow()? + .into_array(), + &ctx, + self.id(), + 2, + )?; + + // SAFETY: Recursive compression doesn't affect the invariants. + unsafe { + Ok(RLEArray::new_unchecked( + compressed_values, + compressed_indices, + compressed_offsets, + rle_array.dtype().clone(), + rle_array.offset(), + rle_array.len(), + ) + .into_array()) + } + } +} + +#[cfg(feature = "unstable_encodings")] +fn try_compress_delta( + compressor: &CascadingCompressor, + child: &ArrayRef, + parent_ctx: &CompressorContext, + parent_id: SchemeId, + child_index: usize, +) -> VortexResult { + let (bases, deltas) = + vortex_fastlanes::delta_compress(&child.to_primitive(), &mut compressor.execution_ctx())?; + + let compressed_bases = + compressor.compress_child(&bases.into_array(), parent_ctx, parent_id, child_index)?; + let compressed_deltas = + compressor.compress_child(&deltas.into_array(), parent_ctx, parent_id, child_index)?; + + vortex_fastlanes::DeltaArray::try_from_delta_compress_parts(compressed_bases, compressed_deltas) + .map(vortex_fastlanes::DeltaArray::into_array) +} diff --git a/vortex-btrblocks/src/schemes/string.rs b/vortex-btrblocks/src/schemes/string.rs new file mode 100644 index 00000000000..0840a2dfae2 --- /dev/null +++ b/vortex-btrblocks/src/schemes/string.rs @@ -0,0 +1,363 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! String compression schemes. 
+ +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_array::IntoArray; +use vortex_array::ToCanonical; +use vortex_array::arrays::VarBinArray; +use vortex_array::vtable::ValidityHelper; +use vortex_compressor::scheme::ChildSelection; +use vortex_compressor::scheme::DescendantExclusion; +use vortex_error::VortexResult; +use vortex_fsst::FSSTArray; +use vortex_fsst::fsst_compress; +use vortex_fsst::fsst_train_compressor; +use vortex_sparse::Sparse; +use vortex_sparse::SparseArray; + +use super::integer::IntDictScheme; +use super::integer::SparseScheme as IntSparseScheme; +use crate::ArrayAndStats; +use crate::CascadingCompressor; +use crate::CompressorContext; +use crate::Scheme; +use crate::SchemeExt; + +/// FSST (Fast Static Symbol Table) compression. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct FSSTScheme; + +/// Sparse encoding for null-dominated arrays. +/// +/// This is the same as the integer `SparseScheme`, but we only use this for null-dominated arrays. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct NullDominatedSparseScheme; + +/// Zstd compression without dictionaries (nvCOMP compatible). +#[cfg(feature = "zstd")] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct ZstdScheme; + +/// Zstd buffer-level compression preserving array layout for GPU decompression. +#[cfg(all(feature = "zstd", feature = "unstable_encodings"))] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct ZstdBuffersScheme; + +// Re-export builtin schemes from vortex-compressor. +pub use vortex_compressor::builtins::StringConstantScheme; +pub use vortex_compressor::builtins::StringDictScheme; +pub use vortex_compressor::builtins::is_utf8_string; +pub use vortex_compressor::stats::StringStats; + +impl Scheme for FSSTScheme { + fn scheme_name(&self) -> &'static str { + "vortex.string.fsst" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_utf8_string(canonical) + } + + /// Children: lengths=0, code_offsets=1. 
+ fn num_children(&self) -> usize { + 2 + } + + fn compress( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + let stats = data.string_stats(); + + let fsst = { + let compressor_fsst = fsst_train_compressor(stats.source()); + fsst_compress(stats.source(), &compressor_fsst) + }; + + let compressed_original_lengths = compressor.compress_child( + &fsst + .uncompressed_lengths() + .to_primitive() + .narrow()? + .into_array(), + &ctx, + self.id(), + 0, + )?; + + let compressed_codes_offsets = compressor.compress_child( + &fsst.codes().offsets().to_primitive().narrow()?.into_array(), + &ctx, + self.id(), + 1, + )?; + let compressed_codes = VarBinArray::try_new( + compressed_codes_offsets, + fsst.codes().bytes().clone(), + fsst.codes().dtype().clone(), + fsst.codes().validity().clone(), + )?; + + let fsst = FSSTArray::try_new( + fsst.dtype().clone(), + fsst.symbols().clone(), + fsst.symbol_lengths().clone(), + compressed_codes, + compressed_original_lengths, + )?; + + Ok(fsst.into_array()) + } +} + +impl Scheme for NullDominatedSparseScheme { + fn scheme_name(&self) -> &'static str { + "vortex.string.sparse" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_utf8_string(canonical) + } + + /// Children: indices=0. + fn num_children(&self) -> usize { + 1 + } + + /// The indices of a null-dominated sparse array should not be sparse-encoded again. + fn descendant_exclusions(&self) -> Vec { + vec![ + DescendantExclusion { + excluded: IntSparseScheme.id(), + children: ChildSelection::All, + }, + DescendantExclusion { + excluded: IntDictScheme.id(), + children: ChildSelection::All, + }, + ] + } + + fn expected_compression_ratio( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + let stats = data.string_stats(); + + if stats.value_count() == 0 { + // All nulls should use ConstantScheme. 
+ return Ok(0.0); + } + + // If the majority is null, will compress well. + if stats.null_count() as f64 / stats.source().len() as f64 > 0.9 { + return Ok(stats.source().len() as f64 / stats.value_count() as f64); + } + + // Otherwise we don't go this route. + Ok(0.0) + } + + fn compress( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + let stats = data.string_stats(); + + // We pass None as we only run this pathway for NULL-dominated string arrays. + let sparse_encoded = SparseArray::encode(&stats.source().clone().into_array(), None)?; + + if let Some(sparse) = sparse_encoded.as_opt::() { + // Compress the indices only (not the values for strings). + let indices = sparse.patches().indices().to_primitive().narrow()?; + let compressed_indices = + compressor.compress_child(&indices.into_array(), &ctx, self.id(), 0)?; + + SparseArray::try_new( + compressed_indices, + sparse.patches().values().clone(), + sparse.len(), + sparse.fill_scalar().clone(), + ) + .map(|a| a.into_array()) + } else { + Ok(sparse_encoded) + } + } +} + +#[cfg(feature = "zstd")] +impl Scheme for ZstdScheme { + fn scheme_name(&self) -> &'static str { + "vortex.string.zstd" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_utf8_string(canonical) + } + + fn compress( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + let stats = data.string_stats(); + + let compacted = stats.source().compact_buffers()?; + Ok( + vortex_zstd::ZstdArray::from_var_bin_view_without_dict(&compacted, 3, 8192)? 
+ .into_array(), + ) + } +} + +#[cfg(all(feature = "zstd", feature = "unstable_encodings"))] +impl Scheme for ZstdBuffersScheme { + fn scheme_name(&self) -> &'static str { + "vortex.string.zstd_buffers" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_utf8_string(canonical) + } + + fn compress( + &self, + _compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + let stats = data.string_stats(); + + Ok( + vortex_zstd::ZstdBuffersArray::compress(&stats.source().clone().into_array(), 3)? + .into_array(), + ) + } +} + +#[cfg(test)] +mod tests { + use vortex_array::IntoArray; + use vortex_array::arrays::VarBinViewArray; + use vortex_array::builders::ArrayBuilder; + use vortex_array::builders::VarBinViewBuilder; + use vortex_array::display::DisplayOptions; + use vortex_array::dtype::DType; + use vortex_array::dtype::Nullability; + use vortex_error::VortexResult; + + use crate::BtrBlocksCompressor; + + #[test] + fn test_strings() -> VortexResult<()> { + let mut strings = Vec::new(); + for _ in 0..1024 { + strings.push(Some("hello-world-1234")); + } + for _ in 0..1024 { + strings.push(Some("hello-world-56789")); + } + let strings = VarBinViewArray::from_iter(strings, DType::Utf8(Nullability::NonNullable)); + + let array_ref = strings.into_array(); + let btr = BtrBlocksCompressor::default(); + let compressed = btr.compress(&array_ref)?; + assert_eq!(compressed.len(), 2048); + + let display = compressed + .display_as(DisplayOptions::MetadataOnly) + .to_string() + .to_lowercase(); + assert_eq!(display, "vortex.dict(utf8, len=2048)"); + + Ok(()) + } + + #[test] + fn test_sparse_nulls() -> VortexResult<()> { + let mut strings = VarBinViewBuilder::with_capacity(DType::Utf8(Nullability::Nullable), 100); + strings.append_nulls(99); + + strings.append_value("one little string"); + + let strings = strings.finish_into_varbinview(); + + let array_ref = strings.into_array(); + let btr = 
BtrBlocksCompressor::default(); + let compressed = btr.compress(&array_ref)?; + assert_eq!(compressed.len(), 100); + + let display = compressed + .display_as(DisplayOptions::MetadataOnly) + .to_string() + .to_lowercase(); + assert_eq!(display, "vortex.sparse(utf8?, len=100)"); + + Ok(()) + } +} + +/// Tests to verify that each string compression scheme produces the expected encoding. +#[cfg(test)] +mod scheme_selection_tests { + use vortex_array::IntoArray; + use vortex_array::arrays::Constant; + use vortex_array::arrays::Dict; + use vortex_array::arrays::VarBinViewArray; + use vortex_array::dtype::DType; + use vortex_array::dtype::Nullability; + use vortex_error::VortexResult; + use vortex_fsst::FSST; + + use crate::BtrBlocksCompressor; + + #[test] + fn test_constant_compressed() -> VortexResult<()> { + let strings: Vec> = vec![Some("constant_value"); 100]; + let array = VarBinViewArray::from_iter(strings, DType::Utf8(Nullability::NonNullable)); + let array_ref = array.into_array(); + let compressed = BtrBlocksCompressor::default().compress(&array_ref)?; + assert!(compressed.is::()); + Ok(()) + } + + #[test] + fn test_dict_compressed() -> VortexResult<()> { + let distinct_values = ["apple", "banana", "cherry"]; + let mut strings = Vec::with_capacity(1000); + for i in 0..1000 { + strings.push(Some(distinct_values[i % 3])); + } + let array = VarBinViewArray::from_iter(strings, DType::Utf8(Nullability::NonNullable)); + let array_ref = array.into_array(); + let compressed = BtrBlocksCompressor::default().compress(&array_ref)?; + assert!(compressed.is::()); + Ok(()) + } + + #[test] + fn test_fsst_compressed() -> VortexResult<()> { + let mut strings = Vec::with_capacity(1000); + for i in 0..1000 { + strings.push(Some(format!( + "this_is_a_common_prefix_with_some_variation_{i}_and_a_common_suffix_pattern" + ))); + } + let array = VarBinViewArray::from_iter(strings, DType::Utf8(Nullability::NonNullable)); + let array_ref = array.into_array(); + let compressed = 
BtrBlocksCompressor::default().compress(&array_ref)?; + assert!(compressed.is::()); + Ok(()) + } +} diff --git a/vortex-btrblocks/src/schemes/temporal.rs b/vortex-btrblocks/src/schemes/temporal.rs new file mode 100644 index 00000000000..f1ecb158d96 --- /dev/null +++ b/vortex-btrblocks/src/schemes/temporal.rs @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Temporal compression scheme using datetime-part decomposition. + +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_array::IntoArray; +use vortex_array::ToCanonical; +use vortex_array::aggregate_fn::fns::is_constant::is_constant; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::TemporalArray; +use vortex_array::dtype::extension::Matcher; +use vortex_array::extension::datetime::AnyTemporal; +use vortex_array::extension::datetime::TemporalMetadata; +use vortex_datetime_parts::DateTimePartsArray; +use vortex_datetime_parts::TemporalParts; +use vortex_datetime_parts::split_temporal; +use vortex_error::VortexResult; + +use crate::ArrayAndStats; +use crate::CascadingCompressor; +use crate::CompressorContext; +use crate::Scheme; +use crate::SchemeExt; + +/// Compression scheme for temporal timestamp arrays via datetime-part decomposition. +/// +/// Splits timestamps into days, seconds, and subseconds components, compresses each +/// independently, and wraps the result in a [`DateTimePartsArray`]. 
+#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct TemporalScheme; + +impl Scheme for TemporalScheme { + fn scheme_name(&self) -> &'static str { + "vortex.ext.temporal" + } + + fn matches(&self, canonical: &Canonical) -> bool { + let Canonical::Extension(ext) = canonical else { + return false; + }; + + let ext_dtype = ext.ext_dtype(); + + matches!( + AnyTemporal::try_match(ext_dtype), + Some(TemporalMetadata::Timestamp(..)) + ) + } + + fn detects_constant(&self) -> bool { + true + } + + /// Children: days=0, seconds=1, subseconds=2. + fn num_children(&self) -> usize { + 3 + } + + fn expected_compression_ratio( + &self, + _compressor: &CascadingCompressor, + _data: &mut ArrayAndStats, + _ctx: CompressorContext, + ) -> VortexResult { + // Temporal compression (splitting into parts) is almost always beneficial. + // Return a moderate ratio to ensure this scheme is selected. + Ok(f64::MAX) + } + + fn compress( + &self, + compressor: &CascadingCompressor, + data: &mut ArrayAndStats, + ctx: CompressorContext, + ) -> VortexResult { + let array = data.array().clone(); + let ext_array = array.to_extension(); + let temporal_array = TemporalArray::try_from(ext_array.clone().into_array())?; + + // Check for constant array and return early if so. 
+ let is_constant = is_constant( + &ext_array.clone().into_array(), + &mut compressor.execution_ctx(), + )?; + + if is_constant { + return Ok( + ConstantArray::new(temporal_array.as_ref().scalar_at(0)?, ext_array.len()) + .into_array(), + ); + } + + let dtype = temporal_array.dtype().clone(); + let TemporalParts { + days, + seconds, + subseconds, + } = split_temporal(temporal_array)?; + + let days = compressor.compress_child( + &days.to_primitive().narrow()?.into_array(), + &ctx, + self.id(), + 0, + )?; + let seconds = compressor.compress_child( + &seconds.to_primitive().narrow()?.into_array(), + &ctx, + self.id(), + 1, + )?; + let subseconds = compressor.compress_child( + &subseconds.to_primitive().narrow()?.into_array(), + &ctx, + self.id(), + 2, + )?; + + Ok(DateTimePartsArray::try_new(dtype, days, seconds, subseconds)?.into_array()) + } +} diff --git a/vortex-btrblocks/src/stats.rs b/vortex-btrblocks/src/stats.rs deleted file mode 100644 index b3e25cfb8d6..00000000000 --- a/vortex-btrblocks/src/stats.rs +++ /dev/null @@ -1,63 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Compression statistics types. - -use std::fmt::Debug; - -use vortex_array::vtable::VTable; - -/// Configures how stats are generated. -pub struct GenerateStatsOptions { - /// Should distinct values should be counted during stats generation. - pub count_distinct_values: bool, - // pub count_runs: bool, - // should this be scheme-specific? -} - -impl Default for GenerateStatsOptions { - fn default() -> Self { - Self { - count_distinct_values: true, - // count_runs: true, - } - } -} - -/// The size of each sampled run. -pub(crate) const SAMPLE_SIZE: u32 = 64; -/// The number of sampled runs. -/// -/// # Warning -/// -/// The product of SAMPLE_SIZE and SAMPLE_COUNT should be (roughly) a multiple of 1024 so that -/// fastlanes bitpacking of sampled vectors does not introduce (large amounts of) padding. 
-pub(crate) const SAMPLE_COUNT: u32 = 16; - -/// Stats for the compressor. -pub trait CompressorStats: Debug + Clone { - /// The type of the underlying source array vtable. - type ArrayVTable: VTable; - - /// Generates stats with default options. - fn generate(input: &::Array) -> Self { - Self::generate_opts(input, GenerateStatsOptions::default()) - } - - /// Generates stats with provided options. - fn generate_opts( - input: &::Array, - opts: GenerateStatsOptions, - ) -> Self; - - /// Returns the underlying source array that statistics were generated from. - fn source(&self) -> &::Array; - - /// Sample the array with default options. - fn sample(&self, sample_size: u32, sample_count: u32) -> Self { - self.sample_opts(sample_size, sample_count, GenerateStatsOptions::default()) - } - - /// Sample the array with provided options. - fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self; -}