diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index b503a41ea..2f34fd038 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -25,6 +25,7 @@ set(ICEBERG_SOURCES data/position_delete_writer.cc data/writer.cc delete_file_index.cc + deletes/roaring_position_bitmap.cc expression/aggregate.cc expression/binder.cc expression/evaluator.cc @@ -166,6 +167,7 @@ iceberg_install_all_headers(iceberg) add_subdirectory(catalog) add_subdirectory(data) +add_subdirectory(deletes) add_subdirectory(expression) add_subdirectory(manifest) add_subdirectory(puffin) diff --git a/src/iceberg/deletes/CMakeLists.txt b/src/iceberg/deletes/CMakeLists.txt new file mode 100644 index 000000000..2ce7ccf15 --- /dev/null +++ b/src/iceberg/deletes/CMakeLists.txt @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +iceberg_install_all_headers(iceberg/deletes) diff --git a/src/iceberg/deletes/meson.build b/src/iceberg/deletes/meson.build new file mode 100644 index 000000000..28a01de16 --- /dev/null +++ b/src/iceberg/deletes/meson.build @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +install_headers(['roaring_position_bitmap.h'], subdir: 'iceberg/deletes') diff --git a/src/iceberg/deletes/roaring_position_bitmap.cc b/src/iceberg/deletes/roaring_position_bitmap.cc new file mode 100644 index 000000000..fec037887 --- /dev/null +++ b/src/iceberg/deletes/roaring_position_bitmap.cc @@ -0,0 +1,270 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/deletes/roaring_position_bitmap.h" + +#include +#include +#include +#include +#include + +#include + +#include "iceberg/util/endian.h" +#include "iceberg/util/macros.h" + +namespace iceberg { + +namespace { + +constexpr size_t kBitmapCountSizeBytes = 8; +constexpr size_t kBitmapKeySizeBytes = 4; + +// Extracts high 32 bits from a 64-bit position (the key). +int32_t Key(int64_t pos) { return static_cast(pos >> 32); } + +// Extracts low 32 bits from a 64-bit position. +uint32_t Pos32Bits(int64_t pos) { return static_cast(0xFFFFFFFF & pos); } + +// Combines key (high 32 bits) and pos32 (low 32 bits) into a 64-bit +// position. The low 32 bits are zero-extended to avoid sign extension. +int64_t ToPosition(int32_t key, uint32_t pos32) { + return (int64_t{key} << 32) | int64_t{pos32}; +} + +void WriteLE64(char* buf, int64_t value) { + auto le = ToLittleEndian(static_cast(value)); + std::memcpy(buf, &le, sizeof(le)); +} + +void WriteLE32(char* buf, int32_t value) { + auto le = ToLittleEndian(static_cast(value)); + std::memcpy(buf, &le, sizeof(le)); +} + +int64_t ReadLE64(const char* buf) { + uint64_t v; + std::memcpy(&v, buf, sizeof(v)); + return static_cast(FromLittleEndian(v)); +} + +int32_t ReadLE32(const char* buf) { + uint32_t v; + std::memcpy(&v, buf, sizeof(v)); + return static_cast(FromLittleEndian(v)); +} + +Status ValidatePosition(int64_t pos) { + if (pos < 0 || pos > RoaringPositionBitmap::kMaxPosition) { + return InvalidArgument("Bitmap supports positions that are >= 0 and <= {}: {}", + RoaringPositionBitmap::kMaxPosition, pos); + } + return {}; +} + +} // namespace + +struct RoaringPositionBitmap::Impl { + std::vector bitmaps; + + void AllocateBitmapsIfNeeded(int32_t required_length) { + if (std::cmp_less(bitmaps.size(), required_length)) { + bitmaps.resize(static_cast(required_length)); + } + } +}; + +RoaringPositionBitmap::RoaringPositionBitmap() : impl_(std::make_unique()) {} + +RoaringPositionBitmap::~RoaringPositionBitmap() = default; + +RoaringPositionBitmap::RoaringPositionBitmap(const RoaringPositionBitmap& other) + : impl_(other.impl_ != nullptr ? std::make_unique(*other.impl_) + : std::make_unique()) {} + +RoaringPositionBitmap& RoaringPositionBitmap::operator=( + const RoaringPositionBitmap& other) { + if (this == &other) { + return *this; + } + impl_ = other.impl_ != nullptr ? std::make_unique(*other.impl_) + : std::make_unique(); + return *this; +} + +RoaringPositionBitmap::RoaringPositionBitmap(RoaringPositionBitmap&&) noexcept = default; + +RoaringPositionBitmap& RoaringPositionBitmap::operator=( + RoaringPositionBitmap&&) noexcept = default; + +RoaringPositionBitmap::RoaringPositionBitmap(std::unique_ptr impl) + : impl_(std::move(impl)) {} + +Status RoaringPositionBitmap::Add(int64_t pos) { + ICEBERG_RETURN_UNEXPECTED(ValidatePosition(pos)); + int32_t key = Key(pos); + uint32_t pos32 = Pos32Bits(pos); + impl_->AllocateBitmapsIfNeeded(key + 1); + impl_->bitmaps[key].add(pos32); + return {}; +} + +Status RoaringPositionBitmap::AddRange(int64_t pos_start, int64_t pos_end) { + for (int64_t pos = pos_start; pos < pos_end; ++pos) { + ICEBERG_RETURN_UNEXPECTED(Add(pos)); + } + return {}; +} + +Result RoaringPositionBitmap::Contains(int64_t pos) const { + ICEBERG_RETURN_UNEXPECTED(ValidatePosition(pos)); + int32_t key = Key(pos); + uint32_t pos32 = Pos32Bits(pos); + return std::cmp_less(key, impl_->bitmaps.size()) && impl_->bitmaps[key].contains(pos32); +} + +bool RoaringPositionBitmap::IsEmpty() const { return Cardinality() == 0; } + +size_t RoaringPositionBitmap::Cardinality() const { + size_t total = 0; + for (const auto& bitmap : impl_->bitmaps) { + total += bitmap.cardinality(); + } + return total; +} + +void RoaringPositionBitmap::Or(const RoaringPositionBitmap& other) { + impl_->AllocateBitmapsIfNeeded(static_cast(other.impl_->bitmaps.size())); + for (size_t key = 0; key < other.impl_->bitmaps.size(); ++key) { + impl_->bitmaps[key] |= other.impl_->bitmaps[key]; + } +} + +bool RoaringPositionBitmap::Optimize() { + bool changed = false; + for (auto& bitmap : impl_->bitmaps) { + changed |= bitmap.runOptimize(); + } + return changed; +} + +void RoaringPositionBitmap::ForEach(const std::function& fn) const { + for (size_t key = 0; key < impl_->bitmaps.size(); ++key) { + for (uint32_t pos32 : impl_->bitmaps[key]) { + fn(ToPosition(static_cast(key), pos32)); + } + } +} + +size_t RoaringPositionBitmap::SerializedSizeInBytes() const { + size_t size = kBitmapCountSizeBytes; + for (const auto& bitmap : impl_->bitmaps) { + size += kBitmapKeySizeBytes + bitmap.getSizeInBytes(/*portable=*/true); + } + return size; +} + +// Serializes using the portable format (little-endian). +// See https://iceberg.apache.org/puffin-spec/#deletion-vector-v1-blob-type +Result RoaringPositionBitmap::Serialize() const { + size_t size = SerializedSizeInBytes(); + std::string result(size, '\0'); + char* buf = result.data(); + + // Write bitmap count (array length including empties) + WriteLE64(buf, static_cast(impl_->bitmaps.size())); + buf += kBitmapCountSizeBytes; + + // Write each bitmap with its key + for (int32_t key = 0; std::cmp_less(key, impl_->bitmaps.size()); ++key) { + WriteLE32(buf, key); + buf += kBitmapKeySizeBytes; + size_t written = impl_->bitmaps[key].write(buf, /*portable=*/true); + buf += written; + } + + return result; +} + +Result RoaringPositionBitmap::Deserialize(std::string_view bytes) { + const char* buf = bytes.data(); + size_t remaining = bytes.size(); + + ICEBERG_PRECHECK(remaining >= kBitmapCountSizeBytes, + "Buffer too small for bitmap count: {} bytes", remaining); + + int64_t bitmap_count = ReadLE64(buf); + buf += kBitmapCountSizeBytes; + remaining -= kBitmapCountSizeBytes; + + ICEBERG_PRECHECK( + bitmap_count >= 0 && bitmap_count <= std::numeric_limits::max(), + "Invalid bitmap count: {}", bitmap_count); + + auto impl = std::make_unique(); + int32_t last_key = -1; + auto remaining_count = static_cast(bitmap_count); + + while (remaining_count > 0) { + ICEBERG_PRECHECK(remaining >= kBitmapKeySizeBytes, + "Buffer too small for bitmap key: {} bytes", remaining); + + int32_t key = ReadLE32(buf); + buf += kBitmapKeySizeBytes; + remaining -= kBitmapKeySizeBytes; + + ICEBERG_PRECHECK(key >= 0, "Invalid unsigned key: {}", key); + ICEBERG_PRECHECK(key < std::numeric_limits::max(), "Key is too large: {}", + key); + ICEBERG_PRECHECK(key > last_key, + "Keys must be sorted in ascending order, got key {} after {}", key, + last_key); + + // Fill gaps with empty bitmaps + while (last_key < key - 1) { + impl->bitmaps.emplace_back(); + ++last_key; + } + + // Read bitmap using portable safe deserialization. + // CRoaring's readSafe may throw on corrupted data. + roaring::Roaring bitmap; + try { + bitmap = roaring::Roaring::readSafe(buf, remaining); + } catch (const std::exception& e) { + return InvalidArgument("Failed to deserialize bitmap at key {}: {}", key, e.what()); + } + size_t bitmap_size = bitmap.getSizeInBytes(/*portable=*/true); + ICEBERG_PRECHECK( + bitmap_size <= remaining, + "Buffer too small for bitmap key {}: {} bytes needed, {} bytes available", key, + bitmap_size, remaining); + buf += bitmap_size; + remaining -= bitmap_size; + + impl->bitmaps.emplace_back(std::move(bitmap)); + last_key = key; + --remaining_count; + } + + return RoaringPositionBitmap(std::move(impl)); +} + +} // namespace iceberg diff --git a/src/iceberg/deletes/roaring_position_bitmap.h b/src/iceberg/deletes/roaring_position_bitmap.h new file mode 100644 index 000000000..650160c1a --- /dev/null +++ b/src/iceberg/deletes/roaring_position_bitmap.h @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/deletes/roaring_position_bitmap.h +/// A 64-bit position bitmap using an array of 32-bit Roaring bitmaps. + +#include +#include +#include +#include +#include + +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" + +namespace iceberg { + +/// \brief A bitmap that supports positive 64-bit positions, optimized +/// for cases where most positions fit in 32 bits. +/// +/// Incoming 64-bit positions are divided into a 32-bit "key" using the +/// most significant 4 bytes and a 32-bit position using the least +/// significant 4 bytes. For each key, a 32-bit Roaring bitmap is +/// maintained to store positions for that key. +/// +/// \note The Puffin deletion-vector-v1 wrapping (length prefix, magic +/// bytes, CRC-32) is handled by the Puffin writer/reader layer, not +/// this class. +class ICEBERG_EXPORT RoaringPositionBitmap { + public: + /// \brief Maximum supported position (aligned with the Java implementation). + static constexpr int64_t kMaxPosition = 0x7FFFFFFE80000000LL; + + RoaringPositionBitmap(); + ~RoaringPositionBitmap(); + + RoaringPositionBitmap(RoaringPositionBitmap&& other) noexcept; + RoaringPositionBitmap& operator=(RoaringPositionBitmap&& other) noexcept; + + RoaringPositionBitmap(const RoaringPositionBitmap& other); + RoaringPositionBitmap& operator=(const RoaringPositionBitmap& other); + + /// \brief Sets a position in the bitmap. + /// \param pos the position (must be >= 0 and <= kMaxPosition) + /// \return Status indicating success or InvalidArgument error + [[nodiscard]] Status Add(int64_t pos); + + /// \brief Sets a range of positions [pos_start, pos_end). + /// \return Status indicating success or InvalidArgument error + [[nodiscard]] Status AddRange(int64_t pos_start, int64_t pos_end); + + /// \brief Checks if a position is set in the bitmap. + /// \return Result or InvalidArgument error + [[nodiscard]] Result Contains(int64_t pos) const; + + /// \brief Returns true if the bitmap has no positions set. + bool IsEmpty() const; + + /// \brief Returns the number of set positions in the bitmap. + size_t Cardinality() const; + + /// \brief Merges all positions from the other bitmap into this one + /// (in-place union). + void Or(const RoaringPositionBitmap& other); + + /// \brief Optimizes the bitmap by applying run-length encoding to + /// containers where it is more space efficient than array or bitset + /// representations. + /// \return true if any container was changed + bool Optimize(); + + /// \brief Iterates over all set positions in ascending order. + void ForEach(const std::function& fn) const; + + /// \brief Returns the serialized size in bytes. + size_t SerializedSizeInBytes() const; + + /// \brief Serializes using the portable format (little-endian). + Result Serialize() const; + + /// \brief Deserializes a bitmap from bytes. + static Result Deserialize(std::string_view bytes); + + private: + struct Impl; + std::unique_ptr impl_; + + explicit RoaringPositionBitmap(std::unique_ptr impl); +}; + +} // namespace iceberg diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build index 2cf1065b0..42f46d5a9 100644 --- a/src/iceberg/meson.build +++ b/src/iceberg/meson.build @@ -43,6 +43,7 @@ iceberg_sources = files( 'arrow_c_data_guard_internal.cc', 'catalog/memory/in_memory_catalog.cc', 'delete_file_index.cc', + 'deletes/roaring_position_bitmap.cc', 'expression/aggregate.cc', 'expression/binder.cc', 'expression/evaluator.cc', @@ -221,6 +222,7 @@ install_headers( ) subdir('catalog') +subdir('deletes') subdir('expression') subdir('manifest') subdir('puffin') diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt index 768e0507e..c802df8a5 100644 --- a/src/iceberg/test/CMakeLists.txt +++ b/src/iceberg/test/CMakeLists.txt @@ -115,6 +115,7 @@ add_iceberg_test(util_test endian_test.cc formatter_test.cc location_util_test.cc + roaring_position_bitmap_test.cc string_util_test.cc transform_util_test.cc truncate_util_test.cc diff --git a/src/iceberg/test/meson.build b/src/iceberg/test/meson.build index df2d5db8e..1eafc5fe1 100644 --- a/src/iceberg/test/meson.build +++ b/src/iceberg/test/meson.build @@ -90,6 +90,7 @@ iceberg_tests = { 'endian_test.cc', 'formatter_test.cc', 'location_util_test.cc', + 'roaring_position_bitmap_test.cc', 'string_util_test.cc', 'transform_util_test.cc', 'truncate_util_test.cc', diff --git a/src/iceberg/test/resources/64map32bitvals.bin b/src/iceberg/test/resources/64map32bitvals.bin new file mode 100644 index 000000000..475b89441 Binary files /dev/null and b/src/iceberg/test/resources/64map32bitvals.bin differ diff --git a/src/iceberg/test/resources/64mapempty.bin b/src/iceberg/test/resources/64mapempty.bin new file mode 100644 index 000000000..1b1cb4d44 Binary files /dev/null and b/src/iceberg/test/resources/64mapempty.bin differ diff --git a/src/iceberg/test/resources/64maphighvals.bin b/src/iceberg/test/resources/64maphighvals.bin new file mode 100644 index 000000000..d4312b8d2 Binary files /dev/null and b/src/iceberg/test/resources/64maphighvals.bin differ diff --git a/src/iceberg/test/resources/64mapspreadvals.bin b/src/iceberg/test/resources/64mapspreadvals.bin new file mode 100644 index 000000000..83c72f6ba Binary files /dev/null and b/src/iceberg/test/resources/64mapspreadvals.bin differ diff --git a/src/iceberg/test/roaring_position_bitmap_test.cc b/src/iceberg/test/roaring_position_bitmap_test.cc new file mode 100644 index 000000000..8d7020b09 --- /dev/null +++ b/src/iceberg/test/roaring_position_bitmap_test.cc @@ -0,0 +1,565 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/deletes/roaring_position_bitmap.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "iceberg/test/matchers.h" +#include "iceberg/test/test_config.h" + +namespace iceberg { + +namespace { + +constexpr int64_t kBitmapSize = 0xFFFFFFFFL; +constexpr int64_t kBitmapOffset = kBitmapSize + 1L; +constexpr int64_t kContainerSize = 0xFFFF; // Character.MAX_VALUE +constexpr int64_t kContainerOffset = kContainerSize + 1L; + +// Helper to construct a position from bitmap index, container index, and value +int64_t Position(int32_t bitmap_index, int32_t container_index, int64_t value) { + return bitmap_index * kBitmapOffset + container_index * kContainerOffset + value; +} + +std::string ReadTestResource(const std::string& filename) { + std::filesystem::path path = std::filesystem::path(ICEBERG_TEST_RESOURCES) / filename; + std::ifstream file(path, std::ios::binary); + EXPECT_TRUE(file.good()) << "Cannot open: " << path; + return {std::istreambuf_iterator(file), std::istreambuf_iterator()}; +} + +RoaringPositionBitmap RoundTripSerialize(const RoaringPositionBitmap& bitmap) { + auto serialized = bitmap.Serialize(); + EXPECT_THAT(serialized, IsOk()); + auto deserialized = RoaringPositionBitmap::Deserialize(serialized.value()); + EXPECT_THAT(deserialized, IsOk()); + return std::move(deserialized.value()); +} + +void AssertEqualContent(const RoaringPositionBitmap& bitmap, + const std::set& positions) { + ASSERT_EQ(bitmap.Cardinality(), positions.size()); + for (int64_t pos : positions) { + ASSERT_THAT(bitmap.Contains(pos), HasValue(::testing::Eq(true))) + << "Position not found: " << pos; + } + bitmap.ForEach([&](int64_t pos) { + ASSERT_TRUE(positions.contains(pos)) << "Unexpected position: " << pos; + }); +} + +void AssertEqual(RoaringPositionBitmap& bitmap, const std::set& positions) { + AssertEqualContent(bitmap, positions); + + auto copy1 = RoundTripSerialize(bitmap); + AssertEqualContent(copy1, positions); + + bitmap.Optimize(); + auto copy2 = RoundTripSerialize(bitmap); + AssertEqualContent(copy2, positions); +} + +RoaringPositionBitmap BuildBitmap(const std::set& positions) { + RoaringPositionBitmap bitmap; + for (int64_t pos : positions) { + EXPECT_THAT(bitmap.Add(pos), IsOk()); + } + return bitmap; +} + +struct AddRangeParams { + const char* name; + int64_t start; + int64_t end; + std::vector absent_positions; +}; + +class RoaringPositionBitmapAddRangeTest + : public ::testing::TestWithParam {}; + +TEST_P(RoaringPositionBitmapAddRangeTest, AddsExpectedPositions) { + const auto& param = GetParam(); + RoaringPositionBitmap bitmap; + ASSERT_THAT(bitmap.AddRange(param.start, param.end), IsOk()); + + std::set expected_positions; + for (int64_t pos = param.start; pos < param.end; ++pos) { + expected_positions.insert(pos); + } + AssertEqualContent(bitmap, expected_positions); + for (int64_t pos : param.absent_positions) { + ASSERT_FALSE(bitmap.Contains(pos).value()); + } +} + +INSTANTIATE_TEST_SUITE_P( + AddRangeScenarios, RoaringPositionBitmapAddRangeTest, + ::testing::Values(AddRangeParams{.name = "single_key", + .start = 10, + .end = 20, + .absent_positions = {9, 20}}, + AddRangeParams{ + .name = "across_keys", + .start = (int64_t{1} << 32) - 5, + .end = (int64_t{1} << 32) + 5, + .absent_positions = {0, (int64_t{1} << 32) + 5}, + }), + [](const ::testing::TestParamInfo& info) { return info.param.name; }); + +struct OrParams { + const char* name; + std::set lhs_input; + std::set rhs_input; + std::set lhs_expected; + std::set rhs_expected; +}; + +class RoaringPositionBitmapOrTest : public ::testing::TestWithParam {}; + +TEST_P(RoaringPositionBitmapOrTest, ProducesExpectedUnionAndKeepsRhsUnchanged) { + const auto& param = GetParam(); + auto lhs = BuildBitmap(param.lhs_input); + auto rhs = BuildBitmap(param.rhs_input); + + lhs.Or(rhs); + + AssertEqualContent(lhs, param.lhs_expected); + AssertEqualContent(rhs, param.rhs_expected); +} + +INSTANTIATE_TEST_SUITE_P( + OrScenarios, RoaringPositionBitmapOrTest, + ::testing::Values( + OrParams{ + .name = "disjoint", + .lhs_input = {10L, 20L}, + .rhs_input = {30L, 40L, int64_t{2} << 32}, + .lhs_expected = {10L, 20L, 30L, 40L, int64_t{2} << 32}, + .rhs_expected = {30L, 40L, int64_t{2} << 32}, + }, + OrParams{ + .name = "rhs_empty", + .lhs_input = {10L, 20L}, + .rhs_input = {}, + .lhs_expected = {10L, 20L}, + .rhs_expected = {}, + }, + OrParams{ + .name = "overlapping", + .lhs_input = {10L, 20L, 30L}, + .rhs_input = {20L, 40L}, + .lhs_expected = {10L, 20L, 30L, 40L}, + .rhs_expected = {20L, 40L}, + }, + OrParams{ + .name = "sparse_multi_key", + .lhs_input = {100L, (int64_t{1} << 32) | 200L}, + .rhs_input = {(int64_t{2} << 32) | 300L, (int64_t{3} << 32) | 400L}, + .lhs_expected = {100L, (int64_t{1} << 32) | 200L, (int64_t{2} << 32) | 300L, + (int64_t{3} << 32) | 400L}, + .rhs_expected = {(int64_t{2} << 32) | 300L, (int64_t{3} << 32) | 400L}, + }), + [](const ::testing::TestParamInfo& info) { return info.param.name; }); + +enum class InteropBitmapShape { + kEmpty, + kOnly32BitPositions, + kSpreadAcrossKeys, +}; + +struct InteropCase { + const char* file_name; + InteropBitmapShape expected_shape; +}; + +void AssertInteropBitmapShape(const RoaringPositionBitmap& bitmap, + InteropBitmapShape expected_shape) { + bool saw_pos_lt_32_bit = false; + bool saw_pos_ge_32_bit = false; + + bitmap.ForEach([&](int64_t pos) { + if (pos < (int64_t{1} << 32)) { + saw_pos_lt_32_bit = true; + } else { + saw_pos_ge_32_bit = true; + } + }); + + switch (expected_shape) { + case InteropBitmapShape::kEmpty: + ASSERT_TRUE(bitmap.IsEmpty()); + ASSERT_EQ(bitmap.Cardinality(), 0u); + break; + case InteropBitmapShape::kOnly32BitPositions: + ASSERT_GT(bitmap.Cardinality(), 0u); + ASSERT_TRUE(saw_pos_lt_32_bit); + ASSERT_FALSE(saw_pos_ge_32_bit); + break; + case InteropBitmapShape::kSpreadAcrossKeys: + ASSERT_GT(bitmap.Cardinality(), 0u); + ASSERT_TRUE(saw_pos_lt_32_bit); + ASSERT_TRUE(saw_pos_ge_32_bit); + break; + } +} + +} // namespace + +TEST(RoaringPositionBitmapTest, TestAdd) { + RoaringPositionBitmap bitmap; + + ASSERT_THAT(bitmap.Add(10L), IsOk()); + ASSERT_TRUE(bitmap.Contains(10L).value()); + + ASSERT_THAT(bitmap.Add(0L), IsOk()); + ASSERT_TRUE(bitmap.Contains(0L).value()); + + ASSERT_THAT(bitmap.Add(10L), IsOk()); // duplicate + ASSERT_TRUE(bitmap.Contains(10L).value()); +} + +TEST(RoaringPositionBitmapTest, TestAddPositionsRequiringMultipleBitmaps) { + RoaringPositionBitmap bitmap; + + int64_t pos1 = 10L; + int64_t pos2 = (int64_t{1} << 32) | 20L; + int64_t pos3 = (int64_t{2} << 32) | 30L; + int64_t pos4 = (int64_t{100} << 32) | 40L; + + ASSERT_THAT(bitmap.Add(pos1), IsOk()); + ASSERT_THAT(bitmap.Add(pos2), IsOk()); + ASSERT_THAT(bitmap.Add(pos3), IsOk()); + ASSERT_THAT(bitmap.Add(pos4), IsOk()); + + AssertEqualContent(bitmap, {pos1, pos2, pos3, pos4}); + ASSERT_EQ(bitmap.SerializedSizeInBytes(), 1260); +} + +TEST(RoaringPositionBitmapTest, TestAddEmptyRange) { + RoaringPositionBitmap bitmap; + ASSERT_THAT(bitmap.AddRange(10, 10), IsOk()); + ASSERT_TRUE(bitmap.IsEmpty()); +} + +TEST(RoaringPositionBitmapTest, TestCardinality) { + RoaringPositionBitmap bitmap; + + ASSERT_EQ(bitmap.Cardinality(), 0); + + ASSERT_THAT(bitmap.Add(10L), IsOk()); + ASSERT_THAT(bitmap.Add(20L), IsOk()); + ASSERT_THAT(bitmap.Add(30L), IsOk()); + ASSERT_EQ(bitmap.Cardinality(), 3); + + ASSERT_THAT(bitmap.Add(10L), IsOk()); // already exists + ASSERT_EQ(bitmap.Cardinality(), 3); +} + +TEST(RoaringPositionBitmapTest, TestCardinalitySparseBitmaps) { + RoaringPositionBitmap bitmap; + + ASSERT_THAT(bitmap.Add(100L), IsOk()); + ASSERT_THAT(bitmap.Add(101L), IsOk()); + ASSERT_THAT(bitmap.Add(105L), IsOk()); + ASSERT_THAT(bitmap.Add((int64_t{1} << 32) | 200L), IsOk()); + ASSERT_THAT(bitmap.Add((int64_t{100} << 32) | 300L), IsOk()); + + ASSERT_EQ(bitmap.Cardinality(), 5); +} + +TEST(RoaringPositionBitmapTest, TestSerializeDeserializeRoundTrip) { + RoaringPositionBitmap bitmap; + ASSERT_THAT(bitmap.Add(10L), IsOk()); + ASSERT_THAT(bitmap.Add(20L), IsOk()); + ASSERT_THAT(bitmap.Add((int64_t{1} << 32) | 30L), IsOk()); + + auto copy = RoundTripSerialize(bitmap); + AssertEqualContent(copy, {10L, 20L, (int64_t{1} << 32) | 30L}); +} + +TEST(RoaringPositionBitmapTest, TestCopyConstructor) { + RoaringPositionBitmap bitmap; + ASSERT_THAT(bitmap.Add(10L), IsOk()); + ASSERT_THAT(bitmap.Add((int64_t{2} << 32) | 30L), IsOk()); + + RoaringPositionBitmap copy(bitmap); + + AssertEqualContent(copy, {10L, (int64_t{2} << 32) | 30L}); + + // Copy is independent from source. + ASSERT_THAT(copy.Add(99L), IsOk()); + ASSERT_THAT(bitmap.Contains(99L), HasValue(::testing::Eq(false))); +} + +TEST(RoaringPositionBitmapTest, TestCopyAssignment) { + RoaringPositionBitmap bitmap; + ASSERT_THAT(bitmap.Add(10L), IsOk()); + ASSERT_THAT(bitmap.Add((int64_t{3} << 32) | 40L), IsOk()); + + RoaringPositionBitmap assigned; + ASSERT_THAT(assigned.Add(1L), IsOk()); + + assigned = bitmap; + AssertEqualContent(assigned, {10L, (int64_t{3} << 32) | 40L}); + + // Assignment result is independent from source. + ASSERT_THAT(bitmap.Add(200L), IsOk()); + ASSERT_THAT(assigned.Contains(200L), HasValue(::testing::Eq(false))); +} + +TEST(RoaringPositionBitmapTest, TestSerializeDeserializeEmpty) { + RoaringPositionBitmap bitmap; + auto copy = RoundTripSerialize(bitmap); + ASSERT_TRUE(copy.IsEmpty()); + ASSERT_EQ(copy.Cardinality(), 0); +} + +TEST(RoaringPositionBitmapTest, TestSerializeDeserializeAllContainerBitmap) { + RoaringPositionBitmap bitmap; + + // bitmap 0, container 0 (array - few elements) + ASSERT_THAT(bitmap.Add(Position(0, 0, 5)), IsOk()); + ASSERT_THAT(bitmap.Add(Position(0, 0, 7)), IsOk()); + + // bitmap 0, container 1 (array that can be compressed) + ASSERT_THAT(bitmap.AddRange(Position(0, 1, 1), Position(0, 1, 1000)), IsOk()); + + // bitmap 0, container 2 (bitset - nearly full container) + ASSERT_THAT(bitmap.AddRange(Position(0, 2, 1), Position(0, 2, kContainerOffset - 1)), + IsOk()); + + // bitmap 1, container 0 (array) + ASSERT_THAT(bitmap.Add(Position(1, 0, 10)), IsOk()); + ASSERT_THAT(bitmap.Add(Position(1, 0, 20)), IsOk()); + + // bitmap 1, container 1 (array that can be compressed) + ASSERT_THAT(bitmap.AddRange(Position(1, 1, 10), Position(1, 1, 500)), IsOk()); + + // bitmap 1, container 2 (bitset) + ASSERT_THAT(bitmap.AddRange(Position(1, 2, 1), Position(1, 2, kContainerOffset - 1)), + IsOk()); + + ASSERT_TRUE(bitmap.Optimize()); + + auto copy = RoundTripSerialize(bitmap); + std::set expected_positions; + bitmap.ForEach([&](int64_t pos) { expected_positions.insert(pos); }); + + AssertEqualContent(copy, expected_positions); +} + +TEST(RoaringPositionBitmapTest, TestForEach) { + RoaringPositionBitmap bitmap; + ASSERT_THAT(bitmap.Add(30L), IsOk()); + ASSERT_THAT(bitmap.Add(10L), IsOk()); + ASSERT_THAT(bitmap.Add(20L), IsOk()); + ASSERT_THAT(bitmap.Add((int64_t{1} << 32) | 5L), IsOk()); + + std::vector positions; + bitmap.ForEach([&](int64_t pos) { positions.push_back(pos); }); + + ASSERT_EQ(positions.size(), 4u); + ASSERT_EQ(positions[0], 10L); + ASSERT_EQ(positions[1], 20L); + ASSERT_EQ(positions[2], 30L); + ASSERT_EQ(positions[3], (int64_t{1} << 32) | 5L); +} + +TEST(RoaringPositionBitmapTest, TestIsEmpty) { + RoaringPositionBitmap bitmap; + ASSERT_TRUE(bitmap.IsEmpty()); + + ASSERT_THAT(bitmap.Add(10L), IsOk()); + ASSERT_FALSE(bitmap.IsEmpty()); +} + +TEST(RoaringPositionBitmapTest, TestOptimize) { + RoaringPositionBitmap bitmap; + ASSERT_THAT(bitmap.AddRange(0, 10000), IsOk()); + size_t size_before = bitmap.SerializedSizeInBytes(); + + bool changed = bitmap.Optimize(); + ASSERT_TRUE(changed); + + // RLE optimization should reduce size for this dense range + size_t size_after = bitmap.SerializedSizeInBytes(); + ASSERT_GT(size_before, size_after); + + // Content should be unchanged after RLE optimization + std::set expected_positions; + for (int64_t i = 0; i < 10000; ++i) { + expected_positions.insert(i); + } + AssertEqualContent(bitmap, expected_positions); + + // Round-trip should preserve content after RLE + auto copy = RoundTripSerialize(bitmap); + AssertEqualContent(copy, expected_positions); +} + +TEST(RoaringPositionBitmapTest, TestUnsupportedPositions) { + RoaringPositionBitmap bitmap; + + // Negative position + ASSERT_THAT(bitmap.Add(-1L), IsError(ErrorKind::kInvalidArgument)); + + // Contains with negative position + ASSERT_THAT(bitmap.Contains(-1L), IsError(ErrorKind::kInvalidArgument)); + + // Position exceeding MAX_POSITION + ASSERT_THAT(bitmap.Add(RoaringPositionBitmap::kMaxPosition + 1L), + IsError(ErrorKind::kInvalidArgument)); + + // Contains with position exceeding MAX_POSITION + ASSERT_THAT(bitmap.Contains(RoaringPositionBitmap::kMaxPosition + 1L), + IsError(ErrorKind::kInvalidArgument)); +} + +TEST(RoaringPositionBitmapTest, TestRandomSparseBitmap) { + std::mt19937_64 rng(42); + RoaringPositionBitmap bitmap; + std::set positions; + + std::uniform_int_distribution dist(0, int64_t{5} << 32); + + for (int i = 0; i < 100000; ++i) { + int64_t pos = dist(rng); + positions.insert(pos); + ASSERT_THAT(bitmap.Add(pos), IsOk()); + } + + AssertEqual(bitmap, positions); + + // Random lookups + std::mt19937_64 rng2(123); + std::uniform_int_distribution lookup_dist(0, + RoaringPositionBitmap::kMaxPosition); + for (int i = 0; i < 20000; ++i) { + int64_t pos = lookup_dist(rng2); + auto result = bitmap.Contains(pos); + ASSERT_THAT(result, IsOk()); + ASSERT_EQ(result.value(), positions.contains(pos)); + } +} + +TEST(RoaringPositionBitmapTest, TestRandomDenseBitmap) { + RoaringPositionBitmap bitmap; + std::set positions; + + // Create dense ranges across multiple bitmap keys + for (int64_t offset : {int64_t{0}, int64_t{2} << 32, int64_t{5} << 32}) { + for (int64_t i = 0; i < 10000; ++i) { + ASSERT_THAT(bitmap.Add(offset + i), IsOk()); + positions.insert(offset + i); + } + } + + AssertEqual(bitmap, positions); +} + +TEST(RoaringPositionBitmapTest, TestRandomMixedBitmap) { + std::mt19937_64 rng(42); + RoaringPositionBitmap bitmap; + std::set positions; + + // Sparse positions in [3<<32, 5<<32) + std::uniform_int_distribution dist1(int64_t{3} << 32, int64_t{5} << 32); + for (int i = 0; i < 50000; ++i) { + int64_t pos = dist1(rng); + positions.insert(pos); + ASSERT_THAT(bitmap.Add(pos), IsOk()); + } + + // Dense range in [0, 10000) + for (int64_t i = 0; i < 10000; ++i) { + ASSERT_THAT(bitmap.Add(i), IsOk()); + positions.insert(i); + } + + // More sparse positions in [0, 1<<32) + std::uniform_int_distribution dist2(0, int64_t{1} << 32); + for (int i = 0; i < 5000; ++i) { + int64_t pos = dist2(rng); + positions.insert(pos); + ASSERT_THAT(bitmap.Add(pos), IsOk()); + } + + AssertEqual(bitmap, positions); +} + +TEST(RoaringPositionBitmapTest, TestDeserializeInvalidData) { + // Buffer too small + auto result = RoaringPositionBitmap::Deserialize(""); + ASSERT_THAT(result, IsError(ErrorKind::kInvalidArgument)); + + // Invalid bitmap count (very large) + std::string buf(8, '\xFF'); + result = RoaringPositionBitmap::Deserialize(buf); + ASSERT_THAT(result, IsError(ErrorKind::kInvalidArgument)); +} + +TEST(RoaringPositionBitmapInteropTest, TestDeserializeSupportedRoaringExamples) { + // These .bin fixtures are copied from the Apache Iceberg Java repository's + // roaring position bitmap interoperability test resources. + static const std::vector kCases = { + {.file_name = "64map32bitvals.bin", + .expected_shape = InteropBitmapShape::kOnly32BitPositions}, + {.file_name = "64mapempty.bin", .expected_shape = InteropBitmapShape::kEmpty}, + {.file_name = "64mapspreadvals.bin", + .expected_shape = InteropBitmapShape::kSpreadAcrossKeys}, + }; + + for (const auto& test_case : kCases) { + SCOPED_TRACE(test_case.file_name); + std::string data = ReadTestResource(test_case.file_name); + auto result = RoaringPositionBitmap::Deserialize(data); + ASSERT_THAT(result, IsOk()); + + const auto& bitmap = result.value(); + AssertInteropBitmapShape(bitmap, test_case.expected_shape); + + std::set positions; + bitmap.ForEach([&](int64_t pos) { positions.insert(pos); }); + AssertEqualContent(bitmap, positions); + + auto copy = RoundTripSerialize(bitmap); + AssertEqualContent(copy, positions); + } +} + +TEST(RoaringPositionBitmapInteropTest, TestDeserializeUnsupportedRoaringExample) { + // This file is copied from the Apache Iceberg Java repository and it contains + // a value with key larger than max supported + std::string data = ReadTestResource("64maphighvals.bin"); + auto result = RoaringPositionBitmap::Deserialize(data); + ASSERT_THAT(result, IsError(ErrorKind::kInvalidArgument)); + ASSERT_THAT(result, HasErrorMessage("Invalid unsigned key")); +} + +} // namespace iceberg