Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
81 commits
Select commit Hold shift + click to select a range
f36e6bd
Add SSE4.2 implementation
AntoinePrv Oct 27, 2025
d019318
Add unpack uint8_t benchmark
AntoinePrv Oct 28, 2025
fdfd354
Add bool unpack benchmark
AntoinePrv Oct 30, 2025
6826466
Bias benchmarks toward small scale
AntoinePrv Nov 25, 2025
a14a070
Add Kernel plan builder
AntoinePrv Oct 20, 2025
4300cc1
Add simd kernel
AntoinePrv Oct 24, 2025
f38f774
Handle rshifts on SSE2
AntoinePrv Oct 27, 2025
8b52a54
Use new kernel when possible in generated 128 code
AntoinePrv Oct 27, 2025
a992186
Refactor array to xsimd::batch_constant
AntoinePrv Oct 27, 2025
3e3d2fa
Refactor right shift
AntoinePrv Oct 27, 2025
77b118d
Add oversized plan
AntoinePrv Oct 28, 2025
524ac1b
Add oversized kernel
AntoinePrv Oct 28, 2025
454decc
Rename kernels
AntoinePrv Oct 28, 2025
fd3ae27
Add simd kernel dispatch
AntoinePrv Oct 28, 2025
2525e4b
Call Simd kernel directly
AntoinePrv Oct 28, 2025
05f6e7c
Fix SIMD level None
AntoinePrv Oct 29, 2025
d0d9064
Initialize swizzles to -1
AntoinePrv Oct 29, 2025
f16708a
Doc
AntoinePrv Oct 29, 2025
de6baeb
Improve test error message
AntoinePrv Oct 29, 2025
e30aebe
Use new kernel in avx2
AntoinePrv Oct 28, 2025
f607991
AVX2 swizzle fallback
AntoinePrv Oct 29, 2025
2210fb4
Remove dead code
AntoinePrv Oct 30, 2025
1dc32a7
Simplify Large masks
AntoinePrv Oct 30, 2025
e02a74a
Remove bpacking 256 generated file
AntoinePrv Oct 30, 2025
91bf34a
Remove uint8_t fallback
AntoinePrv Oct 30, 2025
a939f29
Add boolean simd implementation
AntoinePrv Oct 30, 2025
77fb735
Use std::is_base_of for arch detection
AntoinePrv Oct 30, 2025
51ce7d6
Improve swizzle
AntoinePrv Nov 17, 2025
f23cd66
Only use lshift hack when available
AntoinePrv Nov 17, 2025
d38df81
Fix return type
AntoinePrv Nov 17, 2025
415ebaa
Fix shift included size
AntoinePrv Nov 18, 2025
cf6b56d
Add Avx2 uint16_t shift fallback
AntoinePrv Nov 19, 2025
99301c5
Refactor make_mult
AntoinePrv Nov 19, 2025
3168e1c
Add Avx2 lshift unint8_t fallback
AntoinePrv Nov 19, 2025
9db46a6
Refactor right shift excess
AntoinePrv Nov 19, 2025
e2c7367
Refactor make_mult
AntoinePrv Nov 20, 2025
5db07c7
Add SSE var shift uint8_t fallback to uint16_t
AntoinePrv Nov 20, 2025
d5b9eca
Implement size reading reduction
AntoinePrv Nov 20, 2025
eb8cec0
Add fallback Avx2 right shift
AntoinePrv Nov 24, 2025
b91c087
Refactor static dispatch
AntoinePrv Nov 26, 2025
be9abd3
Forward oversized to larger uint when possible
AntoinePrv Nov 26, 2025
1551710
Add arch detection functions
AntoinePrv Nov 26, 2025
32335ee
Refactor traits usage
AntoinePrv Nov 26, 2025
11f79b4
Forward x86_64 unpack64 to unpack32
AntoinePrv Nov 26, 2025
2954c29
Simplify template usage
AntoinePrv Nov 26, 2025
e3744cd
Reorganize and doc
AntoinePrv Nov 26, 2025
c2fc546
Refactor KernelDispatch and remove Oversized dispatch
AntoinePrv Nov 26, 2025
16110cb
Forward large unpack8 to unpack16 on SSE2
AntoinePrv Nov 26, 2025
96410eb
Use fallback right shift on large uint8_t avx2
AntoinePrv Nov 26, 2025
4102591
Fix enable_if
AntoinePrv Nov 27, 2025
960bd9c
Add missing header
AntoinePrv Nov 27, 2025
d28d015
fmt
AntoinePrv Nov 27, 2025
4962eee
Add SSE4.2 to dynamic dispatch
AntoinePrv Nov 27, 2025
0a0b314
Rename bpacking_simd_impl > bpacking_simd_kernel
AntoinePrv Nov 27, 2025
4a07ab0
Restore modifications to simd_codegen
AntoinePrv Nov 27, 2025
68cddfb
Reduce reading size and declare bytes read
AntoinePrv Nov 27, 2025
227b776
Add kBytesRead to scalar code
AntoinePrv Nov 27, 2025
17a7231
Add kBytesRead to simd 512 generated code
AntoinePrv Nov 27, 2025
c51879e
Prevent overreading
AntoinePrv Nov 27, 2025
3e86901
Fix pessimit overeading guard
AntoinePrv Nov 28, 2025
6a61a87
Fix overreading guard comparison
AntoinePrv Dec 1, 2025
5a000fc
Add UnpackOptions and max_read_bytes
AntoinePrv Dec 1, 2025
e6e097a
Use C++20 NTTP
AntoinePrv Jan 7, 2026
743577f
xsimd 14.0 compatibility
AntoinePrv Jan 8, 2026
cecd14f
fmt
AntoinePrv Jan 21, 2026
c0ee9d5
C++20 NTTP options
AntoinePrv Jan 23, 2026
1cff8bc
Homogenous wording
AntoinePrv Jan 23, 2026
57f278b
Remove xsimd backward compatibility
AntoinePrv Feb 3, 2026
43c8694
Apply doc fixes from code review
AntoinePrv Feb 6, 2026
251437e
Documentation and code improvements
AntoinePrv Feb 6, 2026
5ed131a
Move utilities into bpacking sub ns
AntoinePrv Feb 9, 2026
f64ad1d
Refactor plan builders
AntoinePrv Feb 9, 2026
8ea86be
Move utilities
AntoinePrv Feb 9, 2026
e68f936
Kernel documentation
AntoinePrv Feb 9, 2026
3c27968
adjust_bytes_per_read doc
AntoinePrv Feb 9, 2026
d050ee1
Fewer typename
AntoinePrv Feb 9, 2026
1329b39
Add documentation
AntoinePrv Feb 9, 2026
19a32e3
Fix bounds in plan builders
AntoinePrv Feb 10, 2026
b431b19
Change names
AntoinePrv Feb 10, 2026
22dff86
Add extra comments
AntoinePrv Feb 11, 2026
b638570
Fix comments
AntoinePrv Feb 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions cpp/src/arrow/util/bit_stream_utils_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -203,8 +203,9 @@ inline bool BitWriter::PutValue(uint64_t v, int num_bits) {

if (ARROW_PREDICT_FALSE(static_cast<int64_t>(byte_offset_) * 8 + bit_offset_ +
num_bits >
static_cast<int64_t>(max_bytes_) * 8))
static_cast<int64_t>(max_bytes_) * 8)) {
return false;
}

buffered_values_ |= v << bit_offset_;
bit_offset_ += num_bits;
Expand Down Expand Up @@ -273,14 +274,19 @@ inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {
batch_size = static_cast<int>(remaining_bits / num_bits);
}

const ::arrow::internal::UnpackOptions opts{
.batch_size = batch_size,
.bit_width = num_bits,
.bit_offset = bit_offset_,
.max_read_bytes = max_bytes_ - byte_offset_,
};

if constexpr (std::is_same_v<T, bool>) {
::arrow::internal::unpack(buffer_ + byte_offset_, v, batch_size, num_bits,
bit_offset_);
::arrow::internal::unpack(buffer_ + byte_offset_, v, opts);

} else {
::arrow::internal::unpack(buffer_ + byte_offset_,
reinterpret_cast<std::make_unsigned_t<T>*>(v), batch_size,
num_bits, bit_offset_);
reinterpret_cast<std::make_unsigned_t<T>*>(v), opts);
}

Advance(batch_size * num_bits);
Expand Down
31 changes: 16 additions & 15 deletions cpp/src/arrow/util/bpacking.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@

#include <array>

#include "arrow/util/bpacking_dispatch_internal.h"
#include "arrow/util/bpacking_internal.h"
#include "arrow/util/bpacking_scalar_internal.h"
#include "arrow/util/bpacking_simd_internal.h"
Expand All @@ -29,19 +28,21 @@ namespace {

template <typename Uint>
struct UnpackDynamicFunction {
using FunctionType = decltype(&unpack_scalar<Uint>);
using FunctionType = decltype(&bpacking::unpack_scalar<Uint>);
using Implementation = std::pair<DispatchLevel, FunctionType>;

static constexpr auto implementations() {
return std::array{
// Current SIMD unpack algorithm works terribly on SSE4.2 due to lack of variable
// rshift and poor xsimd fallback.
Implementation{DispatchLevel::NONE, &unpack_scalar<Uint>},
#if defined(ARROW_HAVE_SSE4_2)
Implementation{DispatchLevel::NONE, &bpacking::unpack_sse4_2<Uint>},
#else
Implementation{DispatchLevel::NONE, &bpacking::unpack_scalar<Uint>},
#endif
#if defined(ARROW_HAVE_RUNTIME_AVX2)
Implementation{DispatchLevel::AVX2, &unpack_avx2<Uint>},
Implementation{DispatchLevel::AVX2, &bpacking::unpack_avx2<Uint>},
#endif
#if defined(ARROW_HAVE_RUNTIME_AVX512)
Implementation{DispatchLevel::AVX512, &unpack_avx512<Uint>},
Implementation{DispatchLevel::AVX512, &bpacking::unpack_avx512<Uint>},
#endif
};
}
Expand All @@ -50,19 +51,19 @@ struct UnpackDynamicFunction {
} // namespace

template <typename Uint>
void unpack(const uint8_t* in, Uint* out, int batch_size, int num_bits, int bit_offset) {
void unpack(const uint8_t* in, Uint* out, const UnpackOptions& opts) {
#if defined(ARROW_HAVE_NEON)
return unpack_neon(in, out, batch_size, num_bits, bit_offset);
return bpacking::unpack_neon(in, out, opts);
#else
static DynamicDispatch<UnpackDynamicFunction<Uint> > dispatch;
return dispatch.func(in, out, batch_size, num_bits, bit_offset);
return dispatch.func(in, out, opts);
#endif
}

template void unpack<bool>(const uint8_t*, bool*, int, int, int);
template void unpack<uint8_t>(const uint8_t*, uint8_t*, int, int, int);
template void unpack<uint16_t>(const uint8_t*, uint16_t*, int, int, int);
template void unpack<uint32_t>(const uint8_t*, uint32_t*, int, int, int);
template void unpack<uint64_t>(const uint8_t*, uint64_t*, int, int, int);
template void unpack<bool>(const uint8_t*, bool*, const UnpackOptions&);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm curious, why not put all the unpack-related APIs inside arrow::internal::bpacking as well? Does it cause too much code churn, or would it fail for other reasons?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No reason, anything works really. My reasoning was that unpack is a "library-public" utility function, so it lives in arrow::internal, while arrow::internal::bpacking is "private" to the unpack function. Does that make sense?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Kind of, though we might want to revisit later anyway. Not necessary for this PR in any case!

template void unpack<uint8_t>(const uint8_t*, uint8_t*, const UnpackOptions&);
template void unpack<uint16_t>(const uint8_t*, uint16_t*, const UnpackOptions&);
template void unpack<uint32_t>(const uint8_t*, uint32_t*, const UnpackOptions&);
template void unpack<uint64_t>(const uint8_t*, uint64_t*, const UnpackOptions&);

} // namespace arrow::internal
129 changes: 109 additions & 20 deletions cpp/src/arrow/util/bpacking_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
// specific language governing permissions and limitations
// under the License.

#include <memory>
#include <stdexcept>
#include <vector>

Expand All @@ -33,7 +34,7 @@ namespace arrow::internal {
namespace {

template <typename Int>
using UnpackFunc = void (*)(const uint8_t*, Int*, int, int, int);
using UnpackFunc = void (*)(const uint8_t*, Int*, const UnpackOptions&);

/// Get the number of bytes associated with a packing.
constexpr int32_t GetNumBytes(int32_t num_values, int32_t bit_width) {
Expand Down Expand Up @@ -86,33 +87,62 @@ void BM_Unpack(benchmark::State& state, bool aligned, UnpackFunc<Int> unpack, bo
const uint8_t* packed_ptr =
GetNextAlignedByte(packed.data(), sizeof(Int)) + (aligned ? 0 : 1);

std::vector<Int> unpacked(num_values, 0);
auto unpacked = std::make_unique<Int[]>(num_values);

const ::arrow::internal::UnpackOptions opts{
.batch_size = num_values,
.bit_width = bit_width,
.bit_offset = 0,
.max_read_bytes = -1,
};

for (auto _ : state) {
unpack(packed_ptr, unpacked.data(), num_values, bit_width, /* bit_offset = */ 0);
unpack(packed_ptr, unpacked.get(), opts);
benchmark::ClobberMemory();
}
state.SetItemsProcessed(num_values * state.iterations());
}

constexpr int32_t kMinRange = 64;
constexpr int32_t kMaxRange = 32768;
// Currently, the minimum unpack SIMD kernel size is 32 and the RLE-bit-packing encoder
// will not emit runs larger than 512 (though other implementations might), so we bias
// the benchmarks towards a rather small scale.
static const auto kNumValuesRange = benchmark::CreateRange(32, 512, 2);
constexpr std::initializer_list<int64_t> kBitWidths8 = {1, 2, 8};
constexpr std::initializer_list<int64_t> kBitWidths16 = {1, 2, 8, 13};
constexpr std::initializer_list<int64_t> kBitWidths32 = {1, 2, 8, 20};
constexpr std::initializer_list<int64_t> kBitWidths64 = {1, 2, 8, 20, 47};

static const std::vector<std::vector<int64_t>> kBitWidthsNumValuesBool = {
{0, 1},
kNumValuesRange,
};
static const std::vector<std::vector<int64_t>> kBitWidthsNumValues8 = {
kBitWidths8,
kNumValuesRange,
};
static const std::vector<std::vector<int64_t>> kBitWidthsNumValues16 = {
kBitWidths16,
benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32),
kNumValuesRange,
};
static const std::vector<std::vector<int64_t>> kBitWidthsNumValues32 = {
kBitWidths32,
benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32),
kNumValuesRange,
};
static const std::vector<std::vector<int64_t>> kBitWidthsNumValues64 = {
kBitWidths64,
benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32),
kNumValuesRange,
};

/// MSVC nudge: a plain (non-template) entry point for use inside the
/// BENCHMARK_CAPTURE macro.
///
/// Forwards every argument unchanged to BM_Unpack<bool>.
void BM_UnpackBool(benchmark::State& state, bool aligned, UnpackFunc<bool> unpack,
                   bool skip = false, std::string skip_msg = "") {
  BM_Unpack<bool>(state, aligned, unpack, skip, std::move(skip_msg));
}
/// MSVC nudge: a plain (non-template) entry point for use inside the
/// BENCHMARK_CAPTURE macro.
///
/// Forwards every argument unchanged to BM_Unpack<uint8_t>.
void BM_UnpackUint8(benchmark::State& state, bool aligned, UnpackFunc<uint8_t> unpack,
                    bool skip = false, std::string skip_msg = "") {
  BM_Unpack<uint8_t>(state, aligned, unpack, skip, std::move(skip_msg));
}
/// Nudge for MSVC template inside BENCHMARK_CAPTURE macro.
void BM_UnpackUint16(benchmark::State& state, bool aligned, UnpackFunc<uint16_t> unpack,
bool skip = false, std::string skip_msg = "") {
Expand All @@ -129,52 +159,111 @@ void BM_UnpackUint64(benchmark::State& state, bool aligned, UnpackFunc<uint64_t>
return BM_Unpack<uint64_t>(state, aligned, unpack, skip, std::move(skip_msg));
}

BENCHMARK_CAPTURE(BM_UnpackUint16, ScalarUnaligned, false, &unpack_scalar<uint16_t>)
BENCHMARK_CAPTURE(BM_UnpackBool, ScalarUnaligned, false, &bpacking::unpack_scalar<bool>)
->ArgsProduct(kBitWidthsNumValuesBool);
BENCHMARK_CAPTURE(BM_UnpackUint8, ScalarUnaligned, false,
&bpacking::unpack_scalar<uint8_t>)
->ArgsProduct(kBitWidthsNumValues8);
BENCHMARK_CAPTURE(BM_UnpackUint16, ScalarUnaligned, false,
&bpacking::unpack_scalar<uint16_t>)
->ArgsProduct(kBitWidthsNumValues16);
BENCHMARK_CAPTURE(BM_UnpackUint32, ScalarUnaligned, false, &unpack_scalar<uint32_t>)
BENCHMARK_CAPTURE(BM_UnpackUint32, ScalarUnaligned, false,
&bpacking::unpack_scalar<uint32_t>)
->ArgsProduct(kBitWidthsNumValues32);
BENCHMARK_CAPTURE(BM_UnpackUint64, ScalarUnaligned, false, &unpack_scalar<uint64_t>)
BENCHMARK_CAPTURE(BM_UnpackUint64, ScalarUnaligned, false,
&bpacking::unpack_scalar<uint64_t>)
->ArgsProduct(kBitWidthsNumValues64);

#if defined(ARROW_HAVE_SSE4_2)
BENCHMARK_CAPTURE(BM_UnpackBool, Sse42Unaligned, false, &bpacking::unpack_sse4_2<bool>)
->ArgsProduct(kBitWidthsNumValuesBool);
BENCHMARK_CAPTURE(BM_UnpackUint8, Sse42Unaligned, false,
&bpacking::unpack_sse4_2<uint8_t>)
->ArgsProduct(kBitWidthsNumValues8);
BENCHMARK_CAPTURE(BM_UnpackUint16, Sse42Unaligned, false,
&bpacking::unpack_sse4_2<uint16_t>)
->ArgsProduct(kBitWidthsNumValues16);
BENCHMARK_CAPTURE(BM_UnpackUint32, Sse42Unaligned, false,
&bpacking::unpack_sse4_2<uint32_t>)
->ArgsProduct(kBitWidthsNumValues32);
BENCHMARK_CAPTURE(BM_UnpackUint64, Sse42Unaligned, false,
&bpacking::unpack_sse4_2<uint64_t>)
->ArgsProduct(kBitWidthsNumValues64);
#endif

#if defined(ARROW_HAVE_RUNTIME_AVX2)
BENCHMARK_CAPTURE(BM_UnpackUint16, Avx2Unaligned, false, &unpack_avx2<uint16_t>,
BENCHMARK_CAPTURE(BM_UnpackBool, Avx2Unaligned, false, &bpacking::unpack_avx2<bool>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
"Avx2 not available")
->ArgsProduct(kBitWidthsNumValuesBool);
BENCHMARK_CAPTURE(BM_UnpackUint8, Avx2Unaligned, false, &bpacking::unpack_avx2<uint8_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
"Avx2 not available")
->ArgsProduct(kBitWidthsNumValues8);
BENCHMARK_CAPTURE(BM_UnpackUint16, Avx2Unaligned, false, &bpacking::unpack_avx2<uint16_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
"Avx2 not available")
->ArgsProduct(kBitWidthsNumValues16);
BENCHMARK_CAPTURE(BM_UnpackUint32, Avx2Unaligned, false, &unpack_avx2<uint32_t>,
BENCHMARK_CAPTURE(BM_UnpackUint32, Avx2Unaligned, false, &bpacking::unpack_avx2<uint32_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
"Avx2 not available")
->ArgsProduct(kBitWidthsNumValues32);
BENCHMARK_CAPTURE(BM_UnpackUint64, Avx2Unaligned, false, &unpack_avx2<uint64_t>,
BENCHMARK_CAPTURE(BM_UnpackUint64, Avx2Unaligned, false, &bpacking::unpack_avx2<uint64_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
"Avx2 not available")
->ArgsProduct(kBitWidthsNumValues64);
#endif

#if defined(ARROW_HAVE_RUNTIME_AVX512)
BENCHMARK_CAPTURE(BM_UnpackUint16, Avx512Unaligned, false, &unpack_avx512<uint16_t>,
BENCHMARK_CAPTURE(BM_UnpackBool, Avx512Unaligned, false, &bpacking::unpack_avx512<bool>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
"Avx512 not available")
->ArgsProduct(kBitWidthsNumValuesBool);
BENCHMARK_CAPTURE(BM_UnpackUint8, Avx512Unaligned, false,
&bpacking::unpack_avx512<uint8_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
"Avx512 not available")
->ArgsProduct(kBitWidthsNumValues8);
BENCHMARK_CAPTURE(BM_UnpackUint16, Avx512Unaligned, false,
&bpacking::unpack_avx512<uint16_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
"Avx512 not available")
->ArgsProduct(kBitWidthsNumValues16);
BENCHMARK_CAPTURE(BM_UnpackUint32, Avx512Unaligned, false, &unpack_avx512<uint32_t>,
BENCHMARK_CAPTURE(BM_UnpackUint32, Avx512Unaligned, false,
&bpacking::unpack_avx512<uint32_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
"Avx512 not available")
->ArgsProduct(kBitWidthsNumValues32);
BENCHMARK_CAPTURE(BM_UnpackUint64, Avx512Unaligned, false, &unpack_avx512<uint64_t>,
BENCHMARK_CAPTURE(BM_UnpackUint64, Avx512Unaligned, false,
&bpacking::unpack_avx512<uint64_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
"Avx512 not available")
->ArgsProduct(kBitWidthsNumValues64);
#endif

#if defined(ARROW_HAVE_NEON)
BENCHMARK_CAPTURE(BM_UnpackUint16, NeonUnaligned, false, &unpack_neon<uint16_t>)
BENCHMARK_CAPTURE(BM_UnpackBool, NeonUnaligned, false, &bpacking::unpack_neon<bool>)
->ArgsProduct(kBitWidthsNumValuesBool);
BENCHMARK_CAPTURE(BM_UnpackUint8, NeonUnaligned, false, &bpacking::unpack_neon<uint8_t>)
->ArgsProduct(kBitWidthsNumValues8);
BENCHMARK_CAPTURE(BM_UnpackUint16, NeonUnaligned, false, &bpacking::unpack_neon<uint16_t>)
->ArgsProduct(kBitWidthsNumValues16);
BENCHMARK_CAPTURE(BM_UnpackUint32, NeonUnaligned, false, &unpack_neon<uint32_t>)
BENCHMARK_CAPTURE(BM_UnpackUint32, NeonUnaligned, false, &bpacking::unpack_neon<uint32_t>)
->ArgsProduct(kBitWidthsNumValues32);
BENCHMARK_CAPTURE(BM_UnpackUint64, NeonUnaligned, false, &unpack_neon<uint64_t>)
BENCHMARK_CAPTURE(BM_UnpackUint64, NeonUnaligned, false, &bpacking::unpack_neon<uint64_t>)
->ArgsProduct(kBitWidthsNumValues64);
#endif

BENCHMARK_CAPTURE(BM_UnpackBool, DynamicAligned, true, &unpack<bool>)
->ArgsProduct(kBitWidthsNumValuesBool);
BENCHMARK_CAPTURE(BM_UnpackBool, DynamicUnaligned, false, &unpack<bool>)
->ArgsProduct(kBitWidthsNumValuesBool);

BENCHMARK_CAPTURE(BM_UnpackUint8, DynamicAligned, true, &unpack<uint8_t>)
->ArgsProduct(kBitWidthsNumValues8);
BENCHMARK_CAPTURE(BM_UnpackUint8, DynamicUnaligned, false, &unpack<uint8_t>)
->ArgsProduct(kBitWidthsNumValues8);

BENCHMARK_CAPTURE(BM_UnpackUint16, DynamicAligned, true, &unpack<uint16_t>)
->ArgsProduct(kBitWidthsNumValues16);
BENCHMARK_CAPTURE(BM_UnpackUint16, DynamicUnaligned, false, &unpack<uint16_t>)
Expand Down
Loading
Loading