From a6aaad4131fb4991836b227a287c2c03a0e29010 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 20 Oct 2025 16:30:10 +0200 Subject: [PATCH 01/12] Add unpack epilogue --- .../arrow/util/bit_stream_utils_internal.h | 15 +-- cpp/src/arrow/util/bit_util.h | 10 ++ .../arrow/util/bpacking_dispatch_internal.h | 91 ++++++++++++++++++- cpp/src/arrow/util/bpacking_test.cc | 81 ++++++++++------- 4 files changed, 149 insertions(+), 48 deletions(-) diff --git a/cpp/src/arrow/util/bit_stream_utils_internal.h b/cpp/src/arrow/util/bit_stream_utils_internal.h index cf039a9ac9f..3b7252d6ba8 100644 --- a/cpp/src/arrow/util/bit_stream_utils_internal.h +++ b/cpp/src/arrow/util/bit_stream_utils_internal.h @@ -339,20 +339,9 @@ inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) { int num_unpacked = ::arrow::internal::unpack( buffer + byte_offset, reinterpret_cast(v + i), batch_size - i, num_bits); - i += num_unpacked; - byte_offset += num_unpacked * num_bits / 8; + ARROW_DCHECK_EQ(batch_size - i, num_unpacked); - buffered_values = - detail::ReadLittleEndianWord(buffer + byte_offset, max_bytes - byte_offset); - - for (; i < batch_size; ++i) { - detail::GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset, - &buffered_values); - } - - bit_offset_ = bit_offset; - byte_offset_ = byte_offset; - buffered_values_ = buffered_values; + this->Advance(batch_size * num_bits); return batch_size; } diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h index 8d4811ede79..e72eca74e86 100644 --- a/cpp/src/arrow/util/bit_util.h +++ b/cpp/src/arrow/util/bit_util.h @@ -118,6 +118,16 @@ constexpr uint64_t LeastSignificantBitMask(int64_t bit_index) { return (static_cast(1) << bit_index) - 1; } +// Returns a mask for the bit_index lower order bits. +// Only valid for bit_index in the range [0, sizeof(Uint)]. +template +constexpr auto LeastSignificantBitMaskInc(Uint bit_index) { + if (bit_index == 8 * sizeof(Uint)) { + return ~Uint{0}; + } + return (Uint{1} << bit_index) - Uint{1}; +} + // Returns 'value' rounded up to the nearest multiple of 'factor' constexpr int64_t RoundUp(int64_t value, int64_t factor) { return CeilDiv(value, factor) * factor; diff --git a/cpp/src/arrow/util/bpacking_dispatch_internal.h b/cpp/src/arrow/util/bpacking_dispatch_internal.h index a2319c05701..f1c70dde861 100644 --- a/cpp/src/arrow/util/bpacking_dispatch_internal.h +++ b/cpp/src/arrow/util/bpacking_dispatch_internal.h @@ -17,11 +17,14 @@ #pragma once +#include #include #include +#include "arrow/util/bit_util.h" #include "arrow/util/endian.h" #include "arrow/util/logging.h" +#include "arrow/util/macros.h" #include "arrow/util/ubsan.h" namespace arrow::internal { @@ -50,6 +53,85 @@ int unpack_full(const uint8_t* in, Uint* out, int batch_size) { return batch_size; } +/// Compute the maximum spread in bytes that a packed integer can cover. +/// +/// This is assuming contiguous packed integer starting on a byte aligned boundary. +/// This function is non-monotonic, for instance three bit integers will be split on the +/// first byte boundary (hence having a spread of two bytes) while four bit integer will +/// be well behaved and never spread over byte boundary (hence having a spread of one). +constexpr int PackedMaxSpreadBytes(int width) { + int max = static_cast(bit_util::BytesForBits(width)); + int start = width; + while (start % 8 != 0) { + const int byte_start = start / 8; + const int byte_end = (start + width - 1) / 8; // inclusive end bit + const int spread = byte_end - byte_start + 1; + max = spread > max ? spread : max; + start += width; + } + return max; +} + +// Integer type that tries to contain as much as the spread as possible. +template +using SpreadBufferUint = + std::conditional_t<(kSpreadBytes <= sizeof(uint32_t)), uint_fast32_t, uint_fast64_t>; + +/// Unpack integers. +/// This function works for all input batch sizes but is not the fastest. +template +int unpack_epilog(const uint8_t* in, Uint* out, int batch_size) { + constexpr int kMaxSpreadBytes = PackedMaxSpreadBytes(kPackedBitWidth); + using buffer_uint = SpreadBufferUint; + constexpr int kBufferSize = sizeof(buffer_uint); + // Due to misalignment, on large bit width, the spread can be larger than the maximum + // size integer. For instance a 63 bit width misaligned packed integer can spread over 9 + // aligned bytes. + constexpr bool kOversized = kBufferSize < kMaxSpreadBytes; + constexpr buffer_uint kLowMask = + bit_util::LeastSignificantBitMaskInc(kPackedBitWidth); + + // Looping over values one by one + for (int k = 0; k < batch_size; ++k) { + const int start_bit = k * kPackedBitWidth; + const int start_byte = start_bit / 8; + const int spread_bytes = ((start_bit + kPackedBitWidth - 1) / 8) - start_byte + 1; + ARROW_COMPILER_ASSUME(spread_bytes <= kMaxSpreadBytes); + + // Reading the bytes for the current value. + // Must be careful not to read out of input bounds. + buffer_uint buffer = 0; + if constexpr (kOversized) { + // We read the max possible bytes in the first pass and handle the rest after. + // Even though the worst spread does not happen on all iterations we can still read + // all bytes because we will mask them. + std::memcpy(&buffer, in + start_byte, std::min(kBufferSize, spread_bytes)); + } else { + std::memcpy(&buffer, in + start_byte, spread_bytes); + } + + buffer = bit_util::FromLittleEndian(buffer); + const int bit_offset = start_bit % 8; + buffer >>= bit_offset; + Uint val = static_cast(buffer & kLowMask); + + // Handle the oversized bytes + if constexpr (kOversized) { + // The oversized bytes do not happen at all iterations + if (spread_bytes > kBufferSize) { + std::memcpy(&buffer, in + start_byte + kBufferSize, spread_bytes - kBufferSize); + buffer = bit_util::FromLittleEndian(buffer); + buffer <<= 8 * kBufferSize - bit_offset; + val |= static_cast(buffer & kLowMask); + } + } + + out[k] = val; + } + + return batch_size; +} + /// Unpack a packed array, delegating to a Unpacker struct. /// /// @tparam kPackedBitWidth The width in bits of the values in the packed array. @@ -61,15 +143,18 @@ template typename Unpacker, typename UnpackedUInt> int unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size) { using UnpackerForWidth = Unpacker; - constexpr auto kValuesUnpacked = UnpackerForWidth::kValuesUnpacked; - batch_size = batch_size / kValuesUnpacked * kValuesUnpacked; - int num_loops = batch_size / kValuesUnpacked; + const int num_loops = batch_size / kValuesUnpacked; for (int i = 0; i < num_loops; ++i) { in = UnpackerForWidth::unpack(in, out + i * kValuesUnpacked); } + const auto epilog_size = batch_size - num_loops * kValuesUnpacked; + ARROW_COMPILER_ASSUME(epilog_size < kValuesUnpacked); + ARROW_COMPILER_ASSUME(epilog_size >= 0); + unpack_epilog(in, out + num_loops * kValuesUnpacked, epilog_size); + return batch_size; } diff --git a/cpp/src/arrow/util/bpacking_test.cc b/cpp/src/arrow/util/bpacking_test.cc index 9a3e31b5893..0df17a55fa4 100644 --- a/cpp/src/arrow/util/bpacking_test.cc +++ b/cpp/src/arrow/util/bpacking_test.cc @@ -19,10 +19,9 @@ #include -#include "arrow/result.h" -#include "arrow/testing/gtest_util.h" #include "arrow/testing/util.h" #include "arrow/util/bit_stream_utils_internal.h" +#include "arrow/util/bit_util.h" #include "arrow/util/bpacking_internal.h" #include "arrow/util/bpacking_scalar_internal.h" #include "arrow/util/bpacking_simd_internal.h" @@ -38,19 +37,14 @@ template using UnpackFunc = int (*)(const uint8_t*, Int*, int, int); /// Get the number of bytes associate with a packing. -Result GetNumBytes(int32_t num_values, int32_t bit_width) { - const auto num_bits = num_values * bit_width; - if (num_bits % 8 != 0) { - return Status::NotImplemented( - "The unpack functions only work on a multiple of 8 bits."); - } - return num_bits / 8; +int32_t GetNumBytes(int32_t num_values, int32_t bit_width) { + return static_cast(bit_util::BytesForBits(num_values * bit_width)); } /// Generate random bytes as packed integers. std::vector GenerateRandomPackedValues(int32_t num_values, int32_t bit_width) { constexpr uint32_t kSeed = 3214; - EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width)); + const auto num_bytes = GetNumBytes(num_values, bit_width); std::vector out(std::max(1, num_bytes)); // We need a valid pointer for size 0 random_bytes(num_bytes, kSeed, out.data()); @@ -75,7 +69,7 @@ std::vector UnpackValues(const uint8_t* packed, int32_t num_values, template std::vector PackValues(const std::vector& values, int32_t num_values, int32_t bit_width) { - EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width)); + const auto num_bytes = GetNumBytes(num_values, bit_width); std::vector out(static_cast(num_bytes)); bit_util::BitWriter writer(out.data(), num_bytes); @@ -85,6 +79,7 @@ std::vector PackValues(const std::vector& values, int32_t num_valu throw std::runtime_error("Cannot write move values"); } } + writer.Flush(); return out; } @@ -92,15 +87,32 @@ std::vector PackValues(const std::vector& values, int32_t num_valu template void CheckUnpackPackRoundtrip(const uint8_t* packed, int32_t num_values, int32_t bit_width, UnpackFunc unpack) { - EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width)); + const auto num_bytes = GetNumBytes(num_values, bit_width); const auto unpacked = UnpackValues(packed, num_values, bit_width, unpack); EXPECT_EQ(unpacked.size(), num_values); const auto roundtrip = PackValues(unpacked, num_values, bit_width); EXPECT_EQ(num_bytes, roundtrip.size()); - for (int i = 0; i < num_bytes; ++i) { + + // Checking all bytes but the last (that may not fall aligned) + for (int i = 0; i < num_bytes - 1; ++i) { EXPECT_EQ(packed[i], roundtrip[i]) << "differ in position " << i; } + + // Checking last byte + if (num_bytes >= 1) { + const int i = num_bytes - 1; + const int last_bits_cnt = (num_values * bit_width) % 8; + + if (last_bits_cnt == 0) { + // Properly aligned, this is the same check as before + EXPECT_EQ(packed[i], roundtrip[i]) << "differ in position " << i; + } else { + // We need to mask the last bits in the packed data that are arbitrary and not used. + const auto mask = static_cast((1 << last_bits_cnt) - 1); + EXPECT_EQ(packed[i] & mask, roundtrip[i] & mask) << "differ in position " << i; + } + } } const uint8_t* GetNextAlignedByte(const uint8_t* ptr, std::size_t alignment) { @@ -119,10 +131,8 @@ const uint8_t* GetNextAlignedByte(const uint8_t* ptr, std::size_t alignment) { class TestUnpack : public ::testing::TestWithParam { protected: template - void TestRoundtripAlignment(UnpackFunc unpack, int bit_width, + void TestRoundtripAlignment(UnpackFunc unpack, int num_values, int bit_width, std::size_t alignment_offset) { - int num_values = GetParam(); - // Assume std::vector allocation is likely be aligned for greater than a byte. // So we allocate more values than necessary and skip to the next byte with the // desired (non) alignment to test the proper condition. @@ -135,9 +145,8 @@ class TestUnpack : public ::testing::TestWithParam { } template - void TestUnpackZeros(UnpackFunc unpack, int bit_width) { - int num_values = GetParam(); - EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width)); + void TestUnpackZeros(UnpackFunc unpack, int num_values, int bit_width) { + const auto num_bytes = GetNumBytes(num_values, bit_width); const std::vector packed(static_cast(num_bytes), uint8_t{0}); const auto unpacked = UnpackValues(packed.data(), num_values, bit_width, unpack); @@ -147,9 +156,8 @@ class TestUnpack : public ::testing::TestWithParam { } template - void TestUnpackOnes(UnpackFunc unpack, int bit_width) { - int num_values = GetParam(); - EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width)); + void TestUnpackOnes(UnpackFunc unpack, int num_values, int bit_width) { + const auto num_bytes = GetNumBytes(num_values, bit_width); const std::vector packed(static_cast(num_bytes), uint8_t{0xFF}); const auto unpacked = UnpackValues(packed.data(), num_values, bit_width, unpack); @@ -168,9 +176,8 @@ class TestUnpack : public ::testing::TestWithParam { } template - void TestUnpackAlternating(UnpackFunc unpack, int bit_width) { - int num_values = GetParam(); - EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width)); + void TestUnpackAlternating(UnpackFunc unpack, int num_values, int bit_width) { + const auto num_bytes = GetNumBytes(num_values, bit_width); const std::vector packed(static_cast(num_bytes), uint8_t{0xAA}); const auto unpacked = UnpackValues(packed.data(), num_values, bit_width, unpack); @@ -198,20 +205,30 @@ class TestUnpack : public ::testing::TestWithParam { template void TestAll(UnpackFunc unpack) { + const int num_values_base = GetParam(); + constexpr int kMaxBitWidth = std::is_same_v ? 1 : 8 * sizeof(Int); // Given how many edge cases there are in unpacking integers, it is best to test all // sizes for (int bit_width = 0; bit_width <= kMaxBitWidth; ++bit_width) { SCOPED_TRACE(::testing::Message() << "Testing bit_width=" << bit_width); - // Known values - TestUnpackZeros(unpack, bit_width); - TestUnpackOnes(unpack, bit_width); - TestUnpackAlternating(unpack, bit_width); + // Similarly, we test all epilogue sizes. That is extra values that could make it + // fall outside of an SIMD register + for (int epilogue_size = 0; epilogue_size <= kMaxBitWidth; ++epilogue_size) { + SCOPED_TRACE(::testing::Message() << "Testing epilogue_size=" << epilogue_size); - // Roundtrips - TestRoundtripAlignment(unpack, bit_width, /* alignment_offset= */ 0); - TestRoundtripAlignment(unpack, bit_width, /* alignment_offset= */ 1); + const int num_values = num_values_base + epilogue_size; + + // Known values + TestUnpackZeros(unpack, num_values, bit_width); + TestUnpackOnes(unpack, num_values, bit_width); + TestUnpackAlternating(unpack, num_values, bit_width); + + // Roundtrips + TestRoundtripAlignment(unpack, num_values, bit_width, /* alignment_offset= */ 0); + TestRoundtripAlignment(unpack, num_values, bit_width, /* alignment_offset= */ 1); + } } } }; From c669b7770b377fb1dbb99800a925a655767fe03a Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 20 Oct 2025 18:02:35 +0200 Subject: [PATCH 02/12] Try smaller integer sizes --- cpp/src/arrow/util/bpacking_dispatch_internal.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_dispatch_internal.h b/cpp/src/arrow/util/bpacking_dispatch_internal.h index f1c70dde861..9bd63532db0 100644 --- a/cpp/src/arrow/util/bpacking_dispatch_internal.h +++ b/cpp/src/arrow/util/bpacking_dispatch_internal.h @@ -74,8 +74,11 @@ constexpr int PackedMaxSpreadBytes(int width) { // Integer type that tries to contain as much as the spread as possible. template -using SpreadBufferUint = - std::conditional_t<(kSpreadBytes <= sizeof(uint32_t)), uint_fast32_t, uint_fast64_t>; +using SpreadBufferUint = std::conditional_t< + (kSpreadBytes <= sizeof(uint8_t)), uint_fast8_t, + std::conditional_t<(kSpreadBytes <= sizeof(uint16_t)), uint_fast16_t, + std::conditional_t<(kSpreadBytes <= sizeof(uint32_t)), + uint_fast32_t, uint_fast64_t>>>; /// Unpack integers. /// This function works for all input batch sizes but is not the fastest. From 77fd67c834b949d5536df7a8f5b49476a8779cbc Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 21 Oct 2025 10:22:52 +0200 Subject: [PATCH 03/12] void return type --- .../arrow/util/bit_stream_utils_internal.h | 5 +- cpp/src/arrow/util/bpacking.cc | 12 +- cpp/src/arrow/util/bpacking_benchmark.cc | 2 +- .../arrow/util/bpacking_dispatch_internal.h | 16 +-- cpp/src/arrow/util/bpacking_internal.h | 29 ++--- cpp/src/arrow/util/bpacking_scalar.cc | 12 +- cpp/src/arrow/util/bpacking_scalar_internal.h | 40 +++--- cpp/src/arrow/util/bpacking_simd_avx2.cc | 12 +- cpp/src/arrow/util/bpacking_simd_avx512.cc | 12 +- cpp/src/arrow/util/bpacking_simd_default.cc | 12 +- cpp/src/arrow/util/bpacking_simd_internal.h | 114 +++++++++--------- cpp/src/arrow/util/bpacking_test.cc | 9 +- 12 files changed, 135 insertions(+), 140 deletions(-) diff --git a/cpp/src/arrow/util/bit_stream_utils_internal.h b/cpp/src/arrow/util/bit_stream_utils_internal.h index 3b7252d6ba8..543df6d9e79 100644 --- a/cpp/src/arrow/util/bit_stream_utils_internal.h +++ b/cpp/src/arrow/util/bit_stream_utils_internal.h @@ -337,9 +337,8 @@ inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) { using unpack_t = typename internal_bit_reader::unpack_detect::type; - int num_unpacked = ::arrow::internal::unpack( - buffer + byte_offset, reinterpret_cast(v + i), batch_size - i, num_bits); - ARROW_DCHECK_EQ(batch_size - i, num_unpacked); + ::arrow::internal::unpack(buffer + byte_offset, reinterpret_cast(v + i), + batch_size - i, num_bits); this->Advance(batch_size * num_bits); diff --git a/cpp/src/arrow/util/bpacking.cc b/cpp/src/arrow/util/bpacking.cc index 369f361d9a6..95990dc1866 100644 --- a/cpp/src/arrow/util/bpacking.cc +++ b/cpp/src/arrow/util/bpacking.cc @@ -50,7 +50,7 @@ struct UnpackDynamicFunction { } // namespace template -int unpack(const uint8_t* in, Uint* out, int batch_size, int num_bits) { +void unpack(const uint8_t* in, Uint* out, int batch_size, int num_bits) { #if defined(ARROW_HAVE_NEON) return unpack_neon(in, out, batch_size, num_bits); #else @@ -59,10 +59,10 @@ int unpack(const uint8_t* in, Uint* out, int batch_size, int num_bits) { #endif } -template int unpack(const uint8_t*, bool*, int, int); -template int unpack(const uint8_t*, uint8_t*, int, int); -template int unpack(const uint8_t*, uint16_t*, int, int); -template int unpack(const uint8_t*, uint32_t*, int, int); -template int unpack(const uint8_t*, uint64_t*, int, int); +template void unpack(const uint8_t*, bool*, int, int); +template void unpack(const uint8_t*, uint8_t*, int, int); +template void unpack(const uint8_t*, uint16_t*, int, int); +template void unpack(const uint8_t*, uint32_t*, int, int); +template void unpack(const uint8_t*, uint64_t*, int, int); } // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_benchmark.cc b/cpp/src/arrow/util/bpacking_benchmark.cc index 144da6ea878..82804d8efd8 100644 --- a/cpp/src/arrow/util/bpacking_benchmark.cc +++ b/cpp/src/arrow/util/bpacking_benchmark.cc @@ -33,7 +33,7 @@ namespace arrow::internal { namespace { template -using UnpackFunc = int (*)(const uint8_t*, Int*, int, int); +using UnpackFunc = void (*)(const uint8_t*, Int*, int, int); /// Get the number of bytes associate with a packing. constexpr int32_t GetNumBytes(int32_t num_values, int32_t bit_width) { diff --git a/cpp/src/arrow/util/bpacking_dispatch_internal.h b/cpp/src/arrow/util/bpacking_dispatch_internal.h index 9bd63532db0..cb204cf9cb0 100644 --- a/cpp/src/arrow/util/bpacking_dispatch_internal.h +++ b/cpp/src/arrow/util/bpacking_dispatch_internal.h @@ -31,15 +31,14 @@ namespace arrow::internal { /// Unpack a zero bit packed array. template -int unpack_null(const uint8_t* in, Uint* out, int batch_size) { +void unpack_null(const uint8_t* in, Uint* out, int batch_size) { std::memset(out, 0, batch_size * sizeof(Uint)); - return batch_size; } /// Unpack a packed array where packed and unpacked values have exactly the same number of /// bits. template -int unpack_full(const uint8_t* in, Uint* out, int batch_size) { +void unpack_full(const uint8_t* in, Uint* out, int batch_size) { if constexpr (ARROW_LITTLE_ENDIAN == 1) { std::memcpy(out, in, batch_size * sizeof(Uint)); } else { @@ -50,7 +49,6 @@ int unpack_full(const uint8_t* in, Uint* out, int batch_size) { out[k] = FromLittleEndian(SafeLoadAs(in + (k * sizeof(Uint)))); } } - return batch_size; } /// Compute the maximum spread in bytes that a packed integer can cover. @@ -144,7 +142,7 @@ int unpack_epilog(const uint8_t* in, Uint* out, int batch_size) { /// @tparam UnpackedUInt The type in which we unpack the values. template typename Unpacker, typename UnpackedUInt> -int unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size) { +void unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size) { using UnpackerForWidth = Unpacker; constexpr auto kValuesUnpacked = UnpackerForWidth::kValuesUnpacked; @@ -157,13 +155,11 @@ int unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size) { ARROW_COMPILER_ASSUME(epilog_size < kValuesUnpacked); ARROW_COMPILER_ASSUME(epilog_size >= 0); unpack_epilog(in, out + num_loops * kValuesUnpacked, epilog_size); - - return batch_size; } template