diff --git a/cpp/src/arrow/util/bit_run_reader.cc b/cpp/src/arrow/util/bit_run_reader.cc
index b1284151d5c..9edf3360890 100644
--- a/cpp/src/arrow/util/bit_run_reader.cc
+++ b/cpp/src/arrow/util/bit_run_reader.cc
@@ -45,7 +45,7 @@ BitRunReader::BitRunReader(const uint8_t* bitmap, int64_t start_offset, int64_t
 
   // Prepare for inversion in NextRun.
   // Clear out any preceding bits.
-  word_ = word_ & ~bit_util::LeastSignificantBitMask(position_);
+  word_ = word_ & ~bit_util::LeastSignificantBitMask(position_);
 }
 #endif
diff --git a/cpp/src/arrow/util/bit_run_reader.h b/cpp/src/arrow/util/bit_run_reader.h
index ed7be940a54..7bb00140279 100644
--- a/cpp/src/arrow/util/bit_run_reader.h
+++ b/cpp/src/arrow/util/bit_run_reader.h
@@ -106,7 +106,7 @@ class ARROW_EXPORT BitRunReader {
     int64_t start_bit_offset = start_position & 63;
     // Invert the word for proper use of CountTrailingZeros and
     // clear bits so CountTrailingZeros can do it magic.
-    word_ = ~word_ & ~bit_util::LeastSignificantBitMask(start_bit_offset);
+    word_ = ~word_ & ~bit_util::LeastSignificantBitMask(start_bit_offset);
 
     // Go forward until the next change from unset to set.
     int64_t new_bits = bit_util::CountTrailingZeros(word_) - start_bit_offset;
@@ -311,12 +311,12 @@ class BaseSetBitRunReader {
       memcpy(reinterpret_cast(&word) + 8 - num_bytes, bitmap_, num_bytes);
       // XXX MostSignificantBitmask
       return (bit_util::ToLittleEndian(word) << bit_offset) &
-             ~bit_util::LeastSignificantBitMask(64 - num_bits);
+             ~bit_util::LeastSignificantBitMask(64 - num_bits);
     } else {
       memcpy(&word, bitmap_, num_bytes);
       bitmap_ += num_bytes;
       return (bit_util::ToLittleEndian(word) >> bit_offset) &
-             bit_util::LeastSignificantBitMask(num_bits);
+             bit_util::LeastSignificantBitMask(num_bits);
     }
   }
diff --git a/cpp/src/arrow/util/bit_stream_utils_internal.h b/cpp/src/arrow/util/bit_stream_utils_internal.h
index cf039a9ac9f..1057a0bf381 100644
--- a/cpp/src/arrow/util/bit_stream_utils_internal.h
+++ b/cpp/src/arrow/util/bit_stream_utils_internal.h
@@ -19,7 +19,6 @@
 
 #pragma once
 
-#include
 #include
 #include
 #include
@@ -249,110 +248,36 @@ inline bool BitWriter::PutAligned(T val, int num_bytes) {
   return true;
 }
 
-namespace detail {
-
-template <typename T>
-inline void GetValue_(int num_bits, T* v, int max_bytes, const uint8_t* buffer,
-                      int* bit_offset, int* byte_offset, uint64_t* buffered_values) {
-#ifdef _MSC_VER
-#  pragma warning(push)
-#  pragma warning(disable : 4800)
-#endif
-  *v = static_cast<T>(bit_util::TrailingBits(*buffered_values, *bit_offset + num_bits) >>
-                      *bit_offset);
-#ifdef _MSC_VER
-#  pragma warning(pop)
-#endif
-  *bit_offset += num_bits;
-  if (*bit_offset >= 64) {
-    *byte_offset += 8;
-    *bit_offset -= 64;
-
-    *buffered_values =
-        detail::ReadLittleEndianWord(buffer + *byte_offset, max_bytes - *byte_offset);
-#ifdef _MSC_VER
-#  pragma warning(push)
-#  pragma warning(disable : 4800 4805)
-#endif
-    // Read bits of v that crossed into new buffered_values_
-    if (ARROW_PREDICT_TRUE(num_bits - *bit_offset < static_cast<int>(8 * sizeof(T)))) {
-      // if shift exponent(num_bits - *bit_offset) is not less than sizeof(T), *v will not
-      // change and the following code may cause a runtime error that the shift exponent
-      // is too large
-      *v = *v | static_cast<T>(bit_util::TrailingBits(*buffered_values, *bit_offset)
-                               << (num_bits - *bit_offset));
-    }
-#ifdef _MSC_VER
-#  pragma warning(pop)
-#endif
-    ARROW_DCHECK_LE(*bit_offset, 64);
-  }
-}
-
-}  // namespace detail
-
 template <typename T>
 inline bool BitReader::GetValue(int num_bits, T* v) {
   return GetBatch(num_bits, v, 1) == 1;
 }
 
-namespace internal_bit_reader {
-template <typename T>
-struct unpack_detect {
-  using type = std::make_unsigned_t<T>;
-};
-
-template <>
-struct unpack_detect<bool> {
-  using type = bool;
-};
-}  // namespace internal_bit_reader
-
 template <typename T>
 inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {
-  ARROW_DCHECK(buffer_ != NULL);
-  ARROW_DCHECK_LE(num_bits, static_cast<int>(sizeof(T) * 8)) << "num_bits: " << num_bits;
+  constexpr uint64_t kBitsPerByte = 8;
 
-  int bit_offset = bit_offset_;
-  int byte_offset = byte_offset_;
-  uint64_t buffered_values = buffered_values_;
-  int max_bytes = max_bytes_;
-  const uint8_t* buffer = buffer_;
+  ARROW_DCHECK(buffer_ != NULLPTR);
+  ARROW_DCHECK_LE(num_bits, static_cast<int>(sizeof(T) * 8)) << "num_bits: " << num_bits;
 
   const int64_t needed_bits = num_bits * static_cast<int64_t>(batch_size);
-  constexpr uint64_t kBitsPerByte = 8;
   const int64_t remaining_bits =
-      static_cast<int64_t>(max_bytes - byte_offset) * kBitsPerByte - bit_offset;
+      static_cast<int64_t>(max_bytes_ - byte_offset_) * kBitsPerByte - bit_offset_;
   if (remaining_bits < needed_bits) {
     batch_size = static_cast<int>(remaining_bits / num_bits);
   }
 
-  int i = 0;
-  if (ARROW_PREDICT_FALSE(bit_offset != 0)) {
-    for (; i < batch_size && bit_offset != 0; ++i) {
-      detail::GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset,
-                        &buffered_values);
-    }
-  }
-
-  using unpack_t = typename internal_bit_reader::unpack_detect<T>::type;
-
-  int num_unpacked = ::arrow::internal::unpack(
-      buffer + byte_offset, reinterpret_cast<unpack_t*>(v + i), batch_size - i, num_bits);
-  i += num_unpacked;
-  byte_offset += num_unpacked * num_bits / 8;
-
-  buffered_values =
-      detail::ReadLittleEndianWord(buffer + byte_offset, max_bytes - byte_offset);
+  if constexpr (std::is_same_v<T, bool>) {
+    ::arrow::internal::unpack(buffer_ + byte_offset_, v, batch_size, num_bits,
+                              bit_offset_);
-
-  for (; i < batch_size; ++i) {
-    detail::GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset,
-                      &buffered_values);
+  } else {
+    ::arrow::internal::unpack(buffer_ + byte_offset_,
+                              reinterpret_cast<std::make_unsigned_t<T>*>(v), batch_size,
+                              num_bits, bit_offset_);
   }
 
-  bit_offset_ = bit_offset;
-  byte_offset_ = byte_offset;
-  buffered_values_ = buffered_values;
+  Advance(batch_size * num_bits);
 
   return batch_size;
 }
diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h
index 8d4811ede79..52a42f538b5 100644
--- a/cpp/src/arrow/util/bit_util.h
+++ b/cpp/src/arrow/util/bit_util.h
@@ -113,9 +113,16 @@ constexpr bool IsMultipleOf64(int64_t n) { return (n & 63) == 0; }
 constexpr bool IsMultipleOf8(int64_t n) { return (n & 7) == 0; }
 
 // Returns a mask for the bit_index lower order bits.
-// Only valid for bit_index in the range [0, 64).
-constexpr uint64_t LeastSignificantBitMask(int64_t bit_index) {
-  return (static_cast<uint64_t>(1) << bit_index) - 1;
+// Valid for `bit_index` in the range `[0, 8 * sizeof(Uint)]` if `kAllowUpperBound`,
+// otherwise `[0, 8 * sizeof(Uint))`.
+template <bool kAllowUpperBound = false, typename Uint>
+constexpr auto LeastSignificantBitMask(Uint bit_index) {
+  if constexpr (kAllowUpperBound) {
+    if (bit_index == 8 * sizeof(Uint)) {
+      return ~Uint{0};
+    }
+  }
+  return (Uint{1} << bit_index) - Uint{1};
 }
 
 // Returns 'value' rounded up to the nearest multiple of 'factor'
diff --git a/cpp/src/arrow/util/bitmap_reader.h b/cpp/src/arrow/util/bitmap_reader.h
index d95fd921f48..83c142c559b 100644
--- a/cpp/src/arrow/util/bitmap_reader.h
+++ b/cpp/src/arrow/util/bitmap_reader.h
@@ -136,7 +136,7 @@ class BitmapUInt64Reader {
       memcpy(&word, bitmap_, num_bytes);
       bitmap_ += num_bytes;
       return (bit_util::ToLittleEndian(word) >> bit_offset) &
-             bit_util::LeastSignificantBitMask(num_bits);
+             bit_util::LeastSignificantBitMask(num_bits);
     }
 
   const uint8_t* bitmap_;
diff --git a/cpp/src/arrow/util/bpacking.cc b/cpp/src/arrow/util/bpacking.cc
index 369f361d9a6..fdb1c5a52ac 100644
--- a/cpp/src/arrow/util/bpacking.cc
+++ b/cpp/src/arrow/util/bpacking.cc
@@ -50,19 +50,19 @@ struct UnpackDynamicFunction {
 }  // namespace
 
 template <typename Uint>
-int unpack(const uint8_t* in, Uint* out, int batch_size, int num_bits) {
+void unpack(const uint8_t* in, Uint* out, int batch_size, int num_bits, int bit_offset) {
 #if defined(ARROW_HAVE_NEON)
-  return unpack_neon(in, out, batch_size, num_bits);
+  return unpack_neon(in, out, batch_size, num_bits, bit_offset);
 #else
   static DynamicDispatch<UnpackDynamicFunction<Uint>> dispatch;
-  return dispatch.func(in, out, batch_size, num_bits);
+  return dispatch.func(in, out, batch_size, num_bits, bit_offset);
 #endif
 }
 
-template int unpack(const uint8_t*, bool*, int, int);
-template int unpack(const uint8_t*, uint8_t*, int, int);
-template int unpack(const uint8_t*, uint16_t*, int, int);
-template int unpack(const uint8_t*, uint32_t*, int, int);
-template int unpack(const uint8_t*, uint64_t*, int, int);
+template void unpack(const uint8_t*, bool*, int, int, int);
+template void unpack(const uint8_t*, uint8_t*, int, int, int);
+template void unpack(const uint8_t*, uint16_t*, int, int, int);
+template void unpack(const uint8_t*, uint32_t*, int, int, int);
+template void unpack(const uint8_t*, uint64_t*, int, int, int);
 
 }  // namespace arrow::internal
diff --git a/cpp/src/arrow/util/bpacking_benchmark.cc b/cpp/src/arrow/util/bpacking_benchmark.cc
index 144da6ea878..7094656eeef 100644
--- a/cpp/src/arrow/util/bpacking_benchmark.cc
+++ b/cpp/src/arrow/util/bpacking_benchmark.cc
@@ -33,7 +33,7 @@ namespace arrow::internal {
 namespace {
 
 template <typename Int>
-using UnpackFunc = int (*)(const uint8_t*, Int*, int, int);
+using UnpackFunc = void (*)(const uint8_t*, Int*, int, int, int);
 
 /// Get the number of bytes associate with a packing.
 constexpr int32_t GetNumBytes(int32_t num_values, int32_t bit_width) {
@@ -89,7 +89,7 @@ void BM_Unpack(benchmark::State& state, bool aligned, UnpackFunc unpack, bo
   std::vector<Int> unpacked(num_values, 0);
 
   for (auto _ : state) {
-    unpack(packed_ptr, unpacked.data(), num_values, bit_width);
+    unpack(packed_ptr, unpacked.data(), num_values, bit_width, /* bit_offset = */ 0);
     benchmark::ClobberMemory();
   }
   state.SetItemsProcessed(num_values * state.iterations());
diff --git a/cpp/src/arrow/util/bpacking_dispatch_internal.h b/cpp/src/arrow/util/bpacking_dispatch_internal.h
index a2319c05701..eed5542808d 100644
--- a/cpp/src/arrow/util/bpacking_dispatch_internal.h
+++ b/cpp/src/arrow/util/bpacking_dispatch_internal.h
@@ -17,26 +17,28 @@
 
 #pragma once
 
+#include
 #include
 #include
 
+#include "arrow/util/bit_util.h"
 #include "arrow/util/endian.h"
 #include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
 #include "arrow/util/ubsan.h"
 
 namespace arrow::internal {
 
 /// Unpack a zero bit packed array.
 template <typename Uint>
-int unpack_null(const uint8_t* in, Uint* out, int batch_size) {
+void unpack_null(const uint8_t* in, Uint* out, int batch_size) {
   std::memset(out, 0, batch_size * sizeof(Uint));
-  return batch_size;
 }
 
 /// Unpack a packed array where packed and unpacked values have exactly the same number of
 /// bits.
 template <typename Uint>
-int unpack_full(const uint8_t* in, Uint* out, int batch_size) {
+void unpack_full(const uint8_t* in, Uint* out, int batch_size) {
   if constexpr (ARROW_LITTLE_ENDIAN == 1) {
     std::memcpy(out, in, batch_size * sizeof(Uint));
   } else {
@@ -47,7 +49,114 @@ int unpack_full(const uint8_t* in, Uint* out, int batch_size) {
       out[k] = FromLittleEndian(SafeLoadAs<Uint>(in + (k * sizeof(Uint))));
     }
   }
-  return batch_size;
+}
+
+/// Compute the maximum spread in bytes that a packed integer can cover.
+///
+/// This assumes contiguous packed integers starting at the given bit offset away
+/// from a byte boundary.
+/// This function is non-monotonic: for instance, with zero offset, three-bit integers
+/// will be split on the first byte boundary (hence having a spread of two bytes) while
+/// four-bit integers are well behaved and never spread over a byte boundary (hence
+/// having a spread of one).
+constexpr int PackedMaxSpreadBytes(int width, int bit_offset) {
+  int max = static_cast<int>(bit_util::BytesForBits(width));
+  int start = bit_offset;
+  do {
+    const int byte_start = start / 8;
+    const int byte_end = (start + width - 1) / 8;  // inclusive end bit
+    const int spread = byte_end - byte_start + 1;
+    max = spread > max ? spread : max;
+    start += width;
+  } while (start % 8 != bit_offset);
+  return max;
+}
+
+/// Compute the maximum spread in bytes that a packed integer can cover across all bit
+/// offsets.
+constexpr int PackedMaxSpreadBytes(int width) {
+  int max = 0;
+  for (int offset = 0; offset < 8; ++offset) {
+    const int spread = PackedMaxSpreadBytes(width, offset);
+    max = spread > max ? spread : max;
+  }
+  return max;
+}
+
+// Integer type that tries to contain as much of the spread as possible.
+template <int kSpreadBytes>
+using SpreadBufferUint = std::conditional_t<
+    (kSpreadBytes <= sizeof(uint8_t)), uint_fast8_t,
+    std::conditional_t<(kSpreadBytes <= sizeof(uint16_t)), uint_fast16_t,
+                       std::conditional_t<(kSpreadBytes <= sizeof(uint32_t)),
+                                          uint_fast32_t, uint_fast64_t>>>;
+
+/// Unpack integers.
+/// This function works for all input batch sizes but is not the fastest.
+/// In prolog mode, instead of unpacking all required elements, the function will
+/// stop if it finds a byte-aligned value start.
+template <int kPackedBitWidth, bool kIsProlog, typename Uint>
+int unpack_exact(const uint8_t* in, Uint* out, int batch_size, int bit_offset) {
+  // For the epilog we adapt the max spread, since better alignment gives shorter spreads.
+  ARROW_DCHECK(kIsProlog || bit_offset == 0);
+  ARROW_DCHECK(bit_offset >= 0 && bit_offset < 8);
+  constexpr int kMaxSpreadBytes = kIsProlog ? PackedMaxSpreadBytes(kPackedBitWidth)
+                                            : PackedMaxSpreadBytes(kPackedBitWidth, 0);
+  using buffer_uint = SpreadBufferUint<kMaxSpreadBytes>;
+  constexpr int kBufferSize = sizeof(buffer_uint);
+  // Due to misalignment, on large bit widths the spread can be larger than the largest
+  // integer type. For instance, a 63-bit misaligned packed integer can spread over 9
+  // aligned bytes.
+  constexpr bool kOversized = kBufferSize < kMaxSpreadBytes;
+  constexpr buffer_uint kLowMask =
+      bit_util::LeastSignificantBitMask(kPackedBitWidth);
+
+  ARROW_DCHECK_GE(bit_offset, 0);
+  ARROW_DCHECK_LE(bit_offset, 8);
+
+  // Loop over the values one by one.
+  const int start_bit_term = batch_size * kPackedBitWidth + bit_offset;
+  int start_bit = bit_offset;
+  while ((start_bit < start_bit_term) && (!kIsProlog || (start_bit % 8 != 0))) {
+    const int start_byte = start_bit / 8;
+    const int spread_bytes = ((start_bit + kPackedBitWidth - 1) / 8) - start_byte + 1;
+    ARROW_COMPILER_ASSUME(spread_bytes <= kMaxSpreadBytes);
+
+    // Read the bytes for the current value.
+    // Must be careful not to read out of input bounds.
+    buffer_uint buffer = 0;
+    if constexpr (kOversized) {
+      // We read the max possible bytes in the first pass and handle the rest after.
+      // Even though the worst spread does not happen on all iterations, we can still read
+      // all bytes because we will mask them.
+      std::memcpy(&buffer, in + start_byte, std::min(kBufferSize, spread_bytes));
+    } else {
+      std::memcpy(&buffer, in + start_byte, spread_bytes);
+    }
+
+    buffer = bit_util::FromLittleEndian(buffer);
+    const int bit_offset = start_bit % 8;
+    buffer >>= bit_offset;
+    Uint val = static_cast<Uint>(buffer & kLowMask);
+
+    // Handle the oversized bytes.
+    if constexpr (kOversized) {
+      // The oversized bytes do not occur on every iteration.
+      if (spread_bytes > kBufferSize) {
+        std::memcpy(&buffer, in + start_byte + kBufferSize, spread_bytes - kBufferSize);
+        buffer = bit_util::FromLittleEndian(buffer);
+        buffer <<= 8 * kBufferSize - bit_offset;
+        val |= static_cast<Uint>(buffer & kLowMask);
+      }
+    }
+
+    *out = val;
+    out++;
+    start_bit += kPackedBitWidth;
+  }
+
+  ARROW_DCHECK((start_bit - bit_offset) % kPackedBitWidth == 0);
+  return (start_bit - bit_offset) / kPackedBitWidth;
 }
 
 /// Unpack a packed array, delegating to a Unpacker struct.
@@ -59,292 +168,317 @@ int unpack_full(const uint8_t* in, Uint* out, int batch_size) {
 /// @tparam UnpackedUInt The type in which we unpack the values.
 template <int kPackedBitWidth, template <int> typename Unpacker, typename UnpackedUInt>
-int unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size) {
-  using UnpackerForWidth = Unpacker<kPackedBitWidth>;
+void unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size, int bit_offset) {
+  if constexpr (kPackedBitWidth == 0) {
+    // Easy case to handle, simply setting memory to zero.
+    return unpack_null(in, out, batch_size);
+  } else {
+    // In case of misalignment, we need to run the prolog until aligned.
+    int extracted = unpack_exact(in, out, batch_size, bit_offset);
+    // We either extracted everything or reached a byte-aligned position.
+    const int start_bit = extracted * kPackedBitWidth + bit_offset;
+    ARROW_DCHECK((extracted == batch_size) || ((start_bit) % 8 == 0));
+    batch_size -= extracted;
+    ARROW_DCHECK_GE(batch_size, 0);
+    in += start_bit / 8;
+    out += extracted;
 
-  constexpr auto kValuesUnpacked = UnpackerForWidth::kValuesUnpacked;
-  batch_size = batch_size / kValuesUnpacked * kValuesUnpacked;
-  int num_loops = batch_size / kValuesUnpacked;
+    if constexpr (kPackedBitWidth == 8 * sizeof(UnpackedUInt)) {
+      // Only memcpy / static_cast
+      return unpack_full(in, out, batch_size);
+    } else {
+      using UnpackerForWidth = Unpacker<kPackedBitWidth>;
+      constexpr auto kValuesUnpacked = UnpackerForWidth::kValuesUnpacked;
 
-  for (int i = 0; i < num_loops; ++i) {
-    in = UnpackerForWidth::unpack(in, out + i * kValuesUnpacked);
-  }
+      // Running the optimized kernel for batch extraction.
+      const int unpacker_iter_count = batch_size / kValuesUnpacked;
+      for (int i = 0; i < unpacker_iter_count; ++i) {
+        in = UnpackerForWidth::unpack(in, out);
+        out += kValuesUnpacked;
+      }
+      batch_size -= unpacker_iter_count * kValuesUnpacked;
 
-  return batch_size;
+      // Running the epilog for the remaining values that don't fit in a kernel.
+      ARROW_DCHECK_LT(batch_size, kValuesUnpacked);
+      ARROW_DCHECK_GE(batch_size, 0);
+      ARROW_COMPILER_ASSUME(batch_size < kValuesUnpacked);
+      ARROW_COMPILER_ASSUME(batch_size >= 0);
+      unpack_exact(in, out, batch_size, /* bit_offset= */ 0);
+    }
+  }
 }
 
 template
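
A quick illustration of the spread computation introduced above. The sketch below restates the PackedMaxSpreadBytes logic as standalone constexpr helpers and checks the examples quoted in the doc comments; the MaxSpreadBytes/MaxSpreadBytesAnyOffset names and the static_asserts are illustrative only and are not part of the patch.

// Standalone re-statement of the spread computation, for illustration only.
constexpr int MaxSpreadBytes(int width, int bit_offset) {
  // Start with the spread of a fully byte-aligned value.
  int max = (width + 7) / 8;
  int start = bit_offset;
  do {
    const int first_byte = start / 8;
    const int last_byte = (start + width - 1) / 8;  // byte holding the last bit
    const int spread = last_byte - first_byte + 1;
    if (spread > max) max = spread;
    start += width;
  } while (start % 8 != bit_offset);
  return max;
}

constexpr int MaxSpreadBytesAnyOffset(int width) {
  int max = 0;
  for (int offset = 0; offset < 8; ++offset) {
    const int spread = MaxSpreadBytes(width, offset);
    if (spread > max) max = spread;
  }
  return max;
}

// The examples from the doc comments:
static_assert(MaxSpreadBytes(3, 0) == 2);         // 3-bit values can straddle a byte boundary
static_assert(MaxSpreadBytes(4, 0) == 1);         // 4-bit values never straddle one
static_assert(MaxSpreadBytesAnyOffset(63) == 9);  // a misaligned 63-bit value can touch 9 bytes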
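
For context, a minimal sketch of how a caller might drive the widened unpack entry point after this change. The include path, the packed bytes, and the 3-bit/offset-2 layout are assumptions made up for the example; only the signature (trailing bit_offset parameter, void return) comes from the patch.

#include <cstdint>
#include <vector>

#include "arrow/util/bpacking.h"  // assumed location of the unpack declaration

void UnpackMisalignedExample() {
  // 8 values of 3 bits each, starting 2 bits into the first byte (bytes made up).
  const uint8_t packed[] = {0xB4, 0x6D, 0x06, 0x00};
  std::vector<uint32_t> out(8);
  // The new trailing argument is the bit offset of the first packed value inside
  // packed[0]; callers such as BitReader::GetBatch no longer need a scalar prolog
  // before handing the buffer to the unpacking kernels.
  ::arrow::internal::unpack(packed, out.data(), /*batch_size=*/8, /*num_bits=*/3,
                            /*bit_offset=*/2);
}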