From a6aaad4131fb4991836b227a287c2c03a0e29010 Mon Sep 17 00:00:00 2001
From: AntoinePrv <AntoinePrv@users.noreply.github.com>
Date: Mon, 20 Oct 2025 16:30:10 +0200
Subject: [PATCH 01/12] Add unpack epilogue

---
 .../arrow/util/bit_stream_utils_internal.h    | 15 +--
 cpp/src/arrow/util/bit_util.h                 | 10 ++
 .../arrow/util/bpacking_dispatch_internal.h   | 91 ++++++++++++++++++-
 cpp/src/arrow/util/bpacking_test.cc           | 81 ++++++++++-------
 4 files changed, 149 insertions(+), 48 deletions(-)
diff --git a/cpp/src/arrow/util/bit_stream_utils_internal.h b/cpp/src/arrow/util/bit_stream_utils_internal.h
index cf039a9ac9f..3b7252d6ba8 100644
--- a/cpp/src/arrow/util/bit_stream_utils_internal.h
+++ b/cpp/src/arrow/util/bit_stream_utils_internal.h
@@ -339,20 +339,9 @@ inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {
 
   int num_unpacked = ::arrow::internal::unpack(
       buffer + byte_offset, reinterpret_cast<unpack_t*>(v + i), batch_size - i, num_bits);
-  i += num_unpacked;
-  byte_offset += num_unpacked * num_bits / 8;
+  ARROW_DCHECK_EQ(batch_size - i, num_unpacked);
 
-  buffered_values =
-      detail::ReadLittleEndianWord(buffer + byte_offset, max_bytes - byte_offset);
-
-  for (; i < batch_size; ++i) {
-    detail::GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset,
-                      &buffered_values);
-  }
-
-  bit_offset_ = bit_offset;
-  byte_offset_ = byte_offset;
-  buffered_values_ = buffered_values;
+  this->Advance(batch_size * num_bits);
 
   return batch_size;
 }
diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h
index 8d4811ede79..e72eca74e86 100644
--- a/cpp/src/arrow/util/bit_util.h
+++ b/cpp/src/arrow/util/bit_util.h
@@ -118,6 +118,16 @@ constexpr uint64_t LeastSignificantBitMask(int64_t bit_index) {
   return (static_cast<uint64_t>(1) << bit_index) - 1;
 }
 
+// Returns a mask for the bit_index lower order bits.
+// Only valid for bit_index in the range [0, 8 * sizeof(Uint)].
+template <typename Uint>
+constexpr auto LeastSignificantBitMaskInc(Uint bit_index) {
+  if (bit_index == 8 * sizeof(Uint)) {
+    return ~Uint{0};
+  }
+  return (Uint{1} << bit_index) - Uint{1};
+}
+
 // Returns 'value' rounded up to the nearest multiple of 'factor'
 constexpr int64_t RoundUp(int64_t value, int64_t factor) {
   return CeilDiv(value, factor) * factor;
diff --git a/cpp/src/arrow/util/bpacking_dispatch_internal.h b/cpp/src/arrow/util/bpacking_dispatch_internal.h
index a2319c05701..f1c70dde861 100644
--- a/cpp/src/arrow/util/bpacking_dispatch_internal.h
+++ b/cpp/src/arrow/util/bpacking_dispatch_internal.h
@@ -17,11 +17,14 @@
 
 #pragma once
 
+#include <algorithm>
 #include <cstring>
 #include <type_traits>
 
+#include "arrow/util/bit_util.h"
 #include "arrow/util/endian.h"
 #include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
 #include "arrow/util/ubsan.h"
 
 namespace arrow::internal {
@@ -50,6 +53,85 @@ int unpack_full(const uint8_t* in, Uint* out, int batch_size) {
   return batch_size;
 }
 
+/// Compute the maximum spread in bytes that a packed integer can cover.
+///
+/// This assumes contiguous packed integers starting on a byte-aligned boundary.
+/// This function is non-monotonic: for instance, three-bit integers will be split on the
+/// first byte boundary (hence having a spread of two bytes) while four-bit integers will
+/// be well behaved and never spread over a byte boundary (hence having a spread of one).
+constexpr int PackedMaxSpreadBytes(int width) {
+  int max = static_cast<int>(bit_util::BytesForBits(width));
+  int start = width;
+  while (start % 8 != 0) {
+    const int byte_start = start / 8;
+    const int byte_end = (start + width - 1) / 8;  // inclusive end bit
+    const int spread = byte_end - byte_start + 1;
+    max = spread > max ? spread : max;
+    start += width;
+  }
+  return max;
+}
+
+// Integer type that tries to contain as much as the spread as possible.
+template <int kSpreadBytes>
+using SpreadBufferUint =
+    std::conditional_t<(kSpreadBytes <= sizeof(uint32_t)), uint_fast32_t, uint_fast64_t>;
+
+/// Unpack integers.
+/// This function works for all input batch sizes but is not the fastest.
+template <int kPackedBitWidth, typename Uint>
+int unpack_epilog(const uint8_t* in, Uint* out, int batch_size) {
+  constexpr int kMaxSpreadBytes = PackedMaxSpreadBytes(kPackedBitWidth);
+  using buffer_uint = SpreadBufferUint<kMaxSpreadBytes>;
+  constexpr int kBufferSize = sizeof(buffer_uint);
+  // Due to misalignment, on large bit width, the spread can be larger than the maximum
+  // size integer. For instance a 63 bit width misaligned packed integer can spread over 9
+  // aligned bytes.
+  constexpr bool kOversized = kBufferSize < kMaxSpreadBytes;
+  constexpr buffer_uint kLowMask =
+      bit_util::LeastSignificantBitMaskInc<buffer_uint>(kPackedBitWidth);
+
+  // Looping over values one by one
+  for (int k = 0; k < batch_size; ++k) {
+    const int start_bit = k * kPackedBitWidth;
+    const int start_byte = start_bit / 8;
+    const int spread_bytes = ((start_bit + kPackedBitWidth - 1) / 8) - start_byte + 1;
+    ARROW_COMPILER_ASSUME(spread_bytes <= kMaxSpreadBytes);
+
+    // Reading the bytes for the current value.
+    // Must be careful not to read out of input bounds.
+    buffer_uint buffer = 0;
+    if constexpr (kOversized) {
+      // We read the max possible bytes in the first pass and handle the rest after.
+      // Even though the worst spread does not happen on all iterations we can still read
+      // all bytes because we will mask them.
+      std::memcpy(&buffer, in + start_byte, std::min(kBufferSize, spread_bytes));
+    } else {
+      std::memcpy(&buffer, in + start_byte, spread_bytes);
+    }
+
+    buffer = bit_util::FromLittleEndian(buffer);
+    const int bit_offset = start_bit % 8;
+    buffer >>= bit_offset;
+    Uint val = static_cast<Uint>(buffer & kLowMask);
+
+    // Handle the oversized bytes
+    if constexpr (kOversized) {
+      // The oversized bytes do not happen at all iterations
+      if (spread_bytes > kBufferSize) {
+        std::memcpy(&buffer, in + start_byte + kBufferSize, spread_bytes - kBufferSize);
+        buffer = bit_util::FromLittleEndian(buffer);
+        buffer <<= 8 * kBufferSize - bit_offset;
+        val |= static_cast<Uint>(buffer & kLowMask);
+      }
+    }
+
+    out[k] = val;
+  }
+
+  return batch_size;
+}
+
 /// Unpack a packed array, delegating to a Unpacker struct.
 ///
 /// @tparam kPackedBitWidth The width in bits of the values in the packed array.
@@ -61,15 +143,18 @@ template <int kPackedBitWidth, template <typename, int> typename Unpacker,
           typename UnpackedUInt>
 int unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size) {
   using UnpackerForWidth = Unpacker<UnpackedUInt, kPackedBitWidth>;
-
   constexpr auto kValuesUnpacked = UnpackerForWidth::kValuesUnpacked;
-  batch_size = batch_size / kValuesUnpacked * kValuesUnpacked;
-  int num_loops = batch_size / kValuesUnpacked;
 
+  const int num_loops = batch_size / kValuesUnpacked;
   for (int i = 0; i < num_loops; ++i) {
     in = UnpackerForWidth::unpack(in, out + i * kValuesUnpacked);
   }
 
+  const auto epilog_size = batch_size - num_loops * kValuesUnpacked;
+  ARROW_COMPILER_ASSUME(epilog_size < kValuesUnpacked);
+  ARROW_COMPILER_ASSUME(epilog_size >= 0);
+  unpack_epilog<kPackedBitWidth>(in, out + num_loops * kValuesUnpacked, epilog_size);
+
   return batch_size;
 }
 
diff --git a/cpp/src/arrow/util/bpacking_test.cc b/cpp/src/arrow/util/bpacking_test.cc
index 9a3e31b5893..0df17a55fa4 100644
--- a/cpp/src/arrow/util/bpacking_test.cc
+++ b/cpp/src/arrow/util/bpacking_test.cc
@@ -19,10 +19,9 @@
 
 #include <gtest/gtest.h>
 
-#include "arrow/result.h"
-#include "arrow/testing/gtest_util.h"
 #include "arrow/testing/util.h"
 #include "arrow/util/bit_stream_utils_internal.h"
+#include "arrow/util/bit_util.h"
 #include "arrow/util/bpacking_internal.h"
 #include "arrow/util/bpacking_scalar_internal.h"
 #include "arrow/util/bpacking_simd_internal.h"
@@ -38,19 +37,14 @@ template <typename Int>
 using UnpackFunc = int (*)(const uint8_t*, Int*, int, int);
 
 /// Get the number of bytes associate with a packing.
-Result<int32_t> GetNumBytes(int32_t num_values, int32_t bit_width) {
-  const auto num_bits = num_values * bit_width;
-  if (num_bits % 8 != 0) {
-    return Status::NotImplemented(
-        "The unpack functions only work on a multiple of 8 bits.");
-  }
-  return num_bits / 8;
+int32_t GetNumBytes(int32_t num_values, int32_t bit_width) {
+  return static_cast<int32_t>(bit_util::BytesForBits(num_values * bit_width));
 }
 
 /// Generate random bytes as packed integers.
 std::vector<uint8_t> GenerateRandomPackedValues(int32_t num_values, int32_t bit_width) {
   constexpr uint32_t kSeed = 3214;
-  EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width));
+  const auto num_bytes = GetNumBytes(num_values, bit_width);
 
   std::vector<uint8_t> out(std::max(1, num_bytes));  // We need a valid pointer for size 0
   random_bytes(num_bytes, kSeed, out.data());
@@ -75,7 +69,7 @@ std::vector<Int> UnpackValues(const uint8_t* packed, int32_t num_values,
 template <typename Int>
 std::vector<uint8_t> PackValues(const std::vector<Int>& values, int32_t num_values,
                                 int32_t bit_width) {
-  EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width));
+  const auto num_bytes = GetNumBytes(num_values, bit_width);
 
   std::vector<uint8_t> out(static_cast<std::size_t>(num_bytes));
   bit_util::BitWriter writer(out.data(), num_bytes);
@@ -85,6 +79,7 @@ std::vector<uint8_t> PackValues(const std::vector<Int>& values, int32_t num_valu
       throw std::runtime_error("Cannot write move values");
     }
   }
+  writer.Flush();
 
   return out;
 }
@@ -92,15 +87,32 @@ std::vector<uint8_t> PackValues(const std::vector<Int>& values, int32_t num_valu
 template <typename Int>
 void CheckUnpackPackRoundtrip(const uint8_t* packed, int32_t num_values,
                               int32_t bit_width, UnpackFunc<Int> unpack) {
-  EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width));
+  const auto num_bytes = GetNumBytes(num_values, bit_width);
 
   const auto unpacked = UnpackValues(packed, num_values, bit_width, unpack);
   EXPECT_EQ(unpacked.size(), num_values);
   const auto roundtrip = PackValues(unpacked, num_values, bit_width);
   EXPECT_EQ(num_bytes, roundtrip.size());
-  for (int i = 0; i < num_bytes; ++i) {
+
+  // Checking all bytes but the last (that may not fall aligned)
+  for (int i = 0; i < num_bytes - 1; ++i) {
     EXPECT_EQ(packed[i], roundtrip[i]) << "differ in position " << i;
   }
+
+  // Checking last byte
+  if (num_bytes >= 1) {
+    const int i = num_bytes - 1;
+    const int last_bits_cnt = (num_values * bit_width) % 8;
+
+    if (last_bits_cnt == 0) {
+      // Properly aligned, this is the same check as before
+      EXPECT_EQ(packed[i], roundtrip[i]) << "differ in position " << i;
+    } else {
+      // We need to mask the last bits in the packed data that are arbitrary and not used.
+      const auto mask = static_cast<uint8_t>((1 << last_bits_cnt) - 1);
+      EXPECT_EQ(packed[i] & mask, roundtrip[i] & mask) << "differ in position " << i;
+    }
+  }
 }
 
 const uint8_t* GetNextAlignedByte(const uint8_t* ptr, std::size_t alignment) {
@@ -119,10 +131,8 @@ const uint8_t* GetNextAlignedByte(const uint8_t* ptr, std::size_t alignment) {
 class TestUnpack : public ::testing::TestWithParam<int> {
  protected:
   template <typename Int>
-  void TestRoundtripAlignment(UnpackFunc<Int> unpack, int bit_width,
+  void TestRoundtripAlignment(UnpackFunc<Int> unpack, int num_values, int bit_width,
                               std::size_t alignment_offset) {
-    int num_values = GetParam();
-
     // Assume std::vector allocation is likely be aligned for greater than a byte.
     // So we allocate more values than necessary and skip to the next byte with the
     // desired (non) alignment to test the proper condition.
@@ -135,9 +145,8 @@ class TestUnpack : public ::testing::TestWithParam<int> {
   }
 
   template <typename Int>
-  void TestUnpackZeros(UnpackFunc<Int> unpack, int bit_width) {
-    int num_values = GetParam();
-    EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width));
+  void TestUnpackZeros(UnpackFunc<Int> unpack, int num_values, int bit_width) {
+    const auto num_bytes = GetNumBytes(num_values, bit_width);
 
     const std::vector<uint8_t> packed(static_cast<std::size_t>(num_bytes), uint8_t{0});
     const auto unpacked = UnpackValues(packed.data(), num_values, bit_width, unpack);
@@ -147,9 +156,8 @@ class TestUnpack : public ::testing::TestWithParam<int> {
   }
 
   template <typename Int>
-  void TestUnpackOnes(UnpackFunc<Int> unpack, int bit_width) {
-    int num_values = GetParam();
-    EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width));
+  void TestUnpackOnes(UnpackFunc<Int> unpack, int num_values, int bit_width) {
+    const auto num_bytes = GetNumBytes(num_values, bit_width);
 
     const std::vector<uint8_t> packed(static_cast<std::size_t>(num_bytes), uint8_t{0xFF});
     const auto unpacked = UnpackValues(packed.data(), num_values, bit_width, unpack);
@@ -168,9 +176,8 @@ class TestUnpack : public ::testing::TestWithParam<int> {
   }
 
   template <typename Int>
-  void TestUnpackAlternating(UnpackFunc<Int> unpack, int bit_width) {
-    int num_values = GetParam();
-    EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width));
+  void TestUnpackAlternating(UnpackFunc<Int> unpack, int num_values, int bit_width) {
+    const auto num_bytes = GetNumBytes(num_values, bit_width);
 
     const std::vector<uint8_t> packed(static_cast<std::size_t>(num_bytes), uint8_t{0xAA});
     const auto unpacked = UnpackValues(packed.data(), num_values, bit_width, unpack);
@@ -198,20 +205,30 @@ class TestUnpack : public ::testing::TestWithParam<int> {
 
   template <typename Int>
   void TestAll(UnpackFunc<Int> unpack) {
+    const int num_values_base = GetParam();
+
     constexpr int kMaxBitWidth = std::is_same_v<Int, bool> ? 1 : 8 * sizeof(Int);
     // Given how many edge cases there are in unpacking integers, it is best to test all
     // sizes
     for (int bit_width = 0; bit_width <= kMaxBitWidth; ++bit_width) {
       SCOPED_TRACE(::testing::Message() << "Testing bit_width=" << bit_width);
 
-      // Known values
-      TestUnpackZeros(unpack, bit_width);
-      TestUnpackOnes(unpack, bit_width);
-      TestUnpackAlternating(unpack, bit_width);
+      // Similarly, we test all epilogue sizes, i.e. the extra values that may fall
+      // outside of a SIMD register.
+      for (int epilogue_size = 0; epilogue_size <= kMaxBitWidth; ++epilogue_size) {
+        SCOPED_TRACE(::testing::Message() << "Testing epilogue_size=" << epilogue_size);
 
-      // Roundtrips
-      TestRoundtripAlignment(unpack, bit_width, /* alignment_offset= */ 0);
-      TestRoundtripAlignment(unpack, bit_width, /* alignment_offset= */ 1);
+        const int num_values = num_values_base + epilogue_size;
+
+        // Known values
+        TestUnpackZeros(unpack, num_values, bit_width);
+        TestUnpackOnes(unpack, num_values, bit_width);
+        TestUnpackAlternating(unpack, num_values, bit_width);
+
+        // Roundtrips
+        TestRoundtripAlignment(unpack, num_values, bit_width, /* alignment_offset= */ 0);
+        TestRoundtripAlignment(unpack, num_values, bit_width, /* alignment_offset= */ 1);
+      }
     }
   }
 };

From c669b7770b377fb1dbb99800a925a655767fe03a Mon Sep 17 00:00:00 2001
From: AntoinePrv <AntoinePrv@users.noreply.github.com>
Date: Mon, 20 Oct 2025 18:02:35 +0200
Subject: [PATCH 02/12] Try smaller integer sizes

---
 cpp/src/arrow/util/bpacking_dispatch_internal.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/cpp/src/arrow/util/bpacking_dispatch_internal.h b/cpp/src/arrow/util/bpacking_dispatch_internal.h
index f1c70dde861..9bd63532db0 100644
--- a/cpp/src/arrow/util/bpacking_dispatch_internal.h
+++ b/cpp/src/arrow/util/bpacking_dispatch_internal.h
@@ -74,8 +74,11 @@ constexpr int PackedMaxSpreadBytes(int width) {
 
 // Integer type that tries to contain as much as the spread as possible.
 template <int kSpreadBytes>
-using SpreadBufferUint =
-    std::conditional_t<(kSpreadBytes <= sizeof(uint32_t)), uint_fast32_t, uint_fast64_t>;
+using SpreadBufferUint = std::conditional_t<
+    (kSpreadBytes <= sizeof(uint8_t)), uint_fast8_t,
+    std::conditional_t<(kSpreadBytes <= sizeof(uint16_t)), uint_fast16_t,
+                       std::conditional_t<(kSpreadBytes <= sizeof(uint32_t)),
+                                          uint_fast32_t, uint_fast64_t>>>;
 
 /// Unpack integers.
 /// This function works for all input batch sizes but is not the fastest.

From 77fd67c834b949d5536df7a8f5b49476a8779cbc Mon Sep 17 00:00:00 2001
From: AntoinePrv <AntoinePrv@users.noreply.github.com>
Date: Tue, 21 Oct 2025 10:22:52 +0200
Subject: [PATCH 03/12] void return type

---
 .../arrow/util/bit_stream_utils_internal.h    |   5 +-
 cpp/src/arrow/util/bpacking.cc                |  12 +-
 cpp/src/arrow/util/bpacking_benchmark.cc      |   2 +-
 .../arrow/util/bpacking_dispatch_internal.h   |  16 +--
 cpp/src/arrow/util/bpacking_internal.h        |  29 ++---
 cpp/src/arrow/util/bpacking_scalar.cc         |  12 +-
 cpp/src/arrow/util/bpacking_scalar_internal.h |  40 +++---
 cpp/src/arrow/util/bpacking_simd_avx2.cc      |  12 +-
 cpp/src/arrow/util/bpacking_simd_avx512.cc    |  12 +-
 cpp/src/arrow/util/bpacking_simd_default.cc   |  12 +-
 cpp/src/arrow/util/bpacking_simd_internal.h   | 114 +++++++++---------
 cpp/src/arrow/util/bpacking_test.cc           |   9 +-
 12 files changed, 135 insertions(+), 140 deletions(-)

diff --git a/cpp/src/arrow/util/bit_stream_utils_internal.h b/cpp/src/arrow/util/bit_stream_utils_internal.h
index 3b7252d6ba8..543df6d9e79 100644
--- a/cpp/src/arrow/util/bit_stream_utils_internal.h
+++ b/cpp/src/arrow/util/bit_stream_utils_internal.h
@@ -337,9 +337,8 @@ inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {
 
   using unpack_t = typename internal_bit_reader::unpack_detect<T>::type;
 
-  int num_unpacked = ::arrow::internal::unpack(
-      buffer + byte_offset, reinterpret_cast<unpack_t*>(v + i), batch_size - i, num_bits);
-  ARROW_DCHECK_EQ(batch_size - i, num_unpacked);
+  ::arrow::internal::unpack(buffer + byte_offset, reinterpret_cast<unpack_t*>(v + i),
+                            batch_size - i, num_bits);
 
   this->Advance(batch_size * num_bits);
 
diff --git a/cpp/src/arrow/util/bpacking.cc b/cpp/src/arrow/util/bpacking.cc
index 369f361d9a6..95990dc1866 100644
--- a/cpp/src/arrow/util/bpacking.cc
+++ b/cpp/src/arrow/util/bpacking.cc
@@ -50,7 +50,7 @@ struct UnpackDynamicFunction {
 }  // namespace
 
 template <typename Uint>
-int unpack(const uint8_t* in, Uint* out, int batch_size, int num_bits) {
+void unpack(const uint8_t* in, Uint* out, int batch_size, int num_bits) {
 #if defined(ARROW_HAVE_NEON)
   return unpack_neon(in, out, batch_size, num_bits);
 #else
@@ -59,10 +59,10 @@ int unpack(const uint8_t* in, Uint* out, int batch_size, int num_bits) {
 #endif
 }
 
-template int unpack<bool>(const uint8_t*, bool*, int, int);
-template int unpack<uint8_t>(const uint8_t*, uint8_t*, int, int);
-template int unpack<uint16_t>(const uint8_t*, uint16_t*, int, int);
-template int unpack<uint32_t>(const uint8_t*, uint32_t*, int, int);
-template int unpack<uint64_t>(const uint8_t*, uint64_t*, int, int);
+template void unpack<bool>(const uint8_t*, bool*, int, int);
+template void unpack<uint8_t>(const uint8_t*, uint8_t*, int, int);
+template void unpack<uint16_t>(const uint8_t*, uint16_t*, int, int);
+template void unpack<uint32_t>(const uint8_t*, uint32_t*, int, int);
+template void unpack<uint64_t>(const uint8_t*, uint64_t*, int, int);
 
 }  // namespace arrow::internal
diff --git a/cpp/src/arrow/util/bpacking_benchmark.cc b/cpp/src/arrow/util/bpacking_benchmark.cc
index 144da6ea878..82804d8efd8 100644
--- a/cpp/src/arrow/util/bpacking_benchmark.cc
+++ b/cpp/src/arrow/util/bpacking_benchmark.cc
@@ -33,7 +33,7 @@ namespace arrow::internal {
 namespace {
 
 template <typename Int>
-using UnpackFunc = int (*)(const uint8_t*, Int*, int, int);
+using UnpackFunc = void (*)(const uint8_t*, Int*, int, int);
 
 /// Get the number of bytes associate with a packing.
 constexpr int32_t GetNumBytes(int32_t num_values, int32_t bit_width) {
diff --git a/cpp/src/arrow/util/bpacking_dispatch_internal.h b/cpp/src/arrow/util/bpacking_dispatch_internal.h
index 9bd63532db0..cb204cf9cb0 100644
--- a/cpp/src/arrow/util/bpacking_dispatch_internal.h
+++ b/cpp/src/arrow/util/bpacking_dispatch_internal.h
@@ -31,15 +31,14 @@ namespace arrow::internal {
 
 /// Unpack a zero bit packed array.
 template <typename Uint>
-int unpack_null(const uint8_t* in, Uint* out, int batch_size) {
+void unpack_null(const uint8_t* in, Uint* out, int batch_size) {
   std::memset(out, 0, batch_size * sizeof(Uint));
-  return batch_size;
 }
 
 /// Unpack a packed array where packed and unpacked values have exactly the same number of
 /// bits.
 template <typename Uint>
-int unpack_full(const uint8_t* in, Uint* out, int batch_size) {
+void unpack_full(const uint8_t* in, Uint* out, int batch_size) {
   if constexpr (ARROW_LITTLE_ENDIAN == 1) {
     std::memcpy(out, in, batch_size * sizeof(Uint));
   } else {
@@ -50,7 +49,6 @@ int unpack_full(const uint8_t* in, Uint* out, int batch_size) {
       out[k] = FromLittleEndian(SafeLoadAs<Uint>(in + (k * sizeof(Uint))));
     }
   }
-  return batch_size;
 }
 
 /// Compute the maximum spread in bytes that a packed integer can cover.
@@ -144,7 +142,7 @@ int unpack_epilog(const uint8_t* in, Uint* out, int batch_size) {
 /// @tparam UnpackedUInt The type in which we unpack the values.
 template <int kPackedBitWidth, template <typename, int> typename Unpacker,
           typename UnpackedUInt>
-int unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size) {
+void unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size) {
   using UnpackerForWidth = Unpacker<UnpackedUInt, kPackedBitWidth>;
   constexpr auto kValuesUnpacked = UnpackerForWidth::kValuesUnpacked;
 
@@ -157,13 +155,11 @@ int unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size) {
   ARROW_COMPILER_ASSUME(epilog_size < kValuesUnpacked);
   ARROW_COMPILER_ASSUME(epilog_size >= 0);
   unpack_epilog<kPackedBitWidth>(in, out + num_loops * kValuesUnpacked, epilog_size);
-
-  return batch_size;
 }
 
 template <template <typename, int> typename Unpacker, typename UnpackedUint>
-static int unpack_jump(const uint8_t* in, UnpackedUint* out, int batch_size,
-                       int num_bits) {
+static void unpack_jump(const uint8_t* in, UnpackedUint* out, int batch_size,
+                        int num_bits) {
   if constexpr (std::is_same_v<UnpackedUint, bool>) {
     switch (num_bits) {
       case 0:
@@ -433,6 +429,6 @@ static int unpack_jump(const uint8_t* in, UnpackedUint* out, int batch_size,
     }
   }
   ARROW_DCHECK(false) << "Unsupported num_bits";
-  return 0;
+  return;
 }
 }  // namespace arrow::internal
diff --git a/cpp/src/arrow/util/bpacking_internal.h b/cpp/src/arrow/util/bpacking_internal.h
index c910f3388f7..a2e338b7d7c 100644
--- a/cpp/src/arrow/util/bpacking_internal.h
+++ b/cpp/src/arrow/util/bpacking_internal.h
@@ -24,24 +24,25 @@
 namespace arrow::internal {
 
 template <typename Uint>
-ARROW_EXPORT int unpack(const uint8_t* in, Uint* out, int batch_size, int num_bits);
+ARROW_EXPORT void unpack(const uint8_t* in, Uint* out, int batch_size, int num_bits);
 
-extern template ARROW_TEMPLATE_EXPORT int unpack<bool>(const uint8_t* in, bool* out,
-                                                       int batch_size, int num_bits);
+extern template ARROW_TEMPLATE_EXPORT void unpack<bool>(const uint8_t* in, bool* out,
+                                                        int batch_size, int num_bits);
 
-extern template ARROW_TEMPLATE_EXPORT int unpack<uint8_t>(const uint8_t* in, uint8_t* out,
-                                                          int batch_size, int num_bits);
-
-extern template ARROW_TEMPLATE_EXPORT int unpack<uint16_t>(const uint8_t* in,
-                                                           uint16_t* out, int batch_size,
+extern template ARROW_TEMPLATE_EXPORT void unpack<uint8_t>(const uint8_t* in,
+                                                           uint8_t* out, int batch_size,
                                                            int num_bits);
 
-extern template ARROW_TEMPLATE_EXPORT int unpack<uint32_t>(const uint8_t* in,
-                                                           uint32_t* out, int batch_size,
-                                                           int num_bits);
+extern template ARROW_TEMPLATE_EXPORT void unpack<uint16_t>(const uint8_t* in,
+                                                            uint16_t* out, int batch_size,
+                                                            int num_bits);
 
-extern template ARROW_TEMPLATE_EXPORT int unpack<uint64_t>(const uint8_t* in,
-                                                           uint64_t* out, int batch_size,
-                                                           int num_bits);
+extern template ARROW_TEMPLATE_EXPORT void unpack<uint32_t>(const uint8_t* in,
+                                                            uint32_t* out, int batch_size,
+                                                            int num_bits);
+
+extern template ARROW_TEMPLATE_EXPORT void unpack<uint64_t>(const uint8_t* in,
+                                                            uint64_t* out, int batch_size,
+                                                            int num_bits);
 
 }  // namespace arrow::internal
diff --git a/cpp/src/arrow/util/bpacking_scalar.cc b/cpp/src/arrow/util/bpacking_scalar.cc
index 11260412d00..abd88b4ad20 100644
--- a/cpp/src/arrow/util/bpacking_scalar.cc
+++ b/cpp/src/arrow/util/bpacking_scalar.cc
@@ -22,14 +22,14 @@
 namespace arrow::internal {
 
 template <typename Uint>
-int unpack_scalar(const uint8_t* in, Uint* out, int batch_size, int num_bits) {
+void unpack_scalar(const uint8_t* in, Uint* out, int batch_size, int num_bits) {
   return unpack_jump<ScalarUnpackerForWidth>(in, out, batch_size, num_bits);
 }
 
-template int unpack_scalar<bool>(const uint8_t*, bool*, int, int);
-template int unpack_scalar<uint8_t>(const uint8_t*, uint8_t*, int, int);
-template int unpack_scalar<uint16_t>(const uint8_t*, uint16_t*, int, int);
-template int unpack_scalar<uint32_t>(const uint8_t*, uint32_t*, int, int);
-template int unpack_scalar<uint64_t>(const uint8_t*, uint64_t*, int, int);
+template void unpack_scalar<bool>(const uint8_t*, bool*, int, int);
+template void unpack_scalar<uint8_t>(const uint8_t*, uint8_t*, int, int);
+template void unpack_scalar<uint16_t>(const uint8_t*, uint16_t*, int, int);
+template void unpack_scalar<uint32_t>(const uint8_t*, uint32_t*, int, int);
+template void unpack_scalar<uint64_t>(const uint8_t*, uint64_t*, int, int);
 
 }  // namespace arrow::internal
diff --git a/cpp/src/arrow/util/bpacking_scalar_internal.h b/cpp/src/arrow/util/bpacking_scalar_internal.h
index 577ad7f8d21..bb6e062e046 100644
--- a/cpp/src/arrow/util/bpacking_scalar_internal.h
+++ b/cpp/src/arrow/util/bpacking_scalar_internal.h
@@ -24,31 +24,31 @@
 namespace arrow::internal {
 
 template <typename Uint>
-ARROW_EXPORT int unpack_scalar(const uint8_t* in, Uint* out, int batch_size,
-                               int num_bits);
+ARROW_EXPORT void unpack_scalar(const uint8_t* in, Uint* out, int batch_size,
+                                int num_bits);
 
-extern template ARROW_TEMPLATE_EXPORT int unpack_scalar<bool>(const uint8_t* in,
-                                                              bool* out, int batch_size,
-                                                              int num_bits);
+extern template ARROW_TEMPLATE_EXPORT void unpack_scalar<bool>(const uint8_t* in,
+                                                               bool* out, int batch_size,
+                                                               int num_bits);
 
-extern template ARROW_TEMPLATE_EXPORT int unpack_scalar<uint8_t>(const uint8_t* in,
-                                                                 uint8_t* out,
-                                                                 int batch_size,
-                                                                 int num_bits);
-
-extern template ARROW_TEMPLATE_EXPORT int unpack_scalar<uint16_t>(const uint8_t* in,
-                                                                  uint16_t* out,
+extern template ARROW_TEMPLATE_EXPORT void unpack_scalar<uint8_t>(const uint8_t* in,
+                                                                  uint8_t* out,
                                                                   int batch_size,
                                                                   int num_bits);
 
-extern template ARROW_TEMPLATE_EXPORT int unpack_scalar<uint32_t>(const uint8_t* in,
-                                                                  uint32_t* out,
-                                                                  int batch_size,
-                                                                  int num_bits);
+extern template ARROW_TEMPLATE_EXPORT void unpack_scalar<uint16_t>(const uint8_t* in,
+                                                                   uint16_t* out,
+                                                                   int batch_size,
+                                                                   int num_bits);
 
-extern template ARROW_TEMPLATE_EXPORT int unpack_scalar<uint64_t>(const uint8_t* in,
-                                                                  uint64_t* out,
-                                                                  int batch_size,
-                                                                  int num_bits);
+extern template ARROW_TEMPLATE_EXPORT void unpack_scalar<uint32_t>(const uint8_t* in,
+                                                                   uint32_t* out,
+                                                                   int batch_size,
+                                                                   int num_bits);
+
+extern template ARROW_TEMPLATE_EXPORT void unpack_scalar<uint64_t>(const uint8_t* in,
+                                                                   uint64_t* out,
+                                                                   int batch_size,
+                                                                   int num_bits);
 
 }  // namespace arrow::internal
diff --git a/cpp/src/arrow/util/bpacking_simd_avx2.cc b/cpp/src/arrow/util/bpacking_simd_avx2.cc
index e1adc07adbb..aebdbcda755 100644
--- a/cpp/src/arrow/util/bpacking_simd_avx2.cc
+++ b/cpp/src/arrow/util/bpacking_simd_avx2.cc
@@ -22,14 +22,14 @@
 namespace arrow::internal {
 
 template <typename Uint>
-int unpack_avx2(const uint8_t* in, Uint* out, int batch_size, int num_bits) {
+void unpack_avx2(const uint8_t* in, Uint* out, int batch_size, int num_bits) {
   return unpack_jump<Simd256UnpackerForWidth>(in, out, batch_size, num_bits);
 }
 
-template int unpack_avx2<bool>(const uint8_t*, bool*, int, int);
-template int unpack_avx2<uint8_t>(const uint8_t*, uint8_t*, int, int);
-template int unpack_avx2<uint16_t>(const uint8_t*, uint16_t*, int, int);
-template int unpack_avx2<uint32_t>(const uint8_t*, uint32_t*, int, int);
-template int unpack_avx2<uint64_t>(const uint8_t*, uint64_t*, int, int);
+template void unpack_avx2<bool>(const uint8_t*, bool*, int, int);
+template void unpack_avx2<uint8_t>(const uint8_t*, uint8_t*, int, int);
+template void unpack_avx2<uint16_t>(const uint8_t*, uint16_t*, int, int);
+template void unpack_avx2<uint32_t>(const uint8_t*, uint32_t*, int, int);
+template void unpack_avx2<uint64_t>(const uint8_t*, uint64_t*, int, int);
 
 }  // namespace arrow::internal
diff --git a/cpp/src/arrow/util/bpacking_simd_avx512.cc b/cpp/src/arrow/util/bpacking_simd_avx512.cc
index 55e7c16a771..2ca3e1e709c 100644
--- a/cpp/src/arrow/util/bpacking_simd_avx512.cc
+++ b/cpp/src/arrow/util/bpacking_simd_avx512.cc
@@ -22,14 +22,14 @@
 namespace arrow::internal {
 
 template <typename Uint>
-int unpack_avx512(const uint8_t* in, Uint* out, int batch_size, int num_bits) {
+void unpack_avx512(const uint8_t* in, Uint* out, int batch_size, int num_bits) {
   return unpack_jump<Simd512UnpackerForWidth>(in, out, batch_size, num_bits);
 }
 
-template int unpack_avx512<bool>(const uint8_t*, bool*, int, int);
-template int unpack_avx512<uint8_t>(const uint8_t*, uint8_t*, int, int);
-template int unpack_avx512<uint16_t>(const uint8_t*, uint16_t*, int, int);
-template int unpack_avx512<uint32_t>(const uint8_t*, uint32_t*, int, int);
-template int unpack_avx512<uint64_t>(const uint8_t*, uint64_t*, int, int);
+template void unpack_avx512<bool>(const uint8_t*, bool*, int, int);
+template void unpack_avx512<uint8_t>(const uint8_t*, uint8_t*, int, int);
+template void unpack_avx512<uint16_t>(const uint8_t*, uint16_t*, int, int);
+template void unpack_avx512<uint32_t>(const uint8_t*, uint32_t*, int, int);
+template void unpack_avx512<uint64_t>(const uint8_t*, uint64_t*, int, int);
 
 }  // namespace arrow::internal
diff --git a/cpp/src/arrow/util/bpacking_simd_default.cc b/cpp/src/arrow/util/bpacking_simd_default.cc
index 60ca19c1504..72032ffbbf0 100644
--- a/cpp/src/arrow/util/bpacking_simd_default.cc
+++ b/cpp/src/arrow/util/bpacking_simd_default.cc
@@ -26,15 +26,15 @@ namespace arrow::internal {
 #if defined(ARROW_HAVE_NEON)
 
 template <typename Uint>
-int unpack_neon(const uint8_t* in, Uint* out, int batch_size, int num_bits) {
+void unpack_neon(const uint8_t* in, Uint* out, int batch_size, int num_bits) {
   return unpack_jump<Simd128UnpackerForWidth>(in, out, batch_size, num_bits);
 }
 
-template int unpack_neon<bool>(const uint8_t*, bool*, int, int);
-template int unpack_neon<uint8_t>(const uint8_t*, uint8_t*, int, int);
-template int unpack_neon<uint16_t>(const uint8_t*, uint16_t*, int, int);
-template int unpack_neon<uint32_t>(const uint8_t*, uint32_t*, int, int);
-template int unpack_neon<uint64_t>(const uint8_t*, uint64_t*, int, int);
+template void unpack_neon<bool>(const uint8_t*, bool*, int, int);
+template void unpack_neon<uint8_t>(const uint8_t*, uint8_t*, int, int);
+template void unpack_neon<uint16_t>(const uint8_t*, uint16_t*, int, int);
+template void unpack_neon<uint32_t>(const uint8_t*, uint32_t*, int, int);
+template void unpack_neon<uint64_t>(const uint8_t*, uint64_t*, int, int);
 
 #endif
 
diff --git a/cpp/src/arrow/util/bpacking_simd_internal.h b/cpp/src/arrow/util/bpacking_simd_internal.h
index 32d6f654f8d..13d4c5beb81 100644
--- a/cpp/src/arrow/util/bpacking_simd_internal.h
+++ b/cpp/src/arrow/util/bpacking_simd_internal.h
@@ -26,92 +26,94 @@ namespace arrow::internal {
 #if defined(ARROW_HAVE_NEON)
 
 template <typename Uint>
-ARROW_EXPORT int unpack_neon(const uint8_t* in, Uint* out, int batch_size, int num_bits);
+ARROW_EXPORT void unpack_neon(const uint8_t* in, Uint* out, int batch_size, int num_bits);
 
-extern template ARROW_TEMPLATE_EXPORT int unpack_neon<bool>(const uint8_t* in, bool* out,
-                                                            int batch_size, int num_bits);
+extern template ARROW_TEMPLATE_EXPORT void unpack_neon<bool>(const uint8_t* in, bool* out,
+                                                             int batch_size,
+                                                             int num_bits);
 
-extern template ARROW_TEMPLATE_EXPORT int unpack_neon<uint8_t>(const uint8_t* in,
-                                                               uint8_t* out,
-                                                               int batch_size,
-                                                               int num_bits);
-
-extern template ARROW_TEMPLATE_EXPORT int unpack_neon<uint16_t>(const uint8_t* in,
-                                                                uint16_t* out,
+extern template ARROW_TEMPLATE_EXPORT void unpack_neon<uint8_t>(const uint8_t* in,
+                                                                uint8_t* out,
                                                                 int batch_size,
                                                                 int num_bits);
 
-extern template ARROW_TEMPLATE_EXPORT int unpack_neon<uint32_t>(const uint8_t* in,
-                                                                uint32_t* out,
-                                                                int batch_size,
-                                                                int num_bits);
+extern template ARROW_TEMPLATE_EXPORT void unpack_neon<uint16_t>(const uint8_t* in,
+                                                                 uint16_t* out,
+                                                                 int batch_size,
+                                                                 int num_bits);
 
-extern template ARROW_TEMPLATE_EXPORT int unpack_neon<uint64_t>(const uint8_t* in,
-                                                                uint64_t* out,
-                                                                int batch_size,
-                                                                int num_bits);
+extern template ARROW_TEMPLATE_EXPORT void unpack_neon<uint32_t>(const uint8_t* in,
+                                                                 uint32_t* out,
+                                                                 int batch_size,
+                                                                 int num_bits);
+
+extern template ARROW_TEMPLATE_EXPORT void unpack_neon<uint64_t>(const uint8_t* in,
+                                                                 uint64_t* out,
+                                                                 int batch_size,
+                                                                 int num_bits);
 
 #endif
 
 #if defined(ARROW_HAVE_AVX2) || defined(ARROW_HAVE_RUNTIME_AVX2)
 
 template <typename Uint>
-ARROW_EXPORT int unpack_avx2(const uint8_t* in, Uint* out, int batch_size, int num_bits);
+ARROW_EXPORT void unpack_avx2(const uint8_t* in, Uint* out, int batch_size, int num_bits);
 
-extern template ARROW_TEMPLATE_EXPORT int unpack_avx2<bool>(const uint8_t* in, bool* out,
-                                                            int batch_size, int num_bits);
+extern template ARROW_TEMPLATE_EXPORT void unpack_avx2<bool>(const uint8_t* in, bool* out,
+                                                             int batch_size,
+                                                             int num_bits);
 
-extern template ARROW_TEMPLATE_EXPORT int unpack_avx2<uint8_t>(const uint8_t* in,
-                                                               uint8_t* out,
-                                                               int batch_size,
-                                                               int num_bits);
-
-extern template ARROW_TEMPLATE_EXPORT int unpack_avx2<uint16_t>(const uint8_t* in,
-                                                                uint16_t* out,
+extern template ARROW_TEMPLATE_EXPORT void unpack_avx2<uint8_t>(const uint8_t* in,
+                                                                uint8_t* out,
                                                                 int batch_size,
                                                                 int num_bits);
 
-extern template ARROW_TEMPLATE_EXPORT int unpack_avx2<uint32_t>(const uint8_t* in,
-                                                                uint32_t* out,
-                                                                int batch_size,
-                                                                int num_bits);
+extern template ARROW_TEMPLATE_EXPORT void unpack_avx2<uint16_t>(const uint8_t* in,
+                                                                 uint16_t* out,
+                                                                 int batch_size,
+                                                                 int num_bits);
 
-extern template ARROW_TEMPLATE_EXPORT int unpack_avx2<uint64_t>(const uint8_t* in,
-                                                                uint64_t* out,
-                                                                int batch_size,
-                                                                int num_bits);
+extern template ARROW_TEMPLATE_EXPORT void unpack_avx2<uint32_t>(const uint8_t* in,
+                                                                 uint32_t* out,
+                                                                 int batch_size,
+                                                                 int num_bits);
+
+extern template ARROW_TEMPLATE_EXPORT void unpack_avx2<uint64_t>(const uint8_t* in,
+                                                                 uint64_t* out,
+                                                                 int batch_size,
+                                                                 int num_bits);
 
 #endif
 
 #if defined(ARROW_HAVE_AVX512) || defined(ARROW_HAVE_RUNTIME_AVX512)
 
 template <typename Uint>
-ARROW_EXPORT int unpack_avx512(const uint8_t* in, Uint* out, int batch_size,
-                               int num_bits);
-
-extern template ARROW_TEMPLATE_EXPORT int unpack_avx512<bool>(const uint8_t* in,
-                                                              bool* out, int batch_size,
-                                                              int num_bits);
+ARROW_EXPORT void unpack_avx512(const uint8_t* in, Uint* out, int batch_size,
+                                int num_bits);
 
-extern template ARROW_TEMPLATE_EXPORT int unpack_avx512<uint8_t>(const uint8_t* in,
-                                                                 uint8_t* out,
-                                                                 int batch_size,
-                                                                 int num_bits);
+extern template ARROW_TEMPLATE_EXPORT void unpack_avx512<bool>(const uint8_t* in,
+                                                               bool* out, int batch_size,
+                                                               int num_bits);
 
-extern template ARROW_TEMPLATE_EXPORT int unpack_avx512<uint16_t>(const uint8_t* in,
-                                                                  uint16_t* out,
+extern template ARROW_TEMPLATE_EXPORT void unpack_avx512<uint8_t>(const uint8_t* in,
+                                                                  uint8_t* out,
                                                                   int batch_size,
                                                                   int num_bits);
 
-extern template ARROW_TEMPLATE_EXPORT int unpack_avx512<uint32_t>(const uint8_t* in,
-                                                                  uint32_t* out,
-                                                                  int batch_size,
-                                                                  int num_bits);
+extern template ARROW_TEMPLATE_EXPORT void unpack_avx512<uint16_t>(const uint8_t* in,
+                                                                   uint16_t* out,
+                                                                   int batch_size,
+                                                                   int num_bits);
 
-extern template ARROW_TEMPLATE_EXPORT int unpack_avx512<uint64_t>(const uint8_t* in,
-                                                                  uint64_t* out,
-                                                                  int batch_size,
-                                                                  int num_bits);
+extern template ARROW_TEMPLATE_EXPORT void unpack_avx512<uint32_t>(const uint8_t* in,
+                                                                   uint32_t* out,
+                                                                   int batch_size,
+                                                                   int num_bits);
+
+extern template ARROW_TEMPLATE_EXPORT void unpack_avx512<uint64_t>(const uint8_t* in,
+                                                                   uint64_t* out,
+                                                                   int batch_size,
+                                                                   int num_bits);
 
 #endif
 
diff --git a/cpp/src/arrow/util/bpacking_test.cc b/cpp/src/arrow/util/bpacking_test.cc
index 0df17a55fa4..0a705ba3daf 100644
--- a/cpp/src/arrow/util/bpacking_test.cc
+++ b/cpp/src/arrow/util/bpacking_test.cc
@@ -25,7 +25,6 @@
 #include "arrow/util/bpacking_internal.h"
 #include "arrow/util/bpacking_scalar_internal.h"
 #include "arrow/util/bpacking_simd_internal.h"
-#include "arrow/util/logging.h"
 
 #if defined(ARROW_HAVE_RUNTIME_AVX2)
 #  include "arrow/util/cpu_info.h"
@@ -34,7 +33,7 @@
 namespace arrow::internal {
 
 template <typename Int>
-using UnpackFunc = int (*)(const uint8_t*, Int*, int, int);
+using UnpackFunc = void (*)(const uint8_t*, Int*, int, int);
 
 /// Get the number of bytes associate with a packing.
 int32_t GetNumBytes(int32_t num_values, int32_t bit_width) {
@@ -58,11 +57,9 @@ std::vector<Int> UnpackValues(const uint8_t* packed, int32_t num_values,
                               int32_t bit_width, UnpackFunc<Int> unpack) {
   // Using dynamic array to avoid std::vector<bool>
   auto buffer = std::make_unique<Int[]>(num_values);
-  int values_read = unpack(packed, buffer.get(), num_values, bit_width);
-  ARROW_DCHECK_GE(values_read, 0);
-  EXPECT_LE(values_read, num_values);
+  unpack(packed, buffer.get(), num_values, bit_width);
 
-  return std::vector<Int>(buffer.get(), buffer.get() + values_read);
+  return std::vector<Int>(buffer.get(), buffer.get() + num_values);
 }
 
 /// Use BitWriter to pack values into a vector.

From 532dd0ac88e39c8ee3a204556ed9dd9ef5171a44 Mon Sep 17 00:00:00 2001
From: AntoinePrv <AntoinePrv@users.noreply.github.com>
Date: Tue, 21 Oct 2025 10:53:20 +0200
Subject: [PATCH 04/12] Adapt unpack_epilog for prolog

---
 .../arrow/util/bpacking_dispatch_internal.h   | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/cpp/src/arrow/util/bpacking_dispatch_internal.h b/cpp/src/arrow/util/bpacking_dispatch_internal.h
index cb204cf9cb0..c0aba8cd583 100644
--- a/cpp/src/arrow/util/bpacking_dispatch_internal.h
+++ b/cpp/src/arrow/util/bpacking_dispatch_internal.h
@@ -80,8 +80,8 @@ using SpreadBufferUint = std::conditional_t<
 
 /// Unpack integers.
 /// This function works for all input batch sizes but is not the fastest.
-template <int kPackedBitWidth, typename Uint>
-int unpack_epilog(const uint8_t* in, Uint* out, int batch_size) {
+template <int kPackedBitWidth, bool kBreakWhenAligned, typename Uint>
+int unpack_exact(const uint8_t* in, Uint* out, int batch_size, int bit_offset = 0) {
   constexpr int kMaxSpreadBytes = PackedMaxSpreadBytes(kPackedBitWidth);
   using buffer_uint = SpreadBufferUint<kMaxSpreadBytes>;
   constexpr int kBufferSize = sizeof(buffer_uint);
@@ -92,9 +92,13 @@ int unpack_epilog(const uint8_t* in, Uint* out, int batch_size) {
   constexpr buffer_uint kLowMask =
       bit_util::LeastSignificantBitMaskInc<buffer_uint>(kPackedBitWidth);
 
+  ARROW_DCHECK_GE(bit_offset, 0);
+  ARROW_DCHECK_LE(bit_offset, 8);
+
   // Looping over values one by one
-  for (int k = 0; k < batch_size; ++k) {
-    const int start_bit = k * kPackedBitWidth;
+  const int start_bit_term = batch_size * kPackedBitWidth + bit_offset;
+  int start_bit = bit_offset;
+  while ((start_bit < start_bit_term) && (!kBreakWhenAligned || (start_bit % 8 != 0))) {
     const int start_byte = start_bit / 8;
     const int spread_bytes = ((start_bit + kPackedBitWidth - 1) / 8) - start_byte + 1;
     ARROW_COMPILER_ASSUME(spread_bytes <= kMaxSpreadBytes);
@@ -127,7 +131,9 @@ int unpack_epilog(const uint8_t* in, Uint* out, int batch_size) {
       }
     }
 
-    out[k] = val;
+    *out = val;
+    out++;
+    start_bit += kPackedBitWidth;
   }
 
   return batch_size;
@@ -154,7 +160,8 @@ void unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size) {
   const auto epilog_size = batch_size - num_loops * kValuesUnpacked;
   ARROW_COMPILER_ASSUME(epilog_size < kValuesUnpacked);
   ARROW_COMPILER_ASSUME(epilog_size >= 0);
-  unpack_epilog<kPackedBitWidth>(in, out + num_loops * kValuesUnpacked, epilog_size);
+  unpack_exact<kPackedBitWidth, false>(in, out + num_loops * kValuesUnpacked, epilog_size,
+                                       0);
 }
 
 template <template <typename, int> typename Unpacker, typename UnpackedUint>

From 2376c6cc6c912ae29e4d87c359ebba6a7e56db45 Mon Sep 17 00:00:00 2001
From: AntoinePrv <AntoinePrv@users.noreply.github.com>
Date: Tue, 21 Oct 2025 11:10:14 +0200
Subject: [PATCH 05/12] Add bit_offset parameter to unpack functions

---
 cpp/src/arrow/util/bpacking.cc                | 16 +--
 cpp/src/arrow/util/bpacking_benchmark.cc      |  4 +-
 .../arrow/util/bpacking_dispatch_internal.h   |  4 +-
 cpp/src/arrow/util/bpacking_internal.h        | 14 +--
 cpp/src/arrow/util/bpacking_scalar.cc         | 15 +--
 cpp/src/arrow/util/bpacking_scalar_internal.h | 37 +++----
 cpp/src/arrow/util/bpacking_simd_avx2.cc      | 15 +--
 cpp/src/arrow/util/bpacking_simd_avx512.cc    | 15 +--
 cpp/src/arrow/util/bpacking_simd_default.cc   | 15 +--
 cpp/src/arrow/util/bpacking_simd_internal.h   | 99 ++++++++-----------
 cpp/src/arrow/util/bpacking_test.cc           |  4 +-
 11 files changed, 108 insertions(+), 130 deletions(-)

diff --git a/cpp/src/arrow/util/bpacking.cc b/cpp/src/arrow/util/bpacking.cc
index 95990dc1866..fdb1c5a52ac 100644
--- a/cpp/src/arrow/util/bpacking.cc
+++ b/cpp/src/arrow/util/bpacking.cc
@@ -50,19 +50,19 @@ struct UnpackDynamicFunction {
 }  // namespace
 
 template <typename Uint>
-void unpack(const uint8_t* in, Uint* out, int batch_size, int num_bits) {
+void unpack(const uint8_t* in, Uint* out, int batch_size, int num_bits, int bit_offset) {
 #if defined(ARROW_HAVE_NEON)
-  return unpack_neon(in, out, batch_size, num_bits);
+  return unpack_neon(in, out, batch_size, num_bits, bit_offset);
 #else
   static DynamicDispatch<UnpackDynamicFunction<Uint> > dispatch;
-  return dispatch.func(in, out, batch_size, num_bits);
+  return dispatch.func(in, out, batch_size, num_bits, bit_offset);
 #endif
 }
 
-template void unpack<bool>(const uint8_t*, bool*, int, int);
-template void unpack<uint8_t>(const uint8_t*, uint8_t*, int, int);
-template void unpack<uint16_t>(const uint8_t*, uint16_t*, int, int);
-template void unpack<uint32_t>(const uint8_t*, uint32_t*, int, int);
-template void unpack<uint64_t>(const uint8_t*, uint64_t*, int, int);
+template void unpack<bool>(const uint8_t*, bool*, int, int, int);
+template void unpack<uint8_t>(const uint8_t*, uint8_t*, int, int, int);
+template void unpack<uint16_t>(const uint8_t*, uint16_t*, int, int, int);
+template void unpack<uint32_t>(const uint8_t*, uint32_t*, int, int, int);
+template void unpack<uint64_t>(const uint8_t*, uint64_t*, int, int, int);
 
 }  // namespace arrow::internal
diff --git a/cpp/src/arrow/util/bpacking_benchmark.cc b/cpp/src/arrow/util/bpacking_benchmark.cc
index 82804d8efd8..7094656eeef 100644
--- a/cpp/src/arrow/util/bpacking_benchmark.cc
+++ b/cpp/src/arrow/util/bpacking_benchmark.cc
@@ -33,7 +33,7 @@ namespace arrow::internal {
 namespace {
 
 template <typename Int>
-using UnpackFunc = void (*)(const uint8_t*, Int*, int, int);
+using UnpackFunc = void (*)(const uint8_t*, Int*, int, int, int);
 
 /// Get the number of bytes associate with a packing.
 constexpr int32_t GetNumBytes(int32_t num_values, int32_t bit_width) {
@@ -89,7 +89,7 @@ void BM_Unpack(benchmark::State& state, bool aligned, UnpackFunc<Int> unpack, bo
   std::vector<Int> unpacked(num_values, 0);
 
   for (auto _ : state) {
-    unpack(packed_ptr, unpacked.data(), num_values, bit_width);
+    unpack(packed_ptr, unpacked.data(), num_values, bit_width, /* bit_offset = */ 0);
     benchmark::ClobberMemory();
   }
   state.SetItemsProcessed(num_values * state.iterations());
diff --git a/cpp/src/arrow/util/bpacking_dispatch_internal.h b/cpp/src/arrow/util/bpacking_dispatch_internal.h
index c0aba8cd583..57ce127ec72 100644
--- a/cpp/src/arrow/util/bpacking_dispatch_internal.h
+++ b/cpp/src/arrow/util/bpacking_dispatch_internal.h
@@ -81,7 +81,7 @@ using SpreadBufferUint = std::conditional_t<
 /// Unpack integers.
 /// This function works for all input batch sizes but is not the fastest.
 template <int kPackedBitWidth, bool kBreakWhenAligned, typename Uint>
-int unpack_exact(const uint8_t* in, Uint* out, int batch_size, int bit_offset = 0) {
+int unpack_exact(const uint8_t* in, Uint* out, int batch_size, int bit_offset) {
   constexpr int kMaxSpreadBytes = PackedMaxSpreadBytes(kPackedBitWidth);
   using buffer_uint = SpreadBufferUint<kMaxSpreadBytes>;
   constexpr int kBufferSize = sizeof(buffer_uint);
@@ -166,7 +166,7 @@ void unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size) {
 
 template <template <typename, int> typename Unpacker, typename UnpackedUint>
 static void unpack_jump(const uint8_t* in, UnpackedUint* out, int batch_size,
-                        int num_bits) {
+                        int num_bits, int bit_offset) {
   if constexpr (std::is_same_v<UnpackedUint, bool>) {
     switch (num_bits) {
       case 0:
diff --git a/cpp/src/arrow/util/bpacking_internal.h b/cpp/src/arrow/util/bpacking_internal.h
index a2e338b7d7c..dd2a7e3ce93 100644
--- a/cpp/src/arrow/util/bpacking_internal.h
+++ b/cpp/src/arrow/util/bpacking_internal.h
@@ -24,25 +24,27 @@
 namespace arrow::internal {
 
 template <typename Uint>
-ARROW_EXPORT void unpack(const uint8_t* in, Uint* out, int batch_size, int num_bits);
+ARROW_EXPORT void unpack(const uint8_t* in, Uint* out, int batch_size, int num_bits,
+                         int bit_offset = 0);
 
 extern template ARROW_TEMPLATE_EXPORT void unpack<bool>(const uint8_t* in, bool* out,
-                                                        int batch_size, int num_bits);
+                                                        int batch_size, int num_bits,
+                                                        int bit_offset);
 
 extern template ARROW_TEMPLATE_EXPORT void unpack<uint8_t>(const uint8_t* in,
                                                            uint8_t* out, int batch_size,
-                                                           int num_bits);
+                                                           int num_bits, int bit_offset);
 
 extern template ARROW_TEMPLATE_EXPORT void unpack<uint16_t>(const uint8_t* in,
                                                             uint16_t* out, int batch_size,
-                                                            int num_bits);
+                                                            int num_bits, int bit_offset);
 
 extern template ARROW_TEMPLATE_EXPORT void unpack<uint32_t>(const uint8_t* in,
                                                             uint32_t* out, int batch_size,
-                                                            int num_bits);
+                                                            int num_bits, int bit_offset);
 
 extern template ARROW_TEMPLATE_EXPORT void unpack<uint64_t>(const uint8_t* in,
                                                             uint64_t* out, int batch_size,
-                                                            int num_bits);
+                                                            int num_bits, int bit_offset);
 
 }  // namespace arrow::internal
diff --git a/cpp/src/arrow/util/bpacking_scalar.cc b/cpp/src/arrow/util/bpacking_scalar.cc
index abd88b4ad20..b4265379335 100644
--- a/cpp/src/arrow/util/bpacking_scalar.cc
+++ b/cpp/src/arrow/util/bpacking_scalar.cc
@@ -22,14 +22,15 @@
 namespace arrow::internal {
 
 template <typename Uint>
-void unpack_scalar(const uint8_t* in, Uint* out, int batch_size, int num_bits) {
-  return unpack_jump<ScalarUnpackerForWidth>(in, out, batch_size, num_bits);
+void unpack_scalar(const uint8_t* in, Uint* out, int batch_size, int num_bits,
+                   int bit_offset) {
+  return unpack_jump<ScalarUnpackerForWidth>(in, out, batch_size, num_bits, bit_offset);
 }
 
-template void unpack_scalar<bool>(const uint8_t*, bool*, int, int);
-template void unpack_scalar<uint8_t>(const uint8_t*, uint8_t*, int, int);
-template void unpack_scalar<uint16_t>(const uint8_t*, uint16_t*, int, int);
-template void unpack_scalar<uint32_t>(const uint8_t*, uint32_t*, int, int);
-template void unpack_scalar<uint64_t>(const uint8_t*, uint64_t*, int, int);
+template void unpack_scalar<bool>(const uint8_t*, bool*, int, int, int);
+template void unpack_scalar<uint8_t>(const uint8_t*, uint8_t*, int, int, int);
+template void unpack_scalar<uint16_t>(const uint8_t*, uint16_t*, int, int, int);
+template void unpack_scalar<uint32_t>(const uint8_t*, uint32_t*, int, int, int);
+template void unpack_scalar<uint64_t>(const uint8_t*, uint64_t*, int, int, int);
 
 }  // namespace arrow::internal
diff --git a/cpp/src/arrow/util/bpacking_scalar_internal.h b/cpp/src/arrow/util/bpacking_scalar_internal.h
index bb6e062e046..7885f68e7b3 100644
--- a/cpp/src/arrow/util/bpacking_scalar_internal.h
+++ b/cpp/src/arrow/util/bpacking_scalar_internal.h
@@ -25,30 +25,23 @@ namespace arrow::internal {
 
 template <typename Uint>
 ARROW_EXPORT void unpack_scalar(const uint8_t* in, Uint* out, int batch_size,
-                                int num_bits);
+                                int num_bits, int bit_offset);
 
 extern template ARROW_TEMPLATE_EXPORT void unpack_scalar<bool>(const uint8_t* in,
                                                                bool* out, int batch_size,
-                                                               int num_bits);
-
-extern template ARROW_TEMPLATE_EXPORT void unpack_scalar<uint8_t>(const uint8_t* in,
-                                                                  uint8_t* out,
-                                                                  int batch_size,
-                                                                  int num_bits);
-
-extern template ARROW_TEMPLATE_EXPORT void unpack_scalar<uint16_t>(const uint8_t* in,
-                                                                   uint16_t* out,
-                                                                   int batch_size,
-                                                                   int num_bits);
-
-extern template ARROW_TEMPLATE_EXPORT void unpack_scalar<uint32_t>(const uint8_t* in,
-                                                                   uint32_t* out,
-                                                                   int batch_size,
-                                                                   int num_bits);
-
-extern template ARROW_TEMPLATE_EXPORT void unpack_scalar<uint64_t>(const uint8_t* in,
-                                                                   uint64_t* out,
-                                                                   int batch_size,
-                                                                   int num_bits);
+                                                               int num_bits,
+                                                               int bit_offset);
+
+extern template ARROW_TEMPLATE_EXPORT void unpack_scalar<uint8_t>(
+    const uint8_t* in, uint8_t* out, int batch_size, int num_bits, int bit_offset);
+
+extern template ARROW_TEMPLATE_EXPORT void unpack_scalar<uint16_t>(
+    const uint8_t* in, uint16_t* out, int batch_size, int num_bits, int bit_offset);
+
+extern template ARROW_TEMPLATE_EXPORT void unpack_scalar<uint32_t>(
+    const uint8_t* in, uint32_t* out, int batch_size, int num_bits, int bit_offset);
+
+extern template ARROW_TEMPLATE_EXPORT void unpack_scalar<uint64_t>(
+    const uint8_t* in, uint64_t* out, int batch_size, int num_bits, int bit_offset);
 
 }  // namespace arrow::internal
diff --git a/cpp/src/arrow/util/bpacking_simd_avx2.cc b/cpp/src/arrow/util/bpacking_simd_avx2.cc
index aebdbcda755..8261cdadf25 100644
--- a/cpp/src/arrow/util/bpacking_simd_avx2.cc
+++ b/cpp/src/arrow/util/bpacking_simd_avx2.cc
@@ -22,14 +22,15 @@
 namespace arrow::internal {
 
 template <typename Uint>
-void unpack_avx2(const uint8_t* in, Uint* out, int batch_size, int num_bits) {
-  return unpack_jump<Simd256UnpackerForWidth>(in, out, batch_size, num_bits);
+void unpack_avx2(const uint8_t* in, Uint* out, int batch_size, int num_bits,
+                 int bit_offset) {
+  return unpack_jump<Simd256UnpackerForWidth>(in, out, batch_size, num_bits, bit_offset);
 }
 
-template void unpack_avx2<bool>(const uint8_t*, bool*, int, int);
-template void unpack_avx2<uint8_t>(const uint8_t*, uint8_t*, int, int);
-template void unpack_avx2<uint16_t>(const uint8_t*, uint16_t*, int, int);
-template void unpack_avx2<uint32_t>(const uint8_t*, uint32_t*, int, int);
-template void unpack_avx2<uint64_t>(const uint8_t*, uint64_t*, int, int);
+template void unpack_avx2<bool>(const uint8_t*, bool*, int, int, int);
+template void unpack_avx2<uint8_t>(const uint8_t*, uint8_t*, int, int, int);
+template void unpack_avx2<uint16_t>(const uint8_t*, uint16_t*, int, int, int);
+template void unpack_avx2<uint32_t>(const uint8_t*, uint32_t*, int, int, int);
+template void unpack_avx2<uint64_t>(const uint8_t*, uint64_t*, int, int, int);
 
 }  // namespace arrow::internal
diff --git a/cpp/src/arrow/util/bpacking_simd_avx512.cc b/cpp/src/arrow/util/bpacking_simd_avx512.cc
index 2ca3e1e709c..76b102169ad 100644
--- a/cpp/src/arrow/util/bpacking_simd_avx512.cc
+++ b/cpp/src/arrow/util/bpacking_simd_avx512.cc
@@ -22,14 +22,15 @@
 namespace arrow::internal {
 
 template <typename Uint>
-void unpack_avx512(const uint8_t* in, Uint* out, int batch_size, int num_bits) {
-  return unpack_jump<Simd512UnpackerForWidth>(in, out, batch_size, num_bits);
+void unpack_avx512(const uint8_t* in, Uint* out, int batch_size, int num_bits,
+                   int bit_offset) {
+  return unpack_jump<Simd512UnpackerForWidth>(in, out, batch_size, num_bits, bit_offset);
 }
 
-template void unpack_avx512<bool>(const uint8_t*, bool*, int, int);
-template void unpack_avx512<uint8_t>(const uint8_t*, uint8_t*, int, int);
-template void unpack_avx512<uint16_t>(const uint8_t*, uint16_t*, int, int);
-template void unpack_avx512<uint32_t>(const uint8_t*, uint32_t*, int, int);
-template void unpack_avx512<uint64_t>(const uint8_t*, uint64_t*, int, int);
+template void unpack_avx512<bool>(const uint8_t*, bool*, int, int, int);
+template void unpack_avx512<uint8_t>(const uint8_t*, uint8_t*, int, int, int);
+template void unpack_avx512<uint16_t>(const uint8_t*, uint16_t*, int, int, int);
+template void unpack_avx512<uint32_t>(const uint8_t*, uint32_t*, int, int, int);
+template void unpack_avx512<uint64_t>(const uint8_t*, uint64_t*, int, int, int);
 
 }  // namespace arrow::internal
diff --git a/cpp/src/arrow/util/bpacking_simd_default.cc b/cpp/src/arrow/util/bpacking_simd_default.cc
index 72032ffbbf0..e9af823807d 100644
--- a/cpp/src/arrow/util/bpacking_simd_default.cc
+++ b/cpp/src/arrow/util/bpacking_simd_default.cc
@@ -26,15 +26,16 @@ namespace arrow::internal {
 #if defined(ARROW_HAVE_NEON)
 
 template <typename Uint>
-void unpack_neon(const uint8_t* in, Uint* out, int batch_size, int num_bits) {
-  return unpack_jump<Simd128UnpackerForWidth>(in, out, batch_size, num_bits);
+void unpack_neon(const uint8_t* in, Uint* out, int batch_size, int num_bits,
+                 int bit_offset) {
+  return unpack_jump<Simd128UnpackerForWidth>(in, out, batch_size, num_bits, bit_offset);
 }
 
-template void unpack_neon<bool>(const uint8_t*, bool*, int, int);
-template void unpack_neon<uint8_t>(const uint8_t*, uint8_t*, int, int);
-template void unpack_neon<uint16_t>(const uint8_t*, uint16_t*, int, int);
-template void unpack_neon<uint32_t>(const uint8_t*, uint32_t*, int, int);
-template void unpack_neon<uint64_t>(const uint8_t*, uint64_t*, int, int);
+template void unpack_neon<bool>(const uint8_t*, bool*, int, int, int);
+template void unpack_neon<uint8_t>(const uint8_t*, uint8_t*, int, int, int);
+template void unpack_neon<uint16_t>(const uint8_t*, uint16_t*, int, int, int);
+template void unpack_neon<uint32_t>(const uint8_t*, uint32_t*, int, int, int);
+template void unpack_neon<uint64_t>(const uint8_t*, uint64_t*, int, int, int);
 
 #endif
 
diff --git a/cpp/src/arrow/util/bpacking_simd_internal.h b/cpp/src/arrow/util/bpacking_simd_internal.h
index 13d4c5beb81..755b001140b 100644
--- a/cpp/src/arrow/util/bpacking_simd_internal.h
+++ b/cpp/src/arrow/util/bpacking_simd_internal.h
@@ -26,62 +26,48 @@ namespace arrow::internal {
 #if defined(ARROW_HAVE_NEON)
 
 template <typename Uint>
-ARROW_EXPORT void unpack_neon(const uint8_t* in, Uint* out, int batch_size, int num_bits);
+ARROW_EXPORT void unpack_neon(const uint8_t* in, Uint* out, int batch_size, int num_bits,
+                              int bit_offset);
 
 extern template ARROW_TEMPLATE_EXPORT void unpack_neon<bool>(const uint8_t* in, bool* out,
-                                                             int batch_size,
-                                                             int num_bits);
+                                                             int batch_size, int num_bits,
+                                                             int bit_offset);
 
-extern template ARROW_TEMPLATE_EXPORT void unpack_neon<uint8_t>(const uint8_t* in,
-                                                                uint8_t* out,
-                                                                int batch_size,
-                                                                int num_bits);
+extern template ARROW_TEMPLATE_EXPORT void unpack_neon<uint8_t>(
+    const uint8_t* in, uint8_t* out, int batch_size, int num_bits, int bit_offset);
 
-extern template ARROW_TEMPLATE_EXPORT void unpack_neon<uint16_t>(const uint8_t* in,
-                                                                 uint16_t* out,
-                                                                 int batch_size,
-                                                                 int num_bits);
+extern template ARROW_TEMPLATE_EXPORT void unpack_neon<uint16_t>(
+    const uint8_t* in, uint16_t* out, int batch_size, int num_bits, int bit_offset);
 
-extern template ARROW_TEMPLATE_EXPORT void unpack_neon<uint32_t>(const uint8_t* in,
-                                                                 uint32_t* out,
-                                                                 int batch_size,
-                                                                 int num_bits);
+extern template ARROW_TEMPLATE_EXPORT void unpack_neon<uint32_t>(
+    const uint8_t* in, uint32_t* out, int batch_size, int num_bits, int bit_offset);
 
-extern template ARROW_TEMPLATE_EXPORT void unpack_neon<uint64_t>(const uint8_t* in,
-                                                                 uint64_t* out,
-                                                                 int batch_size,
-                                                                 int num_bits);
+extern template ARROW_TEMPLATE_EXPORT void unpack_neon<uint64_t>(
+    const uint8_t* in, uint64_t* out, int batch_size, int num_bits, int bit_offset);
 
 #endif
 
 #if defined(ARROW_HAVE_AVX2) || defined(ARROW_HAVE_RUNTIME_AVX2)
 
 template <typename Uint>
-ARROW_EXPORT void unpack_avx2(const uint8_t* in, Uint* out, int batch_size, int num_bits);
+ARROW_EXPORT void unpack_avx2(const uint8_t* in, Uint* out, int batch_size, int num_bits,
+                              int bit_offset);
 
 extern template ARROW_TEMPLATE_EXPORT void unpack_avx2<bool>(const uint8_t* in, bool* out,
-                                                             int batch_size,
-                                                             int num_bits);
+                                                             int batch_size, int num_bits,
+                                                             int bit_offset);
 
-extern template ARROW_TEMPLATE_EXPORT void unpack_avx2<uint8_t>(const uint8_t* in,
-                                                                uint8_t* out,
-                                                                int batch_size,
-                                                                int num_bits);
+extern template ARROW_TEMPLATE_EXPORT void unpack_avx2<uint8_t>(
+    const uint8_t* in, uint8_t* out, int batch_size, int num_bits, int bit_offset);
 
-extern template ARROW_TEMPLATE_EXPORT void unpack_avx2<uint16_t>(const uint8_t* in,
-                                                                 uint16_t* out,
-                                                                 int batch_size,
-                                                                 int num_bits);
+extern template ARROW_TEMPLATE_EXPORT void unpack_avx2<uint16_t>(
+    const uint8_t* in, uint16_t* out, int batch_size, int num_bits, int bit_offset);
 
-extern template ARROW_TEMPLATE_EXPORT void unpack_avx2<uint32_t>(const uint8_t* in,
-                                                                 uint32_t* out,
-                                                                 int batch_size,
-                                                                 int num_bits);
+extern template ARROW_TEMPLATE_EXPORT void unpack_avx2<uint32_t>(
+    const uint8_t* in, uint32_t* out, int batch_size, int num_bits, int bit_offset);
 
-extern template ARROW_TEMPLATE_EXPORT void unpack_avx2<uint64_t>(const uint8_t* in,
-                                                                 uint64_t* out,
-                                                                 int batch_size,
-                                                                 int num_bits);
+extern template ARROW_TEMPLATE_EXPORT void unpack_avx2<uint64_t>(
+    const uint8_t* in, uint64_t* out, int batch_size, int num_bits, int bit_offset);
 
 #endif
 
@@ -89,31 +75,24 @@ extern template ARROW_TEMPLATE_EXPORT void unpack_avx2<uint64_t>(const uint8_t*
 
 template <typename Uint>
 ARROW_EXPORT void unpack_avx512(const uint8_t* in, Uint* out, int batch_size,
-                                int num_bits);
+                                int num_bits, int bit_offset);
 
 extern template ARROW_TEMPLATE_EXPORT void unpack_avx512<bool>(const uint8_t* in,
                                                                bool* out, int batch_size,
-                                                               int num_bits);
-
-extern template ARROW_TEMPLATE_EXPORT void unpack_avx512<uint8_t>(const uint8_t* in,
-                                                                  uint8_t* out,
-                                                                  int batch_size,
-                                                                  int num_bits);
-
-extern template ARROW_TEMPLATE_EXPORT void unpack_avx512<uint16_t>(const uint8_t* in,
-                                                                   uint16_t* out,
-                                                                   int batch_size,
-                                                                   int num_bits);
-
-extern template ARROW_TEMPLATE_EXPORT void unpack_avx512<uint32_t>(const uint8_t* in,
-                                                                   uint32_t* out,
-                                                                   int batch_size,
-                                                                   int num_bits);
-
-extern template ARROW_TEMPLATE_EXPORT void unpack_avx512<uint64_t>(const uint8_t* in,
-                                                                   uint64_t* out,
-                                                                   int batch_size,
-                                                                   int num_bits);
+                                                               int num_bits,
+                                                               int bit_offset);
+
+extern template ARROW_TEMPLATE_EXPORT void unpack_avx512<uint8_t>(
+    const uint8_t* in, uint8_t* out, int batch_size, int num_bits, int bit_offset);
+
+extern template ARROW_TEMPLATE_EXPORT void unpack_avx512<uint16_t>(
+    const uint8_t* in, uint16_t* out, int batch_size, int num_bits, int bit_offset);
+
+extern template ARROW_TEMPLATE_EXPORT void unpack_avx512<uint32_t>(
+    const uint8_t* in, uint32_t* out, int batch_size, int num_bits, int bit_offset);
+
+extern template ARROW_TEMPLATE_EXPORT void unpack_avx512<uint64_t>(
+    const uint8_t* in, uint64_t* out, int batch_size, int num_bits, int bit_offset);
 
 #endif
 
diff --git a/cpp/src/arrow/util/bpacking_test.cc b/cpp/src/arrow/util/bpacking_test.cc
index 0a705ba3daf..c7bafbdd426 100644
--- a/cpp/src/arrow/util/bpacking_test.cc
+++ b/cpp/src/arrow/util/bpacking_test.cc
@@ -33,7 +33,7 @@
 namespace arrow::internal {
 
 template <typename Int>
-using UnpackFunc = void (*)(const uint8_t*, Int*, int, int);
+using UnpackFunc = void (*)(const uint8_t*, Int*, int, int, int);
 
 /// Get the number of bytes associate with a packing.
 int32_t GetNumBytes(int32_t num_values, int32_t bit_width) {
@@ -57,7 +57,7 @@ std::vector<Int> UnpackValues(const uint8_t* packed, int32_t num_values,
                               int32_t bit_width, UnpackFunc<Int> unpack) {
   // Using dynamic array to avoid std::vector<bool>
   auto buffer = std::make_unique<Int[]>(num_values);
-  unpack(packed, buffer.get(), num_values, bit_width);
+  unpack(packed, buffer.get(), num_values, bit_width, /* bit_offset = */ 0);
 
   return std::vector<Int>(buffer.get(), buffer.get() + num_values);
 }

From 3cabfbee616515a08a542bff51339dcf24681e46 Mon Sep 17 00:00:00 2001
From: AntoinePrv <AntoinePrv@users.noreply.github.com>
Date: Tue, 21 Oct 2025 13:43:37 +0200
Subject: [PATCH 06/12] Add unpack prolog

---
 .../arrow/util/bit_stream_utils_internal.h    |  89 +----
 .../arrow/util/bpacking_dispatch_internal.h   | 344 ++++++++++--------
 cpp/src/arrow/util/bpacking_test.cc           | 138 +++----
 3 files changed, 283 insertions(+), 288 deletions(-)

diff --git a/cpp/src/arrow/util/bit_stream_utils_internal.h b/cpp/src/arrow/util/bit_stream_utils_internal.h
index 543df6d9e79..e070f39c9e0 100644
--- a/cpp/src/arrow/util/bit_stream_utils_internal.h
+++ b/cpp/src/arrow/util/bit_stream_utils_internal.h
@@ -19,7 +19,6 @@
 
 #pragma once
 
-#include <algorithm>
 #include <cstdint>
 #include <cstring>
 #include <type_traits>
@@ -249,98 +248,36 @@ inline bool BitWriter::PutAligned(T val, int num_bytes) {
   return true;
 }
 
-namespace detail {
-
-template <typename T>
-inline void GetValue_(int num_bits, T* v, int max_bytes, const uint8_t* buffer,
-                      int* bit_offset, int* byte_offset, uint64_t* buffered_values) {
-#ifdef _MSC_VER
-#  pragma warning(push)
-#  pragma warning(disable : 4800)
-#endif
-  *v = static_cast<T>(bit_util::TrailingBits(*buffered_values, *bit_offset + num_bits) >>
-                      *bit_offset);
-#ifdef _MSC_VER
-#  pragma warning(pop)
-#endif
-  *bit_offset += num_bits;
-  if (*bit_offset >= 64) {
-    *byte_offset += 8;
-    *bit_offset -= 64;
-
-    *buffered_values =
-        detail::ReadLittleEndianWord(buffer + *byte_offset, max_bytes - *byte_offset);
-#ifdef _MSC_VER
-#  pragma warning(push)
-#  pragma warning(disable : 4800 4805)
-#endif
-    // Read bits of v that crossed into new buffered_values_
-    if (ARROW_PREDICT_TRUE(num_bits - *bit_offset < static_cast<int>(8 * sizeof(T)))) {
-      // if shift exponent(num_bits - *bit_offset) is not less than sizeof(T), *v will not
-      // change and the following code may cause a runtime error that the shift exponent
-      // is too large
-      *v = *v | static_cast<T>(bit_util::TrailingBits(*buffered_values, *bit_offset)
-                               << (num_bits - *bit_offset));
-    }
-#ifdef _MSC_VER
-#  pragma warning(pop)
-#endif
-    ARROW_DCHECK_LE(*bit_offset, 64);
-  }
-}
-
-}  // namespace detail
-
 template <typename T>
 inline bool BitReader::GetValue(int num_bits, T* v) {
   return GetBatch(num_bits, v, 1) == 1;
 }
 
-namespace internal_bit_reader {
-template <typename T>
-struct unpack_detect {
-  using type = std::make_unsigned_t<T>;
-};
-
-template <>
-struct unpack_detect<bool> {
-  using type = bool;
-};
-}  // namespace internal_bit_reader
-
 template <typename T>
 inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {
-  ARROW_DCHECK(buffer_ != NULL);
-  ARROW_DCHECK_LE(num_bits, static_cast<int>(sizeof(T) * 8)) << "num_bits: " << num_bits;
+  constexpr uint64_t kBitsPerByte = 8;
 
-  int bit_offset = bit_offset_;
-  int byte_offset = byte_offset_;
-  uint64_t buffered_values = buffered_values_;
-  int max_bytes = max_bytes_;
-  const uint8_t* buffer = buffer_;
+  ARROW_DCHECK(buffer_ != NULLPTR);
+  ARROW_DCHECK_LE(num_bits, static_cast<int>(sizeof(T) * 8)) << "num_bits: " << num_bits;
 
   const int64_t needed_bits = num_bits * static_cast<int64_t>(batch_size);
-  constexpr uint64_t kBitsPerByte = 8;
   const int64_t remaining_bits =
-      static_cast<int64_t>(max_bytes - byte_offset) * kBitsPerByte - bit_offset;
+      static_cast<int64_t>(max_bytes_ - byte_offset_) * kBitsPerByte - bit_offset_;
   if (remaining_bits < needed_bits) {
     batch_size = static_cast<int>(remaining_bits / num_bits);
   }
 
-  int i = 0;
-  if (ARROW_PREDICT_FALSE(bit_offset != 0)) {
-    for (; i < batch_size && bit_offset != 0; ++i) {
-      detail::GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset,
-                        &buffered_values);
-    }
-  }
-
-  using unpack_t = typename internal_bit_reader::unpack_detect<T>::type;
+  if constexpr (std::is_same_v<T, bool>) {
+    ::arrow::internal::unpack(buffer_ + byte_offset_, reinterpret_cast<bool*>(v),
+                              batch_size, num_bits, bit_offset_);
 
-  ::arrow::internal::unpack(buffer + byte_offset, reinterpret_cast<unpack_t*>(v + i),
-                            batch_size - i, num_bits);
+  } else {
+    ::arrow::internal::unpack(buffer_ + byte_offset_,
+                              reinterpret_cast<std::make_unsigned_t<T>*>(v), batch_size,
+                              num_bits, bit_offset_);
+  }
 
-  this->Advance(batch_size * num_bits);
+  Advance(batch_size * num_bits);
 
   return batch_size;
 }
diff --git a/cpp/src/arrow/util/bpacking_dispatch_internal.h b/cpp/src/arrow/util/bpacking_dispatch_internal.h
index 57ce127ec72..a8b8c194d5e 100644
--- a/cpp/src/arrow/util/bpacking_dispatch_internal.h
+++ b/cpp/src/arrow/util/bpacking_dispatch_internal.h
@@ -53,19 +53,32 @@ void unpack_full(const uint8_t* in, Uint* out, int batch_size) {
 
 /// Compute the maximum spread in bytes that a packed integer can cover.
 ///
-/// This is assuming contiguous packed integer starting on a byte aligned boundary.
-/// This function is non-monotonic, for instance three bit integers will be split on the
-/// first byte boundary (hence having a spread of two bytes) while four bit integer will
-/// be well behaved and never spread over byte boundary (hence having a spread of one).
-constexpr int PackedMaxSpreadBytes(int width) {
+/// This assumes contiguous packed integers, the first of which starts at the given
+/// bit offset past a byte boundary.
+/// This function is non-monotonic: for instance, with a zero offset, three-bit integers
+/// get split across the first byte boundary (hence a spread of two bytes), while
+/// four-bit integers are well behaved and never straddle a byte boundary (hence a
+/// spread of one).
+constexpr int PackedMaxSpreadBytes(int width, int bit_offset) {
   int max = static_cast<int>(bit_util::BytesForBits(width));
-  int start = width;
-  while (start % 8 != 0) {
+  int start = bit_offset;
+  do {
     const int byte_start = start / 8;
     const int byte_end = (start + width - 1) / 8;  // inclusive end bit
     const int spread = byte_end - byte_start + 1;
     max = spread > max ? spread : max;
     start += width;
+  } while (start % 8 != bit_offset);
+  return max;
+}
+
+/// Compute the maximum spread in bytes that a packed integer can cover across all bit
+/// offsets.
+constexpr int PackedMaxSpreadBytes(int width) {
+  int max = 0;
+  for (int offset = 0; offset < 8; ++offset) {
+    const int spread = PackedMaxSpreadBytes(width, offset);
+    max = spread > max ? spread : max;
   }
   return max;
 }
@@ -80,9 +93,14 @@ using SpreadBufferUint = std::conditional_t<
 
 /// Unpack integers.
 /// This function works for all input batch sizes but is not the fastest.
-template <int kPackedBitWidth, bool kBreakWhenAligned, typename Uint>
+/// In prolog mode, instead of unpacking all required elements, the function will
+/// stop as soon as it reaches a value that starts on a byte boundary.
+template <int kPackedBitWidth, bool kIsProlog, typename Uint>
 int unpack_exact(const uint8_t* in, Uint* out, int batch_size, int bit_offset) {
-  constexpr int kMaxSpreadBytes = PackedMaxSpreadBytes(kPackedBitWidth);
+  // For the epilog we adapt the max spread since better alignment gives shorter spreads
+  ARROW_DCHECK(kIsProlog || bit_offset == 0);
+  constexpr int kMaxSpreadBytes = kIsProlog ? PackedMaxSpreadBytes(kPackedBitWidth)
+                                            : PackedMaxSpreadBytes(kPackedBitWidth, 0);
   using buffer_uint = SpreadBufferUint<kMaxSpreadBytes>;
   constexpr int kBufferSize = sizeof(buffer_uint);
   // Due to misalignment, on large bit width, the spread can be larger than the maximum
@@ -98,7 +116,7 @@ int unpack_exact(const uint8_t* in, Uint* out, int batch_size, int bit_offset) {
   // Looping over values one by one
   const int start_bit_term = batch_size * kPackedBitWidth + bit_offset;
   int start_bit = bit_offset;
-  while ((start_bit < start_bit_term) && (!kBreakWhenAligned || (start_bit % 8 != 0))) {
+  while ((start_bit < start_bit_term) && (!kIsProlog || (start_bit % 8 != 0))) {
     const int start_byte = start_bit / 8;
     const int spread_bytes = ((start_bit + kPackedBitWidth - 1) / 8) - start_byte + 1;
     ARROW_COMPILER_ASSUME(spread_bytes <= kMaxSpreadBytes);
@@ -136,7 +154,8 @@ int unpack_exact(const uint8_t* in, Uint* out, int batch_size, int bit_offset) {
     start_bit += kPackedBitWidth;
   }
 
-  return batch_size;
+  ARROW_DCHECK((start_bit - bit_offset) % kPackedBitWidth == 0);
+  return (start_bit - bit_offset) / kPackedBitWidth;
 }
 
 /// Unpack a packed array, delegating to a Unpacker struct.
@@ -148,20 +167,44 @@ int unpack_exact(const uint8_t* in, Uint* out, int batch_size, int bit_offset) {
 /// @tparam UnpackedUInt The type in which we unpack the values.
 template <int kPackedBitWidth, template <typename, int> typename Unpacker,
           typename UnpackedUInt>
-void unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size) {
-  using UnpackerForWidth = Unpacker<UnpackedUInt, kPackedBitWidth>;
-  constexpr auto kValuesUnpacked = UnpackerForWidth::kValuesUnpacked;
+void unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size, int bit_offset) {
+  if constexpr (kPackedBitWidth == 0) {
+    // Easy case to handle, simply setting memory to zero.
+    return unpack_null(in, out, batch_size);
+  } else {
+    // In case of misalignment, we need to run the prolog until aligned.
+    int extracted = unpack_exact<kPackedBitWidth, true>(in, out, batch_size, bit_offset);
+    // We either extracted everything or found an alignment
+    const int start_bit = extracted * kPackedBitWidth + bit_offset;
+    ARROW_DCHECK((extracted == batch_size) || ((start_bit) % 8 == 0));
+    batch_size -= extracted;
+    ARROW_DCHECK_GE(batch_size, 0);
+    in += start_bit / 8;
+    out += extracted;
 
-  const int num_loops = batch_size / kValuesUnpacked;
-  for (int i = 0; i < num_loops; ++i) {
-    in = UnpackerForWidth::unpack(in, out + i * kValuesUnpacked);
-  }
+    if constexpr (kPackedBitWidth == 8 * sizeof(UnpackedUInt)) {
+      // Only memcpy / static_cast
+      return unpack_full(in, out, batch_size);
+    } else {
+      using UnpackerForWidth = Unpacker<UnpackedUInt, kPackedBitWidth>;
+      constexpr auto kValuesUnpacked = UnpackerForWidth::kValuesUnpacked;
+
+      // Running the optimized kernel for batch extraction
+      const int unpacker_iter_count = batch_size / kValuesUnpacked;
+      for (int i = 0; i < unpacker_iter_count; ++i) {
+        in = UnpackerForWidth::unpack(in, out);
+        out += kValuesUnpacked;
+      }
+      batch_size -= unpacker_iter_count * kValuesUnpacked;
 
-  const auto epilog_size = batch_size - num_loops * kValuesUnpacked;
-  ARROW_COMPILER_ASSUME(epilog_size < kValuesUnpacked);
-  ARROW_COMPILER_ASSUME(epilog_size >= 0);
-  unpack_exact<kPackedBitWidth, false>(in, out + num_loops * kValuesUnpacked, epilog_size,
-                                       0);
+      // Running the epilog for the remaining values that don't fit in a kernel
+      ARROW_DCHECK_LT(batch_size, kValuesUnpacked);
+      ARROW_DCHECK_GE(batch_size, 0);
+      ARROW_COMPILER_ASSUME(batch_size < kValuesUnpacked);
+      ARROW_COMPILER_ASSUME(batch_size >= 0);
+      unpack_exact<kPackedBitWidth, false>(in, out, batch_size, /* bit_offset= */ 0);
+    }
+  }
 }
 
 template <template <typename, int> typename Unpacker, typename UnpackedUint>
@@ -170,272 +213,271 @@ static void unpack_jump(const uint8_t* in, UnpackedUint* out, int batch_size,
   if constexpr (std::is_same_v<UnpackedUint, bool>) {
     switch (num_bits) {
       case 0:
-        return unpack_null(in, out, batch_size);
+        return unpack_width<0, Unpacker>(in, out, batch_size, bit_offset);
       case 1:
-        return unpack_width<1, Unpacker>(in, out, batch_size);
+        return unpack_width<1, Unpacker>(in, out, batch_size, bit_offset);
     }
   } else if constexpr (sizeof(UnpackedUint) == 1) {
     switch (num_bits) {
       case 0:
-        return unpack_null(in, out, batch_size);
+        return unpack_width<0, Unpacker>(in, out, batch_size, bit_offset);
       case 1:
-        return unpack_width<1, Unpacker>(in, out, batch_size);
+        return unpack_width<1, Unpacker>(in, out, batch_size, bit_offset);
       case 2:
-        return unpack_width<2, Unpacker>(in, out, batch_size);
+        return unpack_width<2, Unpacker>(in, out, batch_size, bit_offset);
       case 3:
-        return unpack_width<3, Unpacker>(in, out, batch_size);
+        return unpack_width<3, Unpacker>(in, out, batch_size, bit_offset);
       case 4:
-        return unpack_width<4, Unpacker>(in, out, batch_size);
+        return unpack_width<4, Unpacker>(in, out, batch_size, bit_offset);
       case 5:
-        return unpack_width<5, Unpacker>(in, out, batch_size);
+        return unpack_width<5, Unpacker>(in, out, batch_size, bit_offset);
       case 6:
-        return unpack_width<6, Unpacker>(in, out, batch_size);
+        return unpack_width<6, Unpacker>(in, out, batch_size, bit_offset);
       case 7:
-        return unpack_width<7, Unpacker>(in, out, batch_size);
+        return unpack_width<7, Unpacker>(in, out, batch_size, bit_offset);
       case 8:
-        return unpack_full(in, out, batch_size);
+        return unpack_width<8, Unpacker>(in, out, batch_size, bit_offset);
     }
   } else if constexpr (sizeof(UnpackedUint) == 2) {
     switch (num_bits) {
       case 0:
-        return unpack_null(in, out, batch_size);
+        return unpack_width<0, Unpacker>(in, out, batch_size, bit_offset);
       case 1:
-        return unpack_width<1, Unpacker>(in, out, batch_size);
+        return unpack_width<1, Unpacker>(in, out, batch_size, bit_offset);
       case 2:
-        return unpack_width<2, Unpacker>(in, out, batch_size);
+        return unpack_width<2, Unpacker>(in, out, batch_size, bit_offset);
       case 3:
-        return unpack_width<3, Unpacker>(in, out, batch_size);
+        return unpack_width<3, Unpacker>(in, out, batch_size, bit_offset);
       case 4:
-        return unpack_width<4, Unpacker>(in, out, batch_size);
+        return unpack_width<4, Unpacker>(in, out, batch_size, bit_offset);
       case 5:
-        return unpack_width<5, Unpacker>(in, out, batch_size);
+        return unpack_width<5, Unpacker>(in, out, batch_size, bit_offset);
       case 6:
-        return unpack_width<6, Unpacker>(in, out, batch_size);
+        return unpack_width<6, Unpacker>(in, out, batch_size, bit_offset);
       case 7:
-        return unpack_width<7, Unpacker>(in, out, batch_size);
+        return unpack_width<7, Unpacker>(in, out, batch_size, bit_offset);
       case 8:
-        return unpack_width<8, Unpacker>(in, out, batch_size);
+        return unpack_width<8, Unpacker>(in, out, batch_size, bit_offset);
       case 9:
-        return unpack_width<9, Unpacker>(in, out, batch_size);
+        return unpack_width<9, Unpacker>(in, out, batch_size, bit_offset);
       case 10:
-        return unpack_width<10, Unpacker>(in, out, batch_size);
+        return unpack_width<10, Unpacker>(in, out, batch_size, bit_offset);
       case 11:
-        return unpack_width<11, Unpacker>(in, out, batch_size);
+        return unpack_width<11, Unpacker>(in, out, batch_size, bit_offset);
       case 12:
-        return unpack_width<12, Unpacker>(in, out, batch_size);
+        return unpack_width<12, Unpacker>(in, out, batch_size, bit_offset);
       case 13:
-        return unpack_width<13, Unpacker>(in, out, batch_size);
+        return unpack_width<13, Unpacker>(in, out, batch_size, bit_offset);
       case 14:
-        return unpack_width<14, Unpacker>(in, out, batch_size);
+        return unpack_width<14, Unpacker>(in, out, batch_size, bit_offset);
       case 15:
-        return unpack_width<15, Unpacker>(in, out, batch_size);
+        return unpack_width<15, Unpacker>(in, out, batch_size, bit_offset);
       case 16:
-        return unpack_full(in, out, batch_size);
+        return unpack_width<16, Unpacker>(in, out, batch_size, bit_offset);
     }
   } else if constexpr (sizeof(UnpackedUint) == 4) {
     switch (num_bits) {
       case 0:
-        return unpack_null(in, out, batch_size);
+        return unpack_width<0, Unpacker>(in, out, batch_size, bit_offset);
       case 1:
-        return unpack_width<1, Unpacker>(in, out, batch_size);
+        return unpack_width<1, Unpacker>(in, out, batch_size, bit_offset);
       case 2:
-        return unpack_width<2, Unpacker>(in, out, batch_size);
+        return unpack_width<2, Unpacker>(in, out, batch_size, bit_offset);
       case 3:
-        return unpack_width<3, Unpacker>(in, out, batch_size);
+        return unpack_width<3, Unpacker>(in, out, batch_size, bit_offset);
       case 4:
-        return unpack_width<4, Unpacker>(in, out, batch_size);
+        return unpack_width<4, Unpacker>(in, out, batch_size, bit_offset);
       case 5:
-        return unpack_width<5, Unpacker>(in, out, batch_size);
+        return unpack_width<5, Unpacker>(in, out, batch_size, bit_offset);
       case 6:
-        return unpack_width<6, Unpacker>(in, out, batch_size);
+        return unpack_width<6, Unpacker>(in, out, batch_size, bit_offset);
       case 7:
-        return unpack_width<7, Unpacker>(in, out, batch_size);
+        return unpack_width<7, Unpacker>(in, out, batch_size, bit_offset);
       case 8:
-        return unpack_width<8, Unpacker>(in, out, batch_size);
+        return unpack_width<8, Unpacker>(in, out, batch_size, bit_offset);
       case 9:
-        return unpack_width<9, Unpacker>(in, out, batch_size);
+        return unpack_width<9, Unpacker>(in, out, batch_size, bit_offset);
       case 10:
-        return unpack_width<10, Unpacker>(in, out, batch_size);
+        return unpack_width<10, Unpacker>(in, out, batch_size, bit_offset);
       case 11:
-        return unpack_width<11, Unpacker>(in, out, batch_size);
+        return unpack_width<11, Unpacker>(in, out, batch_size, bit_offset);
       case 12:
-        return unpack_width<12, Unpacker>(in, out, batch_size);
+        return unpack_width<12, Unpacker>(in, out, batch_size, bit_offset);
       case 13:
-        return unpack_width<13, Unpacker>(in, out, batch_size);
+        return unpack_width<13, Unpacker>(in, out, batch_size, bit_offset);
       case 14:
-        return unpack_width<14, Unpacker>(in, out, batch_size);
+        return unpack_width<14, Unpacker>(in, out, batch_size, bit_offset);
       case 15:
-        return unpack_width<15, Unpacker>(in, out, batch_size);
+        return unpack_width<15, Unpacker>(in, out, batch_size, bit_offset);
       case 16:
-        return unpack_width<16, Unpacker>(in, out, batch_size);
+        return unpack_width<16, Unpacker>(in, out, batch_size, bit_offset);
       case 17:
-        return unpack_width<17, Unpacker>(in, out, batch_size);
+        return unpack_width<17, Unpacker>(in, out, batch_size, bit_offset);
       case 18:
-        return unpack_width<18, Unpacker>(in, out, batch_size);
+        return unpack_width<18, Unpacker>(in, out, batch_size, bit_offset);
       case 19:
-        return unpack_width<19, Unpacker>(in, out, batch_size);
+        return unpack_width<19, Unpacker>(in, out, batch_size, bit_offset);
       case 20:
-        return unpack_width<20, Unpacker>(in, out, batch_size);
+        return unpack_width<20, Unpacker>(in, out, batch_size, bit_offset);
       case 21:
-        return unpack_width<21, Unpacker>(in, out, batch_size);
+        return unpack_width<21, Unpacker>(in, out, batch_size, bit_offset);
       case 22:
-        return unpack_width<22, Unpacker>(in, out, batch_size);
+        return unpack_width<22, Unpacker>(in, out, batch_size, bit_offset);
       case 23:
-        return unpack_width<23, Unpacker>(in, out, batch_size);
+        return unpack_width<23, Unpacker>(in, out, batch_size, bit_offset);
       case 24:
-        return unpack_width<24, Unpacker>(in, out, batch_size);
+        return unpack_width<24, Unpacker>(in, out, batch_size, bit_offset);
       case 25:
-        return unpack_width<25, Unpacker>(in, out, batch_size);
+        return unpack_width<25, Unpacker>(in, out, batch_size, bit_offset);
       case 26:
-        return unpack_width<26, Unpacker>(in, out, batch_size);
+        return unpack_width<26, Unpacker>(in, out, batch_size, bit_offset);
       case 27:
-        return unpack_width<27, Unpacker>(in, out, batch_size);
+        return unpack_width<27, Unpacker>(in, out, batch_size, bit_offset);
       case 28:
-        return unpack_width<28, Unpacker>(in, out, batch_size);
+        return unpack_width<28, Unpacker>(in, out, batch_size, bit_offset);
       case 29:
-        return unpack_width<29, Unpacker>(in, out, batch_size);
+        return unpack_width<29, Unpacker>(in, out, batch_size, bit_offset);
       case 30:
-        return unpack_width<30, Unpacker>(in, out, batch_size);
+        return unpack_width<30, Unpacker>(in, out, batch_size, bit_offset);
       case 31:
-        return unpack_width<31, Unpacker>(in, out, batch_size);
+        return unpack_width<31, Unpacker>(in, out, batch_size, bit_offset);
       case 32:
-        return unpack_full(in, out, batch_size);
+        return unpack_width<32, Unpacker>(in, out, batch_size, bit_offset);
     }
   } else if constexpr (sizeof(UnpackedUint) == 8) {
     switch (num_bits) {
       case 0:
-        return unpack_null(in, out, batch_size);
+        return unpack_width<0, Unpacker>(in, out, batch_size, bit_offset);
       case 1:
-        return unpack_width<1, Unpacker>(in, out, batch_size);
+        return unpack_width<1, Unpacker>(in, out, batch_size, bit_offset);
       case 2:
-        return unpack_width<2, Unpacker>(in, out, batch_size);
+        return unpack_width<2, Unpacker>(in, out, batch_size, bit_offset);
       case 3:
-        return unpack_width<3, Unpacker>(in, out, batch_size);
+        return unpack_width<3, Unpacker>(in, out, batch_size, bit_offset);
       case 4:
-        return unpack_width<4, Unpacker>(in, out, batch_size);
+        return unpack_width<4, Unpacker>(in, out, batch_size, bit_offset);
       case 5:
-        return unpack_width<5, Unpacker>(in, out, batch_size);
+        return unpack_width<5, Unpacker>(in, out, batch_size, bit_offset);
       case 6:
-        return unpack_width<6, Unpacker>(in, out, batch_size);
+        return unpack_width<6, Unpacker>(in, out, batch_size, bit_offset);
       case 7:
-        return unpack_width<7, Unpacker>(in, out, batch_size);
+        return unpack_width<7, Unpacker>(in, out, batch_size, bit_offset);
       case 8:
-        return unpack_width<8, Unpacker>(in, out, batch_size);
+        return unpack_width<8, Unpacker>(in, out, batch_size, bit_offset);
       case 9:
-        return unpack_width<9, Unpacker>(in, out, batch_size);
+        return unpack_width<9, Unpacker>(in, out, batch_size, bit_offset);
       case 10:
-        return unpack_width<10, Unpacker>(in, out, batch_size);
+        return unpack_width<10, Unpacker>(in, out, batch_size, bit_offset);
       case 11:
-        return unpack_width<11, Unpacker>(in, out, batch_size);
+        return unpack_width<11, Unpacker>(in, out, batch_size, bit_offset);
       case 12:
-        return unpack_width<12, Unpacker>(in, out, batch_size);
+        return unpack_width<12, Unpacker>(in, out, batch_size, bit_offset);
       case 13:
-        return unpack_width<13, Unpacker>(in, out, batch_size);
+        return unpack_width<13, Unpacker>(in, out, batch_size, bit_offset);
       case 14:
-        return unpack_width<14, Unpacker>(in, out, batch_size);
+        return unpack_width<14, Unpacker>(in, out, batch_size, bit_offset);
       case 15:
-        return unpack_width<15, Unpacker>(in, out, batch_size);
+        return unpack_width<15, Unpacker>(in, out, batch_size, bit_offset);
       case 16:
-        return unpack_width<16, Unpacker>(in, out, batch_size);
+        return unpack_width<16, Unpacker>(in, out, batch_size, bit_offset);
       case 17:
-        return unpack_width<17, Unpacker>(in, out, batch_size);
+        return unpack_width<17, Unpacker>(in, out, batch_size, bit_offset);
       case 18:
-        return unpack_width<18, Unpacker>(in, out, batch_size);
+        return unpack_width<18, Unpacker>(in, out, batch_size, bit_offset);
       case 19:
-        return unpack_width<19, Unpacker>(in, out, batch_size);
+        return unpack_width<19, Unpacker>(in, out, batch_size, bit_offset);
       case 20:
-        return unpack_width<20, Unpacker>(in, out, batch_size);
+        return unpack_width<20, Unpacker>(in, out, batch_size, bit_offset);
       case 21:
-        return unpack_width<21, Unpacker>(in, out, batch_size);
+        return unpack_width<21, Unpacker>(in, out, batch_size, bit_offset);
       case 22:
-        return unpack_width<22, Unpacker>(in, out, batch_size);
+        return unpack_width<22, Unpacker>(in, out, batch_size, bit_offset);
       case 23:
-        return unpack_width<23, Unpacker>(in, out, batch_size);
+        return unpack_width<23, Unpacker>(in, out, batch_size, bit_offset);
       case 24:
-        return unpack_width<24, Unpacker>(in, out, batch_size);
+        return unpack_width<24, Unpacker>(in, out, batch_size, bit_offset);
       case 25:
-        return unpack_width<25, Unpacker>(in, out, batch_size);
+        return unpack_width<25, Unpacker>(in, out, batch_size, bit_offset);
       case 26:
-        return unpack_width<26, Unpacker>(in, out, batch_size);
+        return unpack_width<26, Unpacker>(in, out, batch_size, bit_offset);
       case 27:
-        return unpack_width<27, Unpacker>(in, out, batch_size);
+        return unpack_width<27, Unpacker>(in, out, batch_size, bit_offset);
       case 28:
-        return unpack_width<28, Unpacker>(in, out, batch_size);
+        return unpack_width<28, Unpacker>(in, out, batch_size, bit_offset);
       case 29:
-        return unpack_width<29, Unpacker>(in, out, batch_size);
+        return unpack_width<29, Unpacker>(in, out, batch_size, bit_offset);
       case 30:
-        return unpack_width<30, Unpacker>(in, out, batch_size);
+        return unpack_width<30, Unpacker>(in, out, batch_size, bit_offset);
       case 31:
-        return unpack_width<31, Unpacker>(in, out, batch_size);
+        return unpack_width<31, Unpacker>(in, out, batch_size, bit_offset);
       case 32:
-        return unpack_width<32, Unpacker>(in, out, batch_size);
+        return unpack_width<32, Unpacker>(in, out, batch_size, bit_offset);
       case 33:
-        return unpack_width<33, Unpacker>(in, out, batch_size);
+        return unpack_width<33, Unpacker>(in, out, batch_size, bit_offset);
       case 34:
-        return unpack_width<34, Unpacker>(in, out, batch_size);
+        return unpack_width<34, Unpacker>(in, out, batch_size, bit_offset);
       case 35:
-        return unpack_width<35, Unpacker>(in, out, batch_size);
+        return unpack_width<35, Unpacker>(in, out, batch_size, bit_offset);
       case 36:
-        return unpack_width<36, Unpacker>(in, out, batch_size);
+        return unpack_width<36, Unpacker>(in, out, batch_size, bit_offset);
       case 37:
-        return unpack_width<37, Unpacker>(in, out, batch_size);
+        return unpack_width<37, Unpacker>(in, out, batch_size, bit_offset);
       case 38:
-        return unpack_width<38, Unpacker>(in, out, batch_size);
+        return unpack_width<38, Unpacker>(in, out, batch_size, bit_offset);
       case 39:
-        return unpack_width<39, Unpacker>(in, out, batch_size);
+        return unpack_width<39, Unpacker>(in, out, batch_size, bit_offset);
       case 40:
-        return unpack_width<40, Unpacker>(in, out, batch_size);
+        return unpack_width<40, Unpacker>(in, out, batch_size, bit_offset);
       case 41:
-        return unpack_width<41, Unpacker>(in, out, batch_size);
+        return unpack_width<41, Unpacker>(in, out, batch_size, bit_offset);
       case 42:
-        return unpack_width<42, Unpacker>(in, out, batch_size);
+        return unpack_width<42, Unpacker>(in, out, batch_size, bit_offset);
       case 43:
-        return unpack_width<43, Unpacker>(in, out, batch_size);
+        return unpack_width<43, Unpacker>(in, out, batch_size, bit_offset);
       case 44:
-        return unpack_width<44, Unpacker>(in, out, batch_size);
+        return unpack_width<44, Unpacker>(in, out, batch_size, bit_offset);
       case 45:
-        return unpack_width<45, Unpacker>(in, out, batch_size);
+        return unpack_width<45, Unpacker>(in, out, batch_size, bit_offset);
       case 46:
-        return unpack_width<46, Unpacker>(in, out, batch_size);
+        return unpack_width<46, Unpacker>(in, out, batch_size, bit_offset);
       case 47:
-        return unpack_width<47, Unpacker>(in, out, batch_size);
+        return unpack_width<47, Unpacker>(in, out, batch_size, bit_offset);
       case 48:
-        return unpack_width<48, Unpacker>(in, out, batch_size);
+        return unpack_width<48, Unpacker>(in, out, batch_size, bit_offset);
       case 49:
-        return unpack_width<49, Unpacker>(in, out, batch_size);
+        return unpack_width<49, Unpacker>(in, out, batch_size, bit_offset);
       case 50:
-        return unpack_width<50, Unpacker>(in, out, batch_size);
+        return unpack_width<50, Unpacker>(in, out, batch_size, bit_offset);
       case 51:
-        return unpack_width<51, Unpacker>(in, out, batch_size);
+        return unpack_width<51, Unpacker>(in, out, batch_size, bit_offset);
       case 52:
-        return unpack_width<52, Unpacker>(in, out, batch_size);
+        return unpack_width<52, Unpacker>(in, out, batch_size, bit_offset);
       case 53:
-        return unpack_width<53, Unpacker>(in, out, batch_size);
+        return unpack_width<53, Unpacker>(in, out, batch_size, bit_offset);
       case 54:
-        return unpack_width<54, Unpacker>(in, out, batch_size);
+        return unpack_width<54, Unpacker>(in, out, batch_size, bit_offset);
       case 55:
-        return unpack_width<55, Unpacker>(in, out, batch_size);
+        return unpack_width<55, Unpacker>(in, out, batch_size, bit_offset);
       case 56:
-        return unpack_width<56, Unpacker>(in, out, batch_size);
+        return unpack_width<56, Unpacker>(in, out, batch_size, bit_offset);
       case 57:
-        return unpack_width<57, Unpacker>(in, out, batch_size);
+        return unpack_width<57, Unpacker>(in, out, batch_size, bit_offset);
       case 58:
-        return unpack_width<58, Unpacker>(in, out, batch_size);
+        return unpack_width<58, Unpacker>(in, out, batch_size, bit_offset);
       case 59:
-        return unpack_width<59, Unpacker>(in, out, batch_size);
+        return unpack_width<59, Unpacker>(in, out, batch_size, bit_offset);
       case 60:
-        return unpack_width<60, Unpacker>(in, out, batch_size);
+        return unpack_width<60, Unpacker>(in, out, batch_size, bit_offset);
       case 61:
-        return unpack_width<61, Unpacker>(in, out, batch_size);
+        return unpack_width<61, Unpacker>(in, out, batch_size, bit_offset);
       case 62:
-        return unpack_width<62, Unpacker>(in, out, batch_size);
+        return unpack_width<62, Unpacker>(in, out, batch_size, bit_offset);
       case 63:
-        return unpack_width<63, Unpacker>(in, out, batch_size);
+        return unpack_width<63, Unpacker>(in, out, batch_size, bit_offset);
       case 64:
-        return unpack_full(in, out, batch_size);
+        return unpack_width<64, Unpacker>(in, out, batch_size, bit_offset);
     }
   }
-  ARROW_DCHECK(false) << "Unsupported num_bits";
-  return;
+  ARROW_DCHECK(false) << "Unsupported num_bits " << num_bits;
 }
 }  // namespace arrow::internal
diff --git a/cpp/src/arrow/util/bpacking_test.cc b/cpp/src/arrow/util/bpacking_test.cc
index c7bafbdd426..ac3a0706015 100644
--- a/cpp/src/arrow/util/bpacking_test.cc
+++ b/cpp/src/arrow/util/bpacking_test.cc
@@ -36,14 +36,15 @@ template <typename Int>
 using UnpackFunc = void (*)(const uint8_t*, Int*, int, int, int);
 
 /// Get the number of bytes associate with a packing.
-int32_t GetNumBytes(int32_t num_values, int32_t bit_width) {
-  return static_cast<int32_t>(bit_util::BytesForBits(num_values * bit_width));
+int GetNumBytes(int num_values, int bit_width, int bit_offset) {
+  return static_cast<int>(bit_util::BytesForBits(num_values * bit_width + bit_offset));
 }
 
 /// Generate random bytes as packed integers.
-std::vector<uint8_t> GenerateRandomPackedValues(int32_t num_values, int32_t bit_width) {
+std::vector<uint8_t> GenerateRandomPackedValues(int num_values, int bit_width,
+                                                int bit_offset) {
   constexpr uint32_t kSeed = 3214;
-  const auto num_bytes = GetNumBytes(num_values, bit_width);
+  const auto num_bytes = GetNumBytes(num_values, bit_width, bit_offset);
 
   std::vector<uint8_t> out(std::max(1, num_bytes));  // We need a valid pointer for size 0
   random_bytes(num_bytes, kSeed, out.data());
@@ -54,27 +55,32 @@ std::vector<uint8_t> GenerateRandomPackedValues(int32_t num_values, int32_t bit_
 /// Convenience wrapper to unpack into a vector
 template <typename Int>
 std::vector<Int> UnpackValues(const uint8_t* packed, int32_t num_values,
-                              int32_t bit_width, UnpackFunc<Int> unpack) {
+                              int32_t bit_width, int32_t bit_offset,
+                              UnpackFunc<Int> unpack) {
   // Using dynamic array to avoid std::vector<bool>
   auto buffer = std::make_unique<Int[]>(num_values);
-  unpack(packed, buffer.get(), num_values, bit_width, /* bit_offset = */ 0);
+  unpack(packed, buffer.get(), num_values, bit_width, bit_offset);
 
   return std::vector<Int>(buffer.get(), buffer.get() + num_values);
 }
 
 /// Use BitWriter to pack values into a vector.
 template <typename Int>
-std::vector<uint8_t> PackValues(const std::vector<Int>& values, int32_t num_values,
-                                int32_t bit_width) {
-  const auto num_bytes = GetNumBytes(num_values, bit_width);
+std::vector<uint8_t> PackValues(const std::vector<Int>& values, int num_values,
+                                int bit_width, int bit_offset) {
+  const auto num_bytes = GetNumBytes(num_values, bit_width, bit_offset);
 
   std::vector<uint8_t> out(static_cast<std::size_t>(num_bytes));
   bit_util::BitWriter writer(out.data(), num_bytes);
+
+  // Write a first 0 value to make an offset
+  bool written = writer.PutValue(0, bit_offset);
   for (const auto& v : values) {
-    bool written = writer.PutValue(v, bit_width);
-    if (!written) {
-      throw std::runtime_error("Cannot write move values");
-    }
+    written &= writer.PutValue(v, bit_width);
+  }
+
+  if (!written) {
+    throw std::runtime_error("Cannot write move values");
   }
   writer.Flush();
 
@@ -82,24 +88,28 @@ std::vector<uint8_t> PackValues(const std::vector<Int>& values, int32_t num_valu
 }
 
 template <typename Int>
-void CheckUnpackPackRoundtrip(const uint8_t* packed, int32_t num_values,
-                              int32_t bit_width, UnpackFunc<Int> unpack) {
-  const auto num_bytes = GetNumBytes(num_values, bit_width);
+void CheckUnpackPackRoundtrip(const uint8_t* packed, int num_values, int bit_width,
+                              int bit_offset, UnpackFunc<Int> unpack) {
+  const auto num_bytes = GetNumBytes(num_values, bit_width, bit_offset);
 
-  const auto unpacked = UnpackValues(packed, num_values, bit_width, unpack);
+  const auto unpacked = UnpackValues(packed, num_values, bit_width, bit_offset, unpack);
   EXPECT_EQ(unpacked.size(), num_values);
-  const auto roundtrip = PackValues(unpacked, num_values, bit_width);
+  const auto roundtrip = PackValues(unpacked, num_values, bit_width, bit_offset);
   EXPECT_EQ(num_bytes, roundtrip.size());
 
-  // Checking all bytes but the last (that may not fall aligned)
-  for (int i = 0; i < num_bytes - 1; ++i) {
+  // Checking all bytes but the first and last (that may not fall aligned)
+  for (int i = 1; i < num_bytes - 1; ++i) {
     EXPECT_EQ(packed[i], roundtrip[i]) << "differ in position " << i;
   }
 
-  // Checking last byte
+  // Checking last and first byte
   if (num_bytes >= 1) {
+    // We need to mask the first bits in the packed data that are arbitrary and not used.
+    const auto mask = static_cast<uint8_t>(~((1 << bit_offset) - 1));
+    EXPECT_EQ(packed[0] & mask, roundtrip[0] & mask) << "differ in position " << 0;
+
     const int i = num_bytes - 1;
-    const int last_bits_cnt = (num_values * bit_width) % 8;
+    const int last_bits_cnt = (num_values * bit_width + bit_offset) % 8;
 
     if (last_bits_cnt == 0) {
       // Properly aligned, this is the same check as before
@@ -112,52 +122,36 @@ void CheckUnpackPackRoundtrip(const uint8_t* packed, int32_t num_values,
   }
 }
 
-const uint8_t* GetNextAlignedByte(const uint8_t* ptr, std::size_t alignment) {
-  auto addr = reinterpret_cast<std::uintptr_t>(ptr);
-
-  if (addr % alignment == 0) {
-    return ptr;
-  }
-
-  auto remainder = addr % alignment;
-  auto bytes_to_add = alignment - remainder;
-
-  return ptr + bytes_to_add;
-}
-
 class TestUnpack : public ::testing::TestWithParam<int> {
  protected:
   template <typename Int>
   void TestRoundtripAlignment(UnpackFunc<Int> unpack, int num_values, int bit_width,
-                              std::size_t alignment_offset) {
-    // Assume std::vector allocation is likely be aligned for greater than a byte.
-    // So we allocate more values than necessary and skip to the next byte with the
-    // desired (non) alignment to test the proper condition.
-    constexpr int32_t kExtraValues = sizeof(Int) * 8;
-    const auto packed = GenerateRandomPackedValues(num_values + kExtraValues, bit_width);
-    const uint8_t* packed_unaligned =
-        GetNextAlignedByte(packed.data(), sizeof(Int)) + alignment_offset;
-
-    CheckUnpackPackRoundtrip(packed_unaligned, num_values, bit_width, unpack);
+                              int bit_offset) {
+    const auto packed = GenerateRandomPackedValues(num_values, bit_width, bit_offset);
+    CheckUnpackPackRoundtrip(packed.data(), num_values, bit_width, bit_offset, unpack);
   }
 
   template <typename Int>
-  void TestUnpackZeros(UnpackFunc<Int> unpack, int num_values, int bit_width) {
-    const auto num_bytes = GetNumBytes(num_values, bit_width);
+  void TestUnpackZeros(UnpackFunc<Int> unpack, int num_values, int bit_width,
+                       int bit_offset) {
+    const auto num_bytes = GetNumBytes(num_values, bit_width, bit_offset);
 
     const std::vector<uint8_t> packed(static_cast<std::size_t>(num_bytes), uint8_t{0});
-    const auto unpacked = UnpackValues(packed.data(), num_values, bit_width, unpack);
+    const auto unpacked =
+        UnpackValues(packed.data(), num_values, bit_width, bit_offset, unpack);
 
     const std::vector<Int> expected(static_cast<std::size_t>(num_values), Int{0});
     EXPECT_EQ(unpacked, expected);
   }
 
   template <typename Int>
-  void TestUnpackOnes(UnpackFunc<Int> unpack, int num_values, int bit_width) {
-    const auto num_bytes = GetNumBytes(num_values, bit_width);
+  void TestUnpackOnes(UnpackFunc<Int> unpack, int num_values, int bit_width,
+                      int bit_offset) {
+    const auto num_bytes = GetNumBytes(num_values, bit_width, bit_offset);
 
     const std::vector<uint8_t> packed(static_cast<std::size_t>(num_bytes), uint8_t{0xFF});
-    const auto unpacked = UnpackValues(packed.data(), num_values, bit_width, unpack);
+    const auto unpacked =
+        UnpackValues(packed.data(), num_values, bit_width, bit_offset, unpack);
 
     // Generate bit_width ones
     Int expected_value = 0;
@@ -173,13 +167,17 @@ class TestUnpack : public ::testing::TestWithParam<int> {
   }
 
   template <typename Int>
-  void TestUnpackAlternating(UnpackFunc<Int> unpack, int num_values, int bit_width) {
-    const auto num_bytes = GetNumBytes(num_values, bit_width);
+  void TestUnpackAlternating(UnpackFunc<Int> unpack, int num_values, int bit_width,
+                             int bit_offset) {
+    const auto num_bytes = GetNumBytes(num_values, bit_width, bit_offset);
 
-    const std::vector<uint8_t> packed(static_cast<std::size_t>(num_bytes), uint8_t{0xAA});
-    const auto unpacked = UnpackValues(packed.data(), num_values, bit_width, unpack);
+    // Pick between two bit patterns so the unpacked sequence is the same for any offset
+    const uint8_t byte = bit_offset % 2 == 0 ? 0b10101010 : 0b01010101;
+    const std::vector<uint8_t> packed(static_cast<std::size_t>(num_bytes), byte);
+    const auto unpacked =
+        UnpackValues(packed.data(), num_values, bit_width, bit_offset, unpack);
 
-    // Generate alternative bit sequence sratring with either 0 or 1
+    // Generate alternative bit sequence starting with either 0 or 1
     Int one_zero_value = 0;
     Int zero_one_value = 0;
     for (int i = 0; i < bit_width; ++i) {
@@ -205,11 +203,28 @@ class TestUnpack : public ::testing::TestWithParam<int> {
     const int num_values_base = GetParam();
 
     constexpr int kMaxBitWidth = std::is_same_v<Int, bool> ? 1 : 8 * sizeof(Int);
+
     // Given how many edge cases there are in unpacking integers, it is best to test all
     // sizes
     for (int bit_width = 0; bit_width <= kMaxBitWidth; ++bit_width) {
       SCOPED_TRACE(::testing::Message() << "Testing bit_width=" << bit_width);
 
+      // We test all bit offsets within a byte (misalignments) to vary where
+      // unpacking starts in the first byte.
+      for (int bit_offset = 0; bit_offset < 8; ++bit_offset) {
+        SCOPED_TRACE(::testing::Message() << "Testing bit_offset=" << bit_offset);
+
+        // Known values
+        TestUnpackZeros(unpack, num_values_base, bit_width, bit_offset);
+        TestUnpackOnes(unpack, num_values_base, bit_width, bit_offset);
+        TestUnpackAlternating(unpack, num_values_base, bit_width, bit_offset);
+
+        // Roundtrips
+        TestRoundtripAlignment(unpack, num_values_base, bit_width, bit_offset);
+
+        if (testing::Test::HasFailure()) return;
+      }
+
       // Similarly, we test all epilogue sizes. That is extra values that could make it
       // fall outside of an SIMD register
       for (int epilogue_size = 0; epilogue_size <= kMaxBitWidth; ++epilogue_size) {
@@ -218,13 +233,14 @@ class TestUnpack : public ::testing::TestWithParam<int> {
         const int num_values = num_values_base + epilogue_size;
 
         // Known values
-        TestUnpackZeros(unpack, num_values, bit_width);
-        TestUnpackOnes(unpack, num_values, bit_width);
-        TestUnpackAlternating(unpack, num_values, bit_width);
+        TestUnpackZeros(unpack, num_values, bit_width, /* bit_offset= */ 0);
+        TestUnpackOnes(unpack, num_values, bit_width, /* bit_offset= */ 0);
+        TestUnpackAlternating(unpack, num_values, bit_width, /* bit_offset= */ 0);
 
         // Roundtrips
-        TestRoundtripAlignment(unpack, num_values, bit_width, /* alignment_offset= */ 0);
-        TestRoundtripAlignment(unpack, num_values, bit_width, /* alignment_offset= */ 1);
+        TestRoundtripAlignment(unpack, num_values, bit_width, /* bit_offset= */ 0);
+
+        if (testing::Test::HasFailure()) return;
       }
     }
   }

From 46de8629e5caa6cb4205ebec26916d11d4399861 Mon Sep 17 00:00:00 2001
From: AntoinePrv <AntoinePrv@users.noreply.github.com>
Date: Tue, 21 Oct 2025 16:25:17 +0200
Subject: [PATCH 07/12] Simplify test roundtrip logic

---
 cpp/src/arrow/util/bpacking_test.cc | 80 ++++++++++++-----------------
 1 file changed, 32 insertions(+), 48 deletions(-)

diff --git a/cpp/src/arrow/util/bpacking_test.cc b/cpp/src/arrow/util/bpacking_test.cc
index ac3a0706015..8dc68a0b118 100644
--- a/cpp/src/arrow/util/bpacking_test.cc
+++ b/cpp/src/arrow/util/bpacking_test.cc
@@ -19,6 +19,7 @@
 
 #include <gtest/gtest.h>
 
+#include "arrow/testing/random.h"
 #include "arrow/testing/util.h"
 #include "arrow/util/bit_stream_utils_internal.h"
 #include "arrow/util/bit_util.h"
@@ -40,15 +41,24 @@ int GetNumBytes(int num_values, int bit_width, int bit_offset) {
   return static_cast<int>(bit_util::BytesForBits(num_values * bit_width + bit_offset));
 }
 
-/// Generate random bytes as packed integers.
-std::vector<uint8_t> GenerateRandomPackedValues(int num_values, int bit_width,
-                                                int bit_offset) {
+/// Generate random values that can be packed within the given bit width.
+template <typename Uint>
+std::vector<Uint> GenerateRandomValuesForPacking(int num_values, int bit_width) {
   constexpr uint32_t kSeed = 3214;
-  const auto num_bytes = GetNumBytes(num_values, bit_width, bit_offset);
 
-  std::vector<uint8_t> out(std::max(1, num_bytes));  // We need a valid pointer for size 0
-  random_bytes(num_bytes, kSeed, out.data());
+  num_values = std::max(1, num_values);  // We need a valid pointer for size 0
+  std::vector<Uint> out(num_values);
+
+  if (bit_width == 0) {
+    return out;
+  }
 
+  if constexpr (std::is_same_v<Uint, bool>) {
+    random_is_valid(num_values, 0.5, &out, kSeed);
+  } else {
+    const uint64_t max = (uint64_t{1} << (static_cast<uint64_t>(bit_width) - 1)) - 1;
+    rand_uniform_int(out.size(), kSeed, /* min= */ decltype(max){0}, max, out.data());
+  }
   return out;
 }
 
@@ -57,11 +67,16 @@ template <typename Int>
 std::vector<Int> UnpackValues(const uint8_t* packed, int32_t num_values,
                               int32_t bit_width, int32_t bit_offset,
                               UnpackFunc<Int> unpack) {
-  // Using dynamic array to avoid std::vector<bool>
-  auto buffer = std::make_unique<Int[]>(num_values);
-  unpack(packed, buffer.get(), num_values, bit_width, bit_offset);
-
-  return std::vector<Int>(buffer.get(), buffer.get() + num_values);
+  if constexpr (std::is_same_v<Int, bool>) {
+    // Using dynamic array to avoid std::vector<bool>
+    auto buffer = std::make_unique<Int[]>(num_values);
+    unpack(packed, buffer.get(), num_values, bit_width, bit_offset);
+    return std::vector<Int>(buffer.get(), buffer.get() + num_values);
+  } else {
+    std::vector<Int> out(num_values);
+    unpack(packed, out.data(), num_values, bit_width, bit_offset);
+    return out;
+  }
 }
 
 /// Use BitWriter to pack values into a vector.
@@ -87,48 +102,17 @@ std::vector<uint8_t> PackValues(const std::vector<Int>& values, int num_values,
   return out;
 }
 
-template <typename Int>
-void CheckUnpackPackRoundtrip(const uint8_t* packed, int num_values, int bit_width,
-                              int bit_offset, UnpackFunc<Int> unpack) {
-  const auto num_bytes = GetNumBytes(num_values, bit_width, bit_offset);
-
-  const auto unpacked = UnpackValues(packed, num_values, bit_width, bit_offset, unpack);
-  EXPECT_EQ(unpacked.size(), num_values);
-  const auto roundtrip = PackValues(unpacked, num_values, bit_width, bit_offset);
-  EXPECT_EQ(num_bytes, roundtrip.size());
-
-  // Checking all bytes but the first and last (that may not fall aligned)
-  for (int i = 1; i < num_bytes - 1; ++i) {
-    EXPECT_EQ(packed[i], roundtrip[i]) << "differ in position " << i;
-  }
-
-  // Checking last and first byte
-  if (num_bytes >= 1) {
-    // We need to mask the first bits in the packed data that are arbitrary and not used.
-    const auto mask = static_cast<uint8_t>(~((1 << bit_offset) - 1));
-    EXPECT_EQ(packed[0] & mask, roundtrip[0] & mask) << "differ in position " << 0;
-
-    const int i = num_bytes - 1;
-    const int last_bits_cnt = (num_values * bit_width + bit_offset) % 8;
-
-    if (last_bits_cnt == 0) {
-      // Properly aligned, this is the same check as before
-      EXPECT_EQ(packed[i], roundtrip[i]) << "differ in position " << i;
-    } else {
-      // We need to mask the last bits in the packed data that are arbitrary and not used.
-      const auto mask = static_cast<uint8_t>((1 << last_bits_cnt) - 1);
-      EXPECT_EQ(packed[i] & mask, roundtrip[i] & mask) << "differ in position " << i;
-    }
-  }
-}
-
 class TestUnpack : public ::testing::TestWithParam<int> {
  protected:
   template <typename Int>
   void TestRoundtripAlignment(UnpackFunc<Int> unpack, int num_values, int bit_width,
                               int bit_offset) {
-    const auto packed = GenerateRandomPackedValues(num_values, bit_width, bit_offset);
-    CheckUnpackPackRoundtrip(packed.data(), num_values, bit_width, bit_offset, unpack);
+    const auto original = GenerateRandomValuesForPacking<Int>(num_values, bit_width);
+    const auto packed = PackValues(original, num_values, bit_width, bit_offset);
+    const auto unpacked =
+        UnpackValues(packed.data(), num_values, bit_width, bit_offset, unpack);
+    EXPECT_EQ(unpacked.size(), num_values);
+    EXPECT_EQ(original, unpacked);
   }
 
   template <typename Int>

From c4254506d1b6392fdef9985da3bd79dd90ebd381 Mon Sep 17 00:00:00 2001
From: AntoinePrv <AntoinePrv@users.noreply.github.com>
Date: Wed, 22 Oct 2025 10:01:50 +0200
Subject: [PATCH 08/12] Fix ASAN test error

---
 cpp/src/arrow/util/bpacking_test.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cpp/src/arrow/util/bpacking_test.cc b/cpp/src/arrow/util/bpacking_test.cc
index 8dc68a0b118..c9616270784 100644
--- a/cpp/src/arrow/util/bpacking_test.cc
+++ b/cpp/src/arrow/util/bpacking_test.cc
@@ -83,6 +83,10 @@ std::vector<Int> UnpackValues(const uint8_t* packed, int32_t num_values,
 template <typename Int>
 std::vector<uint8_t> PackValues(const std::vector<Int>& values, int num_values,
                                 int bit_width, int bit_offset) {
+  if (bit_width == 0) {
+    return {};
+  }
+
   const auto num_bytes = GetNumBytes(num_values, bit_width, bit_offset);
 
   std::vector<uint8_t> out(static_cast<std::size_t>(num_bytes));

From 82ba07cb06f7b0e2c0f8776982ecd87d601f3cd1 Mon Sep 17 00:00:00 2001
From: AntoinePrv <AntoinePrv@users.noreply.github.com>
Date: Wed, 22 Oct 2025 10:58:58 +0200
Subject: [PATCH 09/12] Remove BitReader from BitPackedRunDecoder

---
 .../arrow/util/bit_stream_utils_internal.h    |  4 +-
 cpp/src/arrow/util/rle_encoding_internal.h    | 67 ++++++++++---------
 cpp/src/arrow/util/rle_encoding_test.cc       | 12 ++--
 3 files changed, 45 insertions(+), 38 deletions(-)

diff --git a/cpp/src/arrow/util/bit_stream_utils_internal.h b/cpp/src/arrow/util/bit_stream_utils_internal.h
index e070f39c9e0..1057a0bf381 100644
--- a/cpp/src/arrow/util/bit_stream_utils_internal.h
+++ b/cpp/src/arrow/util/bit_stream_utils_internal.h
@@ -268,8 +268,8 @@ inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {
   }
 
   if constexpr (std::is_same_v<T, bool>) {
-    ::arrow::internal::unpack(buffer_ + byte_offset_, reinterpret_cast<bool*>(v),
-                              batch_size, num_bits, bit_offset_);
+    ::arrow::internal::unpack(buffer_ + byte_offset_, v, batch_size, num_bits,
+                              bit_offset_);
 
   } else {
     ::arrow::internal::unpack(buffer_ + byte_offset_,
diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h
index 2420270f3ab..6b2782da315 100644
--- a/cpp/src/arrow/util/rle_encoding_internal.h
+++ b/cpp/src/arrow/util/rle_encoding_internal.h
@@ -29,6 +29,8 @@
 #include "arrow/util/bit_run_reader.h"
 #include "arrow/util/bit_stream_utils_internal.h"
 #include "arrow/util/bit_util.h"
+#include "arrow/util/bpacking_internal.h"
+#include "arrow/util/logging.h"
 #include "arrow/util/macros.h"
 
 namespace arrow::util {
@@ -278,10 +280,9 @@ class RleRunDecoder {
   /// Return the repeated value of this decoder.
   constexpr value_type value() const { return value_; }
 
-  /// Try to advance by as many values as provided.
+  /// Advance by as many values as provided or until exhaustion of the decoder.
   /// Return the number of values skipped.
-  /// May advance by less than asked for if there are not enough values left.
-  [[nodiscard]] rle_size_t Advance(rle_size_t batch_size, rle_size_t value_bit_width) {
+  [[nodiscard]] rle_size_t Advance(rle_size_t batch_size) {
     const auto steps = std::min(batch_size, remaining_count_);
     remaining_count_ -= steps;
     return steps;
@@ -331,52 +332,58 @@ class BitPackedRunDecoder {
   }
 
   void Reset(const RunType& run, rle_size_t value_bit_width) noexcept {
-    remaining_count_ = run.values_count();
     ARROW_DCHECK_GE(value_bit_width, 0);
     ARROW_DCHECK_LE(value_bit_width, 64);
-    bit_reader_.Reset(run.raw_data_ptr(), run.raw_data_size(value_bit_width));
+    data_ = run.raw_data_ptr();
+    values_count_ = run.values_count();
+    values_read_ = 0;
   }
 
   /// Return the number of values that can be advanced.
-  constexpr rle_size_t remaining() const { return remaining_count_; }
+  constexpr rle_size_t remaining() const { return values_count_ - values_read_; }
 
-  /// Try to advance by as many values as provided.
-  /// Return the number of values skipped or 0 if it fail to advance.
-  /// May advance by less than asked for if there are not enough values left.
-  [[nodiscard]] rle_size_t Advance(rle_size_t batch_size, rle_size_t value_bit_width) {
-    const auto steps = std::min(batch_size, remaining_count_);
-    if (bit_reader_.Advance(steps * value_bit_width)) {
-      remaining_count_ -= steps;
-      return steps;
-    }
-    return 0;
+  /// Advance by as many values as provided or until exhaustion of the decoder.
+  /// Return the number of values skipped.
+  [[nodiscard]] rle_size_t Advance(rle_size_t batch_size) {
+    const auto steps = std::min(batch_size, remaining());
+    values_read_ += steps;
+    return steps;
   }
 
-  /// Get the next value and return false if there are no more or an error occurred.
+  /// Get the next value and return false if there are no more.
   [[nodiscard]] constexpr bool Get(value_type* out_value, rle_size_t value_bit_width) {
     return GetBatch(out_value, 1, value_bit_width) == 1;
   }
 
   /// Get a batch of values return the number of decoded elements.
   /// May write fewer elements to the output than requested if there are not enough values
-  /// left or if an error occurred.
+  /// left.
   [[nodiscard]] rle_size_t GetBatch(value_type* out, rle_size_t batch_size,
                                     rle_size_t value_bit_width) {
-    if (ARROW_PREDICT_FALSE(remaining_count_ == 0)) {
-      return 0;
-    }
+    const auto steps = std::min(batch_size, remaining());
+    const auto bits_read = static_cast<int64_t>(values_read_) * value_bit_width;
+    const auto* unread_data = data_ + bits_read / 8;
+    const auto bit_offset = static_cast<int>(bits_read % 8);
 
-    const auto to_read = std::min(remaining_count_, batch_size);
-    const auto actual_read = bit_reader_.GetBatch(value_bit_width, out, to_read);
-    // There should not be any reason why the actual read would be different
-    // but this is error resistant.
-    remaining_count_ -= actual_read;
-    return actual_read;
+    if constexpr (std::is_same_v<T, bool>) {
+      ::arrow::internal::unpack(unread_data, out, steps, value_bit_width, bit_offset);
+
+    } else {
+      ::arrow::internal::unpack(unread_data,
+                                reinterpret_cast<std::make_unsigned_t<value_type>*>(out),
+                                steps, value_bit_width, bit_offset);
+    }
+    values_read_ += steps;
+    return steps;
   }
 
  private:
-  ::arrow::bit_util::BitReader bit_reader_ = {};
-  rle_size_t remaining_count_ = 0;
+  /// The pointer to the beginning of the run
+  const uint8_t* data_ = nullptr;
+  /// The total number of values in the run
+  rle_size_t values_count_ = 0;
+  /// The number of values read by the decoder
+  rle_size_t values_read_ = 0;
 
   static_assert(std::is_integral_v<value_type>,
                 "This class is meant to decode positive integers");
@@ -895,7 +902,7 @@ auto RunGetSpaced(Converter* converter, typename Converter::out_type* out,
     return {0, 0};
   }
   converter->WriteRepeated(out, out + batch.total_read(), value);
-  const auto actual_values_read = decoder->Advance(batch.values_read(), value_bit_width);
+  const auto actual_values_read = decoder->Advance(batch.values_read());
   // We always cropped the number of values_read by the remaining values in the run.
   // What's more the RLE decoder should not encounter any errors.
   ARROW_DCHECK_EQ(actual_values_read, batch.values_read());
diff --git a/cpp/src/arrow/util/rle_encoding_test.cc b/cpp/src/arrow/util/rle_encoding_test.cc
index c7f4878b741..f3a14af4412 100644
--- a/cpp/src/arrow/util/rle_encoding_test.cc
+++ b/cpp/src/arrow/util/rle_encoding_test.cc
@@ -290,7 +290,7 @@ void TestRleDecoder(std::vector<uint8_t> bytes, rle_size_t value_count,
   EXPECT_EQ(vals.at(0), expected_value);
   EXPECT_EQ(decoder.remaining(), value_count - read);
 
-  EXPECT_EQ(decoder.Advance(3, bit_width), 3);
+  EXPECT_EQ(decoder.Advance(3), 3);
   read += 3;
   EXPECT_EQ(decoder.remaining(), value_count - read);
 
@@ -302,9 +302,9 @@ void TestRleDecoder(std::vector<uint8_t> bytes, rle_size_t value_count,
   EXPECT_EQ(decoder.remaining(), value_count - read);
 
   // Exhaust iteration
-  EXPECT_EQ(decoder.Advance(value_count - read, bit_width), value_count - read);
+  EXPECT_EQ(decoder.Advance(value_count - read), value_count - read);
   EXPECT_EQ(decoder.remaining(), 0);
-  EXPECT_EQ(decoder.Advance(1, bit_width), 0);
+  EXPECT_EQ(decoder.Advance(1), 0);
   vals = {0, 0};
   EXPECT_EQ(decoder.Get(vals.data(), bit_width), 0);
   EXPECT_EQ(vals.at(0), 0);
@@ -350,7 +350,7 @@ void TestBitPackedDecoder(std::vector<uint8_t> bytes, rle_size_t value_count,
   read += 1;
   EXPECT_EQ(decoder.remaining(), value_count - read);
 
-  EXPECT_EQ(decoder.Advance(3, bit_width), 3);
+  EXPECT_EQ(decoder.Advance(3), 3);
   read += 3;
   EXPECT_EQ(decoder.remaining(), value_count - read);
 
@@ -362,9 +362,9 @@ void TestBitPackedDecoder(std::vector<uint8_t> bytes, rle_size_t value_count,
   EXPECT_EQ(decoder.remaining(), value_count - read);
 
   // Exhaust iteration
-  EXPECT_EQ(decoder.Advance(value_count - read, bit_width), value_count - read);
+  EXPECT_EQ(decoder.Advance(value_count - read), value_count - read);
   EXPECT_EQ(decoder.remaining(), 0);
-  EXPECT_EQ(decoder.Advance(1, bit_width), 0);
+  EXPECT_EQ(decoder.Advance(1), 0);
   vals = {0, 0};
   EXPECT_EQ(decoder.Get(vals.data(), bit_width), 0);
   EXPECT_EQ(vals.at(0), 0);

From 95271fe1f2c076d3adc482222b59b3eae195f02b Mon Sep 17 00:00:00 2001
From: Antoine Prouvost <AntoinePrv@users.noreply.github.com>
Date: Thu, 23 Oct 2025 09:38:30 +0200
Subject: [PATCH 10/12] Check bit_offset size

Co-authored-by: Antoine Pitrou <pitrou@free.fr>
---
 cpp/src/arrow/util/bpacking_dispatch_internal.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/src/arrow/util/bpacking_dispatch_internal.h b/cpp/src/arrow/util/bpacking_dispatch_internal.h
index a8b8c194d5e..1df3c2df44c 100644
--- a/cpp/src/arrow/util/bpacking_dispatch_internal.h
+++ b/cpp/src/arrow/util/bpacking_dispatch_internal.h
@@ -99,6 +99,7 @@ template <int kPackedBitWidth, bool kIsProlog, typename Uint>
 int unpack_exact(const uint8_t* in, Uint* out, int batch_size, int bit_offset) {
   // For the epilog we adapt the max spread since better alignment give shorter spreads
   ARROW_DCHECK(kIsProlog || bit_offset == 0);
+  ARROW_DCHECK(bit_offset >= 0 && bit_offset < 8);
   constexpr int kMaxSpreadBytes = kIsProlog ? PackedMaxSpreadBytes(kPackedBitWidth)
                                             : PackedMaxSpreadBytes(kPackedBitWidth, 0);
   using buffer_uint = SpreadBufferUint<kMaxSpreadBytes>;

From 9ace2e676e6c288c734bd3345f68eeefac62fe41 Mon Sep 17 00:00:00 2001
From: AntoinePrv <AntoinePrv@users.noreply.github.com>
Date: Thu, 23 Oct 2025 10:14:56 +0200
Subject: [PATCH 11/12] Merge mask functions

---
 cpp/src/arrow/util/bit_run_reader.cc          |  2 +-
 cpp/src/arrow/util/bit_run_reader.h           |  6 +++---
 cpp/src/arrow/util/bit_util.h                 | 19 ++++++++-----------
 cpp/src/arrow/util/bitmap_reader.h            |  2 +-
 .../arrow/util/bpacking_dispatch_internal.h   |  2 +-
 cpp/src/arrow/util/decimal.cc                 |  3 ++-
 6 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/cpp/src/arrow/util/bit_run_reader.cc b/cpp/src/arrow/util/bit_run_reader.cc
index b1284151d5c..9edf3360890 100644
--- a/cpp/src/arrow/util/bit_run_reader.cc
+++ b/cpp/src/arrow/util/bit_run_reader.cc
@@ -45,7 +45,7 @@ BitRunReader::BitRunReader(const uint8_t* bitmap, int64_t start_offset, int64_t
 
   // Prepare for inversion in NextRun.
   // Clear out any preceding bits.
-  word_ = word_ & ~bit_util::LeastSignificantBitMask(position_);
+  word_ = word_ & ~bit_util::LeastSignificantBitMask<uint64_t>(position_);
 }
 
 #endif
diff --git a/cpp/src/arrow/util/bit_run_reader.h b/cpp/src/arrow/util/bit_run_reader.h
index ed7be940a54..7bb00140279 100644
--- a/cpp/src/arrow/util/bit_run_reader.h
+++ b/cpp/src/arrow/util/bit_run_reader.h
@@ -106,7 +106,7 @@ class ARROW_EXPORT BitRunReader {
     int64_t start_bit_offset = start_position & 63;
     // Invert the word for proper use of CountTrailingZeros and
     // clear bits so CountTrailingZeros can do it magic.
-    word_ = ~word_ & ~bit_util::LeastSignificantBitMask(start_bit_offset);
+    word_ = ~word_ & ~bit_util::LeastSignificantBitMask<uint64_t>(start_bit_offset);
 
     // Go  forward until the next change from unset to set.
     int64_t new_bits = bit_util::CountTrailingZeros(word_) - start_bit_offset;
@@ -311,12 +311,12 @@ class BaseSetBitRunReader {
       memcpy(reinterpret_cast<char*>(&word) + 8 - num_bytes, bitmap_, num_bytes);
       // XXX MostSignificantBitmask
       return (bit_util::ToLittleEndian(word) << bit_offset) &
-             ~bit_util::LeastSignificantBitMask(64 - num_bits);
+             ~bit_util::LeastSignificantBitMask<uint64_t>(64 - num_bits);
     } else {
       memcpy(&word, bitmap_, num_bytes);
       bitmap_ += num_bytes;
       return (bit_util::ToLittleEndian(word) >> bit_offset) &
-             bit_util::LeastSignificantBitMask(num_bits);
+             bit_util::LeastSignificantBitMask<uint64_t>(num_bits);
     }
   }
 
diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h
index e72eca74e86..52a42f538b5 100644
--- a/cpp/src/arrow/util/bit_util.h
+++ b/cpp/src/arrow/util/bit_util.h
@@ -113,17 +113,14 @@ constexpr bool IsMultipleOf64(int64_t n) { return (n & 63) == 0; }
 constexpr bool IsMultipleOf8(int64_t n) { return (n & 7) == 0; }
 
 // Returns a mask for the bit_index lower order bits.
-// Only valid for bit_index in the range [0, 64).
-constexpr uint64_t LeastSignificantBitMask(int64_t bit_index) {
-  return (static_cast<uint64_t>(1) << bit_index) - 1;
-}
-
-// Returns a mask for the bit_index lower order bits.
-// Only valid for bit_index in the range [0, sizeof(Uint)].
-template <typename Uint>
-constexpr auto LeastSignificantBitMaskInc(Uint bit_index) {
-  if (bit_index == 8 * sizeof(Uint)) {
-    return ~Uint{0};
+// Valid in the range `[0, 8*sizeof(Uint)]` if `kAllowUpperBound`,
+// otherwise `[0, 8*sizeof(Uint))`.
+template <typename Uint, bool kAllowUpperBound = false>
+constexpr auto LeastSignificantBitMask(Uint bit_index) {
+  if constexpr (kAllowUpperBound) {
+    if (bit_index == 8 * sizeof(Uint)) {
+      return ~Uint{0};
+    }
   }
   return (Uint{1} << bit_index) - Uint{1};
 }
diff --git a/cpp/src/arrow/util/bitmap_reader.h b/cpp/src/arrow/util/bitmap_reader.h
index d95fd921f48..83c142c559b 100644
--- a/cpp/src/arrow/util/bitmap_reader.h
+++ b/cpp/src/arrow/util/bitmap_reader.h
@@ -136,7 +136,7 @@ class BitmapUInt64Reader {
     memcpy(&word, bitmap_, num_bytes);
     bitmap_ += num_bytes;
     return (bit_util::ToLittleEndian(word) >> bit_offset) &
-           bit_util::LeastSignificantBitMask(num_bits);
+           bit_util::LeastSignificantBitMask<uint64_t>(num_bits);
   }
 
   const uint8_t* bitmap_;
diff --git a/cpp/src/arrow/util/bpacking_dispatch_internal.h b/cpp/src/arrow/util/bpacking_dispatch_internal.h
index 1df3c2df44c..eed5542808d 100644
--- a/cpp/src/arrow/util/bpacking_dispatch_internal.h
+++ b/cpp/src/arrow/util/bpacking_dispatch_internal.h
@@ -109,7 +109,7 @@ int unpack_exact(const uint8_t* in, Uint* out, int batch_size, int bit_offset) {
   // aligned bytes.
   constexpr bool kOversized = kBufferSize < kMaxSpreadBytes;
   constexpr buffer_uint kLowMask =
-      bit_util::LeastSignificantBitMaskInc<buffer_uint>(kPackedBitWidth);
+      bit_util::LeastSignificantBitMask<buffer_uint, true>(kPackedBitWidth);
 
   ARROW_DCHECK_GE(bit_offset, 0);
   ARROW_DCHECK_LE(bit_offset, 8);
diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc
index 9e075594d6a..d80164f45c0 100644
--- a/cpp/src/arrow/util/decimal.cc
+++ b/cpp/src/arrow/util/decimal.cc
@@ -610,7 +610,8 @@ static void AppendLittleEndianArrayToString(const std::array<uint64_t, n>& array
       // *elem = dividend / 1e9;
       // remainder = dividend % 1e9.
       uint32_t hi = static_cast<uint32_t>(*elem >> 32);
-      uint32_t lo = static_cast<uint32_t>(*elem & bit_util::LeastSignificantBitMask(32));
+      uint32_t lo =
+          static_cast<uint32_t>(*elem & bit_util::LeastSignificantBitMask<uint64_t>(32));
       uint64_t dividend_hi = (static_cast<uint64_t>(remainder) << 32) | hi;
       uint64_t quotient_hi = dividend_hi / k1e9;
       remainder = static_cast<uint32_t>(dividend_hi % k1e9);

From 53453c8f4f71112a5013eb14c177143aef19dfc8 Mon Sep 17 00:00:00 2001
From: AntoinePrv <AntoinePrv@users.noreply.github.com>
Date: Wed, 29 Oct 2025 10:25:55 +0100
Subject: [PATCH 12/12] Address reviewer comments

---
 cpp/src/arrow/util/bpacking_test.cc | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/cpp/src/arrow/util/bpacking_test.cc b/cpp/src/arrow/util/bpacking_test.cc
index c9616270784..a3ab4d6dda2 100644
--- a/cpp/src/arrow/util/bpacking_test.cc
+++ b/cpp/src/arrow/util/bpacking_test.cc
@@ -56,7 +56,7 @@ std::vector<Uint> GenerateRandomValuesForPacking(int num_values, int bit_width)
   if constexpr (std::is_same_v<Uint, bool>) {
     random_is_valid(num_values, 0.5, &out, kSeed);
   } else {
-    const uint64_t max = (uint64_t{1} << (static_cast<uint64_t>(bit_width) - 1)) - 1;
+    const uint64_t max = bit_util::LeastSignificantBitMask<uint64_t, true>(bit_width);
     rand_uniform_int(out.size(), kSeed, /* min= */ decltype(max){0}, max, out.data());
   }
   return out;
@@ -93,14 +93,13 @@ std::vector<uint8_t> PackValues(const std::vector<Int>& values, int num_values,
   bit_util::BitWriter writer(out.data(), num_bytes);
 
   // Write a first 0 value to make an offset
-  bool written = writer.PutValue(0, bit_offset);
+  const bool written = writer.PutValue(0, bit_offset);
+  ARROW_DCHECK(written);
   for (const auto& v : values) {
-    written &= writer.PutValue(v, bit_width);
+    const bool written = writer.PutValue(v, bit_width);
+    ARROW_DCHECK(written);
   }
 
-  if (!written) {
-    throw std::runtime_error("Cannot write move values");
-  }
   writer.Flush();
 
   return out;