diff --git a/cpp/src/arrow/util/bit_run_reader.cc b/cpp/src/arrow/util/bit_run_reader.cc
index b1284151d5c..9edf3360890 100644
--- a/cpp/src/arrow/util/bit_run_reader.cc
+++ b/cpp/src/arrow/util/bit_run_reader.cc
@@ -45,7 +45,7 @@ BitRunReader::BitRunReader(const uint8_t* bitmap, int64_t start_offset, int64_t
 
   // Prepare for inversion in NextRun.
   // Clear out any preceding bits.
-  word_ = word_ & ~bit_util::LeastSignificantBitMask(position_);
+  word_ = word_ & ~bit_util::LeastSignificantBitMask(position_);
 }
 #endif
diff --git a/cpp/src/arrow/util/bit_run_reader.h b/cpp/src/arrow/util/bit_run_reader.h
index ed7be940a54..7bb00140279 100644
--- a/cpp/src/arrow/util/bit_run_reader.h
+++ b/cpp/src/arrow/util/bit_run_reader.h
@@ -106,7 +106,7 @@ class ARROW_EXPORT BitRunReader {
     int64_t start_bit_offset = start_position & 63;
     // Invert the word for proper use of CountTrailingZeros and
     // clear bits so CountTrailingZeros can do it magic.
-    word_ = ~word_ & ~bit_util::LeastSignificantBitMask(start_bit_offset);
+    word_ = ~word_ & ~bit_util::LeastSignificantBitMask(start_bit_offset);
 
     // Go forward until the next change from unset to set.
     int64_t new_bits = bit_util::CountTrailingZeros(word_) - start_bit_offset;
@@ -311,12 +311,12 @@ class BaseSetBitRunReader {
       memcpy(reinterpret_cast(&word) + 8 - num_bytes, bitmap_, num_bytes);
       // XXX MostSignificantBitmask
       return (bit_util::ToLittleEndian(word) << bit_offset) &
-             ~bit_util::LeastSignificantBitMask(64 - num_bits);
+             ~bit_util::LeastSignificantBitMask(64 - num_bits);
     } else {
       memcpy(&word, bitmap_, num_bytes);
       bitmap_ += num_bytes;
       return (bit_util::ToLittleEndian(word) >> bit_offset) &
-             bit_util::LeastSignificantBitMask(num_bits);
+             bit_util::LeastSignificantBitMask(num_bits);
     }
   }
diff --git a/cpp/src/arrow/util/bit_stream_utils_internal.h b/cpp/src/arrow/util/bit_stream_utils_internal.h
index cf039a9ac9f..1057a0bf381 100644
--- a/cpp/src/arrow/util/bit_stream_utils_internal.h
+++ b/cpp/src/arrow/util/bit_stream_utils_internal.h
@@ -19,7 +19,6 @@
 
 #pragma once
 
-#include
 #include
 #include
 #include
@@ -249,110 +248,36 @@ inline bool BitWriter::PutAligned(T val, int num_bytes) {
   return true;
 }
 
-namespace detail {
-
-template <typename T>
-inline void GetValue_(int num_bits, T* v, int max_bytes, const uint8_t* buffer,
-                      int* bit_offset, int* byte_offset, uint64_t* buffered_values) {
-#ifdef _MSC_VER
-#  pragma warning(push)
-#  pragma warning(disable : 4800)
-#endif
-  *v = static_cast<T>(bit_util::TrailingBits(*buffered_values, *bit_offset + num_bits) >>
-                      *bit_offset);
-#ifdef _MSC_VER
-#  pragma warning(pop)
-#endif
-  *bit_offset += num_bits;
-  if (*bit_offset >= 64) {
-    *byte_offset += 8;
-    *bit_offset -= 64;
-
-    *buffered_values =
-        detail::ReadLittleEndianWord(buffer + *byte_offset, max_bytes - *byte_offset);
-#ifdef _MSC_VER
-#  pragma warning(push)
-#  pragma warning(disable : 4800 4805)
-#endif
-    // Read bits of v that crossed into new buffered_values_
-    if (ARROW_PREDICT_TRUE(num_bits - *bit_offset < static_cast<int>(8 * sizeof(T)))) {
-      // if shift exponent(num_bits - *bit_offset) is not less than sizeof(T), *v will not
-      // change and the following code may cause a runtime error that the shift exponent
-      // is too large
-      *v = *v | static_cast<T>(bit_util::TrailingBits(*buffered_values, *bit_offset)
-                               << (num_bits - *bit_offset));
-    }
-#ifdef _MSC_VER
-#  pragma warning(pop)
-#endif
-    ARROW_DCHECK_LE(*bit_offset, 64);
-  }
-}
-
-}  // namespace detail
-
 template <typename T>
 inline bool BitReader::GetValue(int num_bits, T* v) {
   return GetBatch(num_bits, v, 1) == 1;
 }
 
-namespace internal_bit_reader {
-template <typename T>
-struct unpack_detect {
-  using type = std::make_unsigned_t<T>;
-};
-
-template <>
-struct unpack_detect<bool> {
-  using type = bool;
-};
-}  // namespace internal_bit_reader
-
 template <typename T>
 inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {
-  ARROW_DCHECK(buffer_ != NULL);
-  ARROW_DCHECK_LE(num_bits, static_cast<int>(sizeof(T) * 8)) << "num_bits: " << num_bits;
+  constexpr uint64_t kBitsPerByte = 8;
 
-  int bit_offset = bit_offset_;
-  int byte_offset = byte_offset_;
-  uint64_t buffered_values = buffered_values_;
-  int max_bytes = max_bytes_;
-  const uint8_t* buffer = buffer_;
+  ARROW_DCHECK(buffer_ != NULLPTR);
+  ARROW_DCHECK_LE(num_bits, static_cast<int>(sizeof(T) * 8)) << "num_bits: " << num_bits;
 
   const int64_t needed_bits = num_bits * static_cast<int64_t>(batch_size);
-  constexpr uint64_t kBitsPerByte = 8;
   const int64_t remaining_bits =
-      static_cast<int64_t>(max_bytes - byte_offset) * kBitsPerByte - bit_offset;
+      static_cast<int64_t>(max_bytes_ - byte_offset_) * kBitsPerByte - bit_offset_;
   if (remaining_bits < needed_bits) {
     batch_size = static_cast<int>(remaining_bits / num_bits);
   }
 
-  int i = 0;
-  if (ARROW_PREDICT_FALSE(bit_offset != 0)) {
-    for (; i < batch_size && bit_offset != 0; ++i) {
-      detail::GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset,
-                        &buffered_values);
-    }
-  }
-
-  using unpack_t = typename internal_bit_reader::unpack_detect<T>::type;
-
-  int num_unpacked = ::arrow::internal::unpack(
-      buffer + byte_offset, reinterpret_cast<unpack_t*>(v + i), batch_size - i, num_bits);
-  i += num_unpacked;
-  byte_offset += num_unpacked * num_bits / 8;
-
-  buffered_values =
-      detail::ReadLittleEndianWord(buffer + byte_offset, max_bytes - byte_offset);
+  if constexpr (std::is_same_v<T, bool>) {
+    ::arrow::internal::unpack(buffer_ + byte_offset_, v, batch_size, num_bits,
+                              bit_offset_);
-
-  for (; i < batch_size; ++i) {
-    detail::GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset,
-                      &buffered_values);
+  } else {
+    ::arrow::internal::unpack(buffer_ + byte_offset_,
+                              reinterpret_cast<std::make_unsigned_t<T>*>(v), batch_size,
+                              num_bits, bit_offset_);
   }
 
-  bit_offset_ = bit_offset;
-  byte_offset_ = byte_offset;
-  buffered_values_ = buffered_values;
+  Advance(batch_size * num_bits);
 
   return batch_size;
 }
diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h
index 8d4811ede79..52a42f538b5 100644
--- a/cpp/src/arrow/util/bit_util.h
+++ b/cpp/src/arrow/util/bit_util.h
@@ -113,9 +113,16 @@ constexpr bool IsMultipleOf64(int64_t n) { return (n & 63) == 0; }
 constexpr bool IsMultipleOf8(int64_t n) { return (n & 7) == 0; }
 
 // Returns a mask for the bit_index lower order bits.
-// Only valid for bit_index in the range [0, 64).
-constexpr uint64_t LeastSignificantBitMask(int64_t bit_index) {
-  return (static_cast<uint64_t>(1) << bit_index) - 1;
+// Valid for `bit_index` in the range `[0, 8 * sizeof(Uint)]` if `kAllowUpperBound`,
+// otherwise `[0, 8 * sizeof(Uint))`.
+template <bool kAllowUpperBound = false, typename Uint>
+constexpr auto LeastSignificantBitMask(Uint bit_index) {
+  if constexpr (kAllowUpperBound) {
+    if (bit_index == 8 * sizeof(Uint)) {
+      return ~Uint{0};
+    }
+  }
+  return (Uint{1} << bit_index) - Uint{1};
 }
 
 // Returns 'value' rounded up to the nearest multiple of 'factor'
diff --git a/cpp/src/arrow/util/bitmap_reader.h b/cpp/src/arrow/util/bitmap_reader.h
index d95fd921f48..83c142c559b 100644
--- a/cpp/src/arrow/util/bitmap_reader.h
+++ b/cpp/src/arrow/util/bitmap_reader.h
@@ -136,7 +136,7 @@ class BitmapUInt64Reader {
       memcpy(&word, bitmap_, num_bytes);
       bitmap_ += num_bytes;
       return (bit_util::ToLittleEndian(word) >> bit_offset) &
-             bit_util::LeastSignificantBitMask(num_bits);
+             bit_util::LeastSignificantBitMask(num_bits);
     }
 
   const uint8_t* bitmap_;
diff --git a/cpp/src/arrow/util/bpacking.cc b/cpp/src/arrow/util/bpacking.cc
index 369f361d9a6..fdb1c5a52ac 100644
--- a/cpp/src/arrow/util/bpacking.cc
+++ b/cpp/src/arrow/util/bpacking.cc
@@ -50,19 +50,19 @@ struct UnpackDynamicFunction {
 }  // namespace
 
 template <typename Uint>
-int unpack(const uint8_t* in, Uint* out, int batch_size, int num_bits) {
+void unpack(const uint8_t* in, Uint* out, int batch_size, int num_bits, int bit_offset) {
 #if defined(ARROW_HAVE_NEON)
-  return unpack_neon(in, out, batch_size, num_bits);
+  return unpack_neon(in, out, batch_size, num_bits, bit_offset);
 #else
   static DynamicDispatch<UnpackDynamicFunction<Uint>> dispatch;
-  return dispatch.func(in, out, batch_size, num_bits);
+  return dispatch.func(in, out, batch_size, num_bits, bit_offset);
 #endif
 }
 
-template int unpack(const uint8_t*, bool*, int, int);
-template int unpack(const uint8_t*, uint8_t*, int, int);
-template int unpack(const uint8_t*, uint16_t*, int, int);
-template int unpack(const uint8_t*, uint32_t*, int, int);
-template int unpack(const uint8_t*, uint64_t*, int, int);
+template void unpack(const uint8_t*, bool*, int, int, int);
+template void unpack(const uint8_t*, uint8_t*, int, int, int);
+template void unpack(const uint8_t*, uint16_t*, int, int, int);
+template void unpack(const uint8_t*, uint32_t*, int, int, int);
+template void unpack(const uint8_t*, uint64_t*, int, int, int);
 
 }  // namespace arrow::internal
diff --git a/cpp/src/arrow/util/bpacking_benchmark.cc b/cpp/src/arrow/util/bpacking_benchmark.cc
index 144da6ea878..7094656eeef 100644
--- a/cpp/src/arrow/util/bpacking_benchmark.cc
+++ b/cpp/src/arrow/util/bpacking_benchmark.cc
@@ -33,7 +33,7 @@ namespace arrow::internal {
 namespace {
 
 template <typename Int>
-using UnpackFunc = int (*)(const uint8_t*, Int*, int, int);
+using UnpackFunc = void (*)(const uint8_t*, Int*, int, int, int);
 
 /// Get the number of bytes associate with a packing.
 constexpr int32_t GetNumBytes(int32_t num_values, int32_t bit_width) {
@@ -89,7 +89,7 @@ void BM_Unpack(benchmark::State& state, bool aligned, UnpackFunc unpack, bo
   std::vector<Int> unpacked(num_values, 0);
 
   for (auto _ : state) {
-    unpack(packed_ptr, unpacked.data(), num_values, bit_width);
+    unpack(packed_ptr, unpacked.data(), num_values, bit_width, /* bit_offset = */ 0);
     benchmark::ClobberMemory();
   }
   state.SetItemsProcessed(num_values * state.iterations());
diff --git a/cpp/src/arrow/util/bpacking_dispatch_internal.h b/cpp/src/arrow/util/bpacking_dispatch_internal.h
index a2319c05701..eed5542808d 100644
--- a/cpp/src/arrow/util/bpacking_dispatch_internal.h
+++ b/cpp/src/arrow/util/bpacking_dispatch_internal.h
@@ -17,26 +17,28 @@
 
 #pragma once
 
+#include
 #include
 #include
 
+#include "arrow/util/bit_util.h"
 #include "arrow/util/endian.h"
 #include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
 #include "arrow/util/ubsan.h"
 
 namespace arrow::internal {
 
 /// Unpack a zero bit packed array.
 template <typename Uint>
-int unpack_null(const uint8_t* in, Uint* out, int batch_size) {
+void unpack_null(const uint8_t* in, Uint* out, int batch_size) {
   std::memset(out, 0, batch_size * sizeof(Uint));
-  return batch_size;
 }
 
 /// Unpack a packed array where packed and unpacked values have exactly the same number of
 /// bits.
 template <typename Uint>
-int unpack_full(const uint8_t* in, Uint* out, int batch_size) {
+void unpack_full(const uint8_t* in, Uint* out, int batch_size) {
   if constexpr (ARROW_LITTLE_ENDIAN == 1) {
     std::memcpy(out, in, batch_size * sizeof(Uint));
   } else {
@@ -47,7 +49,114 @@ int unpack_full(const uint8_t* in, Uint* out, int batch_size) {
       out[k] = FromLittleEndian(SafeLoadAs<Uint>(in + (k * sizeof(Uint))));
     }
   }
-  return batch_size;
+}
+
+/// Compute the maximum spread in bytes that a packed integer can cover.
+///
+/// This assumes contiguous packed integers starting at the given bit offset away
+/// from a byte boundary.
+/// This function is non-monotonic: for instance, with zero offset, three-bit integers
+/// will be split on the first byte boundary (hence having a spread of two bytes) while
+/// four-bit integers are well behaved and never spread over a byte boundary (hence
+/// having a spread of one).
+constexpr int PackedMaxSpreadBytes(int width, int bit_offset) {
+  int max = static_cast<int>(bit_util::BytesForBits(width));
+  int start = bit_offset;
+  do {
+    const int byte_start = start / 8;
+    const int byte_end = (start + width - 1) / 8;  // inclusive end bit
+    const int spread = byte_end - byte_start + 1;
+    max = spread > max ? spread : max;
+    start += width;
+  } while (start % 8 != bit_offset);
+  return max;
+}
+
+/// Compute the maximum spread in bytes that a packed integer can cover across all bit
+/// offsets.
+constexpr int PackedMaxSpreadBytes(int width) {
+  int max = 0;
+  for (int offset = 0; offset < 8; ++offset) {
+    const int spread = PackedMaxSpreadBytes(width, offset);
+    max = spread > max ? spread : max;
+  }
+  return max;
+}
+
+// Integer type that tries to contain as much of the spread as possible.
+template <int kSpreadBytes>
+using SpreadBufferUint = std::conditional_t<
+    (kSpreadBytes <= sizeof(uint8_t)), uint_fast8_t,
+    std::conditional_t<(kSpreadBytes <= sizeof(uint16_t)), uint_fast16_t,
+                       std::conditional_t<(kSpreadBytes <= sizeof(uint32_t)),
+                                          uint_fast32_t, uint_fast64_t>>>;
+
+/// Unpack integers.
+/// This function works for all input batch sizes but is not the fastest.
+/// In prolog mode, instead of unpacking all required elements, the function will
+/// stop if it finds a byte-aligned value start.
+template <int kPackedBitWidth, bool kIsProlog, typename Uint>
+int unpack_exact(const uint8_t* in, Uint* out, int batch_size, int bit_offset) {
+  // For the epilog we adapt the max spread, since better alignment gives shorter spreads.
+  ARROW_DCHECK(kIsProlog || bit_offset == 0);
+  ARROW_DCHECK(bit_offset >= 0 && bit_offset < 8);
+  constexpr int kMaxSpreadBytes = kIsProlog ? PackedMaxSpreadBytes(kPackedBitWidth)
+                                            : PackedMaxSpreadBytes(kPackedBitWidth, 0);
+  using buffer_uint = SpreadBufferUint<kMaxSpreadBytes>;
+  constexpr int kBufferSize = sizeof(buffer_uint);
+  // Due to misalignment, on large bit widths the spread can be larger than the largest
+  // integer type. For instance, a 63-bit misaligned packed integer can spread over 9
+  // aligned bytes.
+  constexpr bool kOversized = kBufferSize < kMaxSpreadBytes;
+  constexpr buffer_uint kLowMask =
+      bit_util::LeastSignificantBitMask(kPackedBitWidth);
+
+  ARROW_DCHECK_GE(bit_offset, 0);
+  ARROW_DCHECK_LE(bit_offset, 8);
+
+  // Loop over the values one by one.
+  const int start_bit_term = batch_size * kPackedBitWidth + bit_offset;
+  int start_bit = bit_offset;
+  while ((start_bit < start_bit_term) && (!kIsProlog || (start_bit % 8 != 0))) {
+    const int start_byte = start_bit / 8;
+    const int spread_bytes = ((start_bit + kPackedBitWidth - 1) / 8) - start_byte + 1;
+    ARROW_COMPILER_ASSUME(spread_bytes <= kMaxSpreadBytes);
+
+    // Read the bytes for the current value.
+    // Must be careful not to read out of input bounds.
+    buffer_uint buffer = 0;
+    if constexpr (kOversized) {
+      // We read the max possible bytes in the first pass and handle the rest after.
+      // Even though the worst spread does not happen on all iterations, we can still read
+      // all bytes because we will mask them.
+      std::memcpy(&buffer, in + start_byte, std::min(kBufferSize, spread_bytes));
+    } else {
+      std::memcpy(&buffer, in + start_byte, spread_bytes);
+    }
+
+    buffer = bit_util::FromLittleEndian(buffer);
+    const int bit_offset = start_bit % 8;
+    buffer >>= bit_offset;
+    Uint val = static_cast<Uint>(buffer & kLowMask);
+
+    // Handle the oversized bytes.
+    if constexpr (kOversized) {
+      // The oversized bytes do not occur on every iteration.
+      if (spread_bytes > kBufferSize) {
+        std::memcpy(&buffer, in + start_byte + kBufferSize, spread_bytes - kBufferSize);
+        buffer = bit_util::FromLittleEndian(buffer);
+        buffer <<= 8 * kBufferSize - bit_offset;
+        val |= static_cast<Uint>(buffer & kLowMask);
+      }
+    }
+
+    *out = val;
+    out++;
+    start_bit += kPackedBitWidth;
+  }
+
+  ARROW_DCHECK((start_bit - bit_offset) % kPackedBitWidth == 0);
+  return (start_bit - bit_offset) / kPackedBitWidth;
 }
 
 /// Unpack a packed array, delegating to a Unpacker struct.
@@ -59,292 +168,317 @@ int unpack_full(const uint8_t* in, Uint* out, int batch_size) {
 /// @tparam UnpackedUInt The type in which we unpack the values.
 template <int kPackedBitWidth, template <int> typename Unpacker, typename UnpackedUInt>
-int unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size) {
-  using UnpackerForWidth = Unpacker<kPackedBitWidth>;
+void unpack_width(const uint8_t* in, UnpackedUInt* out, int batch_size, int bit_offset) {
+  if constexpr (kPackedBitWidth == 0) {
+    // Easy case to handle, simply setting memory to zero.
+    return unpack_null(in, out, batch_size);
+  } else {
+    // In case of misalignment, we need to run the prolog until aligned.
+    int extracted = unpack_exact(in, out, batch_size, bit_offset);
+    // We either extracted everything or reached a byte-aligned position.
+    const int start_bit = extracted * kPackedBitWidth + bit_offset;
+    ARROW_DCHECK((extracted == batch_size) || ((start_bit) % 8 == 0));
+    batch_size -= extracted;
+    ARROW_DCHECK_GE(batch_size, 0);
+    in += start_bit / 8;
+    out += extracted;
 
-  constexpr auto kValuesUnpacked = UnpackerForWidth::kValuesUnpacked;
-  batch_size = batch_size / kValuesUnpacked * kValuesUnpacked;
-  int num_loops = batch_size / kValuesUnpacked;
+    if constexpr (kPackedBitWidth == 8 * sizeof(UnpackedUInt)) {
+      // Only memcpy / static_cast
+      return unpack_full(in, out, batch_size);
+    } else {
+      using UnpackerForWidth = Unpacker<kPackedBitWidth>;
+      constexpr auto kValuesUnpacked = UnpackerForWidth::kValuesUnpacked;
 
-  for (int i = 0; i < num_loops; ++i) {
-    in = UnpackerForWidth::unpack(in, out + i * kValuesUnpacked);
-  }
+      // Running the optimized kernel for batch extraction.
+      const int unpacker_iter_count = batch_size / kValuesUnpacked;
+      for (int i = 0; i < unpacker_iter_count; ++i) {
+        in = UnpackerForWidth::unpack(in, out);
+        out += kValuesUnpacked;
+      }
+      batch_size -= unpacker_iter_count * kValuesUnpacked;
 
-  return batch_size;
+      // Running the epilog for the remaining values that don't fit in a kernel.
+      ARROW_DCHECK_LT(batch_size, kValuesUnpacked);
+      ARROW_DCHECK_GE(batch_size, 0);
+      ARROW_COMPILER_ASSUME(batch_size < kValuesUnpacked);
+      ARROW_COMPILER_ASSUME(batch_size >= 0);
+      unpack_exact(in, out, batch_size, /* bit_offset= */ 0);
+    }
+  }
 }
 
 template
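
A quick illustration of the spread computation introduced above. The sketch below restates the PackedMaxSpreadBytes logic as standalone constexpr helpers and checks the examples quoted in the doc comments; the MaxSpreadBytes/MaxSpreadBytesAnyOffset names and the static_asserts are illustrative only and are not part of the patch.

// Standalone re-statement of the spread computation, for illustration only.
constexpr int MaxSpreadBytes(int width, int bit_offset) {
  // Start with the spread of a fully byte-aligned value.
  int max = (width + 7) / 8;
  int start = bit_offset;
  do {
    const int first_byte = start / 8;
    const int last_byte = (start + width - 1) / 8;  // byte holding the last bit
    const int spread = last_byte - first_byte + 1;
    if (spread > max) max = spread;
    start += width;
  } while (start % 8 != bit_offset);
  return max;
}

constexpr int MaxSpreadBytesAnyOffset(int width) {
  int max = 0;
  for (int offset = 0; offset < 8; ++offset) {
    const int spread = MaxSpreadBytes(width, offset);
    if (spread > max) max = spread;
  }
  return max;
}

// The examples from the doc comments:
static_assert(MaxSpreadBytes(3, 0) == 2);         // 3-bit values can straddle a byte boundary
static_assert(MaxSpreadBytes(4, 0) == 1);         // 4-bit values never straddle one
static_assert(MaxSpreadBytesAnyOffset(63) == 9);  // a misaligned 63-bit value can touch 9 bytes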
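
For context, a minimal sketch of how a caller might drive the widened unpack entry point after this change. The include path, the packed bytes, and the 3-bit/offset-2 layout are assumptions made up for the example; only the signature (trailing bit_offset parameter, void return) comes from the patch.

#include <cstdint>
#include <vector>

#include "arrow/util/bpacking.h"  // assumed location of the unpack declaration

void UnpackMisalignedExample() {
  // 8 values of 3 bits each, starting 2 bits into the first byte (bytes made up).
  const uint8_t packed[] = {0xB4, 0x6D, 0x06, 0x00};
  std::vector<uint32_t> out(8);
  // The new trailing argument is the bit offset of the first packed value inside
  // packed[0]; callers such as BitReader::GetBatch no longer need a scalar prolog
  // before handing the buffer to the unpacking kernels.
  ::arrow::internal::unpack(packed, out.data(), /*batch_size=*/8, /*num_bits=*/3,
                            /*bit_offset=*/2);
}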