From f32b3b30b7770c2c28d090a790264a11f18e057d Mon Sep 17 00:00:00 2001 From: Liam Gray Date: Tue, 5 Aug 2025 14:18:39 +0100 Subject: [PATCH 1/6] Medium string hashing inline(never) instead of cold, medium length performance improvement, url -4% --- rapidhash/src/inner/rapid_const.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/rapidhash/src/inner/rapid_const.rs b/rapidhash/src/inner/rapid_const.rs index 4a065c6..bf0adc7 100644 --- a/rapidhash/src/inner/rapid_const.rs +++ b/rapidhash/src/inner/rapid_const.rs @@ -75,9 +75,8 @@ pub(super) const fn rapidhash_core(mut seed: u64, secrets: &[u64; 7], data: &[u8]) -> u64 { let mut a = 0; let mut b = 0; From 8ed39b4bf9141a4e20faecf7f43baa59572b55aa Mon Sep 17 00:00:00 2001 From: Liam Gray Date: Tue, 5 Aug 2025 16:54:14 +0100 Subject: [PATCH 2/6] Improve small string hashing further --- rapidhash/src/inner/rapid_const.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/rapidhash/src/inner/rapid_const.rs b/rapidhash/src/inner/rapid_const.rs index bf0adc7..c7bfa94 100644 --- a/rapidhash/src/inner/rapid_const.rs +++ b/rapidhash/src/inner/rapid_const.rs @@ -45,22 +45,22 @@ pub(super) const fn rapidhash_core= 4 { - if data.len() >= 8 { - a ^= read_u64(data, 0); - b ^= read_u64(data, data.len() - 8); - } else { - a ^= read_u32(data, 0) as u64; - b ^= read_u32(data, data.len() - 4) as u64; - } + + if data.len() >= 8 { + a = read_u64(data, 0); + b = read_u64(data, data.len() - 8); + } else if data.len() >= 4 { + a = read_u32(data, 0) as u64; + b = read_u32(data, data.len() - 4) as u64; } else if !data.is_empty() { - a ^= ((data[0] as u64) << 45) | data[data.len() - 1] as u64; - b ^= data[data.len() >> 1] as u64; + a = ((data[0] as u64) << 45) | data[data.len() - 1] as u64; + b = data[data.len() >> 1] as u64; } seed = seed.wrapping_add(data.len() as u64); rapidhash_finish::(a, b , seed, secrets) } else { + // rapidhash_core_16_288::(seed, secrets, data) if data.len() <= 288 { // This can cause other code to not be inlined, and slow everything down. So at the cost of // marginally slower (-10%) 16..288 hashing, From 22f052dcff56598c6f3f1e96cae7875634513e44 Mon Sep 17 00:00:00 2001 From: Liam Gray Date: Wed, 6 Aug 2025 01:06:37 +0100 Subject: [PATCH 3/6] Reduce small input size codepath further --- rapidhash-bench-wasm/Cargo.toml | 2 +- rapidhash/src/inner/rapid_const.rs | 23 +++++++++++------------ 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/rapidhash-bench-wasm/Cargo.toml b/rapidhash-bench-wasm/Cargo.toml index 333096e..ba69729 100644 --- a/rapidhash-bench-wasm/Cargo.toml +++ b/rapidhash-bench-wasm/Cargo.toml @@ -11,7 +11,7 @@ name = "wasm" harness = false [dependencies] -rapidhash = { path = "../rapidhash" } +rapidhash = { path = "../rapidhash", default-features = false } foldhash = "0.1.5" fxhash = "0.2.1" diff --git a/rapidhash/src/inner/rapid_const.rs b/rapidhash/src/inner/rapid_const.rs index c7bfa94..653d9ee 100644 --- a/rapidhash/src/inner/rapid_const.rs +++ b/rapidhash/src/inner/rapid_const.rs @@ -60,21 +60,16 @@ pub(super) const fn rapidhash_core(a, b , seed, secrets) } else { - // rapidhash_core_16_288::(seed, secrets, data) - if data.len() <= 288 { - // This can cause other code to not be inlined, and slow everything down. 
So at the cost of - // marginally slower (-10%) 16..288 hashing, - // NOT COMPACT: len is 16..=288 - rapidhash_core_16_288::(seed, secrets, data) - } else { - // len is >288, on a cold path to avoid inlining as this doesn't impact large strings, but - // can otherwise prevent - rapidhash_core_cold::(seed, secrets, data) - } + rapidhash_core_16_288::(seed, secrets, data) } } -// allow rustc to inline this, but it should prefer inlining the .hash and .finish +// Never inline this; keep the small string path as small as possible to improve the inlining +// chances of the write_length_prefix and finish functions. If those two don't get inlined, the +// overall performance can be 5x worse when hashing a single string under 100 bytes. <=288 inputs +// pay the cost of one extra if, and >288 inputs pay one more function call, but this is negligible +// in comparison to the overall hashing cost. +#[cold] #[inline(never)] #[must_use] const fn rapidhash_core_16_288(mut seed: u64, secrets: &[u64; 7], data: &[u8]) -> u64 { @@ -83,6 +78,10 @@ const fn rapidhash_core_16_288 48 { + if slice.len() > 288 { + return rapidhash_core_cold::(seed, secrets, data); + } + // most CPUs appear to benefit from this unrolled loop let mut see1 = seed; let mut see2 = seed; From cd1e0be8ea3e7c097b07815331c39dfa52703de5 Mon Sep 17 00:00:00 2001 From: Liam Gray Date: Wed, 6 Aug 2025 01:55:52 +0100 Subject: [PATCH 4/6] Add foldhash optimisations acknowledgement --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 986392b..7880539 100644 --- a/README.md +++ b/README.md @@ -281,6 +281,6 @@ This project is licensed under both the MIT and Apache-2.0 licenses. You are fre With thanks to [Nicolas De Carli](https://github.com/Nicoshev) for the original [rapidhash](https://github.com/Nicoshev/rapidhash) C++ implementation, which is licensed under the [MIT License](https://github.com/Nicoshev/rapidhash/blob/master/LICENSE). -With thanks to [Orson Peters](https://github.com/orlp) for his work on [foldhash](https://github.com/orlp/foldhash), which inspired much of the integer hashing optimisations in this crate. +With thanks to [Orson Peters](https://github.com/orlp) for his work on [foldhash](https://github.com/orlp/foldhash), which inspired much of the integer hashing optimisations in this crate. Some of the RapidHasher string hashing [optimisations](https://github.com/orlp/foldhash/pull/35) have made their way back into foldhash as a thank-you, and both hashers are now very similar in performance and quality. The Rust community wins!
With thanks to [Justin Bradford](https://github.com/jabr) for letting us use the rapidhash crate name 🍻 From 38b0436abab3a3fe9cf9d3bc43b28ec16e1bff12 Mon Sep 17 00:00:00 2001 From: Liam Gray Date: Sat, 9 Aug 2025 13:58:12 +0100 Subject: [PATCH 5/6] Commented lines for high granularity benchmarking on short inputs --- rapidhash-bench/benches/bench/basic.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rapidhash-bench/benches/bench/basic.rs b/rapidhash-bench/benches/bench/basic.rs index 1515823..5a039d0 100644 --- a/rapidhash-bench/benches/bench/basic.rs +++ b/rapidhash-bench/benches/bench/basic.rs @@ -69,7 +69,8 @@ where fn bench_group(c: &mut Criterion, group_name: &str) { let mut group = c.benchmark_group(group_name.to_string()); let sizes = [2usize, 8, 16, 25, 50, 64, 80, 160, 256, 350, 1024, 4096, 65536, 1024 * 1024 * 500]; - for &size in &sizes { + // let sizes = 0usize..68; // for micro short input testing + for size in sizes { profile_bytes::(size, &mut group); } profile_int::("u8", &mut group); @@ -113,7 +114,7 @@ fn profile_bytes_raw u64>( fn bench_group_raw u64>(c: &mut Criterion, group_name: &str, hash: &H) { let mut group = c.benchmark_group(group_name.to_string()); let sizes = [2usize, 8, 16, 25, 50, 64, 80, 160, 256, 350, 1024, 4096, 65536, 1024 * 1024 * 500]; - for &size in &sizes { + for size in sizes { profile_bytes_raw(hash, size, &mut group); } } From b297584fdbe10c6e0bf955192e03cda0f1050524 Mon Sep 17 00:00:00 2001 From: Liam Gray Date: Sat, 9 Aug 2025 14:30:28 +0100 Subject: [PATCH 6/6] Use non-portable read_u64 helper for RapidHasher --- rapidhash/src/inner/{mix.rs => mix_np.rs} | 0 rapidhash/src/inner/mod.rs | 7 +- rapidhash/src/inner/rapid_const.rs | 110 +++++++++--------- rapidhash/src/inner/rapid_hasher.rs | 2 +- rapidhash/src/inner/read_np.rs | 134 ++++++++++++++++++++++ rapidhash/src/inner/seeding.rs | 2 +- 6 files changed, 196 insertions(+), 59 deletions(-) rename rapidhash/src/inner/{mix.rs => mix_np.rs} (100%) create mode 100644 rapidhash/src/inner/read_np.rs diff --git a/rapidhash/src/inner/mix.rs b/rapidhash/src/inner/mix_np.rs similarity index 100% rename from rapidhash/src/inner/mix.rs rename to rapidhash/src/inner/mix_np.rs diff --git a/rapidhash/src/inner/mod.rs b/rapidhash/src/inner/mod.rs index d9f7461..b78f07b 100644 --- a/rapidhash/src/inner/mod.rs +++ b/rapidhash/src/inner/mod.rs @@ -33,8 +33,9 @@ mod rapid_const; mod rapid_hasher; mod state; pub(crate) mod seeding; -mod mix; +mod mix_np; mod seed; +mod read_np; #[doc(inline)] pub use rapid_hasher::*; @@ -53,7 +54,7 @@ mod tests { use std::hash::{BuildHasher, Hash, Hasher}; use std::collections::BTreeSet; use rand::Rng; - use crate::inner::mix::rapid_mix_np; + use crate::inner::mix_np::rapid_mix_np; use super::seed::{DEFAULT_RAPID_SECRETS, DEFAULT_SEED}; use super::rapid_const::{rapidhash_rs, rapidhash_rs_seeded}; @@ -66,6 +67,7 @@ mod tests { } /// `#[derive(Hash)]` writes a length prefix first, check understanding. + #[cfg(target_endian = "little")] #[test] fn derive_hash_works() { let object = Object { bytes: b"hello world".to_vec() }; @@ -186,6 +188,7 @@ mod tests { } /// Compare to the C rapidhash implementation to ensure we match perfectly. 
+ #[cfg(target_endian = "little")] #[test] fn compare_to_c() { use rand::Rng; diff --git a/rapidhash/src/inner/rapid_const.rs b/rapidhash/src/inner/rapid_const.rs index 653d9ee..a8a1b17 100644 --- a/rapidhash/src/inner/rapid_const.rs +++ b/rapidhash/src/inner/rapid_const.rs @@ -1,5 +1,5 @@ -use crate::util::read::{read_u32, read_u64}; -use super::mix::{rapid_mix_np, rapid_mum_np}; +use super::mix_np::{rapid_mix_np, rapid_mum_np}; +use super::read_np::{read_u32_np, read_u64_np}; #[cfg(test)] use super::{DEFAULT_RAPID_SECRETS, RapidSecrets}; @@ -47,11 +47,11 @@ pub(super) const fn rapidhash_core= 8 { - a = read_u64(data, 0); - b = read_u64(data, data.len() - 8); + a = read_u64_np(data, 0); + b = read_u64_np(data, data.len() - 8); } else if data.len() >= 4 { - a = read_u32(data, 0) as u64; - b = read_u32(data, data.len() - 4) as u64; + a = read_u32_np(data, 0) as u64; + b = read_u32_np(data, data.len() - 4) as u64; } else if !data.is_empty() { a = ((data[0] as u64) << 45) | data[data.len() - 1] as u64; b = data[data.len() >> 1] as u64; @@ -87,9 +87,9 @@ const fn rapidhash_core_16_288= 48 { - seed = rapid_mix_np::(read_u64(slice, 0) ^ secrets[0], read_u64(slice, 8) ^ seed); - see1 = rapid_mix_np::(read_u64(slice, 16) ^ secrets[1], read_u64(slice, 24) ^ see1); - see2 = rapid_mix_np::(read_u64(slice, 32) ^ secrets[2], read_u64(slice, 40) ^ see2); + seed = rapid_mix_np::(read_u64_np(slice, 0) ^ secrets[0], read_u64_np(slice, 8) ^ seed); + see1 = rapid_mix_np::(read_u64_np(slice, 16) ^ secrets[1], read_u64_np(slice, 24) ^ see1); + see2 = rapid_mix_np::(read_u64_np(slice, 32) ^ secrets[2], read_u64_np(slice, 40) ^ see2); let (_, split) = slice.split_at(48); slice = split; } @@ -98,14 +98,14 @@ const fn rapidhash_core_16_288 16 { - seed = rapid_mix_np::(read_u64(slice, 0) ^ secrets[0], read_u64(slice, 8) ^ seed); + seed = rapid_mix_np::(read_u64_np(slice, 0) ^ secrets[0], read_u64_np(slice, 8) ^ seed); if slice.len() > 32 { - seed = rapid_mix_np::(read_u64(slice, 16) ^ secrets[1], read_u64(slice, 24) ^ seed); + seed = rapid_mix_np::(read_u64_np(slice, 16) ^ secrets[1], read_u64_np(slice, 24) ^ seed); } } - a ^= read_u64(data, data.len() - 16); - b ^= read_u64(data, data.len() - 8); + a ^= read_u64_np(data, data.len() - 16); + b ^= read_u64_np(data, data.len() - 8); seed = seed.wrapping_add(data.len() as u64); rapidhash_finish::(a, b , seed, secrets) @@ -132,46 +132,46 @@ const fn rapidhash_core_cold= 224 { - seed = rapid_mix_np::(read_u64(slice, 0) ^ secrets[0], read_u64(slice, 8) ^ seed); - see1 = rapid_mix_np::(read_u64(slice, 16) ^ secrets[1], read_u64(slice, 24) ^ see1); - see2 = rapid_mix_np::(read_u64(slice, 32) ^ secrets[2], read_u64(slice, 40) ^ see2); - see3 = rapid_mix_np::(read_u64(slice, 48) ^ secrets[3], read_u64(slice, 56) ^ see3); - see4 = rapid_mix_np::(read_u64(slice, 64) ^ secrets[4], read_u64(slice, 72) ^ see4); - see5 = rapid_mix_np::(read_u64(slice, 80) ^ secrets[5], read_u64(slice, 88) ^ see5); - see6 = rapid_mix_np::(read_u64(slice, 96) ^ secrets[6], read_u64(slice, 104) ^ see6); - - seed = rapid_mix_np::(read_u64(slice, 112) ^ secrets[0], read_u64(slice, 120) ^ seed); - see1 = rapid_mix_np::(read_u64(slice, 128) ^ secrets[1], read_u64(slice, 136) ^ see1); - see2 = rapid_mix_np::(read_u64(slice, 144) ^ secrets[2], read_u64(slice, 152) ^ see2); - see3 = rapid_mix_np::(read_u64(slice, 160) ^ secrets[3], read_u64(slice, 168) ^ see3); - see4 = rapid_mix_np::(read_u64(slice, 176) ^ secrets[4], read_u64(slice, 184) ^ see4); - see5 = rapid_mix_np::(read_u64(slice, 192) ^ secrets[5], 
read_u64(slice, 200) ^ see5); - see6 = rapid_mix_np::(read_u64(slice, 208) ^ secrets[6], read_u64(slice, 216) ^ see6); + seed = rapid_mix_np::(read_u64_np(slice, 0) ^ secrets[0], read_u64_np(slice, 8) ^ seed); + see1 = rapid_mix_np::(read_u64_np(slice, 16) ^ secrets[1], read_u64_np(slice, 24) ^ see1); + see2 = rapid_mix_np::(read_u64_np(slice, 32) ^ secrets[2], read_u64_np(slice, 40) ^ see2); + see3 = rapid_mix_np::(read_u64_np(slice, 48) ^ secrets[3], read_u64_np(slice, 56) ^ see3); + see4 = rapid_mix_np::(read_u64_np(slice, 64) ^ secrets[4], read_u64_np(slice, 72) ^ see4); + see5 = rapid_mix_np::(read_u64_np(slice, 80) ^ secrets[5], read_u64_np(slice, 88) ^ see5); + see6 = rapid_mix_np::(read_u64_np(slice, 96) ^ secrets[6], read_u64_np(slice, 104) ^ see6); + + seed = rapid_mix_np::(read_u64_np(slice, 112) ^ secrets[0], read_u64_np(slice, 120) ^ seed); + see1 = rapid_mix_np::(read_u64_np(slice, 128) ^ secrets[1], read_u64_np(slice, 136) ^ see1); + see2 = rapid_mix_np::(read_u64_np(slice, 144) ^ secrets[2], read_u64_np(slice, 152) ^ see2); + see3 = rapid_mix_np::(read_u64_np(slice, 160) ^ secrets[3], read_u64_np(slice, 168) ^ see3); + see4 = rapid_mix_np::(read_u64_np(slice, 176) ^ secrets[4], read_u64_np(slice, 184) ^ see4); + see5 = rapid_mix_np::(read_u64_np(slice, 192) ^ secrets[5], read_u64_np(slice, 200) ^ see5); + see6 = rapid_mix_np::(read_u64_np(slice, 208) ^ secrets[6], read_u64_np(slice, 216) ^ see6); let (_, split) = slice.split_at(224); slice = split; } if slice.len() >= 112 { - seed = rapid_mix_np::(read_u64(slice, 0) ^ secrets[0], read_u64(slice, 8) ^ seed); - see1 = rapid_mix_np::(read_u64(slice, 16) ^ secrets[1], read_u64(slice, 24) ^ see1); - see2 = rapid_mix_np::(read_u64(slice, 32) ^ secrets[2], read_u64(slice, 40) ^ see2); - see3 = rapid_mix_np::(read_u64(slice, 48) ^ secrets[3], read_u64(slice, 56) ^ see3); - see4 = rapid_mix_np::(read_u64(slice, 64) ^ secrets[4], read_u64(slice, 72) ^ see4); - see5 = rapid_mix_np::(read_u64(slice, 80) ^ secrets[5], read_u64(slice, 88) ^ see5); - see6 = rapid_mix_np::(read_u64(slice, 96) ^ secrets[6], read_u64(slice, 104) ^ see6); + seed = rapid_mix_np::(read_u64_np(slice, 0) ^ secrets[0], read_u64_np(slice, 8) ^ seed); + see1 = rapid_mix_np::(read_u64_np(slice, 16) ^ secrets[1], read_u64_np(slice, 24) ^ see1); + see2 = rapid_mix_np::(read_u64_np(slice, 32) ^ secrets[2], read_u64_np(slice, 40) ^ see2); + see3 = rapid_mix_np::(read_u64_np(slice, 48) ^ secrets[3], read_u64_np(slice, 56) ^ see3); + see4 = rapid_mix_np::(read_u64_np(slice, 64) ^ secrets[4], read_u64_np(slice, 72) ^ see4); + see5 = rapid_mix_np::(read_u64_np(slice, 80) ^ secrets[5], read_u64_np(slice, 88) ^ see5); + see6 = rapid_mix_np::(read_u64_np(slice, 96) ^ secrets[6], read_u64_np(slice, 104) ^ see6); let (_, split) = slice.split_at(112); slice = split; } } else { while slice.len() >= 112 { - seed = rapid_mix_np::(read_u64(slice, 0) ^ secrets[0], read_u64(slice, 8) ^ seed); - see1 = rapid_mix_np::(read_u64(slice, 16) ^ secrets[1], read_u64(slice, 24) ^ see1); - see2 = rapid_mix_np::(read_u64(slice, 32) ^ secrets[2], read_u64(slice, 40) ^ see2); - see3 = rapid_mix_np::(read_u64(slice, 48) ^ secrets[3], read_u64(slice, 56) ^ see3); - see4 = rapid_mix_np::(read_u64(slice, 64) ^ secrets[4], read_u64(slice, 72) ^ see4); - see5 = rapid_mix_np::(read_u64(slice, 80) ^ secrets[5], read_u64(slice, 88) ^ see5); - see6 = rapid_mix_np::(read_u64(slice, 96) ^ secrets[6], read_u64(slice, 104) ^ see6); + seed = rapid_mix_np::(read_u64_np(slice, 0) ^ secrets[0], read_u64_np(slice, 8) 
^ seed); + see1 = rapid_mix_np::(read_u64_np(slice, 16) ^ secrets[1], read_u64_np(slice, 24) ^ see1); + see2 = rapid_mix_np::(read_u64_np(slice, 32) ^ secrets[2], read_u64_np(slice, 40) ^ see2); + see3 = rapid_mix_np::(read_u64_np(slice, 48) ^ secrets[3], read_u64_np(slice, 56) ^ see3); + see4 = rapid_mix_np::(read_u64_np(slice, 64) ^ secrets[4], read_u64_np(slice, 72) ^ see4); + see5 = rapid_mix_np::(read_u64_np(slice, 80) ^ secrets[5], read_u64_np(slice, 88) ^ see5); + see6 = rapid_mix_np::(read_u64_np(slice, 96) ^ secrets[6], read_u64_np(slice, 104) ^ see6); let (_, split) = slice.split_at(112); slice = split; } @@ -179,25 +179,25 @@ const fn rapidhash_core_cold= 48 { - seed = rapid_mix_np::(read_u64(slice, 0) ^ secrets[0], read_u64(slice, 8) ^ seed); - see1 = rapid_mix_np::(read_u64(slice, 16) ^ secrets[1], read_u64(slice, 24) ^ see1); - see2 = rapid_mix_np::(read_u64(slice, 32) ^ secrets[2], read_u64(slice, 40) ^ see2); + seed = rapid_mix_np::(read_u64_np(slice, 0) ^ secrets[0], read_u64_np(slice, 8) ^ seed); + see1 = rapid_mix_np::(read_u64_np(slice, 16) ^ secrets[1], read_u64_np(slice, 24) ^ see1); + see2 = rapid_mix_np::(read_u64_np(slice, 32) ^ secrets[2], read_u64_np(slice, 40) ^ see2); let (_, split) = slice.split_at(48); slice = split; if slice.len() >= 48 { - seed = rapid_mix_np::(read_u64(slice, 0) ^ secrets[0], read_u64(slice, 8) ^ seed); - see1 = rapid_mix_np::(read_u64(slice, 16) ^ secrets[1], read_u64(slice, 24) ^ see1); - see2 = rapid_mix_np::(read_u64(slice, 32) ^ secrets[2], read_u64(slice, 40) ^ see2); + seed = rapid_mix_np::(read_u64_np(slice, 0) ^ secrets[0], read_u64_np(slice, 8) ^ seed); + see1 = rapid_mix_np::(read_u64_np(slice, 16) ^ secrets[1], read_u64_np(slice, 24) ^ see1); + see2 = rapid_mix_np::(read_u64_np(slice, 32) ^ secrets[2], read_u64_np(slice, 40) ^ see2); let (_, split) = slice.split_at(48); slice = split; } } } else { while slice.len() >= 48 { - seed = rapid_mix_np::(read_u64(slice, 0) ^ secrets[0], read_u64(slice, 8) ^ seed); - see1 = rapid_mix_np::(read_u64(slice, 16) ^ secrets[1], read_u64(slice, 24) ^ see1); - see2 = rapid_mix_np::(read_u64(slice, 32) ^ secrets[2], read_u64(slice, 40) ^ see2); + seed = rapid_mix_np::(read_u64_np(slice, 0) ^ secrets[0], read_u64_np(slice, 8) ^ seed); + see1 = rapid_mix_np::(read_u64_np(slice, 16) ^ secrets[1], read_u64_np(slice, 24) ^ see1); + see2 = rapid_mix_np::(read_u64_np(slice, 32) ^ secrets[2], read_u64_np(slice, 40) ^ see2); let (_, split) = slice.split_at(48); slice = split; } @@ -211,14 +211,14 @@ const fn rapidhash_core_cold 16 { - seed = rapid_mix_np::(read_u64(slice, 0) ^ secrets[2], read_u64(slice, 8) ^ seed); + seed = rapid_mix_np::(read_u64_np(slice, 0) ^ secrets[2], read_u64_np(slice, 8) ^ seed); if slice.len() > 32 { - seed = rapid_mix_np::(read_u64(slice, 16) ^ secrets[2], read_u64(slice, 24) ^ seed); + seed = rapid_mix_np::(read_u64_np(slice, 16) ^ secrets[2], read_u64_np(slice, 24) ^ seed); } } - a ^= read_u64(data, data.len() - 16); - b ^= read_u64(data, data.len() - 8); + a ^= read_u64_np(data, data.len() - 16); + b ^= read_u64_np(data, data.len() - 8); seed = seed.wrapping_add(data.len() as u64); rapidhash_finish::(a, b , seed, secrets) diff --git a/rapidhash/src/inner/rapid_hasher.rs b/rapidhash/src/inner/rapid_hasher.rs index fe172b9..c15f1e7 100644 --- a/rapidhash/src/inner/rapid_hasher.rs +++ b/rapidhash/src/inner/rapid_hasher.rs @@ -1,6 +1,6 @@ use core::hash::{BuildHasher, Hasher}; use super::DEFAULT_RAPID_SECRETS; -use super::mix::rapid_mix_np; +use super::mix_np::rapid_mix_np; use 
super::rapid_const::rapidhash_core; use super::seed::rapidhash_seed; diff --git a/rapidhash/src/inner/read_np.rs b/rapidhash/src/inner/read_np.rs new file mode 100644 index 0000000..c591284 --- /dev/null +++ b/rapidhash/src/inner/read_np.rs @@ -0,0 +1,134 @@ +//! Internal module for reading unaligned bytes from a slice into `u64` and `u32` values. +//! +//! This is a non-portable implementation specifically designed for `RapidHasher`. + +/// Hacky const-friendly memory-safe unaligned bytes to u64. Compiler can't seem to remove the +/// bounds check, and so we have an unsafe version behind the `unsafe` feature flag. +#[cfg(not(feature = "unsafe"))] +#[inline(always)] +pub(crate) const fn read_u64_np(slice: &[u8], offset: usize) -> u64 { + // equivalent to slice[offset..offset+8].try_into().unwrap(), but const-friendly + let maybe_buf = slice.split_at(offset).1.first_chunk::<8>(); + let buf = match maybe_buf { + Some(buf) => *buf, + None => panic!("read_u64: slice too short"), + }; + u64::from_ne_bytes(buf) +} + +/// Hacky const-friendly memory-safe unaligned bytes to u32. Compiler can't seem to remove the +/// bounds check, and so we have an unsafe version behind the `unsafe` feature flag. +#[cfg(not(feature = "unsafe"))] +#[inline(always)] +pub(crate) const fn read_u32_np(slice: &[u8], offset: usize) -> u32 { + // equivalent to slice[offset..offset+4].try_into().unwrap(), but const-friendly + let maybe_buf = slice.split_at(offset).1.first_chunk::<4>(); + let buf = match maybe_buf { + Some(buf) => *buf, + None => panic!("read_u32: slice too short"), + }; + u32::from_ne_bytes(buf) +} + +/// Unsafe but const-friendly unaligned bytes to u64. The compiler can't seem to remove the bounds +/// checks for small integers because we do some funky bit shifting in the indexing. +/// +/// SAFETY: `slice` must be at least `offset+8` bytes long, which we guarantee in this rapidhash +/// implementation. +#[cfg(feature = "unsafe")] +#[inline(always)] +pub(crate) const fn read_u64_np(slice: &[u8], offset: usize) -> u64 { + debug_assert!(offset as isize >= 0); + debug_assert!(slice.len() >= 8 + offset); + unsafe { core::ptr::read_unaligned(slice.as_ptr().offset(offset as isize) as *const u64) } +} + +/// Unsafe but const-friendly unaligned bytes to u32. The compiler can't seem to remove the bounds +/// checks for small integers because we do some funky bit shifting in the indexing. +/// +/// SAFETY: `slice` must be at least `offset+4` bytes long, which we guarantee in this rapidhash +/// implementation.
+#[cfg(feature = "unsafe")] +#[inline(always)] +pub(crate) const fn read_u32_np(slice: &[u8], offset: usize) -> u32 { + debug_assert!(offset as isize >= 0); + debug_assert!(slice.len() >= 4 + offset); + unsafe { core::ptr::read_unaligned(slice.as_ptr().offset(offset as isize) as *const u32) } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[cfg(target_endian = "little")] + #[test] + fn test_read_u32_np() { + let bytes = &[23, 145, 3, 34]; + + let split_result = bytes.split_at(0).1; + assert_eq!(split_result.len(), 4); + let maybe_buf = split_result.first_chunk::<4>(); + assert_eq!(maybe_buf, Some(&[23, 145, 3, 34])); + + assert_eq!(read_u32_np(bytes, 0), 570659095); + + let bytes = &[24, 54, 3, 23, 145, 3, 34]; + assert_eq!(read_u32_np(bytes, 3), 570659095); + + assert_eq!(read_u32_np(&[0, 0, 0, 0], 0), 0); + assert_eq!(read_u32_np(&[1, 0, 0, 0], 0), 1); + assert_eq!(read_u32_np(&[12, 0, 0, 0], 0), 12); + assert_eq!(read_u32_np(&[0, 10, 0, 0], 0), 2560); + } + + #[cfg(target_endian = "little")] + #[test] + fn test_read_u64_np() { + let bytes = [23, 145, 3, 34, 0, 0, 0, 0, 0, 0, 0].as_slice(); + assert_eq!(read_u64_np(bytes, 0), 570659095); + + let bytes = [1, 2, 3, 23, 145, 3, 34, 0, 0, 0, 0, 0, 0, 0].as_slice(); + assert_eq!(read_u64_np(bytes, 3), 570659095); + + let bytes = [0, 0, 0, 0, 0, 0, 0, 0].as_slice(); + assert_eq!(read_u64_np(bytes, 0), 0); + } + + #[cfg(target_endian = "little")] + #[cfg(feature = "std")] + #[test] + fn test_u32_to_u128_delta() { + fn formula(len: u64) -> u64 { + (len & 24) >> (len >> 3) + } + + fn formula2(len: u64) -> u64 { + match len { + 8.. => 4, + _ => 0, + } + } + + let inputs: std::vec::Vec<u64> = (4..=16).collect(); + let outputs: std::vec::Vec<u64> = inputs.iter().map(|&x| formula(x)).collect(); + let expected = std::vec![0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4]; + assert_eq!(outputs, expected); + assert_eq!(outputs, inputs.iter().map(|&x| formula2(x)).collect::<std::vec::Vec<u64>>()); + } + + #[test] + #[should_panic] + #[cfg(not(feature = "unsafe"))] + fn test_read_u32_np_too_short_panics() { + let bytes = [23, 145, 0].as_slice(); + assert_eq!(read_u32_np(bytes, 0), 0); + } + + #[test] + #[should_panic] + #[cfg(not(feature = "unsafe"))] + fn test_read_u64_np_too_short_panics() { + let bytes = [23, 145, 0].as_slice(); + assert_eq!(read_u64_np(bytes, 0), 0); + } +} diff --git a/rapidhash/src/inner/seeding.rs b/rapidhash/src/inner/seeding.rs index 35b38a6..a2b2e99 100644 --- a/rapidhash/src/inner/seeding.rs +++ b/rapidhash/src/inner/seeding.rs @@ -14,7 +14,7 @@ const DEFAULT_SECRETS: [u64; 7] = [ ]; pub(crate) mod seed { - use crate::inner::mix::rapid_mix_np; + use crate::inner::mix_np::rapid_mix_np; use super::DEFAULT_SECRETS; #[inline]
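
The small-input strategy that patches 1-3 converge on is easier to read outside the diff. The sketch below is a minimal, hypothetical stand-in (prelude `try_into`/`from_ne_bytes` instead of the crate's `read_u64_np`/`read_u32_np` helpers, and it stops before the finisher): inputs of 8..=16 bytes are covered by two possibly-overlapping 8-byte reads, 4..=7 bytes by two 4-byte reads, and 1..=3 bytes by the first, last, and middle bytes.

pub fn load_small(data: &[u8]) -> (u64, u64) {
    if data.len() >= 8 {
        // 8..=16 bytes: two possibly-overlapping 8-byte reads from each end.
        let a = u64::from_ne_bytes(data[..8].try_into().unwrap());
        let b = u64::from_ne_bytes(data[data.len() - 8..].try_into().unwrap());
        (a, b)
    } else if data.len() >= 4 {
        // 4..=7 bytes: the same trick with two 4-byte reads.
        let a = u32::from_ne_bytes(data[..4].try_into().unwrap()) as u64;
        let b = u32::from_ne_bytes(data[data.len() - 4..].try_into().unwrap()) as u64;
        (a, b)
    } else if !data.is_empty() {
        // 1..=3 bytes: first, last, and middle byte; the shift by 45 keeps the
        // first byte's bits clear of the last byte's.
        let a = ((data[0] as u64) << 45) | data[data.len() - 1] as u64;
        let b = data[data.len() >> 1] as u64;
        (a, b)
    } else {
        (0, 0)
    }
}

Patch 2's switch from `a ^= ...` to `a = ...` is behaviour-preserving because `a` and `b` are zero on entry to this path; it simply drops a redundant XOR from the dependency chain.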
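The comment added in patch 3 describes a general code-layout pattern: keep the hot small-input path tiny so rustc inlines it (together with the hasher's write and finish) into every call site, and push the bulky loops behind #[cold] #[inline(never)] so their size stops counting against the caller's inlining budget. A minimal sketch of that shape, with placeholder mixing rather than rapidhash's:

#[inline(always)]
pub fn hash_core(data: &[u8]) -> u64 {
    if data.len() <= 16 {
        // Hot path: deliberately small and branch-light so it inlines everywhere.
        data.len() as u64 // placeholder for the real small-input mixing
    } else {
        hash_16_288(data)
    }
}

#[cold]
#[inline(never)]
fn hash_16_288(data: &[u8]) -> u64 {
    // <=288-byte inputs pay one extra branch; >288-byte inputs pay one extra call.
    if data.len() > 288 {
        return hash_long(data);
    }
    data.iter()
        .fold(data.len() as u64, |h, &b| h.rotate_left(9) ^ b as u64) // placeholder
}

#[cold]
#[inline(never)]
fn hash_long(data: &[u8]) -> u64 {
    data.chunks(288).fold(0, |h, chunk| h ^ hash_16_288(chunk)) // placeholder
}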
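Every bulk loop in patches 3 and 6 has the same shape: consume a fixed-size block per iteration with several independent accumulators, so the 64x64 multiplies from different lanes can be in flight simultaneously instead of serialising on one register. A hedged sketch of the three-lane, 48-byte stripe, where `mix` is a stand-in multiply-fold in the spirit of `rapid_mix_np` (not the crate's function, which also takes a const generic parameter):

fn mix(a: u64, b: u64) -> u64 {
    // Widening 64x64 -> 128 multiply, folded back down to 64 bits.
    let m = (a as u128) * (b as u128);
    (m as u64) ^ ((m >> 64) as u64)
}

fn read64(s: &[u8], o: usize) -> u64 {
    u64::from_ne_bytes(s[o..o + 8].try_into().unwrap())
}

pub fn stripe48(mut seed: u64, secrets: &[u64; 7], mut slice: &[u8]) -> u64 {
    let mut see1 = seed;
    let mut see2 = seed;
    while slice.len() >= 48 {
        // Three independent lanes per 48-byte block, as in the diff above.
        seed = mix(read64(slice, 0) ^ secrets[0], read64(slice, 8) ^ seed);
        see1 = mix(read64(slice, 16) ^ secrets[1], read64(slice, 24) ^ see1);
        see2 = mix(read64(slice, 32) ^ secrets[2], read64(slice, 40) ^ see2);
        slice = slice.split_at(48).1;
    }
    // Simplified: the real code folds the lanes into the tail handling instead.
    seed ^ see1 ^ see2
}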
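Patch 6's safe read helpers lean on `split_at(offset).1.first_chunk::<N>()` because the usual `slice[offset..offset + 8].try_into().unwrap()` cannot be called in a `const fn`. A self-contained copy of the trick (requires Rust 1.77+, where `first_chunk` is const-stable):

const fn read_u64_safe(slice: &[u8], offset: usize) -> u64 {
    // `split_at` and `first_chunk` are const-stable; `try_into` is not.
    let buf = match slice.split_at(offset).1.first_chunk::<8>() {
        Some(buf) => *buf,
        None => panic!("read_u64: slice too short"),
    };
    u64::from_ne_bytes(buf)
}

// Evaluates at compile time; a too-short slice becomes a compile error here.
const WORD: u64 = read_u64_safe(b"exactly 16 bytes", 8);

fn main() {
    assert_eq!(WORD, u64::from_ne_bytes(*b"16 bytes"));
}

The `unsafe`-feature variant trades the remaining bounds check for a `debug_assert!` plus `core::ptr::read_unaligned`, which is why its safety contract (a slice of at least `offset + 8` or `offset + 4` bytes) must be upheld by the callers in `rapid_const.rs`.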