rustic-rs · aawsome · Jan 23, 2023 · Jan 14, 2023 · Jan 20, 2023
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -39,7 +39,7 @@ sha2 = "0.10"
 rand = "0.8"
 scrypt = { version = "0.10", default-features = false }
 # chunker / packer
-cdc = "0.1"
+# cdc = "0.1" (replaced by the in-tree copy in src/cdc)
 integer-sqrt = "0.1"
 # serialization
 base64 = "0.20"

diff --git a/src/cdc/LICENSE.txt b/src/cdc/LICENSE.txt
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2016 Vincent Cantin (https://github.com/green-coder)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/src/cdc/README.md b/src/cdc/README.md
@@ -0,0 +1,60 @@
+cdc
+========
+
+A library for performing *Content-Defined Chunking* (CDC) on data streams. It is implemented using generic iterators and is very easy to use.
+
+- [API Documentation](https://docs.rs/cdc/)
+
+## Example
+
+```rust
+  let reader: BufReader<File> = BufReader::new(file);
+  let byte_iter = reader.bytes().map(|b| b.unwrap());
+
+  // Finds and iterates on the separators.
+  for separator in SeparatorIter::new(byte_iter) {
+    println!("Index: {}, hash: {:016x}", separator.index, separator.hash);
+  }
+```
+
+Each module is documented via an example which you can find in the `examples/` folder.
+
+To run them, use a command like:
+
+    cargo run --example separator --release
+
+**Note:** Some examples look for a file named `myLargeFile.bin`, which I didn't upload to GitHub. Please use your own files for testing.
+
+## What's in the crate
+
+From low level to high level:
+
+* A `RollingHash64` trait, for rolling hashes with a 64-bit hash value.
+
+* `Rabin64`, an implementation of the Rabin Fingerprint rolling hash with a 64-bit hash value.
+
+* `Separator`, a struct which describes a place in a data stream identified as a separator.
+
+* `SeparatorIter`, an adaptor which takes an `Iterator<Item=u8>` as input and which enumerates all the separators found.
+
+* `Chunk`, a struct which describes a piece of the data stream (index and size).
+
+* `ChunkIter`, an adaptor which takes an `Iterator<Item=Separator>` as input and which enumerates chunks.
+
+## Implementation details
+
+* The library does not cut any files; it only provides information on how to do so.
+
+* You can change the default window size used by `Rabin64`, and how the `SeparatorIter` chooses the separator.
+
+* The design of this crate may be subject to change in the future. I am waiting for some features of `Rust` to mature, especially the [`impl Trait`](https://github.com/rust-lang/rust/issues/34511) feature.
+
+## Performance
+
+There is a **huge** difference between the debug build and the release build in terms of performance. Remember to use `cargo run --release` when you test the lib.
+
+I may try to improve the performance of the lib at some point, but for now it is good enough for most usages.
+
+## License
+
+Coded with ❤️ , licensed under the terms of the [MIT license](LICENSE.txt).
diff --git a/src/cdc/mod.rs b/src/cdc/mod.rs
@@ -0,0 +1,5 @@
+//! Polynomial arithmetic and a rolling hash built on top of it.
+//!
+//! Both submodules are private; the items callers need are re-exported below.
+
+mod polynom;
+mod rolling_hash;
+
+// Public surface of this module: the polynomial trait and its `u64` alias,
+// plus the rolling-hash trait and its `Rabin64` implementation.
+pub use polynom::{Polynom, Polynom64};
+pub use rolling_hash::{Rabin64, RollingHash64};
diff --git a/src/cdc/polynom.rs b/src/cdc/polynom.rs
@@ -0,0 +1,51 @@
+/// Operations on a polynomial, such as the irreducible polynomial used in the
+/// fingerprint function.
+pub trait Polynom {
+    /// Returns the degree of the polynomial.
+    fn degree(&self) -> i32;
+    /// Returns the remainder of `self` divided by `m`.
+    fn modulo(self, m: Self) -> Self;
+}
+
+/// A polynomial packed into a `u64`; the index of the highest set bit is its degree.
+pub type Polynom64 = u64;
+
+impl Polynom for Polynom64 {
+    /// Returns the degree of the polynomial, i.e. the index of its highest set bit.
+    ///
+    /// The zero polynomial has no set bit and yields `-1`.
+    fn degree(&self) -> i32 {
+        63 - self.leading_zeros() as i32
+    }
+
+    /// Returns the remainder of dividing `self` by `m`, where subtraction is XOR.
+    ///
+    /// The result always has a degree strictly lower than that of `m`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `m` is zero. A zero modulus has degree `-1`, so the reduction
+    /// loop below could never make progress and would spin forever.
+    fn modulo(self, m: Self) -> Self {
+        assert!(m != 0, "polynomial modulus must be non-zero");
+
+        // The modulus never changes, so its degree is computed once.
+        let m_degree = m.degree();
+
+        let mut p = self;
+        while p.degree() >= m_degree {
+            // Align the leading term of `m` with the leading term of `p` and
+            // cancel it; each pass lowers the degree of `p`.
+            p ^= m << (p.degree() - m_degree);
+        }
+
+        p
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Checks `degree` on the zero and unit polynomials and around a power of two.
+    #[test]
+    fn polynom_degree() {
+        // (polynomial, expected degree)
+        let cases: [(u64, i32); 5] = [
+            (0, -1),
+            (1, 0),
+            ((1 << 7) - 1, 6),
+            (1 << 7, 7),
+            ((1 << 7) + 1, 7),
+        ];
+
+        for &(polynom, expected) in &cases {
+            assert_eq!(polynom.degree(), expected);
+        }
+    }
+
+    /// Checks `modulo` against hand-computed remainders.
+    #[test]
+    fn polynom_modulo() {
+        // (polynomial, modulus, expected remainder)
+        let cases: [(u64, u64, u64); 7] = [
+            (7, 3, 1),
+            (7, 4, 3),
+            (7, 2, 1),
+            (16, 8, 0),
+            (19, 8, 3),
+            (16, 4, 0),
+            (19, 4, 3),
+        ];
+
+        for &(polynom, modulus, expected) in &cases {
+            assert_eq!(polynom.modulo(modulus), expected);
+        }
+    }
+}
diff --git a/src/cdc/rolling_hash.rs b/src/cdc/rolling_hash.rs
@@ -0,0 +1,161 @@
+use super::{Polynom, Polynom64};
+
+/// A rolling hash over a window of bytes that produces a 64-bit hash value.
+pub trait RollingHash64 {
+    /// Returns the hasher to its initial state.
+    fn reset(&mut self);
+    /// Feeds bytes taken from `iter` into the window.
+    ///
+    /// Returns the number of bytes consumed from `iter`.
+    fn prefill_window<I>(&mut self, iter: &mut I) -> usize
+    where
+        I: Iterator<Item = u8>;
+    /// Performs a reset followed by a prefill as a single operation.
+    ///
+    /// Returns the number of bytes consumed from `iter`.
+    fn reset_and_prefill_window<I>(&mut self, iter: &mut I) -> usize
+    where
+        I: Iterator<Item = u8>;
+    /// Pushes `byte` into the window and updates the hash accordingly.
+    fn slide(&mut self, byte: u8);
+    /// Returns a reference to the current hash value.
+    fn get_hash(&self) -> &Polynom64;
+}
+
+/// Rabin fingerprint rolling hash with a 64-bit hash value.
+pub struct Rabin64 {
+    // Configuration
+    /// The size of the data window used in the hash calculation.
+    window_size: usize,
+    /// `window_size - 1`; used as a bit mask to wrap `window_index`, which
+    /// supposes that `window_size` is a power of 2.
+    window_size_mask: usize,
+
+    // Precalculations
+    /// Degree of the modulus polynomial minus 8; shifting the hash right by this
+    /// amount yields the index into `mod_table`.
+    polynom_shift: i32,
+    /// Per-byte value XOR-ed into the hash to remove the byte leaving the window.
+    out_table: [Polynom64; 256],
+    /// Per-byte value XOR-ed into the hash after it has been shifted left by 8 bits.
+    mod_table: [Polynom64; 256],
+
+    // Current state
+    /// Circular buffer holding the bytes currently in the window.
+    window_data: Vec<u8>,
+    /// Position in `window_data` of the next byte to be replaced.
+    window_index: usize,
+    /// Current hash value.
+    pub hash: Polynom64,
+}
+
+impl Rabin64 {
+    /// Builds the table used to remove the byte leaving the window from the hash.
+    ///
+    /// Entry `b` is the byte value `b` reduced modulo `mod_polynom`, then shifted
+    /// left by 8 bits and reduced again `window_size - 1` times.
+    pub fn calculate_out_table(window_size: usize, mod_polynom: Polynom64) -> [Polynom64; 256] {
+        let mut table: [Polynom64; 256] = [0; 256];
+        for (byte, slot) in table.iter_mut().enumerate() {
+            let seed = (byte as Polynom64).modulo(mod_polynom);
+            *slot = (0..window_size - 1).fold(seed, |acc, _| (acc << 8).modulo(mod_polynom));
+        }
+
+        table
+    }
+
+    /// Builds the table applied to the hash after each 8-bit shift.
+    ///
+    /// Entry `b` is `b` shifted left by the degree of `mod_polynom`, OR-ed with
+    /// the remainder of that shifted value modulo `mod_polynom`.
+    pub fn calculate_mod_table(mod_polynom: Polynom64) -> [Polynom64; 256] {
+        let degree = mod_polynom.degree();
+        let mut table: [Polynom64; 256] = [0; 256];
+        table.iter_mut().enumerate().for_each(|(byte, slot)| {
+            let shifted: Polynom64 = (byte as Polynom64) << degree;
+            let remainder = shifted.modulo(mod_polynom);
+            *slot = remainder | shifted;
+        });
+
+        table
+    }
+
+    /// Creates a hasher with a window of `2^window_size_nb_bits` bytes that
+    /// reduces modulo `mod_polynom`.
+    ///
+    /// The window starts out filled with zeroes and the hash starts at 0.
+    pub fn new_with_polynom(window_size_nb_bits: u32, mod_polynom: Polynom64) -> Rabin64 {
+        let window_size: usize = 1 << window_size_nb_bits;
+        let window_data = vec![0; window_size];
+
+        let window_size_mask = window_size - 1;
+        let polynom_shift = mod_polynom.degree() - 8;
+        let out_table = Self::calculate_out_table(window_size, mod_polynom);
+        let mod_table = Self::calculate_mod_table(mod_polynom);
+
+        Self {
+            window_size,
+            window_size_mask,
+            polynom_shift,
+            out_table,
+            mod_table,
+            window_data,
+            window_index: 0,
+            hash: 0,
+        }
+    }
+}
+
+impl RollingHash64 for Rabin64 {
+    /// Zeroes the window, rewinds the window position and clears the hash.
+    fn reset(&mut self) {
+        self.window_data.clear();
+        self.window_data.resize(self.window_size, 0);
+        self.window_index = 0;
+        self.hash = 0;
+    }
+
+    /// Attempts to fill the window with `window_size - 1` bytes taken from `iter`.
+    ///
+    /// Stops early when `iter` runs out. Returns the number of bytes consumed.
+    fn prefill_window<I>(&mut self, iter: &mut I) -> usize
+    where
+        I: Iterator<Item = u8>,
+    {
+        let mut nb_bytes_read = 0;
+        for _ in 0..self.window_size - 1 {
+            match iter.next() {
+                Some(b) => {
+                    self.slide(b);
+                    nb_bytes_read += 1;
+                }
+                None => break,
+            }
+        }
+
+        nb_bytes_read
+    }
+
+    /// Combines a reset with a prefill in an optimized way.
+    ///
+    /// The hash is cleared and up to `window_size - 1` bytes from `iter` are
+    /// pushed in without first removing the bytes they replace. Unlike `reset`,
+    /// `window_index` is not rewound; the buffer is circular and the loop simply
+    /// continues from the current position. Every slot that did not receive a
+    /// new byte is zeroed afterwards, so the window ends up with the same
+    /// contents (up to rotation) as after `reset` followed by `prefill_window`,
+    /// even when `iter` runs out early.
+    ///
+    /// Returns the number of bytes consumed from `iter`.
+    fn reset_and_prefill_window<I>(&mut self, iter: &mut I) -> usize
+    where
+        I: Iterator<Item = u8>,
+    {
+        self.hash = 0;
+        let mut nb_bytes_read = 0;
+        for _ in 0..self.window_size - 1 {
+            match iter.next() {
+                Some(b) => {
+                    // Take the old value out of the window and the hash.
+                    // ... the window is treated as containing zeroes, so there
+                    // is nothing to remove.
+
+                    // Put the new value in the window and in the hash.
+                    self.window_data[self.window_index] = b;
+                    let mod_index = (self.hash >> self.polynom_shift) & 255;
+                    self.hash <<= 8;
+                    self.hash |= u64::from(b);
+                    self.hash ^= self.mod_table[mod_index as usize];
+
+                    // Move the window index to the next position.
+                    self.window_index = (self.window_index + 1) & self.window_size_mask;
+
+                    nb_bytes_read += 1;
+                }
+                None => break,
+            }
+        }
+
+        // Zero every slot the loop above did not overwrite. After a full prefill
+        // that is exactly one slot (the one at `window_index`). After a short
+        // read it also clears bytes left over from the previous window, which a
+        // later `slide` would otherwise remove from a hash that never held them.
+        let mut index = self.window_index;
+        for _ in nb_bytes_read..self.window_size {
+            self.window_data[index] = 0;
+            index = (index + 1) & self.window_size_mask;
+        }
+
+        nb_bytes_read
+    }
+
+    /// Slides the window forward by one byte and updates the hash.
+    ///
+    /// The byte stored at `window_index` leaves the window and `byte` takes its place.
+    #[inline]
+    fn slide(&mut self, byte: u8) {
+        // Take the old value out of the window and the hash.
+        let out_value = self.window_data[self.window_index];
+        self.hash ^= self.out_table[out_value as usize];
+
+        // Put the new value in the window and in the hash.
+        self.window_data[self.window_index] = byte;
+        // The table index must be read before the shift below changes the hash.
+        let mod_index = (self.hash >> self.polynom_shift) & 255;
+        self.hash <<= 8;
+        self.hash |= u64::from(byte);
+        self.hash ^= self.mod_table[mod_index as usize];
+
+        // Move the window index to the next position.
+        self.window_index = (self.window_index + 1) & self.window_size_mask;
+    }
+
+    /// Returns a reference to the current hash value.
+    #[inline]
+    fn get_hash(&self) -> &Polynom64 {
+        &self.hash
+    }
+}