Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions salsa20/benches/mod.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
#![feature(test)]

use cipher::{
Array,
consts::{U1, U4, U64},
};
extern crate test;

cipher::stream_cipher_bench!(
salsa20::Salsa8;
salsa8_bench1_16b 16;
salsa8_bench1_64b 64;
salsa8_bench2_256b 256;
salsa8_bench3_1kib 1024;
salsa8_bench4_16kib 16384;
Expand All @@ -12,6 +18,7 @@ cipher::stream_cipher_bench!(
cipher::stream_cipher_bench!(
salsa20::Salsa12;
salsa12_bench1_16b 16;
salsa12_bench1_64b 64;
salsa12_bench2_256b 256;
salsa12_bench3_1kib 1024;
salsa12_bench4_16kib 16384;
Expand All @@ -20,7 +27,50 @@ cipher::stream_cipher_bench!(
// Throughput benchmarks for `salsa20::Salsa20`, generated by the `cipher` crate's
// `stream_cipher_bench!` macro. Each entry is `<bench fn name> <number>;`. The numbers line up
// with the size suffix in each name (16 B, 64 B, 256 B, 1 KiB, 16 KiB), so they are presumably
// the buffer size in bytes — confirm against the macro's documentation.
cipher::stream_cipher_bench!(
salsa20::Salsa20;
salsa20_bench1_16b 16;
salsa20_bench1_64b 64;
salsa20_bench2_256b 256;
salsa20_bench3_1kib 1024;
salsa20_bench4_16kib 16384;
);

/// Benchmarks generating a single keystream block with `SalsaCore::<U4>` through the
/// `SalsaChaining` API (`from_raw_state_cv` / `write_keystream_block_cv`).
///
/// The output of every iteration is written back into `buf`, which is also the raw state
/// for the next iteration, so the iterations form a dependency chain.
#[bench]
fn salsa8_bench1_ks_altn(b: &mut test::Bencher) {
    use salsa20::SalsaChaining;
    use std::hash::{BuildHasher, Hasher};

    // A fresh `RandomState` hasher yields a value that is only known at run time, so the
    // initial state cannot be constant-folded by the optimizer.
    let entropy = std::hash::RandomState::new().build_hasher().finish();

    let mut buf: Array<[u32; 16], U1> = [[0u32; 16]].into();
    buf[0][0] = entropy as u32;
    buf[0][1] = (entropy >> 32) as u32;

    b.iter(|| {
        let mut cipher = salsa20::SalsaCore::<U4>::from_raw_state_cv(buf);
        cipher.write_keystream_block_cv([&mut buf[0]].into());
        test::black_box(&buf);
    });

    // One 16-word (64-byte) block is produced per iteration.
    b.bytes = core::mem::size_of_val(&buf[0]) as u64;
}

/// Benchmarks generating a single keystream block with `SalsaCore::<U4>` through the
/// `StreamCipherCore` API (`from_raw_state` / `write_keystream_block`).
///
/// The 64 output bytes are written over `words`, which is also the raw state for the next
/// iteration, so the iterations form a dependency chain.
#[bench]
fn salsa8_bench1_ks(b: &mut test::Bencher) {
    use cipher::StreamCipherCore;
    use std::hash::{BuildHasher, Hasher};

    // A fresh `RandomState` hasher yields a value that is only known at run time, so the
    // initial state cannot be constant-folded by the optimizer.
    let entropy = std::hash::RandomState::new().build_hasher().finish();

    let mut words = [0u32; 16];
    words[0] = entropy as u32;
    words[1] = (entropy >> 32) as u32;

    b.iter(|| {
        let mut cipher = salsa20::SalsaCore::<U4>::from_raw_state(words);
        // SAFETY: `[u32; 16]` is 64 bytes with alignment 4 and every bit pattern is a valid
        // `u8`. This relies on `Array<u8, U64>` having the same layout as `[u8; 64]`
        // (64 bytes, alignment 1) — TODO confirm against the `Array` type's documentation.
        let block = unsafe {
            core::mem::transmute::<&mut [u32; 16], &mut Array<u8, U64>>(&mut words)
        };
        cipher.write_keystream_block(block);
        test::black_box(&words);
    });

    // One 16-word (64-byte) block is produced per iteration.
    b.bytes = core::mem::size_of_val(&words) as u64;
}
31 changes: 30 additions & 1 deletion salsa20/src/backends.rs
Original file line number Diff line number Diff line change
@@ -1 +1,30 @@
pub(crate) mod soft;
use cfg_if::cfg_if;

// Compile-time backend selection. When the crate is built for x86 or x86_64 with the `sse2`
// target feature statically enabled, only the `sse2` module is compiled and `Backend` aliases
// `sse2::Backend`; on every other configuration only the `soft` module is compiled and
// `Backend` aliases `soft::Backend`.
cfg_if! {
if #[cfg(all(target_feature = "sse2", any(target_arch = "x86", target_arch = "x86_64")))] {
pub(crate) mod sse2;
pub(crate) type Backend<'a, R> = sse2::Backend<'a, R>;
} else {
pub(crate) mod soft;
pub(crate) type Backend<'a, R> = soft::Backend<'a, R>;
}
}

/// Applies one Salsa quarter-round in place to four words of `state`.
///
/// `a`, `b`, `c` and `d` are word indices that are first translated through
/// `crate::DATA_LAYOUT_INVERSE`; the translated values are the positions actually read and
/// written in `state`. Panics if an index is out of range for the table or for `state`.
#[inline]
#[allow(clippy::many_single_char_names)]
pub(crate) fn quarter_round(
    a: usize,
    b: usize,
    c: usize,
    d: usize,
    state: &mut [u32; crate::STATE_WORDS],
) {
    // Translate all four indices to their storage positions, in argument order.
    let [a, b, c, d] = [a, b, c, d].map(|word| crate::DATA_LAYOUT_INVERSE[word]);

    // Add-rotate step; the result is XORed into the target word by the caller.
    let mix = |x: u32, y: u32, rot: u32| x.wrapping_add(y).rotate_left(rot);

    // Each step consumes the word updated by the previous one, so the order matters.
    state[b] ^= mix(state[a], state[d], 7);
    state[c] ^= mix(state[b], state[a], 9);
    state[d] ^= mix(state[c], state[b], 13);
    state[a] ^= mix(state[d], state[c], 18);
}
39 changes: 22 additions & 17 deletions salsa20/src/backends/soft.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,16 @@ use cipher::{
consts::{U1, U64},
};

use super::quarter_round;

/// Keystream backend of the `soft` module: a thin wrapper around a mutable borrow of the
/// `SalsaCore<R>` whose state it reads and whose block position it advances.
pub(crate) struct Backend<'a, R: Unsigned>(pub(crate) &'a mut SalsaCore<R>);

/// Wraps a mutable borrow of the cipher core in a `Backend`; no state is copied.
impl<'a, R: Unsigned> From<&'a mut SalsaCore<R>> for Backend<'a, R> {
    fn from(inner: &'a mut SalsaCore<R>) -> Self {
        Self(inner)
    }
}

/// Declares the backend's block size as `U64`, i.e. 64-byte keystream blocks.
impl<R: Unsigned> BlockSizeUser for Backend<'_, R> {
type BlockSize = U64;
}
Expand All @@ -17,34 +25,31 @@ impl<R: Unsigned> ParBlocksSizeUser for Backend<'_, R> {
type ParBlocksSize = U1;
}

impl<R: Unsigned> Backend<'_, R> {
    /// Produces the next keystream block as raw `u32` words.
    ///
    /// Runs the rounds on the core's current state, advances the core's block position by
    /// one, and copies the resulting words into `block` unchanged (no byte serialization).
    #[inline(always)]
    pub(crate) fn gen_ks_block_altn(&mut self, block: &mut [u32; STATE_WORDS]) {
        // The rounds must see the state as it is before the position moves on.
        let keystream = run_rounds::<R>(&self.0.state);

        let next_pos = self.0.get_block_pos() + 1;
        self.0.set_block_pos(next_pos);

        *block = keystream;
    }
}

impl<R: Unsigned> StreamCipherBackend for Backend<'_, R> {
#[inline(always)]
fn gen_ks_block(&mut self, block: &mut Block<Self>) {
let res = run_rounds::<R>(&self.0.state);

self.0.set_block_pos(self.0.get_block_pos() + 1);

for (chunk, val) in block.chunks_exact_mut(4).zip(res.iter()) {
chunk.copy_from_slice(&val.to_le_bytes());
for i in 0..16 {
block[i * 4..(i + 1) * 4]
.copy_from_slice(&res[crate::DATA_LAYOUT_INVERSE[i]].to_le_bytes());
}
}
}

/// Applies one Salsa quarter-round in place to the words of `state` at indices
/// `a`, `b`, `c` and `d`.
///
/// Each step adds two words (wrapping), rotates the sum left by 7, 9, 13 and 18 bits
/// respectively, and XORs it into the next word; every step consumes the word updated by
/// the previous one. Panics if an index is out of range for `state`.
#[inline]
#[allow(clippy::many_single_char_names)]
pub(crate) fn quarter_round(
    a: usize,
    b: usize,
    c: usize,
    d: usize,
    state: &mut [u32; STATE_WORDS],
) {
    let sum = state[a].wrapping_add(state[d]);
    state[b] ^= sum.rotate_left(7);

    let sum = state[b].wrapping_add(state[a]);
    state[c] ^= sum.rotate_left(9);

    let sum = state[c].wrapping_add(state[b]);
    state[d] ^= sum.rotate_left(13);

    let sum = state[d].wrapping_add(state[c]);
    state[a] ^= sum.rotate_left(18);
}

#[inline(always)]
fn run_rounds<R: Unsigned>(state: &[u32; STATE_WORDS]) -> [u32; STATE_WORDS] {
let mut res = *state;
Expand Down
111 changes: 111 additions & 0 deletions salsa20/src/backends/sse2.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
//! SSE2 backend for Salsa20.

use crate::{Block, STATE_WORDS, SalsaCore, Unsigned};
use cipher::{
BlockSizeUser, ParBlocksSizeUser, StreamCipherBackend, StreamCipherSeekCore,
consts::{U1, U64},
};

/// Keystream backend of the `sse2` module: a thin wrapper around a mutable borrow of the
/// `SalsaCore<R>` whose state it reads and whose block position it advances.
pub(crate) struct Backend<'a, R: Unsigned>(pub(crate) &'a mut SalsaCore<R>);

/// Wraps a mutable borrow of the cipher core in a `Backend`; no state is copied.
impl<'a, R: Unsigned> From<&'a mut SalsaCore<R>> for Backend<'a, R> {
    fn from(inner: &'a mut SalsaCore<R>) -> Self {
        Self(inner)
    }
}

/// Declares the backend's block size as `U64`, i.e. 64-byte keystream blocks.
impl<R: Unsigned> BlockSizeUser for Backend<'_, R> {
type BlockSize = U64;
}

/// Declares the parallel-blocks size as `U1`: this backend produces one block per call.
impl<R: Unsigned> ParBlocksSizeUser for Backend<'_, R> {
type ParBlocksSize = U1;
}

impl<R: Unsigned> Backend<'_, R> {
    /// Produces the next keystream block as raw `u32` words.
    ///
    /// Runs the SSE2 rounds on the core's current state, advances the core's block position
    /// by one, and copies the resulting words into `block` unchanged (no byte serialization).
    #[inline(always)]
    pub(crate) fn gen_ks_block_altn(&mut self, block: &mut [u32; STATE_WORDS]) {
        // The rounds must see the state as it is before the position moves on.
        let keystream = run_rounds_sse2::<R>(&self.0.state);

        let next_pos = self.0.get_block_pos() + 1;
        self.0.set_block_pos(next_pos);

        *block = keystream;
    }
}

impl<R: Unsigned> StreamCipherBackend for Backend<'_, R> {
    /// Produces the next keystream block as bytes.
    ///
    /// Runs the SSE2 rounds on the core's current state, advances the core's block position
    /// by one, then fills `block` four bytes at a time: the `i`-th 4-byte group receives the
    /// little-endian encoding of the result word at index `crate::DATA_LAYOUT_INVERSE[i]`.
    #[inline(always)]
    fn gen_ks_block(&mut self, block: &mut Block<Self>) {
        // The rounds must see the state as it is before the position moves on.
        let words = run_rounds_sse2::<R>(&self.0.state);

        let next_pos = self.0.get_block_pos() + 1;
        self.0.set_block_pos(next_pos);

        // Pair every 4-byte output slot with the index of the word that belongs there.
        let sources = crate::DATA_LAYOUT_INVERSE.iter();
        for (out, &src) in block.chunks_exact_mut(4).zip(sources) {
            out.copy_from_slice(&words[src].to_le_bytes());
        }
    }
}

/// Runs `R::USIZE` loop iterations of the Salsa permutation over `state` using SSE2 and
/// returns the permuted words with the original input words added back in (wrapping).
///
/// `state` is consumed as four 128-bit vectors of four `u32` lanes each, loaded from word
/// offsets 0, 4, 8 and 12. Every loop iteration applies the vectorized quarter-round twice,
/// re-aligning the lanes after each application. With `R::USIZE == 0` the result is simply
/// each input word doubled.
///
/// The output uses the same word order as the input.
#[inline(always)]
fn run_rounds_sse2<R: Unsigned>(state: &[u32; STATE_WORDS]) -> [u32; STATE_WORDS] {
    // The SSE2 backend is selected for both 32-bit and 64-bit x86 targets, and
    // `core::arch::x86_64` does not exist on 32-bit x86, so import from whichever module
    // the current target provides.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    // The raw loads/stores below touch words 0..16 of `state` and `res`; make that a
    // compile-time guarantee instead of an unstated assumption.
    const _: () = assert!(STATE_WORDS >= 16);

    // SAFETY:
    // * All pointer offsets (0, 4, 8, 12) plus the 4-word access width stay within the
    //   `STATE_WORDS >= 16` words of `state` / `res`, as asserted above.
    // * `_mm_loadu_si128` / `_mm_storeu_si128` have no alignment requirement.
    // * This module is only compiled when the `sse2` target feature is statically enabled
    //   (see the `cfg` in `backends.rs`), so the intrinsics are available on every CPU the
    //   binary is allowed to run on.
    unsafe {
        // Note the binding order: the vector at word offset 8 is `d`, the one at 12 is `c`.
        let [a_save, b_save, d_save, c_save] = [
            _mm_loadu_si128(state.as_ptr().add(0).cast()),
            _mm_loadu_si128(state.as_ptr().add(4).cast()),
            _mm_loadu_si128(state.as_ptr().add(8).cast()),
            _mm_loadu_si128(state.as_ptr().add(12).cast()),
        ];
        let [mut a, mut b, mut c, mut d] = [a_save, b_save, c_save, d_save];

        // Per-lane 32-bit rotate-left; SSE2 has no rotate instruction, so combine two shifts.
        macro_rules! mm_rol_epi32x {
            ($w:expr, $amt:literal) => {{
                let w = $w;
                _mm_xor_si128(_mm_slli_epi32(w, $amt), _mm_srli_epi32(w, 32 - $amt))
            }};
        }

        // Four quarter-rounds at once, one per lane.
        macro_rules! quarter_xmmwords {
            ($a:expr, $b:expr, $c:expr, $d:expr) => {
                $b = _mm_xor_si128($b, mm_rol_epi32x!(_mm_add_epi32($a, $d), 7));
                $c = _mm_xor_si128($c, mm_rol_epi32x!(_mm_add_epi32($b, $a), 9));
                $d = _mm_xor_si128($d, mm_rol_epi32x!(_mm_add_epi32($c, $b), 13));
                $a = _mm_xor_si128($a, mm_rol_epi32x!(_mm_add_epi32($d, $c), 18));
            };
        }

        // Re-aligns the lanes for the next quarter-round: `a` stays in place, the new `b`
        // is the old `d` rotated left by one lane, `c` is rotated left by two lanes, and
        // the new `d` is the old `b` rotated left by three lanes.
        macro_rules! realign_lanes {
            ($b:ident, $c:ident, $d:ident) => {
                $d = _mm_shuffle_epi32($d, 0b00_11_10_01);
                $c = _mm_shuffle_epi32($c, 0b01_00_11_10);
                $b = _mm_shuffle_epi32($b, 0b10_01_00_11);
                ($b, $d) = ($d, $b);
            };
        }

        for _ in 0..R::USIZE {
            quarter_xmmwords!(a, b, c, d);
            realign_lanes!(b, c, d);

            quarter_xmmwords!(a, b, c, d);
            // Applying the same re-alignment a second time restores the original lane
            // order, so every iteration starts from the same arrangement.
            realign_lanes!(b, c, d);
        }

        // Feed-forward: add the saved input to the permuted words and store each vector
        // back at the offset it was loaded from.
        let mut res = [0u32; STATE_WORDS];
        _mm_storeu_si128(res.as_mut_ptr().add(0).cast(), _mm_add_epi32(a, a_save));
        _mm_storeu_si128(res.as_mut_ptr().add(4).cast(), _mm_add_epi32(b, b_save));
        _mm_storeu_si128(res.as_mut_ptr().add(8).cast(), _mm_add_epi32(d, d_save));
        _mm_storeu_si128(res.as_mut_ptr().add(12).cast(), _mm_add_epi32(c, c_save));
        res
    }
}
Loading