diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 3b58978..0ae02ac 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -46,8 +46,13 @@ jobs: args: --all -- --check combo: - name: Test + name: Test ${{ matrix.args }} runs-on: ubuntu-latest + strategy: + fail-fast: false # ensures if one fails, the other keeps running + matrix: + # This creates two jobs: one with all features enabled, one with only the `complex-nums` feature + args: ["--all-features", "--features=complex-nums"] steps: - name: Checkout sources uses: actions/checkout@v2 @@ -63,7 +68,7 @@ jobs: uses: actions-rs/cargo@v1 with: command: test - args: --all-features + args: ${{ matrix.args }} coverage: runs-on: ubuntu-latest diff --git a/Cargo.toml b/Cargo.toml index fe3f52d..7f27f02 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,10 +18,12 @@ multiversion = "0.8.0" num-complex = { version = "0.4.6", features = ["bytemuck"], optional = true } bytemuck = { version = "1.23.2", optional = true } wide = "0.8.1" +rayon = { version = "1.11.0", optional = true } [features] default = [] complex-nums = ["dep:num-complex", "dep:bytemuck"] +parallel = ["dep:rayon"] [dev-dependencies] criterion = "0.8.0" diff --git a/src/algorithms/dif.rs b/src/algorithms/dif.rs index 7b34736..31d1d0a 100644 --- a/src/algorithms/dif.rs +++ b/src/algorithms/dif.rs @@ -16,6 +16,7 @@ use crate::algorithms::cobra::cobra_apply; use crate::kernels::common::{fft_chunk_2, fft_chunk_4}; use crate::kernels::dif::{fft_32_chunk_n_simd, fft_64_chunk_n_simd, fft_chunk_n}; use crate::options::Options; +use crate::parallel::run_maybe_in_parallel; use crate::planner::{Direction, Planner32, Planner64}; use crate::twiddles::filter_twiddles; @@ -118,15 +119,11 @@ pub fn fft_64_with_opts_and_plan( // Optional bit reversal (controlled by options) if opts.dif_perform_bit_reversal { - if opts.multithreaded_bit_reversal { - std::thread::scope(|s| { - s.spawn(|| cobra_apply(reals, n)); - s.spawn(|| cobra_apply(imags, n)); - }); - } else { - 
cobra_apply(reals, n); - cobra_apply(imags, n); - } + run_maybe_in_parallel( + opts.multithreaded_bit_reversal, + || cobra_apply(reals, n), + || cobra_apply(imags, n), + ); } // Scaling for inverse transform @@ -225,15 +222,11 @@ pub fn fft_32_with_opts_and_plan( } if opts.dif_perform_bit_reversal { - if opts.multithreaded_bit_reversal { - std::thread::scope(|s| { - s.spawn(|| cobra_apply(reals, n)); - s.spawn(|| cobra_apply(imags, n)); - }); - } else { - cobra_apply(reals, n); - cobra_apply(imags, n); - } + run_maybe_in_parallel( + opts.multithreaded_bit_reversal, + || cobra_apply(reals, n), + || cobra_apply(imags, n), + ); } // Scaling for inverse transform diff --git a/src/algorithms/dit.rs b/src/algorithms/dit.rs index c7733a3..fa85e96 100644 --- a/src/algorithms/dit.rs +++ b/src/algorithms/dit.rs @@ -23,6 +23,7 @@ use crate::kernels::dit::{ fft_dit_chunk_8_simd_f64, }; use crate::options::Options; +use crate::parallel::run_maybe_in_parallel; use crate::planner::{Direction, PlannerDit32, PlannerDit64}; /// L1 cache block size in complex elements (8KB for f32, 16KB for f64) @@ -35,9 +36,9 @@ const L1_BLOCK_SIZE: usize = 1024; fn recursive_dit_fft_f64( reals: &mut [f64], imags: &mut [f64], - offset: usize, size: usize, planner: &PlannerDit64, + opts: &Options, mut stage_twiddle_idx: usize, ) -> usize { let log_size = size.ilog2() as usize; @@ -45,8 +46,8 @@ fn recursive_dit_fft_f64( if size <= L1_BLOCK_SIZE { for stage in 0..log_size { stage_twiddle_idx = execute_dit_stage_f64( - &mut reals[offset..offset + size], - &mut imags[offset..offset + size], + &mut reals[..size], + &mut imags[..size], stage, planner, stage_twiddle_idx, @@ -57,9 +58,14 @@ fn recursive_dit_fft_f64( let half = size / 2; let log_half = half.ilog2() as usize; + let (re_first_half, re_second_half) = reals.split_at_mut(half); + let (im_first_half, im_second_half) = imags.split_at_mut(half); // Recursively process both halves - recursive_dit_fft_f64(reals, imags, offset, half, planner, 0); - 
recursive_dit_fft_f64(reals, imags, offset + half, half, planner, 0); + run_maybe_in_parallel( + size > opts.smallest_parallel_chunk_size, + || recursive_dit_fft_f64(re_first_half, im_first_half, half, planner, opts, 0), + || recursive_dit_fft_f64(re_second_half, im_second_half, half, planner, opts, 0), + ); // Both halves completed stages 0..log_half-1 // Stages 0-5 use hardcoded twiddles, 6+ use planner @@ -68,8 +74,8 @@ fn recursive_dit_fft_f64( // Process remaining stages that span both halves for stage in log_half..log_size { stage_twiddle_idx = execute_dit_stage_f64( - &mut reals[offset..offset + size], - &mut imags[offset..offset + size], + &mut reals[..size], + &mut imags[..size], stage, planner, stage_twiddle_idx, @@ -84,9 +90,9 @@ fn recursive_dit_fft_f64( fn recursive_dit_fft_f32( reals: &mut [f32], imags: &mut [f32], - offset: usize, size: usize, planner: &PlannerDit32, + opts: &Options, mut stage_twiddle_idx: usize, ) -> usize { let log_size = size.ilog2() as usize; @@ -94,8 +100,8 @@ fn recursive_dit_fft_f32( if size <= L1_BLOCK_SIZE { for stage in 0..log_size { stage_twiddle_idx = execute_dit_stage_f32( - &mut reals[offset..offset + size], - &mut imags[offset..offset + size], + &mut reals[..size], + &mut imags[..size], stage, planner, stage_twiddle_idx, @@ -106,15 +112,24 @@ fn recursive_dit_fft_f32( let half = size / 2; let log_half = half.ilog2() as usize; - recursive_dit_fft_f32(reals, imags, offset, half, planner, 0); - recursive_dit_fft_f32(reals, imags, offset + half, half, planner, 0); + let (re_first_half, re_second_half) = reals.split_at_mut(half); + let (im_first_half, im_second_half) = imags.split_at_mut(half); + // Recursively process both halves + run_maybe_in_parallel( + size > opts.smallest_parallel_chunk_size, + || recursive_dit_fft_f32(re_first_half, im_first_half, half, planner, opts, 0), + || recursive_dit_fft_f32(re_second_half, im_second_half, half, planner, opts, 0), + ); + // Both halves completed stages 0..log_half-1 + // 
Stages 0-5 use hardcoded twiddles, 6+ use planner stage_twiddle_idx = log_half.saturating_sub(6); + // Process remaining stages that span both halves for stage in log_half..log_size { stage_twiddle_idx = execute_dit_stage_f32( - &mut reals[offset..offset + size], - &mut imags[offset..offset + size], + &mut reals[..size], + &mut imags[..size], stage, planner, stage_twiddle_idx, @@ -235,15 +250,11 @@ pub fn fft_64_dit_with_planner_and_opts( assert_eq!(log_n, planner.log_n); // DIT requires bit-reversed input - if opts.multithreaded_bit_reversal { - std::thread::scope(|s| { - s.spawn(|| cobra_apply(reals, log_n)); - s.spawn(|| cobra_apply(imags, log_n)); - }); - } else { - cobra_apply(reals, log_n); - cobra_apply(imags, log_n); - } + run_maybe_in_parallel( + opts.multithreaded_bit_reversal, + || cobra_apply(reals, log_n), + || cobra_apply(imags, log_n), + ); // Handle inverse FFT if let Direction::Reverse = planner.direction { @@ -252,7 +263,7 @@ pub fn fft_64_dit_with_planner_and_opts( } } - recursive_dit_fft_f64(reals, imags, 0, n, planner, 0); + recursive_dit_fft_f64(reals, imags, n, planner, opts, 0); // Scaling for inverse transform if let Direction::Reverse = planner.direction { @@ -282,15 +293,11 @@ pub fn fft_32_dit_with_planner_and_opts( assert_eq!(log_n, planner.log_n); // DIT requires bit-reversed input - if opts.multithreaded_bit_reversal { - std::thread::scope(|s| { - s.spawn(|| cobra_apply(reals, log_n)); - s.spawn(|| cobra_apply(imags, log_n)); - }); - } else { - cobra_apply(reals, log_n); - cobra_apply(imags, log_n); - } + run_maybe_in_parallel( + opts.multithreaded_bit_reversal, + || cobra_apply(reals, log_n), + || cobra_apply(imags, log_n), + ); // Handle inverse FFT if let Direction::Reverse = planner.direction { @@ -299,7 +306,7 @@ pub fn fft_32_dit_with_planner_and_opts( } } - recursive_dit_fft_f32(reals, imags, 0, n, planner, 0); + recursive_dit_fft_f32(reals, imags, n, planner, opts, 0); // Scaling for inverse transform if let Direction::Reverse 
= planner.direction { diff --git a/src/lib.rs b/src/lib.rs index 984ab70..ccab469 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,6 +20,7 @@ use crate::utils::{combine_re_im, deinterleave_complex32, deinterleave_complex64 mod algorithms; mod kernels; pub mod options; +mod parallel; pub mod planner; mod twiddles; mod utils; diff --git a/src/options.rs b/src/options.rs index e3c9aab..d0b8aab 100644 --- a/src/options.rs +++ b/src/options.rs @@ -9,8 +9,10 @@ #[derive(Debug, Clone)] pub struct Options { /// Whether to run the bit reversal step in 2 threads instead of one. - /// This is beneficial only at large input sizes (i.e. gigabytes of data). + /// This is beneficial only at medium to large sizes (i.e. megabytes of data). /// The exact threshold where it starts being beneficial varies depending on the hardware. + /// + /// This option is ignored if the `parallel` feature is disabled. pub multithreaded_bit_reversal: bool, /// Controls bit reversal behavior for DIF FFT algorithms. @@ -33,6 +35,13 @@ pub struct Options { /// fft_64_with_opts_and_plan(&mut reals, &mut imags, &opts, &planner); /// ``` pub dif_perform_bit_reversal: bool, + + /// Do not split the input any further to run in parallel below this size + /// + /// Set to `usize::MAX` to disable parallelism in the recursive FFT step. + /// + /// This option is ignored if the `parallel` feature is disabled. 
+ pub smallest_parallel_chunk_size: usize, } impl Default for Options { @@ -40,6 +49,7 @@ impl Default for Options { Self { multithreaded_bit_reversal: false, dif_perform_bit_reversal: true, // Default to standard FFT behavior + smallest_parallel_chunk_size: usize::MAX, } } } @@ -49,7 +59,8 @@ impl Options { pub fn guess_options(input_size: usize) -> Options { let mut options = Options::default(); let n: usize = input_size.ilog2() as usize; - options.multithreaded_bit_reversal = n >= 22; + options.multithreaded_bit_reversal = n >= 16; + options.smallest_parallel_chunk_size = 16384; options } } diff --git a/src/parallel.rs b/src/parallel.rs new file mode 100644 index 0000000..53003c7 --- /dev/null +++ b/src/parallel.rs @@ -0,0 +1,25 @@ +//! Utilities for parallelism + +/// Runs the two specified closures in parallel, +/// if and only if `parallel` is set to `true` and the `parallel` feature is enabled +#[allow(unused_variables)] // when `parallel` feature is disabled, the variable is ignored +pub fn run_maybe_in_parallel<A, B, RA, RB>(parallel: bool, oper_a: A, oper_b: B) -> (RA, RB) +where + A: FnOnce() -> RA + Send, + B: FnOnce() -> RB + Send, + RA: Send, + RB: Send, +{ + #[cfg(feature = "parallel")] + { + if parallel { + rayon::join(oper_a, oper_b) + } else { + (oper_a(), oper_b()) + } + } + #[cfg(not(feature = "parallel"))] + { + (oper_a(), oper_b()) + } +}