Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
c438cf1
proof-of-concept Rayon integration
Shnatsel Nov 30, 2025
5109c8d
Experimental cross-half parallelization; regressed benchmarks
Shnatsel Nov 30, 2025
ac1cf6a
Revert "Experimental cross-half parallelization; regressed benchmarks"
Shnatsel Nov 30, 2025
e96c83c
Merge branch 'as-chunks' into rayon-as-chunks
Shnatsel Nov 30, 2025
42cb42b
Use parallel kernel only for the largest sizes when crossing halves
Shnatsel Nov 30, 2025
1afda6a
Revert "Use parallel kernel only for the largest sizes when crossing …
Shnatsel Nov 30, 2025
2322ec7
Merge branch 'main' into rayon-as-chunks
Shnatsel Nov 30, 2025
3e27178
Use rayon for parallelizing COBRA, helps performance of mid-sized FFT…
Shnatsel Nov 30, 2025
3e4c689
Dramatically lower COBRA multi-threading threshold now that thread sp…
Shnatsel Nov 30, 2025
df82993
Merge branch 'main' into rayon
Shnatsel Nov 30, 2025
0532671
cargo fmt
Shnatsel Nov 30, 2025
53a09a4
Merge branch 'main' into rayon
Shnatsel Nov 30, 2025
f840323
Merge branch 'main' into rayon
Shnatsel Nov 30, 2025
535a457
Add Rayon to the f32 codepath
Shnatsel Dec 1, 2025
c179bda
Make rayon dependency optional
Shnatsel Dec 1, 2025
ddbcb27
Rename parallel_join to something less technical
Shnatsel Dec 1, 2025
e5bdf13
Run tests both with and without all the features
Shnatsel Dec 1, 2025
94fcf91
Commit a file that I forgot to git add
Shnatsel Dec 1, 2025
a9165c7
test with complex-nums feature but without parallelism so that we don…
Shnatsel Dec 1, 2025
65f9ff2
Fix YAML syntax
Shnatsel Dec 1, 2025
d703fe3
Try using chili as the parallelization backend instead of rayon. Benc…
Shnatsel Dec 2, 2025
186e434
Revert "Try using chili as the parallelization backend instead of ray…
Shnatsel Dec 2, 2025
5939d88
Add a tunable for the smallest chunk size beyond which the input will…
Shnatsel Dec 2, 2025
2d4130e
Update doc comments for rayon
Shnatsel Dec 2, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,13 @@ jobs:
args: --all -- --check

combo:
name: Test
name: Test ${{ matrix.args }}
runs-on: ubuntu-latest
strategy:
fail-fast: false # ensures if one fails, the other keeps running
matrix:
# This creates two jobs: one with all features enabled, one with only `complex-nums` (i.e. without `parallel`)
args: ["--all-features", "--features=complex-nums"]
steps:
- name: Checkout sources
uses: actions/checkout@v2
Expand All @@ -63,7 +68,7 @@ jobs:
uses: actions-rs/cargo@v1
with:
command: test
args: --all-features
args: ${{ matrix.args }}

coverage:
runs-on: ubuntu-latest
Expand Down
2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,12 @@ multiversion = "0.8.0"
num-complex = { version = "0.4.6", features = ["bytemuck"], optional = true }
bytemuck = { version = "1.23.2", optional = true }
wide = "0.8.1"
rayon = { version = "1.11.0", optional = true }

[features]
default = []
complex-nums = ["dep:num-complex", "dep:bytemuck"]
parallel = ["dep:rayon"]

[dev-dependencies]
criterion = "0.8.0"
Expand Down
29 changes: 11 additions & 18 deletions src/algorithms/dif.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ use crate::algorithms::cobra::cobra_apply;
use crate::kernels::common::{fft_chunk_2, fft_chunk_4};
use crate::kernels::dif::{fft_32_chunk_n_simd, fft_64_chunk_n_simd, fft_chunk_n};
use crate::options::Options;
use crate::parallel::run_maybe_in_parallel;
use crate::planner::{Direction, Planner32, Planner64};
use crate::twiddles::filter_twiddles;

Expand Down Expand Up @@ -118,15 +119,11 @@ pub fn fft_64_with_opts_and_plan(

// Optional bit reversal (controlled by options)
if opts.dif_perform_bit_reversal {
if opts.multithreaded_bit_reversal {
std::thread::scope(|s| {
s.spawn(|| cobra_apply(reals, n));
s.spawn(|| cobra_apply(imags, n));
});
} else {
cobra_apply(reals, n);
cobra_apply(imags, n);
}
run_maybe_in_parallel(
opts.multithreaded_bit_reversal,
|| cobra_apply(reals, n),
|| cobra_apply(imags, n),
);
}

// Scaling for inverse transform
Expand Down Expand Up @@ -225,15 +222,11 @@ pub fn fft_32_with_opts_and_plan(
}

if opts.dif_perform_bit_reversal {
if opts.multithreaded_bit_reversal {
std::thread::scope(|s| {
s.spawn(|| cobra_apply(reals, n));
s.spawn(|| cobra_apply(imags, n));
});
} else {
cobra_apply(reals, n);
cobra_apply(imags, n);
}
run_maybe_in_parallel(
opts.multithreaded_bit_reversal,
|| cobra_apply(reals, n),
|| cobra_apply(imags, n),
);
}

// Scaling for inverse transform
Expand Down
75 changes: 41 additions & 34 deletions src/algorithms/dit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ use crate::kernels::dit::{
fft_dit_chunk_8_simd_f64,
};
use crate::options::Options;
use crate::parallel::run_maybe_in_parallel;
use crate::planner::{Direction, PlannerDit32, PlannerDit64};

/// L1 cache block size in complex elements (8KB for f32, 16KB for f64)
Expand All @@ -35,18 +36,18 @@ const L1_BLOCK_SIZE: usize = 1024;
fn recursive_dit_fft_f64(
reals: &mut [f64],
imags: &mut [f64],
offset: usize,
size: usize,
planner: &PlannerDit64,
opts: &Options,
mut stage_twiddle_idx: usize,
) -> usize {
let log_size = size.ilog2() as usize;

if size <= L1_BLOCK_SIZE {
for stage in 0..log_size {
stage_twiddle_idx = execute_dit_stage_f64(
&mut reals[offset..offset + size],
&mut imags[offset..offset + size],
&mut reals[..size],
&mut imags[..size],
stage,
planner,
stage_twiddle_idx,
Expand All @@ -57,9 +58,14 @@ fn recursive_dit_fft_f64(
let half = size / 2;
let log_half = half.ilog2() as usize;

let (re_first_half, re_second_half) = reals.split_at_mut(half);
let (im_first_half, im_second_half) = imags.split_at_mut(half);
// Recursively process both halves
recursive_dit_fft_f64(reals, imags, offset, half, planner, 0);
recursive_dit_fft_f64(reals, imags, offset + half, half, planner, 0);
run_maybe_in_parallel(
size > opts.smallest_parallel_chunk_size,
|| recursive_dit_fft_f64(re_first_half, im_first_half, half, planner, opts, 0),
|| recursive_dit_fft_f64(re_second_half, im_second_half, half, planner, opts, 0),
);

// Both halves completed stages 0..log_half-1
// Stages 0-5 use hardcoded twiddles, 6+ use planner
Expand All @@ -68,8 +74,8 @@ fn recursive_dit_fft_f64(
// Process remaining stages that span both halves
for stage in log_half..log_size {
stage_twiddle_idx = execute_dit_stage_f64(
&mut reals[offset..offset + size],
&mut imags[offset..offset + size],
&mut reals[..size],
&mut imags[..size],
stage,
planner,
stage_twiddle_idx,
Expand All @@ -84,18 +90,18 @@ fn recursive_dit_fft_f64(
fn recursive_dit_fft_f32(
reals: &mut [f32],
imags: &mut [f32],
offset: usize,
size: usize,
planner: &PlannerDit32,
opts: &Options,
mut stage_twiddle_idx: usize,
) -> usize {
let log_size = size.ilog2() as usize;

if size <= L1_BLOCK_SIZE {
for stage in 0..log_size {
stage_twiddle_idx = execute_dit_stage_f32(
&mut reals[offset..offset + size],
&mut imags[offset..offset + size],
&mut reals[..size],
&mut imags[..size],
stage,
planner,
stage_twiddle_idx,
Expand All @@ -106,15 +112,24 @@ fn recursive_dit_fft_f32(
let half = size / 2;
let log_half = half.ilog2() as usize;

recursive_dit_fft_f32(reals, imags, offset, half, planner, 0);
recursive_dit_fft_f32(reals, imags, offset + half, half, planner, 0);
let (re_first_half, re_second_half) = reals.split_at_mut(half);
let (im_first_half, im_second_half) = imags.split_at_mut(half);
// Recursively process both halves
run_maybe_in_parallel(
size > opts.smallest_parallel_chunk_size,
|| recursive_dit_fft_f32(re_first_half, im_first_half, half, planner, opts, 0),
|| recursive_dit_fft_f32(re_second_half, im_second_half, half, planner, opts, 0),
);

// Both halves completed stages 0..log_half-1
// Stages 0-5 use hardcoded twiddles, 6+ use planner
stage_twiddle_idx = log_half.saturating_sub(6);

// Process remaining stages that span both halves
for stage in log_half..log_size {
stage_twiddle_idx = execute_dit_stage_f32(
&mut reals[offset..offset + size],
&mut imags[offset..offset + size],
&mut reals[..size],
&mut imags[..size],
stage,
planner,
stage_twiddle_idx,
Expand Down Expand Up @@ -235,15 +250,11 @@ pub fn fft_64_dit_with_planner_and_opts(
assert_eq!(log_n, planner.log_n);

// DIT requires bit-reversed input
if opts.multithreaded_bit_reversal {
std::thread::scope(|s| {
s.spawn(|| cobra_apply(reals, log_n));
s.spawn(|| cobra_apply(imags, log_n));
});
} else {
cobra_apply(reals, log_n);
cobra_apply(imags, log_n);
}
run_maybe_in_parallel(
opts.multithreaded_bit_reversal,
|| cobra_apply(reals, log_n),
|| cobra_apply(imags, log_n),
);

// Handle inverse FFT
if let Direction::Reverse = planner.direction {
Expand All @@ -252,7 +263,7 @@ pub fn fft_64_dit_with_planner_and_opts(
}
}

recursive_dit_fft_f64(reals, imags, 0, n, planner, 0);
recursive_dit_fft_f64(reals, imags, n, planner, opts, 0);

// Scaling for inverse transform
if let Direction::Reverse = planner.direction {
Expand Down Expand Up @@ -282,15 +293,11 @@ pub fn fft_32_dit_with_planner_and_opts(
assert_eq!(log_n, planner.log_n);

// DIT requires bit-reversed input
if opts.multithreaded_bit_reversal {
std::thread::scope(|s| {
s.spawn(|| cobra_apply(reals, log_n));
s.spawn(|| cobra_apply(imags, log_n));
});
} else {
cobra_apply(reals, log_n);
cobra_apply(imags, log_n);
}
run_maybe_in_parallel(
opts.multithreaded_bit_reversal,
|| cobra_apply(reals, log_n),
|| cobra_apply(imags, log_n),
);

// Handle inverse FFT
if let Direction::Reverse = planner.direction {
Expand All @@ -299,7 +306,7 @@ pub fn fft_32_dit_with_planner_and_opts(
}
}

recursive_dit_fft_f32(reals, imags, 0, n, planner, 0);
recursive_dit_fft_f32(reals, imags, n, planner, opts, 0);

// Scaling for inverse transform
if let Direction::Reverse = planner.direction {
Expand Down
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ use crate::utils::{combine_re_im, deinterleave_complex32, deinterleave_complex64
mod algorithms;
mod kernels;
pub mod options;
mod parallel;
pub mod planner;
mod twiddles;
mod utils;
Expand Down
15 changes: 13 additions & 2 deletions src/options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@
#[derive(Debug, Clone)]
pub struct Options {
/// Whether to run the bit reversal step in 2 threads instead of one.
/// This is beneficial only at large input sizes (i.e. gigabytes of data).
/// This is beneficial only at medium to large sizes (i.e. megabytes of data).
/// The exact threshold where it starts being beneficial varies depending on the hardware.
///
/// This option is ignored if the `parallel` feature is disabled.
pub multithreaded_bit_reversal: bool,

/// Controls bit reversal behavior for DIF FFT algorithms.
Expand All @@ -33,13 +35,21 @@ pub struct Options {
/// fft_64_with_opts_and_plan(&mut reals, &mut imags, &opts, &planner);
/// ```
pub dif_perform_bit_reversal: bool,

/// Chunks of this size or smaller are not split any further to be processed in parallel.
///
/// Set to `usize::MAX` to disable parallelism in the recursive FFT step.
///
/// This option is ignored if the `parallel` feature is disabled.
pub smallest_parallel_chunk_size: usize,
}

impl Default for Options {
fn default() -> Self {
Self {
multithreaded_bit_reversal: false,
dif_perform_bit_reversal: true, // Default to standard FFT behavior
smallest_parallel_chunk_size: usize::MAX,
}
}
}
Expand All @@ -49,7 +59,8 @@ impl Options {
pub fn guess_options(input_size: usize) -> Options {
let mut options = Options::default();
let n: usize = input_size.ilog2() as usize;
options.multithreaded_bit_reversal = n >= 22;
options.multithreaded_bit_reversal = n >= 16;
options.smallest_parallel_chunk_size = 16384;
options
}
}
25 changes: 25 additions & 0 deletions src/parallel.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
//! Utilities for parallelism

/// Executes `oper_a` and `oper_b` and returns both results as a tuple.
///
/// The closures are handed to `rayon::join` only when `parallel` is `true`
/// and the crate is built with the `parallel` feature. In every other case
/// they are simply called one after the other on the current thread,
/// `oper_a` first.
#[allow(unused_variables)] // `parallel` is never read when the `parallel` feature is off
pub fn run_maybe_in_parallel<A, B, RA, RB>(parallel: bool, oper_a: A, oper_b: B) -> (RA, RB)
where
    A: FnOnce() -> RA + Send,
    B: FnOnce() -> RB + Send,
    RA: Send,
    RB: Send,
{
    // Parallel path; only compiled in when the `parallel` feature is enabled.
    #[cfg(feature = "parallel")]
    {
        if parallel {
            return rayon::join(oper_a, oper_b);
        }
    }

    // Sequential path: feature disabled, or parallelism not requested.
    let result_a = oper_a();
    let result_b = oper_b();
    (result_a, result_b)
}
Loading