From c438cf10fd4dfbf585e334333f80cffa45dad92f Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 30 Nov 2025 14:04:08 +0000 Subject: [PATCH 01/19] proof-of-concept Rayon integration --- Cargo.toml | 1 + src/algorithms/dit.rs | 19 +++++++++++-------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 720d8aa..a005df1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,7 @@ multiversion = "0.8.0" num-complex = { version = "0.4.6", features = ["bytemuck"], optional = true } bytemuck = { version = "1.23.2", optional = true } wide = "0.8.1" +rayon = "1.11.0" [features] default = [] diff --git a/src/algorithms/dit.rs b/src/algorithms/dit.rs index c7733a3..5203ed2 100644 --- a/src/algorithms/dit.rs +++ b/src/algorithms/dit.rs @@ -35,7 +35,6 @@ const L1_BLOCK_SIZE: usize = 1024; fn recursive_dit_fft_f64( reals: &mut [f64], imags: &mut [f64], - offset: usize, size: usize, planner: &PlannerDit64, mut stage_twiddle_idx: usize, @@ -45,8 +44,8 @@ fn recursive_dit_fft_f64( if size <= L1_BLOCK_SIZE { for stage in 0..log_size { stage_twiddle_idx = execute_dit_stage_f64( - &mut reals[offset..offset + size], - &mut imags[offset..offset + size], + &mut reals[..size], + &mut imags[..size], stage, planner, stage_twiddle_idx, @@ -57,9 +56,13 @@ fn recursive_dit_fft_f64( let half = size / 2; let log_half = half.ilog2() as usize; + let (re_first_half, re_second_half) = reals.split_at_mut(half); + let (im_first_half, im_second_half) = imags.split_at_mut(half); // Recursively process both halves - recursive_dit_fft_f64(reals, imags, offset, half, planner, 0); - recursive_dit_fft_f64(reals, imags, offset + half, half, planner, 0); + rayon::join( + || recursive_dit_fft_f64(re_first_half, im_first_half, half, planner, 0), + || recursive_dit_fft_f64(re_second_half, im_second_half, half, planner, 0), + ); // Both halves completed stages 0..log_half-1 // Stages 0-5 use hardcoded twiddles, 6+ use planner @@ -68,8 +71,8 @@ fn recursive_dit_fft_f64( // Process remaining stages that span both halves for stage in log_half..log_size { stage_twiddle_idx = execute_dit_stage_f64( - &mut reals[offset..offset + size], - &mut imags[offset..offset + size], + &mut reals[..size], + &mut imags[..size], stage, planner, stage_twiddle_idx, @@ -252,7 +255,7 @@ pub fn fft_64_dit_with_planner_and_opts( } } - recursive_dit_fft_f64(reals, imags, 0, n, planner, 0); + recursive_dit_fft_f64(reals, imags, n, planner, 0); // Scaling for inverse transform if let Direction::Reverse = planner.direction { From 5109c8d5331aa5b5188968f288933fa5afd467a5 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 30 Nov 2025 14:34:18 +0000 Subject: [PATCH 02/19] Experimental cross-half parallelization; regressed benchmarks --- src/kernels/dit.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/kernels/dit.rs b/src/kernels/dit.rs index f95f194..42e99e2 100644 --- a/src/kernels/dit.rs +++ b/src/kernels/dit.rs @@ -913,9 +913,11 @@ pub fn fft_dit_64_chunk_n_simd( let two = f64x8::splat(2.0); assert!(chunk_size >= LANES * 2); + use rayon::prelude::*; + reals .chunks_exact_mut(chunk_size) - .zip(imags.chunks_exact_mut(chunk_size)) + .zip(imags.chunks_exact_mut(chunk_size)).par_bridge() .for_each(|(reals_chunk, imags_chunk)| { let (reals_s0, reals_s1) = reals_chunk.split_at_mut(dist); let (imags_s0, imags_s1) = imags_chunk.split_at_mut(dist); From ac1cf6ae0f8b08979c0d57a34ad24328b3d5af7c Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 30 Nov 2025 14:34:22 +0000 Subject: [PATCH 03/19] Revert "Experimental cross-half parallelization; regressed benchmarks" This reverts commit 5109c8d5331aa5b5188968f288933fa5afd467a5. --- src/kernels/dit.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/kernels/dit.rs b/src/kernels/dit.rs index 42e99e2..f95f194 100644 --- a/src/kernels/dit.rs +++ b/src/kernels/dit.rs @@ -913,11 +913,9 @@ pub fn fft_dit_64_chunk_n_simd( let two = f64x8::splat(2.0); assert!(chunk_size >= LANES * 2); - use rayon::prelude::*; - reals .chunks_exact_mut(chunk_size) - .zip(imags.chunks_exact_mut(chunk_size)).par_bridge() + .zip(imags.chunks_exact_mut(chunk_size)) .for_each(|(reals_chunk, imags_chunk)| { let (reals_s0, reals_s1) = reals_chunk.split_at_mut(dist); let (imags_s0, imags_s1) = imags_chunk.split_at_mut(dist); From 42cb42be546c521e2fd1b173b92740c8638b3bc4 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 30 Nov 2025 17:26:53 +0000 Subject: [PATCH 04/19] Use parallel kernel only for the largest sizes when crossing halves --- src/algorithms/dit.rs | 13 +++++++------ src/kernels/dit.rs | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 6 deletions(-) diff --git a/src/algorithms/dit.rs b/src/algorithms/dit.rs index 5203ed2..99cbcf4 100644 --- a/src/algorithms/dit.rs +++ b/src/algorithms/dit.rs @@ -16,11 +16,7 @@ //! use crate::algorithms::cobra::cobra_apply; use crate::kernels::dit::{ - fft_dit_32_chunk_n_simd, fft_dit_64_chunk_n_simd, fft_dit_chunk_16_simd_f32, - fft_dit_chunk_16_simd_f64, fft_dit_chunk_2, fft_dit_chunk_32_simd_f32, - fft_dit_chunk_32_simd_f64, fft_dit_chunk_4_simd_f32, fft_dit_chunk_4_simd_f64, - fft_dit_chunk_64_simd_f32, fft_dit_chunk_64_simd_f64, fft_dit_chunk_8_simd_f32, - fft_dit_chunk_8_simd_f64, + fft_dit_32_chunk_n_simd, fft_dit_64_chunk_n_simd, fft_dit_64_chunk_n_simd_parallel, fft_dit_chunk_2, fft_dit_chunk_4_simd_f32, fft_dit_chunk_4_simd_f64, fft_dit_chunk_8_simd_f32, fft_dit_chunk_8_simd_f64, fft_dit_chunk_16_simd_f32, fft_dit_chunk_16_simd_f64, fft_dit_chunk_32_simd_f32, fft_dit_chunk_32_simd_f64, fft_dit_chunk_64_simd_f32, fft_dit_chunk_64_simd_f64 }; use crate::options::Options; use crate::planner::{Direction, PlannerDit32, PlannerDit64}; @@ -159,11 +155,16 @@ fn execute_dit_stage_f64( } else if chunk_size == 64 { fft_dit_chunk_64_simd_f64(reals, imags); stage_twiddle_idx - } else { + } else if chunk_size <= 1048576 { // For larger chunks, use general kernel with twiddles from planner let (twiddles_re, twiddles_im) = &planner.stage_twiddles[stage_twiddle_idx]; fft_dit_64_chunk_n_simd(reals, imags, twiddles_re, twiddles_im, dist); stage_twiddle_idx + 1 + } else { + // For the very largest chunks, use a parallel version of the general kernel + let (twiddles_re, twiddles_im) = &planner.stage_twiddles[stage_twiddle_idx]; + fft_dit_64_chunk_n_simd_parallel(reals, imags, twiddles_re, twiddles_im, dist); + stage_twiddle_idx + 1 } } diff --git a/src/kernels/dit.rs b/src/kernels/dit.rs index 50c4703..24f8be0 100644 --- a/src/kernels/dit.rs +++ b/src/kernels/dit.rs @@ -921,6 +921,49 @@ pub fn fft_dit_64_chunk_n_simd( }); } +/// General DIT butterfly for f64 +#[multiversion::multiversion(targets( + "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", + "x86_64+avx2+fma", + "x86_64+sse4.2", + "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", + "x86+avx2+fma", + "x86+sse4.2", + "x86+sse2", +))] +#[inline] +pub fn fft_dit_64_chunk_n_simd_parallel( + reals: &mut [f64], + imags: &mut [f64], + twiddles_re: &[f64], + twiddles_im: &[f64], + dist: usize, +) { + const LANES: usize = 8; + let chunk_size = dist << 1; + assert!(chunk_size >= LANES * 2); + + use rayon::prelude::*; + + reals + .chunks_exact_mut(chunk_size) + .zip(imags.chunks_exact_mut(chunk_size)).par_bridge() + .for_each(|(reals_chunk, imags_chunk)| { + let (reals_s0, reals_s1) = reals_chunk.split_at_mut(dist); + let (imags_s0, imags_s1) = imags_chunk.split_at_mut(dist); + + (reals_s0.as_chunks_mut::().0.iter_mut()) + .zip(reals_s1.as_chunks_mut::().0.iter_mut()) + .zip(imags_s0.as_chunks_mut::().0.iter_mut()) + .zip(imags_s1.as_chunks_mut::().0.iter_mut()) + .zip(twiddles_re.as_chunks::().0.iter()) + .zip(twiddles_im.as_chunks::().0.iter()) + .for_each(|(((((re_s0, re_s1), im_s0), im_s1), tw_re), tw_im)| { + fft_dit_64_chunk_n_simd_kernel(re_s0, re_s1, im_s0, im_s1, tw_re, tw_im) + }); + }); +} + #[inline(always)] // for multiversioning to work fn fft_dit_64_chunk_n_simd_kernel( re_s0: &mut [f64; 8], From 1afda6aed6c4e985553782824dc6abbd1f68d2a2 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 30 Nov 2025 17:26:56 +0000 Subject: [PATCH 05/19] Revert "Use parallel kernel only for the largest sizes when crossing halves" This reverts commit 42cb42be546c521e2fd1b173b92740c8638b3bc4. --- src/algorithms/dit.rs | 13 ++++++------- src/kernels/dit.rs | 43 ------------------------------------------- 2 files changed, 6 insertions(+), 50 deletions(-) diff --git a/src/algorithms/dit.rs b/src/algorithms/dit.rs index 99cbcf4..5203ed2 100644 --- a/src/algorithms/dit.rs +++ b/src/algorithms/dit.rs @@ -16,7 +16,11 @@ //! use crate::algorithms::cobra::cobra_apply; use crate::kernels::dit::{ - fft_dit_32_chunk_n_simd, fft_dit_64_chunk_n_simd, fft_dit_64_chunk_n_simd_parallel, fft_dit_chunk_2, fft_dit_chunk_4_simd_f32, fft_dit_chunk_4_simd_f64, fft_dit_chunk_8_simd_f32, fft_dit_chunk_8_simd_f64, fft_dit_chunk_16_simd_f32, fft_dit_chunk_16_simd_f64, fft_dit_chunk_32_simd_f32, fft_dit_chunk_32_simd_f64, fft_dit_chunk_64_simd_f32, fft_dit_chunk_64_simd_f64 + fft_dit_32_chunk_n_simd, fft_dit_64_chunk_n_simd, fft_dit_chunk_16_simd_f32, + fft_dit_chunk_16_simd_f64, fft_dit_chunk_2, fft_dit_chunk_32_simd_f32, + fft_dit_chunk_32_simd_f64, fft_dit_chunk_4_simd_f32, fft_dit_chunk_4_simd_f64, + fft_dit_chunk_64_simd_f32, fft_dit_chunk_64_simd_f64, fft_dit_chunk_8_simd_f32, + fft_dit_chunk_8_simd_f64, }; use crate::options::Options; use crate::planner::{Direction, PlannerDit32, PlannerDit64}; @@ -155,16 +159,11 @@ fn execute_dit_stage_f64( } else if chunk_size == 64 { fft_dit_chunk_64_simd_f64(reals, imags); stage_twiddle_idx - } else if chunk_size <= 1048576 { + } else { // For larger chunks, use general kernel with twiddles from planner let (twiddles_re, twiddles_im) = &planner.stage_twiddles[stage_twiddle_idx]; fft_dit_64_chunk_n_simd(reals, imags, twiddles_re, twiddles_im, dist); stage_twiddle_idx + 1 - } else { - // For the very largest chunks, use a parallel version of the general kernel - let (twiddles_re, twiddles_im) = &planner.stage_twiddles[stage_twiddle_idx]; - fft_dit_64_chunk_n_simd_parallel(reals, imags, twiddles_re, twiddles_im, dist); - stage_twiddle_idx + 1 } } diff --git a/src/kernels/dit.rs b/src/kernels/dit.rs index 24f8be0..50c4703 100644 --- a/src/kernels/dit.rs +++ b/src/kernels/dit.rs @@ -921,49 +921,6 @@ pub fn fft_dit_64_chunk_n_simd( }); } -/// General DIT butterfly for f64 -#[multiversion::multiversion(targets( - "x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86_64+avx2+fma", - "x86_64+sse4.2", - "x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl+gfni", - "x86+avx2+fma", - "x86+sse4.2", - "x86+sse2", -))] -#[inline] -pub fn fft_dit_64_chunk_n_simd_parallel( - reals: &mut [f64], - imags: &mut [f64], - twiddles_re: &[f64], - twiddles_im: &[f64], - dist: usize, -) { - const LANES: usize = 8; - let chunk_size = dist << 1; - assert!(chunk_size >= LANES * 2); - - use rayon::prelude::*; - - reals - .chunks_exact_mut(chunk_size) - .zip(imags.chunks_exact_mut(chunk_size)).par_bridge() - .for_each(|(reals_chunk, imags_chunk)| { - let (reals_s0, reals_s1) = reals_chunk.split_at_mut(dist); - let (imags_s0, imags_s1) = imags_chunk.split_at_mut(dist); - - (reals_s0.as_chunks_mut::().0.iter_mut()) - .zip(reals_s1.as_chunks_mut::().0.iter_mut()) - .zip(imags_s0.as_chunks_mut::().0.iter_mut()) - .zip(imags_s1.as_chunks_mut::().0.iter_mut()) - .zip(twiddles_re.as_chunks::().0.iter()) - .zip(twiddles_im.as_chunks::().0.iter()) - .for_each(|(((((re_s0, re_s1), im_s0), im_s1), tw_re), tw_im)| { - fft_dit_64_chunk_n_simd_kernel(re_s0, re_s1, im_s0, im_s1, tw_re, tw_im) - }); - }); -} - #[inline(always)] // for multiversioning to work fn fft_dit_64_chunk_n_simd_kernel( re_s0: &mut [f64; 8], From 3e2717875697b151b6a8d985be92bf7e3c9b530d Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 30 Nov 2025 19:12:22 +0000 Subject: [PATCH 06/19] Use rayon for parallelizing COBRA, helps performance of mid-sized FFTs by eliminating the thread spawning and termination overhead --- src/algorithms/dif.rs | 16 ++++++++-------- src/algorithms/dit.rs | 16 ++++++++-------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/algorithms/dif.rs b/src/algorithms/dif.rs index 7b34736..2e75c87 100644 --- a/src/algorithms/dif.rs +++ b/src/algorithms/dif.rs @@ -119,10 +119,10 @@ pub fn fft_64_with_opts_and_plan( // Optional bit reversal (controlled by options) if opts.dif_perform_bit_reversal { if opts.multithreaded_bit_reversal { - std::thread::scope(|s| { - s.spawn(|| cobra_apply(reals, n)); - s.spawn(|| cobra_apply(imags, n)); - }); + rayon::join( + || cobra_apply(reals, n), + || cobra_apply(imags, n), + ); } else { cobra_apply(reals, n); cobra_apply(imags, n); @@ -226,10 +226,10 @@ pub fn fft_32_with_opts_and_plan( if opts.dif_perform_bit_reversal { if opts.multithreaded_bit_reversal { - std::thread::scope(|s| { - s.spawn(|| cobra_apply(reals, n)); - s.spawn(|| cobra_apply(imags, n)); - }); + rayon::join( + || cobra_apply(reals, n), + || cobra_apply(imags, n), + ); } else { cobra_apply(reals, n); cobra_apply(imags, n); diff --git a/src/algorithms/dit.rs b/src/algorithms/dit.rs index 5203ed2..38d8def 100644 --- a/src/algorithms/dit.rs +++ b/src/algorithms/dit.rs @@ -239,10 +239,10 @@ pub fn fft_64_dit_with_planner_and_opts( // DIT requires bit-reversed input if opts.multithreaded_bit_reversal { - std::thread::scope(|s| { - s.spawn(|| cobra_apply(reals, log_n)); - s.spawn(|| cobra_apply(imags, log_n)); - }); + rayon::join( + || cobra_apply(reals, log_n), + || cobra_apply(imags, log_n), + ); } else { cobra_apply(reals, log_n); cobra_apply(imags, log_n); @@ -286,10 +286,10 @@ pub fn fft_32_dit_with_planner_and_opts( // DIT requires bit-reversed input if opts.multithreaded_bit_reversal { - std::thread::scope(|s| { - s.spawn(|| cobra_apply(reals, log_n)); - s.spawn(|| cobra_apply(imags, log_n)); - }); + rayon::join( + || cobra_apply(reals, log_n), + || cobra_apply(imags, log_n), + ); } else { cobra_apply(reals, log_n); cobra_apply(imags, log_n); From 3e4c689583b661ba333aa049f0bceed1ac9f5752 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 30 Nov 2025 19:13:50 +0000 Subject: [PATCH 07/19] Dramatically lower COBRA multi-threading threshold now that thread spawning overhead is no longer a concern; benchmarks show improvement even at 15 on my machine but I'm being conservative for now, we'll need to auto-tune COBRA in the future because hardware varies so much --- src/options.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/options.rs b/src/options.rs index e3c9aab..92d9028 100644 --- a/src/options.rs +++ b/src/options.rs @@ -49,7 +49,7 @@ impl Options { pub fn guess_options(input_size: usize) -> Options { let mut options = Options::default(); let n: usize = input_size.ilog2() as usize; - options.multithreaded_bit_reversal = n >= 22; + options.multithreaded_bit_reversal = n >= 16; options } } From 05326718d2a2c3e20d79eb3bb39cbc11056fe15b Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sun, 30 Nov 2025 19:32:19 +0000 Subject: [PATCH 08/19] cargo fmt --- src/algorithms/dif.rs | 10 ++-------- src/algorithms/dit.rs | 10 ++-------- 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/src/algorithms/dif.rs b/src/algorithms/dif.rs index 2e75c87..e318ba7 100644 --- a/src/algorithms/dif.rs +++ b/src/algorithms/dif.rs @@ -119,10 +119,7 @@ pub fn fft_64_with_opts_and_plan( // Optional bit reversal (controlled by options) if opts.dif_perform_bit_reversal { if opts.multithreaded_bit_reversal { - rayon::join( - || cobra_apply(reals, n), - || cobra_apply(imags, n), - ); + rayon::join(|| cobra_apply(reals, n), || cobra_apply(imags, n)); } else { cobra_apply(reals, n); cobra_apply(imags, n); @@ -226,10 +223,7 @@ pub fn fft_32_with_opts_and_plan( if opts.dif_perform_bit_reversal { if opts.multithreaded_bit_reversal { - rayon::join( - || cobra_apply(reals, n), - || cobra_apply(imags, n), - ); + rayon::join(|| cobra_apply(reals, n), || cobra_apply(imags, n)); } else { cobra_apply(reals, n); cobra_apply(imags, n); diff --git a/src/algorithms/dit.rs b/src/algorithms/dit.rs index 38d8def..2717e69 100644 --- a/src/algorithms/dit.rs +++ b/src/algorithms/dit.rs @@ -239,10 +239,7 @@ pub fn fft_64_dit_with_planner_and_opts( // DIT requires bit-reversed input if opts.multithreaded_bit_reversal { - rayon::join( - || cobra_apply(reals, log_n), - || cobra_apply(imags, log_n), - ); + rayon::join(|| cobra_apply(reals, log_n), || cobra_apply(imags, log_n)); } else { cobra_apply(reals, log_n); cobra_apply(imags, log_n); @@ -286,10 +283,7 @@ pub fn fft_32_dit_with_planner_and_opts( // DIT requires bit-reversed input if opts.multithreaded_bit_reversal { - rayon::join( - || cobra_apply(reals, log_n), - || cobra_apply(imags, log_n), - ); + rayon::join(|| cobra_apply(reals, log_n), || cobra_apply(imags, log_n)); } else { cobra_apply(reals, log_n); cobra_apply(imags, log_n); From 535a457ebbc578357fa307bcd6d620881c79b080 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Mon, 1 Dec 2025 18:26:00 +0000 Subject: [PATCH 09/19] Add Rayon to the f32 codepath --- src/algorithms/dit.rs | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/src/algorithms/dit.rs b/src/algorithms/dit.rs index 2717e69..dfdfe89 100644 --- a/src/algorithms/dit.rs +++ b/src/algorithms/dit.rs @@ -87,7 +87,6 @@ fn recursive_dit_fft_f64( fn recursive_dit_fft_f32( reals: &mut [f32], imags: &mut [f32], - offset: usize, size: usize, planner: &PlannerDit32, mut stage_twiddle_idx: usize, @@ -97,8 +96,8 @@ fn recursive_dit_fft_f32( if size <= L1_BLOCK_SIZE { for stage in 0..log_size { stage_twiddle_idx = execute_dit_stage_f32( - &mut reals[offset..offset + size], - &mut imags[offset..offset + size], + &mut reals[..size], + &mut imags[..size], stage, planner, stage_twiddle_idx, @@ -109,15 +108,23 @@ fn recursive_dit_fft_f32( let half = size / 2; let log_half = half.ilog2() as usize; - recursive_dit_fft_f32(reals, imags, offset, half, planner, 0); - recursive_dit_fft_f32(reals, imags, offset + half, half, planner, 0); + let (re_first_half, re_second_half) = reals.split_at_mut(half); + let (im_first_half, im_second_half) = imags.split_at_mut(half); + // Recursively process both halves + rayon::join( + || recursive_dit_fft_f32(re_first_half, im_first_half, half, planner, 0), + || recursive_dit_fft_f32(re_second_half, im_second_half, half, planner, 0), + ); + // Both halves completed stages 0..log_half-1 + // Stages 0-5 use hardcoded twiddles, 6+ use planner stage_twiddle_idx = log_half.saturating_sub(6); + // Process remaining stages that span both halves for stage in log_half..log_size { stage_twiddle_idx = execute_dit_stage_f32( - &mut reals[offset..offset + size], - &mut imags[offset..offset + size], + &mut reals[..size], + &mut imags[..size], stage, planner, stage_twiddle_idx, @@ -296,7 +303,7 @@ pub fn fft_32_dit_with_planner_and_opts( } } - recursive_dit_fft_f32(reals, imags, 0, n, planner, 0); + recursive_dit_fft_f32(reals, imags, n, planner, 0); // Scaling for inverse transform if let Direction::Reverse = planner.direction { From c179bda1a82428828e08cae8be67e759155a3c26 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Mon, 1 Dec 2025 19:00:50 +0000 Subject: [PATCH 10/19] Make rayon dependency optional --- Cargo.toml | 3 ++- src/algorithms/dif.rs | 23 +++++++++++------------ src/algorithms/dit.rs | 29 +++++++++++++++-------------- src/lib.rs | 1 + 4 files changed, 29 insertions(+), 27 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index c1573e4..7f27f02 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,11 +18,12 @@ multiversion = "0.8.0" num-complex = { version = "0.4.6", features = ["bytemuck"], optional = true } bytemuck = { version = "1.23.2", optional = true } wide = "0.8.1" -rayon = "1.11.0" +rayon = { version = "1.11.0", optional = true } [features] default = [] complex-nums = ["dep:num-complex", "dep:bytemuck"] +parallel = ["dep:rayon"] [dev-dependencies] criterion = "0.8.0" diff --git a/src/algorithms/dif.rs b/src/algorithms/dif.rs index e318ba7..02bf924 100644 --- a/src/algorithms/dif.rs +++ b/src/algorithms/dif.rs @@ -16,6 +16,7 @@ use crate::algorithms::cobra::cobra_apply; use crate::kernels::common::{fft_chunk_2, fft_chunk_4}; use crate::kernels::dif::{fft_32_chunk_n_simd, fft_64_chunk_n_simd, fft_chunk_n}; use crate::options::Options; +use crate::parallel::parallel_join; use crate::planner::{Direction, Planner32, Planner64}; use crate::twiddles::filter_twiddles; @@ -118,12 +119,11 @@ pub fn fft_64_with_opts_and_plan( // Optional bit reversal (controlled by options) if opts.dif_perform_bit_reversal { - if opts.multithreaded_bit_reversal { - rayon::join(|| cobra_apply(reals, n), || cobra_apply(imags, n)); - } else { - cobra_apply(reals, n); - cobra_apply(imags, n); - } + parallel_join( + opts.multithreaded_bit_reversal, + || cobra_apply(reals, n), + || cobra_apply(imags, n), + ); } // Scaling for inverse transform @@ -222,12 +222,11 @@ pub fn fft_32_with_opts_and_plan( } if opts.dif_perform_bit_reversal { - if opts.multithreaded_bit_reversal { - rayon::join(|| cobra_apply(reals, n), || cobra_apply(imags, n)); - } else { - cobra_apply(reals, n); - cobra_apply(imags, n); - } + parallel_join( + opts.multithreaded_bit_reversal, + || cobra_apply(reals, n), + || cobra_apply(imags, n), + ); } // Scaling for inverse transform diff --git a/src/algorithms/dit.rs b/src/algorithms/dit.rs index dfdfe89..3b9efa3 100644 --- a/src/algorithms/dit.rs +++ b/src/algorithms/dit.rs @@ -23,6 +23,7 @@ use crate::kernels::dit::{ fft_dit_chunk_8_simd_f64, }; use crate::options::Options; +use crate::parallel::parallel_join; use crate::planner::{Direction, PlannerDit32, PlannerDit64}; /// L1 cache block size in complex elements (8KB for f32, 16KB for f64) @@ -59,7 +60,8 @@ fn recursive_dit_fft_f64( let (re_first_half, re_second_half) = reals.split_at_mut(half); let (im_first_half, im_second_half) = imags.split_at_mut(half); // Recursively process both halves - rayon::join( + parallel_join( + true, || recursive_dit_fft_f64(re_first_half, im_first_half, half, planner, 0), || recursive_dit_fft_f64(re_second_half, im_second_half, half, planner, 0), ); @@ -111,7 +113,8 @@ fn recursive_dit_fft_f32( let (re_first_half, re_second_half) = reals.split_at_mut(half); let (im_first_half, im_second_half) = imags.split_at_mut(half); // Recursively process both halves - rayon::join( + parallel_join( + true, || recursive_dit_fft_f32(re_first_half, im_first_half, half, planner, 0), || recursive_dit_fft_f32(re_second_half, im_second_half, half, planner, 0), ); @@ -245,12 +248,11 @@ pub fn fft_64_dit_with_planner_and_opts( assert_eq!(log_n, planner.log_n); // DIT requires bit-reversed input - if opts.multithreaded_bit_reversal { - rayon::join(|| cobra_apply(reals, log_n), || cobra_apply(imags, log_n)); - } else { - cobra_apply(reals, log_n); - cobra_apply(imags, log_n); - } + parallel_join( + opts.multithreaded_bit_reversal, + || cobra_apply(reals, log_n), + || cobra_apply(imags, log_n), + ); // Handle inverse FFT if let Direction::Reverse = planner.direction { @@ -289,12 +291,11 @@ pub fn fft_32_dit_with_planner_and_opts( assert_eq!(log_n, planner.log_n); // DIT requires bit-reversed input - if opts.multithreaded_bit_reversal { - rayon::join(|| cobra_apply(reals, log_n), || cobra_apply(imags, log_n)); - } else { - cobra_apply(reals, log_n); - cobra_apply(imags, log_n); - } + parallel_join( + opts.multithreaded_bit_reversal, + || cobra_apply(reals, log_n), + || cobra_apply(imags, log_n), + ); // Handle inverse FFT if let Direction::Reverse = planner.direction { diff --git a/src/lib.rs b/src/lib.rs index 984ab70..ccab469 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,6 +20,7 @@ use crate::utils::{combine_re_im, deinterleave_complex32, deinterleave_complex64 mod algorithms; mod kernels; pub mod options; +mod parallel; pub mod planner; mod twiddles; mod utils; From ddbcb27bdb5cf55b8957e934346366a8c53f4963 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Mon, 1 Dec 2025 19:01:29 +0000 Subject: [PATCH 11/19] Rename parallel_join to something less technical --- src/algorithms/dif.rs | 6 +++--- src/algorithms/dit.rs | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/algorithms/dif.rs b/src/algorithms/dif.rs index 02bf924..31d1d0a 100644 --- a/src/algorithms/dif.rs +++ b/src/algorithms/dif.rs @@ -16,7 +16,7 @@ use crate::algorithms::cobra::cobra_apply; use crate::kernels::common::{fft_chunk_2, fft_chunk_4}; use crate::kernels::dif::{fft_32_chunk_n_simd, fft_64_chunk_n_simd, fft_chunk_n}; use crate::options::Options; -use crate::parallel::parallel_join; +use crate::parallel::run_maybe_in_parallel; use crate::planner::{Direction, Planner32, Planner64}; use crate::twiddles::filter_twiddles; @@ -119,7 +119,7 @@ pub fn fft_64_with_opts_and_plan( // Optional bit reversal (controlled by options) if opts.dif_perform_bit_reversal { - parallel_join( + run_maybe_in_parallel( opts.multithreaded_bit_reversal, || cobra_apply(reals, n), || cobra_apply(imags, n), @@ -222,7 +222,7 @@ pub fn fft_32_with_opts_and_plan( } if opts.dif_perform_bit_reversal { - parallel_join( + run_maybe_in_parallel( opts.multithreaded_bit_reversal, || cobra_apply(reals, n), || cobra_apply(imags, n), diff --git a/src/algorithms/dit.rs b/src/algorithms/dit.rs index 3b9efa3..9af5dbf 100644 --- a/src/algorithms/dit.rs +++ b/src/algorithms/dit.rs @@ -23,7 +23,7 @@ use crate::kernels::dit::{ fft_dit_chunk_8_simd_f64, }; use crate::options::Options; -use crate::parallel::parallel_join; +use crate::parallel::run_maybe_in_parallel; use crate::planner::{Direction, PlannerDit32, PlannerDit64}; /// L1 cache block size in complex elements (8KB for f32, 16KB for f64) @@ -60,7 +60,7 @@ fn recursive_dit_fft_f64( let (re_first_half, re_second_half) = reals.split_at_mut(half); let (im_first_half, im_second_half) = imags.split_at_mut(half); // Recursively process both halves - parallel_join( + run_maybe_in_parallel( true, || recursive_dit_fft_f64(re_first_half, im_first_half, half, planner, 0), || recursive_dit_fft_f64(re_second_half, im_second_half, half, planner, 0), @@ -113,7 +113,7 @@ fn recursive_dit_fft_f32( let (re_first_half, re_second_half) = reals.split_at_mut(half); let (im_first_half, im_second_half) = imags.split_at_mut(half); // Recursively process both halves - parallel_join( + run_maybe_in_parallel( true, || recursive_dit_fft_f32(re_first_half, im_first_half, half, planner, 0), || recursive_dit_fft_f32(re_second_half, im_second_half, half, planner, 0), @@ -248,7 +248,7 @@ pub fn fft_64_dit_with_planner_and_opts( assert_eq!(log_n, planner.log_n); // DIT requires bit-reversed input - parallel_join( + run_maybe_in_parallel( opts.multithreaded_bit_reversal, || cobra_apply(reals, log_n), || cobra_apply(imags, log_n), @@ -291,7 +291,7 @@ pub fn fft_32_dit_with_planner_and_opts( assert_eq!(log_n, planner.log_n); // DIT requires bit-reversed input - parallel_join( + run_maybe_in_parallel( opts.multithreaded_bit_reversal, || cobra_apply(reals, log_n), || cobra_apply(imags, log_n), From e5bdf135d4e0f2583bc0d4d557042fd993974664 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Mon, 1 Dec 2025 19:06:35 +0000 Subject: [PATCH 12/19] Run tests both with and without all the features --- .github/workflows/rust.yml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 3b58978..31ead9c 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -45,9 +45,14 @@ jobs: command: fmt args: --all -- --check - combo: - name: Test +combo: + name: Test ${{ matrix.args }} runs-on: ubuntu-latest + strategy: + fail-fast: false # ensures if one fails, the other keeps running + matrix: + # This creates two jobs: one with the flag, one with an empty string + args: ["--all-features", ""] steps: - name: Checkout sources uses: actions/checkout@v2 @@ -63,7 +68,7 @@ jobs: uses: actions-rs/cargo@v1 with: command: test - args: --all-features + args: ${{ matrix.args }} coverage: runs-on: ubuntu-latest From 94fcf910b3beefd23e58d956bc7a92cd94015631 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Mon, 1 Dec 2025 19:19:33 +0000 Subject: [PATCH 13/19] Commit a file that I forgot to git add --- src/parallel.rs | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 src/parallel.rs diff --git a/src/parallel.rs b/src/parallel.rs new file mode 100644 index 0000000..53003c7 --- /dev/null +++ b/src/parallel.rs @@ -0,0 +1,25 @@ +//! Utilities for parallelism + +/// Runs the two specified closures in parallel, +/// if and only if `parallel` is set to `true` and the `parallel` feature is enabled +#[allow(unused_variables)] // when `parallel` feature is disabled, the variable is ignored +pub fn run_maybe_in_parallel(parallel: bool, oper_a: A, oper_b: B) -> (RA, RB) +where + A: FnOnce() -> RA + Send, + B: FnOnce() -> RB + Send, + RA: Send, + RB: Send, +{ + #[cfg(feature = "parallel")] + { + if parallel { + rayon::join(oper_a, oper_b) + } else { + (oper_a(), oper_b()) + } + } + #[cfg(not(feature = "parallel"))] + { + (oper_a(), oper_b()) + } +} From a9165c7766e3dc84d9d5852bc51510d9e6a09a50 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Mon, 1 Dec 2025 19:20:38 +0000 Subject: [PATCH 14/19] test with complex-nums feature but without parallelism so that we don't have to stop testing README snippets that Rust treats as doctest --- .github/workflows/rust.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 31ead9c..ea79d5d 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -52,7 +52,7 @@ combo: fail-fast: false # ensures if one fails, the other keeps running matrix: # This creates two jobs: one with the flag, one with an empty string - args: ["--all-features", ""] + args: ["--all-features", "--features=complex-nums"] steps: - name: Checkout sources uses: actions/checkout@v2 From 65f9ff29b2941e04295a6be0ddc6fb9829265b88 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Mon, 1 Dec 2025 19:25:39 +0000 Subject: [PATCH 15/19] Fix YAML syntax --- .github/workflows/rust.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index ea79d5d..0ae02ac 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -45,7 +45,7 @@ jobs: command: fmt args: --all -- --check -combo: + combo: name: Test ${{ matrix.args }} runs-on: ubuntu-latest strategy: From d703fe3a21b09f13d34e5d214afc6fe7c0d3e983 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Tue, 2 Dec 2025 10:20:05 +0000 Subject: [PATCH 16/19] Try using chili as the parallelization backend instead of rayon. Benchmarks show that it regresses small sizes far less, but not enough to break even with the single-threaded implementation; but also regresses large sizes a lot. So it loses out to both rayon and single-threaded depending on the size and doesn't seem to be worth it. --- Cargo.toml | 1 + src/parallel.rs | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 7f27f02..b244ada 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,6 +19,7 @@ num-complex = { version = "0.4.6", features = ["bytemuck"], optional = true } bytemuck = { version = "1.23.2", optional = true } wide = "0.8.1" rayon = { version = "1.11.0", optional = true } +chili = "0.2.1" [features] default = [] diff --git a/src/parallel.rs b/src/parallel.rs index 53003c7..eb8b7e2 100644 --- a/src/parallel.rs +++ b/src/parallel.rs @@ -13,7 +13,7 @@ where #[cfg(feature = "parallel")] { if parallel { - rayon::join(oper_a, oper_b) + chili::Scope::global().join(|_| oper_a(), |_| oper_b()) } else { (oper_a(), oper_b()) } From 186e434f74f22b7b396f4a582f6dabffa3ae60a1 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Tue, 2 Dec 2025 10:24:57 +0000 Subject: [PATCH 17/19] Revert "Try using chili as the parallelization backend instead of rayon. Benchmarks show that it regresses small sizes far less, but not enough to break even with the single-threaded implementation; but also regresses large sizes a lot. So it loses out to both rayon and single-threaded depending on the size and doesn't seem to be worth it." This reverts commit d703fe3a21b09f13d34e5d214afc6fe7c0d3e983. --- Cargo.toml | 1 - src/parallel.rs | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index b244ada..7f27f02 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,6 @@ num-complex = { version = "0.4.6", features = ["bytemuck"], optional = true } bytemuck = { version = "1.23.2", optional = true } wide = "0.8.1" rayon = { version = "1.11.0", optional = true } -chili = "0.2.1" [features] default = [] diff --git a/src/parallel.rs b/src/parallel.rs index eb8b7e2..53003c7 100644 --- a/src/parallel.rs +++ b/src/parallel.rs @@ -13,7 +13,7 @@ where #[cfg(feature = "parallel")] { if parallel { - chili::Scope::global().join(|_| oper_a(), |_| oper_b()) + rayon::join(oper_a, oper_b) } else { (oper_a(), oper_b()) } From 5939d88a8f5c3e92e776fd31f304e180df5446a8 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Tue, 2 Dec 2025 11:07:16 +0000 Subject: [PATCH 18/19] Add a tunable for the smallest chunk size beyond which the input will not be split further --- src/algorithms/dit.rs | 18 ++++++++++-------- src/options.rs | 7 +++++++ 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/algorithms/dit.rs b/src/algorithms/dit.rs index 9af5dbf..fa85e96 100644 --- a/src/algorithms/dit.rs +++ b/src/algorithms/dit.rs @@ -38,6 +38,7 @@ fn recursive_dit_fft_f64( imags: &mut [f64], size: usize, planner: &PlannerDit64, + opts: &Options, mut stage_twiddle_idx: usize, ) -> usize { let log_size = size.ilog2() as usize; @@ -61,9 +62,9 @@ fn recursive_dit_fft_f64( let (im_first_half, im_second_half) = imags.split_at_mut(half); // Recursively process both halves run_maybe_in_parallel( - true, - || recursive_dit_fft_f64(re_first_half, im_first_half, half, planner, 0), - || recursive_dit_fft_f64(re_second_half, im_second_half, half, planner, 0), + size > opts.smallest_parallel_chunk_size, + || recursive_dit_fft_f64(re_first_half, im_first_half, half, planner, opts, 0), + || recursive_dit_fft_f64(re_second_half, im_second_half, half, planner, opts, 0), ); // Both halves completed stages 0..log_half-1 @@ -91,6 +92,7 @@ fn recursive_dit_fft_f32( imags: &mut [f32], size: usize, planner: &PlannerDit32, + opts: &Options, mut stage_twiddle_idx: usize, ) -> usize { let log_size = size.ilog2() as usize; @@ -114,9 +116,9 @@ fn recursive_dit_fft_f32( let (im_first_half, im_second_half) = imags.split_at_mut(half); // Recursively process both halves run_maybe_in_parallel( - true, - || recursive_dit_fft_f32(re_first_half, im_first_half, half, planner, 0), - || recursive_dit_fft_f32(re_second_half, im_second_half, half, planner, 0), + size > opts.smallest_parallel_chunk_size, + || recursive_dit_fft_f32(re_first_half, im_first_half, half, planner, opts, 0), + || recursive_dit_fft_f32(re_second_half, im_second_half, half, planner, opts, 0), ); // Both halves completed stages 0..log_half-1 @@ -261,7 +263,7 @@ pub fn fft_64_dit_with_planner_and_opts( } } - recursive_dit_fft_f64(reals, imags, n, planner, 0); + recursive_dit_fft_f64(reals, imags, n, planner, opts, 0); // Scaling for inverse transform if let Direction::Reverse = planner.direction { @@ -304,7 +306,7 @@ pub fn fft_32_dit_with_planner_and_opts( } } - recursive_dit_fft_f32(reals, imags, n, planner, 0); + recursive_dit_fft_f32(reals, imags, n, planner, opts, 0); // Scaling for inverse transform if let Direction::Reverse = planner.direction { diff --git a/src/options.rs b/src/options.rs index 92d9028..8bfabb6 100644 --- a/src/options.rs +++ b/src/options.rs @@ -33,6 +33,11 @@ pub struct Options { /// fft_64_with_opts_and_plan(&mut reals, &mut imags, &opts, &planner); /// ``` pub dif_perform_bit_reversal: bool, + + /// Do not split the input any further to run in parallel below this size + /// + /// Set to `usize::MAX` to disable parallelism in the recursive FFT step. + pub smallest_parallel_chunk_size: usize, } impl Default for Options { @@ -40,6 +45,7 @@ impl Default for Options { Self { multithreaded_bit_reversal: false, dif_perform_bit_reversal: true, // Default to standard FFT behavior + smallest_parallel_chunk_size: usize::MAX, } } } @@ -50,6 +56,7 @@ impl Options { let mut options = Options::default(); let n: usize = input_size.ilog2() as usize; options.multithreaded_bit_reversal = n >= 16; + options.smallest_parallel_chunk_size = 16384; options } } From 2d4130e2cf7736cf300cdca8fd009402a05f0fe6 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Tue, 2 Dec 2025 11:22:15 +0000 Subject: [PATCH 19/19] Update doc comments for rayon --- src/options.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/options.rs b/src/options.rs index 8bfabb6..d0b8aab 100644 --- a/src/options.rs +++ b/src/options.rs @@ -9,8 +9,10 @@ #[derive(Debug, Clone)] pub struct Options { /// Whether to run the bit reversal step in 2 threads instead of one. - /// This is beneficial only at large input sizes (i.e. gigabytes of data). + /// This is beneficial only at medium to large sizes (i.e. megabytes of data). /// The exact threshold where it starts being beneficial varies depending on the hardware. + /// + /// This option is ignored if the `parallel` feature is disabled. pub multithreaded_bit_reversal: bool, /// Controls bit reversal behavior for DIF FFT algorithms. @@ -37,6 +39,8 @@ pub struct Options { /// Do not split the input any further to run in parallel below this size /// /// Set to `usize::MAX` to disable parallelism in the recursive FFT step. + /// + /// This option is ignored if the `parallel` feature is disabled. pub smallest_parallel_chunk_size: usize, }