@@ -23,6 +23,7 @@ use crate::kernels::dit::{
2323 fft_dit_chunk_8_simd_f64,
2424} ;
2525use crate :: options:: Options ;
26+ use crate :: parallel:: run_maybe_in_parallel;
2627use crate :: planner:: { Direction , PlannerDit32 , PlannerDit64 } ;
2728
2829/// L1 cache block size in complex elements (8KB for f32, 16KB for f64)
@@ -35,18 +36,18 @@ const L1_BLOCK_SIZE: usize = 1024;
3536fn recursive_dit_fft_f64 (
3637 reals : & mut [ f64 ] ,
3738 imags : & mut [ f64 ] ,
38- offset : usize ,
3939 size : usize ,
4040 planner : & PlannerDit64 ,
41+ opts : & Options ,
4142 mut stage_twiddle_idx : usize ,
4243) -> usize {
4344 let log_size = size. ilog2 ( ) as usize ;
4445
4546 if size <= L1_BLOCK_SIZE {
4647 for stage in 0 ..log_size {
4748 stage_twiddle_idx = execute_dit_stage_f64 (
48- & mut reals[ offset..offset + size] ,
49- & mut imags[ offset..offset + size] ,
49+ & mut reals[ .. size] ,
50+ & mut imags[ .. size] ,
5051 stage,
5152 planner,
5253 stage_twiddle_idx,
@@ -57,9 +58,14 @@ fn recursive_dit_fft_f64(
5758 let half = size / 2 ;
5859 let log_half = half. ilog2 ( ) as usize ;
5960
61+ let ( re_first_half, re_second_half) = reals. split_at_mut ( half) ;
62+ let ( im_first_half, im_second_half) = imags. split_at_mut ( half) ;
6063 // Recursively process both halves
61- recursive_dit_fft_f64 ( reals, imags, offset, half, planner, 0 ) ;
62- recursive_dit_fft_f64 ( reals, imags, offset + half, half, planner, 0 ) ;
64+ run_maybe_in_parallel (
65+ size > opts. smallest_parallel_chunk_size ,
66+ || recursive_dit_fft_f64 ( re_first_half, im_first_half, half, planner, opts, 0 ) ,
67+ || recursive_dit_fft_f64 ( re_second_half, im_second_half, half, planner, opts, 0 ) ,
68+ ) ;
6369
6470 // Both halves completed stages 0..log_half-1
6571 // Stages 0-5 use hardcoded twiddles, 6+ use planner
@@ -68,8 +74,8 @@ fn recursive_dit_fft_f64(
6874 // Process remaining stages that span both halves
6975 for stage in log_half..log_size {
7076 stage_twiddle_idx = execute_dit_stage_f64 (
71- & mut reals[ offset..offset + size] ,
72- & mut imags[ offset..offset + size] ,
77+ & mut reals[ .. size] ,
78+ & mut imags[ .. size] ,
7379 stage,
7480 planner,
7581 stage_twiddle_idx,
@@ -84,18 +90,18 @@ fn recursive_dit_fft_f64(
8490fn recursive_dit_fft_f32 (
8591 reals : & mut [ f32 ] ,
8692 imags : & mut [ f32 ] ,
87- offset : usize ,
8893 size : usize ,
8994 planner : & PlannerDit32 ,
95+ opts : & Options ,
9096 mut stage_twiddle_idx : usize ,
9197) -> usize {
9298 let log_size = size. ilog2 ( ) as usize ;
9399
94100 if size <= L1_BLOCK_SIZE {
95101 for stage in 0 ..log_size {
96102 stage_twiddle_idx = execute_dit_stage_f32 (
97- & mut reals[ offset..offset + size] ,
98- & mut imags[ offset..offset + size] ,
103+ & mut reals[ .. size] ,
104+ & mut imags[ .. size] ,
99105 stage,
100106 planner,
101107 stage_twiddle_idx,
@@ -106,15 +112,24 @@ fn recursive_dit_fft_f32(
106112 let half = size / 2 ;
107113 let log_half = half. ilog2 ( ) as usize ;
108114
109- recursive_dit_fft_f32 ( reals, imags, offset, half, planner, 0 ) ;
110- recursive_dit_fft_f32 ( reals, imags, offset + half, half, planner, 0 ) ;
115+ let ( re_first_half, re_second_half) = reals. split_at_mut ( half) ;
116+ let ( im_first_half, im_second_half) = imags. split_at_mut ( half) ;
117+ // Recursively process both halves
118+ run_maybe_in_parallel (
119+ size > opts. smallest_parallel_chunk_size ,
120+ || recursive_dit_fft_f32 ( re_first_half, im_first_half, half, planner, opts, 0 ) ,
121+ || recursive_dit_fft_f32 ( re_second_half, im_second_half, half, planner, opts, 0 ) ,
122+ ) ;
111123
124+ // Both halves completed stages 0..log_half-1
125+ // Stages 0-5 use hardcoded twiddles, 6+ use planner
112126 stage_twiddle_idx = log_half. saturating_sub ( 6 ) ;
113127
128+ // Process remaining stages that span both halves
114129 for stage in log_half..log_size {
115130 stage_twiddle_idx = execute_dit_stage_f32 (
116- & mut reals[ offset..offset + size] ,
117- & mut imags[ offset..offset + size] ,
131+ & mut reals[ .. size] ,
132+ & mut imags[ .. size] ,
118133 stage,
119134 planner,
120135 stage_twiddle_idx,
@@ -235,15 +250,11 @@ pub fn fft_64_dit_with_planner_and_opts(
235250 assert_eq ! ( log_n, planner. log_n) ;
236251
237252 // DIT requires bit-reversed input
238- if opts. multithreaded_bit_reversal {
239- std:: thread:: scope ( |s| {
240- s. spawn ( || cobra_apply ( reals, log_n) ) ;
241- s. spawn ( || cobra_apply ( imags, log_n) ) ;
242- } ) ;
243- } else {
244- cobra_apply ( reals, log_n) ;
245- cobra_apply ( imags, log_n) ;
246- }
253+ run_maybe_in_parallel (
254+ opts. multithreaded_bit_reversal ,
255+ || cobra_apply ( reals, log_n) ,
256+ || cobra_apply ( imags, log_n) ,
257+ ) ;
247258
248259 // Handle inverse FFT
249260 if let Direction :: Reverse = planner. direction {
@@ -252,7 +263,7 @@ pub fn fft_64_dit_with_planner_and_opts(
252263 }
253264 }
254265
255- recursive_dit_fft_f64 ( reals, imags, 0 , n, planner, 0 ) ;
266+ recursive_dit_fft_f64 ( reals, imags, n, planner, opts , 0 ) ;
256267
257268 // Scaling for inverse transform
258269 if let Direction :: Reverse = planner. direction {
@@ -282,15 +293,11 @@ pub fn fft_32_dit_with_planner_and_opts(
282293 assert_eq ! ( log_n, planner. log_n) ;
283294
284295 // DIT requires bit-reversed input
285- if opts. multithreaded_bit_reversal {
286- std:: thread:: scope ( |s| {
287- s. spawn ( || cobra_apply ( reals, log_n) ) ;
288- s. spawn ( || cobra_apply ( imags, log_n) ) ;
289- } ) ;
290- } else {
291- cobra_apply ( reals, log_n) ;
292- cobra_apply ( imags, log_n) ;
293- }
296+ run_maybe_in_parallel (
297+ opts. multithreaded_bit_reversal ,
298+ || cobra_apply ( reals, log_n) ,
299+ || cobra_apply ( imags, log_n) ,
300+ ) ;
294301
295302 // Handle inverse FFT
296303 if let Direction :: Reverse = planner. direction {
@@ -299,7 +306,7 @@ pub fn fft_32_dit_with_planner_and_opts(
299306 }
300307 }
301308
302- recursive_dit_fft_f32 ( reals, imags, 0 , n, planner, 0 ) ;
309+ recursive_dit_fft_f32 ( reals, imags, n, planner, opts , 0 ) ;
303310
304311 // Scaling for inverse transform
305312 if let Direction :: Reverse = planner. direction {
0 commit comments