From 491e6906194fddd24038ec887b07ad75b0ab8f96 Mon Sep 17 00:00:00 2001
From: Marco Barbone
Date: Wed, 15 Oct 2025 23:05:16 -0400
Subject: [PATCH] Using templates to force unrolling and removing branches
 from hot loops

---
 include/finufft_common/utils.h | 102 ++++++++++++++++++++++++++++++++-
 src/spreadinterp.cpp           |  78 ++++++++++++++-----------
 2 files changed, 145 insertions(+), 35 deletions(-)

diff --git a/include/finufft_common/utils.h b/include/finufft_common/utils.h
index 2d03786d4..59cd56259 100644
--- a/include/finufft_common/utils.h
+++ b/include/finufft_common/utils.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include
+#include <cstdint>
 #include
 #include
 #include
@@ -100,7 +101,7 @@ template<typename Tuple> auto extract_vals(const Tuple &t) {
 }
 
 template<typename Tuple, std::size_t... Is>
-auto extract_seqs_impl(const Tuple &t, std::index_sequence<Is...>) {
+auto extract_seqs_impl(const Tuple &, std::index_sequence<Is...>) {
   using T = std::remove_reference_t<Tuple>;
   return std::make_tuple(typename std::tuple_element_t<Is, T>::seq_type{}...);
 }
@@ -145,5 +146,104 @@ decltype(auto) dispatch(Func &&func, ParamTuple &&params, Args &&...args) {
   if constexpr (!std::is_void_v) return caller.result;
 }
 
+// ----------------------------------------------
+// compile-time range length for [Start, Stop) with step Inc
+// ----------------------------------------------
+template<std::int64_t Start, std::int64_t Stop, std::int64_t Inc>
+inline constexpr std::int64_t compute_range_count = [] {
+  static_assert(Inc != 0, "Inc must not be zero");
+  if constexpr (Inc > 0) {
+    const std::int64_t d = Stop - Start;
+    return d > 0 ? ((d + Inc - 1) / Inc) : 0;
+  } else {
+    const std::int64_t d = Start - Stop;
+    const std::int64_t s = -Inc;
+    return d > 0 ? ((d + s - 1) / s) : 0;
+  }
+}();
+
+// ----------------------------------------------
+// low-level emitters (C++17, no templated lambdas)
+// ----------------------------------------------
+template<std::int64_t Start, std::int64_t Inc, class F, std::size_t... Is>
+constexpr void static_loop_impl_block(F &&f, std::index_sequence<Is...>) {
+  (f(std::integral_constant<std::int64_t, Start + static_cast<std::int64_t>(Is) * Inc>{}),
+   ...);
+}
+
+template<std::int64_t Start, std::int64_t Inc, std::int64_t K, class F, std::size_t... Bs>
+constexpr void static_loop_emit_all_blocks(F &&f, std::index_sequence<Bs...>) {
+  static_assert(K > 0, "UNROLL K must be positive");
+  (static_loop_impl_block<Start + static_cast<std::int64_t>(Bs) * K * Inc, Inc>(
+       f, std::make_index_sequence<static_cast<std::size_t>(K)>{}),
+   ...);
+}
+
+// ----------------------------------------------
+// static_loop
+// ----------------------------------------------
+template<std::int64_t Start, std::int64_t Stop, std::int64_t Inc,
+         std::int64_t UNROLL = compute_range_count<Start, Stop, Inc>, class F>
+constexpr void static_loop(F &&f) {
+  static_assert(Inc != 0, "Inc must not be zero");
+
+  constexpr std::int64_t Count = compute_range_count<Start, Stop, Inc>;
+  if constexpr (Count == 0) {
+    // do nothing
+  } else {
+    constexpr std::int64_t k = (UNROLL > 0 ? UNROLL : Count);
+    static_assert(k > 0, "internal: k must be positive");
+    constexpr std::int64_t Blocks = Count / k;
+    constexpr std::int64_t Tail   = Count % k;
+
+    if constexpr (Blocks > 0) {
+      static_loop_emit_all_blocks<Start, Inc, k>(
+          std::forward<F>(f),
+          std::make_index_sequence<static_cast<std::size_t>(Blocks)>{});
+    }
+    if constexpr (Tail > 0) {
+      constexpr std::int64_t TailStart = Start + Blocks * k * Inc;
+      static_loop_impl_block<TailStart, Inc>(
+          std::forward<F>(f), std::make_index_sequence<static_cast<std::size_t>(Tail)>{});
+    }
+  }
+}
+
+// convenience: Stop only => Start=0, Inc=1
+template<std::int64_t Stop, class F> constexpr void static_loop(F &&f) {
+  static_loop<0, Stop, 1, compute_range_count<0, Stop, 1>>(std::forward<F>(f));
+}
+
+// ----------------------------------------------
+// static_for wrappers expecting f.template operator()<I>()
+// stays within C++17 by adapting to the integral_constant form above
+// ----------------------------------------------
+namespace detail {
+template<class F> struct as_template_index {
+  F *pf;
+  constexpr explicit as_template_index(F &f) : pf(&f) {}
+  template<std::int64_t I>
+  constexpr void operator()(std::integral_constant<std::int64_t, I>) const {
+    pf->template operator()<I>();
+  }
+};
+template<auto V>
+using is_integral_cx = std::integral_constant<bool, std::is_integral_v<decltype(V)>>;
+} // namespace detail
+
+template<auto Start, auto End, class F> constexpr void static_for(F &&f) {
+  static_assert(detail::is_integral_cx<Start>::value &&
+                    detail::is_integral_cx<End>::value,
+                "Start/End must be integral constant expressions");
+  static_assert(End >= Start, "End must be >= Start");
+
+  constexpr auto S = static_cast<std::int64_t>(Start);
+  constexpr auto E = static_cast<std::int64_t>(End);
+  static_loop<S, E, 1>(detail::as_template_index{f});
+}
+
+template<auto End, class F> constexpr void static_for(F &&f) {
+  static_for<0, End>(std::forward<F>(f));
+}
 } // namespace common
 } // namespace finufft
diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp
index 59c9d8d34..e45e71262 100644
--- a/src/spreadinterp.cpp
+++ b/src/spreadinterp.cpp
@@ -95,8 +95,10 @@ template<class T, uint8_t N> constexpr auto GetPaddedSIMDWidth() {
   // that minimizes the number of iterations
   return xsimd::make_sized_batch<T, find_optimal_simd_width<T, N>()>::type::size;
 }
+
 template<class T, uint8_t N>
 using PaddedSIMD = typename xsimd::make_sized_batch<T, GetPaddedSIMDWidth<T, N>()>::type;
+
 template<class T, uint8_t ns> constexpr auto get_padding() {
   // helper function to get the padding for the given number of elements
   // ns is known at compile time, rounds ns to the next multiple of the SIMD width
@@ -130,8 +132,10 @@ template<class T> uint8_t get_padding(uint8_t ns) {
   // that's why is hardcoded here
   return get_padding_helper<T, MAX_NSPREAD>(ns);
 }
+
 template<class T>
 using BestSIMD = typename decltype(BestSIMDHelper<T, xsimd::batch<T>::size>())::type;
+
 template<typename T, typename V, std::size_t... Is>
 constexpr T generate_sequence_impl(V a, V b, index_sequence<Is...>) noexcept {
   // utility function to generate a sequence of a, b interleaved as function arguments
@@ -278,43 +282,46 @@ static void evaluate_kernel_vector(T *ker, T *args,
 }
 
 template<uint8_t w, uint16_t upsampfact, class T,
-         class simd_type = xsimd::make_sized_batch_t<T, GetPaddedSIMDWidth<T, w>()>> // aka ns
+         class simd_type = xsimd::make_sized_batch_t<T, GetPaddedSIMDWidth<T, w>()>>
 static FINUFFT_ALWAYS_INLINE void eval_kernel_vec_Horner(T *FINUFFT_RESTRICT ker, T x,
                                                          const finufft_spread_opts &opts
-                                                         [[maybe_unused]]) noexcept
-/* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at
-x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns.
-This is the current evaluation method, since it's faster (except i7 w=16).
-Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */
+                                                         [[maybe_unused]]) noexcept {
+  /* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at
+  x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns.
+  This is the current evaluation method, since it's faster (except i7 w=16).
+  Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */
+  // scale so local grid offset z in [-1,1]
+  const T z = std::fma(T(2.0), x, T(w - 1));
+  using arch_t = typename simd_type::arch_type;
+  static constexpr auto alignment = arch_t::alignment();
+  static constexpr auto simd_size = simd_type::size;
+  static constexpr auto padded_ns = (w + simd_size - 1) & ~(simd_size - 1);
-{
-  // scale so local grid offset z in[-1,1]
-  const T z = std::fma(T(2.0), x, T(w - 1));
-  using arch_t = typename simd_type::arch_type;
-  static constexpr auto alignment = arch_t::alignment();
-  static constexpr auto simd_size = simd_type::size;
-  static constexpr auto padded_ns = (w + simd_size - 1) & ~(simd_size - 1);
   static constexpr auto horner_coeffs = []() constexpr noexcept {
     if constexpr (upsampfact == 200) {
       return get_horner_coeffs_200<T, w>();
-    } else if constexpr (upsampfact == 125) {
+    } else { // upsampfact == 125
+      // this way we can static_assert if an unsupported upsampfact is requested
+      static_assert(upsampfact == 125, "Unsupported upsampfact");
       return get_horner_coeffs_125<T, w>();
     }
   }();
-  static constexpr auto nc          = horner_coeffs.size();
-  static constexpr auto use_ker_sym = (simd_size < w);
+  static constexpr auto nc = horner_coeffs.size();
 
   alignas(alignment) static constexpr auto padded_coeffs =
       pad_2D_array_with_zeros<T, nc, w, padded_ns>(horner_coeffs);
 
   // use kernel symmetry trick if w > simd_size
+  static constexpr bool use_ker_sym = (simd_size < w);
+
+  const simd_type zv{z};
+
   if constexpr (use_ker_sym) {
     static constexpr uint8_t tail          = w % simd_size;
     static constexpr uint8_t if_odd_degree = ((nc + 1) % 2);
-    static constexpr uint8_t offset_start  = tail ? w - tail : w - simd_size;
     static constexpr uint8_t end_idx       = (w + (tail > 0)) / 2;
-    const simd_type zv{z};
+    static constexpr uint8_t offset_start  = tail ? (w - tail) : (w - simd_size);
+
     const auto z2v = zv * zv;
 
     // some xsimd constant for shuffle or inverse
@@ -328,11 +335,11 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */
       }
     }();
 
-    // process simd vecs
     simd_type k_prev, k_sym{0};
-    for (uint8_t i{0}, offset = offset_start; i < end_idx;
-         i += simd_size, offset -= simd_size) {
-      auto k_odd = [i]() constexpr noexcept {
+
+    static_loop<0, end_idx, simd_size>([&]([[maybe_unused]] const auto i) {
+      constexpr auto offset = static_cast<uint8_t>(offset_start - i);
+      auto k_odd = [i]() constexpr noexcept {
        if constexpr (if_odd_degree) {
          return simd_type::load_aligned(padded_coeffs[0].data() + i);
        } else {
@@ -340,18 +347,20 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */
        }
      }();
      auto k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i);
-      for (uint8_t j{1 + if_odd_degree}; j < nc; j += 2) {
+
+      static_loop<1 + if_odd_degree, static_cast<std::int64_t>(nc), 2>([&](const auto j) {
        const auto cji_odd  = simd_type::load_aligned(padded_coeffs[j].data() + i);
        const auto cji_even = simd_type::load_aligned(padded_coeffs[j + 1].data() + i);
        k_odd               = xsimd::fma(k_odd, z2v, cji_odd);
        k_even              = xsimd::fma(k_even, z2v, cji_even);
-      }
+      });
+
      // left part
      xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i);
+
      // right part symmetric to the left part
-      if (offset >= end_idx) {
+      if constexpr (offset >= end_idx) {
        if constexpr (tail) {
-          // to use aligned store, we need shuffle the previous k_sym and current k_sym
          k_prev = k_sym;
          k_sym  = xsimd::fnma(k_odd, zv, k_even);
          xsimd::shuffle(k_sym, k_prev, shuffle_batch).store_aligned(ker + offset);
@@ -360,20 +369,21 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */
              .store_aligned(ker + offset);
        }
      }
-    }
+    });
+
   } else {
-    const simd_type zv(z);
-    for (uint8_t i = 0; i < w; i += simd_size) {
+    static_loop<0, w, simd_size>([&](const auto i) {
      auto k = simd_type::load_aligned(padded_coeffs[0].data() + i);
-      for (uint8_t j = 1; j < nc; ++j) {
+
+      static_loop<1, static_cast<std::int64_t>(nc), 1>([&](const auto j) {
        const auto cji = simd_type::load_aligned(padded_coeffs[j].data() + i);
        k = xsimd::fma(k, zv, cji);
-      }
+      });
+
      k.store_aligned(ker + i);
-    }
+    });
   }
 }
-
 template<typename T>
 static void interp_line_wrap(T *FINUFFT_RESTRICT target, const T *du, const T *ker,
                              const BIGINT i1, const UBIGINT N1) {
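
Illustration (not part of the patch): the sketch below shows, in a self-contained form, the idea the patch applies in eval_kernel_vec_Horner: when the trip count is a compile-time constant, folding the loop body over an index sequence emits one copy of the body per index, so the runtime loop counter and its exit branch disappear and each index is available as a constant expression. The helper names static_loop_impl and sum_first_w are made up for this example, and the simplified static_loop here (unit stride, no UNROLL blocking, no static_for adapter) is not the version added in include/finufft_common/utils.h above.

// Minimal sketch of a unit-stride compile-time loop (illustrative only).
#include <cstdint>
#include <cstdio>
#include <utility>

template <std::int64_t Start, class F, std::int64_t... Is>
constexpr void static_loop_impl(F &&f, std::integer_sequence<std::int64_t, Is...>) {
  // one call per index; each call receives its index as an integral_constant,
  // so the callee can use it in constant expressions (array bounds, if constexpr, ...)
  (f(std::integral_constant<std::int64_t, Start + Is>{}), ...);
}

template <std::int64_t Start, std::int64_t Stop, class F>
constexpr void static_loop(F &&f) {
  static_assert(Stop >= Start, "empty or reversed range");
  static_loop_impl<Start>(std::forward<F>(f),
                          std::make_integer_sequence<std::int64_t, Stop - Start>{});
}

// Example use: accumulate w values with the trip count fixed at compile time.
template <int w> double sum_first_w(const double *v) {
  double acc = 0;
  static_loop<0, w>([&](auto i) { acc += v[i]; }); // i is an integral_constant
  return acc;
}

int main() {
  const double v[4] = {1.0, 2.0, 3.0, 4.0};
  std::printf("%g\n", sum_first_w<4>(v)); // prints 10
  return 0;
}

Instantiating sum_first_w<4> expands the lambda four times with i = 0..3. Passing the index as an integral_constant rather than a runtime value is what lets the patched kernel loop compute "constexpr auto offset" and replace the old runtime "if (offset >= end_idx)" with "if constexpr".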