102 changes: 101 additions & 1 deletion include/finufft_common/utils.h
@@ -1,6 +1,7 @@
#pragma once

#include <array>
#include <cstdint>
#include <tuple>
#include <type_traits>
#include <utility>
@@ -100,7 +101,7 @@ template<typename Tuple> auto extract_vals(const Tuple &t) {
}

template<typename Tuple, std::size_t... I>
auto extract_seqs_impl(const Tuple &t, std::index_sequence<I...>) {
auto extract_seqs_impl(const Tuple &, std::index_sequence<I...>) {
using T = std::remove_reference_t<Tuple>;
return std::make_tuple(typename std::tuple_element_t<I, T>::seq_type{}...);
}
@@ -145,5 +146,104 @@ decltype(auto) dispatch(Func &&func, ParamTuple &&params, Args &&...args) {
if constexpr (!std::is_void_v<result_t>) return caller.result;
}

// ----------------------------------------------
// compile-time range length for [Start, Stop) with step Inc
// ----------------------------------------------
template<std::int64_t Start, std::int64_t Stop, std::int64_t Inc>
inline constexpr std::int64_t compute_range_count = [] {
static_assert(Inc != 0, "Inc must not be zero");
if constexpr (Inc > 0) {
const std::int64_t d = Stop - Start;
return d > 0 ? ((d + Inc - 1) / Inc) : 0;
} else {
const std::int64_t d = Start - Stop;
const std::int64_t s = -Inc;
return d > 0 ? ((d + s - 1) / s) : 0;
}
}();
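
// Illustrative checks (not part of the PR): the count follows Python-style
// range(Start, Stop, Inc) semantics, including negative steps and empty ranges.
static_assert(compute_range_count<0, 8, 1> == 8);  // 0..7
static_assert(compute_range_count<0, 8, 3> == 3);  // 0, 3, 6
static_assert(compute_range_count<5, 5, 1> == 0);  // empty range
static_assert(compute_range_count<8, 0, -2> == 4); // 8, 6, 4, 2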

// ----------------------------------------------
// low-level emitters (C++17, no templated lambdas)
// ----------------------------------------------
template<std::int64_t Start, std::int64_t Inc, class F, std::size_t... Is>
constexpr void static_loop_impl_block(F &&f, std::index_sequence<Is...>) {
(f(std::integral_constant<std::int64_t, Start + static_cast<std::int64_t>(Is) * Inc>{}),
...);
}

template<std::int64_t Start, std::int64_t Inc, std::int64_t K, class F, std::size_t... Bs>
constexpr void static_loop_emit_all_blocks(F &&f, std::index_sequence<Bs...>) {
static_assert(K > 0, "UNROLL K must be positive");
(static_loop_impl_block<Start + static_cast<std::int64_t>(Bs) * K * Inc, Inc>(
f, std::make_index_sequence<static_cast<std::size_t>(K)>{}),
...);
}
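
// Illustration (hypothetical instantiation): with Start=0, Inc=1, K=4 and two
// blocks, static_loop_emit_all_blocks expands to
//   f(0_c) f(1_c) f(2_c) f(3_c)   // block 0
//   f(4_c) f(5_c) f(6_c) f(7_c)   // block 1
// where N_c denotes std::integral_constant<std::int64_t, N>{}; each block is a
// fully unrolled pack expansion, so the compiler sees straight-line calls.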

// ----------------------------------------------
// static_loop
// ----------------------------------------------
template<std::int64_t Start, std::int64_t Stop, std::int64_t Inc = 1,
std::int64_t UNROLL = compute_range_count<Start, Stop, Inc>, class F>
constexpr void static_loop(F &&f) {
static_assert(Inc != 0, "Inc must not be zero");

constexpr std::int64_t Count = compute_range_count<Start, Stop, Inc>;
if constexpr (Count == 0) {
// do nothing
} else {
constexpr std::int64_t k = (UNROLL > 0 ? UNROLL : Count);
static_assert(k > 0, "internal: k must be positive");
constexpr std::int64_t Blocks = Count / k;
constexpr std::int64_t Tail = Count % k;

if constexpr (Blocks > 0) {
static_loop_emit_all_blocks<Start, Inc, k>(
std::forward<F>(f),
std::make_index_sequence<static_cast<std::size_t>(Blocks)>{});
}
if constexpr (Tail > 0) {
constexpr std::int64_t TailStart = Start + Blocks * k * Inc;
static_loop_impl_block<TailStart, Inc>(
std::forward<F>(f), std::make_index_sequence<static_cast<std::size_t>(Tail)>{});
}
}
}

// convenience: Stop only => Start=0, Inc=1
template<std::int64_t Stop, class F> constexpr void static_loop(F &&f) {
static_loop<0, Stop, 1, compute_range_count<0, Stop, 1>>(std::forward<F>(f));
}
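
// Usage sketch (hypothetical call site, assuming a runtime buffer `out`): the
// functor receives the index as a std::integral_constant, which converts
// implicitly to std::int64_t yet remains usable as a constant expression.
// With UNROLL = 4, the 10 iterations compile to two fully unrolled blocks of
// four plus a tail of two.
inline void fill_squares(std::int64_t *out) {
  static_loop<0, 10, 1, 4>([&](const auto i) {
    out[i] = static_cast<std::int64_t>(i) * static_cast<std::int64_t>(i);
  });
}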

// ----------------------------------------------
// static_for wrappers expecting f.template operator()<I>()
// keeps C++17 by adapting to integral_constant form above
// ----------------------------------------------
namespace detail {
template<class F> struct as_template_index {
F *pf;
constexpr explicit as_template_index(F &f) : pf(&f) {}
template<std::int64_t I>
constexpr void operator()(std::integral_constant<std::int64_t, I>) const {
pf->template operator()<I>();
}
};
template<class T>
using is_integral_cx = std::integral_constant<bool, std::is_integral_v<T>>;
} // namespace detail

template<auto Start, auto End, class F> constexpr void static_for(F &&f) {
static_assert(detail::is_integral_cx<decltype(Start)>::value &&
detail::is_integral_cx<decltype(End)>::value,
"Start/End must be integral constant expressions");
static_assert(End >= Start, "End must be >= Start");

constexpr auto S = static_cast<std::int64_t>(Start);
constexpr auto E = static_cast<std::int64_t>(End);
static_loop<S, E, 1>(detail::as_template_index<F>{f});
}

template<auto End, class F> constexpr void static_for(F &&f) {
static_for<0, End>(std::forward<F>(f));
}
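
// Usage sketch (hypothetical): static_for expects f.template operator()<I>(),
// so in C++17 the callable is a struct with a templated call operator rather
// than a templated lambda (which would require C++20).
struct StoreIndex {
  std::int64_t *out;
  template<std::int64_t I> void operator()() const { out[I] = I; }
};
// expands to StoreIndex{buf}.operator()<0>() ... operator()<3>()
inline void store_indices_demo(std::int64_t *buf) {
  static_for<0, 4>(StoreIndex{buf});
}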
} // namespace common
} // namespace finufft
78 changes: 44 additions & 34 deletions src/spreadinterp.cpp
@@ -71,7 +71,7 @@
} else {
return N;
}
};

Check warning on line 74 in src/spreadinterp.cpp (GitHub Actions / cmake-ci, ubuntu-22.04, gcc-10): extra ‘;’ [-Wpedantic]

template<class T, uint8_t N> constexpr auto find_optimal_simd_width() {
// finds the smallest simd width that minimizes the number of iterations
@@ -95,8 +95,10 @@
// that minimizes the number of iterations
return xsimd::make_sized_batch<T, find_optimal_simd_width<T, N>()>::type::size;
}

template<class T, uint8_t N>
using PaddedSIMD = typename xsimd::make_sized_batch<T, GetPaddedSIMDWidth<T, N>()>::type;

template<class T, uint8_t ns> constexpr auto get_padding() {
// helper function to get the padding for the given number of elements
// ns is known at compile time, rounds ns to the next multiple of the SIMD width
@@ -130,8 +132,10 @@
// that's why it is hardcoded here
return get_padding_helper<T, 2 * MAX_NSPREAD>(ns);
}

template<class T, uint8_t N>
using BestSIMD = typename decltype(BestSIMDHelper<T, N, xsimd::batch<T>::size>())::type;

template<class T, class V, size_t... Is>
constexpr T generate_sequence_impl(V a, V b, index_sequence<Is...>) noexcept {
// utility function to generate a sequence of a, b interleaved as function arguments
@@ -278,43 +282,46 @@
}

template<typename T, uint8_t w, uint8_t upsampfact,
class simd_type =
xsimd::make_sized_batch_t<T, find_optimal_simd_width<T, w>()>> // aka ns
class simd_type = xsimd::make_sized_batch_t<T, find_optimal_simd_width<T, w>()>>
static FINUFFT_ALWAYS_INLINE void eval_kernel_vec_Horner(T *FINUFFT_RESTRICT ker, T x,
const finufft_spread_opts &opts
[[maybe_unused]]) noexcept
/* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at
x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns.
This is the current evaluation method, since it's faster (except i7 w=16).
Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */
[[maybe_unused]]) noexcept {
/* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at
x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns.
This is the current evaluation method, since it's faster (except i7 w=16).
Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */
// scale so local grid offset z in [-1,1]
const T z = std::fma(T(2.0), x, T(w - 1));
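// (illustrative check: x in [-w/2, -w/2+1] gives z = 2x + w - 1 in [-1, 1])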
using arch_t = typename simd_type::arch_type;
static constexpr auto alignment = arch_t::alignment();
static constexpr auto simd_size = simd_type::size;
static constexpr auto padded_ns = (w + simd_size - 1) & ~(simd_size - 1);

{
// scale so local grid offset z in[-1,1]
const T z = std::fma(T(2.0), x, T(w - 1));
using arch_t = typename simd_type::arch_type;
static constexpr auto alignment = arch_t::alignment();
static constexpr auto simd_size = simd_type::size;
static constexpr auto padded_ns = (w + simd_size - 1) & ~(simd_size - 1);
static constexpr auto horner_coeffs = []() constexpr noexcept {
if constexpr (upsampfact == 200) {
return get_horner_coeffs_200<T, w>();
} else if constexpr (upsampfact == 125) {
} else { // upsampfact == 125
// this way we can have a static_assert that fires if things went wrong
static_assert(upsampfact == 125, "Unsupported upsampfact");
return get_horner_coeffs_125<T, w>();
}
}();
static constexpr auto nc = horner_coeffs.size();
static constexpr auto use_ker_sym = (simd_size < w);
static constexpr auto nc = horner_coeffs.size();

alignas(alignment) static constexpr auto padded_coeffs =
pad_2D_array_with_zeros<T, nc, w, padded_ns>(horner_coeffs);

// use kernel symmetry trick if w > simd_size
static constexpr bool use_ker_sym = (simd_size < w);

const simd_type zv{z};

if constexpr (use_ker_sym) {
static constexpr uint8_t tail = w % simd_size;
static constexpr uint8_t if_odd_degree = ((nc + 1) % 2);
static constexpr uint8_t offset_start = tail ? w - tail : w - simd_size;
static constexpr uint8_t end_idx = (w + (tail > 0)) / 2;
const simd_type zv{z};
static constexpr uint8_t offset_start = tail ? (w - tail) : (w - simd_size);

const auto z2v = zv * zv;

// some xsimd constant for shuffle or inverse
@@ -328,30 +335,32 @@
}
}();

// process simd vecs
simd_type k_prev, k_sym{0};
for (uint8_t i{0}, offset = offset_start; i < end_idx;
i += simd_size, offset -= simd_size) {
auto k_odd = [i]() constexpr noexcept {

static_loop<0, end_idx, simd_size>([&]([[maybe_unused]] const auto i) {
constexpr auto offset = static_cast<uint8_t>(offset_start - i);
auto k_odd = [i]() constexpr noexcept {
if constexpr (if_odd_degree) {
return simd_type::load_aligned(padded_coeffs[0].data() + i);
} else {
return simd_type{0};
}
}();
auto k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i);
for (uint8_t j{1 + if_odd_degree}; j < nc; j += 2) {

static_loop<1 + if_odd_degree, static_cast<std::int64_t>(nc), 2>([&](const auto j) {
const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i);
const auto cji_even = simd_type::load_aligned(padded_coeffs[j + 1].data() + i);
k_odd = xsimd::fma(k_odd, z2v, cji_odd);
k_even = xsimd::fma(k_even, z2v, cji_even);
}
});

// left part
xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i);

// right part symmetric to the left part
if (offset >= end_idx) {
if constexpr (offset >= end_idx) {
if constexpr (tail) {
// to use an aligned store, we need to shuffle the previous k_sym and the current k_sym
k_prev = k_sym;
k_sym = xsimd::fnma(k_odd, zv, k_even);
xsimd::shuffle(k_sym, k_prev, shuffle_batch).store_aligned(ker + offset);
@@ -360,20 +369,21 @@
.store_aligned(ker + offset);
}
}
}
});

} else {
const simd_type zv(z);
for (uint8_t i = 0; i < w; i += simd_size) {
static_loop<0, w, simd_size>([&](const auto i) {
auto k = simd_type::load_aligned(padded_coeffs[0].data() + i);
for (uint8_t j = 1; j < nc; ++j) {

static_loop<1, static_cast<std::int64_t>(nc), 1>([&](const auto j) {
const auto cji = simd_type::load_aligned(padded_coeffs[j].data() + i);
k = xsimd::fma(k, zv, cji);
}
});

k.store_aligned(ker + i);
}
});
}
}
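
// Scalar model of the odd/even Horner split above (illustrative only; names
// hypothetical, c[0] assumed to hold the highest-degree coefficient). Writing
// p(z) = E(z^2) + z*O(z^2) gives the mirrored value p(-z) = E(z^2) - z*O(z^2)
// for free, which is the kernel symmetry trick used when simd_size < w.
template<typename T>
static T horner_sym_scalar(const T *c, std::size_t nc, T z, T &p_neg) {
  const T z2 = z * z;
  const bool odd_degree = (nc % 2) == 0; // degree nc-1 is odd iff nc is even
  T ko = odd_degree ? c[0] : T(0);       // odd-power partial sum O(z^2)
  T ke = odd_degree ? c[1] : c[0];       // even-power partial sum E(z^2)
  for (std::size_t j = odd_degree ? 2 : 1; j + 1 < nc; j += 2) {
    ko = ko * z2 + c[j];     // matches xsimd::fma(k_odd, z2v, cji_odd)
    ke = ke * z2 + c[j + 1]; // matches xsimd::fma(k_even, z2v, cji_even)
  }
  p_neg = ke - ko * z; // right half: xsimd::fnma(k_odd, zv, k_even)
  return ke + ko * z;  // left half:  xsimd::fma(k_odd, zv, k_even)
}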

template<typename T, uint8_t ns>
static void interp_line_wrap(T *FINUFFT_RESTRICT target, const T *du, const T *ker,
const BIGINT i1, const UBIGINT N1) {
@@ -1887,7 +1897,7 @@
nb);
} // end of choice of which t1 spread type to use
return 0;
};

Check warning on line 1900 in src/spreadinterp.cpp (GitHub Actions / cmake-ci, ubuntu-22.04, gcc-10): extra ‘;’ [-Wpedantic]

// --------------------------------------------------------------------------
template<typename T, int ns, int kerevalmeth>