diff --git a/CMakeLists.txt b/CMakeLists.txt
index 423c8adc4..7e5e2cf5d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -121,9 +121,7 @@ endif()
 # This set of sources is compiled twice, once in single precision and once in
 # double precision. The single precision compilation is done with -DSINGLE
-set(FINUFFT_PRECISION_DEPENDENT_SOURCES
-    src/finufft.cpp src/fft.cpp src/simpleinterfaces.cpp src/spreadinterp.cpp
-    src/utils.cpp)
+set(FINUFFT_PRECISION_DEPENDENT_SOURCES)
 # If we're building for Fortran, make sure we also include the translation
 # layer.
@@ -252,25 +250,30 @@ endfunction()
 if(FINUFFT_USE_CPU)
   # Main finufft libraries
-  add_library(finufft_f32 OBJECT ${FINUFFT_PRECISION_DEPENDENT_SOURCES})
-  target_compile_definitions(finufft_f32 PRIVATE SINGLE)
-  set_finufft_options(finufft_f32)
-
-  add_library(finufft_f64 OBJECT ${FINUFFT_PRECISION_DEPENDENT_SOURCES})
-  set_finufft_options(finufft_f64)
   if(NOT FINUFFT_STATIC_LINKING)
-    add_library(finufft SHARED src/utils_precindep.cpp
-                               contrib/legendre_rule_fast.cpp)
+    add_library(
+      finufft SHARED
+      src/spreadinterp.cpp
+      src/utils.cpp
+      contrib/legendre_rule_fast.cpp
+      src/fft.cpp
+      src/finufft_core.cpp
+      src/simpleinterfaces.cpp
+      fortran/finufftfort.cpp)
   else()
-    add_library(finufft STATIC src/utils_precindep.cpp
-                               contrib/legendre_rule_fast.cpp)
+    add_library(
+      finufft STATIC
+      src/spreadinterp.cpp
+      src/utils.cpp
+      contrib/legendre_rule_fast.cpp
+      src/fft.cpp
+      src/finufft_core.cpp
+      src/simpleinterfaces.cpp
+      fortran/finufftfort.cpp)
   endif()
-  target_link_libraries(finufft PRIVATE finufft_f32 finufft_f64)
   set_finufft_options(finufft)
   if(WIN32 AND FINUFFT_SHARED_LINKING)
-    target_compile_definitions(finufft_f32 PRIVATE dll_EXPORTS FINUFFT_DLL)
-    target_compile_definitions(finufft_f64 PRIVATE dll_EXPORTS FINUFFT_DLL)
     target_compile_definitions(finufft PRIVATE dll_EXPORTS FINUFFT_DLL)
   endif()
   find_library(MATH_LIBRARY m)
diff --git a/fortran/finufftfort.cpp b/fortran/finufftfort.cpp
index 799a10041..400ff0985 100644
--- a/fortran/finufftfort.cpp
+++ b/fortran/finufftfort.cpp
@@ -19,43 +19,15 @@
 // public header
 #include
-
-// private headers needed...
(must come after finufft.h which clobbers FINUFFT*) -#include - -// local prec-switching macros for fortran names, ie -// underscore-suffixed versions of those at end of defs.h -#define FINUFFT_DEFAULT_OPTS_ FINUFFTIFY(_default_opts_) -#define FINUFFT_MAKEPLAN_ FINUFFTIFY(_makeplan_) -#define FINUFFT_SETPTS_ FINUFFTIFY(_setpts_) -#define FINUFFT_EXECUTE_ FINUFFTIFY(_execute_) -#define FINUFFT_DESTROY_ FINUFFTIFY(_destroy_) -#define FINUFFT1D1_ FINUFFTIFY(1d1_) -#define FINUFFT1D2_ FINUFFTIFY(1d2_) -#define FINUFFT1D3_ FINUFFTIFY(1d3_) -#define FINUFFT2D1_ FINUFFTIFY(2d1_) -#define FINUFFT2D2_ FINUFFTIFY(2d2_) -#define FINUFFT2D3_ FINUFFTIFY(2d3_) -#define FINUFFT3D1_ FINUFFTIFY(3d1_) -#define FINUFFT3D2_ FINUFFTIFY(3d2_) -#define FINUFFT3D3_ FINUFFTIFY(3d3_) -#define FINUFFT1D1MANY_ FINUFFTIFY(1d1many_) -#define FINUFFT1D2MANY_ FINUFFTIFY(1d2many_) -#define FINUFFT1D3MANY_ FINUFFTIFY(1d3many_) -#define FINUFFT2D1MANY_ FINUFFTIFY(2d1many_) -#define FINUFFT2D2MANY_ FINUFFTIFY(2d2many_) -#define FINUFFT2D3MANY_ FINUFFTIFY(2d3many_) -#define FINUFFT3D1MANY_ FINUFFTIFY(3d1many_) -#define FINUFFT3D2MANY_ FINUFFTIFY(3d2many_) -#define FINUFFT3D3MANY_ FINUFFTIFY(3d3many_) +#include #ifdef __cplusplus extern "C" { #endif // --------------------- guru interface from fortran ------------------------ -void FINUFFT_MAKEPLAN_(int *type, int *n_dims, BIGINT *n_modes, int *iflag, int *n_transf, - FLT *tol, FINUFFT_PLAN *plan, finufft_opts *o, int *ier) { +void finufft_makeplan_(int *type, int *n_dims, BIGINT *n_modes, int *iflag, int *n_transf, + double *tol, finufft_plan *plan, finufft_opts *o, int *ier) { if (!plan) fprintf(stderr, "%s fortran: plan must be allocated as at least the size of a C pointer " @@ -63,143 +35,325 @@ void FINUFFT_MAKEPLAN_(int *type, int *n_dims, BIGINT *n_modes, int *iflag, int __func__); else { // pass o whether it's a NULL or pointer to a fortran-allocated finufft_opts: - *ier = FINUFFT_MAKEPLAN(*type, *n_dims, n_modes, *iflag, *n_transf, *tol, plan, o); + *ier = finufft_makeplan(*type, *n_dims, n_modes, *iflag, *n_transf, *tol, plan, o); } } -void FINUFFT_SETPTS_(FINUFFT_PLAN *plan, BIGINT *M, FLT *xj, FLT *yj, FLT *zj, BIGINT *nk, - FLT *s, FLT *t, FLT *u, int *ier) { +void finufft_setpts_(finufft_plan *plan, BIGINT *M, double *xj, double *yj, double *zj, + BIGINT *nk, double *s, double *t, double *u, int *ier) { if (!*plan) { fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__); return; } int nk_safe = 0; // catches the case where user passes NULL in - if (nk) nk_safe = *nk; - *ier = FINUFFT_SETPTS(*plan, *M, xj, yj, zj, nk_safe, s, t, u); + if (nk) nk_safe = int(*nk); + *ier = finufft_setpts(*plan, *M, xj, yj, zj, nk_safe, s, t, u); } -void FINUFFT_EXECUTE_(FINUFFT_PLAN *plan, CPX *weights, CPX *result, int *ier) { +void finufft_execute_(finufft_plan *plan, std::complex *weights, + std::complex *result, int *ier) { if (!plan) fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__); else - *ier = FINUFFT_EXECUTE(*plan, weights, result); + *ier = finufft_execute(*plan, weights, result); } -void FINUFFT_DESTROY_(FINUFFT_PLAN *plan, int *ier) { +void finufft_destroy_(finufft_plan *plan, int *ier) { if (!plan) fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__); else - *ier = FINUFFT_DESTROY(*plan); + *ier = finufft_destroy(*plan); } // ------------ use FINUFFT to set the default options --------------------- // (Note the finufft_opts is created in f90-style derived types, not here) -void FINUFFT_DEFAULT_OPTS_(finufft_opts *o) { +void 
finufft_default_opts_(finufft_opts *o) { if (!o) fprintf(stderr, "%s fortran: opts must be allocated!\n", __func__); else // o is a ptr to already-allocated fortran finufft_opts derived type... - FINUFFT_DEFAULT_OPTS(o); + finufft_default_opts(o); } // -------------- simple and many-vector interfaces -------------------- // --- 1D --- -void FINUFFT1D1_(BIGINT *nj, FLT *xj, CPX *cj, int *iflag, FLT *eps, BIGINT *ms, CPX *fk, - finufft_opts *o, int *ier) { - *ier = FINUFFT1D1(*nj, xj, cj, *iflag, *eps, *ms, fk, o); +void finufft1d1_(BIGINT *nj, double *xj, std::complex *cj, int *iflag, + double *eps, BIGINT *ms, std::complex *fk, finufft_opts *o, + int *ier) { + *ier = finufft1d1(*nj, xj, cj, *iflag, *eps, *ms, fk, o); } -void FINUFFT1D1MANY_(int *ntransf, BIGINT *nj, FLT *xj, CPX *cj, int *iflag, FLT *eps, - BIGINT *ms, CPX *fk, finufft_opts *o, int *ier) { - *ier = FINUFFT1D1MANY(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o); +void finufft1d1many_(int *ntransf, BIGINT *nj, double *xj, std::complex *cj, + int *iflag, double *eps, BIGINT *ms, std::complex *fk, + finufft_opts *o, int *ier) { + *ier = finufft1d1many(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o); } -void FINUFFT1D2_(BIGINT *nj, FLT *xj, CPX *cj, int *iflag, FLT *eps, BIGINT *ms, CPX *fk, - finufft_opts *o, int *ier) { - *ier = FINUFFT1D2(*nj, xj, cj, *iflag, *eps, *ms, fk, o); +void finufft1d2_(BIGINT *nj, double *xj, std::complex *cj, int *iflag, + double *eps, BIGINT *ms, std::complex *fk, finufft_opts *o, + int *ier) { + *ier = finufft1d2(*nj, xj, cj, *iflag, *eps, *ms, fk, o); } -void FINUFFT1D2MANY_(int *ntransf, BIGINT *nj, FLT *xj, CPX *cj, int *iflag, FLT *eps, - BIGINT *ms, CPX *fk, finufft_opts *o, int *ier) { - *ier = FINUFFT1D2MANY(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o); +void finufft1d2many_(int *ntransf, BIGINT *nj, double *xj, std::complex *cj, + int *iflag, double *eps, BIGINT *ms, std::complex *fk, + finufft_opts *o, int *ier) { + *ier = finufft1d2many(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o); } -void FINUFFT1D3_(BIGINT *nj, FLT *x, CPX *c, int *iflag, FLT *eps, BIGINT *nk, FLT *s, - CPX *f, finufft_opts *o, int *ier) { - *ier = FINUFFT1D3(*nj, x, c, *iflag, *eps, *nk, s, f, o); +void finufft1d3_(BIGINT *nj, double *x, std::complex *c, int *iflag, double *eps, + BIGINT *nk, double *s, std::complex *f, finufft_opts *o, + int *ier) { + *ier = finufft1d3(*nj, x, c, *iflag, *eps, *nk, s, f, o); } -void FINUFFT1D3MANY_(int *ntransf, BIGINT *nj, FLT *x, CPX *c, int *iflag, FLT *eps, - BIGINT *nk, FLT *s, CPX *f, finufft_opts *o, int *ier) { - *ier = FINUFFT1D3MANY(*ntransf, *nj, x, c, *iflag, *eps, *nk, s, f, o); +void finufft1d3many_(int *ntransf, BIGINT *nj, double *x, std::complex *c, + int *iflag, double *eps, BIGINT *nk, double *s, + std::complex *f, finufft_opts *o, int *ier) { + *ier = finufft1d3many(*ntransf, *nj, x, c, *iflag, *eps, *nk, s, f, o); } // --- 2D --- -void FINUFFT2D1_(BIGINT *nj, FLT *xj, FLT *yj, CPX *cj, int *iflag, FLT *eps, BIGINT *ms, - BIGINT *mt, CPX *fk, finufft_opts *o, int *ier) { - *ier = FINUFFT2D1(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); +void finufft2d1_(BIGINT *nj, double *xj, double *yj, std::complex *cj, int *iflag, + double *eps, BIGINT *ms, BIGINT *mt, std::complex *fk, + finufft_opts *o, int *ier) { + *ier = finufft2d1(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); +} +void finufft2d1many_(int *ntransf, BIGINT *nj, double *xj, double *yj, + std::complex *cj, int *iflag, double *eps, BIGINT *ms, + BIGINT *mt, std::complex *fk, finufft_opts *o, 
int *ier) { + *ier = finufft2d1many(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); +} + +void finufft2d2_(BIGINT *nj, double *xj, double *yj, std::complex *cj, int *iflag, + double *eps, BIGINT *ms, BIGINT *mt, std::complex *fk, + finufft_opts *o, int *ier) { + *ier = finufft2d2(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); +} +void finufft2d2many_(int *ntransf, BIGINT *nj, double *xj, double *yj, + std::complex *cj, int *iflag, double *eps, BIGINT *ms, + BIGINT *mt, std::complex *fk, finufft_opts *o, int *ier) { + *ier = finufft2d2many(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); } -void FINUFFT2D1MANY_(int *ntransf, BIGINT *nj, FLT *xj, FLT *yj, CPX *cj, int *iflag, - FLT *eps, BIGINT *ms, BIGINT *mt, CPX *fk, finufft_opts *o, + +void finufft2d3_(BIGINT *nj, double *x, double *y, std::complex *c, int *iflag, + double *eps, BIGINT *nk, double *s, double *t, std::complex *f, + finufft_opts *o, int *ier) { + *ier = finufft2d3(*nj, x, y, c, *iflag, *eps, *nk, s, t, f, o); +} + +void finufft2d3many_(int *ntransf, BIGINT *nj, double *x, double *y, + std::complex *c, int *iflag, double *eps, BIGINT *nk, + double *s, double *t, std::complex *f, finufft_opts *o, int *ier) { - *ier = FINUFFT2D1MANY(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); + *ier = finufft2d3many(*ntransf, *nj, x, y, c, *iflag, *eps, *nk, s, t, f, o); } -void FINUFFT2D2_(BIGINT *nj, FLT *xj, FLT *yj, CPX *cj, int *iflag, FLT *eps, BIGINT *ms, - BIGINT *mt, CPX *fk, finufft_opts *o, int *ier) { - *ier = FINUFFT2D2(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); +// --- 3D --- +void finufft3d1_(BIGINT *nj, double *xj, double *yj, double *zj, std::complex *cj, + int *iflag, double *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu, + std::complex *fk, finufft_opts *o, int *ier) { + *ier = finufft3d1(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void FINUFFT2D2MANY_(int *ntransf, BIGINT *nj, FLT *xj, FLT *yj, CPX *cj, int *iflag, - FLT *eps, BIGINT *ms, BIGINT *mt, CPX *fk, finufft_opts *o, + +void finufft3d1many_(int *ntransf, BIGINT *nj, double *xj, double *yj, double *zj, + std::complex *cj, int *iflag, double *eps, BIGINT *ms, + BIGINT *mt, BIGINT *mu, std::complex *fk, finufft_opts *o, int *ier) { - *ier = FINUFFT2D2MANY(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); + *ier = + finufft3d1many(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void FINUFFT2D3_(BIGINT *nj, FLT *x, FLT *y, CPX *c, int *iflag, FLT *eps, BIGINT *nk, - FLT *s, FLT *t, CPX *f, finufft_opts *o, int *ier) { - *ier = FINUFFT2D3(*nj, x, y, c, *iflag, *eps, *nk, s, t, f, o); +void finufft3d2_(BIGINT *nj, double *xj, double *yj, double *zj, std::complex *cj, + int *iflag, double *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu, + std::complex *fk, finufft_opts *o, int *ier) { + *ier = finufft3d2(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void FINUFFT2D3MANY_(int *ntransf, BIGINT *nj, FLT *x, FLT *y, CPX *c, int *iflag, - FLT *eps, BIGINT *nk, FLT *s, FLT *t, CPX *f, finufft_opts *o, +void finufft3d2many_(int *ntransf, BIGINT *nj, double *xj, double *yj, double *zj, + std::complex *cj, int *iflag, double *eps, BIGINT *ms, + BIGINT *mt, BIGINT *mu, std::complex *fk, finufft_opts *o, int *ier) { - *ier = FINUFFT2D3MANY(*ntransf, *nj, x, y, c, *iflag, *eps, *nk, s, t, f, o); + *ier = + finufft3d2many(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -// --- 3D --- -void FINUFFT3D1_(BIGINT *nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int *iflag, FLT *eps, 
- BIGINT *ms, BIGINT *mt, BIGINT *mu, CPX *fk, finufft_opts *o, int *ier) { - *ier = FINUFFT3D1(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); +void finufft3d3_(BIGINT *nj, double *x, double *y, double *z, std::complex *c, + int *iflag, double *eps, BIGINT *nk, double *s, double *t, double *u, + std::complex *f, finufft_opts *o, int *ier) { + *ier = finufft3d3(*nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o); } -void FINUFFT3D1MANY_(int *ntransf, BIGINT *nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, - int *iflag, FLT *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu, CPX *fk, +void finufft3d3many_(int *ntransf, BIGINT *nj, double *x, double *y, double *z, + std::complex *c, int *iflag, double *eps, BIGINT *nk, + double *s, double *t, double *u, std::complex *f, finufft_opts *o, int *ier) { + *ier = finufft3d3many(*ntransf, *nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o); +} + +// --------------------- guru interface from fortran ------------------------ +void finufftf_makeplan_(int *type, int *n_dims, BIGINT *n_modes, int *iflag, + int *n_transf, float *tol, finufftf_plan *plan, finufft_opts *o, + int *ier) { + if (!plan) + fprintf(stderr, + "%s fortran: plan must be allocated as at least the size of a C pointer " + "(usually 8 bytes)!\n", + __func__); + else { + // pass o whether it's a NULL or pointer to a fortran-allocated finufft_opts: + *ier = finufftf_makeplan(*type, *n_dims, n_modes, *iflag, *n_transf, *tol, plan, o); + } +} + +void finufftf_setpts_(finufftf_plan *plan, BIGINT *M, float *xj, float *yj, float *zj, + BIGINT *nk, float *s, float *t, float *u, int *ier) { + if (!*plan) { + fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__); + return; + } + int nk_safe = 0; // catches the case where user passes NULL in + if (nk) nk_safe = int(*nk); + *ier = finufftf_setpts(*plan, *M, xj, yj, zj, nk_safe, s, t, u); +} + +void finufftf_execute_(finufftf_plan *plan, std::complex *weights, + std::complex *result, int *ier) { + if (!plan) + fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__); + else + *ier = finufftf_execute(*plan, weights, result); +} + +void finufftf_destroy_(finufftf_plan *plan, int *ier) { + if (!plan) + fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__); + else + *ier = finufftf_destroy(*plan); +} + +// ------------ use FINUFFT to set the default options --------------------- +// (Note the finufft_opts is created in f90-style derived types, not here) +void finufftf_default_opts_(finufft_opts *o) { + if (!o) + fprintf(stderr, "%s fortran: opts must be allocated!\n", __func__); + else + // o is a ptr to already-allocated fortran finufft_opts derived type... 
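All of these wrappers follow the same f77 binding convention: an extern "C" symbol with a trailing underscore (the classic Fortran name-mangling scheme), every argument passed by pointer because Fortran passes by reference, and the status code returned through the trailing ier argument rather than a return value. A minimal self-contained sketch of that convention; demo_ and its arguments are hypothetical, not FINUFFT symbols:

    // trailing-underscore extern "C" symbol, callable from f77 as: call demo(n, x, ier)
    extern "C" void demo_(int *n, double *x, int *ier) {
      if (!n || !x) { *ier = 1; return; } // failure reported via trailing status arg
      for (int i = 0; i < *n; ++i) x[i] *= 2.0;
      *ier = 0;
    }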
+ finufft_default_opts(o); +} + +// -------------- simple and many-vector interfaces -------------------- +// --- 1D --- +void finufftf1d1_(BIGINT *nj, float *xj, std::complex *cj, int *iflag, float *eps, + BIGINT *ms, std::complex *fk, finufft_opts *o, int *ier) { + *ier = finufftf1d1(*nj, xj, cj, *iflag, *eps, *ms, fk, o); +} + +void finufftf1d1many_(int *ntransf, BIGINT *nj, float *xj, std::complex *cj, + int *iflag, float *eps, BIGINT *ms, std::complex *fk, + finufft_opts *o, int *ier) { + *ier = finufftf1d1many(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o); +} + +void finufftf1d2_(BIGINT *nj, float *xj, std::complex *cj, int *iflag, float *eps, + BIGINT *ms, std::complex *fk, finufft_opts *o, int *ier) { + *ier = finufftf1d2(*nj, xj, cj, *iflag, *eps, *ms, fk, o); +} + +void finufftf1d2many_(int *ntransf, BIGINT *nj, float *xj, std::complex *cj, + int *iflag, float *eps, BIGINT *ms, std::complex *fk, + finufft_opts *o, int *ier) { + *ier = finufftf1d2many(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o); +} + +void finufftf1d3_(BIGINT *nj, float *x, std::complex *c, int *iflag, float *eps, + BIGINT *nk, float *s, std::complex *f, finufft_opts *o, + int *ier) { + *ier = finufftf1d3(*nj, x, c, *iflag, *eps, *nk, s, f, o); +} + +void finufftf1d3many_(int *ntransf, BIGINT *nj, float *x, std::complex *c, + int *iflag, float *eps, BIGINT *nk, float *s, + std::complex *f, finufft_opts *o, int *ier) { + *ier = finufftf1d3many(*ntransf, *nj, x, c, *iflag, *eps, *nk, s, f, o); +} + +// --- 2D --- +void finufftf2d1_(BIGINT *nj, float *xj, float *yj, std::complex *cj, int *iflag, + float *eps, BIGINT *ms, BIGINT *mt, std::complex *fk, + finufft_opts *o, int *ier) { + *ier = finufftf2d1(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); +} +void finufftf2d1many_(int *ntransf, BIGINT *nj, float *xj, float *yj, + std::complex *cj, int *iflag, float *eps, BIGINT *ms, + BIGINT *mt, std::complex *fk, finufft_opts *o, int *ier) { + *ier = finufftf2d1many(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); +} + +void finufftf2d2_(BIGINT *nj, float *xj, float *yj, std::complex *cj, int *iflag, + float *eps, BIGINT *ms, BIGINT *mt, std::complex *fk, + finufft_opts *o, int *ier) { + *ier = finufftf2d2(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); +} +void finufftf2d2many_(int *ntransf, BIGINT *nj, float *xj, float *yj, + std::complex *cj, int *iflag, float *eps, BIGINT *ms, + BIGINT *mt, std::complex *fk, finufft_opts *o, int *ier) { + *ier = finufftf2d2many(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); +} + +void finufftf2d3_(BIGINT *nj, float *x, float *y, std::complex *c, int *iflag, + float *eps, BIGINT *nk, float *s, float *t, std::complex *f, + finufft_opts *o, int *ier) { + *ier = finufftf2d3(*nj, x, y, c, *iflag, *eps, *nk, s, t, f, o); +} + +void finufftf2d3many_(int *ntransf, BIGINT *nj, float *x, float *y, + std::complex *c, int *iflag, float *eps, BIGINT *nk, + float *s, float *t, std::complex *f, finufft_opts *o, + int *ier) { + *ier = finufftf2d3many(*ntransf, *nj, x, y, c, *iflag, *eps, *nk, s, t, f, o); +} + +// --- 3D --- +void finufftf3d1_(BIGINT *nj, float *xj, float *yj, float *zj, std::complex *cj, + int *iflag, float *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu, + std::complex *fk, finufft_opts *o, int *ier) { + *ier = finufftf3d1(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); +} + +void finufftf3d1many_(int *ntransf, BIGINT *nj, float *xj, float *yj, float *zj, + std::complex *cj, int *iflag, float *eps, BIGINT *ms, + BIGINT *mt, BIGINT *mu, std::complex 
*fk, finufft_opts *o, + int *ier) { *ier = - FINUFFT3D1MANY(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); + finufftf3d1many(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void FINUFFT3D2_(BIGINT *nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int *iflag, FLT *eps, - BIGINT *ms, BIGINT *mt, BIGINT *mu, CPX *fk, finufft_opts *o, int *ier) { - *ier = FINUFFT3D2(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); +void finufftf3d2_(BIGINT *nj, float *xj, float *yj, float *zj, std::complex *cj, + int *iflag, float *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu, + std::complex *fk, finufft_opts *o, int *ier) { + *ier = finufftf3d2(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void FINUFFT3D2MANY_(int *ntransf, BIGINT *nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, - int *iflag, FLT *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu, CPX *fk, - finufft_opts *o, int *ier) { +void finufftf3d2many_(int *ntransf, BIGINT *nj, float *xj, float *yj, float *zj, + std::complex *cj, int *iflag, float *eps, BIGINT *ms, + BIGINT *mt, BIGINT *mu, std::complex *fk, finufft_opts *o, + int *ier) { *ier = - FINUFFT3D2MANY(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); + finufftf3d2many(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void FINUFFT3D3_(BIGINT *nj, FLT *x, FLT *y, FLT *z, CPX *c, int *iflag, FLT *eps, - BIGINT *nk, FLT *s, FLT *t, FLT *u, CPX *f, finufft_opts *o, int *ier) { - *ier = FINUFFT3D3(*nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o); +void finufftf3d3_(BIGINT *nj, float *x, float *y, float *z, std::complex *c, + int *iflag, float *eps, BIGINT *nk, float *s, float *t, float *u, + std::complex *f, finufft_opts *o, int *ier) { + *ier = finufftf3d3(*nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o); } -void FINUFFT3D3MANY_(int *ntransf, BIGINT *nj, FLT *x, FLT *y, FLT *z, CPX *c, int *iflag, - FLT *eps, BIGINT *nk, FLT *s, FLT *t, FLT *u, CPX *f, - finufft_opts *o, int *ier) { - *ier = FINUFFT3D3MANY(*ntransf, *nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o); +void finufftf3d3many_(int *ntransf, BIGINT *nj, float *x, float *y, float *z, + std::complex *c, int *iflag, float *eps, BIGINT *nk, + float *s, float *t, float *u, std::complex *f, + finufft_opts *o, int *ier) { + *ier = finufftf3d3many(*ntransf, *nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o); } #ifdef __cplusplus diff --git a/include/finufft/defs.h b/include/finufft/defs.h index 42e5e7ff8..084ffa41c 100644 --- a/include/finufft/defs.h +++ b/include/finufft/defs.h @@ -18,6 +18,7 @@ // public header gives access to f_opts, f_spread_opts, f_plan... // (and clobbers FINUFFT* macros; watch out!) #include +#include #include // --------------- Private data types for compilation in either prec --------- @@ -25,8 +26,8 @@ // All indexing in library that potentially can exceed 2^31 uses 64-bit signed. // This includes all calling arguments (eg M,N) that could be huge someday. 
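The duplicated double/float wrapper blocks above are exactly the duplication this refactor removes from the core: where the old build compiled each precision-dependent source twice, with -DSINGLE toggling FLT between float and double, the new core defines each routine once as a template and instantiates both precisions explicitly in the same translation unit. A minimal sketch of the mechanism; twonorm_sketch is illustrative, not library code:

    #include <cmath>
    #include <complex>
    #include <cstdint>
    using BIGINT = int64_t;

    // one definition serves both precisions
    template<typename TF> TF twonorm_sketch(BIGINT n, std::complex<TF> *a) {
      TF nrm = 0; // accumulate |a[m]|^2, as the utils.h helpers in this diff do
      for (BIGINT m = 0; m < n; ++m) nrm += std::norm(a[m]);
      return std::sqrt(nrm);
    }
    // explicit instantiation: both precisions land in one object file, so the
    // second -DSINGLE compile pass (and the *_f32 CMake targets) can go away
    template float  twonorm_sketch<float>(BIGINT, std::complex<float> *);
    template double twonorm_sketch<double>(BIGINT, std::complex<double> *);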
-using BIGINT = int64_t; -using UBIGINT = uint64_t; +// using BIGINT = int64_t; +// using UBIGINT = uint64_t; // Precision-independent real and complex types, for private lib/test compile #ifdef SINGLE using FLT = float; @@ -36,59 +37,6 @@ using FLT = double; #include // we define C++ complex type only using CPX = std::complex; -// inline macro, to force inlining of small functions -// this avoids the use of macros to implement functions -#if defined(_MSC_VER) -#define FINUFFT_ALWAYS_INLINE __forceinline inline -#define FINUFFT_NEVER_INLINE __declspec(noinline) -#define FINUFFT_RESTRICT __restrict -#define FINUFFT_UNREACHABLE __assume(0) -#define FINUFFT_UNLIKELY(x) (x) -#define FINUFFT_LIKELY(x) (x) -#elif defined(__GNUC__) || defined(__clang__) -#define FINUFFT_ALWAYS_INLINE __attribute__((always_inline)) inline -#define FINUFFT_NEVER_INLINE __attribute__((noinline)) -#define FINUFFT_RESTRICT __restrict__ -#define FINUFFT_UNREACHABLE __builtin_unreachable() -#define FINUFFT_UNLIKELY(x) __builtin_expect(!!(x), 0) -#define FINUFFT_LIKELY(x) __builtin_expect(!!(x), 1) -#else -#define FINUFFT_ALWAYS_INLINE inline -#define FINUFFT_NEVER_INLINE -#define FINUFFT_RESTRICT -#define FINUFFT_UNREACHABLE -#define FINUFFT_UNLIKELY(x) (x) -#define FINUFFT_LIKELY(x) (x) -#endif - -// ------------- Library-wide algorithm parameter settings ---------------- - -// Library version (is a string) -#define FINUFFT_VER "2.3.0" - -// Smallest possible kernel spread width per dimension, in fine grid points -// (used only in spreadinterp.cpp) -inline constexpr int MIN_NSPREAD = 2; - -// Largest possible kernel spread width per dimension, in fine grid points -// (used only in spreadinterp.cpp) -inline constexpr int MAX_NSPREAD = 16; - -// Fraction growth cut-off in utils:arraywidcen, sets when translate in type-3 -inline constexpr double ARRAYWIDCEN_GROWFRAC = 0.1; - -// Max number of positive quadr nodes for kernel FT (used only in common.cpp) -inline constexpr int MAX_NQUAD = 100; - -// Internal (nf1 etc) array allocation size that immediately raises error. -// (Note: next235 takes 1s for 1e11, so it is also to prevent hang here.) -// Increase this if you need >10TB (!) RAM... -inline constexpr BIGINT MAX_NF = BIGINT(1e12); - -// Maximum allowed number M of NU points; useful to catch incorrectly cast int32 -// values for M = nj (also nk in type 3)... -inline constexpr BIGINT MAX_NU_PTS = BIGINT(1e14); - // -------------- Math consts (not in math.h) and useful math macros ---------- #include @@ -108,13 +56,6 @@ inline constexpr BIGINT MAX_NU_PTS = BIGINT(1e14); // to avoid mixed precision operators in eg i*pi, an either-prec PI... #define PI FLT(M_PI) -// machine epsilon for decisions of achievable tolerance... -#ifdef SINGLE -#define EPSILON (float)6e-08 -#else -#define EPSILON (double)1.1e-16 -#endif - // Random numbers: crappy unif random number generator in [0,1). // These macros should probably be replaced by modern C++ std lib or random123. 
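The hardcoded EPSILON constants removed here (6e-08 single, 1.1e-16 double) sit at roughly half of machine epsilon for each type, so in a templated core the same threshold can come from the standard library instead of a prec-switched macro. One plausible replacement, not necessarily what the library adopts:

    #include <limits>
    template<typename T>
    constexpr T eps_threshold = T(0.5) * std::numeric_limits<T>::epsilon();
    // eps_threshold<float> ~ 6.0e-08, eps_threshold<double> ~ 1.1e-16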
// (RAND_MAX is in stdlib.h) @@ -148,32 +89,6 @@ static inline CPX crandm11r [[maybe_unused]] (unsigned int *x) { } #endif -// ----- OpenMP macros which also work when omp not present ----- -// Allows compile-time switch off of openmp, so compilation without any openmp -// is done (Note: _OPENMP is automatically set by -fopenmp compile flag) -#ifdef _OPENMP -#include -// point to actual omp utils -static inline int MY_OMP_GET_NUM_THREADS [[maybe_unused]] () { - return omp_get_num_threads(); -} -static inline int MY_OMP_GET_MAX_THREADS [[maybe_unused]] () { - return omp_get_max_threads(); -} -static inline int MY_OMP_GET_THREAD_NUM [[maybe_unused]] () { - return omp_get_thread_num(); -} -static inline void MY_OMP_SET_NUM_THREADS [[maybe_unused]] (int x) { - omp_set_num_threads(x); -} -#else -// non-omp safe dummy versions of omp utils... -static inline int MY_OMP_GET_NUM_THREADS [[maybe_unused]] () { return 1; } -static inline int MY_OMP_GET_MAX_THREADS [[maybe_unused]] () { return 1; } -static inline int MY_OMP_GET_THREAD_NUM [[maybe_unused]] () { return 0; } -static inline void MY_OMP_SET_NUM_THREADS [[maybe_unused]] (int) {} -#endif - // Prec-switching name macros (respond to SINGLE), used in lib & test sources // and the plan object below. // Note: crucially, these are now indep of macros used to gen public finufft.h! @@ -219,70 +134,6 @@ static inline void MY_OMP_SET_NUM_THREADS [[maybe_unused]] (int) {} // NB: now private (the public C++ or C etc user sees an opaque pointer to it) #include // (must come after complex.h) - -// group together a bunch of type 3 rescaling/centering/phasing parameters: -template struct type3params { - T X1, C1, D1, h1, gam1; // x dim: X=halfwid C=center D=freqcen h,gam=rescale - T X2, C2, D2, h2, gam2; // y - T X3, C3, D3, h3, gam3; // z -}; - -struct FINUFFT_PLAN_S { // the main plan object, fully C++ - // These default and delete specifications just state the obvious, - // but are here to silence compiler warnings. - FINUFFT_PLAN_S() = default; - // Copy construction and assignent are already deleted implicitly - // because of the unique_ptr member. - FINUFFT_PLAN_S(const FINUFFT_PLAN_S &) = delete; - FINUFFT_PLAN_S &operator=(const FINUFFT_PLAN_S &) = delete; - - int type; // transform type (Rokhlin naming): 1,2 or 3 - int dim; // overall dimension: 1,2 or 3 - int ntrans; // how many transforms to do at once (vector or "many" mode) - BIGINT nj; // num of NU pts in type 1,2 (for type 3, num input x pts) - BIGINT nk; // number of NU freq pts (type 3 only) - FLT tol; // relative user tolerance - int batchSize; // # strength vectors to group together for FFTW, etc - int nbatch; // how many batches done to cover all ntrans vectors - - BIGINT ms; // number of modes in x (1) dir (historical CMCL name) = N1 - BIGINT mt; // number of modes in y (2) direction = N2 - BIGINT mu; // number of modes in z (3) direction = N3 - BIGINT N; // total # modes (prod of above three) - - BIGINT nf1; // size of internal fine grid in x (1) direction - BIGINT nf2; // " y (2) - BIGINT nf3; // " z (3) - BIGINT nf; // total # fine grid points (product of the above three) - - int fftSign; // sign in exponential for NUFFT defn, guaranteed to be +-1 - - FLT *phiHat1; // FT of kernel in t1,2, on x-axis mode grid - FLT *phiHat2; // " y-axis. - FLT *phiHat3; // " z-axis. - - CPX *fwBatch; // (batches of) fine grid(s) for FFTW to plan - // & act on. 
Usually the largest working array
-
-  BIGINT *sortIndices; // precomputed NU pt permutation, speeds spread/interp
-  bool didSort;        // whether binsorting used (false: identity perm used)
-
-  FLT *X, *Y, *Z; // for t1,2: ptr to user-supplied NU pts (no new allocs).
-                  // for t3: allocated as "primed" (scaled) src pts x'_j, etc
-
-  // type 3 specific
-  FLT *S, *T, *U;    // pointers to user's target NU pts arrays (no new allocs)
-  CPX *prephase;     // pre-phase, for all input NU pts
-  CPX *deconv;       // reciprocal of kernel FT, phase, all output NU pts
-  CPX *CpBatch;      // working array of prephased strengths
-  FLT *Sp, *Tp, *Up; // internal primed targs (s'_k, etc), allocated
-  type3params<FLT> t3P; // groups together type 3 shift, scale, phase, parameters
-  FINUFFT_PLAN innerT2plan; // ptr used for type 2 in step 2 of type 3
-
-  // other internal structs; each is C-compatible of course
-  std::unique_ptr<Finufft_FFT_plan<FLT>> fftPlan;
-  finufft_opts opts; // this and spopts could be made ptrs
-  finufft_spread_opts spopts;
-};
+struct FINUFFT_PLAN_S : public FINUFFT_PLAN_T<FLT> {};

 #endif // DEFS_H
diff --git a/include/finufft/fft.h b/include/finufft/fft.h
index bab43966c..c6d5de7a5 100644
--- a/include/finufft/fft.h
+++ b/include/finufft/fft.h
@@ -171,19 +171,22 @@ template<> struct Finufft_FFT_plan<double> {
 #endif

-#include
+#include

 static inline void finufft_fft_forget_wisdom [[maybe_unused]] () {
-  Finufft_FFT_plan<FLT>::forget_wisdom();
+  Finufft_FFT_plan<float>::forget_wisdom();
+  Finufft_FFT_plan<double>::forget_wisdom();
 }
 static inline void finufft_fft_cleanup [[maybe_unused]] () {
-  Finufft_FFT_plan<FLT>::cleanup();
+  Finufft_FFT_plan<float>::cleanup();
+  Finufft_FFT_plan<double>::cleanup();
 }
 static inline void finufft_fft_cleanup_threads [[maybe_unused]] () {
-  Finufft_FFT_plan<FLT>::cleanup_threads();
+  Finufft_FFT_plan<float>::cleanup_threads();
+  Finufft_FFT_plan<double>::cleanup_threads();
 }
-
-std::vector<int> gridsize_for_fft(FINUFFT_PLAN p);
-void do_fft(FINUFFT_PLAN p);
+template<typename TF> struct FINUFFT_PLAN_T;
+template<typename TF> std::vector<int> gridsize_for_fft(FINUFFT_PLAN_T<TF> *p);
+template<typename TF> void do_fft(FINUFFT_PLAN_T<TF> *p);

 #endif // FINUFFT_INCLUDE_FINUFFT_FFT_H
diff --git a/include/finufft/finufft_core.h b/include/finufft/finufft_core.h
new file mode 100644
index 000000000..de2f2dab9
--- /dev/null
+++ b/include/finufft/finufft_core.h
@@ -0,0 +1,213 @@
+#ifndef FINUFFT_CORE_H
+#define FINUFFT_CORE_H
+
+/* IMPORTANT: for Windows compilers, you should add a line
+     #define FINUFFT_DLL
+   here if you are compiling/using FINUFFT as a DLL,
+   in order to do the proper importing/exporting, or
+   alternatively compile with -DFINUFFT_DLL or the equivalent
+   command-line flag. This is not necessary under MinGW/Cygwin, where
+   libtool does the imports/exports automatically.
+   Alternatively use include(GenerateExportHeader) and
+   generate_export_header(finufft) to auto generate a header containing
+   these defines. The main reason is that if msvc changes the way it deals
+   with it in the future we just need to update cmake for it to work
+   instead of having a check on the msvc version. */
+#if defined(FINUFFT_DLL) && (defined(_WIN32) || defined(__WIN32__))
+#if defined(dll_EXPORTS)
+#define FINUFFT_EXPORT __declspec(dllexport)
+#else
+#define FINUFFT_EXPORT __declspec(dllimport)
+#endif
+#else
+#define FINUFFT_EXPORT
+#endif
+
+/* specify calling convention (Windows only)
+   The cdecl calling convention is actually not the default in all but a very
+   few C/C++ compilers.
+   If the user code changes the default compiler calling convention, may need
+   this when generating DLL.
*/ +#if defined(_WIN32) || defined(__WIN32__) +#define FINUFFT_CDECL __cdecl +#else +#define FINUFFT_CDECL +#endif + +// inline macro, to force inlining of small functions +// this avoids the use of macros to implement functions +#if defined(_MSC_VER) +#define FINUFFT_ALWAYS_INLINE __forceinline inline +#define FINUFFT_NEVER_INLINE __declspec(noinline) +#define FINUFFT_RESTRICT __restrict +#define FINUFFT_UNREACHABLE __assume(0) +#define FINUFFT_UNLIKELY(x) (x) +#define FINUFFT_LIKELY(x) (x) +#elif defined(__GNUC__) || defined(__clang__) +#define FINUFFT_ALWAYS_INLINE __attribute__((always_inline)) inline +#define FINUFFT_NEVER_INLINE __attribute__((noinline)) +#define FINUFFT_RESTRICT __restrict__ +#define FINUFFT_UNREACHABLE __builtin_unreachable() +#define FINUFFT_UNLIKELY(x) __builtin_expect(!!(x), 0) +#define FINUFFT_LIKELY(x) __builtin_expect(!!(x), 1) +#else +#define FINUFFT_ALWAYS_INLINE inline +#define FINUFFT_NEVER_INLINE +#define FINUFFT_RESTRICT +#define FINUFFT_UNREACHABLE +#define FINUFFT_UNLIKELY(x) (x) +#define FINUFFT_LIKELY(x) (x) +#endif + +#include +#include + +// All indexing in library that potentially can exceed 2^31 uses 64-bit signed. +// This includes all calling arguments (eg M,N) that could be huge someday. +using BIGINT = int64_t; +using UBIGINT = uint64_t; + +// ------------- Library-wide algorithm parameter settings ---------------- + +// Library version (is a string) +#define FINUFFT_VER "2.3.0" + +// Smallest possible kernel spread width per dimension, in fine grid points +// (used only in spreadinterp.cpp) +inline constexpr int MIN_NSPREAD = 2; + +// Largest possible kernel spread width per dimension, in fine grid points +// (used only in spreadinterp.cpp) +inline constexpr int MAX_NSPREAD = 16; + +// Fraction growth cut-off in utils:arraywidcen, sets when translate in type-3 +inline constexpr double ARRAYWIDCEN_GROWFRAC = 0.1; + +// Max number of positive quadr nodes for kernel FT (used only in common.cpp) +inline constexpr int MAX_NQUAD = 100; + +// Internal (nf1 etc) array allocation size that immediately raises error. +// (Note: next235 takes 1s for 1e11, so it is also to prevent hang here.) +// Increase this if you need >10TB (!) RAM... +inline constexpr BIGINT MAX_NF = BIGINT(1e12); + +// Maximum allowed number M of NU points; useful to catch incorrectly cast int32 +// values for M = nj (also nk in type 3)... +inline constexpr BIGINT MAX_NU_PTS = BIGINT(1e14); + +// ----- OpenMP macros which also work when omp not present ----- +// Allows compile-time switch off of openmp, so compilation without any openmp +// is done (Note: _OPENMP is automatically set by -fopenmp compile flag) +#ifdef _OPENMP +#include +// point to actual omp utils +static inline int MY_OMP_GET_NUM_THREADS [[maybe_unused]] () { + return omp_get_num_threads(); +} +static inline int MY_OMP_GET_MAX_THREADS [[maybe_unused]] () { + return omp_get_max_threads(); +} +static inline int MY_OMP_GET_THREAD_NUM [[maybe_unused]] () { + return omp_get_thread_num(); +} +static inline void MY_OMP_SET_NUM_THREADS [[maybe_unused]] (int x) { + omp_set_num_threads(x); +} +#else +// non-omp safe dummy versions of omp utils... 
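These MY_OMP_* shims (moved here verbatim from defs.h) let caller code read identically with and without OpenMP: under -fopenmp they forward to the real omp_* calls, and otherwise the single-threaded dummies below take over. A usage sketch, assuming this header is on the include path:

    #include <cstdio>
    // finufft/finufft_core.h assumed available
    int main() {
      // omp_get_max_threads() with OpenMP; always 1 without it
      std::printf("planning for %d threads\n", MY_OMP_GET_MAX_THREADS());
    }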
+static inline int MY_OMP_GET_NUM_THREADS [[maybe_unused]] () { return 1; }
+static inline int MY_OMP_GET_MAX_THREADS [[maybe_unused]] () { return 1; }
+static inline int MY_OMP_GET_THREAD_NUM [[maybe_unused]] () { return 0; }
+static inline void MY_OMP_SET_NUM_THREADS [[maybe_unused]] (int) {}
+#endif
+
+#include // (must come after complex.h)
+#include
+#include
+
+// group together a bunch of type 3 rescaling/centering/phasing parameters:
+template<typename T> struct type3params {
+  T X1, C1, D1, h1, gam1; // x dim: X=halfwid C=center D=freqcen h,gam=rescale
+  T X2, C2, D2, h2, gam2; // y
+  T X3, C3, D3, h3, gam3; // z
+};
+
+template<typename TF> struct FINUFFT_PLAN_T { // the main plan object, fully C++
+
+  using TC = std::complex<TF>;
+
+  // These default and delete specifications just state the obvious,
+  // but are here to silence compiler warnings.
+  FINUFFT_PLAN_T() = default;
+  // Copy construction and assignment are already deleted implicitly
+  // because of the unique_ptr member.
+  FINUFFT_PLAN_T(const FINUFFT_PLAN_T &) = delete;
+  FINUFFT_PLAN_T &operator=(const FINUFFT_PLAN_T &) = delete;
+  ~FINUFFT_PLAN_T();
+
+  int type;      // transform type (Rokhlin naming): 1,2 or 3
+  int dim;       // overall dimension: 1,2 or 3
+  int ntrans;    // how many transforms to do at once (vector or "many" mode)
+  BIGINT nj;     // num of NU pts in type 1,2 (for type 3, num input x pts)
+  BIGINT nk;     // number of NU freq pts (type 3 only)
+  TF tol;        // relative user tolerance
+  int batchSize; // # strength vectors to group together for FFTW, etc
+  int nbatch;    // how many batches done to cover all ntrans vectors
+
+  BIGINT ms; // number of modes in x (1) dir (historical CMCL name) = N1
+  BIGINT mt; // number of modes in y (2) direction = N2
+  BIGINT mu; // number of modes in z (3) direction = N3
+  BIGINT N;  // total # modes (prod of above three)
+
+  BIGINT nf1; // size of internal fine grid in x (1) direction
+  BIGINT nf2; // " y (2)
+  BIGINT nf3; // " z (3)
+  BIGINT nf;  // total # fine grid points (product of the above three)
+
+  int fftSign; // sign in exponential for NUFFT defn, guaranteed to be +-1
+
+  std::vector<TF> phiHat1; // FT of kernel in t1,2, on x-axis mode grid
+  std::vector<TF> phiHat2; // " y-axis.
+  std::vector<TF> phiHat3; // " z-axis.
+
+  TC *fwBatch = nullptr; // (batches of) fine grid(s) for FFTW to plan
+                         // & act on. Usually the largest working array
+
+  std::vector<BIGINT> sortIndices; // precomputed NU pt permutation, speeds spread/interp
+  bool didSort;                    // whether binsorting used (false: identity perm used)
+
+  TF *X = nullptr, *Y = nullptr, *Z = nullptr; // for t1,2: ptr to user-supplied NU pts
+                                               // (no new allocs). for t3: allocated as
+                                               // "primed" (scaled) src pts x'_j, etc
+
+  // type 3 specific
+  TF *S = nullptr, *T = nullptr, *U = nullptr; // pointers to user's target NU pts arrays
+                                               // (no new allocs)
+  std::vector<TC> prephase;   // pre-phase, for all input NU pts
+  std::vector<TC> deconv;     // reciprocal of kernel FT, phase, all output NU pts
+  std::vector<TC> CpBatch;    // working array of prephased strengths
+  std::vector<TF> Sp, Tp, Up; // internal primed targs (s'_k, etc),
+                              // allocated
+  type3params<TF> t3P; // groups together type 3 shift, scale, phase, parameters
+  FINUFFT_PLAN_T<TF> *innerT2plan = nullptr; // ptr used for type 2 in step 2 of type 3
+
+  // other internal structs
+  std::unique_ptr<Finufft_FFT_plan<TF>> fftPlan;
+  finufft_opts opts; // this and spopts could be made ptrs
+  finufft_spread_opts spopts;
+
+  int setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF *s, TF *t, TF *u);
+  int execute(std::complex<TF> *cj, std::complex<TF> *fk);
+};
+
+void finufft_default_opts_t(finufft_opts *o);
+template<typename TF>
+int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans,
+                       TF tol, FINUFFT_PLAN_T<TF> **pp, finufft_opts *opts);
+template<typename TF>
+int finufft_setpts_t(FINUFFT_PLAN_T<TF> *p, BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk,
+                     TF *s, TF *t, TF *u);
+template<typename TF>
+int finufft_execute_t(FINUFFT_PLAN_T<TF> *p, std::complex<TF> *cj, std::complex<TF> *fk);
+
+#endif // FINUFFT_CORE_H
diff --git a/include/finufft/spreadinterp.h b/include/finufft/spreadinterp.h
index 78ecf9f22..8a83af3ce 100644
--- a/include/finufft/spreadinterp.h
+++ b/include/finufft/spreadinterp.h
@@ -7,7 +7,6 @@
 #ifndef SPREADINTERP_H
 #define SPREADINTERP_H
-#include
 #include
 /* Bitwise debugging timing flag (TF) defs; see finufft_spread_opts.flags.
@@ -31,31 +30,28 @@ namespace finufft {
 namespace spreadinterp {
 // things external (spreadinterp) interface needs...
+template<typename T>
 FINUFFT_EXPORT int FINUFFT_CDECL spreadinterp(
-    UBIGINT N1, UBIGINT N2, UBIGINT N3, FLT *data_uniform, UBIGINT N, FLT *kx, FLT *ky,
-    FLT *kz, FLT *data_nonuniform, const finufft_spread_opts &opts);
+    UBIGINT N1, UBIGINT N2, UBIGINT N3, T *data_uniform, UBIGINT M, T *kx, T *ky, T *kz,
+    T *data_nonuniform, const finufft_spread_opts &opts);
+template<typename T>
 FINUFFT_EXPORT int FINUFFT_CDECL spreadcheck(UBIGINT N1, UBIGINT N2, UBIGINT N3,
-                                             UBIGINT N, FLT *kx, FLT *ky, FLT *kz,
+                                             UBIGINT N, T *kx, T *ky, T *kz,
                                              const finufft_spread_opts &opts);
-FINUFFT_EXPORT int FINUFFT_CDECL indexSort(BIGINT *sort_indices, UBIGINT N1, UBIGINT N2,
-                                           UBIGINT N3, UBIGINT N, FLT *kx, FLT *ky,
-                                           FLT *kz, const finufft_spread_opts &opts);
-FINUFFT_EXPORT int FINUFFT_CDECL interpSorted(
-    const BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3,
-    FLT *FINUFFT_RESTRICT data_uniform, UBIGINT N, FLT *FINUFFT_RESTRICT kx,
-    FLT *FINUFFT_RESTRICT ky, FLT *FINUFFT_RESTRICT kz,
-    FLT *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts);
-FINUFFT_EXPORT int FINUFFT_CDECL spreadSorted(
-    const BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, FLT *data_uniform,
-    UBIGINT N, FLT *kx, FLT *ky, FLT *kz, const FLT *data_nonuniform,
-    const finufft_spread_opts &opts, int did_sort);
+template<typename T>
+FINUFFT_EXPORT int FINUFFT_CDECL indexSort(std::vector<BIGINT> &sort_indices, UBIGINT N1,
+                                           UBIGINT N2, UBIGINT N3, UBIGINT N, T *kx,
+                                           T *ky, T *kz, const finufft_spread_opts &opts);
+template<typename T>
 FINUFFT_EXPORT int FINUFFT_CDECL spreadinterpSorted(
-    const BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3,
-    FLT *FINUFFT_RESTRICT data_uniform, UBIGINT N, FLT *FINUFFT_RESTRICT kx,
-    FLT *FINUFFT_RESTRICT ky, FLT *FINUFFT_RESTRICT kz,
-    FLT *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts, int did_sort);
-FINUFFT_EXPORT FLT FINUFFT_CDECL evaluate_kernel(FLT x, const finufft_spread_opts &opts);
-FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts, FLT eps,
+    const std::vector<BIGINT> &sort_indices, const UBIGINT N1, const UBIGINT N2,
+    const UBIGINT N3, T *data_uniform, const UBIGINT M, T *FINUFFT_RESTRICT kx,
+    T *FINUFFT_RESTRICT ky, T *FINUFFT_RESTRICT kz, T *FINUFFT_RESTRICT data_nonuniform,
+    const finufft_spread_opts &opts, int did_sort);
+template<typename T>
+FINUFFT_EXPORT T FINUFFT_CDECL evaluate_kernel(T x, const finufft_spread_opts &opts);
+template<typename T>
+FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts, T eps,
                                                 double upsampfac, int kerevalmeth, int debug, int showwarn, int dim);
diff --git a/include/finufft/test_defs.h b/include/finufft/test_defs.h
index 387bef20d..bdd4cf147 100644
--- a/include/finufft/test_defs.h
+++ b/include/finufft/test_defs.h
@@ -17,7 +17,6 @@
 // convenient private finufft internals (must come after finufft.h)
 #include
-#include
 // prec-switching (via SINGLE) to set up FLT, CPX, BIGINT, FINUFFT1D1, etc...
 #include
diff --git a/include/finufft/utils.h b/include/finufft/utils.h
index 9039fee96..040f60543 100644
--- a/include/finufft/utils.h
+++ b/include/finufft/utils.h
@@ -4,22 +4,117 @@
 #ifndef UTILS_H
 #define UTILS_H
-#include "finufft/defs.h"
+#include "finufft/finufft_core.h"
+// for CNTime...
+// using chrono since the interface is portable between linux and windows
+#include <chrono>
 namespace finufft {
 namespace utils {
 // ahb's low-level array helpers
-FINUFFT_EXPORT FLT FINUFFT_CDECL relerrtwonorm(BIGINT n, CPX *a, CPX *b);
-FINUFFT_EXPORT FLT FINUFFT_CDECL errtwonorm(BIGINT n, CPX *a, CPX *b);
-FINUFFT_EXPORT FLT FINUFFT_CDECL twonorm(BIGINT n, CPX *a);
-FINUFFT_EXPORT FLT FINUFFT_CDECL infnorm(BIGINT n, CPX *a);
-FINUFFT_EXPORT void FINUFFT_CDECL arrayrange(BIGINT n, FLT *a, FLT *lo, FLT *hi);
-FINUFFT_EXPORT void FINUFFT_CDECL indexedarrayrange(BIGINT n, BIGINT *i, FLT *a, FLT *lo,
-                                                    FLT *hi);
-FINUFFT_EXPORT void FINUFFT_CDECL arraywidcen(BIGINT n, FLT *a, FLT *w, FLT *c);
+template<typename T>
+FINUFFT_EXPORT T FINUFFT_CDECL relerrtwonorm(BIGINT n, std::complex<T> *a,
+                                             std::complex<T> *b)
+// ||a-b||_2 / ||a||_2
+{
+  T err = 0.0, nrm = 0.0;
+  for (BIGINT m = 0; m < n; ++m) {
+    nrm += real(conj(a[m]) * a[m]);
+    std::complex<T> diff = a[m] - b[m];
+    err += real(conj(diff) * diff);
+  }
+  return sqrt(err / nrm);
+}
+template<typename T>
+FINUFFT_EXPORT T FINUFFT_CDECL errtwonorm(BIGINT n, std::complex<T> *a,
+                                          std::complex<T> *b)
+// ||a-b||_2
+{
+  T err = 0.0; // compute error 2-norm
+  for (BIGINT m = 0; m < n; ++m) {
+    std::complex<T> diff = a[m] - b[m];
+    err += real(conj(diff) * diff);
+  }
+  return sqrt(err);
+}
+template<typename T>
+FINUFFT_EXPORT T FINUFFT_CDECL twonorm(BIGINT n, std::complex<T> *a)
+// ||a||_2
+{
+  T nrm = 0.0;
+  for (BIGINT m = 0; m < n; ++m) nrm += real(conj(a[m]) * a[m]);
+  return sqrt(nrm);
+}
+template<typename T>
+FINUFFT_EXPORT T FINUFFT_CDECL infnorm(BIGINT n, std::complex<T> *a)
+// ||a||_infty
+{
+  T nrm = 0.0;
+  for (BIGINT m = 0; m < n; ++m) {
+    T aa = real(conj(a[m]) * a[m]);
+    if (aa > nrm) nrm = aa;
+  }
+  return sqrt(nrm);
+}
+template<typename T>
+FINUFFT_EXPORT void FINUFFT_CDECL arrayrange(BIGINT n, T *a, T *lo, T *hi)
+// With a a length-n array, writes out min(a) to lo and max(a) to hi,
+// so that all a values lie in [lo,hi].
+// If n==0, lo and hi are not finite.
+{
+  *lo = INFINITY;
+  *hi = -INFINITY;
+  for (BIGINT m = 0; m < n; ++m) {
+    if (a[m] < *lo) *lo = a[m];
+    if (a[m] > *hi) *hi = a[m];
+  }
+}
+template<typename T>
+FINUFFT_EXPORT void FINUFFT_CDECL arraywidcen(BIGINT n, T *a, T *w, T *c)
+// Writes out w = half-width and c = center of an interval enclosing all a[n]'s
+// Only chooses a nonzero center if this increases w by less than fraction
+// ARRAYWIDCEN_GROWFRAC defined in finufft_core.h.
+// This prevents rephasings which don't grow nf by much. 6/8/17
+// If n==0, w and c are not finite.
+{
+  T lo, hi;
+  arrayrange(n, a, &lo, &hi);
+  *w = (hi - lo) / 2;
+  *c = (hi + lo) / 2;
+  if (std::abs(*c) < ARRAYWIDCEN_GROWFRAC * (*w)) {
+    *w += std::abs(*c);
+    *c = 0.0;
+  }
+}
+FINUFFT_EXPORT BIGINT FINUFFT_CDECL next235even(BIGINT n);
+
+// jfm's timer class
+class FINUFFT_EXPORT CNTime {
+public:
+  void start();
+  double restart();
+  double elapsedsec();
+
+private:
+  double initial;
+};
+
+// openmp helpers
+int get_num_threads_parallel_block();
+
+} // namespace utils
+} // namespace finufft
+
+// thread-safe rand number generator for Windows platform
+#ifdef _WIN32
+#include
+namespace finufft {
+namespace utils {
+FINUFFT_EXPORT int FINUFFT_CDECL rand_r(unsigned int *seedp);
 } // namespace utils
 } // namespace finufft
+#endif

 #endif // UTILS_H
diff --git a/include/finufft/utils_precindep.h b/include/finufft/utils_precindep.h
deleted file mode 100644
index 0504bb8df..000000000
--- a/include/finufft/utils_precindep.h
+++ /dev/null
@@ -1,44 +0,0 @@
-// Header for utils_precindep.cpp, a little library of array and timer stuff.
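Since the array helpers above are now header-only templates, both precisions share one definition and the caller picks the type at the call site. A usage sketch, assuming the FINUFFT include directory is on the compiler's path:

    #include <complex>
    #include <vector>
    #include <finufft/utils.h> // assumed include path

    int main() {
      std::vector<std::complex<double>> a(100, {1.0, 0.0}), b(100, {1.0, 1e-9});
      // relative 2-norm error between a and its tiny perturbation b
      double rel = finufft::utils::relerrtwonorm<double>(100, a.data(), b.data());
      return rel < 1e-6 ? 0 : 1;
    }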
-// Only the precision-independent routines here (get compiled once) - -#ifndef UTILS_PRECINDEP_H -#define UTILS_PRECINDEP_H - -#include "defs.h" -// for CNTime... -// using chrono since the interface is portable between linux and windows -#include - -namespace finufft { -namespace utils { - -FINUFFT_EXPORT BIGINT FINUFFT_CDECL next235even(BIGINT n); - -// jfm's timer class -class FINUFFT_EXPORT CNTime { -public: - void start(); - double restart(); - double elapsedsec(); - -private: - double initial; -}; - -// openmp helpers -int get_num_threads_parallel_block(); - -} // namespace utils -} // namespace finufft - -// thread-safe rand number generator for Windows platform -#ifdef _WIN32 -#include -namespace finufft { -namespace utils { -FINUFFT_EXPORT int FINUFFT_CDECL rand_r(unsigned int *seedp); -} // namespace utils -} // namespace finufft -#endif - -#endif // UTILS_PRECINDEP_H diff --git a/include/finufft_eitherprec.h b/include/finufft_eitherprec.h index 47f7860e1..3f0a7d95c 100644 --- a/include/finufft_eitherprec.h +++ b/include/finufft_eitherprec.h @@ -86,8 +86,8 @@ typedef struct FINUFFT_PLAN_S *FINUFFT_PLAN; FINUFFT_EXPORT void FINUFFT_CDECL FINUFFTIFY(_default_opts)(finufft_opts *o); FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_makeplan)( - int type, int dim, FINUFFT_BIGINT *n_modes, int iflag, int n_transf, FINUFFT_FLT tol, - FINUFFT_PLAN *plan, finufft_opts *o); + int type, int dim, const FINUFFT_BIGINT *n_modes, int iflag, int n_transf, + FINUFFT_FLT tol, FINUFFT_PLAN *plan, finufft_opts *o); FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_setpts)( FINUFFT_PLAN plan, FINUFFT_BIGINT M, FINUFFT_FLT *xj, FINUFFT_FLT *yj, FINUFFT_FLT *zj, FINUFFT_BIGINT N, FINUFFT_FLT *s, FINUFFT_FLT *t, FINUFFT_FLT *u); diff --git a/makefile b/makefile index 4a91506db..7ad454198 100644 --- a/makefile +++ b/makefile @@ -31,7 +31,7 @@ PYTHON = python3 # they allow gcc to vectorize the code more effectively CFLAGS := -O3 -funroll-loops -march=native -fcx-limited-range -ffp-contract=fast\ -fno-math-errno -fno-signed-zeros -fno-trapping-math -fassociative-math\ - -freciprocal-math -fmerge-all-constants -ftree-vectorize $(CFLAGS) + -freciprocal-math -fmerge-all-constants -ftree-vectorize $(CFLAGS) -Wfatal-errors FFLAGS := $(CFLAGS) $(FFLAGS) CXXFLAGS := $(CFLAGS) $(CXXFLAGS) # FFTW base name, and math linking... @@ -133,24 +133,13 @@ STATICLIB = lib-static/$(LIBNAME).a # absolute path to the .so, useful for linking so executables portable... ABSDYNLIB = $(FINUFFT)$(DYNLIB) -# spreader is subset of the library with self-contained testing, hence own objs: -# double-prec spreader object files that also need single precision... -SOBJS = src/spreadinterp.o src/utils.o -# their single-prec versions -SOBJSF = $(SOBJS:%.o=%_32.o) -# precision-dependent spreader object files (compiled & linked only once)... -SOBJS_PI = src/utils_precindep.o # spreader dual-precision objs -SOBJSD = $(SOBJS) $(SOBJSF) $(SOBJS_PI) - -# double-prec library object files that also need single precision... -OBJS = $(SOBJS) src/finufft.o src/simpleinterfaces.o fortran/finufftfort.o src/fft.o -# their single-prec versions -OBJSF = $(OBJS:%.o=%_32.o) -# precision-dependent library object files (compiled & linked only once)... -OBJS_PI = $(SOBJS_PI) contrib/legendre_rule_fast.o +SOBJSD = src/utils.o src/spreadinterp.o + +# precision-independent library object files (compiled & linked only once)... 
+OBJS_PI = $(SOBJSD) contrib/legendre_rule_fast.o src/fft.o src/finufft_core.o src/simpleinterfaces.o fortran/finufftfort.o # all lib dual-precision objs (note DUCC_OBJS empty if unused) -OBJSD = $(OBJS) $(OBJSF) $(OBJS_PI) $(DUCC_OBJS) +OBJSD = $(OBJS_PI) $(DUCC_OBJS) .PHONY: usage lib examples test perftest spreadtest spreadtestall fortran matlab octave all mex python clean objclean pyclean mexclean wheel docker-wheel gurutime docs setup setupclean @@ -190,12 +179,8 @@ HEADERS = $(wildcard include/*.h include/finufft/*.h) $(DUCC_HEADERS) # implicit rules for objects (note -o ensures writes to correct dir) %.o: %.cpp $(HEADERS) $(CXX) -c $(CXXFLAGS) $< -o $@ -%_32.o: %.cpp $(HEADERS) - $(CXX) -DSINGLE -c $(CXXFLAGS) $< -o $@ %.o: %.c $(HEADERS) $(CC) -c $(CFLAGS) $< -o $@ -%_32.o: %.c $(HEADERS) - $(CC) -DSINGLE -c $(CFLAGS) $< -o $@ %.o: %.f $(FC) -c $(FFLAGS) $< -o $@ %_32.o: %.f @@ -209,7 +194,6 @@ HEADERS = $(wildcard include/*.h include/finufft/*.h) $(DUCC_HEADERS) include/finufft/fft.h: $(DUCC_SETUP) SHEAD = $(wildcard src/*.h) $(XSIMD_DIR)/include/xsimd/xsimd.hpp src/spreadinterp.o: $(SHEAD) -src/spreadinterp_32.o: $(SHEAD) # lib ----------------------------------------------------------------------- @@ -277,10 +261,10 @@ test/%: test/%.cpp $(DYNLIB) test/%f: test/%.cpp $(DYNLIB) $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(ABSDYNLIB) $(LIBSFFT) -o $@ # low-level tests that are cleaner if depend on only specific objects... -test/testutils: test/testutils.cpp src/utils.o src/utils_precindep.o - $(CXX) $(CXXFLAGS) ${LDFLAGS} test/testutils.cpp src/utils.o src/utils_precindep.o $(LIBS) -o test/testutils -test/testutilsf: test/testutils.cpp src/utils_32.o src/utils_precindep.o - $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE test/testutils.cpp src/utils_32.o src/utils_precindep.o $(LIBS) -o test/testutilsf +test/testutils: test/testutils.cpp src/utils.o + $(CXX) $(CXXFLAGS) ${LDFLAGS} test/testutils.cpp src/utils.o $(LIBS) -o test/testutils +test/testutilsf: test/testutils.cpp src/utils.o + $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE test/testutils.cpp src/utils.o $(LIBS) -o test/testutilsf # make sure all double-prec test executables ready for testing TESTS := $(basename $(wildcard test/*.cpp)) @@ -325,14 +309,14 @@ ST=perftest/spreadtestnd STA=perftest/spreadtestndall STF=$(ST)f STAF=$(STA)f -$(ST): $(ST).cpp $(SOBJS) $(SOBJS_PI) - $(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(SOBJS) $(SOBJS_PI) $(LIBS) -o $@ -$(STF): $(ST).cpp $(SOBJSF) $(SOBJS_PI) - $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(SOBJSF) $(SOBJS_PI) $(LIBS) -o $@ -$(STA): $(STA).cpp $(SOBJS) $(SOBJS_PI) - $(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(SOBJS) $(SOBJS_PI) $(LIBS) -o $@ -$(STAF): $(STA).cpp $(SOBJSF) $(SOBJS_PI) - $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(SOBJSF) $(SOBJS_PI) $(LIBS) -o $@ +$(ST): $(ST).cpp $(SOBJSD) + $(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(SOBJSD) $(LIBS) -o $@ +$(STF): $(ST).cpp $(SOBJSD) + $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(SOBJSD) $(LIBS) -o $@ +$(STA): $(STA).cpp $(SOBJSD) + $(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(SOBJSD) $(LIBS) -o $@ +$(STAF): $(STA).cpp $(SOBJSD) + $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(SOBJSD) $(LIBS) -o $@ spreadtest: $(ST) $(STF) # run one thread per core... 
(escape the $ to get single $ in bash; one big cmd) (export OMP_NUM_THREADS=$$(perftest/mynumcores.sh) ;\ @@ -436,7 +420,7 @@ endif # python --------------------------------------------------------------------- python: $(STATICLIB) $(DYNLIB) - FINUFFT_DIR=$(FINUFFT) $(PYTHON) -m pip -v install python/finufft + FINUFFT_DIR=$(FINUFFT) $(PYTHON) -m pip -v install --break-system-packages python/finufft # note to devs: if trouble w/ NumPy, use: pip install ./python --no-deps $(PYTHON) python/finufft/test/run_accuracy_tests.py $(PYTHON) python/finufft/examples/simple1d1.py diff --git a/perftest/manysmallprobs.cpp b/perftest/manysmallprobs.cpp index 0f2c9d0bb..5e27289d8 100644 --- a/perftest/manysmallprobs.cpp +++ b/perftest/manysmallprobs.cpp @@ -1,8 +1,9 @@ // public header #include "finufft.h" +#include "finufft/defs.h" // private access to timer -#include "finufft/utils_precindep.h" +#include "finufft/utils.h" using namespace finufft::utils; #include diff --git a/perftest/spreadtestnd.cpp b/perftest/spreadtestnd.cpp index 9b560a25e..d30626007 100644 --- a/perftest/spreadtestnd.cpp +++ b/perftest/spreadtestnd.cpp @@ -1,7 +1,6 @@ #include #include #include -#include #include #include diff --git a/perftest/spreadtestndall.cpp b/perftest/spreadtestndall.cpp index 666003137..14aad3420 100644 --- a/perftest/spreadtestndall.cpp +++ b/perftest/spreadtestndall.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include diff --git a/src/fft.cpp b/src/fft.cpp index bb7e32442..68877cacd 100644 --- a/src/fft.cpp +++ b/src/fft.cpp @@ -7,7 +7,7 @@ using namespace std; #include "ducc0/fft/fftnd_impl.h" #endif -std::vector gridsize_for_fft(FINUFFT_PLAN p) { +template std::vector gridsize_for_fft(FINUFFT_PLAN_T *p) { // local helper func returns a new int array of length dim, extracted from // the finufft plan, that fftw_plan_many_dft needs as its 2nd argument. if (p->dim == 1) return {(int)p->nf1}; @@ -15,8 +15,10 @@ std::vector gridsize_for_fft(FINUFFT_PLAN p) { // if (p->dim == 3) return {(int)p->nf3, (int)p->nf2, (int)p->nf1}; } +template std::vector gridsize_for_fft(FINUFFT_PLAN_T *p); +template std::vector gridsize_for_fft(FINUFFT_PLAN_T *p); -void do_fft(FINUFFT_PLAN p) { +template void do_fft(FINUFFT_PLAN_T *p) { #ifdef FINUFFT_USE_DUCC0 size_t nthreads = min(MY_OMP_GET_MAX_THREADS(), p->opts.nthreads); const auto ns = gridsize_for_fft(p); @@ -32,9 +34,9 @@ void do_fft(FINUFFT_PLAN p) { arrdims.push_back(size_t(ns[2])); axes.push_back(3); } - ducc0::vfmav data(p->fwBatch, arrdims); + ducc0::vfmav> data(p->fwBatch, arrdims); #ifdef FINUFFT_NO_DUCC0_TWEAKS - ducc0::c2c(data, data, axes, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(data, data, axes, p->fftSign < 0, TF(1), nthreads); #else /* For type 1 NUFFTs, only the low-frequency parts of the output fine grid are going to be used, and for type 2 NUFFTs, the high frequency parts of the @@ -45,10 +47,10 @@ void do_fft(FINUFFT_PLAN p) { of all 1D FFTs, and for the last remaining axis the factor is 1/oversampling_factor^2. 
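To make the index bookkeeping in the branches below concrete: with ms wanted output modes on an nf-point fine grid, only the first (ms+1)/2 and last ms/2 rows along an axis carry wanted modes (negative frequencies wrap around to the top), which is exactly what y_lo and y_hi select. A standalone numeric sketch with illustrative sizes:

    #include <cstdio>
    int main() {
      long ms = 10, nf = 24;    // illustrative mode count and fine-grid size
      long y_lo = (ms + 1) / 2; // 5:  rows [0, y_lo) hold modes 0..4
      long y_hi = nf - ms / 2;  // 19: rows [y_hi, nf) hold modes -5..-1
      std::printf("FFT only rows [0,%ld) and [%ld,%ld); skip the middle %ld rows\n",
                  y_lo, y_hi, nf, y_hi - y_lo);
    }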
*/ if (p->dim == 1) // 1D: no chance for FFT shortcuts - ducc0::c2c(data, data, axes, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(data, data, axes, p->fftSign < 0, TF(1), nthreads); else if (p->dim == 2) { // 2D: do partial FFTs if (p->ms < 2) // something is weird, do standard FFT - ducc0::c2c(data, data, axes, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(data, data, axes, p->fftSign < 0, TF(1), nthreads); else { size_t y_lo = size_t((p->ms + 1) / 2); size_t y_hi = size_t(ns[1] - p->ms / 2); @@ -58,17 +60,17 @@ void do_fft(FINUFFT_PLAN p) { auto sub2 = ducc0::subarray(data, {{}, {}, {y_hi, ducc0::MAXIDX}}); if (p->type == 1) // spreading, not all parts of the output array are needed // do axis 2 in full - ducc0::c2c(data, data, {2}, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(data, data, {2}, p->fftSign < 0, TF(1), nthreads); // do only parts of axis 1 - ducc0::c2c(sub1, sub1, {1}, p->fftSign < 0, FLT(1), nthreads); - ducc0::c2c(sub2, sub2, {1}, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(sub1, sub1, {1}, p->fftSign < 0, TF(1), nthreads); + ducc0::c2c(sub2, sub2, {1}, p->fftSign < 0, TF(1), nthreads); if (p->type == 2) // interpolation, parts of the input array are zero // do axis 2 in full - ducc0::c2c(data, data, {2}, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(data, data, {2}, p->fftSign < 0, TF(1), nthreads); } } else { // 3D if ((p->ms < 2) || (p->mt < 2)) // something is weird, do standard FFT - ducc0::c2c(data, data, axes, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(data, data, axes, p->fftSign < 0, TF(1), nthreads); else { size_t z_lo = size_t((p->ms + 1) / 2); size_t z_hi = size_t(ns[2] - p->ms / 2); @@ -82,22 +84,22 @@ void do_fft(FINUFFT_PLAN p) { auto sub6 = ducc0::subarray(sub2, {{}, {}, {y_hi, ducc0::MAXIDX}, {}}); if (p->type == 1) { // spreading, not all parts of the output array are needed // do axis 3 in full - ducc0::c2c(data, data, {3}, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(data, data, {3}, p->fftSign < 0, TF(1), nthreads); // do only parts of axis 2 - ducc0::c2c(sub1, sub1, {2}, p->fftSign < 0, FLT(1), nthreads); - ducc0::c2c(sub2, sub2, {2}, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(sub1, sub1, {2}, p->fftSign < 0, TF(1), nthreads); + ducc0::c2c(sub2, sub2, {2}, p->fftSign < 0, TF(1), nthreads); } // do even smaller parts of axis 1 - ducc0::c2c(sub3, sub3, {1}, p->fftSign < 0, FLT(1), nthreads); - ducc0::c2c(sub4, sub4, {1}, p->fftSign < 0, FLT(1), nthreads); - ducc0::c2c(sub5, sub5, {1}, p->fftSign < 0, FLT(1), nthreads); - ducc0::c2c(sub6, sub6, {1}, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(sub3, sub3, {1}, p->fftSign < 0, TF(1), nthreads); + ducc0::c2c(sub4, sub4, {1}, p->fftSign < 0, TF(1), nthreads); + ducc0::c2c(sub5, sub5, {1}, p->fftSign < 0, TF(1), nthreads); + ducc0::c2c(sub6, sub6, {1}, p->fftSign < 0, TF(1), nthreads); if (p->type == 2) { // interpolation, parts of the input array are zero // do only parts of axis 2 - ducc0::c2c(sub1, sub1, {2}, p->fftSign < 0, FLT(1), nthreads); - ducc0::c2c(sub2, sub2, {2}, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(sub1, sub1, {2}, p->fftSign < 0, TF(1), nthreads); + ducc0::c2c(sub2, sub2, {2}, p->fftSign < 0, TF(1), nthreads); // do axis 3 in full - ducc0::c2c(data, data, {3}, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(data, data, {3}, p->fftSign < 0, TF(1), nthreads); } } } @@ -106,3 +108,5 @@ void do_fft(FINUFFT_PLAN p) { p->fftPlan->execute(); // if thisBatchSize(FINUFFT_PLAN_T *p); +template void do_fft(FINUFFT_PLAN_T *p); diff --git a/src/finufft.cpp 
b/src/finufft.cpp index 21e6db7ab..758fcb723 100644 --- a/src/finufft.cpp +++ b/src/finufft.cpp @@ -4,1199 +4,35 @@ // private headers for lib build // (must come after finufft.h which clobbers FINUFFT* macros) #include -#include -#include -#include -#include -#include "../contrib/legendre_rule_fast.h" -#include -#include -#include -#include -#include -#include -#include +void FINUFFT_DEFAULT_OPTS(finufft_opts *o) { finufft_default_opts_t(o); } -using namespace std; -using namespace finufft; -using namespace finufft::utils; -using namespace finufft::spreadinterp; -using namespace finufft::quadrature; - -/* Computational core for FINUFFT. - - Based on Barnett 2017-2018 finufft?d.cpp containing nine drivers, plus - 2d1/2d2 many-vector drivers by Melody Shih, summer 2018. - Original guru interface written by Andrea Malleo, summer 2019, mentored - by Alex Barnett. Many rewrites in early 2020 by Alex Barnett & Libin Lu. - - As of v1.2 these replace the old hand-coded separate 9 finufft?d?() functions - and the two finufft2d?many() functions. The (now 18) simple C++ interfaces - are in simpleinterfaces.cpp. - -Algorithm summaries taken from old finufft?d?() documentation, Feb-Jun 2017: - - TYPE 1: - The type 1 NUFFT proceeds in three main steps: - 1) spread data to oversampled regular mesh using kernel. - 2) compute FFT on uniform mesh - 3) deconvolve by division of each Fourier mode independently by the kernel - Fourier series coeffs (not merely FFT of kernel), shuffle to output. - The kernel coeffs are precomputed in what is called step 0 in the code. - - TYPE 2: - The type 2 algorithm proceeds in three main steps: - 1) deconvolve (amplify) each Fourier mode, dividing by kernel Fourier coeff - 2) compute inverse FFT on uniform fine grid - 3) spread (dir=2, ie interpolate) data to regular mesh - The kernel coeffs are precomputed in what is called step 0 in the code. - - TYPE 3: - The type 3 algorithm is basically a type 2 (which is implemented precisely - as call to type 2) replacing the middle FFT (Step 2) of a type 1. - Beyond this, the new twists are: - i) nf1, number of upsampled points for the type-1, depends on the product - of interval widths containing input and output points (X*S). - ii) The deconvolve (post-amplify) step is division by the Fourier transform - of the scaled kernel, evaluated on the *nonuniform* output frequency - grid; this is done by direct approximation of the Fourier integral - using quadrature of the kernel function times exponentials. - iii) Shifts in x (real) and s (Fourier) are done to minimize the interval - half-widths X and S, hence nf1. - - MULTIPLE STRENGTH VECTORS FOR THE SAME NONUNIFORM POINTS (n_transf>1): - maxBatchSize (set to max_num_omp_threads) times the RAM is needed, so - this is good only for small problems. - - -Design notes for guru interface implementation: - -* Since finufft_plan is C-compatible, we need to use malloc/free for its - allocatable arrays, keeping it quite low-level. We can't use std::vector - since that would only survive in the scope of each function. - -* Thread-safety: FINUFFT plans are passed as pointers, so it has no global - state apart from that associated with FFTW (and the did_fftw_init). 
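The wrapper pattern the slimmed-down finufft.cpp adopts, a C-compatible opaque handle cast onto a templated C++ core, can be sketched as below. This is a minimal illustration, not the library's code: plan_t, core_make, and the shim_* names are invented for the example.

#include <cstdio>

// Templated core, compiled once; explicit instantiation covers each precision.
template<typename TF> struct plan_t {
  int dim;
  TF tol;
};
template<typename TF> int core_make(TF tol, plan_t<TF> **pp) {
  *pp = new plan_t<TF>{1, tol};
  return 0;
}
template int core_make<float>(float, plan_t<float> **); // explicit instantiation

// C ABI: the public handle is opaque; wrappers only reinterpret_cast and forward.
typedef void *c_plan;
extern "C" int shim_makeplan(double tol, c_plan *pp) {
  return core_make<double>(tol, reinterpret_cast<plan_t<double> **>(pp));
}
extern "C" int shim_destroy(c_plan p) {
  if (!p) return 1;   // tolerate a never-made plan, as FINUFFT_DESTROY does
  delete reinterpret_cast<plan_t<double> *>(p);
  return 0;
}

int main() {
  c_plan p = nullptr;
  shim_makeplan(1e-9, &p);
  std::printf("destroy returned %d\n", shim_destroy(p)); // 0 on success
}

Explicit instantiation keeps the template definitions in one translation unit while both precisions remain callable through the C ABI, which is what lets the dual -DSINGLE compilation be dropped.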
-*/ - -// ---------- local math routines (were in common.cpp; no need now): -------- - -namespace finufft { -namespace common { - -static int set_nf_type12(BIGINT ms, finufft_opts opts, finufft_spread_opts spopts, - BIGINT *nf) -// Type 1 & 2 recipe for how to set 1d size of upsampled array, nf, given opts -// and requested number of Fourier modes ms. Returns 0 if success, else an -// error code if nf was unreasonably big (& tell the world). -{ - *nf = BIGINT(opts.upsampfac * double(ms)); // manner of rounding not crucial - if (*nf < 2 * spopts.nspread) *nf = 2 * spopts.nspread; // otherwise spread fails - if (*nf < MAX_NF) { - *nf = next235even(*nf); // expensive at huge nf - return 0; - } else { - fprintf(stderr, - "[%s] nf=%.3g exceeds MAX_NF of %.3g, so exit without attempting even a " - "malloc\n", - __func__, (double)*nf, (double)MAX_NF); - return FINUFFT_ERR_MAXNALLOC; - } -} - -int setup_spreader_for_nufft(finufft_spread_opts &spopts, FLT eps, finufft_opts opts, - int dim) -// Set up the spreader parameters given eps, and pass across various nufft -// options. Return status of setup_spreader. Uses pass-by-ref. Barnett 10/30/17 -{ - // this calls spreadinterp.cpp... - int ier = setup_spreader(spopts, eps, opts.upsampfac, opts.spread_kerevalmeth, - opts.spread_debug, opts.showwarn, dim); - // override various spread opts from their defaults... - spopts.debug = opts.spread_debug; - spopts.sort = opts.spread_sort; // could make dim or CPU choices here? - spopts.kerpad = opts.spread_kerpad; // (only applies to kerevalmeth=0) - spopts.chkbnds = opts.chkbnds; - spopts.nthreads = opts.nthreads; // 0 passed in becomes omp max by here - if (opts.spread_nthr_atomic >= 0) // overrides - spopts.atomic_threshold = opts.spread_nthr_atomic; - if (opts.spread_max_sp_size > 0) // overrides - spopts.max_subproblem_size = opts.spread_max_sp_size; - if (opts.chkbnds != 1) // deprecated default value hardcoded here - fprintf(stderr, - "[%s] opts.chkbnds is deprecated; ignoring change from default value.\n", - __func__); - return ier; -} - -void set_nhg_type3(FLT S, FLT X, finufft_opts opts, finufft_spread_opts spopts, - BIGINT *nf, FLT *h, FLT *gam) -/* sets nf, h (upsampled grid spacing), and gamma (x_j rescaling factor), - for type 3 only. - Inputs: - X and S are the xj and sk interval half-widths respectively. - opts and spopts are the NUFFT and spreader opts strucs, respectively. - Outputs: - nf is the size of upsampled grid for a given single dimension. - h is the grid spacing = 2pi/nf - gam is the x rescale factor, ie x'_j = x_j/gam (modulo shifts). - Barnett 2/13/17. Caught inf/nan 3/14/17. io int types changed 3/28/17 - New logic 6/12/17 -*/ -{ - int nss = spopts.nspread + 1; // since ns may be odd - FLT Xsafe = X, Ssafe = S; // may be tweaked locally - if (X == 0.0) // logic ensures XS>=1, handle X=0 a/o S=0 - if (S == 0.0) { - Xsafe = 1.0; - Ssafe = 1.0; - } else - Xsafe = max(Xsafe, 1 / S); - else - Ssafe = max(Ssafe, 1 / X); - // use the safe X and S... - auto nfd = FLT(2.0 * opts.upsampfac * Ssafe * Xsafe / PI + nss); - if (!isfinite(nfd)) nfd = 0.0; // use FLT to catch inf - *nf = (BIGINT)nfd; - // printf("initial nf=%lld, ns=%d\n",*nf,spopts.nspread); - // catch too small nf, and nan or +-inf, otherwise spread fails... 
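The size recipe in set_nf_type12 above, take max(upsampfac*ms, 2*nspread) and round up to the next even number with only 2, 3 and 5 as prime factors so the FFT stays fast, can be mimicked as below; next235even_toy is a from-scratch stand-in for illustration, not FINUFFT's next235even.

#include <cstdint>
#include <cstdio>
#include <initializer_list>

// true iff n has no prime factors other than 2, 3 and 5
static bool smooth235(std::int64_t n) {
  for (int p : {2, 3, 5})
    while (n % p == 0) n /= p;
  return n == 1;
}

// smallest even 5-smooth integer >= n (toy stand-in for next235even)
static std::int64_t next235even_toy(std::int64_t n) {
  if (n < 2) n = 2;
  if (n % 2) ++n;                  // keep it even
  while (!smooth235(n)) n += 2;
  return n;
}

int main() {
  double upsampfac = 2.0;          // example value
  std::int64_t ms = 1000, nspread = 7;
  std::int64_t nf = std::int64_t(upsampfac * double(ms));
  if (nf < 2 * nspread) nf = 2 * nspread;
  std::printf("nf = %lld\n", (long long)next235even_toy(nf)); // 2000 = 2^4*5^3
}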
- if (*nf < 2 * spopts.nspread) *nf = 2 * spopts.nspread; - if (*nf < MAX_NF) // otherwise will fail anyway - *nf = next235even(*nf); // expensive at huge nf - *h = FLT(2.0 * PI / *nf); // upsampled grid spacing - *gam = FLT(*nf / (2.0 * opts.upsampfac * Ssafe)); // x scale fac to x' -} - -void onedim_fseries_kernel(BIGINT nf, FLT *fwkerhalf, finufft_spread_opts opts) -/* - Approximates exact Fourier series coeffs of cnufftspread's real symmetric - kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting - narrowness of kernel. Uses phase winding for cheap eval on the regular freq - grid. Note that this is also the Fourier transform of the non-periodized - kernel. The FT definition is f(k) = int e^{-ikx} f(x) dx. The output has an - overall prefactor of 1/h, which is needed anyway for the correction, and - arises because the quadrature weights are scaled for grid units not x units. - The kernel is actually centered at nf/2, related to the centering of the grid; - this is now achieved by the sign flip in a[n] below. - - Inputs: - nf - size of 1d uniform spread grid, must be even. - opts - spreading opts object, needed to eval kernel (must be already set up) - - Outputs: - fwkerhalf - real Fourier series coeffs from indices 0 to nf/2 inclusive, - divided by h = 2pi/n. - (should be allocated for at least nf/2+1 FLTs) - - Compare onedim_dct_kernel which has same interface, but computes DFT of - sampled kernel, not quite the same object. - - Barnett 2/7/17. openmp (since slow vs fftw in 1D large-N case) 3/3/18. - Fixed num_threads 7/20/20. Reduced rounding error in a[n] calc 8/20/24. - */ -{ - FLT J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support - // # quadr nodes in z (from 0 to J/2; reflections will be added)... - int q = (int)(2 + 3.0 * J2); // not sure why so large? cannot exceed MAX_NQUAD - FLT f[MAX_NQUAD]; - double z[2 * MAX_NQUAD], w[2 * MAX_NQUAD]; - legendre_compute_glr(2 * q, z, w); // only half the nodes used, eg on (0,1) - CPX a[MAX_NQUAD]; - for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n - z[n] *= J2; // rescale nodes - f[n] = J2 * (FLT)w[n] * evaluate_kernel((FLT)z[n], opts); // vals & quadr wei - a[n] = -exp(2 * PI * IMA * (FLT)z[n] / (FLT)nf); // phase winding rates - } - BIGINT nout = nf / 2 + 1; // how many values we're writing to - int nt = min(nout, (BIGINT)opts.nthreads); // how many chunks - std::vector brk(nt + 1); // start indices for each thread - for (int t = 0; t <= nt; ++t) // split nout mode indices btw threads - brk[t] = (BIGINT)(0.5 + nout * t / (double)nt); -#pragma omp parallel num_threads(nt) - { // each thread gets own chunk to do - int t = MY_OMP_GET_THREAD_NUM(); - CPX aj[MAX_NQUAD]; // phase rotator for this thread - for (int n = 0; n < q; ++n) - aj[n] = pow(a[n], (FLT)brk[t]); // init phase factors for chunk - for (BIGINT j = brk[t]; j < brk[t + 1]; ++j) { // loop along output array - FLT x = 0.0; // accumulator for answer at this j - for (int n = 0; n < q; ++n) { - x += f[n] * 2 * real(aj[n]); // include the negative freq - aj[n] *= a[n]; // wind the phases - } - fwkerhalf[j] = x; - } - } -} - -void onedim_nuft_kernel(BIGINT nk, FLT *k, FLT *phihat, finufft_spread_opts opts) -/* - Approximates exact 1D Fourier transform of cnufftspread's real symmetric - kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting - narrowness of kernel. Evaluates at set of arbitrary freqs k in [-pi, pi), - for a kernel with x measured in grid-spacings. (See previous routine for - FT definition). 
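The quadrature that onedim_nuft_kernel describes can be tried end to end with stand-ins: the sketch below uses a Gaussian in place of the ES kernel and midpoint nodes in place of Gauss-Legendre, both purely illustrative, to show the phihat(k) ~ sum_n 2 f_n cos(k z_n) structure.

#include <cmath>
#include <cstdio>
#include <initializer_list>
#include <vector>

// Approximate the FT of a real even kernel supported on [-J/2, J/2] by
//   phihat(k) ~= sum_n 2 * f_n * cos(k * z_n),  f_n = w_n * kernel(z_n),
// with nodes z_n on (0, J/2] and symmetry supplying the negative half.
int main() {
  double J2 = 3.5;                        // half-width of kernel support
  int q = int(2 + 2.0 * J2);              // node-count heuristic from the code
  std::vector<double> z(q), f(q);
  for (int n = 0; n < q; ++n) {
    z[n] = J2 * (n + 0.5) / q;            // midpoint nodes on (0, J/2)
    f[n] = (J2 / q) * std::exp(-z[n] * z[n]);     // weight * kernel value
  }
  for (double k : {0.0, 1.0, 2.0}) {      // a few target freqs in [-pi, pi)
    double x = 0.0;
    for (int n = 0; n < q; ++n) x += f[n] * 2 * std::cos(k * z[n]);
    std::printf("phihat(%.0f) ~= %.4f\n", k, x);  // ~ sqrt(pi)*exp(-k*k/4)
  }
}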
- - Inputs: - nk - number of freqs - k - frequencies, dual to the kernel's natural argument, ie exp(i.k.z) - Note, z is in grid-point units, and k values must be in [-pi, pi) for - accuracy. - opts - spreading opts object, needed to eval kernel (must be already set up) - - Outputs: - phihat - real Fourier transform evaluated at freqs (alloc for nk FLTs) - - Barnett 2/8/17. openmp since cos slow 2/9/17 - */ -{ - FLT J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support - // # quadr nodes in z (from 0 to J/2; reflections will be added)... - int q = (int)(2 + 2.0 * J2); // > pi/2 ratio. cannot exceed MAX_NQUAD - if (opts.debug) printf("q (# ker FT quadr pts) = %d\n", q); - FLT f[MAX_NQUAD]; - double z[2 * MAX_NQUAD], w[2 * MAX_NQUAD]; // glr needs double - legendre_compute_glr(2 * q, z, w); // only half the nodes used, eg on (0,1) - for (int n = 0; n < q; ++n) { - z[n] *= (FLT)J2; // quadr nodes for [0,J/2] - f[n] = J2 * (FLT)w[n] * evaluate_kernel((FLT)z[n], opts); // w/ quadr weights - } -#pragma omp parallel for num_threads(opts.nthreads) - for (BIGINT j = 0; j < nk; ++j) { // loop along output array - FLT x = 0.0; // register - for (int n = 0; n < q; ++n) - x += f[n] * 2 * cos(k[j] * (FLT)z[n]); // pos & neg freq pair. use FLT cos! - phihat[j] = x; - } -} - -void deconvolveshuffle1d(int dir, FLT prefac, FLT *ker, BIGINT ms, FLT *fk, BIGINT nf1, - CPX *fw, int modeord) -/* - if dir==1: copies fw to fk with amplification by prefac/ker - if dir==2: copies fk to fw (and zero pads rest of it), same amplification. - - modeord=0: use CMCL-compatible mode ordering in fk (from -N/2 up to N/2-1) - 1: use FFT-style (from 0 to N/2-1, then -N/2 up to -1). - - fk is a size-ms FLT complex array (2*ms FLTs alternating re,im parts) - fw is a size-nf1 complex array (2*nf1 FLTs alternating re,im parts) - ker is real-valued FLT array of length nf1/2+1. - - Single thread only, but shouldn't matter since mostly data movement. - - It has been tested that the repeated floating division in this inner loop - only contributes at the <3% level in 3D relative to the FFT cost (8 threads). - This could be removed by passing in an inverse kernel and doing mults. - - todo: rewrite w/ C++-complex I/O, check complex divide not slower than - real divide, or is there a way to force a real divide? - - Barnett 1/25/17. Fixed ms=0 case 3/14/17. modeord flag & clean 10/25/17 -*/ -{ - BIGINT kmin = -ms / 2, kmax = (ms - 1) / 2; // inclusive range of k indices - if (ms == 0) kmax = -1; // fixes zero-pad for trivial no-mode case - // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array - BIGINT pp = -2 * kmin, pn = 0; // CMCL mode-ordering case (2* since cmplx) - if (modeord == 1) { - pp = 0; - pn = 2 * (kmax + 1); - } // or, instead, FFT ordering - if (dir == 1) { // read fw, write out to fk... - for (BIGINT k = 0; k <= kmax; ++k) { // non-neg freqs k - fk[pp++] = prefac * fw[k].real() / ker[k]; // re - fk[pp++] = prefac * fw[k].imag() / ker[k]; // im - } - for (BIGINT k = kmin; k < 0; ++k) { // neg freqs k - fk[pn++] = prefac * fw[nf1 + k].real() / ker[-k]; // re - fk[pn++] = prefac * fw[nf1 + k].imag() / ker[-k]; // im - } - } else { // read fk, write out to fw w/ zero padding... 
- for (BIGINT k = kmax + 1; k < nf1 + kmin; ++k) { // zero pad precisely where - // needed - fw[k] = 0.0; - } - for (BIGINT k = 0; k <= kmax; ++k) { // non-neg freqs k - fw[k].real(prefac * fk[pp++] / ker[k]); // re - fw[k].imag(prefac * fk[pp++] / ker[k]); // im - } - for (BIGINT k = kmin; k < 0; ++k) { // neg freqs k - fw[nf1 + k].real(prefac * fk[pn++] / ker[-k]); // re - fw[nf1 + k].imag(prefac * fk[pn++] / ker[-k]); // im - } - } -} - -void deconvolveshuffle2d(int dir, FLT prefac, FLT *ker1, FLT *ker2, BIGINT ms, BIGINT mt, - FLT *fk, BIGINT nf1, BIGINT nf2, CPX *fw, int modeord) -/* - 2D version of deconvolveshuffle1d, calls it on each x-line using 1/ker2 fac. - - if dir==1: copies fw to fk with amplification by prefac/(ker1(k1)*ker2(k2)). - if dir==2: copies fk to fw (and zero pads rest of it), same amplification. - - modeord=0: use CMCL-compatible mode ordering in fk (each dim increasing) - 1: use FFT-style (pos then negative, on each dim) - - fk is a complex array stored as 2*ms*mt FLTs alternating re,im parts, with - ms looped over fast and mt slow. - fw is a complex array stored as 2*nf1*nf2] FLTs alternating re,im parts, with - nf1 looped over fast and nf2 slow. - ker1, ker2 are real-valued FLT arrays of lengths nf1/2+1, nf2/2+1 - respectively. - - Barnett 2/1/17, Fixed mt=0 case 3/14/17. modeord 10/25/17 -*/ -{ - BIGINT k2min = -mt / 2, k2max = (mt - 1) / 2; // inclusive range of k2 indices - if (mt == 0) k2max = -1; // fixes zero-pad for trivial no-mode case - // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array - BIGINT pp = -2 * k2min * ms, pn = 0; // CMCL mode-ordering case (2* since cmplx) - if (modeord == 1) { - pp = 0; - pn = 2 * (k2max + 1) * ms; - } // or, instead, FFT ordering - if (dir == 2) // zero pad needed x-lines (contiguous in memory) - for (BIGINT j = nf1 * (k2max + 1); j < nf1 * (nf2 + k2min); ++j) // sweeps all - // dims - fw[j] = 0.0; - for (BIGINT k2 = 0; k2 <= k2max; ++k2, pp += 2 * ms) // non-neg y-freqs - // point fk and fw to the start of this y value's row (2* is for complex): - common::deconvolveshuffle1d(dir, prefac / ker2[k2], ker1, ms, fk + pp, nf1, - &fw[nf1 * k2], modeord); - for (BIGINT k2 = k2min; k2 < 0; ++k2, pn += 2 * ms) // neg y-freqs - common::deconvolveshuffle1d(dir, prefac / ker2[-k2], ker1, ms, fk + pn, nf1, - &fw[nf1 * (nf2 + k2)], modeord); -} - -void deconvolveshuffle3d(int dir, FLT prefac, FLT *ker1, FLT *ker2, FLT *ker3, BIGINT ms, - BIGINT mt, BIGINT mu, FLT *fk, BIGINT nf1, BIGINT nf2, - BIGINT nf3, CPX *fw, int modeord) -/* - 3D version of deconvolveshuffle2d, calls it on each xy-plane using 1/ker3 fac. - - if dir==1: copies fw to fk with ampl by prefac/(ker1(k1)*ker2(k2)*ker3(k3)). - if dir==2: copies fk to fw (and zero pads rest of it), same amplification. - - modeord=0: use CMCL-compatible mode ordering in fk (each dim increasing) - 1: use FFT-style (pos then negative, on each dim) - - fk is a complex array stored as 2*ms*mt*mu FLTs alternating re,im parts, with - ms looped over fastest and mu slowest. - fw is a complex array stored as 2*nf1*nf2*nf3 FLTs alternating re,im parts, with - nf1 looped over fastest and nf3 slowest. - ker1, ker2, ker3 are real-valued FLT arrays of lengths nf1/2+1, nf2/2+1, - and nf3/2+1 respectively. - - Barnett 2/1/17, Fixed mu=0 case 3/14/17. 
modeord 10/25/17 -*/ -{ - BIGINT k3min = -mu / 2, k3max = (mu - 1) / 2; // inclusive range of k3 indices - if (mu == 0) k3max = -1; // fixes zero-pad for trivial no-mode case - // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array - BIGINT pp = -2 * k3min * ms * mt, pn = 0; // CMCL mode-ordering (2* since cmplx) - if (modeord == 1) { - pp = 0; - pn = 2 * (k3max + 1) * ms * mt; - } // or FFT ordering - BIGINT np = nf1 * nf2; // # pts in an upsampled Fourier xy-plane - if (dir == 2) // zero pad needed xy-planes (contiguous in memory) - for (BIGINT j = np * (k3max + 1); j < np * (nf3 + k3min); ++j) // sweeps all dims - fw[j] = 0.0; - for (BIGINT k3 = 0; k3 <= k3max; ++k3, pp += 2 * ms * mt) // non-neg z-freqs - // point fk and fw to the start of this z value's plane (2* is for complex): - common::deconvolveshuffle2d(dir, prefac / ker3[k3], ker1, ker2, ms, mt, fk + pp, nf1, - nf2, &fw[np * k3], modeord); - for (BIGINT k3 = k3min; k3 < 0; ++k3, pn += 2 * ms * mt) // neg z-freqs - common::deconvolveshuffle2d(dir, prefac / ker3[-k3], ker1, ker2, ms, mt, fk + pn, nf1, - nf2, &fw[np * (nf3 + k3)], modeord); -} - -// --------- batch helper functions for t1,2 exec: --------------------------- - -int spreadinterpSortedBatch(int batchSize, FINUFFT_PLAN p, CPX *cBatch) -/* - Spreads (or interpolates) a batch of batchSize strength vectors in cBatch - to (or from) the batch of fine working grids p->fwBatch, using the same set of - (index-sorted) NU points p->X,Y,Z for each vector in the batch. - The direction (spread vs interpolate) is set by p->spopts.spread_direction. - Returns 0 (no error reporting for now). - Notes: - 1) cBatch is already assumed to have the correct offset, ie here we - read from the start of cBatch (unlike Malleo). fwBatch also has zero offset - 2) this routine is a batched version of spreadinterpSorted in spreadinterp.cpp - Barnett 5/19/20, based on Malleo 2019. -*/ -{ - // opts.spread_thread: 1 sequential multithread, 2 parallel single-thread. - // omp_sets_nested deprecated, so don't use; assume not nested for 2 to work. - // But when nthr_outer=1 here, omp par inside the loop sees all threads... -#ifdef _OPENMP - int nthr_outer = p->opts.spread_thread == 1 ? 1 : batchSize; -#endif -#pragma omp parallel for num_threads(nthr_outer) - for (int i = 0; i < batchSize; i++) { - CPX *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace - CPX *ci = cBatch + i * p->nj; // start of i'th c array in cBatch - spreadinterpSorted(p->sortIndices, p->nf1, p->nf2, p->nf3, (FLT *)fwi, p->nj, p->X, - p->Y, p->Z, (FLT *)ci, p->spopts, p->didSort); - } - return 0; -} - -int deconvolveBatch(int batchSize, FINUFFT_PLAN p, CPX *fkBatch) -/* - Type 1: deconvolves (amplifies) from each interior fw array in p->fwBatch - into each output array fk in fkBatch. - Type 2: deconvolves from user-supplied input fk to 0-padded interior fw, - again looping over fk in fkBatch and fw in p->fwBatch. - The direction (spread vs interpolate) is set by p->spopts.spread_direction. - This is mostly a loop calling deconvolveshuffle?d for the needed dim batchSize - times. - Barnett 5/21/20, simplified from Malleo 2019 (eg t3 logic won't be in here) -*/ -{ - // since deconvolveshuffle?d are single-thread, omp par seems to help here... 
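The stride arithmetic both batch helpers rely on, vector i of a batch occupying a fixed-offset slice of one contiguous buffer, looks like this in isolation (sizes and the trivial inner loop are illustrative only):

#include <complex>
#include <cstdio>
#include <vector>

// Each of the batchSize strength vectors shares the same sorted NU points but
// gets its own slice of the contiguous workspaces (stride nf for fine grids,
// nj for strengths). spread_thread==1 keeps the outer loop sequential (threads
// go inside the spreader); otherwise one thread per vector.
int main() {
  using C = std::complex<float>;
  int batchSize = 3, spread_thread = 2;
  long nf = 8, nj = 5;
  std::vector<C> fwBatch(batchSize * nf), cBatch(batchSize * nj, C(1, 0));
  int nthr_outer = (spread_thread == 1) ? 1 : batchSize;
#pragma omp parallel for num_threads(nthr_outer)
  for (int i = 0; i < batchSize; i++) {
    C *fwi = fwBatch.data() + i * nf;   // i'th fine-grid workspace
    C *ci  = cBatch.data() + i * nj;    // i'th strength vector
    for (long j = 0; j < nj; ++j) fwi[j % nf] += ci[j]; // stand-in for spread
  }
  std::printf("fw[0] of batch 0: %g\n", fwBatch[0].real());
}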
-#pragma omp parallel for num_threads(batchSize) - for (int i = 0; i < batchSize; i++) { - CPX *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace - CPX *fki = fkBatch + i * p->N; // start of i'th fk array in fkBatch - - // Call routine from common.cpp for the dim; prefactors hardcoded to 1.0... - if (p->dim == 1) - deconvolveshuffle1d(p->spopts.spread_direction, 1.0, p->phiHat1, p->ms, (FLT *)fki, - p->nf1, fwi, p->opts.modeord); - else if (p->dim == 2) - deconvolveshuffle2d(p->spopts.spread_direction, 1.0, p->phiHat1, p->phiHat2, p->ms, - p->mt, (FLT *)fki, p->nf1, p->nf2, fwi, p->opts.modeord); - else - deconvolveshuffle3d(p->spopts.spread_direction, 1.0, p->phiHat1, p->phiHat2, - p->phiHat3, p->ms, p->mt, p->mu, (FLT *)fki, p->nf1, p->nf2, - p->nf3, fwi, p->opts.modeord); - } - return 0; +int FINUFFT_MAKEPLAN(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans, + FLT tol, FINUFFT_PLAN *pp, finufft_opts *opts) { + return finufft_makeplan_t(type, dim, n_modes, iflag, ntrans, tol, + reinterpret_cast **>(pp), opts); } -} // namespace common -} // namespace finufft - -// --------------- rest is the 5 user guru (plan) interface drivers: --------- -// (not namespaced since have safe names finufft{f}_* ) -using namespace finufft::common; // accesses routines defined above - -// Marco Barbone: 5.8.2024 -// These are user-facing. -// The various options could be macros to follow c standard library conventions. -// Question: would these be enums? - -// OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO -void FINUFFT_DEFAULT_OPTS(finufft_opts *o) -// Sets default nufft opts (referenced by all language interfaces too). -// See finufft_opts.h for meanings. -// This was created to avoid uncertainty about C++11 style static initialization -// when called from MEX, but now is generally used. Barnett 10/30/17 onwards. -// Sphinx sucks the below code block into the web docs, hence keep it clean... -{ - // sphinx tag (don't remove): @defopts_start - o->modeord = 0; - o->chkbnds = 1; - - o->debug = 0; - o->spread_debug = 0; - o->showwarn = 1; - - o->nthreads = 0; -#ifdef FINUFFT_USE_DUCC0 - o->fftw = 0; -#else - o->fftw = FFTW_ESTIMATE; -#endif - o->spread_sort = 2; - o->spread_kerevalmeth = 1; - o->spread_kerpad = 1; - o->upsampfac = 0.0; - o->spread_thread = 0; - o->maxbatchsize = 0; - o->spread_nthr_atomic = -1; - o->spread_max_sp_size = 0; - o->fftw_lock_fun = nullptr; - o->fftw_unlock_fun = nullptr; - o->fftw_lock_data = nullptr; - // sphinx tag (don't remove): @defopts_end -} - -// PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP -int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans, FLT tol, - FINUFFT_PLAN *pp, finufft_opts *opts) -// Populates the fields of finufft_plan which is pointed to by "pp". -// opts is ptr to a finufft_opts to set options, or NULL to use defaults. -// For some of the fields (if "auto" selected) here choose the actual setting. 
-// For types 1,2 allocates memory for internal working arrays, -// evaluates spreading kernel coefficients, and instantiates the fftw_plan -{ - FINUFFT_PLAN p; - p = new FINUFFT_PLAN_S; // allocate fresh plan struct - *pp = p; // pass out plan as ptr to plan struct - - if (opts == NULL) // use default opts - FINUFFT_DEFAULT_OPTS(&(p->opts)); - else // or read from what's passed in - p->opts = *opts; // keep a deep copy; changing *opts now has no effect - - if (p->opts.debug) // do a hello world - printf("[%s] new plan: FINUFFT version " FINUFFT_VER " .................\n", - __func__); - - p->fftPlan = std::make_unique>( - p->opts.fftw_lock_fun, p->opts.fftw_unlock_fun, p->opts.fftw_lock_data); - - if ((type != 1) && (type != 2) && (type != 3)) { - fprintf(stderr, "[%s] Invalid type (%d), should be 1, 2 or 3.\n", __func__, type); - return FINUFFT_ERR_TYPE_NOTVALID; - } - if ((dim != 1) && (dim != 2) && (dim != 3)) { - fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n", __func__, dim); - return FINUFFT_ERR_DIM_NOTVALID; - } - if (ntrans < 1) { - fprintf(stderr, "[%s] ntrans (%d) should be at least 1.\n", __func__, ntrans); - return FINUFFT_ERR_NTRANS_NOTVALID; - } - if (!p->opts.fftw_lock_fun != !p->opts.fftw_unlock_fun) { - fprintf(stderr, "[%s] fftw_(un)lock functions should be both null or both set\n", - __func__); - return FINUFFT_ERR_LOCK_FUNS_INVALID; - ; - } - - // get stuff from args... - p->type = type; - p->dim = dim; - p->ntrans = ntrans; - p->tol = tol; - p->fftSign = (iflag >= 0) ? 1 : -1; // clean up flag input - - // choose overall # threads... -#ifdef _OPENMP - int ompmaxnthr = MY_OMP_GET_MAX_THREADS(); - int nthr = ompmaxnthr; // default: use as many as OMP gives us - // (the above could be set, or suggested set, to 1 for small enough problems...) - if (p->opts.nthreads > 0) { - nthr = p->opts.nthreads; // user override, now without limit - if (p->opts.showwarn && (nthr > ompmaxnthr)) - fprintf(stderr, - "%s warning: using opts.nthreads=%d, more than the %d OpenMP claims " - "available; note large nthreads can be slower.\n", - __func__, nthr, ompmaxnthr); - } -#else - int nthr = 1; // always 1 thread (avoid segfault) - if (p->opts.nthreads > 1) - fprintf(stderr, - "%s warning: opts.nthreads=%d but library is single-threaded; ignoring!\n", - __func__, p->opts.nthreads); -#endif - p->opts.nthreads = nthr; // store actual # thr planned for - // (this sets/limits all downstream spread/interp, 1dkernel, and FFT thread counts...) - - // choose batchSize for types 1,2 or 3... (uses int ceil(b/a)=1+(b-1)/a trick) - if (p->opts.maxbatchsize == 0) { // logic to auto-set best batchsize - p->nbatch = 1 + (ntrans - 1) / nthr; // min # batches poss - p->batchSize = 1 + (ntrans - 1) / p->nbatch; // then cut # thr in each b - } else { // batchSize override by user - p->batchSize = min(p->opts.maxbatchsize, ntrans); - p->nbatch = 1 + (ntrans - 1) / p->batchSize; // resulting # batches - } - if (p->opts.spread_thread == 0) p->opts.spread_thread = 2; // our auto choice - if (p->opts.spread_thread != 1 && p->opts.spread_thread != 2) { - fprintf(stderr, "[%s] illegal opts.spread_thread!\n", __func__); - return FINUFFT_ERR_SPREAD_THREAD_NOTVALID; - } - - if (type != 3) { // read in user Fourier mode array sizes... - p->ms = n_modes[0]; - p->mt = (dim > 1) ? n_modes[1] : 1; // leave as 1 for unused dims - p->mu = (dim > 2) ? n_modes[2] : 1; - p->N = p->ms * p->mt * p->mu; // N = total # modes - } - - // heuristic to choose default upsampfac... 
(currently two poss) - if (p->opts.upsampfac == 0.0) { // indicates auto-choose - p->opts.upsampfac = 2.0; // default, and need for tol small - if (tol >= (FLT)1E-9) { // the tol sigma=5/4 can reach - if (type == 3) // could move to setpts, more known? - p->opts.upsampfac = 1.25; // faster b/c smaller RAM & FFT - else if ((dim == 1 && p->N > 10000000) || (dim == 2 && p->N > 300000) || - (dim == 3 && p->N > 3000000)) // type 1,2 heuristic cutoffs, double, - // typ tol, 12-core xeon - p->opts.upsampfac = 1.25; - } - if (p->opts.debug > 1) - printf("[%s] set auto upsampfac=%.2f\n", __func__, p->opts.upsampfac); - } - // use opts to choose and write into plan's spread options... - int ier = setup_spreader_for_nufft(p->spopts, tol, p->opts, dim); - if (ier > 1) // proceed if success or warning - return ier; - - // set others as defaults (or unallocated for arrays)... - p->X = NULL; - p->Y = NULL; - p->Z = NULL; - p->phiHat1 = NULL; - p->phiHat2 = NULL; - p->phiHat3 = NULL; - p->nf1 = 1; - p->nf2 = 1; - p->nf3 = 1; // crucial to leave as 1 for unused dims - p->sortIndices = NULL; // used in all three types - - // ------------------------ types 1,2: planning needed --------------------- - if (type == 1 || type == 2) { - - int nthr_fft = nthr; // give FFTW all threads (or use o.spread_thread?) - // Note: batchSize not used since might be only 1. - - p->spopts.spread_direction = type; - - if (p->opts.showwarn) { // user warn round-off error... - if (EPSILON * p->ms > 1.0) - fprintf(stderr, "%s warning: rounding err predicted eps_mach*N1 = %.3g > 1 !\n", - __func__, (double)(EPSILON * p->ms)); - if (EPSILON * p->mt > 1.0) - fprintf(stderr, "%s warning: rounding err predicted eps_mach*N2 = %.3g > 1 !\n", - __func__, (double)(EPSILON * p->mt)); - if (EPSILON * p->mu > 1.0) - fprintf(stderr, "%s warning: rounding err predicted eps_mach*N3 = %.3g > 1 !\n", - __func__, (double)(EPSILON * p->mu)); - } - - // determine fine grid sizes, sanity check.. - int nfier = set_nf_type12(p->ms, p->opts, p->spopts, &(p->nf1)); - if (nfier) return nfier; // nf too big; we're done - p->phiHat1 = (FLT *)malloc(sizeof(FLT) * (p->nf1 / 2 + 1)); - if (dim > 1) { - nfier = set_nf_type12(p->mt, p->opts, p->spopts, &(p->nf2)); - if (nfier) return nfier; - p->phiHat2 = (FLT *)malloc(sizeof(FLT) * (p->nf2 / 2 + 1)); - } - if (dim > 2) { - nfier = set_nf_type12(p->mu, p->opts, p->spopts, &(p->nf3)); - if (nfier) return nfier; - p->phiHat3 = (FLT *)malloc(sizeof(FLT) * (p->nf3 / 2 + 1)); - } - - if (p->opts.debug) { // "long long" here is to avoid warnings with printf... 
- printf("[%s] %dd%d: (ms,mt,mu)=(%lld,%lld,%lld) " - "(nf1,nf2,nf3)=(%lld,%lld,%lld)\n ntrans=%d nthr=%d " - "batchSize=%d ", - __func__, dim, type, (long long)p->ms, (long long)p->mt, (long long)p->mu, - (long long)p->nf1, (long long)p->nf2, (long long)p->nf3, ntrans, nthr, - p->batchSize); - if (p->batchSize == 1) // spread_thread has no effect in this case - printf("\n"); - else - printf(" spread_thread=%d\n", p->opts.spread_thread); - } - - // STEP 0: get Fourier coeffs of spreading kernel along each fine grid dim - CNTime timer; - timer.start(); - onedim_fseries_kernel(p->nf1, p->phiHat1, p->spopts); - if (dim > 1) onedim_fseries_kernel(p->nf2, p->phiHat2, p->spopts); - if (dim > 2) onedim_fseries_kernel(p->nf3, p->phiHat3, p->spopts); - if (p->opts.debug) - printf("[%s] kernel fser (ns=%d):\t\t%.3g s\n", __func__, p->spopts.nspread, - timer.elapsedsec()); - - p->nf = p->nf1 * p->nf2 * p->nf3; // fine grid total number of points - if (p->nf * p->batchSize > MAX_NF) { - fprintf(stderr, - "[%s] fwBatch would be bigger than MAX_NF, not attempting malloc!\n", - __func__); - // FIXME: this error causes memory leaks. We should free phiHat1, phiHat2, phiHat3 - return FINUFFT_ERR_MAXNALLOC; - } - - timer.restart(); - p->fwBatch = p->fftPlan->alloc_complex(p->nf * p->batchSize); // the big workspace - if (p->opts.debug) - printf("[%s] fwBatch %.2fGB alloc: \t%.3g s\n", __func__, - (double)1E-09 * sizeof(CPX) * p->nf * p->batchSize, timer.elapsedsec()); - if (!p->fwBatch) { // we don't catch all such mallocs, just this big one - fprintf(stderr, "[%s] FFTW malloc failed for fwBatch (working fine grids)!\n", - __func__); - free(p->phiHat1); - free(p->phiHat2); - free(p->phiHat3); - return FINUFFT_ERR_ALLOC; - } - - timer.restart(); // plan the FFTW - const auto ns = gridsize_for_fft(p); - p->fftPlan->plan(ns, p->batchSize, p->fwBatch, p->fftSign, p->opts.fftw, nthr_fft); - if (p->opts.debug) - printf("[%s] FFT plan (mode %d, nthr=%d):\t%.3g s\n", __func__, p->opts.fftw, - nthr_fft, timer.elapsedsec()); - - } else { // -------------------------- type 3 (no planning) ------------ - - if (p->opts.debug) printf("[%s] %dd%d: ntrans=%d\n", __func__, dim, type, ntrans); - // in case destroy occurs before setpts, need safe dummy ptrs/plans... - p->CpBatch = NULL; - p->fwBatch = NULL; - p->Sp = NULL; - p->Tp = NULL; - p->Up = NULL; - p->prephase = NULL; - p->deconv = NULL; - p->innerT2plan = NULL; - // Type 3 will call finufft_makeplan for type 2; no need to init FFTW - // Note we don't even know nj or nk yet, so can't do anything else! - } - return ier; // report setup_spreader status (could be warning) -} - -// SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, BIGINT nk, - FLT *s, FLT *t, FLT *u) -/* For type 1,2: just checks and (possibly) sorts the NU xyz points, in prep for - spreading. (The last 4 arguments are ignored.) - For type 3: allocates internal working arrays, scales/centers the NU points - and NU target freqs (stu), evaluates spreading kernel FT at all target freqs. 
-*/ -{ - int d = p->dim; // abbrev for spatial dim - CNTime timer; - timer.start(); - p->nj = nj; // the user only now chooses how many NU (x,y,z) pts - if (nj < 0) { - fprintf(stderr, "[%s] nj (%lld) cannot be negative!\n", __func__, (long long)nj); - return FINUFFT_ERR_NUM_NU_PTS_INVALID; - } else if (nj > MAX_NU_PTS) { - fprintf(stderr, "[%s] nj (%lld) exceeds MAX_NU_PTS\n", __func__, (long long)nj); - return FINUFFT_ERR_NUM_NU_PTS_INVALID; - } - - if (p->type != 3) { // ------------------ TYPE 1,2 SETPTS ------------------- - // (all we can do is check and maybe bin-sort the NU pts) - p->X = xj; // plan must keep pointers to user's fixed NU pts - p->Y = yj; - p->Z = zj; - int ier = spreadcheck(p->nf1, p->nf2, p->nf3, p->nj, xj, yj, zj, p->spopts); - if (p->opts.debug > 1) - printf("[%s] spreadcheck (%d):\t%.3g s\n", __func__, p->spopts.chkbnds, - timer.elapsedsec()); - if (ier) // no warnings allowed here - return ier; - timer.restart(); - // Free sortIndices if it has been allocated before in case of repeated setpts - // calls causing memory leak. We don't know it is the same size as before, so we - // have to malloc each time. - if (p->sortIndices) free(p->sortIndices); - p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT) * p->nj); - if (!p->sortIndices) { - fprintf(stderr, "[%s] failed to allocate sortIndices!\n", __func__); - return FINUFFT_ERR_SPREAD_ALLOC; - } - p->didSort = - indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, xj, yj, zj, p->spopts); - if (p->opts.debug) - printf("[%s] sort (didSort=%d):\t\t%.3g s\n", __func__, p->didSort, - timer.elapsedsec()); - - } else { // ------------------------- TYPE 3 SETPTS ----------------------- - // (here we can precompute pre/post-phase factors and plan the t2) - - if (nk < 0) { - fprintf(stderr, "[%s] nk (%lld) cannot be negative!\n", __func__, (long long)nk); - return FINUFFT_ERR_NUM_NU_PTS_INVALID; - } else if (nk > MAX_NU_PTS) { - fprintf(stderr, "[%s] nk (%lld) exceeds MAX_NU_PTS\n", __func__, (long long)nk); - return FINUFFT_ERR_NUM_NU_PTS_INVALID; - } - p->nk = nk; // user set # targ freq pts - p->S = s; // keep pointers to user's input target pts - p->T = t; - p->U = u; - - // pick x, s intervals & shifts & # fine grid pts (nf) in each dim... - FLT S1, S2, S3; // get half-width X, center C, which contains {x_j}... - arraywidcen(nj, xj, &(p->t3P.X1), &(p->t3P.C1)); - arraywidcen(nk, s, &S1, &(p->t3P.D1)); // same D, S, but for {s_k} - set_nhg_type3(S1, p->t3P.X1, p->opts, p->spopts, &(p->nf1), &(p->t3P.h1), - &(p->t3P.gam1)); // applies twist i) - p->t3P.C2 = 0.0; // their defaults if dim 2 unused, etc - p->t3P.D2 = 0.0; - if (d > 1) { - arraywidcen(nj, yj, &(p->t3P.X2), &(p->t3P.C2)); // {y_j} - arraywidcen(nk, t, &S2, &(p->t3P.D2)); // {t_k} - set_nhg_type3(S2, p->t3P.X2, p->opts, p->spopts, &(p->nf2), &(p->t3P.h2), - &(p->t3P.gam2)); - } - p->t3P.C3 = 0.0; - p->t3P.D3 = 0.0; - if (d > 2) { - arraywidcen(nj, zj, &(p->t3P.X3), &(p->t3P.C3)); // {z_j} - arraywidcen(nk, u, &S3, &(p->t3P.D3)); // {u_k} - set_nhg_type3(S3, p->t3P.X3, p->opts, p->spopts, &(p->nf3), &(p->t3P.h3), - &(p->t3P.gam3)); - } - - if (p->opts.debug) { // report on choices of shifts, centers, etc... 
- printf("\tM=%lld N=%lld\n", (long long)nj, (long long)nk); - printf("\tX1=%.3g C1=%.3g S1=%.3g D1=%.3g gam1=%g nf1=%lld h1=%.3g\t\n", p->t3P.X1, - p->t3P.C1, S1, p->t3P.D1, p->t3P.gam1, (long long)p->nf1, p->t3P.h1); - if (d > 1) - printf("\tX2=%.3g C2=%.3g S2=%.3g D2=%.3g gam2=%g nf2=%lld h2=%.3g\n", p->t3P.X2, - p->t3P.C2, S2, p->t3P.D2, p->t3P.gam2, (long long)p->nf2, p->t3P.h2); - if (d > 2) - printf("\tX3=%.3g C3=%.3g S3=%.3g D3=%.3g gam3=%g nf3=%lld h3=%.3g\n", p->t3P.X3, - p->t3P.C3, S3, p->t3P.D3, p->t3P.gam3, (long long)p->nf3, p->t3P.h3); - } - p->nf = p->nf1 * p->nf2 * p->nf3; // fine grid total number of points - if (p->nf * p->batchSize > MAX_NF) { - fprintf(stderr, - "[%s t3] fwBatch would be bigger than MAX_NF, not attempting malloc!\n", - __func__); - return FINUFFT_ERR_MAXNALLOC; - } - p->fftPlan->free(p->fwBatch); - p->fwBatch = p->fftPlan->alloc_complex(p->nf * p->batchSize); // maybe big workspace - - // (note FFTW_ALLOC is not needed over malloc, but matches its type) - if (p->CpBatch) free(p->CpBatch); - p->CpBatch = (CPX *)malloc(sizeof(CPX) * nj * p->batchSize); // batch c' work - - if (p->opts.debug) - printf("[%s t3] widcen, batch %.2fGB alloc:\t%.3g s\n", __func__, - (double)1E-09 * sizeof(CPX) * (p->nf + nj) * p->batchSize, - timer.elapsedsec()); - if (!p->fwBatch || !p->CpBatch) { - fprintf(stderr, "[%s t3] malloc fail for fwBatch or CpBatch!\n", __func__); - return FINUFFT_ERR_ALLOC; - } - // printf("fwbatch, cpbatch ptrs: %llx %llx\n",p->fwBatch,p->CpBatch); - - // alloc rescaled NU src pts x'_j (in X etc), rescaled NU targ pts s'_k ... - // FIXME: should use realloc - if (p->X) free(p->X); - if (p->Sp) free(p->Sp); - p->X = (FLT *)malloc(sizeof(FLT) * nj); - p->Sp = (FLT *)malloc(sizeof(FLT) * nk); - if (d > 1) { - if (p->Y) free(p->Y); - if (p->Tp) free(p->Tp); - p->Y = (FLT *)malloc(sizeof(FLT) * nj); - p->Tp = (FLT *)malloc(sizeof(FLT) * nk); - } - if (d > 2) { - if (p->Z) free(p->Z); - if (p->Up) free(p->Up); - p->Z = (FLT *)malloc(sizeof(FLT) * nj); - p->Up = (FLT *)malloc(sizeof(FLT) * nk); - } - - // always shift as use gam to rescale x_j to x'_j, etc (twist iii)... - FLT ig1 = 1.0 / p->t3P.gam1, ig2 = 0.0, ig3 = 0.0; // "reciprocal-math" optim - if (d > 1) ig2 = 1.0 / p->t3P.gam2; - if (d > 2) ig3 = 1.0 / p->t3P.gam3; -#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) - for (BIGINT j = 0; j < nj; ++j) { - p->X[j] = (xj[j] - p->t3P.C1) * ig1; // rescale x_j - if (d > 1) // (ok to do inside loop because of branch predict) - p->Y[j] = (yj[j] - p->t3P.C2) * ig2; // rescale y_j - if (d > 2) p->Z[j] = (zj[j] - p->t3P.C3) * ig3; // rescale z_j - } - - // set up prephase array... - CPX imasign = (p->fftSign >= 0) ? IMA : -IMA; // +-i - if (p->prephase) free(p->prephase); - p->prephase = (CPX *)malloc(sizeof(CPX) * nj); - if (p->t3P.D1 != 0.0 || p->t3P.D2 != 0.0 || p->t3P.D3 != 0.0) { -#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) - for (BIGINT j = 0; j < nj; ++j) { // ... loop over src NU locs - FLT phase = p->t3P.D1 * xj[j]; - if (d > 1) phase += p->t3P.D2 * yj[j]; - if (d > 2) phase += p->t3P.D3 * zj[j]; - p->prephase[j] = cos(phase) + imasign * sin(phase); // Euler - // e^{+-i.phase} - } - } else - for (BIGINT j = 0; j < nj; ++j) - p->prephase[j] = (CPX)1.0; // *** or keep flag so no mult in exec?? - - // rescale the target s_k etc to s'_k etc... 
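The pre/post phase and rescale formulas used in this block reduce, in one dimension, to the sketch below (all constants are illustrative placeholders, not values the planner would pick):

#include <cmath>
#include <complex>
#include <cstdio>

// 1D sketch of the type-3 twists: sources are recentered/rescaled, strengths
// pre-phased by e^{+-i D x_j}, targets mapped to s' with |s'| <= pi/upsampfac,
// and outputs later multiplied by e^{+-i C (s_k - D)} / phihat(s'_k).
int main() {
  const double pi = 3.141592653589793;
  double C = 0.7, D = 12.0, gam = 3.456, h = 2 * pi / 432; // placeholder values
  int fftSign = +1;
  double xj = 2.5, sk = 15.0;
  double xp = (xj - C) / gam;               // rescaled source x'_j
  double sp = h * gam * (sk - D);           // rescaled target s'_k
  std::complex<double> I(0, 1);
  auto prephase = std::exp(double(fftSign) * I * (D * xj));       // unit modulus
  auto postfac  = std::exp(double(fftSign) * I * ((sk - D) * C)); // unit modulus
  std::printf("x'=%.4f s'=%.4f |pre|=%.3f |post|=%.3f\n", xp, sp,
              std::abs(prephase), std::abs(postfac));
}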
-#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) - for (BIGINT k = 0; k < nk; ++k) { - p->Sp[k] = p->t3P.h1 * p->t3P.gam1 * (s[k] - p->t3P.D1); // so |s'_k| < pi/R - if (d > 1) - p->Tp[k] = p->t3P.h2 * p->t3P.gam2 * (t[k] - p->t3P.D2); // so |t'_k| < - // pi/R - if (d > 2) - p->Up[k] = p->t3P.h3 * p->t3P.gam3 * (u[k] - p->t3P.D3); // so |u'_k| < - // pi/R - } - // (old STEP 3a) Compute deconvolution post-factors array (per targ pt)... - // (exploits that FT separates because kernel is prod of 1D funcs) - if (p->deconv) free(p->deconv); - p->deconv = (CPX *)malloc(sizeof(CPX) * nk); - FLT *phiHatk1 = (FLT *)malloc(sizeof(FLT) * nk); // don't confuse w/ p->phiHat - onedim_nuft_kernel(nk, p->Sp, phiHatk1, p->spopts); // fill phiHat1 - FLT *phiHatk2 = NULL, *phiHatk3 = NULL; - if (d > 1) { - phiHatk2 = (FLT *)malloc(sizeof(FLT) * nk); - onedim_nuft_kernel(nk, p->Tp, phiHatk2, p->spopts); // fill phiHat2 - } - if (d > 2) { - phiHatk3 = (FLT *)malloc(sizeof(FLT) * nk); - onedim_nuft_kernel(nk, p->Up, phiHatk3, p->spopts); // fill phiHat3 - } - int Cfinite = - isfinite(p->t3P.C1) && isfinite(p->t3P.C2) && isfinite(p->t3P.C3); // C can be nan - // or inf if - // M=0, no - // input NU pts - int Cnonzero = p->t3P.C1 != 0.0 || p->t3P.C2 != 0.0 || p->t3P.C3 != 0.0; // cen -#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) - for (BIGINT k = 0; k < nk; ++k) { // .... loop over NU targ freqs - FLT phiHat = phiHatk1[k]; - if (d > 1) phiHat *= phiHatk2[k]; - if (d > 2) phiHat *= phiHatk3[k]; - p->deconv[k] = (CPX)(1.0 / phiHat); - if (Cfinite && Cnonzero) { - FLT phase = (s[k] - p->t3P.D1) * p->t3P.C1; - if (d > 1) phase += (t[k] - p->t3P.D2) * p->t3P.C2; - if (d > 2) phase += (u[k] - p->t3P.D3) * p->t3P.C3; - p->deconv[k] *= cos(phase) + imasign * sin(phase); // Euler e^{+-i.phase} - } - } - free(phiHatk1); - free(phiHatk2); - free(phiHatk3); // done w/ deconv fill - if (p->opts.debug) - printf("[%s t3] phase & deconv factors:\t%.3g s\n", __func__, timer.elapsedsec()); - - // Set up sort for spreading Cp (from primed NU src pts X, Y, Z) to fw... - timer.restart(); - // Free sortIndices if it has been allocated before in case of repeated setpts - // calls causing memory leak. We don't know it is the same size as before, so we - // have to malloc each time. - if (p->sortIndices) free(p->sortIndices); - p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT) * p->nj); - if (!p->sortIndices) { - fprintf(stderr, "[%s t3] failed to allocate sortIndices!\n", __func__); - return FINUFFT_ERR_SPREAD_ALLOC; - } - p->didSort = indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, p->X, p->Y, - p->Z, p->spopts); - if (p->opts.debug) - printf("[%s t3] sort (didSort=%d):\t\t%.3g s\n", __func__, p->didSort, - timer.elapsedsec()); - - // Plan and setpts once, for the (repeated) inner type 2 finufft call... - timer.restart(); - BIGINT t2nmodes[] = {p->nf1, p->nf2, p->nf3}; // t2 input is actually fw - finufft_opts t2opts = p->opts; // deep copy, since not ptrs - t2opts.modeord = 0; // needed for correct t3! - t2opts.debug = max(0, p->opts.debug - 1); // don't print as much detail - t2opts.spread_debug = max(0, p->opts.spread_debug - 1); - t2opts.showwarn = 0; // so don't see warnings 2x - // (...could vary other t2opts here?) 
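Before the inner plan is built below, it helps to see where it slots in: per batch, type-3 execution is pre-phase, spread, inner type-2, deconvolve. A data-flow sketch with the heavy stages stubbed out (all names and sizes illustrative):

#include <complex>
#include <cstdio>
#include <vector>

// Order and array roles of one type-3 batch (the STEP 0-3 sequence in execute);
// the spread and inner type-2 stages are stubs, only the data flow is shown.
int main() {
  using C = std::complex<double>;
  long nj = 4, nk = 3;
  std::vector<C> cj(nj, C(1, 0));               // input strengths
  std::vector<C> prephase(nj, C(0.6, 0.8));     // |prephase| = 1
  std::vector<C> CpBatch(nj), fk(nk), deconv(nk, C(0.5, 0));
  for (long j = 0; j < nj; ++j)
    CpBatch[j] = prephase[j] * cj[j];           // STEP 0: pre-phase c_j -> c'_j
  // STEP 1: spread c'_j at primed NU points onto the fine grid fw   (stub)
  // STEP 2: inner type-2 NUFFT from fw to the primed targets -> fk  (stub)
  for (long k = 0; k < nk; ++k) fk[k] = C(1, 0);
  for (long k = 0; k < nk; ++k)
    fk[k] *= deconv[k];                         // STEP 3: 1/phihat & C-phasing
  std::printf("fk[0] = %g%+gi\n", fk[0].real(), fk[0].imag());
}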
- if (p->innerT2plan) FINUFFT_DESTROY(p->innerT2plan); - int ier = FINUFFT_MAKEPLAN(2, d, t2nmodes, p->fftSign, p->batchSize, p->tol, - &p->innerT2plan, &t2opts); - if (ier > 1) { // if merely warning, still proceed - fprintf(stderr, "[%s t3]: inner type 2 plan creation failed with ier=%d!\n", - __func__, ier); - return ier; - } - ier = FINUFFT_SETPTS(p->innerT2plan, nk, p->Sp, p->Tp, p->Up, 0, NULL, NULL, - NULL); // note nk = # output points (not nj) - if (ier > 1) { - fprintf(stderr, "[%s t3]: inner type 2 setpts failed, ier=%d!\n", __func__, ier); - return ier; - } - if (p->opts.debug) - printf("[%s t3] inner t2 plan & setpts: \t%.3g s\n", __func__, timer.elapsedsec()); - } - return 0; + FLT *s, FLT *t, FLT *u) { + return finufft_setpts_t(reinterpret_cast *>(p), nj, xj, yj, zj, + nk, s, t, u); } -// ............ end setpts .................................................. -// EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX *cj, CPX *fk) { - /* See ../docs/cguru.doc for current documentation. - - For given (stack of) weights cj or coefficients fk, performs NUFFTs with - existing (sorted) NU pts and existing plan. - For type 1 and 3: cj is input, fk is output. - For type 2: fk is input, cj is output. - Performs spread/interp, pre/post deconvolve, and FFT as appropriate - for each of the 3 types. - For cases of ntrans>1, performs work in blocks of size up to batchSize. - Return value 0 (no error diagnosis yet). - Barnett 5/20/20, based on Malleo 2019. -*/ - CNTime timer; - timer.start(); - - if (p->type != 3) { // --------------------- TYPE 1,2 EXEC ------------------ - - double t_sprint = 0.0, t_fft = 0.0, t_deconv = 0.0; // accumulated timing - if (p->opts.debug) - printf("[%s] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, p->ntrans, - p->nbatch, p->batchSize); - - for (int b = 0; b * p->batchSize < p->ntrans; b++) { // .....loop b over batches - - // current batch is either batchSize, or possibly truncated if last one - int thisBatchSize = min(p->ntrans - b * p->batchSize, p->batchSize); - int bB = b * p->batchSize; // index of vector, since batchsizes same - CPX *cjb = cj + bB * p->nj; // point to batch of weights - CPX *fkb = fk + bB * p->N; // point to batch of mode coeffs - if (p->opts.debug > 1) - printf("[%s] start batch %d (size %d):\n", __func__, b, thisBatchSize); - - // STEP 1: (varies by type) - timer.restart(); - if (p->type == 1) { // type 1: spread NU pts p->X, weights cj, to fw grid - spreadinterpSortedBatch(thisBatchSize, p, cjb); - t_sprint += timer.elapsedsec(); - } else { // type 2: amplify Fourier coeffs fk into 0-padded fw - deconvolveBatch(thisBatchSize, p, fkb); - t_deconv += timer.elapsedsec(); - } - - // STEP 2: call the FFT on this batch - timer.restart(); - do_fft(p); - t_fft += timer.elapsedsec(); - if (p->opts.debug > 1) printf("\tFFT exec:\t\t%.3g s\n", timer.elapsedsec()); - - // STEP 3: (varies by type) - timer.restart(); - if (p->type == 1) { // type 1: deconvolve (amplify) fw and shuffle to fk - deconvolveBatch(thisBatchSize, p, fkb); - t_deconv += timer.elapsedsec(); - } else { // type 2: interpolate unif fw grid to NU target pts - spreadinterpSortedBatch(thisBatchSize, p, cjb); - t_sprint += timer.elapsedsec(); - } - } // ........end b loop - - if (p->opts.debug) { // report total times in their natural order... - if (p->type == 1) { - printf("[%s] done. 
tot spread:\t\t%.3g s\n", __func__, t_sprint); - printf(" tot FFT:\t\t\t\t%.3g s\n", t_fft); - printf(" tot deconvolve:\t\t\t%.3g s\n", t_deconv); - } else { - printf("[%s] done. tot deconvolve:\t\t%.3g s\n", __func__, t_deconv); - printf(" tot FFT:\t\t\t\t%.3g s\n", t_fft); - printf(" tot interp:\t\t\t%.3g s\n", t_sprint); - } - } - } - - else { // ----------------------------- TYPE 3 EXEC --------------------- - - // for (BIGINT j=0;j<10;++j) printf("\tcj[%ld]=%.15g+%.15gi\n",(long - // int)j,(double)real(cj[j]),(double)imag(cj[j])); // debug - - double t_pre = 0.0, t_spr = 0.0, t_t2 = 0.0, - t_deconv = 0.0; // accumulated timings - if (p->opts.debug) - printf("[%s t3] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, p->ntrans, - p->nbatch, p->batchSize); - - for (int b = 0; b * p->batchSize < p->ntrans; b++) { // .....loop b over batches - - // batching and pointers to this batch, identical to t1,2 above... - int thisBatchSize = min(p->ntrans - b * p->batchSize, p->batchSize); - int bB = b * p->batchSize; - CPX *cjb = cj + bB * p->nj; // batch of input strengths - CPX *fkb = fk + bB * p->nk; // batch of output strengths - if (p->opts.debug > 1) - printf("[%s t3] start batch %d (size %d):\n", __func__, b, thisBatchSize); - - // STEP 0: pre-phase (possibly) the c_j input strengths into c'_j batch... - timer.restart(); -#pragma omp parallel for num_threads(p->opts.nthreads) // or p->batchSize? - for (int i = 0; i < thisBatchSize; i++) { - BIGINT ioff = i * p->nj; - for (BIGINT j = 0; j < p->nj; ++j) { - p->CpBatch[ioff + j] = p->prephase[j] * cjb[ioff + j]; - } - } - t_pre += timer.elapsedsec(); - - // STEP 1: spread c'_j batch (x'_j NU pts) into fw batch grid... - timer.restart(); - p->spopts.spread_direction = 1; // spread - spreadinterpSortedBatch(thisBatchSize, p, p->CpBatch); // p->X are primed - t_spr += timer.elapsedsec(); - - // STEP 2: type 2 NUFFT from fw batch to user output fk array batch... - timer.restart(); - // illegal possible shrink of ntrans *after* plan for smaller last batch: - p->innerT2plan->ntrans = thisBatchSize; // do not try this at home! - /* (alarming that FFT not shrunk, but safe, because t2's fwBatch array - still the same size, as Andrea explained; just wastes a few flops) */ - FINUFFT_EXECUTE(p->innerT2plan, fkb, p->fwBatch); - t_t2 += timer.elapsedsec(); - // STEP 3: apply deconvolve (precomputed 1/phiHat(targ_k), phasing too)... - timer.restart(); -#pragma omp parallel for num_threads(p->opts.nthreads) - for (int i = 0; i < thisBatchSize; i++) { - BIGINT ioff = i * p->nk; - for (BIGINT k = 0; k < p->nk; ++k) fkb[ioff + k] *= p->deconv[k]; - } - t_deconv += timer.elapsedsec(); - } // ........end b loop - - if (p->opts.debug) { // report total times in their natural order... - printf("[%s t3] done. tot prephase:\t\t%.3g s\n", __func__, t_pre); - printf(" tot spread:\t\t\t%.3g s\n", t_spr); - printf(" tot type 2:\t\t\t%.3g s\n", t_t2); - printf(" tot deconvolve:\t\t%.3g s\n", t_deconv); - } - } - // for (BIGINT k=0;k<10;++k) printf("\tfk[%ld]=%.15g+%.15gi\n",(long - // int)k,(double)real(fk[k]),(double)imag(fk[k])); // debug - - return 0; + return finufft_execute_t(reinterpret_cast *>(p), cj, fk); } -// DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD int FINUFFT_DESTROY(FINUFFT_PLAN p) // Free everything we allocated inside of finufft_plan pointed to by p. // Also must not crash if called immediately after finufft_makeplan. 
-// Thus either each thing free'd here is guaranteed to be NULL or correctly +// Thus either each thing free'd here is guaranteed to be nullptr or correctly // allocated. { - if (!p) // NULL ptr, so not a ptr to a plan, report error + if (!p) // nullptr, so not a ptr to a plan, report error return 1; - p->fftPlan->free(p->fwBatch); // free the big FFTW (or t3 spread) working array - free(p->sortIndices); - if (p->type == 1 || p->type == 2) { - free(p->phiHat1); - free(p->phiHat2); - free(p->phiHat3); - } else { // free the stuff alloc for type 3 only - FINUFFT_DESTROY(p->innerT2plan); // if NULL, ignore its error code - free(p->CpBatch); - free(p->Sp); - free(p->Tp); - free(p->Up); - free(p->X); - free(p->Y); - free(p->Z); - free(p->prephase); - free(p->deconv); - } - delete p; + delete reinterpret_cast *>(p); + p = nullptr; return 0; // success } diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp new file mode 100644 index 000000000..834420bbe --- /dev/null +++ b/src/finufft_core.cpp @@ -0,0 +1,1204 @@ +#include +#include +#include +#include + +#include "../contrib/legendre_rule_fast.h" +#include +#include +#include +#include +#include +#include +#include + +using namespace std; +using namespace finufft; +using namespace finufft::utils; +using namespace finufft::spreadinterp; +using namespace finufft::quadrature; + +/* Computational core for FINUFFT. + + Based on Barnett 2017-2018 finufft?d.cpp containing nine drivers, plus + 2d1/2d2 many-vector drivers by Melody Shih, summer 2018. + Original guru interface written by Andrea Malleo, summer 2019, mentored + by Alex Barnett. Many rewrites in early 2020 by Alex Barnett & Libin Lu. + + As of v1.2 these replace the old hand-coded separate 9 finufft?d?() functions + and the two finufft2d?many() functions. The (now 18) simple C++ interfaces + are in simpleinterfaces.cpp. + +Algorithm summaries taken from old finufft?d?() documentation, Feb-Jun 2017: + + TYPE 1: + The type 1 NUFFT proceeds in three main steps: + 1) spread data to oversampled regular mesh using kernel. + 2) compute FFT on uniform mesh + 3) deconvolve by division of each Fourier mode independently by the kernel + Fourier series coeffs (not merely FFT of kernel), shuffle to output. + The kernel coeffs are precomputed in what is called step 0 in the code. + + TYPE 2: + The type 2 algorithm proceeds in three main steps: + 1) deconvolve (amplify) each Fourier mode, dividing by kernel Fourier coeff + 2) compute inverse FFT on uniform fine grid + 3) spread (dir=2, ie interpolate) data to regular mesh + The kernel coeffs are precomputed in what is called step 0 in the code. + + TYPE 3: + The type 3 algorithm is basically a type 2 (which is implemented precisely + as call to type 2) replacing the middle FFT (Step 2) of a type 1. + Beyond this, the new twists are: + i) nf1, number of upsampled points for the type-1, depends on the product + of interval widths containing input and output points (X*S). + ii) The deconvolve (post-amplify) step is division by the Fourier transform + of the scaled kernel, evaluated on the *nonuniform* output frequency + grid; this is done by direct approximation of the Fourier integral + using quadrature of the kernel function times exponentials. + iii) Shifts in x (real) and s (Fourier) are done to minimize the interval + half-widths X and S, hence nf1. + + MULTIPLE STRENGTH VECTORS FOR THE SAME NONUNIFORM POINTS (n_transf>1): + maxBatchSize (set to max_num_omp_threads) times the RAM is needed, so + this is good only for small problems. 
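A sketch of why destroy can collapse to a single delete once the plan is a genuine C++ type: members that own their storage are freed by the compiler-generated destructor, ending the per-array free() bookkeeping. The struct below is illustrative only, not the actual FINUFFT_PLAN_T layout.

#include <memory>
#include <vector>

// RAII plan sketch: vectors and unique_ptr own their memory, so the implicit
// destructor releases everything, including a nested inner type-3 plan.
template<typename TF> struct plan_sketch {
  std::vector<TF> phiHat1, phiHat2, phiHat3;    // kernel FT coeffs
  std::vector<long> sortIndices;                // NU-point sort
  std::unique_ptr<plan_sketch<TF>> innerT2plan; // type-3 inner plan
};

int main() {
  auto *p = new plan_sketch<double>;
  p->phiHat1.resize(129);
  p->innerT2plan.reset(new plan_sketch<double>);
  delete p;   // one delete, no per-member free() bookkeeping
}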
+ + +Design notes for guru interface implementation: + +* Since finufft_plan is C-compatible, we need to use malloc/free for its + allocatable arrays, keeping it quite low-level. We can't use std::vector + since that would only survive in the scope of each function. + +* Thread-safety: FINUFFT plans are passed as pointers, so it has no global + state apart from that associated with FFTW (and the did_fftw_init). +*/ + +// ---------- local math routines (were in common.cpp; no need now): -------- + +namespace finufft { +namespace common { + +static constexpr double PI = 3.14159265358979329; + +static int set_nf_type12(BIGINT ms, finufft_opts opts, finufft_spread_opts spopts, + BIGINT *nf) +// Type 1 & 2 recipe for how to set 1d size of upsampled array, nf, given opts +// and requested number of Fourier modes ms. Returns 0 if success, else an +// error code if nf was unreasonably big (& tell the world). +{ + *nf = BIGINT(opts.upsampfac * double(ms)); // manner of rounding not crucial + if (*nf < 2 * spopts.nspread) *nf = 2 * spopts.nspread; // otherwise spread fails + if (*nf < MAX_NF) { + *nf = next235even(*nf); // expensive at huge nf + return 0; + } else { + fprintf(stderr, + "[%s] nf=%.3g exceeds MAX_NF of %.3g, so exit without attempting even a " + "malloc\n", + __func__, (double)*nf, (double)MAX_NF); + return FINUFFT_ERR_MAXNALLOC; + } +} + +template +static int setup_spreader_for_nufft(finufft_spread_opts &spopts, T eps, finufft_opts opts, + int dim) +// Set up the spreader parameters given eps, and pass across various nufft +// options. Return status of setup_spreader. Uses pass-by-ref. Barnett 10/30/17 +{ + // this calls spreadinterp.cpp... + int ier = setup_spreader(spopts, eps, opts.upsampfac, opts.spread_kerevalmeth, + opts.spread_debug, opts.showwarn, dim); + // override various spread opts from their defaults... + spopts.debug = opts.spread_debug; + spopts.sort = opts.spread_sort; // could make dim or CPU choices here? + spopts.kerpad = opts.spread_kerpad; // (only applies to kerevalmeth=0) + spopts.chkbnds = opts.chkbnds; + spopts.nthreads = opts.nthreads; // 0 passed in becomes omp max by here + if (opts.spread_nthr_atomic >= 0) // overrides + spopts.atomic_threshold = opts.spread_nthr_atomic; + if (opts.spread_max_sp_size > 0) // overrides + spopts.max_subproblem_size = opts.spread_max_sp_size; + if (opts.chkbnds != 1) // deprecated default value hardcoded here + fprintf(stderr, + "[%s] opts.chkbnds is deprecated; ignoring change from default value.\n", + __func__); + return ier; +} + +template +static void set_nhg_type3(T S, T X, finufft_opts opts, finufft_spread_opts spopts, + BIGINT *nf, T *h, T *gam) +/* sets nf, h (upsampled grid spacing), and gamma (x_j rescaling factor), + for type 3 only. + Inputs: + X and S are the xj and sk interval half-widths respectively. + opts and spopts are the NUFFT and spreader opts strucs, respectively. + Outputs: + nf is the size of upsampled grid for a given single dimension. + h is the grid spacing = 2pi/nf + gam is the x rescale factor, ie x'_j = x_j/gam (modulo shifts). + Barnett 2/13/17. Caught inf/nan 3/14/17. io int types changed 3/28/17 + New logic 6/12/17 +*/ +{ + int nss = spopts.nspread + 1; // since ns may be odd + T Xsafe = X, Ssafe = S; // may be tweaked locally + if (X == 0.0) // logic ensures XS>=1, handle X=0 a/o S=0 + if (S == 0.0) { + Xsafe = 1.0; + Ssafe = 1.0; + } else + Xsafe = max(Xsafe, 1 / S); + else + Ssafe = max(Ssafe, 1 / X); + // use the safe X and S... 
+ auto nfd = T(2.0 * opts.upsampfac * Ssafe * Xsafe / PI + nss); + if (!isfinite(nfd)) nfd = 0.0; // use T to catch inf + *nf = (BIGINT)nfd; + // printf("initial nf=%lld, ns=%d\n",*nf,spopts.nspread); + // catch too small nf, and nan or +-inf, otherwise spread fails... + if (*nf < 2 * spopts.nspread) *nf = 2 * spopts.nspread; + if (*nf < MAX_NF) // otherwise will fail anyway + *nf = next235even(*nf); // expensive at huge nf + *h = T(2.0 * PI / *nf); // upsampled grid spacing + *gam = T(*nf / (2.0 * opts.upsampfac * Ssafe)); // x scale fac to x' +} + +template +static void onedim_fseries_kernel(BIGINT nf, std::vector &fwkerhalf, + finufft_spread_opts opts) +/* + Approximates exact Fourier series coeffs of cnufftspread's real symmetric + kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting + narrowness of kernel. Uses phase winding for cheap eval on the regular freq + grid. Note that this is also the Fourier transform of the non-periodized + kernel. The FT definition is f(k) = int e^{-ikx} f(x) dx. The output has an + overall prefactor of 1/h, which is needed anyway for the correction, and + arises because the quadrature weights are scaled for grid units not x units. + The kernel is actually centered at nf/2, related to the centering of the grid; + this is now achieved by the sign flip in a[n] below. + + Inputs: + nf - size of 1d uniform spread grid, must be even. + opts - spreading opts object, needed to eval kernel (must be already set up) + + Outputs: + fwkerhalf - real Fourier series coeffs from indices 0 to nf/2 inclusive, + divided by h = 2pi/n. + (should be allocated for at least nf/2+1 Ts) + + Compare onedim_dct_kernel which has same interface, but computes DFT of + sampled kernel, not quite the same object. + + Barnett 2/7/17. openmp (since slow vs fftw in 1D large-N case) 3/3/18. + Fixed num_threads 7/20/20. Reduced rounding error in a[n] calc 8/20/24. + */ +{ + T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support + // # quadr nodes in z (from 0 to J/2; reflections will be added)... + int q = (int)(2 + 3.0 * J2); // not sure why so large? 
cannot exceed MAX_NQUAD + T f[MAX_NQUAD]; + double z[2 * MAX_NQUAD], w[2 * MAX_NQUAD]; + legendre_compute_glr(2 * q, z, w); // only half the nodes used, eg on (0,1) + std::complex a[MAX_NQUAD]; + for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n + z[n] *= J2; // rescale nodes + f[n] = J2 * (T)w[n] * evaluate_kernel((T)z[n], opts); // vals & quadr wei + a[n] = -exp(2 * PI * std::complex(0, 1) * z[n] / double(nf)); // phase winding + // rates + } + BIGINT nout = nf / 2 + 1; // how many values we're writing to + int nt = min(nout, (BIGINT)opts.nthreads); // how many chunks + std::vector brk(nt + 1); // start indices for each thread + for (int t = 0; t <= nt; ++t) // split nout mode indices btw threads + brk[t] = (BIGINT)(0.5 + nout * t / (double)nt); +#pragma omp parallel num_threads(nt) + { // each thread gets own chunk to do + int t = MY_OMP_GET_THREAD_NUM(); + std::complex aj[MAX_NQUAD]; // phase rotator for this thread + for (int n = 0; n < q; ++n) + aj[n] = pow(a[n], (T)brk[t]); // init phase factors for chunk + for (BIGINT j = brk[t]; j < brk[t + 1]; ++j) { // loop along output array + T x = 0.0; // accumulator for answer at this j + for (int n = 0; n < q; ++n) { + x += f[n] * 2 * real(aj[n]); // include the negative freq + aj[n] *= a[n]; // wind the phases + } + fwkerhalf[j] = x; + } + } +} + +template +static void onedim_nuft_kernel(BIGINT nk, const std::vector &k, std::vector &phihat, + finufft_spread_opts opts) +/* + Approximates exact 1D Fourier transform of cnufftspread's real symmetric + kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting + narrowness of kernel. Evaluates at set of arbitrary freqs k in [-pi, pi), + for a kernel with x measured in grid-spacings. (See previous routine for + FT definition). + + Inputs: + nk - number of freqs + k - frequencies, dual to the kernel's natural argument, ie exp(i.k.z) + Note, z is in grid-point units, and k values must be in [-pi, pi) for + accuracy. + opts - spreading opts object, needed to eval kernel (must be already set up) + + Outputs: + phihat - real Fourier transform evaluated at freqs (alloc for nk Ts) + + Barnett 2/8/17. openmp since cos slow 2/9/17 + */ +{ + T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support + // # quadr nodes in z (from 0 to J/2; reflections will be added)... + int q = (int)(2 + 2.0 * J2); // > pi/2 ratio. cannot exceed MAX_NQUAD + if (opts.debug) printf("q (# ker FT quadr pts) = %d\n", q); + T f[MAX_NQUAD]; + double z[2 * MAX_NQUAD], w[2 * MAX_NQUAD]; // glr needs double + legendre_compute_glr(2 * q, z, w); // only half the nodes used, eg on (0,1) + for (int n = 0; n < q; ++n) { + z[n] *= (T)J2; // quadr nodes for [0,J/2] + f[n] = J2 * (T)w[n] * evaluate_kernel((T)z[n], opts); // w/ quadr weights + } +#pragma omp parallel for num_threads(opts.nthreads) + for (BIGINT j = 0; j < nk; ++j) { // loop along output array + T x = 0.0; // register + for (int n = 0; n < q; ++n) + x += f[n] * 2 * cos(k[j] * (T)z[n]); // pos & neg freq pair. use T cos! + phihat[j] = x; + } +} + +template +static void deconvolveshuffle1d(int dir, T prefac, const std::vector &ker, BIGINT ms, + T *fk, BIGINT nf1, std::complex *fw, int modeord) +/* + if dir==1: copies fw to fk with amplification by prefac/ker + if dir==2: copies fk to fw (and zero pads rest of it), same amplification. + + modeord=0: use CMCL-compatible mode ordering in fk (from -N/2 up to N/2-1) + 1: use FFT-style (from 0 to N/2-1, then -N/2 up to -1). 
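// ------------------------------------------------------------------------
// The quadrature idea behind onedim_nuft_kernel above, in isolation: for an
// even kernel f supported on [-J/2, J/2], its Fourier transform at frequency
// k reduces to 2 * int_0^{J/2} f(z) cos(k z) dz, which a handful of nodes
// captures because f is smooth and narrow. Sketch only: a truncated Gaussian
// stands in for the real spreading kernel, and plain midpoint quadrature
// stands in for Gauss-Legendre.
#include <cmath>
#include <cstdio>

int main() {
  const double pi = 3.14159265358979323846;
  const double J2 = 6.0, sigma = 1.0;  // half-support; Gaussian width
  const int q = 32;                    // quadrature nodes on (0, J2)
  const double h = J2 / q, k = 1.5;    // node spacing; test frequency
  double phihat = 0.0;
  for (int n = 0; n < q; ++n) {
    double z = (n + 0.5) * h;          // midpoint nodes
    double f = std::exp(-0.5 * z * z / (sigma * sigma));
    phihat += 2.0 * h * f * std::cos(k * z);  // factor 2: reflection z -> -z
  }
  double exact = sigma * std::sqrt(2.0 * pi) * std::exp(-0.5 * sigma * sigma * k * k);
  std::printf("quadrature %.8f vs exact %.8f\n", phihat, exact); // close match
}
// ------------------------------------------------------------------------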
+
+  fk is a size-ms T complex array (2*ms Ts alternating re,im parts)
+  fw is a size-nf1 complex array (2*nf1 Ts alternating re,im parts)
+  ker is real-valued T array of length nf1/2+1.
+
+  Single thread only, but shouldn't matter since mostly data movement.
+
+  It has been tested that the repeated floating division in this inner loop
+  only contributes at the <3% level in 3D relative to the FFT cost (8 threads).
+  This could be removed by passing in an inverse kernel and doing mults.
+
+  todo: rewrite w/ C++-complex I/O, check complex divide not slower than
+  real divide, or is there a way to force a real divide?
+
+  Barnett 1/25/17. Fixed ms=0 case 3/14/17. modeord flag & clean 10/25/17
+*/
+{
+  BIGINT kmin = -ms / 2, kmax = (ms - 1) / 2; // inclusive range of k indices
+  if (ms == 0) kmax = -1; // fixes zero-pad for trivial no-mode case
+  // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array
+  BIGINT pp = -2 * kmin, pn = 0; // CMCL mode-ordering case (2* since cmplx)
+  if (modeord == 1) {
+    pp = 0;
+    pn = 2 * (kmax + 1);
+  } // or, instead, FFT ordering
+  if (dir == 1) { // read fw, write out to fk...
+    for (BIGINT k = 0; k <= kmax; ++k) {                // non-neg freqs k
+      fk[pp++] = prefac * fw[k].real() / ker[k];        // re
+      fk[pp++] = prefac * fw[k].imag() / ker[k];        // im
+    }
+    for (BIGINT k = kmin; k < 0; ++k) {                 // neg freqs k
+      fk[pn++] = prefac * fw[nf1 + k].real() / ker[-k]; // re
+      fk[pn++] = prefac * fw[nf1 + k].imag() / ker[-k]; // im
+    }
+  } else { // read fk, write out to fw w/ zero padding...
+    for (BIGINT k = kmax + 1; k < nf1 + kmin; ++k) { // zero pad precisely where
+                                                     // needed
+      fw[k] = 0.0;
+    }
+    for (BIGINT k = 0; k <= kmax; ++k) {             // non-neg freqs k
+      fw[k].real(prefac * fk[pp++] / ker[k]);        // re
+      fw[k].imag(prefac * fk[pp++] / ker[k]);        // im
+    }
+    for (BIGINT k = kmin; k < 0; ++k) {              // neg freqs k
+      fw[nf1 + k].real(prefac * fk[pn++] / ker[-k]); // re
+      fw[nf1 + k].imag(prefac * fk[pn++] / ker[-k]); // im
+    }
+  }
+}
+
+template<typename T>
+static void deconvolveshuffle2d(int dir, T prefac, const std::vector<T> &ker1,
+                                const std::vector<T> &ker2, BIGINT ms, BIGINT mt, T *fk,
+                                BIGINT nf1, BIGINT nf2, std::complex<T> *fw, int modeord)
+/*
+  2D version of deconvolveshuffle1d, calls it on each x-line using 1/ker2 fac.
+
+  if dir==1: copies fw to fk with amplification by prefac/(ker1(k1)*ker2(k2)).
+  if dir==2: copies fk to fw (and zero pads rest of it), same amplification.
+
+  modeord=0: use CMCL-compatible mode ordering in fk (each dim increasing)
+          1: use FFT-style (pos then negative, on each dim)
+
+  fk is a complex array stored as 2*ms*mt Ts alternating re,im parts, with
+  ms looped over fast and mt slow.
+  fw is a complex array stored as 2*nf1*nf2 Ts alternating re,im parts, with
+  nf1 looped over fast and nf2 slow.
+  ker1, ker2 are real-valued T arrays of lengths nf1/2+1, nf2/2+1
+  respectively.
+
+  Barnett 2/1/17, Fixed mt=0 case 3/14/17.
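// ------------------------------------------------------------------------
// The index bookkeeping deconvolveshuffle1d implements, reduced to complex
// arithmetic: mode k in [-ms/2, (ms-1)/2] lives at FFT bin k for k >= 0 and
// at bin nf1 + k for k < 0, and is amplified by prefac/ker[|k|]. Stand-in
// data below; CMCL ordering, dir == 1 case only.
#include <complex>
#include <cstdio>
#include <vector>

int main() {
  const long ms = 5, nf1 = 16;   // modes requested; fine-grid size
  std::vector<std::complex<double>> fw(nf1, {1.0, 0.0}); // fake FFT output
  std::vector<double> ker(nf1 / 2 + 1, 2.0);             // fake kernel coeffs
  std::vector<std::complex<double>> fk(ms);
  long kmin = -ms / 2, kmax = (ms - 1) / 2;              // here -2 .. 2
  for (long k = kmin; k <= kmax; ++k) {
    long bin = (k >= 0) ? k : nf1 + k;                   // wrap negative freqs
    fk[k - kmin] = fw[bin] / ker[k >= 0 ? k : -k];       // deconvolve
  }
  for (long i = 0; i < ms; ++i)
    std::printf("fk[%ld] = %g\n", i, fk[i].real());      // all 0.5 here
}
// ------------------------------------------------------------------------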
modeord 10/25/17 +*/ +{ + BIGINT k2min = -mt / 2, k2max = (mt - 1) / 2; // inclusive range of k2 indices + if (mt == 0) k2max = -1; // fixes zero-pad for trivial no-mode case + // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array + BIGINT pp = -2 * k2min * ms, pn = 0; // CMCL mode-ordering case (2* since cmplx) + if (modeord == 1) { + pp = 0; + pn = 2 * (k2max + 1) * ms; + } // or, instead, FFT ordering + if (dir == 2) // zero pad needed x-lines (contiguous in memory) + for (BIGINT j = nf1 * (k2max + 1); j < nf1 * (nf2 + k2min); ++j) // sweeps all + // dims + fw[j] = 0.0; + for (BIGINT k2 = 0; k2 <= k2max; ++k2, pp += 2 * ms) // non-neg y-freqs + // point fk and fw to the start of this y value's row (2* is for complex): + common::deconvolveshuffle1d(dir, prefac / ker2[k2], ker1, ms, fk + pp, nf1, + &fw[nf1 * k2], modeord); + for (BIGINT k2 = k2min; k2 < 0; ++k2, pn += 2 * ms) // neg y-freqs + common::deconvolveshuffle1d(dir, prefac / ker2[-k2], ker1, ms, fk + pn, nf1, + &fw[nf1 * (nf2 + k2)], modeord); +} + +template +static void deconvolveshuffle3d(int dir, T prefac, std::vector &ker1, + std::vector &ker2, std::vector &ker3, BIGINT ms, + BIGINT mt, BIGINT mu, T *fk, BIGINT nf1, BIGINT nf2, + BIGINT nf3, std::complex *fw, int modeord) +/* + 3D version of deconvolveshuffle2d, calls it on each xy-plane using 1/ker3 fac. + + if dir==1: copies fw to fk with ampl by prefac/(ker1(k1)*ker2(k2)*ker3(k3)). + if dir==2: copies fk to fw (and zero pads rest of it), same amplification. + + modeord=0: use CMCL-compatible mode ordering in fk (each dim increasing) + 1: use FFT-style (pos then negative, on each dim) + + fk is a complex array stored as 2*ms*mt*mu Ts alternating re,im parts, with + ms looped over fastest and mu slowest. + fw is a complex array stored as 2*nf1*nf2*nf3 Ts alternating re,im parts, with + nf1 looped over fastest and nf3 slowest. + ker1, ker2, ker3 are real-valued T arrays of lengths nf1/2+1, nf2/2+1, + and nf3/2+1 respectively. + + Barnett 2/1/17, Fixed mu=0 case 3/14/17. 
modeord 10/25/17 +*/ +{ + BIGINT k3min = -mu / 2, k3max = (mu - 1) / 2; // inclusive range of k3 indices + if (mu == 0) k3max = -1; // fixes zero-pad for trivial no-mode case + // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array + BIGINT pp = -2 * k3min * ms * mt, pn = 0; // CMCL mode-ordering (2* since cmplx) + if (modeord == 1) { + pp = 0; + pn = 2 * (k3max + 1) * ms * mt; + } // or FFT ordering + BIGINT np = nf1 * nf2; // # pts in an upsampled Fourier xy-plane + if (dir == 2) // zero pad needed xy-planes (contiguous in memory) + for (BIGINT j = np * (k3max + 1); j < np * (nf3 + k3min); ++j) // sweeps all dims + fw[j] = 0.0; + for (BIGINT k3 = 0; k3 <= k3max; ++k3, pp += 2 * ms * mt) // non-neg z-freqs + // point fk and fw to the start of this z value's plane (2* is for complex): + common::deconvolveshuffle2d(dir, prefac / ker3[k3], ker1, ker2, ms, mt, fk + pp, nf1, + nf2, &fw[np * k3], modeord); + for (BIGINT k3 = k3min; k3 < 0; ++k3, pn += 2 * ms * mt) // neg z-freqs + common::deconvolveshuffle2d(dir, prefac / ker3[-k3], ker1, ker2, ms, mt, fk + pn, nf1, + nf2, &fw[np * (nf3 + k3)], modeord); +} + +// --------- batch helper functions for t1,2 exec: --------------------------- + +template +static int spreadinterpSortedBatch(int batchSize, FINUFFT_PLAN_T *p, + std::complex *cBatch) +/* + Spreads (or interpolates) a batch of batchSize strength vectors in cBatch + to (or from) the batch of fine working grids p->fwBatch, using the same set of + (index-sorted) NU points p->X,Y,Z for each vector in the batch. + The direction (spread vs interpolate) is set by p->spopts.spread_direction. + Returns 0 (no error reporting for now). + Notes: + 1) cBatch is already assumed to have the correct offset, ie here we + read from the start of cBatch (unlike Malleo). fwBatch also has zero offset + 2) this routine is a batched version of spreadinterpSorted in spreadinterp.cpp + Barnett 5/19/20, based on Malleo 2019. +*/ +{ + // opts.spread_thread: 1 sequential multithread, 2 parallel single-thread. + // omp_sets_nested deprecated, so don't use; assume not nested for 2 to work. + // But when nthr_outer=1 here, omp par inside the loop sees all threads... +#ifdef _OPENMP + int nthr_outer = p->opts.spread_thread == 1 ? 1 : batchSize; +#endif +#pragma omp parallel for num_threads(nthr_outer) + for (int i = 0; i < batchSize; i++) { + std::complex *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace + std::complex *ci = cBatch + i * p->nj; // start of i'th c array in cBatch + spreadinterpSorted(p->sortIndices, p->nf1, p->nf2, p->nf3, (T *)fwi, p->nj, p->X, + p->Y, p->Z, (T *)ci, p->spopts, p->didSort); + } + return 0; +} + +template +static int deconvolveBatch(int batchSize, FINUFFT_PLAN_T *p, std::complex *fkBatch) +/* + Type 1: deconvolves (amplifies) from each interior fw array in p->fwBatch + into each output array fk in fkBatch. + Type 2: deconvolves from user-supplied input fk to 0-padded interior fw, + again looping over fk in fkBatch and fw in p->fwBatch. + The direction (spread vs interpolate) is set by p->spopts.spread_direction. + This is mostly a loop calling deconvolveshuffle?d for the needed dim batchSize + times. + Barnett 5/21/20, simplified from Malleo 2019 (eg t3 logic won't be in here) +*/ +{ + // since deconvolveshuffle?d are single-thread, omp par seems to help here... 
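// ------------------------------------------------------------------------
// The two batching strategies opts.spread_thread toggles in
// spreadinterpSortedBatch above, in toy form: (1) vectors sequential, each
// call free to multithread internally; (2) vectors in parallel, each call
// single-threaded. work() is a stand-in for one spread/interp call; this
// sketch assumes an OpenMP-enabled compiler (the pragma is ignored otherwise).
#include <cstdio>

static void work(int i, int nthr_inner) {
  std::printf("vector %d using %d inner thread(s)\n", i, nthr_inner);
}

int main() {
  const int batchSize = 4, spread_thread = 2;            // try 1 vs 2
  int nthr_outer = (spread_thread == 1) ? 1 : batchSize;
#pragma omp parallel for num_threads(nthr_outer)
  for (int i = 0; i < batchSize; i++)
    work(i, (spread_thread == 1) ? batchSize : 1);
}
// ------------------------------------------------------------------------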
+#pragma omp parallel for num_threads(batchSize)
+  for (int i = 0; i < batchSize; i++) {
+    std::complex<T> *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace
+    std::complex<T> *fki = fkBatch + i * p->N;     // start of i'th fk array in fkBatch
+
+    // Call routine from common.cpp for the dim; prefactors hardcoded to 1.0...
+    if (p->dim == 1)
+      deconvolveshuffle1d(p->spopts.spread_direction, T(1), p->phiHat1, p->ms, (T *)fki,
+                          p->nf1, fwi, p->opts.modeord);
+    else if (p->dim == 2)
+      deconvolveshuffle2d(p->spopts.spread_direction, T(1), p->phiHat1, p->phiHat2, p->ms,
+                          p->mt, (T *)fki, p->nf1, p->nf2, fwi, p->opts.modeord);
+    else
+      deconvolveshuffle3d(p->spopts.spread_direction, T(1), p->phiHat1, p->phiHat2,
+                          p->phiHat3, p->ms, p->mt, p->mu, (T *)fki, p->nf1, p->nf2,
+                          p->nf3, fwi, p->opts.modeord);
+  }
+  return 0;
+}
+
+} // namespace common
+} // namespace finufft
+
+// --------------- rest is the 5 user guru (plan) interface drivers: ---------
+// (not namespaced since have safe names finufft{f}_* )
+using namespace finufft::common; // accesses routines defined above
+
+// Marco Barbone: 5.8.2024
+// These are user-facing.
+// The various options could be macros to follow C standard library conventions.
+// Question: would these be enums?
+
+// OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO
+void finufft_default_opts_t(finufft_opts *o)
+// Sets default nufft opts (referenced by all language interfaces too).
+// See finufft_opts.h for meanings.
+// This was created to avoid uncertainty about C++11 style static initialization
+// when called from MEX, but now is generally used. Barnett 10/30/17 onwards.
+// Sphinx sucks the below code block into the web docs, hence keep it clean...
+{
+  // sphinx tag (don't remove): @defopts_start
+  o->modeord = 0;
+  o->chkbnds = 1;
+
+  o->debug = 0;
+  o->spread_debug = 0;
+  o->showwarn = 1;
+
+  o->nthreads = 0;
+#ifdef FINUFFT_USE_DUCC0
+  o->fftw = 0;
+#else
+  o->fftw = FFTW_ESTIMATE;
+#endif
+  o->spread_sort = 2;
+  o->spread_kerevalmeth = 1;
+  o->spread_kerpad = 1;
+  o->upsampfac = 0.0;
+  o->spread_thread = 0;
+  o->maxbatchsize = 0;
+  o->spread_nthr_atomic = -1;
+  o->spread_max_sp_size = 0;
+  o->fftw_lock_fun = nullptr;
+  o->fftw_unlock_fun = nullptr;
+  o->fftw_lock_data = nullptr;
+  // sphinx tag (don't remove): @defopts_end
+}
+
+// PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP
+template<typename TF>
+int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans,
+                       TF tol, FINUFFT_PLAN_T<TF> **pp, finufft_opts *opts)
+// Populates the fields of finufft_plan which is pointed to by "pp".
+// opts is ptr to a finufft_opts to set options, or nullptr to use defaults.
+// For some of the fields (if "auto" selected) here choose the actual setting.
+// For types 1,2 allocates memory for internal working arrays,
+// evaluates spreading kernel coefficients, and instantiates the fftw_plan
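// ------------------------------------------------------------------------
// For orientation, typical use of this plan interface from the caller's
// side (double precision; error checks trimmed). Only the public names
// visible in this diff are used; BIGINT is assumed to be int64_t here.
#include <complex>
#include <cstdint>
#include <vector>
#include <finufft.h>

int demo() {
  const std::int64_t M = 1000, N = 512;
  std::vector<double> x(M, 0.5);                        // NU points in [-pi, pi)
  std::vector<std::complex<double>> c(M, 1.0), f(N);
  finufft_opts opts;
  finufft_default_opts(&opts);
  finufft_plan plan;
  std::int64_t nmodes[3] = {N, 1, 1};
  int ier = finufft_makeplan(1, 1, nmodes, +1, 1, 1e-9, &plan, &opts);
  if (ier > 1) return ier;
  ier = finufft_setpts(plan, M, x.data(), nullptr, nullptr, 0, nullptr,
                       nullptr, nullptr);
  if (ier > 1) return ier;
  ier = finufft_execute(plan, c.data(), f.data());      // type 1: c -> f
  finufft_destroy(plan);
  return ier;
}
// ------------------------------------------------------------------------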
+{
+  FINUFFT_PLAN_T<TF> *p;
+  p = new FINUFFT_PLAN_T<TF>; // allocate fresh plan struct
+  *pp = p;                    // pass out plan as ptr to plan struct
+
+  if (!opts) // use default opts
+    finufft_default_opts_t(&(p->opts));
+  else // or read from what's passed in
+    p->opts = *opts; // keep a deep copy; changing *opts now has no effect
+
+  if (p->opts.debug) // do a hello world
+    printf("[%s] new plan: FINUFFT version " FINUFFT_VER " .................\n",
+           __func__);
+
+  p->fftPlan = std::make_unique<Finufft_FFT_plan<TF>>(
+      p->opts.fftw_lock_fun, p->opts.fftw_unlock_fun, p->opts.fftw_lock_data);
+
+  if ((type != 1) && (type != 2) && (type != 3)) {
+    fprintf(stderr, "[%s] Invalid type (%d), should be 1, 2 or 3.\n", __func__, type);
+    return FINUFFT_ERR_TYPE_NOTVALID;
+  }
+  if ((dim != 1) && (dim != 2) && (dim != 3)) {
+    fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n", __func__, dim);
+    return FINUFFT_ERR_DIM_NOTVALID;
+  }
+  if (ntrans < 1) {
+    fprintf(stderr, "[%s] ntrans (%d) should be at least 1.\n", __func__, ntrans);
+    return FINUFFT_ERR_NTRANS_NOTVALID;
+  }
+  if (!p->opts.fftw_lock_fun != !p->opts.fftw_unlock_fun) {
+    fprintf(stderr, "[%s] fftw_(un)lock functions should be both null or both set\n",
+            __func__);
+    return FINUFFT_ERR_LOCK_FUNS_INVALID;
+  }
+
+  // get stuff from args...
+  p->type = type;
+  p->dim = dim;
+  p->ntrans = ntrans;
+  p->tol = tol;
+  p->fftSign = (iflag >= 0) ? 1 : -1; // clean up flag input
+
+  // choose overall # threads...
+#ifdef _OPENMP
+  int ompmaxnthr = MY_OMP_GET_MAX_THREADS();
+  int nthr = ompmaxnthr; // default: use as many as OMP gives us
+  // (the above could be set, or suggested set, to 1 for small enough problems...)
+  if (p->opts.nthreads > 0) {
+    nthr = p->opts.nthreads; // user override, now without limit
+    if (p->opts.showwarn && (nthr > ompmaxnthr))
+      fprintf(stderr,
+              "%s warning: using opts.nthreads=%d, more than the %d OpenMP claims "
+              "available; note large nthreads can be slower.\n",
+              __func__, nthr, ompmaxnthr);
+  }
+#else
+  int nthr = 1; // always 1 thread (avoid segfault)
+  if (p->opts.nthreads > 1)
+    fprintf(stderr,
+            "%s warning: opts.nthreads=%d but library is single-threaded; ignoring!\n",
+            __func__, p->opts.nthreads);
+#endif
+  p->opts.nthreads = nthr; // store actual # thr planned for
+  // (this sets/limits all downstream spread/interp, 1dkernel, and FFT thread counts...)
+
+  // choose batchSize for types 1,2 or 3... (uses int ceil(b/a)=1+(b-1)/a trick)
+  if (p->opts.maxbatchsize == 0) { // logic to auto-set best batchsize
+    p->nbatch = 1 + (ntrans - 1) / nthr;         // min # batches poss
+    p->batchSize = 1 + (ntrans - 1) / p->nbatch; // then cut # thr in each b
+  } else { // batchSize override by user
+    p->batchSize = min(p->opts.maxbatchsize, ntrans);
+    p->nbatch = 1 + (ntrans - 1) / p->batchSize; // resulting # batches
+  }
+  if (p->opts.spread_thread == 0) p->opts.spread_thread = 2; // our auto choice
+  if (p->opts.spread_thread != 1 && p->opts.spread_thread != 2) {
+    fprintf(stderr, "[%s] illegal opts.spread_thread!\n", __func__);
+    return FINUFFT_ERR_SPREAD_THREAD_NOTVALID;
+  }
+
+  if (type != 3) { // read in user Fourier mode array sizes...
+    p->ms = n_modes[0];
+    p->mt = (dim > 1) ? n_modes[1] : 1; // leave as 1 for unused dims
+    p->mu = (dim > 2) ? n_modes[2] : 1;
+    p->N = p->ms * p->mt * p->mu; // N = total # modes
+  }
+
+  // heuristic to choose default upsampfac...
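// ------------------------------------------------------------------------
// The integer ceil trick used for the auto batch sizing above: for positive
// ints, ceil(b/a) == 1 + (b-1)/a. E.g. ntrans=10 on nthr=8 threads gives
// nbatch=2 batches of batchSize=5 each, better balanced than 8 then 2.
#include <cstdio>

int main() {
  int ntrans = 10, nthr = 8;
  int nbatch = 1 + (ntrans - 1) / nthr;        // ceil(10/8) = 2
  int batchSize = 1 + (ntrans - 1) / nbatch;   // ceil(10/2) = 5
  std::printf("nbatch=%d batchSize=%d\n", nbatch, batchSize);
}
// ------------------------------------------------------------------------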
(currently two poss) + if (p->opts.upsampfac == 0.0) { // indicates auto-choose + p->opts.upsampfac = 2.0; // default, and need for tol small + if (tol >= (TF)1E-9) { // the tol sigma=5/4 can reach + if (type == 3) // could move to setpts, more known? + p->opts.upsampfac = 1.25; // faster b/c smaller RAM & FFT + else if ((dim == 1 && p->N > 10000000) || (dim == 2 && p->N > 300000) || + (dim == 3 && p->N > 3000000)) // type 1,2 heuristic cutoffs, double, + // typ tol, 12-core xeon + p->opts.upsampfac = 1.25; + } + if (p->opts.debug > 1) + printf("[%s] set auto upsampfac=%.2f\n", __func__, p->opts.upsampfac); + } + // use opts to choose and write into plan's spread options... + int ier = setup_spreader_for_nufft(p->spopts, tol, p->opts, dim); + if (ier > 1) // proceed if success or warning + return ier; + + // set others as defaults (or unallocated for arrays)... + p->X = nullptr; + p->Y = nullptr; + p->Z = nullptr; + p->nf1 = 1; + p->nf2 = 1; + p->nf3 = 1; // crucial to leave as 1 for unused dims + + // ------------------------ types 1,2: planning needed --------------------- + if (type == 1 || type == 2) { + + int nthr_fft = nthr; // give FFTW all threads (or use o.spread_thread?) + // Note: batchSize not used since might be only 1. + + p->spopts.spread_direction = type; + + constexpr TF EPSILON = std::numeric_limits::epsilon(); + if (p->opts.showwarn) { // user warn round-off error... + if (EPSILON * p->ms > 1.0) + fprintf(stderr, "%s warning: rounding err predicted eps_mach*N1 = %.3g > 1 !\n", + __func__, (double)(EPSILON * p->ms)); + if (EPSILON * p->mt > 1.0) + fprintf(stderr, "%s warning: rounding err predicted eps_mach*N2 = %.3g > 1 !\n", + __func__, (double)(EPSILON * p->mt)); + if (EPSILON * p->mu > 1.0) + fprintf(stderr, "%s warning: rounding err predicted eps_mach*N3 = %.3g > 1 !\n", + __func__, (double)(EPSILON * p->mu)); + } + + // determine fine grid sizes, sanity check.. + int nfier = set_nf_type12(p->ms, p->opts, p->spopts, &(p->nf1)); + if (nfier) return nfier; // nf too big; we're done + p->phiHat1.resize(p->nf1 / 2 + 1); + if (dim > 1) { + nfier = set_nf_type12(p->mt, p->opts, p->spopts, &(p->nf2)); + if (nfier) return nfier; + p->phiHat2.resize(p->nf2 / 2 + 1); + } + if (dim > 2) { + nfier = set_nf_type12(p->mu, p->opts, p->spopts, &(p->nf3)); + if (nfier) return nfier; + p->phiHat3.resize(p->nf3 / 2 + 1); + } + + if (p->opts.debug) { // "long long" here is to avoid warnings with printf... 
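// ------------------------------------------------------------------------
// The upsampfac auto-choice above as a standalone function, thresholds
// copied verbatim from the heuristic (tuned, per its comment, on a 12-core
// Xeon at typical double-precision tolerances):
#include <cstdio>

static double choose_upsampfac(int type, int dim, double tol, long long N) {
  double sigma = 2.0;                   // default; required for small tol
  if (tol >= 1e-9) {                    // sigma = 5/4 can reach this accuracy
    if (type == 3)
      sigma = 1.25;                     // smaller RAM and FFT
    else if ((dim == 1 && N > 10000000) || (dim == 2 && N > 300000) ||
             (dim == 3 && N > 3000000))
      sigma = 1.25;                     // large type 1,2 problems
  }
  return sigma;
}

int main() {
  std::printf("%.2f\n", choose_upsampfac(1, 2, 1e-6, 1000000)); // 1.25
}
// ------------------------------------------------------------------------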
+ printf("[%s] %dd%d: (ms,mt,mu)=(%lld,%lld,%lld) " + "(nf1,nf2,nf3)=(%lld,%lld,%lld)\n ntrans=%d nthr=%d " + "batchSize=%d ", + __func__, dim, type, (long long)p->ms, (long long)p->mt, (long long)p->mu, + (long long)p->nf1, (long long)p->nf2, (long long)p->nf3, ntrans, nthr, + p->batchSize); + if (p->batchSize == 1) // spread_thread has no effect in this case + printf("\n"); + else + printf(" spread_thread=%d\n", p->opts.spread_thread); + } + + // STEP 0: get Fourier coeffs of spreading kernel along each fine grid dim + CNTime timer; + timer.start(); + onedim_fseries_kernel(p->nf1, p->phiHat1, p->spopts); + if (dim > 1) onedim_fseries_kernel(p->nf2, p->phiHat2, p->spopts); + if (dim > 2) onedim_fseries_kernel(p->nf3, p->phiHat3, p->spopts); + if (p->opts.debug) + printf("[%s] kernel fser (ns=%d):\t\t%.3g s\n", __func__, p->spopts.nspread, + timer.elapsedsec()); + + p->nf = p->nf1 * p->nf2 * p->nf3; // fine grid total number of points + if (p->nf * p->batchSize > MAX_NF) { + fprintf(stderr, + "[%s] fwBatch would be bigger than MAX_NF, not attempting malloc!\n", + __func__); + // FIXME: this error causes memory leaks. We should free phiHat1, phiHat2, phiHat3 + return FINUFFT_ERR_MAXNALLOC; + } + + timer.restart(); + p->fwBatch = p->fftPlan->alloc_complex(p->nf * p->batchSize); // the big workspace + if (p->opts.debug) + printf("[%s] fwBatch %.2fGB alloc: \t%.3g s\n", __func__, + (double)1E-09 * sizeof(std::complex) * p->nf * p->batchSize, + timer.elapsedsec()); + if (!p->fwBatch) { // we don't catch all such mallocs, just this big one + fprintf(stderr, "[%s] FFTW malloc failed for fwBatch (working fine grids)!\n", + __func__); + return FINUFFT_ERR_ALLOC; + } + + timer.restart(); // plan the FFTW + const auto ns = gridsize_for_fft(p); + p->fftPlan->plan(ns, p->batchSize, p->fwBatch, p->fftSign, p->opts.fftw, nthr_fft); + if (p->opts.debug) + printf("[%s] FFT plan (mode %d, nthr=%d):\t%.3g s\n", __func__, p->opts.fftw, + nthr_fft, timer.elapsedsec()); + + } else { // -------------------------- type 3 (no planning) ------------ + + if (p->opts.debug) printf("[%s] %dd%d: ntrans=%d\n", __func__, dim, type, ntrans); + // in case destroy occurs before setpts, need safe dummy ptrs/plans... + p->fwBatch = nullptr; + p->innerT2plan = nullptr; + // Type 3 will call finufft_makeplan for type 2; no need to init FFTW + // Note we don't even know nj or nk yet, so can't do anything else! 
+ } + return ier; // report setup_spreader status (could be warning) +} +template int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, + int iflag, int ntrans, float tol, + FINUFFT_PLAN_T **pp, finufft_opts *opts); +template int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, + int iflag, int ntrans, double tol, + FINUFFT_PLAN_T **pp, finufft_opts *opts); + +// SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS +template +int FINUFFT_PLAN_T::setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF *s, TF *t, + TF *u) { + int d = dim; // abbrev for spatial dim + CNTime timer; + timer.start(); + this->nj = nj; // the user only now chooses how many NU (x,y,z) pts + if (nj < 0) { + fprintf(stderr, "[%s] nj (%lld) cannot be negative!\n", __func__, (long long)nj); + return FINUFFT_ERR_NUM_NU_PTS_INVALID; + } else if (nj > MAX_NU_PTS) { + fprintf(stderr, "[%s] nj (%lld) exceeds MAX_NU_PTS\n", __func__, (long long)nj); + return FINUFFT_ERR_NUM_NU_PTS_INVALID; + } + + if (type != 3) { // ------------------ TYPE 1,2 SETPTS ------------------- + // (all we can do is check and maybe bin-sort the NU pts) + X = xj; // plan must keep pointers to user's fixed NU pts + Y = yj; + Z = zj; + int ier = spreadcheck(nf1, nf2, nf3, nj, xj, yj, zj, spopts); + if (opts.debug > 1) + printf("[%s] spreadcheck (%d):\t%.3g s\n", __func__, spopts.chkbnds, + timer.elapsedsec()); + if (ier) // no warnings allowed here + return ier; + timer.restart(); + sortIndices.resize(nj); + didSort = indexSort(sortIndices, nf1, nf2, nf3, nj, xj, yj, zj, spopts); + if (opts.debug) + printf("[%s] sort (didSort=%d):\t\t%.3g s\n", __func__, didSort, + timer.elapsedsec()); + + } else { // ------------------------- TYPE 3 SETPTS ----------------------- + // (here we can precompute pre/post-phase factors and plan the t2) + + if (nk < 0) { + fprintf(stderr, "[%s] nk (%lld) cannot be negative!\n", __func__, (long long)nk); + return FINUFFT_ERR_NUM_NU_PTS_INVALID; + } else if (nk > MAX_NU_PTS) { + fprintf(stderr, "[%s] nk (%lld) exceeds MAX_NU_PTS\n", __func__, (long long)nk); + return FINUFFT_ERR_NUM_NU_PTS_INVALID; + } + this->nk = nk; // user set # targ freq pts + S = s; // keep pointers to user's input target pts + T = t; + U = u; + + // pick x, s intervals & shifts & # fine grid pts (nf) in each dim... + TF S1, S2, S3; // get half-width X, center C, which contains {x_j}... + arraywidcen(nj, xj, &(t3P.X1), &(t3P.C1)); + arraywidcen(nk, s, &S1, &(t3P.D1)); // same D, S, but for {s_k} + set_nhg_type3(S1, t3P.X1, opts, spopts, &(nf1), &(t3P.h1), + &(t3P.gam1)); // applies twist i) + t3P.C2 = 0.0; // their defaults if dim 2 unused, etc + t3P.D2 = 0.0; + if (d > 1) { + arraywidcen(nj, yj, &(t3P.X2), &(t3P.C2)); // {y_j} + arraywidcen(nk, t, &S2, &(t3P.D2)); // {t_k} + set_nhg_type3(S2, t3P.X2, opts, spopts, &(nf2), &(t3P.h2), &(t3P.gam2)); + } + t3P.C3 = 0.0; + t3P.D3 = 0.0; + if (d > 2) { + arraywidcen(nj, zj, &(t3P.X3), &(t3P.C3)); // {z_j} + arraywidcen(nk, u, &S3, &(t3P.D3)); // {u_k} + set_nhg_type3(S3, t3P.X3, opts, spopts, &(nf3), &(t3P.h3), &(t3P.gam3)); + } + + if (opts.debug) { // report on choices of shifts, centers, etc... 
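// ------------------------------------------------------------------------
// What arraywidcen computes for the type-3 rescaling above: the half-width X
// and center C of a point set, so that x' = (x - C)/gam lands in the
// spreader's standard domain. Minimal stand-in only; the library's version
// also handles empty input and applies extra safeguards.
#include <algorithm>
#include <cstdio>
#include <vector>

static void widcen_sketch(const std::vector<double> &a, double *w, double *c) {
  auto [lo, hi] = std::minmax_element(a.begin(), a.end());
  *w = (*hi - *lo) / 2;   // half-width
  *c = (*hi + *lo) / 2;   // center
}

int main() {
  std::vector<double> x = {3.0, 5.0, 10.0};
  double X, C;
  widcen_sketch(x, &X, &C);
  std::printf("X = %g, C = %g\n", X, C);   // X = 3.5, C = 6.5
}
// ------------------------------------------------------------------------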
+ printf("\tM=%lld N=%lld\n", (long long)nj, (long long)nk); + printf("\tX1=%.3g C1=%.3g S1=%.3g D1=%.3g gam1=%g nf1=%lld h1=%.3g\t\n", t3P.X1, + t3P.C1, S1, t3P.D1, t3P.gam1, (long long)nf1, t3P.h1); + if (d > 1) + printf("\tX2=%.3g C2=%.3g S2=%.3g D2=%.3g gam2=%g nf2=%lld h2=%.3g\n", t3P.X2, + t3P.C2, S2, t3P.D2, t3P.gam2, (long long)nf2, t3P.h2); + if (d > 2) + printf("\tX3=%.3g C3=%.3g S3=%.3g D3=%.3g gam3=%g nf3=%lld h3=%.3g\n", t3P.X3, + t3P.C3, S3, t3P.D3, t3P.gam3, (long long)nf3, t3P.h3); + } + nf = nf1 * nf2 * nf3; // fine grid total number of points + if (nf * batchSize > MAX_NF) { + fprintf(stderr, + "[%s t3] fwBatch would be bigger than MAX_NF, not attempting malloc!\n", + __func__); + return FINUFFT_ERR_MAXNALLOC; + } + fftPlan->free(fwBatch); + fwBatch = fftPlan->alloc_complex(nf * batchSize); // maybe big workspace + + CpBatch.resize(nj * batchSize); // batch c' work + + if (opts.debug) + printf("[%s t3] widcen, batch %.2fGB alloc:\t%.3g s\n", __func__, + (double)1E-09 * sizeof(std::complex) * (nf + nj) * batchSize, + timer.elapsedsec()); + if (!fwBatch) { + fprintf(stderr, "[%s t3] malloc fail for fwBatch or CpBatch!\n", __func__); + return FINUFFT_ERR_ALLOC; + } + // printf("fwbatch, cpbatch ptrs: %llx %llx\n",fwBatch,CpBatch); + + // alloc rescaled NU src pts x'_j (in X etc), rescaled NU targ pts s'_k ... + // FIXME: should use realloc + if (X) free(X); + X = (TF *)malloc(sizeof(TF) * nj); + Sp.resize(nk); + if (d > 1) { + if (Y) free(Y); + Y = (TF *)malloc(sizeof(TF) * nj); + Tp.resize(nk); + } + if (d > 2) { + if (Z) free(Z); + Z = (TF *)malloc(sizeof(TF) * nj); + Up.resize(nk); + } + + // always shift as use gam to rescale x_j to x'_j, etc (twist iii)... + TF ig1 = 1.0 / t3P.gam1, ig2 = 0.0, ig3 = 0.0; // "reciprocal-math" optim + if (d > 1) ig2 = 1.0 / t3P.gam2; + if (d > 2) ig3 = 1.0 / t3P.gam3; +#pragma omp parallel for num_threads(opts.nthreads) schedule(static) + for (BIGINT j = 0; j < nj; ++j) { + X[j] = (xj[j] - t3P.C1) * ig1; // rescale x_j + if (d > 1) // (ok to do inside loop because of branch predict) + Y[j] = (yj[j] - t3P.C2) * ig2; // rescale y_j + if (d > 2) Z[j] = (zj[j] - t3P.C3) * ig3; // rescale z_j + } + + // set up prephase array... + std::complex imasign = + (fftSign >= 0) ? std::complex(0, 1) : std::complex(0, -1); // +-i + prephase.resize(nj); + if (t3P.D1 != 0.0 || t3P.D2 != 0.0 || t3P.D3 != 0.0) { +#pragma omp parallel for num_threads(opts.nthreads) schedule(static) + for (BIGINT j = 0; j < nj; ++j) { // ... loop over src NU locs + TF phase = t3P.D1 * xj[j]; + if (d > 1) phase += t3P.D2 * yj[j]; + if (d > 2) phase += t3P.D3 * zj[j]; + prephase[j] = cos(phase) + imasign * sin(phase); // Euler + // e^{+-i.phase} + } + } else + for (BIGINT j = 0; j < nj; ++j) + prephase[j] = (std::complex)1.0; // *** or keep flag so no mult in exec?? + + // rescale the target s_k etc to s'_k etc... +#pragma omp parallel for num_threads(opts.nthreads) schedule(static) + for (BIGINT k = 0; k < nk; ++k) { + Sp[k] = t3P.h1 * t3P.gam1 * (s[k] - t3P.D1); // so |s'_k| < pi/R + if (d > 1) + Tp[k] = t3P.h2 * t3P.gam2 * (t[k] - t3P.D2); // so |t'_k| < + // pi/R + if (d > 2) + Up[k] = t3P.h3 * t3P.gam3 * (u[k] - t3P.D3); // so |u'_k| < + // pi/R + } + // (old STEP 3a) Compute deconvolution post-factors array (per targ pt)... 
+ // (exploits that FT separates because kernel is prod of 1D funcs) + deconv.resize(nk); + std::vector phiHatk1(nk); // don't confuse w/ phiHat + onedim_nuft_kernel(nk, Sp, phiHatk1, spopts); // fill phiHat1 + std::vector phiHatk2, phiHatk3; + if (d > 1) { + phiHatk2.resize(nk); + onedim_nuft_kernel(nk, Tp, phiHatk2, spopts); // fill phiHat2 + } + if (d > 2) { + phiHatk3.resize(nk); + onedim_nuft_kernel(nk, Up, phiHatk3, spopts); // fill phiHat3 + } + int Cfinite = isfinite(t3P.C1) && isfinite(t3P.C2) && isfinite(t3P.C3); // C can be + // nan or inf + // if M=0, no + // input NU + // pts + int Cnonzero = t3P.C1 != 0.0 || t3P.C2 != 0.0 || t3P.C3 != 0.0; // cen +#pragma omp parallel for num_threads(opts.nthreads) schedule(static) + for (BIGINT k = 0; k < nk; ++k) { // .... loop over NU targ freqs + TF phiHat = phiHatk1[k]; + if (d > 1) phiHat *= phiHatk2[k]; + if (d > 2) phiHat *= phiHatk3[k]; + deconv[k] = (std::complex)(1.0 / phiHat); + if (Cfinite && Cnonzero) { + TF phase = (s[k] - t3P.D1) * t3P.C1; + if (d > 1) phase += (t[k] - t3P.D2) * t3P.C2; + if (d > 2) phase += (u[k] - t3P.D3) * t3P.C3; + deconv[k] *= cos(phase) + imasign * sin(phase); // Euler e^{+-i.phase} + } + } + if (opts.debug) + printf("[%s t3] phase & deconv factors:\t%.3g s\n", __func__, timer.elapsedsec()); + + // Set up sort for spreading Cp (from primed NU src pts X, Y, Z) to fw... + timer.restart(); + sortIndices.resize(nj); + didSort = indexSort(sortIndices, nf1, nf2, nf3, nj, X, Y, Z, spopts); + if (opts.debug) + printf("[%s t3] sort (didSort=%d):\t\t%.3g s\n", __func__, didSort, + timer.elapsedsec()); + + // Plan and setpts once, for the (repeated) inner type 2 finufft call... + timer.restart(); + BIGINT t2nmodes[] = {nf1, nf2, nf3}; // t2 input is actually fw + finufft_opts t2opts = opts; // deep copy, since not ptrs + t2opts.modeord = 0; // needed for correct t3! + t2opts.debug = max(0, opts.debug - 1); // don't print as much detail + t2opts.spread_debug = max(0, opts.spread_debug - 1); + t2opts.showwarn = 0; // so don't see warnings 2x + // (...could vary other t2opts here?) + if (innerT2plan) { + delete innerT2plan; + innerT2plan = nullptr; + } + int ier = finufft_makeplan_t(2, d, t2nmodes, fftSign, batchSize, tol, + &innerT2plan, &t2opts); + if (ier > 1) { // if merely warning, still proceed + fprintf(stderr, "[%s t3]: inner type 2 plan creation failed with ier=%d!\n", + __func__, ier); + return ier; + } + ier = finufft_setpts_t(innerT2plan, nk, Sp.data(), Tp.data(), Up.data(), 0, + nullptr, nullptr, + nullptr); // note nk = # output points (not nj) + if (ier > 1) { + fprintf(stderr, "[%s t3]: inner type 2 setpts failed, ier=%d!\n", __func__, ier); + return ier; + } + if (opts.debug) + printf("[%s t3] inner t2 plan & setpts: \t%.3g s\n", __func__, timer.elapsedsec()); + } + return 0; +} +template +int finufft_setpts_t(FINUFFT_PLAN_T *p, BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, + TF *s, TF *t, TF *u) +/* For type 1,2: just checks and (possibly) sorts the NU xyz points, in prep for + spreading. (The last 4 arguments are ignored.) + For type 3: allocates internal working arrays, scales/centers the NU points + and NU target freqs (stu), evaluates spreading kernel FT at all target freqs. 
+*/ +{ + return p->setpts(nj, xj, yj, zj, nk, s, t, u); +} +template int finufft_setpts_t(FINUFFT_PLAN_T *p, BIGINT nj, float *xj, + float *yj, float *zj, BIGINT nk, float *s, float *t, + float *u); +template int finufft_setpts_t(FINUFFT_PLAN_T *p, BIGINT nj, double *xj, + double *yj, double *zj, BIGINT nk, double *s, + double *t, double *u); + +// ............ end setpts .................................................. + +// EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE +template +int FINUFFT_PLAN_T::execute(std::complex *cj, std::complex *fk) { + /* See ../docs/cguru.doc for current documentation. + + For given (stack of) weights cj or coefficients fk, performs NUFFTs with + existing (sorted) NU pts and existing plan. + For type 1 and 3: cj is input, fk is output. + For type 2: fk is input, cj is output. + Performs spread/interp, pre/post deconvolve, and FFT as appropriate + for each of the 3 types. + For cases of ntrans>1, performs work in blocks of size up to batchSize. + Return value 0 (no error diagnosis yet). + Barnett 5/20/20, based on Malleo 2019. +*/ + CNTime timer; + timer.start(); + + if (type != 3) { // --------------------- TYPE 1,2 EXEC ------------------ + + double t_sprint = 0.0, t_fft = 0.0, t_deconv = 0.0; // accumulated timing + if (opts.debug) + printf("[%s] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, ntrans, nbatch, + batchSize); + + for (int b = 0; b * batchSize < ntrans; b++) { // .....loop b over batches + + // current batch is either batchSize, or possibly truncated if last one + int thisBatchSize = min(ntrans - b * batchSize, batchSize); + int bB = b * batchSize; // index of vector, since batchsizes same + std::complex *cjb = cj + bB * nj; // point to batch of weights + std::complex *fkb = fk + bB * N; // point to batch of mode coeffs + if (opts.debug > 1) + printf("[%s] start batch %d (size %d):\n", __func__, b, thisBatchSize); + + // STEP 1: (varies by type) + timer.restart(); + if (type == 1) { // type 1: spread NU pts X, weights cj, to fw grid + spreadinterpSortedBatch(thisBatchSize, this, cjb); + t_sprint += timer.elapsedsec(); + } else { // type 2: amplify Fourier coeffs fk into 0-padded fw + deconvolveBatch(thisBatchSize, this, fkb); + t_deconv += timer.elapsedsec(); + } + + // STEP 2: call the FFT on this batch + timer.restart(); + do_fft(this); + t_fft += timer.elapsedsec(); + if (opts.debug > 1) printf("\tFFT exec:\t\t%.3g s\n", timer.elapsedsec()); + + // STEP 3: (varies by type) + timer.restart(); + if (type == 1) { // type 1: deconvolve (amplify) fw and shuffle to fk + deconvolveBatch(thisBatchSize, this, fkb); + t_deconv += timer.elapsedsec(); + } else { // type 2: interpolate unif fw grid to NU target pts + spreadinterpSortedBatch(thisBatchSize, this, cjb); + t_sprint += timer.elapsedsec(); + } + } // ........end b loop + + if (opts.debug) { // report total times in their natural order... + if (type == 1) { + printf("[%s] done. tot spread:\t\t%.3g s\n", __func__, t_sprint); + printf(" tot FFT:\t\t\t\t%.3g s\n", t_fft); + printf(" tot deconvolve:\t\t\t%.3g s\n", t_deconv); + } else { + printf("[%s] done. 
tot deconvolve:\t\t%.3g s\n", __func__, t_deconv); + printf(" tot FFT:\t\t\t\t%.3g s\n", t_fft); + printf(" tot interp:\t\t\t%.3g s\n", t_sprint); + } + } + } + + else { // ----------------------------- TYPE 3 EXEC --------------------- + + // for (BIGINT j=0;j<10;++j) printf("\tcj[%ld]=%.15g+%.15gi\n",(long + // int)j,(double)real(cj[j]),(double)imag(cj[j])); // debug + + double t_pre = 0.0, t_spr = 0.0, t_t2 = 0.0, + t_deconv = 0.0; // accumulated timings + if (opts.debug) + printf("[%s t3] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, ntrans, + nbatch, batchSize); + + for (int b = 0; b * batchSize < ntrans; b++) { // .....loop b over batches + + // batching and pointers to this batch, identical to t1,2 above... + int thisBatchSize = min(ntrans - b * batchSize, batchSize); + int bB = b * batchSize; + std::complex *cjb = cj + bB * nj; // batch of input strengths + std::complex *fkb = fk + bB * nk; // batch of output strengths + if (opts.debug > 1) + printf("[%s t3] start batch %d (size %d):\n", __func__, b, thisBatchSize); + + // STEP 0: pre-phase (possibly) the c_j input strengths into c'_j batch... + timer.restart(); +#pragma omp parallel for num_threads(opts.nthreads) // or batchSize? + for (int i = 0; i < thisBatchSize; i++) { + BIGINT ioff = i * nj; + for (BIGINT j = 0; j < nj; ++j) { + CpBatch[ioff + j] = prephase[j] * cjb[ioff + j]; + } + } + t_pre += timer.elapsedsec(); + + // STEP 1: spread c'_j batch (x'_j NU pts) into fw batch grid... + timer.restart(); + spopts.spread_direction = 1; // spread + spreadinterpSortedBatch(thisBatchSize, this, CpBatch.data()); // X are primed + t_spr += timer.elapsedsec(); + + // STEP 2: type 2 NUFFT from fw batch to user output fk array batch... + timer.restart(); + // illegal possible shrink of ntrans *after* plan for smaller last batch: + innerT2plan->ntrans = thisBatchSize; // do not try this at home! + /* (alarming that FFT not shrunk, but safe, because t2's fwBatch array + still the same size, as Andrea explained; just wastes a few flops) */ + finufft_execute_t(innerT2plan, fkb, fwBatch); + t_t2 += timer.elapsedsec(); + // STEP 3: apply deconvolve (precomputed 1/phiHat(targ_k), phasing too)... + timer.restart(); +#pragma omp parallel for num_threads(opts.nthreads) + for (int i = 0; i < thisBatchSize; i++) { + BIGINT ioff = i * nk; + for (BIGINT k = 0; k < nk; ++k) fkb[ioff + k] *= deconv[k]; + } + t_deconv += timer.elapsedsec(); + } // ........end b loop + + if (opts.debug) { // report total times in their natural order... + printf("[%s t3] done. tot prephase:\t\t%.3g s\n", __func__, t_pre); + printf(" tot spread:\t\t\t%.3g s\n", t_spr); + printf(" tot type 2:\t\t\t%.3g s\n", t_t2); + printf(" tot deconvolve:\t\t%.3g s\n", t_deconv); + } + } + // for (BIGINT k=0;k<10;++k) printf("\tfk[%ld]=%.15g+%.15gi\n",(long + // int)k,(double)real(fk[k]),(double)imag(fk[k])); // debug + + return 0; +} +template +int finufft_execute_t(FINUFFT_PLAN_T *p, std::complex *cj, std::complex *fk) { + /* See ../docs/cguru.doc for current documentation. + + For given (stack of) weights cj or coefficients fk, performs NUFFTs with + existing (sorted) NU pts and existing plan. + For type 1 and 3: cj is input, fk is output. + For type 2: fk is input, cj is output. + Performs spread/interp, pre/post deconvolve, and FFT as appropriate + for each of the 3 types. + For cases of ntrans>1, performs work in blocks of size up to batchSize. + Return value 0 (no error diagnosis yet). + Barnett 5/20/20, based on Malleo 2019. 
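// ------------------------------------------------------------------------
// Shape of the batched type 1 pipeline implemented in execute() above,
// reduced to stand-in calls (real steps: spreadinterpSortedBatch, the FFT,
// deconvolveBatch; type 2 runs the same loop with steps 1 and 3 swapped):
#include <algorithm>
#include <cstdio>

static void spread(int n) { std::printf("spread %d\n", n); }
static void run_fft(int n) { std::printf("fft %d\n", n); }
static void deconv(int n) { std::printf("deconvolve %d\n", n); }

int main() {
  const int ntrans = 10, batchSize = 4;          // -> batches of 4, 4, 2
  for (int b = 0; b * batchSize < ntrans; b++) {
    int thisBatchSize = std::min(ntrans - b * batchSize, batchSize);
    spread(thisBatchSize);    // step 1: NU points -> fine grid
    run_fft(thisBatchSize);   // step 2: FFT the fine grids
    deconv(thisBatchSize);    // step 3: amplify & shuffle into modes
  }
}
// ------------------------------------------------------------------------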
+*/ + return p->execute(cj, fk); +} +template int finufft_execute_t(FINUFFT_PLAN_T *p, std::complex *cj, + std::complex *fk); +template int finufft_execute_t( + FINUFFT_PLAN_T *p, std::complex *cj, std::complex *fk); + +// DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD +template FINUFFT_PLAN_T::~FINUFFT_PLAN_T() { + // Free everything we allocated inside of finufft_plan pointed to by p. + // Also must not crash if called immediately after finufft_makeplan. + // Thus either each thing free'd here is guaranteed to be nullptr or correctly + // allocated. + if (fftPlan) fftPlan->free(fwBatch); // free the big FFTW (or t3 spread) working array + if (type == 3) { + delete innerT2plan; + innerT2plan = nullptr; + free(X); + free(Y); + free(Z); + } +} +template FINUFFT_PLAN_T::~FINUFFT_PLAN_T(); +template FINUFFT_PLAN_T::~FINUFFT_PLAN_T(); diff --git a/src/simpleinterfaces.cpp b/src/simpleinterfaces.cpp index edd25adfb..4b3630d93 100644 --- a/src/simpleinterfaces.cpp +++ b/src/simpleinterfaces.cpp @@ -1,8 +1,10 @@ // public header #include // private headers +#include #include -#include +#include // (must come after complex.h) + using namespace std; /* --------------------------------------------------------------------------- @@ -18,41 +20,103 @@ using namespace std; --------------------------------------------------------------------------- */ +void finufft_default_opts(finufft_opts *o) { finufft_default_opts_t(o); } +void finufftf_default_opts(finufft_opts *o) { finufft_default_opts_t(o); } + +int finufft_makeplan(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans, + double tol, finufft_plan *pp, finufft_opts *opts) { + return finufft_makeplan_t(type, dim, n_modes, iflag, ntrans, tol, + reinterpret_cast **>(pp), + opts); +} +int finufftf_makeplan(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans, + float tol, finufftf_plan *pp, finufft_opts *opts) { + return finufft_makeplan_t(type, dim, n_modes, iflag, ntrans, tol, + reinterpret_cast **>(pp), opts); +} + +int finufft_setpts(finufft_plan p, BIGINT nj, double *xj, double *yj, double *zj, + BIGINT nk, double *s, double *t, double *u) { + return finufft_setpts_t(reinterpret_cast *>(p), nj, xj, + yj, zj, nk, s, t, u); +} +int finufftf_setpts(finufftf_plan p, BIGINT nj, float *xj, float *yj, float *zj, + BIGINT nk, float *s, float *t, float *u) { + return finufft_setpts_t(reinterpret_cast *>(p), nj, xj, yj, + zj, nk, s, t, u); +} + +int finufft_execute(finufft_plan p, std::complex *cj, std::complex *fk) { + return finufft_execute_t(reinterpret_cast *>(p), cj, fk); +} +int finufftf_execute(finufftf_plan p, std::complex *cj, std::complex *fk) { + return finufft_execute_t(reinterpret_cast *>(p), cj, fk); +} + +int finufft_destroy(finufft_plan p) +// Free everything we allocated inside of finufft_plan pointed to by p. +// Also must not crash if called immediately after finufft_makeplan. +// Thus either each thing free'd here is guaranteed to be nullptr or correctly +// allocated. +{ + if (!p) // nullptr ptr, so not a ptr to a plan, report error + return 1; + + delete reinterpret_cast *>(p); + p = nullptr; + return 0; // success +} +int finufftf_destroy(finufftf_plan p) +// Free everything we allocated inside of finufft_plan pointed to by p. +// Also must not crash if called immediately after finufft_makeplan. +// Thus either each thing free'd here is guaranteed to be nullptr or correctly +// allocated. 
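// ------------------------------------------------------------------------
// The opaque-handle idiom the wrappers above implement: the public C type is
// a raw pointer typedef, reinterpret_cast to the templated internal plan at
// the ABI boundary. Toy version of the same pattern (names are invented):
#include <cstdio>

template<typename T> struct plan_impl { T tol; };   // hidden internal type
typedef void *demo_plan;                            // public opaque handle

static int demo_makeplan(double tol, demo_plan *pp) {
  *pp = new plan_impl<double>{tol};
  return 0;
}
static int demo_destroy(demo_plan p) {
  if (!p) return 1;                                 // matches destroy's contract
  delete reinterpret_cast<plan_impl<double> *>(p);
  return 0;
}

int main() {
  demo_plan p = nullptr;
  demo_makeplan(1e-9, &p);
  std::printf("destroy -> %d\n", demo_destroy(p));             // 0
  std::printf("destroy(null) -> %d\n", demo_destroy(nullptr)); // 1
}
// ------------------------------------------------------------------------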
+{ + if (!p) // nullptr ptr, so not a ptr to a plan, report error + return 1; + + delete reinterpret_cast *>(p); + p = nullptr; + return 0; // success +} // Helper layer ........................................................... namespace finufft { namespace common { -int invokeGuruInterface(int n_dims, int type, int n_transf, BIGINT nj, FLT *xj, FLT *yj, - FLT *zj, CPX *cj, int iflag, FLT eps, BIGINT *n_modes, BIGINT nk, - FLT *s, FLT *t, FLT *u, CPX *fk, finufft_opts *popts) +template +static int invokeGuruInterface(int n_dims, int type, int n_transf, BIGINT nj, T *xj, + T *yj, T *zj, std::complex *cj, int iflag, T eps, + const std::array &n_modes, BIGINT nk, T *s, + T *t, T *u, std::complex *fk, finufft_opts *popts) // Helper layer between simple interfaces (with opts) and the guru functions. // Author: Andrea Malleo, 2019. { - FINUFFT_PLAN plan; - int ier = FINUFFT_MAKEPLAN(type, n_dims, n_modes, iflag, n_transf, eps, &plan, - popts); // popts (ptr to opts) can be NULL - if (ier > 1) { // since 1 (a warning) still allows proceeding... + FINUFFT_PLAN_T *plan = nullptr; + int ier = + finufft_makeplan_t(type, n_dims, n_modes.data(), iflag, n_transf, eps, &plan, + popts); // popts (ptr to opts) can be nullptr + if (ier > 1) { // since 1 (a warning) still allows proceeding... fprintf(stderr, "FINUFFT invokeGuru: plan error (ier=%d)!\n", ier); delete plan; return ier; } - int ier2 = FINUFFT_SETPTS(plan, nj, xj, yj, zj, nk, s, t, u); + int ier2 = finufft_setpts_t(plan, nj, xj, yj, zj, nk, s, t, u); if (ier2 > 1) { fprintf(stderr, "FINUFFT invokeGuru: setpts error (ier=%d)!\n", ier2); - FINUFFT_DESTROY(plan); + delete plan; return ier2; } - int ier3 = FINUFFT_EXECUTE(plan, cj, fk); + int ier3 = finufft_execute_t(plan, cj, fk); if (ier3 > 1) { fprintf(stderr, "FINUFFT invokeGuru: execute error (ier=%d)!\n", ier3); - FINUFFT_DESTROY(plan); + delete plan; return ier3; } - FINUFFT_DESTROY(plan); + delete plan; return max(max(ier, ier2), ier3); // in case any one gave a (positive!) warning } @@ -63,229 +127,287 @@ using namespace finufft::common; // Dimension 1111111111111111111111111111111111111111111111111111111111111111 -int FINUFFT1D1(BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, BIGINT ms, CPX *fk, - finufft_opts *opts) -// Type-1 1D complex nonuniform FFT. See ../docs/usage.rst +int finufft1d1many(int n_transf, BIGINT nj, double *xj, std::complex *cj, + int iflag, double eps, BIGINT ms, std::complex *fk, + finufft_opts *opts) +// Type-1 1D complex nonuniform FFT for many vectors. See ../docs/usage.rst { - BIGINT n_modes[] = {ms, 1, 1}; - int n_dims = 1; - int n_transf = 1; - int type = 1; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag, - eps, n_modes, 0, NULL, NULL, NULL, fk, opts); - return ier; + return invokeGuruInterface(1, 1, n_transf, nj, xj, nullptr, nullptr, cj, iflag, + eps, {ms, 1, 1}, 0, nullptr, nullptr, nullptr, fk, + opts); } - -int FINUFFT1D1MANY(int n_transf, BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, - BIGINT ms, CPX *fk, finufft_opts *opts) +int finufftf1d1many(int n_transf, BIGINT nj, float *xj, std::complex *cj, + int iflag, float eps, BIGINT ms, std::complex *fk, + finufft_opts *opts) // Type-1 1D complex nonuniform FFT for many vectors. 
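// ------------------------------------------------------------------------
// The status convention invokeGuruInterface above relies on: 0 = success,
// 1 = warning (safe to proceed), >1 = hard error (abort). Returning the max
// of the three stage codes preserves a warning from any stage:
#include <algorithm>
#include <cstdio>

int main() {
  int ier = 1, ier2 = 0, ier3 = 0;            // stage 1 warned; others clean
  int out = std::max(std::max(ier, ier2), ier3);
  std::printf("returned ier = %d\n", out);    // 1: the warning survives
}
// ------------------------------------------------------------------------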
See ../docs/usage.rst { - BIGINT n_modes[] = {ms, 1, 1}; - int n_dims = 1; - int type = 1; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag, - eps, n_modes, 0, NULL, NULL, NULL, fk, opts); - return ier; + return invokeGuruInterface(1, 1, n_transf, nj, xj, nullptr, nullptr, cj, iflag, + eps, {ms, 1, 1}, 0, nullptr, nullptr, nullptr, fk, + opts); } -int FINUFFT1D2(BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, BIGINT ms, CPX *fk, - finufft_opts *opts) -// Type-2 1D complex nonuniform FFT. See ../docs/usage.rst +int finufft1d1(BIGINT nj, double *xj, std::complex *cj, int iflag, double eps, + BIGINT ms, std::complex *fk, finufft_opts *opts) +// Type-1 1D complex nonuniform FFT. See ../docs/usage.rst { - BIGINT n_modes[] = {ms, 1, 1}; - int n_dims = 1; - int n_transf = 1; - int type = 2; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag, - eps, n_modes, 0, NULL, NULL, NULL, fk, opts); - return ier; + return finufft1d1many(1, nj, xj, cj, iflag, eps, ms, fk, opts); +} +int finufftf1d1(BIGINT nj, float *xj, std::complex *cj, int iflag, float eps, + BIGINT ms, std::complex *fk, finufft_opts *opts) +// Type-1 1D complex nonuniform FFT. See ../docs/usage.rst +{ + return finufftf1d1many(1, nj, xj, cj, iflag, eps, ms, fk, opts); } -int FINUFFT1D2MANY(int n_transf, BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, - BIGINT ms, CPX *fk, finufft_opts *opts) +int finufft1d2many(int n_transf, BIGINT nj, double *xj, std::complex *cj, + int iflag, double eps, BIGINT ms, std::complex *fk, + finufft_opts *opts) +// Type-2 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst +{ + return invokeGuruInterface(1, 2, n_transf, nj, xj, nullptr, nullptr, cj, iflag, + eps, {ms, 1, 1}, 0, nullptr, nullptr, nullptr, fk, + opts); +} +int finufftf1d2many(int n_transf, BIGINT nj, float *xj, std::complex *cj, + int iflag, float eps, BIGINT ms, std::complex *fk, + finufft_opts *opts) // Type-2 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - BIGINT n_modes[] = {ms, 1, 1}; - int n_dims = 1; - int type = 2; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag, - eps, n_modes, 0, NULL, NULL, NULL, fk, opts); - return ier; + return invokeGuruInterface(1, 2, n_transf, nj, xj, nullptr, nullptr, cj, iflag, + eps, {ms, 1, 1}, 0, nullptr, nullptr, nullptr, fk, + opts); } -int FINUFFT1D3(BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, BIGINT nk, FLT *s, - CPX *fk, finufft_opts *opts) -// Type-3 1D complex nonuniform FFT. See ../docs/usage.rst +int finufft1d2(BIGINT nj, double *xj, std::complex *cj, int iflag, double eps, + BIGINT ms, std::complex *fk, finufft_opts *opts) +// Type-2 1D complex nonuniform FFT. See ../docs/usage.rst { - int n_dims = 1; - int n_transf = 1; - int type = 3; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag, - eps, NULL, nk, s, NULL, NULL, fk, opts); - return ier; + return finufft1d2many(1, nj, xj, cj, iflag, eps, ms, fk, opts); +} +int finufftf1d2(BIGINT nj, float *xj, std::complex *cj, int iflag, float eps, + BIGINT ms, std::complex *fk, finufft_opts *opts) +// Type-2 1D complex nonuniform FFT. 
See ../docs/usage.rst +{ + return finufftf1d2many(1, nj, xj, cj, iflag, eps, ms, fk, opts); } -int FINUFFT1D3MANY(int n_transf, BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, - BIGINT nk, FLT *s, CPX *fk, finufft_opts *opts) +int finufft1d3many(int n_transf, BIGINT nj, double *xj, std::complex *cj, + int iflag, double eps, BIGINT nk, double *s, std::complex *fk, + finufft_opts *opts) // Type-3 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - int n_dims = 1; - int type = 3; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag, - eps, NULL, nk, s, NULL, NULL, fk, opts); - return ier; + return invokeGuruInterface(1, 3, n_transf, nj, xj, nullptr, nullptr, cj, iflag, + eps, {0, 0, 0}, nk, s, nullptr, nullptr, fk, opts); +} +int finufftf1d3many(int n_transf, BIGINT nj, float *xj, std::complex *cj, + int iflag, float eps, BIGINT nk, float *s, std::complex *fk, + finufft_opts *opts) +// Type-3 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst +{ + return invokeGuruInterface(1, 3, n_transf, nj, xj, nullptr, nullptr, cj, iflag, + eps, {0, 0, 0}, nk, s, nullptr, nullptr, fk, opts); +} +int finufft1d3(BIGINT nj, double *xj, std::complex *cj, int iflag, double eps, + BIGINT nk, double *s, std::complex *fk, finufft_opts *opts) +// Type-3 1D complex nonuniform FFT. See ../docs/usage.rst +{ + return finufft1d3many(1, nj, xj, cj, iflag, eps, nk, s, fk, opts); +} +int finufftf1d3(BIGINT nj, float *xj, std::complex *cj, int iflag, float eps, + BIGINT nk, float *s, std::complex *fk, finufft_opts *opts) +// Type-3 1D complex nonuniform FFT. See ../docs/usage.rst +{ + return finufftf1d3many(1, nj, xj, cj, iflag, eps, nk, s, fk, opts); } // Dimension 22222222222222222222222222222222222222222222222222222222222222222 -int FINUFFT2D1(BIGINT nj, FLT *xj, FLT *yj, CPX *cj, int iflag, FLT eps, BIGINT ms, - BIGINT mt, CPX *fk, finufft_opts *opts) -// Type-1 2D complex nonuniform FFT. See ../docs/usage.rst +int finufft2d1many(int n_transf, BIGINT nj, double *xj, double *yj, + std::complex *c, int iflag, double eps, BIGINT ms, BIGINT mt, + std::complex *fk, finufft_opts *opts) +// Type-1 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - BIGINT n_modes[] = {ms, mt, 1}; - int n_dims = 2; - int n_transf = 1; - int type = 1; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj, iflag, eps, - n_modes, 0, NULL, NULL, NULL, fk, opts); - return ier; + return invokeGuruInterface(2, 1, n_transf, nj, xj, yj, nullptr, c, iflag, eps, + {ms, mt, 1}, 0, nullptr, nullptr, nullptr, fk, opts); } - -int FINUFFT2D1MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, CPX *c, int iflag, FLT eps, - BIGINT ms, BIGINT mt, CPX *fk, finufft_opts *opts) +int finufftf2d1many(int n_transf, BIGINT nj, float *xj, float *yj, std::complex *c, + int iflag, float eps, BIGINT ms, BIGINT mt, std::complex *fk, + finufft_opts *opts) // Type-1 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - BIGINT n_modes[] = {ms, mt, 1}; - int n_dims = 2; - int type = 1; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, c, iflag, eps, - n_modes, 0, NULL, NULL, NULL, fk, opts); - return ier; + return invokeGuruInterface(2, 1, n_transf, nj, xj, yj, nullptr, c, iflag, eps, + {ms, mt, 1}, 0, nullptr, nullptr, nullptr, fk, opts); } - -int FINUFFT2D2(BIGINT nj, FLT *xj, FLT *yj, CPX *cj, int iflag, FLT eps, BIGINT ms, - BIGINT mt, CPX *fk, finufft_opts *opts) -// Type-2 2D complex nonuniform FFT. 
See ../docs/usage.rst
+int finufft2d1(BIGINT nj, double *xj, double *yj, std::complex<double> *cj, int iflag,
+               double eps, BIGINT ms, BIGINT mt, std::complex<double> *fk,
+               finufft_opts *opts)
+// Type-1 2D complex nonuniform FFT. See ../docs/usage.rst
 {
-  BIGINT n_modes[] = {ms, mt, 1};
-  int n_dims       = 2;
-  int n_transf     = 1;
-  int type         = 2;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj, iflag, eps,
-                                n_modes, 0, NULL, NULL, NULL, fk, opts);
-  return ier;
+  return finufft2d1many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts);
+}
+int finufftf2d1(BIGINT nj, float *xj, float *yj, std::complex<float> *cj, int iflag,
+                float eps, BIGINT ms, BIGINT mt, std::complex<float> *fk,
+                finufft_opts *opts)
+// Type-1 2D complex nonuniform FFT. See ../docs/usage.rst
+{
+  return finufftf2d1many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts);
 }

-int FINUFFT2D2MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, CPX *c, int iflag, FLT eps,
-                   BIGINT ms, BIGINT mt, CPX *fk, finufft_opts *opts)
+int finufft2d2many(int n_transf, BIGINT nj, double *xj, double *yj,
+                   std::complex<double> *c, int iflag, double eps, BIGINT ms, BIGINT mt,
+                   std::complex<double> *fk, finufft_opts *opts)
 // Type-2 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst
 {
-  BIGINT n_modes[] = {ms, mt, 1};
-  int n_dims       = 2;
-  int type         = 2;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, c, iflag, eps,
-                                n_modes, 0, NULL, NULL, NULL, fk, opts);
-  return ier;
+  return invokeGuruInterface(2, 2, n_transf, nj, xj, yj, nullptr, c, iflag, eps,
+                             {ms, mt, 1}, 0, nullptr, nullptr, nullptr, fk, opts);
 }
-
-int FINUFFT2D3(BIGINT nj, FLT *xj, FLT *yj, CPX *cj, int iflag, FLT eps, BIGINT nk,
-               FLT *s, FLT *t, CPX *fk, finufft_opts *opts)
-// Type-3 2D complex nonuniform FFT. See ../docs/usage.rst
+int finufftf2d2many(int n_transf, BIGINT nj, float *xj, float *yj,
+                    std::complex<float> *c, int iflag, float eps, BIGINT ms, BIGINT mt,
+                    std::complex<float> *fk, finufft_opts *opts)
+// Type-2 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst
 {
-  int n_dims   = 2;
-  int type     = 3;
-  int n_transf = 1;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj, iflag, eps,
-                                NULL, nk, s, t, NULL, fk, opts);
-  return ier;
+  return invokeGuruInterface(2, 2, n_transf, nj, xj, yj, nullptr, c, iflag, eps,
+                             {ms, mt, 1}, 0, nullptr, nullptr, nullptr, fk, opts);
+}
+int finufft2d2(BIGINT nj, double *xj, double *yj, std::complex<double> *cj, int iflag,
+               double eps, BIGINT ms, BIGINT mt, std::complex<double> *fk,
+               finufft_opts *opts)
+// Type-2 2D complex nonuniform FFT. See ../docs/usage.rst
+{
+  return finufft2d2many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts);
+}
+int finufftf2d2(BIGINT nj, float *xj, float *yj, std::complex<float> *cj, int iflag,
+                float eps, BIGINT ms, BIGINT mt, std::complex<float> *fk,
+                finufft_opts *opts)
+// Type-2 2D complex nonuniform FFT. See ../docs/usage.rst
+{
+  return finufftf2d2many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts);
 }

-int FINUFFT2D3MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, CPX *cj, int iflag, FLT eps,
-                   BIGINT nk, FLT *s, FLT *t, CPX *fk, finufft_opts *opts)
+int finufft2d3many(int n_transf, BIGINT nj, double *xj, double *yj,
+                   std::complex<double> *cj, int iflag, double eps, BIGINT nk, double *s,
+                   double *t, std::complex<double> *fk, finufft_opts *opts)
+// Type-3 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst
+{
+  return invokeGuruInterface(2, 3, n_transf, nj, xj, yj, nullptr, cj, iflag, eps,
+                             {0, 0, 0}, nk, s, t, nullptr, fk, opts);
+}
+int finufftf2d3many(int n_transf, BIGINT nj, float *xj, float *yj,
+                    std::complex<float> *cj, int iflag, float eps, BIGINT nk, float *s,
+                    float *t, std::complex<float> *fk, finufft_opts *opts)
 // Type-3 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst
 {
-  int n_dims = 2;
-  int type   = 3;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj, iflag, eps,
-                                NULL, nk, s, t, NULL, fk, opts);
-  return ier;
+  return invokeGuruInterface(2, 3, n_transf, nj, xj, yj, nullptr, cj, iflag, eps,
+                             {0, 0, 0}, nk, s, t, nullptr, fk, opts);
+}
+int finufft2d3(BIGINT nj, double *xj, double *yj, std::complex<double> *cj, int iflag,
+               double eps, BIGINT nk, double *s, double *t, std::complex<double> *fk,
+               finufft_opts *opts)
+// Type-3 2D complex nonuniform FFT. See ../docs/usage.rst
+{
+  return finufft2d3many(1, nj, xj, yj, cj, iflag, eps, nk, s, t, fk, opts);
+}
+int finufftf2d3(BIGINT nj, float *xj, float *yj, std::complex<float> *cj, int iflag,
+                float eps, BIGINT nk, float *s, float *t, std::complex<float> *fk,
+                finufft_opts *opts)
+// Type-3 2D complex nonuniform FFT. See ../docs/usage.rst
+{
+  return finufftf2d3many(1, nj, xj, yj, cj, iflag, eps, nk, s, t, fk, opts);
 }

 // Dimension 3333333333333333333333333333333333333333333333333333333333333333

-int FINUFFT3D1(BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, FLT eps,
-               BIGINT ms, BIGINT mt, BIGINT mu, CPX *fk, finufft_opts *opts)
-// Type-1 3D complex nonuniform FFT. See ../docs/usage.rst
+int finufft3d1many(int n_transf, BIGINT nj, double *xj, double *yj, double *zj,
+                   std::complex<double> *cj, int iflag, double eps, BIGINT ms, BIGINT mt,
+                   BIGINT mu, std::complex<double> *fk, finufft_opts *opts)
+// Type-1 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst
 {
-  BIGINT n_modes[] = {ms, mt, mu};
-  int n_dims       = 3;
-  int n_transf     = 1;
-  int type         = 1;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps,
-                                n_modes, 0, NULL, NULL, NULL, fk, opts);
-  return ier;
+  return invokeGuruInterface(3, 1, n_transf, nj, xj, yj, zj, cj, iflag, eps,
+                             {ms, mt, mu}, 0, nullptr, nullptr, nullptr, fk,
+                             opts);
 }
-
-int FINUFFT3D1MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag,
-                   FLT eps, BIGINT ms, BIGINT mt, BIGINT mu, CPX *fk, finufft_opts *opts)
+int finufftf3d1many(int n_transf, BIGINT nj, float *xj, float *yj, float *zj,
+                    std::complex<float> *cj, int iflag, float eps, BIGINT ms, BIGINT mt,
+                    BIGINT mu, std::complex<float> *fk, finufft_opts *opts)
 // Type-1 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst
 {
-  BIGINT n_modes[] = {ms, mt, mu};
-  int n_dims       = 3;
-  int type         = 1;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps,
-                                n_modes, 0, NULL, NULL, NULL, fk, opts);
-  return ier;
+  return invokeGuruInterface(3, 1, n_transf, nj, xj, yj, zj, cj, iflag, eps,
+                             {ms, mt, mu}, 0, nullptr, nullptr, nullptr, fk, opts);
 }
-
-int FINUFFT3D2(BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, FLT eps,
-               BIGINT ms, BIGINT mt, BIGINT mu, CPX *fk, finufft_opts *opts)
-// Type-2 3D complex nonuniform FFT. See ../docs/usage.rst
+int finufft3d1(BIGINT nj, double *xj, double *yj, double *zj, std::complex<double> *cj,
+               int iflag, double eps, BIGINT ms, BIGINT mt, BIGINT mu,
+               std::complex<double> *fk, finufft_opts *opts)
+// Type-1 3D complex nonuniform FFT. See ../docs/usage.rst
+{
+  return finufft3d1many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts);
+}
+int finufftf3d1(BIGINT nj, float *xj, float *yj, float *zj, std::complex<float> *cj,
+                int iflag, float eps, BIGINT ms, BIGINT mt, BIGINT mu,
+                std::complex<float> *fk, finufft_opts *opts)
+// Type-1 3D complex nonuniform FFT. See ../docs/usage.rst
 {
-  BIGINT n_modes[] = {ms, mt, mu};
-  int n_dims       = 3;
-  int n_transf     = 1;
-  int type         = 2;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps,
-                                n_modes, 0, NULL, NULL, NULL, fk, opts);
-  return ier;
+  return finufftf3d1many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts);
 }

-int FINUFFT3D2MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag,
-                   FLT eps, BIGINT ms, BIGINT mt, BIGINT mu, CPX *fk, finufft_opts *opts)
+int finufft3d2many(int n_transf, BIGINT nj, double *xj, double *yj, double *zj,
+                   std::complex<double> *cj, int iflag, double eps, BIGINT ms, BIGINT mt,
+                   BIGINT mu, std::complex<double> *fk, finufft_opts *opts)
 // Type-2 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst
 {
-  BIGINT n_modes[] = {ms, mt, mu};
-  n_modes[0]       = ms;
-  n_modes[1]       = mt;
-  n_modes[2]       = mu;
-  int n_dims       = 3;
-  int type         = 2;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps,
-                                n_modes, 0, NULL, NULL, NULL, fk, opts);
-  return ier;
+  return invokeGuruInterface(3, 2, n_transf, nj, xj, yj, zj, cj, iflag, eps,
+                             {ms, mt, mu}, 0, nullptr, nullptr, nullptr, fk,
+                             opts);
 }
-
-int FINUFFT3D3(BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, FLT eps,
-               BIGINT nk, FLT *s, FLT *t, FLT *u, CPX *fk, finufft_opts *opts)
-// Type-3 3D complex nonuniform FFT. See ../docs/usage.rst
+int finufftf3d2many(int n_transf, BIGINT nj, float *xj, float *yj, float *zj,
+                    std::complex<float> *cj, int iflag, float eps, BIGINT ms, BIGINT mt,
+                    BIGINT mu, std::complex<float> *fk, finufft_opts *opts)
+// Type-2 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst
+{
+  return invokeGuruInterface(3, 2, n_transf, nj, xj, yj, zj, cj, iflag, eps,
+                             {ms, mt, mu}, 0, nullptr, nullptr, nullptr, fk, opts);
+}
+int finufft3d2(BIGINT nj, double *xj, double *yj, double *zj, std::complex<double> *cj,
+               int iflag, double eps, BIGINT ms, BIGINT mt, BIGINT mu,
+               std::complex<double> *fk, finufft_opts *opts)
+// Type-2 3D complex nonuniform FFT. See ../docs/usage.rst
 {
-  int n_dims   = 3;
-  int n_transf = 1;
-  int type     = 3;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps,
-                                NULL, nk, s, t, u, fk, opts);
-  return ier;
+  return finufft3d2many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts);
+}
+int finufftf3d2(BIGINT nj, float *xj, float *yj, float *zj, std::complex<float> *cj,
+                int iflag, float eps, BIGINT ms, BIGINT mt, BIGINT mu,
+                std::complex<float> *fk, finufft_opts *opts)
+// Type-2 3D complex nonuniform FFT. See ../docs/usage.rst
+{
+  return finufftf3d2many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts);
 }

-int FINUFFT3D3MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag,
-                   FLT eps, BIGINT nk, FLT *s, FLT *t, FLT *u, CPX *fk,
-                   finufft_opts *opts)
+int finufft3d3many(int n_transf, BIGINT nj, double *xj, double *yj, double *zj,
+                   std::complex<double> *cj, int iflag, double eps, BIGINT nk, double *s,
+                   double *t, double *u, std::complex<double> *fk, finufft_opts *opts)
+// Type-3 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst
+{
+  return invokeGuruInterface(3, 3, n_transf, nj, xj, yj, zj, cj, iflag, eps,
+                             {0, 0, 0}, nk, s, t, u, fk, opts);
+}
+int finufftf3d3many(int n_transf, BIGINT nj, float *xj, float *yj, float *zj,
+                    std::complex<float> *cj, int iflag, float eps, BIGINT nk, float *s,
+                    float *t, float *u, std::complex<float> *fk, finufft_opts *opts)
 // Type-3 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst
 {
-  int n_dims = 3;
-  int type   = 3;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps,
-                                NULL, nk, s, t, u, fk, opts);
-  return ier;
+  return invokeGuruInterface(3, 3, n_transf, nj, xj, yj, zj, cj, iflag, eps,
+                             {0, 0, 0}, nk, s, t, u, fk, opts);
+}
+int finufft3d3(BIGINT nj, double *xj, double *yj, double *zj, std::complex<double> *cj,
+               int iflag, double eps, BIGINT nk, double *s, double *t, double *u,
+               std::complex<double> *fk, finufft_opts *opts)
+// Type-3 3D complex nonuniform FFT. See ../docs/usage.rst
+{
+  return finufft3d3many(1, nj, xj, yj, zj, cj, iflag, eps, nk, s, t, u, fk, opts);
+}
+int finufftf3d3(BIGINT nj, float *xj, float *yj, float *zj, std::complex<float> *cj,
+                int iflag, float eps, BIGINT nk, float *s, float *t, float *u,
+                std::complex<float> *fk, finufft_opts *opts)
+// Type-3 3D complex nonuniform FFT. See ../docs/usage.rst
+{
+  return finufftf3d3many(1, nj, xj, yj, zj, cj, iflag, eps, nk, s, t, u, fk, opts);
 }
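
The refactored simple interfaces above are now thin shims over the *many/guru path, with explicit double- and float-precision symbols replacing the old FLT/CPX macro expansion. A minimal sketch of a call through the retained public API (assumes the public header finufft.h; sizes, data, and tolerance are illustrative):

#include <complex>
#include <vector>
#include <finufft.h>

int demo_type1_2d() {
  const long M = 1000, ms = 64, mt = 64;              // NU pts, output mode sizes
  std::vector<double> x(M, 0.1), y(M, -0.2);          // coords in [-pi, pi)
  std::vector<std::complex<double>> c(M, {1.0, 0.0}); // source strengths
  std::vector<std::complex<double>> f(ms * mt);       // ms*mt output modes
  // +1 sign of i in the exponential, 1e-9 tolerance, default opts (nullptr):
  return finufft2d1(M, x.data(), y.data(), c.data(), +1, 1e-9, ms, mt,
                    f.data(), nullptr);               // 0 on success
}
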
diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp
index 12327c2d6..7c7309de2 100644
--- a/src/spreadinterp.cpp
+++ b/src/spreadinterp.cpp
@@ -1,10 +1,8 @@
-// Spreading/interpolating module within FINUFFT. Uses precision-switching
-// macros for FLT, CPX, etc.
+// Spreading/interpolating module within FINUFFT.

 #include
 #include
 #include
-#include

 #include "ker_horner_allw_loop_constexpr.h"
 #include "ker_lowupsampfac_horner_allw_loop_constexpr.h"
@@ -23,896 +21,410 @@ namespace finufft::spreadinterp {

 namespace { // anonymous namespace for internal structs equivalent to declaring everything
             // static
-struct zip_low;
-struct zip_hi;
-template<uint8_t cap> struct reverse_index;
-template<uint8_t cap> struct shuffle_index;
-struct select_even;
-struct select_odd;
-// forward declaration to clean up the code and be able to use this everywhere in the file
-template<class T, uint8_t N, uint8_t K = N> static constexpr auto BestSIMDHelper();
-template<class T, uint8_t N> constexpr auto GetPaddedSIMDWidth();
+struct zip_low {
+  // helper struct to get the lower half of a SIMD register and zip it with itself
+  // it returns index 0, 0, 1, 1, ... N/2-1, N/2-1
+  static constexpr unsigned get(unsigned index, unsigned /*size*/) { return index / 2; }
+};
+struct zip_hi {
+  // helper struct to get the upper half of a SIMD register and zip it with itself
+  // it returns index N/2, N/2, N/2+1, N/2+1, ... N-1, N-1
+  static constexpr unsigned get(unsigned index, unsigned size) {
+    return (size + index) / 2;
+  }
+};
+template<uint8_t cap> struct reverse_index {
+  static constexpr unsigned get(unsigned index, const unsigned size) {
+    return index < cap ? (cap - 1 - index) : index;
+  }
+};
+template<uint8_t cap> struct shuffle_index {
+  static constexpr unsigned get(unsigned index, const unsigned size) {
+    return index < cap ? (cap - 1 - index) : size + size + cap - 1 - index;
+  }
+};
+struct select_even {
+  static constexpr unsigned get(unsigned index, unsigned /*size*/) { return index * 2; }
+};
+struct select_odd {
+  static constexpr unsigned get(unsigned index, unsigned /*size*/) {
+    return index * 2 + 1;
+  }
+};
+
+// this finds the largest SIMD instruction set that can handle N elements
+// void otherwise -> compile error
+template<class T, uint8_t N, uint8_t K = N> static constexpr auto BestSIMDHelper() {
+  if constexpr (N % K == 0) { // returns void in the worst case
+    return xsimd::make_sized_batch<T, K>{};
+  } else {
+    return BestSIMDHelper<T, N, (K >> 1)>();
+  }
+}
+template<class T, uint8_t N = 1> constexpr uint8_t min_simd_width() {
+  // finds the smallest simd width that can handle N elements
+  // the simd size (batch size, in xsimd terminology) is the SIMD width
+  if constexpr (std::is_void_v<xsimd::make_sized_batch_t<T, N>>) {
+    return min_simd_width<T, N * 2>();
+  } else {
+    return N;
+  }
+};
+
+template<class T, uint8_t N> constexpr auto find_optimal_simd_width() {
+  // finds the smallest simd width that minimizes the number of iterations
+  // NOTE: might be suboptimal for some cases, e.g. 2^N+1
+  // in the future we might want to implement a more sophisticated algorithm
+  uint8_t optimal_simd_width = min_simd_width<T>();
+  uint8_t min_iterations     = (N + optimal_simd_width - 1) / optimal_simd_width;
+  for (uint8_t simd_width = optimal_simd_width;
+       simd_width <= xsimd::batch<T>::size;
+       simd_width *= 2) {
+    uint8_t iterations = (N + simd_width - 1) / simd_width;
+    if (iterations < min_iterations) {
+      min_iterations     = iterations;
+      optimal_simd_width = simd_width;
+    }
+  }
+  return optimal_simd_width;
+}
+
+template<class T, uint8_t N> constexpr auto GetPaddedSIMDWidth() {
+  // helper function to get the SIMD width with padding for the given number of elements
+  // that minimizes the number of iterations
+  return xsimd::make_sized_batch<T, find_optimal_simd_width<T, N>()>::type::size;
+}
 template<class T, uint8_t ns>
 using PaddedSIMD = typename xsimd::make_sized_batch<T, GetPaddedSIMDWidth<T, ns>()>::type;
-template<class T> uint8_t get_padding(uint8_t ns);
-template<class T, uint8_t ns> constexpr auto get_padding();
+template<class T, uint8_t ns> constexpr auto get_padding() {
+  // helper function to get the padding for the given number of elements
+  // ns is known at compile time, rounds ns to the next multiple of the SIMD width
+  // then subtracts ns to get the padding using a bitwise-and trick
+  // WARNING: this trick works only for powers of 2
+  // SOURCE: Agner Fog's VCL manual
+  constexpr uint8_t width = GetPaddedSIMDWidth<T, ns>();
+  return ((ns + width - 1) & (-width)) - ns;
+}
+
+template<class T, uint8_t ns> constexpr auto get_padding_helper(uint8_t runtime_ns) {
+  // helper function to get the padding for the given number of elements where ns is
+  // known at runtime; it uses recursion to find the padding
+  // this avoids having a function with a large number of switch cases,
+  // as GetPaddedSIMDWidth requires a compile-time value
+  // it cannot be a lambda function because of the template recursion
+  if constexpr (ns < 2) {
+    return 0;
+  } else {
+    if (runtime_ns == ns) {
+      return get_padding<T, ns>();
+    } else {
+      return get_padding_helper<T, ns - 1>(runtime_ns);
+    }
+  }
+}
+
+template<class T> uint8_t get_padding(uint8_t ns) {
+  // return the padding as a function of the number of elements
+  // 2 * MAX_NSPREAD is the maximum number of elements that we can have
+  // that's why it is hardcoded here
+  return get_padding_helper<T, 2 * MAX_NSPREAD>(ns);
+}
 template<class T, uint8_t N>
 using BestSIMD = typename decltype(BestSIMDHelper<T, N, xsimd::batch<T>::size>())::type;
-template<class T, uint8_t N = 1> constexpr uint8_t min_simd_width();
-template<class T, uint8_t N> constexpr auto find_optimal_simd_width();
+template<class T, class V, size_t... Is>
+constexpr T generate_sequence_impl(V a, V b, index_sequence<Is...>) noexcept {
+  // utility function to generate a
sequence of a, b interleaved as function arguments + return T(((Is % 2 == 0) ? a : b)...); +} + template -constexpr auto initialize_complex_register(V a, V b) noexcept; -template +constexpr auto initialize_complex_register(V a, V b) noexcept { + // populates a SIMD register with a and b interleaved + // for example: + // +-------------------------------+ + // | a | b | a | b | a | b | a | b | + // +-------------------------------+ + // it uses index_sequence to generate the sequence of a, b at compile time + return generate_sequence_impl(a, b, std::make_index_sequence{}); +} +template constexpr auto zip_low_index = - xsimd::make_batch_constant, arch_t, zip_low>(); -template + xsimd::make_batch_constant, arch_t, zip_low>(); +template constexpr auto zip_hi_index = - xsimd::make_batch_constant, arch_t, zip_hi>(); -template -constexpr auto select_even_mask = - xsimd::make_batch_constant, arch_t, select_even>(); -template -constexpr auto select_odd_mask = - xsimd::make_batch_constant, arch_t, select_odd>(); + xsimd::make_batch_constant, arch_t, zip_hi>(); +// template +// constexpr auto select_even_mask = +// xsimd::make_batch_constant, arch_t, select_even>(); +// template +// constexpr auto select_odd_mask = +// xsimd::make_batch_constant, arch_t, select_odd>(); template constexpr std::array, N> pad_2D_array_with_zeros( - const std::array, N> &input) noexcept; -template FINUFFT_ALWAYS_INLINE auto xsimd_to_array(const T &vec) noexcept; + const std::array, N> &input) noexcept { + constexpr auto pad_with_zeros = [](const auto &input) constexpr noexcept { + std::array padded{0}; + for (auto i = 0; i < input.size(); ++i) { + padded[i] = input[i]; + } + return padded; + }; + std::array, N> output{}; + for (std::size_t i = 0; i < N; ++i) { + output[i] = pad_with_zeros(input[i]); + } + return output; +} + +template FINUFFT_ALWAYS_INLINE auto xsimd_to_array(const T &vec) noexcept { + constexpr auto alignment = T::arch_type::alignment(); + alignas(alignment) std::array array{}; + vec.store_aligned(array.data()); + return array; +} FINUFFT_NEVER_INLINE void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset3, UBIGINT padded_size1, UBIGINT size1, UBIGINT size2, UBIGINT size3, - UBIGINT M0); + UBIGINT M0) { + printf("size1 %ld, padded_size1 %ld\n", size1, padded_size1); + switch (ndims) { + case 1: + printf("\tsubgrid: off %lld\t siz %lld\t #NU %lld\n", (long long)offset1, + (long long)padded_size1, (long long)M0); + break; + case 2: + printf("\tsubgrid: off %lld,%lld\t siz %lld,%lld\t #NU %lld\n", (long long)offset1, + (long long)offset2, (long long)padded_size1, (long long)size2, (long long)M0); + break; + case 3: + printf("\tsubgrid: off %lld,%lld,%lld\t siz %lld,%lld,%lld\t #NU %lld\n", + (long long)offset1, (long long)offset2, (long long)offset3, + (long long)padded_size1, (long long)size2, (long long)size3, (long long)M0); + break; + default: + printf("Invalid number of dimensions: %d\n", ndims); + break; + } +} } // namespace // declarations of purely internal functions... (thus need not be in .h) -template()>, - typename... V> -static FINUFFT_ALWAYS_INLINE auto ker_eval(FLT *FINUFFT_RESTRICT ker, - const finufft_spread_opts &opts, - const V... 
elems) noexcept; -static FINUFFT_ALWAYS_INLINE FLT fold_rescale(FLT x, UBIGINT N) noexcept; -template -FINUFFT_ALWAYS_INLINE static simd_type fold_rescale(const simd_type &x, - UBIGINT N) noexcept; -static FINUFFT_ALWAYS_INLINE void set_kernel_args( - FLT *args, FLT x, const finufft_spread_opts &opts) noexcept; -static FINUFFT_ALWAYS_INLINE void evaluate_kernel_vector( - FLT *ker, FLT *args, const finufft_spread_opts &opts) noexcept; -template()>> // aka ns -static FINUFFT_ALWAYS_INLINE void eval_kernel_vec_Horner( - FLT *FINUFFT_RESTRICT ker, FLT x, const finufft_spread_opts &opts) noexcept; -template> -static void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker, - BIGINT i1, UBIGINT N1); -template> -static void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, - const FLT *ker2, BIGINT i1, BIGINT i2, UBIGINT N1, UBIGINT N2); -template> -static void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, - const FLT *ker2, const FLT *ker3, BIGINT i1, BIGINT i2, BIGINT i3, - UBIGINT N1, UBIGINT N2, UBIGINT N3); -static void spread_subproblem_1d(BIGINT off1, UBIGINT size1, FLT *du0, UBIGINT M0, - FLT *kx0, FLT *dd0, - const finufft_spread_opts &opts) noexcept; -static void spread_subproblem_2d(BIGINT off1, BIGINT off2, UBIGINT size1, UBIGINT size2, - FLT *FINUFFT_RESTRICT du, UBIGINT M, const FLT *kx, - const FLT *ky, const FLT *dd, - const finufft_spread_opts &opts) noexcept; -static void spread_subproblem_3d(BIGINT off1, BIGINT off2, BIGINT off3, UBIGINT size1, - UBIGINT size2, UBIGINT size3, FLT *du0, UBIGINT M0, - FLT *kx0, FLT *ky0, FLT *kz0, FLT *dd0, - const finufft_spread_opts &opts) noexcept; -template -static void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3, - UBIGINT padded_size1, UBIGINT size1, UBIGINT size2, - UBIGINT size3, UBIGINT N1, UBIGINT N2, UBIGINT N3, - FLT *FINUFFT_RESTRICT data_uniform, const FLT *du0); -static void bin_sort_singlethread(BIGINT *ret, UBIGINT M, const FLT *kx, const FLT *ky, - const FLT *kz, UBIGINT N1, UBIGINT N2, UBIGINT N3, - double bin_size_x, double bin_size_y, double bin_size_z, - int debug); -void bin_sort_multithread(BIGINT *ret, UBIGINT M, FLT *kx, FLT *ky, FLT *kz, UBIGINT N1, - UBIGINT N2, UBIGINT N3, double bin_size_x, double bin_size_y, - double bin_size_z, int debug, int nthr); -static void get_subgrid(BIGINT &offset1, BIGINT &offset2, BIGINT &offset3, - BIGINT &padded_size1, BIGINT &size1, BIGINT &size2, BIGINT &size3, - UBIGINT M0, FLT *kx0, FLT *ky0, FLT *kz0, int ns, int ndims); - -// ========================================================================== -int spreadinterp(UBIGINT N1, UBIGINT N2, UBIGINT N3, FLT *data_uniform, UBIGINT M, - FLT *kx, FLT *ky, FLT *kz, FLT *data_nonuniform, - const finufft_spread_opts &opts) -/* ------------Spreader/interpolator for 1, 2, or 3 dimensions -------------- - If opts.spread_direction=1, evaluate, in the 1D case, - - N1-1 - data_nonuniform[j] = SUM phi(kx[j] - n) data_uniform[n], for j=0...M-1 - n=0 - - If opts.spread_direction=2, evaluate its transpose, in the 1D case, - - M-1 - data_uniform[n] = SUM phi(kx[j] - n) data_nonuniform[j], for n=0...N1-1 - j=0 - - In each case phi is the spreading kernel, which has support - [-opts.nspread/2,opts.nspread/2]. In 2D or 3D, the generalization with - product of 1D kernels is performed. - For 1D set N2=N3=1; for 2D set N3=1; for 3D set N1,N2,N3>1. - - Notes: - No particular normalization of the spreading kernel is assumed. 
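
The two formulas in the doc block above are worth seeing as code. A deliberately naive 1D reference (a sketch, not library API: it assumes pre-folded coordinates in [0, N1), zero-initialized outputs, and some scalar kernel phi with support [-nspread/2, nspread/2]):

#include <complex>

template<typename T>
void naive_spreadinterp_1d(int dir, long N1, std::complex<T> *data_uniform,
                           long M, const T *kx, std::complex<T> *data_nonuniform,
                           T (*phi)(T)) {
  for (long j = 0; j < M; ++j)
    for (long n = 0; n < N1; ++n) {
      const T w = phi(kx[j] - T(n));                 // kernel weight, 0 outside support
      if (dir == 1)
        data_nonuniform[j] += w * data_uniform[n];   // dir=1: evaluate at NU pts
      else
        data_uniform[n] += w * data_nonuniform[j];   // dir=2: the transpose
    }
}

The real code never forms this O(M*N1) sum: the kernel's finite support cuts each inner loop down to nspread terms, with periodic wrapping handled once per point.
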
- Uniform (U) points are centered at coords - [0,1,...,N1-1] in 1D, analogously in 2D and 3D. They are stored in x - fastest, y medium, z slowest ordering, up to however many - dimensions are relevant; note that this is Fortran-style ordering for an - array f(x,y,z), but C style for f[z][y][x]. This is to match the Fortran - interface of the original CMCL libraries. - Non-uniform (NU) points kx,ky,kz are real, and may lie in the central three - periods in each coordinate (these are folded into the central period). - The finufft_spread_opts struct must have been set up already by calling setup_kernel. - It is assumed that 2*opts.nspread < min(N1,N2,N3), so that the kernel - only ever wraps once when falls below 0 or off the top of a uniform grid - dimension. - - Inputs: - N1,N2,N3 - grid sizes in x (fastest), y (medium), z (slowest) respectively. - If N2==1, 1D spreading is done. If N3==1, 2D spreading. - Otherwise, 3D. - M - number of NU pts. - kx, ky, kz - length-M real arrays of NU point coordinates (only kx read in - 1D, only kx and ky read in 2D). - - These should lie in the box -pi<=kx<=pi. Points outside this domain are also - correctly folded back into this domain. - opts - spread/interp options struct, documented in ../include/finufft_spread_opts.h - - Inputs/Outputs: - data_uniform - output values on grid (dir=1) OR input grid data (dir=2) - data_nonuniform - input strengths of the sources (dir=1) - OR output values at targets (dir=2) - Returned value: - 0 indicates success; other values have meanings in ../docs/error.rst, with - following modifications: - 3 : one or more non-trivial box dimensions is less than 2.nspread. - 5 : failed allocate sort indices - Magland Dec 2016. Barnett openmp version, many speedups 1/16/17-2/16/17 - error codes 3/13/17. pirange 3/28/17. Rewritten 6/15/17. parallel sort 2/9/18 - No separate subprob indices in t-1 2/11/18. - sort_threads (since for M< +static FINUFFT_ALWAYS_INLINE T fold_rescale(const T x, const UBIGINT N) noexcept { + static constexpr const T x2pi = T(M_1_2PI); + const T result = x * x2pi + T(0.5); + return (result - floor(result)) * T(N); } -static constexpr uint8_t ndims_from_Ns(const UBIGINT N1, const UBIGINT N2, - const UBIGINT N3) -/* rule for getting number of spreading dimensions from the list of Ns per dim. - Split out, Barnett 7/26/18 -*/ +template +static FINUFFT_ALWAYS_INLINE simd_type fold_rescale(const simd_type &x, + const BIGINT N) noexcept { + const simd_type x2pi = T(M_1_2PI); + const simd_type result = xsimd::fma(x, x2pi, simd_type(0.5)); + return (result - xsimd::floor(result)) * simd_type(T(N)); +} +template +static void set_kernel_args(T *args, T x) noexcept +// Fills vector args[] with kernel arguments x, x+1, ..., x+ns-1. +// needed for the vectorized kernel eval of Ludvig af K. { - return 1 + (N2 > 1) + (N3 > 1); + for (int i = 0; i < ns; i++) args[i] = x + T(i); } +template +static void evaluate_kernel_vector(T *ker, T *args, + const finufft_spread_opts &opts) noexcept +/* Evaluate ES kernel for a vector of N arguments; by Ludvig af K. + If opts.kerpad true, args and ker must be allocated for Npad, and args is + written to (to pad to length Npad), only first N outputs are correct. + Barnett 4/24/18 option to pad to mult of 4 for better SIMD vectorization. + Rescaled so max is 1, Barnett 7/21/24 -int spreadcheck(UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M, FLT *kx, FLT *ky, FLT *kz, - const finufft_spread_opts &opts) -/* This does just the input checking and reporting for the spreader. 
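
The new templated fold_rescale above folds any real coordinate into the central period and rescales it to grid units in one fused expression. A scalar illustration (M_1_2PI is FINUFFT's constant for 1/(2*pi); the demo spells it out):

#include <cmath>

double fold_rescale_demo(double x, std::size_t N) {
  const double r = x * (0.5 / M_PI) + 0.5; // x/(2*pi) + 1/2
  return (r - std::floor(r)) * double(N);  // fractional part, scaled to [0, N)
}
// fold_rescale_demo(0.0, 100)      == 50.0  (box center)
// fold_rescale_demo(-M_PI, 100)    == 0.0   (left edge)
// fold_rescale_demo(3 * M_PI, 100) == 0.0   (a point three periods away folds back)
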
- See spreadinterp() for input arguments and meaning of returned value. - Split out by Melody Shih, Jun 2018. Finiteness chk Barnett 7/30/18. - Marco Barbone 5.8.24 removed bounds check as new foldrescale is not limited to - [-3pi,3pi) + Obsolete (replaced by Horner), but keep around for experimentation since + works for arbitrary beta. Formula must match reference implementation. */ { - // INPUT CHECKING & REPORTING .... cuboid not too small for spreading? - int minN = 2 * opts.nspread; - if (N1 < minN || (N2 > 1 && N2 < minN) || (N3 > 1 && N3 < minN)) { - fprintf(stderr, - "%s error: one or more non-trivial box dims is less than 2.nspread!\n", - __func__); - return FINUFFT_ERR_SPREAD_BOX_SMALL; - } - if (opts.spread_direction != 1 && opts.spread_direction != 2) { - fprintf(stderr, "%s error: opts.spread_direction must be 1 or 2!\n", __func__); - return FINUFFT_ERR_SPREAD_DIR; + T b = (T)opts.ES_beta; + T c = (T)opts.ES_c; + if (!(opts.flags & TF_OMIT_EVALUATE_KERNEL)) { + // Note (by Ludvig af K): Splitting kernel evaluation into two loops + // seems to benefit auto-vectorization. + // gcc 5.4 vectorizes first loop; gcc 7.2 vectorizes both loops + int Npad = N; + if (opts.kerpad) { // since always same branch, no speed hit + Npad = 4 * (1 + (N - 1) / 4); // pad N to mult of 4; help i7 GCC, not xeon + for (int i = N; i < Npad; ++i) // pad with 1-3 zeros for safe eval + args[i] = 0.0; + } + for (int i = 0; i < Npad; i++) { // Loop 1: Compute exponential arguments + // care! 1.0 is double... + ker[i] = b * (sqrt((T)1.0 - c * args[i] * args[i]) - (T)1.0); + } + if (!(opts.flags & TF_OMIT_EVALUATE_EXPONENTIAL)) + for (int i = 0; i < Npad; i++) // Loop 2: Compute exponentials + ker[i] = exp(ker[i]); + if (opts.kerpad) { + // padded part should be zero, in spread_subproblem_nd_kernels, there are + // out of bound writes to trg arrays + for (int i = N; i < Npad; ++i) ker[i] = 0.0; + } + } else { + for (int i = 0; i < N; i++) // dummy for timing only + ker[i] = 1.0; } - return 0; + // Separate check from arithmetic (Is this really needed? doesn't slow down) + for (int i = 0; i < N; i++) + if (abs(args[i]) >= (T)opts.ES_halfwidth) ker[i] = 0.0; } -int indexSort(BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M, - FLT *kx, FLT *ky, FLT *kz, const finufft_spread_opts &opts) -/* This makes a decision whether or not to sort the NU pts (influenced by - opts.sort), and if yes, calls either single- or multi-threaded bin sort, - writing reordered index list to sort_indices. If decided not to sort, the - identity permutation is written to sort_indices. - The permutation is designed to make RAM access close to contiguous, to - speed up spreading/interpolation, in the case of disordered NU points. - - Inputs: - M - number of input NU points. - kx,ky,kz - length-M arrays of real coords of NU pts. Domain is [-pi, pi), - points outside are folded in. - (only kz used in 1D, only kx and ky used in 2D.) - N1,N2,N3 - integer sizes of overall box (set N2=N3=1 for 1D, N3=1 for 2D). - 1 = x (fastest), 2 = y (medium), 3 = z (slowest). - opts - spreading options struct, see ../include/finufft_spread_opts.h - Outputs: - sort_indices - a good permutation of NU points. (User must preallocate - to length M.) Ie, kx[sort_indices[j]], j=0,..,M-1, is a good - ordering for the x-coords of NU pts, etc. - returned value - whether a sort was done (1) or not (0). 
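
The ES ("exp sqrt") evaluation that evaluate_kernel_vector above keeps alive for arbitrary beta reduces, per element, to the following scalar sketch (beta, c, and halfwidth correspond to opts.ES_beta, opts.ES_c = 4/ns^2, and opts.ES_halfwidth = ns/2; rescaled so the maximum is 1):

#include <cmath>

double es_kernel_ref(double x, double beta, double c, double halfwidth) {
  if (std::abs(x) >= halfwidth) return 0.0; // outside support [-ns/2, ns/2]
  return std::exp(beta * (std::sqrt(1.0 - c * x * x) - 1.0)); // peaks at 1 at x=0
}
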
+template()>> // aka ns +static FINUFFT_ALWAYS_INLINE void eval_kernel_vec_Horner( + T *FINUFFT_RESTRICT ker, T x, const finufft_spread_opts &opts) noexcept +/* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at +x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns. +This is the current evaluation method, since it's faster (except i7 w=16). +Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ - Barnett 2017; split out by Melody Shih, Jun 2018. Barnett nthr logic 2024. -*/ { - CNTime timer{}; - uint8_t ndims = ndims_from_Ns(N1, N2, N3); - auto N = N1 * N2 * N3; // U grid (periodic box) sizes + // scale so local grid offset z in[-1,1] + const T z = std::fma(T(2.0), x, T(w - 1)); + using arch_t = typename simd_type::arch_type; + static constexpr auto alignment = arch_t::alignment(); + static constexpr auto simd_size = simd_type::size; + static constexpr auto padded_ns = (w + simd_size - 1) & ~(simd_size - 1); + static constexpr auto horner_coeffs = []() constexpr noexcept { + if constexpr (upsampfact == 200) { + return get_horner_coeffs_200(); + } else if constexpr (upsampfact == 125) { + return get_horner_coeffs_125(); + } + }(); + static constexpr auto nc = horner_coeffs.size(); + static constexpr auto use_ker_sym = (simd_size < w); - // heuristic binning box size for U grid... affects performance: - double bin_size_x = 16, bin_size_y = 4, bin_size_z = 4; - // put in heuristics based on cache sizes (only useful for single-thread) ? + alignas(alignment) static constexpr auto padded_coeffs = + pad_2D_array_with_zeros(horner_coeffs); - int better_to_sort = - !(ndims == 1 && (opts.spread_direction == 2 || (M > 1000 * N1))); // 1D small-N or - // dir=2 case: - // don't sort + // use kernel symmetry trick if w > simd_size + if constexpr (use_ker_sym) { + static constexpr uint8_t tail = w % simd_size; + static constexpr uint8_t if_odd_degree = ((nc + 1) % 2); + static constexpr uint8_t offset_start = tail ? w - tail : w - simd_size; + static constexpr uint8_t end_idx = (w + (tail > 0)) / 2; + const simd_type zv{z}; + const auto z2v = zv * zv; - timer.start(); // if needed, sort all the NU pts... - int did_sort = 0; - auto maxnthr = MY_OMP_GET_MAX_THREADS(); // used if both below opts default - if (opts.nthreads > 0) - maxnthr = opts.nthreads; // user nthreads overrides, without limit - if (opts.sort_threads > 0) - maxnthr = opts.sort_threads; // high-priority override, also no limit - // At this point: maxnthr = the max threads sorting could use - // (we don't print warning here, since: no showwarn in spread_opts, and finufft - // already warned about it. spreadinterp-only advanced users will miss a warning) - if (opts.sort == 1 || (opts.sort == 2 && better_to_sort)) { - // store a good permutation ordering of all NU pts (dim=1,2 or 3) - int sort_debug = (opts.debug >= 2); // show timing output? - int sort_nthr = opts.sort_threads; // 0, or user max # threads for sort -#ifndef _OPENMP - sort_nthr = 1; // if single-threaded lib, override user -#endif - if (sort_nthr == 0) // multithreaded auto choice: when N>>M, one thread is better! - sort_nthr = (10 * M > N) ? 
maxnthr : 1; // heuristic
-    if (sort_nthr == 1)
-      bin_sort_singlethread(sort_indices, M, kx, ky, kz, N1, N2, N3, bin_size_x,
-                            bin_size_y, bin_size_z, sort_debug);
-    else // sort_nthr>1, user fixes # threads (>=2)
-      bin_sort_multithread(sort_indices, M, kx, ky, kz, N1, N2, N3, bin_size_x,
-                           bin_size_y, bin_size_z, sort_debug, sort_nthr);
-    if (opts.debug)
-      printf("\tsorted (%d threads):\t%.3g s\n", sort_nthr, timer.elapsedsec());
-    did_sort = 1;
+    // some xsimd constant for shuffle or inverse
+    static constexpr auto shuffle_batch = []() constexpr noexcept {
+      if constexpr (tail) {
+        return xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<T>, arch_t,
+                                          shuffle_index<tail>>();
+      } else {
+        return xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<T>, arch_t,
+                                          reverse_index<simd_size>>();
+      }
+    }();
+
+    // process simd vecs
+    simd_type k_prev, k_sym{0};
+    for (uint8_t i{0}, offset = offset_start; i < end_idx;
+         i += simd_size, offset -= simd_size) {
+      auto k_odd = [i]() constexpr noexcept {
+        if constexpr (if_odd_degree) {
+          return simd_type::load_aligned(padded_coeffs[0].data() + i);
+        } else {
+          return simd_type{0};
+        }
+      }();
+      auto k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i);
+      for (uint8_t j{1 + if_odd_degree}; j < nc; j += 2) {
+        const auto cji_odd  = simd_type::load_aligned(padded_coeffs[j].data() + i);
+        const auto cji_even = simd_type::load_aligned(padded_coeffs[j + 1].data() + i);
+        k_odd  = xsimd::fma(k_odd, z2v, cji_odd);
+        k_even = xsimd::fma(k_even, z2v, cji_even);
+      }
+      // left part
+      xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i);
+      // right part symmetric to the left part
+      if (offset >= end_idx) {
+        if constexpr (tail) {
+          // to use an aligned store, we need to shuffle the previous k_sym with the
+          // current k_sym
+          k_prev = k_sym;
+          k_sym  = xsimd::fnma(k_odd, zv, k_even);
+          xsimd::shuffle(k_sym, k_prev, shuffle_batch).store_aligned(ker + offset);
+        } else {
+          xsimd::swizzle(xsimd::fnma(k_odd, zv, k_even), shuffle_batch)
+              .store_aligned(ker + offset);
+        }
+      }
+    }
   } else {
-#pragma omp parallel for num_threads(maxnthr) schedule(static, 1000000)
-    for (BIGINT i = 0; i < M; i++) // here omp helps xeon, hinders i7
-      sort_indices[i] = i;         // the identity permutation
-    if (opts.debug)
-      printf("\tnot sorted (sort=%d): \t%.3g s\n", (int)opts.sort, timer.elapsedsec());
+    const simd_type zv(z);
+    for (uint8_t i = 0; i < w; i += simd_size) {
+      auto k = simd_type::load_aligned(padded_coeffs[0].data() + i);
+      for (uint8_t j = 1; j < nc; ++j) {
+        const auto cji = simd_type::load_aligned(padded_coeffs[j].data() + i);
+        k              = xsimd::fma(k, zv, cji);
+      }
+      k.store_aligned(ker + i);
+    }
   }
-  return did_sort;
 }

-int spreadinterpSorted(const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2,
-                       const UBIGINT N3, FLT *data_uniform, const UBIGINT M,
-                       FLT *FINUFFT_RESTRICT kx, FLT *FINUFFT_RESTRICT ky,
-                       FLT *FINUFFT_RESTRICT kz, FLT *FINUFFT_RESTRICT data_nonuniform,
-                       const finufft_spread_opts &opts, int did_sort)
-/* Logic to select the main spreading (dir=1) vs interpolation (dir=2) routine.
-   See spreadinterp() above for inputs arguments and definitions.
-   Return value should always be 0 (no error reporting).
-   Split out by Melody Shih, Jun 2018; renamed Barnett 5/20/20.
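
The symmetric loop above evaluates each polynomial split into even and odd parts in z^2, so every mirrored right-hand node comes nearly free: with E and O polynomials in z^2, p(z) = O(z^2)*z + E(z^2) (the fma) and p(-z) = E(z^2) - O(z^2)*z (the fnma). Stripped of the batching, padding, and even/odd split, the underlying Horner recurrence is just the following (sketch: coeffs stands in for padded_coeffs; w and nc are the compile-time constants):

#include <vector>

void horner_sketch(double z, int w, int nc,
                   const std::vector<std::vector<double>> &coeffs, double *ker) {
  for (int i = 0; i < w; ++i) {   // one piecewise polynomial per kernel node
    double k = coeffs[0][i];      // highest-degree coefficient first
    for (int j = 1; j < nc; ++j) k = k * z + coeffs[j][i];
    ker[i] = k;
  }
}
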
-*/ -{ - if (opts.spread_direction == 1) // ========= direction 1 (spreading) ======= - spreadSorted(sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform, - opts, did_sort); - - else // ================= direction 2 (interpolation) =========== - interpSorted(sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform, - opts); - - return 0; +template +static void interp_line_wrap(T *FINUFFT_RESTRICT target, const T *du, const T *ker, + const BIGINT i1, const UBIGINT N1) { + /* This function is called when the kernel wraps around the grid. It is + slower than interp_line. + M. Barbone July 2024: - moved the logic to a separate function + - using fused multiply-add (fma) for better performance + */ + std::array out{0}; + BIGINT j = i1; + if (i1 < 0) { // wraps at left + j += BIGINT(N1); + for (uint8_t dx = 0; dx < -i1; ++dx, ++j) { + out[0] = std::fma(du[2 * j], ker[dx], out[0]); + out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); + } + j -= BIGINT(N1); + for (uint8_t dx = -i1; dx < ns; ++dx, ++j) { + out[0] = std::fma(du[2 * j], ker[dx], out[0]); + out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); + } + } else if (i1 + ns >= N1) { // wraps at right + for (uint8_t dx = 0; dx < N1 - i1; ++dx, ++j) { + out[0] = std::fma(du[2 * j], ker[dx], out[0]); + out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); + } + j -= BIGINT(N1); + for (uint8_t dx = N1 - i1; dx < ns; ++dx, ++j) { + out[0] = std::fma(du[2 * j], ker[dx], out[0]); + out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); + } + } else { + // padding is okay for ker, but it might spill over du array + // so this checks for that case and does not explicitly vectorize + for (uint8_t dx = 0; dx < ns; ++dx, ++j) { + out[0] = std::fma(du[2 * j], ker[dx], out[0]); + out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); + } + } + target[0] = out[0]; + target[1] = out[1]; } -// -------------------------------------------------------------------------- -int spreadSorted(const BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, - FLT *FINUFFT_RESTRICT data_uniform, UBIGINT M, FLT *FINUFFT_RESTRICT kx, - FLT *FINUFFT_RESTRICT ky, FLT *FINUFFT_RESTRICT kz, - const FLT *data_nonuniform, const finufft_spread_opts &opts, - int did_sort) -// Spread NU pts in sorted order to a uniform grid. See spreadinterp() for doc. -{ - CNTime timer{}; - const auto ndims = ndims_from_Ns(N1, N2, N3); - const auto N = N1 * N2 * N3; // output array size - const auto ns = opts.nspread; // abbrev. for w, kernel width - auto nthr = MY_OMP_GET_MAX_THREADS(); // guess # threads to use to spread - if (opts.nthreads > 0) nthr = opts.nthreads; // user override, now without limit -#ifndef _OPENMP - nthr = 1; // single-threaded lib must override user -#endif - if (opts.debug) - printf("\tspread %dD (M=%lld; N1=%lld,N2=%lld,N3=%lld), nthr=%d\n", ndims, - (long long)M, (long long)N1, (long long)N2, (long long)N3, nthr); - timer.start(); - std::fill(data_uniform, data_uniform + 2 * N, 0.0); // zero the output array - if (opts.debug) printf("\tzero output array\t%.3g s\n", timer.elapsedsec()); - if (M == 0) // no NU pts, we're done - return 0; - - auto spread_single = (nthr == 1) || (M * 100 < N); // low-density heuristic? - spread_single = false; // for now - timer.start(); - if (spread_single) { // ------- Basic single-core t1 spreading ------ - for (UBIGINT j = 0; j < M; j++) { - // *** todo, not urgent - // ... (question is: will the index wrapping per NU pt slow it down?) 
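
interp_line_wrap, added above, trades a per-tap modulo for two explicit wrap segments. The behavior it must reproduce is the obvious periodic sum (slow reference sketch, not library code; interleaved re/im storage as in the library):

#include <complex>

template<typename T>
std::complex<T> interp_line_naive(const T *du, const T *ker, long i1, long N1, int ns) {
  std::complex<T> out{0, 0};
  for (int dx = 0; dx < ns; ++dx) {
    const long j = ((i1 + dx) % N1 + N1) % N1; // periodic fold; handles i1 < 0
    out += std::complex<T>(du[2 * j], du[2 * j + 1]) * ker[dx];
  }
  return out;
}
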
- } - if (opts.debug) printf("\tt1 simple spreading:\t%.3g s\n", timer.elapsedsec()); - } else { // ------- Fancy multi-core blocked t1 spreading ---- - // Splits sorted inds (jfm's advanced2), could double RAM. - // choose nb (# subprobs) via used nthreads: - auto nb = std::min((UBIGINT)nthr, M); // simply split one subprob per thr... - if (nb * (BIGINT)opts.max_subproblem_size < M) { // ...or more subprobs to cap size - nb = 1 + (M - 1) / opts.max_subproblem_size; // int div does - // ceil(M/opts.max_subproblem_size) - if (opts.debug) - printf("\tcapping subproblem sizes to max of %d\n", opts.max_subproblem_size); - } - if (M * 1000 < N) { // low-density heuristic: one thread per NU pt! - nb = M; - if (opts.debug) printf("\tusing low-density speed rescue nb=M...\n"); - } - if (!did_sort && nthr == 1) { - nb = 1; - if (opts.debug) printf("\tunsorted nthr=1: forcing single subproblem...\n"); - } - if (opts.debug && nthr > opts.atomic_threshold) - printf("\tnthr big: switching add_wrapped OMP from critical to atomic (!)\n"); - - std::vector brk(nb + 1); // NU index breakpoints defining nb subproblems - for (int p = 0; p <= nb; ++p) brk[p] = (M * p + nb - 1) / nb; - -#pragma omp parallel num_threads(nthr) - { - // local copies of NU pts and data for each subproblem - std::vector kx0{}, ky0{}, kz0{}, dd0{}, du0{}; -#pragma omp for schedule(dynamic, 1) // each is big - for (int isub = 0; isub < nb; isub++) { // Main loop through the subproblems - const auto M0 = brk[isub + 1] - brk[isub]; // # NU pts in this subproblem - // copy the location and data vectors for the nonuniform points - kx0.resize(M0); - ky0.resize(M0 * (N2 > 1)); - kz0.resize(M0 * (N3 > 1)); - dd0.resize(2 * M0); // complex strength data - for (auto j = 0; j < M0; j++) { // todo: can avoid this copying? 
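
The blocked spreader removed above (this hunk shows only the deletion side of the refactor) splits the M sorted points into nb contiguous subproblems via integer ceiling division. The breakpoint rule in isolation (sketch):

#include <vector>

std::vector<long> make_breakpoints(long M, int nb) {
  std::vector<long> brk(nb + 1);
  for (int p = 0; p <= nb; ++p) brk[p] = (M * p + nb - 1) / nb; // ceil(M*p/nb)
  return brk; // subproblem p owns sorted indices [brk[p], brk[p+1])
}
// make_breakpoints(10, 3) -> {0, 4, 7, 10}: batch sizes 4, 3, 3.
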
- const auto kk = sort_indices[j + brk[isub]]; // NU pt from subprob index list - kx0[j] = fold_rescale(kx[kk], N1); - if (N2 > 1) ky0[j] = fold_rescale(ky[kk], N2); - if (N3 > 1) kz0[j] = fold_rescale(kz[kk], N3); - dd0[j * 2] = data_nonuniform[kk * 2]; // real part - dd0[j * 2 + 1] = data_nonuniform[kk * 2 + 1]; // imag part - } - // get the subgrid which will include padding by roughly nspread/2 - // get_subgrid sets - BIGINT offset1, offset2, offset3, padded_size1, size1, size2, size3; - // sets offsets and sizes - get_subgrid(offset1, offset2, offset3, padded_size1, size1, size2, size3, M0, - kx0.data(), ky0.data(), kz0.data(), ns, ndims); - if (opts.debug > 1) { - print_subgrid_info(ndims, offset1, offset2, offset3, padded_size1, size1, size2, - size3, M0); - } - // allocate output data for this subgrid - du0.resize(2 * padded_size1 * size2 * size3); // complex - // Spread to subgrid without need for bounds checking or wrapping - if (!(opts.flags & TF_OMIT_SPREADING)) { - if (ndims == 1) - spread_subproblem_1d(offset1, padded_size1, du0.data(), M0, kx0.data(), - dd0.data(), opts); - else if (ndims == 2) - spread_subproblem_2d(offset1, offset2, padded_size1, size2, du0.data(), M0, - kx0.data(), ky0.data(), dd0.data(), opts); - else - spread_subproblem_3d(offset1, offset2, offset3, padded_size1, size2, size3, - du0.data(), M0, kx0.data(), ky0.data(), kz0.data(), - dd0.data(), opts); - } - // do the adding of subgrid to output - if (!(opts.flags & TF_OMIT_WRITE_TO_GRID)) { - if (nthr > opts.atomic_threshold) { // see above for debug reporting - add_wrapped_subgrid(offset1, offset2, offset3, padded_size1, size1, - size2, size3, N1, N2, N3, data_uniform, - du0.data()); // R Blackwell's atomic version - } else { -#pragma omp critical - add_wrapped_subgrid(offset1, offset2, offset3, padded_size1, size1, - size2, size3, N1, N2, N3, data_uniform, - du0.data()); - } - } - } // end main loop over subprobs - } - if (opts.debug) - printf("\tt1 fancy spread: \t%.3g s (%ld subprobs)\n", timer.elapsedsec(), nb); - } // end of choice of which t1 spread type to use - return 0; -}; - -// -------------------------------------------------------------------------- -template -FINUFFT_NEVER_INLINE static int interpSorted_kernel( - const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, - const FLT *data_uniform, const UBIGINT M, FLT *FINUFFT_RESTRICT kx, - FLT *FINUFFT_RESTRICT ky, FLT *FINUFFT_RESTRICT kz, - FLT *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts) -// Interpolate to NU pts in sorted order from a uniform grid. -// See spreadinterp() for doc. 
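
Each subproblem spreads onto a private padded subgrid du0, which is then folded into the periodic output grid under a critical section or atomics. The wrap-add in one dimension, real-valued for brevity (sketch only; the library version works on interleaved complex data in up to 3D):

void add_wrapped_subgrid_1d(long offset, long size, long N,
                            double *grid, const double *du0) {
  long j = ((offset % N) + N) % N; // fold the subgrid's global start index
  for (long i = 0; i < size; ++i) {
    grid[j] += du0[i];
    if (++j == N) j = 0;           // assumes size <= N, so it wraps at most once
  }
}
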
-{ - using simd_type = PaddedSIMD; - using arch_t = typename simd_type::arch_type; - static constexpr auto alignment = arch_t::alignment(); - static constexpr auto simd_size = simd_type::size; - static constexpr auto ns2 = ns * FLT(0.5); // half spread width, used as stencil shift - - CNTime timer{}; - const auto ndims = ndims_from_Ns(N1, N2, N3); - auto nthr = MY_OMP_GET_MAX_THREADS(); // guess # threads to use to interp - if (opts.nthreads > 0) nthr = opts.nthreads; // user override, now without limit -#ifndef _OPENMP - nthr = 1; // single-threaded lib must override user -#endif - if (opts.debug) - printf("\tinterp %dD (M=%lld; N1=%lld,N2=%lld,N3=%lld), nthr=%d\n", ndims, - (long long)M, (long long)N1, (long long)N2, (long long)N3, nthr); - timer.start(); -#pragma omp parallel num_threads(nthr) - { - static constexpr auto CHUNKSIZE = simd_size; // number of targets per chunk - alignas(alignment) UBIGINT jlist[CHUNKSIZE]; - alignas(alignment) FLT xjlist[CHUNKSIZE], yjlist[CHUNKSIZE], zjlist[CHUNKSIZE]; - alignas(alignment) FLT outbuf[2 * CHUNKSIZE]; - // Kernels: static alloc is faster, so we do it for up to 3D... - alignas(alignment) std::array kernel_values{0}; - auto *FINUFFT_RESTRICT ker1 = kernel_values.data(); - auto *FINUFFT_RESTRICT ker2 = kernel_values.data() + MAX_NSPREAD; - auto *FINUFFT_RESTRICT ker3 = kernel_values.data() + 2 * MAX_NSPREAD; - - // Loop over interpolation chunks - // main loop over NU trgs, interp each from U - // (note: windows omp doesn't like unsigned loop vars) -#pragma omp for schedule(dynamic, 1000) // assign threads to NU targ pts: - for (BIGINT i = 0; i < M; i += CHUNKSIZE) { - // Setup buffers for this chunk - const UBIGINT bufsize = (i + CHUNKSIZE > M) ? M - i : CHUNKSIZE; - for (int ibuf = 0; ibuf < bufsize; ibuf++) { - UBIGINT j = sort_indices[i + ibuf]; - jlist[ibuf] = j; - xjlist[ibuf] = fold_rescale(kx[j], N1); - if (ndims >= 2) yjlist[ibuf] = fold_rescale(ky[j], N2); - if (ndims == 3) zjlist[ibuf] = fold_rescale(kz[j], N3); - } - - // Loop over targets in chunk - for (int ibuf = 0; ibuf < bufsize; ibuf++) { - const auto xj = xjlist[ibuf]; - const auto yj = (ndims > 1) ? yjlist[ibuf] : 0; - const auto zj = (ndims > 2) ? zjlist[ibuf] : 0; - - auto *FINUFFT_RESTRICT target = outbuf + 2 * ibuf; - - // coords (x,y,z), spread block corner index (i1,i2,i3) of current NU targ - const auto i1 = BIGINT(std::ceil(xj - ns2)); // leftmost grid index - const auto i2 = (ndims > 1) ? BIGINT(std::ceil(yj - ns2)) : 0; // min y grid index - const auto i3 = (ndims > 2) ? BIGINT(std::ceil(zj - ns2)) : 0; // min z grid index - - const auto x1 = std::ceil(xj - ns2) - xj; // shift of ker center, in [-w/2,-w/2+1] - const auto x2 = (ndims > 1) ? std::ceil(yj - ns2) - yj : 0; - const auto x3 = (ndims > 2) ? std::ceil(zj - ns2) - zj : 0; - - // eval kernel values patch and use to interpolate from uniform data... 
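
The stencil placement above works as follows: for a folded target coordinate xj, the kernel touches ns consecutive grid points starting at i1, and x1 is the fractional shift handed to the kernel evaluator, by construction in [-ns/2, -ns/2 + 1]. In isolation (sketch):

#include <cmath>

void stencil_placement(double xj, int ns, long &i1, double &x1) {
  const double ns2 = 0.5 * ns;    // half the kernel width
  i1 = long(std::ceil(xj - ns2)); // leftmost grid index covered
  x1 = std::ceil(xj - ns2) - xj;  // kernel-center shift
}
// e.g. xj = 37.3, ns = 8: i1 = ceil(33.3) = 34, x1 = 34 - 37.3 = -3.3
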
- if (!(opts.flags & TF_OMIT_SPREADING)) { - switch (ndims) { - case 1: - ker_eval(kernel_values.data(), opts, x1); - interp_line(target, data_uniform, ker1, i1, N1); - break; - case 2: - ker_eval(kernel_values.data(), opts, x1, x2); - interp_square(target, data_uniform, ker1, ker2, i1, i2, N1, - N2); - break; - case 3: - ker_eval(kernel_values.data(), opts, x1, x2, - x3); - interp_cube(target, data_uniform, ker1, ker2, ker3, i1, i2, i3, - N1, N2, N3); - break; - default: // can't get here - FINUFFT_UNREACHABLE; - break; - } - } - } // end loop over targets in chunk - - // Copy result buffer to output array - for (int ibuf = 0; ibuf < bufsize; ibuf++) { - const UBIGINT j = jlist[ibuf]; - data_nonuniform[2 * j] = outbuf[2 * ibuf]; - data_nonuniform[2 * j + 1] = outbuf[2 * ibuf + 1]; - } - - } // end NU targ loop - } // end parallel section - if (opts.debug) printf("\tt2 spreading loop: \t%.3g s\n", timer.elapsedsec()); - return 0; -} - -template -static int interpSorted_dispatch( - const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, - FLT *FINUFFT_RESTRICT data_uniform, const UBIGINT M, FLT *FINUFFT_RESTRICT kx, - FLT *FINUFFT_RESTRICT ky, FLT *FINUFFT_RESTRICT kz, - FLT *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts) { - static_assert(MIN_NSPREAD <= NS && NS <= MAX_NSPREAD, - "NS must be in the range (MIN_NSPREAD, MAX_NSPREAD)"); - if constexpr (NS == MIN_NSPREAD) { // Base case - if (opts.kerevalmeth) - return interpSorted_kernel( - sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform, opts); - else { - return interpSorted_kernel( - sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform, opts); - } - } else { - if (opts.nspread == NS) { - if (opts.kerevalmeth) { - return interpSorted_kernel(sort_indices, N1, N2, N3, data_uniform, M, - kx, ky, kz, data_nonuniform, opts); - } else { - return interpSorted_kernel(sort_indices, N1, N2, N3, data_uniform, M, - kx, ky, kz, data_nonuniform, opts); - } - } else { - return interpSorted_dispatch(sort_indices, N1, N2, N3, data_uniform, M, kx, - ky, kz, data_nonuniform, opts); - } - } -} - -int interpSorted(const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2, - const UBIGINT N3, FLT *FINUFFT_RESTRICT data_uniform, const UBIGINT M, - FLT *FINUFFT_RESTRICT kx, FLT *FINUFFT_RESTRICT ky, - FLT *FINUFFT_RESTRICT kz, FLT *FINUFFT_RESTRICT data_nonuniform, - const finufft_spread_opts &opts) { - return interpSorted_dispatch(sort_indices, N1, N2, N3, data_uniform, M, kx, - ky, kz, data_nonuniform, opts); -} - -/////////////////////////////////////////////////////////////////////////// - -int setup_spreader(finufft_spread_opts &opts, FLT eps, double upsampfac, int kerevalmeth, - int debug, int showwarn, int dim) -/* Initializes spreader kernel parameters given desired NUFFT tolerance eps, - upsampling factor (=sigma in paper, or R in Dutt-Rokhlin), ker eval meth - (either 0:exp(sqrt()), 1: Horner ppval), and some debug-level flags. - Also sets all default options in finufft_spread_opts. See finufft_spread_opts.h for - opts. dim is spatial dimension (1,2, or 3). See finufft.cpp:finufft_plan() for where - upsampfac is set. Must call this before any kernel evals done, otherwise segfault - likely. Returns: 0 : success FINUFFT_WARN_EPS_TOO_SMALL : requested eps cannot be - achieved, but proceed with best possible eps otherwise : failure (see codes in defs.h); - spreading must not proceed Barnett 2017. debug, loosened eps logic 6/14/20. 
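
interpSorted_dispatch above converts the runtime kernel width opts.nspread into a compile-time template parameter by walking NS down recursively, so each width gets a fully specialized kernel. The pattern in miniature (illustrative bounds 2..16; the real code also branches on kerevalmeth):

template<int NS> int run_kernel() { return NS; } // stand-in for the real kernel

template<int NS = 16> int dispatch(int ns) {
  if constexpr (NS == 2) return run_kernel<2>(); // base case, like MIN_NSPREAD
  else return (ns == NS) ? run_kernel<NS>() : dispatch<NS - 1>(ns);
}
// dispatch(7) selects run_kernel<7>, picked at runtime from a set of
// instantiations fixed at compile time.
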
-*/ -{ - if (upsampfac != 2.0 && upsampfac != 1.25) { // nonstandard sigma - if (kerevalmeth == 1) { - fprintf(stderr, - "FINUFFT setup_spreader: nonstandard upsampfac=%.3g cannot be handled by " - "kerevalmeth=1\n", - upsampfac); - return FINUFFT_ERR_HORNER_WRONG_BETA; - } - if (upsampfac <= 1.0) { // no digits would result - fprintf(stderr, "FINUFFT setup_spreader: error, upsampfac=%.3g is <=1.0\n", - upsampfac); - return FINUFFT_ERR_UPSAMPFAC_TOO_SMALL; - } - // calling routine must abort on above errors, since opts is garbage! - if (showwarn && upsampfac > 4.0) - fprintf(stderr, - "FINUFFT setup_spreader warning: upsampfac=%.3g way too large to be " - "beneficial.\n", - upsampfac); - } - - // write out default finufft_spread_opts (some overridden in setup_spreader_for_nufft) - opts.spread_direction = 0; // user should always set to 1 or 2 as desired - opts.sort = 2; // 2:auto-choice - opts.kerpad = 0; // affects only evaluate_kernel_vector - opts.kerevalmeth = kerevalmeth; - opts.upsampfac = upsampfac; - opts.nthreads = 0; // all avail - opts.sort_threads = 0; // 0:auto-choice - // heuristic dir=1 chunking for nthr>>1, typical for intel i7 and skylake... - opts.max_subproblem_size = (dim == 1) ? 10000 : 100000; - opts.flags = 0; // 0:no timing flags (>0 for experts only) - opts.debug = 0; // 0:no debug output - // heuristic nthr above which switch OMP critical to atomic (add_wrapped...): - opts.atomic_threshold = 10; // R Blackwell's value - - int ns, ier = 0; // Set kernel width w (aka ns, nspread) then copy to opts... - if (eps < EPSILON) { // safety; there's no hope of beating e_mach - if (showwarn) - fprintf(stderr, "%s warning: increasing tol=%.3g to eps_mach=%.3g.\n", __func__, - (double)eps, (double)EPSILON); - eps = EPSILON; // only changes local copy (not any opts) - ier = FINUFFT_WARN_EPS_TOO_SMALL; - } - if (upsampfac == 2.0) // standard sigma (see SISC paper) - ns = std::ceil(-log10(eps / (FLT)10.0)); // 1 digit per power of 10 - else // custom sigma - ns = std::ceil(-log(eps) / (PI * sqrt(1.0 - 1.0 / upsampfac))); // formula, gam=1 - ns = max(2, ns); // (we don't have ns=1 version yet) - if (ns > MAX_NSPREAD) { // clip to fit allocated arrays, Horner rules - if (showwarn) - fprintf(stderr, - "%s warning: at upsampfac=%.3g, tol=%.3g would need kernel width ns=%d; " - "clipping to max %d.\n", - __func__, upsampfac, (double)eps, ns, MAX_NSPREAD); - ns = MAX_NSPREAD; - ier = FINUFFT_WARN_EPS_TOO_SMALL; - } - opts.nspread = ns; - // setup for reference kernel eval (via formula): select beta width param... - // (even when kerevalmeth=1, this ker eval needed for FTs in onedim_*_kernel) - opts.ES_halfwidth = (double)ns / 2; // constants to help (see below routines) - opts.ES_c = 4.0 / (double)(ns * ns); - double betaoverns = 2.30; // gives decent betas for default sigma=2.0 - if (ns == 2) betaoverns = 2.20; // some small-width tweaks... - if (ns == 3) betaoverns = 2.26; - if (ns == 4) betaoverns = 2.38; - if (upsampfac != 2.0) { // again, override beta for custom sigma - FLT gamma = 0.97; // must match devel/gen_all_horner_C_code.m ! 
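
The width/beta selection above, distilled into a worked sketch (doubles only; the eps clipping to machine precision, the MAX_NSPREAD clamp, and the custom-sigma beta override via gamma are all omitted):

#include <algorithm>
#include <cmath>

void choose_ns_beta(double eps, double upsampfac, int &ns, double &beta) {
  if (upsampfac == 2.0)         // standard sigma: ~1 digit per point of width
    ns = (int)std::ceil(-std::log10(eps / 10.0));
  else                          // general-sigma formula, gamma = 1
    ns = (int)std::ceil(-std::log(eps) / (M_PI * std::sqrt(1.0 - 1.0 / upsampfac)));
  ns = std::max(2, ns);
  double betaoverns = 2.30;     // default sigma=2.0 tuning
  if (ns == 2) betaoverns = 2.20; // small-width tweaks
  if (ns == 3) betaoverns = 2.26;
  if (ns == 4) betaoverns = 2.38;
  beta = betaoverns * ns;
}
// eps = 1e-6 at sigma = 2.0 gives ns = 7 and beta = 2.30 * 7 = 16.1.
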
- betaoverns = gamma * PI * (1.0 - 1.0 / (2 * upsampfac)); // formula based on cutoff - } - opts.ES_beta = betaoverns * ns; // set the kernel beta parameter - if (debug) - printf("%s (kerevalmeth=%d) eps=%.3g sigma=%.3g: chose ns=%d beta=%.3g\n", __func__, - kerevalmeth, (double)eps, upsampfac, ns, opts.ES_beta); - - return ier; -} - -FLT evaluate_kernel(FLT x, const finufft_spread_opts &opts) -/* ES ("exp sqrt") kernel evaluation at single real argument: - phi(x) = exp(beta.(sqrt(1 - (2x/n_s)^2) - 1)), for |x| < nspread/2 - related to an asymptotic approximation to the Kaiser--Bessel, itself an - approximation to prolate spheroidal wavefunction (PSWF) of order 0. - This is the "reference implementation", used by eg finufft/onedim_* 2/17/17. - Rescaled so max is 1, Barnett 7/21/24 -*/ -{ - if (abs(x) >= (FLT)opts.ES_halfwidth) - // if spreading/FT careful, shouldn't need this if, but causes no speed hit - return 0.0; - else - return exp((FLT)opts.ES_beta * (sqrt((FLT)1.0 - (FLT)opts.ES_c * x * x) - (FLT)1.0)); -} - -template -void set_kernel_args(FLT *args, FLT x) noexcept -// Fills vector args[] with kernel arguments x, x+1, ..., x+ns-1. -// needed for the vectorized kernel eval of Ludvig af K. -{ - for (int i = 0; i < ns; i++) args[i] = x + (FLT)i; -} -template -void evaluate_kernel_vector(FLT *ker, FLT *args, const finufft_spread_opts &opts) noexcept -/* Evaluate ES kernel for a vector of N arguments; by Ludvig af K. - If opts.kerpad true, args and ker must be allocated for Npad, and args is - written to (to pad to length Npad), only first N outputs are correct. - Barnett 4/24/18 option to pad to mult of 4 for better SIMD vectorization. - Rescaled so max is 1, Barnett 7/21/24 - - Obsolete (replaced by Horner), but keep around for experimentation since - works for arbitrary beta. Formula must match reference implementation. -*/ -{ - FLT b = (FLT)opts.ES_beta; - FLT c = (FLT)opts.ES_c; - if (!(opts.flags & TF_OMIT_EVALUATE_KERNEL)) { - // Note (by Ludvig af K): Splitting kernel evaluation into two loops - // seems to benefit auto-vectorization. - // gcc 5.4 vectorizes first loop; gcc 7.2 vectorizes both loops - int Npad = N; - if (opts.kerpad) { // since always same branch, no speed hit - Npad = 4 * (1 + (N - 1) / 4); // pad N to mult of 4; help i7 GCC, not xeon - for (int i = N; i < Npad; ++i) // pad with 1-3 zeros for safe eval - args[i] = 0.0; - } - for (int i = 0; i < Npad; i++) { // Loop 1: Compute exponential arguments - // care! 1.0 is double... - ker[i] = b * (sqrt((FLT)1.0 - c * args[i] * args[i]) - (FLT)1.0); - } - if (!(opts.flags & TF_OMIT_EVALUATE_EXPONENTIAL)) - for (int i = 0; i < Npad; i++) // Loop 2: Compute exponentials - ker[i] = exp(ker[i]); - if (opts.kerpad) { - // padded part should be zero, in spread_subproblem_nd_kernels, there are - // out of bound writes to trg arrays - for (int i = N; i < Npad; ++i) ker[i] = 0.0; - } - } else { - for (int i = 0; i < N; i++) // dummy for timing only - ker[i] = 1.0; - } - // Separate check from arithmetic (Is this really needed? doesn't slow down) - for (int i = 0; i < N; i++) - if (abs(args[i]) >= (FLT)opts.ES_halfwidth) ker[i] = 0.0; -} - -template // aka ns -void eval_kernel_vec_Horner(FLT *FINUFFT_RESTRICT ker, const FLT x, - const finufft_spread_opts &opts) noexcept -/* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at -x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns. -This is the current evaluation method, since it's faster (except i7 w=16). -Two upsampfacs implemented. 
Params must match ref formula. Barnett 4/24/18 */ - -{ - // scale so local grid offset z in[-1,1] - const FLT z = std::fma(FLT(2.0), x, FLT(w - 1)); - using arch_t = typename simd_type::arch_type; - static constexpr auto alignment = arch_t::alignment(); - static constexpr auto simd_size = simd_type::size; - static constexpr auto padded_ns = (w + simd_size - 1) & ~(simd_size - 1); - static constexpr auto horner_coeffs = []() constexpr noexcept { - if constexpr (upsampfact == 200) { - return get_horner_coeffs_200(); - } else if constexpr (upsampfact == 125) { - return get_horner_coeffs_125(); - } - }(); - static constexpr auto nc = horner_coeffs.size(); - static constexpr auto use_ker_sym = (simd_size < w); - - alignas(alignment) static constexpr auto padded_coeffs = - pad_2D_array_with_zeros(horner_coeffs); - - // use kernel symmetry trick if w > simd_size - if constexpr (use_ker_sym) { - static constexpr uint8_t tail = w % simd_size; - static constexpr uint8_t if_odd_degree = ((nc + 1) % 2); - static constexpr uint8_t offset_start = tail ? w - tail : w - simd_size; - static constexpr uint8_t end_idx = (w + (tail > 0)) / 2; - const simd_type zv{z}; - const auto z2v = zv * zv; - - // some xsimd constant for shuffle or inverse - static constexpr auto shuffle_batch = []() constexpr noexcept { - if constexpr (tail) { - return xsimd::make_batch_constant, arch_t, - shuffle_index>(); - } else { - return xsimd::make_batch_constant, arch_t, - reverse_index>(); - } - }(); - - // process simd vecs - simd_type k_prev, k_sym{0}; - for (uint8_t i{0}, offset = offset_start; i < end_idx; - i += simd_size, offset -= simd_size) { - auto k_odd = [i]() constexpr noexcept { - if constexpr (if_odd_degree) { - return simd_type::load_aligned(padded_coeffs[0].data() + i); - } else { - return simd_type{0}; - } - }(); - auto k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i); - for (uint8_t j{1 + if_odd_degree}; j < nc; j += 2) { - const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i); - const auto cji_even = simd_type::load_aligned(padded_coeffs[j + 1].data() + i); - k_odd = xsimd::fma(k_odd, z2v, cji_odd); - k_even = xsimd::fma(k_even, z2v, cji_even); - } - // left part - xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i); - // right part symmetric to the left part - if (offset >= end_idx) { - if constexpr (tail) { - // to use aligned store, we need shuffle the previous k_sym and current k_sym - k_prev = k_sym; - k_sym = xsimd::fnma(k_odd, zv, k_even); - xsimd::shuffle(k_sym, k_prev, shuffle_batch).store_aligned(ker + offset); - } else { - xsimd::swizzle(xsimd::fnma(k_odd, zv, k_even), shuffle_batch) - .store_aligned(ker + offset); - } - } - } - } else { - const simd_type zv(z); - for (uint8_t i = 0; i < w; i += simd_size) { - auto k = simd_type::load_aligned(padded_coeffs[0].data() + i); - for (uint8_t j = 1; j < nc; ++j) { - const auto cji = simd_type::load_aligned(padded_coeffs[j].data() + i); - k = xsimd::fma(k, zv, cji); - } - k.store_aligned(ker + i); - } - } -} - -template -static void interp_line_wrap(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker, - const BIGINT i1, const UBIGINT N1) { - /* This function is called when the kernel wraps around the grid. It is - slower than interp_line. - M. 
Barbone July 2024: - moved the logic to a separate function - - using fused multiply-add (fma) for better performance - */ - std::array out{0}; - BIGINT j = i1; - if (i1 < 0) { // wraps at left - j += BIGINT(N1); - for (uint8_t dx = 0; dx < -i1; ++dx, ++j) { - out[0] = std::fma(du[2 * j], ker[dx], out[0]); - out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); - } - j -= BIGINT(N1); - for (uint8_t dx = -i1; dx < ns; ++dx, ++j) { - out[0] = std::fma(du[2 * j], ker[dx], out[0]); - out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); - } - } else if (i1 + ns >= N1) { // wraps at right - for (uint8_t dx = 0; dx < N1 - i1; ++dx, ++j) { - out[0] = std::fma(du[2 * j], ker[dx], out[0]); - out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); - } - j -= BIGINT(N1); - for (uint8_t dx = N1 - i1; dx < ns; ++dx, ++j) { - out[0] = std::fma(du[2 * j], ker[dx], out[0]); - out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); - } - } else { - // padding is okay for ker, but it might spill over du array - // so this checks for that case and does not explicitly vectorize - for (uint8_t dx = 0; dx < ns; ++dx, ++j) { - out[0] = std::fma(du[2 * j], ker[dx], out[0]); - out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); - } - } - target[0] = out[0]; - target[1] = out[1]; -} - -template -void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker, - const BIGINT i1, const UBIGINT N1) { +template> +static void interp_line(T *FINUFFT_RESTRICT target, const T *du, const T *ker, BIGINT i1, + UBIGINT N1) { /* 1D interpolate complex values from size-ns block of the du (uniform grid data) array to a single complex output value "target", using as weights the 1d kernel evaluation list ker1. @@ -933,16 +445,16 @@ void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker, limitation */ using arch_t = typename simd_type::arch_type; - static constexpr auto padding = get_padding(); + static constexpr auto padding = get_padding(); static constexpr auto alignment = arch_t::alignment(); static constexpr auto simd_size = simd_type::size; static constexpr auto regular_part = (2 * ns + padding) & (-(2 * simd_size)); - std::array out{0}; + std::array out{0}; const auto j = i1; // removing the wrapping leads up to 10% speedup in certain cases // moved the wrapping to another function to reduce instruction cache pressure if (i1 < 0 || i1 + ns >= N1 || i1 + ns + (padding + 1) / 2 >= N1) { - return interp_line_wrap(target, du, ker, i1, N1); + return interp_line_wrap(target, du, ker, i1, N1); } else { // doesn't wrap // logic largely similar to spread 1D kernel, please see the explanation there // for the first part of this code @@ -953,8 +465,8 @@ void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker, const auto ker_v = simd_type::load_aligned(ker + dx / 2); const auto du_pt0 = simd_type::load_unaligned(du_ptr + dx); const auto du_pt1 = simd_type::load_unaligned(du_ptr + dx + simd_size); - const auto ker0low = xsimd::swizzle(ker_v, zip_low_index); - const auto ker0hi = xsimd::swizzle(ker_v, zip_hi_index); + const auto ker0low = xsimd::swizzle(ker_v, zip_low_index); + const auto ker0hi = xsimd::swizzle(ker_v, zip_hi_index); res_low = xsimd::fma(ker0low, du_pt0, res_low); res_hi = xsimd::fma(ker0hi, du_pt1, res_hi); } @@ -962,7 +474,7 @@ void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker, if constexpr (regular_part < 2 * ns) { const auto ker0 = simd_type::load_unaligned(ker + (regular_part / 2)); const auto du_pt = simd_type::load_unaligned(du_ptr + regular_part); - 
const auto ker0low = xsimd::swizzle(ker0, zip_low_index); + const auto ker0low = xsimd::swizzle(ker0, zip_low_index); res_low = xsimd::fma(ker0low, du_pt, res_low); } @@ -994,22 +506,22 @@ void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker, target[1] = out[1]; } -template -static void interp_square_wrap(FLT *FINUFFT_RESTRICT target, const FLT *du, - const FLT *ker1, const FLT *ker2, const BIGINT i1, - const BIGINT i2, const UBIGINT N1, const UBIGINT N2) { +template +static void interp_square_wrap(T *FINUFFT_RESTRICT target, const T *du, const T *ker1, + const T *ker2, const BIGINT i1, const BIGINT i2, + const UBIGINT N1, const UBIGINT N2) { /* * This function is called when the kernel wraps around the grid. It is slower than * the non wrapping version. * There is an extra case for when ker is padded and spills over the du array. * In this case uses the old non wrapping version. */ - std::array out{0}; + std::array out{0}; using arch_t = typename simd_type::arch_type; static constexpr auto alignment = arch_t::alignment(); if (i1 >= 0 && i1 + ns <= N1 && i2 >= 0 && i2 + ns <= N2) { // store a horiz line (interleaved real,imag) - alignas(alignment) std::array line{0}; + alignas(alignment) std::array line{0}; // add remaining const-y lines to the line (expensive inner loop) for (uint8_t dy{0}; dy < ns; ++dy) { const auto *l_ptr = du + 2 * (N1 * (i2 + dy) + i1); // (see above) @@ -1047,10 +559,9 @@ static void interp_square_wrap(FLT *FINUFFT_RESTRICT target, const FLT *du, target[1] = out[1]; } -template -void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, - const FLT *ker2, const BIGINT i1, const BIGINT i2, const UBIGINT N1, - const UBIGINT N2) +template> +static void interp_square(T *FINUFFT_RESTRICT target, const T *du, const T *ker1, + const T *ker2, BIGINT i1, BIGINT i2, UBIGINT N1, UBIGINT N2) /* 2D interpolate complex values from a ns*ns block of the du (uniform grid data) array to a single complex output value "target", using as weights the ns*ns outer product of the 1d kernel lists ker1 and ker2. 
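For orientation, the separable weighting just described is easier to read without SIMD. Below is a minimal scalar sketch; interp_square_scalar is a hypothetical name, not code from this patch, and it assumes interleaved real/imag storage (as documented above) and a block that never wraps the grid edge:

#include <cstdint>

// Scalar reference for the separable 2D interpolation described above:
// target = sum over (dy,dx) of ker2[dy] * ker1[dx] * du(i1+dx, i2+dy),
// on complex-interleaved data, assuming the ns*ns block stays in-grid.
template<int ns, typename T>
void interp_square_scalar(T *target, const T *du, const T *ker1, const T *ker2,
                          int64_t i1, int64_t i2, uint64_t N1) {
  T re = 0, im = 0;
  for (int dy = 0; dy < ns; ++dy) {
    const T *line = du + 2 * (N1 * (uint64_t)(i2 + dy) + (uint64_t)i1);
    T lre = 0, lim = 0;                   // accumulate one grid row against ker1
    for (int dx = 0; dx < ns; ++dx) {
      lre += ker1[dx] * line[2 * dx];     // real part
      lim += ker1[dx] * line[2 * dx + 1]; // imaginary part
    }
    re += ker2[dy] * lre;                 // weight the whole row by ker2[dy]
    im += ker2[dy] * lim;
  }
  target[0] = re;
  target[1] = im;
}

The vectorized version below computes exactly this double sum, but several interleaved lanes at a time, which is why the kernel weights are duplicated into low/high register halves before each FMA.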
@@ -1083,10 +594,10 @@ void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, The code is largely similar to 1D interpolation, please see the explanation there */ { - std::array out{0}; + std::array out{0}; // no wrapping: avoid ptrs using arch_t = typename simd_type::arch_type; - static constexpr auto padding = get_padding(); + static constexpr auto padding = get_padding(); static constexpr auto alignment = arch_t::alignment(); static constexpr auto simd_size = simd_type::size; static constexpr uint8_t line_vectors = (2 * ns + padding) / simd_size; @@ -1117,15 +628,15 @@ void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, for (uint8_t i{0}; i < (line_vectors & ~1); // NOLINT(*-too-small-loop-variable) i += 2) { const auto ker1_v = simd_type::load_aligned(ker1 + i * simd_size / 2); - const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index); - const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index); + const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index); + const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index); res_low = xsimd::fma(ker1low, line[i], res_low); res_hi = xsimd::fma(ker1hi, line[i + 1], res_hi); } if constexpr (line_vectors % 2) { const auto ker1_v = simd_type::load_aligned(ker1 + (line_vectors - 1) * simd_size / 2); - const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index); + const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index); res_low = xsimd::fma(ker1low, line.back(), res_low); } return res_low + res_hi; @@ -1138,17 +649,17 @@ void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, } else { // wraps somewhere: use ptr list // this is slower than above, but occurs much less often, with fractional // rate O(ns/min(N1,N2)). Thus this code doesn't need to be so optimized. - return interp_square_wrap(target, du, ker1, ker2, i1, i2, N1, N2); + return interp_square_wrap(target, du, ker1, ker2, i1, i2, N1, N2); } target[0] = out[0]; target[1] = out[1]; } -template -static void interp_cube_wrapped(FLT *FINUFFT_RESTRICT target, const FLT *du, - const FLT *ker1, const FLT *ker2, const FLT *ker3, - const BIGINT i1, const BIGINT i2, const BIGINT i3, - const UBIGINT N1, const UBIGINT N2, const UBIGINT N3) { +template +static void interp_cube_wrapped(T *FINUFFT_RESTRICT target, const T *du, const T *ker1, + const T *ker2, const T *ker3, const BIGINT i1, + const BIGINT i2, const BIGINT i3, const UBIGINT N1, + const UBIGINT N2, const UBIGINT N3) { /* * This function is called when the kernel wraps around the cube. * Similarly to 2D and 1D wrapping, this is slower than the non wrapping version. @@ -1158,14 +669,14 @@ static void interp_cube_wrapped(FLT *FINUFFT_RESTRICT target, const FLT *du, const auto in_bounds_1 = (i1 >= 0) & (i1 + ns <= N1); const auto in_bounds_2 = (i2 >= 0) & (i2 + ns <= N2); const auto in_bounds_3 = (i3 >= 0) & (i3 + ns <= N3); - std::array out{0}; + std::array out{0}; // case no wrapping needed but padding spills over du array. 
// Hence, no explicit vectorization but the code is still faster if (FINUFFT_LIKELY(in_bounds_1 && in_bounds_2 && in_bounds_3)) { // no wrapping: avoid ptrs (by far the most common case) // store a horiz line (interleaved real,imag) // initialize line with zeros; hard to avoid here, but overhead small in 3D - alignas(alignment) std::array line{0}; + alignas(alignment) std::array line{0}; // co-add y and z contributions to line in x; do not apply x kernel yet // This is expensive innermost loop for (uint8_t dz{0}; dz < ns; ++dz) { @@ -1217,10 +728,10 @@ static void interp_cube_wrapped(FLT *FINUFFT_RESTRICT target, const FLT *du, target[1] = out[1]; } -template -void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, - const FLT *ker2, const FLT *ker3, const BIGINT i1, const BIGINT i2, - const BIGINT i3, const UBIGINT N1, const UBIGINT N2, const UBIGINT N3) +template> +static void interp_cube(T *FINUFFT_RESTRICT target, const T *du, const T *ker1, + const T *ker2, const T *ker3, BIGINT i1, BIGINT i2, BIGINT i3, + UBIGINT N1, UBIGINT N2, UBIGINT N3) /* 3D interpolate complex values from a ns*ns*ns block of the du (uniform grid data) array to a single complex output value "target", using as weights the ns*ns*ns outer product of the 1d kernel lists ker1, ker2, and ker3. @@ -1251,7 +762,7 @@ void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, */ { using arch_t = typename simd_type::arch_type; - static constexpr auto padding = get_padding(); + static constexpr auto padding = get_padding(); static constexpr auto alignment = arch_t::alignment(); static constexpr auto simd_size = simd_type::size; static constexpr auto ker23_size = (ns + simd_size - 1) & -simd_size; @@ -1259,7 +770,7 @@ void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, const auto in_bounds_1 = (i1 >= 0) & (i1 + ns <= N1); const auto in_bounds_2 = (i2 >= 0) & (i2 + ns <= N2); const auto in_bounds_3 = (i3 >= 0) & (i3 + ns <= N3); - std::array out{0}; + std::array out{0}; if (in_bounds_1 && in_bounds_2 && in_bounds_3 && (i1 + ns + (padding + 1) / 2 < N1)) { const auto line = [N1, N2, i1 = UBIGINT(i1), i2 = UBIGINT(i2), i3 = UBIGINT(i3), ker2, ker3, du]() constexpr noexcept { @@ -1284,15 +795,15 @@ void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, for (uint8_t i{0}; i < (line_vectors & ~1); // NOLINT(*-too-small-loop-variable) i += 2) { const auto ker1_v = simd_type::load_aligned(ker1 + i * simd_size / 2); - const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index); - const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index); + const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index); + const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index); res_low = xsimd::fma(ker1low, line[i], res_low); res_hi = xsimd::fma(ker1hi, line[i + 1], res_hi); } if constexpr (line_vectors % 2) { const auto ker1_v = simd_type::load_aligned(ker1 + (line_vectors - 1) * simd_size / 2); - const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index); + const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index); res_low = xsimd::fma(ker1low, line.back(), res_low); } return res_low + res_hi; @@ -1303,17 +814,61 @@ void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, out[1] += res_array[i + 1]; } } else { - return interp_cube_wrapped(target, du, ker1, ker2, ker3, i1, i2, i3, - N1, N2, N3); + return interp_cube_wrapped(target, du, ker1, ker2, ker3, i1, i2, i3, + N1, N2, N3); } target[0] = out[0]; target[1] = out[1]; } -template 
+template<uint8_t ns, uint8_t kerevalmeth, typename T,
+         typename simd_type = xsimd::make_sized_batch_t<T, find_optimal_simd_width<T, ns>()>,
+         typename... V>
+static FINUFFT_ALWAYS_INLINE auto ker_eval(
+    T *FINUFFT_RESTRICT ker, const finufft_spread_opts &opts, const V... elems) noexcept {
+  /* Utility function that allows moving the kernel evaluation outside the spreader for
+     clarity
+     Inputs are:
+     ns = kernel width
+     kerevalmeth = kernel evaluation method
+     T = (single or double precision) type of the kernel
+     simd_type = xsimd::batch for Horner
+     vectorization (default is the optimal simd size)
+     finufft_spread_opts as Horner needs
+     the oversampling factor
+     elems = kernel arguments
+     Example usage is
+     ker_eval<ns, kerevalmeth>(opts, x, y, z) // for 3D or
+     ker_eval<ns, kerevalmeth>(opts, x, y) // for 2D or
+     ker_eval<ns, kerevalmeth>(opts, x) // for 1D
+  */
+  const std::array inputs{elems...};
+  // compile time loop, no performance overhead
+  for (auto i = 0; i < sizeof...(elems); ++i) {
+    // compile time branch no performance overhead
+    if constexpr (kerevalmeth == 1) {
+      if (opts.upsampfac == 2.0) {
+        eval_kernel_vec_Horner(ker + (i * MAX_NSPREAD), inputs[i],
+                               opts);
+      }
+      if (opts.upsampfac == 1.25) {
+        eval_kernel_vec_Horner(ker + (i * MAX_NSPREAD), inputs[i],
+                               opts);
+      }
+    }
+    if constexpr (kerevalmeth == 0) {
+      alignas(simd_type::arch_type::alignment()) std::array<T, MAX_NSPREAD> kernel_args{};
+      set_kernel_args(kernel_args.data(), inputs[i]);
+      evaluate_kernel_vector(ker + (i * MAX_NSPREAD), kernel_args.data(), opts);
+    }
+  }
+  return ker;
+}
+
+template<uint8_t ns, bool kerevalmeth, typename T>
 FINUFFT_NEVER_INLINE void spread_subproblem_1d_kernel(
-    const BIGINT off1, const UBIGINT size1, FLT *FINUFFT_RESTRICT du, const UBIGINT M,
-    const FLT *const kx, const FLT *const dd, const finufft_spread_opts &opts) noexcept {
+    const BIGINT off1, const UBIGINT size1, T *FINUFFT_RESTRICT du, const UBIGINT M,
+    const T *const kx, const T *const dd, const finufft_spread_opts &opts) noexcept {
   /* 1D spreader from nonuniform to uniform subproblem grid, without wrapping.
      Inputs:
      off1 - integer offset of left end of du subgrid from that of overall fine
@@ -1334,15 +889,15 @@ FINUFFT_NEVER_INLINE void spread_subproblem_1d_kernel(
      This needed off1 as extra arg. AHB 11/30/20.
      Vectorized using xsimd by M. Barbone 06/24. */
-  using simd_type = PaddedSIMD<FLT, 2 * ns>;
+  using simd_type = PaddedSIMD<T, 2 * ns>;
   using arch_t = typename simd_type::arch_type;
-  static constexpr auto padding = get_padding<FLT, 2 * ns>();
+  static constexpr auto padding = get_padding<T, 2 * ns>();
   static constexpr auto alignment = arch_t::alignment();
   static constexpr auto simd_size = simd_type::size;
-  static constexpr auto ns2 = ns * FLT(0.5); // half spread width
+  static constexpr auto ns2 = ns * T(0.5); // half spread width
   // something weird here. Reversing ker{0} and std fill causes ker
   // to be zeroed inside the loop GCC uses AVX, clang AVX2
-  alignas(alignment) std::array<FLT, MAX_NSPREAD> ker{0};
+  alignas(alignment) std::array<T, MAX_NSPREAD> ker{0};
   std::fill(du, du + 2 * size1, 0); // zero output
   // no padding needed if MAX_NSPREAD is 16
   // the largest read is 16 floats with avx512
@@ -1362,7 +917,7 @@ FINUFFT_NEVER_INLINE void spread_subproblem_1d_kernel(
     const auto dd_pt = initialize_complex_register(dd[i * 2], dd[i * 2 + 1]);
     // ceil offset, hence rounding, must match that in get_subgrid...
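    // Worked example of the index math below (illustrative values only): with
    // ns=4, ns2=2.0, and a point at kx[i]=10.3, i1=ceil(10.3-2.0)=9, so the
    // kernel writes to grid points 9..12; the kernel argument x1=9-10.3=-1.3
    // then lies in [-ns/2, -ns/2+1], as noted inside the lambda below.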
     const auto i1 = BIGINT(std::ceil(kx[i] - ns2)); // fine grid start index
-    // FLT(i1) has different semantics and results an extra cast
+    // T(i1) has different semantics and results in an extra cast
     const auto x1 = [i, kx]() constexpr noexcept {
       auto x1 = std::ceil(kx[i] - ns2) - kx[i]; // x1 in [-w/2,-w/2+1], up to rounding
       // However if N1*epsmach>O(1) then can cause O(1) errors in x1, hence ppoly
@@ -1374,8 +929,8 @@ FINUFFT_NEVER_INLINE void spread_subproblem_1d_kernel(
     }();
     // Libin improvement: pass ker as a parameter and allocate it outside the loop
     // gcc13 + 10% speedup
-    ker_eval<ns, kerevalmeth>(ker.data(), opts, x1);
-    // const auto ker = ker_eval<ns, kerevalmeth>(opts, x1);
+    ker_eval<ns, kerevalmeth, T, simd_type>(ker.data(), opts, x1);
+    // const auto ker = ker_eval<ns, kerevalmeth, T, simd_type>(opts, x1);
     const auto j = i1 - off1; // offset rel to subgrid, starts the output indices
     auto *FINUFFT_RESTRICT trg = du + 2 * j; // restrict helps compiler to vectorize
     // du is padded, so we can use SIMD even if we write more than ns values in du
@@ -1411,12 +966,12 @@ FINUFFT_NEVER_INLINE void spread_subproblem_1d_kernel(
       const auto du_pt1 = simd_type::load_unaligned(trg + dx + simd_size);
       // swizzle is faster than zip_lo(ker_v, ker_v) and zip_hi(ker_v, ker_v)
       // swizzle in this case is equivalent to zip_lo and zip_hi respectively
-      const auto ker0low = xsimd::swizzle(ker_v, zip_low_index<arch_t>);
+      const auto ker0low = xsimd::swizzle(ker_v, zip_low_index<arch_t, T>);
       // ker 0 looks like this now:
       // +-----------------------+
       // |y0|y0|y1|y1|y2|y2|y3|y3|
       // +-----------------------+
-      const auto ker0hi = xsimd::swizzle(ker_v, zip_hi_index<arch_t>);
+      const auto ker0hi = xsimd::swizzle(ker_v, zip_hi_index<arch_t, T>);
       // ker 1 looks like this now:
       // +-----------------------+
       // |y4|y4|y5|y5|y6|y6|y7|y7|
       // +-----------------------+
@@ -1443,17 +998,17 @@ FINUFFT_NEVER_INLINE void spread_subproblem_1d_kernel(
       // the corresponding memory is not accessed
       const auto ker0 = simd_type::load_unaligned(ker.data() + (regular_part / 2));
       const auto du_pt = simd_type::load_unaligned(trg + regular_part);
-      const auto ker0low = xsimd::swizzle(ker0, zip_low_index<arch_t>);
+      const auto ker0low = xsimd::swizzle(ker0, zip_low_index<arch_t, T>);
       const auto res = xsimd::fma(ker0low, dd_pt, du_pt);
       res.store_unaligned(trg + regular_part);
     }
   }
 }

-template<uint8_t NS>
+template<uint8_t NS, typename T>
 static void spread_subproblem_1d_dispatch(
-    const BIGINT off1, const UBIGINT size1, FLT *FINUFFT_RESTRICT du, const UBIGINT M,
-    const FLT *kx, const FLT *dd, const finufft_spread_opts &opts) noexcept {
+    const BIGINT off1, const UBIGINT size1, T *FINUFFT_RESTRICT du, const UBIGINT M,
+    const T *kx, const T *dd, const finufft_spread_opts &opts) noexcept {
   /* this is a dispatch function that will call the correct kernel based on the ns
      it recursively iterates from MAX_NSPREAD to MIN_NSPREAD
      it generates the following code:
@@ -1486,27 +1041,29 @@ static void spread_subproblem_1d_dispatch(
                 "NS must be in the range (MIN_NSPREAD, MAX_NSPREAD)");
   if constexpr (NS == MIN_NSPREAD) { // Base case
     if (opts.kerevalmeth)
-      return spread_subproblem_1d_kernel<MIN_NSPREAD, true>(off1, size1, du, M, kx, dd,
-                                                            opts);
+      return spread_subproblem_1d_kernel<MIN_NSPREAD, true, T>(off1, size1, du, M, kx, dd,
+                                                               opts);
     else {
-      return spread_subproblem_1d_kernel<MIN_NSPREAD, false>(off1, size1, du, M, kx, dd,
-                                                             opts);
+      return spread_subproblem_1d_kernel<MIN_NSPREAD, false, T>(off1, size1, du, M, kx,
+                                                                dd, opts);
     }
   } else {
     if (opts.nspread == NS) {
       if (opts.kerevalmeth) {
-        return spread_subproblem_1d_kernel<NS, true>(off1, size1, du, M, kx, dd, opts);
+        return spread_subproblem_1d_kernel<NS, true, T>(off1, size1, du, M, kx, dd, opts);
      } else {
-        return spread_subproblem_1d_kernel<NS, false>(off1, size1, du, M, kx, dd, opts);
+        return spread_subproblem_1d_kernel<NS, false, T>(off1, size1, du, M, kx, dd,
+                                                         opts);
      }
    } else {
-      return spread_subproblem_1d_dispatch<NS - 1>(off1, size1, du, M, kx, dd, opts);
+      return spread_subproblem_1d_dispatch<NS - 1, T>(off1, size1, du, M, kx, dd, opts);
    }
  }
}

-void spread_subproblem_1d(BIGINT off1, UBIGINT size1, FLT *du, UBIGINT M, FLT *kx,
-                          FLT *dd, const finufft_spread_opts &opts) noexcept
+template<typename T>
+static void spread_subproblem_1d(BIGINT off1, UBIGINT size1, T *du, UBIGINT M, T *kx,
+                                 T *dd, const finufft_spread_opts &opts) noexcept
/* spreader from dd (NU) to du (uniform) in 1D without wrapping.
   See above docs/notes for spread_subproblem_2d.
   kx (size M) are NU locations in [off+ns/2,off+size-1-ns/2] in this dim.
@@ -1515,14 +1072,14 @@ void spread_subproblem_1d(BIGINT off1, UBIGINT size1, FLT *du, UBIGINT M, FLT *k
   For algorithmic details see spread_subproblem_1d_kernel.
*/
{
-  spread_subproblem_1d_dispatch<MAX_NSPREAD>(off1, size1, du, M, kx, dd, opts);
+  spread_subproblem_1d_dispatch<MAX_NSPREAD, T>(off1, size1, du, M, kx, dd, opts);
}

-template<uint8_t ns, bool kerevalmeth>
+template<uint8_t ns, bool kerevalmeth, typename T>
 FINUFFT_NEVER_INLINE static void spread_subproblem_2d_kernel(
     const BIGINT off1, const BIGINT off2, const UBIGINT size1, const UBIGINT size2,
-    FLT *FINUFFT_RESTRICT du, const UBIGINT M, const FLT *kx, const FLT *ky,
-    const FLT *dd, const finufft_spread_opts &opts) noexcept
+    T *FINUFFT_RESTRICT du, const UBIGINT M, const T *kx, const T *ky, const T *dd,
+    const finufft_spread_opts &opts) noexcept
/* spreader from dd (NU) to du (uniform) in 2D without wrapping.
   See above docs/notes for spread_subproblem_2d.
   kx,ky (size M) are NU locations in [off+ns/2,off+size-1-ns/2] in both dims.
@@ -1531,24 +1088,24 @@ FINUFFT_NEVER_INLINE static void spread_subproblem_2d_kernel(
   For algorithmic details see spread_subproblem_1d_kernel.
*/
{
-  using simd_type = PaddedSIMD<FLT, 2 * ns>;
+  using simd_type = PaddedSIMD<T, 2 * ns>;
   using arch_t = typename simd_type::arch_type;
-  static constexpr auto padding = get_padding<FLT, 2 * ns>();
+  static constexpr auto padding = get_padding<T, 2 * ns>();
   static constexpr auto simd_size = simd_type::size;
   static constexpr auto alignment = arch_t::alignment();
   // Kernel values stored in consecutive memory. This allows us to compute
   // values in all three directions in a single kernel evaluation call.
-  static constexpr auto ns2 = ns * FLT(0.5); // half spread width
-  alignas(alignment) std::array<FLT, 2 * MAX_NSPREAD> kernel_values{0};
-  std::fill(du, du + 2 * size1 * size2, 0); // initialized to 0 due to the padding
-  for (uint64_t pt = 0; pt < M; pt++) {     // loop over NU pts
+  static constexpr auto ns2 = ns * T(0.5); // half spread width
+  alignas(alignment) std::array<T, 2 * MAX_NSPREAD> kernel_values{0};
+  std::fill(du, du + 2 * size1 * size2, 0); // initialized to 0 due to the padding
+  for (uint64_t pt = 0; pt < M; pt++) {     // loop over NU pts
     const auto dd_pt = initialize_complex_register(dd[pt * 2], dd[pt * 2 + 1]);
     // ceil offset, hence rounding, must match that in get_subgrid...
     const auto i1 = (BIGINT)std::ceil(kx[pt] - ns2); // fine grid start indices
     const auto i2 = (BIGINT)std::ceil(ky[pt] - ns2);
-    const auto x1 = (FLT)std::ceil(kx[pt] - ns2) - kx[pt];
-    const auto x2 = (FLT)std::ceil(ky[pt] - ns2) - ky[pt];
-    ker_eval<ns, kerevalmeth>(kernel_values.data(), opts, x1, x2);
+    const auto x1 = (T)std::ceil(kx[pt] - ns2) - kx[pt];
+    const auto x2 = (T)std::ceil(ky[pt] - ns2) - ky[pt];
+    ker_eval<ns, kerevalmeth, T, simd_type>(kernel_values.data(), opts, x1, x2);
     const auto *ker1 = kernel_values.data();
     const auto *ker2 = kernel_values.data() + MAX_NSPREAD;
     // Combine kernel with complex source value to simplify inner loop
@@ -1578,8 +1135,8 @@ FINUFFT_NEVER_INLINE static void spread_subproblem_2d_kernel(
       for (uint8_t i = 0; i < (kerval_vectors & ~1); // NOLINT(*-too-small-loop-variable)
           i += 2) {
         const auto ker1_v = simd_type::load_aligned(ker1 + i * simd_size / 2);
-        const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index<arch_t>);
-        const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index<arch_t>);
+        const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index<arch_t, T>);
+        const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index<arch_t, T>);
         // this initializes the entire vector registers with the same value
         // the ker1val_v[i] looks like this:
         // +-----------------------+
@@ -1591,7 +1148,7 @@ FINUFFT_NEVER_INLINE static void spread_subproblem_2d_kernel(
       if constexpr (kerval_vectors % 2) {
         const auto ker1_v =
             simd_type::load_unaligned(ker1 + (kerval_vectors - 1) * simd_size / 2);
-        const auto res = xsimd::swizzle(ker1_v, zip_low_index<arch_t>) * dd_pt;
+        const auto res = xsimd::swizzle(ker1_v, zip_low_index<arch_t, T>) * dd_pt;
         ker1val_v[kerval_vectors - 1] = res;
       }
       return ker1val_v;
@@ -1611,41 +1168,42 @@ FINUFFT_NEVER_INLINE static void spread_subproblem_2d_kernel(
   }
 }

-template<uint8_t NS>
+template<uint8_t NS, typename T>
 void spread_subproblem_2d_dispatch(
     const BIGINT off1, const BIGINT off2, const UBIGINT size1, const UBIGINT size2,
-    FLT *FINUFFT_RESTRICT du, const UBIGINT M, const FLT *kx, const FLT *ky,
-    const FLT *dd, const finufft_spread_opts &opts) {
+    T *FINUFFT_RESTRICT du, const UBIGINT M, const T *kx, const T *ky, const T *dd,
+    const finufft_spread_opts &opts) {
   static_assert(MIN_NSPREAD <= NS && NS <= MAX_NSPREAD,
                 "NS must be in the range (MIN_NSPREAD, MAX_NSPREAD)");
   if constexpr (NS == MIN_NSPREAD) { // Base case
     if (opts.kerevalmeth)
-      return spread_subproblem_2d_kernel<MIN_NSPREAD, true>(off1, off2, size1, size2, du,
-                                                            M, kx, ky, dd, opts);
+      return spread_subproblem_2d_kernel<MIN_NSPREAD, true, T>(off1, off2, size1, size2,
+                                                               du, M, kx, ky, dd, opts);
     else {
-      return spread_subproblem_2d_kernel<MIN_NSPREAD, false>(off1, off2, size1, size2, du,
-                                                             M, kx, ky, dd, opts);
+      return spread_subproblem_2d_kernel<MIN_NSPREAD, false, T>(off1, off2, size1, size2,
+                                                                du, M, kx, ky, dd, opts);
     }
   } else {
     if (opts.nspread == NS) {
       if (opts.kerevalmeth) {
-        return spread_subproblem_2d_kernel<NS, true>(off1, off2, size1, size2, du, M, kx,
-                                                     ky, dd, opts);
+        return spread_subproblem_2d_kernel<NS, true, T>(off1, off2, size1, size2, du, M,
+                                                        kx, ky, dd, opts);
       } else {
-        return spread_subproblem_2d_kernel<NS, false>(off1, off2, size1, size2, du, M, kx,
-                                                      ky, dd, opts);
+        return spread_subproblem_2d_kernel<NS, false, T>(off1, off2, size1, size2, du, M,
+                                                         kx, ky, dd, opts);
       }
     } else {
-      return spread_subproblem_2d_dispatch<NS - 1>(off1, off2, size1, size2, du, M, kx,
-                                                   ky, dd, opts);
+      return spread_subproblem_2d_dispatch<NS - 1, T>(off1, off2, size1, size2, du, M, kx,
+                                                      ky, dd, opts);
     }
   }
 }

-void spread_subproblem_2d(const BIGINT off1, const BIGINT off2, const UBIGINT size1,
-                          const UBIGINT size2, FLT *FINUFFT_RESTRICT du, const UBIGINT M,
-                          const FLT *kx, const FLT *ky, const FLT *dd,
-                          const finufft_spread_opts &opts) noexcept
+template<typename T>
+static void spread_subproblem_2d(BIGINT off1, BIGINT off2, UBIGINT size1, UBIGINT size2,
+                                 T *FINUFFT_RESTRICT du, UBIGINT M, const T *kx,
+                                 const T *ky, const T *dd,
+                                 const finufft_spread_opts &opts) noexcept
/* spreader from dd (NU) to du (uniform) in 2D without wrapping.
   See above docs/notes for spread_subproblem_2d.
   kx,ky (size M) are NU locations in [off+ns/2,off+size-1-ns/2] in both dims.
@@ -1654,24 +1212,24 @@ void spread_subproblem_2d(const BIGINT off1, const BIGINT off2, const UBIGINT si
   For algorithmic details see spread_subproblem_1d_kernel.
*/
{
-  spread_subproblem_2d_dispatch<MAX_NSPREAD>(off1, off2, size1, size2, du, M, kx, ky, dd,
-                                             opts);
+  spread_subproblem_2d_dispatch<MAX_NSPREAD, T>(off1, off2, size1, size2, du, M, kx, ky,
+                                                dd, opts);
}

-template<uint8_t ns, bool kerevalmeth>
+template<uint8_t ns, bool kerevalmeth, typename T>
 FINUFFT_NEVER_INLINE void spread_subproblem_3d_kernel(
     const BIGINT off1, const BIGINT off2, const BIGINT off3, const UBIGINT size1,
-    const UBIGINT size2, const UBIGINT size3, FLT *FINUFFT_RESTRICT du, const UBIGINT M,
-    const FLT *kx, const FLT *ky, const FLT *kz, const FLT *dd,
+    const UBIGINT size2, const UBIGINT size3, T *FINUFFT_RESTRICT du, const UBIGINT M,
+    const T *kx, const T *ky, const T *kz, const T *dd,
     const finufft_spread_opts &opts) noexcept {
-  using simd_type = PaddedSIMD<FLT, 2 * ns>;
+  using simd_type = PaddedSIMD<T, 2 * ns>;
   using arch_t = typename simd_type::arch_type;
-  static constexpr auto padding = get_padding<FLT, 2 * ns>();
+  static constexpr auto padding = get_padding<T, 2 * ns>();
   static constexpr auto simd_size = simd_type::size;
   static constexpr auto alignment = arch_t::alignment();
-  static constexpr auto ns2 = ns * FLT(0.5); // half spread width
-  alignas(alignment) std::array<FLT, 3 * MAX_NSPREAD> kernel_values{0};
+  static constexpr auto ns2 = ns * T(0.5); // half spread width
+  alignas(alignment) std::array<T, 3 * MAX_NSPREAD> kernel_values{0};
   std::fill(du, du + 2 * size1 * size2 * size3, 0);

   for (uint64_t pt = 0; pt < M; pt++) { // loop over NU pts
@@ -1684,7 +1242,7 @@ FINUFFT_NEVER_INLINE void spread_subproblem_3d_kernel(
     const auto x2 = std::ceil(ky[pt] - ns2) - ky[pt];
     const auto x3 = std::ceil(kz[pt] - ns2) - kz[pt];

-    ker_eval<ns, kerevalmeth>(kernel_values.data(), opts, x1, x2, x3);
+    ker_eval<ns, kerevalmeth, T, simd_type>(kernel_values.data(), opts, x1, x2, x3);
     const auto *ker1 = kernel_values.data();
     const auto *ker2 = kernel_values.data() + MAX_NSPREAD;
     const auto *ker3 = kernel_values.data() + 2 * MAX_NSPREAD;
@@ -1702,8 +1260,8 @@ FINUFFT_NEVER_INLINE void spread_subproblem_3d_kernel(
       for (uint8_t i = 0; i < (kerval_vectors & ~1); // NOLINT(*-too-small-loop-variable)
           i += 2) {
         const auto ker1_v = simd_type::load_aligned(ker1 + i * simd_size / 2);
-        const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index<arch_t>);
-        const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index<arch_t>);
+        const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index<arch_t, T>);
+        const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index<arch_t, T>);
         ker1val_v[i] = ker1low * dd_pt;
         ker1val_v[i + 1] = ker1hi * dd_pt;
       }
@@ -1712,7 +1270,7 @@ FINUFFT_NEVER_INLINE void spread_subproblem_3d_kernel(
       if constexpr (kerval_vectors % 2) {
         const auto ker1_v =
             simd_type::load_unaligned(ker1 + (kerval_vectors - 1) * simd_size / 2);
-        const auto res = xsimd::swizzle(ker1_v, zip_low_index<arch_t>) * dd_pt;
+        const auto res = xsimd::swizzle(ker1_v, zip_low_index<arch_t, T>) * dd_pt;
         ker1val_v[kerval_vectors - 1] = res;
       }
       return ker1val_v;
@@ -1734,41 +1292,42 @@ FINUFFT_NEVER_INLINE void spread_subproblem_3d_kernel(
   }
 }

-template<uint8_t NS>
-void spread_subproblem_3d_dispatch(
-    BIGINT off1, BIGINT off2, BIGINT off3, UBIGINT size1, UBIGINT size2, UBIGINT size3,
-    FLT *du, UBIGINT M, const FLT *kx, const FLT *ky, const FLT *kz, const FLT
*dd, - const finufft_spread_opts &opts) noexcept { +template +void spread_subproblem_3d_dispatch(BIGINT off1, BIGINT off2, BIGINT off3, UBIGINT size1, + UBIGINT size2, UBIGINT size3, T *du, UBIGINT M, + const T *kx, const T *ky, const T *kz, const T *dd, + const finufft_spread_opts &opts) noexcept { static_assert(MIN_NSPREAD <= NS && NS <= MAX_NSPREAD, "NS must be in the range (MIN_NSPREAD, MAX_NSPREAD)"); if constexpr (NS == MIN_NSPREAD) { // Base case if (opts.kerevalmeth) - return spread_subproblem_3d_kernel( + return spread_subproblem_3d_kernel( off1, off2, off3, size1, size2, size3, du, M, kx, ky, kz, dd, opts); else { - return spread_subproblem_3d_kernel( + return spread_subproblem_3d_kernel( off1, off2, off3, size1, size2, size3, du, M, kx, ky, kz, dd, opts); } } else { if (opts.nspread == NS) { if (opts.kerevalmeth) { - return spread_subproblem_3d_kernel(off1, off2, off3, size1, size2, - size3, du, M, kx, ky, kz, dd, opts); + return spread_subproblem_3d_kernel( + off1, off2, off3, size1, size2, size3, du, M, kx, ky, kz, dd, opts); } else { - return spread_subproblem_3d_kernel(off1, off2, off3, size1, size2, - size3, du, M, kx, ky, kz, dd, opts); + return spread_subproblem_3d_kernel( + off1, off2, off3, size1, size2, size3, du, M, kx, ky, kz, dd, opts); } } else { - return spread_subproblem_3d_dispatch(off1, off2, off3, size1, size2, size3, - du, M, kx, ky, kz, dd, opts); + return spread_subproblem_3d_dispatch(off1, off2, off3, size1, size2, + size3, du, M, kx, ky, kz, dd, opts); } } } -void spread_subproblem_3d(BIGINT off1, BIGINT off2, BIGINT off3, UBIGINT size1, - UBIGINT size2, UBIGINT size3, FLT *du, UBIGINT M, FLT *kx, - FLT *ky, FLT *kz, FLT *dd, - const finufft_spread_opts &opts) noexcept +template +static void spread_subproblem_3d(BIGINT off1, BIGINT off2, BIGINT off3, UBIGINT size1, + UBIGINT size2, UBIGINT size3, T *du, UBIGINT M, T *kx, + T *ky, T *kz, T *dd, + const finufft_spread_opts &opts) noexcept /* spreader from dd (NU) to du (uniform) in 3D without wrapping. See above docs/notes for spread_subproblem_2d. kx,ky,kz (size M) are NU locations in [off+ns/2,off+size-1-ns/2] in each dim. @@ -1776,15 +1335,15 @@ dd (size M complex) are complex source strengths du (size size1*size2*size3) is uniform complex output array */ { - spread_subproblem_3d_dispatch(off1, off2, off3, size1, size2, size3, du, M, - kx, ky, kz, dd, opts); + spread_subproblem_3d_dispatch(off1, off2, off3, size1, size2, size3, du, + M, kx, ky, kz, dd, opts); } -template -void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3, - UBIGINT padded_size1, UBIGINT size1, UBIGINT size2, - UBIGINT size3, UBIGINT N1, UBIGINT N2, UBIGINT N3, - FLT *FINUFFT_RESTRICT data_uniform, const FLT *const du0) +template +static void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3, + UBIGINT padded_size1, UBIGINT size1, UBIGINT size2, + UBIGINT size3, UBIGINT N1, UBIGINT N2, UBIGINT N3, + T *FINUFFT_RESTRICT data_uniform, const T *du0) /* Add a large subgrid (du0) to output grid (data_uniform), with periodic wrapping to N1,N2,N3 box. offset1,2,3 give the offset of the subgrid from the lowest corner of output. 
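The wrap here is at most one period per dimension (the spreadinterp() docs below assume 2*opts.nspread < min(N1,N2,N3)). A minimal sketch of the wrapped-index precomputation, where wrapped_indices is a hypothetical helper rather than code from this patch:

#include <cstdint>
#include <vector>

// Map subgrid positions offset1..offset1+size1-1 back into the periodic
// box [0, N1), wrapping at most one period in either direction.
std::vector<int64_t> wrapped_indices(int64_t offset1, int64_t size1, int64_t N1) {
  std::vector<int64_t> o1(size1);
  for (int64_t i = 0; i < size1; ++i) {
    int64_t y = offset1 + i;
    if (y < 0) y += N1;          // subgrid sticks out below 0
    else if (y >= N1) y -= N1;   // subgrid sticks out past N1-1
    o1[i] = y;
  }
  return o1;
}

The function body below builds index tables of this kind (o2 and o3) for the y and z dimensions before accumulating the subgrid into the output.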
@@ -1796,7 +1355,7 @@ void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3, */ { std::vector o2(size2), o3(size3); - static auto accumulate = [](FLT &a, FLT b) { + static auto accumulate = [](T &a, T b) { if constexpr (thread_safe) { // NOLINT(*-branch-clone) #pragma omp atomic a += b; @@ -1841,10 +1400,11 @@ void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3, } } -void bin_sort_singlethread( - BIGINT *ret, const UBIGINT M, const FLT *kx, const FLT *ky, const FLT *kz, - const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, const double bin_size_x, - const double bin_size_y, const double bin_size_z, const int debug) +template +static void bin_sort_singlethread(std::vector &ret, UBIGINT M, const T *kx, + const T *ky, const T *kz, UBIGINT N1, UBIGINT N2, + UBIGINT N3, double bin_size_x, double bin_size_y, + double bin_size_z, int debug) /* Returns permutation of all nonuniform points with good RAM access, * ie less cache misses for spreading, in 1D, 2D, or 3D. Single-threaded version * @@ -1877,21 +1437,21 @@ void bin_sort_singlethread( // here the +1 is needed to allow round-off error causing i1=N1/bin_size_x, // for kx near +pi, ie foldrescale gives N1 (exact arith would be 0 to N1-1). // Note that round-off near kx=-pi stably rounds negative to i1=0. - const auto nbins1 = BIGINT(FLT(N1) / bin_size_x + 1); - const auto nbins2 = isky ? BIGINT(FLT(N2) / bin_size_y + 1) : 1; - const auto nbins3 = iskz ? BIGINT(FLT(N3) / bin_size_z + 1) : 1; + const auto nbins1 = BIGINT(T(N1) / bin_size_x + 1); + const auto nbins2 = isky ? BIGINT(T(N2) / bin_size_y + 1) : 1; + const auto nbins3 = iskz ? BIGINT(T(N3) / bin_size_z + 1) : 1; const auto nbins = nbins1 * nbins2 * nbins3; - const auto inv_bin_size_x = FLT(1.0 / bin_size_x); - const auto inv_bin_size_y = FLT(1.0 / bin_size_y); - const auto inv_bin_size_z = FLT(1.0 / bin_size_z); + const auto inv_bin_size_x = T(1.0 / bin_size_x); + const auto inv_bin_size_y = T(1.0 / bin_size_y); + const auto inv_bin_size_z = T(1.0 / bin_size_z); // count how many pts in each bin std::vector counts(nbins, 0); for (auto i = 0; i < M; i++) { // find the bin index in however many dims are needed - const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); - const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; - const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; + const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); + const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; + const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); ++counts[bin]; } @@ -1906,18 +1466,20 @@ void bin_sort_singlethread( for (auto i = 0; i < M; i++) { // find the bin index (again! but better than using RAM) - const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); - const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; - const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; + const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); + const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; + const auto i3 = iskz ? 
BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); ret[counts[bin]] = BIGINT(i); // fill the inverse map on the fly ++counts[bin]; // update the offsets } } -void bin_sort_multithread(BIGINT *ret, UBIGINT M, FLT *kx, FLT *ky, FLT *kz, UBIGINT N1, - UBIGINT N2, UBIGINT N3, double bin_size_x, double bin_size_y, - double bin_size_z, int debug, int nthr) +template +static void bin_sort_multithread(std::vector &ret, UBIGINT M, T *kx, T *ky, T *kz, + UBIGINT N1, UBIGINT N2, UBIGINT N3, double bin_size_x, + double bin_size_y, double bin_size_z, int debug, + int nthr) /* Mostly-OpenMP'ed version of bin_sort. For documentation see: bin_sort_singlethread. Caution: when M (# NU pts) << N (# U pts), is SLOWER than single-thread. @@ -1952,9 +1514,9 @@ void bin_sort_multithread(BIGINT *ret, UBIGINT M, FLT *kx, FLT *ky, FLT *kz, UBI my_counts.resize(nbins, 0); // allocate counts[t], now in parallel region for (auto i = brk[t]; i < brk[t + 1]; i++) { // find the bin index in however many dims are needed - BIGINT i1 = fold_rescale(kx[i], N1) / bin_size_x, i2 = 0, i3 = 0; - if (isky) i2 = fold_rescale(ky[i], N2) / bin_size_y; - if (iskz) i3 = fold_rescale(kz[i], N3) / bin_size_z; + BIGINT i1 = fold_rescale(kx[i], N1) / bin_size_x, i2 = 0, i3 = 0; + if (isky) i2 = fold_rescale(ky[i], N2) / bin_size_y; + if (iskz) i3 = fold_rescale(kz[i], N3) / bin_size_z; const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); ++my_counts[bin]; // no clash btw threads } @@ -1975,9 +1537,9 @@ void bin_sort_multithread(BIGINT *ret, UBIGINT M, FLT *kx, FLT *ky, FLT *kz, UBI auto &my_counts(counts[t]); for (UBIGINT i = brk[t]; i < brk[t + 1]; i++) { // find the bin index (again! but better than using RAM) - UBIGINT i1 = fold_rescale(kx[i], N1) / bin_size_x, i2 = 0, i3 = 0; - if (isky) i2 = fold_rescale(ky[i], N2) / bin_size_y; - if (iskz) i3 = fold_rescale(kz[i], N3) / bin_size_z; + UBIGINT i1 = fold_rescale(kx[i], N1) / bin_size_x, i2 = 0, i3 = 0; + if (isky) i2 = fold_rescale(ky[i], N2) / bin_size_y; + if (iskz) i3 = fold_rescale(kz[i], N3) / bin_size_z; UBIGINT bin = i1 + nbins1 * (i2 + nbins2 * i3); ret[my_counts[bin]] = i; // inverse is offset for this NU pt and thread ++my_counts[bin]; // update the offsets; no thread clash @@ -1985,9 +1547,10 @@ void bin_sort_multithread(BIGINT *ret, UBIGINT M, FLT *kx, FLT *ky, FLT *kz, UBI } } -void get_subgrid(BIGINT &offset1, BIGINT &offset2, BIGINT &offset3, BIGINT &padded_size1, - BIGINT &size1, BIGINT &size2, BIGINT &size3, UBIGINT M, FLT *kx, FLT *ky, - FLT *kz, int ns, int ndims) +template +static void get_subgrid(BIGINT &offset1, BIGINT &offset2, BIGINT &offset3, + BIGINT &padded_size1, BIGINT &size1, BIGINT &size2, BIGINT &size3, + UBIGINT M, T *kx, T *ky, T *kz, int ns, int ndims) /* Writes out the integer offsets and sizes of a "subgrid" (cuboid subset of Z^ndims) large enough to enclose all of the nonuniform points with (non-periodic) padding of half the kernel width ns to each side in @@ -2031,14 +1594,14 @@ void get_subgrid(BIGINT &offset1, BIGINT &offset2, BIGINT &offset3, BIGINT &padd tests. */ { - FLT ns2 = (FLT)ns / 2; - FLT min_kx, max_kx; // 1st (x) dimension: get min/max of nonuniform points + T ns2 = (T)ns / 2; + T min_kx, max_kx; // 1st (x) dimension: get min/max of nonuniform points arrayrange(M, kx, &min_kx, &max_kx); offset1 = (BIGINT)std::ceil(min_kx - ns2); // min index touched by kernel size1 = (BIGINT)std::ceil(max_kx - ns2) - offset1 + ns; // int(ceil) first! 
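    // Worked example for the two lines above (illustrative numbers only):
    // ns=4 gives ns2=2; NU points spanning min_kx=0.4, max_kx=7.6 give
    // offset1=ceil(0.4-2)=-1 and size1=ceil(7.6-2)-(-1)+4 = 6+1+4 = 11,
    // i.e. subgrid indices -1..9, exactly the fine-grid points that kernels
    // centered anywhere in [0.4, 7.6] can touch.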
-  padded_size1 = size1 + get_padding<FLT>(2 * ns) / 2;
+  padded_size1 = size1 + get_padding<T>(2 * ns) / 2;
   if (ndims > 1) {
-    FLT min_ky, max_ky; // 2nd (y) dimension: get min/max of nonuniform points
+    T min_ky, max_ky; // 2nd (y) dimension: get min/max of nonuniform points
     arrayrange(M, ky, &min_ky, &max_ky);
     offset2 = (BIGINT)std::ceil(min_ky - ns2);
     size2 = (BIGINT)std::ceil(max_ky - ns2) - offset2 + ns;
@@ -2047,254 +1610,658 @@ void get_subgrid(BIGINT &offset1, BIGINT &offset2, BIGINT &offset3, BIGINT &padd
     size2 = 1;
   }
   if (ndims > 2) {
-    FLT min_kz, max_kz; // 3rd (z) dimension: get min/max of nonuniform points
+    T min_kz, max_kz; // 3rd (z) dimension: get min/max of nonuniform points
     arrayrange(M, kz, &min_kz, &max_kz);
     offset3 = (BIGINT)std::ceil(min_kz - ns2);
     size3 = (BIGINT)std::ceil(max_kz - ns2) - offset3 + ns;
   } else {
-    offset3 = 0;
-    size3 = 1;
+    offset3 = 0;
+    size3 = 1;
+  }
+}
+
+// ==========================================================================
+template<typename T>
+FINUFFT_EXPORT int FINUFFT_CDECL spreadinterp(
+    UBIGINT N1, UBIGINT N2, UBIGINT N3, T *data_uniform, UBIGINT M, T *kx, T *ky, T *kz,
+    T *data_nonuniform, const finufft_spread_opts &opts)
+/* ------------Spreader/interpolator for 1, 2, or 3 dimensions --------------
+   If opts.spread_direction=1, evaluate, in the 1D case,
+
+                         N1-1
+   data_nonuniform[j] =  SUM  phi(kx[j] - n) data_uniform[n],  for j=0...M-1
+                         n=0
+
+   If opts.spread_direction=2, evaluate its transpose, in the 1D case,
+
+                      M-1
+   data_uniform[n] =  SUM  phi(kx[j] - n) data_nonuniform[j],  for n=0...N1-1
+                      j=0
+
+   In each case phi is the spreading kernel, which has support
+   [-opts.nspread/2,opts.nspread/2]. In 2D or 3D, the generalization with
+   product of 1D kernels is performed.
+   For 1D set N2=N3=1; for 2D set N3=1; for 3D set N1,N2,N3>1.
+
+   Notes:
+   No particular normalization of the spreading kernel is assumed.
+   Uniform (U) points are centered at coords
+   [0,1,...,N1-1] in 1D, analogously in 2D and 3D. They are stored in x
+   fastest, y medium, z slowest ordering, up to however many
+   dimensions are relevant; note that this is Fortran-style ordering for an
+   array f(x,y,z), but C style for f[z][y][x]. This is to match the Fortran
+   interface of the original CMCL libraries.
+   Non-uniform (NU) points kx,ky,kz are real, and may lie in the central three
+   periods in each coordinate (these are folded into the central period).
+   The finufft_spread_opts struct must have been set up already by calling setup_kernel.
+   It is assumed that 2*opts.nspread < min(N1,N2,N3), so that the kernel
+   only ever wraps once when it falls below 0 or off the top of a uniform grid
+   dimension.
+
+   Inputs:
+   N1,N2,N3 - grid sizes in x (fastest), y (medium), z (slowest) respectively.
+              If N2==1, 1D spreading is done. If N3==1, 2D spreading.
+              Otherwise, 3D.
+   M - number of NU pts.
+   kx, ky, kz - length-M real arrays of NU point coordinates (only kx read in
+                1D, only kx and ky read in 2D).
+
+                These should lie in the box -pi<=kx<=pi. Points outside this
+                domain are also correctly folded back into this domain.
+   opts - spread/interp options struct, documented in ../include/finufft_spread_opts.h
+
+   Inputs/Outputs:
+   data_uniform - output values on grid (dir=1) OR input grid data (dir=2)
+   data_nonuniform - input strengths of the sources (dir=1)
+                     OR output values at targets (dir=2)
+   Returned value:
+   0 indicates success; other values have meanings in ../docs/error.rst, with
+   following modifications:
+   3 : one or more non-trivial box dimensions is less than 2.nspread.
+   5 : failed to allocate sort indices
+
+   Magland Dec 2016. Barnett openmp version, many speedups 1/16/17-2/16/17
+   error codes 3/13/17. pirange 3/28/17. Rewritten 6/15/17. parallel sort 2/9/18
+   No separate subprob indices in t-1 2/11/18.
+   sort_threads (since for M<<N, multithread sort slower than single) 3/27/18
+   kereval, kerpad 4/24/18
+   Melody Shih split into 3 routines: check, sort, spread. Jun 2018, making
+   this routine just a caller to them. Name change, Barnett 7/27/18
+*/
+{
+  int ier = spreadcheck(N1, N2, N3, M, kx, ky, kz, opts);
+  if (ier) return ier;
+  std::vector<BIGINT> sort_indices(M);
+  int did_sort = indexSort(sort_indices, N1, N2, N3, M, kx, ky, kz, opts);
+  spreadinterpSorted(sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz,
+                     data_nonuniform, opts, did_sort);
+  return 0;
+}
+
+template FINUFFT_EXPORT int FINUFFT_CDECL spreadinterp<float>(
+    UBIGINT N1, UBIGINT N2, UBIGINT N3, float *data_uniform, UBIGINT M, float *kx,
+    float *ky, float *kz, float *data_nonuniform, const finufft_spread_opts &opts);
+template FINUFFT_EXPORT int FINUFFT_CDECL spreadinterp<double>(
+    UBIGINT N1, UBIGINT N2, UBIGINT N3, double *data_uniform, UBIGINT M, double *kx,
+    double *ky, double *kz, double *data_nonuniform, const finufft_spread_opts &opts);
+
+static constexpr uint8_t ndims_from_Ns(const UBIGINT N1, const UBIGINT N2,
+                                       const UBIGINT N3)
+/* rule for getting number of spreading dimensions from the list of Ns per dim.
+   Split out, Barnett 7/26/18
+*/
+{
+  return 1 + (N2 > 1) + (N3 > 1);
+}
+
+template<typename T>
+int spreadcheck(UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M, T *kx, T *ky, T *kz,
+                const finufft_spread_opts &opts)
+/* This does just the input checking and reporting for the spreader.
+   See spreadinterp() for input arguments and meaning of returned value.
+   Split out by Melody Shih, Jun 2018. Finiteness chk Barnett 7/30/18.
+   Marco Barbone 5.8.24 removed bounds check as new foldrescale is not limited to
+   [-3pi,3pi)
+*/
+{
+  // INPUT CHECKING & REPORTING .... cuboid not too small for spreading?
+  int minN = 2 * opts.nspread;
+  if (N1 < minN || (N2 > 1 && N2 < minN) || (N3 > 1 && N3 < minN)) {
+    fprintf(stderr,
+            "%s error: one or more non-trivial box dims is less than 2.nspread!\n",
+            __func__);
+    return FINUFFT_ERR_SPREAD_BOX_SMALL;
+  }
+  if (opts.spread_direction != 1 && opts.spread_direction != 2) {
+    fprintf(stderr, "%s error: opts.spread_direction must be 1 or 2!\n", __func__);
+    return FINUFFT_ERR_SPREAD_DIR;
+  }
+  return 0;
+}
+template int spreadcheck<float>(UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M, float *kx,
+                                float *ky, float *kz, const finufft_spread_opts &opts);
+template int spreadcheck<double>(UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M,
+                                 double *kx, double *ky, double *kz,
+                                 const finufft_spread_opts &opts);
+
+template<typename T>
+int indexSort(std::vector<BIGINT> &sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3,
+              UBIGINT M, T *kx, T *ky, T *kz, const finufft_spread_opts &opts)
+/* This makes a decision whether or not to sort the NU pts (influenced by
+   opts.sort), and if yes, calls either single- or multi-threaded bin sort,
+   writing reordered index list to sort_indices. If decided not to sort, the
+   identity permutation is written to sort_indices.
+   The permutation is designed to make RAM access close to contiguous, to
+   speed up spreading/interpolation, in the case of disordered NU points.
+
+   Inputs:
+    M        - number of input NU points.
+    kx,ky,kz - length-M arrays of real coords of NU pts. Domain is [-pi, pi),
+               points outside are folded in.
+               (only kx used in 1D, only kx and ky used in 2D.)
+    N1,N2,N3 - integer sizes of overall box (set N2=N3=1 for 1D, N3=1 for 2D).
+               1 = x (fastest), 2 = y (medium), 3 = z (slowest).
+    opts     - spreading options struct, see ../include/finufft_spread_opts.h
+   Outputs:
+    sort_indices - a good permutation of NU points. (User must preallocate
+                   to length M.)
Ie, kx[sort_indices[j]], j=0,..,M-1, is a good + ordering for the x-coords of NU pts, etc. + returned value - whether a sort was done (1) or not (0). + + Barnett 2017; split out by Melody Shih, Jun 2018. Barnett nthr logic 2024. +*/ +{ + CNTime timer{}; + uint8_t ndims = ndims_from_Ns(N1, N2, N3); + auto N = N1 * N2 * N3; // U grid (periodic box) sizes + + // heuristic binning box size for U grid... affects performance: + double bin_size_x = 16, bin_size_y = 4, bin_size_z = 4; + // put in heuristics based on cache sizes (only useful for single-thread) ? + + int better_to_sort = + !(ndims == 1 && (opts.spread_direction == 2 || (M > 1000 * N1))); // 1D small-N or + // dir=2 case: + // don't sort + + timer.start(); // if needed, sort all the NU pts... + int did_sort = 0; + auto maxnthr = MY_OMP_GET_MAX_THREADS(); // used if both below opts default + if (opts.nthreads > 0) + maxnthr = opts.nthreads; // user nthreads overrides, without limit + if (opts.sort_threads > 0) + maxnthr = opts.sort_threads; // high-priority override, also no limit + // At this point: maxnthr = the max threads sorting could use + // (we don't print warning here, since: no showwarn in spread_opts, and finufft + // already warned about it. spreadinterp-only advanced users will miss a warning) + if (opts.sort == 1 || (opts.sort == 2 && better_to_sort)) { + // store a good permutation ordering of all NU pts (dim=1,2 or 3) + int sort_debug = (opts.debug >= 2); // show timing output? + int sort_nthr = opts.sort_threads; // 0, or user max # threads for sort +#ifndef _OPENMP + sort_nthr = 1; // if single-threaded lib, override user +#endif + if (sort_nthr == 0) // multithreaded auto choice: when N>>M, one thread is better! + sort_nthr = (10 * M > N) ? maxnthr : 1; // heuristic + if (sort_nthr == 1) + bin_sort_singlethread(sort_indices, M, kx, ky, kz, N1, N2, N3, bin_size_x, + bin_size_y, bin_size_z, sort_debug); + else // sort_nthr>1, user fixes # threads (>=2) + bin_sort_multithread(sort_indices, M, kx, ky, kz, N1, N2, N3, bin_size_x, + bin_size_y, bin_size_z, sort_debug, sort_nthr); + if (opts.debug) + printf("\tsorted (%d threads):\t%.3g s\n", sort_nthr, timer.elapsedsec()); + did_sort = 1; + } else { +#pragma omp parallel for num_threads(maxnthr) schedule(static, 1000000) + for (BIGINT i = 0; i < M; i++) // here omp helps xeon, hinders i7 + sort_indices[i] = i; // the identity permutation + if (opts.debug) + printf("\tnot sorted (sort=%d): \t%.3g s\n", (int)opts.sort, timer.elapsedsec()); } + return did_sort; } -/* local NU coord fold+rescale macro: does the following affine transform to x: - (x+PI) mod PI each to [0,N) - Note: folding big numbers can cause numerical inaccuracies - Martin Reinecke, 8.5.2024 used floor to speedup the function and removed the range - limitation Marco Barbone, 8.5.2024 Changed it from a Macro to an inline function -*/ -FLT fold_rescale(const FLT x, const UBIGINT N) noexcept { - static constexpr const FLT x2pi = FLT(M_1_2PI); - const FLT result = x * x2pi + FLT(0.5); - return (result - floor(result)) * FLT(N); -} +template int indexSort(std::vector &sort_indices, UBIGINT N1, UBIGINT N2, + UBIGINT N3, UBIGINT M, float *kx, float *ky, float *kz, + const finufft_spread_opts &opts); +template int indexSort(std::vector &sort_indices, UBIGINT N1, UBIGINT N2, + UBIGINT N3, UBIGINT M, double *kx, double *ky, double *kz, + const finufft_spread_opts &opts); -template -simd_type fold_rescale(const simd_type &x, const BIGINT N) noexcept { - const simd_type x2pi = FLT(M_1_2PI); - const simd_type result = 
xsimd::fma(x, x2pi, simd_type(0.5)); - return (result - xsimd::floor(result)) * simd_type(FLT(N)); -} +// -------------------------------------------------------------------------- +template +static int spreadSorted(const std::vector &sort_indices, UBIGINT N1, UBIGINT N2, + UBIGINT N3, T *FINUFFT_RESTRICT data_uniform, UBIGINT M, + T *FINUFFT_RESTRICT kx, T *FINUFFT_RESTRICT ky, + T *FINUFFT_RESTRICT kz, const T *data_nonuniform, + const finufft_spread_opts &opts, int did_sort) +// Spread NU pts in sorted order to a uniform grid. See spreadinterp() for doc. +{ + CNTime timer{}; + const auto ndims = ndims_from_Ns(N1, N2, N3); + const auto N = N1 * N2 * N3; // output array size + const auto ns = opts.nspread; // abbrev. for w, kernel width + auto nthr = MY_OMP_GET_MAX_THREADS(); // guess # threads to use to spread + if (opts.nthreads > 0) nthr = opts.nthreads; // user override, now without limit +#ifndef _OPENMP + nthr = 1; // single-threaded lib must override user +#endif + if (opts.debug) + printf("\tspread %dD (M=%lld; N1=%lld,N2=%lld,N3=%lld), nthr=%d\n", ndims, + (long long)M, (long long)N1, (long long)N2, (long long)N3, nthr); + timer.start(); + std::fill(data_uniform, data_uniform + 2 * N, 0.0); // zero the output array + if (opts.debug) printf("\tzero output array\t%.3g s\n", timer.elapsedsec()); + if (M == 0) // no NU pts, we're done + return 0; -template -auto ker_eval(FLT *FINUFFT_RESTRICT ker, const finufft_spread_opts &opts, - const V... elems) noexcept { - /* Utility function that allows to move the kernel evaluation outside the spreader for - clarity - Inputs are: - ns = kernel width - kerevalmeth = kernel evaluation method - T = (single or double precision) type of the kernel - simd_type = xsimd::batch for Horner - vectorization (default is the optimal simd size) - finufft_spread_opts as Horner needs - the oversampling factor - elems = kernel arguments - Examples usage is - ker_eval(opts, x, y, z) // for 3D or - ker_eval(opts, x, y) // for 2D or - ker_eval(opts, x) // for 1D - */ - const std::array inputs{elems...}; - // compile time loop, no performance overhead - for (auto i = 0; i < sizeof...(elems); ++i) { - // compile time branch no performance overhead - if constexpr (kerevalmeth == 1) { - if (opts.upsampfac == 2.0) { - eval_kernel_vec_Horner(ker + (i * MAX_NSPREAD), inputs[i], - opts); - } - if (opts.upsampfac == 1.25) { - eval_kernel_vec_Horner(ker + (i * MAX_NSPREAD), inputs[i], - opts); - } + auto spread_single = (nthr == 1) || (M * 100 < N); // low-density heuristic? + spread_single = false; // for now + timer.start(); + if (spread_single) { // ------- Basic single-core t1 spreading ------ + for (UBIGINT j = 0; j < M; j++) { + // *** todo, not urgent + // ... (question is: will the index wrapping per NU pt slow it down?) } - if constexpr (kerevalmeth == 0) { - alignas(simd_type::arch_type::alignment()) std::array kernel_args{}; - set_kernel_args(kernel_args.data(), inputs[i]); - evaluate_kernel_vector(ker + (i * MAX_NSPREAD), kernel_args.data(), opts); + if (opts.debug) printf("\tt1 simple spreading:\t%.3g s\n", timer.elapsedsec()); + } else { // ------- Fancy multi-core blocked t1 spreading ---- + // Splits sorted inds (jfm's advanced2), could double RAM. + // choose nb (# subprobs) via used nthreads: + auto nb = std::min((UBIGINT)nthr, M); // simply split one subprob per thr... 
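    // Illustration of the breakpoint split used just below (example numbers
    // only): M=10 NU points and nb=3 subproblems give, via
    // brk[p] = (M*p + nb - 1) / nb, the breakpoints {0, 4, 7, 10}, i.e.
    // chunks of sizes 4, 3, 3; each chunk is spread onto its own padded
    // subgrid, which is then wrap-added into the global grid.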
+ if (nb * (BIGINT)opts.max_subproblem_size < M) { // ...or more subprobs to cap size + nb = 1 + (M - 1) / opts.max_subproblem_size; // int div does + // ceil(M/opts.max_subproblem_size) + if (opts.debug) + printf("\tcapping subproblem sizes to max of %d\n", opts.max_subproblem_size); } - } - return ker; -} + if (M * 1000 < N) { // low-density heuristic: one thread per NU pt! + nb = M; + if (opts.debug) printf("\tusing low-density speed rescue nb=M...\n"); + } + if (!did_sort && nthr == 1) { + nb = 1; + if (opts.debug) printf("\tunsorted nthr=1: forcing single subproblem...\n"); + } + if (opts.debug && nthr > opts.atomic_threshold) + printf("\tnthr big: switching add_wrapped OMP from critical to atomic (!)\n"); -namespace { + std::vector brk(nb + 1); // NU index breakpoints defining nb subproblems + for (int p = 0; p <= nb; ++p) brk[p] = (M * p + nb - 1) / nb; -template -constexpr array, N> pad_2D_array_with_zeros( - const array, N> &input) noexcept { - constexpr auto pad_with_zeros = [](const auto &input) constexpr noexcept { - std::array padded{0}; - for (auto i = 0; i < input.size(); ++i) { - padded[i] = input[i]; +#pragma omp parallel num_threads(nthr) + { + // local copies of NU pts and data for each subproblem + std::vector kx0{}, ky0{}, kz0{}, dd0{}, du0{}; +#pragma omp for schedule(dynamic, 1) // each is big + for (int isub = 0; isub < nb; isub++) { // Main loop through the subproblems + const auto M0 = brk[isub + 1] - brk[isub]; // # NU pts in this subproblem + // copy the location and data vectors for the nonuniform points + kx0.resize(M0); + ky0.resize(M0 * (N2 > 1)); + kz0.resize(M0 * (N3 > 1)); + dd0.resize(2 * M0); // complex strength data + for (auto j = 0; j < M0; j++) { // todo: can avoid this copying? + const auto kk = sort_indices[j + brk[isub]]; // NU pt from subprob index list + kx0[j] = fold_rescale(kx[kk], N1); + if (N2 > 1) ky0[j] = fold_rescale(ky[kk], N2); + if (N3 > 1) kz0[j] = fold_rescale(kz[kk], N3); + dd0[j * 2] = data_nonuniform[kk * 2]; // real part + dd0[j * 2 + 1] = data_nonuniform[kk * 2 + 1]; // imag part + } + // get the subgrid which will include padding by roughly nspread/2 + // get_subgrid sets + BIGINT offset1, offset2, offset3, padded_size1, size1, size2, size3; + // sets offsets and sizes + get_subgrid(offset1, offset2, offset3, padded_size1, size1, size2, size3, M0, + kx0.data(), ky0.data(), kz0.data(), ns, ndims); + if (opts.debug > 1) { + print_subgrid_info(ndims, offset1, offset2, offset3, padded_size1, size1, size2, + size3, M0); + } + // allocate output data for this subgrid + du0.resize(2 * padded_size1 * size2 * size3); // complex + // Spread to subgrid without need for bounds checking or wrapping + if (!(opts.flags & TF_OMIT_SPREADING)) { + if (ndims == 1) + spread_subproblem_1d(offset1, padded_size1, du0.data(), M0, kx0.data(), + dd0.data(), opts); + else if (ndims == 2) + spread_subproblem_2d(offset1, offset2, padded_size1, size2, du0.data(), M0, + kx0.data(), ky0.data(), dd0.data(), opts); + else + spread_subproblem_3d(offset1, offset2, offset3, padded_size1, size2, size3, + du0.data(), M0, kx0.data(), ky0.data(), kz0.data(), + dd0.data(), opts); + } + // do the adding of subgrid to output + if (!(opts.flags & TF_OMIT_WRITE_TO_GRID)) { + if (nthr > opts.atomic_threshold) { // see above for debug reporting + add_wrapped_subgrid(offset1, offset2, offset3, padded_size1, size1, + size2, size3, N1, N2, N3, data_uniform, + du0.data()); // R Blackwell's atomic version + } else { +#pragma omp critical + add_wrapped_subgrid(offset1, offset2, 
offset3, padded_size1, size1, + size2, size3, N1, N2, N3, data_uniform, + du0.data()); + } + } + } // end main loop over subprobs } - return padded; - }; - std::array, N> output{}; - for (std::size_t i = 0; i < N; ++i) { - output[i] = pad_with_zeros(input[i]); - } - return output; -} + if (opts.debug) + printf("\tt1 fancy spread: \t%.3g s (%ld subprobs)\n", timer.elapsedsec(), nb); + } // end of choice of which t1 spread type to use + return 0; +}; -template -constexpr T generate_sequence_impl(V a, V b, index_sequence) noexcept { - // utility function to generate a sequence of a, b interleaved as function arguments - return T(((Is % 2 == 0) ? a : b)...); -} +// -------------------------------------------------------------------------- +template +FINUFFT_NEVER_INLINE static int interpSorted_kernel( + const std::vector &sort_indices, const UBIGINT N1, const UBIGINT N2, + const UBIGINT N3, const T *data_uniform, const UBIGINT M, T *FINUFFT_RESTRICT kx, + T *FINUFFT_RESTRICT ky, T *FINUFFT_RESTRICT kz, T *FINUFFT_RESTRICT data_nonuniform, + const finufft_spread_opts &opts) +// Interpolate to NU pts in sorted order from a uniform grid. +// See spreadinterp() for doc. +{ + using simd_type = PaddedSIMD; + using arch_t = typename simd_type::arch_type; + static constexpr auto alignment = arch_t::alignment(); + static constexpr auto simd_size = simd_type::size; + static constexpr auto ns2 = ns * T(0.5); // half spread width, used as stencil shift -template -constexpr auto initialize_complex_register(V a, V b) noexcept { - // populates a SIMD register with a and b interleaved - // for example: - // +-------------------------------+ - // | a | b | a | b | a | b | a | b | - // +-------------------------------+ - // it uses index_sequence to generate the sequence of a, b at compile time - return generate_sequence_impl(a, b, std::make_index_sequence{}); -} + CNTime timer{}; + const auto ndims = ndims_from_Ns(N1, N2, N3); + auto nthr = MY_OMP_GET_MAX_THREADS(); // guess # threads to use to interp + if (opts.nthreads > 0) nthr = opts.nthreads; // user override, now without limit +#ifndef _OPENMP + nthr = 1; // single-threaded lib must override user +#endif + if (opts.debug) + printf("\tinterp %dD (M=%lld; N1=%lld,N2=%lld,N3=%lld), nthr=%d\n", ndims, + (long long)M, (long long)N1, (long long)N2, (long long)N3, nthr); + timer.start(); +#pragma omp parallel num_threads(nthr) + { + static constexpr auto CHUNKSIZE = simd_size; // number of targets per chunk + alignas(alignment) UBIGINT jlist[CHUNKSIZE]; + alignas(alignment) T xjlist[CHUNKSIZE], yjlist[CHUNKSIZE], zjlist[CHUNKSIZE]; + alignas(alignment) T outbuf[2 * CHUNKSIZE]; + // Kernels: static alloc is faster, so we do it for up to 3D... + alignas(alignment) std::array kernel_values{0}; + auto *FINUFFT_RESTRICT ker1 = kernel_values.data(); + auto *FINUFFT_RESTRICT ker2 = kernel_values.data() + MAX_NSPREAD; + auto *FINUFFT_RESTRICT ker3 = kernel_values.data() + 2 * MAX_NSPREAD; + + // Loop over interpolation chunks + // main loop over NU trgs, interp each from U + // (note: windows omp doesn't like unsigned loop vars) +#pragma omp for schedule(dynamic, 1000) // assign threads to NU targ pts: + for (BIGINT i = 0; i < M; i += CHUNKSIZE) { + // Setup buffers for this chunk + const UBIGINT bufsize = (i + CHUNKSIZE > M) ? 
M - i : CHUNKSIZE; + for (int ibuf = 0; ibuf < bufsize; ibuf++) { + UBIGINT j = sort_indices[i + ibuf]; + jlist[ibuf] = j; + xjlist[ibuf] = fold_rescale(kx[j], N1); + if (ndims >= 2) yjlist[ibuf] = fold_rescale(ky[j], N2); + if (ndims == 3) zjlist[ibuf] = fold_rescale(kz[j], N3); + } + + // Loop over targets in chunk + for (int ibuf = 0; ibuf < bufsize; ibuf++) { + const auto xj = xjlist[ibuf]; + const auto yj = (ndims > 1) ? yjlist[ibuf] : 0; + const auto zj = (ndims > 2) ? zjlist[ibuf] : 0; + + auto *FINUFFT_RESTRICT target = outbuf + 2 * ibuf; + + // coords (x,y,z), spread block corner index (i1,i2,i3) of current NU targ + const auto i1 = BIGINT(std::ceil(xj - ns2)); // leftmost grid index + const auto i2 = (ndims > 1) ? BIGINT(std::ceil(yj - ns2)) : 0; // min y grid index + const auto i3 = (ndims > 2) ? BIGINT(std::ceil(zj - ns2)) : 0; // min z grid index + + const auto x1 = std::ceil(xj - ns2) - xj; // shift of ker center, in [-w/2,-w/2+1] + const auto x2 = (ndims > 1) ? std::ceil(yj - ns2) - yj : 0; + const auto x3 = (ndims > 2) ? std::ceil(zj - ns2) - zj : 0; + + // eval kernel values patch and use to interpolate from uniform data... + if (!(opts.flags & TF_OMIT_SPREADING)) { + switch (ndims) { + case 1: + ker_eval(kernel_values.data(), opts, x1); + interp_line(target, data_uniform, ker1, i1, N1); + break; + case 2: + ker_eval(kernel_values.data(), opts, x1, x2); + interp_square(target, data_uniform, ker1, ker2, i1, i2, N1, + N2); + break; + case 3: + ker_eval(kernel_values.data(), opts, x1, x2, + x3); + interp_cube(target, data_uniform, ker1, ker2, ker3, i1, i2, + i3, N1, N2, N3); + break; + default: // can't get here + FINUFFT_UNREACHABLE; + break; + } + } + } // end loop over targets in chunk -// Below there is some template metaprogramming magic to find the best SIMD type -// for the given number of elements. 
The code is based on the xsimd library

+      // Copy result buffer to output array
+      for (int ibuf = 0; ibuf < bufsize; ibuf++) {
+        const UBIGINT j            = jlist[ibuf];
+        data_nonuniform[2 * j]     = outbuf[2 * ibuf];
+        data_nonuniform[2 * j + 1] = outbuf[2 * ibuf + 1];
+      }

-// this finds the largest SIMD instruction set that can handle N elements
-// (returns void otherwise, which triggers a compile error)
-template<class T, uint8_t N, uint8_t K = N> constexpr auto BestSIMDHelper() {
-  if constexpr (N % K == 0) { // returns void in the worst case
-    return xsimd::make_sized_batch<T, K>{};
-  } else {
-    return BestSIMDHelper<T, N, (K >> 1)>();
-  }
+    } // end NU targ loop
+  } // end parallel section
+  if (opts.debug) printf("\tt2 interp loop: \t%.3g s\n", timer.elapsedsec());
+  return 0;
}

-template<class T, uint8_t N = 1> constexpr uint8_t min_simd_width() {
-  // finds the smallest simd width that can handle N elements
-  // (the SIMD width is called "batch size" in xsimd terminology)
-  if constexpr (std::is_void_v<typename xsimd::make_sized_batch<T, N>::type>) {
-    return min_simd_width<T, N * 2>();
+template<uint8_t NS, typename T>
+static int interpSorted_dispatch(
+    const std::vector<BIGINT> &sort_indices, const UBIGINT N1, const UBIGINT N2,
+    const UBIGINT N3, T *FINUFFT_RESTRICT data_uniform, const UBIGINT M,
+    T *FINUFFT_RESTRICT kx, T *FINUFFT_RESTRICT ky, T *FINUFFT_RESTRICT kz,
+    T *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts) {
+  static_assert(MIN_NSPREAD <= NS && NS <= MAX_NSPREAD,
+                "NS must be in the range [MIN_NSPREAD, MAX_NSPREAD]");
+  if constexpr (NS == MIN_NSPREAD) { // Base case
+    if (opts.kerevalmeth)
+      return interpSorted_kernel<MIN_NSPREAD, T, true>(
+          sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform, opts);
+    else {
+      return interpSorted_kernel<MIN_NSPREAD, T, false>(
+          sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform, opts);
+    }
  } else {
-    return N;
-  }
-};
-
-template<class T, uint8_t N> constexpr auto find_optimal_simd_width() {
-  // finds the smallest simd width that minimizes the number of iterations
-  // NOTE: might be suboptimal for some cases, 2^N+1 for example
-  // in the future we might want to implement a more sophisticated algorithm
-  uint8_t optimal_simd_width = min_simd_width<T>();
-  uint8_t min_iterations     = (N + optimal_simd_width - 1) / optimal_simd_width;
-  for (uint8_t simd_width = optimal_simd_width;
-       simd_width <= xsimd::batch<T>::size;
-       simd_width *= 2) {
-    uint8_t iterations = (N + simd_width - 1) / simd_width;
-    if (iterations < min_iterations) {
-      min_iterations     = iterations;
-      optimal_simd_width = simd_width;
+    if (opts.nspread == NS) {
+      if (opts.kerevalmeth) {
+        return interpSorted_kernel<NS, T, true>(sort_indices, N1, N2, N3, data_uniform,
+                                                M, kx, ky, kz, data_nonuniform, opts);
+      } else {
+        return interpSorted_kernel<NS, T, false>(sort_indices, N1, N2, N3, data_uniform,
+                                                 M, kx, ky, kz, data_nonuniform, opts);
+      }
+    } else {
+      return interpSorted_dispatch<NS - 1, T>(sort_indices, N1, N2, N3, data_uniform, M,
+                                              kx, ky, kz, data_nonuniform, opts);
    }
  }
-  return optimal_simd_width;
}

-template<class T, uint8_t N> constexpr auto GetPaddedSIMDWidth() {
-  // helper function to get the SIMD width with padding for the given number of elements
-  // that minimizes the number of iterations
-  return xsimd::make_sized_batch<T, find_optimal_simd_width<T, N>()>::type::size;
+template<typename T>
+static int interpSorted(
+    const std::vector<BIGINT> &sort_indices, const UBIGINT N1, const UBIGINT N2,
+    const UBIGINT N3, T *FINUFFT_RESTRICT data_uniform, const UBIGINT M,
+    T *FINUFFT_RESTRICT kx, T *FINUFFT_RESTRICT ky, T *FINUFFT_RESTRICT kz,
+    T *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts) {
+  return interpSorted_dispatch<MAX_NSPREAD, T>(sort_indices, N1, N2, N3, data_uniform,
+                                               M, kx, ky, kz, data_nonuniform, opts);
}
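// [Editor's illustrative sketch -- not part of the FINUFFT diff.] The
// interpSorted_dispatch() above turns the runtime kernel width opts.nspread
// into a compile-time template parameter by recursing from MAX_NSPREAD down
// to MIN_NSPREAD, so every supported width gets its own fully-optimized
// instantiation. A standalone analogue, with hypothetical bounds MIN_W/MAX_W
// and a toy kernel standing in for interpSorted_kernel:
#include <cstdio>

constexpr int MIN_W = 2, MAX_W = 16;

template<int W> int toy_kernel() { return W * W; } // stand-in for the real kernel

template<int W = MAX_W> int dispatch(int w) {
  if constexpr (W == MIN_W) {
    return toy_kernel<MIN_W>();         // base case: clamp to smallest width
  } else {
    if (w == W) return toy_kernel<W>(); // exact match: use this instantiation
    return dispatch<W - 1>(w);          // otherwise walk down at compile time
  }
}

int main() { std::printf("%d\n", dispatch(7)); } // prints 49 via toy_kernel<7>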
-template<class T, uint8_t ns> constexpr auto get_padding() {
-  // helper function to get the padding for the given number of elements
-  // ns is known at compile time, rounds ns to the next multiple of the SIMD width
-  // then subtracts ns to get the padding using a bitwise and trick
-  // WARNING: this trick works only for powers of 2
-  // SOURCE: Agner Fog's VCL manual
-  constexpr uint8_t width = GetPaddedSIMDWidth<T, ns>();
-  return ((ns + width - 1) & (-width)) - ns;
+template<typename T>
+int spreadinterpSorted(const std::vector<BIGINT> &sort_indices, const UBIGINT N1,
+                       const UBIGINT N2, const UBIGINT N3, T *data_uniform,
+                       const UBIGINT M, T *FINUFFT_RESTRICT kx, T *FINUFFT_RESTRICT ky,
+                       T *FINUFFT_RESTRICT kz, T *FINUFFT_RESTRICT data_nonuniform,
+                       const finufft_spread_opts &opts, int did_sort)
+/* Logic to select the main spreading (dir=1) vs interpolation (dir=2) routine.
+   See spreadinterp() above for input arguments and definitions.
+   Return value should always be 0 (no error reporting).
+   Split out by Melody Shih, Jun 2018; renamed Barnett 5/20/20.
+*/
+{
+  if (opts.spread_direction == 1) // ========= direction 1 (spreading) =======
+    spreadSorted(sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform,
+                 opts, did_sort);
+
+  else // ================= direction 2 (interpolation) ===========
+    interpSorted(sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform,
+                 opts);
+
+  return 0;
}

+template int spreadinterpSorted(
+    const std::vector<BIGINT> &sort_indices, const UBIGINT N1, const UBIGINT N2,
+    const UBIGINT N3, float *data_uniform, const UBIGINT M, float *FINUFFT_RESTRICT kx,
+    float *FINUFFT_RESTRICT ky, float *FINUFFT_RESTRICT kz,
+    float *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts,
+    int did_sort);
+template int spreadinterpSorted(
+    const std::vector<BIGINT> &sort_indices, const UBIGINT N1, const UBIGINT N2,
+    const UBIGINT N3, double *data_uniform, const UBIGINT M, double *FINUFFT_RESTRICT kx,
+    double *FINUFFT_RESTRICT ky, double *FINUFFT_RESTRICT kz,
+    double *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts,
+    int did_sort);
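// [Editor's illustrative sketch -- not part of the FINUFFT diff.] A quick
// check of the power-of-2 round-up trick used by get_padding() above (and
// selected at runtime by get_padding_helper() below): for a power-of-2
// width, (n + width - 1) & -width rounds n up to the next multiple of width,
// so subtracting n yields the number of zero-padding slots.
#include <cassert>

constexpr unsigned pad_to_width(unsigned n, unsigned width) {
  // width must be a power of 2 (cf. the Agner Fog VCL reference above)
  return ((n + width - 1) & (0u - width)) - n;
}

int main() {
  static_assert(pad_to_width(7, 8) == 1, "7 rounds up to 8");
  static_assert(pad_to_width(8, 8) == 0, "already a multiple");
  static_assert(pad_to_width(9, 8) == 7, "9 rounds up to 16");
  assert(pad_to_width(13, 4) == 3); // 13 -> 16
  return 0;
}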
-template<class T, uint8_t ns> constexpr auto get_padding_helper(uint8_t runtime_ns) {
-  // helper function to get the padding for the given number of elements where ns is
-  // known at runtime; it uses recursion to find the padding
-  // this avoids having a function with a large number of switch cases,
-  // as GetPaddedSIMDWidth requires a compile time value
-  // it cannot be a lambda function because of the template recursion
-  if constexpr (ns < 2) {
-    return 0;
-  } else {
-    if (runtime_ns == ns) {
-      return get_padding<T, ns>();
-    } else {
-      return get_padding_helper(runtime_ns);
+///////////////////////////////////////////////////////////////////////////
+
+template<typename T>
+FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts, T eps,
+                                                double upsampfac, int kerevalmeth,
+                                                int debug, int showwarn, int dim)
+/* Initializes spreader kernel parameters given desired NUFFT tolerance eps,
+   upsampling factor (=sigma in paper, or R in Dutt-Rokhlin), ker eval meth
+   (either 0:exp(sqrt()), 1: Horner ppval), and some debug-level flags.
+   Also sets all default options in finufft_spread_opts. See finufft_spread_opts.h for
+   opts. dim is spatial dimension (1,2, or 3). See finufft.cpp:finufft_plan() for where
+   upsampfac is set. Must call this before any kernel evals done, otherwise segfault
+   likely.
+   Returns: 0                          : success
+            FINUFFT_WARN_EPS_TOO_SMALL : requested eps cannot be achieved, but
+                                         proceed with best possible eps
+            otherwise                  : failure (see codes in defs.h); spreading
+                                         must not proceed
+   Barnett 2017. debug, loosened eps logic 6/14/20.
+*/
+{
+  constexpr T EPSILON = std::numeric_limits<T>::epsilon();
+  if (upsampfac != 2.0 && upsampfac != 1.25) { // nonstandard sigma
+    if (kerevalmeth == 1) {
+      fprintf(stderr,
+              "FINUFFT setup_spreader: nonstandard upsampfac=%.3g cannot be handled by "
+              "kerevalmeth=1\n",
+              upsampfac);
+      return FINUFFT_ERR_HORNER_WRONG_BETA;
+    }
+    if (upsampfac <= 1.0) { // no digits would result
+      fprintf(stderr, "FINUFFT setup_spreader: error, upsampfac=%.3g is <=1.0\n",
+              upsampfac);
+      return FINUFFT_ERR_UPSAMPFAC_TOO_SMALL;
    }
+    // calling routine must abort on above errors, since opts is garbage!
+    if (showwarn && upsampfac > 4.0)
+      fprintf(stderr,
+              "FINUFFT setup_spreader warning: upsampfac=%.3g way too large to be "
+              "beneficial.\n",
+              upsampfac);
  }
-
-template<class T> uint8_t get_padding(uint8_t ns) {
-  // return the padding as a function of the number of elements
-  // 2 * MAX_NSPREAD is the maximum number of elements that we can have
-  // that's why it is hardcoded here
-  return get_padding_helper<T, 2 * MAX_NSPREAD>(ns);
-}
+  // write out default finufft_spread_opts (some overridden in setup_spreader_for_nufft)
+  opts.spread_direction = 0; // user should always set to 1 or 2 as desired
+  opts.sort             = 2; // 2:auto-choice
+  opts.kerpad           = 0; // affects only evaluate_kernel_vector
+  opts.kerevalmeth      = kerevalmeth;
+  opts.upsampfac        = upsampfac;
+  opts.nthreads         = 0; // all avail
+  opts.sort_threads     = 0; // 0:auto-choice
+  // heuristic dir=1 chunking for nthr>>1, typical for intel i7 and skylake...
+  opts.max_subproblem_size = (dim == 1) ? 10000 : 100000;
+  opts.flags = 0; // 0:no timing flags (>0 for experts only)
+  opts.debug = 0; // 0:no debug output
+  // heuristic nthr above which switch OMP critical to atomic (add_wrapped...):
+  opts.atomic_threshold = 10; // R Blackwell's value

-struct zip_low {
-  // helper struct to get the lower half of a SIMD register and zip it with itself
-  // it returns index 0, 0, 1, 1, ... N/2, N/2
-  static constexpr unsigned get(unsigned index, unsigned /*size*/) { return index / 2; }
-};
-struct zip_hi {
-  // helper struct to get the upper half of a SIMD register and zip it with itself
-  // it returns index N/2, N/2, N/2+1, N/2+1, ... N, N
-  static constexpr unsigned get(unsigned index, unsigned size) {
-    return (size + index) / 2;
-  }
-};
-template<unsigned cap> struct reverse_index {
-  static constexpr unsigned get(unsigned index, const unsigned size) {
-    return index < cap ? (cap - 1 - index) : index;
+  int ns, ier = 0;     // Set kernel width w (aka ns, nspread) then copy to opts...
+  if (eps < EPSILON) { // safety; there's no hope of beating e_mach
+    if (showwarn)
+      fprintf(stderr, "%s warning: increasing tol=%.3g to eps_mach=%.3g.\n", __func__,
+              (double)eps, (double)EPSILON);
+    eps = EPSILON; // only changes local copy (not any opts)
+    ier = FINUFFT_WARN_EPS_TOO_SMALL;
  }
-};
-template<unsigned cap> struct shuffle_index {
-  static constexpr unsigned get(unsigned index, const unsigned size) {
-    return index < cap ?
(cap - 1 - index) : size + size + cap - 1 - index;
+  if (upsampfac == 2.0)                          // standard sigma (see SISC paper)
+    ns = std::ceil(-log10(eps / (T)10.0));       // 1 digit per power of 10
+  else                                           // custom sigma
+    ns = std::ceil(-log(eps) / (PI * sqrt(1.0 - 1.0 / upsampfac))); // formula, gam=1
+  ns = max(2, ns);        // (we don't have ns=1 version yet)
+  if (ns > MAX_NSPREAD) { // clip to fit allocated arrays, Horner rules
+    if (showwarn)
+      fprintf(stderr,
+              "%s warning: at upsampfac=%.3g, tol=%.3g would need kernel width ns=%d; "
+              "clipping to max %d.\n",
+              __func__, upsampfac, (double)eps, ns, MAX_NSPREAD);
+    ns  = MAX_NSPREAD;
+    ier = FINUFFT_WARN_EPS_TOO_SMALL;
  }
-};
-
-struct select_even {
-  static constexpr unsigned get(unsigned index, unsigned /*size*/) { return index * 2; }
-};
-struct select_odd {
-  static constexpr unsigned get(unsigned index, unsigned /*size*/) {
-    return index * 2 + 1;
+  opts.nspread = ns;
+  // setup for reference kernel eval (via formula): select beta width param...
+  // (even when kerevalmeth=1, this ker eval needed for FTs in onedim_*_kernel)
+  opts.ES_halfwidth = (double)ns / 2; // constants to help (see below routines)
+  opts.ES_c         = 4.0 / (double)(ns * ns);
+  double betaoverns = 2.30;       // gives decent betas for default sigma=2.0
+  if (ns == 2) betaoverns = 2.20; // some small-width tweaks...
+  if (ns == 3) betaoverns = 2.26;
+  if (ns == 4) betaoverns = 2.38;
+  if (upsampfac != 2.0) { // again, override beta for custom sigma
+    T gamma    = 0.97;    // must match devel/gen_all_horner_C_code.m !
+    betaoverns = gamma * PI * (1.0 - 1.0 / (2 * upsampfac)); // formula based on cutoff
  }
-};
+  opts.ES_beta = betaoverns * ns; // set the kernel beta parameter
+  if (debug)
+    printf("%s (kerevalmeth=%d) eps=%.3g sigma=%.3g: chose ns=%d beta=%.3g\n", __func__,
+           kerevalmeth, (double)eps, upsampfac, ns, opts.ES_beta);

-template<class T> auto xsimd_to_array(const T &vec) noexcept {
-  constexpr auto alignment = T::arch_type::alignment();
-  alignas(alignment) std::array<typename T::value_type, T::size> array{};
-  vec.store_aligned(array.data());
-  return array;
+  return ier;
}
-
-void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset3,
-                        UBIGINT padded_size1, UBIGINT size1, UBIGINT size2, UBIGINT size3,
-                        UBIGINT M0) {
-  printf("size1 %ld, padded_size1 %ld\n", size1, padded_size1);
-  switch (ndims) {
-  case 1:
-    printf("\tsubgrid: off %lld\t siz %lld\t #NU %lld\n", (long long)offset1,
-           (long long)padded_size1, (long long)M0);
-    break;
-  case 2:
-    printf("\tsubgrid: off %lld,%lld\t siz %lld,%lld\t #NU %lld\n", (long long)offset1,
-           (long long)offset2, (long long)padded_size1, (long long)size2, (long long)M0);
-    break;
-  case 3:
-    printf("\tsubgrid: off %lld,%lld,%lld\t siz %lld,%lld,%lld\t #NU %lld\n",
-           (long long)offset1, (long long)offset2, (long long)offset3,
-           (long long)padded_size1, (long long)size2, (long long)size3, (long long)M0);
-    break;
-  default:
-    printf("Invalid number of dimensions: %d\n", ndims);
-    break;
-  }
 }
+template FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(
+    finufft_spread_opts &opts, float eps, double upsampfac, int kerevalmeth, int debug,
+    int showwarn, int dim);
+template FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(
+    finufft_spread_opts &opts, double eps, double upsampfac, int kerevalmeth, int debug,
+    int showwarn, int dim);
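// [Editor's illustrative sketch -- not part of the FINUFFT diff.] The width
// and beta heuristics above in numbers: at sigma=2 each decimal digit of
// accuracy costs roughly one grid point of kernel width, while sigma=1.25
// needs a wider kernel for the same tolerance.
#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  const double pi = std::acos(-1.0);
  for (double eps : {1e-3, 1e-6, 1e-9}) {
    int ns2   = std::max(2, (int)std::ceil(-std::log10(eps / 10.0))); // sigma = 2.0
    int ns125 = std::max(
        2, (int)std::ceil(-std::log(eps) / (pi * std::sqrt(1.0 - 1.0 / 1.25))));
    double beta = 2.30 * ns2; // default beta/ns at sigma = 2 (small-ns tweaks above)
    std::printf("eps=%.0e: ns=%d (sigma=2), ns=%d (sigma=1.25), beta=%.2f\n", eps, ns2,
                ns125, beta);
  }
}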
+
+template<typename T>
+T evaluate_kernel(T x, const finufft_spread_opts &opts)
+/* ES ("exp sqrt") kernel evaluation at single real argument:
+      phi(x) = exp(beta.(sqrt(1 - (2x/n_s)^2) - 1)),    for |x| < nspread/2
+   related to an asymptotic approximation to the Kaiser--Bessel, itself an
+   approximation to prolate spheroidal wavefunction (PSWF) of order 0.
+   This is the "reference implementation", used by eg finufft/onedim_* 2/17/17.
+   Rescaled so max is 1, Barnett 7/21/24
+*/
+{
+  if (abs(x) >= (T)opts.ES_halfwidth)
+    // if spreading/FT careful, shouldn't need this if, but causes no speed hit
+    return 0.0;
+  else
+    return exp((T)opts.ES_beta * (sqrt((T)1.0 - (T)opts.ES_c * x * x) - (T)1.0));
}
-
+
+template float evaluate_kernel(float x, const finufft_spread_opts &opts);
+template double evaluate_kernel(double x, const finufft_spread_opts &opts);
 } // namespace finufft::spreadinterp
diff --git a/src/utils.cpp b/src/utils.cpp
index 8df6ed665..f64009132 100644
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -1,86 +1,89 @@
-// Low-level array manipulations, timer, and OMP helpers, that need separate
-// single/double routines (FLT must be an arg). Others are in utils_precindep
+// Low-level array manipulations, timer, and OMP helpers, that are precision-
+// independent (no FLT allowed in argument lists).

-// For self-test see ../test/testutils.cpp Barnett 2017-2020.
+// For self-test see ../test/testutils.cpp. Barnett 2017-2020.
+
+#include <random>

 #include "finufft/utils.h"
-#include "finufft/defs.h"
+using namespace std;

 namespace finufft {
 namespace utils {

-// ------------ complex array utils ---------------------------------
-
-FLT relerrtwonorm(BIGINT n, CPX *a, CPX *b)
-// ||a-b||_2 / ||a||_2
+BIGINT next235even(BIGINT n)
+// finds even integer not less than n, with prime factors no larger than 5
+// (ie, "smooth"). Adapted from fortran in hellskitchen. Barnett 2/9/17
+// changed INT64 type 3/28/17. Runtime is around n*1e-11 sec for big n.
 {
-  FLT err = 0.0, nrm = 0.0;
-  for (BIGINT m = 0; m < n; ++m) {
-    nrm += real(conj(a[m]) * a[m]);
-    CPX diff = a[m] - b[m];
-    err += real(conj(diff) * diff);
+  if (n <= 2) return 2;
+  if (n % 2 == 1) n += 1;   // even
+  BIGINT nplus  = n - 2;    // to cancel out the +=2 at start of loop
+  BIGINT numdiv = 2;        // a dummy that is >1
+  while (numdiv > 1) {
+    nplus += 2;             // stays even
+    numdiv = nplus;
+    while (numdiv % 2 == 0) numdiv /= 2; // remove all factors of 2,3,5...
+    while (numdiv % 3 == 0) numdiv /= 3;
+    while (numdiv % 5 == 0) numdiv /= 5;
  }
-  return sqrt(err / nrm);
+  return nplus;
}

-FLT errtwonorm(BIGINT n, CPX *a, CPX *b)
-// ||a-b||_2
-{
-  FLT err = 0.0; // compute error 2-norm
-  for (BIGINT m = 0; m < n; ++m) {
-    CPX diff = a[m] - b[m];
-    err += real(conj(diff) * diff);
-  }
-  return sqrt(err);
+
+// ----------------------- helpers for timing (always stay double prec) ------
+
+void CNTime::start() {
+  initial = double(std::chrono::duration_cast<std::chrono::microseconds>(
+                       std::chrono::steady_clock::now().time_since_epoch())
+                       .count()) *
+            1e-6;
}

-FLT twonorm(BIGINT n, CPX *a)
-// ||a||_2
+
+double CNTime::restart()
+// Barnett changed to returning in sec
{
-  FLT nrm = 0.0;
-  for (BIGINT m = 0; m < n; ++m) nrm += real(conj(a[m]) * a[m]);
-  return sqrt(nrm);
+  double delta = elapsedsec();
+  start();
+  return delta;
}
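// [Editor's illustrative sketch -- not part of the FINUFFT diff.] What
// next235even() above guarantees: the smallest even integer >= n whose prime
// factors are all <= 5, i.e. the "smooth" sizes FFTW handles fastest. A
// standalone copy for experimentation:
#include <cstdint>
#include <cstdio>

std::int64_t next235even(std::int64_t n) {
  if (n <= 2) return 2;
  if (n % 2 == 1) n += 1;  // make even
  std::int64_t nplus  = n - 2; // cancels the += 2 at loop start
  std::int64_t numdiv = 2;     // dummy > 1 so we enter the loop
  while (numdiv > 1) {
    nplus += 2;                // stays even
    numdiv = nplus;
    while (numdiv % 2 == 0) numdiv /= 2; // strip all factors of 2, 3, 5
    while (numdiv % 3 == 0) numdiv /= 3;
    while (numdiv % 5 == 0) numdiv /= 5;
  }
  return nplus;
}

int main() {
  for (std::int64_t n : {1000, 1009, 4099})
    std::printf("next235even(%lld) = %lld\n", (long long)n, (long long)next235even(n));
  // 1000 (= 2^3 * 5^3) stays 1000; 1009 rounds up to 1024 (= 2^10)
}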
-FLT infnorm(BIGINT n, CPX *a)
-// ||a||_infty
+
+double CNTime::elapsedsec()
+// returns answers as double, in seconds, to microsec accuracy. Barnett 5/22/18
 {
-  FLT nrm = 0.0;
-  for (BIGINT m = 0; m < n; ++m) {
-    FLT aa = real(conj(a[m]) * a[m]);
-    if (aa > nrm) nrm = aa;
-  }
-  return sqrt(nrm);
+  std::uint64_t now = std::chrono::duration_cast<std::chrono::microseconds>(
+                          std::chrono::steady_clock::now().time_since_epoch())
+                          .count();
+  const double nowsec = double(now) * 1e-6;
+  return nowsec - initial;
}

-// ------------ real array utils ---------------------------------
-
-void arrayrange(BIGINT n, FLT *a, FLT *lo, FLT *hi)
-// With a a length-n array, writes out min(a) to lo and max(a) to hi,
-// so that all a values lie in [lo,hi].
-// If n==0, lo and hi are not finite.
+// -------------------------- openmp helpers -------------------------------
+int get_num_threads_parallel_block()
+// return how many threads an omp parallel block would use.
+// omp_get_max_threads() does not report this; consider case of NESTED=0.
+// Why is there no such routine? Barnett 5/22/20
 {
-  *lo = INFINITY;
-  *hi = -INFINITY;
-  for (BIGINT m = 0; m < n; ++m) {
-    if (a[m] < *lo) *lo = a[m];
-    if (a[m] > *hi) *hi = a[m];
+  int nth_used;
+#pragma omp parallel
+  {
+#pragma omp single
+    nth_used = MY_OMP_GET_NUM_THREADS();
  }
+  return nth_used;
}

-void arraywidcen(BIGINT n, FLT *a, FLT *w, FLT *c)
-// Writes out w = half-width and c = center of an interval enclosing all a[n]'s
-// Only chooses a nonzero center if this increases w by less than fraction
-// ARRAYWIDCEN_GROWFRAC defined in defs.h.
-// This prevents rephasings which don't grow nf by much. 6/8/17
-// If n==0, w and c are not finite.
+// ---------- thread-safe rand number generator for Windows platform ---------
+// (note this is used by macros in defs.h, and supplied in linux/macosx)
+#ifdef _WIN32
+int rand_r(unsigned int * /*seedp*/)
+// Libin Lu, 6/18/20
 {
-  FLT lo, hi;
-  arrayrange(n, a, &lo, &hi);
-  *w = (hi - lo) / 2;
-  *c = (hi + lo) / 2;
-  if (std::abs(*c) < ARRAYWIDCEN_GROWFRAC * (*w)) {
-    *w += std::abs(*c);
-    *c = 0.0;
-  }
+  std::random_device rd;
+  std::default_random_engine generator(rd());
+  std::uniform_int_distribution distribution(0, RAND_MAX);
+  return distribution(generator);
}
+#endif

 } // namespace utils
 } // namespace finufft
diff --git a/src/utils_precindep.cpp b/src/utils_precindep.cpp
deleted file mode 100644
index 194fae7f0..000000000
--- a/src/utils_precindep.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-// Low-level array manipulations, timer, and OMP helpers, that are precision-
-// independent (no FLT allowed in argument lists). Others are in utils.cpp
-
-// For self-test see ../test/testutils.cpp. Barnett 2017-2020.
-
-#include <random>
-
-#include "finufft/defs.h"
-#include "finufft/utils_precindep.h"
-using namespace std;
-
-namespace finufft {
-namespace utils {
-
-BIGINT next235even(BIGINT n)
-// finds even integer not less than n, with prime factors no larger than 5
-// (ie, "smooth"). Adapted from fortran in hellskitchen. Barnett 2/9/17
-// changed INT64 type 3/28/17. Runtime is around n*1e-11 sec for big n.
-{
-  if (n <= 2) return 2;
-  if (n % 2 == 1) n += 1;   // even
-  BIGINT nplus  = n - 2;    // to cancel out the +=2 at start of loop
-  BIGINT numdiv = 2;        // a dummy that is >1
-  while (numdiv > 1) {
-    nplus += 2;             // stays even
-    numdiv = nplus;
-    while (numdiv % 2 == 0) numdiv /= 2; // remove all factors of 2,3,5...
-    while (numdiv % 3 == 0) numdiv /= 3;
-    while (numdiv % 5 == 0) numdiv /= 5;
-  }
-  return nplus;
-}
-
-// ----------------------- helpers for timing (always stay double prec) ------
-
-void CNTime::start() {
-  initial = std::chrono::duration_cast<std::chrono::microseconds>(
-                std::chrono::steady_clock::now().time_since_epoch())
-                .count() *
-            1e-6;
-}
-
-double CNTime::restart()
-// Barnett changed to returning in sec
-{
-  double delta = elapsedsec();
-  start();
-  return delta;
-}
-
-double CNTime::elapsedsec()
-// returns answers as double, in seconds, to microsec accuracy. Barnett 5/22/18
-{
-  std::uint64_t now = std::chrono::duration_cast<std::chrono::microseconds>(
-                          std::chrono::steady_clock::now().time_since_epoch())
-                          .count();
-  const double nowsec = now * 1e-6;
-  return nowsec - initial;
-}
-
-// -------------------------- openmp helpers -------------------------------
-int get_num_threads_parallel_block()
-// return how many threads an omp parallel block would use.
-// omp_get_max_threads() does not report this; consider case of NESTED=0.
-// Why is there no such routine? Barnett 5/22/20
-{
-  int nth_used;
-#pragma omp parallel
-  {
-#pragma omp single
-    nth_used = MY_OMP_GET_NUM_THREADS();
-  }
-  return nth_used;
-}
-
-// ---------- thread-safe rand number generator for Windows platform ---------
-// (note this is used by macros in defs.h, and supplied in linux/macosx)
-#ifdef _WIN32
-int rand_r(unsigned int *seedp)
-// Libin Lu, 6/18/20
-{
-  std::random_device rd;
-  std::default_random_engine generator(rd());
-  std::uniform_int_distribution distribution(0, RAND_MAX);
-  return distribution(generator);
-}
-#endif
-
-} // namespace utils
-} // namespace finufft
diff --git a/test/testutils.cpp b/test/testutils.cpp
index 64b5d7a0a..6facb72cd 100644
--- a/test/testutils.cpp
+++ b/test/testutils.cpp
@@ -1,4 +1,4 @@
-/* unit tests for utils & utils_precindep modules.
+/* unit tests for utils module.

   Usage: ./testutils{f}

@@ -10,8 +10,8 @@
   Suggested compile (double/float versions):
   g++ -std=c++14 -fopenmp testutils.cpp -I../include ../src/utils.o
-  ../src/utils_precindep.o -o testutils -lgomp g++ -std=c++14 -fopenmp testutils.cpp
-  -I../include ../src/utils_32.o ../src/utils_precindep.o -o testutilsf -lgomp -DSINGLE
+  ../src/utils.o -o testutils -lgomp g++ -std=c++14 -fopenmp testutils.cpp
+  -I../include ../src/utils.o -o testutilsf -lgomp -DSINGLE
 */

 // This switches FLT macro from double to float if SINGLE is defined, etc...
@@ -57,7 +57,8 @@ int main(int argc, char *argv[]) {
     a[j] = CPX(1.0, 0.0);
     b[j] = a[j];
   }
-  FLT relerr = 2.0 * EPSILON; // 1 ULP, fine since 1.0 rep exactly
+  constexpr FLT EPSILON = std::numeric_limits<FLT>::epsilon();
+  FLT relerr = 2.0 * EPSILON; // 1 ULP, fine since 1.0 rep exactly
   if (abs(infnorm(M, &a[0]) - 1.0) > relerr) return 1;
   if (abs(twonorm(M, &a[0]) - sqrt((FLT)M)) > relerr * sqrt((FLT)M)) return 1;
   b[0] = CPX(0.0, 0.0); // perturb b from a
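// [Editor's illustrative sketch -- not part of the FINUFFT diff.] A
// self-contained analogue of the relerrtwonorm()/twonorm() checks this unit
// test relies on, i.e. the relative error ||a - b||_2 / ||a||_2 for complex
// arrays:
#include <cmath>
#include <complex>
#include <cstdio>
#include <vector>

template<class T>
T relerrtwonorm(const std::vector<std::complex<T>> &a,
                const std::vector<std::complex<T>> &b) {
  T err = 0, nrm = 0;
  for (std::size_t m = 0; m < a.size(); ++m) {
    nrm += std::norm(a[m]);        // |a_m|^2
    err += std::norm(a[m] - b[m]); // |a_m - b_m|^2
  }
  return std::sqrt(err / nrm);
}

int main() {
  std::vector<std::complex<double>> a(1000, {1.0, 0.0}), b = a;
  b[0] = {0.0, 0.0}; // perturb one entry, as the test above does
  std::printf("relerr = %.3g\n", relerrtwonorm(a, b)); // approx 1/sqrt(1000)
}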