diff --git a/CMakeLists.txt b/CMakeLists.txt
index 423c8adc4..7e5e2cf5d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -121,9 +121,7 @@ endif()
 # This set of sources is compiled twice, once in single precision and once in
 # double precision. The single precision compilation is done with -DSINGLE
-set(FINUFFT_PRECISION_DEPENDENT_SOURCES
-    src/finufft.cpp src/fft.cpp src/simpleinterfaces.cpp src/spreadinterp.cpp
-    src/utils.cpp)
+set(FINUFFT_PRECISION_DEPENDENT_SOURCES)
 # If we're building for Fortran, make sure we also include the translation
 # layer.
@@ -252,25 +250,30 @@ endfunction()
 if(FINUFFT_USE_CPU)
   # Main finufft libraries
-  add_library(finufft_f32 OBJECT ${FINUFFT_PRECISION_DEPENDENT_SOURCES})
-  target_compile_definitions(finufft_f32 PRIVATE SINGLE)
-  set_finufft_options(finufft_f32)
-
-  add_library(finufft_f64 OBJECT ${FINUFFT_PRECISION_DEPENDENT_SOURCES})
-  set_finufft_options(finufft_f64)
   if(NOT FINUFFT_STATIC_LINKING)
-    add_library(finufft SHARED src/utils_precindep.cpp
-                               contrib/legendre_rule_fast.cpp)
+    add_library(
+      finufft SHARED
+      src/spreadinterp.cpp
+      src/utils.cpp
+      contrib/legendre_rule_fast.cpp
+      src/fft.cpp
+      src/finufft_core.cpp
+      src/simpleinterfaces.cpp
+      fortran/finufftfort.cpp)
   else()
-    add_library(finufft STATIC src/utils_precindep.cpp
-                               contrib/legendre_rule_fast.cpp)
+    add_library(
+      finufft STATIC
+      src/spreadinterp.cpp
+      src/utils.cpp
+      contrib/legendre_rule_fast.cpp
+      src/fft.cpp
+      src/finufft_core.cpp
+      src/simpleinterfaces.cpp
+      fortran/finufftfort.cpp)
   endif()
-  target_link_libraries(finufft PRIVATE finufft_f32 finufft_f64)
   set_finufft_options(finufft)
   if(WIN32 AND FINUFFT_SHARED_LINKING)
-    target_compile_definitions(finufft_f32 PRIVATE dll_EXPORTS FINUFFT_DLL)
-    target_compile_definitions(finufft_f64 PRIVATE dll_EXPORTS FINUFFT_DLL)
     target_compile_definitions(finufft PRIVATE dll_EXPORTS FINUFFT_DLL)
   endif()
   find_library(MATH_LIBRARY m)
diff --git a/fortran/finufftfort.cpp b/fortran/finufftfort.cpp
index 799a10041..400ff0985 100644
--- a/fortran/finufftfort.cpp
+++ b/fortran/finufftfort.cpp
@@ -19,43 +19,15 @@
 // public header
 #include
-
-// private headers needed...
(must come after finufft.h which clobbers FINUFFT*) -#include - -// local prec-switching macros for fortran names, ie -// underscore-suffixed versions of those at end of defs.h -#define FINUFFT_DEFAULT_OPTS_ FINUFFTIFY(_default_opts_) -#define FINUFFT_MAKEPLAN_ FINUFFTIFY(_makeplan_) -#define FINUFFT_SETPTS_ FINUFFTIFY(_setpts_) -#define FINUFFT_EXECUTE_ FINUFFTIFY(_execute_) -#define FINUFFT_DESTROY_ FINUFFTIFY(_destroy_) -#define FINUFFT1D1_ FINUFFTIFY(1d1_) -#define FINUFFT1D2_ FINUFFTIFY(1d2_) -#define FINUFFT1D3_ FINUFFTIFY(1d3_) -#define FINUFFT2D1_ FINUFFTIFY(2d1_) -#define FINUFFT2D2_ FINUFFTIFY(2d2_) -#define FINUFFT2D3_ FINUFFTIFY(2d3_) -#define FINUFFT3D1_ FINUFFTIFY(3d1_) -#define FINUFFT3D2_ FINUFFTIFY(3d2_) -#define FINUFFT3D3_ FINUFFTIFY(3d3_) -#define FINUFFT1D1MANY_ FINUFFTIFY(1d1many_) -#define FINUFFT1D2MANY_ FINUFFTIFY(1d2many_) -#define FINUFFT1D3MANY_ FINUFFTIFY(1d3many_) -#define FINUFFT2D1MANY_ FINUFFTIFY(2d1many_) -#define FINUFFT2D2MANY_ FINUFFTIFY(2d2many_) -#define FINUFFT2D3MANY_ FINUFFTIFY(2d3many_) -#define FINUFFT3D1MANY_ FINUFFTIFY(3d1many_) -#define FINUFFT3D2MANY_ FINUFFTIFY(3d2many_) -#define FINUFFT3D3MANY_ FINUFFTIFY(3d3many_) +#include #ifdef __cplusplus extern "C" { #endif // --------------------- guru interface from fortran ------------------------ -void FINUFFT_MAKEPLAN_(int *type, int *n_dims, BIGINT *n_modes, int *iflag, int *n_transf, - FLT *tol, FINUFFT_PLAN *plan, finufft_opts *o, int *ier) { +void finufft_makeplan_(int *type, int *n_dims, BIGINT *n_modes, int *iflag, int *n_transf, + double *tol, finufft_plan *plan, finufft_opts *o, int *ier) { if (!plan) fprintf(stderr, "%s fortran: plan must be allocated as at least the size of a C pointer " @@ -63,143 +35,325 @@ void FINUFFT_MAKEPLAN_(int *type, int *n_dims, BIGINT *n_modes, int *iflag, int __func__); else { // pass o whether it's a NULL or pointer to a fortran-allocated finufft_opts: - *ier = FINUFFT_MAKEPLAN(*type, *n_dims, n_modes, *iflag, *n_transf, *tol, plan, o); + *ier = finufft_makeplan(*type, *n_dims, n_modes, *iflag, *n_transf, *tol, plan, o); } } -void FINUFFT_SETPTS_(FINUFFT_PLAN *plan, BIGINT *M, FLT *xj, FLT *yj, FLT *zj, BIGINT *nk, - FLT *s, FLT *t, FLT *u, int *ier) { +void finufft_setpts_(finufft_plan *plan, BIGINT *M, double *xj, double *yj, double *zj, + BIGINT *nk, double *s, double *t, double *u, int *ier) { if (!*plan) { fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__); return; } int nk_safe = 0; // catches the case where user passes NULL in - if (nk) nk_safe = *nk; - *ier = FINUFFT_SETPTS(*plan, *M, xj, yj, zj, nk_safe, s, t, u); + if (nk) nk_safe = int(*nk); + *ier = finufft_setpts(*plan, *M, xj, yj, zj, nk_safe, s, t, u); } -void FINUFFT_EXECUTE_(FINUFFT_PLAN *plan, CPX *weights, CPX *result, int *ier) { +void finufft_execute_(finufft_plan *plan, std::complex *weights, + std::complex *result, int *ier) { if (!plan) fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__); else - *ier = FINUFFT_EXECUTE(*plan, weights, result); + *ier = finufft_execute(*plan, weights, result); } -void FINUFFT_DESTROY_(FINUFFT_PLAN *plan, int *ier) { +void finufft_destroy_(finufft_plan *plan, int *ier) { if (!plan) fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__); else - *ier = FINUFFT_DESTROY(*plan); + *ier = finufft_destroy(*plan); } // ------------ use FINUFFT to set the default options --------------------- // (Note the finufft_opts is created in f90-style derived types, not here) -void FINUFFT_DEFAULT_OPTS_(finufft_opts *o) { +void 
finufft_default_opts_(finufft_opts *o) { if (!o) fprintf(stderr, "%s fortran: opts must be allocated!\n", __func__); else // o is a ptr to already-allocated fortran finufft_opts derived type... - FINUFFT_DEFAULT_OPTS(o); + finufft_default_opts(o); } // -------------- simple and many-vector interfaces -------------------- // --- 1D --- -void FINUFFT1D1_(BIGINT *nj, FLT *xj, CPX *cj, int *iflag, FLT *eps, BIGINT *ms, CPX *fk, - finufft_opts *o, int *ier) { - *ier = FINUFFT1D1(*nj, xj, cj, *iflag, *eps, *ms, fk, o); +void finufft1d1_(BIGINT *nj, double *xj, std::complex *cj, int *iflag, + double *eps, BIGINT *ms, std::complex *fk, finufft_opts *o, + int *ier) { + *ier = finufft1d1(*nj, xj, cj, *iflag, *eps, *ms, fk, o); } -void FINUFFT1D1MANY_(int *ntransf, BIGINT *nj, FLT *xj, CPX *cj, int *iflag, FLT *eps, - BIGINT *ms, CPX *fk, finufft_opts *o, int *ier) { - *ier = FINUFFT1D1MANY(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o); +void finufft1d1many_(int *ntransf, BIGINT *nj, double *xj, std::complex *cj, + int *iflag, double *eps, BIGINT *ms, std::complex *fk, + finufft_opts *o, int *ier) { + *ier = finufft1d1many(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o); } -void FINUFFT1D2_(BIGINT *nj, FLT *xj, CPX *cj, int *iflag, FLT *eps, BIGINT *ms, CPX *fk, - finufft_opts *o, int *ier) { - *ier = FINUFFT1D2(*nj, xj, cj, *iflag, *eps, *ms, fk, o); +void finufft1d2_(BIGINT *nj, double *xj, std::complex *cj, int *iflag, + double *eps, BIGINT *ms, std::complex *fk, finufft_opts *o, + int *ier) { + *ier = finufft1d2(*nj, xj, cj, *iflag, *eps, *ms, fk, o); } -void FINUFFT1D2MANY_(int *ntransf, BIGINT *nj, FLT *xj, CPX *cj, int *iflag, FLT *eps, - BIGINT *ms, CPX *fk, finufft_opts *o, int *ier) { - *ier = FINUFFT1D2MANY(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o); +void finufft1d2many_(int *ntransf, BIGINT *nj, double *xj, std::complex *cj, + int *iflag, double *eps, BIGINT *ms, std::complex *fk, + finufft_opts *o, int *ier) { + *ier = finufft1d2many(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o); } -void FINUFFT1D3_(BIGINT *nj, FLT *x, CPX *c, int *iflag, FLT *eps, BIGINT *nk, FLT *s, - CPX *f, finufft_opts *o, int *ier) { - *ier = FINUFFT1D3(*nj, x, c, *iflag, *eps, *nk, s, f, o); +void finufft1d3_(BIGINT *nj, double *x, std::complex *c, int *iflag, double *eps, + BIGINT *nk, double *s, std::complex *f, finufft_opts *o, + int *ier) { + *ier = finufft1d3(*nj, x, c, *iflag, *eps, *nk, s, f, o); } -void FINUFFT1D3MANY_(int *ntransf, BIGINT *nj, FLT *x, CPX *c, int *iflag, FLT *eps, - BIGINT *nk, FLT *s, CPX *f, finufft_opts *o, int *ier) { - *ier = FINUFFT1D3MANY(*ntransf, *nj, x, c, *iflag, *eps, *nk, s, f, o); +void finufft1d3many_(int *ntransf, BIGINT *nj, double *x, std::complex *c, + int *iflag, double *eps, BIGINT *nk, double *s, + std::complex *f, finufft_opts *o, int *ier) { + *ier = finufft1d3many(*ntransf, *nj, x, c, *iflag, *eps, *nk, s, f, o); } // --- 2D --- -void FINUFFT2D1_(BIGINT *nj, FLT *xj, FLT *yj, CPX *cj, int *iflag, FLT *eps, BIGINT *ms, - BIGINT *mt, CPX *fk, finufft_opts *o, int *ier) { - *ier = FINUFFT2D1(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); +void finufft2d1_(BIGINT *nj, double *xj, double *yj, std::complex *cj, int *iflag, + double *eps, BIGINT *ms, BIGINT *mt, std::complex *fk, + finufft_opts *o, int *ier) { + *ier = finufft2d1(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); +} +void finufft2d1many_(int *ntransf, BIGINT *nj, double *xj, double *yj, + std::complex *cj, int *iflag, double *eps, BIGINT *ms, + BIGINT *mt, std::complex *fk, finufft_opts *o, 
int *ier) { + *ier = finufft2d1many(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); +} + +void finufft2d2_(BIGINT *nj, double *xj, double *yj, std::complex *cj, int *iflag, + double *eps, BIGINT *ms, BIGINT *mt, std::complex *fk, + finufft_opts *o, int *ier) { + *ier = finufft2d2(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); +} +void finufft2d2many_(int *ntransf, BIGINT *nj, double *xj, double *yj, + std::complex *cj, int *iflag, double *eps, BIGINT *ms, + BIGINT *mt, std::complex *fk, finufft_opts *o, int *ier) { + *ier = finufft2d2many(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); } -void FINUFFT2D1MANY_(int *ntransf, BIGINT *nj, FLT *xj, FLT *yj, CPX *cj, int *iflag, - FLT *eps, BIGINT *ms, BIGINT *mt, CPX *fk, finufft_opts *o, + +void finufft2d3_(BIGINT *nj, double *x, double *y, std::complex *c, int *iflag, + double *eps, BIGINT *nk, double *s, double *t, std::complex *f, + finufft_opts *o, int *ier) { + *ier = finufft2d3(*nj, x, y, c, *iflag, *eps, *nk, s, t, f, o); +} + +void finufft2d3many_(int *ntransf, BIGINT *nj, double *x, double *y, + std::complex *c, int *iflag, double *eps, BIGINT *nk, + double *s, double *t, std::complex *f, finufft_opts *o, int *ier) { - *ier = FINUFFT2D1MANY(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); + *ier = finufft2d3many(*ntransf, *nj, x, y, c, *iflag, *eps, *nk, s, t, f, o); } -void FINUFFT2D2_(BIGINT *nj, FLT *xj, FLT *yj, CPX *cj, int *iflag, FLT *eps, BIGINT *ms, - BIGINT *mt, CPX *fk, finufft_opts *o, int *ier) { - *ier = FINUFFT2D2(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); +// --- 3D --- +void finufft3d1_(BIGINT *nj, double *xj, double *yj, double *zj, std::complex *cj, + int *iflag, double *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu, + std::complex *fk, finufft_opts *o, int *ier) { + *ier = finufft3d1(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void FINUFFT2D2MANY_(int *ntransf, BIGINT *nj, FLT *xj, FLT *yj, CPX *cj, int *iflag, - FLT *eps, BIGINT *ms, BIGINT *mt, CPX *fk, finufft_opts *o, + +void finufft3d1many_(int *ntransf, BIGINT *nj, double *xj, double *yj, double *zj, + std::complex *cj, int *iflag, double *eps, BIGINT *ms, + BIGINT *mt, BIGINT *mu, std::complex *fk, finufft_opts *o, int *ier) { - *ier = FINUFFT2D2MANY(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); + *ier = + finufft3d1many(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void FINUFFT2D3_(BIGINT *nj, FLT *x, FLT *y, CPX *c, int *iflag, FLT *eps, BIGINT *nk, - FLT *s, FLT *t, CPX *f, finufft_opts *o, int *ier) { - *ier = FINUFFT2D3(*nj, x, y, c, *iflag, *eps, *nk, s, t, f, o); +void finufft3d2_(BIGINT *nj, double *xj, double *yj, double *zj, std::complex *cj, + int *iflag, double *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu, + std::complex *fk, finufft_opts *o, int *ier) { + *ier = finufft3d2(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void FINUFFT2D3MANY_(int *ntransf, BIGINT *nj, FLT *x, FLT *y, CPX *c, int *iflag, - FLT *eps, BIGINT *nk, FLT *s, FLT *t, CPX *f, finufft_opts *o, +void finufft3d2many_(int *ntransf, BIGINT *nj, double *xj, double *yj, double *zj, + std::complex *cj, int *iflag, double *eps, BIGINT *ms, + BIGINT *mt, BIGINT *mu, std::complex *fk, finufft_opts *o, int *ier) { - *ier = FINUFFT2D3MANY(*ntransf, *nj, x, y, c, *iflag, *eps, *nk, s, t, f, o); + *ier = + finufft3d2many(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -// --- 3D --- -void FINUFFT3D1_(BIGINT *nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int *iflag, FLT *eps, 
- BIGINT *ms, BIGINT *mt, BIGINT *mu, CPX *fk, finufft_opts *o, int *ier) { - *ier = FINUFFT3D1(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); +void finufft3d3_(BIGINT *nj, double *x, double *y, double *z, std::complex *c, + int *iflag, double *eps, BIGINT *nk, double *s, double *t, double *u, + std::complex *f, finufft_opts *o, int *ier) { + *ier = finufft3d3(*nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o); } -void FINUFFT3D1MANY_(int *ntransf, BIGINT *nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, - int *iflag, FLT *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu, CPX *fk, +void finufft3d3many_(int *ntransf, BIGINT *nj, double *x, double *y, double *z, + std::complex *c, int *iflag, double *eps, BIGINT *nk, + double *s, double *t, double *u, std::complex *f, finufft_opts *o, int *ier) { + *ier = finufft3d3many(*ntransf, *nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o); +} + +// --------------------- guru interface from fortran ------------------------ +void finufftf_makeplan_(int *type, int *n_dims, BIGINT *n_modes, int *iflag, + int *n_transf, float *tol, finufftf_plan *plan, finufft_opts *o, + int *ier) { + if (!plan) + fprintf(stderr, + "%s fortran: plan must be allocated as at least the size of a C pointer " + "(usually 8 bytes)!\n", + __func__); + else { + // pass o whether it's a NULL or pointer to a fortran-allocated finufft_opts: + *ier = finufftf_makeplan(*type, *n_dims, n_modes, *iflag, *n_transf, *tol, plan, o); + } +} + +void finufftf_setpts_(finufftf_plan *plan, BIGINT *M, float *xj, float *yj, float *zj, + BIGINT *nk, float *s, float *t, float *u, int *ier) { + if (!*plan) { + fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__); + return; + } + int nk_safe = 0; // catches the case where user passes NULL in + if (nk) nk_safe = int(*nk); + *ier = finufftf_setpts(*plan, *M, xj, yj, zj, nk_safe, s, t, u); +} + +void finufftf_execute_(finufftf_plan *plan, std::complex *weights, + std::complex *result, int *ier) { + if (!plan) + fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__); + else + *ier = finufftf_execute(*plan, weights, result); +} + +void finufftf_destroy_(finufftf_plan *plan, int *ier) { + if (!plan) + fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__); + else + *ier = finufftf_destroy(*plan); +} + +// ------------ use FINUFFT to set the default options --------------------- +// (Note the finufft_opts is created in f90-style derived types, not here) +void finufftf_default_opts_(finufft_opts *o) { + if (!o) + fprintf(stderr, "%s fortran: opts must be allocated!\n", __func__); + else + // o is a ptr to already-allocated fortran finufft_opts derived type... 
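All of these wrappers follow the same f77 binding convention: an extern "C" symbol with a trailing underscore (the classic Fortran name-mangling scheme), every argument passed by pointer because Fortran passes by reference, and the status code returned through the trailing ier argument rather than a return value. A minimal self-contained sketch of that convention; demo_ and its arguments are hypothetical, not FINUFFT symbols:

    // trailing-underscore extern "C" symbol, callable from f77 as: call demo(n, x, ier)
    extern "C" void demo_(int *n, double *x, int *ier) {
      if (!n || !x) { *ier = 1; return; } // failure reported via trailing status arg
      for (int i = 0; i < *n; ++i) x[i] *= 2.0;
      *ier = 0;
    }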
+ finufft_default_opts(o); +} + +// -------------- simple and many-vector interfaces -------------------- +// --- 1D --- +void finufftf1d1_(BIGINT *nj, float *xj, std::complex *cj, int *iflag, float *eps, + BIGINT *ms, std::complex *fk, finufft_opts *o, int *ier) { + *ier = finufftf1d1(*nj, xj, cj, *iflag, *eps, *ms, fk, o); +} + +void finufftf1d1many_(int *ntransf, BIGINT *nj, float *xj, std::complex *cj, + int *iflag, float *eps, BIGINT *ms, std::complex *fk, + finufft_opts *o, int *ier) { + *ier = finufftf1d1many(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o); +} + +void finufftf1d2_(BIGINT *nj, float *xj, std::complex *cj, int *iflag, float *eps, + BIGINT *ms, std::complex *fk, finufft_opts *o, int *ier) { + *ier = finufftf1d2(*nj, xj, cj, *iflag, *eps, *ms, fk, o); +} + +void finufftf1d2many_(int *ntransf, BIGINT *nj, float *xj, std::complex *cj, + int *iflag, float *eps, BIGINT *ms, std::complex *fk, + finufft_opts *o, int *ier) { + *ier = finufftf1d2many(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o); +} + +void finufftf1d3_(BIGINT *nj, float *x, std::complex *c, int *iflag, float *eps, + BIGINT *nk, float *s, std::complex *f, finufft_opts *o, + int *ier) { + *ier = finufftf1d3(*nj, x, c, *iflag, *eps, *nk, s, f, o); +} + +void finufftf1d3many_(int *ntransf, BIGINT *nj, float *x, std::complex *c, + int *iflag, float *eps, BIGINT *nk, float *s, + std::complex *f, finufft_opts *o, int *ier) { + *ier = finufftf1d3many(*ntransf, *nj, x, c, *iflag, *eps, *nk, s, f, o); +} + +// --- 2D --- +void finufftf2d1_(BIGINT *nj, float *xj, float *yj, std::complex *cj, int *iflag, + float *eps, BIGINT *ms, BIGINT *mt, std::complex *fk, + finufft_opts *o, int *ier) { + *ier = finufftf2d1(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); +} +void finufftf2d1many_(int *ntransf, BIGINT *nj, float *xj, float *yj, + std::complex *cj, int *iflag, float *eps, BIGINT *ms, + BIGINT *mt, std::complex *fk, finufft_opts *o, int *ier) { + *ier = finufftf2d1many(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); +} + +void finufftf2d2_(BIGINT *nj, float *xj, float *yj, std::complex *cj, int *iflag, + float *eps, BIGINT *ms, BIGINT *mt, std::complex *fk, + finufft_opts *o, int *ier) { + *ier = finufftf2d2(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); +} +void finufftf2d2many_(int *ntransf, BIGINT *nj, float *xj, float *yj, + std::complex *cj, int *iflag, float *eps, BIGINT *ms, + BIGINT *mt, std::complex *fk, finufft_opts *o, int *ier) { + *ier = finufftf2d2many(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); +} + +void finufftf2d3_(BIGINT *nj, float *x, float *y, std::complex *c, int *iflag, + float *eps, BIGINT *nk, float *s, float *t, std::complex *f, + finufft_opts *o, int *ier) { + *ier = finufftf2d3(*nj, x, y, c, *iflag, *eps, *nk, s, t, f, o); +} + +void finufftf2d3many_(int *ntransf, BIGINT *nj, float *x, float *y, + std::complex *c, int *iflag, float *eps, BIGINT *nk, + float *s, float *t, std::complex *f, finufft_opts *o, + int *ier) { + *ier = finufftf2d3many(*ntransf, *nj, x, y, c, *iflag, *eps, *nk, s, t, f, o); +} + +// --- 3D --- +void finufftf3d1_(BIGINT *nj, float *xj, float *yj, float *zj, std::complex *cj, + int *iflag, float *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu, + std::complex *fk, finufft_opts *o, int *ier) { + *ier = finufftf3d1(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); +} + +void finufftf3d1many_(int *ntransf, BIGINT *nj, float *xj, float *yj, float *zj, + std::complex *cj, int *iflag, float *eps, BIGINT *ms, + BIGINT *mt, BIGINT *mu, std::complex 
*fk, finufft_opts *o, + int *ier) { *ier = - FINUFFT3D1MANY(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); + finufftf3d1many(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void FINUFFT3D2_(BIGINT *nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int *iflag, FLT *eps, - BIGINT *ms, BIGINT *mt, BIGINT *mu, CPX *fk, finufft_opts *o, int *ier) { - *ier = FINUFFT3D2(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); +void finufftf3d2_(BIGINT *nj, float *xj, float *yj, float *zj, std::complex *cj, + int *iflag, float *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu, + std::complex *fk, finufft_opts *o, int *ier) { + *ier = finufftf3d2(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void FINUFFT3D2MANY_(int *ntransf, BIGINT *nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, - int *iflag, FLT *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu, CPX *fk, - finufft_opts *o, int *ier) { +void finufftf3d2many_(int *ntransf, BIGINT *nj, float *xj, float *yj, float *zj, + std::complex *cj, int *iflag, float *eps, BIGINT *ms, + BIGINT *mt, BIGINT *mu, std::complex *fk, finufft_opts *o, + int *ier) { *ier = - FINUFFT3D2MANY(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); + finufftf3d2many(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void FINUFFT3D3_(BIGINT *nj, FLT *x, FLT *y, FLT *z, CPX *c, int *iflag, FLT *eps, - BIGINT *nk, FLT *s, FLT *t, FLT *u, CPX *f, finufft_opts *o, int *ier) { - *ier = FINUFFT3D3(*nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o); +void finufftf3d3_(BIGINT *nj, float *x, float *y, float *z, std::complex *c, + int *iflag, float *eps, BIGINT *nk, float *s, float *t, float *u, + std::complex *f, finufft_opts *o, int *ier) { + *ier = finufftf3d3(*nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o); } -void FINUFFT3D3MANY_(int *ntransf, BIGINT *nj, FLT *x, FLT *y, FLT *z, CPX *c, int *iflag, - FLT *eps, BIGINT *nk, FLT *s, FLT *t, FLT *u, CPX *f, - finufft_opts *o, int *ier) { - *ier = FINUFFT3D3MANY(*ntransf, *nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o); +void finufftf3d3many_(int *ntransf, BIGINT *nj, float *x, float *y, float *z, + std::complex *c, int *iflag, float *eps, BIGINT *nk, + float *s, float *t, float *u, std::complex *f, + finufft_opts *o, int *ier) { + *ier = finufftf3d3many(*ntransf, *nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o); } #ifdef __cplusplus diff --git a/include/finufft/defs.h b/include/finufft/defs.h index 42e5e7ff8..084ffa41c 100644 --- a/include/finufft/defs.h +++ b/include/finufft/defs.h @@ -18,6 +18,7 @@ // public header gives access to f_opts, f_spread_opts, f_plan... // (and clobbers FINUFFT* macros; watch out!) #include +#include #include // --------------- Private data types for compilation in either prec --------- @@ -25,8 +26,8 @@ // All indexing in library that potentially can exceed 2^31 uses 64-bit signed. // This includes all calling arguments (eg M,N) that could be huge someday. 
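The duplicated double/float wrapper blocks above are exactly the duplication this refactor removes from the core: where the old build compiled each precision-dependent source twice, with -DSINGLE toggling FLT between float and double, the new core defines each routine once as a template and instantiates both precisions explicitly in the same translation unit. A minimal sketch of the mechanism; twonorm_sketch is illustrative, not library code:

    #include <cmath>
    #include <complex>
    #include <cstdint>
    using BIGINT = int64_t;

    // one definition serves both precisions
    template<typename TF> TF twonorm_sketch(BIGINT n, std::complex<TF> *a) {
      TF nrm = 0; // accumulate |a[m]|^2, as the utils.h helpers in this diff do
      for (BIGINT m = 0; m < n; ++m) nrm += std::norm(a[m]);
      return std::sqrt(nrm);
    }
    // explicit instantiation: both precisions land in one object file, so the
    // second -DSINGLE compile pass (and the *_f32 CMake targets) can go away
    template float  twonorm_sketch<float>(BIGINT, std::complex<float> *);
    template double twonorm_sketch<double>(BIGINT, std::complex<double> *);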
-using BIGINT = int64_t; -using UBIGINT = uint64_t; +// using BIGINT = int64_t; +// using UBIGINT = uint64_t; // Precision-independent real and complex types, for private lib/test compile #ifdef SINGLE using FLT = float; @@ -36,59 +37,6 @@ using FLT = double; #include // we define C++ complex type only using CPX = std::complex; -// inline macro, to force inlining of small functions -// this avoids the use of macros to implement functions -#if defined(_MSC_VER) -#define FINUFFT_ALWAYS_INLINE __forceinline inline -#define FINUFFT_NEVER_INLINE __declspec(noinline) -#define FINUFFT_RESTRICT __restrict -#define FINUFFT_UNREACHABLE __assume(0) -#define FINUFFT_UNLIKELY(x) (x) -#define FINUFFT_LIKELY(x) (x) -#elif defined(__GNUC__) || defined(__clang__) -#define FINUFFT_ALWAYS_INLINE __attribute__((always_inline)) inline -#define FINUFFT_NEVER_INLINE __attribute__((noinline)) -#define FINUFFT_RESTRICT __restrict__ -#define FINUFFT_UNREACHABLE __builtin_unreachable() -#define FINUFFT_UNLIKELY(x) __builtin_expect(!!(x), 0) -#define FINUFFT_LIKELY(x) __builtin_expect(!!(x), 1) -#else -#define FINUFFT_ALWAYS_INLINE inline -#define FINUFFT_NEVER_INLINE -#define FINUFFT_RESTRICT -#define FINUFFT_UNREACHABLE -#define FINUFFT_UNLIKELY(x) (x) -#define FINUFFT_LIKELY(x) (x) -#endif - -// ------------- Library-wide algorithm parameter settings ---------------- - -// Library version (is a string) -#define FINUFFT_VER "2.3.0" - -// Smallest possible kernel spread width per dimension, in fine grid points -// (used only in spreadinterp.cpp) -inline constexpr int MIN_NSPREAD = 2; - -// Largest possible kernel spread width per dimension, in fine grid points -// (used only in spreadinterp.cpp) -inline constexpr int MAX_NSPREAD = 16; - -// Fraction growth cut-off in utils:arraywidcen, sets when translate in type-3 -inline constexpr double ARRAYWIDCEN_GROWFRAC = 0.1; - -// Max number of positive quadr nodes for kernel FT (used only in common.cpp) -inline constexpr int MAX_NQUAD = 100; - -// Internal (nf1 etc) array allocation size that immediately raises error. -// (Note: next235 takes 1s for 1e11, so it is also to prevent hang here.) -// Increase this if you need >10TB (!) RAM... -inline constexpr BIGINT MAX_NF = BIGINT(1e12); - -// Maximum allowed number M of NU points; useful to catch incorrectly cast int32 -// values for M = nj (also nk in type 3)... -inline constexpr BIGINT MAX_NU_PTS = BIGINT(1e14); - // -------------- Math consts (not in math.h) and useful math macros ---------- #include @@ -108,13 +56,6 @@ inline constexpr BIGINT MAX_NU_PTS = BIGINT(1e14); // to avoid mixed precision operators in eg i*pi, an either-prec PI... #define PI FLT(M_PI) -// machine epsilon for decisions of achievable tolerance... -#ifdef SINGLE -#define EPSILON (float)6e-08 -#else -#define EPSILON (double)1.1e-16 -#endif - // Random numbers: crappy unif random number generator in [0,1). // These macros should probably be replaced by modern C++ std lib or random123. 
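The hardcoded EPSILON constants removed here (6e-08 single, 1.1e-16 double) sit at roughly half of machine epsilon for each type, so in a templated core the same threshold can come from the standard library instead of a prec-switched macro. One plausible replacement, not necessarily what the library adopts:

    #include <limits>
    template<typename T>
    constexpr T eps_threshold = T(0.5) * std::numeric_limits<T>::epsilon();
    // eps_threshold<float> ~ 6.0e-08, eps_threshold<double> ~ 1.1e-16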
// (RAND_MAX is in stdlib.h) @@ -148,32 +89,6 @@ static inline CPX crandm11r [[maybe_unused]] (unsigned int *x) { } #endif -// ----- OpenMP macros which also work when omp not present ----- -// Allows compile-time switch off of openmp, so compilation without any openmp -// is done (Note: _OPENMP is automatically set by -fopenmp compile flag) -#ifdef _OPENMP -#include -// point to actual omp utils -static inline int MY_OMP_GET_NUM_THREADS [[maybe_unused]] () { - return omp_get_num_threads(); -} -static inline int MY_OMP_GET_MAX_THREADS [[maybe_unused]] () { - return omp_get_max_threads(); -} -static inline int MY_OMP_GET_THREAD_NUM [[maybe_unused]] () { - return omp_get_thread_num(); -} -static inline void MY_OMP_SET_NUM_THREADS [[maybe_unused]] (int x) { - omp_set_num_threads(x); -} -#else -// non-omp safe dummy versions of omp utils... -static inline int MY_OMP_GET_NUM_THREADS [[maybe_unused]] () { return 1; } -static inline int MY_OMP_GET_MAX_THREADS [[maybe_unused]] () { return 1; } -static inline int MY_OMP_GET_THREAD_NUM [[maybe_unused]] () { return 0; } -static inline void MY_OMP_SET_NUM_THREADS [[maybe_unused]] (int) {} -#endif - // Prec-switching name macros (respond to SINGLE), used in lib & test sources // and the plan object below. // Note: crucially, these are now indep of macros used to gen public finufft.h! @@ -219,70 +134,6 @@ static inline void MY_OMP_SET_NUM_THREADS [[maybe_unused]] (int) {} // NB: now private (the public C++ or C etc user sees an opaque pointer to it) #include // (must come after complex.h) - -// group together a bunch of type 3 rescaling/centering/phasing parameters: -template struct type3params { - T X1, C1, D1, h1, gam1; // x dim: X=halfwid C=center D=freqcen h,gam=rescale - T X2, C2, D2, h2, gam2; // y - T X3, C3, D3, h3, gam3; // z -}; - -struct FINUFFT_PLAN_S { // the main plan object, fully C++ - // These default and delete specifications just state the obvious, - // but are here to silence compiler warnings. - FINUFFT_PLAN_S() = default; - // Copy construction and assignent are already deleted implicitly - // because of the unique_ptr member. - FINUFFT_PLAN_S(const FINUFFT_PLAN_S &) = delete; - FINUFFT_PLAN_S &operator=(const FINUFFT_PLAN_S &) = delete; - - int type; // transform type (Rokhlin naming): 1,2 or 3 - int dim; // overall dimension: 1,2 or 3 - int ntrans; // how many transforms to do at once (vector or "many" mode) - BIGINT nj; // num of NU pts in type 1,2 (for type 3, num input x pts) - BIGINT nk; // number of NU freq pts (type 3 only) - FLT tol; // relative user tolerance - int batchSize; // # strength vectors to group together for FFTW, etc - int nbatch; // how many batches done to cover all ntrans vectors - - BIGINT ms; // number of modes in x (1) dir (historical CMCL name) = N1 - BIGINT mt; // number of modes in y (2) direction = N2 - BIGINT mu; // number of modes in z (3) direction = N3 - BIGINT N; // total # modes (prod of above three) - - BIGINT nf1; // size of internal fine grid in x (1) direction - BIGINT nf2; // " y (2) - BIGINT nf3; // " z (3) - BIGINT nf; // total # fine grid points (product of the above three) - - int fftSign; // sign in exponential for NUFFT defn, guaranteed to be +-1 - - FLT *phiHat1; // FT of kernel in t1,2, on x-axis mode grid - FLT *phiHat2; // " y-axis. - FLT *phiHat3; // " z-axis. - - CPX *fwBatch; // (batches of) fine grid(s) for FFTW to plan - // & act on. 
Usually the largest working array
-
-  BIGINT *sortIndices; // precomputed NU pt permutation, speeds spread/interp
-  bool didSort;        // whether binsorting used (false: identity perm used)
-
-  FLT *X, *Y, *Z; // for t1,2: ptr to user-supplied NU pts (no new allocs).
-                  // for t3: allocated as "primed" (scaled) src pts x'_j, etc
-
-  // type 3 specific
-  FLT *S, *T, *U;    // pointers to user's target NU pts arrays (no new allocs)
-  CPX *prephase;     // pre-phase, for all input NU pts
-  CPX *deconv;       // reciprocal of kernel FT, phase, all output NU pts
-  CPX *CpBatch;      // working array of prephased strengths
-  FLT *Sp, *Tp, *Up; // internal primed targs (s'_k, etc), allocated
-  type3params<FLT> t3P; // groups together type 3 shift, scale, phase, parameters
-  FINUFFT_PLAN innerT2plan; // ptr used for type 2 in step 2 of type 3
-
-  // other internal structs; each is C-compatible of course
-  std::unique_ptr<Finufft_FFT_plan<FLT>> fftPlan;
-  finufft_opts opts; // this and spopts could be made ptrs
-  finufft_spread_opts spopts;
-};
+struct FINUFFT_PLAN_S : public FINUFFT_PLAN_T<FLT> {};

 #endif // DEFS_H
diff --git a/include/finufft/fft.h b/include/finufft/fft.h
index bab43966c..c6d5de7a5 100644
--- a/include/finufft/fft.h
+++ b/include/finufft/fft.h
@@ -171,19 +171,22 @@ template<> struct Finufft_FFT_plan<double> {
 #endif

-#include
+#include

 static inline void finufft_fft_forget_wisdom [[maybe_unused]] () {
-  Finufft_FFT_plan<FLT>::forget_wisdom();
+  Finufft_FFT_plan<float>::forget_wisdom();
+  Finufft_FFT_plan<double>::forget_wisdom();
 }
 static inline void finufft_fft_cleanup [[maybe_unused]] () {
-  Finufft_FFT_plan<FLT>::cleanup();
+  Finufft_FFT_plan<float>::cleanup();
+  Finufft_FFT_plan<double>::cleanup();
 }
 static inline void finufft_fft_cleanup_threads [[maybe_unused]] () {
-  Finufft_FFT_plan<FLT>::cleanup_threads();
+  Finufft_FFT_plan<float>::cleanup_threads();
+  Finufft_FFT_plan<double>::cleanup_threads();
 }
-
-std::vector<int> gridsize_for_fft(FINUFFT_PLAN p);
-void do_fft(FINUFFT_PLAN p);
+template<typename TF> struct FINUFFT_PLAN_T;
+template<typename TF> std::vector<int> gridsize_for_fft(FINUFFT_PLAN_T<TF> *p);
+template<typename TF> void do_fft(FINUFFT_PLAN_T<TF> *p);

 #endif // FINUFFT_INCLUDE_FINUFFT_FFT_H
diff --git a/include/finufft/finufft_core.h b/include/finufft/finufft_core.h
new file mode 100644
index 000000000..de2f2dab9
--- /dev/null
+++ b/include/finufft/finufft_core.h
@@ -0,0 +1,213 @@
+#ifndef FINUFFT_CORE_H
+#define FINUFFT_CORE_H
+
+/* IMPORTANT: for Windows compilers, you should add a line
+     #define FINUFFT_DLL
+   here if you are compiling/using FINUFFT as a DLL,
+   in order to do the proper importing/exporting, or
+   alternatively compile with -DFINUFFT_DLL or the equivalent
+   command-line flag. This is not necessary under MinGW/Cygwin, where
+   libtool does the imports/exports automatically.
+   Alternatively use include(GenerateExportHeader) and
+   generate_export_header(finufft) to auto generate a header containing
+   these defines. The main reason is that if msvc changes the way it deals
+   with it in the future we just need to update cmake for it to work
+   instead of having a check on the msvc version. */
+#if defined(FINUFFT_DLL) && (defined(_WIN32) || defined(__WIN32__))
+#if defined(dll_EXPORTS)
+#define FINUFFT_EXPORT __declspec(dllexport)
+#else
+#define FINUFFT_EXPORT __declspec(dllimport)
+#endif
+#else
+#define FINUFFT_EXPORT
+#endif
+
+/* specify calling convention (Windows only)
+   The cdecl calling convention is actually not the default in all but a very
+   few C/C++ compilers.
+   If the user code changes the default compiler calling convention, may need
+   this when generating DLL.
*/ +#if defined(_WIN32) || defined(__WIN32__) +#define FINUFFT_CDECL __cdecl +#else +#define FINUFFT_CDECL +#endif + +// inline macro, to force inlining of small functions +// this avoids the use of macros to implement functions +#if defined(_MSC_VER) +#define FINUFFT_ALWAYS_INLINE __forceinline inline +#define FINUFFT_NEVER_INLINE __declspec(noinline) +#define FINUFFT_RESTRICT __restrict +#define FINUFFT_UNREACHABLE __assume(0) +#define FINUFFT_UNLIKELY(x) (x) +#define FINUFFT_LIKELY(x) (x) +#elif defined(__GNUC__) || defined(__clang__) +#define FINUFFT_ALWAYS_INLINE __attribute__((always_inline)) inline +#define FINUFFT_NEVER_INLINE __attribute__((noinline)) +#define FINUFFT_RESTRICT __restrict__ +#define FINUFFT_UNREACHABLE __builtin_unreachable() +#define FINUFFT_UNLIKELY(x) __builtin_expect(!!(x), 0) +#define FINUFFT_LIKELY(x) __builtin_expect(!!(x), 1) +#else +#define FINUFFT_ALWAYS_INLINE inline +#define FINUFFT_NEVER_INLINE +#define FINUFFT_RESTRICT +#define FINUFFT_UNREACHABLE +#define FINUFFT_UNLIKELY(x) (x) +#define FINUFFT_LIKELY(x) (x) +#endif + +#include +#include + +// All indexing in library that potentially can exceed 2^31 uses 64-bit signed. +// This includes all calling arguments (eg M,N) that could be huge someday. +using BIGINT = int64_t; +using UBIGINT = uint64_t; + +// ------------- Library-wide algorithm parameter settings ---------------- + +// Library version (is a string) +#define FINUFFT_VER "2.3.0" + +// Smallest possible kernel spread width per dimension, in fine grid points +// (used only in spreadinterp.cpp) +inline constexpr int MIN_NSPREAD = 2; + +// Largest possible kernel spread width per dimension, in fine grid points +// (used only in spreadinterp.cpp) +inline constexpr int MAX_NSPREAD = 16; + +// Fraction growth cut-off in utils:arraywidcen, sets when translate in type-3 +inline constexpr double ARRAYWIDCEN_GROWFRAC = 0.1; + +// Max number of positive quadr nodes for kernel FT (used only in common.cpp) +inline constexpr int MAX_NQUAD = 100; + +// Internal (nf1 etc) array allocation size that immediately raises error. +// (Note: next235 takes 1s for 1e11, so it is also to prevent hang here.) +// Increase this if you need >10TB (!) RAM... +inline constexpr BIGINT MAX_NF = BIGINT(1e12); + +// Maximum allowed number M of NU points; useful to catch incorrectly cast int32 +// values for M = nj (also nk in type 3)... +inline constexpr BIGINT MAX_NU_PTS = BIGINT(1e14); + +// ----- OpenMP macros which also work when omp not present ----- +// Allows compile-time switch off of openmp, so compilation without any openmp +// is done (Note: _OPENMP is automatically set by -fopenmp compile flag) +#ifdef _OPENMP +#include +// point to actual omp utils +static inline int MY_OMP_GET_NUM_THREADS [[maybe_unused]] () { + return omp_get_num_threads(); +} +static inline int MY_OMP_GET_MAX_THREADS [[maybe_unused]] () { + return omp_get_max_threads(); +} +static inline int MY_OMP_GET_THREAD_NUM [[maybe_unused]] () { + return omp_get_thread_num(); +} +static inline void MY_OMP_SET_NUM_THREADS [[maybe_unused]] (int x) { + omp_set_num_threads(x); +} +#else +// non-omp safe dummy versions of omp utils... 
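These MY_OMP_* shims (moved here verbatim from defs.h) let caller code read identically with and without OpenMP: under -fopenmp they forward to the real omp_* calls, and otherwise the single-threaded dummies below take over. A usage sketch, assuming this header is on the include path:

    #include <cstdio>
    // finufft/finufft_core.h assumed available
    int main() {
      // omp_get_max_threads() with OpenMP; always 1 without it
      std::printf("planning for %d threads\n", MY_OMP_GET_MAX_THREADS());
    }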
+static inline int MY_OMP_GET_NUM_THREADS [[maybe_unused]] () { return 1; }
+static inline int MY_OMP_GET_MAX_THREADS [[maybe_unused]] () { return 1; }
+static inline int MY_OMP_GET_THREAD_NUM [[maybe_unused]] () { return 0; }
+static inline void MY_OMP_SET_NUM_THREADS [[maybe_unused]] (int) {}
+#endif
+
+#include // (must come after complex.h)
+#include
+#include
+
+// group together a bunch of type 3 rescaling/centering/phasing parameters:
+template<typename T> struct type3params {
+  T X1, C1, D1, h1, gam1; // x dim: X=halfwid C=center D=freqcen h,gam=rescale
+  T X2, C2, D2, h2, gam2; // y
+  T X3, C3, D3, h3, gam3; // z
+};
+
+template<typename TF> struct FINUFFT_PLAN_T { // the main plan object, fully C++
+
+  using TC = std::complex<TF>;
+
+  // These default and delete specifications just state the obvious,
+  // but are here to silence compiler warnings.
+  FINUFFT_PLAN_T() = default;
+  // Copy construction and assignment are already deleted implicitly
+  // because of the unique_ptr member.
+  FINUFFT_PLAN_T(const FINUFFT_PLAN_T &) = delete;
+  FINUFFT_PLAN_T &operator=(const FINUFFT_PLAN_T &) = delete;
+  ~FINUFFT_PLAN_T();
+
+  int type;      // transform type (Rokhlin naming): 1,2 or 3
+  int dim;       // overall dimension: 1,2 or 3
+  int ntrans;    // how many transforms to do at once (vector or "many" mode)
+  BIGINT nj;     // num of NU pts in type 1,2 (for type 3, num input x pts)
+  BIGINT nk;     // number of NU freq pts (type 3 only)
+  TF tol;        // relative user tolerance
+  int batchSize; // # strength vectors to group together for FFTW, etc
+  int nbatch;    // how many batches done to cover all ntrans vectors
+
+  BIGINT ms; // number of modes in x (1) dir (historical CMCL name) = N1
+  BIGINT mt; // number of modes in y (2) direction = N2
+  BIGINT mu; // number of modes in z (3) direction = N3
+  BIGINT N;  // total # modes (prod of above three)
+
+  BIGINT nf1; // size of internal fine grid in x (1) direction
+  BIGINT nf2; // " y (2)
+  BIGINT nf3; // " z (3)
+  BIGINT nf;  // total # fine grid points (product of the above three)
+
+  int fftSign; // sign in exponential for NUFFT defn, guaranteed to be +-1
+
+  std::vector<TF> phiHat1; // FT of kernel in t1,2, on x-axis mode grid
+  std::vector<TF> phiHat2; // " y-axis.
+  std::vector<TF> phiHat3; // " z-axis.
+
+  TC *fwBatch = nullptr; // (batches of) fine grid(s) for FFTW to plan
+                         // & act on. Usually the largest working array
+
+  std::vector<BIGINT> sortIndices; // precomputed NU pt permutation, speeds spread/interp
+  bool didSort;                    // whether binsorting used (false: identity perm used)
+
+  TF *X = nullptr, *Y = nullptr, *Z = nullptr; // for t1,2: ptr to user-supplied NU pts
+                                               // (no new allocs). for t3: allocated as
+                                               // "primed" (scaled) src pts x'_j, etc
+
+  // type 3 specific
+  TF *S = nullptr, *T = nullptr, *U = nullptr; // pointers to user's target NU pts arrays
+                                               // (no new allocs)
+  std::vector<TC> prephase;   // pre-phase, for all input NU pts
+  std::vector<TC> deconv;     // reciprocal of kernel FT, phase, all output NU pts
+  std::vector<TC> CpBatch;    // working array of prephased strengths
+  std::vector<TF> Sp, Tp, Up; // internal primed targs (s'_k, etc),
+                              // allocated
+  type3params<TF> t3P; // groups together type 3 shift, scale, phase, parameters
+  FINUFFT_PLAN_T<TF> *innerT2plan = nullptr; // ptr used for type 2 in step 2 of type 3
+
+  // other internal structs
+  std::unique_ptr<Finufft_FFT_plan<TF>> fftPlan;
+  finufft_opts opts; // this and spopts could be made ptrs
+  finufft_spread_opts spopts;
+
+  int setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF *s, TF *t, TF *u);
+  int execute(std::complex<TF> *cj, std::complex<TF> *fk);
+};
+
+void finufft_default_opts_t(finufft_opts *o);
+template<typename TF>
+int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans,
+                       TF tol, FINUFFT_PLAN_T<TF> **pp, finufft_opts *opts);
+template<typename TF>
+int finufft_setpts_t(FINUFFT_PLAN_T<TF> *p, BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk,
+                     TF *s, TF *t, TF *u);
+template<typename TF>
+int finufft_execute_t(FINUFFT_PLAN_T<TF> *p, std::complex<TF> *cj, std::complex<TF> *fk);
+
+#endif // FINUFFT_CORE_H
diff --git a/include/finufft/spreadinterp.h b/include/finufft/spreadinterp.h
index 78ecf9f22..8a83af3ce 100644
--- a/include/finufft/spreadinterp.h
+++ b/include/finufft/spreadinterp.h
@@ -7,7 +7,6 @@
 #ifndef SPREADINTERP_H
 #define SPREADINTERP_H
-#include
 #include
 /* Bitwise debugging timing flag (TF) defs; see finufft_spread_opts.flags.
@@ -31,31 +30,28 @@ namespace finufft {
 namespace spreadinterp {
 // things external (spreadinterp) interface needs...
+template<typename T>
 FINUFFT_EXPORT int FINUFFT_CDECL spreadinterp(
-    UBIGINT N1, UBIGINT N2, UBIGINT N3, FLT *data_uniform, UBIGINT N, FLT *kx, FLT *ky,
-    FLT *kz, FLT *data_nonuniform, const finufft_spread_opts &opts);
+    UBIGINT N1, UBIGINT N2, UBIGINT N3, T *data_uniform, UBIGINT M, T *kx, T *ky, T *kz,
+    T *data_nonuniform, const finufft_spread_opts &opts);
+template<typename T>
 FINUFFT_EXPORT int FINUFFT_CDECL spreadcheck(UBIGINT N1, UBIGINT N2, UBIGINT N3,
-                                             UBIGINT N, FLT *kx, FLT *ky, FLT *kz,
+                                             UBIGINT N, T *kx, T *ky, T *kz,
                                              const finufft_spread_opts &opts);
-FINUFFT_EXPORT int FINUFFT_CDECL indexSort(BIGINT *sort_indices, UBIGINT N1, UBIGINT N2,
-                                           UBIGINT N3, UBIGINT N, FLT *kx, FLT *ky,
-                                           FLT *kz, const finufft_spread_opts &opts);
-FINUFFT_EXPORT int FINUFFT_CDECL interpSorted(
-    const BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3,
-    FLT *FINUFFT_RESTRICT data_uniform, UBIGINT N, FLT *FINUFFT_RESTRICT kx,
-    FLT *FINUFFT_RESTRICT ky, FLT *FINUFFT_RESTRICT kz,
-    FLT *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts);
-FINUFFT_EXPORT int FINUFFT_CDECL spreadSorted(
-    const BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, FLT *data_uniform,
-    UBIGINT N, FLT *kx, FLT *ky, FLT *kz, const FLT *data_nonuniform,
-    const finufft_spread_opts &opts, int did_sort);
+template<typename T>
+FINUFFT_EXPORT int FINUFFT_CDECL indexSort(std::vector<BIGINT> &sort_indices, UBIGINT N1,
+                                           UBIGINT N2, UBIGINT N3, UBIGINT N, T *kx,
+                                           T *ky, T *kz, const finufft_spread_opts &opts);
+template<typename T>
 FINUFFT_EXPORT int FINUFFT_CDECL spreadinterpSorted(
-    const BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3,
-    FLT *FINUFFT_RESTRICT data_uniform, UBIGINT N, FLT *FINUFFT_RESTRICT kx,
-    FLT *FINUFFT_RESTRICT ky, FLT *FINUFFT_RESTRICT kz,
-    FLT *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts, int did_sort);
-FINUFFT_EXPORT FLT FINUFFT_CDECL evaluate_kernel(FLT x, const finufft_spread_opts &opts);
-FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts, FLT eps,
+    const std::vector<BIGINT> &sort_indices, const UBIGINT N1, const UBIGINT N2,
+    const UBIGINT N3, T *data_uniform, const UBIGINT M, T *FINUFFT_RESTRICT kx,
+    T *FINUFFT_RESTRICT ky, T *FINUFFT_RESTRICT kz, T *FINUFFT_RESTRICT data_nonuniform,
+    const finufft_spread_opts &opts, int did_sort);
+template<typename T>
+FINUFFT_EXPORT T FINUFFT_CDECL evaluate_kernel(T x, const finufft_spread_opts &opts);
+template<typename T>
+FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts, T eps,
                                                 double upsampfac, int kerevalmeth, int debug, int showwarn, int dim);
diff --git a/include/finufft/test_defs.h b/include/finufft/test_defs.h
index 387bef20d..bdd4cf147 100644
--- a/include/finufft/test_defs.h
+++ b/include/finufft/test_defs.h
@@ -17,7 +17,6 @@
 // convenient private finufft internals (must come after finufft.h)
 #include
-#include
 // prec-switching (via SINGLE) to set up FLT, CPX, BIGINT, FINUFFT1D1, etc...
 #include
diff --git a/include/finufft/utils.h b/include/finufft/utils.h
index 9039fee96..040f60543 100644
--- a/include/finufft/utils.h
+++ b/include/finufft/utils.h
@@ -4,22 +4,117 @@
 #ifndef UTILS_H
 #define UTILS_H
-#include "finufft/defs.h"
+#include "finufft/finufft_core.h"
+// for CNTime...
+// using chrono since the interface is portable between linux and windows
+#include <chrono>
 namespace finufft {
 namespace utils {
 // ahb's low-level array helpers
-FINUFFT_EXPORT FLT FINUFFT_CDECL relerrtwonorm(BIGINT n, CPX *a, CPX *b);
-FINUFFT_EXPORT FLT FINUFFT_CDECL errtwonorm(BIGINT n, CPX *a, CPX *b);
-FINUFFT_EXPORT FLT FINUFFT_CDECL twonorm(BIGINT n, CPX *a);
-FINUFFT_EXPORT FLT FINUFFT_CDECL infnorm(BIGINT n, CPX *a);
-FINUFFT_EXPORT void FINUFFT_CDECL arrayrange(BIGINT n, FLT *a, FLT *lo, FLT *hi);
-FINUFFT_EXPORT void FINUFFT_CDECL indexedarrayrange(BIGINT n, BIGINT *i, FLT *a, FLT *lo,
-                                                    FLT *hi);
-FINUFFT_EXPORT void FINUFFT_CDECL arraywidcen(BIGINT n, FLT *a, FLT *w, FLT *c);
+template<typename T>
+FINUFFT_EXPORT T FINUFFT_CDECL relerrtwonorm(BIGINT n, std::complex<T> *a,
+                                             std::complex<T> *b)
+// ||a-b||_2 / ||a||_2
+{
+  T err = 0.0, nrm = 0.0;
+  for (BIGINT m = 0; m < n; ++m) {
+    nrm += real(conj(a[m]) * a[m]);
+    std::complex<T> diff = a[m] - b[m];
+    err += real(conj(diff) * diff);
+  }
+  return sqrt(err / nrm);
+}
+template<typename T>
+FINUFFT_EXPORT T FINUFFT_CDECL errtwonorm(BIGINT n, std::complex<T> *a,
+                                          std::complex<T> *b)
+// ||a-b||_2
+{
+  T err = 0.0; // compute error 2-norm
+  for (BIGINT m = 0; m < n; ++m) {
+    std::complex<T> diff = a[m] - b[m];
+    err += real(conj(diff) * diff);
+  }
+  return sqrt(err);
+}
+template<typename T>
+FINUFFT_EXPORT T FINUFFT_CDECL twonorm(BIGINT n, std::complex<T> *a)
+// ||a||_2
+{
+  T nrm = 0.0;
+  for (BIGINT m = 0; m < n; ++m) nrm += real(conj(a[m]) * a[m]);
+  return sqrt(nrm);
+}
+template<typename T>
+FINUFFT_EXPORT T FINUFFT_CDECL infnorm(BIGINT n, std::complex<T> *a)
+// ||a||_infty
+{
+  T nrm = 0.0;
+  for (BIGINT m = 0; m < n; ++m) {
+    T aa = real(conj(a[m]) * a[m]);
+    if (aa > nrm) nrm = aa;
+  }
+  return sqrt(nrm);
+}
+template<typename T>
+FINUFFT_EXPORT void FINUFFT_CDECL arrayrange(BIGINT n, T *a, T *lo, T *hi)
+// With a a length-n array, writes out min(a) to lo and max(a) to hi,
+// so that all a values lie in [lo,hi].
+// If n==0, lo and hi are not finite.
+{
+  *lo = INFINITY;
+  *hi = -INFINITY;
+  for (BIGINT m = 0; m < n; ++m) {
+    if (a[m] < *lo) *lo = a[m];
+    if (a[m] > *hi) *hi = a[m];
+  }
+}
+template<typename T>
+FINUFFT_EXPORT void FINUFFT_CDECL arraywidcen(BIGINT n, T *a, T *w, T *c)
+// Writes out w = half-width and c = center of an interval enclosing all a[n]'s
+// Only chooses a nonzero center if this increases w by less than fraction
+// ARRAYWIDCEN_GROWFRAC defined in finufft_core.h.
+// This prevents rephasings which don't grow nf by much. 6/8/17
+// If n==0, w and c are not finite.
+{
+  T lo, hi;
+  arrayrange(n, a, &lo, &hi);
+  *w = (hi - lo) / 2;
+  *c = (hi + lo) / 2;
+  if (std::abs(*c) < ARRAYWIDCEN_GROWFRAC * (*w)) {
+    *w += std::abs(*c);
+    *c = 0.0;
+  }
+}
+FINUFFT_EXPORT BIGINT FINUFFT_CDECL next235even(BIGINT n);
+
+// jfm's timer class
+class FINUFFT_EXPORT CNTime {
+public:
+  void start();
+  double restart();
+  double elapsedsec();
+
+private:
+  double initial;
+};
+
+// openmp helpers
+int get_num_threads_parallel_block();
+
+} // namespace utils
+} // namespace finufft
+
+// thread-safe rand number generator for Windows platform
+#ifdef _WIN32
+#include
+namespace finufft {
+namespace utils {
+FINUFFT_EXPORT int FINUFFT_CDECL rand_r(unsigned int *seedp);
 } // namespace utils
 } // namespace finufft
+#endif

 #endif // UTILS_H
diff --git a/include/finufft/utils_precindep.h b/include/finufft/utils_precindep.h
deleted file mode 100644
index 0504bb8df..000000000
--- a/include/finufft/utils_precindep.h
+++ /dev/null
@@ -1,44 +0,0 @@
-// Header for utils_precindep.cpp, a little library of array and timer stuff.
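Since the array helpers above are now header-only templates, both precisions share one definition and the caller picks the type at the call site. A usage sketch, assuming the FINUFFT include directory is on the compiler's path:

    #include <complex>
    #include <vector>
    #include <finufft/utils.h> // assumed include path

    int main() {
      std::vector<std::complex<double>> a(100, {1.0, 0.0}), b(100, {1.0, 1e-9});
      // relative 2-norm error between a and its tiny perturbation b
      double rel = finufft::utils::relerrtwonorm<double>(100, a.data(), b.data());
      return rel < 1e-6 ? 0 : 1;
    }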
-// Only the precision-independent routines here (get compiled once) - -#ifndef UTILS_PRECINDEP_H -#define UTILS_PRECINDEP_H - -#include "defs.h" -// for CNTime... -// using chrono since the interface is portable between linux and windows -#include - -namespace finufft { -namespace utils { - -FINUFFT_EXPORT BIGINT FINUFFT_CDECL next235even(BIGINT n); - -// jfm's timer class -class FINUFFT_EXPORT CNTime { -public: - void start(); - double restart(); - double elapsedsec(); - -private: - double initial; -}; - -// openmp helpers -int get_num_threads_parallel_block(); - -} // namespace utils -} // namespace finufft - -// thread-safe rand number generator for Windows platform -#ifdef _WIN32 -#include -namespace finufft { -namespace utils { -FINUFFT_EXPORT int FINUFFT_CDECL rand_r(unsigned int *seedp); -} // namespace utils -} // namespace finufft -#endif - -#endif // UTILS_PRECINDEP_H diff --git a/include/finufft_eitherprec.h b/include/finufft_eitherprec.h index 47f7860e1..3f0a7d95c 100644 --- a/include/finufft_eitherprec.h +++ b/include/finufft_eitherprec.h @@ -86,8 +86,8 @@ typedef struct FINUFFT_PLAN_S *FINUFFT_PLAN; FINUFFT_EXPORT void FINUFFT_CDECL FINUFFTIFY(_default_opts)(finufft_opts *o); FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_makeplan)( - int type, int dim, FINUFFT_BIGINT *n_modes, int iflag, int n_transf, FINUFFT_FLT tol, - FINUFFT_PLAN *plan, finufft_opts *o); + int type, int dim, const FINUFFT_BIGINT *n_modes, int iflag, int n_transf, + FINUFFT_FLT tol, FINUFFT_PLAN *plan, finufft_opts *o); FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_setpts)( FINUFFT_PLAN plan, FINUFFT_BIGINT M, FINUFFT_FLT *xj, FINUFFT_FLT *yj, FINUFFT_FLT *zj, FINUFFT_BIGINT N, FINUFFT_FLT *s, FINUFFT_FLT *t, FINUFFT_FLT *u); diff --git a/makefile b/makefile index 4a91506db..7ad454198 100644 --- a/makefile +++ b/makefile @@ -31,7 +31,7 @@ PYTHON = python3 # they allow gcc to vectorize the code more effectively CFLAGS := -O3 -funroll-loops -march=native -fcx-limited-range -ffp-contract=fast\ -fno-math-errno -fno-signed-zeros -fno-trapping-math -fassociative-math\ - -freciprocal-math -fmerge-all-constants -ftree-vectorize $(CFLAGS) + -freciprocal-math -fmerge-all-constants -ftree-vectorize $(CFLAGS) -Wfatal-errors FFLAGS := $(CFLAGS) $(FFLAGS) CXXFLAGS := $(CFLAGS) $(CXXFLAGS) # FFTW base name, and math linking... @@ -133,24 +133,13 @@ STATICLIB = lib-static/$(LIBNAME).a # absolute path to the .so, useful for linking so executables portable... ABSDYNLIB = $(FINUFFT)$(DYNLIB) -# spreader is subset of the library with self-contained testing, hence own objs: -# double-prec spreader object files that also need single precision... -SOBJS = src/spreadinterp.o src/utils.o -# their single-prec versions -SOBJSF = $(SOBJS:%.o=%_32.o) -# precision-dependent spreader object files (compiled & linked only once)... -SOBJS_PI = src/utils_precindep.o # spreader dual-precision objs -SOBJSD = $(SOBJS) $(SOBJSF) $(SOBJS_PI) - -# double-prec library object files that also need single precision... -OBJS = $(SOBJS) src/finufft.o src/simpleinterfaces.o fortran/finufftfort.o src/fft.o -# their single-prec versions -OBJSF = $(OBJS:%.o=%_32.o) -# precision-dependent library object files (compiled & linked only once)... -OBJS_PI = $(SOBJS_PI) contrib/legendre_rule_fast.o +SOBJSD = src/utils.o src/spreadinterp.o + +# precision-independent library object files (compiled & linked only once)... 
+OBJS_PI = $(SOBJSD) contrib/legendre_rule_fast.o src/fft.o src/finufft_core.o src/simpleinterfaces.o fortran/finufftfort.o # all lib dual-precision objs (note DUCC_OBJS empty if unused) -OBJSD = $(OBJS) $(OBJSF) $(OBJS_PI) $(DUCC_OBJS) +OBJSD = $(OBJS_PI) $(DUCC_OBJS) .PHONY: usage lib examples test perftest spreadtest spreadtestall fortran matlab octave all mex python clean objclean pyclean mexclean wheel docker-wheel gurutime docs setup setupclean @@ -190,12 +179,8 @@ HEADERS = $(wildcard include/*.h include/finufft/*.h) $(DUCC_HEADERS) # implicit rules for objects (note -o ensures writes to correct dir) %.o: %.cpp $(HEADERS) $(CXX) -c $(CXXFLAGS) $< -o $@ -%_32.o: %.cpp $(HEADERS) - $(CXX) -DSINGLE -c $(CXXFLAGS) $< -o $@ %.o: %.c $(HEADERS) $(CC) -c $(CFLAGS) $< -o $@ -%_32.o: %.c $(HEADERS) - $(CC) -DSINGLE -c $(CFLAGS) $< -o $@ %.o: %.f $(FC) -c $(FFLAGS) $< -o $@ %_32.o: %.f @@ -209,7 +194,6 @@ HEADERS = $(wildcard include/*.h include/finufft/*.h) $(DUCC_HEADERS) include/finufft/fft.h: $(DUCC_SETUP) SHEAD = $(wildcard src/*.h) $(XSIMD_DIR)/include/xsimd/xsimd.hpp src/spreadinterp.o: $(SHEAD) -src/spreadinterp_32.o: $(SHEAD) # lib ----------------------------------------------------------------------- @@ -277,10 +261,10 @@ test/%: test/%.cpp $(DYNLIB) test/%f: test/%.cpp $(DYNLIB) $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(ABSDYNLIB) $(LIBSFFT) -o $@ # low-level tests that are cleaner if depend on only specific objects... -test/testutils: test/testutils.cpp src/utils.o src/utils_precindep.o - $(CXX) $(CXXFLAGS) ${LDFLAGS} test/testutils.cpp src/utils.o src/utils_precindep.o $(LIBS) -o test/testutils -test/testutilsf: test/testutils.cpp src/utils_32.o src/utils_precindep.o - $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE test/testutils.cpp src/utils_32.o src/utils_precindep.o $(LIBS) -o test/testutilsf +test/testutils: test/testutils.cpp src/utils.o + $(CXX) $(CXXFLAGS) ${LDFLAGS} test/testutils.cpp src/utils.o $(LIBS) -o test/testutils +test/testutilsf: test/testutils.cpp src/utils.o + $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE test/testutils.cpp src/utils.o $(LIBS) -o test/testutilsf # make sure all double-prec test executables ready for testing TESTS := $(basename $(wildcard test/*.cpp)) @@ -325,14 +309,14 @@ ST=perftest/spreadtestnd STA=perftest/spreadtestndall STF=$(ST)f STAF=$(STA)f -$(ST): $(ST).cpp $(SOBJS) $(SOBJS_PI) - $(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(SOBJS) $(SOBJS_PI) $(LIBS) -o $@ -$(STF): $(ST).cpp $(SOBJSF) $(SOBJS_PI) - $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(SOBJSF) $(SOBJS_PI) $(LIBS) -o $@ -$(STA): $(STA).cpp $(SOBJS) $(SOBJS_PI) - $(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(SOBJS) $(SOBJS_PI) $(LIBS) -o $@ -$(STAF): $(STA).cpp $(SOBJSF) $(SOBJS_PI) - $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(SOBJSF) $(SOBJS_PI) $(LIBS) -o $@ +$(ST): $(ST).cpp $(SOBJSD) + $(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(SOBJSD) $(LIBS) -o $@ +$(STF): $(ST).cpp $(SOBJSD) + $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(SOBJSD) $(LIBS) -o $@ +$(STA): $(STA).cpp $(SOBJSD) + $(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(SOBJSD) $(LIBS) -o $@ +$(STAF): $(STA).cpp $(SOBJSD) + $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(SOBJSD) $(LIBS) -o $@ spreadtest: $(ST) $(STF) # run one thread per core... 
(escape the $ to get single $ in bash; one big cmd) (export OMP_NUM_THREADS=$$(perftest/mynumcores.sh) ;\ @@ -436,7 +420,7 @@ endif # python --------------------------------------------------------------------- python: $(STATICLIB) $(DYNLIB) - FINUFFT_DIR=$(FINUFFT) $(PYTHON) -m pip -v install python/finufft + FINUFFT_DIR=$(FINUFFT) $(PYTHON) -m pip -v install --break-system-packages python/finufft # note to devs: if trouble w/ NumPy, use: pip install ./python --no-deps $(PYTHON) python/finufft/test/run_accuracy_tests.py $(PYTHON) python/finufft/examples/simple1d1.py diff --git a/perftest/manysmallprobs.cpp b/perftest/manysmallprobs.cpp index 0f2c9d0bb..5e27289d8 100644 --- a/perftest/manysmallprobs.cpp +++ b/perftest/manysmallprobs.cpp @@ -1,8 +1,9 @@ // public header #include "finufft.h" +#include "finufft/defs.h" // private access to timer -#include "finufft/utils_precindep.h" +#include "finufft/utils.h" using namespace finufft::utils; #include diff --git a/perftest/spreadtestnd.cpp b/perftest/spreadtestnd.cpp index 9b560a25e..d30626007 100644 --- a/perftest/spreadtestnd.cpp +++ b/perftest/spreadtestnd.cpp @@ -1,7 +1,6 @@ #include #include #include -#include #include #include diff --git a/perftest/spreadtestndall.cpp b/perftest/spreadtestndall.cpp index 666003137..14aad3420 100644 --- a/perftest/spreadtestndall.cpp +++ b/perftest/spreadtestndall.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include diff --git a/src/fft.cpp b/src/fft.cpp index bb7e32442..68877cacd 100644 --- a/src/fft.cpp +++ b/src/fft.cpp @@ -7,7 +7,7 @@ using namespace std; #include "ducc0/fft/fftnd_impl.h" #endif -std::vector gridsize_for_fft(FINUFFT_PLAN p) { +template std::vector gridsize_for_fft(FINUFFT_PLAN_T *p) { // local helper func returns a new int array of length dim, extracted from // the finufft plan, that fftw_plan_many_dft needs as its 2nd argument. if (p->dim == 1) return {(int)p->nf1}; @@ -15,8 +15,10 @@ std::vector gridsize_for_fft(FINUFFT_PLAN p) { // if (p->dim == 3) return {(int)p->nf3, (int)p->nf2, (int)p->nf1}; } +template std::vector gridsize_for_fft(FINUFFT_PLAN_T *p); +template std::vector gridsize_for_fft(FINUFFT_PLAN_T *p); -void do_fft(FINUFFT_PLAN p) { +template void do_fft(FINUFFT_PLAN_T *p) { #ifdef FINUFFT_USE_DUCC0 size_t nthreads = min(MY_OMP_GET_MAX_THREADS(), p->opts.nthreads); const auto ns = gridsize_for_fft(p); @@ -32,9 +34,9 @@ void do_fft(FINUFFT_PLAN p) { arrdims.push_back(size_t(ns[2])); axes.push_back(3); } - ducc0::vfmav data(p->fwBatch, arrdims); + ducc0::vfmav> data(p->fwBatch, arrdims); #ifdef FINUFFT_NO_DUCC0_TWEAKS - ducc0::c2c(data, data, axes, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(data, data, axes, p->fftSign < 0, TF(1), nthreads); #else /* For type 1 NUFFTs, only the low-frequency parts of the output fine grid are going to be used, and for type 2 NUFFTs, the high frequency parts of the @@ -45,10 +47,10 @@ void do_fft(FINUFFT_PLAN p) { of all 1D FFTs, and for the last remaining axis the factor is 1/oversampling_factor^2. 
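To make the index bookkeeping in the branches below concrete: with ms wanted output modes on an nf-point fine grid, only the first (ms+1)/2 and last ms/2 rows along an axis carry wanted modes (negative frequencies wrap around to the top), which is exactly what y_lo and y_hi select. A standalone numeric sketch with illustrative sizes:

    #include <cstdio>
    int main() {
      long ms = 10, nf = 24;    // illustrative mode count and fine-grid size
      long y_lo = (ms + 1) / 2; // 5:  rows [0, y_lo) hold modes 0..4
      long y_hi = nf - ms / 2;  // 19: rows [y_hi, nf) hold modes -5..-1
      std::printf("FFT only rows [0,%ld) and [%ld,%ld); skip the middle %ld rows\n",
                  y_lo, y_hi, nf, y_hi - y_lo);
    }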
*/ if (p->dim == 1) // 1D: no chance for FFT shortcuts - ducc0::c2c(data, data, axes, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(data, data, axes, p->fftSign < 0, TF(1), nthreads); else if (p->dim == 2) { // 2D: do partial FFTs if (p->ms < 2) // something is weird, do standard FFT - ducc0::c2c(data, data, axes, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(data, data, axes, p->fftSign < 0, TF(1), nthreads); else { size_t y_lo = size_t((p->ms + 1) / 2); size_t y_hi = size_t(ns[1] - p->ms / 2); @@ -58,17 +60,17 @@ void do_fft(FINUFFT_PLAN p) { auto sub2 = ducc0::subarray(data, {{}, {}, {y_hi, ducc0::MAXIDX}}); if (p->type == 1) // spreading, not all parts of the output array are needed // do axis 2 in full - ducc0::c2c(data, data, {2}, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(data, data, {2}, p->fftSign < 0, TF(1), nthreads); // do only parts of axis 1 - ducc0::c2c(sub1, sub1, {1}, p->fftSign < 0, FLT(1), nthreads); - ducc0::c2c(sub2, sub2, {1}, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(sub1, sub1, {1}, p->fftSign < 0, TF(1), nthreads); + ducc0::c2c(sub2, sub2, {1}, p->fftSign < 0, TF(1), nthreads); if (p->type == 2) // interpolation, parts of the input array are zero // do axis 2 in full - ducc0::c2c(data, data, {2}, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(data, data, {2}, p->fftSign < 0, TF(1), nthreads); } } else { // 3D if ((p->ms < 2) || (p->mt < 2)) // something is weird, do standard FFT - ducc0::c2c(data, data, axes, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(data, data, axes, p->fftSign < 0, TF(1), nthreads); else { size_t z_lo = size_t((p->ms + 1) / 2); size_t z_hi = size_t(ns[2] - p->ms / 2); @@ -82,22 +84,22 @@ void do_fft(FINUFFT_PLAN p) { auto sub6 = ducc0::subarray(sub2, {{}, {}, {y_hi, ducc0::MAXIDX}, {}}); if (p->type == 1) { // spreading, not all parts of the output array are needed // do axis 3 in full - ducc0::c2c(data, data, {3}, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(data, data, {3}, p->fftSign < 0, TF(1), nthreads); // do only parts of axis 2 - ducc0::c2c(sub1, sub1, {2}, p->fftSign < 0, FLT(1), nthreads); - ducc0::c2c(sub2, sub2, {2}, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(sub1, sub1, {2}, p->fftSign < 0, TF(1), nthreads); + ducc0::c2c(sub2, sub2, {2}, p->fftSign < 0, TF(1), nthreads); } // do even smaller parts of axis 1 - ducc0::c2c(sub3, sub3, {1}, p->fftSign < 0, FLT(1), nthreads); - ducc0::c2c(sub4, sub4, {1}, p->fftSign < 0, FLT(1), nthreads); - ducc0::c2c(sub5, sub5, {1}, p->fftSign < 0, FLT(1), nthreads); - ducc0::c2c(sub6, sub6, {1}, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(sub3, sub3, {1}, p->fftSign < 0, TF(1), nthreads); + ducc0::c2c(sub4, sub4, {1}, p->fftSign < 0, TF(1), nthreads); + ducc0::c2c(sub5, sub5, {1}, p->fftSign < 0, TF(1), nthreads); + ducc0::c2c(sub6, sub6, {1}, p->fftSign < 0, TF(1), nthreads); if (p->type == 2) { // interpolation, parts of the input array are zero // do only parts of axis 2 - ducc0::c2c(sub1, sub1, {2}, p->fftSign < 0, FLT(1), nthreads); - ducc0::c2c(sub2, sub2, {2}, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(sub1, sub1, {2}, p->fftSign < 0, TF(1), nthreads); + ducc0::c2c(sub2, sub2, {2}, p->fftSign < 0, TF(1), nthreads); // do axis 3 in full - ducc0::c2c(data, data, {3}, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(data, data, {3}, p->fftSign < 0, TF(1), nthreads); } } } @@ -106,3 +108,5 @@ void do_fft(FINUFFT_PLAN p) { p->fftPlan->execute(); // if thisBatchSize(FINUFFT_PLAN_T *p); +template void do_fft(FINUFFT_PLAN_T *p); diff --git a/src/finufft.cpp 
b/src/finufft.cpp index 21e6db7ab..758fcb723 100644 --- a/src/finufft.cpp +++ b/src/finufft.cpp @@ -4,1199 +4,35 @@ // private headers for lib build // (must come after finufft.h which clobbers FINUFFT* macros) #include -#include -#include -#include -#include -#include "../contrib/legendre_rule_fast.h" -#include -#include -#include -#include -#include -#include -#include +void FINUFFT_DEFAULT_OPTS(finufft_opts *o) { finufft_default_opts_t(o); } -using namespace std; -using namespace finufft; -using namespace finufft::utils; -using namespace finufft::spreadinterp; -using namespace finufft::quadrature; - -/* Computational core for FINUFFT. - - Based on Barnett 2017-2018 finufft?d.cpp containing nine drivers, plus - 2d1/2d2 many-vector drivers by Melody Shih, summer 2018. - Original guru interface written by Andrea Malleo, summer 2019, mentored - by Alex Barnett. Many rewrites in early 2020 by Alex Barnett & Libin Lu. - - As of v1.2 these replace the old hand-coded separate 9 finufft?d?() functions - and the two finufft2d?many() functions. The (now 18) simple C++ interfaces - are in simpleinterfaces.cpp. - -Algorithm summaries taken from old finufft?d?() documentation, Feb-Jun 2017: - - TYPE 1: - The type 1 NUFFT proceeds in three main steps: - 1) spread data to oversampled regular mesh using kernel. - 2) compute FFT on uniform mesh - 3) deconvolve by division of each Fourier mode independently by the kernel - Fourier series coeffs (not merely FFT of kernel), shuffle to output. - The kernel coeffs are precomputed in what is called step 0 in the code. - - TYPE 2: - The type 2 algorithm proceeds in three main steps: - 1) deconvolve (amplify) each Fourier mode, dividing by kernel Fourier coeff - 2) compute inverse FFT on uniform fine grid - 3) spread (dir=2, ie interpolate) data to regular mesh - The kernel coeffs are precomputed in what is called step 0 in the code. - - TYPE 3: - The type 3 algorithm is basically a type 2 (which is implemented precisely - as call to type 2) replacing the middle FFT (Step 2) of a type 1. - Beyond this, the new twists are: - i) nf1, number of upsampled points for the type-1, depends on the product - of interval widths containing input and output points (X*S). - ii) The deconvolve (post-amplify) step is division by the Fourier transform - of the scaled kernel, evaluated on the *nonuniform* output frequency - grid; this is done by direct approximation of the Fourier integral - using quadrature of the kernel function times exponentials. - iii) Shifts in x (real) and s (Fourier) are done to minimize the interval - half-widths X and S, hence nf1. - - MULTIPLE STRENGTH VECTORS FOR THE SAME NONUNIFORM POINTS (n_transf>1): - maxBatchSize (set to max_num_omp_threads) times the RAM is needed, so - this is good only for small problems. - - -Design notes for guru interface implementation: - -* Since finufft_plan is C-compatible, we need to use malloc/free for its - allocatable arrays, keeping it quite low-level. We can't use std::vector - since that would only survive in the scope of each function. - -* Thread-safety: FINUFFT plans are passed as pointers, so it has no global - state apart from that associated with FFTW (and the did_fftw_init). 
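The wrapper pattern the slimmed-down finufft.cpp adopts, a C-compatible opaque handle cast onto a templated C++ core, can be sketched as below. This is a minimal illustration, not the library's code: plan_t, core_make, and the shim_* names are invented for the example.

#include <cstdio>

// Templated core, compiled once; explicit instantiation covers each precision.
template<typename TF> struct plan_t {
  int dim;
  TF tol;
};
template<typename TF> int core_make(TF tol, plan_t<TF> **pp) {
  *pp = new plan_t<TF>{1, tol};
  return 0;
}
template int core_make<float>(float, plan_t<float> **); // explicit instantiation

// C ABI: the public handle is opaque; wrappers only reinterpret_cast and forward.
typedef void *c_plan;
extern "C" int shim_makeplan(double tol, c_plan *pp) {
  return core_make<double>(tol, reinterpret_cast<plan_t<double> **>(pp));
}
extern "C" int shim_destroy(c_plan p) {
  if (!p) return 1;   // tolerate a never-made plan, as FINUFFT_DESTROY does
  delete reinterpret_cast<plan_t<double> *>(p);
  return 0;
}

int main() {
  c_plan p = nullptr;
  shim_makeplan(1e-9, &p);
  std::printf("destroy returned %d\n", shim_destroy(p)); // 0 on success
}

Explicit instantiation keeps the template definitions in one translation unit while both precisions remain callable through the C ABI, which is what lets the dual -DSINGLE compilation be dropped.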
-*/ - -// ---------- local math routines (were in common.cpp; no need now): -------- - -namespace finufft { -namespace common { - -static int set_nf_type12(BIGINT ms, finufft_opts opts, finufft_spread_opts spopts, - BIGINT *nf) -// Type 1 & 2 recipe for how to set 1d size of upsampled array, nf, given opts -// and requested number of Fourier modes ms. Returns 0 if success, else an -// error code if nf was unreasonably big (& tell the world). -{ - *nf = BIGINT(opts.upsampfac * double(ms)); // manner of rounding not crucial - if (*nf < 2 * spopts.nspread) *nf = 2 * spopts.nspread; // otherwise spread fails - if (*nf < MAX_NF) { - *nf = next235even(*nf); // expensive at huge nf - return 0; - } else { - fprintf(stderr, - "[%s] nf=%.3g exceeds MAX_NF of %.3g, so exit without attempting even a " - "malloc\n", - __func__, (double)*nf, (double)MAX_NF); - return FINUFFT_ERR_MAXNALLOC; - } -} - -int setup_spreader_for_nufft(finufft_spread_opts &spopts, FLT eps, finufft_opts opts, - int dim) -// Set up the spreader parameters given eps, and pass across various nufft -// options. Return status of setup_spreader. Uses pass-by-ref. Barnett 10/30/17 -{ - // this calls spreadinterp.cpp... - int ier = setup_spreader(spopts, eps, opts.upsampfac, opts.spread_kerevalmeth, - opts.spread_debug, opts.showwarn, dim); - // override various spread opts from their defaults... - spopts.debug = opts.spread_debug; - spopts.sort = opts.spread_sort; // could make dim or CPU choices here? - spopts.kerpad = opts.spread_kerpad; // (only applies to kerevalmeth=0) - spopts.chkbnds = opts.chkbnds; - spopts.nthreads = opts.nthreads; // 0 passed in becomes omp max by here - if (opts.spread_nthr_atomic >= 0) // overrides - spopts.atomic_threshold = opts.spread_nthr_atomic; - if (opts.spread_max_sp_size > 0) // overrides - spopts.max_subproblem_size = opts.spread_max_sp_size; - if (opts.chkbnds != 1) // deprecated default value hardcoded here - fprintf(stderr, - "[%s] opts.chkbnds is deprecated; ignoring change from default value.\n", - __func__); - return ier; -} - -void set_nhg_type3(FLT S, FLT X, finufft_opts opts, finufft_spread_opts spopts, - BIGINT *nf, FLT *h, FLT *gam) -/* sets nf, h (upsampled grid spacing), and gamma (x_j rescaling factor), - for type 3 only. - Inputs: - X and S are the xj and sk interval half-widths respectively. - opts and spopts are the NUFFT and spreader opts strucs, respectively. - Outputs: - nf is the size of upsampled grid for a given single dimension. - h is the grid spacing = 2pi/nf - gam is the x rescale factor, ie x'_j = x_j/gam (modulo shifts). - Barnett 2/13/17. Caught inf/nan 3/14/17. io int types changed 3/28/17 - New logic 6/12/17 -*/ -{ - int nss = spopts.nspread + 1; // since ns may be odd - FLT Xsafe = X, Ssafe = S; // may be tweaked locally - if (X == 0.0) // logic ensures XS>=1, handle X=0 a/o S=0 - if (S == 0.0) { - Xsafe = 1.0; - Ssafe = 1.0; - } else - Xsafe = max(Xsafe, 1 / S); - else - Ssafe = max(Ssafe, 1 / X); - // use the safe X and S... - auto nfd = FLT(2.0 * opts.upsampfac * Ssafe * Xsafe / PI + nss); - if (!isfinite(nfd)) nfd = 0.0; // use FLT to catch inf - *nf = (BIGINT)nfd; - // printf("initial nf=%lld, ns=%d\n",*nf,spopts.nspread); - // catch too small nf, and nan or +-inf, otherwise spread fails... 
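The size recipe in set_nf_type12 above, take max(upsampfac*ms, 2*nspread) and round up to the next even number with only 2, 3 and 5 as prime factors so the FFT stays fast, can be mimicked as below; next235even_toy is a from-scratch stand-in for illustration, not FINUFFT's next235even.

#include <cstdint>
#include <cstdio>
#include <initializer_list>

// true iff n has no prime factors other than 2, 3 and 5
static bool smooth235(std::int64_t n) {
  for (int p : {2, 3, 5})
    while (n % p == 0) n /= p;
  return n == 1;
}

// smallest even 5-smooth integer >= n (toy stand-in for next235even)
static std::int64_t next235even_toy(std::int64_t n) {
  if (n < 2) n = 2;
  if (n % 2) ++n;                  // keep it even
  while (!smooth235(n)) n += 2;
  return n;
}

int main() {
  double upsampfac = 2.0;          // example value
  std::int64_t ms = 1000, nspread = 7;
  std::int64_t nf = std::int64_t(upsampfac * double(ms));
  if (nf < 2 * nspread) nf = 2 * nspread;
  std::printf("nf = %lld\n", (long long)next235even_toy(nf)); // 2000 = 2^4*5^3
}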
- if (*nf < 2 * spopts.nspread) *nf = 2 * spopts.nspread; - if (*nf < MAX_NF) // otherwise will fail anyway - *nf = next235even(*nf); // expensive at huge nf - *h = FLT(2.0 * PI / *nf); // upsampled grid spacing - *gam = FLT(*nf / (2.0 * opts.upsampfac * Ssafe)); // x scale fac to x' -} - -void onedim_fseries_kernel(BIGINT nf, FLT *fwkerhalf, finufft_spread_opts opts) -/* - Approximates exact Fourier series coeffs of cnufftspread's real symmetric - kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting - narrowness of kernel. Uses phase winding for cheap eval on the regular freq - grid. Note that this is also the Fourier transform of the non-periodized - kernel. The FT definition is f(k) = int e^{-ikx} f(x) dx. The output has an - overall prefactor of 1/h, which is needed anyway for the correction, and - arises because the quadrature weights are scaled for grid units not x units. - The kernel is actually centered at nf/2, related to the centering of the grid; - this is now achieved by the sign flip in a[n] below. - - Inputs: - nf - size of 1d uniform spread grid, must be even. - opts - spreading opts object, needed to eval kernel (must be already set up) - - Outputs: - fwkerhalf - real Fourier series coeffs from indices 0 to nf/2 inclusive, - divided by h = 2pi/n. - (should be allocated for at least nf/2+1 FLTs) - - Compare onedim_dct_kernel which has same interface, but computes DFT of - sampled kernel, not quite the same object. - - Barnett 2/7/17. openmp (since slow vs fftw in 1D large-N case) 3/3/18. - Fixed num_threads 7/20/20. Reduced rounding error in a[n] calc 8/20/24. - */ -{ - FLT J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support - // # quadr nodes in z (from 0 to J/2; reflections will be added)... - int q = (int)(2 + 3.0 * J2); // not sure why so large? cannot exceed MAX_NQUAD - FLT f[MAX_NQUAD]; - double z[2 * MAX_NQUAD], w[2 * MAX_NQUAD]; - legendre_compute_glr(2 * q, z, w); // only half the nodes used, eg on (0,1) - CPX a[MAX_NQUAD]; - for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n - z[n] *= J2; // rescale nodes - f[n] = J2 * (FLT)w[n] * evaluate_kernel((FLT)z[n], opts); // vals & quadr wei - a[n] = -exp(2 * PI * IMA * (FLT)z[n] / (FLT)nf); // phase winding rates - } - BIGINT nout = nf / 2 + 1; // how many values we're writing to - int nt = min(nout, (BIGINT)opts.nthreads); // how many chunks - std::vector brk(nt + 1); // start indices for each thread - for (int t = 0; t <= nt; ++t) // split nout mode indices btw threads - brk[t] = (BIGINT)(0.5 + nout * t / (double)nt); -#pragma omp parallel num_threads(nt) - { // each thread gets own chunk to do - int t = MY_OMP_GET_THREAD_NUM(); - CPX aj[MAX_NQUAD]; // phase rotator for this thread - for (int n = 0; n < q; ++n) - aj[n] = pow(a[n], (FLT)brk[t]); // init phase factors for chunk - for (BIGINT j = brk[t]; j < brk[t + 1]; ++j) { // loop along output array - FLT x = 0.0; // accumulator for answer at this j - for (int n = 0; n < q; ++n) { - x += f[n] * 2 * real(aj[n]); // include the negative freq - aj[n] *= a[n]; // wind the phases - } - fwkerhalf[j] = x; - } - } -} - -void onedim_nuft_kernel(BIGINT nk, FLT *k, FLT *phihat, finufft_spread_opts opts) -/* - Approximates exact 1D Fourier transform of cnufftspread's real symmetric - kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting - narrowness of kernel. Evaluates at set of arbitrary freqs k in [-pi, pi), - for a kernel with x measured in grid-spacings. (See previous routine for - FT definition). 
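The quadrature that onedim_nuft_kernel describes can be tried end to end with stand-ins: the sketch below uses a Gaussian in place of the ES kernel and midpoint nodes in place of Gauss-Legendre, both purely illustrative, to show the phihat(k) ~ sum_n 2 f_n cos(k z_n) structure.

#include <cmath>
#include <cstdio>
#include <initializer_list>
#include <vector>

// Approximate the FT of a real even kernel supported on [-J/2, J/2] by
//   phihat(k) ~= sum_n 2 * f_n * cos(k * z_n),  f_n = w_n * kernel(z_n),
// with nodes z_n on (0, J/2] and symmetry supplying the negative half.
int main() {
  double J2 = 3.5;                        // half-width of kernel support
  int q = int(2 + 2.0 * J2);              // node-count heuristic from the code
  std::vector<double> z(q), f(q);
  for (int n = 0; n < q; ++n) {
    z[n] = J2 * (n + 0.5) / q;            // midpoint nodes on (0, J/2)
    f[n] = (J2 / q) * std::exp(-z[n] * z[n]);     // weight * kernel value
  }
  for (double k : {0.0, 1.0, 2.0}) {      // a few target freqs in [-pi, pi)
    double x = 0.0;
    for (int n = 0; n < q; ++n) x += f[n] * 2 * std::cos(k * z[n]);
    std::printf("phihat(%.0f) ~= %.4f\n", k, x);  // ~ sqrt(pi)*exp(-k*k/4)
  }
}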
- - Inputs: - nk - number of freqs - k - frequencies, dual to the kernel's natural argument, ie exp(i.k.z) - Note, z is in grid-point units, and k values must be in [-pi, pi) for - accuracy. - opts - spreading opts object, needed to eval kernel (must be already set up) - - Outputs: - phihat - real Fourier transform evaluated at freqs (alloc for nk FLTs) - - Barnett 2/8/17. openmp since cos slow 2/9/17 - */ -{ - FLT J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support - // # quadr nodes in z (from 0 to J/2; reflections will be added)... - int q = (int)(2 + 2.0 * J2); // > pi/2 ratio. cannot exceed MAX_NQUAD - if (opts.debug) printf("q (# ker FT quadr pts) = %d\n", q); - FLT f[MAX_NQUAD]; - double z[2 * MAX_NQUAD], w[2 * MAX_NQUAD]; // glr needs double - legendre_compute_glr(2 * q, z, w); // only half the nodes used, eg on (0,1) - for (int n = 0; n < q; ++n) { - z[n] *= (FLT)J2; // quadr nodes for [0,J/2] - f[n] = J2 * (FLT)w[n] * evaluate_kernel((FLT)z[n], opts); // w/ quadr weights - } -#pragma omp parallel for num_threads(opts.nthreads) - for (BIGINT j = 0; j < nk; ++j) { // loop along output array - FLT x = 0.0; // register - for (int n = 0; n < q; ++n) - x += f[n] * 2 * cos(k[j] * (FLT)z[n]); // pos & neg freq pair. use FLT cos! - phihat[j] = x; - } -} - -void deconvolveshuffle1d(int dir, FLT prefac, FLT *ker, BIGINT ms, FLT *fk, BIGINT nf1, - CPX *fw, int modeord) -/* - if dir==1: copies fw to fk with amplification by prefac/ker - if dir==2: copies fk to fw (and zero pads rest of it), same amplification. - - modeord=0: use CMCL-compatible mode ordering in fk (from -N/2 up to N/2-1) - 1: use FFT-style (from 0 to N/2-1, then -N/2 up to -1). - - fk is a size-ms FLT complex array (2*ms FLTs alternating re,im parts) - fw is a size-nf1 complex array (2*nf1 FLTs alternating re,im parts) - ker is real-valued FLT array of length nf1/2+1. - - Single thread only, but shouldn't matter since mostly data movement. - - It has been tested that the repeated floating division in this inner loop - only contributes at the <3% level in 3D relative to the FFT cost (8 threads). - This could be removed by passing in an inverse kernel and doing mults. - - todo: rewrite w/ C++-complex I/O, check complex divide not slower than - real divide, or is there a way to force a real divide? - - Barnett 1/25/17. Fixed ms=0 case 3/14/17. modeord flag & clean 10/25/17 -*/ -{ - BIGINT kmin = -ms / 2, kmax = (ms - 1) / 2; // inclusive range of k indices - if (ms == 0) kmax = -1; // fixes zero-pad for trivial no-mode case - // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array - BIGINT pp = -2 * kmin, pn = 0; // CMCL mode-ordering case (2* since cmplx) - if (modeord == 1) { - pp = 0; - pn = 2 * (kmax + 1); - } // or, instead, FFT ordering - if (dir == 1) { // read fw, write out to fk... - for (BIGINT k = 0; k <= kmax; ++k) { // non-neg freqs k - fk[pp++] = prefac * fw[k].real() / ker[k]; // re - fk[pp++] = prefac * fw[k].imag() / ker[k]; // im - } - for (BIGINT k = kmin; k < 0; ++k) { // neg freqs k - fk[pn++] = prefac * fw[nf1 + k].real() / ker[-k]; // re - fk[pn++] = prefac * fw[nf1 + k].imag() / ker[-k]; // im - } - } else { // read fk, write out to fw w/ zero padding... 
- for (BIGINT k = kmax + 1; k < nf1 + kmin; ++k) { // zero pad precisely where - // needed - fw[k] = 0.0; - } - for (BIGINT k = 0; k <= kmax; ++k) { // non-neg freqs k - fw[k].real(prefac * fk[pp++] / ker[k]); // re - fw[k].imag(prefac * fk[pp++] / ker[k]); // im - } - for (BIGINT k = kmin; k < 0; ++k) { // neg freqs k - fw[nf1 + k].real(prefac * fk[pn++] / ker[-k]); // re - fw[nf1 + k].imag(prefac * fk[pn++] / ker[-k]); // im - } - } -} - -void deconvolveshuffle2d(int dir, FLT prefac, FLT *ker1, FLT *ker2, BIGINT ms, BIGINT mt, - FLT *fk, BIGINT nf1, BIGINT nf2, CPX *fw, int modeord) -/* - 2D version of deconvolveshuffle1d, calls it on each x-line using 1/ker2 fac. - - if dir==1: copies fw to fk with amplification by prefac/(ker1(k1)*ker2(k2)). - if dir==2: copies fk to fw (and zero pads rest of it), same amplification. - - modeord=0: use CMCL-compatible mode ordering in fk (each dim increasing) - 1: use FFT-style (pos then negative, on each dim) - - fk is a complex array stored as 2*ms*mt FLTs alternating re,im parts, with - ms looped over fast and mt slow. - fw is a complex array stored as 2*nf1*nf2] FLTs alternating re,im parts, with - nf1 looped over fast and nf2 slow. - ker1, ker2 are real-valued FLT arrays of lengths nf1/2+1, nf2/2+1 - respectively. - - Barnett 2/1/17, Fixed mt=0 case 3/14/17. modeord 10/25/17 -*/ -{ - BIGINT k2min = -mt / 2, k2max = (mt - 1) / 2; // inclusive range of k2 indices - if (mt == 0) k2max = -1; // fixes zero-pad for trivial no-mode case - // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array - BIGINT pp = -2 * k2min * ms, pn = 0; // CMCL mode-ordering case (2* since cmplx) - if (modeord == 1) { - pp = 0; - pn = 2 * (k2max + 1) * ms; - } // or, instead, FFT ordering - if (dir == 2) // zero pad needed x-lines (contiguous in memory) - for (BIGINT j = nf1 * (k2max + 1); j < nf1 * (nf2 + k2min); ++j) // sweeps all - // dims - fw[j] = 0.0; - for (BIGINT k2 = 0; k2 <= k2max; ++k2, pp += 2 * ms) // non-neg y-freqs - // point fk and fw to the start of this y value's row (2* is for complex): - common::deconvolveshuffle1d(dir, prefac / ker2[k2], ker1, ms, fk + pp, nf1, - &fw[nf1 * k2], modeord); - for (BIGINT k2 = k2min; k2 < 0; ++k2, pn += 2 * ms) // neg y-freqs - common::deconvolveshuffle1d(dir, prefac / ker2[-k2], ker1, ms, fk + pn, nf1, - &fw[nf1 * (nf2 + k2)], modeord); -} - -void deconvolveshuffle3d(int dir, FLT prefac, FLT *ker1, FLT *ker2, FLT *ker3, BIGINT ms, - BIGINT mt, BIGINT mu, FLT *fk, BIGINT nf1, BIGINT nf2, - BIGINT nf3, CPX *fw, int modeord) -/* - 3D version of deconvolveshuffle2d, calls it on each xy-plane using 1/ker3 fac. - - if dir==1: copies fw to fk with ampl by prefac/(ker1(k1)*ker2(k2)*ker3(k3)). - if dir==2: copies fk to fw (and zero pads rest of it), same amplification. - - modeord=0: use CMCL-compatible mode ordering in fk (each dim increasing) - 1: use FFT-style (pos then negative, on each dim) - - fk is a complex array stored as 2*ms*mt*mu FLTs alternating re,im parts, with - ms looped over fastest and mu slowest. - fw is a complex array stored as 2*nf1*nf2*nf3 FLTs alternating re,im parts, with - nf1 looped over fastest and nf3 slowest. - ker1, ker2, ker3 are real-valued FLT arrays of lengths nf1/2+1, nf2/2+1, - and nf3/2+1 respectively. - - Barnett 2/1/17, Fixed mu=0 case 3/14/17. 
modeord 10/25/17 -*/ -{ - BIGINT k3min = -mu / 2, k3max = (mu - 1) / 2; // inclusive range of k3 indices - if (mu == 0) k3max = -1; // fixes zero-pad for trivial no-mode case - // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array - BIGINT pp = -2 * k3min * ms * mt, pn = 0; // CMCL mode-ordering (2* since cmplx) - if (modeord == 1) { - pp = 0; - pn = 2 * (k3max + 1) * ms * mt; - } // or FFT ordering - BIGINT np = nf1 * nf2; // # pts in an upsampled Fourier xy-plane - if (dir == 2) // zero pad needed xy-planes (contiguous in memory) - for (BIGINT j = np * (k3max + 1); j < np * (nf3 + k3min); ++j) // sweeps all dims - fw[j] = 0.0; - for (BIGINT k3 = 0; k3 <= k3max; ++k3, pp += 2 * ms * mt) // non-neg z-freqs - // point fk and fw to the start of this z value's plane (2* is for complex): - common::deconvolveshuffle2d(dir, prefac / ker3[k3], ker1, ker2, ms, mt, fk + pp, nf1, - nf2, &fw[np * k3], modeord); - for (BIGINT k3 = k3min; k3 < 0; ++k3, pn += 2 * ms * mt) // neg z-freqs - common::deconvolveshuffle2d(dir, prefac / ker3[-k3], ker1, ker2, ms, mt, fk + pn, nf1, - nf2, &fw[np * (nf3 + k3)], modeord); -} - -// --------- batch helper functions for t1,2 exec: --------------------------- - -int spreadinterpSortedBatch(int batchSize, FINUFFT_PLAN p, CPX *cBatch) -/* - Spreads (or interpolates) a batch of batchSize strength vectors in cBatch - to (or from) the batch of fine working grids p->fwBatch, using the same set of - (index-sorted) NU points p->X,Y,Z for each vector in the batch. - The direction (spread vs interpolate) is set by p->spopts.spread_direction. - Returns 0 (no error reporting for now). - Notes: - 1) cBatch is already assumed to have the correct offset, ie here we - read from the start of cBatch (unlike Malleo). fwBatch also has zero offset - 2) this routine is a batched version of spreadinterpSorted in spreadinterp.cpp - Barnett 5/19/20, based on Malleo 2019. -*/ -{ - // opts.spread_thread: 1 sequential multithread, 2 parallel single-thread. - // omp_sets_nested deprecated, so don't use; assume not nested for 2 to work. - // But when nthr_outer=1 here, omp par inside the loop sees all threads... -#ifdef _OPENMP - int nthr_outer = p->opts.spread_thread == 1 ? 1 : batchSize; -#endif -#pragma omp parallel for num_threads(nthr_outer) - for (int i = 0; i < batchSize; i++) { - CPX *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace - CPX *ci = cBatch + i * p->nj; // start of i'th c array in cBatch - spreadinterpSorted(p->sortIndices, p->nf1, p->nf2, p->nf3, (FLT *)fwi, p->nj, p->X, - p->Y, p->Z, (FLT *)ci, p->spopts, p->didSort); - } - return 0; -} - -int deconvolveBatch(int batchSize, FINUFFT_PLAN p, CPX *fkBatch) -/* - Type 1: deconvolves (amplifies) from each interior fw array in p->fwBatch - into each output array fk in fkBatch. - Type 2: deconvolves from user-supplied input fk to 0-padded interior fw, - again looping over fk in fkBatch and fw in p->fwBatch. - The direction (spread vs interpolate) is set by p->spopts.spread_direction. - This is mostly a loop calling deconvolveshuffle?d for the needed dim batchSize - times. - Barnett 5/21/20, simplified from Malleo 2019 (eg t3 logic won't be in here) -*/ -{ - // since deconvolveshuffle?d are single-thread, omp par seems to help here... 
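The stride arithmetic both batch helpers rely on, vector i of a batch occupying a fixed-offset slice of one contiguous buffer, looks like this in isolation (sizes and the trivial inner loop are illustrative only):

#include <complex>
#include <cstdio>
#include <vector>

// Each of the batchSize strength vectors shares the same sorted NU points but
// gets its own slice of the contiguous workspaces (stride nf for fine grids,
// nj for strengths). spread_thread==1 keeps the outer loop sequential (threads
// go inside the spreader); otherwise one thread per vector.
int main() {
  using C = std::complex<float>;
  int batchSize = 3, spread_thread = 2;
  long nf = 8, nj = 5;
  std::vector<C> fwBatch(batchSize * nf), cBatch(batchSize * nj, C(1, 0));
  int nthr_outer = (spread_thread == 1) ? 1 : batchSize;
#pragma omp parallel for num_threads(nthr_outer)
  for (int i = 0; i < batchSize; i++) {
    C *fwi = fwBatch.data() + i * nf;   // i'th fine-grid workspace
    C *ci  = cBatch.data() + i * nj;    // i'th strength vector
    for (long j = 0; j < nj; ++j) fwi[j % nf] += ci[j]; // stand-in for spread
  }
  std::printf("fw[0] of batch 0: %g\n", fwBatch[0].real());
}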
-#pragma omp parallel for num_threads(batchSize) - for (int i = 0; i < batchSize; i++) { - CPX *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace - CPX *fki = fkBatch + i * p->N; // start of i'th fk array in fkBatch - - // Call routine from common.cpp for the dim; prefactors hardcoded to 1.0... - if (p->dim == 1) - deconvolveshuffle1d(p->spopts.spread_direction, 1.0, p->phiHat1, p->ms, (FLT *)fki, - p->nf1, fwi, p->opts.modeord); - else if (p->dim == 2) - deconvolveshuffle2d(p->spopts.spread_direction, 1.0, p->phiHat1, p->phiHat2, p->ms, - p->mt, (FLT *)fki, p->nf1, p->nf2, fwi, p->opts.modeord); - else - deconvolveshuffle3d(p->spopts.spread_direction, 1.0, p->phiHat1, p->phiHat2, - p->phiHat3, p->ms, p->mt, p->mu, (FLT *)fki, p->nf1, p->nf2, - p->nf3, fwi, p->opts.modeord); - } - return 0; +int FINUFFT_MAKEPLAN(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans, + FLT tol, FINUFFT_PLAN *pp, finufft_opts *opts) { + return finufft_makeplan_t(type, dim, n_modes, iflag, ntrans, tol, + reinterpret_cast **>(pp), opts); } -} // namespace common -} // namespace finufft - -// --------------- rest is the 5 user guru (plan) interface drivers: --------- -// (not namespaced since have safe names finufft{f}_* ) -using namespace finufft::common; // accesses routines defined above - -// Marco Barbone: 5.8.2024 -// These are user-facing. -// The various options could be macros to follow c standard library conventions. -// Question: would these be enums? - -// OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO -void FINUFFT_DEFAULT_OPTS(finufft_opts *o) -// Sets default nufft opts (referenced by all language interfaces too). -// See finufft_opts.h for meanings. -// This was created to avoid uncertainty about C++11 style static initialization -// when called from MEX, but now is generally used. Barnett 10/30/17 onwards. -// Sphinx sucks the below code block into the web docs, hence keep it clean... -{ - // sphinx tag (don't remove): @defopts_start - o->modeord = 0; - o->chkbnds = 1; - - o->debug = 0; - o->spread_debug = 0; - o->showwarn = 1; - - o->nthreads = 0; -#ifdef FINUFFT_USE_DUCC0 - o->fftw = 0; -#else - o->fftw = FFTW_ESTIMATE; -#endif - o->spread_sort = 2; - o->spread_kerevalmeth = 1; - o->spread_kerpad = 1; - o->upsampfac = 0.0; - o->spread_thread = 0; - o->maxbatchsize = 0; - o->spread_nthr_atomic = -1; - o->spread_max_sp_size = 0; - o->fftw_lock_fun = nullptr; - o->fftw_unlock_fun = nullptr; - o->fftw_lock_data = nullptr; - // sphinx tag (don't remove): @defopts_end -} - -// PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP -int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans, FLT tol, - FINUFFT_PLAN *pp, finufft_opts *opts) -// Populates the fields of finufft_plan which is pointed to by "pp". -// opts is ptr to a finufft_opts to set options, or NULL to use defaults. -// For some of the fields (if "auto" selected) here choose the actual setting. 
-// For types 1,2 allocates memory for internal working arrays, -// evaluates spreading kernel coefficients, and instantiates the fftw_plan -{ - FINUFFT_PLAN p; - p = new FINUFFT_PLAN_S; // allocate fresh plan struct - *pp = p; // pass out plan as ptr to plan struct - - if (opts == NULL) // use default opts - FINUFFT_DEFAULT_OPTS(&(p->opts)); - else // or read from what's passed in - p->opts = *opts; // keep a deep copy; changing *opts now has no effect - - if (p->opts.debug) // do a hello world - printf("[%s] new plan: FINUFFT version " FINUFFT_VER " .................\n", - __func__); - - p->fftPlan = std::make_unique>( - p->opts.fftw_lock_fun, p->opts.fftw_unlock_fun, p->opts.fftw_lock_data); - - if ((type != 1) && (type != 2) && (type != 3)) { - fprintf(stderr, "[%s] Invalid type (%d), should be 1, 2 or 3.\n", __func__, type); - return FINUFFT_ERR_TYPE_NOTVALID; - } - if ((dim != 1) && (dim != 2) && (dim != 3)) { - fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n", __func__, dim); - return FINUFFT_ERR_DIM_NOTVALID; - } - if (ntrans < 1) { - fprintf(stderr, "[%s] ntrans (%d) should be at least 1.\n", __func__, ntrans); - return FINUFFT_ERR_NTRANS_NOTVALID; - } - if (!p->opts.fftw_lock_fun != !p->opts.fftw_unlock_fun) { - fprintf(stderr, "[%s] fftw_(un)lock functions should be both null or both set\n", - __func__); - return FINUFFT_ERR_LOCK_FUNS_INVALID; - ; - } - - // get stuff from args... - p->type = type; - p->dim = dim; - p->ntrans = ntrans; - p->tol = tol; - p->fftSign = (iflag >= 0) ? 1 : -1; // clean up flag input - - // choose overall # threads... -#ifdef _OPENMP - int ompmaxnthr = MY_OMP_GET_MAX_THREADS(); - int nthr = ompmaxnthr; // default: use as many as OMP gives us - // (the above could be set, or suggested set, to 1 for small enough problems...) - if (p->opts.nthreads > 0) { - nthr = p->opts.nthreads; // user override, now without limit - if (p->opts.showwarn && (nthr > ompmaxnthr)) - fprintf(stderr, - "%s warning: using opts.nthreads=%d, more than the %d OpenMP claims " - "available; note large nthreads can be slower.\n", - __func__, nthr, ompmaxnthr); - } -#else - int nthr = 1; // always 1 thread (avoid segfault) - if (p->opts.nthreads > 1) - fprintf(stderr, - "%s warning: opts.nthreads=%d but library is single-threaded; ignoring!\n", - __func__, p->opts.nthreads); -#endif - p->opts.nthreads = nthr; // store actual # thr planned for - // (this sets/limits all downstream spread/interp, 1dkernel, and FFT thread counts...) - - // choose batchSize for types 1,2 or 3... (uses int ceil(b/a)=1+(b-1)/a trick) - if (p->opts.maxbatchsize == 0) { // logic to auto-set best batchsize - p->nbatch = 1 + (ntrans - 1) / nthr; // min # batches poss - p->batchSize = 1 + (ntrans - 1) / p->nbatch; // then cut # thr in each b - } else { // batchSize override by user - p->batchSize = min(p->opts.maxbatchsize, ntrans); - p->nbatch = 1 + (ntrans - 1) / p->batchSize; // resulting # batches - } - if (p->opts.spread_thread == 0) p->opts.spread_thread = 2; // our auto choice - if (p->opts.spread_thread != 1 && p->opts.spread_thread != 2) { - fprintf(stderr, "[%s] illegal opts.spread_thread!\n", __func__); - return FINUFFT_ERR_SPREAD_THREAD_NOTVALID; - } - - if (type != 3) { // read in user Fourier mode array sizes... - p->ms = n_modes[0]; - p->mt = (dim > 1) ? n_modes[1] : 1; // leave as 1 for unused dims - p->mu = (dim > 2) ? n_modes[2] : 1; - p->N = p->ms * p->mt * p->mu; // N = total # modes - } - - // heuristic to choose default upsampfac... 
(currently two poss) - if (p->opts.upsampfac == 0.0) { // indicates auto-choose - p->opts.upsampfac = 2.0; // default, and need for tol small - if (tol >= (FLT)1E-9) { // the tol sigma=5/4 can reach - if (type == 3) // could move to setpts, more known? - p->opts.upsampfac = 1.25; // faster b/c smaller RAM & FFT - else if ((dim == 1 && p->N > 10000000) || (dim == 2 && p->N > 300000) || - (dim == 3 && p->N > 3000000)) // type 1,2 heuristic cutoffs, double, - // typ tol, 12-core xeon - p->opts.upsampfac = 1.25; - } - if (p->opts.debug > 1) - printf("[%s] set auto upsampfac=%.2f\n", __func__, p->opts.upsampfac); - } - // use opts to choose and write into plan's spread options... - int ier = setup_spreader_for_nufft(p->spopts, tol, p->opts, dim); - if (ier > 1) // proceed if success or warning - return ier; - - // set others as defaults (or unallocated for arrays)... - p->X = NULL; - p->Y = NULL; - p->Z = NULL; - p->phiHat1 = NULL; - p->phiHat2 = NULL; - p->phiHat3 = NULL; - p->nf1 = 1; - p->nf2 = 1; - p->nf3 = 1; // crucial to leave as 1 for unused dims - p->sortIndices = NULL; // used in all three types - - // ------------------------ types 1,2: planning needed --------------------- - if (type == 1 || type == 2) { - - int nthr_fft = nthr; // give FFTW all threads (or use o.spread_thread?) - // Note: batchSize not used since might be only 1. - - p->spopts.spread_direction = type; - - if (p->opts.showwarn) { // user warn round-off error... - if (EPSILON * p->ms > 1.0) - fprintf(stderr, "%s warning: rounding err predicted eps_mach*N1 = %.3g > 1 !\n", - __func__, (double)(EPSILON * p->ms)); - if (EPSILON * p->mt > 1.0) - fprintf(stderr, "%s warning: rounding err predicted eps_mach*N2 = %.3g > 1 !\n", - __func__, (double)(EPSILON * p->mt)); - if (EPSILON * p->mu > 1.0) - fprintf(stderr, "%s warning: rounding err predicted eps_mach*N3 = %.3g > 1 !\n", - __func__, (double)(EPSILON * p->mu)); - } - - // determine fine grid sizes, sanity check.. - int nfier = set_nf_type12(p->ms, p->opts, p->spopts, &(p->nf1)); - if (nfier) return nfier; // nf too big; we're done - p->phiHat1 = (FLT *)malloc(sizeof(FLT) * (p->nf1 / 2 + 1)); - if (dim > 1) { - nfier = set_nf_type12(p->mt, p->opts, p->spopts, &(p->nf2)); - if (nfier) return nfier; - p->phiHat2 = (FLT *)malloc(sizeof(FLT) * (p->nf2 / 2 + 1)); - } - if (dim > 2) { - nfier = set_nf_type12(p->mu, p->opts, p->spopts, &(p->nf3)); - if (nfier) return nfier; - p->phiHat3 = (FLT *)malloc(sizeof(FLT) * (p->nf3 / 2 + 1)); - } - - if (p->opts.debug) { // "long long" here is to avoid warnings with printf... 
- printf("[%s] %dd%d: (ms,mt,mu)=(%lld,%lld,%lld) " - "(nf1,nf2,nf3)=(%lld,%lld,%lld)\n ntrans=%d nthr=%d " - "batchSize=%d ", - __func__, dim, type, (long long)p->ms, (long long)p->mt, (long long)p->mu, - (long long)p->nf1, (long long)p->nf2, (long long)p->nf3, ntrans, nthr, - p->batchSize); - if (p->batchSize == 1) // spread_thread has no effect in this case - printf("\n"); - else - printf(" spread_thread=%d\n", p->opts.spread_thread); - } - - // STEP 0: get Fourier coeffs of spreading kernel along each fine grid dim - CNTime timer; - timer.start(); - onedim_fseries_kernel(p->nf1, p->phiHat1, p->spopts); - if (dim > 1) onedim_fseries_kernel(p->nf2, p->phiHat2, p->spopts); - if (dim > 2) onedim_fseries_kernel(p->nf3, p->phiHat3, p->spopts); - if (p->opts.debug) - printf("[%s] kernel fser (ns=%d):\t\t%.3g s\n", __func__, p->spopts.nspread, - timer.elapsedsec()); - - p->nf = p->nf1 * p->nf2 * p->nf3; // fine grid total number of points - if (p->nf * p->batchSize > MAX_NF) { - fprintf(stderr, - "[%s] fwBatch would be bigger than MAX_NF, not attempting malloc!\n", - __func__); - // FIXME: this error causes memory leaks. We should free phiHat1, phiHat2, phiHat3 - return FINUFFT_ERR_MAXNALLOC; - } - - timer.restart(); - p->fwBatch = p->fftPlan->alloc_complex(p->nf * p->batchSize); // the big workspace - if (p->opts.debug) - printf("[%s] fwBatch %.2fGB alloc: \t%.3g s\n", __func__, - (double)1E-09 * sizeof(CPX) * p->nf * p->batchSize, timer.elapsedsec()); - if (!p->fwBatch) { // we don't catch all such mallocs, just this big one - fprintf(stderr, "[%s] FFTW malloc failed for fwBatch (working fine grids)!\n", - __func__); - free(p->phiHat1); - free(p->phiHat2); - free(p->phiHat3); - return FINUFFT_ERR_ALLOC; - } - - timer.restart(); // plan the FFTW - const auto ns = gridsize_for_fft(p); - p->fftPlan->plan(ns, p->batchSize, p->fwBatch, p->fftSign, p->opts.fftw, nthr_fft); - if (p->opts.debug) - printf("[%s] FFT plan (mode %d, nthr=%d):\t%.3g s\n", __func__, p->opts.fftw, - nthr_fft, timer.elapsedsec()); - - } else { // -------------------------- type 3 (no planning) ------------ - - if (p->opts.debug) printf("[%s] %dd%d: ntrans=%d\n", __func__, dim, type, ntrans); - // in case destroy occurs before setpts, need safe dummy ptrs/plans... - p->CpBatch = NULL; - p->fwBatch = NULL; - p->Sp = NULL; - p->Tp = NULL; - p->Up = NULL; - p->prephase = NULL; - p->deconv = NULL; - p->innerT2plan = NULL; - // Type 3 will call finufft_makeplan for type 2; no need to init FFTW - // Note we don't even know nj or nk yet, so can't do anything else! - } - return ier; // report setup_spreader status (could be warning) -} - -// SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, BIGINT nk, - FLT *s, FLT *t, FLT *u) -/* For type 1,2: just checks and (possibly) sorts the NU xyz points, in prep for - spreading. (The last 4 arguments are ignored.) - For type 3: allocates internal working arrays, scales/centers the NU points - and NU target freqs (stu), evaluates spreading kernel FT at all target freqs. 
-*/ -{ - int d = p->dim; // abbrev for spatial dim - CNTime timer; - timer.start(); - p->nj = nj; // the user only now chooses how many NU (x,y,z) pts - if (nj < 0) { - fprintf(stderr, "[%s] nj (%lld) cannot be negative!\n", __func__, (long long)nj); - return FINUFFT_ERR_NUM_NU_PTS_INVALID; - } else if (nj > MAX_NU_PTS) { - fprintf(stderr, "[%s] nj (%lld) exceeds MAX_NU_PTS\n", __func__, (long long)nj); - return FINUFFT_ERR_NUM_NU_PTS_INVALID; - } - - if (p->type != 3) { // ------------------ TYPE 1,2 SETPTS ------------------- - // (all we can do is check and maybe bin-sort the NU pts) - p->X = xj; // plan must keep pointers to user's fixed NU pts - p->Y = yj; - p->Z = zj; - int ier = spreadcheck(p->nf1, p->nf2, p->nf3, p->nj, xj, yj, zj, p->spopts); - if (p->opts.debug > 1) - printf("[%s] spreadcheck (%d):\t%.3g s\n", __func__, p->spopts.chkbnds, - timer.elapsedsec()); - if (ier) // no warnings allowed here - return ier; - timer.restart(); - // Free sortIndices if it has been allocated before in case of repeated setpts - // calls causing memory leak. We don't know it is the same size as before, so we - // have to malloc each time. - if (p->sortIndices) free(p->sortIndices); - p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT) * p->nj); - if (!p->sortIndices) { - fprintf(stderr, "[%s] failed to allocate sortIndices!\n", __func__); - return FINUFFT_ERR_SPREAD_ALLOC; - } - p->didSort = - indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, xj, yj, zj, p->spopts); - if (p->opts.debug) - printf("[%s] sort (didSort=%d):\t\t%.3g s\n", __func__, p->didSort, - timer.elapsedsec()); - - } else { // ------------------------- TYPE 3 SETPTS ----------------------- - // (here we can precompute pre/post-phase factors and plan the t2) - - if (nk < 0) { - fprintf(stderr, "[%s] nk (%lld) cannot be negative!\n", __func__, (long long)nk); - return FINUFFT_ERR_NUM_NU_PTS_INVALID; - } else if (nk > MAX_NU_PTS) { - fprintf(stderr, "[%s] nk (%lld) exceeds MAX_NU_PTS\n", __func__, (long long)nk); - return FINUFFT_ERR_NUM_NU_PTS_INVALID; - } - p->nk = nk; // user set # targ freq pts - p->S = s; // keep pointers to user's input target pts - p->T = t; - p->U = u; - - // pick x, s intervals & shifts & # fine grid pts (nf) in each dim... - FLT S1, S2, S3; // get half-width X, center C, which contains {x_j}... - arraywidcen(nj, xj, &(p->t3P.X1), &(p->t3P.C1)); - arraywidcen(nk, s, &S1, &(p->t3P.D1)); // same D, S, but for {s_k} - set_nhg_type3(S1, p->t3P.X1, p->opts, p->spopts, &(p->nf1), &(p->t3P.h1), - &(p->t3P.gam1)); // applies twist i) - p->t3P.C2 = 0.0; // their defaults if dim 2 unused, etc - p->t3P.D2 = 0.0; - if (d > 1) { - arraywidcen(nj, yj, &(p->t3P.X2), &(p->t3P.C2)); // {y_j} - arraywidcen(nk, t, &S2, &(p->t3P.D2)); // {t_k} - set_nhg_type3(S2, p->t3P.X2, p->opts, p->spopts, &(p->nf2), &(p->t3P.h2), - &(p->t3P.gam2)); - } - p->t3P.C3 = 0.0; - p->t3P.D3 = 0.0; - if (d > 2) { - arraywidcen(nj, zj, &(p->t3P.X3), &(p->t3P.C3)); // {z_j} - arraywidcen(nk, u, &S3, &(p->t3P.D3)); // {u_k} - set_nhg_type3(S3, p->t3P.X3, p->opts, p->spopts, &(p->nf3), &(p->t3P.h3), - &(p->t3P.gam3)); - } - - if (p->opts.debug) { // report on choices of shifts, centers, etc... 
- printf("\tM=%lld N=%lld\n", (long long)nj, (long long)nk); - printf("\tX1=%.3g C1=%.3g S1=%.3g D1=%.3g gam1=%g nf1=%lld h1=%.3g\t\n", p->t3P.X1, - p->t3P.C1, S1, p->t3P.D1, p->t3P.gam1, (long long)p->nf1, p->t3P.h1); - if (d > 1) - printf("\tX2=%.3g C2=%.3g S2=%.3g D2=%.3g gam2=%g nf2=%lld h2=%.3g\n", p->t3P.X2, - p->t3P.C2, S2, p->t3P.D2, p->t3P.gam2, (long long)p->nf2, p->t3P.h2); - if (d > 2) - printf("\tX3=%.3g C3=%.3g S3=%.3g D3=%.3g gam3=%g nf3=%lld h3=%.3g\n", p->t3P.X3, - p->t3P.C3, S3, p->t3P.D3, p->t3P.gam3, (long long)p->nf3, p->t3P.h3); - } - p->nf = p->nf1 * p->nf2 * p->nf3; // fine grid total number of points - if (p->nf * p->batchSize > MAX_NF) { - fprintf(stderr, - "[%s t3] fwBatch would be bigger than MAX_NF, not attempting malloc!\n", - __func__); - return FINUFFT_ERR_MAXNALLOC; - } - p->fftPlan->free(p->fwBatch); - p->fwBatch = p->fftPlan->alloc_complex(p->nf * p->batchSize); // maybe big workspace - - // (note FFTW_ALLOC is not needed over malloc, but matches its type) - if (p->CpBatch) free(p->CpBatch); - p->CpBatch = (CPX *)malloc(sizeof(CPX) * nj * p->batchSize); // batch c' work - - if (p->opts.debug) - printf("[%s t3] widcen, batch %.2fGB alloc:\t%.3g s\n", __func__, - (double)1E-09 * sizeof(CPX) * (p->nf + nj) * p->batchSize, - timer.elapsedsec()); - if (!p->fwBatch || !p->CpBatch) { - fprintf(stderr, "[%s t3] malloc fail for fwBatch or CpBatch!\n", __func__); - return FINUFFT_ERR_ALLOC; - } - // printf("fwbatch, cpbatch ptrs: %llx %llx\n",p->fwBatch,p->CpBatch); - - // alloc rescaled NU src pts x'_j (in X etc), rescaled NU targ pts s'_k ... - // FIXME: should use realloc - if (p->X) free(p->X); - if (p->Sp) free(p->Sp); - p->X = (FLT *)malloc(sizeof(FLT) * nj); - p->Sp = (FLT *)malloc(sizeof(FLT) * nk); - if (d > 1) { - if (p->Y) free(p->Y); - if (p->Tp) free(p->Tp); - p->Y = (FLT *)malloc(sizeof(FLT) * nj); - p->Tp = (FLT *)malloc(sizeof(FLT) * nk); - } - if (d > 2) { - if (p->Z) free(p->Z); - if (p->Up) free(p->Up); - p->Z = (FLT *)malloc(sizeof(FLT) * nj); - p->Up = (FLT *)malloc(sizeof(FLT) * nk); - } - - // always shift as use gam to rescale x_j to x'_j, etc (twist iii)... - FLT ig1 = 1.0 / p->t3P.gam1, ig2 = 0.0, ig3 = 0.0; // "reciprocal-math" optim - if (d > 1) ig2 = 1.0 / p->t3P.gam2; - if (d > 2) ig3 = 1.0 / p->t3P.gam3; -#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) - for (BIGINT j = 0; j < nj; ++j) { - p->X[j] = (xj[j] - p->t3P.C1) * ig1; // rescale x_j - if (d > 1) // (ok to do inside loop because of branch predict) - p->Y[j] = (yj[j] - p->t3P.C2) * ig2; // rescale y_j - if (d > 2) p->Z[j] = (zj[j] - p->t3P.C3) * ig3; // rescale z_j - } - - // set up prephase array... - CPX imasign = (p->fftSign >= 0) ? IMA : -IMA; // +-i - if (p->prephase) free(p->prephase); - p->prephase = (CPX *)malloc(sizeof(CPX) * nj); - if (p->t3P.D1 != 0.0 || p->t3P.D2 != 0.0 || p->t3P.D3 != 0.0) { -#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) - for (BIGINT j = 0; j < nj; ++j) { // ... loop over src NU locs - FLT phase = p->t3P.D1 * xj[j]; - if (d > 1) phase += p->t3P.D2 * yj[j]; - if (d > 2) phase += p->t3P.D3 * zj[j]; - p->prephase[j] = cos(phase) + imasign * sin(phase); // Euler - // e^{+-i.phase} - } - } else - for (BIGINT j = 0; j < nj; ++j) - p->prephase[j] = (CPX)1.0; // *** or keep flag so no mult in exec?? - - // rescale the target s_k etc to s'_k etc... 
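The pre/post phase and rescale formulas used in this block reduce, in one dimension, to the sketch below (all constants are illustrative placeholders, not values the planner would pick):

#include <cmath>
#include <complex>
#include <cstdio>

// 1D sketch of the type-3 twists: sources are recentered/rescaled, strengths
// pre-phased by e^{+-i D x_j}, targets mapped to s' with |s'| <= pi/upsampfac,
// and outputs later multiplied by e^{+-i C (s_k - D)} / phihat(s'_k).
int main() {
  const double pi = 3.141592653589793;
  double C = 0.7, D = 12.0, gam = 3.456, h = 2 * pi / 432; // placeholder values
  int fftSign = +1;
  double xj = 2.5, sk = 15.0;
  double xp = (xj - C) / gam;               // rescaled source x'_j
  double sp = h * gam * (sk - D);           // rescaled target s'_k
  std::complex<double> I(0, 1);
  auto prephase = std::exp(double(fftSign) * I * (D * xj));       // unit modulus
  auto postfac  = std::exp(double(fftSign) * I * ((sk - D) * C)); // unit modulus
  std::printf("x'=%.4f s'=%.4f |pre|=%.3f |post|=%.3f\n", xp, sp,
              std::abs(prephase), std::abs(postfac));
}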
-#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) - for (BIGINT k = 0; k < nk; ++k) { - p->Sp[k] = p->t3P.h1 * p->t3P.gam1 * (s[k] - p->t3P.D1); // so |s'_k| < pi/R - if (d > 1) - p->Tp[k] = p->t3P.h2 * p->t3P.gam2 * (t[k] - p->t3P.D2); // so |t'_k| < - // pi/R - if (d > 2) - p->Up[k] = p->t3P.h3 * p->t3P.gam3 * (u[k] - p->t3P.D3); // so |u'_k| < - // pi/R - } - // (old STEP 3a) Compute deconvolution post-factors array (per targ pt)... - // (exploits that FT separates because kernel is prod of 1D funcs) - if (p->deconv) free(p->deconv); - p->deconv = (CPX *)malloc(sizeof(CPX) * nk); - FLT *phiHatk1 = (FLT *)malloc(sizeof(FLT) * nk); // don't confuse w/ p->phiHat - onedim_nuft_kernel(nk, p->Sp, phiHatk1, p->spopts); // fill phiHat1 - FLT *phiHatk2 = NULL, *phiHatk3 = NULL; - if (d > 1) { - phiHatk2 = (FLT *)malloc(sizeof(FLT) * nk); - onedim_nuft_kernel(nk, p->Tp, phiHatk2, p->spopts); // fill phiHat2 - } - if (d > 2) { - phiHatk3 = (FLT *)malloc(sizeof(FLT) * nk); - onedim_nuft_kernel(nk, p->Up, phiHatk3, p->spopts); // fill phiHat3 - } - int Cfinite = - isfinite(p->t3P.C1) && isfinite(p->t3P.C2) && isfinite(p->t3P.C3); // C can be nan - // or inf if - // M=0, no - // input NU pts - int Cnonzero = p->t3P.C1 != 0.0 || p->t3P.C2 != 0.0 || p->t3P.C3 != 0.0; // cen -#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) - for (BIGINT k = 0; k < nk; ++k) { // .... loop over NU targ freqs - FLT phiHat = phiHatk1[k]; - if (d > 1) phiHat *= phiHatk2[k]; - if (d > 2) phiHat *= phiHatk3[k]; - p->deconv[k] = (CPX)(1.0 / phiHat); - if (Cfinite && Cnonzero) { - FLT phase = (s[k] - p->t3P.D1) * p->t3P.C1; - if (d > 1) phase += (t[k] - p->t3P.D2) * p->t3P.C2; - if (d > 2) phase += (u[k] - p->t3P.D3) * p->t3P.C3; - p->deconv[k] *= cos(phase) + imasign * sin(phase); // Euler e^{+-i.phase} - } - } - free(phiHatk1); - free(phiHatk2); - free(phiHatk3); // done w/ deconv fill - if (p->opts.debug) - printf("[%s t3] phase & deconv factors:\t%.3g s\n", __func__, timer.elapsedsec()); - - // Set up sort for spreading Cp (from primed NU src pts X, Y, Z) to fw... - timer.restart(); - // Free sortIndices if it has been allocated before in case of repeated setpts - // calls causing memory leak. We don't know it is the same size as before, so we - // have to malloc each time. - if (p->sortIndices) free(p->sortIndices); - p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT) * p->nj); - if (!p->sortIndices) { - fprintf(stderr, "[%s t3] failed to allocate sortIndices!\n", __func__); - return FINUFFT_ERR_SPREAD_ALLOC; - } - p->didSort = indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, p->X, p->Y, - p->Z, p->spopts); - if (p->opts.debug) - printf("[%s t3] sort (didSort=%d):\t\t%.3g s\n", __func__, p->didSort, - timer.elapsedsec()); - - // Plan and setpts once, for the (repeated) inner type 2 finufft call... - timer.restart(); - BIGINT t2nmodes[] = {p->nf1, p->nf2, p->nf3}; // t2 input is actually fw - finufft_opts t2opts = p->opts; // deep copy, since not ptrs - t2opts.modeord = 0; // needed for correct t3! - t2opts.debug = max(0, p->opts.debug - 1); // don't print as much detail - t2opts.spread_debug = max(0, p->opts.spread_debug - 1); - t2opts.showwarn = 0; // so don't see warnings 2x - // (...could vary other t2opts here?) 
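Before the inner plan is built below, it helps to see where it slots in: per batch, type-3 execution is pre-phase, spread, inner type-2, deconvolve. A data-flow sketch with the heavy stages stubbed out (all names and sizes illustrative):

#include <complex>
#include <cstdio>
#include <vector>

// Order and array roles of one type-3 batch (the STEP 0-3 sequence in execute);
// the spread and inner type-2 stages are stubs, only the data flow is shown.
int main() {
  using C = std::complex<double>;
  long nj = 4, nk = 3;
  std::vector<C> cj(nj, C(1, 0));               // input strengths
  std::vector<C> prephase(nj, C(0.6, 0.8));     // |prephase| = 1
  std::vector<C> CpBatch(nj), fk(nk), deconv(nk, C(0.5, 0));
  for (long j = 0; j < nj; ++j)
    CpBatch[j] = prephase[j] * cj[j];           // STEP 0: pre-phase c_j -> c'_j
  // STEP 1: spread c'_j at primed NU points onto the fine grid fw   (stub)
  // STEP 2: inner type-2 NUFFT from fw to the primed targets -> fk  (stub)
  for (long k = 0; k < nk; ++k) fk[k] = C(1, 0);
  for (long k = 0; k < nk; ++k)
    fk[k] *= deconv[k];                         // STEP 3: 1/phihat & C-phasing
  std::printf("fk[0] = %g%+gi\n", fk[0].real(), fk[0].imag());
}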
- if (p->innerT2plan) FINUFFT_DESTROY(p->innerT2plan); - int ier = FINUFFT_MAKEPLAN(2, d, t2nmodes, p->fftSign, p->batchSize, p->tol, - &p->innerT2plan, &t2opts); - if (ier > 1) { // if merely warning, still proceed - fprintf(stderr, "[%s t3]: inner type 2 plan creation failed with ier=%d!\n", - __func__, ier); - return ier; - } - ier = FINUFFT_SETPTS(p->innerT2plan, nk, p->Sp, p->Tp, p->Up, 0, NULL, NULL, - NULL); // note nk = # output points (not nj) - if (ier > 1) { - fprintf(stderr, "[%s t3]: inner type 2 setpts failed, ier=%d!\n", __func__, ier); - return ier; - } - if (p->opts.debug) - printf("[%s t3] inner t2 plan & setpts: \t%.3g s\n", __func__, timer.elapsedsec()); - } - return 0; + FLT *s, FLT *t, FLT *u) { + return finufft_setpts_t(reinterpret_cast *>(p), nj, xj, yj, zj, + nk, s, t, u); } -// ............ end setpts .................................................. -// EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX *cj, CPX *fk) { - /* See ../docs/cguru.doc for current documentation. - - For given (stack of) weights cj or coefficients fk, performs NUFFTs with - existing (sorted) NU pts and existing plan. - For type 1 and 3: cj is input, fk is output. - For type 2: fk is input, cj is output. - Performs spread/interp, pre/post deconvolve, and FFT as appropriate - for each of the 3 types. - For cases of ntrans>1, performs work in blocks of size up to batchSize. - Return value 0 (no error diagnosis yet). - Barnett 5/20/20, based on Malleo 2019. -*/ - CNTime timer; - timer.start(); - - if (p->type != 3) { // --------------------- TYPE 1,2 EXEC ------------------ - - double t_sprint = 0.0, t_fft = 0.0, t_deconv = 0.0; // accumulated timing - if (p->opts.debug) - printf("[%s] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, p->ntrans, - p->nbatch, p->batchSize); - - for (int b = 0; b * p->batchSize < p->ntrans; b++) { // .....loop b over batches - - // current batch is either batchSize, or possibly truncated if last one - int thisBatchSize = min(p->ntrans - b * p->batchSize, p->batchSize); - int bB = b * p->batchSize; // index of vector, since batchsizes same - CPX *cjb = cj + bB * p->nj; // point to batch of weights - CPX *fkb = fk + bB * p->N; // point to batch of mode coeffs - if (p->opts.debug > 1) - printf("[%s] start batch %d (size %d):\n", __func__, b, thisBatchSize); - - // STEP 1: (varies by type) - timer.restart(); - if (p->type == 1) { // type 1: spread NU pts p->X, weights cj, to fw grid - spreadinterpSortedBatch(thisBatchSize, p, cjb); - t_sprint += timer.elapsedsec(); - } else { // type 2: amplify Fourier coeffs fk into 0-padded fw - deconvolveBatch(thisBatchSize, p, fkb); - t_deconv += timer.elapsedsec(); - } - - // STEP 2: call the FFT on this batch - timer.restart(); - do_fft(p); - t_fft += timer.elapsedsec(); - if (p->opts.debug > 1) printf("\tFFT exec:\t\t%.3g s\n", timer.elapsedsec()); - - // STEP 3: (varies by type) - timer.restart(); - if (p->type == 1) { // type 1: deconvolve (amplify) fw and shuffle to fk - deconvolveBatch(thisBatchSize, p, fkb); - t_deconv += timer.elapsedsec(); - } else { // type 2: interpolate unif fw grid to NU target pts - spreadinterpSortedBatch(thisBatchSize, p, cjb); - t_sprint += timer.elapsedsec(); - } - } // ........end b loop - - if (p->opts.debug) { // report total times in their natural order... - if (p->type == 1) { - printf("[%s] done. 
tot spread:\t\t%.3g s\n", __func__, t_sprint); - printf(" tot FFT:\t\t\t\t%.3g s\n", t_fft); - printf(" tot deconvolve:\t\t\t%.3g s\n", t_deconv); - } else { - printf("[%s] done. tot deconvolve:\t\t%.3g s\n", __func__, t_deconv); - printf(" tot FFT:\t\t\t\t%.3g s\n", t_fft); - printf(" tot interp:\t\t\t%.3g s\n", t_sprint); - } - } - } - - else { // ----------------------------- TYPE 3 EXEC --------------------- - - // for (BIGINT j=0;j<10;++j) printf("\tcj[%ld]=%.15g+%.15gi\n",(long - // int)j,(double)real(cj[j]),(double)imag(cj[j])); // debug - - double t_pre = 0.0, t_spr = 0.0, t_t2 = 0.0, - t_deconv = 0.0; // accumulated timings - if (p->opts.debug) - printf("[%s t3] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, p->ntrans, - p->nbatch, p->batchSize); - - for (int b = 0; b * p->batchSize < p->ntrans; b++) { // .....loop b over batches - - // batching and pointers to this batch, identical to t1,2 above... - int thisBatchSize = min(p->ntrans - b * p->batchSize, p->batchSize); - int bB = b * p->batchSize; - CPX *cjb = cj + bB * p->nj; // batch of input strengths - CPX *fkb = fk + bB * p->nk; // batch of output strengths - if (p->opts.debug > 1) - printf("[%s t3] start batch %d (size %d):\n", __func__, b, thisBatchSize); - - // STEP 0: pre-phase (possibly) the c_j input strengths into c'_j batch... - timer.restart(); -#pragma omp parallel for num_threads(p->opts.nthreads) // or p->batchSize? - for (int i = 0; i < thisBatchSize; i++) { - BIGINT ioff = i * p->nj; - for (BIGINT j = 0; j < p->nj; ++j) { - p->CpBatch[ioff + j] = p->prephase[j] * cjb[ioff + j]; - } - } - t_pre += timer.elapsedsec(); - - // STEP 1: spread c'_j batch (x'_j NU pts) into fw batch grid... - timer.restart(); - p->spopts.spread_direction = 1; // spread - spreadinterpSortedBatch(thisBatchSize, p, p->CpBatch); // p->X are primed - t_spr += timer.elapsedsec(); - - // STEP 2: type 2 NUFFT from fw batch to user output fk array batch... - timer.restart(); - // illegal possible shrink of ntrans *after* plan for smaller last batch: - p->innerT2plan->ntrans = thisBatchSize; // do not try this at home! - /* (alarming that FFT not shrunk, but safe, because t2's fwBatch array - still the same size, as Andrea explained; just wastes a few flops) */ - FINUFFT_EXECUTE(p->innerT2plan, fkb, p->fwBatch); - t_t2 += timer.elapsedsec(); - // STEP 3: apply deconvolve (precomputed 1/phiHat(targ_k), phasing too)... - timer.restart(); -#pragma omp parallel for num_threads(p->opts.nthreads) - for (int i = 0; i < thisBatchSize; i++) { - BIGINT ioff = i * p->nk; - for (BIGINT k = 0; k < p->nk; ++k) fkb[ioff + k] *= p->deconv[k]; - } - t_deconv += timer.elapsedsec(); - } // ........end b loop - - if (p->opts.debug) { // report total times in their natural order... - printf("[%s t3] done. tot prephase:\t\t%.3g s\n", __func__, t_pre); - printf(" tot spread:\t\t\t%.3g s\n", t_spr); - printf(" tot type 2:\t\t\t%.3g s\n", t_t2); - printf(" tot deconvolve:\t\t%.3g s\n", t_deconv); - } - } - // for (BIGINT k=0;k<10;++k) printf("\tfk[%ld]=%.15g+%.15gi\n",(long - // int)k,(double)real(fk[k]),(double)imag(fk[k])); // debug - - return 0; + return finufft_execute_t(reinterpret_cast *>(p), cj, fk); } -// DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD int FINUFFT_DESTROY(FINUFFT_PLAN p) // Free everything we allocated inside of finufft_plan pointed to by p. // Also must not crash if called immediately after finufft_makeplan. 
-// Thus either each thing free'd here is guaranteed to be NULL or correctly +// Thus either each thing free'd here is guaranteed to be nullptr or correctly // allocated. { - if (!p) // NULL ptr, so not a ptr to a plan, report error + if (!p) // nullptr, so not a ptr to a plan, report error return 1; - p->fftPlan->free(p->fwBatch); // free the big FFTW (or t3 spread) working array - free(p->sortIndices); - if (p->type == 1 || p->type == 2) { - free(p->phiHat1); - free(p->phiHat2); - free(p->phiHat3); - } else { // free the stuff alloc for type 3 only - FINUFFT_DESTROY(p->innerT2plan); // if NULL, ignore its error code - free(p->CpBatch); - free(p->Sp); - free(p->Tp); - free(p->Up); - free(p->X); - free(p->Y); - free(p->Z); - free(p->prephase); - free(p->deconv); - } - delete p; + delete reinterpret_cast *>(p); + p = nullptr; return 0; // success } diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp new file mode 100644 index 000000000..834420bbe --- /dev/null +++ b/src/finufft_core.cpp @@ -0,0 +1,1204 @@ +#include +#include +#include +#include + +#include "../contrib/legendre_rule_fast.h" +#include +#include +#include +#include +#include +#include +#include + +using namespace std; +using namespace finufft; +using namespace finufft::utils; +using namespace finufft::spreadinterp; +using namespace finufft::quadrature; + +/* Computational core for FINUFFT. + + Based on Barnett 2017-2018 finufft?d.cpp containing nine drivers, plus + 2d1/2d2 many-vector drivers by Melody Shih, summer 2018. + Original guru interface written by Andrea Malleo, summer 2019, mentored + by Alex Barnett. Many rewrites in early 2020 by Alex Barnett & Libin Lu. + + As of v1.2 these replace the old hand-coded separate 9 finufft?d?() functions + and the two finufft2d?many() functions. The (now 18) simple C++ interfaces + are in simpleinterfaces.cpp. + +Algorithm summaries taken from old finufft?d?() documentation, Feb-Jun 2017: + + TYPE 1: + The type 1 NUFFT proceeds in three main steps: + 1) spread data to oversampled regular mesh using kernel. + 2) compute FFT on uniform mesh + 3) deconvolve by division of each Fourier mode independently by the kernel + Fourier series coeffs (not merely FFT of kernel), shuffle to output. + The kernel coeffs are precomputed in what is called step 0 in the code. + + TYPE 2: + The type 2 algorithm proceeds in three main steps: + 1) deconvolve (amplify) each Fourier mode, dividing by kernel Fourier coeff + 2) compute inverse FFT on uniform fine grid + 3) spread (dir=2, ie interpolate) data to regular mesh + The kernel coeffs are precomputed in what is called step 0 in the code. + + TYPE 3: + The type 3 algorithm is basically a type 2 (which is implemented precisely + as call to type 2) replacing the middle FFT (Step 2) of a type 1. + Beyond this, the new twists are: + i) nf1, number of upsampled points for the type-1, depends on the product + of interval widths containing input and output points (X*S). + ii) The deconvolve (post-amplify) step is division by the Fourier transform + of the scaled kernel, evaluated on the *nonuniform* output frequency + grid; this is done by direct approximation of the Fourier integral + using quadrature of the kernel function times exponentials. + iii) Shifts in x (real) and s (Fourier) are done to minimize the interval + half-widths X and S, hence nf1. + + MULTIPLE STRENGTH VECTORS FOR THE SAME NONUNIFORM POINTS (n_transf>1): + maxBatchSize (set to max_num_omp_threads) times the RAM is needed, so + this is good only for small problems. 
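A sketch of why destroy can collapse to a single delete once the plan is a genuine C++ type: members that own their storage are freed by the compiler-generated destructor, ending the per-array free() bookkeeping. The struct below is illustrative only, not the actual FINUFFT_PLAN_T layout.

#include <memory>
#include <vector>

// RAII plan sketch: vectors and unique_ptr own their memory, so the implicit
// destructor releases everything, including a nested inner type-3 plan.
template<typename TF> struct plan_sketch {
  std::vector<TF> phiHat1, phiHat2, phiHat3;    // kernel FT coeffs
  std::vector<long> sortIndices;                // NU-point sort
  std::unique_ptr<plan_sketch<TF>> innerT2plan; // type-3 inner plan
};

int main() {
  auto *p = new plan_sketch<double>;
  p->phiHat1.resize(129);
  p->innerT2plan.reset(new plan_sketch<double>);
  delete p;   // one delete, no per-member free() bookkeeping
}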
+ + +Design notes for guru interface implementation: + +* Since finufft_plan is C-compatible, we need to use malloc/free for its + allocatable arrays, keeping it quite low-level. We can't use std::vector + since that would only survive in the scope of each function. + +* Thread-safety: FINUFFT plans are passed as pointers, so it has no global + state apart from that associated with FFTW (and the did_fftw_init). +*/ + +// ---------- local math routines (were in common.cpp; no need now): -------- + +namespace finufft { +namespace common { + +static constexpr double PI = 3.14159265358979329; + +static int set_nf_type12(BIGINT ms, finufft_opts opts, finufft_spread_opts spopts, + BIGINT *nf) +// Type 1 & 2 recipe for how to set 1d size of upsampled array, nf, given opts +// and requested number of Fourier modes ms. Returns 0 if success, else an +// error code if nf was unreasonably big (& tell the world). +{ + *nf = BIGINT(opts.upsampfac * double(ms)); // manner of rounding not crucial + if (*nf < 2 * spopts.nspread) *nf = 2 * spopts.nspread; // otherwise spread fails + if (*nf < MAX_NF) { + *nf = next235even(*nf); // expensive at huge nf + return 0; + } else { + fprintf(stderr, + "[%s] nf=%.3g exceeds MAX_NF of %.3g, so exit without attempting even a " + "malloc\n", + __func__, (double)*nf, (double)MAX_NF); + return FINUFFT_ERR_MAXNALLOC; + } +} + +template +static int setup_spreader_for_nufft(finufft_spread_opts &spopts, T eps, finufft_opts opts, + int dim) +// Set up the spreader parameters given eps, and pass across various nufft +// options. Return status of setup_spreader. Uses pass-by-ref. Barnett 10/30/17 +{ + // this calls spreadinterp.cpp... + int ier = setup_spreader(spopts, eps, opts.upsampfac, opts.spread_kerevalmeth, + opts.spread_debug, opts.showwarn, dim); + // override various spread opts from their defaults... + spopts.debug = opts.spread_debug; + spopts.sort = opts.spread_sort; // could make dim or CPU choices here? + spopts.kerpad = opts.spread_kerpad; // (only applies to kerevalmeth=0) + spopts.chkbnds = opts.chkbnds; + spopts.nthreads = opts.nthreads; // 0 passed in becomes omp max by here + if (opts.spread_nthr_atomic >= 0) // overrides + spopts.atomic_threshold = opts.spread_nthr_atomic; + if (opts.spread_max_sp_size > 0) // overrides + spopts.max_subproblem_size = opts.spread_max_sp_size; + if (opts.chkbnds != 1) // deprecated default value hardcoded here + fprintf(stderr, + "[%s] opts.chkbnds is deprecated; ignoring change from default value.\n", + __func__); + return ier; +} + +template +static void set_nhg_type3(T S, T X, finufft_opts opts, finufft_spread_opts spopts, + BIGINT *nf, T *h, T *gam) +/* sets nf, h (upsampled grid spacing), and gamma (x_j rescaling factor), + for type 3 only. + Inputs: + X and S are the xj and sk interval half-widths respectively. + opts and spopts are the NUFFT and spreader opts strucs, respectively. + Outputs: + nf is the size of upsampled grid for a given single dimension. + h is the grid spacing = 2pi/nf + gam is the x rescale factor, ie x'_j = x_j/gam (modulo shifts). + Barnett 2/13/17. Caught inf/nan 3/14/17. io int types changed 3/28/17 + New logic 6/12/17 +*/ +{ + int nss = spopts.nspread + 1; // since ns may be odd + T Xsafe = X, Ssafe = S; // may be tweaked locally + if (X == 0.0) // logic ensures XS>=1, handle X=0 a/o S=0 + if (S == 0.0) { + Xsafe = 1.0; + Ssafe = 1.0; + } else + Xsafe = max(Xsafe, 1 / S); + else + Ssafe = max(Ssafe, 1 / X); + // use the safe X and S... 
+ auto nfd = T(2.0 * opts.upsampfac * Ssafe * Xsafe / PI + nss); + if (!isfinite(nfd)) nfd = 0.0; // use T to catch inf + *nf = (BIGINT)nfd; + // printf("initial nf=%lld, ns=%d\n",*nf,spopts.nspread); + // catch too small nf, and nan or +-inf, otherwise spread fails... + if (*nf < 2 * spopts.nspread) *nf = 2 * spopts.nspread; + if (*nf < MAX_NF) // otherwise will fail anyway + *nf = next235even(*nf); // expensive at huge nf + *h = T(2.0 * PI / *nf); // upsampled grid spacing + *gam = T(*nf / (2.0 * opts.upsampfac * Ssafe)); // x scale fac to x' +} + +template +static void onedim_fseries_kernel(BIGINT nf, std::vector &fwkerhalf, + finufft_spread_opts opts) +/* + Approximates exact Fourier series coeffs of cnufftspread's real symmetric + kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting + narrowness of kernel. Uses phase winding for cheap eval on the regular freq + grid. Note that this is also the Fourier transform of the non-periodized + kernel. The FT definition is f(k) = int e^{-ikx} f(x) dx. The output has an + overall prefactor of 1/h, which is needed anyway for the correction, and + arises because the quadrature weights are scaled for grid units not x units. + The kernel is actually centered at nf/2, related to the centering of the grid; + this is now achieved by the sign flip in a[n] below. + + Inputs: + nf - size of 1d uniform spread grid, must be even. + opts - spreading opts object, needed to eval kernel (must be already set up) + + Outputs: + fwkerhalf - real Fourier series coeffs from indices 0 to nf/2 inclusive, + divided by h = 2pi/n. + (should be allocated for at least nf/2+1 Ts) + + Compare onedim_dct_kernel which has same interface, but computes DFT of + sampled kernel, not quite the same object. + + Barnett 2/7/17. openmp (since slow vs fftw in 1D large-N case) 3/3/18. + Fixed num_threads 7/20/20. Reduced rounding error in a[n] calc 8/20/24. + */ +{ + T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support + // # quadr nodes in z (from 0 to J/2; reflections will be added)... + int q = (int)(2 + 3.0 * J2); // not sure why so large? 
cannot exceed MAX_NQUAD + T f[MAX_NQUAD]; + double z[2 * MAX_NQUAD], w[2 * MAX_NQUAD]; + legendre_compute_glr(2 * q, z, w); // only half the nodes used, eg on (0,1) + std::complex a[MAX_NQUAD]; + for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n + z[n] *= J2; // rescale nodes + f[n] = J2 * (T)w[n] * evaluate_kernel((T)z[n], opts); // vals & quadr wei + a[n] = -exp(2 * PI * std::complex(0, 1) * z[n] / double(nf)); // phase winding + // rates + } + BIGINT nout = nf / 2 + 1; // how many values we're writing to + int nt = min(nout, (BIGINT)opts.nthreads); // how many chunks + std::vector brk(nt + 1); // start indices for each thread + for (int t = 0; t <= nt; ++t) // split nout mode indices btw threads + brk[t] = (BIGINT)(0.5 + nout * t / (double)nt); +#pragma omp parallel num_threads(nt) + { // each thread gets own chunk to do + int t = MY_OMP_GET_THREAD_NUM(); + std::complex aj[MAX_NQUAD]; // phase rotator for this thread + for (int n = 0; n < q; ++n) + aj[n] = pow(a[n], (T)brk[t]); // init phase factors for chunk + for (BIGINT j = brk[t]; j < brk[t + 1]; ++j) { // loop along output array + T x = 0.0; // accumulator for answer at this j + for (int n = 0; n < q; ++n) { + x += f[n] * 2 * real(aj[n]); // include the negative freq + aj[n] *= a[n]; // wind the phases + } + fwkerhalf[j] = x; + } + } +} + +template +static void onedim_nuft_kernel(BIGINT nk, const std::vector &k, std::vector &phihat, + finufft_spread_opts opts) +/* + Approximates exact 1D Fourier transform of cnufftspread's real symmetric + kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting + narrowness of kernel. Evaluates at set of arbitrary freqs k in [-pi, pi), + for a kernel with x measured in grid-spacings. (See previous routine for + FT definition). + + Inputs: + nk - number of freqs + k - frequencies, dual to the kernel's natural argument, ie exp(i.k.z) + Note, z is in grid-point units, and k values must be in [-pi, pi) for + accuracy. + opts - spreading opts object, needed to eval kernel (must be already set up) + + Outputs: + phihat - real Fourier transform evaluated at freqs (alloc for nk Ts) + + Barnett 2/8/17. openmp since cos slow 2/9/17 + */ +{ + T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support + // # quadr nodes in z (from 0 to J/2; reflections will be added)... + int q = (int)(2 + 2.0 * J2); // > pi/2 ratio. cannot exceed MAX_NQUAD + if (opts.debug) printf("q (# ker FT quadr pts) = %d\n", q); + T f[MAX_NQUAD]; + double z[2 * MAX_NQUAD], w[2 * MAX_NQUAD]; // glr needs double + legendre_compute_glr(2 * q, z, w); // only half the nodes used, eg on (0,1) + for (int n = 0; n < q; ++n) { + z[n] *= (T)J2; // quadr nodes for [0,J/2] + f[n] = J2 * (T)w[n] * evaluate_kernel((T)z[n], opts); // w/ quadr weights + } +#pragma omp parallel for num_threads(opts.nthreads) + for (BIGINT j = 0; j < nk; ++j) { // loop along output array + T x = 0.0; // register + for (int n = 0; n < q; ++n) + x += f[n] * 2 * cos(k[j] * (T)z[n]); // pos & neg freq pair. use T cos! + phihat[j] = x; + } +} + +template +static void deconvolveshuffle1d(int dir, T prefac, const std::vector &ker, BIGINT ms, + T *fk, BIGINT nf1, std::complex *fw, int modeord) +/* + if dir==1: copies fw to fk with amplification by prefac/ker + if dir==2: copies fk to fw (and zero pads rest of it), same amplification. + + modeord=0: use CMCL-compatible mode ordering in fk (from -N/2 up to N/2-1) + 1: use FFT-style (from 0 to N/2-1, then -N/2 up to -1). 
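// ------------------------------------------------------------------------
// The quadrature idea behind onedim_nuft_kernel above, in isolation: for an
// even kernel f supported on [-J/2, J/2], its Fourier transform at frequency
// k reduces to 2 * int_0^{J/2} f(z) cos(k z) dz, which a handful of nodes
// captures because f is smooth and narrow. Sketch only: a truncated Gaussian
// stands in for the real spreading kernel, and plain midpoint quadrature
// stands in for Gauss-Legendre.
#include <cmath>
#include <cstdio>

int main() {
  const double pi = 3.14159265358979323846;
  const double J2 = 6.0, sigma = 1.0;  // half-support; Gaussian width
  const int q = 32;                    // quadrature nodes on (0, J2)
  const double h = J2 / q, k = 1.5;    // node spacing; test frequency
  double phihat = 0.0;
  for (int n = 0; n < q; ++n) {
    double z = (n + 0.5) * h;          // midpoint nodes
    double f = std::exp(-0.5 * z * z / (sigma * sigma));
    phihat += 2.0 * h * f * std::cos(k * z);  // factor 2: reflection z -> -z
  }
  double exact = sigma * std::sqrt(2.0 * pi) * std::exp(-0.5 * sigma * sigma * k * k);
  std::printf("quadrature %.8f vs exact %.8f\n", phihat, exact); // close match
}
// ------------------------------------------------------------------------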
+
+  fk is a size-ms T complex array (2*ms Ts alternating re,im parts)
+  fw is a size-nf1 complex array (2*nf1 Ts alternating re,im parts)
+  ker is real-valued T array of length nf1/2+1.
+
+  Single thread only, but shouldn't matter since mostly data movement.
+
+  It has been tested that the repeated floating division in this inner loop
+  only contributes at the <3% level in 3D relative to the FFT cost (8 threads).
+  This could be removed by passing in an inverse kernel and doing mults.
+
+  todo: rewrite w/ C++-complex I/O, check complex divide not slower than
+  real divide, or is there a way to force a real divide?
+
+  Barnett 1/25/17. Fixed ms=0 case 3/14/17. modeord flag & clean 10/25/17
+*/
+{
+  BIGINT kmin = -ms / 2, kmax = (ms - 1) / 2; // inclusive range of k indices
+  if (ms == 0) kmax = -1; // fixes zero-pad for trivial no-mode case
+  // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array
+  BIGINT pp = -2 * kmin, pn = 0; // CMCL mode-ordering case (2* since cmplx)
+  if (modeord == 1) {
+    pp = 0;
+    pn = 2 * (kmax + 1);
+  } // or, instead, FFT ordering
+  if (dir == 1) { // read fw, write out to fk...
+    for (BIGINT k = 0; k <= kmax; ++k) {                // non-neg freqs k
+      fk[pp++] = prefac * fw[k].real() / ker[k];        // re
+      fk[pp++] = prefac * fw[k].imag() / ker[k];        // im
+    }
+    for (BIGINT k = kmin; k < 0; ++k) {                 // neg freqs k
+      fk[pn++] = prefac * fw[nf1 + k].real() / ker[-k]; // re
+      fk[pn++] = prefac * fw[nf1 + k].imag() / ker[-k]; // im
+    }
+  } else { // read fk, write out to fw w/ zero padding...
+    for (BIGINT k = kmax + 1; k < nf1 + kmin; ++k) { // zero pad precisely where
+                                                     // needed
+      fw[k] = 0.0;
+    }
+    for (BIGINT k = 0; k <= kmax; ++k) {             // non-neg freqs k
+      fw[k].real(prefac * fk[pp++] / ker[k]);        // re
+      fw[k].imag(prefac * fk[pp++] / ker[k]);        // im
+    }
+    for (BIGINT k = kmin; k < 0; ++k) {              // neg freqs k
+      fw[nf1 + k].real(prefac * fk[pn++] / ker[-k]); // re
+      fw[nf1 + k].imag(prefac * fk[pn++] / ker[-k]); // im
+    }
+  }
+}
+
+template<typename T>
+static void deconvolveshuffle2d(int dir, T prefac, const std::vector<T> &ker1,
+                                const std::vector<T> &ker2, BIGINT ms, BIGINT mt, T *fk,
+                                BIGINT nf1, BIGINT nf2, std::complex<T> *fw, int modeord)
+/*
+  2D version of deconvolveshuffle1d, calls it on each x-line using 1/ker2 fac.
+
+  if dir==1: copies fw to fk with amplification by prefac/(ker1(k1)*ker2(k2)).
+  if dir==2: copies fk to fw (and zero pads rest of it), same amplification.
+
+  modeord=0: use CMCL-compatible mode ordering in fk (each dim increasing)
+          1: use FFT-style (pos then negative, on each dim)
+
+  fk is a complex array stored as 2*ms*mt Ts alternating re,im parts, with
+  ms looped over fast and mt slow.
+  fw is a complex array stored as 2*nf1*nf2 Ts alternating re,im parts, with
+  nf1 looped over fast and nf2 slow.
+  ker1, ker2 are real-valued T arrays of lengths nf1/2+1, nf2/2+1
+  respectively.
+
+  Barnett 2/1/17, Fixed mt=0 case 3/14/17.
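// ------------------------------------------------------------------------
// The index bookkeeping deconvolveshuffle1d implements, reduced to complex
// arithmetic: mode k in [-ms/2, (ms-1)/2] lives at FFT bin k for k >= 0 and
// at bin nf1 + k for k < 0, and is amplified by prefac/ker[|k|]. Stand-in
// data below; CMCL ordering, dir == 1 case only.
#include <complex>
#include <cstdio>
#include <vector>

int main() {
  const long ms = 5, nf1 = 16;   // modes requested; fine-grid size
  std::vector<std::complex<double>> fw(nf1, {1.0, 0.0}); // fake FFT output
  std::vector<double> ker(nf1 / 2 + 1, 2.0);             // fake kernel coeffs
  std::vector<std::complex<double>> fk(ms);
  long kmin = -ms / 2, kmax = (ms - 1) / 2;              // here -2 .. 2
  for (long k = kmin; k <= kmax; ++k) {
    long bin = (k >= 0) ? k : nf1 + k;                   // wrap negative freqs
    fk[k - kmin] = fw[bin] / ker[k >= 0 ? k : -k];       // deconvolve
  }
  for (long i = 0; i < ms; ++i)
    std::printf("fk[%ld] = %g\n", i, fk[i].real());      // all 0.5 here
}
// ------------------------------------------------------------------------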
modeord 10/25/17 +*/ +{ + BIGINT k2min = -mt / 2, k2max = (mt - 1) / 2; // inclusive range of k2 indices + if (mt == 0) k2max = -1; // fixes zero-pad for trivial no-mode case + // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array + BIGINT pp = -2 * k2min * ms, pn = 0; // CMCL mode-ordering case (2* since cmplx) + if (modeord == 1) { + pp = 0; + pn = 2 * (k2max + 1) * ms; + } // or, instead, FFT ordering + if (dir == 2) // zero pad needed x-lines (contiguous in memory) + for (BIGINT j = nf1 * (k2max + 1); j < nf1 * (nf2 + k2min); ++j) // sweeps all + // dims + fw[j] = 0.0; + for (BIGINT k2 = 0; k2 <= k2max; ++k2, pp += 2 * ms) // non-neg y-freqs + // point fk and fw to the start of this y value's row (2* is for complex): + common::deconvolveshuffle1d(dir, prefac / ker2[k2], ker1, ms, fk + pp, nf1, + &fw[nf1 * k2], modeord); + for (BIGINT k2 = k2min; k2 < 0; ++k2, pn += 2 * ms) // neg y-freqs + common::deconvolveshuffle1d(dir, prefac / ker2[-k2], ker1, ms, fk + pn, nf1, + &fw[nf1 * (nf2 + k2)], modeord); +} + +template +static void deconvolveshuffle3d(int dir, T prefac, std::vector &ker1, + std::vector &ker2, std::vector &ker3, BIGINT ms, + BIGINT mt, BIGINT mu, T *fk, BIGINT nf1, BIGINT nf2, + BIGINT nf3, std::complex *fw, int modeord) +/* + 3D version of deconvolveshuffle2d, calls it on each xy-plane using 1/ker3 fac. + + if dir==1: copies fw to fk with ampl by prefac/(ker1(k1)*ker2(k2)*ker3(k3)). + if dir==2: copies fk to fw (and zero pads rest of it), same amplification. + + modeord=0: use CMCL-compatible mode ordering in fk (each dim increasing) + 1: use FFT-style (pos then negative, on each dim) + + fk is a complex array stored as 2*ms*mt*mu Ts alternating re,im parts, with + ms looped over fastest and mu slowest. + fw is a complex array stored as 2*nf1*nf2*nf3 Ts alternating re,im parts, with + nf1 looped over fastest and nf3 slowest. + ker1, ker2, ker3 are real-valued T arrays of lengths nf1/2+1, nf2/2+1, + and nf3/2+1 respectively. + + Barnett 2/1/17, Fixed mu=0 case 3/14/17. 
modeord 10/25/17 +*/ +{ + BIGINT k3min = -mu / 2, k3max = (mu - 1) / 2; // inclusive range of k3 indices + if (mu == 0) k3max = -1; // fixes zero-pad for trivial no-mode case + // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array + BIGINT pp = -2 * k3min * ms * mt, pn = 0; // CMCL mode-ordering (2* since cmplx) + if (modeord == 1) { + pp = 0; + pn = 2 * (k3max + 1) * ms * mt; + } // or FFT ordering + BIGINT np = nf1 * nf2; // # pts in an upsampled Fourier xy-plane + if (dir == 2) // zero pad needed xy-planes (contiguous in memory) + for (BIGINT j = np * (k3max + 1); j < np * (nf3 + k3min); ++j) // sweeps all dims + fw[j] = 0.0; + for (BIGINT k3 = 0; k3 <= k3max; ++k3, pp += 2 * ms * mt) // non-neg z-freqs + // point fk and fw to the start of this z value's plane (2* is for complex): + common::deconvolveshuffle2d(dir, prefac / ker3[k3], ker1, ker2, ms, mt, fk + pp, nf1, + nf2, &fw[np * k3], modeord); + for (BIGINT k3 = k3min; k3 < 0; ++k3, pn += 2 * ms * mt) // neg z-freqs + common::deconvolveshuffle2d(dir, prefac / ker3[-k3], ker1, ker2, ms, mt, fk + pn, nf1, + nf2, &fw[np * (nf3 + k3)], modeord); +} + +// --------- batch helper functions for t1,2 exec: --------------------------- + +template +static int spreadinterpSortedBatch(int batchSize, FINUFFT_PLAN_T *p, + std::complex *cBatch) +/* + Spreads (or interpolates) a batch of batchSize strength vectors in cBatch + to (or from) the batch of fine working grids p->fwBatch, using the same set of + (index-sorted) NU points p->X,Y,Z for each vector in the batch. + The direction (spread vs interpolate) is set by p->spopts.spread_direction. + Returns 0 (no error reporting for now). + Notes: + 1) cBatch is already assumed to have the correct offset, ie here we + read from the start of cBatch (unlike Malleo). fwBatch also has zero offset + 2) this routine is a batched version of spreadinterpSorted in spreadinterp.cpp + Barnett 5/19/20, based on Malleo 2019. +*/ +{ + // opts.spread_thread: 1 sequential multithread, 2 parallel single-thread. + // omp_sets_nested deprecated, so don't use; assume not nested for 2 to work. + // But when nthr_outer=1 here, omp par inside the loop sees all threads... +#ifdef _OPENMP + int nthr_outer = p->opts.spread_thread == 1 ? 1 : batchSize; +#endif +#pragma omp parallel for num_threads(nthr_outer) + for (int i = 0; i < batchSize; i++) { + std::complex *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace + std::complex *ci = cBatch + i * p->nj; // start of i'th c array in cBatch + spreadinterpSorted(p->sortIndices, p->nf1, p->nf2, p->nf3, (T *)fwi, p->nj, p->X, + p->Y, p->Z, (T *)ci, p->spopts, p->didSort); + } + return 0; +} + +template +static int deconvolveBatch(int batchSize, FINUFFT_PLAN_T *p, std::complex *fkBatch) +/* + Type 1: deconvolves (amplifies) from each interior fw array in p->fwBatch + into each output array fk in fkBatch. + Type 2: deconvolves from user-supplied input fk to 0-padded interior fw, + again looping over fk in fkBatch and fw in p->fwBatch. + The direction (spread vs interpolate) is set by p->spopts.spread_direction. + This is mostly a loop calling deconvolveshuffle?d for the needed dim batchSize + times. + Barnett 5/21/20, simplified from Malleo 2019 (eg t3 logic won't be in here) +*/ +{ + // since deconvolveshuffle?d are single-thread, omp par seems to help here... 
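// ------------------------------------------------------------------------
// The two batching strategies opts.spread_thread toggles in
// spreadinterpSortedBatch above, in toy form: (1) vectors sequential, each
// call free to multithread internally; (2) vectors in parallel, each call
// single-threaded. work() is a stand-in for one spread/interp call; this
// sketch assumes an OpenMP-enabled compiler (the pragma is ignored otherwise).
#include <cstdio>

static void work(int i, int nthr_inner) {
  std::printf("vector %d using %d inner thread(s)\n", i, nthr_inner);
}

int main() {
  const int batchSize = 4, spread_thread = 2;            // try 1 vs 2
  int nthr_outer = (spread_thread == 1) ? 1 : batchSize;
#pragma omp parallel for num_threads(nthr_outer)
  for (int i = 0; i < batchSize; i++)
    work(i, (spread_thread == 1) ? batchSize : 1);
}
// ------------------------------------------------------------------------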
+#pragma omp parallel for num_threads(batchSize)
+  for (int i = 0; i < batchSize; i++) {
+    std::complex<T> *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace
+    std::complex<T> *fki = fkBatch + i * p->N;     // start of i'th fk array in fkBatch
+
+    // Call routine from common.cpp for the dim; prefactors hardcoded to 1.0...
+    if (p->dim == 1)
+      deconvolveshuffle1d(p->spopts.spread_direction, T(1), p->phiHat1, p->ms, (T *)fki,
+                          p->nf1, fwi, p->opts.modeord);
+    else if (p->dim == 2)
+      deconvolveshuffle2d(p->spopts.spread_direction, T(1), p->phiHat1, p->phiHat2, p->ms,
+                          p->mt, (T *)fki, p->nf1, p->nf2, fwi, p->opts.modeord);
+    else
+      deconvolveshuffle3d(p->spopts.spread_direction, T(1), p->phiHat1, p->phiHat2,
+                          p->phiHat3, p->ms, p->mt, p->mu, (T *)fki, p->nf1, p->nf2,
+                          p->nf3, fwi, p->opts.modeord);
+  }
+  return 0;
+}
+
+} // namespace common
+} // namespace finufft
+
+// --------------- rest is the 5 user guru (plan) interface drivers: ---------
+// (not namespaced since have safe names finufft{f}_* )
+using namespace finufft::common; // accesses routines defined above
+
+// Marco Barbone: 5.8.2024
+// These are user-facing.
+// The various options could be macros to follow C standard library conventions.
+// Question: would these be enums?
+
+// OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO
+void finufft_default_opts_t(finufft_opts *o)
+// Sets default nufft opts (referenced by all language interfaces too).
+// See finufft_opts.h for meanings.
+// This was created to avoid uncertainty about C++11 style static initialization
+// when called from MEX, but now is generally used. Barnett 10/30/17 onwards.
+// Sphinx sucks the below code block into the web docs, hence keep it clean...
+{
+  // sphinx tag (don't remove): @defopts_start
+  o->modeord = 0;
+  o->chkbnds = 1;
+
+  o->debug = 0;
+  o->spread_debug = 0;
+  o->showwarn = 1;
+
+  o->nthreads = 0;
+#ifdef FINUFFT_USE_DUCC0
+  o->fftw = 0;
+#else
+  o->fftw = FFTW_ESTIMATE;
+#endif
+  o->spread_sort = 2;
+  o->spread_kerevalmeth = 1;
+  o->spread_kerpad = 1;
+  o->upsampfac = 0.0;
+  o->spread_thread = 0;
+  o->maxbatchsize = 0;
+  o->spread_nthr_atomic = -1;
+  o->spread_max_sp_size = 0;
+  o->fftw_lock_fun = nullptr;
+  o->fftw_unlock_fun = nullptr;
+  o->fftw_lock_data = nullptr;
+  // sphinx tag (don't remove): @defopts_end
+}
+
+// PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP
+template<typename TF>
+int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans,
+                       TF tol, FINUFFT_PLAN_T<TF> **pp, finufft_opts *opts)
+// Populates the fields of finufft_plan which is pointed to by "pp".
+// opts is ptr to a finufft_opts to set options, or nullptr to use defaults.
+// For some of the fields (if "auto" selected) here choose the actual setting.
+// For types 1,2 allocates memory for internal working arrays,
+// evaluates spreading kernel coefficients, and instantiates the fftw_plan
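// ------------------------------------------------------------------------
// For orientation, typical use of this plan interface from the caller's
// side (double precision; error checks trimmed). Only the public names
// visible in this diff are used; BIGINT is assumed to be int64_t here.
#include <complex>
#include <cstdint>
#include <vector>
#include <finufft.h>

int demo() {
  const std::int64_t M = 1000, N = 512;
  std::vector<double> x(M, 0.5);                        // NU points in [-pi, pi)
  std::vector<std::complex<double>> c(M, 1.0), f(N);
  finufft_opts opts;
  finufft_default_opts(&opts);
  finufft_plan plan;
  std::int64_t nmodes[3] = {N, 1, 1};
  int ier = finufft_makeplan(1, 1, nmodes, +1, 1, 1e-9, &plan, &opts);
  if (ier > 1) return ier;
  ier = finufft_setpts(plan, M, x.data(), nullptr, nullptr, 0, nullptr,
                       nullptr, nullptr);
  if (ier > 1) return ier;
  ier = finufft_execute(plan, c.data(), f.data());      // type 1: c -> f
  finufft_destroy(plan);
  return ier;
}
// ------------------------------------------------------------------------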
+{
+  FINUFFT_PLAN_T<TF> *p;
+  p = new FINUFFT_PLAN_T<TF>; // allocate fresh plan struct
+  *pp = p;                    // pass out plan as ptr to plan struct
+
+  if (!opts) // use default opts
+    finufft_default_opts_t(&(p->opts));
+  else // or read from what's passed in
+    p->opts = *opts; // keep a deep copy; changing *opts now has no effect
+
+  if (p->opts.debug) // do a hello world
+    printf("[%s] new plan: FINUFFT version " FINUFFT_VER " .................\n",
+           __func__);
+
+  p->fftPlan = std::make_unique<Finufft_FFT_plan<TF>>(
+      p->opts.fftw_lock_fun, p->opts.fftw_unlock_fun, p->opts.fftw_lock_data);
+
+  if ((type != 1) && (type != 2) && (type != 3)) {
+    fprintf(stderr, "[%s] Invalid type (%d), should be 1, 2 or 3.\n", __func__, type);
+    return FINUFFT_ERR_TYPE_NOTVALID;
+  }
+  if ((dim != 1) && (dim != 2) && (dim != 3)) {
+    fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n", __func__, dim);
+    return FINUFFT_ERR_DIM_NOTVALID;
+  }
+  if (ntrans < 1) {
+    fprintf(stderr, "[%s] ntrans (%d) should be at least 1.\n", __func__, ntrans);
+    return FINUFFT_ERR_NTRANS_NOTVALID;
+  }
+  if (!p->opts.fftw_lock_fun != !p->opts.fftw_unlock_fun) {
+    fprintf(stderr, "[%s] fftw_(un)lock functions should be both null or both set\n",
+            __func__);
+    return FINUFFT_ERR_LOCK_FUNS_INVALID;
+  }
+
+  // get stuff from args...
+  p->type = type;
+  p->dim = dim;
+  p->ntrans = ntrans;
+  p->tol = tol;
+  p->fftSign = (iflag >= 0) ? 1 : -1; // clean up flag input
+
+  // choose overall # threads...
+#ifdef _OPENMP
+  int ompmaxnthr = MY_OMP_GET_MAX_THREADS();
+  int nthr = ompmaxnthr; // default: use as many as OMP gives us
+  // (the above could be set, or suggested set, to 1 for small enough problems...)
+  if (p->opts.nthreads > 0) {
+    nthr = p->opts.nthreads; // user override, now without limit
+    if (p->opts.showwarn && (nthr > ompmaxnthr))
+      fprintf(stderr,
+              "%s warning: using opts.nthreads=%d, more than the %d OpenMP claims "
+              "available; note large nthreads can be slower.\n",
+              __func__, nthr, ompmaxnthr);
+  }
+#else
+  int nthr = 1; // always 1 thread (avoid segfault)
+  if (p->opts.nthreads > 1)
+    fprintf(stderr,
+            "%s warning: opts.nthreads=%d but library is single-threaded; ignoring!\n",
+            __func__, p->opts.nthreads);
+#endif
+  p->opts.nthreads = nthr; // store actual # thr planned for
+  // (this sets/limits all downstream spread/interp, 1dkernel, and FFT thread counts...)
+
+  // choose batchSize for types 1,2 or 3... (uses int ceil(b/a)=1+(b-1)/a trick)
+  if (p->opts.maxbatchsize == 0) { // logic to auto-set best batchsize
+    p->nbatch = 1 + (ntrans - 1) / nthr;         // min # batches poss
+    p->batchSize = 1 + (ntrans - 1) / p->nbatch; // then cut # thr in each b
+  } else { // batchSize override by user
+    p->batchSize = min(p->opts.maxbatchsize, ntrans);
+    p->nbatch = 1 + (ntrans - 1) / p->batchSize; // resulting # batches
+  }
+  if (p->opts.spread_thread == 0) p->opts.spread_thread = 2; // our auto choice
+  if (p->opts.spread_thread != 1 && p->opts.spread_thread != 2) {
+    fprintf(stderr, "[%s] illegal opts.spread_thread!\n", __func__);
+    return FINUFFT_ERR_SPREAD_THREAD_NOTVALID;
+  }
+
+  if (type != 3) { // read in user Fourier mode array sizes...
+    p->ms = n_modes[0];
+    p->mt = (dim > 1) ? n_modes[1] : 1; // leave as 1 for unused dims
+    p->mu = (dim > 2) ? n_modes[2] : 1;
+    p->N = p->ms * p->mt * p->mu; // N = total # modes
+  }
+
+  // heuristic to choose default upsampfac...
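// ------------------------------------------------------------------------
// The integer ceil trick used for the auto batch sizing above: for positive
// ints, ceil(b/a) == 1 + (b-1)/a. E.g. ntrans=10 on nthr=8 threads gives
// nbatch=2 batches of batchSize=5 each, better balanced than 8 then 2.
#include <cstdio>

int main() {
  int ntrans = 10, nthr = 8;
  int nbatch = 1 + (ntrans - 1) / nthr;        // ceil(10/8) = 2
  int batchSize = 1 + (ntrans - 1) / nbatch;   // ceil(10/2) = 5
  std::printf("nbatch=%d batchSize=%d\n", nbatch, batchSize);
}
// ------------------------------------------------------------------------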
(currently two poss) + if (p->opts.upsampfac == 0.0) { // indicates auto-choose + p->opts.upsampfac = 2.0; // default, and need for tol small + if (tol >= (TF)1E-9) { // the tol sigma=5/4 can reach + if (type == 3) // could move to setpts, more known? + p->opts.upsampfac = 1.25; // faster b/c smaller RAM & FFT + else if ((dim == 1 && p->N > 10000000) || (dim == 2 && p->N > 300000) || + (dim == 3 && p->N > 3000000)) // type 1,2 heuristic cutoffs, double, + // typ tol, 12-core xeon + p->opts.upsampfac = 1.25; + } + if (p->opts.debug > 1) + printf("[%s] set auto upsampfac=%.2f\n", __func__, p->opts.upsampfac); + } + // use opts to choose and write into plan's spread options... + int ier = setup_spreader_for_nufft(p->spopts, tol, p->opts, dim); + if (ier > 1) // proceed if success or warning + return ier; + + // set others as defaults (or unallocated for arrays)... + p->X = nullptr; + p->Y = nullptr; + p->Z = nullptr; + p->nf1 = 1; + p->nf2 = 1; + p->nf3 = 1; // crucial to leave as 1 for unused dims + + // ------------------------ types 1,2: planning needed --------------------- + if (type == 1 || type == 2) { + + int nthr_fft = nthr; // give FFTW all threads (or use o.spread_thread?) + // Note: batchSize not used since might be only 1. + + p->spopts.spread_direction = type; + + constexpr TF EPSILON = std::numeric_limits::epsilon(); + if (p->opts.showwarn) { // user warn round-off error... + if (EPSILON * p->ms > 1.0) + fprintf(stderr, "%s warning: rounding err predicted eps_mach*N1 = %.3g > 1 !\n", + __func__, (double)(EPSILON * p->ms)); + if (EPSILON * p->mt > 1.0) + fprintf(stderr, "%s warning: rounding err predicted eps_mach*N2 = %.3g > 1 !\n", + __func__, (double)(EPSILON * p->mt)); + if (EPSILON * p->mu > 1.0) + fprintf(stderr, "%s warning: rounding err predicted eps_mach*N3 = %.3g > 1 !\n", + __func__, (double)(EPSILON * p->mu)); + } + + // determine fine grid sizes, sanity check.. + int nfier = set_nf_type12(p->ms, p->opts, p->spopts, &(p->nf1)); + if (nfier) return nfier; // nf too big; we're done + p->phiHat1.resize(p->nf1 / 2 + 1); + if (dim > 1) { + nfier = set_nf_type12(p->mt, p->opts, p->spopts, &(p->nf2)); + if (nfier) return nfier; + p->phiHat2.resize(p->nf2 / 2 + 1); + } + if (dim > 2) { + nfier = set_nf_type12(p->mu, p->opts, p->spopts, &(p->nf3)); + if (nfier) return nfier; + p->phiHat3.resize(p->nf3 / 2 + 1); + } + + if (p->opts.debug) { // "long long" here is to avoid warnings with printf... 
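// ------------------------------------------------------------------------
// The upsampfac auto-choice above as a standalone function, thresholds
// copied verbatim from the heuristic (tuned, per its comment, on a 12-core
// Xeon at typical double-precision tolerances):
#include <cstdio>

static double choose_upsampfac(int type, int dim, double tol, long long N) {
  double sigma = 2.0;                   // default; required for small tol
  if (tol >= 1e-9) {                    // sigma = 5/4 can reach this accuracy
    if (type == 3)
      sigma = 1.25;                     // smaller RAM and FFT
    else if ((dim == 1 && N > 10000000) || (dim == 2 && N > 300000) ||
             (dim == 3 && N > 3000000))
      sigma = 1.25;                     // large type 1,2 problems
  }
  return sigma;
}

int main() {
  std::printf("%.2f\n", choose_upsampfac(1, 2, 1e-6, 1000000)); // 1.25
}
// ------------------------------------------------------------------------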
+ printf("[%s] %dd%d: (ms,mt,mu)=(%lld,%lld,%lld) " + "(nf1,nf2,nf3)=(%lld,%lld,%lld)\n ntrans=%d nthr=%d " + "batchSize=%d ", + __func__, dim, type, (long long)p->ms, (long long)p->mt, (long long)p->mu, + (long long)p->nf1, (long long)p->nf2, (long long)p->nf3, ntrans, nthr, + p->batchSize); + if (p->batchSize == 1) // spread_thread has no effect in this case + printf("\n"); + else + printf(" spread_thread=%d\n", p->opts.spread_thread); + } + + // STEP 0: get Fourier coeffs of spreading kernel along each fine grid dim + CNTime timer; + timer.start(); + onedim_fseries_kernel(p->nf1, p->phiHat1, p->spopts); + if (dim > 1) onedim_fseries_kernel(p->nf2, p->phiHat2, p->spopts); + if (dim > 2) onedim_fseries_kernel(p->nf3, p->phiHat3, p->spopts); + if (p->opts.debug) + printf("[%s] kernel fser (ns=%d):\t\t%.3g s\n", __func__, p->spopts.nspread, + timer.elapsedsec()); + + p->nf = p->nf1 * p->nf2 * p->nf3; // fine grid total number of points + if (p->nf * p->batchSize > MAX_NF) { + fprintf(stderr, + "[%s] fwBatch would be bigger than MAX_NF, not attempting malloc!\n", + __func__); + // FIXME: this error causes memory leaks. We should free phiHat1, phiHat2, phiHat3 + return FINUFFT_ERR_MAXNALLOC; + } + + timer.restart(); + p->fwBatch = p->fftPlan->alloc_complex(p->nf * p->batchSize); // the big workspace + if (p->opts.debug) + printf("[%s] fwBatch %.2fGB alloc: \t%.3g s\n", __func__, + (double)1E-09 * sizeof(std::complex) * p->nf * p->batchSize, + timer.elapsedsec()); + if (!p->fwBatch) { // we don't catch all such mallocs, just this big one + fprintf(stderr, "[%s] FFTW malloc failed for fwBatch (working fine grids)!\n", + __func__); + return FINUFFT_ERR_ALLOC; + } + + timer.restart(); // plan the FFTW + const auto ns = gridsize_for_fft(p); + p->fftPlan->plan(ns, p->batchSize, p->fwBatch, p->fftSign, p->opts.fftw, nthr_fft); + if (p->opts.debug) + printf("[%s] FFT plan (mode %d, nthr=%d):\t%.3g s\n", __func__, p->opts.fftw, + nthr_fft, timer.elapsedsec()); + + } else { // -------------------------- type 3 (no planning) ------------ + + if (p->opts.debug) printf("[%s] %dd%d: ntrans=%d\n", __func__, dim, type, ntrans); + // in case destroy occurs before setpts, need safe dummy ptrs/plans... + p->fwBatch = nullptr; + p->innerT2plan = nullptr; + // Type 3 will call finufft_makeplan for type 2; no need to init FFTW + // Note we don't even know nj or nk yet, so can't do anything else! 
+ } + return ier; // report setup_spreader status (could be warning) +} +template int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, + int iflag, int ntrans, float tol, + FINUFFT_PLAN_T **pp, finufft_opts *opts); +template int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, + int iflag, int ntrans, double tol, + FINUFFT_PLAN_T **pp, finufft_opts *opts); + +// SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS +template +int FINUFFT_PLAN_T::setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF *s, TF *t, + TF *u) { + int d = dim; // abbrev for spatial dim + CNTime timer; + timer.start(); + this->nj = nj; // the user only now chooses how many NU (x,y,z) pts + if (nj < 0) { + fprintf(stderr, "[%s] nj (%lld) cannot be negative!\n", __func__, (long long)nj); + return FINUFFT_ERR_NUM_NU_PTS_INVALID; + } else if (nj > MAX_NU_PTS) { + fprintf(stderr, "[%s] nj (%lld) exceeds MAX_NU_PTS\n", __func__, (long long)nj); + return FINUFFT_ERR_NUM_NU_PTS_INVALID; + } + + if (type != 3) { // ------------------ TYPE 1,2 SETPTS ------------------- + // (all we can do is check and maybe bin-sort the NU pts) + X = xj; // plan must keep pointers to user's fixed NU pts + Y = yj; + Z = zj; + int ier = spreadcheck(nf1, nf2, nf3, nj, xj, yj, zj, spopts); + if (opts.debug > 1) + printf("[%s] spreadcheck (%d):\t%.3g s\n", __func__, spopts.chkbnds, + timer.elapsedsec()); + if (ier) // no warnings allowed here + return ier; + timer.restart(); + sortIndices.resize(nj); + didSort = indexSort(sortIndices, nf1, nf2, nf3, nj, xj, yj, zj, spopts); + if (opts.debug) + printf("[%s] sort (didSort=%d):\t\t%.3g s\n", __func__, didSort, + timer.elapsedsec()); + + } else { // ------------------------- TYPE 3 SETPTS ----------------------- + // (here we can precompute pre/post-phase factors and plan the t2) + + if (nk < 0) { + fprintf(stderr, "[%s] nk (%lld) cannot be negative!\n", __func__, (long long)nk); + return FINUFFT_ERR_NUM_NU_PTS_INVALID; + } else if (nk > MAX_NU_PTS) { + fprintf(stderr, "[%s] nk (%lld) exceeds MAX_NU_PTS\n", __func__, (long long)nk); + return FINUFFT_ERR_NUM_NU_PTS_INVALID; + } + this->nk = nk; // user set # targ freq pts + S = s; // keep pointers to user's input target pts + T = t; + U = u; + + // pick x, s intervals & shifts & # fine grid pts (nf) in each dim... + TF S1, S2, S3; // get half-width X, center C, which contains {x_j}... + arraywidcen(nj, xj, &(t3P.X1), &(t3P.C1)); + arraywidcen(nk, s, &S1, &(t3P.D1)); // same D, S, but for {s_k} + set_nhg_type3(S1, t3P.X1, opts, spopts, &(nf1), &(t3P.h1), + &(t3P.gam1)); // applies twist i) + t3P.C2 = 0.0; // their defaults if dim 2 unused, etc + t3P.D2 = 0.0; + if (d > 1) { + arraywidcen(nj, yj, &(t3P.X2), &(t3P.C2)); // {y_j} + arraywidcen(nk, t, &S2, &(t3P.D2)); // {t_k} + set_nhg_type3(S2, t3P.X2, opts, spopts, &(nf2), &(t3P.h2), &(t3P.gam2)); + } + t3P.C3 = 0.0; + t3P.D3 = 0.0; + if (d > 2) { + arraywidcen(nj, zj, &(t3P.X3), &(t3P.C3)); // {z_j} + arraywidcen(nk, u, &S3, &(t3P.D3)); // {u_k} + set_nhg_type3(S3, t3P.X3, opts, spopts, &(nf3), &(t3P.h3), &(t3P.gam3)); + } + + if (opts.debug) { // report on choices of shifts, centers, etc... 
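// ------------------------------------------------------------------------
// What arraywidcen computes for the type-3 rescaling above: the half-width X
// and center C of a point set, so that x' = (x - C)/gam lands in the
// spreader's standard domain. Minimal stand-in only; the library's version
// also handles empty input and applies extra safeguards.
#include <algorithm>
#include <cstdio>
#include <vector>

static void widcen_sketch(const std::vector<double> &a, double *w, double *c) {
  auto [lo, hi] = std::minmax_element(a.begin(), a.end());
  *w = (*hi - *lo) / 2;   // half-width
  *c = (*hi + *lo) / 2;   // center
}

int main() {
  std::vector<double> x = {3.0, 5.0, 10.0};
  double X, C;
  widcen_sketch(x, &X, &C);
  std::printf("X = %g, C = %g\n", X, C);   // X = 3.5, C = 6.5
}
// ------------------------------------------------------------------------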
+ printf("\tM=%lld N=%lld\n", (long long)nj, (long long)nk); + printf("\tX1=%.3g C1=%.3g S1=%.3g D1=%.3g gam1=%g nf1=%lld h1=%.3g\t\n", t3P.X1, + t3P.C1, S1, t3P.D1, t3P.gam1, (long long)nf1, t3P.h1); + if (d > 1) + printf("\tX2=%.3g C2=%.3g S2=%.3g D2=%.3g gam2=%g nf2=%lld h2=%.3g\n", t3P.X2, + t3P.C2, S2, t3P.D2, t3P.gam2, (long long)nf2, t3P.h2); + if (d > 2) + printf("\tX3=%.3g C3=%.3g S3=%.3g D3=%.3g gam3=%g nf3=%lld h3=%.3g\n", t3P.X3, + t3P.C3, S3, t3P.D3, t3P.gam3, (long long)nf3, t3P.h3); + } + nf = nf1 * nf2 * nf3; // fine grid total number of points + if (nf * batchSize > MAX_NF) { + fprintf(stderr, + "[%s t3] fwBatch would be bigger than MAX_NF, not attempting malloc!\n", + __func__); + return FINUFFT_ERR_MAXNALLOC; + } + fftPlan->free(fwBatch); + fwBatch = fftPlan->alloc_complex(nf * batchSize); // maybe big workspace + + CpBatch.resize(nj * batchSize); // batch c' work + + if (opts.debug) + printf("[%s t3] widcen, batch %.2fGB alloc:\t%.3g s\n", __func__, + (double)1E-09 * sizeof(std::complex) * (nf + nj) * batchSize, + timer.elapsedsec()); + if (!fwBatch) { + fprintf(stderr, "[%s t3] malloc fail for fwBatch or CpBatch!\n", __func__); + return FINUFFT_ERR_ALLOC; + } + // printf("fwbatch, cpbatch ptrs: %llx %llx\n",fwBatch,CpBatch); + + // alloc rescaled NU src pts x'_j (in X etc), rescaled NU targ pts s'_k ... + // FIXME: should use realloc + if (X) free(X); + X = (TF *)malloc(sizeof(TF) * nj); + Sp.resize(nk); + if (d > 1) { + if (Y) free(Y); + Y = (TF *)malloc(sizeof(TF) * nj); + Tp.resize(nk); + } + if (d > 2) { + if (Z) free(Z); + Z = (TF *)malloc(sizeof(TF) * nj); + Up.resize(nk); + } + + // always shift as use gam to rescale x_j to x'_j, etc (twist iii)... + TF ig1 = 1.0 / t3P.gam1, ig2 = 0.0, ig3 = 0.0; // "reciprocal-math" optim + if (d > 1) ig2 = 1.0 / t3P.gam2; + if (d > 2) ig3 = 1.0 / t3P.gam3; +#pragma omp parallel for num_threads(opts.nthreads) schedule(static) + for (BIGINT j = 0; j < nj; ++j) { + X[j] = (xj[j] - t3P.C1) * ig1; // rescale x_j + if (d > 1) // (ok to do inside loop because of branch predict) + Y[j] = (yj[j] - t3P.C2) * ig2; // rescale y_j + if (d > 2) Z[j] = (zj[j] - t3P.C3) * ig3; // rescale z_j + } + + // set up prephase array... + std::complex imasign = + (fftSign >= 0) ? std::complex(0, 1) : std::complex(0, -1); // +-i + prephase.resize(nj); + if (t3P.D1 != 0.0 || t3P.D2 != 0.0 || t3P.D3 != 0.0) { +#pragma omp parallel for num_threads(opts.nthreads) schedule(static) + for (BIGINT j = 0; j < nj; ++j) { // ... loop over src NU locs + TF phase = t3P.D1 * xj[j]; + if (d > 1) phase += t3P.D2 * yj[j]; + if (d > 2) phase += t3P.D3 * zj[j]; + prephase[j] = cos(phase) + imasign * sin(phase); // Euler + // e^{+-i.phase} + } + } else + for (BIGINT j = 0; j < nj; ++j) + prephase[j] = (std::complex)1.0; // *** or keep flag so no mult in exec?? + + // rescale the target s_k etc to s'_k etc... +#pragma omp parallel for num_threads(opts.nthreads) schedule(static) + for (BIGINT k = 0; k < nk; ++k) { + Sp[k] = t3P.h1 * t3P.gam1 * (s[k] - t3P.D1); // so |s'_k| < pi/R + if (d > 1) + Tp[k] = t3P.h2 * t3P.gam2 * (t[k] - t3P.D2); // so |t'_k| < + // pi/R + if (d > 2) + Up[k] = t3P.h3 * t3P.gam3 * (u[k] - t3P.D3); // so |u'_k| < + // pi/R + } + // (old STEP 3a) Compute deconvolution post-factors array (per targ pt)... 
+ // (exploits that FT separates because kernel is prod of 1D funcs) + deconv.resize(nk); + std::vector phiHatk1(nk); // don't confuse w/ phiHat + onedim_nuft_kernel(nk, Sp, phiHatk1, spopts); // fill phiHat1 + std::vector phiHatk2, phiHatk3; + if (d > 1) { + phiHatk2.resize(nk); + onedim_nuft_kernel(nk, Tp, phiHatk2, spopts); // fill phiHat2 + } + if (d > 2) { + phiHatk3.resize(nk); + onedim_nuft_kernel(nk, Up, phiHatk3, spopts); // fill phiHat3 + } + int Cfinite = isfinite(t3P.C1) && isfinite(t3P.C2) && isfinite(t3P.C3); // C can be + // nan or inf + // if M=0, no + // input NU + // pts + int Cnonzero = t3P.C1 != 0.0 || t3P.C2 != 0.0 || t3P.C3 != 0.0; // cen +#pragma omp parallel for num_threads(opts.nthreads) schedule(static) + for (BIGINT k = 0; k < nk; ++k) { // .... loop over NU targ freqs + TF phiHat = phiHatk1[k]; + if (d > 1) phiHat *= phiHatk2[k]; + if (d > 2) phiHat *= phiHatk3[k]; + deconv[k] = (std::complex)(1.0 / phiHat); + if (Cfinite && Cnonzero) { + TF phase = (s[k] - t3P.D1) * t3P.C1; + if (d > 1) phase += (t[k] - t3P.D2) * t3P.C2; + if (d > 2) phase += (u[k] - t3P.D3) * t3P.C3; + deconv[k] *= cos(phase) + imasign * sin(phase); // Euler e^{+-i.phase} + } + } + if (opts.debug) + printf("[%s t3] phase & deconv factors:\t%.3g s\n", __func__, timer.elapsedsec()); + + // Set up sort for spreading Cp (from primed NU src pts X, Y, Z) to fw... + timer.restart(); + sortIndices.resize(nj); + didSort = indexSort(sortIndices, nf1, nf2, nf3, nj, X, Y, Z, spopts); + if (opts.debug) + printf("[%s t3] sort (didSort=%d):\t\t%.3g s\n", __func__, didSort, + timer.elapsedsec()); + + // Plan and setpts once, for the (repeated) inner type 2 finufft call... + timer.restart(); + BIGINT t2nmodes[] = {nf1, nf2, nf3}; // t2 input is actually fw + finufft_opts t2opts = opts; // deep copy, since not ptrs + t2opts.modeord = 0; // needed for correct t3! + t2opts.debug = max(0, opts.debug - 1); // don't print as much detail + t2opts.spread_debug = max(0, opts.spread_debug - 1); + t2opts.showwarn = 0; // so don't see warnings 2x + // (...could vary other t2opts here?) + if (innerT2plan) { + delete innerT2plan; + innerT2plan = nullptr; + } + int ier = finufft_makeplan_t(2, d, t2nmodes, fftSign, batchSize, tol, + &innerT2plan, &t2opts); + if (ier > 1) { // if merely warning, still proceed + fprintf(stderr, "[%s t3]: inner type 2 plan creation failed with ier=%d!\n", + __func__, ier); + return ier; + } + ier = finufft_setpts_t(innerT2plan, nk, Sp.data(), Tp.data(), Up.data(), 0, + nullptr, nullptr, + nullptr); // note nk = # output points (not nj) + if (ier > 1) { + fprintf(stderr, "[%s t3]: inner type 2 setpts failed, ier=%d!\n", __func__, ier); + return ier; + } + if (opts.debug) + printf("[%s t3] inner t2 plan & setpts: \t%.3g s\n", __func__, timer.elapsedsec()); + } + return 0; +} +template +int finufft_setpts_t(FINUFFT_PLAN_T *p, BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, + TF *s, TF *t, TF *u) +/* For type 1,2: just checks and (possibly) sorts the NU xyz points, in prep for + spreading. (The last 4 arguments are ignored.) + For type 3: allocates internal working arrays, scales/centers the NU points + and NU target freqs (stu), evaluates spreading kernel FT at all target freqs. 
+*/ +{ + return p->setpts(nj, xj, yj, zj, nk, s, t, u); +} +template int finufft_setpts_t(FINUFFT_PLAN_T *p, BIGINT nj, float *xj, + float *yj, float *zj, BIGINT nk, float *s, float *t, + float *u); +template int finufft_setpts_t(FINUFFT_PLAN_T *p, BIGINT nj, double *xj, + double *yj, double *zj, BIGINT nk, double *s, + double *t, double *u); + +// ............ end setpts .................................................. + +// EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE +template +int FINUFFT_PLAN_T::execute(std::complex *cj, std::complex *fk) { + /* See ../docs/cguru.doc for current documentation. + + For given (stack of) weights cj or coefficients fk, performs NUFFTs with + existing (sorted) NU pts and existing plan. + For type 1 and 3: cj is input, fk is output. + For type 2: fk is input, cj is output. + Performs spread/interp, pre/post deconvolve, and FFT as appropriate + for each of the 3 types. + For cases of ntrans>1, performs work in blocks of size up to batchSize. + Return value 0 (no error diagnosis yet). + Barnett 5/20/20, based on Malleo 2019. +*/ + CNTime timer; + timer.start(); + + if (type != 3) { // --------------------- TYPE 1,2 EXEC ------------------ + + double t_sprint = 0.0, t_fft = 0.0, t_deconv = 0.0; // accumulated timing + if (opts.debug) + printf("[%s] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, ntrans, nbatch, + batchSize); + + for (int b = 0; b * batchSize < ntrans; b++) { // .....loop b over batches + + // current batch is either batchSize, or possibly truncated if last one + int thisBatchSize = min(ntrans - b * batchSize, batchSize); + int bB = b * batchSize; // index of vector, since batchsizes same + std::complex *cjb = cj + bB * nj; // point to batch of weights + std::complex *fkb = fk + bB * N; // point to batch of mode coeffs + if (opts.debug > 1) + printf("[%s] start batch %d (size %d):\n", __func__, b, thisBatchSize); + + // STEP 1: (varies by type) + timer.restart(); + if (type == 1) { // type 1: spread NU pts X, weights cj, to fw grid + spreadinterpSortedBatch(thisBatchSize, this, cjb); + t_sprint += timer.elapsedsec(); + } else { // type 2: amplify Fourier coeffs fk into 0-padded fw + deconvolveBatch(thisBatchSize, this, fkb); + t_deconv += timer.elapsedsec(); + } + + // STEP 2: call the FFT on this batch + timer.restart(); + do_fft(this); + t_fft += timer.elapsedsec(); + if (opts.debug > 1) printf("\tFFT exec:\t\t%.3g s\n", timer.elapsedsec()); + + // STEP 3: (varies by type) + timer.restart(); + if (type == 1) { // type 1: deconvolve (amplify) fw and shuffle to fk + deconvolveBatch(thisBatchSize, this, fkb); + t_deconv += timer.elapsedsec(); + } else { // type 2: interpolate unif fw grid to NU target pts + spreadinterpSortedBatch(thisBatchSize, this, cjb); + t_sprint += timer.elapsedsec(); + } + } // ........end b loop + + if (opts.debug) { // report total times in their natural order... + if (type == 1) { + printf("[%s] done. tot spread:\t\t%.3g s\n", __func__, t_sprint); + printf(" tot FFT:\t\t\t\t%.3g s\n", t_fft); + printf(" tot deconvolve:\t\t\t%.3g s\n", t_deconv); + } else { + printf("[%s] done. 
tot deconvolve:\t\t%.3g s\n", __func__, t_deconv); + printf(" tot FFT:\t\t\t\t%.3g s\n", t_fft); + printf(" tot interp:\t\t\t%.3g s\n", t_sprint); + } + } + } + + else { // ----------------------------- TYPE 3 EXEC --------------------- + + // for (BIGINT j=0;j<10;++j) printf("\tcj[%ld]=%.15g+%.15gi\n",(long + // int)j,(double)real(cj[j]),(double)imag(cj[j])); // debug + + double t_pre = 0.0, t_spr = 0.0, t_t2 = 0.0, + t_deconv = 0.0; // accumulated timings + if (opts.debug) + printf("[%s t3] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, ntrans, + nbatch, batchSize); + + for (int b = 0; b * batchSize < ntrans; b++) { // .....loop b over batches + + // batching and pointers to this batch, identical to t1,2 above... + int thisBatchSize = min(ntrans - b * batchSize, batchSize); + int bB = b * batchSize; + std::complex *cjb = cj + bB * nj; // batch of input strengths + std::complex *fkb = fk + bB * nk; // batch of output strengths + if (opts.debug > 1) + printf("[%s t3] start batch %d (size %d):\n", __func__, b, thisBatchSize); + + // STEP 0: pre-phase (possibly) the c_j input strengths into c'_j batch... + timer.restart(); +#pragma omp parallel for num_threads(opts.nthreads) // or batchSize? + for (int i = 0; i < thisBatchSize; i++) { + BIGINT ioff = i * nj; + for (BIGINT j = 0; j < nj; ++j) { + CpBatch[ioff + j] = prephase[j] * cjb[ioff + j]; + } + } + t_pre += timer.elapsedsec(); + + // STEP 1: spread c'_j batch (x'_j NU pts) into fw batch grid... + timer.restart(); + spopts.spread_direction = 1; // spread + spreadinterpSortedBatch(thisBatchSize, this, CpBatch.data()); // X are primed + t_spr += timer.elapsedsec(); + + // STEP 2: type 2 NUFFT from fw batch to user output fk array batch... + timer.restart(); + // illegal possible shrink of ntrans *after* plan for smaller last batch: + innerT2plan->ntrans = thisBatchSize; // do not try this at home! + /* (alarming that FFT not shrunk, but safe, because t2's fwBatch array + still the same size, as Andrea explained; just wastes a few flops) */ + finufft_execute_t(innerT2plan, fkb, fwBatch); + t_t2 += timer.elapsedsec(); + // STEP 3: apply deconvolve (precomputed 1/phiHat(targ_k), phasing too)... + timer.restart(); +#pragma omp parallel for num_threads(opts.nthreads) + for (int i = 0; i < thisBatchSize; i++) { + BIGINT ioff = i * nk; + for (BIGINT k = 0; k < nk; ++k) fkb[ioff + k] *= deconv[k]; + } + t_deconv += timer.elapsedsec(); + } // ........end b loop + + if (opts.debug) { // report total times in their natural order... + printf("[%s t3] done. tot prephase:\t\t%.3g s\n", __func__, t_pre); + printf(" tot spread:\t\t\t%.3g s\n", t_spr); + printf(" tot type 2:\t\t\t%.3g s\n", t_t2); + printf(" tot deconvolve:\t\t%.3g s\n", t_deconv); + } + } + // for (BIGINT k=0;k<10;++k) printf("\tfk[%ld]=%.15g+%.15gi\n",(long + // int)k,(double)real(fk[k]),(double)imag(fk[k])); // debug + + return 0; +} +template +int finufft_execute_t(FINUFFT_PLAN_T *p, std::complex *cj, std::complex *fk) { + /* See ../docs/cguru.doc for current documentation. + + For given (stack of) weights cj or coefficients fk, performs NUFFTs with + existing (sorted) NU pts and existing plan. + For type 1 and 3: cj is input, fk is output. + For type 2: fk is input, cj is output. + Performs spread/interp, pre/post deconvolve, and FFT as appropriate + for each of the 3 types. + For cases of ntrans>1, performs work in blocks of size up to batchSize. + Return value 0 (no error diagnosis yet). + Barnett 5/20/20, based on Malleo 2019. 
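// ------------------------------------------------------------------------
// Shape of the batched type 1 pipeline implemented in execute() above,
// reduced to stand-in calls (real steps: spreadinterpSortedBatch, the FFT,
// deconvolveBatch; type 2 runs the same loop with steps 1 and 3 swapped):
#include <algorithm>
#include <cstdio>

static void spread(int n) { std::printf("spread %d\n", n); }
static void run_fft(int n) { std::printf("fft %d\n", n); }
static void deconv(int n) { std::printf("deconvolve %d\n", n); }

int main() {
  const int ntrans = 10, batchSize = 4;          // -> batches of 4, 4, 2
  for (int b = 0; b * batchSize < ntrans; b++) {
    int thisBatchSize = std::min(ntrans - b * batchSize, batchSize);
    spread(thisBatchSize);    // step 1: NU points -> fine grid
    run_fft(thisBatchSize);   // step 2: FFT the fine grids
    deconv(thisBatchSize);    // step 3: amplify & shuffle into modes
  }
}
// ------------------------------------------------------------------------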
+*/ + return p->execute(cj, fk); +} +template int finufft_execute_t(FINUFFT_PLAN_T *p, std::complex *cj, + std::complex *fk); +template int finufft_execute_t( + FINUFFT_PLAN_T *p, std::complex *cj, std::complex *fk); + +// DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD +template FINUFFT_PLAN_T::~FINUFFT_PLAN_T() { + // Free everything we allocated inside of finufft_plan pointed to by p. + // Also must not crash if called immediately after finufft_makeplan. + // Thus either each thing free'd here is guaranteed to be nullptr or correctly + // allocated. + if (fftPlan) fftPlan->free(fwBatch); // free the big FFTW (or t3 spread) working array + if (type == 3) { + delete innerT2plan; + innerT2plan = nullptr; + free(X); + free(Y); + free(Z); + } +} +template FINUFFT_PLAN_T::~FINUFFT_PLAN_T(); +template FINUFFT_PLAN_T::~FINUFFT_PLAN_T(); diff --git a/src/simpleinterfaces.cpp b/src/simpleinterfaces.cpp index edd25adfb..4b3630d93 100644 --- a/src/simpleinterfaces.cpp +++ b/src/simpleinterfaces.cpp @@ -1,8 +1,10 @@ // public header #include // private headers +#include #include -#include +#include // (must come after complex.h) + using namespace std; /* --------------------------------------------------------------------------- @@ -18,41 +20,103 @@ using namespace std; --------------------------------------------------------------------------- */ +void finufft_default_opts(finufft_opts *o) { finufft_default_opts_t(o); } +void finufftf_default_opts(finufft_opts *o) { finufft_default_opts_t(o); } + +int finufft_makeplan(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans, + double tol, finufft_plan *pp, finufft_opts *opts) { + return finufft_makeplan_t(type, dim, n_modes, iflag, ntrans, tol, + reinterpret_cast **>(pp), + opts); +} +int finufftf_makeplan(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans, + float tol, finufftf_plan *pp, finufft_opts *opts) { + return finufft_makeplan_t(type, dim, n_modes, iflag, ntrans, tol, + reinterpret_cast **>(pp), opts); +} + +int finufft_setpts(finufft_plan p, BIGINT nj, double *xj, double *yj, double *zj, + BIGINT nk, double *s, double *t, double *u) { + return finufft_setpts_t(reinterpret_cast *>(p), nj, xj, + yj, zj, nk, s, t, u); +} +int finufftf_setpts(finufftf_plan p, BIGINT nj, float *xj, float *yj, float *zj, + BIGINT nk, float *s, float *t, float *u) { + return finufft_setpts_t(reinterpret_cast *>(p), nj, xj, yj, + zj, nk, s, t, u); +} + +int finufft_execute(finufft_plan p, std::complex *cj, std::complex *fk) { + return finufft_execute_t(reinterpret_cast *>(p), cj, fk); +} +int finufftf_execute(finufftf_plan p, std::complex *cj, std::complex *fk) { + return finufft_execute_t(reinterpret_cast *>(p), cj, fk); +} + +int finufft_destroy(finufft_plan p) +// Free everything we allocated inside of finufft_plan pointed to by p. +// Also must not crash if called immediately after finufft_makeplan. +// Thus either each thing free'd here is guaranteed to be nullptr or correctly +// allocated. +{ + if (!p) // nullptr ptr, so not a ptr to a plan, report error + return 1; + + delete reinterpret_cast *>(p); + p = nullptr; + return 0; // success +} +int finufftf_destroy(finufftf_plan p) +// Free everything we allocated inside of finufft_plan pointed to by p. +// Also must not crash if called immediately after finufft_makeplan. +// Thus either each thing free'd here is guaranteed to be nullptr or correctly +// allocated. 
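// ------------------------------------------------------------------------
// The opaque-handle idiom the wrappers above implement: the public C type is
// a raw pointer typedef, reinterpret_cast to the templated internal plan at
// the ABI boundary. Toy version of the same pattern (names are invented):
#include <cstdio>

template<typename T> struct plan_impl { T tol; };   // hidden internal type
typedef void *demo_plan;                            // public opaque handle

static int demo_makeplan(double tol, demo_plan *pp) {
  *pp = new plan_impl<double>{tol};
  return 0;
}
static int demo_destroy(demo_plan p) {
  if (!p) return 1;                                 // matches destroy's contract
  delete reinterpret_cast<plan_impl<double> *>(p);
  return 0;
}

int main() {
  demo_plan p = nullptr;
  demo_makeplan(1e-9, &p);
  std::printf("destroy -> %d\n", demo_destroy(p));             // 0
  std::printf("destroy(null) -> %d\n", demo_destroy(nullptr)); // 1
}
// ------------------------------------------------------------------------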
+{ + if (!p) // nullptr ptr, so not a ptr to a plan, report error + return 1; + + delete reinterpret_cast *>(p); + p = nullptr; + return 0; // success +} // Helper layer ........................................................... namespace finufft { namespace common { -int invokeGuruInterface(int n_dims, int type, int n_transf, BIGINT nj, FLT *xj, FLT *yj, - FLT *zj, CPX *cj, int iflag, FLT eps, BIGINT *n_modes, BIGINT nk, - FLT *s, FLT *t, FLT *u, CPX *fk, finufft_opts *popts) +template +static int invokeGuruInterface(int n_dims, int type, int n_transf, BIGINT nj, T *xj, + T *yj, T *zj, std::complex *cj, int iflag, T eps, + const std::array &n_modes, BIGINT nk, T *s, + T *t, T *u, std::complex *fk, finufft_opts *popts) // Helper layer between simple interfaces (with opts) and the guru functions. // Author: Andrea Malleo, 2019. { - FINUFFT_PLAN plan; - int ier = FINUFFT_MAKEPLAN(type, n_dims, n_modes, iflag, n_transf, eps, &plan, - popts); // popts (ptr to opts) can be NULL - if (ier > 1) { // since 1 (a warning) still allows proceeding... + FINUFFT_PLAN_T *plan = nullptr; + int ier = + finufft_makeplan_t(type, n_dims, n_modes.data(), iflag, n_transf, eps, &plan, + popts); // popts (ptr to opts) can be nullptr + if (ier > 1) { // since 1 (a warning) still allows proceeding... fprintf(stderr, "FINUFFT invokeGuru: plan error (ier=%d)!\n", ier); delete plan; return ier; } - int ier2 = FINUFFT_SETPTS(plan, nj, xj, yj, zj, nk, s, t, u); + int ier2 = finufft_setpts_t(plan, nj, xj, yj, zj, nk, s, t, u); if (ier2 > 1) { fprintf(stderr, "FINUFFT invokeGuru: setpts error (ier=%d)!\n", ier2); - FINUFFT_DESTROY(plan); + delete plan; return ier2; } - int ier3 = FINUFFT_EXECUTE(plan, cj, fk); + int ier3 = finufft_execute_t(plan, cj, fk); if (ier3 > 1) { fprintf(stderr, "FINUFFT invokeGuru: execute error (ier=%d)!\n", ier3); - FINUFFT_DESTROY(plan); + delete plan; return ier3; } - FINUFFT_DESTROY(plan); + delete plan; return max(max(ier, ier2), ier3); // in case any one gave a (positive!) warning } @@ -63,229 +127,287 @@ using namespace finufft::common; // Dimension 1111111111111111111111111111111111111111111111111111111111111111 -int FINUFFT1D1(BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, BIGINT ms, CPX *fk, - finufft_opts *opts) -// Type-1 1D complex nonuniform FFT. See ../docs/usage.rst +int finufft1d1many(int n_transf, BIGINT nj, double *xj, std::complex *cj, + int iflag, double eps, BIGINT ms, std::complex *fk, + finufft_opts *opts) +// Type-1 1D complex nonuniform FFT for many vectors. See ../docs/usage.rst { - BIGINT n_modes[] = {ms, 1, 1}; - int n_dims = 1; - int n_transf = 1; - int type = 1; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag, - eps, n_modes, 0, NULL, NULL, NULL, fk, opts); - return ier; + return invokeGuruInterface(1, 1, n_transf, nj, xj, nullptr, nullptr, cj, iflag, + eps, {ms, 1, 1}, 0, nullptr, nullptr, nullptr, fk, + opts); } - -int FINUFFT1D1MANY(int n_transf, BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, - BIGINT ms, CPX *fk, finufft_opts *opts) +int finufftf1d1many(int n_transf, BIGINT nj, float *xj, std::complex *cj, + int iflag, float eps, BIGINT ms, std::complex *fk, + finufft_opts *opts) // Type-1 1D complex nonuniform FFT for many vectors. 
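// ------------------------------------------------------------------------
// The status convention invokeGuruInterface above relies on: 0 = success,
// 1 = warning (safe to proceed), >1 = hard error (abort). Returning the max
// of the three stage codes preserves a warning from any stage:
#include <algorithm>
#include <cstdio>

int main() {
  int ier = 1, ier2 = 0, ier3 = 0;            // stage 1 warned; others clean
  int out = std::max(std::max(ier, ier2), ier3);
  std::printf("returned ier = %d\n", out);    // 1: the warning survives
}
// ------------------------------------------------------------------------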
See ../docs/usage.rst { - BIGINT n_modes[] = {ms, 1, 1}; - int n_dims = 1; - int type = 1; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag, - eps, n_modes, 0, NULL, NULL, NULL, fk, opts); - return ier; + return invokeGuruInterface(1, 1, n_transf, nj, xj, nullptr, nullptr, cj, iflag, + eps, {ms, 1, 1}, 0, nullptr, nullptr, nullptr, fk, + opts); } -int FINUFFT1D2(BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, BIGINT ms, CPX *fk, - finufft_opts *opts) -// Type-2 1D complex nonuniform FFT. See ../docs/usage.rst +int finufft1d1(BIGINT nj, double *xj, std::complex *cj, int iflag, double eps, + BIGINT ms, std::complex *fk, finufft_opts *opts) +// Type-1 1D complex nonuniform FFT. See ../docs/usage.rst { - BIGINT n_modes[] = {ms, 1, 1}; - int n_dims = 1; - int n_transf = 1; - int type = 2; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag, - eps, n_modes, 0, NULL, NULL, NULL, fk, opts); - return ier; + return finufft1d1many(1, nj, xj, cj, iflag, eps, ms, fk, opts); +} +int finufftf1d1(BIGINT nj, float *xj, std::complex *cj, int iflag, float eps, + BIGINT ms, std::complex *fk, finufft_opts *opts) +// Type-1 1D complex nonuniform FFT. See ../docs/usage.rst +{ + return finufftf1d1many(1, nj, xj, cj, iflag, eps, ms, fk, opts); } -int FINUFFT1D2MANY(int n_transf, BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, - BIGINT ms, CPX *fk, finufft_opts *opts) +int finufft1d2many(int n_transf, BIGINT nj, double *xj, std::complex *cj, + int iflag, double eps, BIGINT ms, std::complex *fk, + finufft_opts *opts) +// Type-2 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst +{ + return invokeGuruInterface(1, 2, n_transf, nj, xj, nullptr, nullptr, cj, iflag, + eps, {ms, 1, 1}, 0, nullptr, nullptr, nullptr, fk, + opts); +} +int finufftf1d2many(int n_transf, BIGINT nj, float *xj, std::complex *cj, + int iflag, float eps, BIGINT ms, std::complex *fk, + finufft_opts *opts) // Type-2 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - BIGINT n_modes[] = {ms, 1, 1}; - int n_dims = 1; - int type = 2; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag, - eps, n_modes, 0, NULL, NULL, NULL, fk, opts); - return ier; + return invokeGuruInterface(1, 2, n_transf, nj, xj, nullptr, nullptr, cj, iflag, + eps, {ms, 1, 1}, 0, nullptr, nullptr, nullptr, fk, + opts); } -int FINUFFT1D3(BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, BIGINT nk, FLT *s, - CPX *fk, finufft_opts *opts) -// Type-3 1D complex nonuniform FFT. See ../docs/usage.rst +int finufft1d2(BIGINT nj, double *xj, std::complex *cj, int iflag, double eps, + BIGINT ms, std::complex *fk, finufft_opts *opts) +// Type-2 1D complex nonuniform FFT. See ../docs/usage.rst { - int n_dims = 1; - int n_transf = 1; - int type = 3; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag, - eps, NULL, nk, s, NULL, NULL, fk, opts); - return ier; + return finufft1d2many(1, nj, xj, cj, iflag, eps, ms, fk, opts); +} +int finufftf1d2(BIGINT nj, float *xj, std::complex *cj, int iflag, float eps, + BIGINT ms, std::complex *fk, finufft_opts *opts) +// Type-2 1D complex nonuniform FFT. 
See ../docs/usage.rst +{ + return finufftf1d2many(1, nj, xj, cj, iflag, eps, ms, fk, opts); } -int FINUFFT1D3MANY(int n_transf, BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, - BIGINT nk, FLT *s, CPX *fk, finufft_opts *opts) +int finufft1d3many(int n_transf, BIGINT nj, double *xj, std::complex *cj, + int iflag, double eps, BIGINT nk, double *s, std::complex *fk, + finufft_opts *opts) // Type-3 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - int n_dims = 1; - int type = 3; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag, - eps, NULL, nk, s, NULL, NULL, fk, opts); - return ier; + return invokeGuruInterface(1, 3, n_transf, nj, xj, nullptr, nullptr, cj, iflag, + eps, {0, 0, 0}, nk, s, nullptr, nullptr, fk, opts); +} +int finufftf1d3many(int n_transf, BIGINT nj, float *xj, std::complex *cj, + int iflag, float eps, BIGINT nk, float *s, std::complex *fk, + finufft_opts *opts) +// Type-3 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst +{ + return invokeGuruInterface(1, 3, n_transf, nj, xj, nullptr, nullptr, cj, iflag, + eps, {0, 0, 0}, nk, s, nullptr, nullptr, fk, opts); +} +int finufft1d3(BIGINT nj, double *xj, std::complex *cj, int iflag, double eps, + BIGINT nk, double *s, std::complex *fk, finufft_opts *opts) +// Type-3 1D complex nonuniform FFT. See ../docs/usage.rst +{ + return finufft1d3many(1, nj, xj, cj, iflag, eps, nk, s, fk, opts); +} +int finufftf1d3(BIGINT nj, float *xj, std::complex *cj, int iflag, float eps, + BIGINT nk, float *s, std::complex *fk, finufft_opts *opts) +// Type-3 1D complex nonuniform FFT. See ../docs/usage.rst +{ + return finufftf1d3many(1, nj, xj, cj, iflag, eps, nk, s, fk, opts); } // Dimension 22222222222222222222222222222222222222222222222222222222222222222 -int FINUFFT2D1(BIGINT nj, FLT *xj, FLT *yj, CPX *cj, int iflag, FLT eps, BIGINT ms, - BIGINT mt, CPX *fk, finufft_opts *opts) -// Type-1 2D complex nonuniform FFT. See ../docs/usage.rst +int finufft2d1many(int n_transf, BIGINT nj, double *xj, double *yj, + std::complex *c, int iflag, double eps, BIGINT ms, BIGINT mt, + std::complex *fk, finufft_opts *opts) +// Type-1 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - BIGINT n_modes[] = {ms, mt, 1}; - int n_dims = 2; - int n_transf = 1; - int type = 1; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj, iflag, eps, - n_modes, 0, NULL, NULL, NULL, fk, opts); - return ier; + return invokeGuruInterface(2, 1, n_transf, nj, xj, yj, nullptr, c, iflag, eps, + {ms, mt, 1}, 0, nullptr, nullptr, nullptr, fk, opts); } - -int FINUFFT2D1MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, CPX *c, int iflag, FLT eps, - BIGINT ms, BIGINT mt, CPX *fk, finufft_opts *opts) +int finufftf2d1many(int n_transf, BIGINT nj, float *xj, float *yj, std::complex *c, + int iflag, float eps, BIGINT ms, BIGINT mt, std::complex *fk, + finufft_opts *opts) // Type-1 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - BIGINT n_modes[] = {ms, mt, 1}; - int n_dims = 2; - int type = 1; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, c, iflag, eps, - n_modes, 0, NULL, NULL, NULL, fk, opts); - return ier; + return invokeGuruInterface(2, 1, n_transf, nj, xj, yj, nullptr, c, iflag, eps, + {ms, mt, 1}, 0, nullptr, nullptr, nullptr, fk, opts); } - -int FINUFFT2D2(BIGINT nj, FLT *xj, FLT *yj, CPX *cj, int iflag, FLT eps, BIGINT ms, - BIGINT mt, CPX *fk, finufft_opts *opts) -// Type-2 2D complex nonuniform FFT. 
See ../docs/usage.rst
+int finufft2d1(BIGINT nj, double *xj, double *yj, std::complex<double> *cj, int iflag,
+               double eps, BIGINT ms, BIGINT mt, std::complex<double> *fk,
+               finufft_opts *opts)
+// Type-1 2D complex nonuniform FFT. See ../docs/usage.rst
 {
-  BIGINT n_modes[] = {ms, mt, 1};
-  int n_dims       = 2;
-  int n_transf     = 1;
-  int type         = 2;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj, iflag, eps,
-                                n_modes, 0, NULL, NULL, NULL, fk, opts);
-  return ier;
+  return finufft2d1many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts);
+}
+int finufftf2d1(BIGINT nj, float *xj, float *yj, std::complex<float> *cj, int iflag,
+                float eps, BIGINT ms, BIGINT mt, std::complex<float> *fk,
+                finufft_opts *opts)
+// Type-1 2D complex nonuniform FFT. See ../docs/usage.rst
+{
+  return finufftf2d1many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts);
 }

-int FINUFFT2D2MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, CPX *c, int iflag, FLT eps,
-                   BIGINT ms, BIGINT mt, CPX *fk, finufft_opts *opts)
+int finufft2d2many(int n_transf, BIGINT nj, double *xj, double *yj,
+                   std::complex<double> *c, int iflag, double eps, BIGINT ms, BIGINT mt,
+                   std::complex<double> *fk, finufft_opts *opts)
 // Type-2 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst
 {
-  BIGINT n_modes[] = {ms, mt, 1};
-  int n_dims       = 2;
-  int type         = 2;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, c, iflag, eps,
-                                n_modes, 0, NULL, NULL, NULL, fk, opts);
-  return ier;
+  return invokeGuruInterface(2, 2, n_transf, nj, xj, yj, nullptr, c, iflag, eps,
+                             {ms, mt, 1}, 0, nullptr, nullptr, nullptr, fk, opts);
 }
-
-int FINUFFT2D3(BIGINT nj, FLT *xj, FLT *yj, CPX *cj, int iflag, FLT eps, BIGINT nk,
-               FLT *s, FLT *t, CPX *fk, finufft_opts *opts)
-// Type-3 2D complex nonuniform FFT. See ../docs/usage.rst
+int finufftf2d2many(int n_transf, BIGINT nj, float *xj, float *yj,
+                    std::complex<float> *c, int iflag, float eps, BIGINT ms, BIGINT mt,
+                    std::complex<float> *fk, finufft_opts *opts)
+// Type-2 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst
 {
-  int n_dims   = 2;
-  int type     = 3;
-  int n_transf = 1;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj, iflag, eps,
-                                NULL, nk, s, t, NULL, fk, opts);
-  return ier;
+  return invokeGuruInterface(2, 2, n_transf, nj, xj, yj, nullptr, c, iflag, eps,
+                             {ms, mt, 1}, 0, nullptr, nullptr, nullptr, fk, opts);
+}
+int finufft2d2(BIGINT nj, double *xj, double *yj, std::complex<double> *cj, int iflag,
+               double eps, BIGINT ms, BIGINT mt, std::complex<double> *fk,
+               finufft_opts *opts)
+// Type-2 2D complex nonuniform FFT. See ../docs/usage.rst
+{
+  return finufft2d2many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts);
+}
+int finufftf2d2(BIGINT nj, float *xj, float *yj, std::complex<float> *cj, int iflag,
+                float eps, BIGINT ms, BIGINT mt, std::complex<float> *fk,
+                finufft_opts *opts)
+// Type-2 2D complex nonuniform FFT. See ../docs/usage.rst
+{
+  return finufftf2d2many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts);
 }

-int FINUFFT2D3MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, CPX *cj, int iflag, FLT eps,
-                   BIGINT nk, FLT *s, FLT *t, CPX *fk, finufft_opts *opts)
+int finufft2d3many(int n_transf, BIGINT nj, double *xj, double *yj,
+                   std::complex<double> *cj, int iflag, double eps, BIGINT nk, double *s,
+                   double *t, std::complex<double> *fk, finufft_opts *opts)
+// Type-3 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst
+{
+  return invokeGuruInterface(2, 3, n_transf, nj, xj, yj, nullptr, cj, iflag, eps,
+                             {0, 0, 0}, nk, s, t, nullptr, fk, opts);
+}
+int finufftf2d3many(int n_transf, BIGINT nj, float *xj, float *yj,
+                    std::complex<float> *cj, int iflag, float eps, BIGINT nk, float *s,
+                    float *t, std::complex<float> *fk, finufft_opts *opts)
 // Type-3 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst
 {
-  int n_dims = 2;
-  int type   = 3;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj, iflag, eps,
-                                NULL, nk, s, t, NULL, fk, opts);
-  return ier;
+  return invokeGuruInterface(2, 3, n_transf, nj, xj, yj, nullptr, cj, iflag, eps,
+                             {0, 0, 0}, nk, s, t, nullptr, fk, opts);
+}
+int finufft2d3(BIGINT nj, double *xj, double *yj, std::complex<double> *cj, int iflag,
+               double eps, BIGINT nk, double *s, double *t, std::complex<double> *fk,
+               finufft_opts *opts)
+// Type-3 2D complex nonuniform FFT. See ../docs/usage.rst
+{
+  return finufft2d3many(1, nj, xj, yj, cj, iflag, eps, nk, s, t, fk, opts);
+}
+int finufftf2d3(BIGINT nj, float *xj, float *yj, std::complex<float> *cj, int iflag,
+                float eps, BIGINT nk, float *s, float *t, std::complex<float> *fk,
+                finufft_opts *opts)
+// Type-3 2D complex nonuniform FFT. See ../docs/usage.rst
+{
+  return finufftf2d3many(1, nj, xj, yj, cj, iflag, eps, nk, s, t, fk, opts);
 }

 // Dimension 3333333333333333333333333333333333333333333333333333333333333333

-int FINUFFT3D1(BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, FLT eps,
-               BIGINT ms, BIGINT mt, BIGINT mu, CPX *fk, finufft_opts *opts)
-// Type-1 3D complex nonuniform FFT. See ../docs/usage.rst
+int finufft3d1many(int n_transf, BIGINT nj, double *xj, double *yj, double *zj,
+                   std::complex<double> *cj, int iflag, double eps, BIGINT ms, BIGINT mt,
+                   BIGINT mu, std::complex<double> *fk, finufft_opts *opts)
+// Type-1 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst
 {
-  BIGINT n_modes[] = {ms, mt, mu};
-  int n_dims       = 3;
-  int n_transf     = 1;
-  int type         = 1;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps,
-                                n_modes, 0, NULL, NULL, NULL, fk, opts);
-  return ier;
+  return invokeGuruInterface(3, 1, n_transf, nj, xj, yj, zj, cj, iflag, eps,
+                             {ms, mt, mu}, 0, nullptr, nullptr, nullptr, fk,
+                             opts);
 }
-
-int FINUFFT3D1MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag,
-                   FLT eps, BIGINT ms, BIGINT mt, BIGINT mu, CPX *fk, finufft_opts *opts)
+int finufftf3d1many(int n_transf, BIGINT nj, float *xj, float *yj, float *zj,
+                    std::complex<float> *cj, int iflag, float eps, BIGINT ms, BIGINT mt,
+                    BIGINT mu, std::complex<float> *fk, finufft_opts *opts)
 // Type-1 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst
 {
-  BIGINT n_modes[] = {ms, mt, mu};
-  int n_dims       = 3;
-  int type         = 1;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps,
-                                n_modes, 0, NULL, NULL, NULL, fk, opts);
-  return ier;
+  return invokeGuruInterface(3, 1, n_transf, nj, xj, yj, zj, cj, iflag, eps,
+                             {ms, mt, mu}, 0, nullptr, nullptr, nullptr, fk, opts);
 }
-
-int FINUFFT3D2(BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, FLT eps,
-               BIGINT ms, BIGINT mt, BIGINT mu, CPX *fk, finufft_opts *opts)
-// Type-2 3D complex nonuniform FFT. See ../docs/usage.rst
+int finufft3d1(BIGINT nj, double *xj, double *yj, double *zj, std::complex<double> *cj,
+               int iflag, double eps, BIGINT ms, BIGINT mt, BIGINT mu,
+               std::complex<double> *fk, finufft_opts *opts)
+// Type-1 3D complex nonuniform FFT. See ../docs/usage.rst
+{
+  return finufft3d1many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts);
+}
+int finufftf3d1(BIGINT nj, float *xj, float *yj, float *zj, std::complex<float> *cj,
+                int iflag, float eps, BIGINT ms, BIGINT mt, BIGINT mu,
+                std::complex<float> *fk, finufft_opts *opts)
+// Type-1 3D complex nonuniform FFT. See ../docs/usage.rst
 {
-  BIGINT n_modes[] = {ms, mt, mu};
-  int n_dims       = 3;
-  int n_transf     = 1;
-  int type         = 2;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps,
-                                n_modes, 0, NULL, NULL, NULL, fk, opts);
-  return ier;
+  return finufftf3d1many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts);
 }

-int FINUFFT3D2MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag,
-                   FLT eps, BIGINT ms, BIGINT mt, BIGINT mu, CPX *fk, finufft_opts *opts)
+int finufft3d2many(int n_transf, BIGINT nj, double *xj, double *yj, double *zj,
+                   std::complex<double> *cj, int iflag, double eps, BIGINT ms, BIGINT mt,
+                   BIGINT mu, std::complex<double> *fk, finufft_opts *opts)
 // Type-2 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst
 {
-  BIGINT n_modes[] = {ms, mt, mu};
-  n_modes[0]       = ms;
-  n_modes[1]       = mt;
-  n_modes[2]       = mu;
-  int n_dims       = 3;
-  int type         = 2;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps,
-                                n_modes, 0, NULL, NULL, NULL, fk, opts);
-  return ier;
+  return invokeGuruInterface(3, 2, n_transf, nj, xj, yj, zj, cj, iflag, eps,
+                             {ms, mt, mu}, 0, nullptr, nullptr, nullptr, fk,
+                             opts);
 }
-
-int FINUFFT3D3(BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, FLT eps,
-               BIGINT nk, FLT *s, FLT *t, FLT *u, CPX *fk, finufft_opts *opts)
-// Type-3 3D complex nonuniform FFT. See ../docs/usage.rst
+int finufftf3d2many(int n_transf, BIGINT nj, float *xj, float *yj, float *zj,
+                    std::complex<float> *cj, int iflag, float eps, BIGINT ms, BIGINT mt,
+                    BIGINT mu, std::complex<float> *fk, finufft_opts *opts)
+// Type-2 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst
+{
+  return invokeGuruInterface(3, 2, n_transf, nj, xj, yj, zj, cj, iflag, eps,
+                             {ms, mt, mu}, 0, nullptr, nullptr, nullptr, fk, opts);
+}
+int finufft3d2(BIGINT nj, double *xj, double *yj, double *zj, std::complex<double> *cj,
+               int iflag, double eps, BIGINT ms, BIGINT mt, BIGINT mu,
+               std::complex<double> *fk, finufft_opts *opts)
+// Type-2 3D complex nonuniform FFT. See ../docs/usage.rst
 {
-  int n_dims   = 3;
-  int n_transf = 1;
-  int type     = 3;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps,
-                                NULL, nk, s, t, u, fk, opts);
-  return ier;
+  return finufft3d2many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts);
+}
+int finufftf3d2(BIGINT nj, float *xj, float *yj, float *zj, std::complex<float> *cj,
+                int iflag, float eps, BIGINT ms, BIGINT mt, BIGINT mu,
+                std::complex<float> *fk, finufft_opts *opts)
+// Type-2 3D complex nonuniform FFT. See ../docs/usage.rst
+{
+  return finufftf3d2many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts);
 }

-int FINUFFT3D3MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag,
-                   FLT eps, BIGINT nk, FLT *s, FLT *t, FLT *u, CPX *fk,
-                   finufft_opts *opts)
+int finufft3d3many(int n_transf, BIGINT nj, double *xj, double *yj, double *zj,
+                   std::complex<double> *cj, int iflag, double eps, BIGINT nk, double *s,
+                   double *t, double *u, std::complex<double> *fk, finufft_opts *opts)
+// Type-3 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst
+{
+  return invokeGuruInterface(3, 3, n_transf, nj, xj, yj, zj, cj, iflag, eps,
+                             {0, 0, 0}, nk, s, t, u, fk, opts);
+}
+int finufftf3d3many(int n_transf, BIGINT nj, float *xj, float *yj, float *zj,
+                    std::complex<float> *cj, int iflag, float eps, BIGINT nk, float *s,
+                    float *t, float *u, std::complex<float> *fk, finufft_opts *opts)
 // Type-3 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst
 {
-  int n_dims = 3;
-  int type   = 3;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps,
-                                NULL, nk, s, t, u, fk, opts);
-  return ier;
+  return invokeGuruInterface(3, 3, n_transf, nj, xj, yj, zj, cj, iflag, eps,
+                             {0, 0, 0}, nk, s, t, u, fk, opts);
+}
+int finufft3d3(BIGINT nj, double *xj, double *yj, double *zj, std::complex<double> *cj,
+               int iflag, double eps, BIGINT nk, double *s, double *t, double *u,
+               std::complex<double> *fk, finufft_opts *opts)
+// Type-3 3D complex nonuniform FFT. See ../docs/usage.rst
+{
+  return finufft3d3many(1, nj, xj, yj, zj, cj, iflag, eps, nk, s, t, u, fk, opts);
+}
+int finufftf3d3(BIGINT nj, float *xj, float *yj, float *zj, std::complex<float> *cj,
+                int iflag, float eps, BIGINT nk, float *s, float *t, float *u,
+                std::complex<float> *fk, finufft_opts *opts)
+// Type-3 3D complex nonuniform FFT. See ../docs/usage.rst
+{
+  return finufftf3d3many(1, nj, xj, yj, zj, cj, iflag, eps, nk, s, t, u, fk, opts);
 }
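
The refactored simple interfaces above are now thin shims over the *many/guru path, with explicit double- and float-precision symbols replacing the old FLT/CPX macro expansion. A minimal sketch of a call through the retained public API (assumes the public header finufft.h; sizes, data, and tolerance are illustrative):

#include <complex>
#include <vector>
#include <finufft.h>

int demo_type1_2d() {
  const long M = 1000, ms = 64, mt = 64;              // NU pts, output mode sizes
  std::vector<double> x(M, 0.1), y(M, -0.2);          // coords in [-pi, pi)
  std::vector<std::complex<double>> c(M, {1.0, 0.0}); // source strengths
  std::vector<std::complex<double>> f(ms * mt);       // ms*mt output modes
  // +1 sign of i in the exponential, 1e-9 tolerance, default opts (nullptr):
  return finufft2d1(M, x.data(), y.data(), c.data(), +1, 1e-9, ms, mt,
                    f.data(), nullptr);               // 0 on success
}
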
diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp
index 12327c2d6..7c7309de2 100644
--- a/src/spreadinterp.cpp
+++ b/src/spreadinterp.cpp
@@ -1,10 +1,8 @@
-// Spreading/interpolating module within FINUFFT. Uses precision-switching
-// macros for FLT, CPX, etc.
+// Spreading/interpolating module within FINUFFT.

 #include
 #include
 #include
-#include

 #include "ker_horner_allw_loop_constexpr.h"
 #include "ker_lowupsampfac_horner_allw_loop_constexpr.h"
@@ -23,896 +21,410 @@ namespace finufft::spreadinterp {

 namespace { // anonymous namespace for internal structs equivalent to declaring everything
             // static
-struct zip_low;
-struct zip_hi;
-template<uint8_t cap> struct reverse_index;
-template<uint8_t cap> struct shuffle_index;
-struct select_even;
-struct select_odd;
-// forward declaration to clean up the code and be able to use this everywhere in the file
-template<class T, uint8_t N, uint8_t K = N> static constexpr auto BestSIMDHelper();
-template<class T, uint8_t N> constexpr auto GetPaddedSIMDWidth();
+struct zip_low {
+  // helper struct to get the lower half of a SIMD register and zip it with itself
+  // it returns index 0, 0, 1, 1, ... N/2-1, N/2-1
+  static constexpr unsigned get(unsigned index, unsigned /*size*/) { return index / 2; }
+};
+struct zip_hi {
+  // helper struct to get the upper half of a SIMD register and zip it with itself
+  // it returns index N/2, N/2, N/2+1, N/2+1, ... N-1, N-1
+  static constexpr unsigned get(unsigned index, unsigned size) {
+    return (size + index) / 2;
+  }
+};
+template<uint8_t cap> struct reverse_index {
+  static constexpr unsigned get(unsigned index, const unsigned size) {
+    return index < cap ? (cap - 1 - index) : index;
+  }
+};
+template<uint8_t cap> struct shuffle_index {
+  static constexpr unsigned get(unsigned index, const unsigned size) {
+    return index < cap ? (cap - 1 - index) : size + size + cap - 1 - index;
+  }
+};
+struct select_even {
+  static constexpr unsigned get(unsigned index, unsigned /*size*/) { return index * 2; }
+};
+struct select_odd {
+  static constexpr unsigned get(unsigned index, unsigned /*size*/) {
+    return index * 2 + 1;
+  }
+};
+
+// this finds the largest SIMD instruction set that can handle N elements
+// void otherwise -> compile error
+template<class T, uint8_t N, uint8_t K = N> static constexpr auto BestSIMDHelper() {
+  if constexpr (N % K == 0) { // returns void in the worst case
+    return xsimd::make_sized_batch<T, K>{};
+  } else {
+    return BestSIMDHelper<T, N, (K >> 1)>();
+  }
+}
+template<class T, uint8_t N = 1> constexpr uint8_t min_simd_width() {
+  // finds the smallest simd width that can handle N elements
+  // the simd size (batch size, in xsimd terminology) is the SIMD width
+  if constexpr (std::is_void_v<xsimd::make_sized_batch_t<T, N>>) {
+    return min_simd_width<T, N * 2>();
+  } else {
+    return N;
+  }
+};
+
+template<class T, uint8_t N> constexpr auto find_optimal_simd_width() {
+  // finds the smallest simd width that minimizes the number of iterations
+  // NOTE: might be suboptimal for some cases, e.g. 2^N+1
+  // in the future we might want to implement a more sophisticated algorithm
+  uint8_t optimal_simd_width = min_simd_width<T>();
+  uint8_t min_iterations     = (N + optimal_simd_width - 1) / optimal_simd_width;
+  for (uint8_t simd_width = optimal_simd_width;
+       simd_width <= xsimd::batch<T>::size;
+       simd_width *= 2) {
+    uint8_t iterations = (N + simd_width - 1) / simd_width;
+    if (iterations < min_iterations) {
+      min_iterations     = iterations;
+      optimal_simd_width = simd_width;
+    }
+  }
+  return optimal_simd_width;
+}
+
+template<class T, uint8_t N> constexpr auto GetPaddedSIMDWidth() {
+  // helper function to get the SIMD width with padding for the given number of elements
+  // that minimizes the number of iterations
+  return xsimd::make_sized_batch<T, find_optimal_simd_width<T, N>()>::type::size;
+}
 template<class T, uint8_t ns>
 using PaddedSIMD = typename xsimd::make_sized_batch<T, GetPaddedSIMDWidth<T, ns>()>::type;
-template<class T> uint8_t get_padding(uint8_t ns);
-template<class T, uint8_t ns> constexpr auto get_padding();
+template<class T, uint8_t ns> constexpr auto get_padding() {
+  // helper function to get the padding for the given number of elements
+  // ns is known at compile time, rounds ns to the next multiple of the SIMD width
+  // then subtracts ns to get the padding using a bitwise-and trick
+  // WARNING: this trick works only for powers of 2
+  // SOURCE: Agner Fog's VCL manual
+  constexpr uint8_t width = GetPaddedSIMDWidth<T, ns>();
+  return ((ns + width - 1) & (-width)) - ns;
+}
+
+template<class T, uint8_t ns> constexpr auto get_padding_helper(uint8_t runtime_ns) {
+  // helper function to get the padding for the given number of elements where ns is
+  // known at runtime; it uses recursion to find the padding
+  // this avoids having a function with a large number of switch cases,
+  // as GetPaddedSIMDWidth requires a compile-time value
+  // it cannot be a lambda function because of the template recursion
+  if constexpr (ns < 2) {
+    return 0;
+  } else {
+    if (runtime_ns == ns) {
+      return get_padding<T, ns>();
+    } else {
+      return get_padding_helper<T, ns - 1>(runtime_ns);
+    }
+  }
+}
+
+template<class T> uint8_t get_padding(uint8_t ns) {
+  // return the padding as a function of the number of elements
+  // 2 * MAX_NSPREAD is the maximum number of elements that we can have
+  // that's why it is hardcoded here
+  return get_padding_helper<T, 2 * MAX_NSPREAD>(ns);
+}
 template<class T, uint8_t N>
 using BestSIMD = typename decltype(BestSIMDHelper<T, N, xsimd::batch<T>::size>())::type;
-template<class T, uint8_t N = 1> constexpr uint8_t min_simd_width();
-template<class T, uint8_t N> constexpr auto find_optimal_simd_width();
+template<class T, class V, size_t... Is>
+constexpr T generate_sequence_impl(V a, V b, index_sequence<Is...>) noexcept {
+  // utility function to generate a
sequence of a, b interleaved as function arguments + return T(((Is % 2 == 0) ? a : b)...); +} + template -constexpr auto initialize_complex_register(V a, V b) noexcept; -template +constexpr auto initialize_complex_register(V a, V b) noexcept { + // populates a SIMD register with a and b interleaved + // for example: + // +-------------------------------+ + // | a | b | a | b | a | b | a | b | + // +-------------------------------+ + // it uses index_sequence to generate the sequence of a, b at compile time + return generate_sequence_impl(a, b, std::make_index_sequence{}); +} +template constexpr auto zip_low_index = - xsimd::make_batch_constant, arch_t, zip_low>(); -template + xsimd::make_batch_constant, arch_t, zip_low>(); +template constexpr auto zip_hi_index = - xsimd::make_batch_constant, arch_t, zip_hi>(); -template -constexpr auto select_even_mask = - xsimd::make_batch_constant, arch_t, select_even>(); -template -constexpr auto select_odd_mask = - xsimd::make_batch_constant, arch_t, select_odd>(); + xsimd::make_batch_constant, arch_t, zip_hi>(); +// template +// constexpr auto select_even_mask = +// xsimd::make_batch_constant, arch_t, select_even>(); +// template +// constexpr auto select_odd_mask = +// xsimd::make_batch_constant, arch_t, select_odd>(); template constexpr std::array, N> pad_2D_array_with_zeros( - const std::array, N> &input) noexcept; -template FINUFFT_ALWAYS_INLINE auto xsimd_to_array(const T &vec) noexcept; + const std::array, N> &input) noexcept { + constexpr auto pad_with_zeros = [](const auto &input) constexpr noexcept { + std::array padded{0}; + for (auto i = 0; i < input.size(); ++i) { + padded[i] = input[i]; + } + return padded; + }; + std::array, N> output{}; + for (std::size_t i = 0; i < N; ++i) { + output[i] = pad_with_zeros(input[i]); + } + return output; +} + +template FINUFFT_ALWAYS_INLINE auto xsimd_to_array(const T &vec) noexcept { + constexpr auto alignment = T::arch_type::alignment(); + alignas(alignment) std::array array{}; + vec.store_aligned(array.data()); + return array; +} FINUFFT_NEVER_INLINE void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset3, UBIGINT padded_size1, UBIGINT size1, UBIGINT size2, UBIGINT size3, - UBIGINT M0); + UBIGINT M0) { + printf("size1 %ld, padded_size1 %ld\n", size1, padded_size1); + switch (ndims) { + case 1: + printf("\tsubgrid: off %lld\t siz %lld\t #NU %lld\n", (long long)offset1, + (long long)padded_size1, (long long)M0); + break; + case 2: + printf("\tsubgrid: off %lld,%lld\t siz %lld,%lld\t #NU %lld\n", (long long)offset1, + (long long)offset2, (long long)padded_size1, (long long)size2, (long long)M0); + break; + case 3: + printf("\tsubgrid: off %lld,%lld,%lld\t siz %lld,%lld,%lld\t #NU %lld\n", + (long long)offset1, (long long)offset2, (long long)offset3, + (long long)padded_size1, (long long)size2, (long long)size3, (long long)M0); + break; + default: + printf("Invalid number of dimensions: %d\n", ndims); + break; + } +} } // namespace // declarations of purely internal functions... (thus need not be in .h) -template()>, - typename... V> -static FINUFFT_ALWAYS_INLINE auto ker_eval(FLT *FINUFFT_RESTRICT ker, - const finufft_spread_opts &opts, - const V... 
elems) noexcept; -static FINUFFT_ALWAYS_INLINE FLT fold_rescale(FLT x, UBIGINT N) noexcept; -template -FINUFFT_ALWAYS_INLINE static simd_type fold_rescale(const simd_type &x, - UBIGINT N) noexcept; -static FINUFFT_ALWAYS_INLINE void set_kernel_args( - FLT *args, FLT x, const finufft_spread_opts &opts) noexcept; -static FINUFFT_ALWAYS_INLINE void evaluate_kernel_vector( - FLT *ker, FLT *args, const finufft_spread_opts &opts) noexcept; -template()>> // aka ns -static FINUFFT_ALWAYS_INLINE void eval_kernel_vec_Horner( - FLT *FINUFFT_RESTRICT ker, FLT x, const finufft_spread_opts &opts) noexcept; -template> -static void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker, - BIGINT i1, UBIGINT N1); -template> -static void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, - const FLT *ker2, BIGINT i1, BIGINT i2, UBIGINT N1, UBIGINT N2); -template> -static void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, - const FLT *ker2, const FLT *ker3, BIGINT i1, BIGINT i2, BIGINT i3, - UBIGINT N1, UBIGINT N2, UBIGINT N3); -static void spread_subproblem_1d(BIGINT off1, UBIGINT size1, FLT *du0, UBIGINT M0, - FLT *kx0, FLT *dd0, - const finufft_spread_opts &opts) noexcept; -static void spread_subproblem_2d(BIGINT off1, BIGINT off2, UBIGINT size1, UBIGINT size2, - FLT *FINUFFT_RESTRICT du, UBIGINT M, const FLT *kx, - const FLT *ky, const FLT *dd, - const finufft_spread_opts &opts) noexcept; -static void spread_subproblem_3d(BIGINT off1, BIGINT off2, BIGINT off3, UBIGINT size1, - UBIGINT size2, UBIGINT size3, FLT *du0, UBIGINT M0, - FLT *kx0, FLT *ky0, FLT *kz0, FLT *dd0, - const finufft_spread_opts &opts) noexcept; -template -static void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3, - UBIGINT padded_size1, UBIGINT size1, UBIGINT size2, - UBIGINT size3, UBIGINT N1, UBIGINT N2, UBIGINT N3, - FLT *FINUFFT_RESTRICT data_uniform, const FLT *du0); -static void bin_sort_singlethread(BIGINT *ret, UBIGINT M, const FLT *kx, const FLT *ky, - const FLT *kz, UBIGINT N1, UBIGINT N2, UBIGINT N3, - double bin_size_x, double bin_size_y, double bin_size_z, - int debug); -void bin_sort_multithread(BIGINT *ret, UBIGINT M, FLT *kx, FLT *ky, FLT *kz, UBIGINT N1, - UBIGINT N2, UBIGINT N3, double bin_size_x, double bin_size_y, - double bin_size_z, int debug, int nthr); -static void get_subgrid(BIGINT &offset1, BIGINT &offset2, BIGINT &offset3, - BIGINT &padded_size1, BIGINT &size1, BIGINT &size2, BIGINT &size3, - UBIGINT M0, FLT *kx0, FLT *ky0, FLT *kz0, int ns, int ndims); - -// ========================================================================== -int spreadinterp(UBIGINT N1, UBIGINT N2, UBIGINT N3, FLT *data_uniform, UBIGINT M, - FLT *kx, FLT *ky, FLT *kz, FLT *data_nonuniform, - const finufft_spread_opts &opts) -/* ------------Spreader/interpolator for 1, 2, or 3 dimensions -------------- - If opts.spread_direction=1, evaluate, in the 1D case, - - N1-1 - data_nonuniform[j] = SUM phi(kx[j] - n) data_uniform[n], for j=0...M-1 - n=0 - - If opts.spread_direction=2, evaluate its transpose, in the 1D case, - - M-1 - data_uniform[n] = SUM phi(kx[j] - n) data_nonuniform[j], for n=0...N1-1 - j=0 - - In each case phi is the spreading kernel, which has support - [-opts.nspread/2,opts.nspread/2]. In 2D or 3D, the generalization with - product of 1D kernels is performed. - For 1D set N2=N3=1; for 2D set N3=1; for 3D set N1,N2,N3>1. - - Notes: - No particular normalization of the spreading kernel is assumed. 
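
The two formulas in the doc block above are worth seeing as code. A deliberately naive 1D reference (a sketch, not library API: it assumes pre-folded coordinates in [0, N1), zero-initialized outputs, and some scalar kernel phi with support [-nspread/2, nspread/2]):

#include <complex>

template<typename T>
void naive_spreadinterp_1d(int dir, long N1, std::complex<T> *data_uniform,
                           long M, const T *kx, std::complex<T> *data_nonuniform,
                           T (*phi)(T)) {
  for (long j = 0; j < M; ++j)
    for (long n = 0; n < N1; ++n) {
      const T w = phi(kx[j] - T(n));                 // kernel weight, 0 outside support
      if (dir == 1)
        data_nonuniform[j] += w * data_uniform[n];   // dir=1: evaluate at NU pts
      else
        data_uniform[n] += w * data_nonuniform[j];   // dir=2: the transpose
    }
}

The real code never forms this O(M*N1) sum: the kernel's finite support cuts each inner loop down to nspread terms, with periodic wrapping handled once per point.
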
- Uniform (U) points are centered at coords - [0,1,...,N1-1] in 1D, analogously in 2D and 3D. They are stored in x - fastest, y medium, z slowest ordering, up to however many - dimensions are relevant; note that this is Fortran-style ordering for an - array f(x,y,z), but C style for f[z][y][x]. This is to match the Fortran - interface of the original CMCL libraries. - Non-uniform (NU) points kx,ky,kz are real, and may lie in the central three - periods in each coordinate (these are folded into the central period). - The finufft_spread_opts struct must have been set up already by calling setup_kernel. - It is assumed that 2*opts.nspread < min(N1,N2,N3), so that the kernel - only ever wraps once when falls below 0 or off the top of a uniform grid - dimension. - - Inputs: - N1,N2,N3 - grid sizes in x (fastest), y (medium), z (slowest) respectively. - If N2==1, 1D spreading is done. If N3==1, 2D spreading. - Otherwise, 3D. - M - number of NU pts. - kx, ky, kz - length-M real arrays of NU point coordinates (only kx read in - 1D, only kx and ky read in 2D). - - These should lie in the box -pi<=kx<=pi. Points outside this domain are also - correctly folded back into this domain. - opts - spread/interp options struct, documented in ../include/finufft_spread_opts.h - - Inputs/Outputs: - data_uniform - output values on grid (dir=1) OR input grid data (dir=2) - data_nonuniform - input strengths of the sources (dir=1) - OR output values at targets (dir=2) - Returned value: - 0 indicates success; other values have meanings in ../docs/error.rst, with - following modifications: - 3 : one or more non-trivial box dimensions is less than 2.nspread. - 5 : failed allocate sort indices - Magland Dec 2016. Barnett openmp version, many speedups 1/16/17-2/16/17 - error codes 3/13/17. pirange 3/28/17. Rewritten 6/15/17. parallel sort 2/9/18 - No separate subprob indices in t-1 2/11/18. - sort_threads (since for M< +static FINUFFT_ALWAYS_INLINE T fold_rescale(const T x, const UBIGINT N) noexcept { + static constexpr const T x2pi = T(M_1_2PI); + const T result = x * x2pi + T(0.5); + return (result - floor(result)) * T(N); } -static constexpr uint8_t ndims_from_Ns(const UBIGINT N1, const UBIGINT N2, - const UBIGINT N3) -/* rule for getting number of spreading dimensions from the list of Ns per dim. - Split out, Barnett 7/26/18 -*/ +template +static FINUFFT_ALWAYS_INLINE simd_type fold_rescale(const simd_type &x, + const BIGINT N) noexcept { + const simd_type x2pi = T(M_1_2PI); + const simd_type result = xsimd::fma(x, x2pi, simd_type(0.5)); + return (result - xsimd::floor(result)) * simd_type(T(N)); +} +template +static void set_kernel_args(T *args, T x) noexcept +// Fills vector args[] with kernel arguments x, x+1, ..., x+ns-1. +// needed for the vectorized kernel eval of Ludvig af K. { - return 1 + (N2 > 1) + (N3 > 1); + for (int i = 0; i < ns; i++) args[i] = x + T(i); } +template +static void evaluate_kernel_vector(T *ker, T *args, + const finufft_spread_opts &opts) noexcept +/* Evaluate ES kernel for a vector of N arguments; by Ludvig af K. + If opts.kerpad true, args and ker must be allocated for Npad, and args is + written to (to pad to length Npad), only first N outputs are correct. + Barnett 4/24/18 option to pad to mult of 4 for better SIMD vectorization. + Rescaled so max is 1, Barnett 7/21/24 -int spreadcheck(UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M, FLT *kx, FLT *ky, FLT *kz, - const finufft_spread_opts &opts) -/* This does just the input checking and reporting for the spreader. 
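
The new templated fold_rescale above folds any real coordinate into the central period and rescales it to grid units in one fused expression. A scalar illustration (M_1_2PI is FINUFFT's constant for 1/(2*pi); the demo spells it out):

#include <cmath>

double fold_rescale_demo(double x, std::size_t N) {
  const double r = x * (0.5 / M_PI) + 0.5; // x/(2*pi) + 1/2
  return (r - std::floor(r)) * double(N);  // fractional part, scaled to [0, N)
}
// fold_rescale_demo(0.0, 100)      == 50.0  (box center)
// fold_rescale_demo(-M_PI, 100)    == 0.0   (left edge)
// fold_rescale_demo(3 * M_PI, 100) == 0.0   (a point three periods away folds back)
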
- See spreadinterp() for input arguments and meaning of returned value. - Split out by Melody Shih, Jun 2018. Finiteness chk Barnett 7/30/18. - Marco Barbone 5.8.24 removed bounds check as new foldrescale is not limited to - [-3pi,3pi) + Obsolete (replaced by Horner), but keep around for experimentation since + works for arbitrary beta. Formula must match reference implementation. */ { - // INPUT CHECKING & REPORTING .... cuboid not too small for spreading? - int minN = 2 * opts.nspread; - if (N1 < minN || (N2 > 1 && N2 < minN) || (N3 > 1 && N3 < minN)) { - fprintf(stderr, - "%s error: one or more non-trivial box dims is less than 2.nspread!\n", - __func__); - return FINUFFT_ERR_SPREAD_BOX_SMALL; - } - if (opts.spread_direction != 1 && opts.spread_direction != 2) { - fprintf(stderr, "%s error: opts.spread_direction must be 1 or 2!\n", __func__); - return FINUFFT_ERR_SPREAD_DIR; + T b = (T)opts.ES_beta; + T c = (T)opts.ES_c; + if (!(opts.flags & TF_OMIT_EVALUATE_KERNEL)) { + // Note (by Ludvig af K): Splitting kernel evaluation into two loops + // seems to benefit auto-vectorization. + // gcc 5.4 vectorizes first loop; gcc 7.2 vectorizes both loops + int Npad = N; + if (opts.kerpad) { // since always same branch, no speed hit + Npad = 4 * (1 + (N - 1) / 4); // pad N to mult of 4; help i7 GCC, not xeon + for (int i = N; i < Npad; ++i) // pad with 1-3 zeros for safe eval + args[i] = 0.0; + } + for (int i = 0; i < Npad; i++) { // Loop 1: Compute exponential arguments + // care! 1.0 is double... + ker[i] = b * (sqrt((T)1.0 - c * args[i] * args[i]) - (T)1.0); + } + if (!(opts.flags & TF_OMIT_EVALUATE_EXPONENTIAL)) + for (int i = 0; i < Npad; i++) // Loop 2: Compute exponentials + ker[i] = exp(ker[i]); + if (opts.kerpad) { + // padded part should be zero, in spread_subproblem_nd_kernels, there are + // out of bound writes to trg arrays + for (int i = N; i < Npad; ++i) ker[i] = 0.0; + } + } else { + for (int i = 0; i < N; i++) // dummy for timing only + ker[i] = 1.0; } - return 0; + // Separate check from arithmetic (Is this really needed? doesn't slow down) + for (int i = 0; i < N; i++) + if (abs(args[i]) >= (T)opts.ES_halfwidth) ker[i] = 0.0; } -int indexSort(BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M, - FLT *kx, FLT *ky, FLT *kz, const finufft_spread_opts &opts) -/* This makes a decision whether or not to sort the NU pts (influenced by - opts.sort), and if yes, calls either single- or multi-threaded bin sort, - writing reordered index list to sort_indices. If decided not to sort, the - identity permutation is written to sort_indices. - The permutation is designed to make RAM access close to contiguous, to - speed up spreading/interpolation, in the case of disordered NU points. - - Inputs: - M - number of input NU points. - kx,ky,kz - length-M arrays of real coords of NU pts. Domain is [-pi, pi), - points outside are folded in. - (only kz used in 1D, only kx and ky used in 2D.) - N1,N2,N3 - integer sizes of overall box (set N2=N3=1 for 1D, N3=1 for 2D). - 1 = x (fastest), 2 = y (medium), 3 = z (slowest). - opts - spreading options struct, see ../include/finufft_spread_opts.h - Outputs: - sort_indices - a good permutation of NU points. (User must preallocate - to length M.) Ie, kx[sort_indices[j]], j=0,..,M-1, is a good - ordering for the x-coords of NU pts, etc. - returned value - whether a sort was done (1) or not (0). 
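
The ES ("exp sqrt") evaluation that evaluate_kernel_vector above keeps alive for arbitrary beta reduces, per element, to the following scalar sketch (beta, c, and halfwidth correspond to opts.ES_beta, opts.ES_c = 4/ns^2, and opts.ES_halfwidth = ns/2; rescaled so the maximum is 1):

#include <cmath>

double es_kernel_ref(double x, double beta, double c, double halfwidth) {
  if (std::abs(x) >= halfwidth) return 0.0; // outside support [-ns/2, ns/2]
  return std::exp(beta * (std::sqrt(1.0 - c * x * x) - 1.0)); // peaks at 1 at x=0
}
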
+template()>> // aka ns +static FINUFFT_ALWAYS_INLINE void eval_kernel_vec_Horner( + T *FINUFFT_RESTRICT ker, T x, const finufft_spread_opts &opts) noexcept +/* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at +x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns. +This is the current evaluation method, since it's faster (except i7 w=16). +Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ - Barnett 2017; split out by Melody Shih, Jun 2018. Barnett nthr logic 2024. -*/ { - CNTime timer{}; - uint8_t ndims = ndims_from_Ns(N1, N2, N3); - auto N = N1 * N2 * N3; // U grid (periodic box) sizes + // scale so local grid offset z in[-1,1] + const T z = std::fma(T(2.0), x, T(w - 1)); + using arch_t = typename simd_type::arch_type; + static constexpr auto alignment = arch_t::alignment(); + static constexpr auto simd_size = simd_type::size; + static constexpr auto padded_ns = (w + simd_size - 1) & ~(simd_size - 1); + static constexpr auto horner_coeffs = []() constexpr noexcept { + if constexpr (upsampfact == 200) { + return get_horner_coeffs_200(); + } else if constexpr (upsampfact == 125) { + return get_horner_coeffs_125(); + } + }(); + static constexpr auto nc = horner_coeffs.size(); + static constexpr auto use_ker_sym = (simd_size < w); - // heuristic binning box size for U grid... affects performance: - double bin_size_x = 16, bin_size_y = 4, bin_size_z = 4; - // put in heuristics based on cache sizes (only useful for single-thread) ? + alignas(alignment) static constexpr auto padded_coeffs = + pad_2D_array_with_zeros(horner_coeffs); - int better_to_sort = - !(ndims == 1 && (opts.spread_direction == 2 || (M > 1000 * N1))); // 1D small-N or - // dir=2 case: - // don't sort + // use kernel symmetry trick if w > simd_size + if constexpr (use_ker_sym) { + static constexpr uint8_t tail = w % simd_size; + static constexpr uint8_t if_odd_degree = ((nc + 1) % 2); + static constexpr uint8_t offset_start = tail ? w - tail : w - simd_size; + static constexpr uint8_t end_idx = (w + (tail > 0)) / 2; + const simd_type zv{z}; + const auto z2v = zv * zv; - timer.start(); // if needed, sort all the NU pts... - int did_sort = 0; - auto maxnthr = MY_OMP_GET_MAX_THREADS(); // used if both below opts default - if (opts.nthreads > 0) - maxnthr = opts.nthreads; // user nthreads overrides, without limit - if (opts.sort_threads > 0) - maxnthr = opts.sort_threads; // high-priority override, also no limit - // At this point: maxnthr = the max threads sorting could use - // (we don't print warning here, since: no showwarn in spread_opts, and finufft - // already warned about it. spreadinterp-only advanced users will miss a warning) - if (opts.sort == 1 || (opts.sort == 2 && better_to_sort)) { - // store a good permutation ordering of all NU pts (dim=1,2 or 3) - int sort_debug = (opts.debug >= 2); // show timing output? - int sort_nthr = opts.sort_threads; // 0, or user max # threads for sort -#ifndef _OPENMP - sort_nthr = 1; // if single-threaded lib, override user -#endif - if (sort_nthr == 0) // multithreaded auto choice: when N>>M, one thread is better! - sort_nthr = (10 * M > N) ? 
maxnthr : 1; // heuristic
-    if (sort_nthr == 1)
-      bin_sort_singlethread(sort_indices, M, kx, ky, kz, N1, N2, N3, bin_size_x,
-                            bin_size_y, bin_size_z, sort_debug);
-    else // sort_nthr>1, user fixes # threads (>=2)
-      bin_sort_multithread(sort_indices, M, kx, ky, kz, N1, N2, N3, bin_size_x,
-                           bin_size_y, bin_size_z, sort_debug, sort_nthr);
-    if (opts.debug)
-      printf("\tsorted (%d threads):\t%.3g s\n", sort_nthr, timer.elapsedsec());
-    did_sort = 1;
+    // some xsimd constant for shuffle or inverse
+    static constexpr auto shuffle_batch = []() constexpr noexcept {
+      if constexpr (tail) {
+        return xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<T>, arch_t,
+                                          shuffle_index<tail>>();
+      } else {
+        return xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<T>, arch_t,
+                                          reverse_index<simd_size>>();
+      }
+    }();
+
+    // process simd vecs
+    simd_type k_prev, k_sym{0};
+    for (uint8_t i{0}, offset = offset_start; i < end_idx;
+         i += simd_size, offset -= simd_size) {
+      auto k_odd = [i]() constexpr noexcept {
+        if constexpr (if_odd_degree) {
+          return simd_type::load_aligned(padded_coeffs[0].data() + i);
+        } else {
+          return simd_type{0};
+        }
+      }();
+      auto k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i);
+      for (uint8_t j{1 + if_odd_degree}; j < nc; j += 2) {
+        const auto cji_odd  = simd_type::load_aligned(padded_coeffs[j].data() + i);
+        const auto cji_even = simd_type::load_aligned(padded_coeffs[j + 1].data() + i);
+        k_odd  = xsimd::fma(k_odd, z2v, cji_odd);
+        k_even = xsimd::fma(k_even, z2v, cji_even);
+      }
+      // left part
+      xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i);
+      // right part symmetric to the left part
+      if (offset >= end_idx) {
+        if constexpr (tail) {
+          // to use an aligned store, we need to shuffle the previous k_sym with the
+          // current k_sym
+          k_prev = k_sym;
+          k_sym  = xsimd::fnma(k_odd, zv, k_even);
+          xsimd::shuffle(k_sym, k_prev, shuffle_batch).store_aligned(ker + offset);
+        } else {
+          xsimd::swizzle(xsimd::fnma(k_odd, zv, k_even), shuffle_batch)
+              .store_aligned(ker + offset);
+        }
+      }
+    }
   } else {
-#pragma omp parallel for num_threads(maxnthr) schedule(static, 1000000)
-    for (BIGINT i = 0; i < M; i++) // here omp helps xeon, hinders i7
-      sort_indices[i] = i;         // the identity permutation
-    if (opts.debug)
-      printf("\tnot sorted (sort=%d): \t%.3g s\n", (int)opts.sort, timer.elapsedsec());
+    const simd_type zv(z);
+    for (uint8_t i = 0; i < w; i += simd_size) {
+      auto k = simd_type::load_aligned(padded_coeffs[0].data() + i);
+      for (uint8_t j = 1; j < nc; ++j) {
+        const auto cji = simd_type::load_aligned(padded_coeffs[j].data() + i);
+        k              = xsimd::fma(k, zv, cji);
+      }
+      k.store_aligned(ker + i);
+    }
   }
-  return did_sort;
 }

-int spreadinterpSorted(const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2,
-                       const UBIGINT N3, FLT *data_uniform, const UBIGINT M,
-                       FLT *FINUFFT_RESTRICT kx, FLT *FINUFFT_RESTRICT ky,
-                       FLT *FINUFFT_RESTRICT kz, FLT *FINUFFT_RESTRICT data_nonuniform,
-                       const finufft_spread_opts &opts, int did_sort)
-/* Logic to select the main spreading (dir=1) vs interpolation (dir=2) routine.
-   See spreadinterp() above for inputs arguments and definitions.
-   Return value should always be 0 (no error reporting).
-   Split out by Melody Shih, Jun 2018; renamed Barnett 5/20/20.
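
The symmetric loop above evaluates each polynomial split into even and odd parts in z^2, so every mirrored right-hand node comes nearly free: with E and O polynomials in z^2, p(z) = O(z^2)*z + E(z^2) (the fma) and p(-z) = E(z^2) - O(z^2)*z (the fnma). Stripped of the batching, padding, and even/odd split, the underlying Horner recurrence is just the following (sketch: coeffs stands in for padded_coeffs; w and nc are the compile-time constants):

#include <vector>

void horner_sketch(double z, int w, int nc,
                   const std::vector<std::vector<double>> &coeffs, double *ker) {
  for (int i = 0; i < w; ++i) {   // one piecewise polynomial per kernel node
    double k = coeffs[0][i];      // highest-degree coefficient first
    for (int j = 1; j < nc; ++j) k = k * z + coeffs[j][i];
    ker[i] = k;
  }
}
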
-*/ -{ - if (opts.spread_direction == 1) // ========= direction 1 (spreading) ======= - spreadSorted(sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform, - opts, did_sort); - - else // ================= direction 2 (interpolation) =========== - interpSorted(sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform, - opts); - - return 0; +template +static void interp_line_wrap(T *FINUFFT_RESTRICT target, const T *du, const T *ker, + const BIGINT i1, const UBIGINT N1) { + /* This function is called when the kernel wraps around the grid. It is + slower than interp_line. + M. Barbone July 2024: - moved the logic to a separate function + - using fused multiply-add (fma) for better performance + */ + std::array out{0}; + BIGINT j = i1; + if (i1 < 0) { // wraps at left + j += BIGINT(N1); + for (uint8_t dx = 0; dx < -i1; ++dx, ++j) { + out[0] = std::fma(du[2 * j], ker[dx], out[0]); + out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); + } + j -= BIGINT(N1); + for (uint8_t dx = -i1; dx < ns; ++dx, ++j) { + out[0] = std::fma(du[2 * j], ker[dx], out[0]); + out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); + } + } else if (i1 + ns >= N1) { // wraps at right + for (uint8_t dx = 0; dx < N1 - i1; ++dx, ++j) { + out[0] = std::fma(du[2 * j], ker[dx], out[0]); + out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); + } + j -= BIGINT(N1); + for (uint8_t dx = N1 - i1; dx < ns; ++dx, ++j) { + out[0] = std::fma(du[2 * j], ker[dx], out[0]); + out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); + } + } else { + // padding is okay for ker, but it might spill over du array + // so this checks for that case and does not explicitly vectorize + for (uint8_t dx = 0; dx < ns; ++dx, ++j) { + out[0] = std::fma(du[2 * j], ker[dx], out[0]); + out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); + } + } + target[0] = out[0]; + target[1] = out[1]; } -// -------------------------------------------------------------------------- -int spreadSorted(const BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, - FLT *FINUFFT_RESTRICT data_uniform, UBIGINT M, FLT *FINUFFT_RESTRICT kx, - FLT *FINUFFT_RESTRICT ky, FLT *FINUFFT_RESTRICT kz, - const FLT *data_nonuniform, const finufft_spread_opts &opts, - int did_sort) -// Spread NU pts in sorted order to a uniform grid. See spreadinterp() for doc. -{ - CNTime timer{}; - const auto ndims = ndims_from_Ns(N1, N2, N3); - const auto N = N1 * N2 * N3; // output array size - const auto ns = opts.nspread; // abbrev. for w, kernel width - auto nthr = MY_OMP_GET_MAX_THREADS(); // guess # threads to use to spread - if (opts.nthreads > 0) nthr = opts.nthreads; // user override, now without limit -#ifndef _OPENMP - nthr = 1; // single-threaded lib must override user -#endif - if (opts.debug) - printf("\tspread %dD (M=%lld; N1=%lld,N2=%lld,N3=%lld), nthr=%d\n", ndims, - (long long)M, (long long)N1, (long long)N2, (long long)N3, nthr); - timer.start(); - std::fill(data_uniform, data_uniform + 2 * N, 0.0); // zero the output array - if (opts.debug) printf("\tzero output array\t%.3g s\n", timer.elapsedsec()); - if (M == 0) // no NU pts, we're done - return 0; - - auto spread_single = (nthr == 1) || (M * 100 < N); // low-density heuristic? - spread_single = false; // for now - timer.start(); - if (spread_single) { // ------- Basic single-core t1 spreading ------ - for (UBIGINT j = 0; j < M; j++) { - // *** todo, not urgent - // ... (question is: will the index wrapping per NU pt slow it down?) 
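
interp_line_wrap, added above, trades a per-tap modulo for two explicit wrap segments. The behavior it must reproduce is the obvious periodic sum (slow reference sketch, not library code; interleaved re/im storage as in the library):

#include <complex>

template<typename T>
std::complex<T> interp_line_naive(const T *du, const T *ker, long i1, long N1, int ns) {
  std::complex<T> out{0, 0};
  for (int dx = 0; dx < ns; ++dx) {
    const long j = ((i1 + dx) % N1 + N1) % N1; // periodic fold; handles i1 < 0
    out += std::complex<T>(du[2 * j], du[2 * j + 1]) * ker[dx];
  }
  return out;
}
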
- } - if (opts.debug) printf("\tt1 simple spreading:\t%.3g s\n", timer.elapsedsec()); - } else { // ------- Fancy multi-core blocked t1 spreading ---- - // Splits sorted inds (jfm's advanced2), could double RAM. - // choose nb (# subprobs) via used nthreads: - auto nb = std::min((UBIGINT)nthr, M); // simply split one subprob per thr... - if (nb * (BIGINT)opts.max_subproblem_size < M) { // ...or more subprobs to cap size - nb = 1 + (M - 1) / opts.max_subproblem_size; // int div does - // ceil(M/opts.max_subproblem_size) - if (opts.debug) - printf("\tcapping subproblem sizes to max of %d\n", opts.max_subproblem_size); - } - if (M * 1000 < N) { // low-density heuristic: one thread per NU pt! - nb = M; - if (opts.debug) printf("\tusing low-density speed rescue nb=M...\n"); - } - if (!did_sort && nthr == 1) { - nb = 1; - if (opts.debug) printf("\tunsorted nthr=1: forcing single subproblem...\n"); - } - if (opts.debug && nthr > opts.atomic_threshold) - printf("\tnthr big: switching add_wrapped OMP from critical to atomic (!)\n"); - - std::vector brk(nb + 1); // NU index breakpoints defining nb subproblems - for (int p = 0; p <= nb; ++p) brk[p] = (M * p + nb - 1) / nb; - -#pragma omp parallel num_threads(nthr) - { - // local copies of NU pts and data for each subproblem - std::vector kx0{}, ky0{}, kz0{}, dd0{}, du0{}; -#pragma omp for schedule(dynamic, 1) // each is big - for (int isub = 0; isub < nb; isub++) { // Main loop through the subproblems - const auto M0 = brk[isub + 1] - brk[isub]; // # NU pts in this subproblem - // copy the location and data vectors for the nonuniform points - kx0.resize(M0); - ky0.resize(M0 * (N2 > 1)); - kz0.resize(M0 * (N3 > 1)); - dd0.resize(2 * M0); // complex strength data - for (auto j = 0; j < M0; j++) { // todo: can avoid this copying? 
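
The blocked spreader removed above (this hunk shows only the deletion side of the refactor) splits the M sorted points into nb contiguous subproblems via integer ceiling division. The breakpoint rule in isolation (sketch):

#include <vector>

std::vector<long> make_breakpoints(long M, int nb) {
  std::vector<long> brk(nb + 1);
  for (int p = 0; p <= nb; ++p) brk[p] = (M * p + nb - 1) / nb; // ceil(M*p/nb)
  return brk; // subproblem p owns sorted indices [brk[p], brk[p+1])
}
// make_breakpoints(10, 3) -> {0, 4, 7, 10}: batch sizes 4, 3, 3.
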
- const auto kk = sort_indices[j + brk[isub]]; // NU pt from subprob index list - kx0[j] = fold_rescale(kx[kk], N1); - if (N2 > 1) ky0[j] = fold_rescale(ky[kk], N2); - if (N3 > 1) kz0[j] = fold_rescale(kz[kk], N3); - dd0[j * 2] = data_nonuniform[kk * 2]; // real part - dd0[j * 2 + 1] = data_nonuniform[kk * 2 + 1]; // imag part - } - // get the subgrid which will include padding by roughly nspread/2 - // get_subgrid sets - BIGINT offset1, offset2, offset3, padded_size1, size1, size2, size3; - // sets offsets and sizes - get_subgrid(offset1, offset2, offset3, padded_size1, size1, size2, size3, M0, - kx0.data(), ky0.data(), kz0.data(), ns, ndims); - if (opts.debug > 1) { - print_subgrid_info(ndims, offset1, offset2, offset3, padded_size1, size1, size2, - size3, M0); - } - // allocate output data for this subgrid - du0.resize(2 * padded_size1 * size2 * size3); // complex - // Spread to subgrid without need for bounds checking or wrapping - if (!(opts.flags & TF_OMIT_SPREADING)) { - if (ndims == 1) - spread_subproblem_1d(offset1, padded_size1, du0.data(), M0, kx0.data(), - dd0.data(), opts); - else if (ndims == 2) - spread_subproblem_2d(offset1, offset2, padded_size1, size2, du0.data(), M0, - kx0.data(), ky0.data(), dd0.data(), opts); - else - spread_subproblem_3d(offset1, offset2, offset3, padded_size1, size2, size3, - du0.data(), M0, kx0.data(), ky0.data(), kz0.data(), - dd0.data(), opts); - } - // do the adding of subgrid to output - if (!(opts.flags & TF_OMIT_WRITE_TO_GRID)) { - if (nthr > opts.atomic_threshold) { // see above for debug reporting - add_wrapped_subgrid(offset1, offset2, offset3, padded_size1, size1, - size2, size3, N1, N2, N3, data_uniform, - du0.data()); // R Blackwell's atomic version - } else { -#pragma omp critical - add_wrapped_subgrid(offset1, offset2, offset3, padded_size1, size1, - size2, size3, N1, N2, N3, data_uniform, - du0.data()); - } - } - } // end main loop over subprobs - } - if (opts.debug) - printf("\tt1 fancy spread: \t%.3g s (%ld subprobs)\n", timer.elapsedsec(), nb); - } // end of choice of which t1 spread type to use - return 0; -}; - -// -------------------------------------------------------------------------- -template -FINUFFT_NEVER_INLINE static int interpSorted_kernel( - const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, - const FLT *data_uniform, const UBIGINT M, FLT *FINUFFT_RESTRICT kx, - FLT *FINUFFT_RESTRICT ky, FLT *FINUFFT_RESTRICT kz, - FLT *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts) -// Interpolate to NU pts in sorted order from a uniform grid. -// See spreadinterp() for doc. 
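
Each subproblem spreads onto a private padded subgrid du0, which is then folded into the periodic output grid under a critical section or atomics. The wrap-add in one dimension, real-valued for brevity (sketch only; the library version works on interleaved complex data in up to 3D):

void add_wrapped_subgrid_1d(long offset, long size, long N,
                            double *grid, const double *du0) {
  long j = ((offset % N) + N) % N; // fold the subgrid's global start index
  for (long i = 0; i < size; ++i) {
    grid[j] += du0[i];
    if (++j == N) j = 0;           // assumes size <= N, so it wraps at most once
  }
}
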
-{ - using simd_type = PaddedSIMD; - using arch_t = typename simd_type::arch_type; - static constexpr auto alignment = arch_t::alignment(); - static constexpr auto simd_size = simd_type::size; - static constexpr auto ns2 = ns * FLT(0.5); // half spread width, used as stencil shift - - CNTime timer{}; - const auto ndims = ndims_from_Ns(N1, N2, N3); - auto nthr = MY_OMP_GET_MAX_THREADS(); // guess # threads to use to interp - if (opts.nthreads > 0) nthr = opts.nthreads; // user override, now without limit -#ifndef _OPENMP - nthr = 1; // single-threaded lib must override user -#endif - if (opts.debug) - printf("\tinterp %dD (M=%lld; N1=%lld,N2=%lld,N3=%lld), nthr=%d\n", ndims, - (long long)M, (long long)N1, (long long)N2, (long long)N3, nthr); - timer.start(); -#pragma omp parallel num_threads(nthr) - { - static constexpr auto CHUNKSIZE = simd_size; // number of targets per chunk - alignas(alignment) UBIGINT jlist[CHUNKSIZE]; - alignas(alignment) FLT xjlist[CHUNKSIZE], yjlist[CHUNKSIZE], zjlist[CHUNKSIZE]; - alignas(alignment) FLT outbuf[2 * CHUNKSIZE]; - // Kernels: static alloc is faster, so we do it for up to 3D... - alignas(alignment) std::array kernel_values{0}; - auto *FINUFFT_RESTRICT ker1 = kernel_values.data(); - auto *FINUFFT_RESTRICT ker2 = kernel_values.data() + MAX_NSPREAD; - auto *FINUFFT_RESTRICT ker3 = kernel_values.data() + 2 * MAX_NSPREAD; - - // Loop over interpolation chunks - // main loop over NU trgs, interp each from U - // (note: windows omp doesn't like unsigned loop vars) -#pragma omp for schedule(dynamic, 1000) // assign threads to NU targ pts: - for (BIGINT i = 0; i < M; i += CHUNKSIZE) { - // Setup buffers for this chunk - const UBIGINT bufsize = (i + CHUNKSIZE > M) ? M - i : CHUNKSIZE; - for (int ibuf = 0; ibuf < bufsize; ibuf++) { - UBIGINT j = sort_indices[i + ibuf]; - jlist[ibuf] = j; - xjlist[ibuf] = fold_rescale(kx[j], N1); - if (ndims >= 2) yjlist[ibuf] = fold_rescale(ky[j], N2); - if (ndims == 3) zjlist[ibuf] = fold_rescale(kz[j], N3); - } - - // Loop over targets in chunk - for (int ibuf = 0; ibuf < bufsize; ibuf++) { - const auto xj = xjlist[ibuf]; - const auto yj = (ndims > 1) ? yjlist[ibuf] : 0; - const auto zj = (ndims > 2) ? zjlist[ibuf] : 0; - - auto *FINUFFT_RESTRICT target = outbuf + 2 * ibuf; - - // coords (x,y,z), spread block corner index (i1,i2,i3) of current NU targ - const auto i1 = BIGINT(std::ceil(xj - ns2)); // leftmost grid index - const auto i2 = (ndims > 1) ? BIGINT(std::ceil(yj - ns2)) : 0; // min y grid index - const auto i3 = (ndims > 2) ? BIGINT(std::ceil(zj - ns2)) : 0; // min z grid index - - const auto x1 = std::ceil(xj - ns2) - xj; // shift of ker center, in [-w/2,-w/2+1] - const auto x2 = (ndims > 1) ? std::ceil(yj - ns2) - yj : 0; - const auto x3 = (ndims > 2) ? std::ceil(zj - ns2) - zj : 0; - - // eval kernel values patch and use to interpolate from uniform data... 
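
The stencil placement above works as follows: for a folded target coordinate xj, the kernel touches ns consecutive grid points starting at i1, and x1 is the fractional shift handed to the kernel evaluator, by construction in [-ns/2, -ns/2 + 1]. In isolation (sketch):

#include <cmath>

void stencil_placement(double xj, int ns, long &i1, double &x1) {
  const double ns2 = 0.5 * ns;    // half the kernel width
  i1 = long(std::ceil(xj - ns2)); // leftmost grid index covered
  x1 = std::ceil(xj - ns2) - xj;  // kernel-center shift
}
// e.g. xj = 37.3, ns = 8: i1 = ceil(33.3) = 34, x1 = 34 - 37.3 = -3.3
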
- if (!(opts.flags & TF_OMIT_SPREADING)) { - switch (ndims) { - case 1: - ker_eval(kernel_values.data(), opts, x1); - interp_line(target, data_uniform, ker1, i1, N1); - break; - case 2: - ker_eval(kernel_values.data(), opts, x1, x2); - interp_square(target, data_uniform, ker1, ker2, i1, i2, N1, - N2); - break; - case 3: - ker_eval(kernel_values.data(), opts, x1, x2, - x3); - interp_cube(target, data_uniform, ker1, ker2, ker3, i1, i2, i3, - N1, N2, N3); - break; - default: // can't get here - FINUFFT_UNREACHABLE; - break; - } - } - } // end loop over targets in chunk - - // Copy result buffer to output array - for (int ibuf = 0; ibuf < bufsize; ibuf++) { - const UBIGINT j = jlist[ibuf]; - data_nonuniform[2 * j] = outbuf[2 * ibuf]; - data_nonuniform[2 * j + 1] = outbuf[2 * ibuf + 1]; - } - - } // end NU targ loop - } // end parallel section - if (opts.debug) printf("\tt2 spreading loop: \t%.3g s\n", timer.elapsedsec()); - return 0; -} - -template -static int interpSorted_dispatch( - const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, - FLT *FINUFFT_RESTRICT data_uniform, const UBIGINT M, FLT *FINUFFT_RESTRICT kx, - FLT *FINUFFT_RESTRICT ky, FLT *FINUFFT_RESTRICT kz, - FLT *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts) { - static_assert(MIN_NSPREAD <= NS && NS <= MAX_NSPREAD, - "NS must be in the range (MIN_NSPREAD, MAX_NSPREAD)"); - if constexpr (NS == MIN_NSPREAD) { // Base case - if (opts.kerevalmeth) - return interpSorted_kernel( - sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform, opts); - else { - return interpSorted_kernel( - sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform, opts); - } - } else { - if (opts.nspread == NS) { - if (opts.kerevalmeth) { - return interpSorted_kernel(sort_indices, N1, N2, N3, data_uniform, M, - kx, ky, kz, data_nonuniform, opts); - } else { - return interpSorted_kernel(sort_indices, N1, N2, N3, data_uniform, M, - kx, ky, kz, data_nonuniform, opts); - } - } else { - return interpSorted_dispatch(sort_indices, N1, N2, N3, data_uniform, M, kx, - ky, kz, data_nonuniform, opts); - } - } -} - -int interpSorted(const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2, - const UBIGINT N3, FLT *FINUFFT_RESTRICT data_uniform, const UBIGINT M, - FLT *FINUFFT_RESTRICT kx, FLT *FINUFFT_RESTRICT ky, - FLT *FINUFFT_RESTRICT kz, FLT *FINUFFT_RESTRICT data_nonuniform, - const finufft_spread_opts &opts) { - return interpSorted_dispatch(sort_indices, N1, N2, N3, data_uniform, M, kx, - ky, kz, data_nonuniform, opts); -} - -/////////////////////////////////////////////////////////////////////////// - -int setup_spreader(finufft_spread_opts &opts, FLT eps, double upsampfac, int kerevalmeth, - int debug, int showwarn, int dim) -/* Initializes spreader kernel parameters given desired NUFFT tolerance eps, - upsampling factor (=sigma in paper, or R in Dutt-Rokhlin), ker eval meth - (either 0:exp(sqrt()), 1: Horner ppval), and some debug-level flags. - Also sets all default options in finufft_spread_opts. See finufft_spread_opts.h for - opts. dim is spatial dimension (1,2, or 3). See finufft.cpp:finufft_plan() for where - upsampfac is set. Must call this before any kernel evals done, otherwise segfault - likely. Returns: 0 : success FINUFFT_WARN_EPS_TOO_SMALL : requested eps cannot be - achieved, but proceed with best possible eps otherwise : failure (see codes in defs.h); - spreading must not proceed Barnett 2017. debug, loosened eps logic 6/14/20. 
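
interpSorted_dispatch above converts the runtime kernel width opts.nspread into a compile-time template parameter by walking NS down recursively, so each width gets a fully specialized kernel. The pattern in miniature (illustrative bounds 2..16; the real code also branches on kerevalmeth):

template<int NS> int run_kernel() { return NS; } // stand-in for the real kernel

template<int NS = 16> int dispatch(int ns) {
  if constexpr (NS == 2) return run_kernel<2>(); // base case, like MIN_NSPREAD
  else return (ns == NS) ? run_kernel<NS>() : dispatch<NS - 1>(ns);
}
// dispatch(7) selects run_kernel<7>, picked at runtime from a set of
// instantiations fixed at compile time.
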
-*/ -{ - if (upsampfac != 2.0 && upsampfac != 1.25) { // nonstandard sigma - if (kerevalmeth == 1) { - fprintf(stderr, - "FINUFFT setup_spreader: nonstandard upsampfac=%.3g cannot be handled by " - "kerevalmeth=1\n", - upsampfac); - return FINUFFT_ERR_HORNER_WRONG_BETA; - } - if (upsampfac <= 1.0) { // no digits would result - fprintf(stderr, "FINUFFT setup_spreader: error, upsampfac=%.3g is <=1.0\n", - upsampfac); - return FINUFFT_ERR_UPSAMPFAC_TOO_SMALL; - } - // calling routine must abort on above errors, since opts is garbage! - if (showwarn && upsampfac > 4.0) - fprintf(stderr, - "FINUFFT setup_spreader warning: upsampfac=%.3g way too large to be " - "beneficial.\n", - upsampfac); - } - - // write out default finufft_spread_opts (some overridden in setup_spreader_for_nufft) - opts.spread_direction = 0; // user should always set to 1 or 2 as desired - opts.sort = 2; // 2:auto-choice - opts.kerpad = 0; // affects only evaluate_kernel_vector - opts.kerevalmeth = kerevalmeth; - opts.upsampfac = upsampfac; - opts.nthreads = 0; // all avail - opts.sort_threads = 0; // 0:auto-choice - // heuristic dir=1 chunking for nthr>>1, typical for intel i7 and skylake... - opts.max_subproblem_size = (dim == 1) ? 10000 : 100000; - opts.flags = 0; // 0:no timing flags (>0 for experts only) - opts.debug = 0; // 0:no debug output - // heuristic nthr above which switch OMP critical to atomic (add_wrapped...): - opts.atomic_threshold = 10; // R Blackwell's value - - int ns, ier = 0; // Set kernel width w (aka ns, nspread) then copy to opts... - if (eps < EPSILON) { // safety; there's no hope of beating e_mach - if (showwarn) - fprintf(stderr, "%s warning: increasing tol=%.3g to eps_mach=%.3g.\n", __func__, - (double)eps, (double)EPSILON); - eps = EPSILON; // only changes local copy (not any opts) - ier = FINUFFT_WARN_EPS_TOO_SMALL; - } - if (upsampfac == 2.0) // standard sigma (see SISC paper) - ns = std::ceil(-log10(eps / (FLT)10.0)); // 1 digit per power of 10 - else // custom sigma - ns = std::ceil(-log(eps) / (PI * sqrt(1.0 - 1.0 / upsampfac))); // formula, gam=1 - ns = max(2, ns); // (we don't have ns=1 version yet) - if (ns > MAX_NSPREAD) { // clip to fit allocated arrays, Horner rules - if (showwarn) - fprintf(stderr, - "%s warning: at upsampfac=%.3g, tol=%.3g would need kernel width ns=%d; " - "clipping to max %d.\n", - __func__, upsampfac, (double)eps, ns, MAX_NSPREAD); - ns = MAX_NSPREAD; - ier = FINUFFT_WARN_EPS_TOO_SMALL; - } - opts.nspread = ns; - // setup for reference kernel eval (via formula): select beta width param... - // (even when kerevalmeth=1, this ker eval needed for FTs in onedim_*_kernel) - opts.ES_halfwidth = (double)ns / 2; // constants to help (see below routines) - opts.ES_c = 4.0 / (double)(ns * ns); - double betaoverns = 2.30; // gives decent betas for default sigma=2.0 - if (ns == 2) betaoverns = 2.20; // some small-width tweaks... - if (ns == 3) betaoverns = 2.26; - if (ns == 4) betaoverns = 2.38; - if (upsampfac != 2.0) { // again, override beta for custom sigma - FLT gamma = 0.97; // must match devel/gen_all_horner_C_code.m ! 
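
The width/beta selection above, distilled into a worked sketch (doubles only; the eps clipping to machine precision, the MAX_NSPREAD clamp, and the custom-sigma beta override via gamma are all omitted):

#include <algorithm>
#include <cmath>

void choose_ns_beta(double eps, double upsampfac, int &ns, double &beta) {
  if (upsampfac == 2.0)         // standard sigma: ~1 digit per point of width
    ns = (int)std::ceil(-std::log10(eps / 10.0));
  else                          // general-sigma formula, gamma = 1
    ns = (int)std::ceil(-std::log(eps) / (M_PI * std::sqrt(1.0 - 1.0 / upsampfac)));
  ns = std::max(2, ns);
  double betaoverns = 2.30;     // default sigma=2.0 tuning
  if (ns == 2) betaoverns = 2.20; // small-width tweaks
  if (ns == 3) betaoverns = 2.26;
  if (ns == 4) betaoverns = 2.38;
  beta = betaoverns * ns;
}
// eps = 1e-6 at sigma = 2.0 gives ns = 7 and beta = 2.30 * 7 = 16.1.
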
- betaoverns = gamma * PI * (1.0 - 1.0 / (2 * upsampfac)); // formula based on cutoff - } - opts.ES_beta = betaoverns * ns; // set the kernel beta parameter - if (debug) - printf("%s (kerevalmeth=%d) eps=%.3g sigma=%.3g: chose ns=%d beta=%.3g\n", __func__, - kerevalmeth, (double)eps, upsampfac, ns, opts.ES_beta); - - return ier; -} - -FLT evaluate_kernel(FLT x, const finufft_spread_opts &opts) -/* ES ("exp sqrt") kernel evaluation at single real argument: - phi(x) = exp(beta.(sqrt(1 - (2x/n_s)^2) - 1)), for |x| < nspread/2 - related to an asymptotic approximation to the Kaiser--Bessel, itself an - approximation to prolate spheroidal wavefunction (PSWF) of order 0. - This is the "reference implementation", used by eg finufft/onedim_* 2/17/17. - Rescaled so max is 1, Barnett 7/21/24 -*/ -{ - if (abs(x) >= (FLT)opts.ES_halfwidth) - // if spreading/FT careful, shouldn't need this if, but causes no speed hit - return 0.0; - else - return exp((FLT)opts.ES_beta * (sqrt((FLT)1.0 - (FLT)opts.ES_c * x * x) - (FLT)1.0)); -} - -template -void set_kernel_args(FLT *args, FLT x) noexcept -// Fills vector args[] with kernel arguments x, x+1, ..., x+ns-1. -// needed for the vectorized kernel eval of Ludvig af K. -{ - for (int i = 0; i < ns; i++) args[i] = x + (FLT)i; -} -template -void evaluate_kernel_vector(FLT *ker, FLT *args, const finufft_spread_opts &opts) noexcept -/* Evaluate ES kernel for a vector of N arguments; by Ludvig af K. - If opts.kerpad true, args and ker must be allocated for Npad, and args is - written to (to pad to length Npad), only first N outputs are correct. - Barnett 4/24/18 option to pad to mult of 4 for better SIMD vectorization. - Rescaled so max is 1, Barnett 7/21/24 - - Obsolete (replaced by Horner), but keep around for experimentation since - works for arbitrary beta. Formula must match reference implementation. -*/ -{ - FLT b = (FLT)opts.ES_beta; - FLT c = (FLT)opts.ES_c; - if (!(opts.flags & TF_OMIT_EVALUATE_KERNEL)) { - // Note (by Ludvig af K): Splitting kernel evaluation into two loops - // seems to benefit auto-vectorization. - // gcc 5.4 vectorizes first loop; gcc 7.2 vectorizes both loops - int Npad = N; - if (opts.kerpad) { // since always same branch, no speed hit - Npad = 4 * (1 + (N - 1) / 4); // pad N to mult of 4; help i7 GCC, not xeon - for (int i = N; i < Npad; ++i) // pad with 1-3 zeros for safe eval - args[i] = 0.0; - } - for (int i = 0; i < Npad; i++) { // Loop 1: Compute exponential arguments - // care! 1.0 is double... - ker[i] = b * (sqrt((FLT)1.0 - c * args[i] * args[i]) - (FLT)1.0); - } - if (!(opts.flags & TF_OMIT_EVALUATE_EXPONENTIAL)) - for (int i = 0; i < Npad; i++) // Loop 2: Compute exponentials - ker[i] = exp(ker[i]); - if (opts.kerpad) { - // padded part should be zero, in spread_subproblem_nd_kernels, there are - // out of bound writes to trg arrays - for (int i = N; i < Npad; ++i) ker[i] = 0.0; - } - } else { - for (int i = 0; i < N; i++) // dummy for timing only - ker[i] = 1.0; - } - // Separate check from arithmetic (Is this really needed? doesn't slow down) - for (int i = 0; i < N; i++) - if (abs(args[i]) >= (FLT)opts.ES_halfwidth) ker[i] = 0.0; -} - -template // aka ns -void eval_kernel_vec_Horner(FLT *FINUFFT_RESTRICT ker, const FLT x, - const finufft_spread_opts &opts) noexcept -/* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at -x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns. -This is the current evaluation method, since it's faster (except i7 w=16). -Two upsampfacs implemented. 
Params must match ref formula. Barnett 4/24/18 */ - -{ - // scale so local grid offset z in[-1,1] - const FLT z = std::fma(FLT(2.0), x, FLT(w - 1)); - using arch_t = typename simd_type::arch_type; - static constexpr auto alignment = arch_t::alignment(); - static constexpr auto simd_size = simd_type::size; - static constexpr auto padded_ns = (w + simd_size - 1) & ~(simd_size - 1); - static constexpr auto horner_coeffs = []() constexpr noexcept { - if constexpr (upsampfact == 200) { - return get_horner_coeffs_200(); - } else if constexpr (upsampfact == 125) { - return get_horner_coeffs_125(); - } - }(); - static constexpr auto nc = horner_coeffs.size(); - static constexpr auto use_ker_sym = (simd_size < w); - - alignas(alignment) static constexpr auto padded_coeffs = - pad_2D_array_with_zeros(horner_coeffs); - - // use kernel symmetry trick if w > simd_size - if constexpr (use_ker_sym) { - static constexpr uint8_t tail = w % simd_size; - static constexpr uint8_t if_odd_degree = ((nc + 1) % 2); - static constexpr uint8_t offset_start = tail ? w - tail : w - simd_size; - static constexpr uint8_t end_idx = (w + (tail > 0)) / 2; - const simd_type zv{z}; - const auto z2v = zv * zv; - - // some xsimd constant for shuffle or inverse - static constexpr auto shuffle_batch = []() constexpr noexcept { - if constexpr (tail) { - return xsimd::make_batch_constant, arch_t, - shuffle_index>(); - } else { - return xsimd::make_batch_constant, arch_t, - reverse_index>(); - } - }(); - - // process simd vecs - simd_type k_prev, k_sym{0}; - for (uint8_t i{0}, offset = offset_start; i < end_idx; - i += simd_size, offset -= simd_size) { - auto k_odd = [i]() constexpr noexcept { - if constexpr (if_odd_degree) { - return simd_type::load_aligned(padded_coeffs[0].data() + i); - } else { - return simd_type{0}; - } - }(); - auto k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i); - for (uint8_t j{1 + if_odd_degree}; j < nc; j += 2) { - const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i); - const auto cji_even = simd_type::load_aligned(padded_coeffs[j + 1].data() + i); - k_odd = xsimd::fma(k_odd, z2v, cji_odd); - k_even = xsimd::fma(k_even, z2v, cji_even); - } - // left part - xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i); - // right part symmetric to the left part - if (offset >= end_idx) { - if constexpr (tail) { - // to use aligned store, we need shuffle the previous k_sym and current k_sym - k_prev = k_sym; - k_sym = xsimd::fnma(k_odd, zv, k_even); - xsimd::shuffle(k_sym, k_prev, shuffle_batch).store_aligned(ker + offset); - } else { - xsimd::swizzle(xsimd::fnma(k_odd, zv, k_even), shuffle_batch) - .store_aligned(ker + offset); - } - } - } - } else { - const simd_type zv(z); - for (uint8_t i = 0; i < w; i += simd_size) { - auto k = simd_type::load_aligned(padded_coeffs[0].data() + i); - for (uint8_t j = 1; j < nc; ++j) { - const auto cji = simd_type::load_aligned(padded_coeffs[j].data() + i); - k = xsimd::fma(k, zv, cji); - } - k.store_aligned(ker + i); - } - } -} - -template -static void interp_line_wrap(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker, - const BIGINT i1, const UBIGINT N1) { - /* This function is called when the kernel wraps around the grid. It is - slower than interp_line. - M. 
Barbone July 2024: - moved the logic to a separate function - - using fused multiply-add (fma) for better performance - */ - std::array out{0}; - BIGINT j = i1; - if (i1 < 0) { // wraps at left - j += BIGINT(N1); - for (uint8_t dx = 0; dx < -i1; ++dx, ++j) { - out[0] = std::fma(du[2 * j], ker[dx], out[0]); - out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); - } - j -= BIGINT(N1); - for (uint8_t dx = -i1; dx < ns; ++dx, ++j) { - out[0] = std::fma(du[2 * j], ker[dx], out[0]); - out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); - } - } else if (i1 + ns >= N1) { // wraps at right - for (uint8_t dx = 0; dx < N1 - i1; ++dx, ++j) { - out[0] = std::fma(du[2 * j], ker[dx], out[0]); - out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); - } - j -= BIGINT(N1); - for (uint8_t dx = N1 - i1; dx < ns; ++dx, ++j) { - out[0] = std::fma(du[2 * j], ker[dx], out[0]); - out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); - } - } else { - // padding is okay for ker, but it might spill over du array - // so this checks for that case and does not explicitly vectorize - for (uint8_t dx = 0; dx < ns; ++dx, ++j) { - out[0] = std::fma(du[2 * j], ker[dx], out[0]); - out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); - } - } - target[0] = out[0]; - target[1] = out[1]; -} - -template -void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker, - const BIGINT i1, const UBIGINT N1) { +template> +static void interp_line(T *FINUFFT_RESTRICT target, const T *du, const T *ker, BIGINT i1, + UBIGINT N1) { /* 1D interpolate complex values from size-ns block of the du (uniform grid data) array to a single complex output value "target", using as weights the 1d kernel evaluation list ker1. @@ -933,16 +445,16 @@ void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker, limitation */ using arch_t = typename simd_type::arch_type; - static constexpr auto padding = get_padding(); + static constexpr auto padding = get_padding(); static constexpr auto alignment = arch_t::alignment(); static constexpr auto simd_size = simd_type::size; static constexpr auto regular_part = (2 * ns + padding) & (-(2 * simd_size)); - std::array out{0}; + std::array out{0}; const auto j = i1; // removing the wrapping leads up to 10% speedup in certain cases // moved the wrapping to another function to reduce instruction cache pressure if (i1 < 0 || i1 + ns >= N1 || i1 + ns + (padding + 1) / 2 >= N1) { - return interp_line_wrap(target, du, ker, i1, N1); + return interp_line_wrap(target, du, ker, i1, N1); } else { // doesn't wrap // logic largely similar to spread 1D kernel, please see the explanation there // for the first part of this code @@ -953,8 +465,8 @@ void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker, const auto ker_v = simd_type::load_aligned(ker + dx / 2); const auto du_pt0 = simd_type::load_unaligned(du_ptr + dx); const auto du_pt1 = simd_type::load_unaligned(du_ptr + dx + simd_size); - const auto ker0low = xsimd::swizzle(ker_v, zip_low_index); - const auto ker0hi = xsimd::swizzle(ker_v, zip_hi_index); + const auto ker0low = xsimd::swizzle(ker_v, zip_low_index); + const auto ker0hi = xsimd::swizzle(ker_v, zip_hi_index); res_low = xsimd::fma(ker0low, du_pt0, res_low); res_hi = xsimd::fma(ker0hi, du_pt1, res_hi); } @@ -962,7 +474,7 @@ void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker, if constexpr (regular_part < 2 * ns) { const auto ker0 = simd_type::load_unaligned(ker + (regular_part / 2)); const auto du_pt = simd_type::load_unaligned(du_ptr + regular_part); - 
const auto ker0low = xsimd::swizzle(ker0, zip_low_index); + const auto ker0low = xsimd::swizzle(ker0, zip_low_index); res_low = xsimd::fma(ker0low, du_pt, res_low); } @@ -994,22 +506,22 @@ void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker, target[1] = out[1]; } -template -static void interp_square_wrap(FLT *FINUFFT_RESTRICT target, const FLT *du, - const FLT *ker1, const FLT *ker2, const BIGINT i1, - const BIGINT i2, const UBIGINT N1, const UBIGINT N2) { +template +static void interp_square_wrap(T *FINUFFT_RESTRICT target, const T *du, const T *ker1, + const T *ker2, const BIGINT i1, const BIGINT i2, + const UBIGINT N1, const UBIGINT N2) { /* * This function is called when the kernel wraps around the grid. It is slower than * the non wrapping version. * There is an extra case for when ker is padded and spills over the du array. * In this case uses the old non wrapping version. */ - std::array out{0}; + std::array out{0}; using arch_t = typename simd_type::arch_type; static constexpr auto alignment = arch_t::alignment(); if (i1 >= 0 && i1 + ns <= N1 && i2 >= 0 && i2 + ns <= N2) { // store a horiz line (interleaved real,imag) - alignas(alignment) std::array line{0}; + alignas(alignment) std::array line{0}; // add remaining const-y lines to the line (expensive inner loop) for (uint8_t dy{0}; dy < ns; ++dy) { const auto *l_ptr = du + 2 * (N1 * (i2 + dy) + i1); // (see above) @@ -1047,10 +559,9 @@ static void interp_square_wrap(FLT *FINUFFT_RESTRICT target, const FLT *du, target[1] = out[1]; } -template -void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, - const FLT *ker2, const BIGINT i1, const BIGINT i2, const UBIGINT N1, - const UBIGINT N2) +template> +static void interp_square(T *FINUFFT_RESTRICT target, const T *du, const T *ker1, + const T *ker2, BIGINT i1, BIGINT i2, UBIGINT N1, UBIGINT N2) /* 2D interpolate complex values from a ns*ns block of the du (uniform grid data) array to a single complex output value "target", using as weights the ns*ns outer product of the 1d kernel lists ker1 and ker2. 
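For orientation, the separable weighting just described is easier to read without SIMD. Below is a minimal scalar sketch; interp_square_scalar is a hypothetical name, not code from this patch, and it assumes interleaved real/imag storage (as documented above) and a block that never wraps the grid edge:

#include <cstdint>

// Scalar reference for the separable 2D interpolation described above:
// target = sum over (dy,dx) of ker2[dy] * ker1[dx] * du(i1+dx, i2+dy),
// on complex-interleaved data, assuming the ns*ns block stays in-grid.
template<int ns, typename T>
void interp_square_scalar(T *target, const T *du, const T *ker1, const T *ker2,
                          int64_t i1, int64_t i2, uint64_t N1) {
  T re = 0, im = 0;
  for (int dy = 0; dy < ns; ++dy) {
    const T *line = du + 2 * (N1 * (uint64_t)(i2 + dy) + (uint64_t)i1);
    T lre = 0, lim = 0;                   // accumulate one grid row against ker1
    for (int dx = 0; dx < ns; ++dx) {
      lre += ker1[dx] * line[2 * dx];     // real part
      lim += ker1[dx] * line[2 * dx + 1]; // imaginary part
    }
    re += ker2[dy] * lre;                 // weight the whole row by ker2[dy]
    im += ker2[dy] * lim;
  }
  target[0] = re;
  target[1] = im;
}

The vectorized version below computes exactly this double sum, but several interleaved lanes at a time, which is why the kernel weights are duplicated into low/high register halves before each FMA.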
@@ -1083,10 +594,10 @@ void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, The code is largely similar to 1D interpolation, please see the explanation there */ { - std::array out{0}; + std::array out{0}; // no wrapping: avoid ptrs using arch_t = typename simd_type::arch_type; - static constexpr auto padding = get_padding(); + static constexpr auto padding = get_padding(); static constexpr auto alignment = arch_t::alignment(); static constexpr auto simd_size = simd_type::size; static constexpr uint8_t line_vectors = (2 * ns + padding) / simd_size; @@ -1117,15 +628,15 @@ void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, for (uint8_t i{0}; i < (line_vectors & ~1); // NOLINT(*-too-small-loop-variable) i += 2) { const auto ker1_v = simd_type::load_aligned(ker1 + i * simd_size / 2); - const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index); - const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index); + const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index); + const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index); res_low = xsimd::fma(ker1low, line[i], res_low); res_hi = xsimd::fma(ker1hi, line[i + 1], res_hi); } if constexpr (line_vectors % 2) { const auto ker1_v = simd_type::load_aligned(ker1 + (line_vectors - 1) * simd_size / 2); - const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index); + const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index); res_low = xsimd::fma(ker1low, line.back(), res_low); } return res_low + res_hi; @@ -1138,17 +649,17 @@ void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, } else { // wraps somewhere: use ptr list // this is slower than above, but occurs much less often, with fractional // rate O(ns/min(N1,N2)). Thus this code doesn't need to be so optimized. - return interp_square_wrap(target, du, ker1, ker2, i1, i2, N1, N2); + return interp_square_wrap(target, du, ker1, ker2, i1, i2, N1, N2); } target[0] = out[0]; target[1] = out[1]; } -template -static void interp_cube_wrapped(FLT *FINUFFT_RESTRICT target, const FLT *du, - const FLT *ker1, const FLT *ker2, const FLT *ker3, - const BIGINT i1, const BIGINT i2, const BIGINT i3, - const UBIGINT N1, const UBIGINT N2, const UBIGINT N3) { +template +static void interp_cube_wrapped(T *FINUFFT_RESTRICT target, const T *du, const T *ker1, + const T *ker2, const T *ker3, const BIGINT i1, + const BIGINT i2, const BIGINT i3, const UBIGINT N1, + const UBIGINT N2, const UBIGINT N3) { /* * This function is called when the kernel wraps around the cube. * Similarly to 2D and 1D wrapping, this is slower than the non wrapping version. @@ -1158,14 +669,14 @@ static void interp_cube_wrapped(FLT *FINUFFT_RESTRICT target, const FLT *du, const auto in_bounds_1 = (i1 >= 0) & (i1 + ns <= N1); const auto in_bounds_2 = (i2 >= 0) & (i2 + ns <= N2); const auto in_bounds_3 = (i3 >= 0) & (i3 + ns <= N3); - std::array out{0}; + std::array out{0}; // case no wrapping needed but padding spills over du array. 
// Hence, no explicit vectorization but the code is still faster if (FINUFFT_LIKELY(in_bounds_1 && in_bounds_2 && in_bounds_3)) { // no wrapping: avoid ptrs (by far the most common case) // store a horiz line (interleaved real,imag) // initialize line with zeros; hard to avoid here, but overhead small in 3D - alignas(alignment) std::array line{0}; + alignas(alignment) std::array line{0}; // co-add y and z contributions to line in x; do not apply x kernel yet // This is expensive innermost loop for (uint8_t dz{0}; dz < ns; ++dz) { @@ -1217,10 +728,10 @@ static void interp_cube_wrapped(FLT *FINUFFT_RESTRICT target, const FLT *du, target[1] = out[1]; } -template -void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, - const FLT *ker2, const FLT *ker3, const BIGINT i1, const BIGINT i2, - const BIGINT i3, const UBIGINT N1, const UBIGINT N2, const UBIGINT N3) +template> +static void interp_cube(T *FINUFFT_RESTRICT target, const T *du, const T *ker1, + const T *ker2, const T *ker3, BIGINT i1, BIGINT i2, BIGINT i3, + UBIGINT N1, UBIGINT N2, UBIGINT N3) /* 3D interpolate complex values from a ns*ns*ns block of the du (uniform grid data) array to a single complex output value "target", using as weights the ns*ns*ns outer product of the 1d kernel lists ker1, ker2, and ker3. @@ -1251,7 +762,7 @@ void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, */ { using arch_t = typename simd_type::arch_type; - static constexpr auto padding = get_padding(); + static constexpr auto padding = get_padding(); static constexpr auto alignment = arch_t::alignment(); static constexpr auto simd_size = simd_type::size; static constexpr auto ker23_size = (ns + simd_size - 1) & -simd_size; @@ -1259,7 +770,7 @@ void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, const auto in_bounds_1 = (i1 >= 0) & (i1 + ns <= N1); const auto in_bounds_2 = (i2 >= 0) & (i2 + ns <= N2); const auto in_bounds_3 = (i3 >= 0) & (i3 + ns <= N3); - std::array out{0}; + std::array out{0}; if (in_bounds_1 && in_bounds_2 && in_bounds_3 && (i1 + ns + (padding + 1) / 2 < N1)) { const auto line = [N1, N2, i1 = UBIGINT(i1), i2 = UBIGINT(i2), i3 = UBIGINT(i3), ker2, ker3, du]() constexpr noexcept { @@ -1284,15 +795,15 @@ void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, for (uint8_t i{0}; i < (line_vectors & ~1); // NOLINT(*-too-small-loop-variable) i += 2) { const auto ker1_v = simd_type::load_aligned(ker1 + i * simd_size / 2); - const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index); - const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index); + const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index); + const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index); res_low = xsimd::fma(ker1low, line[i], res_low); res_hi = xsimd::fma(ker1hi, line[i + 1], res_hi); } if constexpr (line_vectors % 2) { const auto ker1_v = simd_type::load_aligned(ker1 + (line_vectors - 1) * simd_size / 2); - const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index); + const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index); res_low = xsimd::fma(ker1low, line.back(), res_low); } return res_low + res_hi; @@ -1303,17 +814,61 @@ void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, out[1] += res_array[i + 1]; } } else { - return interp_cube_wrapped(target, du, ker1, ker2, ker3, i1, i2, i3, - N1, N2, N3); + return interp_cube_wrapped(target, du, ker1, ker2, ker3, i1, i2, i3, + N1, N2, N3); } target[0] = out[0]; target[1] = out[1]; } -template 
+template<uint8_t ns, uint8_t kerevalmeth, typename T,
+         typename simd_type = xsimd::make_sized_batch_t<T, find_optimal_simd_width<T, ns>()>,
+         typename... V>
+static FINUFFT_ALWAYS_INLINE auto ker_eval(
+    T *FINUFFT_RESTRICT ker, const finufft_spread_opts &opts, const V... elems) noexcept {
+  /* Utility function that allows moving the kernel evaluation outside the spreader for
+     clarity
+     Inputs are:
+     ns = kernel width
+     kerevalmeth = kernel evaluation method
+     T = (single or double precision) type of the kernel
+     simd_type = xsimd::batch for Horner
+     vectorization (default is the optimal simd size)
+     finufft_spread_opts as Horner needs
+     the oversampling factor
+     elems = kernel arguments
+     Example usage is
+     ker_eval<ns, kerevalmeth>(opts, x, y, z) // for 3D or
+     ker_eval<ns, kerevalmeth>(opts, x, y) // for 2D or
+     ker_eval<ns, kerevalmeth>(opts, x) // for 1D
+  */
+  const std::array inputs{elems...};
+  // compile time loop, no performance overhead
+  for (auto i = 0; i < sizeof...(elems); ++i) {
+    // compile time branch no performance overhead
+    if constexpr (kerevalmeth == 1) {
+      if (opts.upsampfac == 2.0) {
+        eval_kernel_vec_Horner(ker + (i * MAX_NSPREAD), inputs[i],
+                               opts);
+      }
+      if (opts.upsampfac == 1.25) {
+        eval_kernel_vec_Horner(ker + (i * MAX_NSPREAD), inputs[i],
+                               opts);
+      }
+    }
+    if constexpr (kerevalmeth == 0) {
+      alignas(simd_type::arch_type::alignment()) std::array<T, MAX_NSPREAD> kernel_args{};
+      set_kernel_args(kernel_args.data(), inputs[i]);
+      evaluate_kernel_vector(ker + (i * MAX_NSPREAD), kernel_args.data(), opts);
+    }
+  }
+  return ker;
+}
+
+template<uint8_t ns, bool kerevalmeth, typename T>
 FINUFFT_NEVER_INLINE void spread_subproblem_1d_kernel(
-    const BIGINT off1, const UBIGINT size1, FLT *FINUFFT_RESTRICT du, const UBIGINT M,
-    const FLT *const kx, const FLT *const dd, const finufft_spread_opts &opts) noexcept {
+    const BIGINT off1, const UBIGINT size1, T *FINUFFT_RESTRICT du, const UBIGINT M,
+    const T *const kx, const T *const dd, const finufft_spread_opts &opts) noexcept {
   /* 1D spreader from nonuniform to uniform subproblem grid, without wrapping.
      Inputs:
      off1 - integer offset of left end of du subgrid from that of overall fine
@@ -1334,15 +889,15 @@ FINUFFT_NEVER_INLINE void spread_subproblem_1d_kernel(
      This needed off1 as extra arg. AHB 11/30/20.
      Vectorized using xsimd by M. Barbone 06/24. */
-  using simd_type = PaddedSIMD<FLT, 2 * ns>;
+  using simd_type = PaddedSIMD<T, 2 * ns>;
   using arch_t = typename simd_type::arch_type;
-  static constexpr auto padding = get_padding<FLT, 2 * ns>();
+  static constexpr auto padding = get_padding<T, 2 * ns>();
   static constexpr auto alignment = arch_t::alignment();
   static constexpr auto simd_size = simd_type::size;
-  static constexpr auto ns2 = ns * FLT(0.5); // half spread width
+  static constexpr auto ns2 = ns * T(0.5); // half spread width
   // something weird here. Reversing ker{0} and std fill causes ker
   // to be zeroed inside the loop GCC uses AVX, clang AVX2
-  alignas(alignment) std::array<FLT, MAX_NSPREAD> ker{0};
+  alignas(alignment) std::array<T, MAX_NSPREAD> ker{0};
   std::fill(du, du + 2 * size1, 0); // zero output
   // no padding needed if MAX_NSPREAD is 16
   // the largest read is 16 floats with avx512
@@ -1362,7 +917,7 @@ FINUFFT_NEVER_INLINE void spread_subproblem_1d_kernel(
     const auto dd_pt = initialize_complex_register(dd[i * 2], dd[i * 2 + 1]);
     // ceil offset, hence rounding, must match that in get_subgrid...
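    // Worked example of the index math below (illustrative values only): with
    // ns=4, ns2=2.0, and a point at kx[i]=10.3, i1=ceil(10.3-2.0)=9, so the
    // kernel writes to grid points 9..12; the kernel argument x1=9-10.3=-1.3
    // then lies in [-ns/2, -ns/2+1], as noted inside the lambda below.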
     const auto i1 = BIGINT(std::ceil(kx[i] - ns2)); // fine grid start index
-    // FLT(i1) has different semantics and results an extra cast
+    // T(i1) has different semantics and results in an extra cast
     const auto x1 = [i, kx]() constexpr noexcept {
       auto x1 = std::ceil(kx[i] - ns2) - kx[i]; // x1 in [-w/2,-w/2+1], up to rounding
       // However if N1*epsmach>O(1) then can cause O(1) errors in x1, hence ppoly
@@ -1374,8 +929,8 @@ FINUFFT_NEVER_INLINE void spread_subproblem_1d_kernel(
     }();
     // Libin improvement: pass ker as a parameter and allocate it outside the loop
     // gcc13 + 10% speedup
-    ker_eval<ns, kerevalmeth>(ker.data(), opts, x1);
-    // const auto ker = ker_eval<ns, kerevalmeth>(opts, x1);
+    ker_eval<ns, kerevalmeth, T, simd_type>(ker.data(), opts, x1);
+    // const auto ker = ker_eval<ns, kerevalmeth, T, simd_type>(opts, x1);
     const auto j = i1 - off1; // offset rel to subgrid, starts the output indices
     auto *FINUFFT_RESTRICT trg = du + 2 * j; // restrict helps compiler to vectorize
     // du is padded, so we can use SIMD even if we write more than ns values in du
@@ -1411,12 +966,12 @@ FINUFFT_NEVER_INLINE void spread_subproblem_1d_kernel(
       const auto du_pt1 = simd_type::load_unaligned(trg + dx + simd_size);
       // swizzle is faster than zip_lo(ker_v, ker_v) and zip_hi(ker_v, ker_v)
       // swizzle in this case is equivalent to zip_lo and zip_hi respectively
-      const auto ker0low = xsimd::swizzle(ker_v, zip_low_index<arch_t>);
+      const auto ker0low = xsimd::swizzle(ker_v, zip_low_index<arch_t, T>);
       // ker 0 looks like this now:
       // +-----------------------+
       // |y0|y0|y1|y1|y2|y2|y3|y3|
       // +-----------------------+
-      const auto ker0hi = xsimd::swizzle(ker_v, zip_hi_index<arch_t>);
+      const auto ker0hi = xsimd::swizzle(ker_v, zip_hi_index<arch_t, T>);
       // ker 1 looks like this now:
       // +-----------------------+
       // |y4|y4|y5|y5|y6|y6|y7|y7|
       // +-----------------------+
@@ -1443,17 +998,17 @@ FINUFFT_NEVER_INLINE void spread_subproblem_1d_kernel(
       // the corresponding memory is not accessed
       const auto ker0 = simd_type::load_unaligned(ker.data() + (regular_part / 2));
       const auto du_pt = simd_type::load_unaligned(trg + regular_part);
-      const auto ker0low = xsimd::swizzle(ker0, zip_low_index<arch_t>);
+      const auto ker0low = xsimd::swizzle(ker0, zip_low_index<arch_t, T>);
       const auto res = xsimd::fma(ker0low, dd_pt, du_pt);
       res.store_unaligned(trg + regular_part);
     }
   }
 }

-template<uint8_t NS>
+template<uint8_t NS, typename T>
 static void spread_subproblem_1d_dispatch(
-    const BIGINT off1, const UBIGINT size1, FLT *FINUFFT_RESTRICT du, const UBIGINT M,
-    const FLT *kx, const FLT *dd, const finufft_spread_opts &opts) noexcept {
+    const BIGINT off1, const UBIGINT size1, T *FINUFFT_RESTRICT du, const UBIGINT M,
+    const T *kx, const T *dd, const finufft_spread_opts &opts) noexcept {
   /* this is a dispatch function that will call the correct kernel based on the ns
      it recursively iterates from MAX_NSPREAD to MIN_NSPREAD
      it generates the following code:
@@ -1486,27 +1041,29 @@ static void spread_subproblem_1d_dispatch(
                 "NS must be in the range (MIN_NSPREAD, MAX_NSPREAD)");
   if constexpr (NS == MIN_NSPREAD) { // Base case
     if (opts.kerevalmeth)
-      return spread_subproblem_1d_kernel<MIN_NSPREAD, true>(off1, size1, du, M, kx, dd,
-                                                            opts);
+      return spread_subproblem_1d_kernel<MIN_NSPREAD, true, T>(off1, size1, du, M, kx, dd,
+                                                               opts);
     else {
-      return spread_subproblem_1d_kernel<MIN_NSPREAD, false>(off1, size1, du, M, kx, dd,
-                                                             opts);
+      return spread_subproblem_1d_kernel<MIN_NSPREAD, false, T>(off1, size1, du, M, kx,
+                                                                dd, opts);
     }
   } else {
     if (opts.nspread == NS) {
       if (opts.kerevalmeth) {
-        return spread_subproblem_1d_kernel<NS, true>(off1, size1, du, M, kx, dd, opts);
+        return spread_subproblem_1d_kernel<NS, true, T>(off1, size1, du, M, kx, dd, opts);
      } else {
-        return spread_subproblem_1d_kernel<NS, false>(off1, size1, du, M, kx, dd, opts);
+        return spread_subproblem_1d_kernel<NS, false, T>(off1, size1, du, M, kx, dd,
+                                                         opts);
      }
    } else {
-      return spread_subproblem_1d_dispatch<NS - 1>(off1, size1, du, M, kx, dd, opts);
+      return spread_subproblem_1d_dispatch<NS - 1, T>(off1, size1, du, M, kx, dd, opts);
    }
  }
}

-void spread_subproblem_1d(BIGINT off1, UBIGINT size1, FLT *du, UBIGINT M, FLT *kx,
-                          FLT *dd, const finufft_spread_opts &opts) noexcept
+template<typename T>
+static void spread_subproblem_1d(BIGINT off1, UBIGINT size1, T *du, UBIGINT M, T *kx,
+                                 T *dd, const finufft_spread_opts &opts) noexcept
/* spreader from dd (NU) to du (uniform) in 1D without wrapping.
   See above docs/notes for spread_subproblem_2d.
   kx (size M) are NU locations in [off+ns/2,off+size-1-ns/2] in this dim.
@@ -1515,14 +1072,14 @@ void spread_subproblem_1d(BIGINT off1, UBIGINT size1, FLT *du, UBIGINT M, FLT *k
   For algorithmic details see spread_subproblem_1d_kernel.
*/
{
-  spread_subproblem_1d_dispatch<MAX_NSPREAD>(off1, size1, du, M, kx, dd, opts);
+  spread_subproblem_1d_dispatch<MAX_NSPREAD, T>(off1, size1, du, M, kx, dd, opts);
}

-template<uint8_t ns, bool kerevalmeth>
+template<uint8_t ns, bool kerevalmeth, typename T>
 FINUFFT_NEVER_INLINE static void spread_subproblem_2d_kernel(
     const BIGINT off1, const BIGINT off2, const UBIGINT size1, const UBIGINT size2,
-    FLT *FINUFFT_RESTRICT du, const UBIGINT M, const FLT *kx, const FLT *ky,
-    const FLT *dd, const finufft_spread_opts &opts) noexcept
+    T *FINUFFT_RESTRICT du, const UBIGINT M, const T *kx, const T *ky, const T *dd,
+    const finufft_spread_opts &opts) noexcept
/* spreader from dd (NU) to du (uniform) in 2D without wrapping.
   See above docs/notes for spread_subproblem_2d.
   kx,ky (size M) are NU locations in [off+ns/2,off+size-1-ns/2] in both dims.
@@ -1531,24 +1088,24 @@ FINUFFT_NEVER_INLINE static void spread_subproblem_2d_kernel(
   For algorithmic details see spread_subproblem_1d_kernel.
*/
{
-  using simd_type = PaddedSIMD<FLT, 2 * ns>;
+  using simd_type = PaddedSIMD<T, 2 * ns>;
   using arch_t = typename simd_type::arch_type;
-  static constexpr auto padding = get_padding<FLT, 2 * ns>();
+  static constexpr auto padding = get_padding<T, 2 * ns>();
   static constexpr auto simd_size = simd_type::size;
   static constexpr auto alignment = arch_t::alignment();
   // Kernel values stored in consecutive memory. This allows us to compute
   // values in all three directions in a single kernel evaluation call.
-  static constexpr auto ns2 = ns * FLT(0.5); // half spread width
-  alignas(alignment) std::array<FLT, 2 * MAX_NSPREAD> kernel_values{0};
-  std::fill(du, du + 2 * size1 * size2, 0); // initialized to 0 due to the padding
-  for (uint64_t pt = 0; pt < M; pt++) {     // loop over NU pts
+  static constexpr auto ns2 = ns * T(0.5); // half spread width
+  alignas(alignment) std::array<T, 2 * MAX_NSPREAD> kernel_values{0};
+  std::fill(du, du + 2 * size1 * size2, 0); // initialized to 0 due to the padding
+  for (uint64_t pt = 0; pt < M; pt++) {     // loop over NU pts
     const auto dd_pt = initialize_complex_register(dd[pt * 2], dd[pt * 2 + 1]);
     // ceil offset, hence rounding, must match that in get_subgrid...
     const auto i1 = (BIGINT)std::ceil(kx[pt] - ns2); // fine grid start indices
     const auto i2 = (BIGINT)std::ceil(ky[pt] - ns2);
-    const auto x1 = (FLT)std::ceil(kx[pt] - ns2) - kx[pt];
-    const auto x2 = (FLT)std::ceil(ky[pt] - ns2) - ky[pt];
-    ker_eval<ns, kerevalmeth>(kernel_values.data(), opts, x1, x2);
+    const auto x1 = (T)std::ceil(kx[pt] - ns2) - kx[pt];
+    const auto x2 = (T)std::ceil(ky[pt] - ns2) - ky[pt];
+    ker_eval<ns, kerevalmeth, T, simd_type>(kernel_values.data(), opts, x1, x2);
     const auto *ker1 = kernel_values.data();
     const auto *ker2 = kernel_values.data() + MAX_NSPREAD;
     // Combine kernel with complex source value to simplify inner loop
@@ -1578,8 +1135,8 @@ FINUFFT_NEVER_INLINE static void spread_subproblem_2d_kernel(
       for (uint8_t i = 0; i < (kerval_vectors & ~1); // NOLINT(*-too-small-loop-variable)
           i += 2) {
         const auto ker1_v = simd_type::load_aligned(ker1 + i * simd_size / 2);
-        const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index<arch_t>);
-        const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index<arch_t>);
+        const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index<arch_t, T>);
+        const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index<arch_t, T>);
         // this initializes the entire vector registers with the same value
         // the ker1val_v[i] looks like this:
         // +-----------------------+
@@ -1591,7 +1148,7 @@ FINUFFT_NEVER_INLINE static void spread_subproblem_2d_kernel(
       if constexpr (kerval_vectors % 2) {
         const auto ker1_v =
             simd_type::load_unaligned(ker1 + (kerval_vectors - 1) * simd_size / 2);
-        const auto res = xsimd::swizzle(ker1_v, zip_low_index<arch_t>) * dd_pt;
+        const auto res = xsimd::swizzle(ker1_v, zip_low_index<arch_t, T>) * dd_pt;
         ker1val_v[kerval_vectors - 1] = res;
       }
       return ker1val_v;
@@ -1611,41 +1168,42 @@ FINUFFT_NEVER_INLINE static void spread_subproblem_2d_kernel(
   }
 }

-template<uint8_t NS>
+template<uint8_t NS, typename T>
 void spread_subproblem_2d_dispatch(
     const BIGINT off1, const BIGINT off2, const UBIGINT size1, const UBIGINT size2,
-    FLT *FINUFFT_RESTRICT du, const UBIGINT M, const FLT *kx, const FLT *ky,
-    const FLT *dd, const finufft_spread_opts &opts) {
+    T *FINUFFT_RESTRICT du, const UBIGINT M, const T *kx, const T *ky, const T *dd,
+    const finufft_spread_opts &opts) {
   static_assert(MIN_NSPREAD <= NS && NS <= MAX_NSPREAD,
                 "NS must be in the range (MIN_NSPREAD, MAX_NSPREAD)");
   if constexpr (NS == MIN_NSPREAD) { // Base case
     if (opts.kerevalmeth)
-      return spread_subproblem_2d_kernel<MIN_NSPREAD, true>(off1, off2, size1, size2, du,
-                                                            M, kx, ky, dd, opts);
+      return spread_subproblem_2d_kernel<MIN_NSPREAD, true, T>(off1, off2, size1, size2,
+                                                               du, M, kx, ky, dd, opts);
     else {
-      return spread_subproblem_2d_kernel<MIN_NSPREAD, false>(off1, off2, size1, size2, du,
-                                                             M, kx, ky, dd, opts);
+      return spread_subproblem_2d_kernel<MIN_NSPREAD, false, T>(off1, off2, size1, size2,
+                                                                du, M, kx, ky, dd, opts);
     }
   } else {
     if (opts.nspread == NS) {
       if (opts.kerevalmeth) {
-        return spread_subproblem_2d_kernel<NS, true>(off1, off2, size1, size2, du, M, kx,
-                                                     ky, dd, opts);
+        return spread_subproblem_2d_kernel<NS, true, T>(off1, off2, size1, size2, du, M,
+                                                        kx, ky, dd, opts);
       } else {
-        return spread_subproblem_2d_kernel<NS, false>(off1, off2, size1, size2, du, M, kx,
-                                                      ky, dd, opts);
+        return spread_subproblem_2d_kernel<NS, false, T>(off1, off2, size1, size2, du, M,
+                                                         kx, ky, dd, opts);
       }
     } else {
-      return spread_subproblem_2d_dispatch<NS - 1>(off1, off2, size1, size2, du, M, kx,
-                                                   ky, dd, opts);
+      return spread_subproblem_2d_dispatch<NS - 1, T>(off1, off2, size1, size2, du, M, kx,
+                                                      ky, dd, opts);
     }
   }
 }

-void spread_subproblem_2d(const BIGINT off1, const BIGINT off2, const UBIGINT size1,
-                          const UBIGINT size2, FLT *FINUFFT_RESTRICT du, const UBIGINT M,
-                          const FLT *kx, const FLT *ky, const FLT *dd,
-                          const finufft_spread_opts &opts) noexcept
+template<typename T>
+static void spread_subproblem_2d(BIGINT off1, BIGINT off2, UBIGINT size1, UBIGINT size2,
+                                 T *FINUFFT_RESTRICT du, UBIGINT M, const T *kx,
+                                 const T *ky, const T *dd,
+                                 const finufft_spread_opts &opts) noexcept
/* spreader from dd (NU) to du (uniform) in 2D without wrapping.
   See above docs/notes for spread_subproblem_2d.
   kx,ky (size M) are NU locations in [off+ns/2,off+size-1-ns/2] in both dims.
@@ -1654,24 +1212,24 @@ void spread_subproblem_2d(const BIGINT off1, const BIGINT off2, const UBIGINT si
   For algorithmic details see spread_subproblem_1d_kernel.
*/
{
-  spread_subproblem_2d_dispatch<MAX_NSPREAD>(off1, off2, size1, size2, du, M, kx, ky, dd,
-                                             opts);
+  spread_subproblem_2d_dispatch<MAX_NSPREAD, T>(off1, off2, size1, size2, du, M, kx, ky,
+                                                dd, opts);
}

-template<uint8_t ns, bool kerevalmeth>
+template<uint8_t ns, bool kerevalmeth, typename T>
 FINUFFT_NEVER_INLINE void spread_subproblem_3d_kernel(
     const BIGINT off1, const BIGINT off2, const BIGINT off3, const UBIGINT size1,
-    const UBIGINT size2, const UBIGINT size3, FLT *FINUFFT_RESTRICT du, const UBIGINT M,
-    const FLT *kx, const FLT *ky, const FLT *kz, const FLT *dd,
+    const UBIGINT size2, const UBIGINT size3, T *FINUFFT_RESTRICT du, const UBIGINT M,
+    const T *kx, const T *ky, const T *kz, const T *dd,
     const finufft_spread_opts &opts) noexcept {
-  using simd_type = PaddedSIMD<FLT, 2 * ns>;
+  using simd_type = PaddedSIMD<T, 2 * ns>;
   using arch_t = typename simd_type::arch_type;
-  static constexpr auto padding = get_padding<FLT, 2 * ns>();
+  static constexpr auto padding = get_padding<T, 2 * ns>();
   static constexpr auto simd_size = simd_type::size;
   static constexpr auto alignment = arch_t::alignment();
-  static constexpr auto ns2 = ns * FLT(0.5); // half spread width
-  alignas(alignment) std::array<FLT, 3 * MAX_NSPREAD> kernel_values{0};
+  static constexpr auto ns2 = ns * T(0.5); // half spread width
+  alignas(alignment) std::array<T, 3 * MAX_NSPREAD> kernel_values{0};
   std::fill(du, du + 2 * size1 * size2 * size3, 0);

   for (uint64_t pt = 0; pt < M; pt++) { // loop over NU pts
@@ -1684,7 +1242,7 @@ FINUFFT_NEVER_INLINE void spread_subproblem_3d_kernel(
     const auto x2 = std::ceil(ky[pt] - ns2) - ky[pt];
     const auto x3 = std::ceil(kz[pt] - ns2) - kz[pt];

-    ker_eval<ns, kerevalmeth>(kernel_values.data(), opts, x1, x2, x3);
+    ker_eval<ns, kerevalmeth, T, simd_type>(kernel_values.data(), opts, x1, x2, x3);
     const auto *ker1 = kernel_values.data();
     const auto *ker2 = kernel_values.data() + MAX_NSPREAD;
     const auto *ker3 = kernel_values.data() + 2 * MAX_NSPREAD;
@@ -1702,8 +1260,8 @@ FINUFFT_NEVER_INLINE void spread_subproblem_3d_kernel(
       for (uint8_t i = 0; i < (kerval_vectors & ~1); // NOLINT(*-too-small-loop-variable)
           i += 2) {
         const auto ker1_v = simd_type::load_aligned(ker1 + i * simd_size / 2);
-        const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index<arch_t>);
-        const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index<arch_t>);
+        const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index<arch_t, T>);
+        const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index<arch_t, T>);
         ker1val_v[i] = ker1low * dd_pt;
         ker1val_v[i + 1] = ker1hi * dd_pt;
       }
@@ -1712,7 +1270,7 @@ FINUFFT_NEVER_INLINE void spread_subproblem_3d_kernel(
       if constexpr (kerval_vectors % 2) {
         const auto ker1_v =
             simd_type::load_unaligned(ker1 + (kerval_vectors - 1) * simd_size / 2);
-        const auto res = xsimd::swizzle(ker1_v, zip_low_index<arch_t>) * dd_pt;
+        const auto res = xsimd::swizzle(ker1_v, zip_low_index<arch_t, T>) * dd_pt;
         ker1val_v[kerval_vectors - 1] = res;
       }
       return ker1val_v;
@@ -1734,41 +1292,42 @@ FINUFFT_NEVER_INLINE void spread_subproblem_3d_kernel(
   }
 }

-template<uint8_t NS>
-void spread_subproblem_3d_dispatch(
-    BIGINT off1, BIGINT off2, BIGINT off3, UBIGINT size1, UBIGINT size2, UBIGINT size3,
-    FLT *du, UBIGINT M, const FLT *kx, const FLT *ky, const FLT *kz, const FLT
*dd, - const finufft_spread_opts &opts) noexcept { +template +void spread_subproblem_3d_dispatch(BIGINT off1, BIGINT off2, BIGINT off3, UBIGINT size1, + UBIGINT size2, UBIGINT size3, T *du, UBIGINT M, + const T *kx, const T *ky, const T *kz, const T *dd, + const finufft_spread_opts &opts) noexcept { static_assert(MIN_NSPREAD <= NS && NS <= MAX_NSPREAD, "NS must be in the range (MIN_NSPREAD, MAX_NSPREAD)"); if constexpr (NS == MIN_NSPREAD) { // Base case if (opts.kerevalmeth) - return spread_subproblem_3d_kernel( + return spread_subproblem_3d_kernel( off1, off2, off3, size1, size2, size3, du, M, kx, ky, kz, dd, opts); else { - return spread_subproblem_3d_kernel( + return spread_subproblem_3d_kernel( off1, off2, off3, size1, size2, size3, du, M, kx, ky, kz, dd, opts); } } else { if (opts.nspread == NS) { if (opts.kerevalmeth) { - return spread_subproblem_3d_kernel(off1, off2, off3, size1, size2, - size3, du, M, kx, ky, kz, dd, opts); + return spread_subproblem_3d_kernel( + off1, off2, off3, size1, size2, size3, du, M, kx, ky, kz, dd, opts); } else { - return spread_subproblem_3d_kernel(off1, off2, off3, size1, size2, - size3, du, M, kx, ky, kz, dd, opts); + return spread_subproblem_3d_kernel( + off1, off2, off3, size1, size2, size3, du, M, kx, ky, kz, dd, opts); } } else { - return spread_subproblem_3d_dispatch(off1, off2, off3, size1, size2, size3, - du, M, kx, ky, kz, dd, opts); + return spread_subproblem_3d_dispatch(off1, off2, off3, size1, size2, + size3, du, M, kx, ky, kz, dd, opts); } } } -void spread_subproblem_3d(BIGINT off1, BIGINT off2, BIGINT off3, UBIGINT size1, - UBIGINT size2, UBIGINT size3, FLT *du, UBIGINT M, FLT *kx, - FLT *ky, FLT *kz, FLT *dd, - const finufft_spread_opts &opts) noexcept +template +static void spread_subproblem_3d(BIGINT off1, BIGINT off2, BIGINT off3, UBIGINT size1, + UBIGINT size2, UBIGINT size3, T *du, UBIGINT M, T *kx, + T *ky, T *kz, T *dd, + const finufft_spread_opts &opts) noexcept /* spreader from dd (NU) to du (uniform) in 3D without wrapping. See above docs/notes for spread_subproblem_2d. kx,ky,kz (size M) are NU locations in [off+ns/2,off+size-1-ns/2] in each dim. @@ -1776,15 +1335,15 @@ dd (size M complex) are complex source strengths du (size size1*size2*size3) is uniform complex output array */ { - spread_subproblem_3d_dispatch(off1, off2, off3, size1, size2, size3, du, M, - kx, ky, kz, dd, opts); + spread_subproblem_3d_dispatch(off1, off2, off3, size1, size2, size3, du, + M, kx, ky, kz, dd, opts); } -template -void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3, - UBIGINT padded_size1, UBIGINT size1, UBIGINT size2, - UBIGINT size3, UBIGINT N1, UBIGINT N2, UBIGINT N3, - FLT *FINUFFT_RESTRICT data_uniform, const FLT *const du0) +template +static void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3, + UBIGINT padded_size1, UBIGINT size1, UBIGINT size2, + UBIGINT size3, UBIGINT N1, UBIGINT N2, UBIGINT N3, + T *FINUFFT_RESTRICT data_uniform, const T *du0) /* Add a large subgrid (du0) to output grid (data_uniform), with periodic wrapping to N1,N2,N3 box. offset1,2,3 give the offset of the subgrid from the lowest corner of output. 
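The wrap here is at most one period per dimension (the spreadinterp() docs below assume 2*opts.nspread < min(N1,N2,N3)). A minimal sketch of the wrapped-index precomputation, where wrapped_indices is a hypothetical helper rather than code from this patch:

#include <cstdint>
#include <vector>

// Map subgrid positions offset1..offset1+size1-1 back into the periodic
// box [0, N1), wrapping at most one period in either direction.
std::vector<int64_t> wrapped_indices(int64_t offset1, int64_t size1, int64_t N1) {
  std::vector<int64_t> o1(size1);
  for (int64_t i = 0; i < size1; ++i) {
    int64_t y = offset1 + i;
    if (y < 0) y += N1;          // subgrid sticks out below 0
    else if (y >= N1) y -= N1;   // subgrid sticks out past N1-1
    o1[i] = y;
  }
  return o1;
}

The function body below builds index tables of this kind (o2 and o3) for the y and z dimensions before accumulating the subgrid into the output.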
@@ -1796,7 +1355,7 @@ void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3, */ { std::vector o2(size2), o3(size3); - static auto accumulate = [](FLT &a, FLT b) { + static auto accumulate = [](T &a, T b) { if constexpr (thread_safe) { // NOLINT(*-branch-clone) #pragma omp atomic a += b; @@ -1841,10 +1400,11 @@ void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3, } } -void bin_sort_singlethread( - BIGINT *ret, const UBIGINT M, const FLT *kx, const FLT *ky, const FLT *kz, - const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, const double bin_size_x, - const double bin_size_y, const double bin_size_z, const int debug) +template +static void bin_sort_singlethread(std::vector &ret, UBIGINT M, const T *kx, + const T *ky, const T *kz, UBIGINT N1, UBIGINT N2, + UBIGINT N3, double bin_size_x, double bin_size_y, + double bin_size_z, int debug) /* Returns permutation of all nonuniform points with good RAM access, * ie less cache misses for spreading, in 1D, 2D, or 3D. Single-threaded version * @@ -1877,21 +1437,21 @@ void bin_sort_singlethread( // here the +1 is needed to allow round-off error causing i1=N1/bin_size_x, // for kx near +pi, ie foldrescale gives N1 (exact arith would be 0 to N1-1). // Note that round-off near kx=-pi stably rounds negative to i1=0. - const auto nbins1 = BIGINT(FLT(N1) / bin_size_x + 1); - const auto nbins2 = isky ? BIGINT(FLT(N2) / bin_size_y + 1) : 1; - const auto nbins3 = iskz ? BIGINT(FLT(N3) / bin_size_z + 1) : 1; + const auto nbins1 = BIGINT(T(N1) / bin_size_x + 1); + const auto nbins2 = isky ? BIGINT(T(N2) / bin_size_y + 1) : 1; + const auto nbins3 = iskz ? BIGINT(T(N3) / bin_size_z + 1) : 1; const auto nbins = nbins1 * nbins2 * nbins3; - const auto inv_bin_size_x = FLT(1.0 / bin_size_x); - const auto inv_bin_size_y = FLT(1.0 / bin_size_y); - const auto inv_bin_size_z = FLT(1.0 / bin_size_z); + const auto inv_bin_size_x = T(1.0 / bin_size_x); + const auto inv_bin_size_y = T(1.0 / bin_size_y); + const auto inv_bin_size_z = T(1.0 / bin_size_z); // count how many pts in each bin std::vector counts(nbins, 0); for (auto i = 0; i < M; i++) { // find the bin index in however many dims are needed - const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); - const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; - const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; + const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); + const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; + const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); ++counts[bin]; } @@ -1906,18 +1466,20 @@ void bin_sort_singlethread( for (auto i = 0; i < M; i++) { // find the bin index (again! but better than using RAM) - const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); - const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; - const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; + const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); + const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; + const auto i3 = iskz ? 
BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); ret[counts[bin]] = BIGINT(i); // fill the inverse map on the fly ++counts[bin]; // update the offsets } } -void bin_sort_multithread(BIGINT *ret, UBIGINT M, FLT *kx, FLT *ky, FLT *kz, UBIGINT N1, - UBIGINT N2, UBIGINT N3, double bin_size_x, double bin_size_y, - double bin_size_z, int debug, int nthr) +template +static void bin_sort_multithread(std::vector &ret, UBIGINT M, T *kx, T *ky, T *kz, + UBIGINT N1, UBIGINT N2, UBIGINT N3, double bin_size_x, + double bin_size_y, double bin_size_z, int debug, + int nthr) /* Mostly-OpenMP'ed version of bin_sort. For documentation see: bin_sort_singlethread. Caution: when M (# NU pts) << N (# U pts), is SLOWER than single-thread. @@ -1952,9 +1514,9 @@ void bin_sort_multithread(BIGINT *ret, UBIGINT M, FLT *kx, FLT *ky, FLT *kz, UBI my_counts.resize(nbins, 0); // allocate counts[t], now in parallel region for (auto i = brk[t]; i < brk[t + 1]; i++) { // find the bin index in however many dims are needed - BIGINT i1 = fold_rescale(kx[i], N1) / bin_size_x, i2 = 0, i3 = 0; - if (isky) i2 = fold_rescale(ky[i], N2) / bin_size_y; - if (iskz) i3 = fold_rescale(kz[i], N3) / bin_size_z; + BIGINT i1 = fold_rescale(kx[i], N1) / bin_size_x, i2 = 0, i3 = 0; + if (isky) i2 = fold_rescale(ky[i], N2) / bin_size_y; + if (iskz) i3 = fold_rescale(kz[i], N3) / bin_size_z; const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); ++my_counts[bin]; // no clash btw threads } @@ -1975,9 +1537,9 @@ void bin_sort_multithread(BIGINT *ret, UBIGINT M, FLT *kx, FLT *ky, FLT *kz, UBI auto &my_counts(counts[t]); for (UBIGINT i = brk[t]; i < brk[t + 1]; i++) { // find the bin index (again! but better than using RAM) - UBIGINT i1 = fold_rescale(kx[i], N1) / bin_size_x, i2 = 0, i3 = 0; - if (isky) i2 = fold_rescale(ky[i], N2) / bin_size_y; - if (iskz) i3 = fold_rescale(kz[i], N3) / bin_size_z; + UBIGINT i1 = fold_rescale(kx[i], N1) / bin_size_x, i2 = 0, i3 = 0; + if (isky) i2 = fold_rescale(ky[i], N2) / bin_size_y; + if (iskz) i3 = fold_rescale(kz[i], N3) / bin_size_z; UBIGINT bin = i1 + nbins1 * (i2 + nbins2 * i3); ret[my_counts[bin]] = i; // inverse is offset for this NU pt and thread ++my_counts[bin]; // update the offsets; no thread clash @@ -1985,9 +1547,10 @@ void bin_sort_multithread(BIGINT *ret, UBIGINT M, FLT *kx, FLT *ky, FLT *kz, UBI } } -void get_subgrid(BIGINT &offset1, BIGINT &offset2, BIGINT &offset3, BIGINT &padded_size1, - BIGINT &size1, BIGINT &size2, BIGINT &size3, UBIGINT M, FLT *kx, FLT *ky, - FLT *kz, int ns, int ndims) +template +static void get_subgrid(BIGINT &offset1, BIGINT &offset2, BIGINT &offset3, + BIGINT &padded_size1, BIGINT &size1, BIGINT &size2, BIGINT &size3, + UBIGINT M, T *kx, T *ky, T *kz, int ns, int ndims) /* Writes out the integer offsets and sizes of a "subgrid" (cuboid subset of Z^ndims) large enough to enclose all of the nonuniform points with (non-periodic) padding of half the kernel width ns to each side in @@ -2031,14 +1594,14 @@ void get_subgrid(BIGINT &offset1, BIGINT &offset2, BIGINT &offset3, BIGINT &padd tests. */ { - FLT ns2 = (FLT)ns / 2; - FLT min_kx, max_kx; // 1st (x) dimension: get min/max of nonuniform points + T ns2 = (T)ns / 2; + T min_kx, max_kx; // 1st (x) dimension: get min/max of nonuniform points arrayrange(M, kx, &min_kx, &max_kx); offset1 = (BIGINT)std::ceil(min_kx - ns2); // min index touched by kernel size1 = (BIGINT)std::ceil(max_kx - ns2) - offset1 + ns; // int(ceil) first! 
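    // Worked example for the two lines above (illustrative numbers only):
    // ns=4 gives ns2=2; NU points spanning min_kx=0.4, max_kx=7.6 give
    // offset1=ceil(0.4-2)=-1 and size1=ceil(7.6-2)-(-1)+4 = 6+1+4 = 11,
    // i.e. subgrid indices -1..9, exactly the fine-grid points that kernels
    // centered anywhere in [0.4, 7.6] can touch.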
-  padded_size1 = size1 + get_padding<FLT>(2 * ns) / 2;
+  padded_size1 = size1 + get_padding<T>(2 * ns) / 2;
   if (ndims > 1) {
-    FLT min_ky, max_ky; // 2nd (y) dimension: get min/max of nonuniform points
+    T min_ky, max_ky; // 2nd (y) dimension: get min/max of nonuniform points
     arrayrange(M, ky, &min_ky, &max_ky);
     offset2 = (BIGINT)std::ceil(min_ky - ns2);
     size2 = (BIGINT)std::ceil(max_ky - ns2) - offset2 + ns;
@@ -2047,254 +1610,658 @@ void get_subgrid(BIGINT &offset1, BIGINT &offset2, BIGINT &offset3, BIGINT &padd
     size2 = 1;
   }
   if (ndims > 2) {
-    FLT min_kz, max_kz; // 3rd (z) dimension: get min/max of nonuniform points
+    T min_kz, max_kz; // 3rd (z) dimension: get min/max of nonuniform points
     arrayrange(M, kz, &min_kz, &max_kz);
     offset3 = (BIGINT)std::ceil(min_kz - ns2);
     size3 = (BIGINT)std::ceil(max_kz - ns2) - offset3 + ns;
   } else {
-    offset3 = 0;
-    size3 = 1;
+    offset3 = 0;
+    size3 = 1;
+  }
+}
+
+// ==========================================================================
+template<typename T>
+FINUFFT_EXPORT int FINUFFT_CDECL spreadinterp(
+    UBIGINT N1, UBIGINT N2, UBIGINT N3, T *data_uniform, UBIGINT M, T *kx, T *ky, T *kz,
+    T *data_nonuniform, const finufft_spread_opts &opts)
+/* ------------Spreader/interpolator for 1, 2, or 3 dimensions --------------
+   If opts.spread_direction=1, evaluate, in the 1D case,
+
+                         N1-1
+   data_nonuniform[j] =  SUM  phi(kx[j] - n) data_uniform[n],  for j=0...M-1
+                         n=0
+
+   If opts.spread_direction=2, evaluate its transpose, in the 1D case,
+
+                      M-1
+   data_uniform[n] =  SUM  phi(kx[j] - n) data_nonuniform[j],  for n=0...N1-1
+                      j=0
+
+   In each case phi is the spreading kernel, which has support
+   [-opts.nspread/2,opts.nspread/2]. In 2D or 3D, the generalization with
+   product of 1D kernels is performed.
+   For 1D set N2=N3=1; for 2D set N3=1; for 3D set N1,N2,N3>1.
+
+   Notes:
+   No particular normalization of the spreading kernel is assumed.
+   Uniform (U) points are centered at coords
+   [0,1,...,N1-1] in 1D, analogously in 2D and 3D. They are stored in x
+   fastest, y medium, z slowest ordering, up to however many
+   dimensions are relevant; note that this is Fortran-style ordering for an
+   array f(x,y,z), but C style for f[z][y][x]. This is to match the Fortran
+   interface of the original CMCL libraries.
+   Non-uniform (NU) points kx,ky,kz are real, and may lie in the central three
+   periods in each coordinate (these are folded into the central period).
+   The finufft_spread_opts struct must have been set up already by calling setup_kernel.
+   It is assumed that 2*opts.nspread < min(N1,N2,N3), so that the kernel
+   only ever wraps once when it falls below 0 or off the top of a uniform grid
+   dimension.
+
+   Inputs:
+   N1,N2,N3 - grid sizes in x (fastest), y (medium), z (slowest) respectively.
+              If N2==1, 1D spreading is done. If N3==1, 2D spreading.
+              Otherwise, 3D.
+   M - number of NU pts.
+   kx, ky, kz - length-M real arrays of NU point coordinates (only kx read in
+                1D, only kx and ky read in 2D).
+
+                These should lie in the box -pi<=kx<=pi. Points outside this
+                domain are also correctly folded back into this domain.
+   opts - spread/interp options struct, documented in ../include/finufft_spread_opts.h
+
+   Inputs/Outputs:
+   data_uniform - output values on grid (dir=1) OR input grid data (dir=2)
+   data_nonuniform - input strengths of the sources (dir=1)
+                     OR output values at targets (dir=2)
+   Returned value:
+   0 indicates success; other values have meanings in ../docs/error.rst, with
+   following modifications:
+   3 : one or more non-trivial box dimensions is less than 2.nspread.
+   5 : failed to allocate sort indices
+
+   Magland Dec 2016. Barnett openmp version, many speedups 1/16/17-2/16/17
+   error codes 3/13/17. pirange 3/28/17. Rewritten 6/15/17. parallel sort 2/9/18
+   No separate subprob indices in t-1 2/11/18.
+   sort_threads (since for M<<N, multithread sort slower than single) 3/27/18
+   kereval, kerpad 4/24/18
+   Melody Shih split into 3 routines: check, sort, spread. Jun 2018, making
+   this routine just a caller to them. Name change, Barnett 7/27/18
+*/
+{
+  int ier = spreadcheck(N1, N2, N3, M, kx, ky, kz, opts);
+  if (ier) return ier;
+  std::vector<BIGINT> sort_indices(M);
+  int did_sort = indexSort(sort_indices, N1, N2, N3, M, kx, ky, kz, opts);
+  spreadinterpSorted(sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz,
+                     data_nonuniform, opts, did_sort);
+  return 0;
+}
+
+template FINUFFT_EXPORT int FINUFFT_CDECL spreadinterp<float>(
+    UBIGINT N1, UBIGINT N2, UBIGINT N3, float *data_uniform, UBIGINT M, float *kx,
+    float *ky, float *kz, float *data_nonuniform, const finufft_spread_opts &opts);
+template FINUFFT_EXPORT int FINUFFT_CDECL spreadinterp<double>(
+    UBIGINT N1, UBIGINT N2, UBIGINT N3, double *data_uniform, UBIGINT M, double *kx,
+    double *ky, double *kz, double *data_nonuniform, const finufft_spread_opts &opts);
+
+static constexpr uint8_t ndims_from_Ns(const UBIGINT N1, const UBIGINT N2,
+                                       const UBIGINT N3)
+/* rule for getting number of spreading dimensions from the list of Ns per dim.
+   Split out, Barnett 7/26/18
+*/
+{
+  return 1 + (N2 > 1) + (N3 > 1);
+}
+
+template<typename T>
+int spreadcheck(UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M, T *kx, T *ky, T *kz,
+                const finufft_spread_opts &opts)
+/* This does just the input checking and reporting for the spreader.
+   See spreadinterp() for input arguments and meaning of returned value.
+   Split out by Melody Shih, Jun 2018. Finiteness chk Barnett 7/30/18.
+   Marco Barbone 5.8.24 removed bounds check as new foldrescale is not limited to
+   [-3pi,3pi)
+*/
+{
+  // INPUT CHECKING & REPORTING .... cuboid not too small for spreading?
+  int minN = 2 * opts.nspread;
+  if (N1 < minN || (N2 > 1 && N2 < minN) || (N3 > 1 && N3 < minN)) {
+    fprintf(stderr,
+            "%s error: one or more non-trivial box dims is less than 2.nspread!\n",
+            __func__);
+    return FINUFFT_ERR_SPREAD_BOX_SMALL;
+  }
+  if (opts.spread_direction != 1 && opts.spread_direction != 2) {
+    fprintf(stderr, "%s error: opts.spread_direction must be 1 or 2!\n", __func__);
+    return FINUFFT_ERR_SPREAD_DIR;
+  }
+  return 0;
+}
+template int spreadcheck<float>(UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M, float *kx,
+                                float *ky, float *kz, const finufft_spread_opts &opts);
+template int spreadcheck<double>(UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M,
+                                 double *kx, double *ky, double *kz,
+                                 const finufft_spread_opts &opts);
+
+template<typename T>
+int indexSort(std::vector<BIGINT> &sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3,
+              UBIGINT M, T *kx, T *ky, T *kz, const finufft_spread_opts &opts)
+/* This makes a decision whether or not to sort the NU pts (influenced by
+   opts.sort), and if yes, calls either single- or multi-threaded bin sort,
+   writing reordered index list to sort_indices. If decided not to sort, the
+   identity permutation is written to sort_indices.
+   The permutation is designed to make RAM access close to contiguous, to
+   speed up spreading/interpolation, in the case of disordered NU points.
+
+   Inputs:
+    M        - number of input NU points.
+    kx,ky,kz - length-M arrays of real coords of NU pts. Domain is [-pi, pi),
+               points outside are folded in.
+               (only kx used in 1D, only kx and ky used in 2D.)
+    N1,N2,N3 - integer sizes of overall box (set N2=N3=1 for 1D, N3=1 for 2D).
+               1 = x (fastest), 2 = y (medium), 3 = z (slowest).
+    opts     - spreading options struct, see ../include/finufft_spread_opts.h
+   Outputs:
+    sort_indices - a good permutation of NU points. (User must preallocate
+                   to length M.)
Ie, kx[sort_indices[j]], j=0,..,M-1, is a good + ordering for the x-coords of NU pts, etc. + returned value - whether a sort was done (1) or not (0). + + Barnett 2017; split out by Melody Shih, Jun 2018. Barnett nthr logic 2024. +*/ +{ + CNTime timer{}; + uint8_t ndims = ndims_from_Ns(N1, N2, N3); + auto N = N1 * N2 * N3; // U grid (periodic box) sizes + + // heuristic binning box size for U grid... affects performance: + double bin_size_x = 16, bin_size_y = 4, bin_size_z = 4; + // put in heuristics based on cache sizes (only useful for single-thread) ? + + int better_to_sort = + !(ndims == 1 && (opts.spread_direction == 2 || (M > 1000 * N1))); // 1D small-N or + // dir=2 case: + // don't sort + + timer.start(); // if needed, sort all the NU pts... + int did_sort = 0; + auto maxnthr = MY_OMP_GET_MAX_THREADS(); // used if both below opts default + if (opts.nthreads > 0) + maxnthr = opts.nthreads; // user nthreads overrides, without limit + if (opts.sort_threads > 0) + maxnthr = opts.sort_threads; // high-priority override, also no limit + // At this point: maxnthr = the max threads sorting could use + // (we don't print warning here, since: no showwarn in spread_opts, and finufft + // already warned about it. spreadinterp-only advanced users will miss a warning) + if (opts.sort == 1 || (opts.sort == 2 && better_to_sort)) { + // store a good permutation ordering of all NU pts (dim=1,2 or 3) + int sort_debug = (opts.debug >= 2); // show timing output? + int sort_nthr = opts.sort_threads; // 0, or user max # threads for sort +#ifndef _OPENMP + sort_nthr = 1; // if single-threaded lib, override user +#endif + if (sort_nthr == 0) // multithreaded auto choice: when N>>M, one thread is better! + sort_nthr = (10 * M > N) ? maxnthr : 1; // heuristic + if (sort_nthr == 1) + bin_sort_singlethread(sort_indices, M, kx, ky, kz, N1, N2, N3, bin_size_x, + bin_size_y, bin_size_z, sort_debug); + else // sort_nthr>1, user fixes # threads (>=2) + bin_sort_multithread(sort_indices, M, kx, ky, kz, N1, N2, N3, bin_size_x, + bin_size_y, bin_size_z, sort_debug, sort_nthr); + if (opts.debug) + printf("\tsorted (%d threads):\t%.3g s\n", sort_nthr, timer.elapsedsec()); + did_sort = 1; + } else { +#pragma omp parallel for num_threads(maxnthr) schedule(static, 1000000) + for (BIGINT i = 0; i < M; i++) // here omp helps xeon, hinders i7 + sort_indices[i] = i; // the identity permutation + if (opts.debug) + printf("\tnot sorted (sort=%d): \t%.3g s\n", (int)opts.sort, timer.elapsedsec()); } + return did_sort; } -/* local NU coord fold+rescale macro: does the following affine transform to x: - (x+PI) mod PI each to [0,N) - Note: folding big numbers can cause numerical inaccuracies - Martin Reinecke, 8.5.2024 used floor to speedup the function and removed the range - limitation Marco Barbone, 8.5.2024 Changed it from a Macro to an inline function -*/ -FLT fold_rescale(const FLT x, const UBIGINT N) noexcept { - static constexpr const FLT x2pi = FLT(M_1_2PI); - const FLT result = x * x2pi + FLT(0.5); - return (result - floor(result)) * FLT(N); -} +template int indexSort(std::vector &sort_indices, UBIGINT N1, UBIGINT N2, + UBIGINT N3, UBIGINT M, float *kx, float *ky, float *kz, + const finufft_spread_opts &opts); +template int indexSort(std::vector &sort_indices, UBIGINT N1, UBIGINT N2, + UBIGINT N3, UBIGINT M, double *kx, double *ky, double *kz, + const finufft_spread_opts &opts); -template -simd_type fold_rescale(const simd_type &x, const BIGINT N) noexcept { - const simd_type x2pi = FLT(M_1_2PI); - const simd_type result = 
xsimd::fma(x, x2pi, simd_type(0.5)); - return (result - xsimd::floor(result)) * simd_type(FLT(N)); -} +// -------------------------------------------------------------------------- +template +static int spreadSorted(const std::vector &sort_indices, UBIGINT N1, UBIGINT N2, + UBIGINT N3, T *FINUFFT_RESTRICT data_uniform, UBIGINT M, + T *FINUFFT_RESTRICT kx, T *FINUFFT_RESTRICT ky, + T *FINUFFT_RESTRICT kz, const T *data_nonuniform, + const finufft_spread_opts &opts, int did_sort) +// Spread NU pts in sorted order to a uniform grid. See spreadinterp() for doc. +{ + CNTime timer{}; + const auto ndims = ndims_from_Ns(N1, N2, N3); + const auto N = N1 * N2 * N3; // output array size + const auto ns = opts.nspread; // abbrev. for w, kernel width + auto nthr = MY_OMP_GET_MAX_THREADS(); // guess # threads to use to spread + if (opts.nthreads > 0) nthr = opts.nthreads; // user override, now without limit +#ifndef _OPENMP + nthr = 1; // single-threaded lib must override user +#endif + if (opts.debug) + printf("\tspread %dD (M=%lld; N1=%lld,N2=%lld,N3=%lld), nthr=%d\n", ndims, + (long long)M, (long long)N1, (long long)N2, (long long)N3, nthr); + timer.start(); + std::fill(data_uniform, data_uniform + 2 * N, 0.0); // zero the output array + if (opts.debug) printf("\tzero output array\t%.3g s\n", timer.elapsedsec()); + if (M == 0) // no NU pts, we're done + return 0; -template -auto ker_eval(FLT *FINUFFT_RESTRICT ker, const finufft_spread_opts &opts, - const V... elems) noexcept { - /* Utility function that allows to move the kernel evaluation outside the spreader for - clarity - Inputs are: - ns = kernel width - kerevalmeth = kernel evaluation method - T = (single or double precision) type of the kernel - simd_type = xsimd::batch for Horner - vectorization (default is the optimal simd size) - finufft_spread_opts as Horner needs - the oversampling factor - elems = kernel arguments - Examples usage is - ker_eval(opts, x, y, z) // for 3D or - ker_eval(opts, x, y) // for 2D or - ker_eval(opts, x) // for 1D - */ - const std::array inputs{elems...}; - // compile time loop, no performance overhead - for (auto i = 0; i < sizeof...(elems); ++i) { - // compile time branch no performance overhead - if constexpr (kerevalmeth == 1) { - if (opts.upsampfac == 2.0) { - eval_kernel_vec_Horner(ker + (i * MAX_NSPREAD), inputs[i], - opts); - } - if (opts.upsampfac == 1.25) { - eval_kernel_vec_Horner(ker + (i * MAX_NSPREAD), inputs[i], - opts); - } + auto spread_single = (nthr == 1) || (M * 100 < N); // low-density heuristic? + spread_single = false; // for now + timer.start(); + if (spread_single) { // ------- Basic single-core t1 spreading ------ + for (UBIGINT j = 0; j < M; j++) { + // *** todo, not urgent + // ... (question is: will the index wrapping per NU pt slow it down?) } - if constexpr (kerevalmeth == 0) { - alignas(simd_type::arch_type::alignment()) std::array kernel_args{}; - set_kernel_args(kernel_args.data(), inputs[i]); - evaluate_kernel_vector(ker + (i * MAX_NSPREAD), kernel_args.data(), opts); + if (opts.debug) printf("\tt1 simple spreading:\t%.3g s\n", timer.elapsedsec()); + } else { // ------- Fancy multi-core blocked t1 spreading ---- + // Splits sorted inds (jfm's advanced2), could double RAM. + // choose nb (# subprobs) via used nthreads: + auto nb = std::min((UBIGINT)nthr, M); // simply split one subprob per thr... 
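    // Illustration of the breakpoint split used just below (example numbers
    // only): M=10 NU points and nb=3 subproblems give, via
    // brk[p] = (M*p + nb - 1) / nb, the breakpoints {0, 4, 7, 10}, i.e.
    // chunks of sizes 4, 3, 3; each chunk is spread onto its own padded
    // subgrid, which is then wrap-added into the global grid.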
+ if (nb * (BIGINT)opts.max_subproblem_size < M) { // ...or more subprobs to cap size + nb = 1 + (M - 1) / opts.max_subproblem_size; // int div does + // ceil(M/opts.max_subproblem_size) + if (opts.debug) + printf("\tcapping subproblem sizes to max of %d\n", opts.max_subproblem_size); } - } - return ker; -} + if (M * 1000 < N) { // low-density heuristic: one thread per NU pt! + nb = M; + if (opts.debug) printf("\tusing low-density speed rescue nb=M...\n"); + } + if (!did_sort && nthr == 1) { + nb = 1; + if (opts.debug) printf("\tunsorted nthr=1: forcing single subproblem...\n"); + } + if (opts.debug && nthr > opts.atomic_threshold) + printf("\tnthr big: switching add_wrapped OMP from critical to atomic (!)\n"); -namespace { + std::vector brk(nb + 1); // NU index breakpoints defining nb subproblems + for (int p = 0; p <= nb; ++p) brk[p] = (M * p + nb - 1) / nb; -template -constexpr array, N> pad_2D_array_with_zeros( - const array, N> &input) noexcept { - constexpr auto pad_with_zeros = [](const auto &input) constexpr noexcept { - std::array padded{0}; - for (auto i = 0; i < input.size(); ++i) { - padded[i] = input[i]; +#pragma omp parallel num_threads(nthr) + { + // local copies of NU pts and data for each subproblem + std::vector kx0{}, ky0{}, kz0{}, dd0{}, du0{}; +#pragma omp for schedule(dynamic, 1) // each is big + for (int isub = 0; isub < nb; isub++) { // Main loop through the subproblems + const auto M0 = brk[isub + 1] - brk[isub]; // # NU pts in this subproblem + // copy the location and data vectors for the nonuniform points + kx0.resize(M0); + ky0.resize(M0 * (N2 > 1)); + kz0.resize(M0 * (N3 > 1)); + dd0.resize(2 * M0); // complex strength data + for (auto j = 0; j < M0; j++) { // todo: can avoid this copying? + const auto kk = sort_indices[j + brk[isub]]; // NU pt from subprob index list + kx0[j] = fold_rescale(kx[kk], N1); + if (N2 > 1) ky0[j] = fold_rescale(ky[kk], N2); + if (N3 > 1) kz0[j] = fold_rescale(kz[kk], N3); + dd0[j * 2] = data_nonuniform[kk * 2]; // real part + dd0[j * 2 + 1] = data_nonuniform[kk * 2 + 1]; // imag part + } + // get the subgrid which will include padding by roughly nspread/2 + // get_subgrid sets + BIGINT offset1, offset2, offset3, padded_size1, size1, size2, size3; + // sets offsets and sizes + get_subgrid(offset1, offset2, offset3, padded_size1, size1, size2, size3, M0, + kx0.data(), ky0.data(), kz0.data(), ns, ndims); + if (opts.debug > 1) { + print_subgrid_info(ndims, offset1, offset2, offset3, padded_size1, size1, size2, + size3, M0); + } + // allocate output data for this subgrid + du0.resize(2 * padded_size1 * size2 * size3); // complex + // Spread to subgrid without need for bounds checking or wrapping + if (!(opts.flags & TF_OMIT_SPREADING)) { + if (ndims == 1) + spread_subproblem_1d(offset1, padded_size1, du0.data(), M0, kx0.data(), + dd0.data(), opts); + else if (ndims == 2) + spread_subproblem_2d(offset1, offset2, padded_size1, size2, du0.data(), M0, + kx0.data(), ky0.data(), dd0.data(), opts); + else + spread_subproblem_3d(offset1, offset2, offset3, padded_size1, size2, size3, + du0.data(), M0, kx0.data(), ky0.data(), kz0.data(), + dd0.data(), opts); + } + // do the adding of subgrid to output + if (!(opts.flags & TF_OMIT_WRITE_TO_GRID)) { + if (nthr > opts.atomic_threshold) { // see above for debug reporting + add_wrapped_subgrid(offset1, offset2, offset3, padded_size1, size1, + size2, size3, N1, N2, N3, data_uniform, + du0.data()); // R Blackwell's atomic version + } else { +#pragma omp critical + add_wrapped_subgrid(offset1, offset2, 
offset3, padded_size1, size1, + size2, size3, N1, N2, N3, data_uniform, + du0.data()); + } + } + } // end main loop over subprobs } - return padded; - }; - std::array, N> output{}; - for (std::size_t i = 0; i < N; ++i) { - output[i] = pad_with_zeros(input[i]); - } - return output; -} + if (opts.debug) + printf("\tt1 fancy spread: \t%.3g s (%ld subprobs)\n", timer.elapsedsec(), nb); + } // end of choice of which t1 spread type to use + return 0; +}; -template -constexpr T generate_sequence_impl(V a, V b, index_sequence) noexcept { - // utility function to generate a sequence of a, b interleaved as function arguments - return T(((Is % 2 == 0) ? a : b)...); -} +// -------------------------------------------------------------------------- +template +FINUFFT_NEVER_INLINE static int interpSorted_kernel( + const std::vector &sort_indices, const UBIGINT N1, const UBIGINT N2, + const UBIGINT N3, const T *data_uniform, const UBIGINT M, T *FINUFFT_RESTRICT kx, + T *FINUFFT_RESTRICT ky, T *FINUFFT_RESTRICT kz, T *FINUFFT_RESTRICT data_nonuniform, + const finufft_spread_opts &opts) +// Interpolate to NU pts in sorted order from a uniform grid. +// See spreadinterp() for doc. +{ + using simd_type = PaddedSIMD; + using arch_t = typename simd_type::arch_type; + static constexpr auto alignment = arch_t::alignment(); + static constexpr auto simd_size = simd_type::size; + static constexpr auto ns2 = ns * T(0.5); // half spread width, used as stencil shift -template -constexpr auto initialize_complex_register(V a, V b) noexcept { - // populates a SIMD register with a and b interleaved - // for example: - // +-------------------------------+ - // | a | b | a | b | a | b | a | b | - // +-------------------------------+ - // it uses index_sequence to generate the sequence of a, b at compile time - return generate_sequence_impl(a, b, std::make_index_sequence{}); -} + CNTime timer{}; + const auto ndims = ndims_from_Ns(N1, N2, N3); + auto nthr = MY_OMP_GET_MAX_THREADS(); // guess # threads to use to interp + if (opts.nthreads > 0) nthr = opts.nthreads; // user override, now without limit +#ifndef _OPENMP + nthr = 1; // single-threaded lib must override user +#endif + if (opts.debug) + printf("\tinterp %dD (M=%lld; N1=%lld,N2=%lld,N3=%lld), nthr=%d\n", ndims, + (long long)M, (long long)N1, (long long)N2, (long long)N3, nthr); + timer.start(); +#pragma omp parallel num_threads(nthr) + { + static constexpr auto CHUNKSIZE = simd_size; // number of targets per chunk + alignas(alignment) UBIGINT jlist[CHUNKSIZE]; + alignas(alignment) T xjlist[CHUNKSIZE], yjlist[CHUNKSIZE], zjlist[CHUNKSIZE]; + alignas(alignment) T outbuf[2 * CHUNKSIZE]; + // Kernels: static alloc is faster, so we do it for up to 3D... + alignas(alignment) std::array kernel_values{0}; + auto *FINUFFT_RESTRICT ker1 = kernel_values.data(); + auto *FINUFFT_RESTRICT ker2 = kernel_values.data() + MAX_NSPREAD; + auto *FINUFFT_RESTRICT ker3 = kernel_values.data() + 2 * MAX_NSPREAD; + + // Loop over interpolation chunks + // main loop over NU trgs, interp each from U + // (note: windows omp doesn't like unsigned loop vars) +#pragma omp for schedule(dynamic, 1000) // assign threads to NU targ pts: + for (BIGINT i = 0; i < M; i += CHUNKSIZE) { + // Setup buffers for this chunk + const UBIGINT bufsize = (i + CHUNKSIZE > M) ? 
M - i : CHUNKSIZE; + for (int ibuf = 0; ibuf < bufsize; ibuf++) { + UBIGINT j = sort_indices[i + ibuf]; + jlist[ibuf] = j; + xjlist[ibuf] = fold_rescale(kx[j], N1); + if (ndims >= 2) yjlist[ibuf] = fold_rescale(ky[j], N2); + if (ndims == 3) zjlist[ibuf] = fold_rescale(kz[j], N3); + } + + // Loop over targets in chunk + for (int ibuf = 0; ibuf < bufsize; ibuf++) { + const auto xj = xjlist[ibuf]; + const auto yj = (ndims > 1) ? yjlist[ibuf] : 0; + const auto zj = (ndims > 2) ? zjlist[ibuf] : 0; + + auto *FINUFFT_RESTRICT target = outbuf + 2 * ibuf; + + // coords (x,y,z), spread block corner index (i1,i2,i3) of current NU targ + const auto i1 = BIGINT(std::ceil(xj - ns2)); // leftmost grid index + const auto i2 = (ndims > 1) ? BIGINT(std::ceil(yj - ns2)) : 0; // min y grid index + const auto i3 = (ndims > 2) ? BIGINT(std::ceil(zj - ns2)) : 0; // min z grid index + + const auto x1 = std::ceil(xj - ns2) - xj; // shift of ker center, in [-w/2,-w/2+1] + const auto x2 = (ndims > 1) ? std::ceil(yj - ns2) - yj : 0; + const auto x3 = (ndims > 2) ? std::ceil(zj - ns2) - zj : 0; + + // eval kernel values patch and use to interpolate from uniform data... + if (!(opts.flags & TF_OMIT_SPREADING)) { + switch (ndims) { + case 1: + ker_eval(kernel_values.data(), opts, x1); + interp_line(target, data_uniform, ker1, i1, N1); + break; + case 2: + ker_eval(kernel_values.data(), opts, x1, x2); + interp_square(target, data_uniform, ker1, ker2, i1, i2, N1, + N2); + break; + case 3: + ker_eval(kernel_values.data(), opts, x1, x2, + x3); + interp_cube(target, data_uniform, ker1, ker2, ker3, i1, i2, + i3, N1, N2, N3); + break; + default: // can't get here + FINUFFT_UNREACHABLE; + break; + } + } + } // end loop over targets in chunk -// Below there is some template metaprogramming magic to find the best SIMD type -// for the given number of elements. 
The code is based on the xsimd library

+      // Copy result buffer to output array
+      for (int ibuf = 0; ibuf < bufsize; ibuf++) {
+        const UBIGINT j            = jlist[ibuf];
+        data_nonuniform[2 * j]     = outbuf[2 * ibuf];
+        data_nonuniform[2 * j + 1] = outbuf[2 * ibuf + 1];
+      }

-// this finds the largest SIMD instruction set that can handle N elements
-// (returns void otherwise, which triggers a compile error)
-template<class T, uint8_t N, uint8_t K = N> constexpr auto BestSIMDHelper() {
-  if constexpr (N % K == 0) { // returns void in the worst case
-    return xsimd::make_sized_batch<T, K>{};
-  } else {
-    return BestSIMDHelper<T, N, (K >> 1)>();
-  }
+    } // end NU targ loop
+  } // end parallel section
+  if (opts.debug) printf("\tt2 interp loop: \t%.3g s\n", timer.elapsedsec());
+  return 0;
}

-template<class T, uint8_t N = 1> constexpr uint8_t min_simd_width() {
-  // finds the smallest simd width that can handle N elements
-  // (the SIMD width is called "batch size" in xsimd terminology)
-  if constexpr (std::is_void_v<typename xsimd::make_sized_batch<T, N>::type>) {
-    return min_simd_width<T, N * 2>();
+template<uint8_t NS, typename T>
+static int interpSorted_dispatch(
+    const std::vector<BIGINT> &sort_indices, const UBIGINT N1, const UBIGINT N2,
+    const UBIGINT N3, T *FINUFFT_RESTRICT data_uniform, const UBIGINT M,
+    T *FINUFFT_RESTRICT kx, T *FINUFFT_RESTRICT ky, T *FINUFFT_RESTRICT kz,
+    T *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts) {
+  static_assert(MIN_NSPREAD <= NS && NS <= MAX_NSPREAD,
+                "NS must be in the range [MIN_NSPREAD, MAX_NSPREAD]");
+  if constexpr (NS == MIN_NSPREAD) { // Base case
+    if (opts.kerevalmeth)
+      return interpSorted_kernel<MIN_NSPREAD, T, true>(
+          sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform, opts);
+    else {
+      return interpSorted_kernel<MIN_NSPREAD, T, false>(
+          sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform, opts);
+    }
  } else {
-    return N;
-  }
-};
-
-template<class T, uint8_t N> constexpr auto find_optimal_simd_width() {
-  // finds the smallest simd width that minimizes the number of iterations
-  // NOTE: might be suboptimal for some cases, 2^N+1 for example
-  // in the future we might want to implement a more sophisticated algorithm
-  uint8_t optimal_simd_width = min_simd_width<T>();
-  uint8_t min_iterations     = (N + optimal_simd_width - 1) / optimal_simd_width;
-  for (uint8_t simd_width = optimal_simd_width;
-       simd_width <= xsimd::batch<T>::size;
-       simd_width *= 2) {
-    uint8_t iterations = (N + simd_width - 1) / simd_width;
-    if (iterations < min_iterations) {
-      min_iterations     = iterations;
-      optimal_simd_width = simd_width;
+    if (opts.nspread == NS) {
+      if (opts.kerevalmeth) {
+        return interpSorted_kernel<NS, T, true>(sort_indices, N1, N2, N3, data_uniform,
+                                                M, kx, ky, kz, data_nonuniform, opts);
+      } else {
+        return interpSorted_kernel<NS, T, false>(sort_indices, N1, N2, N3, data_uniform,
+                                                 M, kx, ky, kz, data_nonuniform, opts);
+      }
+    } else {
+      return interpSorted_dispatch<NS - 1, T>(sort_indices, N1, N2, N3, data_uniform, M,
+                                              kx, ky, kz, data_nonuniform, opts);
    }
  }
-  return optimal_simd_width;
}

-template<class T, uint8_t N> constexpr auto GetPaddedSIMDWidth() {
-  // helper function to get the SIMD width with padding for the given number of elements
-  // that minimizes the number of iterations
-  return xsimd::make_sized_batch<T, find_optimal_simd_width<T, N>()>::type::size;
+template<typename T>
+static int interpSorted(
+    const std::vector<BIGINT> &sort_indices, const UBIGINT N1, const UBIGINT N2,
+    const UBIGINT N3, T *FINUFFT_RESTRICT data_uniform, const UBIGINT M,
+    T *FINUFFT_RESTRICT kx, T *FINUFFT_RESTRICT ky, T *FINUFFT_RESTRICT kz,
+    T *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts) {
+  return interpSorted_dispatch<MAX_NSPREAD, T>(sort_indices, N1, N2, N3, data_uniform,
+                                               M, kx, ky, kz, data_nonuniform, opts);
}
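// [Editor's illustrative sketch -- not part of the FINUFFT diff.] The
// interpSorted_dispatch() above turns the runtime kernel width opts.nspread
// into a compile-time template parameter by recursing from MAX_NSPREAD down
// to MIN_NSPREAD, so every supported width gets its own fully-optimized
// instantiation. A standalone analogue, with hypothetical bounds MIN_W/MAX_W
// and a toy kernel standing in for interpSorted_kernel:
#include <cstdio>

constexpr int MIN_W = 2, MAX_W = 16;

template<int W> int toy_kernel() { return W * W; } // stand-in for the real kernel

template<int W = MAX_W> int dispatch(int w) {
  if constexpr (W == MIN_W) {
    return toy_kernel<MIN_W>();         // base case: clamp to smallest width
  } else {
    if (w == W) return toy_kernel<W>(); // exact match: use this instantiation
    return dispatch<W - 1>(w);          // otherwise walk down at compile time
  }
}

int main() { std::printf("%d\n", dispatch(7)); } // prints 49 via toy_kernel<7>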
-template<class T, uint8_t ns> constexpr auto get_padding() {
-  // helper function to get the padding for the given number of elements
-  // ns is known at compile time, rounds ns to the next multiple of the SIMD width
-  // then subtracts ns to get the padding using a bitwise and trick
-  // WARNING: this trick works only for powers of 2
-  // SOURCE: Agner Fog's VCL manual
-  constexpr uint8_t width = GetPaddedSIMDWidth<T, ns>();
-  return ((ns + width - 1) & (-width)) - ns;
+template<typename T>
+int spreadinterpSorted(const std::vector<BIGINT> &sort_indices, const UBIGINT N1,
+                       const UBIGINT N2, const UBIGINT N3, T *data_uniform,
+                       const UBIGINT M, T *FINUFFT_RESTRICT kx, T *FINUFFT_RESTRICT ky,
+                       T *FINUFFT_RESTRICT kz, T *FINUFFT_RESTRICT data_nonuniform,
+                       const finufft_spread_opts &opts, int did_sort)
+/* Logic to select the main spreading (dir=1) vs interpolation (dir=2) routine.
+   See spreadinterp() above for input arguments and definitions.
+   Return value should always be 0 (no error reporting).
+   Split out by Melody Shih, Jun 2018; renamed Barnett 5/20/20.
+*/
+{
+  if (opts.spread_direction == 1) // ========= direction 1 (spreading) =======
+    spreadSorted(sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform,
+                 opts, did_sort);
+
+  else // ================= direction 2 (interpolation) ===========
+    interpSorted(sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform,
+                 opts);
+
+  return 0;
}

+template int spreadinterpSorted(
+    const std::vector<BIGINT> &sort_indices, const UBIGINT N1, const UBIGINT N2,
+    const UBIGINT N3, float *data_uniform, const UBIGINT M, float *FINUFFT_RESTRICT kx,
+    float *FINUFFT_RESTRICT ky, float *FINUFFT_RESTRICT kz,
+    float *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts,
+    int did_sort);
+template int spreadinterpSorted(
+    const std::vector<BIGINT> &sort_indices, const UBIGINT N1, const UBIGINT N2,
+    const UBIGINT N3, double *data_uniform, const UBIGINT M, double *FINUFFT_RESTRICT kx,
+    double *FINUFFT_RESTRICT ky, double *FINUFFT_RESTRICT kz,
+    double *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts,
+    int did_sort);
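// [Editor's illustrative sketch -- not part of the FINUFFT diff.] A quick
// check of the power-of-2 round-up trick used by get_padding() above (and
// selected at runtime by get_padding_helper() below): for a power-of-2
// width, (n + width - 1) & -width rounds n up to the next multiple of width,
// so subtracting n yields the number of zero-padding slots.
#include <cassert>

constexpr unsigned pad_to_width(unsigned n, unsigned width) {
  // width must be a power of 2 (cf. the Agner Fog VCL reference above)
  return ((n + width - 1) & (0u - width)) - n;
}

int main() {
  static_assert(pad_to_width(7, 8) == 1, "7 rounds up to 8");
  static_assert(pad_to_width(8, 8) == 0, "already a multiple");
  static_assert(pad_to_width(9, 8) == 7, "9 rounds up to 16");
  assert(pad_to_width(13, 4) == 3); // 13 -> 16
  return 0;
}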
-template<class T, uint8_t ns> constexpr auto get_padding_helper(uint8_t runtime_ns) {
-  // helper function to get the padding for the given number of elements where ns is
-  // known at runtime; it uses recursion to find the padding
-  // this avoids having a function with a large number of switch cases,
-  // as GetPaddedSIMDWidth requires a compile time value
-  // it cannot be a lambda function because of the template recursion
-  if constexpr (ns < 2) {
-    return 0;
-  } else {
-    if (runtime_ns == ns) {
-      return get_padding<T, ns>();
-    } else {
-      return get_padding_helper(runtime_ns);
+///////////////////////////////////////////////////////////////////////////
+
+template<typename T>
+FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts, T eps,
+                                                double upsampfac, int kerevalmeth,
+                                                int debug, int showwarn, int dim)
+/* Initializes spreader kernel parameters given desired NUFFT tolerance eps,
+   upsampling factor (=sigma in paper, or R in Dutt-Rokhlin), ker eval meth
+   (either 0:exp(sqrt()), 1: Horner ppval), and some debug-level flags.
+   Also sets all default options in finufft_spread_opts. See finufft_spread_opts.h for
+   opts. dim is spatial dimension (1,2, or 3). See finufft.cpp:finufft_plan() for where
+   upsampfac is set. Must call this before any kernel evals done, otherwise segfault
+   likely.
+   Returns: 0                          : success
+            FINUFFT_WARN_EPS_TOO_SMALL : requested eps cannot be achieved, but
+                                         proceed with best possible eps
+            otherwise                  : failure (see codes in defs.h); spreading
+                                         must not proceed
+   Barnett 2017. debug, loosened eps logic 6/14/20.
+*/
+{
+  constexpr T EPSILON = std::numeric_limits<T>::epsilon();
+  if (upsampfac != 2.0 && upsampfac != 1.25) { // nonstandard sigma
+    if (kerevalmeth == 1) {
+      fprintf(stderr,
+              "FINUFFT setup_spreader: nonstandard upsampfac=%.3g cannot be handled by "
+              "kerevalmeth=1\n",
+              upsampfac);
+      return FINUFFT_ERR_HORNER_WRONG_BETA;
+    }
+    if (upsampfac <= 1.0) { // no digits would result
+      fprintf(stderr, "FINUFFT setup_spreader: error, upsampfac=%.3g is <=1.0\n",
+              upsampfac);
+      return FINUFFT_ERR_UPSAMPFAC_TOO_SMALL;
    }
+    // calling routine must abort on above errors, since opts is garbage!
+    if (showwarn && upsampfac > 4.0)
+      fprintf(stderr,
+              "FINUFFT setup_spreader warning: upsampfac=%.3g way too large to be "
+              "beneficial.\n",
+              upsampfac);
  }
-
-template<class T> uint8_t get_padding(uint8_t ns) {
-  // return the padding as a function of the number of elements
-  // 2 * MAX_NSPREAD is the maximum number of elements that we can have
-  // that's why it is hardcoded here
-  return get_padding_helper<T, 2 * MAX_NSPREAD>(ns);
-}
+  // write out default finufft_spread_opts (some overridden in setup_spreader_for_nufft)
+  opts.spread_direction = 0; // user should always set to 1 or 2 as desired
+  opts.sort             = 2; // 2:auto-choice
+  opts.kerpad           = 0; // affects only evaluate_kernel_vector
+  opts.kerevalmeth      = kerevalmeth;
+  opts.upsampfac        = upsampfac;
+  opts.nthreads         = 0; // all avail
+  opts.sort_threads     = 0; // 0:auto-choice
+  // heuristic dir=1 chunking for nthr>>1, typical for intel i7 and skylake...
+  opts.max_subproblem_size = (dim == 1) ? 10000 : 100000;
+  opts.flags = 0; // 0:no timing flags (>0 for experts only)
+  opts.debug = 0; // 0:no debug output
+  // heuristic nthr above which switch OMP critical to atomic (add_wrapped...):
+  opts.atomic_threshold = 10; // R Blackwell's value

-struct zip_low {
-  // helper struct to get the lower half of a SIMD register and zip it with itself
-  // it returns index 0, 0, 1, 1, ... N/2, N/2
-  static constexpr unsigned get(unsigned index, unsigned /*size*/) { return index / 2; }
-};
-struct zip_hi {
-  // helper struct to get the upper half of a SIMD register and zip it with itself
-  // it returns index N/2, N/2, N/2+1, N/2+1, ... N, N
-  static constexpr unsigned get(unsigned index, unsigned size) {
-    return (size + index) / 2;
-  }
-};
-template<unsigned cap> struct reverse_index {
-  static constexpr unsigned get(unsigned index, const unsigned size) {
-    return index < cap ? (cap - 1 - index) : index;
+  int ns, ier = 0;     // Set kernel width w (aka ns, nspread) then copy to opts...
+  if (eps < EPSILON) { // safety; there's no hope of beating e_mach
+    if (showwarn)
+      fprintf(stderr, "%s warning: increasing tol=%.3g to eps_mach=%.3g.\n", __func__,
+              (double)eps, (double)EPSILON);
+    eps = EPSILON; // only changes local copy (not any opts)
+    ier = FINUFFT_WARN_EPS_TOO_SMALL;
  }
-};
-template<unsigned cap> struct shuffle_index {
-  static constexpr unsigned get(unsigned index, const unsigned size) {
-    return index < cap ?
(cap - 1 - index) : size + size + cap - 1 - index;
+  if (upsampfac == 2.0)                          // standard sigma (see SISC paper)
+    ns = std::ceil(-log10(eps / (T)10.0));       // 1 digit per power of 10
+  else                                           // custom sigma
+    ns = std::ceil(-log(eps) / (PI * sqrt(1.0 - 1.0 / upsampfac))); // formula, gam=1
+  ns = max(2, ns);        // (we don't have ns=1 version yet)
+  if (ns > MAX_NSPREAD) { // clip to fit allocated arrays, Horner rules
+    if (showwarn)
+      fprintf(stderr,
+              "%s warning: at upsampfac=%.3g, tol=%.3g would need kernel width ns=%d; "
+              "clipping to max %d.\n",
+              __func__, upsampfac, (double)eps, ns, MAX_NSPREAD);
+    ns  = MAX_NSPREAD;
+    ier = FINUFFT_WARN_EPS_TOO_SMALL;
  }
-};
-
-struct select_even {
-  static constexpr unsigned get(unsigned index, unsigned /*size*/) { return index * 2; }
-};
-struct select_odd {
-  static constexpr unsigned get(unsigned index, unsigned /*size*/) {
-    return index * 2 + 1;
+  opts.nspread = ns;
+  // setup for reference kernel eval (via formula): select beta width param...
+  // (even when kerevalmeth=1, this ker eval needed for FTs in onedim_*_kernel)
+  opts.ES_halfwidth = (double)ns / 2; // constants to help (see below routines)
+  opts.ES_c         = 4.0 / (double)(ns * ns);
+  double betaoverns = 2.30;       // gives decent betas for default sigma=2.0
+  if (ns == 2) betaoverns = 2.20; // some small-width tweaks...
+  if (ns == 3) betaoverns = 2.26;
+  if (ns == 4) betaoverns = 2.38;
+  if (upsampfac != 2.0) { // again, override beta for custom sigma
+    T gamma    = 0.97;    // must match devel/gen_all_horner_C_code.m !
+    betaoverns = gamma * PI * (1.0 - 1.0 / (2 * upsampfac)); // formula based on cutoff
  }
-};
+  opts.ES_beta = betaoverns * ns; // set the kernel beta parameter
+  if (debug)
+    printf("%s (kerevalmeth=%d) eps=%.3g sigma=%.3g: chose ns=%d beta=%.3g\n", __func__,
+           kerevalmeth, (double)eps, upsampfac, ns, opts.ES_beta);

-template<class T> auto xsimd_to_array(const T &vec) noexcept {
-  constexpr auto alignment = T::arch_type::alignment();
-  alignas(alignment) std::array<typename T::value_type, T::size> array{};
-  vec.store_aligned(array.data());
-  return array;
+  return ier;
}
-
-void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset3,
-                        UBIGINT padded_size1, UBIGINT size1, UBIGINT size2, UBIGINT size3,
-                        UBIGINT M0) {
-  printf("size1 %ld, padded_size1 %ld\n", size1, padded_size1);
-  switch (ndims) {
-  case 1:
-    printf("\tsubgrid: off %lld\t siz %lld\t #NU %lld\n", (long long)offset1,
-           (long long)padded_size1, (long long)M0);
-    break;
-  case 2:
-    printf("\tsubgrid: off %lld,%lld\t siz %lld,%lld\t #NU %lld\n", (long long)offset1,
-           (long long)offset2, (long long)padded_size1, (long long)size2, (long long)M0);
-    break;
-  case 3:
-    printf("\tsubgrid: off %lld,%lld,%lld\t siz %lld,%lld,%lld\t #NU %lld\n",
-           (long long)offset1, (long long)offset2, (long long)offset3,
-           (long long)padded_size1, (long long)size2, (long long)size3, (long long)M0);
-    break;
-  default:
-    printf("Invalid number of dimensions: %d\n", ndims);
-    break;
-  }
 }
+template FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(
+    finufft_spread_opts &opts, float eps, double upsampfac, int kerevalmeth, int debug,
+    int showwarn, int dim);
+template FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(
+    finufft_spread_opts &opts, double eps, double upsampfac, int kerevalmeth, int debug,
+    int showwarn, int dim);
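// [Editor's illustrative sketch -- not part of the FINUFFT diff.] The width
// and beta heuristics above in numbers: at sigma=2 each decimal digit of
// accuracy costs roughly one grid point of kernel width, while sigma=1.25
// needs a wider kernel for the same tolerance.
#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  const double pi = std::acos(-1.0);
  for (double eps : {1e-3, 1e-6, 1e-9}) {
    int ns2   = std::max(2, (int)std::ceil(-std::log10(eps / 10.0))); // sigma = 2.0
    int ns125 = std::max(
        2, (int)std::ceil(-std::log(eps) / (pi * std::sqrt(1.0 - 1.0 / 1.25))));
    double beta = 2.30 * ns2; // default beta/ns at sigma = 2 (small-ns tweaks above)
    std::printf("eps=%.0e: ns=%d (sigma=2), ns=%d (sigma=1.25), beta=%.2f\n", eps, ns2,
                ns125, beta);
  }
}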
+
+template<typename T>
+T evaluate_kernel(T x, const finufft_spread_opts &opts)
+/* ES ("exp sqrt") kernel evaluation at single real argument:
+      phi(x) = exp(beta.(sqrt(1 - (2x/n_s)^2) - 1)),    for |x| < nspread/2
+   related to an asymptotic approximation to the Kaiser--Bessel, itself an
+   approximation to prolate spheroidal wavefunction (PSWF) of order 0.
+   This is the "reference implementation", used by eg finufft/onedim_* 2/17/17.
+   Rescaled so max is 1, Barnett 7/21/24
+*/
+{
+  if (abs(x) >= (T)opts.ES_halfwidth)
+    // if spreading/FT careful, shouldn't need this if, but causes no speed hit
+    return 0.0;
+  else
+    return exp((T)opts.ES_beta * (sqrt((T)1.0 - (T)opts.ES_c * x * x) - (T)1.0));
}
-
+
+template float evaluate_kernel(float x, const finufft_spread_opts &opts);
+template double evaluate_kernel(double x, const finufft_spread_opts &opts);
 } // namespace finufft::spreadinterp
diff --git a/src/utils.cpp b/src/utils.cpp
index 8df6ed665..f64009132 100644
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -1,86 +1,89 @@
-// Low-level array manipulations, timer, and OMP helpers, that need separate
-// single/double routines (FLT must be an arg). Others are in utils_precindep
+// Low-level array manipulations, timer, and OMP helpers, that are precision-
+// independent (no FLT allowed in argument lists).

-// For self-test see ../test/testutils.cpp Barnett 2017-2020.
+// For self-test see ../test/testutils.cpp. Barnett 2017-2020.
+
+#include <random>

 #include "finufft/utils.h"
-#include "finufft/defs.h"
+using namespace std;

 namespace finufft {
 namespace utils {

-// ------------ complex array utils ---------------------------------
-
-FLT relerrtwonorm(BIGINT n, CPX *a, CPX *b)
-// ||a-b||_2 / ||a||_2
+BIGINT next235even(BIGINT n)
+// finds even integer not less than n, with prime factors no larger than 5
+// (ie, "smooth"). Adapted from fortran in hellskitchen. Barnett 2/9/17
+// changed INT64 type 3/28/17. Runtime is around n*1e-11 sec for big n.
 {
-  FLT err = 0.0, nrm = 0.0;
-  for (BIGINT m = 0; m < n; ++m) {
-    nrm += real(conj(a[m]) * a[m]);
-    CPX diff = a[m] - b[m];
-    err += real(conj(diff) * diff);
+  if (n <= 2) return 2;
+  if (n % 2 == 1) n += 1;   // even
+  BIGINT nplus  = n - 2;    // to cancel out the +=2 at start of loop
+  BIGINT numdiv = 2;        // a dummy that is >1
+  while (numdiv > 1) {
+    nplus += 2;             // stays even
+    numdiv = nplus;
+    while (numdiv % 2 == 0) numdiv /= 2; // remove all factors of 2,3,5...
+    while (numdiv % 3 == 0) numdiv /= 3;
+    while (numdiv % 5 == 0) numdiv /= 5;
  }
-  return sqrt(err / nrm);
+  return nplus;
}

-FLT errtwonorm(BIGINT n, CPX *a, CPX *b)
-// ||a-b||_2
-{
-  FLT err = 0.0; // compute error 2-norm
-  for (BIGINT m = 0; m < n; ++m) {
-    CPX diff = a[m] - b[m];
-    err += real(conj(diff) * diff);
-  }
-  return sqrt(err);
+
+// ----------------------- helpers for timing (always stay double prec) ------
+
+void CNTime::start() {
+  initial = double(std::chrono::duration_cast<std::chrono::microseconds>(
+                       std::chrono::steady_clock::now().time_since_epoch())
+                       .count()) *
+            1e-6;
}

-FLT twonorm(BIGINT n, CPX *a)
-// ||a||_2
+
+double CNTime::restart()
+// Barnett changed to returning in sec
{
-  FLT nrm = 0.0;
-  for (BIGINT m = 0; m < n; ++m) nrm += real(conj(a[m]) * a[m]);
-  return sqrt(nrm);
+  double delta = elapsedsec();
+  start();
+  return delta;
}
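// [Editor's illustrative sketch -- not part of the FINUFFT diff.] What
// next235even() above guarantees: the smallest even integer >= n whose prime
// factors are all <= 5, i.e. the "smooth" sizes FFTW handles fastest. A
// standalone copy for experimentation:
#include <cstdint>
#include <cstdio>

std::int64_t next235even(std::int64_t n) {
  if (n <= 2) return 2;
  if (n % 2 == 1) n += 1;  // make even
  std::int64_t nplus  = n - 2; // cancels the += 2 at loop start
  std::int64_t numdiv = 2;     // dummy > 1 so we enter the loop
  while (numdiv > 1) {
    nplus += 2;                // stays even
    numdiv = nplus;
    while (numdiv % 2 == 0) numdiv /= 2; // strip all factors of 2, 3, 5
    while (numdiv % 3 == 0) numdiv /= 3;
    while (numdiv % 5 == 0) numdiv /= 5;
  }
  return nplus;
}

int main() {
  for (std::int64_t n : {1000, 1009, 4099})
    std::printf("next235even(%lld) = %lld\n", (long long)n, (long long)next235even(n));
  // 1000 (= 2^3 * 5^3) stays 1000; 1009 rounds up to 1024 (= 2^10)
}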
-FLT infnorm(BIGINT n, CPX *a)
-// ||a||_infty
+
+double CNTime::elapsedsec()
+// returns answers as double, in seconds, to microsec accuracy. Barnett 5/22/18
 {
-  FLT nrm = 0.0;
-  for (BIGINT m = 0; m < n; ++m) {
-    FLT aa = real(conj(a[m]) * a[m]);
-    if (aa > nrm) nrm = aa;
-  }
-  return sqrt(nrm);
+  std::uint64_t now = std::chrono::duration_cast<std::chrono::microseconds>(
+                          std::chrono::steady_clock::now().time_since_epoch())
+                          .count();
+  const double nowsec = double(now) * 1e-6;
+  return nowsec - initial;
}

-// ------------ real array utils ---------------------------------
-
-void arrayrange(BIGINT n, FLT *a, FLT *lo, FLT *hi)
-// With a a length-n array, writes out min(a) to lo and max(a) to hi,
-// so that all a values lie in [lo,hi].
-// If n==0, lo and hi are not finite.
+// -------------------------- openmp helpers -------------------------------
+int get_num_threads_parallel_block()
+// return how many threads an omp parallel block would use.
+// omp_get_max_threads() does not report this; consider case of NESTED=0.
+// Why is there no such routine? Barnett 5/22/20
 {
-  *lo = INFINITY;
-  *hi = -INFINITY;
-  for (BIGINT m = 0; m < n; ++m) {
-    if (a[m] < *lo) *lo = a[m];
-    if (a[m] > *hi) *hi = a[m];
+  int nth_used;
+#pragma omp parallel
+  {
+#pragma omp single
+    nth_used = MY_OMP_GET_NUM_THREADS();
  }
+  return nth_used;
}

-void arraywidcen(BIGINT n, FLT *a, FLT *w, FLT *c)
-// Writes out w = half-width and c = center of an interval enclosing all a[n]'s
-// Only chooses a nonzero center if this increases w by less than fraction
-// ARRAYWIDCEN_GROWFRAC defined in defs.h.
-// This prevents rephasings which don't grow nf by much. 6/8/17
-// If n==0, w and c are not finite.
+// ---------- thread-safe rand number generator for Windows platform ---------
+// (note this is used by macros in defs.h, and supplied in linux/macosx)
+#ifdef _WIN32
+int rand_r(unsigned int * /*seedp*/)
+// Libin Lu, 6/18/20
 {
-  FLT lo, hi;
-  arrayrange(n, a, &lo, &hi);
-  *w = (hi - lo) / 2;
-  *c = (hi + lo) / 2;
-  if (std::abs(*c) < ARRAYWIDCEN_GROWFRAC * (*w)) {
-    *w += std::abs(*c);
-    *c = 0.0;
-  }
+  std::random_device rd;
+  std::default_random_engine generator(rd());
+  std::uniform_int_distribution distribution(0, RAND_MAX);
+  return distribution(generator);
}
+#endif

 } // namespace utils
 } // namespace finufft
diff --git a/src/utils_precindep.cpp b/src/utils_precindep.cpp
deleted file mode 100644
index 194fae7f0..000000000
--- a/src/utils_precindep.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-// Low-level array manipulations, timer, and OMP helpers, that are precision-
-// independent (no FLT allowed in argument lists). Others are in utils.cpp
-
-// For self-test see ../test/testutils.cpp. Barnett 2017-2020.
-
-#include <random>
-
-#include "finufft/defs.h"
-#include "finufft/utils_precindep.h"
-using namespace std;
-
-namespace finufft {
-namespace utils {
-
-BIGINT next235even(BIGINT n)
-// finds even integer not less than n, with prime factors no larger than 5
-// (ie, "smooth"). Adapted from fortran in hellskitchen. Barnett 2/9/17
-// changed INT64 type 3/28/17. Runtime is around n*1e-11 sec for big n.
-{
-  if (n <= 2) return 2;
-  if (n % 2 == 1) n += 1;   // even
-  BIGINT nplus  = n - 2;    // to cancel out the +=2 at start of loop
-  BIGINT numdiv = 2;        // a dummy that is >1
-  while (numdiv > 1) {
-    nplus += 2;             // stays even
-    numdiv = nplus;
-    while (numdiv % 2 == 0) numdiv /= 2; // remove all factors of 2,3,5...
-    while (numdiv % 3 == 0) numdiv /= 3;
-    while (numdiv % 5 == 0) numdiv /= 5;
-  }
-  return nplus;
-}
-
-// ----------------------- helpers for timing (always stay double prec) ------
-
-void CNTime::start() {
-  initial = std::chrono::duration_cast<std::chrono::microseconds>(
-                std::chrono::steady_clock::now().time_since_epoch())
-                .count() *
-            1e-6;
-}
-
-double CNTime::restart()
-// Barnett changed to returning in sec
-{
-  double delta = elapsedsec();
-  start();
-  return delta;
-}
-
-double CNTime::elapsedsec()
-// returns answers as double, in seconds, to microsec accuracy. Barnett 5/22/18
-{
-  std::uint64_t now = std::chrono::duration_cast<std::chrono::microseconds>(
-                          std::chrono::steady_clock::now().time_since_epoch())
-                          .count();
-  const double nowsec = now * 1e-6;
-  return nowsec - initial;
-}
-
-// -------------------------- openmp helpers -------------------------------
-int get_num_threads_parallel_block()
-// return how many threads an omp parallel block would use.
-// omp_get_max_threads() does not report this; consider case of NESTED=0.
-// Why is there no such routine? Barnett 5/22/20
-{
-  int nth_used;
-#pragma omp parallel
-  {
-#pragma omp single
-    nth_used = MY_OMP_GET_NUM_THREADS();
-  }
-  return nth_used;
-}
-
-// ---------- thread-safe rand number generator for Windows platform ---------
-// (note this is used by macros in defs.h, and supplied in linux/macosx)
-#ifdef _WIN32
-int rand_r(unsigned int *seedp)
-// Libin Lu, 6/18/20
-{
-  std::random_device rd;
-  std::default_random_engine generator(rd());
-  std::uniform_int_distribution distribution(0, RAND_MAX);
-  return distribution(generator);
-}
-#endif
-
-} // namespace utils
-} // namespace finufft
diff --git a/test/testutils.cpp b/test/testutils.cpp
index 64b5d7a0a..6facb72cd 100644
--- a/test/testutils.cpp
+++ b/test/testutils.cpp
@@ -1,4 +1,4 @@
-/* unit tests for utils & utils_precindep modules.
+/* unit tests for utils module.

   Usage: ./testutils{f}

@@ -10,8 +10,8 @@
   Suggested compile (double/float versions):
   g++ -std=c++14 -fopenmp testutils.cpp -I../include ../src/utils.o
-  ../src/utils_precindep.o -o testutils -lgomp g++ -std=c++14 -fopenmp testutils.cpp
-  -I../include ../src/utils_32.o ../src/utils_precindep.o -o testutilsf -lgomp -DSINGLE
+  ../src/utils.o -o testutils -lgomp g++ -std=c++14 -fopenmp testutils.cpp
+  -I../include ../src/utils.o -o testutilsf -lgomp -DSINGLE
 */

 // This switches FLT macro from double to float if SINGLE is defined, etc...
@@ -57,7 +57,8 @@ int main(int argc, char *argv[]) {
     a[j] = CPX(1.0, 0.0);
     b[j] = a[j];
   }
-  FLT relerr = 2.0 * EPSILON; // 1 ULP, fine since 1.0 rep exactly
+  constexpr FLT EPSILON = std::numeric_limits<FLT>::epsilon();
+  FLT relerr = 2.0 * EPSILON; // 1 ULP, fine since 1.0 rep exactly
   if (abs(infnorm(M, &a[0]) - 1.0) > relerr) return 1;
   if (abs(twonorm(M, &a[0]) - sqrt((FLT)M)) > relerr * sqrt((FLT)M)) return 1;
   b[0] = CPX(0.0, 0.0); // perturb b from a
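// [Editor's illustrative sketch -- not part of the FINUFFT diff.] A
// self-contained analogue of the relerrtwonorm()/twonorm() checks this unit
// test relies on, i.e. the relative error ||a - b||_2 / ||a||_2 for complex
// arrays:
#include <cmath>
#include <complex>
#include <cstdio>
#include <vector>

template<class T>
T relerrtwonorm(const std::vector<std::complex<T>> &a,
                const std::vector<std::complex<T>> &b) {
  T err = 0, nrm = 0;
  for (std::size_t m = 0; m < a.size(); ++m) {
    nrm += std::norm(a[m]);        // |a_m|^2
    err += std::norm(a[m] - b[m]); // |a_m - b_m|^2
  }
  return std::sqrt(err / nrm);
}

int main() {
  std::vector<std::complex<double>> a(1000, {1.0, 0.0}), b = a;
  b[0] = {0.0, 0.0}; // perturb one entry, as the test above does
  std::printf("relerr = %.3g\n", relerrtwonorm(a, b)); // approx 1/sqrt(1000)
}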