Improved OpenMP support #443

Merged · 5 commits · Aug 24, 2023
4 changes: 2 additions & 2 deletions src/Lattice.cu.Rt
@@ -1222,7 +1222,7 @@ void Lattice::Get<?%s q$name ?>(lbRegion over, <?%s q$type ?> * tab, real_t scal
small.dx -= region.dx;
small.dy -= region.dy;
small.dz -= region.dz;
- CudaKernelRun( get<?%s q$name ?> , dim3(small.nx,small.ny,small.nz) , dim3(1) , (small, buf, scale));
+ CudaKernelRun( get<?%s q$name ?> , dim3(small.nx,small.ny,small.nz) , dim3(1) , small, buf, scale);
CudaMemcpy(tab, buf, small.sizeL()*sizeof(<?%s q$type ?>), CudaMemcpyDeviceToHost);
}
CudaFree(buf);
@@ -1396,7 +1396,7 @@ void Lattice::GetSample<?%s q$name ?>(lbRegion over, real_t scale,real_t* buf)
<?R if (q$adjoint) { ?> container->adjin = aSnaps[aSnap]; <?R } ?>
container->CopyToConst();
lbRegion small = region.intersect(over);
- CudaKernelRun( get<?%s q$name ?> , dim3(small.nx,small.ny) , dim3(1) , (small, (<?%s q$type?>*) buf, scale));
+ CudaKernelRun( get<?%s q$name ?> , dim3(small.nx,small.ny) , dim3(1) , small, (<?%s q$type?>*) buf, scale);
}
<?R } ;ifdef() ?>
void Lattice::updateAllSamples(){
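The call-site change above follows from the new variadic launch macros in src/cross.h (see below): kernel arguments are now passed as trailing macro arguments instead of one pre-parenthesized tuple. A minimal sketch of the two shapes, using hypothetical KERNEL_RUN_* names rather than the project's macros:

```cpp
#include <cstdio>

// Old shape: the whole argument list travels as a single parenthesized token.
#define KERNEL_RUN_OLD(fn__, args__) fn__ args__
// New shape: a variadic macro forwards the arguments itself.
#define KERNEL_RUN_NEW(fn__, ...)    fn__(__VA_ARGS__)

void kernel(int a, float b) { std::printf("a=%d b=%g\n", a, b); }

int main() {
    KERNEL_RUN_OLD(kernel, (1, 2.0f)); // note the extra parentheses at the call site
    KERNEL_RUN_NEW(kernel, 1, 2.0f);   // plain argument list, as in this PR
}
```

The variadic form also lets the CPU backend swap the macro for an ordinary template function, which a parenthesized-tuple macro cannot forward to.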
1 change: 0 additions & 1 deletion src/LatticeContainer.h.Rt
@@ -148,7 +148,6 @@ class LatticeContainer {
template<class T> CudaGlobalFunction void Kernel();
template < eOperationType I, eCalculateGlobals G, eStage S > class InteriorExecutor;
template < eOperationType I, eCalculateGlobals G, eStage S > class BorderExecutor;
- CudaGlobalFunction void ColorKernel( uchar4 *optr);

<?R
for (q in rows(Quantities)) { ifdef(q$adjoint);
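The declaration removed here appears to have been stale: as the next file shows, the ColorKernel definition takes ( uchar4 *optr, int z ), so the single-argument prototype in the header no longer matched it.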
6 changes: 3 additions & 3 deletions src/LatticeContainer.inc.cpp.Rt
@@ -316,7 +316,7 @@ template <class EX> inline void LatticeContainer::RunBorderT(CudaStream_t stream
int totx = <?%s blx ?>;
blx.x = ceiling_div(totx, thr.y);
blx.y = <?%d thy ?>;
- CudaKernelRunNoWait(Kernel< EX >, blx, thr, (), stream);
+ CudaKernelRunNoWait(Kernel< EX >, blx, thr, stream);
};

/// Run the interior kernel
@@ -336,7 +336,7 @@ template <class EX> inline void LatticeContainer::RunInteriorT(CudaStream_t stre
blx.x = ceiling_div(totx, thr.y);
int toty = nz - <?%d BorderMargin$max[3]-BorderMargin$min[3] ?>;
blx.y = toty;
- CudaKernelRunNoWait(Kernel< EX >, blx, thr, (), stream);
+ CudaKernelRunNoWait(Kernel< EX >, blx, thr, stream);
};

template < eOperationType I, eCalculateGlobals G, eStage S >
@@ -417,7 +417,7 @@ CudaGlobalFunction void ColorKernel( uchar4 *optr, int z )
*/
void LatticeContainer::Color( uchar4 *optr ) {
CudaCopyToConstant("constContainer", constContainer, this, sizeof(LatticeContainer));
- CudaKernelRun( ColorKernel , dim3(floor(nx/X_BLOCK),ny,1), dim3(X_BLOCK) ,(optr, nz/2));
+ CudaKernelRun( ColorKernel , dim3(floor(nx/X_BLOCK),ny,1), dim3(X_BLOCK) , optr, nz/2);
};

// Functions for getting quantities
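A note on the launch forms in this file: with variadic macros the stream argument can no longer trail a parenthesized tuple, so CudaKernelRunNoWait now takes the stream immediately after the launch configuration, and any kernel arguments follow as a (possibly empty) variadic pack, as in CudaKernelRunNoWait(Kernel< EX >, blx, thr, stream).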
3 changes: 2 additions & 1 deletion src/conf.R
@@ -15,8 +15,9 @@ if (!exists("DOUBLE")) DOUBLE=0
if (!exists("SYMALGEBRA")) SYMALGEBRA=FALSE
if (!exists("NEED_OFFSETS")) NEED_OFFSETS=TRUE
if (!exists("X_MOD")) X_MOD=0
+ if (!exists("CPU_LAYOUT")) CPU_LAYOUT=FALSE

- memory_arr_cpu = FALSE
+ memory_arr_cpu = CPU_LAYOUT
memory_arr_mod = X_MOD

# SYMALGEBRA=TRUE
1 change: 1 addition & 0 deletions src/config.R.in
@@ -1 +1,2 @@
X_MOD = @X_MOD@
+ CPU_LAYOUT = @CPU_LAYOUT@
17 changes: 15 additions & 2 deletions src/configure.ac
@@ -155,6 +155,10 @@ AC_ARG_WITH([warp-size],
AS_HELP_STRING([--with-warp-size],
[Set the value used for the warp size]))

+ AC_ARG_ENABLE([cpu-layout],
+ AS_HELP_STRING([--enable-cpu-layout],
+ [Enable cpu-optimised memory layout]))


AC_ARG_ENABLE([paranoid],
AS_HELP_STRING([--enable-paranoid],
@@ -399,7 +403,7 @@ else
if ! test -z "$OPENMP"
then
CPPFLAGS="${CPPFLAGS} $OPENMP"
- LDFLAGS="${CPPFLAGS} $OPENMP"
+ LDFLAGS="${LDFLAGS} $OPENMP"
AC_DEFINE([CROSS_OPENMP], [1], [Using OpenMP])
fi
if test "x${with_x_block}" != "x"
@@ -433,6 +437,14 @@ then
AC_DEFINE([USE_ADDOPP], [1], [Opportunistic warp level operations])
fi

+ if test "x${enable_cpu_layout}" == "xyes"
+ then
+ CPU_LAYOUT="TRUE"
+ else
+ CPU_LAYOUT="FALSE"
+ fi


AC_MSG_CHECKING([MPI include path])
if test -z "${MPI_INCLUDE}"; then
if test -z "${MPI}"; then
@@ -947,7 +959,7 @@ fi
if test "x${enable_paranoid}" == "xyes"
then
CPPFLAGS="${CPPFLAGS} -Wall -Werror -Wno-unknown-warning-option"
- CPPFLAGS="${CPPFLAGS} -Wno-unused-but-set-variable -Wno-unused-variable -Wno-format-overflow -Wno-unused-private-field -Wno-self-assign"
+ CPPFLAGS="${CPPFLAGS} -Wno-unused-but-set-variable -Wno-unused-variable -Wno-format-overflow -Wno-unused-private-field -Wno-self-assign -Wno-unknown-pragmas"
fi

if test "x${enable_profiling}" == "xyes"
@@ -999,6 +1011,7 @@ AC_SUBST(EMBEDED_PYTHON)
AC_SUBST(X_BLOCK)
AC_SUBST(WARPSIZE)
AC_SUBST(X_MOD)
+ AC_SUBST(CPU_LAYOUT)

AC_CONFIG_FILES([CLB/config.mk:src/config.mk.in])
AC_CONFIG_FILES([CLB/config.R_:src/config.R.in])
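Taken together with the conf.R and config.R.in hunks above, the new option forms a single chain: ./configure --enable-cpu-layout sets CPU_LAYOUT=TRUE, AC_SUBST substitutes it into config.R, and conf.R's memory_arr_cpu = CPU_LAYOUT switches the code generator to the CPU-optimised memory layout; without the flag the default stays FALSE, preserving the GPU layout.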
83 changes: 47 additions & 36 deletions src/cross.h
@@ -54,18 +54,18 @@
#endif

#ifndef CROSS_HIP
- #define CudaKernelRun(a__,b__,c__,d__) a__<<<b__,c__>>>d__; HANDLE_ERROR( cudaDeviceSynchronize()); HANDLE_ERROR( cudaGetLastError() )
+ #define CudaKernelRun(a__,b__,c__,...) a__<<<b__,c__>>>(__VA_ARGS__); HANDLE_ERROR( cudaDeviceSynchronize()); HANDLE_ERROR( cudaGetLastError() )
#ifdef CROSS_SYNC
- #define CudaKernelRunNoWait(a__,b__,c__,d__,e__) a__<<<b__,c__>>>d__; HANDLE_ERROR( cudaDeviceSynchronize()); HANDLE_ERROR( cudaGetLastError() );
+ #define CudaKernelRunNoWait(a__,b__,c__,e__,...) a__<<<b__,c__>>>(__VA_ARGS__); HANDLE_ERROR( cudaDeviceSynchronize()); HANDLE_ERROR( cudaGetLastError() );
#else
- #define CudaKernelRunNoWait(a__,b__,c__,d__,e__) a__<<<b__,c__,0,e__>>>d__;
+ #define CudaKernelRunNoWait(a__,b__,c__,e__,...) a__<<<b__,c__,0,e__>>>(__VA_ARGS__);
#endif
#else
- #define CudaKernelRun(a__,b__,c__,d__) a__<<<b__,c__>>>d__; HANDLE_ERROR( hipDeviceSynchronize()); HANDLE_ERROR( hipGetLastError() )
+ #define CudaKernelRun(a__,b__,c__,...) a__<<<b__,c__>>>(__VA_ARGS__); HANDLE_ERROR( hipDeviceSynchronize()); HANDLE_ERROR( hipGetLastError() )
#ifdef CROSS_SYNC
- #define CudaKernelRunNoWait(a__,b__,c__,d__,e__) a__<<<b__,c__>>>d__; HANDLE_ERROR( hipDeviceSynchronize()); HANDLE_ERROR( hipGetLastError() );
+ #define CudaKernelRunNoWait(a__,b__,c__,e__,...) a__<<<b__,c__>>>(__VA_ARGS__); HANDLE_ERROR( hipDeviceSynchronize()); HANDLE_ERROR( hipGetLastError() );
#else
- #define CudaKernelRunNoWait(a__,b__,c__,d__,e__) a__<<<b__,c__,0,e__>>>d__;
+ #define CudaKernelRunNoWait(a__,b__,c__,e__,...) a__<<<b__,c__,0,e__>>>(__VA_ARGS__);
#endif
#endif
#define CudaBlock blockIdx
@@ -239,24 +239,6 @@
#define CudaSyncThreads() //assert(CpuThread.x == 0)
#define CudaSyncThreadsOr(x__) x__
#define CudaSyncWarpOr(x__) x__
- #ifdef CROSS_OPENMP
- #define OMP_PARALLEL_FOR _Pragma("omp parallel for simd")
- #define CudaKernelRun(a__,b__,c__,d__) \
- OMP_PARALLEL_FOR \
- for (int x__ = 0; x__ < b__.x; x__++) { CpuBlock.x = x__; \
- for (CpuBlock.y = 0; CpuBlock.y < b__.y; CpuBlock.y++) \
- for (CpuBlock.z = 0; CpuBlock.z < b__.z; CpuBlock.z++) \
- a__ d__; \
- }
- #else
- #define CudaKernelRun(a__,b__,c__,d__) \
- for (CpuBlock.y = 0; CpuBlock.y < b__.y; CpuBlock.y++) \
- for (CpuBlock.x = 0; CpuBlock.x < b__.x; CpuBlock.x++) \
- for (CpuBlock.z = 0; CpuBlock.z < b__.z; CpuBlock.z++) \
- a__ d__;
- #endif
-
- #define CudaKernelRunNoWait(a__,b__,c__,d__,e__) CudaKernelRun(a__,b__,c__,d__);
#define CudaBlock CpuBlock
#define CudaThread CpuThread
#define CudaNumberOfThreads CpuSize
@@ -275,11 +257,7 @@
#define CudaEventCreate(a__) *a__ = 0
#define CudaEventDestroy(a__)
#define CudaEventRecord(a__,b__) a__ = b__
- #ifdef CROSS_OPENMP
- #define CudaEventSynchronize(a__) a__ = omp_get_wtime()*1000
- #else
- #define CudaEventSynchronize(a__) a__ = 1000*((double) clock())/CLOCKS_PER_SEC
- #endif
+ #define CudaEventSynchronize(a__) a__ = get_walltime()*1000;
#define CudaEventQuery(a__) CudaSuccess
#define CudaEventElapsedTime(a__,b__,c__) *(a__) = (c__ - b__)
#define CudaDeviceSynchronize()
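The event-timing change above is itself a fix: clock() counts processor time summed over all threads, so an OpenMP run would appear to take several times longer than it really does; routing both the OpenMP and plain CPU builds through one wall-clock source, get_walltime() (presumably defined elsewhere in the tree), keeps the reported event times comparable.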
@@ -302,6 +280,32 @@
#endif
extern uint3 CpuThread;
extern uint3 CpuSize;

+ #include <functional>
+
+ template <typename F, typename ...P>
+ inline void CPUKernelRun(F &&func, const dim3& blocks, P &&... args) {
+ #pragma omp parallel for collapse(3) schedule(static)
+ for (unsigned int y = 0; y < blocks.y; y++)
+ for (unsigned int x = 0; x < blocks.x; x++)
+ for (unsigned int z = 0; z < blocks.z; z++) {
+ CpuBlock.x = x;
+ CpuBlock.y = y;
+ CpuBlock.z = z;
+ func(std::forward<P>(args)...);
+ }
+ }
+
+ template <typename F, typename ...P>
+ inline void CudaKernelRun(F &&func, const dim3& blocks, const dim3& threads, P &&... args) {
+ CPUKernelRun(func, blocks, std::forward<P>(args)...);
+ }
+
+ template <typename F, typename ...P>
+ inline void CudaKernelRunNoWait(F &&func, const dim3& blocks, const dim3& threads, CudaStream_t stream, P &&... args) {
+ CPUKernelRun(func, blocks, std::forward<P>(args)...);
+ }
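These templates replace the old macro loop nest: the grid is flattened with collapse(3) and each iteration plays the role of one CUDA block. A self-contained sketch of the same pattern, with hypothetical names; the real code routes the block index through the global CpuBlock, while the sketch uses a thread_local so each OpenMP thread keeps its own copy:

```cpp
#include <cstdio>
#include <utility>

struct Dim3 { unsigned x, y, z; };    // stand-in for CUDA's dim3
thread_local Dim3 cpu_block{0, 0, 0}; // per-thread analogue of CpuBlock

template <typename F, typename... P>
void cpu_kernel_run(F&& func, const Dim3& blocks, P&&... args) {
    // Flatten the 3D grid into one parallel loop nest; each iteration is "one block".
    #pragma omp parallel for collapse(3) schedule(static)
    for (unsigned y = 0; y < blocks.y; y++)
        for (unsigned x = 0; x < blocks.x; x++)
            for (unsigned z = 0; z < blocks.z; z++) {
                cpu_block = {x, y, z};           // blockIdx for this iteration
                func(std::forward<P>(args)...);  // run the "kernel" body
            }
}

int main() {
    cpu_kernel_run([](float scale) {
        std::printf("block (%u,%u,%u) scale=%g\n",
                    cpu_block.x, cpu_block.y, cpu_block.z, scale);
    }, Dim3{2, 2, 1}, 0.5f);
}
```

schedule(static) hands each thread a fixed, contiguous slab of the collapsed iteration space, so the block-to-thread mapping stays deterministic between calls.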

void memcpy2D(void * dst_, int dpitch, void * src_, int spitch, int width, int height);

template <class T, class P> inline T data_cast(const P& x) {
@@ -318,16 +322,23 @@
#define __longlong_as_double(x__) data_cast<double , long long int >(x__)
#define __double_as_longlong(x__) data_cast<long long int , double >(x__)

- template <typename T> inline void CudaAtomicAdd(T * sum, T val) { sum[0] += val; }
- template <typename T> inline void CudaAtomicAddReduce(T * sum, T val) { sum[0] += val; }
- template <typename T> inline void CudaAtomicAddReduceWarp(T * sum, T val) { sum[0] += val; }
- template <typename T> inline void CudaAtomicAddReduceDiff(T * sum, T val, bool yes) { if (yes) sum[0] += val; }
- template <typename T> inline void CudaAtomicMaxReduce(T * sum, T val) { if (val > sum[0]) sum[0] = val; }
- template <typename T> inline void CudaAtomicMaxReduceWarp(T * sum, T val) { if (val > sum[0]) sum[0] = val; }
+ template <typename T> inline void CudaAtomicAdd(T * sum, T val) {
+ #pragma omp atomic
+ sum[0] += val;
+ }
+ template <typename T> inline void CudaAtomicMax(T * sum, T val) {
+ #pragma omp critical
+ { if (val > sum[0]) sum[0] = val; }
+ }
+ template <typename T> inline void CudaAtomicAddReduce(T * sum, T val) { CudaAtomicAdd(sum, val); }
+ template <typename T> inline void CudaAtomicAddReduceWarp(T * sum, T val) { CudaAtomicAdd(sum, val); }
+ template <typename T> inline void CudaAtomicAddReduceDiff(T * sum, T val, bool yes) { if (yes) CudaAtomicAdd(sum, val); }
+ template <typename T> inline void CudaAtomicMaxReduce(T * sum, T val) { CudaAtomicMax(sum, val); }
+ template <typename T> inline void CudaAtomicMaxReduceWarp(T * sum, T val) { CudaAtomicMax(sum, val); }

template <int LEN, typename T>
inline void CudaAtomicAddReduceWarpArr(T * sum, T val[LEN]) {
- for (unsigned char i = 0; i < LEN; i ++) sum[i] += val[i];
+ for (unsigned char i = 0; i < LEN; i ++) CudaAtomicAdd(&sum[i], val[i]);
}
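The CPU atomics now actually synchronize under OpenMP: the += update maps onto #pragma omp atomic, while max falls back to a critical section because omp atomic only covers simple update forms (OpenMP 5.1's atomic compare would cover max, but compiler support for it was patchy). A runnable sketch of both strategies, with hypothetical names:

```cpp
#include <cstdio>

template <typename T> void atomic_add(T* sum, T val) {
    #pragma omp atomic          // hardware atomic read-modify-write
    sum[0] += val;
}

template <typename T> void atomic_max(T* best, T val) {
    #pragma omp critical        // mutual exclusion; no atomic form for max here
    { if (val > best[0]) best[0] = val; }
}

int main() {
    double sum = 0.0, best = 0.0;
    #pragma omp parallel for
    for (int i = 0; i < 1000; i++) {
        atomic_add(&sum, 1.0);
        atomic_max(&best, (double)i);
    }
    std::printf("sum=%g best=%g\n", sum, best); // sum=1000 best=999
}
```

The atomic form is cheap and contention-friendly; the critical section is heavier but is only hit on reduction tails, so the trade-off is reasonable here.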

#define ISFINITE(l__) std::isfinite(l__)