Improved OpenMP support #443

Merged · 5 commits · Aug 24, 2023
4 changes: 2 additions & 2 deletions src/Lattice.cu.Rt
@@ -1222,7 +1222,7 @@ void Lattice::Get<?%s q$name ?>(lbRegion over, <?%s q$type ?> * tab, real_t scal
small.dx -= region.dx;
small.dy -= region.dy;
small.dz -= region.dz;
- CudaKernelRun( get<?%s q$name ?> , dim3(small.nx,small.ny,small.nz) , dim3(1) , (small, buf, scale));
+ CudaKernelRun( get<?%s q$name ?> , dim3(small.nx,small.ny,small.nz) , dim3(1) , small, buf, scale);
CudaMemcpy(tab, buf, small.sizeL()*sizeof(<?%s q$type ?>), CudaMemcpyDeviceToHost);
}
CudaFree(buf);
@@ -1396,7 +1396,7 @@ void Lattice::GetSample<?%s q$name ?>(lbRegion over, real_t scale,real_t* buf)
<?R if (q$adjoint) { ?> container->adjin = aSnaps[aSnap]; <?R } ?>
container->CopyToConst();
lbRegion small = region.intersect(over);
- CudaKernelRun( get<?%s q$name ?> , dim3(small.nx,small.ny) , dim3(1) , (small, (<?%s q$type?>*) buf, scale));
+ CudaKernelRun( get<?%s q$name ?> , dim3(small.nx,small.ny) , dim3(1) , small, (<?%s q$type?>*) buf, scale);
}
<?R } ;ifdef() ?>
void Lattice::updateAllSamples(){
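The call-site change above follows from the new variadic launch macros in src/cross.h (see below): kernel arguments are now passed as trailing macro arguments instead of one pre-parenthesized tuple. A minimal sketch of the two shapes, using hypothetical KERNEL_RUN_* names rather than the project's macros:

```cpp
#include <cstdio>

// Old shape: the whole argument list travels as a single parenthesized token.
#define KERNEL_RUN_OLD(fn__, args__) fn__ args__
// New shape: a variadic macro forwards the arguments itself.
#define KERNEL_RUN_NEW(fn__, ...)    fn__(__VA_ARGS__)

void kernel(int a, float b) { std::printf("a=%d b=%g\n", a, b); }

int main() {
    KERNEL_RUN_OLD(kernel, (1, 2.0f)); // note the extra parentheses at the call site
    KERNEL_RUN_NEW(kernel, 1, 2.0f);   // plain argument list, as in this PR
}
```

The variadic form also lets the CPU backend swap the macro for an ordinary template function, which a parenthesized-tuple macro cannot forward to.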
1 change: 0 additions & 1 deletion src/LatticeContainer.h.Rt
@@ -148,7 +148,6 @@ class LatticeContainer {
template<class T> CudaGlobalFunction void Kernel();
template < eOperationType I, eCalculateGlobals G, eStage S > class InteriorExecutor;
template < eOperationType I, eCalculateGlobals G, eStage S > class BorderExecutor;
- CudaGlobalFunction void ColorKernel( uchar4 *optr);

<?R
for (q in rows(Quantities)) { ifdef(q$adjoint);
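The declaration removed here appears to have been stale: as the next file shows, the ColorKernel definition takes ( uchar4 *optr, int z ), so the single-argument prototype in the header no longer matched it.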
6 changes: 3 additions & 3 deletions src/LatticeContainer.inc.cpp.Rt
@@ -316,7 +316,7 @@ template <class EX> inline void LatticeContainer::RunBorderT(CudaStream_t stream
int totx = <?%s blx ?>;
blx.x = ceiling_div(totx, thr.y);
blx.y = <?%d thy ?>;
- CudaKernelRunNoWait(Kernel< EX >, blx, thr, (), stream);
+ CudaKernelRunNoWait(Kernel< EX >, blx, thr, stream);
};

/// Run the interior kernel
@@ -336,7 +336,7 @@ template <class EX> inline void LatticeContainer::RunInteriorT(CudaStream_t stre
blx.x = ceiling_div(totx, thr.y);
int toty = nz - <?%d BorderMargin$max[3]-BorderMargin$min[3] ?>;
blx.y = toty;
- CudaKernelRunNoWait(Kernel< EX >, blx, thr, (), stream);
+ CudaKernelRunNoWait(Kernel< EX >, blx, thr, stream);
};

template < eOperationType I, eCalculateGlobals G, eStage S >
@@ -417,7 +417,7 @@ CudaGlobalFunction void ColorKernel( uchar4 *optr, int z )
*/
void LatticeContainer::Color( uchar4 *optr ) {
CudaCopyToConstant("constContainer", constContainer, this, sizeof(LatticeContainer));
- CudaKernelRun( ColorKernel , dim3(floor(nx/X_BLOCK),ny,1), dim3(X_BLOCK) ,(optr, nz/2));
+ CudaKernelRun( ColorKernel , dim3(floor(nx/X_BLOCK),ny,1), dim3(X_BLOCK) , optr, nz/2);
};

// Functions for getting quantities
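A note on the launch forms in this file: with variadic macros the stream argument can no longer trail a parenthesized tuple, so CudaKernelRunNoWait now takes the stream immediately after the launch configuration, and any kernel arguments follow as a (possibly empty) variadic pack, as in CudaKernelRunNoWait(Kernel< EX >, blx, thr, stream).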
3 changes: 2 additions & 1 deletion src/conf.R
@@ -15,8 +15,9 @@ if (!exists("DOUBLE")) DOUBLE=0
if (!exists("SYMALGEBRA")) SYMALGEBRA=FALSE
if (!exists("NEED_OFFSETS")) NEED_OFFSETS=TRUE
if (!exists("X_MOD")) X_MOD=0
+ if (!exists("CPU_LAYOUT")) CPU_LAYOUT=FALSE

- memory_arr_cpu = FALSE
+ memory_arr_cpu = CPU_LAYOUT
memory_arr_mod = X_MOD

# SYMALGEBRA=TRUE
1 change: 1 addition & 0 deletions src/config.R.in
@@ -1 +1,2 @@
X_MOD = @X_MOD@
+ CPU_LAYOUT = @CPU_LAYOUT@
17 changes: 15 additions & 2 deletions src/configure.ac
@@ -155,6 +155,10 @@ AC_ARG_WITH([warp-size],
AS_HELP_STRING([--with-warp-size],
[Set the value used for the warp size]))

+ AC_ARG_ENABLE([cpu-layout],
+ AS_HELP_STRING([--enable-cpu-layout],
+ [Enable cpu-optimised memory layout]))


AC_ARG_ENABLE([paranoid],
AS_HELP_STRING([--enable-paranoid],
@@ -399,7 +403,7 @@ else
if ! test -z "$OPENMP"
then
CPPFLAGS="${CPPFLAGS} $OPENMP"
- LDFLAGS="${CPPFLAGS} $OPENMP"
+ LDFLAGS="${LDFLAGS} $OPENMP"
AC_DEFINE([CROSS_OPENMP], [1], [Using OpenMP])
fi
if test "x${with_x_block}" != "x"
@@ -433,6 +437,14 @@ then
AC_DEFINE([USE_ADDOPP], [1], [Opportunistic warp level operations])
fi

+ if test "x${enable_cpu_layout}" == "xyes"
+ then
+ CPU_LAYOUT="TRUE"
+ else
+ CPU_LAYOUT="FALSE"
+ fi


AC_MSG_CHECKING([MPI include path])
if test -z "${MPI_INCLUDE}"; then
if test -z "${MPI}"; then
@@ -947,7 +959,7 @@ fi
if test "x${enable_paranoid}" == "xyes"
then
CPPFLAGS="${CPPFLAGS} -Wall -Werror -Wno-unknown-warning-option"
- CPPFLAGS="${CPPFLAGS} -Wno-unused-but-set-variable -Wno-unused-variable -Wno-format-overflow -Wno-unused-private-field -Wno-self-assign"
+ CPPFLAGS="${CPPFLAGS} -Wno-unused-but-set-variable -Wno-unused-variable -Wno-format-overflow -Wno-unused-private-field -Wno-self-assign -Wno-unknown-pragmas"
fi

if test "x${enable_profiling}" == "xyes"
@@ -999,6 +1011,7 @@ AC_SUBST(EMBEDED_PYTHON)
AC_SUBST(X_BLOCK)
AC_SUBST(WARPSIZE)
AC_SUBST(X_MOD)
+ AC_SUBST(CPU_LAYOUT)

AC_CONFIG_FILES([CLB/config.mk:src/config.mk.in])
AC_CONFIG_FILES([CLB/config.R_:src/config.R.in])
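Taken together with the conf.R and config.R.in hunks above, the new option forms a single chain: ./configure --enable-cpu-layout sets CPU_LAYOUT=TRUE, AC_SUBST substitutes it into config.R, and conf.R's memory_arr_cpu = CPU_LAYOUT switches the code generator to the CPU-optimised memory layout; without the flag the default stays FALSE, preserving the GPU layout.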
83 changes: 47 additions & 36 deletions src/cross.h
@@ -54,18 +54,18 @@
#endif

#ifndef CROSS_HIP
- #define CudaKernelRun(a__,b__,c__,d__) a__<<<b__,c__>>>d__; HANDLE_ERROR( cudaDeviceSynchronize()); HANDLE_ERROR( cudaGetLastError() )
+ #define CudaKernelRun(a__,b__,c__,...) a__<<<b__,c__>>>(__VA_ARGS__); HANDLE_ERROR( cudaDeviceSynchronize()); HANDLE_ERROR( cudaGetLastError() )
#ifdef CROSS_SYNC
- #define CudaKernelRunNoWait(a__,b__,c__,d__,e__) a__<<<b__,c__>>>d__; HANDLE_ERROR( cudaDeviceSynchronize()); HANDLE_ERROR( cudaGetLastError() );
+ #define CudaKernelRunNoWait(a__,b__,c__,e__,...) a__<<<b__,c__>>>(__VA_ARGS__); HANDLE_ERROR( cudaDeviceSynchronize()); HANDLE_ERROR( cudaGetLastError() );
#else
- #define CudaKernelRunNoWait(a__,b__,c__,d__,e__) a__<<<b__,c__,0,e__>>>d__;
+ #define CudaKernelRunNoWait(a__,b__,c__,e__,...) a__<<<b__,c__,0,e__>>>(__VA_ARGS__);
#endif
#else
- #define CudaKernelRun(a__,b__,c__,d__) a__<<<b__,c__>>>d__; HANDLE_ERROR( hipDeviceSynchronize()); HANDLE_ERROR( hipGetLastError() )
+ #define CudaKernelRun(a__,b__,c__,...) a__<<<b__,c__>>>(__VA_ARGS__); HANDLE_ERROR( hipDeviceSynchronize()); HANDLE_ERROR( hipGetLastError() )
#ifdef CROSS_SYNC
- #define CudaKernelRunNoWait(a__,b__,c__,d__,e__) a__<<<b__,c__>>>d__; HANDLE_ERROR( hipDeviceSynchronize()); HANDLE_ERROR( hipGetLastError() );
+ #define CudaKernelRunNoWait(a__,b__,c__,e__,...) a__<<<b__,c__>>>(__VA_ARGS__); HANDLE_ERROR( hipDeviceSynchronize()); HANDLE_ERROR( hipGetLastError() );
#else
- #define CudaKernelRunNoWait(a__,b__,c__,d__,e__) a__<<<b__,c__,0,e__>>>d__;
+ #define CudaKernelRunNoWait(a__,b__,c__,e__,...) a__<<<b__,c__,0,e__>>>(__VA_ARGS__);
#endif
#endif
#define CudaBlock blockIdx
@@ -239,24 +239,6 @@
#define CudaSyncThreads() //assert(CpuThread.x == 0)
#define CudaSyncThreadsOr(x__) x__
#define CudaSyncWarpOr(x__) x__
- #ifdef CROSS_OPENMP
- #define OMP_PARALLEL_FOR _Pragma("omp parallel for simd")
- #define CudaKernelRun(a__,b__,c__,d__) \
- OMP_PARALLEL_FOR \
- for (int x__ = 0; x__ < b__.x; x__++) { CpuBlock.x = x__; \
- for (CpuBlock.y = 0; CpuBlock.y < b__.y; CpuBlock.y++) \
- for (CpuBlock.z = 0; CpuBlock.z < b__.z; CpuBlock.z++) \
- a__ d__; \
- }
- #else
- #define CudaKernelRun(a__,b__,c__,d__) \
- for (CpuBlock.y = 0; CpuBlock.y < b__.y; CpuBlock.y++) \
- for (CpuBlock.x = 0; CpuBlock.x < b__.x; CpuBlock.x++) \
- for (CpuBlock.z = 0; CpuBlock.z < b__.z; CpuBlock.z++) \
- a__ d__;
- #endif
-
- #define CudaKernelRunNoWait(a__,b__,c__,d__,e__) CudaKernelRun(a__,b__,c__,d__);
#define CudaBlock CpuBlock
#define CudaThread CpuThread
#define CudaNumberOfThreads CpuSize
@@ -275,11 +257,7 @@
#define CudaEventCreate(a__) *a__ = 0
#define CudaEventDestroy(a__)
#define CudaEventRecord(a__,b__) a__ = b__
- #ifdef CROSS_OPENMP
- #define CudaEventSynchronize(a__) a__ = omp_get_wtime()*1000
- #else
- #define CudaEventSynchronize(a__) a__ = 1000*((double) clock())/CLOCKS_PER_SEC
- #endif
+ #define CudaEventSynchronize(a__) a__ = get_walltime()*1000;
#define CudaEventQuery(a__) CudaSuccess
#define CudaEventElapsedTime(a__,b__,c__) *(a__) = (c__ - b__)
#define CudaDeviceSynchronize()
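The event-timing change above is itself a fix: clock() counts processor time summed over all threads, so an OpenMP run would appear to take several times longer than it really does; routing both the OpenMP and plain CPU builds through one wall-clock source, get_walltime() (presumably defined elsewhere in the tree), keeps the reported event times comparable.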
@@ -302,6 +280,32 @@
#endif
extern uint3 CpuThread;
extern uint3 CpuSize;

+ #include <functional>
+
+ template <typename F, typename ...P>
+ inline void CPUKernelRun(F &&func, const dim3& blocks, P &&... args) {
+ #pragma omp parallel for collapse(3) schedule(static)
+ for (unsigned int y = 0; y < blocks.y; y++)
+ for (unsigned int x = 0; x < blocks.x; x++)
+ for (unsigned int z = 0; z < blocks.z; z++) {
+ CpuBlock.x = x;
+ CpuBlock.y = y;
+ CpuBlock.z = z;
+ func(std::forward<P>(args)...);
+ }
+ }
+
+ template <typename F, typename ...P>
+ inline void CudaKernelRun(F &&func, const dim3& blocks, const dim3& threads, P &&... args) {
+ CPUKernelRun(func, blocks, std::forward<P>(args)...);
+ }
+
+ template <typename F, typename ...P>
+ inline void CudaKernelRunNoWait(F &&func, const dim3& blocks, const dim3& threads, CudaStream_t stream, P &&... args) {
+ CPUKernelRun(func, blocks, std::forward<P>(args)...);
+ }
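These templates replace the old macro loop nest: the grid is flattened with collapse(3) and each iteration plays the role of one CUDA block. A self-contained sketch of the same pattern, with hypothetical names; the real code routes the block index through the global CpuBlock, while the sketch uses a thread_local so each OpenMP thread keeps its own copy:

```cpp
#include <cstdio>
#include <utility>

struct Dim3 { unsigned x, y, z; };    // stand-in for CUDA's dim3
thread_local Dim3 cpu_block{0, 0, 0}; // per-thread analogue of CpuBlock

template <typename F, typename... P>
void cpu_kernel_run(F&& func, const Dim3& blocks, P&&... args) {
    // Flatten the 3D grid into one parallel loop nest; each iteration is "one block".
    #pragma omp parallel for collapse(3) schedule(static)
    for (unsigned y = 0; y < blocks.y; y++)
        for (unsigned x = 0; x < blocks.x; x++)
            for (unsigned z = 0; z < blocks.z; z++) {
                cpu_block = {x, y, z};           // blockIdx for this iteration
                func(std::forward<P>(args)...);  // run the "kernel" body
            }
}

int main() {
    cpu_kernel_run([](float scale) {
        std::printf("block (%u,%u,%u) scale=%g\n",
                    cpu_block.x, cpu_block.y, cpu_block.z, scale);
    }, Dim3{2, 2, 1}, 0.5f);
}
```

schedule(static) hands each thread a fixed, contiguous slab of the collapsed iteration space, so the block-to-thread mapping stays deterministic between calls.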

void memcpy2D(void * dst_, int dpitch, void * src_, int spitch, int width, int height);

template <class T, class P> inline T data_cast(const P& x) {
@@ -318,16 +322,23 @@
#define __longlong_as_double(x__) data_cast<double , long long int >(x__)
#define __double_as_longlong(x__) data_cast<long long int , double >(x__)

- template <typename T> inline void CudaAtomicAdd(T * sum, T val) { sum[0] += val; }
- template <typename T> inline void CudaAtomicAddReduce(T * sum, T val) { sum[0] += val; }
- template <typename T> inline void CudaAtomicAddReduceWarp(T * sum, T val) { sum[0] += val; }
- template <typename T> inline void CudaAtomicAddReduceDiff(T * sum, T val, bool yes) { if (yes) sum[0] += val; }
- template <typename T> inline void CudaAtomicMaxReduce(T * sum, T val) { if (val > sum[0]) sum[0] = val; }
- template <typename T> inline void CudaAtomicMaxReduceWarp(T * sum, T val) { if (val > sum[0]) sum[0] = val; }
+ template <typename T> inline void CudaAtomicAdd(T * sum, T val) {
+ #pragma omp atomic
+ sum[0] += val;
+ }
+ template <typename T> inline void CudaAtomicMax(T * sum, T val) {
+ #pragma omp critical
+ { if (val > sum[0]) sum[0] = val; }
+ }
+ template <typename T> inline void CudaAtomicAddReduce(T * sum, T val) { CudaAtomicAdd(sum, val); }
+ template <typename T> inline void CudaAtomicAddReduceWarp(T * sum, T val) { CudaAtomicAdd(sum, val); }
+ template <typename T> inline void CudaAtomicAddReduceDiff(T * sum, T val, bool yes) { if (yes) CudaAtomicAdd(sum, val); }
+ template <typename T> inline void CudaAtomicMaxReduce(T * sum, T val) { CudaAtomicMax(sum, val); }
+ template <typename T> inline void CudaAtomicMaxReduceWarp(T * sum, T val) { CudaAtomicMax(sum, val); }

template <int LEN, typename T>
inline void CudaAtomicAddReduceWarpArr(T * sum, T val[LEN]) {
- for (unsigned char i = 0; i < LEN; i ++) sum[i] += val[i];
+ for (unsigned char i = 0; i < LEN; i ++) CudaAtomicAdd(&sum[i], val[i]);
}
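The CPU atomics now actually synchronize under OpenMP: the += update maps onto #pragma omp atomic, while max falls back to a critical section because omp atomic only covers simple update forms (OpenMP 5.1's atomic compare would cover max, but compiler support for it was patchy). A runnable sketch of both strategies, with hypothetical names:

```cpp
#include <cstdio>

template <typename T> void atomic_add(T* sum, T val) {
    #pragma omp atomic          // hardware atomic read-modify-write
    sum[0] += val;
}

template <typename T> void atomic_max(T* best, T val) {
    #pragma omp critical        // mutual exclusion; no atomic form for max here
    { if (val > best[0]) best[0] = val; }
}

int main() {
    double sum = 0.0, best = 0.0;
    #pragma omp parallel for
    for (int i = 0; i < 1000; i++) {
        atomic_add(&sum, 1.0);
        atomic_max(&best, (double)i);
    }
    std::printf("sum=%g best=%g\n", sum, best); // sum=1000 best=999
}
```

The atomic form is cheap and contention-friendly; the critical section is heavier but is only hit on reduction tails, so the trade-off is reasonable here.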

#define ISFINITE(l__) std::isfinite(l__)