Skip to content

Commit

Permalink
Merge branch 'fmihpc:cudasiator' into cudasiator
Browse files Browse the repository at this point in the history
  • Loading branch information
hokkanen authored Aug 31, 2023
2 parents a1e0ec6 + 18842c9 commit ccff5a2
Show file tree
Hide file tree
Showing 29 changed files with 331 additions and 319 deletions.
91 changes: 91 additions & 0 deletions MAKE/Makefile.NOMAD
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# Build configuration for the NOMAD machine: AMD GPU (gfx90a / MI250x) build
# of Vlasiator using hipcc with the HIP backend.
CMP = hipcc
LNK = hipcc

# Modules loaded one-by-one after a clean shell (no module purging, not on one line)
# module load LUMI/22.08
# module load partition/G
# module load cpeAMD
# module load rocm/5.3.3
# module load Boost/1.79.0-cpeAMD-22.08

#======== Vectorization ==========
#Set vector backend type for vlasov solvers, sets precision and length.
#Options:
# AVX: VEC4D_AGNER, VEC4F_AGNER, VEC8F_AGNER
# AVX512: VEC8D_AGNER, VEC16F_AGNER
# Fallback: VECTORCLASS = VEC_FALLBACK_GENERIC (Defaults to VECL8)
VECTORCLASS = VEC_FALLBACK_GENERIC

#===== Vector Lengths ====
# Default for VEC_FALLBACK_GENERIC is WID=4, VECL=8
WID=8
VECL=64
#WID=4
#VECL=16

# Compile with GPU support (USE_HIP or USE_CUDA)
USE_HIP=1

#======= Compiler and compilation flags =========
# NOTES on compiler flags:
# CXXFLAGS is for compiler flags, they are always used
# MATHFLAGS are for special math etc. flags, these are only applied on solver functions
# LDFLAGS flags for linker
# Important note: Do not edit COMPFLAGS in this file!

CXXFLAGS += -g -O3 -x hip --amdgpu-target=gfx90a:xnack+ -std=c++17 -funroll-loops -fopenmp -fgpu-rdc -I. -Ihip -Iomp -D__HIP_PLATFORM_AMD__ -I${CRAY_MPICH_DIR}/include -W -Wall -Wno-unused-parameter -Wno-unused-result -Wno-unused-function -Wno-unused-variable -Wno-unknown-pragmas -Wno-deprecated-register -Wno-unused-but-set-variable

# testpackage builds use -O2 and enable the GPU sanitizer instead of loop unrolling
testpackage: CXXFLAGS = -g -O2 -x hip --amdgpu-target=gfx90a:xnack+ -std=c++17 -fopenmp -fgpu-rdc -I. -Ihip -Iomp -D__HIP_PLATFORM_AMD__ -I${CRAY_MPICH_DIR}/include -fgpu-sanitize -W -Wall -Wno-unused-parameter -Wno-unused-result -Wno-unused-function -Wno-unused-variable -Wno-unknown-pragmas -Wno-deprecated-register -Wno-unused-but-set-variable

LDFLAGS = -fopenmp --hip-link -lrt -lpthread -fgpu-rdc -D__HIP_PLATFORM_AMD__ -L${CRAY_MPICH_DIR}/lib ${PE_MPICH_GTL_DIR_amd_gfx90a} --amdgpu-target=gfx90a:xnack+
LIB_MPI = -lmpi ${PE_MPICH_GTL_LIBS_amd_gfx90a}

# -fgpu-rdc # relocatable device code, needed for the velocity mesh
# -fgpu-sanitize

#======== PAPI ==========
#Add PAPI_MEM define to use papi to report memory consumption?
CXXFLAGS += -DPAPI_MEM
testpackage: CXXFLAGS += -DPAPI_MEM

#======== Allocator =========
#Use jemalloc instead of system malloc to reduce memory fragmentation? https://github.com/jemalloc/jemalloc
#Configure jemalloc with --with-jemalloc-prefix=je_ when installing it
#Note: jemalloc not supported with GPUs
#CXXFLAGS += -DUSE_JEMALLOC -DJEMALLOC_NO_DEMANGLE
#testpackage: CXXFLAGS += -DUSE_JEMALLOC -DJEMALLOC_NO_DEMANGLE

#-DNO_WRITE_AT_ALL: Define to disable all file writes in order to
# avoid a memory leak (results in much slower IO)

#======== Libraries ===========

LIBRARY_PREFIX = /scratch/project_465000538/vlasiator/libraries

# Compiled libraries
#INC_BOOST = -isystem $(LIBRARY_PREFIX)/boost/include
#LIB_BOOST = -L$(LIBRARY_PREFIX)/boost/lib -lboost_program_options -Wl,-rpath=$(LIBRARY_PREFIX)/boost/lib
LIB_BOOST = -lboost_program_options

INC_ZOLTAN = -isystem $(LIBRARY_PREFIX)/zoltan/include
LIB_ZOLTAN = -L$(LIBRARY_PREFIX)/zoltan/lib -lzoltan -Wl,-rpath=$(LIBRARY_PREFIX)/zoltan/lib

#INC_JEMALLOC = -I$(LIBRARY_PREFIX)/jemalloc/include
#LIB_JEMALLOC = -L$(LIBRARY_PREFIX)/jemalloc/lib -ljemalloc -Wl,-rpath=$(LIBRARY_PREFIX)/jemalloc/lib

INC_PAPI = -isystem $(LIBRARY_PREFIX)/papi/include
LIB_PAPI = -lpapi -L$(LIBRARY_PREFIX)/papi/lib -Wl,-rpath=$(LIBRARY_PREFIX)/papi/lib

INC_VLSV = -isystem $(LIBRARY_PREFIX)/vlsv
LIB_VLSV = -L$(LIBRARY_PREFIX)/vlsv -lvlsv -Wl,-rpath=$(LIBRARY_PREFIX)/vlsv

# phiprof profiler with ROCm roctx/roctracer markers enabled (-D_ROCTX)
INC_PROFILE = -isystem $(LIBRARY_PREFIX)/phiprof/include -D_ROCTX -I${ROCM_PATH}/include
LIB_PROFILE = -L$(LIBRARY_PREFIX)/phiprof/lib -lphiprof -lgfortran -Wl,-rpath=$(LIBRARY_PREFIX)/phiprof/lib -Wl,-rpath=${ROCM_PATH}/lib -lroctx64 -lroctracer64

# Header libraries

INC_FSGRID = -I$(LIBRARY_PREFIX)/fsgrid/
INC_EIGEN = -isystem $(LIBRARY_PREFIX)/eigen/
INC_DCCRG = -I$(LIBRARY_PREFIX)/dccrg/
# INC_VECTORCLASS = -I$(LIBRARY_PREFIX_HEADERS)/vectorclass/ # not used with GPU
# NOTE(review): LIBRARY_PREFIX_HEADERS is not defined in this file — confirm before enabling the line above
INC_HASHINATOR = -isystem $(LIBRARY_PREFIX)/hashinator/
4 changes: 2 additions & 2 deletions MAKE/Makefile.lumi_hipcc
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@ USE_HIP=1

CXXFLAGS += -g -O3 -x hip --amdgpu-target=gfx90a:xnack+ -std=c++17 -funroll-loops -fopenmp -fgpu-rdc -I. -Ihip -Iomp -D__HIP_PLATFORM_AMD__ -I${CRAY_MPICH_DIR}/include -W -Wall -Wno-unused-parameter -Wno-unused-result -Wno-unused-function -Wno-unused-variable -Wno-unknown-pragmas -Wno-deprecated-register -Wno-unused-but-set-variable

testpackage: CXXFLAGS = -g -O2 -fopenmp -x hip --amdgpu-target=gfx90a:xnack+ -I. -Ihip -Iomp -D__HIP_PLATFORM_AMD__ -I${CRAY_MPICH_DIR}/include -funroll-loops -std=c++17 -fgpu-sanitize
testpackage: CXXFLAGS = -g -O2 -x hip --amdgpu-target=gfx90a:xnack+ -std=c++17 -fopenmp -fgpu-rdc -I. -Ihip -Iomp -D__HIP_PLATFORM_AMD__ -I${CRAY_MPICH_DIR}/include -fgpu-sanitize -W -Wall -Wno-unused-parameter -Wno-unused-result -Wno-unused-function -Wno-unused-variable -Wno-unknown-pragmas -Wno-deprecated-register -Wno-unused-but-set-variable

LDFLAGS = -fopenmp --hip-link -lrt -lgomp -lpthread -fgpu-rdc -D__HIP_PLATFORM_AMD__ -L${CRAY_MPICH_DIR}/lib ${PE_MPICH_GTL_DIR_amd_gfx90a} --amdgpu-target=gfx90a:xnack+
LDFLAGS = -fopenmp --hip-link -lrt -lpthread -fgpu-rdc -D__HIP_PLATFORM_AMD__ -L${CRAY_MPICH_DIR}/lib ${PE_MPICH_GTL_DIR_amd_gfx90a} --amdgpu-target=gfx90a:xnack+
LIB_MPI = -lmpi ${PE_MPICH_GTL_LIBS_amd_gfx90a}

# -fgpu-rdc # relocatable device code, needed for the velocity mesh
Expand Down
8 changes: 5 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,7 @@ DEPS_PARTICLES = particles/particles.h particles/particles.cpp particles/field.h
readparameters.h version.h particles/scenario.h particles/histogram.h
OBJS_PARTICLES = particles/physconst.o particles/particles.o particles/readfields.o particles/particleparameters.o particles/distribution.o readparameters.o version.o particles/scenario.o particles/histogram.o

# TODO: verify that tools other than vlsvdiff still compile and work
vlsvextract: ${DEPS_VLSVREADER} ${DEPS_VLSVREADERINTERFACE} tools/vlsvextract.h tools/vlsvextract.cpp ${OBJS_VLSVREADER} ${OBJS_VLSVREADERINTERFACE}
${CMP} ${CXXFLAGS} ${FLAGS} -c tools/vlsvextract.cpp ${INC_BOOST} ${INC_DCCRG} ${INC_EIGEN} ${INC_VLSV} -I$(CURDIR)
${LNK} -o vlsvextract_${FP_PRECISION} vlsvextract.o ${OBJS_VLSVREADERINTERFACE} ${LIB_BOOST} ${LIB_DCCRG} ${LIB_VLSV} ${LDFLAGS}
Expand All @@ -343,9 +344,10 @@ vlsv2silo: ${DEPS_VLSVREADERINTERFACE} tools/vlsv2silo.cpp ${OBJS_VLSVREADERIN
${CMP} ${CXXFLAGS} ${FLAGS} -c tools/vlsv2silo.cpp ${INC_SILO} ${INC_VLSV} -I$(CURDIR)
${LNK} -o vlsv2silo_${FP_PRECISION} vlsv2silo.o ${OBJS_VLSVREADERINTERFACE} ${LIB_SILO} ${LIB_VLSV} ${LDFLAGS}

vlsvdiff: ${DEPS_VLSVREADERINTERFACE} tools/vlsvdiff.cpp ${OBJS_VLSVREADEREXTRA} ${OBJS_VLSVREADERINTERFACE}
${CMP} ${CXXEXTRAFLAGS} ${FLAGS} -c tools/vlsvdiff.cpp ${INC_VLSV} -I$(CURDIR)
${LNK} -o vlsvdiff_${FP_PRECISION} vlsvdiff.o ${OBJS_VLSVREADERINTERFACE} ${LIB_VLSV} ${LDFLAGS}
vlsvdiff: tools/vlsvdiff.cpp
@echo [CC] $<
$(SILENT)$(CMP) $(CXXEXTRAFLAGS) ${MATHFLAGS} ${FLAGS} -c tools/vlsvdiff.cpp ${INC_DCCRG} ${INC_VLSV} ${INC_FSGRID}
$(SILENT)${LNK} ${LDFLAGS} -o vlsvdiff_${FP_PRECISION} vlsvdiff.o ${OBJS_VLSVREADERINTERFACE} ${LIB_VLSV} ${LIBS}

vlsvreaderinterface.o: tools/vlsvreaderinterface.h tools/vlsvreaderinterface.cpp
${CMP} ${CXXFLAGS} ${FLAGS} -c tools/vlsvreaderinterface.cpp ${INC_VLSV} -I$(CURDIR)
Expand Down
56 changes: 26 additions & 30 deletions arch/arch_device_hip.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,11 +102,7 @@
#endif
#define FULL_MASK 0xffffffffffffffff

#ifdef ARCH_MAIN
hipStream_t stream[64];
#else
extern hipStream_t stream[];
#endif
extern hipStream_t gpuStreamList[];

/* Define the HIP error checking macro */
#define CHK_ERR(err) (hip_error(err, __FILE__, __LINE__))
Expand Down Expand Up @@ -153,16 +149,16 @@ class buf {
public:

void syncDeviceData(void){
CHK_ERR(hipMemcpyAsync(d_ptr, ptr, bytes, hipMemcpyHostToDevice, stream[thread_id]));
CHK_ERR(hipMemcpyAsync(d_ptr, ptr, bytes, hipMemcpyHostToDevice, gpuStreamList[thread_id]));
}

void syncHostData(void){
CHK_ERR(hipMemcpyAsync(ptr, d_ptr, bytes, hipMemcpyDeviceToHost, stream[thread_id]));
CHK_ERR(hipMemcpyAsync(ptr, d_ptr, bytes, hipMemcpyDeviceToHost, gpuStreamList[thread_id]));
}

buf(T * const _ptr, uint _bytes) : ptr(_ptr), bytes(_bytes) {
thread_id = omp_get_thread_num();
CHK_ERR(hipMallocAsync(&d_ptr, bytes, stream[thread_id]));
CHK_ERR(hipMallocAsync(&d_ptr, bytes, gpuStreamList[thread_id]));
syncDeviceData();
}

Expand All @@ -173,7 +169,7 @@ class buf {
if(!is_copy){
// syncHostData();
#ifdef __HIP_DEVICE_COMPILE__
hipFreeAsync(d_ptr, stream[thread_id]);
hipFreeAsync(d_ptr, gpuStreamList[thread_id]);
#endif
}
}
Expand Down Expand Up @@ -204,7 +200,7 @@ __host__ __forceinline__ static void* allocate(size_t bytes) {
void* ptr;
const uint thread_id = omp_get_thread_num();
device_mempool_check(UINT64_MAX);
CHK_ERR(hipMallocAsync(&ptr, bytes, stream[thread_id]));
CHK_ERR(hipMallocAsync(&ptr, bytes, gpuStreamList[thread_id]));
return ptr;
}

Expand All @@ -219,7 +215,7 @@ __host__ __forceinline__ static void* allocate(size_t bytes, hipStream_t stream)
template <typename T>
__host__ __forceinline__ static void free(T* ptr) {
const uint thread_id = omp_get_thread_num();
CHK_ERR(hipFreeAsync(ptr, stream[thread_id]));
CHK_ERR(hipFreeAsync(ptr, gpuStreamList[thread_id]));
}

template <typename T>
Expand All @@ -230,7 +226,7 @@ __host__ __forceinline__ static void free(T* ptr, hipStream_t stream) {
template <typename T>
__forceinline__ static void memcpy_h2d(T* dst, T* src, size_t bytes){
const uint thread_id = omp_get_thread_num();
CHK_ERR(hipMemcpyAsync(dst, src, bytes, hipMemcpyHostToDevice, stream[thread_id]));
CHK_ERR(hipMemcpyAsync(dst, src, bytes, hipMemcpyHostToDevice, gpuStreamList[thread_id]));
}

template <typename T>
Expand All @@ -242,7 +238,7 @@ __forceinline__ static void memcpy_h2d(T* dst, T* src, size_t bytes, hipStream_t
template <typename T>
__forceinline__ static void memcpy_d2h(T* dst, T* src, size_t bytes){
const uint thread_id = omp_get_thread_num();
CHK_ERR(hipMemcpyAsync(dst, src, bytes, hipMemcpyDeviceToHost, stream[thread_id]));
CHK_ERR(hipMemcpyAsync(dst, src, bytes, hipMemcpyDeviceToHost, gpuStreamList[thread_id]));
}

template <typename T>
Expand Down Expand Up @@ -387,45 +383,45 @@ __forceinline__ static void parallel_reduce_driver(const uint (&limits)[NDim], L

/* Create a device buffer for the reduction results */
T* d_buf;
CHK_ERR(hipMallocAsync(&d_buf, n_reductions*sizeof(T), stream[thread_id]));
CHK_ERR(hipMemcpyAsync(d_buf, sum, n_reductions*sizeof(T), hipMemcpyHostToDevice, stream[thread_id]));
CHK_ERR(hipMallocAsync(&d_buf, n_reductions*sizeof(T), gpuStreamList[thread_id]));
CHK_ERR(hipMemcpyAsync(d_buf, sum, n_reductions*sizeof(T), hipMemcpyHostToDevice, gpuStreamList[thread_id]));

/* Create a device buffer to transfer the initial values to device */
T* d_const_buf;
CHK_ERR(hipMallocAsync(&d_const_buf, n_reductions*sizeof(T), stream[thread_id]));
CHK_ERR(hipMemcpyAsync(d_const_buf, d_buf, n_reductions*sizeof(T), hipMemcpyDeviceToDevice, stream[thread_id]));
CHK_ERR(hipMallocAsync(&d_const_buf, n_reductions*sizeof(T), gpuStreamList[thread_id]));
CHK_ERR(hipMemcpyAsync(d_const_buf, d_buf, n_reductions*sizeof(T), hipMemcpyDeviceToDevice, gpuStreamList[thread_id]));

/* Create a device buffer to transfer the loop limits of each dimension to device */
uint* d_limits;
CHK_ERR(hipMallocAsync(&d_limits, NDim*sizeof(uint), stream[thread_id]));
CHK_ERR(hipMemcpyAsync(d_limits, limits, NDim*sizeof(uint), hipMemcpyHostToDevice,stream[thread_id]));
CHK_ERR(hipMallocAsync(&d_limits, NDim*sizeof(uint), gpuStreamList[thread_id]));
CHK_ERR(hipMemcpyAsync(d_limits, limits, NDim*sizeof(uint), hipMemcpyHostToDevice,gpuStreamList[thread_id]));

/* Call the reduction kernel with different arguments depending
* on if the number of reductions is known at the compile time
*/
T* d_thread_data_dynamic;
T* d_thread_data_dynamic=0; // declared zero to suppress uninitialized use warning
if(NReduStatic == 0) {
/* Get the cub temp storage size for the dynamic shared memory kernel argument */
constexpr auto cub_temp_storage_type_size = sizeof(typename hipcub::BlockReduce<T, ARCH_BLOCKSIZE_R, hipcub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY, 1, 1>::TempStorage);
/* Allocate memory for the thread data values */
CHK_ERR(hipMallocAsync(&d_thread_data_dynamic, n_reductions * blocksize * gridsize * sizeof(T), stream[thread_id]));
CHK_ERR(hipMallocAsync(&d_thread_data_dynamic, n_reductions * blocksize * gridsize * sizeof(T), gpuStreamList[thread_id]));
/* Call the kernel (the number of reductions not known at compile time) */
reduction_kernel<Op, NDim, 0><<<gridsize, blocksize, n_reductions * cub_temp_storage_type_size, stream[thread_id]>>>(loop_body, d_const_buf, d_buf, d_limits, n_total, n_reductions, d_thread_data_dynamic);
reduction_kernel<Op, NDim, 0><<<gridsize, blocksize, n_reductions * cub_temp_storage_type_size, gpuStreamList[thread_id]>>>(loop_body, d_const_buf, d_buf, d_limits, n_total, n_reductions, d_thread_data_dynamic);
/* Synchronize and free the thread data allocation */
CHK_ERR(hipStreamSynchronize(stream[thread_id]));
CHK_ERR(hipFreeAsync(d_thread_data_dynamic, stream[thread_id]));
CHK_ERR(hipStreamSynchronize(gpuStreamList[thread_id]));
CHK_ERR(hipFreeAsync(d_thread_data_dynamic, gpuStreamList[thread_id]));
}
else{
/* Call the kernel (the number of reductions known at compile time) */
reduction_kernel<Op, NDim, NReduStatic><<<gridsize, blocksize, 0, stream[thread_id]>>>(loop_body, d_const_buf, d_buf, d_limits, n_total, n_reductions, d_thread_data_dynamic);
reduction_kernel<Op, NDim, NReduStatic><<<gridsize, blocksize, 0, gpuStreamList[thread_id]>>>(loop_body, d_const_buf, d_buf, d_limits, n_total, n_reductions, d_thread_data_dynamic);
/* Synchronize after kernel call */
CHK_ERR(hipStreamSynchronize(stream[thread_id]));
CHK_ERR(hipStreamSynchronize(gpuStreamList[thread_id]));
}
/* Copy the results back to host and free the allocated memory back to pool*/
CHK_ERR(hipMemcpyAsync(sum, d_buf, n_reductions*sizeof(T), hipMemcpyDeviceToHost, stream[thread_id]));
CHK_ERR(hipFreeAsync(d_buf, stream[thread_id]));
CHK_ERR(hipFreeAsync(d_const_buf, stream[thread_id]));
CHK_ERR(hipFreeAsync(d_limits, stream[thread_id]));
CHK_ERR(hipMemcpyAsync(sum, d_buf, n_reductions*sizeof(T), hipMemcpyDeviceToHost, gpuStreamList[thread_id]));
CHK_ERR(hipFreeAsync(d_buf, gpuStreamList[thread_id]));
CHK_ERR(hipFreeAsync(d_const_buf, gpuStreamList[thread_id]));
CHK_ERR(hipFreeAsync(d_limits, gpuStreamList[thread_id]));
}
}

Expand Down
Loading

0 comments on commit ccff5a2

Please sign in to comment.