Skip to content

Commit

Permalink
Merge branch 'fmihpc:cudasiator' into cudasiator
Browse files Browse the repository at this point in the history
  • Loading branch information
hokkanen authored Aug 31, 2023
2 parents a1e0ec6 + 18842c9 commit ccff5a2
Show file tree
Hide file tree
Showing 29 changed files with 331 additions and 319 deletions.
91 changes: 91 additions & 0 deletions MAKE/Makefile.NOMAD
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# Build configuration for the NOMAD machine: AMD GPU (gfx90a / MI250x) build
# of Vlasiator using hipcc with the HIP backend.
CMP = hipcc
LNK = hipcc

# Modules loaded one-by-one after a clean shell (no module purging, not on one line)
# module load LUMI/22.08
# module load partition/G
# module load cpeAMD
# module load rocm/5.3.3
# module load Boost/1.79.0-cpeAMD-22.08

#======== Vectorization ==========
#Set vector backend type for vlasov solvers, sets precision and length.
#Options:
# AVX: VEC4D_AGNER, VEC4F_AGNER, VEC8F_AGNER
# AVX512: VEC8D_AGNER, VEC16F_AGNER
# Fallback: VECTORCLASS = VEC_FALLBACK_GENERIC (Defaults to VECL8)
VECTORCLASS = VEC_FALLBACK_GENERIC

#===== Vector Lengths ====
# Default for VEC_FALLBACK_GENERIC is WID=4, VECL=8
WID=8
VECL=64
#WID=4
#VECL=16

# Compile with GPU support (USE_HIP or USE_CUDA)
USE_HIP=1

#======= Compiler and compilation flags =========
# NOTES on compiler flags:
# CXXFLAGS is for compiler flags, they are always used
# MATHFLAGS are for special math etc. flags, these are only applied on solver functions
# LDFLAGS flags for linker
# Important note: Do not edit COMPFLAGS in this file!

CXXFLAGS += -g -O3 -x hip --amdgpu-target=gfx90a:xnack+ -std=c++17 -funroll-loops -fopenmp -fgpu-rdc -I. -Ihip -Iomp -D__HIP_PLATFORM_AMD__ -I${CRAY_MPICH_DIR}/include -W -Wall -Wno-unused-parameter -Wno-unused-result -Wno-unused-function -Wno-unused-variable -Wno-unknown-pragmas -Wno-deprecated-register -Wno-unused-but-set-variable

# testpackage builds use -O2 and enable the GPU sanitizer instead of loop unrolling
testpackage: CXXFLAGS = -g -O2 -x hip --amdgpu-target=gfx90a:xnack+ -std=c++17 -fopenmp -fgpu-rdc -I. -Ihip -Iomp -D__HIP_PLATFORM_AMD__ -I${CRAY_MPICH_DIR}/include -fgpu-sanitize -W -Wall -Wno-unused-parameter -Wno-unused-result -Wno-unused-function -Wno-unused-variable -Wno-unknown-pragmas -Wno-deprecated-register -Wno-unused-but-set-variable

LDFLAGS = -fopenmp --hip-link -lrt -lpthread -fgpu-rdc -D__HIP_PLATFORM_AMD__ -L${CRAY_MPICH_DIR}/lib ${PE_MPICH_GTL_DIR_amd_gfx90a} --amdgpu-target=gfx90a:xnack+
LIB_MPI = -lmpi ${PE_MPICH_GTL_LIBS_amd_gfx90a}

# -fgpu-rdc # relocatable device code, needed for the velocity mesh
# -fgpu-sanitize

#======== PAPI ==========
#Add PAPI_MEM define to use papi to report memory consumption?
CXXFLAGS += -DPAPI_MEM
testpackage: CXXFLAGS += -DPAPI_MEM

#======== Allocator =========
#Use jemalloc instead of system malloc to reduce memory fragmentation? https://github.com/jemalloc/jemalloc
#Configure jemalloc with --with-jemalloc-prefix=je_ when installing it
#Note: jemalloc not supported with GPUs
#CXXFLAGS += -DUSE_JEMALLOC -DJEMALLOC_NO_DEMANGLE
#testpackage: CXXFLAGS += -DUSE_JEMALLOC -DJEMALLOC_NO_DEMANGLE

#-DNO_WRITE_AT_ALL: Define to disable all file writes in order to
# avoid a memory leak (results in much slower IO)

#======== Libraries ===========

LIBRARY_PREFIX = /scratch/project_465000538/vlasiator/libraries

# Compiled libraries
#INC_BOOST = -isystem $(LIBRARY_PREFIX)/boost/include
#LIB_BOOST = -L$(LIBRARY_PREFIX)/boost/lib -lboost_program_options -Wl,-rpath=$(LIBRARY_PREFIX)/boost/lib
LIB_BOOST = -lboost_program_options

INC_ZOLTAN = -isystem $(LIBRARY_PREFIX)/zoltan/include
LIB_ZOLTAN = -L$(LIBRARY_PREFIX)/zoltan/lib -lzoltan -Wl,-rpath=$(LIBRARY_PREFIX)/zoltan/lib

#INC_JEMALLOC = -I$(LIBRARY_PREFIX)/jemalloc/include
#LIB_JEMALLOC = -L$(LIBRARY_PREFIX)/jemalloc/lib -ljemalloc -Wl,-rpath=$(LIBRARY_PREFIX)/jemalloc/lib

INC_PAPI = -isystem $(LIBRARY_PREFIX)/papi/include
LIB_PAPI = -lpapi -L$(LIBRARY_PREFIX)/papi/lib -Wl,-rpath=$(LIBRARY_PREFIX)/papi/lib

INC_VLSV = -isystem $(LIBRARY_PREFIX)/vlsv
LIB_VLSV = -L$(LIBRARY_PREFIX)/vlsv -lvlsv -Wl,-rpath=$(LIBRARY_PREFIX)/vlsv

# phiprof profiler with ROCm roctx/roctracer markers enabled (-D_ROCTX)
INC_PROFILE = -isystem $(LIBRARY_PREFIX)/phiprof/include -D_ROCTX -I${ROCM_PATH}/include
LIB_PROFILE = -L$(LIBRARY_PREFIX)/phiprof/lib -lphiprof -lgfortran -Wl,-rpath=$(LIBRARY_PREFIX)/phiprof/lib -Wl,-rpath=${ROCM_PATH}/lib -lroctx64 -lroctracer64

# Header libraries

INC_FSGRID = -I$(LIBRARY_PREFIX)/fsgrid/
INC_EIGEN = -isystem $(LIBRARY_PREFIX)/eigen/
INC_DCCRG = -I$(LIBRARY_PREFIX)/dccrg/
# INC_VECTORCLASS = -I$(LIBRARY_PREFIX_HEADERS)/vectorclass/ # not used with GPU
# NOTE(review): LIBRARY_PREFIX_HEADERS is not defined in this file — confirm before enabling the line above
INC_HASHINATOR = -isystem $(LIBRARY_PREFIX)/hashinator/
4 changes: 2 additions & 2 deletions MAKE/Makefile.lumi_hipcc
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@ USE_HIP=1

CXXFLAGS += -g -O3 -x hip --amdgpu-target=gfx90a:xnack+ -std=c++17 -funroll-loops -fopenmp -fgpu-rdc -I. -Ihip -Iomp -D__HIP_PLATFORM_AMD__ -I${CRAY_MPICH_DIR}/include -W -Wall -Wno-unused-parameter -Wno-unused-result -Wno-unused-function -Wno-unused-variable -Wno-unknown-pragmas -Wno-deprecated-register -Wno-unused-but-set-variable

testpackage: CXXFLAGS = -g -O2 -fopenmp -x hip --amdgpu-target=gfx90a:xnack+ -I. -Ihip -Iomp -D__HIP_PLATFORM_AMD__ -I${CRAY_MPICH_DIR}/include -funroll-loops -std=c++17 -fgpu-sanitize
testpackage: CXXFLAGS = -g -O2 -x hip --amdgpu-target=gfx90a:xnack+ -std=c++17 -fopenmp -fgpu-rdc -I. -Ihip -Iomp -D__HIP_PLATFORM_AMD__ -I${CRAY_MPICH_DIR}/include -fgpu-sanitize -W -Wall -Wno-unused-parameter -Wno-unused-result -Wno-unused-function -Wno-unused-variable -Wno-unknown-pragmas -Wno-deprecated-register -Wno-unused-but-set-variable

LDFLAGS = -fopenmp --hip-link -lrt -lgomp -lpthread -fgpu-rdc -D__HIP_PLATFORM_AMD__ -L${CRAY_MPICH_DIR}/lib ${PE_MPICH_GTL_DIR_amd_gfx90a} --amdgpu-target=gfx90a:xnack+
LDFLAGS = -fopenmp --hip-link -lrt -lpthread -fgpu-rdc -D__HIP_PLATFORM_AMD__ -L${CRAY_MPICH_DIR}/lib ${PE_MPICH_GTL_DIR_amd_gfx90a} --amdgpu-target=gfx90a:xnack+
LIB_MPI = -lmpi ${PE_MPICH_GTL_LIBS_amd_gfx90a}

# -fgpu-rdc # relocatable device code, needed for the velocity mesh
Expand Down
8 changes: 5 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,7 @@ DEPS_PARTICLES = particles/particles.h particles/particles.cpp particles/field.h
readparameters.h version.h particles/scenario.h particles/histogram.h
OBJS_PARTICLES = particles/physconst.o particles/particles.o particles/readfields.o particles/particleparameters.o particles/distribution.o readparameters.o version.o particles/scenario.o particles/histogram.o

# TODO: verify that tools other than vlsvdiff still compile and work
vlsvextract: ${DEPS_VLSVREADER} ${DEPS_VLSVREADERINTERFACE} tools/vlsvextract.h tools/vlsvextract.cpp ${OBJS_VLSVREADER} ${OBJS_VLSVREADERINTERFACE}
${CMP} ${CXXFLAGS} ${FLAGS} -c tools/vlsvextract.cpp ${INC_BOOST} ${INC_DCCRG} ${INC_EIGEN} ${INC_VLSV} -I$(CURDIR)
${LNK} -o vlsvextract_${FP_PRECISION} vlsvextract.o ${OBJS_VLSVREADERINTERFACE} ${LIB_BOOST} ${LIB_DCCRG} ${LIB_VLSV} ${LDFLAGS}
Expand All @@ -343,9 +344,10 @@ vlsv2silo: ${DEPS_VLSVREADERINTERFACE} tools/vlsv2silo.cpp ${OBJS_VLSVREADERIN
${CMP} ${CXXFLAGS} ${FLAGS} -c tools/vlsv2silo.cpp ${INC_SILO} ${INC_VLSV} -I$(CURDIR)
${LNK} -o vlsv2silo_${FP_PRECISION} vlsv2silo.o ${OBJS_VLSVREADERINTERFACE} ${LIB_SILO} ${LIB_VLSV} ${LDFLAGS}

vlsvdiff: ${DEPS_VLSVREADERINTERFACE} tools/vlsvdiff.cpp ${OBJS_VLSVREADEREXTRA} ${OBJS_VLSVREADERINTERFACE}
${CMP} ${CXXEXTRAFLAGS} ${FLAGS} -c tools/vlsvdiff.cpp ${INC_VLSV} -I$(CURDIR)
${LNK} -o vlsvdiff_${FP_PRECISION} vlsvdiff.o ${OBJS_VLSVREADERINTERFACE} ${LIB_VLSV} ${LDFLAGS}
vlsvdiff: tools/vlsvdiff.cpp
@echo [CC] $<
$(SILENT)$(CMP) $(CXXEXTRAFLAGS) ${MATHFLAGS} ${FLAGS} -c tools/vlsvdiff.cpp ${INC_DCCRG} ${INC_VLSV} ${INC_FSGRID}
$(SILENT)${LNK} ${LDFLAGS} -o vlsvdiff_${FP_PRECISION} vlsvdiff.o ${OBJS_VLSVREADERINTERFACE} ${LIB_VLSV} ${LIBS}

vlsvreaderinterface.o: tools/vlsvreaderinterface.h tools/vlsvreaderinterface.cpp
${CMP} ${CXXFLAGS} ${FLAGS} -c tools/vlsvreaderinterface.cpp ${INC_VLSV} -I$(CURDIR)
Expand Down
56 changes: 26 additions & 30 deletions arch/arch_device_hip.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,11 +102,7 @@
#endif
#define FULL_MASK 0xffffffffffffffff

#ifdef ARCH_MAIN
hipStream_t stream[64];
#else
extern hipStream_t stream[];
#endif
extern hipStream_t gpuStreamList[];

/* Define the HIP error checking macro */
#define CHK_ERR(err) (hip_error(err, __FILE__, __LINE__))
Expand Down Expand Up @@ -153,16 +149,16 @@ class buf {
public:

void syncDeviceData(void){
CHK_ERR(hipMemcpyAsync(d_ptr, ptr, bytes, hipMemcpyHostToDevice, stream[thread_id]));
CHK_ERR(hipMemcpyAsync(d_ptr, ptr, bytes, hipMemcpyHostToDevice, gpuStreamList[thread_id]));
}

void syncHostData(void){
CHK_ERR(hipMemcpyAsync(ptr, d_ptr, bytes, hipMemcpyDeviceToHost, stream[thread_id]));
CHK_ERR(hipMemcpyAsync(ptr, d_ptr, bytes, hipMemcpyDeviceToHost, gpuStreamList[thread_id]));
}

buf(T * const _ptr, uint _bytes) : ptr(_ptr), bytes(_bytes) {
thread_id = omp_get_thread_num();
CHK_ERR(hipMallocAsync(&d_ptr, bytes, stream[thread_id]));
CHK_ERR(hipMallocAsync(&d_ptr, bytes, gpuStreamList[thread_id]));
syncDeviceData();
}

Expand All @@ -173,7 +169,7 @@ class buf {
if(!is_copy){
// syncHostData();
#ifdef __HIP_DEVICE_COMPILE__
hipFreeAsync(d_ptr, stream[thread_id]);
hipFreeAsync(d_ptr, gpuStreamList[thread_id]);
#endif
}
}
Expand Down Expand Up @@ -204,7 +200,7 @@ __host__ __forceinline__ static void* allocate(size_t bytes) {
void* ptr;
const uint thread_id = omp_get_thread_num();
device_mempool_check(UINT64_MAX);
CHK_ERR(hipMallocAsync(&ptr, bytes, stream[thread_id]));
CHK_ERR(hipMallocAsync(&ptr, bytes, gpuStreamList[thread_id]));
return ptr;
}

Expand All @@ -219,7 +215,7 @@ __host__ __forceinline__ static void* allocate(size_t bytes, hipStream_t stream)
template <typename T>
__host__ __forceinline__ static void free(T* ptr) {
const uint thread_id = omp_get_thread_num();
CHK_ERR(hipFreeAsync(ptr, stream[thread_id]));
CHK_ERR(hipFreeAsync(ptr, gpuStreamList[thread_id]));
}

template <typename T>
Expand All @@ -230,7 +226,7 @@ __host__ __forceinline__ static void free(T* ptr, hipStream_t stream) {
template <typename T>
__forceinline__ static void memcpy_h2d(T* dst, T* src, size_t bytes){
const uint thread_id = omp_get_thread_num();
CHK_ERR(hipMemcpyAsync(dst, src, bytes, hipMemcpyHostToDevice, stream[thread_id]));
CHK_ERR(hipMemcpyAsync(dst, src, bytes, hipMemcpyHostToDevice, gpuStreamList[thread_id]));
}

template <typename T>
Expand All @@ -242,7 +238,7 @@ __forceinline__ static void memcpy_h2d(T* dst, T* src, size_t bytes, hipStream_t
template <typename T>
__forceinline__ static void memcpy_d2h(T* dst, T* src, size_t bytes){
const uint thread_id = omp_get_thread_num();
CHK_ERR(hipMemcpyAsync(dst, src, bytes, hipMemcpyDeviceToHost, stream[thread_id]));
CHK_ERR(hipMemcpyAsync(dst, src, bytes, hipMemcpyDeviceToHost, gpuStreamList[thread_id]));
}

template <typename T>
Expand Down Expand Up @@ -387,45 +383,45 @@ __forceinline__ static void parallel_reduce_driver(const uint (&limits)[NDim], L

/* Create a device buffer for the reduction results */
T* d_buf;
CHK_ERR(hipMallocAsync(&d_buf, n_reductions*sizeof(T), stream[thread_id]));
CHK_ERR(hipMemcpyAsync(d_buf, sum, n_reductions*sizeof(T), hipMemcpyHostToDevice, stream[thread_id]));
CHK_ERR(hipMallocAsync(&d_buf, n_reductions*sizeof(T), gpuStreamList[thread_id]));
CHK_ERR(hipMemcpyAsync(d_buf, sum, n_reductions*sizeof(T), hipMemcpyHostToDevice, gpuStreamList[thread_id]));

/* Create a device buffer to transfer the initial values to device */
T* d_const_buf;
CHK_ERR(hipMallocAsync(&d_const_buf, n_reductions*sizeof(T), stream[thread_id]));
CHK_ERR(hipMemcpyAsync(d_const_buf, d_buf, n_reductions*sizeof(T), hipMemcpyDeviceToDevice, stream[thread_id]));
CHK_ERR(hipMallocAsync(&d_const_buf, n_reductions*sizeof(T), gpuStreamList[thread_id]));
CHK_ERR(hipMemcpyAsync(d_const_buf, d_buf, n_reductions*sizeof(T), hipMemcpyDeviceToDevice, gpuStreamList[thread_id]));

/* Create a device buffer to transfer the loop limits of each dimension to device */
uint* d_limits;
CHK_ERR(hipMallocAsync(&d_limits, NDim*sizeof(uint), stream[thread_id]));
CHK_ERR(hipMemcpyAsync(d_limits, limits, NDim*sizeof(uint), hipMemcpyHostToDevice,stream[thread_id]));
CHK_ERR(hipMallocAsync(&d_limits, NDim*sizeof(uint), gpuStreamList[thread_id]));
CHK_ERR(hipMemcpyAsync(d_limits, limits, NDim*sizeof(uint), hipMemcpyHostToDevice,gpuStreamList[thread_id]));

/* Call the reduction kernel with different arguments depending
* on if the number of reductions is known at the compile time
*/
T* d_thread_data_dynamic;
T* d_thread_data_dynamic=0; // declared zero to suppress uninitialized use warning
if(NReduStatic == 0) {
/* Get the cub temp storage size for the dynamic shared memory kernel argument */
constexpr auto cub_temp_storage_type_size = sizeof(typename hipcub::BlockReduce<T, ARCH_BLOCKSIZE_R, hipcub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY, 1, 1>::TempStorage);
/* Allocate memory for the thread data values */
CHK_ERR(hipMallocAsync(&d_thread_data_dynamic, n_reductions * blocksize * gridsize * sizeof(T), stream[thread_id]));
CHK_ERR(hipMallocAsync(&d_thread_data_dynamic, n_reductions * blocksize * gridsize * sizeof(T), gpuStreamList[thread_id]));
/* Call the kernel (the number of reductions not known at compile time) */
reduction_kernel<Op, NDim, 0><<<gridsize, blocksize, n_reductions * cub_temp_storage_type_size, stream[thread_id]>>>(loop_body, d_const_buf, d_buf, d_limits, n_total, n_reductions, d_thread_data_dynamic);
reduction_kernel<Op, NDim, 0><<<gridsize, blocksize, n_reductions * cub_temp_storage_type_size, gpuStreamList[thread_id]>>>(loop_body, d_const_buf, d_buf, d_limits, n_total, n_reductions, d_thread_data_dynamic);
/* Synchronize and free the thread data allocation */
CHK_ERR(hipStreamSynchronize(stream[thread_id]));
CHK_ERR(hipFreeAsync(d_thread_data_dynamic, stream[thread_id]));
CHK_ERR(hipStreamSynchronize(gpuStreamList[thread_id]));
CHK_ERR(hipFreeAsync(d_thread_data_dynamic, gpuStreamList[thread_id]));
}
else{
/* Call the kernel (the number of reductions known at compile time) */
reduction_kernel<Op, NDim, NReduStatic><<<gridsize, blocksize, 0, stream[thread_id]>>>(loop_body, d_const_buf, d_buf, d_limits, n_total, n_reductions, d_thread_data_dynamic);
reduction_kernel<Op, NDim, NReduStatic><<<gridsize, blocksize, 0, gpuStreamList[thread_id]>>>(loop_body, d_const_buf, d_buf, d_limits, n_total, n_reductions, d_thread_data_dynamic);
/* Synchronize after kernel call */
CHK_ERR(hipStreamSynchronize(stream[thread_id]));
CHK_ERR(hipStreamSynchronize(gpuStreamList[thread_id]));
}
/* Copy the results back to host and free the allocated memory back to pool*/
CHK_ERR(hipMemcpyAsync(sum, d_buf, n_reductions*sizeof(T), hipMemcpyDeviceToHost, stream[thread_id]));
CHK_ERR(hipFreeAsync(d_buf, stream[thread_id]));
CHK_ERR(hipFreeAsync(d_const_buf, stream[thread_id]));
CHK_ERR(hipFreeAsync(d_limits, stream[thread_id]));
CHK_ERR(hipMemcpyAsync(sum, d_buf, n_reductions*sizeof(T), hipMemcpyDeviceToHost, gpuStreamList[thread_id]));
CHK_ERR(hipFreeAsync(d_buf, gpuStreamList[thread_id]));
CHK_ERR(hipFreeAsync(d_const_buf, gpuStreamList[thread_id]));
CHK_ERR(hipFreeAsync(d_limits, gpuStreamList[thread_id]));
}
}

Expand Down
Loading

0 comments on commit ccff5a2

Please sign in to comment.