From d2fc8a27f407e8382aabc8c41a053844bbd76ac8 Mon Sep 17 00:00:00 2001 From: Markus Battarbee Date: Tue, 29 Aug 2023 16:54:49 +0300 Subject: [PATCH 1/3] ARCH fixes to LUMI-G compilation, squelch some warnings, added NOMAD makefile for hackathon --- MAKE/Makefile.NOMAD | 89 +++++++++++++++++++++++++ arch/arch_device_hip.h | 56 ++++++++-------- datareduction/datareductionoperator.cpp | 86 ++++++++++++------------ 3 files changed, 158 insertions(+), 73 deletions(-) create mode 100644 MAKE/Makefile.NOMAD diff --git a/MAKE/Makefile.NOMAD b/MAKE/Makefile.NOMAD new file mode 100644 index 000000000..5e061c556 --- /dev/null +++ b/MAKE/Makefile.NOMAD @@ -0,0 +1,89 @@ +CMP = hipcc +LNK = hipcc + +# Modules loaded (after a clean shell, no module purging, loaded one by one rather than on one line) +# module load LUMI/22.08 +# module load partition/G +# module load cpeAMD +# module load rocm/5.3.3 + +#======== Vectorization ========== +#Set vector backend type for vlasov solvers, sets precision and length. +#Options: +# AVX: VEC4D_AGNER, VEC4F_AGNER, VEC8F_AGNER +# AVX512: VEC8D_AGNER, VEC16F_AGNER +# Fallback: VECTORCLASS = VEC_FALLBACK_GENERIC (Defaults to VECL8) +VECTORCLASS = VEC_FALLBACK_GENERIC + +#===== Vector Lengths ==== +# Default for VEC_FALLBACK_GENERIC is WID=4, VECL=8 +WID=8 +VECL=64 +# WID=4 +# VECL=16 + +# Compile with GPU support (USE_HIP or USE_CUDA) +USE_HIP=1 + +#======= Compiler and compilation flags ========= +# NOTES on compiler flags: +# CXXFLAGS is for compiler flags, they are always used +# MATHFLAGS are for special math etc. flags, these are only applied on solver functions +# LDFLAGS flags for linker +# Important note: Do not edit COMPFLAGS in this file! + +CXXFLAGS += -g -O3 -x hip --amdgpu-target=gfx90a:xnack+ -std=c++17 -funroll-loops -fopenmp -fgpu-rdc -I. -Ihip -Iomp -D__HIP_PLATFORM_AMD__ -I${CRAY_MPICH_DIR}/include -W -Wall -Wno-unused-parameter -Wno-unused-result -Wno-unused-function -Wno-unused-variable -Wno-unknown-pragmas -Wno-deprecated-register -Wno-unused-but-set-variable + +testpackage: CXXFLAGS = -g -O2 -x hip --amdgpu-target=gfx90a:xnack+ -std=c++17 -fopenmp -fgpu-rdc -I. -Ihip -Iomp -D__HIP_PLATFORM_AMD__ -I${CRAY_MPICH_DIR}/include -fgpu-sanitize -W -Wall -Wno-unused-parameter -Wno-unused-result -Wno-unused-function -Wno-unused-variable -Wno-unknown-pragmas -Wno-deprecated-register -Wno-unused-but-set-variable + +LDFLAGS = -fopenmp --hip-link -lrt -lgomp -lpthread -fgpu-rdc -D__HIP_PLATFORM_AMD__ -L${CRAY_MPICH_DIR}/lib ${PE_MPICH_GTL_DIR_amd_gfx90a} --amdgpu-target=gfx90a:xnack+ +LIB_MPI = -lmpi ${PE_MPICH_GTL_LIBS_amd_gfx90a} + +# -fgpu-rdc # relocatable device code, needed for the velocity mesh +# -fgpu-sanitize + +#======== PAPI ========== +#Add PAPI_MEM define to use papi to report memory consumption? +CXXFLAGS += -DPAPI_MEM +testpackage: CXXFLAGS += -DPAPI_MEM + +#======== Allocator ========= +#Use jemalloc instead of system malloc to reduce memory fragmentation? 
https://github.com/jemalloc/jemalloc +#Configure jemalloc with --with-jemalloc-prefix=je_ when installing it +#Note: jemalloc not supported with GPUs +#CXXFLAGS += -DUSE_JEMALLOC -DJEMALLOC_NO_DEMANGLE +#testpackage: CXXFLAGS += -DUSE_JEMALLOC -DJEMALLOC_NO_DEMANGLE + +#-DNO_WRITE_AT_ALL: Define to disable write at all to +# avoid memleak (much slower IO) + +#======== Libraries =========== + +LIBRARY_PREFIX = /projappl/project_465000538/libraries + +# Compiled libraries +INC_BOOST = -isystem $(LIBRARY_PREFIX)/boost/include +LIB_BOOST = -L$(LIBRARY_PREFIX)/boost/lib -lboost_program_options -Wl,-rpath=$(LIBRARY_PREFIX)/boost/lib + +INC_ZOLTAN = -isystem $(LIBRARY_PREFIX)/zoltan/include +LIB_ZOLTAN = -L$(LIBRARY_PREFIX)/zoltan/lib -lzoltan -Wl,-rpath=$(LIBRARY_PREFIX)/zoltan/lib + +#INC_JEMALLOC = -I$(LIBRARY_PREFIX)/jemalloc/include +#LIB_JEMALLOC = -L$(LIBRARY_PREFIX)/jemalloc/lib -ljemalloc -Wl,-rpath=$(LIBRARY_PREFIX)/jemalloc/lib + +INC_PAPI = -isystem $(LIBRARY_PREFIX)/papi/include +LIB_PAPI = -lpapi -L$(LIBRARY_PREFIX)/papi/lib -Wl,-rpath=$(LIBRARY_PREFIX)/papi/lib + +INC_VLSV = -isystem $(LIBRARY_PREFIX)/vlsv +LIB_VLSV = -L$(LIBRARY_PREFIX)/vlsv -lvlsv -Wl,-rpath=$(LIBRARY_PREFIX)/vlsv + +INC_PROFILE = -isystem $(LIBRARY_PREFIX)/phiprof/include -D_ROCTX -I${ROCM_PATH}/include +LIB_PROFILE = -L$(LIBRARY_PREFIX)/phiprof/lib -lphiprof -lgfortran -Wl,-rpath=$(LIBRARY_PREFIX)/phiprof/lib -Wl,-rpath=${ROCM_PATH}/lib -lroctx64 -lroctracer64 + +# Header libraries + +INC_FSGRID = -I$(LIBRARY_PREFIX)/fsgrid/ +INC_EIGEN = -isystem $(LIBRARY_PREFIX)/eigen/ +INC_DCCRG = -I$(LIBRARY_PREFIX)/dccrg/ +# INC_VECTORCLASS = -I$(LIBRARY_PREFIX_HEADERS)/vectorclass/ # not used with GPU +INC_HASHINATOR = -isystem $(LIBRARY_PREFIX)/hashinator/ diff --git a/arch/arch_device_hip.h b/arch/arch_device_hip.h index 8497f4c88..d4f8365d2 100644 --- a/arch/arch_device_hip.h +++ b/arch/arch_device_hip.h @@ -102,11 +102,7 @@ #endif #define FULL_MASK 0xffffffffffffffff -#ifdef ARCH_MAIN - hipStream_t stream[64]; -#else - extern hipStream_t stream[]; -#endif +extern hipStream_t gpuStreamList[]; /* Define the HIP error checking macro */ #define CHK_ERR(err) (hip_error(err, __FILE__, __LINE__)) @@ -153,16 +149,16 @@ class buf { public: void syncDeviceData(void){ - CHK_ERR(hipMemcpyAsync(d_ptr, ptr, bytes, hipMemcpyHostToDevice, stream[thread_id])); + CHK_ERR(hipMemcpyAsync(d_ptr, ptr, bytes, hipMemcpyHostToDevice, gpuStreamList[thread_id])); } void syncHostData(void){ - CHK_ERR(hipMemcpyAsync(ptr, d_ptr, bytes, hipMemcpyDeviceToHost, stream[thread_id])); + CHK_ERR(hipMemcpyAsync(ptr, d_ptr, bytes, hipMemcpyDeviceToHost, gpuStreamList[thread_id])); } buf(T * const _ptr, uint _bytes) : ptr(_ptr), bytes(_bytes) { thread_id = omp_get_thread_num(); - CHK_ERR(hipMallocAsync(&d_ptr, bytes, stream[thread_id])); + CHK_ERR(hipMallocAsync(&d_ptr, bytes, gpuStreamList[thread_id])); syncDeviceData(); } @@ -173,7 +169,7 @@ class buf { if(!is_copy){ // syncHostData(); #ifdef __HIP_DEVICE_COMPILE__ - hipFreeAsync(d_ptr, stream[thread_id]); + hipFreeAsync(d_ptr, gpuStreamList[thread_id]); #endif } } @@ -204,7 +200,7 @@ __host__ __forceinline__ static void* allocate(size_t bytes) { void* ptr; const uint thread_id = omp_get_thread_num(); device_mempool_check(UINT64_MAX); - CHK_ERR(hipMallocAsync(&ptr, bytes, stream[thread_id])); + CHK_ERR(hipMallocAsync(&ptr, bytes, gpuStreamList[thread_id])); return ptr; } @@ -219,7 +215,7 @@ __host__ __forceinline__ static void* allocate(size_t bytes, hipStream_t stream) template __host__ __forceinline__ 
static void free(T* ptr) { const uint thread_id = omp_get_thread_num(); - CHK_ERR(hipFreeAsync(ptr, stream[thread_id])); + CHK_ERR(hipFreeAsync(ptr, gpuStreamList[thread_id])); } template @@ -230,7 +226,7 @@ __host__ __forceinline__ static void free(T* ptr, hipStream_t stream) { template __forceinline__ static void memcpy_h2d(T* dst, T* src, size_t bytes){ const uint thread_id = omp_get_thread_num(); - CHK_ERR(hipMemcpyAsync(dst, src, bytes, hipMemcpyHostToDevice, stream[thread_id])); + CHK_ERR(hipMemcpyAsync(dst, src, bytes, hipMemcpyHostToDevice, gpuStreamList[thread_id])); } template @@ -242,7 +238,7 @@ __forceinline__ static void memcpy_h2d(T* dst, T* src, size_t bytes, hipStream_t template __forceinline__ static void memcpy_d2h(T* dst, T* src, size_t bytes){ const uint thread_id = omp_get_thread_num(); - CHK_ERR(hipMemcpyAsync(dst, src, bytes, hipMemcpyDeviceToHost, stream[thread_id])); + CHK_ERR(hipMemcpyAsync(dst, src, bytes, hipMemcpyDeviceToHost, gpuStreamList[thread_id])); } template @@ -387,45 +383,45 @@ __forceinline__ static void parallel_reduce_driver(const uint (&limits)[NDim], L /* Create a device buffer for the reduction results */ T* d_buf; - CHK_ERR(hipMallocAsync(&d_buf, n_reductions*sizeof(T), stream[thread_id])); - CHK_ERR(hipMemcpyAsync(d_buf, sum, n_reductions*sizeof(T), hipMemcpyHostToDevice, stream[thread_id])); + CHK_ERR(hipMallocAsync(&d_buf, n_reductions*sizeof(T), gpuStreamList[thread_id])); + CHK_ERR(hipMemcpyAsync(d_buf, sum, n_reductions*sizeof(T), hipMemcpyHostToDevice, gpuStreamList[thread_id])); /* Create a device buffer to transfer the initial values to device */ T* d_const_buf; - CHK_ERR(hipMallocAsync(&d_const_buf, n_reductions*sizeof(T), stream[thread_id])); - CHK_ERR(hipMemcpyAsync(d_const_buf, d_buf, n_reductions*sizeof(T), hipMemcpyDeviceToDevice, stream[thread_id])); + CHK_ERR(hipMallocAsync(&d_const_buf, n_reductions*sizeof(T), gpuStreamList[thread_id])); + CHK_ERR(hipMemcpyAsync(d_const_buf, d_buf, n_reductions*sizeof(T), hipMemcpyDeviceToDevice, gpuStreamList[thread_id])); /* Create a device buffer to transfer the loop limits of each dimension to device */ uint* d_limits; - CHK_ERR(hipMallocAsync(&d_limits, NDim*sizeof(uint), stream[thread_id])); - CHK_ERR(hipMemcpyAsync(d_limits, limits, NDim*sizeof(uint), hipMemcpyHostToDevice,stream[thread_id])); + CHK_ERR(hipMallocAsync(&d_limits, NDim*sizeof(uint), gpuStreamList[thread_id])); + CHK_ERR(hipMemcpyAsync(d_limits, limits, NDim*sizeof(uint), hipMemcpyHostToDevice,gpuStreamList[thread_id])); /* Call the reduction kernel with different arguments depending * on if the number of reductions is known at the compile time */ - T* d_thread_data_dynamic; + T* d_thread_data_dynamic=0; // initialized to zero to suppress uninitialized-use warning if(NReduStatic == 0) { /* Get the cub temp storage size for the dynamic shared memory kernel argument */ constexpr auto cub_temp_storage_type_size = sizeof(typename hipcub::BlockReduce::TempStorage); /* Allocate memory for the thread data values */ - CHK_ERR(hipMallocAsync(&d_thread_data_dynamic, n_reductions * blocksize * gridsize * sizeof(T), stream[thread_id])); + CHK_ERR(hipMallocAsync(&d_thread_data_dynamic, n_reductions * blocksize * gridsize * sizeof(T), gpuStreamList[thread_id])); /* Call the kernel (the number of reductions not known at compile time) */ - reduction_kernel<<>>(loop_body, d_const_buf, d_buf, d_limits, n_total, n_reductions, d_thread_data_dynamic); + reduction_kernel<<>>(loop_body, d_const_buf, d_buf, d_limits, n_total, n_reductions, 
d_thread_data_dynamic); /* Synchronize and free the thread data allocation */ - CHK_ERR(hipStreamSynchronize(stream[thread_id])); - CHK_ERR(hipFreeAsync(d_thread_data_dynamic, stream[thread_id])); + CHK_ERR(hipStreamSynchronize(gpuStreamList[thread_id])); + CHK_ERR(hipFreeAsync(d_thread_data_dynamic, gpuStreamList[thread_id])); } else{ /* Call the kernel (the number of reductions known at compile time) */ - reduction_kernel<<>>(loop_body, d_const_buf, d_buf, d_limits, n_total, n_reductions, d_thread_data_dynamic); + reduction_kernel<<>>(loop_body, d_const_buf, d_buf, d_limits, n_total, n_reductions, d_thread_data_dynamic); /* Synchronize after kernel call */ - CHK_ERR(hipStreamSynchronize(stream[thread_id])); + CHK_ERR(hipStreamSynchronize(gpuStreamList[thread_id])); } /* Copy the results back to host and free the allocated memory back to pool*/ - CHK_ERR(hipMemcpyAsync(sum, d_buf, n_reductions*sizeof(T), hipMemcpyDeviceToHost, stream[thread_id])); - CHK_ERR(hipFreeAsync(d_buf, stream[thread_id])); - CHK_ERR(hipFreeAsync(d_const_buf, stream[thread_id])); - CHK_ERR(hipFreeAsync(d_limits, stream[thread_id])); + CHK_ERR(hipMemcpyAsync(sum, d_buf, n_reductions*sizeof(T), hipMemcpyDeviceToHost, gpuStreamList[thread_id])); + CHK_ERR(hipFreeAsync(d_buf, gpuStreamList[thread_id])); + CHK_ERR(hipFreeAsync(d_const_buf, gpuStreamList[thread_id])); + CHK_ERR(hipFreeAsync(d_limits, gpuStreamList[thread_id])); } } diff --git a/datareduction/datareductionoperator.cpp b/datareduction/datareductionoperator.cpp index 94b2b33bf..6bfe3b180 100644 --- a/datareduction/datareductionoperator.cpp +++ b/datareduction/datareductionoperator.cpp @@ -855,17 +855,17 @@ namespace DRO { const Real VY = block_parameters[BlockParams::VYCRD] + (j + HALF) * block_parameters[BlockParams::DVY]; const Real VZ = block_parameters[BlockParams::VZCRD] + (k + HALF) * block_parameters[BlockParams::DVZ]; // Compare the distance of the velocity cell from the center of the maxwellian distribution to the radius of the maxwellian distribution - if(calculateNonthermal == true && - ( (thermalV[0] - VX) * (thermalV[0] - VX) - + (thermalV[1] - VY) * (thermalV[1] - VY) - + (thermalV[2] - VZ) * (thermalV[2] - VZ) ) - > thermalRadius*thermalRadius - || - calculateNonthermal == false && - ( (thermalV[0] - VX) * (thermalV[0] - VX) - + (thermalV[1] - VY) * (thermalV[1] - VY) - + (thermalV[2] - VZ) * (thermalV[2] - VZ) ) - <= thermalRadius*thermalRadius ) { + if (((calculateNonthermal == true) && + (( (thermalV[0] - VX) * (thermalV[0] - VX) + + (thermalV[1] - VY) * (thermalV[1] - VY) + + (thermalV[2] - VZ) * (thermalV[2] - VZ) ) + > thermalRadius*thermalRadius)) + || + ((calculateNonthermal == false) && + (( (thermalV[0] - VX) * (thermalV[0] - VX) + + (thermalV[1] - VY) * (thermalV[1] - VY) + + (thermalV[2] - VZ) * (thermalV[2] - VZ) ) + <= thermalRadius*thermalRadius) )) { //The velocity cell is a part of the nonthermal/thermal population: lsum[0] += block_data[n * SIZE_VELBLOCK + cellIndex(i,j,k)] * DV3; } @@ -916,17 +916,17 @@ namespace DRO { const Real VY = block_parameters[BlockParams::VYCRD] + (j + HALF) * block_parameters[BlockParams::DVY]; const Real VZ = block_parameters[BlockParams::VZCRD] + (k + HALF) * block_parameters[BlockParams::DVZ]; // Calculate the distance of the velocity cell from the center of the maxwellian distribution and compare it to the approximate radius of the maxwellian distribution - if(calculateNonthermal == true && - ( (thermalV[0] - VX) * (thermalV[0] - VX) - + (thermalV[1] - VY) * (thermalV[1] - VY) - + (thermalV[2] 
- VZ) * (thermalV[2] - VZ) ) - > thermalRadius*thermalRadius + if (((calculateNonthermal == true) && + (( (thermalV[0] - VX) * (thermalV[0] - VX) + + (thermalV[1] - VY) * (thermalV[1] - VY) + + (thermalV[2] - VZ) * (thermalV[2] - VZ) ) + > thermalRadius*thermalRadius)) || - calculateNonthermal == false && - ( (thermalV[0] - VX) * (thermalV[0] - VX) - + (thermalV[1] - VY) * (thermalV[1] - VY) - + (thermalV[2] - VZ) * (thermalV[2] - VZ) ) - <= thermalRadius*thermalRadius ) { + ((calculateNonthermal == false) && + (( (thermalV[0] - VX) * (thermalV[0] - VX) + + (thermalV[1] - VY) * (thermalV[1] - VY) + + (thermalV[2] - VZ) * (thermalV[2] - VZ) ) + <= thermalRadius*thermalRadius) )) { // Add the value of the coordinates and multiply by the AVGS value of the velocity cell and the volume of the velocity cell lsum[0] += block_data[n * SIZE_VELBLOCK + cellIndex(i,j,k)]*VX*DV3; lsum[1] += block_data[n * SIZE_VELBLOCK + cellIndex(i,j,k)]*VY*DV3; @@ -987,17 +987,17 @@ namespace DRO { const Real VY = block_parameters[BlockParams::VYCRD] + (j + HALF) * block_parameters[BlockParams::DVY]; const Real VZ = block_parameters[BlockParams::VZCRD] + (k + HALF) * block_parameters[BlockParams::DVZ]; // Calculate the distance of the velocity cell from the center of the maxwellian distribution and compare it to the approximate radius of the maxwellian distribution - if(calculateNonthermal == true && - ( (thermalV[0] - VX) * (thermalV[0] - VX) - + (thermalV[1] - VY) * (thermalV[1] - VY) - + (thermalV[2] - VZ) * (thermalV[2] - VZ) ) - > thermalRadius*thermalRadius - || - calculateNonthermal == false && - ( (thermalV[0] - VX) * (thermalV[0] - VX) - + (thermalV[1] - VY) * (thermalV[1] - VY) - + (thermalV[2] - VZ) * (thermalV[2] - VZ) ) - <= thermalRadius*thermalRadius ) { + if (((calculateNonthermal == true) && + (( (thermalV[0] - VX) * (thermalV[0] - VX) + + (thermalV[1] - VY) * (thermalV[1] - VY) + + (thermalV[2] - VZ) * (thermalV[2] - VZ) ) + > thermalRadius*thermalRadius)) + || + ((calculateNonthermal == false) && + (( (thermalV[0] - VX) * (thermalV[0] - VX) + + (thermalV[1] - VY) * (thermalV[1] - VY) + + (thermalV[2] - VZ) * (thermalV[2] - VZ) ) + <= thermalRadius*thermalRadius ))) { lsum[0] += block_data[n * SIZE_VELBLOCK + cellIndex(i,j,k)] * (VX - averageVX) * (VX - averageVX) * DV3; lsum[1] += block_data[n * SIZE_VELBLOCK + cellIndex(i,j,k)] * (VY - averageVY) * (VY - averageVY) * DV3; lsum[2] += block_data[n * SIZE_VELBLOCK + cellIndex(i,j,k)] * (VZ - averageVZ) * (VZ - averageVZ) * DV3; @@ -1053,17 +1053,17 @@ namespace DRO { const Real VY = block_parameters[BlockParams::VYCRD] + (j + HALF) * block_parameters[BlockParams::DVY]; const Real VZ = block_parameters[BlockParams::VZCRD] + (k + HALF) * block_parameters[BlockParams::DVZ]; // Calculate the distance of the velocity cell from the center of the maxwellian distribution and compare it to the approximate radius of the maxwellian distribution - if(calculateNonthermal == true && - ( (thermalV[0] - VX) * (thermalV[0] - VX) - + (thermalV[1] - VY) * (thermalV[1] - VY) - + (thermalV[2] - VZ) * (thermalV[2] - VZ) ) - > thermalRadius*thermalRadius - || - calculateNonthermal == false && - ( (thermalV[0] - VX) * (thermalV[0] - VX) - + (thermalV[1] - VY) * (thermalV[1] - VY) - + (thermalV[2] - VZ) * (thermalV[2] - VZ) ) - <= thermalRadius*thermalRadius ) { + if (((calculateNonthermal == true) && + (( (thermalV[0] - VX) * (thermalV[0] - VX) + + (thermalV[1] - VY) * (thermalV[1] - VY) + + (thermalV[2] - VZ) * (thermalV[2] - VZ) ) + > thermalRadius*thermalRadius)) + || + 
((calculateNonthermal == false) && + (( (thermalV[0] - VX) * (thermalV[0] - VX) + + (thermalV[1] - VY) * (thermalV[1] - VY) + + (thermalV[2] - VZ) * (thermalV[2] - VZ) ) + <= thermalRadius*thermalRadius ))) { lsum[0] += block_data[n * SIZE_VELBLOCK + cellIndex(i,j,k)] * (VX - averageVX) * (VY - averageVY) * DV3; lsum[1] += block_data[n * SIZE_VELBLOCK + cellIndex(i,j,k)] * (VZ - averageVZ) * (VX - averageVX) * DV3; lsum[2] += block_data[n * SIZE_VELBLOCK + cellIndex(i,j,k)] * (VY - averageVY) * (VZ - averageVZ) * DV3; From b5a8f802f6ebad912e1be68064cd03668592cef7 Mon Sep 17 00:00:00 2001 From: Markus Battarbee Date: Thu, 31 Aug 2023 16:33:39 +0300 Subject: [PATCH 2/3] Use the boost module on lumi, and if WID==8, halve the block counts. Testpackage config updates from threadsafe_randomgen PR. --- MAKE/Makefile.NOMAD | 14 ++- MAKE/Makefile.lumi_hipcc | 4 +- object_wrapper.cpp | 21 ++++ parameters.cpp | 7 +- parameters.h | 2 + .../tests/Flowthrough_amr/Flowthrough_amr.cfg | 22 ++-- .../Flowthrough_trans_periodic.cfg | 24 ++-- .../Flowthrough_x_inflow_y_outflow.cfg | 22 ++-- .../Flowthrough_x_inflow_y_outflow_acc.cfg | 22 ++-- .../tests/Ionosphere_small/Fluctuations.cfg | 18 +-- .../Ionosphere_small/Ionosphere_small.cfg | 18 +-- .../Magnetosphere_3D_small.cfg | 24 ++-- .../Magnetosphere_polar_small.cfg | 24 ++-- .../Magnetosphere_small.cfg | 24 ++-- .../Selfgen_Waves_Periodic.cfg | 2 +- .../acctest_1_maxw_500k_30kms_1deg.cfg | 2 +- .../acctest_2_maxw_500k_100k_20kms_10deg.cfg | 2 +- .../acctest_3_substeps/acctest_3_substeps.cfg | 2 +- .../acctest_4_helium/acctest_4_helium.cfg | 2 +- .../acctest_5_proton_antiproton.cfg | 2 +- .../tests/restart_read/restart_read.cfg | 22 ++-- .../tests/restart_write/restart_write.cfg | 22 ++-- .../test_fp_fsolver_only_3D.cfg | 6 +- .../test_fp_substeps/test_fp_substeps.cfg | 6 +- ...transtest_2_maxw_500k_100k_20kms_20x20.cfg | 4 +- .../tests/transtest_amr/transtest_amr.cfg | 105 ------------------ 26 files changed, 174 insertions(+), 249 deletions(-) delete mode 100644 testpackage/tests/transtest_amr/transtest_amr.cfg diff --git a/MAKE/Makefile.NOMAD b/MAKE/Makefile.NOMAD index 5e061c556..d777c4464 100644 --- a/MAKE/Makefile.NOMAD +++ b/MAKE/Makefile.NOMAD @@ -6,6 +6,7 @@ LNK = hipcc # module load partition/G # module load cpeAMD # module load rocm/5.3.3 +# module load Boost/1.79.0-cpeAMD-22.08 #======== Vectorization ========== #Set vector backend type for vlasov solvers, sets precision and length. @@ -19,8 +20,8 @@ VECTORCLASS = VEC_FALLBACK_GENERIC # Default for VEC_FALLBACK_GENERIC is WID=4, VECL=8 WID=8 VECL=64 -# WID=4 -# VECL=16 +#WID=4 +#VECL=16 # Compile with GPU support (USE_HIP or USE_CUDA) USE_HIP=1 @@ -36,7 +37,7 @@ CXXFLAGS += -g -O3 -x hip --amdgpu-target=gfx90a:xnack+ -std=c++17 -funroll-loop testpackage: CXXFLAGS = -g -O2 -x hip --amdgpu-target=gfx90a:xnack+ -std=c++17 -fopenmp -fgpu-rdc -I. 
-Ihip -Iomp -D__HIP_PLATFORM_AMD__ -I${CRAY_MPICH_DIR}/include -fgpu-sanitize -W -Wall -Wno-unused-parameter -Wno-unused-result -Wno-unused-function -Wno-unused-variable -Wno-unknown-pragmas -Wno-deprecated-register -Wno-unused-but-set-variable -LDFLAGS = -fopenmp --hip-link -lrt -lgomp -lpthread -fgpu-rdc -D__HIP_PLATFORM_AMD__ -L${CRAY_MPICH_DIR}/lib ${PE_MPICH_GTL_DIR_amd_gfx90a} --amdgpu-target=gfx90a:xnack+ +LDFLAGS = -fopenmp --hip-link -lrt -lpthread -fgpu-rdc -D__HIP_PLATFORM_AMD__ -L${CRAY_MPICH_DIR}/lib ${PE_MPICH_GTL_DIR_amd_gfx90a} --amdgpu-target=gfx90a:xnack+ LIB_MPI = -lmpi ${PE_MPICH_GTL_LIBS_amd_gfx90a} # -fgpu-rdc # relocatable device code, needed for the velocity mesh @@ -59,11 +60,12 @@ testpackage: CXXFLAGS += -DPAPI_MEM #======== Libraries =========== -LIBRARY_PREFIX = /projappl/project_465000538/libraries +LIBRARY_PREFIX = /scratch/project_465000538/vlasiator/libraries # Compiled libraries -INC_BOOST = -isystem $(LIBRARY_PREFIX)/boost/include -LIB_BOOST = -L$(LIBRARY_PREFIX)/boost/lib -lboost_program_options -Wl,-rpath=$(LIBRARY_PREFIX)/boost/lib +#INC_BOOST = -isystem $(LIBRARY_PREFIX)/boost/include +#LIB_BOOST = -L$(LIBRARY_PREFIX)/boost/lib -lboost_program_options -Wl,-rpath=$(LIBRARY_PREFIX)/boost/lib +LIB_BOOST = -lboost_program_options INC_ZOLTAN = -isystem $(LIBRARY_PREFIX)/zoltan/include LIB_ZOLTAN = -L$(LIBRARY_PREFIX)/zoltan/lib -lzoltan -Wl,-rpath=$(LIBRARY_PREFIX)/zoltan/lib diff --git a/MAKE/Makefile.lumi_hipcc b/MAKE/Makefile.lumi_hipcc index 4d013c762..0784582dc 100644 --- a/MAKE/Makefile.lumi_hipcc +++ b/MAKE/Makefile.lumi_hipcc @@ -34,9 +34,9 @@ USE_HIP=1 CXXFLAGS += -g -O3 -x hip --amdgpu-target=gfx90a:xnack+ -std=c++17 -funroll-loops -fopenmp -fgpu-rdc -I. -Ihip -Iomp -D__HIP_PLATFORM_AMD__ -I${CRAY_MPICH_DIR}/include -W -Wall -Wno-unused-parameter -Wno-unused-result -Wno-unused-function -Wno-unused-variable -Wno-unknown-pragmas -Wno-deprecated-register -Wno-unused-but-set-variable -testpackage: CXXFLAGS = -g -O2 -fopenmp -x hip --amdgpu-target=gfx90a:xnack+ -I. -Ihip -Iomp -D__HIP_PLATFORM_AMD__ -I${CRAY_MPICH_DIR}/include -funroll-loops -std=c++17 -fgpu-sanitize +testpackage: CXXFLAGS = -g -O2 -x hip --amdgpu-target=gfx90a:xnack+ -std=c++17 -fopenmp -fgpu-rdc -I. 
-Ihip -Iomp -D__HIP_PLATFORM_AMD__ -I${CRAY_MPICH_DIR}/include -fgpu-sanitize -W -Wall -Wno-unused-parameter -Wno-unused-result -Wno-unused-function -Wno-unused-variable -Wno-unknown-pragmas -Wno-deprecated-register -Wno-unused-but-set-variable -LDFLAGS = -fopenmp --hip-link -lrt -lgomp -lpthread -fgpu-rdc -D__HIP_PLATFORM_AMD__ -L${CRAY_MPICH_DIR}/lib ${PE_MPICH_GTL_DIR_amd_gfx90a} --amdgpu-target=gfx90a:xnack+ +LDFLAGS = -fopenmp --hip-link -lrt -lpthread -fgpu-rdc -D__HIP_PLATFORM_AMD__ -L${CRAY_MPICH_DIR}/lib ${PE_MPICH_GTL_DIR_amd_gfx90a} --amdgpu-target=gfx90a:xnack+ LIB_MPI = -lmpi ${PE_MPICH_GTL_LIBS_amd_gfx90a} # -fgpu-rdc # relocatable device code, needed for the velocity mesh diff --git a/object_wrapper.cpp b/object_wrapper.cpp index 157011a56..cbc0c7743 100644 --- a/object_wrapper.cpp +++ b/object_wrapper.cpp @@ -154,6 +154,27 @@ bool ObjectWrapper::getPopulationParameters() { std::cerr << errormsg; } + /* Special handling of GPU version with WID=8; halve the number of blocks */ + #ifdef USE_GPU + int myRank; + MPI_Comm_rank(MPI_COMM_WORLD,&myRank); + if ((WID==8 && P::adaptGPUWID)) { + // First verify that we can halve the values + if ( (vMesh.gridLength[0]%2==0) && (vMesh.gridLength[1]%2==0) && (vMesh.gridLength[2]%2==0)) { + vMesh.gridLength[0] /= 2; + vMesh.gridLength[1] /= 2; + vMesh.gridLength[2] /= 2; + if(myRank==MASTER_RANK) { + std::cerr<<" Note: Using GPU mode with WID=8; Halving velocity block counts per dimension. Deactivate with parameter adaptGPUWID=false."< P::localCells; +bool P::adaptGPUWID = true; + vector P::systemWriteName; vector P::systemWritePath; vector P::systemWriteTimeInterval; @@ -446,7 +448,9 @@ bool P::addParameters() { RP::add("AMR.box_center_z", "z coordinate of the center of the box that is refined (for testing)", 0.0); RP::add("AMR.transShortPencils", "if true, use one-cell pencils", false); RP::addComposing("AMR.filterpasses", string("AMR filter passes for each individual refinement level")); - + + RP::add("adaptGPUWID", "if true, will halve velocity block counts if GPU is in use and WID==8", true); + RP::add("fieldtracing.fieldLineTracer", "Field line tracing method to use for coupling ionosphere and magnetosphere (options are: Euler, BS)", std::string("Euler")); RP::add("fieldtracing.tracer_max_allowed_error", "Maximum allowed error for the adaptive field line tracers ", 1000); RP::add("fieldtracing.tracer_max_attempts", "Maximum allowed attempts for the adaptive field line tracers", 100); @@ -685,6 +689,7 @@ void Parameters::getParameters() { RP::get("AMR.box_center_z", P::amrBoxCenterZ); RP::get("AMR.transShortPencils", P::amrTransShortPencils); RP::get("AMR.filterpasses", P::blurPassString); + RP::get("adaptGPUWID", P::adaptGPUWID); // If we are in an AMR run we need to set up the filtering scheme. if (P::amrMaxSpatialRefLevel>0){ diff --git a/parameters.h b/parameters.h index a296b24c8..bf2e530c1 100644 --- a/parameters.h +++ b/parameters.h @@ -72,6 +72,8 @@ struct Parameters { static bool meshRepartitioned; /*!< If true, mesh was repartitioned on this time step.*/ static std::vector localCells; /*!< Cached copy of spatial cell IDs on this process.*/ + static bool adaptGPUWID; /*!< If true, GPU runs with WID=8 use halved velocity block counts.*/ + static uint diagnosticInterval; static std::vector systemWriteName; /*!< Names for the different classes of grid output*/ static std::vector systemWritePath; /*!< Save this series in this location. 
Default is ./ */ diff --git a/testpackage/tests/Flowthrough_amr/Flowthrough_amr.cfg b/testpackage/tests/Flowthrough_amr/Flowthrough_amr.cfg index fa7cf0f4e..3a3fa751b 100644 --- a/testpackage/tests/Flowthrough_amr/Flowthrough_amr.cfg +++ b/testpackage/tests/Flowthrough_amr/Flowthrough_amr.cfg @@ -31,15 +31,15 @@ t_max = 182.0 dt = 2.0 [proton_vspace] -vx_min = -2e6 -vx_max = +2e6 -vy_min = -2e6 -vy_max = +2e6 -vz_min = -2e6 -vz_max = +2e6 -vx_length = 15 -vy_length = 15 -vz_length = 15 +vx_min = -1.92e6 +vx_max = +1.92e6 +vy_min = -1.92e6 +vy_max = +1.92e6 +vz_min = -1.92e6 +vz_max = +1.92e6 +vx_length = 16 +vy_length = 16 +vz_length = 16 [io] write_initial_state = 1 @@ -102,8 +102,8 @@ VX0 = 1e5 VY0 = 0 VZ0 = 0 -nSpaceSamples = 2 -nVelocitySamples = 2 +nSpaceSamples = 1 +nVelocitySamples = 1 [loadBalance] algorithm = RCB diff --git a/testpackage/tests/Flowthrough_trans_periodic/Flowthrough_trans_periodic.cfg b/testpackage/tests/Flowthrough_trans_periodic/Flowthrough_trans_periodic.cfg index 02a72a846..6742e540e 100644 --- a/testpackage/tests/Flowthrough_trans_periodic/Flowthrough_trans_periodic.cfg +++ b/testpackage/tests/Flowthrough_trans_periodic/Flowthrough_trans_periodic.cfg @@ -26,7 +26,7 @@ output = populations_vg_v output = vg_boundarytype output = vg_rank output = populations_vg_blocks -output = populations_vg_blocks +output = populations_vg_nonmaxwellianity diagnostic = populations_vg_blocks [gridbuilder] @@ -48,15 +48,15 @@ mass_units = PROTON charge = 1 [proton_vspace] -vx_min = -600000.0 -vx_max = +600000.0 -vy_min = -600000.0 -vy_max = +600000.0 -vz_min = -600000.0 -vz_max = +600000.0 -vx_length = 15 -vy_length = 15 -vz_length = 15 +vx_min = -640000.0 +vx_max = +640000.0 +vy_min = -640000.0 +vy_max = +640000.0 +vz_min = -640000.0 +vz_max = +640000.0 +vx_length = 32 +vy_length = 32 +vz_length = 32 [proton_sparse] minValue = 1.0e-15 @@ -80,8 +80,8 @@ rho = 1000000.0 VX0 = 4e5 VY0 = 4e5 VZ0 = 4e5 -nSpaceSamples = 2 -nVelocitySamples = 2 +nSpaceSamples = 1 +nVelocitySamples = 1 [bailout] velocity_space_wall_block_margin = 0 diff --git a/testpackage/tests/Flowthrough_x_inflow_y_outflow/Flowthrough_x_inflow_y_outflow.cfg b/testpackage/tests/Flowthrough_x_inflow_y_outflow/Flowthrough_x_inflow_y_outflow.cfg index b25de181f..5d0861f41 100644 --- a/testpackage/tests/Flowthrough_x_inflow_y_outflow/Flowthrough_x_inflow_y_outflow.cfg +++ b/testpackage/tests/Flowthrough_x_inflow_y_outflow/Flowthrough_x_inflow_y_outflow.cfg @@ -47,15 +47,15 @@ mass_units = PROTON charge = 1 [proton_vspace] -vx_min = -600000.0 -vx_max = +600000.0 -vy_min = -600000.0 -vy_max = +600000.0 -vz_min = -600000.0 -vz_max = +600000.0 -vx_length = 15 -vy_length = 15 -vz_length = 15 +vx_min = -640000.0 +vx_max = +640000.0 +vy_min = -640000.0 +vy_max = +640000.0 +vz_min = -640000.0 +vz_max = +640000.0 +vx_length = 32 +vy_length = 32 +vz_length = 32 [proton_sparse] minValue = 1.0e-15 @@ -101,8 +101,8 @@ rho = 1000000.0 VX0 = 4e5 VY0 = 0 VZ0 = 0 -nSpaceSamples = 2 -nVelocitySamples = 2 +nSpaceSamples = 1 +nVelocitySamples = 1 [bailout] velocity_space_wall_block_margin = 0 diff --git a/testpackage/tests/Flowthrough_x_inflow_y_outflow_acc/Flowthrough_x_inflow_y_outflow_acc.cfg b/testpackage/tests/Flowthrough_x_inflow_y_outflow_acc/Flowthrough_x_inflow_y_outflow_acc.cfg index c089ef51f..7eb5198a9 100644 --- a/testpackage/tests/Flowthrough_x_inflow_y_outflow_acc/Flowthrough_x_inflow_y_outflow_acc.cfg +++ b/testpackage/tests/Flowthrough_x_inflow_y_outflow_acc/Flowthrough_x_inflow_y_outflow_acc.cfg @@ -47,15 +47,15 @@ 
mass_units = PROTON charge = 1 [proton_vspace] -vx_min = -600000.0 -vx_max = +600000.0 -vy_min = -600000.0 -vy_max = +600000.0 -vz_min = -600000.0 -vz_max = +600000.0 -vx_length = 15 -vy_length = 15 -vz_length = 15 +vx_min = -640000.0 +vx_max = +640000.0 +vy_min = -640000.0 +vy_max = +640000.0 +vz_min = -640000.0 +vz_max = +640000.0 +vx_length = 32 +vy_length = 32 +vz_length = 32 [proton_sparse] minValue = 1.0e-15 @@ -101,8 +101,8 @@ rho = 1000000.0 VX0 = 4e5 VY0 = 0 VZ0 = 0 -nSpaceSamples = 2 -nVelocitySamples = 2 +nSpaceSamples = 1 +nVelocitySamples = 1 [bailout] velocity_space_wall_block_margin = 0 diff --git a/testpackage/tests/Ionosphere_small/Fluctuations.cfg b/testpackage/tests/Ionosphere_small/Fluctuations.cfg index d3092ff10..f89cb3162 100644 --- a/testpackage/tests/Ionosphere_small/Fluctuations.cfg +++ b/testpackage/tests/Ionosphere_small/Fluctuations.cfg @@ -47,15 +47,15 @@ z_max = 1.5e5 dt = 2.05e-05 [proton_vspace] -vx_min = -4.0e6 -vx_max = +4.0e6 -vy_min = -4.0e6 -vy_max = +4.0e6 -vz_min = -4.0e6 -vz_max = +4.0e6 -vx_length = 67 # 100 km/s resolution - more isn't needed for protons -vy_length = 67 -vz_length = 67 +vx_min = -3.96e6 +vx_max = +3.96e6 +vy_min = -3.96e6 +vy_max = +3.96e6 +vz_min = -3.96e6 +vz_max = +3.96e6 +vx_length = 66 # 30 km/s resolution +vy_length = 66 +vz_length = 66 [proton_sparse] minValue = 1.0e-16 #minValue = 0e-14 diff --git a/testpackage/tests/Ionosphere_small/Ionosphere_small.cfg b/testpackage/tests/Ionosphere_small/Ionosphere_small.cfg index 00d1c3316..029b1a877 100644 --- a/testpackage/tests/Ionosphere_small/Ionosphere_small.cfg +++ b/testpackage/tests/Ionosphere_small/Ionosphere_small.cfg @@ -37,15 +37,15 @@ max_spatial_level = 1 refine_radius = 2.2e8 #about 35 RE [proton_vspace] -vx_min = -4.02e6 -vx_max = +4.02e6 -vy_min = -4.02e6 -vy_max = +4.02e6 -vz_min = -4.02e6 -vz_max = +4.02e6 -vx_length = 67 -vy_length = 67 -vz_length = 67 +vx_min = -3.96e6 +vx_max = +3.96e6 +vy_min = -3.96e6 +vy_max = +3.96e6 +vz_min = -3.96e6 +vz_max = +3.96e6 +vx_length = 66 # 30 km/s resolution +vy_length = 66 +vz_length = 66 [proton_sparse] minValue = 1.0e-15 diff --git a/testpackage/tests/Magnetosphere_3D_small/Magnetosphere_3D_small.cfg b/testpackage/tests/Magnetosphere_3D_small/Magnetosphere_3D_small.cfg index 83b52c3bd..27c353aa2 100644 --- a/testpackage/tests/Magnetosphere_3D_small/Magnetosphere_3D_small.cfg +++ b/testpackage/tests/Magnetosphere_3D_small/Magnetosphere_3D_small.cfg @@ -37,15 +37,15 @@ max_spatial_level = 1 refine_radius = 2.2e8 #about 35 RE [proton_vspace] -vx_min = -4.02e6 -vx_max = +4.02e6 -vy_min = -4.02e6 -vy_max = +4.02e6 -vz_min = -4.02e6 -vz_max = +4.02e6 -vx_length = 67 -vy_length = 67 -vz_length = 67 +vx_min = -3.96e6 +vx_max = +3.96e6 +vy_min = -3.96e6 +vy_max = +3.96e6 +vz_min = -3.96e6 +vz_max = +3.96e6 +vx_length = 66 # 30 km/s resolution +vy_length = 66 +vz_length = 66 [proton_sparse] minValue = 1.0e-15 @@ -81,16 +81,16 @@ periodic_y = no periodic_z = no boundary = Outflow boundary = Maxwellian -boundary = Conductingsphere +boundary = Copysphere -[conductingsphere] +[copysphere] centerX = 0.0 centerY = 0.0 centerZ = 0.0 radius = 70e6 precedence = 2 -[proton_conductingsphere] +[proton_copysphere] rho = 1.0e6 T = 0.5e6 diff --git a/testpackage/tests/Magnetosphere_polar_small/Magnetosphere_polar_small.cfg b/testpackage/tests/Magnetosphere_polar_small/Magnetosphere_polar_small.cfg index a3a64c807..a1d7a9b3d 100644 --- a/testpackage/tests/Magnetosphere_polar_small/Magnetosphere_polar_small.cfg +++ 
b/testpackage/tests/Magnetosphere_polar_small/Magnetosphere_polar_small.cfg @@ -37,15 +37,15 @@ t_max = 20.05 [proton_vspace] -vx_min = -2.0e6 -vx_max = +2.0e6 -vy_min = -2.0e6 -vy_max = +2.0e6 -vz_min = -2.0e6 -vz_max = +2.0e6 -vx_length = 25 -vy_length = 25 -vz_length = 25 +vx_min = -2.08e6 +vx_max = +2.08e6 +vy_min = -2.08e6 +vy_max = +2.08e6 +vz_min = -2.08e6 +vz_max = +2.08e6 +vx_length = 26 # 40 km/s resolution +vy_length = 26 +vz_length = 26 [proton_sparse] minValue = 1.0e-15 @@ -87,9 +87,9 @@ periodic_y = yes periodic_z = no boundary = Outflow boundary = Maxwellian -boundary = Conductingsphere +boundary = Copysphere -[conductingsphere] +[copysphere] centerX = 0.0 centerY = 0.0 centerZ = 0.0 @@ -97,7 +97,7 @@ geometry = 2 radius = 50.0e6 precedence = 2 -[proton_conductingsphere] +[proton_copysphere] rho = 1.0e6 T = 0.5e6 VX0 = 0.0 diff --git a/testpackage/tests/Magnetosphere_small/Magnetosphere_small.cfg b/testpackage/tests/Magnetosphere_small/Magnetosphere_small.cfg index 35f3bc5ca..621e45826 100644 --- a/testpackage/tests/Magnetosphere_small/Magnetosphere_small.cfg +++ b/testpackage/tests/Magnetosphere_small/Magnetosphere_small.cfg @@ -33,15 +33,15 @@ t_max = 10.1 #timestep_max = 100 [proton_vspace] -vx_min = -2.0e6 -vx_max = +2.0e6 -vy_min = -2.0e6 -vy_max = +2.0e6 -vz_min = -2.0e6 -vz_max = +2.0e6 -vx_length = 25 -vy_length = 25 -vz_length = 25 +vx_min = -2.08e6 +vx_max = +2.08e6 +vy_min = -2.08e6 +vy_max = +2.08e6 +vz_min = -2.08e6 +vz_max = +2.08e6 +vx_length = 26 # 40 km/s resolution +vy_length = 26 +vz_length = 26 [proton_sparse] minValue = 1.0e-15 @@ -77,16 +77,16 @@ periodic_y = no periodic_z = yes boundary = Outflow boundary = Maxwellian -boundary = Conductingsphere +boundary = Copysphere -[conductingsphere] +[copysphere] centerX = 0.0 centerY = 0.0 centerZ = 0.0 radius = 38.2e6 precedence = 2 -[proton_conductingsphere] +[proton_copysphere] rho = 1.0e6 T=100000.0 diff --git a/testpackage/tests/Selfgen_Waves_Periodic/Selfgen_Waves_Periodic.cfg b/testpackage/tests/Selfgen_Waves_Periodic/Selfgen_Waves_Periodic.cfg index 3eac7f946..ee1a11d3d 100644 --- a/testpackage/tests/Selfgen_Waves_Periodic/Selfgen_Waves_Periodic.cfg +++ b/testpackage/tests/Selfgen_Waves_Periodic/Selfgen_Waves_Periodic.cfg @@ -77,7 +77,7 @@ magYPertAbsAmp = 0 magZPertAbsAmp = 0 densityModel = testcase -nVelocitySamples = 3 +nVelocitySamples = 1 [proton_MultiPeak] n = 2 diff --git a/testpackage/tests/acctest_1_maxw_500k_30kms_1deg/acctest_1_maxw_500k_30kms_1deg.cfg b/testpackage/tests/acctest_1_maxw_500k_30kms_1deg/acctest_1_maxw_500k_30kms_1deg.cfg index 4e73cbbdb..53610b1cc 100644 --- a/testpackage/tests/acctest_1_maxw_500k_30kms_1deg/acctest_1_maxw_500k_30kms_1deg.cfg +++ b/testpackage/tests/acctest_1_maxw_500k_30kms_1deg/acctest_1_maxw_500k_30kms_1deg.cfg @@ -113,7 +113,7 @@ lambda = 10 magXPertAbsAmp = 0 magYPertAbsAmp = 0 magZPertAbsAmp = 0 -nVelocitySamples = 4 +nVelocitySamples = 1 [proton_MultiPeak] n = 1 diff --git a/testpackage/tests/acctest_2_maxw_500k_100k_20kms_10deg/acctest_2_maxw_500k_100k_20kms_10deg.cfg b/testpackage/tests/acctest_2_maxw_500k_100k_20kms_10deg/acctest_2_maxw_500k_100k_20kms_10deg.cfg index 346331a9a..ee5eaa432 100644 --- a/testpackage/tests/acctest_2_maxw_500k_100k_20kms_10deg/acctest_2_maxw_500k_100k_20kms_10deg.cfg +++ b/testpackage/tests/acctest_2_maxw_500k_100k_20kms_10deg/acctest_2_maxw_500k_100k_20kms_10deg.cfg @@ -77,7 +77,7 @@ Bz = 1.1135233442526334e-10 magXPertAbsAmp = 0 magYPertAbsAmp = 0 magZPertAbsAmp = 0 -nVelocitySamples = 3 +nVelocitySamples = 1 
[proton_MultiPeak] n = 2 diff --git a/testpackage/tests/acctest_3_substeps/acctest_3_substeps.cfg b/testpackage/tests/acctest_3_substeps/acctest_3_substeps.cfg index 4381c1f90..303640837 100644 --- a/testpackage/tests/acctest_3_substeps/acctest_3_substeps.cfg +++ b/testpackage/tests/acctest_3_substeps/acctest_3_substeps.cfg @@ -81,7 +81,7 @@ Bz = 1.1135233442526334e-10 magXPertAbsAmp = 0 magYPertAbsAmp = 0 magZPertAbsAmp = 0 -nVelocitySamples = 3 +nVelocitySamples = 1 [proton_MultiPeak] n = 2 diff --git a/testpackage/tests/acctest_4_helium/acctest_4_helium.cfg b/testpackage/tests/acctest_4_helium/acctest_4_helium.cfg index ff8b4d8c8..fc93ac49a 100644 --- a/testpackage/tests/acctest_4_helium/acctest_4_helium.cfg +++ b/testpackage/tests/acctest_4_helium/acctest_4_helium.cfg @@ -80,7 +80,7 @@ lambda = 10 magXPertAbsAmp = 0 magYPertAbsAmp = 0 magZPertAbsAmp = 0 -nVelocitySamples = 4 +nVelocitySamples = 1 [helium_MultiPeak] n = 1 diff --git a/testpackage/tests/acctest_5_proton_antiproton/acctest_5_proton_antiproton.cfg b/testpackage/tests/acctest_5_proton_antiproton/acctest_5_proton_antiproton.cfg index 8134fb6a3..f8c3d23e2 100644 --- a/testpackage/tests/acctest_5_proton_antiproton/acctest_5_proton_antiproton.cfg +++ b/testpackage/tests/acctest_5_proton_antiproton/acctest_5_proton_antiproton.cfg @@ -97,7 +97,7 @@ Bz = 1.1135233442526334e-10 magXPertAbsAmp = 0 magYPertAbsAmp = 0 magZPertAbsAmp = 0 -nVelocitySamples = 3 +nVelocitySamples = 1 [proton_MultiPeak] n = 2 diff --git a/testpackage/tests/restart_read/restart_read.cfg b/testpackage/tests/restart_read/restart_read.cfg index ff3295fed..225e22b76 100644 --- a/testpackage/tests/restart_read/restart_read.cfg +++ b/testpackage/tests/restart_read/restart_read.cfg @@ -50,15 +50,15 @@ mass_units = PROTON charge = 1 [proton_vspace] -vx_min = -600000.0 -vx_max = +600000.0 -vy_min = -600000.0 -vy_max = +600000.0 -vz_min = -600000.0 -vz_max = +600000.0 -vx_length = 15 -vy_length = 15 -vz_length = 15 +vx_min = -640000.0 +vx_max = +640000.0 +vy_min = -640000.0 +vy_max = +640000.0 +vz_min = -640000.0 +vz_max = +640000.0 +vx_length = 32 +vy_length = 32 +vz_length = 32 [proton_sparse] minValue = 1.0e-15 @@ -104,8 +104,8 @@ rho = 1000000.0 VX0 = 4e5 VY0 = 0 VZ0 = 0 -nSpaceSamples = 2 -nVelocitySamples = 2 +nSpaceSamples = 1 +nVelocitySamples = 1 [bailout] velocity_space_wall_block_margin = 0 diff --git a/testpackage/tests/restart_write/restart_write.cfg b/testpackage/tests/restart_write/restart_write.cfg index dbe9170eb..06105607e 100644 --- a/testpackage/tests/restart_write/restart_write.cfg +++ b/testpackage/tests/restart_write/restart_write.cfg @@ -53,15 +53,15 @@ mass_units = PROTON charge = 1 [proton_vspace] -vx_min = -600000.0 -vx_max = +600000.0 -vy_min = -600000.0 -vy_max = +600000.0 -vz_min = -600000.0 -vz_max = +600000.0 -vx_length = 15 -vy_length = 15 -vz_length = 15 +vx_min = -640000.0 +vx_max = +640000.0 +vy_min = -640000.0 +vy_max = +640000.0 +vz_min = -640000.0 +vz_max = +640000.0 +vx_length = 32 +vy_length = 32 +vz_length = 32 [proton_sparse] minValue = 1.0e-15 @@ -107,8 +107,8 @@ rho = 1000000.0 VX0 = 4e5 VY0 = 0 VZ0 = 0 -nSpaceSamples = 2 -nVelocitySamples = 2 +nSpaceSamples = 1 +nVelocitySamples = 1 [bailout] velocity_space_wall_block_margin = 0 diff --git a/testpackage/tests/test_fp_fsolver_only_3D/test_fp_fsolver_only_3D.cfg b/testpackage/tests/test_fp_fsolver_only_3D/test_fp_fsolver_only_3D.cfg index a9fbf8333..979936d54 100644 --- a/testpackage/tests/test_fp_fsolver_only_3D/test_fp_fsolver_only_3D.cfg +++ 
b/testpackage/tests/test_fp_fsolver_only_3D/test_fp_fsolver_only_3D.cfg @@ -45,9 +45,9 @@ vy_min = -2.0e4 vy_max = +2.0e4 vz_min = -2.0e4 vz_max = +2.0e4 -vx_length = 9 -vy_length = 9 -vz_length = 9 +vx_length = 10 +vy_length = 10 +vz_length = 10 [proton_sparse] minValue = 1.0e-15 diff --git a/testpackage/tests/test_fp_substeps/test_fp_substeps.cfg b/testpackage/tests/test_fp_substeps/test_fp_substeps.cfg index 72f8c2fae..10638e883 100644 --- a/testpackage/tests/test_fp_substeps/test_fp_substeps.cfg +++ b/testpackage/tests/test_fp_substeps/test_fp_substeps.cfg @@ -45,9 +45,9 @@ vy_min = -2.0e4 vy_max = +2.0e4 vz_min = -2.0e4 vz_max = +2.0e4 -vx_length = 9 -vy_length = 9 -vz_length = 9 +vx_length = 10 +vy_length = 10 +vz_length = 10 [proton_sparse] minValue = 1.0e-15 diff --git a/testpackage/tests/transtest_2_maxw_500k_100k_20kms_20x20/transtest_2_maxw_500k_100k_20kms_20x20.cfg b/testpackage/tests/transtest_2_maxw_500k_100k_20kms_20x20/transtest_2_maxw_500k_100k_20kms_20x20.cfg index 4c4be9db7..00773fc06 100644 --- a/testpackage/tests/transtest_2_maxw_500k_100k_20kms_20x20/transtest_2_maxw_500k_100k_20kms_20x20.cfg +++ b/testpackage/tests/transtest_2_maxw_500k_100k_20kms_20x20/transtest_2_maxw_500k_100k_20kms_20x20.cfg @@ -44,7 +44,7 @@ vz_max = +2.0e6 vx_length = 50 vy_length = 50 vz_length = 50 -max_refinement_level = 0 + [proton_sparse] minValue = 1.0e-16 @@ -77,7 +77,7 @@ magXPertAbsAmp = 0 magYPertAbsAmp = 0 magZPertAbsAmp = 0 -nVelocitySamples = 3 +nVelocitySamples = 1 [proton_MultiPeak] n = 1 diff --git a/testpackage/tests/transtest_amr/transtest_amr.cfg b/testpackage/tests/transtest_amr/transtest_amr.cfg deleted file mode 100644 index bb065566a..000000000 --- a/testpackage/tests/transtest_amr/transtest_amr.cfg +++ /dev/null @@ -1,105 +0,0 @@ -dynamic_timestep = 1 -project = testAmr -ParticlePopulations = proton -propagate_field = 0 -propagate_vlasov_acceleration = 0 -propagate_vlasov_translation = 1 - -[proton_properties] -mass = 1 -mass_units = PROTON -charge = 1 - -[io] -diagnostic_write_interval = 1 -write_initial_state = 1 - -system_write_t_interval = 180.0 -system_write_file_name = fullf -system_write_distribution_stride = 1 -system_write_distribution_xline_stride = 0 -system_write_distribution_yline_stride = 0 -system_write_distribution_zline_stride = 0 - -[AMR] -max_spatial_level = 2 -box_half_width_x = 1 -box_half_width_y = 1 -box_half_width_z = 1 -box_center_x = 1.0e6 -box_center_y = 1.0e6 -box_center_z = 1.0e6 - -[gridbuilder] -x_length = 8 -y_length = 8 -z_length = 8 -x_min = -1.0e6 -x_max = 1.0e6 -y_min = -1.0e6 -y_max = 1.0e6 -z_min = -1.0e6 -z_max = 1.0e6 -t_max = 181.0 - -[proton_vspace] -vx_min = -2.0e6 -vx_max = +2.0e6 -vy_min = -2.0e6 -vy_max = +2.0e6 -vz_min = -2.0e6 -vz_max = +2.0e6 -vx_length = 1 -vy_length = 1 -vz_length = 1 -max_refinement_level = 1 -[proton_sparse] -minValue = 1.0e-16 - -[boundaries] -periodic_x = yes -periodic_y = yes -periodic_z = yes - -[variables] -output = populations_vg_rho -output = fg_b -output = vg_pressure -output = populations_vg_v -output = fg_e -output = vg_rank -output = populations_vg_blocks -#output = populations_vg_acceleration_subcycles - -diagnostic = populations_vg_blocks -#diagnostic = vg_pressure -#diagnostic = populations_vg_rho -#diagnostic = populations_vg_rho_loss_adjust - -[testAmr] -#magnitude of 1.82206867e-10 gives a period of 360s, useful for testing... 
-Bx = 1.2e-10 -By = 0.8e-10 -Bz = 1.1135233442526334e-10 -magXPertAbsAmp = 0 -magYPertAbsAmp = 0 -magZPertAbsAmp = 0 -densityModel = uniform -nVelocitySamples = 3 - -[proton_testAmr] -n = 1 -Vx = 5e5 -Vy = 5e5 -Vz = 0.0 -Tx = 500000.0 -Ty = 500000.0 -Tz = 500000.0 -rho = 1.0e6 -rhoPertAbsAmp = 1.0e5 - -[loadBalance] -algorithm = RCB - -[bailout] -velocity_space_wall_block_margin = 0 From 10f2c7724e4d85e6d11e78bab3b03bd1e2d52468 Mon Sep 17 00:00:00 2001 From: Markus Battarbee Date: Thu, 31 Aug 2023 17:05:42 +0300 Subject: [PATCH 3/3] Fixed compilation of vlsvdiff on GPU systems --- Makefile | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index cd7cbc8bc..983510c4c 100644 --- a/Makefile +++ b/Makefile @@ -335,6 +335,7 @@ DEPS_PARTICLES = particles/particles.h particles/particles.cpp particles/field.h readparameters.h version.h particles/scenario.h particles/histogram.h OBJS_PARTICLES = particles/physconst.o particles/particles.o particles/readfields.o particles/particleparameters.o particles/distribution.o readparameters.o version.o particles/scenario.o particles/histogram.o +# todo: verify compilation and working of tools other than vlsvdiff vlsvextract: ${DEPS_VLSVREADER} ${DEPS_VLSVREADERINTERFACE} tools/vlsvextract.h tools/vlsvextract.cpp ${OBJS_VLSVREADER} ${OBJS_VLSVREADERINTERFACE} ${CMP} ${CXXFLAGS} ${FLAGS} -c tools/vlsvextract.cpp ${INC_BOOST} ${INC_DCCRG} ${INC_EIGEN} ${INC_VLSV} -I$(CURDIR) ${LNK} -o vlsvextract_${FP_PRECISION} vlsvextract.o ${OBJS_VLSVREADERINTERFACE} ${LIB_BOOST} ${LIB_DCCRG} ${LIB_VLSV} ${LDFLAGS} @@ -343,9 +344,10 @@ vlsv2silo: ${DEPS_VLSVREADERINTERFACE} tools/vlsv2silo.cpp ${OBJS_VLSVREADERIN ${CMP} ${CXXFLAGS} ${FLAGS} -c tools/vlsv2silo.cpp ${INC_SILO} ${INC_VLSV} -I$(CURDIR) ${LNK} -o vlsv2silo_${FP_PRECISION} vlsv2silo.o ${OBJS_VLSVREADERINTERFACE} ${LIB_SILO} ${LIB_VLSV} ${LDFLAGS} -vlsvdiff: ${DEPS_VLSVREADERINTERFACE} tools/vlsvdiff.cpp ${OBJS_VLSVREADEREXTRA} ${OBJS_VLSVREADERINTERFACE} - ${CMP} ${CXXEXTRAFLAGS} ${FLAGS} -c tools/vlsvdiff.cpp ${INC_VLSV} -I$(CURDIR) - ${LNK} -o vlsvdiff_${FP_PRECISION} vlsvdiff.o ${OBJS_VLSVREADERINTERFACE} ${LIB_VLSV} ${LDFLAGS} +vlsvdiff: tools/vlsvdiff.cpp + @echo [CC] $< + $(SILENT)$(CMP) $(CXXEXTRAFLAGS) ${MATHFLAGS} ${FLAGS} -c tools/vlsvdiff.cpp ${INC_DCCRG} ${INC_VLSV} ${INC_FSGRID} + $(SILENT)${LNK} ${LDFLAGS} -o vlsvdiff_${FP_PRECISION} vlsvdiff.o ${OBJS_VLSVREADERINTERFACE} ${LIB_VLSV} ${LIBS} vlsvreaderinterface.o: tools/vlsvreaderinterface.h tools/vlsvreaderinterface.cpp ${CMP} ${CXXFLAGS} ${FLAGS} -c tools/vlsvreaderinterface.cpp ${INC_VLSV} -I$(CURDIR)
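
Note on the explicit grouping added to datareductionoperator.cpp in PATCH 1/3: the original conditions relied on && binding more tightly than ||, which made the intent (nonthermal cells outside the thermal sphere, or thermal cells inside it) easy to misread and produced compiler warnings; the patch only adds parentheses, the selection logic is unchanged. The following is a minimal standalone sketch of that selection test, not code from the patch: the helper name isInSelectedPopulation, the use of plain double in place of Vlasiator's Real, and the bulk velocity and radius values in main() are all invented for illustration.

#include <cstdio>

// Hypothetical helper (not part of the patch): true when the velocity cell at
// (VX,VY,VZ) belongs to the requested population -- farther than thermalRadius
// from the thermal bulk velocity when selecting the nonthermal population,
// within (or at) that radius when selecting the thermal population.
static bool isInSelectedPopulation(bool calculateNonthermal,
                                   const double thermalV[3], double thermalRadius,
                                   double VX, double VY, double VZ) {
   const double dist2 = (thermalV[0] - VX) * (thermalV[0] - VX)
                      + (thermalV[1] - VY) * (thermalV[1] - VY)
                      + (thermalV[2] - VZ) * (thermalV[2] - VZ);
   return calculateNonthermal ? (dist2 > thermalRadius * thermalRadius)
                              : (dist2 <= thermalRadius * thermalRadius);
}

int main(void) {
   const double thermalV[3] = {0.0, 0.0, 0.0}; // example bulk velocity at the origin
   const double thermalRadius = 4.0e5;         // example radius, 400 km/s
   // A cell 500 km/s away from the bulk velocity counts as nonthermal, not thermal:
   std::printf("nonthermal: %d\n", isInSelectedPopulation(true,  thermalV, thermalRadius, 5.0e5, 0.0, 0.0));
   std::printf("thermal:    %d\n", isInSelectedPopulation(false, thermalV, thermalRadius, 5.0e5, 0.0, 0.0));
   return 0;
}

The reducers in the patch keep the distance expression inline inside the velocity-cell loops; this sketch only spells out the grouping that the added parentheses make explicit.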