From b5db2b82a47f7420262f865ab560bc47e1d71da7 Mon Sep 17 00:00:00 2001 From: Xylar Asay-Davis Date: Sat, 27 Jan 2024 09:35:00 -0700 Subject: [PATCH] Add chicoma-gpu, GPU nodes on LANL Chicoma --- .../machines/Depends.chicoma-gpu.nvidia.cmake | 10 ++ .../Depends.chicoma-gpu.nvidiagpu.cmake | 10 ++ cime_config/machines/config_batch.xml | 30 +++- cime_config/machines/config_machines.xml | 134 ++++++++++++++++++ cime_config/machines/config_pio.xml | 1 + 5 files changed, 184 insertions(+), 1 deletion(-) create mode 100644 cime_config/machines/Depends.chicoma-gpu.nvidia.cmake create mode 100644 cime_config/machines/Depends.chicoma-gpu.nvidiagpu.cmake diff --git a/cime_config/machines/Depends.chicoma-gpu.nvidia.cmake b/cime_config/machines/Depends.chicoma-gpu.nvidia.cmake new file mode 100644 index 000000000000..89235ac5efd1 --- /dev/null +++ b/cime_config/machines/Depends.chicoma-gpu.nvidia.cmake @@ -0,0 +1,10 @@ +list(APPEND REDUCE_OPT_LIST + homme/src/share/derivative_mod_base.F90 +) + +# Can use this flag to avoid internal compiler error for this file (with nvidia/21.11) +if (NOT DEBUG) + foreach(ITEM IN LISTS REDUCE_OPT_LIST) + e3sm_add_flags("${ITEM}" " -Mnovect") + endforeach() +endif() diff --git a/cime_config/machines/Depends.chicoma-gpu.nvidiagpu.cmake b/cime_config/machines/Depends.chicoma-gpu.nvidiagpu.cmake new file mode 100644 index 000000000000..89235ac5efd1 --- /dev/null +++ b/cime_config/machines/Depends.chicoma-gpu.nvidiagpu.cmake @@ -0,0 +1,10 @@ +list(APPEND REDUCE_OPT_LIST + homme/src/share/derivative_mod_base.F90 +) + +# Can use this flag to avoid internal compiler error for this file (with nvidia/21.11) +if (NOT DEBUG) + foreach(ITEM IN LISTS REDUCE_OPT_LIST) + e3sm_add_flags("${ITEM}" " -Mnovect") + endforeach() +endif() diff --git a/cime_config/machines/config_batch.xml b/cime_config/machines/config_batch.xml index 9441a193ecec..69744f5430ab 100644 --- a/cime_config/machines/config_batch.xml +++ b/cime_config/machines/config_batch.xml @@ -649,7 +649,35 @@ --qos=standard - standard + standard + + + + + + --partition=gpu + + + --gpus-per-task=1 + + + --gpu-bind=none + + + --gpu-bind=map_gpu:0,1,2,3 + + + --gpus-per-task=1 + --gpu-bind=none + + + -G 0 + + + -G 0 + + + standard diff --git a/cime_config/machines/config_machines.xml b/cime_config/machines/config_machines.xml index fe81f85d250d..33550f30dc7a 100644 --- a/cime_config/machines/config_machines.xml +++ b/cime_config/machines/config_machines.xml @@ -4229,6 +4229,140 @@ + + Chicoma GPU nodes at LANL IC. Each GPU node has single +AMD EPYC 7713 64-Core (Milan) (256GB) and 4 nvidia A100' + ch-fe* + Linux + gnugpu,gnu,nvidiagpu,nvidia + mpich + /lustre/scratch4/turquoise/$ENV{USER}/E3SM/scratch/chicoma-cpu + /usr/projects/e3sm/inputdata + /usr/projects/e3sm/inputdata/atm/datm7 + /lustre/scratch4/turquoise/$ENV{USER}/E3SM/archive/$CASE + /lustre/scratch4/turquoise/$ENV{USER}/E3SM/input_data/ccsm_baselines/$COMPILER + /usr/projects/climate/SHARED_CLIMATE/software/badger/cprnc + 10 + e3sm_developer + 4 + slurm + e3sm + 128 + 256 + 256 + 4 + 64 + 64 + TRUE + + srun + + --label + -n {{ total_tasks }} -N {{ num_nodes }} + -c $SHELL{echo 256/`./xmlquery --value MAX_MPITASKS_PER_NODE`|bc} + $SHELL{if [ 128 -ge `./xmlquery --value MAX_MPITASKS_PER_NODE` ]; then echo "--cpu_bind=cores"; else echo "--cpu_bind=threads";fi;} + -m plane=$SHELL{echo `./xmlquery --value MAX_MPITASKS_PER_NODE`} + + + + /usr/share/lmod/8.3.1/init/perl + + /usr/share/lmod/8.3.1/init/python + /usr/share/lmod/8.3.1/init/sh + /usr/share/lmod/8.3.1/init/csh + /usr/share/lmod/lmod/libexec/lmod perl + /usr/share/lmod/lmod/libexec/lmod python + module + module + + + cray-hdf5-parallel + cray-netcdf-hdf5parallel + cray-parallel-netcdf + cray-netcdf + cray-hdf5 + PrgEnv-gnu + PrgEnv-intel + PrgEnv-nvidia + PrgEnv-cray + PrgEnv-aocc + intel + intel-oneapi + nvidia + aocc + cudatoolkit + climate-utils + craype-accel-nvidia80 + craype-accel-host + perftools-base + perftools + darshan + + + + PrgEnv-gnu/8.4.0 + gcc/12.2.0 + + + + PrgEnv-nvidia/8.4.0 + nvidia/22.7 + + + + cudatoolkit/22.7_11.7 + craype-accel-nvidia80 + + + + cudatoolkit/22.7_11.7 + craype-accel-nvidia80 + gcc-mixed/12.2.0 + + + + craype-accel-host + + + + craype-accel-host + + + + cray-libsci/23.05.1.4 + craype/2.7.21 + cray-mpich/8.1.26 + libfabric/1.15.2.0 + cray-hdf5-parallel/1.12.2.3 + cray-netcdf-hdf5parallel/4.9.0.3 + cray-parallel-netcdf/1.12.3.3 + cmake/3.25.1 + + + + $CIME_OUTPUT_ROOT/$CASE/run + $CIME_OUTPUT_ROOT/$CASE/bld + 0.1 + + + 1 + 1 + 128M + spread + threads + FALSE + /usr/projects/climate/SHARED_CLIMATE/software/chicoma-cpu/perl5-only-switch/lib/perl5 + romio_ds_write=disable;romio_ds_read=disable;romio_cb_write=enable;romio_cb_read=enable + software + MPI_Bcast + $ENV{CRAY_NETCDF_HDF5PARALLEL_PREFIX} + $ENV{CRAY_PARALLEL_NETCDF_PREFIX} + + + -1 + + + Mesabi batch queue LINUX diff --git a/cime_config/machines/config_pio.xml b/cime_config/machines/config_pio.xml index e1784b1618d1..51aa6fc31ebe 100644 --- a/cime_config/machines/config_pio.xml +++ b/cime_config/machines/config_pio.xml @@ -66,6 +66,7 @@ netcdf netcdf netcdf + netcdf netcdf netcdf