Skip to content

Commit

Permalink
Add chicoma-gpu, GPU nodes on LANL Chicoma
Browse files Browse the repository at this point in the history
  • Loading branch information
xylar committed Feb 3, 2024
1 parent 6e7b8e6 commit b5db2b8
Show file tree
Hide file tree
Showing 5 changed files with 184 additions and 1 deletion.
10 changes: 10 additions & 0 deletions cime_config/machines/Depends.chicoma-gpu.nvidia.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Source files that get extra per-file flags when building with the
# nvidia compiler on chicoma-gpu.
list(APPEND REDUCE_OPT_LIST
  homme/src/share/derivative_mod_base.F90
)

# -Mnovect is appended to every listed file to avoid an internal compiler
# error that was observed with nvidia/21.11. DEBUG builds are left untouched.
if(NOT DEBUG)
  foreach(ITEM IN LISTS REDUCE_OPT_LIST)
    e3sm_add_flags("${ITEM}" " -Mnovect")
  endforeach()
endif()
10 changes: 10 additions & 0 deletions cime_config/machines/Depends.chicoma-gpu.nvidiagpu.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Source files that get extra per-file flags when building with the
# nvidiagpu compiler on chicoma-gpu.
list(APPEND REDUCE_OPT_LIST
  homme/src/share/derivative_mod_base.F90
)

# Work around an internal compiler error (seen with nvidia/21.11) by
# appending -Mnovect to each listed file; skipped when DEBUG is set.
if(NOT DEBUG)
  foreach(ITEM IN LISTS REDUCE_OPT_LIST)
    e3sm_add_flags("${ITEM}" " -Mnovect")
  endforeach()
endif()
30 changes: 29 additions & 1 deletion cime_config/machines/config_batch.xml
Original file line number Diff line number Diff line change
Expand Up @@ -649,7 +649,35 @@
<directive>--qos=standard </directive>
</directives>
<queues>
<queue walltimemax="24:00:00" nodemax="1792" default="true">standard</queue>
<queue walltimemax="16:00:00" nodemax="1792" default="true">standard</queue>
</queues>
</batch_system>

<!-- Slurm batch settings for the chicoma-gpu machine entry. -->
<batch_system MACH="chicoma-gpu" type="slurm">
  <!-- Applied to every job regardless of compiler. -->
  <directives>
    <directive> --partition=gpu</directive>
  </directives>

  <!-- gnugpu: one GPU per task; GPU binding depends on the compset. -->
  <directives compiler="gnugpu">
    <directive> --gpus-per-task=1</directive>
  </directives>
  <!-- A leading "!" negates the pattern: compsets that are not MMF. -->
  <directives COMPSET="!.*MMF.*" compiler="gnugpu">
    <directive> --gpu-bind=none</directive>
  </directives>
  <!-- MMF compsets map tasks onto GPUs 0 to 3 explicitly. -->
  <directives COMPSET=".*MMF.*" compiler="gnugpu">
    <directive> --gpu-bind=map_gpu:0,1,2,3</directive>
  </directives>

  <!-- nvidiagpu: one GPU per task, no GPU binding, for all compsets. -->
  <directives compiler="nvidiagpu">
    <directive> --gpus-per-task=1</directive>
    <directive> --gpu-bind=none</directive>
  </directives>

  <!-- CPU-only compilers pass "-G 0" (presumably the sbatch short form of
       the gpus option, i.e. request zero GPUs; verify against sbatch docs). -->
  <directives compiler="gnu">
    <directive> -G 0</directive>
  </directives>
  <directives compiler="nvidia">
    <directive> -G 0</directive>
  </directives>

  <!-- Single default queue: 16 hour walltime cap, at most 96 nodes. -->
  <queues>
    <queue walltimemax="16:00:00" nodemax="96" default="true">standard</queue>
  </queues>
</batch_system>

Expand Down
134 changes: 134 additions & 0 deletions cime_config/machines/config_machines.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4229,6 +4229,140 @@
</resource_limits>
</machine>

<!-- chicoma-gpu: GPU nodes of LANL Chicoma.
     Compilers "gnugpu" and "nvidiagpu" load the CUDA toolkit and the
     nvidia80 accelerator target; "gnu" and "nvidia" load the host
     accelerator target instead. -->
<machine MACH="chicoma-gpu">
  <DESC>Chicoma GPU nodes at LANL IC. Each GPU node has a single
    AMD EPYC 7713 64-Core (Milan) (256GB) and 4 NVIDIA A100 GPUs</DESC>
  <!-- NOTE(review): as a regex this is "ch-f" followed by zero or more "e";
       confirm whether "ch-fe.*" was intended and that this entry can be
       told apart from other Chicoma machine entries during auto-detection. -->
  <NODENAME_REGEX>ch-fe*</NODENAME_REGEX>
  <OS>Linux</OS>
  <COMPILERS>gnugpu,gnu,nvidiagpu,nvidia</COMPILERS>
  <MPILIBS>mpich</MPILIBS>
  <!-- Machine-specific scratch root, so GPU cases do not share a
       directory tree with the chicoma-cpu machine entry. -->
  <CIME_OUTPUT_ROOT>/lustre/scratch4/turquoise/$ENV{USER}/E3SM/scratch/chicoma-gpu</CIME_OUTPUT_ROOT>
  <DIN_LOC_ROOT>/usr/projects/e3sm/inputdata</DIN_LOC_ROOT>
  <DIN_LOC_ROOT_CLMFORC>/usr/projects/e3sm/inputdata/atm/datm7</DIN_LOC_ROOT_CLMFORC>
  <DOUT_S_ROOT>/lustre/scratch4/turquoise/$ENV{USER}/E3SM/archive/$CASE</DOUT_S_ROOT>
  <BASELINE_ROOT>/lustre/scratch4/turquoise/$ENV{USER}/E3SM/input_data/ccsm_baselines/$COMPILER</BASELINE_ROOT>
  <CCSM_CPRNC>/usr/projects/climate/SHARED_CLIMATE/software/badger/cprnc</CCSM_CPRNC>
  <GMAKE_J>10</GMAKE_J>
  <TESTS>e3sm_developer</TESTS>
  <NTEST_PARALLEL_JOBS>4</NTEST_PARALLEL_JOBS>
  <BATCH_SYSTEM>slurm</BATCH_SYSTEM>
  <SUPPORTED_BY>e3sm</SUPPORTED_BY>
  <!-- Defaults apply to the GPU compilers; the CPU-only compilers
       (gnu, nvidia) override them with larger per-node task counts. -->
  <MAX_TASKS_PER_NODE>128</MAX_TASKS_PER_NODE>
  <MAX_TASKS_PER_NODE compiler="gnu">256</MAX_TASKS_PER_NODE>
  <MAX_TASKS_PER_NODE compiler="nvidia">256</MAX_TASKS_PER_NODE>
  <MAX_MPITASKS_PER_NODE>4</MAX_MPITASKS_PER_NODE>
  <MAX_MPITASKS_PER_NODE compiler="gnu">64</MAX_MPITASKS_PER_NODE>
  <MAX_MPITASKS_PER_NODE compiler="nvidia">64</MAX_MPITASKS_PER_NODE>
  <PROJECT_REQUIRED>TRUE</PROJECT_REQUIRED>
  <mpirun mpilib="default">
    <executable>srun</executable>
    <arguments>
      <arg name="label"> --label</arg>
      <arg name="num_tasks"> -n {{ total_tasks }} -N {{ num_nodes }}</arg>
      <!-- CPUs per task = 256 / MAX_MPITASKS_PER_NODE, evaluated in the case
           directory. NOTE(review): 256 here and 128 in the binding test
           below assume 256 hardware threads per node, while DESC describes
           a single 64-core CPU; confirm these constants for the GPU nodes. -->
      <arg name="thread_count">-c $SHELL{echo 256/`./xmlquery --value MAX_MPITASKS_PER_NODE`|bc}</arg>
      <!-- Bind to cores when there are at most 128 MPI tasks per node,
           otherwise bind to hardware threads. -->
      <arg name="binding"> $SHELL{if [ 128 -ge `./xmlquery --value MAX_MPITASKS_PER_NODE` ]; then echo "--cpu_bind=cores"; else echo "--cpu_bind=threads";fi;}</arg>
      <arg name="placement"> -m plane=$SHELL{echo `./xmlquery --value MAX_MPITASKS_PER_NODE`}</arg>
    </arguments>
  </mpirun>
  <module_system type="module" allow_error="true">
    <init_path lang="perl">/usr/share/lmod/8.3.1/init/perl</init_path>
    <!-- does not exist -->
    <init_path lang="python">/usr/share/lmod/8.3.1/init/python</init_path>
    <init_path lang="sh">/usr/share/lmod/8.3.1/init/sh</init_path>
    <init_path lang="csh">/usr/share/lmod/8.3.1/init/csh</init_path>
    <cmd_path lang="perl">/usr/share/lmod/lmod/libexec/lmod perl</cmd_path>
    <cmd_path lang="python">/usr/share/lmod/lmod/libexec/lmod python</cmd_path>
    <cmd_path lang="sh">module</cmd_path>
    <cmd_path lang="csh">module</cmd_path>

    <!-- Start from a clean slate: unload anything that could conflict
         with the programming environment selected below. -->
    <modules>
      <command name="unload">cray-hdf5-parallel</command>
      <command name="unload">cray-netcdf-hdf5parallel</command>
      <command name="unload">cray-parallel-netcdf</command>
      <command name="unload">cray-netcdf</command>
      <command name="unload">cray-hdf5</command>
      <command name="unload">PrgEnv-gnu</command>
      <command name="unload">PrgEnv-intel</command>
      <command name="unload">PrgEnv-nvidia</command>
      <command name="unload">PrgEnv-cray</command>
      <command name="unload">PrgEnv-aocc</command>
      <command name="unload">intel</command>
      <command name="unload">intel-oneapi</command>
      <command name="unload">nvidia</command>
      <command name="unload">aocc</command>
      <command name="unload">cudatoolkit</command>
      <command name="unload">climate-utils</command>
      <command name="unload">craype-accel-nvidia80</command>
      <command name="unload">craype-accel-host</command>
      <command name="unload">perftools-base</command>
      <command name="unload">perftools</command>
      <command name="unload">darshan</command>
    </modules>

    <!-- Programming environment: the ".*" patterns cover both the CPU-only
         and the GPU variant of each compiler family. -->
    <modules compiler="gnu.*">
      <command name="load">PrgEnv-gnu/8.4.0</command>
      <command name="load">gcc/12.2.0</command>
    </modules>

    <modules compiler="nvidia.*">
      <command name="load">PrgEnv-nvidia/8.4.0</command>
      <command name="load">nvidia/22.7</command>
    </modules>

    <!-- GPU builds: CUDA toolkit plus the nvidia80 accelerator target. -->
    <modules compiler="gnugpu">
      <command name="load">cudatoolkit/22.7_11.7</command>
      <command name="load">craype-accel-nvidia80</command>
    </modules>

    <modules compiler="nvidiagpu">
      <command name="load">cudatoolkit/22.7_11.7</command>
      <command name="load">craype-accel-nvidia80</command>
      <command name="load">gcc-mixed/12.2.0</command>
    </modules>

    <!-- CPU-only builds: host accelerator target. -->
    <modules compiler="gnu">
      <command name="load">craype-accel-host</command>
    </modules>

    <modules compiler="nvidia">
      <command name="load">craype-accel-host</command>
    </modules>

    <!-- Libraries and tools common to every compiler. -->
    <modules>
      <command name="load">cray-libsci/23.05.1.4</command>
      <command name="load">craype/2.7.21</command>
      <command name="load">cray-mpich/8.1.26</command>
      <command name="load">libfabric/1.15.2.0</command>
      <command name="load">cray-hdf5-parallel/1.12.2.3</command>
      <command name="load">cray-netcdf-hdf5parallel/4.9.0.3</command>
      <command name="load">cray-parallel-netcdf/1.12.3.3</command>
      <command name="load">cmake/3.25.1</command>
    </modules>
  </module_system>

  <RUNDIR>$CIME_OUTPUT_ROOT/$CASE/run</RUNDIR>
  <EXEROOT>$CIME_OUTPUT_ROOT/$CASE/bld</EXEROOT>
  <TEST_TPUT_TOLERANCE>0.1</TEST_TPUT_TOLERANCE>

  <environment_variables>
    <env name="MPICH_ENV_DISPLAY">1</env>
    <env name="MPICH_VERSION_DISPLAY">1</env>
    <env name="OMP_STACKSIZE">128M</env>
    <env name="OMP_PROC_BIND">spread</env>
    <env name="OMP_PLACES">threads</env>
    <env name="HDF5_USE_FILE_LOCKING">FALSE</env>
    <!-- NOTE(review): this path is under a chicoma-cpu software directory;
         presumably shared with the GPU nodes on purpose; confirm. -->
    <env name="PERL5LIB">/usr/projects/climate/SHARED_CLIMATE/software/chicoma-cpu/perl5-only-switch/lib/perl5</env>
    <env name="PNETCDF_HINTS">romio_ds_write=disable;romio_ds_read=disable;romio_cb_write=enable;romio_cb_read=enable</env>
    <env name="FI_CXI_RX_MATCH_MODE">software</env>
    <env name="MPICH_COLL_SYNC">MPI_Bcast</env>
    <!-- NetCDF and PnetCDF install prefixes are read from environment
         variables (presumably exported by the cray modules loaded above). -->
    <env name="NETCDF_PATH">$ENV{CRAY_NETCDF_HDF5PARALLEL_PREFIX}</env>
    <env name="PNETCDF_PATH">$ENV{CRAY_PARALLEL_NETCDF_PREFIX}</env>
  </environment_variables>
  <resource_limits>
    <resource name="RLIMIT_STACK">-1</resource>
  </resource_limits>
</machine>

<machine MACH="mesabi">
<DESC>Mesabi batch queue</DESC>
<OS>LINUX</OS>
Expand Down
1 change: 1 addition & 0 deletions cime_config/machines/config_pio.xml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
<value mach="grizzly">netcdf</value>
<value mach="badger">netcdf</value>
<value mach="chicoma-cpu">netcdf</value>
<value mach="chicoma-gpu">netcdf</value>
<value mach="bebop" mpilib="impi" compset=".*CAM5.+MPAS.*">netcdf</value>
<value mach="fugaku" compiler="gnu">netcdf</value>
</values>
Expand Down

0 comments on commit b5db2b8

Please sign in to comment.