Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add flexible controls of GPU configuration #4396

Merged
merged 26 commits into from
Aug 16, 2023
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
bb7175c
Add Jim's changes for Gust's GPU options, based on https://github.com…
sjsprecious Mar 23, 2023
6667b40
Change the settings for Gust GPU nodes
sjsprecious Mar 23, 2023
e0d625e
Add missing definition of max_cputasks_per_gpu_node
sjsprecious Mar 24, 2023
fd2ae21
Add more missing attrs
sjsprecious Mar 24, 2023
7e372b5
Add hard-code check for valid gpu_type and gpu_offload
sjsprecious Mar 25, 2023
403d4f3
Use a more general way for valid value check
sjsprecious Mar 25, 2023
8d7e923
update error message for gpu_type and gpu_offload
sjsprecious Mar 25, 2023
080972b
fix a typo
sjsprecious Mar 25, 2023
7cdc7be
bug fix for PE layout and batch script template on a GPU node
sjsprecious Mar 25, 2023
18aa22e
Add a sanity check for compiler on GPU nodes
sjsprecious Mar 27, 2023
802fb04
Fix a bug for setting two GPU XML variables
sjsprecious Apr 11, 2023
82c617f
Bug fix of an invalid input for GPU_TYPE and GPU_OFFLOAD
sjsprecious Apr 12, 2023
d7fb13b
Merge branch 'master' into add_gpu_gust
sjsprecious Apr 17, 2023
e3607d1
update handling of "none" input for GPU_TYPE and GPU_OFFLOAD
sjsprecious Apr 18, 2023
011e807
Merge branch 'master' into add_gpu_gust
sjsprecious Aug 7, 2023
2341850
fix the usage of wrapper script for Derecho
sjsprecious Aug 7, 2023
de5476a
update description for Derecho
sjsprecious Aug 7, 2023
f6c42fd
use a generic way to apply the MPI wrapper script
sjsprecious Aug 8, 2023
5b3d157
one more sanity check for pure CPU run
sjsprecious Aug 8, 2023
ce56a9b
apply black and pylint
jedwards4b Aug 9, 2023
eaf9ec4
update externals
jedwards4b Aug 15, 2023
21d6a1f
do not set unless assigned
jedwards4b Aug 16, 2023
8f97110
do not set unless assigned
jedwards4b Aug 16, 2023
66b5fb8
fix issue with NTASKS
jedwards4b Aug 16, 2023
780a209
one more ntasks fix
jedwards4b Aug 16, 2023
128e03e
another none value comparison fixed
jedwards4b Aug 16, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CIME/Tools/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -613,6 +613,9 @@ endif

# Remove arch flag if it exists
F90_LDFLAGS := $(filter-out -arch%,$(LDFLAGS))
ifdef GPUFLAGS
F90_LDFLAGS += $(GPUFLAGS)
endif

# Machine stuff to appear last on the link step
ifndef MLIBS
Expand Down
3 changes: 3 additions & 0 deletions CIME/XML/env_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -620,6 +620,9 @@ def _process_args(self, case, submit_arg_nodes, job, resolve=True):
if name:
if resolve and "$" in name:
rflag = self._resolve_argument(case, flag, name, job)
# This is to prevent -gpu_type=none in qsub args
if rflag.endswith("=none"):
continue
if len(rflag) > len(flag):
submitargs += " {}".format(rflag)
else:
Expand Down
33 changes: 26 additions & 7 deletions CIME/XML/env_mach_pes.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ def get_value(
resolved=True,
subgroup=None,
max_mpitasks_per_node=None,
max_cputasks_per_gpu_node=None,
ngpus_per_node=None,
): # pylint: disable=arguments-differ
# Special variable NINST_MAX is used to determine the number of
# drivers in multi-driver mode.
Expand All @@ -58,8 +60,15 @@ def get_value(
if "NTASKS" in vid or "ROOTPE" in vid:
if max_mpitasks_per_node is None:
max_mpitasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE")
if max_cputasks_per_gpu_node is None:
max_cputasks_per_gpu_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE")
if ngpus_per_node is None:
ngpus_per_node = self.get_value("NGPUS_PER_NODE")
if value is not None and value < 0:
value = -1 * value * max_mpitasks_per_node
if ngpus_per_node > 0:
value = -1 * value * max_cputasks_per_gpu_node
else:
value = -1 * value * max_mpitasks_per_node
# in the nuopc driver there is only one NINST value
# so that NINST_{comp} = NINST
if "NINST_" in vid and value is None:
Expand Down Expand Up @@ -167,13 +176,23 @@ def get_tasks_per_node(self, total_tasks, max_thread_count):
"totaltasks > 0 expected, totaltasks = {}".format(total_tasks),
)
if self._comp_interface == "nuopc" and self.get_value("ESMF_AWARE_THREADING"):
tasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE")
if self.get_value("NGPUS_PER_NODE") > 0:
tasks_per_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE")
else:
tasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE")
else:
tasks_per_node = min(
self.get_value("MAX_TASKS_PER_NODE") // max_thread_count,
self.get_value("MAX_MPITASKS_PER_NODE"),
total_tasks,
)
if self.get_value("NGPUS_PER_NODE") > 0:
tasks_per_node = min(
self.get_value("MAX_TASKS_PER_NODE") // max_thread_count,
self.get_value("MAX_CPUTASKS_PER_GPU_NODE"),
total_tasks,
)
else:
tasks_per_node = min(
self.get_value("MAX_TASKS_PER_NODE") // max_thread_count,
self.get_value("MAX_MPITASKS_PER_NODE"),
total_tasks,
)
return tasks_per_node if tasks_per_node > 0 else 1

def get_total_nodes(self, total_tasks, max_thread_count):
Expand Down
3 changes: 2 additions & 1 deletion CIME/XML/env_mach_specific.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,7 +320,8 @@ def _compute_resource_actions(self, resource_nodes, case, job=None):

def _compute_actions(self, nodes, child_tag, case, job=None):
result = [] # list of tuples ("name", "argument")
compiler, mpilib = case.get_value("COMPILER"), case.get_value("MPILIB")
compiler = case.get_value("COMPILER")
mpilib = case.get_value("MPILIB")

for node in nodes:
if self._match_attribs(self.attrib(node), case, job=job):
Expand Down
14 changes: 14 additions & 0 deletions CIME/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,20 @@ def get_standard_cmake_args(case, sharedpath):
cmake_args += " -Dcompile_threaded={} ".format(
stringify_bool(case.get_build_threaded())
)
# check settings for GPU
gpu_type = case.get_value("GPU_TYPE")
gpu_offload = case.get_value("GPU_OFFLOAD")
if gpu_type != "none":
expect(
gpu_offload != "none",
"Both GPU_TYPE and GPU_OFFLOAD must be defined if either is",
)
cmake_args += f" -DGPU_TYPE={gpu_type} -DGPU_OFFLOAD={gpu_offload}"
else:
expect(
gpu_offload == "none",
"Both GPU_TYPE and GPU_OFFLOAD must be defined if either is",
)

ocn_model = case.get_value("COMP_OCN")
atm_dycore = case.get_value("CAM_DYCORE")
Expand Down
122 changes: 83 additions & 39 deletions CIME/case/case.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ def __init__(self, case_root=None, read_only=True, record=False, non_local=False
self._env_generic_files = []
self._files = []
self._comp_interface = None
self.gpu_enabled = False
self._non_local = non_local
self.read_xml()

Expand Down Expand Up @@ -275,6 +276,9 @@ def initialize_derived_attributes(self):

if max_gpus_per_node:
self.ngpus_per_node = self.get_value("NGPUS_PER_NODE")
# update the maximum MPI tasks for a GPU node (could differ from a pure-CPU node)
if self.ngpus_per_node > 0:
max_mpitasks_per_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE")

self.tasks_per_numa = int(math.ceil(self.tasks_per_node / 2.0))
smt_factor = max(
Expand Down Expand Up @@ -451,6 +455,12 @@ def get_values(self, item, attribute=None, resolved=True, subgroup=None):
return []

def get_value(self, item, attribute=None, resolved=True, subgroup=None):
if item == "GPU_ENABLED":
if not self.gpu_enabled:
if self.get_value("GPU_TYPE") != "none":
self.gpu_enabled = True
return "true" if self.gpu_enabled else "false"

result = None
for env_file in self._files:
# Wait and resolve in self rather than in env_file
Expand Down Expand Up @@ -1262,6 +1272,8 @@ def configure(
extra_machines_dir=None,
case_group=None,
ngpus_per_node=0,
gpu_type=None,
gpu_offload=None,
):

expect(
Expand Down Expand Up @@ -1344,6 +1356,7 @@ def configure(
and "MPILIB" not in x
and "MAX_MPITASKS_PER_NODE" not in x
and "MAX_TASKS_PER_NODE" not in x
and "MAX_CPUTASKS_PER_GPU_NODE" not in x
and "MAX_GPUS_PER_NODE" not in x
]

Expand Down Expand Up @@ -1378,20 +1391,32 @@ def configure(
for name in (
"MAX_TASKS_PER_NODE",
"MAX_MPITASKS_PER_NODE",
"MAX_CPUTASKS_PER_GPU_NODE",
"MAX_GPUS_PER_NODE",
):
dmax = machobj.get_value(name, {"compiler": compiler})
if not dmax:
dmax = machobj.get_value(name)
if dmax:
print(f"here name is {name} and dmax is {dmax}")
self.set_value(name, dmax)
elif name == "MAX_CPUTASKS_PER_GPU_NODE":
logger.debug(
"Variable {} not defined for machine {} and compiler {}".format(
name, machine_name, compiler
)
)
elif name == "MAX_GPUS_PER_NODE":
logger.debug(
"Variable {} not defined for machine {}".format(name, machine_name)
"Variable {} not defined for machine {} and compiler {}".format(
name, machine_name, compiler
)
)
else:
logger.warning(
"Variable {} not defined for machine {}".format(name, machine_name)
"Variable {} not defined for machine {} and compiler {}".format(
name, machine_name, compiler
)
)

machdir = machobj.get_machines_dir()
Expand Down Expand Up @@ -1509,47 +1534,55 @@ def configure(
self.set_value("TEST", True)

# ----------------------------------------------------------------------------------------------------------
# Sanity check:
# 1. We assume that there is always a string "gpu" in the compiler name if we want to enable GPU
# 2. For compilers without the string "gpu" in the name:
# 2.1. the ngpus-per-node argument would not update the NGPUS_PER_NODE XML variable, as long as
# the MAX_GPUS_PER_NODE XML variable is not defined (i.e., this argument is not in effect).
# 2.2. if the MAX_GPUS_PER_NODE XML variable is defined, then the ngpus-per-node argument
# must be set to 0. Otherwise, an error will be triggered.
# 3. For compilers with the string "gpu" in the name:
# 3.1. if ngpus-per-node argument is smaller than 0, an error will be triggered.
# 3.2. if ngpus_per_node argument is larger than the value of MAX_GPUS_PER_NODE, the NGPUS_PER_NODE
# Sanity check for a GPU run:
# 1. GPU_TYPE and GPU_OFFLOAD must both be defined to use GPUS
# 2. if ngpus_per_node argument is larger than the value of MAX_GPUS_PER_NODE, the NGPUS_PER_NODE
# XML variable in the env_mach_pes.xml file would be set to MAX_GPUS_PER_NODE automatically.
# 3.3. if ngpus-per-node argument is equal to 0, it will be updated to 1 automatically.
# 3. if ngpus-per-node argument is equal to 0, it will be updated to 1 automatically.
# ----------------------------------------------------------------------------------------------------------
max_gpus_per_node = self.get_value("MAX_GPUS_PER_NODE")
if max_gpus_per_node:
if "gpu" in compiler:
if not ngpus_per_node:
ngpus_per_node = 1
logger.warning(
"Setting ngpus_per_node to 1 for compiler {}".format(compiler)
)
expect(
ngpus_per_node > 0,
" ngpus_per_node is expected > 0 for compiler {}; current value is {}".format(
compiler, ngpus_per_node
),
)
else:
expect(
ngpus_per_node == 0,
" ngpus_per_node is expected = 0 for compiler {}; current value is {}".format(
compiler, ngpus_per_node
),
)
if gpu_type and str(gpu_type).lower() != 'none':
expect(
max_gpus_per_node,
f"GPUS are not defined for machine={machine_name} and compiler={compiler}",
)
expect(
gpu_offload,
"Both gpu-type and gpu-offload must be defined if either is defined",
)
expect(
compiler in ["nvhpc", "cray"],
f"Only nvhpc and cray compilers are expected for a GPU run; the user given compiler is {compiler}, ",
)
valid_gpu_type = self.get_value("GPU_TYPE").split(",")
valid_gpu_type.remove("none")
expect(
gpu_type in valid_gpu_type,
f"Unsupported GPU type is given: {gpu_type} ; valid values are {valid_gpu_type}",
)
valid_gpu_offload = self.get_value("GPU_OFFLOAD").split(",")
valid_gpu_offload.remove("none")
expect(
gpu_offload in valid_gpu_offload,
f"Unsupported GPU programming model is given: {gpu_offload} ; valid values are {valid_gpu_offload}",
)
self.gpu_enabled = True
if ngpus_per_node >= 0:
self.set_value(
"NGPUS_PER_NODE",
ngpus_per_node
max(1, ngpus_per_node)
if ngpus_per_node <= max_gpus_per_node
else max_gpus_per_node,
)
elif gpu_offload and str(gpu_offload).lower() != 'none':
expect(
False,
"Both gpu-type and gpu-offload must be defined if either is defined",
)

# Set these two GPU XML variables here to overwrite the default values
self.set_value("GPU_TYPE", str(gpu_type).lower())
self.set_value("GPU_OFFLOAD", str(gpu_offload).lower())

self.initialize_derived_attributes()

Expand Down Expand Up @@ -2074,11 +2107,18 @@ def get_mpirun_cmd(self, job=None, allow_unresolved_envvars=True, overrides=None

ngpus_per_node = self.get_value("NGPUS_PER_NODE")
if ngpus_per_node and ngpus_per_node > 0 and config.gpus_use_set_device_rank:
# 1. this setting is tested on Casper only and may not work on other machines
# 2. need to be revisited in the future for a more adaptable implementation
rundir = self.get_value("RUNDIR")
output_name = rundir + "/set_device_rank.sh"
mpi_arg_string = mpi_arg_string + " " + output_name + " "
if self.get_value("MACH") == "gust" or self.get_value("MACH") == "derecho":
mpi_arg_string = mpi_arg_string + " get_local_rank "
else:
# this wrapper script only works with OpenMPI library
# has been tested on Casper
expect(
self.get_value("MPILIB") == "openmpi",
"The wrapper script only works with OpenMPI library; {} is currently used".format(self.get_value("MPILIB")),
)
rundir = self.get_value("RUNDIR")
output_name = rundir + "/set_device_rank.sh"
mpi_arg_string = mpi_arg_string + " " + output_name + " "

return self.get_resolved_value(
"{} {} {} {}".format(
Expand Down Expand Up @@ -2375,6 +2415,8 @@ def create(
extra_machines_dir=None,
case_group=None,
ngpus_per_node=0,
gpu_type=None,
gpu_offload=None,
):
try:
# Set values for env_case.xml
Expand Down Expand Up @@ -2448,6 +2490,8 @@ def create(
extra_machines_dir=extra_machines_dir,
case_group=case_group,
ngpus_per_node=ngpus_per_node,
gpu_type=gpu_type,
gpu_offload=gpu_offload,
)

self.create_caseroot()
Expand Down
2 changes: 1 addition & 1 deletion CIME/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ def __init__(self):
self._set_attribute(
"gpus_use_set_device_rank",
True,
desc="If set to `True` and NGPUS_PER_NODE > 0 then `$RUNDIR/set_device_rank.sh` is appended when the MPI run command is generated.",
desc="If set to `True` and NGPUS_PER_NODE > 0 then `$RUNDIR/set_device_rank.sh` or `get_local_rank` (a global script on Derecho/Gust) is appended when the MPI run command is generated.",
jedwards4b marked this conversation as resolved.
Show resolved Hide resolved
)
self._set_attribute(
"test_custom_project_machine",
Expand Down
14 changes: 14 additions & 0 deletions CIME/data/config/xml_schemas/config_machines.xsd
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
<xs:attribute name="compiler" type="xs:string"/>
<xs:attribute name="mpilib" type="xs:string"/>
<xs:attribute name="comp_interface" type="xs:string"/>
<xs:attribute name="gpu_type" type="xs:string"/>
<xs:attribute name="gpu_offload" type="xs:string"/>
<xs:attribute name="queue" type="xs:string"/>
<xs:attribute name="DEBUG" type="upperBoolean"/>
<xs:attribute name="PIO_VERSION" type="xs:integer"/>
Expand Down Expand Up @@ -56,6 +58,9 @@
<xs:element name="MAX_TASKS_PER_NODE" type="AttrElement"/>
<xs:element name="MAX_GPUS_PER_NODE" type="AttrElement"/>
<xs:element name="MAX_MPITASKS_PER_NODE" type="AttrElement"/>
<xs:element name="MAX_CPUTASKS_PER_GPU_NODE" type="AttrElement"/>
<xs:element name="GPU_TYPE" type="AttrElement"/>
<xs:element name="GPU_OFFLOAD" type="AttrElement"/>
<xs:element name="COSTPES_PER_NODE" type="xs:integer"/>
<xs:element name="PROJECT_REQUIRED" type="xs:NCName"/>
<xs:element name="executable" type="xs:string"/>
Expand Down Expand Up @@ -166,6 +171,13 @@
<!-- MAX_MPITASKS_PER_NODE: number of physical PES per shared node on
this machine, in practice the MPI tasks per node will not exceed this value -->
<xs:element ref="MAX_MPITASKS_PER_NODE" minOccurs="1" maxOccurs="unbounded"/>
<!-- MAX_CPUTASKS_PER_GPU_NODE: number of physical PES per GPU node on
this machine, in practice the MPI tasks per node will not exceed this value -->
<xs:element ref="MAX_CPUTASKS_PER_GPU_NODE" minOccurs="0" maxOccurs="unbounded"/>
<!-- GPU_TYPE: the type of GPU hardware available on this machine -->
<xs:element ref="GPU_TYPE" minOccurs="0" maxOccurs="unbounded"/>
<!-- GPU_OFFLOAD: the GPU programming model used for GPU porting -->
<xs:element ref="GPU_OFFLOAD" minOccurs="0" maxOccurs="unbounded"/>
<!-- Optional cost factor per node unit -->
<xs:element ref="COSTPES_PER_NODE" minOccurs="0" maxOccurs="1"/>
<!-- PROJECT_REQUIRED: Does this machine require a project to be specified to
Expand Down Expand Up @@ -249,6 +261,8 @@
<xs:attribute ref="PIO_VERSION"/>
<xs:attribute ref="mpilib"/>
<xs:attribute ref="comp_interface"/>
<xs:attribute ref="gpu_offload"/>
<xs:attribute ref="gpu_type"/>
</xs:complexType>
</xs:element>
<xs:element name="command">
Expand Down
4 changes: 4 additions & 0 deletions CIME/data/config/xml_schemas/env_mach_specific.xsd
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
<xs:attribute name="PIO_VERSION" type="xs:integer"/>
<xs:attribute name="mpilib" type="xs:string"/>
<xs:attribute name="comp_interface" type="xs:string"/>
<xs:attribute name="gpu_type" type="xs:string"/>
<xs:attribute name="gpu_offload" type="xs:string"/>
<xs:attribute name="SMP_PRESENT" type="xs:string"/>
<xs:attribute name="value" type="xs:string"/>
<xs:attribute name="unit_testing" type="xs:boolean"/>
Expand Down Expand Up @@ -102,6 +104,8 @@
<xs:attribute ref="PIO_VERSION" />
<xs:attribute ref="mpilib"/>
<xs:attribute ref="comp_interface"/>
<xs:attribute ref="gpu_type"/>
<xs:attribute ref="gpu_offload"/>
</xs:complexType>
</xs:element>

Expand Down
Loading
Loading