Skip to content

Commit

Permalink
Merge pull request #4396 from sjsprecious/add_gpu_gust
Browse files Browse the repository at this point in the history
Add flexible controls of GPU configuration (docs test expected to fail and will be updated on merge.)
  • Loading branch information
jedwards4b authored Aug 16, 2023
2 parents 02fad90 + 128e03e commit 41b67a8
Show file tree
Hide file tree
Showing 15 changed files with 192 additions and 92 deletions.
3 changes: 3 additions & 0 deletions CIME/Tools/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -613,6 +613,9 @@ endif

# Remove arch flag if it exists
F90_LDFLAGS := $(filter-out -arch%,$(LDFLAGS))
ifdef GPUFLAGS
F90_LDFLAGS += $(GPUFLAGS)
endif

# Machine stuff to appear last on the link step
ifndef MLIBS
Expand Down
3 changes: 3 additions & 0 deletions CIME/XML/env_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -620,6 +620,9 @@ def _process_args(self, case, submit_arg_nodes, job, resolve=True):
if name:
if resolve and "$" in name:
rflag = self._resolve_argument(case, flag, name, job)
# This is to prevent -gpu_type=none in qsub args
if rflag.endswith("=none"):
continue
if len(rflag) > len(flag):
submitargs += " {}".format(rflag)
else:
Expand Down
34 changes: 27 additions & 7 deletions CIME/XML/env_mach_pes.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ def get_value(
resolved=True,
subgroup=None,
max_mpitasks_per_node=None,
max_cputasks_per_gpu_node=None,
ngpus_per_node=None,
): # pylint: disable=arguments-differ
# Special variable NINST_MAX is used to determine the number of
# drivers in multi-driver mode.
Expand All @@ -58,7 +60,13 @@ def get_value(
if "NTASKS" in vid or "ROOTPE" in vid:
if max_mpitasks_per_node is None:
max_mpitasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE")
if value is not None and value < 0:
if max_cputasks_per_gpu_node is None:
max_cputasks_per_gpu_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE")
if ngpus_per_node is None:
ngpus_per_node = self.get_value("NGPUS_PER_NODE")
if (ngpus_per_node and value) and value < 0:
value = -1 * value * max_cputasks_per_gpu_node
elif value and value < 0:
value = -1 * value * max_mpitasks_per_node
# in the nuopc driver there is only one NINST value
# so that NINST_{comp} = NINST
Expand Down Expand Up @@ -154,6 +162,7 @@ def get_total_tasks(self, comp_classes, async_interface=False):
tt = rootpe + nthrds * ((ntasks - 1) * pstrid + 1)
maxrootpe = max(maxrootpe, rootpe)
total_tasks = max(tt, total_tasks)

if asyncio_tasks:
total_tasks = total_tasks + len(asyncio_tasks)
if self.get_value("MULTI_DRIVER"):
Expand All @@ -167,13 +176,24 @@ def get_tasks_per_node(self, total_tasks, max_thread_count):
"totaltasks > 0 expected, totaltasks = {}".format(total_tasks),
)
if self._comp_interface == "nuopc" and self.get_value("ESMF_AWARE_THREADING"):
tasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE")
if self.get_value("NGPUS_PER_NODE") > 0:
tasks_per_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE")
else:
tasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE")
else:
tasks_per_node = min(
self.get_value("MAX_TASKS_PER_NODE") // max_thread_count,
self.get_value("MAX_MPITASKS_PER_NODE"),
total_tasks,
)
ngpus_per_node = self.get_value("NGPUS_PER_NODE")
if ngpus_per_node and ngpus_per_node > 0:
tasks_per_node = min(
self.get_value("MAX_TASKS_PER_NODE") // max_thread_count,
self.get_value("MAX_CPUTASKS_PER_GPU_NODE"),
total_tasks,
)
else:
tasks_per_node = min(
self.get_value("MAX_TASKS_PER_NODE") // max_thread_count,
self.get_value("MAX_MPITASKS_PER_NODE"),
total_tasks,
)
return tasks_per_node if tasks_per_node > 0 else 1

def get_total_nodes(self, total_tasks, max_thread_count):
Expand Down
3 changes: 2 additions & 1 deletion CIME/XML/env_mach_specific.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,7 +320,8 @@ def _compute_resource_actions(self, resource_nodes, case, job=None):

def _compute_actions(self, nodes, child_tag, case, job=None):
result = [] # list of tuples ("name", "argument")
compiler, mpilib = case.get_value("COMPILER"), case.get_value("MPILIB")
compiler = case.get_value("COMPILER")
mpilib = case.get_value("MPILIB")

for node in nodes:
if self._match_attribs(self.attrib(node), case, job=job):
Expand Down
14 changes: 14 additions & 0 deletions CIME/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,20 @@ def get_standard_cmake_args(case, sharedpath):
cmake_args += " -Dcompile_threaded={} ".format(
stringify_bool(case.get_build_threaded())
)
# check settings for GPU
gpu_type = case.get_value("GPU_TYPE")
gpu_offload = case.get_value("GPU_OFFLOAD")
if gpu_type != "none":
expect(
gpu_offload != "none",
"Both GPU_TYPE and GPU_OFFLOAD must be defined if either is",
)
cmake_args += f" -DGPU_TYPE={gpu_type} -DGPU_OFFLOAD={gpu_offload}"
else:
expect(
gpu_offload == "none",
"Both GPU_TYPE and GPU_OFFLOAD must be defined if either is",
)

ocn_model = case.get_value("COMP_OCN")
atm_dycore = case.get_value("CAM_DYCORE")
Expand Down
124 changes: 82 additions & 42 deletions CIME/case/case.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ class Case(object):
This class extends across multiple files, class members external to this file
are listed in the following imports
"""

from CIME.case.case_setup import case_setup
Expand Down Expand Up @@ -123,6 +124,7 @@ def __init__(self, case_root=None, read_only=True, record=False, non_local=False
self._env_generic_files = []
self._files = []
self._comp_interface = None
self.gpu_enabled = False
self._non_local = non_local
self.read_xml()

Expand Down Expand Up @@ -275,6 +277,9 @@ def initialize_derived_attributes(self):

if max_gpus_per_node:
self.ngpus_per_node = self.get_value("NGPUS_PER_NODE")
# update the maximum MPI tasks for a GPU node (could differ from a pure-CPU node)
if self.ngpus_per_node > 0:
max_mpitasks_per_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE")

self.tasks_per_numa = int(math.ceil(self.tasks_per_node / 2.0))
smt_factor = max(
Expand Down Expand Up @@ -451,6 +456,12 @@ def get_values(self, item, attribute=None, resolved=True, subgroup=None):
return []

def get_value(self, item, attribute=None, resolved=True, subgroup=None):
if item == "GPU_ENABLED":
if not self.gpu_enabled:
if self.get_value("GPU_TYPE") != "none":
self.gpu_enabled = True
return "true" if self.gpu_enabled else "false"

result = None
for env_file in self._files:
# Wait and resolve in self rather than in env_file
Expand Down Expand Up @@ -1141,7 +1152,6 @@ def _setup_mach_pes(self, pecount, multi_driver, ninst, machine_name, mpilib):
comment = None
force_tasks = None
force_thrds = None

if match1:
opti_tasks = match1.group(1)
if opti_tasks.isdigit():
Expand Down Expand Up @@ -1211,7 +1221,6 @@ def _setup_mach_pes(self, pecount, multi_driver, ninst, machine_name, mpilib):
pstrid = pes_pstrid[pstrid_str] if pstrid_str in pes_pstrid else 1

totaltasks.append((ntasks + rootpe) * nthrds)

mach_pes_obj.set_value(ntasks_str, ntasks)
mach_pes_obj.set_value(nthrds_str, nthrds)
mach_pes_obj.set_value(rootpe_str, rootpe)
Expand Down Expand Up @@ -1262,6 +1271,8 @@ def configure(
extra_machines_dir=None,
case_group=None,
ngpus_per_node=0,
gpu_type=None,
gpu_offload=None,
):

expect(
Expand Down Expand Up @@ -1344,6 +1355,7 @@ def configure(
and "MPILIB" not in x
and "MAX_MPITASKS_PER_NODE" not in x
and "MAX_TASKS_PER_NODE" not in x
and "MAX_CPUTASKS_PER_GPU_NODE" not in x
and "MAX_GPUS_PER_NODE" not in x
]

Expand Down Expand Up @@ -1378,20 +1390,31 @@ def configure(
for name in (
"MAX_TASKS_PER_NODE",
"MAX_MPITASKS_PER_NODE",
"MAX_CPUTASKS_PER_GPU_NODE",
"MAX_GPUS_PER_NODE",
):
dmax = machobj.get_value(name, {"compiler": compiler})
if not dmax:
dmax = machobj.get_value(name)
if dmax:
self.set_value(name, dmax)
elif name == "MAX_CPUTASKS_PER_GPU_NODE":
logger.debug(
"Variable {} not defined for machine {} and compiler {}".format(
name, machine_name, compiler
)
)
elif name == "MAX_GPUS_PER_NODE":
logger.debug(
"Variable {} not defined for machine {}".format(name, machine_name)
"Variable {} not defined for machine {} and compiler {}".format(
name, machine_name, compiler
)
)
else:
logger.warning(
"Variable {} not defined for machine {}".format(name, machine_name)
"Variable {} not defined for machine {} and compiler {}".format(
name, machine_name, compiler
)
)

machdir = machobj.get_machines_dir()
Expand Down Expand Up @@ -1509,47 +1532,62 @@ def configure(
self.set_value("TEST", True)

# ----------------------------------------------------------------------------------------------------------
# Sanity check:
# 1. We assume that there is always a string "gpu" in the compiler name if we want to enable GPU
# 2. For compilers without the string "gpu" in the name:
# 2.1. the ngpus-per-node argument would not update the NGPUS_PER_NODE XML variable, as long as
# the MAX_GPUS_PER_NODE XML variable is not defined (i.e., this argument is not in effect).
# 2.2. if the MAX_GPUS_PER_NODE XML variable is defined, then the ngpus-per-node argument
# must be set to 0. Otherwise, an error will be triggered.
# 3. For compilers with the string "gpu" in the name:
# 3.1. if ngpus-per-node argument is smaller than 0, an error will be triggered.
# 3.2. if ngpus_per_node argument is larger than the value of MAX_GPUS_PER_NODE, the NGPUS_PER_NODE
# Sanity check for a GPU run:
# 1. GPU_TYPE and GPU_OFFLOAD must both be defined to use GPUS
# 2. if ngpus_per_node argument is larger than the value of MAX_GPUS_PER_NODE, the NGPUS_PER_NODE
# XML variable in the env_mach_pes.xml file would be set to MAX_GPUS_PER_NODE automatically.
# 3.3. if ngpus-per-node argument is equal to 0, it will be updated to 1 automatically.
# 3. if ngpus-per-node argument is equal to 0, it will be updated to 1 automatically.
# ----------------------------------------------------------------------------------------------------------
max_gpus_per_node = self.get_value("MAX_GPUS_PER_NODE")
if max_gpus_per_node:
if "gpu" in compiler:
if not ngpus_per_node:
ngpus_per_node = 1
logger.warning(
"Setting ngpus_per_node to 1 for compiler {}".format(compiler)
)
expect(
ngpus_per_node > 0,
" ngpus_per_node is expected > 0 for compiler {}; current value is {}".format(
compiler, ngpus_per_node
),
)
else:
expect(
ngpus_per_node == 0,
" ngpus_per_node is expected = 0 for compiler {}; current value is {}".format(
compiler, ngpus_per_node
),
)
if gpu_type and str(gpu_type).lower() != "none":
expect(
max_gpus_per_node,
f"GPUS are not defined for machine={machine_name} and compiler={compiler}",
)
expect(
gpu_offload,
"Both gpu-type and gpu-offload must be defined if either is defined",
)
expect(
compiler in ["nvhpc", "cray"],
f"Only nvhpc and cray compilers are expected for a GPU run; the user given compiler is {compiler}, ",
)
valid_gpu_type = self.get_value("GPU_TYPE").split(",")
valid_gpu_type.remove("none")
expect(
gpu_type in valid_gpu_type,
f"Unsupported GPU type is given: {gpu_type} ; valid values are {valid_gpu_type}",
)
valid_gpu_offload = self.get_value("GPU_OFFLOAD").split(",")
valid_gpu_offload.remove("none")
expect(
gpu_offload in valid_gpu_offload,
f"Unsupported GPU programming model is given: {gpu_offload} ; valid values are {valid_gpu_offload}",
)
self.gpu_enabled = True
if ngpus_per_node >= 0:
self.set_value(
"NGPUS_PER_NODE",
ngpus_per_node
max(1, ngpus_per_node)
if ngpus_per_node <= max_gpus_per_node
else max_gpus_per_node,
)
elif gpu_offload and str(gpu_offload).lower() != "none":
expect(
False,
"Both gpu-type and gpu-offload must be defined if either is defined",
)
elif ngpus_per_node != 0:
expect(
False,
f"ngpus_per_node is expected to be 0 for a pure CPU run ; {ngpus_per_node} is provided instead ;",
)

# Set these two GPU XML variables here to overwrite the default values
if gpu_type:
self.set_value("GPU_TYPE", str(gpu_type).lower())
if gpu_offload:
self.set_value("GPU_OFFLOAD", str(gpu_offload).lower())

self.initialize_derived_attributes()

Expand Down Expand Up @@ -2073,12 +2111,10 @@ def get_mpirun_cmd(self, job=None, allow_unresolved_envvars=True, overrides=None
mpi_arg_string += " : "

ngpus_per_node = self.get_value("NGPUS_PER_NODE")
if ngpus_per_node and ngpus_per_node > 0 and config.gpus_use_set_device_rank:
# 1. this setting is tested on Casper only and may not work on other machines
# 2. need to be revisited in the future for a more adaptable implementation
rundir = self.get_value("RUNDIR")
output_name = rundir + "/set_device_rank.sh"
mpi_arg_string = mpi_arg_string + " " + output_name + " "
if ngpus_per_node and ngpus_per_node > 0:
mpi_gpu_run_script = self.get_value("MPI_GPU_WRAPPER_SCRIPT")
if mpi_gpu_run_script:
mpi_arg_string = mpi_arg_string + " " + mpi_gpu_run_script

return self.get_resolved_value(
"{} {} {} {}".format(
Expand Down Expand Up @@ -2375,6 +2411,8 @@ def create(
extra_machines_dir=None,
case_group=None,
ngpus_per_node=0,
gpu_type=None,
gpu_offload=None,
):
try:
# Set values for env_case.xml
Expand Down Expand Up @@ -2448,6 +2486,8 @@ def create(
extra_machines_dir=extra_machines_dir,
case_group=case_group,
ngpus_per_node=ngpus_per_node,
gpu_type=gpu_type,
gpu_offload=gpu_offload,
)

self.create_caseroot()
Expand Down
29 changes: 0 additions & 29 deletions CIME/case/case_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
copy_local_macros_to_dir,
)
from CIME.utils import batch_jobid
from CIME.utils import transform_vars
from CIME.test_status import *
from CIME.locked_files import unlock_file, lock_file

Expand Down Expand Up @@ -482,31 +481,3 @@ def case_setup(self, clean=False, test_mode=False, reset=False, keep=None):
caseroot=caseroot,
is_batch=is_batch,
)

# put the following section here to make sure the rundir is generated first
machdir = self.get_value("MACHDIR")
mach = self.get_value("MACH")
ngpus_per_node = self.get_value("NGPUS_PER_NODE")
overrides = {}
overrides["ngpus_per_node"] = ngpus_per_node
input_template = os.path.join(machdir, "mpi_run_gpu.{}".format(mach))
if os.path.isfile(input_template):
# update the wrapper script that sets the device id for each MPI rank
output_text = transform_vars(
open(input_template, "r").read(), case=self, overrides=overrides
)

# write it out to the run dir
rundir = self.get_value("RUNDIR")
output_name = os.path.join(rundir, "set_device_rank.sh")
logger.info("Creating file {}".format(output_name))
with open(output_name, "w") as f:
f.write(output_text)

# make the wrapper script executable
if os.path.isfile(output_name):
os.system("chmod +x " + output_name)
else:
expect(
False, "The file {} is not written out correctly.".format(output_name)
)
Loading

0 comments on commit 41b67a8

Please sign in to comment.