Add flexible controls of GPU configuration #4396

Merged (26 commits, Aug 16, 2023)
Commits (26)
bb7175c  Add Jim's changes for Gust's GPU options, based on https://github.com…  (sjsprecious, Mar 23, 2023)
6667b40  Change the settings for Gust GPU nodes  (sjsprecious, Mar 23, 2023)
e0d625e  Add missing definition of max_cputasks_per_gpu_node  (sjsprecious, Mar 24, 2023)
fd2ae21  Add more missing attrs  (sjsprecious, Mar 24, 2023)
7e372b5  Add hard-code check for valid gpu_type and gpu_offload  (sjsprecious, Mar 25, 2023)
403d4f3  Use a more general way for valid value check  (sjsprecious, Mar 25, 2023)
8d7e923  update error message for gpu_type and gpu_offload  (sjsprecious, Mar 25, 2023)
080972b  fix a typo  (sjsprecious, Mar 25, 2023)
7cdc7be  bug fix for PE layout and batch script template on a GPU node  (sjsprecious, Mar 25, 2023)
18aa22e  Add a sanity check for compiler on GPU nodes  (sjsprecious, Mar 27, 2023)
802fb04  Fix a bug for setting two GPU XML variables  (sjsprecious, Apr 11, 2023)
82c617f  Bug fix of an invalid input for GPU_TYPE and GPU_OFFLOAD  (sjsprecious, Apr 12, 2023)
d7fb13b  Merge branch 'master' into add_gpu_gust  (sjsprecious, Apr 17, 2023)
e3607d1  update handling of "none" input for GPU_TYPE and GPU_OFFLOAD  (sjsprecious, Apr 18, 2023)
011e807  Merge branch 'master' into add_gpu_gust  (sjsprecious, Aug 7, 2023)
2341850  fix the usage of wrapper script for Derecho  (sjsprecious, Aug 7, 2023)
de5476a  update description for Derecho  (sjsprecious, Aug 7, 2023)
f6c42fd  use a generic way to apply the MPI wrapper script  (sjsprecious, Aug 8, 2023)
5b3d157  one more sanity check for pure CPU run  (sjsprecious, Aug 8, 2023)
ce56a9b  apply black and pylint  (jedwards4b, Aug 9, 2023)
eaf9ec4  update externals  (jedwards4b, Aug 15, 2023)
21d6a1f  do not set unless assigned  (jedwards4b, Aug 16, 2023)
8f97110  do not set unless assigned  (jedwards4b, Aug 16, 2023)
66b5fb8  fix issue with NTASKS  (jedwards4b, Aug 16, 2023)
780a209  one more ntasks fix  (jedwards4b, Aug 16, 2023)
128e03e  another none value comparison fixed  (jedwards4b, Aug 16, 2023)
3 changes: 3 additions & 0 deletions CIME/Tools/Makefile
@@ -613,6 +613,9 @@ endif

# Remove arch flag if it exists
F90_LDFLAGS := $(filter-out -arch%,$(LDFLAGS))
ifdef GPUFLAGS
F90_LDFLAGS += $(GPUFLAGS)
endif

# Machine stuff to appear last on the link step
ifndef MLIBS
3 changes: 3 additions & 0 deletions CIME/XML/env_batch.py
@@ -620,6 +620,9 @@ def _process_args(self, case, submit_arg_nodes, job, resolve=True):
if name:
if resolve and "$" in name:
rflag = self._resolve_argument(case, flag, name, job)
# This is to prevent -gpu_type=none in qsub args
if rflag.endswith("=none"):
continue
if len(rflag) > len(flag):
submitargs += " {}".format(rflag)
else:
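
The new check is easiest to see in isolation. The sketch below is a hypothetical stand-in for the loop in EnvBatch._process_args (flag names and values are illustrative, not the PR's code): a resolved batch flag whose value ends in "=none", such as -gpu_type=none, is dropped before it reaches the qsub argument string.

    def build_submit_args(resolved_flags):
        # Hypothetical stand-in for the EnvBatch._process_args loop: any flag
        # whose resolved value is "none" is skipped, so arguments such as
        # "-gpu_type=none" never reach the qsub command line.
        submitargs = ""
        for flag in resolved_flags:
            if flag.endswith("=none"):
                continue
            submitargs += " {}".format(flag)
        return submitargs

    # Example: the GPU flag disappears for a pure-CPU case.
    print(build_submit_args(["-q main", "-l select=2", "-gpu_type=none"]))
    # prints " -q main -l select=2"
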
34 changes: 27 additions & 7 deletions CIME/XML/env_mach_pes.py
@@ -42,6 +42,8 @@ def get_value(
resolved=True,
subgroup=None,
max_mpitasks_per_node=None,
max_cputasks_per_gpu_node=None,
ngpus_per_node=None,
): # pylint: disable=arguments-differ
# Special variable NINST_MAX is used to determine the number of
# drivers in multi-driver mode.
@@ -58,7 +60,13 @@ def get_value(
if "NTASKS" in vid or "ROOTPE" in vid:
if max_mpitasks_per_node is None:
max_mpitasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE")
if value is not None and value < 0:
if max_cputasks_per_gpu_node is None:
max_cputasks_per_gpu_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE")
if ngpus_per_node is None:
ngpus_per_node = self.get_value("NGPUS_PER_NODE")
if (ngpus_per_node and value) and value < 0:
value = -1 * value * max_cputasks_per_gpu_node
elif value and value < 0:
value = -1 * value * max_mpitasks_per_node
# in the nuopc driver there is only one NINST value
# so that NINST_{comp} = NINST
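
In CIME a negative NTASKS or ROOTPE value is shorthand for "this many whole nodes", so the change above makes the node-to-task multiplier depend on whether the case runs on GPU nodes. A minimal sketch of that conversion with illustrative per-node limits (the real values come from the machine's MAX_MPITASKS_PER_NODE and MAX_CPUTASKS_PER_GPU_NODE entries):

    def resolve_ntasks(value, ngpus_per_node, max_mpitasks_per_node, max_cputasks_per_gpu_node):
        # Mirrors the sign convention above: a negative NTASKS means "whole nodes",
        # and the per-node multiplier differs between CPU-only and GPU nodes.
        if ngpus_per_node and value and value < 0:
            return -1 * value * max_cputasks_per_gpu_node
        if value and value < 0:
            return -1 * value * max_mpitasks_per_node
        return value

    # NTASKS = -2 (two nodes), with illustrative per-node limits of 128 and 64:
    print(resolve_ntasks(-2, 0, 128, 64))  # 256 MPI tasks on CPU nodes
    print(resolve_ntasks(-2, 4, 128, 64))  # 128 MPI tasks on GPU nodes
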
@@ -154,6 +162,7 @@ def get_total_tasks(self, comp_classes, async_interface=False):
tt = rootpe + nthrds * ((ntasks - 1) * pstrid + 1)
maxrootpe = max(maxrootpe, rootpe)
total_tasks = max(tt, total_tasks)

if asyncio_tasks:
total_tasks = total_tasks + len(asyncio_tasks)
if self.get_value("MULTI_DRIVER"):
@@ -167,13 +176,24 @@ def get_tasks_per_node(self, total_tasks, max_thread_count):
"totaltasks > 0 expected, totaltasks = {}".format(total_tasks),
)
if self._comp_interface == "nuopc" and self.get_value("ESMF_AWARE_THREADING"):
tasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE")
if self.get_value("NGPUS_PER_NODE") > 0:
tasks_per_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE")
else:
tasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE")
else:
tasks_per_node = min(
self.get_value("MAX_TASKS_PER_NODE") // max_thread_count,
self.get_value("MAX_MPITASKS_PER_NODE"),
total_tasks,
)
ngpus_per_node = self.get_value("NGPUS_PER_NODE")
if ngpus_per_node and ngpus_per_node > 0:
tasks_per_node = min(
self.get_value("MAX_TASKS_PER_NODE") // max_thread_count,
self.get_value("MAX_CPUTASKS_PER_GPU_NODE"),
total_tasks,
)
else:
tasks_per_node = min(
self.get_value("MAX_TASKS_PER_NODE") // max_thread_count,
self.get_value("MAX_MPITASKS_PER_NODE"),
total_tasks,
)
return tasks_per_node if tasks_per_node > 0 else 1

def get_total_nodes(self, total_tasks, max_thread_count):
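
Similarly, get_tasks_per_node now caps the per-node task count with MAX_CPUTASKS_PER_GPU_NODE when GPUs are requested. A hedged condensation of the non-ESMF-aware branch, with illustrative limits:

    def tasks_per_node(total_tasks, max_thread_count, max_tasks_per_node,
                       max_mpitasks_per_node, max_cputasks_per_gpu_node, ngpus_per_node):
        # Hypothetical condensation of the non-ESMF branch of get_tasks_per_node:
        # the per-node cap switches to the GPU-node limit when GPUs are requested.
        cap = max_cputasks_per_gpu_node if (ngpus_per_node and ngpus_per_node > 0) else max_mpitasks_per_node
        tpn = min(max_tasks_per_node // max_thread_count, cap, total_tasks)
        return tpn if tpn > 0 else 1

    # 512 total tasks, one thread, illustrative limits of 256/128/64 tasks per node:
    print(tasks_per_node(512, 1, 256, 128, 64, ngpus_per_node=0))  # 128 on CPU nodes
    print(tasks_per_node(512, 1, 256, 128, 64, ngpus_per_node=4))  # 64 on GPU nodes
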
3 changes: 2 additions & 1 deletion CIME/XML/env_mach_specific.py
@@ -320,7 +320,8 @@ def _compute_resource_actions(self, resource_nodes, case, job=None):

def _compute_actions(self, nodes, child_tag, case, job=None):
result = [] # list of tuples ("name", "argument")
compiler, mpilib = case.get_value("COMPILER"), case.get_value("MPILIB")
compiler = case.get_value("COMPILER")
mpilib = case.get_value("MPILIB")

for node in nodes:
if self._match_attribs(self.attrib(node), case, job=job):
14 changes: 14 additions & 0 deletions CIME/build.py
@@ -246,6 +246,20 @@ def get_standard_cmake_args(case, sharedpath):
cmake_args += " -Dcompile_threaded={} ".format(
stringify_bool(case.get_build_threaded())
)
# check settings for GPU
gpu_type = case.get_value("GPU_TYPE")
gpu_offload = case.get_value("GPU_OFFLOAD")
if gpu_type != "none":
expect(
gpu_offload != "none",
"Both GPU_TYPE and GPU_OFFLOAD must be defined if either is",
)
cmake_args += f" -DGPU_TYPE={gpu_type} -DGPU_OFFLOAD={gpu_offload}"
else:
expect(
gpu_offload == "none",
"Both GPU_TYPE and GPU_OFFLOAD must be defined if either is",
)

ocn_model = case.get_value("COMP_OCN")
atm_dycore = case.get_value("CAM_DYCORE")
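
A short, hedged restatement of the new build-time check: GPU_TYPE and GPU_OFFLOAD must either both be "none" or both be set, and only in the latter case do the extra -D definitions reach CMake. The gpu_type/gpu_offload values in the example are illustrative; valid names come from the machine configuration.

    def gpu_cmake_args(gpu_type, gpu_offload):
        # Restates the consistency check above: GPU_TYPE and GPU_OFFLOAD must either
        # both be "none" or both be set; only in the latter case are the extra
        # -DGPU_TYPE/-DGPU_OFFLOAD definitions appended to the CMake arguments.
        if gpu_type != "none":
            assert gpu_offload != "none", "Both GPU_TYPE and GPU_OFFLOAD must be defined if either is"
            return f" -DGPU_TYPE={gpu_type} -DGPU_OFFLOAD={gpu_offload}"
        assert gpu_offload == "none", "Both GPU_TYPE and GPU_OFFLOAD must be defined if either is"
        return ""

    # Illustrative values only; valid names come from the machine's XML entries.
    print(gpu_cmake_args("a100", "openacc"))  # " -DGPU_TYPE=a100 -DGPU_OFFLOAD=openacc"
    print(gpu_cmake_args("none", "none"))     # ""
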
124 changes: 82 additions & 42 deletions CIME/case/case.py
@@ -74,6 +74,7 @@ class Case(object):

This class extends across multiple files, class members external to this file
are listed in the following imports

"""

from CIME.case.case_setup import case_setup
@@ -123,6 +124,7 @@ def __init__(self, case_root=None, read_only=True, record=False, non_local=False
self._env_generic_files = []
self._files = []
self._comp_interface = None
self.gpu_enabled = False
self._non_local = non_local
self.read_xml()

@@ -275,6 +277,9 @@ def initialize_derived_attributes(self):

if max_gpus_per_node:
self.ngpus_per_node = self.get_value("NGPUS_PER_NODE")
# update the maximum MPI tasks for a GPU node (could differ from a pure-CPU node)
if self.ngpus_per_node > 0:
max_mpitasks_per_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE")

self.tasks_per_numa = int(math.ceil(self.tasks_per_node / 2.0))
smt_factor = max(
@@ -451,6 +456,12 @@ def get_values(self, item, attribute=None, resolved=True, subgroup=None):
return []

def get_value(self, item, attribute=None, resolved=True, subgroup=None):
if item == "GPU_ENABLED":
if not self.gpu_enabled:
if self.get_value("GPU_TYPE") != "none":
self.gpu_enabled = True
return "true" if self.gpu_enabled else "false"

result = None
for env_file in self._files:
# Wait and resolve in self rather than in env_file
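
GPU_ENABLED is now synthesized on the fly from GPU_TYPE rather than stored in an XML file. A small usage sketch (hypothetical caller; case is assumed to be an open Case instance):

    def report_gpu_status(case):
        # get_value returns the strings "true"/"false" for GPU_ENABLED, derived
        # from whether GPU_TYPE has been set to something other than "none".
        if case.get_value("GPU_ENABLED") == "true":
            print("GPU run: type={}, offload={}".format(
                case.get_value("GPU_TYPE"), case.get_value("GPU_OFFLOAD")))
        else:
            print("pure CPU run")
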
@@ -1141,7 +1152,6 @@ def _setup_mach_pes(self, pecount, multi_driver, ninst, machine_name, mpilib):
comment = None
force_tasks = None
force_thrds = None

if match1:
opti_tasks = match1.group(1)
if opti_tasks.isdigit():
@@ -1211,7 +1221,6 @@ def _setup_mach_pes(self, pecount, multi_driver, ninst, machine_name, mpilib):
pstrid = pes_pstrid[pstrid_str] if pstrid_str in pes_pstrid else 1

totaltasks.append((ntasks + rootpe) * nthrds)

mach_pes_obj.set_value(ntasks_str, ntasks)
mach_pes_obj.set_value(nthrds_str, nthrds)
mach_pes_obj.set_value(rootpe_str, rootpe)
@@ -1262,6 +1271,8 @@ def configure(
extra_machines_dir=None,
case_group=None,
ngpus_per_node=0,
gpu_type=None,
gpu_offload=None,
):

expect(
@@ -1344,6 +1355,7 @@ def configure(
and "MPILIB" not in x
and "MAX_MPITASKS_PER_NODE" not in x
and "MAX_TASKS_PER_NODE" not in x
and "MAX_CPUTASKS_PER_GPU_NODE" not in x
and "MAX_GPUS_PER_NODE" not in x
]

@@ -1378,20 +1390,31 @@ def configure(
for name in (
"MAX_TASKS_PER_NODE",
"MAX_MPITASKS_PER_NODE",
"MAX_CPUTASKS_PER_GPU_NODE",
"MAX_GPUS_PER_NODE",
):
dmax = machobj.get_value(name, {"compiler": compiler})
if not dmax:
dmax = machobj.get_value(name)
if dmax:
self.set_value(name, dmax)
elif name == "MAX_CPUTASKS_PER_GPU_NODE":
logger.debug(
"Variable {} not defined for machine {} and compiler {}".format(
name, machine_name, compiler
)
)
elif name == "MAX_GPUS_PER_NODE":
logger.debug(
"Variable {} not defined for machine {}".format(name, machine_name)
"Variable {} not defined for machine {} and compiler {}".format(
name, machine_name, compiler
)
)
else:
logger.warning(
"Variable {} not defined for machine {}".format(name, machine_name)
"Variable {} not defined for machine {} and compiler {}".format(
name, machine_name, compiler
)
)

machdir = machobj.get_machines_dir()
@@ -1509,47 +1532,62 @@ def configure(
self.set_value("TEST", True)

# ----------------------------------------------------------------------------------------------------------
# Sanity check:
# 1. We assume that there is always a string "gpu" in the compiler name if we want to enable GPU
# 2. For compilers without the string "gpu" in the name:
# 2.1. the ngpus-per-node argument would not update the NGPUS_PER_NODE XML variable, as long as
# the MAX_GPUS_PER_NODE XML variable is not defined (i.e., this argument is not in effect).
# 2.2. if the MAX_GPUS_PER_NODE XML variable is defined, then the ngpus-per-node argument
# must be set to 0. Otherwise, an error will be triggered.
# 3. For compilers with the string "gpu" in the name:
# 3.1. if ngpus-per-node argument is smaller than 0, an error will be triggered.
# 3.2. if ngpus_per_node argument is larger than the value of MAX_GPUS_PER_NODE, the NGPUS_PER_NODE
# Sanity check for a GPU run:
# 1. GPU_TYPE and GPU_OFFLOAD must both be defined to use GPUS
# 2. if ngpus_per_node argument is larger than the value of MAX_GPUS_PER_NODE, the NGPUS_PER_NODE
# XML variable in the env_mach_pes.xml file would be set to MAX_GPUS_PER_NODE automatically.
# 3.3. if ngpus-per-node argument is equal to 0, it will be updated to 1 automatically.
# 3. if ngpus-per-node argument is equal to 0, it will be updated to 1 automatically.
# ----------------------------------------------------------------------------------------------------------
max_gpus_per_node = self.get_value("MAX_GPUS_PER_NODE")
if max_gpus_per_node:
if "gpu" in compiler:
if not ngpus_per_node:
ngpus_per_node = 1
logger.warning(
"Setting ngpus_per_node to 1 for compiler {}".format(compiler)
)
expect(
ngpus_per_node > 0,
" ngpus_per_node is expected > 0 for compiler {}; current value is {}".format(
compiler, ngpus_per_node
),
)
else:
expect(
ngpus_per_node == 0,
" ngpus_per_node is expected = 0 for compiler {}; current value is {}".format(
compiler, ngpus_per_node
),
)
if gpu_type and str(gpu_type).lower() != "none":
expect(
max_gpus_per_node,
f"GPUS are not defined for machine={machine_name} and compiler={compiler}",
)
expect(
gpu_offload,
"Both gpu-type and gpu-offload must be defined if either is defined",
)
expect(
compiler in ["nvhpc", "cray"],
f"Only nvhpc and cray compilers are expected for a GPU run; the user given compiler is {compiler}, ",
)
valid_gpu_type = self.get_value("GPU_TYPE").split(",")
valid_gpu_type.remove("none")
expect(
gpu_type in valid_gpu_type,
f"Unsupported GPU type is given: {gpu_type} ; valid values are {valid_gpu_type}",
)
valid_gpu_offload = self.get_value("GPU_OFFLOAD").split(",")
valid_gpu_offload.remove("none")
expect(
gpu_offload in valid_gpu_offload,
f"Unsupported GPU programming model is given: {gpu_offload} ; valid values are {valid_gpu_offload}",
)
self.gpu_enabled = True
if ngpus_per_node >= 0:
self.set_value(
"NGPUS_PER_NODE",
ngpus_per_node
max(1, ngpus_per_node)
if ngpus_per_node <= max_gpus_per_node
else max_gpus_per_node,
)
elif gpu_offload and str(gpu_offload).lower() != "none":
expect(
False,
"Both gpu-type and gpu-offload must be defined if either is defined",
)
elif ngpus_per_node != 0:
expect(
False,
f"ngpus_per_node is expected to be 0 for a pure CPU run ; {ngpus_per_node} is provided instead ;",
)

# Set these two GPU XML variables here to overwrite the default values
if gpu_type:
self.set_value("GPU_TYPE", str(gpu_type).lower())
if gpu_offload:
self.set_value("GPU_OFFLOAD", str(gpu_offload).lower())

self.initialize_derived_attributes()
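
Taken together, the new checks require a GPU case to name a supported GPU type and offload model, on a machine that defines MAX_GPUS_PER_NODE, with an nvhpc or cray compiler. The standalone sketch below restates that decision logic and the resulting NGPUS_PER_NODE value; it is not the configure() code itself, and the valid-value lists and numbers are illustrative.

    def check_gpu_settings(gpu_type, gpu_offload, ngpus_per_node, max_gpus_per_node,
                           compiler, valid_types, valid_offloads):
        # Standalone restatement of the configure() checks above; returns the value
        # NGPUS_PER_NODE would be set to, or raises on an inconsistent request.
        if gpu_type and gpu_type.lower() != "none":
            assert max_gpus_per_node, "GPUs are not defined for this machine/compiler"
            assert gpu_offload, "Both gpu-type and gpu-offload must be defined if either is"
            assert compiler in ("nvhpc", "cray"), "Only nvhpc and cray are expected for a GPU run"
            assert gpu_type in valid_types and gpu_offload in valid_offloads
            assert ngpus_per_node >= 0
            # A request of 0 is promoted to 1; anything above the machine maximum is clamped.
            return min(max(1, ngpus_per_node), max_gpus_per_node)
        if gpu_offload and gpu_offload.lower() != "none":
            raise ValueError("Both gpu-type and gpu-offload must be defined if either is")
        if ngpus_per_node != 0:
            raise ValueError("ngpus_per_node is expected to be 0 for a pure CPU run")
        return 0

    # Illustrative values only (valid lists come from the GPU_TYPE/GPU_OFFLOAD XML entries):
    print(check_gpu_settings("a100", "openacc", 0, 4, "nvhpc", ["a100"], ["openacc"]))  # 1
    print(check_gpu_settings(None, None, 0, 4, "intel", ["a100"], ["openacc"]))         # 0
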

@@ -2073,12 +2111,10 @@ def get_mpirun_cmd(self, job=None, allow_unresolved_envvars=True, overrides=None
mpi_arg_string += " : "

ngpus_per_node = self.get_value("NGPUS_PER_NODE")
if ngpus_per_node and ngpus_per_node > 0 and config.gpus_use_set_device_rank:
# 1. this setting is tested on Casper only and may not work on other machines
# 2. need to be revisited in the future for a more adaptable implementation
rundir = self.get_value("RUNDIR")
output_name = rundir + "/set_device_rank.sh"
mpi_arg_string = mpi_arg_string + " " + output_name + " "
if ngpus_per_node and ngpus_per_node > 0:
mpi_gpu_run_script = self.get_value("MPI_GPU_WRAPPER_SCRIPT")
if mpi_gpu_run_script:
mpi_arg_string = mpi_arg_string + " " + mpi_gpu_run_script

return self.get_resolved_value(
"{} {} {} {}".format(
@@ -2375,6 +2411,8 @@ def create(
extra_machines_dir=None,
case_group=None,
ngpus_per_node=0,
gpu_type=None,
gpu_offload=None,
):
try:
# Set values for env_case.xml
Expand Down Expand Up @@ -2448,6 +2486,8 @@ def create(
extra_machines_dir=extra_machines_dir,
case_group=case_group,
ngpus_per_node=ngpus_per_node,
gpu_type=gpu_type,
gpu_offload=gpu_offload,
)

self.create_caseroot()
29 changes: 0 additions & 29 deletions CIME/case/case_setup.py
@@ -21,7 +21,6 @@
copy_local_macros_to_dir,
)
from CIME.utils import batch_jobid
from CIME.utils import transform_vars
from CIME.test_status import *
from CIME.locked_files import unlock_file, lock_file

@@ -482,31 +481,3 @@ def case_setup(self, clean=False, test_mode=False, reset=False, keep=None):
caseroot=caseroot,
is_batch=is_batch,
)

# put the following section here to make sure the rundir is generated first
machdir = self.get_value("MACHDIR")
mach = self.get_value("MACH")
ngpus_per_node = self.get_value("NGPUS_PER_NODE")
overrides = {}
overrides["ngpus_per_node"] = ngpus_per_node
input_template = os.path.join(machdir, "mpi_run_gpu.{}".format(mach))
if os.path.isfile(input_template):
# update the wrapper script that sets the device id for each MPI rank
output_text = transform_vars(
open(input_template, "r").read(), case=self, overrides=overrides
)

# write it out to the run dir
rundir = self.get_value("RUNDIR")
output_name = os.path.join(rundir, "set_device_rank.sh")
logger.info("Creating file {}".format(output_name))
with open(output_name, "w") as f:
f.write(output_text)

# make the wrapper script executable
if os.path.isfile(output_name):
os.system("chmod +x " + output_name)
else:
expect(
False, "The file {} is not written out correctly.".format(output_name)
)
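
The block deleted above used to write a per-case set_device_rank.sh wrapper at setup time, a path the old comments noted was tested on Casper only. With this PR the wrapper instead comes from the machine-provided MPI_GPU_WRAPPER_SCRIPT variable and is appended in get_mpirun_cmd (see the CIME/case/case.py change). A hedged sketch of that assembly, with an illustrative script name:

    def append_gpu_wrapper(mpi_arg_string, ngpus_per_node, mpi_gpu_run_script):
        # Matches the get_mpirun_cmd logic: the wrapper is appended only when the
        # case uses GPUs and the machine actually defines MPI_GPU_WRAPPER_SCRIPT.
        if ngpus_per_node and ngpus_per_node > 0 and mpi_gpu_run_script:
            mpi_arg_string = mpi_arg_string + " " + mpi_gpu_run_script
        return mpi_arg_string

    # Illustrative script name; the real value comes from the machine definition.
    print(append_gpu_wrapper("-np 128", 4, "set_device_rank.sh"))  # "-np 128 set_device_rank.sh"
    print(append_gpu_wrapper("-np 128", 0, "set_device_rank.sh"))  # "-np 128"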