Skip to content

Commit

Permalink
Merge pull request #4396 from sjsprecious/add_gpu_gust
Browse files Browse the repository at this point in the history
Add flexible controls of GPU configuration (docs test expected to fail and will be updated on merge.)
  • Loading branch information
jedwards4b authored Aug 16, 2023
2 parents 02fad90 + 128e03e commit 41b67a8
Show file tree
Hide file tree
Showing 15 changed files with 192 additions and 92 deletions.
3 changes: 3 additions & 0 deletions CIME/Tools/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -613,6 +613,9 @@ endif

# Remove arch flag if it exists
F90_LDFLAGS := $(filter-out -arch%,$(LDFLAGS))
ifdef GPUFLAGS
F90_LDFLAGS += $(GPUFLAGS)
endif

# Machine stuff to appear last on the link step
ifndef MLIBS
Expand Down
3 changes: 3 additions & 0 deletions CIME/XML/env_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -620,6 +620,9 @@ def _process_args(self, case, submit_arg_nodes, job, resolve=True):
if name:
if resolve and "$" in name:
rflag = self._resolve_argument(case, flag, name, job)
# This is to prevent -gpu_type=none in qsub args
if rflag.endswith("=none"):
continue
if len(rflag) > len(flag):
submitargs += " {}".format(rflag)
else:
Expand Down
34 changes: 27 additions & 7 deletions CIME/XML/env_mach_pes.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ def get_value(
resolved=True,
subgroup=None,
max_mpitasks_per_node=None,
max_cputasks_per_gpu_node=None,
ngpus_per_node=None,
): # pylint: disable=arguments-differ
# Special variable NINST_MAX is used to determine the number of
# drivers in multi-driver mode.
Expand All @@ -58,7 +60,13 @@ def get_value(
if "NTASKS" in vid or "ROOTPE" in vid:
if max_mpitasks_per_node is None:
max_mpitasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE")
if value is not None and value < 0:
if max_cputasks_per_gpu_node is None:
max_cputasks_per_gpu_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE")
if ngpus_per_node is None:
ngpus_per_node = self.get_value("NGPUS_PER_NODE")
if (ngpus_per_node and value) and value < 0:
value = -1 * value * max_cputasks_per_gpu_node
elif value and value < 0:
value = -1 * value * max_mpitasks_per_node
# in the nuopc driver there is only one NINST value
# so that NINST_{comp} = NINST
Expand Down Expand Up @@ -154,6 +162,7 @@ def get_total_tasks(self, comp_classes, async_interface=False):
tt = rootpe + nthrds * ((ntasks - 1) * pstrid + 1)
maxrootpe = max(maxrootpe, rootpe)
total_tasks = max(tt, total_tasks)

if asyncio_tasks:
total_tasks = total_tasks + len(asyncio_tasks)
if self.get_value("MULTI_DRIVER"):
Expand All @@ -167,13 +176,24 @@ def get_tasks_per_node(self, total_tasks, max_thread_count):
"totaltasks > 0 expected, totaltasks = {}".format(total_tasks),
)
if self._comp_interface == "nuopc" and self.get_value("ESMF_AWARE_THREADING"):
tasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE")
if self.get_value("NGPUS_PER_NODE") > 0:
tasks_per_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE")
else:
tasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE")
else:
tasks_per_node = min(
self.get_value("MAX_TASKS_PER_NODE") // max_thread_count,
self.get_value("MAX_MPITASKS_PER_NODE"),
total_tasks,
)
ngpus_per_node = self.get_value("NGPUS_PER_NODE")
if ngpus_per_node and ngpus_per_node > 0:
tasks_per_node = min(
self.get_value("MAX_TASKS_PER_NODE") // max_thread_count,
self.get_value("MAX_CPUTASKS_PER_GPU_NODE"),
total_tasks,
)
else:
tasks_per_node = min(
self.get_value("MAX_TASKS_PER_NODE") // max_thread_count,
self.get_value("MAX_MPITASKS_PER_NODE"),
total_tasks,
)
return tasks_per_node if tasks_per_node > 0 else 1

def get_total_nodes(self, total_tasks, max_thread_count):
Expand Down
3 changes: 2 additions & 1 deletion CIME/XML/env_mach_specific.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,7 +320,8 @@ def _compute_resource_actions(self, resource_nodes, case, job=None):

def _compute_actions(self, nodes, child_tag, case, job=None):
result = [] # list of tuples ("name", "argument")
compiler, mpilib = case.get_value("COMPILER"), case.get_value("MPILIB")
compiler = case.get_value("COMPILER")
mpilib = case.get_value("MPILIB")

for node in nodes:
if self._match_attribs(self.attrib(node), case, job=job):
Expand Down
14 changes: 14 additions & 0 deletions CIME/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,20 @@ def get_standard_cmake_args(case, sharedpath):
cmake_args += " -Dcompile_threaded={} ".format(
stringify_bool(case.get_build_threaded())
)
# check settings for GPU
gpu_type = case.get_value("GPU_TYPE")
gpu_offload = case.get_value("GPU_OFFLOAD")
if gpu_type != "none":
expect(
gpu_offload != "none",
"Both GPU_TYPE and GPU_OFFLOAD must be defined if either is",
)
cmake_args += f" -DGPU_TYPE={gpu_type} -DGPU_OFFLOAD={gpu_offload}"
else:
expect(
gpu_offload == "none",
"Both GPU_TYPE and GPU_OFFLOAD must be defined if either is",
)

ocn_model = case.get_value("COMP_OCN")
atm_dycore = case.get_value("CAM_DYCORE")
Expand Down
124 changes: 82 additions & 42 deletions CIME/case/case.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ class Case(object):
This class extends across multiple files, class members external to this file
are listed in the following imports
"""

from CIME.case.case_setup import case_setup
Expand Down Expand Up @@ -123,6 +124,7 @@ def __init__(self, case_root=None, read_only=True, record=False, non_local=False
self._env_generic_files = []
self._files = []
self._comp_interface = None
self.gpu_enabled = False
self._non_local = non_local
self.read_xml()

Expand Down Expand Up @@ -275,6 +277,9 @@ def initialize_derived_attributes(self):

if max_gpus_per_node:
self.ngpus_per_node = self.get_value("NGPUS_PER_NODE")
# update the maximum MPI tasks for a GPU node (could differ from a pure-CPU node)
if self.ngpus_per_node > 0:
max_mpitasks_per_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE")

self.tasks_per_numa = int(math.ceil(self.tasks_per_node / 2.0))
smt_factor = max(
Expand Down Expand Up @@ -451,6 +456,12 @@ def get_values(self, item, attribute=None, resolved=True, subgroup=None):
return []

def get_value(self, item, attribute=None, resolved=True, subgroup=None):
if item == "GPU_ENABLED":
if not self.gpu_enabled:
if self.get_value("GPU_TYPE") != "none":
self.gpu_enabled = True
return "true" if self.gpu_enabled else "false"

result = None
for env_file in self._files:
# Wait and resolve in self rather than in env_file
Expand Down Expand Up @@ -1141,7 +1152,6 @@ def _setup_mach_pes(self, pecount, multi_driver, ninst, machine_name, mpilib):
comment = None
force_tasks = None
force_thrds = None

if match1:
opti_tasks = match1.group(1)
if opti_tasks.isdigit():
Expand Down Expand Up @@ -1211,7 +1221,6 @@ def _setup_mach_pes(self, pecount, multi_driver, ninst, machine_name, mpilib):
pstrid = pes_pstrid[pstrid_str] if pstrid_str in pes_pstrid else 1

totaltasks.append((ntasks + rootpe) * nthrds)

mach_pes_obj.set_value(ntasks_str, ntasks)
mach_pes_obj.set_value(nthrds_str, nthrds)
mach_pes_obj.set_value(rootpe_str, rootpe)
Expand Down Expand Up @@ -1262,6 +1271,8 @@ def configure(
extra_machines_dir=None,
case_group=None,
ngpus_per_node=0,
gpu_type=None,
gpu_offload=None,
):

expect(
Expand Down Expand Up @@ -1344,6 +1355,7 @@ def configure(
and "MPILIB" not in x
and "MAX_MPITASKS_PER_NODE" not in x
and "MAX_TASKS_PER_NODE" not in x
and "MAX_CPUTASKS_PER_GPU_NODE" not in x
and "MAX_GPUS_PER_NODE" not in x
]

Expand Down Expand Up @@ -1378,20 +1390,31 @@ def configure(
for name in (
"MAX_TASKS_PER_NODE",
"MAX_MPITASKS_PER_NODE",
"MAX_CPUTASKS_PER_GPU_NODE",
"MAX_GPUS_PER_NODE",
):
dmax = machobj.get_value(name, {"compiler": compiler})
if not dmax:
dmax = machobj.get_value(name)
if dmax:
self.set_value(name, dmax)
elif name == "MAX_CPUTASKS_PER_GPU_NODE":
logger.debug(
"Variable {} not defined for machine {} and compiler {}".format(
name, machine_name, compiler
)
)
elif name == "MAX_GPUS_PER_NODE":
logger.debug(
"Variable {} not defined for machine {}".format(name, machine_name)
"Variable {} not defined for machine {} and compiler {}".format(
name, machine_name, compiler
)
)
else:
logger.warning(
"Variable {} not defined for machine {}".format(name, machine_name)
"Variable {} not defined for machine {} and compiler {}".format(
name, machine_name, compiler
)
)

machdir = machobj.get_machines_dir()
Expand Down Expand Up @@ -1509,47 +1532,62 @@ def configure(
self.set_value("TEST", True)

# ----------------------------------------------------------------------------------------------------------
# Sanity check:
# 1. We assume that there is always a string "gpu" in the compiler name if we want to enable GPU
# 2. For compilers without the string "gpu" in the name:
# 2.1. the ngpus-per-node argument would not update the NGPUS_PER_NODE XML variable, as long as
# the MAX_GPUS_PER_NODE XML variable is not defined (i.e., this argument is not in effect).
# 2.2. if the MAX_GPUS_PER_NODE XML variable is defined, then the ngpus-per-node argument
# must be set to 0. Otherwise, an error will be triggered.
# 3. For compilers with the string "gpu" in the name:
# 3.1. if ngpus-per-node argument is smaller than 0, an error will be triggered.
# 3.2. if ngpus_per_node argument is larger than the value of MAX_GPUS_PER_NODE, the NGPUS_PER_NODE
# Sanity check for a GPU run:
# 1. GPU_TYPE and GPU_OFFLOAD must both be defined to use GPUS
# 2. if ngpus_per_node argument is larger than the value of MAX_GPUS_PER_NODE, the NGPUS_PER_NODE
# XML variable in the env_mach_pes.xml file would be set to MAX_GPUS_PER_NODE automatically.
# 3.3. if ngpus-per-node argument is equal to 0, it will be updated to 1 automatically.
# 3. if ngpus-per-node argument is equal to 0, it will be updated to 1 automatically.
# ----------------------------------------------------------------------------------------------------------
max_gpus_per_node = self.get_value("MAX_GPUS_PER_NODE")
if max_gpus_per_node:
if "gpu" in compiler:
if not ngpus_per_node:
ngpus_per_node = 1
logger.warning(
"Setting ngpus_per_node to 1 for compiler {}".format(compiler)
)
expect(
ngpus_per_node > 0,
" ngpus_per_node is expected > 0 for compiler {}; current value is {}".format(
compiler, ngpus_per_node
),
)
else:
expect(
ngpus_per_node == 0,
" ngpus_per_node is expected = 0 for compiler {}; current value is {}".format(
compiler, ngpus_per_node
),
)
if gpu_type and str(gpu_type).lower() != "none":
expect(
max_gpus_per_node,
f"GPUS are not defined for machine={machine_name} and compiler={compiler}",
)
expect(
gpu_offload,
"Both gpu-type and gpu-offload must be defined if either is defined",
)
expect(
compiler in ["nvhpc", "cray"],
f"Only nvhpc and cray compilers are expected for a GPU run; the user given compiler is {compiler}, ",
)
valid_gpu_type = self.get_value("GPU_TYPE").split(",")
valid_gpu_type.remove("none")
expect(
gpu_type in valid_gpu_type,
f"Unsupported GPU type is given: {gpu_type} ; valid values are {valid_gpu_type}",
)
valid_gpu_offload = self.get_value("GPU_OFFLOAD").split(",")
valid_gpu_offload.remove("none")
expect(
gpu_offload in valid_gpu_offload,
f"Unsupported GPU programming model is given: {gpu_offload} ; valid values are {valid_gpu_offload}",
)
self.gpu_enabled = True
if ngpus_per_node >= 0:
self.set_value(
"NGPUS_PER_NODE",
ngpus_per_node
max(1, ngpus_per_node)
if ngpus_per_node <= max_gpus_per_node
else max_gpus_per_node,
)
elif gpu_offload and str(gpu_offload).lower() != "none":
expect(
False,
"Both gpu-type and gpu-offload must be defined if either is defined",
)
elif ngpus_per_node != 0:
expect(
False,
f"ngpus_per_node is expected to be 0 for a pure CPU run ; {ngpus_per_node} is provided instead ;",
)

# Set these two GPU XML variables here to overwrite the default values
if gpu_type:
self.set_value("GPU_TYPE", str(gpu_type).lower())
if gpu_offload:
self.set_value("GPU_OFFLOAD", str(gpu_offload).lower())

self.initialize_derived_attributes()

Expand Down Expand Up @@ -2073,12 +2111,10 @@ def get_mpirun_cmd(self, job=None, allow_unresolved_envvars=True, overrides=None
mpi_arg_string += " : "

ngpus_per_node = self.get_value("NGPUS_PER_NODE")
if ngpus_per_node and ngpus_per_node > 0 and config.gpus_use_set_device_rank:
# 1. this setting is tested on Casper only and may not work on other machines
# 2. need to be revisited in the future for a more adaptable implementation
rundir = self.get_value("RUNDIR")
output_name = rundir + "/set_device_rank.sh"
mpi_arg_string = mpi_arg_string + " " + output_name + " "
if ngpus_per_node and ngpus_per_node > 0:
mpi_gpu_run_script = self.get_value("MPI_GPU_WRAPPER_SCRIPT")
if mpi_gpu_run_script:
mpi_arg_string = mpi_arg_string + " " + mpi_gpu_run_script

return self.get_resolved_value(
"{} {} {} {}".format(
Expand Down Expand Up @@ -2375,6 +2411,8 @@ def create(
extra_machines_dir=None,
case_group=None,
ngpus_per_node=0,
gpu_type=None,
gpu_offload=None,
):
try:
# Set values for env_case.xml
Expand Down Expand Up @@ -2448,6 +2486,8 @@ def create(
extra_machines_dir=extra_machines_dir,
case_group=case_group,
ngpus_per_node=ngpus_per_node,
gpu_type=gpu_type,
gpu_offload=gpu_offload,
)

self.create_caseroot()
Expand Down
29 changes: 0 additions & 29 deletions CIME/case/case_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
copy_local_macros_to_dir,
)
from CIME.utils import batch_jobid
from CIME.utils import transform_vars
from CIME.test_status import *
from CIME.locked_files import unlock_file, lock_file

Expand Down Expand Up @@ -482,31 +481,3 @@ def case_setup(self, clean=False, test_mode=False, reset=False, keep=None):
caseroot=caseroot,
is_batch=is_batch,
)

# put the following section here to make sure the rundir is generated first
machdir = self.get_value("MACHDIR")
mach = self.get_value("MACH")
ngpus_per_node = self.get_value("NGPUS_PER_NODE")
overrides = {}
overrides["ngpus_per_node"] = ngpus_per_node
input_template = os.path.join(machdir, "mpi_run_gpu.{}".format(mach))
if os.path.isfile(input_template):
# update the wrapper script that sets the device id for each MPI rank
output_text = transform_vars(
open(input_template, "r").read(), case=self, overrides=overrides
)

# write it out to the run dir
rundir = self.get_value("RUNDIR")
output_name = os.path.join(rundir, "set_device_rank.sh")
logger.info("Creating file {}".format(output_name))
with open(output_name, "w") as f:
f.write(output_text)

# make the wrapper script executable
if os.path.isfile(output_name):
os.system("chmod +x " + output_name)
else:
expect(
False, "The file {} is not written out correctly.".format(output_name)
)
Loading

0 comments on commit 41b67a8

Please sign in to comment.