Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add flexible controls of GPU configuration #4396

Merged
merged 26 commits into from
Aug 16, 2023
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
bb7175c
Add Jim's changes for Gust's GPU options, based on https://github.com…
sjsprecious Mar 23, 2023
6667b40
Change the settings for Gust GPU nodes
sjsprecious Mar 23, 2023
e0d625e
Add missing definition of max_cputasks_per_gpu_node
sjsprecious Mar 24, 2023
fd2ae21
Add more missing attrs
sjsprecious Mar 24, 2023
7e372b5
Add hard-code check for valid gpu_type and gpu_offload
sjsprecious Mar 25, 2023
403d4f3
Use a more general way for valid value check
sjsprecious Mar 25, 2023
8d7e923
update error message for gpu_type and gpu_offload
sjsprecious Mar 25, 2023
080972b
fix a typo
sjsprecious Mar 25, 2023
7cdc7be
bug fix for PE layout and batch script template on a GPU node
sjsprecious Mar 25, 2023
18aa22e
Add a sanity check for compiler on GPU nodes
sjsprecious Mar 27, 2023
802fb04
Fix a bug for setting two GPU XML variables
sjsprecious Apr 11, 2023
82c617f
Bug fix of an invalid input for GPU_TYPE and GPU_OFFLOAD
sjsprecious Apr 12, 2023
d7fb13b
Merge branch 'master' into add_gpu_gust
sjsprecious Apr 17, 2023
e3607d1
update handling of "none" input for GPU_TYPE and GPU_OFFLOAD
sjsprecious Apr 18, 2023
011e807
Merge branch 'master' into add_gpu_gust
sjsprecious Aug 7, 2023
2341850
fix the usage of wrapper script for Derecho
sjsprecious Aug 7, 2023
de5476a
update description for Derecho
sjsprecious Aug 7, 2023
f6c42fd
use a generic way to apply the MPI wrapper script
sjsprecious Aug 8, 2023
5b3d157
one more sanity check for pure CPU run
sjsprecious Aug 8, 2023
ce56a9b
apply black and pylint
jedwards4b Aug 9, 2023
eaf9ec4
update externals
jedwards4b Aug 15, 2023
21d6a1f
do not set unless assigned
jedwards4b Aug 16, 2023
8f97110
do not set unless assigned
jedwards4b Aug 16, 2023
66b5fb8
fix issue with NTASKS
jedwards4b Aug 16, 2023
780a209
one more ntasks fix
jedwards4b Aug 16, 2023
128e03e
another none value comparison fixed
jedwards4b Aug 16, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CIME/Tools/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -613,6 +613,9 @@ endif

# Remove arch flag if it exists
F90_LDFLAGS := $(filter-out -arch%,$(LDFLAGS))
ifdef GPUFLAGS
F90_LDFLAGS += $(GPUFLAGS)
endif

# Machine stuff to appear last on the link step
ifndef MLIBS
Expand Down
3 changes: 3 additions & 0 deletions CIME/XML/env_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -620,6 +620,9 @@ def _process_args(self, case, submit_arg_nodes, job, resolve=True):
if name:
if resolve and "$" in name:
rflag = self._resolve_argument(case, flag, name, job)
# This is to prevent -gpu_type=none in qsub args
if rflag.endswith("=none"):
continue
if len(rflag) > len(flag):
submitargs += " {}".format(rflag)
else:
Expand Down
33 changes: 26 additions & 7 deletions CIME/XML/env_mach_pes.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ def get_value(
resolved=True,
subgroup=None,
max_mpitasks_per_node=None,
max_cputasks_per_gpu_node=None,
ngpus_per_node=None,
): # pylint: disable=arguments-differ
# Special variable NINST_MAX is used to determine the number of
# drivers in multi-driver mode.
Expand All @@ -58,8 +60,15 @@ def get_value(
if "NTASKS" in vid or "ROOTPE" in vid:
if max_mpitasks_per_node is None:
max_mpitasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE")
if max_cputasks_per_gpu_node is None:
max_cputasks_per_gpu_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE")
if ngpus_per_node is None:
ngpus_per_node = self.get_value("NGPUS_PER_NODE")
if value is not None and value < 0:
value = -1 * value * max_mpitasks_per_node
if ngpus_per_node > 0:
value = -1 * value * max_cputasks_per_gpu_node
else:
value = -1 * value * max_mpitasks_per_node
# in the nuopc driver there is only one NINST value
# so that NINST_{comp} = NINST
if "NINST_" in vid and value is None:
Expand Down Expand Up @@ -167,13 +176,23 @@ def get_tasks_per_node(self, total_tasks, max_thread_count):
"totaltasks > 0 expected, totaltasks = {}".format(total_tasks),
)
if self._comp_interface == "nuopc" and self.get_value("ESMF_AWARE_THREADING"):
tasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE")
if self.get_value("NGPUS_PER_NODE") > 0:
tasks_per_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE")
else:
tasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE")
else:
tasks_per_node = min(
self.get_value("MAX_TASKS_PER_NODE") // max_thread_count,
self.get_value("MAX_MPITASKS_PER_NODE"),
total_tasks,
)
if self.get_value("NGPUS_PER_NODE") > 0:
tasks_per_node = min(
self.get_value("MAX_TASKS_PER_NODE") // max_thread_count,
self.get_value("MAX_CPUTASKS_PER_GPU_NODE"),
total_tasks,
)
else:
tasks_per_node = min(
self.get_value("MAX_TASKS_PER_NODE") // max_thread_count,
self.get_value("MAX_MPITASKS_PER_NODE"),
total_tasks,
)
return tasks_per_node if tasks_per_node > 0 else 1

def get_total_nodes(self, total_tasks, max_thread_count):
Expand Down
3 changes: 2 additions & 1 deletion CIME/XML/env_mach_specific.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,7 +320,8 @@ def _compute_resource_actions(self, resource_nodes, case, job=None):

def _compute_actions(self, nodes, child_tag, case, job=None):
result = [] # list of tuples ("name", "argument")
compiler, mpilib = case.get_value("COMPILER"), case.get_value("MPILIB")
compiler = case.get_value("COMPILER")
mpilib = case.get_value("MPILIB")

for node in nodes:
if self._match_attribs(self.attrib(node), case, job=job):
Expand Down
14 changes: 14 additions & 0 deletions CIME/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,20 @@ def get_standard_cmake_args(case, sharedpath):
cmake_args += " -Dcompile_threaded={} ".format(
stringify_bool(case.get_build_threaded())
)
# check settings for GPU
gpu_type = case.get_value("GPU_TYPE")
gpu_offload = case.get_value("GPU_OFFLOAD")
if gpu_type != "none":
expect(
gpu_offload != "none",
"Both GPU_TYPE and GPU_OFFLOAD must be defined if either is",
)
cmake_args += f" -DGPU_TYPE={gpu_type} -DGPU_OFFLOAD={gpu_offload}"
else:
expect(
gpu_offload == "none",
"Both GPU_TYPE and GPU_OFFLOAD must be defined if either is",
)

ocn_model = case.get_value("COMP_OCN")
atm_dycore = case.get_value("CAM_DYCORE")
Expand Down
122 changes: 83 additions & 39 deletions CIME/case/case.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ def __init__(self, case_root=None, read_only=True, record=False, non_local=False
self._env_generic_files = []
self._files = []
self._comp_interface = None
self.gpu_enabled = False
self._non_local = non_local
self.read_xml()

Expand Down Expand Up @@ -275,6 +276,9 @@ def initialize_derived_attributes(self):

if max_gpus_per_node:
self.ngpus_per_node = self.get_value("NGPUS_PER_NODE")
# update the maximum MPI tasks for a GPU node (could differ from a pure-CPU node)
if self.ngpus_per_node > 0:
max_mpitasks_per_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE")

self.tasks_per_numa = int(math.ceil(self.tasks_per_node / 2.0))
smt_factor = max(
Expand Down Expand Up @@ -451,6 +455,12 @@ def get_values(self, item, attribute=None, resolved=True, subgroup=None):
return []

def get_value(self, item, attribute=None, resolved=True, subgroup=None):
if item == "GPU_ENABLED":
if not self.gpu_enabled:
if self.get_value("GPU_TYPE") != "none":
self.gpu_enabled = True
return "true" if self.gpu_enabled else "false"

result = None
for env_file in self._files:
# Wait and resolve in self rather than in env_file
Expand Down Expand Up @@ -1262,6 +1272,8 @@ def configure(
extra_machines_dir=None,
case_group=None,
ngpus_per_node=0,
gpu_type=None,
gpu_offload=None,
):

expect(
Expand Down Expand Up @@ -1344,6 +1356,7 @@ def configure(
and "MPILIB" not in x
and "MAX_MPITASKS_PER_NODE" not in x
and "MAX_TASKS_PER_NODE" not in x
and "MAX_CPUTASKS_PER_GPU_NODE" not in x
and "MAX_GPUS_PER_NODE" not in x
]

Expand Down Expand Up @@ -1378,20 +1391,32 @@ def configure(
for name in (
"MAX_TASKS_PER_NODE",
"MAX_MPITASKS_PER_NODE",
"MAX_CPUTASKS_PER_GPU_NODE",
"MAX_GPUS_PER_NODE",
):
dmax = machobj.get_value(name, {"compiler": compiler})
if not dmax:
dmax = machobj.get_value(name)
if dmax:
print(f"here name is {name} and dmax is {dmax}")
self.set_value(name, dmax)
elif name == "MAX_CPUTASKS_PER_GPU_NODE":
logger.debug(
"Variable {} not defined for machine {} and compiler {}".format(
name, machine_name, compiler
)
)
elif name == "MAX_GPUS_PER_NODE":
logger.debug(
"Variable {} not defined for machine {}".format(name, machine_name)
"Variable {} not defined for machine {} and compiler {}".format(
name, machine_name, compiler
)
)
else:
logger.warning(
"Variable {} not defined for machine {}".format(name, machine_name)
"Variable {} not defined for machine {} and compiler {}".format(
name, machine_name, compiler
)
)

machdir = machobj.get_machines_dir()
Expand Down Expand Up @@ -1509,47 +1534,55 @@ def configure(
self.set_value("TEST", True)

# ----------------------------------------------------------------------------------------------------------
# Sanity check:
# 1. We assume that there is always a string "gpu" in the compiler name if we want to enable GPU
# 2. For compilers without the string "gpu" in the name:
# 2.1. the ngpus-per-node argument would not update the NGPUS_PER_NODE XML variable, as long as
# the MAX_GPUS_PER_NODE XML variable is not defined (i.e., this argument is not in effect).
# 2.2. if the MAX_GPUS_PER_NODE XML variable is defined, then the ngpus-per-node argument
# must be set to 0. Otherwise, an error will be triggered.
# 3. For compilers with the string "gpu" in the name:
# 3.1. if ngpus-per-node argument is smaller than 0, an error will be triggered.
# 3.2. if ngpus_per_node argument is larger than the value of MAX_GPUS_PER_NODE, the NGPUS_PER_NODE
# Sanity check for a GPU run:
# 1. GPU_TYPE and GPU_OFFLOAD must both be defined to use GPUS
# 2. if ngpus_per_node argument is larger than the value of MAX_GPUS_PER_NODE, the NGPUS_PER_NODE
# XML variable in the env_mach_pes.xml file would be set to MAX_GPUS_PER_NODE automatically.
# 3.3. if ngpus-per-node argument is equal to 0, it will be updated to 1 automatically.
# 3. if ngpus-per-node argument is equal to 0, it will be updated to 1 automatically.
# ----------------------------------------------------------------------------------------------------------
max_gpus_per_node = self.get_value("MAX_GPUS_PER_NODE")
if max_gpus_per_node:
if "gpu" in compiler:
if not ngpus_per_node:
ngpus_per_node = 1
logger.warning(
"Setting ngpus_per_node to 1 for compiler {}".format(compiler)
)
expect(
ngpus_per_node > 0,
" ngpus_per_node is expected > 0 for compiler {}; current value is {}".format(
compiler, ngpus_per_node
),
)
else:
expect(
ngpus_per_node == 0,
" ngpus_per_node is expected = 0 for compiler {}; current value is {}".format(
compiler, ngpus_per_node
),
)
if gpu_type and str(gpu_type).lower() != 'none':
expect(
max_gpus_per_node,
f"GPUS are not defined for machine={machine_name} and compiler={compiler}",
)
expect(
gpu_offload,
"Both gpu-type and gpu-offload must be defined if either is defined",
)
expect(
compiler in ["nvhpc", "cray"],
f"Only nvhpc and cray compilers are expected for a GPU run; the user given compiler is {compiler}, ",
)
valid_gpu_type = self.get_value("GPU_TYPE").split(",")
valid_gpu_type.remove("none")
expect(
gpu_type in valid_gpu_type,
f"Unsupported GPU type is given: {gpu_type} ; valid values are {valid_gpu_type}",
)
valid_gpu_offload = self.get_value("GPU_OFFLOAD").split(",")
valid_gpu_offload.remove("none")
expect(
gpu_offload in valid_gpu_offload,
f"Unsupported GPU programming model is given: {gpu_offload} ; valid values are {valid_gpu_offload}",
)
self.gpu_enabled = True
if ngpus_per_node >= 0:
self.set_value(
"NGPUS_PER_NODE",
ngpus_per_node
max(1, ngpus_per_node)
if ngpus_per_node <= max_gpus_per_node
else max_gpus_per_node,
)
elif gpu_offload and str(gpu_offload).lower() != 'none':
expect(
False,
"Both gpu-type and gpu-offload must be defined if either is defined",
)

# Set these two GPU XML variables here to overwrite the default values
self.set_value("GPU_TYPE", str(gpu_type).lower())
self.set_value("GPU_OFFLOAD", str(gpu_offload).lower())

self.initialize_derived_attributes()

Expand Down Expand Up @@ -2074,11 +2107,18 @@ def get_mpirun_cmd(self, job=None, allow_unresolved_envvars=True, overrides=None

ngpus_per_node = self.get_value("NGPUS_PER_NODE")
if ngpus_per_node and ngpus_per_node > 0 and config.gpus_use_set_device_rank:
# 1. this setting is tested on Casper only and may not work on other machines
# 2. need to be revisited in the future for a more adaptable implementation
rundir = self.get_value("RUNDIR")
output_name = rundir + "/set_device_rank.sh"
mpi_arg_string = mpi_arg_string + " " + output_name + " "
if self.get_value("MACH") == "gust" or self.get_value("MACH") == "derecho":
mpi_arg_string = mpi_arg_string + " get_local_rank "
else:
# this wrapper script only works with OpenMPI library
# has been tested on Casper
expect(
self.get_value("MPILIB") == "openmpi",
"The wrapper script only works with OpenMPI library; {} is currently used".format(self.get_value("MPILIB")),
)
rundir = self.get_value("RUNDIR")
output_name = rundir + "/set_device_rank.sh"
mpi_arg_string = mpi_arg_string + " " + output_name + " "

return self.get_resolved_value(
"{} {} {} {}".format(
Expand Down Expand Up @@ -2375,6 +2415,8 @@ def create(
extra_machines_dir=None,
case_group=None,
ngpus_per_node=0,
gpu_type=None,
gpu_offload=None,
):
try:
# Set values for env_case.xml
Expand Down Expand Up @@ -2448,6 +2490,8 @@ def create(
extra_machines_dir=extra_machines_dir,
case_group=case_group,
ngpus_per_node=ngpus_per_node,
gpu_type=gpu_type,
gpu_offload=gpu_offload,
)

self.create_caseroot()
Expand Down
2 changes: 1 addition & 1 deletion CIME/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ def __init__(self):
self._set_attribute(
"gpus_use_set_device_rank",
True,
desc="If set to `True` and NGPUS_PER_NODE > 0 then `$RUNDIR/set_device_rank.sh` is appended when the MPI run command is generated.",
desc="If set to `True` and NGPUS_PER_NODE > 0 then `$RUNDIR/set_device_rank.sh` or `get_local_rank` (a global script on Derecho/Gust) is appended when the MPI run command is generated.",
jedwards4b marked this conversation as resolved.
Show resolved Hide resolved
)
self._set_attribute(
"test_custom_project_machine",
Expand Down
14 changes: 14 additions & 0 deletions CIME/data/config/xml_schemas/config_machines.xsd
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
<xs:attribute name="compiler" type="xs:string"/>
<xs:attribute name="mpilib" type="xs:string"/>
<xs:attribute name="comp_interface" type="xs:string"/>
<xs:attribute name="gpu_type" type="xs:string"/>
<xs:attribute name="gpu_offload" type="xs:string"/>
<xs:attribute name="queue" type="xs:string"/>
<xs:attribute name="DEBUG" type="upperBoolean"/>
<xs:attribute name="PIO_VERSION" type="xs:integer"/>
Expand Down Expand Up @@ -56,6 +58,9 @@
<xs:element name="MAX_TASKS_PER_NODE" type="AttrElement"/>
<xs:element name="MAX_GPUS_PER_NODE" type="AttrElement"/>
<xs:element name="MAX_MPITASKS_PER_NODE" type="AttrElement"/>
<xs:element name="MAX_CPUTASKS_PER_GPU_NODE" type="AttrElement"/>
<xs:element name="GPU_TYPE" type="AttrElement"/>
<xs:element name="GPU_OFFLOAD" type="AttrElement"/>
<xs:element name="COSTPES_PER_NODE" type="xs:integer"/>
<xs:element name="PROJECT_REQUIRED" type="xs:NCName"/>
<xs:element name="executable" type="xs:string"/>
Expand Down Expand Up @@ -166,6 +171,13 @@
<!-- MAX_MPITASKS_PER_NODE: number of physical PES per shared node on
this machine, in practice the MPI tasks per node will not exceed this value -->
<xs:element ref="MAX_MPITASKS_PER_NODE" minOccurs="1" maxOccurs="unbounded"/>
<!-- MAX_CPUTASKS_PER_GPU_NODE: number of physical PES per GPU node on
this machine, in practice the MPI tasks per node will not exceed this value -->
<xs:element ref="MAX_CPUTASKS_PER_GPU_NODE" minOccurs="0" maxOccurs="unbounded"/>
<!-- GPU_TYPE: the type of GPU hardware available on this machine -->
<xs:element ref="GPU_TYPE" minOccurs="0" maxOccurs="unbounded"/>
<!-- GPU_OFFLOAD: the GPU programming model used for GPU porting -->
<xs:element ref="GPU_OFFLOAD" minOccurs="0" maxOccurs="unbounded"/>
<!-- Optional cost factor per node unit -->
<xs:element ref="COSTPES_PER_NODE" minOccurs="0" maxOccurs="1"/>
<!-- PROJECT_REQUIRED: Does this machine require a project to be specified to
Expand Down Expand Up @@ -249,6 +261,8 @@
<xs:attribute ref="PIO_VERSION"/>
<xs:attribute ref="mpilib"/>
<xs:attribute ref="comp_interface"/>
<xs:attribute ref="gpu_offload"/>
<xs:attribute ref="gpu_type"/>
</xs:complexType>
</xs:element>
<xs:element name="command">
Expand Down
4 changes: 4 additions & 0 deletions CIME/data/config/xml_schemas/env_mach_specific.xsd
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
<xs:attribute name="PIO_VERSION" type="xs:integer"/>
<xs:attribute name="mpilib" type="xs:string"/>
<xs:attribute name="comp_interface" type="xs:string"/>
<xs:attribute name="gpu_type" type="xs:string"/>
<xs:attribute name="gpu_offload" type="xs:string"/>
<xs:attribute name="SMP_PRESENT" type="xs:string"/>
<xs:attribute name="value" type="xs:string"/>
<xs:attribute name="unit_testing" type="xs:boolean"/>
Expand Down Expand Up @@ -102,6 +104,8 @@
<xs:attribute ref="PIO_VERSION" />
<xs:attribute ref="mpilib"/>
<xs:attribute ref="comp_interface"/>
<xs:attribute ref="gpu_type"/>
<xs:attribute ref="gpu_offload"/>
</xs:complexType>
</xs:element>

Expand Down
Loading
Loading