From bb7175cdec2fbf64cbade33c4ebda5c973dcb0ae Mon Sep 17 00:00:00 2001 From: Jian Sun Date: Wed, 22 Mar 2023 23:39:30 -0600 Subject: [PATCH 01/24] Add Jim's changes for Gust's GPU options, based on https://github.com/jedwards4b/cime/compare/28b7431..3f4b1ab modified: CIME/Tools/Makefile modified: CIME/XML/env_batch.py modified: CIME/XML/env_mach_specific.py modified: CIME/build.py modified: CIME/case/case.py modified: CIME/data/config/xml_schemas/config_machines.xsd modified: CIME/data/config/xml_schemas/env_mach_specific.xsd modified: CIME/scripts/create_newcase.py modified: CIME/test_scheduler.py modified: CIME/tests/test_unit_case.py --- CIME/Tools/Makefile | 3 + CIME/XML/env_batch.py | 3 + CIME/XML/env_mach_specific.py | 3 +- CIME/build.py | 14 ++++ CIME/case/case.py | 76 ++++++++++--------- .../config/xml_schemas/config_machines.xsd | 4 + .../config/xml_schemas/env_mach_specific.xsd | 4 + CIME/scripts/create_newcase.py | 18 +++++ CIME/test_scheduler.py | 9 ++- CIME/tests/test_unit_case.py | 4 + 10 files changed, 101 insertions(+), 37 deletions(-) diff --git a/CIME/Tools/Makefile b/CIME/Tools/Makefile index 8cf5ba1104e..d01eeebd0d6 100644 --- a/CIME/Tools/Makefile +++ b/CIME/Tools/Makefile @@ -613,6 +613,9 @@ endif # Remove arch flag if it exists F90_LDFLAGS := $(filter-out -arch%,$(LDFLAGS)) +ifdef GPUFLAGS + F90_LDFLAGS += $(GPUFLAGS) +endif # Machine stuff to appear last on the link step ifndef MLIBS diff --git a/CIME/XML/env_batch.py b/CIME/XML/env_batch.py index 21fba92ad90..5a960119cbe 100644 --- a/CIME/XML/env_batch.py +++ b/CIME/XML/env_batch.py @@ -620,6 +620,9 @@ def _process_args(self, case, submit_arg_nodes, job): if name: if "$" in name: rflag = self._resolve_argument(case, flag, name, job) + # This is to prevent -gpu_type=none in qsub args + if rflag.endswith("=none"): + continue if len(rflag) > len(flag): submitargs += " {}".format(rflag) else: diff --git a/CIME/XML/env_mach_specific.py b/CIME/XML/env_mach_specific.py index 03e84f0faee..4652f2a7d0a 100644 --- a/CIME/XML/env_mach_specific.py +++ b/CIME/XML/env_mach_specific.py @@ -320,7 +320,8 @@ def _compute_resource_actions(self, resource_nodes, case, job=None): def _compute_actions(self, nodes, child_tag, case, job=None): result = [] # list of tuples ("name", "argument") - compiler, mpilib = case.get_value("COMPILER"), case.get_value("MPILIB") + compiler = case.get_value("COMPILER") + mpilib = case.get_value("MPILIB") for node in nodes: if self._match_attribs(self.attrib(node), case, job=job): diff --git a/CIME/build.py b/CIME/build.py index 0e232396f3a..99506ff9bf7 100644 --- a/CIME/build.py +++ b/CIME/build.py @@ -239,6 +239,20 @@ def get_standard_cmake_args(case, sharedpath): cmake_args += " -Dcompile_threaded={} ".format( stringify_bool(case.get_build_threaded()) ) + # check settings for GPU + gpu_type = case.get_value("GPU_TYPE") + gpu_offload = case.get_value("GPU_OFFLOAD") + if gpu_type != "none": + expect( + gpu_offload != "none", + "Both GPU_TYPE and GPU_OFFLOAD must be defined if either is", + ) + cmake_args += f" -DGPU_TYPE={gpu_type} -DGPU_OFFLOAD={gpu_offload}" + else: + expect( + gpu_offload == "none", + "Both GPU_TYPE and GPU_OFFLOAD must be defined if either is", + ) ocn_model = case.get_value("COMP_OCN") atm_model = case.get_value("COMP_ATM") diff --git a/CIME/case/case.py b/CIME/case/case.py index 4924baf8cda..1000f944acd 100644 --- a/CIME/case/case.py +++ b/CIME/case/case.py @@ -123,6 +123,7 @@ def __init__(self, case_root=None, read_only=True, record=False, non_local=False self._env_generic_files = [] self._files = [] self._comp_interface = None + self.gpu_enabled = None self._non_local = non_local self.read_xml() @@ -451,6 +452,12 @@ def get_values(self, item, attribute=None, resolved=True, subgroup=None): return [] def get_value(self, item, attribute=None, resolved=True, subgroup=None): + if item == "GPU_ENABLED": + if self.gpu_enabled == None: + if self.get_value("GPU_TYPE") != "none": + self.gpu_enabled = True + return "true" if self.gpu_enabled else "false" + result = None for env_file in self._files: # Wait and resolve in self rather than in env_file @@ -1262,6 +1269,8 @@ def configure( extra_machines_dir=None, case_group=None, ngpus_per_node=0, + gpu_type=None, + gpu_offload=None, ): expect( @@ -1384,14 +1393,19 @@ def configure( if not dmax: dmax = machobj.get_value(name) if dmax: + print(f"here name is {name} and dmax is {dmax}") self.set_value(name, dmax) elif name == "MAX_GPUS_PER_NODE": logger.debug( - "Variable {} not defined for machine {}".format(name, machine_name) + "Variable {} not defined for machine {} and compiler {}".format( + name, machine_name, compiler + ) ) else: logger.warning( - "Variable {} not defined for machine {}".format(name, machine_name) + "Variable {} not defined for machine {} and compiler {}".format( + name, machine_name, compiler + ) ) machdir = machobj.get_machines_dir() @@ -1509,47 +1523,37 @@ def configure( self.set_value("TEST", True) # ---------------------------------------------------------------------------------------------------------- - # Sanity check: - # 1. We assume that there is always a string "gpu" in the compiler name if we want to enable GPU - # 2. For compilers without the string "gpu" in the name: - # 2.1. the ngpus-per-node argument would not update the NGPUS_PER_NODE XML variable, as long as - # the MAX_GPUS_PER_NODE XML variable is not defined (i.e., this argument is not in effect). - # 2.2. if the MAX_GPUS_PER_NODE XML variable is defined, then the ngpus-per-node argument - # must be set to 0. Otherwise, an error will be triggered. - # 3. For compilers with the string "gpu" in the name: - # 3.1. if ngpus-per-node argument is smaller than 0, an error will be triggered. - # 3.2. if ngpus_per_node argument is larger than the value of MAX_GPUS_PER_NODE, the NGPUS_PER_NODE + # Sanity check for a GPU run: + # 1. GPU_TYPE and GPU_OFFLOAD must both be defined to use GPUS + # 2. if ngpus_per_node argument is larger than the value of MAX_GPUS_PER_NODE, the NGPUS_PER_NODE # XML variable in the env_mach_pes.xml file would be set to MAX_GPUS_PER_NODE automatically. - # 3.3. if ngpus-per-node argument is equal to 0, it will be updated to 1 automatically. + # 3. if ngpus-per-node argument is equal to 0, it will be updated to 1 automatically. # ---------------------------------------------------------------------------------------------------------- max_gpus_per_node = self.get_value("MAX_GPUS_PER_NODE") - if max_gpus_per_node: - if "gpu" in compiler: - if not ngpus_per_node: - ngpus_per_node = 1 - logger.warning( - "Setting ngpus_per_node to 1 for compiler {}".format(compiler) - ) - expect( - ngpus_per_node > 0, - " ngpus_per_node is expected > 0 for compiler {}; current value is {}".format( - compiler, ngpus_per_node - ), - ) - else: - expect( - ngpus_per_node == 0, - " ngpus_per_node is expected = 0 for compiler {}; current value is {}".format( - compiler, ngpus_per_node - ), - ) + if gpu_type: + expect( + max_gpus_per_node, + f"GPUS are not defined for machine={machine_name} and compiler={compiler}", + ) + expect( + gpu_offload, + "Both gpu-type and gpu-offload must be defined if either is defined", + ) + self.set_value("GPU_TYPE", gpu_type) + self.set_value("GPU_OFFLOAD", gpu_offload) + self.gpu_enabled = True if ngpus_per_node >= 0: self.set_value( "NGPUS_PER_NODE", - ngpus_per_node + max(1, ngpus_per_node) if ngpus_per_node <= max_gpus_per_node else max_gpus_per_node, ) + elif gpu_offload: + expect( + False, + "Both gpu-type and gpu-offload must be defined if either is defined", + ) self.initialize_derived_attributes() @@ -2354,6 +2358,8 @@ def create( extra_machines_dir=None, case_group=None, ngpus_per_node=0, + gpu_type=None, + gpu_offload=None, ): try: # Set values for env_case.xml @@ -2427,6 +2433,8 @@ def create( extra_machines_dir=extra_machines_dir, case_group=case_group, ngpus_per_node=ngpus_per_node, + gpu_type=gpu_type, + gpu_offload=gpu_offload, ) self.create_caseroot() diff --git a/CIME/data/config/xml_schemas/config_machines.xsd b/CIME/data/config/xml_schemas/config_machines.xsd index d6e3c280a93..d5cc7d5c597 100644 --- a/CIME/data/config/xml_schemas/config_machines.xsd +++ b/CIME/data/config/xml_schemas/config_machines.xsd @@ -6,6 +6,8 @@ + + @@ -249,6 +251,8 @@ + + diff --git a/CIME/data/config/xml_schemas/env_mach_specific.xsd b/CIME/data/config/xml_schemas/env_mach_specific.xsd index f86c6b9f6e1..7778635592b 100644 --- a/CIME/data/config/xml_schemas/env_mach_specific.xsd +++ b/CIME/data/config/xml_schemas/env_mach_specific.xsd @@ -9,6 +9,8 @@ + + @@ -102,6 +104,8 @@ + + diff --git a/CIME/scripts/create_newcase.py b/CIME/scripts/create_newcase.py index 3faea5d6553..ee3df32dc76 100755 --- a/CIME/scripts/create_newcase.py +++ b/CIME/scripts/create_newcase.py @@ -269,6 +269,18 @@ def parse_command_line(args, cimeroot, description): help="Specify number of GPUs used for simulation. ", ) + parser.add_argument( + "--gpu-type", + default=None, + help="Specify type of GPU hardware - currently supported are v100, a100, mi250", + ) + + parser.add_argument( + "--gpu-offload", + default=None, + help="Specify gpu offload method - currently supported are openacc, openmp, combined", + ) + args = CIME.utils.parse_args_and_handle_standard_logging_options(args, parser) if args.srcroot is not None: @@ -345,6 +357,8 @@ def parse_command_line(args, cimeroot, description): args.extra_machines_dir, args.case_group, args.ngpus_per_node, + args.gpu_type, + args.gpu_offload, ) @@ -382,6 +396,8 @@ def _main_func(description=None): extra_machines_dir, case_group, ngpus_per_node, + gpu_type, + gpu_offload, ) = parse_command_line(sys.argv, cimeroot, description) if script_root is None: @@ -447,6 +463,8 @@ def _main_func(description=None): extra_machines_dir=extra_machines_dir, case_group=case_group, ngpus_per_node=ngpus_per_node, + gpu_type=gpu_type, + gpu_offload=gpu_offload, ) # Called after create since casedir does not exist yet diff --git a/CIME/test_scheduler.py b/CIME/test_scheduler.py index a657e2a6b39..d6de0801cfa 100644 --- a/CIME/test_scheduler.py +++ b/CIME/test_scheduler.py @@ -661,8 +661,13 @@ def _create_newcase_phase(self, test): pesize = case_opt[1:] create_newcase_cmd += " --pecount {}".format(pesize) elif case_opt.startswith("G"): - ngpus_per_node = case_opt[1:] - create_newcase_cmd += " --ngpus-per-node {}".format(ngpus_per_node) + if "-" in case_opt: + ngpus_per_node, gpu_type, gpu_offload = case_opt[1:].split("-") + else: + error = "GPU test argument format is ngpus_per_node-gpu_type-gpu_offload" + self._log_output(test, error) + return False, error + create_newcase_cmd += " --ngpus-per-node {} --gpu-type {} --gpu-offload {}".format(ngpus_per_node,gpu_type,gpu_offload) elif case_opt.startswith("V"): self._cime_driver = case_opt[1:] create_newcase_cmd += " --driver {}".format(self._cime_driver) diff --git a/CIME/tests/test_unit_case.py b/CIME/tests/test_unit_case.py index ed473cea21f..dd4d18edf66 100755 --- a/CIME/tests/test_unit_case.py +++ b/CIME/tests/test_unit_case.py @@ -251,6 +251,8 @@ def test_copy( extra_machines_dir=None, case_group=None, ngpus_per_node=0, + gpu_type=None, + gpu_offload=None, ) create_caseroot.assert_called() apply_user_mods.assert_called() @@ -326,6 +328,8 @@ def test_create( extra_machines_dir=None, case_group=None, ngpus_per_node=0, + gpu_type=None, + gpu_offload=None, ) create_caseroot.assert_called() apply_user_mods.assert_called() From 6667b4061546c807e7f7bfab0c9874e0d1b2b700 Mon Sep 17 00:00:00 2001 From: Jian Sun Date: Thu, 23 Mar 2023 11:11:21 -0600 Subject: [PATCH 02/24] Change the settings for Gust GPU nodes modified: CIME/XML/env_mach_pes.py modified: CIME/case/case.py modified: CIME/config.py modified: doc/source/users_guide/cime-customize.rst --- CIME/XML/env_mach_pes.py | 6 ++++- CIME/case/case.py | 32 ++++++++++++++++++----- CIME/config.py | 2 +- doc/source/users_guide/cime-customize.rst | 2 +- 4 files changed, 32 insertions(+), 10 deletions(-) diff --git a/CIME/XML/env_mach_pes.py b/CIME/XML/env_mach_pes.py index c7635573f95..f5dfa68e889 100644 --- a/CIME/XML/env_mach_pes.py +++ b/CIME/XML/env_mach_pes.py @@ -167,11 +167,15 @@ def get_tasks_per_node(self, total_tasks, max_thread_count): "totaltasks > 0 expected, totaltasks = {}".format(total_tasks), ) if self._comp_interface == "nuopc" and self.get_value("ESMF_AWARE_THREADING"): - tasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE") + if self.get_value("NGPUS_PER_NODE") > 0: + tasks_per_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE") + else: + tasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE") else: tasks_per_node = min( self.get_value("MAX_TASKS_PER_NODE") // max_thread_count, self.get_value("MAX_MPITASKS_PER_NODE"), + self.get_value("MAX_CPUTASKS_PER_GPU_NODE"), total_tasks, ) return tasks_per_node if tasks_per_node > 0 else 1 diff --git a/CIME/case/case.py b/CIME/case/case.py index 1000f944acd..26eacfe2b1f 100644 --- a/CIME/case/case.py +++ b/CIME/case/case.py @@ -123,7 +123,7 @@ def __init__(self, case_root=None, read_only=True, record=False, non_local=False self._env_generic_files = [] self._files = [] self._comp_interface = None - self.gpu_enabled = None + self.gpu_enabled = False self._non_local = non_local self.read_xml() @@ -276,6 +276,9 @@ def initialize_derived_attributes(self): if max_gpus_per_node: self.ngpus_per_node = self.get_value("NGPUS_PER_NODE") + # update the maximum MPI tasks for a GPU node (could differ from a pure-CPU node) + if self.ngpus_per_node > 0: + max_mpitasks_per_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE") self.tasks_per_numa = int(math.ceil(self.tasks_per_node / 2.0)) smt_factor = max( @@ -453,7 +456,7 @@ def get_values(self, item, attribute=None, resolved=True, subgroup=None): def get_value(self, item, attribute=None, resolved=True, subgroup=None): if item == "GPU_ENABLED": - if self.gpu_enabled == None: + if not self.gpu_enabled: if self.get_value("GPU_TYPE") != "none": self.gpu_enabled = True return "true" if self.gpu_enabled else "false" @@ -1353,6 +1356,7 @@ def configure( and "MPILIB" not in x and "MAX_MPITASKS_PER_NODE" not in x and "MAX_TASKS_PER_NODE" not in x + and "MAX_CPUTASKS_PER_GPU_NODE" not in x and "MAX_GPUS_PER_NODE" not in x ] @@ -1387,6 +1391,7 @@ def configure( for name in ( "MAX_TASKS_PER_NODE", "MAX_MPITASKS_PER_NODE", + "MAX_CPUTASKS_PER_GPU_NODE", "MAX_GPUS_PER_NODE", ): dmax = machobj.get_value(name, {"compiler": compiler}) @@ -1395,6 +1400,12 @@ def configure( if dmax: print(f"here name is {name} and dmax is {dmax}") self.set_value(name, dmax) + elif name == "MAX_CPUTASKS_PER_GPU_NODE": + logger.debug( + "Variable {} not defined for machine {} and compiler {}".format( + name, machine_name, compiler + ) + ) elif name == "MAX_GPUS_PER_NODE": logger.debug( "Variable {} not defined for machine {} and compiler {}".format( @@ -2057,11 +2068,18 @@ def get_mpirun_cmd(self, job=None, allow_unresolved_envvars=True, overrides=None ngpus_per_node = self.get_value("NGPUS_PER_NODE") if ngpus_per_node and ngpus_per_node > 0 and config.gpus_use_set_device_rank: - # 1. this setting is tested on Casper only and may not work on other machines - # 2. need to be revisited in the future for a more adaptable implementation - rundir = self.get_value("RUNDIR") - output_name = rundir + "/set_device_rank.sh" - mpi_arg_string = mpi_arg_string + " " + output_name + " " + if self.get_value("MACH") == "Gust": + mpi_arg_string = mpi_arg_string + " get_local_rank " + else: + # this wrapper script only works with OpenMPI library + # has been tested on Casper + expect( + self.get_value("MPILIB") == "openmpi", + "The wrapper script only works with OpenMPI library; {} is currently used".format(self.get_value("MPILIB")), + ) + rundir = self.get_value("RUNDIR") + output_name = rundir + "/set_device_rank.sh" + mpi_arg_string = mpi_arg_string + " " + output_name + " " return self.get_resolved_value( "{} {} {} {}".format( diff --git a/CIME/config.py b/CIME/config.py index 8491b2f3f2e..9666439cb2e 100644 --- a/CIME/config.py +++ b/CIME/config.py @@ -180,7 +180,7 @@ def __init__(self): self._set_attribute( "gpus_use_set_device_rank", True, - desc="If set to `True` and NGPUS_PER_NODE > 0 then `$RUNDIR/set_device_rank.sh` is appended when the MPI run command is generated.", + desc="If set to `True` and NGPUS_PER_NODE > 0 then `$RUNDIR/set_device_rank.sh` or `get_local_rank` (a global script on Gust) is appended when the MPI run command is generated.", ) self._set_attribute( "test_custom_project_machine", diff --git a/doc/source/users_guide/cime-customize.rst b/doc/source/users_guide/cime-customize.rst index ed90e21472a..2c65d1ab954 100644 --- a/doc/source/users_guide/cime-customize.rst +++ b/doc/source/users_guide/cime-customize.rst @@ -44,7 +44,7 @@ default_short_term_archiving True bool If set to `Tr driver_choices ('mct', 'nuopc') tuple Sets the available driver choices for the model. driver_default nuopc str Sets the default driver for the model. enable_smp True bool If set to `True` then `SMP=` is added to model compile command. -gpus_use_set_device_rank True bool If set to `True` and NGPUS_PER_NODE > 0 then `$RUNDIR/set_device_rank.sh` is appended when the MPI run command is generated. +gpus_use_set_device_rank True bool If set to `True` and NGPUS_PER_NODE > 0 then `$RUNDIR/set_device_rank.sh` or `get_local_rank` (a global script on Gust) is appended when the MPI run command is generated. make_case_run_batch_script False bool If set to `True` and case is not a test then `case.run.sh` is created in case directory from `$MACHDIR/template.case.run.sh`. mct_path {srcroot}/libraries/mct str Sets the path to the mct library. serialize_sharedlib_builds True bool If set to `True` then the TestScheduler will use `proc_pool + 1` processors to build shared libraries otherwise a single processor is used. From e0d625ef3e404a77a49c741072bc7d9fdf8bb3ef Mon Sep 17 00:00:00 2001 From: Jian Sun Date: Thu, 23 Mar 2023 22:16:25 -0600 Subject: [PATCH 03/24] Add missing definition of max_cputasks_per_gpu_node modified: CIME/data/config/xml_schemas/config_machines.xsd --- CIME/data/config/xml_schemas/config_machines.xsd | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CIME/data/config/xml_schemas/config_machines.xsd b/CIME/data/config/xml_schemas/config_machines.xsd index d5cc7d5c597..53bd359a503 100644 --- a/CIME/data/config/xml_schemas/config_machines.xsd +++ b/CIME/data/config/xml_schemas/config_machines.xsd @@ -58,6 +58,7 @@ + @@ -168,6 +169,9 @@ + + - + + + + + + +