From 2b0f37047dc9e85176e0cb8ffee88908232372f0 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Mon, 30 Sep 2024 11:28:20 -0400 Subject: [PATCH 01/66] add user quotas to fs by default, 1g per user on each fs --- common/main.tf | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/common/main.tf b/common/main.tf index fbe591f..0bd363e 100644 --- a/common/main.tf +++ b/common/main.tf @@ -44,6 +44,13 @@ locals { home_size = 100 project_size = 100 scratch_size = 100 + + user_quotas = { + home = "1g" + project = "1g" + scratch = "1g" + } + cluster_purpose = "cours_academiques" config_version = "1ba3a12" @@ -158,16 +165,16 @@ locals { volumes_map = { arbutus = { nfs = { - home = { size = try(local.custom.home_size, local.default_pod.home_size) } - project = { size = try(local.custom.project_size, local.default_pod.project_size) } - scratch = { size = try(local.custom.scratch_size, local.default_pod.scratch_size) } + home = { size = try(local.custom.home_size, local.default_pod.home_size), quota = try(local.custom.user_quotas.home, local.default_pod.user_quotas.home) } + project = { size = try(local.custom.project_size, local.default_pod.project_size), quota = try(local.custom.user_quotas.project, local.default_pod.user_quotas.project) } + scratch = { size = try(local.custom.scratch_size, local.default_pod.scratch_size), quota = try(local.custom.user_quotas.scratch, local.default_pod.user_quotas.scratch) } } } beluga = { nfs = { - home = { size = try(local.custom.home_size, local.default_pod.home_size), type = "volumes-ssd" } - project = { size = try(local.custom.project_size, local.default_pod.project_size), type = "volumes-ec" } - scratch = { size = try(local.custom.scratch_size, local.default_pod.scratch_size), type = "volumes-ec" } + home = { size = try(local.custom.home_size, local.default_pod.home_size), type = "volumes-ssd", quota = try(local.custom.user_quotas.home, local.default_pod.user_quotas.home) } + project = { size = try(local.custom.project_size, local.default_pod.project_size), type = "volumes-ec", quota = try(local.custom.user_quotas.project, local.default_pod.user_quotas.project) } + scratch = { size = try(local.custom.scratch_size, local.default_pod.scratch_size), type = "volumes-ec", quota = try(local.custom.user_quotas.scratch, local.default_pod.user_quotas.scratch) } } } } From 9f0c173044c5582130ddd4c50d8b4759d6e5e1c0 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Mon, 30 Sep 2024 11:46:53 -0400 Subject: [PATCH 02/66] add juno to the configuration, make ncpu and friends into a structure, spacing fixes --- common/main.tf | 200 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 149 insertions(+), 51 deletions(-) diff --git a/common/main.tf b/common/main.tf index 0bd363e..47341f4 100644 --- a/common/main.tf +++ b/common/main.tf @@ -36,11 +36,18 @@ locals { image = "Rocky-8" image_cpu = "snapshot-cpunode-2024-R810.5" image_gpu = "snapshot-gpunode-2024-R810.5" - ncpu = 0 - ngpu = 0 - ncpupool = 0 - ngpupool = 0 - nlogin = 1 + + n = { + cpu = 0 + gpu = 0 + cpupool = 0 + gpupool = 0 + login = 1 + gpupool12 = 0 + gpupool16 = 0 + gpupool80 = 0 + } + home_size = 100 project_size = 100 scratch_size = 100 @@ -57,108 +64,192 @@ locals { instances_type_map = { arbutus = { mgmt = "p8-12gb" - login = "c2-7.5gb-31-avx2" - cpu = "c8-30gb-186-avx2" - cpupool = "c8-30gb-186-avx2" - gpu = "g1-8gb-c4-22gb" - gpupool = "g1-8gb-c4-22gb" + login = "c2-7.5gb-31-avx2" + cpu = "c8-30gb-186-avx2" + cpupool = "c8-30gb-186-avx2" + gpu = 
"g1-8gb-c4-22gb" + gpupool = "g1-8gb-c4-22gb" } beluga = { mgmt = "p8-15gb" - login = "p4-7.5gb" - cpu = "c8-60gb" - cpupool = "c8-60gb" - gpu = "gpu32-240-3450gb-a100x1" - gpupool = "gpu32-240-3450gb-a100x1" + login = "p4-7.5gb" + cpu = "c8-60gb" + cpupool = "c8-60gb" + gpu = "gpu32-240-3450gb-a100x1" + gpupool = "gpu32-240-3450gb-a100x1" + } + juno = { + mgmt = "ha4-15gb" + login = "ha4-15gb" + cpu = "c8-30gb" + cpupool = "c8-30gb"" + gpu = "gpu16-240-3375gb-a100x1" + gpupool = "gpu16-240-3375gb-a100x1" + gpupool16 = "gpu16-240-3375gb-a100x1" + gpupool80 = "gpu13-240-2500gb-a100-80gx1" + gpupool12 = "gpu12-120-850gb-a100x1" } } - gpu_mig_config = { "3g.20gb" = 1, "2g.10gb" = 1, "1g.5gb" = 2 } - gpupool_mig_config = { "1g.5gb" = 7 } + mig = { + gpu = { "3g.20gb" = 1, "2g.10gb" = 1, "1g.5gb" = 2 } + gpupool = { "1g.5gb" = 7 } + gpupool16 = { "1g.5gb" = 7 } + gpupool80 = { "1g.10gb" = 7 } + gpupool12 = { "1g.5gb" = 7 } + } } default = { instances_map = { arbutus = { mgmt = { - type = try(local.custom.instances_type_map.arbutus.mgmt, local.default_pod.instances_type_map.arbutus.mgmt), - tags = ["puppet", "mgmt", "nfs"], - disk_size = 20, - count = 1 - } + type = try(local.custom.instances_type_map.arbutus.mgmt, local.default_pod.instances_type_map.arbutus.mgmt), + tags = ["puppet", "mgmt", "nfs"], + disk_size = 20, + count = 1 + } login = { - type = try(local.custom.instances_type_map.arbutus.login, local.default_pod.instances_type_map.arbutus.login), - tags = ["login", "public", "proxy"], - disk_size = 20, - count = 1 - } + type = try(local.custom.instances_type_map.arbutus.login, local.default_pod.instances_type_map.arbutus.login), + tags = ["login", "public", "proxy"], + disk_size = 20, + count = 1 + } nodecpu = { type = try(local.custom.instances_type_map.arbutus.cpu, local.default_pod.instances_type_map.arbutus.cpu), tags = ["node"], - count = try(local.custom.ncpu, local.default_pod.ncpu), + count = try(local.custom.n.cpu, local.default_pod.n.cpu), image = try(local.custom.image_cpu, local.default_pod.image_cpu), } nodecpupool = { type = try(local.custom.instances_type_map.arbutus.cpupool, local.default_pod.instances_type_map.arbutus.cpupool), tags = ["node", "pool"], - count = try(local.custom.ncpupool, local.default_pod.ncpupool), + count = try(local.custom.n.cpupool, local.default_pod.n.cpupool), image = try(local.custom.image_cpu, local.default_pod.image_cpu), } nodegpu = { type = try(local.custom.instances_type_map.arbutus.gpu, local.default_pod.instances_type_map.arbutus.gpu), tags = ["node"], - count = try(local.custom.ngpu, local.default_pod.ngpu), + count = try(local.custom.n.gpu, local.default_pod.n.gpu), image = try(local.custom.image_gpu, local.default_pod.image_gpu), } nodegpupool = { type = try(local.custom.instances_type_map.arbutus.gpupool, local.default_pod.instances_type_map.arbutus.gpupool), tags = ["node", "pool"], - count = try(local.custom.ngpupool, local.default_pod.ngpupool), + count = try(local.custom.n.gpupool, local.default_pod.n.gpupool), image = try(local.custom.image_gpu, local.default_pod.image_gpu), } } beluga = { mgmt = { - type = try(local.custom.instances_type_map.beluga.mgmt, local.default_pod.instances_type_map.beluga.mgmt), - tags = ["puppet", "mgmt", "nfs"], - disk_size = 20, - count = 1 - } + type = try(local.custom.instances_type_map.beluga.mgmt, local.default_pod.instances_type_map.beluga.mgmt), + tags = ["puppet", "mgmt", "nfs"], + disk_size = 20, + count = 1 + } login = { - type = try(local.custom.instances_type_map.beluga.login, 
local.default_pod.instances_type_map.beluga.login), - tags = ["login", "public", "proxy"], - disk_size = 20, - count = try(local.custom.nlogin, local.default_pod.nlogin) - } + type = try(local.custom.instances_type_map.beluga.login, local.default_pod.instances_type_map.beluga.login), + tags = ["login", "public", "proxy"], + disk_size = 20, + count = try(local.custom.n.login, local.default_pod.n.login) + } nodecpu = { type = try(local.custom.instances_type_map.beluga.cpu, local.default_pod.instances_type_map.beluga.cpu), - disk_size = 20 + disk_size = 20 tags = ["node"], - count = try(local.custom.ncpu, local.default_pod.ncpu), + count = try(local.custom.n.cpu, local.default_pod.n.cpu), image = try(local.custom.image_cpu, local.default_pod.image_cpu), } nodecpupool = { type = try(local.custom.instances_type_map.beluga.cpupool, local.default_pod.instances_type_map.beluga.cpupool), - disk_size = 20 + disk_size = 20 tags = ["node", "pool"], - count = try(local.custom.ncpupool, local.default_pod.ncpupool), + count = try(local.custom.n.cpupool, local.default_pod.n.cpupool), image = try(local.custom.image_cpu, local.default_pod.image_cpu), } nodegpu = { type = try(local.custom.instances_type_map.beluga.gpu, local.default_pod.instances_type_map.beluga.gpu), tags = ["node"], - count = try(local.custom.ngpu, local.default_pod.ngpu), - mig = try(local.custom.gpu_mig_config, local.default_pod.gpu_mig_config) + count = try(local.custom.n.gpu, local.default_pod.n.gpu), + mig = try(local.custom.mig.gpu, local.default_pod.mig.gpu) image = try(local.custom.image_gpu, local.default_pod.image_gpu), - disk_size = "50" + disk_size = "50" } nodegpupool = { type = try(local.custom.instances_type_map.beluga.gpupool, local.default_pod.instances_type_map.beluga.gpupool), tags = ["node", "pool"], - count = try(local.custom.ngpupool, local.default_pod.ngpupool), - mig = try(local.custom.gpupool_mig_config, local.default_pod.gpupool_mig_config) + count = try(local.custom.n.gpupool, local.default_pod.n.gpupool), + mig = try(local.custom.mig.gpupool, local.default_pod.mig.gpupool) image = try(local.custom.image_gpu, local.default_pod.image_gpu), - disk_size = "50" + disk_size = "50" + } + } + juno = { + mgmt = { + type = try(local.custom.instances_type_map.juno.mgmt, local.default_pod.instances_type_map.juno.mgmt), + tags = ["puppet", "mgmt", "nfs"], + disk_size = 20, + count = 1 + } + login = { + type = try(local.custom.instances_type_map.juno.login, local.default_pod.instances_type_map.juno.login), + tags = ["login", "public", "proxy"], + disk_size = 20, + count = try(local.custom.n.login, local.default_pod.n.login) + } + nodecpu = { + type = try(local.custom.instances_type_map.juno.cpu, local.default_pod.instances_type_map.juno.cpu), + disk_size = 20 + tags = ["node"], + count = try(local.custom.n.cpu, local.default_pod.n.cpu), + image = try(local.custom.image_cpu, local.default_pod.image_cpu), + } + nodecpupool = { + type = try(local.custom.instances_type_map.juno.cpupool, local.default_pod.instances_type_map.juno.cpupool), + disk_size = 20 + tags = ["node", "pool"], + count = try(local.custom.n.cpupool, local.default_pod.n.cpupool), + image = try(local.custom.image_cpu, local.default_pod.image_cpu), + } + nodegpu = { + type = try(local.custom.instances_type_map.juno.gpu, local.default_pod.instances_type_map.juno.gpu), + tags = ["node"], + count = try(local.custom.n.gpu, local.default_pod.n.gpu), + mig = try(local.custom.mig.gpu, local.default_pod.mig.gpu) + image = try(local.custom.image_gpu, 
local.default_pod.image_gpu), + disk_size = "50" + } + nodegpupool = { + type = try(local.custom.instances_type_map.juno.gpupool, local.default_pod.instances_type_map.juno.gpupool16), + tags = ["node", "pool"], + count = try(local.custom.n.gpupool, 0), + mig = try(local.custom.mig.gpupool, local.default_pod.mig.gpupool) + image = try(local.custom.image_gpu, local.default_pod.image_gpu), + disk_size = "50" + } + nodegpupool16 = { + type = try(local.custom.instances_type_map.juno.gpupool16, local.default_pod.instances_type_map.juno.gpupool16), + tags = ["node", "pool"], + count = try(local.custom.n.gpupool16, 0), + mig = try(local.custom.mig.gpupool16, local.default_pod.mig.gpupool16) + image = try(local.custom.image_gpu, local.default_pod.image_gpu), + disk_size = "50" + } + nodegpupool12 = { + type = try(local.custom.instances_type_map.juno.gpupool12, local.default_pod.instances_type_map.juno.gpupool12), + tags = ["node", "pool"], + count = try(local.custom.n.gpupool12, 0), + mig = try(local.custom.mig.gpupool12, local.default_pod.mig.gpupool12) + image = try(local.custom.image_gpu, local.default_pod.image_gpu), + disk_size = "50" + } + nodegpupool80 = { + type = try(local.custom.instances_type_map.juno.gpupool80, local.default_pod.instances_type_map.juno.gpupool80), + tags = ["node", "pool"], + count = try(local.custom.n.gpupool80, 0), + mig = try(local.custom.mig.gpupool80, local.default_pod.mig.gpupool80) + image = try(local.custom.image_gpu, local.default_pod.image_gpu), + disk_size = "50" } } } @@ -177,6 +268,13 @@ locals { scratch = { size = try(local.custom.scratch_size, local.default_pod.scratch_size), type = "volumes-ec", quota = try(local.custom.user_quotas.scratch, local.default_pod.user_quotas.scratch) } } } + juno = { + nfs = { + home = { size = try(local.custom.home_size, local.default_pod.home_size), quota = try(local.custom.user_quotas.home, local.default_pod.user_quotas.home) } + project = { size = try(local.custom.project_size, local.default_pod.project_size), quota = try(local.custom.user_quotas.project, local.default_pod.user_quotas.project) } + scratch = { size = try(local.custom.scratch_size, local.default_pod.scratch_size), quota = try(local.custom.user_quotas.scratch, local.default_pod.user_quotas.scratch) } + } + } } } From dc85fe582bd25db106199a3caf0900edd87d2247 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Mon, 30 Sep 2024 11:55:17 -0400 Subject: [PATCH 03/66] initial config for mcgill scs --- mcgill-scs/config.yaml | 47 ++++++++++++++++++++++++++++++++++++++++++ mcgill-scs/custom.tf | 27 ++++++++++++++++++++++++ mcgill-scs/main.tf | 1 + 3 files changed, 75 insertions(+) create mode 100644 mcgill-scs/config.yaml create mode 100644 mcgill-scs/custom.tf create mode 120000 mcgill-scs/main.tf diff --git a/mcgill-scs/config.yaml b/mcgill-scs/config.yaml new file mode 100644 index 0000000..aab0fc8 --- /dev/null +++ b/mcgill-scs/config.yaml @@ -0,0 +1,47 @@ +jupyterhub::jupyterhub_config_hash: + SbatchForm: + runtime: + min: 3.5 + def: 3.5 + max: 5.0 + nprocs: + min: 1 + def: 1 + max: 1 + memory: + min: 1024 + max: 2048 + def: 2048 + oversubscribe: + def: true + lock: true + ui: + def: 'lab' + SlurmFormSpawner: + disable_form: false + +profile::software_stack::lmod_default_modules: ['StdEnv/2023', 'gcc/12.3', 'openmpi/4.1.5', 'python/3.10', 'ipython-kernel/3.10'] + +profile::users::ldap::users: + dummy_cours1: + count: 1 + groups: ['def-cours1'] + + dummy_cours2: + count: 1 + groups: ['def-cours2'] + + dummy_cours3: + count: 1 + groups: ['def-cours3'] + 
+profile::slurm::accounting::accounts: + def-cours1: + Fairshare: 1 + MaxJobs: 1 + def-cours2: + Fairshare: 1 + MaxJobs: 10 + def-cours3: + Fairshare: 1 + MaxJobs: 10 diff --git a/mcgill-scs/custom.tf b/mcgill-scs/custom.tf new file mode 100644 index 0000000..210eee5 --- /dev/null +++ b/mcgill-scs/custom.tf @@ -0,0 +1,27 @@ +locals { + custom = { + n = { + cpu = 1 + cpupool = 1 + gpupool16 = 1 + gpupool12 = 1 + gpupool80 = 1 + } + home_size = 100 + project_size = 500 + scratch_size = 400 + + user_quotas = { + home = "1g" + project = "2g" + scratch = "4g" + } + + image_cpu = "Rocky-8" #snapshot-cpunode-2024-R810.4" + image_gpu = "Rocky-82 #snapshot-gpunode-2024-R810.4" + + config_version = "ef3e870" + } + + name = "mcgill-scs" +} diff --git a/mcgill-scs/main.tf b/mcgill-scs/main.tf new file mode 120000 index 0000000..4a4ab61 --- /dev/null +++ b/mcgill-scs/main.tf @@ -0,0 +1 @@ +../common/main.tf \ No newline at end of file From 112dcb0f0e6598be073ad477135eb7bf721eb33f Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Mon, 30 Sep 2024 11:57:35 -0400 Subject: [PATCH 04/66] fix typo --- common/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/main.tf b/common/main.tf index 47341f4..ebb5ef5 100644 --- a/common/main.tf +++ b/common/main.tf @@ -82,7 +82,7 @@ locals { mgmt = "ha4-15gb" login = "ha4-15gb" cpu = "c8-30gb" - cpupool = "c8-30gb"" + cpupool = "c8-30gb" gpu = "gpu16-240-3375gb-a100x1" gpupool = "gpu16-240-3375gb-a100x1" gpupool16 = "gpu16-240-3375gb-a100x1" From ab0cb4161b4423cadda340646952a4fbe99ef64a Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Mon, 30 Sep 2024 12:05:08 -0400 Subject: [PATCH 05/66] fix typo --- mcgill-scs/custom.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mcgill-scs/custom.tf b/mcgill-scs/custom.tf index 210eee5..d42e46c 100644 --- a/mcgill-scs/custom.tf +++ b/mcgill-scs/custom.tf @@ -18,7 +18,7 @@ locals { } image_cpu = "Rocky-8" #snapshot-cpunode-2024-R810.4" - image_gpu = "Rocky-82 #snapshot-gpunode-2024-R810.4" + image_gpu = "Rocky-8" #snapshot-gpunode-2024-R810.4" config_version = "ef3e870" } From c1b5eaf1d42903b48d567cbb484fa0eb3e8a0297 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Mon, 30 Sep 2024 13:33:00 -0400 Subject: [PATCH 06/66] define network parameters, required for Juno --- common/main.tf | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/common/main.tf b/common/main.tf index ebb5ef5..0f0ab6a 100644 --- a/common/main.tf +++ b/common/main.tf @@ -98,6 +98,21 @@ locals { gpupool80 = { "1g.10gb" = 7 } gpupool12 = { "1g.5gb" = 7 } } + + network_map = { + arbutus = { + subnet_id = null + os_ext_network = null + } + beluga = { + subnet_id = null + os_ext_network = null + } + juno = { + subnet_id = "40981fb8-8421-455f-b691-75e5f52545f5" + os_ext_network = "Public-Network" + } + } } default = { @@ -323,6 +338,9 @@ module "openstack" { guest_passwd = "" hieradata = local.hieradata + + subnet_id = local.default_pod.network_map[var.cloud_name].subnet_id + os_ext_network = local.default_pod.network_map[var.cloud_name].os_ext_network } output "accounts" { From 749e10ede5cef57d61e82e488642658cf8174577 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Mon, 30 Sep 2024 15:22:51 -0400 Subject: [PATCH 07/66] removing static node, using cpu snapshot for new cpu nodes --- mcgill-scs/custom.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mcgill-scs/custom.tf b/mcgill-scs/custom.tf index d42e46c..ed5decb 100644 --- 
a/mcgill-scs/custom.tf +++ b/mcgill-scs/custom.tf @@ -1,7 +1,7 @@ locals { custom = { n = { - cpu = 1 + cpu = 0 cpupool = 1 gpupool16 = 1 gpupool12 = 1 @@ -17,7 +17,7 @@ locals { scratch = "4g" } - image_cpu = "Rocky-8" #snapshot-cpunode-2024-R810.4" + image_cpu = "snapshot-cpunode-2024-R810.5" image_gpu = "Rocky-8" #snapshot-gpunode-2024-R810.4" config_version = "ef3e870" From 94d1e919968dee80e8c7910d42bcf0df006312c1 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Mon, 30 Sep 2024 15:47:45 -0400 Subject: [PATCH 08/66] add nvidia-driver-cuda to the passthrough packages --- common/config.yaml | 11 ++++++ mcgill-scs/config.yaml | 78 ++++++++++++++++++++++-------------------- 2 files changed, 51 insertions(+), 38 deletions(-) diff --git a/common/config.yaml b/common/config.yaml index 1856e5a..46aa008 100644 --- a/common/config.yaml +++ b/common/config.yaml @@ -22,3 +22,14 @@ prometheus::remote_write_configs: basic_auth: username: 'cqformation' password: "%{alias('prometheus_password')}" + +profile::gpu::install::passthrough::packages: + - nvidia-driver-cuda + - nvidia-driver-cuda-libs + - nvidia-driver + - nvidia-driver-devel + - nvidia-driver-libs + - nvidia-driver-NVML + - nvidia-modprobe + - nvidia-xconfig + - nvidia-persistenced diff --git a/mcgill-scs/config.yaml b/mcgill-scs/config.yaml index aab0fc8..89bdcc8 100644 --- a/mcgill-scs/config.yaml +++ b/mcgill-scs/config.yaml @@ -1,47 +1,49 @@ jupyterhub::jupyterhub_config_hash: - SbatchForm: - runtime: - min: 3.5 - def: 3.5 - max: 5.0 - nprocs: - min: 1 - def: 1 - max: 1 - memory: - min: 1024 - max: 2048 - def: 2048 - oversubscribe: - def: true - lock: true - ui: - def: 'lab' - SlurmFormSpawner: - disable_form: false + SbatchForm: + runtime: + min: 3.5 + def: 3.5 + max: 5.0 + nprocs: + min: 1 + def: 1 + max: 1 + memory: + min: 1024 + max: 2048 + def: 2048 + oversubscribe: + def: true + lock: true + ui: + def: 'lab' + SlurmFormSpawner: + disable_form: false profile::software_stack::lmod_default_modules: ['StdEnv/2023', 'gcc/12.3', 'openmpi/4.1.5', 'python/3.10', 'ipython-kernel/3.10'] profile::users::ldap::users: - dummy_cours1: - count: 1 - groups: ['def-cours1'] + dummy_cours1: + count: 1 + groups: ['def-cours1'] - dummy_cours2: - count: 1 - groups: ['def-cours2'] + dummy_cours2: + count: 1 + groups: ['def-cours2'] - dummy_cours3: - count: 1 - groups: ['def-cours3'] + dummy_cours3: + count: 1 + groups: ['def-cours3'] profile::slurm::accounting::accounts: - def-cours1: - Fairshare: 1 - MaxJobs: 1 - def-cours2: - Fairshare: 1 - MaxJobs: 10 - def-cours3: - Fairshare: 1 - MaxJobs: 10 + def-cours1: + Fairshare: 1 + MaxJobs: 1 + def-cours2: + Fairshare: 1 + MaxJobs: 10 + def-cours3: + Fairshare: 1 + MaxJobs: 10 + + From 65ef7b8d9c269c05a05b5b198a71097409fedf40 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Mon, 30 Sep 2024 15:56:23 -0400 Subject: [PATCH 09/66] use snapshot for gpu nodes too --- mcgill-scs/custom.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mcgill-scs/custom.tf b/mcgill-scs/custom.tf index ed5decb..59b7bb7 100644 --- a/mcgill-scs/custom.tf +++ b/mcgill-scs/custom.tf @@ -18,7 +18,7 @@ locals { } image_cpu = "snapshot-cpunode-2024-R810.5" - image_gpu = "Rocky-8" #snapshot-gpunode-2024-R810.4" + image_gpu = "snapshot-gpunode-2024-R810.5" config_version = "ef3e870" } From c34c2441efa9d99b7af60ea660f1862be97ce221 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Mon, 30 Sep 2024 16:03:49 -0400 Subject: [PATCH 10/66] remove disk size for gpu nodes --- common/main.tf | 2 -- 1 file 
changed, 2 deletions(-) diff --git a/common/main.tf b/common/main.tf index 0f0ab6a..7c5c0f9 100644 --- a/common/main.tf +++ b/common/main.tf @@ -256,7 +256,6 @@ locals { count = try(local.custom.n.gpupool12, 0), mig = try(local.custom.mig.gpupool12, local.default_pod.mig.gpupool12) image = try(local.custom.image_gpu, local.default_pod.image_gpu), - disk_size = "50" } nodegpupool80 = { type = try(local.custom.instances_type_map.juno.gpupool80, local.default_pod.instances_type_map.juno.gpupool80), @@ -264,7 +263,6 @@ locals { count = try(local.custom.n.gpupool80, 0), mig = try(local.custom.mig.gpupool80, local.default_pod.mig.gpupool80) image = try(local.custom.image_gpu, local.default_pod.image_gpu), - disk_size = "50" } } } From c54a2878b1505e7505fae2932db752625c4202fb Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Mon, 30 Sep 2024 16:07:52 -0400 Subject: [PATCH 11/66] re-create new snapshot from smaller node --- common/main.tf | 2 ++ mcgill-scs/custom.tf | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/common/main.tf b/common/main.tf index 7c5c0f9..0f0ab6a 100644 --- a/common/main.tf +++ b/common/main.tf @@ -256,6 +256,7 @@ locals { count = try(local.custom.n.gpupool12, 0), mig = try(local.custom.mig.gpupool12, local.default_pod.mig.gpupool12) image = try(local.custom.image_gpu, local.default_pod.image_gpu), + disk_size = "50" } nodegpupool80 = { type = try(local.custom.instances_type_map.juno.gpupool80, local.default_pod.instances_type_map.juno.gpupool80), @@ -263,6 +264,7 @@ locals { count = try(local.custom.n.gpupool80, 0), mig = try(local.custom.mig.gpupool80, local.default_pod.mig.gpupool80) image = try(local.custom.image_gpu, local.default_pod.image_gpu), + disk_size = "50" } } } diff --git a/mcgill-scs/custom.tf b/mcgill-scs/custom.tf index 59b7bb7..0477106 100644 --- a/mcgill-scs/custom.tf +++ b/mcgill-scs/custom.tf @@ -18,7 +18,7 @@ locals { } image_cpu = "snapshot-cpunode-2024-R810.5" - image_gpu = "snapshot-gpunode-2024-R810.5" + image_gpu = "Rocky-8" # snapshot-gpunode-2024-R810.5" config_version = "ef3e870" } From 4ede73deac8e18b01ac007a56cfe712b9455b7ec Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Mon, 30 Sep 2024 16:37:22 -0400 Subject: [PATCH 12/66] try new snapshot for new pool node, add pool nodes --- mcgill-scs/custom.tf | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mcgill-scs/custom.tf b/mcgill-scs/custom.tf index 0477106..66295b1 100644 --- a/mcgill-scs/custom.tf +++ b/mcgill-scs/custom.tf @@ -2,10 +2,10 @@ locals { custom = { n = { cpu = 0 - cpupool = 1 - gpupool16 = 1 - gpupool12 = 1 - gpupool80 = 1 + cpupool = 2 + gpupool16 = 2 + gpupool12 = 2 + gpupool80 = 2 } home_size = 100 project_size = 500 @@ -18,7 +18,7 @@ locals { } image_cpu = "snapshot-cpunode-2024-R810.5" - image_gpu = "Rocky-8" # snapshot-gpunode-2024-R810.5" + image_gpu = "snapshot-gpunode-2024-R810.5" config_version = "ef3e870" } From 003ac591a62220d82d7cd84cbd33dddcc2c16820 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Mon, 30 Sep 2024 17:01:54 -0400 Subject: [PATCH 13/66] reduce suspend_time of Slurm --- common/config.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common/config.yaml b/common/config.yaml index 46aa008..ccafdb2 100644 --- a/common/config.yaml +++ b/common/config.yaml @@ -6,6 +6,8 @@ jupyterhub::jupyterhub_config_hash: profile::freeipa::mokey::require_verify_admin: false profile::slurm::base::slurm_version: '23.02' +# when using snapshots, it is quick enough to boot nodes that 900 seconds is 
enough for suspend +profile::slurm::base::suspend_time: 900 prometheus::global_config: scrape_interval: '1m' From f52030763e420901047070d3406cbf7c518552b5 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Tue, 1 Oct 2024 08:58:48 -0400 Subject: [PATCH 14/66] restrict number of MIGs to 1, add more GPU nodes in the pool --- mcgill-scs/config.yaml | 3 +++ mcgill-scs/custom.tf | 12 +++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/mcgill-scs/config.yaml b/mcgill-scs/config.yaml index 89bdcc8..f1de904 100644 --- a/mcgill-scs/config.yaml +++ b/mcgill-scs/config.yaml @@ -15,6 +15,9 @@ jupyterhub::jupyterhub_config_hash: oversubscribe: def: true lock: true + gpus: + def: 'gpu:0' + choices: ['gpu:0', 'gpu:1g.5gb:1', 'gpu:3g.20gb:1', 'gpu:4g.20gb:1', 'gpu:2g.20gb:1' ] ui: def: 'lab' SlurmFormSpawner: diff --git a/mcgill-scs/custom.tf b/mcgill-scs/custom.tf index 66295b1..93ebcac 100644 --- a/mcgill-scs/custom.tf +++ b/mcgill-scs/custom.tf @@ -3,9 +3,9 @@ locals { n = { cpu = 0 cpupool = 2 - gpupool16 = 2 - gpupool12 = 2 - gpupool80 = 2 + gpupool16 = 16 + gpupool12 = 4 + gpupool80 = 8 } home_size = 100 project_size = 500 @@ -17,6 +17,12 @@ locals { scratch = "4g" } + mig = { + gpupool16 = { "1g.5gb" = 7 } + gpupool12 = { "3g.20gb" = 1, "4g.20gb" = 1 } + gpupool80 = { "2g.20gb" = 3 } + } + image_cpu = "snapshot-cpunode-2024-R810.5" image_gpu = "snapshot-gpunode-2024-R810.5" From b3fc7b6a77efafe635443e13b2d42b46ea7ca14c Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Tue, 1 Oct 2024 16:15:08 -0400 Subject: [PATCH 15/66] increase the suspend time to 1 day temporarily, for debugging --- mcgill-scs/config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/mcgill-scs/config.yaml b/mcgill-scs/config.yaml index f1de904..2e472bf 100644 --- a/mcgill-scs/config.yaml +++ b/mcgill-scs/config.yaml @@ -24,6 +24,7 @@ jupyterhub::jupyterhub_config_hash: disable_form: false profile::software_stack::lmod_default_modules: ['StdEnv/2023', 'gcc/12.3', 'openmpi/4.1.5', 'python/3.10', 'ipython-kernel/3.10'] +profile::slurm::base::suspend_time: 86400 profile::users::ldap::users: dummy_cours1: From fcd0d4bed7596e3c36662880a3165264221eb6c6 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Tue, 1 Oct 2024 16:44:27 -0400 Subject: [PATCH 16/66] up commit of MC to latest, pin GPU drivers to 550 --- common/config.yaml | 13 ++----------- common/main.tf | 2 +- mcgill-scs/custom.tf | 4 ++-- 3 files changed, 5 insertions(+), 14 deletions(-) diff --git a/common/config.yaml b/common/config.yaml index ccafdb2..2975472 100644 --- a/common/config.yaml +++ b/common/config.yaml @@ -7,7 +7,8 @@ jupyterhub::jupyterhub_config_hash: profile::freeipa::mokey::require_verify_admin: false profile::slurm::base::slurm_version: '23.02' # when using snapshots, it is quick enough to boot nodes that 900 seconds is enough for suspend -profile::slurm::base::suspend_time: 900 +profile::slurm::base::suspend_time: 10800 +profile::gpu::install::passthrough::nvidia_driver_stream: '550-dkms' prometheus::global_config: scrape_interval: '1m' @@ -25,13 +26,3 @@ prometheus::remote_write_configs: username: 'cqformation' password: "%{alias('prometheus_password')}" -profile::gpu::install::passthrough::packages: - - nvidia-driver-cuda - - nvidia-driver-cuda-libs - - nvidia-driver - - nvidia-driver-devel - - nvidia-driver-libs - - nvidia-driver-NVML - - nvidia-modprobe - - nvidia-xconfig - - nvidia-persistenced diff --git a/common/main.tf b/common/main.tf index 0f0ab6a..131dc5d 100644 --- a/common/main.tf +++ 
b/common/main.tf @@ -59,7 +59,7 @@ locals { } cluster_purpose = "cours_academiques" - config_version = "1ba3a12" + config_version = "2972853" instances_type_map = { arbutus = { diff --git a/mcgill-scs/custom.tf b/mcgill-scs/custom.tf index 93ebcac..24771c1 100644 --- a/mcgill-scs/custom.tf +++ b/mcgill-scs/custom.tf @@ -24,9 +24,9 @@ locals { } image_cpu = "snapshot-cpunode-2024-R810.5" - image_gpu = "snapshot-gpunode-2024-R810.5" + image_gpu = "Rocky-8" # snapshot-gpunode-2024-R810.5" - config_version = "ef3e870" + config_version = "2972853" } name = "mcgill-scs" From e7569b36fc56de0970502479560ef2439b36e3c1 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Tue, 1 Oct 2024 17:20:13 -0400 Subject: [PATCH 17/66] removed cpupool nodes --- mcgill-scs/custom.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mcgill-scs/custom.tf b/mcgill-scs/custom.tf index 24771c1..35cb0cf 100644 --- a/mcgill-scs/custom.tf +++ b/mcgill-scs/custom.tf @@ -2,7 +2,7 @@ locals { custom = { n = { cpu = 0 - cpupool = 2 + cpupool = 0 gpupool16 = 16 gpupool12 = 4 gpupool80 = 8 From cfef6c000d168ffae75f206f829af629d7115ab1 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Tue, 1 Oct 2024 17:42:39 -0400 Subject: [PATCH 18/66] update snapshot for gpu node --- mcgill-scs/custom.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mcgill-scs/custom.tf b/mcgill-scs/custom.tf index 35cb0cf..21646df 100644 --- a/mcgill-scs/custom.tf +++ b/mcgill-scs/custom.tf @@ -24,7 +24,7 @@ locals { } image_cpu = "snapshot-cpunode-2024-R810.5" - image_gpu = "Rocky-8" # snapshot-gpunode-2024-R810.5" + image_gpu = "snapshot-gpunode-2024-R810.5" config_version = "2972853" } From afe49590c6e123a3d4e5673aee1b97702eb9ee80 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Tue, 1 Oct 2024 17:46:57 -0400 Subject: [PATCH 19/66] reduce suspend_time to 15 minutes --- common/config.yaml | 2 +- mcgill-scs/config.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/common/config.yaml b/common/config.yaml index 2975472..a089f15 100644 --- a/common/config.yaml +++ b/common/config.yaml @@ -7,7 +7,7 @@ jupyterhub::jupyterhub_config_hash: profile::freeipa::mokey::require_verify_admin: false profile::slurm::base::slurm_version: '23.02' # when using snapshots, it is quick enough to boot nodes that 900 seconds is enough for suspend -profile::slurm::base::suspend_time: 10800 +profile::slurm::base::suspend_time: 900 profile::gpu::install::passthrough::nvidia_driver_stream: '550-dkms' prometheus::global_config: diff --git a/mcgill-scs/config.yaml b/mcgill-scs/config.yaml index 2e472bf..7eb5143 100644 --- a/mcgill-scs/config.yaml +++ b/mcgill-scs/config.yaml @@ -24,7 +24,7 @@ jupyterhub::jupyterhub_config_hash: disable_form: false profile::software_stack::lmod_default_modules: ['StdEnv/2023', 'gcc/12.3', 'openmpi/4.1.5', 'python/3.10', 'ipython-kernel/3.10'] -profile::slurm::base::suspend_time: 86400 +#profile::slurm::base::suspend_time: 86400 profile::users::ldap::users: dummy_cours1: From fed018cff6207ca6d25ec5cc832f687ba99a3a12 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Tue, 1 Oct 2024 17:52:06 -0400 Subject: [PATCH 20/66] destroy all gpupool nodes --- mcgill-scs/custom.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mcgill-scs/custom.tf b/mcgill-scs/custom.tf index 21646df..146a1e6 100644 --- a/mcgill-scs/custom.tf +++ b/mcgill-scs/custom.tf @@ -3,9 +3,9 @@ locals { n = { cpu = 0 cpupool = 0 - gpupool16 = 16 - gpupool12 = 4 - gpupool80 = 8 + 
gpupool16 = 0 #16 + gpupool12 = 0 #4 + gpupool80 = 0 #8 } home_size = 100 project_size = 500 From b8ae4327c816744adf97c8b737af1cd14593218f Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Tue, 1 Oct 2024 17:53:05 -0400 Subject: [PATCH 21/66] re-add gpupool nodes --- mcgill-scs/custom.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mcgill-scs/custom.tf b/mcgill-scs/custom.tf index 146a1e6..21646df 100644 --- a/mcgill-scs/custom.tf +++ b/mcgill-scs/custom.tf @@ -3,9 +3,9 @@ locals { n = { cpu = 0 cpupool = 0 - gpupool16 = 0 #16 - gpupool12 = 0 #4 - gpupool80 = 0 #8 + gpupool16 = 16 + gpupool12 = 4 + gpupool80 = 8 } home_size = 100 project_size = 500 From e0a795b37de4d0316cb25f4e2c5fa4266e460039 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Wed, 2 Oct 2024 08:58:48 -0400 Subject: [PATCH 22/66] test with drivers 555-dkms --- mcgill-scs/config.yaml | 1 + mcgill-scs/custom.tf | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/mcgill-scs/config.yaml b/mcgill-scs/config.yaml index 7eb5143..dbbb7a6 100644 --- a/mcgill-scs/config.yaml +++ b/mcgill-scs/config.yaml @@ -25,6 +25,7 @@ jupyterhub::jupyterhub_config_hash: profile::software_stack::lmod_default_modules: ['StdEnv/2023', 'gcc/12.3', 'openmpi/4.1.5', 'python/3.10', 'ipython-kernel/3.10'] #profile::slurm::base::suspend_time: 86400 +profile::gpu::install::passthrough::nvidia_driver_stream: '555-dkms' profile::users::ldap::users: dummy_cours1: diff --git a/mcgill-scs/custom.tf b/mcgill-scs/custom.tf index 21646df..f7c8532 100644 --- a/mcgill-scs/custom.tf +++ b/mcgill-scs/custom.tf @@ -24,7 +24,7 @@ locals { } image_cpu = "snapshot-cpunode-2024-R810.5" - image_gpu = "snapshot-gpunode-2024-R810.5" + image_gpu = "Rocky-8" #snapshot-gpunode-2024-R810.5" config_version = "2972853" } From 5b84c5b9b84a9e7a855781de12efb8209f76fda8 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Wed, 2 Oct 2024 09:49:45 -0400 Subject: [PATCH 23/66] retry with dkms-560 --- mcgill-scs/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mcgill-scs/config.yaml b/mcgill-scs/config.yaml index dbbb7a6..ab730a0 100644 --- a/mcgill-scs/config.yaml +++ b/mcgill-scs/config.yaml @@ -25,7 +25,7 @@ jupyterhub::jupyterhub_config_hash: profile::software_stack::lmod_default_modules: ['StdEnv/2023', 'gcc/12.3', 'openmpi/4.1.5', 'python/3.10', 'ipython-kernel/3.10'] #profile::slurm::base::suspend_time: 86400 -profile::gpu::install::passthrough::nvidia_driver_stream: '555-dkms' +profile::gpu::install::passthrough::nvidia_driver_stream: '560-dkms' profile::users::ldap::users: dummy_cours1: From 68740d454326045e7e9dedb78961486f9d23ef5c Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Wed, 2 Oct 2024 10:18:06 -0400 Subject: [PATCH 24/66] shuffle MIG config between node types --- mcgill-scs/custom.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mcgill-scs/custom.tf b/mcgill-scs/custom.tf index f7c8532..b4a9e95 100644 --- a/mcgill-scs/custom.tf +++ b/mcgill-scs/custom.tf @@ -18,9 +18,9 @@ locals { } mig = { - gpupool16 = { "1g.5gb" = 7 } - gpupool12 = { "3g.20gb" = 1, "4g.20gb" = 1 } - gpupool80 = { "2g.20gb" = 3 } + gpupool12 = { "1g.5gb" = 7 } + gpupool16 = { "3g.20gb" = 1, "4g.20gb" = 1 } + gpupool80 = { "1g.10gb" = 1, "2g.20gb" = 3 } } image_cpu = "snapshot-cpunode-2024-R810.5" From d80f2f1f45cfb871430f5fe79130686fcaa98f77 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Wed, 2 Oct 2024 10:28:19 -0400 Subject: [PATCH 25/66] remove MIG from 
nodegpupool16* to test --- mcgill-scs/custom.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mcgill-scs/custom.tf b/mcgill-scs/custom.tf index b4a9e95..c903764 100644 --- a/mcgill-scs/custom.tf +++ b/mcgill-scs/custom.tf @@ -19,7 +19,7 @@ locals { mig = { gpupool12 = { "1g.5gb" = 7 } - gpupool16 = { "3g.20gb" = 1, "4g.20gb" = 1 } + gpupool16 = null #{ "3g.20gb" = 1, "4g.20gb" = 1 } gpupool80 = { "1g.10gb" = 1, "2g.20gb" = 3 } } From 76b1294d2cb2523862a6cc6e089a8d86ef15693b Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Wed, 2 Oct 2024 10:31:45 -0400 Subject: [PATCH 26/66] disable mig on all nodes --- mcgill-scs/custom.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mcgill-scs/custom.tf b/mcgill-scs/custom.tf index c903764..8e640ac 100644 --- a/mcgill-scs/custom.tf +++ b/mcgill-scs/custom.tf @@ -18,9 +18,9 @@ locals { } mig = { - gpupool12 = { "1g.5gb" = 7 } + gpupool12 = null #{ "1g.5gb" = 7 } gpupool16 = null #{ "3g.20gb" = 1, "4g.20gb" = 1 } - gpupool80 = { "1g.10gb" = 1, "2g.20gb" = 3 } + gpupool80 = null #{ "1g.10gb" = 1, "2g.20gb" = 3 } } image_cpu = "snapshot-cpunode-2024-R810.5" From a9b217b2a2b6f4443066e9a3539f442183cd58f5 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Wed, 2 Oct 2024 10:45:50 -0400 Subject: [PATCH 27/66] add MIG back with drivers 560, from GPUs that have MIG disabled --- mcgill-scs/custom.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mcgill-scs/custom.tf b/mcgill-scs/custom.tf index 8e640ac..2a70a67 100644 --- a/mcgill-scs/custom.tf +++ b/mcgill-scs/custom.tf @@ -18,9 +18,9 @@ locals { } mig = { - gpupool12 = null #{ "1g.5gb" = 7 } - gpupool16 = null #{ "3g.20gb" = 1, "4g.20gb" = 1 } - gpupool80 = null #{ "1g.10gb" = 1, "2g.20gb" = 3 } + gpupool16 = { "1g.5gb" = 7 } + gpupool12 = { "3g.20gb" = 1, "4g.20gb" = 1 } + gpupool80 = { "1g.10gb" = 1, "2g.20gb" = 3 } } image_cpu = "snapshot-cpunode-2024-R810.5" From 2584ff7ec4d09da0cb6a149b1fee5a67663779bd Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Wed, 2 Oct 2024 11:48:10 -0400 Subject: [PATCH 28/66] roll back to 550 drivers --- mcgill-scs/config.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/mcgill-scs/config.yaml b/mcgill-scs/config.yaml index ab730a0..7eb5143 100644 --- a/mcgill-scs/config.yaml +++ b/mcgill-scs/config.yaml @@ -25,7 +25,6 @@ jupyterhub::jupyterhub_config_hash: profile::software_stack::lmod_default_modules: ['StdEnv/2023', 'gcc/12.3', 'openmpi/4.1.5', 'python/3.10', 'ipython-kernel/3.10'] #profile::slurm::base::suspend_time: 86400 -profile::gpu::install::passthrough::nvidia_driver_stream: '560-dkms' profile::users::ldap::users: dummy_cours1: From eb1178f1b18bd7a6a4092930132aa3e2e6777131 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Wed, 2 Oct 2024 12:54:59 -0400 Subject: [PATCH 29/66] swap mig config --- mcgill-scs/custom.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mcgill-scs/custom.tf b/mcgill-scs/custom.tf index 2a70a67..03896f3 100644 --- a/mcgill-scs/custom.tf +++ b/mcgill-scs/custom.tf @@ -18,9 +18,9 @@ locals { } mig = { - gpupool16 = { "1g.5gb" = 7 } - gpupool12 = { "3g.20gb" = 1, "4g.20gb" = 1 } - gpupool80 = { "1g.10gb" = 1, "2g.20gb" = 3 } + gpupool12 = { "1g.5gb" = 7 } + gpupool16 = { "3g.20gb" = 1, "4g.20gb" = 1 } + gpupool80 = { "2g.20gb" = 3 } } image_cpu = "snapshot-cpunode-2024-R810.5" From c2d1fec6d926a87c0e4adab85eb80e8657704f15 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Wed, 2 Oct 2024 13:03:01 -0400 Subject: 
[PATCH 30/66] restore the use of the snapshot --- mcgill-scs/custom.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mcgill-scs/custom.tf b/mcgill-scs/custom.tf index 03896f3..21646df 100644 --- a/mcgill-scs/custom.tf +++ b/mcgill-scs/custom.tf @@ -18,13 +18,13 @@ locals { } mig = { - gpupool12 = { "1g.5gb" = 7 } - gpupool16 = { "3g.20gb" = 1, "4g.20gb" = 1 } + gpupool16 = { "1g.5gb" = 7 } + gpupool12 = { "3g.20gb" = 1, "4g.20gb" = 1 } gpupool80 = { "2g.20gb" = 3 } } image_cpu = "snapshot-cpunode-2024-R810.5" - image_gpu = "Rocky-8" #snapshot-gpunode-2024-R810.5" + image_gpu = "snapshot-gpunode-2024-R810.5" config_version = "2972853" } From 0e96d0f6dd4ce70a0ba88457452cb9f68c965afd Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Wed, 2 Oct 2024 15:54:58 -0400 Subject: [PATCH 31/66] make config_git_url configurable --- common/main.tf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/main.tf b/common/main.tf index 131dc5d..2de1153 100644 --- a/common/main.tf +++ b/common/main.tf @@ -59,6 +59,7 @@ locals { } cluster_purpose = "cours_academiques" + config_git_url = "https://github.com/ComputeCanada/puppet-magic_castle.git" config_version = "2972853" instances_type_map = { @@ -314,7 +315,7 @@ locals { module "openstack" { source = "git::https://github.com/ComputeCanada/magic_castle.git//openstack?ref=14.0.0-beta" - config_git_url = "https://github.com/ComputeCanada/puppet-magic_castle.git" + config_git_url = try(local.custom.config_git_url, local.default_pod.config_git_url) config_version = try(local.custom.config_version, local.default_pod.config_version) cluster_name = "${local.name}${var.cloud_suffix}" From 4609e19db3974e7572d6c5800e0037712cf85dd3 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Wed, 2 Oct 2024 15:55:41 -0400 Subject: [PATCH 32/66] test MC from Maxime's fork, with StdEnv/2023 --- test-mc-infra-cours/config.yaml | 1 - test-mc-infra-cours/custom.tf | 6 ++++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/test-mc-infra-cours/config.yaml b/test-mc-infra-cours/config.yaml index 9469c55..9094a11 100644 --- a/test-mc-infra-cours/config.yaml +++ b/test-mc-infra-cours/config.yaml @@ -21,7 +21,6 @@ jupyterhub::jupyterhub_config_hash: disable_form: false profile::freeipa::mokey::require_verify_admin: false -profile::software_stack::lmod_default_modules: ['StdEnv/2023', 'gcc/12.3', 'openmpi/4.1.5', 'python/3.10', 'ipython-kernel/3.10'] profile::users::ldap::users: dummy_cours1: diff --git a/test-mc-infra-cours/custom.tf b/test-mc-infra-cours/custom.tf index 5441a1e..e9cdb8c 100644 --- a/test-mc-infra-cours/custom.tf +++ b/test-mc-infra-cours/custom.tf @@ -7,9 +7,11 @@ locals { # home_size = 100 # project_size = 100 # scratch_size = 50 - image_cpu = "snapshot-cpunode-2024-R810.4" - image_gpu = "snapshot-gpunode-2024-R810.4" + image_cpu = "snapshot-cpunode-2024-R810.5" + image_gpu = "snapshot-gpunode-2024-R810.5" + config_git_url = "https://github.com/mboisson/puppet-magic_castle.git" + config_version = "1b45e1f" volumes = { nfs = { home = { size = 100, quota = "1g" } From 8c08fb3f8093a8fb4ce529ad6ebefa6b23580ab6 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Wed, 2 Oct 2024 16:25:03 -0400 Subject: [PATCH 33/66] add new cpu node --- test-mc-infra-cours/custom.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test-mc-infra-cours/custom.tf b/test-mc-infra-cours/custom.tf index e9cdb8c..2cbbbf6 100644 --- a/test-mc-infra-cours/custom.tf +++ b/test-mc-infra-cours/custom.tf @@ 
-1,6 +1,6 @@ locals { custom = { - ncpu = 0 + ncpu = 1 ncpupool = 1 ngpu = 0 ngpupool = 1 From 5ad402207cf0add5952d89cd1ea6cd59d9ba9e48 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Wed, 2 Oct 2024 16:27:47 -0400 Subject: [PATCH 34/66] update custom.tf for the test cluster to the n. structure --- test-mc-infra-cours/custom.tf | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/test-mc-infra-cours/custom.tf b/test-mc-infra-cours/custom.tf index 2cbbbf6..e8c5fd8 100644 --- a/test-mc-infra-cours/custom.tf +++ b/test-mc-infra-cours/custom.tf @@ -1,9 +1,11 @@ locals { custom = { - ncpu = 1 - ncpupool = 1 - ngpu = 0 - ngpupool = 1 + n = { + cpu = 1 + cpupool = 1 + gpu = 0 + gpupool = 1 + } # home_size = 100 # project_size = 100 # scratch_size = 50 From 04bdd0bca5a86fc1cbae5767dc7923b682a8f2b3 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Wed, 2 Oct 2024 16:49:42 -0400 Subject: [PATCH 35/66] remove static node --- test-mc-infra-cours/custom.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test-mc-infra-cours/custom.tf b/test-mc-infra-cours/custom.tf index e8c5fd8..cd16335 100644 --- a/test-mc-infra-cours/custom.tf +++ b/test-mc-infra-cours/custom.tf @@ -1,7 +1,7 @@ locals { custom = { n = { - cpu = 1 + cpu = 0 cpupool = 1 gpu = 0 gpupool = 1 From 201d2ba57e6bfe11bfbe6dd96eb1e2841b5b0897 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Wed, 16 Oct 2024 11:25:29 -0400 Subject: [PATCH 36/66] change mig config to use only 3g.20gb, and bump puppet-mc commit --- mcgill-scs/custom.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mcgill-scs/custom.tf b/mcgill-scs/custom.tf index 21646df..c64c4e9 100644 --- a/mcgill-scs/custom.tf +++ b/mcgill-scs/custom.tf @@ -19,14 +19,14 @@ locals { mig = { gpupool16 = { "1g.5gb" = 7 } - gpupool12 = { "3g.20gb" = 1, "4g.20gb" = 1 } + gpupool12 = { "3g.20gb" = 2 } gpupool80 = { "2g.20gb" = 3 } } image_cpu = "snapshot-cpunode-2024-R810.5" image_gpu = "snapshot-gpunode-2024-R810.5" - config_version = "2972853" + config_version = "dc6b37f4d2c077a37d88bf4862ba57a09eed7213" } name = "mcgill-scs" From 070c824d3615429b0ed419c075eb04c076a4c4f6 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Wed, 16 Oct 2024 11:36:15 -0400 Subject: [PATCH 37/66] initial configuration for nvidia workshop --- nvidia-workshop/config.yaml | 53 +++++++++++++++++++++++++++++++++++++ nvidia-workshop/custom.tf | 33 +++++++++++++++++++++++ nvidia-workshop/main.tf | 1 + 3 files changed, 87 insertions(+) create mode 100644 nvidia-workshop/config.yaml create mode 100644 nvidia-workshop/custom.tf create mode 120000 nvidia-workshop/main.tf diff --git a/nvidia-workshop/config.yaml b/nvidia-workshop/config.yaml new file mode 100644 index 0000000..0024f2e --- /dev/null +++ b/nvidia-workshop/config.yaml @@ -0,0 +1,53 @@ +jupyterhub::jupyterhub_config_hash: + SbatchForm: + runtime: + min: 3.5 + def: 3.5 + max: 5.0 + nprocs: + min: 1 + def: 1 + max: 1 + memory: + min: 1024 + max: 2048 + def: 2048 + oversubscribe: + def: true + lock: true + gpus: + def: 'gpu:0' + choices: ['gpu:0', 'gpu:1g.5gb:1', 'gpu:3g.20gb:1', 'gpu:2g.20gb:1' ] + ui: + def: 'lab' + SlurmFormSpawner: + disable_form: false + +profile::software_stack::lmod_default_modules: ['StdEnv/2023', 'nvhpc/23.9', 'openmpi/4.1.5', 'python/3.11', 'ipython-kernel/3.11'] +#profile::slurm::base::suspend_time: 86400 + +profile::users::ldap::users: + dummy_cours1: + count: 1 + groups: ['def-cours1'] + + dummy_cours2: + count: 1 + groups: ['def-cours2'] + + 
dummy_cours3: + count: 1 + groups: ['def-cours3'] + +profile::slurm::accounting::accounts: + def-cours1: + Fairshare: 1 + MaxJobs: 1 + def-cours2: + Fairshare: 1 + MaxJobs: 10 + def-cours3: + Fairshare: 1 + MaxJobs: 10 + + diff --git a/nvidia-workshop/custom.tf b/nvidia-workshop/custom.tf new file mode 100644 index 0000000..6004b78 --- /dev/null +++ b/nvidia-workshop/custom.tf @@ -0,0 +1,33 @@ +locals { + custom = { + n = { + cpu = 0 + cpupool = 0 + gpupool16 = 16 + gpupool12 = 4 + gpupool80 = 8 + } + home_size = 50 + project_size = 50 + scratch_size = 50 + + user_quotas = { + home = "1g" + project = "1g" + scratch = "1g" + } + + mig = { + gpupool16 = { "1g.5gb" = 7 } + gpupool12 = { "3g.20gb" = 2 } + gpupool80 = { "2g.20gb" = 3 } + } + + image_cpu = "snapshot-cpunode-2024-R810.5" + image_gpu = "snapshot-gpunode-2024-R810.5" + + config_version = "dc6b37f4d2c077a37d88bf4862ba57a09eed7213" + } + + name = "nvidia-workshop" +} diff --git a/nvidia-workshop/main.tf b/nvidia-workshop/main.tf new file mode 120000 index 0000000..4a4ab61 --- /dev/null +++ b/nvidia-workshop/main.tf @@ -0,0 +1 @@ +../common/main.tf \ No newline at end of file From c1e61157ac4373b6f723dd5e2800403fa8068ccf Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Wed, 16 Oct 2024 11:37:22 -0400 Subject: [PATCH 38/66] use 'internal' as cluster purpose --- nvidia-workshop/custom.tf | 1 + 1 file changed, 1 insertion(+) diff --git a/nvidia-workshop/custom.tf b/nvidia-workshop/custom.tf index 6004b78..b23fe71 100644 --- a/nvidia-workshop/custom.tf +++ b/nvidia-workshop/custom.tf @@ -7,6 +7,7 @@ locals { gpupool12 = 4 gpupool80 = 8 } + cluster_purpose = "internal" home_size = 50 project_size = 50 scratch_size = 50 From e3d3a6d962fb59ad7aced97de01d435aadb22e21 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Wed, 16 Oct 2024 11:50:52 -0400 Subject: [PATCH 39/66] add cuda/12.2 in the default modules --- nvidia-workshop/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nvidia-workshop/config.yaml b/nvidia-workshop/config.yaml index 0024f2e..c4e2e14 100644 --- a/nvidia-workshop/config.yaml +++ b/nvidia-workshop/config.yaml @@ -23,7 +23,7 @@ jupyterhub::jupyterhub_config_hash: SlurmFormSpawner: disable_form: false -profile::software_stack::lmod_default_modules: ['StdEnv/2023', 'nvhpc/23.9', 'openmpi/4.1.5', 'python/3.11', 'ipython-kernel/3.11'] +profile::software_stack::lmod_default_modules: ['StdEnv/2023', 'nvhpc/23.9', 'cuda/12.2', 'openmpi/4.1.5', 'python/3.11', 'ipython-kernel/3.11'] #profile::slurm::base::suspend_time: 86400 profile::users::ldap::users: From c8a044f6dabaed638285cd524332617bfd399b3a Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Wed, 16 Oct 2024 13:22:38 -0400 Subject: [PATCH 40/66] configure admin validation required --- nvidia-workshop/config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/nvidia-workshop/config.yaml b/nvidia-workshop/config.yaml index c4e2e14..7df3c10 100644 --- a/nvidia-workshop/config.yaml +++ b/nvidia-workshop/config.yaml @@ -24,6 +24,7 @@ jupyterhub::jupyterhub_config_hash: disable_form: false profile::software_stack::lmod_default_modules: ['StdEnv/2023', 'nvhpc/23.9', 'cuda/12.2', 'openmpi/4.1.5', 'python/3.11', 'ipython-kernel/3.11'] +profile::freeipa::mokey::require_verify_admin: true #profile::slurm::base::suspend_time: 86400 profile::users::ldap::users: From 3c65f22464c28538cea1146aa91fef0b9275a41d Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Wed, 16 Oct 2024 13:23:01 -0400 Subject: [PATCH 41/66] require admin 
verification --- mcgill-scs/config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/mcgill-scs/config.yaml b/mcgill-scs/config.yaml index 7eb5143..7d52164 100644 --- a/mcgill-scs/config.yaml +++ b/mcgill-scs/config.yaml @@ -24,6 +24,7 @@ jupyterhub::jupyterhub_config_hash: disable_form: false profile::software_stack::lmod_default_modules: ['StdEnv/2023', 'gcc/12.3', 'openmpi/4.1.5', 'python/3.10', 'ipython-kernel/3.10'] +profile::freeipa::mokey::require_verify_admin: false #profile::slurm::base::suspend_time: 86400 profile::users::ldap::users: From 24af3477e7865f8deb1df723bc3c4dfed049944f Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Wed, 16 Oct 2024 14:06:02 -0400 Subject: [PATCH 42/66] customize number of default users --- common/main.tf | 4 +++- mcgill-scs/custom.tf | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/common/main.tf b/common/main.tf index 38939c3..81e8fa3 100644 --- a/common/main.tf +++ b/common/main.tf @@ -36,6 +36,7 @@ locals { image = "Rocky-8" image_cpu = "snapshot-cpunode-2024-R810.5" image_gpu = "snapshot-gpunode-2024-R810.5" + nb_users = 0 n = { cpu = 0 @@ -279,6 +280,7 @@ locals { instances = try(local.custom.instances, local.default.instances_map[var.cloud_name]) volumes = try(local.custom.volumes, local.default.volumes_map[var.cloud_name]) cluster_purpose = try(local.custom.cluster_purpose, local.default_pod.cluster_purpose) + nb_users = try(local.custom.nb_users, local.default_pod.nb_users) hieradata = yamlencode(merge( { @@ -316,7 +318,7 @@ module "openstack" { public_keys = compact(concat(split("\n", file("../common/sshkeys.pub")), )) - nb_users = 55 + nb_users = local.nb_users # Shared password, randomly chosen if blank guest_passwd = "" diff --git a/mcgill-scs/custom.tf b/mcgill-scs/custom.tf index c64c4e9..645600f 100644 --- a/mcgill-scs/custom.tf +++ b/mcgill-scs/custom.tf @@ -10,6 +10,7 @@ locals { home_size = 100 project_size = 500 scratch_size = 400 + nb_users = 1 user_quotas = { home = "1g" From 6bb643ebbdacdf4ac3b0a6750883bfc7c51b3010 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Thu, 17 Oct 2024 13:09:03 -0400 Subject: [PATCH 43/66] configure more nodes in 1g.5gb --- mcgill-scs/custom.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mcgill-scs/custom.tf b/mcgill-scs/custom.tf index 645600f..c3846b2 100644 --- a/mcgill-scs/custom.tf +++ b/mcgill-scs/custom.tf @@ -20,7 +20,7 @@ locals { mig = { gpupool16 = { "1g.5gb" = 7 } - gpupool12 = { "3g.20gb" = 2 } + gpupool12 = { "1g.5gb" = 7 } gpupool80 = { "2g.20gb" = 3 } } From 0fca901e31550c5f78732700c013193c03e6e6d8 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Thu, 17 Oct 2024 14:27:13 -0400 Subject: [PATCH 44/66] bump version of slurm --- mcgill-scs/config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/mcgill-scs/config.yaml b/mcgill-scs/config.yaml index 7d52164..b33437b 100644 --- a/mcgill-scs/config.yaml +++ b/mcgill-scs/config.yaml @@ -25,6 +25,7 @@ jupyterhub::jupyterhub_config_hash: profile::software_stack::lmod_default_modules: ['StdEnv/2023', 'gcc/12.3', 'openmpi/4.1.5', 'python/3.10', 'ipython-kernel/3.10'] profile::freeipa::mokey::require_verify_admin: false +profile::slurm::base::slurm_version: '24.05' #profile::slurm::base::suspend_time: 86400 profile::users::ldap::users: From c7e68be640de2af6a579359a1ed160dd94d4eae1 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Thu, 17 Oct 2024 14:34:27 -0400 Subject: [PATCH 45/66] test limit of 15 gres/gpu per user --- mcgill-scs/config.yaml | 3 +++ 1 file 
changed, 3 insertions(+) diff --git a/mcgill-scs/config.yaml b/mcgill-scs/config.yaml index b33437b..c2715c3 100644 --- a/mcgill-scs/config.yaml +++ b/mcgill-scs/config.yaml @@ -26,6 +26,7 @@ jupyterhub::jupyterhub_config_hash: profile::software_stack::lmod_default_modules: ['StdEnv/2023', 'gcc/12.3', 'openmpi/4.1.5', 'python/3.10', 'ipython-kernel/3.10'] profile::freeipa::mokey::require_verify_admin: false profile::slurm::base::slurm_version: '24.05' + #profile::slurm::base::suspend_time: 86400 profile::users::ldap::users: @@ -52,4 +53,6 @@ profile::slurm::accounting::accounts: Fairshare: 1 MaxJobs: 10 +profile::slurm::accounting::options: + MaxTRESRunMinsPerUser: gres/gpu=15 From fd68622c956ce20d9109f88f19865604a1344798 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Thu, 17 Oct 2024 15:14:47 -0400 Subject: [PATCH 46/66] removed MaxTRESRunMinsPerUser as this is not a cluster option, but a QOS one --- mcgill-scs/config.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/mcgill-scs/config.yaml b/mcgill-scs/config.yaml index c2715c3..b9a2428 100644 --- a/mcgill-scs/config.yaml +++ b/mcgill-scs/config.yaml @@ -53,6 +53,3 @@ profile::slurm::accounting::accounts: Fairshare: 1 MaxJobs: 10 -profile::slurm::accounting::options: - MaxTRESRunMinsPerUser: gres/gpu=15 - From e66f9c369a6a372b0c5f38bbb98a059701d7e44e Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Thu, 17 Oct 2024 15:18:43 -0400 Subject: [PATCH 47/66] removed configuration per class --- mcgill-scs/config.yaml | 44 +++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/mcgill-scs/config.yaml b/mcgill-scs/config.yaml index b9a2428..4973c23 100644 --- a/mcgill-scs/config.yaml +++ b/mcgill-scs/config.yaml @@ -29,27 +29,27 @@ profile::slurm::base::slurm_version: '24.05' #profile::slurm::base::suspend_time: 86400 -profile::users::ldap::users: - dummy_cours1: - count: 1 - groups: ['def-cours1'] +#profile::users::ldap::users: +# dummy_cours1: +# count: 1 +# groups: ['def-cours1'] +# +# dummy_cours2: +# count: 1 +# groups: ['def-cours2'] +# +# dummy_cours3: +# count: 1 +# groups: ['def-cours3'] - dummy_cours2: - count: 1 - groups: ['def-cours2'] - - dummy_cours3: - count: 1 - groups: ['def-cours3'] - -profile::slurm::accounting::accounts: - def-cours1: - Fairshare: 1 - MaxJobs: 1 - def-cours2: - Fairshare: 1 - MaxJobs: 10 - def-cours3: - Fairshare: 1 - MaxJobs: 10 +#profile::slurm::accounting::accounts: +# def-cours1: +# Fairshare: 1 +# MaxJobs: 1 +# def-cours2: +# Fairshare: 1 +# MaxJobs: 10 +# def-cours3: +# Fairshare: 1 +# MaxJobs: 10 From afca2797a85531b279c73ee66d1191eaa24c99d4 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Thu, 17 Oct 2024 17:14:49 -0400 Subject: [PATCH 48/66] require admin verification --- mcgill-scs/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mcgill-scs/config.yaml b/mcgill-scs/config.yaml index 4973c23..adf0b2a 100644 --- a/mcgill-scs/config.yaml +++ b/mcgill-scs/config.yaml @@ -24,7 +24,7 @@ jupyterhub::jupyterhub_config_hash: disable_form: false profile::software_stack::lmod_default_modules: ['StdEnv/2023', 'gcc/12.3', 'openmpi/4.1.5', 'python/3.10', 'ipython-kernel/3.10'] -profile::freeipa::mokey::require_verify_admin: false +profile::freeipa::mokey::require_verify_admin: true profile::slurm::base::slurm_version: '24.05' #profile::slurm::base::suspend_time: 86400 From 27009390c3efd32585de621354f38fda08bbe00d Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Thu, 17 Oct 2024 17:16:28 -0400 
Subject: [PATCH 49/66] require admin verification by default --- common/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/config.yaml b/common/config.yaml index a089f15..abf8bea 100644 --- a/common/config.yaml +++ b/common/config.yaml @@ -4,7 +4,7 @@ jupyterhub::jupyterhub_config_hash: SlurmFormSpawner: start_timeout: 900 -profile::freeipa::mokey::require_verify_admin: false +profile::freeipa::mokey::require_verify_admin: true profile::slurm::base::slurm_version: '23.02' # when using snapshots, it is quick enough to boot nodes that 900 seconds is enough for suspend profile::slurm::base::suspend_time: 900 From 82b0ddc3fc5f50c83c69750a49b6d8621ff1dff7 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: Fri, 18 Oct 2024 09:34:26 -0400 Subject: [PATCH 50/66] add cq and j nodes --- common/main.tf | 22 ++++++++++++++++++++++ nvidia-workshop/custom.tf | 4 ++++ 2 files changed, 26 insertions(+) diff --git a/common/main.tf b/common/main.tf index 81e8fa3..0d2b6bf 100644 --- a/common/main.tf +++ b/common/main.tf @@ -47,6 +47,8 @@ locals { gpupool12 = 0 gpupool16 = 0 gpupool80 = 0 + gpupool16-cq = 0 + gpupool12-j = 0 } home_size = 100 @@ -88,6 +90,8 @@ locals { gpupool16 = "gpu16-240-3375gb-a100x1" gpupool80 = "gpu13-240-2500gb-a100-80gx1" gpupool12 = "gpu12-120-850gb-a100x1" + gpupool16-cq = "gpu16-240-3450gb-a100x1_cq" + gpupool12-j = "gpu12-120-850gb-a100x1_jacobb" } } @@ -97,6 +101,8 @@ locals { gpupool16 = { "1g.5gb" = 7 } gpupool80 = { "1g.10gb" = 7 } gpupool12 = { "1g.5gb" = 7 } + gpupool16-cq = { "1g.5gb" = 7 } + gpupool12-j = { "1g.5gb" = 7 } } network_map = { @@ -234,6 +240,14 @@ locals { image = try(local.custom.image_gpu, local.default_pod.image_gpu), disk_size = "50" } + nodegpupool16-cq = { + type = try(local.custom.instances_type_map.juno.gpupool16-cq, local.default_pod.instances_type_map.juno.gpupool16-cq), + tags = ["node", "pool"], + count = try(local.custom.n.gpupool16-cq, 0), + mig = try(local.custom.mig.gpupool16-cq, local.default_pod.mig.gpupool16-cq) + image = try(local.custom.image_gpu, local.default_pod.image_gpu), + disk_size = "50" + } nodegpupool12 = { type = try(local.custom.instances_type_map.juno.gpupool12, local.default_pod.instances_type_map.juno.gpupool12), tags = ["node", "pool"], @@ -242,6 +256,14 @@ locals { image = try(local.custom.image_gpu, local.default_pod.image_gpu), disk_size = "50" } + nodegpupool12-j = { + type = try(local.custom.instances_type_map.juno.gpupool12-j, local.default_pod.instances_type_map.juno.gpupool12-j), + tags = ["node", "pool"], + count = try(local.custom.n.gpupool12-j, 0), + mig = try(local.custom.mig.gpupool12-j, local.default_pod.mig.gpupool12-j) + image = try(local.custom.image_gpu, local.default_pod.image_gpu), + disk_size = "50" + } nodegpupool80 = { type = try(local.custom.instances_type_map.juno.gpupool80, local.default_pod.instances_type_map.juno.gpupool80), tags = ["node", "pool"], diff --git a/nvidia-workshop/custom.tf b/nvidia-workshop/custom.tf index b23fe71..153e797 100644 --- a/nvidia-workshop/custom.tf +++ b/nvidia-workshop/custom.tf @@ -4,7 +4,9 @@ locals { cpu = 0 cpupool = 0 gpupool16 = 16 + gpupool16-cq = 4 gpupool12 = 4 + gpupool12-j = 20 gpupool80 = 8 } cluster_purpose = "internal" @@ -20,7 +22,9 @@ locals { mig = { gpupool16 = { "1g.5gb" = 7 } + gpupool16-cq = { "1g.5gb" = 7 } gpupool12 = { "3g.20gb" = 2 } + gpupool12-j = { "1g.5gb" = 7 } gpupool80 = { "2g.20gb" = 3 } } From 9e8b2877f5fd907aaef90a23343e66f57d62e7d9 Mon Sep 17 00:00:00 2001 From: Maxime Boissonneault Date: 
From 9e8b2877f5d907aaef90a23343e66f57d62e7d9 Mon Sep 17 00:00:00 2001
From: Maxime Boissonneault
Date: Fri, 18 Oct 2024 09:37:47 -0400
Subject: [PATCH 51/66] add cq and j nodes

---
 common/main.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/main.tf b/common/main.tf
index 0d2b6bf..1b041b2 100644
--- a/common/main.tf
+++ b/common/main.tf
@@ -91,7 +91,7 @@ locals {
         gpupool80 = "gpu13-240-2500gb-a100-80gx1"
         gpupool12 = "gpu12-120-850gb-a100x1"
         gpupool16-cq = "gpu16-240-3450gb-a100x1_cq"
-        gpupool12-j = "gpu12-120-850gb-a100x1_jacobb"
+        gpupool12-j = "gpu12-120-850gb-a100x1_j"
       }
     }

From b1dbfd8aa31c651f8e76b69e8a6b5c464c3bcacf Mon Sep 17 00:00:00 2001
From: Maxime Boissonneault
Date: Fri, 18 Oct 2024 13:56:43 -0400
Subject: [PATCH 52/66] boot new node from Rocky 8 image

---
 nvidia-workshop/custom.tf | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/nvidia-workshop/custom.tf b/nvidia-workshop/custom.tf
index 153e797..6d06a74 100644
--- a/nvidia-workshop/custom.tf
+++ b/nvidia-workshop/custom.tf
@@ -3,6 +3,7 @@ locals {
     n = {
       cpu = 0
       cpupool = 0
+      gpu = 1
       gpupool16 = 16
       gpupool16-cq = 4
       gpupool12 = 4
@@ -29,7 +30,8 @@ locals {
     }

     image_cpu = "snapshot-cpunode-2024-R810.5"
-    image_gpu = "snapshot-gpunode-2024-R810.5"
+    image_gpu = "Rocky-8.10"
+    #image_gpu = "snapshot-gpunode-2024-R810.5"

     config_version = "dc6b37f4d2c077a37d88bf4862ba57a09eed7213"
   }

From add5c152d30e9d38b4fb241498e05918b4eae0d6 Mon Sep 17 00:00:00 2001
From: Maxime Boissonneault
Date: Fri, 18 Oct 2024 15:07:35 -0400
Subject: [PATCH 53/66] boot second gpu node

---
 nvidia-workshop/custom.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nvidia-workshop/custom.tf b/nvidia-workshop/custom.tf
index 6d06a74..81b0104 100644
--- a/nvidia-workshop/custom.tf
+++ b/nvidia-workshop/custom.tf
@@ -3,7 +3,7 @@ locals {
     n = {
       cpu = 0
       cpupool = 0
-      gpu = 1
+      gpu = 2
       gpupool16 = 16
       gpupool16-cq = 4
       gpupool12 = 4

From 32f0ef5fc9f1bcfef9fe7754ac6062074122d9de Mon Sep 17 00:00:00 2001
From: Maxime Boissonneault
Date: Fri, 18 Oct 2024 15:39:55 -0400
Subject: [PATCH 54/66] configure mig in 7x1g.5gb

---
 nvidia-workshop/custom.tf | 1 +
 1 file changed, 1 insertion(+)

diff --git a/nvidia-workshop/custom.tf b/nvidia-workshop/custom.tf
index 81b0104..78f9b17 100644
--- a/nvidia-workshop/custom.tf
+++ b/nvidia-workshop/custom.tf
@@ -22,6 +22,7 @@ locals {
     }

     mig = {
+      gpu = { "1g.5gb" = 7 }
       gpupool16 = { "1g.5gb" = 7 }
       gpupool16-cq = { "1g.5gb" = 7 }
       gpupool12 = { "3g.20gb" = 2 }

From 5a7e16e5157c5c526590bbfb3f2df82a715d55c4 Mon Sep 17 00:00:00 2001
From: Maxime Boissonneault
Date: Fri, 18 Oct 2024 15:57:06 -0400
Subject: [PATCH 55/66] switch static node to smaller flavor

---
 nvidia-workshop/custom.tf | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/nvidia-workshop/custom.tf b/nvidia-workshop/custom.tf
index 78f9b17..a398a8b 100644
--- a/nvidia-workshop/custom.tf
+++ b/nvidia-workshop/custom.tf
@@ -20,7 +20,12 @@ locals {
       project = "1g"
       scratch = "1g"
     }
-
+
+    instances_type_map = {
+      juno = {
+        gpu = "gpu12-120-850gb-a100x1"
+      }
+    }
     mig = {
       gpu = { "1g.5gb" = 7 }
       gpupool16 = { "1g.5gb" = 7 }

From 609f9ffc7e5fc046090ea7764fcc80cb82fcaee0 Mon Sep 17 00:00:00 2001
From: Maxime Boissonneault
Date: Fri, 18 Oct 2024 15:59:35 -0400
Subject: [PATCH 56/66] remove static nodes

---
 nvidia-workshop/custom.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nvidia-workshop/custom.tf b/nvidia-workshop/custom.tf
index a398a8b..4427b3d 100644
--- a/nvidia-workshop/custom.tf
+++ b/nvidia-workshop/custom.tf
@@ -3,7 +3,7 @@ locals {
     n = {
       cpu = 0
       cpupool = 0
-      gpu = 2
+      gpu = 0
      gpupool16 = 16
      gpupool16-cq = 4
      gpupool12 = 4
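The mig maps being adjusted in these patches control how each pool's A100 cards are partitioned: every key is an NVIDIA MIG profile name and the value is how many instances of that profile to carve per card ("1g.5gb" = 7 splits a 40 GB A100 into seven small slices, "3g.20gb" = 2 into two larger ones, while the 80 GB cards use profiles such as 1g.10gb and 2g.20gb). A short illustrative sketch follows; the pool names mirror the maps above, but the derived slices_per_gpu value is purely an example and not something the repository computes.

locals {
  # MIG profile name => instances of that profile per physical GPU.
  mig = {
    gpupool16 = { "1g.5gb" = 7 }  # seven 5 GB slices per 40 GB A100
    gpupool12 = { "3g.20gb" = 2 } # two 20 GB slices per 40 GB A100
    gpupool80 = { "2g.20gb" = 3 } # one possible layout for the 80 GB cards
  }

  # How many schedulable MIG instances one card in each pool exposes.
  slices_per_gpu = { for pool, profile in local.mig : pool => sum(values(profile)) }
}

Here slices_per_gpu evaluates to { gpupool16 = 7, gpupool12 = 2, gpupool80 = 3 }, roughly the number of separately schedulable GPU slices each card contributes.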
From 1a333dc811f6bd263a410346ab80f4a27e465d78 Mon Sep 17 00:00:00 2001
From: Maxime Boissonneault
Date: Fri, 18 Oct 2024 15:59:51 -0400
Subject: [PATCH 57/66] re-create static nodes

---
 nvidia-workshop/custom.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nvidia-workshop/custom.tf b/nvidia-workshop/custom.tf
index 4427b3d..b97fe85 100644
--- a/nvidia-workshop/custom.tf
+++ b/nvidia-workshop/custom.tf
@@ -3,7 +3,7 @@ locals {
     n = {
       cpu = 0
       cpupool = 0
-      gpu = 0
+      gpu = 1
       gpupool16 = 16
       gpupool16-cq = 4
       gpupool12 = 4

From 127910934645c41bf2cc4a42f31f271fb0cecaad Mon Sep 17 00:00:00 2001
From: Maxime Boissonneault
Date: Fri, 18 Oct 2024 16:00:17 -0400
Subject: [PATCH 58/66] remove static nodes

---
 nvidia-workshop/custom.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nvidia-workshop/custom.tf b/nvidia-workshop/custom.tf
index b97fe85..4427b3d 100644
--- a/nvidia-workshop/custom.tf
+++ b/nvidia-workshop/custom.tf
@@ -3,7 +3,7 @@ locals {
     n = {
       cpu = 0
       cpupool = 0
-      gpu = 1
+      gpu = 0
       gpupool16 = 16
       gpupool16-cq = 4
       gpupool12 = 4

From cb5e0364ad01d3a5f24a1064638708eda262e4fe Mon Sep 17 00:00:00 2001
From: Maxime Boissonneault
Date: Fri, 18 Oct 2024 16:00:31 -0400
Subject: [PATCH 59/66] recreate static node

---
 nvidia-workshop/custom.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nvidia-workshop/custom.tf b/nvidia-workshop/custom.tf
index 4427b3d..b97fe85 100644
--- a/nvidia-workshop/custom.tf
+++ b/nvidia-workshop/custom.tf
@@ -3,7 +3,7 @@ locals {
     n = {
       cpu = 0
       cpupool = 0
-      gpu = 0
+      gpu = 1
       gpupool16 = 16
       gpupool16-cq = 4
       gpupool12 = 4

From 1ff4627f7d4d495d128d9bc5136c358393db7379 Mon Sep 17 00:00:00 2001
From: Maxime Boissonneault
Date: Fri, 18 Oct 2024 16:39:11 -0400
Subject: [PATCH 60/66] remove static node, switch to new snapshot

---
 nvidia-workshop/custom.tf | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/nvidia-workshop/custom.tf b/nvidia-workshop/custom.tf
index b97fe85..8c0bb81 100644
--- a/nvidia-workshop/custom.tf
+++ b/nvidia-workshop/custom.tf
@@ -3,7 +3,7 @@ locals {
     n = {
       cpu = 0
       cpupool = 0
-      gpu = 1
+      gpu = 0
       gpupool16 = 16
       gpupool16-cq = 4
       gpupool12 = 4
@@ -36,8 +36,8 @@ locals {
     }

     image_cpu = "snapshot-cpunode-2024-R810.5"
-    image_gpu = "Rocky-8.10"
-    #image_gpu = "snapshot-gpunode-2024-R810.5"
+    #image_gpu = "Rocky-8.10"
+    image_gpu = "snapshot-gpunode-2024-R810.5"

     config_version = "dc6b37f4d2c077a37d88bf4862ba57a09eed7213"
   }

From 471059d90daed83bdcde3b793084c55ea148e6ce Mon Sep 17 00:00:00 2001
From: Maxime Boissonneault
Date: Tue, 22 Oct 2024 12:01:06 -0400
Subject: [PATCH 61/66] remove 80gb gpus

---
 mcgill-scs/custom.tf      | 2 +-
 nvidia-workshop/custom.tf | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mcgill-scs/custom.tf b/mcgill-scs/custom.tf
index c3846b2..422f590 100644
--- a/mcgill-scs/custom.tf
+++ b/mcgill-scs/custom.tf
@@ -5,7 +5,7 @@ locals {
       cpupool = 0
       gpupool16 = 16
       gpupool12 = 4
-      gpupool80 = 8
+      gpupool80 = 0
     }
     home_size = 100
     project_size = 500
diff --git a/nvidia-workshop/custom.tf b/nvidia-workshop/custom.tf
index 8c0bb81..ed28edc 100644
--- a/nvidia-workshop/custom.tf
+++ b/nvidia-workshop/custom.tf
@@ -8,7 +8,7 @@ locals {
       gpupool16-cq = 4
       gpupool12 = 4
       gpupool12-j = 20
-      gpupool80 = 8
+      gpupool80 = 0
     }
     cluster_purpose = "internal"
     home_size = 50
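With gpupool80 drained on both pods, the only MIG slices left in the cluster come from the 40 GB pools, and the next two patches trim the JupyterHub form and the remaining mig maps to match: a gres choice such as 'gpu:2g.20gb:1' is only worth offering if some pool actually produces 2g.20gb instances. The helper below is hypothetical and only illustrates that correspondence; in the repository the choices list in each config.yaml is maintained by hand, and nothing derives it automatically.

locals {
  # MIG profiles still configured somewhere in the cluster.
  mig = {
    gpupool16 = { "1g.5gb" = 7 }
    gpupool12 = { "3g.20gb" = 2 }
  }

  # gres strings that it still makes sense to expose in the JupyterHub form.
  gres_choices = concat(
    ["gpu:0"],
    [for profile in sort(distinct(flatten([for pool in values(local.mig) : keys(pool)]))) : "gpu:${profile}:1"]
  )
  # => ["gpu:0", "gpu:1g.5gb:1", "gpu:3g.20gb:1"]
}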
From f3d20403a756997cec713fc06e8d930e5c771de6 Mon Sep 17 00:00:00 2001
From: Maxime Boissonneault
Date: Tue, 22 Oct 2024 12:06:11 -0400
Subject: [PATCH 62/66] restrict the flavours of GPUs that can be requested

---
 mcgill-scs/config.yaml      | 2 +-
 nvidia-workshop/config.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mcgill-scs/config.yaml b/mcgill-scs/config.yaml
index adf0b2a..6b20712 100644
--- a/mcgill-scs/config.yaml
+++ b/mcgill-scs/config.yaml
@@ -17,7 +17,7 @@ jupyterhub::jupyterhub_config_hash:
       lock: true
     gpus:
       def: 'gpu:0'
-      choices: ['gpu:0', 'gpu:1g.5gb:1', 'gpu:3g.20gb:1', 'gpu:4g.20gb:1', 'gpu:2g.20gb:1' ]
+      choices: ['gpu:0', 'gpu:1g.5gb:1', 'gpu:3g.20gb:1' ]
     ui:
       def: 'lab'
     SlurmFormSpawner:
diff --git a/nvidia-workshop/config.yaml b/nvidia-workshop/config.yaml
index 7df3c10..4b03370 100644
--- a/nvidia-workshop/config.yaml
+++ b/nvidia-workshop/config.yaml
@@ -17,7 +17,7 @@ jupyterhub::jupyterhub_config_hash:
       lock: true
     gpus:
       def: 'gpu:0'
-      choices: ['gpu:0', 'gpu:1g.5gb:1', 'gpu:3g.20gb:1', 'gpu:2g.20gb:1' ]
+      choices: ['gpu:0', 'gpu:1g.5gb:1', 'gpu:3g.20gb:1' ]
     ui:
       def: 'lab'
     SlurmFormSpawner:

From 0ea918af4e4faf15cbf95c566bbeb48b83822482 Mon Sep 17 00:00:00 2001
From: Maxime Boissonneault
Date: Tue, 22 Oct 2024 13:21:37 -0400
Subject: [PATCH 63/66] remove 80gb gpus, add more flavors of gpus

---
 mcgill-scs/custom.tf | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/mcgill-scs/custom.tf b/mcgill-scs/custom.tf
index 422f590..c3fd147 100644
--- a/mcgill-scs/custom.tf
+++ b/mcgill-scs/custom.tf
@@ -3,8 +3,10 @@ locals {
     n = {
       cpu = 0
       cpupool = 0
-      gpupool16 = 16
-      gpupool12 = 4
+      gpupool16 = 6
+      gpupool16-cq = 4
+      gpupool12 = 2
+      gpupool12-j = 8
       gpupool80 = 0
     }
     home_size = 100
@@ -19,9 +21,10 @@ locals {
     }

     mig = {
+      gpupool16-cq = { "1g.5gb" = 7 }
       gpupool16 = { "1g.5gb" = 7 }
-      gpupool12 = { "1g.5gb" = 7 }
-      gpupool80 = { "2g.20gb" = 3 }
+      gpupool12 = { "3g.20gb" = 2 }
+      gpupool12-j = { "1g.5gb" = 7 }
     }

     image_cpu = "snapshot-cpunode-2024-R810.5"

From 116464274c28cf08e391297cc8a3115a5a13c9bc Mon Sep 17 00:00:00 2001
From: Maxime Boissonneault
Date: Tue, 22 Oct 2024 13:23:46 -0400
Subject: [PATCH 64/66] remove comments

---
 mcgill-scs/config.yaml | 26 --------------------------
 1 file changed, 26 deletions(-)

diff --git a/mcgill-scs/config.yaml b/mcgill-scs/config.yaml
index 6b20712..893e1a3 100644
--- a/mcgill-scs/config.yaml
+++ b/mcgill-scs/config.yaml
@@ -27,29 +27,3 @@ profile::software_stack::lmod_default_modules: ['StdEnv/2023', 'gcc/12.3', 'open
 profile::freeipa::mokey::require_verify_admin: true
 profile::slurm::base::slurm_version: '24.05'
-#profile::slurm::base::suspend_time: 86400
-
-#profile::users::ldap::users:
-#  dummy_cours1:
-#    count: 1
-#    groups: ['def-cours1']
-#
-#  dummy_cours2:
-#    count: 1
-#    groups: ['def-cours2']
-#
-#  dummy_cours3:
-#    count: 1
-#    groups: ['def-cours3']
-
-#profile::slurm::accounting::accounts:
-#  def-cours1:
-#    Fairshare: 1
-#    MaxJobs: 1
-#  def-cours2:
-#    Fairshare: 1
-#    MaxJobs: 10
-#  def-cours3:
-#    Fairshare: 1
-#    MaxJobs: 10
-

From bf06824623dfac081de91ce4484b398d565ad7b7 Mon Sep 17 00:00:00 2001
From: Maxime Boissonneault
Date: Wed, 23 Oct 2024 15:45:16 -0400
Subject: [PATCH 65/66] fix spacing

---
 common/main.tf | 300 ++++++++++++++++++++++++-------------------------
 1 file changed, 150 insertions(+), 150 deletions(-)

diff --git a/common/main.tf b/common/main.tf
index 1b041b2..b9e0914 100644
--- a/common/main.tf
+++ b/common/main.tf
@@ -7,7 +7,7 @@ variable "pool" {
 }
 variable "TFC_WORKSPACE_NAME" {
   type = string
-  default = ""
+  default = ""
 }
 variable "tfe_token" {
   type = string
@@ -17,12 +17,12 @@ variable "cloud_name" {
   type = string
   default = ""
 }
-variable "prometheus_password" {
-  type = string
+variable "prometheus_password" {
+  type = string
   default = ""
 }
 variable "credentials_hieradata" {
   default= {}
 }
-variable "cloud_suffix" {
+variable "suffix" {
   type = string
   default = ""
 }
@@ -64,7 +64,7 @@ locals {
     cluster_purpose = "cours_academiques"
     config_git_url = "https://github.com/ComputeCanada/puppet-magic_castle.git"
     config_version = "2972853"
-
+
     instances_type_map = {
       arbutus = {
         mgmt = "p8-12gb"
@@ -124,154 +124,154 @@ locals {
   default = {
     instances_map = {
       arbutus = {
-        mgmt = {
-          type = try(local.custom.instances_type_map.arbutus.mgmt, local.default_pod.instances_type_map.arbutus.mgmt),
-          tags = ["puppet", "mgmt", "nfs"],
-          disk_size = 20,
-          count = 1
-        }
-        login = {
-          type = try(local.custom.instances_type_map.arbutus.login, local.default_pod.instances_type_map.arbutus.login),
-          tags = ["login", "public", "proxy"],
-          disk_size = 20,
-          count = 1
-        }
-        nodecpu = {
-          type = try(local.custom.instances_type_map.arbutus.cpu, local.default_pod.instances_type_map.arbutus.cpu),
-          tags = ["node"],
-          count = try(local.custom.n.cpu, local.default_pod.n.cpu),
-          image = try(local.custom.image_cpu, local.default_pod.image_cpu),
-        }
-        nodecpupool = {
-          type = try(local.custom.instances_type_map.arbutus.cpupool, local.default_pod.instances_type_map.arbutus.cpupool),
-          tags = ["node", "pool"],
-          count = try(local.custom.n.cpupool, local.default_pod.n.cpupool),
-          image = try(local.custom.image_cpu, local.default_pod.image_cpu),
-        }
-        nodegpu = {
-          type = try(local.custom.instances_type_map.arbutus.gpu, local.default_pod.instances_type_map.arbutus.gpu),
-          tags = ["node"],
-          count = try(local.custom.n.gpu, local.default_pod.n.gpu),
-          image = try(local.custom.image_gpu, local.default_pod.image_gpu),
-        }
-        nodegpupool = {
-          type = try(local.custom.instances_type_map.arbutus.gpupool, local.default_pod.instances_type_map.arbutus.gpupool),
-          tags = ["node", "pool"],
-          count = try(local.custom.n.gpupool, local.default_pod.n.gpupool),
-          image = try(local.custom.image_gpu, local.default_pod.image_gpu),
-        }
+        mgmt = {
+          type = try(local.custom.instances_type_map.arbutus.mgmt, local.default_pod.instances_type_map.arbutus.mgmt),
+          tags = ["puppet", "mgmt", "nfs"],
+          disk_size = 20,
+          count = 1
+        }
+        login = {
+          type = try(local.custom.instances_type_map.arbutus.login, local.default_pod.instances_type_map.arbutus.login),
+          tags = ["login", "public", "proxy"],
+          disk_size = 20,
+          count = try(local.custom.n.login, local.default_pod.n.login)
+        }
+        nodecpu = {
+          type = try(local.custom.instances_type_map.arbutus.cpu, local.default_pod.instances_type_map.arbutus.cpu),
+          tags = ["node"],
+          count = try(local.custom.n.cpu, local.default_pod.n.cpu),
+          image = try(local.custom.image_cpu, local.default_pod.image_cpu),
+        }
+        nodecpupool = {
+          type = try(local.custom.instances_type_map.arbutus.cpupool, local.default_pod.instances_type_map.arbutus.cpupool),
+          tags = ["node", "pool"],
+          count = try(local.custom.n.cpupool, local.default_pod.n.cpupool),
+          image = try(local.custom.image_cpu, local.default_pod.image_cpu),
+        }
+        nodegpu = {
+          type = try(local.custom.instances_type_map.arbutus.gpu, local.default_pod.instances_type_map.arbutus.gpu),
+          tags = ["node"],
+          count = try(local.custom.n.gpu, local.default_pod.n.gpu),
+          image = try(local.custom.image_gpu, local.default_pod.image_gpu),
+        }
+        nodegpupool = {
+          type = try(local.custom.instances_type_map.arbutus.gpupool, local.default_pod.instances_type_map.arbutus.gpupool),
+          tags = ["node", "pool"],
+          count = try(local.custom.n.gpupool, local.default_pod.n.gpupool),
+          image = try(local.custom.image_gpu, local.default_pod.image_gpu),
+        }
       }
       beluga = {
-        mgmt = {
-          type = try(local.custom.instances_type_map.beluga.mgmt, local.default_pod.instances_type_map.beluga.mgmt),
-          tags = ["puppet", "mgmt", "nfs"],
-          disk_size = 20,
-          count = 1
-        }
-        login = {
-          type = try(local.custom.instances_type_map.beluga.login, local.default_pod.instances_type_map.beluga.login),
-          tags = ["login", "public", "proxy"],
-          disk_size = 20,
-          count = try(local.custom.n.login, local.default_pod.n.login)
-        }
-        nodecpu = {
-          type = try(local.custom.instances_type_map.beluga.cpu, local.default_pod.instances_type_map.beluga.cpu),
-          disk_size = 20
-          tags = ["node"],
-          count = try(local.custom.n.cpu, local.default_pod.n.cpu),
-          image = try(local.custom.image_cpu, local.default_pod.image_cpu),
-        }
-        nodecpupool = {
-          type = try(local.custom.instances_type_map.beluga.cpupool, local.default_pod.instances_type_map.beluga.cpupool),
-          disk_size = 20
-          tags = ["node", "pool"],
-          count = try(local.custom.n.cpupool, local.default_pod.n.cpupool),
-          image = try(local.custom.image_cpu, local.default_pod.image_cpu),
-        }
+        mgmt = {
+          type = try(local.custom.instances_type_map.beluga.mgmt, local.default_pod.instances_type_map.beluga.mgmt),
+          tags = ["puppet", "mgmt", "nfs"],
+          disk_size = 20,
+          count = 1
+        }
+        login = {
+          type = try(local.custom.instances_type_map.beluga.login, local.default_pod.instances_type_map.beluga.login),
+          tags = ["login", "public", "proxy"],
+          disk_size = 20,
+          count = try(local.custom.n.login, local.default_pod.n.login)
+        }
+        nodecpu = {
+          type = try(local.custom.instances_type_map.beluga.cpu, local.default_pod.instances_type_map.beluga.cpu),
+          disk_size = 20
+          tags = ["node"],
+          count = try(local.custom.n.cpu, local.default_pod.n.cpu),
+          image = try(local.custom.image_cpu, local.default_pod.image_cpu),
+        }
+        nodecpupool = {
+          type = try(local.custom.instances_type_map.beluga.cpupool, local.default_pod.instances_type_map.beluga.cpupool),
+          disk_size = 20
+          tags = ["node", "pool"],
+          count = try(local.custom.n.cpupool, local.default_pod.n.cpupool),
+          image = try(local.custom.image_cpu, local.default_pod.image_cpu),
+        }
       }
       juno = {
-        mgmt = {
-          type = try(local.custom.instances_type_map.juno.mgmt, local.default_pod.instances_type_map.juno.mgmt),
-          tags = ["puppet", "mgmt", "nfs"],
-          disk_size = 20,
-          count = 1
-        }
-        login = {
-          type = try(local.custom.instances_type_map.juno.login, local.default_pod.instances_type_map.juno.login),
-          tags = ["login", "public", "proxy"],
-          disk_size = 20,
-          count = try(local.custom.n.login, local.default_pod.n.login)
-        }
-        nodecpu = {
-          type = try(local.custom.instances_type_map.juno.cpu, local.default_pod.instances_type_map.juno.cpu),
-          disk_size = 20
-          tags = ["node"],
-          count = try(local.custom.n.cpu, local.default_pod.n.cpu),
-          image = try(local.custom.image_cpu, local.default_pod.image_cpu),
-        }
-        nodecpupool = {
-          type = try(local.custom.instances_type_map.juno.cpupool, local.default_pod.instances_type_map.juno.cpupool),
-          disk_size = 20
-          tags = ["node", "pool"],
-          count = try(local.custom.n.cpupool, local.default_pod.n.cpupool),
-          image = try(local.custom.image_cpu, local.default_pod.image_cpu),
-        }
-        nodegpu = {
-          type = try(local.custom.instances_type_map.juno.gpu, local.default_pod.instances_type_map.juno.gpu),
-          tags = ["node"],
-          count = try(local.custom.n.gpu, local.default_pod.n.gpu),
-          mig = try(local.custom.mig.gpu, local.default_pod.mig.gpu)
-          image = try(local.custom.image_gpu, local.default_pod.image_gpu),
-          disk_size = "50"
-        }
-        nodegpupool = {
-          type = try(local.custom.instances_type_map.juno.gpupool, local.default_pod.instances_type_map.juno.gpupool16),
-          tags = ["node", "pool"],
-          count = try(local.custom.n.gpupool, 0),
-          mig = try(local.custom.mig.gpupool, local.default_pod.mig.gpupool)
-          image = try(local.custom.image_gpu, local.default_pod.image_gpu),
-          disk_size = "50"
-        }
-        nodegpupool16 = {
-          type = try(local.custom.instances_type_map.juno.gpupool16, local.default_pod.instances_type_map.juno.gpupool16),
-          tags = ["node", "pool"],
-          count = try(local.custom.n.gpupool16, 0),
-          mig = try(local.custom.mig.gpupool16, local.default_pod.mig.gpupool16)
-          image = try(local.custom.image_gpu, local.default_pod.image_gpu),
-          disk_size = "50"
-        }
-        nodegpupool16-cq = {
-          type = try(local.custom.instances_type_map.juno.gpupool16-cq, local.default_pod.instances_type_map.juno.gpupool16-cq),
-          tags = ["node", "pool"],
-          count = try(local.custom.n.gpupool16-cq, 0),
-          mig = try(local.custom.mig.gpupool16-cq, local.default_pod.mig.gpupool16-cq)
-          image = try(local.custom.image_gpu, local.default_pod.image_gpu),
-          disk_size = "50"
-        }
-        nodegpupool12 = {
-          type = try(local.custom.instances_type_map.juno.gpupool12, local.default_pod.instances_type_map.juno.gpupool12),
-          tags = ["node", "pool"],
-          count = try(local.custom.n.gpupool12, 0),
-          mig = try(local.custom.mig.gpupool12, local.default_pod.mig.gpupool12)
-          image = try(local.custom.image_gpu, local.default_pod.image_gpu),
-          disk_size = "50"
-        }
-        nodegpupool12-j = {
-          type = try(local.custom.instances_type_map.juno.gpupool12-j, local.default_pod.instances_type_map.juno.gpupool12-j),
-          tags = ["node", "pool"],
-          count = try(local.custom.n.gpupool12-j, 0),
-          mig = try(local.custom.mig.gpupool12-j, local.default_pod.mig.gpupool12-j)
-          image = try(local.custom.image_gpu, local.default_pod.image_gpu),
-          disk_size = "50"
-        }
-        nodegpupool80 = {
-          type = try(local.custom.instances_type_map.juno.gpupool80, local.default_pod.instances_type_map.juno.gpupool80),
-          tags = ["node", "pool"],
-          count = try(local.custom.n.gpupool80, 0),
-          mig = try(local.custom.mig.gpupool80, local.default_pod.mig.gpupool80)
-          image = try(local.custom.image_gpu, local.default_pod.image_gpu),
-          disk_size = "50"
-        }
+        mgmt = {
+          type = try(local.custom.instances_type_map.juno.mgmt, local.default_pod.instances_type_map.juno.mgmt),
+          tags = ["puppet", "mgmt", "nfs"],
+          disk_size = 20,
+          count = 1
+        }
+        login = {
+          type = try(local.custom.instances_type_map.juno.login, local.default_pod.instances_type_map.juno.login),
+          tags = ["login", "public", "proxy"],
+          disk_size = 20,
+          count = try(local.custom.n.login, local.default_pod.n.login)
+        }
+        nodecpu = {
+          type = try(local.custom.instances_type_map.juno.cpu, local.default_pod.instances_type_map.juno.cpu),
+          disk_size = 20
+          tags = ["node"],
+          count = try(local.custom.n.cpu, local.default_pod.n.cpu),
+          image = try(local.custom.image_cpu, local.default_pod.image_cpu),
+        }
+        nodecpupool = {
+          type = try(local.custom.instances_type_map.juno.cpupool, local.default_pod.instances_type_map.juno.cpupool),
+          disk_size = 20
+          tags = ["node", "pool"],
+          count = try(local.custom.n.cpupool, local.default_pod.n.cpupool),
+          image = try(local.custom.image_cpu, local.default_pod.image_cpu),
+        }
+        nodegpu = {
+          type = try(local.custom.instances_type_map.juno.gpu, local.default_pod.instances_type_map.juno.gpu),
+          tags = ["node"],
+          count = try(local.custom.n.gpu, local.default_pod.n.gpu),
+          mig = try(local.custom.mig.gpu, local.default_pod.mig.gpu)
+          image = try(local.custom.image_gpu, local.default_pod.image_gpu),
+          disk_size = "50"
+        }
+        nodegpupool = {
+          type = try(local.custom.instances_type_map.juno.gpupool, local.default_pod.instances_type_map.juno.gpupool16),
+          tags = ["node", "pool"],
+          count = try(local.custom.n.gpupool, 0),
+          mig = try(local.custom.mig.gpupool, local.default_pod.mig.gpupool)
+          image = try(local.custom.image_gpu, local.default_pod.image_gpu),
+          disk_size = "50"
+        }
+        nodegpupool16 = {
+          type = try(local.custom.instances_type_map.juno.gpupool16, local.default_pod.instances_type_map.juno.gpupool16),
+          tags = ["node", "pool"],
+          count = try(local.custom.n.gpupool16, 0),
+          mig = try(local.custom.mig.gpupool16, local.default_pod.mig.gpupool16)
+          image = try(local.custom.image_gpu, local.default_pod.image_gpu),
+          disk_size = "50"
+        }
+        nodegpupool16-cq = {
+          type = try(local.custom.instances_type_map.juno.gpupool16-cq, local.default_pod.instances_type_map.juno.gpupool16-cq),
+          tags = ["node", "pool"],
+          count = try(local.custom.n.gpupool16-cq, 0),
+          mig = try(local.custom.mig.gpupool16-cq, local.default_pod.mig.gpupool16-cq)
+          image = try(local.custom.image_gpu, local.default_pod.image_gpu),
+          disk_size = "50"
+        }
+        nodegpupool12 = {
+          type = try(local.custom.instances_type_map.juno.gpupool12, local.default_pod.instances_type_map.juno.gpupool12),
+          tags = ["node", "pool"],
+          count = try(local.custom.n.gpupool12, 0),
+          mig = try(local.custom.mig.gpupool12, local.default_pod.mig.gpupool12)
+          image = try(local.custom.image_gpu, local.default_pod.image_gpu),
+          disk_size = "50"
+        }
+        nodegpupool12-j = {
+          type = try(local.custom.instances_type_map.juno.gpupool12-j, local.default_pod.instances_type_map.juno.gpupool12-j),
+          tags = ["node", "pool"],
+          count = try(local.custom.n.gpupool12-j, 0),
+          mig = try(local.custom.mig.gpupool12-j, local.default_pod.mig.gpupool12-j)
+          image = try(local.custom.image_gpu, local.default_pod.image_gpu),
+          disk_size = "50"
+        }
+        nodegpupool80 = {
+          type = try(local.custom.instances_type_map.juno.gpupool80, local.default_pod.instances_type_map.juno.gpupool80),
+          tags = ["node", "pool"],
+          count = try(local.custom.n.gpupool80, 0),
+          mig = try(local.custom.mig.gpupool80, local.default_pod.mig.gpupool80)
+          image = try(local.custom.image_gpu, local.default_pod.image_gpu),
+          disk_size = "50"
+        }
       }
     }
     volumes_map = {
@@ -324,7 +324,7 @@ module "openstack" {
   config_git_url = try(local.custom.config_git_url, local.default_pod.config_git_url)
   config_version = try(local.custom.config_version, local.default_pod.config_version)

-  cluster_name = "${local.name}${var.cloud_suffix}"
+  cluster_name = "${local.name}${var.suffix}"
   domain = "calculquebec.cloud"

   image = try(local.custom.image, local.default_pod.image)

From d314ed1ba1f028475c7234fc1fc64a76977f0d63 Mon Sep 17 00:00:00 2001
From: Maxime Boissonneault
Date: Wed, 23 Oct 2024 15:59:42 -0400
Subject: [PATCH 66/66] renamed n to nnodes for clarity

---
 common/main.tf                | 38 +++++++++++++++++------------------
 mcgill-scs/custom.tf          |  2 +-
 nvidia-workshop/custom.tf     |  2 +-
 test-mc-infra-cours/custom.tf |  2 +-
 4 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/common/main.tf b/common/main.tf
index b9e0914..7dffaea 100644
--- a/common/main.tf
+++ b/common/main.tf
@@ -38,7 +38,7 @@ locals {
     image_gpu = "snapshot-gpunode-2024-R810.5"

     nb_users = 0
-    n = {
+    nnodes = {
       cpu = 0
       gpu = 0
       cpupool = 0
@@ -134,30 +134,30 @@ locals {
           type = try(local.custom.instances_type_map.arbutus.login, local.default_pod.instances_type_map.arbutus.login),
           tags = ["login", "public", "proxy"],
           disk_size = 20,
-          count = try(local.custom.n.login, local.default_pod.n.login)
+          count = try(local.custom.nnodes.login, local.default_pod.nnodes.login)
         }
         nodecpu = {
           type = try(local.custom.instances_type_map.arbutus.cpu, local.default_pod.instances_type_map.arbutus.cpu),
           tags = ["node"],
-          count = try(local.custom.n.cpu, local.default_pod.n.cpu),
+          count = try(local.custom.nnodes.cpu, local.default_pod.nnodes.cpu),
           image = try(local.custom.image_cpu, local.default_pod.image_cpu),
         }
         nodecpupool = {
           type = try(local.custom.instances_type_map.arbutus.cpupool, local.default_pod.instances_type_map.arbutus.cpupool),
           tags = ["node", "pool"],
-          count = try(local.custom.n.cpupool, local.default_pod.n.cpupool),
+          count = try(local.custom.nnodes.cpupool, local.default_pod.nnodes.cpupool),
           image = try(local.custom.image_cpu, local.default_pod.image_cpu),
         }
         nodegpu = {
           type = try(local.custom.instances_type_map.arbutus.gpu, local.default_pod.instances_type_map.arbutus.gpu),
           tags = ["node"],
-          count = try(local.custom.n.gpu, local.default_pod.n.gpu),
+          count = try(local.custom.nnodes.gpu, local.default_pod.nnodes.gpu),
           image = try(local.custom.image_gpu, local.default_pod.image_gpu),
         }
         nodegpupool = {
           type = try(local.custom.instances_type_map.arbutus.gpupool, local.default_pod.instances_type_map.arbutus.gpupool),
           tags = ["node", "pool"],
-          count = try(local.custom.n.gpupool, local.default_pod.n.gpupool),
+          count = try(local.custom.nnodes.gpupool, local.default_pod.nnodes.gpupool),
           image = try(local.custom.image_gpu, local.default_pod.image_gpu),
         }
       }
@@ -172,20 +172,20 @@ locals {
           type = try(local.custom.instances_type_map.beluga.login, local.default_pod.instances_type_map.beluga.login),
           tags = ["login", "public", "proxy"],
           disk_size = 20,
-          count = try(local.custom.n.login, local.default_pod.n.login)
+          count = try(local.custom.nnodes.login, local.default_pod.nnodes.login)
         }
         nodecpu = {
           type = try(local.custom.instances_type_map.beluga.cpu, local.default_pod.instances_type_map.beluga.cpu),
           disk_size = 20
           tags = ["node"],
-          count = try(local.custom.n.cpu, local.default_pod.n.cpu),
+          count = try(local.custom.nnodes.cpu, local.default_pod.nnodes.cpu),
           image = try(local.custom.image_cpu, local.default_pod.image_cpu),
         }
         nodecpupool = {
           type = try(local.custom.instances_type_map.beluga.cpupool, local.default_pod.instances_type_map.beluga.cpupool),
           disk_size = 20
           tags = ["node", "pool"],
-          count = try(local.custom.n.cpupool, local.default_pod.n.cpupool),
+          count = try(local.custom.nnodes.cpupool, local.default_pod.nnodes.cpupool),
           image = try(local.custom.image_cpu, local.default_pod.image_cpu),
         }
       }
@@ -200,26 +200,26 @@ locals {
           type = try(local.custom.instances_type_map.juno.login, local.default_pod.instances_type_map.juno.login),
           tags = ["login", "public", "proxy"],
           disk_size = 20,
-          count = try(local.custom.n.login, local.default_pod.n.login)
+          count = try(local.custom.nnodes.login, local.default_pod.nnodes.login)
         }
         nodecpu = {
           type = try(local.custom.instances_type_map.juno.cpu, local.default_pod.instances_type_map.juno.cpu),
           disk_size = 20
           tags = ["node"],
-          count = try(local.custom.n.cpu, local.default_pod.n.cpu),
+          count = try(local.custom.nnodes.cpu, local.default_pod.nnodes.cpu),
           image = try(local.custom.image_cpu, local.default_pod.image_cpu),
         }
         nodecpupool = {
           type = try(local.custom.instances_type_map.juno.cpupool, local.default_pod.instances_type_map.juno.cpupool),
           disk_size = 20
           tags = ["node", "pool"],
-          count = try(local.custom.n.cpupool, local.default_pod.n.cpupool),
+          count = try(local.custom.nnodes.cpupool, local.default_pod.nnodes.cpupool),
           image = try(local.custom.image_cpu, local.default_pod.image_cpu),
         }
         nodegpu = {
           type = try(local.custom.instances_type_map.juno.gpu, local.default_pod.instances_type_map.juno.gpu),
           tags = ["node"],
-          count = try(local.custom.n.gpu, local.default_pod.n.gpu),
+          count = try(local.custom.nnodes.gpu, local.default_pod.nnodes.gpu),
           mig = try(local.custom.mig.gpu, local.default_pod.mig.gpu)
           image = try(local.custom.image_gpu, local.default_pod.image_gpu),
           disk_size = "50"
@@ -227,7 +227,7 @@ locals {
         nodegpupool = {
           type = try(local.custom.instances_type_map.juno.gpupool, local.default_pod.instances_type_map.juno.gpupool16),
           tags = ["node", "pool"],
-          count = try(local.custom.n.gpupool, 0),
+          count = try(local.custom.nnodes.gpupool, 0),
           mig = try(local.custom.mig.gpupool, local.default_pod.mig.gpupool)
           image = try(local.custom.image_gpu, local.default_pod.image_gpu),
           disk_size = "50"
@@ -235,7 +235,7 @@ locals {
         nodegpupool16 = {
           type = try(local.custom.instances_type_map.juno.gpupool16, local.default_pod.instances_type_map.juno.gpupool16),
           tags = ["node", "pool"],
-          count = try(local.custom.n.gpupool16, 0),
+          count = try(local.custom.nnodes.gpupool16, 0),
           mig = try(local.custom.mig.gpupool16, local.default_pod.mig.gpupool16)
           image = try(local.custom.image_gpu, local.default_pod.image_gpu),
           disk_size = "50"
@@ -243,7 +243,7 @@ locals {
         nodegpupool16-cq = {
           type = try(local.custom.instances_type_map.juno.gpupool16-cq, local.default_pod.instances_type_map.juno.gpupool16-cq),
           tags = ["node", "pool"],
-          count = try(local.custom.n.gpupool16-cq, 0),
+          count = try(local.custom.nnodes.gpupool16-cq, 0),
           mig = try(local.custom.mig.gpupool16-cq, local.default_pod.mig.gpupool16-cq)
           image = try(local.custom.image_gpu, local.default_pod.image_gpu),
           disk_size = "50"
@@ -251,7 +251,7 @@ locals {
         nodegpupool12 = {
           type = try(local.custom.instances_type_map.juno.gpupool12, local.default_pod.instances_type_map.juno.gpupool12),
           tags = ["node", "pool"],
-          count = try(local.custom.n.gpupool12, 0),
+          count = try(local.custom.nnodes.gpupool12, 0),
           mig = try(local.custom.mig.gpupool12, local.default_pod.mig.gpupool12)
           image = try(local.custom.image_gpu, local.default_pod.image_gpu),
           disk_size = "50"
@@ -259,7 +259,7 @@ locals {
         nodegpupool12-j = {
           type = try(local.custom.instances_type_map.juno.gpupool12-j, local.default_pod.instances_type_map.juno.gpupool12-j),
           tags = ["node", "pool"],
-          count = try(local.custom.n.gpupool12-j, 0),
+          count = try(local.custom.nnodes.gpupool12-j, 0),
           mig = try(local.custom.mig.gpupool12-j, local.default_pod.mig.gpupool12-j)
           image = try(local.custom.image_gpu, local.default_pod.image_gpu),
           disk_size = "50"
@@ -267,7 +267,7 @@ locals {
         nodegpupool80 = {
           type = try(local.custom.instances_type_map.juno.gpupool80, local.default_pod.instances_type_map.juno.gpupool80),
           tags = ["node", "pool"],
-          count = try(local.custom.n.gpupool80, 0),
+          count = try(local.custom.nnodes.gpupool80, 0),
           mig = try(local.custom.mig.gpupool80, local.default_pod.mig.gpupool80)
           image = try(local.custom.image_gpu, local.default_pod.image_gpu),
           disk_size = "50"
diff --git a/mcgill-scs/custom.tf b/mcgill-scs/custom.tf
index c3fd147..d4dde78 100644
--- a/mcgill-scs/custom.tf
+++ b/mcgill-scs/custom.tf
@@ -1,6 +1,6 @@
 locals {
   custom = {
-    n = {
+    nnodes = {
       cpu = 0
       cpupool = 0
       gpupool16 = 6
diff --git a/nvidia-workshop/custom.tf b/nvidia-workshop/custom.tf
index ed28edc..e4ba0c2 100644
--- a/nvidia-workshop/custom.tf
+++ b/nvidia-workshop/custom.tf
@@ -1,6 +1,6 @@
 locals {
   custom = {
-    n = {
+    nnodes = {
       cpu = 0
       cpupool = 0
       gpu = 0
diff --git a/test-mc-infra-cours/custom.tf b/test-mc-infra-cours/custom.tf
index cd16335..9cf142b 100644
--- a/test-mc-infra-cours/custom.tf
+++ b/test-mc-infra-cours/custom.tf
@@ -1,6 +1,6 @@
 locals {
   custom = {
-    n = {
+    nnodes = {
       cpu = 0
       cpupool = 1
       gpu = 0