From ed07020a6ce8e1f6c93234ebd464b36e90acc391 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 20 Jun 2023 18:08:50 +0200 Subject: [PATCH 01/12] Added vega config --- config/izum_vega.py | 107 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 config/izum_vega.py diff --git a/config/izum_vega.py b/config/izum_vega.py new file mode 100644 index 00000000..dd38753e --- /dev/null +++ b/config/izum_vega.py @@ -0,0 +1,107 @@ +from os import environ +username = environ.get('USER') + +# This is an example configuration file +site_configuration = { + 'general': [ + { + 'remote_detect': True, + } + ], + 'systems': [ + { + 'name': 'vega', + 'descr': 'Vega, a EuroHPC JU system', + 'modules_system': 'lmod', + 'hostnames': ['vglogin*','cn*','gn*'], + 'stagedir': f'/ceph/hpc/scratch/user/{username}/reframe_runs/staging', + 'outputdir': f'reframe_runs/', + 'partitions': [ + { + 'name': 'cpu', + 'scheduler': 'slurm', + 'prepare_cmds': ['source /cvmfs/pilot.eessi-hpc.org/latest/init/bash'], + 'launcher': 'srun', + 'access': ['-p cpu', '--export=None'], + 'environs': ['default'], + 'max_jobs': 120, + 'features': [ + 'cpu', + ], + 'descr': 'CPU partition Standard, see https://en-doc.vega.izum.si/architecture/' + }, + { + 'name': 'gpu', + 'scheduler': 'slurm', + 'prepare_cmds': ['source /cvmfs/pilot.eessi-hpc.org/latest/init/bash'], + 'launcher': 'srun', + 'access': ['-p gpu', '--export=None'], + 'environs': ['default'], + 'max_jobs': 60, + 'devices': [ + { + 'type': 'gpu', + 'num_devices': 4, + } + ], + 'resources': [ + { + 'name': '_rfm_gpu', + 'options': ['--gpus-per-node={num_gpus_per_node}'], + } + ], + 'features': [ + 'gpu', + ], + 'descr': 'GPU partition, see https://en-doc.vega.izum.si/architecture/' + }, + ] + }, + ], + 'environments': [ + { + 'name': 'default', + 'cc': 'cc', + 'cxx': '', + 'ftn': '', + }, + ], + 'logging': [ + { + 'level': 'debug', + 'handlers': [ + { + 'type': 'stream', + 'name': 'stdout', + 'level': 
'info', + 'format': '%(message)s' + }, + { + 'type': 'file', + 'name': 'reframe.log', + 'level': 'debug', + 'format': '[%(asctime)s] %(levelname)s: %(check_info)s: %(message)s', # noqa: E501 + 'append': False, + 'timestamp': "%Y%m%d_%H%M%S", + } + ], + 'handlers_perflog': [ + { + 'type': 'filelog', + 'prefix': '%(check_system)s/%(check_partition)s', + 'level': 'info', + 'format': ( + '%(check_job_completion_time)s|reframe %(version)s|' + '%(check_info)s|jobid=%(check_jobid)s|' + '%(check_perf_var)s=%(check_perf_value)s|' + 'ref=%(check_perf_ref)s ' + '(l=%(check_perf_lower_thres)s, ' + 'u=%(check_perf_upper_thres)s)|' + '%(check_perf_unit)s' + ), + 'append': True + } + ] + } + ], +} From 1e5257e1fe6b9be396b467481285dc8a739cf15b Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 20 Jun 2023 18:14:22 +0200 Subject: [PATCH 02/12] You don't have access to the scratch partitions unless requested. This config will run from the homedir --- config/izum_vega.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/izum_vega.py b/config/izum_vega.py index dd38753e..47303a0a 100644 --- a/config/izum_vega.py +++ b/config/izum_vega.py @@ -14,8 +14,8 @@ 'descr': 'Vega, a EuroHPC JU system', 'modules_system': 'lmod', 'hostnames': ['vglogin*','cn*','gn*'], - 'stagedir': f'/ceph/hpc/scratch/user/{username}/reframe_runs/staging', - 'outputdir': f'reframe_runs/', + 'stagedir': f'reframe_runs/staging', + 'outputdir': f'reframe_runs/output', 'partitions': [ { 'name': 'cpu', From a6b54b4bc595a34b95eecd708208e708632e6a1f Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 20 Jun 2023 18:20:36 +0200 Subject: [PATCH 03/12] Need to use PMIx if we want to use srun --- config/izum_vega.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/izum_vega.py b/config/izum_vega.py index 47303a0a..62679419 100644 --- a/config/izum_vega.py +++ b/config/izum_vega.py @@ -20,7 +20,7 @@ { 'name': 'cpu', 'scheduler': 'slurm', - 
'prepare_cmds': ['source /cvmfs/pilot.eessi-hpc.org/latest/init/bash'], + 'prepare_cmds': ['source /cvmfs/pilot.eessi-hpc.org/latest/init/bash', 'export SLURM_MPI_TYPE=pmix'], 'launcher': 'srun', 'access': ['-p cpu', '--export=None'], 'environs': ['default'], @@ -33,7 +33,7 @@ { 'name': 'gpu', 'scheduler': 'slurm', - 'prepare_cmds': ['source /cvmfs/pilot.eessi-hpc.org/latest/init/bash'], + 'prepare_cmds': ['source /cvmfs/pilot.eessi-hpc.org/latest/init/bash', 'export SLURM_MPI_TYPE=pmix'], 'launcher': 'srun', 'access': ['-p gpu', '--export=None'], 'environs': ['default'], From 48c59a8fca8a31442eebe21290f17224352490de Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 20 Jun 2023 18:24:36 +0200 Subject: [PATCH 04/12] Trying with mpirun, since we got an unexpected error --- config/izum_vega.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/config/izum_vega.py b/config/izum_vega.py index 62679419..63c591bc 100644 --- a/config/izum_vega.py +++ b/config/izum_vega.py @@ -20,8 +20,8 @@ { 'name': 'cpu', 'scheduler': 'slurm', - 'prepare_cmds': ['source /cvmfs/pilot.eessi-hpc.org/latest/init/bash', 'export SLURM_MPI_TYPE=pmix'], - 'launcher': 'srun', + 'prepare_cmds': ['source /cvmfs/pilot.eessi-hpc.org/latest/init/bash'], + 'launcher': 'mpirun', 'access': ['-p cpu', '--export=None'], 'environs': ['default'], 'max_jobs': 120, @@ -33,8 +33,8 @@ { 'name': 'gpu', 'scheduler': 'slurm', - 'prepare_cmds': ['source /cvmfs/pilot.eessi-hpc.org/latest/init/bash', 'export SLURM_MPI_TYPE=pmix'], - 'launcher': 'srun', + 'prepare_cmds': ['source /cvmfs/pilot.eessi-hpc.org/latest/init/bash'], + 'launcher': 'mpirun', 'access': ['-p gpu', '--export=None'], 'environs': ['default'], 'max_jobs': 60, From 5100250122cd1dde192d8bdbf1b2b78e3cb24e3d Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 22 Jun 2023 16:51:44 +0200 Subject: [PATCH 05/12] Need to set SLURM_EXPORT_ENV=ALL to make sure that _within_ the job, the environment is exported to the 
job steps --- config/izum_vega.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/config/izum_vega.py b/config/izum_vega.py index 63c591bc..49467ff6 100644 --- a/config/izum_vega.py +++ b/config/izum_vega.py @@ -20,7 +20,10 @@ { 'name': 'cpu', 'scheduler': 'slurm', - 'prepare_cmds': ['source /cvmfs/pilot.eessi-hpc.org/latest/init/bash'], + 'prepare_cmds': [ + 'source /cvmfs/pilot.eessi-hpc.org/latest/init/bash', + 'export SLURM_EXPORT_ENV=ALL', + ], 'launcher': 'mpirun', 'access': ['-p cpu', '--export=None'], 'environs': ['default'], @@ -33,7 +36,10 @@ { 'name': 'gpu', 'scheduler': 'slurm', - 'prepare_cmds': ['source /cvmfs/pilot.eessi-hpc.org/latest/init/bash'], + 'prepare_cmds': [ + 'source /cvmfs/pilot.eessi-hpc.org/latest/init/bash', + 'export SLURM_EXPORT_ENV=ALL', + ], 'launcher': 'mpirun', 'access': ['-p gpu', '--export=None'], 'environs': ['default'], From 6e0e85bf25081a658fea1ee056a6b0f784b42a3c Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 28 Jun 2023 14:42:30 +0200 Subject: [PATCH 06/12] Added setting OMPI_MCA_pml=ucx --- config/izum_vega.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/config/izum_vega.py b/config/izum_vega.py index 49467ff6..91080334 100644 --- a/config/izum_vega.py +++ b/config/izum_vega.py @@ -23,6 +23,9 @@ 'prepare_cmds': [ 'source /cvmfs/pilot.eessi-hpc.org/latest/init/bash', 'export SLURM_EXPORT_ENV=ALL', + # Avoid https://github.com/EESSI/software-layer/issues/136 + # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1) + 'export OMPI_MCA_pml=ucx', ], 'launcher': 'mpirun', 'access': ['-p cpu', '--export=None'], @@ -39,6 +42,9 @@ 'prepare_cmds': [ 'source /cvmfs/pilot.eessi-hpc.org/latest/init/bash', 'export SLURM_EXPORT_ENV=ALL', + # Avoid https://github.com/EESSI/software-layer/issues/136 + # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1) + 'export OMPI_MCA_pml=ucx', ], 'launcher': 'mpirun', 'access': ['-p 
gpu', '--export=None'], From 81b76ef5d225dee2c1ad4d1f23bbce30986f226d Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 28 Jun 2023 14:57:17 +0200 Subject: [PATCH 07/12] Use srun as launcher --- config/izum_vega.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/config/izum_vega.py b/config/izum_vega.py index 91080334..88ae298e 100644 --- a/config/izum_vega.py +++ b/config/izum_vega.py @@ -23,11 +23,12 @@ 'prepare_cmds': [ 'source /cvmfs/pilot.eessi-hpc.org/latest/init/bash', 'export SLURM_EXPORT_ENV=ALL', + 'export SLURM_MPI_TYPE=pmix', # Needed when using srun launcher # Avoid https://github.com/EESSI/software-layer/issues/136 # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1) 'export OMPI_MCA_pml=ucx', ], - 'launcher': 'mpirun', + 'launcher': 'srun', 'access': ['-p cpu', '--export=None'], 'environs': ['default'], 'max_jobs': 120, @@ -42,11 +43,12 @@ 'prepare_cmds': [ 'source /cvmfs/pilot.eessi-hpc.org/latest/init/bash', 'export SLURM_EXPORT_ENV=ALL', + 'export SLURM_MPI_TYPE=pmix', # Needed when using srun launcher # Avoid https://github.com/EESSI/software-layer/issues/136 # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1) 'export OMPI_MCA_pml=ucx', ], - 'launcher': 'mpirun', + 'launcher': 'srun', 'access': ['-p gpu', '--export=None'], 'environs': ['default'], 'max_jobs': 60, From 2a02c2a46c007bfbf91c4ac3ab68cc73dedb7ab7 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 3 Jul 2023 12:39:18 +0200 Subject: [PATCH 08/12] Added more explanation. Replaced launcher with mpirun, due to srun issues on Vega - we can change this back again if srun would work again on Vega. Also, make sure _all_ output from ReFrame, including logging, ends up in the same prefix {HOME}/reframe_runs - previously log files were written to the current working dir in which the reframe command is executed. 
--- config/izum_vega.py | 46 +++++++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/config/izum_vega.py b/config/izum_vega.py index 88ae298e..e9d85d7a 100644 --- a/config/izum_vega.py +++ b/config/izum_vega.py @@ -1,10 +1,23 @@ -from os import environ -username = environ.get('USER') +from os import environ, makedirs + +from eessi.testsuite.constants import FEATURES, DEVICES + +# Get username of current user +homedir = environ.get('HOME') + +# This config will write all staging, output and logging to subdirs under this prefix +reframe_prefix = f'{homedir}/reframe_runs' +log_prefix = f'{home_prefix}/logs' + +# ReFrame complains if the directory for the file logger doesn't exist yet +makedirs(f'{log_prefix}', exist_ok=True) # This is an example configuration file site_configuration = { 'general': [ { + # Enable automatic detection of CPU architecture for each partition + # See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information 'remote_detect': True, } ], @@ -14,26 +27,28 @@ 'descr': 'Vega, a EuroHPC JU system', 'modules_system': 'lmod', 'hostnames': ['vglogin*','cn*','gn*'], - 'stagedir': f'reframe_runs/staging', - 'outputdir': f'reframe_runs/output', + 'prefix': reframe_prefix, 'partitions': [ { 'name': 'cpu', 'scheduler': 'slurm', 'prepare_cmds': [ 'source /cvmfs/pilot.eessi-hpc.org/latest/init/bash', + # Pass job environment variables like $PATH, etc., into job steps 'export SLURM_EXPORT_ENV=ALL', - 'export SLURM_MPI_TYPE=pmix', # Needed when using srun launcher + # Needed when using srun launcher + 'export SLURM_MPI_TYPE=pmix', # Avoid https://github.com/EESSI/software-layer/issues/136 # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1) 'export OMPI_MCA_pml=ucx', ], - 'launcher': 'srun', + 'launcher': 'mpirun', + # Use --export=None to avoid that login environment is passed down to submitted jobs 'access': ['-p cpu', 
'--export=None'], 'environs': ['default'], 'max_jobs': 120, 'features': [ - 'cpu', + FEATURES['CPU'], ], 'descr': 'CPU partition Standard, see https://en-doc.vega.izum.si/architecture/' }, @@ -42,19 +57,22 @@ 'scheduler': 'slurm', 'prepare_cmds': [ 'source /cvmfs/pilot.eessi-hpc.org/latest/init/bash', + # Pass job environment variables like $PATH, etc., into job steps 'export SLURM_EXPORT_ENV=ALL', - 'export SLURM_MPI_TYPE=pmix', # Needed when using srun launcher + # Needed when using srun launcher + 'export SLURM_MPI_TYPE=pmix', # Avoid https://github.com/EESSI/software-layer/issues/136 # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1) 'export OMPI_MCA_pml=ucx', ], - 'launcher': 'srun', + 'launcher': 'mpirun', + # Use --export=None to avoid that login environment is passed down to submitted jobs 'access': ['-p gpu', '--export=None'], 'environs': ['default'], 'max_jobs': 60, 'devices': [ { - 'type': 'gpu', + 'type': DEVICES['GPU'], 'num_devices': 4, } ], @@ -65,7 +83,7 @@ } ], 'features': [ - 'gpu', + FEATURES['GPU'], ], 'descr': 'GPU partition, see https://en-doc.vega.izum.si/architecture/' }, @@ -92,17 +110,17 @@ }, { 'type': 'file', - 'name': 'reframe.log', + 'name': f'{log_prefix}/reframe.log', 'level': 'debug', 'format': '[%(asctime)s] %(levelname)s: %(check_info)s: %(message)s', # noqa: E501 - 'append': False, + 'append': True, 'timestamp': "%Y%m%d_%H%M%S", } ], 'handlers_perflog': [ { 'type': 'filelog', - 'prefix': '%(check_system)s/%(check_partition)s', + 'prefix': f'{log_prefix}/%(check_system)s/%(check_partition)s', 'level': 'info', 'format': ( '%(check_job_completion_time)s|reframe %(version)s|' From 7e8494f4de1559ceab81a86558629d60ba2161ee Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 3 Jul 2023 12:48:30 +0200 Subject: [PATCH 09/12] Add comment about changing mpirun to srun for CPU autodetection --- config/izum_vega.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/config/izum_vega.py b/config/izum_vega.py index e9d85d7a..b15340ae 100644 --- a/config/izum_vega.py +++ b/config/izum_vega.py @@ -42,7 +42,7 @@ # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1) 'export OMPI_MCA_pml=ucx', ], - 'launcher': 'mpirun', + 'launcher': 'mpirun', # Needs to be temporarily changed to srun for cpu autodetection # Use --export=None to avoid that login environment is passed down to submitted jobs 'access': ['-p cpu', '--export=None'], 'environs': ['default'], @@ -65,7 +65,7 @@ # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1) 'export OMPI_MCA_pml=ucx', ], - 'launcher': 'mpirun', + 'launcher': 'mpirun', # Needs to be temporarily changed to srun for cpu autodetection # Use --export=None to avoid that login environment is passed down to submitted jobs 'access': ['-p gpu', '--export=None'], 'environs': ['default'], From d4b93b3ae54ad8e4a3f56d02a0906704b3da2a14 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 3 Jul 2023 13:08:20 +0200 Subject: [PATCH 10/12] Forgot to change the name in both places... 
--- config/izum_vega.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/izum_vega.py b/config/izum_vega.py index b15340ae..4f2e2169 100644 --- a/config/izum_vega.py +++ b/config/izum_vega.py @@ -7,7 +7,7 @@ # This config will write all staging, output and logging to subdirs under this prefix reframe_prefix = f'{homedir}/reframe_runs' -log_prefix = f'{home_prefix}/logs' +log_prefix = f'{reframe_prefix}/logs' # ReFrame complains if the directory for the file logger doesn't exist yet makedirs(f'{log_prefix}', exist_ok=True) From eb4966406e84a5903c1ea51e2a572aee4e884279 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 17 Jul 2023 16:16:30 +0200 Subject: [PATCH 11/12] Disabled setting SLURM_MPI_TYPE to pmix because it breaks launching applications with mpirun on Vega for some reason - it just hangs --- config/izum_vega.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/izum_vega.py b/config/izum_vega.py index 4f2e2169..03fc5fcf 100644 --- a/config/izum_vega.py +++ b/config/izum_vega.py @@ -37,7 +37,7 @@ # Pass job environment variables like $PATH, etc., into job steps 'export SLURM_EXPORT_ENV=ALL', # Needed when using srun launcher - 'export SLURM_MPI_TYPE=pmix', + # 'export SLURM_MPI_TYPE=pmix', # WARNING: this broke the GROMACS on Vega # Avoid https://github.com/EESSI/software-layer/issues/136 # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1) 'export OMPI_MCA_pml=ucx', From b66347203c474068eb1a84ea115a38e522e827cb Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 18 Jul 2023 10:27:23 +0200 Subject: [PATCH 12/12] Remove SLURM_MPI_TYPE also for GPU partition, as it breaks MPI runs --- config/izum_vega.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/izum_vega.py b/config/izum_vega.py index 03fc5fcf..4baa40c4 100644 --- a/config/izum_vega.py +++ b/config/izum_vega.py @@ -60,7 +60,7 @@ # Pass job environment variables like $PATH, etc., 
into job steps 'export SLURM_EXPORT_ENV=ALL', # Needed when using srun launcher - 'export SLURM_MPI_TYPE=pmix', + # 'export SLURM_MPI_TYPE=pmix', # WARNING: this broke the GROMACS on Vega # Avoid https://github.com/EESSI/software-layer/issues/136 # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1) 'export OMPI_MCA_pml=ucx',