diff --git a/.github/workflows/modules/fake_module.lua b/.github/workflows/modules/fake_module.lua
new file mode 100644
index 0000000000..e45cb640d6
--- /dev/null
+++ b/.github/workflows/modules/fake_module.lua
@@ -0,0 +1,3 @@
+setenv("INSIDE_GITHUB_ACTIONS", "true")
+-- Interfere with PATH so Lmod keeps a record
+prepend_path("PATH", "/snap/bin")
diff --git a/.github/workflows/tests_eessi_module.yml b/.github/workflows/tests_eessi_module.yml
index cbcffe6385..2bf4b39bde 100644
--- a/.github/workflows/tests_eessi_module.yml
+++ b/.github/workflows/tests_eessi_module.yml
@@ -7,13 +7,13 @@ on:
 permissions:
   contents: read # to fetch code (actions/checkout)
 jobs:
-  build:
+  basic_checks:
     runs-on: ubuntu-latest
     strategy:
       fail-fast: false
       matrix:
         EESSI_VERSION:
-        - 2023.06
+          - 2023.06
     steps:
       - name: Check out software-layer repository
         uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
@@ -45,10 +45,11 @@ jobs:
 
       - name: Test for archdetect_cpu functionality with invalid path
         run: |
-          . /cvmfs/software.eessi.io/versions/${{matrix.EESSI_VERSION}}/compat/linux/$(uname -m)/usr/share/Lmod/init/bash # Initialise Lmod
+          # Initialise Lmod
+          . /cvmfs/software.eessi.io/versions/${{matrix.EESSI_VERSION}}/compat/linux/$(uname -m)/usr/share/Lmod/init/bash
           export MODULEPATH=init/modules
           set +e # Do not exit immediately if a command exits with a non-zero status
-          export EESSI_ARCHDETECT_OPTIONS="dummy/cpu"
+          export EESSI_ARCHDETECT_OPTIONS_OVERRIDE="dummy/cpu"
           outfile="outfile.txt"
           module load EESSI/${{matrix.EESSI_VERSION}} > "${outfile}" 2>&1
           cat "${outfile}"
@@ -58,29 +59,149 @@ jobs:
             echo "Test for picking up invalid path on \${archdetect_cpu} FAILED" >&2
             exit 1
           fi
-          unset EESSI_ARCHDETECT_OPTIONS
+          unset EESSI_ARCHDETECT_OPTIONS_OVERRIDE
           set -e # Re-enable exit on non-zero status
+
+  lmod_and_init_script_comparison:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        EESSI_VERSION:
+          - 2023.06
+        EESSI_SOFTWARE_SUBDIR_OVERRIDE:
+          - x86_64/amd/zen3
+          - x86_64/amd/zen4
+        EESSI_ACCELERATOR_TARGET_OVERRIDE:
+          - accel/nvidia/cc80
+    steps:
+      - name: Check out software-layer repository
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+
+      - name: Mount EESSI CernVM-FS pilot repository
+        uses: cvmfs-contrib/github-action-cvmfs@55899ca74cf78ab874bdf47f5a804e47c198743c # v4.0
+        with:
+          cvmfs_config_package: https://github.com/EESSI/filesystem-layer/releases/download/latest/cvmfs-config-eessi_latest_all.deb
+          cvmfs_http_proxy: DIRECT
+          cvmfs_repositories: software.eessi.io
 
-      - name: Test for expected variables while adding dummy cpu archs and loading EESSI module
+      - name: Test for expected variables match between Lmod init script and original bash script
         run: |
-          . /cvmfs/software.eessi.io/versions/${{matrix.EESSI_VERSION}}/compat/linux/$(uname -m)/usr/share/Lmod/init/bash # Initialise Lmod
-          export MODULEPATH=init/modules
-          CPU_ARCH=$(./init/eessi_archdetect.sh -a cpupath)
-          export EESSI_ARCHDETECT_OPTIONS="dummy/cpu:${CPU_ARCH}:dummy1/cpu1"
+          # Initialise Lmod
+          . /cvmfs/software.eessi.io/versions/${{matrix.EESSI_VERSION}}/compat/linux/$(uname -m)/usr/share/Lmod/init/bash
+
+          # Set our path overrides according to our matrix
+          export EESSI_SOFTWARE_SUBDIR_OVERRIDE=${{matrix.EESSI_SOFTWARE_SUBDIR_OVERRIDE}}
+          export EESSI_ACCELERATOR_TARGET_OVERRIDE=${{matrix.EESSI_ACCELERATOR_TARGET_OVERRIDE}}
+
           moduleoutfile="moduleout.txt"
           sourceoutfile="sourceout.txt"
+
+          # First do (and undo) the Lmod initialisation
+          export MODULEPATH=init/modules
+          # Turn on debug output in case we want to take a look
+          export EESSI_DEBUG_INIT=true
+          CPU_ARCH=$(./init/eessi_archdetect.sh -a cpupath)
+          export EESSI_ARCHDETECT_OPTIONS_OVERRIDE="dummy/cpu:${CPU_ARCH}:dummy1/cpu1"
           module load EESSI/${{matrix.EESSI_VERSION}}
-          env | grep -E '^(EESSI_S|EESSI_C)' | sort > "${moduleoutfile}"
+          # EESSI_ARCHDETECT_OPTIONS_OVERRIDE/EESSI_DEBUG_INIT only relevant for Lmod init
+          unset EESSI_ARCHDETECT_OPTIONS_OVERRIDE
+          unset EESSI_DEBUG_INIT
+          # Store all relevant environment variables
+          env | grep -E '(^EESSI_|^LMOD_RC|^LMOD_PACKAGE_PATH)' | sort > "${moduleoutfile}"
           module unload EESSI/${{matrix.EESSI_VERSION}}
+
+          # Now do the init script initialisation
           source ./init/bash
-          env | grep -E '^(EESSI_S|EESSI_C)' | sort > "${sourceoutfile}"
+          # source script version sets environment variables to force archdetect, ignore these
+          unset EESSI_USE_ARCHSPEC
+          unset EESSI_USE_ARCHDETECT
+          env | grep -E '(^EESSI_|^LMOD_RC|^LMOD_PACKAGE_PATH)' | sort > "${sourceoutfile}"
+
+          # Now compare the two results
+          echo ""
+          echo "Lmod initialisation:"
           cat "${moduleoutfile}"
+          echo ""
+          echo "Source script initialisation:"
           cat "${sourceoutfile}"
+          echo ""
+          echo ""
           if (diff "${moduleoutfile}" "${sourceoutfile}" > /dev/null); then
             echo "Test for checking env variables PASSED"
           else
             echo "Test for checking env variables FAILED" >&2
-            diff "${moduleoutfile}" "${sourceoutfile}"
+            diff --unified=0 "${moduleoutfile}" "${sourceoutfile}"
             exit 1
           fi
 
+  make_sure_load_and_unload_work:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        EESSI_VERSION:
+          - 2023.06
+        EESSI_SOFTWARE_SUBDIR_OVERRIDE:
+          - none
+          - x86_64/amd/zen2
+          - x86_64/amd/zen4
+        EESSI_ACCELERATOR_TARGET_OVERRIDE:
+          - none
+          - accel/nvidia/cc80
+    steps:
+      - name: Check out software-layer repository
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+
+      - name: Mount EESSI CernVM-FS pilot repository
+        uses: cvmfs-contrib/github-action-cvmfs@55899ca74cf78ab874bdf47f5a804e47c198743c # v4.0
+        with:
+          cvmfs_config_package: https://github.com/EESSI/filesystem-layer/releases/download/latest/cvmfs-config-eessi_latest_all.deb
+          cvmfs_http_proxy: DIRECT
+          cvmfs_repositories: software.eessi.io
+
+      - name: Test for identical environment after loading and unloading the EESSI module
+        run: |
+          # Initialise Lmod
+          . /cvmfs/software.eessi.io/versions/${{matrix.EESSI_VERSION}}/compat/linux/$(uname -m)/usr/share/Lmod/init/bash
+
+          # Set our cpu path overrides according to our matrix
+          if [[ "${{matrix.EESSI_SOFTWARE_SUBDIR_OVERRIDE}}" != "none" ]]; then
+            export EESSI_SOFTWARE_SUBDIR_OVERRIDE=${{matrix.EESSI_SOFTWARE_SUBDIR_OVERRIDE}}
+          fi
+
+          # Set our accelerator path overrides according to our matrix
+          if [[ "${{matrix.EESSI_ACCELERATOR_TARGET_OVERRIDE}}" != "none" ]]; then
+            export EESSI_ACCELERATOR_TARGET_OVERRIDE=${{matrix.EESSI_ACCELERATOR_TARGET_OVERRIDE}}
+          fi
+
+          # Turn on debug output in case we want to take a look
+          export EESSI_DEBUG_INIT=true
+
+          initial_env_file="initial_env.txt"
+          module_cycled_file="load_unload_cycle.txt"
+
+          # prepare Lmod, resetting it in a roundabout way given we don't want defaults set
+          export MODULEPATH=init/modules:.github/workflows/modules
+          module load fake_module
+          module purge
+          module unuse .github/workflows/modules
+          module avail
+
+          # Store the initial environment (ignoring Lmod tables)
+          env | grep -v _ModuleTable | sort > "${initial_env_file}"
+
+          # Do (and undo) loading the EESSI module
+          CPU_ARCH=$(./init/eessi_archdetect.sh -a cpupath)
+          module load EESSI/${{matrix.EESSI_VERSION}}
+          module unload EESSI/${{matrix.EESSI_VERSION}}
+          env | grep -v _ModuleTable | sort > "${module_cycled_file}"
+
+          # Now compare the two results (do not expose the files, as they contain the full environment!)
+          if (diff "${initial_env_file}" "${module_cycled_file}" > /dev/null); then
+            echo "Test for checking env variables PASSED"
+          else
+            echo "Test for checking env variables FAILED" >&2
+            diff --unified=0 "${initial_env_file}" "${module_cycled_file}"
+            exit 1
+          fi
\ No newline at end of file
diff --git a/init/modules/EESSI/2023.06.lua b/init/modules/EESSI/2023.06.lua
index 463706ce6c..348699c0f1 100644
--- a/init/modules/EESSI/2023.06.lua
+++ b/init/modules/EESSI/2023.06.lua
@@ -17,56 +17,151 @@ local eessi_os_type = "linux"
 setenv("EESSI_VERSION", eessi_version)
 setenv("EESSI_CVMFS_REPO", eessi_repo)
 setenv("EESSI_OS_TYPE", eessi_os_type)
+-- Emit text through LmodMessage, but only in load mode and only when the
+-- EESSI_DEBUG_INIT environment variable is set.
+function eessiDebug(text)
+    if (mode() == "load" and os.getenv("EESSI_DEBUG_INIT")) then
+        LmodMessage(text)
+    end
+end
+-- Return the first CPU target from the colon-separated archdetect options for
+-- which a software directory exists in this EESSI version; raise an LmodError
+-- when none of the listed targets has one.
 function archdetect_cpu()
     local script = pathJoin(eessi_prefix, 'init', 'lmod_eessi_archdetect_wrapper.sh')
-    if not os.getenv("EESSI_ARCHDETECT_OPTIONS") then
+    -- make sure that we grab the value for architecture before the module unsets the environment variable (in unload mode)
+    local archdetect_options = os.getenv("EESSI_ARCHDETECT_OPTIONS") or (os.getenv("EESSI_ARCHDETECT_OPTIONS_OVERRIDE") or "")
+    if not os.getenv("EESSI_ARCHDETECT_OPTIONS_OVERRIDE") then
         if convertToCanonical(LmodVersion()) < convertToCanonical("8.6") then
-            LmodError("Loading this modulefile requires using Lmod version >= 8.6, but you can export EESSI_ARCHDETECT_OPTIONS to the available cpu architecture in the form of: x86_64/intel/haswell:x86_64/generic or aarch64/neoverse_v1:aarch64/generic")
+            LmodError("Loading this modulefile requires using Lmod version >= 8.6, but you can export EESSI_ARCHDETECT_OPTIONS_OVERRIDE to the available cpu architecture in the form of: x86_64/intel/haswell:x86_64/generic or aarch64/neoverse_v1:aarch64/generic")
         end
         source_sh("bash", script)
     end
-    local archdetect_options = os.getenv("EESSI_ARCHDETECT_OPTIONS") or ""
-    for archdetect_filter_cpu in string.gmatch(archdetect_options, "([^" .. ":" .. "]+)") do
-        if isDir(pathJoin(eessi_prefix, "software", eessi_os_type, archdetect_filter_cpu, "software")) then
-            -- use x86_64/amd/zen3 for now when AMD Genoa (Zen4) CPU is detected,
-            -- since optimized software installations for Zen4 are a work-in-progress,
-            -- see https://gitlab.com/eessi/support/-/issues/37
-            if archdetect_filter_cpu == "x86_64/amd/zen4" then
-                archdetect_filter_cpu = "x86_64/amd/zen3"
-                if mode() == "load" then
-                    LmodMessage("Sticking to " .. archdetect_filter_cpu .. " for now, since optimized installations for AMD Genoa (Zen4) are a work in progress.")
+    -- EESSI_ARCHDETECT_OPTIONS is set by the script (_if_ it was called)
+    archdetect_options = os.getenv("EESSI_ARCHDETECT_OPTIONS") or archdetect_options
+    if archdetect_options then
+        eessiDebug("Got archdetect CPU options: " .. archdetect_options)
+        -- archdetect_options is a colon-separated list of CPU architectures that are compatible with
+        -- the host CPU and ordered from most specific to least specific, e.g.,
+        -- x86_64/intel/skylake_avx512:x86_64/intel/haswell:x86_64/generic
+        -- We loop over the list, and return the highest matching arch for which a directory exists for this EESSI version
+        for archdetect_filter_cpu in string.gmatch(archdetect_options, "([^" .. ":" .. "]+)") do
+            if isDir(pathJoin(eessi_prefix, "software", eessi_os_type, archdetect_filter_cpu, "software")) then
+                -- use x86_64/amd/zen3 for now when AMD Genoa (Zen4) CPU is detected,
+                -- since optimized software installations for Zen4 are a work-in-progress,
+                -- see https://gitlab.com/eessi/support/-/issues/37
+                -- (skipped when zen4 is explicitly requested via EESSI_SOFTWARE_SUBDIR_OVERRIDE)
+                if (archdetect_filter_cpu == "x86_64/amd/zen4" and os.getenv("EESSI_SOFTWARE_SUBDIR_OVERRIDE") ~= "x86_64/amd/zen4") then
+                    archdetect_filter_cpu = "x86_64/amd/zen3"
+                    if mode() == "load" then
+                        LmodMessage("Sticking to " .. archdetect_filter_cpu .. " for now, since optimized installations for AMD Genoa (Zen4) are a work in progress.")
+                    end
                 end
+                eessiDebug("Selected archdetect CPU: " .. archdetect_filter_cpu)
+                return archdetect_filter_cpu
             end
-            return archdetect_filter_cpu
         end
+        LmodError("Software directory check for the detected architecture failed")
+    else
+        -- Still need to return something
+        return nil
     end
-    LmodError("Software directory check for the detected architecture failed")
 end
+-- Determine the accelerator subdirectory (e.g., accel/nvidia/cc80); returns an
+-- empty string when neither an override nor a detected value is available.
+function archdetect_accel()
+    local script = pathJoin(eessi_prefix, 'init', 'lmod_eessi_archdetect_wrapper_accel.sh')
+    -- for unload mode, we need to grab the value before it is unset
+    local archdetect_accel = os.getenv("EESSI_ACCEL_SUBDIR") or (os.getenv("EESSI_ACCELERATOR_TARGET_OVERRIDE") or "")
+    -- NOTE(review): when the override is set the wrapper script is skipped, so any
+    -- variable that script would export is not set by this module - confirm.
+    if not os.getenv("EESSI_ACCELERATOR_TARGET_OVERRIDE") then
+        if convertToCanonical(LmodVersion()) < convertToCanonical("8.6") then
+            LmodError("Loading this modulefile requires using Lmod version >= 8.6, but you can export EESSI_ACCELERATOR_TARGET_OVERRIDE to the available accelerator architecture in the form of: accel/nvidia/cc80")
+        end
+        source_sh("bash", script)
+    end
+    archdetect_accel = os.getenv("EESSI_ACCEL_SUBDIR") or archdetect_accel
+    eessiDebug("Got archdetect accel option: " .. archdetect_accel)
+    return archdetect_accel
+end
+-- archdetect finds the best compatible architecture, e.g., x86_64/amd/zen3
 local archdetect = archdetect_cpu()
+-- archdetect_accel() attempts to identify an accelerator, e.g., accel/nvidia/cc80
+local archdetect_accel = archdetect_accel()
+-- eessi_cpu_family is derived from the archdetect match, e.g., x86_64
 local eessi_cpu_family = archdetect:match("([^/]+)")
 local eessi_software_subdir = archdetect
+-- eessi_eprefix is the base location of the compat layer, e.g., /cvmfs/software.eessi.io/versions/2023.06/compat/linux/x86_64
 local eessi_eprefix = pathJoin(eessi_prefix, "compat", eessi_os_type, eessi_cpu_family)
+-- eessi_software_path is the location of the software installations, e.g.,
+-- /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen3
 local eessi_software_path = pathJoin(eessi_prefix, "software", eessi_os_type, eessi_software_subdir)
-local eessi_module_path = pathJoin(eessi_software_path, "modules", "all")
+local eessi_modules_subdir = pathJoin("modules", "all")
+-- eessi_module_path is the location of the _CPU_ module files, e.g.,
+-- /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen3/modules/all
+local eessi_module_path = pathJoin(eessi_software_path, eessi_modules_subdir)
 local eessi_site_software_path = string.gsub(eessi_software_path, "versions", "host_injections")
-local eessi_site_module_path = pathJoin(eessi_site_software_path, "modules", "all")
+-- Site module path is the same as the EESSI one, but with `versions` changed to `host_injections`, e.g.,
+-- /cvmfs/software.eessi.io/host_injections/2023.06/software/linux/x86_64/amd/zen3/modules/all
+local eessi_site_module_path = pathJoin(eessi_site_software_path, eessi_modules_subdir)
 setenv("EPREFIX", eessi_eprefix)
+eessiDebug("Setting EPREFIX to " .. eessi_eprefix)
 setenv("EESSI_CPU_FAMILY", eessi_cpu_family)
+eessiDebug("Setting EESSI_CPU_FAMILY to " .. eessi_cpu_family)
 setenv("EESSI_SITE_SOFTWARE_PATH", eessi_site_software_path)
+eessiDebug("Setting EESSI_SITE_SOFTWARE_PATH to " .. eessi_site_software_path)
 setenv("EESSI_SITE_MODULEPATH", eessi_site_module_path)
+eessiDebug("Setting EESSI_SITE_MODULEPATH to " .. eessi_site_module_path)
 setenv("EESSI_SOFTWARE_SUBDIR", eessi_software_subdir)
+eessiDebug("Setting EESSI_SOFTWARE_SUBDIR to " .. eessi_software_subdir)
 setenv("EESSI_PREFIX", eessi_prefix)
+eessiDebug("Setting EESSI_PREFIX to " .. eessi_prefix)
 setenv("EESSI_EPREFIX", eessi_eprefix)
+eessiDebug("Setting EESSI_EPREFIX to " .. eessi_eprefix)
 prepend_path("PATH", pathJoin(eessi_eprefix, "bin"))
-prepend_path("PATH", pathJoin(eessi_eprefix, "usr/bin"))
+eessiDebug("Adding " .. pathJoin(eessi_eprefix, "bin") .. " to PATH")
+prepend_path("PATH", pathJoin(eessi_eprefix, "usr", "bin"))
+eessiDebug("Adding " .. pathJoin(eessi_eprefix, "usr", "bin") .. " to PATH")
 setenv("EESSI_SOFTWARE_PATH", eessi_software_path)
+eessiDebug("Setting EESSI_SOFTWARE_PATH to " .. eessi_software_path)
 setenv("EESSI_MODULEPATH", eessi_module_path)
+eessiDebug("Setting EESSI_MODULEPATH to " .. eessi_module_path)
+-- We ship our spider cache, so this location does not need to be spider-ed
 if ( mode() ~= "spider" ) then
     prepend_path("MODULEPATH", eessi_module_path)
+    eessiDebug("Adding " .. eessi_module_path .. " to MODULEPATH")
 end
-prepend_path("LMOD_RC", pathJoin(eessi_software_path, "/.lmod/lmodrc.lua"))
+prepend_path("LMOD_RC", pathJoin(eessi_software_path, ".lmod", "lmodrc.lua"))
+eessiDebug("Adding " .. pathJoin(eessi_software_path, ".lmod", "lmodrc.lua") .. " to LMOD_RC")
+-- Use pushenv for LMOD_PACKAGE_PATH as this may be set locally by the site
+pushenv("LMOD_PACKAGE_PATH", pathJoin(eessi_software_path, ".lmod"))
+eessiDebug("Setting LMOD_PACKAGE_PATH to " .. pathJoin(eessi_software_path, ".lmod"))
+
+-- the accelerator may have an empty value and we need to give some flexibility
+-- * construct the path we expect to find
+-- * then check it exists
+-- * then update the modulepath
+if not (archdetect_accel == nil or archdetect_accel == '') then
+    -- The CPU subdirectory of the accelerator installations is _usually_ the same as host CPU, but this can be overridden
+    local eessi_accel_software_subdir = os.getenv("EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE") or eessi_software_subdir
+    -- CPU location of the accelerator installations, e.g.,
+    -- /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen3
+    local eessi_accel_software_path = pathJoin(eessi_prefix, "software", eessi_os_type, eessi_accel_software_subdir)
+    -- location of the accelerator modules, e.g.,
+    -- /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen3/accel/nvidia/cc80/modules/all
+    local eessi_module_path_accel = pathJoin(eessi_accel_software_path, archdetect_accel, eessi_modules_subdir)
+    eessiDebug("Checking if " .. eessi_module_path_accel .. " exists")
+    if isDir(eessi_module_path_accel) then
+        setenv("EESSI_MODULEPATH_ACCEL", eessi_module_path_accel)
+        prepend_path("MODULEPATH", eessi_module_path_accel)
+        eessiDebug("Using accelerator modules at: " .. eessi_module_path_accel)
+    end
+end
+
+-- prepend the site module path last so it has priority
 prepend_path("MODULEPATH", eessi_site_module_path)
-setenv("LMOD_PACKAGE_PATH", pathJoin(eessi_software_path, ".lmod"))
+eessiDebug("Adding " .. eessi_site_module_path .. " to MODULEPATH")
 if mode() == "load" then
     LmodMessage("EESSI/" .. eessi_version .. " loaded successfully")
 end