Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

{2023.06}[foss/2023a] PyTorch v2.1.2 with CUDA/12.1.1 #369

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 43 additions & 27 deletions EESSI-install-software.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ display_help() {
echo " --skip-cuda-install - disable installing a full CUDA SDK in the host_injections prefix (e.g. in CI)"
}

# Report via exit status whether the given command name can be resolved by the
# shell ('command -v' succeeds); nothing is written to stdout or stderr.
# Arguments: $1 - name of the command to look up
# Returns:   0 if the command was found, non-zero otherwise
command_exists() {
    command -v "$1" &> /dev/null
}

function copy_build_log() {
# copy specified build log to specified directory, with some context added
build_log=${1}
Expand Down Expand Up @@ -147,6 +152,39 @@ else
mkdir -p ${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE}
fi

# We need to ensure that certain files are present or updated before we source
# $TOPDIR/init/eessi_environment_variables
# Particularly the files we need to have present/updated in
# ${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE}
# are:
# - .lmod/lmodrc.lua
# - .lmod/SitePackage.lua
# We run scripts to create them if they don't exist or if the scripts have been
# changed in the PR.

# Set base directory for software and for Lmod config files
_eessi_software_path="${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE}"
_lmod_cfg_dir="${_eessi_software_path}/.lmod"

# We assume there's only one diff file that corresponds to the PR patch file.
# The first file matching the glob is used (no parsing of 'ls' output);
# pr_diff is left empty when the current directory holds no such file.
pr_diff=''
for _pr_diff_candidate in [0-9]*.diff; do
    if [ -f "${_pr_diff_candidate}" ]; then
        pr_diff="${_pr_diff_candidate}"
        break
    fi
done

# Create or update ${_eessi_software_path}/.lmod/lmodrc.lua
# _lmodrc_changed holds the exit status of the final grep: '0' only when the PR
# diff lists create_lmodrc.py as a changed file. Falling back to /dev/null keeps
# the pipeline from reading stdin when no diff file was found.
_lmodrc_file="${_lmod_cfg_dir}/lmodrc.lua"
_lmodrc_changed=$(grep '^+++' "${pr_diff:-/dev/null}" | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^create_lmodrc\.py$' > /dev/null; echo $?)
if [ ! -f "${_lmodrc_file}" ] || [ "${_lmodrc_changed}" == '0' ]; then
    python3 "${TOPDIR}/create_lmodrc.py" "${_eessi_software_path}"
    check_exit_code $? "${_lmodrc_file} created/updated" "Failed to create/update ${_lmodrc_file}"
fi

# Create or update ${_eessi_software_path}/.lmod/SitePackage.lua
# (same approach: regenerate when the file is missing or when the PR diff
# lists create_lmodsitepackage.py as a changed file)
_lmod_sitepackage_file="${_lmod_cfg_dir}/SitePackage.lua"
_sitepackage_changed=$(grep '^+++' "${pr_diff:-/dev/null}" | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^create_lmodsitepackage\.py$' > /dev/null; echo $?)
if [ ! -f "${_lmod_sitepackage_file}" ] || [ "${_sitepackage_changed}" == '0' ]; then
    python3 "${TOPDIR}/create_lmodsitepackage.py" "${_eessi_software_path}"
    check_exit_code $? "${_lmod_sitepackage_file} created/updated" "Failed to create/update ${_lmod_sitepackage_file}"
fi

# Set all the EESSI environment variables (respecting $EESSI_SOFTWARE_SUBDIR_OVERRIDE)
# $EESSI_SILENT - don't print any messages
# $EESSI_BASIC_ENV - give a basic set of environment variables
Expand Down Expand Up @@ -212,13 +250,11 @@ else
echo "Skipping installation of CUDA SDK and cu* libraries in host_injections, since the --skip-cuda-install flag was passed"
fi

# Install drivers in host_injections
# TODO: this is commented out for now, because the script assumes that nvidia-smi is available and works;
# if not, an error is produced, and the bot flags the whole build as failed (even when not installing GPU software)
# ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh

# Don't run the Lmod GPU driver check when doing builds (may not have a GPU, and it's not relevant for vanilla builds anyway)
export EESSI_OVERRIDE_GPU_CHECK=1
# Install NVIDIA drivers in host_injections (if they exist)
# The linking script is only run when 'nvidia-smi' can be found; its path is
# quoted so an ${EESSI_PREFIX} containing whitespace is not word-split.
if command_exists "nvidia-smi"; then
    echo "Command 'nvidia-smi' found. Installing NVIDIA drivers for use in prefix shell..."
    "${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh"
fi

# use PR patch file to determine in which easystack files stuff was added
# (paths under easystacks/ ending in 'yml', excluding those matching
# 'known-issues' or 'missing'); falling back to /dev/null yields an empty list
# instead of reading from stdin when pr_diff is empty or unset
changed_easystacks=$(grep '^+++' "${pr_diff:-/dev/null}" | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^easystacks/.*yml$' | grep -E -v 'known-issues|missing')
Expand Down Expand Up @@ -268,25 +304,5 @@ else
done
fi

### add packages here

echo ">> Creating/updating Lmod RC file..."
export LMOD_CONFIG_DIR="${EASYBUILD_INSTALLPATH}/.lmod"
lmod_rc_file="$LMOD_CONFIG_DIR/lmodrc.lua"
# lmodrc_changed holds the exit status of the final grep: '0' only when the PR
# diff lists create_lmodrc.py as a changed file. Falling back to /dev/null keeps
# the pipeline from reading stdin when pr_diff is empty or unset.
lmodrc_changed=$(grep '^+++' "${pr_diff:-/dev/null}" | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^create_lmodrc\.py$' > /dev/null; echo $?)
if [ ! -f "$lmod_rc_file" ] || [ "${lmodrc_changed}" == '0' ]; then
    python3 "$TOPDIR/create_lmodrc.py" "${EASYBUILD_INSTALLPATH}"
    check_exit_code $? "$lmod_rc_file created" "Failed to create $lmod_rc_file"
fi

echo ">> Creating/updating Lmod SitePackage.lua ..."
export LMOD_PACKAGE_PATH="${EASYBUILD_INSTALLPATH}/.lmod"
lmod_sitepackage_file="$LMOD_PACKAGE_PATH/SitePackage.lua"
# same approach as above, keyed on create_lmodsitepackage.py
sitepackage_changed=$(grep '^+++' "${pr_diff:-/dev/null}" | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^create_lmodsitepackage\.py$' > /dev/null; echo $?)
if [ ! -f "$lmod_sitepackage_file" ] || [ "${sitepackage_changed}" == '0' ]; then
    python3 "$TOPDIR/create_lmodsitepackage.py" "${EASYBUILD_INSTALLPATH}"
    check_exit_code $? "$lmod_sitepackage_file created" "Failed to create $lmod_sitepackage_file"
fi

echo ">> Cleaning up ${TMPDIR}..."
rm -r ${TMPDIR}
5 changes: 5 additions & 0 deletions bot/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,11 @@ if [[ ! -z ${SHARED_FS_PATH} ]]; then
BUILD_STEP_ARGS+=("--host-injections" "${SHARED_FS_PATH}/host-injections")
fi

# Builds may run on hosts without a GPU, and the Lmod GPU driver check is not
# relevant for vanilla builds anyway, so the override is always switched on here.
# The value is printed before and after the assignment.
printf '%s\n' "EESSI_OVERRIDE_GPU_CHECK='${EESSI_OVERRIDE_GPU_CHECK}'"
EESSI_OVERRIDE_GPU_CHECK=1
export EESSI_OVERRIDE_GPU_CHECK
printf '%s\n' "EESSI_OVERRIDE_GPU_CHECK='${EESSI_OVERRIDE_GPU_CHECK}'"

# create tmp file for output of build step
build_outerr=$(mktemp build.outerr.XXXX)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,6 @@ easyconfigs:
- ESPResSo-4.2.2-foss-2023a.eb:
options:
from-pr: 20595
- PyTorch-2.1.2-foss-2023a-CUDA-12.1.1.eb:
options:
cuda-compute-capabilities: 6.0,6.1,7.0,7.5,8.0,8.6,8.9,9.0
30 changes: 30 additions & 0 deletions eb_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -425,6 +425,29 @@ def pre_configure_hook_openblas_optarch_generic(self, *args, **kwargs):
raise EasyBuildError("OpenBLAS-specific hook triggered for non-OpenBLAS easyconfig?!")


def pre_configure_hook_pytorch_add_cupti_libdir(self, *args, **kwargs):
    """
    Pre-configure hook for PyTorch: add directory
    $EESSI_SOFTWARE_PATH/software/CUDA/12.1.1/extras/CUPTI/lib64 to LIBRARY_PATH

    Only acts when the easyconfig's 'cudaver' template value is '12.1.1'; in that case the
    CUPTI directory is appended to the current $LIBRARY_PATH (or becomes its sole entry when
    $LIBRARY_PATH is empty or unset). Nothing is changed for other easyconfigs named 'PyTorch'.

    :raises EasyBuildError: if the hook is triggered for an easyconfig not named 'PyTorch',
        or if $EESSI_SOFTWARE_PATH is not set when the CUPTI directory has to be composed
    """
    if self.name != 'PyTorch':
        raise EasyBuildError("PyTorch-specific hook triggered for non-PyTorch easyconfig?!")

    if 'cudaver' not in self.cfg.template_values or self.cfg.template_values['cudaver'] != '12.1.1':
        return

    _cudaver = self.cfg.template_values['cudaver']
    print_msg("pre_configure_hook_pytorch_add_cupti_libdir: CUDA version: '%s'" % _cudaver)
    _library_path = os.getenv('LIBRARY_PATH')
    print_msg("pre_configure_hook_pytorch_add_cupti_libdir: library_path: '%s'" % _library_path)
    _eessi_software_path = os.getenv('EESSI_SOFTWARE_PATH')
    print_msg("pre_configure_hook_pytorch_add_cupti_libdir: eessi_software_path: '%s'" % _eessi_software_path)

    # without this check os.path.join() would fail with a TypeError on None,
    # which says nothing about the actual cause
    if not _eessi_software_path:
        raise EasyBuildError("$EESSI_SOFTWARE_PATH is not set, cannot determine CUPTI library directory for PyTorch")

    _cupti_lib_dir = os.path.join(_eessi_software_path, 'software', 'CUDA', _cudaver, 'extras', 'CUPTI', 'lib64')
    print_msg("pre_configure_hook_pytorch_add_cupti_libdir: cupti_lib_dir: '%s'" % _cupti_lib_dir)

    if _library_path:
        env.setvar('LIBRARY_PATH', ':'.join([_library_path, _cupti_lib_dir]))
    else:
        env.setvar('LIBRARY_PATH', _cupti_lib_dir)
    print_msg("pre_configure_hook_pytorch_add_cupti_libdir: LIBRARY_PATH: '%s'" % os.getenv('LIBRARY_PATH'))


def pre_configure_hook_libfabric_disable_psm3_x86_64_generic(self, *args, **kwargs):
"""Add --disable-psm3 to libfabric configure options when building with --optarch=GENERIC on x86_64."""
if self.name == 'libfabric':
Expand Down Expand Up @@ -560,6 +583,12 @@ def pre_test_hook_increase_max_failed_tests_arm_PyTorch(self, *args, **kwargs):
"""
if self.name == 'PyTorch' and self.version == '2.1.2' and get_cpu_architecture() == AARCH64:
self.cfg['max_failed_tests'] = 10
if 'cudaver' in self.cfg.template_values and self.cfg.template_values['cudaver'] == '12.1.1':
_cudaver = self.cfg.template_values['cudaver']
_runtest = self.cfg['runtest']
self.cfg['runtest'] = _runtest.replace(
'PYTHONUNBUFFERED',
'PYTORCH_TEST_RUN_EVERYTHING_IN_SERIAL=1 PYTHONUNBUFFERED')


def pre_single_extension_hook(ext, *args, **kwargs):
Expand Down Expand Up @@ -851,6 +880,7 @@ def inject_gpu_property(ec):
'libfabric': pre_configure_hook_libfabric_disable_psm3_x86_64_generic,
'MetaBAT': pre_configure_hook_metabat_filtered_zlib_dep,
'OpenBLAS': pre_configure_hook_openblas_optarch_generic,
'PyTorch': pre_configure_hook_pytorch_add_cupti_libdir,
'WRF': pre_configure_hook_wrf_aarch64,
'at-spi2-core': pre_configure_hook_atspi2core_filter_ld_library_path,
}
Expand Down
6 changes: 0 additions & 6 deletions eessi_container.sh
Original file line number Diff line number Diff line change
Expand Up @@ -477,12 +477,6 @@ if [[ ${SETUP_NVIDIA} -eq 1 ]]; then
mkdir -p ${EESSI_USR_LOCAL_CUDA}
BIND_PATHS="${BIND_PATHS},${EESSI_VAR_LOG}:/var/log,${EESSI_USR_LOCAL_CUDA}:/usr/local/cuda"
[[ ${VERBOSE} -eq 1 ]] && echo "BIND_PATHS=${BIND_PATHS}"
if [[ "${NVIDIA_MODE}" == "install" ]] ; then
# We need to "trick" our LMOD_RC file to allow us to load CUDA modules even without a CUDA driver
# (this works because we build within a container and the LMOD_RC recognises that)
touch ${EESSI_TMPDIR}/libcuda.so
export SINGULARITY_CONTAINLIBS="${EESSI_TMPDIR}/libcuda.so"
fi
fi
fi

Expand Down
2 changes: 0 additions & 2 deletions install_scripts.sh
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,6 @@ copy_files_by_list ${TOPDIR}/scripts ${INSTALL_PREFIX}/scripts "${script_files[@
# Files from ${TOPDIR}/scripts/gpu_support/nvidia that are handed to
# copy_files_by_list together with the matching directory under ${INSTALL_PREFIX}
# (directory arguments are quoted so paths with whitespace stay single arguments)
nvidia_files=(
    eessi-2023.06-cuda-and-libraries.yml
    install_cuda_and_libraries.sh
    install_cuda_host_injections.sh
    install_cuDNN_host_injections.sh
    link_nvidia_host_libraries.sh
)
copy_files_by_list "${TOPDIR}/scripts/gpu_support/nvidia" "${INSTALL_PREFIX}/scripts/gpu_support/nvidia" "${nvidia_files[@]}"
Expand Down
3 changes: 3 additions & 0 deletions run_in_compat_layer_env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ fi
# For each of these variables that holds a non-empty value, prepend a matching
# 'export' statement to ${INPUT}. The tests quote the variable and use -n, so a
# value containing whitespace or glob characters cannot break the '[' command.
if [ -n "${http_proxy}" ]; then
    INPUT="export http_proxy=${http_proxy}; ${INPUT}"
fi
if [ -n "${EESSI_OVERRIDE_GPU_CHECK}" ]; then
    INPUT="export EESSI_OVERRIDE_GPU_CHECK=${EESSI_OVERRIDE_GPU_CHECK}; ${INPUT}"
fi
if [ -n "${https_proxy}" ]; then
    INPUT="export https_proxy=${https_proxy}; ${INPUT}"
fi
Expand Down
Loading