From 72fc6e242311902d1e4887bc3ab5b8d6c9f8a961 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 27 Sep 2024 13:47:35 +0200 Subject: [PATCH 01/32] Allow Nvidia driver script to set LD_PRELOAD --- .../nvidia/link_nvidia_host_libraries.sh | 317 +++++++++++++----- 1 file changed, 235 insertions(+), 82 deletions(-) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index e8d7f0d0a7..718229b1d7 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -1,74 +1,212 @@ #!/bin/bash # This script links host libraries related to GPU drivers to a location where -# they can be found by the EESSI linker +# they can be found by the EESSI linker (or sets LD_PRELOAD as an +# alternative.) # Initialise our bash functions -TOPDIR=$(dirname $(realpath $BASH_SOURCE)) +TOPDIR=$(dirname "$(realpath "$BASH_SOURCE")") source "$TOPDIR"/../../utils.sh -# We rely on ldconfig to give us the location of the libraries on the host -command_name="ldconfig" -# We cannot use a version of ldconfig that's being shipped under CVMFS -exclude_prefix="/cvmfs" +# Define a function to find the host ld_config +get_host_ldconfig() { + local command_name="ldconfig" # Set command to find + local exclude_prefix="/cvmfs" # Set excluded prefix (paths to ignore) + local found_paths=() # Initialize an array to store found paths -found_paths=() -# Always attempt to use /sbin/ldconfig -if [ -x "/sbin/$command_name" ]; then - found_paths+=("/sbin/$command_name") -fi -IFS=':' read -ra path_dirs <<< "$PATH" -for dir in "${path_dirs[@]}"; do - if [ "$dir" = "/sbin" ]; then - continue # we've already checked for $command_name in /sbin, don't need to do it twice - fi - if [[ ! "$dir" =~ ^$exclude_prefix ]]; then - if [ -x "$dir/$command_name" ]; then - found_paths+=("$dir/$command_name") - fi - fi -done + # Always attempt to use /sbin/ldconfig + if [ -x "/sbin/$command_name" ]; then + found_paths+=("/sbin/$command_name") + fi -if [ ${#found_paths[@]} -gt 0 ]; then - echo "Found $command_name in the following locations:" - printf -- "- %s\n" "${found_paths[@]}" - echo "Using first version" - host_ldconfig=${found_paths[0]} -else - error="$command_name not found in PATH or only found in paths starting with $exclude_prefix." - fatal_error "$error" -fi + # Split the $PATH and iterate over each directory + IFS=':' read -ra path_dirs <<< "$PATH" + for dir in "${path_dirs[@]}"; do + if [ "$dir" = "/sbin" ]; then + continue # Skip /sbin since it's already checked + fi + + # Check if directory does not start with the exclude prefix + if [[ ! "$dir" =~ ^$exclude_prefix ]]; then + if [ -x "$dir/$command_name" ]; then + found_paths+=("$dir/$command_name") + fi + fi + done + + # Check if any paths were found + if [ ${#found_paths[@]} -gt 0 ]; then + # echo the first version we found and return success + echo "${found_paths[0]}" + return 0 + else + fatal_error "$command_name not found in PATH or only found in paths starting with $exclude_prefix." + fi +} -# Make sure EESSI is initialised (doesn't matter what version) -check_eessi_initialised +get_nvlib_list() { + local nvliblist_url="https://raw.githubusercontent.com/apptainer/apptainer/main/etc/nvliblist.conf" + local default_nvlib_list=( + "libcuda.so" + "libcudadebugger.so" + "libEGL_installertest.so" + "libEGL_nvidia.so" + "libEGL.so" + "libGLdispatch.so" + "libGLESv1_CM_nvidia.so" + "libGLESv1_CM.so" + "libGLESv2_nvidia.so" + "libGLESv2.so" + "libGL.so" + "libGLX_installertest.so" + "libGLX_nvidia.so" + "libglx.so" + "libGLX.so" + "libnvcuvid.so" + "libnvidia-cbl.so" + "libnvidia-cfg.so" + "libnvidia-compiler.so" + "libnvidia-eglcore.so" + "libnvidia-egl-wayland.so" + "libnvidia-encode.so" + "libnvidia-fatbinaryloader.so" + "libnvidia-fbc.so" + "libnvidia-glcore.so" + "libnvidia-glsi.so" + "libnvidia-glvkspirv.so" + "libnvidia-gpucomp.so" + "libnvidia-gtk2.so" + "libnvidia-gtk3.so" + "libnvidia-ifr.so" + "libnvidia-ml.so" + "libnvidia-nvvm.so" + "libnvidia-opencl.so" + "libnvidia-opticalflow.so" + "libnvidia-ptxjitcompiler.so" + "libnvidia-rtcore.so" + "libnvidia-tls.so" + "libnvidia-wfb.so" + "libnvoptix.so.1" + "libOpenCL.so" + "libOpenGL.so" + "libvdpau_nvidia.so" + "nvidia_drv.so" + "tls_test_.so" + ) -# Find the CUDA version of the host CUDA drivers -# (making sure that this can still work inside prefix environment inside a container) + # Try to download the nvliblist.conf file with curl + echo_yellow "Downloading latest version of nvliblist.conf from Apptainer" + nvliblist_content=$(curl --silent "$nvliblist_url") + + # Check if curl failed (i.e., the content is empty) + if [ -z "$nvliblist_content" ]; then + # Failed to download nvliblist.conf, using default list instead + printf "%s\n" "${default_nvlib_list[@]}" + return 1 + fi + + # If curl succeeded, filter and return the libraries from the downloaded content + echo "$nvliblist_content" | grep '.so$' +} + + +# Check for required commands +command -v nvidia-smi >/dev/null 2>&1 || echo_yellow "nvidia-smi not found, this script won't do anything useful"; exit + +# Variables +LD_PRELOAD_MODE=0 + +# Parse command-line options +while [[ "$#" -gt 0 ]]; do + case $1 in + --ld-preload) LD_PRELOAD_MODE=1 ;; # Enable LD_PRELOAD mode + *) fatal_error "Unknown option: $1";; + esac + shift +done + +# Gather information about NVIDIA drivers (even if we are inside a Gentoo Prefix in a container) export LD_LIBRARY_PATH=/.singularity.d/libs:$LD_LIBRARY_PATH -nvidia_smi_command="nvidia-smi --query-gpu=driver_version --format=csv,noheader" -if $nvidia_smi_command > /dev/null; then - host_driver_version=$($nvidia_smi_command | tail -n1) + +# Command to give to get the CUDA driver version +nvidia_smi_driver_command="nvidia-smi --query-gpu=driver_version --format=csv,noheader" +if $nvidia_smi_driver_command > /dev/null 2>&1; then + host_driver_version=$($nvidia_smi_driver_command | tail -n1) echo_green "Found NVIDIA GPU driver version ${host_driver_version}" + # If the first worked, this should work too - host_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}') + host_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}') echo_green "Found host CUDA version ${host_cuda_version}" else - error="Failed to successfully execute\n $nvidia_smi_command\n" - fatal_error "$error" + fatal_error "Failed to execute $nvidia_smi_driver_command" fi -# Let's make sure the driver libraries are not already in place +# Gather any CUDA related driver libraries from the host +# - First let's see what driver libraries are there +# - then extract the ones we need for CUDA + +# Find the host ldconfig +host_ldconfig=$(get_host_ldconfig) +# Gather libraries on the host (_must_ be host ldconfig) +host_libraries=$($host_ldconfig -p | awk '{print $NF}') +singularity_libs=$(ls /.singularity.d/libs/* 2>/dev/null) + +# Now gather the list of possible CUDA libraries +cuda_candidate_libraries=$(get_nvlib_list) +# Check if the function returned an error (e.g., curl failed) +if [ $? -ne 0 ]; then + echo "Using default list of libraries" +else + echo "Using downloaded list of libraries" +fi + +# Filter the host libraries to find the CUDA libaries locations +# Initialize an array to hold the matched libraries +matched_libraries=() + +# Process each library and check for matches in libs.txt +echo "$nvlib_list" | while read -r library; do + # Search for the library in libs.txt and add it to the matched_libraries array + matched=$(echo "$ldconfig_output $singularity_libs" | grep "$library") + if [ -n "$matched" ]; then + matched_libraries+=("$matched") # Add matched library to the array + fi +done + +# Output the matched libraries +echo "Matched CUDA Libraries:" +printf "%s\n" "${matched_libraries[@]}" + +# LD_PRELOAD Mode +if [ "$LD_PRELOAD_MODE" -eq 1 ]; then + # Set LD_PRELOAD with the matched libraries + if [ ${#matched_libraries[@]} -gt 0 ]; then + LD_PRELOAD=$(printf "%s\n" "${matched_libraries[@]}" | tr '\n' ':') + # Remove the trailing colon from LD_PRELOAD if it exists + LD_PRELOAD=${LD_PRELOAD%:} + export LD_PRELOAD + echo "LD_PRELOAD set to: $LD_PRELOAD" + export EESSI_OVERRIDE_GPU_CHECK=1 + echo "Allowing overriding GPU checks in EESSI via EESSI_OVERRIDE_GPU_CHECK" + else + echo "No libraries matched, LD_PRELOAD not set." + exit 0 +fi + +# If we haven't already exited, we may need to create the symlinks + +# First let's make sure the driver libraries are not already in place link_drivers=1 -# first make sure that target of host_injections variant symlink is an existing directory -host_injections_target=$(realpath -m ${EESSI_CVMFS_REPO}/host_injections) -if [ ! -d ${host_injections_target} ]; then - create_directory_structure ${host_injections_target} +# Make sure that target of host_injections variant symlink is an existing directory +host_injections_target=$(realpath -m "${EESSI_CVMFS_REPO}/host_injections") +if [ ! -d "$host_injections_target" ]; then + create_directory_structure "$host_injections_target" fi host_injections_nvidia_dir="${EESSI_CVMFS_REPO}/host_injections/nvidia/${EESSI_CPU_FAMILY}" host_injection_driver_dir="${host_injections_nvidia_dir}/host" -host_injection_driver_version_file="$host_injection_driver_dir/driver_version.txt" +host_injection_driver_version_file="${host_injection_driver_dir}/driver_version.txt" if [ -e "$host_injection_driver_version_file" ]; then if grep -q "$host_driver_version" "$host_injection_driver_version_file"; then echo_green "The host GPU driver libraries (v${host_driver_version}) have already been linked! (based on ${host_injection_driver_version_file})" @@ -76,11 +214,7 @@ if [ -e "$host_injection_driver_version_file" ]; then else # There's something there but it is out of date echo_yellow "Cleaning out outdated symlinks" - rm $host_injection_driver_dir/* - if [ $? -ne 0 ]; then - error="Unable to remove files under '$host_injection_driver_dir'." - fatal_error "$error" - fi + rm "${host_injection_driver_dir}"/* || fatal_error "Unable to remove files under '${host_injection_driver_dir}'." fi fi @@ -89,56 +223,75 @@ if [ "$link_drivers" -eq 1 ]; then if ! create_directory_structure "${host_injection_driver_dir}" ; then fatal_error "No write permissions to directory ${host_injection_driver_dir}" fi - cd ${host_injection_driver_dir} - # Need a small temporary space to hold a couple of files - temp_dir=$(mktemp -d) - - # Gather libraries on the host (_must_ be host ldconfig) - $host_ldconfig -p | awk '{print $NF}' > "$temp_dir"/libs.txt - # Allow for the fact that we may be in a container so the CUDA libs might be in there - ls /.singularity.d/libs/* >> "$temp_dir"/libs.txt 2>/dev/null - - # Leverage singularity to find the full list of libraries we should be linking to - echo_yellow "Downloading latest version of nvliblist.conf from Apptainer to ${temp_dir}/nvliblist.conf" - curl --silent --output "$temp_dir"/nvliblist.conf https://raw.githubusercontent.com/apptainer/apptainer/main/etc/nvliblist.conf + cd "${host_injection_driver_dir}" || fatal_error "Failed to cd to ${host_injection_driver_dir}" # Make symlinks to all the interesting libraries - grep '.so$' "$temp_dir"/nvliblist.conf | xargs -i grep {} "$temp_dir"/libs.txt | xargs -i ln -s {} + # Loop over each matched library + for library in "${matched_libraries[@]}"; do + # Check if the library file exists + if [ -e "$library" ]; then + # Create a symlink in the current directory + ln -s "$library" . + # Check if the symlink was created successfully + if [ $? -eq 0 ]; then + echo "Successfully created symlink for library $library in $PWD" + else + fatal_error "Error: Failed to create symlink for library $library in $PWD" + fi + else + echo "Warning: Library not found: $library" + fi + done - # Inject driver and CUDA versions into dir - echo $host_driver_version > driver_version.txt - echo $host_cuda_version > cuda_version.txt + # Inject driver and CUDA versions into the directory + echo "$host_driver_version" > driver_version.txt + echo "$host_cuda_version" > cuda_version.txt drivers_linked=1 - - # Remove the temporary directory when done - rm -r "$temp_dir" fi # Make latest symlink for NVIDIA drivers -cd $host_injections_nvidia_dir +cd "$host_injections_nvidia_dir" || fatal_error "Failed to cd to $host_injections_nvidia_dir" symlink="latest" if [ -L "$symlink" ]; then - # Unless the drivers have been installed, leave the symlink alone if [ "$drivers_linked" -eq 1 ]; then - ln -sf host latest + ln -sf host "$symlink" + if [ $? -eq 0 ]; then + echo "Successfully created symlink between $symlink and host in $PWD" + else + fatal_error "Failed to create symlink between $symlink and host in $PWD" + fi fi else - # No link exists yet - ln -s host latest + ln -s host "$symlink" + if [ $? -eq 0 ]; then + echo "Successfully created symlink between $symlink and host in $PWD" + else + fatal_error "Failed to create symlink between $symlink and host in $PWD" + fi fi # Make sure the libraries can be found by the EESSI linker host_injection_linker_dir=${EESSI_EPREFIX/versions/host_injections} if [ -L "$host_injection_linker_dir/lib" ]; then target_path=$(readlink -f "$host_injection_linker_dir/lib") - if [ "$target_path" != "$$host_injections_nvidia_dir/latest" ]; then - cd $host_injection_linker_dir - ln -sf $host_injections_nvidia_dir/latest lib + if [ "$target_path" != "$host_injections_nvidia_dir/latest" ]; then + cd "$host_injection_linker_dir" || fatal_error "Failed to cd to $host_injection_linker_dir" + ln -sf "$host_injections_nvidia_dir/latest" lib + if [ $? -eq 0 ]; then + echo "Successfully created symlink between $host_injections_nvidia_dir/latest and lib in $PWD" + else + fatal_error "Failed to create symlink between $host_injections_nvidia_dir/latest and lib in $PWD" + fi fi else - create_directory_structure $host_injection_linker_dir - cd $host_injection_linker_dir - ln -s $host_injections_nvidia_dir/latest lib + create_directory_structure "$host_injection_linker_dir" + cd "$host_injection_linker_dir" || fatal_error "Failed to cd to $host_injection_linker_dir" + ln -s "$host_injections_nvidia_dir/latest" lib + if [ $? -eq 0 ]; then + echo "Successfully created symlink between $host_injections_nvidia_dir/latest and lib in $PWD" + else + fatal_error "Failed to create symlink between $host_injections_nvidia_dir/latest and lib in $PWD" + fi fi echo_green "Host NVIDIA GPU drivers linked successfully for EESSI" From 7229764b75a29836a96509a62476456bdd0258e6 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 27 Sep 2024 14:08:10 +0200 Subject: [PATCH 02/32] Make an option to not download anything --- .../gpu_support/nvidia/link_nvidia_host_libraries.sh | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index 718229b1d7..71872665bd 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -94,8 +94,13 @@ get_nvlib_list() { "tls_test_.so" ) + # Check if the function was called with the "default" argument + if [[ "$1" == "default" ]]; then + printf "%s\n" "${default_nvlib_list[@]}" + return 0 + fi + # Try to download the nvliblist.conf file with curl - echo_yellow "Downloading latest version of nvliblist.conf from Apptainer" nvliblist_content=$(curl --silent "$nvliblist_url") # Check if curl failed (i.e., the content is empty) @@ -115,11 +120,13 @@ command -v nvidia-smi >/dev/null 2>&1 || echo_yellow "nvidia-smi not found, this # Variables LD_PRELOAD_MODE=0 +DOWNLOAD="" # Parse command-line options while [[ "$#" -gt 0 ]]; do case $1 in --ld-preload) LD_PRELOAD_MODE=1 ;; # Enable LD_PRELOAD mode + --no-download) DOWNLOAD="default" ;; # Download latest list of CUDA libraries *) fatal_error "Unknown option: $1";; esac shift @@ -152,7 +159,7 @@ host_libraries=$($host_ldconfig -p | awk '{print $NF}') singularity_libs=$(ls /.singularity.d/libs/* 2>/dev/null) # Now gather the list of possible CUDA libraries -cuda_candidate_libraries=$(get_nvlib_list) +cuda_candidate_libraries=$(get_nvlib_list "${DOWNLOAD}") # Check if the function returned an error (e.g., curl failed) if [ $? -ne 0 ]; then echo "Using default list of libraries" From 513cbb7957dea10447ee1dafecad72ae02fd896d Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 27 Sep 2024 14:16:42 +0200 Subject: [PATCH 03/32] Make sure the umask allows for global reading --- .../nvidia/link_nvidia_host_libraries.sh | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index 71872665bd..b5ec314096 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -114,6 +114,21 @@ get_nvlib_list() { echo "$nvliblist_content" | grep '.so$' } +# Function to check if umask allows global read +check_global_read() { + # Get the current umask value + local current_umask=$(umask) + + # Convert umask to decimal to analyze + local umask_decimal=$((8#$current_umask)) + + # Check if umask allows global read + if [[ $umask_decimal -eq 0 || $umask_decimal -eq 22 ]]; then + echo "The current umask ($current_umask) allows global read permissions." + else + fatal_error "The current umask ($current_umask) does not allow global read permissions." + fi +} # Check for required commands command -v nvidia-smi >/dev/null 2>&1 || echo_yellow "nvidia-smi not found, this script won't do anything useful"; exit @@ -208,6 +223,7 @@ link_drivers=1 # Make sure that target of host_injections variant symlink is an existing directory host_injections_target=$(realpath -m "${EESSI_CVMFS_REPO}/host_injections") if [ ! -d "$host_injections_target" ]; then + check_global_read create_directory_structure "$host_injections_target" fi @@ -227,6 +243,7 @@ fi drivers_linked=0 if [ "$link_drivers" -eq 1 ]; then + check_global_read if ! create_directory_structure "${host_injection_driver_dir}" ; then fatal_error "No write permissions to directory ${host_injection_driver_dir}" fi @@ -291,6 +308,7 @@ if [ -L "$host_injection_linker_dir/lib" ]; then fi fi else + check_global_read create_directory_structure "$host_injection_linker_dir" cd "$host_injection_linker_dir" || fatal_error "Failed to cd to $host_injection_linker_dir" ln -s "$host_injections_nvidia_dir/latest" lib From 3b80a1b3843997b09fb178245c462d8fe3672ad2 Mon Sep 17 00:00:00 2001 From: ocaisa Date: Fri, 27 Sep 2024 14:38:00 +0200 Subject: [PATCH 04/32] Careful on the exit --- scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index b5ec314096..22dfb138bb 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -131,7 +131,7 @@ check_global_read() { } # Check for required commands -command -v nvidia-smi >/dev/null 2>&1 || echo_yellow "nvidia-smi not found, this script won't do anything useful"; exit +command -v nvidia-smi >/dev/null 2>&1 || { echo_yellow "nvidia-smi not found, this script won't do anything useful"; exit 1; } # Variables LD_PRELOAD_MODE=0 From 2475e4d5ca4a9100aaaba04d055a919d64e94989 Mon Sep 17 00:00:00 2001 From: ocaisa Date: Fri, 27 Sep 2024 14:45:09 +0200 Subject: [PATCH 05/32] Referenced wrong variable --- scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index 22dfb138bb..51babbba3d 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -187,7 +187,7 @@ fi matched_libraries=() # Process each library and check for matches in libs.txt -echo "$nvlib_list" | while read -r library; do +for library in "${cuda_candidate_libraries[@]}"; do # Search for the library in libs.txt and add it to the matched_libraries array matched=$(echo "$ldconfig_output $singularity_libs" | grep "$library") if [ -n "$matched" ]; then From bd6041ebc70eb37507270ec7b3442ba68c698835 Mon Sep 17 00:00:00 2001 From: ocaisa Date: Fri, 27 Sep 2024 14:56:31 +0200 Subject: [PATCH 06/32] Update link_nvidia_host_libraries.sh --- scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index 51babbba3d..a1ff156f50 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -135,13 +135,13 @@ command -v nvidia-smi >/dev/null 2>&1 || { echo_yellow "nvidia-smi not found, th # Variables LD_PRELOAD_MODE=0 -DOWNLOAD="" +LIBS_LIST="" # Parse command-line options while [[ "$#" -gt 0 ]]; do case $1 in --ld-preload) LD_PRELOAD_MODE=1 ;; # Enable LD_PRELOAD mode - --no-download) DOWNLOAD="default" ;; # Download latest list of CUDA libraries + --no-download) LIBS_LIST="default" ;; # Download latest list of CUDA libraries *) fatal_error "Unknown option: $1";; esac shift @@ -174,7 +174,7 @@ host_libraries=$($host_ldconfig -p | awk '{print $NF}') singularity_libs=$(ls /.singularity.d/libs/* 2>/dev/null) # Now gather the list of possible CUDA libraries -cuda_candidate_libraries=$(get_nvlib_list "${DOWNLOAD}") +cuda_candidate_libraries=$(get_nvlib_list "${LIBS_LIST}") # Check if the function returned an error (e.g., curl failed) if [ $? -ne 0 ]; then echo "Using default list of libraries" From 9d721025e46721f63e8a7a1d96940ff424c05c6a Mon Sep 17 00:00:00 2001 From: ocaisa Date: Fri, 27 Sep 2024 14:58:45 +0200 Subject: [PATCH 07/32] Update link_nvidia_host_libraries.sh --- scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index a1ff156f50..0afc7b6b86 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -157,7 +157,7 @@ if $nvidia_smi_driver_command > /dev/null 2>&1; then echo_green "Found NVIDIA GPU driver version ${host_driver_version}" # If the first worked, this should work too - host_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}') + host_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk '{NF>1; print $NF}') echo_green "Found host CUDA version ${host_cuda_version}" else fatal_error "Failed to execute $nvidia_smi_driver_command" From f04d976048b00126d23933dbd3bd1d32203b4d25 Mon Sep 17 00:00:00 2001 From: ocaisa Date: Fri, 27 Sep 2024 15:12:40 +0200 Subject: [PATCH 08/32] Use same return code in any scenario where we use default list of CUDA libraries --- scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index 0afc7b6b86..8ecd76e780 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -97,7 +97,7 @@ get_nvlib_list() { # Check if the function was called with the "default" argument if [[ "$1" == "default" ]]; then printf "%s\n" "${default_nvlib_list[@]}" - return 0 + return 1 fi # Try to download the nvliblist.conf file with curl @@ -112,6 +112,8 @@ get_nvlib_list() { # If curl succeeded, filter and return the libraries from the downloaded content echo "$nvliblist_content" | grep '.so$' + + return 0 } # Function to check if umask allows global read From 9c2865f9be7c002dde888006675cf7db5dd246b9 Mon Sep 17 00:00:00 2001 From: ocaisa Date: Fri, 27 Sep 2024 15:23:37 +0200 Subject: [PATCH 09/32] Update link_nvidia_host_libraries.sh --- scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index 8ecd76e780..6a42bcdd3c 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -86,7 +86,7 @@ get_nvlib_list() { "libnvidia-rtcore.so" "libnvidia-tls.so" "libnvidia-wfb.so" - "libnvoptix.so.1" + "libnvoptix.so" "libOpenCL.so" "libOpenGL.so" "libvdpau_nvidia.so" From a263d6cb714aafd0ef38959fcc1ef63ee50d8db6 Mon Sep 17 00:00:00 2001 From: ocaisa Date: Fri, 27 Sep 2024 15:40:19 +0200 Subject: [PATCH 10/32] Update link_nvidia_host_libraries.sh --- scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index 6a42bcdd3c..ee42c81b65 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -175,8 +175,8 @@ host_ldconfig=$(get_host_ldconfig) host_libraries=$($host_ldconfig -p | awk '{print $NF}') singularity_libs=$(ls /.singularity.d/libs/* 2>/dev/null) -# Now gather the list of possible CUDA libraries -cuda_candidate_libraries=$(get_nvlib_list "${LIBS_LIST}") +# Now gather the list of possible CUDA libraries and make them into an array +cuda_candidate_libraries=($(get_nvlib_list "${LIBS_LIST}")) # Check if the function returned an error (e.g., curl failed) if [ $? -ne 0 ]; then echo "Using default list of libraries" @@ -191,7 +191,7 @@ matched_libraries=() # Process each library and check for matches in libs.txt for library in "${cuda_candidate_libraries[@]}"; do # Search for the library in libs.txt and add it to the matched_libraries array - matched=$(echo "$ldconfig_output $singularity_libs" | grep "$library") + matched=$(echo "$host_libraries $singularity_libs" | grep "$library") if [ -n "$matched" ]; then matched_libraries+=("$matched") # Add matched library to the array fi From 9f946424fd7e32cd390e2d7d9b4645a82ceef2ef Mon Sep 17 00:00:00 2001 From: ocaisa Date: Fri, 27 Sep 2024 15:42:37 +0200 Subject: [PATCH 11/32] Update link_nvidia_host_libraries.sh --- scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index ee42c81b65..4e7343d351 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -214,6 +214,7 @@ if [ "$LD_PRELOAD_MODE" -eq 1 ]; then echo "Allowing overriding GPU checks in EESSI via EESSI_OVERRIDE_GPU_CHECK" else echo "No libraries matched, LD_PRELOAD not set." + fi exit 0 fi From 71e8898554d937fcc52d285d5cc371e9283df2a7 Mon Sep 17 00:00:00 2001 From: ocaisa Date: Fri, 27 Sep 2024 15:44:58 +0200 Subject: [PATCH 12/32] Update link_nvidia_host_libraries.sh --- scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index 4e7343d351..c18e655110 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -197,9 +197,8 @@ for library in "${cuda_candidate_libraries[@]}"; do fi done -# Output the matched libraries -echo "Matched CUDA Libraries:" -printf "%s\n" "${matched_libraries[@]}" +# Output the number of matched libraries +echo "Matched ${#matched_libraries[@]} CUDA Libraries" # LD_PRELOAD Mode if [ "$LD_PRELOAD_MODE" -eq 1 ]; then From ed9e868e865c4e43b058465fc101df02f16a10c0 Mon Sep 17 00:00:00 2001 From: ocaisa Date: Fri, 27 Sep 2024 15:49:00 +0200 Subject: [PATCH 13/32] This script might be `source`d, don't use exit --- scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index c18e655110..ce3b37cbc5 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -133,7 +133,7 @@ check_global_read() { } # Check for required commands -command -v nvidia-smi >/dev/null 2>&1 || { echo_yellow "nvidia-smi not found, this script won't do anything useful"; exit 1; } +command -v nvidia-smi >/dev/null 2>&1 || { echo_yellow "nvidia-smi not found, this script won't do anything useful"; return 1; } # Variables LD_PRELOAD_MODE=0 @@ -214,7 +214,7 @@ if [ "$LD_PRELOAD_MODE" -eq 1 ]; then else echo "No libraries matched, LD_PRELOAD not set." fi - exit 0 + return 0 fi # If we haven't already exited, we may need to create the symlinks From d8d335e72643a13d782931b2c6d7cd9a08a809ec Mon Sep 17 00:00:00 2001 From: ocaisa Date: Fri, 27 Sep 2024 16:06:20 +0200 Subject: [PATCH 14/32] Update link_nvidia_host_libraries.sh --- scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index ce3b37cbc5..ec911c6e9a 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -214,7 +214,7 @@ if [ "$LD_PRELOAD_MODE" -eq 1 ]; then else echo "No libraries matched, LD_PRELOAD not set." fi - return 0 + [[ "${BASH_SOURCE[0]}" != "${0}" ]] && return 1 || exit 1 fi # If we haven't already exited, we may need to create the symlinks From 2d5c96b8f80408052937ba8ff90835f5ba61ef9e Mon Sep 17 00:00:00 2001 From: ocaisa Date: Fri, 27 Sep 2024 17:26:45 +0200 Subject: [PATCH 15/32] Exclude anything graphics related from an LD_PRELOAD approach --- .../nvidia/link_nvidia_host_libraries.sh | 69 ++++++++++++++++--- 1 file changed, 60 insertions(+), 9 deletions(-) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index ec911c6e9a..8e5a7ddc32 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -202,17 +202,68 @@ echo "Matched ${#matched_libraries[@]} CUDA Libraries" # LD_PRELOAD Mode if [ "$LD_PRELOAD_MODE" -eq 1 ]; then - # Set LD_PRELOAD with the matched libraries - if [ ${#matched_libraries[@]} -gt 0 ]; then - LD_PRELOAD=$(printf "%s\n" "${matched_libraries[@]}" | tr '\n' ':') + echo + echo_yellow "When attempting to use LD_PRELOAD we exclude anything related to graphics" + + # Filter out all libraries that have missing library dependencies under EESSI + filtered_libraries=() + for library in "${matched_libraries[@]}"; do + # Run ldd on the given binary and filter for "not found" libraries + NOT_FOUND_LIBS=$(ldd "$library" 2>/dev/null | grep "not found" | awk '{print $1}') + # Check if it is missing an so dep under EESSI + if [[ -z "$NOT_FOUND_LIBS" ]]; then + # Anything graphics is out, as is libnvidia-fbc* + if [[ "$library" != *"GL"* ]]; then + if [[ "$library" != *"libnvidia-fbc"* ]]; then + filtered_libraries+=("$library") + fi + fi + else + # Iterate over "not found" libraries and check if they are in the array + all_found=true + for lib in $NOT_FOUND_LIBS; do + found=false + for listed_lib in "${matched_libraries[@]}"; do + if [[ "$lib" == "$listed_lib" ]]; then + found=true + break + fi + done + + if [[ "$found" == false ]]; then + echo "$lib is NOT in the provided preload list, filtering $library." + all_found=false + break + fi + done + + # If we find all the missing libs in our list include it + if [[ "$all_found" == true ]]; then + # Anything graphics is out, as is libnvidia-fbc* + if [[ "$library" != *"GL"* ]]; then + if [[ "$library" != *"libnvidia-fbc"* ]]; then + filtered_libraries+=("$library") + fi + fi + fi + fi + done + + # Set EESSI_GPU_LD_PRELOAD with the matched libraries + if [ ${#filtered_libraries[@]} -gt 0 ]; then + echo + echo_yellow "The recommended way to use LD_PRELOAD is to only use it when you need to:" + echo + EESSI_GPU_LD_PRELOAD=$(printf "%s\n" "${filtered_libraries[@]}" | tr '\n' ':') # Remove the trailing colon from LD_PRELOAD if it exists - LD_PRELOAD=${LD_PRELOAD%:} - export LD_PRELOAD - echo "LD_PRELOAD set to: $LD_PRELOAD" + EESSI_GPU_LD_PRELOAD=${EESSI_GPU_LD_PRELOAD%:} + export EESSI_GPU_LD_PRELOAD + echo_green "export EESSI_GPU_LD_PRELOAD=\"$EESSI_GPU_LD_PRELOAD\"" export EESSI_OVERRIDE_GPU_CHECK=1 - echo "Allowing overriding GPU checks in EESSI via EESSI_OVERRIDE_GPU_CHECK" - else - echo "No libraries matched, LD_PRELOAD not set." + echo_green "export EESSI_OVERRIDE_GPU_CHECK=\"$EESSI_OVERRIDE_GPU_CHECK\"" + echo + echo_yellow "Then you can set LD_PRELOAD only when you want to run a GPU application, e.g.," + echo_yellow " LD_PRELOAD=\"\$EESSI_GPU_LD_PRELOAD\" device_query" fi [[ "${BASH_SOURCE[0]}" != "${0}" ]] && return 1 || exit 1 fi From 1a325f2823fdb6bfc5208b7dee5194b428f35f99 Mon Sep 17 00:00:00 2001 From: ocaisa Date: Fri, 27 Sep 2024 17:31:22 +0200 Subject: [PATCH 16/32] Update link_nvidia_host_libraries.sh --- scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index 8e5a7ddc32..a356e0e3a5 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -264,6 +264,8 @@ if [ "$LD_PRELOAD_MODE" -eq 1 ]; then echo echo_yellow "Then you can set LD_PRELOAD only when you want to run a GPU application, e.g.," echo_yellow " LD_PRELOAD=\"\$EESSI_GPU_LD_PRELOAD\" device_query" + else + echo "No libraries matched, LD_PRELOAD not set." fi [[ "${BASH_SOURCE[0]}" != "${0}" ]] && return 1 || exit 1 fi From 000e6c1cc26002aacea88e12de9d7bcda6c93cb0 Mon Sep 17 00:00:00 2001 From: ocaisa Date: Fri, 27 Sep 2024 17:59:33 +0200 Subject: [PATCH 17/32] Update link_nvidia_host_libraries.sh --- .../nvidia/link_nvidia_host_libraries.sh | 31 +++++++------------ 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index a356e0e3a5..3de39e33a4 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -120,15 +120,13 @@ get_nvlib_list() { check_global_read() { # Get the current umask value local current_umask=$(umask) - + # Convert umask to decimal to analyze - local umask_decimal=$((8#$current_umask)) + local umask_octal=$(printf '%03o\n' "$current_umask") # Check if umask allows global read - if [[ $umask_decimal -eq 0 || $umask_decimal -eq 22 ]]; then - echo "The current umask ($current_umask) allows global read permissions." - else - fatal_error "The current umask ($current_umask) does not allow global read permissions." + if [ "$umask_octal" -gt 022 ]; then + fatal_error "The current umask ($current_umask) does not allow global read permissions, you'll want everyone to be able to read the created directory." fi } @@ -157,7 +155,7 @@ nvidia_smi_driver_command="nvidia-smi --query-gpu=driver_version --format=csv,no if $nvidia_smi_driver_command > /dev/null 2>&1; then host_driver_version=$($nvidia_smi_driver_command | tail -n1) echo_green "Found NVIDIA GPU driver version ${host_driver_version}" - + # If the first worked, this should work too host_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk '{NF>1; print $NF}') echo_green "Found host CUDA version ${host_cuda_version}" @@ -193,7 +191,7 @@ for library in "${cuda_candidate_libraries[@]}"; do # Search for the library in libs.txt and add it to the matched_libraries array matched=$(echo "$host_libraries $singularity_libs" | grep "$library") if [ -n "$matched" ]; then - matched_libraries+=("$matched") # Add matched library to the array + matched_libraries+=( $matched ) # Add matched library to the array fi done @@ -307,18 +305,11 @@ if [ "$link_drivers" -eq 1 ]; then # Make symlinks to all the interesting libraries # Loop over each matched library for library in "${matched_libraries[@]}"; do - # Check if the library file exists - if [ -e "$library" ]; then - # Create a symlink in the current directory - ln -s "$library" . - # Check if the symlink was created successfully - if [ $? -eq 0 ]; then - echo "Successfully created symlink for library $library in $PWD" - else - fatal_error "Error: Failed to create symlink for library $library in $PWD" - fi - else - echo "Warning: Library not found: $library" + # Create a symlink in the current directory + ln -s "$library" . + # Check if the symlink was created successfully + if [ $? -ne 0 ]; then + fatal_error "Error: Failed to create symlink for library $library in $PWD" fi done From f0892b19d03ae5f51c1c83b6b3930d9767bb25fc Mon Sep 17 00:00:00 2001 From: ocaisa Date: Wed, 2 Oct 2024 10:13:56 +0200 Subject: [PATCH 18/32] Update link_nvidia_host_libraries.sh --- scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index 3de39e33a4..07c57360fa 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -229,7 +229,7 @@ if [ "$LD_PRELOAD_MODE" -eq 1 ]; then done if [[ "$found" == false ]]; then - echo "$lib is NOT in the provided preload list, filtering $library." + echo "$lib is NOT in the provided preload list, filtering $library" all_found=false break fi From 02b48afb928baad72f3307ed689413eb83877cd3 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Wed, 9 Oct 2024 16:59:06 +0200 Subject: [PATCH 19/32] Switch to nvidia-smi commands that are spoofed in CI --- .../nvidia/link_nvidia_host_libraries.sh | 27 ++++++++++++------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index 07c57360fa..2ecb7c06c0 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -150,17 +150,24 @@ done # Gather information about NVIDIA drivers (even if we are inside a Gentoo Prefix in a container) export LD_LIBRARY_PATH=/.singularity.d/libs:$LD_LIBRARY_PATH -# Command to give to get the CUDA driver version -nvidia_smi_driver_command="nvidia-smi --query-gpu=driver_version --format=csv,noheader" -if $nvidia_smi_driver_command > /dev/null 2>&1; then - host_driver_version=$($nvidia_smi_driver_command | tail -n1) - echo_green "Found NVIDIA GPU driver version ${host_driver_version}" - - # If the first worked, this should work too - host_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk '{NF>1; print $NF}') - echo_green "Found host CUDA version ${host_cuda_version}" +# Check for NVIDIA GPUs via nvidia-smi command +nvidia_smi=$(command -v nvidia-smi) +if [[ $? -eq 0 ]]; then + nvidia_smi_out=$(mktemp -p /tmp nvidia_smi_out.XXXXX) + nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader 2>&1 > $nvidia_smi_out + if [[ $? -eq 0 ]]; then + nvidia_smi_info=$(head -1 $nvidia_smi_out) + host_cuda_version=$(echo $nvidia_smi_info | sed 's/, /,/g' | cut -f4 -d, | sed 's/\.//g') + host_driver_version=$(echo $nvidia_smi_info | sed 's/, /,/g' | cut -f3 -d,) + echo_green "Found host CUDA version ${host_cuda_version}" + echo_green "Found NVIDIA GPU driver version ${host_driver_version}" + rm -f $nvidia_smi_out + else + fatal_error "nvidia-smi command failed, see output in $nvidia_smi_out" + fi else - fatal_error "Failed to execute $nvidia_smi_driver_command" + fatal_error "nvidia-smi command not found" + exit 2 fi # Gather any CUDA related driver libraries from the host From 6410c940d03b98d0d0dd22fee462750eaa061797 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Wed, 9 Oct 2024 17:22:41 +0200 Subject: [PATCH 20/32] Filter out symlinks in LD_PRELOAD mode --- .../nvidia/link_nvidia_host_libraries.sh | 65 ++++++++++--------- 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index 2ecb7c06c0..7408eabf27 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -210,44 +210,47 @@ if [ "$LD_PRELOAD_MODE" -eq 1 ]; then echo echo_yellow "When attempting to use LD_PRELOAD we exclude anything related to graphics" - # Filter out all libraries that have missing library dependencies under EESSI + # Filter out all symlinks and libraries that have missing library dependencies under EESSI filtered_libraries=() for library in "${matched_libraries[@]}"; do - # Run ldd on the given binary and filter for "not found" libraries - NOT_FOUND_LIBS=$(ldd "$library" 2>/dev/null | grep "not found" | awk '{print $1}') - # Check if it is missing an so dep under EESSI - if [[ -z "$NOT_FOUND_LIBS" ]]; then - # Anything graphics is out, as is libnvidia-fbc* - if [[ "$library" != *"GL"* ]]; then - if [[ "$library" != *"libnvidia-fbc"* ]]; then - filtered_libraries+=("$library") + if [ ! -L "$library" ]; then + # $library is not a symlink + # Run ldd on the given binary and filter for "not found" libraries + NOT_FOUND_LIBS=$(ldd "$library" 2>/dev/null | grep "not found" | awk '{print $1}') + # Check if it is missing an so dep under EESSI + if [[ -z "$NOT_FOUND_LIBS" ]]; then + # Anything graphics is out, as is libnvidia-fbc* + if [[ "$library" != *"GL"* ]]; then + if [[ "$library" != *"libnvidia-fbc"* ]]; then + filtered_libraries+=("$library") + fi fi - fi - else - # Iterate over "not found" libraries and check if they are in the array - all_found=true - for lib in $NOT_FOUND_LIBS; do - found=false - for listed_lib in "${matched_libraries[@]}"; do - if [[ "$lib" == "$listed_lib" ]]; then - found=true + else + # Iterate over "not found" libraries and check if they are in the array + all_found=true + for lib in $NOT_FOUND_LIBS; do + found=false + for listed_lib in "${matched_libraries[@]}"; do + if [[ "$lib" == "$listed_lib" ]]; then + found=true + break + fi + done + + if [[ "$found" == false ]]; then + echo "$lib is NOT in the provided preload list, filtering $library" + all_found=false break fi done - if [[ "$found" == false ]]; then - echo "$lib is NOT in the provided preload list, filtering $library" - all_found=false - break - fi - done - - # If we find all the missing libs in our list include it - if [[ "$all_found" == true ]]; then - # Anything graphics is out, as is libnvidia-fbc* - if [[ "$library" != *"GL"* ]]; then - if [[ "$library" != *"libnvidia-fbc"* ]]; then - filtered_libraries+=("$library") + # If we find all the missing libs in our list include it + if [[ "$all_found" == true ]]; then + # Anything graphics is out, as is libnvidia-fbc* + if [[ "$library" != *"GL"* ]]; then + if [[ "$library" != *"libnvidia-fbc"* ]]; then + filtered_libraries+=("$library") + fi fi fi fi From b8257ca61e41de7e5f5c6706380adfec8299898b Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Wed, 9 Oct 2024 17:30:41 +0200 Subject: [PATCH 21/32] Retain CUDA compute capability with . --- scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index 7408eabf27..b08a4b50b1 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -157,7 +157,7 @@ if [[ $? -eq 0 ]]; then nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader 2>&1 > $nvidia_smi_out if [[ $? -eq 0 ]]; then nvidia_smi_info=$(head -1 $nvidia_smi_out) - host_cuda_version=$(echo $nvidia_smi_info | sed 's/, /,/g' | cut -f4 -d, | sed 's/\.//g') + host_cuda_version=$(echo $nvidia_smi_info | sed 's/, /,/g' | cut -f4 -d,) host_driver_version=$(echo $nvidia_smi_info | sed 's/, /,/g' | cut -f3 -d,) echo_green "Found host CUDA version ${host_cuda_version}" echo_green "Found NVIDIA GPU driver version ${host_driver_version}" From 9070096978e5dc77185e82db2c7f621ebacfb777 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Wed, 9 Oct 2024 20:19:16 +0200 Subject: [PATCH 22/32] Resolve all symlinks in LD_PRELOAD mode --- .../nvidia/link_nvidia_host_libraries.sh | 60 ++++++++++--------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index b08a4b50b1..b9e6877ce9 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -213,42 +213,44 @@ if [ "$LD_PRELOAD_MODE" -eq 1 ]; then # Filter out all symlinks and libraries that have missing library dependencies under EESSI filtered_libraries=() for library in "${matched_libraries[@]}"; do - if [ ! -L "$library" ]; then - # $library is not a symlink - # Run ldd on the given binary and filter for "not found" libraries - NOT_FOUND_LIBS=$(ldd "$library" 2>/dev/null | grep "not found" | awk '{print $1}') - # Check if it is missing an so dep under EESSI - if [[ -z "$NOT_FOUND_LIBS" ]]; then - # Anything graphics is out, as is libnvidia-fbc* - if [[ "$library" != *"GL"* ]]; then - if [[ "$library" != *"libnvidia-fbc"* ]]; then + library=$(realpath "$library") + # Run ldd on the given binary and filter for "not found" libraries + NOT_FOUND_LIBS=$(ldd "$library" 2>/dev/null | grep "not found" | awk '{print $1}') + # Check if it is missing an so dep under EESSI + if [[ -z "$NOT_FOUND_LIBS" ]]; then + # Anything graphics is out, as is libnvidia-fbc* + if [[ "$library" != *"GL"* ]]; then + if [[ "$library" != *"libnvidia-fbc"* ]]; then + if [[ ! " ${filtered_libraries[@]} " =~ " $library " ]]; then filtered_libraries+=("$library") fi fi - else - # Iterate over "not found" libraries and check if they are in the array - all_found=true - for lib in $NOT_FOUND_LIBS; do - found=false - for listed_lib in "${matched_libraries[@]}"; do - if [[ "$lib" == "$listed_lib" ]]; then - found=true - break - fi - done - - if [[ "$found" == false ]]; then - echo "$lib is NOT in the provided preload list, filtering $library" - all_found=false + fi + else + # Iterate over "not found" libraries and check if they are in the array + all_found=true + for lib in $NOT_FOUND_LIBS; do + found=false + for listed_lib in "${matched_libraries[@]}"; do + if [[ "$lib" == "$listed_lib" ]]; then + found=true break fi done - # If we find all the missing libs in our list include it - if [[ "$all_found" == true ]]; then - # Anything graphics is out, as is libnvidia-fbc* - if [[ "$library" != *"GL"* ]]; then - if [[ "$library" != *"libnvidia-fbc"* ]]; then + if [[ "$found" == false ]]; then + echo "$lib is NOT in the provided preload list, filtering $library" + all_found=false + break + fi + done + + # If we find all the missing libs in our list include it + if [[ "$all_found" == true ]]; then + # Anything graphics is out, as is libnvidia-fbc* + if [[ "$library" != *"GL"* ]]; then + if [[ "$library" != *"libnvidia-fbc"* ]]; then + if [[ ! " ${filtered_libraries[@]} " =~ " $library " ]]; then filtered_libraries+=("$library") fi fi From add2574606646105a72975ee7916824319084ac9 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Wed, 9 Oct 2024 20:26:51 +0200 Subject: [PATCH 23/32] Don't resolve symlinks until we store them --- scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index b9e6877ce9..97948a5e70 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -213,7 +213,6 @@ if [ "$LD_PRELOAD_MODE" -eq 1 ]; then # Filter out all symlinks and libraries that have missing library dependencies under EESSI filtered_libraries=() for library in "${matched_libraries[@]}"; do - library=$(realpath "$library") # Run ldd on the given binary and filter for "not found" libraries NOT_FOUND_LIBS=$(ldd "$library" 2>/dev/null | grep "not found" | awk '{print $1}') # Check if it is missing an so dep under EESSI @@ -221,6 +220,8 @@ if [ "$LD_PRELOAD_MODE" -eq 1 ]; then # Anything graphics is out, as is libnvidia-fbc* if [[ "$library" != *"GL"* ]]; then if [[ "$library" != *"libnvidia-fbc"* ]]; then + # Resolve any symlink + library=$(realpath "$library") if [[ ! " ${filtered_libraries[@]} " =~ " $library " ]]; then filtered_libraries+=("$library") fi @@ -250,6 +251,8 @@ if [ "$LD_PRELOAD_MODE" -eq 1 ]; then # Anything graphics is out, as is libnvidia-fbc* if [[ "$library" != *"GL"* ]]; then if [[ "$library" != *"libnvidia-fbc"* ]]; then + # Resolve any symlink + library=$(realpath "$library") if [[ ! " ${filtered_libraries[@]} " =~ " $library " ]]; then filtered_libraries+=("$library") fi From 9fcf33c8c76fd1b41543393a18fc380e04fc9dc3 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Wed, 9 Oct 2024 20:40:23 +0200 Subject: [PATCH 24/32] Allow partial maps for dependent .so files --- scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index 97948a5e70..a35023ebdb 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -233,7 +233,8 @@ if [ "$LD_PRELOAD_MODE" -eq 1 ]; then for lib in $NOT_FOUND_LIBS; do found=false for listed_lib in "${matched_libraries[@]}"; do - if [[ "$lib" == "$listed_lib" ]]; then + # Matching to the .so is enough + if [[ "$lib" == "$listed_lib"* ]]; then found=true break fi From 26a91f07876e31d55f901c4ec14737f0e5ecadb7 Mon Sep 17 00:00:00 2001 From: ocaisa Date: Wed, 9 Oct 2024 20:57:58 +0200 Subject: [PATCH 25/32] Make sure EESSI is initialised --- scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index a35023ebdb..1b450c6a37 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -130,6 +130,9 @@ check_global_read() { fi } +# Make sure EESSI is initialised (doesn't matter what version) +check_eessi_initialised + # Check for required commands command -v nvidia-smi >/dev/null 2>&1 || { echo_yellow "nvidia-smi not found, this script won't do anything useful"; return 1; } From 5b3fb27acdaa4a6ab92325b4ffa4abaabea6e930 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Thu, 10 Oct 2024 11:57:37 +0200 Subject: [PATCH 26/32] Try to improve things --- .../nvidia/link_nvidia_host_libraries.sh | 79 +++++++++++++------ 1 file changed, 54 insertions(+), 25 deletions(-) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index a35023ebdb..2a2306d2d8 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -209,32 +209,45 @@ echo "Matched ${#matched_libraries[@]} CUDA Libraries" if [ "$LD_PRELOAD_MODE" -eq 1 ]; then echo echo_yellow "When attempting to use LD_PRELOAD we exclude anything related to graphics" - + local cuda_compat_nvlib_list=( + "libcuda.so" + "libcudadebugger.so" + "libnvidia-nvvm.so" + "libnvidia-ptxjitcompiler.so" + ) # Filter out all symlinks and libraries that have missing library dependencies under EESSI filtered_libraries=() + compat_filtered_libraries=() for library in "${matched_libraries[@]}"; do # Run ldd on the given binary and filter for "not found" libraries - NOT_FOUND_LIBS=$(ldd "$library" 2>/dev/null | grep "not found" | awk '{print $1}') + not_found_libs=$(ldd "$library" 2>/dev/null | grep "not found" | awk '{print $1}') # Check if it is missing an so dep under EESSI - if [[ -z "$NOT_FOUND_LIBS" ]]; then - # Anything graphics is out, as is libnvidia-fbc* - if [[ "$library" != *"GL"* ]]; then - if [[ "$library" != *"libnvidia-fbc"* ]]; then - # Resolve any symlink - library=$(realpath "$library") - if [[ ! " ${filtered_libraries[@]} " =~ " $library " ]]; then - filtered_libraries+=("$library") + if [[ -z "$not_found_libs" ]]; then + # Resolve any symlink + realpath_library=$(realpath "$library") + if [[ ! " ${filtered_libraries[@]} " =~ " $realpath_library " ]]; then + filtered_libraries+=("$realpath_library") + # Also prepare compat only libraries for the short list + for item in "${cuda_compat_nvlib_list[@]}"; do + # Check if the current item is a substring of $library + if [[ "$realpath_library" == "$item"* ]]; then + echo "Match found for $item for CUDA compat libraries" + if [[ ! " ${compat_filtered_libraries[@]} " =~ " $realpath_library " ]]; then + compat_filtered_libraries+=("$realpath_library") + fi + break fi - fi - fi + done + fi else # Iterate over "not found" libraries and check if they are in the array all_found=true - for lib in $NOT_FOUND_LIBS; do + for lib in $not_found_libs; do found=false for listed_lib in "${matched_libraries[@]}"; do - # Matching to the .so is enough - if [[ "$lib" == "$listed_lib"* ]]; then + # Matching to the .so or a symlink target is enough + realpath_lib=$(realpath "$listed_lib") + if [[ "$lib" == "$listed_lib"* || "$lib" == "$realpath_lib" ]]; then found=true break fi @@ -249,15 +262,21 @@ if [ "$LD_PRELOAD_MODE" -eq 1 ]; then # If we find all the missing libs in our list include it if [[ "$all_found" == true ]]; then - # Anything graphics is out, as is libnvidia-fbc* - if [[ "$library" != *"GL"* ]]; then - if [[ "$library" != *"libnvidia-fbc"* ]]; then - # Resolve any symlink - library=$(realpath "$library") - if [[ ! " ${filtered_libraries[@]} " =~ " $library " ]]; then - filtered_libraries+=("$library") + # Resolve any symlink + realpath_library=$(realpath "$library") + if [[ ! " ${filtered_libraries[@]} " =~ " $realpath_library " ]]; then + filtered_libraries+=("$realpath_library") + # Also prepare compat only libraries for the short list + for item in "${cuda_compat_nvlib_list[@]}"; do + # Check if the current item is a substring of $library + if [[ "$realpath_library" == "$item"* ]]; then + echo "Match found for $item for CUDA compat libraries" + if [[ ! " ${compat_filtered_libraries[@]} " =~ " $realpath_library " ]]; then + compat_filtered_libraries+=("$realpath_library") + fi + break fi - fi + done fi fi fi @@ -266,18 +285,28 @@ if [ "$LD_PRELOAD_MODE" -eq 1 ]; then # Set EESSI_GPU_LD_PRELOAD with the matched libraries if [ ${#filtered_libraries[@]} -gt 0 ]; then echo - echo_yellow "The recommended way to use LD_PRELOAD is to only use it when you need to:" + echo_yellow "The recommended way to use LD_PRELOAD is to only use it when you need to." echo + EESSI_GPU_COMPAT_LD_PRELOAD=$(printf "%s\n" "${compat_filtered_libraries[@]}" | tr '\n' ':') + # Remove the trailing colon from LD_PRELOAD if it exists + EESSI_GPU_COMPAT_LD_PRELOAD=${EESSI_GPU_COMPAT_LD_PRELOAD%:} + export EESSI_GPU_COMPAT_LD_PRELOAD + echo_yellow "A minimal preload which should work in most cases:" + echo_green "export EESSI_GPU_COMPAT_LD_PRELOAD=\"$EESSI_GPU_LD_PRELOAD\"" + echo + EESSI_GPU_LD_PRELOAD=$(printf "%s\n" "${filtered_libraries[@]}" | tr '\n' ':') # Remove the trailing colon from LD_PRELOAD if it exists EESSI_GPU_LD_PRELOAD=${EESSI_GPU_LD_PRELOAD%:} export EESSI_GPU_LD_PRELOAD + echo_yellow "A corner-case full preload (which is hard on memory) for exceptional use:" + echo_green "export EESSI_GPU_LD_PRELOAD=\"$EESSI_GPU_LD_PRELOAD\"" export EESSI_OVERRIDE_GPU_CHECK=1 echo_green "export EESSI_OVERRIDE_GPU_CHECK=\"$EESSI_OVERRIDE_GPU_CHECK\"" echo echo_yellow "Then you can set LD_PRELOAD only when you want to run a GPU application, e.g.," - echo_yellow " LD_PRELOAD=\"\$EESSI_GPU_LD_PRELOAD\" device_query" + echo_yellow " LD_PRELOAD=\"\$EESSI_GPU_COMPAT_LD_PRELOAD\" device_query" else echo "No libraries matched, LD_PRELOAD not set." fi From 6534b99d2c5a78528a872f6787b991ba7a69abc6 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Thu, 10 Oct 2024 12:03:44 +0200 Subject: [PATCH 27/32] Try to improve things --- scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index 5fb3bfc2ba..ce3d75ab4a 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -233,7 +233,7 @@ if [ "$LD_PRELOAD_MODE" -eq 1 ]; then # Also prepare compat only libraries for the short list for item in "${cuda_compat_nvlib_list[@]}"; do # Check if the current item is a substring of $library - if [[ "$realpath_library" == "$item"* ]]; then + if [[ "$library" == "$item"* ]]; then echo "Match found for $item for CUDA compat libraries" if [[ ! " ${compat_filtered_libraries[@]} " =~ " $realpath_library " ]]; then compat_filtered_libraries+=("$realpath_library") @@ -272,7 +272,7 @@ if [ "$LD_PRELOAD_MODE" -eq 1 ]; then # Also prepare compat only libraries for the short list for item in "${cuda_compat_nvlib_list[@]}"; do # Check if the current item is a substring of $library - if [[ "$realpath_library" == "$item"* ]]; then + if [[ "$library" == "$item"* ]]; then echo "Match found for $item for CUDA compat libraries" if [[ ! " ${compat_filtered_libraries[@]} " =~ " $realpath_library " ]]; then compat_filtered_libraries+=("$realpath_library") From d0b02ed4c2e456ed6612d4605f9950245506e366 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Thu, 10 Oct 2024 12:04:24 +0200 Subject: [PATCH 28/32] Try to improve things --- scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index ce3d75ab4a..7083905446 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -212,7 +212,7 @@ echo "Matched ${#matched_libraries[@]} CUDA Libraries" if [ "$LD_PRELOAD_MODE" -eq 1 ]; then echo echo_yellow "When attempting to use LD_PRELOAD we exclude anything related to graphics" - local cuda_compat_nvlib_list=( + cuda_compat_nvlib_list=( "libcuda.so" "libcudadebugger.so" "libnvidia-nvvm.so" From e9a1d293cc7c2bfeb2854bae155e74437d76a555 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Thu, 10 Oct 2024 12:07:54 +0200 Subject: [PATCH 29/32] Try to improve things --- scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index 7083905446..d716f5b2e9 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -233,7 +233,7 @@ if [ "$LD_PRELOAD_MODE" -eq 1 ]; then # Also prepare compat only libraries for the short list for item in "${cuda_compat_nvlib_list[@]}"; do # Check if the current item is a substring of $library - if [[ "$library" == "$item"* ]]; then + if [[ "$realpath_library" == *"$item"* ]]; then echo "Match found for $item for CUDA compat libraries" if [[ ! " ${compat_filtered_libraries[@]} " =~ " $realpath_library " ]]; then compat_filtered_libraries+=("$realpath_library") @@ -272,7 +272,7 @@ if [ "$LD_PRELOAD_MODE" -eq 1 ]; then # Also prepare compat only libraries for the short list for item in "${cuda_compat_nvlib_list[@]}"; do # Check if the current item is a substring of $library - if [[ "$library" == "$item"* ]]; then + if [[ "$realpath_library" == *"$item"* ]]; then echo "Match found for $item for CUDA compat libraries" if [[ ! " ${compat_filtered_libraries[@]} " =~ " $realpath_library " ]]; then compat_filtered_libraries+=("$realpath_library") From ac224a09f450af97dd82c41a55fd17123b918cd8 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Thu, 10 Oct 2024 12:11:07 +0200 Subject: [PATCH 30/32] Try to improve things --- scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index d716f5b2e9..1b02c14efa 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -295,7 +295,7 @@ if [ "$LD_PRELOAD_MODE" -eq 1 ]; then EESSI_GPU_COMPAT_LD_PRELOAD=${EESSI_GPU_COMPAT_LD_PRELOAD%:} export EESSI_GPU_COMPAT_LD_PRELOAD echo_yellow "A minimal preload which should work in most cases:" - echo_green "export EESSI_GPU_COMPAT_LD_PRELOAD=\"$EESSI_GPU_LD_PRELOAD\"" + echo_green "export EESSI_GPU_COMPAT_LD_PRELOAD=\"$EESSI_GPU_COMPAT_LD_PRELOAD\"" echo EESSI_GPU_LD_PRELOAD=$(printf "%s\n" "${filtered_libraries[@]}" | tr '\n' ':') From 465dd47e7aa7f9a873925339fc63790a1ae839ea Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Thu, 10 Oct 2024 12:14:56 +0200 Subject: [PATCH 31/32] Try to improve things --- scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index 1b02c14efa..967e7d3589 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -250,7 +250,7 @@ if [ "$LD_PRELOAD_MODE" -eq 1 ]; then for listed_lib in "${matched_libraries[@]}"; do # Matching to the .so or a symlink target is enough realpath_lib=$(realpath "$listed_lib") - if [[ "$lib" == "$listed_lib"* || "$lib" == "$realpath_lib" ]]; then + if [[ "$lib" == "$listed_lib"* || "$realpath_lib" == *"$lib" ]]; then found=true break fi From 3eae3f265480ab2f2343f313fbcc6664d02b4855 Mon Sep 17 00:00:00 2001 From: ocaisa Date: Thu, 7 Nov 2024 10:16:24 +0100 Subject: [PATCH 32/32] Apply suggestions from code review Accepted all except one Co-authored-by: TopRichard <121792457+TopRichard@users.noreply.github.com> --- .../nvidia/link_nvidia_host_libraries.sh | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh index 967e7d3589..2218a92116 100755 --- a/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -15,8 +15,8 @@ get_host_ldconfig() { local found_paths=() # Initialize an array to store found paths # Always attempt to use /sbin/ldconfig - if [ -x "/sbin/$command_name" ]; then - found_paths+=("/sbin/$command_name") + if [ -x "/sbin/${command_name}" ]; then + found_paths+=("/sbin/${command_name}") fi # Split the $PATH and iterate over each directory @@ -28,8 +28,8 @@ get_host_ldconfig() { # Check if directory does not start with the exclude prefix if [[ ! "$dir" =~ ^$exclude_prefix ]]; then - if [ -x "$dir/$command_name" ]; then - found_paths+=("$dir/$command_name") + if [ -x "${dir}/${command_name}" ]; then + found_paths+=("${dir}/${command_name}") fi fi done @@ -142,7 +142,7 @@ LIBS_LIST="" # Parse command-line options while [[ "$#" -gt 0 ]]; do - case $1 in + case "$1" in --ld-preload) LD_PRELOAD_MODE=1 ;; # Enable LD_PRELOAD mode --no-download) LIBS_LIST="default" ;; # Download latest list of CUDA libraries *) fatal_error "Unknown option: $1";; @@ -151,7 +151,7 @@ while [[ "$#" -gt 0 ]]; do done # Gather information about NVIDIA drivers (even if we are inside a Gentoo Prefix in a container) -export LD_LIBRARY_PATH=/.singularity.d/libs:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH="/.singularity.d/libs:${LD_LIBRARY_PATH}" # Check for NVIDIA GPUs via nvidia-smi command nvidia_smi=$(command -v nvidia-smi) @@ -159,9 +159,9 @@ if [[ $? -eq 0 ]]; then nvidia_smi_out=$(mktemp -p /tmp nvidia_smi_out.XXXXX) nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader 2>&1 > $nvidia_smi_out if [[ $? -eq 0 ]]; then - nvidia_smi_info=$(head -1 $nvidia_smi_out) - host_cuda_version=$(echo $nvidia_smi_info | sed 's/, /,/g' | cut -f4 -d,) - host_driver_version=$(echo $nvidia_smi_info | sed 's/, /,/g' | cut -f3 -d,) + nvidia_smi_info=$(head -1 "${nvidia_smi_out}") + host_cuda_version=$(echo "${nvidia_smi_info}" | sed 's/, /,/g' | cut -f4 -d,) + host_driver_version=$(echo "${nvidia_smi_info}" | sed 's/, /,/g' | cut -f3 -d,) echo_green "Found host CUDA version ${host_cuda_version}" echo_green "Found NVIDIA GPU driver version ${host_driver_version}" rm -f $nvidia_smi_out @@ -180,7 +180,7 @@ fi # Find the host ldconfig host_ldconfig=$(get_host_ldconfig) # Gather libraries on the host (_must_ be host ldconfig) -host_libraries=$($host_ldconfig -p | awk '{print $NF}') +host_libraries=$("${host_ldconfig}" -p | awk '{print $NF}') singularity_libs=$(ls /.singularity.d/libs/* 2>/dev/null) # Now gather the list of possible CUDA libraries and make them into an array @@ -223,13 +223,13 @@ if [ "$LD_PRELOAD_MODE" -eq 1 ]; then compat_filtered_libraries=() for library in "${matched_libraries[@]}"; do # Run ldd on the given binary and filter for "not found" libraries - not_found_libs=$(ldd "$library" 2>/dev/null | grep "not found" | awk '{print $1}') + not_found_libs=$(ldd "${library}" 2>/dev/null | grep "not found" | awk '{print $1}') # Check if it is missing an so dep under EESSI if [[ -z "$not_found_libs" ]]; then # Resolve any symlink realpath_library=$(realpath "$library") if [[ ! " ${filtered_libraries[@]} " =~ " $realpath_library " ]]; then - filtered_libraries+=("$realpath_library") + filtered_libraries+=("${realpath_library}") # Also prepare compat only libraries for the short list for item in "${cuda_compat_nvlib_list[@]}"; do # Check if the current item is a substring of $library @@ -249,7 +249,7 @@ if [ "$LD_PRELOAD_MODE" -eq 1 ]; then found=false for listed_lib in "${matched_libraries[@]}"; do # Matching to the .so or a symlink target is enough - realpath_lib=$(realpath "$listed_lib") + realpath_lib=$(realpath "${listed_lib}") if [[ "$lib" == "$listed_lib"* || "$realpath_lib" == *"$lib" ]]; then found=true break @@ -266,16 +266,16 @@ if [ "$LD_PRELOAD_MODE" -eq 1 ]; then # If we find all the missing libs in our list include it if [[ "$all_found" == true ]]; then # Resolve any symlink - realpath_library=$(realpath "$library") + realpath_library=$(realpath "${library}") if [[ ! " ${filtered_libraries[@]} " =~ " $realpath_library " ]]; then - filtered_libraries+=("$realpath_library") + filtered_libraries+=("${realpath_library}") # Also prepare compat only libraries for the short list for item in "${cuda_compat_nvlib_list[@]}"; do # Check if the current item is a substring of $library if [[ "$realpath_library" == *"$item"* ]]; then echo "Match found for $item for CUDA compat libraries" if [[ ! " ${compat_filtered_libraries[@]} " =~ " $realpath_library " ]]; then - compat_filtered_libraries+=("$realpath_library") + compat_filtered_libraries+=("${realpath_library}") fi break fi