Skip to content

Commit

Permalink
merge with commit 65e4c36
Browse files Browse the repository at this point in the history
  • Loading branch information
lara committed Jan 14, 2025
2 parents 940dae9 + 65e4c36 commit 3719c75
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 116 deletions.
68 changes: 0 additions & 68 deletions reframe_config_bot.py.tmpl

This file was deleted.

113 changes: 65 additions & 48 deletions test_suite.sh
Original file line number Diff line number Diff line change
Expand Up @@ -133,56 +133,73 @@ else
fi

# Configure ReFrame, see https://www.eessi.io/docs/test-suite/installation-configuration
# NOTE(review): these two exports hard-code the config file and its template under $TOPDIR.
# Because RFM_CONFIG_FILES is assigned a non-empty value here, the emptiness check on
# RFM_CONFIG_FILES that follows can never trigger. They look like the pre-change lines of
# this commit (reframe_config_bot.py.tmpl is reported as deleted) - confirm before relying on them.
export RFM_CONFIG_FILES=$TOPDIR/reframe_config_bot.py
export RFM_CONFIG_FILE_TEMPLATE=$TOPDIR/reframe_config_bot.py.tmpl
# RFM_CONFIG_FILES _has_ to be set by the site hosting the bot, so that it knows where to find the ReFrame
# config file that matches the bot config. See https://gitlab.com/eessi/support/-/issues/114#note_2293660921
# Abort when RFM_CONFIG_FILES is unset or empty: the hosting site must provide it.
if [[ -z "${RFM_CONFIG_FILES:-}" ]]; then
  # The message is assembled in pieces to keep the source lines short.
  # Shell assignments must not have whitespace around '=': `err_msg = "..."` would try to
  # run a command called err_msg and leave the variable empty.
  err_msg="Please set RFM_CONFIG_FILES in the environment of this bot instance to point to a valid"
  err_msg="${err_msg} ReFrame configuration file that matches the bot config."
  err_msg="${err_msg} For more information, see https://gitlab.com/eessi/support/-/issues/114#note_2293660921"
  # fatal_error is defined outside this section; presumably it prints the message and exits.
  fatal_error "${err_msg}"
fi
# Point ReFrame at the test directory (searched recursively) and at a run prefix under the
# current working directory.
RFM_CHECK_SEARCH_PATH="${TESTSUITEPREFIX}/eessi/testsuite/tests"
RFM_CHECK_SEARCH_RECURSIVE=1
RFM_PREFIX="${PWD}/reframe_runs"
export RFM_CHECK_SEARCH_PATH RFM_CHECK_SEARCH_RECURSIVE RFM_PREFIX

# Derive the partition name: every '/' in the software subdir becomes '_', and the
# accelerator target (when set) is appended after an extra '_' with the same substitution.
REFRAME_PARTITION_NAME=${EESSI_SOFTWARE_SUBDIR//\//_}
if [[ -n "${EESSI_ACCELERATOR_TARGET}" ]]; then
  REFRAME_PARTITION_NAME+=_${EESSI_ACCELERATOR_TARGET//\//_}
fi
echo "Constructed partition name based on EESSI_SOFTWARE_SUBDIR and EESSI_ACCELERATOR_TARGET: ${REFRAME_PARTITION_NAME}"

# The ReFrame system is selected as "<system>:<partition>"
RFM_SYSTEM="BotBuildTests:${REFRAME_PARTITION_NAME}"
export RFM_SYSTEM

# Show every RFM_* setting that ended up in the environment
echo "Configured reframe with the following environment variables:"
env | grep 'RFM_'

# The /sys inside the container is not the same as the /sys of the host
# We want to extract the memory limit from the cgroup on the host (which is typically set by SLURM).
# Thus, bot/test.sh bind-mounts the host's /sys/fs/cgroup into /hostsys/fs/cgroup
# and that's the prefix we use to extract the memory limit from
# Read the cgroup path of this process once; both candidate limit files are derived from it.
cgroup_cpuset_path=$(</proc/self/cpuset)
cgroup_v1_mem_limit="/hostsys/fs/cgroup/memory/${cgroup_cpuset_path}/memory.limit_in_bytes"
cgroup_v2_mem_limit="/hostsys/fs/cgroup/${cgroup_cpuset_path}/memory.max"
cgroup_mem_bytes=""
if [[ -f "$cgroup_v1_mem_limit" ]]; then
  echo "Getting memory limit from file $cgroup_v1_mem_limit"
  cgroup_mem_bytes=$(cat "$cgroup_v1_mem_limit")
elif [[ -f "$cgroup_v2_mem_limit" ]]; then
  echo "Getting memory limit from file $cgroup_v2_mem_limit"
  cgroup_mem_bytes=$(cat "$cgroup_v2_mem_limit")
  if [[ "$cgroup_mem_bytes" == 'max' ]]; then
    # In cgroupsv2, the memory.max file may contain 'max', meaning the group can use the full system memory
    # Here, we get the system memory from /proc/meminfo. Units are supposedly always in kB, but let's match them too
    if ! cgroup_mem_kilobytes=$(grep -oP 'MemTotal:\s+\K\d+(?=\s+kB)' /proc/meminfo) \
      || [[ -z "$cgroup_mem_kilobytes" ]]; then
      fatal_error "Failed to get memory limit from /proc/meminfo"
    fi
    cgroup_mem_bytes=$(( cgroup_mem_kilobytes * 1024 ))
  fi
else
  fatal_error "Both files ${cgroup_v1_mem_limit} and ${cgroup_v2_mem_limit} couldn't be found. Failed to get the memory limit from the current cgroup"
fi

# Validate the value itself rather than testing $? here: after an if/elif/else compound,
# $? does not report whether the reads above succeeded. An empty or non-numeric value
# (e.g. from a failed cat) must not reach the arithmetic below.
if [[ ! "$cgroup_mem_bytes" =~ ^[0-9]+$ ]]; then
  fatal_error "Failed to get the memory limit in bytes from the current cgroup"
fi
# Convert to MiB
cgroup_mem_mib=$(( cgroup_mem_bytes / (1024 * 1024) ))
echo "Detected available memory: ${cgroup_mem_mib} MiB"

# Instantiate the config from its template, then fill in the placeholders in place.
cp -- "${RFM_CONFIG_FILE_TEMPLATE}" "${RFM_CONFIG_FILES}" \
  || fatal_error "Failed to copy ${RFM_CONFIG_FILE_TEMPLATE} to ${RFM_CONFIG_FILES}"
echo "Replacing memory limit in the ReFrame config file with the detected CGROUP memory limit: ${cgroup_mem_mib} MiB"
sed -i "s/__MEM_PER_NODE__/${cgroup_mem_mib}/g" "$RFM_CONFIG_FILES"
# NOTE(review): the partition name is interpolated into the sed expression as-is; a value
# containing '/' would break the substitution - confirm partition names never do.
RFM_PARTITION="${SLURM_JOB_PARTITION}"
echo "Replacing partition name in the template ReFrame config file: ${RFM_PARTITION}"
sed -i "s/__RFM_PARTITION__/${RFM_PARTITION}/g" "$RFM_CONFIG_FILES"
# THIS WHOLE BLOCK SHOULD NO LONGER BE NEEDED IF WE HAVE SITE-SPECIFIC CONFIG FILES
# # The /sys inside the container is not the same as the /sys of the host
# # We want to extract the memory limit from the cgroup on the host (which is typically set by SLURM).
# # Thus, bot/test.sh bind-mounts the host's /sys/fs/cgroup into /hostsys/fs/cgroup
# # and that's the prefix we use to extract the memory limit from
# cgroup_v1_mem_limit="/hostsys/fs/cgroup/memory/$(</proc/self/cpuset)/memory.limit_in_bytes"
# cgroup_v2_mem_limit="/hostsys/fs/cgroup/$(</proc/self/cpuset)/memory.max"
# if [ -f "$cgroup_v1_mem_limit" ]; then
# echo "Getting memory limit from file $cgroup_v1_mem_limit"
# cgroup_mem_bytes=$(cat "$cgroup_v1_mem_limit")
# elif [ -f "$cgroup_v2_mem_limit" ]; then
# echo "Getting memory limit from file $cgroup_v2_mem_limit"
# cgroup_mem_bytes=$(cat "$cgroup_v2_mem_limit")
# if [ "$cgroup_mem_bytes" = 'max' ]; then
# # In cgroupsv2, the memory.max file may contain 'max', meaning the group can use the full system memory
# # Here, we get the system memory from /proc/meminfo. Units are supposedly always in kB, but let's match them too
# cgroup_mem_kilobytes=$(grep -oP 'MemTotal:\s+\K\d+(?=\s+kB)' /proc/meminfo)
# if [[ $? -ne 0 ]] || [[ -z "$cgroup_mem_kilobytes" ]]; then
# fatal_error "Failed to get memory limit from /proc/meminfo"
# fi
# cgroup_mem_bytes=$(("$cgroup_mem_kilobytes"*1024))
# fi
# else
# fatal_error "Both files ${cgroup_v1_mem_limit} and ${cgroup_v2_mem_limit} couldn't be found. Failed to get the memory limit from the current cgroup"
# fi
# if [[ $? -eq 0 ]]; then
# # Convert to MiB
# cgroup_mem_mib=$(("$cgroup_mem_bytes"/(1024*1024)))
# else
# fatal_error "Failed to get the memory limit in bytes from the current cgroup"
# fi
# echo "Detected available memory: ${cgroup_mem_mib} MiB"
#
# cp ${RFM_CONFIG_FILE_TEMPLATE} ${RFM_CONFIG_FILES}
# echo "Replacing memory limit in the ReFrame config file with the detected CGROUP memory limit: ${cgroup_mem_mib} MiB"
# sed -i "s/__MEM_PER_NODE__/${cgroup_mem_mib}/g" $RFM_CONFIG_FILES
# RFM_PARTITION="${SLURM_JOB_PARTITION}"
# echo "Replacing partition name in the template ReFrame config file: ${RFM_PARTITION}"
# sed -i "s/__RFM_PARTITION__/${RFM_PARTITION}/g" $RFM_CONFIG_FILES

# Make debugging easier by printing the final config file(s).
# ReFrame accepts a colon-separated list in RFM_CONFIG_FILES, so each entry is printed
# in order instead of passing the whole value to cat as a single path.
echo "Final config file (after replacements):"
echo "ReFrame config file used:"
IFS=':' read -r -a rfm_config_file_list <<< "${RFM_CONFIG_FILES:-}"
for rfm_config_file in "${rfm_config_file_list[@]}"; do
  # Skip empty entries (e.g. from a leading, trailing or doubled ':')
  [[ -n "${rfm_config_file}" ]] || continue
  cat -- "${rfm_config_file}"
done

# Workaround for https://github.com/EESSI/software-layer/pull/467#issuecomment-1973341966
Expand All @@ -191,20 +208,20 @@ export PSM3_DEVICES='self,shm' # this is enough, since we only run single node
# Check we can run reframe
# Runs `reframe --version` and branches on its exit status; $? is read immediately after
# the command so nothing else can overwrite it.
reframe --version
if [[ $? -eq 0 ]]; then
# NOTE(review): the success and failure calls below each appear twice with identical text -
# this looks like both sides of a whitespace-only diff; confirm a single call is intended.
# echo_green and fatal_error are helpers defined outside this section.
echo_green "Succesfully ran 'reframe --version'"
echo_green "Succesfully ran 'reframe --version'"
else
fatal_error "Failed to run 'reframe --version'"
fatal_error "Failed to run 'reframe --version'"
fi

# Get the subset of test names based on the test mapping and tags (e.g. CI, 1_node)
module_list="module_files.list.txt"
mapping_config="tests/eessi_test_mapping/software_to_tests.yml"
if [[ ! -f "$module_list" ]]; then
echo_green "File ${module_list} not found, so only running the default set of tests from ${mapping_config}"
# Run with --debug for easier debugging in case there are issues:
python3 tests/eessi_test_mapping/map_software_to_test.py --mapping-file "${mapping_config}" --debug --defaults-only
REFRAME_NAME_ARGS=$(python3 tests/eessi_test_mapping/map_software_to_test.py --mapping-file "${mapping_config}" --defaults-only)
test_selection_exit_code=$?
echo_green "File ${module_list} not found, so only running the default set of tests from ${mapping_config}"
# Run with --debug for easier debugging in case there are issues:
python3 tests/eessi_test_mapping/map_software_to_test.py --mapping-file "${mapping_config}" --debug --defaults-only
REFRAME_NAME_ARGS=$(python3 tests/eessi_test_mapping/map_software_to_test.py --mapping-file "${mapping_config}" --defaults-only)
test_selection_exit_code=$?
else
# Run with --debug for easier debugging in case there are issues:
python3 tests/eessi_test_mapping/map_software_to_test.py --module-list "${module_list}" --mapping-file "${mapping_config}" --debug
Expand Down

0 comments on commit 3719c75

Please sign in to comment.