Update vsc_kul_uhasselt.conf -- Dynamically adjust account based on queue type #822

Merged
merged 22 commits on Jan 10, 2025
237 changes: 182 additions & 55 deletions conf/vsc_kul_uhasselt.config
@@ -1,7 +1,9 @@
// Default to the /tmp directory if the $VSC_SCRATCH environment variable is not available,
// see: https://github.com/nf-core/configs?tab=readme-ov-file#adding-a-new-config
scratch_dir = System.getenv("VSC_SCRATCH") ?: "/tmp"
tier1_project = System.getenv("SLURM_ACCOUNT") ?: null
def SCRATCH_DIR = System.getenv("VSC_SCRATCH") ?: "/tmp"
def TIER2_PROJECT = System.getenv("SLURM_ACCOUNT") ?: null
def DEDICATED_QUEUES = System.getenv("VSC_DEDICATED_QUEUES") ?: ""
def AVAILABLE_QUEUES = DEDICATED_QUEUES.toString().split(',')
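// Illustrative only (not executed, hypothetical value): with
// VSC_DEDICATED_QUEUES="dedicated_big_bigmem,dedicated_big_gpu" exported in the shell,
// AVAILABLE_QUEUES becomes ['dedicated_big_bigmem', 'dedicated_big_gpu'] and the queue-selection
// closures below match it with contains(); with the default empty string, no dedicated queue is ever selected.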

// Perform work directory cleanup when the run has successfully completed
// cleanup = true
@@ -28,7 +30,7 @@ process {
singularity {
enabled = true
autoMounts = true
cacheDir = "$scratch_dir/.singularity"
cacheDir = "$SCRATCH_DIR/.singularity"
pullTimeout = "30 min"
}

@@ -38,15 +40,158 @@ params {
}

env {
APPTAINER_TMPDIR="$scratch_dir/.apptainer/tmp"
APPTAINER_CACHEDIR="$scratch_dir/.apptainer/cache"
APPTAINER_TMPDIR="$SCRATCH_DIR/.apptainer/tmp"
APPTAINER_CACHEDIR="$SCRATCH_DIR/.apptainer/cache"
}

// AWS maximum retries for errors (This way the pipeline doesn't fail if the download fails one time)
aws {
maxErrorRetry = 3
}

/*
* Queue Selection Utility Functions for HPC Environments
* ==================================================
* This module provides functions to determine appropriate HPC queues based on task requirements
* for both GENIUS and WICE clusters.
*/

/*
* Constants:
* ----------
* TIME_THRESHOLD: 72 hours - Threshold for determining long-running jobs
* MEMORY_THRESHOLD (GENIUS): 175GB - Memory threshold for bigmem queues
* MEMORY_THRESHOLD (WICE): 239GB - Memory threshold for high-memory queues
*/
def TIME_THRESHOLD = 72.h
def MEMORY_THRESHOLD_GENIUS = 175.GB
def MEMORY_THRESHOLD_WICE = 239.GB

/*
* ---------
* Functions:
* ----------
* These functions are designed to select the appropriate HPC queues of
* VSC_KUL_UHASSELT based on task requirements. They handle both standard
* and GPU queues, considering memory requirements, execution time, and
* queue availability.
*/

/*
* limitTaskTime(time, maxTime)
* Ensures task time doesn't exceed the maximum allowed time
* @param time Current task time
* @param maxTime Maximum allowed time
* @return Limited task time
*/
def limitTaskTime(time, maxTime) {
return time > maxTime ? maxTime : time
}
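// Hypothetical worked example (values are illustrative, not taken from any cluster):
// limitTaskTime(100.h, 72.h) returns 72.h, while limitTaskTime(48.h, 72.h) returns 48.h unchanged.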

/*
* determineGeniusQueue(task)
* Selects appropriate CPU queue for GENIUS cluster
* @param task Nextflow task object containing memory and time requirements
* @return Queue name based on task requirements
*/
def determineGeniusQueue = { task ->
def isHighMemory = task.memory >= MEMORY_THRESHOLD_GENIUS
def isLongRunning = task.time >= TIME_THRESHOLD
def hasDedicatedBigmem = AVAILABLE_QUEUES.contains('dedicated_big_bigmem')

if (isHighMemory) {
return isLongRunning ?
(hasDedicatedBigmem ? 'dedicated_big_bigmem' : 'bigmem_long') :
'bigmem'
}

return isLongRunning ? 'batch_long' : 'batch'
}
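// Sketch of the resulting mapping, assuming the thresholds above and hypothetical task requests:
//   200.GB / 80.h -> 'dedicated_big_bigmem' if that queue is exported in VSC_DEDICATED_QUEUES, else 'bigmem_long'
//   200.GB /  2.h -> 'bigmem'
//    16.GB / 80.h -> 'batch_long'          16.GB / 2.h -> 'batch'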

/*
* determineGeniusGpuQueue(task)
* Selects appropriate GPU queue for GENIUS cluster
* @param task Nextflow task object containing memory and time requirements
* @return GPU queue name based on task requirements
*/
def determineGeniusGpuQueue = { task ->
def isHighMemory = task.memory >= MEMORY_THRESHOLD_GENIUS
def isLongRunning = task.time >= TIME_THRESHOLD
def hasDedicatedGpu = AVAILABLE_QUEUES.contains('dedicated_rega_gpu')
def hasAmdGpu = AVAILABLE_QUEUES.contains('amd')

if (isHighMemory) {
return isLongRunning ? 'gpu_v100_long' : 'gpu_v100'
}

if (isLongRunning) {
if (hasDedicatedGpu) return 'dedicated_rega_gpu'
if (hasAmdGpu) return 'amd_long'
return 'gpu_p100_long'
}

return hasAmdGpu ? 'amd' : 'gpu_p100'
}
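// Sketch of the resulting mapping (hypothetical requests, thresholds as defined above):
//   200.GB / 80.h -> 'gpu_v100_long'       200.GB / 2.h -> 'gpu_v100'
//    64.GB / 80.h -> 'dedicated_rega_gpu' if exported, else 'amd_long' if 'amd' is exported, else 'gpu_p100_long'
//    64.GB /  2.h -> 'amd' if exported, else 'gpu_p100'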

/*
* determineWiceQueue(task)
* Selects appropriate CPU queue for WICE cluster
* @param task Nextflow task object containing memory and time requirements
* @return Queue name based on task requirements and availability
*/
def determineWiceQueue = { task ->
def isHighMemory = task.memory >= MEMORY_THRESHOLD_WICE
def isLongRunning = task.time >= TIME_THRESHOLD
def hasDedicatedQueue = AVAILABLE_QUEUES.contains('dedicated_big_bigmem')

if (isHighMemory) {
if (isLongRunning && hasDedicatedQueue) {
return 'dedicated_big_bigmem'
}
task.time = limitTaskTime(task.time, TIME_THRESHOLD)
return 'bigmem,hugemem'
}

return isLongRunning ?
'batch_long,batch_icelake_long,batch_sapphirerapids_long' :
'batch,batch_sapphirerapids,batch_icelake'
}
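// Sketch of the resulting mapping (hypothetical requests): a 300.GB / 100.h task runs on
// 'dedicated_big_bigmem' when that queue is exported; otherwise its time is capped at 72.h and it is
// sent to 'bigmem,hugemem'. A 100.GB / 10.h task goes to 'batch,batch_sapphirerapids,batch_icelake'.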

/*
* determineWiceGpuQueue(task)
* Selects appropriate GPU queue for WICE cluster
* @param task Nextflow task object containing memory and time requirements
* @return GPU queue name based on task requirements
*/
def determineWiceGpuQueue = { task ->
def isHighMemory = task.memory >= MEMORY_THRESHOLD_WICE
def isLongRunning = task.time >= TIME_THRESHOLD
def hasDedicatedQueue = isHighMemory ?
AVAILABLE_QUEUES.contains('dedicated_big_gpu_h100') :
AVAILABLE_QUEUES.contains('dedicated_big_gpu')

if (isLongRunning && !hasDedicatedQueue) {
task.time = limitTaskTime(task.time, TIME_THRESHOLD)
}

if (isHighMemory) {
return (isLongRunning && hasDedicatedQueue) ? 'dedicated_big_gpu_h100' : 'gpu_h100'
}

return (isLongRunning && hasDedicatedQueue) ? 'dedicated_big_gpu' : 'gpu_a100,gpu'
}
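// Sketch of the resulting mapping (hypothetical requests): a 300.GB / 100.h task maps to
// 'dedicated_big_gpu_h100' when that queue is exported, otherwise its time is capped at 72.h and it
// maps to 'gpu_h100'; a 100.GB / 100.h task maps to 'dedicated_big_gpu' when exported, otherwise it is
// capped at 72.h and maps to 'gpu_a100,gpu'; a 100.GB / 10.h task maps to 'gpu_a100,gpu'.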

/*
* ========
* Profiles
* ========
* These profiles define the resource limits, queue selection, and cluster options
* for WICE and GENIUS clusters. They also include GPU-specific configurations.
 * Details of the resource limits can be found for genius at
* https://docs.vscentrum.be/leuven/tier2_hardware/genius_hardware.html
* and for wice at https://docs.vscentrum.be/leuven/tier2_hardware/wice_hardware.html
*/

// Define profiles for each cluster
profiles {
genius {
@@ -55,35 +200,28 @@
process {
// 768 - 65 so 65GB for overhead, max is 720000MB
resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h ]
beforeScript = 'module load cluster/genius'
clusterOptions = { "--clusters=genius --account=$tier1_project" }

queue = {
task.memory >= 175.GB ?
(task.time >= 72.h ? 'dedicated_big_bigmem,dedicated_big_batch,bigmem_long' : 'bigmem') :
(task.time >= 72.h ? 'batch_long' : 'batch')
beforeScript = { 'module load cluster/genius/' + determineGeniusQueue(task).toString().split(',')[0] }
queue = { determineGeniusQueue(task) }
clusterOptions = {
determineGeniusQueue(task) =~ /dedicated/ ?
"--clusters=genius --account=lp_big_genius_cpu" :
"--clusters=genius --account=$TIER2_PROJECT"
}

withLabel: '.*gpu.*'{
resourceLimits = [ memory: 703.GB, cpus: 36 , time: 168.h ]
beforeScript = { 'module load cluster/genius/' + determineGeniusGpuQueue(task).toString().split(',')[0] }
apptainer.runOptions = '--containall --cleanenv --nv'
singularity.runOptions = '--containall --cleanenv --nv'
queue = { determineGeniusGpuQueue(task) }
clusterOptions = {
// suggested to use 9 cpus per gpu
def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/9) as int)
"--gres=gpu:${gpus} --clusters=genius --account=$tier1_project"
}

queue = {
task.memory >= 175.GB ?
(task.time >= 72.h ? 'gpu_v100_long' : 'gpu_v100') :
(task.time >= 72.h ? 'gpu_p100_long,amd_long' : 'gpu_p100,amd')
"--gres=gpu:${gpus} --clusters=genius --account=$TIER2_PROJECT"
}
}
}
}


genius_gpu {
params.config_profile_description = 'genius_gpu profile for use on the genius cluster of the VSC HPC.'
apptainer.runOptions = '--containall --cleanenv --nv'
@@ -92,16 +230,11 @@ profiles {
process {
// 768 - 65 so 65GB for overhead, max is 720000MB
resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h]
beforeScript = 'module load cluster/genius'
beforeScript = { 'module load cluster/genius/' + determineGeniusGpuQueue(task).toString().split(',')[0] }
queue = { determineGeniusGpuQueue(task) }
clusterOptions = {
def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/9) as int)
"--gres=gpu:${gpus} --clusters=genius --account=$tier1_project"
}

queue = {
task.memory >= 175.GB ?
(task.time >= 72.h ? 'gpu_v100_long' : 'gpu_v100') :
(task.time >= 72.h ? 'gpu_p100_long,amd_long' : 'gpu_p100,amd')
"--gres=gpu:${gpus} --clusters=genius --account=$TIER2_PROJECT"
}
}
}
@@ -112,53 +245,47 @@ profiles {
process {
// max is 2016000
resourceLimits = [ memory: 1968.GB, cpus: 72, time: 168.h ]
clusterOptions = { "--clusters=wice --account=$tier1_project"}
beforeScript = 'module load cluster/wice'

queue = {
task.memory >= 239.GB ?
(task.time >= 72.h ? 'dedicated_big_bigmem' : 'bigmem,hugemem') :
(task.time >= 72.h ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake')
beforeScript = { 'module load cluster/wice/' + determineWiceQueue(task).toString().split(',')[0] }
queue = { determineWiceQueue(task) }
clusterOptions = {
determineWiceQueue(task) =~ /dedicated/ ?
"--clusters=wice --account=lp_big_wice_cpu" :
"--clusters=wice --account=$TIER2_PROJECT"
}

withLabel: '.*gpu.*'{
withLabel: '.*gpu.*' {
resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ]
apptainer.runOptions = '--containall --cleanenv --nv'
singularity.runOptions = '--containall --cleanenv --nv'
beforeScript = { 'module load cluster/wice/' + determineWiceGpuQueue(task).toString().split(',')[0] }
queue = { determineWiceGpuQueue(task) }
clusterOptions = {
// suggested to use 16 cpus per gpu
def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int)
"--gres=gpu:${gpus} --clusters=wice --account=$tier1_project"
}

queue = {
task.memory >= 239.GB ?
(task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') :
(task.time >= 72.h ? 'dedicated_big_gpu' : 'gpu_a100,gpu')
def queueValue = determineWiceGpuQueue(task)
queueValue =~ /dedicated_big_gpu_h100/ ? "--clusters=wice --account=lp_big_wice_gpu_h100 --gres=gpu:${gpus}" :
queueValue =~ /dedicated_big_gpu/ ? "--clusters=wice --account=lp_big_wice_gpu --gres=gpu:${gpus}" :
"--clusters=wice --account=$TIER2_PROJECT --gres=gpu:${gpus}"
}
}
}
}


wice_gpu {
params.config_profile_description = 'wice_gpu profile for use on the Wice cluster of the VSC HPC.'
apptainer.runOptions = '--containall --cleanenv --nv'
singularity.runOptions = '--containall --cleanenv --nv'

process {
// 768 - 65 so 65GB for overhead, max is 720000MB
beforeScript = { 'module load cluster/wice/' + determineWiceGpuQueue(task).toString().split(',')[0] }
resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ]
beforeScript = 'module load cluster/wice'
queue = { determineWiceGpuQueue(task) }
clusterOptions = {
def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int)
"--gres=gpu:${gpus} --clusters=wice --account=$tier1_project"
}

queue = {
task.memory >= 239.GB ?
(task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') :
(task.time >= 72.h ? 'dedicated_big_gpu' : 'gpu_a100,gpu')
def queueValue = determineWiceGpuQueue(task)
queueValue =~ /dedicated_big_gpu_h100/ ? "--clusters=wice --account=lp_big_wice_gpu_h100 --gres=gpu:${gpus}" :
queueValue =~ /dedicated_big_gpu/ ? "--clusters=wice --account=lp_big_wice_gpu --gres=gpu:${gpus}" :
"--clusters=wice --account=$TIER2_PROJECT --gres=gpu:${gpus}"
}
}
}
@@ -167,7 +294,7 @@ profiles {
params.config_profile_description = 'superdome profile for use on the genius cluster of the VSC HPC.'

process {
clusterOptions = {"--clusters=genius --account=$tier1_project"}
clusterOptions = {"--clusters=genius --account=$TIER2_PROJECT"}
beforeScript = 'module load cluster/genius/superdome'
// 6000 - 228 so 228GB for overhead, max is 5910888MB
resourceLimits = [ memory: 5772.GB, cpus: 14, time: 168.h]
8 changes: 8 additions & 0 deletions docs/vsc_kul_uhasselt.md
@@ -14,9 +14,17 @@ A nextflow module is available that can be loaded `module load Nextflow` but it

2. Set up the environment variables in `~/.bashrc` or `~/.bash_profile`:

:::note
If you have access to dedicated nodes, you can export these as a comma-separated list. These queues are only used when a task's requirements cannot be met by the normal partitions but can be met by a dedicated partition. AMD is treated as a dedicated partition.
:::

```bash
export SLURM_ACCOUNT="<your-credential-account>"

# Comma-separated list of available dedicated partitions (if any)
# For example: export VSC_DEDICATED_QUEUES="dedicated_big_bigmem,dedicated_big_gpu"
export VSC_DEDICATED_QUEUES="<available-dedicated-partitions>"

# Needed for running Nextflow jobs
export NXF_HOME="$VSC_SCRATCH/.nextflow"
export NXF_WORK="$VSC_SCRATCH/work"