Merge pull request #822 from ljwharbers/kul
Update vsc_kul_uhasselt.conf -- Dynamically adjust account based on queue type
Joon-Klaps authored Jan 10, 2025
2 parents c3b6432 + 0208521 commit eb6bb4b
Showing 2 changed files with 190 additions and 55 deletions.
237 changes: 182 additions & 55 deletions conf/vsc_kul_uhasselt.config
@@ -1,7 +1,9 @@
// Default to /tmp directory if $VSC_SCRATCH scratch env is not available,
// see: https://github.com/nf-core/configs?tab=readme-ov-file#adding-a-new-config
-scratch_dir = System.getenv("VSC_SCRATCH") ?: "/tmp"
-tier1_project = System.getenv("SLURM_ACCOUNT") ?: null
+def SCRATCH_DIR = System.getenv("VSC_SCRATCH") ?: "/tmp"
+def TIER2_PROJECT = System.getenv("SLURM_ACCOUNT") ?: null
+def DEDICATED_QUEUES = System.getenv("VSC_DEDICATED_QUEUES") ?: ""
+def AVAILABLE_QUEUES = DEDICATED_QUEUES.toString().split(',')

// Perform work directory cleanup when the run has successfully completed
// cleanup = true
@@ -28,7 +30,7 @@ process {
singularity {
enabled = true
autoMounts = true
-cacheDir = "$scratch_dir/.singularity"
+cacheDir = "$SCRATCH_DIR/.singularity"
pullTimeout = "30 min"
}

@@ -38,15 +40,158 @@ params {
}

env {
-APPTAINER_TMPDIR="$scratch_dir/.apptainer/tmp"
-APPTAINER_CACHEDIR="$scratch_dir/.apptainer/cache"
+APPTAINER_TMPDIR="$SCRATCH_DIR/.apptainer/tmp"
+APPTAINER_CACHEDIR="$SCRATCH_DIR/.apptainer/cache"
}

// AWS maximum retries for errors (so the pipeline doesn't fail if a download fails once)
aws {
maxErrorRetry = 3
}

/*
* Queue Selection Utility Functions for HPC Environments
* ======================================================
* This module provides functions to determine appropriate HPC queues based on task requirements
* for both GENIUS and WICE clusters.
*/

/*
* Constants:
* ----------
* TIME_THRESHOLD: 72 hours - Threshold for determining long-running jobs
* MEMORY_THRESHOLD (GENIUS): 175GB - Memory threshold for bigmem queues
* MEMORY_THRESHOLD (WICE): 239GB - Memory threshold for high-memory queues
*/
def TIME_THRESHOLD = 72.h
def MEMORY_THRESHOLD_GENIUS = 175.GB
def MEMORY_THRESHOLD_WICE = 239.GB
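// Example: a task requesting 80.h is long-running on either cluster; a task requesting
// 200.GB is high-memory on GENIUS (>= 175.GB) but not on WICE (< 239.GB).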

/*
* ----------
* Functions:
* ----------
* These functions are designed to select the appropriate HPC queues of
* VSC_KUL_UHASSELT based on task requirements. They handle both standard
* and GPU queues, considering memory requirements, execution time, and
* queue availability.
*/

/*
* limitTaskTime(time, maxTime)
* Ensures task time doesn't exceed the maximum allowed time
* @param time Current task time
* @param maxTime Maximum allowed time
* @return Limited task time
*/
def limitTaskTime(time, maxTime) {
return time > maxTime ? maxTime : time
}
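// Example: limitTaskTime(100.h, 72.h) returns 72.h, while limitTaskTime(24.h, 72.h)
// returns 24.h unchanged.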

/*
* determineGeniusQueue(task)
* Selects appropriate CPU queue for GENIUS cluster
* @param task Nextflow task object containing memory and time requirements
* @return Queue name based on task requirements
*/
def determineGeniusQueue = { task ->
def isHighMemory = task.memory >= MEMORY_THRESHOLD_GENIUS
def isLongRunning = task.time >= TIME_THRESHOLD
def hasDedicatedBigmem = AVAILABLE_QUEUES.contains('dedicated_big_bigmem')

if (isHighMemory) {
return isLongRunning ?
(hasDedicatedBigmem ? 'dedicated_big_bigmem' : 'bigmem_long') :
'bigmem'
}

return isLongRunning ? 'batch_long' : 'batch'
}
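// Example: a 200.GB / 80.h task resolves to 'dedicated_big_bigmem' when that queue is
// listed in VSC_DEDICATED_QUEUES and to 'bigmem_long' otherwise; a 32.GB / 80.h task
// resolves to 'batch_long'.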

/*
* determineGeniusGpuQueue(task)
* Selects appropriate GPU queue for GENIUS cluster
* @param task Nextflow task object containing memory and time requirements
* @return GPU queue name based on task requirements
*/
def determineGeniusGpuQueue = { task ->
def isHighMemory = task.memory >= MEMORY_THRESHOLD_GENIUS
def isLongRunning = task.time >= TIME_THRESHOLD
def hasDedicatedGpu = AVAILABLE_QUEUES.contains('dedicated_rega_gpu')
def hasAmdGpu = AVAILABLE_QUEUES.contains('amd')

if (isHighMemory) {
return isLongRunning ? 'gpu_v100_long' : 'gpu_v100'
}

if (isLongRunning) {
if (hasDedicatedGpu) return 'dedicated_rega_gpu'
if (hasAmdGpu) return 'amd_long'
return 'gpu_p100_long'
}

return hasAmdGpu ? 'amd' : 'gpu_p100'
}
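// Example: a 200.GB / 80.h task resolves to 'gpu_v100_long'; a 32.GB / 10.h task
// resolves to 'amd' when that queue is listed in VSC_DEDICATED_QUEUES, else 'gpu_p100'.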

/*
* determineWiceQueue(task)
* Selects appropriate CPU queue for WICE cluster
* @param task Nextflow task object containing memory and time requirements
* @return Queue name based on task requirements and availability
*/
def determineWiceQueue = { task ->
def isHighMemory = task.memory >= MEMORY_THRESHOLD_WICE
def isLongRunning = task.time >= TIME_THRESHOLD
def hasDedicatedQueue = AVAILABLE_QUEUES.contains('dedicated_big_bigmem')

if (isHighMemory) {
if (isLongRunning && hasDedicatedQueue) {
return 'dedicated_big_bigmem'
}
task.time = limitTaskTime(task.time, TIME_THRESHOLD)
return 'bigmem,hugemem'
}

return isLongRunning ?
'batch_long,batch_icelake_long,batch_sapphirerapids_long' :
'batch,batch_sapphirerapids,batch_icelake'
}
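// Example: a 300.GB / 80.h task resolves to 'dedicated_big_bigmem' when that queue is
// listed in VSC_DEDICATED_QUEUES; otherwise its time is capped at 72.h and it is
// submitted to 'bigmem,hugemem'.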

/*
* determineWiceGpuQueue(task)
* Selects appropriate GPU queue for WICE cluster
* @param task Nextflow task object containing memory and time requirements
* @return GPU queue name based on task requirements
*/
def determineWiceGpuQueue = { task ->
def isHighMemory = task.memory >= MEMORY_THRESHOLD_WICE
def isLongRunning = task.time >= TIME_THRESHOLD
def hasDedicatedQueue = isHighMemory ?
AVAILABLE_QUEUES.contains('dedicated_big_gpu_h100') :
AVAILABLE_QUEUES.contains('dedicated_big_gpu')

if (isLongRunning && !hasDedicatedQueue) {
task.time = limitTaskTime(task.time, TIME_THRESHOLD)
}

if (isHighMemory) {
return (isLongRunning && hasDedicatedQueue) ? 'dedicated_big_gpu_h100' : 'gpu_h100'
}

return (isLongRunning && hasDedicatedQueue) ? 'dedicated_big_gpu' : 'gpu_a100,gpu'
}
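// Example: a 300.GB / 80.h task resolves to 'dedicated_big_gpu_h100' when that queue is
// listed in VSC_DEDICATED_QUEUES, else to 'gpu_h100' with its time capped at 72.h;
// a 64.GB / 80.h task resolves to 'dedicated_big_gpu' when listed, else 'gpu_a100,gpu'.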

/*
* ========
* Profiles
* ========
* These profiles define the resource limits, queue selection, and cluster options
* for the WICE and GENIUS clusters, including GPU-specific configurations.
* Details of the resource limits can be found for genius at
* https://docs.vscentrum.be/leuven/tier2_hardware/genius_hardware.html
* and for wice at https://docs.vscentrum.be/leuven/tier2_hardware/wice_hardware.html
*/

// Define profiles for each cluster
profiles {
genius {
@@ -55,35 +200,28 @@ profiles {
process {
// 768 - 65 so 65GB for overhead, max is 720000MB
resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h ]
-beforeScript = 'module load cluster/genius'
-clusterOptions = { "--clusters=genius --account=$tier1_project" }
-
-queue = {
-task.memory >= 175.GB ?
-(task.time >= 72.h ? 'dedicated_big_bigmem,dedicated_big_batch,bigmem_long' : 'bigmem') :
-(task.time >= 72.h ? 'batch_long' : 'batch')
+beforeScript = { 'module load cluster/genius/' + determineGeniusQueue(task).toString().split(',')[0] }
+queue = { determineGeniusQueue(task) }
+clusterOptions = {
+determineGeniusQueue(task) =~ /dedicated/ ?
+"--clusters=genius --account=lp_big_genius_cpu" :
+"--clusters=genius --account=$TIER2_PROJECT"
}

withLabel: '.*gpu.*'{
resourceLimits = [ memory: 703.GB, cpus: 36 , time: 168.h ]
+beforeScript = { 'module load cluster/genius/' + determineGeniusGpuQueue(task).toString().split(',')[0] }
apptainer.runOptions = '--containall --cleanenv --nv'
singularity.runOptions = '--containall --cleanenv --nv'
+queue = { determineGeniusGpuQueue(task) }
clusterOptions = {
// suggested to use 9 cpus per gpu
def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/9) as int)
-"--gres=gpu:${gpus} --clusters=genius --account=$tier1_project"
+"--gres=gpu:${gpus} --clusters=genius --account=$TIER2_PROJECT"
}
-
-queue = {
-task.memory >= 175.GB ?
-(task.time >= 72.h ? 'gpu_v100_long' : 'gpu_v100') :
-(task.time >= 72.h ? 'gpu_p100_long,amd_long' : 'gpu_p100,amd')
-}
}
}
}


genius_gpu {
params.config_profile_description = 'genius_gpu profile for use on the genius cluster of the VSC HPC.'
apptainer.runOptions = '--containall --cleanenv --nv'
@@ -92,16 +230,11 @@
process {
// 768 - 65 so 65GB for overhead, max is 720000MB
resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h]
-beforeScript = 'module load cluster/genius'
+beforeScript = { 'module load cluster/genius/' + determineGeniusGpuQueue(task).toString().split(',')[0] }
+queue = { determineGeniusGpuQueue(task) }
clusterOptions = {
def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/9) as int)
-"--gres=gpu:${gpus} --clusters=genius --account=$tier1_project"
+"--gres=gpu:${gpus} --clusters=genius --account=$TIER2_PROJECT"
}
-
-queue = {
-task.memory >= 175.GB ?
-(task.time >= 72.h ? 'gpu_v100_long' : 'gpu_v100') :
-(task.time >= 72.h ? 'gpu_p100_long,amd_long' : 'gpu_p100,amd')
-}
}
}
@@ -112,53 +245,47 @@ profiles {
process {
// max is 2016000
resourceLimits = [ memory: 1968.GB, cpus: 72, time: 168.h ]
clusterOptions = { "--clusters=wice --account=$tier1_project"}
beforeScript = 'module load cluster/wice'

queue = {
task.memory >= 239.GB ?
(task.time >= 72.h ? 'dedicated_big_bigmem' : 'bigmem,hugemem') :
(task.time >= 72.h ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake')
beforeScript = { 'module load cluster/wice/' + determineWiceQueue(task).toString().split(',')[0] }
queue = { determineWiceQueue(task) }
clusterOptions = {
determineWiceQueue(task) =~ /dedicated/ ?
"--clusters=wice --account=lp_big_wice_cpu" :
"--clusters=wice --account=$TIER2_PROJECT"
}

-withLabel: '.*gpu.*'{
+withLabel: '.*gpu.*' {
resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ]
apptainer.runOptions = '--containall --cleanenv --nv'
singularity.runOptions = '--containall --cleanenv --nv'
+beforeScript = { 'module load cluster/wice/' + determineWiceGpuQueue(task).toString().split(',')[0] }
+queue = { determineWiceGpuQueue(task) }
clusterOptions = {
// suggested to use 16 cpus per gpu
def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int)
-"--gres=gpu:${gpus} --clusters=wice --account=$tier1_project"
-}
-
-queue = {
-task.memory >= 239.GB ?
-(task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') :
-(task.time >= 72.h ? 'dedicated_big_gpu' : 'gpu_a100,gpu')
+def queueValue = determineWiceGpuQueue(task)
+queueValue =~ /dedicated_big_gpu_h100/ ? "--clusters=wice --account=lp_big_wice_gpu_h100 --gres=gpu:${gpus}" :
+queueValue =~ /dedicated_big_gpu/ ? "--clusters=wice --account=lp_big_wice_gpu --gres=gpu:${gpus}" :
+"--clusters=wice --account=$TIER2_PROJECT --gres=gpu:${gpus}"
}
}
}
}


wice_gpu {
params.config_profile_description = 'wice_gpu profile for use on the Wice cluster of the VSC HPC.'
apptainer.runOptions = '--containall --cleanenv --nv'
singularity.runOptions = '--containall --cleanenv --nv'

process {
// 768 - 65 so 65GB for overhead, max is 720000MB
+beforeScript = { 'module load cluster/wice/' + determineWiceGpuQueue(task).toString().split(',')[0] }
resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ]
-beforeScript = 'module load cluster/wice'
+queue = { determineWiceGpuQueue(task) }
clusterOptions = {
def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int)
-"--gres=gpu:${gpus} --clusters=wice --account=$tier1_project"
-}
-
-queue = {
-task.memory >= 239.GB ?
-(task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') :
-(task.time >= 72.h ? 'dedicated_big_gpu' : 'gpu_a100,gpu')
+def queueValue = determineWiceGpuQueue(task)
+queueValue =~ /dedicated_big_gpu_h100/ ? "--clusters=wice --account=lp_big_wice_gpu_h100 --gres=gpu:${gpus}" :
+queueValue =~ /dedicated_big_gpu/ ? "--clusters=wice --account=lp_big_wice_gpu --gres=gpu:${gpus}" :
+"--clusters=wice --account=$TIER2_PROJECT --gres=gpu:${gpus}"
}
}
}
@@ -167,7 +294,7 @@ profiles {
params.config_profile_description = 'superdome profile for use on the genius cluster of the VSC HPC.'

process {
clusterOptions = {"--clusters=genius --account=$tier1_project"}
clusterOptions = {"--clusters=genius --account=$TIER2_PROJECT"}
beforeScript = 'module load cluster/genius/superdome'
// 6000 - 228 so 228GB for overhead, max is 5910888MB
resourceLimits = [ memory: 5772.GB, cpus: 14, time: 168.h]
8 changes: 8 additions & 0 deletions docs/vsc_kul_uhasselt.md
@@ -14,9 +14,17 @@ A nextflow module is available that can be loaded `module load Nextflow` but it

2. Set up the environment variables in `~/.bashrc` or `~/.bash_profile`:

:::note
If you have access to dedicated nodes, you can export these as a comma-separated list. These queues are only used when a task's requirements cannot be met by the normal partitions but can be met by a dedicated partition. AMD is considered a dedicated partition.
:::

```bash
export SLURM_ACCOUNT="<your-credential-account>"

# Comma-separated list of available dedicated partitions (if any)
# For example: export VSC_DEDICATED_QUEUES="dedicated_big_bigmem,dedicated_big_gpu"
export VSC_DEDICATED_QUEUES="<available-dedicated-partitions>"

# Needed for running Nextflow jobs
export NXF_HOME="$VSC_SCRATCH/.nextflow"
export NXF_WORK="$VSC_SCRATCH/work"
