diff --git a/README.md b/README.md
index 71d0f35..6f5ae1e 100644
--- a/README.md
+++ b/README.md
@@ -73,7 +73,7 @@ For each group (if used) or partition any nodes in an ansible inventory group `<
 - Nodes may have arbitrary hostnames but these should be lowercase to avoid a mismatch between inventory and actual hostname.
 - Nodes in a group are assumed to be homogenous in terms of processor and memory.
 - An inventory group may be empty or missing, but if it is not then the play must contain at least one node from it (used to set processor information).
-- Nodes may not appear in more than one group.
+- `openhpc_job_maxtime`: Maximum job time limit, default `'60-0'` (60 days). See [slurm.conf](https://slurm.schedmd.com/slurm.conf.html) parameter `MaxTime` for format. The value should be quoted to avoid Ansible conversions.
diff --git a/molecule/README.md b/molecule/README.md
index 16126cb..77db89a 100644
--- a/molecule/README.md
+++ b/molecule/README.md
@@ -22,6 +22,7 @@ test11 | 1 | N | As for #5 but then deletes a n
 test12 | 1 | N | As for #5 but enabling job completion and testing `sacct -c`
 test13 | 1 | N | As for #5 but tests `openhpc_config` variable.
 test14 | 1 | N | As for #5 but also tests `extra_nodes` via State=DOWN nodes.
+test15 | 1 | Y | As for #5 but also tests partitions with different names but the same `NodeName`.
 
 # Local Installation & Running
 
diff --git a/molecule/test15/converge.yml b/molecule/test15/converge.yml
new file mode 100644
index 0000000..7ec3109
--- /dev/null
+++ b/molecule/test15/converge.yml
@@ -0,0 +1,27 @@
+---
+- name: Converge
+  hosts: all
+  vars:
+    openhpc_enable:
+      control: "{{ inventory_hostname in groups['testohpc_login'] }}"
+      batch: "{{ inventory_hostname in groups['testohpc_compute'] }}"
+      runtime: true
+    openhpc_slurm_control_host: "{{ groups['testohpc_login'] | first }}"
+    openhpc_slurm_partitions:
+      - name: "compute"
+        partition_params:
+          PreemptMode: requeue
+      - name: beta
+        groups:
+          - name: "compute"
+        partition_params:
+          PreemptMode: 'OFF'
+          Priority: 1000
+          Default: false
+          AllowAccounts: Group_own_thePartition
+    openhpc_cluster_name: testohpc
+    openhpc_slurm_configless: true
+  tasks:
+    - name: "Include ansible-role-openhpc"
+      include_role:
+        name: "{{ lookup('env', 'MOLECULE_PROJECT_DIRECTORY') | basename }}"
diff --git a/molecule/test15/molecule.yml b/molecule/test15/molecule.yml
new file mode 100644
index 0000000..29d30ca
--- /dev/null
+++ b/molecule/test15/molecule.yml
@@ -0,0 +1,44 @@
+---
+driver:
+  name: podman
+platforms:
+  - name: testohpc-login-0
+    image: ${MOLECULE_IMAGE}
+    pre_build_image: true
+    groups:
+      - testohpc_login
+    command: /sbin/init
+    tmpfs:
+      - /run
+      - /tmp
+    volumes:
+      - /sys/fs/cgroup:/sys/fs/cgroup:ro
+    network: net1
+  - name: testohpc-compute-0
+    image: ${MOLECULE_IMAGE}
+    pre_build_image: true
+    groups:
+      - testohpc_compute
+    command: /sbin/init
+    tmpfs:
+      - /run
+      - /tmp
+    volumes:
+      - /sys/fs/cgroup:/sys/fs/cgroup:ro
+    network: net1
+  - name: testohpc-compute-1
+    image: ${MOLECULE_IMAGE}
+    pre_build_image: true
+    groups:
+      - testohpc_compute
+    command: /sbin/init
+    tmpfs:
+      - /run
+      - /tmp
+    volumes:
+      - /sys/fs/cgroup:/sys/fs/cgroup:ro
+    network: net1
+provisioner:
+  name: ansible
+verifier:
+  name: ansible
diff --git a/molecule/test15/verify.yml b/molecule/test15/verify.yml
new file mode 100644
index 0000000..bcbe64d
--- /dev/null
+++ b/molecule/test15/verify.yml
@@ -0,0 +1,16 @@
+---
+
+- name: Check slurm hostlist
+  hosts: testohpc_login
+  vars:
+    expected_sinfo: |  # NB compute is default (*)
+      'compute*,up,60-00:00:00,2,idle,testohpc-compute-[0-1]'
+      'beta,up,60-00:00:00,2,idle,testohpc-compute-[0-1]'
+  tasks:
+    - name: Get slurm partition info
+      command: sinfo --noheader --format="%P,%a,%l,%D,%t,%N" # using --format ensures we control whitespace
+      register: sinfo
+    - name: Check partition info is as expected
+      assert:
+        that: "sinfo.stdout.split() == expected_sinfo.split()"
+        fail_msg: "FAILED - got {{ sinfo.stdout.split() }} expected {{ expected_sinfo.split() }}"
diff --git a/templates/slurm.conf.j2 b/templates/slurm.conf.j2
index 792533e..67cc180 100644
--- a/templates/slurm.conf.j2
+++ b/templates/slurm.conf.j2
@@ -150,10 +150,10 @@ NodeName={{ node }}
 # OpenHPC default configuration
 PropagateResourceLimitsExcept=MEMLOCK
 Epilog=/etc/slurm/slurm.epilog.clean
+{% set donehosts = [] %}
 {% for part in openhpc_slurm_partitions %}
 {% set nodelist = [] %}
 {% for group in part.get('groups', [part]) %}
-
 {% set group_name = group.cluster_name|default(openhpc_cluster_name) ~ '_' ~ group.name %}
 # openhpc_slurm_partitions group: {{ group_name }}
 {% set inventory_group_hosts = groups.get(group_name, []) %}
@@ -164,9 +164,11 @@ Epilog=/etc/slurm/slurm.epilog.clean
 {% set ram_mb = (first_host_hv['ansible_memory_mb']['real']['total'] * (group.ram_multiplier | default(openhpc_ram_multiplier))) | int %}
 {% for hostlist in (inventory_group_hosts | hostlist_expression) %}
 {% set gres = ' Gres=%s' % (','.join(group.gres | map(attribute='conf') )) if 'gres' in group else '' %}
-
+{% if hostlist not in donehosts %}
 NodeName={{ hostlist }} State=UNKNOWN RealMemory={{ group.get('ram_mb', ram_mb) }} Sockets={{first_host_hv['ansible_processor_count']}} CoresPerSocket={{ first_host_hv['ansible_processor_cores'] }} ThreadsPerCore={{ first_host_hv['ansible_processor_threads_per_core'] }}{{ gres }}
+{% endif %}
 {% set _ = nodelist.append(hostlist) %}
+{% set _ = donehosts.append(hostlist) %}
 {% endfor %}{# nodes #}
 {% endif %}{# inventory_group_hosts #}
 {% for extra_node_defn in group.get('extra_nodes', []) %}
@@ -184,5 +186,4 @@ PartitionName={{part.name}} Default={{ part.get('default', 'YES') }} MaxTime={{
 NodeName=nonesuch
 
 {% if openhpc_slurm_configless %}SlurmctldParameters=enable_configless{% endif %}
-
 ReturnToService=2
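
For reviewers: the intent of the `donehosts` guard is that a hostlist shared by several partitions is emitted as a single `NodeName=` definition, while each partition still lists it in its `Nodes=`. A minimal sketch of the relevant part of the rendered `slurm.conf` for the test15 variables above, illustrative only: `RealMemory`, `Sockets`, `CoresPerSocket` and `ThreadsPerCore` come from host facts, and the `...` placeholders stand for the remaining parameters rendered by the existing `PartitionName` template line and by `partition_params`.

```
# openhpc_slurm_partitions group: testohpc_compute
NodeName=testohpc-compute-[0-1] State=UNKNOWN RealMemory=... Sockets=... CoresPerSocket=... ThreadsPerCore=...
PartitionName=compute Default=YES ... Nodes=testohpc-compute-[0-1]
# openhpc_slurm_partitions group: testohpc_compute
PartitionName=beta ... Nodes=testohpc-compute-[0-1]
```

This is what `verify.yml` asserts via `sinfo`: both `compute*` (the default partition) and `beta` report the same two idle nodes.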