Skip to content

Commit

Permalink
Merge pull request #135 from nebius/dev
Browse files Browse the repository at this point in the history
Release v1.14.11
  • Loading branch information
dstaroff authored Oct 22, 2024
2 parents acef4c4 + 756b068 commit 8467124
Show file tree
Hide file tree
Showing 25 changed files with 232 additions and 46 deletions.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.14.10
1.14.11
19 changes: 19 additions & 0 deletions api/v1/slurmcluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,25 @@ type SlurmClusterSpec struct {
//
// +kubebuilder:validation:Optional
Telemetry *Telemetry `json:"telemetry,omitempty"`

// PartitionConfiguration defines the partition configuration of Slurm worker nodes.
// https://slurm.schedmd.com/slurm.conf.html#SECTION_PARTITION-CONFIGURATION
// +kubebuilder:validation:Optional
PartitionConfiguration PartitionConfiguration `json:"partitionConfiguration,omitempty"`
}

// PartitionConfiguration defines how Slurm worker-node partitions are
// rendered into slurm.conf. See the "PARTITION CONFIGURATION" section of
// https://slurm.schedmd.com/slurm.conf.html#SECTION_PARTITION-CONFIGURATION
type PartitionConfiguration struct {
	// ConfigType selects the source of the partition configuration:
	// "default" renders a single built-in partition spanning all nodes,
	// "custom" renders the entries supplied in RawConfig.
	// +kubebuilder:validation:Enum=default;custom
	// +kubebuilder:validation:Optional
	// +kubebuilder:default="default"
	ConfigType string `json:"configType,omitempty"`
	// RawConfig defines the partition configuration as a list of strings,
	// each starting with "PartitionName". Only used when ConfigType is "custom".
	// Example:
	// - PartitionName=low_priority Nodes=worker-[0-15] Default=YES MaxTime=INFINITE State=UP PriorityTier=1
	// - PartitionName=high_priority Nodes=worker-[10-20] Default=NO MaxTime=INFINITE State=UP PriorityTier=2
	// +kubebuilder:validation:Optional
	RawConfig []string `json:"rawConfig,omitempty"`
}

type NCCLSettings struct {
Expand Down
21 changes: 21 additions & 0 deletions api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 22 additions & 0 deletions config/crd/bases/slurm.nebius.ai_slurmclusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1083,6 +1083,28 @@ spec:
- custom
type: string
type: object
partitionConfiguration:
description: |-
PartitionConfiguration define partition configuration of slurm worker nodes
https://slurm.schedmd.com/slurm.conf.html#SECTION_PARTITION-CONFIGURATION
properties:
configType:
default: default
description: ConfigType
enum:
- default
- custom
type: string
rawConfig:
description: |-
RawConfig define partition configuration as list of string started with PartitionName
Example for custom ConfigType:
- PartitionName=low_priority Nodes=worker-[0-15] Default=YES MaxTime=INFINITE State=UP PriorityTier=1
- PartitionName=high_priority Nodes=worker-[10-20] Default=NO MaxTime=INFINITE State=UP PriorityTier=2
items:
type: string
type: array
type: object
pause:
description: |-
Pause defines whether to gracefully stop the cluster.
Expand Down
2 changes: 1 addition & 1 deletion config/manager/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ resources:
images:
- name: controller
newName: cr.eu-north1.nebius.cloud/soperator/slurm-operator
newTag: 1.14.10
newTag: 1.14.11
2 changes: 1 addition & 1 deletion config/manager/manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ spec:
value: "false"
- name: SLURM_OPERATOR_WATCH_NAMESPACES
value: "*"
image: controller:1.14.10
image: controller:1.14.11
imagePullPolicy: Always
name: manager
securityContext:
Expand Down
4 changes: 2 additions & 2 deletions helm/slurm-cluster-storage/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ apiVersion: v2
name: helm-slurm-cluster-storage
description: A Helm chart for Kubernetes
type: application
version: "1.14.10"
appVersion: "1.14.10"
version: "1.14.11"
appVersion: "1.14.11"
4 changes: 2 additions & 2 deletions helm/slurm-cluster/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ apiVersion: v2
name: helm-slurm-cluster
description: A Helm chart for Kubernetes
type: application
version: "1.14.10"
appVersion: "1.14.10"
version: "1.14.11"
appVersion: "1.14.11"
4 changes: 4 additions & 0 deletions helm/slurm-cluster/templates/slurm-cluster-cr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ spec:
crVersion: {{ .Chart.Version }}
pause: {{ .Values.pause }}
clusterType: {{ .Values.clusterType }}
partitionConfiguration:
configType: {{ (default "default" .Values.partitionConfiguration.configType) }}
rawConfig:
{{- default list .Values.partitionConfiguration.rawConfig | toYaml | nindent 6 }}
k8sNodeFilters:
{{- range .Values.k8sNodeFilters }}
- name: {{ .name }}
Expand Down
26 changes: 18 additions & 8 deletions helm/slurm-cluster/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,16 @@ annotations: {}
pause: false
# Slurm cluster type. Can be now gpu or cpu
clusterType: gpu
# partitionConfiguration defines the partition configuration of Slurm worker nodes.
# https://slurm.schedmd.com/slurm.conf.html#SECTION_PARTITION-CONFIGURATION
partitionConfiguration:
# Could be default or custom
configType: "default"
  # Partition configuration as a list of strings, each starting with PartitionName.
  # Example for the custom configType:
rawConfig: []
# - PartitionName=low_priority Nodes=worker-[0-15] Default=YES MaxTime=INFINITE State=UP PriorityTier=1
# - PartitionName=high_priority Nodes=worker-[10-20] Default=NO MaxTime=INFINITE State=UP PriorityTier=2
# K8s node filters used in Slurm node specifications. Define which nodes should be used to schedule pods to
k8sNodeFilters:
- name: gpu
Expand Down Expand Up @@ -333,12 +343,12 @@ telemetry: {}
# otelCollectorPort: 8429

images:
slurmctld: "cr.eu-north1.nebius.cloud/soperator/controller_slurmctld:1.14.10-jammy-slurm24.05.2"
slurmd: "cr.eu-north1.nebius.cloud/soperator/worker_slurmd:1.14.10-jammy-slurm24.05.2"
sshd: "cr.eu-north1.nebius.cloud/soperator/login_sshd:1.14.10-jammy-slurm24.05.2"
munge: "cr.eu-north1.nebius.cloud/soperator/munge:1.14.10-jammy-slurm24.05.2"
populateJail: "cr.eu-north1.nebius.cloud/soperator/populate_jail:1.14.10-jammy-slurm24.05.2"
ncclBenchmark: "cr.eu-north1.nebius.cloud/soperator/nccl_benchmark:1.14.10-jammy-slurm24.05.2"
slurmdbd: "cr.eu-north1.nebius.cloud/soperator/controller_slurmdbd:1.14.10-jammy-slurm24.05.2"
exporter: "cr.eu-north1.nebius.cloud/soperator/exporter:1.14.10-jammy-slurm24.05.2"
slurmctld: "cr.eu-north1.nebius.cloud/soperator/controller_slurmctld:1.14.11-jammy-slurm24.05.2"
slurmd: "cr.eu-north1.nebius.cloud/soperator/worker_slurmd:1.14.11-jammy-slurm24.05.2"
sshd: "cr.eu-north1.nebius.cloud/soperator/login_sshd:1.14.11-jammy-slurm24.05.2"
munge: "cr.eu-north1.nebius.cloud/soperator/munge:1.14.11-jammy-slurm24.05.2"
populateJail: "cr.eu-north1.nebius.cloud/soperator/populate_jail:1.14.11-jammy-slurm24.05.2"
ncclBenchmark: "cr.eu-north1.nebius.cloud/soperator/nccl_benchmark:1.14.11-jammy-slurm24.05.2"
slurmdbd: "cr.eu-north1.nebius.cloud/soperator/controller_slurmdbd:1.14.11-jammy-slurm24.05.2"
exporter: "cr.eu-north1.nebius.cloud/soperator/exporter:1.14.11-jammy-slurm24.05.2"
mariaDB: "docker-registry1.mariadb.com/library/mariadb:11.4.3"
4 changes: 2 additions & 2 deletions helm/soperator-crds/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ apiVersion: v2
name: helm-soperator-crds
description: A Helm chart for Kubernetes
type: application
version: 1.14.10
appVersion: "1.14.10"
version: 1.14.11
appVersion: "1.14.11"
22 changes: 22 additions & 0 deletions helm/soperator-crds/templates/slurmcluster-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1082,6 +1082,28 @@ spec:
- custom
type: string
type: object
partitionConfiguration:
description: |-
PartitionConfiguration define partition configuration of slurm worker nodes
https://slurm.schedmd.com/slurm.conf.html#SECTION_PARTITION-CONFIGURATION
properties:
configType:
default: default
description: ConfigType
enum:
- default
- custom
type: string
rawConfig:
description: |-
RawConfig define partition configuration as list of string started with PartitionName
Example for custom ConfigType:
- PartitionName=low_priority Nodes=worker-[0-15] Default=YES MaxTime=INFINITE State=UP PriorityTier=1
- PartitionName=high_priority Nodes=worker-[10-20] Default=NO MaxTime=INFINITE State=UP PriorityTier=2
items:
type: string
type: array
type: object
pause:
description: |-
Pause defines whether to gracefully stop the cluster.
Expand Down
4 changes: 2 additions & 2 deletions helm/soperator/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ apiVersion: v2
name: helm-soperator
description: A Helm chart for Kubernetes
type: application
version: 1.14.10
appVersion: "1.14.10"
version: 1.14.11
appVersion: "1.14.11"
22 changes: 22 additions & 0 deletions helm/soperator/crds/slurmcluster-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1082,6 +1082,28 @@ spec:
- custom
type: string
type: object
partitionConfiguration:
description: |-
PartitionConfiguration define partition configuration of slurm worker nodes
https://slurm.schedmd.com/slurm.conf.html#SECTION_PARTITION-CONFIGURATION
properties:
configType:
default: default
description: ConfigType
enum:
- default
- custom
type: string
rawConfig:
description: |-
RawConfig define partition configuration as list of string started with PartitionName
Example for custom ConfigType:
- PartitionName=low_priority Nodes=worker-[0-15] Default=YES MaxTime=INFINITE State=UP PriorityTier=1
- PartitionName=high_priority Nodes=worker-[10-20] Default=NO MaxTime=INFINITE State=UP PriorityTier=2
items:
type: string
type: array
type: object
pause:
description: |-
Pause defines whether to gracefully stop the cluster.
Expand Down
2 changes: 1 addition & 1 deletion helm/soperator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ controllerManager:
slurmOperatorWatchNamespaces: '*'
image:
repository: cr.eu-north1.nebius.cloud/soperator/slurm-operator
tag: 1.14.10
tag: 1.14.11
imagePullPolicy: Always
resources:
limits:
Expand Down
10 changes: 8 additions & 2 deletions images/worker/slurmd_entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,16 @@ set -e # Exit immediately if any command returns a non-zero error code

echo "Starting slurmd entrypoint script"
if [ -n "${CGROUP_V2}" ]; then
CGROUP_PATH=$(cat /proc/self/cgroup | awk -F'/' '{print "/"$2"/"$3"/"$4}')
CGROUP_PATH=''
if [ "$SLURM_CLUSTER_TYPE" = "gpu" ]; then
CGROUP_PATH=$(cat /proc/self/cgroup | awk -F'/' '{print "/"$2"/"$3"/"$4}')
else
CGROUP_PATH=$(cat /proc/self/cgroup | awk -F'/' '{print "/"$2"/"$3}')
fi

if [ -n "${CGROUP_PATH}" ]; then
echo "cgroup v2 detected, creating cgroup for ${CGROUP_PATH}"
mkdir -p /sys/fs/cgroup/${CGROUP_PATH}/system.slice
mkdir -p /sys/fs/cgroup/${CGROUP_PATH}/system.slice
else
echo "cgroup v2 detected, but cgroup path is empty"
exit 1
Expand Down
2 changes: 1 addition & 1 deletion internal/consts/version.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@
package consts

const (
VersionCR = "1.14.10"
VersionCR = "1.14.11"
)
8 changes: 7 additions & 1 deletion internal/controller/reconciler/grant.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,13 @@ func (r *MariaDbGrantReconciler) Reconcile(
func (r *MariaDbGrantReconciler) patch(existing, desired client.Object) (client.Patch, error) {
patchImpl := func(dst, src *mariadv1alpha1.Grant) client.Patch {
res := client.MergeFrom(dst.DeepCopy())
dst.Spec = src.Spec
dst.Spec.Username = src.Spec.Username
dst.Spec.Host = src.Spec.Host
dst.Spec.Database = src.Spec.Database
dst.Spec.Table = src.Spec.Table
dst.Spec.Privileges = src.Spec.Privileges
dst.Spec.GrantOption = src.Spec.GrantOption
dst.Spec.MariaDBRef = src.Spec.MariaDBRef
return res
}
return patchImpl(existing.(*mariadv1alpha1.Grant), desired.(*mariadv1alpha1.Grant)), nil
Expand Down
6 changes: 5 additions & 1 deletion internal/controller/reconciler/mariadb.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ func (r *MariaDbReconciler) patch(existing, desired client.Object) (client.Patch
dst.Spec.Image = src.Spec.Image
dst.Spec.Replicas = src.Spec.Replicas
dst.Spec.Port = src.Spec.Port
dst.Spec.Storage = src.Spec.Storage
dst.Spec.Database = src.Spec.Database
dst.Spec.Username = src.Spec.Username
dst.Spec.PasswordSecretKeyRef = src.Spec.PasswordSecretKeyRef
Expand All @@ -65,6 +64,11 @@ func (r *MariaDbReconciler) patch(existing, desired client.Object) (client.Patch
dst.Spec.Resources = src.Spec.Resources
dst.Spec.SecurityContext = src.Spec.SecurityContext
dst.Spec.PodSecurityContext = src.Spec.PodSecurityContext
dst.Spec.Storage.Ephemeral = src.Spec.Storage.Ephemeral
dst.Spec.Storage.StorageClassName = src.Spec.Storage.StorageClassName
dst.Spec.Storage.VolumeClaimTemplate = src.Spec.Storage.VolumeClaimTemplate
dst.Spec.Storage.Size = src.Spec.Storage.Size

return res
}
return patchImpl(existing.(*mariadv1alpha1.MariaDB), desired.(*mariadv1alpha1.MariaDB)), nil
Expand Down
20 changes: 19 additions & 1 deletion internal/render/common/configmap.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package common
import (
"fmt"
"reflect"
"strings"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -79,6 +80,8 @@ func generateSlurmConfig(cluster *values.SlurmCluster) renderutils.ConfigFile {
res.AddProperty("CliFilterPlugins", "cli_filter/user_defaults")
res.AddComment("")
res.AddProperty("LaunchParameters", "use_interactive_step")
res.AddComment("Scrontab")
res.AddProperty("ScronParameters", "enable,explicit_scancel")
res.AddComment("")
res.AddProperty("MaxJobCount", 1000) // Keep 1000 last jobs in controller memory
res.AddProperty("MinJobAge", 86400) // Don't remove jobs from controller memory after some time
Expand Down Expand Up @@ -113,7 +116,22 @@ func generateSlurmConfig(cluster *values.SlurmCluster) renderutils.ConfigFile {
res.AddComment("COMPUTE NODES")
res.AddComment("We're using the \"dynamic nodes\" feature: https://slurm.schedmd.com/dynamic_nodes.html")
res.AddProperty("MaxNodeCount", "512")
res.AddProperty("PartitionName", "main Nodes=ALL Default=YES MaxTime=INFINITE State=UP OverSubscribe=YES")
res.AddComment("Partition Configuration")
res.AddProperty("JobRequeue", 1)
res.AddProperty("PreemptMode", "REQUEUE")
res.AddProperty("PreemptType", "preempt/partition_prio")
switch cluster.PartitionConfiguration.ConfigType {
case "custom":
for _, l := range cluster.PartitionConfiguration.RawConfig {
line := strings.TrimSpace(l)
if strings.HasPrefix(line, "PartitionName") {
clearLine := strings.Replace(line, "PartitionName=", "", 1)
res.AddProperty("PartitionName", clearLine)
}
}
default:
res.AddProperty("PartitionName", "main Nodes=ALL Default=YES MaxTime=INFINITE State=UP OverSubscribe=YES")
}
if cluster.NodeAccounting.Enabled {
res.AddComment("")
res.AddComment("ACCOUNTING")
Expand Down
5 changes: 3 additions & 2 deletions internal/render/worker/statefulset.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,9 @@ func RenderStatefulSet(
consts.DefaultContainerAnnotationName: consts.ContainerNameSlurmd,
}

initContainers := []corev1.Container{
renderContainerToolkitValidation(&worker.ContainerToolkitValidation),
var initContainers []corev1.Container
if clusterType == consts.ClusterTypeGPU {
initContainers = append(initContainers, renderContainerToolkitValidation(&worker.ContainerToolkitValidation))
}

return appsv1.StatefulSet{
Expand Down
2 changes: 1 addition & 1 deletion internal/render/worker/statefulset_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,5 +79,5 @@ func Test_RenderStatefulSet(t *testing.T) {
result, err = worker.RenderStatefulSet(testNamespace, testCluster, consts.ClusterTypeCPU, nodeFilter, voluemSource, workerCGroupV2)
assert.NoError(t, err)
assert.Equal(t, consts.CGroupV2Env, result.Spec.Template.Spec.Containers[0].Env[4].Name)
assert.True(t, len(result.Spec.Template.Spec.InitContainers) == 1)
assert.True(t, len(result.Spec.Template.Spec.InitContainers) == 0)
}
Loading

0 comments on commit 8467124

Please sign in to comment.