Skip to content

Commit

Permalink
Merge pull request #281 from nebius/dev
Browse files Browse the repository at this point in the history
Soperator release 1.16.1
  • Loading branch information
asteny authored Dec 23, 2024
2 parents 4c06b19 + 5356644 commit 3d3bfb0
Show file tree
Hide file tree
Showing 33 changed files with 461 additions and 237 deletions.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.16.0
1.16.1
52 changes: 35 additions & 17 deletions api/v1/slurmcluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ type SlurmClusterSpec struct {
// SlurmConfig represents the Slurm configuration in slurm.conf. Not all options are supported.
//
// +kubebuilder:validation:Optional
// +kubebuilder:default={defMemPerNode: 1228800, defCpuPerGPU: 16, completeWait: 5, debugFlags: "Cgroup,CPU_Bind,Gres,JobComp,Priority,Script,SelectType,Steps,TraceJobs", taskPluginParam: "Verbose", maxJobCount: 10000, minJobAge: 86400}
SlurmConfig SlurmConfig `json:"slurmConfig,omitempty"`
}

Expand All @@ -86,27 +87,39 @@ type SlurmConfig struct {
//
// +kubebuilder:validation:Optional
// +kubebuilder:default=1228800
DefMemPerNode int32 `json:"defMemPerNode,omitempty"`
DefMemPerNode *int32 `json:"defMemPerNode,omitempty"`
// Default count of CPUs allocated per allocated GPU
//
// +kubebuilder:validation:Optional
// +kubebuilder:default=16
DefCpuPerGPU int32 `json:"defCpuPerGPU,omitempty"`
DefCpuPerGPU *int32 `json:"defCpuPerGPU,omitempty"`
// The time to wait, in seconds, when any job is in the COMPLETING state before any additional jobs are scheduled.
//
// +kubebuilder:validation:Optional
// +kubebuilder:default=5
CompleteWait int32 `json:"completeWait,omitempty"`
CompleteWait *int32 `json:"completeWait,omitempty"`
// Defines specific subsystems which should provide more detailed event logging.
//
// +kubebuilder:validation:Optional
// +kubebuilder:default="Cgroup,CPU_Bind,Gres,JobComp,Priority,Script,SelectType,Steps,TraceJobs"
// +kubebuilder:validation:Pattern="^((Accrue|Agent|AuditRPCs|Backfill|BackfillMap|BurstBuffer|Cgroup|ConMgr|CPU_Bind|CpuFrequency|Data|DBD_Agent|Dependency|Elasticsearch|Energy|Federation|FrontEnd|Gres|Hetjob|Gang|GLOB_SILENCE|JobAccountGather|JobComp|JobContainer|License|Network|NetworkRaw|NodeFeatures|NO_CONF_HASH|Power|Priority|Profile|Protocol|Reservation|Route|Script|SelectType|Steps|Switch|TLS|TraceJobs|Triggers)(,)?)+$"
DebugFlags string `json:"debugFlags,omitempty"`
DebugFlags *string `json:"debugFlags,omitempty"`
// Additional parameters for the task plugin
//
// +kubebuilder:validation:Optional
// +kubebuilder:default="Verbose"
// +kubebuilder:validation:Pattern="^((None|Cores|Sockets|Threads|SlurmdOffSpec|OOMKillStep|Verbose|Autobind)(,)?)+$"
TaskPluginParam string `json:"taskPluginParam,omitempty"`
TaskPluginParam *string `json:"taskPluginParam,omitempty"`
// Maximum number of jobs the controller keeps in memory at one time
//
// +kubebuilder:validation:Optional
// +kubebuilder:default=10000
MaxJobCount *int32 `json:"maxJobCount,omitempty"`
// Minimum age, in seconds, of a completed job before its record is removed from controller memory
//
// +kubebuilder:validation:Optional
// +kubebuilder:default=86400
MinJobAge *int32 `json:"minJobAge,omitempty"`
}

type PartitionConfiguration struct {
Expand Down Expand Up @@ -547,33 +560,32 @@ type SlurmdbdConfig struct {

type AccountingSlurmConf struct {
// +kubebuilder:validation:Optional
AccountingStorageTRES string `json:"accountingStorageTRES,omitempty"`
AccountingStorageTRES *string `json:"accountingStorageTRES,omitempty"`
// +kubebuilder:validation:Optional
AccountingStoreFlags string `json:"accountingStoreFlags,omitempty"`
AccountingStoreFlags *string `json:"accountingStoreFlags,omitempty"`
// +kubebuilder:validation:Optional
AcctGatherInterconnectType string `json:"acctGatherInterconnectType,omitempty"`
AcctGatherInterconnectType *string `json:"acctGatherInterconnectType,omitempty"`
// +kubebuilder:validation:Optional
AcctGatherFilesystemType string `json:"acctGatherFilesystemType,omitempty"`
AcctGatherFilesystemType *string `json:"acctGatherFilesystemType,omitempty"`
// +kubebuilder:validation:Optional
AcctGatherProfileType string `json:"acctGatherProfileType,omitempty"`
AcctGatherProfileType *string `json:"acctGatherProfileType,omitempty"`
// +kubebuilder:validation:Optional
// +kubebuilder:validation:Enum="jobacct_gather/linux";"jobacct_gather/cgroup";"jobacct_gather/none"
JobAcctGatherType string `json:"jobAcctGatherType,omitempty"`
JobAcctGatherType *string `json:"jobAcctGatherType,omitempty"`
// +kubebuilder:validation:Optional
// +kubebuilder:default=30
JobAcctGatherFrequency int `json:"jobAcctGatherFrequency,omitempty"`
JobAcctGatherFrequency *int `json:"jobAcctGatherFrequency,omitempty"`
// +kubebuilder:validation:Optional
// +kubebuilder:validation:Enum="NoShared";"UsePss";"OverMemoryKill";"DisableGPUAcct"
JobAcctGatherParams string `json:"jobAcctGatherParams,omitempty"`
JobAcctGatherParams *string `json:"jobAcctGatherParams,omitempty"`
// +kubebuilder:validation:Optional
// +kubebuilder:default=0
PriorityWeightAge int16 `json:"priorityWeightAge,omitempty"`
PriorityWeightAge *int16 `json:"priorityWeightAge,omitempty"`
// +kubebuilder:validation:Optional
// +kubebuilder:default=0
PriorityWeightFairshare int16 `json:"priorityWeightFairshare,omitempty"`
PriorityWeightFairshare *int16 `json:"priorityWeightFairshare,omitempty"`
// +kubebuilder:validation:Optional
// +kubebuilder:default=0
PriorityWeightTRES int16 `json:"priorityWeightTRES,omitempty"`
PriorityWeightTRES *string `json:"priorityWeightTRES,omitempty"`
}

// SlurmNodeController defines the configuration for the Slurm controller node
Expand Down Expand Up @@ -645,6 +657,12 @@ type SlurmNodeWorker struct {
// +kubebuilder:validation:Optional
// +kubebuilder:default=false
EnableGDRCopy bool `json:"enableGDRCopy,omitempty"`

// SlurmNodeExtra is the string assigned to the "Extra" field of the corresponding Slurm node. It may
// reference any environment variables that are available in the slurmd container when it starts.
//
// +kubebuilder:validation:Optional
SlurmNodeExtra string `json:"slurmNodeExtra,omitempty"`
}

// SlurmNodeWorkerVolumes defines the volumes for the Slurm worker node
Expand Down
94 changes: 92 additions & 2 deletions api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

28 changes: 26 additions & 2 deletions config/crd/bases/slurm.nebius.ai_slurmclusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1470,6 +1470,14 @@ spec:
type: string
type: object
slurmConfig:
default:
completeWait: 5
debugFlags: Cgroup,CPU_Bind,Gres,JobComp,Priority,Script,SelectType,Steps,TraceJobs
defCpuPerGPU: 16
defMemPerNode: 1228800
maxJobCount: 10000
minJobAge: 86400
taskPluginParam: Verbose
description: SlurmConfig represents the Slurm configuration in slurm.conf.
Not all options are supported.
properties:
Expand All @@ -1496,8 +1504,20 @@ spec:
node in mebibytes.
format: int32
type: integer
maxJobCount:
default: 10000
description: Keep N last jobs in controller memory
format: int32
type: integer
minJobAge:
default: 86400
description: Don't remove jobs from controller memory after some
time
format: int32
type: integer
taskPluginParam:
default: Verbose
description: Additional parameters for the task plugin
pattern: ^((None|Cores|Sockets|Threads|SlurmdOffSpec|OOMKillStep|Verbose|Autobind)(,)?)+$
type: string
type: object
Expand Down Expand Up @@ -2057,8 +2077,7 @@ spec:
default: 0
type: integer
priorityWeightTRES:
default: 0
type: integer
type: string
type: object
slurmdbd:
description: Slurmdbd represents the Slurm database daemon
Expand Down Expand Up @@ -3778,6 +3797,11 @@ spec:
description: Size defines the number of node instances
format: int32
type: integer
slurmNodeExtra:
description: |-
SlurmNodeExtra defines the string that will be set to the "Extra" field of the corresponding Slurm node. It can
use any environment variables that are available in the slurmd container when it starts.
type: string
slurmd:
description: Slurmd represents the Slurm daemon service configuration
properties:
Expand Down
2 changes: 1 addition & 1 deletion config/manager/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ resources:
images:
- name: controller
newName: cr.eu-north1.nebius.cloud/soperator/slurm-operator
newTag: 1.16.0
newTag: 1.16.1
2 changes: 1 addition & 1 deletion config/manager/manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ spec:
value: "false"
- name: SLURM_OPERATOR_WATCH_NAMESPACES
value: "*"
image: controller:1.16.0
image: controller:1.16.1
imagePullPolicy: Always
name: manager
securityContext:
Expand Down
8 changes: 4 additions & 4 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ toolchain go1.23.3

require (
github.com/go-logr/logr v1.4.2
github.com/onsi/ginkgo/v2 v2.22.0
github.com/onsi/ginkgo/v2 v2.22.1
github.com/onsi/gomega v1.36.1
github.com/open-telemetry/opentelemetry-operator v0.103.0
github.com/pkg/errors v0.9.1
Expand Down Expand Up @@ -52,7 +52,7 @@ require (
github.com/google/gnostic-models v0.6.8 // indirect
github.com/google/go-cmp v0.6.0 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db // indirect
github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/imdario/mergo v0.3.16 // indirect
github.com/josharian/intern v1.0.0 // indirect
Expand All @@ -75,13 +75,13 @@ require (
go.opentelemetry.io/otel/sdk/metric v1.28.0 // indirect
go.opentelemetry.io/otel/trace v1.28.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
golang.org/x/net v0.30.0 // indirect
golang.org/x/net v0.32.0 // indirect
golang.org/x/oauth2 v0.22.0 // indirect
golang.org/x/sys v0.28.0 // indirect
golang.org/x/term v0.27.0 // indirect
golang.org/x/text v0.21.0 // indirect
golang.org/x/time v0.5.0 // indirect
golang.org/x/tools v0.26.0 // indirect
golang.org/x/tools v0.28.0 // indirect
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
google.golang.org/protobuf v1.35.1 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
Expand Down
16 changes: 8 additions & 8 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeN
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0=
github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db h1:097atOisP2aRj7vFgYQBbFN4U4JNXUNYpxael3UzMyo=
github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144=
github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad h1:a6HEuzUHeKH6hwfN/ZoQgRgVIWFJljSWa/zetS2WTvg=
github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/imdario/mergo v0.3.16 h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4=
Expand All @@ -72,8 +72,8 @@ github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/onsi/ginkgo/v2 v2.22.0 h1:Yed107/8DjTr0lKCNt7Dn8yQ6ybuDRQoMGrNFKzMfHg=
github.com/onsi/ginkgo/v2 v2.22.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C+NkhLQo=
github.com/onsi/ginkgo/v2 v2.22.1 h1:QW7tbJAUDyVDVOM5dFa7qaybo+CRfR7bemlQUN6Z8aM=
github.com/onsi/ginkgo/v2 v2.22.1/go.mod h1:S6aTpoRsSq2cZOd+pssHAlKW/Q/jZt6cPrPlnj4a1xM=
github.com/onsi/gomega v1.36.1 h1:bJDPBO7ibjxcbHMgSCoo4Yj18UWbKDlLwX1x9sybDcw=
github.com/onsi/gomega v1.36.1/go.mod h1:PvZbdDc8J6XJEpDK4HCuRBm8a6Fzp9/DmhC9C7yFlog=
github.com/open-telemetry/opentelemetry-operator v0.103.0 h1:L0REMuJSMZjqCw7p7fWMn19XkiIULMr3NnHdPLryMQs=
Expand Down Expand Up @@ -144,8 +144,8 @@ golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.30.0 h1:AcW1SDZMkb8IpzCdQUaIq2sP4sZ4zw+55h6ynffypl4=
golang.org/x/net v0.30.0/go.mod h1:2wGyMJ5iFasEhkwi13ChkO/t1ECNC4X4eBKkVFyYFlU=
golang.org/x/net v0.32.0 h1:ZqPmj8Kzc+Y6e0+skZsuACbx+wzMgo5MQsJh9Qd6aYI=
golang.org/x/net v0.32.0/go.mod h1:CwU0IoeOlnQQWJ6ioyFrfRuomB8GKF6KbYXZVyeXNfs=
golang.org/x/oauth2 v0.22.0 h1:BzDx2FehcG7jJwgWLELCdmLuxk2i+x9UDpSiss2u0ZA=
golang.org/x/oauth2 v0.22.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
Expand All @@ -168,8 +168,8 @@ golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGm
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/tools v0.26.0 h1:v/60pFQmzmT9ExmjDv2gGIfi3OqfKoEP6I5+umXlbnQ=
golang.org/x/tools v0.26.0/go.mod h1:TPVVj70c7JJ3WCazhD8OdXcZg/og+b9+tH/KxylGwH0=
golang.org/x/tools v0.28.0 h1:WuB6qZ4RPCQo5aP3WdKZS7i595EdWqWR8vqJTlwTVK8=
golang.org/x/tools v0.28.0/go.mod h1:dcIOrVd3mfQKTgrDVQHqCPMWy6lnhfhtX3hLXYVLfRw=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
Expand Down
4 changes: 2 additions & 2 deletions helm/slurm-cluster-storage/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ apiVersion: v2
name: helm-slurm-cluster-storage
description: A Helm chart for Kubernetes
type: application
version: "1.16.0"
appVersion: "1.16.0"
version: "1.16.1"
appVersion: "1.16.1"
Loading

0 comments on commit 3d3bfb0

Please sign in to comment.