diff --git a/manifests/v2/base/crds/kubeflow.org_trainjobs.yaml b/manifests/v2/base/crds/kubeflow.org_trainjobs.yaml index d0e0e6f86e..b5583cbaaf 100644 --- a/manifests/v2/base/crds/kubeflow.org_trainjobs.yaml +++ b/manifests/v2/base/crds/kubeflow.org_trainjobs.yaml @@ -197,6 +197,7 @@ spec: They will be merged with the TrainingRuntime values. type: object managedBy: + default: kubeflow.org/trainjob-controller description: |- ManagedBy is used to indicate the controller or entity that manages a TrainJob. The value must be either an empty, `kubeflow.org/trainjob-controller` or @@ -206,6 +207,12 @@ spec: with a 'kueue.x-k8s.io/multikueue' to the Kueue. The field is immutable. Defaults to `kubeflow.org/trainjob-controller` type: string + x-kubernetes-validations: + - message: ManagedBy must be kubeflow.org/trainjob-controller or kueue.x-k8s.io/multikueue + if set + rule: self in ['kubeflow.org/trainjob-controller', 'kueue.x-k8s.io/multikueue'] + - message: ManagedBy value is immutable + rule: self == oldSelf modelConfig: description: Configuration of the pre-trained and trained model. properties: @@ -2736,14 +2743,15 @@ spec: description: Reference to the training runtime. properties: apiGroup: + default: kubeflow.org description: |- APIGroup of the runtime being referenced. Defaults to `kubeflow.org`. type: string kind: + default: ClusterTrainingRuntime description: |- Kind of the runtime being referenced. - It must be one of TrainingRuntime or ClusterTrainingRuntime. Defaults to ClusterTrainingRuntime. type: string name: @@ -2756,6 +2764,7 @@ spec: - name type: object suspend: + default: false description: |- Whether the controller should suspend the running TrainJob. Defaults to false. diff --git a/pkg/apis/kubeflow.org/v2alpha1/openapi_generated.go b/pkg/apis/kubeflow.org/v2alpha1/openapi_generated.go index 6cb9daabe8..0eeb05ddd0 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/openapi_generated.go +++ b/pkg/apis/kubeflow.org/v2alpha1/openapi_generated.go @@ -727,7 +727,7 @@ func schema_pkg_apis_kubefloworg_v2alpha1_RuntimeRef(ref common.ReferenceCallbac }, "kind": { SchemaProps: spec.SchemaProps{ - Description: "Kind of the runtime being referenced. It must be one of TrainingRuntime or ClusterTrainingRuntime. Defaults to ClusterTrainingRuntime.", + Description: "Kind of the runtime being referenced. Defaults to ClusterTrainingRuntime.", Type: []string{"string"}, Format: "", }, diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go index 0a2f95770d..3baab1fb35 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go +++ b/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go @@ -22,6 +22,10 @@ import ( jobsetv1alpha2 "sigs.k8s.io/jobset/api/jobset/v1alpha2" ) +const ( + TrainJobKind string = "TrainJob" +) + // +genclient // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object // +kubebuilder:object:root=true @@ -87,6 +91,7 @@ type TrainJobSpec struct { // Whether the controller should suspend the running TrainJob. // Defaults to false. + // +kubebuilder:default=false Suspend *bool `json:"suspend,omitempty"` // ManagedBy is used to indicate the controller or entity that manages a TrainJob. @@ -96,6 +101,9 @@ type TrainJobSpec struct { // `kubeflow.org/trainjob-controller`, but delegates reconciling TrainJobs // with a 'kueue.x-k8s.io/multikueue' to the Kueue. The field is immutable. // Defaults to `kubeflow.org/trainjob-controller` + // +kubebuilder:default="kubeflow.org/trainjob-controller" + // +kubebuilder:validation:XValidation:rule="self in ['kubeflow.org/trainjob-controller', 'kueue.x-k8s.io/multikueue']", message="ManagedBy must be kubeflow.org/trainjob-controller or kueue.x-k8s.io/multikueue if set" + // +kubebuilder:validation:XValidation:rule="self == oldSelf", message="ManagedBy value is immutable" ManagedBy *string `json:"managedBy,omitempty"` } @@ -108,11 +116,12 @@ type RuntimeRef struct { // APIGroup of the runtime being referenced. // Defaults to `kubeflow.org`. + // +kubebuilder:default="kubeflow.org" APIGroup *string `json:"apiGroup,omitempty"` // Kind of the runtime being referenced. - // It must be one of TrainingRuntime or ClusterTrainingRuntime. // Defaults to ClusterTrainingRuntime. + // +kubebuilder:default="ClusterTrainingRuntime" Kind *string `json:"kind,omitempty"` } diff --git a/test/integration/controller.v2/trainjob_controller_test.go b/test/integration/controller.v2/trainjob_controller_test.go index d13f1179f1..e31b7e3f79 100644 --- a/test/integration/controller.v2/trainjob_controller_test.go +++ b/test/integration/controller.v2/trainjob_controller_test.go @@ -21,6 +21,7 @@ import ( "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" kubeflowv2 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1" @@ -71,4 +72,93 @@ var _ = ginkgo.Describe("TrainJob controller", ginkgo.Ordered, func() { gomega.Expect(k8sClient.Create(ctx, trainJob)).Should(gomega.Succeed()) }) }) + + ginkgo.When("TrainJob CR Validation", func() { + ginkgo.AfterEach(func() { + gomega.Expect(k8sClient.DeleteAllOf(ctx, &kubeflowv2.TrainJob{}, client.InNamespace(ns.Name))).Should( + gomega.Succeed()) + }) + + ginkgo.It("Should succeed in creating TrainJob", func() { + + managedBy := "kubeflow.org/trainjob-controller" + + trainingRuntimeRef := kubeflowv2.RuntimeRef{ + Name: "TorchRuntime", + APIGroup: ptr.To(kubeflowv2.GroupVersion.Group), + Kind: ptr.To(kubeflowv2.TrainingRuntimeKind), + } + jobSpec := kubeflowv2.TrainJobSpec{ + RuntimeRef: trainingRuntimeRef, + ManagedBy: &managedBy, + } + trainJob := &kubeflowv2.TrainJob{ + TypeMeta: metav1.TypeMeta{ + APIVersion: kubeflowv2.SchemeGroupVersion.String(), + Kind: kubeflowv2.TrainJobKind, + }, + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "valid-trainjob-", + Namespace: ns.Name, + }, + Spec: jobSpec, + } + + err := k8sClient.Create(ctx, trainJob) + gomega.Expect(err).Should(gomega.Succeed()) + }) + + ginkgo.It("Should fail in creating TrainJob with invalid spec.managedBy", func() { + managedBy := "invalidManagedBy" + jobSpec := kubeflowv2.TrainJobSpec{ + ManagedBy: &managedBy, + } + trainJob := &kubeflowv2.TrainJob{ + TypeMeta: metav1.TypeMeta{ + APIVersion: kubeflowv2.SchemeGroupVersion.String(), + Kind: kubeflowv2.TrainJobKind, + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "invalid-trainjob", + Namespace: ns.Name, + }, + Spec: jobSpec, + } + gomega.Expect(k8sClient.Create(ctx, trainJob)).To(gomega.MatchError( + gomega.ContainSubstring("spec.managedBy: Invalid value"))) + }) + + ginkgo.It("Should fail in updating spec.managedBy", func() { + + managedBy := "kubeflow.org/trainjob-controller" + + trainingRuntimeRef := kubeflowv2.RuntimeRef{ + Name: "TorchRuntime", + APIGroup: ptr.To(kubeflowv2.GroupVersion.Group), + Kind: ptr.To(kubeflowv2.TrainingRuntimeKind), + } + jobSpec := kubeflowv2.TrainJobSpec{ + RuntimeRef: trainingRuntimeRef, + ManagedBy: &managedBy, + } + trainJob := &kubeflowv2.TrainJob{ + TypeMeta: metav1.TypeMeta{ + APIVersion: kubeflowv2.SchemeGroupVersion.String(), + Kind: kubeflowv2.TrainJobKind, + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "job-with-failed-update", + Namespace: ns.Name, + }, + Spec: jobSpec, + } + + gomega.Expect(k8sClient.Create(ctx, trainJob)).Should(gomega.Succeed()) + updatedManagedBy := "kueue.x-k8s.io/multikueue" + jobSpec.ManagedBy = &updatedManagedBy + trainJob.Spec = jobSpec + gomega.Expect(k8sClient.Update(ctx, trainJob)).To(gomega.MatchError( + gomega.ContainSubstring("ManagedBy value is immutable"))) + }) + }) })