-
Notifications
You must be signed in to change notification settings - Fork 715
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
KEP-2170: Add TrainJob and TrainingRuntime APIs #2223
Changes from 10 commits
6865663
ed830c8
66e7049
bfa1f20
2bf13c9
72a933e
9d0a686
880560c
49a004c
c28a166
06e7653
7aa4094
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
/* | ||
Copyright 2024 The Kubeflow Authors. | ||
|
||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
|
||
http://www.apache.org/licenses/LICENSE-2.0 | ||
|
||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
// Package v2alpha1 contains API Schema definitions for the kubeflow.org v2alpha1 API group | ||
// +kubebuilder:object:generate=true | ||
// +groupName=kubeflow.org | ||
package v2alpha1 | ||
|
||
import ( | ||
"k8s.io/apimachinery/pkg/runtime/schema" | ||
"sigs.k8s.io/controller-runtime/pkg/scheme" | ||
) | ||
|
||
var ( | ||
// GroupVersion is the API group ("kubeflow.org") and version ("v2alpha1") used to register these objects. | ||
GroupVersion = schema.GroupVersion{Group: "kubeflow.org", Version: "v2alpha1"} | ||
|
||
// SchemeBuilder is used to add Go types to the GroupVersionKind scheme. | ||
// It is constructed with GroupVersion as its group-version. | ||
SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} | ||
|
||
// SchemeGroupVersion is an alias of GroupVersion for client-go libraries. | ||
SchemeGroupVersion = GroupVersion | ||
|
||
// AddToScheme adds the types in this group-version to the given scheme. | ||
// It is the AddToScheme method value of SchemeBuilder. | ||
AddToScheme = SchemeBuilder.AddToScheme | ||
) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,3 +15,197 @@ limitations under the License. | |
*/ | ||
|
||
package v2alpha1 | ||
|
||
import ( | ||
autoscalingv2 "k8s.io/api/autoscaling/v2" | ||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
jobsetv1alpha2 "sigs.k8s.io/jobset/api/jobset/v1alpha2" | ||
) | ||
|
||
// +kubebuilder:object:root=true | ||
|
||
// ClusterTrainingRuntime represents a training runtime which can be referenced as part of | ||
// `trainingRuntimeRef` API in TrainJob. This resource is a cluster-scoped and can be referenced | ||
// by TrainJob that created in *any* namespace. | ||
type ClusterTrainingRuntime struct { | ||
metav1.TypeMeta `json:",inline"` | ||
|
||
// Standard object's metadata. | ||
metav1.ObjectMeta `json:"metadata,omitempty"` | ||
|
||
// Specification of the desired ClusterTrainingRuntime. | ||
Spec TrainingRuntimeSpec `json:"spec,omitempty"` | ||
} | ||
|
||
// +kubebuilder:object:root=true | ||
|
||
// ClusterTrainingRuntimeList is a collection of ClusterTrainingRuntime objects. | ||
type ClusterTrainingRuntimeList struct { | ||
metav1.TypeMeta `json:",inline"` | ||
|
||
// Standard list metadata. | ||
metav1.ListMeta `json:"metadata,omitempty"` | ||
|
||
// Items is the list of ClusterTrainingRuntimes. | ||
Items []ClusterTrainingRuntime `json:"items"` | ||
} | ||
|
||
// +kubebuilder:object:root=true | ||
|
||
// TrainingRuntime represents a training runtime which can be referenced as part of | ||
// `trainingRuntimeRef` API in TrainJob. This resource is namespace-scoped and can be referenced | ||
// by a TrainJob that is created in the *same* namespace as the TrainingRuntime. | ||
type TrainingRuntime struct { | ||
metav1.TypeMeta `json:",inline"` | ||
|
||
// Standard object's metadata. | ||
metav1.ObjectMeta `json:"metadata,omitempty"` | ||
|
||
// Specification of the desired TrainingRuntime. | ||
// It uses the same TrainingRuntimeSpec type as ClusterTrainingRuntime. | ||
Spec TrainingRuntimeSpec `json:"spec,omitempty"` | ||
} | ||
|
||
// +kubebuilder:object:root=true | ||
|
||
// TrainingRuntimeList is a collection of TrainingRuntime objects. | ||
type TrainingRuntimeList struct { | ||
metav1.TypeMeta `json:",inline"` | ||
|
||
// Standard list metadata. | ||
metav1.ListMeta `json:"metadata,omitempty"` | ||
|
||
// Items is the list of TrainingRuntimes. | ||
Items []TrainingRuntime `json:"items"` | ||
} | ||
|
||
// TrainingRuntimeSpec represents a specification of the desired training runtime. | ||
// It is the spec type shared by TrainingRuntime and ClusterTrainingRuntime. | ||
type TrainingRuntimeSpec struct { | ||
|
||
// Configuration for the model training with ML-specific parameters. | ||
// Optional: omitted from the JSON form when nil. | ||
MLSpec *MLSpec `json:"mlSpec,omitempty"` | ||
|
||
// Configuration for the PodGroup to enable gang-scheduling via supported plugins. | ||
// Optional: omitted from the JSON form when nil. | ||
PodGroupSpec *PodGroupSpec `json:"podGroupSpec,omitempty"` | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this be inlined? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
|
||
// JobSet template which will be used by TrainJob. | ||
// This field has no `omitempty`, so it is always present in the JSON form. | ||
Template JobSetTemplateSpec `json:"template"` | ||
} | ||
|
||
// JobSetTemplateSpec represents a template of the desired JobSet. | ||
type JobSetTemplateSpec struct { | ||
// Metadata for custom JobSet's labels and annotations. | ||
// JobSet name and namespace are equal to the TrainJob's name and namespace. | ||
metav1.ObjectMeta `json:"metadata,omitempty"` | ||
|
||
// Specification of the desired JobSet which will be created from TrainJob. | ||
Spec jobsetv1alpha2.JobSetSpec `json:"spec,omitempty"` | ||
} | ||
|
||
// PodGroupSpec represents a PodGroup configuration to enable gang-scheduling. | ||
type PodGroupSpec struct { | ||
// Plugin for the gang-scheduling. | ||
// This field has no `omitempty`, so it is always present in the JSON form. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are we going forward with a default? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. By default, the gang-scheduling is disabled for TrainJob, since it requires plugin to be installed (coscheduling or volcano). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That makes sense. |
||
Plugin GangSchedulerPlugin `json:"plugin"` | ||
|
||
// Time threshold to schedule PodGroup for gang-scheduling. | ||
// NOTE(review): the name says "seconds" but the field is typed *string rather than an | ||
// integer type - confirm the intended type and value format. | ||
ScheduleTimeoutSeconds *string `json:"scheduleTimeoutSeconds,omitempty"` | ||
} | ||
|
||
// GangSchedulerPlugin represents one of the supported gang-scheduling plugins. | ||
// It is a string-valued type; the supported values are the GangSchedulerPlugin* constants. | ||
type GangSchedulerPlugin string | ||
|
||
const ( | ||
// GangSchedulerPluginVolcano selects the Volcano plugin for gang-scheduling. | ||
GangSchedulerPluginVolcano GangSchedulerPlugin = "volcano" | ||
|
||
// GangSchedulerPluginCoscheduling selects the Coscheduling plugin from the Kubernetes | ||
// scheduler-plugins for gang-scheduling. | ||
GangSchedulerPluginCoscheduling GangSchedulerPlugin = "coscheduling" | ||
) | ||
|
||
// MLSpec represents configuration for the model training with ML-specific parameters. | ||
type MLSpec struct { | ||
|
||
// Number of training nodes. | ||
// Defaults to 1. | ||
NumNodes *int32 `json:"numNodes,omitempty"` | ||
|
||
// Configuration for the runtime-specific parameters, such as Torch or MPI. | ||
// One of the following spec sources can be set. | ||
// The struct is embedded inline, so its fields appear at the same JSON level as numNodes. | ||
MLSpecSource `json:",inline"` | ||
} | ||
|
||
// MLSpecSource represents the runtime-specific configuration for various technologies. | ||
// One of the following specs can be set. | ||
type MLSpecSource struct { | ||
|
||
// Configuration for the PyTorch runtime. | ||
Torch *TorchMLSpecSource `json:"torch,omitempty"` | ||
|
||
// Configuration for the MPI Runtime. | ||
MPI *MPIMLSpecSource `json:"mpi,omitempty"` | ||
} | ||
|
||
// TorchMLSpecSource represents a PyTorch runtime configuration. | ||
type TorchMLSpecSource struct { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @tenzen-y I removed There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When users will need it, we can add it in the future. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That sounds good to me. |
||
// Number of processes per node. | ||
// This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. | ||
// Supported values: `auto`, `cpu`, `gpu`, or int value. | ||
// The field is a string, so an int value is given in its string form. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You could probably use KubeBuilder validations for the enums here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As we discussed offline, we will add validations in the separate PRs. |
||
// TODO (andreyvelich): Add kubebuilder validation. | ||
// Defaults to `auto`. | ||
NumProcPerNode *string `json:"numProcPerNode,omitempty"` | ||
|
||
// Elastic policy for the PyTorch training. | ||
// Optional: omitted from the JSON form when nil. | ||
ElasticPolicy *TorchElasticPolicy `json:"elasticPolicy,omitempty"` | ||
} | ||
|
||
// TorchElasticPolicy represents a configuration for the PyTorch elastic training. | ||
// If this policy is set, the `.spec.numNodes` parameter must be omitted, since the min and max | ||
// nodes are used to configure the `torchrun` CLI argument: `--nnodes=minNodes:maxNodes`. | ||
// Only `c10d` backend is supported for the Rendezvous communication. | ||
type TorchElasticPolicy struct { | ||
// How many times the training job can be restarted. | ||
// This value is inserted into the `--max-restarts` argument of the `torchrun` CLI and | ||
// the `.spec.failurePolicy.maxRestarts` parameter of the training Job. | ||
MaxRestarts *int32 `json:"maxRestarts,omitempty"` | ||
|
||
// Lower limit for the number of nodes to which training job can scale down. | ||
MinNodes *int32 `json:"minNodes,omitempty"` | ||
|
||
// Upper limit for the number of nodes to which training job can scale up. | ||
MaxNodes *int32 `json:"maxNodes,omitempty"` | ||
|
||
// Specifications which are used to calculate the desired number of nodes. See the individual | ||
// metric source types for more information about how each type of metric must respond. | ||
// The HPA will be created to perform auto-scaling. | ||
Metrics []autoscalingv2.MetricSpec `json:"metrics,omitempty"` | ||
} | ||
|
||
// MPIMLSpecSource represents a MPI runtime configuration.
// Every field is an optional pointer and is omitted from the JSON form when unset.
type MPIMLSpecSource struct {
	// Number of processes per node.
	// This value is equal to the number of slots for each node in the hostfile.
	NumProcPerNode *int32 `json:"numProcPerNode,omitempty"`

	// Implementation name for the MPI to create the appropriate hostfile.
	// Defaults to OpenMPI.
	MPIImplementation *MPIImplementation `json:"mpiImplementation,omitempty"`

	// Directory where SSH keys are mounted.
	// The JSON key is lowerCamelCase (`sshAuthMountPath`), consistent with the
	// other fields of this struct.
	SSHAuthMountPath *string `json:"sshAuthMountPath,omitempty"`

	// Whether to run training process on the launcher Job.
	// Defaults to false.
	RunLauncherAsNode *bool `json:"runLauncherAsNode,omitempty"`
}

// MPIImplementation represents one of the supported MPI implementations.
type MPIImplementation string

const (
	// MPIImplementationOpenMPI selects the OpenMPI implementation.
	MPIImplementationOpenMPI MPIImplementation = "OpenMPI"

	// MPIImplementationIntel selects the Intel MPI implementation.
	MPIImplementationIntel MPIImplementation = "Intel"

	// MPIImplementationMPICH selects the MPICH implementation.
	MPIImplementationMPICH MPIImplementation = "MPICH"
)
|
||
func init() { | ||
SchemeBuilder.Register(&ClusterTrainingRuntime{}, &ClusterTrainingRuntimeList{}, &TrainingRuntime{}, &TrainingRuntimeList{}) | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should the spec ever be empty for ClusterTrainingRuntime?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not really, but I noticed that for all Kubernetes APIs the `spec` is set with `omitempty`: https://github.com/kubernetes/api/blob/master/apps/v1/types.go#L820.
@tenzen-y @kannon92 Any specific reason why we do this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IIUC, in any case, the `spec` field is defined as an optional field in Kubernetes. So, the optional TrainingRuntime spec would be better.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
TIL.