[fix][kubectl-plugin] set worker group CPU limit (#2958)
when creating a new worker group with `kubectl ray create workergroup`.
Also add a unit test.

I noticed we set resource limits equal to resource requests everywhere
else, but not in this command. I have a K8s [LimitRange] that blocked the
creation of these worker Pods because their CPU limit defaulted to less than
their CPU requests (sketched below). Describing the RayCluster showed this
warning event:

`Failed to create worker Pod hyperkube/, Pod
"dxia-test-other-group-worker-pm2sh" is invalid:
spec.containers[0].resources.requests: Invalid value: "2": must be less than or
equal to cpu limit of 250m`

Signed-off-by: David Xia <[email protected]>

[LimitRange]: https://kubernetes.io/docs/concepts/policy/limit-range/
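
For context, a minimal sketch of a LimitRange that would trigger exactly this rejection, written with the same corev1 types the plugin already uses. The object name and the 250m default are illustrative assumptions, not values taken from the cluster above.

package example

import (
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// cpuDefaults applies a 250m CPU limit to any container in the namespace
// that does not set one explicitly. A container requesting "2" CPUs with no
// limit of its own then fails validation, because its request exceeds the
// defaulted limit.
var cpuDefaults = corev1.LimitRange{
	ObjectMeta: metav1.ObjectMeta{Name: "cpu-defaults"},
	Spec: corev1.LimitRangeSpec{
		Limits: []corev1.LimitRangeItem{{
			Type:    corev1.LimitTypeContainer,
			Default: corev1.ResourceList{corev1.ResourceCPU: resource.MustParse("250m")},
		}},
	},
}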
davidxia authored Feb 7, 2025
1 parent 6e29e8a commit 51f443c
Showing 2 changed files with 77 additions and 12 deletions.
32 changes: 20 additions & 12 deletions kubectl-plugin/pkg/cmd/create/create_workergroup.go
@@ -20,6 +20,10 @@ import (
 	"k8s.io/kubectl/pkg/util/templates"
 )
 
+const (
+	resourceNvidiaGPU = "nvidia.com/gpu"
+)
+
 type CreateWorkerGroupOptions struct {
 	configFlags *genericclioptions.ConfigFlags
 	ioStreams   *genericclioptions.IOStreams
@@ -135,6 +139,19 @@ func (options *CreateWorkerGroupOptions) Run(ctx context.Context, factory cmduti
 	}
 
 	newRayCluster := rayCluster.DeepCopy()
+
+	newRayCluster.Spec.WorkerGroupSpecs = append(newRayCluster.Spec.WorkerGroupSpecs, createWorkerGroupSpec(options))
+
+	newRayCluster, err = k8sClient.RayClient().RayV1().RayClusters(*options.configFlags.Namespace).Update(ctx, newRayCluster, metav1.UpdateOptions{})
+	if err != nil {
+		return fmt.Errorf("error updating Ray cluster with new worker group: %w", err)
+	}
+
+	fmt.Printf("Updated Ray cluster %s/%s with new worker group\n", newRayCluster.Namespace, newRayCluster.Name)
+	return nil
+}
+
+func createWorkerGroupSpec(options *CreateWorkerGroupOptions) rayv1.WorkerGroupSpec {
 	podTemplate := corev1.PodTemplateSpec{
 		Spec: corev1.PodSpec{
 			Containers: []corev1.Container{
@@ -158,25 +175,16 @@ func (options *CreateWorkerGroupOptions) Run(ctx context.Context, factory cmduti
 
 	gpuResource := resource.MustParse(options.workerGPU)
 	if !gpuResource.IsZero() {
-		podTemplate.Spec.Containers[0].Resources.Requests[corev1.ResourceName("nvidia.com/gpu")] = gpuResource
-		podTemplate.Spec.Containers[0].Resources.Limits[corev1.ResourceName("nvidia.com/gpu")] = gpuResource
+		podTemplate.Spec.Containers[0].Resources.Requests[corev1.ResourceName(resourceNvidiaGPU)] = gpuResource
+		podTemplate.Spec.Containers[0].Resources.Limits[corev1.ResourceName(resourceNvidiaGPU)] = gpuResource
 	}
 
-	workerGroup := rayv1.WorkerGroupSpec{
+	return rayv1.WorkerGroupSpec{
 		GroupName:      options.groupName,
 		Replicas:       &options.workerReplicas,
 		MinReplicas:    &options.workerMinReplicas,
 		MaxReplicas:    &options.workerMaxReplicas,
 		RayStartParams: map[string]string{},
 		Template:       podTemplate,
 	}
-	newRayCluster.Spec.WorkerGroupSpecs = append(newRayCluster.Spec.WorkerGroupSpecs, workerGroup)
-
-	newRayCluster, err = k8sClient.RayClient().RayV1().RayClusters(*options.configFlags.Namespace).Update(ctx, newRayCluster, metav1.UpdateOptions{})
-	if err != nil {
-		return fmt.Errorf("error updating Ray cluster with new worker group: %w", err)
-	}
-
-	fmt.Printf("Updated Ray cluster %s/%s with new worker group\n", newRayCluster.Namespace, newRayCluster.Name)
-	return nil
 }
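
The CPU and memory requests and limits themselves are set in the collapsed portion of the diff above. Judging from the unit test's expected values below, the container construction now presumably looks something like this sketch (not the verbatim diff):

// Sketch, inferred from the test expectations: requests and limits are
// populated identically for CPU and memory, mirroring what the plugin
// already does elsewhere.
container := corev1.Container{
	Name:  "ray-worker",
	Image: options.image,
	Resources: corev1.ResourceRequirements{
		Requests: corev1.ResourceList{
			corev1.ResourceCPU:    resource.MustParse(options.workerCPU),
			corev1.ResourceMemory: resource.MustParse(options.workerMemory),
		},
		Limits: corev1.ResourceList{
			corev1.ResourceCPU:    resource.MustParse(options.workerCPU),
			corev1.ResourceMemory: resource.MustParse(options.workerMemory),
		},
	},
}

Extracting createWorkerGroupSpec out of Run is also what makes the unit test below possible without a live cluster.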
57 changes: 57 additions & 0 deletions kubectl-plugin/pkg/cmd/create/create_workergroup_test.go
@@ -0,0 +1,57 @@
package create

import (
	"testing"

	"github.com/stretchr/testify/assert"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	"k8s.io/utils/ptr"

	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
)

func TestCreateWorkerGroupSpec(t *testing.T) {
	options := &CreateWorkerGroupOptions{
		groupName:         "example-group",
		image:             "DEADBEEF",
		workerReplicas:    3,
		workerMinReplicas: 1,
		workerMaxReplicas: 5,
		workerCPU:         "2",
		workerMemory:      "5Gi",
		workerGPU:         "1",
	}

	expected := rayv1.WorkerGroupSpec{
		RayStartParams: map[string]string{},
		GroupName:      "example-group",
		Template: corev1.PodTemplateSpec{
			Spec: corev1.PodSpec{
				Containers: []corev1.Container{
					{
						Name:  "ray-worker",
						Image: "DEADBEEF",
						Resources: corev1.ResourceRequirements{
							Requests: corev1.ResourceList{
								corev1.ResourceCPU:    resource.MustParse("2"),
								corev1.ResourceMemory: resource.MustParse("5Gi"),
								resourceNvidiaGPU:     resource.MustParse("1"),
							},
							Limits: corev1.ResourceList{
								corev1.ResourceCPU:    resource.MustParse("2"),
								corev1.ResourceMemory: resource.MustParse("5Gi"),
								resourceNvidiaGPU:     resource.MustParse("1"),
							},
						},
					},
				},
			},
		},
		Replicas:    ptr.To[int32](3),
		MinReplicas: ptr.To[int32](1),
		MaxReplicas: ptr.To[int32](5),
	}

	assert.Equal(t, expected, createWorkerGroupSpec(options))
}
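
Assuming the kubectl-plugin module layout shown in the file header above, the new test should run with standard Go tooling, e.g. `go test ./pkg/cmd/create/...` from the kubectl-plugin directory.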
