diff --git a/charts/overrides/kwok/pod-complete.yml b/charts/overrides/kwok/pod-complete.yml new file mode 100644 index 0000000..2a4e8db --- /dev/null +++ b/charts/overrides/kwok/pod-complete.yml @@ -0,0 +1,36 @@ +apiVersion: kwok.x-k8s.io/v1alpha1 +kind: Stage +metadata: + name: pod-complete +spec: + next: + statusTemplate: | + {{ $now := Now }} + {{ $root := . }} + containerStatuses: + {{ range $index, $item := .spec.containers }} + {{ $origin := index $root.status.containerStatuses $index }} + - image: {{ $item.image | Quote }} + name: {{ $item.name | Quote }} + ready: false + restartCount: 0 + started: false + state: + terminated: + exitCode: 0 + finishedAt: {{ $now | Quote }} + reason: Completed + startedAt: {{ $now | Quote }} + {{ end }} + phase: Succeeded + resourceRef: + apiGroup: v1 + kind: Pod + selector: + matchExpressions: + - key: .metadata.deletionTimestamp + operator: DoesNotExist + - key: .status.phase + operator: In + values: + - Running diff --git a/docs/deployment.md b/docs/deployment.md index ae354d3..b1f7c40 100644 --- a/docs/deployment.md +++ b/docs/deployment.md @@ -40,11 +40,16 @@ KWOK_REPO=kubernetes-sigs/kwok KWOK_LATEST_RELEASE="v0.5.2" kubectl apply -f "https://github.com/${KWOK_REPO}/releases/download/${KWOK_LATEST_RELEASE}/kwok.yaml" +``` +Next, deploy and adjust the stages. +```bash kubectl apply -f "https://github.com/${KWOK_REPO}/releases/download/${KWOK_LATEST_RELEASE}/stage-fast.yaml" kubectl apply -f https://github.com/${KWOK_REPO}/raw/main/kustomize/stage/pod/chaos/pod-init-container-running-failed.yaml kubectl apply -f https://github.com/${KWOK_REPO}/raw/main/kustomize/stage/pod/chaos/pod-container-running-failed.yaml + +kubectl apply -f charts/overrides/kwok/pod-complete.yml ``` For configuring virtual nodes, you need to provide the `values.yaml` file to define the type and quantity of nodes you wish to create. You also have the option to enhance node configurations by adding annotations, labels, and conditions. 
For guidance, refer to the [values-example.yaml](../charts/virtual-nodes/values-example.yaml) file. diff --git a/docs/examples/kueue/kueue.md b/docs/examples/kueue/kueue.md new file mode 100644 index 0000000..25e0b98 --- /dev/null +++ b/docs/examples/kueue/kueue.md @@ -0,0 +1,31 @@ +# Example of running Kueue with knavigator + +## Install Kueue + +Install Kueue by following these [instructions](https://kueue.sigs.k8s.io/docs/installation/): + +```bash +VERSION=v0.6.2 +kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/$VERSION/manifests.yaml +kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/$VERSION/prometheus.yaml +``` + +## Deploy cluster and local queues + +```bash +kubectl apply -f docs/examples/kueue/queues.yml +``` + +## Deploy virtual nodes + +In this example we deploy 4 GPU nodes. Refer to [values.yaml](values.yaml) for more details. + +```bash +helm upgrade --install virtual-nodes charts/virtual-nodes -f docs/examples/kueue/values.yaml +``` + +## Run a Kueue job + +```bash +./bin/knavigator -tasks resources/tests/kueue/test-job.yml +``` diff --git a/docs/examples/kueue/queues.yml b/docs/examples/kueue/queues.yml new file mode 100644 index 0000000..9ad7514 --- /dev/null +++ b/docs/examples/kueue/queues.yml @@ -0,0 +1,30 @@ +# cluster-queue.yaml +apiVersion: kueue.x-k8s.io/v1beta1 +kind: ClusterQueue +metadata: + name: "cluster-queue" +spec: + namespaceSelector: {} # match all. 
+ resourceGroups: + - coveredResources: ["cpu", "memory", "nvidia.com/gpu"] + flavors: + - name: "default-flavor" + resources: + - name: "cpu" + nominalQuota: 4 + - name: "memory" + nominalQuota: 36Gi + - name: "nvidia.com/gpu" + nominalQuota: 4 +--- +apiVersion: kueue.x-k8s.io/v1beta1 +kind: ResourceFlavor +metadata: + name: "default-flavor" +--- +apiVersion: kueue.x-k8s.io/v1beta1 +kind: LocalQueue +metadata: + name: team-a-queue +spec: + clusterQueue: cluster-queue diff --git a/docs/examples/kueue/values.yaml b/docs/examples/kueue/values.yaml new file mode 100644 index 0000000..fe75c7b --- /dev/null +++ b/docs/examples/kueue/values.yaml @@ -0,0 +1,30 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +nodes: +- type: dgxa100.80g + count: 4 + annotations: {} + labels: + nvidia.com/gpu.count: "8" + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + conditions: + - message: Filesystem is not read-only + reason: FilesystemIsNotReadOnly + status: "False" + type: ReadonlyFilesystem + - message: kernel has no deadlock + reason: KernelHasNoDeadlock + status: "False" + type: KernelDeadlock diff --git a/docs/getting_started.md b/docs/getting_started.md index e4fc1e2..bfbcd3b 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -58,3 +58,7 @@ Run a test jobset with a driver and workers: ```shell ./bin/knavigator -tasks ./resources/tests/k8s/test-jobset-with-driver.yml ``` + +### Kueue + +Refer to [this document](./examples/kueue/kueue.md) for detailed instructions on how to run the `kueue` system with `knavigator`. diff --git a/resources/templates/kueue/job.yml b/resources/templates/kueue/job.yml new file mode 100644 index 0000000..c19619a --- /dev/null +++ b/resources/templates/kueue/job.yml @@ -0,0 +1,27 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: "{{._NAME_}}" + namespace: {{.namespace}} + labels: + kueue.x-k8s.io/queue-name: {{.queueName}} +spec: + completions: {{.completions}} + parallelism: {{.parallelism}} + completionMode: {{.completionMode}} + template: + spec: + containers: + - name: test + image: {{.image}} + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "{{.cpu}}" + memory: {{.memory}} + nvidia.com/gpu: "{{.gpu}}" + requests: + cpu: "{{.cpu}}" + memory: {{.memory}} + nvidia.com/gpu: "{{.gpu}}" + restartPolicy: Never diff --git a/resources/tests/kueue/test-job.yml b/resources/tests/kueue/test-job.yml new file mode 100644 index 0000000..3e5959e --- /dev/null +++ b/resources/tests/kueue/test-job.yml @@ -0,0 +1,23 @@ +name: test-kueue-job +description: submit and validate a kueue job +tasks: +- id: job + type: SubmitObj + params: + count: 1 + grv: + group: batch + version: v1 + resource: jobs + template: 
"resources/templates/kueue/job.yml" + nameformat: "job{{._ENUM_}}" + overrides: + queueName: team-a-queue + namespace: default + parallelism: 3 + completions: 3 + completionMode: Indexed + image: ubuntu + cpu: 100m + memory: 512M + gpu: 1