diff --git a/README.md b/README.md index 67d8fca..84f9ac0 100644 --- a/README.md +++ b/README.md @@ -53,3 +53,4 @@ Here's a demo showing how to install and configure `Knavigator`, and run an exam - [Getting started](docs/getting_started.md) - [Task management](docs/task_management.md) - [Metrics and Dashboards](docs/metrics.md) +- [Benchmarking](resources/benchmarks/README.md) diff --git a/resources/benchmarks/README.md b/resources/benchmarks/README.md index 1795937..5fef533 100644 --- a/resources/benchmarks/README.md +++ b/resources/benchmarks/README.md @@ -18,36 +18,36 @@ Run:ai requires additional customization and thus has a separate workflow ## Gang Scheduling Benchmark Test -The gang-scheduling benchmark workflow operates on 32 virtual GPU nodes, submitting a burst of 53 jobs with replica numbers ranging from 1 to 32 in a [predetermined order](gang-scheduling/workflows/run-test-common.yml). +The gang-scheduling benchmark workflow operates on 32 virtual GPU nodes, submitting a burst of 53 jobs with replica numbers ranging from 1 to 32 in a [predetermined order](gang-scheduling/workflows/run-test.yaml). #### Example To run the benchmark test for Kueue: ```bash -./bin/knavigator -workflow 'resources/benchmarks/gang-scheduling/workflows/{config-kueue.yml,run-test-common.yml}' +./bin/knavigator -workflow 'resources/benchmarks/gang-scheduling/workflows/{config-kueue.yaml,run-test.yaml}' ``` #### Run:ai ```bash -./bin/knavigator -workflow resources/benchmarks/gang-scheduling/workflows/run-test-runai.yml +./bin/knavigator -workflow resources/benchmarks/gang-scheduling/workflows/runai-test.yaml ``` ## Scaling Benchmark Test -The scaling benchmark workflow operates on 500 virtual GPU nodes, submitting [two workloads](workflows/run-test-common.yml) one after another. The first workload is a job with 500 replicas, the second workload is 500 single node jobs started simultaneously. +The scaling benchmark workflow operates on 500 virtual GPU nodes with two workflows. 
The first [workflow](scaling/workflows/run-test-multi.yaml) submits a job with 500 replicas, the second [workflow](scaling/workflows/run-test-single.yaml) submits a batch of 500 single-node jobs. ### Example To run the benchmark test for Volcano: ```bash -./bin/knavigator -workflow 'resources/benchmarks/scaling/workflows/{config-volcano.yml,run-test-common.yml}' +./bin/knavigator -workflow 'resources/benchmarks/scaling/workflows/{config-nodes.yaml,config-volcano.yaml,run-test-multi.yaml}' ``` ### Run:ai ```bash -./bin/knavigator -workflow resources/benchmarks/scaling/workflows/run-test-runai.yml +./bin/knavigator -workflow 'resources/benchmarks/scaling/workflows/{config-nodes.yaml,config-runai.yaml,runai-test-single.yaml}' ``` diff --git a/resources/benchmarks/gang-scheduling/workflows/config-kueue.yml b/resources/benchmarks/gang-scheduling/workflows/config-kueue.yaml similarity index 91% rename from resources/benchmarks/gang-scheduling/workflows/config-kueue.yml rename to resources/benchmarks/gang-scheduling/workflows/config-kueue.yaml index 11d1189..336cb31 100644 --- a/resources/benchmarks/gang-scheduling/workflows/config-kueue.yml +++ b/resources/benchmarks/gang-scheduling/workflows/config-kueue.yaml @@ -1,21 +1,22 @@ name: config-kueue +description: register, deploy and configure kueue custom resources tasks: - id: register-cluster-queue type: RegisterObj params: - template: "resources/templates/kueue/cluster-queue.yml" + template: "resources/templates/kueue/cluster-queue.yaml" - id: register-local-queue type: RegisterObj params: - template: "resources/templates/kueue/local-queue.yml" + template: "resources/templates/kueue/local-queue.yaml" - id: register-resource-flavor type: RegisterObj params: - template: "resources/templates/kueue/resource-flavor.yml" + template: "resources/templates/kueue/resource-flavor.yaml" - id: register type: RegisterObj params: - template: "resources/benchmarks/templates/kueue/job.yml" + template: 
"resources/benchmarks/templates/kueue/job.yaml" nameFormat: "job{{._ENUM_}}" podNameFormat: "{{._NAME_}}-[0-9]-.*" podCount: "{{.replicas}}" diff --git a/resources/benchmarks/scaling/workflows/config-volcano.yml b/resources/benchmarks/gang-scheduling/workflows/config-volcano.yaml similarity index 84% rename from resources/benchmarks/scaling/workflows/config-volcano.yml rename to resources/benchmarks/gang-scheduling/workflows/config-volcano.yaml index 53673af..11ecd34 100644 --- a/resources/benchmarks/scaling/workflows/config-volcano.yml +++ b/resources/benchmarks/gang-scheduling/workflows/config-volcano.yaml @@ -1,9 +1,10 @@ name: config-volcano +description: register, deploy and configure volcano custom resources tasks: - id: register type: RegisterObj params: - template: "resources/benchmarks/templates/volcano/job.yml" + template: "resources/benchmarks/templates/volcano/job.yaml" nameFormat: "j{{._ENUM_}}" podNameFormat: "{{._NAME_}}-test-[0-9]+" podCount: "{{.replicas}}" diff --git a/resources/benchmarks/gang-scheduling/workflows/config-yunikorn.yml b/resources/benchmarks/gang-scheduling/workflows/config-yunikorn.yaml similarity index 83% rename from resources/benchmarks/gang-scheduling/workflows/config-yunikorn.yml rename to resources/benchmarks/gang-scheduling/workflows/config-yunikorn.yaml index 79a7eeb..905b7f2 100644 --- a/resources/benchmarks/gang-scheduling/workflows/config-yunikorn.yml +++ b/resources/benchmarks/gang-scheduling/workflows/config-yunikorn.yaml @@ -1,9 +1,10 @@ name: config-yunikorn +description: register, deploy and configure yunikorn custom resources tasks: - id: register type: RegisterObj params: - template: "resources/benchmarks/templates/yunikorn/job.yml" + template: "resources/benchmarks/templates/yunikorn/job.yaml" nameFormat: "job{{._ENUM_}}" podNameFormat: "{{._NAME_}}-.*" podCount: "{{.replicas}}" diff --git a/resources/benchmarks/gang-scheduling/workflows/run-test-common.yml 
b/resources/benchmarks/gang-scheduling/workflows/run-test.yaml similarity index 100% rename from resources/benchmarks/gang-scheduling/workflows/run-test-common.yml rename to resources/benchmarks/gang-scheduling/workflows/run-test.yaml diff --git a/resources/benchmarks/gang-scheduling/workflows/run-test-runai.yml b/resources/benchmarks/gang-scheduling/workflows/runai-test.yaml similarity index 99% rename from resources/benchmarks/gang-scheduling/workflows/run-test-runai.yml rename to resources/benchmarks/gang-scheduling/workflows/runai-test.yaml index 7798d4b..a917dda 100644 --- a/resources/benchmarks/gang-scheduling/workflows/run-test-runai.yml +++ b/resources/benchmarks/gang-scheduling/workflows/runai-test.yaml @@ -12,14 +12,14 @@ tasks: - id: register-trainingworkload type: RegisterObj params: - template: "resources/benchmarks/templates/runai/trainingworkload.yml" + template: "resources/benchmarks/templates/runai/trainingworkload.yaml" nameFormat: "twl{{._ENUM_}}" podNameFormat: "{{._NAME_}}-0-0" podCount: 1 - id: register-distributedworkload type: RegisterObj params: - template: "resources/benchmarks/templates/runai/distributedworkload.yml" + template: "resources/benchmarks/templates/runai/distributedworkload.yaml" nameFormat: "dwl{{._ENUM_}}" podNameFormat: "{{._NAME_}}-(launcher-[a-z0-9]+|worker-[0-9]+)" podCount: "{{.workers}} + 1" diff --git a/resources/benchmarks/scaling/workflows/config-kueue.yml b/resources/benchmarks/scaling/workflows/config-kueue.yaml similarity index 91% rename from resources/benchmarks/scaling/workflows/config-kueue.yml rename to resources/benchmarks/scaling/workflows/config-kueue.yaml index 4b70dce..d785d9e 100644 --- a/resources/benchmarks/scaling/workflows/config-kueue.yml +++ b/resources/benchmarks/scaling/workflows/config-kueue.yaml @@ -1,21 +1,22 @@ name: config-kueue +description: register, deploy and configure kueue custom resources tasks: - id: register-cluster-queue type: RegisterObj params: - template: 
"resources/templates/kueue/cluster-queue.yml" + template: "resources/templates/kueue/cluster-queue.yaml" - id: register-local-queue type: RegisterObj params: - template: "resources/templates/kueue/local-queue.yml" + template: "resources/templates/kueue/local-queue.yaml" - id: register-resource-flavor type: RegisterObj params: - template: "resources/templates/kueue/resource-flavor.yml" + template: "resources/templates/kueue/resource-flavor.yaml" - id: register type: RegisterObj params: - template: "resources/benchmarks/templates/kueue/job.yml" + template: "resources/benchmarks/templates/kueue/job.yaml" nameFormat: "job{{._ENUM_}}" podNameFormat: "{{._NAME_}}-[0-9]-.*" podCount: "{{.replicas}}" diff --git a/resources/benchmarks/scaling/workflows/config-nodes.yaml b/resources/benchmarks/scaling/workflows/config-nodes.yaml new file mode 100644 index 0000000..65eb85f --- /dev/null +++ b/resources/benchmarks/scaling/workflows/config-nodes.yaml @@ -0,0 +1,12 @@ +name: config-nodes +description: create 500 virtual GPU nodes +tasks: +- id: configure + type: Configure + params: + nodes: + - type: dgxa100.80g + count: 500 + labels: + nvidia.com/gpu.count: "8" + timeout: 5m diff --git a/resources/benchmarks/scaling/workflows/config-runai.yaml b/resources/benchmarks/scaling/workflows/config-runai.yaml new file mode 100644 index 0000000..f7cf038 --- /dev/null +++ b/resources/benchmarks/scaling/workflows/config-runai.yaml @@ -0,0 +1,17 @@ +name: config-runai +description: register, deploy and configure run:ai custom resources +tasks: +- id: register-trainingworkload + type: RegisterObj + params: + template: "resources/benchmarks/templates/runai/trainingworkload.yaml" + nameFormat: "twl{{._ENUM_}}" + podNameFormat: "{{._NAME_}}-0-0" + podCount: 1 +- id: register-mpi + type: RegisterObj + params: + template: "resources/benchmarks/templates/runai/mpijob.yaml" + nameFormat: "mpijob{{._ENUM_}}" + podNameFormat: "{{._NAME_}}-(launcher-[a-z0-9]+|worker-[0-9]+)" + podCount: "{{.workers}} 
+ 1" diff --git a/resources/benchmarks/gang-scheduling/workflows/config-volcano.yml b/resources/benchmarks/scaling/workflows/config-volcano.yaml similarity index 84% rename from resources/benchmarks/gang-scheduling/workflows/config-volcano.yml rename to resources/benchmarks/scaling/workflows/config-volcano.yaml index 53673af..11ecd34 100644 --- a/resources/benchmarks/gang-scheduling/workflows/config-volcano.yml +++ b/resources/benchmarks/scaling/workflows/config-volcano.yaml @@ -1,9 +1,10 @@ name: config-volcano +description: register, deploy and configure volcano custom resources tasks: - id: register type: RegisterObj params: - template: "resources/benchmarks/templates/volcano/job.yml" + template: "resources/benchmarks/templates/volcano/job.yaml" nameFormat: "j{{._ENUM_}}" podNameFormat: "{{._NAME_}}-test-[0-9]+" podCount: "{{.replicas}}" diff --git a/resources/benchmarks/scaling/workflows/config-yunikorn.yml b/resources/benchmarks/scaling/workflows/config-yunikorn.yaml similarity index 83% rename from resources/benchmarks/scaling/workflows/config-yunikorn.yml rename to resources/benchmarks/scaling/workflows/config-yunikorn.yaml index aee3fb9..cc79a3d 100644 --- a/resources/benchmarks/scaling/workflows/config-yunikorn.yml +++ b/resources/benchmarks/scaling/workflows/config-yunikorn.yaml @@ -1,9 +1,10 @@ name: config-yunikorn +description: register, deploy and configure yunikorn custom resources tasks: - id: register type: RegisterObj params: - template: "resources/benchmarks/templates/yunikorn/job.yml" + template: "resources/benchmarks/templates/yunikorn/job.yaml" nameFormat: "job{{._ENUM_}}" podNameFormat: "{{._NAME_}}-.*" podCount: "{{.replicas}}" diff --git a/resources/benchmarks/scaling/workflows/run-test-common.yml b/resources/benchmarks/scaling/workflows/run-test-common.yml deleted file mode 100644 index 583c68e..0000000 --- a/resources/benchmarks/scaling/workflows/run-test-common.yml +++ /dev/null @@ -1,31 +0,0 @@ -name: test-scaling -tasks: -- id: 
configure - type: Configure - params: - nodes: - - type: dgxa100.80g - count: 500 - labels: - nvidia.com/gpu.count: "8" - timeout: 5m -- id: sleep - type: Sleep - params: - timeout: 5s -- id: job1 - type: SubmitObj - params: - refTaskId: register - count: 500 - params: - replicas: 1 - ttl: 2m -- id: job2 - type: SubmitObj - params: - refTaskId: register - count: 1 - params: - replicas: 500 - ttl: 2m diff --git a/resources/benchmarks/scaling/workflows/run-test-multi.yaml b/resources/benchmarks/scaling/workflows/run-test-multi.yaml new file mode 100644 index 0000000..818a1f8 --- /dev/null +++ b/resources/benchmarks/scaling/workflows/run-test-multi.yaml @@ -0,0 +1,11 @@ +name: test-scaling-multi-node-job +description: deploy a 500-replicas job +tasks: +- id: job + type: SubmitObj + params: + refTaskId: register + count: 1 + params: + replicas: 500 + ttl: 2m diff --git a/resources/benchmarks/scaling/workflows/run-test-runai.yml b/resources/benchmarks/scaling/workflows/run-test-runai.yml deleted file mode 100644 index 4eed902..0000000 --- a/resources/benchmarks/scaling/workflows/run-test-runai.yml +++ /dev/null @@ -1,47 +0,0 @@ -name: test-scaling -tasks: -- id: configure - type: Configure - params: - nodes: - - type: dgxa100.80g - count: 500 - labels: - nvidia.com/gpu.count: "8" - timeout: 5m -- id: sleep - type: Sleep - params: - timeout: 5s -- id: register-trainingworkload - type: RegisterObj - params: - template: "resources/benchmarks/templates/runai/trainingworkload.yml" - nameFormat: "twl{{._ENUM_}}" - podNameFormat: "{{._NAME_}}-0-0" - podCount: 1 -- id: register-distributedworkload - type: RegisterObj - params: - template: "resources/benchmarks/templates/runai/distributedworkload.yml" - nameFormat: "dwl{{._ENUM_}}" - podNameFormat: "{{._NAME_}}-(launcher-[a-z0-9]+|worker-[0-9]+)" - podCount: "{{.workers}} + 1" -# -### Benchmark test -# -- id: job1 - type: SubmitObj - params: - refTaskId: register-trainingworkload - count: 500 - params: - ttl: 2m -- id: job2 - 
type: SubmitObj - params: - refTaskId: register-distributedworkload - count: 1 - params: - workers: 499 - ttl: 2m diff --git a/resources/benchmarks/scaling/workflows/run-test-single.yaml b/resources/benchmarks/scaling/workflows/run-test-single.yaml new file mode 100644 index 0000000..ff5e42e --- /dev/null +++ b/resources/benchmarks/scaling/workflows/run-test-single.yaml @@ -0,0 +1,11 @@ +name: test-scaling-single-node-jobs +description: deploy 500 single-replica jobs +tasks: +- id: job + type: SubmitObj + params: + refTaskId: register + count: 500 + params: + replicas: 1 + ttl: 2m diff --git a/resources/benchmarks/scaling/workflows/runai-test-multi.yaml b/resources/benchmarks/scaling/workflows/runai-test-multi.yaml new file mode 100644 index 0000000..5c45bc7 --- /dev/null +++ b/resources/benchmarks/scaling/workflows/runai-test-multi.yaml @@ -0,0 +1,11 @@ +name: test-scaling +description: deploy a 500-replicas job +tasks: +- id: job + type: SubmitObj + params: + refTaskId: register-mpi + count: 1 + params: + workers: 499 + ttl: 2m diff --git a/resources/benchmarks/scaling/workflows/runai-test-single.yaml b/resources/benchmarks/scaling/workflows/runai-test-single.yaml new file mode 100644 index 0000000..a8a530b --- /dev/null +++ b/resources/benchmarks/scaling/workflows/runai-test-single.yaml @@ -0,0 +1,10 @@ +name: test-scaling +description: deploy 500 single-replica jobs +tasks: +- id: job + type: SubmitObj + params: + refTaskId: register-trainingworkload + count: 500 + params: + ttl: 2m diff --git a/resources/benchmarks/templates/kueue/job.yml b/resources/benchmarks/templates/kueue/job.yaml similarity index 100% rename from resources/benchmarks/templates/kueue/job.yml rename to resources/benchmarks/templates/kueue/job.yaml diff --git a/resources/benchmarks/templates/runai/distributedworkload.yml b/resources/benchmarks/templates/runai/distributedworkload.yaml similarity index 100% rename from resources/benchmarks/templates/runai/distributedworkload.yml rename to 
resources/benchmarks/templates/runai/distributedworkload.yaml diff --git a/resources/benchmarks/templates/runai/mpijob.yaml b/resources/benchmarks/templates/runai/mpijob.yaml new file mode 100644 index 0000000..34ffd34 --- /dev/null +++ b/resources/benchmarks/templates/runai/mpijob.yaml @@ -0,0 +1,49 @@ +apiVersion: kubeflow.org/v2beta1 +kind: MPIJob +metadata: + name: "{{._NAME_}}" + namespace: runai- + labels: + project: + runai/queue: +spec: + slotsPerWorker: 1 + runPolicy: + cleanPodPolicy: Running + mpiReplicaSpecs: + Launcher: + replicas: 1 + template: + metadata: + annotations: + pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}} + pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}} + spec: + schedulerName: runai-scheduler + containers: + - image: runai/mpi-launcher:latest + name: mpi-launcher + resources: + limits: + cpu: 100m + memory: 250M + nvidia.com/gpu: 8 + Worker: + replicas: {{.workers}} + template: + metadata: + annotations: + pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}} + pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}} + labels: + app: {{._NAME_}} + spec: + schedulerName: runai-scheduler + containers: + - image: runai/mpi-worker:latest + name: mpi-worker + resources: + limits: + cpu: 100m + memory: 250M + nvidia.com/gpu: 8 diff --git a/resources/benchmarks/templates/runai/trainingworkload.yml b/resources/benchmarks/templates/runai/trainingworkload.yaml similarity index 100% rename from resources/benchmarks/templates/runai/trainingworkload.yml rename to resources/benchmarks/templates/runai/trainingworkload.yaml diff --git a/resources/benchmarks/templates/volcano/job.yml b/resources/benchmarks/templates/volcano/job.yaml similarity index 100% rename from resources/benchmarks/templates/volcano/job.yml rename to resources/benchmarks/templates/volcano/job.yaml diff --git a/resources/benchmarks/templates/yunikorn/job.yml b/resources/benchmarks/templates/yunikorn/job.yaml similarity index 100% rename from 
resources/benchmarks/templates/yunikorn/job.yml rename to resources/benchmarks/templates/yunikorn/job.yaml diff --git a/resources/templates/kueue/cluster-queue.yml b/resources/templates/kueue/cluster-queue.yaml similarity index 100% rename from resources/templates/kueue/cluster-queue.yml rename to resources/templates/kueue/cluster-queue.yaml diff --git a/resources/templates/kueue/job.yml b/resources/templates/kueue/job.yaml similarity index 100% rename from resources/templates/kueue/job.yml rename to resources/templates/kueue/job.yaml diff --git a/resources/templates/kueue/local-queue.yml b/resources/templates/kueue/local-queue.yaml similarity index 100% rename from resources/templates/kueue/local-queue.yml rename to resources/templates/kueue/local-queue.yaml diff --git a/resources/templates/kueue/resource-flavor.yml b/resources/templates/kueue/resource-flavor.yaml similarity index 100% rename from resources/templates/kueue/resource-flavor.yml rename to resources/templates/kueue/resource-flavor.yaml diff --git a/resources/workflows/kueue/test-job.yml b/resources/workflows/kueue/test-job.yml index aaac626..88b3a49 100644 --- a/resources/workflows/kueue/test-job.yml +++ b/resources/workflows/kueue/test-job.yml @@ -18,19 +18,19 @@ tasks: - id: register-cluster-queue type: RegisterObj params: - template: "resources/templates/kueue/cluster-queue.yml" + template: "resources/templates/kueue/cluster-queue.yaml" - id: register-local-queue type: RegisterObj params: - template: "resources/templates/kueue/local-queue.yml" + template: "resources/templates/kueue/local-queue.yaml" - id: register-resource-flavor type: RegisterObj params: - template: "resources/templates/kueue/resource-flavor.yml" + template: "resources/templates/kueue/resource-flavor.yaml" - id: register-job type: RegisterObj params: - template: "resources/templates/kueue/job.yml" + template: "resources/templates/kueue/job.yaml" nameFormat: "job{{._ENUM_}}" podNameFormat: "{{._NAME_}}-[0-9]-.*" podCount: 
"{{.parallelism}}" diff --git a/resources/workflows/kueue/test-preemption.yml b/resources/workflows/kueue/test-preemption.yml index 28af7f9..64f1d38 100644 --- a/resources/workflows/kueue/test-preemption.yml +++ b/resources/workflows/kueue/test-preemption.yml @@ -18,19 +18,19 @@ tasks: - id: register-cluster-queue type: RegisterObj params: - template: "resources/templates/kueue/cluster-queue.yml" + template: "resources/templates/kueue/cluster-queue.yaml" - id: register-local-queue type: RegisterObj params: - template: "resources/templates/kueue/local-queue.yml" + template: "resources/templates/kueue/local-queue.yaml" - id: register-resource-flavor type: RegisterObj params: - template: "resources/templates/kueue/resource-flavor.yml" + template: "resources/templates/kueue/resource-flavor.yaml" - id: register-job type: RegisterObj params: - template: "resources/templates/kueue/job.yml" + template: "resources/templates/kueue/job.yaml" nameFormat: "job{{._ENUM_}}" podNameFormat: "{{._NAME_}}-[0-9]-.*" podCount: "{{.parallelism}}" diff --git a/tests/ci/test_jobset.sh b/tests/ci/test_jobset.sh index 0c8ca8e..f2ce4f1 100755 --- a/tests/ci/test_jobset.sh +++ b/tests/ci/test_jobset.sh @@ -12,4 +12,4 @@ deploy_kwok deploy_jobset # Run knavigator with an example test -${REPO_HOME}/bin/knavigator -workflow ${REPO_HOME}/resources/workflows/k8s/test-jobset.yml -cleanup +${REPO_HOME}/bin/knavigator -workflow ${REPO_HOME}/resources/workflows/k8s/test-jobset.yml -v 4 -cleanup