added network topology test for Run:ai (#104)
Signed-off-by: Dmitry Shmulevich <[email protected]>
dmitsh authored Aug 22, 2024
1 parent a3ac37c commit da361e4
Showing 9 changed files with 448 additions and 3 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,3 +1,4 @@
/bin
/coverage.out
.DS_Store

Binary file added docs/assets/network-aware-scheduling.png
21 changes: 19 additions & 2 deletions resources/benchmarks/README.md
@@ -28,7 +28,7 @@ To run the benchmark test for Kueue:
./bin/knavigator -workflow 'resources/benchmarks/gang-scheduling/workflows/{config-nodes.yaml,config-kueue.yaml,run-test.yaml}'
```

#### Run:ai
To run the benchmark test for Run:ai:

```bash
./bin/knavigator -workflow 'resources/benchmarks/gang-scheduling/workflows/{config-nodes.yaml,runai-test.yaml}'
@@ -46,8 +46,25 @@ To run the benchmark test for Volcano:
./bin/knavigator -workflow 'resources/benchmarks/scaling/workflows/{config-nodes.yaml,config-volcano.yaml,run-test-multi.yaml}'
```

### Run:ai
To run the benchmark test for Run:ai:

```bash
./bin/knavigator -workflow 'resources/benchmarks/scaling/workflows/{config-nodes.yaml,config-runai.yaml,runai-test-single.yaml}'
```

## Network Topology Benchmark Test

The network topology benchmark workflow runs on 12 virtual GPU nodes, arranged to simulate a tree-like network topology.
Out of these, 5 nodes are marked as busy, leaving 7 nodes available. The workflow submits a job with 3 replicas.

From a network connectivity standpoint, the optimal assignment would be nodes n5, n7, and n8, as shown in the following diagram.

![network aware scheduling](../../docs/assets/network-aware-scheduling.png)
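Why n5, n7, and n8 are the best fit can be checked with a short placement sketch (illustrative only, not part of this change): score every trio of free nodes by summing, for each pair, the lowest network layer at which the two nodes share a switch, and take the trio with the smallest total distance. The switch labels below are copied from `config-nodes.yaml`.

```python
from itertools import combinations

# Node -> (net-layer-1, net-layer-2, net-layer-3) switch labels from config-nodes.yaml
nodes = {
    "n1": ("sw11", "sw21", "sw31"), "n2":  ("sw11", "sw21", "sw31"),
    "n3": ("sw12", "sw21", "sw31"), "n4":  ("sw12", "sw21", "sw31"),
    "n5": ("sw13", "sw22", "sw31"), "n6":  ("sw13", "sw22", "sw31"),
    "n7": ("sw14", "sw22", "sw31"), "n8":  ("sw14", "sw22", "sw31"),
    "n9": ("sw15", "sw23", "sw31"), "n10": ("sw15", "sw23", "sw31"),
    "n11": ("sw16", "sw23", "sw31"), "n12": ("sw16", "sw23", "sw31"),
}
busy = {"n1", "n3", "n6", "n11", "n12"}  # nodes marked unschedulable in the workflow

def distance(a: str, b: str) -> int:
    # 1, 2, or 3: the lowest switch layer the two nodes share
    for layer in range(3):
        if nodes[a][layer] == nodes[b][layer]:
            return layer + 1
    return 4  # disjoint trees (cannot happen in this topology)

def cost(placement) -> int:
    # Total pairwise network distance of a candidate placement
    return sum(distance(a, b) for a, b in combinations(placement, 2))

free = [n for n in nodes if n not in busy]
best = min(combinations(free, 3), key=cost)
print(sorted(best), cost(best))  # -> ['n5', 'n7', 'n8'] 5
```

n7 and n8 sit under the same layer-1 switch (sw14), and n5 shares layer-2 switch sw22 with both, so no other trio of free nodes gets a lower total distance.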

### Example

To run the benchmark test for Run:ai:

```bash
./bin/knavigator -workflow 'resources/benchmarks/nwtopo/workflows/{config-nodes.yaml,runai-test.yaml}'
```
81 changes: 81 additions & 0 deletions resources/benchmarks/nwtopo/templates/runai/mpijob.yaml
@@ -0,0 +1,81 @@
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
name: {{._NAME_}}
namespace: runai-<RUNAI_PROJECT>
labels:
project: <RUNAI_PROJECT>
runai/queue: <RUNAI_PROJECT>
spec:
slotsPerWorker: 1
runPolicy:
cleanPodPolicy: Running
mpiReplicaSpecs:
Launcher:
replicas: 1
template:
metadata:
annotations:
pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}}
pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}}
labels:
app: {{._NAME_}}
spec:
schedulerName: runai-scheduler
containers:
- image: runai/mpi-launcher:latest
name: mpi-launcher
resources:
limits:
cpu: 100m
memory: 250M
nvidia.com/gpu: 8
Worker:
replicas: {{.workers}}
template:
metadata:
annotations:
pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}}
pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}}
labels:
app: {{._NAME_}}
spec:
affinity:
podAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 20
podAffinityTerm:
labelSelector:
matchExpressions:
- key: app
operator: In
values:
- {{._NAME_}}
topologyKey: net-layer-3
- weight: 70
podAffinityTerm:
labelSelector:
matchExpressions:
- key: app
operator: In
values:
- {{._NAME_}}
topologyKey: net-layer-2
- weight: 90
podAffinityTerm:
labelSelector:
matchExpressions:
- key: app
operator: In
values:
- {{._NAME_}}
topologyKey: net-layer-1
schedulerName: runai-scheduler
containers:
- image: runai/mpi-worker:latest
name: mpi-worker
resources:
limits:
cpu: 100m
memory: 250M
nvidia.com/gpu: 8
114 changes: 114 additions & 0 deletions resources/benchmarks/nwtopo/workflows/config-combo-network-aware.yaml
@@ -0,0 +1,114 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: config-kueue
tasks:
- id: register-cluster-queue
type: RegisterObj
params:
template: "resources/templates/kueue/cluster-queue.yaml"
- id: register-local-queue
type: RegisterObj
params:
template: "resources/templates/kueue/local-queue.yaml"
- id: register-resource-flavor
type: RegisterObj
params:
template: "resources/templates/kueue/resource-flavor.yaml"
- id: register
type: RegisterObj
params:
template: "resources/benchmarks/templates/jobset/jobset-coscheduling.yaml"
nameFormat: "jobset{{._ENUM_}}"
podNameFormat: "{{._NAME_}}-workers-[0-9]+-[0-9]+-.+"
podCount: "{{.replicas}}"
- id: create-resource-flavor
type: SubmitObj
params:
refTaskId: register-resource-flavor
canExist: true
params:
name: "gpu-node"
nodeLabels:
nvidia.com/gpu.count: "8"
- id: create-cluster-queue
type: SubmitObj
params:
refTaskId: register-cluster-queue
canExist: true
params:
name: team
flavor: gpu-node
cpu: 8
memory: 36Gi
pods: 32
gpu: 256
- id: create-local-queue
type: SubmitObj
params:
refTaskId: register-local-queue
canExist: true
params:
name: team-queue
namespace: default
clusterQueue: team
- id: configure
type: Configure
params:
configmaps:
- name: scheduler-config
namespace: scheduler-plugins
op: create
data:
scheduler-config.yaml: |
apiVersion: kubescheduler.config.k8s.io/v1
kind: KubeSchedulerConfiguration
leaderElection:
leaderElect: false
profiles:
- schedulerName: scheduler-plugins-scheduler
plugins:
multiPoint:
enabled:
- name: Coscheduling
- name: CapacityScheduling
- name: NetworkOverhead
weight: 5
disabled:
- name: NodeResourcesFit
queueSort:
enabled:
- name: TopologicalSort
disabled:
- name: "*"
pluginConfig:
- name: Coscheduling
args:
permitWaitingTimeSeconds: 10
- name: TopologicalSort
args:
namespaces:
- "default"
- name: NetworkOverhead
args:
namespaces:
- "default"
weightsName: "UserDefined"
networkTopologyName: "net-topology-test"
deploymentRestarts:
- namespace: scheduler-plugins
name: scheduler-plugins-controller
- namespace: scheduler-plugins
name: scheduler-plugins-scheduler
timeout: 2m
145 changes: 145 additions & 0 deletions resources/benchmarks/nwtopo/workflows/config-nodes.yaml
@@ -0,0 +1,145 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: config-nw-topo-nodes
description: |
Create a 12-node cluster with a tree-like network topology
and mark 5 nodes as busy:
__________ sw31 __________
/ | \
sw21 sw22 sw23
/ \ / \ / \
sw11 sw12 sw13 sw14 sw15 sw16
/\ /\ /\ /\ /\ /\
n1 n2 n3 n4 n5 n6 n7 n8 n9 n10 n11 n12
x x x x x
Then deploy a 3-replica job. The optimal nodes from the
network topology perspective for this job are nodes n5, n7, n8.
tasks:
- id: configure
type: Configure
params:
nodes:
- type: dgxa100.80g
count: 1
labels:
node-id: n1
net-layer-1: sw11
net-layer-2: sw21
net-layer-3: sw31
nvidia.com/gpu.count: "8"
- type: dgxa100.80g
count: 1
labels:
node-id: n2
net-layer-1: sw11
net-layer-2: sw21
net-layer-3: sw31
nvidia.com/gpu.count: "8"
- type: dgxa100.80g
count: 1
labels:
node-id: n3
net-layer-1: sw12
net-layer-2: sw21
net-layer-3: sw31
nvidia.com/gpu.count: "8"
- type: dgxa100.80g
count: 1
labels:
node-id: n4
net-layer-1: sw12
net-layer-2: sw21
net-layer-3: sw31
nvidia.com/gpu.count: "8"
- type: dgxa100.80g
count: 1
labels:
node-id: n5
net-layer-1: sw13
net-layer-2: sw22
net-layer-3: sw31
net-optimal: true
nvidia.com/gpu.count: "8"
- type: dgxa100.80g
count: 1
labels:
node-id: n6
net-layer-1: sw13
net-layer-2: sw22
net-layer-3: sw31
nvidia.com/gpu.count: "8"
- type: dgxa100.80g
count: 1
labels:
node-id: n7
net-layer-1: sw14
net-layer-2: sw22
net-layer-3: sw31
net-optimal: true
nvidia.com/gpu.count: "8"
- type: dgxa100.80g
count: 1
labels:
node-id: n8
net-layer-1: sw14
net-layer-2: sw22
net-layer-3: sw31
net-optimal: true
nvidia.com/gpu.count: "8"
- type: dgxa100.80g
count: 1
labels:
node-id: n9
net-layer-1: sw15
net-layer-2: sw23
net-layer-3: sw31
nvidia.com/gpu.count: "8"
- type: dgxa100.80g
count: 1
labels:
node-id: n10
net-layer-1: sw15
net-layer-2: sw23
net-layer-3: sw31
nvidia.com/gpu.count: "8"
- type: dgxa100.80g
count: 1
labels:
node-id: n11
net-layer-1: sw16
net-layer-2: sw23
net-layer-3: sw31
nvidia.com/gpu.count: "8"
- type: dgxa100.80g
count: 1
labels:
node-id: n12
net-layer-1: sw16
net-layer-2: sw23
net-layer-3: sw31
nvidia.com/gpu.count: "8"
timeout: 5m
- id: update
type: UpdateNodes
params:
selectors:
- node-id: n1
- node-id: n3
- node-id: n6
- node-id: n11
- node-id: n12
state:
spec:
unschedulable: true
