Skip to content

Commit

Permalink
add topology workflow for jobset
Browse files: browse the repository at this point in the history
Signed-off-by: Dmitry Shmulevich <[email protected]>
  • Loading branch information
dmitsh committed Oct 31, 2024
1 parent 7878a12 commit c75f6cc
Show file tree
Hide file tree
Showing 2 changed files with 101 additions and 0 deletions.
78 changes: 78 additions & 0 deletions resources/benchmarks/nwtopo/templates/jobset/jobset.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# JobSet manifest template for the nwtopo benchmark.
# The {{...}} placeholders (._NAME_, .replicas, .ttl) are substituted before
# the result is parsed as YAML (syntax looks like Go templates - confirm
# against the consumer).
apiVersion: jobset.x-k8s.io/v1alpha2
kind: JobSet
metadata:
  name: "{{._NAME_}}"
  namespace: default
spec:
  # Declare the JobSet successful once all jobs of the "workers" replicated
  # job have finished ("workers" is the only replicated job defined here).
  successPolicy:
    operator: All
    targetReplicatedJobs:
      - workers
  replicatedJobs:
    - name: workers
      replicas: 1
      template:
        spec:
          backoffLimit: 0
          # Intentionally unquoted: these must render to integers.
          completions: {{.replicas}}
          parallelism: {{.replicas}}
          completionMode: NonIndexed
          template:
            metadata:
              labels:
                # Quoted so the rendered value is always a string; this label
                # is what the pod-affinity selectors below match on.
                app: "{{._NAME_}}"
              annotations:
                # NOTE(review): presumably read by KWOK stages to simulate pod
                # completion after .ttl - confirm against the stage config.
                pod-complete.stage.kwok.x-k8s.io/delay: "{{.ttl}}"
                pod-complete.stage.kwok.x-k8s.io/jitter-delay: "{{.ttl}}"
            spec:
              schedulerName: default-scheduler
              affinity:
                podAffinity:
                  # Soft preferences only: co-locate with pods of the same
                  # JobSet, weighting net-layer-1 (90) above net-layer-2 (70).
                  preferredDuringSchedulingIgnoredDuringExecution:
                    - weight: 70
                      podAffinityTerm:
                        labelSelector:
                          matchExpressions:
                            - key: app
                              operator: In
                              values:
                                - "{{._NAME_}}"
                        topologyKey: net-layer-2
                    - weight: 90
                      podAffinityTerm:
                        labelSelector:
                          matchExpressions:
                            - key: app
                              operator: In
                              values:
                                - "{{._NAME_}}"
                        topologyKey: net-layer-1
              containers:
                - name: test
                  image: ubuntu
                  imagePullPolicy: IfNotPresent
                  resources:
                    # Requests equal limits for every resource.
                    limits:
                      cpu: 100m
                      memory: 250M
                      nvidia.com/gpu: "8"
                    requests:
                      cpu: 100m
                      memory: 250M
                      nvidia.com/gpu: "8"
23 changes: 23 additions & 0 deletions resources/benchmarks/nwtopo/workflows/config-jobset.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Workflow that registers the JobSet template of the nwtopo benchmark.
name: config-jobset
tasks:
  - id: register
    type: RegisterObj
    params:
      # Path of the JobSet manifest template to register.
      template: 'resources/benchmarks/nwtopo/templates/jobset/jobset.yaml'
      # Name pattern for created objects: "jobset" followed by ._ENUM_
      # (presumably a per-object counter - confirm against the task docs).
      nameFormat: 'jobset{{._ENUM_}}'
      # Pattern for the names of pods belonging to the object's "workers" jobs.
      podNameFormat: '{{._NAME_}}-workers-[0-9]+-.+'
      # Pod count per object; taken from the .replicas template parameter.
      podCount: '{{.replicas}}'

0 comments on commit c75f6cc

Please sign in to comment.