forked from project-codeflare/codeflare-sdk
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbase-template.yaml
335 lines (335 loc) · 14.8 KB
/
base-template.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
apiVersion: workload.codeflare.dev/v1beta1
kind: AppWrapper
metadata:
name: aw-kuberay
namespace: default
#new addition
labels:
orderedinstance: "m4.xlarge_g4dn.xlarge"
spec:
priority: 9
resources:
Items: []
GenericItems:
- replicas: 1
#new addition
custompodresources:
- replicas: 1
requests:
cpu: 2
memory: 8G
nvidia.com/gpu: 0
limits:
cpu: 2
memory: 8G
nvidia.com/gpu: 0
- replicas: 3
requests:
cpu: 2
memory: 12G
nvidia.com/gpu: 1
limits:
cpu: 2
memory: 12G
nvidia.com/gpu: 1
generictemplate:
# This config demonstrates KubeRay's Ray autoscaler integration.
# The resource requests and limits in this config are too small for production!
# For an example with more realistic resource configuration, see
# ray-cluster.autoscaler.large.yaml.
apiVersion: ray.io/v1alpha1
kind: RayCluster
metadata:
labels:
appwrapper.mcad.ibm.com: "aw-kuberay"
controller-tools.k8s.io: "1.0"
# A unique identifier for the head node and workers of this cluster.
name: kuberay-cluster
# finalizers:
# - kubernetes
spec:
# The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
rayVersion: '2.7.0'
# If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod.
# Ray autoscaler integration is supported only for Ray versions >= 1.11.0
# Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0.
enableInTreeAutoscaling: false
# autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler.
# The example configuration shown below below represents the DEFAULT values.
# (You may delete autoscalerOptions if the defaults are suitable.)
autoscalerOptions:
# upscalingMode is "Default" or "Aggressive."
# Conservative: Upscaling is rate-limited; the number of pending worker pods is at most the size of the Ray cluster.
# Default: Upscaling is not rate-limited.
# Aggressive: An alias for Default; upscaling is not rate-limited.
upscalingMode: Default
# idleTimeoutSeconds is the number of seconds to wait before scaling down a worker pod which is not using Ray resources.
idleTimeoutSeconds: 60
# image optionally overrides the autoscaler's container image.
# If instance.spec.rayVersion is at least "2.0.0", the autoscaler will default to the same image as
# the ray container. For older Ray versions, the autoscaler will default to using the Ray 2.0.0 image.
## image: "my-repo/my-custom-autoscaler-image:tag"
# imagePullPolicy optionally overrides the autoscaler container's image pull policy.
imagePullPolicy: Always
# resources specifies optional resource request and limit overrides for the autoscaler container.
# For large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required.
resources:
limits:
cpu: "500m"
memory: "512Mi"
requests:
cpu: "500m"
memory: "512Mi"
######################headGroupSpec#################################
# head group template and specs, (perhaps 'group' is not needed in the name)
headGroupSpec:
# Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
serviceType: ClusterIP
# logical group name, for this called head-group, also can be functional
# pod type head or worker
# rayNodeType: head # Not needed since it is under the headgroup
# the following params are used to complete the ray start: ray start --head --block ...
rayStartParams:
# Flag "no-monitor" will be automatically set when autoscaling is enabled.
dashboard-host: '0.0.0.0'
block: 'true'
# num-cpus: '1' # can be auto-completed from the limits
# Use `resources` to optionally specify custom resource annotations for the Ray node.
# The value of `resources` is a string-integer mapping.
# Currently, `resources` must be provided in the specific format demonstrated below:
# resources: '"{\"Custom1\": 1, \"Custom2\": 5}"'
num-gpus: '0'
#pod template
template:
spec:
#new addition
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: aw-kuberay
operator: In
values:
- "aw-kuberay"
containers:
# The Ray head pod
- env:
- name: MY_POD_IP
valueFrom:
fieldRef:
fieldPath: status.podIP
- name: RAY_USE_TLS
value: "0"
- name: RAY_TLS_SERVER_CERT
value: /home/ray/workspace/tls/server.crt
- name: RAY_TLS_SERVER_KEY
value: /home/ray/workspace/tls/server.key
- name: RAY_TLS_CA_CERT
value: /home/ray/workspace/tls/ca.crt
name: ray-head
image: rayproject/ray:latest
imagePullPolicy: Always
ports:
- containerPort: 6379
name: gcs
- containerPort: 8265
name: dashboard
- containerPort: 10001
name: client
lifecycle:
preStop:
exec:
command: ["/bin/sh","-c","ray stop"]
resources:
limits:
cpu: 2
memory: "8G"
nvidia.com/gpu: 0
requests:
cpu: 2
memory: "8G"
nvidia.com/gpu: 0
volumeMounts:
- name: ca-vol
mountPath: "/home/ray/workspace/ca"
readOnly: true
- name: server-cert
mountPath: "/home/ray/workspace/tls"
readOnly: true
initContainers:
- command:
- sh
- -c
- cd /home/ray/workspace/tls && openssl req -nodes -newkey rsa:2048 -keyout server.key -out server.csr -subj '/CN=ray-head' && printf "authorityKeyIdentifier=keyid,issuer\nbasicConstraints=CA:FALSE\nsubjectAltName = @alt_names\n[alt_names]\nDNS.1 = 127.0.0.1\nDNS.2 = localhost\nDNS.3 = ${FQ_RAY_IP}\nDNS.4 = $(awk 'END{print $1}' /etc/hosts)\nDNS.5 = rayclient-deployment-name-$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).server-name">./domain.ext && cp /home/ray/workspace/ca/* . && openssl x509 -req -CA ca.crt -CAkey ca.key -in server.csr -out server.crt -days 365 -CAcreateserial -extfile domain.ext
image: rayproject/ray:2.7.0
name: create-cert
# securityContext:
# runAsUser: 1000
# runAsGroup: 1000
volumeMounts:
- name: ca-vol
mountPath: "/home/ray/workspace/ca"
readOnly: true
- name: server-cert
mountPath: "/home/ray/workspace/tls"
readOnly: false
volumes:
- name: ca-vol
secret:
secretName: ca-secret-deployment-name
optional: false
- name: server-cert
emptyDir: {}
workerGroupSpecs:
# the pod replicas in this group typed worker
- replicas: 3
minReplicas: 3
maxReplicas: 3
# logical group name, for this called small-group, also can be functional
groupName: small-group
# if worker pods need to be added, we can simply increment the replicas
# if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list
# the operator will remove pods from the list until the number of replicas is satisfied
# when a pod is confirmed to be deleted, its name will be removed from the list below
#scaleStrategy:
# workersToDelete:
# - raycluster-complete-worker-small-group-bdtwh
# - raycluster-complete-worker-small-group-hv457
# - raycluster-complete-worker-small-group-k8tj7
# the following params are used to complete the ray start: ray start --block ...
rayStartParams:
block: 'true'
num-gpus: 1
#pod template
template:
metadata:
labels:
key: value
# annotations for pod
annotations:
key: value
# finalizers:
# - kubernetes
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: aw-kuberay
operator: In
values:
- "aw-kuberay"
initContainers:
# the env var $RAY_IP is set by the operator if missing, with the value of the head service name
- name: init-myservice
image: busybox:1.28
command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
- name: create-cert
image: rayproject/ray:2.7.0
command:
- sh
- -c
- cd /home/ray/workspace/tls && openssl req -nodes -newkey rsa:2048 -keyout server.key -out server.csr -subj '/CN=ray-head' && printf "authorityKeyIdentifier=keyid,issuer\nbasicConstraints=CA:FALSE\nsubjectAltName = @alt_names\n[alt_names]\nDNS.1 = 127.0.0.1\nDNS.2 = localhost\nDNS.3 = ${FQ_RAY_IP}\nDNS.4 = $(awk 'END{print $1}' /etc/hosts)">./domain.ext && cp /home/ray/workspace/ca/* . && openssl x509 -req -CA ca.crt -CAkey ca.key -in server.csr -out server.crt -days 365 -CAcreateserial -extfile domain.ext
# securityContext:
# runAsUser: 1000
# runAsGroup: 1000
volumeMounts:
- name: ca-vol
mountPath: "/home/ray/workspace/ca"
readOnly: true
- name: server-cert
mountPath: "/home/ray/workspace/tls"
readOnly: false
containers:
- name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc'
image: rayproject/ray:latest
env:
- name: MY_POD_IP
valueFrom:
fieldRef:
fieldPath: status.podIP
- name: RAY_USE_TLS
value: "0"
- name: RAY_TLS_SERVER_CERT
value: /home/ray/workspace/tls/server.crt
- name: RAY_TLS_SERVER_KEY
value: /home/ray/workspace/tls/server.key
- name: RAY_TLS_CA_CERT
value: /home/ray/workspace/tls/ca.crt
# environment variables to set in the container.Optional.
# Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/
lifecycle:
preStop:
exec:
command: ["/bin/sh","-c","ray stop"]
resources:
limits:
cpu: "2"
memory: "12G"
nvidia.com/gpu: "1"
requests:
cpu: "2"
memory: "12G"
nvidia.com/gpu: "1"
volumeMounts:
- name: ca-vol
mountPath: "/home/ray/workspace/ca"
readOnly: true
- name: server-cert
mountPath: "/home/ray/workspace/tls"
readOnly: true
volumes:
- name: ca-vol
secret:
secretName: ca-secret-deployment-name
optional: false
- name: server-cert
emptyDir: {}
- replicas: 1
generictemplate:
kind: Route
apiVersion: route.openshift.io/v1
metadata:
name: ray-dashboard-deployment-name
namespace: default
labels:
# allows me to return name of service that Ray operator creates
odh-ray-cluster-service: deployment-name-head-svc
spec:
to:
kind: Service
name: deployment-name-head-svc
port:
targetPort: dashboard
- replicas: 1
generictemplate:
apiVersion: route.openshift.io/v1
kind: Route
metadata:
name: rayclient-deployment-name
namespace: default
labels:
# allows me to return name of service that Ray operator creates
odh-ray-cluster-service: deployment-name-head-svc
spec:
port:
targetPort: client
tls:
termination: passthrough
to:
kind: Service
name: deployment-name-head-svc
- replicas: 1
generictemplate:
apiVersion: v1
data:
ca.crt: generated_crt
ca.key: generated_key
kind: Secret
metadata:
name: ca-secret-deployment-name
labels:
# allows me to return name of service that Ray operator creates
odh-ray-cluster-service: deployment-name-head-svc