# vcuda.yml (257 lines, 6.17 KB) - forked from rainfd/vcuda-deployment
# ConfigMap carrying the JSON configuration consumed by gpu-quota-admission.
apiVersion: v1
kind: ConfigMap
metadata:
  name: gpu-quota-admission
  namespace: kube-system
data:
  # The gpu-quota-admission Deployment in this file mounts this ConfigMap as a
  # volume, so this key appears there as the file gpu-quota-admission.config.
  gpu-quota-admission.config: |
    {
      "QuotaConfigMapName": "gpuquota",
      "QuotaConfigMapNamespace": "kube-system",
      "GPUModelLabel": "gaia.tencent.com/gpu-model",
      "GPUPoolLabel": "gaia.tencent.com/gpu-pool"
    }
---
# ClusterIP Service in front of the gpu-quota-admission pods (TCP 3456).
apiVersion: v1
kind: Service
metadata:
  name: gpu-quota-admission
  namespace: kube-system
spec:
  type: ClusterIP
  # Matches the pod label set by the gpu-quota-admission Deployment template.
  selector:
    k8s-app: gpu-quota-admission
  ports:
    - port: 3456
      protocol: TCP
      targetPort: 3456
---
# Deployment running a single replica of the gpu-quota-admission server.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gpu-quota-admission
  namespace: kube-system
  labels:
    k8s-app: gpu-quota-admission
spec:
  replicas: 1
  selector:
    matchLabels:
      k8s-app: gpu-quota-admission
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: "25%"
      maxUnavailable: "25%"
  template:
    metadata:
      labels:
        k8s-app: gpu-quota-admission
      namespace: kube-system
    spec:
      affinity:
        nodeAffinity:
          # Soft preference only: the pod may still land on any node.
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 1
              preference:
                matchExpressions:
                  - key: node-role.kubernetes.io/master
                    operator: Exists
            # Newer clusters label control-plane nodes with "control-plane"
            # instead of the legacy "master" key.
            - weight: 1
              preference:
                matchExpressions:
                  - key: node-role.kubernetes.io/control-plane
                    operator: Exists
      initContainers:
        # Copies the mounted config file to /etc/kubernetes/ inside this init
        # container.
        # NOTE(review): /etc/kubernetes/ is not backed by a volume shared with
        # the main container, so the copy does not appear to reach it - confirm
        # whether this step is still needed.
        # The former "privileged: true" was dropped: mkdir/cp inside the
        # container's own filesystem needs no elevated privileges.
        - name: init-kube-config
          # NOTE(review): untagged image; consider pinning a busybox version.
          image: busybox
          imagePullPolicy: Always
          command:
            - sh
            - "-c"
            - >-
              mkdir -p /etc/kubernetes/ &&
              cp /root/gpu-quota-admission/gpu-quota-admission.config
              /etc/kubernetes/
          resources: {}
          volumeMounts:
            - name: config
              mountPath: /root/gpu-quota-admission/
      containers:
        - name: gpu-quota-admission
          # NOTE(review): ":latest" combined with IfNotPresent means nodes can
          # keep running whatever build they cached first; consider pinning a
          # concrete tag.
          image: ccr.ccs.tencentyun.com/tkeimages/gpu-quota-admission:latest
          imagePullPolicy: IfNotPresent
          env:
            - name: LOG_LEVEL
              value: "5"
            - name: EXTRA_FLAGS
              value: "--incluster-mode=true"
          ports:
            - containerPort: 3456
              protocol: TCP
          # requests == limits for both cpu and memory.
          resources:
            limits:
              cpu: "1"
              memory: 1Gi
            requests:
              cpu: "1"
              memory: 1Gi
          volumeMounts:
            - name: config
              mountPath: /root/gpu-quota-admission/
      # The numeric "priority" field is intentionally not set: it is populated
      # by the Priority admission controller from priorityClassName.
      priorityClassName: system-cluster-critical
      restartPolicy: Always
      schedulerName: default-scheduler
      securityContext: {}
      # Only serviceAccountName is set; "serviceAccount" is its deprecated
      # alias.
      serviceAccountName: gpu-manager
      terminationGracePeriodSeconds: 30
      tolerations:
        - key: node-role.kubernetes.io/master
          effect: NoSchedule
        # Same toleration for the newer control-plane taint key.
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule
      volumes:
        - name: config
          configMap:
            name: gpu-quota-admission
            # 420 decimal == 0644 octal (rw-r--r--).
            defaultMode: 420
---
# Binds the kube-system/gpu-manager ServiceAccount to the cluster-admin
# ClusterRole.
# NOTE(review): cluster-admin grants unrestricted access to the whole cluster.
# Both workloads in this file run under this ServiceAccount; consider replacing
# this binding with a ClusterRole limited to the resources they actually use.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: gpu-manager
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: cluster-admin
subjects:
  - kind: ServiceAccount
    name: gpu-manager
    namespace: kube-system
---
# ServiceAccount used by both the gpu-quota-admission Deployment and the
# gpu-manager DaemonSet in this file.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: gpu-manager
  namespace: kube-system
---
# Headless Service (clusterIP: None) exposing port 5678 of the gpu-manager
# pods under the port name "metrics".
apiVersion: v1
kind: Service
metadata:
  name: gpu-manager-metric
  namespace: kube-system
  annotations:
    prometheus.io/scrape: "true"
  labels:
    kubernetes.io/cluster-service: "true"
spec:
  clusterIP: None
  # Matches the pod label set by the gpu-manager DaemonSet template.
  selector:
    name: gpu-manager-ds
  ports:
    - name: metrics
      port: 5678
      protocol: TCP
      targetPort: 5678
---
# DaemonSet running gpu-manager on every node labelled
# "nvidia-device-enable: enable". The pod is privileged, shares the host PID
# namespace and mounts several host directories (see volumes below).
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: gpu-manager-daemonset
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: gpu-manager-ds
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1
  template:
    metadata:
      annotations:
        # NOTE(review): legacy critical-pod annotation; priorityClassName below
        # already assigns the pod priority - confirm whether it is still needed.
        scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        name: gpu-manager-ds
    spec:
      hostPID: true
      nodeSelector:
        nvidia-device-enable: enable
      priorityClassName: system-node-critical
      restartPolicy: Always
      schedulerName: default-scheduler
      securityContext: {}
      # Only serviceAccountName is set; "serviceAccount" is its deprecated
      # alias.
      serviceAccountName: gpu-manager
      tolerations:
        - key: CriticalAddonsOnly
          operator: Exists
        - key: tencent.com/vcuda-core
          operator: Exists
          effect: NoSchedule
      containers:
        - name: gpu-manager
          image: thomassong/gpu-manager:1.1.5
          imagePullPolicy: Always
          env:
            - name: LOG_LEVEL
              value: "5"
            # Name of the node the pod is scheduled on (downward API).
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: spec.nodeName
          ports:
            - containerPort: 5678
              protocol: TCP
          resources: {}
          securityContext:
            privileged: true
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
            - name: vdriver
              mountPath: /etc/gpu-manager/vdriver
            - name: vmdata
              mountPath: /etc/gpu-manager/vm
            # Host directory /etc/gpu-manager/log backs the container's
            # /var/log/gpu-manager.
            - name: log
              mountPath: /var/log/gpu-manager
            - name: checkpoint
              mountPath: /etc/gpu-manager/checkpoint
            - name: run-dir
              mountPath: /var/run
            - name: cgroup
              mountPath: /sys/fs/cgroup
              readOnly: true
            # Host /usr is visible read-only at /usr/local/host.
            - name: usr-directory
              mountPath: /usr/local/host
              readOnly: true
      volumes:
        # "Directory" volumes must already exist on the host;
        # "DirectoryOrCreate" volumes are created when missing.
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
            type: Directory
        - name: vmdata
          hostPath:
            path: /etc/gpu-manager/vm
            type: DirectoryOrCreate
        - name: vdriver
          hostPath:
            path: /etc/gpu-manager/vdriver
            type: DirectoryOrCreate
        - name: log
          hostPath:
            path: /etc/gpu-manager/log
            type: DirectoryOrCreate
        - name: checkpoint
          hostPath:
            path: /etc/gpu-manager/checkpoint
            type: DirectoryOrCreate
        - name: cgroup
          hostPath:
            path: /sys/fs/cgroup
            type: Directory
        - name: usr-directory
          hostPath:
            path: /usr
            type: Directory
        - name: run-dir
          hostPath:
            path: /var/run
            type: Directory