forked from intel/intel-technology-enabling-for-openshift
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: tgi_gaudi_servingruntime.yaml
58 lines (58 loc) · 1.28 KB
/
tgi_gaudi_servingruntime.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# Copyright (c) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
---
# KServe ServingRuntime running Hugging Face TGI (Gaudi build) against a
# model mounted at /mnt/models, sharded across 3 devices.
apiVersion: serving.kserve.io/v1alpha1
kind: ServingRuntime
metadata:
  name: tgi-gaudi-serving-runtime
spec:
  containers:
    - name: kserve-container
      image: ghcr.io/huggingface/tgi-gaudi:1.2.1
      args:
        - --model-id
        - /mnt/models/
        - --port=8080
        - --num-shard=3  # number of shards; keep in sync with accelerator count
        - --sharded=true
        - --json-output
      env:  # add variables according to the chosen model
        - name: HF_HOME
          value: /tmp/hf_home
        - name: HF_OFFLINE
          value: "1"
        - name: TRANSFORMERS_OFFLINE
          value: "1"
        - name: HF_HUB_CACHE
          value: /mnt/models
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              key: HUGGING_FACE_HUB_TOKEN
              name: hf-token
      # NOTE(review): --num-shard=3 implies 3 accelerators, but no
      # habana.ai/gaudi device is requested here — confirm the devices are
      # supplied by the InferenceService that references this runtime.
      resources:
        limits:
          cpu: "16"
          memory: 128Gi
        requests:
          cpu: "16"
          memory: 128Gi
      readinessProbe:
        exec:
          command:
            - curl
            - --fail  # without --fail curl exits 0 on HTTP 5xx, so the probe could never fail
            - localhost:8080/health
        initialDelaySeconds: 500  # large-model load can take minutes before /health responds
      livenessProbe:
        exec:
          command:
            - curl
            - --fail
            - localhost:8080/health
        initialDelaySeconds: 500
      ports:
        - containerPort: 8080
          protocol: TCP
  multiModel: false
  supportedModelFormats:
    - autoSelect: true
      name: llm