Skip to content

Commit

Permalink
feat: add qwen preset test
Browse files Browse the repository at this point in the history
- remove outdated manifests
- replace LoadBalancer with ClusterIP; the IP won't be accessed
 directly anymore

Signed-off-by: jerryzhuang <[email protected]>
  • Loading branch information
zhuangqh committed Dec 19, 2024
1 parent f66966b commit 071d9fe
Show file tree
Hide file tree
Showing 24 changed files with 177 additions and 86 deletions.
8 changes: 8 additions & 0 deletions .github/e2e-preset-configs.json
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,14 @@
"OSS": true,
"loads_adapter": false
},
{
"name": "qwen2-5-coder-7b-instruct",
"node-count": 1,
"node-vm-size": "Standard_NC12s_v3",
"node-osdisk-size": 100,
"OSS": true,
"loads_adapter": false
},
{
"name": "llama-2-7b",
"node-count": 1,
Expand Down
5 changes: 1 addition & 4 deletions .github/workflows/e2e-preset-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -219,10 +219,7 @@ jobs:
- name: Retrieve External Service IP
id: get_ip
run: |
while [[ -z $SERVICE_IP ]]; do
SERVICE_IP=$(kubectl get svc ${{ matrix.model.name }} -o=jsonpath='{.status.loadBalancer.ingress[0].ip}')
sleep 5
done
SERVICE_IP=$(kubectl get svc ${{ matrix.model.name }} -o=jsonpath='{.spec.clusterIP}')
echo "Service IP is $SERVICE_IP"
echo "SERVICE_IP=$SERVICE_IP" >> $GITHUB_OUTPUT
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ spec:
- protocol: TCP
port: 80
targetPort: 5000
type: LoadBalancer
type: ClusterIP
publishNotReadyAddresses: true
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ spec:
- protocol: TCP
port: 80
targetPort: 5000
type: LoadBalancer
type: ClusterIP
publishNotReadyAddresses: true
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ spec:
- protocol: TCP
port: 80
targetPort: 5000
type: LoadBalancer
type: ClusterIP
publishNotReadyAddresses: true

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ spec:
- protocol: TCP
port: 80
targetPort: 5000
type: LoadBalancer
type: ClusterIP
publishNotReadyAddresses: true
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,5 @@ spec:
protocol: TCP
port: 29500
targetPort: 29500
type: LoadBalancer
type: ClusterIP
publishNotReadyAddresses: true
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,5 @@ spec:
protocol: TCP
port: 29500
targetPort: 29500
type: LoadBalancer
type: ClusterIP
publishNotReadyAddresses: true
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,5 @@ spec:
- protocol: TCP
port: 80
targetPort: 5000
type: LoadBalancer
type: ClusterIP
publishNotReadyAddresses: true
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,5 @@ spec:
- protocol: TCP
port: 80
targetPort: 5000
type: LoadBalancer
type: ClusterIP
publishNotReadyAddresses: true
14 changes: 0 additions & 14 deletions presets/workspace/test/manifests/llama-headless.yaml

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ spec:
- protocol: TCP
port: 80
targetPort: 5000
type: LoadBalancer
type: ClusterIP
publishNotReadyAddresses: true
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ spec:
- protocol: TCP
port: 80
targetPort: 5000
type: LoadBalancer
type: ClusterIP
publishNotReadyAddresses: true
2 changes: 1 addition & 1 deletion presets/workspace/test/manifests/phi-2/phi-2-service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ spec:
- protocol: TCP
port: 80
targetPort: 5000
type: LoadBalancer
type: ClusterIP
publishNotReadyAddresses: true
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ spec:
- protocol: TCP
port: 80
targetPort: 5000
type: LoadBalancer
type: ClusterIP
publishNotReadyAddresses: true
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ spec:
- protocol: TCP
port: 80
targetPort: 5000
type: LoadBalancer
type: ClusterIP
publishNotReadyAddresses: true
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ spec:
- protocol: TCP
port: 80
targetPort: 5000
type: LoadBalancer
type: ClusterIP
publishNotReadyAddresses: true
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ spec:
- protocol: TCP
port: 80
targetPort: 5000
type: LoadBalancer
type: ClusterIP
publishNotReadyAddresses: true
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ spec:
- protocol: TCP
port: 80
targetPort: 5000
type: LoadBalancer
type: ClusterIP
publishNotReadyAddresses: true
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ spec:
- protocol: TCP
port: 80
targetPort: 5000
type: LoadBalancer
type: ClusterIP
publishNotReadyAddresses: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Cluster-internal Service in front of the qwen2-5-coder-7b-instruct pods.
apiVersion: v1
kind: Service
metadata:
  name: qwen2-5-coder-7b-instruct
spec:
  # Selects pods labelled app=qwen2-5-coder-7b-instruct.
  selector:
    app: qwen2-5-coder-7b-instruct
  ports:
    # Service port 80 is forwarded to port 5000 on the selected pods.
    - protocol: TCP
      port: 80
      targetPort: 5000
  # ClusterIP: reachable only from inside the cluster (no external load balancer).
  type: ClusterIP
  # Endpoints are published even while the backing pods are not yet Ready.
  publishNotReadyAddresses: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Single-replica Deployment that serves qwen2.5-coder-7b-instruct through
# /workspace/tfs/inference_api.py, started with `accelerate launch`.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: qwen2-5-coder-7b-instruct
spec:
  replicas: 1
  selector:
    matchLabels:
      app: qwen2-5-coder-7b-instruct
  template:
    metadata:
      labels:
        app: qwen2-5-coder-7b-instruct
    spec:
      containers:
        - name: qwen2-5-coder-7b-instruct-container
          # NOTE(review): REPO_HERE / TAG_HERE look like placeholders that are
          # substituted before the manifest is applied - confirm against the workflow.
          image: REPO_HERE.azurecr.io/qwen2.5-coder-7b-instruct:TAG_HERE
          command:
            - /bin/sh
            - -c
            - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code
          resources:
            # Requests and limits both ask for 2 GPUs.
            requests:
              nvidia.com/gpu: 2
            limits:
              nvidia.com/gpu: 2
          # Liveness checks wait 10 minutes before the first probe.
          livenessProbe:
            httpGet:
              path: /health
              port: 5000
            initialDelaySeconds: 600  # 10 Min
            periodSeconds: 10
          # Readiness is polled on the same endpoint, starting after 30 seconds.
          readinessProbe:
            httpGet:
              path: /health
              port: 5000
            initialDelaySeconds: 30
            periodSeconds: 10
          volumeMounts:
            - name: dshm
              mountPath: /dev/shm
      volumes:
        # Memory-backed emptyDir mounted at /dev/shm in the container.
        - name: dshm
          emptyDir:
            medium: Memory
      # Tolerate the NoSchedule taints sku=gpu and nvidia.com/gpu (any value).
      tolerations:
        - effect: NoSchedule
          key: sku
          operator: Equal
          value: gpu
        - effect: NoSchedule
          key: nvidia.com/gpu
          operator: Exists
      # Schedule only onto nodes labelled pool=qwen25coder7.
      nodeSelector:
        pool: qwen25coder7
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Single-replica Deployment that serves qwen2.5-coder-7b-instruct through
# /workspace/vllm/inference_api.py, configured from a mounted ConfigMap.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: qwen2-5-coder-7b-instruct
spec:
  replicas: 1
  selector:
    matchLabels:
      app: qwen2-5-coder-7b-instruct
  template:
    metadata:
      labels:
        app: qwen2-5-coder-7b-instruct
    spec:
      containers:
        - name: qwen2-5-coder-7b-instruct-container
          # NOTE(review): REPO_HERE / TAG_HERE look like placeholders that are
          # substituted before the manifest is applied - confirm against the workflow.
          image: REPO_HERE.azurecr.io/qwen2.5-coder-7b-instruct:TAG_HERE
          command:
            - /bin/sh
            - -c
            # The config file path lives under the config-volume mount below.
            - python3 /workspace/vllm/inference_api.py --kaito-config-file /mnt/config/inference_config.yaml
          resources:
            # Requests and limits both ask for 2 GPUs.
            requests:
              nvidia.com/gpu: 2
            limits:
              nvidia.com/gpu: 2
          # Liveness checks wait 10 minutes before the first probe.
          livenessProbe:
            httpGet:
              path: /health
              port: 5000
            initialDelaySeconds: 600  # 10 Min
            periodSeconds: 10
          # Readiness is polled on the same endpoint, starting after 30 seconds.
          readinessProbe:
            httpGet:
              path: /health
              port: 5000
            initialDelaySeconds: 30
            periodSeconds: 10
          volumeMounts:
            - name: dshm
              mountPath: /dev/shm
            - mountPath: /mnt/config
              name: config-volume
      volumes:
        # Memory-backed emptyDir mounted at /dev/shm in the container.
        - name: dshm
          emptyDir:
            medium: Memory
        # Projects the qwen2-5-coder-7b-inference-params ConfigMap into /mnt/config.
        - configMap:
            # 420 decimal is file mode 0644 in octal.
            defaultMode: 420
            name: qwen2-5-coder-7b-inference-params
          name: config-volume
      # Tolerate the NoSchedule taints sku=gpu and nvidia.com/gpu (any value).
      tolerations:
        - effect: NoSchedule
          key: sku
          operator: Equal
          value: gpu
        - effect: NoSchedule
          key: nvidia.com/gpu
          operator: Exists
      # Schedule only onto nodes labelled pool=qwen25coder7.
      nodeSelector:
        pool: qwen25coder7
---
# Inference parameters mounted into the container as a single file,
# inference_config.yaml.
apiVersion: v1
kind: ConfigMap
metadata:
  name: qwen2-5-coder-7b-inference-params
data:
  # Literal block scalar: everything below is the file's content, so the
  # nesting under `vllm:` is significant to whatever parses the file.
  inference_config.yaml: |
    # Maximum number of steps to find the max available seq len fitting in the GPU memory.
    max_probe_steps: 6
    vllm:
      cpu-offload-gb: 0
      gpu-memory-utilization: 0.95
      swap-space: 4
      served-model-name: test
      dtype: float16
      tensor-parallel-size: 2
      # max-seq-len-to-capture: 8192
      # num-scheduler-steps: 1
      # enable-chunked-prefill: false
      # see https://docs.vllm.ai/en/stable/models/engine_args.html for more options.

0 comments on commit 071d9fe

Please sign in to comment.