feat: add qwen preset test

- remove outdated manifests - replace LoadBalancer with ClusterIP; the service IP is no longer accessed directly Signed-off-by: jerryzhuang <[email protected]>
kaito-project · Dec 19, 2024 · 071d9fe · 071d9fe
1 parent f66966b
commit 071d9fe
Show file tree

Hide file tree

Showing 24 changed files with 177 additions and 86 deletions.
diff --git a/.github/e2e-preset-configs.json b/.github/e2e-preset-configs.json
@@ -98,6 +98,14 @@
         "OSS": true,
         "loads_adapter": false
       },
+      {
+        "name": "qwen2-5-coder-7b-instruct",
+        "node-count": 1,
+        "node-vm-size": "Standard_NC12s_v3",
+        "node-osdisk-size": 100,
+        "OSS": true,
+        "loads_adapter": false
+      },
       {
         "name": "llama-2-7b",
         "node-count": 1,

diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml
@@ -219,10 +219,7 @@ jobs:
       - name: Retrieve External Service IP
         id: get_ip
         run: |
-            while [[ -z $SERVICE_IP ]]; do 
-                SERVICE_IP=$(kubectl get svc ${{ matrix.model.name }} -o=jsonpath='{.status.loadBalancer.ingress[0].ip}')
-                sleep 5
-            done 
+            SERVICE_IP=$(kubectl get svc ${{ matrix.model.name }} -o=jsonpath='{.spec.clusterIP}')
             echo "Service IP is $SERVICE_IP"
             echo "SERVICE_IP=$SERVICE_IP" >> $GITHUB_OUTPUT
 

diff --git a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct-service.yaml b/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct-service.yaml
@@ -9,5 +9,5 @@ spec:
   - protocol: TCP
     port: 80
     targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/falcon-40b/falcon-40b-service.yaml b/presets/workspace/test/manifests/falcon-40b/falcon-40b-service.yaml
@@ -9,5 +9,5 @@ spec:
   - protocol: TCP
     port: 80
     targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct-service.yaml b/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct-service.yaml
@@ -9,5 +9,5 @@ spec:
   - protocol: TCP
     port: 80
     targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/falcon-7b-with-adapter/falcon-7b.yaml b/presets/workspace/test/manifests/falcon-7b-with-adapter/falcon-7b.yaml
diff --git a/presets/workspace/test/manifests/falcon-7b/falcon-7b-service.yaml b/presets/workspace/test/manifests/falcon-7b/falcon-7b-service.yaml
@@ -9,5 +9,5 @@ spec:
   - protocol: TCP
     port: 80
     targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat-service.yaml b/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat-service.yaml
@@ -15,5 +15,5 @@ spec:
       protocol: TCP
       port: 29500
       targetPort: 29500
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/llama-2-13b/llama-2-13b-service.yaml b/presets/workspace/test/manifests/llama-2-13b/llama-2-13b-service.yaml
@@ -15,5 +15,5 @@ spec:
       protocol: TCP
       port: 29500
       targetPort: 29500
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat-service.yaml b/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat-service.yaml
@@ -10,5 +10,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/llama-2-7b/llama-2-7b-service.yaml b/presets/workspace/test/manifests/llama-2-7b/llama-2-7b-service.yaml
@@ -10,5 +10,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/llama-headless.yaml b/presets/workspace/test/manifests/llama-headless.yaml
diff --git a/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct-service.yaml b/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct-service.yaml
@@ -9,5 +9,5 @@ spec:
   - protocol: TCP
     port: 80
     targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/mistral-7b/mistral-7b-service.yaml b/presets/workspace/test/manifests/mistral-7b/mistral-7b-service.yaml
@@ -9,5 +9,5 @@ spec:
   - protocol: TCP
     port: 80
     targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/phi-2/phi-2-service.yaml b/presets/workspace/test/manifests/phi-2/phi-2-service.yaml
@@ -9,5 +9,5 @@ spec:
   - protocol: TCP
     port: 80
     targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/...rkspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct-service.yaml b/...rkspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct-service.yaml
@@ -9,5 +9,5 @@ spec:
   - protocol: TCP
     port: 80
     targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/...s/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct-service.yaml b/...s/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct-service.yaml
@@ -9,5 +9,5 @@ spec:
   - protocol: TCP
     port: 80
     targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/...s/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct-service.yaml b/...s/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct-service.yaml
@@ -9,5 +9,5 @@ spec:
   - protocol: TCP
     port: 80
     targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct-service.yaml b/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct-service.yaml
@@ -9,5 +9,5 @@ spec:
   - protocol: TCP
     port: 80
     targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/...workspace/test/manifests/phi-3-small-128k-instruct/phi-3-small-128k-instruct-service.yaml b/...workspace/test/manifests/phi-3-small-128k-instruct/phi-3-small-128k-instruct-service.yaml
@@ -9,5 +9,5 @@ spec:
   - protocol: TCP
     port: 80
     targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/...ets/workspace/test/manifests/phi-3-small-8k-instruct/phi-3-small-8k-instruct-service.yaml b/...ets/workspace/test/manifests/phi-3-small-8k-instruct/phi-3-small-8k-instruct-service.yaml
@@ -9,5 +9,5 @@ spec:
   - protocol: TCP
     port: 80
     targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/...workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct-service.yaml b/...workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct-service.yaml
@@ -0,0 +1,13 @@
+apiVersion: v1
+kind: Service  # Cluster-internal endpoint for the qwen2-5-coder-7b-instruct test pods.
+metadata:
+  name: qwen2-5-coder-7b-instruct
+spec:
+  selector:
+    app: qwen2-5-coder-7b-instruct  # Routes to pods carrying this app label.
+  ports:
+  - protocol: TCP
+    port: 80  # Port exposed on the Service.
+    targetPort: 5000  # Pod port that Service traffic is forwarded to.
+  type: ClusterIP  # Cluster-internal virtual IP only; no external load balancer is provisioned.
+  publishNotReadyAddresses: true  # Publish pod addresses even before the pods report ready.
diff --git a/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_hf.yaml b/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_hf.yaml
@@ -0,0 +1,55 @@
+apiVersion: apps/v1
+kind: Deployment  # Test deployment that starts /workspace/tfs/inference_api.py through `accelerate launch`.
+metadata:
+  name: qwen2-5-coder-7b-instruct
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: qwen2-5-coder-7b-instruct  # Must match the pod template label below.
+  template:
+    metadata:
+      labels:
+        app: qwen2-5-coder-7b-instruct
+    spec:
+      containers:
+      - name: qwen2-5-coder-7b-instruct-container
+        image: REPO_HERE.azurecr.io/qwen2.5-coder-7b-instruct:TAG_HERE  # REPO_HERE/TAG_HERE look like placeholders substituted before apply - TODO confirm.
+        command:
+          - /bin/sh
+          - -c
+          - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code
+        resources:
+          requests:
+            nvidia.com/gpu: 2  # Two GPUs requested; request and limit are kept equal.
+          limits:
+            nvidia.com/gpu: 2
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 600 # 10 minutes before the first liveness check.
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 30  # Readiness is probed much earlier than liveness.
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm  # Backed by the memory emptyDir volume declared below.
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory  # RAM-backed (tmpfs) volume.
+      tolerations:
+      - effect: NoSchedule
+        key: sku
+        operator: Equal
+        value: gpu  # Tolerates nodes tainted sku=gpu:NoSchedule.
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        operator: Exists  # Tolerates any nvidia.com/gpu NoSchedule taint regardless of value.
+      nodeSelector:
+        pool: qwen25coder7  # Only schedules onto nodes labelled pool=qwen25coder7.
diff --git a/...ts/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_vllm.yaml b/...ts/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_vllm.yaml
@@ -0,0 +1,83 @@
+apiVersion: apps/v1
+kind: Deployment  # Test deployment that starts /workspace/vllm/inference_api.py with a mounted config file.
+metadata:
+  name: qwen2-5-coder-7b-instruct
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: qwen2-5-coder-7b-instruct  # Must match the pod template label below.
+  template:
+    metadata:
+      labels:
+        app: qwen2-5-coder-7b-instruct
+    spec:
+      containers:
+      - name: qwen2-5-coder-7b-instruct-container
+        image: REPO_HERE.azurecr.io/qwen2.5-coder-7b-instruct:TAG_HERE  # REPO_HERE/TAG_HERE look like placeholders substituted before apply - TODO confirm.
+        command:
+          - /bin/sh
+          - -c
+          - python3 /workspace/vllm/inference_api.py --kaito-config-file /mnt/config/inference_config.yaml
+        resources:
+          requests:
+            nvidia.com/gpu: 2  # Two GPUs requested; request and limit are kept equal.
+          limits:
+            nvidia.com/gpu: 2
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 600 # 10 minutes before the first liveness check.
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 30  # Readiness is probed much earlier than liveness.
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm  # Backed by the memory emptyDir volume declared below.
+        - mountPath: /mnt/config  # Directory that holds inference_config.yaml, read by the command above.
+          name: config-volume
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory  # RAM-backed (tmpfs) volume.
+      - configMap:
+          defaultMode: 420  # Decimal 420 == octal 0644.
+          name: qwen2-5-coder-7b-inference-params  # ConfigMap defined in the second document of this file.
+        name: config-volume
+      tolerations:
+      - effect: NoSchedule
+        key: sku
+        operator: Equal
+        value: gpu  # Tolerates nodes tainted sku=gpu:NoSchedule.
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        operator: Exists  # Tolerates any nvidia.com/gpu NoSchedule taint regardless of value.
+      nodeSelector:
+        pool: qwen25coder7  # Only schedules onto nodes labelled pool=qwen25coder7.
+---
+apiVersion: v1
+kind: ConfigMap  # Holds the inference_config.yaml that the Deployment above mounts under /mnt/config.
+metadata:
+  name: qwen2-5-coder-7b-inference-params  # Referenced by the Deployment's config-volume volume.
+data:
+  inference_config.yaml: |
+    # Maximum number of steps to find the max available seq len fitting in the GPU memory.
+    max_probe_steps: 6
+
+    vllm:
+      cpu-offload-gb: 0
+      gpu-memory-utilization: 0.95
+      swap-space: 4
+      served-model-name: test
+      dtype: float16
+      tensor-parallel-size: 2
+
+      # max-seq-len-to-capture: 8192
+      # num-scheduler-steps: 1
+      # enable-chunked-prefill: false
+      # see https://docs.vllm.ai/en/stable/models/engine_args.html for more options.