Add integration tests for vllm
Signed-off-by: Radoslav Dimitrov <[email protected]>
rdimitrov committed Jan 29, 2025
1 parent 5135e51 commit 0d56e43
Showing 4 changed files with 122 additions and 46 deletions.
64 changes: 61 additions & 3 deletions .github/workflows/integration-tests.yml
@@ -58,10 +58,10 @@ jobs:
echo "Running container from image: $DOCKER_IMAGE"
# Run the container
docker run --name $CONTAINER_NAME -d -p 8989:8989 -p 9090:9090 \
-p 8990:8990 \
docker run --name $CONTAINER_NAME -d --network host \
-v "$(pwd)"/codegate_volume:/app/codegate_volume \
-e CODEGATE_APP_LOG_LEVEL=DEBUG \
-e CODEGATE_VLLM_URL=http://localhost:8000 \
--restart unless-stopped $DOCKER_IMAGE
# Confirm the container started
@@ -146,7 +146,60 @@ jobs:
run: |
poetry run python tests/integration/integration_tests.py
- name: Print the container logs (useful for debugging)
- name: Build and run the vllm container
run: |
git clone https://github.com/vllm-project/vllm.git
cd vllm
docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g .
docker run -d --name vllm \
--network="host" \
vllm-cpu-env --model Qwen/Qwen2.5-Coder-1.5B-Instruct
- name: Verify the vllm container is running
run: |
echo -e "\nVerify the vllm container is serving\n"
docker ps -f name=vllm
echo "Loop until the endpoint responds successfully"
while ! curl --silent --fail --get "http://localhost:8000/ping" >/dev/null; do
echo "Ping not available yet. Retrying in 2 seconds..."
sleep 2
done
echo -e "\nPing is now available!\n"
echo -e "\nVerify the completions endpoint works\n"
curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{
"model": "Qwen/Qwen2.5-Coder-1.5B-Instruct",
"prompt": ["How to make pizza"],
"max_tokens": 100,
"temperature": 0
}'
echo -e "\nVerify the chat/completions endpoint works\n"
curl -X POST http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen/Qwen2.5-Coder-1.5B-Instruct",
"messages": [
{"role": "system", "content": "You are a coding assistant."},
{"role": "user", "content": "Hello"}
],
"temperature": 0,
"max_tokens": 4096,
"extra_body": {}
}'
# Print a new line and then the message in a single echo
echo -e "\nPrint the vllm container logs\n"
docker logs vllm
- name: Run integration tests - vllm
env:
CODEGATE_PROVIDERS: "vllm"
run: |
poetry run python tests/integration/integration_tests.py
- name: Print the CodeGate container logs (useful for debugging)
if: always()
run: |
docker logs $CONTAINER_NAME
@@ -159,3 +212,8 @@ jobs:
echo "DB contents:"
ls -la codegate_volume/db
docker exec $CONTAINER_NAME ls -la /app/codegate_volume/db
- name: Print the vllm container logs (useful for debugging)
if: always()
run: |
docker logs vllm
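For reference, the readiness check and smoke test that the new workflow steps perform with curl can also be expressed as a short Python script. This is a minimal sketch, not part of the commit: it assumes vLLM listens on localhost:8000 and serves Qwen/Qwen2.5-Coder-1.5B-Instruct as configured above, and the function names are ad hoc.

```python
# Minimal sketch: wait for the local vLLM server to come up, then issue one
# non-streaming completion request, mirroring the curl-based workflow steps.
import time

import httpx

BASE_URL = "http://localhost:8000"
MODEL = "Qwen/Qwen2.5-Coder-1.5B-Instruct"


def wait_for_vllm(timeout_s: int = 600) -> None:
    """Poll /ping until vLLM answers, like the workflow's retry loop."""
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        try:
            if httpx.get(f"{BASE_URL}/ping", timeout=2).status_code == 200:
                return
        except httpx.HTTPError:
            pass  # server not ready yet
        time.sleep(2)
    raise TimeoutError("vLLM did not become ready in time")


def smoke_test_completion() -> dict:
    """Send the same completions payload the workflow sends with curl."""
    resp = httpx.post(
        f"{BASE_URL}/v1/completions",
        json={
            "model": MODEL,
            "prompt": ["How to make pizza"],
            "max_tokens": 100,
            "temperature": 0,
        },
        timeout=120,
    )
    resp.raise_for_status()
    return resp.json()


if __name__ == "__main__":
    wait_for_vllm()
    print(smoke_test_completion())
```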
7 changes: 1 addition & 6 deletions src/codegate/providers/vllm/adapter.py
@@ -102,6 +102,7 @@ def _has_chat_ml_format(data: Dict) -> bool:
content = input_chat_request["messages"][0]["content"]
if isinstance(content, str) and "<|im_start|>" in content:
return True
return False

def normalize(self, data: Dict) -> ChatCompletionRequest:
"""
@@ -117,12 +118,6 @@ def normalize(self, data: Dict) -> ChatCompletionRequest:
if not model_name.startswith("hosted_vllm/"):
normalized_data["model"] = f"hosted_vllm/{model_name}"

# Ensure the base_url ends with /v1 if provided
if "base_url" in normalized_data:
base_url = normalized_data["base_url"].rstrip("/")
if not base_url.endswith("/v1"):
normalized_data["base_url"] = f"{base_url}/v1"

ret_data = normalized_data
if self._has_chat_ml_format(normalized_data):
ret_data = self._chat_ml_normalizer.normalize(normalized_data)
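The adapter change above adds an explicit `return False`, so `_has_chat_ml_format` no longer falls through to `None` for non-ChatML payloads, and drops the `/v1` base-URL handling (which moves into the provider below). A standalone sketch of the detection check follows; the sample payloads are illustrative only, not taken from the test suite.

```python
# Standalone sketch of the ChatML detection shown above: the first message's
# content is checked for the "<|im_start|>" marker.
from typing import Dict


def has_chat_ml_format(data: Dict) -> bool:
    messages = data.get("messages") or []
    if not messages:
        return False
    content = messages[0].get("content")
    if isinstance(content, str) and "<|im_start|>" in content:
        return True
    return False


chatml_payload = {"messages": [{"role": "user", "content": "<|im_start|>user\nHello<|im_end|>"}]}
plain_payload = {"messages": [{"role": "user", "content": "Hello"}]}

assert has_chat_ml_format(chatml_payload) is True
assert has_chat_ml_format(plain_payload) is False
```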
87 changes: 55 additions & 32 deletions src/codegate/providers/vllm/provider.py
@@ -1,4 +1,5 @@
import json
from urllib.parse import urljoin

import httpx
import structlog
@@ -31,6 +32,19 @@ def __init__(
def provider_route_name(self) -> str:
return "vllm"

def _get_base_url(self) -> str:
"""
Get the base URL from config with proper formatting
"""
config = Config.get_config()
base_url = config.provider_urls.get("vllm") if config else ""
if base_url:
base_url = base_url.rstrip("/")
# Add /v1 if not present
if not base_url.endswith("/v1"):
base_url = f"{base_url}/v1"
return base_url

def models(self):
resp = httpx.get(f"{self.base_url}/v1/models")
jsonresp = resp.json()
@@ -40,60 +54,69 @@ def models(self):
def _setup_routes(self):
"""
Sets up the /chat/completions route for the provider as expected by the
OpenAI API. Extracts the API key from the "Authorization" header and
passes it to the completion handler.
OpenAI API. Makes the API key optional in the "Authorization" header.
"""

@self.router.get(f"/{self.provider_route_name}/models")
async def get_models(authorization: str = Header(..., description="Bearer token")):
if not authorization.startswith("Bearer "):
raise HTTPException(status_code=401, detail="Invalid authorization header")

token = authorization.split(" ")[1]
config = Config.get_config()
if config:
base_url = config.provider_urls.get("vllm")
else:
base_url = ""

async with httpx.AsyncClient() as client:
response = await client.get(
f"{base_url}/v1/models", headers={"Authorization": f"Bearer {token}"}
async def get_models(
authorization: str | None = Header(None, description="Optional Bearer token")
):
base_url = self._get_base_url()
headers = {}

if authorization:
if not authorization.startswith("Bearer "):
raise HTTPException(
status_code=401, detail="Invalid authorization header format"
)
token = authorization.split(" ")[1]
headers["Authorization"] = f"Bearer {token}"

try:
models_url = urljoin(base_url, "v1/models")
async with httpx.AsyncClient() as client:
response = await client.get(models_url, headers=headers)
response.raise_for_status()
return response.json()
except httpx.HTTPError as e:
logger = structlog.get_logger("codegate")
logger.error("Error fetching vLLM models", error=str(e))
raise HTTPException(
status_code=e.response.status_code if hasattr(e, "response") else 500,
detail=str(e),
)
response.raise_for_status()
return response.json()

@self.router.post(f"/{self.provider_route_name}/chat/completions")
@self.router.post(f"/{self.provider_route_name}/completions")
async def create_completion(
request: Request,
authorization: str = Header(..., description="Bearer token"),
authorization: str | None = Header(None, description="Optional Bearer token"),
):
if not authorization.startswith("Bearer "):
raise HTTPException(status_code=401, detail="Invalid authorization header")
api_key = None
if authorization:
if not authorization.startswith("Bearer "):
raise HTTPException(
status_code=401, detail="Invalid authorization header format"
)
api_key = authorization.split(" ")[1]

api_key = authorization.split(" ")[1]
body = await request.body()
data = json.loads(body)

# Add the vLLM base URL to the request
config = Config.get_config()
if config:
data["base_url"] = config.provider_urls.get("vllm")
else:
data["base_url"] = ""
base_url = self._get_base_url()
data["base_url"] = base_url

is_fim_request = self._is_fim_request(request, data)
try:
# Pass the potentially None api_key to complete
stream = await self.complete(data, api_key, is_fim_request=is_fim_request)
except Exception as e:
#  check if we have an status code there
# Check if we have a status code there
if hasattr(e, "status_code"):
logger = structlog.get_logger("codegate")
logger.error("Error in VLLMProvider completion", error=str(e))
raise HTTPException(status_code=e.status_code, detail=str(e))
raise e

raise HTTPException(status_code=e.status_code, detail=str(e)) # type: ignore
else:
# just continue raising the exception
raise e
return self._completion_handler.create_response(stream)
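The new `_get_base_url` helper centralizes the `/v1` suffix handling that used to live in the adapter, and the models route now builds its URL with `urljoin`. A small sketch of that normalization, with illustrative input URLs (not part of the commit):

```python
# Sketch of the base-URL handling introduced in _get_base_url: strip a trailing
# slash and append /v1 when it is missing, then join the models path.
from urllib.parse import urljoin


def normalize_vllm_base_url(base_url: str) -> str:
    if not base_url:
        return ""
    base_url = base_url.rstrip("/")
    if not base_url.endswith("/v1"):
        base_url = f"{base_url}/v1"
    return base_url


for raw in ("http://localhost:8000", "http://localhost:8000/", "http://localhost:8000/v1"):
    base = normalize_vllm_base_url(raw)
    # Because the normalized path has no trailing slash, urljoin replaces the
    # final "v1" segment with "v1/models", which still yields .../v1/models.
    print(base, "->", urljoin(base, "v1/models"))
# All three inputs resolve to http://localhost:8000/v1/models
```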
10 changes: 5 additions & 5 deletions tests/integration/testcases.yaml
@@ -1,6 +1,6 @@
headers:
vllm:
Authorization: Bearer ENV_VLLM_KEY
Content-Type: application/json
openai:
Authorization: Bearer ENV_OPENAI_KEY
ollama:
@@ -160,20 +160,20 @@ testcases:
"role":"user"
}
],
"model":"Qwen/Qwen2.5-Coder-14B-Instruct",
"model":"Qwen/Qwen2.5-Coder-1.5B-Instruct",
"stream":true,
"temperature":0
}
likes: |
Hello! How can I assist you today? If you have any questions about software security, package analysis, or need guidance on secure coding practices, feel free to ask.
Hello! How can I assist you today?
vllm_fim:
name: VLLM FIM
provider: vllm
url: http://127.0.0.1:8989/vllm/completions
data: |
{
"model": "Qwen/Qwen2.5-Coder-14B",
"model": "Qwen/Qwen2.5-Coder-1.5B-Instruct",
"max_tokens": 4096,
"temperature": 0,
"stream": true,
@@ -332,4 +332,4 @@ testcases:
print(response.status_code)
print(response.json())
```
```
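The test cases now target the smaller Qwen/Qwen2.5-Coder-1.5B-Instruct model and no longer send an Authorization header for vLLM. A rough sketch of replaying the vllm_fim request shape by hand against a local CodeGate instance follows; the prompt and the streamed chunk fields are assumptions for illustration, not values from the test suite.

```python
# Rough sketch: send a streaming completion through CodeGate's vLLM route,
# matching the vllm_fim test case's endpoint and parameters. Assumes the
# response streams OpenAI-style SSE "data:" chunks.
import json

import httpx

URL = "http://127.0.0.1:8989/vllm/completions"
payload = {
    "model": "Qwen/Qwen2.5-Coder-1.5B-Instruct",
    "max_tokens": 4096,
    "temperature": 0,
    "stream": True,
    "prompt": "def hello_world():",  # placeholder prompt, not the test suite's
}

with httpx.stream("POST", URL, json=payload, timeout=120) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines():
        if not line.startswith("data:"):
            continue
        data = line[len("data:"):].strip()
        if data == "[DONE]":
            break
        chunk = json.loads(data)
        # Field names assume OpenAI-style completion chunks.
        print(chunk["choices"][0].get("text", ""), end="")
```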
