From 70ace5c016cb7b71dc0e870c42e715e7093bd508 Mon Sep 17 00:00:00 2001 From: Radoslav Dimitrov Date: Wed, 29 Jan 2025 12:37:53 +0200 Subject: [PATCH 1/7] Add integration tests for vllm Signed-off-by: Radoslav Dimitrov --- .github/workflows/integration-tests.yml | 61 ++++++++++++++++- src/codegate/providers/vllm/adapter.py | 7 +- src/codegate/providers/vllm/provider.py | 87 ++++++++++++++++--------- tests/integration/testcases.yaml | 10 +-- 4 files changed, 121 insertions(+), 44 deletions(-) diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index 4c56a69c..81eca2e6 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -62,6 +62,7 @@ jobs: -v "$(pwd)"/codegate_volume:/app/codegate_volume \ -e CODEGATE_APP_LOG_LEVEL=DEBUG \ -e CODEGATE_OLLAMA_URL=http://localhost:11434 \ + -e CODEGATE_VLLM_URL=http://localhost:8000 \ --restart unless-stopped $DOCKER_IMAGE # Confirm the container started @@ -181,7 +182,60 @@ jobs: run: | docker logs ollama - - name: Print the container logs (useful for debugging) + - name: Build and run the vllm container + run: | + git clone https://github.com/vllm-project/vllm.git + cd vllm + docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g . + docker run -d --name vllm \ + --network="host" \ + vllm-cpu-env --model Qwen/Qwen2.5-Coder-1.5B-Instruct + + - name: Verify the vllm container is running + run: | + echo -e "\nVerify the vllm container is serving\n" + docker ps -f name=vllm + + echo "Loop until the endpoint responds successfully" + while ! curl --silent --fail --get "http://localhost:8000/ping" >/dev/null; do + echo "Ping not available yet. Retrying in 2 seconds..." + sleep 2 + done + echo -e "\nPing is now available!\n" + + echo -e "\nVerify the completions endpoint works\n" + curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{ + "model": "Qwen/Qwen2.5-Coder-1.5B-Instruct", + "prompt": ["How to make pizza"], + "max_tokens": 100, + "temperature": 0 + }' + + echo -e "\nVerify the chat/completions endpoint works\n" + curl -X POST http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen2.5-Coder-1.5B-Instruct", + "messages": [ + {"role": "system", "content": "You are a coding assistant."}, + {"role": "user", "content": "Hello"} + ], + "temperature": 0, + "max_tokens": 4096, + "extra_body": {} + }' + + # Print a new line and then the message in a single echo + echo -e "\nPrint the vllm container logs\n" + docker logs vllm + + - name: Run integration tests - vllm + env: + CODEGATE_PROVIDERS: "vllm" + run: | + poetry run python tests/integration/integration_tests.py + + - name: Print the CodeGate container logs (useful for debugging) if: always() run: | docker logs $CONTAINER_NAME @@ -194,3 +248,8 @@ jobs: echo "DB contents:" ls -la codegate_volume/db docker exec $CONTAINER_NAME ls -la /app/codegate_volume/db + + - name: Print the vllm container logs (useful for debugging) + if: always() + run: | + docker logs vllm diff --git a/src/codegate/providers/vllm/adapter.py b/src/codegate/providers/vllm/adapter.py index 0391708b..4b6294f3 100644 --- a/src/codegate/providers/vllm/adapter.py +++ b/src/codegate/providers/vllm/adapter.py @@ -102,6 +102,7 @@ def _has_chat_ml_format(data: Dict) -> bool: content = input_chat_request["messages"][0]["content"] if isinstance(content, str) and "<|im_start|>" in content: return True + return False def normalize(self, data: Dict) -> 
ChatCompletionRequest: """ @@ -117,12 +118,6 @@ def normalize(self, data: Dict) -> ChatCompletionRequest: if not model_name.startswith("hosted_vllm/"): normalized_data["model"] = f"hosted_vllm/{model_name}" - # Ensure the base_url ends with /v1 if provided - if "base_url" in normalized_data: - base_url = normalized_data["base_url"].rstrip("/") - if not base_url.endswith("/v1"): - normalized_data["base_url"] = f"{base_url}/v1" - ret_data = normalized_data if self._has_chat_ml_format(normalized_data): ret_data = self._chat_ml_normalizer.normalize(normalized_data) diff --git a/src/codegate/providers/vllm/provider.py b/src/codegate/providers/vllm/provider.py index 303b907b..70f768c7 100644 --- a/src/codegate/providers/vllm/provider.py +++ b/src/codegate/providers/vllm/provider.py @@ -1,4 +1,5 @@ import json +from urllib.parse import urljoin import httpx import structlog @@ -31,6 +32,19 @@ def __init__( def provider_route_name(self) -> str: return "vllm" + def _get_base_url(self) -> str: + """ + Get the base URL from config with proper formatting + """ + config = Config.get_config() + base_url = config.provider_urls.get("vllm") if config else "" + if base_url: + base_url = base_url.rstrip("/") + # Add /v1 if not present + if not base_url.endswith("/v1"): + base_url = f"{base_url}/v1" + return base_url + def models(self): resp = httpx.get(f"{self.base_url}/v1/models") jsonresp = resp.json() @@ -40,60 +54,69 @@ def models(self): def _setup_routes(self): """ Sets up the /chat/completions route for the provider as expected by the - OpenAI API. Extracts the API key from the "Authorization" header and - passes it to the completion handler. + OpenAI API. Makes the API key optional in the "Authorization" header. """ @self.router.get(f"/{self.provider_route_name}/models") - async def get_models(authorization: str = Header(..., description="Bearer token")): - if not authorization.startswith("Bearer "): - raise HTTPException(status_code=401, detail="Invalid authorization header") - - token = authorization.split(" ")[1] - config = Config.get_config() - if config: - base_url = config.provider_urls.get("vllm") - else: - base_url = "" - - async with httpx.AsyncClient() as client: - response = await client.get( - f"{base_url}/v1/models", headers={"Authorization": f"Bearer {token}"} + async def get_models( + authorization: str | None = Header(None, description="Optional Bearer token") + ): + base_url = self._get_base_url() + headers = {} + + if authorization: + if not authorization.startswith("Bearer "): + raise HTTPException( + status_code=401, detail="Invalid authorization header format" + ) + token = authorization.split(" ")[1] + headers["Authorization"] = f"Bearer {token}" + + try: + models_url = urljoin(base_url, "v1/models") + async with httpx.AsyncClient() as client: + response = await client.get(models_url, headers=headers) + response.raise_for_status() + return response.json() + except httpx.HTTPError as e: + logger = structlog.get_logger("codegate") + logger.error("Error fetching vLLM models", error=str(e)) + raise HTTPException( + status_code=e.response.status_code if hasattr(e, "response") else 500, + detail=str(e), ) - response.raise_for_status() - return response.json() @self.router.post(f"/{self.provider_route_name}/chat/completions") @self.router.post(f"/{self.provider_route_name}/completions") async def create_completion( request: Request, - authorization: str = Header(..., description="Bearer token"), + authorization: str | None = Header(None, description="Optional Bearer token"), ): - if not 
authorization.startswith("Bearer "): - raise HTTPException(status_code=401, detail="Invalid authorization header") + api_key = None + if authorization: + if not authorization.startswith("Bearer "): + raise HTTPException( + status_code=401, detail="Invalid authorization header format" + ) + api_key = authorization.split(" ")[1] - api_key = authorization.split(" ")[1] body = await request.body() data = json.loads(body) # Add the vLLM base URL to the request - config = Config.get_config() - if config: - data["base_url"] = config.provider_urls.get("vllm") - else: - data["base_url"] = "" + base_url = self._get_base_url() + data["base_url"] = base_url is_fim_request = self._is_fim_request(request, data) try: + # Pass the potentially None api_key to complete stream = await self.complete(data, api_key, is_fim_request=is_fim_request) except Exception as e: - #  check if we have an status code there + # Check if we have a status code there if hasattr(e, "status_code"): logger = structlog.get_logger("codegate") logger.error("Error in VLLMProvider completion", error=str(e)) + raise HTTPException(status_code=e.status_code, detail=str(e)) + raise e - raise HTTPException(status_code=e.status_code, detail=str(e)) # type: ignore - else: - # just continue raising the exception - raise e return self._completion_handler.create_response(stream) diff --git a/tests/integration/testcases.yaml b/tests/integration/testcases.yaml index 7798bf69..4a32d4c6 100644 --- a/tests/integration/testcases.yaml +++ b/tests/integration/testcases.yaml @@ -1,6 +1,6 @@ headers: vllm: - Authorization: Bearer ENV_VLLM_KEY + Content-Type: application/json openai: Authorization: Bearer ENV_OPENAI_KEY ollama: @@ -161,12 +161,12 @@ testcases: "role":"user" } ], - "model":"Qwen/Qwen2.5-Coder-14B-Instruct", + "model":"Qwen/Qwen2.5-Coder-1.5B-Instruct", "stream":true, "temperature":0 } likes: | - Hello! How can I assist you today? If you have any questions about software security, package analysis, or need guidance on secure coding practices, feel free to ask. + Hello! How can I assist you today? vllm_fim: name: VLLM FIM @@ -174,7 +174,7 @@ testcases: url: http://127.0.0.1:8989/vllm/completions data: | { - "model": "Qwen/Qwen2.5-Coder-14B", + "model": "Qwen/Qwen2.5-Coder-1.5B-Instruct", "max_tokens": 4096, "temperature": 0, "stream": true, @@ -333,4 +333,4 @@ testcases: print(response.status_code) print(response.json()) - ``` \ No newline at end of file + ``` From 4a45c7575f5bbbc0eca2c506b3ebcae741b0231a Mon Sep 17 00:00:00 2001 From: Radoslav Dimitrov Date: Wed, 29 Jan 2025 13:19:44 +0200 Subject: [PATCH 2/7] Try using Qwen/Qwen2.5-Coder-3B-Instruct Signed-off-by: Radoslav Dimitrov --- .github/workflows/integration-tests.yml | 6 +++--- tests/integration/testcases.yaml | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index 81eca2e6..1dc3329e 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -189,7 +189,7 @@ jobs: docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g . 
docker run -d --name vllm \ --network="host" \ - vllm-cpu-env --model Qwen/Qwen2.5-Coder-1.5B-Instruct + vllm-cpu-env --model Qwen/Qwen2.5-Coder-3B-Instruct - name: Verify the vllm container is running run: | @@ -205,7 +205,7 @@ jobs: echo -e "\nVerify the completions endpoint works\n" curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{ - "model": "Qwen/Qwen2.5-Coder-1.5B-Instruct", + "model": "Qwen/Qwen2.5-Coder-3B-Instruct", "prompt": ["How to make pizza"], "max_tokens": 100, "temperature": 0 @@ -215,7 +215,7 @@ jobs: curl -X POST http://localhost:8000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "Qwen/Qwen2.5-Coder-1.5B-Instruct", + "model": "Qwen/Qwen2.5-Coder-3B-Instruct", "messages": [ {"role": "system", "content": "You are a coding assistant."}, {"role": "user", "content": "Hello"} diff --git a/tests/integration/testcases.yaml b/tests/integration/testcases.yaml index 4a32d4c6..b986475f 100644 --- a/tests/integration/testcases.yaml +++ b/tests/integration/testcases.yaml @@ -161,7 +161,7 @@ testcases: "role":"user" } ], - "model":"Qwen/Qwen2.5-Coder-1.5B-Instruct", + "model":"Qwen/Qwen2.5-Coder-3B-Instruct", "stream":true, "temperature":0 } @@ -174,7 +174,7 @@ testcases: url: http://127.0.0.1:8989/vllm/completions data: | { - "model": "Qwen/Qwen2.5-Coder-1.5B-Instruct", + "model": "Qwen/Qwen2.5-Coder-3B-Instruct", "max_tokens": 4096, "temperature": 0, "stream": true, From 430d6738b23042866b83e4282a33f36003e499a1 Mon Sep 17 00:00:00 2001 From: Radoslav Dimitrov Date: Wed, 29 Jan 2025 14:36:03 +0200 Subject: [PATCH 3/7] Go back to using Qwen/Qwen2.5-Coder-0.5B-Instruct Signed-off-by: Radoslav Dimitrov --- .github/workflows/integration-tests.yml | 6 +++--- tests/integration/testcases.yaml | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index 1dc3329e..d5cc88fb 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -189,7 +189,7 @@ jobs: docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g . 
docker run -d --name vllm \ --network="host" \ - vllm-cpu-env --model Qwen/Qwen2.5-Coder-3B-Instruct + vllm-cpu-env --model Qwen/Qwen2.5-Coder-0.5B-Instruct - name: Verify the vllm container is running run: | @@ -205,7 +205,7 @@ jobs: echo -e "\nVerify the completions endpoint works\n" curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{ - "model": "Qwen/Qwen2.5-Coder-3B-Instruct", + "model": "Qwen/Qwen2.5-Coder-0.5B-Instruct", "prompt": ["How to make pizza"], "max_tokens": 100, "temperature": 0 @@ -215,7 +215,7 @@ jobs: curl -X POST http://localhost:8000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "Qwen/Qwen2.5-Coder-3B-Instruct", + "model": "Qwen/Qwen2.5-Coder-0.5B-Instruct", "messages": [ {"role": "system", "content": "You are a coding assistant."}, {"role": "user", "content": "Hello"} diff --git a/tests/integration/testcases.yaml b/tests/integration/testcases.yaml index b986475f..2102ac6e 100644 --- a/tests/integration/testcases.yaml +++ b/tests/integration/testcases.yaml @@ -161,7 +161,7 @@ testcases: "role":"user" } ], - "model":"Qwen/Qwen2.5-Coder-3B-Instruct", + "model":"Qwen/Qwen2.5-Coder-0.5B-Instruct", "stream":true, "temperature":0 } @@ -174,7 +174,7 @@ testcases: url: http://127.0.0.1:8989/vllm/completions data: | { - "model": "Qwen/Qwen2.5-Coder-3B-Instruct", + "model": "Qwen/Qwen2.5-Coder-0.5B-Instruct", "max_tokens": 4096, "temperature": 0, "stream": true, From c6bb1fa27c91934c9e192210cf75a54ac9b9e0ad Mon Sep 17 00:00:00 2001 From: Radoslav Dimitrov Date: Wed, 29 Jan 2025 15:09:03 +0200 Subject: [PATCH 4/7] Reformat the vllm_fim test Signed-off-by: Radoslav Dimitrov --- tests/integration/testcases.yaml | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/integration/testcases.yaml b/tests/integration/testcases.yaml index 2102ac6e..e7067046 100644 --- a/tests/integration/testcases.yaml +++ b/tests/integration/testcases.yaml @@ -178,7 +178,20 @@ testcases: "max_tokens": 4096, "temperature": 0, "stream": true, - "stop": ["<|endoftext|>", "<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>", "<|fim_pad|>", "<|repo_name|>", "<|file_sep|>", "<|im_start|>", "<|im_end|>", "/src/", "#- coding: utf-8", "```"], + "stop": [ + "<|endoftext|>", + "<|fim_prefix|>", + "<|fim_middle|>", + "<|fim_suffix|>", + "<|fim_pad|>", + "<|repo_name|>", + "<|file_sep|>", + "<|im_start|>", + "<|im_end|>", + "/src/", + "#- coding: utf-8", + "```" + ], "prompt":"<|fim_prefix|>\n# codegate/test.py\nimport invokehttp\nimport requests\n\nkey = \"mysecret-key\"\n\ndef call_api():\n <|fim_suffix|>\n\n\ndata = {'key1': 'test1', 'key2': 'test2'}\nresponse = call_api('http://localhost:8080', method='post', data='data')\n<|fim_middle|>" } likes: | From 123df46094ed314a5a5a36f2e997905e32f9c911 Mon Sep 17 00:00:00 2001 From: Radoslav Dimitrov Date: Wed, 29 Jan 2025 15:24:26 +0200 Subject: [PATCH 5/7] Use Qwen/Qwen2.5-Coder-0.5B Signed-off-by: Radoslav Dimitrov --- .github/workflows/integration-tests.yml | 6 +++--- tests/integration/testcases.yaml | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index d5cc88fb..91c5a1d3 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -189,7 +189,7 @@ jobs: docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g . 
docker run -d --name vllm \ --network="host" \ - vllm-cpu-env --model Qwen/Qwen2.5-Coder-0.5B-Instruct + vllm-cpu-env --model Qwen/Qwen2.5-Coder-0.5B - name: Verify the vllm container is running run: | @@ -205,7 +205,7 @@ jobs: echo -e "\nVerify the completions endpoint works\n" curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{ - "model": "Qwen/Qwen2.5-Coder-0.5B-Instruct", + "model": "Qwen/Qwen2.5-Coder-0.5B", "prompt": ["How to make pizza"], "max_tokens": 100, "temperature": 0 @@ -215,7 +215,7 @@ jobs: curl -X POST http://localhost:8000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "Qwen/Qwen2.5-Coder-0.5B-Instruct", + "model": "Qwen/Qwen2.5-Coder-0.5B", "messages": [ {"role": "system", "content": "You are a coding assistant."}, {"role": "user", "content": "Hello"} diff --git a/tests/integration/testcases.yaml b/tests/integration/testcases.yaml index e7067046..f1174f08 100644 --- a/tests/integration/testcases.yaml +++ b/tests/integration/testcases.yaml @@ -161,7 +161,7 @@ testcases: "role":"user" } ], - "model":"Qwen/Qwen2.5-Coder-0.5B-Instruct", + "model":"Qwen/Qwen2.5-Coder-0.5B", "stream":true, "temperature":0 } @@ -174,7 +174,7 @@ testcases: url: http://127.0.0.1:8989/vllm/completions data: | { - "model": "Qwen/Qwen2.5-Coder-0.5B-Instruct", + "model": "Qwen/Qwen2.5-Coder-0.5B", "max_tokens": 4096, "temperature": 0, "stream": true, From f0127af1ff9a0f97039dab5e34a327302518b17f Mon Sep 17 00:00:00 2001 From: Radoslav Dimitrov Date: Wed, 29 Jan 2025 15:49:00 +0200 Subject: [PATCH 6/7] Revert "Use Qwen/Qwen2.5-Coder-0.5B" This reverts commit 32b2d8c4001b69240d765893c62278a4f7ee5dd3. --- .github/workflows/integration-tests.yml | 6 +++--- tests/integration/testcases.yaml | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index 91c5a1d3..d5cc88fb 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -189,7 +189,7 @@ jobs: docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g . 
docker run -d --name vllm \ --network="host" \ - vllm-cpu-env --model Qwen/Qwen2.5-Coder-0.5B + vllm-cpu-env --model Qwen/Qwen2.5-Coder-0.5B-Instruct - name: Verify the vllm container is running run: | @@ -205,7 +205,7 @@ jobs: echo -e "\nVerify the completions endpoint works\n" curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{ - "model": "Qwen/Qwen2.5-Coder-0.5B", + "model": "Qwen/Qwen2.5-Coder-0.5B-Instruct", "prompt": ["How to make pizza"], "max_tokens": 100, "temperature": 0 @@ -215,7 +215,7 @@ jobs: curl -X POST http://localhost:8000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "Qwen/Qwen2.5-Coder-0.5B", + "model": "Qwen/Qwen2.5-Coder-0.5B-Instruct", "messages": [ {"role": "system", "content": "You are a coding assistant."}, {"role": "user", "content": "Hello"} diff --git a/tests/integration/testcases.yaml b/tests/integration/testcases.yaml index f1174f08..e7067046 100644 --- a/tests/integration/testcases.yaml +++ b/tests/integration/testcases.yaml @@ -161,7 +161,7 @@ testcases: "role":"user" } ], - "model":"Qwen/Qwen2.5-Coder-0.5B", + "model":"Qwen/Qwen2.5-Coder-0.5B-Instruct", "stream":true, "temperature":0 } @@ -174,7 +174,7 @@ testcases: url: http://127.0.0.1:8989/vllm/completions data: | { - "model": "Qwen/Qwen2.5-Coder-0.5B", + "model": "Qwen/Qwen2.5-Coder-0.5B-Instruct", "max_tokens": 4096, "temperature": 0, "stream": true, From 7a70414c878bfe8623bbc26d04d7bde029bc2572 Mon Sep 17 00:00:00 2001 From: Radoslav Dimitrov Date: Wed, 29 Jan 2025 16:29:11 +0200 Subject: [PATCH 7/7] Update the expected result for vllm_fim Signed-off-by: Radoslav Dimitrov --- tests/integration/testcases.yaml | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/tests/integration/testcases.yaml b/tests/integration/testcases.yaml index e7067046..9190dc9c 100644 --- a/tests/integration/testcases.yaml +++ b/tests/integration/testcases.yaml @@ -195,19 +195,14 @@ testcases: "prompt":"<|fim_prefix|>\n# codegate/test.py\nimport invokehttp\nimport requests\n\nkey = \"mysecret-key\"\n\ndef call_api():\n <|fim_suffix|>\n\n\ndata = {'key1': 'test1', 'key2': 'test2'}\nresponse = call_api('http://localhost:8080', method='post', data='data')\n<|fim_middle|>" } likes: | - # Create an instance of the InvokeHTTP class - invoke = invokehttp.InvokeHTTP(key) + return response.json() - # Call the API using the invoke_http method - response = invoke.invoke_http(url, method='get', data=data) + def test_call_api(): + response = call_api('http://localhost:8080', method='post', data='data') + assert response['key1'] == 'test1' and response['key2'] == 'test2', "Test failed" - # Check the response status code - if response.status_code == 200: - # The API call was successful - print(response.json()) - else: - # The API call failed - print('Error:', response.status_code) + if __name__ == '__main__': + test_call_api() anthropic_chat: name: Anthropic Chat
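For reference, the request path these integration tests exercise (client → CodeGate on port 8989 → vLLM on port 8000) can also be checked by hand. The sketch below is illustrative only and is not part of this patch series or the test harness: it assumes CodeGate is running as in the workflow above (with `CODEGATE_VLLM_URL=http://localhost:8000`), and it reuses the `/vllm/chat/completions` route registered by the provider and the `Qwen/Qwen2.5-Coder-0.5B-Instruct` model from the test cases. With the provider change in this series, the Bearer token is optional, so no Authorization header is sent.

```python
# Illustrative manual check of the CodeGate vLLM provider (not part of the test suite).
# Assumes CodeGate listens on 127.0.0.1:8989 and proxies to vLLM on localhost:8000,
# mirroring the vllm testcases above.
import requests

url = "http://127.0.0.1:8989/vllm/chat/completions"
payload = {
    "model": "Qwen/Qwen2.5-Coder-0.5B-Instruct",
    "messages": [{"role": "user", "content": "Hello"}],
    "temperature": 0,
    "stream": True,
}

# The Authorization header is omitted: the provider now treats the Bearer token
# as optional and only forwards it to vLLM when one is supplied.
with requests.post(url, json=payload, stream=True, timeout=120) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines():
        if line:
            print(line.decode())  # raw streamed chunks from the proxied completion
```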