Enable the integration tests for vllm #806

Merged 7 commits on Jan 30, 2025
61 changes: 60 additions & 1 deletion .github/workflows/integration-tests.yml
@@ -62,6 +62,7 @@ jobs:
-v "$(pwd)"/codegate_volume:/app/codegate_volume \
-e CODEGATE_APP_LOG_LEVEL=DEBUG \
-e CODEGATE_OLLAMA_URL=http://localhost:11434 \
-e CODEGATE_VLLM_URL=http://localhost:8000 \
--restart unless-stopped $DOCKER_IMAGE

# Confirm the container started
@@ -181,7 +182,60 @@ jobs:
run: |
docker logs ollama

- name: Print the container logs (useful for debugging)
- name: Build and run the vllm container
run: |
git clone https://github.com/vllm-project/vllm.git
cd vllm
docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g .
docker run -d --name vllm \
--network="host" \
vllm-cpu-env --model Qwen/Qwen2.5-Coder-0.5B-Instruct

- name: Verify the vllm container is running
run: |
echo -e "\nVerify the vllm container is serving\n"
docker ps -f name=vllm

echo "Loop until the endpoint responds successfully"
while ! curl --silent --fail --get "http://localhost:8000/ping" >/dev/null; do
echo "Ping not available yet. Retrying in 2 seconds..."
sleep 2
done
echo -e "\nPing is now available!\n"

echo -e "\nVerify the completions endpoint works\n"
curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{
"model": "Qwen/Qwen2.5-Coder-0.5B-Instruct",
"prompt": ["How to make pizza"],
"max_tokens": 100,
"temperature": 0
}'

echo -e "\nVerify the chat/completions endpoint works\n"
curl -X POST http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen/Qwen2.5-Coder-0.5B-Instruct",
"messages": [
{"role": "system", "content": "You are a coding assistant."},
{"role": "user", "content": "Hello"}
],
"temperature": 0,
"max_tokens": 4096,
"extra_body": {}
}'

# Print a new line and then the message in a single echo
echo -e "\nPrint the vllm container logs\n"
docker logs vllm

- name: Run integration tests - vllm
env:
CODEGATE_PROVIDERS: "vllm"
run: |
poetry run python tests/integration/integration_tests.py

- name: Print the CodeGate container logs (useful for debugging)
if: always()
run: |
docker logs $CONTAINER_NAME
@@ -194,3 +248,8 @@ jobs:
echo "DB contents:"
ls -la codegate_volume/db
docker exec $CONTAINER_NAME ls -la /app/codegate_volume/db

- name: Print the vllm container logs (useful for debugging)
if: always()
run: |
docker logs vllm
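
Note: the readiness loop and smoke test above can be reproduced locally with a short Python script. This is a sketch, not part of the PR; it assumes a vLLM server is already serving on http://localhost:8000 (the same address the workflow uses) and relies on httpx, which the provider code already imports.

import time

import httpx

BASE_URL = "http://localhost:8000"

# Poll /ping until the server responds, mirroring the curl loop in the workflow.
while True:
    try:
        httpx.get(f"{BASE_URL}/ping", timeout=2).raise_for_status()
        break
    except httpx.HTTPError:
        print("Ping not available yet. Retrying in 2 seconds...")
        time.sleep(2)

# Smoke-test the completions endpoint with the same payload the workflow sends.
resp = httpx.post(
    f"{BASE_URL}/v1/completions",
    json={
        "model": "Qwen/Qwen2.5-Coder-0.5B-Instruct",
        "prompt": ["How to make pizza"],
        "max_tokens": 100,
        "temperature": 0,
    },
    timeout=120,
)
print(resp.json())
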
7 changes: 1 addition & 6 deletions src/codegate/providers/vllm/adapter.py
@@ -102,6 +102,7 @@ def _has_chat_ml_format(data: Dict) -> bool:
content = input_chat_request["messages"][0]["content"]
if isinstance(content, str) and "<|im_start|>" in content:
return True
return False

def normalize(self, data: Dict) -> ChatCompletionRequest:
"""
@@ -117,12 +118,6 @@ def normalize(self, data: Dict) -> ChatCompletionRequest:
if not model_name.startswith("hosted_vllm/"):
normalized_data["model"] = f"hosted_vllm/{model_name}"

# Ensure the base_url ends with /v1 if provided
if "base_url" in normalized_data:
base_url = normalized_data["base_url"].rstrip("/")
if not base_url.endswith("/v1"):
normalized_data["base_url"] = f"{base_url}/v1"

ret_data = normalized_data
if self._has_chat_ml_format(normalized_data):
ret_data = self._chat_ml_normalizer.normalize(normalized_data)
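
Note: the adapter change above adds an explicit return False so the ChatML check no longer falls through and returns None. An illustrative, standalone sketch of that check (not the project's code verbatim; the helper name and dict shapes are assumptions for the example):

from typing import Dict


def has_chat_ml_format(data: Dict) -> bool:
    # A request counts as ChatML-formatted when the first message's content
    # is a string containing the "<|im_start|>" marker.
    messages = data.get("messages") or []
    if not messages:
        return False
    content = messages[0].get("content")
    if isinstance(content, str) and "<|im_start|>" in content:
        return True
    return False  # explicit fallback added by this PR


print(has_chat_ml_format({"messages": [{"content": "<|im_start|>system\nYou are helpful"}]}))  # True
print(has_chat_ml_format({"messages": [{"content": "plain prompt"}]}))  # False
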
87 changes: 55 additions & 32 deletions src/codegate/providers/vllm/provider.py
@@ -1,4 +1,5 @@
import json
from urllib.parse import urljoin

import httpx
import structlog
@@ -31,6 +32,19 @@ def __init__(
def provider_route_name(self) -> str:
return "vllm"

def _get_base_url(self) -> str:
"""
Get the base URL from config with proper formatting
"""
config = Config.get_config()
base_url = config.provider_urls.get("vllm") if config else ""
if base_url:
base_url = base_url.rstrip("/")
# Add /v1 if not present
if not base_url.endswith("/v1"):
base_url = f"{base_url}/v1"
return base_url

def models(self):
resp = httpx.get(f"{self.base_url}/v1/models")
jsonresp = resp.json()
@@ -40,60 +54,69 @@ def models(self):
def _setup_routes(self):
"""
Sets up the /chat/completions route for the provider as expected by the
OpenAI API. Extracts the API key from the "Authorization" header and
passes it to the completion handler.
OpenAI API. Makes the API key optional in the "Authorization" header.
"""

@self.router.get(f"/{self.provider_route_name}/models")
async def get_models(authorization: str = Header(..., description="Bearer token")):
if not authorization.startswith("Bearer "):
raise HTTPException(status_code=401, detail="Invalid authorization header")

token = authorization.split(" ")[1]
config = Config.get_config()
if config:
base_url = config.provider_urls.get("vllm")
else:
base_url = ""

async with httpx.AsyncClient() as client:
response = await client.get(
f"{base_url}/v1/models", headers={"Authorization": f"Bearer {token}"}
async def get_models(
authorization: str | None = Header(None, description="Optional Bearer token")
):
base_url = self._get_base_url()
headers = {}

if authorization:
if not authorization.startswith("Bearer "):
raise HTTPException(
status_code=401, detail="Invalid authorization header format"
)
token = authorization.split(" ")[1]
headers["Authorization"] = f"Bearer {token}"

try:
models_url = urljoin(base_url, "v1/models")
async with httpx.AsyncClient() as client:
response = await client.get(models_url, headers=headers)
response.raise_for_status()
return response.json()
except httpx.HTTPError as e:
logger = structlog.get_logger("codegate")
logger.error("Error fetching vLLM models", error=str(e))
raise HTTPException(
status_code=e.response.status_code if hasattr(e, "response") else 500,
detail=str(e),
)
response.raise_for_status()
return response.json()

@self.router.post(f"/{self.provider_route_name}/chat/completions")
@self.router.post(f"/{self.provider_route_name}/completions")
async def create_completion(
request: Request,
authorization: str = Header(..., description="Bearer token"),
authorization: str | None = Header(None, description="Optional Bearer token"),
):
if not authorization.startswith("Bearer "):
raise HTTPException(status_code=401, detail="Invalid authorization header")
api_key = None
if authorization:
if not authorization.startswith("Bearer "):
raise HTTPException(
status_code=401, detail="Invalid authorization header format"
)
api_key = authorization.split(" ")[1]

api_key = authorization.split(" ")[1]
body = await request.body()
data = json.loads(body)

# Add the vLLM base URL to the request
config = Config.get_config()
if config:
data["base_url"] = config.provider_urls.get("vllm")
else:
data["base_url"] = ""
base_url = self._get_base_url()
data["base_url"] = base_url

is_fim_request = self._is_fim_request(request, data)
try:
# Pass the potentially None api_key to complete
stream = await self.complete(data, api_key, is_fim_request=is_fim_request)
except Exception as e:
#  check if we have an status code there
# Check if we have a status code there
if hasattr(e, "status_code"):
logger = structlog.get_logger("codegate")
logger.error("Error in VLLMProvider completion", error=str(e))
raise HTTPException(status_code=e.status_code, detail=str(e))
raise e

raise HTTPException(status_code=e.status_code, detail=str(e)) # type: ignore
else:
# just continue raising the exception
raise e
return self._completion_handler.create_response(stream)
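
Note: _get_base_url() strips any trailing slash before appending /v1, which matters because urljoin(base_url, "v1/models") only resolves to .../v1/models when the base URL has no trailing slash. A quick standalone check of that behaviour (a sketch, not part of the PR):

from urllib.parse import urljoin


def get_base_url(configured: str) -> str:
    # Same normalization as _get_base_url(): drop a trailing slash, ensure /v1.
    base_url = configured.rstrip("/")
    if not base_url.endswith("/v1"):
        base_url = f"{base_url}/v1"
    return base_url


base = get_base_url("http://localhost:8000")
print(base)                              # http://localhost:8000/v1
print(urljoin(base, "v1/models"))        # http://localhost:8000/v1/models
print(urljoin(base + "/", "v1/models"))  # a trailing slash would double the segment: .../v1/v1/models
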
42 changes: 25 additions & 17 deletions tests/integration/testcases.yaml
@@ -1,6 +1,6 @@
headers:
vllm:
Authorization: Bearer ENV_VLLM_KEY
Content-Type: application/json
openai:
Authorization: Bearer ENV_OPENAI_KEY
ollama:
@@ -161,40 +161,48 @@ testcases:
"role":"user"
}
],
"model":"Qwen/Qwen2.5-Coder-14B-Instruct",
"model":"Qwen/Qwen2.5-Coder-0.5B-Instruct",
"stream":true,
"temperature":0
}
likes: |
Hello! How can I assist you today? If you have any questions about software security, package analysis, or need guidance on secure coding practices, feel free to ask.
Hello! How can I assist you today?

vllm_fim:
name: VLLM FIM
provider: vllm
url: http://127.0.0.1:8989/vllm/completions
data: |
{
"model": "Qwen/Qwen2.5-Coder-14B",
"model": "Qwen/Qwen2.5-Coder-0.5B-Instruct",
"max_tokens": 4096,
"temperature": 0,
"stream": true,
"stop": ["<|endoftext|>", "<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>", "<|fim_pad|>", "<|repo_name|>", "<|file_sep|>", "<|im_start|>", "<|im_end|>", "/src/", "#- coding: utf-8", "```"],
"stop": [
"<|endoftext|>",
"<|fim_prefix|>",
"<|fim_middle|>",
"<|fim_suffix|>",
"<|fim_pad|>",
"<|repo_name|>",
"<|file_sep|>",
"<|im_start|>",
"<|im_end|>",
"/src/",
"#- coding: utf-8",
"```"
],
"prompt":"<|fim_prefix|>\n# codegate/test.py\nimport invokehttp\nimport requests\n\nkey = \"mysecret-key\"\n\ndef call_api():\n <|fim_suffix|>\n\n\ndata = {'key1': 'test1', 'key2': 'test2'}\nresponse = call_api('http://localhost:8080', method='post', data='data')\n<|fim_middle|>"
}
likes: |
# Create an instance of the InvokeHTTP class
invoke = invokehttp.InvokeHTTP(key)
return response.json()

# Call the API using the invoke_http method
response = invoke.invoke_http(url, method='get', data=data)
def test_call_api():
response = call_api('http://localhost:8080', method='post', data='data')
assert response['key1'] == 'test1' and response['key2'] == 'test2', "Test failed"

# Check the response status code
if response.status_code == 200:
# The API call was successful
print(response.json())
else:
# The API call failed
print('Error:', response.status_code)
if __name__ == '__main__':
test_call_api()

anthropic_chat:
name: Anthropic Chat
@@ -333,4 +341,4 @@ testcases:

print(response.status_code)
print(response.json())
```
```
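
Note: the vllm_fim prompt above follows the fill-in-the-middle marker layout used by Qwen2.5-Coder (prefix, then suffix, then middle, with generation starting after the middle marker). A small illustrative sketch of how such a prompt string is assembled; the variable names are made up for the example:

# Assemble a FIM prompt: prefix marker + code before the cursor,
# suffix marker + code after the cursor, then the middle marker.
PREFIX, SUFFIX, MIDDLE = "<|fim_prefix|>", "<|fim_suffix|>", "<|fim_middle|>"

before_cursor = (
    "\n# codegate/test.py\n"
    "import invokehttp\n"
    "import requests\n\n"
    'key = "mysecret-key"\n\n'
    "def call_api():\n    "
)
after_cursor = (
    "\n\n\ndata = {'key1': 'test1', 'key2': 'test2'}\n"
    "response = call_api('http://localhost:8080', method='post', data='data')\n"
)

fim_prompt = f"{PREFIX}{before_cursor}{SUFFIX}{after_cursor}{MIDDLE}"
print(fim_prompt)
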