diff --git a/presets/inference/text-generation/api_spec.json b/presets/inference/text-generation/api_spec.json index 480fa97e4..d6448d15d 100644 --- a/presets/inference/text-generation/api_spec.json +++ b/presets/inference/text-generation/api_spec.json @@ -5,60 +5,96 @@ "version": "0.1.0" }, "paths": { - "/": { + "/health": { "get": { - "summary": "Home Endpoint", - "description": "A simple endpoint that indicates the server is running.\nNo parameters are required. Returns a message indicating the server status.", - "operationId": "home__get", + "summary": "Health", + "description": "Health check.", + "operationId": "health_health_get", "responses": { "200": { "description": "Successful Response", "content": { "application/json": { - "schema": { - "$ref": "#/components/schemas/HomeResponse" - } + "schema": {} } } } } } }, - "/healthz": { - "get": { - "summary": "Health Check Endpoint", - "operationId": "health_check_healthz_get", + "/tokenize": { + "post": { + "summary": "Tokenize", + "operationId": "tokenize_tokenize_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "anyOf": [ + { + "$ref": "#/components/schemas/TokenizeCompletionRequest" + }, + { + "$ref": "#/components/schemas/TokenizeChatRequest" + } + ], + "title": "Request" + } + } + }, + "required": true + }, "responses": { "200": { "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/HealthStatus" - }, - "example": { - "status": "Healthy" + "$ref": "#/components/schemas/HTTPValidationError" } } } + } + } + } + }, + "/detokenize": { + "post": { + "summary": "Detokenize", + "operationId": "detokenize_detokenize_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/DetokenizeRequest" + } + } }, - "500": { - "description": "Error Response", + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", "content": { "application/json": { - "examples": { - "model_uninitialized": { - "summary": "Model not initialized", - "value": { - "detail": "Model not initialized" - } - }, - "pipeline_uninitialized": { - "summary": "Pipeline not initialized", - "value": { - "detail": "Pipeline not initialized" - } - } + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" } } } @@ -66,75 +102,47 @@ } } }, - "/chat": { + "/v1/models": { + "get": { + "summary": "Show Available Models", + "operationId": "show_available_models_v1_models_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/version": { + "get": { + "summary": "Show Version", + "operationId": "show_version_version_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/v1/chat/completions": { "post": { - "summary": "Chat Endpoint", - "description": "Processes chat requests, generating text based on the specified pipeline (text generation or conversational).\nValidates required parameters based on the pipeline and returns the generated text.", - "operationId": "generate_text_chat_post", + "summary": "Create Chat Completion", + "operationId": 
"create_chat_completion_v1_chat_completions_post", "requestBody": { "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/UnifiedRequestModel" - }, - "examples": { - "text_generation_example": { - "summary": "Text Generation Example", - "description": "An example of a text generation request.", - "value": { - "prompt": "Tell me a joke", - "return_full_text": true, - "clean_up_tokenization_spaces": false, - "generate_kwargs": { - "max_length": 200, - "min_length": 0, - "do_sample": true, - "early_stopping": false, - "num_beams": 1, - "temperature": 1, - "top_k": 10, - "top_p": 1, - "typical_p": 1, - "repetition_penalty": 1, - "eos_token_id": 11 - } - } - }, - "conversation_example": { - "summary": "Conversation Example", - "description": "An example of a conversational request.", - "value": { - "messages": [ - { - "role": "user", - "content": "What is your favourite condiment?" - }, - { - "role": "assistant", - "content": "Well, im quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever im cooking up in the kitchen!" - }, - { - "role": "user", - "content": "Do you have mayonnaise recipes?" - } - ], - "return_full_text": true, - "clean_up_tokenization_spaces": false, - "generate_kwargs": { - "max_length": 200, - "min_length": 0, - "do_sample": true, - "early_stopping": false, - "num_beams": 1, - "temperature": 1, - "top_k": 10, - "top_p": 1, - "typical_p": 1, - "repetition_penalty": 1, - "eos_token_id": 11 - } - } - } + "$ref": "#/components/schemas/ChatCompletionRequest" } } }, @@ -145,47 +153,45 @@ "description": "Successful Response", "content": { "application/json": { - "schema": {}, - "examples": { - "text_generation": { - "summary": "Text Generation Response", - "value": { - "Result": "Generated text based on the prompt." - } - }, - "conversation": { - "summary": "Conversation Response", - "value": { - "Result": "Response to the last message in the conversation." 
- } - } - } + "schema": {} } } }, - "400": { + "422": { "description": "Validation Error", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ErrorResponse" - }, - "examples": { - "missing_prompt": { - "summary": "Missing Prompt", - "value": { - "detail": "Text generation parameter prompt required" - } - }, - "missing_messages": { - "summary": "Missing Messages", - "value": { - "detail": "Conversational parameter messages required" - } - } + "$ref": "#/components/schemas/HTTPValidationError" } } } + } + } + } + }, + "/v1/completions": { + "post": { + "summary": "Create Completion", + "operationId": "create_completion_v1_completions_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CompletionRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } }, "422": { "description": "Validation Error", @@ -196,13 +202,39 @@ } } } + } + } + } + }, + "/v1/embeddings": { + "post": { + "summary": "Create Embedding", + "operationId": "create_embedding_v1_embeddings_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/EmbeddingRequest" + } + } }, - "500": { - "description": "Internal Server Error", + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ErrorResponse" + "$ref": "#/components/schemas/HTTPValidationError" } } } @@ -210,61 +242,36 @@ } } }, - "/metrics": { + "/healthz": { "get": { - "summary": "Metrics Endpoint", - "description": "Provides system metrics, including GPU details if available, or CPU and memory usage otherwise.\nUseful for monitoring the resource utilization of the server running the ML models.", - "operationId": "get_metrics_metrics_get", + "summary": "Health Check Endpoint", + "description": "Health check.", + "operationId": "health_check_healthz_get", "responses": { "200": { "description": "Successful Response", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/MetricsResponse" + "$ref": "#/components/schemas/HealthStatus" }, - "examples": { - "gpu_metrics": { - "summary": "Example when GPUs are available", - "value": { - "gpu_info": [ - { - "id": "GPU-1234", - "name": "GeForce GTX 950", - "load": "25.00%", - "temperature": "55 C", - "memory": { - "used": "1.00 GB", - "total": "2.00 GB" - } - } - ] - } - }, - "cpu_metrics": { - "summary": "Example when only CPU is available", - "value": { - "cpu_info": { - "load_percentage": 20, - "physical_cores": 4, - "total_cores": 8, - "memory": { - "used": "4.00 GB", - "total": "16.00 GB" - } - } - } - } + "example": { + "status": "Healthy" } } } }, "500": { - "description": "Internal Server Error", + "description": "Error Response", "content": { "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" + "examples": { + "model_uninitialized": { + "summary": "Model not initialized", + "value": { + "detail": "Model not initialized" + } + } } } } @@ -275,148 +282,1661 @@ }, "components": { "schemas": { - "CPUInfo": { + "AudioURL": { "properties": { - "load_percentage": { - "type": "number", - "title": "Load Percentage" + "url": { + "type": "string", + "title": "Url" + } + }, + "type": "object", + 
"required": [ + "url" + ], + "title": "AudioURL" + }, + "BaseModel": { + "properties": {}, + "type": "object", + "title": "BaseModel" + }, + "ChatCompletionAssistantMessageParam": { + "properties": { + "role": { + "type": "string", + "enum": [ + "assistant" + ], + "const": "assistant", + "title": "Role" }, - "physical_cores": { - "type": "integer", - "title": "Physical Cores" + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "anyOf": [ + { + "$ref": "#/components/schemas/ChatCompletionContentPartTextParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionContentPartRefusalParam" + } + ] + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Content" }, - "total_cores": { - "type": "integer", - "title": "Total Cores" + "function_call": { + "anyOf": [ + { + "$ref": "#/components/schemas/FunctionCall" + }, + { + "type": "null" + } + ] + }, + "name": { + "type": "string", + "title": "Name" + }, + "refusal": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Refusal" }, - "memory": { - "$ref": "#/components/schemas/MemoryInfo" + "tool_calls": { + "items": { + "$ref": "#/components/schemas/ChatCompletionMessageToolCallParam" + }, + "type": "array", + "title": "Tool Calls" } }, "type": "object", "required": [ - "load_percentage", - "physical_cores", - "total_cores", - "memory" + "role" ], - "title": "CPUInfo" + "title": "ChatCompletionAssistantMessageParam" }, - "ErrorResponse": { + "ChatCompletionContentPartAudioParam": { "properties": { - "detail": { + "audio_url": { + "$ref": "#/components/schemas/AudioURL" + }, + "type": { "type": "string", - "title": "Detail" + "enum": [ + "audio_url" + ], + "const": "audio_url", + "title": "Type" } }, "type": "object", "required": [ - "detail" + "audio_url", + "type" ], - "title": "ErrorResponse" + "title": "ChatCompletionContentPartAudioParam" }, - "GPUInfo": { + "ChatCompletionContentPartImageParam": { "properties": { - "id": { - "type": "string", - "title": "Id" - }, - "name": { - "type": "string", - "title": "Name" - }, - "load": { - "type": "string", - "title": "Load" + "image_url": { + "$ref": "#/components/schemas/ImageURL" }, - "temperature": { + "type": { "type": "string", - "title": "Temperature" - }, - "memory": { - "$ref": "#/components/schemas/MemoryInfo" + "enum": [ + "image_url" + ], + "const": "image_url", + "title": "Type" } }, "type": "object", "required": [ - "id", - "name", - "load", - "temperature", - "memory" + "image_url", + "type" ], - "title": "GPUInfo" + "title": "ChatCompletionContentPartImageParam" }, - "GenerateKwargs": { + "ChatCompletionContentPartRefusalParam": { "properties": { - "max_length": { - "type": "integer", - "title": "Max Length", - "default": 200 - }, - "min_length": { - "type": "integer", - "title": "Min Length", - "default": 0 + "refusal": { + "type": "string", + "title": "Refusal" }, - "do_sample": { - "type": "boolean", - "title": "Do Sample", + "type": { + "type": "string", + "enum": [ + "refusal" + ], + "const": "refusal", + "title": "Type" + } + }, + "type": "object", + "required": [ + "refusal", + "type" + ], + "title": "ChatCompletionContentPartRefusalParam" + }, + "ChatCompletionContentPartTextParam": { + "properties": { + "text": { + "type": "string", + "title": "Text" + }, + "type": { + "type": "string", + "enum": [ + "text" + ], + "const": "text", + "title": "Type" + } + }, + "type": "object", + "required": [ + "text", + "type" + ], + "title": "ChatCompletionContentPartTextParam" + }, + 
"ChatCompletionFunctionMessageParam": { + "properties": { + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Content" + }, + "name": { + "type": "string", + "title": "Name" + }, + "role": { + "type": "string", + "enum": [ + "function" + ], + "const": "function", + "title": "Role" + } + }, + "type": "object", + "required": [ + "content", + "name", + "role" + ], + "title": "ChatCompletionFunctionMessageParam" + }, + "ChatCompletionMessageToolCallParam": { + "properties": { + "id": { + "type": "string", + "title": "Id" + }, + "function": { + "$ref": "#/components/schemas/Function" + }, + "type": { + "type": "string", + "enum": [ + "function" + ], + "const": "function", + "title": "Type" + } + }, + "type": "object", + "required": [ + "id", + "function", + "type" + ], + "title": "ChatCompletionMessageToolCallParam" + }, + "ChatCompletionNamedFunction": { + "properties": { + "name": { + "type": "string", + "title": "Name" + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "name" + ], + "title": "ChatCompletionNamedFunction" + }, + "ChatCompletionNamedToolChoiceParam": { + "properties": { + "function": { + "$ref": "#/components/schemas/ChatCompletionNamedFunction" + }, + "type": { + "type": "string", + "enum": [ + "function" + ], + "const": "function", + "title": "Type", + "default": "function" + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "function" + ], + "title": "ChatCompletionNamedToolChoiceParam" + }, + "ChatCompletionRequest": { + "properties": { + "messages": { + "items": { + "anyOf": [ + { + "$ref": "#/components/schemas/ChatCompletionSystemMessageParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionUserMessageParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionAssistantMessageParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionToolMessageParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionFunctionMessageParam" + }, + { + "$ref": "#/components/schemas/CustomChatCompletionMessageParam" + } + ] + }, + "type": "array", + "title": "Messages" + }, + "model": { + "type": "string", + "title": "Model" + }, + "frequency_penalty": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Frequency Penalty", + "default": 0.0 + }, + "logit_bias": { + "anyOf": [ + { + "additionalProperties": { + "type": "number" + }, + "type": "object" + }, + { + "type": "null" + } + ], + "title": "Logit Bias" + }, + "logprobs": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "title": "Logprobs", + "default": false + }, + "top_logprobs": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Top Logprobs", + "default": 0 + }, + "max_tokens": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Max Tokens" + }, + "n": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "N", + "default": 1 + }, + "presence_penalty": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Presence Penalty", + "default": 0.0 + }, + "response_format": { + "anyOf": [ + { + "$ref": "#/components/schemas/ResponseFormat" + }, + { + "type": "null" + } + ] + }, + "seed": { + "anyOf": [ + { + "type": "integer", + "maximum": 9.223372036854776e+18, + "minimum": -9.223372036854776e+18 + }, + { + "type": "null" + } + ], + "title": "Seed" + }, + "stop": { + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "type": 
"string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Stop" + }, + "stream": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "title": "Stream", + "default": false + }, + "stream_options": { + "anyOf": [ + { + "$ref": "#/components/schemas/StreamOptions" + }, + { + "type": "null" + } + ] + }, + "temperature": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Temperature", + "default": 0.7 + }, + "top_p": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Top P", + "default": 1.0 + }, + "tools": { + "anyOf": [ + { + "items": { + "$ref": "#/components/schemas/ChatCompletionToolsParam" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Tools" + }, + "tool_choice": { + "anyOf": [ + { + "type": "string", + "enum": [ + "none" + ], + "const": "none" + }, + { + "type": "string", + "enum": [ + "auto" + ], + "const": "auto" + }, + { + "$ref": "#/components/schemas/ChatCompletionNamedToolChoiceParam" + }, + { + "type": "null" + } + ], + "title": "Tool Choice", + "default": "none" + }, + "parallel_tool_calls": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "title": "Parallel Tool Calls", + "default": false + }, + "user": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "User" + }, + "best_of": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Best Of" + }, + "use_beam_search": { + "type": "boolean", + "title": "Use Beam Search", + "default": false + }, + "top_k": { + "type": "integer", + "title": "Top K", + "default": -1 + }, + "min_p": { + "type": "number", + "title": "Min P", + "default": 0.0 + }, + "repetition_penalty": { + "type": "number", + "title": "Repetition Penalty", + "default": 1.0 + }, + "length_penalty": { + "type": "number", + "title": "Length Penalty", + "default": 1.0 + }, + "stop_token_ids": { + "anyOf": [ + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Stop Token Ids" + }, + "include_stop_str_in_output": { + "type": "boolean", + "title": "Include Stop Str In Output", + "default": false + }, + "ignore_eos": { + "type": "boolean", + "title": "Ignore Eos", + "default": false + }, + "min_tokens": { + "type": "integer", + "title": "Min Tokens", + "default": 0 + }, + "skip_special_tokens": { + "type": "boolean", + "title": "Skip Special Tokens", + "default": true + }, + "spaces_between_special_tokens": { + "type": "boolean", + "title": "Spaces Between Special Tokens", + "default": true + }, + "truncate_prompt_tokens": { + "anyOf": [ + { + "type": "integer", + "minimum": 1.0 + }, + { + "type": "null" + } + ], + "title": "Truncate Prompt Tokens" + }, + "prompt_logprobs": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Prompt Logprobs" + }, + "echo": { + "type": "boolean", + "title": "Echo", + "description": "If true, the new message will be prepended with the last message if they belong to the same role.", + "default": false + }, + "add_generation_prompt": { + "type": "boolean", + "title": "Add Generation Prompt", + "description": "If true, the generation prompt will be added to the chat template. 
This is a parameter used by chat template in tokenizer config of the model.", + "default": true + }, + "continue_final_message": { + "type": "boolean", + "title": "Continue Final Message", + "description": "If this is set, the chat will be formatted so that the final message in the chat is open-ended, without any EOS tokens. The model will continue this message rather than starting a new one. This allows you to \"prefill\" part of the model's response for it. Cannot be used at the same time as `add_generation_prompt`.", + "default": false + }, + "add_special_tokens": { + "type": "boolean", + "title": "Add Special Tokens", + "description": "If true, special tokens (e.g. BOS) will be added to the prompt on top of what is added by the chat template. For most models, the chat template takes care of adding the special tokens so this should be set to false (as is the default).", + "default": false + }, + "documents": { + "anyOf": [ + { + "items": { + "additionalProperties": { + "type": "string" + }, + "type": "object" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Documents", + "description": "A list of dicts representing documents that will be accessible to the model if it is performing RAG (retrieval-augmented generation). If the template does not support RAG, this argument will have no effect. We recommend that each document should be a dict containing \"title\" and \"text\" keys." + }, + "chat_template": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Chat Template", + "description": "A Jinja template to use for this conversion. As of transformers v4.44, default chat template is no longer allowed, so you must provide a chat template if the tokenizer does not define one." + }, + "chat_template_kwargs": { + "anyOf": [ + { + "type": "object" + }, + { + "type": "null" + } + ], + "title": "Chat Template Kwargs", + "description": "Additional kwargs to pass to the template renderer. Will be accessible by the chat template." + }, + "guided_json": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object" + }, + { + "$ref": "#/components/schemas/BaseModel" + }, + { + "type": "null" + } + ], + "title": "Guided Json", + "description": "If specified, the output will follow the JSON schema." + }, + "guided_regex": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Guided Regex", + "description": "If specified, the output will follow the regex pattern." + }, + "guided_choice": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Guided Choice", + "description": "If specified, the output will be exactly one of the choices." + }, + "guided_grammar": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Guided Grammar", + "description": "If specified, the output will follow the context free grammar." + }, + "guided_decoding_backend": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Guided Decoding Backend", + "description": "If specified, will override the default guided decoding backend of the server for this specific request. If set, must be either 'outlines' / 'lm-format-enforcer'" + }, + "guided_whitespace_pattern": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Guided Whitespace Pattern", + "description": "If specified, will override the default whitespace pattern for guided json decoding." 
+ }, + "priority": { + "type": "integer", + "title": "Priority", + "description": "The priority of the request (lower means earlier handling; default: 0). Any priority other than 0 will raise an error if the served model does not use priority scheduling.", + "default": 0 + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "messages", + "model" + ], + "title": "ChatCompletionRequest" + }, + "ChatCompletionSystemMessageParam": { + "properties": { + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "$ref": "#/components/schemas/ChatCompletionContentPartTextParam" + }, + "type": "array" + } + ], + "title": "Content" + }, + "role": { + "type": "string", + "enum": [ + "system" + ], + "const": "system", + "title": "Role" + }, + "name": { + "type": "string", + "title": "Name" + } + }, + "type": "object", + "required": [ + "content", + "role" + ], + "title": "ChatCompletionSystemMessageParam" + }, + "ChatCompletionToolMessageParam": { + "properties": { + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "$ref": "#/components/schemas/ChatCompletionContentPartTextParam" + }, + "type": "array" + } + ], + "title": "Content" + }, + "role": { + "type": "string", + "enum": [ + "tool" + ], + "const": "tool", + "title": "Role" + }, + "tool_call_id": { + "type": "string", + "title": "Tool Call Id" + } + }, + "type": "object", + "required": [ + "content", + "role", + "tool_call_id" + ], + "title": "ChatCompletionToolMessageParam" + }, + "ChatCompletionToolsParam": { + "properties": { + "type": { + "type": "string", + "enum": [ + "function" + ], + "const": "function", + "title": "Type", + "default": "function" + }, + "function": { + "$ref": "#/components/schemas/FunctionDefinition" + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "function" + ], + "title": "ChatCompletionToolsParam" + }, + "ChatCompletionUserMessageParam": { + "properties": { + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "anyOf": [ + { + "$ref": "#/components/schemas/ChatCompletionContentPartTextParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionContentPartImageParam" + } + ] + }, + "type": "array" + } + ], + "title": "Content" + }, + "role": { + "type": "string", + "enum": [ + "user" + ], + "const": "user", + "title": "Role" + }, + "name": { + "type": "string", + "title": "Name" + } + }, + "type": "object", + "required": [ + "content", + "role" + ], + "title": "ChatCompletionUserMessageParam" + }, + "CompletionRequest": { + "properties": { + "model": { + "type": "string", + "title": "Model" + }, + "prompt": { + "anyOf": [ + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "items": { + "items": { + "type": "integer" + }, + "type": "array" + }, + "type": "array" + }, + { + "type": "string" + }, + { + "items": { + "type": "string" + }, + "type": "array" + } + ], + "title": "Prompt" + }, + "best_of": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Best Of" + }, + "echo": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "title": "Echo", + "default": false + }, + "frequency_penalty": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Frequency Penalty", + "default": 0.0 + }, + "logit_bias": { + "anyOf": [ + { + "additionalProperties": { + "type": "number" + }, + "type": "object" + }, + { + "type": "null" + } + ], + "title": "Logit Bias" + }, + "logprobs": { + "anyOf": [ + { + "type": 
"integer" + }, + { + "type": "null" + } + ], + "title": "Logprobs" + }, + "max_tokens": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Max Tokens", + "default": 16 + }, + "n": { + "type": "integer", + "title": "N", + "default": 1 + }, + "presence_penalty": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Presence Penalty", + "default": 0.0 + }, + "seed": { + "anyOf": [ + { + "type": "integer", + "maximum": 9.223372036854776e+18, + "minimum": -9.223372036854776e+18 + }, + { + "type": "null" + } + ], + "title": "Seed" + }, + "stop": { + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Stop" + }, + "stream": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "title": "Stream", + "default": false + }, + "stream_options": { + "anyOf": [ + { + "$ref": "#/components/schemas/StreamOptions" + }, + { + "type": "null" + } + ] + }, + "suffix": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Suffix" + }, + "temperature": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Temperature", + "default": 1.0 + }, + "top_p": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "Top P", + "default": 1.0 + }, + "user": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "User" + }, + "use_beam_search": { + "type": "boolean", + "title": "Use Beam Search", + "default": false + }, + "top_k": { + "type": "integer", + "title": "Top K", + "default": -1 + }, + "min_p": { + "type": "number", + "title": "Min P", + "default": 0.0 + }, + "repetition_penalty": { + "type": "number", + "title": "Repetition Penalty", + "default": 1.0 + }, + "length_penalty": { + "type": "number", + "title": "Length Penalty", + "default": 1.0 + }, + "stop_token_ids": { + "anyOf": [ + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Stop Token Ids" + }, + "include_stop_str_in_output": { + "type": "boolean", + "title": "Include Stop Str In Output", + "default": false + }, + "ignore_eos": { + "type": "boolean", + "title": "Ignore Eos", + "default": false + }, + "min_tokens": { + "type": "integer", + "title": "Min Tokens", + "default": 0 + }, + "skip_special_tokens": { + "type": "boolean", + "title": "Skip Special Tokens", + "default": true + }, + "spaces_between_special_tokens": { + "type": "boolean", + "title": "Spaces Between Special Tokens", + "default": true + }, + "truncate_prompt_tokens": { + "anyOf": [ + { + "type": "integer", + "minimum": 1.0 + }, + { + "type": "null" + } + ], + "title": "Truncate Prompt Tokens" + }, + "allowed_token_ids": { + "anyOf": [ + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Allowed Token Ids" + }, + "prompt_logprobs": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Prompt Logprobs" + }, + "add_special_tokens": { + "type": "boolean", + "title": "Add Special Tokens", + "description": "If true (the default), special tokens (e.g. 
BOS) will be added to the prompt.", "default": true }, - "early_stopping": { - "type": "boolean", - "title": "Early Stopping", - "default": false + "response_format": { + "anyOf": [ + { + "$ref": "#/components/schemas/ResponseFormat" + }, + { + "type": "null" + } + ], + "description": "Similar to chat completion, this parameter specifies the format of output. Only {'type': 'json_object'} or {'type': 'text' } is supported." + }, + "guided_json": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object" + }, + { + "$ref": "#/components/schemas/BaseModel" + }, + { + "type": "null" + } + ], + "title": "Guided Json", + "description": "If specified, the output will follow the JSON schema." + }, + "guided_regex": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Guided Regex", + "description": "If specified, the output will follow the regex pattern." + }, + "guided_choice": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Guided Choice", + "description": "If specified, the output will be exactly one of the choices." + }, + "guided_grammar": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Guided Grammar", + "description": "If specified, the output will follow the context free grammar." + }, + "guided_decoding_backend": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Guided Decoding Backend", + "description": "If specified, will override the default guided decoding backend of the server for this specific request. If set, must be one of 'outlines' / 'lm-format-enforcer'" + }, + "guided_whitespace_pattern": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Guided Whitespace Pattern", + "description": "If specified, will override the default whitespace pattern for guided json decoding." + }, + "priority": { + "type": "integer", + "title": "Priority", + "description": "The priority of the request (lower means earlier handling; default: 0). 
Any priority other than 0 will raise an error if the served model does not use priority scheduling.", + "default": 0 + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "model", + "prompt" + ], + "title": "CompletionRequest" + }, + "CustomChatCompletionContentPartParam": { + "properties": { + "type": { + "type": "string", + "title": "Type" + } + }, + "additionalProperties": true, + "type": "object", + "required": [ + "type" + ], + "title": "CustomChatCompletionContentPartParam" + }, + "CustomChatCompletionMessageParam": { + "properties": { + "role": { + "type": "string", + "title": "Role" + }, + "content": { + "anyOf": [ + { + "type": "string" + }, + { + "items": { + "anyOf": [ + { + "$ref": "#/components/schemas/ChatCompletionContentPartTextParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionContentPartImageParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionContentPartAudioParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionContentPartRefusalParam" + }, + { + "$ref": "#/components/schemas/CustomChatCompletionContentPartParam" + } + ] + }, + "type": "array" + } + ], + "title": "Content" }, - "num_beams": { - "type": "integer", - "title": "Num Beams", - "default": 1 + "name": { + "type": "string", + "title": "Name" }, - "temperature": { - "type": "number", - "title": "Temperature", - "default": 1 + "tool_call_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Tool Call Id" }, - "top_k": { - "type": "integer", - "title": "Top K", - "default": 10 + "tool_calls": { + "anyOf": [ + { + "items": { + "$ref": "#/components/schemas/ChatCompletionMessageToolCallParam" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Tool Calls" + } + }, + "type": "object", + "required": [ + "role" + ], + "title": "CustomChatCompletionMessageParam", + "description": "Enables custom roles in the Chat Completion API." 
+ }, + "DetokenizeRequest": { + "properties": { + "model": { + "type": "string", + "title": "Model" }, - "top_p": { - "type": "number", - "title": "Top P", - "default": 1 + "tokens": { + "items": { + "type": "integer" + }, + "type": "array", + "title": "Tokens" + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "model", + "tokens" + ], + "title": "DetokenizeRequest" + }, + "EmbeddingRequest": { + "properties": { + "model": { + "type": "string", + "title": "Model" }, - "typical_p": { - "type": "number", - "title": "Typical P", - "default": 1 + "input": { + "anyOf": [ + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "items": { + "items": { + "type": "integer" + }, + "type": "array" + }, + "type": "array" + }, + { + "type": "string" + }, + { + "items": { + "type": "string" + }, + "type": "array" + } + ], + "title": "Input" }, - "repetition_penalty": { - "type": "number", - "title": "Repetition Penalty", - "default": 1 + "encoding_format": { + "type": "string", + "enum": [ + "float", + "base64" + ], + "title": "Encoding Format", + "default": "float" }, - "pad_token_id": { - "type": "integer", - "title": "Pad Token Id" + "dimensions": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Dimensions" + }, + "user": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "User" + }, + "truncate_prompt_tokens": { + "anyOf": [ + { + "type": "integer", + "minimum": 1.0 + }, + { + "type": "null" + } + ], + "title": "Truncate Prompt Tokens" + }, + "additional_data": { + "anyOf": [ + {}, + { + "type": "null" + } + ], + "title": "Additional Data" }, - "eos_token_id": { + "priority": { "type": "integer", - "title": "Eos Token Id", - "default": 11 + "title": "Priority", + "description": "The priority of the request (lower means earlier handling; default: 0). 
Any priority other than 0 will raise an error if the served model does not use priority scheduling.", + "default": 0 } }, + "additionalProperties": false, "type": "object", - "title": "GenerateKwargs", - "example": { - "max_length": 200, - "temperature": 0.7, - "top_p": 0.9, - "additional_param": "Example value" - } + "required": [ + "model", + "input" + ], + "title": "EmbeddingRequest" + }, + "Function": { + "properties": { + "arguments": { + "type": "string", + "title": "Arguments" + }, + "name": { + "type": "string", + "title": "Name" + } + }, + "type": "object", + "required": [ + "arguments", + "name" + ], + "title": "Function" + }, + "FunctionCall": { + "properties": { + "arguments": { + "type": "string", + "title": "Arguments" + }, + "name": { + "type": "string", + "title": "Name" + } + }, + "type": "object", + "required": [ + "arguments", + "name" + ], + "title": "FunctionCall" + }, + "FunctionDefinition": { + "properties": { + "name": { + "type": "string", + "title": "Name" + }, + "description": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Description" + }, + "parameters": { + "anyOf": [ + { + "type": "object" + }, + { + "type": "null" + } + ], + "title": "Parameters" + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "name" + ], + "title": "FunctionDefinition" }, "HTTPValidationError": { "properties": { @@ -445,121 +1965,214 @@ ], "title": "HealthStatus" }, - "HomeResponse": { + "ImageURL": { "properties": { - "message": { + "url": { + "type": "string", + "title": "Url" + }, + "detail": { "type": "string", - "title": "Message", - "example": "Server is running" + "enum": [ + "auto", + "low", + "high" + ], + "title": "Detail" } }, "type": "object", "required": [ - "message" + "url" ], - "title": "HomeResponse" + "title": "ImageURL" }, - "MemoryInfo": { + "JsonSchemaResponseFormat": { "properties": { - "used": { + "name": { "type": "string", - "title": "Used" + "title": "Name" }, - "total": { - "type": "string", - "title": "Total" + "description": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Description" + }, + "schema": { + "anyOf": [ + { + "type": "object" + }, + { + "type": "null" + } + ], + "title": "Schema" + }, + "strict": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "title": "Strict" } }, + "additionalProperties": false, "type": "object", "required": [ - "used", - "total" + "name" ], - "title": "MemoryInfo" + "title": "JsonSchemaResponseFormat" }, - "Message": { + "ResponseFormat": { "properties": { - "role": { + "type": { "type": "string", - "title": "Role" + "enum": [ + "text", + "json_object", + "json_schema" + ], + "title": "Type" }, - "content": { - "type": "string", - "title": "Content" + "json_schema": { + "anyOf": [ + { + "$ref": "#/components/schemas/JsonSchemaResponseFormat" + }, + { + "type": "null" + } + ] } }, + "additionalProperties": false, "type": "object", "required": [ - "role", - "content" + "type" ], - "title": "Message" + "title": "ResponseFormat" }, - "MetricsResponse": { + "StreamOptions": { "properties": { - "gpu_info": { - "items": { - "$ref": "#/components/schemas/GPUInfo" - }, - "type": "array", - "title": "Gpu Info" + "include_usage": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "title": "Include Usage", + "default": true }, - "cpu_info": { - "$ref": "#/components/schemas/CPUInfo" + "continuous_usage_stats": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], 
+ "title": "Continuous Usage Stats", + "default": true } }, + "additionalProperties": false, "type": "object", - "title": "MetricsResponse" + "title": "StreamOptions" }, - "UnifiedRequestModel": { + "TokenizeChatRequest": { "properties": { - "prompt": { + "model": { "type": "string", - "title": "Prompt", - "description": "Prompt for text generation. Required for text-generation pipeline. Do not use with 'messages'." + "title": "Model" + }, + "messages": { + "items": { + "anyOf": [ + { + "$ref": "#/components/schemas/ChatCompletionSystemMessageParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionUserMessageParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionAssistantMessageParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionToolMessageParam" + }, + { + "$ref": "#/components/schemas/ChatCompletionFunctionMessageParam" + }, + { + "$ref": "#/components/schemas/CustomChatCompletionMessageParam" + } + ] + }, + "type": "array", + "title": "Messages" }, - "return_full_text": { + "add_generation_prompt": { "type": "boolean", - "title": "Return Full Text", - "description": "Return full text if True, else only added text", + "title": "Add Generation Prompt", "default": true }, - "clean_up_tokenization_spaces": { + "continue_final_message": { "type": "boolean", - "title": "Clean Up Tokenization Spaces", - "description": "Clean up extra spaces in text output", + "title": "Continue Final Message", "default": false }, - "prefix": { + "add_special_tokens": { + "type": "boolean", + "title": "Add Special Tokens", + "default": false + } + }, + "additionalProperties": false, + "type": "object", + "required": [ + "model", + "messages" + ], + "title": "TokenizeChatRequest" + }, + "TokenizeCompletionRequest": { + "properties": { + "model": { "type": "string", - "title": "Prefix", - "description": "Prefix added to prompt" + "title": "Model" }, - "handle_long_generation": { + "prompt": { "type": "string", - "title": "Handle Long Generation", - "description": "Strategy to handle long generation" - }, - "generate_kwargs": { - "allOf": [ - { - "$ref": "#/components/schemas/GenerateKwargs" - } - ], - "title": "Generate Kwargs", - "description": "Additional kwargs for generate method" + "title": "Prompt" }, - "messages": { - "items": { - "$ref": "#/components/schemas/Message" - }, - "type": "array", - "title": "Messages", - "description": "Messages for conversational model. Required for conversational pipeline. Do not use with 'prompt'." 
+ "add_special_tokens": { + "type": "boolean", + "title": "Add Special Tokens", + "default": true } }, + "additionalProperties": false, "type": "object", - "title": "UnifiedRequestModel" + "required": [ + "model", + "prompt" + ], + "title": "TokenizeCompletionRequest" }, "ValidationError": { "properties": { diff --git a/presets/inference/text-generation/inference_api.py b/presets/inference/text-generation/inference_api.py index b9381e220..fa04c5d0d 100644 --- a/presets/inference/text-generation/inference_api.py +++ b/presets/inference/text-generation/inference_api.py @@ -461,7 +461,7 @@ def get_metrics(): if torch.cuda.is_available(): gpus = GPUtil.getGPUs() gpu_info = [GPUInfo( - id=gpu.id, + id=str(gpu.id), name=gpu.name, load=f"{gpu.load * 100:.2f}%", temperature=f"{gpu.temperature} C", diff --git a/presets/inference/text-generation/inference_api_vllm.py b/presets/inference/text-generation/inference_api_vllm.py new file mode 100644 index 000000000..865bb82d7 --- /dev/null +++ b/presets/inference/text-generation/inference_api_vllm.py @@ -0,0 +1,98 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. +import logging +import os +import asyncio + +import uvloop +from pydantic import BaseModel, Field +from fastapi import Request, HTTPException + +from vllm.utils import FlexibleArgumentParser +import vllm.entrypoints.openai.api_server as api_server + +# Initialize logger +logger = logging.getLogger(__name__) +debug_mode = os.environ.get('DEBUG_MODE', 'false').lower() == 'true' +logging.basicConfig(level=logging.DEBUG if debug_mode else logging.INFO) + +class HealthStatus(BaseModel): + status: str = Field(..., example="Healthy") + +@api_server.router.get("/healthz", + response_model=HealthStatus, + summary="Health Check Endpoint", + responses={ + 200: { + "description": "Successful Response", + "content": { + "application/json": { + "example": { + "status": "Healthy" + } + } + } + }, + 500: { + "description": "Error Response", + "content": { + "application/json": { + "examples": { + "model_uninitialized": { + "summary": + "Model not initialized", + "value": { + "detail": + "Model not initialized" + } + } + } + } + } + } + }) +async def health_check(raw_request: Request) -> HealthStatus: + """Health check.""" + try: + await asyncio.wait_for(api_server.engine_client(raw_request).check_health(), timeout=1.0) + return {"status": "Healthy"} + except asyncio.TimeoutError: + raise HTTPException(status_code=500, detail="Model not initialized") + + +def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: + local_rank = int(os.environ.get("LOCAL_RANK", + 0)) # Default to 0 if not set + port = 5000 + local_rank # Adjust port based on local rank + + server_default_args = { + "disable-frontend-multiprocessing": False, + "port": port + } + parser.set_defaults(**server_default_args) + + # See https://docs.vllm.ai/en/latest/models/engine_args.html for more args + engine_default_args = { + "model": "/workspace/tfs/weights", + "dtype": "float16", + "cpu-offload-gb": 0, + "gpu-memory-utilization": 0.9, + "swap-space": 4, + "disable_log_stats": False, + } + parser.set_defaults(**engine_default_args) + + return parser + + +if __name__ == "__main__": + parser = FlexibleArgumentParser(description='vLLM serving server') + parser = api_server.make_arg_parser(parser) + parser = make_arg_parser(parser) + args = parser.parse_args() + + # Run the serving server + logger.info(f"Starting server on port {args.port}") + # See 
https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html for more + # details about serving server + uvloop.run(api_server.run_server(args)) diff --git a/presets/inference/text-generation/requirements.txt b/presets/inference/text-generation/requirements.txt index 2530fed6a..0e5f6e4d8 100644 --- a/presets/inference/text-generation/requirements.txt +++ b/presets/inference/text-generation/requirements.txt @@ -2,6 +2,7 @@ # Core Dependencies transformers==4.41.2 +vllm==0.6.3 torch==2.2.0 accelerate==0.30.1 fastapi>=0.111.0,<0.112.0 # Allow patch updates
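Example usage (not part of the diff): a minimal smoke test of the new OpenAI-compatible endpoints declared in api_spec.json. It assumes the server from inference_api_vllm.py is reachable on localhost:5000 (the default port for LOCAL_RANK=0) and that the `requests` package is installed; the served model id is discovered via /v1/models rather than hard-coded.

    # smoke_test.py -- illustrative sketch only
    import requests

    BASE_URL = "http://localhost:5000"

    # Custom /healthz route added in inference_api_vllm.py
    print(requests.get(f"{BASE_URL}/healthz").json())  # expected: {"status": "Healthy"}

    # Ask the server which model it serves; by default vLLM reports the
    # model path ("/workspace/tfs/weights") as the model id.
    model_id = requests.get(f"{BASE_URL}/v1/models").json()["data"][0]["id"]

    # ChatCompletionRequest requires "model" and "messages" (see api_spec.json)
    resp = requests.post(
        f"{BASE_URL}/v1/chat/completions",
        json={
            "model": model_id,
            "messages": [{"role": "user", "content": "Tell me a joke"}],
            "max_tokens": 64,
        },
    )
    print(resp.json()["choices"][0]["message"]["content"])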