Commit: Initial gRPC server and TGIS proto API mapping layer

Signed-off-by: Joe Runde <[email protected]>
Showing 9 changed files with 929 additions and 3 deletions.
New file (+12 lines): a Makefile rule that compiles the protos into Python gRPC stubs.

target_path := "vllm/entrypoints/grpc/pb"

gen-protos:
	# Compile protos
	pip install grpcio-tools==1.60.1 mypy-protobuf==3.5.0 'types-protobuf>=3.20.4' --no-cache-dir
	mkdir $(target_path) || true
	python -m grpc_tools.protoc -Iproto --python_out=$(target_path) \
		--grpc_python_out=$(target_path) --mypy_out=$(target_path) proto/generation.proto
	find $(target_path)/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \;
	touch $(target_path)/__init__.py
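The sed step is needed because grpc_tools.protoc emits absolute imports between its own generated modules; rewriting them to relative imports lets the stubs live inside the vllm.entrypoints.grpc.pb package. A minimal sketch of the effect, assuming the generation_pb2 / generation_pb2_grpc module names that protoc derives from proto/generation.proto:

# In the generated generation_pb2_grpc.py, protoc writes:
#     import generation_pb2 as generation__pb2
# and the sed rule above rewrites it to a package-relative import:
#     from . import generation_pb2 as generation__pb2
# so application code can then import the stubs through the package:
from vllm.entrypoints.grpc.pb import generation_pb2, generation_pb2_grpc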
New file (+235 lines): proto/generation.proto, the TGIS generation API definition.

/*
Internal service interface for FMaaS completions
*/

syntax = "proto3";
package fmaas;


service GenerationService {
  // Generates text given a text prompt, for one or more inputs
  rpc Generate (BatchedGenerationRequest) returns (BatchedGenerationResponse) {}
  // Generates text given a single input prompt, streaming the response
  rpc GenerateStream (SingleGenerationRequest) returns (stream GenerationResponse) {}
  // Tokenize text
  rpc Tokenize (BatchedTokenizeRequest) returns (BatchedTokenizeResponse) {}
  // Model info
  rpc ModelInfo (ModelInfoRequest) returns (ModelInfoResponse) {}
}
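For illustration, a minimal Python client sketch against this service, assuming the stubs generated by the Makefile above; the address and model id are placeholders, not values defined by this commit:

import grpc

from vllm.entrypoints.grpc.pb import generation_pb2, generation_pb2_grpc

# Placeholder address; the actual host/port is deployment-specific.
channel = grpc.insecure_channel("localhost:8033")
stub = generation_pb2_grpc.GenerationServiceStub(channel)

response = stub.Generate(generation_pb2.BatchedGenerationRequest(
    model_id="my-model",  # placeholder model id
    requests=[generation_pb2.GenerationRequest(text="Hello")],
))
for r in response.responses:
    print(r.text)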

// ============================================================================================================
// Generation API

enum DecodingMethod {
  GREEDY = 0;
  SAMPLE = 1;
}

message BatchedGenerationRequest {
  string model_id = 1;
  optional string prefix_id = 2;
  repeated GenerationRequest requests = 3;

  Parameters params = 10;
}

message SingleGenerationRequest {
  string model_id = 1;
  optional string prefix_id = 2;
  GenerationRequest request = 3;

  Parameters params = 10;
}
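The single-request message pairs with the streaming GenerateStream RPC. A sketch, reusing the stub from the client example above:

request = generation_pb2.SingleGenerationRequest(
    model_id="my-model",  # placeholder model id
    request=generation_pb2.GenerationRequest(text="Hello"),
)
# One GenerationResponse arrives per chunk; intermediate chunks carry
# stop_reason == NOT_FINISHED until the final one.
for chunk in stub.GenerateStream(request):
    print(chunk.text, end="", flush=True)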

message BatchedGenerationResponse {
  repeated GenerationResponse responses = 1;
}

message GenerationRequest {
  string text = 2;
}

message GenerationResponse {
  uint32 input_token_count = 6;
  uint32 generated_token_count = 2;
  string text = 4;
  StopReason stop_reason = 7;
  // The stop sequence encountered, iff stop_reason == STOP_SEQUENCE
  string stop_sequence = 11;
  // Random seed used, not applicable for greedy requests
  uint64 seed = 10;

  // Individual generated tokens and associated details, if requested
  repeated TokenInfo tokens = 8;

  // Input tokens and associated details, if requested
  repeated TokenInfo input_tokens = 9;
}

message Parameters {
  // The high-level decoding approach
  DecodingMethod method = 1;
  // Parameters related to sampling, applicable only when method == SAMPLE
  SamplingParameters sampling = 2;
  // Parameters controlling when generation should stop
  StoppingCriteria stopping = 3;
  // Flags to control what is returned in the response
  ResponseOptions response = 4;
  // Parameters for conditionally penalizing/boosting
  // candidate tokens during decoding
  DecodingParameters decoding = 5;
  // Truncate to this many input tokens. Can be used to avoid requests
  // failing due to input being longer than configured limits.
  // Zero means don't truncate.
  uint32 truncate_input_tokens = 6;
}
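As a sketch, a sampled request with explicit stopping and response options could be assembled like this; the sub-messages are defined below, and all values are illustrative:

params = generation_pb2.Parameters(
    method=generation_pb2.SAMPLE,
    sampling=generation_pb2.SamplingParameters(temperature=0.8, top_p=0.95, seed=42),
    stopping=generation_pb2.StoppingCriteria(max_new_tokens=64, stop_sequences=["\n\n"]),
    response=generation_pb2.ResponseOptions(generated_tokens=True, token_logprobs=True),
    truncate_input_tokens=0,  # 0 = don't truncate
)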

message DecodingParameters {
  message LengthPenalty {
    // Start the decay after this number of tokens have been generated
    uint32 start_index = 1;
    // Factor of exponential decay
    float decay_factor = 2;
  }

  // Default (0.0) means no penalty (equivalent to 1.0)
  // 1.2 is a recommended value
  float repetition_penalty = 1;

  // Exponentially increases the score of the EOS token
  // once start_index tokens have been generated
  optional LengthPenalty length_penalty = 2;
}


message SamplingParameters {
  // Default (0.0) means disabled (equivalent to 1.0)
  float temperature = 1;
  // Default (0) means disabled
  uint32 top_k = 2;
  // Default (0) means disabled (equivalent to 1.0)
  float top_p = 3;
  // Default (0) means disabled (equivalent to 1.0)
  float typical_p = 4;

  optional uint64 seed = 5;
}

message StoppingCriteria {
  // Default (0) is currently 20
  uint32 max_new_tokens = 1;
  // Default (0) means no minimum
  uint32 min_new_tokens = 2;
  // Default (0) means no time limit
  uint32 time_limit_millis = 3;
  repeated string stop_sequences = 4;
  // If not specified, default behavior depends on server setting
  optional bool include_stop_sequence = 5;

  // more to come
}

message ResponseOptions {
  // Include input text
  bool input_text = 1;
  // Include list of individual generated tokens
  // "Extra" token information is included based on the other flags below
  bool generated_tokens = 2;
  // Include list of input tokens
  // "Extra" token information is included based on the other flags here,
  // but only for decoder-only models
  bool input_tokens = 3;
  // Include logprob for each returned token
  // Applicable only if generated_tokens == true and/or input_tokens == true
  bool token_logprobs = 4;
  // Include rank of each returned token
  // Applicable only if generated_tokens == true and/or input_tokens == true
  bool token_ranks = 5;
  // Include top n candidate tokens at the position of each returned token
  // The maximum value permitted is 5, but more may be returned if there is a tie
  // for nth place.
  // Applicable only if generated_tokens == true and/or input_tokens == true
  uint32 top_n_tokens = 6;
}

enum StopReason {
  // Possibly more tokens to be streamed
  NOT_FINISHED = 0;
  // Maximum requested tokens reached
  MAX_TOKENS = 1;
  // End-of-sequence token encountered
  EOS_TOKEN = 2;
  // Request cancelled by client
  CANCELLED = 3;
  // Time limit reached
  TIME_LIMIT = 4;
  // Stop sequence encountered
  STOP_SEQUENCE = 5;
  // Total token limit reached
  TOKEN_LIMIT = 6;
  // Decoding error
  ERROR = 7;
}
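On the client side, the numeric stop_reason can be mapped back to its symbolic name with the standard protobuf enum wrapper, e.g.:

# `response` is the BatchedGenerationResponse from the client sketch above.
reason = generation_pb2.StopReason.Name(response.responses[0].stop_reason)
if reason == "MAX_TOKENS":
    print("generation hit the requested token limit")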

message TokenInfo {
  // uint32 id = 1; // TBD
  string text = 2;
  // The logprob (log of normalized probability), if requested
  float logprob = 3;
  // One-based rank relative to other tokens, if requested
  uint32 rank = 4;

  message TopToken {
    // uint32 id = 1; // TBD
    string text = 2;
    float logprob = 3;
  }

  // Top N candidate tokens at this position, if requested
  // May or may not include this token
  repeated TopToken top_tokens = 5;
}
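Putting ResponseOptions and TokenInfo together: a request that asks for generated tokens, logprobs, ranks, and top candidates can read the per-token details back like this (sketch, reusing the stub from above):

resp = stub.Generate(generation_pb2.BatchedGenerationRequest(
    model_id="my-model",  # placeholder model id
    requests=[generation_pb2.GenerationRequest(text="Hello")],
    params=generation_pb2.Parameters(response=generation_pb2.ResponseOptions(
        generated_tokens=True, token_logprobs=True, token_ranks=True, top_n_tokens=3,
    )),
))
for tok in resp.responses[0].tokens:
    print(tok.text, tok.logprob, tok.rank, [t.text for t in tok.top_tokens])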

// ============================================================================================================
// Tokenization API

message BatchedTokenizeRequest {
  string model_id = 1;
  repeated TokenizeRequest requests = 2;
  bool return_tokens = 3; // TBD
}

message BatchedTokenizeResponse {
  repeated TokenizeResponse responses = 1;
}

message TokenizeRequest {
  string text = 1;
}

message TokenizeResponse {
  uint32 token_count = 1;
  repeated string tokens = 2; // if return_tokens = true

  // We'll possibly add more later
}
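A tokenization round trip is analogous; a sketch with the same assumed stub:

resp = stub.Tokenize(generation_pb2.BatchedTokenizeRequest(
    model_id="my-model",  # placeholder model id
    requests=[generation_pb2.TokenizeRequest(text="Hello world")],
    return_tokens=True,
))
print(resp.responses[0].token_count, list(resp.responses[0].tokens))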

// ============================================================================================================
// Model Info API

message ModelInfoRequest {
  string model_id = 1;
}

message ModelInfoResponse {
  enum ModelKind {
    DECODER_ONLY = 0;
    ENCODER_DECODER = 1;
  }

  ModelKind model_kind = 1;
  uint32 max_sequence_length = 2;
  uint32 max_new_tokens = 3;
}
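And a sketch querying model metadata, again with the assumed stub; the nested enum is resolved via the generated wrapper:

info = stub.ModelInfo(generation_pb2.ModelInfoRequest(model_id="my-model"))
print(generation_pb2.ModelInfoResponse.ModelKind.Name(info.model_kind),
      info.max_sequence_length, info.max_new_tokens)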