
Initial gRPC server and TGIS proto API mapping layer
Signed-off-by: Joe Runde <[email protected]>
njhill authored and joerunde committed Mar 20, 2024
1 parent ba8ae1d commit e3180c6
Showing 9 changed files with 929 additions and 3 deletions.
12 changes: 12 additions & 0 deletions Makefile
@@ -0,0 +1,12 @@

target_path := "vllm/entrypoints/grpc/pb"

gen-protos:
	# Compile protos
	pip install grpcio-tools==1.60.1 mypy-protobuf==3.5.0 'types-protobuf>=3.20.4' --no-cache-dir
	mkdir $(target_path) || true
	python -m grpc_tools.protoc -Iproto --python_out=$(target_path) \
		--grpc_python_out=$(target_path) --mypy_out=$(target_path) proto/generation.proto
	# Rewrite top-level `import ..._pb2` lines in the generated modules into
	# relative imports so the stubs can be used as a package
	find $(target_path)/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \;
	# Mark the output directory as an importable Python package
	touch $(target_path)/__init__.py
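After running make gen-protos, the compiled stubs (generation_pb2.py, generation_pb2_grpc.py, and the mypy .pyi stubs) land in vllm/entrypoints/grpc/pb as an importable package. A minimal sanity check, assuming that target path:

    # Quick check that the generated stubs import cleanly (run after `make gen-protos`)
    from vllm.entrypoints.grpc.pb import generation_pb2, generation_pb2_grpc

    print(generation_pb2.DESCRIPTOR.package)  # -> "fmaas"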

235 changes: 235 additions & 0 deletions proto/generation.proto
@@ -0,0 +1,235 @@
/*
Internal service interface for FMaaS completions
*/

syntax = "proto3";
package fmaas;


service GenerationService {
  // Generates text given a text prompt, for one or more inputs
  rpc Generate (BatchedGenerationRequest) returns (BatchedGenerationResponse) {}
  // Generates text given a single input prompt, streaming the response
  rpc GenerateStream (SingleGenerationRequest) returns (stream GenerationResponse) {}
  // Tokenize text
  rpc Tokenize (BatchedTokenizeRequest) returns (BatchedTokenizeResponse) {}
  // Model info
  rpc ModelInfo (ModelInfoRequest) returns (ModelInfoResponse) {}
}
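For illustration (not part of this commit), a Python client for this service could look like the sketch below. The stub module names follow the protoc output produced by the Makefile above; the address, port, and model id are placeholders.

    import grpc
    from vllm.entrypoints.grpc.pb import generation_pb2 as pb
    from vllm.entrypoints.grpc.pb import generation_pb2_grpc

    # Placeholder address; the actual port depends on server configuration.
    channel = grpc.insecure_channel("localhost:8033")
    stub = generation_pb2_grpc.GenerationServiceStub(channel)

    # Unary batched generation: one request message carries several prompts.
    response = stub.Generate(pb.BatchedGenerationRequest(
        model_id="my-model",  # placeholder model id
        requests=[pb.GenerationRequest(text="Hello, world")],
    ))
    for resp in response.responses:
        print(resp.text, resp.stop_reason)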

// ============================================================================================================
// Generation API

enum DecodingMethod {
  GREEDY = 0;
  SAMPLE = 1;
}

message BatchedGenerationRequest {
  string model_id = 1;
  optional string prefix_id = 2;
  repeated GenerationRequest requests = 3;

  Parameters params = 10;
}

message SingleGenerationRequest {
  string model_id = 1;
  optional string prefix_id = 2;
  GenerationRequest request = 3;

  Parameters params = 10;
}

message BatchedGenerationResponse {
  repeated GenerationResponse responses = 1;
}

message GenerationRequest {
  string text = 2;
}

message GenerationResponse {
  uint32 input_token_count = 6;
  uint32 generated_token_count = 2;
  string text = 4;
  StopReason stop_reason = 7;
  // The stop sequence encountered, iff stop_reason == STOP_SEQUENCE
  string stop_sequence = 11;
  // Random seed used, not applicable for greedy requests
  uint64 seed = 10;

  // Individual generated tokens and associated details, if requested
  repeated TokenInfo tokens = 8;

  // Input tokens and associated details, if requested
  repeated TokenInfo input_tokens = 9;
}

message Parameters {
  // The high-level decoding approach
  DecodingMethod method = 1;
  // Parameters related to sampling, applicable only when method == SAMPLE
  SamplingParameters sampling = 2;
  // Parameters controlling when generation should stop
  StoppingCriteria stopping = 3;
  // Flags to control what is returned in the response
  ResponseOptions response = 4;
  // Parameters for conditionally penalizing/boosting
  // candidate tokens during decoding
  DecodingParameters decoding = 5;
  // Truncate to this many input tokens. Can be used to avoid requests
  // failing due to input being longer than configured limits.
  // Zero means don't truncate.
  uint32 truncate_input_tokens = 6;
}
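A sketch of how these parameters compose, with placeholder values (pb is the generated generation_pb2 module from the client sketch above):

    # Sampling-based decoding with stopping criteria and a repetition penalty.
    params = pb.Parameters(
        method=pb.DecodingMethod.SAMPLE,
        sampling=pb.SamplingParameters(temperature=0.8, top_p=0.95, seed=42),
        stopping=pb.StoppingCriteria(max_new_tokens=64, stop_sequences=["\n\n"]),
        decoding=pb.DecodingParameters(repetition_penalty=1.2),
        truncate_input_tokens=2048,  # truncate long inputs instead of failing
    )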

message DecodingParameters {
  message LengthPenalty {
    // Start the decay after this number of tokens have been generated
    uint32 start_index = 1;
    // Factor of exponential decay
    float decay_factor = 2;
  }

  // Default (0.0) means no penalty (equivalent to 1.0)
  // 1.2 is a recommended value
  float repetition_penalty = 1;

  // Exponentially increases the score of the EOS token
  // once start_index tokens have been generated
  optional LengthPenalty length_penalty = 2;
}


message SamplingParameters {
  // Default (0.0) means disabled (equivalent to 1.0)
  float temperature = 1;
  // Default (0) means disabled
  uint32 top_k = 2;
  // Default (0) means disabled (equivalent to 1.0)
  float top_p = 3;
  // Default (0) means disabled (equivalent to 1.0)
  float typical_p = 4;

  optional uint64 seed = 5;
}

message StoppingCriteria {
  // Default (0) currently means 20
  uint32 max_new_tokens = 1;
  // Default (0) means no minimum
  uint32 min_new_tokens = 2;
  // Default (0) means no time limit
  uint32 time_limit_millis = 3;
  repeated string stop_sequences = 4;
  // If not specified, default behavior depends on server setting
  optional bool include_stop_sequence = 5;

  // more to come
}

message ResponseOptions {
  // Include input text
  bool input_text = 1;
  // Include list of individual generated tokens
  // "Extra" token information is included based on the other flags below
  bool generated_tokens = 2;
  // Include list of input tokens
  // "Extra" token information is included based on the other flags here,
  // but only for decoder-only models
  bool input_tokens = 3;
  // Include logprob for each returned token
  // Applicable only if generated_tokens == true and/or input_tokens == true
  bool token_logprobs = 4;
  // Include rank of each returned token
  // Applicable only if generated_tokens == true and/or input_tokens == true
  bool token_ranks = 5;
  // Include top n candidate tokens at the position of each returned token
  // The maximum value permitted is 5, but more may be returned if there is a tie
  // for nth place.
  // Applicable only if generated_tokens == true and/or input_tokens == true
  uint32 top_n_tokens = 6;
}
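For example, a request that wants per-token details back could set (continuing the same sketch):

    # Ask for generated-token details: logprobs, ranks, and top-3 alternatives.
    params = pb.Parameters(response=pb.ResponseOptions(
        generated_tokens=True,
        token_logprobs=True,
        token_ranks=True,
        top_n_tokens=3,
    ))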

enum StopReason {
  // Possibly more tokens to be streamed
  NOT_FINISHED = 0;
  // Maximum requested tokens reached
  MAX_TOKENS = 1;
  // End-of-sequence token encountered
  EOS_TOKEN = 2;
  // Request cancelled by client
  CANCELLED = 3;
  // Time limit reached
  TIME_LIMIT = 4;
  // Stop sequence encountered
  STOP_SEQUENCE = 5;
  // Total token limit reached
  TOKEN_LIMIT = 6;
  // Decoding error
  ERROR = 7;
}
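In a streaming client, stop_reason tells the consumer when the stream is complete. A sketch reusing the placeholder stub from above:

    # Stream a single generation; NOT_FINISHED means more chunks may follow.
    stream_req = pb.SingleGenerationRequest(
        model_id="my-model",  # placeholder model id
        request=pb.GenerationRequest(text="Once upon a time"),
        params=pb.Parameters(stopping=pb.StoppingCriteria(max_new_tokens=40)),
    )
    for chunk in stub.GenerateStream(stream_req):
        print(chunk.text, end="", flush=True)
        if chunk.stop_reason != pb.StopReason.NOT_FINISHED:
            print("\n[stopped:", pb.StopReason.Name(chunk.stop_reason) + "]")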

message TokenInfo {
  // uint32 id = 1; // TBD
  string text = 2;
  // The logprob (log of normalized probability), if requested
  float logprob = 3;
  // One-based rank relative to other tokens, if requested
  uint32 rank = 4;

  message TopToken {
    // uint32 id = 1; // TBD
    string text = 2;
    float logprob = 3;
  }

  // Top N candidate tokens at this position, if requested
  // May or may not include this token
  repeated TopToken top_tokens = 5;
}
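Reading these details out of a response could look like this (sketch, continuing the client example; tokens are only populated when ResponseOptions.generated_tokens was set):

    # Print per-token details for the first response in the batch.
    for tok in response.responses[0].tokens:
        alternatives = [(t.text, t.logprob) for t in tok.top_tokens]
        print(tok.text, tok.logprob, tok.rank, alternatives)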


// ============================================================================================================
// Tokenization API

message BatchedTokenizeRequest {
  string model_id = 1;
  repeated TokenizeRequest requests = 2;
  bool return_tokens = 3; // TBD
}

message BatchedTokenizeResponse {
  repeated TokenizeResponse responses = 1;
}

message TokenizeRequest {
  string text = 1;
}

message TokenizeResponse {
  uint32 token_count = 1;
  repeated string tokens = 2; // populated iff return_tokens == true

  // We'll possibly add more later
}
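A tokenization call with the same sketched stub might be:

    # Count (and return) the tokens for a batch of texts.
    tok_resp = stub.Tokenize(pb.BatchedTokenizeRequest(
        model_id="my-model",  # placeholder model id
        requests=[pb.TokenizeRequest(text="Hello, world")],
        return_tokens=True,
    ))
    print(tok_resp.responses[0].token_count, list(tok_resp.responses[0].tokens))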


// ============================================================================================================
// Model Info API

message ModelInfoRequest {
  string model_id = 1;
}

message ModelInfoResponse {
  enum ModelKind {
    DECODER_ONLY = 0;
    ENCODER_DECODER = 1;
  }

  ModelKind model_kind = 1;
  uint32 max_sequence_length = 2;
  uint32 max_new_tokens = 3;
}
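Querying model metadata with the same sketched stub:

    # Fetch model limits, e.g. to size requests before sending them.
    info = stub.ModelInfo(pb.ModelInfoRequest(model_id="my-model"))  # placeholder id
    print(pb.ModelInfoResponse.ModelKind.Name(info.model_kind),
          info.max_sequence_length, info.max_new_tokens)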
