syntax = "proto3";

package llm_rpc.api;

message GenerateRequest {
  // API key used to authenticate the request.
  string api_key = 1;
  // The prompts to complete, one entry per prompt in the batch.
  repeated string prompts = 2;
  // Optional stop sequences; generation halts when one of them is produced.
  repeated string stop = 3;
}

message GenerateReply {
  message Generation {
    // The generated text.
    string text = 1;
    // JSON object with additional information about the generation.
    string generation_info = 2;
  }
  message GenerationList {
    // The candidate generations for a single prompt.
    repeated Generation generations = 1;
  }
  // One GenerationList per entry in GenerateRequest.prompts.
  repeated GenerationList generations = 1;
}
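
// For a multi-prompt request, the reply is indexed per prompt. Illustrative
// field paths, where `reply` is a GenerateReply for a two-prompt request:
//
//   reply.generations[0].generations[0].text  # first completion for prompts[0]
//   reply.generations[1].generations[0].text  # first completion for prompts[1]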

message LLMTypeRequest {
  string api_key = 1;
}

message LLMTypeReply {
  string llm_type = 1;
}

message GenerationalGutsRequest {
  string api_key = 1;
  // The prompt to trace through the model.
  string prompt = 2;
  // Performs an FFT of the token embeddings, picking out their high-frequency
  // components.
  //
  // Leave this enabled (set to true) unless you have a good reason to turn it
  // off.
  bool fft_embeddings = 3;
  // Truncates the token embeddings to their leading components, so only the
  // parts that carry the most "information" are returned.
  //
  // A good default is 10-30.
  int32 embedding_truncation = 4;
  // Controls how many candidate next tokens are returned.
  //
  // A good default is 3-5.
  int32 top_k_logits = 5;
}
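
// A minimal request sketch (hypothetical client code, assuming Python stubs
// generated by grpcio-tools as llm_rpc_pb2):
//
//   request = llm_rpc_pb2.GenerationalGutsRequest(
//       api_key=API_KEY,
//       prompt="The quick brown fox",
//       fft_embeddings=True,      # recommended default
//       embedding_truncation=20,  # within the suggested 10-30 range
//       top_k_logits=5,           # within the suggested 3-5 range
//   )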

message GenerationalGutsReply {
  // A token stack is a trace of the LLM's internal state as it processes one
  // token of the prompt. The prompt string is tokenized: it is split into
  // chunks, and each chunk is converted into a lookup index (the token id).
  // The lookup index is used to look up the token embedding for the token,
  // which is then combined with the positional embedding to form the model's
  // input embedding. That input embedding is passed through the LLM to
  // produce the hidden state, and the hidden state is used to generate the
  // next token. This stack does not include the candidate next tokens; those
  // are returned separately in `generations` below.
  message TokenStack {
    // The token string. This is the original token string, not the token id.
    // Example: "the"
    string token = 1;
    // The token id. This is the lookup index for the token in the embedding
    // table. Example: 464 is the lookup index for "the" in the GPT-2
    // embedding.
    int32 token_id = 2;
    // The positional embedding for the token. This encodes which position the
    // token occupies in the sequence.
    repeated float positional_embedding = 3;
    // The token embedding for the token itself, which carries its semantic
    // meaning: its position in the high-dimensional embedding space encodes
    // the "meaning" of the token for the LLM.
    repeated float token_embedding = 4;
    // The hidden state for the token. This is the output of the LLM after
    // processing the token, also called the "context embedding".
    repeated float hidden_state = 5;
  }

  // Each generation is a possible next token and its score. The candidates
  // are determined by a map from the final hidden state to the "token space":
  // the space of all possible tokens that the LLM can generate.
  //
  // We return the top_k_logits tokens (ranked by probability) and their
  // logits.
  message Generation {
    // The token string.
    string token = 1;
    // The token id.
    int32 id = 2;
    // The logit (unnormalized log-probability) for this token.
    float logit = 3;
  }

  // One TokenStack per token of the tokenized prompt.
  repeated TokenStack tokens = 1;
  // The top_k_logits candidate next tokens, after processing the full prompt.
  repeated Generation generations = 2;
}
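
// Conceptually, each TokenStack records one step of a GPT-2-style forward
// pass. A rough NumPy-flavored sketch (names like wte, wpe, and transformer
// are illustrative stand-ins, not this server's actual code):
//
//   tok_emb = wte[token_id]                       # token_embedding
//   pos_emb = wpe[position]                       # positional_embedding
//   hidden  = transformer(tok_emb + pos_emb)      # hidden_state / context embedding
//   logits  = hidden @ wte.T                      # map hidden state into token space
//   top_ids = np.argsort(logits)[-top_k_logits:]  # the returned Generations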

service RemoteLLM {
  rpc Generate(GenerateRequest) returns (GenerateReply) {}
  rpc GetLlmType(LLMTypeRequest) returns (LLMTypeReply) {}
  rpc GenerationalGuts(GenerationalGutsRequest) returns (GenerationalGutsReply) {}
}
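
// Example client (hypothetical; assumes stubs generated with
//   python -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. llm_rpc.proto
// and a server listening on localhost:50051):
//
//   import grpc
//   import llm_rpc_pb2
//   import llm_rpc_pb2_grpc
//
//   with grpc.insecure_channel("localhost:50051") as channel:
//       stub = llm_rpc_pb2_grpc.RemoteLLMStub(channel)
//       reply = stub.Generate(llm_rpc_pb2.GenerateRequest(
//           api_key="YOUR_KEY", prompts=["Hello, world"]))
//       for generation_list in reply.generations:
//           for generation in generation_list.generations:
//               print(generation.text)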