#!/usr/bin/env python3
# -*- mode: python; indent-tabs-mode: nil; python-indent-level: 4 -*-
# vim: autoindent tabstop=4 shiftwidth=4 expandtab softtabstop=4 filetype=python
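"""Post-process llm-load-test results into rickshaw bench-metric files.

Reads the output.json produced by llm-load-test and writes three artifacts:
  - post-process-data.json : the periods and where to find the metric data
  - metric-data-0.json     : a description of each metric (class, source, type)
  - metric-data-0.csv      : one "idx,start,end,value" row per metric
"""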
import sys
import json
from dataclasses import dataclass, is_dataclass, asdict
from enum import Enum
from typing import Dict, List, Any, Optional, Tuple
from functools import reduce
def to_dict(a: Any):
if hasattr(a, "to_dict"):
return a.to_dict()
elif is_dataclass(a):
return asdict(a)
else:
        raise TypeError(f"Cannot convert {a} to a dict")
def json_lookup(d: Dict[Any, Any], path: str, delim: str=".") -> Optional[Any]:
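    """Walk a nested dict by a delimited path and return the value, e.g.
    json_lookup({"summary": {"throughput": 1.5}}, "summary.throughput") -> 1.5.
    Returns None when any segment of the path is missing."""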
try:
segments = path.split(delim)
return reduce(lambda td, seg: td[seg], segments, d)
except KeyError:
return None
class MetricClass(Enum):
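    """Allowed values for the "class" field of a metric's Desc (see Desc.to_dict)."""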
COUNT = "count"
THROUGHPUT = "throughput"
@dataclass
class Desc:
metric_class: MetricClass
source: str
metric_type: str
def to_dict(self):
return {
"class": str(self.metric_class.value),
"source": self.source,
"type": self.metric_type,
}
@dataclass
class Metric:
idx: int
desc: Desc
names: Dict[str, str]
def to_dict(self):
return {
"desc": to_dict(self.desc),
"idx": self.idx,
"names": self.names,
}
@dataclass
class Period:
name: str
metric_files: List[str]
def to_dict(self):
return {
"name": self.name,
"metric-files": self.metric_files,
}
@dataclass
class PostProcess:
primary_period: str
primary_metric: str
benchmark: str
periods: List[Period]
rickshaw_bench_metric_version: str
def to_dict(self):
return {
"primary-period": self.primary_period,
"primary-metric": self.primary_metric,
"benchmark": self.benchmark,
"periods": [to_dict(period) for period in self.periods],
"rickshaw-bench-metric": {
"schema": {
"version": self.rickshaw_bench_metric_version
}
}
}
@dataclass
class Entry:
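    """One sample for metric-data-0.csv; str() renders it as "idx,start,end,value" (timestamps in ms)."""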
idx: int
start: int
end: int
value: float
def __str__(self):
return f"{int(self.idx)},{int(self.start)},{int(self.end)},{float(self.value)}"
def main():
    # In any benchmark post-process script, the metrics generated need to be attributed to a
    # time period (AKA benchmark phase). The period used to report an official
    # result for the benchmark is the 'measurement' period. Other periods that may exist
    # could be "warm-up", "prep", etc.
    # There are 3 important steps to the post-processing here...
    # 1. Generate the 'post-process-data.json' file, which describes where to find
    #    our processed data and includes some metadata
periods = [Period("measurement", ["metric-data-0"])]
    post_process_config = PostProcess(
        primary_period="measurement",
        primary_metric="throughput",
        benchmark="llm",
        periods=periods,
        rickshaw_bench_metric_version="2021.04.12",
    )
with open("post-process-data.json", "w") as f:
f.write(json.dumps(to_dict(post_process_config)))
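    # post-process-data.json should now contain roughly:
    # {"primary-period": "measurement", "primary-metric": "throughput", "benchmark": "llm",
    #  "periods": [{"name": "measurement", "metric-files": ["metric-data-0"]}],
    #  "rickshaw-bench-metric": {"schema": {"version": "2021.04.12"}}}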
    # 2. Generate a file that describes what our metrics look like (i.e., metric-data-0.json)
    #    Each entry below is a (Metric, string path to the value in the output JSON) tuple
metrics: List[Tuple[Metric, str]] = [
# summary
(Metric(0, Desc(MetricClass.THROUGHPUT, "llm", "throughput-full-duration"), {}), "summary.throughput_full_duration"),
(Metric(1, Desc(MetricClass.COUNT, "llm", "full-duration"), {}), "summary.full_duration"),
(Metric(2, Desc(MetricClass.THROUGHPUT, "llm", "throughput"), {}), "summary.throughput"),
(Metric(3, Desc(MetricClass.COUNT, "llm", "total-requests"), {}), "summary.total_requests"),
(Metric(4, Desc(MetricClass.COUNT, "llm", "req-completed-within-test-duration"), {}), "summary.req_completed_within_test_duration"),
(Metric(5, Desc(MetricClass.COUNT, "llm", "total-failures"), {}), "summary.total_failures"),
(Metric(6, Desc(MetricClass.COUNT, "llm", "failure-rate"), {}), "summary.failure_rate"),
# tpot
(Metric(7, Desc(MetricClass.COUNT, "llm", "tpot-min"), {}), "summary.tpot.min"),
(Metric(8, Desc(MetricClass.COUNT, "llm", "tpot-max"), {}), "summary.tpot.max"),
(Metric(9, Desc(MetricClass.COUNT, "llm", "tpot-median"), {}), "summary.tpot.median"),
(Metric(10, Desc(MetricClass.COUNT, "llm", "tpot-mean"), {}), "summary.tpot.mean"),
(Metric(11, Desc(MetricClass.COUNT, "llm", "tpot-percentile-80"), {}), "summary.tpot.percentile_80"),
(Metric(12, Desc(MetricClass.COUNT, "llm", "tpot-percentile-90"), {}), "summary.tpot.percentile_90"),
(Metric(13, Desc(MetricClass.COUNT, "llm", "tpot-percentile-95"), {}), "summary.tpot.percentile_95"),
(Metric(14, Desc(MetricClass.COUNT, "llm", "tpot-percentile-99"), {}), "summary.tpot.percentile_99"),
# ttft
(Metric(15, Desc(MetricClass.COUNT, "llm", "ttft-min"), {}), "summary.ttft.min"),
(Metric(16, Desc(MetricClass.COUNT, "llm", "ttft-max"), {}), "summary.ttft.max"),
(Metric(17, Desc(MetricClass.COUNT, "llm", "ttft-median"), {}), "summary.ttft.median"),
(Metric(18, Desc(MetricClass.COUNT, "llm", "ttft-mean"), {}), "summary.ttft.mean"),
(Metric(19, Desc(MetricClass.COUNT, "llm", "ttft-percentile-80"), {}), "summary.ttft.percentile_80"),
(Metric(20, Desc(MetricClass.COUNT, "llm", "ttft-percentile-90"), {}), "summary.ttft.percentile_90"),
(Metric(21, Desc(MetricClass.COUNT, "llm", "ttft-percentile-95"), {}), "summary.ttft.percentile_95"),
(Metric(22, Desc(MetricClass.COUNT, "llm", "ttft-percentile-99"), {}), "summary.ttft.percentile_99"),
# itl
(Metric(23, Desc(MetricClass.COUNT, "llm", "itl-min"), {}), "summary.itl.min"),
(Metric(24, Desc(MetricClass.COUNT, "llm", "itl-max"), {}), "summary.itl.max"),
(Metric(25, Desc(MetricClass.COUNT, "llm", "itl-median"), {}), "summary.itl.median"),
(Metric(26, Desc(MetricClass.COUNT, "llm", "itl-mean"), {}), "summary.itl.mean"),
(Metric(27, Desc(MetricClass.COUNT, "llm", "itl-percentile-80"), {}), "summary.itl.percentile_80"),
(Metric(28, Desc(MetricClass.COUNT, "llm", "itl-percentile-90"), {}), "summary.itl.percentile_90"),
(Metric(29, Desc(MetricClass.COUNT, "llm", "itl-percentile-95"), {}), "summary.itl.percentile_95"),
(Metric(30, Desc(MetricClass.COUNT, "llm", "itl-percentile-99"), {}), "summary.itl.percentile_99"),
# tt_ack
(Metric(31, Desc(MetricClass.COUNT, "llm", "tt-ack-min"), {}), "summary.tt_ack.min"),
(Metric(32, Desc(MetricClass.COUNT, "llm", "tt-ack-max"), {}), "summary.tt_ack.max"),
(Metric(33, Desc(MetricClass.COUNT, "llm", "tt-ack-median"), {}), "summary.tt_ack.median"),
(Metric(34, Desc(MetricClass.COUNT, "llm", "tt-ack-mean"), {}), "summary.tt_ack.mean"),
(Metric(35, Desc(MetricClass.COUNT, "llm", "tt-ack-percentile-80"), {}), "summary.tt_ack.percentile_80"),
(Metric(36, Desc(MetricClass.COUNT, "llm", "tt-ack-percentile-90"), {}), "summary.tt_ack.percentile_90"),
(Metric(37, Desc(MetricClass.COUNT, "llm", "tt-ack-percentile-95"), {}), "summary.tt_ack.percentile_95"),
(Metric(38, Desc(MetricClass.COUNT, "llm", "tt-ack-percentile-99"), {}), "summary.tt_ack.percentile_99"),
# response_time
(Metric(39, Desc(MetricClass.COUNT, "llm", "response-time-min"), {}), "summary.response_time.min"),
(Metric(40, Desc(MetricClass.COUNT, "llm", "response-time-max"), {}), "summary.response_time.max"),
(Metric(41, Desc(MetricClass.COUNT, "llm", "response-time-median"), {}), "summary.response_time.median"),
(Metric(42, Desc(MetricClass.COUNT, "llm", "response-time-mean"), {}), "summary.response_time.mean"),
(Metric(43, Desc(MetricClass.COUNT, "llm", "response-time-percentile-80"), {}), "summary.response_time.percentile_80"),
(Metric(44, Desc(MetricClass.COUNT, "llm", "response-time-percentile-90"), {}), "summary.response_time.percentile_90"),
(Metric(45, Desc(MetricClass.COUNT, "llm", "response-time-percentile-95"), {}), "summary.response_time.percentile_95"),
(Metric(46, Desc(MetricClass.COUNT, "llm", "response-time-percentile-99"), {}), "summary.response_time.percentile_99"),
# output_tokens
(Metric(47, Desc(MetricClass.COUNT, "llm", "output-tokens-min"), {}), "summary.output_tokens.min"),
(Metric(48, Desc(MetricClass.COUNT, "llm", "output-tokens-max"), {}), "summary.output_tokens.max"),
(Metric(49, Desc(MetricClass.COUNT, "llm", "output-tokens-median"), {}), "summary.output_tokens.median"),
(Metric(50, Desc(MetricClass.COUNT, "llm", "output-tokens-mean"), {}), "summary.output_tokens.mean"),
(Metric(51, Desc(MetricClass.COUNT, "llm", "output-tokens-percentile-80"), {}), "summary.output_tokens.percentile_80"),
(Metric(52, Desc(MetricClass.COUNT, "llm", "output-tokens-percentile-90"), {}), "summary.output_tokens.percentile_90"),
(Metric(53, Desc(MetricClass.COUNT, "llm", "output-tokens-percentile-95"), {}), "summary.output_tokens.percentile_95"),
(Metric(54, Desc(MetricClass.COUNT, "llm", "output-tokens-percentile-99"), {}), "summary.output_tokens.percentile_99"),
# output_tokens_before_timeout
(Metric(55, Desc(MetricClass.COUNT, "llm", "output-tokens-before-timeout-min"), {}), "summary.output_tokens_before_timeout.min"),
(Metric(56, Desc(MetricClass.COUNT, "llm", "output-tokens-before-timeout-max"), {}), "summary.output_tokens_before_timeout.max"),
(Metric(57, Desc(MetricClass.COUNT, "llm", "output-tokens-before-timeout-median"), {}), "summary.output_tokens_before_timeout.median"),
(Metric(58, Desc(MetricClass.COUNT, "llm", "output-tokens-before-timeout-mean"), {}), "summary.output_tokens_before_timeout.mean"),
(Metric(59, Desc(MetricClass.COUNT, "llm", "output-tokens-before-timeout-percentile-80"), {}), "summary.output_tokens_before_timeout.percentile_80"),
(Metric(60, Desc(MetricClass.COUNT, "llm", "output-tokens-before-timeout-percentile-90"), {}), "summary.output_tokens_before_timeout.percentile_90"),
(Metric(61, Desc(MetricClass.COUNT, "llm", "output-tokens-before-timeout-percentile-95"), {}), "summary.output_tokens_before_timeout.percentile_95"),
(Metric(62, Desc(MetricClass.COUNT, "llm", "output-tokens-before-timeout-percentile-99"), {}), "summary.output_tokens_before_timeout.percentile_99"),
]
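    # Each entry written to metric-data-0.json follows Metric.to_dict(), e.g. for idx 0:
    #   {"desc": {"class": "throughput", "source": "llm", "type": "throughput-full-duration"},
    #    "idx": 0, "names": {}}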
with open("metric-data-0.json", "w") as f:
        f.write(json.dumps([to_dict(metric) for metric, _ in metrics]))
    # 3. Parse the metrics we listed in step 2 using the artifacts from our
    #    test, then dump them to the corresponding CSV file (i.e., metric-data-0.csv)
output = {}
with open("output.json", "r") as f:
output = json.load(f)
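    # output.json is expected to contain a "results" list (one entry per request, each with
    # "start_time"/"end_time" in seconds) and a "summary" object holding the aggregate
    # statistics referenced by the metric paths above.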
    # Multiply the timestamps by 1000: llm-load-test reports timestamps in seconds, but we need ms.
begin_time = int(min(result["start_time"] for result in output["results"])) * 1000
end_time = int(max(result["end_time"] for result in output["results"])) * 1000
entries: List[Entry] = []
    for metric, path in metrics:
        string_value = json_lookup(output, path)
        if string_value is None:
            print(f"WARNING: no field {path} in llm-load-test output")
            continue
        float_value = 0.0
        try:
            float_value = float(string_value)
        except ValueError:
            print(f"WARNING: cannot convert {path}={string_value} to float")
        entries.append(Entry(metric.idx, begin_time, end_time, float_value))
with open("metric-data-0.csv", "w") as f:
f.write("\n".join([str(entry) for entry in entries]))
return 0
if __name__ == "__main__":
sys.exit(main())