From 626d269937bd211ce048ae96a8387056927dab2a Mon Sep 17 00:00:00 2001
From: lvliang-intel
Date: Fri, 6 Sep 2024 13:05:11 +0800
Subject: [PATCH] Support microservice level benchmark (#95)

* Support microservice level benchmark

Signed-off-by: lvliang-intel
---
 evals/benchmark/benchmark.py                 | 32 ++++++++---------
 evals/benchmark/benchmark.yaml               |  2 +-
 evals/benchmark/stresscli/locust/aistress.py | 22 +++++++++++-
 .../stresscli/locust/embeddingfixed.py       |  8 +++--
 .../stresscli/locust/embedservefixed.py      |  8 +++--
 .../stresscli/locust/rerankingfixed.py       |  8 +++--
 .../stresscli/locust/rerankservefixed.py     |  8 +++--
 .../stresscli/locust/retrieverfixed.py       |  8 +++--
 .../stresscli/locust/tokenresponse.py        | 35 +++++++++++++++++++
 9 files changed, 98 insertions(+), 33 deletions(-)

diff --git a/evals/benchmark/benchmark.py b/evals/benchmark/benchmark.py
index c5621860..39ef2b35 100644
--- a/evals/benchmark/benchmark.py
+++ b/evals/benchmark/benchmark.py
@@ -11,25 +11,25 @@
 service_endpoints = {
     "chatqna": {
         "embedding": "/v1/embeddings",
-        "embedding_serving": "/v1/embeddings",
+        "embedserve": "/v1/embeddings",
         "retriever": "/v1/retrieval",
         "reranking": "/v1/reranking",
-        "reranking_serving": "/rerank",
+        "rerankserve": "/rerank",
         "llm": "/v1/chat/completions",
-        "llm_serving": "/v1/chat/completions",
+        "llmserve": "/v1/chat/completions",
         "e2e": "/v1/chatqna",
     },
-    "codegen": {"llm": "/generate_stream", "llm_serving": "/v1/chat/completions", "e2e": "/v1/codegen"},
-    "codetrans": {"llm": "/generate", "llm_serving": "/v1/chat/completions", "e2e": "/v1/codetrans"},
-    "faqgen": {"llm": "/v1/chat/completions", "llm_serving": "/v1/chat/completions", "e2e": "/v1/faqgen"},
+    "codegen": {"llm": "/generate_stream", "llmserve": "/v1/chat/completions", "e2e": "/v1/codegen"},
+    "codetrans": {"llm": "/generate", "llmserve": "/v1/chat/completions", "e2e": "/v1/codetrans"},
+    "faqgen": {"llm": "/v1/chat/completions", "llmserve": "/v1/chat/completions", "e2e": "/v1/faqgen"},
     "audioqna": {
         "asr": "/v1/audio/transcriptions",
         "llm": "/v1/chat/completions",
-        "llm_serving": "/v1/chat/completions",
+        "llmserve": "/v1/chat/completions",
         "tts": "/v1/audio/speech",
         "e2e": "/v1/audioqna",
     },
-    "visualqna": {"lvm": "/v1/chat/completions", "lvm_serving": "/v1/chat/completions", "e2e": "/v1/visualqna"},
+    "visualqna": {"lvm": "/v1/chat/completions", "lvmserve": "/v1/chat/completions", "e2e": "/v1/visualqna"},
 }


@@ -200,19 +200,19 @@ def process_service(example, service_type, case_data, test_suite_config):
     example_service_map = {
         "chatqna": [
             "embedding",
-            "embedding_serving",
+            "embedserve",
             "retriever",
             "reranking",
-            "reranking_serving",
+            "rerankserve",
             "llm",
-            "llm_serving",
+            "llmserve",
             "e2e",
         ],
-        "codegen": ["llm", "llm_serving", "e2e"],
-        "codetrans": ["llm", "llm_serving", "e2e"],
-        "faqgen": ["llm", "llm_serving", "e2e"],
-        "audioqna": ["asr", "llm", "llm_serving", "tts", "e2e"],
-        "visualqna": ["lvm", "lvm_serving", "e2e"],
+        "codegen": ["llm", "llmserve", "e2e"],
+        "codetrans": ["llm", "llmserve", "e2e"],
+        "faqgen": ["llm", "llmserve", "e2e"],
+        "audioqna": ["asr", "llm", "llmserve", "tts", "e2e"],
+        "visualqna": ["lvm", "lvmserve", "e2e"],
     }

     # Process each example's services
diff --git a/evals/benchmark/benchmark.yaml b/evals/benchmark/benchmark.yaml
index 7e26b74d..bf3576ae 100644
--- a/evals/benchmark/benchmark.yaml
+++ b/evals/benchmark/benchmark.yaml
@@ -111,7 +111,7 @@ test_cases:
       top_p: 0.95
       repetition_penalty: 1.03
       streaming: true
-    llm_serving:
+    llmserve:
       run_test: false
       service_name: "faq-micro-svc"  # Replace with your service name
     e2e:
diff --git a/evals/benchmark/stresscli/locust/aistress.py b/evals/benchmark/stresscli/locust/aistress.py
index 5a5cb2c3..d52c738a 100644
--- a/evals/benchmark/stresscli/locust/aistress.py
+++ b/evals/benchmark/stresscli/locust/aistress.py
@@ -73,12 +73,22 @@ def bench_main(self):
             self.environment.runner.send_message("worker_reqsent", 1)
         reqData = bench_package.getReqData()
         url = bench_package.getUrl()
+        streaming_bench_target = [
+            "llmfixed",
+            "llmbench",
+            "chatqnafixed",
+            "chatqnabench",
+            "codegenfixed",
+            "codegenbench",
+            "faqgenfixed",
+            "faqgenbench",
+        ]
         try:
             start_ts = time.perf_counter()
             with self.client.post(
                 url,
                 json=reqData,
-                stream=True,
+                stream=True if self.environment.parsed_options.bench_target in streaming_bench_target else False,
                 catch_response=True,
                 timeout=self.environment.parsed_options.http_timeout,
             ) as resp:
@@ -86,6 +96,16 @@ def bench_main(self):

                 if resp.status_code >= 200 and resp.status_code < 400:
                     if self.environment.parsed_options.bench_target in [
+                        "embedservefixed",
+                        "embeddingfixed",
+                        "retrieverfixed",
+                        "rerankservefixed",
+                        "rerankingfixed",
+                    ]:
+                        respData = {
+                            "total_latency": time.perf_counter() - start_ts,
+                        }
+                    elif self.environment.parsed_options.bench_target in [
                         "audioqnafixed",
                         "audioqnabench",
                     ]:  # non-stream case
diff --git a/evals/benchmark/stresscli/locust/embeddingfixed.py b/evals/benchmark/stresscli/locust/embeddingfixed.py
index 12ca010d..4fe38737 100644
--- a/evals/benchmark/stresscli/locust/embeddingfixed.py
+++ b/evals/benchmark/stresscli/locust/embeddingfixed.py
@@ -14,9 +14,11 @@ def getReqData():
     }


-def respStatics(environment, resp):
-    return token.respStatics(environment, resp)
+def respStatics(environment, reqData, resp):
+    return {
+        "total_latency": resp["total_latency"] * 1000,
+    }


 def staticsOutput(environment, reqlist):
-    token.staticsOutput(environment, reqlist)
+    token.staticsOutputForMicroservice(environment, reqlist)
diff --git a/evals/benchmark/stresscli/locust/embedservefixed.py b/evals/benchmark/stresscli/locust/embedservefixed.py
index 5f370797..a0a2feba 100644
--- a/evals/benchmark/stresscli/locust/embedservefixed.py
+++ b/evals/benchmark/stresscli/locust/embedservefixed.py
@@ -14,9 +14,11 @@ def getReqData():
     }


-def respStatics(environment, resp):
-    return token.respStatics(environment, resp)
+def respStatics(environment, reqData, resp):
+    return {
+        "total_latency": resp["total_latency"] * 1000,
+    }


 def staticsOutput(environment, reqlist):
-    token.staticsOutput(environment, reqlist)
+    token.staticsOutputForMicroservice(environment, reqlist)
diff --git a/evals/benchmark/stresscli/locust/rerankingfixed.py b/evals/benchmark/stresscli/locust/rerankingfixed.py
index 6c780291..241e7c0d 100644
--- a/evals/benchmark/stresscli/locust/rerankingfixed.py
+++ b/evals/benchmark/stresscli/locust/rerankingfixed.py
@@ -17,9 +17,11 @@ def getReqData():
     return {"initial_query": my_query, "retrieved_docs": [{"text": query_rerank_1}, {"text": query_rerank_2}]}


-def respStatics(environment, resp):
-    return token.respStatics(environment, resp)
+def respStatics(environment, reqData, resp):
+    return {
+        "total_latency": resp["total_latency"] * 1000,
+    }


 def staticsOutput(environment, reqlist):
-    token.staticsOutput(environment, reqlist)
+    token.staticsOutputForMicroservice(environment, reqlist)
diff --git a/evals/benchmark/stresscli/locust/rerankservefixed.py b/evals/benchmark/stresscli/locust/rerankservefixed.py
index 342b44e2..d2be7c97 100644
--- a/evals/benchmark/stresscli/locust/rerankservefixed.py
+++ b/evals/benchmark/stresscli/locust/rerankservefixed.py
@@ -17,9 +17,11 @@ def getReqData():
     return {"query": my_query, "texts": [query_rerank_1, query_rerank_2]}


-def respStatics(environment, resp):
-    return token.respStatics(environment, resp)
+def respStatics(environment, reqData, resp):
+    return {
+        "total_latency": resp["total_latency"] * 1000,
+    }


 def staticsOutput(environment, reqlist):
-    token.staticsOutput(environment, reqlist)
+    token.staticsOutputForMicroservice(environment, reqlist)
diff --git a/evals/benchmark/stresscli/locust/retrieverfixed.py b/evals/benchmark/stresscli/locust/retrieverfixed.py
index 03d8e260..805ad57b 100644
--- a/evals/benchmark/stresscli/locust/retrieverfixed.py
+++ b/evals/benchmark/stresscli/locust/retrieverfixed.py
@@ -786,9 +786,11 @@ def getReqData():
     return ({"text": my_query, "embedding": my_embedding},)


-def respStatics(environment, resp):
-    return token.respStatics(environment, resp)
+def respStatics(environment, reqData, resp):
+    return {
+        "total_latency": resp["total_latency"] * 1000,
+    }


 def staticsOutput(environment, reqlist):
-    token.staticsOutput(environment, reqlist)
+    token.staticsOutputForMicroservice(environment, reqlist)
diff --git a/evals/benchmark/stresscli/locust/tokenresponse.py b/evals/benchmark/stresscli/locust/tokenresponse.py
index fc3aff6a..f04e1656 100644
--- a/evals/benchmark/stresscli/locust/tokenresponse.py
+++ b/evals/benchmark/stresscli/locust/tokenresponse.py
@@ -119,3 +119,38 @@ def staticsOutput(environment, reqlist):
         console_logger.warning(average_msg.format(numpy.average(avg_token)))
     console_logger.warning("======================================================\n\n")
     logging.shutdown()
+
+
+def staticsOutputForMicroservice(environment, reqlist):
+    e2e_lat = []
+    duration = environment.runner.stats.last_request_timestamp - environment.runner.stats.start_time
+
+    if len(reqlist) == 0:
+        logging.debug(f"len(reqlist): {len(reqlist)}, skip printing")
+        return
+    for req in iter(reqlist):
+        e2e_lat.append(req["total_latency"])
+
+    # Statistics for success response data only
+    req_msg = "Succeed Response: {} (Total {}, {:.1%} Success), Duration: {:.2f}s, RPS: {:.2f}"
+    e2e_msg = "End to End latency(ms), P50: {:.2f}, P90: {:.2f}, P99: {:.2f}, Avg: {:.2f}"
+    console_logger.warning("\n=================Total statistics=====================")
+    console_logger.warning(
+        req_msg.format(
+            len(reqlist),
+            environment.runner.stats.num_requests,
+            len(reqlist) / environment.runner.stats.num_requests,
+            duration,
+            len(reqlist) / duration,
+        )
+    )
+    console_logger.warning(
+        e2e_msg.format(
+            numpy.percentile(e2e_lat, 50),
+            numpy.percentile(e2e_lat, 90),
+            numpy.percentile(e2e_lat, 99),
+            numpy.average(e2e_lat),
+        )
+    )
+    console_logger.warning("======================================================\n\n")
+    logging.shutdown()
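
Below is a minimal standalone sketch (not part of the patch) of the microservice-level measurement path this change introduces: the embedding/retriever/reranking locust targets now record only a wall-clock total latency per request, report it in milliseconds from respStatics(), and aggregate P50/P90/P99/average the way staticsOutputForMicroservice() does with numpy. The helper names measure_request and summarize_latencies are hypothetical and exist only for illustration; only the time and numpy modules are assumed, matching the imports already used in aistress.py and tokenresponse.py.

# Hypothetical, self-contained sketch of the per-microservice latency flow.
import time

import numpy


def measure_request(send_fn):
    # Mirrors the new non-streaming branch in aistress.py: time the call and
    # keep only the elapsed wall-clock seconds.
    start_ts = time.perf_counter()
    send_fn()
    return {"total_latency": time.perf_counter() - start_ts}


def summarize_latencies(reqlist, duration_s, total_requests):
    # reqlist entries carry total_latency in milliseconds, as returned by the
    # updated respStatics() in the *fixed.py modules (seconds * 1000).
    e2e_lat = [req["total_latency"] for req in reqlist]
    if not e2e_lat:
        return None
    return {
        "success_ratio": len(reqlist) / total_requests,
        "rps": len(reqlist) / duration_s,
        "p50_ms": numpy.percentile(e2e_lat, 50),
        "p90_ms": numpy.percentile(e2e_lat, 90),
        "p99_ms": numpy.percentile(e2e_lat, 99),
        "avg_ms": numpy.average(e2e_lat),
    }


if __name__ == "__main__":
    # Example: three fake requests measured the same way the patch measures them,
    # then converted to milliseconds before aggregation.
    resps = [measure_request(lambda: time.sleep(0.01)) for _ in range(3)]
    reqlist = [{"total_latency": r["total_latency"] * 1000} for r in resps]
    print(summarize_latencies(reqlist, duration_s=1.0, total_requests=3))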