From 626d269937bd211ce048ae96a8387056927dab2a Mon Sep 17 00:00:00 2001
From: lvliang-intel
Date: Fri, 6 Sep 2024 13:05:11 +0800
Subject: [PATCH] Support microservice level benchmark (#95)

* Support microservice level benchmark

Signed-off-by: lvliang-intel
---
 evals/benchmark/benchmark.py                 | 32 ++++++++---------
 evals/benchmark/benchmark.yaml               |  2 +-
 evals/benchmark/stresscli/locust/aistress.py | 22 +++++++++++-
 .../stresscli/locust/embeddingfixed.py       |  8 +++--
 .../stresscli/locust/embedservefixed.py      |  8 +++--
 .../stresscli/locust/rerankingfixed.py       |  8 +++--
 .../stresscli/locust/rerankservefixed.py     |  8 +++--
 .../stresscli/locust/retrieverfixed.py       |  8 +++--
 .../stresscli/locust/tokenresponse.py        | 35 +++++++++++++++++++
 9 files changed, 98 insertions(+), 33 deletions(-)

diff --git a/evals/benchmark/benchmark.py b/evals/benchmark/benchmark.py
index c5621860..39ef2b35 100644
--- a/evals/benchmark/benchmark.py
+++ b/evals/benchmark/benchmark.py
@@ -11,25 +11,25 @@
 service_endpoints = {
     "chatqna": {
         "embedding": "/v1/embeddings",
-        "embedding_serving": "/v1/embeddings",
+        "embedserve": "/v1/embeddings",
         "retriever": "/v1/retrieval",
         "reranking": "/v1/reranking",
-        "reranking_serving": "/rerank",
+        "rerankserve": "/rerank",
         "llm": "/v1/chat/completions",
-        "llm_serving": "/v1/chat/completions",
+        "llmserve": "/v1/chat/completions",
         "e2e": "/v1/chatqna",
     },
-    "codegen": {"llm": "/generate_stream", "llm_serving": "/v1/chat/completions", "e2e": "/v1/codegen"},
-    "codetrans": {"llm": "/generate", "llm_serving": "/v1/chat/completions", "e2e": "/v1/codetrans"},
-    "faqgen": {"llm": "/v1/chat/completions", "llm_serving": "/v1/chat/completions", "e2e": "/v1/faqgen"},
+    "codegen": {"llm": "/generate_stream", "llmserve": "/v1/chat/completions", "e2e": "/v1/codegen"},
+    "codetrans": {"llm": "/generate", "llmserve": "/v1/chat/completions", "e2e": "/v1/codetrans"},
+    "faqgen": {"llm": "/v1/chat/completions", "llmserve": "/v1/chat/completions", "e2e": "/v1/faqgen"},
     "audioqna": {
         "asr": "/v1/audio/transcriptions",
         "llm": "/v1/chat/completions",
-        "llm_serving": "/v1/chat/completions",
+        "llmserve": "/v1/chat/completions",
         "tts": "/v1/audio/speech",
         "e2e": "/v1/audioqna",
     },
-    "visualqna": {"lvm": "/v1/chat/completions", "lvm_serving": "/v1/chat/completions", "e2e": "/v1/visualqna"},
+    "visualqna": {"lvm": "/v1/chat/completions", "lvmserve": "/v1/chat/completions", "e2e": "/v1/visualqna"},
 }


@@ -200,19 +200,19 @@ def process_service(example, service_type, case_data, test_suite_config):
     example_service_map = {
         "chatqna": [
             "embedding",
-            "embedding_serving",
+            "embedserve",
             "retriever",
             "reranking",
-            "reranking_serving",
+            "rerankserve",
             "llm",
-            "llm_serving",
+            "llmserve",
             "e2e",
         ],
-        "codegen": ["llm", "llm_serving", "e2e"],
-        "codetrans": ["llm", "llm_serving", "e2e"],
-        "faqgen": ["llm", "llm_serving", "e2e"],
-        "audioqna": ["asr", "llm", "llm_serving", "tts", "e2e"],
-        "visualqna": ["lvm", "lvm_serving", "e2e"],
+        "codegen": ["llm", "llmserve", "e2e"],
+        "codetrans": ["llm", "llmserve", "e2e"],
+        "faqgen": ["llm", "llmserve", "e2e"],
+        "audioqna": ["asr", "llm", "llmserve", "tts", "e2e"],
+        "visualqna": ["lvm", "lvmserve", "e2e"],
     }

     # Process each example's services
diff --git a/evals/benchmark/benchmark.yaml b/evals/benchmark/benchmark.yaml
index 7e26b74d..bf3576ae 100644
--- a/evals/benchmark/benchmark.yaml
+++ b/evals/benchmark/benchmark.yaml
@@ -111,7 +111,7 @@ test_cases:
       top_p: 0.95
       repetition_penalty: 1.03
       streaming: true
-    llm_serving:
+    llmserve:
       run_test: false
       service_name: "faq-micro-svc"  # Replace with your service name
     e2e:
diff --git a/evals/benchmark/stresscli/locust/aistress.py b/evals/benchmark/stresscli/locust/aistress.py
index 5a5cb2c3..d52c738a 100644
--- a/evals/benchmark/stresscli/locust/aistress.py
+++ b/evals/benchmark/stresscli/locust/aistress.py
@@ -73,12 +73,22 @@ def bench_main(self):
             self.environment.runner.send_message("worker_reqsent", 1)
         reqData = bench_package.getReqData()
         url = bench_package.getUrl()
+        streaming_bench_target = [
+            "llmfixed",
+            "llmbench",
+            "chatqnafixed",
+            "chatqnabench",
+            "codegenfixed",
+            "codegenbench",
+            "faqgenfixed",
+            "faqgenbench",
+        ]
         try:
             start_ts = time.perf_counter()
             with self.client.post(
                 url,
                 json=reqData,
-                stream=True,
+                stream=True if self.environment.parsed_options.bench_target in streaming_bench_target else False,
                 catch_response=True,
                 timeout=self.environment.parsed_options.http_timeout,
             ) as resp:
@@ -86,6 +96,16 @@ def bench_main(self):

                 if resp.status_code >= 200 and resp.status_code < 400:
                     if self.environment.parsed_options.bench_target in [
+                        "embedservefixed",
+                        "embeddingfixed",
+                        "retrieverfixed",
+                        "rerankservefixed",
+                        "rerankingfixed",
+                    ]:
+                        respData = {
+                            "total_latency": time.perf_counter() - start_ts,
+                        }
+                    elif self.environment.parsed_options.bench_target in [
                         "audioqnafixed",
                         "audioqnabench",
                     ]:  # non-stream case
diff --git a/evals/benchmark/stresscli/locust/embeddingfixed.py b/evals/benchmark/stresscli/locust/embeddingfixed.py
index 12ca010d..4fe38737 100644
--- a/evals/benchmark/stresscli/locust/embeddingfixed.py
+++ b/evals/benchmark/stresscli/locust/embeddingfixed.py
@@ -14,9 +14,11 @@ def getReqData():
     }


-def respStatics(environment, resp):
-    return token.respStatics(environment, resp)
+def respStatics(environment, reqData, resp):
+    return {
+        "total_latency": resp["total_latency"] * 1000,
+    }


 def staticsOutput(environment, reqlist):
-    token.staticsOutput(environment, reqlist)
+    token.staticsOutputForMicroservice(environment, reqlist)
diff --git a/evals/benchmark/stresscli/locust/embedservefixed.py b/evals/benchmark/stresscli/locust/embedservefixed.py
index 5f370797..a0a2feba 100644
--- a/evals/benchmark/stresscli/locust/embedservefixed.py
+++ b/evals/benchmark/stresscli/locust/embedservefixed.py
@@ -14,9 +14,11 @@ def getReqData():
     }


-def respStatics(environment, resp):
-    return token.respStatics(environment, resp)
+def respStatics(environment, reqData, resp):
+    return {
+        "total_latency": resp["total_latency"] * 1000,
+    }


 def staticsOutput(environment, reqlist):
-    token.staticsOutput(environment, reqlist)
+    token.staticsOutputForMicroservice(environment, reqlist)
diff --git a/evals/benchmark/stresscli/locust/rerankingfixed.py b/evals/benchmark/stresscli/locust/rerankingfixed.py
index 6c780291..241e7c0d 100644
--- a/evals/benchmark/stresscli/locust/rerankingfixed.py
+++ b/evals/benchmark/stresscli/locust/rerankingfixed.py
@@ -17,9 +17,11 @@ def getReqData():
     return {"initial_query": my_query, "retrieved_docs": [{"text": query_rerank_1}, {"text": query_rerank_2}]}


-def respStatics(environment, resp):
-    return token.respStatics(environment, resp)
+def respStatics(environment, reqData, resp):
+    return {
+        "total_latency": resp["total_latency"] * 1000,
+    }


 def staticsOutput(environment, reqlist):
-    token.staticsOutput(environment, reqlist)
+    token.staticsOutputForMicroservice(environment, reqlist)
diff --git a/evals/benchmark/stresscli/locust/rerankservefixed.py b/evals/benchmark/stresscli/locust/rerankservefixed.py
index 342b44e2..d2be7c97 100644
--- a/evals/benchmark/stresscli/locust/rerankservefixed.py
+++ b/evals/benchmark/stresscli/locust/rerankservefixed.py
@@ -17,9 +17,11 @@ def getReqData():
     return {"query": my_query, "texts": [query_rerank_1, query_rerank_2]}


-def respStatics(environment, resp):
-    return token.respStatics(environment, resp)
+def respStatics(environment, reqData, resp):
+    return {
+        "total_latency": resp["total_latency"] * 1000,
+    }


 def staticsOutput(environment, reqlist):
-    token.staticsOutput(environment, reqlist)
+    token.staticsOutputForMicroservice(environment, reqlist)
diff --git a/evals/benchmark/stresscli/locust/retrieverfixed.py b/evals/benchmark/stresscli/locust/retrieverfixed.py
index 03d8e260..805ad57b 100644
--- a/evals/benchmark/stresscli/locust/retrieverfixed.py
+++ b/evals/benchmark/stresscli/locust/retrieverfixed.py
@@ -786,9 +786,11 @@ def getReqData():
     return ({"text": my_query, "embedding": my_embedding},)


-def respStatics(environment, resp):
-    return token.respStatics(environment, resp)
+def respStatics(environment, reqData, resp):
+    return {
+        "total_latency": resp["total_latency"] * 1000,
+    }


 def staticsOutput(environment, reqlist):
-    token.staticsOutput(environment, reqlist)
+    token.staticsOutputForMicroservice(environment, reqlist)
diff --git a/evals/benchmark/stresscli/locust/tokenresponse.py b/evals/benchmark/stresscli/locust/tokenresponse.py
index fc3aff6a..f04e1656 100644
--- a/evals/benchmark/stresscli/locust/tokenresponse.py
+++ b/evals/benchmark/stresscli/locust/tokenresponse.py
@@ -119,3 +119,38 @@ def staticsOutput(environment, reqlist):
         console_logger.warning(average_msg.format(numpy.average(avg_token)))
     console_logger.warning("======================================================\n\n")
     logging.shutdown()
+
+
+def staticsOutputForMicroservice(environment, reqlist):
+    e2e_lat = []
+    duration = environment.runner.stats.last_request_timestamp - environment.runner.stats.start_time
+
+    if len(reqlist) == 0:
+        logging.debug(f"len(reqlist): {len(reqlist)}, skip printing")
+        return
+    for req in iter(reqlist):
+        e2e_lat.append(req["total_latency"])
+
+    # Statistics for success response data only
+    req_msg = "Succeed Response: {} (Total {}, {:.1%} Success), Duration: {:.2f}s, RPS: {:.2f}"
+    e2e_msg = "End to End latency(ms), P50: {:.2f}, P90: {:.2f}, P99: {:.2f}, Avg: {:.2f}"
+    console_logger.warning("\n=================Total statistics=====================")
+    console_logger.warning(
+        req_msg.format(
+            len(reqlist),
+            environment.runner.stats.num_requests,
+            len(reqlist) / environment.runner.stats.num_requests,
+            duration,
+            len(reqlist) / duration,
+        )
+    )
+    console_logger.warning(
+        e2e_msg.format(
+            numpy.percentile(e2e_lat, 50),
+            numpy.percentile(e2e_lat, 90),
+            numpy.percentile(e2e_lat, 99),
+            numpy.average(e2e_lat),
+        )
+    )
+    console_logger.warning("======================================================\n\n")
+    logging.shutdown()
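
Below is a minimal standalone sketch (not part of the patch) of the microservice-level measurement path this change introduces: the embedding/retriever/reranking locust targets now record only a wall-clock total latency per request, report it in milliseconds from respStatics(), and aggregate P50/P90/P99/average the way staticsOutputForMicroservice() does with numpy. The helper names measure_request and summarize_latencies are hypothetical and exist only for illustration; only the time and numpy modules are assumed, matching the imports already used in aistress.py and tokenresponse.py.

# Hypothetical, self-contained sketch of the per-microservice latency flow.
import time

import numpy


def measure_request(send_fn):
    # Mirrors the new non-streaming branch in aistress.py: time the call and
    # keep only the elapsed wall-clock seconds.
    start_ts = time.perf_counter()
    send_fn()
    return {"total_latency": time.perf_counter() - start_ts}


def summarize_latencies(reqlist, duration_s, total_requests):
    # reqlist entries carry total_latency in milliseconds, as returned by the
    # updated respStatics() in the *fixed.py modules (seconds * 1000).
    e2e_lat = [req["total_latency"] for req in reqlist]
    if not e2e_lat:
        return None
    return {
        "success_ratio": len(reqlist) / total_requests,
        "rps": len(reqlist) / duration_s,
        "p50_ms": numpy.percentile(e2e_lat, 50),
        "p90_ms": numpy.percentile(e2e_lat, 90),
        "p99_ms": numpy.percentile(e2e_lat, 99),
        "avg_ms": numpy.average(e2e_lat),
    }


if __name__ == "__main__":
    # Example: three fake requests measured the same way the patch measures them,
    # then converted to milliseconds before aggregation.
    resps = [measure_request(lambda: time.sleep(0.01)) for _ in range(3)]
    reqlist = [{"total_latency": r["total_latency"] * 1000} for r in resps]
    print(summarize_latencies(reqlist, duration_s=1.0, total_requests=3))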