From 2c2fec4067e1d7cc7acd01f99ccef85fe1240ea7 Mon Sep 17 00:00:00 2001 From: Ashok Chandrasekar Date: Fri, 23 Aug 2024 17:32:35 +0000 Subject: [PATCH] Add latency profile generator (#775) * Add latency profile generator This change adds a new benchmarking suite called latency-profile-generator which runs serving benchmarks at different request rates to produce latency and throughput numbers at different QPS. This can be used to identify how different models and model servers perform depending on incoming traffic. * Update readme and GCS push steps * first commit * remove profiling * correct steps * correct steps * jetstream option for backend arg * extra parameters * configurable pipeline starting point, request rates configurable * WIP changes/reversions * setting for building latency profiler image * onoly build once for profile-generator * fmt * fmt * Move deploy model server from profile-generator to latency-profile * fix kubectl wait * dmt * intermediate changes * Update table of contents * Fix lint issues * remove specific project id * remove artifact_registry * Stripped back LPG automation for separate PR * typo * nit * remove bad depends_on * nits * nits * Update main.tf * Update README.md * more cleanup * move latency-profile module to subdirectory * fmt * supports jetstream * Added comment * more accurate comment * readd container folder --------- Co-authored-by: Brendan Slabe Co-authored-by: Brendan Slabe --- .../tools/profile-generator/README.md | 180 +++++++ .../tools/profile-generator/build.tf | 8 + .../profile-generator/container/Dockerfile | 21 + .../container/benchmark_serving.py | 469 ++++++++++++++++++ .../container/latency_throughput_curve.sh | 27 + .../container/requirements.txt | 37 ++ .../benchmark/tools/profile-generator/main.tf | 79 +++ .../modules/latency-profile/main.tf | 65 +++ .../latency-profile-generator.yaml.tpl | 53 ++ .../modules/latency-profile/sample.tfvars | 44 ++ .../modules/latency-profile/variables.tf | 155 ++++++ .../tools/profile-generator/sample.tfvars | 42 ++ .../tools/profile-generator/variables.tf | 147 ++++++ 13 files changed, 1327 insertions(+) create mode 100644 benchmarks/benchmark/tools/profile-generator/README.md create mode 100644 benchmarks/benchmark/tools/profile-generator/build.tf create mode 100644 benchmarks/benchmark/tools/profile-generator/container/Dockerfile create mode 100644 benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py create mode 100644 benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh create mode 100644 benchmarks/benchmark/tools/profile-generator/container/requirements.txt create mode 100644 benchmarks/benchmark/tools/profile-generator/main.tf create mode 100644 benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf create mode 100644 benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl create mode 100644 benchmarks/benchmark/tools/profile-generator/modules/latency-profile/sample.tfvars create mode 100644 benchmarks/benchmark/tools/profile-generator/modules/latency-profile/variables.tf create mode 100644 benchmarks/benchmark/tools/profile-generator/sample.tfvars create mode 100644 benchmarks/benchmark/tools/profile-generator/variables.tf diff --git a/benchmarks/benchmark/tools/profile-generator/README.md b/benchmarks/benchmark/tools/profile-generator/README.md new file mode 100644 index 000000000..6d136bab9 --- /dev/null +++ 
b/benchmarks/benchmark/tools/profile-generator/README.md @@ -0,0 +1,180 @@
+# AI on GKE Benchmark Latency Profile Generator
+
+
+* [AI on GKE Benchmark Latency Profile Generator](#ai-on-gke-benchmark-latency-profile-generator)
+  * [Overview](#overview)
+  * [Instructions](#instructions)
+    * [Step 1: create output bucket](#step-1--create-output-bucket)
+    * [Step 2: create and give service account access to write to output gcs bucket](#step-2--create-and-give-service-account-access-to-write-to-output-gcs-bucket)
+    * [Step 3: create artifact repository for automated Latency Profile Generator docker build](#step-3--create-artifact-repository-for-automated-latency-profile-generator-docker-build)
+    * [Step 4: create and configure terraform.tfvars](#step-4--create-and-configure-terraformtfvars)
+      * [[optional] set-up credentials config with kubeconfig](#optional-set-up-credentials-config-with-kubeconfig)
+      * [[optional] set up secret token in Secret Manager](#optional-set-up-secret-token-in-secret-manager)
+    * [Step 5: login to gcloud](#step-5--login-to-gcloud)
+    * [Step 6: terraform initialize, plan and apply](#step-6--terraform-initialize-plan-and-apply)
+  * [Inputs](#inputs)
+
+## Overview
+
+This deploys the latency profile generator, which measures the throughput and
+latency at various request rates for the model and model server of your choice.
+
+It currently supports the following frameworks:
+- tensorrt_llm_triton
+- text generation inference (tgi)
+- vllm
+- sax
+- jetstream
+
+## Instructions
+
+### Step 1: create output bucket
+
+If you followed the steps in `../../infra/` for creating your cluster and extra
+resources, an output bucket was already created for you.
+If not, you will have to create and manage your own gcs bucket for storing
+benchmarking results.
+
+Set `output_bucket` in your `terraform.tfvars` to this gcs bucket.
+
+### Step 2: create and give service account access to write to output gcs bucket
+
+The latency profile generator requires storage.admin access to write output to
+the given output gcs bucket. If you followed the steps in `../../infra`, you should
+already be logged into gcloud and have a kubernetes and gcloud service account
+created with the proper access to the created output bucket. If you are
+not logged into gcloud, run the following:
+
+```bash
+gcloud auth application-default login
+```
+
+To grant the gcloud service account storage.admin permissions on the gcs bucket,
+run the following:
+
+```
+gcloud storage buckets add-iam-policy-binding gs://$OUTPUT_BUCKET/ \
+--member=serviceAccount:$GOOGLE_SERVICE_ACCOUNT@$PROJECT_ID.iam.gserviceaccount.com --role=roles/storage.admin
+```
+
+Your kubernetes service account will inherit these permissions via Workload Identity.
+
+Set `latency_profile_kubernetes_service_account` in your
+`terraform.tfvars` to the kubernetes service account name.
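+
+If you are creating the kubernetes and gcloud service accounts yourself rather
+than reusing the ones from `../../infra`, the Workload Identity binding can be
+sketched roughly as follows (the `sample-runner-ksa` name and `benchmark`
+namespace are illustrative placeholders; use whatever you set in your
+`terraform.tfvars`):
+
+```bash
+# Create the kubernetes service account the benchmark job will run as.
+kubectl create serviceaccount sample-runner-ksa --namespace benchmark
+
+# Allow the kubernetes service account to impersonate the gcloud service account.
+gcloud iam service-accounts add-iam-policy-binding \
+    $GOOGLE_SERVICE_ACCOUNT@$PROJECT_ID.iam.gserviceaccount.com \
+    --role roles/iam.workloadIdentityUser \
+    --member "serviceAccount:$PROJECT_ID.svc.id.goog[benchmark/sample-runner-ksa]"
+
+# Annotate the kubernetes service account with the gcloud service account to complete the binding.
+kubectl annotate serviceaccount sample-runner-ksa --namespace benchmark \
+    iam.gke.io/gcp-service-account=$GOOGLE_SERVICE_ACCOUNT@$PROJECT_ID.iam.gserviceaccount.com
+```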
+
+### Step 3: create artifact repository for automated Latency Profile Generator docker build
+
+The latency profile generator rebuilds the docker image on each terraform apply
+if `build_latency_profile_generator_image` is set to true (the default).
+The container image will be pushed to the given `artifact_registry`. This artifact
+repository is expected to already exist. If you created your cluster via
+`../../infra/`, then an artifact repository was created for you with the same
+name as the prefix, in the same location as the cluster. You can also create your
+own via this command:
+
+```bash
+gcloud artifacts repositories create ai-benchmark --location=us-central1 --repository-format=docker
+```
+
+### Step 4: create and configure terraform.tfvars
+
+Create a `terraform.tfvars` file. `./sample.tfvars` is provided as an example
+file. You can copy it as a starting point.
+Note that at a minimum you will have to change the existing
+`credentials_config`, `project_id`, and `artifact_registry`.
+
+```bash
+cp ./sample.tfvars terraform.tfvars
+```
+
+Fill out your `terraform.tfvars` with the desired model and server configuration, referring to the list of required and optional variables [here](#inputs). The following variables are required:
+- `credentials_config` - credentials for the cluster to deploy the Latency Profile Generator benchmark tool on
+- `project_id` - project id for enabling dependent services for building Latency Profile Generator artifacts
+- `artifact_registry` - artifact registry to upload Latency Profile Generator artifacts to
+- `build_latency_profile_generator_image` - whether the latency profile generator image will be built (defaults to `true`)
+- `targets` - the model server(s) to benchmark. Set the fields under `manual` to benchmark a model server already deployed in the cluster.
+- `output_bucket` - gcs bucket to write benchmarking metrics to
+- `latency_profile_kubernetes_service_account` - service account giving the latency profile generator access to write to `output_bucket`
+- `k8s_hf_secret` - name of the secret for the huggingface token stored in k8s
+
+#### [optional] set-up credentials config with kubeconfig
+
+If your cluster has fleet management enabled, the existing `credentials_config`
+can use the fleet host credentials like this:
+
+```bash
+credentials_config = {
+  fleet_host = "https://connectgateway.googleapis.com/v1/projects/$PROJECT_NUMBER/locations/global/gkeMemberships/$CLUSTER_NAME"
+}
+```
+
+If your cluster does not have fleet management enabled, you can use your
+cluster's kubeconfig in the `credentials_config`. You must isolate your
+cluster's kubeconfig from other clusters in the default kube.config file.
+To do this, run the following command:
+
+```bash
+KUBECONFIG=~/.kube/${CLUSTER_NAME}-kube.config gcloud container clusters get-credentials $CLUSTER_NAME --location $CLUSTER_LOCATION
+```
+
+Then update your `terraform.tfvars` `credentials_config` to the following:
+
+```bash
+credentials_config = {
+  kubeconfig = {
+    path = "~/.kube/${CLUSTER_NAME}-kube.config"
+  }
+}
+```
+
+#### [optional] set up secret token in Secret Manager
+
+A model may require a security token to access it. For example, Llama2 from
+HuggingFace is a gated model that requires a
+[user access token](https://huggingface.co/docs/hub/en/security-tokens). If the
+model you want to run does not require this, skip this step.
+
+If you followed the steps from `../../infra/`, Secret Manager and the user access
+token should already be set up. If not, it is strongly recommended that you use
+Workload Identity and Secret Manager to access the user access tokens to avoid
+adding a plain text token into the terraform state. To do so, follow the
+instructions for
+[setting up a secret in Secret Manager here](https://cloud.google.com/kubernetes-engine/docs/tutorials/workload-identity-secrets).
+
+Once complete, you should add these related secret values to your
+`terraform.tfvars`:
+
+```bash
+# ex. "projects/sample-project/secrets/hugging_face_secret"
+hugging_face_secret = $SECRET_ID
+  # ex.
1 +hugging_face_secret_version = $SECRET_VERSION +``` + +### Step 5: login to gcloud + +Run the following gcloud command for authorization: + +```bash +gcloud auth application-default login +``` + +### Step 6: terraform initialize, plan and apply + +Run the following terraform commands: + +```bash +# initialize terraform +terraform init + +# verify changes +terraform plan + +# apply changes +terraform apply +``` + +The results can be viewed via running the following: +``` +kubectl logs job/latency-profile-generator +``` diff --git a/benchmarks/benchmark/tools/profile-generator/build.tf b/benchmarks/benchmark/tools/profile-generator/build.tf new file mode 100644 index 000000000..2f5e17cd3 --- /dev/null +++ b/benchmarks/benchmark/tools/profile-generator/build.tf @@ -0,0 +1,8 @@ +resource "null_resource" "build_and_push_image" { + count = var.build_latency_profile_generator_image ? 1 : 0 + depends_on = [resource.google_project_service.cloudbuild] + provisioner "local-exec" { + working_dir = path.module + command = "gcloud builds submit --tag ${var.artifact_registry}/latency-profile:latest container" + } +} \ No newline at end of file diff --git a/benchmarks/benchmark/tools/profile-generator/container/Dockerfile b/benchmarks/benchmark/tools/profile-generator/container/Dockerfile new file mode 100644 index 000000000..6d49f511e --- /dev/null +++ b/benchmarks/benchmark/tools/profile-generator/container/Dockerfile @@ -0,0 +1,21 @@ +FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev + +RUN apt-get update -y \ + && apt-get install -y python3-pip git vim curl wget +RUN pip3 install --upgrade pip +RUN pip install packaging torch transformers +WORKDIR /workspace + +# install build and runtime dependencies +COPY requirements.txt requirements.txt +RUN pip install -r requirements.txt + +RUN pip install -U "huggingface_hub[cli]" + +RUN wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + +COPY benchmark_serving.py benchmark_serving.py +COPY latency_throughput_curve.sh latency_throughput_curve.sh + +RUN chmod +x latency_throughput_curve.sh +RUN chmod +x benchmark_serving.py \ No newline at end of file diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py new file mode 100644 index 000000000..a05226aa6 --- /dev/null +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -0,0 +1,469 @@ +r"""Benchmark LLM serving throughput and latency. +This script is for sending requests with prompts to LLM server and benchmark +the latency and throughput at various request rates. It is a modified version of +https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_serving.py. +It currently supports TGI, vLLM, Triton TensorRT-LLM and Saxml. 
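+It also supports the JetStream and naive HuggingFace Transformers backends.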
+""" + +import argparse +import asyncio +import json +import random +import time +from typing import AsyncGenerator, List, Tuple + +import aiohttp +import numpy as np +from transformers import AutoTokenizer +from transformers import PreTrainedTokenizerBase + + +# (prompt len, output len, latency) +REQUEST_LATENCY: List[Tuple[int, int, float]] = [] + +MIN_SEQ_LEN = 4 +CLIENT_TIMEOUT_SEC = 3 * 60 * 60 +NEW_TEXT_KEY = "\nOutput:\n" + + +def sample_requests( + dataset_path: str, + num_requests: int, + max_input_len: int, + max_output_len: int, + tokenizer: PreTrainedTokenizerBase, + use_dummy_text: bool, +) -> List[Tuple[str, int, int]]: + """Samples requests from the dataset or creates dummy requests.""" + if use_dummy_text: + dummy_prompt_token_ids = [0] * max_input_len + dummy_prompt = tokenizer.decode(dummy_prompt_token_ids) + dummy_requests = [( + dummy_prompt, + max_input_len, + max_output_len, + )] * num_requests + return dummy_requests + + # Load the dataset. + with open(dataset_path) as f: + dataset = json.load(f) + # Filter out the conversations with less than 2 turns. + dataset = [data for data in dataset if len(data["conversations"]) >= 2] + # Only keep the first two turns of each conversation. + dataset = [ + (data["conversations"][0]["value"], data["conversations"][1]["value"]) + for data in dataset + ] + + # Tokenize the prompts and completions. + prompts = [prompt for prompt, _ in dataset] + prompt_token_ids = tokenizer(prompts).input_ids + completions = [completion for _, completion in dataset] + completion_token_ids = tokenizer(completions).input_ids + tokenized_dataset = [] + for i in range(len(dataset)): + output_len = len(completion_token_ids[i]) + tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len)) + + # Filter out too long sequences. + filtered_dataset: List[Tuple[str, int, int]] = [] + for prompt, prompt_token_ids, output_len in tokenized_dataset: + prompt_len = len(prompt_token_ids) + if prompt_len < MIN_SEQ_LEN or output_len < MIN_SEQ_LEN: + # Prune too short sequences. + # This is because TGI causes errors when the input or output length + # is too short. + continue + if prompt_len > max_input_len or output_len > max_output_len: + # Prune too long sequences. + continue + filtered_dataset.append((prompt, prompt_len, output_len)) + + # Sample the requests. + sampled_requests = random.sample(filtered_dataset, num_requests) + return sampled_requests + + +async def get_request( + input_requests: List[Tuple[str, int, int]], + request_rate: float, +) -> AsyncGenerator[Tuple[str, int, int], None]: + """Gets request async.""" + input_requests = iter(input_requests) + for request in input_requests: + yield request + + if request_rate == float("inf"): + # If the request rate is infinity, then we don't need to wait. + continue + # Sample the request interval from the exponential distribution. + interval = np.random.exponential(1.0 / request_rate) + # The next request will be sent after the interval. 
+ await asyncio.sleep(interval) + + +async def send_request( + backend: str, + api_url: str, + prompt: str, + prompt_len: int, + output_len: int, + best_of: int, + use_beam_search: bool, + top_k: int, + tokenizer: PreTrainedTokenizerBase, + sax_model: str, +) -> None: + """Sends request to server.""" + request_start_time = time.time() + + headers = {"User-Agent": "Benchmark Client"} + if backend == "vllm": + pload = { + "prompt": prompt, + "n": 1, + "best_of": best_of, + "use_beam_search": use_beam_search, + "temperature": 0.0 if use_beam_search else 1.0, + "top_p": 1.0, + "max_tokens": output_len, + "ignore_eos": False, + "stream": False, + } + elif backend == "tgi": + assert not use_beam_search + params = { + "best_of": best_of, + "max_new_tokens": output_len, + "do_sample": True, + } + pload = { + "inputs": prompt, + "parameters": params, + } + elif backend == "naive_transformers": + # If max_length or top_k is not specified _MAX_LENGTH_DEFAULT = 200 and + # _TOP_K_DEFAULT = 10 in peft/handler.py will be used. + pload = { + "instances": [{ + "prompt": prompt, + "max_length": output_len, + "top_k": top_k, + }] + } + elif backend == "tensorrt_llm_triton": + pload = { + "text_input": prompt, + "max_tokens": output_len, + "beam_width": 1 if not use_beam_search else best_of, + "temperature": 0.0 if use_beam_search else 1.0, + "top_p": 1.0, + "bad_words": "", + "stop_words": "", + "stream": False, + } + elif backend == "sax": + pload = { + "model": sax_model, + "prompt": prompt, + "n": 1, + "best_of": best_of, + "use_beam_search": use_beam_search, + "temperature": 0.0 if use_beam_search else 1.0, + "top_p": 1.0, + "top_k": 50, + "max_tokens": output_len, + "stream": False, + } + elif backend == "jetstream": + pload = { + "prompt": prompt, + "max_tokens": 1, + } + else: + raise ValueError(f"Unknown backend: {backend}") + + # Set client timeout to be 3 hrs. + timeout = aiohttp.ClientTimeout(total=CLIENT_TIMEOUT_SEC) + async with aiohttp.ClientSession(timeout=timeout) as session: + while True: + async with session.post(api_url, headers=headers, json=pload) as response: + chunks = [] + async for chunk, _ in response.content.iter_chunks(): + chunks.append(chunk) + output = b"".join(chunks).decode("utf-8") + output = json.loads(output) + + # Re-send the request if it failed. + if "error" not in output: + break + + request_end_time = time.time() + # Naive HF transformers generation and TensorRT-LLM generation stops at EOS + # tokens and the generation may be shorter than the ground-truth output + # sequence length. 
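+  # For those backends the returned text is re-tokenized below so that
+  # output_len reflects the number of tokens actually generated.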
+ if backend == "naive_transformers": + complete_pred = output["predictions"][0][0]["generated_text"] + new_text_start_index = complete_pred.find(NEW_TEXT_KEY) + len(NEW_TEXT_KEY) + pred = complete_pred[new_text_start_index:] + output_token_ids = tokenizer(pred).input_ids + output_len = len(output_token_ids) - prompt_len + elif backend == "tensorrt_llm_triton": + output_token_ids = tokenizer(output["text_output"]).input_ids + output_len = len(output_token_ids) + elif backend == "sax": + output_token_ids = tokenizer(output["choices"][0]["text"]).input_ids + output_len = len(output_token_ids) + elif backend == "tgi": + output_token_ids = tokenizer(output["generated_text"]).input_ids + output_len = len(output_token_ids) + elif backend == "vllm": + total_token_ids = tokenizer(output["text"][0]).input_ids + new_total_len = len(total_token_ids) + output_len = new_total_len - prompt_len + elif backend == "jetstream": + output_token_ids = tokenizer(output["response"]).input_ids + output_len = len(output_token_ids) + + request_latency = request_end_time - request_start_time + REQUEST_LATENCY.append((prompt_len, output_len, request_latency)) + + +async def benchmark( + backend: str, + api_url: str, + input_requests: List[Tuple[str, int, int]], + best_of: int, + use_beam_search: bool, + request_rate: float, + top_k: int, + tokenizer: PreTrainedTokenizerBase, + sax_model: str, +) -> None: + """Runs benchmark with asynchronous requests.""" + tasks: List[asyncio.Task] = [] + async for request in get_request(input_requests, request_rate): + prompt, prompt_len, output_len = request + task = asyncio.create_task( + send_request( + backend, + api_url, + prompt, + prompt_len, + output_len, + best_of, + use_beam_search, + top_k, + tokenizer, + sax_model, + ) + ) + tasks.append(task) + await asyncio.gather(*tasks) + + +def main(args: argparse.Namespace): + print(args) + random.seed(args.seed) + np.random.seed(args.seed) + + api_url = f"http://{args.host}:{args.port}/{args.endpoint}" + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer, trust_remote_code=args.trust_remote_code + ) + input_requests = sample_requests( + args.dataset, + args.num_prompts, + args.max_input_length, + args.max_output_length, + tokenizer, + args.use_dummy_text, + ) + + benchmark_start_time = time.time() + asyncio.run( + benchmark( + args.backend, + api_url, + input_requests, + args.best_of, + args.use_beam_search, + args.request_rate, + args.top_k, + tokenizer, + args.sax_model, + ) + ) + benchmark_end_time = time.time() + benchmark_time = benchmark_end_time - benchmark_start_time + print(f"Total time: {benchmark_time:.2f} s") + print(f"Requests/min: {60 * args.num_prompts / benchmark_time:.2f}") + + total_output_tokens = np.sum([output_len for _, output_len, _ in + REQUEST_LATENCY]) + output_tokens_per_min = 60 * total_output_tokens / benchmark_time + print(f"Output_tokens/min: {output_tokens_per_min:.2f}") + + total_input_tokens = np.sum([prompt_len for prompt_len, _, _ in + REQUEST_LATENCY]) + input_tokens_per_min = 60 * total_input_tokens / benchmark_time + print(f"Input_tokens/min: {input_tokens_per_min:.2f}") + + total_tokens = total_input_tokens + total_output_tokens + tokens_per_min = 60 * total_tokens / benchmark_time + print(f"Tokens/min: {tokens_per_min:.2f}") + + if args.machine_cost: + print( + "Cost $/1k tokens:" + f" {args.machine_cost * 1000 / (60 * output_tokens_per_min)}" + ) + # NOTE: The latency below includes requests awaiting time on server side. 
+ # It's not comparable with the model inference latency for batch size 1. + avg_latency = np.mean([latency for _, _, latency in REQUEST_LATENCY]) + print( + "Average seconds/request (includes waiting time on server):" + f" {avg_latency:.2f}" + ) + + avg_per_token_latency = np.mean([ + latency / (prompt_len + output_len) + for prompt_len, output_len, latency in REQUEST_LATENCY + ]) + print( + "Average milliseconds/token (includes waiting time on server):" + f" {1000 * avg_per_token_latency:.2f}" + ) + + avg_per_output_token_latency = np.mean( + [latency / output_len for _, output_len, latency in REQUEST_LATENCY] + ) + print( + "Average milliseconds/output_token (includes waiting time on server):" + f" {1000 * avg_per_output_token_latency:.2f}" + ) + + avg_input_len = np.mean( + [prompt_len for prompt_len, _, _ in REQUEST_LATENCY] + ) + print( + "Average input length:" + f" {avg_input_len:.2f}" + ) + + avg_output_len = np.mean( + [output_len for _, output_len, _ in REQUEST_LATENCY] + ) + print( + "Average output length:" + f" {avg_output_len:.2f}" + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Benchmark the online serving throughput." + ) + parser.add_argument( + "--backend", + type=str, + default="vllm", + choices=[ + "vllm", + "tgi", + "naive_transformers", + "tensorrt_llm_triton", + "sax", + "jetstream" + ], + ) + parser.add_argument( + "--sax_model", + type=str, + default="", + help="Model name to send request to at API server for SAX model server.", + ) + parser.add_argument("--endpoint", type=str, default="generate") + parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--port", type=int, default=7080) + parser.add_argument("--dataset", type=str, help="Path to the dataset.") + parser.add_argument( + "--tokenizer", + type=str, + required=True, + help="Name or path of the tokenizer.", + ) + parser.add_argument( + "--best-of", + type=int, + default=1, + help="Generates `best_of` sequences per prompt and returns the best one.", + ) + parser.add_argument("--use-beam-search", action="store_true") + parser.add_argument( + "--num-prompts", + type=int, + default=1000, + help="Number of prompts to process.", + ) + parser.add_argument( + "--max-input-length", + type=int, + default=1024, + help=( + "Maximum number of input tokens for filtering the benchmark dataset." + ), + ) + parser.add_argument( + "--max-output-length", + type=int, + default=1024, + help=( + "Maximum number of input tokens for filtering the benchmark dataset." + ), + ) + parser.add_argument( + "--top-k", + type=int, + default=32000, + help=( + "Number of candidate tokens that are considered at each step of the" + " generation process. 32000 is the vocab_size of Open-LLaMA and" + " LLaMA2 models." + ), + ) + parser.add_argument( + "--request-rate", + type=float, + default=float("inf"), + help=( + "Number of requests per second. If this is inf, " + "then all the requests are sent at time 0. " + "Otherwise, we use Poisson process to synthesize " + "the request arrival times." 
+ ), + ) + parser.add_argument("--seed", type=int, default=0) + parser.add_argument( + "--trust-remote-code", + action="store_true", + help="trust remote code from huggingface", + ) + parser.add_argument( + "--machine-cost", + type=float, + default=None, + help="Machine cost per hour including accelerators (if any)", + ) + parser.add_argument( + "--use-dummy-text", + action="store_true", + help=( + "Whether to use dummy text with length defined by max_input_length" + " and max_output_length." + ), + ) + cmd_args = parser.parse_args() + main(cmd_args) + \ No newline at end of file diff --git a/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh b/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh new file mode 100644 index 000000000..9c9e5ccf5 --- /dev/null +++ b/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +# Copyright 2024 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -o xtrace + +export IP=$IP + +huggingface-cli login --token "$HF_TOKEN" --add-to-git-credential + +for request_rate in $(echo $REQUEST_RATES | tr ',' ' '); do + # TODO: Check if profile already exists, if so then skip + timestamp=$(date +"%Y-%m-%d_%H-%M-%S") + output_file="latency-profile-${timestamp}.txt" + python3 benchmark_serving.py --host="$IP" --port="$PORT" --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer="$TOKENIZER" --request-rate=$request_rate --backend="$BACKEND" --num-prompts=$((request_rate * 30)) --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH > $output_file +done \ No newline at end of file diff --git a/benchmarks/benchmark/tools/profile-generator/container/requirements.txt b/benchmarks/benchmark/tools/profile-generator/container/requirements.txt new file mode 100644 index 000000000..739d46f7d --- /dev/null +++ b/benchmarks/benchmark/tools/profile-generator/container/requirements.txt @@ -0,0 +1,37 @@ +# formatting +yapf==0.32.0 +toml==0.10.2 +ruff==0.1.5 + +# type checking +mypy==0.991 +types-PyYAML +types-requests +types-setuptools + +# testing +pytest +pytest-forked +pytest-asyncio +httpx +einops # required for MPT +flash_attn # required for HuggingFace's llama implementation +openai +requests + +# run +ninja # For faster builds. +psutil +ray >= 2.9 +sentencepiece # Required for LLaMA tokenizer. +numpy +torch == 2.1.1 +transformers >= 4.37.0 # Required for Qwen2 +xformers == 0.0.23 +fastapi +uvicorn[standard] +pydantic >= 2.0 # Required for OpenAI server. 
+aioprometheus[starlette] +pynvml == 11.5.0 +accelerate +aiohttp \ No newline at end of file diff --git a/benchmarks/benchmark/tools/profile-generator/main.tf b/benchmarks/benchmark/tools/profile-generator/main.tf new file mode 100644 index 000000000..292b6122e --- /dev/null +++ b/benchmarks/benchmark/tools/profile-generator/main.tf @@ -0,0 +1,79 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +provider "kubernetes" { + config_path = ( + var.credentials_config.kubeconfig == null + ? null + : pathexpand(var.credentials_config.kubeconfig.path) + ) + config_context = try( + var.credentials_config.kubeconfig.context, null + ) + host = ( + var.credentials_config.fleet_host == null + ? null + : var.credentials_config.fleet_host + ) + token = try(data.google_client_config.identity.0.access_token, null) +} + +data "google_client_config" "identity" { + count = var.credentials_config.fleet_host != null ? 1 : 0 +} + +resource "google_project_service" "cloudbuild" { + count = var.build_latency_profile_generator_image ? 1 : 0 + project = var.project_id + service = "cloudbuild.googleapis.com" + + timeouts { + create = "30m" + update = "40m" + } + + disable_on_destroy = false +} + +# ----- Manual Benchmarking ----- + +module "latency-profile" { + count = var.targets.manual != null ? 1 : 0 + source = "./modules/latency-profile" + + credentials_config = var.credentials_config + namespace = var.namespace + project_id = var.project_id + templates_path = var.templates_path + artifact_registry = var.artifact_registry + inference_server = { + name = var.targets.manual.name + tokenizer = var.targets.manual.tokenizer + service = { + name = var.targets.manual.service_name + port = var.targets.manual.service_port + } + } + max_num_prompts = var.max_num_prompts + max_output_len = var.max_output_len + max_prompt_len = var.max_prompt_len + request_rates = var.request_rates + output_bucket = var.output_bucket + latency_profile_kubernetes_service_account = var.latency_profile_kubernetes_service_account + k8s_hf_secret = var.k8s_hf_secret + hugging_face_secret = var.hugging_face_secret + hugging_face_secret_version = var.hugging_face_secret_version +} \ No newline at end of file diff --git a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf new file mode 100644 index 000000000..694e8c324 --- /dev/null +++ b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf @@ -0,0 +1,65 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +locals { + templates = [ + for f in fileset(local.templates_path, "*tpl") : + "${local.templates_path}/${f}" + ] + templates_path = ( + var.templates_path == null + ? "${path.module}/manifest-templates" + : pathexpand(var.templates_path) + ) + latency-profile-generator-template = "${path.module}/manifest-templates/latency-profile-generator.yaml.tpl" + hugging_face_token_secret = ( + var.hugging_face_secret == null || var.hugging_face_secret_version == null + ? null + : "${var.hugging_face_secret}/versions/${var.hugging_face_secret_version}" + ) +} + +terraform { + required_providers { + kubernetes = { + source = "hashicorp/kubernetes" + version = ">= 2.0" + } + } +} + +data "google_client_config" "identity" { + count = var.credentials_config.fleet_host != null ? 1 : 0 +} + + +resource "kubernetes_manifest" "latency-profile-generator" { + manifest = yamldecode(templatefile(local.latency-profile-generator-template, { + namespace = var.namespace + artifact_registry = var.artifact_registry + inference_server_framework = var.inference_server.name + inference_server_service = var.inference_server.service.name + inference_server_service_port = var.inference_server.service.port + tokenizer = var.inference_server.tokenizer + latency_profile_kubernetes_service_account = var.latency_profile_kubernetes_service_account + max_num_prompts = var.max_num_prompts + max_output_len = var.max_output_len + max_prompt_len = var.max_prompt_len + request_rates = join(",", [for number in var.request_rates : tostring(number)]) + hugging_face_token_secret_list = local.hugging_face_token_secret == null ? [] : [local.hugging_face_token_secret] + k8s_hf_secret_list = var.k8s_hf_secret == null ? 
[] : [var.k8s_hf_secret] + output_bucket = var.output_bucket + })) +} \ No newline at end of file diff --git a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl new file mode 100644 index 000000000..ba75c3ed1 --- /dev/null +++ b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl @@ -0,0 +1,53 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: latency-profile-generator + namespace: ${namespace} + labels: + name: latency-profile-generator +spec: + template: + spec: + serviceAccountName: ${latency_profile_kubernetes_service_account} + restartPolicy: Never + containers: + - name: latency-profile-generator + image: ${artifact_registry}/latency-profile:latest + resources: + limits: + nvidia.com/gpu: 1 + command: ["bash", "-c", "./latency_throughput_curve.sh"] + env: + - name: TOKENIZER + value: ${tokenizer} + - name: IP + value: ${inference_server_service} + - name: PORT + value: ${inference_server_service_port} + - name: BACKEND + value: ${inference_server_framework} + - name: INPUT_LENGTH + value: ${max_prompt_len} + - name: OUTPUT_LENGTH + value: ${max_output_len} + - name: REQUEST_RATES + value: ${request_rates} + - name: OUTPUT_BUCKET + value: ${output_bucket} +%{ for hugging_face_token_secret in hugging_face_token_secret_list ~} + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token + key: HF_TOKEN +%{ endfor ~} +%{ for hf_token in k8s_hf_secret_list ~} + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token + key: HF_TOKEN +%{ endfor ~} + nodeSelector: + cloud.google.com/gke-accelerator: nvidia-l4 + iam.gke.io/gke-metadata-server-enabled: "true" \ No newline at end of file diff --git a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/sample.tfvars b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/sample.tfvars new file mode 100644 index 000000000..9451b6755 --- /dev/null +++ b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/sample.tfvars @@ -0,0 +1,44 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +credentials_config = { + fleet_host = "https://connectgateway.googleapis.com/v1/projects/$PROJECT_NUM/locations/global/gkeMemberships/ai-benchmark" +} + +project_id = "$PROJECT_ID" + +namespace = "benchmark" + +k8s_hf_secret = "hf-token" + +# Latency profile generator service configuration +latency_profile_kubernetes_service_account = "sample-runner-ksa" +output_bucket = "${PROJECT_ID}-benchmark-output" +gcs_path = "gs://${PROJECT_ID}-ai-gke-benchmark-fuse/ShareGPT_V3_unfiltered_cleaned_split_filtered_prompts.txt" + +# Inference server configuration +inference_server = { + deploy = false + name = "tgi" + tokenizer = "tiiuae/falcon-7b" + service = { + name = "tgi", # inference server service name + port = 8000 + } +} + +# Benchmark configuration for Locust Docker accessing inference server +request_rates = [5, 10, 15, 20] \ No newline at end of file diff --git a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/variables.tf b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/variables.tf new file mode 100644 index 000000000..a5dec1259 --- /dev/null +++ b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/variables.tf @@ -0,0 +1,155 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +variable "credentials_config" { + description = "Configure how Terraform authenticates to the cluster." + type = object({ + fleet_host = optional(string) + kubeconfig = optional(object({ + context = optional(string) + path = optional(string, "~/.kube/config") + })) + }) + nullable = false + validation { + condition = ( + (var.credentials_config.fleet_host != null) != + (var.credentials_config.kubeconfig != null) + ) + error_message = "Exactly one of fleet host or kubeconfig must be set." + } +} + +variable "namespace" { + description = "Namespace used for model and benchmarking deployments." + type = string + nullable = false + default = "default" +} + +variable "project_id" { + description = "Project id of existing or created project." + type = string + nullable = false +} + +variable "templates_path" { + description = "Path where manifest templates will be read from. Set to null to use the default manifests" + type = string + default = null +} + +variable "artifact_registry" { + description = "Artifact registry for storing Latency Profile Generator container." + type = string + default = null +} + +# Inference server configuration +variable "inference_server" { + type = object({ + deploy = optional(bool), # Do you want this module to deploy the model server? 
+ name = string, + tokenizer = string, + service = object({ + name = string, + port = number, + }) + }) + nullable = false + + validation { + condition = var.inference_server.name == "vllm" || var.inference_server.name == "tgi" || var.inference_server.name == "tensorrt_llm_triton" || var.inference_server.name == "sax" || var.inference_server.name == "jetstream" + error_message = "The inference_server_framework must be one of: vllm, tgi, tensorrt_llm_triton, sax, or jetstream." + } +} + +variable "max_num_prompts" { + description = "Benchmark server configuration for max number of prompts." + type = number + default = 1000 + validation { + condition = var.max_num_prompts > 0 + error_message = "The max_num_prompts value must be greater than 0." + } +} + +variable "max_output_len" { + description = "Benchmark server configuration for max output length." + type = number + default = 256 + validation { + condition = var.max_output_len > 4 + error_message = "The max_output_len value must be greater than 4. TGI framework throws an error for too short of sequences." + } +} + +variable "max_prompt_len" { + description = "Benchmark server configuration for max prompt length." + type = number + default = 256 + validation { + condition = var.max_prompt_len > 4 + error_message = "The max_prompt_len value must be greater than 4. TGI framework throws an error for too short of sequences." + } +} + +variable "request_rates" { + description = "" + type = list(number) + default = [1, 2] + nullable = false +} + +variable "tokenizer" { + description = "Benchmark server configuration for tokenizer." + type = string + nullable = false + default = "tiiuae/falcon-7b" +} + +variable "output_bucket" { + description = "Bucket name for storing results" + type = string +} + +variable "latency_profile_kubernetes_service_account" { + description = "Kubernetes Service Account to be used for the latency profile generator tool" + type = string + default = "sample-runner-ksa" +} + +// TODO: add validation to make k8s_hf_secret & hugging_face_secret mutually exclusive once terraform is updated with: https://discuss.hashicorp.com/t/experiment-feedback-input-variable-validation-can-cross-reference-other-objects/66644 +variable "k8s_hf_secret" { + description = "Name of secret for huggingface token; stored in k8s " + type = string + nullable = true + default = null +} + +variable "hugging_face_secret" { + description = "name of the kubectl huggingface secret token; stored in Secret Manager. Security considerations: https://kubernetes.io/docs/concepts/security/secrets-good-practices/" + type = string + nullable = true + default = null +} + +variable "hugging_face_secret_version" { + description = "Secret version in Secret Manager" + type = string + nullable = true + default = null +} diff --git a/benchmarks/benchmark/tools/profile-generator/sample.tfvars b/benchmarks/benchmark/tools/profile-generator/sample.tfvars new file mode 100644 index 000000000..dea00ad56 --- /dev/null +++ b/benchmarks/benchmark/tools/profile-generator/sample.tfvars @@ -0,0 +1,42 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +credentials_config = { + kubeconfig = { + path = "~/.kube/config" + } +} + +project_id = "your_project_id" + +# Latency profile generator service configuration +build_latency_profile_generator_image = false +latency_profile_kubernetes_service_account = "prom-frontend-sa" +output_bucket = "your_project_id-benchmark-output-bucket" +k8s_hf_secret = "hf-token" + +# Benchmark configuration for Locust Docker accessing inference server +request_rates = [5, 10, 15, 20] + +# Model server configuration information +targets = { + manual = { + name = "your_model_server_name" + service_name = "your_model_server_service_name" + service_port = "your_model_service_service_port" + tokenizer = "your_tokenizer" + } +} diff --git a/benchmarks/benchmark/tools/profile-generator/variables.tf b/benchmarks/benchmark/tools/profile-generator/variables.tf new file mode 100644 index 000000000..26dd77d85 --- /dev/null +++ b/benchmarks/benchmark/tools/profile-generator/variables.tf @@ -0,0 +1,147 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +variable "credentials_config" { + description = "Configure how Terraform authenticates to the cluster." + type = object({ + fleet_host = optional(string) + kubeconfig = optional(object({ + context = optional(string) + path = optional(string, "~/.kube/config") + })) + }) + nullable = false + validation { + condition = ( + (var.credentials_config.fleet_host != null) != + (var.credentials_config.kubeconfig != null) + ) + error_message = "Exactly one of fleet host or kubeconfig must be set." + } +} + +variable "namespace" { + description = "Namespace used for model and benchmarking deployments." + type = string + nullable = false + default = "default" +} + +variable "project_id" { + description = "Project id of existing or created project." + type = string + nullable = false +} + +variable "templates_path" { + description = "Path where manifest templates will be read from. Set to null to use the default manifests" + type = string + default = null +} + +variable "artifact_registry" { + description = "Artifact registry for storing Latency Profile Generator container." + type = string + default = null +} + +variable "build_latency_profile_generator_image" { + description = "Whether latency profile generator image will be built or not" + type = bool + default = true +} + +variable "max_num_prompts" { + description = "Benchmark server configuration for max number of prompts." 
+ type = number + default = 1000 + validation { + condition = var.max_num_prompts > 0 + error_message = "The max_num_prompts value must be greater than 0." + } +} + +variable "max_output_len" { + description = "Benchmark server configuration for max output length." + type = number + default = 256 + validation { + condition = var.max_output_len > 4 + error_message = "The max_output_len value must be greater than 4. TGI framework throws an error for too short of sequences." + } +} + +variable "max_prompt_len" { + description = "Benchmark server configuration for max prompt length." + type = number + default = 256 + validation { + condition = var.max_prompt_len > 4 + error_message = "The max_prompt_len value must be greater than 4. TGI framework throws an error for too short of sequences." + } +} + +variable "request_rates" { + description = "" + type = list(number) + default = [1, 2] + nullable = false +} + +variable "output_bucket" { + description = "Bucket name for storing results" + type = string +} + +variable "latency_profile_kubernetes_service_account" { + description = "Kubernetes Service Account to be used for the latency profile generator tool" + type = string + default = "sample-runner-ksa" +} + +// TODO: add validation to make k8s_hf_secret & hugging_face_secret mutually exclusive once terraform is updated with: https://discuss.hashicorp.com/t/experiment-feedback-input-variable-validation-can-cross-reference-other-objects/66644 +variable "k8s_hf_secret" { + description = "Name of secret for huggingface token; stored in k8s " + type = string + nullable = true + default = null +} + +variable "hugging_face_secret" { + description = "name of the kubectl huggingface secret token; stored in Secret Manager. Security considerations: https://kubernetes.io/docs/concepts/security/secrets-good-practices/" + type = string + nullable = true + default = null +} + +variable "hugging_face_secret_version" { + description = "Secret version in Secret Manager" + type = string + nullable = true + default = null +} + +variable "targets" { + description = "Model server(s) targeted for benchmarking, use 'manual' for already installed model servers" + type = object({ + manual = object({ + name = string + service_name = string + service_port = number + tokenizer = string + }) + }) +} \ No newline at end of file