Skip to content

Commit

Permalink
vLLM supports GPU HBM + host memory prefix kv caching
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 715213919
  • Loading branch information
vertex-mg-bot authored and copybara-github committed Jan 14, 2025
1 parent 2709fcd commit 9ed8896
Show file tree
Hide file tree
Showing 18 changed files with 192 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -570,6 +570,8 @@
" enforce_eager: bool = False,\n",
" enable_lora: bool = False,\n",
" enable_chunked_prefill: bool = False,\n",
" enable_prefix_cache: bool = False,\n",
" host_prefix_kv_cache_utilization_target: float = 0.0,\n",
" max_loras: int = 1,\n",
" max_cpu_loras: int = 8,\n",
" use_dedicated_endpoint: bool = False,\n",
Expand Down Expand Up @@ -616,6 +618,14 @@
" if enable_chunked_prefill:\n",
" vllm_args.append(\"--enable-chunked-prefill\")\n",
"\n",
" if enable_prefix_cache:\n",
" vllm_args.append(\"--enable-prefix-caching\")\n",
"\n",
" if 0 < host_prefix_kv_cache_utilization_target < 1:\n",
" vllm_args.append(\n",
" f\"--host-prefix-kv-cache-utilization-target={host_prefix_kv_cache_utilization_target}\"\n",
" )\n",
"\n",
" if model_type:\n",
" vllm_args.append(f\"--model-type={model_type}\")\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,8 @@
" enforce_eager: bool = False,\n",
" enable_lora: bool = False,\n",
" enable_chunked_prefill: bool = False,\n",
" enable_prefix_cache: bool = False,\n",
" host_prefix_kv_cache_utilization_target: float = 0.0,\n",
" max_loras: int = 1,\n",
" max_cpu_loras: int = 8,\n",
" use_dedicated_endpoint: bool = False,\n",
Expand Down Expand Up @@ -713,6 +715,14 @@
" if enable_chunked_prefill:\n",
" vllm_args.append(\"--enable-chunked-prefill\")\n",
"\n",
" if enable_prefix_cache:\n",
" vllm_args.append(\"--enable-prefix-caching\")\n",
"\n",
" if 0 < host_prefix_kv_cache_utilization_target < 1:\n",
" vllm_args.append(\n",
" f\"--host-prefix-kv-cache-utilization-target={host_prefix_kv_cache_utilization_target}\"\n",
" )\n",
"\n",
" if model_type:\n",
" vllm_args.append(f\"--model-type={model_type}\")\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -695,6 +695,8 @@
" enforce_eager: bool = False,\n",
" enable_lora: bool = False,\n",
" enable_chunked_prefill: bool = False,\n",
" enable_prefix_cache: bool = False,\n",
" host_prefix_kv_cache_utilization_target: float = 0.0,\n",
" max_loras: int = 1,\n",
" max_cpu_loras: int = 8,\n",
" use_dedicated_endpoint: bool = False,\n",
Expand Down Expand Up @@ -741,6 +743,14 @@
" if enable_chunked_prefill:\n",
" vllm_args.append(\"--enable-chunked-prefill\")\n",
"\n",
" if enable_prefix_cache:\n",
" vllm_args.append(\"--enable-prefix-caching\")\n",
"\n",
" if 0 < host_prefix_kv_cache_utilization_target < 1:\n",
" vllm_args.append(\n",
" f\"--host-prefix-kv-cache-utilization-target={host_prefix_kv_cache_utilization_target}\"\n",
" )\n",
"\n",
" if model_type:\n",
" vllm_args.append(f\"--model-type={model_type}\")\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,8 @@
" enforce_eager: bool = False,\n",
" enable_lora: bool = False,\n",
" enable_chunked_prefill: bool = False,\n",
" enable_prefix_cache: bool = False,\n",
" host_prefix_kv_cache_utilization_target: float = 0.0,\n",
" max_loras: int = 1,\n",
" max_cpu_loras: int = 8,\n",
" use_dedicated_endpoint: bool = False,\n",
Expand Down Expand Up @@ -245,6 +247,14 @@
" if enable_chunked_prefill:\n",
" vllm_args.append(\"--enable-chunked-prefill\")\n",
"\n",
" if enable_prefix_cache:\n",
" vllm_args.append(\"--enable-prefix-caching\")\n",
"\n",
" if 0 < host_prefix_kv_cache_utilization_target < 1:\n",
" vllm_args.append(\n",
" f\"--host-prefix-kv-cache-utilization-target={host_prefix_kv_cache_utilization_target}\"\n",
" )\n",
"\n",
" if model_type:\n",
" vllm_args.append(f\"--model-type={model_type}\")\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,8 @@
" enforce_eager: bool = False,\n",
" enable_lora: bool = False,\n",
" enable_chunked_prefill: bool = False,\n",
" enable_prefix_cache: bool = False,\n",
" host_prefix_kv_cache_utilization_target: float = 0.0,\n",
" max_loras: int = 1,\n",
" max_cpu_loras: int = 8,\n",
" use_dedicated_endpoint: bool = False,\n",
Expand Down Expand Up @@ -350,6 +352,14 @@
" if enable_chunked_prefill:\n",
" vllm_args.append(\"--enable-chunked-prefill\")\n",
"\n",
" if enable_prefix_cache:\n",
" vllm_args.append(\"--enable-prefix-caching\")\n",
"\n",
" if 0 < host_prefix_kv_cache_utilization_target < 1:\n",
" vllm_args.append(\n",
" f\"--host-prefix-kv-cache-utilization-target={host_prefix_kv_cache_utilization_target}\"\n",
" )\n",
"\n",
" if model_type:\n",
" vllm_args.append(f\"--model-type={model_type}\")\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,8 @@
" enforce_eager: bool = False,\n",
" enable_lora: bool = False,\n",
" enable_chunked_prefill: bool = False,\n",
" enable_prefix_cache: bool = False,\n",
" host_prefix_kv_cache_utilization_target: float = 0.0,\n",
" max_loras: int = 1,\n",
" max_cpu_loras: int = 8,\n",
" use_dedicated_endpoint: bool = False,\n",
Expand Down Expand Up @@ -415,6 +417,14 @@
" if enable_chunked_prefill:\n",
" vllm_args.append(\"--enable-chunked-prefill\")\n",
"\n",
" if enable_prefix_cache:\n",
" vllm_args.append(\"--enable-prefix-caching\")\n",
"\n",
" if 0 < host_prefix_kv_cache_utilization_target < 1:\n",
" vllm_args.append(\n",
" f\"--host-prefix-kv-cache-utilization-target={host_prefix_kv_cache_utilization_target}\"\n",
" )\n",
"\n",
" if model_type:\n",
" vllm_args.append(f\"--model-type={model_type}\")\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,8 @@
" enforce_eager: bool = False,\n",
" enable_lora: bool = False,\n",
" enable_chunked_prefill: bool = False,\n",
" enable_prefix_cache: bool = False,\n",
" host_prefix_kv_cache_utilization_target: float = 0.0,\n",
" max_loras: int = 1,\n",
" max_cpu_loras: int = 8,\n",
" use_dedicated_endpoint: bool = False,\n",
Expand Down Expand Up @@ -302,6 +304,14 @@
" if enable_chunked_prefill:\n",
" vllm_args.append(\"--enable-chunked-prefill\")\n",
"\n",
" if enable_prefix_cache:\n",
" vllm_args.append(\"--enable-prefix-caching\")\n",
"\n",
" if 0 < host_prefix_kv_cache_utilization_target < 1:\n",
" vllm_args.append(\n",
" f\"--host-prefix-kv-cache-utilization-target={host_prefix_kv_cache_utilization_target}\"\n",
" )\n",
"\n",
" if model_type:\n",
" vllm_args.append(f\"--model-type={model_type}\")\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -681,6 +681,8 @@
" enforce_eager: bool = False,\n",
" enable_lora: bool = False,\n",
" enable_chunked_prefill: bool = False,\n",
" enable_prefix_cache: bool = False,\n",
" host_prefix_kv_cache_utilization_target: float = 0.0,\n",
" max_loras: int = 1,\n",
" max_cpu_loras: int = 8,\n",
" use_dedicated_endpoint: bool = False,\n",
Expand Down Expand Up @@ -727,6 +729,14 @@
" if enable_chunked_prefill:\n",
" vllm_args.append(\"--enable-chunked-prefill\")\n",
"\n",
" if enable_prefix_cache:\n",
" vllm_args.append(\"--enable-prefix-caching\")\n",
"\n",
" if 0 < host_prefix_kv_cache_utilization_target < 1:\n",
" vllm_args.append(\n",
" f\"--host-prefix-kv-cache-utilization-target={host_prefix_kv_cache_utilization_target}\"\n",
" )\n",
"\n",
" if model_type:\n",
" vllm_args.append(f\"--model-type={model_type}\")\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -849,7 +849,7 @@
"hf_model_id = \"meta-llama/\" + base_model_name\n",
"\n",
"# The pre-built serving docker images.\n",
"VLLM_DOCKER_URI = \"us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20241001_0916_RC00\"\n",
"VLLM_DOCKER_URI = \"us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20241210_0916_RC00\"\n",
"\n",
"# @markdown Set use_dedicated_endpoint to False if you don't want to use [dedicated endpoint](https://cloud.google.com/vertex-ai/docs/general/deployment#create-dedicated-endpoint).\n",
"use_dedicated_endpoint = True # @param {type:\"boolean\"}\n",
Expand Down Expand Up @@ -887,6 +887,15 @@
"max_model_len = 8192 # Maximum context length.\n",
"\n",
"\n",
"# Enable automatic prefix caching using GPU HBM\n",
"enable_prefix_cache = True\n",
"# Setting this value > 0 uses idle host memory as a second-tier prefix KV\n",
"# cache beneath the HBM cache. It only has an effect if enable_prefix_cache=True.\n",
"# Valid range: [0, 1).\n",
"# Setting host_prefix_kv_cache_utilization_target to 0 disables the host memory prefix KV cache.\n",
"host_prefix_kv_cache_utilization_target = 0.7\n",
"\n",
"\n",
"def deploy_model_vllm(\n",
" model_name: str,\n",
" model_id: str,\n",
Expand All @@ -902,6 +911,8 @@
" enforce_eager: bool = False,\n",
" enable_lora: bool = False,\n",
" enable_chunked_prefill: bool = False,\n",
" enable_prefix_cache: bool = False,\n",
" host_prefix_kv_cache_utilization_target: float = 0.0,\n",
" max_loras: int = 1,\n",
" max_cpu_loras: int = 8,\n",
" use_dedicated_endpoint: bool = False,\n",
Expand Down Expand Up @@ -948,6 +959,14 @@
" if enable_chunked_prefill:\n",
" vllm_args.append(\"--enable-chunked-prefill\")\n",
"\n",
" if enable_prefix_cache:\n",
" vllm_args.append(\"--enable-prefix-caching\")\n",
"\n",
" if 0 < host_prefix_kv_cache_utilization_target < 1:\n",
" vllm_args.append(\n",
" f\"--host-prefix-kv-cache-utilization-target={host_prefix_kv_cache_utilization_target}\"\n",
" )\n",
"\n",
" if model_type:\n",
" vllm_args.append(f\"--model-type={model_type}\")\n",
"\n",
Expand Down Expand Up @@ -1007,8 +1026,9 @@
" enforce_eager=True,\n",
" enable_lora=ENABLE_DYNAMIC_LORA,\n",
" enable_chunked_prefill=not ENABLE_DYNAMIC_LORA,\n",
" enable_prefix_cache=enable_prefix_cache,\n",
" host_prefix_kv_cache_utilization_target=host_prefix_kv_cache_utilization_target,\n",
" use_dedicated_endpoint=use_dedicated_endpoint,\n",
" model_type=\"llama3.1\",\n",
")\n",
"# @markdown Click \"Show Code\" to see more details."
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -880,6 +880,8 @@
" enforce_eager: bool = False,\n",
" enable_lora: bool = False,\n",
" enable_chunked_prefill: bool = False,\n",
" enable_prefix_cache: bool = False,\n",
" host_prefix_kv_cache_utilization_target: float = 0.0,\n",
" max_loras: int = 1,\n",
" max_cpu_loras: int = 8,\n",
" use_dedicated_endpoint: bool = False,\n",
Expand Down Expand Up @@ -926,6 +928,14 @@
" if enable_chunked_prefill:\n",
" vllm_args.append(\"--enable-chunked-prefill\")\n",
"\n",
" if enable_prefix_cache:\n",
" vllm_args.append(\"--enable-prefix-caching\")\n",
"\n",
" if 0 < host_prefix_kv_cache_utilization_target < 1:\n",
" vllm_args.append(\n",
" f\"--host-prefix-kv-cache-utilization-target={host_prefix_kv_cache_utilization_target}\"\n",
" )\n",
"\n",
" if model_type:\n",
" vllm_args.append(f\"--model-type={model_type}\")\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,8 @@
" enforce_eager: bool = False,\n",
" enable_lora: bool = False,\n",
" enable_chunked_prefill: bool = False,\n",
" enable_prefix_cache: bool = False,\n",
" host_prefix_kv_cache_utilization_target: float = 0.0,\n",
" max_loras: int = 1,\n",
" max_cpu_loras: int = 8,\n",
" use_dedicated_endpoint: bool = False,\n",
Expand Down Expand Up @@ -341,6 +343,14 @@
" if enable_chunked_prefill:\n",
" vllm_args.append(\"--enable-chunked-prefill\")\n",
"\n",
" if enable_prefix_cache:\n",
" vllm_args.append(\"--enable-prefix-caching\")\n",
"\n",
" if 0 < host_prefix_kv_cache_utilization_target < 1:\n",
" vllm_args.append(\n",
" f\"--host-prefix-kv-cache-utilization-target={host_prefix_kv_cache_utilization_target}\"\n",
" )\n",
"\n",
" if model_type:\n",
" vllm_args.append(f\"--model-type={model_type}\")\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -725,6 +725,8 @@
" enforce_eager: bool = False,\n",
" enable_lora: bool = False,\n",
" enable_chunked_prefill: bool = False,\n",
" enable_prefix_cache: bool = False,\n",
" host_prefix_kv_cache_utilization_target: float = 0.0,\n",
" max_loras: int = 1,\n",
" max_cpu_loras: int = 8,\n",
" use_dedicated_endpoint: bool = False,\n",
Expand Down Expand Up @@ -771,6 +773,14 @@
" if enable_chunked_prefill:\n",
" vllm_args.append(\"--enable-chunked-prefill\")\n",
"\n",
" if enable_prefix_cache:\n",
" vllm_args.append(\"--enable-prefix-caching\")\n",
"\n",
" if 0 < host_prefix_kv_cache_utilization_target < 1:\n",
" vllm_args.append(\n",
" f\"--host-prefix-kv-cache-utilization-target={host_prefix_kv_cache_utilization_target}\"\n",
" )\n",
"\n",
" if model_type:\n",
" vllm_args.append(f\"--model-type={model_type}\")\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,8 @@
" enforce_eager: bool = False,\n",
" enable_lora: bool = False,\n",
" enable_chunked_prefill: bool = False,\n",
" enable_prefix_cache: bool = False,\n",
" host_prefix_kv_cache_utilization_target: float = 0.0,\n",
" max_loras: int = 1,\n",
" max_cpu_loras: int = 8,\n",
" use_dedicated_endpoint: bool = False,\n",
Expand Down Expand Up @@ -283,6 +285,14 @@
" if enable_chunked_prefill:\n",
" vllm_args.append(\"--enable-chunked-prefill\")\n",
"\n",
" if enable_prefix_cache:\n",
" vllm_args.append(\"--enable-prefix-caching\")\n",
"\n",
" if 0 < host_prefix_kv_cache_utilization_target < 1:\n",
" vllm_args.append(\n",
" f\"--host-prefix-kv-cache-utilization-target={host_prefix_kv_cache_utilization_target}\"\n",
" )\n",
"\n",
" if model_type:\n",
" vllm_args.append(f\"--model-type={model_type}\")\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -600,6 +600,8 @@
" enforce_eager: bool = False,\n",
" enable_lora: bool = False,\n",
" enable_chunked_prefill: bool = False,\n",
" enable_prefix_cache: bool = False,\n",
" host_prefix_kv_cache_utilization_target: float = 0.0,\n",
" max_loras: int = 1,\n",
" max_cpu_loras: int = 8,\n",
" use_dedicated_endpoint: bool = False,\n",
Expand Down Expand Up @@ -646,6 +648,14 @@
" if enable_chunked_prefill:\n",
" vllm_args.append(\"--enable-chunked-prefill\")\n",
"\n",
" if enable_prefix_cache:\n",
" vllm_args.append(\"--enable-prefix-caching\")\n",
"\n",
" if 0 < host_prefix_kv_cache_utilization_target < 1:\n",
" vllm_args.append(\n",
" f\"--host-prefix-kv-cache-utilization-target={host_prefix_kv_cache_utilization_target}\"\n",
" )\n",
"\n",
" if model_type:\n",
" vllm_args.append(f\"--model-type={model_type}\")\n",
"\n",
Expand Down
10 changes: 10 additions & 0 deletions notebooks/community/model_garden/model_garden_pytorch_llava.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,8 @@
" enforce_eager: bool = False,\n",
" enable_lora: bool = False,\n",
" enable_chunked_prefill: bool = False,\n",
" enable_prefix_cache: bool = False,\n",
" host_prefix_kv_cache_utilization_target: float = 0.0,\n",
" max_loras: int = 1,\n",
" max_cpu_loras: int = 8,\n",
" use_dedicated_endpoint: bool = False,\n",
Expand Down Expand Up @@ -295,6 +297,14 @@
" if enable_chunked_prefill:\n",
" vllm_args.append(\"--enable-chunked-prefill\")\n",
"\n",
" if enable_prefix_cache:\n",
" vllm_args.append(\"--enable-prefix-caching\")\n",
"\n",
" if 0 < host_prefix_kv_cache_utilization_target < 1:\n",
" vllm_args.append(\n",
" f\"--host-prefix-kv-cache-utilization-target={host_prefix_kv_cache_utilization_target}\"\n",
" )\n",
"\n",
" if model_type:\n",
" vllm_args.append(f\"--model-type={model_type}\")\n",
"\n",
Expand Down
Loading

0 comments on commit 9ed8896

Please sign in to comment.