Hex-LLM supports prefix caching as a GA feature
PiperOrigin-RevId: 712616644
vertex-mg-bot authored and copybara-github committed Jan 9, 2025
1 parent 9f6ad84 commit df9ce70
Showing 8 changed files with 44 additions and 0 deletions.
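The same change is applied across all eight deployment notebooks: the deploy helper gains an enable_prefix_cache_hbm parameter (exposed as a Colab form boolean that defaults to True), and when it is set and the experimental disaggregated topology is not in use, --enable_prefix_cache_hbm is appended to the Hex-LLM server arguments. The standalone helper below is a minimal sketch of that argument-building logic; the function name build_hexllm_args and its defaults are illustrative assumptions, and only the two flag strings mirror the diff.

from typing import List, Optional

def build_hexllm_args(
    enable_prefix_cache_hbm: bool = False,
    disagg_topology: Optional[str] = None,
) -> List[str]:
    """Sketch of the subset of Hex-LLM server args touched by this commit."""
    hexllm_args: List[str] = []
    if disagg_topology:
        # Experimental disaggregated serving, e.g. "3,1" for 3 prefill and 1 decode workers.
        hexllm_args.append(f"--disagg_topo={disagg_topology}")
    if enable_prefix_cache_hbm and not disagg_topology:
        # Prefix caching in HBM is only requested when disaggregated serving is off.
        hexllm_args.append("--enable_prefix_cache_hbm")
    return hexllm_args

print(build_hexllm_args(enable_prefix_cache_hbm=True))                         # ['--enable_prefix_cache_hbm']
print(build_hexllm_args(enable_prefix_cache_hbm=True, disagg_topology="3,1"))  # ['--disagg_topo=3,1']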
@@ -305,6 +305,9 @@
" is_for_training=False,\n",
")\n",
"\n",
"# @markdown Set enable_prefix_cache_hbm to False if you don't want to use [prefix caching](https://cloud.google.com/vertex-ai/generative-ai/docs/open-models/use-hex-llm#prefix-caching).\n",
"enable_prefix_cache_hbm = True # @param {type:\"boolean\"}\n",
"\n",
"# Server parameters.\n",
"hbm_utilization_factor = 0.6 # A larger value improves throughput but gives higher risk of TPU out-of-memory errors with long prompts.\n",
"max_running_seqs = 256\n",
@@ -327,6 +330,7 @@
" hbm_utilization_factor: float = 0.6,\n",
" max_running_seqs: int = 256,\n",
" max_model_len: int = 4096,\n",
" enable_prefix_cache_hbm: bool = False,\n",
" endpoint_id: str = \"\",\n",
" min_replica_count: int = 1,\n",
" max_replica_count: int = 1,\n",
@@ -367,6 +371,8 @@
" ]\n",
" if disagg_topology:\n",
" hexllm_args.append(f\"--disagg_topo={disagg_topology}\")\n",
" if enable_prefix_cache_hbm and not disagg_topology:\n",
" hexllm_args.append(\"--enable_prefix_cache_hbm\")\n",
"\n",
" env_vars = {\n",
" \"MODEL_ID\": base_model_id,\n",
@@ -417,6 +423,7 @@
" tensor_parallel_size=tensor_parallel_size,\n",
" hbm_utilization_factor=hbm_utilization_factor,\n",
" max_running_seqs=max_running_seqs,\n",
" enable_prefix_cache_hbm=enable_prefix_cache_hbm,\n",
" min_replica_count=min_replica_count,\n",
" max_replica_count=max_replica_count,\n",
" use_dedicated_endpoint=use_dedicated_endpoint,\n",
@@ -294,6 +294,7 @@
" hbm_utilization_factor: float = 0.6,\n",
" max_running_seqs: int = 256,\n",
" max_model_len: int = 4096,\n",
" enable_prefix_cache_hbm: bool = False,\n",
" endpoint_id: str = \"\",\n",
" min_replica_count: int = 1,\n",
" max_replica_count: int = 1,\n",
@@ -334,6 +335,8 @@
" ]\n",
" if disagg_topology:\n",
" hexllm_args.append(f\"--disagg_topo={disagg_topology}\")\n",
" if enable_prefix_cache_hbm and not disagg_topology:\n",
" hexllm_args.append(\"--enable_prefix_cache_hbm\")\n",
"\n",
" env_vars = {\n",
" \"MODEL_ID\": base_model_id,\n",
@@ -290,6 +290,9 @@
" is_for_training=False,\n",
")\n",
"\n",
"# @markdown Set enable_prefix_cache_hbm to False if you don't want to use [prefix caching](https://cloud.google.com/vertex-ai/generative-ai/docs/open-models/use-hex-llm#prefix-caching).\n",
"enable_prefix_cache_hbm = True # @param {type:\"boolean\"}\n",
"\n",
"# Server parameters.\n",
"hbm_utilization_factor = 0.6 # A larger value improves throughput but gives higher risk of TPU out-of-memory errors with long prompts.\n",
"max_running_seqs = 256\n",
@@ -316,6 +319,7 @@
" hbm_utilization_factor: float = 0.6,\n",
" max_running_seqs: int = 256,\n",
" max_model_len: int = 4096,\n",
" enable_prefix_cache_hbm: bool = False,\n",
" endpoint_id: str = \"\",\n",
" min_replica_count: int = 1,\n",
" max_replica_count: int = 1,\n",
@@ -356,6 +360,8 @@
" ]\n",
" if disagg_topology:\n",
" hexllm_args.append(f\"--disagg_topo={disagg_topology}\")\n",
" if enable_prefix_cache_hbm and not disagg_topology:\n",
" hexllm_args.append(\"--enable_prefix_cache_hbm\")\n",
"\n",
" env_vars = {\n",
" \"MODEL_ID\": base_model_id,\n",
@@ -404,6 +410,7 @@
" machine_type=machine_type,\n",
" hbm_utilization_factor=hbm_utilization_factor,\n",
" max_running_seqs=max_running_seqs,\n",
" enable_prefix_cache_hbm=enable_prefix_cache_hbm,\n",
" min_replica_count=min_replica_count,\n",
" max_replica_count=max_replica_count,\n",
" use_dedicated_endpoint=use_dedicated_endpoint,\n",
@@ -1213,6 +1213,7 @@
" hbm_utilization_factor: float = 0.6,\n",
" max_running_seqs: int = 256,\n",
" max_model_len: int = 4096,\n",
" enable_prefix_cache_hbm: bool = False,\n",
" endpoint_id: str = \"\",\n",
" min_replica_count: int = 1,\n",
" max_replica_count: int = 1,\n",
@@ -1253,6 +1254,8 @@
" ]\n",
" if disagg_topology:\n",
" hexllm_args.append(f\"--disagg_topo={disagg_topology}\")\n",
" if enable_prefix_cache_hbm and not disagg_topology:\n",
" hexllm_args.append(\"--enable_prefix_cache_hbm\")\n",
"\n",
" env_vars = {\n",
" \"MODEL_ID\": base_model_id,\n",
@@ -1293,6 +1296,9 @@
" return model, endpoint\n",
"\n",
"\n",
"# @markdown Set enable_prefix_cache_hbm to False if you don't want to use [prefix caching](https://cloud.google.com/vertex-ai/generative-ai/docs/open-models/use-hex-llm#prefix-caching).\n",
"enable_prefix_cache_hbm = True # @param {type:\"boolean\"}\n",
"\n",
"if LOAD_MODEL_FROM != \"Kaggle\":\n",
" print(\"Skipped: Expect to load model from Kaggle, got\", LOAD_MODEL_FROM)\n",
"else:\n",
@@ -1337,6 +1343,7 @@
" tokens_pad_multiple=tokens_pad_multiple,\n",
" seqs_pad_multiple=seqs_pad_multiple,\n",
" use_dedicated_endpoint=use_dedicated_endpoint,\n",
" enable_prefix_cache_hbm=enable_prefix_cache_hbm,\n",
" )\n",
" print(\"endpoint_name:\", endpoints[\"hexllm_tpu\"].name)"
]
@@ -641,6 +641,7 @@
" hbm_utilization_factor: float = 0.6,\n",
" max_running_seqs: int = 256,\n",
" max_model_len: int = 4096,\n",
" enable_prefix_cache_hbm: bool = False,\n",
" endpoint_id: str = \"\",\n",
" min_replica_count: int = 1,\n",
" max_replica_count: int = 1,\n",
@@ -681,6 +682,8 @@
" ]\n",
" if disagg_topology:\n",
" hexllm_args.append(f\"--disagg_topo={disagg_topology}\")\n",
" if enable_prefix_cache_hbm and not disagg_topology:\n",
" hexllm_args.append(\"--enable_prefix_cache_hbm\")\n",
"\n",
" env_vars = {\n",
" \"MODEL_ID\": base_model_id,\n",
@@ -268,6 +268,9 @@
"# Note: 1 TPU V5 chip has only one core.\n",
"tpu_type = \"TPU_V5e\"\n",
"\n",
"# @markdown Set enable_prefix_cache_hbm to False if you don't want to use [prefix caching](https://cloud.google.com/vertex-ai/generative-ai/docs/open-models/use-hex-llm#prefix-caching).\n",
"enable_prefix_cache_hbm = True # @param {type:\"boolean\"}\n",
"\n",
"# @markdown Set the disaggregated topology to balance the TTFT and TPOT.\n",
"# @markdown This is an **experimental** feature and is only supported for single host deployments.\n",
"# @markdown If want to enable the feature, set this parameter to a string of the form `\"num_prefill_workers,num_decode_workers\"`, like `\"3,1\"`.\n",
@@ -321,6 +324,7 @@
" hbm_utilization_factor: float = 0.6,\n",
" max_running_seqs: int = 256,\n",
" max_model_len: int = 4096,\n",
" enable_prefix_cache_hbm: bool = False,\n",
" endpoint_id: str = \"\",\n",
" min_replica_count: int = 1,\n",
" max_replica_count: int = 1,\n",
@@ -361,6 +365,8 @@
" ]\n",
" if disagg_topology:\n",
" hexllm_args.append(f\"--disagg_topo={disagg_topology}\")\n",
" if enable_prefix_cache_hbm and not disagg_topology:\n",
" hexllm_args.append(\"--enable_prefix_cache_hbm\")\n",
"\n",
" env_vars = {\n",
" \"MODEL_ID\": base_model_id,\n",
@@ -413,6 +419,7 @@
" hbm_utilization_factor=hbm_utilization_factor,\n",
" max_running_seqs=max_running_seqs,\n",
" max_model_len=max_model_len,\n",
" enable_prefix_cache_hbm=enable_prefix_cache_hbm,\n",
" min_replica_count=min_replica_count,\n",
" max_replica_count=max_replica_count,\n",
" use_dedicated_endpoint=use_dedicated_endpoint,\n",
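The disaggregated-topology note in the hunk above describes the expected string format ("num_prefill_workers,num_decode_workers", e.g. "3,1"). As a minimal illustration of that format only, the snippet below parses such a string into worker counts; the helper parse_disagg_topology is an assumption for illustration, not code from the notebooks or the Hex-LLM server.

from typing import Tuple

def parse_disagg_topology(disagg_topology: str) -> Tuple[int, int]:
    """Splits a topology string such as "3,1" into (prefill, decode) worker counts."""
    prefill_workers, decode_workers = (int(part) for part in disagg_topology.split(","))
    return prefill_workers, decode_workers

print(parse_disagg_topology("3,1"))  # (3, 1)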
@@ -283,6 +283,9 @@
" is_for_training=False,\n",
")\n",
"\n",
"# @markdown Set enable_prefix_cache_hbm to False if you don't want to use [prefix caching](https://cloud.google.com/vertex-ai/generative-ai/docs/open-models/use-hex-llm#prefix-caching).\n",
"enable_prefix_cache_hbm = True # @param {type:\"boolean\"}\n",
"\n",
"# Server parameters.\n",
"tensor_parallel_size = tpu_count\n",
"\n",
@@ -311,6 +314,7 @@
" hbm_utilization_factor: float = 0.6,\n",
" max_running_seqs: int = 256,\n",
" max_model_len: int = 4096,\n",
" enable_prefix_cache_hbm: bool = False,\n",
" endpoint_id: str = \"\",\n",
" min_replica_count: int = 1,\n",
" max_replica_count: int = 1,\n",
@@ -351,6 +355,8 @@
" ]\n",
" if disagg_topology:\n",
" hexllm_args.append(f\"--disagg_topo={disagg_topology}\")\n",
" if enable_prefix_cache_hbm and not disagg_topology:\n",
" hexllm_args.append(\"--enable_prefix_cache_hbm\")\n",
"\n",
" env_vars = {\n",
" \"MODEL_ID\": base_model_id,\n",
@@ -402,6 +408,7 @@
" hbm_utilization_factor=hbm_utilization_factor,\n",
" max_running_seqs=max_running_seqs,\n",
" max_model_len=max_model_len,\n",
" enable_prefix_cache_hbm=enable_prefix_cache_hbm,\n",
" min_replica_count=min_replica_count,\n",
" max_replica_count=max_replica_count,\n",
" use_dedicated_endpoint=use_dedicated_endpoint,\n",
@@ -569,6 +569,7 @@
" hbm_utilization_factor: float = 0.6,\n",
" max_running_seqs: int = 256,\n",
" max_model_len: int = 4096,\n",
" enable_prefix_cache_hbm: bool = False,\n",
" endpoint_id: str = \"\",\n",
" min_replica_count: int = 1,\n",
" max_replica_count: int = 1,\n",
@@ -609,6 +610,8 @@
" ]\n",
" if disagg_topology:\n",
" hexllm_args.append(f\"--disagg_topo={disagg_topology}\")\n",
" if enable_prefix_cache_hbm and not disagg_topology:\n",
" hexllm_args.append(\"--enable_prefix_cache_hbm\")\n",
"\n",
" env_vars = {\n",
" \"MODEL_ID\": base_model_id,\n",
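As context for the new flag: prefix caching lets the server reuse cached KV state for requests that share a long common prefix, such as a fixed system prompt (see the prefix-caching documentation linked in the notebook cells above). The snippet below only illustrates that request pattern; the payload shape and field names are assumptions, not the notebooks' prediction code.

# Illustrative only: two requests sharing a long prompt prefix, the pattern that
# benefits from --enable_prefix_cache_hbm. The payload shape is an assumption.
SHARED_SYSTEM_PROMPT = "You are a concise assistant for internal support tickets. " * 20

def build_payload(question: str) -> dict:
    return {"instances": [{"prompt": SHARED_SYSTEM_PROMPT + question, "max_tokens": 128}]}

payloads = [
    build_payload("Summarize ticket A."),
    build_payload("Summarize ticket B."),
]
print(len(payloads))  # 2 requests, one shared cacheable prefix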
