From 12f3c3167a1549787302359bfb4f268a63c1bf81 Mon Sep 17 00:00:00 2001
From: Robert Yang <141875536+ydm-amazon@users.noreply.github.com>
Date: Mon, 21 Oct 2024 11:45:54 -0700
Subject: [PATCH 1/5] Update chatglm doc (#2440)
---
serving/docs/lmi/user_guides/trt_llm_user_guide.md | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/serving/docs/lmi/user_guides/trt_llm_user_guide.md b/serving/docs/lmi/user_guides/trt_llm_user_guide.md
index b78ff6667..94f9f1dc0 100644
--- a/serving/docs/lmi/user_guides/trt_llm_user_guide.md
+++ b/serving/docs/lmi/user_guides/trt_llm_user_guide.md
@@ -94,7 +94,7 @@ In that situation, there is nothing LMI can do until the issue is fixed in the b
| option.enable_trt_overlap | >= 0.25.0 | Pass Through | Parameter to overlap the execution of batches of requests. It may have a negative impact on performance when the number of requests is too small. During our experiments, we saw more negative impact with this turned on than off. | `true`, `false`. Default is `false` |
| option.enable_kv_cache_reuse | >= 0.26.0 | Pass Through | This feature is only supported for GPT-like models on TRTLLM (as of 0.7.1) and requires compiling the model with `--use_paged_context_fmha`. Lets the LLM remember the last used input KV cache and try to reuse it in the next run, giving blazing fast first token latency. This is typically helpful for document understanding and chat applications that usually share the same input prefix. The TRTLLM backend will remember the prefix tree of the input and reuse most of it for the next generation. However, this does come with the cost of extra GPU memory. | `true`, `false`. Default is `false` |
| option.baichuan_model_version | >= 0.26.0 | Pass Through | Parameter exclusively for Baichuan models to specify the model version. Requires the HF Baichuan checkpoint path. For v1_13b, use either baichuan-inc/Baichuan-13B-Chat or baichuan-inc/Baichuan-13B-Base. For v2_13b, use either baichuan-inc/Baichuan2-13B-Chat or baichuan-inc/Baichuan2-13B-Base. More Baichuan models can be found on baichuan-inc. | `v1_7b`, `v1_13b`, `v2_7b`, `v2_13b`. Default is `v1_13b` |
-| option.chatglm_model_version | >= 0.26.0 | Pass Through | Parameter exclusive to ChatGLM models to specify the exact model type. Required for ChatGLM models. | `chatglm_6b`, `chatglm2_6b`, `chatglm2_6b_32k`, `chatglm3_6b`, `chatglm3_6b_base`, `chatglm3_6b_32k`, `glm_10b`. Default is `unspecified`, which will throw an error. |
+| option.chatglm_model_version | >= 0.26.0 | Pass Through | Parameter exclusive to ChatGLM models to specify the exact model type. Required for ChatGLM models. | `glm`, `chatglm`, `chatglm2`, `chatglm3`, `glm4`. Default is `None`, which will let TensorRT-LLM automatically infer the model type. |
| option.gpt_model_version | >= 0.26.0 | Pass Through | Parameter exclusive to GPT2 models to specify the exact model type. Required for GPT2 models. | `gpt2`, `santacoder`, `starcoder`. Default is `gpt2`. |
| option.multi_block_mode | >= 0.26.0 | Pass Through | Split long KV sequences into multiple blocks (applied to generation MHA kernels). It is beneficial when `batch x num_heads` cannot fully utilize the GPU. This is **not** supported for the qwen model type. | `true`, `false`. Default is `false` |
| option.use_fused_mlp | >= 0.26.0 | Pass Through | Enable horizontal fusion in GatedMLP, which reduces layer input traffic and potentially improves performance for large Llama models (e.g. llama-2-70b). This option is only supported for the Llama model type. | `true`, `false`. Default is `false` |
@@ -103,8 +103,8 @@ In that situation, there is nothing LMI can do until the issue is fixed in the b
| option.rotary_scaling_type option.rotary_scaling_factor | >= 0.26.0 | Pass Through | Rotary scaling parameters. These two options should always be set together to prevent errors. These are supported for llama, qwen and internlm models. | The value of `rotary_scaling_type` can be either `linear` or `dynamic`. The value of `rotary_scaling_factor` can be any value larger than 1.0. Default is `None`. |
| option.logits_dtype | >= 0.28.0 | Pass Through | Datatype of logits; only applies for the T5 model. | `fp16`, `fp32`. Default is `fp32` |
| option.trtllm_checkpoint_path | >= 0.28.0 | Pass Through | Specifies the location where the checkpoint artifacts are placed. Not recommended to set. | Default is `/tmp/trtllm_{model_name}_ckpt/` |
-| option.num_checkpoint_workers | >= 0.27.0 | Pass Through | Specifies the number of workers used to perform checkpoint conversion. | Default is `tensor_parallel_degree * pipeline_parallel_degree` |
-| option.num_engine_workers | >= 0.27.0 | Pass Through | Specifies the number of workers used to perform engine build. | Default is `number of available CUDA devices` |
+| option.num_checkpoint_workers | >= 0.27.0 | Pass Through | Specifies the number of workers used to perform checkpoint conversion. | Default is `tensor_parallel_degree * pipeline_parallel_degree` |
+| option.num_engine_workers | >= 0.27.0 | Pass Through | Specifies the number of workers used to perform engine build. | Default is `number of available CUDA devices` |
| option.load_by_shard | >= 0.28.0 | Pass Through | Sharding during compilation - only for Falcon 40B model | `true`, `false`. Default is `false` |
| Advanced parameters: SmoothQuant |
| option.quantize | >= 0.26.0 | Pass Through | Currently only supports `smoothquant` for Llama, Mistral, InternLM and Baichuan models with just in time compilation mode. | `smoothquant` |
@@ -119,7 +119,7 @@ In that situation, there is nothing LMI can do until the issue is fixed in the b
| option.q_format | >= 0.27.0 | Pass Through | This is only applied when `option.quantize` is set to `awq`. The AWQ format to use. Currently only `int4_awq` is supported. | Default value is `int4_awq` |
| option.calib_size | >= 0.27.0 | Pass Through | This is applied when `option.quantize` is set to `awq`. Number of samples for calibration. | Default is `512` |
| option.calib_batch_size | >= 0.28.0 | Pass Through | This is applied when `option.quantize` is set to `awq`. Batch size for calibration. | Default is `32` |
-| option.awq_block_size | >= 0.28.0 | Pass Through | This is applied when `option.quantize` is set to `awq`. Block (group) size for AWQ quantization. | Default is `128` |
+| option.awq_block_size | >= 0.28.0 | Pass Through | This is applied when `option.quantize` is set to `awq`. Block (group) size for AWQ quantization. | Default is `128` |
| Advanced parameters: FP8 |
| option.quantize | >= 0.26.0 | Pass Through | Currently only supports `fp8` for Llama, Mistral, Mixtral, Baichuan, Gemma, and GPT2 models with just in time compilation mode. | `fp8` |
| option.use_fp8_context_fmha | >= 0.28.0 | Pass Through | Paged attention for fp8; should only be turned on for p5 instances | `true`, `false`. Default is `false` |
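
To illustrate how the revised `option.chatglm_model_version` values are supplied in practice, here is a minimal sketch in the style of the properties dictionaries used in tests/integration/llm/prepare.py; the model id and the other values are placeholders, not part of this patch.

```python
# Hypothetical serving properties for a ChatGLM model on the TRT-LLM backend.
# Only the option names mirror the documentation above; the values are examples.
chatglm_example = {
    "option.model_id": "THUDM/chatglm3-6b",  # placeholder checkpoint
    "option.tensor_parallel_degree": 1,
    # Accepted values after this change: glm, chatglm, chatglm2, chatglm3, glm4.
    # Omitting the option leaves it as None, letting TensorRT-LLM infer the type.
    "option.chatglm_model_version": "chatglm3",
}
```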
From 034a01eb691bc2bc2680d1c054b88be80daab892 Mon Sep 17 00:00:00 2001
From: Tyler Osterberg
Date: Mon, 21 Oct 2024 15:55:57 -0700
Subject: [PATCH 2/5] [fix] neuron unit test aot lcnc test script (#2474)
---
tests/integration/tests.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/integration/tests.py b/tests/integration/tests.py
index 0224d4d1a..b92d4277b 100644
--- a/tests/integration/tests.py
+++ b/tests/integration/tests.py
@@ -729,7 +729,7 @@ def test_gpt2_quantize(self):
@pytest.mark.parametrize(
"model",
- ["tiny-llama-rb-aot", "tiny-llama-rb-aot-quant", "tiny-llama-lcnc"])
+ ["tiny-llama-rb-aot", "tiny-llama-rb-aot-quant", "tiny-llama-rb-lcnc"])
def test_partition(self, model):
with Runner('pytorch-inf2', f'partition-{model}') as r:
try:
From 27ca2001c291206553928f4b0042ecb684ff4676 Mon Sep 17 00:00:00 2001
From: Siddharth Venkatesan
Date: Mon, 21 Oct 2024 16:07:18 -0700
Subject: [PATCH 3/5] [ci][trt] add llama3-1 test for trtllm (#2470)
---
tests/integration/llm/client.py | 7 ++++++-
tests/integration/llm/prepare.py | 6 +++++-
tests/integration/tests.py | 6 ++++++
3 files changed, 17 insertions(+), 2 deletions(-)
diff --git a/tests/integration/llm/client.py b/tests/integration/llm/client.py
index 29334b690..b321db3d5 100644
--- a/tests/integration/llm/client.py
+++ b/tests/integration/llm/client.py
@@ -686,7 +686,12 @@ def get_model_name():
"batch_size": [1, 8],
"seq_length": [256],
"tokenizer": "google/flan-t5-xl",
- }
+ },
+ "llama-3-1-8b": {
+ "batch_size": [1, 8],
+ "seq_length": [256],
+ "tokenizer": "NousResearch/Meta-Llama-3.1-8B",
+ },
}
trtllm_chat_model_spec = {
diff --git a/tests/integration/llm/prepare.py b/tests/integration/llm/prepare.py
index e2150fac1..f7659a5c8 100644
--- a/tests/integration/llm/prepare.py
+++ b/tests/integration/llm/prepare.py
@@ -986,7 +986,11 @@
"flan-t5-xl": {
"option.model_id": "s3://djl-llm/flan-t5-xl/",
"option.dtype": "bf16"
- }
+ },
+ "llama-3-1-8b": {
+ "option.model_id": "s3://djl-llm/llama-3.1-8b-hf/",
+ "option.tensor_parallel_degree": 4,
+ },
}
correctness_model_list = {
diff --git a/tests/integration/tests.py b/tests/integration/tests.py
index b92d4277b..00fb2fcb0 100644
--- a/tests/integration/tests.py
+++ b/tests/integration/tests.py
@@ -309,6 +309,12 @@ def test_santacoder(self):
r.launch("CUDA_VISIBLE_DEVICES=0,1,2,3")
client.run("trtllm santacoder".split())
+ def test_llama_31_8b(self):
+ with Runner('tensorrt-llm', 'llama-3-1-8b') as r:
+ prepare.build_trtllm_handler_model('llama-3-1-8b')
+ r.launch()
+ client.run("trtllm llama-3-1-8b".split())
+
@pytest.mark.trtllm
@pytest.mark.gpu_4
From f83a2911a8db3db0d95f04e6d9a38cf22f3b1f1a Mon Sep 17 00:00:00 2001
From: Siddharth Venkatesan
Date: Mon, 21 Oct 2024 16:11:46 -0700
Subject: [PATCH 4/5] [lmi] avoid creating huggingface model config in rolling
 batch cases … (#2473)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
engines/python/setup/djl_python/huggingface.py | 12 +++++++-----
.../properties_manager/vllm_rb_properties.py | 2 +-
.../rolling_batch/lmi_dist_rolling_batch.py | 7 ++++---
tests/integration/llm/prepare.py | 3 ++-
4 files changed, 14 insertions(+), 10 deletions(-)
diff --git a/engines/python/setup/djl_python/huggingface.py b/engines/python/setup/djl_python/huggingface.py
index 2a997aa56..833de3581 100644
--- a/engines/python/setup/djl_python/huggingface.py
+++ b/engines/python/setup/djl_python/huggingface.py
@@ -126,22 +126,22 @@ def __init__(self):
def initialize(self, properties: dict):
self.hf_configs = HuggingFaceProperties(**properties)
- self._read_model_config(self.hf_configs.model_id_or_path,
- self.hf_configs.is_peft_model)
-
if is_rolling_batch_enabled(self.hf_configs.rolling_batch):
_rolling_batch_cls = get_rolling_batch_class_from_str(
self.hf_configs.rolling_batch.value)
- self.hf_configs.kwargs["model_config"] = self.model_config
self.rolling_batch = _rolling_batch_cls(
self.hf_configs.model_id_or_path, properties,
**self.hf_configs.kwargs)
self.tokenizer = self.rolling_batch.get_tokenizer()
elif is_streaming_enabled(self.hf_configs.enable_streaming):
+ self._read_model_config(self.hf_configs.model_id_or_path,
+ self.hf_configs.is_peft_model)
self._init_tokenizer(self.hf_configs.model_id_or_path)
self._init_model(self.hf_configs.model_id_or_path,
**self.hf_configs.kwargs)
else:
+ self._read_model_config(self.hf_configs.model_id_or_path,
+ self.hf_configs.is_peft_model)
if not self.hf_configs.task:
self.hf_configs.task = self.infer_task_from_model_architecture(
)
@@ -460,6 +460,8 @@ def _read_model_config(self, model_config_path: str, is_peft_model: bool):
)
def get_image_token(self):
+ if self.model_config is None:
+ return None
model_type = self.model_config.model_type
if model_type == "phi3_v":
return "<|image_{}|>"
@@ -479,7 +481,7 @@ def get_image_token(self):
if model_type == "qwen2_vl":
return "<|vision_start|><|image_pad|><|vision_end|>"
- logging.warning(
+ logging.debug(
"could not infer image token from the model artifacts. Using as default."
)
return ""
diff --git a/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py b/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py
index 6ff1f254d..b4c2dc1f9 100644
--- a/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py
+++ b/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py
@@ -36,7 +36,7 @@ class VllmRbProperties(Properties):
dtype: Optional[str] = "auto"
load_format: Optional[str] = "auto"
quantize: Optional[VllmQuantizeMethods] = None
- tensor_parallel_degree: Optional[int] = None
+ tensor_parallel_degree: int = 1
pipeline_parallel_degree: int = 1
max_rolling_batch_prefill_tokens: Optional[int] = None
# Adjustable prefix model length for certain 32k or longer model
diff --git a/engines/python/setup/djl_python/rolling_batch/lmi_dist_rolling_batch.py b/engines/python/setup/djl_python/rolling_batch/lmi_dist_rolling_batch.py
index 9cad2744f..931f2e44f 100644
--- a/engines/python/setup/djl_python/rolling_batch/lmi_dist_rolling_batch.py
+++ b/engines/python/setup/djl_python/rolling_batch/lmi_dist_rolling_batch.py
@@ -18,6 +18,7 @@
from lmi_dist.api import Request, RequestParams
from lmi_dist.arg_utils import VllmEngineArgs
from lmi_dist.init_engine import engine_from_args
+from lmi_dist.seq2seq_engine import Seq2SeqPreprocessor
from vllm import SamplingParams
from djl_python.rolling_batch.rolling_batch import RollingBatch, stop_on_any_exception, filter_unused_generation_params
@@ -47,8 +48,6 @@ def __init__(self, model_id_or_path: str, properties: dict, **kwargs):
:param properties (dict): other properties of the model, such as decoder strategy
"""
self.lmi_dist_config = LmiDistRbProperties(**properties)
- self.model_type = getattr(kwargs.get("model_config", None),
- "model_type", None)
super().__init__(self.lmi_dist_config)
self.supports_speculative_decoding = supports_speculative_decoding()
engine_kwargs = {}
@@ -106,6 +105,8 @@ def __init__(self, model_id_or_path: str, properties: dict, **kwargs):
self.request_cache = OrderedDict()
self.lora_ids = defaultdict(lambda: len(self.lora_ids) + 1)
self.is_mistral_tokenizer = self.lmi_dist_config.tokenizer_mode == 'mistral'
+ self.is_t5_model = isinstance(self.engine.preprocessor,
+ Seq2SeqPreprocessor)
def reset(self) -> None:
"""
@@ -116,7 +117,7 @@ def reset(self) -> None:
super().reset()
def get_tokenizer(self):
- if "t5" == self.model_type:
+ if self.is_t5_model:
return self.engine.preprocessor.tokenizer
return self.engine.preprocessor.tokenizer.tokenizer
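
Tokenizer selection now keys off the engine preprocessor's type instead of a `model_type` string passed through kwargs. A minimal sketch of that dispatch follows; the import mirrors the one added in this patch, while the helper itself is illustrative only.

```python
from lmi_dist.seq2seq_engine import Seq2SeqPreprocessor  # same import as in this patch


def pick_tokenizer(engine):
    # Seq2seq (T5-style) engines expose the tokenizer directly on the preprocessor;
    # decoder-only engines wrap it one level deeper.
    if isinstance(engine.preprocessor, Seq2SeqPreprocessor):
        return engine.preprocessor.tokenizer
    return engine.preprocessor.tokenizer.tokenizer
```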
diff --git a/tests/integration/llm/prepare.py b/tests/integration/llm/prepare.py
index f7659a5c8..56b087c19 100644
--- a/tests/integration/llm/prepare.py
+++ b/tests/integration/llm/prepare.py
@@ -870,7 +870,8 @@
"option.max_rolling_batch_size": 16,
"option.tokenizer_mode": "mistral",
"option.limit_mm_per_prompt": "image=4",
- "option.entryPoint": "djl_python.huggingface"
+ "option.entryPoint": "djl_python.huggingface",
+ "option.tensor_parallel_degree": "max"
},
"llama32-11b-multimodal": {
"option.model_id": "s3://djl-llm/llama-3-2-11b-vision-instruct/",
From b68b9f0a87db4d685d004243ce9097dc28566bd3 Mon Sep 17 00:00:00 2001
From: Tyler Osterberg
Date: Mon, 21 Oct 2024 16:26:13 -0700
Subject: [PATCH 5/5] [fix] neuron unit test smart default handler and error
handling (#2475)
---
.../neuron_smart_default_utils.py | 6 +++++
.../test_transformers_neuronx.py | 24 -------------------
2 files changed, 6 insertions(+), 24 deletions(-)
diff --git a/engines/python/setup/djl_python/neuron_utils/neuron_smart_default_utils.py b/engines/python/setup/djl_python/neuron_utils/neuron_smart_default_utils.py
index 2b56f1b9b..b6347560e 100644
--- a/engines/python/setup/djl_python/neuron_utils/neuron_smart_default_utils.py
+++ b/engines/python/setup/djl_python/neuron_utils/neuron_smart_default_utils.py
@@ -191,6 +191,12 @@ def set_internal_settings(self, properties: Dict[str, Any],
n_positions, param_bytes, model_config) * 0.95 /
(1024.0 * 1024.0 * 1024.0))
+ if self.model_size_in_gb == 0 or self.sequence_size_in_gb == 0 or n_positions == 0:
+ raise Exception(
+ f"Failed to compute model size or sequence size or n_positions: {n_positions},"
+ f"model_size_in_gb: {self.model_size_in_gb}, sequence_size_in_gb: {self.sequence_size_in_gb}"
+ )
+
def get_adjusted_model_size_in_gb(self, tp_degree: int) -> float:
return self.model_size_in_gb * (1.0 + ((tp_degree * 2 - 2) / 100.0))
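
For scale, the `get_adjusted_model_size_in_gb` context line above adds roughly two percent of overhead per tensor-parallel rank beyond the first. A quick illustrative check, with made-up numbers rather than anything measured:

```python
def adjusted_model_size_in_gb(model_size_in_gb: float, tp_degree: int) -> float:
    # Mirrors the formula in the surrounding context: +2% per TP rank beyond one.
    return model_size_in_gb * (1.0 + ((tp_degree * 2 - 2) / 100.0))


# A hypothetical 14 GB model sharded across 8 NeuronCores is budgeted as
# 14 * 1.14 = 15.96 GB before the new zero-size sanity check raises.
print(adjusted_model_size_in_gb(14.0, 8))  # ~15.96
```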
diff --git a/engines/python/setup/djl_python/tests/neuron_test_scripts/test_transformers_neuronx.py b/engines/python/setup/djl_python/tests/neuron_test_scripts/test_transformers_neuronx.py
index 457c6d3c3..ef5688a1a 100644
--- a/engines/python/setup/djl_python/tests/neuron_test_scripts/test_transformers_neuronx.py
+++ b/engines/python/setup/djl_python/tests/neuron_test_scripts/test_transformers_neuronx.py
@@ -210,30 +210,6 @@ def test_partition(self, params):
self.service.config.save_mp_checkpoint_path)
self.assertTrue(self.service.initialized)
- @parameters([{
- "initial_value": 512,
- "smart_default": 512
- }, {
- "initial_value": 8192,
- "smart_default": 4096
- }])
- def test_smart_defaults(self, params):
- # Setup
- self.default_properties.pop('n_positions')
- test_properties = self.default_properties
- self.service.config = self.config_builder(test_properties)
- self.service.model_config = AutoConfig.from_pretrained(
- test_properties['model_id'])
- self.service.model_config.max_position_embeddings = params[
- 'initial_value']
-
- # Test
- self.service.set_max_position_embeddings()
-
- # Evaluate
- self.assertEqual(self.service.config.n_positions,
- params['smart_default'])
-
def tearDown(self):
del self.service
del self.default_properties