cherry-picking some commits from main branch to dlc branch #2476

Merged: 5 commits, Oct 21, 2024
Changes from all commits
engines/python/setup/djl_python/huggingface.py (12 changes: 7 additions & 5 deletions)
@@ -126,22 +126,22 @@ def __init__(self):

    def initialize(self, properties: dict):
        self.hf_configs = HuggingFaceProperties(**properties)
        self._read_model_config(self.hf_configs.model_id_or_path,
                                self.hf_configs.is_peft_model)

        if is_rolling_batch_enabled(self.hf_configs.rolling_batch):
            _rolling_batch_cls = get_rolling_batch_class_from_str(
                self.hf_configs.rolling_batch.value)
            self.hf_configs.kwargs["model_config"] = self.model_config
            self.rolling_batch = _rolling_batch_cls(
                self.hf_configs.model_id_or_path, properties,
                **self.hf_configs.kwargs)
            self.tokenizer = self.rolling_batch.get_tokenizer()
        elif is_streaming_enabled(self.hf_configs.enable_streaming):
            self._read_model_config(self.hf_configs.model_id_or_path,
                                    self.hf_configs.is_peft_model)
            self._init_tokenizer(self.hf_configs.model_id_or_path)
            self._init_model(self.hf_configs.model_id_or_path,
                             **self.hf_configs.kwargs)
        else:
            self._read_model_config(self.hf_configs.model_id_or_path,
                                    self.hf_configs.is_peft_model)
            if not self.hf_configs.task:
                self.hf_configs.task = self.infer_task_from_model_architecture(
                )
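Note: the hunk above hoists the _read_model_config call ahead of the branching, so self.model_config is populated before a rolling-batch backend is constructed and can be forwarded through kwargs. The following is a minimal sketch of that pattern, not part of the diff; the class and argument names are simplified stand-ins, not the real handler.

from transformers import AutoConfig


class ExampleService:
    """Hedged sketch: read the config once, then hand it to whichever backend is chosen."""

    def initialize(self, model_id: str, use_rolling_batch: bool):
        # Load the HuggingFace config exactly once, before any branch needs it.
        self.model_config = AutoConfig.from_pretrained(model_id)
        backend_kwargs = {"model_config": self.model_config}
        if use_rolling_batch:
            # Rolling-batch backends receive the already-loaded config via kwargs.
            return ("rolling_batch", backend_kwargs)
        # Other paths can still rely on self.model_config being set.
        return ("pipeline", backend_kwargs)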
@@ -460,6 +460,8 @@ def _read_model_config(self, model_config_path: str, is_peft_model: bool):
            )

    def get_image_token(self):
        if self.model_config is None:
            return None
        model_type = self.model_config.model_type
        if model_type == "phi3_v":
            return "<|image_{}|>"
@@ -479,7 +481,7 @@ def get_image_token(self):
if model_type == "qwen2_vl":
return "<|vision_start|><|image_pad|><|vision_end|>"

logging.warning(
logging.debug(
"could not infer image token from the model artifacts. Using <image> as default."
)
return "<image>"
@@ -191,6 +191,12 @@ def set_internal_settings(self, properties: Dict[str, Any],
            n_positions, param_bytes, model_config) * 0.95 /
            (1024.0 * 1024.0 * 1024.0))

        if self.model_size_in_gb == 0 or self.sequence_size_in_gb == 0 or n_positions == 0:
            raise Exception(
                f"Failed to compute model size or sequence size or n_positions: {n_positions},"
                f"model_size_in_gb: {self.model_size_in_gb}, sequence_size_in_gb: {self.sequence_size_in_gb}"
            )

    def get_adjusted_model_size_in_gb(self, tp_degree: int) -> float:
        return self.model_size_in_gb * (1.0 + ((tp_degree * 2 - 2) / 100.0))

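Note: the expression above converts a byte count to GiB and scales it by 0.95, and the added check rejects degenerate results instead of letting a zero flow into later smart-default math. A small illustration, not part of the diff, with made-up numbers:

# Hypothetical byte count, only to show the unit conversion used above:
sequence_size_bytes = 2 * 4096 * 4096 * 2
sequence_size_in_gb = sequence_size_bytes * 0.95 / (1024.0 * 1024.0 * 1024.0)
print(f"{sequence_size_in_gb:.3f} GiB")  # ~0.059 GiB for this made-up input

# With the new guard, a zero in any of the derived quantities now raises
# immediately rather than producing a bad smart default later on.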
@@ -36,7 +36,7 @@ class VllmRbProperties(Properties):
dtype: Optional[str] = "auto"
load_format: Optional[str] = "auto"
quantize: Optional[VllmQuantizeMethods] = None
tensor_parallel_degree: Optional[int] = None
tensor_parallel_degree: int = 1
pipeline_parallel_degree: int = 1
max_rolling_batch_prefill_tokens: Optional[int] = None
# Adjustable prefix model length for certain 32k or longer model
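Note: changing tensor_parallel_degree from Optional[int] = None to int = 1 gives consumers a concrete degree instead of a None to special-case. A hedged sketch of the effect, not part of the diff, using a stand-in pydantic model rather than the real VllmRbProperties:

from typing import Optional

from pydantic import BaseModel


class ExampleRbProperties(BaseModel):
    # Before: Optional[int] = None forced every consumer to decide what None meant.
    # After: a default of 1 means "no tensor parallelism" unless explicitly configured.
    tensor_parallel_degree: int = 1
    pipeline_parallel_degree: int = 1
    max_rolling_batch_prefill_tokens: Optional[int] = None


props = ExampleRbProperties()
world_size = props.tensor_parallel_degree * props.pipeline_parallel_degree  # 1, no None check needed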
@@ -18,6 +18,7 @@
from lmi_dist.api import Request, RequestParams
from lmi_dist.arg_utils import VllmEngineArgs
from lmi_dist.init_engine import engine_from_args
from lmi_dist.seq2seq_engine import Seq2SeqPreprocessor
from vllm import SamplingParams

from djl_python.rolling_batch.rolling_batch import RollingBatch, stop_on_any_exception, filter_unused_generation_params
@@ -47,8 +48,6 @@ def __init__(self, model_id_or_path: str, properties: dict, **kwargs):
        :param properties (dict): other properties of the model, such as decoder strategy
        """
        self.lmi_dist_config = LmiDistRbProperties(**properties)
        self.model_type = getattr(kwargs.get("model_config", None),
                                  "model_type", None)
        super().__init__(self.lmi_dist_config)
        self.supports_speculative_decoding = supports_speculative_decoding()
        engine_kwargs = {}
@@ -106,6 +105,8 @@ def __init__(self, model_id_or_path: str, properties: dict, **kwargs):
        self.request_cache = OrderedDict()
        self.lora_ids = defaultdict(lambda: len(self.lora_ids) + 1)
        self.is_mistral_tokenizer = self.lmi_dist_config.tokenizer_mode == 'mistral'
        self.is_t5_model = isinstance(self.engine.preprocessor,
                                      Seq2SeqPreprocessor)

    def reset(self) -> None:
        """
@@ -116,7 +117,7 @@ def reset(self) -> None:
        super().reset()

    def get_tokenizer(self):
        if "t5" == self.model_type:
        if self.is_t5_model:
            return self.engine.preprocessor.tokenizer
        return self.engine.preprocessor.tokenizer.tokenizer

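Note: get_tokenizer above now keys off the engine's preprocessor type (Seq2SeqPreprocessor) instead of a model_type == "t5" string, so the check follows behavior rather than a specific model name. A minimal sketch of the dispatch, not part of the diff, using a stand-in class rather than the real lmi_dist one:

class StandInSeq2SeqPreprocessor:
    """Stand-in for lmi_dist.seq2seq_engine.Seq2SeqPreprocessor: holds the tokenizer directly."""

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer


def get_tokenizer(preprocessor):
    # An isinstance check on the preprocessor replaces the old '"t5" == self.model_type'
    # comparison; seq2seq engines expose the tokenizer one level shallower than
    # decoder-only engines do.
    if isinstance(preprocessor, StandInSeq2SeqPreprocessor):
        return preprocessor.tokenizer
    return preprocessor.tokenizer.tokenizer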
@@ -210,30 +210,6 @@ def test_partition(self, params):
            self.service.config.save_mp_checkpoint_path)
        self.assertTrue(self.service.initialized)

    @parameters([{
        "initial_value": 512,
        "smart_default": 512
    }, {
        "initial_value": 8192,
        "smart_default": 4096
    }])
    def test_smart_defaults(self, params):
        # Setup
        self.default_properties.pop('n_positions')
        test_properties = self.default_properties
        self.service.config = self.config_builder(test_properties)
        self.service.model_config = AutoConfig.from_pretrained(
            test_properties['model_id'])
        self.service.model_config.max_position_embeddings = params[
            'initial_value']

        # Test
        self.service.set_max_position_embeddings()

        # Evaluate
        self.assertEqual(self.service.config.n_positions,
                         params['smart_default'])

    def tearDown(self):
        del self.service
        del self.default_properties