cherry-picking some commits from main branch to dlc branch #2476

Merged: 5 commits, Oct 21, 2024
Changes from all commits
engines/python/setup/djl_python/huggingface.py (12 changes: 7 additions & 5 deletions)
@@ -126,22 +126,22 @@ def __init__(self):

    def initialize(self, properties: dict):
        self.hf_configs = HuggingFaceProperties(**properties)
        self._read_model_config(self.hf_configs.model_id_or_path,
                                self.hf_configs.is_peft_model)

        if is_rolling_batch_enabled(self.hf_configs.rolling_batch):
            _rolling_batch_cls = get_rolling_batch_class_from_str(
                self.hf_configs.rolling_batch.value)
            self.hf_configs.kwargs["model_config"] = self.model_config
            self.rolling_batch = _rolling_batch_cls(
                self.hf_configs.model_id_or_path, properties,
                **self.hf_configs.kwargs)
            self.tokenizer = self.rolling_batch.get_tokenizer()
        elif is_streaming_enabled(self.hf_configs.enable_streaming):
            self._read_model_config(self.hf_configs.model_id_or_path,
                                    self.hf_configs.is_peft_model)
            self._init_tokenizer(self.hf_configs.model_id_or_path)
            self._init_model(self.hf_configs.model_id_or_path,
                             **self.hf_configs.kwargs)
        else:
            self._read_model_config(self.hf_configs.model_id_or_path,
                                    self.hf_configs.is_peft_model)
            if not self.hf_configs.task:
                self.hf_configs.task = self.infer_task_from_model_architecture(
                )
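Note: the hunk above hoists the _read_model_config call ahead of the branching, so self.model_config is populated before a rolling-batch backend is constructed and can be forwarded through kwargs. The following is a minimal sketch of that pattern, not part of the diff; the class and argument names are simplified stand-ins, not the real handler.

from transformers import AutoConfig


class ExampleService:
    """Hedged sketch: read the config once, then hand it to whichever backend is chosen."""

    def initialize(self, model_id: str, use_rolling_batch: bool):
        # Load the HuggingFace config exactly once, before any branch needs it.
        self.model_config = AutoConfig.from_pretrained(model_id)
        backend_kwargs = {"model_config": self.model_config}
        if use_rolling_batch:
            # Rolling-batch backends receive the already-loaded config via kwargs.
            return ("rolling_batch", backend_kwargs)
        # Other paths can still rely on self.model_config being set.
        return ("pipeline", backend_kwargs)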
@@ -460,6 +460,8 @@ def _read_model_config(self, model_config_path: str, is_peft_model: bool):
            )

    def get_image_token(self):
        if self.model_config is None:
            return None
        model_type = self.model_config.model_type
        if model_type == "phi3_v":
            return "<|image_{}|>"
@@ -479,7 +481,7 @@ def get_image_token(self):
if model_type == "qwen2_vl":
return "<|vision_start|><|image_pad|><|vision_end|>"

logging.warning(
logging.debug(
"could not infer image token from the model artifacts. Using <image> as default."
)
return "<image>"
@@ -191,6 +191,12 @@ def set_internal_settings(self, properties: Dict[str, Any],
            n_positions, param_bytes, model_config) * 0.95 /
            (1024.0 * 1024.0 * 1024.0))

        if self.model_size_in_gb == 0 or self.sequence_size_in_gb == 0 or n_positions == 0:
            raise Exception(
                f"Failed to compute model size or sequence size or n_positions: {n_positions},"
                f"model_size_in_gb: {self.model_size_in_gb}, sequence_size_in_gb: {self.sequence_size_in_gb}"
            )

    def get_adjusted_model_size_in_gb(self, tp_degree: int) -> float:
        return self.model_size_in_gb * (1.0 + ((tp_degree * 2 - 2) / 100.0))

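Note: the expression above converts a byte count to GiB and scales it by 0.95, and the added check rejects degenerate results instead of letting a zero flow into later smart-default math. A small illustration, not part of the diff, with made-up numbers:

# Hypothetical byte count, only to show the unit conversion used above:
sequence_size_bytes = 2 * 4096 * 4096 * 2
sequence_size_in_gb = sequence_size_bytes * 0.95 / (1024.0 * 1024.0 * 1024.0)
print(f"{sequence_size_in_gb:.3f} GiB")  # ~0.059 GiB for this made-up input

# With the new guard, a zero in any of the derived quantities now raises
# immediately rather than producing a bad smart default later on.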
@@ -36,7 +36,7 @@ class VllmRbProperties(Properties):
dtype: Optional[str] = "auto"
load_format: Optional[str] = "auto"
quantize: Optional[VllmQuantizeMethods] = None
tensor_parallel_degree: Optional[int] = None
tensor_parallel_degree: int = 1
pipeline_parallel_degree: int = 1
max_rolling_batch_prefill_tokens: Optional[int] = None
# Adjustable prefix model length for certain 32k or longer model
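Note: changing tensor_parallel_degree from Optional[int] = None to int = 1 gives consumers a concrete degree instead of a None to special-case. A hedged sketch of the effect, not part of the diff, using a stand-in pydantic model rather than the real VllmRbProperties:

from typing import Optional

from pydantic import BaseModel


class ExampleRbProperties(BaseModel):
    # Before: Optional[int] = None forced every consumer to decide what None meant.
    # After: a default of 1 means "no tensor parallelism" unless explicitly configured.
    tensor_parallel_degree: int = 1
    pipeline_parallel_degree: int = 1
    max_rolling_batch_prefill_tokens: Optional[int] = None


props = ExampleRbProperties()
world_size = props.tensor_parallel_degree * props.pipeline_parallel_degree  # 1, no None check needed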
@@ -18,6 +18,7 @@
from lmi_dist.api import Request, RequestParams
from lmi_dist.arg_utils import VllmEngineArgs
from lmi_dist.init_engine import engine_from_args
from lmi_dist.seq2seq_engine import Seq2SeqPreprocessor
from vllm import SamplingParams

from djl_python.rolling_batch.rolling_batch import RollingBatch, stop_on_any_exception, filter_unused_generation_params
@@ -47,8 +48,6 @@ def __init__(self, model_id_or_path: str, properties: dict, **kwargs):
        :param properties (dict): other properties of the model, such as decoder strategy
        """
        self.lmi_dist_config = LmiDistRbProperties(**properties)
        self.model_type = getattr(kwargs.get("model_config", None),
                                  "model_type", None)
        super().__init__(self.lmi_dist_config)
        self.supports_speculative_decoding = supports_speculative_decoding()
        engine_kwargs = {}
@@ -106,6 +105,8 @@ def __init__(self, model_id_or_path: str, properties: dict, **kwargs):
        self.request_cache = OrderedDict()
        self.lora_ids = defaultdict(lambda: len(self.lora_ids) + 1)
        self.is_mistral_tokenizer = self.lmi_dist_config.tokenizer_mode == 'mistral'
        self.is_t5_model = isinstance(self.engine.preprocessor,
                                      Seq2SeqPreprocessor)

    def reset(self) -> None:
        """
@@ -116,7 +117,7 @@ def reset(self) -> None:
        super().reset()

    def get_tokenizer(self):
        if "t5" == self.model_type:
        if self.is_t5_model:
            return self.engine.preprocessor.tokenizer
        return self.engine.preprocessor.tokenizer.tokenizer

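Note: get_tokenizer above now keys off the engine's preprocessor type (Seq2SeqPreprocessor) instead of a model_type == "t5" string, so the check follows behavior rather than a specific model name. A minimal sketch of the dispatch, not part of the diff, using a stand-in class rather than the real lmi_dist one:

class StandInSeq2SeqPreprocessor:
    """Stand-in for lmi_dist.seq2seq_engine.Seq2SeqPreprocessor: holds the tokenizer directly."""

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer


def get_tokenizer(preprocessor):
    # An isinstance check on the preprocessor replaces the old '"t5" == self.model_type'
    # comparison; seq2seq engines expose the tokenizer one level shallower than
    # decoder-only engines do.
    if isinstance(preprocessor, StandInSeq2SeqPreprocessor):
        return preprocessor.tokenizer
    return preprocessor.tokenizer.tokenizer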
@@ -210,30 +210,6 @@ def test_partition(self, params):
            self.service.config.save_mp_checkpoint_path)
        self.assertTrue(self.service.initialized)

    @parameters([{
        "initial_value": 512,
        "smart_default": 512
    }, {
        "initial_value": 8192,
        "smart_default": 4096
    }])
    def test_smart_defaults(self, params):
        # Setup
        self.default_properties.pop('n_positions')
        test_properties = self.default_properties
        self.service.config = self.config_builder(test_properties)
        self.service.model_config = AutoConfig.from_pretrained(
            test_properties['model_id'])
        self.service.model_config.max_position_embeddings = params[
            'initial_value']

        # Test
        self.service.set_max_position_embeddings()

        # Evaluate
        self.assertEqual(self.service.config.n_positions,
                         params['smart_default'])

    def tearDown(self):
        del self.service
        del self.default_properties