diff --git a/docs/references/supported_models.md b/docs/references/supported_models.md
index 9dafc3d2a3d..0c9586c671d 100644
--- a/docs/references/supported_models.md
+++ b/docs/references/supported_models.md
@@ -24,7 +24,7 @@
- InternLM 2
- Exaone 3
- BaiChuan2
-- MiniCPM / MiniCPM 3
+- MiniCPM / MiniCPM 3 / MiniCPMV
- XVERSE / XVERSE MoE
- SmolLM
- GLM-4
diff --git a/python/sglang/lang/chat_template.py b/python/sglang/lang/chat_template.py
index 4a774c4fb6b..845e1e52dda 100644
--- a/python/sglang/lang/chat_template.py
+++ b/python/sglang/lang/chat_template.py
@@ -88,7 +88,6 @@ def get_chat_template_by_model_path(model_path):
)
)
-
register_chat_template(
ChatTemplate(
name="claude",
@@ -101,7 +100,6 @@ def get_chat_template_by_model_path(model_path):
)
)
-
register_chat_template(
ChatTemplate(
name="chatml",
@@ -116,7 +114,6 @@ def get_chat_template_by_model_path(model_path):
)
)
-
register_chat_template(
ChatTemplate(
name="chatml-llava",
@@ -132,7 +129,6 @@ def get_chat_template_by_model_path(model_path):
)
)
-
# There is default system prompt for qwen
# reference: https://modelscope.cn/models/qwen/Qwen2-72B-Instruct/file/view/master?fileName=tokenizer_config.json&status=1
# The chat template is: "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
@@ -219,6 +215,21 @@ def get_chat_template_by_model_path(model_path):
)
)
+# https://huggingface.co/openbmb/MiniCPM-V-2_6
+register_chat_template(
+ ChatTemplate(
+ name="minicpmv",
+ default_system_prompt=None,
+ role_prefix_and_suffix={
+ "system": ("", " "),
+ "user": ("user:", " "),
+ "assistant": ("assistant:", ""),
+ },
+ stop_str=("<|im_end|>", "<|endoftext|>"),
+ image_token="(./)",
+ )
+)
+
# The difference between "llama-3-instruct-llava" and "llama-3-instruct" is that llava uses a different image_token.
register_chat_template(
ChatTemplate(
diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py
index a2f9b82844e..1472b0f1694 100644
--- a/python/sglang/srt/configs/model_config.py
+++ b/python/sglang/srt/configs/model_config.py
@@ -393,6 +393,7 @@ def is_multimodal_model(model_architectures: List[str]):
or "LlavaVidForCausalLM" in model_architectures
or "MllamaForConditionalGeneration" in model_architectures
or "Qwen2VLForConditionalGeneration" in model_architectures
+ or "MiniCPMV" in model_architectures
):
return True
else:
diff --git a/python/sglang/srt/conversation.py b/python/sglang/srt/conversation.py
index 60dba87cb08..3a775aa1e95 100644
--- a/python/sglang/srt/conversation.py
+++ b/python/sglang/srt/conversation.py
@@ -452,7 +452,6 @@ def generate_chat_conv(
# Add a blank message for the assistant.
conv.append_message(conv.roles[1], None)
-
return conv
@@ -555,3 +554,17 @@ def generate_chat_conv(
image_token="<|vision_start|><|image_pad|><|vision_end|>",
)
)
+
+# Reference: https://huggingface.co/openbmb/MiniCPM-V-2_6#usage
+register_conv_template(
+ Conversation(
+ name="minicpmv",
+ system_message="You are a helpful assistant",
+ system_template="<|im_start|>system\n{system_message}.",
+ roles=("<|im_start|>user", "<|im_start|>assistant"),
+ sep="<|im_end|>\n",
+ sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
+ stop_str=("<|im_end|>", "<|endoftext|>"),
+ image_token="(./)",
+ )
+)
diff --git a/python/sglang/srt/layers/logits_processor.py b/python/sglang/srt/layers/logits_processor.py
index 7ca1d51a756..5008985e8fe 100644
--- a/python/sglang/srt/layers/logits_processor.py
+++ b/python/sglang/srt/layers/logits_processor.py
@@ -129,7 +129,7 @@ def forward(
hidden_states,
lm_head: VocabParallelEmbedding,
logits_metadata: Union[LogitsMetadata, ForwardBatch],
- ):
+ ) -> LogitsProcessorOutput:
if isinstance(logits_metadata, ForwardBatch):
logits_metadata = LogitsMetadata.from_forward_batch(logits_metadata)
diff --git a/python/sglang/srt/managers/image_processor.py b/python/sglang/srt/managers/image_processor.py
index 7120fa48d52..f47acb1aafd 100644
--- a/python/sglang/srt/managers/image_processor.py
+++ b/python/sglang/srt/managers/image_processor.py
@@ -9,6 +9,8 @@
import numpy as np
import transformers
+from decord import VideoReader, cpu
+from PIL import Image
from sglang.srt.hf_transformers_utils import get_processor
from sglang.srt.mm_utils import expand2square, process_anyres_image
@@ -36,6 +38,7 @@ class BaseImageProcessor(ABC):
def __init__(self, hf_config, server_args, _processor):
self.hf_config = hf_config
self._processor = _processor
+ self.server_args = server_args
self.executor = concurrent.futures.ProcessPoolExecutor(
initializer=init_global_processor,
@@ -229,6 +232,126 @@ async def process_images_async(
return image_inputs
+class MiniCPMVImageProcessor(BaseImageProcessor):
+ def __init__(self, hf_config, server_args, _processor):
+ super().__init__(hf_config, server_args, _processor)
+
+ @staticmethod
+ def _process_images_task(images, input_text):
+ result = global_processor.__call__(
+ text=input_text, images=images, return_tensors="pt"
+ )
+ return {
+ "input_ids": result["input_ids"],
+ "pixel_values": result["pixel_values"],
+ "tgt_sizes": result["tgt_sizes"],
+ "image_bound": result["image_bound"],
+ }
+
+ async def _process_images(self, images, input_text):
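+        # Offload the HF processor call to the process pool so that heavy image
+        # preprocessing does not block the event loop.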
+ if self.executor is not None:
+ loop = asyncio.get_event_loop()
+ image_inputs = await loop.run_in_executor(
+ self.executor,
+ MiniCPMVImageProcessor._process_images_task,
+ images,
+ input_text,
+ )
+ else:
+ image_inputs = self._processor(
+ images=images, text=input_text, return_tensors="pt"
+ )
+
+ return image_inputs
+
+ async def process_images_async(
+ self, image_data: List[Union[str, bytes]], input_text, request_obj
+ ):
+ if not image_data:
+ return None
+
+
+ if not isinstance(image_data, list):
+ image_data = [image_data]
+
+ image_hashes, image_sizes = [], []
+ raw_images = []
+ IMAGE_TOKEN = "(./)"
+
+ def encode_video(video_path):
+ if not os.path.exists(video_path):
+ return []
+            MAX_NUM_FRAMES = 3  # reduce this if CUDA runs out of memory
+
+            def uniform_sample(seq, n):
+                gap = len(seq) / n
+                idxs = [int(i * gap + gap / 2) for i in range(n)]
+                return [seq[i] for i in idxs]
+
+            vr = VideoReader(video_path, ctx=cpu(0))
+            # Sample roughly one frame per second, then cap at MAX_NUM_FRAMES below.
+            sample_fps = round(vr.get_avg_fps())
+            frame_idx = list(range(0, len(vr), sample_fps))
+ if len(frame_idx) > MAX_NUM_FRAMES:
+ frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
+ frames = vr.get_batch(frame_idx).asnumpy()
+ frames = [Image.fromarray(v.astype("uint8")) for v in frames]
+ return frames
+
+ if isinstance(input_text, list):
+ assert len(input_text) and isinstance(input_text[0], int)
+ input_text = self._processor.tokenizer.decode(input_text)
+
+        # MiniCPM-V treats each sampled video frame as a separate image token
+ text_parts = input_text.split(IMAGE_TOKEN)
+ new_text_parts = []
+
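+        # Rebuild the prompt around the IMAGE_TOKEN placeholders so the number of
+        # placeholders matches the number of images / sampled frames passed to the
+        # processor.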
+ for image_index, image in enumerate(image_data):
+ if isinstance(image, str) and image.startswith("video:"):
+ path = image[len("video:") :]
+                frames: List[Image.Image] = encode_video(path)
+                image_sizes += [frames[0].size] * len(frames)
+ else:
+ raw_image, size = load_image(image)
+ image_sizes += [size]
+ frames = [raw_image]
+
+ image_hashes += [hash(image)]
+ raw_images += frames
+ new_text_parts.append(text_parts[image_index])
+ new_text_parts.append(IMAGE_TOKEN * len(frames))
+
+ new_text_parts.append(text_parts[-1])
+ input_text = "".join(new_text_parts)
+ res = await self._process_images(images=raw_images, input_text=input_text)
+ pixel_values = res["pixel_values"]
+ tgt_sizes = res["tgt_sizes"]
+ image_bound = res["image_bound"]
+ input_ids = res["input_ids"]
+
+ # Collect special token ids
+ tokenizer = self._processor.tokenizer
+ im_start_id = [tokenizer.im_start_id]
+ im_end_id = [tokenizer.im_end_id]
+        slice_start_id, slice_end_id = None, None
+        if tokenizer.slice_start_id:
+            slice_start_id = [tokenizer.slice_start_id]
+            slice_end_id = [tokenizer.slice_end_id]
+
+ return {
+ "input_ids": input_ids.flatten().tolist(),
+ "pixel_values": pixel_values,
+ "tgt_sizes": tgt_sizes,
+ "image_hashes": image_hashes,
+ "modalities": request_obj.modalities or ["image"],
+ "im_start_id": im_start_id,
+ "im_end_id": im_end_id,
+ "slice_start_id": slice_start_id,
+ "slice_end_id": slice_end_id,
+ "image_bound": image_bound,
+ }
+
+
class Qwen2VLImageProcessor(BaseImageProcessor):
def __init__(self, hf_config, server_args, _image_processor):
self.hf_config = hf_config
@@ -350,6 +473,8 @@ def get_image_processor(
return MllamaImageProcessor(hf_config, server_args, processor)
elif "Qwen2VLForConditionalGeneration" in hf_config.architectures:
return Qwen2VLImageProcessor(hf_config, server_args, processor.image_processor)
+ elif "MiniCPMV" in hf_config.architectures:
+ return MiniCPMVImageProcessor(hf_config, server_args, processor)
else:
return LlavaImageProcessor(hf_config, server_args, processor.image_processor)
diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
index 3b056cc5d49..9877f338cb7 100644
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -52,7 +52,6 @@
if TYPE_CHECKING:
from sglang.srt.speculative.spec_info import SpecInfo, SpeculativeAlgorithm
-
INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5
# Put some global args for easy access
@@ -67,7 +66,6 @@
"enable_ep_moe": ServerArgs.enable_ep_moe,
}
-
logger = logging.getLogger(__name__)
@@ -148,6 +146,17 @@ class ImageInputs:
image_grid_thws: List[Tuple[int, int, int]] = None
mrope_position_delta: Optional[torch.Tensor] = None
+ # MiniCPMV related
+ # All the images in the batch should share the same special image
+ # bound token ids.
+ im_start_id: Optional[torch.Tensor] = None
+ im_end_id: Optional[torch.Tensor] = None
+ slice_start_id: Optional[torch.Tensor] = None
+ slice_end_id: Optional[torch.Tensor] = None
+ image_bound: Optional[torch.Tensor] = None
+
+ tgt_sizes: Optional[list] = None
+
@staticmethod
def from_dict(obj: dict):
ret = ImageInputs(
@@ -167,6 +176,12 @@ def from_dict(obj: dict):
"aspect_ratio_ids",
"aspect_ratio_mask",
"image_grid_thws",
+ "im_start_id",
+ "im_end_id",
+ "slice_start_id",
+ "slice_end_id",
+ "tgt_sizes",
+ "image_bound",
]
for arg in optional_args:
if arg in obj:
@@ -1136,7 +1151,6 @@ def get_model_worker_batch(self):
global bid
bid += 1
-
return ModelWorkerBatch(
bid=bid,
forward_mode=self.forward_mode,
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index 719db19cd76..e837ea7113f 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -220,7 +220,7 @@ def init_torch_distributed(self):
set_custom_all_reduce(not self.server_args.disable_custom_all_reduce)
if not self.is_draft_worker:
- # Only initilzie the distributed environment on the target model worker.
+ # Only initialize the distributed environment on the target model worker.
init_distributed_environment(
backend=backend,
world_size=self.tp_size,
diff --git a/python/sglang/srt/models/llava.py b/python/sglang/srt/models/llava.py
index c8ce9302b4f..867a470b72c 100644
--- a/python/sglang/srt/models/llava.py
+++ b/python/sglang/srt/models/llava.py
@@ -403,7 +403,7 @@ def forward(
pt += 1
return self.language_model(
- input_ids, positions, forward_batch, input_embeds=input_embeds
+ input_ids, positions, forward_batch, inputs_embeds=input_embeds
)
elif forward_batch.forward_mode.is_decode():
return self.language_model(input_ids, positions, forward_batch)
diff --git a/python/sglang/srt/models/minicpmv.py b/python/sglang/srt/models/minicpmv.py
new file mode 100644
index 00000000000..60b803c119d
--- /dev/null
+++ b/python/sglang/srt/models/minicpmv.py
@@ -0,0 +1,1322 @@
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only MiniCPM-V model compatible with HuggingFace weights."""
+from functools import cached_property, partial
+from typing import (
+ Any,
+ Callable,
+ Iterable,
+ List,
+ Literal,
+ Optional,
+ Tuple,
+ TypedDict,
+ Union,
+)
+
+import torch
+import torch.types
+from PIL import Image
+from torch import nn
+from torch.nn.init import trunc_normal_
+from transformers import PretrainedConfig
+from transformers.models.idefics2.configuration_idefics2 import (
+ Idefics2Config,
+ Idefics2VisionConfig,
+)
+from vllm.distributed import divide, get_tensor_model_parallel_world_size
+from vllm.model_executor.layers.resampler import get_2d_sincos_pos_embed
+from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
+from vllm.model_executor.model_loader.utils import set_default_torch_dtype
+from vllm.model_executor.models.module_mapping import MultiModelKeys
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+
+from sglang.srt.layers.activation import get_act_fn
+from sglang.srt.layers.linear import (
+ ColumnParallelLinear,
+ QKVParallelLinear,
+ ReplicatedLinear,
+ RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.schedule_batch import ImageInputs
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.utils import set_default_torch_dtype
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.qwen2 import Qwen2Config, Qwen2ForCausalLM
+
+RawImageType = Union[Image.Image, torch.Tensor]
+
+
+class MultiHeadAttention(nn.Module):
+ """Multi-headed attention without any cache, used for ViT."""
+
+ def __init__(
+ self,
+ num_heads: int,
+ head_size: int,
+ scale: float,
+ num_kv_heads: Optional[int] = None,
+ ):
+ super().__init__()
+ self.num_heads = num_heads
+ self.head_size = head_size
+ self.scale = scale
+ self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
+
+ def forward(
+ self,
+ query: torch.Tensor,
+ key: torch.Tensor,
+ value: torch.Tensor,
+ forward_batch: ForwardBatch,
+ ) -> torch.Tensor:
+ """Input shape: batch_size x seq_len x hidden_size"""
+ # TODO(Isotr0py): Use existing backend implementations and support FA2
+ bsz, q_len, _ = query.size()
+ kv_len = key.size(1)
+
+        # No KV cache is needed for the ViT, so use torch's fused SDPA kernel
+        # directly; it expects (batch, heads, seq, head_dim).
+        query = query.view(bsz, q_len, self.num_heads, self.head_size).transpose(1, 2)
+        key = key.view(bsz, kv_len, self.num_kv_heads, self.head_size).transpose(1, 2)
+        value = value.view(bsz, kv_len, self.num_kv_heads, self.head_size).transpose(1, 2)
+
+        out = torch.nn.functional.scaled_dot_product_attention(
+            query, key, value, scale=self.scale
+        )
+        return out.transpose(1, 2).reshape(bsz, q_len, -1)
+
+
+class Idefics2VisionAttention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+ def __init__(
+ self,
+ config: Idefics2Config,
+ layer_id: int,
+ quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
+ ) -> None:
+ super().__init__()
+ self.config = config
+ self.embed_dim = config.hidden_size
+ self.num_heads = config.num_attention_heads
+ self.head_dim = self.embed_dim // self.num_heads
+ if self.head_dim * self.num_heads != self.embed_dim:
+ raise ValueError(
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" # noqa: E501
+ f" {self.num_heads})."
+ )
+ self.scale = self.head_dim**-0.5
+ self.dropout = config.attention_dropout
+ self.qkv_proj = QKVParallelLinear(
+ self.embed_dim,
+ self.head_dim,
+ self.num_heads,
+ quant_config=quant_config,
+ prefix=f"{prefix}.qkv_proj",
+ )
+ self.out_proj = RowParallelLinear(
+ self.embed_dim,
+ self.embed_dim,
+ bias=True,
+ quant_config=quant_config,
+ prefix=f"{prefix}.out_proj",
+ )
+ self.tp_size = get_tensor_model_parallel_world_size()
+ self.num_heads_per_partition = divide(self.num_heads, self.tp_size)
+        # TODO: does forward_batch support two attention modules at the same time?
+ # self.attn = RadixAttention(num_heads=self.num_heads_per_partition,
+ # head_dim=self.head_dim,
+ # scaling=self.scale,
+ # num_kv_heads=self.num_heads,
+ # layer_id=layer_id
+ # )
+ self.attn = MultiHeadAttention(
+ self.num_heads_per_partition, self.head_dim, self.scale
+ )
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ forward_batch: ForwardBatch,
+ ) -> torch.Tensor:
+ qkv, _ = self.qkv_proj(
+ hidden_states
+ ) # batch_size, q_len, 3 * num_heads_per_partition * head_dim
+ query_states, key_states, value_states = qkv.chunk(3, dim=-1)
+ out = self.attn(
+ query_states, key_states, value_states, forward_batch=forward_batch
+ )
+ attn_output, _ = self.out_proj(out)
+ return attn_output
+
+
+class Idefics2VisionMLP(nn.Module):
+
+ def __init__(
+ self,
+ config: Idefics2Config,
+ quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
+ ) -> None:
+ super().__init__()
+ self.config = config
+ self.activation_fn = get_act_fn(config.hidden_act)
+ self.fc1 = ColumnParallelLinear(
+ config.hidden_size,
+ config.intermediate_size,
+ bias=True,
+ quant_config=quant_config,
+ prefix=f"{prefix}.fc1",
+ )
+ self.fc2 = RowParallelLinear(
+ config.intermediate_size,
+ config.hidden_size,
+ bias=True,
+ quant_config=quant_config,
+ prefix=f"{prefix}.fc2",
+ )
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ hidden_states, _ = self.fc1(hidden_states)
+ hidden_states = self.activation_fn(hidden_states)
+ hidden_states, _ = self.fc2(hidden_states)
+ return hidden_states
+
+
+class Idefics2EncoderLayer(nn.Module):
+
+ def __init__(
+ self,
+ config: Idefics2Config,
+ layer_id: int,
+ quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
+ ) -> None:
+ super().__init__()
+ self.embed_dim = config.hidden_size
+ self.self_attn = Idefics2VisionAttention(
+ config,
+ quant_config=quant_config,
+ layer_id=layer_id,
+ prefix=f"{prefix}.self_attn",
+ )
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+ self.mlp = Idefics2VisionMLP(
+ config, quant_config=quant_config, prefix=f"{prefix}.mlp"
+ )
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+ def forward(
+ self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
+ ) -> torch.Tensor:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`):
+ Input to the layer of shape `(batch, seq_len, embed_dim)`.
+
+ """
+ residual = hidden_states
+ hidden_states = self.layer_norm1(hidden_states)
+ hidden_states = self.self_attn(hidden_states, forward_batch=forward_batch)
+ hidden_states = residual + hidden_states
+ residual = hidden_states
+ hidden_states = self.layer_norm2(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = residual + hidden_states
+ return hidden_states
+
+
+class Idefics2Encoder(nn.Module):
+ """
+ Transformer encoder consisting of `config.num_hidden_layers` self attention
+ layers. Each layer is a
+ [`Idefics2EncoderLayer`].
+
+ Args:
+ config: Idefics2Config
+ """
+
+ def __init__(
+ self,
+ config: Idefics2Config,
+ quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
+ ) -> None:
+ super().__init__()
+
+ self.config = config
+ self.layers = nn.ModuleList(
+ [
+ Idefics2EncoderLayer(
+ config,
+ quant_config=quant_config,
+ layer_id=layer_id,
+ prefix=f"{prefix}.layers.{layer_id}",
+ )
+ for layer_id in range(config.num_hidden_layers)
+ ]
+ )
+
+ def forward(
+ self,
+ inputs_embeds: torch.Tensor,
+ forward_batch: ForwardBatch,
+ ) -> torch.Tensor:
+ r"""
+ Args:
+ inputs_embeds (torch.Tensor):
+ Optionally, instead of passing `input_ids` you can choose to
+ directly pass an embedded representation.
+ This is useful if you want more control over how to convert
+                `input_ids` indices into associated vectors than the model's
+ internal embedding lookup matrix.
+ """
+ hidden_states = inputs_embeds
+ for encoder_layer in self.layers:
+ layer_outputs = encoder_layer(hidden_states, forward_batch=forward_batch)
+ hidden_states = layer_outputs
+ return hidden_states
+
+
+class Idefics2VisionEmbeddings(nn.Module):
+ """
+    This is a modified version of `siglip.modeling_siglip.SiglipVisionEmbeddings`
+    to enable images of variable resolution.
+
+ The modifications are adapted from [Patch n' Pack: NaViT, a Vision
+ Transformer for any Aspect Ratio and Resolution](https://arxiv.org/abs/2307.06304)
+ which allows treating images in their native aspect ratio and without the
+ need to resize them to the same fixed size. In particular, we start from the
+    original pre-trained SigLIP model (which uses fixed-size square images)
+    and adapt it by training on images of variable resolutions.
+ """
+
+ def __init__(self, config: Idefics2VisionConfig):
+ super().__init__()
+ self.embed_dim = config.hidden_size
+ self.image_size = config.image_size
+ self.patch_size = config.patch_size
+ self.patch_embedding = nn.Conv2d(
+ in_channels=config.num_channels,
+ out_channels=self.embed_dim,
+ kernel_size=self.patch_size,
+ stride=self.patch_size,
+ padding="valid",
+ )
+ self.num_patches_per_side = self.image_size // self.patch_size
+ self.num_patches = self.num_patches_per_side**2
+ self.num_positions = self.num_patches
+ self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+
+ def forward(
+ self,
+ pixel_values: torch.FloatTensor,
+ patch_attention_mask: torch.BoolTensor,
+ forward_batch: ForwardBatch,
+ tgt_sizes: Optional[torch.IntTensor] = None,
+ ) -> torch.Tensor:
+ batch_size, _, max_im_h, max_im_w = pixel_values.shape
+ target_dtype = self.patch_embedding.weight.dtype
+ pixel_values = pixel_values.to(
+ device=self.patch_embedding.weight.device, dtype=target_dtype
+ )
+ patch_embeds = self.patch_embedding(pixel_values)
+ embeddings = patch_embeds.flatten(2).transpose(1, 2)
+ max_nb_patches_h, max_nb_patches_w = (
+ max_im_h // self.patch_size,
+ max_im_w // self.patch_size,
+ )
+ boundaries = torch.arange(
+ 1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side
+ )
+ position_ids = torch.full(
+ size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0
+ )
+
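+        # For each image, map its (possibly non-square) patch grid onto the fixed
+        # num_patches_per_side x num_patches_per_side position table by bucketizing
+        # fractional patch coordinates.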
+ for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
+
+ if tgt_sizes is not None:
+ nb_patches_h = tgt_sizes[batch_idx][0]
+ nb_patches_w = tgt_sizes[batch_idx][1]
+ else:
+ nb_patches_h = p_attn_mask[:, 0].sum()
+ nb_patches_w = p_attn_mask[0].sum()
+ fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
+ fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)
+ bucket_coords_h = torch.bucketize(
+ fractional_coords_h, boundaries, right=True
+ )
+ bucket_coords_w = torch.bucketize(
+ fractional_coords_w, boundaries, right=True
+ )
+ pos_ids = (
+ bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w
+ ).flatten()
+ position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
+ position_ids = position_ids.to(self.position_embedding.weight.device)
+ embeddings = embeddings + self.position_embedding(position_ids)
+ return embeddings
+
+
+class Idefics2VisionTransformer(nn.Module):
+
+ def __init__(
+ self,
+ config: Idefics2VisionConfig,
+ quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
+ ) -> None:
+ super().__init__()
+
+ embed_dim = config.hidden_size
+ self.config = config
+ self.embeddings = Idefics2VisionEmbeddings(config)
+ self.encoder = Idefics2Encoder(
+ config=config, quant_config=quant_config, prefix=f"{prefix}.encoder"
+ )
+ self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+ def get_input_embeddings(self):
+ return self.embeddings
+
+ def forward(
+ self,
+ pixel_values,
+ forward_batch: ForwardBatch,
+ patch_attention_mask: Optional[torch.BoolTensor] = None,
+ tgt_sizes: Optional[torch.IntTensor] = None,
+ ) -> torch.Tensor:
+ hidden_states = self.embeddings(
+ pixel_values=pixel_values,
+ patch_attention_mask=patch_attention_mask,
+ forward_batch=forward_batch,
+ tgt_sizes=tgt_sizes,
+ )
+ encoder_outputs = self.encoder(hidden_states, forward_batch=forward_batch)
+ last_hidden_state = self.post_layernorm(encoder_outputs)
+ return last_hidden_state
+
+
+class MiniCPMVImagePixelInputs(TypedDict):
+ type: Literal["pixel_values"]
+ data: List[torch.Tensor]
+ """
+ Shape: `(batch_size * num_images, num_channels, height, width)`
+
+ Note that the image size may vary, so we pass it as a list
+ instead of a batched tensor.
+ """
+
+ image_bounds: torch.Tensor
+ """
+ Shape: `(batch_size * num_images, 2)`
+
+ This should be in `(start, stop)` format.
+ """
+
+ tgt_sizes: torch.Tensor
+ """
+ Shape: `(batch_size * num_images, 2)`
+
+ This should be in `(height, width)` format.
+ """
+
+
+class MiniCPMVImageEmbeddingInputs(TypedDict):
+ type: Literal["image_embeds"]
+ data: torch.Tensor
+ """
+ Shape: `(batch_size * num_images, image_feature_size, hidden_size)`
+
+    `hidden_size` must match the hidden size of the language model backbone.
+ """
+
+ image_bounds: torch.Tensor
+ """
+ Shape: `(batch_size * num_images, 2)`
+
+ This should be in `(start, stop)` format.
+ """
+
+
+MiniCPMVImageInputs = Union[MiniCPMVImagePixelInputs, MiniCPMVImageEmbeddingInputs]
+
+DEFAULT_LN = partial(nn.LayerNorm, eps=1e-6)
+
+
+class BaseResampler(nn.Module):
+ """
+    A 2D perceiver-resampler network with a single cross-attention layer,
+    (grid_size**2) learnable queries, and 2D sincos positional embeddings.
+ Outputs:
+ A tensor with the shape of (grid_size**2, embed_dim)
+ """
+
+ def __init__(
+ self,
+ num_queries: int,
+ embed_dim: int,
+ num_heads: int,
+ kv_dim: Optional[int] = None,
+ norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
+ do_post_projection: bool = True,
+ quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
+ ) -> None:
+ super().__init__()
+
+ self.num_queries = num_queries
+ self.embed_dim = embed_dim
+ self.num_heads = num_heads
+
+ self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim))
+ trunc_normal_(self.query, std=0.02)
+ if kv_dim is not None and kv_dim != embed_dim:
+ self.kv_proj = ReplicatedLinear(
+ kv_dim,
+ embed_dim,
+ bias=False,
+ quant_config=quant_config,
+ prefix=f"{prefix}.kv_proj",
+ )
+ else:
+ # Maintain the same return value with ReplicatedLinear.forward
+ self.kv_proj = lambda *args, **kwargs: ( # type: ignore # noqa
+ nn.Identity()(*args, **kwargs),
+ None,
+ )
+ self.attn = nn.MultiheadAttention(embed_dim, num_heads)
+ self.ln_q = norm_layer(embed_dim)
+ self.ln_kv = norm_layer(embed_dim)
+ self.do_post_projection = do_post_projection
+ self.ln_post = norm_layer(embed_dim) if do_post_projection else None
+ self.proj = (
+ nn.Parameter((embed_dim**-0.5) * torch.randn(embed_dim, embed_dim))
+ if do_post_projection
+ else None
+ )
+
+ def _init_weights(self, m: nn.Module) -> None:
+ if isinstance(m, nn.Linear):
+ trunc_normal_(m.weight, std=0.02)
+ if isinstance(m, nn.Linear) and m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.LayerNorm):
+ nn.init.constant_(m.bias, 0)
+ nn.init.constant_(m.weight, 1.0)
+
+ def _repeat(self, query, N: int):
+ return query.unsqueeze(1).repeat(1, N, 1)
+
+
+class Resampler2_5(BaseResampler):
+
+ def __init__(
+ self,
+ num_queries: int,
+ embed_dim: int,
+ num_heads: int,
+ kv_dim: Optional[int] = None,
+ norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
+ max_size: Tuple[int, int] = (70, 70),
+ quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
+ ) -> None:
+ super().__init__(
+ num_queries,
+ embed_dim,
+ num_heads,
+ kv_dim,
+ norm_layer,
+ quant_config=quant_config,
+ prefix=prefix,
+ )
+
+ self.max_size = max_size
+ self._set_2d_pos_cache(self.max_size)
+
+ self.apply(self._init_weights)
+
+ def _set_2d_pos_cache(
+ self, max_size: Tuple[int, int], device: torch.types.Device = "cpu"
+ ) -> None:
+ pos_embed_arr = get_2d_sincos_pos_embed(
+ self.embed_dim, max_size, version=(2, 5)
+ )
+ pos_embed = torch.from_numpy(pos_embed_arr).float().to(device)
+ self.register_buffer("pos_embed", pos_embed, persistent=False)
+
+ def _adjust_pos_cache(
+ self, tgt_sizes: torch.Tensor, device: torch.types.Device
+ ) -> None:
+ max_h = tgt_sizes[:, 0].max().item()
+ max_w = tgt_sizes[:, 1].max().item()
+ assert isinstance(max_h, int) and isinstance(max_w, int)
+
+ if max_h > self.max_size[0] or max_w > self.max_size[1]:
+ self.max_size = (
+ max(max_h, self.max_size[0]),
+ max(max_w, self.max_size[1]),
+ )
+ self._set_2d_pos_cache(self.max_size, device)
+
+ def forward(self, x: torch.Tensor, tgt_sizes: torch.Tensor) -> torch.Tensor:
+ assert x.shape[0] == tgt_sizes.shape[0]
+ bs = x.shape[0]
+
+ device = x.device
+ dtype = x.dtype
+
+ patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1]
+
+ self._adjust_pos_cache(tgt_sizes, device=device)
+
+ max_patch_len = patch_len.max().item()
+ assert isinstance(max_patch_len, int)
+
+ key_padding_mask = torch.zeros(
+ (bs, max_patch_len), dtype=torch.bool, device=device
+ )
+
+ pos_embed = []
+ for i in range(bs):
+ tgt_h, tgt_w = tgt_sizes[i].tolist()
+ pos_embed.append(
+ self.pos_embed[:tgt_h, :tgt_w, :].reshape((tgt_h * tgt_w, -1)).to(dtype)
+ ) # patches * D
+ key_padding_mask[i, patch_len[i] :] = True
+ pos_embed = torch.nn.utils.rnn.pad_sequence(
+ pos_embed, batch_first=True, padding_value=0.0
+ ).permute(
+ 1, 0, 2
+ ) # BLD => L * B * D
+ x, _ = self.kv_proj(x) # B * L * D
+ x = self.ln_kv(x).permute(1, 0, 2) # L * B * D
+
+ q = self.ln_q(self.query) # Q * D
+
+ out = self.attn(
+ self._repeat(q, bs), # Q * B * D
+ x + pos_embed, # L * B * D + L * B * D
+ x,
+ key_padding_mask=key_padding_mask,
+ )[0]
+ # out: Q * B * D
+ x = out.permute(1, 0, 2) # B * Q * D
+
+ x = self.ln_post(x)
+ x = x @ self.proj
+ return x
+
+
+def get_version_by_config(config: PretrainedConfig) -> Tuple[int, ...]:
+ version_float = getattr(config, "version", None)
+
+    # The old configs do not include a version number
+ # TODO: Remove this after the HF repos are updated
+ if version_float is None:
+ if config.hidden_size == 2304 and config.query_num == 64:
+ return 2, 0
+ return 2, 5
+
+ version_str = str(version_float)
+ return tuple(int(x) for x in version_str.split("."))
+
+
+class MiniCPMVBaseModel(nn.Module):
+ """
+    Abstract base class for MiniCPM-V models; it is meant to be subclassed and
+    cannot be instantiated directly.
+ """
+
+ def __init__(
+ self,
+ *,
+ config: PretrainedConfig,
+ quant_config: Optional[QuantizationConfig] = None,
+ ):
+        super().__init__()
+        # All MiniCPM-V models disable `tie_word_embeddings`, but
+        # `PretrainedConfig.tie_word_embeddings` defaults to True; this cannot be
+        # checked until the MiniCPM-V model and config class are integrated upstream.
+        self.config = config
+
+ self.version = get_version_by_config(self.config)
+ self.llm = self.init_llm(config=config, quant_config=quant_config)
+ self.vpm = self.init_vision_module(config, quant_config)
+ self.vision_dim = (
+ self.vpm.embed_dim
+ if self.version == (2, 0)
+ else self.vpm.embeddings.embed_dim
+ )
+ self.embed_dim = self.config.hidden_size
+
+ self.resampler = self.init_resampler(
+ self.embed_dim, self.vision_dim, quant_config=quant_config
+ )
+
+ self.logits_processor = LogitsProcessor(config)
+
+ @cached_property
+ def sampler(self):
+ if hasattr(self.llm, "sampler"):
+ return self.llm.sampler
+
+ return get_sampler()
+
+ def _get_image_bounds(
+ self,
+ input_ids: torch.Tensor,
+ im_start_id: torch.Tensor,
+ im_end_id: torch.Tensor,
+ slice_start_id: Optional[torch.Tensor] = None,
+ slice_end_id: Optional[torch.Tensor] = None,
+ ) -> torch.Tensor:
+ """
+        Returns a tensor of (start, end) token positions for each image in the input.
+ """
+ # All the images in the batch should share the same special image
+ # bound token ids.
+ start_cond = input_ids == im_start_id[0]
+ end_cond = input_ids == im_end_id[0]
+ if slice_start_id is not None:
+ start_cond |= input_ids == slice_start_id[0]
+ end_cond |= input_ids == slice_end_id[0]
+
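+        # Each returned row is (first_placeholder_pos, im_end_pos): start positions
+        # are shifted one past the <im_start>/<slice_start> token, so that
+        # torch.arange(start, end) covers exactly the image placeholder tokens.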
+ (image_start_tokens,) = torch.where(start_cond)
+ image_start_tokens += 1
+ (image_end_tokens,) = torch.where(end_cond)
+ valid_image_nums = min(len(image_start_tokens), len(image_end_tokens))
+
+ if valid_image_nums == 0:
+ return torch.zeros((0, 2), device=input_ids.device)
+
+ # Filter out pairs where start_token >= end_token
+ valid_pairs = []
+ for i in range(valid_image_nums):
+ start_token = image_start_tokens[i]
+ end_token = image_end_tokens[i]
+ if start_token < end_token:
+ valid_pairs.append((start_token, end_token))
+
+ if not valid_pairs:
+ return torch.zeros((0, 2), device=input_ids.device)
+
+ # Convert valid pairs to tensor
+ valid_pairs_tensor = torch.tensor(valid_pairs, device=input_ids.device)
+ return valid_pairs_tensor
+
+ def get_embedding(
+ self,
+ input_ids: torch.Tensor,
+ image_inputs: Optional[MiniCPMVImageInputs],
+ forward_batch: ForwardBatch,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ vlm_embedding: torch.Tensor = self.llm.get_input_embeddings(input_ids)
+
+ if image_inputs is None: # No image
+ vision_hidden_states = torch.tensor([], device=input_ids.device)
+ else:
+ if image_inputs["type"] == "image_embeds":
+ vision_hidden_states = (
+ image_inputs["data"]
+ .type(vlm_embedding.dtype)
+ .to(vlm_embedding.device)
+ )
+ else:
+ vision_hidden_states = self.get_vision_hidden_states(
+ forward_batch, image_inputs
+ )
+
+ # See NOTE in _parse_and_validate_inputs
+ image_bounds = image_inputs["image_bounds"]
+ if len(image_bounds) > 0:
+ image_indices = torch.stack(
+ [
+ torch.arange(start, end, dtype=torch.long)
+ for start, end in image_bounds.tolist()
+ ]
+ ).to(vlm_embedding.device)
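+                # Overwrite the placeholder rows of the text embedding with the
+                # resampled vision features, one row per image placeholder token.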
+ vlm_embedding.scatter_(
+ 0,
+ image_indices.view(-1, 1).repeat(1, vlm_embedding.shape[-1]),
+ vision_hidden_states.view(-1, vision_hidden_states.shape[-1]),
+ )
+
+ return vlm_embedding, vision_hidden_states
+
+ def _parse_and_validate_inputs(
+ self,
+ input_ids: torch.Tensor,
+ **kwargs: object,
+ ) -> Optional[MiniCPMVImageInputs]:
+ pixel_values = kwargs.pop("pixel_values", [])
+ tgt_sizes = kwargs.pop("tgt_sizes", [])
+ im_start_id = kwargs.pop("im_start_id", None)
+ im_end_id = kwargs.pop("im_end_id", None)
+ slice_start_id = kwargs.pop("slice_start_id", None)
+ slice_end_id = kwargs.pop("slice_end_id", None)
+ image_embeds = kwargs.pop("image_embeds", None)
+ image_bound = kwargs.pop("image_bound", None)
+
+ if isinstance(image_bound, list):
+ image_bound = torch.concat(image_bound)
+
+ if image_embeds is not None:
+ if not isinstance(image_embeds, (torch.Tensor, list)):
+ raise ValueError(
+ f"Incorrect type of image embeds. "
+ f"Got type: {type(image_embeds)}"
+ )
+
+ if isinstance(image_embeds, list):
+ image_embeds = torch.concat(image_embeds)
+
+ return MiniCPMVImageEmbeddingInputs(
+ image_bounds=image_bound,
+ data=image_embeds,
+ type="image_embeds",
+ )
+
+ if not isinstance(pixel_values, (torch.Tensor, list)):
+ raise ValueError(
+ "Incorrect type of pixel values. " f"Got type: {type(pixel_values)}"
+ )
+
+ if not isinstance(tgt_sizes, (torch.Tensor, list)):
+ raise ValueError(
+ "Incorrect type of target sizes. " f"Got type: {type(tgt_sizes)}"
+ )
+
+ if len(pixel_values) != len(tgt_sizes):
+ raise ValueError(
+ "Inconsistent batch lengths, found: "
+ f"{len(pixel_values)} vs. {len(tgt_sizes)}"
+ )
+
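+        # Flatten the nested per-request, per-image structure into flat lists of
+        # pixel tensors and their (height, width) patch-grid target sizes.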
+ pixel_values_flat: List[torch.Tensor] = []
+ tgt_sizes_flat: List[torch.Tensor] = []
+ for pixel_b, tgt_b in zip(pixel_values, tgt_sizes):
+ if len(pixel_b) != len(tgt_b):
+ raise ValueError(
+ "Inconsistent N lengths, found: " f"{len(pixel_b)} vs {len(tgt_b)}"
+ )
+
+ for pixel_n, tgt_n in zip(pixel_b, tgt_b):
+ pixel_values_flat += pixel_n
+ tgt_sizes_flat += tgt_n
+
+        # NOTE: Input IDs do not contain image tokens during memory profiling,
+ # so we allow it to be empty
+ if len(pixel_values_flat) != len(tgt_sizes_flat):
+ raise ValueError(
+ "Inconsistent flattened lengths, found: "
+ f"{len(pixel_values_flat)} vs. "
+ f"{len(tgt_sizes_flat)}"
+ )
+
+ if len(pixel_values_flat) == 0:
+ return None
+
+ image_bounds = self._get_image_bounds(
+ input_ids=input_ids,
+ im_start_id=im_start_id,
+ im_end_id=im_end_id,
+ slice_start_id=slice_start_id,
+ slice_end_id=slice_end_id,
+ )
+ return MiniCPMVImagePixelInputs(
+ image_bounds=image_bounds.to(device=input_ids.device),
+ data=pixel_values_flat,
+ tgt_sizes=torch.stack(tgt_sizes_flat),
+ type="pixel_values",
+ )
+
+ def forward(
+ self,
+ input_ids: torch.Tensor,
+ positions: torch.Tensor,
+ forward_batch: ForwardBatch,
+ **kwargs: Any,
+ ) -> torch.Tensor:
+ if forward_batch.image_inputs is not None and forward_batch.image_inputs != [
+ None
+ ]:
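+            # The special image-marker token ids are shared across the batch, so
+            # they are taken from the first request; pixel values and target sizes
+            # are gathered per request.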
+ kwargs.update(
+ {
+ "pixel_values": (
+ None
+ if forward_batch.image_inputs is None
+ else [
+ None if i is None else i.pixel_values
+ for i in forward_batch.image_inputs
+ ]
+ ),
+ "im_start_id": forward_batch.image_inputs[0].im_start_id,
+ "im_end_id": forward_batch.image_inputs[0].im_end_id,
+ "slice_start_id": forward_batch.image_inputs[0].slice_start_id,
+ "slice_end_id": forward_batch.image_inputs[0].slice_end_id,
+ "tgt_sizes": (
+ None
+ if forward_batch.image_inputs is None
+ else [
+ None if i is None else i.tgt_sizes
+ for i in forward_batch.image_inputs
+ ]
+ ),
+ "image_bound": (
+ torch.zeros(0)
+ if forward_batch.image_inputs is None
+ else forward_batch.image_inputs[0].image_bound
+ ),
+ }
+ )
+
+ image_inputs = self._parse_and_validate_inputs(input_ids, **kwargs)
+
+        # Clamp input ids: image-token positions are filled with image hash values
+        # for prefix matching in radix attention. Their values are irrelevant here
+        # because their embeddings are replaced by vision embeddings anyway.
+ input_ids.clamp_(min=0, max=self.config.vocab_size - 1)
+
+ vlm_embeddings, _ = self.get_embedding(input_ids, image_inputs, forward_batch)
+
+ # always pass the input via `inputs_embeds`
+ # to make sure the computation graph is consistent
+ # for `torch.compile` integration
+ input_ids = None
+
+ hidden_states = self.llm.model(
+ input_ids=input_ids,
+ positions=positions,
+ forward_batch=forward_batch,
+ inputs_embeds=vlm_embeddings,
+ )
+
+ return self.logits_processor(
+ input_ids, hidden_states, self.llm.lm_head, forward_batch
+ )
+
+ def compute_logits(
+ self,
+ hidden_states: torch.Tensor,
+ sampling_metadata: SamplingMetadata,
+ ) -> Optional[torch.Tensor]:
+ return self.llm.compute_logits(hidden_states, sampling_metadata)
+
+ def sample(
+ self,
+ logits: torch.Tensor,
+ sampling_metadata: SamplingMetadata,
+ ) -> Optional[SamplerOutput]:
+ next_tokens = self.sampler(logits, sampling_metadata)
+ return next_tokens
+
+ def get_mm_mapping(self) -> MultiModelKeys:
+ """
+ Get the module prefix in multimodal models
+ """
+ return MultiModelKeys.from_string_field(
+ language_model="llm", connector="resampler", tower_model="vpm"
+ )
+
+ def init_llm(
+ self,
+ config: Qwen2Config,
+ quant_config: Optional[QuantizationConfig] = None,
+ ) -> nn.Module:
+ raise NotImplementedError
+
+ def init_vision_module(
+ self,
+ config: PretrainedConfig,
+ quant_config: Optional[QuantizationConfig],
+ ) -> nn.Module:
+ raise NotImplementedError
+
+ def init_resampler(
+ self,
+ embed_dim: int,
+ vision_dim: int,
+ quant_config: Optional[QuantizationConfig] = None,
+ ) -> nn.Module:
+ raise NotImplementedError
+
+ def get_vision_embedding(
+ self,
+ pixel_values: List[torch.Tensor],
+ patch_attn_mask: Optional[torch.Tensor] = None,
+ tgt_sizes: Optional[torch.Tensor] = None,
+ ) -> torch.Tensor:
+ raise NotImplementedError
+
+ def get_vision_hidden_states(
+ self, forward_batch: ForwardBatch, data: MiniCPMVImageInputs
+ ) -> torch.Tensor:
+ raise NotImplementedError
+
+
+class MiniCPMV2_6(MiniCPMVBaseModel):
+ packed_modules_mapping = {
+ "qkv_proj": [
+ "q_proj",
+ "k_proj",
+ "v_proj",
+ ],
+ "gate_up_proj": [
+ "gate_proj",
+ "up_proj",
+ ],
+ }
+ # LoRA specific attributes
+ supported_lora_modules = [
+ # vision encoder
+ "fc1",
+ "fc2",
+ "out_proj",
+ # language model
+ "qkv_proj", # same name with vision encoder
+ "o_proj",
+ "gate_up_proj",
+ "down_proj",
+ # resampler
+ "kv_proj",
+ ]
+
+    # BitsAndBytes specific attributes
+ bitsandbytes_stacked_params_mapping = {
+ # shard_name, weight_name, index
+ "q_proj": ("qkv_proj", 0),
+ "k_proj": ("qkv_proj", 1),
+ "v_proj": ("qkv_proj", 2),
+ "gate_proj": ("gate_up_proj", 0),
+ "up_proj": ("gate_up_proj", 1),
+ }
+
+ embedding_modules = {}
+ embedding_padding_modules = []
+
+ def __init__(
+ self,
+ config: PretrainedConfig,
+ quant_config: Optional[QuantizationConfig] = None,
+ ):
+ super().__init__(config=config, quant_config=quant_config)
+ assert self.version == (2, 6)
+
+ def init_llm(
+ self,
+ config: Qwen2Config,
+ quant_config: Optional[QuantizationConfig] = None,
+ ) -> nn.Module:
+ return Qwen2ForCausalLM(config=config, quant_config=quant_config)
+
+ def init_vision_module(
+ self,
+ config: PretrainedConfig,
+ quant_config: Optional[QuantizationConfig],
+ ) -> nn.Module:
+ model = Idefics2VisionTransformer(
+ config=config.vision_config, quant_config=quant_config
+ )
+ if self.config.drop_vision_last_layer:
+ model.encoder.layers = model.encoder.layers[:-1]
+
+ setattr(model, "embed_dim", model.embeddings.embed_dim)
+ setattr(model, "patch_size", model.embeddings.patch_size)
+ return model
+
+ def init_resampler(
+ self,
+ embed_dim: int,
+ vision_dim: int,
+ quant_config: Optional[QuantizationConfig] = None,
+ ) -> nn.Module:
+ with set_default_torch_dtype(torch.float16):
+ # The resampler in 2.6 remains consistent with the one in 2.5.
+ resampler = Resampler2_5(
+ num_queries=self.config.query_num,
+ embed_dim=embed_dim,
+ num_heads=embed_dim // 128,
+ kv_dim=vision_dim,
+ quant_config=quant_config,
+ )
+
+ return resampler.to(device="cuda", dtype=torch.get_default_dtype())
+
+ def get_vision_embedding(
+ self,
+ pixel_values: List[torch.Tensor],
+ patch_attn_mask: Optional[torch.Tensor] = None,
+ tgt_sizes: Optional[torch.Tensor] = None,
+ ) -> torch.Tensor:
+ vision_embedding = self.vpm(
+ pixel_values,
+ patch_attention_mask=patch_attn_mask,
+ tgt_sizes=tgt_sizes,
+ )
+ return vision_embedding
+
+ def get_vision_hidden_states(
+ self,
+ forward_batch: ForwardBatch,
+ data: MiniCPMVImageInputs,
+ ) -> torch.Tensor:
+ pixel_values = data["data"]
+ tgt_sizes = data["tgt_sizes"]
+
+ device = self.vpm.embeddings.position_embedding.weight.device
+ dtype = self.vpm.embeddings.position_embedding.weight.dtype
+ all_pixel_values_lst = [
+ i.flatten(end_dim=1).permute(1, 0) for i in pixel_values
+ ]
+
+ max_patches = (tgt_sizes[:, 0] * tgt_sizes[:, 1]).max().item()
+ assert isinstance(max_patches, int)
+
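+        # Images produce different numbers of patches, so pad them to a common
+        # length and build a boolean mask so the encoder ignores the padding.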
+ all_pixel_values = torch.nn.utils.rnn.pad_sequence(
+ all_pixel_values_lst, batch_first=True, padding_value=0.0
+ )
+ B, L, _ = all_pixel_values.shape
+ all_pixel_values = all_pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L)
+ patch_attn_mask = torch.zeros(
+ (B, 1, max_patches), dtype=torch.bool, device=device
+ )
+ for i in range(B):
+ patch_attn_mask[i, 0, : tgt_sizes[i][0] * tgt_sizes[i][1]] = True
+ vision_embedding = self.vpm(
+ all_pixel_values.type(dtype),
+ forward_batch=forward_batch,
+ patch_attention_mask=patch_attn_mask,
+ tgt_sizes=tgt_sizes,
+ )
+
+ return self.resampler(vision_embedding, tgt_sizes)
+
+ def pad_input_ids(self, input_ids: List[int], image_inputs: ImageInputs):
+ if not isinstance(image_inputs.im_start_id, list) or not isinstance(
+ image_inputs.im_end_id, list
+ ):
+ return input_ids
+
+ pad_values = image_inputs.pad_values
+ new_input_ids = []
+ last_idx = 0
+
+ # Get all special token IDs
+ im_start_id = (
+ image_inputs.im_start_id[0].item()
+ if isinstance(image_inputs.im_start_id[0], torch.Tensor)
+ else image_inputs.im_start_id[0]
+ )
+ im_end_id = (
+ image_inputs.im_end_id[0].item()
+ if isinstance(image_inputs.im_end_id[0], torch.Tensor)
+ else image_inputs.im_end_id[0]
+ )
+ slice_start_id = (
+ image_inputs.slice_start_id[0].item()
+ if isinstance(image_inputs.slice_start_id[0], torch.Tensor)
+ else image_inputs.slice_start_id[0]
+ )
+ slice_end_id = (
+ image_inputs.slice_end_id[0].item()
+ if isinstance(image_inputs.slice_end_id[0], torch.Tensor)
+ else image_inputs.slice_end_id[0]
+ )
+
+ # Find all start and end positions for both types
+ start_indices = [
+ i
+ for i, x in enumerate(input_ids)
+ if x == im_start_id or x == slice_start_id
+ ]
+ end_indices = [
+ i for i, x in enumerate(input_ids) if x == im_end_id or x == slice_end_id
+ ]
+
+ if len(start_indices) != len(end_indices):
+ return input_ids
+
+ # Process each region (both image and slice)
+ for start_idx, end_idx in zip(start_indices, end_indices):
+ # Add non-image tokens before this region
+ new_input_ids.extend(
+ input_ids[last_idx : start_idx + 1]
+ ) # include start token
+
+ # Calculate the number of tokens to pad
+ num_tokens = end_idx - start_idx - 1 # exclude start and end tokens
+
+ # Generate pad_ids
+ pad_ids = pad_values * ((num_tokens + len(pad_values)) // len(pad_values))
+ pad_ids = pad_ids[:num_tokens]
+
+ # Add pad_ids
+ new_input_ids.extend(pad_ids)
+
+            # Keep last_idx at the end token so it is included by the next extend
+ last_idx = end_idx
+
+ # Add remaining tokens after last region
+ new_input_ids.extend(input_ids[last_idx:])
+ assert len(input_ids) == len(new_input_ids)
+ return new_input_ids
+
+
+_SUPPORT_VERSION = {(2, 6): MiniCPMV2_6}
+
+
+class MiniCPMV:
+ """
+    Different versions of MiniCPM-V use different vision encoders and LLMs,
+    which does not fit the current LoRA and bitsandbytes integration logic.
+    Therefore, they are dispatched to separate implementation classes.
+ """
+
+ # Ensure that the LoRA support check passes when the class is not
+ # initialized, but set all these attributes to empty.
+ packed_modules_mapping = {}
+ supported_lora_modules = []
+ embedding_modules = {}
+ embedding_padding_modules = []
+
+ minicpmv: nn.Module
+
+ def __init__(
+ self,
+ config: PretrainedConfig,
+ quant_config: Optional[QuantizationConfig] = None,
+ ) -> None:
+ super().__init__()
+
+ if not hasattr(config, "version"):
+ version = (2, 6)
+ else:
+ version = str(config.version).split(".")
+ version = tuple([int(x) for x in version])
+ # Dispatch class based on version
+ instance_class = _SUPPORT_VERSION.get(version)
+ if instance_class is None:
+            raise ValueError("Currently, MiniCPMV only supports version 2.6")
+
+        self.minicpmv = instance_class(config=config, quant_config=quant_config)
+ self.config = config
+
+ def __getattr__(self, name):
+ if name == "minicpmv":
+ return None
+ return getattr(self.minicpmv, name)
+
+ def __call__(self, *args, **kwargs):
+ return self.minicpmv(*args, **kwargs)
+
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+ stacked_params_mapping = [
+ # (param_name, shard_name, shard_id)
+ ("qkv_proj", "q_proj", "q"),
+ ("qkv_proj", "k_proj", "k"),
+ ("qkv_proj", "v_proj", "v"),
+ ("gate_up_proj", "gate_proj", 0),
+ ("gate_up_proj", "up_proj", 1),
+ ]
+
+ params_dict = dict(self.minicpmv.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name or "projector" in name:
+                continue
+ if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+ # Models trained using ColossalAI may include these tensors in
+ # the checkpoint. Skip them.
+ continue
+ if name.startswith("model.vision_tower") and name not in params_dict:
+ continue
+
+ if "sampler" in name:
+ param = params_dict[name]
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, loaded_weight)
+ continue
+
+ for param_name, weight_name, shard_id in stacked_params_mapping:
+ # replace the name and load with customized loader
+ if weight_name not in name:
+ continue
+ name = name.replace(weight_name, param_name)
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ param = params_dict[name]
+ weight_loader = param.weight_loader
+ weight_loader(param, loaded_weight, shard_id)
+ break
+ else:
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ param = params_dict[name]
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
+ weight_loader(param, loaded_weight)
+
+
+EntryClass = MiniCPMV
diff --git a/python/sglang/srt/models/qwen2.py b/python/sglang/srt/models/qwen2.py
index 2a20d6c50de..df8e7e586d9 100644
--- a/python/sglang/srt/models/qwen2.py
+++ b/python/sglang/srt/models/qwen2.py
@@ -242,17 +242,20 @@ def __init__(
)
self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+ return self.embed_tokens(input_ids)
+
def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
forward_batch: ForwardBatch,
- input_embeds: torch.Tensor = None,
+ inputs_embeds: torch.Tensor = None,
) -> torch.Tensor:
- if input_embeds is None:
+ if inputs_embeds is None:
hidden_states = self.embed_tokens(input_ids)
else:
- hidden_states = input_embeds
+ hidden_states = inputs_embeds
residual = None
for i in range(len(self.layers)):
layer = self.layers[i]
@@ -267,7 +270,6 @@ def forward(
class Qwen2ForCausalLM(nn.Module):
-
# BitandBytes specific attributes
default_bitsandbytes_target_modules = [
".gate_proj.",
@@ -305,16 +307,19 @@ def __init__(
self.logits_processor = LogitsProcessor(config)
self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+ def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+ return self.model.get_input_embeddings(input_ids)
+
@torch.no_grad()
def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
forward_batch: ForwardBatch,
- input_embeds: torch.Tensor = None,
+ inputs_embeds: torch.Tensor = None,
get_embedding: bool = False,
) -> torch.Tensor:
- hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+ hidden_states = self.model(input_ids, positions, forward_batch, inputs_embeds)
if not get_embedding:
return self.logits_processor(
input_ids, hidden_states, self.lm_head, forward_batch
diff --git a/python/sglang/srt/models/qwen2_vl.py b/python/sglang/srt/models/qwen2_vl.py
index 2e9ec9d8f50..7b25d52f3ab 100644
--- a/python/sglang/srt/models/qwen2_vl.py
+++ b/python/sglang/srt/models/qwen2_vl.py
@@ -661,7 +661,7 @@ def forward(
input_ids=input_ids,
positions=positions,
forward_batch=forward_batch,
- input_embeds=inputs_embeds,
+ inputs_embeds=inputs_embeds,
)
if not get_embedding:
diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py
index 5056ba22ef9..32c094c3d0c 100644
--- a/python/sglang/srt/openai_api/adapter.py
+++ b/python/sglang/srt/openai_api/adapter.py
@@ -897,7 +897,7 @@ def v1_chat_generate_request(
{"role": message.role, "content": message.content}
)
else:
- content_list = message.dict()["content"]
+ content_list = message["content"]
for content in content_list:
if content["type"] == "text":
openai_compatible_messages.append(
diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py
index 4121deb17cc..46e72960acb 100644
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -405,7 +405,7 @@ def popen_launch_server(
base_url: str,
timeout: float,
api_key: Optional[str] = None,
- other_args: tuple = (),
+ other_args: list[str] = (),
env: Optional[dict] = None,
return_stdout_stderr: Optional[tuple] = None,
):
diff --git a/test/README.md b/test/README.md
index 3d739cc0496..868061bbc4a 100644
--- a/test/README.md
+++ b/test/README.md
@@ -25,7 +25,7 @@ export OPENAI_API_KEY=sk-*****
python3 test_openai_backend.py
# Run a single test
-python3 -m unittest test_openai_backend.TestOpenAIBackend.test_few_shot_qa
+python3 -m unittest test_openai_backend.TestOpenAIServer.test_few_shot_qa
# Run a suite with multiple files
python3 run_suite.py --suite per-commit
diff --git a/test/srt/test_vision_openai_server.py b/test/srt/test_vision_openai_server.py
index e19e6b01d51..5af61034b9e 100644
--- a/test/srt/test_vision_openai_server.py
+++ b/test/srt/test_vision_openai_server.py
@@ -171,7 +171,7 @@ def test_multi_images_chat_completion(self):
text = response.choices[0].message.content
assert isinstance(text, str)
print(text)
- assert "man" in text or "cab" in text, text
+ assert "man" in text or "cab" in text or "SUV" in text, text
assert "logo" in text or '"S"' in text or "SG" in text, text
assert response.id
assert response.created
@@ -444,5 +444,24 @@ def test_video_chat_completion(self):
pass
+class TestMinicpmvServer(TestOpenAIVisionServer):
+ @classmethod
+ def setUpClass(cls):
+ cls.model = "openbmb/MiniCPM-V-2_6"
+ cls.base_url = DEFAULT_URL_FOR_TEST
+ cls.api_key = "sk-123456"
+ cls.process = popen_launch_server(
+ cls.model,
+ cls.base_url,
+ timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+ other_args=[
+ "--trust-remote-code",
+ "--chat-template",
+ "minicpmv",
+ ],
+ )
+ cls.base_url += "/v1"
+
+
if __name__ == "__main__":
unittest.main()