From 03dbf1aa8e254306eb9416054c4e93d9ee1392cf Mon Sep 17 00:00:00 2001
From: tc-mb <157115220+tc-mb@users.noreply.github.com>
Date: Wed, 3 Sep 2025 06:33:03 +0800
Subject: [PATCH] [model] support MiniCPM-V 4.0 (#8747)

Signed-off-by: tc-mb
Co-authored-by: Xinyuan Tong <115166877+JustinTong0323@users.noreply.github.com>
---
 python/sglang/srt/models/minicpmv.py          | 168 +++++++++++++++++-
 .../models/test_compressed_tensors_models.py  |   2 +-
 test/srt/test_vision_openai_server_a.py       |  23 ++-
 test/srt/test_vlm_accuracy.py                 |  59 +++++-
 4 files changed, 246 insertions(+), 6 deletions(-)

diff --git a/python/sglang/srt/models/minicpmv.py b/python/sglang/srt/models/minicpmv.py
index 8166d1646..e621676fc 100644
--- a/python/sglang/srt/models/minicpmv.py
+++ b/python/sglang/srt/models/minicpmv.py
@@ -54,6 +54,7 @@
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.utils import set_default_torch_dtype
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.idefics2 import Idefics2VisionTransformer
+from sglang.srt.models.llama import LlamaConfig, LlamaForCausalLM
 from sglang.srt.models.qwen2 import Qwen2Config, Qwen2ForCausalLM
 from sglang.srt.utils import add_prefix, flatten_nested_list
@@ -581,7 +582,7 @@ class MiniCPMBaseModel(nn.Module):
 
     def init_llm(
         self,
-        config: Qwen2Config,
+        config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
     ) -> nn.Module:
@@ -774,7 +775,168 @@
         return pattern.pad_input_tokens(input_ids, image_inputs)
 
 
-_SUPPORT_VERSION = {(2, 6): MiniCPMV2_6}
+class MiniCPMV4_0(MiniCPMBaseModel):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+    # LoRA specific attributes
+    supported_lora_modules = [
+        # vision encoder
+        "fc1",
+        "fc2",
+        "out_proj",
+        # language model
+        "qkv_proj",  # same name with vision encoder
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+        # resampler
+        "kv_proj",
+    ]
+
+    # BitandBytes specific attributes
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    embedding_modules = {}
+    embedding_padding_modules = []
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__(config=config, quant_config=quant_config, prefix=prefix)
+        assert self.version == (4, 0)
+
+    def init_llm(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> nn.Module:
+        return LlamaForCausalLM(config=config, quant_config=quant_config, prefix=prefix)
+
+    def init_vision_module(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig],
+        prefix: str = "",
+    ) -> nn.Module:
+        model = Idefics2VisionTransformer(
+            config=config.vision_config, quant_config=quant_config, prefix=prefix
+        )
+        if self.config.drop_vision_last_layer:
+            model.encoder.layers = model.encoder.layers[:-1]
+
+        setattr(model, "embed_dim", model.embeddings.embed_dim)
+        setattr(model, "patch_size", model.embeddings.patch_size)
+        return model
+
+    def init_resampler(
+        self,
+        embed_dim: int,
+        vision_dim: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> nn.Module:
+        with set_default_torch_dtype(torch.float16):
+            # The resampler in 2.6 remains consistent with the one in 2.5.
+            resampler = Resampler2_5(
+                num_queries=self.config.query_num,
+                embed_dim=embed_dim,
+                num_heads=embed_dim // 128,
+                kv_dim=vision_dim,
+                quant_config=quant_config,
+                prefix=prefix,
+            )
+
+        return resampler.to(device="cuda", dtype=torch.get_default_dtype())
+
+    def get_vision_embedding(
+        self,
+        pixel_values: List[torch.Tensor],
+        patch_attn_mask: Optional[torch.Tensor] = None,
+        tgt_sizes: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        vision_embedding = self.vpm(
+            pixel_values,
+            patch_attention_mask=patch_attn_mask,
+            tgt_sizes=tgt_sizes,
+        )
+        return vision_embedding
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        # list of tensors
+        pixel_values = flatten_nested_list([item.feature for item in items])
+        tgt_sizes = torch.stack(
+            flatten_nested_list([item.tgt_size for item in items]), dim=0
+        )
+        assert len(pixel_values) == tgt_sizes.shape[0]
+
+        device = self.vpm.embeddings.position_embedding.weight.device
+        dtype = self.vpm.embeddings.position_embedding.weight.dtype
+        all_pixel_values_lst = [
+            i.flatten(end_dim=1).permute(1, 0) for i in pixel_values
+        ]
+
+        max_patches = (tgt_sizes[:, 0] * tgt_sizes[:, 1]).max().item()
+        assert isinstance(max_patches, int)
+        all_pixel_values = torch.nn.utils.rnn.pad_sequence(
+            all_pixel_values_lst, batch_first=True, padding_value=0.0
+        )
+
+        B, L, _ = all_pixel_values.shape
+        all_pixel_values = all_pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L)
+        patch_attn_mask = torch.zeros(
+            (B, 1, max_patches), dtype=torch.bool, device=device
+        )
+
+        tgt_sizes_tensor = tgt_sizes.clone().to(device=patch_attn_mask.device)
+        mask_shapes = tgt_sizes_tensor[:, 0] * tgt_sizes_tensor[:, 1]
+        patch_attn_mask[:, 0, :] = torch.arange(
+            patch_attn_mask.size(2), device=patch_attn_mask.device
+        ).unsqueeze(0) < mask_shapes.unsqueeze(1)
+
+        vision_embedding = self.vpm(
+            all_pixel_values.type(dtype),
+            patch_attention_mask=patch_attn_mask,
+            tgt_sizes=tgt_sizes,
+        )
+        return self.resampler(vision_embedding, tgt_sizes)
+
+    def pad_input_ids(self, input_ids: List[int], image_inputs: MultimodalInputs):
+        # Get all special token IDs
+        im_start_id: int = image_inputs.im_start_id
+        im_end_id: int = image_inputs.im_end_id
+        slice_start_id: int = image_inputs.slice_start_id
+        slice_end_id: int = image_inputs.slice_end_id
+
+        media_token_pairs = [(im_start_id, im_end_id), (slice_start_id, slice_end_id)]
+        pattern = MultiModalityDataPaddingPatternTokenPairs(media_token_pairs)
+
+        return pattern.pad_input_tokens(input_ids, image_inputs)
+
+
+_SUPPORT_VERSION = {
+    (2, 6): MiniCPMV2_6,
+    (4, 0): MiniCPMV4_0,
+}
 
 
 class MiniCPMV:
@@ -809,7 +971,7 @@
         # Dispatch class based on version
         instance_class = _SUPPORT_VERSION.get(version)
         if instance_class is None:
-            raise ValueError("Currently, MiniCPMV only supports versions 2.6")
+            raise ValueError("Currently, MiniCPMV only supports versions 2.6 and 4.0")
 
         try:
             minicpmv = instance_class(
diff --git a/test/srt/models/test_compressed_tensors_models.py b/test/srt/models/test_compressed_tensors_models.py
index b069008d0..34f699de4 100644
--- a/test/srt/models/test_compressed_tensors_models.py
+++ b/test/srt/models/test_compressed_tensors_models.py
@@ -39,7 +39,7 @@ class TestCompressedTensorsLlama3FP8(CustomTestCase):
         )
         metrics = run_eval(args)
         print(f"{metrics=}")
-        self.assertGreater(metrics["accuracy"], 0.45)
+        self.assertGreaterEqual(metrics["accuracy"], 0.45)
 
 
 if __name__ == "__main__":
diff --git a/test/srt/test_vision_openai_server_a.py b/test/srt/test_vision_openai_server_a.py
index 9e311d5b1..e8e0d62e9 100644
--- a/test/srt/test_vision_openai_server_a.py
+++ b/test/srt/test_vision_openai_server_a.py
@@ -165,6 +165,27 @@ class TestMinicpmvServer(ImageOpenAITestMixin):
         cls.base_url += "/v1"
 
 
+class TestMinicpmv4Server(ImageOpenAITestMixin):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "openbmb/MiniCPM-V-4"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--mem-fraction-static",
+                "0.35",
+                "--cuda-graph-max-bs",
+                "4",
+            ],
+        )
+        cls.base_url += "/v1"
+
+
 class TestInternVL2_5Server(ImageOpenAITestMixin):
     @classmethod
     def setUpClass(cls):
@@ -184,7 +205,7 @@ class TestInternVL2_5Server(ImageOpenAITestMixin):
         cls.base_url += "/v1"
 
 
-class TestMinicpmoServer(ImageOpenAITestMixin, AudioOpenAITestMixin):
+class TestMinicpmo2_6Server(ImageOpenAITestMixin, AudioOpenAITestMixin):
     @classmethod
     def setUpClass(cls):
         cls.model = "openbmb/MiniCPM-o-2_6"
diff --git a/test/srt/test_vlm_accuracy.py b/test/srt/test_vlm_accuracy.py
index 2f2e294fa..a1eb0fc40 100644
--- a/test/srt/test_vlm_accuracy.py
+++ b/test/srt/test_vlm_accuracy.py
@@ -161,7 +161,7 @@ class VisionLLMLogitsBase(unittest.IsolatedAsyncioTestCase):
         return self.model_runner.model
 
 
-class TestMiniCPMVLogits(VisionLLMLogitsBase):
+class TestMiniCPMV2_6Logits(VisionLLMLogitsBase):
     @classmethod
     def setUpClass(cls):
         super().setUpClass()
@@ -265,3 +265,60 @@
         )
 
         self.compare_outputs(sglang_output, hf_output)
+
+
+class TestMiniCPMV4Logits(VisionLLMLogitsBase):
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        cls.model_path = "openbmb/MiniCPM-V-4"
+        cls.tokenizer = AutoTokenizer.from_pretrained(
+            cls.model_path, trust_remote_code=True
+        )
+        cls.processor = AutoProcessor.from_pretrained(
+            cls.model_path, trust_remote_code=True
+        )
+        cls.chat_template = "minicpmv"
+
+        cls.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        cls.hf_model = (
+            AutoModel.from_pretrained(
+                cls.model_path, torch_dtype=torch.bfloat16, trust_remote_code=True
+            )
+            .eval()
+            .to(cls.device)
+        )
+        init_embedding_cache()
+
+    async def test_vlm_embedding_output(self):
+        """
+        Compares the embedding output of vlm
+        """
+        inputs = self.get_processor_output()
+
+        with torch.no_grad():
+            # hf
+            model_inputs = {
+                "input_ids": inputs.input_ids,
+                "image_bound": inputs.image_bound,
+                "pixel_values": inputs.pixel_values,
+                "tgt_sizes": inputs.tgt_sizes,
+            }
+            hf_output = self.hf_model.get_input_embeddings()(inputs.input_ids)
+
+            # sglang
+            model = self.get_model()
+            sglang_output = self.vlm_func(
+                model,
+                input_ids=inputs.input_ids.to(self.device),
+                pixel_values=inputs.pixel_values,
+                image_bound=inputs.image_bound.to(self.device),
+                tgt_sizes=inputs.tgt_sizes.to(self.device),
+                input_embedding=model.get_input_embeddings(),
+                multimodal_model=model,
+                placeholder_tokens={
+                    Modality.IMAGE: self.processor.tokenizer.unk_token_id,
+                },
+            )
+
+            self.compare_outputs(sglang_output, hf_output)