[model] support MiniCPM-V 4.0 (#8747)

Signed-off-by: tc-mb <caitianchi@modelbest.cn>
Co-authored-by: Xinyuan Tong <115166877+JustinTong0323@users.noreply.github.com>
This commit is contained in:
tc-mb
2025-09-03 06:33:03 +08:00
committed by GitHub
parent 11dcabc545
commit 03dbf1aa8e
4 changed files with 246 additions and 6 deletions

View File

@@ -39,7 +39,7 @@ class TestCompressedTensorsLlama3FP8(CustomTestCase):
)
metrics = run_eval(args)
print(f"{metrics=}")
self.assertGreater(metrics["accuracy"], 0.45)
self.assertGreaterEqual(metrics["accuracy"], 0.45)
if __name__ == "__main__":

View File

@@ -165,6 +165,27 @@ class TestMinicpmvServer(ImageOpenAITestMixin):
cls.base_url += "/v1"
class TestMinicpmv4Server(ImageOpenAITestMixin):
    """Image-capable OpenAI-API tests against a locally launched MiniCPM-V-4 server."""

    @classmethod
    def setUpClass(cls):
        # One server process is shared by every test in this class.
        cls.model = "openbmb/MiniCPM-V-4"
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
        # Conservative memory settings: the vision tower leaves less room
        # for the KV cache, so cap static memory and CUDA-graph batch size.
        launch_args = [
            "--trust-remote-code",
            "--mem-fraction-static",
            "0.35",
            "--cuda-graph-max-bs",
            "4",
        ]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_args,
        )
        # The mixin's OpenAI client expects the /v1 API root.
        cls.base_url += "/v1"
class TestInternVL2_5Server(ImageOpenAITestMixin):
@classmethod
def setUpClass(cls):
@@ -184,7 +205,7 @@ class TestInternVL2_5Server(ImageOpenAITestMixin):
cls.base_url += "/v1"
class TestMinicpmoServer(ImageOpenAITestMixin, AudioOpenAITestMixin):
class TestMinicpmo2_6Server(ImageOpenAITestMixin, AudioOpenAITestMixin):
@classmethod
def setUpClass(cls):
cls.model = "openbmb/MiniCPM-o-2_6"

View File

@@ -161,7 +161,7 @@ class VisionLLMLogitsBase(unittest.IsolatedAsyncioTestCase):
return self.model_runner.model
class TestMiniCPMVLogits(VisionLLMLogitsBase):
class TestMiniCPMV2_6Logits(VisionLLMLogitsBase):
@classmethod
def setUpClass(cls):
super().setUpClass()
@@ -265,3 +265,60 @@ class TestMiniCPMVLogits(VisionLLMLogitsBase):
)
self.compare_outputs(sglang_output, hf_output)
class TestMiniCPMV4Logits(VisionLLMLogitsBase):
    """Compare SGLang's multimodal embedding output for MiniCPM-V-4 against
    the HuggingFace reference model."""

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.model_path = "openbmb/MiniCPM-V-4"
        cls.tokenizer = AutoTokenizer.from_pretrained(
            cls.model_path, trust_remote_code=True
        )
        cls.processor = AutoProcessor.from_pretrained(
            cls.model_path, trust_remote_code=True
        )
        cls.chat_template = "minicpmv"
        cls.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # HF reference model in eval mode so the comparison is deterministic.
        cls.hf_model = (
            AutoModel.from_pretrained(
                cls.model_path, torch_dtype=torch.bfloat16, trust_remote_code=True
            )
            .eval()
            .to(cls.device)
        )
        init_embedding_cache()

    async def test_vlm_embedding_output(self):
        """Compares the embedding output of the VLM.

        HF side: plain token embeddings from the reference model.
        SGLang side: embeddings produced via the model's multimodal path.
        """
        inputs = self.get_processor_output()
        with torch.no_grad():
            # hf — note: only input_ids are needed for the text-embedding
            # reference (the previous unused `model_inputs` dict was removed).
            hf_output = self.hf_model.get_input_embeddings()(inputs.input_ids)
            # sglang
            model = self.get_model()
            sglang_output = self.vlm_func(
                model,
                input_ids=inputs.input_ids.to(self.device),
                pixel_values=inputs.pixel_values,
                image_bound=inputs.image_bound.to(self.device),
                tgt_sizes=inputs.tgt_sizes.to(self.device),
                input_embedding=model.get_input_embeddings(),
                multimodal_model=model,
                placeholder_tokens={
                    Modality.IMAGE: self.processor.tokenizer.unk_token_id,
                },
            )
        self.compare_outputs(sglang_output, hf_output)