vlm: support video as an input modality (#5888)
@@ -3,7 +3,6 @@ Unit tests for Jinja chat template utils.
 """

 import unittest
-from unittest.mock import patch

 from sglang.srt.jinja_template_utils import (
     detect_jinja_template_content_format,
@@ -76,11 +75,12 @@ class TestTemplateContentFormatDetection(CustomTestCase):
         }

         image_data = []
+        video_data = []
         audio_data = []
         modalities = []

         result = process_content_for_template_format(
-            msg_dict, "openai", image_data, audio_data, modalities
+            msg_dict, "openai", image_data, video_data, audio_data, modalities
         )

         # Check that image_data was extracted
@@ -111,11 +111,12 @@ class TestTemplateContentFormatDetection(CustomTestCase):
         }

         image_data = []
+        video_data = []
         audio_data = []
         modalities = []

         result = process_content_for_template_format(
-            msg_dict, "string", image_data, audio_data, modalities
+            msg_dict, "string", image_data, video_data, audio_data, modalities
         )

         # For string format, should flatten to text only
@@ -139,11 +140,12 @@ class TestTemplateContentFormatDetection(CustomTestCase):
         }

         image_data = []
+        video_data = []
         audio_data = []
         modalities = []

         result = process_content_for_template_format(
-            msg_dict, "openai", image_data, audio_data, modalities
+            msg_dict, "openai", image_data, video_data, audio_data, modalities
         )

         # Check that audio_data was extracted
@@ -162,11 +164,12 @@ class TestTemplateContentFormatDetection(CustomTestCase):
         msg_dict = {"role": "user", "content": "Hello world"}

         image_data = []
+        video_data = []
         audio_data = []
         modalities = []

         result = process_content_for_template_format(
-            msg_dict, "openai", image_data, audio_data, modalities
+            msg_dict, "openai", image_data, video_data, audio_data, modalities
         )

         # Should pass through unchanged
@@ -188,11 +191,12 @@ class TestTemplateContentFormatDetection(CustomTestCase):
         }

         image_data = []
+        video_data = []
         audio_data = []
         modalities = []

         result = process_content_for_template_format(
-            msg_dict, "openai", image_data, audio_data, modalities
+            msg_dict, "openai", image_data, video_data, audio_data, modalities
         )

         # Check that modalities was extracted
@@ -209,11 +213,12 @@ class TestTemplateContentFormatDetection(CustomTestCase):
         }

         image_data = []
+        video_data = []
         audio_data = []
         modalities = []

         result = process_content_for_template_format(
-            msg_dict, "string", image_data, audio_data, modalities
+            msg_dict, "string", image_data, video_data, audio_data, modalities
         )

         # None values should be filtered out

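For orientation, a minimal sketch of how the extended helper is exercised after this change. The call signature (with `video_data` inserted between `image_data` and `audio_data`) comes from the updated tests above; the `video_url` part shape is taken from the server tests later in this diff, and the idea that matching URLs land in `video_data` is an assumption by analogy with `image_data` and `audio_data`:

    from sglang.srt.jinja_template_utils import process_content_for_template_format

    msg_dict = {
        "role": "user",
        "content": [
            # Hypothetical URL, used only for illustration.
            {"type": "video_url", "video_url": {"url": "https://example.com/clip.mp4"}},
            {"type": "text", "text": "Describe the clip."},
        ],
    }

    image_data, video_data, audio_data, modalities = [], [], [], []

    # New signature: video_data now sits between image_data and audio_data.
    result = process_content_for_template_format(
        msg_dict, "openai", image_data, video_data, audio_data, modalities
    )
    # Assumption: video_data collects the extracted video URL(s), mirroring how
    # image_data and audio_data are populated in the tests above.
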
@@ -35,6 +35,9 @@ class TestQwen2VLServer(TestOpenAIVisionServer):
         )
         cls.base_url += "/v1"

+    def test_video_chat_completion(self):
+        self._test_video_chat_completion()
+

 class TestQwen2_5_VLServer(TestOpenAIVisionServer):
     @classmethod
@@ -54,6 +57,9 @@ class TestQwen2_5_VLServer(TestOpenAIVisionServer):
         )
         cls.base_url += "/v1"

+    def test_video_chat_completion(self):
+        self._test_video_chat_completion()
+

 class TestVLMContextLengthIssue(CustomTestCase):
     @classmethod

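The base class keeps the actual video check as a private helper, `_test_video_chat_completion`, so each model suite opts in explicitly with a two-line public wrapper, as Qwen2-VL and Qwen2.5-VL do above. A hypothetical new suite would opt in the same way:

    class TestSomeVideoCapableServer(TestOpenAIVisionServer):
        # Hypothetical suite name; real suites also define setUpClass to launch
        # their model, as in the classes above.
        def test_video_chat_completion(self):
            self._test_video_chat_completion()
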
@@ -93,7 +93,7 @@ class TestJanusProServer(TestOpenAIVisionServer):
         )
         cls.base_url += "/v1"

-    def test_video_chat_completion(self):
+    def test_video_images_chat_completion(self):
         pass

     def test_single_image_chat_completion(self):
@@ -170,7 +170,7 @@ class TestKimiVLServer(TestOpenAIVisionServer):
         )
         cls.base_url += "/v1"

-    def test_video_chat_completion(self):
+    def test_video_images_chat_completion(self):
         pass


@@ -198,7 +198,7 @@ class TestOpenAIVisionServer(CustomTestCase):
         assert response.usage.completion_tokens > 0
         assert response.usage.total_tokens > 0

-    def prepare_video_messages(self, video_path):
+    def prepare_video_images_messages(self, video_path):
         # the memory consumed by the Vision Attention varies a lot, e.g. blocked qkv vs full-sequence sdpa
         # the size of the video embeds differs from the `modality` argument when preprocessed

@@ -208,7 +208,7 @@ class TestOpenAIVisionServer(CustomTestCase):
         # from transformers import AutoTokenizer
         from decord import VideoReader, cpu

-        max_frames_num = 20
+        max_frames_num = 10
         vr = VideoReader(video_path, ctx=cpu(0))
         total_frame_num = len(vr)
         uniform_sampled_frames = np.linspace(
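`prepare_video_images_messages` feeds the model a fixed number of uniformly sampled frames rather than the raw video. A minimal, self-contained sketch of that sampling step, assuming frames are JPEG-encoded and base64'd for the `data:image/jpeg;base64,{}` URLs used below (only `VideoReader`, `cpu`, and the `np.linspace` sampling appear verbatim in the hunk above; the encoding details are an assumption):

    import base64
    import io

    import numpy as np
    from decord import VideoReader, cpu
    from PIL import Image


    def sample_frames_as_base64(video_path, max_frames_num=10):
        # Uniformly sample up to max_frames_num frame indices across the clip.
        vr = VideoReader(video_path, ctx=cpu(0))
        total_frame_num = len(vr)
        uniform_sampled_frames = np.linspace(
            0, total_frame_num - 1, max_frames_num, dtype=int
        )
        frames = vr.get_batch(uniform_sampled_frames.tolist()).asnumpy()  # (N, H, W, 3)

        base64_frames = []
        for frame in frames:
            # Assumption: JPEG-encode each frame for a data: URL.
            buf = io.BytesIO()
            Image.fromarray(frame).save(buf, format="JPEG")
            base64_frames.append(base64.b64encode(buf.getvalue()).decode("utf-8"))
        return base64_frames
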
@@ -229,7 +229,7 @@ class TestOpenAIVisionServer(CustomTestCase):
         frame_format = {
             "type": "image_url",
             "image_url": {"url": "data:image/jpeg;base64,{}"},
-            "modalities": "video",
+            "modalities": "image",
         }

         for base64_frame in base64_frames:
@@ -243,15 +243,14 @@ class TestOpenAIVisionServer(CustomTestCase):

         return messages

-    def prepare_video_messages_video_direct(self, video_path):
+    def prepare_video_messages(self, video_path):
         messages = [
             {
                 "role": "user",
                 "content": [
                     {
-                        "type": "image_url",
-                        "image_url": {"url": f"video:{video_path}"},
-                        "modalities": "video",
+                        "type": "video_url",
+                        "video_url": {"url": f"{video_path}"},
                     },
                     {"type": "text", "text": "Please describe the video in detail."},
                 ],
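The two helpers now produce two distinct request shapes: `prepare_video_images_messages` sends a list of `image_url` parts (one per sampled frame, each tagged `"modalities": "image"`), while the new `prepare_video_messages` sends the video as its own content type. A minimal sketch of the direct-video message, with a hypothetical local path:

    video_path = "/tmp/videos/example_clip.mp4"  # hypothetical local file

    messages = [
        {
            "role": "user",
            "content": [
                # New in this PR: video is a first-class content part,
                # not a sequence of base64 frames.
                {"type": "video_url", "video_url": {"url": f"{video_path}"}},
                {"type": "text", "text": "Please describe the video in detail."},
            ],
        }
    ]
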
@@ -275,13 +274,57 @@ class TestOpenAIVisionServer(CustomTestCase):
             f.write(response.content)
         return file_path

-    def test_video_chat_completion(self):
+    # this test samples frames of video as input, but not video directly
+    def test_video_images_chat_completion(self):
+        url = VIDEO_JOBS_URL
+        file_path = self.get_or_download_file(url)
+
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        messages = self.prepare_video_images_messages(file_path)
+
+        response = client.chat.completions.create(
+            model="default",
+            messages=messages,
+            temperature=0,
+            max_tokens=1024,
+            stream=False,
+        )
+
+        video_response = response.choices[0].message.content
+
+        print("-" * 30)
+        print(f"Video images response:\n{video_response}")
+        print("-" * 30)
+
+        # Add assertions to validate the video response
+        assert (
+            "iPod" in video_response
+            or "device" in video_response
+            or "microphone" in video_response
+        ), video_response
+        assert (
+            "man" in video_response
+            or "person" in video_response
+            or "individual" in video_response
+            or "speaker" in video_response
+        ), video_response
+        assert (
+            "present" in video_response
+            or "examine" in video_response
+            or "display" in video_response
+            or "hold" in video_response
+        )
+        assert "black" in video_response or "dark" in video_response
+        self.assertIsNotNone(video_response)
+        self.assertGreater(len(video_response), 0)
+
+    def _test_video_chat_completion(self):
         url = VIDEO_JOBS_URL
         file_path = self.get_or_download_file(url)

         client = openai.Client(api_key=self.api_key, base_url=self.base_url)

-        # messages = self.prepare_video_messages_video_direct(file_path)
         messages = self.prepare_video_messages(file_path)

         response = client.chat.completions.create(
@@ -301,7 +344,9 @@ class TestOpenAIVisionServer(CustomTestCase):

         # Add assertions to validate the video response
         assert (
-            "iPod" in video_response or "device" in video_response
+            "iPod" in video_response
+            or "device" in video_response
+            or "microphone" in video_response
         ), f"video_response: {video_response}, should contain 'iPod' or 'device'"
         assert (
             "man" in video_response

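End to end, `_test_video_chat_completion` simply pushes such a message through the OpenAI-compatible endpoint of a launched sglang server. A condensed sketch (the API key and base URL are placeholders; in the tests they come from the server fixture):

    import openai

    client = openai.Client(api_key="sk-placeholder", base_url="http://127.0.0.1:30000/v1")

    response = client.chat.completions.create(
        model="default",
        messages=messages,  # the video_url message built in the sketch above
        temperature=0,
        max_tokens=1024,
        stream=False,
    )
    print(response.choices[0].message.content)
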
@@ -10,15 +10,8 @@ import requests
 import torch
 import torch.nn.functional as F
 from PIL import Image
-from transformers import (
-    AutoModel,
-    AutoProcessor,
-    AutoTokenizer,
-    Gemma3ForConditionalGeneration,
-    Qwen2_5_VLForConditionalGeneration,
-)
+from transformers import AutoModel, AutoProcessor, AutoTokenizer

 from sglang import Engine
 from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.conversation import generate_chat_conv
 from sglang.srt.entrypoints.openai.protocol import ChatCompletionRequest
@@ -169,107 +162,107 @@ class VisionLLMLogitsBase(unittest.IsolatedAsyncioTestCase):


-# TODO: MiniCPMV is not compatible with transformers==4.52.3, temporarily disabled
-# class TestMiniCPMVLogits(VisionLLMLogitsBase):
-#     @classmethod
-#     def setUpClass(cls):
-#         super().setUpClass()
-#         cls.model_path = "openbmb/MiniCPM-V-2_6"
-#         cls.tokenizer = AutoTokenizer.from_pretrained(
-#             cls.model_path, trust_remote_code=True
-#         )
-#         cls.processor = AutoProcessor.from_pretrained(
-#             cls.model_path, trust_remote_code=True
-#         )
-#         cls.chat_template = "minicpmv"
-#
-#         cls.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-#         cls.hf_model = (
-#             AutoModel.from_pretrained(
-#                 cls.model_path, torch_dtype=torch.bfloat16, trust_remote_code=True
-#             )
-#             .eval()
-#             .to(cls.device)
-#         )
-#         init_embedding_cache(0)
-#
-#     async def test_vlm_embedding_output(self):
-#         """
-#         Compares the embedding output of vlm
-#         """
-#         inputs = self.get_processor_output()
-#
-#         with torch.no_grad():
-#             # hf
-#             model_inputs = {
-#                 "input_ids": inputs.input_ids,
-#                 "image_bound": inputs.image_bound,
-#                 "pixel_values": inputs.pixel_values,
-#                 "tgt_sizes": inputs.tgt_sizes,
-#             }
-#             (hf_output, _) = self.hf_model.get_vllm_embedding(
-#                 model_inputs,
-#             )
-#             hf_output = hf_output.squeeze(0)
-#
-#             # sglang
-#             model = self.get_sglang_model()
-#             input_ids = inputs["input_ids"].to(self.device).flatten()
-#
-#             pixel_values = inputs["pixel_values"]
-#             tgt_sizes = inputs["tgt_sizes"]
-#             pixel_values_flat: List[torch.Tensor] = []
-#             tgt_sizes_flat: List[torch.Tensor] = []
-#             for pixel_b, tgt_b in zip(pixel_values, tgt_sizes):
-#                 # per image
-#                 if len(pixel_b) != len(tgt_b):
-#                     raise ValueError(
-#                         "Inconsistent N lengths, found: "
-#                         f"{len(pixel_b)} vs {len(tgt_b)}"
-#                     )
-#                 for pixel_n, tgt_n in zip(pixel_b, tgt_b):
-#                     pixel_values_flat += [pixel_n]
-#                     tgt_sizes_flat += [tgt_n]
-#
-#             im_start_id, im_end_id = (
-#                 self.tokenizer.im_start_id,
-#                 self.tokenizer.im_end_id,
-#             )
-#             slice_start_id, slice_end_id = (
-#                 self.tokenizer.slice_start_id,
-#                 self.tokenizer.slice_end_id,
-#             )
-#
-#             image_offsets = BaseMultimodalProcessor.get_mm_items_offset_by_pair(
-#                 input_ids=input_ids, mm_start_id=im_start_id, mm_end_id=im_end_id
-#             )
-#             slice_offsets = BaseMultimodalProcessor.get_mm_items_offset_by_pair(
-#                 input_ids=input_ids, mm_start_id=slice_start_id, mm_end_id=slice_end_id
-#             )
-#             image_offsets.extend(slice_offsets)
-#             image_offsets = sorted(image_offsets)
-#
-#             sglang_output = embed_mm_inputs(
-#                 mm_inputs_list=[
-#                     MultimodalInputs(
-#                         mm_items=[
-#                             MultimodalDataItem(
-#                                 pixel_values=pixel_values_flat,
-#                                 image_offsets=image_offsets,
-#                                 tgt_size=tgt_sizes_flat,
-#                                 modality=Modality.IMAGE,
-#                                 pad_value=self.processor.tokenizer.unk_token_id,
-#                             )
-#                         ]
-#                     ),
-#                 ],
-#                 extend_prefix_lens=[0],
-#                 extend_seq_lens=[input_ids.shape[0]],
-#                 input_ids=input_ids,
-#                 input_embedding=model.get_input_embeddings(),
-#                 image_data_embedding_func=model.get_image_feature,
-#                 placeholder_tokens={
-#                     Modality.IMAGE: self.processor.tokenizer.unk_token_id,
-#                 },
-#             )
-#
-#             self.compare_outputs(sglang_output, hf_output)
+class TestMiniCPMVLogits(VisionLLMLogitsBase):
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        cls.model_path = "openbmb/MiniCPM-V-2_6"
+        cls.tokenizer = AutoTokenizer.from_pretrained(
+            cls.model_path, trust_remote_code=True
+        )
+        cls.processor = AutoProcessor.from_pretrained(
+            cls.model_path, trust_remote_code=True
+        )
+        cls.chat_template = "minicpmv"
+
+        cls.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        cls.hf_model = (
+            AutoModel.from_pretrained(
+                cls.model_path, torch_dtype=torch.bfloat16, trust_remote_code=True
+            )
+            .eval()
+            .to(cls.device)
+        )
+        init_embedding_cache()
+
+    async def test_vlm_embedding_output(self):
+        """
+        Compares the embedding output of vlm
+        """
+        inputs = self.get_processor_output()
+
+        with torch.no_grad():
+            # hf
+            model_inputs = {
+                "input_ids": inputs.input_ids,
+                "image_bound": inputs.image_bound,
+                "pixel_values": inputs.pixel_values,
+                "tgt_sizes": inputs.tgt_sizes,
+            }
+            (hf_output, _) = self.hf_model.get_vllm_embedding(
+                model_inputs,
+            )
+            hf_output = hf_output.squeeze(0)
+
+            # sglang
+            model = self.get_sglang_model()
+            input_ids = inputs["input_ids"].to(self.device).flatten()
+
+            pixel_values = inputs["pixel_values"]
+            tgt_sizes = inputs["tgt_sizes"]
+            pixel_values_flat: List[torch.Tensor] = []
+            tgt_sizes_flat: List[torch.Tensor] = []
+            for pixel_b, tgt_b in zip(pixel_values, tgt_sizes):
+                # per image
+                if len(pixel_b) != len(tgt_b):
+                    raise ValueError(
+                        "Inconsistent N lengths, found: "
+                        f"{len(pixel_b)} vs {len(tgt_b)}"
+                    )
+                for pixel_n, tgt_n in zip(pixel_b, tgt_b):
+                    pixel_values_flat += [pixel_n]
+                    tgt_sizes_flat += [tgt_n]
+
+            im_start_id, im_end_id = (
+                self.tokenizer.im_start_id,
+                self.tokenizer.im_end_id,
+            )
+            slice_start_id, slice_end_id = (
+                self.tokenizer.slice_start_id,
+                self.tokenizer.slice_end_id,
+            )
+
+            image_offsets = BaseMultimodalProcessor.get_mm_items_offset_by_pair(
+                input_ids=input_ids, mm_start_id=im_start_id, mm_end_id=im_end_id
+            )
+            slice_offsets = BaseMultimodalProcessor.get_mm_items_offset_by_pair(
+                input_ids=input_ids, mm_start_id=slice_start_id, mm_end_id=slice_end_id
+            )
+            image_offsets.extend(slice_offsets)
+            image_offsets = sorted(image_offsets)
+
+            sglang_output = embed_mm_inputs(
+                mm_inputs_list=[
+                    MultimodalInputs(
+                        mm_items=[
+                            MultimodalDataItem(
+                                pixel_values=pixel_values_flat,
+                                offsets=image_offsets,
+                                tgt_size=tgt_sizes_flat,
+                                modality=Modality.IMAGE,
+                                pad_value=self.processor.tokenizer.unk_token_id,
+                            )
+                        ]
+                    ),
+                ],
+                extend_prefix_lens=[0],
+                extend_seq_lens=[input_ids.shape[0]],
+                input_ids=input_ids,
+                input_embedding=model.get_input_embeddings(),
+                multimodal_model=model,
+                placeholder_tokens={
+                    Modality.IMAGE: self.processor.tokenizer.unk_token_id,
+                },
+            )
+
+            self.compare_outputs(sglang_output, hf_output)
+