vlm: support video as an input modality (#5888)

2025-07-10 14:48:35 +08:00
parent 4ed57807c2
commit b5e3d6031c
42 changed files with 887 additions and 524 deletions
--- a/test/srt/test_vision_openai_server_common.py
+++ b/test/srt/test_vision_openai_server_common.py
@@ -198,7 +198,7 @@ class TestOpenAIVisionServer(CustomTestCase):
        assert response.usage.completion_tokens > 0
        assert response.usage.total_tokens > 0

-    def prepare_video_messages(self, video_path):
+    def prepare_video_images_messages(self, video_path):
        # the memory consumed by the Vision Attention varies a lot, e.g. blocked qkv vs full-sequence sdpa
        # the size of the video embeds differs from the `modality` argument when preprocessed

@@ -208,7 +208,7 @@ class TestOpenAIVisionServer(CustomTestCase):
        # from transformers import AutoTokenizer
        from decord import VideoReader, cpu

-        max_frames_num = 20
+        max_frames_num = 10
        vr = VideoReader(video_path, ctx=cpu(0))
        total_frame_num = len(vr)
        uniform_sampled_frames = np.linspace(
@@ -229,7 +229,7 @@ class TestOpenAIVisionServer(CustomTestCase):
        frame_format = {
            "type": "image_url",
            "image_url": {"url": "data:image/jpeg;base64,{}"},
-            "modalities": "video",
+            "modalities": "image",
        }

        for base64_frame in base64_frames:
@@ -243,15 +243,14 @@ class TestOpenAIVisionServer(CustomTestCase):

        return messages

-    def prepare_video_messages_video_direct(self, video_path):
+    def prepare_video_messages(self, video_path):
        messages = [
            {
                "role": "user",
                "content": [
                    {
-                        "type": "image_url",
-                        "image_url": {"url": f"video:{video_path}"},
-                        "modalities": "video",
+                        "type": "video_url",
+                        "video_url": {"url": f"{video_path}"},
                    },
                    {"type": "text", "text": "Please describe the video in detail."},
                ],
@@ -275,13 +274,57 @@ class TestOpenAIVisionServer(CustomTestCase):
                f.write(response.content)
        return file_path

-    def test_video_chat_completion(self):
+    # This test feeds frames sampled from the video as image inputs, rather than passing the video itself.
+    def test_video_images_chat_completion(self):
+        url = VIDEO_JOBS_URL
+        file_path = self.get_or_download_file(url)
+
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        messages = self.prepare_video_images_messages(file_path)
+
+        response = client.chat.completions.create(
+            model="default",
+            messages=messages,
+            temperature=0,
+            max_tokens=1024,
+            stream=False,
+        )
+
+        video_response = response.choices[0].message.content
+
+        print("-" * 30)
+        print(f"Video images response:\n{video_response}")
+        print("-" * 30)
+
+        # Validate that the response mentions the expected content of the video
+        assert (
+            "iPod" in video_response
+            or "device" in video_response
+            or "microphone" in video_response
+        ), video_response
+        assert (
+            "man" in video_response
+            or "person" in video_response
+            or "individual" in video_response
+            or "speaker" in video_response
+        ), video_response
+        assert (
+            "present" in video_response
+            or "examine" in video_response
+            or "display" in video_response
+            or "hold" in video_response
+        )
+        assert "black" in video_response or "dark" in video_response
+        self.assertIsNotNone(video_response)
+        self.assertGreater(len(video_response), 0)
+
+    def _test_video_chat_completion(self):
        url = VIDEO_JOBS_URL
        file_path = self.get_or_download_file(url)

        client = openai.Client(api_key=self.api_key, base_url=self.base_url)

-        # messages = self.prepare_video_messages_video_direct(file_path)
        messages = self.prepare_video_messages(file_path)

        response = client.chat.completions.create(
@@ -301,7 +344,9 @@ class TestOpenAIVisionServer(CustomTestCase):

        # Add assertions to validate the video response
        assert (
-            "iPod" in video_response or "device" in video_response
+            "iPod" in video_response
+            or "device" in video_response
+            or "microphone" in video_response
        ), f"video_response: {video_response}, should contain 'iPod' or 'device'"
        assert (
            "man" in video_response