fix: second_per_grid_ts should be used to get mrope position (#3682)

2025-03-18 09:12:38 +08:00
parent 98be3bd306
commit d373a48c98
8 changed files with 93 additions and 69 deletions
--- a/test/srt/test_vision_openai_server.py
+++ b/test/srt/test_vision_openai_server.py
@@ -191,7 +191,7 @@ class TestOpenAIVisionServer(unittest.TestCase):
        # from transformers import AutoTokenizer
        from decord import VideoReader, cpu

-        max_frames_num = 12
+        max_frames_num = 20
        vr = VideoReader(video_path, ctx=cpu(0))
        total_frame_num = len(vr)
        uniform_sampled_frames = np.linspace(
@@ -226,6 +226,22 @@ class TestOpenAIVisionServer(unittest.TestCase):

        return messages

+    def prepare_video_messages_video_direct(self, video_path):  # build a one-turn message list that references the video by path
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",  # the video reference is carried in an image_url content part
+                        "image_url": {"url": f"video:{video_path}"},  # URL is the given path prefixed with "video:"
+                        "modalities": "video",
+                    },
+                    {"type": "text", "text": "Please describe the video in detail."},
+                ],
+            },
+        ]
+        return messages  # one user turn containing the video part followed by the text prompt
+
    def test_video_chat_completion(self):
        url = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"
        cache_dir = os.path.expanduser("~/.cache")
@@ -241,6 +257,7 @@ class TestOpenAIVisionServer(unittest.TestCase):

        client = openai.Client(api_key=self.api_key, base_url=self.base_url)

+        # Disabled alternative: messages = self.prepare_video_messages_video_direct(file_path)
        messages = self.prepare_video_messages(file_path)

        video_request = client.chat.completions.create(
@@ -266,6 +283,7 @@ class TestOpenAIVisionServer(unittest.TestCase):
            "man" in video_response
            or "person" in video_response
            or "individual" in video_response
+            or "speaker" in video_response
        ), video_response
        assert (
            "present" in video_response
@@ -368,7 +386,7 @@ class TestOpenAIVisionServer(unittest.TestCase):
            list(executor.map(self.run_decode_with_image, image_ids))


-class TestQWen2VLServer(TestOpenAIVisionServer):
+class TestQwen2VLServer(TestOpenAIVisionServer):
    @classmethod
    def setUpClass(cls):
        cls.model = "Qwen/Qwen2-VL-7B-Instruct"
@@ -382,14 +400,14 @@ class TestQWen2VLServer(TestOpenAIVisionServer):
            other_args=[
                "--chat-template",
                "qwen2-vl",
-                "--chunked-prefill-size",
-                "10000",
+                "--mem-fraction-static",
+                "0.4",
            ],
        )
        cls.base_url += "/v1"


-class TestQWen2_5_VLServer(TestOpenAIVisionServer):
+class TestQwen2_5_VLServer(TestOpenAIVisionServer):
    @classmethod
    def setUpClass(cls):
        cls.model = "Qwen/Qwen2.5-VL-7B-Instruct"
@@ -403,9 +421,6 @@ class TestQWen2_5_VLServer(TestOpenAIVisionServer):
            other_args=[
                "--chat-template",
                "qwen2-vl",
-                # FIXME: workaround to chunked prefill within image embeds
-                "--chunked-prefill-size",
-                "10000",
                "--mem-fraction-static",
                "0.4",
            ],
@@ -508,6 +523,8 @@ class TestMinicpmvServer(TestOpenAIVisionServer):
                "--trust-remote-code",
                "--chat-template",
                "minicpmv",
+                "--mem-fraction-static",
+                "0.4",
            ],
        )
        cls.base_url += "/v1"