fix: fix video input for qwen3-vl (#11442)

2025-10-14 00:30:43 +08:00
parent 54a46a264d
commit f35f120d70
5 changed files with 51 additions and 7 deletions
--- a/python/sglang/srt/layers/rotary_embedding.py
+++ b/python/sglang/srt/layers/rotary_embedding.py
@@ -1142,6 +1142,13 @@ class MRotaryEmbedding(RotaryEmbedding):
        second_per_grid_ts: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # Qwen3-VL family only (the "qwen3_vl" prefix also matches
        # "qwen3_vl_moe", so a single prefix test covers both variants).
        if model_type.startswith("qwen3_vl") and video_grid_thw is not None:
            # Repeat every row of video_grid_thw as many times as its first
            # column says, then reset that column to 1, so each resulting row
            # stands for a single step along the first axis.
            # NOTE(review): column 0 is presumably the temporal (t) entry of
            # (t, h, w) -- confirm against the caller.
            video_grid_thw = torch.repeat_interleave(
                video_grid_thw, video_grid_thw[:, 0], dim=0
            )
            # repeat_interleave returned a new tensor, so this in-place write
            # does not touch the tensor passed in by the caller.
            video_grid_thw[:, 0] = 1
        mrope_position_deltas = []
        if input_ids is not None and (
            image_grid_thw is not None or video_grid_thw is not None
--- a/python/sglang/srt/managers/tokenizer_manager.py
+++ b/python/sglang/srt/managers/tokenizer_manager.py
@@ -25,7 +25,6 @@ import signal
 import sys
 import threading
 import time
 import uuid
 from collections import deque
 from contextlib import nullcontext
 from datetime import datetime
@@ -360,7 +359,8 @@ class TokenizerManager(TokenizerCommunicatorMixin):
                (
                    FreezeGCReq,
                    lambda x: None,
-                ),  # For handling case when scheduler skips detokenizer and forwards back to the tokenizer manager, we ignore it.
+                ),
                # Handles the case where the scheduler skips the detokenizer and forwards the message straight back to the tokenizer manager; we ignore it.
                (HealthCheckOutput, lambda x: None),
            ]
        )
@@ -587,9 +587,9 @@ class TokenizerManager(TokenizerCommunicatorMixin):
            )
        if self.mm_processor and obj.contains_mm_input():
-            if not isinstance(obj.image_data, list):
+            if not isinstance(obj.image_data, list) and obj.image_data:
                obj.image_data = [obj.image_data]
-            if not isinstance(obj.audio_data, list):
+            if not isinstance(obj.audio_data, list) and obj.audio_data:
                obj.audio_data = [obj.audio_data]
            mm_inputs: Dict = await self.mm_processor.process_mm_data_async(
                image_data=obj.image_data,
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -196,7 +196,6 @@ MAMBA_CACHE_SIZE_MAX_RUNNING_REQUESTS_RATIO = 3
 logger = logging.getLogger(__name__)
 if _is_npu:
    import torch_npu
@@ -636,6 +635,22 @@ class ModelRunner:
                    "Setting hicache_io_backend to vanilla I/O, which may lead to suboptimal performance with small page sizes."
                )
        if self.model_config.hf_config.model_type == "qwen3_vl_moe":
            # Fail fast when a block-quantized checkpoint cannot be sharded
            # with the requested parallel sizes.
            quantization_config = getattr(
                self.model_config.hf_config, "quantization_config", None
            )
            # A quantization config without a block size (key missing or set
            # to null) has nothing to validate here; indexing it directly
            # would raise KeyError/TypeError instead of letting the model load.
            # NOTE(review): assumes quantization_config is dict-like -- confirm.
            weight_block_size = (
                quantization_config.get("weight_block_size")
                if quantization_config is not None
                else None
            )
            if weight_block_size:
                text_config = self.model_config.hf_text_config
                weight_block_size_n = weight_block_size[0]
                # The per-partition MoE intermediate size must be a whole
                # multiple of the first weight-block dimension.
                if (
                    text_config.moe_intermediate_size
                    // (self.tp_size // self.moe_ep_size)
                ) % weight_block_size_n != 0:
                    raise ValueError(
                        f"For qwen3-vl-fp8 models, please make sure ({text_config.moe_intermediate_size=} // ({self.tp_size=} // {self.moe_ep_size=})) % {weight_block_size_n=} == 0"
                    )
    def init_torch_distributed(self):
        logger.info("Init torch distributed begin.")
--- a/test/srt/test_vision_openai_server_a.py
+++ b/test/srt/test_vision_openai_server_a.py
@@ -50,6 +50,27 @@ class TestQwen2VLServer(ImageOpenAITestMixin, VideoOpenAITestMixin):
        cls.base_url += "/v1"
 class TestQwen3VLServer(ImageOpenAITestMixin, VideoOpenAITestMixin):
    # Combines the image and video test mixins against a Qwen3-VL server.
    @classmethod
    def setUpClass(cls):
        """Launch the Qwen3-VL server once and record how to reach it."""
        server_url = DEFAULT_URL_FOR_TEST
        cls.model = "Qwen/Qwen3-VL-30B-A3B-Instruct"
        cls.base_url = server_url
        cls.api_key = "sk-123456"
        launch_flags = [
            "--mem-fraction-static",
            "0.80",
            "--cuda-graph-max-bs",
            "4",
        ]
        cls.process = popen_launch_server(
            cls.model,
            server_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            api_key=cls.api_key,
            other_args=launch_flags,
        )
        # After launch, base_url gains the "/v1" suffix.
        cls.base_url = server_url + "/v1"
 class TestQwen2_5_VLServer(ImageOpenAITestMixin, VideoOpenAITestMixin):
    @classmethod
    def setUpClass(cls):
--- a/test/srt/test_vision_openai_server_common.py
+++ b/test/srt/test_vision_openai_server_common.py
@@ -494,7 +494,7 @@ class VideoOpenAITestMixin(TestOpenAIOmniServerBase):
            **(self.get_vision_request_kwargs()),
        )
-        video_response = response.choices[0].message.content
+        video_response = response.choices[0].message.content.lower()
        print("-" * 30)
        print(f"Video response:\n{video_response}")
@@ -502,9 +502,10 @@ class VideoOpenAITestMixin(TestOpenAIOmniServerBase):
        # Add assertions to validate the video response
        assert (
-            "iPod" in video_response
+            "ipod" in video_response
            or "device" in video_response
            or "microphone" in video_response
            or "phone" in video_response
        ), f"video_response: {video_response}, should contain 'iPod' or 'device'"
        assert (
            "man" in video_response