From a1a20b4c7cb86f81296ece3c4cb265584be1c179 Mon Sep 17 00:00:00 2001 From: Mick Date: Fri, 10 Oct 2025 19:35:35 +0800 Subject: [PATCH] fix: fix video input for qwen3-vl (#11361) --- python/sglang/srt/layers/rotary_embedding.py | 5 +++++ .../sglang/srt/model_executor/model_runner.py | 17 ++++++++++++++- test/srt/test_vision_openai_server_a.py | 21 +++++++++++++++++++ test/srt/test_vision_openai_server_common.py | 5 +++-- 4 files changed, 45 insertions(+), 3 deletions(-) diff --git a/python/sglang/srt/layers/rotary_embedding.py b/python/sglang/srt/layers/rotary_embedding.py index 91e58f6a0..b86d1e9de 100644 --- a/python/sglang/srt/layers/rotary_embedding.py +++ b/python/sglang/srt/layers/rotary_embedding.py @@ -1126,6 +1126,11 @@ class MRotaryEmbedding(RotaryEmbedding): second_per_grid_ts: Optional[torch.Tensor] = None, **kwargs, ) -> Tuple[torch.Tensor, torch.Tensor]: + if model_type.startswith("qwen3_vl") and video_grid_thw is not None: + video_grid_thw = torch.repeat_interleave( + video_grid_thw, video_grid_thw[:, 0], dim=0 + ) + video_grid_thw[:, 0] = 1 mrope_position_deltas = [] if input_ids is not None and ( image_grid_thw is not None or video_grid_thw is not None diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 5b1b9d22a..fded6d58c 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -186,7 +186,6 @@ UNBALANCED_MODEL_LOADING_TIMEOUT_S = 300 logger = logging.getLogger(__name__) - if _is_npu: import torch_npu @@ -625,6 +624,22 @@ class ModelRunner: "Setting hicache_io_backend to vanilla I/O, which may lead to suboptimal performance with small page sizes." 
) + if self.model_config.hf_config.model_type == "qwen3_vl_moe": + if ( + quantization_config := getattr( + self.model_config.hf_config, "quantization_config", None + ) + ) is not None: + text_config = self.model_config.hf_text_config + weight_block_size_n = quantization_config["weight_block_size"][0] + if ( + text_config.moe_intermediate_size + // (self.tp_size // self.moe_ep_size) + ) % weight_block_size_n != 0: + raise ValueError( + f"For qwen3-vl-fp8 models, please make sure ({text_config.moe_intermediate_size=} // ({self.tp_size=} // {self.moe_ep_size=})) % {weight_block_size_n=} == 0" + ) + def init_torch_distributed(self): logger.info("Init torch distributed begin.") diff --git a/test/srt/test_vision_openai_server_a.py b/test/srt/test_vision_openai_server_a.py index e8e0d62e9..b6861b99c 100644 --- a/test/srt/test_vision_openai_server_a.py +++ b/test/srt/test_vision_openai_server_a.py @@ -50,6 +50,27 @@ class TestQwen2VLServer(ImageOpenAITestMixin, VideoOpenAITestMixin): cls.base_url += "/v1" +class TestQwen3VLServer(ImageOpenAITestMixin, VideoOpenAITestMixin): + @classmethod + def setUpClass(cls): + cls.model = "Qwen/Qwen3-VL-30B-A3B-Instruct" + cls.base_url = DEFAULT_URL_FOR_TEST + cls.api_key = "sk-123456" + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + api_key=cls.api_key, + other_args=[ + "--mem-fraction-static", + "0.80", + "--cuda-graph-max-bs", + "4", + ], + ) + cls.base_url += "/v1" + + class TestQwen2_5_VLServer(ImageOpenAITestMixin, VideoOpenAITestMixin): @classmethod def setUpClass(cls): diff --git a/test/srt/test_vision_openai_server_common.py b/test/srt/test_vision_openai_server_common.py index 792636060..6af8f099c 100644 --- a/test/srt/test_vision_openai_server_common.py +++ b/test/srt/test_vision_openai_server_common.py @@ -494,7 +494,7 @@ class VideoOpenAITestMixin(TestOpenAIOmniServerBase): **(self.get_vision_request_kwargs()), ) - video_response = response.choices[0].message.content 
+ video_response = response.choices[0].message.content.lower() print("-" * 30) print(f"Video response:\n{video_response}") @@ -502,9 +502,10 @@ class VideoOpenAITestMixin(TestOpenAIOmniServerBase): # Add assertions to validate the video response assert ( - "iPod" in video_response + "ipod" in video_response or "device" in video_response or "microphone" in video_response + or "phone" in video_response ), f"video_response: {video_response}, should contain 'iPod' or 'device'" assert ( "man" in video_response