diff --git a/python/sglang/srt/layers/rotary_embedding.py b/python/sglang/srt/layers/rotary_embedding.py index 55c3121ba..a18f0bff9 100644 --- a/python/sglang/srt/layers/rotary_embedding.py +++ b/python/sglang/srt/layers/rotary_embedding.py @@ -1142,6 +1142,13 @@ class MRotaryEmbedding(RotaryEmbedding): second_per_grid_ts: Optional[torch.Tensor] = None, **kwargs, ) -> Tuple[torch.Tensor, torch.Tensor]: + if ( + model_type.startswith("qwen3_vl") or model_type.startswith("qwen3_vl_moe") + ) and video_grid_thw is not None: + video_grid_thw = torch.repeat_interleave( + video_grid_thw, video_grid_thw[:, 0], dim=0 + ) + video_grid_thw[:, 0] = 1 mrope_position_deltas = [] if input_ids is not None and ( image_grid_thw is not None or video_grid_thw is not None diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py index c034c37b9..c521b1112 100644 --- a/python/sglang/srt/managers/tokenizer_manager.py +++ b/python/sglang/srt/managers/tokenizer_manager.py @@ -25,7 +25,6 @@ import signal import sys import threading import time -import uuid from collections import deque from contextlib import nullcontext from datetime import datetime @@ -360,7 +359,8 @@ class TokenizerManager(TokenizerCommunicatorMixin): ( FreezeGCReq, lambda x: None, - ), # For handling case when scheduler skips detokenizer and forwards back to the tokenizer manager, we ignore it. + ), + # For handling case when scheduler skips detokenizer and forwards back to the tokenizer manager, we ignore it. 
(HealthCheckOutput, lambda x: None), ] ) @@ -587,9 +587,9 @@ class TokenizerManager(TokenizerCommunicatorMixin): ) if self.mm_processor and obj.contains_mm_input(): - if not isinstance(obj.image_data, list): + if not isinstance(obj.image_data, list) and obj.image_data: obj.image_data = [obj.image_data] - if not isinstance(obj.audio_data, list): + if not isinstance(obj.audio_data, list) and obj.audio_data: obj.audio_data = [obj.audio_data] mm_inputs: Dict = await self.mm_processor.process_mm_data_async( image_data=obj.image_data, diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 3977ad01e..d59378a72 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -196,7 +196,6 @@ MAMBA_CACHE_SIZE_MAX_RUNNING_REQUESTS_RATIO = 3 logger = logging.getLogger(__name__) - if _is_npu: import torch_npu @@ -636,6 +635,22 @@ class ModelRunner: "Setting hicache_io_backend to vanilla I/O, which may lead to suboptimal performance with small page sizes." 
) + if self.model_config.hf_config.model_type == "qwen3_vl_moe": + if ( + quantization_config := getattr( + self.model_config.hf_config, "quantization_config", None + ) + ) is not None: + text_config = self.model_config.hf_text_config + weight_block_size_n = quantization_config["weight_block_size"][0] + if ( + text_config.moe_intermediate_size + // (self.tp_size // self.moe_ep_size) + ) % weight_block_size_n != 0: + raise ValueError( + f"For qwen3-vl-fp8 models, please make sure ({text_config.moe_intermediate_size=} // ({self.tp_size=} // {self.moe_ep_size=})) % {weight_block_size_n=} == 0" + ) + def init_torch_distributed(self): logger.info("Init torch distributed begin.") diff --git a/test/srt/test_vision_openai_server_a.py b/test/srt/test_vision_openai_server_a.py index e8e0d62e9..b6861b99c 100644 --- a/test/srt/test_vision_openai_server_a.py +++ b/test/srt/test_vision_openai_server_a.py @@ -50,6 +50,27 @@ class TestQwen2VLServer(ImageOpenAITestMixin, VideoOpenAITestMixin): cls.base_url += "/v1" +class TestQwen3VLServer(ImageOpenAITestMixin, VideoOpenAITestMixin): + @classmethod + def setUpClass(cls): + cls.model = "Qwen/Qwen3-VL-30B-A3B-Instruct" + cls.base_url = DEFAULT_URL_FOR_TEST + cls.api_key = "sk-123456" + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + api_key=cls.api_key, + other_args=[ + "--mem-fraction-static", + "0.80", + "--cuda-graph-max-bs", + "4", + ], + ) + cls.base_url += "/v1" + + class TestQwen2_5_VLServer(ImageOpenAITestMixin, VideoOpenAITestMixin): @classmethod def setUpClass(cls): diff --git a/test/srt/test_vision_openai_server_common.py b/test/srt/test_vision_openai_server_common.py index 792636060..6af8f099c 100644 --- a/test/srt/test_vision_openai_server_common.py +++ b/test/srt/test_vision_openai_server_common.py @@ -494,7 +494,7 @@ class VideoOpenAITestMixin(TestOpenAIOmniServerBase): **(self.get_vision_request_kwargs()), ) - video_response = 
response.choices[0].message.content + video_response = response.choices[0].message.content.lower() print("-" * 30) print(f"Video response:\n{video_response}") @@ -502,9 +502,10 @@ # Add assertions to validate the video response assert ( - "iPod" in video_response + "ipod" in video_response or "device" in video_response or "microphone" in video_response + or "phone" in video_response - ), f"video_response: {video_response}, should contain 'iPod' or 'device'" + ), f"video_response: {video_response}, should contain 'ipod', 'device', 'microphone', or 'phone'" assert ( "man" in video_response