From b4408e6098ca06026d8ff0a56fa86492c0e27b99 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Fri, 10 Oct 2025 12:44:40 -0700 Subject: [PATCH] Revert "fix: fix video input for qwen3-vl" (#11437) --- python/sglang/srt/layers/rotary_embedding.py | 5 ----- .../sglang/srt/model_executor/model_runner.py | 17 +-------------- test/srt/test_vision_openai_server_a.py | 21 ------------------- test/srt/test_vision_openai_server_common.py | 5 ++--- 4 files changed, 3 insertions(+), 45 deletions(-) diff --git a/python/sglang/srt/layers/rotary_embedding.py b/python/sglang/srt/layers/rotary_embedding.py index b86d1e9de..91e58f6a0 100644 --- a/python/sglang/srt/layers/rotary_embedding.py +++ b/python/sglang/srt/layers/rotary_embedding.py @@ -1126,11 +1126,6 @@ class MRotaryEmbedding(RotaryEmbedding): second_per_grid_ts: Optional[torch.Tensor] = None, **kwargs, ) -> Tuple[torch.Tensor, torch.Tensor]: - if model_type.startswith("qwen3_vl") and video_grid_thw is not None: - video_grid_thw = torch.repeat_interleave( - video_grid_thw, video_grid_thw[:, 0], dim=0 - ) - video_grid_thw[:, 0] = 1 mrope_position_deltas = [] if input_ids is not None and ( image_grid_thw is not None or video_grid_thw is not None diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index fded6d58c..5b1b9d22a 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -186,6 +186,7 @@ UNBALANCED_MODEL_LOADING_TIMEOUT_S = 300 logger = logging.getLogger(__name__) + if _is_npu: import torch_npu @@ -624,22 +625,6 @@ class ModelRunner: "Setting hicache_io_backend to vanilla I/O, which may lead to suboptimal performance with small page sizes." ) - if self.model_config.hf_config.model_type == "qwen3_vl_moe": - if ( - quantization_config := getattr( - self.model_config.hf_config, "quantization_config" - ) - ) is not None: - text_config = self.model_config.hf_text_config - weight_block_size_n = quantization_config["weight_block_size"][0] - if ( - text_config.moe_intermediate_size - // (self.tp_size // self.moe_ep_size) - ) % weight_block_size_n != 0: - raise ValueError( - f"For qwen3-vl-fp8 models, please make sure ({text_config.moe_intermediate_size=} // ({self.tp_size=} // {self.moe_ep_size=})) % {weight_block_size_n=} == 0" - ) - def init_torch_distributed(self): logger.info("Init torch distributed begin.") diff --git a/test/srt/test_vision_openai_server_a.py b/test/srt/test_vision_openai_server_a.py index b6861b99c..e8e0d62e9 100644 --- a/test/srt/test_vision_openai_server_a.py +++ b/test/srt/test_vision_openai_server_a.py @@ -50,27 +50,6 @@ class TestQwen2VLServer(ImageOpenAITestMixin, VideoOpenAITestMixin): cls.base_url += "/v1" -class TestQwen3VLServer(ImageOpenAITestMixin, VideoOpenAITestMixin): - @classmethod - def setUpClass(cls): - cls.model = "Qwen/Qwen3-VL-30B-A3B-Instruct" - cls.base_url = DEFAULT_URL_FOR_TEST - cls.api_key = "sk-123456" - cls.process = popen_launch_server( - cls.model, - cls.base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - api_key=cls.api_key, - other_args=[ - "--mem-fraction-static", - "0.80", - "--cuda-graph-max-bs", - "4", - ], - ) - cls.base_url += "/v1" - - class TestQwen2_5_VLServer(ImageOpenAITestMixin, VideoOpenAITestMixin): @classmethod def setUpClass(cls): diff --git a/test/srt/test_vision_openai_server_common.py b/test/srt/test_vision_openai_server_common.py index 6af8f099c..792636060 100644 --- a/test/srt/test_vision_openai_server_common.py +++ b/test/srt/test_vision_openai_server_common.py @@ -494,7 +494,7 @@ class VideoOpenAITestMixin(TestOpenAIOmniServerBase): **(self.get_vision_request_kwargs()), ) - video_response = response.choices[0].message.content.lower() + video_response = response.choices[0].message.content print("-" * 30) print(f"Video response:\n{video_response}") @@ -502,10 +502,9 @@ class VideoOpenAITestMixin(TestOpenAIOmniServerBase): # Add assertions to validate the video response assert ( - "ipod" in video_response + "iPod" in video_response or "device" in video_response or "microphone" in video_response - or "phone" in video_response ), f"video_response: {video_response}, should contain 'iPod' or 'device'" assert ( "man" in video_response