From 7b81f956eba096332fc35492cabac34c2fff680e Mon Sep 17 00:00:00 2001 From: Binyao Jiang Date: Sat, 9 Aug 2025 00:42:29 -0700 Subject: [PATCH] Fix qwen2 audio not working bug (#8600) --- python/sglang/srt/managers/mm_utils.py | 3 +- python/sglang/srt/models/qwen2_audio.py | 19 ++++---- test/srt/test_vision_openai_server_b.py | 47 ++++++++++++++++++++ test/srt/test_vision_openai_server_common.py | 2 +- 4 files changed, 59 insertions(+), 12 deletions(-) diff --git a/python/sglang/srt/managers/mm_utils.py b/python/sglang/srt/managers/mm_utils.py index d6ace8904..ceef4c332 100644 --- a/python/sglang/srt/managers/mm_utils.py +++ b/python/sglang/srt/managers/mm_utils.py @@ -614,8 +614,7 @@ def general_mm_embed_routine( input_ids: Input token IDs tensor forward_batch: Batch information for model forward pass language_model: Base language model to use - image_data_embedding_func: Function to embed image data - audio_data_embedding_func: Function to embed audio data + data_embedding_funcs: A dictionary mapping from modality type to the corresponding embedding function. placeholder_tokens: Token IDs for multimodal placeholders **kwargs: Additional arguments passed to language model diff --git a/python/sglang/srt/models/qwen2_audio.py b/python/sglang/srt/models/qwen2_audio.py index bc232f0be..180ee801b 100644 --- a/python/sglang/srt/models/qwen2_audio.py +++ b/python/sglang/srt/models/qwen2_audio.py @@ -52,7 +52,11 @@ from sglang.srt.managers.mm_utils import ( MultiModalityDataPaddingPatternMultimodalTokens, general_mm_embed_routine, ) -from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs +from sglang.srt.managers.schedule_batch import ( + Modality, + MultimodalDataItem, + MultimodalInputs, +) from sglang.srt.model_executor.forward_batch_info import ForwardBatch from sglang.srt.model_loader.weight_utils import default_weight_loader from sglang.srt.models.qwen2 import Qwen2ForCausalLM @@ -106,15 +110,10 @@ class Qwen2AudioForConditionalGeneration(nn.Module): self.language_model = Qwen2ForCausalLM( config.text_config, quant_config, prefix=add_prefix("model", prefix) ) + self.pattern = MultiModalityDataPaddingPatternMultimodalTokens() def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs): - # Get all special token IDs for audio - audio_token_id: int = getattr( - mm_inputs, "audio_token_id", mm_inputs.im_token_id - ) - - pattern = MultiModalityDataPaddingPatternMultimodalTokens([audio_token_id]) - return pattern.pad_input_tokens(input_ids, mm_inputs) + return self.pattern.pad_input_tokens(input_ids, mm_inputs) def get_audio_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor: # Extract audio features from input items @@ -143,7 +142,9 @@ class Qwen2AudioForConditionalGeneration(nn.Module): input_ids=input_ids, forward_batch=forward_batch, language_model=self.language_model, - audio_data_embedding_func=self.get_audio_feature, + data_embedding_funcs={ + Modality.AUDIO: self.get_audio_feature, + }, positions=positions, ) diff --git a/test/srt/test_vision_openai_server_b.py b/test/srt/test_vision_openai_server_b.py index 81dc2c6b7..f954aee48 100644 --- a/test/srt/test_vision_openai_server_b.py +++ b/test/srt/test_vision_openai_server_b.py @@ -190,6 +190,53 @@ class TestGemma3nServer(TestOpenAIVisionServer): # self._test_audio_ambient_completion() +class TestQwen2AudioServer(TestOpenAIVisionServer): + @classmethod + def setUpClass(cls): + cls.model = "Qwen/Qwen2-Audio-7B-Instruct" + cls.base_url = DEFAULT_URL_FOR_TEST + cls.api_key = "sk-123456" + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--trust-remote-code", + "--mem-fraction-static", + "0.70", + ], + ) + cls.base_url += "/v1" + + def test_audio_chat_completion(self): + self._test_audio_speech_completion() + self._test_audio_ambient_completion() + + # Qwen2Audio does not support image + def test_single_image_chat_completion(self): + pass + + # Qwen2Audio does not support image + def test_multi_turn_chat_completion(self): + pass + + # Qwen2Audio does not support image + def test_multi_images_chat_completion(self): + pass + + # Qwen2Audio does not support image + def test_video_images_chat_completion(self): + pass + + # Qwen2Audio does not support image + def test_regex(self): + pass + + # Qwen2Audio does not support image + def test_mixed_batch(self): + pass + + class TestKimiVLServer(TestOpenAIVisionServer): @classmethod def setUpClass(cls): diff --git a/test/srt/test_vision_openai_server_common.py b/test/srt/test_vision_openai_server_common.py index d652c6be1..a8c0aac38 100644 --- a/test/srt/test_vision_openai_server_common.py +++ b/test/srt/test_vision_openai_server_common.py @@ -547,7 +547,7 @@ class TestOpenAIVisionServer(CustomTestCase): # bird song audio_response = self.get_audio_response( AUDIO_BIRD_SONG_URL, - "Please listen to the audio snippet carefully and transcribe the content.", + "Please listen to the audio snippet carefully and transcribe the content in English.", "ambient", ) assert "bird" in audio_response