From 41d71ca48834fa64c727f9b63c414dcaf3d01d80 Mon Sep 17 00:00:00 2001
From: Mick
Date: Sun, 10 Aug 2025 04:18:36 +0800
Subject: [PATCH] fix: fix obsolete qwen-audio processor arg (#9003)

---
 .../multimodal/processors/base_processor.py   | 24 +++++++++++--------
 .../srt/multimodal/processors/qwen_audio.py   |  6 +++--
 2 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/python/sglang/srt/multimodal/processors/base_processor.py b/python/sglang/srt/multimodal/processors/base_processor.py
index db80184c3..933341ee9 100644
--- a/python/sglang/srt/multimodal/processors/base_processor.py
+++ b/python/sglang/srt/multimodal/processors/base_processor.py
@@ -208,7 +208,7 @@ class BaseMultimodalProcessor(ABC):
 
     def process_mm_data(
         self, input_text, images=None, videos=None, audios=None, **kwargs
-    ):
+    ) -> dict:
         """
         process multimodal data with transformers AutoProcessor
         """
@@ -217,10 +217,14 @@ class BaseMultimodalProcessor(ABC):
         if videos:
             kwargs["videos"] = videos
         if audios:
-            kwargs["audios"] = audios
-            if self.__class__.__name__ == "Gemma3nSGLangProcessor":
+            if self.arch in {
+                "Gemma3nForConditionalGeneration",
+                "Qwen2AudioForConditionalGeneration",
+            }:
                 # Note(Xinyuan): for gemma3n, ref: https://github.com/huggingface/transformers/blob/ccf2ca162e33f381e454cdb74bf4b41a51ab976d/src/transformers/models/gemma3n/processing_gemma3n.py#L107
                 kwargs["audio"] = audios
+            else:
+                kwargs["audios"] = audios
 
         processor = self._processor
         if (
@@ -607,12 +611,6 @@ class BaseMultimodalProcessor(ABC):
         all_collected_items: list[MultimodalDataItem] = []
         input_ids = None
 
-        # Handle dict items (already processed)
-        for dict_item in dict_items:
-            all_collected_items.extend(
-                self.collect_mm_items_from_processor_output(dict_item)
-            )
-
         # Handle raw items (need processing)
         if raw_images or raw_audios or raw_videos:
             collected_items, input_ids, ret = self._process_and_collect_mm_items(
@@ -619,12 +617,18 @@ class BaseMultimodalProcessor(ABC):
                 images=raw_images,
                 audios=raw_audios,
                 videos=raw_videos,
                 **kwargs,
             )
-            all_collected_items.extend(collected_items)
+            all_collected_items = collected_items
         else:
             ret = None
 
+        # Handle dict items (already processed)
+        for dict_item in dict_items:
+            all_collected_items.extend(
+                self.collect_mm_items_from_processor_output(dict_item)
+            )
+
         # Fallback tokenization if no raw items were processed
         if input_ids is None:
             input_ids = self._processor.tokenizer(
diff --git a/python/sglang/srt/multimodal/processors/qwen_audio.py b/python/sglang/srt/multimodal/processors/qwen_audio.py
index b2bb38464..f9275feea 100644
--- a/python/sglang/srt/multimodal/processors/qwen_audio.py
+++ b/python/sglang/srt/multimodal/processors/qwen_audio.py
@@ -1,6 +1,6 @@
 import re
 
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.managers.schedule_batch import Modality
 from sglang.srt.models.qwen2_audio import Qwen2AudioForConditionalGeneration
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
@@ -29,6 +29,8 @@ class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor):
             audio_token_id=self.audio_token_id,
         ).build(_processor)
 
+        self.ATTR_NAME_TO_MODALITY.update({"feature_attention_mask": Modality.AUDIO})
+
     async def process_mm_data_async(
         self,
         audio_data,
@@ -54,7 +56,7 @@ class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor):
         input_lengths = (input_lengths - 1) // 2 + 1
         output_lengths = (input_lengths - 2) // 2 + 1
 
-        mm_items[0].model_specific_data["audio_feature_lens"] = output_lengths
+        mm_items[0].audio_feature_lens = output_lengths
 
         return {
             "mm_items": mm_items,