fix: fix obsolete qwen-audio processor arg (#9003)

2025-08-10 04:18:36 +08:00
parent 20cfc5a251
commit 41d71ca488
2 changed files with 18 additions and 12 deletions
--- a/python/sglang/srt/multimodal/processors/base_processor.py
+++ b/python/sglang/srt/multimodal/processors/base_processor.py
@@ -208,7 +208,7 @@ class BaseMultimodalProcessor(ABC):

    def process_mm_data(
        self, input_text, images=None, videos=None, audios=None, **kwargs
-    ):
+    ) -> dict:
        """
        process multimodal data with transformers AutoProcessor
        """
@@ -217,10 +217,14 @@ class BaseMultimodalProcessor(ABC):
        if videos:
            kwargs["videos"] = videos
        if audios:
-            kwargs["audios"] = audios
-            if self.__class__.__name__ == "Gemma3nSGLangProcessor":
+            if self.arch in {
+                "Gemma3nForConditionalGeneration",
+                "Qwen2AudioForConditionalGeneration",
+            }:
                # Note(Xinyuan): for gemma3n, ref: https://github.com/huggingface/transformers/blob/ccf2ca162e33f381e454cdb74bf4b41a51ab976d/src/transformers/models/gemma3n/processing_gemma3n.py#L107
                kwargs["audio"] = audios
+            else:
+                kwargs["audios"] = audios

        processor = self._processor
        if (
@@ -607,12 +611,6 @@ class BaseMultimodalProcessor(ABC):
        all_collected_items: list[MultimodalDataItem] = []
        input_ids = None

-        # Handle dict items (already processed)
-        for dict_item in dict_items:
-            all_collected_items.extend(
-                self.collect_mm_items_from_processor_output(dict_item)
-            )
-
        # Handle raw items (need processing)
        if raw_images or raw_audios or raw_videos:
            collected_items, input_ids, ret = self._process_and_collect_mm_items(
@@ -622,10 +620,16 @@ class BaseMultimodalProcessor(ABC):
                videos=raw_videos,
                **kwargs,
            )
-            all_collected_items.extend(collected_items)
+            all_collected_items = collected_items
        else:
            ret = None

+        # Handle dict items (already processed)
+        for dict_item in dict_items:
+            all_collected_items.extend(
+                self.collect_mm_items_from_processor_output(dict_item)
+            )
+
        # Fallback tokenization if no raw items were processed
        if input_ids is None:
            input_ids = self._processor.tokenizer(
--- a/python/sglang/srt/multimodal/processors/qwen_audio.py
+++ b/python/sglang/srt/multimodal/processors/qwen_audio.py
@@ -1,6 +1,6 @@
 import re

-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.managers.schedule_batch import Modality
 from sglang.srt.models.qwen2_audio import Qwen2AudioForConditionalGeneration
 from sglang.srt.multimodal.processors.base_processor import (
    BaseMultimodalProcessor,
@@ -29,6 +29,8 @@ class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor):
            audio_token_id=self.audio_token_id,
        ).build(_processor)

+        self.ATTR_NAME_TO_MODALITY.update({"feature_attention_mask": Modality.AUDIO})
+
    async def process_mm_data_async(
        self,
        audio_data,
@@ -54,7 +56,7 @@ class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor):
        input_lengths = (input_lengths - 1) // 2 + 1
        output_lengths = (input_lengths - 2) // 2 + 1

-        mm_items[0].model_specific_data["audio_feature_lens"] = output_lengths
+        mm_items[0].audio_feature_lens = output_lengths

        return {
            "mm_items": mm_items,