From 41d71ca48834fa64c727f9b63c414dcaf3d01d80 Mon Sep 17 00:00:00 2001
From: Mick
Date: Sun, 10 Aug 2025 04:18:36 +0800
Subject: [PATCH] fix: fix obsolete qwen-audio processor arg (#9003)

---
 .../multimodal/processors/base_processor.py   | 24 +++++++++++--------
 .../srt/multimodal/processors/qwen_audio.py   |  6 +++--
 2 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/python/sglang/srt/multimodal/processors/base_processor.py b/python/sglang/srt/multimodal/processors/base_processor.py
index db80184c3..933341ee9 100644
--- a/python/sglang/srt/multimodal/processors/base_processor.py
+++ b/python/sglang/srt/multimodal/processors/base_processor.py
@@ -208,7 +208,7 @@ class BaseMultimodalProcessor(ABC):
 
     def process_mm_data(
         self, input_text, images=None, videos=None, audios=None, **kwargs
-    ):
+    ) -> dict:
         """
         process multimodal data with transformers AutoProcessor
         """
@@ -217,10 +217,14 @@ class BaseMultimodalProcessor(ABC):
         if videos:
             kwargs["videos"] = videos
         if audios:
-            kwargs["audios"] = audios
-            if self.__class__.__name__ == "Gemma3nSGLangProcessor":
+            if self.arch in {
+                "Gemma3nForConditionalGeneration",
+                "Qwen2AudioForConditionalGeneration",
+            }:
                 # Note(Xinyuan): for gemma3n, ref: https://github.com/huggingface/transformers/blob/ccf2ca162e33f381e454cdb74bf4b41a51ab976d/src/transformers/models/gemma3n/processing_gemma3n.py#L107
                 kwargs["audio"] = audios
+            else:
+                kwargs["audios"] = audios
 
         processor = self._processor
         if (
@@ -607,12 +611,6 @@ class BaseMultimodalProcessor(ABC):
         all_collected_items: list[MultimodalDataItem] = []
         input_ids = None
 
-        # Handle dict items (already processed)
-        for dict_item in dict_items:
-            all_collected_items.extend(
-                self.collect_mm_items_from_processor_output(dict_item)
-            )
-
         # Handle raw items (need processing)
         if raw_images or raw_audios or raw_videos:
             collected_items, input_ids, ret = self._process_and_collect_mm_items(
@@ -619,12 +617,18 @@ class BaseMultimodalProcessor(ABC):
                 images=raw_images,
                 audios=raw_audios,
                 videos=raw_videos,
                 **kwargs,
             )
-            all_collected_items.extend(collected_items)
+            all_collected_items = collected_items
         else:
             ret = None
 
+        # Handle dict items (already processed)
+        for dict_item in dict_items:
+            all_collected_items.extend(
+                self.collect_mm_items_from_processor_output(dict_item)
+            )
+
         # Fallback tokenization if no raw items were processed
         if input_ids is None:
             input_ids = self._processor.tokenizer(
diff --git a/python/sglang/srt/multimodal/processors/qwen_audio.py b/python/sglang/srt/multimodal/processors/qwen_audio.py
index b2bb38464..f9275feea 100644
--- a/python/sglang/srt/multimodal/processors/qwen_audio.py
+++ b/python/sglang/srt/multimodal/processors/qwen_audio.py
@@ -1,6 +1,6 @@
 import re
 
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.managers.schedule_batch import Modality
 from sglang.srt.models.qwen2_audio import Qwen2AudioForConditionalGeneration
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
@@ -29,6 +29,8 @@ class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor):
             audio_token_id=self.audio_token_id,
         ).build(_processor)
 
+        self.ATTR_NAME_TO_MODALITY.update({"feature_attention_mask": Modality.AUDIO})
+
     async def process_mm_data_async(
         self,
         audio_data,
@@ -54,7 +56,7 @@ class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor):
         input_lengths = (input_lengths - 1) // 2 + 1
         output_lengths = (input_lengths - 2) // 2 + 1
 
-        mm_items[0].model_specific_data["audio_feature_lens"] = output_lengths
+        mm_items[0].audio_feature_lens = output_lengths
 
         return {
             "mm_items": mm_items,