fix: fix obsolete qwen-audio processor arg (#9003)
This commit is contained in:
@@ -208,7 +208,7 @@ class BaseMultimodalProcessor(ABC):
|
||||
|
||||
def process_mm_data(
|
||||
self, input_text, images=None, videos=None, audios=None, **kwargs
|
||||
):
|
||||
) -> dict:
|
||||
"""
|
||||
process multimodal data with transformers AutoProcessor
|
||||
"""
|
||||
@@ -217,10 +217,14 @@ class BaseMultimodalProcessor(ABC):
|
||||
if videos:
|
||||
kwargs["videos"] = videos
|
||||
if audios:
|
||||
kwargs["audios"] = audios
|
||||
if self.__class__.__name__ == "Gemma3nSGLangProcessor":
|
||||
if self.arch in {
|
||||
"Gemma3nForConditionalGeneration",
|
||||
"Qwen2AudioForConditionalGeneration",
|
||||
}:
|
||||
# Note(Xinyuan): for gemma3n, ref: https://github.com/huggingface/transformers/blob/ccf2ca162e33f381e454cdb74bf4b41a51ab976d/src/transformers/models/gemma3n/processing_gemma3n.py#L107
|
||||
kwargs["audio"] = audios
|
||||
else:
|
||||
kwargs["audios"] = audios
|
||||
|
||||
processor = self._processor
|
||||
if (
|
||||
@@ -607,12 +611,6 @@ class BaseMultimodalProcessor(ABC):
|
||||
all_collected_items: list[MultimodalDataItem] = []
|
||||
input_ids = None
|
||||
|
||||
# Handle dict items (already processed)
|
||||
for dict_item in dict_items:
|
||||
all_collected_items.extend(
|
||||
self.collect_mm_items_from_processor_output(dict_item)
|
||||
)
|
||||
|
||||
# Handle raw items (need processing)
|
||||
if raw_images or raw_audios or raw_videos:
|
||||
collected_items, input_ids, ret = self._process_and_collect_mm_items(
|
||||
@@ -622,10 +620,16 @@ class BaseMultimodalProcessor(ABC):
|
||||
videos=raw_videos,
|
||||
**kwargs,
|
||||
)
|
||||
all_collected_items.extend(collected_items)
|
||||
all_collected_items = collected_items
|
||||
else:
|
||||
ret = None
|
||||
|
||||
# Handle dict items (already processed)
|
||||
for dict_item in dict_items:
|
||||
all_collected_items.extend(
|
||||
self.collect_mm_items_from_processor_output(dict_item)
|
||||
)
|
||||
|
||||
# Fallback tokenization if no raw items were processed
|
||||
if input_ids is None:
|
||||
input_ids = self._processor.tokenizer(
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import re
|
||||
|
||||
from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
|
||||
from sglang.srt.managers.schedule_batch import Modality
|
||||
from sglang.srt.models.qwen2_audio import Qwen2AudioForConditionalGeneration
|
||||
from sglang.srt.multimodal.processors.base_processor import (
|
||||
BaseMultimodalProcessor,
|
||||
@@ -29,6 +29,8 @@ class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor):
|
||||
audio_token_id=self.audio_token_id,
|
||||
).build(_processor)
|
||||
|
||||
self.ATTR_NAME_TO_MODALITY.update({"feature_attention_mask": Modality.AUDIO})
|
||||
|
||||
async def process_mm_data_async(
|
||||
self,
|
||||
audio_data,
|
||||
@@ -54,7 +56,7 @@ class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor):
|
||||
input_lengths = (input_lengths - 1) // 2 + 1
|
||||
output_lengths = (input_lengths - 2) // 2 + 1
|
||||
|
||||
mm_items[0].model_specific_data["audio_feature_lens"] = output_lengths
|
||||
mm_items[0].audio_feature_lens = output_lengths
|
||||
|
||||
return {
|
||||
"mm_items": mm_items,
|
||||
|
||||
Reference in New Issue
Block a user