fix: fix obsolete qwen-audio processor arg (#9003)
This commit is contained in:
@@ -208,7 +208,7 @@ class BaseMultimodalProcessor(ABC):
|
|||||||
|
|
||||||
def process_mm_data(
|
def process_mm_data(
|
||||||
self, input_text, images=None, videos=None, audios=None, **kwargs
|
self, input_text, images=None, videos=None, audios=None, **kwargs
|
||||||
):
|
) -> dict:
|
||||||
"""
|
"""
|
||||||
process multimodal data with transformers AutoProcessor
|
process multimodal data with transformers AutoProcessor
|
||||||
"""
|
"""
|
||||||
@@ -217,10 +217,14 @@ class BaseMultimodalProcessor(ABC):
|
|||||||
if videos:
|
if videos:
|
||||||
kwargs["videos"] = videos
|
kwargs["videos"] = videos
|
||||||
if audios:
|
if audios:
|
||||||
kwargs["audios"] = audios
|
if self.arch in {
|
||||||
if self.__class__.__name__ == "Gemma3nSGLangProcessor":
|
"Gemma3nForConditionalGeneration",
|
||||||
|
"Qwen2AudioForConditionalGeneration",
|
||||||
|
}:
|
||||||
# Note(Xinyuan): for gemma3n, ref: https://github.com/huggingface/transformers/blob/ccf2ca162e33f381e454cdb74bf4b41a51ab976d/src/transformers/models/gemma3n/processing_gemma3n.py#L107
|
# Note(Xinyuan): for gemma3n, ref: https://github.com/huggingface/transformers/blob/ccf2ca162e33f381e454cdb74bf4b41a51ab976d/src/transformers/models/gemma3n/processing_gemma3n.py#L107
|
||||||
kwargs["audio"] = audios
|
kwargs["audio"] = audios
|
||||||
|
else:
|
||||||
|
kwargs["audios"] = audios
|
||||||
|
|
||||||
processor = self._processor
|
processor = self._processor
|
||||||
if (
|
if (
|
||||||
@@ -607,12 +611,6 @@ class BaseMultimodalProcessor(ABC):
|
|||||||
all_collected_items: list[MultimodalDataItem] = []
|
all_collected_items: list[MultimodalDataItem] = []
|
||||||
input_ids = None
|
input_ids = None
|
||||||
|
|
||||||
# Handle dict items (already processed)
|
|
||||||
for dict_item in dict_items:
|
|
||||||
all_collected_items.extend(
|
|
||||||
self.collect_mm_items_from_processor_output(dict_item)
|
|
||||||
)
|
|
||||||
|
|
||||||
# Handle raw items (need processing)
|
# Handle raw items (need processing)
|
||||||
if raw_images or raw_audios or raw_videos:
|
if raw_images or raw_audios or raw_videos:
|
||||||
collected_items, input_ids, ret = self._process_and_collect_mm_items(
|
collected_items, input_ids, ret = self._process_and_collect_mm_items(
|
||||||
@@ -622,10 +620,16 @@ class BaseMultimodalProcessor(ABC):
|
|||||||
videos=raw_videos,
|
videos=raw_videos,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
all_collected_items.extend(collected_items)
|
all_collected_items = collected_items
|
||||||
else:
|
else:
|
||||||
ret = None
|
ret = None
|
||||||
|
|
||||||
|
# Handle dict items (already processed)
|
||||||
|
for dict_item in dict_items:
|
||||||
|
all_collected_items.extend(
|
||||||
|
self.collect_mm_items_from_processor_output(dict_item)
|
||||||
|
)
|
||||||
|
|
||||||
# Fallback tokenization if no raw items were processed
|
# Fallback tokenization if no raw items were processed
|
||||||
if input_ids is None:
|
if input_ids is None:
|
||||||
input_ids = self._processor.tokenizer(
|
input_ids = self._processor.tokenizer(
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import re
|
import re
|
||||||
|
|
||||||
from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
|
from sglang.srt.managers.schedule_batch import Modality
|
||||||
from sglang.srt.models.qwen2_audio import Qwen2AudioForConditionalGeneration
|
from sglang.srt.models.qwen2_audio import Qwen2AudioForConditionalGeneration
|
||||||
from sglang.srt.multimodal.processors.base_processor import (
|
from sglang.srt.multimodal.processors.base_processor import (
|
||||||
BaseMultimodalProcessor,
|
BaseMultimodalProcessor,
|
||||||
@@ -29,6 +29,8 @@ class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor):
|
|||||||
audio_token_id=self.audio_token_id,
|
audio_token_id=self.audio_token_id,
|
||||||
).build(_processor)
|
).build(_processor)
|
||||||
|
|
||||||
|
self.ATTR_NAME_TO_MODALITY.update({"feature_attention_mask": Modality.AUDIO})
|
||||||
|
|
||||||
async def process_mm_data_async(
|
async def process_mm_data_async(
|
||||||
self,
|
self,
|
||||||
audio_data,
|
audio_data,
|
||||||
@@ -54,7 +56,7 @@ class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor):
|
|||||||
input_lengths = (input_lengths - 1) // 2 + 1
|
input_lengths = (input_lengths - 1) // 2 + 1
|
||||||
output_lengths = (input_lengths - 2) // 2 + 1
|
output_lengths = (input_lengths - 2) // 2 + 1
|
||||||
|
|
||||||
mm_items[0].model_specific_data["audio_feature_lens"] = output_lengths
|
mm_items[0].audio_feature_lens = output_lengths
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"mm_items": mm_items,
|
"mm_items": mm_items,
|
||||||
|
|||||||
Reference in New Issue
Block a user