refactor: unify names of the feature field of MultimodalDataItem (#8075)
This commit is contained in:
@@ -5,7 +5,6 @@ import multiprocessing as mp
|
||||
import os
|
||||
import re
|
||||
from abc import ABC, abstractmethod
|
||||
from functools import lru_cache
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
@@ -156,6 +155,10 @@ class BaseMultimodalProcessor(ABC):
|
||||
# "precomputed_features" - handled specially as it can be any modality
|
||||
}
|
||||
|
||||
# name of the feature field
|
||||
# TODO: pass from processors
|
||||
self.FEATURE_NAMES = ["pixel_values", "pixel_values_videos", "audio_features"]
|
||||
|
||||
def process_mm_data(
|
||||
self, input_text, images=None, videos=None, audios=None, **kwargs
|
||||
):
|
||||
@@ -524,6 +527,9 @@ class BaseMultimodalProcessor(ABC):
|
||||
if modality not in items:
|
||||
items[modality] = MultimodalDataItem(modality=modality)
|
||||
|
||||
if attr_name in self.FEATURE_NAMES:
|
||||
attr_name = "feature"
|
||||
|
||||
# Set attribute
|
||||
setattr(items[modality], attr_name, value)
|
||||
|
||||
|
||||
@@ -26,7 +26,7 @@ class ClipImageProcessor(BaseMultimodalProcessor):
|
||||
image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]
|
||||
image_inputs["mm_items"] = [
|
||||
MultimodalDataItem(
|
||||
pixel_values=image_inputs["pixel_values"], modality=Modality.IMAGE
|
||||
feature=image_inputs["pixel_values"], modality=Modality.IMAGE
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
@@ -68,7 +68,7 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
|
||||
input_ids=input_ids, mm_token_id=self._processor.image_token_id
|
||||
)
|
||||
item = MultimodalDataItem(
|
||||
pixel_values=res["images"],
|
||||
feature=res["images"],
|
||||
offsets=image_offsets,
|
||||
modality=Modality.IMAGE,
|
||||
image_emb_mask=images_seq_mask,
|
||||
|
||||
@@ -223,7 +223,7 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
|
||||
)
|
||||
items = [
|
||||
MultimodalDataItem(
|
||||
pixel_values=pixel_values,
|
||||
feature=pixel_values,
|
||||
modality=Modality.IMAGE,
|
||||
offsets=image_offsets,
|
||||
)
|
||||
|
||||
@@ -47,7 +47,7 @@ class JanusProImageProcessor(BaseMultimodalProcessor):
|
||||
return {
|
||||
"mm_items": [
|
||||
MultimodalDataItem(
|
||||
pixel_values=res["pixel_values"],
|
||||
feature=res["pixel_values"],
|
||||
image_emb_mask=res["images_emb_mask"],
|
||||
offsets=image_offsets,
|
||||
modality=Modality.IMAGE,
|
||||
|
||||
@@ -158,7 +158,7 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
|
||||
return {
|
||||
"mm_items": [
|
||||
MultimodalDataItem(
|
||||
pixel_values=pixel_values,
|
||||
feature=pixel_values,
|
||||
image_sizes=image_sizes,
|
||||
modality=modality,
|
||||
)
|
||||
|
||||
@@ -114,7 +114,7 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
|
||||
|
||||
if len(pixel_values) != 0:
|
||||
item = MultimodalDataItem(
|
||||
pixel_values=pixel_values,
|
||||
feature=pixel_values,
|
||||
offsets=image_offsets,
|
||||
tgt_size=tgt_sizes_flat,
|
||||
modality=Modality.IMAGE,
|
||||
@@ -135,7 +135,7 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
|
||||
else:
|
||||
audio_offsets = None
|
||||
item = MultimodalDataItem(
|
||||
audio_features=[res["audio_features"]],
|
||||
feature=[res["audio_features"]],
|
||||
audio_feature_lens=res["audio_feature_lens"],
|
||||
offsets=audio_offsets,
|
||||
modality=Modality.AUDIO,
|
||||
|
||||
@@ -24,7 +24,7 @@ class MllamaImageProcessor(BaseMultimodalProcessor):
|
||||
image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]
|
||||
image_inputs["mm_items"] = [
|
||||
MultimodalDataItem(
|
||||
pixel_values=image_inputs["pixel_values"],
|
||||
feature=image_inputs["pixel_values"],
|
||||
aspect_ratio_id=image_inputs["aspect_ratio_ids"],
|
||||
aspect_ratio_mask=image_inputs["aspect_ratio_mask"],
|
||||
modality=Modality.IMAGE,
|
||||
|
||||
@@ -142,7 +142,7 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
|
||||
# Add metadata for image processing
|
||||
processor_output["mm_items"] = [
|
||||
MultimodalDataItem(
|
||||
pixel_values=processor_output["pixel_values"],
|
||||
feature=processor_output["pixel_values"],
|
||||
modality=Modality.IMAGE,
|
||||
offsets=image_offsets,
|
||||
)
|
||||
|
||||
@@ -62,7 +62,7 @@ class Phi4MMImageProcessor(BaseMultimodalProcessor):
|
||||
|
||||
items = [
|
||||
MultimodalDataItem(
|
||||
pixel_values=res["input_image_embeds"],
|
||||
feature=res["input_image_embeds"],
|
||||
image_sizes=res["image_sizes"],
|
||||
image_emb_mask=res["image_attention_mask"],
|
||||
offsets=image_offsets,
|
||||
|
||||
@@ -103,7 +103,7 @@ class PixtralProcessor(BaseMultimodalProcessor):
|
||||
)
|
||||
mm_items = [
|
||||
MultimodalDataItem(
|
||||
pixel_values=processor_output["pixel_values"],
|
||||
feature=processor_output["pixel_values"],
|
||||
image_sizes=processor_output["image_sizes"],
|
||||
modality=Modality.IMAGE,
|
||||
offsets=image_offsets,
|
||||
|
||||
Reference in New Issue
Block a user