diff --git a/python/sglang/srt/managers/multimodal_processors/qwen_audio.py b/python/sglang/srt/managers/multimodal_processors/qwen_audio.py
index 0558b5f5a..23b7de5cf 100644
--- a/python/sglang/srt/managers/multimodal_processors/qwen_audio.py
+++ b/python/sglang/srt/managers/multimodal_processors/qwen_audio.py
@@ -78,7 +78,7 @@ class Qwen2AudioMultimodalProcessor(BaseMultimodalProcessor):
         output_lengths = (input_lengths - 2) // 2 + 1

         item = MultimodalDataItem(
-            audio_features=res["input_features"],
+            feature=res["input_features"],
             audio_feature_lens=output_lengths,
             audio_offsets=audio_offsets,
             modality=Modality.AUDIO,
diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
index c2750d072..01da558b7 100644
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -207,13 +207,12 @@ class MultimodalDataItem:
     modality: Modality
     hash: int = None
     pad_value: int = None
-    image_sizes: Tuple[int, int] = None
     offsets: Optional[list] = None
+    # the raw features returned by processor, e.g. pixel_values or audio_features
+    feature: Union[torch.Tensor, np.ndarray] = None
+
+    image_sizes: Tuple[int, int] = None

-    # the real data, pixel_values or audio_features
-    # data: Union[List[torch.Tensor], List[np.ndarray]]
-    pixel_values: Union[torch.Tensor, np.ndarray, "PIL.Image"] = None
-    audio_features: Union[torch.Tensor, np.ndarray] = None
     audio_feature_lens: Optional[List[torch.Tensor]] = None
     audio_offsets: Optional[List[Tuple[int, int]]] = None
     precomputed_features: Optional[Union[torch.Tensor, np.ndarray]] = None
@@ -238,7 +237,6 @@ class MultimodalDataItem:
     image_grid_hws: Optional[List[torch.Tensor]] = None

     # For gemma3n
-    input_features: Optional[torch.Tensor] = None
     input_features_mask: Optional[torch.Tensor] = None

     @staticmethod
@@ -254,18 +252,11 @@ class MultimodalDataItem:
         from sglang.srt.managers.mm_utils import hash_feature

         if self.hash is None:
-            if self.precomputed_features is not None:
-                self.hash = hash_feature(self.precomputed_features)
-            elif self.is_audio():
-                if self.audio_features is not None:
-                    self.hash = hash_feature(self.audio_features)
-                elif self.input_features is not None:
-                    self.hash = hash_feature(self.input_features)
-            elif self.is_video():
-                self.hash = hash_feature(self.pixel_values_videos)
+            if self.feature is not None:
+                hashed_feature = self.feature
             else:
-                self.hash = hash_feature(self.pixel_values)
-
+                hashed_feature = self.precomputed_features
+            self.hash = hash_feature(hashed_feature)
         assert self.hash is not None
         self.pad_value = self.hash % (1 << 30)
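Reviewer note: the hunk above collapses four modality-specific hashing branches into one. A minimal standalone sketch of the resulting flow (the toy hash_feature below is only a stand-in for sglang.srt.managers.mm_utils.hash_feature, whose real implementation is not part of this diff):

    import torch

    def hash_feature(f):
        # stand-in: hash the raw bytes of the tensor (assumed semantics)
        return hash(f.cpu().numpy().tobytes())

    feature = torch.randn(1, 3, 224, 224)  # any modality's raw feature
    hashed = hash_feature(feature)         # one code path for image/video/audio
    pad_value = hashed % (1 << 30)         # same derivation as set_pad_value above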
@@ -275,8 +266,7 @@ class MultimodalDataItem:
     def is_audio(self):
         return (self.modality == Modality.AUDIO) and (
             self.precomputed_features is not None
-            or not MultimodalDataItem.is_empty_list(self.audio_features)
-            or not MultimodalDataItem.is_empty_list(self.input_features)
+            or not MultimodalDataItem.is_empty_list(self.feature)
         )

     def is_image(self):
@@ -284,13 +274,13 @@ class MultimodalDataItem:
             self.is_modality(Modality.IMAGE) or self.is_modality(Modality.MULTI_IMAGES)
         ) and (
             self.precomputed_features is not None
-            or not MultimodalDataItem.is_empty_list(self.pixel_values)
+            or not MultimodalDataItem.is_empty_list(self.feature)
         )

     def is_video(self):
         return (self.modality == Modality.VIDEO) and (
             self.precomputed_features is not None
-            or not MultimodalDataItem.is_empty_list(self.pixel_values_videos)
+            or not MultimodalDataItem.is_empty_list(self.feature)
         )

     def is_valid(self) -> bool:
@@ -311,7 +301,7 @@ class MultimodalDataItem:
         return ret

     def merge(self, other):
-        self.pixel_values += other.pixel_values
+        self.feature += other.feature
         self.image_sizes += other.image_sizes
         self.image_offsets += other.image_offsets
         self.hash = hash((self.hash, other.hash))
@@ -354,7 +344,6 @@ class MultimodalInputs:
         assert isinstance(ret.mm_items, list)
         ret.mm_items = [item for item in ret.mm_items if item.is_valid()]
-
         for item in ret.mm_items:
             item.set_pad_value()
@@ -1278,11 +1267,9 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
             if mm_input is None:
                 continue
             for mm_item in mm_input.mm_items:
-                pixel_values = getattr(mm_item, "pixel_values", None)
+                pixel_values = getattr(mm_item, "feature", None)
                 if isinstance(pixel_values, torch.Tensor):
-                    mm_item.pixel_values = pixel_values.to(
-                        self.device, non_blocking=True
-                    )
+                    mm_item.feature = pixel_values.to(self.device, non_blocking=True)
         self.multimodal_inputs = multimodal_inputs
         self.token_type_ids = token_type_ids_tensor
         self.seq_lens_sum = sum(seq_lens)
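Reviewer note: after these hunks, is_audio, is_image, and is_video all answer "does this item carry data?" through the same two-field test. A toy sketch (is_empty_list here is a local stand-in for MultimodalDataItem.is_empty_list; its exact semantics are assumed, not shown in this diff):

    def is_empty_list(x) -> bool:
        # assumed semantics: only an empty list counts as "no data"
        return isinstance(x, list) and len(x) == 0

    def has_data(precomputed_features, feature) -> bool:
        # the shared condition used by is_image / is_video / is_audio above
        return precomputed_features is not None or not is_empty_list(feature)

    assert has_data(None, [0.1, 0.2])  # raw feature collected by a processor
    assert not has_data(None, [])      # nothing collected for this item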
""" # Process images one by one to handle flatten_batch=True constraint in vision_tower - all_pixel_values = flatten_nested_list([item.pixel_values for item in items]) + all_pixel_values = flatten_nested_list([item.feature for item in items]) vision_outputs_list = [] for pixel_values_batch in all_pixel_values: diff --git a/python/sglang/srt/models/gemma3n_mm.py b/python/sglang/srt/models/gemma3n_mm.py index 3bc327ea3..5139a9c2d 100644 --- a/python/sglang/srt/models/gemma3n_mm.py +++ b/python/sglang/srt/models/gemma3n_mm.py @@ -265,7 +265,7 @@ class Gemma3nForConditionalGeneration(PreTrainedModel): image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`). """ # Process images one by one to handle flatten_batch=True constraint in vision_tower - all_pixel_values = flatten_nested_list([item.pixel_values for item in items]) + all_pixel_values = flatten_nested_list([item.feature for item in items]) vision_outputs_list = [] for pixel_values_batch in all_pixel_values: @@ -316,9 +316,7 @@ class Gemma3nForConditionalGeneration(PreTrainedModel): audio_features (`torch.Tensor`): Audio feature tensor of shape `(num_audios, audio_length, embed_dim)`). """ # Extract audio features and masks from items - all_input_features = flatten_nested_list( - [item.input_features for item in items] - ) + all_input_features = flatten_nested_list([item.feature for item in items]) all_input_features_mask = flatten_nested_list( [~item.input_features_mask for item in items] ) # Note(Xinyuan): reverse the mask according to the HF implementation diff --git a/python/sglang/srt/models/internvl.py b/python/sglang/srt/models/internvl.py index 732752317..056797cbf 100644 --- a/python/sglang/srt/models/internvl.py +++ b/python/sglang/srt/models/internvl.py @@ -510,7 +510,7 @@ class InternVLChatModel(nn.Module): Returns: image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`). 
""" - pixel_values = torch.cat([item.pixel_values for item in items]) + pixel_values = torch.cat([item.feature for item in items]) image_features = self.extract_feature(pixel_values) return image_features diff --git a/python/sglang/srt/models/kimi_vl.py b/python/sglang/srt/models/kimi_vl.py index f4386a808..68ed47b2e 100644 --- a/python/sglang/srt/models/kimi_vl.py +++ b/python/sglang/srt/models/kimi_vl.py @@ -144,7 +144,7 @@ class KimiVLForConditionalGeneration(nn.Module): def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor: pixel_values = ( - torch.cat([item.pixel_values for item in items], dim=0) + torch.cat([item.feature for item in items], dim=0) .type(self.vision_tower.dtype) .to(self.vision_tower.device) ) diff --git a/python/sglang/srt/models/llava.py b/python/sglang/srt/models/llava.py index b0b82a82b..6375657e7 100644 --- a/python/sglang/srt/models/llava.py +++ b/python/sglang/srt/models/llava.py @@ -186,7 +186,7 @@ class LlavaBaseForCausalLM(nn.Module): bs = forward_batch.batch_size pixel_values = flatten_nested_list( [ - [item.pixel_values for item in image_inputs[i].mm_items] + [item.feature for item in image_inputs[i].mm_items] for i in range(bs) if need_vision[i] ] @@ -753,7 +753,7 @@ class LlavaForConditionalGeneration(LlavaBaseForCausalLM): features = [] for item in items: # in each item, we assume pixel_values is always batched - pixel_values, image_sizes = item.pixel_values, item.image_sizes + pixel_values, image_sizes = item.feature, item.image_sizes image_outputs = self.vision_tower( pixel_values, image_sizes, output_hidden_states=True ) diff --git a/python/sglang/srt/models/llavavid.py b/python/sglang/srt/models/llavavid.py index 22a007e12..e5d6aa72b 100644 --- a/python/sglang/srt/models/llavavid.py +++ b/python/sglang/srt/models/llavavid.py @@ -135,7 +135,7 @@ class LlavaVidForCausalLM(nn.Module): if need_vision.any(): pixel_values = flatten_nested_list( [ - [item.pixel_values for item in image_inputs[i].mm_items] + [item.feature for item in image_inputs[i].mm_items] for i in range(bs) if need_vision[i] ] diff --git a/python/sglang/srt/models/minicpmo.py b/python/sglang/srt/models/minicpmo.py index a5234772e..786738eca 100644 --- a/python/sglang/srt/models/minicpmo.py +++ b/python/sglang/srt/models/minicpmo.py @@ -1552,9 +1552,7 @@ class MiniCPMO(MiniCPMBaseModel): Returns: List[List[torch.Tensor]]: audio embeddings """ - wavforms = flatten_nested_list( - [item.audio_features for item in items if item.audio_features] - ) + wavforms = flatten_nested_list([item.feature for item in items if item.feature]) # list, [[x1, x2], [y1], [z1]] audio_feature_lens_raw = flatten_nested_list( [item.audio_feature_lens for item in items if item.audio_feature_lens] @@ -1659,9 +1657,7 @@ class MiniCPMO(MiniCPMBaseModel): List[List[torch.Tensor]]: audio embeddings """ # (bs, 80, frames) or [], multi audios need filled in advance - wavforms = flatten_nested_list( - [item.audio_features for item in items if item.audio_features] - ) + wavforms = flatten_nested_list([item.feature for item in items if item.feature]) # list, [[x1, x2], [y1], [z1]] audio_feature_lens_raw = flatten_nested_list( [item.audio_feature_lens for item in items if item.audio_feature_lens] @@ -1778,7 +1774,7 @@ class MiniCPMO(MiniCPMBaseModel): def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor: # list of tensors - pixel_values = flatten_nested_list([item.pixel_values for item in items]) + pixel_values = flatten_nested_list([item.feature for item in items]) tgt_sizes = 
diff --git a/python/sglang/srt/models/minicpmv.py b/python/sglang/srt/models/minicpmv.py
index 0c6d4297f..8166d1646 100644
--- a/python/sglang/srt/models/minicpmv.py
+++ b/python/sglang/srt/models/minicpmv.py
@@ -724,7 +724,7 @@ class MiniCPMV2_6(MiniCPMBaseModel):

     def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
         # list of tensors
-        pixel_values = flatten_nested_list([item.pixel_values for item in items])
+        pixel_values = flatten_nested_list([item.feature for item in items])
         tgt_sizes = torch.stack(
             flatten_nested_list([item.tgt_size for item in items]), dim=0
         )
diff --git a/python/sglang/srt/models/mistral.py b/python/sglang/srt/models/mistral.py
index d3d2efcae..632e857c2 100644
--- a/python/sglang/srt/models/mistral.py
+++ b/python/sglang/srt/models/mistral.py
@@ -56,7 +56,7 @@ class Mistral3ForConditionalGeneration:
         features = []
         for item in items:
             # in each item, we assume pixel_values is always batched
-            pixel_values, image_sizes = item.pixel_values, item.image_sizes
+            pixel_values, image_sizes = item.feature, item.image_sizes
             image_outputs = self.vision_tower(
                 pixel_values, image_sizes, output_hidden_states=True
             )
diff --git a/python/sglang/srt/models/mllama.py b/python/sglang/srt/models/mllama.py
index fed9e4b59..fa294ddcd 100644
--- a/python/sglang/srt/models/mllama.py
+++ b/python/sglang/srt/models/mllama.py
@@ -838,9 +838,7 @@ class MllamaForConditionalGeneration(nn.Module):
         self.logits_processor = LogitsProcessor(config.text_config)

     def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
-        pixel_values = torch.cat(
-            [item.pixel_values for item in mm_inputs.mm_items], dim=0
-        )
+        pixel_values = torch.cat([item.feature for item in mm_inputs.mm_items], dim=0)
         pad_values = [item.pad_value for item in mm_inputs.mm_items]

         num_concurrent_media, num_tiles = pixel_values.shape[1:3]
@@ -862,7 +860,7 @@ class MllamaForConditionalGeneration(nn.Module):

             if not forward_batch.encoder_cached[i] and mm_input is not None:
                 pixel_values = torch.cat(
-                    [item.pixel_values for item in mm_input.mm_items], dim=0
+                    [item.feature for item in mm_input.mm_items], dim=0
                 )
                 max_num_images = max(max_num_images, pixel_values.shape[1])
@@ -897,7 +895,7 @@ class MllamaForConditionalGeneration(nn.Module):
                     encoder_lens_need.append(forward_batch.encoder_lens[k])

                     pixel_values = torch.cat(
-                        [item.pixel_values for item in mm_input.mm_items], dim=0
+                        [item.feature for item in mm_input.mm_items], dim=0
                     )
                     for j in range(pixel_values.shape[1]):
                         img = pixel_values[0, j]
diff --git a/python/sglang/srt/models/mllama4.py b/python/sglang/srt/models/mllama4.py
index 55e793247..18b7e57e5 100644
--- a/python/sglang/srt/models/mllama4.py
+++ b/python/sglang/srt/models/mllama4.py
@@ -147,7 +147,7 @@ class Llama4ForConditionalGeneration(nn.Module):
             raise ValueError("Vision model not available for text-only checkpoint")

         pixel_values = (
-            torch.concat([item.pixel_values for item in items])
+            torch.concat([item.feature for item in items])
             .to(next(self.vision_model.parameters()).device)
             .type(next(self.vision_model.parameters()).dtype)
         )
diff --git a/python/sglang/srt/models/phi4mm.py b/python/sglang/srt/models/phi4mm.py
index 44bcad97a..8a74888ac 100644
--- a/python/sglang/srt/models/phi4mm.py
+++ b/python/sglang/srt/models/phi4mm.py
@@ -422,9 +422,7 @@ class Phi4MMForCausalLM(nn.Module):

     def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
         dtype = next(self.vision_encoder.parameters()).dtype
-        pixel_values = torch.cat([item.pixel_values for item in items], dim=0).type(
-            dtype
-        )
+        pixel_values = torch.cat([item.feature for item in items], dim=0).type(dtype)
         image_attention_mask = torch.cat([item.image_emb_mask for item in items], dim=0)
         image_sizes = torch.cat([item.image_sizes for item in items], dim=0)
         image_embeds = self.vision_encoder(
diff --git a/python/sglang/srt/models/qwen2_5_vl.py b/python/sglang/srt/models/qwen2_5_vl.py
index d4f412e49..d2a92217a 100644
--- a/python/sglang/srt/models/qwen2_5_vl.py
+++ b/python/sglang/srt/models/qwen2_5_vl.py
@@ -497,7 +497,7 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module):

     def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
         # in qwen-vl, last dim is the same
-        pixel_values = torch.cat([item.pixel_values for item in items], dim=0).type(
+        pixel_values = torch.cat([item.feature for item in items], dim=0).type(
             self.visual.dtype
         )
         image_grid_thw = torch.concat([item.image_grid_thw for item in items], dim=0)
@@ -508,9 +508,9 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module):

     def get_video_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
         # in qwen-vl, last dim is the same
-        pixel_values = torch.cat(
-            [getattr(item, "pixel_values_videos") for item in items], dim=0
-        ).type(self.visual.dtype)
+        pixel_values = torch.cat([item.feature for item in items], dim=0).type(
+            self.visual.dtype
+        )
         video_grid_thw = torch.concat([item.video_grid_thw for item in items], dim=0)
         assert pixel_values.dim() == 2, pixel_values.dim()
         assert video_grid_thw.dim() == 2, video_grid_thw.dim()
diff --git a/python/sglang/srt/models/qwen2_audio.py b/python/sglang/srt/models/qwen2_audio.py
index 53e087496..bc232f0be 100644
--- a/python/sglang/srt/models/qwen2_audio.py
+++ b/python/sglang/srt/models/qwen2_audio.py
@@ -118,7 +118,7 @@ class Qwen2AudioForConditionalGeneration(nn.Module):

     def get_audio_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
         # Extract audio features from input items
-        input_features = torch.cat([item.audio_features for item in items], dim=0).type(
+        input_features = torch.cat([item.feature for item in items], dim=0).type(
             self.audio_tower.dtype
         )
diff --git a/python/sglang/srt/models/qwen2_vl.py b/python/sglang/srt/models/qwen2_vl.py
index 59179752a..55f325813 100644
--- a/python/sglang/srt/models/qwen2_vl.py
+++ b/python/sglang/srt/models/qwen2_vl.py
@@ -484,7 +484,7 @@ class Qwen2VLForConditionalGeneration(nn.Module):

     def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
         # in qwen-vl, last dim is the same
-        pixel_values = torch.cat([item.pixel_values for item in items], dim=0).type(
+        pixel_values = torch.cat([item.feature for item in items], dim=0).type(
             self.visual.dtype
         )
         image_grid_thw = torch.concat([item.image_grid_thw for item in items], dim=0)
@@ -495,9 +495,9 @@ class Qwen2VLForConditionalGeneration(nn.Module):

     def get_video_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
         # in qwen-vl, last dim is the same
-        pixel_values = torch.cat(
-            [item.pixel_values_videos for item in items], dim=0
-        ).type(self.visual.dtype)
+        pixel_values = torch.cat([item.feature for item in items], dim=0).type(
+            self.visual.dtype
+        )
         video_grid_thw = torch.concat([item.video_grid_thw for item in items], dim=0)
         assert pixel_values.dim() == 2, pixel_values.dim()
         assert video_grid_thw.dim() == 2, video_grid_thw.dim()
diff --git a/python/sglang/srt/models/vila.py b/python/sglang/srt/models/vila.py
index 752217d67..2bb0b2d35 100644
--- a/python/sglang/srt/models/vila.py
+++ b/python/sglang/srt/models/vila.py
@@ -237,7 +237,7 @@ class VILAForConditionalGeneration(nn.Module):
         return cast(LogitsProcessorOutput, output)

     def get_image_feature(self, mm_input: List[MultimodalDataItem]) -> Tensor:
-        pixel_values = cast(Tensor, mm_input[0].pixel_values)
+        pixel_values = cast(Tensor, mm_input[0].feature)

         ##### BEGIN COPY modeling_vila.py #####
diff --git a/python/sglang/srt/multimodal/processors/base_processor.py b/python/sglang/srt/multimodal/processors/base_processor.py
index 91aaa1909..44e22885c 100644
--- a/python/sglang/srt/multimodal/processors/base_processor.py
+++ b/python/sglang/srt/multimodal/processors/base_processor.py
@@ -5,7 +5,6 @@ import multiprocessing as mp
 import os
 import re
 from abc import ABC, abstractmethod
-from functools import lru_cache
 from typing import Any, Dict, List, Optional, Tuple, Union

 import numpy as np
@@ -156,6 +155,10 @@ class BaseMultimodalProcessor(ABC):
             # "precomputed_features" - handled specially as it can be any modality
         }

+        # name of the feature field
+        # TODO: pass from processors
+        self.FEATURE_NAMES = ["pixel_values", "pixel_values_videos", "audio_features"]
+
     def process_mm_data(
         self, input_text, images=None, videos=None, audios=None, **kwargs
     ):
@@ -524,6 +527,9 @@ class BaseMultimodalProcessor(ABC):
             if modality not in items:
                 items[modality] = MultimodalDataItem(modality=modality)

+            if attr_name in self.FEATURE_NAMES:
+                attr_name = "feature"
+
             # Set attribute
             setattr(items[modality], attr_name, value)
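Reviewer note: the base-processor hunk above is what lets the per-model processors keep emitting HF-style keys while the item stores everything under one attribute. The remap reduces to a one-line rename, sketched here outside the class for clarity (FEATURE_NAMES copied from the diff; the surrounding collection loop is simplified away):

    FEATURE_NAMES = ["pixel_values", "pixel_values_videos", "audio_features"]

    def remap_attr(attr_name: str) -> str:
        # feature-carrying keys collapse onto the single `feature` attribute;
        # everything else (image_sizes, masks, offsets, ...) passes through
        return "feature" if attr_name in FEATURE_NAMES else attr_name

    assert remap_attr("pixel_values") == "feature"
    assert remap_attr("image_sizes") == "image_sizes"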
diff --git a/python/sglang/srt/multimodal/processors/clip.py b/python/sglang/srt/multimodal/processors/clip.py
index cda5edf89..a36269819 100644
--- a/python/sglang/srt/multimodal/processors/clip.py
+++ b/python/sglang/srt/multimodal/processors/clip.py
@@ -26,7 +26,7 @@ class ClipImageProcessor(BaseMultimodalProcessor):
         image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]
         image_inputs["mm_items"] = [
             MultimodalDataItem(
-                pixel_values=image_inputs["pixel_values"], modality=Modality.IMAGE
+                feature=image_inputs["pixel_values"], modality=Modality.IMAGE
             )
         ]
diff --git a/python/sglang/srt/multimodal/processors/deepseek_vl_v2.py b/python/sglang/srt/multimodal/processors/deepseek_vl_v2.py
index 0ffd91dc3..50547ad2d 100644
--- a/python/sglang/srt/multimodal/processors/deepseek_vl_v2.py
+++ b/python/sglang/srt/multimodal/processors/deepseek_vl_v2.py
@@ -68,7 +68,7 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
             input_ids=input_ids, mm_token_id=self._processor.image_token_id
         )
         item = MultimodalDataItem(
-            pixel_values=res["images"],
+            feature=res["images"],
             offsets=image_offsets,
             modality=Modality.IMAGE,
             image_emb_mask=images_seq_mask,
diff --git a/python/sglang/srt/multimodal/processors/internvl.py b/python/sglang/srt/multimodal/processors/internvl.py
index df9b67aad..f9ed9ba76 100644
--- a/python/sglang/srt/multimodal/processors/internvl.py
+++ b/python/sglang/srt/multimodal/processors/internvl.py
@@ -223,7 +223,7 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
         )
         items = [
             MultimodalDataItem(
-                pixel_values=pixel_values,
+                feature=pixel_values,
                 modality=Modality.IMAGE,
                 offsets=image_offsets,
             )
diff --git a/python/sglang/srt/multimodal/processors/janus_pro.py b/python/sglang/srt/multimodal/processors/janus_pro.py
index 36be9ded8..8ea013d29 100644
--- a/python/sglang/srt/multimodal/processors/janus_pro.py
+++ b/python/sglang/srt/multimodal/processors/janus_pro.py
@@ -47,7 +47,7 @@ class JanusProImageProcessor(BaseMultimodalProcessor):
         return {
             "mm_items": [
                 MultimodalDataItem(
-                    pixel_values=res["pixel_values"],
+                    feature=res["pixel_values"],
                     image_emb_mask=res["images_emb_mask"],
                     offsets=image_offsets,
                     modality=Modality.IMAGE,
diff --git a/python/sglang/srt/multimodal/processors/llava.py b/python/sglang/srt/multimodal/processors/llava.py
index d32398d85..03c4bf5ec 100644
--- a/python/sglang/srt/multimodal/processors/llava.py
+++ b/python/sglang/srt/multimodal/processors/llava.py
@@ -158,7 +158,7 @@ class LlavaImageProcessor(BaseMultimodalProcessor):
         return {
             "mm_items": [
                 MultimodalDataItem(
-                    pixel_values=pixel_values,
+                    feature=pixel_values,
                     image_sizes=image_sizes,
                     modality=modality,
                 )
diff --git a/python/sglang/srt/multimodal/processors/minicpm.py b/python/sglang/srt/multimodal/processors/minicpm.py
index 7945f20b5..369971ccb 100644
--- a/python/sglang/srt/multimodal/processors/minicpm.py
+++ b/python/sglang/srt/multimodal/processors/minicpm.py
@@ -114,7 +114,7 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):

         if len(pixel_values) != 0:
             item = MultimodalDataItem(
-                pixel_values=pixel_values,
+                feature=pixel_values,
                 offsets=image_offsets,
                 tgt_size=tgt_sizes_flat,
                 modality=Modality.IMAGE,
@@ -135,7 +135,7 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
         else:
             audio_offsets = None
         item = MultimodalDataItem(
-            audio_features=[res["audio_features"]],
+            feature=[res["audio_features"]],
             audio_feature_lens=res["audio_feature_lens"],
             offsets=audio_offsets,
             modality=Modality.AUDIO,
diff --git a/python/sglang/srt/multimodal/processors/mlama.py b/python/sglang/srt/multimodal/processors/mlama.py
index aeb227be2..783145027 100644
--- a/python/sglang/srt/multimodal/processors/mlama.py
+++ b/python/sglang/srt/multimodal/processors/mlama.py
@@ -24,7 +24,7 @@ class MllamaImageProcessor(BaseMultimodalProcessor):
         image_inputs["input_ids"] = image_inputs["input_ids"].tolist()[0]
         image_inputs["mm_items"] = [
             MultimodalDataItem(
-                pixel_values=image_inputs["pixel_values"],
+                feature=image_inputs["pixel_values"],
                 aspect_ratio_id=image_inputs["aspect_ratio_ids"],
                 aspect_ratio_mask=image_inputs["aspect_ratio_mask"],
                 modality=Modality.IMAGE,
diff --git a/python/sglang/srt/multimodal/processors/mllama4.py b/python/sglang/srt/multimodal/processors/mllama4.py
index a7988c355..ccf70adc8 100644
--- a/python/sglang/srt/multimodal/processors/mllama4.py
+++ b/python/sglang/srt/multimodal/processors/mllama4.py
@@ -142,7 +142,7 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
         # Add metadata for image processing
         processor_output["mm_items"] = [
             MultimodalDataItem(
-                pixel_values=processor_output["pixel_values"],
+                feature=processor_output["pixel_values"],
                 modality=Modality.IMAGE,
                 offsets=image_offsets,
             )
diff --git a/python/sglang/srt/multimodal/processors/phi4mm.py b/python/sglang/srt/multimodal/processors/phi4mm.py
index fbf2cccb5..d2e009d27 100644
--- a/python/sglang/srt/multimodal/processors/phi4mm.py
+++ b/python/sglang/srt/multimodal/processors/phi4mm.py
@@ -62,7 +62,7 @@ class Phi4MMImageProcessor(BaseMultimodalProcessor):

         items = [
             MultimodalDataItem(
-                pixel_values=res["input_image_embeds"],
+                feature=res["input_image_embeds"],
                 image_sizes=res["image_sizes"],
                 image_emb_mask=res["image_attention_mask"],
                 offsets=image_offsets,
diff --git a/python/sglang/srt/multimodal/processors/pixtral.py b/python/sglang/srt/multimodal/processors/pixtral.py
index 9be08cdcc..8b741d627 100644
--- a/python/sglang/srt/multimodal/processors/pixtral.py
+++ b/python/sglang/srt/multimodal/processors/pixtral.py
@@ -103,7 +103,7 @@ class PixtralProcessor(BaseMultimodalProcessor):
         )
         mm_items = [
             MultimodalDataItem(
-                pixel_values=processor_output["pixel_values"],
+                feature=processor_output["pixel_values"],
                 image_sizes=processor_output["image_sizes"],
                 modality=Modality.IMAGE,
                 offsets=image_offsets,
diff --git a/test/srt/test_vlm_accuracy.py b/test/srt/test_vlm_accuracy.py
index ea83f3eef..2f2e294fa 100644
--- a/test/srt/test_vlm_accuracy.py
+++ b/test/srt/test_vlm_accuracy.py
@@ -245,7 +245,7 @@ class TestMiniCPMVLogits(VisionLLMLogitsBase):
             MultimodalInputs(
                 mm_items=[
                     MultimodalDataItem(
-                        pixel_values=pixel_values_flat,
+                        feature=pixel_values_flat,
                        offsets=image_offsets,
                         tgt_size=tgt_sizes_flat,
                         modality=Modality.IMAGE,
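Reviewer note: after this patch, call sites construct items by passing the processor output as feature, as the test above now does. A minimal usage sketch (module path inferred from this diff, since MultimodalDataItem is defined in schedule_batch.py; the tensor shape is illustrative only):

    import torch

    from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem

    item = MultimodalDataItem(
        feature=torch.randn(1, 3, 336, 336),  # previously pixel_values=...
        modality=Modality.IMAGE,
    )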