# coding=utf-8
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Qwen2.5-VL model configuration, processor, and image processor."""

from typing import Dict, Iterable, List, Optional, Union

import numpy as np
from transformers import (
    AutoImageProcessor,
    AutoProcessor,
    BaseImageProcessor,
    BatchFeature,
    PretrainedConfig,
    ProcessorMixin,
    TensorType,
)
from transformers.image_transforms import (
    convert_to_rgb,
    normalize,
    rescale,
    resize,
    to_channel_dimension_format,
)
from transformers.image_utils import (
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    VideoInput,
    get_image_size,
    infer_channel_dimension_format,
    is_pil_image,
    is_valid_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
    validate_preprocess_arguments,
)
from transformers.modeling_rope_utils import rope_config_validation
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
from transformers.processing_utils import ProcessingKwargs, Unpack, VideosKwargs
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD


def is_valid_list_of_images(images: List):
    return images and all(is_valid_image(image) for image in images)


class Qwen2_5_VLVisionConfig(PretrainedConfig):
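    r"""
    Configuration for the Qwen2.5-VL vision encoder, i.e. the `vision_config` sub-config of [`Qwen2_5_VLConfig`].
    The defaults describe a ViT-style encoder with windowed attention: the blocks listed in `fullatt_block_indexes`
    use full attention, while all remaining blocks attend within `window_size`-sized windows.
    """
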
    model_type = "qwen2_5_vl"
    base_config_key = "vision_config"

    def __init__(
        self,
        depth=32,
        hidden_size=3584,
        hidden_act="silu",
        intermediate_size=3420,
        num_heads=16,
        in_channels=3,
        patch_size=14,
        spatial_merge_size=2,
        temporal_patch_size=2,
        tokens_per_second=4,
        window_size=112,
        out_hidden_size=3584,
        fullatt_block_indexes=[7, 15, 23, 31],
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.depth = depth
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.tokens_per_second = tokens_per_second
        self.window_size = window_size
        self.fullatt_block_indexes = fullatt_block_indexes
        self.out_hidden_size = out_hidden_size


class Qwen2_5_VLConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Qwen2_5_VLModel`]. It is used to instantiate a
    Qwen2.5-VL model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of
    [Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 152064):
            Vocabulary size of the Qwen2_5_VL model. Defines the number of different tokens that can be represented by
            the `inputs_ids` passed when calling [`Qwen2_5_VLModel`].
        hidden_size (`int`, *optional*, defaults to 8192):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 29568):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 80):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 64):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if
            `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be
            constructed by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://arxiv.org/pdf/2305.13245.pdf).
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        use_sliding_window (`bool`, *optional*, defaults to `False`):
            Whether to use sliding window attention.
        sliding_window (`int`, *optional*, defaults to 4096):
            Sliding window attention (SWA) window size. If not specified, will default to `4096`.
        max_window_layers (`int`, *optional*, defaults to 80):
            The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top
            layers use full attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        vision_config (`Dict`, *optional*):
            The config for the visual encoder initialization.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope
            type and you expect the model to work on a longer `max_position_embeddings`, we recommend you update this
            value accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention computation. If
                    unspecified, it defaults to the value recommended by the implementation, using the `factor` field
                    to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear ramp
                    function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear ramp
                    function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2.
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2.
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE.
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE.

    ```python
    >>> from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLConfig

    >>> # Initializing a Qwen2_5_VL style configuration
    >>> configuration = Qwen2_5_VLConfig()

    >>> # Initializing a model from the Qwen2-VL-7B style configuration
    >>> model = Qwen2_5_VLForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
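
    >>> # A hedged extra sketch: multimodal RoPE ("mrope") scaling. The "type" key is
    >>> # remapped to `rope_type` for backward compatibility in `__init__`, and the
    >>> # section split below is illustrative rather than a released checkpoint's setting.
    >>> configuration = Qwen2_5_VLConfig(rope_scaling={"type": "mrope", "mrope_section": [16, 24, 24]})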
    ```"""
    model_type = "qwen2_5_vl"
    sub_configs = {"vision_config": Qwen2_5_VLVisionConfig}
    keys_to_ignore_at_inference = ["past_key_values"]

    # Default tensor parallel plan for base model `Qwen2_5_VL`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }

    def __init__(
        self,
        vocab_size=152064,
        hidden_size=8192,
        intermediate_size=29568,
        num_hidden_layers=80,
        num_attention_heads=64,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-05,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=1000000.0,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=80,
        attention_dropout=0.0,
        vision_config=None,
        rope_scaling=None,
        **kwargs,
    ):
        if isinstance(vision_config, dict):
            self.vision_config = self.sub_configs["vision_config"](**vision_config)
        elif vision_config is None:
            self.vision_config = self.sub_configs["vision_config"]()

        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window
        self.max_window_layers = max_window_layers

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout
        self.rope_scaling = rope_scaling

        # Validate the correctness of rotary position embeddings parameters.
        # BC: if there is a 'type' field, copy it to 'rope_type', and map 'mrope' to
        # 'default' because `mrope` performs the default RoPE calculations; one can
        # set it to "linear"/"dynamic" etc. to get scaled RoPE instead.
        # TODO: @raushan update config in the hub
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            if self.rope_scaling["type"] == "mrope":
                self.rope_scaling["type"] = "default"
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self, ignore_keys={"mrope_section"})

        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)


# FIXME: workaround for an obsolete transformers version
class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False):
    fps: Union[List[float], float]


class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False):
    videos_kwargs: Qwen2_5_VLVideosProcessorKwargs
    _defaults = {
        "text_kwargs": {
            "padding": False,
        },
        "videos_kwargs": {"fps": 2.0},
    }


class Qwen2_5_VLProcessor(ProcessorMixin):
    r"""
    Constructs a Qwen2.5-VL processor which wraps a Qwen2.5-VL image processor and a Qwen2 tokenizer into a single
    processor.

    [`Qwen2_5_VLProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`].
    See the [`~Qwen2_5_VLProcessor.__call__`] and [`~Qwen2_5_VLProcessor.decode`] for more information.

    Args:
        image_processor ([`Qwen2VLImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`Qwen2TokenizerFast`], *optional*):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages in a chat
            into a tokenizable string.
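
    A minimal usage sketch (the checkpoint name, image file, and prompt below are illustrative):

    ```python
    >>> from PIL import Image
    >>> processor = Qwen2_5_VLProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
    >>> image = Image.open("example.jpg")
    >>> inputs = processor(text=["Describe this image: <|image_pad|>"], images=[image], return_tensors="pt")
    >>> # Per-call video kwargs can be passed the same way, e.g. the sampling rate of the frames:
    >>> # inputs = processor(text=["<|video_pad|>"], videos=[frames], videos_kwargs={"fps": 1.0})
    ```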
"""
    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = ["chat_template"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")

    def __init__(
        self, image_processor=None, tokenizer=None, chat_template=None, **kwargs
    ):
        self.image_token = (
            "<|image_pad|>"
            if not hasattr(tokenizer, "image_token")
            else tokenizer.image_token
        )
        self.video_token = (
            "<|video_pad|>"
            if not hasattr(tokenizer, "video_token")
            else tokenizer.video_token
        )
        super().__init__(image_processor, tokenizer, chat_template=chat_template)

    def __call__(
        self,
        images: ImageInput = None,
        text: Union[
            TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
        ] = None,
        videos: VideoInput = None,
        **kwargs: Unpack[Qwen2_5_VLProcessorKwargs],
    ) -> BatchFeature:
        """
        Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
        and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to
        encode the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments
        to Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`str`, `List[str]`, `List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as a list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch tensor, or
                a nested list of 3D frames. Both channels-first and channels-last formats are supported.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is
              not `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not
              `None`.
            - **image_grid_thw** -- List of image 3D grids in LLM. Returned when `images` is not `None`.
            - **video_grid_thw** -- List of video 3D grids in LLM. Returned when `videos` is not `None`.
            - **second_per_grid_ts** -- List of video seconds per time grid. Returned when `videos` is not `None`.
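
            As a worked illustration of the placeholder expansion performed below: with `merge_size=2`, an image
            whose grid is `(t, h, w) = (1, 16, 16)` contributes `1 * 16 * 16 // 2**2 = 64` image tokens, so a single
            `<|image_pad|>` in `text` is expanded to 64 copies of the image token before tokenization.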
"""
        output_kwargs = self._merge_kwargs(
            Qwen2_5_VLProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
        if images is not None:
            image_inputs = self.image_processor(
                images=images, videos=None, **output_kwargs["images_kwargs"]
            )
            image_grid_thw = image_inputs["image_grid_thw"]
        else:
            image_inputs = {}
            image_grid_thw = None

        if videos is not None:
            videos_inputs = self.image_processor(
                images=None, videos=videos, **output_kwargs["images_kwargs"]
            )
            video_grid_thw = videos_inputs["video_grid_thw"]

            # Each temporal grid step covers `temporal_patch_size` frames, so at a given
            # sampling rate it spans `temporal_patch_size / fps` seconds of video.
            fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)
            if isinstance(fps, (int, float)):
                second_per_grid_ts = [
                    self.image_processor.temporal_patch_size / fps
                ] * len(video_grid_thw)
            elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw):
                second_per_grid_ts = [
                    self.image_processor.temporal_patch_size / tmp for tmp in fps
                ]
            else:
                raise ValueError(
                    f"The length of fps ({len(fps) if hasattr(fps, '__len__') else fps}) must be equal to the length "
                    f"of video_grid_thw ({len(video_grid_thw)}) or fps should be a single number."
                )
            videos_inputs.update({"second_per_grid_ts": second_per_grid_ts})
        else:
            videos_inputs = {}
            video_grid_thw = None

        if not isinstance(text, list):
            text = [text]

        # Expand each vision placeholder into one token per merged patch so the text
        # length matches the number of vision embeddings fed to the LLM.
        if image_grid_thw is not None:
            merge_length = self.image_processor.merge_size**2
            index = 0
            for i in range(len(text)):
                while self.image_token in text[i]:
                    text[i] = text[i].replace(
                        self.image_token,
                        "<|placeholder|>"
                        * (image_grid_thw[index].prod() // merge_length),
                        1,
                    )
                    index += 1
                text[i] = text[i].replace("<|placeholder|>", self.image_token)

        if video_grid_thw is not None:
            merge_length = self.image_processor.merge_size**2
            index = 0
            for i in range(len(text)):
                while self.video_token in text[i]:
                    text[i] = text[i].replace(
                        self.video_token,
                        "<|placeholder|>"
                        * (video_grid_thw[index].prod() // merge_length),
                        1,
                    )
                    index += 1
                text[i] = text[i].replace("<|placeholder|>", self.video_token)

        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])

        return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer
        to the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    def post_process_image_text_to_text(self, generated_outputs):
        """
        Post-process the output of the model to decode the text.

        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape
                `(batch_size, sequence_length)` or `(sequence_length,)`.

        Returns:
            `List[str]`: The decoded text.
        """
        return self.tokenizer.batch_decode(
            generated_outputs,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        names_from_processor = list(
            dict.fromkeys(tokenizer_input_names + image_processor_input_names)
        )
        return names_from_processor + ["second_per_grid_ts"]


class Qwen2_5_VLImageProcessor(BaseImageProcessor):
    r"""
    Constructs a Qwen2.5-VL image processor that dynamically resizes images based on the original images.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use when resizing the image.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image.
        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
            Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
            Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in
            the image.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        min_pixels (`int`, *optional*, defaults to `56 * 56`):
            The minimum number of pixels the image is resized to.
        max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
            The maximum number of pixels the image is resized to.
        patch_size (`int`, *optional*, defaults to 14):
            The spatial patch size of the vision encoder.
        temporal_patch_size (`int`, *optional*, defaults to 2):
            The temporal patch size of the vision encoder.
        merge_size (`int`, *optional*, defaults to 2):
            The merge size of the vision encoder to llm encoder.
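
    A minimal sketch of tuning the pixel budget (the values below are illustrative, not checkpoint defaults):

    ```python
    >>> image_processor = Qwen2_5_VLImageProcessor(min_pixels=256 * 28 * 28, max_pixels=1280 * 28 * 28)
    >>> # smart_resize keeps the aspect ratio, keeps height * width inside this budget,
    >>> # and keeps both sides divisible by patch_size * merge_size = 28.
    ```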
"""
    model_input_names = [
        "pixel_values",
        "image_grid_thw",
        "pixel_values_videos",
        "video_grid_thw",
        "second_per_grid_ts",
    ]

    def __init__(
        self,
        do_resize: bool = True,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        min_pixels: int = 56 * 56,
        max_pixels: int = 28 * 28 * 1280,
        patch_size: int = 14,
        temporal_patch_size: int = 2,
        merge_size: int = 2,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.do_resize = do_resize
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.patch_size = patch_size
        self.temporal_patch_size = temporal_patch_size
        self.merge_size = merge_size
        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
        self.do_convert_rgb = do_convert_rgb

    def rescale(
        self,
        image: np.ndarray,
        scale: float,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Rescale an image by a scale factor: image = image * scale.

        Args:
            image (`np.ndarray`):
                Image to rescale.
            scale (`float`):
                The scaling factor to rescale pixel values by.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the
                input image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.

        Returns:
            `np.ndarray`: The rescaled image.
        """
        return rescale(
            image,
            scale=scale,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )

    def normalize(
        self,
        image: np.ndarray,
        mean: Union[float, Iterable[float]],
        std: Union[float, Iterable[float]],
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Normalize an image: image = (image - image_mean) / image_std.

        Args:
            image (`np.ndarray`):
                Image to normalize.
            mean (`float` or `Iterable[float]`):
                Image mean to use for normalization.
            std (`float` or `Iterable[float]`):
                Image standard deviation to use for normalization.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the
                input image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.

        Returns:
            `np.ndarray`: The normalized image.
        """
        return normalize(
            image,
            mean=mean,
            std=std,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )

    def _preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        do_resize: bool = None,
        resample: PILImageResampling = None,
        do_rescale: bool = None,
        rescale_factor: float = None,
        do_normalize: bool = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """
        Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.

        Args:
            images (`ImageInput`):
                Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values
                range from 0 to 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Scale factor to use if rescaling the image.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number
                of channels in the image.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding
                to the number of channels in the image.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
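
        As a worked illustration of the patching performed below: a single 224x224 RGB image is temporally padded to
        `temporal_patch_size = 2` frames, giving `grid_t = 1` and `grid_h = grid_w = 224 // 14 = 16`, i.e.
        `1 * 16 * 16 = 256` patches, each flattened to `3 * 2 * 14 * 14 = 1176` values.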
"""
        images = make_list_of_images(images)

        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        height, width = get_image_size(images[0], channel_dim=input_data_format)
        resized_height, resized_width = height, width
        processed_images = []
        for image in images:
            if do_resize:
                # Pick a size whose area lies in [min_pixels, max_pixels] and whose sides
                # are divisible by patch_size * merge_size, so patches merge evenly.
                resized_height, resized_width = smart_resize(
                    height,
                    width,
                    factor=self.patch_size * self.merge_size,
                    min_pixels=self.min_pixels,
                    max_pixels=self.max_pixels,
                )
                image = resize(
                    image,
                    size=(resized_height, resized_width),
                    resample=resample,
                    input_data_format=input_data_format,
                )

            if do_rescale:
                image = self.rescale(
                    image, scale=rescale_factor, input_data_format=input_data_format
                )

            if do_normalize:
                image = self.normalize(
                    image=image,
                    mean=image_mean,
                    std=image_std,
                    input_data_format=input_data_format,
                )

            image = to_channel_dimension_format(
                image, data_format, input_channel_dim=input_data_format
            )
            processed_images.append(image)

        patches = np.array(processed_images)
        if data_format == ChannelDimension.LAST:
            patches = patches.transpose(0, 3, 1, 2)
        # Pad the temporal axis by repeating the last frame so the frame count
        # divides `temporal_patch_size`.
        if patches.shape[0] % self.temporal_patch_size != 0:
            repeats = np.repeat(
                patches[-1][np.newaxis], self.temporal_patch_size - 1, axis=0
            )
            patches = np.concatenate([patches, repeats], axis=0)

        channel = patches.shape[1]
        grid_t = patches.shape[0] // self.temporal_patch_size
        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )

        # Split into (temporal, channel, spatial) patch axes, grouping merge_size x merge_size
        # neighborhoods so spatially adjacent patches stay adjacent after flattening.
        patches = patches.reshape(
            grid_t,
            self.temporal_patch_size,
            channel,
            grid_h // self.merge_size,
            self.merge_size,
            self.patch_size,
            grid_w // self.merge_size,
            self.merge_size,
            self.patch_size,
        )
        patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
        flatten_patches = patches.reshape(
            grid_t * grid_h * grid_w,
            channel * self.temporal_patch_size * self.patch_size * self.patch_size,
        )

        return flatten_patches, (grid_t, grid_h, grid_w)

    def preprocess(
        self,
        images: ImageInput,
        videos: VideoInput = None,
        do_resize: bool = None,
        size: Dict[str, int] = None,
        resample: PILImageResampling = None,
        do_rescale: bool = None,
        rescale_factor: float = None,
        do_normalize: bool = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """
        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            videos (`VideoInput`):
                Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
                passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                Pixel budget used when resizing: `size["min_pixels"]` and `size["max_pixels"]` bound the total number
                of pixels of the resized image while the aspect ratio is preserved.
            resample (`int`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`.
                Only has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
                `True`.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
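
        A minimal usage sketch (the input is random; the shapes are what the defaults produce for a 224x224 image):

        ```python
        >>> import numpy as np
        >>> image = np.random.randint(0, 256, (224, 224, 3), dtype=np.uint8)
        >>> out = Qwen2_5_VLImageProcessor().preprocess(images=image, return_tensors="np")
        >>> out["pixel_values"].shape, out["image_grid_thw"].tolist()
        ((256, 1176), [[1, 16, 16]])
        ```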
"""
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        resample = resample if resample is not None else self.resample
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = (
            rescale_factor if rescale_factor is not None else self.rescale_factor
        )
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = (
            do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
        )

        def make_flat_list_of_images(
            images: Union[List[ImageInput], ImageInput],
        ) -> ImageInput:
            """
            Ensure that the output is a flat list of images. If the input is a single image, it is converted to a
            list of length 1. If the input is a nested list of images, it is converted to a flat list of images.

            Args:
                images (`Union[List[ImageInput], ImageInput]`):
                    The input image.

            Returns:
                list: A list of images or a 4d array of images.
            """
            # If the input is a nested list of images, we flatten it
            if (
                isinstance(images, (list, tuple))
                and all(isinstance(images_i, (list, tuple)) for images_i in images)
                and all(is_valid_list_of_images(images_i) for images_i in images)
            ):
                return [img for img_list in images for img in img_list]

            if isinstance(images, (list, tuple)) and is_valid_list_of_images(images):
                if is_pil_image(images[0]) or images[0].ndim == 3:
                    return images
                if images[0].ndim == 4:
                    return [img for img_list in images for img in img_list]

            if is_valid_image(images):
                if is_pil_image(images) or images.ndim == 3:
                    return [images]
                if images.ndim == 4:
                    return list(images)

            raise ValueError(f"Could not make a flat list of images from {images}")

        def make_batched_videos(videos) -> VideoInput:
            """
            Ensure that the input is a list of videos.

            Args:
                videos (`VideoInput`):
                    Video or videos to turn into a list of videos.

            Returns:
                list: A list of videos.
            """
            if (
                isinstance(videos, (list, tuple))
                and isinstance(videos[0], (list, tuple))
                and is_valid_image(videos[0][0])
            ):
                # case 1: nested batch of videos so we flatten it
                if not is_pil_image(videos[0][0]) and videos[0][0].ndim == 4:
                    videos = [
                        [video for batch_list in batched_videos for video in batch_list]
                        for batched_videos in videos
                    ]
                # case 2: list of videos represented as list of video frames
                return videos

            elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
                if is_pil_image(videos[0]) or videos[0].ndim == 3:
                    return [videos]
                elif videos[0].ndim == 4:
                    return [list(video) for video in videos]

            elif is_valid_image(videos):
                if is_pil_image(videos) or videos.ndim == 3:
                    return [[videos]]
                elif videos.ndim == 4:
                    return [list(videos)]

            raise ValueError(f"Could not make batched video from {videos}")

        if images is not None:
            images = make_flat_list_of_images(images)
        if videos is not None:
            videos = make_batched_videos(videos)

        if images is not None and not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        validate_preprocess_arguments(
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_resize=do_resize,
            size=size,
            resample=resample,
        )

        # Accumulate into one dict so image and video outputs can coexist (and so the
        # return below is well-defined even when neither input is provided).
        data = {}
        if images is not None:
            pixel_values, vision_grid_thws = [], []
            for image in images:
                patches, image_grid_thw = self._preprocess(
                    image,
                    do_resize=do_resize,
                    resample=resample,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    data_format=data_format,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                )
                pixel_values.extend(patches)
                vision_grid_thws.append(image_grid_thw)
            pixel_values = np.array(pixel_values)
            vision_grid_thws = np.array(vision_grid_thws)
            data.update({"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws})

        if videos is not None:
            pixel_values, vision_grid_thws = [], []
            for images in videos:
                patches, video_grid_thw = self._preprocess(
                    images,
                    do_resize=do_resize,
                    resample=resample,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    data_format=data_format,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                )
                pixel_values.extend(patches)
                vision_grid_thws.append(video_grid_thw)
            pixel_values = np.array(pixel_values)
            vision_grid_thws = np.array(vision_grid_thws)
            data.update(
                {
                    "pixel_values_videos": pixel_values,
                    "video_grid_thw": vision_grid_thws,
                }
            )

        return BatchFeature(data=data, tensor_type=return_tensors)


# Register with the Auto classes so that AutoImageProcessor / AutoProcessor resolve
# Qwen2_5_VLConfig to the implementations defined in this file.
AutoImageProcessor.register(Qwen2_5_VLConfig, None, Qwen2_5_VLImageProcessor, None)
AutoProcessor.register(Qwen2_5_VLConfig, Qwen2_5_VLProcessor)