diff --git a/python/pyproject.toml b/python/pyproject.toml index 99511cf65..e39a4bdb8 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -35,7 +35,7 @@ runtime_common = [ "python-multipart", "pyzmq>=25.1.2", "torchao>=0.7.0", - "transformers==4.48.3", + "transformers==4.50.0", "uvicorn", "uvloop", "xgrammar==0.1.16", diff --git a/python/sglang/srt/configs/__init__.py b/python/sglang/srt/configs/__init__.py index 41d0cfcae..1e8370ba7 100644 --- a/python/sglang/srt/configs/__init__.py +++ b/python/sglang/srt/configs/__init__.py @@ -2,21 +2,12 @@ from sglang.srt.configs.chatglm import ChatGLMConfig from sglang.srt.configs.dbrx import DbrxConfig from sglang.srt.configs.deepseekvl2 import DeepseekVL2Config from sglang.srt.configs.exaone import ExaoneConfig -from sglang.srt.configs.gemma3 import Gemma3Config, Gemma3TextConfig from sglang.srt.configs.janus_pro import MultiModalityConfig -from sglang.srt.configs.qwen2_5_vl_config import ( - Qwen2_5_VLConfig, - Qwen2_5_VLVisionConfig, -) __all__ = [ "ExaoneConfig", "ChatGLMConfig", "DbrxConfig", "DeepseekVL2Config", - "Qwen2_5_VLConfig", - "Qwen2_5_VLVisionConfig", "MultiModalityConfig", - "Gemma3Config", - "Gemma3TextConfig", ] diff --git a/python/sglang/srt/configs/gemma3.py b/python/sglang/srt/configs/gemma3.py deleted file mode 100644 index b70089f5c..000000000 --- a/python/sglang/srt/configs/gemma3.py +++ /dev/null @@ -1,1086 +0,0 @@ -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -# This file was automatically generated from src/transformers/models/gemma3/modular_gemma3.py. -# Do NOT edit this file manually as any edits will be overwritten by the generation of -# the file from the modular. If any change should be done, please apply the change to the -# modular_gemma3.py file directly. One of our CI enforces this. -# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -# coding=utf-8 -# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved. -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import itertools -import logging -import math -import re -from typing import Dict, Iterable, List, Optional, Union - -import numpy as np -import PIL -import transformers -from torch import TensorType -from transformers import ( - AutoImageProcessor, - AutoProcessor, - BatchFeature, - PretrainedConfig, - SiglipVisionConfig, -) -from transformers.image_processing_utils import BaseImageProcessor, get_size_dict -from transformers.image_transforms import ( - convert_to_rgb, - resize, - to_channel_dimension_format, -) -from transformers.image_utils import ( - ChannelDimension, - ImageInput, - PILImageResampling, - get_image_size, - infer_channel_dimension_format, - is_pil_image, - is_scaled_image, - is_valid_image, - to_numpy_array, - valid_images, - validate_preprocess_arguments, -) -from transformers.modeling_rope_utils import rope_config_validation -from transformers.processing_utils import ( - ImagesKwargs, - ProcessingKwargs, - ProcessorMixin, - Unpack, -) -from transformers.tokenization_utils_base import PreTokenizedInput, TextInput -from transformers.utils import ( - IMAGENET_STANDARD_MEAN, - IMAGENET_STANDARD_STD, - filter_out_non_signature_kwargs, - to_py_obj, -) - -logger = logging.getLogger(__name__) - - -def is_valid_list_of_images(images: List): - return images and all(is_valid_image(image) for image in images) - - -# copied from transformer -def make_nested_list_of_images( - images: Union[List[ImageInput], ImageInput], -) -> ImageInput: - """ - Ensure that the output is a nested list of images. - Args: - images (`Union[List[ImageInput], ImageInput]`): - The input image. - Returns: - list: A list of list of images or a list of 4d array of images. - """ - # If it's a list of batches, it's already in the right format - if ( - isinstance(images, (list, tuple)) - and all(isinstance(images_i, (list, tuple)) for images_i in images) - and all(is_valid_list_of_images(images_i) for images_i in images) - ): - return images - - # If it's a list of images, it's a single batch, so convert it to a list of lists - if isinstance(images, (list, tuple)) and is_valid_list_of_images(images): - if is_pil_image(images[0]) or images[0].ndim == 3: - return [images] - if images[0].ndim == 4: - return [list(image) for image in images] - - # If it's a single image, convert it to a list of lists - if is_valid_image(images): - if is_pil_image(images) or images.ndim == 3: - return [[images]] - if images.ndim == 4: - return [list(images)] - - raise ValueError( - "Invalid input type. Must be a single image, a list of images, or a list of batches of images." - ) - - -def rescale( - image: np.ndarray, - scale: float, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, -) -> np.ndarray: - """ - Rescale an image by a scale factor. image = image * scale. - - Args: - image (`np.ndarray`): - Image to rescale. - scale (`float`): - The scaling factor to rescale pixel values by. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format for the output image. If unset, the channel dimension format of the input - image is used. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the input image. If unset, the channel dimension format is inferred - from the input image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - Returns: - `np.ndarray`: The rescaled image. - """ - return transformers.image_transforms.rescale( - image, - scale=scale, - data_format=data_format, - input_data_format=input_data_format, - **kwargs, - ) - - -def normalize( - image: np.ndarray, - mean: Union[float, Iterable[float]], - std: Union[float, Iterable[float]], - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, -) -> np.ndarray: - """ - Normalize an image. image = (image - image_mean) / image_std. - - Args: - image (`np.ndarray`): - Image to normalize. - mean (`float` or `Iterable[float]`): - Image mean to use for normalization. - std (`float` or `Iterable[float]`): - Image standard deviation to use for normalization. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format for the output image. If unset, the channel dimension format of the input - image is used. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the input image. If unset, the channel dimension format is inferred - from the input image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - Returns: - `np.ndarray`: The normalized image. - """ - return transformers.image_transforms.normalize( - image, - mean=mean, - std=std, - data_format=data_format, - input_data_format=input_data_format, - **kwargs, - ) - - -class Gemma3ImagesKwargs(ImagesKwargs): - do_pan_and_scan: Optional[bool] - pan_and_scan_min_crop_size: Optional[int] - pan_and_scan_max_num_crops: Optional[int] - pan_and_scan_min_ratio_to_activate: Optional[float] - do_convert_rgb: Optional[bool] - - -class Gemma3ProcessorKwargs(ProcessingKwargs, total=False): - images_kwargs: Gemma3ImagesKwargs - _defaults = { - "text_kwargs": { - "padding": False, - }, - "images_kwargs": { - "do_pan_and_scan": False, - "pan_and_scan_min_crop_size": 256, - "pan_and_scan_max_num_crops": 4, - "pan_and_scan_min_ratio_to_activate": 1.2, - }, - } - - -class Gemma3Processor(ProcessorMixin): - attributes = ["image_processor", "tokenizer"] - valid_kwargs = ["chat_template", "image_seq_length"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = "AutoTokenizer" - - def __init__( - self, - image_processor, - tokenizer, - chat_template=None, - image_seq_length: int = 256, - **kwargs, - ): - - self.image_seq_length = image_seq_length - self.image_token_id = tokenizer.image_token_id - self.boi_token = tokenizer.boi_token - image_tokens_expanded = "".join([tokenizer.image_token] * image_seq_length) - self.full_image_sequence = ( - f"\n\n{tokenizer.boi_token}{image_tokens_expanded}{tokenizer.eoi_token}\n\n" - ) - - super().__init__( - image_processor=image_processor, - tokenizer=tokenizer, - chat_template=chat_template, - **kwargs, - ) - - # TODO: if transformers is updated, the chat_template needs to be adjusted - self.tokenizer.add_bos_token = False - - def __call__( - self, - images: ImageInput = None, - text: Union[ - TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput] - ] = None, - videos=None, - audio=None, - **kwargs: Unpack[Gemma3ProcessorKwargs], - ) -> BatchFeature: - if text is None and images is None: - raise ValueError("Provide at least one of `text` or `images`.") - # print(f"processing, text:{text}") - output_kwargs = self._merge_kwargs( - Gemma3ProcessorKwargs, - tokenizer_init_kwargs=self.tokenizer.init_kwargs, - **kwargs, - ) - - if isinstance(text, str): - text = [text] - elif not isinstance(text, list) and not isinstance(text[0], str): - raise ValueError( - "Invalid input text. Please provide a string, or a list of strings" - ) - - image_inputs = {} - if images is not None: - batched_images = make_nested_list_of_images(images) - image_inputs = self.image_processor( - batched_images, **output_kwargs["images_kwargs"] - ) - - # Create empty text to be replaced with placeholders - if not text: - text = [ - " ".join([self.boi_token] * len(images)) - for images in batched_images - ] - - if len(batched_images) != len(text): - raise ValueError( - f"Received inconsistently sized batches of images ({len(batched_images)}) and text ({len(text)})." - ) - - # Replace image tokens by the full expanded sequence - batch_num_crops = to_py_obj(image_inputs.pop("num_crops")) - text_with_crops = text - - for batch_idx, (prompt, images, num_crops) in enumerate( - zip(text, batched_images, batch_num_crops) - ): - - image_indexes = [m.start() for m in re.finditer(self.boi_token, prompt)] - - if len(images) != len(image_indexes): - raise ValueError( - f"Prompt contained {len(image_indexes)} image tokens but received {len(images)} images." - ) - - # Insert additional image tokens for Pan-and-Scan crops - for num, idx in reversed(list(zip(num_crops, image_indexes))): - if num: - formatted_image_text = ( - f"Here is the original image {self.boi_token} and here are some crops to help you see better " - + " ".join([self.boi_token] * num) - ) - prompt = ( - prompt[:idx] - + formatted_image_text - + prompt[idx + len(self.boi_token) :] - ) - text_with_crops[batch_idx] = prompt - - # Expand placeholder image tokens to the full image token sequence - text = [ - prompt.replace(self.boi_token, self.full_image_sequence) - for prompt in text - ] - - return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) - text_inputs = self.tokenizer( - text=text, **output_kwargs["text_kwargs"], return_tensors="np" - ) - - # print(f"processing, text_inputs:{text_inputs}") - - # Add token type ids manually, as tokenizer can't do arbitrary position token types - array_ids = np.array(text_inputs["input_ids"]) - mm_token_type_ids = np.zeros_like(text_inputs["input_ids"]) - mm_token_type_ids[array_ids == self.image_token_id] = 1 - text_inputs = { - k: v.tolist() for k, v in text_inputs.items() - } # in case user requested list inputs - text_inputs["token_type_ids"] = mm_token_type_ids.tolist() - return BatchFeature( - data={**text_inputs, **image_inputs}, tensor_type=return_tensors - ) - - # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Gemma - def batch_decode(self, *args, **kwargs): - """ - This method forwards all its arguments to GemmaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please - refer to the docstring of this method for more information. - """ - return self.tokenizer.batch_decode(*args, **kwargs) - - # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Gemma - def decode(self, *args, **kwargs): - """ - This method forwards all its arguments to GemmaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to - the docstring of this method for more information. - """ - return self.tokenizer.decode(*args, **kwargs) - - @property - def model_input_names(self): - tokenizer_input_names = self.tokenizer.model_input_names + ["token_type_ids"] - image_processor_input_names = self.image_processor.model_input_names - return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) - - -class Gemma3ImageProcessor(BaseImageProcessor): - r""" - Constructs a SigLIP image processor. - - Args: - do_resize (`bool`, *optional*, defaults to `True`): - Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by - `do_resize` in the `preprocess` method. - size (`Dict[str, int]` *optional*, defaults to `{"height": 224, "width": 224}`): - Size of the image after resizing. Can be overridden by `size` in the `preprocess` method. - resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): - Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method. - do_rescale (`bool`, *optional*, defaults to `True`): - Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in - the `preprocess` method. - rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): - Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` - method. - do_normalize (`bool`, *optional*, defaults to `True`): - Whether to normalize the image by the specified mean and standard deviation. Can be overridden by - `do_normalize` in the `preprocess` method. - image_mean (`float` or `List[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`): - Mean to use if normalizing the image. This is a float or list of floats the length of the number of - channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. - image_std (`float` or `List[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`): - Standard deviation to use if normalizing the image. This is a float or list of floats the length of the - number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. - Can be overridden by the `image_std` parameter in the `preprocess` method. - do_convert_rgb (`bool`, *optional*, defaults to `True`): - Whether to convert the image to RGB. - do_pan_and_scan (`bool`, *optional*): - Whether to apply `pan_and_scan` to images. - pan_and_scan_min_crop_size (`int`, *optional*): - Minimum size of each crop in pan and scan. - pan_and_scan_max_num_crops (`int`, *optional*): - Maximum number of crops per image in pan and scan. - pan_and_scan_min_ratio_to_activate (`float`, *optional*): - Minimum aspect ratio to activate pan and scan. - """ - - model_input_names = ["pixel_values", "num_crops"] - - def __init__( - self, - do_resize: bool = True, - size: Dict[str, int] = None, - resample: PILImageResampling = PILImageResampling.BILINEAR, - do_rescale: bool = True, - rescale_factor: Union[int, float] = 1 / 255, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_convert_rgb: bool = None, - do_pan_and_scan: bool = None, - pan_and_scan_min_crop_size: int = None, - pan_and_scan_max_num_crops: int = None, - pan_and_scan_min_ratio_to_activate: float = None, - **kwargs, - ) -> None: - super().__init__(**kwargs) - size = size if size is not None else {"height": 224, "width": 224} - size = get_size_dict(size, default_to_square=True) - image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN - image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD - - self.do_resize = do_resize - self.size = size - self.resample = resample - self.do_rescale = do_rescale - self.rescale_factor = rescale_factor - self.do_normalize = do_normalize - self.image_mean = image_mean - self.image_std = image_std - self.do_convert_rgb = do_convert_rgb - self.do_pan_and_scan = do_pan_and_scan - self.pan_and_scan_min_crop_size = pan_and_scan_min_crop_size - self.pan_and_scan_max_num_crops = pan_and_scan_max_num_crops - self.pan_and_scan_min_ratio_to_activate = pan_and_scan_min_ratio_to_activate - - def pan_and_scan( - self, - image: np.ndarray, - pan_and_scan_min_crop_size: int, - pan_and_scan_max_num_crops: int, - pan_and_scan_min_ratio_to_activate: float, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - ): - """ - Pan and Scan and image, by cropping into smaller images when the aspect ratio exceeds - minumum allowed ratio. - - Args: - image (`np.ndarray`): - Image to resize. - pan_and_scan_min_crop_size (`int`, *optional*): - Minimum size of each crop in pan and scan. - pan_and_scan_max_num_crops (`int`, *optional*): - Maximum number of crops per image in pan and scan. - pan_and_scan_min_ratio_to_activate (`float`, *optional*): - Minimum aspect ratio to activate pan and scan. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format of the image. If not provided, it will be the same as the input image. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format of the input image. If not provided, it will be inferred. - """ - height, width = get_image_size(image) - - # Square or landscape image. - if width >= height: - # Only apply PaS if the image is sufficiently exaggerated - if width / height < pan_and_scan_min_ratio_to_activate: - return [] - - # Select ideal number of crops close to the image aspect ratio and such that crop_size > min_crop_size. - num_crops_w = int( - math.floor(width / height + 0.5) - ) # Half round up rounding. - num_crops_w = min( - int(math.floor(width / pan_and_scan_min_crop_size)), num_crops_w - ) - - # Make sure the number of crops is in range [2, pan_and_scan_max_num_crops]. - num_crops_w = max(2, num_crops_w) - num_crops_w = min(pan_and_scan_max_num_crops, num_crops_w) - num_crops_h = 1 - - # Portrait image. - else: - # Only apply PaS if the image is sufficiently exaggerated - if height / width < pan_and_scan_min_ratio_to_activate: - return [] - - # Select ideal number of crops close to the image aspect ratio and such that crop_size > min_crop_size. - num_crops_h = int(math.floor(height / width + 0.5)) - num_crops_h = min( - int(math.floor(height / pan_and_scan_min_crop_size)), num_crops_h - ) - - # Make sure the number of crops is in range [2, pan_and_scan_max_num_crops]. - num_crops_h = max(2, num_crops_h) - num_crops_h = min(pan_and_scan_max_num_crops, num_crops_h) - num_crops_w = 1 - - crop_size_w = int(math.ceil(width / num_crops_w)) - crop_size_h = int(math.ceil(height / num_crops_h)) - - # Don't apply PaS if crop size is too small. - if min(crop_size_w, crop_size_h) < pan_and_scan_min_crop_size: - return [] - - crop_positions_w = [crop_size_w * i for i in range(num_crops_w)] - crop_positions_h = [crop_size_h * i for i in range(num_crops_h)] - - if input_data_format == ChannelDimension.LAST: - image_crops = [ - image[pos_h : pos_h + crop_size_h, pos_w : pos_w + crop_size_w] - for pos_h, pos_w in itertools.product( - crop_positions_h, crop_positions_w - ) - ] - else: - image_crops = [ - image[:, pos_h : pos_h + crop_size_h, pos_w : pos_w + crop_size_w] - for pos_h, pos_w in itertools.product( - crop_positions_h, crop_positions_w - ) - ] - - return image_crops - - def _process_images_for_pan_and_scan( - self, - images: List[np.ndarray], - do_pan_and_scan: bool, - pan_and_scan_min_crop_size: int, - pan_and_scan_max_num_crops: int, - pan_and_scan_min_ratio_to_activate: float, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - ): - pas_images_list = [] - num_crops = [] - for image in images: - pas_images = self.pan_and_scan( - image=image, - pan_and_scan_min_crop_size=pan_and_scan_min_crop_size, - pan_and_scan_max_num_crops=pan_and_scan_max_num_crops, - pan_and_scan_min_ratio_to_activate=pan_and_scan_min_ratio_to_activate, - data_format=data_format, - input_data_format=input_data_format, - ) - pas_images_list.extend([image] + pas_images) - num_crops.append(len(pas_images)) - return pas_images_list, num_crops - - @filter_out_non_signature_kwargs() - def preprocess( - self, - images: ImageInput, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - do_convert_rgb: bool = None, - do_pan_and_scan: bool = None, - pan_and_scan_min_crop_size: int = None, - pan_and_scan_max_num_crops: int = None, - pan_and_scan_min_ratio_to_activate: float = None, - ) -> PIL.Image.Image: - """ - Preprocess an image or batch of images. - - Args: - images (`ImageInput`): - Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If - passing in images with pixel values between 0 and 1, set `do_rescale=False`. - do_resize (`bool`, *optional*, defaults to `self.do_resize`): - Whether to resize the image. - size (`Dict[str, int]`, *optional*, defaults to `self.size`): - Size of the image after resizing. - resample (`int`, *optional*, defaults to `self.resample`): - Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only - has an effect if `do_resize` is set to `True`. - do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): - Whether to rescale the image. - rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): - Rescale factor to rescale the image by if `do_rescale` is set to `True`. - do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): - Whether to normalize the image. - image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): - Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. - image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): - Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to - `True`. - return_tensors (`str` or `TensorType`, *optional*): - The type of tensors to return. Can be one of: - - Unset: Return a list of `np.ndarray`. - - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. - - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. - - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. - - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. - data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): - The channel dimension format for the output image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - Unset: Use the channel dimension format of the input image. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the input image. If unset, the channel dimension format is inferred - from the input image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): - Whether to convert the image to RGB. - do_pan_and_scan (`bool`, *optional*, defaults to `self.do_convert_rgb`): - Whether to apply `pan_and_scan` to images. - pan_and_scan_min_crop_size (`int`, *optional*, defaults to `self.pan_and_scan_min_crop_size`): - Minimum size of each crop in pan and scan. - pan_and_scan_max_num_crops (`int`, *optional*, defaults to `self.pan_and_scan_max_num_crops`): - Maximum number of crops per image in pan and scan. - pan_and_scan_min_ratio_to_activate (`float`, *optional*, defaults to `self.pan_and_scan_min_ratio_to_activate`): - Minimum aspect ratio to activate pan and scan. - """ - do_resize = do_resize if do_resize is not None else self.do_resize - size = size if size is not None else self.size - size = get_size_dict(size, param_name="size", default_to_square=False) - resample = resample if resample is not None else self.resample - do_rescale = do_rescale if do_rescale is not None else self.do_rescale - rescale_factor = ( - rescale_factor if rescale_factor is not None else self.rescale_factor - ) - do_normalize = do_normalize if do_normalize is not None else self.do_normalize - image_mean = image_mean if image_mean is not None else self.image_mean - image_std = image_std if image_std is not None else self.image_std - do_convert_rgb = ( - do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb - ) - do_pan_and_scan = ( - do_pan_and_scan if do_pan_and_scan is not None else self.do_pan_and_scan - ) - pan_and_scan_min_crop_size = ( - pan_and_scan_min_crop_size - if pan_and_scan_min_crop_size is not None - else self.pan_and_scan_min_crop_size - ) - pan_and_scan_max_num_crops = ( - pan_and_scan_max_num_crops - if pan_and_scan_max_num_crops is not None - else self.pan_and_scan_max_num_crops - ) - pan_and_scan_min_ratio_to_activate = ( - pan_and_scan_min_ratio_to_activate - if pan_and_scan_min_ratio_to_activate is not None - else self.pan_and_scan_min_ratio_to_activate - ) - - images_list = make_nested_list_of_images(images) - - if not valid_images(images_list[0]): - raise ValueError( - "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " - "torch.Tensor, tf.Tensor or jax.ndarray." - ) - - validate_preprocess_arguments( - do_rescale=do_rescale, - rescale_factor=rescale_factor, - do_normalize=do_normalize, - image_mean=image_mean, - image_std=image_std, - do_resize=do_resize, - size=size, - resample=resample, - ) - if do_convert_rgb: - images_list = [ - [convert_to_rgb(image) for image in images] for images in images_list - ] - - # All transformations expect numpy arrays. - images_list = [ - [to_numpy_array(image) for image in images] for images in images_list - ] - - if do_rescale and is_scaled_image(images_list[0][0]): - logger.warning( - "It looks like you are trying to rescale already rescaled images. If the input" - " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." - ) - - if input_data_format is None: - # We assume that all images have the same channel dimension format. - input_data_format = infer_channel_dimension_format(images_list[0][0]) - - if do_pan_and_scan: - images_list_and_num_crops = [ - self._process_images_for_pan_and_scan( - images=images, - do_pan_and_scan=do_pan_and_scan, - pan_and_scan_min_crop_size=pan_and_scan_min_crop_size, - pan_and_scan_max_num_crops=pan_and_scan_max_num_crops, - pan_and_scan_min_ratio_to_activate=pan_and_scan_min_ratio_to_activate, - data_format=data_format, - input_data_format=input_data_format, - ) - for images in images_list - ] - images_list = [images for images, _ in images_list_and_num_crops] - num_crops = [num_crops for _, num_crops in images_list_and_num_crops] - else: - num_crops = [[0] for images in images_list] - - processed_images = [] - for images in images_list: - for image in images: - if do_resize: - height, width = size["height"], size["width"] - image = resize( - image=image, - size=(height, width), - resample=resample, - input_data_format=input_data_format, - ) - - if do_rescale: - image = rescale( - image=image, - scale=rescale_factor, - input_data_format=input_data_format, - ) - - if do_normalize: - image = normalize( - image=image, - mean=image_mean, - std=image_std, - input_data_format=input_data_format, - ) - - image = to_channel_dimension_format( - image, data_format, input_channel_dim=input_data_format - ) - processed_images.append(image) - - data = {"pixel_values": processed_images, "num_crops": num_crops} - return BatchFeature(data=data, tensor_type=return_tensors) - - -class Gemma3TextConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`Gemma3TextModel`]. It is used to instantiate an Gemma3Text - model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the Gemma3Text-7B. - e.g. [google/gemma3_text-7b](https://huggingface.co/google/gemma3_text-7b) - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - Args: - vocab_size (`int`, *optional*, defaults to 262208): - Vocabulary size of the Gemma3Text model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`Gemma3TextModel`] - hidden_size (`int`, *optional*, defaults to 2304): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 9216): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 26): - Number of hidden layers in the Transformer decoder. - num_attention_heads (`int`, *optional*, defaults to 8): - Number of attention heads for each attention layer in the Transformer decoder. - num_key_value_heads (`int`, *optional*, defaults to 4): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to - `num_attention_heads`. - head_dim (`int`, *optional*, defaults to 256): - The attention head dimension. - hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`): - The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"` - if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function. - max_position_embeddings (`int`, *optional*, defaults to 131072): - The maximum sequence length that this model might ever be used with. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - pad_token_id (`int`, *optional*, defaults to 0): - Padding token id. - eos_token_id (`int`, *optional*, defaults to 1): - End of stream token id. - bos_token_id (`int`, *optional*, defaults to 2): - Beginning of stream token id. - tie_word_embeddings (`bool`, *optional*, defaults to `True`): - Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings. - attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): - Whether to use a bias in the query, key, value and output projection layers during self-attention. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - query_pre_attn_scalar (`float`, *optional*, defaults to 256): - Scaling factor used on the attention scores - sliding_window (`int`, *optional*, defaults to 4096): in Gemma3Text, every other layer uses sliding window attention. This is the - size of the sliding window. - final_logit_softcapping (`float`, *optional*): - Scaling factor when applying tanh softcapping on the logits. - attn_logit_softcapping (`float`, *optional*): - Scaling factor when applying tanh softcapping on the attention scores. - cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings used in gloabl attention. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE - rope_local_base_freq (float, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings for local attention. - sliding_window_pattern (`int`, *optional*, defaults to 6): - Pattern for the sliding window attention. - - ```python - >>> from transformers import Gemma3TextModel, Gemma3TextConfig - >>> # Initializing a Gemma3Text gemma3_text-7b style configuration - >>> configuration = Gemma3TextConfig() - >>> # Initializing a model from the gemma3_text-7b style configuration - >>> model = Gemma3TextModel(configuration) - >>> # Accessing the model configuration - >>> configuration = model.config - ``` - rope_local_base_freq (float, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings for local attention. - sliding_window_pattern (`int`, *optional*, defaults to 6): - Pattern for the sliding window attention. - """ - - model_type = "gemma3_text" - keys_to_ignore_at_inference = ["past_key_values"] - base_model_tp_plan = { - "layers.*.self_attn.q_proj": "colwise", - "layers.*.self_attn.k_proj": "colwise", - "layers.*.self_attn.v_proj": "colwise", - "layers.*.self_attn.o_proj": "rowwise", - "layers.*.mlp.gate_proj": "colwise", - "layers.*.mlp.up_proj": "colwise", - "layers.*.mlp.down_proj": "rowwise", - } - base_model_pp_plan = { - "embed_tokens": (["input_ids"], ["inputs_embeds"]), - "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), - "norm": (["hidden_states"], ["hidden_states"]), - } - - def __init__( - self, - vocab_size=262_208, - hidden_size=2304, - intermediate_size=9216, - num_hidden_layers=26, - num_attention_heads=8, - num_key_value_heads=4, - head_dim=256, - hidden_activation="gelu_pytorch_tanh", - max_position_embeddings=131_072, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=0, - eos_token_id=1, - bos_token_id=2, - tie_word_embeddings=True, - rope_theta=1_000_000.0, - attention_bias=False, - attention_dropout=0.0, - query_pre_attn_scalar=256, - sliding_window=4096, - final_logit_softcapping=None, - attn_logit_softcapping=None, - cache_implementation="hybrid", - rope_scaling=None, - rope_local_base_freq=10_000.0, - sliding_window_pattern=6, - **kwargs, - ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.head_dim = head_dim - self.num_key_value_heads = num_key_value_heads - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.rope_theta = rope_theta - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.hidden_activation = hidden_activation - self.query_pre_attn_scalar = query_pre_attn_scalar - self.sliding_window = sliding_window - self.final_logit_softcapping = final_logit_softcapping - self.attn_logit_softcapping = attn_logit_softcapping - self.cache_implementation = cache_implementation - - self.rope_local_base_freq = rope_local_base_freq - # For configuring HybridCache to work with 5:1 attention pattern - self.sliding_window_pattern = sliding_window_pattern - self.rope_scaling = rope_scaling - rope_config_validation(self) - - -class Gemma3Config(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`Gemma3ForConditionalGeneration`]. It is used to instantiate an - Gemma3ForConditionalGeneration according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of the PaliGemma-2B. - - e.g. [google/gemma-3-4b](https://huggingface.co/google/gemma-3-4b) - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - Args: - text_config (`Union[Gemma3TextConfig, dict]`, *optional*): - The config object of the text backbone. - vision_config (`Union[AutoConfig, dict]`, *optional*): - Custom vision config or dict. - mm_tokens_per_image (`int`, *optional*, defaults to 256): - The number of tokens per image embedding. - boi_token_index (`int`, *optional*, defaults to 255999): - The begin-of-image token index to wrap the image prompt. - eoi_token_index (`int`, *optional*, defaults to 256000): - The end-of-image token index to wrap the image prompt. - image_token_index (`int`, *optional*, defaults to 262144): - The image token index to encode the image prompt. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - - - Example: - - ```python - >>> from transformers import Gemma3ForConditionalGeneration, Gemma3Config, SiglipVisionConfig, Gemma3TextConfig - - >>> # Initializing a Siglip-like vision config - >>> vision_config = SiglipVisionConfig() - - >>> # Initializing a Gemma3 Text config - >>> text_config = Gemma3TextConfig() - - >>> # Initializing a Gemma3 gemma-3-4b style configuration - >>> configuration = Gemma3Config(vision_config, text_config) - - >>> # Initializing a model from the gemma-3-4b style configuration - >>> model = Gemma3TextConfig(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "gemma3" - sub_configs = { - "text_config": Gemma3TextConfig, - "vision_config": SiglipVisionConfig, - } - - def __init__( - self, - text_config: Optional[Gemma3TextConfig] = None, - vision_config: Optional[SiglipVisionConfig] = None, - mm_tokens_per_image: int = 256, - boi_token_index: int = 255_999, - eoi_token_index: int = 256_000, - image_token_index: int = 262_144, - initializer_range: float = 0.02, - **kwargs, - ): - if text_config is None: - text_config = Gemma3TextConfig() - # logger.info( - # "text_config is None, using default Gemma3TextConfig config." - # ) - elif isinstance(text_config, dict): - text_config = Gemma3TextConfig(**text_config) - - if isinstance(vision_config, dict): - vision_config = SiglipVisionConfig(**vision_config) - elif isinstance(vision_config, SiglipVisionConfig): - pass - else: - # logger.info( - # "vision_config is None or incompatible with Gemma3VisionConfig initialization. Gemma3 will be limited " - # "to text tasks." - # ) - # logger.info(f"vision_config: {vision_config}") - vision_config = SiglipVisionConfig() - - self.text_config = text_config - self.vision_config = vision_config - self.mm_tokens_per_image = mm_tokens_per_image - self.boi_token_index = boi_token_index - self.eoi_token_index = eoi_token_index - self.image_token_index = image_token_index - self.initializer_range = initializer_range - - super().__init__(**kwargs) - - -AutoProcessor.register( - config_class=Gemma3Config, processor_class=Gemma3Processor, exist_ok=True -) - -AutoImageProcessor.register( - config_class=Gemma3Config, - image_processor_class=None, - slow_image_processor_class=Gemma3ImageProcessor, - fast_image_processor_class=None, - exist_ok=True, -) diff --git a/python/sglang/srt/configs/qwen2_5_vl_config.py b/python/sglang/srt/configs/qwen2_5_vl_config.py deleted file mode 100644 index b04edf26d..000000000 --- a/python/sglang/srt/configs/qwen2_5_vl_config.py +++ /dev/null @@ -1,1006 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Qwen2VL model configuration""" -from typing import Dict, Iterable, List, Optional, Union - -import numpy as np -from transformers import ( - AutoImageProcessor, - AutoProcessor, - BaseImageProcessor, - BatchFeature, - PretrainedConfig, - ProcessorMixin, - TensorType, -) -from transformers.image_transforms import ( - convert_to_rgb, - normalize, - rescale, - resize, - to_channel_dimension_format, -) -from transformers.image_utils import ( - ChannelDimension, - ImageInput, - PILImageResampling, - VideoInput, - get_image_size, - infer_channel_dimension_format, - is_pil_image, - is_valid_image, - make_list_of_images, - to_numpy_array, - valid_images, - validate_preprocess_arguments, -) -from transformers.modeling_rope_utils import rope_config_validation -from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize -from transformers.processing_utils import ProcessingKwargs, Unpack, VideosKwargs -from transformers.tokenization_utils_base import PreTokenizedInput, TextInput -from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD - - -def is_valid_list_of_images(images: List): - return images and all(is_valid_image(image) for image in images) - - -class Qwen2_5_VLVisionConfig(PretrainedConfig): - model_type = "qwen2_5_vl" - base_config_key = "vision_config" - - def __init__( - self, - depth=32, - hidden_size=3584, - hidden_act="silu", - intermediate_size=3420, - num_heads=16, - in_channels=3, - patch_size=14, - spatial_merge_size=2, - temporal_patch_size=2, - tokens_per_second=4, - window_size=112, - out_hidden_size=3584, - fullatt_block_indexes=[7, 15, 23, 31], - **kwargs, - ): - super().__init__(**kwargs) - - self.depth = depth - self.hidden_size = hidden_size - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.num_heads = num_heads - self.in_channels = in_channels - self.patch_size = patch_size - self.spatial_merge_size = spatial_merge_size - self.temporal_patch_size = temporal_patch_size - self.tokens_per_second = tokens_per_second - self.window_size = window_size - self.fullatt_block_indexes = fullatt_block_indexes - self.out_hidden_size = out_hidden_size - - -class Qwen2_5_VLConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`Qwen2_5_VLModel`]. It is used to instantiate a - Qwen2-VL model according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of - Qwen2-VL-7B-Instruct [Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct). - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 152064): - Vocabulary size of the Qwen2_5_VL model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`Qwen2_5_VLModel`] - hidden_size (`int`, *optional*, defaults to 8192): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 29568): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 80): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 64): - Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*, defaults to 8): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 32768): - The maximum sequence length that this model might ever be used with. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-05): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings. - use_sliding_window (`bool`, *optional*, defaults to `False`): - Whether to use sliding window attention. - sliding_window (`int`, *optional*, defaults to 4096): - Sliding window attention (SWA) window size. If not specified, will default to `4096`. - max_window_layers (`int`, *optional*, defaults to 80): - The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - vision_config (`Dict`, *optional*): - The config for the visual encoder initialization. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE - - ```python - >>> from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLConfig - - >>> # Initializing a Qwen2_5_VL style configuration - >>> configuration = Qwen2_5_VLConfig() - - >>> # Initializing a model from the Qwen2-VL-7B style configuration - >>> model = Qwen2_5_VLForConditionalGeneration(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "qwen2_5_vl" - sub_configs = {"vision_config": Qwen2_5_VLVisionConfig} - keys_to_ignore_at_inference = ["past_key_values"] - # Default tensor parallel plan for base model `Qwen2_5_VL` - base_model_tp_plan = { - "layers.*.self_attn.q_proj": "colwise", - "layers.*.self_attn.k_proj": "colwise", - "layers.*.self_attn.v_proj": "colwise", - "layers.*.self_attn.o_proj": "rowwise", - "layers.*.mlp.gate_proj": "colwise", - "layers.*.mlp.up_proj": "colwise", - "layers.*.mlp.down_proj": "rowwise", - } - - def __init__( - self, - vocab_size=152064, - hidden_size=8192, - intermediate_size=29568, - num_hidden_layers=80, - num_attention_heads=64, - num_key_value_heads=8, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=1e-05, - use_cache=True, - tie_word_embeddings=False, - rope_theta=1000000.0, - use_sliding_window=False, - sliding_window=4096, - max_window_layers=80, - attention_dropout=0.0, - vision_config=None, - rope_scaling=None, - **kwargs, - ): - if isinstance(vision_config, dict): - self.vision_config = self.sub_configs["vision_config"](**vision_config) - elif vision_config is None: - self.vision_config = self.sub_configs["vision_config"]() - - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.use_sliding_window = use_sliding_window - self.sliding_window = sliding_window - self.max_window_layers = max_window_layers - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.rope_theta = rope_theta - self.attention_dropout = attention_dropout - self.rope_scaling = rope_scaling - - # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, move it to 'rope_type'. - # and change type from 'mrope' to 'default' because `mrope` does defeault RoPE calculations - # one can set it to "linear"/"dynamic" etc. to have scaled RoPE - # TODO: @raushan update config in the hub - if self.rope_scaling is not None and "type" in self.rope_scaling: - if self.rope_scaling["type"] == "mrope": - self.rope_scaling["type"] = "default" - self.rope_scaling["rope_type"] = self.rope_scaling["type"] - rope_config_validation(self, ignore_keys={"mrope_section"}) - - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) - - -# FIXME: workaround of obsolete transformers version - - -class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False): - fps: Union[List[float], float] - - -class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False): - videos_kwargs: Qwen2_5_VLVideosProcessorKwargs - _defaults = { - "text_kwargs": { - "padding": False, - }, - "videos_kwargs": {"fps": 2.0}, - } - - -class Qwen2_5_VLProcessor(ProcessorMixin): - r""" - Constructs a Qwen2.5-VL processor which wraps a Qwen2.5-VL image processor and a Qwen2 tokenizer into a single processor. - [`Qwen2_5_VLProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the - [`~Qwen2_5_VLProcessor.__call__`] and [`~Qwen2_5_VLProcessor.decode`] for more information. - Args: - image_processor ([`Qwen2VLImageProcessor`], *optional*): - The image processor is a required input. - tokenizer ([`Qwen2TokenizerFast`], *optional*): - The tokenizer is a required input. - chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages - in a chat into a tokenizable string. - """ - - attributes = ["image_processor", "tokenizer"] - valid_kwargs = ["chat_template"] - - image_processor_class = "AutoImageProcessor" - tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast") - - def __init__( - self, image_processor=None, tokenizer=None, chat_template=None, **kwargs - ): - self.image_token = ( - "<|image_pad|>" - if not hasattr(tokenizer, "image_token") - else tokenizer.image_token - ) - self.video_token = ( - "<|video_pad|>" - if not hasattr(tokenizer, "video_token") - else tokenizer.video_token - ) - super().__init__(image_processor, tokenizer, chat_template=chat_template) - - def __call__( - self, - images: ImageInput = None, - text: Union[ - TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput] - ] = None, - videos: VideoInput = None, - **kwargs: Unpack[Qwen2_5_VLProcessorKwargs], - ) -> BatchFeature: - """ - Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` - and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode - the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to - Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`. - - Args: - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. Both channels-first and channels-last formats are supported. - text (`str`, `List[str]`, `List[List[str]]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`): - The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch - tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - `'tf'`: Return TensorFlow `tf.constant` objects. - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - - `'jax'`: Return JAX `jnp.ndarray` objects. - - Returns: - [`BatchFeature`]: A [`BatchFeature`] with the following fields: - - - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. - - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when - `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not - `None`). - - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. - - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`. - - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`. - - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`. - - **second_per_grid_ts** -- List of video seconds per time grid. Returned when `videos` is not `None`. - """ - output_kwargs = self._merge_kwargs( - Qwen2_5_VLProcessorKwargs, - tokenizer_init_kwargs=self.tokenizer.init_kwargs, - **kwargs, - ) - if images is not None: - image_inputs = self.image_processor( - images=images, videos=None, **output_kwargs["images_kwargs"] - ) - image_grid_thw = image_inputs["image_grid_thw"] - else: - image_inputs = {} - image_grid_thw = None - - if videos is not None: - videos_inputs = self.image_processor( - images=None, videos=videos, **output_kwargs["images_kwargs"] - ) - video_grid_thw = videos_inputs["video_grid_thw"] - - fps = output_kwargs["videos_kwargs"].pop("fps", 2.0) - if isinstance(fps, (int, float)): - second_per_grid_ts = [ - self.image_processor.temporal_patch_size / fps - ] * len(video_grid_thw) - elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw): - second_per_grid_ts = [ - self.image_processor.temporal_patch_size / tmp for tmp in fps - ] - else: - raise ValueError( - f"The length of fps ({len(fps) if hasattr(fps, '__len__') else fps}) must be equal to the length of video_grid_thw ({len(video_grid_thw)}) or fps should be a single number." - ) - videos_inputs.update({"second_per_grid_ts": second_per_grid_ts}) - - else: - videos_inputs = {} - video_grid_thw = None - - if not isinstance(text, list): - text = [text] - - if image_grid_thw is not None: - merge_length = self.image_processor.merge_size**2 - index = 0 - for i in range(len(text)): - while self.image_token in text[i]: - text[i] = text[i].replace( - self.image_token, - "<|placeholder|>" - * (image_grid_thw[index].prod() // merge_length), - 1, - ) - index += 1 - text[i] = text[i].replace("<|placeholder|>", self.image_token) - - if video_grid_thw is not None: - merge_length = self.image_processor.merge_size**2 - index = 0 - for i in range(len(text)): - while self.video_token in text[i]: - text[i] = text[i].replace( - self.video_token, - "<|placeholder|>" - * (video_grid_thw[index].prod() // merge_length), - 1, - ) - index += 1 - text[i] = text[i].replace("<|placeholder|>", self.video_token) - - text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) - - return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}) - - def batch_decode(self, *args, **kwargs): - """ - This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please - refer to the docstring of this method for more information. - """ - return self.tokenizer.batch_decode(*args, **kwargs) - - def decode(self, *args, **kwargs): - """ - This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to - the docstring of this method for more information. - """ - return self.tokenizer.decode(*args, **kwargs) - - def post_process_image_text_to_text(self, generated_outputs): - """ - Post-process the output of the model to decode the text. - - Args: - generated_outputs (`torch.Tensor` or `np.ndarray`): - The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)` - or `(sequence_length,)`. - - Returns: - `List[str]`: The decoded text. - """ - return self.tokenizer.batch_decode( - generated_outputs, - skip_special_tokens=True, - clean_up_tokenization_spaces=False, - ) - - @property - def model_input_names(self): - tokenizer_input_names = self.tokenizer.model_input_names - image_processor_input_names = self.image_processor.model_input_names - names_from_processor = list( - dict.fromkeys(tokenizer_input_names + image_processor_input_names) - ) - return names_from_processor + ["second_per_grid_ts"] - - -class Qwen2_5_VLImageProcessor(BaseImageProcessor): - r""" - Constructs a Qwen2.5-VL image processor that dynamically resizes images based on the original images. - - Args: - do_resize (`bool`, *optional*, defaults to `True`): - Whether to resize the image's (height, width) dimensions. - resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): - Resampling filter to use when resizing the image. - do_rescale (`bool`, *optional*, defaults to `True`): - Whether to rescale the image by the specified scale `rescale_factor`. - rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): - Scale factor to use if rescaling the image. - do_normalize (`bool`, *optional*, defaults to `True`): - Whether to normalize the image. - image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`): - Mean to use if normalizing the image. This is a float or list of floats for each channel in the image. - image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`): - Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image. - do_convert_rgb (`bool`, *optional*, defaults to `True`): - Whether to convert the image to RGB. - min_pixels (`int`, *optional*, defaults to `56 * 56`): - The min pixels of the image to resize the image. - max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`): - The max pixels of the image to resize the image. - patch_size (`int`, *optional*, defaults to 14): - The spacial patch size of the vision encoder. - temporal_patch_size (`int`, *optional*, defaults to 2): - The temporal patch size of the vision encoder. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. - """ - - model_input_names = [ - "pixel_values", - "image_grid_thw", - "pixel_values_videos", - "video_grid_thw", - "second_per_grid_ts", - ] - - def __init__( - self, - do_resize: bool = True, - resample: PILImageResampling = PILImageResampling.BICUBIC, - do_rescale: bool = True, - rescale_factor: Union[int, float] = 1 / 255, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_convert_rgb: bool = True, - min_pixels: int = 56 * 56, - max_pixels: int = 28 * 28 * 1280, - patch_size: int = 14, - temporal_patch_size: int = 2, - merge_size: int = 2, - **kwargs, - ) -> None: - super().__init__(**kwargs) - self.do_resize = do_resize - self.resample = resample - self.do_rescale = do_rescale - self.rescale_factor = rescale_factor - self.do_normalize = do_normalize - self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN - self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD - self.min_pixels = min_pixels - self.max_pixels = max_pixels - self.patch_size = patch_size - self.temporal_patch_size = temporal_patch_size - self.merge_size = merge_size - self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels} - self.do_convert_rgb = do_convert_rgb - - def rescale( - self, - image: np.ndarray, - scale: float, - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, - ) -> np.ndarray: - """ - Rescale an image by a scale factor. image = image * scale. - - Args: - image (`np.ndarray`): - Image to rescale. - scale (`float`): - The scaling factor to rescale pixel values by. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format for the output image. If unset, the channel dimension format of the input - image is used. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the input image. If unset, the channel dimension format is inferred - from the input image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - Returns: - `np.ndarray`: The rescaled image. - """ - return rescale( - image, - scale=scale, - data_format=data_format, - input_data_format=input_data_format, - **kwargs, - ) - - def normalize( - self, - image: np.ndarray, - mean: Union[float, Iterable[float]], - std: Union[float, Iterable[float]], - data_format: Optional[Union[str, ChannelDimension]] = None, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, - ) -> np.ndarray: - """ - Normalize an image. image = (image - image_mean) / image_std. - - Args: - image (`np.ndarray`): - Image to normalize. - mean (`float` or `Iterable[float]`): - Image mean to use for normalization. - std (`float` or `Iterable[float]`): - Image standard deviation to use for normalization. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format for the output image. If unset, the channel dimension format of the input - image is used. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the input image. If unset, the channel dimension format is inferred - from the input image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - Returns: - `np.ndarray`: The normalized image. - """ - return normalize( - image, - mean=mean, - std=std, - data_format=data_format, - input_data_format=input_data_format, - **kwargs, - ) - - def _preprocess( - self, - images: Union[ImageInput, VideoInput], - do_resize: bool = None, - resample: PILImageResampling = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_convert_rgb: bool = None, - data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - ): - """ - Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`. - - Args: - images (`ImageInput`): - Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`. - vision_info (`List[Dict]`, *optional*): - Optional list of dictionaries containing additional information about vision inputs. - do_resize (`bool`, *optional*, defaults to `self.do_resize`): - Whether to resize the image. - resample (`PILImageResampling`, *optional*, defaults to `self.resample`): - Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums. - do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): - Whether to rescale the image. - rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): - Scale factor to use if rescaling the image. - do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): - Whether to normalize the image. - image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): - Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image. - image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): - Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image. - do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): - Whether to convert the image to RGB. - data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`): - The channel dimension format for the output image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - Unset: Use the channel dimension format of the input image. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the input image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - """ - images = make_list_of_images(images) - - if do_convert_rgb: - images = [convert_to_rgb(image) for image in images] - - # All transformations expect numpy arrays. - images = [to_numpy_array(image) for image in images] - - if input_data_format is None: - # We assume that all images have the same channel dimension format. - input_data_format = infer_channel_dimension_format(images[0]) - - height, width = get_image_size(images[0], channel_dim=input_data_format) - resized_height, resized_width = height, width - processed_images = [] - for image in images: - if do_resize: - resized_height, resized_width = smart_resize( - height, - width, - factor=self.patch_size * self.merge_size, - min_pixels=self.min_pixels, - max_pixels=self.max_pixels, - ) - image = resize( - image, - size=(resized_height, resized_width), - resample=resample, - input_data_format=input_data_format, - ) - - if do_rescale: - image = self.rescale( - image, scale=rescale_factor, input_data_format=input_data_format - ) - - if do_normalize: - image = self.normalize( - image=image, - mean=image_mean, - std=image_std, - input_data_format=input_data_format, - ) - - image = to_channel_dimension_format( - image, data_format, input_channel_dim=input_data_format - ) - processed_images.append(image) - - patches = np.array(processed_images) - if data_format == ChannelDimension.LAST: - patches = patches.transpose(0, 3, 1, 2) - if patches.shape[0] % self.temporal_patch_size != 0: - repeats = np.repeat( - patches[-1][np.newaxis], self.temporal_patch_size - 1, axis=0 - ) - patches = np.concatenate([patches, repeats], axis=0) - channel = patches.shape[1] - grid_t = patches.shape[0] // self.temporal_patch_size - grid_h, grid_w = ( - resized_height // self.patch_size, - resized_width // self.patch_size, - ) - patches = patches.reshape( - grid_t, - self.temporal_patch_size, - channel, - grid_h // self.merge_size, - self.merge_size, - self.patch_size, - grid_w // self.merge_size, - self.merge_size, - self.patch_size, - ) - patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8) - flatten_patches = patches.reshape( - grid_t * grid_h * grid_w, - channel * self.temporal_patch_size * self.patch_size * self.patch_size, - ) - - return flatten_patches, (grid_t, grid_h, grid_w) - - def preprocess( - self, - images: ImageInput, - videos: VideoInput = None, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: PILImageResampling = None, - do_rescale: bool = None, - rescale_factor: float = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_convert_rgb: bool = None, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - ): - """ - Args: - images (`ImageInput`): - Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If - passing in images with pixel values between 0 and 1, set `do_rescale=False`. - videos (`VideoInput`): - Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If - passing in videos with pixel values between 0 and 1, set `do_rescale=False`. - do_resize (`bool`, *optional*, defaults to `self.do_resize`): - Whether to resize the image. - size (`Dict[str, int]`, *optional*, defaults to `self.size`): - Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with - the longest edge resized to keep the input aspect ratio. - resample (`int`, *optional*, defaults to `self.resample`): - Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only - has an effect if `do_resize` is set to `True`. - do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): - Whether to rescale the image. - rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): - Rescale factor to rescale the image by if `do_rescale` is set to `True`. - do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): - Whether to normalize the image. - image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): - Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. - image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): - Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to - `True`. - do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): - Whether to convert the image to RGB. - return_tensors (`str` or `TensorType`, *optional*): - The type of tensors to return. Can be one of: - - Unset: Return a list of `np.ndarray`. - - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. - - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. - - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. - - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. - data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): - The channel dimension format for the output image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - Unset: Use the channel dimension format of the input image. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the input image. If unset, the channel dimension format is inferred - from the input image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - - """ - do_resize = do_resize if do_resize is not None else self.do_resize - size = size if size is not None else self.size - resample = resample if resample is not None else self.resample - do_rescale = do_rescale if do_rescale is not None else self.do_rescale - rescale_factor = ( - rescale_factor if rescale_factor is not None else self.rescale_factor - ) - do_normalize = do_normalize if do_normalize is not None else self.do_normalize - image_mean = image_mean if image_mean is not None else self.image_mean - image_std = image_std if image_std is not None else self.image_std - do_convert_rgb = ( - do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb - ) - - def make_flat_list_of_images( - images: Union[List[ImageInput], ImageInput], - ) -> ImageInput: - """ - Ensure that the output is a flat list of images. If the input is a single image, it is converted to a list of length 1. - If the input is a nested list of images, it is converted to a flat list of images. - Args: - images (`Union[List[ImageInput], ImageInput]`): - The input image. - Returns: - list: A list of images or a 4d array of images. - """ - # If the input is a nested list of images, we flatten it - if ( - isinstance(images, (list, tuple)) - and all(isinstance(images_i, (list, tuple)) for images_i in images) - and all(is_valid_list_of_images(images_i) for images_i in images) - ): - return [img for img_list in images for img in img_list] - - if isinstance(images, (list, tuple)) and is_valid_list_of_images(images): - if is_pil_image(images[0]) or images[0].ndim == 3: - return images - if images[0].ndim == 4: - return [img for img_list in images for img in img_list] - - if is_valid_image(images): - if is_pil_image(images) or images.ndim == 3: - return [images] - if images.ndim == 4: - return list(images) - - raise ValueError(f"Could not make a flat list of images from {images}") - - def make_batched_videos(videos) -> VideoInput: - """ - Ensure that the input is a list of videos. - Args: - videos (`VideoInput`): - Video or videos to turn into a list of videos. - Returns: - list: A list of videos. - """ - if ( - isinstance(videos, (list, tuple)) - and isinstance(videos[0], (list, tuple)) - and is_valid_image(videos[0][0]) - ): - # case 1: nested batch of videos so we flatten it - if not is_pil_image(videos[0][0]) and videos[0][0].ndim == 4: - videos = [ - [video for batch_list in batched_videos for video in batch_list] - for batched_videos in videos - ] - # case 2: list of videos represented as list of video frames - return videos - - elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - if is_pil_image(videos[0]) or videos[0].ndim == 3: - return [videos] - elif videos[0].ndim == 4: - return [list(video) for video in videos] - - elif is_valid_image(videos): - if is_pil_image(videos) or videos.ndim == 3: - return [[videos]] - elif videos.ndim == 4: - return [list(videos)] - - raise ValueError(f"Could not make batched video from {videos}") - - if images is not None: - images = make_flat_list_of_images(images) - if videos is not None: - videos = make_batched_videos(videos) - - if images is not None and not valid_images(images): - raise ValueError( - "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " - "torch.Tensor, tf.Tensor or jax.ndarray." - ) - - validate_preprocess_arguments( - rescale_factor=rescale_factor, - do_normalize=do_normalize, - image_mean=image_mean, - image_std=image_std, - do_resize=do_resize, - size=size, - resample=resample, - ) - - if images is not None: - pixel_values, vision_grid_thws = [], [] - for image in images: - patches, image_grid_thw = self._preprocess( - image, - do_resize=do_resize, - resample=resample, - do_rescale=do_rescale, - rescale_factor=rescale_factor, - do_normalize=do_normalize, - image_mean=image_mean, - image_std=image_std, - data_format=data_format, - do_convert_rgb=do_convert_rgb, - input_data_format=input_data_format, - ) - pixel_values.extend(patches) - vision_grid_thws.append(image_grid_thw) - pixel_values = np.array(pixel_values) - vision_grid_thws = np.array(vision_grid_thws) - data = {"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws} - - if videos is not None: - pixel_values, vision_grid_thws = [], [] - for images in videos: - patches, video_grid_thw = self._preprocess( - images, - do_resize=do_resize, - resample=resample, - do_rescale=do_rescale, - rescale_factor=rescale_factor, - do_normalize=do_normalize, - image_mean=image_mean, - image_std=image_std, - data_format=data_format, - do_convert_rgb=do_convert_rgb, - input_data_format=input_data_format, - ) - pixel_values.extend(patches) - vision_grid_thws.append(video_grid_thw) - pixel_values = np.array(pixel_values) - vision_grid_thws = np.array(vision_grid_thws) - data = { - "pixel_values_videos": pixel_values, - "video_grid_thw": vision_grid_thws, - } - - return BatchFeature(data=data, tensor_type=return_tensors) - - -AutoImageProcessor.register(Qwen2_5_VLConfig, None, Qwen2_5_VLImageProcessor, None) -AutoProcessor.register(Qwen2_5_VLConfig, Qwen2_5_VLProcessor) diff --git a/python/sglang/srt/hf_transformers_utils.py b/python/sglang/srt/hf_transformers_utils.py index d236ce7a8..397404d30 100644 --- a/python/sglang/srt/hf_transformers_utils.py +++ b/python/sglang/srt/hf_transformers_utils.py @@ -35,10 +35,7 @@ from sglang.srt.configs import ( DbrxConfig, DeepseekVL2Config, ExaoneConfig, - Gemma3Config, - Gemma3TextConfig, MultiModalityConfig, - Qwen2_5_VLConfig, ) from sglang.srt.connector import create_remote_connector from sglang.srt.utils import is_remote_url @@ -47,11 +44,8 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = { ChatGLMConfig.model_type: ChatGLMConfig, DbrxConfig.model_type: DbrxConfig, ExaoneConfig.model_type: ExaoneConfig, - Qwen2_5_VLConfig.model_type: Qwen2_5_VLConfig, DeepseekVL2Config.model_type: DeepseekVL2Config, MultiModalityConfig.model_type: MultiModalityConfig, - Gemma3Config.model_type: Gemma3Config, - Gemma3TextConfig.model_type: Gemma3TextConfig, } for name, cls in _CONFIG_REGISTRY.items(): @@ -223,11 +217,26 @@ def get_processor( tokenizer_revision: Optional[str] = None, **kwargs, ): + # pop 'revision' from kwargs if present. + revision = kwargs.pop("revision", tokenizer_revision) + + config = AutoConfig.from_pretrained( + tokenizer_name, + trust_remote_code=trust_remote_code, + revision=revision, + **kwargs, + ) + + # fix: for Qwen2-VL model, inject default 'size' if not provided. + if config.model_type in {"qwen2_vl"}: + if "size" not in kwargs: + kwargs["size"] = {"shortest_edge": 3136, "longest_edge": 1003520} + processor = AutoProcessor.from_pretrained( tokenizer_name, *args, trust_remote_code=trust_remote_code, - tokenizer_revision=tokenizer_revision, + revision=revision, **kwargs, ) diff --git a/python/sglang/srt/layers/rotary_embedding.py b/python/sglang/srt/layers/rotary_embedding.py index 56632092e..cbd8beb66 100644 --- a/python/sglang/srt/layers/rotary_embedding.py +++ b/python/sglang/srt/layers/rotary_embedding.py @@ -441,16 +441,12 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module): ): super().__init__() - if rotary_dim != head_size: - raise ValueError( - f"`Phi3LongRoPEScaledRotaryEmbedding` does not support \ - rotary_dim != head_size ({rotary_dim}!={head_size})." - ) if is_neox_style is False: raise ValueError( "`Phi3LongRoPEScaledRotaryEmbedding` only supports neox_style." ) + self.rotary_dim = rotary_dim self.head_size = head_size self.max_position_embeddings = max_position_embeddings self.original_max_position_embeddings = original_max_position_embeddings @@ -499,8 +495,8 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module): * ( self.base ** ( - torch.arange(0, self.head_size, 2, dtype=torch.float) - / self.head_size + torch.arange(0, self.rotary_dim, 2, dtype=torch.float) + / self.rotary_dim ) ) ) @@ -549,8 +545,15 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module): cos = cos.repeat(1, 2).unsqueeze(-2) sin = sin.repeat(1, 2).unsqueeze(-2) - query = query * cos + _rotate_neox(query) * sin - key = key * cos + _rotate_neox(key) * sin + query_rot = query[..., : self.rotary_dim] + query_pass = query[..., self.rotary_dim :] + query_rot = query_rot * cos + _rotate_neox(query_rot) * sin + query = torch.cat((query_rot, query_pass), dim=-1) + + key_rot = key[..., : self.rotary_dim] + key_pass = key[..., self.rotary_dim :] + key_rot = key_rot * cos + _rotate_neox(key_rot) * sin + key = torch.cat((key_rot, key_pass), dim=-1) return query.flatten(-2), key.flatten(-2) diff --git a/python/sglang/srt/models/gemma3_causal.py b/python/sglang/srt/models/gemma3_causal.py index b2fa84eb3..489b15798 100644 --- a/python/sglang/srt/models/gemma3_causal.py +++ b/python/sglang/srt/models/gemma3_causal.py @@ -21,11 +21,11 @@ from torch import nn from transformers import ( ROPE_INIT_FUNCTIONS, AutoModel, + Gemma3TextConfig, PretrainedConfig, PreTrainedModel, ) -from sglang.srt.configs.gemma3 import Gemma3TextConfig from sglang.srt.distributed import get_tensor_model_parallel_world_size from sglang.srt.layers.activation import GeluAndMul from sglang.srt.layers.layernorm import Gemma3RMSNorm diff --git a/python/sglang/srt/models/gemma3_mm.py b/python/sglang/srt/models/gemma3_mm.py index 561b7e834..401e65731 100644 --- a/python/sglang/srt/models/gemma3_mm.py +++ b/python/sglang/srt/models/gemma3_mm.py @@ -21,9 +21,15 @@ from typing import Dict, Iterable, List, Optional, Set, Tuple, TypedDict import torch from torch import nn -from transformers import AutoModel, PreTrainedModel +from transformers import ( + AutoModel, + BatchFeature, + Gemma3Config, + Gemma3Processor, + PreTrainedModel, +) +from transformers.models.gemma3.processing_gemma3 import Gemma3ProcessorKwargs -from sglang.srt.configs import Gemma3Config from sglang.srt.hf_transformers_utils import get_processor from sglang.srt.layers.layernorm import Gemma3RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor diff --git a/python/sglang/srt/models/llama.py b/python/sglang/srt/models/llama.py index 873b28e02..20fa13493 100644 --- a/python/sglang/srt/models/llama.py +++ b/python/sglang/srt/models/llama.py @@ -129,6 +129,8 @@ class LlamaAttention(nn.Module): self.head_dim = getattr( config, "head_dim", self.hidden_size // self.total_num_heads ) + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1) + self.rotary_dim = int(partial_rotary_factor * self.head_dim) self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 @@ -154,7 +156,7 @@ class LlamaAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, + rotary_dim=self.rotary_dim, max_position=max_position_embeddings, base=rope_theta, rope_scaling=rope_scaling, diff --git a/python/sglang/srt/models/qwen2_5_vl.py b/python/sglang/srt/models/qwen2_5_vl.py index 9fb59c26a..95f926356 100644 --- a/python/sglang/srt/models/qwen2_5_vl.py +++ b/python/sglang/srt/models/qwen2_5_vl.py @@ -34,8 +34,15 @@ from einops import rearrange from transformers import AutoModel, Qwen2VLConfig from transformers.activations import ACT2FN from transformers.models.qwen2.modeling_qwen2 import Qwen2RMSNorm +from transformers.models.qwen2_5_vl import Qwen2_5_VLProcessor +from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import ( + Qwen2_5_VLConfig, + Qwen2_5_VLVisionConfig, +) +from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import ( + Qwen2_5_VLForConditionalGeneration, +) -from sglang.srt.configs import Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig from sglang.srt.distributed import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -714,4 +721,3 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module): EntryClass = [Qwen2_5_VLForConditionalGeneration] -AutoModel.register(Qwen2_5_VLConfig, Qwen2_5_VLForConditionalGeneration) diff --git a/scripts/ci_install_dependency.sh b/scripts/ci_install_dependency.sh index 6933eeddf..ca79ebcb6 100755 --- a/scripts/ci_install_dependency.sh +++ b/scripts/ci_install_dependency.sh @@ -20,7 +20,7 @@ pip install flashinfer_python==0.2.3 --find-links ${FLASHINFER_REPO} --force-rei pip install torch_memory_saver --force-reinstall -pip install transformers==4.48.3 sentence_transformers accelerate==1.4.0 peft pandas datasets +pip install transformers==4.50.0 sentence_transformers accelerate==1.4.0 peft pandas datasets # For compling xgrammar kernels pip install cuda-python nvidia-cuda-nvrtc-cu12