diff --git a/python/pyproject.toml b/python/pyproject.toml
index 99511cf65..e39a4bdb8 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -35,7 +35,7 @@ runtime_common = [
     "python-multipart",
     "pyzmq>=25.1.2",
     "torchao>=0.7.0",
-    "transformers==4.48.3",
+    "transformers==4.50.0",
     "uvicorn",
     "uvloop",
     "xgrammar==0.1.16",
diff --git a/python/sglang/srt/configs/__init__.py b/python/sglang/srt/configs/__init__.py
index 41d0cfcae..1e8370ba7 100644
--- a/python/sglang/srt/configs/__init__.py
+++ b/python/sglang/srt/configs/__init__.py
@@ -2,21 +2,12 @@ from sglang.srt.configs.chatglm import ChatGLMConfig
 from sglang.srt.configs.dbrx import DbrxConfig
 from sglang.srt.configs.deepseekvl2 import DeepseekVL2Config
 from sglang.srt.configs.exaone import ExaoneConfig
-from sglang.srt.configs.gemma3 import Gemma3Config, Gemma3TextConfig
 from sglang.srt.configs.janus_pro import MultiModalityConfig
-from sglang.srt.configs.qwen2_5_vl_config import (
-    Qwen2_5_VLConfig,
-    Qwen2_5_VLVisionConfig,
-)
 
 __all__ = [
     "ExaoneConfig",
     "ChatGLMConfig",
     "DbrxConfig",
     "DeepseekVL2Config",
-    "Qwen2_5_VLConfig",
-    "Qwen2_5_VLVisionConfig",
     "MultiModalityConfig",
-    "Gemma3Config",
-    "Gemma3TextConfig",
 ]
diff --git a/python/sglang/srt/configs/gemma3.py b/python/sglang/srt/configs/gemma3.py
deleted file mode 100644
index b70089f5c..000000000
--- a/python/sglang/srt/configs/gemma3.py
+++ /dev/null
@@ -1,1086 +0,0 @@
-#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-#           This file was automatically generated from src/transformers/models/gemma3/modular_gemma3.py.
-#               Do NOT edit this file manually as any edits will be overwritten by the generation of
-#             the file from the modular. If any change should be done, please apply the change to the
-#                          modular_gemma3.py file directly. One of our CI enforces this.
-#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-# coding=utf-8
-# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved.
-#
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import itertools
-import logging
-import math
-import re
-from typing import Dict, Iterable, List, Optional, Union
-
-import numpy as np
-import PIL
-import transformers
-from torch import TensorType
-from transformers import (
-    AutoImageProcessor,
-    AutoProcessor,
-    BatchFeature,
-    PretrainedConfig,
-    SiglipVisionConfig,
-)
-from transformers.image_processing_utils import BaseImageProcessor, get_size_dict
-from transformers.image_transforms import (
-    convert_to_rgb,
-    resize,
-    to_channel_dimension_format,
-)
-from transformers.image_utils import (
-    ChannelDimension,
-    ImageInput,
-    PILImageResampling,
-    get_image_size,
-    infer_channel_dimension_format,
-    is_pil_image,
-    is_scaled_image,
-    is_valid_image,
-    to_numpy_array,
-    valid_images,
-    validate_preprocess_arguments,
-)
-from transformers.modeling_rope_utils import rope_config_validation
-from transformers.processing_utils import (
-    ImagesKwargs,
-    ProcessingKwargs,
-    ProcessorMixin,
-    Unpack,
-)
-from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
-from transformers.utils import (
-    IMAGENET_STANDARD_MEAN,
-    IMAGENET_STANDARD_STD,
-    filter_out_non_signature_kwargs,
-    to_py_obj,
-)
-
-logger = logging.getLogger(__name__)
-
-
-def is_valid_list_of_images(images: List):
-    return images and all(is_valid_image(image) for image in images)
-
-
-# copied from transformer
-def make_nested_list_of_images(
-    images: Union[List[ImageInput], ImageInput],
-) -> ImageInput:
-    """
-    Ensure that the output is a nested list of images.
-    Args:
-        images (`Union[List[ImageInput], ImageInput]`):
-            The input image.
-    Returns:
-        list: A list of list of images or a list of 4d array of images.
-    """
-    # If it's a list of batches, it's already in the right format
-    if (
-        isinstance(images, (list, tuple))
-        and all(isinstance(images_i, (list, tuple)) for images_i in images)
-        and all(is_valid_list_of_images(images_i) for images_i in images)
-    ):
-        return images
-
-    # If it's a list of images, it's a single batch, so convert it to a list of lists
-    if isinstance(images, (list, tuple)) and is_valid_list_of_images(images):
-        if is_pil_image(images[0]) or images[0].ndim == 3:
-            return [images]
-        if images[0].ndim == 4:
-            return [list(image) for image in images]
-
-    # If it's a single image, convert it to a list of lists
-    if is_valid_image(images):
-        if is_pil_image(images) or images.ndim == 3:
-            return [[images]]
-        if images.ndim == 4:
-            return [list(images)]
-
-    raise ValueError(
-        "Invalid input type. Must be a single image, a list of images, or a list of batches of images."
-    )
-
-
-def rescale(
-    image: np.ndarray,
-    scale: float,
-    data_format: Optional[Union[str, ChannelDimension]] = None,
-    input_data_format: Optional[Union[str, ChannelDimension]] = None,
-    **kwargs,
-) -> np.ndarray:
-    """
-    Rescale an image by a scale factor. image = image * scale.
-
-    Args:
-        image (`np.ndarray`):
-            Image to rescale.
-        scale (`float`):
-            The scaling factor to rescale pixel values by.
-        data_format (`str` or `ChannelDimension`, *optional*):
-            The channel dimension format for the output image. If unset, the channel dimension format of the input
-            image is used. Can be one of:
-            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
-            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
-        input_data_format (`ChannelDimension` or `str`, *optional*):
-            The channel dimension format for the input image. If unset, the channel dimension format is inferred
-            from the input image. Can be one of:
-            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
-            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
-
-    Returns:
-        `np.ndarray`: The rescaled image.
-    """
-    return transformers.image_transforms.rescale(
-        image,
-        scale=scale,
-        data_format=data_format,
-        input_data_format=input_data_format,
-        **kwargs,
-    )
-
-
-def normalize(
-    image: np.ndarray,
-    mean: Union[float, Iterable[float]],
-    std: Union[float, Iterable[float]],
-    data_format: Optional[Union[str, ChannelDimension]] = None,
-    input_data_format: Optional[Union[str, ChannelDimension]] = None,
-    **kwargs,
-) -> np.ndarray:
-    """
-    Normalize an image. image = (image - image_mean) / image_std.
-
-    Args:
-        image (`np.ndarray`):
-            Image to normalize.
-        mean (`float` or `Iterable[float]`):
-            Image mean to use for normalization.
-        std (`float` or `Iterable[float]`):
-            Image standard deviation to use for normalization.
-        data_format (`str` or `ChannelDimension`, *optional*):
-            The channel dimension format for the output image. If unset, the channel dimension format of the input
-            image is used. Can be one of:
-            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
-            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
-        input_data_format (`ChannelDimension` or `str`, *optional*):
-            The channel dimension format for the input image. If unset, the channel dimension format is inferred
-            from the input image. Can be one of:
-            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
-            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
-
-    Returns:
-        `np.ndarray`: The normalized image.
-    """
-    return transformers.image_transforms.normalize(
-        image,
-        mean=mean,
-        std=std,
-        data_format=data_format,
-        input_data_format=input_data_format,
-        **kwargs,
-    )
-
-
-class Gemma3ImagesKwargs(ImagesKwargs):
-    do_pan_and_scan: Optional[bool]
-    pan_and_scan_min_crop_size: Optional[int]
-    pan_and_scan_max_num_crops: Optional[int]
-    pan_and_scan_min_ratio_to_activate: Optional[float]
-    do_convert_rgb: Optional[bool]
-
-
-class Gemma3ProcessorKwargs(ProcessingKwargs, total=False):
-    images_kwargs: Gemma3ImagesKwargs
-    _defaults = {
-        "text_kwargs": {
-            "padding": False,
-        },
-        "images_kwargs": {
-            "do_pan_and_scan": False,
-            "pan_and_scan_min_crop_size": 256,
-            "pan_and_scan_max_num_crops": 4,
-            "pan_and_scan_min_ratio_to_activate": 1.2,
-        },
-    }
-
-
-class Gemma3Processor(ProcessorMixin):
-    attributes = ["image_processor", "tokenizer"]
-    valid_kwargs = ["chat_template", "image_seq_length"]
-    image_processor_class = "AutoImageProcessor"
-    tokenizer_class = "AutoTokenizer"
-
-    def __init__(
-        self,
-        image_processor,
-        tokenizer,
-        chat_template=None,
-        image_seq_length: int = 256,
-        **kwargs,
-    ):
-
-        self.image_seq_length = image_seq_length
-        self.image_token_id = tokenizer.image_token_id
-        self.boi_token = tokenizer.boi_token
-        image_tokens_expanded = "".join([tokenizer.image_token] * image_seq_length)
-        self.full_image_sequence = (
-            f"\n\n{tokenizer.boi_token}{image_tokens_expanded}{tokenizer.eoi_token}\n\n"
-        )
-
-        super().__init__(
-            image_processor=image_processor,
-            tokenizer=tokenizer,
-            chat_template=chat_template,
-            **kwargs,
-        )
-
-        # TODO: if transformers is updated, the chat_template needs to be adjusted
-        self.tokenizer.add_bos_token = False
-
-    def __call__(
-        self,
-        images: ImageInput = None,
-        text: Union[
-            TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
-        ] = None,
-        videos=None,
-        audio=None,
-        **kwargs: Unpack[Gemma3ProcessorKwargs],
-    ) -> BatchFeature:
-        if text is None and images is None:
-            raise ValueError("Provide at least one of `text` or `images`.")
-        # print(f"processing, text:{text}")
-        output_kwargs = self._merge_kwargs(
-            Gemma3ProcessorKwargs,
-            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
-            **kwargs,
-        )
-
-        if isinstance(text, str):
-            text = [text]
-        elif not isinstance(text, list) and not isinstance(text[0], str):
-            raise ValueError(
-                "Invalid input text. Please provide a string, or a list of strings"
-            )
-
-        image_inputs = {}
-        if images is not None:
-            batched_images = make_nested_list_of_images(images)
-            image_inputs = self.image_processor(
-                batched_images, **output_kwargs["images_kwargs"]
-            )
-
-            # Create empty text to be replaced with placeholders
-            if not text:
-                text = [
-                    " ".join([self.boi_token] * len(images))
-                    for images in batched_images
-                ]
-
-            if len(batched_images) != len(text):
-                raise ValueError(
-                    f"Received inconsistently sized batches of images ({len(batched_images)}) and text ({len(text)})."
-                )
-
-            # Replace image tokens by the full expanded sequence
-            batch_num_crops = to_py_obj(image_inputs.pop("num_crops"))
-            text_with_crops = text
-
-            for batch_idx, (prompt, images, num_crops) in enumerate(
-                zip(text, batched_images, batch_num_crops)
-            ):
-
-                image_indexes = [m.start() for m in re.finditer(self.boi_token, prompt)]
-
-                if len(images) != len(image_indexes):
-                    raise ValueError(
-                        f"Prompt contained {len(image_indexes)} image tokens but received {len(images)} images."
-                    )
-
-                # Insert additional image tokens for Pan-and-Scan crops
-                for num, idx in reversed(list(zip(num_crops, image_indexes))):
-                    if num:
-                        formatted_image_text = (
-                            f"Here is the original image {self.boi_token} and here are some crops to help you see better "
-                            + " ".join([self.boi_token] * num)
-                        )
-                        prompt = (
-                            prompt[:idx]
-                            + formatted_image_text
-                            + prompt[idx + len(self.boi_token) :]
-                        )
-                        text_with_crops[batch_idx] = prompt
-
-            # Expand placeholder image tokens to the full image token sequence
-            text = [
-                prompt.replace(self.boi_token, self.full_image_sequence)
-                for prompt in text
-            ]
-
-        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
-        text_inputs = self.tokenizer(
-            text=text, **output_kwargs["text_kwargs"], return_tensors="np"
-        )
-
-        # print(f"processing, text_inputs:{text_inputs}")
-
-        # Add token type ids manually, as tokenizer can't do arbitrary position token types
-        array_ids = np.array(text_inputs["input_ids"])
-        mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
-        mm_token_type_ids[array_ids == self.image_token_id] = 1
-        text_inputs = {
-            k: v.tolist() for k, v in text_inputs.items()
-        }  # in case user requested list inputs
-        text_inputs["token_type_ids"] = mm_token_type_ids.tolist()
-        return BatchFeature(
-            data={**text_inputs, **image_inputs}, tensor_type=return_tensors
-        )
-
-    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Gemma
-    def batch_decode(self, *args, **kwargs):
-        """
-        This method forwards all its arguments to GemmaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
-        refer to the docstring of this method for more information.
-        """
-        return self.tokenizer.batch_decode(*args, **kwargs)
-
-    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Gemma
-    def decode(self, *args, **kwargs):
-        """
-        This method forwards all its arguments to GemmaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
-        the docstring of this method for more information.
-        """
-        return self.tokenizer.decode(*args, **kwargs)
-
-    @property
-    def model_input_names(self):
-        tokenizer_input_names = self.tokenizer.model_input_names + ["token_type_ids"]
-        image_processor_input_names = self.image_processor.model_input_names
-        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
-
-
-class Gemma3ImageProcessor(BaseImageProcessor):
-    r"""
-    Constructs a SigLIP image processor.
-
-    Args:
-        do_resize (`bool`, *optional*, defaults to `True`):
-            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
-            `do_resize` in the `preprocess` method.
-        size (`Dict[str, int]` *optional*, defaults to `{"height": 224, "width": 224}`):
-            Size of the image after resizing. Can be overridden by `size` in the `preprocess` method.
-        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
-            Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
-        do_rescale (`bool`, *optional*, defaults to `True`):
-            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
-            the `preprocess` method.
-        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
-            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
-            method.
-        do_normalize (`bool`, *optional*, defaults to `True`):
-            Whether to normalize the image by the specified mean and standard deviation. Can be overridden by
-            `do_normalize` in the `preprocess` method.
-        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
-            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
-            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
-        image_std (`float` or `List[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
-            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
-            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
-            Can be overridden by the `image_std` parameter in the `preprocess` method.
-        do_convert_rgb (`bool`, *optional*, defaults to `True`):
-            Whether to convert the image to RGB.
-        do_pan_and_scan (`bool`, *optional*):
-            Whether to apply `pan_and_scan` to images.
-        pan_and_scan_min_crop_size (`int`, *optional*):
-            Minimum size of each crop in pan and scan.
-        pan_and_scan_max_num_crops (`int`, *optional*):
-            Maximum number of crops per image in pan and scan.
-        pan_and_scan_min_ratio_to_activate (`float`, *optional*):
-            Minimum aspect ratio to activate pan and scan.
-    """
-
-    model_input_names = ["pixel_values", "num_crops"]
-
-    def __init__(
-        self,
-        do_resize: bool = True,
-        size: Dict[str, int] = None,
-        resample: PILImageResampling = PILImageResampling.BILINEAR,
-        do_rescale: bool = True,
-        rescale_factor: Union[int, float] = 1 / 255,
-        do_normalize: bool = True,
-        image_mean: Optional[Union[float, List[float]]] = None,
-        image_std: Optional[Union[float, List[float]]] = None,
-        do_convert_rgb: bool = None,
-        do_pan_and_scan: bool = None,
-        pan_and_scan_min_crop_size: int = None,
-        pan_and_scan_max_num_crops: int = None,
-        pan_and_scan_min_ratio_to_activate: float = None,
-        **kwargs,
-    ) -> None:
-        super().__init__(**kwargs)
-        size = size if size is not None else {"height": 224, "width": 224}
-        size = get_size_dict(size, default_to_square=True)
-        image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
-        image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
-
-        self.do_resize = do_resize
-        self.size = size
-        self.resample = resample
-        self.do_rescale = do_rescale
-        self.rescale_factor = rescale_factor
-        self.do_normalize = do_normalize
-        self.image_mean = image_mean
-        self.image_std = image_std
-        self.do_convert_rgb = do_convert_rgb
-        self.do_pan_and_scan = do_pan_and_scan
-        self.pan_and_scan_min_crop_size = pan_and_scan_min_crop_size
-        self.pan_and_scan_max_num_crops = pan_and_scan_max_num_crops
-        self.pan_and_scan_min_ratio_to_activate = pan_and_scan_min_ratio_to_activate
-
-    def pan_and_scan(
-        self,
-        image: np.ndarray,
-        pan_and_scan_min_crop_size: int,
-        pan_and_scan_max_num_crops: int,
-        pan_and_scan_min_ratio_to_activate: float,
-        data_format: Optional[Union[str, ChannelDimension]] = None,
-        input_data_format: Optional[Union[str, ChannelDimension]] = None,
-    ):
-        """
-        Pan and Scan and image, by cropping into smaller images when the aspect ratio exceeds
-        minumum allowed ratio.
-
-        Args:
-            image (`np.ndarray`):
-                Image to resize.
-            pan_and_scan_min_crop_size (`int`, *optional*):
-                Minimum size of each crop in pan and scan.
-            pan_and_scan_max_num_crops (`int`, *optional*):
-                Maximum number of crops per image in pan and scan.
-            pan_and_scan_min_ratio_to_activate (`float`, *optional*):
-                Minimum aspect ratio to activate pan and scan.
-            data_format (`str` or `ChannelDimension`, *optional*):
-                The channel dimension format of the image. If not provided, it will be the same as the input image.
-            input_data_format (`ChannelDimension` or `str`, *optional*):
-                The channel dimension format of the input image. If not provided, it will be inferred.
-        """
-        height, width = get_image_size(image)
-
-        # Square or landscape image.
-        if width >= height:
-            # Only apply PaS if the image is sufficiently exaggerated
-            if width / height < pan_and_scan_min_ratio_to_activate:
-                return []
-
-            # Select ideal number of crops close to the image aspect ratio and such that crop_size > min_crop_size.
-            num_crops_w = int(
-                math.floor(width / height + 0.5)
-            )  # Half round up rounding.
-            num_crops_w = min(
-                int(math.floor(width / pan_and_scan_min_crop_size)), num_crops_w
-            )
-
-            # Make sure the number of crops is in range [2, pan_and_scan_max_num_crops].
-            num_crops_w = max(2, num_crops_w)
-            num_crops_w = min(pan_and_scan_max_num_crops, num_crops_w)
-            num_crops_h = 1
-
-        # Portrait image.
-        else:
-            # Only apply PaS if the image is sufficiently exaggerated
-            if height / width < pan_and_scan_min_ratio_to_activate:
-                return []
-
-            # Select ideal number of crops close to the image aspect ratio and such that crop_size > min_crop_size.
-            num_crops_h = int(math.floor(height / width + 0.5))
-            num_crops_h = min(
-                int(math.floor(height / pan_and_scan_min_crop_size)), num_crops_h
-            )
-
-            # Make sure the number of crops is in range [2, pan_and_scan_max_num_crops].
-            num_crops_h = max(2, num_crops_h)
-            num_crops_h = min(pan_and_scan_max_num_crops, num_crops_h)
-            num_crops_w = 1
-
-        crop_size_w = int(math.ceil(width / num_crops_w))
-        crop_size_h = int(math.ceil(height / num_crops_h))
-
-        # Don't apply PaS if crop size is too small.
-        if min(crop_size_w, crop_size_h) < pan_and_scan_min_crop_size:
-            return []
-
-        crop_positions_w = [crop_size_w * i for i in range(num_crops_w)]
-        crop_positions_h = [crop_size_h * i for i in range(num_crops_h)]
-
-        if input_data_format == ChannelDimension.LAST:
-            image_crops = [
-                image[pos_h : pos_h + crop_size_h, pos_w : pos_w + crop_size_w]
-                for pos_h, pos_w in itertools.product(
-                    crop_positions_h, crop_positions_w
-                )
-            ]
-        else:
-            image_crops = [
-                image[:, pos_h : pos_h + crop_size_h, pos_w : pos_w + crop_size_w]
-                for pos_h, pos_w in itertools.product(
-                    crop_positions_h, crop_positions_w
-                )
-            ]
-
-        return image_crops
-
-    def _process_images_for_pan_and_scan(
-        self,
-        images: List[np.ndarray],
-        do_pan_and_scan: bool,
-        pan_and_scan_min_crop_size: int,
-        pan_and_scan_max_num_crops: int,
-        pan_and_scan_min_ratio_to_activate: float,
-        data_format: Optional[Union[str, ChannelDimension]] = None,
-        input_data_format: Optional[Union[str, ChannelDimension]] = None,
-    ):
-        pas_images_list = []
-        num_crops = []
-        for image in images:
-            pas_images = self.pan_and_scan(
-                image=image,
-                pan_and_scan_min_crop_size=pan_and_scan_min_crop_size,
-                pan_and_scan_max_num_crops=pan_and_scan_max_num_crops,
-                pan_and_scan_min_ratio_to_activate=pan_and_scan_min_ratio_to_activate,
-                data_format=data_format,
-                input_data_format=input_data_format,
-            )
-            pas_images_list.extend([image] + pas_images)
-            num_crops.append(len(pas_images))
-        return pas_images_list, num_crops
-
-    @filter_out_non_signature_kwargs()
-    def preprocess(
-        self,
-        images: ImageInput,
-        do_resize: bool = None,
-        size: Dict[str, int] = None,
-        resample: PILImageResampling = None,
-        do_rescale: bool = None,
-        rescale_factor: float = None,
-        do_normalize: bool = None,
-        image_mean: Optional[Union[float, List[float]]] = None,
-        image_std: Optional[Union[float, List[float]]] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
-        input_data_format: Optional[Union[str, ChannelDimension]] = None,
-        do_convert_rgb: bool = None,
-        do_pan_and_scan: bool = None,
-        pan_and_scan_min_crop_size: int = None,
-        pan_and_scan_max_num_crops: int = None,
-        pan_and_scan_min_ratio_to_activate: float = None,
-    ) -> PIL.Image.Image:
-        """
-        Preprocess an image or batch of images.
-
-        Args:
-            images (`ImageInput`):
-                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
-                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
-            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
-                Whether to resize the image.
-            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
-                Size of the image after resizing.
-            resample (`int`, *optional*, defaults to `self.resample`):
-                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
-                has an effect if `do_resize` is set to `True`.
-            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
-                Whether to rescale the image.
-            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
-                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
-            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
-                Whether to normalize the image.
-            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
-                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
-            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
-                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
-                `True`.
-            return_tensors (`str` or `TensorType`, *optional*):
-                The type of tensors to return. Can be one of:
-                - Unset: Return a list of `np.ndarray`.
-                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
-                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
-                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
-                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
-            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
-                The channel dimension format for the output image. Can be one of:
-                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
-                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
-                - Unset: Use the channel dimension format of the input image.
-            input_data_format (`ChannelDimension` or `str`, *optional*):
-                The channel dimension format for the input image. If unset, the channel dimension format is inferred
-                from the input image. Can be one of:
-                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
-                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
-                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
-            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
-                Whether to convert the image to RGB.
-            do_pan_and_scan (`bool`, *optional*, defaults to `self.do_convert_rgb`):
-                Whether to apply `pan_and_scan` to images.
-            pan_and_scan_min_crop_size (`int`, *optional*, defaults to `self.pan_and_scan_min_crop_size`):
-                Minimum size of each crop in pan and scan.
-            pan_and_scan_max_num_crops (`int`, *optional*, defaults to `self.pan_and_scan_max_num_crops`):
-                Maximum number of crops per image in pan and scan.
-            pan_and_scan_min_ratio_to_activate (`float`, *optional*, defaults to `self.pan_and_scan_min_ratio_to_activate`):
-                Minimum aspect ratio to activate pan and scan.
-        """
-        do_resize = do_resize if do_resize is not None else self.do_resize
-        size = size if size is not None else self.size
-        size = get_size_dict(size, param_name="size", default_to_square=False)
-        resample = resample if resample is not None else self.resample
-        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
-        rescale_factor = (
-            rescale_factor if rescale_factor is not None else self.rescale_factor
-        )
-        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
-        image_mean = image_mean if image_mean is not None else self.image_mean
-        image_std = image_std if image_std is not None else self.image_std
-        do_convert_rgb = (
-            do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
-        )
-        do_pan_and_scan = (
-            do_pan_and_scan if do_pan_and_scan is not None else self.do_pan_and_scan
-        )
-        pan_and_scan_min_crop_size = (
-            pan_and_scan_min_crop_size
-            if pan_and_scan_min_crop_size is not None
-            else self.pan_and_scan_min_crop_size
-        )
-        pan_and_scan_max_num_crops = (
-            pan_and_scan_max_num_crops
-            if pan_and_scan_max_num_crops is not None
-            else self.pan_and_scan_max_num_crops
-        )
-        pan_and_scan_min_ratio_to_activate = (
-            pan_and_scan_min_ratio_to_activate
-            if pan_and_scan_min_ratio_to_activate is not None
-            else self.pan_and_scan_min_ratio_to_activate
-        )
-
-        images_list = make_nested_list_of_images(images)
-
-        if not valid_images(images_list[0]):
-            raise ValueError(
-                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
-                "torch.Tensor, tf.Tensor or jax.ndarray."
-            )
-
-        validate_preprocess_arguments(
-            do_rescale=do_rescale,
-            rescale_factor=rescale_factor,
-            do_normalize=do_normalize,
-            image_mean=image_mean,
-            image_std=image_std,
-            do_resize=do_resize,
-            size=size,
-            resample=resample,
-        )
-        if do_convert_rgb:
-            images_list = [
-                [convert_to_rgb(image) for image in images] for images in images_list
-            ]
-
-        # All transformations expect numpy arrays.
-        images_list = [
-            [to_numpy_array(image) for image in images] for images in images_list
-        ]
-
-        if do_rescale and is_scaled_image(images_list[0][0]):
-            logger.warning(
-                "It looks like you are trying to rescale already rescaled images. If the input"
-                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
-            )
-
-        if input_data_format is None:
-            # We assume that all images have the same channel dimension format.
-            input_data_format = infer_channel_dimension_format(images_list[0][0])
-
-        if do_pan_and_scan:
-            images_list_and_num_crops = [
-                self._process_images_for_pan_and_scan(
-                    images=images,
-                    do_pan_and_scan=do_pan_and_scan,
-                    pan_and_scan_min_crop_size=pan_and_scan_min_crop_size,
-                    pan_and_scan_max_num_crops=pan_and_scan_max_num_crops,
-                    pan_and_scan_min_ratio_to_activate=pan_and_scan_min_ratio_to_activate,
-                    data_format=data_format,
-                    input_data_format=input_data_format,
-                )
-                for images in images_list
-            ]
-            images_list = [images for images, _ in images_list_and_num_crops]
-            num_crops = [num_crops for _, num_crops in images_list_and_num_crops]
-        else:
-            num_crops = [[0] for images in images_list]
-
-        processed_images = []
-        for images in images_list:
-            for image in images:
-                if do_resize:
-                    height, width = size["height"], size["width"]
-                    image = resize(
-                        image=image,
-                        size=(height, width),
-                        resample=resample,
-                        input_data_format=input_data_format,
-                    )
-
-                if do_rescale:
-                    image = rescale(
-                        image=image,
-                        scale=rescale_factor,
-                        input_data_format=input_data_format,
-                    )
-
-                if do_normalize:
-                    image = normalize(
-                        image=image,
-                        mean=image_mean,
-                        std=image_std,
-                        input_data_format=input_data_format,
-                    )
-
-                image = to_channel_dimension_format(
-                    image, data_format, input_channel_dim=input_data_format
-                )
-                processed_images.append(image)
-
-        data = {"pixel_values": processed_images, "num_crops": num_crops}
-        return BatchFeature(data=data, tensor_type=return_tensors)
-
-
-class Gemma3TextConfig(PretrainedConfig):
-    r"""
-    This is the configuration class to store the configuration of a [`Gemma3TextModel`]. It is used to instantiate an Gemma3Text
-    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
-    defaults will yield a similar configuration to that of the Gemma3Text-7B.
-    e.g. [google/gemma3_text-7b](https://huggingface.co/google/gemma3_text-7b)
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-    Args:
-        vocab_size (`int`, *optional*, defaults to 262208):
-            Vocabulary size of the Gemma3Text model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`Gemma3TextModel`]
-        hidden_size (`int`, *optional*, defaults to 2304):
-            Dimension of the hidden representations.
-        intermediate_size (`int`, *optional*, defaults to 9216):
-            Dimension of the MLP representations.
-        num_hidden_layers (`int`, *optional*, defaults to 26):
-            Number of hidden layers in the Transformer decoder.
-        num_attention_heads (`int`, *optional*, defaults to 8):
-            Number of attention heads for each attention layer in the Transformer decoder.
-        num_key_value_heads (`int`, *optional*, defaults to 4):
-            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
-            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
-            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
-            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-            by meanpooling all the original heads within that group. For more details checkout [this
-            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
-            `num_attention_heads`.
-        head_dim (`int`, *optional*, defaults to 256):
-            The attention head dimension.
-        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
-            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
-            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
-        max_position_embeddings (`int`, *optional*, defaults to 131072):
-            The maximum sequence length that this model might ever be used with.
-        initializer_range (`float`, *optional*, defaults to 0.02):
-            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
-            The epsilon used by the rms normalization layers.
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether or not the model should return the last key/values attentions (not used by all models). Only
-            relevant if `config.is_decoder=True`.
-        pad_token_id (`int`, *optional*, defaults to 0):
-            Padding token id.
-        eos_token_id (`int`, *optional*, defaults to 1):
-            End of stream token id.
-        bos_token_id (`int`, *optional*, defaults to 2):
-            Beginning of stream token id.
-        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
-            Whether to tie weight embeddings
-        rope_theta (`float`, *optional*, defaults to 1000000.0):
-            The base period of the RoPE embeddings.
-        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
-            Whether to use a bias in the query, key, value and output projection layers during self-attention.
-        attention_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for the attention probabilities.
-        query_pre_attn_scalar (`float`, *optional*, defaults to 256):
-            Scaling factor used on the attention scores
-        sliding_window (`int`, *optional*, defaults to 4096): in Gemma3Text, every other layer uses sliding window attention. This is the
-            size of the sliding window.
-        final_logit_softcapping (`float`, *optional*):
-            Scaling factor when applying tanh softcapping on the logits.
-        attn_logit_softcapping (`float`, *optional*):
-            Scaling factor when applying tanh softcapping on the attention scores.
-        cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.
-        rope_scaling (`Dict`, *optional*):
-            Dictionary containing the scaling configuration for the RoPE embeddings used in gloabl attention. NOTE: if you apply new rope type
-            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
-            accordingly.
-            Expected contents:
-                `rope_type` (`str`):
-                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
-                    'llama3'], with 'default' being the original RoPE implementation.
-                `factor` (`float`, *optional*):
-                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
-                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
-                    original maximum pre-trained length.
-                `original_max_position_embeddings` (`int`, *optional*):
-                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
-                    pretraining.
-                `attention_factor` (`float`, *optional*):
-                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
-                    computation. If unspecified, it defaults to value recommended by the implementation, using the
-                    `factor` field to infer the suggested value.
-                `beta_fast` (`float`, *optional*):
-                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
-                    ramp function. If unspecified, it defaults to 32.
-                `beta_slow` (`float`, *optional*):
-                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
-                    ramp function. If unspecified, it defaults to 1.
-                `short_factor` (`List[float]`, *optional*):
-                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
-                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
-                    size divided by the number of attention heads divided by 2
-                `long_factor` (`List[float]`, *optional*):
-                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
-                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
-                    size divided by the number of attention heads divided by 2
-                `low_freq_factor` (`float`, *optional*):
-                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
-                `high_freq_factor` (`float`, *optional*):
-                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
-        rope_local_base_freq (float, *optional*, defaults to 10000.0):
-            The base period of the RoPE embeddings for local attention.
-        sliding_window_pattern (`int`, *optional*, defaults to 6):
-            Pattern for the sliding window attention.
-
-    ```python
-    >>> from transformers import Gemma3TextModel, Gemma3TextConfig
-    >>> # Initializing a Gemma3Text gemma3_text-7b style configuration
-    >>> configuration = Gemma3TextConfig()
-    >>> # Initializing a model from the gemma3_text-7b style configuration
-    >>> model = Gemma3TextModel(configuration)
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
-    ```
-        rope_local_base_freq (float, *optional*, defaults to 10000.0):
-            The base period of the RoPE embeddings for local attention.
-        sliding_window_pattern (`int`, *optional*, defaults to 6):
-            Pattern for the sliding window attention.
-    """
-
-    model_type = "gemma3_text"
-    keys_to_ignore_at_inference = ["past_key_values"]
-    base_model_tp_plan = {
-        "layers.*.self_attn.q_proj": "colwise",
-        "layers.*.self_attn.k_proj": "colwise",
-        "layers.*.self_attn.v_proj": "colwise",
-        "layers.*.self_attn.o_proj": "rowwise",
-        "layers.*.mlp.gate_proj": "colwise",
-        "layers.*.mlp.up_proj": "colwise",
-        "layers.*.mlp.down_proj": "rowwise",
-    }
-    base_model_pp_plan = {
-        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
-        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
-        "norm": (["hidden_states"], ["hidden_states"]),
-    }
-
-    def __init__(
-        self,
-        vocab_size=262_208,
-        hidden_size=2304,
-        intermediate_size=9216,
-        num_hidden_layers=26,
-        num_attention_heads=8,
-        num_key_value_heads=4,
-        head_dim=256,
-        hidden_activation="gelu_pytorch_tanh",
-        max_position_embeddings=131_072,
-        initializer_range=0.02,
-        rms_norm_eps=1e-6,
-        use_cache=True,
-        pad_token_id=0,
-        eos_token_id=1,
-        bos_token_id=2,
-        tie_word_embeddings=True,
-        rope_theta=1_000_000.0,
-        attention_bias=False,
-        attention_dropout=0.0,
-        query_pre_attn_scalar=256,
-        sliding_window=4096,
-        final_logit_softcapping=None,
-        attn_logit_softcapping=None,
-        cache_implementation="hybrid",
-        rope_scaling=None,
-        rope_local_base_freq=10_000.0,
-        sliding_window_pattern=6,
-        **kwargs,
-    ):
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
-
-        self.vocab_size = vocab_size
-        self.max_position_embeddings = max_position_embeddings
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.head_dim = head_dim
-        self.num_key_value_heads = num_key_value_heads
-        self.initializer_range = initializer_range
-        self.rms_norm_eps = rms_norm_eps
-        self.use_cache = use_cache
-        self.rope_theta = rope_theta
-        self.attention_bias = attention_bias
-        self.attention_dropout = attention_dropout
-        self.hidden_activation = hidden_activation
-        self.query_pre_attn_scalar = query_pre_attn_scalar
-        self.sliding_window = sliding_window
-        self.final_logit_softcapping = final_logit_softcapping
-        self.attn_logit_softcapping = attn_logit_softcapping
-        self.cache_implementation = cache_implementation
-
-        self.rope_local_base_freq = rope_local_base_freq
-        # For configuring HybridCache to work with 5:1 attention pattern
-        self.sliding_window_pattern = sliding_window_pattern
-        self.rope_scaling = rope_scaling
-        rope_config_validation(self)
-
-
-class Gemma3Config(PretrainedConfig):
-    r"""
-    This is the configuration class to store the configuration of a [`Gemma3ForConditionalGeneration`]. It is used to instantiate an
-    Gemma3ForConditionalGeneration according to the specified arguments, defining the model architecture. Instantiating a configuration
-    with the defaults will yield a similar configuration to that of the PaliGemma-2B.
-
-    e.g. [google/gemma-3-4b](https://huggingface.co/google/gemma-3-4b)
-
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-
-    Args:
-        text_config (`Union[Gemma3TextConfig, dict]`, *optional*):
-            The config object of the text backbone.
-        vision_config (`Union[AutoConfig, dict]`,  *optional*):
-            Custom vision config or dict.
-        mm_tokens_per_image (`int`, *optional*, defaults to 256):
-            The number of tokens per image embedding.
-        boi_token_index (`int`, *optional*, defaults to 255999):
-            The begin-of-image token index to wrap the image prompt.
-        eoi_token_index (`int`, *optional*, defaults to 256000):
-            The end-of-image token index to wrap the image prompt.
-        image_token_index (`int`, *optional*, defaults to 262144):
-            The image token index to encode the image prompt.
-        initializer_range (`float`, *optional*, defaults to 0.02):
-            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-
-
-    Example:
-
-    ```python
-    >>> from transformers import Gemma3ForConditionalGeneration, Gemma3Config, SiglipVisionConfig, Gemma3TextConfig
-
-    >>> # Initializing a Siglip-like vision config
-    >>> vision_config = SiglipVisionConfig()
-
-    >>> # Initializing a Gemma3 Text config
-    >>> text_config = Gemma3TextConfig()
-
-    >>> # Initializing a Gemma3 gemma-3-4b style configuration
-    >>> configuration = Gemma3Config(vision_config, text_config)
-
-    >>> # Initializing a model from the gemma-3-4b style configuration
-    >>> model = Gemma3TextConfig(configuration)
-
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
-    ```"""
-
-    model_type = "gemma3"
-    sub_configs = {
-        "text_config": Gemma3TextConfig,
-        "vision_config": SiglipVisionConfig,
-    }
-
-    def __init__(
-        self,
-        text_config: Optional[Gemma3TextConfig] = None,
-        vision_config: Optional[SiglipVisionConfig] = None,
-        mm_tokens_per_image: int = 256,
-        boi_token_index: int = 255_999,
-        eoi_token_index: int = 256_000,
-        image_token_index: int = 262_144,
-        initializer_range: float = 0.02,
-        **kwargs,
-    ):
-        if text_config is None:
-            text_config = Gemma3TextConfig()
-            # logger.info(
-            #     "text_config is None, using default Gemma3TextConfig config."
-            # )
-        elif isinstance(text_config, dict):
-            text_config = Gemma3TextConfig(**text_config)
-
-        if isinstance(vision_config, dict):
-            vision_config = SiglipVisionConfig(**vision_config)
-        elif isinstance(vision_config, SiglipVisionConfig):
-            pass
-        else:
-            # logger.info(
-            #     "vision_config is None or incompatible with Gemma3VisionConfig initialization. Gemma3 will be limited "
-            #     "to text tasks."
-            # )
-            # logger.info(f"vision_config: {vision_config}")
-            vision_config = SiglipVisionConfig()
-
-        self.text_config = text_config
-        self.vision_config = vision_config
-        self.mm_tokens_per_image = mm_tokens_per_image
-        self.boi_token_index = boi_token_index
-        self.eoi_token_index = eoi_token_index
-        self.image_token_index = image_token_index
-        self.initializer_range = initializer_range
-
-        super().__init__(**kwargs)
-
-
-AutoProcessor.register(
-    config_class=Gemma3Config, processor_class=Gemma3Processor, exist_ok=True
-)
-
-AutoImageProcessor.register(
-    config_class=Gemma3Config,
-    image_processor_class=None,
-    slow_image_processor_class=Gemma3ImageProcessor,
-    fast_image_processor_class=None,
-    exist_ok=True,
-)
diff --git a/python/sglang/srt/configs/qwen2_5_vl_config.py b/python/sglang/srt/configs/qwen2_5_vl_config.py
deleted file mode 100644
index b04edf26d..000000000
--- a/python/sglang/srt/configs/qwen2_5_vl_config.py
+++ /dev/null
@@ -1,1006 +0,0 @@
-# coding=utf-8
-# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team.
-# All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Qwen2VL model configuration"""
-from typing import Dict, Iterable, List, Optional, Union
-
-import numpy as np
-from transformers import (
-    AutoImageProcessor,
-    AutoProcessor,
-    BaseImageProcessor,
-    BatchFeature,
-    PretrainedConfig,
-    ProcessorMixin,
-    TensorType,
-)
-from transformers.image_transforms import (
-    convert_to_rgb,
-    normalize,
-    rescale,
-    resize,
-    to_channel_dimension_format,
-)
-from transformers.image_utils import (
-    ChannelDimension,
-    ImageInput,
-    PILImageResampling,
-    VideoInput,
-    get_image_size,
-    infer_channel_dimension_format,
-    is_pil_image,
-    is_valid_image,
-    make_list_of_images,
-    to_numpy_array,
-    valid_images,
-    validate_preprocess_arguments,
-)
-from transformers.modeling_rope_utils import rope_config_validation
-from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
-from transformers.processing_utils import ProcessingKwargs, Unpack, VideosKwargs
-from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
-from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
-
-
-def is_valid_list_of_images(images: List):
-    return images and all(is_valid_image(image) for image in images)
-
-
-class Qwen2_5_VLVisionConfig(PretrainedConfig):
-    model_type = "qwen2_5_vl"
-    base_config_key = "vision_config"
-
-    def __init__(
-        self,
-        depth=32,
-        hidden_size=3584,
-        hidden_act="silu",
-        intermediate_size=3420,
-        num_heads=16,
-        in_channels=3,
-        patch_size=14,
-        spatial_merge_size=2,
-        temporal_patch_size=2,
-        tokens_per_second=4,
-        window_size=112,
-        out_hidden_size=3584,
-        fullatt_block_indexes=[7, 15, 23, 31],
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-
-        self.depth = depth
-        self.hidden_size = hidden_size
-        self.hidden_act = hidden_act
-        self.intermediate_size = intermediate_size
-        self.num_heads = num_heads
-        self.in_channels = in_channels
-        self.patch_size = patch_size
-        self.spatial_merge_size = spatial_merge_size
-        self.temporal_patch_size = temporal_patch_size
-        self.tokens_per_second = tokens_per_second
-        self.window_size = window_size
-        self.fullatt_block_indexes = fullatt_block_indexes
-        self.out_hidden_size = out_hidden_size
-
-
-class Qwen2_5_VLConfig(PretrainedConfig):
-    r"""
-    This is the configuration class to store the configuration of a [`Qwen2_5_VLModel`]. It is used to instantiate a
-    Qwen2-VL model according to the specified arguments, defining the model architecture. Instantiating a configuration
-    with the defaults will yield a similar configuration to that of
-    Qwen2-VL-7B-Instruct [Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct).
-
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-
-
-    Args:
-        vocab_size (`int`, *optional*, defaults to 152064):
-            Vocabulary size of the Qwen2_5_VL model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`Qwen2_5_VLModel`]
-        hidden_size (`int`, *optional*, defaults to 8192):
-            Dimension of the hidden representations.
-        intermediate_size (`int`, *optional*, defaults to 29568):
-            Dimension of the MLP representations.
-        num_hidden_layers (`int`, *optional*, defaults to 80):
-            Number of hidden layers in the Transformer encoder.
-        num_attention_heads (`int`, *optional*, defaults to 64):
-            Number of attention heads for each attention layer in the Transformer encoder.
-        num_key_value_heads (`int`, *optional*, defaults to 8):
-            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
-            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
-            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
-            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-            by meanpooling all the original heads within that group. For more details checkout [this
-            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
-        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
-            The non-linear activation function (function or string) in the decoder.
-        max_position_embeddings (`int`, *optional*, defaults to 32768):
-            The maximum sequence length that this model might ever be used with.
-        initializer_range (`float`, *optional*, defaults to 0.02):
-            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
-            The epsilon used by the rms normalization layers.
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether or not the model should return the last key/values attentions (not used by all models). Only
-            relevant if `config.is_decoder=True`.
-        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
-            Whether the model's input and output word embeddings should be tied.
-        rope_theta (`float`, *optional*, defaults to 1000000.0):
-            The base period of the RoPE embeddings.
-        use_sliding_window (`bool`, *optional*, defaults to `False`):
-            Whether to use sliding window attention.
-        sliding_window (`int`, *optional*, defaults to 4096):
-            Sliding window attention (SWA) window size. If not specified, will default to `4096`.
-        max_window_layers (`int`, *optional*, defaults to 80):
-            The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
-        attention_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for the attention probabilities.
-        vision_config (`Dict`, *optional*):
-            The config for the visual encoder initialization.
-        rope_scaling (`Dict`, *optional*):
-            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
-            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
-            accordingly.
-            Expected contents:
-                `rope_type` (`str`):
-                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
-                    'llama3'], with 'default' being the original RoPE implementation.
-                `factor` (`float`, *optional*):
-                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
-                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
-                    original maximum pre-trained length.
-                `original_max_position_embeddings` (`int`, *optional*):
-                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
-                    pretraining.
-                `attention_factor` (`float`, *optional*):
-                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
-                    computation. If unspecified, it defaults to value recommended by the implementation, using the
-                    `factor` field to infer the suggested value.
-                `beta_fast` (`float`, *optional*):
-                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
-                    ramp function. If unspecified, it defaults to 32.
-                `beta_slow` (`float`, *optional*):
-                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
-                    ramp function. If unspecified, it defaults to 1.
-                `short_factor` (`List[float]`, *optional*):
-                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
-                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
-                    size divided by the number of attention heads divided by 2
-                `long_factor` (`List[float]`, *optional*):
-                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
-                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
-                    size divided by the number of attention heads divided by 2
-                `low_freq_factor` (`float`, *optional*):
-                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
-                `high_freq_factor` (`float`, *optional*):
-                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
-
-    ```python
-    >>> from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLConfig
-
-    >>> # Initializing a Qwen2_5_VL style configuration
-    >>> configuration = Qwen2_5_VLConfig()
-
-    >>> # Initializing a model from the Qwen2-VL-7B style configuration
-    >>> model = Qwen2_5_VLForConditionalGeneration(configuration)
-
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
-    ```"""
-
-    model_type = "qwen2_5_vl"
-    sub_configs = {"vision_config": Qwen2_5_VLVisionConfig}
-    keys_to_ignore_at_inference = ["past_key_values"]
-    # Default tensor parallel plan for base model `Qwen2_5_VL`
-    base_model_tp_plan = {
-        "layers.*.self_attn.q_proj": "colwise",
-        "layers.*.self_attn.k_proj": "colwise",
-        "layers.*.self_attn.v_proj": "colwise",
-        "layers.*.self_attn.o_proj": "rowwise",
-        "layers.*.mlp.gate_proj": "colwise",
-        "layers.*.mlp.up_proj": "colwise",
-        "layers.*.mlp.down_proj": "rowwise",
-    }
-
-    def __init__(
-        self,
-        vocab_size=152064,
-        hidden_size=8192,
-        intermediate_size=29568,
-        num_hidden_layers=80,
-        num_attention_heads=64,
-        num_key_value_heads=8,
-        hidden_act="silu",
-        max_position_embeddings=32768,
-        initializer_range=0.02,
-        rms_norm_eps=1e-05,
-        use_cache=True,
-        tie_word_embeddings=False,
-        rope_theta=1000000.0,
-        use_sliding_window=False,
-        sliding_window=4096,
-        max_window_layers=80,
-        attention_dropout=0.0,
-        vision_config=None,
-        rope_scaling=None,
-        **kwargs,
-    ):
-        if isinstance(vision_config, dict):
-            self.vision_config = self.sub_configs["vision_config"](**vision_config)
-        elif vision_config is None:
-            self.vision_config = self.sub_configs["vision_config"]()
-
-        self.vocab_size = vocab_size
-        self.max_position_embeddings = max_position_embeddings
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.use_sliding_window = use_sliding_window
-        self.sliding_window = sliding_window
-        self.max_window_layers = max_window_layers
-
-        # for backward compatibility
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-
-        self.num_key_value_heads = num_key_value_heads
-        self.hidden_act = hidden_act
-        self.initializer_range = initializer_range
-        self.rms_norm_eps = rms_norm_eps
-        self.use_cache = use_cache
-        self.rope_theta = rope_theta
-        self.attention_dropout = attention_dropout
-        self.rope_scaling = rope_scaling
-
-        # Validate the correctness of rotary position embeddings parameters
-        # BC: if there is a 'type' field, move it to 'rope_type'.
-        # and change type from 'mrope' to 'default' because `mrope` does defeault RoPE calculations
-        # one can set it to "linear"/"dynamic" etc. to have scaled RoPE
-        # TODO: @raushan update config in the hub
-        if self.rope_scaling is not None and "type" in self.rope_scaling:
-            if self.rope_scaling["type"] == "mrope":
-                self.rope_scaling["type"] = "default"
-            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
-        rope_config_validation(self, ignore_keys={"mrope_section"})
-
-        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
-
-
-# FIXME: workaround of obsolete transformers version
-
-
-class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False):
-    fps: Union[List[float], float]
-
-
-class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False):
-    videos_kwargs: Qwen2_5_VLVideosProcessorKwargs
-    _defaults = {
-        "text_kwargs": {
-            "padding": False,
-        },
-        "videos_kwargs": {"fps": 2.0},
-    }
-
-
-class Qwen2_5_VLProcessor(ProcessorMixin):
-    r"""
-    Constructs a Qwen2.5-VL processor which wraps a Qwen2.5-VL image processor and a Qwen2 tokenizer into a single processor.
-    [`Qwen2_5_VLProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the
-    [`~Qwen2_5_VLProcessor.__call__`] and [`~Qwen2_5_VLProcessor.decode`] for more information.
-    Args:
-        image_processor ([`Qwen2VLImageProcessor`], *optional*):
-            The image processor is a required input.
-        tokenizer ([`Qwen2TokenizerFast`], *optional*):
-            The tokenizer is a required input.
-        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
-            in a chat into a tokenizable string.
-    """
-
-    attributes = ["image_processor", "tokenizer"]
-    valid_kwargs = ["chat_template"]
-
-    image_processor_class = "AutoImageProcessor"
-    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
-
-    def __init__(
-        self, image_processor=None, tokenizer=None, chat_template=None, **kwargs
-    ):
-        self.image_token = (
-            "<|image_pad|>"
-            if not hasattr(tokenizer, "image_token")
-            else tokenizer.image_token
-        )
-        self.video_token = (
-            "<|video_pad|>"
-            if not hasattr(tokenizer, "video_token")
-            else tokenizer.video_token
-        )
-        super().__init__(image_processor, tokenizer, chat_template=chat_template)
-
-    def __call__(
-        self,
-        images: ImageInput = None,
-        text: Union[
-            TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
-        ] = None,
-        videos: VideoInput = None,
-        **kwargs: Unpack[Qwen2_5_VLProcessorKwargs],
-    ) -> BatchFeature:
-        """
-        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
-        and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
-        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
-        Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
-
-        Args:
-            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
-                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
-                tensor. Both channels-first and channels-last formats are supported.
-            text (`str`, `List[str]`, `List[List[str]]`):
-                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
-                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
-                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
-            videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
-                The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
-                tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
-            return_tensors (`str` or [`~utils.TensorType`], *optional*):
-                If set, will return tensors of a particular framework. Acceptable values are:
-                - `'tf'`: Return TensorFlow `tf.constant` objects.
-                - `'pt'`: Return PyTorch `torch.Tensor` objects.
-                - `'np'`: Return NumPy `np.ndarray` objects.
-                - `'jax'`: Return JAX `jnp.ndarray` objects.
-
-        Returns:
-            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
-
-            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
-            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
-              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
-              `None`).
-            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
-            - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
-            - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
-            - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
-            - **second_per_grid_ts** -- List of video seconds per time grid. Returned when `videos` is not `None`.
-        """
-        output_kwargs = self._merge_kwargs(
-            Qwen2_5_VLProcessorKwargs,
-            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
-            **kwargs,
-        )
-        if images is not None:
-            image_inputs = self.image_processor(
-                images=images, videos=None, **output_kwargs["images_kwargs"]
-            )
-            image_grid_thw = image_inputs["image_grid_thw"]
-        else:
-            image_inputs = {}
-            image_grid_thw = None
-
-        if videos is not None:
-            videos_inputs = self.image_processor(
-                images=None, videos=videos, **output_kwargs["images_kwargs"]
-            )
-            video_grid_thw = videos_inputs["video_grid_thw"]
-
-            fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)
-            if isinstance(fps, (int, float)):
-                second_per_grid_ts = [
-                    self.image_processor.temporal_patch_size / fps
-                ] * len(video_grid_thw)
-            elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw):
-                second_per_grid_ts = [
-                    self.image_processor.temporal_patch_size / tmp for tmp in fps
-                ]
-            else:
-                raise ValueError(
-                    f"The length of fps ({len(fps) if hasattr(fps, '__len__') else fps}) must be equal to the length of video_grid_thw ({len(video_grid_thw)}) or fps should be a single number."
-                )
-            videos_inputs.update({"second_per_grid_ts": second_per_grid_ts})
-
-        else:
-            videos_inputs = {}
-            video_grid_thw = None
-
-        if not isinstance(text, list):
-            text = [text]
-
-        if image_grid_thw is not None:
-            merge_length = self.image_processor.merge_size**2
-            index = 0
-            for i in range(len(text)):
-                while self.image_token in text[i]:
-                    text[i] = text[i].replace(
-                        self.image_token,
-                        "<|placeholder|>"
-                        * (image_grid_thw[index].prod() // merge_length),
-                        1,
-                    )
-                    index += 1
-                text[i] = text[i].replace("<|placeholder|>", self.image_token)
-
-        if video_grid_thw is not None:
-            merge_length = self.image_processor.merge_size**2
-            index = 0
-            for i in range(len(text)):
-                while self.video_token in text[i]:
-                    text[i] = text[i].replace(
-                        self.video_token,
-                        "<|placeholder|>"
-                        * (video_grid_thw[index].prod() // merge_length),
-                        1,
-                    )
-                    index += 1
-                text[i] = text[i].replace("<|placeholder|>", self.video_token)
-
-        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
-
-        return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
-
-    def batch_decode(self, *args, **kwargs):
-        """
-        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
-        refer to the docstring of this method for more information.
-        """
-        return self.tokenizer.batch_decode(*args, **kwargs)
-
-    def decode(self, *args, **kwargs):
-        """
-        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
-        the docstring of this method for more information.
-        """
-        return self.tokenizer.decode(*args, **kwargs)
-
-    def post_process_image_text_to_text(self, generated_outputs):
-        """
-        Post-process the output of the model to decode the text.
-
-        Args:
-            generated_outputs (`torch.Tensor` or `np.ndarray`):
-                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
-                or `(sequence_length,)`.
-
-        Returns:
-            `List[str]`: The decoded text.
-        """
-        return self.tokenizer.batch_decode(
-            generated_outputs,
-            skip_special_tokens=True,
-            clean_up_tokenization_spaces=False,
-        )
-
-    @property
-    def model_input_names(self):
-        tokenizer_input_names = self.tokenizer.model_input_names
-        image_processor_input_names = self.image_processor.model_input_names
-        names_from_processor = list(
-            dict.fromkeys(tokenizer_input_names + image_processor_input_names)
-        )
-        return names_from_processor + ["second_per_grid_ts"]
-
-
-class Qwen2_5_VLImageProcessor(BaseImageProcessor):
-    r"""
-    Constructs a Qwen2.5-VL image processor that dynamically resizes images based on the original images.
-
-    Args:
-        do_resize (`bool`, *optional*, defaults to `True`):
-            Whether to resize the image's (height, width) dimensions.
-        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
-            Resampling filter to use when resizing the image.
-        do_rescale (`bool`, *optional*, defaults to `True`):
-            Whether to rescale the image by the specified scale `rescale_factor`.
-        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
-            Scale factor to use if rescaling the image.
-        do_normalize (`bool`, *optional*, defaults to `True`):
-            Whether to normalize the image.
-        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
-            Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
-        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
-            Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
-        do_convert_rgb (`bool`, *optional*, defaults to `True`):
-            Whether to convert the image to RGB.
-        min_pixels (`int`, *optional*, defaults to `56 * 56`):
-            The min pixels of the image to resize the image.
-        max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
-            The max pixels of the image to resize the image.
-        patch_size (`int`, *optional*, defaults to 14):
-            The spacial patch size of the vision encoder.
-        temporal_patch_size (`int`, *optional*, defaults to 2):
-            The temporal patch size of the vision encoder.
-        merge_size (`int`, *optional*, defaults to 2):
-            The merge size of the vision encoder to llm encoder.
-    """
-
-    model_input_names = [
-        "pixel_values",
-        "image_grid_thw",
-        "pixel_values_videos",
-        "video_grid_thw",
-        "second_per_grid_ts",
-    ]
-
-    def __init__(
-        self,
-        do_resize: bool = True,
-        resample: PILImageResampling = PILImageResampling.BICUBIC,
-        do_rescale: bool = True,
-        rescale_factor: Union[int, float] = 1 / 255,
-        do_normalize: bool = True,
-        image_mean: Optional[Union[float, List[float]]] = None,
-        image_std: Optional[Union[float, List[float]]] = None,
-        do_convert_rgb: bool = True,
-        min_pixels: int = 56 * 56,
-        max_pixels: int = 28 * 28 * 1280,
-        patch_size: int = 14,
-        temporal_patch_size: int = 2,
-        merge_size: int = 2,
-        **kwargs,
-    ) -> None:
-        super().__init__(**kwargs)
-        self.do_resize = do_resize
-        self.resample = resample
-        self.do_rescale = do_rescale
-        self.rescale_factor = rescale_factor
-        self.do_normalize = do_normalize
-        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
-        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
-        self.min_pixels = min_pixels
-        self.max_pixels = max_pixels
-        self.patch_size = patch_size
-        self.temporal_patch_size = temporal_patch_size
-        self.merge_size = merge_size
-        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
-        self.do_convert_rgb = do_convert_rgb
-
-    def rescale(
-        self,
-        image: np.ndarray,
-        scale: float,
-        data_format: Optional[Union[str, ChannelDimension]] = None,
-        input_data_format: Optional[Union[str, ChannelDimension]] = None,
-        **kwargs,
-    ) -> np.ndarray:
-        """
-        Rescale an image by a scale factor. image = image * scale.
-
-        Args:
-            image (`np.ndarray`):
-                Image to rescale.
-            scale (`float`):
-                The scaling factor to rescale pixel values by.
-            data_format (`str` or `ChannelDimension`, *optional*):
-                The channel dimension format for the output image. If unset, the channel dimension format of the input
-                image is used. Can be one of:
-                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
-                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
-            input_data_format (`ChannelDimension` or `str`, *optional*):
-                The channel dimension format for the input image. If unset, the channel dimension format is inferred
-                from the input image. Can be one of:
-                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
-                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
-
-        Returns:
-            `np.ndarray`: The rescaled image.
-        """
-        return rescale(
-            image,
-            scale=scale,
-            data_format=data_format,
-            input_data_format=input_data_format,
-            **kwargs,
-        )
-
-    def normalize(
-        self,
-        image: np.ndarray,
-        mean: Union[float, Iterable[float]],
-        std: Union[float, Iterable[float]],
-        data_format: Optional[Union[str, ChannelDimension]] = None,
-        input_data_format: Optional[Union[str, ChannelDimension]] = None,
-        **kwargs,
-    ) -> np.ndarray:
-        """
-        Normalize an image. image = (image - image_mean) / image_std.
-
-        Args:
-            image (`np.ndarray`):
-                Image to normalize.
-            mean (`float` or `Iterable[float]`):
-                Image mean to use for normalization.
-            std (`float` or `Iterable[float]`):
-                Image standard deviation to use for normalization.
-            data_format (`str` or `ChannelDimension`, *optional*):
-                The channel dimension format for the output image. If unset, the channel dimension format of the input
-                image is used. Can be one of:
-                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
-                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
-            input_data_format (`ChannelDimension` or `str`, *optional*):
-                The channel dimension format for the input image. If unset, the channel dimension format is inferred
-                from the input image. Can be one of:
-                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
-                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
-
-        Returns:
-            `np.ndarray`: The normalized image.
-        """
-        return normalize(
-            image,
-            mean=mean,
-            std=std,
-            data_format=data_format,
-            input_data_format=input_data_format,
-            **kwargs,
-        )
-
-    def _preprocess(
-        self,
-        images: Union[ImageInput, VideoInput],
-        do_resize: bool = None,
-        resample: PILImageResampling = None,
-        do_rescale: bool = None,
-        rescale_factor: float = None,
-        do_normalize: bool = None,
-        image_mean: Optional[Union[float, List[float]]] = None,
-        image_std: Optional[Union[float, List[float]]] = None,
-        do_convert_rgb: bool = None,
-        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
-        input_data_format: Optional[Union[str, ChannelDimension]] = None,
-    ):
-        """
-        Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
-
-        Args:
-            images (`ImageInput`):
-                Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
-            vision_info (`List[Dict]`, *optional*):
-                Optional list of dictionaries containing additional information about vision inputs.
-            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
-                Whether to resize the image.
-            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
-                Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
-            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
-                Whether to rescale the image.
-            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
-                Scale factor to use if rescaling the image.
-            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
-                Whether to normalize the image.
-            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
-                Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
-            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
-                Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
-            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
-                Whether to convert the image to RGB.
-            data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
-                The channel dimension format for the output image. Can be one of:
-                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
-                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
-                - Unset: Use the channel dimension format of the input image.
-            input_data_format (`ChannelDimension` or `str`, *optional*):
-                The channel dimension format for the input image. Can be one of:
-                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
-                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
-                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.   - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
-        """
-        images = make_list_of_images(images)
-
-        if do_convert_rgb:
-            images = [convert_to_rgb(image) for image in images]
-
-        # All transformations expect numpy arrays.
-        images = [to_numpy_array(image) for image in images]
-
-        if input_data_format is None:
-            # We assume that all images have the same channel dimension format.
-            input_data_format = infer_channel_dimension_format(images[0])
-
-        height, width = get_image_size(images[0], channel_dim=input_data_format)
-        resized_height, resized_width = height, width
-        processed_images = []
-        for image in images:
-            if do_resize:
-                resized_height, resized_width = smart_resize(
-                    height,
-                    width,
-                    factor=self.patch_size * self.merge_size,
-                    min_pixels=self.min_pixels,
-                    max_pixels=self.max_pixels,
-                )
-                image = resize(
-                    image,
-                    size=(resized_height, resized_width),
-                    resample=resample,
-                    input_data_format=input_data_format,
-                )
-
-            if do_rescale:
-                image = self.rescale(
-                    image, scale=rescale_factor, input_data_format=input_data_format
-                )
-
-            if do_normalize:
-                image = self.normalize(
-                    image=image,
-                    mean=image_mean,
-                    std=image_std,
-                    input_data_format=input_data_format,
-                )
-
-            image = to_channel_dimension_format(
-                image, data_format, input_channel_dim=input_data_format
-            )
-            processed_images.append(image)
-
-        patches = np.array(processed_images)
-        if data_format == ChannelDimension.LAST:
-            patches = patches.transpose(0, 3, 1, 2)
-        if patches.shape[0] % self.temporal_patch_size != 0:
-            repeats = np.repeat(
-                patches[-1][np.newaxis], self.temporal_patch_size - 1, axis=0
-            )
-            patches = np.concatenate([patches, repeats], axis=0)
-        channel = patches.shape[1]
-        grid_t = patches.shape[0] // self.temporal_patch_size
-        grid_h, grid_w = (
-            resized_height // self.patch_size,
-            resized_width // self.patch_size,
-        )
-        patches = patches.reshape(
-            grid_t,
-            self.temporal_patch_size,
-            channel,
-            grid_h // self.merge_size,
-            self.merge_size,
-            self.patch_size,
-            grid_w // self.merge_size,
-            self.merge_size,
-            self.patch_size,
-        )
-        patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
-        flatten_patches = patches.reshape(
-            grid_t * grid_h * grid_w,
-            channel * self.temporal_patch_size * self.patch_size * self.patch_size,
-        )
-
-        return flatten_patches, (grid_t, grid_h, grid_w)
-
-    def preprocess(
-        self,
-        images: ImageInput,
-        videos: VideoInput = None,
-        do_resize: bool = None,
-        size: Dict[str, int] = None,
-        resample: PILImageResampling = None,
-        do_rescale: bool = None,
-        rescale_factor: float = None,
-        do_normalize: bool = None,
-        image_mean: Optional[Union[float, List[float]]] = None,
-        image_std: Optional[Union[float, List[float]]] = None,
-        do_convert_rgb: bool = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
-        input_data_format: Optional[Union[str, ChannelDimension]] = None,
-    ):
-        """
-        Args:
-            images (`ImageInput`):
-                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
-                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
-            videos (`VideoInput`):
-                Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
-                passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
-            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
-                Whether to resize the image.
-            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
-                Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
-                the longest edge resized to keep the input aspect ratio.
-            resample (`int`, *optional*, defaults to `self.resample`):
-                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
-                has an effect if `do_resize` is set to `True`.
-            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
-                Whether to rescale the image.
-            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
-                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
-            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
-                Whether to normalize the image.
-            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
-                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
-            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
-                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
-                `True`.
-            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
-                Whether to convert the image to RGB.
-            return_tensors (`str` or `TensorType`, *optional*):
-                The type of tensors to return. Can be one of:
-                - Unset: Return a list of `np.ndarray`.
-                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
-                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
-                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
-                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
-            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
-                The channel dimension format for the output image. Can be one of:
-                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
-                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
-                - Unset: Use the channel dimension format of the input image.
-            input_data_format (`ChannelDimension` or `str`, *optional*):
-                The channel dimension format for the input image. If unset, the channel dimension format is inferred
-                from the input image. Can be one of:
-                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
-                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
-                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
-
-        """
-        do_resize = do_resize if do_resize is not None else self.do_resize
-        size = size if size is not None else self.size
-        resample = resample if resample is not None else self.resample
-        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
-        rescale_factor = (
-            rescale_factor if rescale_factor is not None else self.rescale_factor
-        )
-        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
-        image_mean = image_mean if image_mean is not None else self.image_mean
-        image_std = image_std if image_std is not None else self.image_std
-        do_convert_rgb = (
-            do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
-        )
-
-        def make_flat_list_of_images(
-            images: Union[List[ImageInput], ImageInput],
-        ) -> ImageInput:
-            """
-            Ensure that the output is a flat list of images. If the input is a single image, it is converted to a list of length 1.
-            If the input is a nested list of images, it is converted to a flat list of images.
-            Args:
-                images (`Union[List[ImageInput], ImageInput]`):
-                    The input image.
-            Returns:
-                list: A list of images or a 4d array of images.
-            """
-            # If the input is a nested list of images, we flatten it
-            if (
-                isinstance(images, (list, tuple))
-                and all(isinstance(images_i, (list, tuple)) for images_i in images)
-                and all(is_valid_list_of_images(images_i) for images_i in images)
-            ):
-                return [img for img_list in images for img in img_list]
-
-            if isinstance(images, (list, tuple)) and is_valid_list_of_images(images):
-                if is_pil_image(images[0]) or images[0].ndim == 3:
-                    return images
-                if images[0].ndim == 4:
-                    return [img for img_list in images for img in img_list]
-
-            if is_valid_image(images):
-                if is_pil_image(images) or images.ndim == 3:
-                    return [images]
-                if images.ndim == 4:
-                    return list(images)
-
-            raise ValueError(f"Could not make a flat list of images from {images}")
-
-        def make_batched_videos(videos) -> VideoInput:
-            """
-            Ensure that the input is a list of videos.
-            Args:
-                videos (`VideoInput`):
-                    Video or videos to turn into a list of videos.
-            Returns:
-                list: A list of videos.
-            """
-            if (
-                isinstance(videos, (list, tuple))
-                and isinstance(videos[0], (list, tuple))
-                and is_valid_image(videos[0][0])
-            ):
-                # case 1: nested batch of videos so we flatten it
-                if not is_pil_image(videos[0][0]) and videos[0][0].ndim == 4:
-                    videos = [
-                        [video for batch_list in batched_videos for video in batch_list]
-                        for batched_videos in videos
-                    ]
-                # case 2: list of videos represented as list of video frames
-                return videos
-
-            elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
-                if is_pil_image(videos[0]) or videos[0].ndim == 3:
-                    return [videos]
-                elif videos[0].ndim == 4:
-                    return [list(video) for video in videos]
-
-            elif is_valid_image(videos):
-                if is_pil_image(videos) or videos.ndim == 3:
-                    return [[videos]]
-                elif videos.ndim == 4:
-                    return [list(videos)]
-
-            raise ValueError(f"Could not make batched video from {videos}")
-
-        if images is not None:
-            images = make_flat_list_of_images(images)
-        if videos is not None:
-            videos = make_batched_videos(videos)
-
-        if images is not None and not valid_images(images):
-            raise ValueError(
-                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
-                "torch.Tensor, tf.Tensor or jax.ndarray."
-            )
-
-        validate_preprocess_arguments(
-            rescale_factor=rescale_factor,
-            do_normalize=do_normalize,
-            image_mean=image_mean,
-            image_std=image_std,
-            do_resize=do_resize,
-            size=size,
-            resample=resample,
-        )
-
-        if images is not None:
-            pixel_values, vision_grid_thws = [], []
-            for image in images:
-                patches, image_grid_thw = self._preprocess(
-                    image,
-                    do_resize=do_resize,
-                    resample=resample,
-                    do_rescale=do_rescale,
-                    rescale_factor=rescale_factor,
-                    do_normalize=do_normalize,
-                    image_mean=image_mean,
-                    image_std=image_std,
-                    data_format=data_format,
-                    do_convert_rgb=do_convert_rgb,
-                    input_data_format=input_data_format,
-                )
-                pixel_values.extend(patches)
-                vision_grid_thws.append(image_grid_thw)
-            pixel_values = np.array(pixel_values)
-            vision_grid_thws = np.array(vision_grid_thws)
-            data = {"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws}
-
-        if videos is not None:
-            pixel_values, vision_grid_thws = [], []
-            for images in videos:
-                patches, video_grid_thw = self._preprocess(
-                    images,
-                    do_resize=do_resize,
-                    resample=resample,
-                    do_rescale=do_rescale,
-                    rescale_factor=rescale_factor,
-                    do_normalize=do_normalize,
-                    image_mean=image_mean,
-                    image_std=image_std,
-                    data_format=data_format,
-                    do_convert_rgb=do_convert_rgb,
-                    input_data_format=input_data_format,
-                )
-                pixel_values.extend(patches)
-                vision_grid_thws.append(video_grid_thw)
-            pixel_values = np.array(pixel_values)
-            vision_grid_thws = np.array(vision_grid_thws)
-            data = {
-                "pixel_values_videos": pixel_values,
-                "video_grid_thw": vision_grid_thws,
-            }
-
-        return BatchFeature(data=data, tensor_type=return_tensors)
-
-
-AutoImageProcessor.register(Qwen2_5_VLConfig, None, Qwen2_5_VLImageProcessor, None)
-AutoProcessor.register(Qwen2_5_VLConfig, Qwen2_5_VLProcessor)
diff --git a/python/sglang/srt/hf_transformers_utils.py b/python/sglang/srt/hf_transformers_utils.py
index d236ce7a8..397404d30 100644
--- a/python/sglang/srt/hf_transformers_utils.py
+++ b/python/sglang/srt/hf_transformers_utils.py
@@ -35,10 +35,7 @@ from sglang.srt.configs import (
     DbrxConfig,
     DeepseekVL2Config,
     ExaoneConfig,
-    Gemma3Config,
-    Gemma3TextConfig,
     MultiModalityConfig,
-    Qwen2_5_VLConfig,
 )
 from sglang.srt.connector import create_remote_connector
 from sglang.srt.utils import is_remote_url
@@ -47,11 +44,8 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
     ChatGLMConfig.model_type: ChatGLMConfig,
     DbrxConfig.model_type: DbrxConfig,
     ExaoneConfig.model_type: ExaoneConfig,
-    Qwen2_5_VLConfig.model_type: Qwen2_5_VLConfig,
     DeepseekVL2Config.model_type: DeepseekVL2Config,
     MultiModalityConfig.model_type: MultiModalityConfig,
-    Gemma3Config.model_type: Gemma3Config,
-    Gemma3TextConfig.model_type: Gemma3TextConfig,
 }
 
 for name, cls in _CONFIG_REGISTRY.items():
@@ -223,11 +217,26 @@ def get_processor(
     tokenizer_revision: Optional[str] = None,
     **kwargs,
 ):
+    # pop 'revision' from kwargs if present.
+    revision = kwargs.pop("revision", tokenizer_revision)
+
+    config = AutoConfig.from_pretrained(
+        tokenizer_name,
+        trust_remote_code=trust_remote_code,
+        revision=revision,
+        **kwargs,
+    )
+
+    # fix: for Qwen2-VL model, inject default 'size' if not provided.
+    if config.model_type in {"qwen2_vl"}:
+        if "size" not in kwargs:
+            kwargs["size"] = {"shortest_edge": 3136, "longest_edge": 1003520}
+
     processor = AutoProcessor.from_pretrained(
         tokenizer_name,
         *args,
         trust_remote_code=trust_remote_code,
-        tokenizer_revision=tokenizer_revision,
+        revision=revision,
         **kwargs,
     )
 
diff --git a/python/sglang/srt/layers/rotary_embedding.py b/python/sglang/srt/layers/rotary_embedding.py
index 56632092e..cbd8beb66 100644
--- a/python/sglang/srt/layers/rotary_embedding.py
+++ b/python/sglang/srt/layers/rotary_embedding.py
@@ -441,16 +441,12 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module):
     ):
         super().__init__()
 
-        if rotary_dim != head_size:
-            raise ValueError(
-                f"`Phi3LongRoPEScaledRotaryEmbedding` does not support \
-                    rotary_dim != head_size ({rotary_dim}!={head_size})."
-            )
         if is_neox_style is False:
             raise ValueError(
                 "`Phi3LongRoPEScaledRotaryEmbedding` only supports neox_style."
             )
 
+        self.rotary_dim = rotary_dim
         self.head_size = head_size
         self.max_position_embeddings = max_position_embeddings
         self.original_max_position_embeddings = original_max_position_embeddings
@@ -499,8 +495,8 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module):
             * (
                 self.base
                 ** (
-                    torch.arange(0, self.head_size, 2, dtype=torch.float)
-                    / self.head_size
+                    torch.arange(0, self.rotary_dim, 2, dtype=torch.float)
+                    / self.rotary_dim
                 )
             )
         )
@@ -549,8 +545,15 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module):
         cos = cos.repeat(1, 2).unsqueeze(-2)
         sin = sin.repeat(1, 2).unsqueeze(-2)
 
-        query = query * cos + _rotate_neox(query) * sin
-        key = key * cos + _rotate_neox(key) * sin
+        query_rot = query[..., : self.rotary_dim]
+        query_pass = query[..., self.rotary_dim :]
+        query_rot = query_rot * cos + _rotate_neox(query_rot) * sin
+        query = torch.cat((query_rot, query_pass), dim=-1)
+
+        key_rot = key[..., : self.rotary_dim]
+        key_pass = key[..., self.rotary_dim :]
+        key_rot = key_rot * cos + _rotate_neox(key_rot) * sin
+        key = torch.cat((key_rot, key_pass), dim=-1)
 
         return query.flatten(-2), key.flatten(-2)
 
diff --git a/python/sglang/srt/models/gemma3_causal.py b/python/sglang/srt/models/gemma3_causal.py
index b2fa84eb3..489b15798 100644
--- a/python/sglang/srt/models/gemma3_causal.py
+++ b/python/sglang/srt/models/gemma3_causal.py
@@ -21,11 +21,11 @@ from torch import nn
 from transformers import (
     ROPE_INIT_FUNCTIONS,
     AutoModel,
+    Gemma3TextConfig,
     PretrainedConfig,
     PreTrainedModel,
 )
 
-from sglang.srt.configs.gemma3 import Gemma3TextConfig
 from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import GeluAndMul
 from sglang.srt.layers.layernorm import Gemma3RMSNorm
diff --git a/python/sglang/srt/models/gemma3_mm.py b/python/sglang/srt/models/gemma3_mm.py
index 561b7e834..401e65731 100644
--- a/python/sglang/srt/models/gemma3_mm.py
+++ b/python/sglang/srt/models/gemma3_mm.py
@@ -21,9 +21,15 @@ from typing import Dict, Iterable, List, Optional, Set, Tuple, TypedDict
 
 import torch
 from torch import nn
-from transformers import AutoModel, PreTrainedModel
+from transformers import (
+    AutoModel,
+    BatchFeature,
+    Gemma3Config,
+    Gemma3Processor,
+    PreTrainedModel,
+)
+from transformers.models.gemma3.processing_gemma3 import Gemma3ProcessorKwargs
 
-from sglang.srt.configs import Gemma3Config
 from sglang.srt.hf_transformers_utils import get_processor
 from sglang.srt.layers.layernorm import Gemma3RMSNorm
 from sglang.srt.layers.logits_processor import LogitsProcessor
diff --git a/python/sglang/srt/models/llama.py b/python/sglang/srt/models/llama.py
index 873b28e02..20fa13493 100644
--- a/python/sglang/srt/models/llama.py
+++ b/python/sglang/srt/models/llama.py
@@ -129,6 +129,8 @@ class LlamaAttention(nn.Module):
         self.head_dim = getattr(
             config, "head_dim", self.hidden_size // self.total_num_heads
         )
+        partial_rotary_factor = getattr(config, "partial_rotary_factor", 1)
+        self.rotary_dim = int(partial_rotary_factor * self.head_dim)
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim
         self.scaling = self.head_dim**-0.5
@@ -154,7 +156,7 @@ class LlamaAttention(nn.Module):
 
         self.rotary_emb = get_rope(
             self.head_dim,
-            rotary_dim=self.head_dim,
+            rotary_dim=self.rotary_dim,
             max_position=max_position_embeddings,
             base=rope_theta,
             rope_scaling=rope_scaling,
diff --git a/python/sglang/srt/models/qwen2_5_vl.py b/python/sglang/srt/models/qwen2_5_vl.py
index 9fb59c26a..95f926356 100644
--- a/python/sglang/srt/models/qwen2_5_vl.py
+++ b/python/sglang/srt/models/qwen2_5_vl.py
@@ -34,8 +34,15 @@ from einops import rearrange
 from transformers import AutoModel, Qwen2VLConfig
 from transformers.activations import ACT2FN
 from transformers.models.qwen2.modeling_qwen2 import Qwen2RMSNorm
+from transformers.models.qwen2_5_vl import Qwen2_5_VLProcessor
+from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import (
+    Qwen2_5_VLConfig,
+    Qwen2_5_VLVisionConfig,
+)
+from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
+    Qwen2_5_VLForConditionalGeneration,
+)
 
-from sglang.srt.configs import Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig
 from sglang.srt.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -714,4 +721,3 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module):
 
 
 EntryClass = [Qwen2_5_VLForConditionalGeneration]
-AutoModel.register(Qwen2_5_VLConfig, Qwen2_5_VLForConditionalGeneration)
diff --git a/scripts/ci_install_dependency.sh b/scripts/ci_install_dependency.sh
index 6933eeddf..ca79ebcb6 100755
--- a/scripts/ci_install_dependency.sh
+++ b/scripts/ci_install_dependency.sh
@@ -20,7 +20,7 @@ pip install flashinfer_python==0.2.3 --find-links ${FLASHINFER_REPO} --force-rei
 
 pip install torch_memory_saver --force-reinstall
 
-pip install transformers==4.48.3 sentence_transformers accelerate==1.4.0 peft pandas datasets
+pip install transformers==4.50.0 sentence_transformers accelerate==1.4.0 peft pandas datasets
 
 # For compling xgrammar kernels
 pip install cuda-python nvidia-cuda-nvrtc-cu12