Model: Support Qwen 2.5 VL (#3258)

Author: Mick
Date: 2025-02-16 16:58:53 +08:00
Committed by: GitHub
Parent: 39416e394a
Commit: bcc213df61
11 changed files with 2000 additions and 262 deletions

View File

@@ -4,7 +4,7 @@
 - Llama / Llama 2 / Llama 3 / Llama 3.1 / Llama 3.2
 - Mistral / Mixtral / Mistral NeMo / Mistral Small 3
 - Gemma / Gemma 2
-- Qwen / Qwen 2 / Qwen 2 MoE / Qwen 2 VL
+- Qwen / Qwen 2 / Qwen 2 MoE / Qwen 2 VL / Qwen 2.5 VL
 - DeepSeek / DeepSeek 2 / [DeepSeek 3](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3)
 - OLMoE
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
@@ -54,7 +54,7 @@ To support a new model in SGLang, you only need to add a single file under [SGLa
 You can learn from existing model implementations and create new files for the new models.
 For most models, you should be able to find a similar model to start with (e.g., starting from Llama).
-## How to Support a New vision LLM
+## How to Support a New vLM
 To support a new vision-language model (vLM) in SGLang, there are several key components in addition to the standard LLM.
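As a quick orientation (an editor's sketch, not part of the diff): the pieces this commit adds for Qwen 2.5 VL are the same ones any new vLM needs in SGLang. Class and method names below are illustrative placeholders; the real implementations appear in the files changed further down.

```python
# Rough sketch of the pieces a new vision-language model needs in SGLang,
# mirroring what this commit adds for Qwen 2.5 VL. All names are illustrative.
from typing import Iterable, List, Tuple

import torch
import torch.nn as nn


class MyVLMForConditionalGeneration(nn.Module):
    """Model entry: a vision encoder plus a language backbone."""

    def pad_input_ids(self, input_ids: List[int], image_inputs) -> List[int]:
        # Replace image-token spans with per-image pad values so radix-cache
        # prefix matching still works; see pad_input_ids in the new model file.
        return input_ids

    def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, forward_batch):
        # Embed text, overwrite image-token positions with vision embeddings,
        # then run the language backbone.
        ...

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        # Map checkpoint parameter names onto SGLang's fused/parallel layers.
        ...


# A matching image processor (see Qwen2_5VLImageProcessor below) must also be
# registered, and the architecture string added to is_multimodal_model and
# get_image_processor.
EntryClass = [MyVLMForConditionalGeneration]
```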

View File

@@ -427,6 +427,8 @@ def match_chat_ml(model_path: str):
if "tinyllama" in model_path: if "tinyllama" in model_path:
return get_chat_template("chatml") return get_chat_template("chatml")
# Now the suffix for qwen2 chat model is "instruct" # Now the suffix for qwen2 chat model is "instruct"
if "qwen" in model_path and "vl" in model_path:
return get_chat_template("qwen2-vl")
if "qwen" in model_path: if "qwen" in model_path:
if "vl" in model_path: if "vl" in model_path:
return get_chat_template("qwen2-vl") return get_chat_template("qwen2-vl")
@@ -443,6 +445,12 @@ def match_chat_ml(model_path: str):
return get_chat_template("chatml-llava") return get_chat_template("chatml-llava")
@register_chat_template_matching_function
def match_chat_minicpm(model_path: str):
if "minicpm" in model_path:
return get_chat_template("minicpmv")
@register_chat_template_matching_function @register_chat_template_matching_function
def match_chat_yi(model_path: str): def match_chat_yi(model_path: str):
model_path = model_path.lower() model_path = model_path.lower()
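The effect of the new early-return branch is that any lower-cased model path containing both "qwen" and "vl" resolves to the qwen2-vl chat template before the nested Qwen checks run. A standalone illustration of that ordering (simplified stand-in logic, not the actual SGLang helper):

```python
# Simplified illustration of the new matching order; not the real SGLang API.
def match_template(model_path: str) -> str:
    model_path = model_path.lower()
    if "qwen" in model_path and "vl" in model_path:
        return "qwen2-vl"
    if "qwen" in model_path:
        return "qwen2"  # placeholder for the remaining qwen branches
    return "chatml"

assert match_template("Qwen/Qwen2.5-VL-7B-Instruct") == "qwen2-vl"
assert match_template("Qwen/Qwen2-VL-7B-Instruct") == "qwen2-vl"
```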

View File

@@ -1,12 +1,15 @@
 from sglang.srt.configs.chatglm import ChatGLMConfig
 from sglang.srt.configs.dbrx import DbrxConfig
 from sglang.srt.configs.exaone import ExaoneConfig
-from sglang.srt.configs.qwen2vl import Qwen2VLConfig, Qwen2VLVisionConfig
+from sglang.srt.configs.qwen2_5_vl_config import (
+    Qwen2_5_VLConfig,
+    Qwen2_5_VLVisionConfig,
+)
 __all__ = [
     "ExaoneConfig",
-    "Qwen2VLConfig",
-    "Qwen2VLVisionConfig",
     "ChatGLMConfig",
     "DbrxConfig",
+    "Qwen2_5_VLConfig",
+    "Qwen2_5_VLVisionConfig",
 ]

View File

@@ -403,6 +403,7 @@ def is_multimodal_model(model_architectures: List[str]):
or "LlavaVidForCausalLM" in model_architectures or "LlavaVidForCausalLM" in model_architectures
or "MllamaForConditionalGeneration" in model_architectures or "MllamaForConditionalGeneration" in model_architectures
or "Qwen2VLForConditionalGeneration" in model_architectures or "Qwen2VLForConditionalGeneration" in model_architectures
or "Qwen2_5_VLForConditionalGeneration" in model_architectures
or "MiniCPMV" in model_architectures or "MiniCPMV" in model_architectures
): ):
return True return True

File diff suppressed because it is too large.

View File

@@ -1,130 +0,0 @@
# coding=utf-8
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Qwen2VL model configuration"""
import os
from typing import Union
from transformers import PretrainedConfig
class Qwen2VLVisionConfig(PretrainedConfig):
model_type = "qwen2_vl"
def __init__(
self,
depth=32,
embed_dim=1280,
hidden_size=3584,
hidden_act="quick_gelu",
mlp_ratio=4,
num_heads=16,
in_channels=3,
patch_size=14,
spatial_merge_size=2,
temporal_patch_size=2,
**kwargs,
):
super().__init__(**kwargs)
self.depth = depth
self.embed_dim = embed_dim
self.hidden_size = hidden_size
self.hidden_act = hidden_act
self.mlp_ratio = mlp_ratio
self.num_heads = num_heads
self.in_channels = in_channels
self.patch_size = patch_size
self.spatial_merge_size = spatial_merge_size
self.temporal_patch_size = temporal_patch_size
@classmethod
def from_pretrained(
cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
) -> "PretrainedConfig":
cls._set_token_in_kwargs(kwargs)
config_dict, kwargs = cls.get_config_dict(
pretrained_model_name_or_path, **kwargs
)
if config_dict.get("model_type") == "qwen2_vl":
config_dict = config_dict["vision_config"]
return cls.from_dict(config_dict, **kwargs)
class Qwen2VLConfig(PretrainedConfig):
model_type = "qwen2_vl"
def __init__(
self,
vocab_size=152064,
hidden_size=8192,
intermediate_size=29568,
num_hidden_layers=80,
num_attention_heads=64,
num_key_value_heads=8,
hidden_act="silu",
max_position_embeddings=32768,
initializer_range=0.02,
rms_norm_eps=1e-05,
use_cache=True,
tie_word_embeddings=False,
rope_theta=1000000.0,
use_sliding_window=False,
sliding_window=4096,
max_window_layers=80,
attention_dropout=0.0,
vision_config=None,
rope_scaling=None,
**kwargs,
):
if isinstance(vision_config, dict):
self.vision_config = Qwen2VLVisionConfig(**vision_config)
elif vision_config is None:
self.vision_config = Qwen2VLVisionConfig()
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.use_sliding_window = use_sliding_window
self.sliding_window = sliding_window
self.max_window_layers = max_window_layers
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
self.attention_dropout = attention_dropout
self.rope_scaling = rope_scaling
# NOTE(HandH1998): This is necessary for configuring the `rope_type`` of qwen2vl models after removing dependencies on vllm.
if self.rope_scaling is not None and "type" in self.rope_scaling:
if self.rope_scaling["type"] == "mrope":
self.rope_scaling["type"] = "default"
self.rope_scaling["rope_type"] = self.rope_scaling["type"]
super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)

View File

@@ -30,16 +30,15 @@ from transformers import (
 )
 from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
-from sglang.srt.configs import ChatGLMConfig, DbrxConfig, ExaoneConfig, Qwen2VLConfig
+from sglang.srt.configs import ChatGLMConfig, DbrxConfig, ExaoneConfig, Qwen2_5_VLConfig
 _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
     ChatGLMConfig.model_type: ChatGLMConfig,
     DbrxConfig.model_type: DbrxConfig,
     ExaoneConfig.model_type: ExaoneConfig,
-    Qwen2VLConfig.model_type: Qwen2VLConfig,
+    Qwen2_5_VLConfig.model_type: Qwen2_5_VLConfig,
 }
 for name, cls in _CONFIG_REGISTRY.items():
     with contextlib.suppress(ValueError):
         AutoConfig.register(name, cls)
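Registering the config under its model_type means AutoConfig.from_pretrained resolves a Qwen 2.5 VL checkpoint to the SGLang-defined class. A hedged sketch of the effect; the checkpoint name is only an example, and on transformers versions that already ship this model type the suppressed ValueError leaves the upstream config class in charge:

```python
# Sketch of what the AutoConfig registration above enables. The checkpoint name
# is only an example; any config.json with model_type == "qwen2_5_vl" resolves
# the same way. If transformers already registers this type, register() raises
# ValueError (suppressed above) and the upstream class is returned instead.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
print(type(config).__name__)   # Qwen2_5_VLConfig (or the upstream equivalent)
print(config.vision_config)    # nested vision settings: patch size, depth, ...
```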

View File

@@ -1,6 +1,7 @@
 # TODO: also move pad_input_ids into this module
 import asyncio
 import concurrent.futures
+import dataclasses
 import logging
 import multiprocessing as mp
 import os
@@ -8,6 +9,7 @@ from abc import ABC, abstractmethod
 from typing import List, Optional, Union
 import numpy as np
+import PIL
 import transformers
 from decord import VideoReader, cpu
 from PIL import Image
@@ -34,11 +36,22 @@ def init_global_processor(server_args: ServerArgs):
 )
+@dataclasses.dataclass
+class BaseImageProcessorOutput:
+    image_hashes: list[int]
+    image_sizes: list[int]
+    all_frames: [PIL.Image]
+    # input_text, with each frame of video/image represented with a image_token
+    input_text: str
 class BaseImageProcessor(ABC):
     def __init__(self, hf_config, server_args, _processor):
         self.hf_config = hf_config
         self._processor = _processor
         self.server_args = server_args
+        # FIXME: not accurate, model and image specific
+        self.NUM_TOKEN_PER_FRAME = 330
         self.executor = concurrent.futures.ProcessPoolExecutor(
             initializer=init_global_processor,
@@ -48,9 +61,128 @@ class BaseImageProcessor(ABC):
         )
     @abstractmethod
-    async def process_images_async(self, image_data, input_text, **kwargs):
+    async def process_images_async(
+        self, image_data, input_text, max_req_input_len, **kwargs
+    ):
         pass
def get_estimated_frames_list(self, image_data):
"""
estimate the total frame count from all visual input
"""
# Before processing inputs
estimated_frames_list = []
for image in image_data:
if isinstance(image, str) and image.startswith("video:"):
path = image[len("video:") :]
# Estimate frames for the video
vr = VideoReader(path, ctx=cpu(0))
num_frames = len(vr)
else:
# For images, each contributes one frame
num_frames = 1
estimated_frames_list.append(num_frames)
return estimated_frames_list
def encode_video(self, video_path, frame_count_limit=None):
if not os.path.exists(video_path):
logger.error(f"Video {video_path} does not exist")
return []
if frame_count_limit == 0:
return []
def uniform_sample(l, n):
gap = len(l) / n
idxs = [int(i * gap + gap / 2) for i in range(n)]
return [l[i] for i in idxs]
vr = VideoReader(video_path, ctx=cpu(0))
sample_fps = round(vr.get_avg_fps() / 1) # FPS
frame_idx = [i for i in range(0, len(vr), sample_fps)]
if frame_count_limit is not None and len(frame_idx) > frame_count_limit:
frame_idx = uniform_sample(frame_idx, frame_count_limit)
frames = vr.get_batch(frame_idx).asnumpy()
frames = [Image.fromarray(v.astype("uint8")) for v in frames]
return frames
def load_images(
self,
max_req_input_len: int,
input_ids: list,
image_data,
image_token: str,
) -> BaseImageProcessorOutput:
"""
Each frame of video/image will be replaced by a single image token
"""
image_hashes, image_sizes = [], []
all_frames = []
new_text_parts = []
if isinstance(input_ids, list):
assert len(input_ids) and isinstance(input_ids[0], int)
input_text = self._processor.tokenizer.decode(input_ids)
else:
input_text = input_ids
text_parts = input_text.split(image_token)
# roughly calculate the max number of frames under the max_req_input_len limit
def calculate_max_num_frames() -> int:
ret = (max_req_input_len - len(input_ids)) // self.NUM_TOKEN_PER_FRAME
return min(ret, 100)
MAX_NUM_FRAMES = calculate_max_num_frames()
estimated_frames_list = self.get_estimated_frames_list(image_data=image_data)
total_frame_count = sum(estimated_frames_list)
# a heuristic value, suggesting the maximum fraction of frames to embed from all visual inputs.
# e.g., 0.1 suggests that 1 frame out of 10 input frames should be used
scaling_factor = min(1.0, MAX_NUM_FRAMES / total_frame_count)
# Process each input with allocated frames
for image_index, (image, estimated_frames) in enumerate(
zip(image_data, estimated_frames_list)
):
if len(all_frames) >= MAX_NUM_FRAMES:
frames_to_process = 0
else:
frames_to_process = max(1, int(estimated_frames * scaling_factor))
if frames_to_process == 0:
frames = []
else:
try:
if isinstance(image, str) and image.startswith("video:"):
path = image[len("video:") :]
frames = self.encode_video(
path, frame_count_limit=frames_to_process
)
else:
raw_image, _size = load_image(image)
frames = [raw_image]
if len(frames) == 0:
continue
except FileNotFoundError as e:
print(e)
return None
image_sizes += frames[0].size * len(frames)
image_hashes += [hash(image)] * len(frames)
all_frames += frames
new_text_parts.append(text_parts[image_index])
if frames_to_process != 0:
new_text_parts.append(image_token * len(frames))
assert frames_to_process == len(frames)
new_text_parts.append(text_parts[-1])
input_text = "".join(new_text_parts)
return BaseImageProcessorOutput(
image_hashes, image_sizes, all_frames, input_text
)
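The shared load_images helper budgets frames against the remaining context: calculate_max_num_frames divides the leftover token space by NUM_TOKEN_PER_FRAME (capped at 100), and scaling_factor then thins each video proportionally. A worked example with made-up numbers (an editor's sketch, not taken from any real request):

```python
# Worked example of the frame budget above; all numbers are illustrative.
max_req_input_len = 8192          # server-side cap for this request
prompt_len = 1592                 # len(input_ids) before image expansion
num_token_per_frame = 330         # self.NUM_TOKEN_PER_FRAME default

max_num_frames = min((max_req_input_len - prompt_len) // num_token_per_frame, 100)
# (8192 - 1592) // 330 = 20 frames may be embedded in total

estimated_frames = [1, 120]       # one still image plus a ~120-frame video
scaling_factor = min(1.0, max_num_frames / sum(estimated_frames))
# 20 / 121 ~= 0.165 -> the image keeps max(1, int(1 * 0.165)) = 1 frame,
# the video keeps max(1, int(120 * 0.165)) = 19 frames
```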
class DummyImageProcessor(BaseImageProcessor):
def __init__(self):
@@ -248,9 +380,9 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
             text=input_text, images=images, return_tensors="pt"
         )
         return {
-            "input_ids": result["input_ids"],
-            "pixel_values": result["pixel_values"],
-            "tgt_sizes": result["tgt_sizes"],
+            "input_ids": result.input_ids,
+            "pixel_values": result.pixel_values,
+            "tgt_sizes": result.tgt_sizes,
         }
     async def _process_images(self, images, input_text):
@@ -278,124 +410,20 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
     ):
         if not image_data:
             return None
         if not isinstance(image_data, list):
             image_data = [image_data]
-        image_hashes, image_sizes = [], []
-        all_frames = []
-        # roughly calculate the max number of frames under the max_req_input_len limit
-        def calculate_max_num_frames() -> int:
-            # Model-specific
-            NUM_TOKEN_PER_FRAME = 330
-            ret = (max_req_input_len - len(input_ids)) // NUM_TOKEN_PER_FRAME
-            return min(ret, 100)
-        MAX_NUM_FRAMES = calculate_max_num_frames()
-        # print(f"MAX_NUM_FRAMES: {MAX_NUM_FRAMES}")
-        def get_estimated_frames_list():
-            """
-            estimate the total frame count from all visual input
-            """
-            # Before processing inputs
-            estimated_frames_list = []
-            for image in image_data:
-                if isinstance(image, str) and image.startswith("video:"):
-                    path = image[len("video:") :]
-                    # Estimate frames for the video
-                    vr = VideoReader(path, ctx=cpu(0))
-                    num_frames = len(vr)
-                else:
-                    # For images, each contributes one frame
-                    num_frames = 1
-                estimated_frames_list.append(num_frames)
-            return estimated_frames_list
-        estimated_frames_list = get_estimated_frames_list()
-        total_frame_count = sum(estimated_frames_list)
-        scaling_factor = min(1.0, MAX_NUM_FRAMES / total_frame_count)
-        def encode_video(video_path, frame_count_limit=None):
-            if not os.path.exists(video_path):
-                logger.error(f"Video {video_path} does not exist")
-                return []
-            if frame_count_limit == 0:
-                return []
-            def uniform_sample(l, n):
-                gap = len(l) / n
-                idxs = [int(i * gap + gap / 2) for i in range(n)]
-                return [l[i] for i in idxs]
-            vr = VideoReader(video_path, ctx=cpu(0))
-            sample_fps = round(vr.get_avg_fps() / 1)  # FPS
-            frame_idx = [i for i in range(0, len(vr), sample_fps)]
-            if frame_count_limit is not None and len(frame_idx) > frame_count_limit:
-                frame_idx = uniform_sample(frame_idx, frame_count_limit)
-            frames = vr.get_batch(frame_idx).asnumpy()
-            frames = [Image.fromarray(v.astype("uint8")) for v in frames]
-            return frames
-        if isinstance(input_ids, list):
-            assert len(input_ids) and isinstance(input_ids[0], int)
-            input_text = self._processor.tokenizer.decode(input_ids)
-        else:
-            input_text = input_ids
-        # MiniCPMV requires each frame of video as a single image token
-        text_parts = input_text.split(self.IMAGE_TOKEN)
-        new_text_parts = []
-        # Process each input with allocated frames
-        for image_index, (image, estimated_frames) in enumerate(
-            zip(image_data, estimated_frames_list)
-        ):
-            if len(all_frames) >= MAX_NUM_FRAMES:
-                frames_to_process = 0
-            else:
-                frames_to_process = max(1, int(estimated_frames * scaling_factor))
-            if frames_to_process == 0:
-                frames = []
-            else:
-                try:
-                    if isinstance(image, str) and image.startswith("video:"):
-                        path = image[len("video:") :]
-                        frames = encode_video(path, frame_count_limit=frames_to_process)
-                    else:
-                        raw_image, _size = load_image(image)
-                        frames = [raw_image]
-                    if len(frames) == 0:
-                        continue
-                except FileNotFoundError as e:
-                    print(e)
-                    return None
-            image_sizes += frames[0].size * len(frames)
-            image_hashes += [hash(image)] * len(frames)
-            all_frames += frames
-            assert frames_to_process == len(frames)
-            new_text_parts.append(text_parts[image_index])
-            if frames_to_process != 0:
-                new_text_parts.append(self.IMAGE_TOKEN * len(frames))
-        new_text_parts.append(text_parts[-1])
-        input_text = "".join(new_text_parts)
-        if len(all_frames) == 0:
-            return None
-        res = await self._process_images(images=all_frames, input_text=input_text)
-        pixel_values = res["pixel_values"]
-        tgt_sizes = res["tgt_sizes"]
-        input_ids = res["input_ids"]
+        base_output = self.load_images(
+            max_req_input_len, input_ids, image_data, self.IMAGE_TOKEN
+        )
+        if base_output is None:
+            return None
+        if len(base_output.all_frames) == 0:
+            return None
+        res = await self._process_images(
+            images=base_output.all_frames, input_text=base_output.input_text
+        )
         # Collect special token ids
         tokenizer = self._processor.tokenizer
@@ -405,10 +433,10 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
         slice_start_id = [tokenizer.slice_start_id]
         slice_end_id = [tokenizer.slice_end_id]
         return {
-            "input_ids": input_ids.flatten().tolist(),
-            "pixel_values": pixel_values,
-            "tgt_sizes": tgt_sizes,
-            "image_hashes": image_hashes,
+            "input_ids": res["input_ids"].flatten().tolist(),
+            "pixel_values": res["pixel_values"],
+            "tgt_sizes": res["tgt_sizes"],
+            "image_hashes": base_output.image_hashes,
             "modalities": request_obj.modalities or ["image"],
             "im_start_id": im_start_id,
             "im_end_id": im_end_id,
@@ -536,13 +564,80 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
}
class Qwen2_5VLImageProcessor(BaseImageProcessor):
def __init__(self, hf_config, server_args, _processor):
super().__init__(hf_config, server_args, _processor)
self.IMAGE_TOKEN = "<|vision_start|><|image_pad|><|vision_end|>"
self.IM_START_TOKEN_ID = hf_config.vision_start_token_id
self.IM_END_TOKEN_ID = hf_config.vision_end_token_id
self.NUM_TOKEN_PER_FRAME = 770
@staticmethod
def _process_images_task(images, input_text):
result = global_processor.__call__(
text=input_text, images=images, return_tensors="pt"
)
return {
"input_ids": result.input_ids,
"pixel_values": result.pixel_values,
"image_grid_thws": result.image_grid_thw,
}
async def _process_images(self, images, input_text) -> dict:
if self.executor is not None:
loop = asyncio.get_event_loop()
return await loop.run_in_executor(
self.executor,
Qwen2_5VLImageProcessor._process_images_task,
images,
input_text,
)
else:
return self._process_images_task(images, input_text)
async def process_images_async(
self,
image_data: List[Union[str, bytes]],
input_ids,
request_obj,
max_req_input_len,
*args,
**kwargs,
):
if not image_data:
return None
if isinstance(image_data, str):
image_data = [image_data]
image_token = self.IMAGE_TOKEN
base_output = self.load_images(
max_req_input_len, input_ids, image_data, image_token
)
ret = await self._process_images(base_output.all_frames, base_output.input_text)
return {
"input_ids": ret["input_ids"].flatten().tolist(),
"pixel_values": ret["pixel_values"],
"image_hashes": base_output.image_hashes,
"modalities": request_obj.modalities or ["image"],
"image_grid_thws": ret["image_grid_thws"],
"im_start_id": self.IM_START_TOKEN_ID,
"im_end_id": self.IM_END_TOKEN_ID,
}
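For reference, the dict returned by the new Qwen2_5VLImageProcessor.process_images_async looks roughly like the sketch below. Field names follow the code above; the tensor shapes and token ids are illustrative values only, not guaranteed for any particular checkpoint:

```python
# Illustrative shape of the processor output for a single image.
# All concrete values below are examples, not exact numbers.
example_output = {
    "input_ids": [151644, ...],          # prompt tokens with the image pad expanded
    "pixel_values": "tensor of shape (num_patches, patch_dim)",
    "image_hashes": [8234567890123],     # one hash per frame, used for prefix caching
    "modalities": ["image"],
    "image_grid_thws": "tensor [[1, 92, 138]]",  # temporal/height/width patch grid
    "im_start_id": 151652,               # vision_start_token_id (example value)
    "im_end_id": 151653,                 # vision_end_token_id (example value)
}
```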
 def get_image_processor(
     hf_config, server_args: ServerArgs, processor
 ) -> BaseImageProcessor:
     if "MllamaForConditionalGeneration" in hf_config.architectures:
         return MllamaImageProcessor(hf_config, server_args, processor)
     elif "Qwen2VLForConditionalGeneration" in hf_config.architectures:
-        return Qwen2VLImageProcessor(hf_config, server_args, processor.image_processor)
+        return Qwen2VLImageProcessor(hf_config, server_args, processor)
+    elif "Qwen2_5_VLForConditionalGeneration" in hf_config.architectures:
+        return Qwen2_5VLImageProcessor(hf_config, server_args, processor)
     elif "MiniCPMV" in hf_config.architectures:
         return MiniCPMVImageProcessor(hf_config, server_args, processor)
     else:

View File

@@ -0,0 +1,722 @@
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/19e6e80e10118f855137b90740936c0b11ac397f/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
# Copyright 2024 The Qwen team.
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only Qwen2-VL model compatible with HuggingFace weights."""
import logging
from functools import lru_cache, partial
from typing import Iterable, List, Optional, Tuple, Type
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from transformers import AutoModel, Qwen2VLConfig
from transformers.activations import ACT2FN
from transformers.models.qwen2.modeling_qwen2 import Qwen2RMSNorm
from sglang.srt.configs import Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig
from sglang.srt.distributed import (
get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size,
)
from sglang.srt.hf_transformers_utils import get_processor
from sglang.srt.layers.attention.vision import VisionAttention
from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.pooler import Pooler, PoolingType
from sglang.srt.layers.quantization.base_config import QuantizationConfig
from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
from sglang.srt.managers.schedule_batch import ImageInputs
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
from sglang.srt.model_loader.weight_utils import default_weight_loader
from sglang.srt.models.qwen2 import Qwen2Model
from sglang.srt.models.qwen2_vl import Qwen2VLImageInputs, Qwen2VLVideoInputs
logger = logging.getLogger(__name__)
class Qwen2_5_VLMLP(nn.Module):
def __init__(
self,
in_features: int,
hidden_features: int = None,
bias: bool = True,
hidden_act="silu",
quant_config: Optional[QuantizationConfig] = None,
):
super().__init__()
self.gate_proj = ColumnParallelLinear(
in_features, hidden_features, bias=bias, quant_config=quant_config
)
self.up_proj = ColumnParallelLinear(
in_features, hidden_features, bias=bias, quant_config=quant_config
)
self.down_proj = RowParallelLinear(
hidden_features, in_features, bias=bias, quant_config=quant_config
)
self.act = ACT2FN[hidden_act]
def forward(self, x: torch.Tensor) -> torch.Tensor:
x_parallel_gate, _ = self.gate_proj(x)
x_parallel_gate = self.act(x_parallel_gate)
x_parallel_up, _ = self.up_proj(x)
x_parallel = x_parallel_gate * x_parallel_up
x, _ = self.down_proj(x_parallel)
return x
class Qwen2_5_VisionBlock(nn.Module):
def __init__(
self,
dim: int,
intermediate_dim: int,
num_heads: int,
hidden_act="silu",
norm_layer: Type[nn.Module] = None,
attn_implementation: Optional[str] = "sdpa",
quant_config: Optional[QuantizationConfig] = None,
) -> None:
super().__init__()
if norm_layer is None:
norm_layer = partial(nn.LayerNorm, eps=1e-6)
self.norm1 = Qwen2RMSNorm(dim, eps=1e-6)
self.norm2 = Qwen2RMSNorm(dim, eps=1e-6)
if attn_implementation == "sdpa":
use_context_forward = False
use_full_precision_softmax = False
elif attn_implementation == "flash_attention_2":
use_full_precision_softmax = False
use_context_forward = True
elif attn_implementation == "eager":
use_full_precision_softmax = True
use_context_forward = False
self.attn = VisionAttention(
embed_dim=dim,
num_heads=num_heads,
projection_size=dim,
use_qkv_parallel=False,
use_context_forward=use_context_forward,
use_full_precision_softmax=use_full_precision_softmax,
flatten_batch=True,
quant_config=quant_config,
)
self.mlp = Qwen2_5_VLMLP(
dim, intermediate_dim, hidden_act=hidden_act, quant_config=quant_config
)
def forward(
self, x: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor
) -> torch.Tensor:
hidden_states = self.norm1(x)
hidden_states = rearrange(hidden_states, "s b ... -> b s ...")
attn = self.attn(
hidden_states, cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb
)
attn = rearrange(attn, "b s ... -> s b ...")
x = x + attn
norm2 = self.norm2(x)
mlp = self.mlp(norm2)
x = x + mlp
return x
class Qwen2_5_VisionPatchEmbed(nn.Module):
def __init__(
self,
patch_size: int = 14,
temporal_patch_size: int = 2,
in_chans: int = 3,
embed_dim: int = 1152,
) -> None:
super().__init__()
self.patch_size = patch_size
self.temporal_patch_size = temporal_patch_size
self.embed_dim = embed_dim
kernel_size = [temporal_patch_size, patch_size, patch_size]
self.proj = nn.Conv3d(
in_chans, embed_dim, kernel_size=kernel_size, stride=kernel_size, bias=False
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
L, C = x.shape
x = x.view(L, -1, self.temporal_patch_size, self.patch_size, self.patch_size)
x = self.proj(x).view(L, self.embed_dim)
return x
class Qwen2_5_VisionPatchMerger(nn.Module):
def __init__(
self,
dim: int,
context_dim: int,
spatial_merge_size: int = 2,
quant_config: Optional[QuantizationConfig] = None,
) -> None:
super().__init__()
self.hidden_size = context_dim * (spatial_merge_size**2)
self.ln_q = Qwen2RMSNorm(context_dim, eps=1e-6)
self.mlp = nn.ModuleList(
[
ColumnParallelLinear(
self.hidden_size,
self.hidden_size,
bias=True,
quant_config=quant_config,
),
nn.GELU(),
RowParallelLinear(
self.hidden_size, dim, bias=True, quant_config=quant_config
),
]
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = self.ln_q(x)
x = x.view(-1, self.hidden_size)
mlp_fc1, mlp_act, mlp_fc2 = self.mlp
x_parallel, _ = mlp_fc1(x)
x_parallel = mlp_act(x_parallel)
out, _ = mlp_fc2(x_parallel)
return out
class Qwen2_5_VisionRotaryEmbedding(nn.Module):
def __init__(self, dim: int, theta: float = 10000.0) -> None:
super().__init__()
self.dim = dim
self.theta = theta
inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
self.register_buffer("inv_freq", inv_freq, persistent=False)
self._seq_len_cached = 0
self._freqs_cached = None
def update_freqs_cache(self, seqlen: int) -> None:
if seqlen > self._seq_len_cached:
seqlen *= 2
self._seq_len_cached = seqlen
self.inv_freq = 1.0 / (
self.theta
** (
torch.arange(
0, self.dim, 2, dtype=torch.float, device=self.inv_freq.device
)
/ self.dim
)
)
seq = torch.arange(
seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype
)
freqs = torch.outer(seq, self.inv_freq)
self._freqs_cached = freqs
def forward(self, seqlen: int) -> torch.Tensor:
self.update_freqs_cache(seqlen)
return self._freqs_cached[:seqlen]
class Qwen2_5_VisionTransformer(nn.Module):
def __init__(
self,
vision_config: Qwen2_5_VLVisionConfig,
norm_eps: float = 1e-6,
quant_config: Optional[QuantizationConfig] = None,
) -> None:
super().__init__()
patch_size: int = vision_config.patch_size
temporal_patch_size: int = vision_config.temporal_patch_size
spatial_merge_size: int = vision_config.spatial_merge_size
self.spatial_merge_size = spatial_merge_size
self.spatial_merge_unit: int = spatial_merge_size * spatial_merge_size
in_chans: int = vision_config.in_chans
hidden_size: int = vision_config.hidden_size
depth: int = vision_config.depth
num_heads: int = vision_config.num_heads
self.fullatt_block_indexes = vision_config.fullatt_block_indexes
self.window_size = vision_config.window_size
self.patch_size = vision_config.patch_size
mlp_hidden_size: int = vision_config.intermediate_size
self.patch_embed = Qwen2_5_VisionPatchEmbed(
patch_size=patch_size,
temporal_patch_size=temporal_patch_size,
in_chans=in_chans,
embed_dim=hidden_size,
)
norm_layer = partial(nn.LayerNorm, eps=norm_eps)
head_dim = hidden_size // num_heads
self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2)
self.blocks = nn.ModuleList(
[
Qwen2_5_VisionBlock(
dim=hidden_size,
intermediate_dim=mlp_hidden_size,
num_heads=num_heads,
hidden_act=vision_config.hidden_act,
norm_layer=norm_layer,
attn_implementation="sdpa",
quant_config=quant_config,
)
for _ in range(depth)
]
)
self.merger = Qwen2_5_VisionPatchMerger(
dim=vision_config.out_hidden_size,
context_dim=hidden_size,
spatial_merge_size=spatial_merge_size,
quant_config=quant_config,
)
def get_window_index(self, grid_thw):
window_index: list = []
cu_window_seqlens: list = [0]
window_index_id = 0
vit_merger_window_size = (
self.window_size // self.spatial_merge_size // self.patch_size
)
for grid_t, grid_h, grid_w in grid_thw:
llm_grid_h, llm_grid_w = (
grid_h // self.spatial_merge_size,
grid_w // self.spatial_merge_size,
)
index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(
grid_t, llm_grid_h, llm_grid_w
)
pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size
pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size
num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size
num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size
index_padded = F.pad(index, (0, pad_w, 0, pad_h), "constant", -100)
index_padded = index_padded.reshape(
grid_t,
num_windows_h,
vit_merger_window_size,
num_windows_w,
vit_merger_window_size,
)
index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape(
grid_t,
num_windows_h * num_windows_w,
vit_merger_window_size,
vit_merger_window_size,
)
seqlens = (index_padded != -100).sum([2, 3]).reshape(-1)
index_padded = index_padded.reshape(-1)
index_new = index_padded[index_padded != -100]
window_index.append(index_new + window_index_id)
cu_seqlens_tmp = (
seqlens.cumsum(0) * self.spatial_merge_unit + cu_window_seqlens[-1]
)
cu_window_seqlens.extend(cu_seqlens_tmp.tolist())
window_index_id += (grid_t * llm_grid_h * llm_grid_w).item()
window_index = torch.cat(window_index, dim=0)
return window_index, cu_window_seqlens
@property
def dtype(self) -> torch.dtype:
return self.blocks[0].mlp.gate_proj.weight.dtype
@property
def device(self) -> torch.device:
return self.blocks[0].mlp.gate_proj.weight.device
def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor:
pos_ids = []
for t, h, w in grid_thw:
hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
hpos_ids = (
hpos_ids.reshape(
h // self.spatial_merge_size,
self.spatial_merge_size,
w // self.spatial_merge_size,
self.spatial_merge_size,
)
.permute(0, 2, 1, 3)
.flatten()
)
wpos_ids = (
wpos_ids.reshape(
h // self.spatial_merge_size,
self.spatial_merge_size,
w // self.spatial_merge_size,
self.spatial_merge_size,
)
.permute(0, 2, 1, 3)
.flatten()
)
pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
pos_ids = torch.cat(pos_ids, dim=0)
max_grid_size = grid_thw[:, 1:].max()
rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
return rotary_pos_emb
def forward(
self,
x: torch.Tensor,
grid_thw: torch.Tensor,
) -> torch.Tensor:
# patchify
x = x.to(device=self.device, dtype=self.dtype)
x = self.patch_embed(x)
# compute position embedding
rotary_pos_emb = self.rot_pos_emb(grid_thw)
window_index, cu_window_seqlens = self.get_window_index(grid_thw)
cu_window_seqlens = torch.tensor(
cu_window_seqlens,
device=x.device,
dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
)
cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens)
seq_len, _ = x.size()
x = x.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
x = x[window_index, :, :]
x = x.reshape(seq_len, -1)
rotary_pos_emb = rotary_pos_emb.reshape(
seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1
)
rotary_pos_emb = rotary_pos_emb[window_index, :, :]
rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
# compute cu_seqlens
cu_seqlens = torch.repeat_interleave(
grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
).cumsum(dim=0, dtype=torch.int32)
cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0)
# transformers
x = x.unsqueeze(1)
for layer_num, blk in enumerate(self.blocks):
if layer_num in self.fullatt_block_indexes:
cu_seqlens_now = cu_seqlens
else:
cu_seqlens_now = cu_window_seqlens
x = blk(x, cu_seqlens=cu_seqlens_now, rotary_pos_emb=rotary_pos_emb)
# adapter
x = self.merger(x)
reverse_indices = torch.argsort(window_index)
x = x[reverse_indices, :]
return x
cached_get_processor = lru_cache(get_processor)
class Qwen2_5_VLForConditionalGeneration(nn.Module):
def __init__(
self,
config: Qwen2VLConfig,
quant_config: Optional[QuantizationConfig] = None,
) -> None:
super().__init__()
self.config = config
self.visual = Qwen2_5_VisionTransformer(
config.vision_config,
norm_eps=getattr(config, "rms_norm_eps", 1e-6),
# NOTE: Qwen2-VL vision encoder does not support any
# quantization method now.
quant_config=None,
)
self.model = Qwen2Model(config, quant_config)
if config.tie_word_embeddings:
self.lm_head = self.model.embed_tokens
else:
self.lm_head = ParallelLMHead(
config.vocab_size, config.hidden_size, quant_config=quant_config
)
self.logits_processor = LogitsProcessor(config)
self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
def calculate_num_image_tokens(self, image_grid_thw: Tuple[int, int, int]):
processor = cached_get_processor(self.config._name_or_path)
grid_t, grid_h, grid_w = image_grid_thw
num_image_tokens = (
grid_t
* grid_h
* grid_w
// processor.image_processor.merge_size
// processor.image_processor.merge_size
)
return num_image_tokens
def pad_input_ids(self, input_ids: List[int], image_inputs: ImageInputs):
new_input_ids = []
last_idx = 0
image_idx = -1
image_inputs.image_offsets = []
# Get all special token IDs
im_start_id = image_inputs.im_start_id
im_end_id = image_inputs.im_end_id
# Find all start and end positions for both types
start_indices = [i for i, x in enumerate(input_ids) if x == im_start_id]
end_indices = [i for i, x in enumerate(input_ids) if x == im_end_id]
if len(start_indices) != len(end_indices):
return input_ids
# Process each region (both image and slice)
for start_idx, end_idx in zip(start_indices, end_indices):
# Add non-image tokens before this region
new_input_ids.extend(input_ids[last_idx : start_idx + 1])
is_image_start = input_ids[start_idx] == im_start_id
if is_image_start:
image_inputs.image_offsets += [start_idx]
image_idx += 1
num_tokens = end_idx - start_idx - 1 # exclude start and end tokens
# Generate pad_ids
pad_values = [image_inputs.pad_values[image_idx]]
pad_ids = pad_values * ((num_tokens + len(pad_values)) // len(pad_values))
pad_ids = pad_ids[:num_tokens]
# Add pad_ids
new_input_ids.extend(pad_ids)
# Update last_idx to after end token
last_idx = end_idx
# Add remaining tokens after last region
new_input_ids.extend(input_ids[last_idx:])
assert len(input_ids) == len(new_input_ids)
return new_input_ids
def _process_image_input(self, image_input: Qwen2VLImageInputs) -> torch.Tensor:
pixel_values = image_input["pixel_values"].type(self.visual.dtype)
image_embeds = self.visual(pixel_values, grid_thw=image_input["image_grid_thw"])
return image_embeds
def _process_video_input(self, video_input: Qwen2VLVideoInputs) -> torch.Tensor:
pixel_values_videos = video_input["pixel_values_videos"].type(self.visual.dtype)
video_embeds = self.visual(
pixel_values_videos, grid_thw=video_input["video_grid_thw"]
)
return video_embeds
def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
forward_batch: ForwardBatch,
get_embedding: bool = False,
):
"""Run forward pass for Qwen2_5-VL.
Args:
input_ids: Flattened (concatenated) input_ids corresponding to a
batch.
positions: Flattened (concatenated) position ids corresponding to a
batch.
**NOTE**: If mrope is enabled (default setting for Qwen2-VL
opensource models), the shape will be `(3, seq_len)`,
otherwise it will be `(seq_len,)`.
(Use input_metadata.mrope_positions to replace it)
"""
if getattr(self.config, "rope_scaling", {}).get("type", None) == "mrope":
positions = forward_batch.mrope_positions
image_inputs = None
if forward_batch.image_inputs is not None:
image_inputs = [
img for img in forward_batch.image_inputs if img is not None
]
if (
forward_batch.forward_mode.is_decode()
or image_inputs is None
or len(image_inputs) == 0
):
inputs_embeds = self.model.embed_tokens(input_ids)
else:
if getattr(self.config, "rope_scaling", {}).get("type", None) == "mrope":
assert positions.ndim == 2 and positions.size(0) == 3, (
"multimodal section rotary embedding requires "
f"(3, seq_len) positions, but got {positions.size()}"
)
# Clamp input ids. This is because the input_ids for the image tokens are
# filled with the hash values of the image for the prefix matching in the radix attention.
# Their values are useless because their embeddings will be replaced by vision embeddings anyway.
input_ids.clamp_(min=0, max=self.config.vocab_size - 1)
# [B, s, hidden_size]
inputs_embeds = self.model.embed_tokens(input_ids)
extend_start_loc_cpu = forward_batch.extend_start_loc.cpu().numpy()
prefix_lens_cpu = forward_batch.extend_prefix_lens_cpu
for i, image in enumerate(forward_batch.image_inputs):
if image is None:
continue
start_idx = extend_start_loc_cpu[i]
prefix_len = prefix_lens_cpu[i]
pixel_values = image.pixel_values.clone().detach().requires_grad_(False)
image_grid_thws = torch.tensor(
np.array(image.image_grid_thws), device="cuda"
)
image_offsets = image.image_offsets
image_input = Qwen2VLImageInputs(
pixel_values=pixel_values, image_grid_thw=image_grid_thws
)
image_embeds = self._process_image_input(image_input)
image_embeds_offset = 0
for idx, image_offset in enumerate(image_offsets):
if image_offset < prefix_len:
continue
num_image_tokens = self.calculate_num_image_tokens(
image_grid_thws[idx]
)
left_idx = start_idx + (image_offset - prefix_len)
right_idx = left_idx + num_image_tokens
tp_size = get_tensor_model_parallel_world_size()
hidden_size = image_embeds.shape[-1]
if hidden_size % tp_size != 0:
padding_size = tp_size - (hidden_size % tp_size)
image_embeds = F.pad(image_embeds, (0, padding_size))
inputs_embeds = F.pad(inputs_embeds, (0, padding_size))
hidden_chunk_size = image_embeds.shape[-1] // tp_size
rank = get_tensor_model_parallel_rank()
start_dim = rank * hidden_chunk_size
end_dim = (rank + 1) * hidden_chunk_size
inputs_embeds[left_idx:right_idx, ..., start_dim:end_dim] = (
image_embeds[
image_embeds_offset : image_embeds_offset
+ num_image_tokens,
...,
start_dim:end_dim,
]
)
image_embeds_offset += num_image_tokens
input_ids = None
hidden_states = self.model(
input_ids=input_ids,
positions=positions,
forward_batch=forward_batch,
input_embeds=inputs_embeds,
)
if not get_embedding:
return self.logits_processor(
input_ids, hidden_states, self.lm_head, forward_batch
)
else:
return self.pooler(hidden_states, forward_batch)
def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
("qkv_proj", "q_proj", "q"),
("qkv_proj", "k_proj", "k"),
("qkv_proj", "v_proj", "v"),
("gate_up_proj", "up_proj", 1),
("gate_up_proj", "gate_proj", 0),
]
params_dict = dict(self.named_parameters(remove_duplicate=False))
for name, loaded_weight in weights:
if "rotary_emb.inv_freq" in name:
continue
for param_name, weight_name, shard_id in stacked_params_mapping:
if weight_name not in name:
continue
if "visual" in name:
continue
name = name.replace(weight_name, param_name)
# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict:
continue
param = params_dict[name]
weight_loader = param.weight_loader
weight_loader(param, loaded_weight, shard_id)
break
else:
if "visual" in name and "qkv.weight" in name:
visual_num_heads = self.config.vision_config.num_heads
visual_embed_dim = self.config.vision_config.hidden_size
head_size = visual_embed_dim // visual_num_heads
loaded_weight = loaded_weight.view(
3, visual_num_heads, head_size, visual_embed_dim
)
loaded_weight = loaded_weight.transpose(0, 1)
loaded_weight = loaded_weight.reshape(-1, visual_embed_dim)
elif "visual" in name and "qkv.bias" in name:
visual_num_heads = self.config.vision_config.num_heads
visual_embed_dim = self.config.vision_config.hidden_size
head_size = visual_embed_dim // visual_num_heads
loaded_weight = loaded_weight.view(3, visual_num_heads, head_size)
loaded_weight = loaded_weight.transpose(0, 1)
loaded_weight = loaded_weight.reshape(-1)
if "visual" in name:
# adapt to VisionAttention
name = name.replace(r"attn.qkv.", r"attn.qkv_proj.")
try:
# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict:
continue
param = params_dict[name]
except KeyError:
print(params_dict.keys())
raise
weight_loader = getattr(param, "weight_loader", default_weight_loader)
weight_loader(param, loaded_weight)
EntryClass = [Qwen2_5_VLForConditionalGeneration]
AutoModel.register(Qwen2_5_VLConfig, Qwen2_5_VLForConditionalGeneration)
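Two details of the new model class are worth spelling out: calculate_num_image_tokens divides the raw patch grid by merge_size twice (the 2x2 spatial merge), and pad_input_ids overwrites the span between each vision start/end marker with the image's pad value so radix-cache prefix matching can tell different images apart. A small worked example with illustrative numbers (an editor's sketch, not part of the commit):

```python
# Illustrative arithmetic for calculate_num_image_tokens; merge_size is assumed
# to be 2, and the grid values are examples rather than a real request.
grid_t, grid_h, grid_w = 1, 92, 138      # image_grid_thw for one image
merge_size = 2
num_image_tokens = grid_t * grid_h * grid_w // merge_size // merge_size
# 1 * 92 * 138 // 2 // 2 = 3174 placeholder tokens between the vision markers

# pad_input_ids then rewrites that span in place (sketch, not the real call):
# [..., im_start_id, pad_i, pad_i, ..., pad_i, im_end_id, ...]
#                    ^ 3174 copies of image_inputs.pad_values[i]
```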

View File

@@ -31,8 +31,9 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange
+from transformers import Qwen2VLConfig
+from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLVisionConfig
-from sglang.srt.configs import Qwen2VLConfig, Qwen2VLVisionConfig
 from sglang.srt.hf_transformers_utils import get_processor
 from sglang.srt.layers.activation import QuickGELU
 from sglang.srt.layers.attention.vision import VisionAttention

View File

@@ -252,6 +252,18 @@ class TestOpenAIVisionServer(unittest.TestCase):
print("-" * 30) print("-" * 30)
# Add assertions to validate the video response # Add assertions to validate the video response
assert "iPod" in video_response or "device" in video_response, video_response
assert (
"man" in video_response
or "person" in video_response
or "individual" in video_response
), video_response
assert (
"present" in video_response
or "examine" in video_response
or "display" in video_response
)
assert "black" in video_response or "dark" in video_response
self.assertIsNotNone(video_response) self.assertIsNotNone(video_response)
self.assertGreater(len(video_response), 0) self.assertGreater(len(video_response), 0)
@@ -366,6 +378,30 @@ class TestQWen2VLServer(TestOpenAIVisionServer):
cls.base_url += "/v1" cls.base_url += "/v1"
class TestQWen2_5_VLServer(TestOpenAIVisionServer):
@classmethod
def setUpClass(cls):
cls.model = "Qwen/Qwen2.5-VL-7B-Instruct"
cls.base_url = DEFAULT_URL_FOR_TEST
cls.api_key = "sk-123456"
cls.process = popen_launch_server(
cls.model,
cls.base_url,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
api_key=cls.api_key,
other_args=[
"--chat-template",
"qwen2-vl",
# FIXME: workaround to chunked prefill within image embeds
"--chunked-prefill-size",
"10000",
"--mem-fraction-static",
"0.4",
],
)
cls.base_url += "/v1"
class TestQWen2VLServerContextLengthIssue(unittest.TestCase):
@classmethod
def setUpClass(cls):