################################################################################
# Copyright (c) 2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
################################################################################

# SPDX-License-Identifier: Apache-2.0

# Adapted from
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py
# Copyright 2025 The vLLM team.
# Copyright 2025 The Qwen Team.
# Copyright 2025 The HuggingFace Inc. team.
# All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only Qwen3-VL model compatible with HuggingFace weights."""
|
|
from collections.abc import Iterable
|
|
from typing import Optional, Union
|
|
|
|
import torch
|
|
|
|
from vllm.distributed import get_pp_group
|
|
from vllm.model_executor.layers.linear import ReplicatedLinear
|
|
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
|
from vllm.model_executor.models.qwen3_vl import (Qwen3_VisionBlock,
|
|
Qwen3_VisionPatchEmbed,
|
|
Qwen3_VisionTransformer,
|
|
Qwen3LLMModel)
|
|
from vllm.sequence import IntermediateTensors
|
|
from vllm_br import envs
|
|
from .br_utils import convBB
|
|
|
|
|
|
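
# The functions below re-implement selected methods of vLLM's Qwen3-VL model
# classes and are then bound onto those classes in place (see assignments such
# as `Qwen3_VisionPatchEmbed.__init__ = ...`). Importing this module is what
# activates the overrides; how and when vllm_br imports it (plugin
# registration, platform init, etc.) is assumed to be handled elsewhere.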


def Qwen3_VisionPatchEmbed__init__(
    self,
    patch_size: int = 14,
    temporal_patch_size: int = 2,
    in_channels: int = 3,
    hidden_size: int = 1152,
) -> None:
    super(Qwen3_VisionPatchEmbed, self).__init__()
    self.patch_size = patch_size
    self.temporal_patch_size = temporal_patch_size
    self.hidden_size = hidden_size

    # Project each flattened patch (channels x temporal x spatial) straight to
    # the hidden size with a ReplicatedLinear instead of the upstream
    # convolutional projection.
    self.proj = ReplicatedLinear(in_channels * temporal_patch_size *
                                 patch_size * patch_size,
                                 hidden_size,
                                 bias=True,
                                 prefix="")


Qwen3_VisionPatchEmbed.__init__ = Qwen3_VisionPatchEmbed__init__
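
# Illustrative numbers for the projection above, using the default arguments:
# in_channels * temporal_patch_size * patch_size**2 = 3 * 2 * 14 * 14 = 1176,
# so `proj` maps [num_patches, 1176] -> [num_patches, 1152]. The real sizes
# come from the vision config at runtime.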


def Qwen3_VisionPatchEmbed_forward(self, x: torch.Tensor) -> torch.Tensor:
    # Add a singleton batch dimension for the linear projection, then collapse
    # the result back to [num_patches, hidden_size].
    x = x.unsqueeze(0)
    L = x.shape[-2]
    x = self.proj(x)[0].view(L, self.hidden_size)

    if envs.VLLM_BR_DEVICE_SPC_NUM > 16:
        x = convBB(x)

    return x


Qwen3_VisionPatchEmbed.forward = Qwen3_VisionPatchEmbed_forward
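
# Rough shape flow for the patched forward (illustrative): x arrives as
# [num_patches, flattened_patch_dim], gains a singleton batch dimension for
# the ReplicatedLinear call, and is viewed back to [num_patches, hidden_size].
# The convBB call from br_utils runs only when envs.VLLM_BR_DEVICE_SPC_NUM > 16
# and is assumed to be a Biren-specific layout/packing transform; its exact
# semantics are not documented here.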


def Qwen3_VisionBlock_forward(
    self,
    x: torch.Tensor,
    cu_seqlens: torch.Tensor,
    rotary_pos_emb: torch.Tensor,
    max_seqlen: Optional[int] = None,  # Only used for Flash Attention
    seqlens: Optional[list[int]] = None,  # Only used for xFormers
) -> torch.Tensor:
    # If the leading dimension is not the singleton batch, swap the first two
    # dimensions so the block runs batch-first.
    if x.shape[0] != 1:
        x = x.permute(1, 0, 2).contiguous()

    x = x + self.attn(self.norm1(x),
                      cu_seqlens=cu_seqlens,
                      rotary_pos_emb=rotary_pos_emb,
                      max_seqlen=max_seqlen,
                      seqlens=seqlens)
    x = x + self.mlp(self.norm2(x))
    return x


Qwen3_VisionBlock.forward = Qwen3_VisionBlock_forward
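
# The block keeps the standard pre-norm residual structure
# (x + attn(norm1(x)), then x + mlp(norm2(x))); the only apparent deviation
# from the upstream implementation is the batch-first permute at the top,
# presumably to match the layout expected by the Biren attention kernels.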


def Qwen3_VisionTransformer_load_weights(
        self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("attn.qkv.", "attn.q.", "q"),
        ("attn.qkv.", "attn.k.", "k"),
        ("attn.qkv.", "attn.v.", "v"),
    ]
    params_dict = dict(self.named_parameters(remove_duplicate=False))
    loaded_params: set[str] = set()

    for name, loaded_weight in weights:
        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            if weight_name not in name:
                continue
            name = name.replace(weight_name, param_name)

            param = params_dict[name]
            weight_loader = param.weight_loader
            weight_loader(param, loaded_weight, shard_id)
            break
        else:
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            # Flatten the checkpoint's patch-embed projection weight so it
            # matches the linear layer created in the patched __init__ above.
            if name == 'patch_embed.proj.weight':
                loaded_weight = loaded_weight.reshape(loaded_weight.shape[0],
                                                      -1).contiguous()
            weight_loader(param, loaded_weight)
            # Keep norm weights in float32.
            if "norm.weight" in name:
                param.data = param.data.to(torch.float32)
        loaded_params.add(name)
    return loaded_params


Qwen3_VisionTransformer.load_weights = Qwen3_VisionTransformer_load_weights
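
# Illustrative example of the patch-embed reshape above, assuming the HF
# checkpoint stores patch_embed.proj as a Conv3d-style kernel: a weight of
# shape [1152, 3, 2, 14, 14] is flattened to [1152, 1176], i.e. the
# [out_features, in_features] layout that the ReplicatedLinear created in the
# patched __init__ expects.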


def Qwen3LLMModel_forward(
    self,
    input_ids: torch.Tensor,
    positions: torch.Tensor,
    intermediate_tensors: Optional[IntermediateTensors] = None,
    inputs_embeds: Optional[torch.Tensor] = None,
    # args for deepstack
    deepstack_input_embeds: Optional[IntermediateTensors] = None,
) -> Union[torch.Tensor, IntermediateTensors]:
    if get_pp_group().is_first_rank:
        if inputs_embeds is not None:
            hidden_states = inputs_embeds
        else:
            hidden_states = self.get_input_embeddings(input_ids)
        residual = None
    else:
        assert intermediate_tensors is not None
        hidden_states = intermediate_tensors["hidden_states"]
        residual = intermediate_tensors["residual"]

    # Run the decoder layers batch-first; the extra leading dimension is
    # stripped again before the function returns.
    hidden_states = hidden_states.unsqueeze(0)
    residual = residual.unsqueeze(0) if residual is not None else None

    for layer_idx, layer in enumerate(
            self.layers[self.start_layer:self.end_layer]):
        layer_idx = layer_idx + self.start_layer
        hidden_states, residual = layer(
            positions,
            hidden_states,
            residual,
        )

        # Add the deepstack visual embeddings into the first
        # len(deepstack_input_embeds) decoder layers.
        if deepstack_input_embeds is not None and \
                layer_idx < len(deepstack_input_embeds):
            hidden_states = hidden_states + deepstack_input_embeds[
                f"deepstack_input_embeds_{layer_idx}"].to(
                    hidden_states.device).unsqueeze(0)

    # Hand 2-D tensors to the next pipeline rank; the receiving rank adds its
    # own batch dimension at the top of this function.
    if not get_pp_group().is_last_rank:
        return IntermediateTensors({
            "hidden_states":
            hidden_states.squeeze(0),
            "residual":
            residual.squeeze(0) if residual is not None else None
        })
    hidden_states, _ = self.norm(hidden_states, residual)
    return hidden_states.squeeze(0)


Qwen3LLMModel.forward = Qwen3LLMModel_forward
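
# A minimal activation sketch (hypothetical import path; the real plugin entry
# point may differ). Nothing else is required: once the functions above are
# bound onto the upstream classes, every Qwen3-VL instance vLLM builds
# afterwards picks up the Biren overrides.
#
#   import vllm_br.models.qwen3_vl  # hypothetical module path
#   # from vllm import LLM
#   # llm = LLM(model="Qwen/Qwen3-VL-...")  # now runs with the patched model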