925 lines
36 KiB
Python
925 lines
36 KiB
Python
################################################################################
|
|
# Copyright(c)2020-2025 Shanghai Biren Technology Co., Ltd. All rights reserved.
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
################################################################################
|
|
|
|
import itertools
|
|
from typing import Any, Optional, Tuple, Union
|
|
|
|
import torch
|
|
import torch_br
|
|
from fastcore.basics import patch_to
|
|
from transformers import PretrainedConfig
|
|
|
|
import vllm.model_executor.layers.rotary_embedding
|
|
import vllm.model_executor.models.chatglm
|
|
import vllm.model_executor.models.deepseek_v2
|
|
import vllm_br.envs as br_envs
|
|
from vllm.logger import logger
|
|
from vllm.model_executor.layers.rotary_embedding import (
|
|
_ROPE_DICT, DeepseekScalingRotaryEmbedding, DualChunkRotaryEmbedding,
|
|
DynamicNTKScalingRotaryEmbedding, LinearScalingRotaryEmbedding,
|
|
Llama3RotaryEmbedding, Llama4VisionRotaryEmbedding, MRotaryEmbedding,
|
|
NTKScalingRotaryEmbedding, Phi3LongRoPEScaledRotaryEmbedding,
|
|
RotaryEmbedding, YaRNScalingRotaryEmbedding)
|
|
from vllm.model_executor.layers.rotary_embedding.common import (
|
|
rotate_gptj, rotate_neox, yarn_find_correction_range,
|
|
yarn_linear_ramp_mask)
|
|
from vllm.model_executor.layers.rotary_embedding.deepseek_scaling_rope import (
|
|
yarn_get_mscale)
|
|
from vllm.model_executor.layers.rotary_embedding.mrope import (
|
|
apply_interleaved_rope)
|
|
|
|
|
|
@patch_to(RotaryEmbedding)
def __init__(
    self,
    head_size: int,
    rotary_dim: int,
    max_position_embeddings: int,
    base: int,
    is_neox_style: bool,
    dtype: torch.dtype,
    op_type: str = "Half",  # FIXME: other op type not supported yet
) -> None:
    """Patched ``RotaryEmbedding.__init__`` that builds rope caches for SUPA.

    MRotaryEmbedding and DeepseekScalingRotaryEmbedding keep the upstream
    single concatenated ``cos_sin_cache`` buffer; every other rope subclass
    gets separate float32 ``sin_cache``/``cos_cache`` buffers in the layout
    consumed by ``torch_br.supa_rope_infer_v2``.
    """
    logger.info('[Patch] RotaryEmbedding use SUPA RoPE')
    super(RotaryEmbedding, self).__init__()  # type: ignore
    self.head_size = head_size
    self.rotary_dim = rotary_dim
    self.max_position_embeddings = max_position_embeddings
    self.base = base
    self.is_neox_style = is_neox_style
    self.dtype = dtype
    self.op_type = op_type  # FIXME: other op type not supported yet

    if isinstance(self, MRotaryEmbedding):
        cache = self._compute_cos_sin_cache()
        cache = cache.to(dtype)
        device = torch.cuda.current_device()
        cache = cache.to(device)
        self.cos_sin_cache: torch.Tensor  # type: ignore
        self.register_buffer("cos_sin_cache", cache, persistent=False)
    elif isinstance(self, DeepseekScalingRotaryEmbedding):
        # BUGFIX: removed a redundant re-assignment of head_size/rotary_dim/
        # max_position_embeddings/base/is_neox_style/dtype that duplicated
        # the assignments above verbatim (dead code).
        # NOTE(review): this branch uses torch.supa.current_device() while
        # the others use torch.cuda.current_device() — confirm both resolve
        # to the same device on this stack.
        cache = self._compute_cos_sin_cache()
        cache = cache.to(dtype)
        device = torch.supa.current_device()
        cache = cache.to(device)
        self.cos_sin_cache: torch.Tensor  # type: ignore
        self.register_buffer("cos_sin_cache", cache, persistent=False)
    else:
        # Split caches, stored in float32 for the SUPA kernel regardless of
        # the model dtype.
        sin_cache, cos_cache = self._compute_cos_sin_cache()
        sin_cache = sin_cache.to(torch.float32)
        cos_cache = cos_cache.to(torch.float32)
        device = torch.cuda.current_device()
        sin_cache = sin_cache.to(device)
        cos_cache = cos_cache.to(device)
        self.register_buffer("sin_cache", sin_cache, persistent=False)
        self.register_buffer("cos_cache", cos_cache, persistent=False)
|
|
|
|
|
|
@patch_to(RotaryEmbedding)
def _compute_cos_sin_cache(self) -> Tuple[torch.Tensor, torch.Tensor]:
    """Build the rope lookup tables on the CPU.

    Returns a single concatenated ``[cos | sin]`` cache for
    MRotaryEmbedding, otherwise a ``(sin, cos)`` pair laid out for the
    SUPA kernel of the configured ``op_type``.
    """
    with torch.device('cpu'):
        inv_freq = self._compute_inv_freq(self.base)
        t = torch.arange(self.max_position_embeddings, dtype=torch.float)

    angles = torch.einsum("i,j -> ij", t, inv_freq)

    if isinstance(self, MRotaryEmbedding):
        return torch.cat((angles.cos(), angles.sin()), dim=-1)

    if self.op_type == "Half" or self.op_type == "TeleChat":
        # "Half" layout: the angle table is tiled side by side.
        doubled = angles.repeat(1, 2)
        return doubled.sin(), doubled.cos()

    # Interleaved layout: each angle appears twice; the sign of every
    # even-indexed entry is flipped before taking sin.
    interleaved = angles.repeat_interleave(2, dim=-1)
    sign = (torch.arange(interleaved.numel()) % 2 * 2 -
            1).reshape_as(interleaved)
    return (interleaved * sign).sin(), interleaved.cos()
|
|
|
|
|
|
@patch_to(RotaryEmbedding)
def forward_oot(
    self,
    positions: torch.Tensor,
    query: torch.Tensor,
    key: torch.Tensor,
    offsets: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Apply rope to query/key through the fused SUPA kernel."""
    return torch_br.supa_rope_infer_v2(
        query,
        key,
        self.sin_cache,
        self.cos_cache,
        positions,
        self.head_size,
        rope_type=self.op_type,
        rotary_size=self.rotary_dim,
    )
|
|
|
|
|
|
@patch_to(RotaryEmbedding)
def enabled(cls) -> bool:
    # Force-enable the custom op so vLLM dispatches to forward_oot above.
    # NOTE(review): patched without cls_method=True, so this binds like an
    # instance method ("cls" receives the instance when called on one) —
    # confirm callers never invoke it on the bare class.
    return True
|
|
|
|
|
|
class SupaDeepseekScalingRotaryEmbedding(RotaryEmbedding):
    """DeepSeek YaRN-scaled rope with caches precomputed in the interleaved
    (sin, cos) layout expected by the SUPA fused rope kernel.

    Selected by ``get_rope`` for rope_scaling type ``deepseek_yarn_supa``.
    """

    def __init__(
        self,
        head_size: int,
        rotary_dim: int,
        max_position_embeddings: int,
        base: float,
        is_neox_style: bool,
        scaling_factor: float,
        dtype: torch.dtype,
        *,
        extrapolation_factor: float = 1,
        attn_factor: float = 1,
        beta_fast: int = 32,
        beta_slow: int = 1,
        mscale: float = 1,
        mscale_all_dim: float = 0,
    ) -> None:
        # YaRN attributes must be set before super().__init__, which calls
        # _compute_cos_sin_cache and reads them.
        self.scaling_factor = scaling_factor
        self.extrapolation_factor = extrapolation_factor
        self.attn_factor = attn_factor
        self.beta_fast = beta_fast
        self.beta_slow = beta_slow
        # Get n-d magnitude scaling corrected for interpolation.
        self.mscale = float(
            yarn_get_mscale(self.scaling_factor, float(mscale)) /
            yarn_get_mscale(self.scaling_factor, float(mscale_all_dim)) *
            attn_factor)
        super().__init__(head_size, rotary_dim, max_position_embeddings, base,
                         is_neox_style, dtype)

    def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor:
        """Blend interpolated and extrapolated inverse frequencies (YaRN)."""
        with torch.device('cpu'):
            pos_freqs = self.base**(torch.arange(
                0, self.rotary_dim, 2, dtype=torch.float, device="cpu") /
                                    self.rotary_dim)
            inv_freq_extrapolation = 1.0 / pos_freqs
            inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs)

            low, high = yarn_find_correction_range(
                self.beta_fast, self.beta_slow, self.rotary_dim, self.base,
                self.max_position_embeddings)
            # Get n-d rotational scaling corrected for extrapolation
            inv_freq_mask = (1 - yarn_linear_ramp_mask(
                low, high, self.rotary_dim // 2,
                dtype=torch.float)) * self.extrapolation_factor
            inv_freq = inv_freq_interpolation * (
                1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask
            return inv_freq

    def _compute_cos_sin_cache(self) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return the mscale-corrected ``(sin, cos)`` tables.

        Interleaved layout: every angle appears twice along the last dim,
        with the sign of each even-indexed entry flipped before sin.
        """
        with torch.device('cpu'):
            inv_freq = self._compute_inv_freq(self.scaling_factor)
            t = torch.arange(self.max_position_embeddings *
                             self.scaling_factor,
                             dtype=torch.float)

            freqs = torch.einsum("i,j -> ij", t, inv_freq)
            cos_freqs = freqs.repeat_interleave(2, dim=-1)
            cos = (cos_freqs.cos() * self.mscale)
            # scales alternates [-1, 1, -1, 1, ...].
            scales = torch.arange(cos_freqs.numel()) % 2 * 2 - 1
            sin_freqs = cos_freqs * scales.reshape_as(cos_freqs)
            sin = (sin_freqs.sin() * self.mscale)
            return sin, cos
|
|
|
|
|
|
@patch_to(DeepseekScalingRotaryEmbedding)
def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor:
    """YaRN inverse-frequency blend, computed on the CPU."""
    with torch.device('cpu'):
        exponents = torch.arange(
            0, self.rotary_dim, 2, dtype=torch.float,
            device="cpu") / self.rotary_dim
        pos_freqs = self.base**exponents
        extrapolated = 1.0 / pos_freqs
        interpolated = 1.0 / (scaling_factor * pos_freqs)

        low, high = yarn_find_correction_range(self.beta_fast, self.beta_slow,
                                               self.rotary_dim, self.base,
                                               self.max_position_embeddings)
        # Ramp from extrapolation (high frequency) to interpolation
        # (low frequency) between the correction bounds.
        ramp = yarn_linear_ramp_mask(low, high, self.rotary_dim // 2,
                                     dtype=torch.float)
        blend_mask = (1 - ramp) * self.extrapolation_factor
        return interpolated * (1 - blend_mask) + extrapolated * blend_mask
|
|
|
|
|
|
@patch_to(DeepseekScalingRotaryEmbedding)
def _compute_cos_sin_cache(self) -> torch.Tensor:
    """Build the concatenated ``[cos | sin]`` table, mscale-corrected,
    on the CPU."""
    with torch.device('cpu'):
        inv_freq = self._compute_inv_freq(self.scaling_factor)
        positions = torch.arange(
            self.max_position_embeddings * self.scaling_factor,
            dtype=torch.float32)
        angles = torch.einsum("i,j -> ij", positions, inv_freq)
        return torch.cat(
            (angles.cos() * self.mscale, angles.sin() * self.mscale), dim=-1)
|
|
|
|
|
|
@patch_to(DeepseekScalingRotaryEmbedding)
def forward_native(
    self,
    positions: torch.Tensor,
    query: torch.Tensor,
    key: Optional[torch.Tensor] = None,
    offsets: Optional[torch.Tensor] = None,
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
    """PyTorch-native implementation equivalent to forward()."""
    assert key is not None
    self._match_cos_sin_cache_dtype(query)
    # Only the first rotary_dim channels of each head are rotated; the rest
    # pass through unchanged.
    query_rot = query[..., :self.rotary_dim]
    key_rot = key[..., :self.rotary_dim]
    if self.rotary_dim < self.head_size:
        query_pass = query[..., self.rotary_dim:]
        key_pass = key[..., self.rotary_dim:]

    cos_sin = self.cos_sin_cache[
        torch.add(positions, offsets) if offsets is not None else positions]
    cos, sin = cos_sin.chunk(2, dim=-1)
    if self.is_neox_style:
        # NOTE(woosuk): Here we assume that the positions tensor has the
        # shape [batch_size, seq_len].
        cos = cos.repeat(1, 1, 2).unsqueeze(-2)
        sin = sin.repeat(1, 1, 2).unsqueeze(-2)
    else:
        # repeat_interleave runs on the CPU and the result is moved back —
        # NOTE(review): presumably a workaround for a missing/slow SUPA
        # kernel; confirm before simplifying.
        device = torch.supa.current_device()
        cos = cos.to('cpu')
        sin = sin.to('cpu')
        cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
        sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)
        cos = cos.to(device)
        sin = sin.to(device)

    rotate_fn = rotate_neox if self.is_neox_style else rotate_gptj
    device = query_rot.device
    # Batches larger than 1024 tokens are rotated on the CPU and copied
    # back.  NOTE(review): looks like a device-side workaround — confirm
    # the threshold and whether it is still needed.
    if query.shape[0] > 1024:
        query_rot = query_rot.to('cpu')
        key_rot = key_rot.to('cpu')
        cos = cos.to('cpu')
        sin = sin.to('cpu')
    query_rot = query_rot * cos + rotate_fn(query_rot) * sin
    key_rot = key_rot * cos + rotate_fn(key_rot) * sin
    if query.shape[0] > 1024:
        query_rot = query_rot.to(device)
        key_rot = key_rot.to(device)

    if self.rotary_dim < self.head_size:
        query = torch.cat((query_rot, query_pass), dim=-1)
        key = torch.cat((key_rot, key_pass), dim=-1)
    else:
        query = query_rot
        key = key_rot
    return query, key
|
|
|
|
|
|
@patch_to(DeepseekScalingRotaryEmbedding)
def forward_oot(
    self,
    positions: torch.Tensor,
    query: torch.Tensor,
    key: torch.Tensor,
    offsets: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Out-of-tree entry point: defer to the patched forward_native."""
    return self.forward_native(positions, query, key, offsets)
|
|
|
|
|
|
@patch_to(YaRNScalingRotaryEmbedding)
def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor:
    """YaRN inverse-frequency blend for the generic YaRN rope, on the CPU."""
    with torch.device('cpu'):
        exponents = torch.arange(0, self.rotary_dim, 2,
                                 dtype=torch.float) / self.rotary_dim
        pos_freqs = self.base**exponents
        extrapolated = 1.0 / pos_freqs
        interpolated = 1.0 / (scaling_factor * pos_freqs)

        low, high = yarn_find_correction_range(self.beta_fast, self.beta_slow,
                                               self.rotary_dim, self.base,
                                               self.max_position_embeddings)
        # Ramp from extrapolation (high frequency) to interpolation
        # (low frequency) between the correction bounds.
        ramp = yarn_linear_ramp_mask(low, high, self.rotary_dim // 2,
                                     dtype=torch.float)
        blend_mask = (1 - ramp) * self.extrapolation_factor
        return interpolated * (1 - blend_mask) + extrapolated * blend_mask
|
|
|
|
|
|
@patch_to(YaRNScalingRotaryEmbedding)
def _compute_cos_sin_cache(self) -> Tuple[torch.Tensor, torch.Tensor]:
    """Return the mscale-corrected ``(sin, cos)`` tables in the SUPA "Half"
    layout (angle table tiled side by side)."""
    with torch.device('cpu'):
        inv_freq = self._compute_inv_freq(self.scaling_factor)
        t = torch.arange(self.max_position_embeddings * self.scaling_factor,
                         dtype=torch.float32)

        freqs = torch.einsum("i,j -> ij", t, inv_freq)
        freqs = freqs.repeat(1, 2)
        cos = freqs.cos() * self.mscale
        sin = freqs.sin() * self.mscale
        return sin, cos
|
|
|
|
|
|
def dtnamicNTK_compute_cos_sin_cache(
        self) -> Tuple[torch.Tensor, torch.Tensor]:
    """Compute the ``(sin, cos)`` rope tables for DynamicNTK scaling.

    The function name keeps the historical "dtnamic" typo because it is
    bound by name onto DynamicNTKScalingRotaryEmbedding elsewhere in this
    module.
    """
    with torch.device('cpu'):
        inv_freq = self._compute_inv_freq(self.base)
        t = torch.arange(self.max_position_embeddings, dtype=torch.float)

    angles = torch.einsum("i,j -> ij", t, inv_freq)

    if self.op_type == "Half" or self.op_type == "TeleChat":
        # "Half" layout: the angle table is tiled side by side.
        doubled = angles.repeat(1, 2)
        return doubled.sin(), doubled.cos()

    # Interleaved layout: each angle appears twice; the sign of every
    # even-indexed entry is flipped before taking sin.
    interleaved = angles.repeat_interleave(2, dim=-1)
    sign = (torch.arange(interleaved.numel()) % 2 * 2 -
            1).reshape_as(interleaved)
    return (interleaved * sign).sin(), interleaved.cos()
|
|
|
|
|
|
def dynamicNTKScaling_rope_forward(
    self,
    positions: torch.Tensor,
    query: torch.Tensor,
    key: torch.Tensor,
    offsets: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Fused SUPA rope forward for DynamicNTK scaling.

    Selects the "MRope" kernel variant when the query and key head dims
    differ; otherwise uses the configured op type.
    """
    kernel_rope_type = ("MRope"
                        if query.shape[-1] != key.shape[-1] else self.op_type)
    return torch_br.supa_rope_infer_v2(query,
                                       key,
                                       self.sin_cache,
                                       self.cos_cache,
                                       positions,
                                       self.head_size,
                                       rope_type=kernel_rope_type)
|
|
|
|
|
|
# Monkeypatch DynamicNTKScalingRotaryEmbedding to use the SUPA cache layout
# and fused forward defined above.  ("dtnamic" is a typo preserved from the
# helper's definition.)
DynamicNTKScalingRotaryEmbedding._compute_cos_sin_cache = dtnamicNTK_compute_cos_sin_cache
DynamicNTKScalingRotaryEmbedding.forward = dynamicNTKScaling_rope_forward
|
|
|
|
|
|
def _apply_rotary_emb_torch(
|
|
x: torch.Tensor,
|
|
cos: torch.Tensor,
|
|
sin: torch.Tensor,
|
|
is_neox_style: bool,
|
|
) -> torch.Tensor:
|
|
cos = cos.unsqueeze(-2).to(x.dtype)
|
|
sin = sin.unsqueeze(-2).to(x.dtype)
|
|
if is_neox_style:
|
|
x1, x2 = torch.chunk(x, 2, dim=-1)
|
|
else:
|
|
x1 = x[..., ::2]
|
|
x2 = x[..., 1::2]
|
|
o1 = x1 * cos - x2 * sin
|
|
o2 = x2 * cos + x1 * sin
|
|
if is_neox_style:
|
|
return torch.cat((o1, o2), dim=-1)
|
|
else:
|
|
return torch.stack((o1, o2), dim=-1).flatten(-2)
|
|
|
|
|
|
def _apply_rotary_emb(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor,
                      is_neox_style: bool) -> torch.Tensor:
    """Dispatch wrapper over the torch rope implementation.

    Args:
        x: ``[num_tokens, num_heads, head_size]``.
        cos: ``[num_tokens, head_size // 2]``.
        sin: ``[num_tokens, head_size // 2]``.
        is_neox_style: selects Neox (halved) vs GPT-J (interleaved)
            rotary channel layout.
    """
    return _apply_rotary_emb_torch(x, cos, sin, is_neox_style)
|
|
|
|
|
|
def forward_MRotaryEmbedding_0_9_2(
    self,
    positions: torch.Tensor,
    query: torch.Tensor,
    key: Optional[torch.Tensor] = None,
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
    """PyTorch-native implementation equivalent to forward().

    Compatibility path mirroring vLLM 0.9.2's MRotaryEmbedding forward.

    Args:
        positions:
            [num_tokens,] (text only) or
            [3, num_tokens] (T/H/W positions with multimodal inputs)
        query: [num_tokens, num_heads * head_size]
        key: [num_tokens, num_kv_heads * head_size]
    """
    assert positions.ndim == 1 or positions.ndim == 2
    assert key is not None

    num_tokens = positions.shape[-1]
    cos_sin = self.cos_sin_cache[positions]
    cos, sin = cos_sin.chunk(2, dim=-1)
    if positions.ndim == 2:
        # Multimodal: take the T/H/W-specific slice of each mrope section.
        assert self.mrope_section

        cos = torch.cat([
            m[i] for i, m in enumerate(cos.split(self.mrope_section, dim=-1))
        ],
                        dim=-1)
        sin = torch.cat([
            m[i] for i, m in enumerate(sin.split(self.mrope_section, dim=-1))
        ],
                        dim=-1)

    # Rotate only the first rotary_dim channels of each head; the remainder
    # passes through unchanged.
    query_shape = query.shape
    query = query.view(num_tokens, -1, self.head_size)
    query_rot = query[..., :self.rotary_dim]
    query_pass = query[..., self.rotary_dim:]
    query_rot = _apply_rotary_emb(query_rot, cos, sin, self.is_neox_style)
    query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)

    key_shape = key.shape
    key = key.view(num_tokens, -1, self.head_size)
    key_rot = key[..., :self.rotary_dim]
    key_pass = key[..., self.rotary_dim:]
    key_rot = _apply_rotary_emb(key_rot, cos, sin, self.is_neox_style)
    key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
    return query, key
|
|
|
|
|
|
def forward_supa(
    self,
    positions: torch.Tensor,
    query: torch.Tensor,
    key: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """MRotaryEmbedding forward routed through the fused SUPA rope kernel.

    Args:
        positions:
            [num_tokens,] (text only) or
            [3, num_tokens] (T/H/W positions with multimodal inputs)
        query: [num_tokens, num_heads * head_size]
        key: [num_tokens, num_kv_heads * head_size]
    """
    if br_envs.VLLM_BR_USE_MROPE_0_9_2:
        # Compatibility escape hatch: pure-torch 0.9.2 implementation.
        return forward_MRotaryEmbedding_0_9_2(self, positions, query, key)

    assert positions.ndim == 1 or positions.ndim == 2
    data_in_supa = lambda t: str(t.device).startswith('supa')
    data_in_cpu = lambda t: t.device == torch.device('cpu')

    # NOTE(review): 1-D positions leave cos/sin unbound and would raise a
    # NameError below — confirm MRoPE callers always pass 2-D positions.
    if positions.ndim == 2:
        # use bypass for decode stage
        if (positions.shape[1] == 1):
            cos_sin = self.cos_sin_cache[positions]
            cos, sin = cos_sin.chunk(2, dim=-1)
            cos = cos[0]
            sin = sin[0]
        else:
            cos_sin = self.cos_sin_cache[positions.to(torch.int64)]
            cos, sin = cos_sin.chunk(2, dim=-1)
            assert self.mrope_section

            if self.mrope_interleaved:
                cos = apply_interleaved_rope(cos, self.mrope_section)
                sin = apply_interleaved_rope(sin, self.mrope_section)
            else:
                # Take the T/H/W-specific slice of each mrope section.
                cos = torch.cat([
                    m[i] for i, m in enumerate(
                        cos.split(self.mrope_section, dim=-1))
                ],
                                dim=-1)
                sin = torch.cat([
                    m[i] for i, m in enumerate(
                        sin.split(self.mrope_section, dim=-1))
                ],
                                dim=-1)
    # Move any CPU-resident operands onto the SUPA device before the kernel
    # call.
    if data_in_supa(query) and data_in_supa(key):
        sin = sin.supa() if data_in_cpu(sin) else sin
        cos = cos.supa() if data_in_cpu(cos) else cos
        positions = positions.supa() if data_in_cpu(positions) else positions

    query, key = torch_br.supa_rope_infer_v2(query,
                                             key,
                                             sin.to(torch.float32),
                                             cos.to(torch.float32),
                                             positions.to(torch.int32),
                                             self.head_size,
                                             rope_type="MRope")
    return query, key
|
|
|
|
|
|
# Route MRotaryEmbedding through the SUPA-aware forward defined above.
MRotaryEmbedding.forward = forward_supa
|
|
|
|
|
|
def get_rope(
    head_size: int,
    rotary_dim: int,
    max_position: int,
    base: int,
    is_neox_style: bool = True,
    rope_scaling: Optional[dict[str, Any]] = None,
    dtype: Optional[torch.dtype] = None,
    partial_rotary_factor: float = 1.0,
    dual_chunk_attention_config: Optional[dict[str, Any]] = None,
    op_type: str = "Half",
) -> RotaryEmbedding:
    """SUPA-aware replacement for vLLM's ``get_rope`` factory.

    Mirrors the upstream factory but threads the SUPA ``op_type`` through to
    implementations that understand it and adds the ``deepseek_yarn_supa``
    scaling type.  Instances are memoized in the shared ``_ROPE_DICT``.
    """
    if dtype is None:
        dtype = torch.get_default_dtype()
    if rope_scaling is not None:
        # Transforms every value that is a list into a tuple for caching calls
        rope_scaling_tuple = {
            k: tuple(v) if isinstance(v, list) else v
            for k, v in rope_scaling.items()
        }
        rope_scaling_args = tuple(rope_scaling_tuple.items())
    else:
        rope_scaling_args = None

    if dual_chunk_attention_config is not None:
        dual_chunk_attention_tuple = {
            k: tuple(v) if isinstance(v, list) else v
            for k, v in dual_chunk_attention_config.items()
            if k != "sparse_attention_config"
        }
        dual_chunk_attention_args = tuple(dual_chunk_attention_tuple.items())
    else:
        dual_chunk_attention_args = None

    if partial_rotary_factor < 1.0:
        rotary_dim = int(rotary_dim * partial_rotary_factor)
    # BUGFIX: op_type is part of the cache key.  Without it, a rope built
    # with op_type="Half" would be returned for a later call requesting
    # op_type="DeepSeek" (deepseek_get_rope / chatglm2_get_rope) with the
    # same remaining parameters.
    key = (head_size, rotary_dim, max_position, base, is_neox_style,
           rope_scaling_args, dual_chunk_attention_args, dtype, op_type)
    if key in _ROPE_DICT:
        return _ROPE_DICT[key]

    if dual_chunk_attention_config is not None:
        extra_kwargs = {
            k: v
            for k, v in dual_chunk_attention_config.items()
            if k in ("chunk_size", "local_size")
        }
        rotary_emb = DualChunkRotaryEmbedding(head_size, rotary_dim,
                                              max_position, base,
                                              is_neox_style, dtype,
                                              **extra_kwargs)
    elif not rope_scaling:
        rotary_emb = RotaryEmbedding(head_size,
                                     rotary_dim,
                                     max_position,
                                     base,
                                     is_neox_style,
                                     dtype,
                                     op_type=op_type)
    else:
        scaling_type = rope_scaling["rope_type"]

        if scaling_type == "llama3":
            scaling_factor = rope_scaling["factor"]
            low_freq_factor = rope_scaling["low_freq_factor"]
            high_freq_factor = rope_scaling["high_freq_factor"]
            original_max_position = rope_scaling[
                "original_max_position_embeddings"]
            rotary_emb = Llama3RotaryEmbedding(head_size, rotary_dim,
                                               max_position, base,
                                               is_neox_style, dtype,
                                               scaling_factor, low_freq_factor,
                                               high_freq_factor,
                                               original_max_position)
        elif scaling_type == "mllama4":
            rotary_emb = Llama4VisionRotaryEmbedding(head_size, rotary_dim,
                                                     max_position, base,
                                                     is_neox_style, dtype)
        elif scaling_type == "default":
            if "mrope_section" in rope_scaling:
                rotary_emb = MRotaryEmbedding(
                    head_size,
                    rotary_dim,
                    max_position,
                    base,
                    is_neox_style,
                    dtype=torch.float32,
                    mrope_section=rope_scaling["mrope_section"],
                    mrope_interleaved=rope_scaling.get("mrope_interleaved",
                                                       False),
                )
            else:
                # NOTE(review): this branch does not forward op_type, so the
                # patched __init__ falls back to its "Half" default —
                # confirm that is intended for "default"-scaled models.
                rotary_emb = RotaryEmbedding(
                    head_size,
                    rotary_dim,
                    max_position,
                    base,
                    is_neox_style,
                    dtype,
                )
        elif scaling_type == "linear":
            scaling_factor = rope_scaling["factor"]
            rotary_emb = LinearScalingRotaryEmbedding(head_size, rotary_dim,
                                                      max_position, base,
                                                      is_neox_style,
                                                      scaling_factor, dtype)
        elif scaling_type == "ntk":
            scaling_factor = rope_scaling["factor"]
            mixed_b = rope_scaling.get('mixed_b', None)
            rotary_emb = NTKScalingRotaryEmbedding(head_size, rotary_dim,
                                                   max_position, base,
                                                   is_neox_style,
                                                   scaling_factor, dtype,
                                                   mixed_b)
        elif scaling_type == "dynamic":
            scaling_factor = rope_scaling["factor"]
            rotary_emb = DynamicNTKScalingRotaryEmbedding(
                head_size, rotary_dim, max_position, base, is_neox_style,
                scaling_factor, dtype)
        elif scaling_type == "yarn":
            scaling_factor = rope_scaling["factor"]
            original_max_position = rope_scaling[
                "original_max_position_embeddings"]
            extra_kwargs = {
                k: v
                for k, v in rope_scaling.items()
                if k in ("extrapolation_factor", "attn_factor", "beta_fast",
                         "beta_slow")
            }
            rotary_emb = YaRNScalingRotaryEmbedding(head_size, rotary_dim,
                                                    original_max_position,
                                                    base, is_neox_style,
                                                    scaling_factor, dtype,
                                                    **extra_kwargs)
        elif scaling_type == "deepseek_yarn":
            scaling_factor = rope_scaling["factor"]
            original_max_position = rope_scaling[
                "original_max_position_embeddings"]
            # assert max_position == original_max_position * scaling_factor
            extra_kwargs = {
                k: v
                for k, v in rope_scaling.items()
                if k in ("extrapolation_factor", "attn_factor", "beta_fast",
                         "beta_slow", "mscale", "mscale_all_dim")
            }
            rotary_emb = DeepseekScalingRotaryEmbedding(
                head_size, rotary_dim, original_max_position, base,
                is_neox_style, scaling_factor, dtype, **extra_kwargs)
        elif scaling_type == "deepseek_yarn_supa":
            scaling_factor = rope_scaling["factor"]
            original_max_position = rope_scaling[
                "original_max_position_embeddings"]
            # assert max_position == original_max_position * scaling_factor
            extra_kwargs = {
                k: v
                for k, v in rope_scaling.items()
                if k in ("extrapolation_factor", "attn_factor", "beta_fast",
                         "beta_slow", "mscale", "mscale_all_dim")
            }
            rotary_emb = SupaDeepseekScalingRotaryEmbedding(
                head_size, rotary_dim, original_max_position, base,
                is_neox_style, scaling_factor, dtype, **extra_kwargs)
        elif scaling_type == "longrope":
            short_factor = rope_scaling["short_factor"]
            long_factor = rope_scaling["long_factor"]
            original_max_position = rope_scaling[
                "original_max_position_embeddings"]
            extra_kwargs = {
                k: v
                for k, v in rope_scaling.items()
                if k in ("short_mscale", "long_mscale")
            }
            rotary_emb = Phi3LongRoPEScaledRotaryEmbedding(
                head_size, rotary_dim, max_position, original_max_position,
                base, is_neox_style, dtype, short_factor, long_factor,
                **extra_kwargs)
        else:
            raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
    _ROPE_DICT[key] = rotary_emb
    return rotary_emb
|
|
|
|
|
|
def deepseek_get_rope(
    head_size: int,
    rotary_dim: int,
    max_position: int,
    base: int,
    is_neox_style: bool = True,
    rope_scaling: Optional[dict[str, Any]] = None,
    dtype: Optional[torch.dtype] = None,
    partial_rotary_factor: float = 1.0,
    dual_chunk_attention_config: Optional[dict[str, Any]] = None,
) -> RotaryEmbedding:
    """``get_rope`` specialization that selects the "DeepSeek" SUPA op
    type for deepseek_v2 models."""
    return get_rope(
        head_size,
        rotary_dim,
        max_position,
        base,
        is_neox_style,
        rope_scaling,
        dtype,
        partial_rotary_factor,
        dual_chunk_attention_config,
        op_type="DeepSeek",
    )
|
|
|
|
|
|
def chatglm2_get_rope(
    head_size: int,
    rotary_dim: int,
    max_position: int,
    base: int,
    is_neox_style: bool = True,
    rope_scaling: Optional[dict[str, Any]] = None,
    dtype: Optional[torch.dtype] = None,
    partial_rotary_factor: float = 1.0,
    dual_chunk_attention_config: Optional[dict[str, Any]] = None,
) -> RotaryEmbedding:
    # ``get_rope`` specialization for ChatGLM models.
    # NOTE(review): this passes op_type "DeepSeek" despite the chatglm name —
    # presumably the two models share the same SUPA kernel layout; confirm
    # against the kernel's supported rope types.
    return get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
                    rope_scaling, dtype, partial_rotary_factor,
                    dual_chunk_attention_config, "DeepSeek")
|
|
|
|
|
|
# Replace vLLM's rope factories with the SUPA-aware ones defined above.  The
# model modules bound get_rope at import time, so each is patched separately.
vllm.model_executor.layers.rotary_embedding.get_rope = get_rope
vllm.model_executor.models.deepseek_v2.get_rope = deepseek_get_rope
vllm.model_executor.models.chatglm.get_rope = chatglm2_get_rope
|
|
|
|
|
|
@patch_to(MRotaryEmbedding)
def _glm4v_get_input_positions_tensor(
    cls,
    input_tokens: list[int],
    hf_config: PretrainedConfig,
    image_grid_thw: Union[list[list[int]], torch.Tensor],
    video_grid_thw: Union[list[list[int]], torch.Tensor],
    context_len: int = 0,
    seq_len: Optional[int] = None,
) -> tuple[torch.Tensor, int]:
    """Get mrope input positions and delta value for GLM4V.

    Returns ``(llm_positions, mrope_position_delta)`` where ``llm_positions``
    is a ``[3, seq_len - context_len]`` tensor of T/H/W position ids.
    """

    image_token_id = hf_config.image_token_id
    video_start_token_id = hf_config.video_start_token_id
    video_end_token_id = hf_config.video_end_token_id
    spatial_merge_size = hf_config.vision_config.spatial_merge_size
    llm_pos_ids_list: list = []

    if not (image_grid_thw is None and video_grid_thw is None):
        if isinstance(image_grid_thw, torch.Tensor):
            image_grid_thw = image_grid_thw.tolist()

        # Classify each token as image / video / text.  Image tokens that
        # appear between video start/end markers count as video frames.
        input_token_type: list[str] = []
        video_check_flg = False
        for token in input_tokens:
            if token == video_start_token_id:
                video_check_flg = True
            elif token == video_end_token_id:
                video_check_flg = False

            if (token == image_token_id) and (video_check_flg is False):
                input_token_type.append("image")
            elif (token == image_token_id) and (video_check_flg is True):
                input_token_type.append("video")
            else:
                input_token_type.append("text")

        # Collapse consecutive tokens of the same type into
        # (type, start, end) runs.
        input_type_group: list[tuple[str, int, int]] = []
        for key, group_iter in itertools.groupby(enumerate(input_token_type),
                                                 lambda x: x[1]):
            group_list = list(group_iter)
            start_index = group_list[0][0]
            end_index = group_list[-1][0] + 1
            input_type_group.append((key, start_index, end_index))

        video_frame_num = 1
        mm_data_idx = 0
        for modality_type, start_idx, end_idx in input_type_group:
            # Each run starts one past the largest position id emitted so far.
            st_idx = llm_pos_ids_list[-1].max() + 1 if len(
                llm_pos_ids_list) > 0 else 0
            if modality_type == "image":
                t, h, w = (
                    image_grid_thw[mm_data_idx][0],
                    image_grid_thw[mm_data_idx][1],
                    image_grid_thw[mm_data_idx][2],
                )
                llm_grid_t, llm_grid_h, llm_grid_w = \
                    t, h // spatial_merge_size, w // spatial_merge_size

                t_index = torch.arange(llm_grid_t).view(-1, 1).expand(
                    -1, llm_grid_h * llm_grid_w).flatten()
                h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(
                    llm_grid_t, -1, llm_grid_w).flatten()
                w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(
                    llm_grid_t, llm_grid_h, -1).flatten()
                llm_pos_ids_list.append(
                    torch.stack([t_index, h_index, w_index]) + st_idx)
                mm_data_idx += 1

            elif modality_type == "video":
                # NOTE(review): the H/W here come from image_grid_thw, not
                # video_grid_thw — confirm against the GLM4V processor's
                # grid bookkeeping before changing.
                t, h, w = (
                    video_frame_num,
                    image_grid_thw[mm_data_idx][1],
                    image_grid_thw[mm_data_idx][2],
                )
                llm_grid_t, llm_grid_h, llm_grid_w = \
                    t, h // spatial_merge_size, w // spatial_merge_size

                for t_idx in range(llm_grid_t):
                    t_index = torch.tensor(t_idx).view(-1, 1).expand(
                        -1, llm_grid_h * llm_grid_w).flatten()
                    h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(
                        1, -1, llm_grid_w).flatten()
                    w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(
                        1, llm_grid_h, -1).flatten()
                    llm_pos_ids_list.append(
                        torch.stack([t_index, h_index, w_index]) + st_idx)

                mm_data_idx += 1
                video_frame_num += 1

            else:
                # Text run: all three of T/H/W advance together.
                text_len = end_idx - start_idx
                llm_pos_ids_list.append(
                    torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
                video_frame_num = 1

    else:
        # Text-only prompt.
        text_len = len(input_tokens)
        llm_pos_ids_list.append(
            torch.arange(text_len).view(1, -1).expand(3, -1))

    llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
    llm_positions = llm_positions[:, context_len:seq_len]
    mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
    return llm_positions, mrope_position_delta
|
|
|
|
|
|
@patch_to(MRotaryEmbedding)
def get_input_positions_tensor_for_glm(
    cls,
    input_tokens: list[int],
    hf_config: PretrainedConfig,
    image_grid_thw: Union[list[list[int]], torch.Tensor],
    video_grid_thw: Union[list[list[int]], torch.Tensor],
    second_per_grid_ts: list[float],
    context_len: int = 0,
    seq_len: Optional[int] = None,
    audio_feature_lengths: Optional[torch.Tensor] = None,
    use_audio_in_video: bool = False,
) -> tuple[torch.Tensor, int]:
    """Dispatch mrope position computation by model family:
    omni (thinker) → GLM4V → generic VL."""
    from vllm.transformers_utils.config import thinker_uses_mrope
    if thinker_uses_mrope(hf_config):
        return cls._omni_get_input_positions_tensor(
            input_tokens=input_tokens,
            hf_config=hf_config,
            image_grid_thw=image_grid_thw,
            video_grid_thw=video_grid_thw,
            second_per_grid_ts=second_per_grid_ts,
            context_len=context_len,
            seq_len=seq_len,
            audio_feature_lengths=audio_feature_lengths,
            use_audio_in_video=use_audio_in_video,
        )
    elif "glm4v" in hf_config.model_type:
        # The glm4v helper is patched on as a plain function (no
        # cls_method), so cls is forwarded explicitly as its first argument.
        return cls._glm4v_get_input_positions_tensor(
            cls,
            input_tokens=input_tokens,
            hf_config=hf_config,
            image_grid_thw=image_grid_thw,
            video_grid_thw=video_grid_thw,
            context_len=context_len,
            seq_len=seq_len,
        )
    else:
        return cls._vl_get_input_positions_tensor(
            input_tokens=input_tokens,
            hf_config=hf_config,
            image_grid_thw=image_grid_thw,
            video_grid_thw=video_grid_thw,
            second_per_grid_ts=second_per_grid_ts,
            context_len=context_len,
            seq_len=seq_len,
        )
|