Add minimal vLLM 0.16.1 build repo for BI-V150

2026-04-18 10:56:22 +08:00
commit d69657327e
1895 changed files with 615301 additions and 0 deletions

View File

@@ -0,0 +1,328 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Rotary Positional Embeddings."""
from typing import Any
import torch
from .base import RotaryEmbedding
from .deepseek_scaling_rope import DeepseekScalingRotaryEmbedding
from .dual_chunk_rope import DualChunkRotaryEmbedding
from .dynamic_ntk_alpha_rope import DynamicNTKAlphaRotaryEmbedding
from .dynamic_ntk_scaling_rope import DynamicNTKScalingRotaryEmbedding
from .fope import FourierRotaryEmbedding
from .linear_scaling_rope import LinearScalingRotaryEmbedding
from .llama3_rope import Llama3RotaryEmbedding
from .llama4_vision_rope import Llama4VisionRotaryEmbedding
from .mrope import MRotaryEmbedding
from .mrope_interleaved import MRotaryEmbeddingInterleaved
from .ntk_scaling_rope import NTKScalingRotaryEmbedding
from .phi3_long_rope_scaled_rope import Phi3LongRoPEScaledRotaryEmbedding
from .xdrope import XDRotaryEmbedding
from .yarn_scaling_rope import YaRNScalingRotaryEmbedding
_ROPE_DICT: dict[tuple[Any, ...], RotaryEmbedding] = {}
def get_rope(
head_size: int,
max_position: int,
is_neox_style: bool = True,
rope_parameters: dict[str, Any] | None = None,
dtype: torch.dtype | None = None,
dual_chunk_attention_config: dict[str, Any] | None = None,
) -> RotaryEmbedding:
if dtype is None:
dtype = torch.get_default_dtype()
if rope_parameters is not None:
# Convert list values to tuples so the arguments are hashable for caching
rope_parameters_tuple = {
k: tuple(v) if isinstance(v, list) else v
for k, v in rope_parameters.items()
}
rope_parameters_args = tuple(rope_parameters_tuple.items())
else:
rope_parameters_args = None
if dual_chunk_attention_config is not None:
dual_chunk_attention_tuple = {
k: tuple(v) if isinstance(v, list) else v
for k, v in dual_chunk_attention_config.items()
if k != "sparse_attention_config"
}
dual_chunk_attention_args = tuple(dual_chunk_attention_tuple.items())
else:
dual_chunk_attention_args = None
rope_parameters = rope_parameters or {}
base = rope_parameters.get("rope_theta", 10000)
scaling_type = rope_parameters.get("rope_type", "default")
partial_rotary_factor = rope_parameters.get("partial_rotary_factor", 1.0)
if partial_rotary_factor <= 0.0 or partial_rotary_factor > 1.0:
raise ValueError(f"{partial_rotary_factor=} must be between 0.0 and 1.0")
rotary_dim = int(head_size * partial_rotary_factor)
key = (
head_size,
rotary_dim,
max_position,
is_neox_style,
rope_parameters_args,
dual_chunk_attention_args,
dtype,
)
if key in _ROPE_DICT:
return _ROPE_DICT[key]
if dual_chunk_attention_config is not None:
extra_kwargs = {
k: v
for k, v in dual_chunk_attention_config.items()
if k in ("chunk_size", "local_size")
}
rotary_emb = DualChunkRotaryEmbedding(
head_size,
rotary_dim,
max_position,
base,
is_neox_style,
dtype,
**extra_kwargs,
)
elif scaling_type == "default":
if "mrope_section" in rope_parameters:
rotary_emb = MRotaryEmbedding(
head_size,
rotary_dim,
max_position,
base,
is_neox_style,
dtype,
mrope_section=rope_parameters["mrope_section"],
mrope_interleaved=rope_parameters.get("mrope_interleaved", False),
)
elif "use_fope" in rope_parameters and rope_parameters["use_fope"]:
extra_kwargs = {
k: v
for k, v in rope_parameters.items()
if k
in (
"num_key_value_heads",
"num_inv_freq",
"fope_sep_head",
"fope_init_factor",
)
}
extra_kwargs["init_cache"] = False
rotary_emb = FourierRotaryEmbedding(
head_size,
rotary_dim,
max_position,
base,
is_neox_style,
dtype,
**extra_kwargs,
)
else:
rotary_emb = RotaryEmbedding(
head_size,
rotary_dim,
max_position,
base,
is_neox_style,
dtype,
)
elif scaling_type == "llama3":
scaling_factor = rope_parameters["factor"]
low_freq_factor = rope_parameters["low_freq_factor"]
high_freq_factor = rope_parameters["high_freq_factor"]
original_max_position = rope_parameters["original_max_position_embeddings"]
rotary_emb = Llama3RotaryEmbedding(
head_size,
rotary_dim,
max_position,
base,
is_neox_style,
dtype,
scaling_factor,
low_freq_factor,
high_freq_factor,
original_max_position,
)
elif scaling_type == "mllama4":
rotary_emb = Llama4VisionRotaryEmbedding(
head_size, rotary_dim, max_position, base, is_neox_style, dtype
)
elif scaling_type == "linear":
scaling_factor = rope_parameters["factor"]
rotary_emb = LinearScalingRotaryEmbedding(
head_size,
rotary_dim,
max_position,
base,
is_neox_style,
scaling_factor,
dtype,
)
elif scaling_type == "ntk":
scaling_factor = rope_parameters["factor"]
mixed_b = rope_parameters.get("mixed_b")
rotary_emb = NTKScalingRotaryEmbedding(
head_size,
rotary_dim,
max_position,
base,
is_neox_style,
scaling_factor,
dtype,
mixed_b,
)
elif scaling_type == "dynamic":
if "alpha" in rope_parameters:
scaling_alpha = rope_parameters["alpha"]
rotary_emb = DynamicNTKAlphaRotaryEmbedding(
head_size,
rotary_dim,
max_position,
base,
is_neox_style,
scaling_alpha,
dtype,
)
elif "factor" in rope_parameters:
scaling_factor = rope_parameters["factor"]
rotary_emb = DynamicNTKScalingRotaryEmbedding(
head_size,
rotary_dim,
max_position,
base,
is_neox_style,
scaling_factor,
dtype,
)
else:
raise ValueError(
"Dynamic rope scaling must contain either 'alpha' or 'factor' field"
)
elif scaling_type == "xdrope":
scaling_alpha = rope_parameters["alpha"]
rotary_emb = XDRotaryEmbedding(
head_size,
rotary_dim,
max_position,
base,
is_neox_style,
scaling_alpha,
dtype,
xdrope_section=rope_parameters["xdrope_section"],
)
elif scaling_type == "yarn":
scaling_factor = rope_parameters["factor"]
original_max_position = rope_parameters["original_max_position_embeddings"]
extra_kwargs = {
k: v
for k, v in rope_parameters.items()
if k
in (
"extrapolation_factor",
"attn_factor",
"beta_fast",
"beta_slow",
"apply_yarn_scaling",
"truncate",
)
}
if "mrope_section" in rope_parameters:
extra_kwargs.pop("apply_yarn_scaling", None)
rotary_emb = MRotaryEmbedding(
head_size,
rotary_dim,
original_max_position,
base,
is_neox_style,
dtype,
mrope_section=rope_parameters["mrope_section"],
mrope_interleaved=rope_parameters.get("mrope_interleaved", False),
scaling_factor=scaling_factor,
**extra_kwargs,
)
else:
rotary_emb = YaRNScalingRotaryEmbedding(
head_size,
rotary_dim,
original_max_position,
base,
is_neox_style,
scaling_factor,
dtype,
**extra_kwargs,
)
elif scaling_type in ["deepseek_yarn", "deepseek_llama_scaling"]:
scaling_factor = rope_parameters["factor"]
original_max_position = rope_parameters["original_max_position_embeddings"]
# assert max_position == original_max_position * scaling_factor
extra_kwargs = {
k: v
for k, v in rope_parameters.items()
if k
in (
"extrapolation_factor",
"attn_factor",
"beta_fast",
"beta_slow",
"mscale",
"mscale_all_dim",
)
}
rotary_emb = DeepseekScalingRotaryEmbedding(
head_size,
rotary_dim,
original_max_position,
base,
is_neox_style,
scaling_factor,
dtype,
**extra_kwargs,
)
elif scaling_type == "longrope":
short_factor = rope_parameters["short_factor"]
long_factor = rope_parameters["long_factor"]
original_max_position = rope_parameters["original_max_position_embeddings"]
extra_kwargs = {
k: v
for k, v in rope_parameters.items()
if k in ("short_mscale", "long_mscale")
}
rotary_emb = Phi3LongRoPEScaledRotaryEmbedding(
head_size,
rotary_dim,
max_position,
original_max_position,
base,
is_neox_style,
dtype,
short_factor,
long_factor,
**extra_kwargs,
)
elif scaling_type == "openpangu":
mrope_interleaved = rope_parameters.get("mrope_interleaved", False)
if "mrope_section" in rope_parameters and mrope_interleaved:
rotary_emb = MRotaryEmbeddingInterleaved(
head_size,
rotary_dim,
max_position,
base,
is_neox_style,
dtype,
mrope_section=rope_parameters["mrope_section"],
mrope_interleaved=mrope_interleaved,
)
else:
raise ValueError("Pangu mrope lacks necessary parameters.")
else:
raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
_ROPE_DICT[key] = rotary_emb
return rotary_emb
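def _example_get_rope_dispatch() -> None:
    # Illustrative sketch only: the parameter values below are hypothetical and
    # simply show how get_rope() dispatches on rope_type and caches results.
    # It assumes the usual vLLM runtime context needed by CustomOp subclasses.
    rope_parameters = {
        "rope_theta": 500000.0,
        "rope_type": "llama3",
        "factor": 8.0,
        "low_freq_factor": 1.0,
        "high_freq_factor": 4.0,
        "original_max_position_embeddings": 8192,
    }
    rope = get_rope(head_size=128, max_position=16384, rope_parameters=rope_parameters)
    assert isinstance(rope, Llama3RotaryEmbedding)
    # An identical call hits _ROPE_DICT and returns the same cached instance.
    assert get_rope(128, 16384, rope_parameters=rope_parameters) is rope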

View File

@@ -0,0 +1,303 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Rotary Positional Embeddings Base Class."""
import torch
from vllm._aiter_ops import rocm_aiter_ops
from vllm.model_executor.custom_op import CustomOp
from .common import ApplyRotaryEmb
# --8<-- [start:rotary_embedding]
@CustomOp.register("rotary_embedding")
class RotaryEmbeddingBase(CustomOp):
"""Original rotary positional embedding."""
# --8<-- [end:rotary_embedding]
def __init__(
self,
head_size: int,
rotary_dim: int,
max_position_embeddings: int,
base: float,
is_neox_style: bool,
dtype: torch.dtype,
init_cache: bool = True,
) -> None:
super().__init__()
self.head_size = head_size
self.rotary_dim = rotary_dim
self.max_position_embeddings = max_position_embeddings
self.base = base
self.is_neox_style = is_neox_style
self.dtype = dtype
# TODO(mgoin): disabled for now due to failures
# Flashinfer only supports head_size=64, 128, 256, 512.
# https://github.com/flashinfer-ai/flashinfer/blob/ebfd655efe830048dba5d582aaa61d61d1cf9a87/include/flashinfer/utils.cuh#L174-L202
# self.use_flashinfer = (self.enabled()
# and dtype in (torch.float16, torch.bfloat16)
# and current_platform.is_cuda()
# and has_flashinfer()
# and self.head_size in [64, 128, 256, 512])
# Check if use_flashinfer is already set
if not hasattr(self, "use_flashinfer"):
self.use_flashinfer = False
self.use_aiter = (
self.enabled() and rocm_aiter_ops.is_triton_rotary_embed_enabled()
)
if self.use_aiter:
self.rocm_aiter_triton_rotary_embedding = (
rocm_aiter_ops.get_triton_rotary_embedding_op()
)
if init_cache:
cache = self._compute_cos_sin_cache()
if not self.use_flashinfer:
cache = cache.to(dtype)
self.cos_sin_cache: torch.Tensor
self.register_buffer("cos_sin_cache", cache, persistent=False)
self.apply_rotary_emb = ApplyRotaryEmb(
is_neox_style=self.is_neox_style,
)
def _compute_inv_freq(self, base: float) -> torch.Tensor:
"""Compute the inverse frequency."""
# NOTE(woosuk): To exactly match the HF implementation, we need to
# use CPU to compute the cache and then move it to GPU. However, we
# create the cache on GPU for faster initialization. This may cause
# a slight numerical difference between the HF implementation and ours.
inv_freq = 1.0 / (
base
** (
torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim
)
)
return inv_freq
def _compute_cos_sin_cache(self) -> torch.Tensor:
"""Compute the cos and sin cache."""
inv_freq = self._compute_inv_freq(self.base)
t = torch.arange(self.max_position_embeddings, dtype=torch.float)
freqs = torch.einsum("i,j -> ij", t, inv_freq)
cos = freqs.cos()
sin = freqs.sin()
cache = torch.cat((cos, sin), dim=-1)
return cache
def _match_cos_sin_cache_dtype(self, query: torch.Tensor) -> torch.Tensor:
# __setattr__ in nn.Module (called by `self.cos_sin_cache = ...`)
# is expensive, so avoid calling it if possible
cos_sin_cache = self.cos_sin_cache
if (
cos_sin_cache.device == query.device
and self.cos_sin_cache.dtype == query.dtype
):
return cos_sin_cache
cos_sin_cache = cos_sin_cache.to(query.device, dtype=query.dtype)
# Avoid mutating buffers during torch.compile (cudagraph) tracing.
if torch.compiler.is_compiling():
return cos_sin_cache
self.cos_sin_cache = cos_sin_cache
return cos_sin_cache
def get_cos_sin(self, seqlen: int) -> tuple[torch.Tensor, torch.Tensor]:
cos_sin = self.cos_sin_cache[:seqlen]
cos, sin = cos_sin.chunk(2, dim=-1)
return cos, sin
class RotaryEmbedding(RotaryEmbeddingBase):
def __init__(
self,
head_size: int,
rotary_dim: int,
max_position_embeddings: int,
base: float,
is_neox_style: bool,
dtype: torch.dtype,
init_cache: bool = True,
) -> None:
super().__init__(
head_size=head_size,
rotary_dim=rotary_dim,
max_position_embeddings=max_position_embeddings,
base=base,
is_neox_style=is_neox_style,
dtype=dtype,
init_cache=init_cache,
)
@staticmethod
def forward_static(
positions: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor | None,
head_size: int,
rotary_dim: int,
cos_sin_cache: torch.Tensor,
is_neox_style: bool,
) -> tuple[torch.Tensor, torch.Tensor | None]:
"""A PyTorch-native implementation of forward()."""
positions = positions.flatten()
num_tokens = positions.shape[0]
cos_sin = cos_sin_cache.index_select(0, positions)
cos, sin = cos_sin.chunk(2, dim=-1)
query_shape = query.shape
query = query.view(num_tokens, -1, head_size)
query_rot = query[..., :rotary_dim]
query_pass = query[..., rotary_dim:]
query_rot = ApplyRotaryEmb.forward_static(
query_rot,
cos,
sin,
is_neox_style,
)
query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
# key may be None in some cases, e.g. cross-layer KV sharing
if key is not None:
key_shape = key.shape
key = key.view(num_tokens, -1, head_size)
key_rot = key[..., :rotary_dim]
key_pass = key[..., rotary_dim:]
key_rot = ApplyRotaryEmb.forward_static(
key_rot,
cos,
sin,
is_neox_style,
)
key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
return query, key
def forward_native(
self,
positions: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor | None]:
"""A PyTorch-native implementation of forward()."""
cos_sin_cache = self._match_cos_sin_cache_dtype(query)
return self.forward_static(
positions,
query,
key,
self.head_size,
self.rotary_dim,
cos_sin_cache,
self.is_neox_style,
)
def forward_cuda(
self,
positions: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor | None]:
if self.use_flashinfer:
torch.ops.vllm.flashinfer_rotary_embedding(
positions,
query,
key,
self.head_size,
self.cos_sin_cache,
self.is_neox_style,
)
return query, key
from vllm import _custom_ops as ops
cos_sin_cache = self._match_cos_sin_cache_dtype(query)
# ops.rotary_embedding() is an in-place operation
# that updates the query and key tensors.
ops.rotary_embedding(
positions,
query,
key,
self.head_size,
cos_sin_cache,
self.is_neox_style,
)
return query, key
def forward_hip(
self,
positions: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor | None]:
if self.use_aiter:
cos_sin_cache = self._match_cos_sin_cache_dtype(query)
self.rocm_aiter_triton_rotary_embedding(
positions,
query,
key,
self.head_size,
cos_sin_cache,
self.is_neox_style,
)
return query, key
return self.forward_cuda(positions, query, key)
def forward_xpu(
self,
positions: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor | None]:
self._match_cos_sin_cache_dtype(query)
# ops.rotary_embedding() is an in-place operation
# that updates the query and key tensors.
if key is None:
return self.forward_native(positions, query, key)
else:
from vllm import _custom_ops as ops
cos_sin_cache = self._match_cos_sin_cache_dtype(query)
ops.rotary_embedding(
positions,
query,
key,
self.head_size,
cos_sin_cache,
self.is_neox_style,
)
return query, key
def forward_cpu(
self,
positions: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor | None]:
from vllm import _custom_ops as ops
cos_sin_cache = self._match_cos_sin_cache_dtype(query)
# ops.rotary_embedding() is an in-place operation
# that updates the query and key tensors.
ops.rotary_embedding(
positions,
query,
key,
self.head_size,
cos_sin_cache,
self.is_neox_style,
)
return query, key
def extra_repr(self) -> str:
s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}"
s += f", max_position_embeddings={self.max_position_embeddings}"
s += f", base={self.base}, is_neox_style={self.is_neox_style}"
return s
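def _example_rotary_embedding_shapes() -> None:
    # Illustrative sketch only (toy sizes, assuming the usual vLLM runtime
    # context for CustomOp): the PyTorch-native path rotates each head but
    # preserves the flattened [num_tokens, heads * head_size] layout of the
    # query and key tensors.
    rope = RotaryEmbedding(
        head_size=8,
        rotary_dim=8,
        max_position_embeddings=32,
        base=10000.0,
        is_neox_style=True,
        dtype=torch.float32,
    )
    positions = torch.arange(4)
    query = torch.randn(4, 2 * 8)  # 4 tokens, 2 query heads
    key = torch.randn(4, 1 * 8)  # 4 tokens, 1 KV head
    out_q, out_k = rope.forward_native(positions, query, key)
    assert out_q.shape == query.shape and out_k.shape == key.shape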

View File

@@ -0,0 +1,289 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
from importlib.util import find_spec
import torch
from vllm.logger import init_logger
from vllm.model_executor.custom_op import CustomOp
from vllm.utils.torch_utils import direct_register_custom_op
logger = init_logger(__name__)
# common functions
def rotate_neox(x: torch.Tensor) -> torch.Tensor:
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2 :]
return torch.cat((-x2, x1), dim=-1)
def rotate_gptj(x: torch.Tensor) -> torch.Tensor:
x1 = x[..., ::2]
x2 = x[..., 1::2]
x = torch.stack((-x2, x1), dim=-1)
return x.flatten(-2)
# yarn functions
# Inverse dim formula to find dim based on number of rotations
def yarn_find_correction_dim(
num_rotations: int,
dim: int,
base: float = 10000,
max_position_embeddings: int = 2048,
) -> float:
return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (
2 * math.log(base)
)
# Find dim range bounds based on rotations
def yarn_find_correction_range(
low_rot: int,
high_rot: int,
dim: int,
base: float = 10000,
max_position_embeddings: int = 2048,
truncate: bool = True,
) -> tuple[float | int, float | int]:
low = yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)
high = yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings)
if truncate:
low = math.floor(low)
high = math.ceil(high)
return max(low, 0), min(high, dim - 1) # Clamp values just in case
def yarn_linear_ramp_mask(
low: float, high: float, dim: int, dtype: torch.dtype
) -> torch.Tensor:
if low == high:
high += 0.001 # Prevent singularity
linear_func = (torch.arange(dim, dtype=dtype) - low) / (high - low)
ramp_func = torch.clamp(linear_func, 0, 1)
return ramp_func
def yarn_get_mscale(scale: float = 1) -> float:
if scale <= 1:
return 1.0
return 0.1 * math.log(scale) + 1.0
def _flashinfer_rotary_embedding(
positions: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor,
head_size: int,
cos_sin_cache: torch.Tensor,
is_neox: bool,
) -> None:
"""Custom op wrapper for flashinfer's rotary embedding.
This is an in-place operation that modifies query and key tensors directly.
"""
from flashinfer.rope import apply_rope_with_cos_sin_cache_inplace
apply_rope_with_cos_sin_cache_inplace(
positions=positions,
query=query,
key=key,
head_size=head_size,
cos_sin_cache=cos_sin_cache,
is_neox=is_neox,
)
def _flashinfer_rotary_embedding_fake(
positions: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor,
head_size: int,
cos_sin_cache: torch.Tensor,
is_neox: bool,
) -> None:
return
# Register flashinfer rotary embedding custom op
direct_register_custom_op(
op_name="flashinfer_rotary_embedding",
op_func=_flashinfer_rotary_embedding,
mutates_args=["query", "key"], # These tensors are modified in-place
fake_impl=_flashinfer_rotary_embedding_fake,
)
# --8<-- [start:apply_rotary_emb]
@CustomOp.register("apply_rotary_emb")
class ApplyRotaryEmb(CustomOp):
# --8<-- [end:apply_rotary_emb]
def __init__(
self,
enforce_enable: bool = False,
is_neox_style: bool = True,
enable_fp32_compute: bool = False,
) -> None:
super().__init__(enforce_enable=enforce_enable)
self.is_neox_style = is_neox_style
self.enable_fp32_compute = enable_fp32_compute
self.apply_rotary_emb_flash_attn = None
if find_spec("flash_attn") is not None:
from flash_attn.ops.triton.rotary import apply_rotary
self.apply_rotary_emb_flash_attn = apply_rotary
@staticmethod
def forward_static(
x: torch.Tensor,
cos: torch.Tensor,
sin: torch.Tensor,
is_neox_style: bool = True,
enable_fp32_compute: bool = False,
) -> torch.Tensor:
"""
Args:
x: [batch_size (optional), seq_len, num_heads, head_size]
cos: [seq_len, head_size // 2]
sin: [seq_len, head_size // 2]
is_neox_style: Whether to use the NeoX style (True) or the GPT-J style (False).
enable_fp32_compute: Temporarily convert x, cos, sin to FP32 dtype
for higher accuracy.
"""
origin_dtype = x.dtype
if enable_fp32_compute:
x = x.float()
cos = cos.unsqueeze(-2).to(x.dtype)
sin = sin.unsqueeze(-2).to(x.dtype)
if is_neox_style:
x1, x2 = torch.chunk(x, 2, dim=-1)
else:
x1 = x[..., ::2]
x2 = x[..., 1::2]
o1 = x1 * cos - x2 * sin
o2 = x2 * cos + x1 * sin
if is_neox_style:
output = torch.cat((o1, o2), dim=-1)
else:
output = torch.stack((o1, o2), dim=-1).flatten(-2)
if enable_fp32_compute:
output = output.to(origin_dtype)
return output
def _pre_process(
self,
x: torch.Tensor,
cos: torch.Tensor,
sin: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Size, torch.dtype]:
origin_shape = x.shape
if len(origin_shape) == 3:
# x: [seq_len, num_heads, head_size]
x = x.unsqueeze(0)
origin_dtype = x.dtype
if self.enable_fp32_compute:
x = x.float()
cos = cos.float()
sin = sin.float()
return x, cos, sin, origin_shape, origin_dtype
def _post_process(
self,
output: torch.Tensor,
origin_shape: torch.Size,
origin_dtype: torch.dtype,
) -> torch.Tensor:
if len(origin_shape) == 3:
output = output.squeeze(0)
if self.enable_fp32_compute:
output = output.to(origin_dtype)
return output
def forward_native(
self,
x: torch.Tensor,
cos: torch.Tensor,
sin: torch.Tensor,
) -> torch.Tensor:
output = self.forward_static(
x, cos, sin, self.is_neox_style, self.enable_fp32_compute
)
return output
def forward_cuda(
self,
x: torch.Tensor,
cos: torch.Tensor,
sin: torch.Tensor,
) -> torch.Tensor:
# NOTE: The fused vllm_flash_attn rotary kernel is disabled for now, so we
# fall back to the PyTorch-native implementation. The disabled path is kept
# below for reference.
# from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb
# x, cos, sin, origin_shape, origin_dtype = self._pre_process(x, cos, sin)
# Arguments of apply_rotary_emb() in vllm_flash_attn:
#     x: [batch_size, seq_len, nheads, headdim]
#     cos, sin: [seqlen_rotary, rotary_dim / 2]
#     interleaved: default False (Neox-style).
# interleaved = not self.is_neox_style
# output = apply_rotary_emb(x, cos, sin, interleaved)
# return self._post_process(output, origin_shape, origin_dtype)
return self.forward_native(x, cos, sin)
def forward_hip(
self,
x: torch.Tensor,
cos: torch.Tensor,
sin: torch.Tensor,
) -> torch.Tensor:
if self.apply_rotary_emb_flash_attn is not None:
x, cos, sin, origin_shape, origin_dtype = self._pre_process(x, cos, sin)
"""
Arguments of apply_rotary() in flash_attn:
x: [batch_size, seq_len, nheads, headdim]
cos, sin: [seqlen_rotary, rotary_dim / 2]
interleaved: default False (Neox-style).
...
"""
interleaved = not self.is_neox_style
output = self.apply_rotary_emb_flash_attn(
x, cos, sin, interleaved=interleaved
).type_as(x)
output = self._post_process(output, origin_shape, origin_dtype)
else:
# Falling back to PyTorch native implementation.
output = self.forward_native(x, cos, sin)
return output
def forward_cpu(
self,
x: torch.Tensor,
cos: torch.Tensor,
sin: torch.Tensor,
) -> torch.Tensor:
# TODO (bigPYJ1151): need to enable fused CPU ROPE here
return self.forward_native(x, cos, sin)
def extra_repr(self) -> str:
s = f"is_neox_style={self.is_neox_style}"
s += f", enable_fp32_compute={self.enable_fp32_compute}"
return s
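def _example_rotation_helpers() -> None:
    # Illustrative sketch only: shows what the two rotation helpers do on a
    # tiny tensor and that the YaRN mscale correction is 1.0 for scale <= 1.
    x = torch.tensor([[1.0, 2.0, 3.0, 4.0]])
    # NeoX style rotates the two contiguous halves: (x1, x2) -> (-x2, x1).
    assert torch.equal(rotate_neox(x), torch.tensor([[-3.0, -4.0, 1.0, 2.0]]))
    # GPT-J style rotates interleaved even/odd pairs.
    assert torch.equal(rotate_gptj(x), torch.tensor([[-2.0, 1.0, -4.0, 3.0]]))
    assert yarn_get_mscale(1.0) == 1.0
    # For scale > 1 the correction grows logarithmically with the scale.
    assert abs(yarn_get_mscale(8.0) - (0.1 * math.log(8.0) + 1.0)) < 1e-9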

View File

@@ -0,0 +1,182 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
import torch
from vllm.platforms import current_platform
from vllm.utils.flashinfer import has_flashinfer
from .base import RotaryEmbeddingBase
from .common import (
rotate_gptj,
rotate_neox,
yarn_find_correction_range,
yarn_linear_ramp_mask,
)
def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
if scale <= 1:
return 1.0
return 0.1 * mscale * math.log(scale) + 1.0
class DeepseekScalingRotaryEmbedding(RotaryEmbeddingBase):
"""RotaryEmbedding extended with YaRN method.
Credits to Peng et al. github.com/jquesnelle/yarn
"""
def __init__(
self,
head_size: int,
rotary_dim: int,
max_position_embeddings: int,
base: float,
is_neox_style: bool,
scaling_factor: float,
dtype: torch.dtype,
*,
extrapolation_factor: float = 1,
attn_factor: float = 1,
beta_fast: int = 32,
beta_slow: int = 1,
mscale: float = 1,
mscale_all_dim: float = 0,
) -> None:
self.scaling_factor = scaling_factor
self.extrapolation_factor = extrapolation_factor
self.attn_factor = attn_factor
self.beta_fast = beta_fast
self.beta_slow = beta_slow
# Get n-d magnitude scaling corrected for interpolation.
self.mscale = float(
yarn_get_mscale(self.scaling_factor, float(mscale))
/ yarn_get_mscale(self.scaling_factor, float(mscale_all_dim))
* attn_factor
)
self.use_flashinfer = (
self.enabled()
and dtype in (torch.float16, torch.bfloat16)
and current_platform.is_cuda()
and has_flashinfer()
and head_size in [64, 128, 256, 512]
)
super().__init__(
head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
)
def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor:
pos_freqs = self.base ** (
torch.arange(
0,
self.rotary_dim,
2,
dtype=torch.float,
)
/ self.rotary_dim
)
inv_freq_extrapolation = 1.0 / pos_freqs
inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs)
low, high = yarn_find_correction_range(
self.beta_fast,
self.beta_slow,
self.rotary_dim,
self.base,
self.max_position_embeddings,
)
# Get n-d rotational scaling corrected for extrapolation
inv_freq_mask = (
1
- yarn_linear_ramp_mask(low, high, self.rotary_dim // 2, dtype=torch.float)
) * self.extrapolation_factor
inv_freq = (
inv_freq_interpolation * (1 - inv_freq_mask)
+ inv_freq_extrapolation * inv_freq_mask
)
return inv_freq
def _compute_cos_sin_cache(self) -> torch.Tensor:
inv_freq = self._compute_inv_freq(self.scaling_factor)
t = torch.arange(
self.max_position_embeddings * self.scaling_factor,
dtype=torch.float32,
)
freqs = torch.einsum("i,j -> ij", t, inv_freq)
cos = freqs.cos() * self.mscale
sin = freqs.sin() * self.mscale
cache = torch.cat((cos, sin), dim=-1)
return cache
def forward_native(
self,
positions: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor | None = None,
offsets: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor | None]:
"""PyTorch-native implementation equivalent to forward()."""
assert key is not None
cos_sin_cache = self._match_cos_sin_cache_dtype(query)
query_rot = query[..., : self.rotary_dim]
key_rot = key[..., : self.rotary_dim]
if self.rotary_dim < self.head_size:
query_pass = query[..., self.rotary_dim :]
key_pass = key[..., self.rotary_dim :]
cos_sin = cos_sin_cache[
torch.add(positions, offsets) if offsets is not None else positions
]
cos, sin = cos_sin.chunk(2, dim=-1)
if self.is_neox_style:
# NOTE(woosuk): Here we assume that the positions tensor has the
# shape [batch_size, seq_len].
cos = cos.repeat(1, 1, 2).unsqueeze(-2)
sin = sin.repeat(1, 1, 2).unsqueeze(-2)
else:
cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)
rotate_fn = rotate_neox if self.is_neox_style else rotate_gptj
query_rot = query_rot * cos + rotate_fn(query_rot) * sin
key_rot = key_rot * cos + rotate_fn(key_rot) * sin
if self.rotary_dim < self.head_size:
query = torch.cat((query_rot, query_pass), dim=-1)
key = torch.cat((key_rot, key_pass), dim=-1)
else:
query = query_rot
key = key_rot
return query, key
def forward_hip(
self,
positions: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor | None = None,
offsets: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor | None]:
return self.forward_native(positions, query, key, offsets)
def forward_cuda(
self,
positions: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor | None = None,
offsets: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor | None]:
if self.use_flashinfer:
torch.ops.vllm.flashinfer_rotary_embedding(
torch.add(positions, offsets) if offsets is not None else positions,
query,
key,
self.head_size,
self.cos_sin_cache,
self.is_neox_style,
)
return query, key
else:
return self.forward_native(positions, query, key, offsets)
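def _example_deepseek_yarn_cache() -> None:
    # Illustrative sketch only (hypothetical sizes, assuming the usual vLLM
    # runtime context for CustomOp): the cache is built for
    # original_max_position * scaling_factor positions, and its cos/sin values
    # are pre-multiplied by the YaRN mscale correction.
    rope = DeepseekScalingRotaryEmbedding(
        head_size=64,
        rotary_dim=64,
        max_position_embeddings=4096,  # original max position
        base=10000.0,
        is_neox_style=False,
        scaling_factor=8.0,
        dtype=torch.float32,
        mscale=1.0,
        mscale_all_dim=0.0,
    )
    assert rope.cos_sin_cache.shape == (4096 * 8, 64)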

View File

@@ -0,0 +1,218 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from vllm.model_executor.custom_op import CustomOp
from .common import rotate_gptj, rotate_neox
# --8<-- [start:dual_chunk_rotary_embedding]
@CustomOp.register("dual_chunk_rotary_embedding")
class DualChunkRotaryEmbedding(CustomOp):
"""Rotary positional embedding for Dual Chunk Attention."""
# --8<-- [end:dual_chunk_rotary_embedding]
def __init__(
self,
head_size: int,
rotary_dim: int,
max_position_embeddings: int,
base: float,
is_neox_style: bool,
dtype: torch.dtype,
chunk_size: int,
local_size: int,
) -> None:
super().__init__()
self.head_size = head_size
self.rotary_dim = rotary_dim
self.max_position_embeddings = max_position_embeddings
self.base = base
self.is_neox_style = is_neox_style
self.chunk_size = chunk_size
self.local_size = local_size
self.dtype = dtype
self.device = torch.device(f"cuda:{torch.cuda.current_device()}")
(q_cache, qc_cache, k_cache, qc_no_clamp_cache, q_inter_cache) = (
self._compute_cos_sin_cache()
)
self.register_buffer("cos_sin_q_cache", q_cache, persistent=False)
self.register_buffer("cos_sin_qc_cache", qc_cache, persistent=False)
self.register_buffer("cos_sin_k_cache", k_cache, persistent=False)
self.register_buffer(
"cos_sin_qc_no_clamp_cache", qc_no_clamp_cache, persistent=False
)
self.register_buffer("cos_sin_q_inter_cache", q_inter_cache, persistent=False)
def _compute_inv_freq(self, base: float) -> torch.Tensor:
"""Compute the inverse frequency."""
# NOTE(woosuk): The HF implementation uses `torch.arange(...).float()`.
# However, we use `torch.arange(..., dtype=torch.float)` instead to
# avoid numerical issues with large base values (e.g., 10000000).
# This may cause a slight numerical difference between the HF
# implementation and ours.
# NOTE(woosuk): To exactly match the HF implementation, we need to
# use CPU to compute the cache and then move it to GPU. However, we
# create the cache on GPU for faster initialization. This may cause
# a slight numerical difference between the HF implementation and ours.
inv_freq = 1.0 / (
base
** (
torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim
)
)
return inv_freq
def _compute_cos_sin_cache(self) -> tuple[torch.Tensor, ...]:
"""Compute the cos and sin cache."""
inv_freq = self._compute_inv_freq(self.base)
chunk_len = self.chunk_size - self.local_size
q_t = torch.arange(chunk_len, dtype=torch.float)
qc_t = (torch.arange(chunk_len, dtype=torch.float) + chunk_len).clamp(
max=self.chunk_size
)
k_t = torch.arange(self.max_position_embeddings, dtype=torch.float) % chunk_len
# counts from chunk_len onward, without the clamp(max=self.chunk_size) restriction
qc_no_clamp_t = torch.arange(chunk_len, dtype=torch.float) + chunk_len
# count from self.chunk_size for q_inter's rope
q_inter_t = torch.arange(chunk_len, dtype=torch.float) + self.chunk_size
q_freqs = torch.outer(q_t, inv_freq)
qc_freqs = torch.outer(qc_t, inv_freq)
k_freqs = torch.outer(k_t, inv_freq)
qc_no_clamp_freqs = torch.outer(qc_no_clamp_t, inv_freq)
q_inter_freqs = torch.outer(q_inter_t, inv_freq)
q_cos = q_freqs.cos()
q_sin = q_freqs.sin()
qc_cos = qc_freqs.cos()
qc_sin = qc_freqs.sin()
k_cos = k_freqs.cos()
k_sin = k_freqs.sin()
qc_no_clamp_cos = qc_no_clamp_freqs.cos()
qc_no_clamp_sin = qc_no_clamp_freqs.sin()
q_inter_cos = q_inter_freqs.cos()
q_inter_sin = q_inter_freqs.sin()
q_cache = torch.cat((q_cos, q_sin), dim=-1).to(
dtype=self.dtype, device=self.device
)
qc_cache = torch.cat((qc_cos, qc_sin), dim=-1).to(
dtype=self.dtype, device=self.device
)
k_cache = torch.cat((k_cos, k_sin), dim=-1).to(
dtype=self.dtype, device=self.device
)
qc_no_clamp_cache = torch.cat((qc_no_clamp_cos, qc_no_clamp_sin), dim=-1).to(
dtype=self.dtype, device=self.device
)
q_inter_cache = torch.cat((q_inter_cos, q_inter_sin), dim=-1).to(
dtype=self.dtype, device=self.device
)
return q_cache, qc_cache, k_cache, qc_no_clamp_cache, q_inter_cache
def forward_native(
self,
positions: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor,
offsets: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
query = query.view(*query.shape[:-1], -1, self.head_size)
key = key.view(*key.shape[:-1], -1, self.head_size)
query_rot = query[..., : self.rotary_dim]
key_rot = key[..., : self.rotary_dim]
if self.rotary_dim < self.head_size:
query_pass = query[..., self.rotary_dim :]
key_pass = key[..., self.rotary_dim :]
else:
query_pass = None
key_pass = None
positions_with_offsets = (
torch.add(positions, offsets) if offsets is not None else positions
)
key = self._apply_rotary_embedding(
self.cos_sin_k_cache[positions_with_offsets], key_rot, key_pass
)
chunk_len = self.chunk_size - self.local_size
query = self._apply_rotary_embedding(
self.cos_sin_q_cache[positions_with_offsets % chunk_len],
query_rot,
query_pass,
)
query_succ = self._apply_rotary_embedding(
self.cos_sin_qc_cache[positions_with_offsets % chunk_len],
query_rot,
query_pass,
)
query_inter = self._apply_rotary_embedding(
self.cos_sin_qc_cache[chunk_len - 1].repeat(positions.shape[0], 1),
query_rot,
query_pass,
)
query_succ_critical = self._apply_rotary_embedding(
self.cos_sin_qc_no_clamp_cache[positions_with_offsets % chunk_len],
query_rot,
query_pass,
)
query_inter_critical = self._apply_rotary_embedding(
self.cos_sin_q_inter_cache[positions_with_offsets % chunk_len],
query_rot,
query_pass,
)
# merge query into one tensor to simplify the interfaces
query = torch.cat(
(
query,
query_succ,
query_inter,
query_succ_critical,
query_inter_critical,
),
dim=-1,
)
return query, key
def forward_cuda(
self,
positions: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor,
offsets: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
return self.forward_native(positions, query, key, offsets)
def _apply_rotary_embedding(self, cos_sin, hidden_rot, hidden_pass):
cos, sin = cos_sin.chunk(2, dim=-1)
if self.is_neox_style:
# NOTE(woosuk): Here we assume that the positions tensor has the
# shape [batch_size, seq_len].
cos = cos.repeat(1, 1, 2).unsqueeze(-2)
sin = sin.repeat(1, 1, 2).unsqueeze(-2)
else:
cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)
rotate_fn = rotate_neox if self.is_neox_style else rotate_gptj
hidden_rot = hidden_rot * cos + rotate_fn(hidden_rot) * sin
if self.rotary_dim < self.head_size:
hidden = torch.cat((hidden_rot, hidden_pass), dim=-1)
else:
hidden = hidden_rot
return hidden.flatten(-2).squeeze(0)
def extra_repr(self) -> str:
s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}"
s += f", max_position_embeddings={self.max_position_embeddings}"
s += f", base={self.base}, is_neox_style={self.is_neox_style}"
s += f", chunk_size={self.chunk_size}, local_size={self.local_size}"
return s
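def _example_dual_chunk_key_positions() -> None:
    # Illustrative sketch only (hypothetical sizes): key positions are folded
    # into a single chunk of length chunk_size - local_size, so the key cache
    # is only ever indexed with positions in [0, chunk_len).
    chunk_size, local_size, max_pos = 8192, 1024, 32768
    chunk_len = chunk_size - local_size
    k_t = torch.arange(max_pos, dtype=torch.float) % chunk_len
    assert float(k_t.max()) == chunk_len - 1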

View File

@@ -0,0 +1,43 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from .base import RotaryEmbedding
class DynamicNTKAlphaRotaryEmbedding(RotaryEmbedding):
"""RotaryEmbedding extended with Dynamic NTK alpha.
Based on the original RotaryEmbedding implementation.
"""
def __init__(
self,
head_size: int,
rotary_dim: int,
max_position_embeddings: int,
base: float,
is_neox_style: bool,
scaling_alpha: float,
dtype: torch.dtype,
) -> None:
self.scaling_alpha = scaling_alpha
super().__init__(
head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
)
def _compute_cos_sin_cache(self) -> torch.Tensor:
# For Hunyuan DynamicNTKAlphaRotaryEmbedding
max_len = self.max_position_embeddings
base = self.base * self.scaling_alpha ** (
self.rotary_dim / (self.rotary_dim - 2)
)
inv_freq = self._compute_inv_freq(base)
t = torch.arange(max_len, dtype=torch.float)
freqs = torch.einsum("i,j -> ij", t, inv_freq)
cos = freqs.cos()
sin = freqs.sin()
cache = torch.cat((cos, sin), dim=-1)
return cache
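def _example_ntk_alpha_base() -> None:
    # Illustrative sketch only (hypothetical values): the Hunyuan-style NTK
    # alpha scaling stretches every frequency by raising the base, while the
    # cache length stays at max_position_embeddings.
    base, rotary_dim, alpha = 10000.0, 128, 16.0
    effective_base = base * alpha ** (rotary_dim / (rotary_dim - 2))
    # ~1.67e5 here, i.e. slightly more than alpha * base.
    assert effective_base > base * alpha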

View File

@@ -0,0 +1,68 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from .base import RotaryEmbedding
class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding):
"""RotaryEmbedding extended with Dynamic NTK scaling.
Credits to the Reddit users /u/bloc97 and /u/emozilla
"""
def __init__(
self,
head_size: int,
rotary_dim: int,
max_position_embeddings: int,
base: float,
is_neox_style: bool,
scaling_factor: float,
dtype: torch.dtype,
) -> None:
self.scaling_factor = scaling_factor
super().__init__(
head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
)
def _compute_cos_sin_cache(self) -> torch.Tensor:
# NOTE(woosuk): self.max_position_embeddings is the original
# maximum length before applying the rope scaling.
# Thus, the maximum length after applying the rope scaling is
# self.max_position_embeddings * self.scaling_factor.
max_len = self.max_position_embeddings * self.scaling_factor
base = self.base * (
(self.scaling_factor * max_len / self.max_position_embeddings)
- (self.scaling_factor - 1)
) ** (self.rotary_dim / (self.rotary_dim - 2))
inv_freq = self._compute_inv_freq(base)
t = torch.arange(max_len, dtype=torch.float)
freqs = torch.einsum("i,j -> ij", t, inv_freq)
cos = freqs.cos()
sin = freqs.sin()
cache = torch.cat((cos, sin), dim=-1)
return cache
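def _example_dynamic_ntk_base() -> None:
    # Illustrative sketch only (hypothetical values): because the cache is
    # precomputed at the fully stretched length max_len = orig * factor, the
    # base adjustment simplifies to factor**2 - factor + 1 before the
    # rotary_dim / (rotary_dim - 2) exponent is applied.
    base, rotary_dim, factor, orig = 10000.0, 128, 4.0, 2048
    max_len = orig * factor
    adjusted = (factor * max_len / orig) - (factor - 1)
    assert adjusted == factor**2 - factor + 1
    effective_base = base * adjusted ** (rotary_dim / (rotary_dim - 2))
    assert effective_base > base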

View File

@@ -0,0 +1,82 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from .mrope import MRotaryEmbedding
class Ernie4_5_VLRotaryEmbedding(MRotaryEmbedding):
"""3D rotary positional embedding. 3D is t:time h:height w:width"""
def forward_native( # type: ignore[override]
self,
positions: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor | None]:
assert positions.ndim == 1 or positions.ndim == 2
assert key is not None
num_tokens = positions.shape[-1]
cos_sin = self.cos_sin_cache[positions]
cos, sin = cos_sin.chunk(2, dim=-1)
if positions.ndim == 2:
assert self.mrope_section
section_h = self.mrope_section[0] # 22
section_w = self.mrope_section[1] # 22
section_t = self.mrope_section[2] # 20
assert section_h == section_w
# Split according to [h w h w h w h w... t t t...]
section_cos_t = cos[..., -section_t:]
section_cos_h = cos[..., : section_h + section_w : 2]
section_cos_w = cos[..., 1 : section_h + section_w : 2]
cos_t, cos_h, cos_w = section_cos_t[0], section_cos_h[1], section_cos_w[2]
cos_hw = torch.stack([cos_h, cos_w], dim=-1).reshape(
cos_h.shape[:-1] + (cos_h.shape[-1] * 2,)
)
cos = torch.cat([cos_hw, cos_t], dim=-1)
section_sin_t = sin[..., -section_t:]
section_sin_h = sin[..., : section_h + section_w : 2]
section_sin_w = sin[..., 1 : section_h + section_w : 2]
sin_t, sin_h, sin_w = section_sin_t[0], section_sin_h[1], section_sin_w[2]
sin_hw = torch.stack([sin_h, sin_w], dim=-1).reshape(
sin_h.shape[:-1] + (sin_h.shape[-1] * 2,)
)
sin = torch.cat([sin_hw, sin_t], dim=-1)
query_shape = query.shape
query = query.view(num_tokens, -1, self.head_size)
query_rot = query[..., : self.rotary_dim]
query_pass = query[..., self.rotary_dim :]
query_rot = self.apply_rotary_emb.forward_native(
query_rot,
cos,
sin,
)
query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
key_shape = key.shape
key = key.view(num_tokens, -1, self.head_size)
key_rot = key[..., : self.rotary_dim]
key_pass = key[..., self.rotary_dim :]
key_rot = self.apply_rotary_emb.forward_native(
key_rot,
cos,
sin,
)
key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
return query, key
def forward_cuda( # type: ignore[override]
self,
positions: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor | None]:
return self.forward_native(positions, query, key)

View File

@@ -0,0 +1,199 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
import torch.nn.functional as F
from torch import nn
from vllm.distributed import (
get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size,
)
from .base import RotaryEmbedding
from .common import rotate_neox
class FourierRotaryEmbedding(RotaryEmbedding):
def __init__(
self,
head_size: int,
rotary_dim: int,
max_position_embeddings: int,
base: float,
is_neox_style: bool,
dtype: torch.dtype,
init_cache: bool,
# extra parameters for FoPE
num_key_value_heads: int,
num_inv_freq: int,
fope_sep_head: bool,
fope_init_factor: float,
):
# fope related parameters
self.num_key_value_heads = num_key_value_heads
self.num_inv_freq = num_inv_freq
self.fope_sep_head = fope_sep_head
self.fope_init_factor = fope_init_factor
super().__init__(
head_size=head_size,
rotary_dim=rotary_dim,
max_position_embeddings=max_position_embeddings,
base=base,
is_neox_style=is_neox_style,
dtype=dtype,
init_cache=init_cache,
)
# setup buffers and parameters
self.inv_freq: torch.Tensor
self.register_buffer(
"inv_freq", self._compute_inv_freq(self.base), persistent=False
)
self.input_dim = self.inv_freq.shape[-1]
self.output_dim = self.inv_freq.shape[-1]
self.cos_coef = nn.Parameter(
torch.empty(num_key_value_heads, self.input_dim, self.output_dim),
requires_grad=False,
)
self.sin_coef = nn.Parameter(
torch.empty(num_key_value_heads, self.input_dim, self.output_dim),
requires_grad=False,
)
self.sin_coef.weight_loader = self.weight_loader
self.cos_coef.weight_loader = self.weight_loader
self.cos_sin_cache: torch.Tensor
cache = self._compute_cos_sin_cache().to(dtype)
self.register_buffer("cos_sin_cache", cache, persistent=False)
# update cache in the first forward, where sin/cos_coef weights are ready
self.update_cache = True
def _compute_inv_freq(self, base: float) -> torch.Tensor:
"""Compute the inverse frequency."""
inv_freq = 1.0 / (
base
** (
torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim
)
)
inv_freq_idx_selected = torch.ones_like(inv_freq, dtype=torch.bool)
if self.num_inv_freq is not None:
inv_freq_idx_selected[self.num_inv_freq :] = False
else:
inv_freq_idx_selected = inv_freq > (
2.0 * torch.pi / self.max_position_embeddings
)
inv_freq = inv_freq[inv_freq_idx_selected]
return inv_freq
def _compute_cos_sin_cache(self) -> torch.Tensor:
"""Compute the cos and sin cache."""
device = self.inv_freq.device
t = torch.arange(self.max_position_embeddings, dtype=torch.float, device=device)
freqs = torch.einsum("j,i -> ji", t, self.inv_freq)
if self.fope_sep_head:
pos_cos = freqs.cos().unsqueeze(0).expand(self.num_key_value_heads, -1, -1)
pos_sin = freqs.sin().unsqueeze(0).expand(self.num_key_value_heads, -1, -1)
else:
pos_cos = freqs.cos()
pos_sin = freqs.sin()
if self.fope_sep_head:
sin = torch.einsum("htD, hDd -> thd", pos_sin, self.sin_coef.float())
cos = torch.einsum("htD, hDd -> thd", pos_cos, self.cos_coef.float())
else:
sin = torch.einsum("tD, Dd -> td", pos_sin, self.sin_coef.float())
cos = torch.einsum("tD, Dd -> td", pos_cos, self.cos_coef.float())
sin = F.pad(
input=sin,
pad=(0, self.head_size // 2 - sin.size(-1)),
mode="constant",
value=1,
)
cos = F.pad(
input=cos,
pad=(0, self.head_size // 2 - cos.size(-1)),
mode="constant",
value=1,
)
sin = torch.cat((sin, sin), dim=-1)
cos = torch.cat((cos, cos), dim=-1)
# cache: (max_position_embeddings, num_kv_heads, kv_size * 2)
cache = torch.cat((cos, sin), dim=-1)
return cache
def forward_native(
self,
positions: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor | None = None,
offsets: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor | None]:
# update cos/sin cache in the first forward
if self.update_cache:
cache = self._compute_cos_sin_cache().to(self.dtype)
self.cos_sin_cache.copy_(cache)
self.update_cache = False
positions = positions.flatten()
cos_sin = self.cos_sin_cache.index_select(0, positions)
cos, sin = cos_sin.chunk(2, dim=-1)
# apply rotary embedding
# query: (seq_len, num_heads, head_size)
# key: (seq_len, num_kv_heads, head_size)
query = query.unflatten(-1, (-1, self.head_size))
assert key is not None, "Key tensor is required for FoPE."
key = key.unflatten(-1, (-1, self.head_size))
assert query.dim() == key.dim() == 3, (
"Expected query key (seq_len, heads, head_dim)"
)
assert cos.dim() <= 3 and sin.dim() <= 3
need_reshape = False
if cos.dim() == 3:
# for fope
need_reshape = True
query_shape = query.shape
key_shape = key.shape
cos = cos.flatten(0, 1)
sin = sin.flatten(0, 1)
seq_len = cos.size(0)
query = query.view(seq_len, -1, query.size(-1))
key = key.view(seq_len, -1, key.size(-1))
# native implementation of apply rope for neox style
cos = cos.unsqueeze(1)
sin = sin.unsqueeze(1)
query = (query * cos) + (rotate_neox(query) * sin)
key = (key * cos) + (rotate_neox(key) * sin)
if need_reshape:
query = query.view(query_shape)
key = key.view(key_shape)
return query, key
def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
"""load fope weights"""
world_size = get_tensor_model_parallel_world_size()
rank = get_tensor_model_parallel_rank()
num_key_value_heads = loaded_weight.size(0)
if num_key_value_heads < world_size:
n_replicate = world_size // num_key_value_heads
world_size = num_key_value_heads
rank = rank // n_replicate
loaded_weight = loaded_weight.chunk(world_size, dim=0)[rank]
param.data.copy_(loaded_weight)

View File

@@ -0,0 +1,115 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from .base import RotaryEmbedding
class LinearScalingRotaryEmbedding(RotaryEmbedding):
"""RotaryEmbedding extended with linear scaling.
It supports multiple scaling factors. Since multiple LoRA adapters may have
different scaling factors, we need multiple cos/sin caches. In this way,
instead of running the rotary embedding kernel once per LoRA, we can run
multiple LoRAs in a batched way.
In addition to that, we also keep the cos/sin cache for the scaling factor
of 1 (default) at all times.
For example, for three scaling factors x=1, y, and z with embeddings
[[x11, x12, ... x1m], ..., [xn1, xn2, ..., xnm]] and
[[y11, y12, ... y1o], ..., [yn1, yn2, ..., yno]], and
[[z11, z12, ... z1p], ..., [zn1, zn2, ..., znp]],
we construct the cos/sin cache as follows:
[[x11, x12, ... x1m, y11, y12, ... y1o, z11, z12, ... z1p],
...
[xn1, xn2, ... xnm, yn1, yn2, ... yno, zn1, zn2, ... znp]]
We then use offsets to index into the cos/sin cache for
the respective scaling factors.
The offset to cache can be accessed via `scaling_factor_to_offset` API.
Credits to the Reddit user /u/kaiokendev
"""
def __init__(
self,
head_size: int,
rotary_dim: int,
max_position_embeddings: int,
base: float,
is_neox_style: bool,
scaling_factors: list[float] | float,
dtype: torch.dtype,
) -> None:
if isinstance(scaling_factors, float):
scaling_factors = [scaling_factors]
self.scaling_factors: list[float] = scaling_factors # noqa
super().__init__(
head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
)
# Lazy initialized.
self._scaling_factor_to_offset: dict[float, int]
def _compute_cos_sin_cache(self) -> torch.Tensor:
inv_freq = self._compute_inv_freq(self.base)
cache_list: list[torch.Tensor] = []
# offsets to the next cache in a tensor.
# Each offset corresponds to the same index in scaling_factors.
offsets: list[int] = []
for scaling_factor in self.scaling_factors:
# NOTE(woosuk): self.max_position_embeddings is the original
# maximum length before applying the rope scaling.
# Thus, the maximum length after applying the rope scaling is
# self.max_position_embeddings * self.scaling_factor.
max_len = self.max_position_embeddings * scaling_factor
t = torch.arange(max_len, dtype=torch.float)
t = t / scaling_factor
freqs = torch.einsum("i,j -> ij", t, inv_freq)
cos = freqs.cos()
sin = freqs.sin()
cache = torch.cat((cos, sin), dim=-1)
if not cache_list:
offset = 0
else:
last_offset = offsets[-1]
next_max_len = cache_list[-1].shape[0]
offset = last_offset + next_max_len
offsets.append(offset)
cache_list.append(cache)
self._scaling_factor_to_offset = {
float(scaling_factor): offsets[i]
for i, scaling_factor in enumerate(self.scaling_factors)
}
assert len(self.scaling_factors) == len(offsets)
return torch.cat(cache_list, dim=0)
@property
def scaling_factor_to_offset(self) -> dict[float, int]:
return self._scaling_factor_to_offset
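def _example_linear_scaling_offsets() -> None:
    # Illustrative sketch only (hypothetical factors, assuming the usual vLLM
    # runtime context for CustomOp): the per-factor caches are concatenated
    # along the position dimension, and scaling_factor_to_offset records where
    # each one starts.
    rope = LinearScalingRotaryEmbedding(
        head_size=8,
        rotary_dim=8,
        max_position_embeddings=16,
        base=10000.0,
        is_neox_style=True,
        scaling_factors=[1.0, 4.0],
        dtype=torch.float32,
    )
    # 16 positions for factor 1.0 followed by 16 * 4 positions for factor 4.0.
    assert rope.cos_sin_cache.shape[0] == 16 + 64
    assert rope.scaling_factor_to_offset == {1.0: 0, 4.0: 16}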

View File

@@ -0,0 +1,54 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
import torch
from .base import RotaryEmbedding
class Llama3RotaryEmbedding(RotaryEmbedding):
def __init__(
self,
head_size: int,
rotary_dim: int,
max_position_embeddings: int,
base: float,
is_neox_style: bool,
dtype: torch.dtype,
scaling_factor: float,
low_freq_factor: float,
high_freq_factor: float,
orig_max_position: int,
) -> None:
self.scaling_factor = scaling_factor
self.low_freq_factor = low_freq_factor
self.high_freq_factor = high_freq_factor
self.orig_max_position = orig_max_position
super().__init__(
head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
)
def _compute_inv_freq(self, base: float) -> torch.Tensor:
inv_freqs = super()._compute_inv_freq(base)
low_freq_wavelen = self.orig_max_position / self.low_freq_factor
high_freq_wavelen = self.orig_max_position / self.high_freq_factor
wave_len = 2 * math.pi / inv_freqs
if self.low_freq_factor != self.high_freq_factor:
smooth = (self.orig_max_position / wave_len - self.low_freq_factor) / (
self.high_freq_factor - self.low_freq_factor
)
else:
smooth = 0
new_freqs = torch.where(
wave_len < high_freq_wavelen,
inv_freqs,
torch.where(
wave_len > low_freq_wavelen,
inv_freqs / self.scaling_factor,
(1 - smooth) * inv_freqs / self.scaling_factor + smooth * inv_freqs,
),
)
return new_freqs
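def _example_llama3_wavelength_bands() -> None:
    # Illustrative sketch only (hypothetical values): wavelengths shorter than
    # orig_max_position / high_freq_factor keep their original frequency,
    # wavelengths longer than orig_max_position / low_freq_factor are divided
    # by scaling_factor, and the band in between is linearly interpolated.
    orig_max_position, low_freq_factor, high_freq_factor = 8192, 1.0, 4.0
    low_freq_wavelen = orig_max_position / low_freq_factor
    high_freq_wavelen = orig_max_position / high_freq_factor
    assert (high_freq_wavelen, low_freq_wavelen) == (2048.0, 8192.0)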

View File

@@ -0,0 +1,83 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
import torch
from .base import RotaryEmbeddingBase
class Llama4VisionRotaryEmbedding(RotaryEmbeddingBase):
def __init__(
self,
head_size: int,
rotary_dim: int,
max_position_embeddings: int,
base: float,
is_neox_style: bool,
dtype: torch.dtype,
):
super().__init__(
head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
)
def _compute_inv_freq(self, base: float) -> torch.Tensor:
inv_freqs = super()._compute_inv_freq(base)
inv_freqs = inv_freqs[: (self.rotary_dim // 2)]
return inv_freqs
def _compute_cos_sin_cache(self) -> torch.Tensor:
inv_freq = self._compute_inv_freq(self.base)
# self.max_position_embeddings here is number of image patches
# i.e. (image_size // patch_size) ** 2
num_patches = self.max_position_embeddings
img_idx = torch.arange(num_patches, dtype=torch.int32).reshape(num_patches, 1)
img_idx = torch.cat([img_idx, img_idx[:1]], dim=0)
img_idx[-1, -1] = -2 # set to ID_CLS_TOKEN
num_patches_single_dim = int(math.sqrt(num_patches))
frequencies_x = img_idx % num_patches_single_dim
frequencies_y = img_idx // num_patches_single_dim
freqs_x = (
(frequencies_x + 1)[..., None] * inv_freq[None, None, :]
).repeat_interleave(2, dim=-1)
freqs_y = (
(frequencies_y + 1)[..., None] * inv_freq[None, None, :]
).repeat_interleave(2, dim=-1)
freqs = torch.cat([freqs_x, freqs_y], dim=-1).float().contiguous()[..., ::2]
freqs = freqs.masked_fill(img_idx.reshape(-1, 1, 1) < 0, 0)
cache = torch.view_as_complex(
torch.stack([torch.cos(freqs), torch.sin(freqs)], dim=-1)
)
return cache
def forward_native( # type: ignore[override]
self,
query: torch.Tensor,
key: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor | None]:
assert key is not None
# self.cos_sin_cache here is a complex tensor, so we cannot cast it to the
# query's dtype directly with self._match_cos_sin_cache_dtype.
# NOTE: by not writing the device-converted cache back to self, we avoid a
# buffer update that is costly at runtime.
cos_sin_cache: torch.Tensor = self.cos_sin_cache.to(query.device)
query_ = torch.view_as_complex(query.float().reshape(*query.shape[:-1], -1, 2))
key_ = torch.view_as_complex(key.float().reshape(*key.shape[:-1], -1, 2))
broadcast_shape = [
d if i == 1 or i == (query_.ndim - 1) else 1
for i, d in enumerate(query_.shape)
]
freqs_ci = cos_sin_cache.view(*broadcast_shape)
query_out = torch.view_as_real(query_ * freqs_ci).flatten(3)
key_out = torch.view_as_real(key_ * freqs_ci).flatten(3)
return query_out.type_as(query), key_out.type_as(key)
def forward_cuda( # type: ignore[override]
self,
query: torch.Tensor,
key: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor | None]:
return self.forward_native(query, key)

View File

@@ -0,0 +1,414 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import numpy as np
import torch
from vllm.triton_utils import tl, triton
from .base import RotaryEmbeddingBase
from .yarn_scaling_rope import YaRNScalingRotaryEmbedding, yarn_get_mscale
@triton.jit
def _triton_mrope_forward(
q_ptr,
k_ptr,
cos,
sin,
num_tokens,
n_qh: tl.constexpr,
n_kh: tl.constexpr,
hd: tl.constexpr,
rd: tl.constexpr,
pad_n_qh: tl.constexpr,
pad_n_kh: tl.constexpr,
pad_hd: tl.constexpr,
mrope_section_t: tl.constexpr,
mrope_section_h: tl.constexpr,
mrope_section_w: tl.constexpr,
is_interleaved: tl.constexpr,
):
# Adapted from
# https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/ops/qwen2vl_mrope.py
# This version supports flattened input tensors from vLLM
# and supports cos and sin cache with shape (3, num_tokens, head_dim // 2)
# instead of (3, bsz, seq_len, head_dim), also supports interleaved rotary
pid = tl.program_id(0)
# locate start address
q_ptr = q_ptr + pid * (n_qh * hd)
k_ptr = k_ptr + pid * (n_kh * hd)
# ####################################################################
# get the cos(mθ_{i...d/2}) and sin(mθ_{i...d/2}) for token position
# m of this program instance
# ####################################################################
# Note: cos and sin now have shape (3, num_tokens, head_dim // 2)
# Updated stride calculation for half head_dim
half_rd = rd // 2
t_cos = cos + pid * half_rd
h_cos = t_cos + num_tokens * half_rd
w_cos = h_cos + num_tokens * half_rd
t_sin = sin + pid * half_rd
h_sin = t_sin + num_tokens * half_rd
w_sin = h_sin + num_tokens * half_rd
# Updated offsets for half head_dim
cos_offsets = tl.arange(0, pad_hd // 2)
if is_interleaved:
h_mask = ((cos_offsets % 3) == 1) & (cos_offsets <= 3 * mrope_section_h)
w_mask = ((cos_offsets % 3) == 2) & (cos_offsets <= 3 * mrope_section_w)
t_mask = ~(h_mask | w_mask)
else:
t_end = mrope_section_t
h_end = t_end + mrope_section_h
t_mask = cos_offsets < mrope_section_t
h_mask = (t_end <= cos_offsets) & (cos_offsets < h_end)
w_mask = (h_end <= cos_offsets) & (cos_offsets < half_rd)
t_cos_row = tl.load(t_cos + cos_offsets, mask=t_mask, other=0)
h_cos_row = tl.load(h_cos + cos_offsets, mask=h_mask, other=0)
w_cos_row = tl.load(w_cos + cos_offsets, mask=w_mask, other=0)
t_sin_row = tl.load(t_sin + cos_offsets, mask=t_mask, other=0)
h_sin_row = tl.load(h_sin + cos_offsets, mask=h_mask, other=0)
w_sin_row = tl.load(w_sin + cos_offsets, mask=w_mask, other=0)
cos_row = t_cos_row + h_cos_row + w_cos_row
sin_row = t_sin_row + h_sin_row + w_sin_row
# ####################################################################
# Load the left and right half of q and k for the current
# program instance (i.e. for the current token) separately
# ####################################################################
# left half of the head
first_half_q_offsets = (
tl.arange(0, pad_n_qh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
)
first_half_k_offsets = (
tl.arange(0, pad_n_kh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
)
first_q_mask = (tl.arange(0, pad_n_qh)[:, None] < n_qh) & (
tl.arange(0, pad_hd // 2)[None, :] < rd // 2
)
first_k_mask = (tl.arange(0, pad_n_kh)[:, None] < n_kh) & (
tl.arange(0, pad_hd // 2)[None, :] < rd // 2
)
q_tile_1 = tl.load(q_ptr + first_half_q_offsets, mask=first_q_mask, other=0).to(
sin_row.dtype
)
k_tile_1 = tl.load(k_ptr + first_half_k_offsets, mask=first_k_mask, other=0).to(
sin_row.dtype
)
# right half of the head
second_half_q_offsets = first_half_q_offsets + (rd // 2)
second_half_k_offsets = first_half_k_offsets + (rd // 2)
second_q_mask = first_q_mask
second_k_mask = first_k_mask
q_tile_2 = tl.load(q_ptr + second_half_q_offsets, mask=second_q_mask, other=0).to(
sin_row.dtype
)
k_tile_2 = tl.load(k_ptr + second_half_k_offsets, mask=second_k_mask, other=0).to(
sin_row.dtype
)
# y = [x1, x2] * [cos, cos] + [-x2, x1] * [sin, sin]
# Since cos and sin are now half-size,
# we use the same cos_row and sin_row for both halves
new_q_tile_1 = q_tile_1 * cos_row - q_tile_2 * sin_row
tl.store(q_ptr + first_half_q_offsets, new_q_tile_1, mask=first_q_mask)
new_q_tile_2 = q_tile_2 * cos_row + q_tile_1 * sin_row
tl.store(q_ptr + second_half_q_offsets, new_q_tile_2, mask=second_q_mask)
new_k_tile_1 = k_tile_1 * cos_row - k_tile_2 * sin_row
tl.store(k_ptr + first_half_k_offsets, new_k_tile_1, mask=first_k_mask)
new_k_tile_2 = k_tile_2 * cos_row + k_tile_1 * sin_row
tl.store(k_ptr + second_half_k_offsets, new_k_tile_2, mask=second_k_mask)
def triton_mrope(
q: torch.Tensor,
k: torch.Tensor,
cos: torch.Tensor,
sin: torch.Tensor,
mrope_section: list[int],
head_size: int,
rotary_dim: int,
mrope_interleaved: bool,
) -> tuple[torch.Tensor, torch.Tensor]:
"""Qwen2VL mrope kernel.
Args:
q: [num_tokens, num_heads * head_size]
k: [num_tokens, num_kv_heads * head_size]
cos: [3, num_tokens, head_size // 2]
(T/H/W positions with multimodal inputs)
sin: [3, num_tokens, head_size // 2]
(T/H/W positions with multimodal inputs)
mrope_section: [t, h, w]
head_size: int
rotary_dim: int
mrope_interleaved: whether the cos/sin cache layout is interleaved
"""
n_row, n_q_head_head_dim = q.shape
n_q_head = n_q_head_head_dim // head_size
n_kv_head = k.shape[1] // head_size
pad_hd = triton.next_power_of_2(head_size)
pad_n_q_head = triton.next_power_of_2(n_q_head)
pad_n_kv_head = triton.next_power_of_2(n_kv_head)
# ensure tensors passed into the kernel are contiguous.
# It is a no-op if they are already contiguous
q = q.contiguous()
k = k.contiguous()
cos = cos.contiguous()
sin = sin.contiguous()
_triton_mrope_forward[(n_row,)](
q,
k,
cos,
sin,
n_row,
n_q_head,
n_kv_head,
head_size,
rotary_dim,
pad_n_q_head,
pad_n_kv_head,
pad_hd,
mrope_section[0],
mrope_section[1],
mrope_section[2],
mrope_interleaved,
)
return q, k
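# Illustrative use of the wrapper above (a minimal sketch, not part of the
# module; the shapes, dtype, device and the mrope_section values are
# assumptions chosen only for demonstration):
#   num_tokens, n_qh, n_kh, head_size = 8, 4, 2, 128
#   q = torch.randn(num_tokens, n_qh * head_size, device="cuda", dtype=torch.float16)
#   k = torch.randn(num_tokens, n_kh * head_size, device="cuda", dtype=torch.float16)
#   # cos/sin gathered from the cache for T/H/W positions: [3, num_tokens, head_size // 2]
#   cos = torch.randn(3, num_tokens, head_size // 2, device="cuda", dtype=torch.float16)
#   sin = torch.randn(3, num_tokens, head_size // 2, device="cuda", dtype=torch.float16)
#   q, k = triton_mrope(q, k, cos, sin, [16, 24, 24], head_size, head_size, False)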
def apply_interleaved_rope(x: torch.Tensor, mrope_section: list[int]) -> torch.Tensor:
"""Apply interleaved MRoPE to 3D rotary embeddings.
Reorganizes frequency layout from chunked [TTT...HHH...WWW] to
interleaved [THWTHWTHW...TT], preserving frequency continuity.
"""
x_t = x[0].clone()
x_t[..., 1 : mrope_section[1] * 3 : 3] = x[1, ..., 1 : mrope_section[1] * 3 : 3]
x_t[..., 2 : mrope_section[2] * 3 : 3] = x[2, ..., 2 : mrope_section[2] * 3 : 3]
return x_t
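# Worked example for apply_interleaved_rope (a sketch with an assumed
# mrope_section of [4, 2, 2], i.e. an 8-dim half-rotary axis): H frequencies
# from x[1] land at indices 1 and 4, W frequencies from x[2] at indices 2 and
# 5, and the remaining indices 0, 3, 6, 7 keep the T frequencies from x[0],
# giving the layout [T, H, W, T, H, W, T, T] along the last dimension.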
class MRotaryEmbedding(RotaryEmbeddingBase):
"""Rotary Embedding with Multimodal Sections."""
def __init__(
self,
head_size: int,
rotary_dim: int,
max_position_embeddings: int,
base: float,
is_neox_style: bool,
dtype: torch.dtype,
mrope_section: list[int] | None = None,
mrope_interleaved: bool = False,
# YaRN parameters.
*,
scaling_factor: float | None = None,
extrapolation_factor: float = 1,
attn_factor: float = 1,
beta_fast: int = 32,
beta_slow: int = 1,
truncate: bool = True,
) -> None:
self.scaling_factor = scaling_factor
self.extrapolation_factor = extrapolation_factor
self.attn_factor = attn_factor
self.beta_fast = beta_fast
self.beta_slow = beta_slow
self.truncate = truncate
if self.scaling_factor is not None:
# Get n-d magnitude scaling corrected for interpolation
self.mscale = float(yarn_get_mscale(self.scaling_factor) * attn_factor)
else:
self.mscale = 1.0
# In Qwen2.5-VL, the maximum index value is related to the duration of
# the input video. We enlarge max_position_embeddings to 4 times to get
# a larger cos and sin cache.
self.cache_max_position_num = max_position_embeddings * 4
super().__init__(
head_size,
rotary_dim,
self.cache_max_position_num,
base,
is_neox_style,
dtype,
)
self.mrope_section = mrope_section
self.mrope_interleaved = mrope_interleaved
if self.mrope_section:
assert sum(self.mrope_section) == rotary_dim // 2
def _compute_inv_freq(self, base: float) -> torch.Tensor:
if self.scaling_factor is None:
return super()._compute_inv_freq(base)
return YaRNScalingRotaryEmbedding._compute_inv_freq(self, base)
def _compute_cos_sin_cache(self) -> torch.Tensor:
if self.scaling_factor is None:
return super()._compute_cos_sin_cache()
return YaRNScalingRotaryEmbedding._compute_cos_sin_cache(self)
def forward_native(
self,
positions: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor | None = None,
offsets: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor | None]:
"""PyTorch-native implementation equivalent to forward().
Args:
positions:
[num_tokens,] (text only) or
[3, num_tokens] (T/H/W positions with multimodal inputs)
query: [num_tokens, num_heads * head_size]
key: [num_tokens, num_kv_heads * head_size]
"""
assert positions.ndim == 1 or positions.ndim == 2
assert key is not None
cos_sin_cache = self._match_cos_sin_cache_dtype(query)
num_tokens = positions.shape[-1]
cos_sin = cos_sin_cache[positions]
cos, sin = cos_sin.chunk(2, dim=-1)
if positions.ndim == 2:
assert self.mrope_section
if self.mrope_interleaved:
cos = apply_interleaved_rope(cos, self.mrope_section)
sin = apply_interleaved_rope(sin, self.mrope_section)
else:
cos = torch.cat(
[m[i] for i, m in enumerate(cos.split(self.mrope_section, dim=-1))],
dim=-1,
)
sin = torch.cat(
[m[i] for i, m in enumerate(sin.split(self.mrope_section, dim=-1))],
dim=-1,
)
query_shape = query.shape
query = query.view(num_tokens, -1, self.head_size)
query_rot = query[..., : self.rotary_dim]
query_pass = query[..., self.rotary_dim :]
query_rot = self.apply_rotary_emb.forward_native(
query_rot,
cos,
sin,
)
query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
key_shape = key.shape
key = key.view(num_tokens, -1, self.head_size)
key_rot = key[..., : self.rotary_dim]
key_pass = key[..., self.rotary_dim :]
key_rot = self.apply_rotary_emb.forward_native(
key_rot,
cos,
sin,
)
key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
return query, key
def forward_cuda(
self,
positions: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor | None = None,
offsets: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor | None]:
assert positions.ndim == 1 or positions.ndim == 2
assert key is not None
cos_sin_cache = self._match_cos_sin_cache_dtype(query)
num_tokens = positions.shape[-1]
cos_sin = cos_sin_cache[positions]
cos, sin = cos_sin.chunk(2, dim=-1)
query_shape = query.shape
key_shape = key.shape
if positions.ndim == 2:
assert self.mrope_section
q, k = triton_mrope(
query,
key,
cos,
sin,
self.mrope_section,
self.head_size,
self.rotary_dim,
self.mrope_interleaved,
)
return q.reshape(query_shape), k.reshape(key_shape)
query = query.view(num_tokens, -1, self.head_size)
query_rot = query[..., : self.rotary_dim]
query_pass = query[..., self.rotary_dim :]
query_rot = self.apply_rotary_emb(
query_rot,
cos,
sin,
)
query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
key = key.view(num_tokens, -1, self.head_size)
key_rot = key[..., : self.rotary_dim]
key_pass = key[..., self.rotary_dim :]
key_rot = self.apply_rotary_emb(
key_rot,
cos,
sin,
)
key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
return query, key
def forward_cpu(
self,
positions: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor | None = None,
offsets: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor | None]:
return self.forward_native(positions, query, key, offsets)
@staticmethod
def get_next_input_positions(
mrope_position_delta: int,
context_len: int,
seq_len: int,
) -> list[list[int]]:
return [
list(
range(
context_len + mrope_position_delta, seq_len + mrope_position_delta
)
)
for _ in range(3)
]
@staticmethod
def get_next_input_positions_tensor(
out: np.ndarray,
out_offset: int,
mrope_position_delta: int,
context_len: int,
num_new_tokens: int,
):
values = np.arange(
mrope_position_delta + context_len,
mrope_position_delta + context_len + num_new_tokens,
dtype=out.dtype,
)
out[:, out_offset : out_offset + num_new_tokens] = values
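# Illustrative call (a sketch; the delta and lengths are made-up values):
#   MRotaryEmbedding.get_next_input_positions(
#       mrope_position_delta=5, context_len=10, seq_len=12)
#   -> [[15, 16], [15, 16], [15, 16]]   # identical T/H/W positions for text-only decode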

View File

@@ -0,0 +1,185 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Adapted from vllm/model_executor/layers/rotary_embedding/__init__.py
# Copyright 2023 The vLLM team.
#
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from .mrope import MRotaryEmbedding
# MRotaryEmbedding with interleaved
class MRotaryEmbeddingInterleaved(MRotaryEmbedding):
"""Rotary Embedding with Multimodal Sections and Interleaved Support."""
def __init__(
self,
head_size: int,
rotary_dim: int,
max_position_embeddings: int,
base: float,
is_neox_style: bool,
dtype: torch.dtype,
mrope_section: list[int],
mrope_interleaved: bool = True,
) -> None:
# The parent MRotaryEmbedding __init__ enlarges this (4x) when building the cos/sin cache
self.cache_max_position_num = max_position_embeddings
super().__init__(
head_size,
rotary_dim,
self.cache_max_position_num,
base,
is_neox_style,
dtype,
)
self.mrope_section = mrope_section
self.mrope_interleaved = mrope_interleaved
if self.mrope_section is None:
raise ValueError("mrope_section cannot be None.")
if sum(self.mrope_section) != rotary_dim // 2:
raise ValueError("Sum of mrope_section must equal rotary_dim // 2.")
if not self.mrope_interleaved:
raise ValueError(
"mrope_interleaved must be True when mrope_section is provided."
)
# Generate interleaved indices
if len(mrope_section) == 2:
h_num, w_num = mrope_section[0], mrope_section[1]
mrope_dim = self.get_mrope_interleaved_id_list(h_num, w_num, 0)
elif len(mrope_section) == 3:
t_num, h_num, w_num = mrope_section[0], mrope_section[1], mrope_section[2]
mrope_dim = self.get_mrope_interleaved_id_list(
t_num, h_num, w_num, force_last=True
)
else:
raise AssertionError(
"Cannot support the length of mrope section is not 2 or 3."
)
mrope_dim = mrope_dim * 2
self.mrope_dim = mrope_dim
self.layer_cache = None
def _rebuild_pos_emb(
self,
positions: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
"""Interleave the rotary embedding"""
cos_sin = self.cos_sin_cache[positions]
mrope_section_3d = [1] * len(self.mrope_dim)
mrope_dim = self.mrope_dim
cos_sin = torch.cat(
[
m[mrope_dim[i]]
for i, m in enumerate(cos_sin.split(mrope_section_3d, dim=-1))
],
dim=-1,
)
return cos_sin, torch.arange(cos_sin.shape[0], device=positions.device)
def forward(
self,
positions: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor | None]:
"""Forward pass with interleaved rotary embedding."""
cos_sin, positions = self._rebuild_pos_emb(positions)
cos, sin = cos_sin.chunk(2, dim=-1)
query_shape = query.shape
positions = positions.flatten()
num_tokens = positions.shape[0]
query = query.view(num_tokens, -1, self.head_size)
query_rot = query[..., : self.rotary_dim]
query_pass = query[..., self.rotary_dim :]
query_rot = self.apply_rotary_emb.forward_native(
query_rot,
cos,
sin,
)
query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
# key may be None in some cases, e.g. cross-layer KV sharing
if key is not None:
key_shape = key.shape
key = key.view(num_tokens, -1, self.head_size)
key_rot = key[..., : self.rotary_dim]
key_pass = key[..., self.rotary_dim :]
key_rot = self.apply_rotary_emb.forward_native(
key_rot,
cos,
sin,
)
key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
return query, key
@staticmethod
def get_mrope_interleaved_id_list(
a: int, b: int, c: int, force_last: bool = False
) -> list[int]:
"""
Generate an interleaved list of indices for multi-modal rotary embedding.
Args:
a: Number of indices for first modality
b: Number of indices for second modality
c: Number of indices for third modality
force_last: Whether to force the last element to be from the first modality
Returns:
List of interleaved indices
"""
if force_last:
a -= 1
counts = {0: a, 1: b, 2: c}
placed = {k: 0 for k in counts}
rem = counts.copy()
seq: list[int] = []
last = None
total = a + b + c
for _ in range(total):
# Candidates: remaining > 0 and ≠ last
cands = [k for k in rem if rem[k] > 0 and k != last]
if not cands:
# If only last remains, relax the condition
cands = [k for k in rem if rem[k] > 0]
# Select the rarest candidate
try:
best = min(cands, key=lambda k: (placed[k] / counts[k], k))
except KeyError:
best = 0
seq.append(best)
placed[best] += 1
rem[best] -= 1
last = best
if force_last:
seq.append(0)
return seq
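# Worked example (a sketch with assumed section sizes): for mrope_section
# [2, 1, 1] the constructor calls get_mrope_interleaved_id_list(2, 1, 1,
# force_last=True), which reserves the final slot for the first modality and
# interleaves the rest as evenly as possible, producing [0, 1, 2, 0]; the
# constructor then repeats this list to [0, 1, 2, 0, 0, 1, 2, 0] so it covers
# both the cos and sin halves of the cache.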

View File

@@ -0,0 +1,47 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from .base import RotaryEmbedding
class NTKScalingRotaryEmbedding(RotaryEmbedding):
"""RotaryEmbedding extended with fixed and mixed NTK scaling.
https://kexue.fm/archives/9706"""
def __init__(
self,
head_size: int,
rotary_dim: int,
max_position_embeddings: int,
base: float,
is_neox_style: bool,
scaling_factor: float,
dtype: torch.dtype,
mixed_b: float | None = None,
) -> None:
self.scaling_factor = scaling_factor
self.mixed_b = mixed_b
super().__init__(
head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
)
def _compute_inv_freq(self, base: float) -> torch.Tensor:
base = self.base * (self.scaling_factor if self.mixed_b is None else 1)
inv_freq = super()._compute_inv_freq(base)
if self.mixed_b is None:
inv_freq = inv_freq / self.scaling_factor ** (2 / self.rotary_dim)
else:
a = (
torch.tensor(self.scaling_factor).log()
/ (self.rotary_dim / 2) ** self.mixed_b
)
lambda_1_m = (
a * torch.arange(1, self.rotary_dim // 2 + 1).float() ** self.mixed_b
).exp()
inv_freq = inv_freq / lambda_1_m
return inv_freq
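# Worked form of the fixed NTK case above (mixed_b is None), assuming the base
# class computes inv_freq_i = base ** (-2i / d) as in standard RoPE, with
# d = rotary_dim and s = scaling_factor:
#   inv_freq_i = (base * s) ** (-2i / d) / s ** (2 / d)
#              = base ** (-2i / d) * s ** (-(2i + 2) / d)
# so the highest frequency (i = 0) is scaled only by s ** (-2 / d), while the
# lowest frequency (i = d/2 - 1) is divided by the full factor s.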

View File

@@ -0,0 +1,159 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
import torch
import torch.nn as nn
from vllm.config import get_current_vllm_config
from vllm.logger import init_logger
from .common import rotate_neox
logger = init_logger(__name__)
class Phi3LongRoPEScaledRotaryEmbedding(nn.Module):
"""Phi3 family of models scaled rotary embedding.
Based on the original RotaryEmbedding implementation.
"""
def __init__(
self,
head_size: int,
rotary_dim: int,
max_position_embeddings: int,
original_max_position_embeddings: int,
base: float,
is_neox_style: bool,
dtype: torch.dtype,
short_factor: list[float],
long_factor: list[float],
short_mscale: float | None = None,
long_mscale: float | None = None,
):
super().__init__()
if is_neox_style is False:
raise ValueError(
"`Phi3LongRoPEScaledRotaryEmbedding` only supports neox_style."
)
self.rotary_dim = rotary_dim
self.head_size = head_size
self.max_position_embeddings = max_position_embeddings
self.original_max_position_embeddings = original_max_position_embeddings
self.base = base
self.short_factor = short_factor
self.long_factor = long_factor
# Force long factors if max_model_len (runtime max length) exceeds
# original_max_position_embeddings to prevent KV cache invalidation when
# sequences cross this threshold during generation
max_model_len = get_current_vllm_config().model_config.max_model_len
self.use_long_rope = max_model_len > original_max_position_embeddings
if self.use_long_rope:
logger.warning_once(
"Using LongRoPE scaling factors. This enables longer "
"contexts (%d tokens vs original %d tokens) at the cost of "
"some performance degradation for shorter sequences. If "
"this is not desired, set `max_model_len` to be at most %d.",
max_position_embeddings,
original_max_position_embeddings,
original_max_position_embeddings,
)
scale = self.max_position_embeddings / self.original_max_position_embeddings
if scale <= 1.0:
scaling_factor = 1.0
else:
scaling_factor = math.sqrt(
1 + math.log(scale) / math.log(self.original_max_position_embeddings)
)
if short_mscale is None:
short_mscale = scaling_factor
if long_mscale is None:
long_mscale = scaling_factor
self.short_mscale = short_mscale
self.long_mscale = long_mscale
short_cache = self._compute_cos_sin_cache(
original_max_position_embeddings, short_factor, short_mscale
)
short_cache = short_cache.to(dtype)
long_cache = self._compute_cos_sin_cache(
max_position_embeddings, long_factor, long_mscale
)
long_cache = long_cache.to(dtype)
long_short_cache = torch.cat([short_cache, long_cache], dim=0)
self.register_buffer(
"long_short_cos_sin_cache", long_short_cache, persistent=False
)
def _compute_inv_freq(self, rescale_factors: list[float]) -> torch.Tensor:
rescale_factors = torch.tensor(rescale_factors, dtype=torch.float32)
inv_freq = 1.0 / (
rescale_factors
* (
self.base
** (
torch.arange(0, self.rotary_dim, 2, dtype=torch.float)
/ self.rotary_dim
)
)
)
return inv_freq
def _compute_cos_sin_cache(
self,
max_position_embeddings: int,
rescale_factors: list[float],
mscale: float,
) -> torch.Tensor:
inv_freq = self._compute_inv_freq(rescale_factors)
t = torch.arange(max_position_embeddings, dtype=torch.float)
freqs = torch.einsum("i,j -> ij", t, inv_freq)
cos = freqs.cos() * mscale
sin = freqs.sin() * mscale
cache = torch.cat((cos, sin), dim=-1)
return cache
def forward(
self,
positions: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor | None = None,
offsets: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor | None]:
assert key is not None
query = query.view(*query.shape[:-1], -1, self.head_size)
key = key.view(*key.shape[:-1], -1, self.head_size)
if self.use_long_rope:
k = self.original_max_position_embeddings
long_prompt_offset = torch.full_like(positions, k).long()
idx = torch.add(positions, long_prompt_offset)
else:
idx = positions
idx = torch.add(idx, offsets) if offsets is not None else idx
cos_sin = torch.index_select(self.long_short_cos_sin_cache, 0, idx)
cos, sin = cos_sin.chunk(2, dim=-1)
cos = cos.repeat(1, 2).unsqueeze(-2)
sin = sin.repeat(1, 2).unsqueeze(-2)
query_rot = query[..., : self.rotary_dim]
query_pass = query[..., self.rotary_dim :]
query_rot = query_rot * cos + rotate_neox(query_rot) * sin
query = torch.cat((query_rot, query_pass), dim=-1)
key_rot = key[..., : self.rotary_dim]
key_pass = key[..., self.rotary_dim :]
key_rot = key_rot * cos + rotate_neox(key_rot) * sin
key = torch.cat((key_rot, key_pass), dim=-1)
return query.flatten(-2), key.flatten(-2)
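# Cache layout note (illustrative; the 4096 below is an assumed
# original_max_position_embeddings): long_short_cos_sin_cache stacks the
# short-factor rows [0, 4096) in front of the long-factor rows
# [4096, 4096 + max_position_embeddings), so when use_long_rope is set,
# position p is looked up at index p + 4096.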

View File

@@ -0,0 +1,160 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import numpy as np
import torch
from .dynamic_ntk_alpha_rope import DynamicNTKAlphaRotaryEmbedding
class XDRotaryEmbedding(DynamicNTKAlphaRotaryEmbedding):
"""DynamicNTKAlphaRotaryEmbedding extended with MultiModal(XD) Sections.
Based on the original DynamicNTKAlphaRotaryEmbedding implementation.
"""
def __init__(
self,
head_size: int,
rotary_dim: int,
max_position_embeddings: int,
base: float,
is_neox_style: bool,
scaling_alpha: float,
dtype: torch.dtype,
xdrope_section: list[int],
) -> None:
self.xdrope_section = xdrope_section
super().__init__(
head_size,
rotary_dim,
max_position_embeddings,
base,
is_neox_style,
scaling_alpha,
dtype,
)
def forward_native(
self,
positions: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor | None = None,
offsets: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor | None]:
"""PyTorch-native implementation equivalent to forward().
Args:
positions:
[4, num_tokens] (P/W/H/T positions with multimodal inputs)
query: [num_tokens, num_heads * head_size]
key: [num_tokens, num_kv_heads * head_size]
"""
assert positions.ndim == 2
assert key is not None
num_tokens = positions.shape[-1]
cos_sin = self.cos_sin_cache[positions]
cos, sin = cos_sin.chunk(2, dim=-1)
cos = torch.cat(
[m[i] for i, m in enumerate(cos.split(self.xdrope_section, dim=-1))], dim=-1
)
sin = torch.cat(
[m[i] for i, m in enumerate(sin.split(self.xdrope_section, dim=-1))], dim=-1
)
query_shape = query.shape
query = query.view(num_tokens, -1, self.head_size)
query_rot = query[..., : self.rotary_dim]
query_pass = query[..., self.rotary_dim :]
query_rot = self.apply_rotary_emb.forward_native(
query_rot,
cos,
sin,
)
query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
key_shape = key.shape
key = key.view(num_tokens, -1, self.head_size)
key_rot = key[..., : self.rotary_dim]
key_pass = key[..., self.rotary_dim :]
key_rot = self.apply_rotary_emb.forward_native(
key_rot,
cos,
sin,
)
key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
return query, key
def forward_cuda(
self,
positions: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor | None = None,
offsets: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor | None]:
"""PyTorch-native implementation equivalent to forward().
Args:
positions:
[4, num_tokens] (P/W/H/T positions with multimodal inputs)
query: [num_tokens, num_heads * head_size]
key: [num_tokens, num_kv_heads * head_size]
"""
assert positions.ndim == 2
assert key is not None
num_tokens = positions.shape[-1]
cos_sin = self.cos_sin_cache[positions]
cos, sin = cos_sin.chunk(2, dim=-1)
cos = torch.cat(
[m[i] for i, m in enumerate(cos.split(self.xdrope_section, dim=-1))], dim=-1
)
sin = torch.cat(
[m[i] for i, m in enumerate(sin.split(self.xdrope_section, dim=-1))], dim=-1
)
query_shape = query.shape
query = query.view(num_tokens, -1, self.head_size)
query_rot = query[..., : self.rotary_dim]
query_pass = query[..., self.rotary_dim :]
query_rot = self.apply_rotary_emb(
query_rot,
cos,
sin,
)
query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
key_shape = key.shape
key = key.view(num_tokens, -1, self.head_size)
key_rot = key[..., : self.rotary_dim]
key_pass = key[..., self.rotary_dim :]
key_rot = self.apply_rotary_emb(
key_rot,
cos,
sin,
)
key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
return query, key
@staticmethod
def get_next_input_positions(
context_len: int,
seq_len: int,
xd_sections: int = 4,
) -> list[list[int]]:
return [list(range(context_len, seq_len)) for _ in range(xd_sections)]
@staticmethod
def get_next_input_positions_tensor(
out: np.ndarray,
out_offset: int,
context_len: int,
num_new_tokens: int,
):
values = np.arange(
context_len,
context_len + num_new_tokens,
dtype=out.dtype,
)
out[:, out_offset : out_offset + num_new_tokens] = values
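# Illustrative call (a sketch; the lengths are made-up values):
#   XDRotaryEmbedding.get_next_input_positions(context_len=10, seq_len=12)
#   -> [[10, 11], [10, 11], [10, 11], [10, 11]]   # same range for all four P/W/H/T sections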

View File

@@ -0,0 +1,84 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from .base import RotaryEmbedding
from .common import yarn_find_correction_range, yarn_get_mscale, yarn_linear_ramp_mask
class YaRNScalingRotaryEmbedding(RotaryEmbedding):
"""RotaryEmbedding extended with YaRN method.
Credits to Peng et al. github.com/jquesnelle/yarn
"""
def __init__(
self,
head_size: int,
rotary_dim: int,
max_position_embeddings: int,
base: float,
is_neox_style: bool,
scaling_factor: float,
dtype: torch.dtype,
*,
extrapolation_factor: float = 1,
attn_factor: float = 1,
beta_fast: int = 32,
beta_slow: int = 1,
apply_yarn_scaling: bool = True,
truncate: bool = True,
) -> None:
self.scaling_factor = scaling_factor
self.extrapolation_factor = extrapolation_factor
self.attn_factor = attn_factor
self.beta_fast = beta_fast
self.beta_slow = beta_slow
self.truncate = truncate
# Get n-d magnitude scaling corrected for interpolation
self.mscale = (
float(yarn_get_mscale(self.scaling_factor) * attn_factor)
if apply_yarn_scaling
else float(attn_factor)
)
super().__init__(
head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
)
def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor:
pos_freqs = self.base ** (
torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim
)
inv_freq_extrapolation = 1.0 / pos_freqs
inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs)
low, high = yarn_find_correction_range(
self.beta_fast,
self.beta_slow,
self.rotary_dim,
self.base,
self.max_position_embeddings,
self.truncate,
)
# Get n-d rotational scaling corrected for extrapolation
inv_freq_mask = (
1
- yarn_linear_ramp_mask(low, high, self.rotary_dim // 2, dtype=torch.float)
) * self.extrapolation_factor
inv_freq = (
inv_freq_interpolation * (1 - inv_freq_mask)
+ inv_freq_extrapolation * inv_freq_mask
)
return inv_freq
def _compute_cos_sin_cache(self) -> torch.Tensor:
inv_freq = self._compute_inv_freq(self.scaling_factor)
t = torch.arange(
self.max_position_embeddings * self.scaling_factor, dtype=torch.float32
)
freqs = torch.einsum("i,j -> ij", t, inv_freq)
cos = freqs.cos() * self.mscale
sin = freqs.sin() * self.mscale
cache = torch.cat((cos, sin), dim=-1)
return cache
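# Blending note (a sketch of the formula above, assuming yarn_linear_ramp_mask
# ramps from 0 at `low` to 1 at `high`): with extrapolation_factor = 1,
# dimensions below `low` keep the original frequencies (pure extrapolation),
# dimensions above `high` use the frequencies divided by scaling_factor (pure
# interpolation), and dimensions in between take a linear mix of the two; the
# cos/sin cache is then scaled by mscale = yarn_get_mscale(scaling_factor) * attn_factor.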