This commit is contained in: v1.0
model_executor/layers/rotary_embedding/__init__.py (new file, 278 lines)
@@ -0,0 +1,278 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Rotary Positional Embeddings."""

from typing import Any

import torch

from .base import RotaryEmbedding
from .deepseek_scaling_rope import DeepseekScalingRotaryEmbedding
from .dual_chunk_rope import DualChunkRotaryEmbedding
from .dynamic_ntk_alpha_rope import DynamicNTKAlphaRotaryEmbedding
from .dynamic_ntk_scaling_rope import DynamicNTKScalingRotaryEmbedding
from .linear_scaling_rope import LinearScalingRotaryEmbedding
from .llama3_rope import Llama3RotaryEmbedding
from .llama4_vision_rope import Llama4VisionRotaryEmbedding
from .mrope import MRotaryEmbedding
from .ntk_scaling_rope import NTKScalingRotaryEmbedding
from .phi3_long_rope_scaled_rope import Phi3LongRoPEScaledRotaryEmbedding
from .yarn_scaling_rope import YaRNScalingRotaryEmbedding

_ROPE_DICT: dict[tuple, RotaryEmbedding] = {}


def get_rope(
    head_size: int,
    rotary_dim: int,
    max_position: int,
    base: float,
    is_neox_style: bool = True,
    rope_scaling: dict[str, Any] | None = None,
    dtype: torch.dtype | None = None,
    partial_rotary_factor: float = 1.0,
    dual_chunk_attention_config: dict[str, Any] | None = None,
) -> RotaryEmbedding:
    if dtype is None:
        dtype = torch.get_default_dtype()
    if rope_scaling is not None:
        # Transforms every value that is a list into a tuple for caching calls
        rope_scaling_tuple = {
            k: tuple(v) if isinstance(v, list) else v for k, v in rope_scaling.items()
        }
        rope_scaling_args = tuple(rope_scaling_tuple.items())
    else:
        rope_scaling_args = None

    if dual_chunk_attention_config is not None:
        dual_chunk_attention_tuple = {
            k: tuple(v) if isinstance(v, list) else v
            for k, v in dual_chunk_attention_config.items()
            if k != "sparse_attention_config"
        }
        dual_chunk_attention_args = tuple(dual_chunk_attention_tuple.items())
    else:
        dual_chunk_attention_args = None

    if partial_rotary_factor < 1.0:
        rotary_dim = int(rotary_dim * partial_rotary_factor)
    key = (
        head_size,
        rotary_dim,
        max_position,
        base,
        is_neox_style,
        rope_scaling_args,
        dual_chunk_attention_args,
        dtype,
    )
    if key in _ROPE_DICT:
        return _ROPE_DICT[key]

    if dual_chunk_attention_config is not None:
        extra_kwargs = {
            k: v
            for k, v in dual_chunk_attention_config.items()
            if k in ("chunk_size", "local_size")
        }
        rotary_emb = DualChunkRotaryEmbedding(
            head_size,
            rotary_dim,
            max_position,
            base,
            is_neox_style,
            dtype,
            **extra_kwargs,
        )
    elif not rope_scaling:
        rotary_emb = RotaryEmbedding(
            head_size, rotary_dim, max_position, base, is_neox_style, dtype
        )
    else:
        scaling_type = rope_scaling["rope_type"]

        if scaling_type == "llama3":
            scaling_factor = rope_scaling["factor"]
            low_freq_factor = rope_scaling["low_freq_factor"]
            high_freq_factor = rope_scaling["high_freq_factor"]
            original_max_position = rope_scaling["original_max_position_embeddings"]
            rotary_emb = Llama3RotaryEmbedding(
                head_size,
                rotary_dim,
                max_position,
                base,
                is_neox_style,
                dtype,
                scaling_factor,
                low_freq_factor,
                high_freq_factor,
                original_max_position,
            )
        elif scaling_type == "mllama4":
            rotary_emb = Llama4VisionRotaryEmbedding(
                head_size, rotary_dim, max_position, base, is_neox_style, dtype
            )
        elif scaling_type == "default":
            if "mrope_section" in rope_scaling:
                rotary_emb = MRotaryEmbedding(
                    head_size,
                    rotary_dim,
                    max_position,
                    base,
                    is_neox_style,
                    dtype,
                    mrope_section=rope_scaling["mrope_section"],
                    mrope_interleaved=rope_scaling.get("mrope_interleaved", False),
                )
            else:
                rotary_emb = RotaryEmbedding(
                    head_size,
                    rotary_dim,
                    max_position,
                    base,
                    is_neox_style,
                    dtype,
                )
        elif scaling_type == "linear":
            scaling_factor = rope_scaling["factor"]
            rotary_emb = LinearScalingRotaryEmbedding(
                head_size,
                rotary_dim,
                max_position,
                base,
                is_neox_style,
                scaling_factor,
                dtype,
            )
        elif scaling_type == "ntk":
            scaling_factor = rope_scaling["factor"]
            mixed_b = rope_scaling.get("mixed_b", None)
            rotary_emb = NTKScalingRotaryEmbedding(
                head_size,
                rotary_dim,
                max_position,
                base,
                is_neox_style,
                scaling_factor,
                dtype,
                mixed_b,
            )
        elif scaling_type == "dynamic":
            if "alpha" in rope_scaling:
                scaling_alpha = rope_scaling["alpha"]
                rotary_emb = DynamicNTKAlphaRotaryEmbedding(
                    head_size,
                    rotary_dim,
                    max_position,
                    base,
                    is_neox_style,
                    scaling_alpha,
                    dtype,
                )
            elif "factor" in rope_scaling:
                scaling_factor = rope_scaling["factor"]
                rotary_emb = DynamicNTKScalingRotaryEmbedding(
                    head_size,
                    rotary_dim,
                    max_position,
                    base,
                    is_neox_style,
                    scaling_factor,
                    dtype,
                )
            else:
                raise ValueError(
                    "Dynamic rope scaling must contain either 'alpha' or 'factor' field"
                )
        elif scaling_type == "yarn":
            scaling_factor = rope_scaling["factor"]
            original_max_position = rope_scaling["original_max_position_embeddings"]
            extra_kwargs = {
                k: v
                for k, v in rope_scaling.items()
                if k
                in (
                    "extrapolation_factor",
                    "attn_factor",
                    "beta_fast",
                    "beta_slow",
                    "apply_yarn_scaling",
                )
            }
            if "mrope_section" in rope_scaling:
                extra_kwargs.pop("apply_yarn_scaling", None)
                rotary_emb = MRotaryEmbedding(
                    head_size,
                    rotary_dim,
                    original_max_position,
                    base,
                    is_neox_style,
                    dtype,
                    mrope_section=rope_scaling["mrope_section"],
                    mrope_interleaved=rope_scaling.get("mrope_interleaved", False),
                    scaling_factor=scaling_factor,
                    **extra_kwargs,
                )
            else:
                rotary_emb = YaRNScalingRotaryEmbedding(
                    head_size,
                    rotary_dim,
                    original_max_position,
                    base,
                    is_neox_style,
                    scaling_factor,
                    dtype,
                    **extra_kwargs,
                )
        elif scaling_type == "deepseek_yarn":
            scaling_factor = rope_scaling["factor"]
            original_max_position = rope_scaling["original_max_position_embeddings"]
            # assert max_position == original_max_position * scaling_factor
            extra_kwargs = {
                k: v
                for k, v in rope_scaling.items()
                if k
                in (
                    "extrapolation_factor",
                    "attn_factor",
                    "beta_fast",
                    "beta_slow",
                    "mscale",
                    "mscale_all_dim",
                )
            }
            rotary_emb = DeepseekScalingRotaryEmbedding(
                head_size,
                rotary_dim,
                original_max_position,
                base,
                is_neox_style,
                scaling_factor,
                dtype,
                **extra_kwargs,
            )
        elif scaling_type == "longrope":
            short_factor = rope_scaling["short_factor"]
            long_factor = rope_scaling["long_factor"]
            original_max_position = rope_scaling["original_max_position_embeddings"]
            extra_kwargs = {
                k: v
                for k, v in rope_scaling.items()
                if k in ("short_mscale", "long_mscale")
            }
            rotary_emb = Phi3LongRoPEScaledRotaryEmbedding(
                head_size,
                rotary_dim,
                max_position,
                original_max_position,
                base,
                is_neox_style,
                dtype,
                short_factor,
                long_factor,
                **extra_kwargs,
            )
        else:
            raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
    _ROPE_DICT[key] = rotary_emb
    return rotary_emb
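For orientation, here is a minimal usage sketch of `get_rope` (parameter values are illustrative, not taken from this commit); repeated calls with identical arguments return the cached module from `_ROPE_DICT`.

import torch

# Plain RoPE: no rope_scaling, falls through to RotaryEmbedding.
plain = get_rope(head_size=128, rotary_dim=128, max_position=4096, base=10000.0)

# YaRN-scaled RoPE: the "yarn" branch builds a YaRNScalingRotaryEmbedding
# from "factor" and "original_max_position_embeddings".
yarn = get_rope(
    head_size=128,
    rotary_dim=128,
    max_position=16384,
    base=10000.0,
    rope_scaling={
        "rope_type": "yarn",
        "factor": 4.0,
        "original_max_position_embeddings": 4096,
    },
    dtype=torch.bfloat16,
)

# Identical arguments hit the _ROPE_DICT cache and return the same object.
assert get_rope(128, 128, 4096, 10000.0) is plain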
model_executor/layers/rotary_embedding/base.py (new file, 235 lines)
@@ -0,0 +1,235 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Rotary Positional Embeddings Base Class."""

import torch

from vllm._aiter_ops import rocm_aiter_ops
from vllm.model_executor.custom_op import CustomOp

from .common import apply_rotary_emb_torch


@CustomOp.register("rotary_embedding")
class RotaryEmbeddingBase(CustomOp):
    """Original rotary positional embedding."""

    def __init__(
        self,
        head_size: int,
        rotary_dim: int,
        max_position_embeddings: int,
        base: float,
        is_neox_style: bool,
        dtype: torch.dtype,
    ) -> None:
        super().__init__()
        self.head_size = head_size
        self.rotary_dim = rotary_dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        self.is_neox_style = is_neox_style
        self.dtype = dtype
        # TODO(mgoin): disabled for now due to failures
        # Flashinfer only supports head_size=64, 128, 256, 512.
        # https://github.com/flashinfer-ai/flashinfer/blob/ebfd655efe830048dba5d582aaa61d61d1cf9a87/include/flashinfer/utils.cuh#L174-L202
        # self.use_flashinfer = (self.enabled()
        #                        and dtype in (torch.float16, torch.bfloat16)
        #                        and current_platform.is_cuda()
        #                        and has_flashinfer()
        #                        and self.head_size in [64, 128, 256, 512])
        self.use_flashinfer = False

        cache = self._compute_cos_sin_cache()
        if not self.use_flashinfer:
            cache = cache.to(dtype)
        self.cos_sin_cache: torch.Tensor
        self.register_buffer("cos_sin_cache", cache, persistent=False)
        self.is_rocm_triton_rotary_embed_enabled = (
            rocm_aiter_ops.is_triton_rotary_embed_enabled()
        )

    def _compute_inv_freq(self, base: float) -> torch.Tensor:
        """Compute the inverse frequency."""
        # NOTE(woosuk): To exactly match the HF implementation, we need to
        # use CPU to compute the cache and then move it to GPU. However, we
        # create the cache on GPU for faster initialization. This may cause
        # a slight numerical difference between the HF implementation and ours.
        inv_freq = 1.0 / (
            base
            ** (
                torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim
            )
        )
        return inv_freq

    def _compute_cos_sin_cache(self) -> torch.Tensor:
        """Compute the cos and sin cache."""
        inv_freq = self._compute_inv_freq(self.base)
        t = torch.arange(self.max_position_embeddings, dtype=torch.float)

        freqs = torch.einsum("i,j -> ij", t, inv_freq)
        cos = freqs.cos()
        sin = freqs.sin()
        cache = torch.cat((cos, sin), dim=-1)
        return cache

    def _match_cos_sin_cache_dtype(self, query: torch.Tensor) -> None:
        # __setattr__ in nn.Module (called by `self.cos_sin_cache = ...`)
        # is expensive, so avoid calling it if possible
        if (
            self.cos_sin_cache.device != query.device
            or self.cos_sin_cache.dtype != query.dtype
        ):
            self.cos_sin_cache = self.cos_sin_cache.to(query.device, dtype=query.dtype)


class RotaryEmbedding(RotaryEmbeddingBase):
    def __init__(
        self,
        head_size: int,
        rotary_dim: int,
        max_position_embeddings: int,
        base: float,
        is_neox_style: bool,
        dtype: torch.dtype,
    ) -> None:
        super().__init__(
            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
        )

    @staticmethod
    def forward_static(
        positions: torch.Tensor,
        query: torch.Tensor,
        key: torch.Tensor | None,
        head_size: int,
        rotary_dim: int,
        cos_sin_cache: torch.Tensor,
        is_neox_style: bool,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """A PyTorch-native implementation of forward()."""
        positions = positions.flatten()
        num_tokens = positions.shape[0]
        cos_sin = cos_sin_cache.index_select(0, positions)
        cos, sin = cos_sin.chunk(2, dim=-1)

        query_shape = query.shape
        query = query.view(num_tokens, -1, head_size)
        query_rot = query[..., :rotary_dim]
        query_pass = query[..., rotary_dim:]
        query_rot = apply_rotary_emb_torch(query_rot, cos, sin, is_neox_style)
        query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)

        # key may be None in some cases, e.g. cross-layer KV sharing
        if key is not None:
            key_shape = key.shape
            key = key.view(num_tokens, -1, head_size)
            key_rot = key[..., :rotary_dim]
            key_pass = key[..., rotary_dim:]
            key_rot = apply_rotary_emb_torch(key_rot, cos, sin, is_neox_style)
            key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
        return query, key

    def forward_native(
        self,
        positions: torch.Tensor,
        query: torch.Tensor,
        key: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """A PyTorch-native implementation of forward()."""
        return self.forward_static(
            positions,
            query,
            key,
            self.head_size,
            self.rotary_dim,
            self.cos_sin_cache,
            self.is_neox_style,
        )

    def forward_cuda(
        self,
        positions: torch.Tensor,
        query: torch.Tensor,
        key: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        if self.use_flashinfer:
            torch.ops.vllm.flashinfer_rotary_embedding(
                positions,
                query,
                key,
                self.head_size,
                self.cos_sin_cache,
                self.is_neox_style,
            )
            return query, key

        from vllm import _custom_ops as ops

        self._match_cos_sin_cache_dtype(query)

        # ops.rotary_embedding() is an in-place operation
        # that updates the query and key tensors.
        ops.rotary_embedding(
            positions,
            query,
            key,
            self.head_size,
            self.cos_sin_cache,
            self.is_neox_style,
        )
        return query, key

    def forward_hip(
        self,
        positions: torch.Tensor,
        query: torch.Tensor,
        key: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        if self.is_rocm_triton_rotary_embed_enabled:
            self._match_cos_sin_cache_dtype(query)
            rocm_aiter_ops.triton_rotary_embed(
                positions,
                query,
                key,
                self.cos_sin_cache,
                self.head_size,
                self.rotary_dim,
                self.is_neox_style,
            )
            return query, key
        return self.forward_cuda(positions, query, key)

    def forward_xpu(
        self,
        positions: torch.Tensor,
        query: torch.Tensor,
        key: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        from vllm._ipex_ops import ipex_ops as ops

        self._match_cos_sin_cache_dtype(query)
        # ops.rotary_embedding() is an in-place operation
        # that updates the query and key tensors.
        if key is None:
            # XPU kernel doesn't support key=None so fall back to native impl
            # TODO(sarckk): add support for optional key in
            # ipex.llm.functional.rotary_embedding_batched
            return self.forward_native(positions, query, key)
        else:
            ops.rotary_embedding(
                positions,
                query,
                key,
                self.head_size,
                self.cos_sin_cache,
                self.is_neox_style,
            )
        return query, key

    def extra_repr(self) -> str:
        s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}"
        s += f", max_position_embeddings={self.max_position_embeddings}"
        s += f", base={self.base}, is_neox_style={self.is_neox_style}"
        return s
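A rough sketch of the native path above (shapes arbitrary; assumes the module can be constructed outside a full vLLM engine context):

import torch

rope = RotaryEmbedding(
    head_size=64,
    rotary_dim=64,
    max_position_embeddings=2048,
    base=10000.0,
    is_neox_style=True,
    dtype=torch.float32,
)
# cos and sin halves are concatenated along the last dim: [2048, 64].
assert rope.cos_sin_cache.shape == (2048, 64)

positions = torch.arange(8)       # 8 token positions
query = torch.randn(8, 4 * 64)    # 4 query heads, flattened per token
key = torch.randn(8, 2 * 64)      # 2 KV heads, flattened per token
q_out, k_out = rope.forward_native(positions, query, key)
assert q_out.shape == query.shape and k_out.shape == key.shape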
model_executor/layers/rotary_embedding/common.py (new file, 188 lines)
@@ -0,0 +1,188 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import math
from collections.abc import Callable
from functools import cache
from importlib.util import find_spec

import torch

from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils.torch_utils import direct_register_custom_op

# if current_platform.is_cuda():
#     from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb

logger = init_logger(__name__)


# common functions
def rotate_neox(x: torch.Tensor) -> torch.Tensor:
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def rotate_gptj(x: torch.Tensor) -> torch.Tensor:
    x1 = x[..., ::2]
    x2 = x[..., 1::2]
    x = torch.stack((-x2, x1), dim=-1)
    return x.flatten(-2)


def apply_rotary_emb_torch(
    x: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    is_neox_style: bool,
) -> torch.Tensor:
    cos = cos.unsqueeze(-2).to(x.dtype)
    sin = sin.unsqueeze(-2).to(x.dtype)
    if is_neox_style:
        x1, x2 = torch.chunk(x, 2, dim=-1)
    else:
        x1 = x[..., ::2]
        x2 = x[..., 1::2]
    o1 = x1 * cos - x2 * sin
    o2 = x2 * cos + x1 * sin
    if is_neox_style:
        return torch.cat((o1, o2), dim=-1)
    else:
        return torch.stack((o1, o2), dim=-1).flatten(-2)


def apply_rotary_emb_dispatch(
    x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, is_neox_style: bool
) -> torch.Tensor:
    """
    Args:
        x: [num_tokens, num_heads, head_size]
        cos: [num_tokens, head_size // 2]
        sin: [num_tokens, head_size // 2]
        is_neox_style: Whether to use the Neox-style or GPT-J-style rotary
            positional embeddings.
    """
    # if current_platform.is_cuda():
    #     return apply_rotary_emb(x.unsqueeze(0), cos, sin, not is_neox_style).squeeze(0)
    # else:
    return apply_rotary_emb_torch(x, cos, sin, is_neox_style)


@cache
def dispatch_rotary_emb_function(
    default: Callable[..., torch.Tensor] | None = None,
) -> Callable[..., torch.Tensor]:
    # if current_platform.is_cuda():
    #     return apply_rotary_emb

    # # if torch compile is not enabled
    # # use rotary embedding function from flash_attn package
    # # otherwise use the naive pytorch embedding implementation
    # # is faster when torch compile is enabled.
    # if current_platform.is_rocm() and not torch.compiler.is_compiling():
    #     if find_spec("flash_attn") is not None:
    #         from flash_attn.ops.triton.rotary import apply_rotary

    #         return apply_rotary
    #     else:
    #         logger.warning(
    #             "flash_attn is not installed. Falling back to PyTorch "
    #             "implementation for rotary embeddings."
    #         )
    if default is not None:
        return default

    return apply_rotary_emb_torch


# yarn functions
# Inverse dim formula to find dim based on number of rotations
def yarn_find_correction_dim(
    num_rotations: int,
    dim: int,
    base: float = 10000,
    max_position_embeddings: int = 2048,
) -> float:
    return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (
        2 * math.log(base)
    )


# Find dim range bounds based on rotations
def yarn_find_correction_range(
    low_rot: int,
    high_rot: int,
    dim: int,
    base: float = 10000,
    max_position_embeddings: int = 2048,
) -> tuple[int, int]:
    low = math.floor(
        yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)
    )
    high = math.ceil(
        yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings)
    )
    return max(low, 0), min(high, dim - 1)  # Clamp values just in case


def yarn_linear_ramp_mask(
    low: float, high: float, dim: int, dtype: torch.dtype
) -> torch.Tensor:
    if low == high:
        high += 0.001  # Prevent singularity

    linear_func = (torch.arange(dim, dtype=dtype) - low) / (high - low)
    ramp_func = torch.clamp(linear_func, 0, 1)
    return ramp_func


def yarn_get_mscale(scale: float = 1) -> float:
    if scale <= 1:
        return 1.0
    return 0.1 * math.log(scale) + 1.0


def _flashinfer_rotary_embedding(
    positions: torch.Tensor,
    query: torch.Tensor,
    key: torch.Tensor,
    head_size: int,
    cos_sin_cache: torch.Tensor,
    is_neox: bool,
) -> None:
    """Custom op wrapper for flashinfer's rotary embedding.

    This is an in-place operation that modifies query and key tensors directly.
    """
    from flashinfer.rope import apply_rope_with_cos_sin_cache_inplace

    apply_rope_with_cos_sin_cache_inplace(
        positions=positions,
        query=query,
        key=key,
        head_size=head_size,
        cos_sin_cache=cos_sin_cache,
        is_neox=is_neox,
    )


def _flashinfer_rotary_embedding_fake(
    positions: torch.Tensor,
    query: torch.Tensor,
    key: torch.Tensor,
    head_size: int,
    cos_sin_cache: torch.Tensor,
    is_neox: bool,
) -> None:
    return


# Register flashinfer rotary embedding custom op
direct_register_custom_op(
    op_name="flashinfer_rotary_embedding",
    op_func=_flashinfer_rotary_embedding,
    mutates_args=["query", "key"],  # These tensors are modified in-place
    fake_impl=_flashinfer_rotary_embedding_fake,
)
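A small worked example of the YaRN helpers defined above (numbers are illustrative):

import torch

# For a 128-dim rotary embedding with the default base and a 4096-token
# original window, find the dimensions where YaRN blends interpolation and
# extrapolation, then build the corresponding ramp mask over head_dim // 2.
low, high = yarn_find_correction_range(
    low_rot=32, high_rot=1, dim=128, base=10000, max_position_embeddings=4096
)
ramp = yarn_linear_ramp_mask(low, high, 128 // 2, dtype=torch.float32)

# Attention magnitude correction for a 4x context extension:
# 0.1 * ln(4) + 1 ≈ 1.139
print(low, high, yarn_get_mscale(4.0))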
model_executor/layers/rotary_embedding/deepseek_scaling_rope.py (new file, 106 lines)
@@ -0,0 +1,106 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import math

import torch

from vllm.platforms import current_platform

from .base import RotaryEmbedding
from .common import (
    rotate_gptj,
    rotate_neox,
    yarn_find_correction_range,
    yarn_linear_ramp_mask,
)


def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
    if scale <= 1:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0


class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
    """RotaryEmbedding extended with YaRN method.

    Credits to Peng et al. github.com/jquesnelle/yarn
    """

    def __init__(
        self,
        head_size: int,
        rotary_dim: int,
        max_position_embeddings: int,
        base: float,
        is_neox_style: bool,
        scaling_factor: float,
        dtype: torch.dtype,
        *,
        extrapolation_factor: float = 1,
        attn_factor: float = 1,
        beta_fast: int = 32,
        beta_slow: int = 1,
        mscale: float = 1,
        mscale_all_dim: float = 0,
    ) -> None:
        self.scaling_factor = scaling_factor
        self.extrapolation_factor = extrapolation_factor
        self.attn_factor = attn_factor
        self.beta_fast = beta_fast
        self.beta_slow = beta_slow
        # Get n-d magnitude scaling corrected for interpolation.
        self.mscale = float(
            yarn_get_mscale(self.scaling_factor, float(mscale))
            / yarn_get_mscale(self.scaling_factor, float(mscale_all_dim))
            * attn_factor
        )
        super().__init__(
            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
        )

    def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor:
        pos_freqs = self.base ** (
            torch.arange(
                0,
                self.rotary_dim,
                2,
                dtype=torch.float,
                device=current_platform.device_type,
            )
            / self.rotary_dim
        )
        inv_freq_extrapolation = 1.0 / pos_freqs
        inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs)

        low, high = yarn_find_correction_range(
            self.beta_fast,
            self.beta_slow,
            self.rotary_dim,
            self.base,
            self.max_position_embeddings,
        )
        # Get n-d rotational scaling corrected for extrapolation
        inv_freq_mask = (
            1
            - yarn_linear_ramp_mask(low, high, self.rotary_dim // 2, dtype=torch.float)
        ) * self.extrapolation_factor
        inv_freq = (
            inv_freq_interpolation * (1 - inv_freq_mask)
            + inv_freq_extrapolation * inv_freq_mask
        )
        return inv_freq

    def _compute_cos_sin_cache(self) -> torch.Tensor:
        inv_freq = self._compute_inv_freq(self.scaling_factor)
        t = torch.arange(
            self.max_position_embeddings * self.scaling_factor,
            device=current_platform.device_type,
            dtype=torch.float32,
        )
        freqs = torch.einsum("i,j -> ij", t, inv_freq)
        cos = freqs.cos() * self.mscale
        sin = freqs.sin() * self.mscale
        cache = torch.cat((cos, sin), dim=-1)
        return cache
model_executor/layers/rotary_embedding/dual_chunk_rope.py (new file, 215 lines)
@@ -0,0 +1,215 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project


import torch

from vllm.model_executor.custom_op import CustomOp

from .common import rotate_gptj, rotate_neox


@CustomOp.register("dual_chunk_rotary_embedding")
class DualChunkRotaryEmbedding(CustomOp):
    """Rotary positional embedding for Dual Chunk Attention."""

    def __init__(
        self,
        head_size: int,
        rotary_dim: int,
        max_position_embeddings: int,
        base: float,
        is_neox_style: bool,
        dtype: torch.dtype,
        chunk_size: int,
        local_size: int,
    ) -> None:
        super().__init__()
        self.head_size = head_size
        self.rotary_dim = rotary_dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        self.is_neox_style = is_neox_style
        self.chunk_size = chunk_size
        self.local_size = local_size
        self.dtype = dtype
        self.device = torch.device(f"cuda:{torch.cuda.current_device()}")
        (q_cache, qc_cache, k_cache, qc_no_clamp_cache, q_inter_cache) = (
            self._compute_cos_sin_cache()
        )

        self.register_buffer("cos_sin_q_cache", q_cache, persistent=False)
        self.register_buffer("cos_sin_qc_cache", qc_cache, persistent=False)
        self.register_buffer("cos_sin_k_cache", k_cache, persistent=False)
        self.register_buffer(
            "cos_sin_qc_no_clamp_cache", qc_no_clamp_cache, persistent=False
        )
        self.register_buffer("cos_sin_q_inter_cache", q_inter_cache, persistent=False)

    def _compute_inv_freq(self, base: float) -> torch.Tensor:
        """Compute the inverse frequency."""
        # NOTE(woosuk): The HF implementation uses `torch.arange(...).float()`.
        # However, we use `torch.arange(..., dtype=torch.float)` instead to
        # avoid numerical issues with large base values (e.g., 10000000).
        # This may cause a slight numerical difference between the HF
        # implementation and ours.
        # NOTE(woosuk): To exactly match the HF implementation, we need to
        # use CPU to compute the cache and then move it to GPU. However, we
        # create the cache on GPU for faster initialization. This may cause
        # a slight numerical difference between the HF implementation and ours.
        inv_freq = 1.0 / (
            base
            ** (
                torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim
            )
        )
        return inv_freq

    def _compute_cos_sin_cache(self) -> torch.Tensor:
        """Compute the cos and sin cache."""
        inv_freq = self._compute_inv_freq(self.base)
        chunk_len = self.chunk_size - self.local_size
        q_t = torch.arange(chunk_len, dtype=torch.float)
        qc_t = (torch.arange(chunk_len, dtype=torch.float) + chunk_len).clamp(
            max=self.chunk_size
        )
        k_t = torch.arange(self.max_position_embeddings, dtype=torch.float) % chunk_len

        # count from chunk_len, no clamp(self.chunk_size) restriction
        qc_no_clamp_t = torch.arange(chunk_len, dtype=torch.float) + chunk_len
        # count from self.chunk_size for q_inter's rope
        q_inter_t = torch.arange(chunk_len, dtype=torch.float) + self.chunk_size

        q_freqs = torch.outer(q_t, inv_freq)
        qc_freqs = torch.outer(qc_t, inv_freq)
        k_freqs = torch.outer(k_t, inv_freq)
        qc_no_clamp_freqs = torch.outer(qc_no_clamp_t, inv_freq)
        q_inter_freqs = torch.outer(q_inter_t, inv_freq)

        q_cos = q_freqs.cos()
        q_sin = q_freqs.sin()
        qc_cos = qc_freqs.cos()
        qc_sin = qc_freqs.sin()
        k_cos = k_freqs.cos()
        k_sin = k_freqs.sin()

        qc_no_clamp_cos = qc_no_clamp_freqs.cos()
        qc_no_clamp_sin = qc_no_clamp_freqs.sin()
        q_inter_cos = q_inter_freqs.cos()
        q_inter_sin = q_inter_freqs.sin()

        q_cache = torch.cat((q_cos, q_sin), dim=-1).to(
            dtype=self.dtype, device=self.device
        )
        qc_cache = torch.cat((qc_cos, qc_sin), dim=-1).to(
            dtype=self.dtype, device=self.device
        )
        k_cache = torch.cat((k_cos, k_sin), dim=-1).to(
            dtype=self.dtype, device=self.device
        )
        qc_no_clamp_cache = torch.cat((qc_no_clamp_cos, qc_no_clamp_sin), dim=-1).to(
            dtype=self.dtype, device=self.device
        )
        q_inter_cache = torch.cat((q_inter_cos, q_inter_sin), dim=-1).to(
            dtype=self.dtype, device=self.device
        )
        return q_cache, qc_cache, k_cache, qc_no_clamp_cache, q_inter_cache

    def forward_native(
        self,
        positions: torch.Tensor,
        query: torch.Tensor,
        key: torch.Tensor,
        offsets: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        query = query.view(*query.shape[:-1], -1, self.head_size)
        key = key.view(*key.shape[:-1], -1, self.head_size)
        query_rot = query[..., : self.rotary_dim]
        key_rot = key[..., : self.rotary_dim]
        if self.rotary_dim < self.head_size:
            query_pass = query[..., self.rotary_dim :]
            key_pass = key[..., self.rotary_dim :]
        else:
            query_pass = None
            key_pass = None

        positions_with_offsets = (
            torch.add(positions, offsets) if offsets is not None else positions
        )
        key = self._apply_rotary_embedding(
            self.cos_sin_k_cache[positions_with_offsets], key_rot, key_pass
        )
        chunk_len = self.chunk_size - self.local_size
        query = self._apply_rotary_embedding(
            self.cos_sin_q_cache[positions_with_offsets % chunk_len],
            query_rot,
            query_pass,
        )
        query_succ = self._apply_rotary_embedding(
            self.cos_sin_qc_cache[positions_with_offsets % chunk_len],
            query_rot,
            query_pass,
        )
        query_inter = self._apply_rotary_embedding(
            self.cos_sin_qc_cache[chunk_len - 1].repeat(positions.shape[0], 1),
            query_rot,
            query_pass,
        )
        query_succ_critical = self._apply_rotary_embedding(
            self.cos_sin_qc_no_clamp_cache[positions_with_offsets % chunk_len],
            query_rot,
            query_pass,
        )
        query_inter_critical = self._apply_rotary_embedding(
            self.cos_sin_q_inter_cache[positions_with_offsets % chunk_len],
            query_rot,
            query_pass,
        )

        # merge query into one tensor to simplify the interfaces
        query = torch.cat(
            (
                query,
                query_succ,
                query_inter,
                query_succ_critical,
                query_inter_critical,
            ),
            dim=-1,
        )
        return query, key

    def forward_cuda(
        self,
        positions: torch.Tensor,
        query: torch.Tensor,
        key: torch.Tensor,
        offsets: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        return self.forward_native(positions, query, key, offsets)

    def _apply_rotary_embedding(self, cos_sin, hidden_rot, hidden_pass):
        cos, sin = cos_sin.chunk(2, dim=-1)
        if self.is_neox_style:
            # NOTE(woosuk): Here we assume that the positions tensor has the
            # shape [batch_size, seq_len].
            cos = cos.repeat(1, 1, 2).unsqueeze(-2)
            sin = sin.repeat(1, 1, 2).unsqueeze(-2)
        else:
            cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
            sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)
        rotate_fn = rotate_neox if self.is_neox_style else rotate_gptj
        hidden_rot = hidden_rot * cos + rotate_fn(hidden_rot) * sin

        if self.rotary_dim < self.head_size:
            hidden = torch.cat((hidden_rot, hidden_pass), dim=-1)
        else:
            hidden = hidden_rot
        return hidden.flatten(-2).squeeze(0)

    def extra_repr(self) -> str:
        s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}"
        s += f", max_position_embeddings={self.max_position_embeddings}"
        s += f", base={self.base}, is_neox_style={self.is_neox_style}"
        s += f", chunk_size={self.chunk_size}, local_size={self.local_size}"
        return s
@@ -0,0 +1,43 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import torch

from .base import RotaryEmbedding


class DynamicNTKAlphaRotaryEmbedding(RotaryEmbedding):
    """RotaryEmbedding extended with Dynamic NTK alpha.

    Based on the original RotaryEmbedding implementation.
    """

    def __init__(
        self,
        head_size: int,
        rotary_dim: int,
        max_position_embeddings: int,
        base: float,
        is_neox_style: bool,
        scaling_alpha: float,
        dtype: torch.dtype,
    ) -> None:
        self.scaling_alpha = scaling_alpha
        super().__init__(
            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
        )

    def _compute_cos_sin_cache(self) -> torch.Tensor:
        # For Hunyuan DynamicNTKAlphaRotaryEmbedding
        max_len = self.max_position_embeddings
        base = self.base * self.scaling_alpha ** (
            self.rotary_dim / (self.rotary_dim - 2)
        )
        inv_freq = self._compute_inv_freq(base)
        t = torch.arange(max_len, dtype=torch.float)

        freqs = torch.einsum("i,j -> ij", t, inv_freq)
        cos = freqs.cos()
        sin = freqs.sin()
        cache = torch.cat((cos, sin), dim=-1)
        return cache
@@ -0,0 +1,68 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# Adapted from
# https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch

from .base import RotaryEmbedding


class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding):
    """RotaryEmbedding extended with Dynamic NTK scaling.

    Credits to the Reddit users /u/bloc97 and /u/emozilla
    """

    def __init__(
        self,
        head_size: int,
        rotary_dim: int,
        max_position_embeddings: int,
        base: float,
        is_neox_style: bool,
        scaling_factor: float,
        dtype: torch.dtype,
    ) -> None:
        self.scaling_factor = scaling_factor
        super().__init__(
            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
        )

    def _compute_cos_sin_cache(self) -> torch.Tensor:
        # NOTE(woosuk): self.max_position_embeddings is the original
        # maximum length before applying the rope scaling.
        # Thus, the maximum length after applying the rope scaling is
        # self.max_position_embeddings * self.scaling_factor.
        max_len = self.max_position_embeddings * self.scaling_factor
        base = self.base * (
            (self.scaling_factor * max_len / self.max_position_embeddings)
            - (self.scaling_factor - 1)
        ) ** (self.rotary_dim / (self.rotary_dim - 2))
        inv_freq = self._compute_inv_freq(base)
        t = torch.arange(max_len, dtype=torch.float)

        freqs = torch.einsum("i,j -> ij", t, inv_freq)
        cos = freqs.cos()
        sin = freqs.sin()
        cache = torch.cat((cos, sin), dim=-1)
        return cache
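A quick numeric check of the dynamic-NTK base adjustment above (values illustrative):

# For base=10000, rotary_dim=128, scaling_factor=2 and an original window of
# 2048 positions, the cache covers 4096 positions and the effective base grows
# by roughly 3.05x.
base, rotary_dim, factor, max_pos = 10000.0, 128, 2.0, 2048
max_len = max_pos * factor
new_base = base * (
    (factor * max_len / max_pos) - (factor - 1)
) ** (rotary_dim / (rotary_dim - 2))
print(max_len, new_base / base)  # 4096.0, ~3.05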
model_executor/layers/rotary_embedding/ernie45_vl_rope.py (new file, 75 lines)
@@ -0,0 +1,75 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project


import torch

from .common import apply_rotary_emb_dispatch
from .mrope import MRotaryEmbedding


class Ernie4_5_VLRotaryEmbedding(MRotaryEmbedding):
    """3D rotary positional embedding. 3D is t:time h:height w:width"""

    def forward_native(  # type: ignore[override]
        self,
        positions: torch.Tensor,
        query: torch.Tensor,
        key: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        assert positions.ndim == 1 or positions.ndim == 2
        assert key is not None

        num_tokens = positions.shape[-1]
        cos_sin = self.cos_sin_cache[positions]
        cos, sin = cos_sin.chunk(2, dim=-1)
        if positions.ndim == 2:
            assert self.mrope_section

            section_h = self.mrope_section[0]  # 22
            section_w = self.mrope_section[1]  # 22
            section_t = self.mrope_section[2]  # 20
            assert section_h == section_w
            # Split according to [h w h w h w h w... t t t...]
            section_cos_t = cos[..., -section_t:]
            section_cos_h = cos[..., : section_h + section_w : 2]
            section_cos_w = cos[..., 1 : section_h + section_w : 2]

            cos_t, cos_h, cos_w = section_cos_t[0], section_cos_h[1], section_cos_w[2]
            cos_hw = torch.stack([cos_h, cos_w], dim=-1).reshape(
                cos_h.shape[:-1] + (cos_h.shape[-1] * 2,)
            )
            cos = torch.cat([cos_hw, cos_t], dim=-1)

            section_sin_t = sin[..., -section_t:]
            section_sin_h = sin[..., : section_h + section_w : 2]
            section_sin_w = sin[..., 1 : section_h + section_w : 2]

            sin_t, sin_h, sin_w = section_sin_t[0], section_sin_h[1], section_sin_w[2]
            sin_hw = torch.stack([sin_h, sin_w], dim=-1).reshape(
                sin_h.shape[:-1] + (sin_h.shape[-1] * 2,)
            )
            sin = torch.cat([sin_hw, sin_t], dim=-1)

        query_shape = query.shape
        query = query.view(num_tokens, -1, self.head_size)
        query_rot = query[..., : self.rotary_dim]
        query_pass = query[..., self.rotary_dim :]
        query_rot = apply_rotary_emb_dispatch(query_rot, cos, sin, self.is_neox_style)
        query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)

        key_shape = key.shape
        key = key.view(num_tokens, -1, self.head_size)
        key_rot = key[..., : self.rotary_dim]
        key_pass = key[..., self.rotary_dim :]
        key_rot = apply_rotary_emb_dispatch(key_rot, cos, sin, self.is_neox_style)
        key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
        return query, key

    def forward_cuda(  # type: ignore[override]
        self,
        positions: torch.Tensor,
        query: torch.Tensor,
        key: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        return self.forward_native(positions, query, key)
model_executor/layers/rotary_embedding/linear_scaling_rope.py (new file, 115 lines)
@@ -0,0 +1,115 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project


# Adapted from
# https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch

from .base import RotaryEmbedding


class LinearScalingRotaryEmbedding(RotaryEmbedding):
    """RotaryEmbedding extended with linear scaling.

    It supports multiple scaling factors. Since multiple LoRA adapters may have
    different scaling factors, we need multiple cos/sin caches. In this way,
    instead of running rotary embedding kernel per lora, we can run multiple
    lora in a batched way.

    In addition to that, we also keep the cos/sin cache for the scaling factor
    of 1 (default) at all times.

    Exemplary for two scaling factors x=1, y and z with embeddings
    [[x11, x12, ... x1m], ..., [xn1, xn2, ..., xnm]] and
    [[y11, y12, ... y1o], ..., [yn1, yn2, ..., yno]], and
    [[z11, z12, ... z1p], ..., [zn1, zn2, ..., znp]],

    we construct the cos/sin cache as follows:
    [[x11, x12, ... x1m, y11, y12, ... y1o, z11, z12, ... z1p],
     ...
     [xn1, xn2, ... xnm, yn1, yn2, ... yno, zn1, zn2, ... znp]]

    We then use offsets to index into the cos/sin cache for
    the respective scaling factors.

    The offset to cache can be accessed via `scaling_factor_to_offset` API.

    Credits to the Reddit user /u/kaiokendev
    """

    def __init__(
        self,
        head_size: int,
        rotary_dim: int,
        max_position_embeddings: int,
        base: float,
        is_neox_style: bool,
        scaling_factors: list[float] | float,
        dtype: torch.dtype,
    ) -> None:
        if isinstance(scaling_factors, float):
            scaling_factors = [scaling_factors]
        self.scaling_factors: list[float] = scaling_factors  # noqa
        super().__init__(
            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
        )
        # Lazy initialized.
        self._scaling_factor_to_offset: dict[float, int]

    def _compute_cos_sin_cache(self) -> torch.Tensor:
        inv_freq = self._compute_inv_freq(self.base)
        cache_list: list[torch.Tensor] = []
        # offsets to the next cache in a tensor.
        # Each offset corresponds to the same index in scaling_factors.
        offsets: list[int] = []
        for scaling_factor in self.scaling_factors:
            # NOTE(woosuk): self.max_position_embeddings is the original
            # maximum length before applying the rope scaling.
            # Thus, the maximum length after applying the rope scaling is
            # self.max_position_embeddings * self.scaling_factor.
            max_len = self.max_position_embeddings * scaling_factor
            t = torch.arange(max_len, dtype=torch.float)
            t = t / scaling_factor

            freqs = torch.einsum("i,j -> ij", t, inv_freq)
            cos = freqs.cos()
            sin = freqs.sin()
            cache = torch.cat((cos, sin), dim=-1)
            if not cache_list:
                offset = 0
            else:
                last_offset = offsets[-1]
                next_max_len = cache_list[-1].shape[0]
                offset = last_offset + next_max_len
            offsets.append(offset)
            cache_list.append(cache)
        self._scaling_factor_to_offset = {
            float(scaling_factor): offsets[i]
            for i, scaling_factor in enumerate(self.scaling_factors)
        }
        assert len(self.scaling_factors) == len(offsets)
        return torch.cat(cache_list, dim=0)

    @property
    def scaling_factor_to_offset(self) -> dict[float, int]:
        return self._scaling_factor_to_offset
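A usage sketch of the concatenated-cache layout described in the docstring (assumes the module can be constructed outside a full engine; values illustrative):

import torch

rope = LinearScalingRotaryEmbedding(
    head_size=64,
    rotary_dim=64,
    max_position_embeddings=1024,
    base=10000.0,
    is_neox_style=True,
    scaling_factors=[1.0, 4.0],
    dtype=torch.float32,
)
# Factor 1.0 occupies rows [0, 1024); factor 4.0 starts at offset 1024 and
# contributes 1024 * 4 = 4096 rows, for 5120 cache rows in total.
print(rope.scaling_factor_to_offset)  # {1.0: 0, 4.0: 1024}
print(rope.cos_sin_cache.shape[0])    # 5120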
model_executor/layers/rotary_embedding/llama3_rope.py (new file, 54 lines)
@@ -0,0 +1,54 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import math

import torch

from .base import RotaryEmbedding


class Llama3RotaryEmbedding(RotaryEmbedding):
    def __init__(
        self,
        head_size: int,
        rotary_dim: int,
        max_position_embeddings: int,
        base: float,
        is_neox_style: bool,
        dtype: torch.dtype,
        scaling_factor: float,
        low_freq_factor: float,
        high_freq_factor: float,
        orig_max_position: int,
    ) -> None:
        self.scaling_factor = scaling_factor
        self.low_freq_factor = low_freq_factor
        self.high_freq_factor = high_freq_factor
        self.orig_max_position = orig_max_position
        super().__init__(
            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
        )

    def _compute_inv_freq(self, base: float) -> torch.Tensor:
        inv_freqs = super()._compute_inv_freq(base)
        low_freq_wavelen = self.orig_max_position / self.low_freq_factor
        high_freq_wavelen = self.orig_max_position / self.high_freq_factor

        wave_len = 2 * math.pi / inv_freqs
        if self.low_freq_factor != self.high_freq_factor:
            smooth = (self.orig_max_position / wave_len - self.low_freq_factor) / (
                self.high_freq_factor - self.low_freq_factor
            )
        else:
            smooth = 0
        new_freqs = torch.where(
            wave_len < high_freq_wavelen,
            inv_freqs,
            torch.where(
                wave_len > low_freq_wavelen,
                inv_freqs / self.scaling_factor,
                (1 - smooth) * inv_freqs / self.scaling_factor + smooth * inv_freqs,
            ),
        )
        return new_freqs
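The frequency-band split above in concrete numbers (illustrative values):

# With orig_max_position=8192, low_freq_factor=1.0, high_freq_factor=4.0 and
# scaling_factor=8.0:
#   * wavelengths shorter than 8192 / 4 = 2048 keep their frequency,
#   * wavelengths longer than 8192 / 1 = 8192 are divided by 8,
#   * wavelengths in between are linearly blended via `smooth`.
orig_max_position, low_freq_factor, high_freq_factor = 8192, 1.0, 4.0
low_freq_wavelen = orig_max_position / low_freq_factor    # 8192.0
high_freq_wavelen = orig_max_position / high_freq_factor  # 2048.0
print(high_freq_wavelen, low_freq_wavelen)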
model_executor/layers/rotary_embedding/llama4_vision_rope.py (new file, 80 lines)
@@ -0,0 +1,80 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import math

import torch

from .base import RotaryEmbeddingBase


class Llama4VisionRotaryEmbedding(RotaryEmbeddingBase):
    def __init__(
        self,
        head_size: int,
        rotary_dim: int,
        max_position_embeddings: int,
        base: float,
        is_neox_style: bool,
        dtype: torch.dtype,
    ):
        super().__init__(
            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
        )

    def _compute_inv_freq(self, base: float) -> torch.Tensor:
        inv_freqs = super()._compute_inv_freq(base)
        inv_freqs = inv_freqs[: (self.rotary_dim // 2)]
        return inv_freqs

    def _compute_cos_sin_cache(self) -> torch.Tensor:
        inv_freq = self._compute_inv_freq(self.base)

        # self.max_position_embeddings here is number of image patches
        # i.e. (image_size // patch_size) ** 2
        num_patches = self.max_position_embeddings
        img_idx = torch.arange(num_patches, dtype=torch.int32).reshape(num_patches, 1)
        img_idx = torch.cat([img_idx, img_idx[:1]], dim=0)
        img_idx[-1, -1] = -2  # set to ID_CLS_TOKEN
        num_patches_single_dim = int(math.sqrt(num_patches))
        frequencies_x = img_idx % num_patches_single_dim
        frequencies_y = img_idx // num_patches_single_dim
        freqs_x = (
            (frequencies_x + 1)[..., None] * inv_freq[None, None, :]
        ).repeat_interleave(2, dim=-1)
        freqs_y = (
            (frequencies_y + 1)[..., None] * inv_freq[None, None, :]
        ).repeat_interleave(2, dim=-1)
        freqs = torch.cat([freqs_x, freqs_y], dim=-1).float().contiguous()[..., ::2]
        freqs = freqs.masked_fill(img_idx.reshape(-1, 1, 1) < 0, 0)
        cache = torch.view_as_complex(
            torch.stack([torch.cos(freqs), torch.sin(freqs)], dim=-1)
        )
        return cache

    def forward_native(  # type: ignore[override]
        self,
        query: torch.Tensor,
        key: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        assert key is not None
        # self.cos_sin_cache here is complex tensor so we cannot cast into
        # query's dtype directly with self._match_cos_sin_cache_dtype
        self.cos_sin_cache: torch.Tensor = self.cos_sin_cache.to(query.device)
        query_ = torch.view_as_complex(query.float().reshape(*query.shape[:-1], -1, 2))
        key_ = torch.view_as_complex(key.float().reshape(*key.shape[:-1], -1, 2))
        broadcast_shape = [
            d if i == 1 or i == (query_.ndim - 1) else 1
            for i, d in enumerate(query_.shape)
        ]
        freqs_ci = self.cos_sin_cache.view(*broadcast_shape)
        query_out = torch.view_as_real(query_ * freqs_ci).flatten(3)
        key_out = torch.view_as_real(key_ * freqs_ci).flatten(3)
        return query_out.type_as(query), key_out.type_as(key)

    def forward_cuda(  # type: ignore[override]
        self,
        query: torch.Tensor,
        key: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        return self.forward_native(query, key)
model_executor/layers/rotary_embedding/mrope.py (new file, 403 lines)
@@ -0,0 +1,403 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from vllm.triton_utils import tl, triton
|
||||
|
||||
from .base import RotaryEmbeddingBase
|
||||
from .common import apply_rotary_emb_dispatch
|
||||
from .yarn_scaling_rope import YaRNScalingRotaryEmbedding, yarn_get_mscale
|
||||
|
||||
|
||||
@triton.jit
|
||||
def _triton_mrope_forward(
|
||||
q_ptr,
|
||||
k_ptr,
|
||||
cos,
|
||||
sin,
|
||||
num_tokens,
|
||||
n_qh: tl.constexpr,
|
||||
n_kh: tl.constexpr,
|
||||
hd: tl.constexpr,
|
||||
rd: tl.constexpr,
|
||||
pad_n_qh: tl.constexpr,
|
||||
pad_n_kh: tl.constexpr,
|
||||
pad_hd: tl.constexpr,
|
||||
mrope_section_t: tl.constexpr,
|
||||
mrope_section_h: tl.constexpr,
|
||||
mrope_section_w: tl.constexpr,
|
||||
is_interleaved: tl.constexpr,
|
||||
):
|
||||
# Adapted from
|
||||
# https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/ops/qwen2vl_mrope.py
|
||||
# This version supports flatten input tensors from vllm
|
||||
# and supports cos and sin cache with shape (3, num_tokens, head_dim // 2)
|
||||
# instead of (3, bsz, seq_len, head_dim), also supports interleaved rotary
|
||||
pid = tl.program_id(0)
|
||||
# locate start address
|
||||
q_ptr = q_ptr + pid * (n_qh * hd)
|
||||
k_ptr = k_ptr + pid * (n_kh * hd)
|
||||
|
||||
# ####################################################################
|
||||
# get the cos(mθ_{i...d/2}) and sin(mθ_{i...d/2}) for token position
|
||||
# m of this program instance
|
||||
# ####################################################################
|
||||
# Note: cos and sin now have shape (3, num_tokens, head_dim // 2)
|
||||
|
||||
# Updated stride calculation for half head_dim
|
||||
half_rd = rd // 2
|
||||
t_cos = cos + pid * half_rd
|
||||
h_cos = t_cos + num_tokens * half_rd
|
||||
w_cos = h_cos + num_tokens * half_rd
|
||||
t_sin = sin + pid * half_rd
|
||||
h_sin = t_sin + num_tokens * half_rd
|
||||
w_sin = h_sin + num_tokens * half_rd
|
||||
|
||||
# Updated offsets for half head_dim
|
||||
cos_offsets = tl.arange(0, pad_hd // 2)
    if is_interleaved:
        h_mask = ((cos_offsets % 3) == 1) & (cos_offsets <= 3 * mrope_section_h)
        w_mask = ((cos_offsets % 3) == 2) & (cos_offsets <= 3 * mrope_section_w)
        t_mask = ~(h_mask | w_mask)
    else:
        t_end = mrope_section_t
        h_end = t_end + mrope_section_h
        t_mask = cos_offsets < mrope_section_t
        h_mask = (t_end <= cos_offsets) & (cos_offsets < h_end)
        w_mask = (h_end <= cos_offsets) & (cos_offsets < half_rd)

    t_cos_row = tl.load(t_cos + cos_offsets, mask=t_mask, other=0)
    h_cos_row = tl.load(h_cos + cos_offsets, mask=h_mask, other=0)
    w_cos_row = tl.load(w_cos + cos_offsets, mask=w_mask, other=0)
    t_sin_row = tl.load(t_sin + cos_offsets, mask=t_mask, other=0)
    h_sin_row = tl.load(h_sin + cos_offsets, mask=h_mask, other=0)
    w_sin_row = tl.load(w_sin + cos_offsets, mask=w_mask, other=0)

    cos_row = t_cos_row + h_cos_row + w_cos_row
    sin_row = t_sin_row + h_sin_row + w_sin_row

    # ####################################################################
    # Load the left and right half of q and k for the current
    # program instance (i.e. for the current token) separately
    # ####################################################################
    # left half of the head
    first_half_q_offsets = (
        tl.arange(0, pad_n_qh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
    )
    first_half_k_offsets = (
        tl.arange(0, pad_n_kh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
    )
    first_q_mask = (tl.arange(0, pad_n_qh)[:, None] < n_qh) & (
        tl.arange(0, pad_hd // 2)[None, :] < rd // 2
    )
    first_k_mask = (tl.arange(0, pad_n_kh)[:, None] < n_kh) & (
        tl.arange(0, pad_hd // 2)[None, :] < rd // 2
    )

    q_tile_1 = tl.load(q_ptr + first_half_q_offsets, mask=first_q_mask, other=0).to(
        sin_row.dtype
    )
    k_tile_1 = tl.load(k_ptr + first_half_k_offsets, mask=first_k_mask, other=0).to(
        sin_row.dtype
    )

    # right half of the head
    second_half_q_offsets = first_half_q_offsets + (rd // 2)
    second_half_k_offsets = first_half_k_offsets + (rd // 2)
    second_q_mask = first_q_mask
    second_k_mask = first_k_mask

    q_tile_2 = tl.load(q_ptr + second_half_q_offsets, mask=second_q_mask, other=0).to(
        sin_row.dtype
    )
    k_tile_2 = tl.load(k_ptr + second_half_k_offsets, mask=second_k_mask, other=0).to(
        sin_row.dtype
    )

    # y = [x1, x2] * [cos, cos] + [-x2, x1] * [sin, sin]
    # Since cos and sin are now half-size,
    # we use the same cos_row and sin_row for both halves
    new_q_tile_1 = q_tile_1 * cos_row - q_tile_2 * sin_row
    tl.store(q_ptr + first_half_q_offsets, new_q_tile_1, mask=first_q_mask)
    new_q_tile_2 = q_tile_2 * cos_row + q_tile_1 * sin_row
    tl.store(q_ptr + second_half_q_offsets, new_q_tile_2, mask=second_q_mask)

    new_k_tile_1 = k_tile_1 * cos_row - k_tile_2 * sin_row
    tl.store(k_ptr + first_half_k_offsets, new_k_tile_1, mask=first_k_mask)
    new_k_tile_2 = k_tile_2 * cos_row + k_tile_1 * sin_row
    tl.store(k_ptr + second_half_k_offsets, new_k_tile_2, mask=second_k_mask)


def triton_mrope(
    q: torch.Tensor,
    k: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    mrope_section: list[int],
    head_size: int,
    rotary_dim: int,
    mrope_interleaved: bool,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Qwen2VL mrope kernel.

    Args:
        q: [num_tokens, num_heads * head_size]
        k: [num_tokens, num_kv_heads * head_size]
        cos: [3, num_tokens, head_size // 2]
            (T/H/W positions with multimodal inputs)
        sin: [3, num_tokens, head_size // 2]
            (T/H/W positions with multimodal inputs)
        mrope_section: [t, h, w]
        head_size: size of each attention head
        rotary_dim: number of dimensions that receive the rotary embedding
        mrope_interleaved: whether the T/H/W frequencies are interleaved
    """
    n_row, n_q_head_head_dim = q.shape
    n_q_head = n_q_head_head_dim // head_size
    n_kv_head = k.shape[1] // head_size
    pad_hd = triton.next_power_of_2(head_size)
    pad_n_q_head = triton.next_power_of_2(n_q_head)
    pad_n_kv_head = triton.next_power_of_2(n_kv_head)

    # Ensure tensors passed into the kernel are contiguous.
    # It will be a no-op if they are already contiguous.
    q = q.contiguous()
    k = k.contiguous()
    cos = cos.contiguous()
    sin = sin.contiguous()

    _triton_mrope_forward[(n_row,)](
        q,
        k,
        cos,
        sin,
        n_row,
        n_q_head,
        n_kv_head,
        head_size,
        rotary_dim,
        pad_n_q_head,
        pad_n_kv_head,
        pad_hd,
        mrope_section[0],
        mrope_section[1],
        mrope_section[2],
        mrope_interleaved,
    )
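    # The kernel rotates q and k in place. If the inputs were not already
    # contiguous, the .contiguous() calls above created copies, so callers
    # should use the returned tensors rather than the originals.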
    return q, k


def apply_interleaved_rope(x: torch.Tensor, mrope_section: list[int]) -> torch.Tensor:
    """Apply interleaved MRoPE to 3D rotary embeddings.
    Reorganizes frequency layout from chunked [TTT...HHH...WWW] to
    interleaved [THTHWHTHW...TT], preserving frequency continuity.
    """
    x_t = x[0].clone()
    x_t[..., 1 : mrope_section[1] * 3 : 3] = x[1, ..., 1 : mrope_section[1] * 3 : 3]
    x_t[..., 2 : mrope_section[2] * 3 : 3] = x[2, ..., 2 : mrope_section[2] * 3 : 3]
    return x_t


class MRotaryEmbedding(RotaryEmbeddingBase):
    """Rotary Embedding with Multimodal Sections."""

    def __init__(
        self,
        head_size: int,
        rotary_dim: int,
        max_position_embeddings: int,
        base: float,
        is_neox_style: bool,
        dtype: torch.dtype,
        mrope_section: list[int] | None = None,
        mrope_interleaved: bool = False,
        # YaRN parameters.
        *,
        scaling_factor: float | None = None,
        extrapolation_factor: float = 1,
        attn_factor: float = 1,
        beta_fast: int = 32,
        beta_slow: int = 1,
    ) -> None:
        self.scaling_factor = scaling_factor
        self.extrapolation_factor = extrapolation_factor
        self.attn_factor = attn_factor
        self.beta_fast = beta_fast
        self.beta_slow = beta_slow
        if self.scaling_factor is not None:
            # Get n-d magnitude scaling corrected for interpolation
            self.mscale = float(yarn_get_mscale(self.scaling_factor) * attn_factor)
        else:
            self.mscale = 1.0

        # In Qwen2.5-VL, the maximum index value is related to the duration of
        # the input video. We enlarge max_position_embeddings by a factor of 4
        # to get a larger cos and sin cache.
        self.cache_max_position_num = max_position_embeddings * 4
        super().__init__(
            head_size,
            rotary_dim,
            self.cache_max_position_num,
            base,
            is_neox_style,
            dtype,
        )

        self.mrope_section = mrope_section
        self.mrope_interleaved = mrope_interleaved
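        # The three T/H/W sections must tile the half rotary dim exactly,
        # because the cos/sin cache holds rotary_dim // 2 frequencies per
        # position.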
        if self.mrope_section:
            assert sum(self.mrope_section) == rotary_dim // 2

    def _compute_inv_freq(self, base: float) -> torch.Tensor:
        if self.scaling_factor is None:
            return super()._compute_inv_freq(base)
        return YaRNScalingRotaryEmbedding._compute_inv_freq(self, base)

    def _compute_cos_sin_cache(self) -> torch.Tensor:
        if self.scaling_factor is None:
            return super()._compute_cos_sin_cache()
        return YaRNScalingRotaryEmbedding._compute_cos_sin_cache(self)

    def forward_native(
        self,
        positions: torch.Tensor,
        query: torch.Tensor,
        key: torch.Tensor | None = None,
        offsets: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """PyTorch-native implementation equivalent to forward().

        Args:
            positions:
                [num_tokens,] (text only) or
                [3, num_tokens] (T/H/W positions with multimodal inputs)
            query: [num_tokens, num_heads * head_size]
            key: [num_tokens, num_kv_heads * head_size]
        """
        assert positions.ndim == 1 or positions.ndim == 2
        assert key is not None

        self._match_cos_sin_cache_dtype(query)
        num_tokens = positions.shape[-1]
        cos_sin = self.cos_sin_cache[positions]
        cos, sin = cos_sin.chunk(2, dim=-1)
        if positions.ndim == 2:
            assert self.mrope_section
            if self.mrope_interleaved:
                cos = apply_interleaved_rope(cos, self.mrope_section)
                sin = apply_interleaved_rope(sin, self.mrope_section)
            else:
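                # Chunked layout: split the half dim into [t, h, w] sections,
                # take the T rows for the first section, H for the second and
                # W for the third, then re-concatenate along the last dim.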
                cos = torch.cat(
                    [m[i] for i, m in enumerate(cos.split(self.mrope_section, dim=-1))],
                    dim=-1,
                )
                sin = torch.cat(
                    [m[i] for i, m in enumerate(sin.split(self.mrope_section, dim=-1))],
                    dim=-1,
                )

        query_shape = query.shape
        query = query.view(num_tokens, -1, self.head_size)
        query_rot = query[..., : self.rotary_dim]
        query_pass = query[..., self.rotary_dim :]
        query_rot = apply_rotary_emb_dispatch(query_rot, cos, sin, self.is_neox_style)
        query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)

        key_shape = key.shape
        key = key.view(num_tokens, -1, self.head_size)
        key_rot = key[..., : self.rotary_dim]
        key_pass = key[..., self.rotary_dim :]
        key_rot = apply_rotary_emb_dispatch(key_rot, cos, sin, self.is_neox_style)
        key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
        return query, key

    def forward_cuda(
        self,
        positions: torch.Tensor,
        query: torch.Tensor,
        key: torch.Tensor | None = None,
        offsets: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        assert positions.ndim == 1 or positions.ndim == 2
        assert key is not None
        from vllm import _custom_ops as ops

        self._match_cos_sin_cache_dtype(query)

        if self.mrope_interleaved:
            cos_sin = self.cos_sin_cache[positions]
            cos, sin = cos_sin.chunk(2, dim=-1)
            query_shape = query.shape
            key_shape = key.shape
            if positions.ndim == 2:
                assert self.mrope_section
                q, k = triton_mrope(
                    query,
                    key,
                    cos,
                    sin,
                    self.mrope_section,
                    self.head_size,
                    self.rotary_dim,
                    self.mrope_interleaved,
                )

                return q.reshape(query_shape), k.reshape(key_shape)
            # 1-D (text-only) positions fall through to the custom-op path
            # below; with a single position stream the section layout does not
            # matter.

        if positions.ndim == 1:
            ops.rotary_embedding(
                positions,
                query,
                key,
                self.head_size,
                self.cos_sin_cache,
                self.is_neox_style,
            )
        else:
            if self.is_neox_style:
                ops.m_rotary_embedding(
                    positions.contiguous(),
                    query,
                    key,
                    self.head_size,
                    self.cos_sin_cache,
                    torch.tensor(self.mrope_section, dtype=torch.int),
                    self.is_neox_style,
                )
            else:
                query, key = self.forward_native(positions, query, key)

        return query, key

    def forward_cpu(
        self,
        positions: torch.Tensor,
        query: torch.Tensor,
        key: torch.Tensor | None = None,
        offsets: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        return self.forward_native(positions, query, key, offsets)

    @staticmethod
    def get_next_input_positions(
        mrope_position_delta: int,
        context_len: int,
        seq_len: int,
    ) -> list[list[int]]:
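        # During text-only continuation the T/H/W position streams advance in
        # lockstep, so the same shifted range is replicated for all three.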
        return [
            list(
                range(
                    context_len + mrope_position_delta, seq_len + mrope_position_delta
                )
            )
            for _ in range(3)
        ]

    @staticmethod
    def get_next_input_positions_tensor(
        out: np.ndarray,
        out_offset: int,
        mrope_position_delta: int,
        context_len: int,
        num_new_tokens: int,
    ):
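        # The same arange is broadcast into all three (T/H/W) rows of `out`.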
        values = np.arange(
            mrope_position_delta + context_len,
            mrope_position_delta + context_len + num_new_tokens,
            dtype=out.dtype,
        )
        out[:, out_offset : out_offset + num_new_tokens] = values
47
model_executor/layers/rotary_embedding/ntk_scaling_rope.py
Normal file
@@ -0,0 +1,47 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project


import torch

from .base import RotaryEmbedding


class NTKScalingRotaryEmbedding(RotaryEmbedding):
    """RotaryEmbedding extended with fixed and mixed NTK scaling.
    https://kexue.fm/archives/9706"""

    def __init__(
        self,
        head_size: int,
        rotary_dim: int,
        max_position_embeddings: int,
        base: float,
        is_neox_style: bool,
        scaling_factor: float,
        dtype: torch.dtype,
        mixed_b: float | None = None,
    ) -> None:
        self.scaling_factor = scaling_factor
        self.mixed_b = mixed_b
        super().__init__(
            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
        )

    def _compute_inv_freq(self, base: float) -> torch.Tensor:
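        # Fixed NTK (`mixed_b` is None): scale the rotary base by
        # `scaling_factor`, then divide by the constant correction
        # scaling_factor ** (2 / rotary_dim). Mixed NTK instead divides each
        # inverse frequency by lambda_m = exp(a * m ** mixed_b), following
        # https://kexue.fm/archives/9706.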
        base = self.base * (self.scaling_factor if self.mixed_b is None else 1)
        inv_freq = super()._compute_inv_freq(base)

        if self.mixed_b is None:
            inv_freq = inv_freq / self.scaling_factor ** (2 / self.rotary_dim)
        else:
            a = (
                torch.tensor(self.scaling_factor).log()
                / (self.rotary_dim / 2) ** self.mixed_b
            )
            lambda_1_m = (
                a * torch.arange(1, self.rotary_dim // 2 + 1).float() ** self.mixed_b
            ).exp()
            inv_freq = inv_freq / lambda_1_m

        return inv_freq
151
model_executor/layers/rotary_embedding/phi3_long_rope_scaled_rope.py
Normal file
@@ -0,0 +1,151 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math

import torch
import torch.nn as nn

from vllm.config import get_current_vllm_config
from vllm.logger import init_logger

from .common import rotate_neox

logger = init_logger(__name__)

import ixformer.inference.functions as ixops


class Phi3LongRoPEScaledRotaryEmbedding(nn.Module):
    """Phi3 family of models scaled rotary embedding.

    Based on the original RotaryEmbedding implementation.
    """

    def __init__(
        self,
        head_size: int,
        rotary_dim: int,
        max_position_embeddings: int,
        original_max_position_embeddings: int,
        base: float,
        is_neox_style: bool,
        dtype: torch.dtype,
        short_factor: list[float],
        long_factor: list[float],
        short_mscale: float | None = None,
        long_mscale: float | None = None,
    ):
        super().__init__()

        if is_neox_style is False:
            raise ValueError(
                "`Phi3LongRoPEScaledRotaryEmbedding` only supports neox_style."
            )

        self.rotary_dim = rotary_dim
        self.head_size = head_size
        self.max_position_embeddings = max_position_embeddings
        self.original_max_position_embeddings = original_max_position_embeddings
        self.base = base
        self.short_factor = short_factor
        self.long_factor = long_factor

        # Force long factors if max_model_len (runtime max length) exceeds
        # original_max_position_embeddings to prevent KV cache invalidation when
        # sequences cross this threshold during generation
        max_model_len = get_current_vllm_config().model_config.max_model_len
        self.use_long_rope = max_model_len > original_max_position_embeddings
        if self.use_long_rope:
            logger.warning_once(
                "Using LongRoPE scaling factors. This enables longer "
                "contexts (%d tokens vs original %d tokens) at the cost of "
                "some performance degradation for shorter sequences. If "
                "this is not desired, set `max_model_len` to be at most %d.",
                max_position_embeddings,
                original_max_position_embeddings,
                original_max_position_embeddings,
            )
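
        # Default attention scaling used here: sqrt(1 + ln(scale) /
        # ln(original_max_position_embeddings)) once the context is actually
        # extended, otherwise 1.0.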
        scale = self.max_position_embeddings / self.original_max_position_embeddings
        if scale <= 1.0:
            scaling_factor = 1.0
        else:
            scaling_factor = math.sqrt(
                1 + math.log(scale) / math.log(self.original_max_position_embeddings)
            )
        if short_mscale is None:
            short_mscale = scaling_factor
        if long_mscale is None:
            long_mscale = scaling_factor

        self.short_mscale = short_mscale
        self.long_mscale = long_mscale

        short_cache = self._compute_cos_sin_cache(
            original_max_position_embeddings, short_factor, short_mscale
        )
        short_cache = short_cache.to(dtype)

        long_cache = self._compute_cos_sin_cache(
            max_position_embeddings, long_factor, long_mscale
        )
        long_cache = long_cache.to(dtype)
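
        # Layout of the joint cache: rows [0, original_max_position_embeddings)
        # come from the short-factor cache and the long-factor cache follows,
        # so long-context entries start at offset
        # original_max_position_embeddings.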
        long_short_cache = torch.cat([short_cache, long_cache], dim=0)
        self.register_buffer(
            "long_short_cos_sin_cache", long_short_cache, persistent=False
        )

    def _compute_inv_freq(self, rescale_factors: list[float]) -> torch.Tensor:
        rescale_factors = torch.tensor(rescale_factors, dtype=torch.float32)
        inv_freq = 1.0 / (
            rescale_factors
            * (
                self.base
                ** (
                    torch.arange(0, self.rotary_dim, 2, dtype=torch.float)
                    / self.rotary_dim
                )
            )
        )
        return inv_freq

    def _compute_cos_sin_cache(
        self,
        max_position_embeddings: int,
        rescale_factors: list[float],
        mscale: float,
    ) -> torch.Tensor:
        inv_freq = self._compute_inv_freq(rescale_factors)
        t = torch.arange(max_position_embeddings, dtype=torch.float)
        freqs = torch.einsum("i,j -> ij", t, inv_freq)
        cos = freqs.cos() * mscale
        sin = freqs.sin() * mscale
        cache = torch.cat((cos, sin), dim=-1)
        return cache

    def forward(
        self,
        positions: torch.Tensor,
        query: torch.Tensor,
        key: torch.Tensor | None = None,
        offsets: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        assert key is not None
        query = query.view(*query.shape[:-1], -1, self.head_size)
        key = key.view(*key.shape[:-1], -1, self.head_size)

        k = self.original_max_position_embeddings
        long_prompt_offset = torch.any(positions > k)

        ixops.vllm_rotary_embedding_phi(
            positions,
            query,
            key,
            self.head_size,
            self.long_short_cos_sin_cache,
            long_prompt_offset,
            k,
            offsets,
        )

        return query, key
81
model_executor/layers/rotary_embedding/yarn_scaling_rope.py
Normal file
@@ -0,0 +1,81 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import torch

from .base import RotaryEmbedding
from .common import yarn_find_correction_range, yarn_get_mscale, yarn_linear_ramp_mask


class YaRNScalingRotaryEmbedding(RotaryEmbedding):
    """RotaryEmbedding extended with YaRN method.

    Credits to Peng et al. github.com/jquesnelle/yarn
    """

    def __init__(
        self,
        head_size: int,
        rotary_dim: int,
        max_position_embeddings: int,
        base: float,
        is_neox_style: bool,
        scaling_factor: float,
        dtype: torch.dtype,
        *,
        extrapolation_factor: float = 1,
        attn_factor: float = 1,
        beta_fast: int = 32,
        beta_slow: int = 1,
        apply_yarn_scaling: bool = True,
    ) -> None:
        self.scaling_factor = scaling_factor
        self.extrapolation_factor = extrapolation_factor
        self.attn_factor = attn_factor
        self.beta_fast = beta_fast
        self.beta_slow = beta_slow
        # Get n-d magnitude scaling corrected for interpolation
        self.mscale = (
            float(yarn_get_mscale(self.scaling_factor) * attn_factor)
            if apply_yarn_scaling
            else float(attn_factor)
        )
        super().__init__(
            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
        )

    def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor:
        pos_freqs = self.base ** (
            torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim
        )
        inv_freq_extrapolation = 1.0 / pos_freqs
        inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs)

        low, high = yarn_find_correction_range(
            self.beta_fast,
            self.beta_slow,
            self.rotary_dim,
            self.base,
            self.max_position_embeddings,
        )
        # Get n-d rotational scaling corrected for extrapolation
        inv_freq_mask = (
            1
            - yarn_linear_ramp_mask(low, high, self.rotary_dim // 2, dtype=torch.float)
        ) * self.extrapolation_factor
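        # Blend the two regimes: where inv_freq_mask == 1 the original
        # (extrapolated) frequency is kept, where it == 0 the
        # position-interpolated frequency is used, with a linear ramp between
        # the correction bounds.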
        inv_freq = (
            inv_freq_interpolation * (1 - inv_freq_mask)
            + inv_freq_extrapolation * inv_freq_mask
        )
        return inv_freq

    def _compute_cos_sin_cache(self) -> torch.Tensor:
        inv_freq = self._compute_inv_freq(self.scaling_factor)
        t = torch.arange(
            self.max_position_embeddings * self.scaling_factor, dtype=torch.float32
        )
        freqs = torch.einsum("i,j -> ij", t, inv_freq)
        cos = freqs.cos() * self.mscale
        sin = freqs.sin() * self.mscale
        cache = torch.cat((cos, sin), dim=-1)
        return cache