Add minimal vLLM 0.16.1 build repo for BI-V150
This commit is contained in:
328
vllm/model_executor/layers/rotary_embedding/__init__.py
Normal file
328
vllm/model_executor/layers/rotary_embedding/__init__.py
Normal file
@@ -0,0 +1,328 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Rotary Positional Embeddings."""
|
||||
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
|
||||
from .base import RotaryEmbedding
|
||||
from .deepseek_scaling_rope import DeepseekScalingRotaryEmbedding
|
||||
from .dual_chunk_rope import DualChunkRotaryEmbedding
|
||||
from .dynamic_ntk_alpha_rope import DynamicNTKAlphaRotaryEmbedding
|
||||
from .dynamic_ntk_scaling_rope import DynamicNTKScalingRotaryEmbedding
|
||||
from .fope import FourierRotaryEmbedding
|
||||
from .linear_scaling_rope import LinearScalingRotaryEmbedding
|
||||
from .llama3_rope import Llama3RotaryEmbedding
|
||||
from .llama4_vision_rope import Llama4VisionRotaryEmbedding
|
||||
from .mrope import MRotaryEmbedding
|
||||
from .mrope_interleaved import MRotaryEmbeddingInterleaved
|
||||
from .ntk_scaling_rope import NTKScalingRotaryEmbedding
|
||||
from .phi3_long_rope_scaled_rope import Phi3LongRoPEScaledRotaryEmbedding
|
||||
from .xdrope import XDRotaryEmbedding
|
||||
from .yarn_scaling_rope import YaRNScalingRotaryEmbedding
|
||||
|
||||
_ROPE_DICT: dict[tuple[Any, ...], RotaryEmbedding] = {}
|
||||
|
||||
|
||||
def get_rope(
|
||||
head_size: int,
|
||||
max_position: int,
|
||||
is_neox_style: bool = True,
|
||||
rope_parameters: dict[str, Any] | None = None,
|
||||
dtype: torch.dtype | None = None,
|
||||
dual_chunk_attention_config: dict[str, Any] | None = None,
|
||||
) -> RotaryEmbedding:
|
||||
if dtype is None:
|
||||
dtype = torch.get_default_dtype()
|
||||
if rope_parameters is not None:
|
||||
# Transforms every value that is a list into a tuple for caching calls
|
||||
rope_parameters_tuple = {
|
||||
k: tuple(v) if isinstance(v, list) else v
|
||||
for k, v in rope_parameters.items()
|
||||
}
|
||||
rope_parameters_args = tuple(rope_parameters_tuple.items())
|
||||
else:
|
||||
rope_parameters_args = None
|
||||
|
||||
if dual_chunk_attention_config is not None:
|
||||
dual_chunk_attention_tuple = {
|
||||
k: tuple(v) if isinstance(v, list) else v
|
||||
for k, v in dual_chunk_attention_config.items()
|
||||
if k != "sparse_attention_config"
|
||||
}
|
||||
dual_chunk_attention_args = tuple(dual_chunk_attention_tuple.items())
|
||||
else:
|
||||
dual_chunk_attention_args = None
|
||||
|
||||
rope_parameters = rope_parameters or {}
|
||||
base = rope_parameters.get("rope_theta", 10000)
|
||||
scaling_type = rope_parameters.get("rope_type", "default")
|
||||
partial_rotary_factor = rope_parameters.get("partial_rotary_factor", 1.0)
|
||||
|
||||
if partial_rotary_factor <= 0.0 or partial_rotary_factor > 1.0:
|
||||
raise ValueError(f"{partial_rotary_factor=} must be between 0.0 and 1.0")
|
||||
rotary_dim = int(head_size * partial_rotary_factor)
|
||||
|
||||
key = (
|
||||
head_size,
|
||||
rotary_dim,
|
||||
max_position,
|
||||
is_neox_style,
|
||||
rope_parameters_args,
|
||||
dual_chunk_attention_args,
|
||||
dtype,
|
||||
)
|
||||
if key in _ROPE_DICT:
|
||||
return _ROPE_DICT[key]
|
||||
|
||||
if dual_chunk_attention_config is not None:
|
||||
extra_kwargs = {
|
||||
k: v
|
||||
for k, v in dual_chunk_attention_config.items()
|
||||
if k in ("chunk_size", "local_size")
|
||||
}
|
||||
rotary_emb = DualChunkRotaryEmbedding(
|
||||
head_size,
|
||||
rotary_dim,
|
||||
max_position,
|
||||
base,
|
||||
is_neox_style,
|
||||
dtype,
|
||||
**extra_kwargs,
|
||||
)
|
||||
elif scaling_type == "default":
|
||||
if "mrope_section" in rope_parameters:
|
||||
rotary_emb = MRotaryEmbedding(
|
||||
head_size,
|
||||
rotary_dim,
|
||||
max_position,
|
||||
base,
|
||||
is_neox_style,
|
||||
dtype,
|
||||
mrope_section=rope_parameters["mrope_section"],
|
||||
mrope_interleaved=rope_parameters.get("mrope_interleaved", False),
|
||||
)
|
||||
elif "use_fope" in rope_parameters and rope_parameters["use_fope"]:
|
||||
extra_kwargs = {
|
||||
k: v
|
||||
for k, v in rope_parameters.items()
|
||||
if k
|
||||
in (
|
||||
"num_key_value_heads",
|
||||
"num_inv_freq",
|
||||
"fope_sep_head",
|
||||
"fope_init_factor",
|
||||
)
|
||||
}
|
||||
extra_kwargs["init_cache"] = False
|
||||
rotary_emb = FourierRotaryEmbedding(
|
||||
head_size,
|
||||
rotary_dim,
|
||||
max_position,
|
||||
base,
|
||||
is_neox_style,
|
||||
dtype,
|
||||
**extra_kwargs,
|
||||
)
|
||||
else:
|
||||
rotary_emb = RotaryEmbedding(
|
||||
head_size,
|
||||
rotary_dim,
|
||||
max_position,
|
||||
base,
|
||||
is_neox_style,
|
||||
dtype,
|
||||
)
|
||||
elif scaling_type == "llama3":
|
||||
scaling_factor = rope_parameters["factor"]
|
||||
low_freq_factor = rope_parameters["low_freq_factor"]
|
||||
high_freq_factor = rope_parameters["high_freq_factor"]
|
||||
original_max_position = rope_parameters["original_max_position_embeddings"]
|
||||
rotary_emb = Llama3RotaryEmbedding(
|
||||
head_size,
|
||||
rotary_dim,
|
||||
max_position,
|
||||
base,
|
||||
is_neox_style,
|
||||
dtype,
|
||||
scaling_factor,
|
||||
low_freq_factor,
|
||||
high_freq_factor,
|
||||
original_max_position,
|
||||
)
|
||||
elif scaling_type == "mllama4":
|
||||
rotary_emb = Llama4VisionRotaryEmbedding(
|
||||
head_size, rotary_dim, max_position, base, is_neox_style, dtype
|
||||
)
|
||||
elif scaling_type == "linear":
|
||||
scaling_factor = rope_parameters["factor"]
|
||||
rotary_emb = LinearScalingRotaryEmbedding(
|
||||
head_size,
|
||||
rotary_dim,
|
||||
max_position,
|
||||
base,
|
||||
is_neox_style,
|
||||
scaling_factor,
|
||||
dtype,
|
||||
)
|
||||
elif scaling_type == "ntk":
|
||||
scaling_factor = rope_parameters["factor"]
|
||||
mixed_b = rope_parameters.get("mixed_b")
|
||||
rotary_emb = NTKScalingRotaryEmbedding(
|
||||
head_size,
|
||||
rotary_dim,
|
||||
max_position,
|
||||
base,
|
||||
is_neox_style,
|
||||
scaling_factor,
|
||||
dtype,
|
||||
mixed_b,
|
||||
)
|
||||
elif scaling_type == "dynamic":
|
||||
if "alpha" in rope_parameters:
|
||||
scaling_alpha = rope_parameters["alpha"]
|
||||
rotary_emb = DynamicNTKAlphaRotaryEmbedding(
|
||||
head_size,
|
||||
rotary_dim,
|
||||
max_position,
|
||||
base,
|
||||
is_neox_style,
|
||||
scaling_alpha,
|
||||
dtype,
|
||||
)
|
||||
elif "factor" in rope_parameters:
|
||||
scaling_factor = rope_parameters["factor"]
|
||||
rotary_emb = DynamicNTKScalingRotaryEmbedding(
|
||||
head_size,
|
||||
rotary_dim,
|
||||
max_position,
|
||||
base,
|
||||
is_neox_style,
|
||||
scaling_factor,
|
||||
dtype,
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
"Dynamic rope scaling must contain either 'alpha' or 'factor' field"
|
||||
)
|
||||
elif scaling_type == "xdrope":
|
||||
scaling_alpha = rope_parameters["alpha"]
|
||||
rotary_emb = XDRotaryEmbedding(
|
||||
head_size,
|
||||
rotary_dim,
|
||||
max_position,
|
||||
base,
|
||||
is_neox_style,
|
||||
scaling_alpha,
|
||||
dtype,
|
||||
xdrope_section=rope_parameters["xdrope_section"],
|
||||
)
|
||||
elif scaling_type == "yarn":
|
||||
scaling_factor = rope_parameters["factor"]
|
||||
original_max_position = rope_parameters["original_max_position_embeddings"]
|
||||
extra_kwargs = {
|
||||
k: v
|
||||
for k, v in rope_parameters.items()
|
||||
if k
|
||||
in (
|
||||
"extrapolation_factor",
|
||||
"attn_factor",
|
||||
"beta_fast",
|
||||
"beta_slow",
|
||||
"apply_yarn_scaling",
|
||||
"truncate",
|
||||
)
|
||||
}
|
||||
if "mrope_section" in rope_parameters:
|
||||
extra_kwargs.pop("apply_yarn_scaling", None)
|
||||
rotary_emb = MRotaryEmbedding(
|
||||
head_size,
|
||||
rotary_dim,
|
||||
original_max_position,
|
||||
base,
|
||||
is_neox_style,
|
||||
dtype,
|
||||
mrope_section=rope_parameters["mrope_section"],
|
||||
mrope_interleaved=rope_parameters.get("mrope_interleaved", False),
|
||||
scaling_factor=scaling_factor,
|
||||
**extra_kwargs,
|
||||
)
|
||||
else:
|
||||
rotary_emb = YaRNScalingRotaryEmbedding(
|
||||
head_size,
|
||||
rotary_dim,
|
||||
original_max_position,
|
||||
base,
|
||||
is_neox_style,
|
||||
scaling_factor,
|
||||
dtype,
|
||||
**extra_kwargs,
|
||||
)
|
||||
elif scaling_type in ["deepseek_yarn", "deepseek_llama_scaling"]:
|
||||
scaling_factor = rope_parameters["factor"]
|
||||
original_max_position = rope_parameters["original_max_position_embeddings"]
|
||||
# assert max_position == original_max_position * scaling_factor
|
||||
extra_kwargs = {
|
||||
k: v
|
||||
for k, v in rope_parameters.items()
|
||||
if k
|
||||
in (
|
||||
"extrapolation_factor",
|
||||
"attn_factor",
|
||||
"beta_fast",
|
||||
"beta_slow",
|
||||
"mscale",
|
||||
"mscale_all_dim",
|
||||
)
|
||||
}
|
||||
rotary_emb = DeepseekScalingRotaryEmbedding(
|
||||
head_size,
|
||||
rotary_dim,
|
||||
original_max_position,
|
||||
base,
|
||||
is_neox_style,
|
||||
scaling_factor,
|
||||
dtype,
|
||||
**extra_kwargs,
|
||||
)
|
||||
elif scaling_type == "longrope":
|
||||
short_factor = rope_parameters["short_factor"]
|
||||
long_factor = rope_parameters["long_factor"]
|
||||
original_max_position = rope_parameters["original_max_position_embeddings"]
|
||||
extra_kwargs = {
|
||||
k: v
|
||||
for k, v in rope_parameters.items()
|
||||
if k in ("short_mscale", "long_mscale")
|
||||
}
|
||||
rotary_emb = Phi3LongRoPEScaledRotaryEmbedding(
|
||||
head_size,
|
||||
rotary_dim,
|
||||
max_position,
|
||||
original_max_position,
|
||||
base,
|
||||
is_neox_style,
|
||||
dtype,
|
||||
short_factor,
|
||||
long_factor,
|
||||
**extra_kwargs,
|
||||
)
|
||||
elif scaling_type == "openpangu":
|
||||
mrope_interleaved = rope_parameters.get("mrope_interleaved", False)
|
||||
if "mrope_section" in rope_parameters and mrope_interleaved:
|
||||
rotary_emb = MRotaryEmbeddingInterleaved(
|
||||
head_size,
|
||||
rotary_dim,
|
||||
max_position,
|
||||
base,
|
||||
is_neox_style,
|
||||
dtype,
|
||||
mrope_section=rope_parameters["mrope_section"],
|
||||
mrope_interleaved=mrope_interleaved,
|
||||
)
|
||||
else:
|
||||
raise ValueError("Pangu mrope lacks necessary parameters.")
|
||||
else:
|
||||
raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
|
||||
_ROPE_DICT[key] = rotary_emb
|
||||
return rotary_emb
|
||||
303
vllm/model_executor/layers/rotary_embedding/base.py
Normal file
303
vllm/model_executor/layers/rotary_embedding/base.py
Normal file
@@ -0,0 +1,303 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Rotary Positional Embeddings Base Class."""
|
||||
|
||||
import torch
|
||||
|
||||
from vllm._aiter_ops import rocm_aiter_ops
|
||||
from vllm.model_executor.custom_op import CustomOp
|
||||
|
||||
from .common import ApplyRotaryEmb
|
||||
|
||||
|
||||
# --8<-- [start:rotary_embedding]
|
||||
@CustomOp.register("rotary_embedding")
|
||||
class RotaryEmbeddingBase(CustomOp):
|
||||
"""Original rotary positional embedding."""
|
||||
|
||||
# --8<-- [end:rotary_embedding]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
head_size: int,
|
||||
rotary_dim: int,
|
||||
max_position_embeddings: int,
|
||||
base: float,
|
||||
is_neox_style: bool,
|
||||
dtype: torch.dtype,
|
||||
init_cache: bool = True,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self.head_size = head_size
|
||||
self.rotary_dim = rotary_dim
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.base = base
|
||||
self.is_neox_style = is_neox_style
|
||||
self.dtype = dtype
|
||||
# TODO(mgoin): disabled for now due to failures
|
||||
# Flashinfer only supports head_size=64, 128, 256, 512.
|
||||
# https://github.com/flashinfer-ai/flashinfer/blob/ebfd655efe830048dba5d582aaa61d61d1cf9a87/include/flashinfer/utils.cuh#L174-L202
|
||||
# self.use_flashinfer = (self.enabled()
|
||||
# and dtype in (torch.float16, torch.bfloat16)
|
||||
# and current_platform.is_cuda()
|
||||
# and has_flashinfer()
|
||||
# and self.head_size in [64, 128, 256, 512])
|
||||
|
||||
# Check if use_flashinfer is already set
|
||||
if not hasattr(self, "use_flashinfer"):
|
||||
self.use_flashinfer = False
|
||||
|
||||
self.use_aiter = (
|
||||
self.enabled() and rocm_aiter_ops.is_triton_rotary_embed_enabled()
|
||||
)
|
||||
if self.use_aiter:
|
||||
self.rocm_aiter_triton_rotary_embedding = (
|
||||
rocm_aiter_ops.get_triton_rotary_embedding_op()
|
||||
)
|
||||
|
||||
if init_cache:
|
||||
cache = self._compute_cos_sin_cache()
|
||||
if not self.use_flashinfer:
|
||||
cache = cache.to(dtype)
|
||||
self.cos_sin_cache: torch.Tensor
|
||||
self.register_buffer("cos_sin_cache", cache, persistent=False)
|
||||
|
||||
self.apply_rotary_emb = ApplyRotaryEmb(
|
||||
is_neox_style=self.is_neox_style,
|
||||
)
|
||||
|
||||
def _compute_inv_freq(self, base: float) -> torch.Tensor:
|
||||
"""Compute the inverse frequency."""
|
||||
# NOTE(woosuk): To exactly match the HF implementation, we need to
|
||||
# use CPU to compute the cache and then move it to GPU. However, we
|
||||
# create the cache on GPU for faster initialization. This may cause
|
||||
# a slight numerical difference between the HF implementation and ours.
|
||||
inv_freq = 1.0 / (
|
||||
base
|
||||
** (
|
||||
torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim
|
||||
)
|
||||
)
|
||||
return inv_freq
|
||||
|
||||
def _compute_cos_sin_cache(self) -> torch.Tensor:
|
||||
"""Compute the cos and sin cache."""
|
||||
inv_freq = self._compute_inv_freq(self.base)
|
||||
t = torch.arange(self.max_position_embeddings, dtype=torch.float)
|
||||
|
||||
freqs = torch.einsum("i,j -> ij", t, inv_freq)
|
||||
cos = freqs.cos()
|
||||
sin = freqs.sin()
|
||||
cache = torch.cat((cos, sin), dim=-1)
|
||||
return cache
|
||||
|
||||
def _match_cos_sin_cache_dtype(self, query: torch.Tensor) -> torch.Tensor:
|
||||
# __setattr__ in nn.Module (called by `self.cos_sin_cache = ...`)
|
||||
# is expensive, so avoid calling it if possible
|
||||
cos_sin_cache = self.cos_sin_cache
|
||||
if (
|
||||
cos_sin_cache.device == query.device
|
||||
and self.cos_sin_cache.dtype == query.dtype
|
||||
):
|
||||
return cos_sin_cache
|
||||
|
||||
cos_sin_cache = cos_sin_cache.to(query.device, dtype=query.dtype)
|
||||
# Avoid mutating buffers during torch.compile (cudagraph) tracing.
|
||||
if torch.compiler.is_compiling():
|
||||
return cos_sin_cache
|
||||
|
||||
self.cos_sin_cache = cos_sin_cache
|
||||
return cos_sin_cache
|
||||
|
||||
def get_cos_sin(self, seqlen: int) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
cos_sin = self.cos_sin_cache[:seqlen]
|
||||
cos, sin = cos_sin.chunk(2, dim=-1)
|
||||
return cos, sin
|
||||
|
||||
|
||||
class RotaryEmbedding(RotaryEmbeddingBase):
|
||||
def __init__(
|
||||
self,
|
||||
head_size: int,
|
||||
rotary_dim: int,
|
||||
max_position_embeddings: int,
|
||||
base: float,
|
||||
is_neox_style: bool,
|
||||
dtype: torch.dtype,
|
||||
init_cache: bool = True,
|
||||
) -> None:
|
||||
super().__init__(
|
||||
head_size=head_size,
|
||||
rotary_dim=rotary_dim,
|
||||
max_position_embeddings=max_position_embeddings,
|
||||
base=base,
|
||||
is_neox_style=is_neox_style,
|
||||
dtype=dtype,
|
||||
init_cache=init_cache,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def forward_static(
|
||||
positions: torch.Tensor,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor | None,
|
||||
head_size: int,
|
||||
rotary_dim: int,
|
||||
cos_sin_cache: torch.Tensor,
|
||||
is_neox_style: bool,
|
||||
) -> tuple[torch.Tensor, torch.Tensor | None]:
|
||||
"""A PyTorch-native implementation of forward()."""
|
||||
positions = positions.flatten()
|
||||
num_tokens = positions.shape[0]
|
||||
cos_sin = cos_sin_cache.index_select(0, positions)
|
||||
cos, sin = cos_sin.chunk(2, dim=-1)
|
||||
|
||||
query_shape = query.shape
|
||||
query = query.view(num_tokens, -1, head_size)
|
||||
query_rot = query[..., :rotary_dim]
|
||||
query_pass = query[..., rotary_dim:]
|
||||
query_rot = ApplyRotaryEmb.forward_static(
|
||||
query_rot,
|
||||
cos,
|
||||
sin,
|
||||
is_neox_style,
|
||||
)
|
||||
query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
|
||||
|
||||
# key may be None in some cases, e.g. cross-layer KV sharing
|
||||
if key is not None:
|
||||
key_shape = key.shape
|
||||
key = key.view(num_tokens, -1, head_size)
|
||||
key_rot = key[..., :rotary_dim]
|
||||
key_pass = key[..., rotary_dim:]
|
||||
key_rot = ApplyRotaryEmb.forward_static(
|
||||
key_rot,
|
||||
cos,
|
||||
sin,
|
||||
is_neox_style,
|
||||
)
|
||||
key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
|
||||
return query, key
|
||||
|
||||
def forward_native(
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor | None = None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor | None]:
|
||||
"""A PyTorch-native implementation of forward()."""
|
||||
cos_sin_cache = self._match_cos_sin_cache_dtype(query)
|
||||
return self.forward_static(
|
||||
positions,
|
||||
query,
|
||||
key,
|
||||
self.head_size,
|
||||
self.rotary_dim,
|
||||
cos_sin_cache,
|
||||
self.is_neox_style,
|
||||
)
|
||||
|
||||
def forward_cuda(
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor | None = None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor | None]:
|
||||
if self.use_flashinfer:
|
||||
torch.ops.vllm.flashinfer_rotary_embedding(
|
||||
positions,
|
||||
query,
|
||||
key,
|
||||
self.head_size,
|
||||
self.cos_sin_cache,
|
||||
self.is_neox_style,
|
||||
)
|
||||
return query, key
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
|
||||
cos_sin_cache = self._match_cos_sin_cache_dtype(query)
|
||||
|
||||
# ops.rotary_embedding() is an in-place operation
|
||||
# that updates the query and key tensors.
|
||||
ops.rotary_embedding(
|
||||
positions,
|
||||
query,
|
||||
key,
|
||||
self.head_size,
|
||||
cos_sin_cache,
|
||||
self.is_neox_style,
|
||||
)
|
||||
return query, key
|
||||
|
||||
def forward_hip(
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor | None = None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor | None]:
|
||||
if self.use_aiter:
|
||||
cos_sin_cache = self._match_cos_sin_cache_dtype(query)
|
||||
self.rocm_aiter_triton_rotary_embedding(
|
||||
positions,
|
||||
query,
|
||||
key,
|
||||
self.head_size,
|
||||
cos_sin_cache,
|
||||
self.is_neox_style,
|
||||
)
|
||||
return query, key
|
||||
return self.forward_cuda(positions, query, key)
|
||||
|
||||
def forward_xpu(
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor | None = None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor | None]:
|
||||
self._match_cos_sin_cache_dtype(query)
|
||||
# ops.rotary_embedding() is an in-place operation
|
||||
# that updates the query and key tensors.
|
||||
if key is None:
|
||||
return self.forward_native(positions, query, key)
|
||||
else:
|
||||
from vllm import _custom_ops as ops
|
||||
|
||||
cos_sin_cache = self._match_cos_sin_cache_dtype(query)
|
||||
ops.rotary_embedding(
|
||||
positions,
|
||||
query,
|
||||
key,
|
||||
self.head_size,
|
||||
cos_sin_cache,
|
||||
self.is_neox_style,
|
||||
)
|
||||
return query, key
|
||||
|
||||
def forward_cpu(
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor | None = None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor | None]:
|
||||
from vllm import _custom_ops as ops
|
||||
|
||||
cos_sin_cache = self._match_cos_sin_cache_dtype(query)
|
||||
|
||||
# ops.rotary_embedding() is an in-place operation
|
||||
# that updates the query and key tensors.
|
||||
ops.rotary_embedding(
|
||||
positions,
|
||||
query,
|
||||
key,
|
||||
self.head_size,
|
||||
cos_sin_cache,
|
||||
self.is_neox_style,
|
||||
)
|
||||
return query, key
|
||||
|
||||
def extra_repr(self) -> str:
|
||||
s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}"
|
||||
s += f", max_position_embeddings={self.max_position_embeddings}"
|
||||
s += f", base={self.base}, is_neox_style={self.is_neox_style}"
|
||||
return s
|
||||
289
vllm/model_executor/layers/rotary_embedding/common.py
Normal file
289
vllm/model_executor/layers/rotary_embedding/common.py
Normal file
@@ -0,0 +1,289 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import math
|
||||
from importlib.util import find_spec
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.custom_op import CustomOp
|
||||
from vllm.utils.torch_utils import direct_register_custom_op
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
# common functions
|
||||
def rotate_neox(x: torch.Tensor) -> torch.Tensor:
|
||||
x1 = x[..., : x.shape[-1] // 2]
|
||||
x2 = x[..., x.shape[-1] // 2 :]
|
||||
return torch.cat((-x2, x1), dim=-1)
|
||||
|
||||
|
||||
def rotate_gptj(x: torch.Tensor) -> torch.Tensor:
|
||||
x1 = x[..., ::2]
|
||||
x2 = x[..., 1::2]
|
||||
x = torch.stack((-x2, x1), dim=-1)
|
||||
return x.flatten(-2)
|
||||
|
||||
|
||||
# yarn functions
|
||||
# Inverse dim formula to find dim based on number of rotations
|
||||
def yarn_find_correction_dim(
|
||||
num_rotations: int,
|
||||
dim: int,
|
||||
base: float = 10000,
|
||||
max_position_embeddings: int = 2048,
|
||||
) -> float:
|
||||
return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (
|
||||
2 * math.log(base)
|
||||
)
|
||||
|
||||
|
||||
# Find dim range bounds based on rotations
|
||||
def yarn_find_correction_range(
|
||||
low_rot: int,
|
||||
high_rot: int,
|
||||
dim: int,
|
||||
base: float = 10000,
|
||||
max_position_embeddings: int = 2048,
|
||||
truncate: bool = True,
|
||||
) -> tuple[float | int, float | int]:
|
||||
low = yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)
|
||||
high = yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings)
|
||||
if truncate:
|
||||
low = math.floor(low)
|
||||
high = math.ceil(high)
|
||||
return max(low, 0), min(high, dim - 1) # Clamp values just in case
|
||||
|
||||
|
||||
def yarn_linear_ramp_mask(
|
||||
low: float, high: float, dim: int, dtype: torch.dtype
|
||||
) -> torch.Tensor:
|
||||
if low == high:
|
||||
high += 0.001 # Prevent singularity
|
||||
|
||||
linear_func = (torch.arange(dim, dtype=dtype) - low) / (high - low)
|
||||
ramp_func = torch.clamp(linear_func, 0, 1)
|
||||
return ramp_func
|
||||
|
||||
|
||||
def yarn_get_mscale(scale: float = 1) -> float:
|
||||
if scale <= 1:
|
||||
return 1.0
|
||||
return 0.1 * math.log(scale) + 1.0
|
||||
|
||||
|
||||
def _flashinfer_rotary_embedding(
|
||||
positions: torch.Tensor,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
head_size: int,
|
||||
cos_sin_cache: torch.Tensor,
|
||||
is_neox: bool,
|
||||
) -> None:
|
||||
"""Custom op wrapper for flashinfer's rotary embedding.
|
||||
|
||||
This is an in-place operation that modifies query and key tensors directly.
|
||||
"""
|
||||
from flashinfer.rope import apply_rope_with_cos_sin_cache_inplace
|
||||
|
||||
apply_rope_with_cos_sin_cache_inplace(
|
||||
positions=positions,
|
||||
query=query,
|
||||
key=key,
|
||||
head_size=head_size,
|
||||
cos_sin_cache=cos_sin_cache,
|
||||
is_neox=is_neox,
|
||||
)
|
||||
|
||||
|
||||
def _flashinfer_rotary_embedding_fake(
|
||||
positions: torch.Tensor,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
head_size: int,
|
||||
cos_sin_cache: torch.Tensor,
|
||||
is_neox: bool,
|
||||
) -> None:
|
||||
return
|
||||
|
||||
|
||||
# Register flashinfer rotary embedding custom op
|
||||
direct_register_custom_op(
|
||||
op_name="flashinfer_rotary_embedding",
|
||||
op_func=_flashinfer_rotary_embedding,
|
||||
mutates_args=["query", "key"], # These tensors are modified in-place
|
||||
fake_impl=_flashinfer_rotary_embedding_fake,
|
||||
)
|
||||
|
||||
|
||||
# --8<-- [start:apply_rotary_emb]
|
||||
@CustomOp.register("apply_rotary_emb")
|
||||
class ApplyRotaryEmb(CustomOp):
|
||||
# --8<-- [end:apply_rotary_emb]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
enforce_enable: bool = False,
|
||||
is_neox_style: bool = True,
|
||||
enable_fp32_compute: bool = False,
|
||||
) -> None:
|
||||
super().__init__(enforce_enable=enforce_enable)
|
||||
self.is_neox_style = is_neox_style
|
||||
self.enable_fp32_compute = enable_fp32_compute
|
||||
|
||||
self.apply_rotary_emb_flash_attn = None
|
||||
if find_spec("flash_attn") is not None:
|
||||
from flash_attn.ops.triton.rotary import apply_rotary
|
||||
|
||||
self.apply_rotary_emb_flash_attn = apply_rotary
|
||||
|
||||
@staticmethod
|
||||
def forward_static(
|
||||
x: torch.Tensor,
|
||||
cos: torch.Tensor,
|
||||
sin: torch.Tensor,
|
||||
is_neox_style: bool = True,
|
||||
enable_fp32_compute: bool = False,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Args:
|
||||
x: [batch_size (optional), seq_len, num_heads, head_size]
|
||||
cos: [seq_len, head_size // 2]
|
||||
sin: [seq_len, head_size // 2]
|
||||
is_neox_style: Whether to use the Neox-style or GPT-J-style.
|
||||
enable_fp32_compute: Temporarily convert x, cos, sin to FP32 dtype
|
||||
for higher accuracy.
|
||||
"""
|
||||
origin_dtype = x.dtype
|
||||
if enable_fp32_compute:
|
||||
x = x.float()
|
||||
|
||||
cos = cos.unsqueeze(-2).to(x.dtype)
|
||||
sin = sin.unsqueeze(-2).to(x.dtype)
|
||||
|
||||
if is_neox_style:
|
||||
x1, x2 = torch.chunk(x, 2, dim=-1)
|
||||
else:
|
||||
x1 = x[..., ::2]
|
||||
x2 = x[..., 1::2]
|
||||
|
||||
o1 = x1 * cos - x2 * sin
|
||||
o2 = x2 * cos + x1 * sin
|
||||
|
||||
if is_neox_style:
|
||||
output = torch.cat((o1, o2), dim=-1)
|
||||
else:
|
||||
output = torch.stack((o1, o2), dim=-1).flatten(-2)
|
||||
|
||||
if enable_fp32_compute:
|
||||
output = output.to(origin_dtype)
|
||||
return output
|
||||
|
||||
def _pre_process(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
cos: torch.Tensor,
|
||||
sin: torch.Tensor,
|
||||
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Size, torch.dtype]:
|
||||
origin_shape = x.shape
|
||||
if len(origin_shape) == 3:
|
||||
# x: [seq_len, num_heads, head_size]
|
||||
x = x.unsqueeze(0)
|
||||
|
||||
origin_dtype = x.dtype
|
||||
if self.enable_fp32_compute:
|
||||
x = x.float()
|
||||
cos = cos.float()
|
||||
sin = sin.float()
|
||||
|
||||
return x, cos, sin, origin_shape, origin_dtype
|
||||
|
||||
def _post_process(
|
||||
self,
|
||||
output: torch.Tensor,
|
||||
origin_shape: torch.Size,
|
||||
origin_dtype: torch.dtype,
|
||||
) -> torch.Tensor:
|
||||
if len(origin_shape) == 3:
|
||||
output = output.squeeze(0)
|
||||
if self.enable_fp32_compute:
|
||||
output = output.to(origin_dtype)
|
||||
return output
|
||||
|
||||
def forward_native(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
cos: torch.Tensor,
|
||||
sin: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
output = self.forward_static(
|
||||
x, cos, sin, self.is_neox_style, self.enable_fp32_compute
|
||||
)
|
||||
return output
|
||||
|
||||
def forward_cuda(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
cos: torch.Tensor,
|
||||
sin: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
# from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb
|
||||
return self.forward_native(x, cos, sin)
|
||||
x, cos, sin, origin_shape, origin_dtype = self._pre_process(x, cos, sin)
|
||||
|
||||
"""
|
||||
Arguments of apply_rotary_emb() in vllm_flash_attn:
|
||||
x: [batch_size, seq_len, nheads, headdim]
|
||||
cos, sin: [seqlen_rotary, rotary_dim / 2]
|
||||
interleaved: defalut as False (Neox-style).
|
||||
...
|
||||
"""
|
||||
interleaved = not self.is_neox_style
|
||||
output = apply_rotary_emb(x, cos, sin, interleaved)
|
||||
|
||||
output = self._post_process(output, origin_shape, origin_dtype)
|
||||
return output
|
||||
|
||||
def forward_hip(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
cos: torch.Tensor,
|
||||
sin: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
if self.apply_rotary_emb_flash_attn is not None:
|
||||
x, cos, sin, origin_shape, origin_dtype = self._pre_process(x, cos, sin)
|
||||
|
||||
"""
|
||||
Arguments of apply_rotary() in flash_attn:
|
||||
x: [batch_size, seq_len, nheads, headdim]
|
||||
cos, sin: [seqlen_rotary, rotary_dim / 2]
|
||||
interleaved: defalut as False (Neox-style).
|
||||
...
|
||||
"""
|
||||
interleaved = not self.is_neox_style
|
||||
output = self.apply_rotary_emb_flash_attn(
|
||||
x, cos, sin, interleaved=interleaved
|
||||
).type_as(x)
|
||||
|
||||
output = self._post_process(output, origin_shape, origin_dtype)
|
||||
else:
|
||||
# Falling back to PyTorch native implementation.
|
||||
output = self.forward_native(x, cos, sin)
|
||||
|
||||
return output
|
||||
|
||||
def forward_cpu(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
cos: torch.Tensor,
|
||||
sin: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
# TODO (bigPYJ1151): need to enable fused CPU ROPE here
|
||||
return self.forward_native(x, cos, sin)
|
||||
|
||||
def extra_repr(self) -> str:
|
||||
s = f"is_neox_style={self.is_neox_style}"
|
||||
s += f", enable_fp32_compute={self.enable_fp32_compute}"
|
||||
return s
|
||||
@@ -0,0 +1,182 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import math
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.flashinfer import has_flashinfer
|
||||
|
||||
from .base import RotaryEmbeddingBase
|
||||
from .common import (
|
||||
rotate_gptj,
|
||||
rotate_neox,
|
||||
yarn_find_correction_range,
|
||||
yarn_linear_ramp_mask,
|
||||
)
|
||||
|
||||
|
||||
def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
|
||||
if scale <= 1:
|
||||
return 1.0
|
||||
return 0.1 * mscale * math.log(scale) + 1.0
|
||||
|
||||
|
||||
class DeepseekScalingRotaryEmbedding(RotaryEmbeddingBase):
|
||||
"""RotaryEmbedding extended with YaRN method.
|
||||
|
||||
Credits to Peng et al. github.com/jquesnelle/yarn
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
head_size: int,
|
||||
rotary_dim: int,
|
||||
max_position_embeddings: int,
|
||||
base: float,
|
||||
is_neox_style: bool,
|
||||
scaling_factor: float,
|
||||
dtype: torch.dtype,
|
||||
*,
|
||||
extrapolation_factor: float = 1,
|
||||
attn_factor: float = 1,
|
||||
beta_fast: int = 32,
|
||||
beta_slow: int = 1,
|
||||
mscale: float = 1,
|
||||
mscale_all_dim: float = 0,
|
||||
) -> None:
|
||||
self.scaling_factor = scaling_factor
|
||||
self.extrapolation_factor = extrapolation_factor
|
||||
self.attn_factor = attn_factor
|
||||
self.beta_fast = beta_fast
|
||||
self.beta_slow = beta_slow
|
||||
# Get n-d magnitude scaling corrected for interpolation.
|
||||
self.mscale = float(
|
||||
yarn_get_mscale(self.scaling_factor, float(mscale))
|
||||
/ yarn_get_mscale(self.scaling_factor, float(mscale_all_dim))
|
||||
* attn_factor
|
||||
)
|
||||
self.use_flashinfer = (
|
||||
self.enabled()
|
||||
and dtype in (torch.float16, torch.bfloat16)
|
||||
and current_platform.is_cuda()
|
||||
and has_flashinfer()
|
||||
and head_size in [64, 128, 256, 512]
|
||||
)
|
||||
super().__init__(
|
||||
head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
|
||||
)
|
||||
|
||||
def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor:
|
||||
pos_freqs = self.base ** (
|
||||
torch.arange(
|
||||
0,
|
||||
self.rotary_dim,
|
||||
2,
|
||||
dtype=torch.float,
|
||||
)
|
||||
/ self.rotary_dim
|
||||
)
|
||||
inv_freq_extrapolation = 1.0 / pos_freqs
|
||||
inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs)
|
||||
|
||||
low, high = yarn_find_correction_range(
|
||||
self.beta_fast,
|
||||
self.beta_slow,
|
||||
self.rotary_dim,
|
||||
self.base,
|
||||
self.max_position_embeddings,
|
||||
)
|
||||
# Get n-d rotational scaling corrected for extrapolation
|
||||
inv_freq_mask = (
|
||||
1
|
||||
- yarn_linear_ramp_mask(low, high, self.rotary_dim // 2, dtype=torch.float)
|
||||
) * self.extrapolation_factor
|
||||
inv_freq = (
|
||||
inv_freq_interpolation * (1 - inv_freq_mask)
|
||||
+ inv_freq_extrapolation * inv_freq_mask
|
||||
)
|
||||
return inv_freq
|
||||
|
||||
def _compute_cos_sin_cache(self) -> torch.Tensor:
|
||||
inv_freq = self._compute_inv_freq(self.scaling_factor)
|
||||
t = torch.arange(
|
||||
self.max_position_embeddings * self.scaling_factor,
|
||||
dtype=torch.float32,
|
||||
)
|
||||
freqs = torch.einsum("i,j -> ij", t, inv_freq)
|
||||
cos = freqs.cos() * self.mscale
|
||||
sin = freqs.sin() * self.mscale
|
||||
cache = torch.cat((cos, sin), dim=-1)
|
||||
return cache
|
||||
|
||||
def forward_native(
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor | None = None,
|
||||
offsets: torch.Tensor | None = None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor | None]:
|
||||
"""PyTorch-native implementation equivalent to forward()."""
|
||||
assert key is not None
|
||||
cos_sin_cache = self._match_cos_sin_cache_dtype(query)
|
||||
query_rot = query[..., : self.rotary_dim]
|
||||
key_rot = key[..., : self.rotary_dim]
|
||||
if self.rotary_dim < self.head_size:
|
||||
query_pass = query[..., self.rotary_dim :]
|
||||
key_pass = key[..., self.rotary_dim :]
|
||||
|
||||
cos_sin = cos_sin_cache[
|
||||
torch.add(positions, offsets) if offsets is not None else positions
|
||||
]
|
||||
cos, sin = cos_sin.chunk(2, dim=-1)
|
||||
if self.is_neox_style:
|
||||
# NOTE(woosuk): Here we assume that the positions tensor has the
|
||||
# shape [batch_size, seq_len].
|
||||
cos = cos.repeat(1, 1, 2).unsqueeze(-2)
|
||||
sin = sin.repeat(1, 1, 2).unsqueeze(-2)
|
||||
else:
|
||||
cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
|
||||
sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)
|
||||
|
||||
rotate_fn = rotate_neox if self.is_neox_style else rotate_gptj
|
||||
query_rot = query_rot * cos + rotate_fn(query_rot) * sin
|
||||
key_rot = key_rot * cos + rotate_fn(key_rot) * sin
|
||||
|
||||
if self.rotary_dim < self.head_size:
|
||||
query = torch.cat((query_rot, query_pass), dim=-1)
|
||||
key = torch.cat((key_rot, key_pass), dim=-1)
|
||||
else:
|
||||
query = query_rot
|
||||
key = key_rot
|
||||
return query, key
|
||||
|
||||
def forward_hip(
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor | None = None,
|
||||
offsets: torch.Tensor | None = None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor | None]:
|
||||
return self.forward_native(positions, query, key, offsets)
|
||||
|
||||
def forward_cuda(
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor | None = None,
|
||||
offsets: torch.Tensor | None = None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor | None]:
|
||||
if self.use_flashinfer:
|
||||
torch.ops.vllm.flashinfer_rotary_embedding(
|
||||
torch.add(positions, offsets) if offsets is not None else positions,
|
||||
query,
|
||||
key,
|
||||
self.head_size,
|
||||
self.cos_sin_cache,
|
||||
self.is_neox_style,
|
||||
)
|
||||
return query, key
|
||||
else:
|
||||
return self.forward_native(positions, query, key, offsets)
|
||||
218
vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py
Normal file
218
vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py
Normal file
@@ -0,0 +1,218 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.custom_op import CustomOp
|
||||
|
||||
from .common import rotate_gptj, rotate_neox
|
||||
|
||||
|
||||
# --8<-- [start:dual_chunk_rotary_embedding]
|
||||
@CustomOp.register("dual_chunk_rotary_embedding")
|
||||
class DualChunkRotaryEmbedding(CustomOp):
|
||||
"""Rotary positional embedding for Dual Chunk Attention."""
|
||||
|
||||
# --8<-- [end:dual_chunk_rotary_embedding]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
head_size: int,
|
||||
rotary_dim: int,
|
||||
max_position_embeddings: int,
|
||||
base: float,
|
||||
is_neox_style: bool,
|
||||
dtype: torch.dtype,
|
||||
chunk_size: int,
|
||||
local_size: int,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self.head_size = head_size
|
||||
self.rotary_dim = rotary_dim
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.base = base
|
||||
self.is_neox_style = is_neox_style
|
||||
self.chunk_size = chunk_size
|
||||
self.local_size = local_size
|
||||
self.dtype = dtype
|
||||
self.device = torch.device(f"cuda:{torch.cuda.current_device()}")
|
||||
(q_cache, qc_cache, k_cache, qc_no_clamp_cache, q_inter_cache) = (
|
||||
self._compute_cos_sin_cache()
|
||||
)
|
||||
|
||||
self.register_buffer("cos_sin_q_cache", q_cache, persistent=False)
|
||||
self.register_buffer("cos_sin_qc_cache", qc_cache, persistent=False)
|
||||
self.register_buffer("cos_sin_k_cache", k_cache, persistent=False)
|
||||
self.register_buffer(
|
||||
"cos_sin_qc_no_clamp_cache", qc_no_clamp_cache, persistent=False
|
||||
)
|
||||
self.register_buffer("cos_sin_q_inter_cache", q_inter_cache, persistent=False)
|
||||
|
||||
def _compute_inv_freq(self, base: float) -> torch.Tensor:
|
||||
"""Compute the inverse frequency."""
|
||||
# NOTE(woosuk): The HF implementation uses `torch.arange(...).float()`.
|
||||
# However, we use `torch.arange(..., dtype=torch.float)` instead to
|
||||
# avoid numerical issues with large base values (e.g., 10000000).
|
||||
# This may cause a slight numerical difference between the HF
|
||||
# implementation and ours.
|
||||
# NOTE(woosuk): To exactly match the HF implementation, we need to
|
||||
# use CPU to compute the cache and then move it to GPU. However, we
|
||||
# create the cache on GPU for faster initialization. This may cause
|
||||
# a slight numerical difference between the HF implementation and ours.
|
||||
inv_freq = 1.0 / (
|
||||
base
|
||||
** (
|
||||
torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim
|
||||
)
|
||||
)
|
||||
return inv_freq
|
||||
|
||||
def _compute_cos_sin_cache(self) -> torch.Tensor:
|
||||
"""Compute the cos and sin cache."""
|
||||
inv_freq = self._compute_inv_freq(self.base)
|
||||
chunk_len = self.chunk_size - self.local_size
|
||||
q_t = torch.arange(chunk_len, dtype=torch.float)
|
||||
qc_t = (torch.arange(chunk_len, dtype=torch.float) + chunk_len).clamp(
|
||||
max=self.chunk_size
|
||||
)
|
||||
k_t = torch.arange(self.max_position_embeddings, dtype=torch.float) % chunk_len
|
||||
|
||||
# count from chunk_len, no clamp(self.chunk_size) restriction
|
||||
qc_no_clamp_t = torch.arange(chunk_len, dtype=torch.float) + chunk_len
|
||||
# count from self.chunk_size for q_inter's rope
|
||||
q_inter_t = torch.arange(chunk_len, dtype=torch.float) + self.chunk_size
|
||||
|
||||
q_freqs = torch.outer(q_t, inv_freq)
|
||||
qc_freqs = torch.outer(qc_t, inv_freq)
|
||||
k_freqs = torch.outer(k_t, inv_freq)
|
||||
qc_no_clamp_freqs = torch.outer(qc_no_clamp_t, inv_freq)
|
||||
q_inter_freqs = torch.outer(q_inter_t, inv_freq)
|
||||
|
||||
q_cos = q_freqs.cos()
|
||||
q_sin = q_freqs.sin()
|
||||
qc_cos = qc_freqs.cos()
|
||||
qc_sin = qc_freqs.sin()
|
||||
k_cos = k_freqs.cos()
|
||||
k_sin = k_freqs.sin()
|
||||
|
||||
qc_no_clamp_cos = qc_no_clamp_freqs.cos()
|
||||
qc_no_clamp_sin = qc_no_clamp_freqs.sin()
|
||||
q_inter_cos = q_inter_freqs.cos()
|
||||
q_inter_sin = q_inter_freqs.sin()
|
||||
|
||||
q_cache = torch.cat((q_cos, q_sin), dim=-1).to(
|
||||
dtype=self.dtype, device=self.device
|
||||
)
|
||||
qc_cache = torch.cat((qc_cos, qc_sin), dim=-1).to(
|
||||
dtype=self.dtype, device=self.device
|
||||
)
|
||||
k_cache = torch.cat((k_cos, k_sin), dim=-1).to(
|
||||
dtype=self.dtype, device=self.device
|
||||
)
|
||||
qc_no_clamp_cache = torch.cat((qc_no_clamp_cos, qc_no_clamp_sin), dim=-1).to(
|
||||
dtype=self.dtype, device=self.device
|
||||
)
|
||||
q_inter_cache = torch.cat((q_inter_cos, q_inter_sin), dim=-1).to(
|
||||
dtype=self.dtype, device=self.device
|
||||
)
|
||||
return q_cache, qc_cache, k_cache, qc_no_clamp_cache, q_inter_cache
|
||||
|
||||
def forward_native(
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
offsets: torch.Tensor | None = None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
query = query.view(*query.shape[:-1], -1, self.head_size)
|
||||
key = key.view(*key.shape[:-1], -1, self.head_size)
|
||||
query_rot = query[..., : self.rotary_dim]
|
||||
key_rot = key[..., : self.rotary_dim]
|
||||
if self.rotary_dim < self.head_size:
|
||||
query_pass = query[..., self.rotary_dim :]
|
||||
key_pass = key[..., self.rotary_dim :]
|
||||
else:
|
||||
query_pass = None
|
||||
key_pass = None
|
||||
|
||||
positions_with_offsets = (
|
||||
torch.add(positions, offsets) if offsets is not None else positions
|
||||
)
|
||||
key = self._apply_rotary_embedding(
|
||||
self.cos_sin_k_cache[positions_with_offsets], key_rot, key_pass
|
||||
)
|
||||
chunk_len = self.chunk_size - self.local_size
|
||||
query = self._apply_rotary_embedding(
|
||||
self.cos_sin_q_cache[positions_with_offsets % chunk_len],
|
||||
query_rot,
|
||||
query_pass,
|
||||
)
|
||||
query_succ = self._apply_rotary_embedding(
|
||||
self.cos_sin_qc_cache[positions_with_offsets % chunk_len],
|
||||
query_rot,
|
||||
query_pass,
|
||||
)
|
||||
query_inter = self._apply_rotary_embedding(
|
||||
self.cos_sin_qc_cache[chunk_len - 1].repeat(positions.shape[0], 1),
|
||||
query_rot,
|
||||
query_pass,
|
||||
)
|
||||
query_succ_critical = self._apply_rotary_embedding(
|
||||
self.cos_sin_qc_no_clamp_cache[positions_with_offsets % chunk_len],
|
||||
query_rot,
|
||||
query_pass,
|
||||
)
|
||||
query_inter_critical = self._apply_rotary_embedding(
|
||||
self.cos_sin_q_inter_cache[positions_with_offsets % chunk_len],
|
||||
query_rot,
|
||||
query_pass,
|
||||
)
|
||||
|
||||
# merge query into one tensor to simplify the interfaces
|
||||
query = torch.cat(
|
||||
(
|
||||
query,
|
||||
query_succ,
|
||||
query_inter,
|
||||
query_succ_critical,
|
||||
query_inter_critical,
|
||||
),
|
||||
dim=-1,
|
||||
)
|
||||
return query, key
|
||||
|
||||
def forward_cuda(
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
offsets: torch.Tensor | None = None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
return self.forward_native(positions, query, key, offsets)
|
||||
|
||||
def _apply_rotary_embedding(self, cos_sin, hidden_rot, hidden_pass):
|
||||
cos, sin = cos_sin.chunk(2, dim=-1)
|
||||
if self.is_neox_style:
|
||||
# NOTE(woosuk): Here we assume that the positions tensor has the
|
||||
# shape [batch_size, seq_len].
|
||||
cos = cos.repeat(1, 1, 2).unsqueeze(-2)
|
||||
sin = sin.repeat(1, 1, 2).unsqueeze(-2)
|
||||
else:
|
||||
cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
|
||||
sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)
|
||||
rotate_fn = rotate_neox if self.is_neox_style else rotate_gptj
|
||||
hidden_rot = hidden_rot * cos + rotate_fn(hidden_rot) * sin
|
||||
|
||||
if self.rotary_dim < self.head_size:
|
||||
hidden = torch.cat((hidden_rot, hidden_pass), dim=-1)
|
||||
else:
|
||||
hidden = hidden_rot
|
||||
return hidden.flatten(-2).squeeze(0)
|
||||
|
||||
def extra_repr(self) -> str:
|
||||
s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}"
|
||||
s += f", max_position_embeddings={self.max_position_embeddings}"
|
||||
s += f", base={self.base}, is_neox_style={self.is_neox_style}"
|
||||
s += f", chunk_size={self.chunk_size}, local_size={self.local_size}"
|
||||
return s
|
||||
@@ -0,0 +1,43 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import torch
|
||||
|
||||
from .base import RotaryEmbedding
|
||||
|
||||
|
||||
class DynamicNTKAlphaRotaryEmbedding(RotaryEmbedding):
|
||||
"""RotaryEmbedding extended with Dynamic NTK alpha.
|
||||
|
||||
Based on the original RotaryEmbedding implementation.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
head_size: int,
|
||||
rotary_dim: int,
|
||||
max_position_embeddings: int,
|
||||
base: float,
|
||||
is_neox_style: bool,
|
||||
scaling_alpha: float,
|
||||
dtype: torch.dtype,
|
||||
) -> None:
|
||||
self.scaling_alpha = scaling_alpha
|
||||
super().__init__(
|
||||
head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
|
||||
)
|
||||
|
||||
def _compute_cos_sin_cache(self) -> torch.Tensor:
|
||||
# For Hunyuan DynamicNTKAlphaRotaryEmbedding
|
||||
max_len = self.max_position_embeddings
|
||||
base = self.base * self.scaling_alpha ** (
|
||||
self.rotary_dim / (self.rotary_dim - 2)
|
||||
)
|
||||
inv_freq = self._compute_inv_freq(base)
|
||||
t = torch.arange(max_len, dtype=torch.float)
|
||||
|
||||
freqs = torch.einsum("i,j -> ij", t, inv_freq)
|
||||
cos = freqs.cos()
|
||||
sin = freqs.sin()
|
||||
cache = torch.cat((cos, sin), dim=-1)
|
||||
return cache
|
||||
@@ -0,0 +1,68 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Adapted from
|
||||
# https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py
|
||||
# Copyright 2023 The vLLM team.
|
||||
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
||||
# and OPT implementations in this library. It has been modified from its
|
||||
# original forms to accommodate minor architectural differences compared
|
||||
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import torch
|
||||
|
||||
from .base import RotaryEmbedding
|
||||
|
||||
|
||||
class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding):
|
||||
"""RotaryEmbedding extended with Dynamic NTK scaling.
|
||||
|
||||
Credits to the Reddit users /u/bloc97 and /u/emozilla
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
head_size: int,
|
||||
rotary_dim: int,
|
||||
max_position_embeddings: int,
|
||||
base: float,
|
||||
is_neox_style: bool,
|
||||
scaling_factor: float,
|
||||
dtype: torch.dtype,
|
||||
) -> None:
|
||||
self.scaling_factor = scaling_factor
|
||||
super().__init__(
|
||||
head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
|
||||
)
|
||||
|
||||
def _compute_cos_sin_cache(self) -> torch.Tensor:
|
||||
# NOTE(woosuk): self.max_position_embeddings is the original
|
||||
# maximum length before applying the rope scaling.
|
||||
# Thus, the maximum length after applying the rope scaling is
|
||||
# self.max_position_embeddings * self.scaling_factor.
|
||||
max_len = self.max_position_embeddings * self.scaling_factor
|
||||
base = self.base * (
|
||||
(self.scaling_factor * max_len / self.max_position_embeddings)
|
||||
- (self.scaling_factor - 1)
|
||||
) ** (self.rotary_dim / (self.rotary_dim - 2))
|
||||
inv_freq = self._compute_inv_freq(base)
|
||||
t = torch.arange(max_len, dtype=torch.float)
|
||||
|
||||
freqs = torch.einsum("i,j -> ij", t, inv_freq)
|
||||
cos = freqs.cos()
|
||||
sin = freqs.sin()
|
||||
cache = torch.cat((cos, sin), dim=-1)
|
||||
return cache
|
||||
@@ -0,0 +1,82 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
import torch
|
||||
|
||||
from .mrope import MRotaryEmbedding
|
||||
|
||||
|
||||
class Ernie4_5_VLRotaryEmbedding(MRotaryEmbedding):
|
||||
"""3D rotary positional embedding. 3D is t:time h:height w:width"""
|
||||
|
||||
def forward_native( # type: ignore[override]
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor | None = None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor | None]:
|
||||
assert positions.ndim == 1 or positions.ndim == 2
|
||||
assert key is not None
|
||||
|
||||
num_tokens = positions.shape[-1]
|
||||
cos_sin = self.cos_sin_cache[positions]
|
||||
cos, sin = cos_sin.chunk(2, dim=-1)
|
||||
if positions.ndim == 2:
|
||||
assert self.mrope_section
|
||||
|
||||
section_h = self.mrope_section[0] # 22
|
||||
section_w = self.mrope_section[1] # 22
|
||||
section_t = self.mrope_section[2] # 20
|
||||
assert section_h == section_w
|
||||
# Split according to [h w h w h w h w... t t t...]
|
||||
section_cos_t = cos[..., -section_t:]
|
||||
section_cos_h = cos[..., : section_h + section_w : 2]
|
||||
section_cos_w = cos[..., 1 : section_h + section_w : 2]
|
||||
|
||||
cos_t, cos_h, cos_w = section_cos_t[0], section_cos_h[1], section_cos_w[2]
|
||||
cos_hw = torch.stack([cos_h, cos_w], dim=-1).reshape(
|
||||
cos_h.shape[:-1] + (cos_h.shape[-1] * 2,)
|
||||
)
|
||||
cos = torch.cat([cos_hw, cos_t], dim=-1)
|
||||
|
||||
section_sin_t = sin[..., -section_t:]
|
||||
section_sin_h = sin[..., : section_h + section_w : 2]
|
||||
section_sin_w = sin[..., 1 : section_h + section_w : 2]
|
||||
|
||||
sin_t, sin_h, sin_w = section_sin_t[0], section_sin_h[1], section_sin_w[2]
|
||||
sin_hw = torch.stack([sin_h, sin_w], dim=-1).reshape(
|
||||
sin_h.shape[:-1] + (sin_h.shape[-1] * 2,)
|
||||
)
|
||||
sin = torch.cat([sin_hw, sin_t], dim=-1)
|
||||
|
||||
query_shape = query.shape
|
||||
query = query.view(num_tokens, -1, self.head_size)
|
||||
query_rot = query[..., : self.rotary_dim]
|
||||
query_pass = query[..., self.rotary_dim :]
|
||||
query_rot = self.apply_rotary_emb.forward_native(
|
||||
query_rot,
|
||||
cos,
|
||||
sin,
|
||||
)
|
||||
query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
|
||||
|
||||
key_shape = key.shape
|
||||
key = key.view(num_tokens, -1, self.head_size)
|
||||
key_rot = key[..., : self.rotary_dim]
|
||||
key_pass = key[..., self.rotary_dim :]
|
||||
key_rot = self.apply_rotary_emb.forward_native(
|
||||
key_rot,
|
||||
cos,
|
||||
sin,
|
||||
)
|
||||
key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
|
||||
return query, key
|
||||
|
||||
def forward_cuda( # type: ignore[override]
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor | None = None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor | None]:
|
||||
return self.forward_native(positions, query, key)
|
||||
199
vllm/model_executor/layers/rotary_embedding/fope.py
Normal file
199
vllm/model_executor/layers/rotary_embedding/fope.py
Normal file
@@ -0,0 +1,199 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch import nn
|
||||
|
||||
from vllm.distributed import (
|
||||
get_tensor_model_parallel_rank,
|
||||
get_tensor_model_parallel_world_size,
|
||||
)
|
||||
|
||||
from .base import RotaryEmbedding
|
||||
from .common import rotate_neox
|
||||
|
||||
|
||||
class FourierRotaryEmbedding(RotaryEmbedding):
|
||||
def __init__(
|
||||
self,
|
||||
head_size: int,
|
||||
rotary_dim: int,
|
||||
max_position_embeddings: int,
|
||||
base: float,
|
||||
is_neox_style: bool,
|
||||
dtype: torch.dtype,
|
||||
init_cache: bool,
|
||||
# extra parameters for FoPE
|
||||
num_key_value_heads: int,
|
||||
num_inv_freq: int,
|
||||
fope_sep_head: bool,
|
||||
fope_init_factor: float,
|
||||
):
|
||||
# fope related parameters
|
||||
self.num_key_value_heads = num_key_value_heads
|
||||
self.num_inv_freq = num_inv_freq
|
||||
self.fope_sep_head = fope_sep_head
|
||||
self.fope_init_factor = fope_init_factor
|
||||
|
||||
super().__init__(
|
||||
head_size=head_size,
|
||||
rotary_dim=rotary_dim,
|
||||
max_position_embeddings=max_position_embeddings,
|
||||
base=base,
|
||||
is_neox_style=is_neox_style,
|
||||
dtype=dtype,
|
||||
init_cache=init_cache,
|
||||
)
|
||||
|
||||
# setup buffers and parameters
|
||||
self.inv_freq: torch.Tensor
|
||||
self.register_buffer(
|
||||
"inv_freq", self._compute_inv_freq(self.base), persistent=False
|
||||
)
|
||||
|
||||
self.input_dim = self.inv_freq.shape[-1]
|
||||
self.output_dim = self.inv_freq.shape[-1]
|
||||
self.cos_coef = nn.Parameter(
|
||||
torch.empty(num_key_value_heads, self.input_dim, self.output_dim),
|
||||
requires_grad=False,
|
||||
)
|
||||
self.sin_coef = nn.Parameter(
|
||||
torch.empty(num_key_value_heads, self.input_dim, self.output_dim),
|
||||
requires_grad=False,
|
||||
)
|
||||
self.sin_coef.weight_loader = self.weight_loader
|
||||
self.cos_coef.weight_loader = self.weight_loader
|
||||
|
||||
self.cos_sin_cache: torch.Tensor
|
||||
cache = self._compute_cos_sin_cache().to(dtype)
|
||||
self.register_buffer("cos_sin_cache", cache, persistent=False)
|
||||
|
||||
# update cache in the first forward, where sin/cos_coef weights are ready
|
||||
self.update_cache = True
|
||||
|
||||
def _compute_inv_freq(self, base: float) -> torch.Tensor:
|
||||
"""Compute the inverse frequency."""
|
||||
inv_freq = 1.0 / (
|
||||
base
|
||||
** (
|
||||
torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim
|
||||
)
|
||||
)
|
||||
|
||||
inv_freq_idx_selected = torch.ones_like(inv_freq, dtype=torch.bool)
|
||||
if self.num_inv_freq is not None:
|
||||
inv_freq_idx_selected[self.num_inv_freq :] = False
|
||||
else:
|
||||
inv_freq_idx_selected = inv_freq > (
|
||||
2.0 * torch.pi / self.max_position_embeddings
|
||||
)
|
||||
|
||||
inv_freq = inv_freq[inv_freq_idx_selected]
|
||||
return inv_freq
|
||||
|
||||
def _compute_cos_sin_cache(self) -> torch.Tensor:
|
||||
"""Compute the cos and sin cache."""
|
||||
device = self.inv_freq.device
|
||||
t = torch.arange(self.max_position_embeddings, dtype=torch.float, device=device)
|
||||
|
||||
freqs = torch.einsum("j,i -> ji", t, self.inv_freq)
|
||||
if self.fope_sep_head:
|
||||
pos_cos = freqs.cos().unsqueeze(0).expand(self.num_key_value_heads, -1, -1)
|
||||
pos_sin = freqs.sin().unsqueeze(0).expand(self.num_key_value_heads, -1, -1)
|
||||
else:
|
||||
pos_cos = freqs.cos()
|
||||
pos_sin = freqs.sin()
|
||||
|
||||
if self.fope_sep_head:
|
||||
sin = torch.einsum("htD, hDd -> thd", pos_sin, self.sin_coef.float())
|
||||
cos = torch.einsum("htD, hDd -> thd", pos_cos, self.cos_coef.float())
|
||||
else:
|
||||
sin = torch.einsum("tD, Dd -> td", pos_sin, self.sin_coef.float())
|
||||
cos = torch.einsum("tD, Dd -> td", pos_cos, self.cos_coef.float())
|
||||
|
||||
sin = F.pad(
|
||||
input=sin,
|
||||
pad=(0, self.head_size // 2 - sin.size(-1)),
|
||||
mode="constant",
|
||||
value=1,
|
||||
)
|
||||
cos = F.pad(
|
||||
input=cos,
|
||||
pad=(0, self.head_size // 2 - cos.size(-1)),
|
||||
mode="constant",
|
||||
value=1,
|
||||
)
|
||||
|
||||
sin = torch.cat((sin, sin), dim=-1)
|
||||
cos = torch.cat((cos, cos), dim=-1)
|
||||
|
||||
# cache: (max_position_embeddings, num_kv_heads, kv_size * 2)
|
||||
cache = torch.cat((cos, sin), dim=-1)
|
||||
return cache
|
||||
|
||||
def forward_native(
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor | None = None,
|
||||
offsets: torch.Tensor | None = None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor | None]:
|
||||
# update cos/sin cache in the first forward
|
||||
if self.update_cache:
|
||||
cache = self._compute_cos_sin_cache().to(self.dtype)
|
||||
self.cos_sin_cache.copy_(cache)
|
||||
self.update_cache = False
|
||||
|
||||
positions = positions.flatten()
|
||||
cos_sin = self.cos_sin_cache.index_select(0, positions)
|
||||
cos, sin = cos_sin.chunk(2, dim=-1)
|
||||
|
||||
# apply rotary embedding
|
||||
# query: (seq_len, num_heads, head_size)
|
||||
# key: (seq_len, num_kv_heads, head_size)
|
||||
query = query.unflatten(-1, (-1, self.head_size))
|
||||
assert key is not None, "Key tensor is required for FoPE."
|
||||
key = key.unflatten(-1, (-1, self.head_size))
|
||||
|
||||
assert query.dim() == key.dim() == 3, (
|
||||
"Expected query key (seq_len, heads, head_dim)"
|
||||
)
|
||||
assert cos.dim() <= 3 and sin.dim() <= 3
|
||||
|
||||
need_reshape = False
|
||||
if cos.dim() == 3:
|
||||
# for fope
|
||||
need_reshape = True
|
||||
query_shape = query.shape
|
||||
key_shape = key.shape
|
||||
cos = cos.flatten(0, 1)
|
||||
sin = sin.flatten(0, 1)
|
||||
seq_len = cos.size(0)
|
||||
query = query.view(seq_len, -1, query.size(-1))
|
||||
key = key.view(seq_len, -1, key.size(-1))
|
||||
|
||||
# native implementation of apply rope for neox style
|
||||
cos = cos.unsqueeze(1)
|
||||
sin = sin.unsqueeze(1)
|
||||
query = (query * cos) + (rotate_neox(query) * sin)
|
||||
key = (key * cos) + (rotate_neox(key) * sin)
|
||||
|
||||
if need_reshape:
|
||||
query = query.view(query_shape)
|
||||
key = key.view(key_shape)
|
||||
|
||||
return query, key
|
||||
|
||||
def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
|
||||
"""load fope weights"""
|
||||
world_size = get_tensor_model_parallel_world_size()
|
||||
rank = get_tensor_model_parallel_rank()
|
||||
num_key_value_heads = loaded_weight.size(0)
|
||||
|
||||
if num_key_value_heads < world_size:
|
||||
n_replicate = world_size // num_key_value_heads
|
||||
world_size = num_key_value_heads
|
||||
rank = rank // n_replicate
|
||||
|
||||
loaded_weight = loaded_weight.chunk(world_size, dim=0)[rank]
|
||||
param.data.copy_(loaded_weight)
|
||||
@@ -0,0 +1,115 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
# Adapted from
|
||||
# https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py
|
||||
# Copyright 2023 The vLLM team.
|
||||
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
||||
# and OPT implementations in this library. It has been modified from its
|
||||
# original forms to accommodate minor architectural differences compared
|
||||
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import torch
|
||||
|
||||
from .base import RotaryEmbedding
|
||||
|
||||
|
||||
class LinearScalingRotaryEmbedding(RotaryEmbedding):
|
||||
"""RotaryEmbedding extended with linear scaling.
|
||||
|
||||
It supports multiple scaling factors. Since multiple LoRA adapters may have
|
||||
different scaling factors, we need multiple cos/sin caches. In this way,
|
||||
instead of running rotary embedding kernel per lora, we can run multiple
|
||||
lora in a batched way.
|
||||
|
||||
In addition to that, we also keep the cos/sin cache for the scaling factor
|
||||
of 1 (default) at all times.
|
||||
|
||||
Exemplary for two scaling factors x=1, y and z with embeddings
|
||||
[[x11, x12, ... x1m], ..., [xn1, xn2, ..., xnm]] and
|
||||
[[y11, y12, ... y1o], ..., [yn1, yn2, ..., yno]], and
|
||||
[[z11, z12, ... z1p], ..., [zn1, zn2, ..., znp]],
|
||||
|
||||
we construct the cos/sin cache as follows:
|
||||
[[x11, x12, ... x1m, y11, y12, ... y1o, z11, z12, ... z1p],
|
||||
...
|
||||
[xn1, xn2, ... xnm, yn1, yn2, ... yno, zn1, zn2, ... znp]]
|
||||
|
||||
We then use offsets to index into the cos/sin cache for
|
||||
the respective scaling factors.
|
||||
|
||||
The offset to cache can be accessed via `scaling_factor_to_offset` API.
|
||||
|
||||
Credits to the Reddit user /u/kaiokendev
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
head_size: int,
|
||||
rotary_dim: int,
|
||||
max_position_embeddings: int,
|
||||
base: float,
|
||||
is_neox_style: bool,
|
||||
scaling_factors: list[float] | float,
|
||||
dtype: torch.dtype,
|
||||
) -> None:
|
||||
if isinstance(scaling_factors, float):
|
||||
scaling_factors = [scaling_factors]
|
||||
self.scaling_factors: list[float] = scaling_factors # noqa
|
||||
super().__init__(
|
||||
head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
|
||||
)
|
||||
# Lazy initialized.
|
||||
self._scaling_factor_to_offset: dict[float, int]
|
||||
|
||||
def _compute_cos_sin_cache(self) -> torch.Tensor:
|
||||
inv_freq = self._compute_inv_freq(self.base)
|
||||
cache_list: list[torch.Tensor] = []
|
||||
# offsets to the next cache in a tensor.
|
||||
# Each offset corresponds to the same index in scaling_factors.
|
||||
offsets: list[int] = []
|
||||
for scaling_factor in self.scaling_factors:
|
||||
# NOTE(woosuk): self.max_position_embeddings is the original
|
||||
# maximum length before applying the rope scaling.
|
||||
# Thus, the maximum length after applying the rope scaling is
|
||||
# self.max_position_embeddings * self.scaling_factor.
|
||||
max_len = self.max_position_embeddings * scaling_factor
|
||||
t = torch.arange(max_len, dtype=torch.float)
|
||||
t = t / scaling_factor
|
||||
|
||||
freqs = torch.einsum("i,j -> ij", t, inv_freq)
|
||||
cos = freqs.cos()
|
||||
sin = freqs.sin()
|
||||
cache = torch.cat((cos, sin), dim=-1)
|
||||
if not cache_list:
|
||||
offset = 0
|
||||
else:
|
||||
last_offset = offsets[-1]
|
||||
next_max_len = cache_list[-1].shape[0]
|
||||
offset = last_offset + next_max_len
|
||||
offsets.append(offset)
|
||||
cache_list.append(cache)
|
||||
self._scaling_factor_to_offset = {
|
||||
float(scaling_factor): offsets[i]
|
||||
for i, scaling_factor in enumerate(self.scaling_factors)
|
||||
}
|
||||
assert len(self.scaling_factors) == len(offsets)
|
||||
return torch.cat(cache_list, dim=0)
|
||||
|
||||
@property
|
||||
def scaling_factor_to_offset(self) -> dict[float, int]:
|
||||
return self._scaling_factor_to_offset
|
||||
54
vllm/model_executor/layers/rotary_embedding/llama3_rope.py
Normal file
54
vllm/model_executor/layers/rotary_embedding/llama3_rope.py
Normal file
@@ -0,0 +1,54 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import math
|
||||
|
||||
import torch
|
||||
|
||||
from .base import RotaryEmbedding
|
||||
|
||||
|
||||
class Llama3RotaryEmbedding(RotaryEmbedding):
|
||||
def __init__(
|
||||
self,
|
||||
head_size: int,
|
||||
rotary_dim: int,
|
||||
max_position_embeddings: int,
|
||||
base: float,
|
||||
is_neox_style: bool,
|
||||
dtype: torch.dtype,
|
||||
scaling_factor: float,
|
||||
low_freq_factor: float,
|
||||
high_freq_factor: float,
|
||||
orig_max_position: int,
|
||||
) -> None:
|
||||
self.scaling_factor = scaling_factor
|
||||
self.low_freq_factor = low_freq_factor
|
||||
self.high_freq_factor = high_freq_factor
|
||||
self.orig_max_position = orig_max_position
|
||||
super().__init__(
|
||||
head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
|
||||
)
|
||||
|
||||
def _compute_inv_freq(self, base: float) -> torch.Tensor:
|
||||
inv_freqs = super()._compute_inv_freq(base)
|
||||
low_freq_wavelen = self.orig_max_position / self.low_freq_factor
|
||||
high_freq_wavelen = self.orig_max_position / self.high_freq_factor
|
||||
|
||||
wave_len = 2 * math.pi / inv_freqs
|
||||
if self.low_freq_factor != self.high_freq_factor:
|
||||
smooth = (self.orig_max_position / wave_len - self.low_freq_factor) / (
|
||||
self.high_freq_factor - self.low_freq_factor
|
||||
)
|
||||
else:
|
||||
smooth = 0
|
||||
new_freqs = torch.where(
|
||||
wave_len < high_freq_wavelen,
|
||||
inv_freqs,
|
||||
torch.where(
|
||||
wave_len > low_freq_wavelen,
|
||||
inv_freqs / self.scaling_factor,
|
||||
(1 - smooth) * inv_freqs / self.scaling_factor + smooth * inv_freqs,
|
||||
),
|
||||
)
|
||||
return new_freqs
|
||||
@@ -0,0 +1,83 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import math
|
||||
|
||||
import torch
|
||||
|
||||
from .base import RotaryEmbeddingBase
|
||||
|
||||
|
||||
class Llama4VisionRotaryEmbedding(RotaryEmbeddingBase):
|
||||
def __init__(
|
||||
self,
|
||||
head_size: int,
|
||||
rotary_dim: int,
|
||||
max_position_embeddings: int,
|
||||
base: float,
|
||||
is_neox_style: bool,
|
||||
dtype: torch.dtype,
|
||||
):
|
||||
super().__init__(
|
||||
head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
|
||||
)
|
||||
|
||||
def _compute_inv_freq(self, base: float) -> torch.Tensor:
|
||||
inv_freqs = super()._compute_inv_freq(base)
|
||||
inv_freqs = inv_freqs[: (self.rotary_dim // 2)]
|
||||
return inv_freqs
|
||||
|
||||
def _compute_cos_sin_cache(self) -> torch.Tensor:
|
||||
inv_freq = self._compute_inv_freq(self.base)
|
||||
|
||||
# self.max_position_embeddings here is number of image patches
|
||||
# i.e. (image_size // patch_size) ** 2
|
||||
num_patches = self.max_position_embeddings
|
||||
img_idx = torch.arange(num_patches, dtype=torch.int32).reshape(num_patches, 1)
|
||||
img_idx = torch.cat([img_idx, img_idx[:1]], dim=0)
|
||||
img_idx[-1, -1] = -2 # set to ID_CLS_TOKEN
|
||||
num_patches_single_dim = int(math.sqrt(num_patches))
|
||||
frequencies_x = img_idx % num_patches_single_dim
|
||||
frequencies_y = img_idx // num_patches_single_dim
|
||||
freqs_x = (
|
||||
(frequencies_x + 1)[..., None] * inv_freq[None, None, :]
|
||||
).repeat_interleave(2, dim=-1)
|
||||
freqs_y = (
|
||||
(frequencies_y + 1)[..., None] * inv_freq[None, None, :]
|
||||
).repeat_interleave(2, dim=-1)
|
||||
freqs = torch.cat([freqs_x, freqs_y], dim=-1).float().contiguous()[..., ::2]
|
||||
freqs = freqs.masked_fill(img_idx.reshape(-1, 1, 1) < 0, 0)
|
||||
cache = torch.view_as_complex(
|
||||
torch.stack([torch.cos(freqs), torch.sin(freqs)], dim=-1)
|
||||
)
|
||||
return cache
|
||||
|
||||
def forward_native( # type: ignore[override]
|
||||
self,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor | None = None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor | None]:
|
||||
assert key is not None
|
||||
# self.cos_sin_cache here is complex tensor so we cannot cast into
|
||||
# query's dtype directly with self._match_cos_sin_cache_dtype
|
||||
|
||||
# NOTE: by not storing cos_sin_cache in self, we can avoid
|
||||
# memory buffer update which is costly to runtime
|
||||
cos_sin_cache: torch.Tensor = self.cos_sin_cache.to(query.device)
|
||||
query_ = torch.view_as_complex(query.float().reshape(*query.shape[:-1], -1, 2))
|
||||
key_ = torch.view_as_complex(key.float().reshape(*key.shape[:-1], -1, 2))
|
||||
broadcast_shape = [
|
||||
d if i == 1 or i == (query_.ndim - 1) else 1
|
||||
for i, d in enumerate(query_.shape)
|
||||
]
|
||||
freqs_ci = cos_sin_cache.view(*broadcast_shape)
|
||||
query_out = torch.view_as_real(query_ * freqs_ci).flatten(3)
|
||||
key_out = torch.view_as_real(key_ * freqs_ci).flatten(3)
|
||||
return query_out.type_as(query), key_out.type_as(key)
|
||||
|
||||
def forward_cuda( # type: ignore[override]
|
||||
self,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor | None = None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor | None]:
|
||||
return self.forward_native(query, key)
|
||||
414
vllm/model_executor/layers/rotary_embedding/mrope.py
Normal file
414
vllm/model_executor/layers/rotary_embedding/mrope.py
Normal file
@@ -0,0 +1,414 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from vllm.triton_utils import tl, triton
|
||||
|
||||
from .base import RotaryEmbeddingBase
|
||||
from .yarn_scaling_rope import YaRNScalingRotaryEmbedding, yarn_get_mscale
|
||||
|
||||
|
||||
@triton.jit
|
||||
def _triton_mrope_forward(
|
||||
q_ptr,
|
||||
k_ptr,
|
||||
cos,
|
||||
sin,
|
||||
num_tokens,
|
||||
n_qh: tl.constexpr,
|
||||
n_kh: tl.constexpr,
|
||||
hd: tl.constexpr,
|
||||
rd: tl.constexpr,
|
||||
pad_n_qh: tl.constexpr,
|
||||
pad_n_kh: tl.constexpr,
|
||||
pad_hd: tl.constexpr,
|
||||
mrope_section_t: tl.constexpr,
|
||||
mrope_section_h: tl.constexpr,
|
||||
mrope_section_w: tl.constexpr,
|
||||
is_interleaved: tl.constexpr,
|
||||
):
|
||||
# Adapted from
|
||||
# https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/ops/qwen2vl_mrope.py
|
||||
# This version supports flatten input tensors from vllm
|
||||
# and supports cos and sin cache with shape (3, num_tokens, head_dim // 2)
|
||||
# instead of (3, bsz, seq_len, head_dim), also supports interleaved rotary
|
||||
pid = tl.program_id(0)
|
||||
# locate start address
|
||||
q_ptr = q_ptr + pid * (n_qh * hd)
|
||||
k_ptr = k_ptr + pid * (n_kh * hd)
|
||||
|
||||
# ####################################################################
|
||||
# get the cos(mθ_{i...d/2}) and sin(mθ_{i...d/2}) for token position
|
||||
# m of this program instance
|
||||
# ####################################################################
|
||||
# Note: cos and sin now have shape (3, num_tokens, head_dim // 2)
|
||||
|
||||
# Updated stride calculation for half head_dim
|
||||
half_rd = rd // 2
|
||||
t_cos = cos + pid * half_rd
|
||||
h_cos = t_cos + num_tokens * half_rd
|
||||
w_cos = h_cos + num_tokens * half_rd
|
||||
t_sin = sin + pid * half_rd
|
||||
h_sin = t_sin + num_tokens * half_rd
|
||||
w_sin = h_sin + num_tokens * half_rd
|
||||
|
||||
# Updated offsets for half head_dim
|
||||
cos_offsets = tl.arange(0, pad_hd // 2)
|
||||
if is_interleaved:
|
||||
h_mask = ((cos_offsets % 3) == 1) & (cos_offsets <= 3 * mrope_section_h)
|
||||
w_mask = ((cos_offsets % 3) == 2) & (cos_offsets <= 3 * mrope_section_w)
|
||||
t_mask = ~(h_mask | w_mask)
|
||||
else:
|
||||
t_end = mrope_section_t
|
||||
h_end = t_end + mrope_section_h
|
||||
t_mask = cos_offsets < mrope_section_t
|
||||
h_mask = (t_end <= cos_offsets) & (cos_offsets < h_end)
|
||||
w_mask = (h_end <= cos_offsets) & (cos_offsets < half_rd)
|
||||
|
||||
t_cos_row = tl.load(t_cos + cos_offsets, mask=t_mask, other=0)
|
||||
h_cos_row = tl.load(h_cos + cos_offsets, mask=h_mask, other=0)
|
||||
w_cos_row = tl.load(w_cos + cos_offsets, mask=w_mask, other=0)
|
||||
t_sin_row = tl.load(t_sin + cos_offsets, mask=t_mask, other=0)
|
||||
h_sin_row = tl.load(h_sin + cos_offsets, mask=h_mask, other=0)
|
||||
w_sin_row = tl.load(w_sin + cos_offsets, mask=w_mask, other=0)
|
||||
|
||||
cos_row = t_cos_row + h_cos_row + w_cos_row
|
||||
sin_row = t_sin_row + h_sin_row + w_sin_row
|
||||
|
||||
# ####################################################################
|
||||
# Load the left and right half of q and k for the current
|
||||
# program instance (i.e. for the current token) separately
|
||||
# ####################################################################
|
||||
# left half of the head
|
||||
first_half_q_offsets = (
|
||||
tl.arange(0, pad_n_qh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
|
||||
)
|
||||
first_half_k_offsets = (
|
||||
tl.arange(0, pad_n_kh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
|
||||
)
|
||||
first_q_mask = (tl.arange(0, pad_n_qh)[:, None] < n_qh) & (
|
||||
tl.arange(0, pad_hd // 2)[None, :] < rd // 2
|
||||
)
|
||||
first_k_mask = (tl.arange(0, pad_n_kh)[:, None] < n_kh) & (
|
||||
tl.arange(0, pad_hd // 2)[None, :] < rd // 2
|
||||
)
|
||||
|
||||
q_tile_1 = tl.load(q_ptr + first_half_q_offsets, mask=first_q_mask, other=0).to(
|
||||
sin_row.dtype
|
||||
)
|
||||
k_tile_1 = tl.load(k_ptr + first_half_k_offsets, mask=first_k_mask, other=0).to(
|
||||
sin_row.dtype
|
||||
)
|
||||
|
||||
# right half of the head
|
||||
second_half_q_offsets = first_half_q_offsets + (rd // 2)
|
||||
second_half_k_offsets = first_half_k_offsets + (rd // 2)
|
||||
second_q_mask = first_q_mask
|
||||
second_k_mask = first_k_mask
|
||||
|
||||
q_tile_2 = tl.load(q_ptr + second_half_q_offsets, mask=second_q_mask, other=0).to(
|
||||
sin_row.dtype
|
||||
)
|
||||
k_tile_2 = tl.load(k_ptr + second_half_k_offsets, mask=second_k_mask, other=0).to(
|
||||
sin_row.dtype
|
||||
)
|
||||
|
||||
# y = [x1, x2] * [cos, cos] + [-x2, x1] * [sin, sin]
|
||||
# Since cos and sin are now half-size,
|
||||
# we use the same cos_row and sin_row for both halves
|
||||
new_q_tile_1 = q_tile_1 * cos_row - q_tile_2 * sin_row
|
||||
tl.store(q_ptr + first_half_q_offsets, new_q_tile_1, mask=first_q_mask)
|
||||
new_q_tile_2 = q_tile_2 * cos_row + q_tile_1 * sin_row
|
||||
tl.store(q_ptr + second_half_q_offsets, new_q_tile_2, mask=second_q_mask)
|
||||
|
||||
new_k_tile_1 = k_tile_1 * cos_row - k_tile_2 * sin_row
|
||||
tl.store(k_ptr + first_half_k_offsets, new_k_tile_1, mask=first_k_mask)
|
||||
new_k_tile_2 = k_tile_2 * cos_row + k_tile_1 * sin_row
|
||||
tl.store(k_ptr + second_half_k_offsets, new_k_tile_2, mask=second_k_mask)
|
||||
|
||||
|
||||
def triton_mrope(
|
||||
q: torch.Tensor,
|
||||
k: torch.Tensor,
|
||||
cos: torch.Tensor,
|
||||
sin: torch.Tensor,
|
||||
mrope_section: list[int],
|
||||
head_size: int,
|
||||
rotary_dim: int,
|
||||
mrope_interleaved: bool,
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
"""Qwen2VL mrope kernel.
|
||||
|
||||
Args:
|
||||
q: [num_tokens, num_heads * head_size]
|
||||
k: [num_tokens, num_kv_heads * head_size]
|
||||
cos: [3, num_tokens, head_size //2 ]
|
||||
(T/H/W positions with multimodal inputs)
|
||||
sin: [3, num_tokens, head_size //2 ]
|
||||
(T/H/W positions with multimodal inputs)
|
||||
mrope_section: [t, h, w]
|
||||
head_size: int
|
||||
"""
|
||||
n_row, n_q_head_head_dim = q.shape
|
||||
n_q_head = n_q_head_head_dim // head_size
|
||||
n_kv_head = k.shape[1] // head_size
|
||||
pad_hd = triton.next_power_of_2(head_size)
|
||||
pad_n_q_head = triton.next_power_of_2(n_q_head)
|
||||
pad_n_kv_head = triton.next_power_of_2(n_kv_head)
|
||||
|
||||
# ensure tensors passed into the kernel are contiguous.
|
||||
# It will be no-op if they are already contiguous
|
||||
q = q.contiguous()
|
||||
k = k.contiguous()
|
||||
cos = cos.contiguous()
|
||||
sin = sin.contiguous()
|
||||
|
||||
_triton_mrope_forward[(n_row,)](
|
||||
q,
|
||||
k,
|
||||
cos,
|
||||
sin,
|
||||
n_row,
|
||||
n_q_head,
|
||||
n_kv_head,
|
||||
head_size,
|
||||
rotary_dim,
|
||||
pad_n_q_head,
|
||||
pad_n_kv_head,
|
||||
pad_hd,
|
||||
mrope_section[0],
|
||||
mrope_section[1],
|
||||
mrope_section[2],
|
||||
mrope_interleaved,
|
||||
)
|
||||
return q, k
|
||||
|
||||
|
||||
def apply_interleaved_rope(x: torch.Tensor, mrope_section: list[int]) -> torch.Tensor:
|
||||
"""Apply interleaved MRoPE to 3D rotary embeddings.
|
||||
Reorganizes frequency layout from chunked [TTT...HHH...WWW] to
|
||||
interleaved [THTHWHTHW...TT], preserving frequency continuity.
|
||||
"""
|
||||
x_t = x[0].clone()
|
||||
x_t[..., 1 : mrope_section[1] * 3 : 3] = x[1, ..., 1 : mrope_section[1] * 3 : 3]
|
||||
x_t[..., 2 : mrope_section[2] * 3 : 3] = x[2, ..., 2 : mrope_section[2] * 3 : 3]
|
||||
return x_t
|
||||
|
||||
|
||||
class MRotaryEmbedding(RotaryEmbeddingBase):
|
||||
"""Rotary Embedding with Multimodal Sections."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
head_size: int,
|
||||
rotary_dim: int,
|
||||
max_position_embeddings: int,
|
||||
base: float,
|
||||
is_neox_style: bool,
|
||||
dtype: torch.dtype,
|
||||
mrope_section: list[int] | None = None,
|
||||
mrope_interleaved: bool = False,
|
||||
# YaRN parameters.
|
||||
*,
|
||||
scaling_factor: float | None = None,
|
||||
extrapolation_factor: float = 1,
|
||||
attn_factor: float = 1,
|
||||
beta_fast: int = 32,
|
||||
beta_slow: int = 1,
|
||||
truncate: bool = True,
|
||||
) -> None:
|
||||
self.scaling_factor = scaling_factor
|
||||
self.extrapolation_factor = extrapolation_factor
|
||||
self.attn_factor = attn_factor
|
||||
self.beta_fast = beta_fast
|
||||
self.beta_slow = beta_slow
|
||||
self.truncate = truncate
|
||||
if self.scaling_factor is not None:
|
||||
# Get n-d magnitude scaling corrected for interpolation
|
||||
self.mscale = float(yarn_get_mscale(self.scaling_factor) * attn_factor)
|
||||
else:
|
||||
self.mscale = 1.0
|
||||
|
||||
# In Qwen2.5-VL, the maximum index value is related to the duration of
|
||||
# the input video. We enlarge max_position_embeddings to 4 times to get
|
||||
# a larger the cos and sin cache.
|
||||
self.cache_max_position_num = max_position_embeddings * 4
|
||||
super().__init__(
|
||||
head_size,
|
||||
rotary_dim,
|
||||
self.cache_max_position_num,
|
||||
base,
|
||||
is_neox_style,
|
||||
dtype,
|
||||
)
|
||||
|
||||
self.mrope_section = mrope_section
|
||||
self.mrope_interleaved = mrope_interleaved
|
||||
if self.mrope_section:
|
||||
assert sum(self.mrope_section) == rotary_dim // 2
|
||||
|
||||
def _compute_inv_freq(self, base: float) -> torch.Tensor:
|
||||
if self.scaling_factor is None:
|
||||
return super()._compute_inv_freq(base)
|
||||
return YaRNScalingRotaryEmbedding._compute_inv_freq(self, base)
|
||||
|
||||
def _compute_cos_sin_cache(self) -> torch.Tensor:
|
||||
if self.scaling_factor is None:
|
||||
return super()._compute_cos_sin_cache()
|
||||
return YaRNScalingRotaryEmbedding._compute_cos_sin_cache(self)
|
||||
|
||||
def forward_native(
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor | None = None,
|
||||
offsets: torch.Tensor | None = None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor | None]:
|
||||
"""PyTorch-native implementation equivalent to forward().
|
||||
|
||||
Args:
|
||||
positions:
|
||||
[num_tokens,] (text only) or
|
||||
[3, num_tokens] (T/H/W positions with multimodal inputs)
|
||||
query: [num_tokens, num_heads * head_size]
|
||||
key: [num_tokens, num_kv_heads * head_size]
|
||||
"""
|
||||
assert positions.ndim == 1 or positions.ndim == 2
|
||||
assert key is not None
|
||||
|
||||
cos_sin_cache = self._match_cos_sin_cache_dtype(query)
|
||||
num_tokens = positions.shape[-1]
|
||||
cos_sin = cos_sin_cache[positions]
|
||||
cos, sin = cos_sin.chunk(2, dim=-1)
|
||||
if positions.ndim == 2:
|
||||
assert self.mrope_section
|
||||
if self.mrope_interleaved:
|
||||
cos = apply_interleaved_rope(cos, self.mrope_section)
|
||||
sin = apply_interleaved_rope(sin, self.mrope_section)
|
||||
else:
|
||||
cos = torch.cat(
|
||||
[m[i] for i, m in enumerate(cos.split(self.mrope_section, dim=-1))],
|
||||
dim=-1,
|
||||
)
|
||||
sin = torch.cat(
|
||||
[m[i] for i, m in enumerate(sin.split(self.mrope_section, dim=-1))],
|
||||
dim=-1,
|
||||
)
|
||||
|
||||
query_shape = query.shape
|
||||
query = query.view(num_tokens, -1, self.head_size)
|
||||
query_rot = query[..., : self.rotary_dim]
|
||||
query_pass = query[..., self.rotary_dim :]
|
||||
query_rot = self.apply_rotary_emb.forward_native(
|
||||
query_rot,
|
||||
cos,
|
||||
sin,
|
||||
)
|
||||
query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
|
||||
|
||||
key_shape = key.shape
|
||||
key = key.view(num_tokens, -1, self.head_size)
|
||||
key_rot = key[..., : self.rotary_dim]
|
||||
key_pass = key[..., self.rotary_dim :]
|
||||
key_rot = self.apply_rotary_emb.forward_native(
|
||||
key_rot,
|
||||
cos,
|
||||
sin,
|
||||
)
|
||||
key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
|
||||
return query, key
|
||||
|
||||
def forward_cuda(
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor | None = None,
|
||||
offsets: torch.Tensor | None = None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor | None]:
|
||||
assert positions.ndim == 1 or positions.ndim == 2
|
||||
assert key is not None
|
||||
|
||||
cos_sin_cache = self._match_cos_sin_cache_dtype(query)
|
||||
num_tokens = positions.shape[-1]
|
||||
cos_sin = cos_sin_cache[positions]
|
||||
cos, sin = cos_sin.chunk(2, dim=-1)
|
||||
query_shape = query.shape
|
||||
key_shape = key.shape
|
||||
if positions.ndim == 2:
|
||||
assert self.mrope_section
|
||||
|
||||
q, k = triton_mrope(
|
||||
query,
|
||||
key,
|
||||
cos,
|
||||
sin,
|
||||
self.mrope_section,
|
||||
self.head_size,
|
||||
self.rotary_dim,
|
||||
self.mrope_interleaved,
|
||||
)
|
||||
|
||||
return q.reshape(query_shape), k.reshape(key_shape)
|
||||
|
||||
query = query.view(num_tokens, -1, self.head_size)
|
||||
query_rot = query[..., : self.rotary_dim]
|
||||
query_pass = query[..., self.rotary_dim :]
|
||||
query_rot = self.apply_rotary_emb(
|
||||
query_rot,
|
||||
cos,
|
||||
sin,
|
||||
)
|
||||
query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
|
||||
|
||||
key = key.view(num_tokens, -1, self.head_size)
|
||||
key_rot = key[..., : self.rotary_dim]
|
||||
key_pass = key[..., self.rotary_dim :]
|
||||
key_rot = self.apply_rotary_emb(
|
||||
key_rot,
|
||||
cos,
|
||||
sin,
|
||||
)
|
||||
key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
|
||||
return query, key
|
||||
|
||||
def forward_cpu(
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor | None = None,
|
||||
offsets: torch.Tensor | None = None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor | None]:
|
||||
return self.forward_native(positions, query, key, offsets)
|
||||
|
||||
@staticmethod
|
||||
def get_next_input_positions(
|
||||
mrope_position_delta: int,
|
||||
context_len: int,
|
||||
seq_len: int,
|
||||
) -> list[list[int]]:
|
||||
return [
|
||||
list(
|
||||
range(
|
||||
context_len + mrope_position_delta, seq_len + mrope_position_delta
|
||||
)
|
||||
)
|
||||
for _ in range(3)
|
||||
]
|
||||
|
||||
@staticmethod
|
||||
def get_next_input_positions_tensor(
|
||||
out: np.ndarray,
|
||||
out_offset: int,
|
||||
mrope_position_delta: int,
|
||||
context_len: int,
|
||||
num_new_tokens: int,
|
||||
):
|
||||
values = np.arange(
|
||||
mrope_position_delta + context_len,
|
||||
mrope_position_delta + context_len + num_new_tokens,
|
||||
dtype=out.dtype,
|
||||
)
|
||||
out[:, out_offset : out_offset + num_new_tokens] = values
|
||||
185
vllm/model_executor/layers/rotary_embedding/mrope_interleaved.py
Normal file
185
vllm/model_executor/layers/rotary_embedding/mrope_interleaved.py
Normal file
@@ -0,0 +1,185 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# Adapted from vllm/model_executor/layers/rotary_embedding/__init__.py
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# This file is a part of the vllm-ascend project.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import torch
|
||||
|
||||
from .mrope import MRotaryEmbedding
|
||||
|
||||
|
||||
# MRotaryEmbedding with interleaved
|
||||
class MRotaryEmbeddingInterleaved(MRotaryEmbedding):
|
||||
"""Rotary Embedding with Multimodal Sections and Interleaved Support."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
head_size: int,
|
||||
rotary_dim: int,
|
||||
max_position_embeddings: int,
|
||||
base: float,
|
||||
is_neox_style: bool,
|
||||
dtype: torch.dtype,
|
||||
mrope_section: list[int],
|
||||
mrope_interleaved: bool = True,
|
||||
) -> None:
|
||||
# Enlarge max_position_embeddings for video inputs
|
||||
self.cache_max_position_num = max_position_embeddings
|
||||
super().__init__(
|
||||
head_size,
|
||||
rotary_dim,
|
||||
self.cache_max_position_num,
|
||||
base,
|
||||
is_neox_style,
|
||||
dtype,
|
||||
)
|
||||
|
||||
self.mrope_section = mrope_section
|
||||
self.mrope_interleaved = mrope_interleaved
|
||||
|
||||
if self.mrope_section is None:
|
||||
raise ValueError("mrope_section cannot be None.")
|
||||
if sum(self.mrope_section) != rotary_dim // 2:
|
||||
raise ValueError("Sum of mrope_section must equal rotary_dim // 2.")
|
||||
if not self.mrope_interleaved:
|
||||
raise ValueError(
|
||||
"mrope_interleaved must be True when mrope_section is provided."
|
||||
)
|
||||
|
||||
# Generate interleaved indices
|
||||
if len(mrope_section) == 2:
|
||||
h_num, w_num = mrope_section[0], mrope_section[1]
|
||||
mrope_dim = self.get_mrope_interleaved_id_list(h_num, w_num, 0)
|
||||
elif len(mrope_section) == 3:
|
||||
t_num, h_num, w_num = mrope_section[0], mrope_section[1], mrope_section[2]
|
||||
mrope_dim = self.get_mrope_interleaved_id_list(
|
||||
t_num, h_num, w_num, force_last=True
|
||||
)
|
||||
else:
|
||||
raise AssertionError(
|
||||
"Cannot support the length of mrope section is not 2 or 3."
|
||||
)
|
||||
|
||||
mrope_dim = mrope_dim * 2
|
||||
self.mrope_dim = mrope_dim
|
||||
|
||||
self.layer_cache = None
|
||||
|
||||
def _rebuild_pos_emb(
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
"""Interleave the rotary embedding"""
|
||||
cos_sin = self.cos_sin_cache[positions]
|
||||
mrope_section_3d = [1] * len(self.mrope_dim)
|
||||
mrope_dim = self.mrope_dim
|
||||
cos_sin = torch.cat(
|
||||
[
|
||||
m[mrope_dim[i]]
|
||||
for i, m in enumerate(cos_sin.split(mrope_section_3d, dim=-1))
|
||||
],
|
||||
dim=-1,
|
||||
)
|
||||
return cos_sin, torch.arange(cos_sin.shape[0], device=positions.device)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor | None = None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor | None]:
|
||||
"""Forward pass with interleaved rotary embedding."""
|
||||
cos_sin, positions = self._rebuild_pos_emb(positions)
|
||||
cos, sin = cos_sin.chunk(2, dim=-1)
|
||||
|
||||
query_shape = query.shape
|
||||
positions = positions.flatten()
|
||||
num_tokens = positions.shape[0]
|
||||
query = query.view(num_tokens, -1, self.head_size)
|
||||
query_rot = query[..., : self.rotary_dim]
|
||||
query_pass = query[..., self.rotary_dim :]
|
||||
query_rot = self.apply_rotary_emb.forward_native(
|
||||
query_rot,
|
||||
cos,
|
||||
sin,
|
||||
)
|
||||
query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
|
||||
|
||||
# key may be None in some cases, e.g. cross-layer KV sharing
|
||||
if key is not None:
|
||||
key_shape = key.shape
|
||||
key = key.view(num_tokens, -1, self.head_size)
|
||||
key_rot = key[..., : self.rotary_dim]
|
||||
key_pass = key[..., self.rotary_dim :]
|
||||
key_rot = self.apply_rotary_emb.forward_native(
|
||||
key_rot,
|
||||
cos,
|
||||
sin,
|
||||
)
|
||||
key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
|
||||
return query, key
|
||||
|
||||
@staticmethod
|
||||
def get_mrope_interleaved_id_list(
|
||||
a: int, b: int, c: int, force_last: bool = False
|
||||
) -> list[int]:
|
||||
"""
|
||||
Generate an interleaved list of indices for multi-modal rotary embedding.
|
||||
|
||||
Args:
|
||||
a: Number of indices for first modality
|
||||
b: Number of indices for second modality
|
||||
c: Number of indices for third modality
|
||||
force_last: Whether to force the last element to be from the first modality
|
||||
|
||||
Returns:
|
||||
List of interleaved indices
|
||||
"""
|
||||
if force_last:
|
||||
a -= 1
|
||||
|
||||
counts = {0: a, 1: b, 2: c}
|
||||
placed = {k: 0 for k in counts}
|
||||
rem = counts.copy()
|
||||
seq: list[int] = []
|
||||
last = None
|
||||
|
||||
total = a + b + c
|
||||
for _ in range(total):
|
||||
# Candidates: remaining > 0 and ≠ last
|
||||
cands = [k for k in rem if rem[k] > 0 and k != last]
|
||||
if not cands:
|
||||
# If only last remains, relax the condition
|
||||
cands = [k for k in rem if rem[k] > 0]
|
||||
|
||||
# Select the rarest candidate
|
||||
try:
|
||||
best = min(cands, key=lambda k: (placed[k] / counts[k], k))
|
||||
except KeyError:
|
||||
best = 0
|
||||
|
||||
seq.append(best)
|
||||
placed[best] += 1
|
||||
rem[best] -= 1
|
||||
last = best
|
||||
|
||||
if force_last:
|
||||
seq.append(0)
|
||||
|
||||
return seq
|
||||
@@ -0,0 +1,47 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
import torch
|
||||
|
||||
from .base import RotaryEmbedding
|
||||
|
||||
|
||||
class NTKScalingRotaryEmbedding(RotaryEmbedding):
|
||||
"""RotaryEmbedding extended with fixed and mixed NTK scaling.
|
||||
https://kexue.fm/archives/9706"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
head_size: int,
|
||||
rotary_dim: int,
|
||||
max_position_embeddings: int,
|
||||
base: float,
|
||||
is_neox_style: bool,
|
||||
scaling_factor: float,
|
||||
dtype: torch.dtype,
|
||||
mixed_b: float | None = None,
|
||||
) -> None:
|
||||
self.scaling_factor = scaling_factor
|
||||
self.mixed_b = mixed_b
|
||||
super().__init__(
|
||||
head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
|
||||
)
|
||||
|
||||
def _compute_inv_freq(self, base: float) -> torch.Tensor:
|
||||
base = self.base * (self.scaling_factor if self.mixed_b is None else 1)
|
||||
inv_freq = super()._compute_inv_freq(base)
|
||||
|
||||
if self.mixed_b is None:
|
||||
inv_freq = inv_freq / self.scaling_factor ** (2 / self.rotary_dim)
|
||||
else:
|
||||
a = (
|
||||
torch.tensor(self.scaling_factor).log()
|
||||
/ (self.rotary_dim / 2) ** self.mixed_b
|
||||
)
|
||||
lambda_1_m = (
|
||||
a * torch.arange(1, self.rotary_dim // 2 + 1).float() ** self.mixed_b
|
||||
).exp()
|
||||
inv_freq = inv_freq / lambda_1_m
|
||||
|
||||
return inv_freq
|
||||
@@ -0,0 +1,159 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import math
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from vllm.config import get_current_vllm_config
|
||||
from vllm.logger import init_logger
|
||||
|
||||
from .common import rotate_neox
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class Phi3LongRoPEScaledRotaryEmbedding(nn.Module):
|
||||
"""Phi3 family of models scaled rotary embedding.
|
||||
|
||||
Based on the original RotaryEmbedding implementation.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
head_size: int,
|
||||
rotary_dim: int,
|
||||
max_position_embeddings: int,
|
||||
original_max_position_embeddings: int,
|
||||
base: float,
|
||||
is_neox_style: bool,
|
||||
dtype: torch.dtype,
|
||||
short_factor: list[float],
|
||||
long_factor: list[float],
|
||||
short_mscale: float | None = None,
|
||||
long_mscale: float | None = None,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
if is_neox_style is False:
|
||||
raise ValueError(
|
||||
"`Phi3LongRoPEScaledRotaryEmbedding` only supports neox_style."
|
||||
)
|
||||
|
||||
self.rotary_dim = rotary_dim
|
||||
self.head_size = head_size
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.original_max_position_embeddings = original_max_position_embeddings
|
||||
self.base = base
|
||||
self.short_factor = short_factor
|
||||
self.long_factor = long_factor
|
||||
|
||||
# Force long factors if max_model_len (runtime max length) exceeds
|
||||
# original_max_position_embeddings to prevent KV cache invalidation when
|
||||
# sequences cross this threshold during generation
|
||||
max_model_len = get_current_vllm_config().model_config.max_model_len
|
||||
self.use_long_rope = max_model_len > original_max_position_embeddings
|
||||
if self.use_long_rope:
|
||||
logger.warning_once(
|
||||
"Using LongRoPE scaling factors. This enables longer "
|
||||
"contexts (%d tokens vs original %d tokens) at the cost of "
|
||||
"some performance degradation for shorter sequences. If "
|
||||
"this is not desired, set `max_model_len` to be at most %d.",
|
||||
max_position_embeddings,
|
||||
original_max_position_embeddings,
|
||||
original_max_position_embeddings,
|
||||
)
|
||||
|
||||
scale = self.max_position_embeddings / self.original_max_position_embeddings
|
||||
if scale <= 1.0:
|
||||
scaling_factor = 1.0
|
||||
else:
|
||||
scaling_factor = math.sqrt(
|
||||
1 + math.log(scale) / math.log(self.original_max_position_embeddings)
|
||||
)
|
||||
if short_mscale is None:
|
||||
short_mscale = scaling_factor
|
||||
if long_mscale is None:
|
||||
long_mscale = scaling_factor
|
||||
|
||||
self.short_mscale = short_mscale
|
||||
self.long_mscale = long_mscale
|
||||
|
||||
short_cache = self._compute_cos_sin_cache(
|
||||
original_max_position_embeddings, short_factor, short_mscale
|
||||
)
|
||||
short_cache = short_cache.to(dtype)
|
||||
|
||||
long_cache = self._compute_cos_sin_cache(
|
||||
max_position_embeddings, long_factor, long_mscale
|
||||
)
|
||||
long_cache = long_cache.to(dtype)
|
||||
|
||||
long_short_cache = torch.cat([short_cache, long_cache], dim=0)
|
||||
self.register_buffer(
|
||||
"long_short_cos_sin_cache", long_short_cache, persistent=False
|
||||
)
|
||||
|
||||
def _compute_inv_freq(self, rescale_factors: list[float]) -> torch.Tensor:
|
||||
rescale_factors = torch.tensor(rescale_factors, dtype=torch.float32)
|
||||
inv_freq = 1.0 / (
|
||||
rescale_factors
|
||||
* (
|
||||
self.base
|
||||
** (
|
||||
torch.arange(0, self.rotary_dim, 2, dtype=torch.float)
|
||||
/ self.rotary_dim
|
||||
)
|
||||
)
|
||||
)
|
||||
return inv_freq
|
||||
|
||||
def _compute_cos_sin_cache(
|
||||
self,
|
||||
max_position_embeddings: int,
|
||||
rescale_factors: list[float],
|
||||
mscale: float,
|
||||
) -> torch.Tensor:
|
||||
inv_freq = self._compute_inv_freq(rescale_factors)
|
||||
t = torch.arange(max_position_embeddings, dtype=torch.float)
|
||||
freqs = torch.einsum("i,j -> ij", t, inv_freq)
|
||||
cos = freqs.cos() * mscale
|
||||
sin = freqs.sin() * mscale
|
||||
cache = torch.cat((cos, sin), dim=-1)
|
||||
return cache
|
||||
|
||||
def forward(
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor | None = None,
|
||||
offsets: torch.Tensor | None = None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor | None]:
|
||||
assert key is not None
|
||||
query = query.view(*query.shape[:-1], -1, self.head_size)
|
||||
key = key.view(*key.shape[:-1], -1, self.head_size)
|
||||
|
||||
if self.use_long_rope:
|
||||
k = self.original_max_position_embeddings
|
||||
long_prompt_offset = torch.full_like(positions, k).long()
|
||||
idx = torch.add(positions, long_prompt_offset)
|
||||
else:
|
||||
idx = positions
|
||||
idx = torch.add(idx, offsets) if offsets is not None else idx
|
||||
cos_sin = torch.index_select(self.long_short_cos_sin_cache, 0, idx)
|
||||
|
||||
cos, sin = cos_sin.chunk(2, dim=-1)
|
||||
cos = cos.repeat(1, 2).unsqueeze(-2)
|
||||
sin = sin.repeat(1, 2).unsqueeze(-2)
|
||||
|
||||
query_rot = query[..., : self.rotary_dim]
|
||||
query_pass = query[..., self.rotary_dim :]
|
||||
query_rot = query_rot * cos + rotate_neox(query_rot) * sin
|
||||
query = torch.cat((query_rot, query_pass), dim=-1)
|
||||
|
||||
key_rot = key[..., : self.rotary_dim]
|
||||
key_pass = key[..., self.rotary_dim :]
|
||||
key_rot = key_rot * cos + rotate_neox(key_rot) * sin
|
||||
key = torch.cat((key_rot, key_pass), dim=-1)
|
||||
|
||||
return query.flatten(-2), key.flatten(-2)
|
||||
160
vllm/model_executor/layers/rotary_embedding/xdrope.py
Normal file
160
vllm/model_executor/layers/rotary_embedding/xdrope.py
Normal file
@@ -0,0 +1,160 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from .dynamic_ntk_alpha_rope import DynamicNTKAlphaRotaryEmbedding
|
||||
|
||||
|
||||
class XDRotaryEmbedding(DynamicNTKAlphaRotaryEmbedding):
|
||||
"""DynamicNTKAlphaRotaryEmbedding extended with MultiModal(XD) Sections.
|
||||
|
||||
Based on the original DynamicNTKAlphaRotaryEmbedding implementation.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
head_size: int,
|
||||
rotary_dim: int,
|
||||
max_position_embeddings: int,
|
||||
base: float,
|
||||
is_neox_style: bool,
|
||||
scaling_alpha: float,
|
||||
dtype: torch.dtype,
|
||||
xdrope_section: list[int],
|
||||
) -> None:
|
||||
self.xdrope_section = xdrope_section
|
||||
super().__init__(
|
||||
head_size,
|
||||
rotary_dim,
|
||||
max_position_embeddings,
|
||||
base,
|
||||
is_neox_style,
|
||||
scaling_alpha,
|
||||
dtype,
|
||||
)
|
||||
|
||||
def forward_native(
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor | None = None,
|
||||
offsets: torch.Tensor | None = None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor | None]:
|
||||
"""PyTorch-native implementation equivalent to forward().
|
||||
|
||||
Args:
|
||||
positions:
|
||||
[4, num_tokens] (P/W/H/T positions with multimodal inputs)
|
||||
query: [num_tokens, num_heads * head_size]
|
||||
key: [num_tokens, num_kv_heads * head_size]
|
||||
"""
|
||||
assert positions.ndim == 2
|
||||
assert key is not None
|
||||
|
||||
num_tokens = positions.shape[-1]
|
||||
cos_sin = self.cos_sin_cache[positions]
|
||||
cos, sin = cos_sin.chunk(2, dim=-1)
|
||||
cos = torch.cat(
|
||||
[m[i] for i, m in enumerate(cos.split(self.xdrope_section, dim=-1))], dim=-1
|
||||
)
|
||||
sin = torch.cat(
|
||||
[m[i] for i, m in enumerate(sin.split(self.xdrope_section, dim=-1))], dim=-1
|
||||
)
|
||||
|
||||
query_shape = query.shape
|
||||
query = query.view(num_tokens, -1, self.head_size)
|
||||
query_rot = query[..., : self.rotary_dim]
|
||||
query_pass = query[..., self.rotary_dim :]
|
||||
query_rot = self.apply_rotary_emb.forward_native(
|
||||
query_rot,
|
||||
cos,
|
||||
sin,
|
||||
)
|
||||
query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
|
||||
|
||||
key_shape = key.shape
|
||||
key = key.view(num_tokens, -1, self.head_size)
|
||||
key_rot = key[..., : self.rotary_dim]
|
||||
key_pass = key[..., self.rotary_dim :]
|
||||
key_rot = self.apply_rotary_emb.forward_native(
|
||||
key_rot,
|
||||
cos,
|
||||
sin,
|
||||
)
|
||||
key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
|
||||
return query, key
|
||||
|
||||
def forward_cuda(
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor | None = None,
|
||||
offsets: torch.Tensor | None = None,
|
||||
) -> tuple[torch.Tensor, torch.Tensor | None]:
|
||||
"""PyTorch-native implementation equivalent to forward().
|
||||
|
||||
Args:
|
||||
positions:
|
||||
[4, num_tokens] (P/W/H/T positions with multimodal inputs)
|
||||
query: [num_tokens, num_heads * head_size]
|
||||
key: [num_tokens, num_kv_heads * head_size]
|
||||
"""
|
||||
assert positions.ndim == 2
|
||||
assert key is not None
|
||||
|
||||
num_tokens = positions.shape[-1]
|
||||
cos_sin = self.cos_sin_cache[positions]
|
||||
cos, sin = cos_sin.chunk(2, dim=-1)
|
||||
cos = torch.cat(
|
||||
[m[i] for i, m in enumerate(cos.split(self.xdrope_section, dim=-1))], dim=-1
|
||||
)
|
||||
sin = torch.cat(
|
||||
[m[i] for i, m in enumerate(sin.split(self.xdrope_section, dim=-1))], dim=-1
|
||||
)
|
||||
|
||||
query_shape = query.shape
|
||||
query = query.view(num_tokens, -1, self.head_size)
|
||||
query_rot = query[..., : self.rotary_dim]
|
||||
query_pass = query[..., self.rotary_dim :]
|
||||
query_rot = self.apply_rotary_emb(
|
||||
query_rot,
|
||||
cos,
|
||||
sin,
|
||||
)
|
||||
query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
|
||||
|
||||
key_shape = key.shape
|
||||
key = key.view(num_tokens, -1, self.head_size)
|
||||
key_rot = key[..., : self.rotary_dim]
|
||||
key_pass = key[..., self.rotary_dim :]
|
||||
key_rot = self.apply_rotary_emb(
|
||||
key_rot,
|
||||
cos,
|
||||
sin,
|
||||
)
|
||||
key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
|
||||
return query, key
|
||||
|
||||
@staticmethod
|
||||
def get_next_input_positions(
|
||||
context_len: int,
|
||||
seq_len: int,
|
||||
xd_sections: int = 4,
|
||||
) -> list[list[int]]:
|
||||
return [list(range(context_len, seq_len)) for _ in range(xd_sections)]
|
||||
|
||||
@staticmethod
|
||||
def get_next_input_positions_tensor(
|
||||
out: np.ndarray,
|
||||
out_offset: int,
|
||||
context_len: int,
|
||||
num_new_tokens: int,
|
||||
):
|
||||
values = np.arange(
|
||||
context_len,
|
||||
context_len + num_new_tokens,
|
||||
dtype=out.dtype,
|
||||
)
|
||||
out[:, out_offset : out_offset + num_new_tokens] = values
|
||||
@@ -0,0 +1,84 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import torch
|
||||
|
||||
from .base import RotaryEmbedding
|
||||
from .common import yarn_find_correction_range, yarn_get_mscale, yarn_linear_ramp_mask
|
||||
|
||||
|
||||
class YaRNScalingRotaryEmbedding(RotaryEmbedding):
|
||||
"""RotaryEmbedding extended with YaRN method.
|
||||
|
||||
Credits to Peng et al. github.com/jquesnelle/yarn
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
head_size: int,
|
||||
rotary_dim: int,
|
||||
max_position_embeddings: int,
|
||||
base: float,
|
||||
is_neox_style: bool,
|
||||
scaling_factor: float,
|
||||
dtype: torch.dtype,
|
||||
*,
|
||||
extrapolation_factor: float = 1,
|
||||
attn_factor: float = 1,
|
||||
beta_fast: int = 32,
|
||||
beta_slow: int = 1,
|
||||
apply_yarn_scaling: bool = True,
|
||||
truncate: bool = True,
|
||||
) -> None:
|
||||
self.scaling_factor = scaling_factor
|
||||
self.extrapolation_factor = extrapolation_factor
|
||||
self.attn_factor = attn_factor
|
||||
self.beta_fast = beta_fast
|
||||
self.beta_slow = beta_slow
|
||||
self.truncate = truncate
|
||||
# Get n-d magnitude scaling corrected for interpolation
|
||||
self.mscale = (
|
||||
float(yarn_get_mscale(self.scaling_factor) * attn_factor)
|
||||
if apply_yarn_scaling
|
||||
else float(attn_factor)
|
||||
)
|
||||
super().__init__(
|
||||
head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
|
||||
)
|
||||
|
||||
def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor:
|
||||
pos_freqs = self.base ** (
|
||||
torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim
|
||||
)
|
||||
inv_freq_extrapolation = 1.0 / pos_freqs
|
||||
inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs)
|
||||
|
||||
low, high = yarn_find_correction_range(
|
||||
self.beta_fast,
|
||||
self.beta_slow,
|
||||
self.rotary_dim,
|
||||
self.base,
|
||||
self.max_position_embeddings,
|
||||
self.truncate,
|
||||
)
|
||||
# Get n-d rotational scaling corrected for extrapolation
|
||||
inv_freq_mask = (
|
||||
1
|
||||
- yarn_linear_ramp_mask(low, high, self.rotary_dim // 2, dtype=torch.float)
|
||||
) * self.extrapolation_factor
|
||||
inv_freq = (
|
||||
inv_freq_interpolation * (1 - inv_freq_mask)
|
||||
+ inv_freq_extrapolation * inv_freq_mask
|
||||
)
|
||||
return inv_freq
|
||||
|
||||
def _compute_cos_sin_cache(self) -> torch.Tensor:
|
||||
inv_freq = self._compute_inv_freq(self.scaling_factor)
|
||||
t = torch.arange(
|
||||
self.max_position_embeddings * self.scaling_factor, dtype=torch.float32
|
||||
)
|
||||
freqs = torch.einsum("i,j -> ij", t, inv_freq)
|
||||
cos = freqs.cos() * self.mscale
|
||||
sin = freqs.sin() * self.mscale
|
||||
cache = torch.cat((cos, sin), dim=-1)
|
||||
return cache
|
||||
Reference in New Issue
Block a user