[Ops][Refactor] Remove custom rotary_embedding operator (#6523)

### What this PR does / why we need it?
This PR removes the custom `rotary_embedding` operator and its
associated C++ kernel implementation, PyTorch bindings, and tests.

The code now falls back to the native
`torch_npu._npu_rotary_embedding` implementation. This simplifies the
codebase by removing custom, platform-specific kernel code in favor of
the standard NPU library kernel, which should be better optimized and
is easier to maintain.
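
For reference, the fallback path now looks roughly like this (a minimal sketch mirroring the new code in the diff below; the wrapper name `npu_rope_fallback` is illustrative and not part of the change):

```python
import torch
import torch_npu  # Ascend NPU extension; an NPU device is required


def npu_rope_fallback(positions: torch.Tensor,
                      query: torch.Tensor,
                      key: torch.Tensor,
                      head_size: int,
                      cos_sin_cache: torch.Tensor,
                      is_neox_style: bool):
    """Apply rotary position embedding via the native torch_npu kernel.

    query/key are expected as [num_tokens, num_heads * head_size]; the
    kernel updates the (contiguous) 2-D views in place.
    """
    query = query.contiguous().view(query.shape[0], -1)
    key = key.contiguous().view(key.shape[0], -1)
    torch_npu._npu_rotary_embedding(
        positions, query, key, head_size, cos_sin_cache, is_neox_style
    )
    return query, key
```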

### Does this PR introduce _any_ user-facing change?
No. This is an internal refactoring and does not introduce any
user-facing changes.

### How was this patch tested?
The tests for the custom `rotary_embedding` operator have been removed
along with the operator itself. The correctness of the fallback to the
native `torch_npu` implementation is verified by existing CI tests for
attention layers and models that use rotary embeddings.
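
Beyond CI, a rough numerical sanity check could look like the sketch below, comparing the native kernel against a pure-PyTorch neox-style reference (illustrative only and not part of this PR; it assumes an Ascend NPU with `torch_npu` installed and the usual vLLM cos/sin cache layout of `[cos | sin]` along the last dimension):

```python
import torch
import torch_npu  # requires an Ascend NPU device


def ref_neox_rope(positions, query, key, head_size, cos_sin_cache):
    # Pure-PyTorch neox-style reference, assuming the cache stores
    # [cos | sin] along the last dimension as vLLM's RotaryEmbedding does.
    cos, sin = cos_sin_cache[positions].chunk(2, dim=-1)  # [T, head_size // 2] each
    cos, sin = cos.unsqueeze(-2), sin.unsqueeze(-2)       # broadcast over heads

    def rotate(x):
        x = x.view(x.shape[0], -1, head_size)
        x1, x2 = x.chunk(2, dim=-1)
        return torch.cat((x1 * cos - x2 * sin, x2 * cos + x1 * sin), dim=-1)

    return rotate(query).flatten(1), rotate(key).flatten(1)


T, H, KV, D = 16, 8, 2, 128
cache = torch.randn(1024, D, dtype=torch.float16).npu()  # random cache is fine for a parity check
pos = torch.randint(0, 1024, (T,)).npu()
q = torch.randn(T, H * D, dtype=torch.float16).npu()
k = torch.randn(T, KV * D, dtype=torch.float16).npu()

q_ref, k_ref = ref_neox_rope(pos, q.clone(), k.clone(), D, cache)
torch_npu._npu_rotary_embedding(pos, q, k, D, cache, True)  # rotates q/k in place
print("max |dq| =", (q - q_ref).abs().max().item())
print("max |dk| =", (k - k_ref).abs().max().item())
```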

- vLLM version: v0.15.0
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.15.0

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Author: wangxiyuan
Date: 2026-02-07 09:24:05 +08:00 (committed via GitHub)
Commit: 6c49f95da2 (parent: 06aa6036f6)
8 changed files with 59 additions and 1392 deletions

@@ -31,7 +31,7 @@ from torch.library import Library
# 3. The registration utility will check if a meta implementation already exists for your op,
# and only register if necessary. This avoids duplicate registrations.
#
# 4. Example meta implementations are provided below for rotary_embedding and get_masked_input_and_mask.
# 4. Example meta implementations are provided below for get_masked_input_and_mask.
#
# 5. When developing new custom ops, always provide a meta implementation to enable tracing,
# export, and shape inference in PyTorch and vLLM to enable the capture of `torch.compile`
@@ -52,25 +52,6 @@ def register_meta_if_necessary(ns: str, op_name: str, fn, overload: str = ""):
lib.impl(op_name, fn, "Meta")
def rotary_embedding_meta(
positions: torch.Tensor,
query: torch.Tensor,
key: torch.Tensor,
head_size: int,
cos_sin_cache: torch.Tensor,
is_neox: bool,
):
num_tokens = positions.numel()
query_hidden_size = query.numel() // num_tokens
key_hidden_size = key.numel() // num_tokens
num_heads = query_hidden_size // head_size
num_kv_heads = key_hidden_size // head_size
query_dst = torch.empty_like(query).view(num_tokens, num_heads, head_size)
key_dst = torch.empty_like(key).view(num_tokens, num_kv_heads, head_size)
return query_dst, key_dst
def get_masked_input_and_mask_meta(
input: torch.Tensor,
org_vocab_start_index: int,
@@ -105,7 +86,6 @@ def sgmv_expand_meta(
return y_out
register_meta_if_necessary("_C_ascend", "rotary_embedding", rotary_embedding_meta)
register_meta_if_necessary("_C_ascend", "get_masked_input_and_mask", get_masked_input_and_mask_meta)
register_meta_if_necessary("_C_ascend", "bgmv_expand", bgmv_expand_meta)
register_meta_if_necessary("_C_ascend", "sgmv_expand", sgmv_expand_meta)

@@ -33,7 +33,7 @@ if HAS_TRITON:
from vllm.model_executor.layers.rotary_embedding.mrope import triton_mrope
from vllm_ascend.platform import NPUPlatform
from vllm_ascend.utils import AscendDeviceType, enable_custom_op, get_ascend_device_type, has_rope, is_vl_model
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type, has_rope, is_vl_model
# Currently, rope ops used on npu requires detached cos && sin as inputs.
# However, RotaryEmbedding in vllm use cos_sin_cache as a whole variable.
@@ -144,10 +144,6 @@ def get_cos_and_sin_slice():
return _cos_slice, _sin_slice
def _custom_rotary_embedding_enabled(query, neox_style, head_size):
return query.dtype == torch.float16 and neox_style and head_size % 32 == 0 and enable_custom_op()
def _rope_forward_oot(
self,
positions: torch.Tensor,
@@ -162,9 +158,62 @@ def _rope_forward_oot(
if self.cos_sin_cache.dtype != query.dtype:
self.cos_sin_cache = self.cos_sin_cache.to(query.dtype)
cos, sin = get_cos_and_sin_slice()
# adopt custom kernel path for rotary_embedding
if _custom_rotary_embedding_enabled(query, is_neox_style, self.head_size):
query, key = torch.ops._C_ascend.rotary_embedding(
if offsets is not None:
raise NotImplementedError("Batched rotary embedding is currently not supported on NPU.")
if (
is_neox_style
and self.head_size == 128
and self.cos_sin_cache.shape[-1] == 128
and cos is not None
and sin is not None
):
# If cos and sin are generated outside, use npu_apply_rotary_pos_emb to avoid redundant calculation.
# This method requires head_size and rotary_dim equal 128 and neox_style is True
query = query.contiguous().view(1, query.shape[0], -1, self.head_size)
key = key.contiguous().view(1, key.shape[0], -1, self.head_size)
# Although this function modifies in-place, please retain the function's return value.
# Otherwise, the graph fusion operation may fail.
query, key = torch_npu.npu_apply_rotary_pos_emb(query, key, cos, sin)
elif self.rotary_dim < self.head_size:
if HAS_TRITON:
cos = cos.view(-1, self.rotary_dim)
sin = sin.view(-1, self.rotary_dim)
q = query.contiguous().view(query.shape[0], -1, self.head_size)
k = key.contiguous().view(key.shape[0], -1, self.head_size)
query, key = torch.ops.vllm.rope_forward_triton(
q, k, cos, sin, rope_dim=self.rotary_dim, is_neox_style=True
)
return query.view(query_shape), key.view(key_shape)
else:
num_tokens = query.shape[0]
query = query.view(num_tokens, -1, self.head_size)
key = key.view(num_tokens, -1, self.head_size)
q_rot = query[..., : self.rotary_dim]
q_pass = query[..., self.rotary_dim :]
k_rot = key[..., : self.rotary_dim]
k_pass = key[..., self.rotary_dim :]
q_rot = q_rot.contiguous().view(num_tokens, -1)
k_rot = k_rot.contiguous().view(num_tokens, -1)
# only the rotary part is processed here,
# the dimension should be rotary_dim
torch_npu._npu_rotary_embedding(
positions,
q_rot,
k_rot,
self.rotary_dim,
self.cos_sin_cache,
is_neox_style,
)
q_rot = q_rot.view(num_tokens, -1, self.rotary_dim)
k_rot = k_rot.view(num_tokens, -1, self.rotary_dim)
q = torch.cat((q_rot, q_pass), dim=-1).reshape(query_shape)
k = torch.cat((k_rot, k_pass), dim=-1).reshape(key_shape)
return q, k
else:
# TODO: Remove the contiguous in the future.
query = query.contiguous().view(query.shape[0], -1)
key = key.contiguous().view(key.shape[0], -1)
torch_npu._npu_rotary_embedding(
positions,
query,
key,
@@ -172,72 +221,7 @@ def _rope_forward_oot(
self.cos_sin_cache,
is_neox_style,
)
return query.view(query_shape), key.view(key_shape)
if offsets is not None:
raise NotImplementedError("Batched rotary embedding is currently not supported on NPU.")
else:
if (
is_neox_style
and self.head_size == 128
and self.cos_sin_cache.shape[-1] == 128
and cos is not None
and sin is not None
):
# If cos and sin are generated outside, use npu_apply_rotary_pos_emb to avoid redundant calculation.
# This method requires head_size and rotary_dim equal 128 and neox_style is True
query = query.contiguous().view(1, query.shape[0], -1, self.head_size)
key = key.contiguous().view(1, key.shape[0], -1, self.head_size)
# Although this function modifies in-place, please retain the function's return value.
# Otherwise, the graph fusion operation may fail.
query, key = torch_npu.npu_apply_rotary_pos_emb(query, key, cos, sin)
elif self.rotary_dim < self.head_size:
if HAS_TRITON:
cos = cos.view(-1, self.rotary_dim)
sin = sin.view(-1, self.rotary_dim)
q = query.contiguous().view(query.shape[0], -1, self.head_size)
k = key.contiguous().view(key.shape[0], -1, self.head_size)
query, key = torch.ops.vllm.rope_forward_triton(
q, k, cos, sin, rope_dim=self.rotary_dim, is_neox_style=True
)
return query.view(query_shape), key.view(key_shape)
else:
num_tokens = query.shape[0]
query = query.view(num_tokens, -1, self.head_size)
key = key.view(num_tokens, -1, self.head_size)
q_rot = query[..., : self.rotary_dim]
q_pass = query[..., self.rotary_dim :]
k_rot = key[..., : self.rotary_dim]
k_pass = key[..., self.rotary_dim :]
q_rot = q_rot.contiguous().view(num_tokens, -1)
k_rot = k_rot.contiguous().view(num_tokens, -1)
# only the rotary part is processed here,
# the dimension should be rotary_dim
torch_npu._npu_rotary_embedding(
positions,
q_rot,
k_rot,
self.rotary_dim,
self.cos_sin_cache,
is_neox_style,
)
q_rot = q_rot.view(num_tokens, -1, self.rotary_dim)
k_rot = k_rot.view(num_tokens, -1, self.rotary_dim)
q = torch.cat((q_rot, q_pass), dim=-1).reshape(query_shape)
k = torch.cat((k_rot, k_pass), dim=-1).reshape(key_shape)
return q, k
else:
# TODO: Remove the contiguous in the future.
query = query.contiguous().view(query.shape[0], -1)
key = key.contiguous().view(key.shape[0], -1)
torch_npu._npu_rotary_embedding(
positions,
query,
key,
self.head_size,
self.cos_sin_cache,
is_neox_style,
)
return query.view(query_shape), key.view(key_shape)
return query.view(query_shape), key.view(key_shape)
class AscendRotaryEmbedding(RotaryEmbedding):