[Ops][Refactor] Remove custom rotary_embedding operator (#6523)
### What this PR does / why we need it?

This PR removes the custom `rotary_embedding` operator together with its C++ kernel implementation, PyTorch bindings, and tests. The codebase now falls back to the native `torch_npu._npu_rotary_embedding` implementation. This simplifies the codebase by removing custom, platform-specific kernel code in favor of the standard NPU library implementation, which is presumably better optimized and easier to maintain.

### Does this PR introduce _any_ user-facing change?

No. This is an internal refactoring and does not introduce any user-facing changes.

### How was this patch tested?

The tests for the custom `rotary_embedding` operator have been removed along with the operator itself. Correctness of the fallback to the native `torch_npu` implementation is covered by the existing CI tests for attention layers and models that use rotary embeddings.

- vLLM version: v0.15.0
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.15.0

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
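For context before the diff below, the fallback path reads roughly like this sketch. It is a minimal illustration, not code from this PR: the wrapper name `apply_rotary_embedding` is hypothetical, and it assumes `torch_npu._npu_rotary_embedding` updates `query` and `key` in place with the same argument list as the removed custom op (`positions`, `query`, `key`, `head_size`, `cos_sin_cache`, `is_neox`).

```python
# Minimal sketch of the fallback after this PR. Assumptions: the native op
# mutates query/key in place and mirrors the removed operator's argument
# list; the wrapper name is illustrative only.
import torch
import torch_npu  # Ascend NPU extension; requires an NPU environment


def apply_rotary_embedding(
    positions: torch.Tensor,    # [num_tokens], token position ids
    query: torch.Tensor,        # [num_tokens, num_heads * head_size]
    key: torch.Tensor,          # [num_tokens, num_kv_heads * head_size]
    head_size: int,
    cos_sin_cache: torch.Tensor,
    is_neox: bool,
) -> None:
    # Previously this dispatched to the custom kernel removed in this PR
    # (registered as torch.ops._C_ascend.rotary_embedding); now the native
    # torch_npu op is called directly.
    torch_npu._npu_rotary_embedding(
        positions, query, key, head_size, cos_sin_cache, is_neox
    )
```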
```diff
@@ -36,24 +36,6 @@
 namespace vllm_ascend {
 namespace meta {
 const int64_t INT4_NUMS_IN_INT32 = 8;
-std::tuple<at::Tensor, at::Tensor> rotary_embedding_meta(
-    at::Tensor &positions,
-    at::Tensor &query,
-    at::Tensor &key,
-    int64_t head_size,
-    at::Tensor &cos_sin_cache,
-    bool is_neox) {
-    auto num_tokens = positions.sym_numel();
-    auto query_hidden_size = query.sym_numel() / num_tokens;
-    auto key_hidden_size = key.sym_numel() / num_tokens;
-
-    auto num_heads = query_hidden_size / head_size;
-    auto num_kv_heads = key_hidden_size / head_size;
-    at::Tensor query_dst = at::empty_symint({num_tokens, num_heads, head_size}, query.options());
-    at::Tensor key_dst = at::empty_symint({num_tokens, num_kv_heads, head_size}, key.options());
-
-    return {query_dst, key_dst};
-}
-
 std::tuple<at::Tensor, at::Tensor> get_masked_input_and_mask_meta(
     at::Tensor &input,
@@ -457,8 +439,6 @@ namespace {
 // the custom kernel been captured into aclgraph
 TORCH_LIBRARY_IMPL_EXPAND(CONCAT(_C, _ascend), Meta, ops) {
 
-    // Rotary embedding meta implementation
-    ops.impl("rotary_embedding", &vllm_ascend::meta::rotary_embedding_meta);
     // Masked input and mask meta implementation
     ops.impl("get_masked_input_and_mask", &vllm_ascend::meta::get_masked_input_and_mask_meta);
     // Bgmv expand
```
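The deleted Meta registration existed so that shape inference works when the custom kernel is captured into an aclgraph: a meta (fake) implementation allocates empty outputs of the right shape and dtype without touching device memory. Below is a rough Python rendering of the removed `rotary_embedding_meta` shape logic, illustrative only; the C++ in the diff above is what actually shipped, and the unused `cos_sin_cache`/`is_neox` parameters are kept solely to mirror the op schema.

```python
# Rough Python equivalent of the removed C++ rotary_embedding_meta. A meta
# (fake) kernel only propagates shapes and dtypes; it never runs on device.
import torch


def rotary_embedding_meta(positions, query, key, head_size,
                          cos_sin_cache=None, is_neox=False):
    num_tokens = positions.numel()
    # Recover head counts from the flattened hidden dimensions, as the
    # removed C++ code did with sym_numel().
    num_heads = query.numel() // num_tokens // head_size
    num_kv_heads = key.numel() // num_tokens // head_size
    # Empty tensors on the meta device carry shape/dtype but no storage.
    query_dst = torch.empty(num_tokens, num_heads, head_size,
                            dtype=query.dtype, device="meta")
    key_dst = torch.empty(num_tokens, num_kv_heads, head_size,
                          dtype=key.dtype, device="meta")
    return query_dst, key_dst


# Example: 8 tokens, 4 query heads, 2 KV heads, head_size 128.
pos = torch.empty(8, dtype=torch.long, device="meta")
q = torch.empty(8, 4 * 128, device="meta")
k = torch.empty(8, 2 * 128, device="meta")
q_out, k_out = rotary_embedding_meta(pos, q, k, 128)
print(q_out.shape, k_out.shape)  # torch.Size([8, 4, 128]) torch.Size([8, 2, 128])
```

With the custom operator gone, no meta implementation is needed: the native `torch_npu` op ships with its own graph-capture support, which is part of what this PR simplifies.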