From 20ae71291d876a8511eb504601f747a60864861d Mon Sep 17 00:00:00 2001 From: panchao-hub <315134829@qq.com> Date: Sat, 30 Aug 2025 15:51:12 +0800 Subject: [PATCH] [torchair]remove aicpu op (#2640) ### What this PR does / why we need it? Remove the AICPU op in torchair mode. `block_size` is now computed as `self.scale_tensor + key_cache.shape[1]`, where `self.scale_tensor` is a zero-valued scalar int32 tensor created on the NPU in `__init__`, so `block_size` is a scalar tensor rather than the plain shape value when it is used in the `//` and `%` on the slot indices. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/05d839c19e9582d62c860686678bac68240d7254 - vLLM version: v0.10.1.1 - vLLM main: https://github.com/vllm-project/vllm/commit/67c14906aaa480d4fee2606f31c784ae21f8a633 Signed-off-by: zhangdepeng Co-authored-by: zhangdepeng --- vllm_ascend/torchair/torchair_attention.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/torchair/torchair_attention.py b/vllm_ascend/torchair/torchair_attention.py index da754e5..81f2968 100644 --- a/vllm_ascend/torchair/torchair_attention.py +++ b/vllm_ascend/torchair/torchair_attention.py @@ -304,6 +304,7 @@ class AscendAttentionTorchairBackendImpl(AttentionImpl): self.num_queries_per_kv = self.num_heads // self.num_kv_heads self.key_cache = None self.value_cache = None + self.scale_tensor = torch.zeros((), device='npu', dtype=torch.int32) def forward( self, @@ -366,7 +367,7 @@ class AscendAttentionTorchairBackendImpl(AttentionImpl): key_cache, value_cache = kv_cache[0], kv_cache[1] slots = attn_metadata.slot_mapping - block_size = key_cache.shape[1] + block_size = self.scale_tensor + key_cache.shape[1] slots_indices = slots.reshape(-1, 1) block_indices = slots_indices // block_size slots_indices = slots_indices % block_size