From 20ae71291d876a8511eb504601f747a60864861d Mon Sep 17 00:00:00 2001
From: panchao-hub <315134829@qq.com>
Date: Sat, 30 Aug 2025 15:51:12 +0800
Subject: [PATCH] [torchair]remove aicpu op (#2640)

### What this PR does / why we need it?
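Remove an AICPU op in torchair mode by computing `block_size` as an NPU
tensor (`self.scale_tensor + key_cache.shape[1]`) instead of a Python int.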
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
- vLLM version: v0.10.1.1
- vLLM main:
https://github.com/vllm-project/vllm/commit/05d839c19e9582d62c860686678bac68240d7254
- vLLM version: v0.10.1.1
- vLLM main:
https://github.com/vllm-project/vllm/commit/67c14906aaa480d4fee2606f31c784ae21f8a633

Signed-off-by: zhangdepeng <zhangdepeng2@huawei.com>
Co-authored-by: zhangdepeng <zhangdepeng2@huawei.com>
---
 vllm_ascend/torchair/torchair_attention.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm_ascend/torchair/torchair_attention.py b/vllm_ascend/torchair/torchair_attention.py
index da754e5..81f2968 100644
--- a/vllm_ascend/torchair/torchair_attention.py
+++ b/vllm_ascend/torchair/torchair_attention.py
@@ -304,6 +304,7 @@ class AscendAttentionTorchairBackendImpl(AttentionImpl):
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
         self.key_cache = None
         self.value_cache = None
+        self.scale_tensor = torch.zeros((), device='npu', dtype=torch.int32)
 
     def forward(
         self,
@@ -366,7 +367,7 @@ class AscendAttentionTorchairBackendImpl(AttentionImpl):
             key_cache, value_cache = kv_cache[0], kv_cache[1]
             slots = attn_metadata.slot_mapping
 
-            block_size = key_cache.shape[1]
+            block_size = self.scale_tensor + key_cache.shape[1]
             slots_indices = slots.reshape(-1, 1)
             block_indices = slots_indices // block_size
             slots_indices = slots_indices % block_size