[torchair] remove aicpu op (#2640)
### What this PR does / why we need it?
Remove the AICPU op for torchair mode.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
vLLM version: v0.10.1.1; vLLM main: 05d839c19e / 67c14906aa.

Signed-off-by: zhangdepeng <zhangdepeng2@huawei.com>
Co-authored-by: zhangdepeng <zhangdepeng2@huawei.com>
This commit is contained in:
@@ -304,6 +304,7 @@ class AscendAttentionTorchairBackendImpl(AttentionImpl):
|
|||||||
self.num_queries_per_kv = self.num_heads // self.num_kv_heads
|
self.num_queries_per_kv = self.num_heads // self.num_kv_heads
|
||||||
self.key_cache = None
|
self.key_cache = None
|
||||||
self.value_cache = None
|
self.value_cache = None
|
||||||
|
self.scale_tensor = torch.zeros((), device='npu', dtype=torch.int32)
|
||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
@@ -366,7 +367,7 @@ class AscendAttentionTorchairBackendImpl(AttentionImpl):
|
|||||||
key_cache, value_cache = kv_cache[0], kv_cache[1]
|
key_cache, value_cache = kv_cache[0], kv_cache[1]
|
||||||
slots = attn_metadata.slot_mapping
|
slots = attn_metadata.slot_mapping
|
||||||
|
|
||||||
block_size = key_cache.shape[1]
|
block_size = self.scale_tensor + key_cache.shape[1]
|
||||||
slots_indices = slots.reshape(-1, 1)
|
slots_indices = slots.reshape(-1, 1)
|
||||||
block_indices = slots_indices // block_size
|
block_indices = slots_indices // block_size
|
||||||
slots_indices = slots_indices % block_size
|
slots_indices = slots_indices % block_size
|
||||||
|
|||||||
Reference in New Issue
Block a user