Subject: [PATCH] 310P: call torch_npu._npu_paged_attention directly; use
 torch_npu._C._weak_ref_tensor in weak_ref_tensor()

AscendAttentionBackendImpl310 no longer delegates paged attention to the
base class: it invokes torch_npu._npu_paged_attention directly with the
backend's cached key/value tensors, block tables and sequence lengths from
attn_metadata, writing into `output` in place and returning it.

weak_ref_tensor() switches from the custom torch.ops._C_ascend op to the
binding shipped with torch_npu (torch_npu._C._weak_ref_tensor); behavior
for non-tensor inputs (returned unchanged) is untouched.

NOTE(review): Python indentation inside the hunks below was reconstructed
from conventional class/function layout (8-space method body, 4-space
function body) — confirm against the target files before applying.

diff --git a/vllm_ascend/_310p/attention/attention_v1.py b/vllm_ascend/_310p/attention/attention_v1.py
index 734d594e..3ba75604 100644
--- a/vllm_ascend/_310p/attention/attention_v1.py
+++ b/vllm_ascend/_310p/attention/attention_v1.py
@@ -110,7 +110,19 @@ class AscendAttentionBackendImpl310(AscendAttentionBackendImpl):
             device=query.device,
             non_blocking=True,
         )
-        return super().forward_paged_attention(query, attn_metadata, output)
+
+        torch_npu._npu_paged_attention(
+            query=query,
+            key_cache=self.key_cache,
+            value_cache=self.value_cache,
+            num_kv_heads=self.num_kv_heads,
+            num_heads=self.num_heads,
+            scale_value=self.scale,
+            block_table=attn_metadata.block_tables,
+            context_lens=attn_metadata.seq_lens,
+            out=output,
+        )
+        return output
 
     def forward_prefill_310(self, query, key, value, attn_metadata, output):
         """
diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
index a1f7083e..f6139559 100644
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -833,7 +833,7 @@ def weak_ref_tensor(tensor: Any) -> Any:
     but will not keep the original tensor alive.
     """
     if isinstance(tensor, torch.Tensor):
-        return torch.ops._C_ascend.weak_ref_tensor(tensor)
+        return torch_npu._C._weak_ref_tensor(tensor)
     else:
         return tensor