From ddc78dbade04940493c0757b572681d17b7f3ca1 Mon Sep 17 00:00:00 2001 From: Shaoxu Cheng <2906339855@qq.com> Date: Mon, 2 Mar 2026 14:15:14 +0800 Subject: [PATCH] [300I] support decode-only aclgraph mode (#6849) ### What this PR does / why we need it? Supports 310p aclgraph mode, but it has some problems: - due to the event-id hardware limit, the number of graphs will be limited. - the CANN version that supports this feature cannot be obtained from outside Huawei. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? local test - vLLM version: v0.15.0 - vLLM main: https://github.com/vllm-project/vllm/commit/83b47f67b1dfad505606070ae4d9f83e50ad4ebd Signed-off-by: Tflowers-0129 <2906339855@qq.com> --- vllm_ascend/_310p/attention/attention_v1.py | 14 +++++++++++++- vllm_ascend/utils.py | 2 +- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/_310p/attention/attention_v1.py b/vllm_ascend/_310p/attention/attention_v1.py index 734d594e..3ba75604 100644 --- a/vllm_ascend/_310p/attention/attention_v1.py +++ b/vllm_ascend/_310p/attention/attention_v1.py @@ -110,7 +110,19 @@ class AscendAttentionBackendImpl310(AscendAttentionBackendImpl): device=query.device, non_blocking=True, ) - return super().forward_paged_attention(query, attn_metadata, output) + + torch_npu._npu_paged_attention( + query=query, + key_cache=self.key_cache, + value_cache=self.value_cache, + num_kv_heads=self.num_kv_heads, + num_heads=self.num_heads, + scale_value=self.scale, + block_table=attn_metadata.block_tables, + context_lens=attn_metadata.seq_lens, + out=output, + ) + return output def forward_prefill_310(self, query, key, value, attn_metadata, output): """ diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index a1f7083e..f6139559 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -833,7 +833,7 @@ def weak_ref_tensor(tensor: Any) -> Any: but will not keep the original tensor alive. 
""" if isinstance(tensor, torch.Tensor): - return torch.ops._C_ascend.weak_ref_tensor(tensor) + return torch_npu._C._weak_ref_tensor(tensor) else: return tensor