From ddc78dbade04940493c0757b572681d17b7f3ca1 Mon Sep 17 00:00:00 2001 From: Shaoxu Cheng <2906339855@qq.com> Date: Mon, 2 Mar 2026 14:15:14 +0800 Subject: [PATCH] [300I] support decode-only aclgraph mode (#6849) ### What this PR does / why we need it? Supports 310p aclgraph mode, but it has some problems: - due to the event-id hardware limit, the number of graphs will be limited. - the CANN version that supports this feature cannot be obtained from outside Huawei. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? local test - vLLM version: v0.15.0 - vLLM main: https://github.com/vllm-project/vllm/commit/83b47f67b1dfad505606070ae4d9f83e50ad4ebd Signed-off-by: Tflowers-0129 <2906339855@qq.com> --- vllm_ascend/_310p/attention/attention_v1.py | 14 +++++++++++++- vllm_ascend/utils.py | 2 +- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/_310p/attention/attention_v1.py b/vllm_ascend/_310p/attention/attention_v1.py index 734d594e..3ba75604 100644 --- a/vllm_ascend/_310p/attention/attention_v1.py +++ b/vllm_ascend/_310p/attention/attention_v1.py @@ -110,7 +110,19 @@ class AscendAttentionBackendImpl310(AscendAttentionBackendImpl): device=query.device, non_blocking=True, ) - return super().forward_paged_attention(query, attn_metadata, output) + + torch_npu._npu_paged_attention( + query=query, + key_cache=self.key_cache, + value_cache=self.value_cache, + num_kv_heads=self.num_kv_heads, + num_heads=self.num_heads, + scale_value=self.scale, + block_table=attn_metadata.block_tables, + context_lens=attn_metadata.seq_lens, + out=output, + ) + return output def forward_prefill_310(self, query, key, value, attn_metadata, output): """ diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index a1f7083e..f6139559 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -833,7 +833,7 @@ def weak_ref_tensor(tensor: Any) -> Any: but will not keep the original tensor alive. 
""" if isinstance(tensor, torch.Tensor): - return torch.ops._C_ascend.weak_ref_tensor(tensor) + return torch_npu._C._weak_ref_tensor(tensor) else: return tensor