[300I] support decode-only aclgraph mode (#6849)
### What this PR does / why we need it?
This PR supports aclgraph mode on 310P, but it has some problems:
- due to the event-id hardware limit, the number of graphs will be limited.
- the CANN version that supports this feature cannot be obtained from
outside of Huawei.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
local test
- vLLM version: v0.15.0
- vLLM main:
83b47f67b1
Signed-off-by: Tflowers-0129 <2906339855@qq.com>
This commit is contained in:
@@ -110,7 +110,19 @@ class AscendAttentionBackendImpl310(AscendAttentionBackendImpl):
|
|||||||
device=query.device,
|
device=query.device,
|
||||||
non_blocking=True,
|
non_blocking=True,
|
||||||
)
|
)
|
||||||
return super().forward_paged_attention(query, attn_metadata, output)
|
|
||||||
|
torch_npu._npu_paged_attention(
|
||||||
|
query=query,
|
||||||
|
key_cache=self.key_cache,
|
||||||
|
value_cache=self.value_cache,
|
||||||
|
num_kv_heads=self.num_kv_heads,
|
||||||
|
num_heads=self.num_heads,
|
||||||
|
scale_value=self.scale,
|
||||||
|
block_table=attn_metadata.block_tables,
|
||||||
|
context_lens=attn_metadata.seq_lens,
|
||||||
|
out=output,
|
||||||
|
)
|
||||||
|
return output
|
||||||
|
|
||||||
def forward_prefill_310(self, query, key, value, attn_metadata, output):
|
def forward_prefill_310(self, query, key, value, attn_metadata, output):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -833,7 +833,7 @@ def weak_ref_tensor(tensor: Any) -> Any:
|
|||||||
but will not keep the original tensor alive.
|
but will not keep the original tensor alive.
|
||||||
"""
|
"""
|
||||||
if isinstance(tensor, torch.Tensor):
|
if isinstance(tensor, torch.Tensor):
|
||||||
return torch.ops._C_ascend.weak_ref_tensor(tensor)
|
return torch_npu._C._weak_ref_tensor(tensor)
|
||||||
else:
|
else:
|
||||||
return tensor
|
return tensor
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user