Update torch-npu version to 2.7.1 (#3896)

### What this PR does / why we need it?
Upgrade torch-npu to the official release version 2.7.1


- vLLM version: v0.11.0
- vLLM main: 83f478bb19

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
wangxiyuan
2025-10-31 17:16:31 +08:00
committed by GitHub
parent 5f6d1b3323
commit fcc9a0eaeb
15 changed files with 83 additions and 168 deletions

View File

@@ -19,8 +19,6 @@ from vllm.forward_context import BatchDescriptor, get_forward_context
from vllm.logger import logger
from vllm.platforms import current_platform
from vllm_ascend.utils import version_check
from ..utils import weak_ref_tensors
@@ -214,32 +212,20 @@ def update_attn_params(update_stream, forward_context, runtime_shape):
output,
) = param
seq_lens = forward_context.attn_metadata[key].seq_lens
torch_npu_check = version_check()
with torch.npu.stream(update_stream):
torch.npu.graph_task_update_begin(update_stream, handle)
if torch_npu_check:
torch_npu._npu_paged_attention(
query=query,
key_cache=key_cache,
value_cache=value_cache,
num_kv_heads=num_kv_heads,
num_heads=num_heads,
scale_value=scale,
block_table=block_table,
context_lens=seq_lens,
out=output,
workspace=graph_params.workspaces.get(runtime_shape))
else:
torch_npu._npu_paged_attention(query=query,
key_cache=key_cache,
value_cache=value_cache,
num_kv_heads=num_kv_heads,
num_heads=num_heads,
scale_value=scale,
block_table=block_table,
context_lens=seq_lens,
out=output)
torch_npu._npu_paged_attention(
query=query,
key_cache=key_cache,
value_cache=value_cache,
num_kv_heads=num_kv_heads,
num_heads=num_heads,
scale_value=scale,
block_table=block_table,
context_lens=seq_lens,
out=output,
workspace=graph_params.workspaces.get(runtime_shape))
torch.npu.graph_task_update_end(update_stream)
event.record(update_stream)