Revert PTA upgrade PR (#3352)

We noticed that torch npu 0919 doesn't work. This PR reverts the related changes that rely on the 0919 version. Reverted PRs: #3295, #3205, #3102. Related: #3353. - vLLM version: v0.11.0
2025-10-10 14:09:53 +08:00
parent 601a37aeff
commit ba19dd3183
15 changed files with 57 additions and 312 deletions
--- a/vllm_ascend/attention/attention_v1.py
+++ b/vllm_ascend/attention/attention_v1.py
@@ -34,8 +34,7 @@ from vllm.v1.kv_cache_interface import AttentionSpec
 from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata,
                                         maybe_save_kv_layer_to_connector,
                                         wait_for_kv_layer_from_connector)
-from vllm_ascend.compilation.acl_graph import (get_graph_params,
-                                               update_graph_params_workspaces)
+from vllm_ascend.compilation.acl_graph import get_graph_params
 from vllm_ascend.ops.attention import vanilla_chunked_prefill
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
                               nd_to_nz_2d, nd_to_nz_spec)
@@ -394,28 +393,13 @@ class AscendAttentionBackendImpl(AttentionImpl):
            forward_context: ForwardContext = get_forward_context()
            num_tokens = query.shape[0]
            if forward_context.capturing:
-                # Get workspace from cache or calculate it if not present.
-                workspace = graph_params.workspaces.get(num_tokens)
-                if workspace is None:
-                    workspace = torch_npu._npu_paged_attention_get_workspace(
-                        query=query,
-                        key_cache=self.key_cache,
-                        value_cache=self.value_cache,
-                        num_kv_heads=self.num_kv_heads,
-                        num_heads=self.num_heads,
-                        scale_value=self.scale,
-                        block_table=attn_metadata.block_tables,
-                        context_lens=attn_metadata.seq_lens,
-                        out=output)
-                    update_graph_params_workspaces(num_tokens, workspace)
-
-                # Handle graph capturing mode
                stream = torch_npu.npu.current_stream()

                event = torch.npu.ExternalEvent()
                event.wait(stream)
                event.reset(stream)
                graph_params.events[num_tokens].append(event)
+
                graph_params.attn_params[num_tokens].append((
                    query,
                    self.key_cache,
@@ -429,7 +413,6 @@ class AscendAttentionBackendImpl(AttentionImpl):
                ))

                torch.npu.graph_task_group_begin(stream)
-
                torch_npu._npu_paged_attention(
                    query=query,
                    key_cache=self.key_cache,
@@ -439,8 +422,7 @@ class AscendAttentionBackendImpl(AttentionImpl):
                    scale_value=self.scale,
                    block_table=attn_metadata.block_tables,
                    context_lens=attn_metadata.seq_lens,
-                    out=output,
-                    workspace=workspace)
+                    out=output)
                handle = torch.npu.graph_task_group_end(stream)
                graph_params.handles[num_tokens].append(handle)
            else: