[Refactor] Unify full-graph parameter update logic (#6041)

### What this PR does / why we need it?

**Refactor: Unify full-graph parameter update logic**

This PR consolidates the scattered full-graph parameter update logic
into a unified approach, improving code architecture and eliminating
duplication.

**Key improvements:**

1. **Unified interface** (see the sketch after this list)
   - Create `update_full_graph_params` as the single entry point for all full-graph updates
   - Replace multiple scattered update calls with one unified function
   - Remove ~50 lines of duplicated if-else logic across `model_runner_v1.py` and `eagle_proposer.py`

2. **Better architecture**
   - Move update logic to the respective Backend classes (`AscendAttentionBackend`, `AscendMLABackend`)
   - Each Backend manages its own parameter update logic internally
   - Simplify caller code to just dispatch to the appropriate Backend

3. **Cleaner parameter handling**
   - Remove unnecessary `pcp_size` and `dcp_size` parameter passing
   - Get parallel configuration directly from distributed groups
   - Consistent with how other parts of the codebase obtain these values

**Why we need it:**
- **Maintainability**: Future changes only need to be made in one place per Backend
- **Code quality**: Follows the DRY principle and the Single Responsibility Principle
- **Readability**: Cleaner, more intuitive code structure

### Does this PR introduce _any_ user-facing change?

**No.** This is a pure refactoring with no functional changes - same
behavior, cleaner code.

### How was this patch tested?

- All existing unit tests pass with updated mocks
- No new tests needed (pure refactoring, no behavior changes)
- CI validates correctness
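As an illustration of what the updated mocks cover, here is a hedged sketch of a dispatch test. The test name and import path are hypothetical; only the `update_full_graph_params` behavior is taken from the diff.

```python
# Hypothetical dispatch test (not the PR's actual test code).
from unittest.mock import MagicMock

from vllm_ascend.compilation.acl_graph import update_full_graph_params  # path assumed


def test_update_full_graph_params_dispatches_to_backend_impl():
    backend = MagicMock()
    impl_cls = backend.get_impl_cls.return_value

    update_full_graph_params(
        attn_backend=backend,
        update_stream="stream",
        forward_context="ctx",
        num_tokens=16,
        vllm_config="cfg",
    )

    # The unified entry point should resolve the impl class once and forward
    # all arguments, with the optional ones defaulting to None.
    backend.get_impl_cls.assert_called_once_with()
    impl_cls.update_graph_params.assert_called_once_with(
        "stream", "ctx", 16, "cfg", None, None)
```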

---

- vLLM version: v0.13.0

Signed-off-by: lico67373 <918688502@qq.com>
Co-authored-by: drslark <slarksblood@qq.com>
Co-authored-by: weijinqian0 <1184188277@qq.com>

@@ -8,7 +8,6 @@ from dataclasses import dataclass
from typing import Any
from unittest.mock import patch
import numpy as np
import torch
import torch_npu
import vllm.envs as envs
@@ -20,8 +19,6 @@ from vllm.forward_context import BatchDescriptor, get_forward_context
from vllm.logger import logger
from vllm.platforms import current_platform
from vllm_ascend.attention.utils import using_paged_attention
from ..utils import weak_ref_tensors
@@ -213,343 +210,24 @@ def weak_ref_workspaces(params):
        params.workspaces[num_tokens] = weak_ref_tensors(params.workspaces[num_tokens])


def _update_attn_pa_params(update_stream, forward_context, runtime_shape):
    graph_params = get_graph_params()
    # FIXME: Behold! We are using a temporary hack here to update the args
    # for each layer's attention op in the graph.
    with torch.npu.stream(update_stream):
        for key, param, handle, event in zip(
                forward_context.attn_metadata,
                graph_params.attn_params[runtime_shape],
                graph_params.handles[runtime_shape],
                graph_params.events[runtime_shape],
        ):
            (
                query,
                key_cache,
                value_cache,
                num_kv_heads,
                num_heads,
                scale,
                block_table,
                seq_lens,
                output,
            ) = param
            seq_lens = forward_context.attn_metadata[key].seq_lens

            torch.npu.graph_task_update_begin(update_stream, handle)
            torch_npu._npu_paged_attention(
                query=query,
                key_cache=key_cache,
                value_cache=value_cache,
                num_kv_heads=num_kv_heads,
                num_heads=num_heads,
                scale_value=scale,
                block_table=block_table,
                context_lens=seq_lens,
                out=output,
                workspace=graph_params.workspaces.get(runtime_shape),
            )
            torch.npu.graph_task_update_end(update_stream)

            event.record(update_stream)


def _update_attn_fia_params(update_stream, forward_context, runtime_shape, draft_attn_metadatas=None):
    if forward_context.is_draft_model:
        graph_params = get_draft_graph_params()
        attn_metadata = draft_attn_metadatas
        attn_keys = list(attn_metadata[0].keys())
    else:
        graph_params = get_graph_params()
        attn_metadata = forward_context.attn_metadata
        attn_keys = list(attn_metadata.keys())

    # For Qwen3-next, since the kv_cache_config has already categorized
    # linear_attn and self_attn, the attn_metadata is first arranged with
    # self_attn followed by linear_attn. Therefore, using zip directly
    # filters out the update operations for linear_attn.
    # TODO: We use a new variable `attn_keys` to ensure the loop count is
    # correct after get by `zip` because of the new structure of the attn_metadata
    # when running with the merged full eagle-graph. Should check it with Qwen3-next.
    num_layers = len(attn_keys)
    if num_layers == 0:
        return
    if forward_context.is_draft_model:
        attn_keys = attn_keys * (len(graph_params.attn_params[runtime_shape]) // num_layers)

    attn_count = 0
    with torch.npu.stream(update_stream):
        for key, param, handle, event in zip(
                attn_keys,
                graph_params.attn_params[runtime_shape],
                graph_params.handles[runtime_shape],
                graph_params.events[runtime_shape],
        ):
            (
                query,
                key_cache,
                value,
                block_tables,
                attn_mask,
                block_size,
                seq_lens,
                query_start_loc,
                num_kv_heads,
                num_heads,
                scale,
                attn_output,
                softmax_lse,
            ) = param
            if forward_context.is_draft_model:
                draft_step = attn_count // num_layers
                seq_lens = attn_metadata[draft_step][key].seq_lens_list
                actual_seq_lengths_q = attn_metadata[draft_step][key].actual_seq_lengths_q
                attn_count = attn_count + 1
            else:
                seq_lens = attn_metadata[key].seq_lens_list
                actual_seq_lengths_q = attn_metadata[key].actual_seq_lengths_q

            torch.npu.graph_task_update_begin(update_stream, handle)
            torch_npu.npu_fused_infer_attention_score.out(
                query=query,
                key=key_cache,
                value=value,
                block_table=block_tables,
                atten_mask=attn_mask,
                input_layout="TND",
                block_size=block_size,
                actual_seq_lengths=actual_seq_lengths_q,
                actual_seq_lengths_kv=seq_lens,
                num_key_value_heads=num_kv_heads,
                num_heads=num_heads,
                scale=scale,
                sparse_mode=3,
                workspace=graph_params.workspaces.get(runtime_shape),
                out=[attn_output, softmax_lse],
            )
            torch.npu.graph_task_update_end(update_stream)

            event.record(update_stream)


def update_attn_params(update_stream, forward_context, runtime_shape, vllm_config, draft_attn_metadatas=None):
    if using_paged_attention(runtime_shape, vllm_config):
        _update_attn_pa_params(update_stream, forward_context, runtime_shape)
    else:
        _update_attn_fia_params(update_stream, forward_context, runtime_shape, draft_attn_metadatas)


def update_mla_attn_params(update_stream, forward_context, runtime_shape, speculative_config):
    if forward_context.is_draft_model:
        graph_params = get_draft_graph_params()
    else:
        graph_params = get_graph_params()
    # FIXME: Behold! We are using a temporary hack here to update the args
    # for each layer's attention op in the graph.
    with torch.npu.stream(update_stream):
        for key, param, handle, event in zip(
                forward_context.attn_metadata,
                graph_params.attn_params[runtime_shape],
                graph_params.handles[runtime_shape],
                graph_params.events[runtime_shape],
        ):
            (
                q_nope,
                k_nope,
                q_pe,
                k_pe,
                num_heads,
                num_kv_heads,
                input_layout,
                attn_mask,
                sparse_mode,
                scale,
                block_table,
                block_size,
                seq_lens_list,
                actual_seq_lengths,
                attn_output,
                softmax_lse,
            ) = param
            seq_lens_list = forward_context.attn_metadata[key].decode.seq_lens_list
            if speculative_config and speculative_config.method == "mtp" and not forward_context.is_draft_model:
                actual_seq_lengths = forward_context.attn_metadata[key].decode.actual_seq_lengths_q
                spec_multiple = speculative_config.num_speculative_tokens + 1
                seq_lens_list = seq_lens_list + [0] * (runtime_shape // spec_multiple - len(seq_lens_list))
                actual_seq_lengths = [spec_multiple * (i + 1) for i in range(runtime_shape // spec_multiple)]
            elif forward_context.is_draft_model:
                actual_seq_lengths = forward_context.attn_metadata[key].decode.actual_seq_lengths_q
                block_table = forward_context.attn_metadata[key].decode.block_table
                # TODO: This is a hack and should be fixed in the future.
                if speculative_config.disable_padded_drafter_batch:
                    block_table = block_table[:len(actual_seq_lengths)]
                seq_lens_list = seq_lens_list + [0] * (len(actual_seq_lengths) - len(seq_lens_list))
            else:
                seq_lens_list = seq_lens_list + [0] * (runtime_shape - len(seq_lens_list))

            torch.npu.graph_task_update_begin(update_stream, handle)
            torch_npu.npu_fused_infer_attention_score.out(
                q_nope,
                k_nope,
                k_nope,
                query_rope=q_pe,
                key_rope=k_pe,
                num_heads=num_heads,
                num_key_value_heads=num_kv_heads,
                input_layout=input_layout,
                atten_mask=attn_mask,
                sparse_mode=sparse_mode,
                scale=scale,
                antiquant_mode=0,
                antiquant_scale=None,
                block_table=block_table,
                block_size=block_size,
                actual_seq_lengths_kv=seq_lens_list,
                actual_seq_lengths=actual_seq_lengths,
                workspace=graph_params.workspaces.get(runtime_shape),
                out=[attn_output, softmax_lse],
            )
            torch.npu.graph_task_update_end(update_stream)

            event.record(update_stream)


def update_attn_dcp_pcp_params(update_stream, forward_context, runtime_shape):
    # FIXME: Behold! We are using a temporary hack here to update the args
    # for each layer's attention op in the graph.
    graph_params = get_graph_params()
    with torch.npu.stream(update_stream):
        for key, param, handle, event in zip(
                forward_context.attn_metadata,
                graph_params.attn_params[runtime_shape],
                graph_params.handles[runtime_shape],
                graph_params.events[runtime_shape],
        ):
            (
                q_nope,
                k_nope,
                value,
                num_heads,
                num_kv_heads,
                scale,
                block_table,
                block_size,
                actual_seq_lengths_kv,
                actual_seq_lengths_q,
                attn_output,
                softmax_lse,
                dcp_size,
                pcp_rank,
                dcp_rank,
            ) = param
            attn_metadata = forward_context.attn_metadata[key]
            actual_seq_lengths_kv = attn_metadata.decode_meta.num_computed_tokens_of_pcp_dcp[:, pcp_rank, dcp_rank]
            pad_length = runtime_shape - len(actual_seq_lengths_kv)
            if pad_length > 0:
                pad_tensor = np.zeros(pad_length, dtype=actual_seq_lengths_kv.dtype)
                actual_seq_lengths_kv = np.concatenate([actual_seq_lengths_kv, pad_tensor])
            actual_seq_lengths_q = attn_metadata.actual_seq_lengths_q
            if dcp_size > 1:
                num_heads = num_heads * dcp_size

            torch.npu.graph_task_update_begin(update_stream, handle)
            torch_npu.npu_fused_infer_attention_score.out(
                q_nope,
                k_nope,
                value,
                num_heads=num_heads,
                num_key_value_heads=num_kv_heads,
                input_layout="TND",
                atten_mask=None,
                scale=scale,
                antiquant_mode=0,
                antiquant_scale=None,
                softmax_lse_flag=True,
                block_table=block_table,
                block_size=block_size,
                actual_seq_lengths_kv=actual_seq_lengths_kv,
                actual_seq_lengths=actual_seq_lengths_q,
                workspace=graph_params.workspaces.get(runtime_shape),
                out=[attn_output, softmax_lse],
            )
            torch.npu.graph_task_update_end(update_stream)

            event.record(update_stream)


def update_mla_attn_dcp_pcp_params(update_stream, forward_context, runtime_shape):
    if forward_context.is_draft_model:
        graph_params = get_draft_graph_params()
    else:
        graph_params = get_graph_params()
    # FIXME: Behold! We are using a temporary hack here to update the args
    # for each layer's attention op in the graph.
    with torch.npu.stream(update_stream):
        for key, param, handle, event in zip(
                forward_context.attn_metadata,
                graph_params.attn_params[runtime_shape],
                graph_params.handles[runtime_shape],
                graph_params.events[runtime_shape],
        ):
            (
                q_nope,
                k_nope,
                q_pe,
                k_pe,
                num_heads,
                num_kv_heads,
                input_layout,
                spec_attn_mask,
                sparse_mode,
                scale,
                block_table,
                block_size,
                actual_seq_lengths,
                actual_seq_lengths_kv,
                attn_output,
                softmax_lse,
            ) = param
            decode_meta = forward_context.attn_metadata[key].decode
            seq_len = decode_meta.cp_seq_len
            if isinstance(seq_len, torch.Tensor):
                seq_len = seq_len.tolist()
            actual_seq_lengths_kv = seq_len
            pad_length = runtime_shape - len(actual_seq_lengths_kv)
            if pad_length > 0:
                actual_seq_lengths_kv = actual_seq_lengths_kv + [0] * (runtime_shape - len(actual_seq_lengths_kv))

            torch.npu.graph_task_update_begin(update_stream, handle)
            torch_npu.npu_fused_infer_attention_score.out(
                q_nope,
                k_nope,
                k_nope,
                query_rope=q_pe,
                key_rope=k_pe,
                num_heads=num_heads,
                num_key_value_heads=num_kv_heads,
                input_layout=input_layout,
                atten_mask=spec_attn_mask,
                sparse_mode=sparse_mode,
                scale=scale,
                antiquant_mode=0,
                antiquant_scale=None,
                softmax_lse_flag=True,
                block_table=block_table,
                block_size=block_size,
                actual_seq_lengths_kv=actual_seq_lengths_kv,
                actual_seq_lengths=actual_seq_lengths,
                workspace=graph_params.workspaces.get(runtime_shape),
                out=[attn_output, softmax_lse],
            )
            torch.npu.graph_task_update_end(update_stream)

            event.record(update_stream)


def update_full_graph_params(
    attn_backend,
    update_stream,
    forward_context,
    num_tokens,
    vllm_config,
    speculative_config=None,
    num_dcp_pcp_tokens=None,
):
    impl_cls = attn_backend.get_impl_cls()
    impl_cls.update_graph_params(
        update_stream,
        forward_context,
        num_tokens,
        vllm_config,
        speculative_config,
        num_dcp_pcp_tokens,
    )


@dataclass