Reapply "[Refactor] Unify full-graph parameter update logic (#6041)" (#6227) (#6231)

This reverts commit 95649344aa. The CI failure isn't related to this change, so let's reapply it. - vLLM version: v0.14.0 - vLLM main: d68209402d
2026-01-26 09:04:54 +08:00
parent c38c838d03
commit 4e3919e965
10 changed files with 420 additions and 415 deletions
--- a/vllm_ascend/spec_decode/eagle_proposer.py
+++ b/vllm_ascend/spec_decode/eagle_proposer.py
@@ -36,10 +36,7 @@ from vllm_ascend.attention.attention_mask import AttentionMaskBuilder
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
 from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
 from vllm_ascend.compilation.acl_graph import (ACLGraphWrapper,
-                                               update_attn_dcp_pcp_params,
-                                               update_attn_params,
-                                               update_mla_attn_dcp_pcp_params,
-                                               update_mla_attn_params)
+                                               update_full_graph_params)
 from vllm_ascend.ops.rotary_embedding import update_cos_sin
 from vllm_ascend.ops.triton.spec_decode.utils import \
    prepare_inputs_padded_kernel
@@ -1181,21 +1178,9 @@ class EagleProposer(VllmEagleProposer):

    # update full-graph params for one spec token
    def _update_full_graph_params(self, forward_context, num_tokens, draft_attn_metadatas=None):
-        if self.vllm_config.model_config.use_mla:
-            if self.pcp_size * self.dcp_size > 1:
-                update_mla_attn_dcp_pcp_params(self.update_stream,
-                                               forward_context, num_tokens)
-            else:
-                update_mla_attn_params(self.update_stream, forward_context,
-                                       num_tokens,
-                                       self.vllm_config.speculative_config)
-        else:
-            if self.pcp_size * self.dcp_size > 1:
-                update_attn_dcp_pcp_params(self.update_stream, forward_context,
-                                           num_tokens)
-            else:
-                update_attn_params(self.update_stream, forward_context,
-                                   num_tokens, self.vllm_config, draft_attn_metadatas)
+        update_full_graph_params(
+            self.runner.attn_backend, self.update_stream, forward_context, num_tokens,
+            self.vllm_config, self.vllm_config.speculative_config)

    # padding tensor into desired size
    def _pad_tensor(self, tensor, pad_size):