From 5e0ada539515eacd6c873ed7eb95772f677a8889 Mon Sep 17 00:00:00 2001 From: MengLong Chen <71744434+dragondream-chen@users.noreply.github.com> Date: Tue, 16 Dec 2025 09:14:05 +0800 Subject: [PATCH] [Bugfix] Fix the attn_metadata is None (#5038) ### What this PR does / why we need it? Fix the bug "TypeError: 'NoneType' object is not iterable" in vllm_ascend/compilation/acl_graph.py The reason is that attn_metadata is None in the dummy_run of MTP. - vLLM version: v0.12.0 - vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9 Signed-off-by: chenmenglong --- vllm_ascend/spec_decode/eagle_proposer.py | 2 +- vllm_ascend/spec_decode/interface.py | 2 +- vllm_ascend/spec_decode/mtp_proposer.py | 8 +++----- vllm_ascend/spec_decode/ngram_proposer.py | 2 +- vllm_ascend/spec_decode/suffix_proposer.py | 2 +- vllm_ascend/worker/model_runner_v1.py | 2 +- 6 files changed, 8 insertions(+), 10 deletions(-) diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py index 50ca4cc4..266eadca 100644 --- a/vllm_ascend/spec_decode/eagle_proposer.py +++ b/vllm_ascend/spec_decode/eagle_proposer.py @@ -117,7 +117,7 @@ class EagleProposer(Proposer): def dummy_run(self, num_tokens: int, with_prefill: bool = False, - skip_attn: bool = False, + in_graph_capturing: bool = False, num_reqs: int = 0, num_tokens_across_dp: Optional[torch.Tensor] = None, aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE, diff --git a/vllm_ascend/spec_decode/interface.py b/vllm_ascend/spec_decode/interface.py index 8036d806..f7f92ddb 100644 --- a/vllm_ascend/spec_decode/interface.py +++ b/vllm_ascend/spec_decode/interface.py @@ -32,7 +32,7 @@ class Proposer: def dummy_run(self, num_tokens: int, with_prefill: bool = False, - skip_attn: bool = False, + in_graph_capturing: bool = False, num_reqs: int = 0, num_tokens_across_dp: Optional[torch.Tensor] = None, aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE, diff 
--git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py index ce84a55b..253f0ef4 100644 --- a/vllm_ascend/spec_decode/mtp_proposer.py +++ b/vllm_ascend/spec_decode/mtp_proposer.py @@ -223,7 +223,7 @@ class MtpProposer(Proposer): def dummy_run(self, num_tokens: int, with_prefill: bool = False, - skip_attn: bool = False, + in_graph_capturing: bool = False, num_reqs: int = 0, num_tokens_across_dp=None, aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE, @@ -247,9 +247,7 @@ class MtpProposer(Proposer): moe_comm_type = (MoECommType.ALLTOALL if moe_comm_type == MoECommType.FUSED_ALLTOALL else moe_comm_type) - if skip_attn: - attn_metadata = None - elif aclgraph_runtime_mode == CUDAGraphMode.FULL: + if aclgraph_runtime_mode == CUDAGraphMode.FULL: if len(self.runner.attn_groups) > 0: num_computed_tokens_cpu = ( self.runner.input_batch. @@ -294,7 +292,7 @@ class MtpProposer(Proposer): positions = self.positions[:num_tokens] previous_hidden_states = self.hidden_states[:num_tokens] for i in range(self.num_speculative_tokens): - if i > 0 and not skip_attn and aclgraph_runtime_mode == CUDAGraphMode.FULL: + if i > 0 and not in_graph_capturing and aclgraph_runtime_mode == CUDAGraphMode.FULL: aclgraph_runtime_mode = CUDAGraphMode.NONE with set_ascend_forward_context( attn_metadata, diff --git a/vllm_ascend/spec_decode/ngram_proposer.py b/vllm_ascend/spec_decode/ngram_proposer.py index cdda0c63..c5f498c8 100644 --- a/vllm_ascend/spec_decode/ngram_proposer.py +++ b/vllm_ascend/spec_decode/ngram_proposer.py @@ -22,7 +22,7 @@ class NgramProposer(VllmNgramProposer, Proposer): def dummy_run(self, num_tokens, with_prefill=None, - skip_attn=None, + in_graph_capturing=None, num_reqs=None, num_tokens_across_dp=None, aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE, diff --git a/vllm_ascend/spec_decode/suffix_proposer.py b/vllm_ascend/spec_decode/suffix_proposer.py index b748953d..920b3d4a 100644 --- 
a/vllm_ascend/spec_decode/suffix_proposer.py +++ b/vllm_ascend/spec_decode/suffix_proposer.py @@ -22,7 +22,7 @@ class SuffixDecodingProposer(VllmSuffixDecodingProposer, Proposer): def dummy_run(self, num_tokens, with_prefill=None, - skip_attn=None, + in_graph_capturing=None, num_reqs=None, num_tokens_across_dp=None, aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE, diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 39745da5..5af8af4e 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -2296,7 +2296,7 @@ class NPUModelRunner(GPUModelRunner): aclgraph_runtime_mode=aclgraph_runtime_mode, batch_descriptor=batch_descriptor, dummy_compute_logits=dummy_drafter_compute_logits, - skip_attn=not force_attention) + in_graph_capturing=not force_attention) if self.in_profile_run and self.dynamic_eplb: self.model.clear_all_moe_loads() if not self.in_profile_run and self.dynamic_eplb: