From 2304218f900cb621e53963f1d3d31cdff1eb96f7 Mon Sep 17 00:00:00 2001 From: Zetong Li <48438720+slippersss@users.noreply.github.com> Date: Thu, 18 Dec 2025 22:27:47 +0800 Subject: [PATCH] [Bugfix] Fix in_profile_run in mtp_proposer dummy_run (#5165) ### What this PR does / why we need it? This PR aims to fix a failure of `enable_force_load_balance` caused by missing `in_profile_run` in `dummy_run` of mtp_proposer. ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? by CI - vLLM version: v0.12.0 - vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9 --------- Signed-off-by: Zetong Li --- vllm_ascend/spec_decode/eagle_proposer.py | 3 ++- vllm_ascend/spec_decode/mtp_proposer.py | 6 ++++-- vllm_ascend/spec_decode/ngram_proposer.py | 3 ++- vllm_ascend/spec_decode/suffix_proposer.py | 3 ++- vllm_ascend/worker/model_runner_v1.py | 3 ++- 5 files changed, 12 insertions(+), 6 deletions(-) diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py index 2eef568a..0518aa4a 100644 --- a/vllm_ascend/spec_decode/eagle_proposer.py +++ b/vllm_ascend/spec_decode/eagle_proposer.py @@ -143,7 +143,8 @@ class EagleProposer(Proposer): num_tokens_across_dp: Optional[torch.Tensor] = None, aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE, batch_descriptor=None, - dummy_compute_logits=lambda hidden_states: None): + dummy_compute_logits=lambda hidden_states: None, + is_profile=False): # update global cos, sin update_cos_sin(self.positions[:num_tokens]) diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py index 8bd45bbe..4deef9a2 100644 --- a/vllm_ascend/spec_decode/mtp_proposer.py +++ b/vllm_ascend/spec_decode/mtp_proposer.py @@ -229,7 +229,8 @@ class MtpProposer(Proposer): num_tokens_across_dp=None, aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE, batch_descriptor=None, - dummy_compute_logits=lambda hidden_states: 
None) -> None: + dummy_compute_logits=lambda hidden_states: None, + is_profile=False) -> None: ( num_tokens, @@ -299,7 +300,8 @@ class MtpProposer(Proposer): num_actual_tokens=0, aclgraph_runtime_mode=aclgraph_runtime_mode, batch_descriptor=batch_descriptor, - is_mtp_model=True): + is_mtp_model=True, + in_profile_run=is_profile): if self.enable_shared_expert_dp: positions = positions.unsqueeze(-1) positions = torch.ops.vllm.maybe_pad_and_reduce(positions) diff --git a/vllm_ascend/spec_decode/ngram_proposer.py b/vllm_ascend/spec_decode/ngram_proposer.py index c5f498c8..22d28b61 100644 --- a/vllm_ascend/spec_decode/ngram_proposer.py +++ b/vllm_ascend/spec_decode/ngram_proposer.py @@ -27,7 +27,8 @@ class NgramProposer(VllmNgramProposer, Proposer): num_tokens_across_dp=None, aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE, batch_descriptor=None, - dummy_compute_logits=lambda hidden_states: None): + dummy_compute_logits=lambda hidden_states: None, + is_profile=False): pass def generate_token_ids(self, diff --git a/vllm_ascend/spec_decode/suffix_proposer.py b/vllm_ascend/spec_decode/suffix_proposer.py index 920b3d4a..ea9f0f72 100644 --- a/vllm_ascend/spec_decode/suffix_proposer.py +++ b/vllm_ascend/spec_decode/suffix_proposer.py @@ -27,7 +27,8 @@ class SuffixDecodingProposer(VllmSuffixDecodingProposer, Proposer): num_tokens_across_dp=None, aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE, batch_descriptor=None, - dummy_compute_logits=lambda hidden_states: None): + dummy_compute_logits=lambda hidden_states: None, + is_profile=False): pass def generate_token_ids(self, diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index ba69cff1..8ef38d4f 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -2164,7 +2164,8 @@ class NPUModelRunner(GPUModelRunner): aclgraph_runtime_mode=aclgraph_runtime_mode, batch_descriptor=batch_descriptor, 
dummy_compute_logits=dummy_drafter_compute_logits, - in_graph_capturing=not force_attention) + in_graph_capturing=not force_attention, + is_profile=is_profile) if is_profile and self.dynamic_eplb: self.model.clear_all_moe_loads() if not is_profile and self.dynamic_eplb: