[Bugfix] Fix in_profile_run in mtp_proposer dummy_run (#5165)
### What this PR does / why we need it?
This PR fixes a failure of `enable_force_load_balance` caused by the missing `in_profile_run` flag in `dummy_run` of the mtp_proposer.
### Does this PR introduce _any_ user-facing change?
N/A
### How was this patch tested?
Tested by CI.
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
---------
Signed-off-by: Zetong Li <slippersss@126.com>
```diff
@@ -229,7 +229,8 @@ class MtpProposer(Proposer):
                   num_tokens_across_dp=None,
                   aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
                   batch_descriptor=None,
-                  dummy_compute_logits=lambda hidden_states: None) -> None:
+                  dummy_compute_logits=lambda hidden_states: None,
+                  is_profile=False) -> None:
 
         (
             num_tokens,
@@ -299,7 +300,8 @@ class MtpProposer(Proposer):
                 num_actual_tokens=0,
                 aclgraph_runtime_mode=aclgraph_runtime_mode,
                 batch_descriptor=batch_descriptor,
-                is_mtp_model=True):
+                is_mtp_model=True,
+                in_profile_run=is_profile):
             if self.enable_shared_expert_dp:
                 positions = positions.unsqueeze(-1)
                 positions = torch.ops.vllm.maybe_pad_and_reduce(positions)
```