From 950570f8d122603b40a4ee3b9a30380d2afc1907 Mon Sep 17 00:00:00 2001 From: zhenwenqi2024 <155598497+zhenwenqi2024@users.noreply.github.com> Date: Wed, 17 Dec 2025 23:48:34 +0800 Subject: [PATCH] [Bugfix] delete profile_run in model_runner (#5122) ### What this PR does / why we need it? delete self.in_profile_run in model_runner to make EPLB work as expected ### Does this PR introduce _any_ user-facing change? NO ### How was this patch tested? - vLLM version: v0.12.0 - vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9 --------- Signed-off-by: zhenwenqi2024 Signed-off-by: Mengqing Cao Co-authored-by: Mengqing Cao --- tests/ut/spec_decode/test_mtp_proposer.py | 1 - vllm_ascend/spec_decode/mtp_proposer.py | 1 - vllm_ascend/worker/model_runner_v1.py | 13 +++++-------- 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/tests/ut/spec_decode/test_mtp_proposer.py b/tests/ut/spec_decode/test_mtp_proposer.py index 08ef8a68..c52ef569 100644 --- a/tests/ut/spec_decode/test_mtp_proposer.py +++ b/tests/ut/spec_decode/test_mtp_proposer.py @@ -65,7 +65,6 @@ class TestMtpProposer: runner.max_num_reqs = 256 runner._use_aclgraph.return_value = False runner.reserved_mc2_mask = None - runner.in_profile_run = False return runner @patch("vllm_ascend.spec_decode.mtp_proposer.CpuGpuBuffer") diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py index cd4b9a04..b20b73d7 100644 --- a/vllm_ascend/spec_decode/mtp_proposer.py +++ b/vllm_ascend/spec_decode/mtp_proposer.py @@ -761,7 +761,6 @@ class MtpProposer(Proposer): num_tokens_across_dp=num_tokens_across_dp, aclgraph_runtime_mode=aclgraph_runtime_mode, batch_descriptor=batch_descriptor, - in_profile_run=self.runner.in_profile_run, num_actual_tokens=num_tokens, is_mtp_model=True): with ProfileExecuteDuration().capture_async('mtp_forward'): diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 
ffc2ee04..4c3c42d9 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -312,8 +312,6 @@ class NPUModelRunner(GPUModelRunner): self.use_aclgraph = self._use_aclgraph() - # NOTE: we need to use `in_profile_run` to determine whether `enable_force_load_balance` is True - self.in_profile_run = False self.dynamic_eplb = self.ascend_config.dynamic_eplb or self.ascend_config.expert_map_record_path if self.dynamic_eplb: EPLBParamUtils.check_dynamic_eplb(self.ascend_config.dynamic_eplb) @@ -432,7 +430,7 @@ class NPUModelRunner(GPUModelRunner): # To ensure skipping all_reduce across dp group is valid, we need to ensure that # moe_comm_method of each rank is MC2 and recomputation would never happen in D # nodes. So here we check whether recompute_scheduler_enable is True. - return self.is_kv_consumer and not self.in_profile_run and self.ascend_config.recompute_scheduler_enable and select_moe_comm_method( + return self.is_kv_consumer and self.ascend_config.recompute_scheduler_enable and select_moe_comm_method( potential_max_num_tokens, self.vllm_config) == MoECommType.MC2 def _sync_metadata_across_dp( @@ -2028,7 +2026,7 @@ class NPUModelRunner(GPUModelRunner): dtype=np.int32) num_sampled_tokens = np.ones(num_reqs, dtype=np.int32) - if not self.in_profile_run and self.dynamic_eplb: + if not is_profile and self.dynamic_eplb: self.eplb_updator.forward_before() has_lora = True if self.lora_config and self.compilation_config.cudagraph_specialize_lora else False @@ -2110,8 +2108,7 @@ class NPUModelRunner(GPUModelRunner): for k, v in self.intermediate_tensors.items() }) - need_dummy_logits = (not self.in_profile_run - and lmhead_tp_enable()) + need_dummy_logits = (not is_profile and lmhead_tp_enable()) max_num_reqs_across_dp = max_num_reqs * self.uniform_decode_query_len dummy_indices = torch.zeros(max_num_reqs_across_dp, dtype=torch.int32) @@ -2157,9 +2154,9 @@ class NPUModelRunner(GPUModelRunner): batch_descriptor=batch_descriptor, 
dummy_compute_logits=dummy_drafter_compute_logits, in_graph_capturing=not force_attention) - if self.in_profile_run and self.dynamic_eplb: + if is_profile and self.dynamic_eplb: self.model.clear_all_moe_loads() - if not self.in_profile_run and self.dynamic_eplb: + if not is_profile and self.dynamic_eplb: self.eplb_updator.take_update_info_from_eplb_process() self.eplb_updator.forward_end() return hidden_states, hidden_states