[Bugfix] delete profile_run in model_runner (#5122)
### What this PR does / why we need it?
delete self.in_profile_run in model_runner to make EPLB work as expected
### Does this PR introduce _any_ user-facing change?
NO
### How was this patch tested?
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: zhenwenqi2024 <zhenwenqi_2022@qq.com>
Signed-off-by: Mengqing Cao <cmq0113@163.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
This commit is contained in:
@@ -65,7 +65,6 @@ class TestMtpProposer:
|
|||||||
runner.max_num_reqs = 256
|
runner.max_num_reqs = 256
|
||||||
runner._use_aclgraph.return_value = False
|
runner._use_aclgraph.return_value = False
|
||||||
runner.reserved_mc2_mask = None
|
runner.reserved_mc2_mask = None
|
||||||
runner.in_profile_run = False
|
|
||||||
return runner
|
return runner
|
||||||
|
|
||||||
@patch("vllm_ascend.spec_decode.mtp_proposer.CpuGpuBuffer")
|
@patch("vllm_ascend.spec_decode.mtp_proposer.CpuGpuBuffer")
|
||||||
|
|||||||
@@ -761,7 +761,6 @@ class MtpProposer(Proposer):
|
|||||||
num_tokens_across_dp=num_tokens_across_dp,
|
num_tokens_across_dp=num_tokens_across_dp,
|
||||||
aclgraph_runtime_mode=aclgraph_runtime_mode,
|
aclgraph_runtime_mode=aclgraph_runtime_mode,
|
||||||
batch_descriptor=batch_descriptor,
|
batch_descriptor=batch_descriptor,
|
||||||
in_profile_run=self.runner.in_profile_run,
|
|
||||||
num_actual_tokens=num_tokens,
|
num_actual_tokens=num_tokens,
|
||||||
is_mtp_model=True):
|
is_mtp_model=True):
|
||||||
with ProfileExecuteDuration().capture_async('mtp_forward'):
|
with ProfileExecuteDuration().capture_async('mtp_forward'):
|
||||||
|
|||||||
@@ -312,8 +312,6 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
|
|
||||||
self.use_aclgraph = self._use_aclgraph()
|
self.use_aclgraph = self._use_aclgraph()
|
||||||
|
|
||||||
# NOTE: we need to use `in_profile_run` to determine whether `enable_force_load_balance` is True
|
|
||||||
self.in_profile_run = False
|
|
||||||
self.dynamic_eplb = self.ascend_config.dynamic_eplb or self.ascend_config.expert_map_record_path
|
self.dynamic_eplb = self.ascend_config.dynamic_eplb or self.ascend_config.expert_map_record_path
|
||||||
if self.dynamic_eplb:
|
if self.dynamic_eplb:
|
||||||
EPLBParamUtils.check_dynamic_eplb(self.ascend_config.dynamic_eplb)
|
EPLBParamUtils.check_dynamic_eplb(self.ascend_config.dynamic_eplb)
|
||||||
@@ -432,7 +430,7 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
# To ensure skipping all_reduce across dp group is valid, we need to ensure that
|
# To ensure skipping all_reduce across dp group is valid, we need to ensure that
|
||||||
# moe_comm_method of each rank is MC2 and recomputation would never happen in D
|
# moe_comm_method of each rank is MC2 and recomputation would never happen in D
|
||||||
# nodes. So here we check whether recompute_scheduler_enable is True.
|
# nodes. So here we check whether recompute_scheduler_enable is True.
|
||||||
return self.is_kv_consumer and not self.in_profile_run and self.ascend_config.recompute_scheduler_enable and select_moe_comm_method(
|
return self.is_kv_consumer and self.ascend_config.recompute_scheduler_enable and select_moe_comm_method(
|
||||||
potential_max_num_tokens, self.vllm_config) == MoECommType.MC2
|
potential_max_num_tokens, self.vllm_config) == MoECommType.MC2
|
||||||
|
|
||||||
def _sync_metadata_across_dp(
|
def _sync_metadata_across_dp(
|
||||||
@@ -2028,7 +2026,7 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
dtype=np.int32)
|
dtype=np.int32)
|
||||||
num_sampled_tokens = np.ones(num_reqs, dtype=np.int32)
|
num_sampled_tokens = np.ones(num_reqs, dtype=np.int32)
|
||||||
|
|
||||||
if not self.in_profile_run and self.dynamic_eplb:
|
if not is_profile and self.dynamic_eplb:
|
||||||
self.eplb_updator.forward_before()
|
self.eplb_updator.forward_before()
|
||||||
|
|
||||||
has_lora = True if self.lora_config and self.compilation_config.cudagraph_specialize_lora else False
|
has_lora = True if self.lora_config and self.compilation_config.cudagraph_specialize_lora else False
|
||||||
@@ -2110,8 +2108,7 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
for k, v in self.intermediate_tensors.items()
|
for k, v in self.intermediate_tensors.items()
|
||||||
})
|
})
|
||||||
|
|
||||||
need_dummy_logits = (not self.in_profile_run
|
need_dummy_logits = (not is_profile and lmhead_tp_enable())
|
||||||
and lmhead_tp_enable())
|
|
||||||
max_num_reqs_across_dp = max_num_reqs * self.uniform_decode_query_len
|
max_num_reqs_across_dp = max_num_reqs * self.uniform_decode_query_len
|
||||||
dummy_indices = torch.zeros(max_num_reqs_across_dp,
|
dummy_indices = torch.zeros(max_num_reqs_across_dp,
|
||||||
dtype=torch.int32)
|
dtype=torch.int32)
|
||||||
@@ -2157,9 +2154,9 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
batch_descriptor=batch_descriptor,
|
batch_descriptor=batch_descriptor,
|
||||||
dummy_compute_logits=dummy_drafter_compute_logits,
|
dummy_compute_logits=dummy_drafter_compute_logits,
|
||||||
in_graph_capturing=not force_attention)
|
in_graph_capturing=not force_attention)
|
||||||
if self.in_profile_run and self.dynamic_eplb:
|
if is_profile and self.dynamic_eplb:
|
||||||
self.model.clear_all_moe_loads()
|
self.model.clear_all_moe_loads()
|
||||||
if not self.in_profile_run and self.dynamic_eplb:
|
if not is_profile and self.dynamic_eplb:
|
||||||
self.eplb_updator.take_update_info_from_eplb_process()
|
self.eplb_updator.take_update_info_from_eplb_process()
|
||||||
self.eplb_updator.forward_end()
|
self.eplb_updator.forward_end()
|
||||||
return hidden_states, hidden_states
|
return hidden_states, hidden_states
|
||||||
|
|||||||
Reference in New Issue
Block a user