[Bugfix] delete profile_run in model_runner (#5122)

### What this PR does / why we need it?
Delete `self.in_profile_run` in model_runner so that EPLB works as expected.
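
The change boils down to replacing a mutable runner flag with an explicit per-call argument. Below is a minimal sketch of that pattern, with simplified stand-in names (`Runner`, `dummy_run`, and the `print` placeholders are illustrative, not the actual vllm-ascend APIs; only the `is_profile`/`dynamic_eplb` gating mirrors the hunks further down):

```python
class Runner:
    """Illustrative stand-in for the NPU model runner (assumed names)."""

    def __init__(self, dynamic_eplb: bool) -> None:
        self.dynamic_eplb = dynamic_eplb
        # Before this PR, a `self.in_profile_run` flag lived here and had to
        # be toggled around every profile run; stale state disabled EPLB.

    def dummy_run(self, num_tokens: int, is_profile: bool = False) -> None:
        # EPLB bookkeeping applies only to real (non-profile) dummy runs.
        if not is_profile and self.dynamic_eplb:
            print("EPLB: forward_before()")
        print(f"dummy forward over {num_tokens} tokens")
        # Profile runs must not pollute the recorded MoE expert loads.
        if is_profile and self.dynamic_eplb:
            print("EPLB: clear_all_moe_loads()")


runner = Runner(dynamic_eplb=True)
runner.dummy_run(8, is_profile=True)  # memory profiling
runner.dummy_run(8)                   # normal dummy run, EPLB hooks fire
```

With the flag gone there is no state to forget to reset between profile and normal dummy runs; the caller decides per call whether EPLB bookkeeping should happen.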
### Does this PR introduce _any_ user-facing change?
NO
### How was this patch tested?

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

---------

Signed-off-by: zhenwenqi2024 <zhenwenqi_2022@qq.com>
Signed-off-by: Mengqing Cao <cmq0113@163.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
Author: zhenwenqi2024
Date: 2025-12-17 23:48:34 +08:00
Committed by: GitHub
Parent: 98e6e57622
Commit: 950570f8d1

3 changed files with 5 additions and 10 deletions


@@ -65,7 +65,6 @@ class TestMtpProposer:
         runner.max_num_reqs = 256
         runner._use_aclgraph.return_value = False
         runner.reserved_mc2_mask = None
-        runner.in_profile_run = False
         return runner
 
     @patch("vllm_ascend.spec_decode.mtp_proposer.CpuGpuBuffer")


@@ -761,7 +761,6 @@ class MtpProposer(Proposer):
                 num_tokens_across_dp=num_tokens_across_dp,
                 aclgraph_runtime_mode=aclgraph_runtime_mode,
                 batch_descriptor=batch_descriptor,
-                in_profile_run=self.runner.in_profile_run,
                 num_actual_tokens=num_tokens,
                 is_mtp_model=True):
             with ProfileExecuteDuration().capture_async('mtp_forward'):


@@ -312,8 +312,6 @@ class NPUModelRunner(GPUModelRunner):
         self.use_aclgraph = self._use_aclgraph()
-        # NOTE: we need to use `in_profile_run` to determine whether `enable_force_load_balance` is True
-        self.in_profile_run = False
         self.dynamic_eplb = self.ascend_config.dynamic_eplb or self.ascend_config.expert_map_record_path
         if self.dynamic_eplb:
             EPLBParamUtils.check_dynamic_eplb(self.ascend_config.dynamic_eplb)
@@ -432,7 +430,7 @@ class NPUModelRunner(GPUModelRunner):
         # To ensure skipping all_reduce across dp group is valid, we need to ensure that
         # moe_comm_method of each rank is MC2 and recomputation would never happen in D
         # nodes. So here we check whether recompute_scheduler_enable is True.
-        return self.is_kv_consumer and not self.in_profile_run and self.ascend_config.recompute_scheduler_enable and select_moe_comm_method(
+        return self.is_kv_consumer and self.ascend_config.recompute_scheduler_enable and select_moe_comm_method(
             potential_max_num_tokens, self.vllm_config) == MoECommType.MC2
 
     def _sync_metadata_across_dp(
@@ -2028,7 +2026,7 @@ class NPUModelRunner(GPUModelRunner):
                                  dtype=np.int32)
         num_sampled_tokens = np.ones(num_reqs, dtype=np.int32)
-        if not self.in_profile_run and self.dynamic_eplb:
+        if not is_profile and self.dynamic_eplb:
             self.eplb_updator.forward_before()
 
         has_lora = True if self.lora_config and self.compilation_config.cudagraph_specialize_lora else False
@@ -2110,8 +2108,7 @@ class NPUModelRunner(GPUModelRunner):
                 for k, v in self.intermediate_tensors.items()
             })
 
-        need_dummy_logits = (not self.in_profile_run
-                             and lmhead_tp_enable())
+        need_dummy_logits = (not is_profile and lmhead_tp_enable())
         max_num_reqs_across_dp = max_num_reqs * self.uniform_decode_query_len
         dummy_indices = torch.zeros(max_num_reqs_across_dp,
                                     dtype=torch.int32)
@@ -2157,9 +2154,9 @@ class NPUModelRunner(GPUModelRunner):
             batch_descriptor=batch_descriptor,
             dummy_compute_logits=dummy_drafter_compute_logits,
             in_graph_capturing=not force_attention)
-        if self.in_profile_run and self.dynamic_eplb:
+        if is_profile and self.dynamic_eplb:
             self.model.clear_all_moe_loads()
-        if not self.in_profile_run and self.dynamic_eplb:
+        if not is_profile and self.dynamic_eplb:
             self.eplb_updator.take_update_info_from_eplb_process()
             self.eplb_updator.forward_end()
         return hidden_states, hidden_states
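
As a closing illustration, here is a hypothetical test sketch (the `run_eplb_hooks` helper and all names are invented for illustration, not the repository's actual test) of why the fixture line `runner.in_profile_run = False` could be dropped above: both EPLB branches are now selected by the `is_profile` argument, so a mocked runner needs no pre-set state.

```python
from unittest.mock import MagicMock

def run_eplb_hooks(runner, is_profile: bool) -> None:
    # Mirrors the gating shown in the hunks above (extracted here only for
    # the sake of a self-contained example).
    if not is_profile and runner.dynamic_eplb:
        runner.eplb_updator.forward_before()
    if is_profile and runner.dynamic_eplb:
        runner.model.clear_all_moe_loads()

runner = MagicMock()
runner.dynamic_eplb = True

run_eplb_hooks(runner, is_profile=False)
runner.eplb_updator.forward_before.assert_called_once()
runner.model.clear_all_moe_loads.assert_not_called()

run_eplb_hooks(runner, is_profile=True)
runner.model.clear_all_moe_loads.assert_called_once()
```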