[Bugfix] delete profile_run in model_runner (#5122)
### What this PR does / why we need it?
delete self.in_profile_run in model_runner to make EPLB work as expected
### Does this PR introduce _any_ user-facing change?
NO
### How was this patch tested?
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: zhenwenqi2024 <zhenwenqi_2022@qq.com>
Signed-off-by: Mengqing Cao <cmq0113@163.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
This commit is contained in:
@@ -65,7 +65,6 @@ class TestMtpProposer:
|
|||||||
runner.max_num_reqs = 256
|
runner.max_num_reqs = 256
|
||||||
runner._use_aclgraph.return_value = False
|
runner._use_aclgraph.return_value = False
|
||||||
runner.reserved_mc2_mask = None
|
runner.reserved_mc2_mask = None
|
||||||
runner.in_profile_run = False
|
|
||||||
return runner
|
return runner
|
||||||
|
|
||||||
@patch("vllm_ascend.spec_decode.mtp_proposer.CpuGpuBuffer")
|
@patch("vllm_ascend.spec_decode.mtp_proposer.CpuGpuBuffer")
|
||||||
|
|||||||
@@ -761,7 +761,6 @@ class MtpProposer(Proposer):
|
|||||||
num_tokens_across_dp=num_tokens_across_dp,
|
num_tokens_across_dp=num_tokens_across_dp,
|
||||||
aclgraph_runtime_mode=aclgraph_runtime_mode,
|
aclgraph_runtime_mode=aclgraph_runtime_mode,
|
||||||
batch_descriptor=batch_descriptor,
|
batch_descriptor=batch_descriptor,
|
||||||
in_profile_run=self.runner.in_profile_run,
|
|
||||||
num_actual_tokens=num_tokens,
|
num_actual_tokens=num_tokens,
|
||||||
is_mtp_model=True):
|
is_mtp_model=True):
|
||||||
with ProfileExecuteDuration().capture_async('mtp_forward'):
|
with ProfileExecuteDuration().capture_async('mtp_forward'):
|
||||||
|
|||||||
@@ -312,8 +312,6 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
|
|
||||||
self.use_aclgraph = self._use_aclgraph()
|
self.use_aclgraph = self._use_aclgraph()
|
||||||
|
|
||||||
# NOTE: we need to use `in_profile_run` to determine whether `enable_force_load_balance` is True
|
|
||||||
self.in_profile_run = False
|
|
||||||
self.dynamic_eplb = self.ascend_config.dynamic_eplb or self.ascend_config.expert_map_record_path
|
self.dynamic_eplb = self.ascend_config.dynamic_eplb or self.ascend_config.expert_map_record_path
|
||||||
if self.dynamic_eplb:
|
if self.dynamic_eplb:
|
||||||
EPLBParamUtils.check_dynamic_eplb(self.ascend_config.dynamic_eplb)
|
EPLBParamUtils.check_dynamic_eplb(self.ascend_config.dynamic_eplb)
|
||||||
@@ -432,7 +430,7 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
# To ensure skipping all_reduce across dp group is valid, we need to ensure that
|
# To ensure skipping all_reduce across dp group is valid, we need to ensure that
|
||||||
# moe_comm_method of each rank is MC2 and recomputation would never happen in D
|
# moe_comm_method of each rank is MC2 and recomputation would never happen in D
|
||||||
# nodes. So here we check whether recompute_scheduler_enable is True.
|
# nodes. So here we check whether recompute_scheduler_enable is True.
|
||||||
return self.is_kv_consumer and not self.in_profile_run and self.ascend_config.recompute_scheduler_enable and select_moe_comm_method(
|
return self.is_kv_consumer and self.ascend_config.recompute_scheduler_enable and select_moe_comm_method(
|
||||||
potential_max_num_tokens, self.vllm_config) == MoECommType.MC2
|
potential_max_num_tokens, self.vllm_config) == MoECommType.MC2
|
||||||
|
|
||||||
def _sync_metadata_across_dp(
|
def _sync_metadata_across_dp(
|
||||||
@@ -2028,7 +2026,7 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
dtype=np.int32)
|
dtype=np.int32)
|
||||||
num_sampled_tokens = np.ones(num_reqs, dtype=np.int32)
|
num_sampled_tokens = np.ones(num_reqs, dtype=np.int32)
|
||||||
|
|
||||||
if not self.in_profile_run and self.dynamic_eplb:
|
if not is_profile and self.dynamic_eplb:
|
||||||
self.eplb_updator.forward_before()
|
self.eplb_updator.forward_before()
|
||||||
|
|
||||||
has_lora = True if self.lora_config and self.compilation_config.cudagraph_specialize_lora else False
|
has_lora = True if self.lora_config and self.compilation_config.cudagraph_specialize_lora else False
|
||||||
@@ -2110,8 +2108,7 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
for k, v in self.intermediate_tensors.items()
|
for k, v in self.intermediate_tensors.items()
|
||||||
})
|
})
|
||||||
|
|
||||||
need_dummy_logits = (not self.in_profile_run
|
need_dummy_logits = (not is_profile and lmhead_tp_enable())
|
||||||
and lmhead_tp_enable())
|
|
||||||
max_num_reqs_across_dp = max_num_reqs * self.uniform_decode_query_len
|
max_num_reqs_across_dp = max_num_reqs * self.uniform_decode_query_len
|
||||||
dummy_indices = torch.zeros(max_num_reqs_across_dp,
|
dummy_indices = torch.zeros(max_num_reqs_across_dp,
|
||||||
dtype=torch.int32)
|
dtype=torch.int32)
|
||||||
@@ -2157,9 +2154,9 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
batch_descriptor=batch_descriptor,
|
batch_descriptor=batch_descriptor,
|
||||||
dummy_compute_logits=dummy_drafter_compute_logits,
|
dummy_compute_logits=dummy_drafter_compute_logits,
|
||||||
in_graph_capturing=not force_attention)
|
in_graph_capturing=not force_attention)
|
||||||
if self.in_profile_run and self.dynamic_eplb:
|
if is_profile and self.dynamic_eplb:
|
||||||
self.model.clear_all_moe_loads()
|
self.model.clear_all_moe_loads()
|
||||||
if not self.in_profile_run and self.dynamic_eplb:
|
if not is_profile and self.dynamic_eplb:
|
||||||
self.eplb_updator.take_update_info_from_eplb_process()
|
self.eplb_updator.take_update_info_from_eplb_process()
|
||||||
self.eplb_updator.forward_end()
|
self.eplb_updator.forward_end()
|
||||||
return hidden_states, hidden_states
|
return hidden_states, hidden_states
|
||||||
|
|||||||
Reference in New Issue
Block a user