[Bugfix] Fix in_profile_run in mtp_proposer dummy_run (#5165)
### What this PR does / why we need it?
This PR fixes a failure of `enable_force_load_balance` caused by the
missing `in_profile_run` flag in the `dummy_run` method of the MTP proposer.
### Does this PR introduce _any_ user-facing change?
N/A
### How was this patch tested?
Verified by CI.
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: Zetong Li <slippersss@126.com>
This commit is contained in:
@@ -143,7 +143,8 @@ class EagleProposer(Proposer):
|
||||
num_tokens_across_dp: Optional[torch.Tensor] = None,
|
||||
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
|
||||
batch_descriptor=None,
|
||||
dummy_compute_logits=lambda hidden_states: None):
|
||||
dummy_compute_logits=lambda hidden_states: None,
|
||||
is_profile=False):
|
||||
# update global cos, sin
|
||||
update_cos_sin(self.positions[:num_tokens])
|
||||
|
||||
|
||||
@@ -229,7 +229,8 @@ class MtpProposer(Proposer):
|
||||
num_tokens_across_dp=None,
|
||||
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
|
||||
batch_descriptor=None,
|
||||
dummy_compute_logits=lambda hidden_states: None) -> None:
|
||||
dummy_compute_logits=lambda hidden_states: None,
|
||||
is_profile=False) -> None:
|
||||
|
||||
(
|
||||
num_tokens,
|
||||
@@ -299,7 +300,8 @@ class MtpProposer(Proposer):
|
||||
num_actual_tokens=0,
|
||||
aclgraph_runtime_mode=aclgraph_runtime_mode,
|
||||
batch_descriptor=batch_descriptor,
|
||||
is_mtp_model=True):
|
||||
is_mtp_model=True,
|
||||
in_profile_run=is_profile):
|
||||
if self.enable_shared_expert_dp:
|
||||
positions = positions.unsqueeze(-1)
|
||||
positions = torch.ops.vllm.maybe_pad_and_reduce(positions)
|
||||
|
||||
@@ -27,7 +27,8 @@ class NgramProposer(VllmNgramProposer, Proposer):
|
||||
num_tokens_across_dp=None,
|
||||
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
|
||||
batch_descriptor=None,
|
||||
dummy_compute_logits=lambda hidden_states: None):
|
||||
dummy_compute_logits=lambda hidden_states: None,
|
||||
is_profile=False):
|
||||
pass
|
||||
|
||||
def generate_token_ids(self,
|
||||
|
||||
@@ -27,7 +27,8 @@ class SuffixDecodingProposer(VllmSuffixDecodingProposer, Proposer):
|
||||
num_tokens_across_dp=None,
|
||||
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
|
||||
batch_descriptor=None,
|
||||
dummy_compute_logits=lambda hidden_states: None):
|
||||
dummy_compute_logits=lambda hidden_states: None,
|
||||
is_profile=False):
|
||||
pass
|
||||
|
||||
def generate_token_ids(self,
|
||||
|
||||
@@ -2164,7 +2164,8 @@ class NPUModelRunner(GPUModelRunner):
|
||||
aclgraph_runtime_mode=aclgraph_runtime_mode,
|
||||
batch_descriptor=batch_descriptor,
|
||||
dummy_compute_logits=dummy_drafter_compute_logits,
|
||||
in_graph_capturing=not force_attention)
|
||||
in_graph_capturing=not force_attention,
|
||||
is_profile=is_profile)
|
||||
if is_profile and self.dynamic_eplb:
|
||||
self.model.clear_all_moe_loads()
|
||||
if not is_profile and self.dynamic_eplb:
|
||||
|
||||
Reference in New Issue
Block a user