Speed up two-batch overlap when there are padding tokens (#6668)

Co-authored-by: Cheng Wan <54331508+ch-wan@users.noreply.github.com>
This commit is contained in:
fzyzcjy
2025-05-29 07:00:58 +08:00
committed by GitHub
parent ae6a5b2950
commit 31589e177e
2 changed files with 71 additions and 12 deletions

View File

@@ -454,6 +454,7 @@ class DeepseekV2MoE(nn.Module):
num_expert_group=self.num_expert_group,
correction_bias=self.correction_bias,
routed_scaling_factor=self.routed_scaling_factor,
num_token_non_padded=state.forward_batch.num_token_non_padded,
expert_location_dispatch_info=ExpertLocationDispatchInfo.init_new(
layer_id=self.layer_id,
),