Speed up when having padding tokens in two-batch overlap (#6668)

Co-authored-by: Cheng Wan <54331508+ch-wan@users.noreply.github.com>
2025-05-29 07:00:58 +08:00
parent ae6a5b2950
commit 31589e177e
2 changed files with 71 additions and 12 deletions
--- a/python/sglang/srt/models/deepseek_v2.py
+++ b/python/sglang/srt/models/deepseek_v2.py
@@ -454,6 +454,7 @@ class DeepseekV2MoE(nn.Module):
                num_expert_group=self.num_expert_group,
                correction_bias=self.correction_bias,
                routed_scaling_factor=self.routed_scaling_factor,
+                num_token_non_padded=state.forward_batch.num_token_non_padded,
                expert_location_dispatch_info=ExpertLocationDispatchInfo.init_new(
                    layer_id=self.layer_id,
                ),