[Fix] DeepEP Compatibility with Low Latency (#5068)

Co-authored-by: ch-wan <cwan39@gatech.edu>
2025-04-09 11:31:31 +08:00
parent aac531c53b
commit bc3f6db2dd
4 changed files with 146 additions and 118 deletions
--- a/python/sglang/srt/models/deepseek_v2.py
+++ b/python/sglang/srt/models/deepseek_v2.py
@@ -324,6 +324,7 @@ class DeepseekV2MoE(nn.Module):
                correction_bias=self.correction_bias,
            )
        if self.ep_size > 1:
+            # TODO(ch-wan): allow users to configure the num_max_dispatch_tokens_per_rank value
            (
                hidden_states,
                topk_idx,
@@ -336,7 +337,6 @@ class DeepseekV2MoE(nn.Module):
                hidden_states,
                topk_idx,
                topk_weights,
-                self.num_experts,
                forward_mode=forward_mode,
            )
        final_hidden_states = (