Performing Vocabulary Parallelism for LM Head across Attention TP Groups (#5558)

Co-authored-by: liusy58 <liusy58@linux.alibaba.com>
2025-05-12 02:36:29 -04:00
parent 9f2c9568f0
commit 25c83fff6a
8 changed files with 71 additions and 23 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -159,6 +159,7 @@ class ServerArgs:
    disable_overlap_schedule: bool = False
    enable_mixed_chunk: bool = False
    enable_dp_attention: bool = False
+    enable_dp_lm_head: bool = False
    enable_ep_moe: bool = False
    enable_deepep_moe: bool = False
    deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
@@ -323,6 +324,11 @@ class ServerArgs:
                f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
            )

+        if self.enable_dp_lm_head:  # enable_dp_lm_head requires enable_dp_attention
+            assert (
+                self.enable_dp_attention
+            ), "Please enable dp attention when setting enable_dp_lm_head. "
+
        # DeepEP MoE
        self.enable_sp_layernorm = False
        if self.enable_deepep_moe:
@@ -1055,6 +1061,11 @@ class ServerArgs:
            action="store_true",
            help="Enabling data parallelism for attention and tensor parallelism for FFN. The dp size should be equal to the tp size. Currently only DeepSeek-V2 is supported.",
        )
+        parser.add_argument(
+            "--enable-dp-lm-head",
+            action="store_true",
+            help="Enable vocabulary parallel across the attention TP group to avoid all-gather across DP groups, optimizing performance under DP attention.",
+        )
        parser.add_argument(
            "--enable-ep-moe",
            action="store_true",