Performing Vocabulary Parallelism for LM Head across Attention TP Groups (#5558)
Co-authored-by: liusy58 <liusy58@linux.alibaba.com>
This commit is contained in:
@@ -159,6 +159,7 @@ class ServerArgs:
|
||||
disable_overlap_schedule: bool = False
|
||||
enable_mixed_chunk: bool = False
|
||||
enable_dp_attention: bool = False
|
||||
enable_dp_lm_head: bool = False
|
||||
enable_ep_moe: bool = False
|
||||
enable_deepep_moe: bool = False
|
||||
deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
|
||||
@@ -323,6 +324,11 @@ class ServerArgs:
|
||||
f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
|
||||
)
|
||||
|
||||
if self.enable_dp_lm_head:
|
||||
assert (
|
||||
self.enable_dp_attention
|
||||
), "Please enable dp attention when setting enable_dp_lm_head. "
|
||||
|
||||
# DeepEP MoE
|
||||
self.enable_sp_layernorm = False
|
||||
if self.enable_deepep_moe:
|
||||
@@ -1055,6 +1061,11 @@ class ServerArgs:
|
||||
action="store_true",
|
||||
help="Enabling data parallelism for attention and tensor parallelism for FFN. The dp size should be equal to the tp size. Currently only DeepSeek-V2 is supported.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--enable-dp-lm-head",
|
||||
action="store_true",
|
||||
help="Enable vocabulary parallel across the attention TP group to avoid all-gather across DP groups, optimizing performance under DP attention.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--enable-ep-moe",
|
||||
action="store_true",
|
||||
|
||||
Reference in New Issue
Block a user