[Feature] Support DeepEP Low Latency (#4767)

Co-authored-by: sleepcoo <sleepcoo@gmail.com>
Co-authored-by: laixinn <xielx@shanghaitech.edu.cn>
Co-authored-by: ch-wan <cwan39@gatech.edu>
This commit is contained in:
Jinyan Chen
2025-04-02 00:23:25 +08:00
committed by GitHub
parent 87fafa0105
commit 23c764b18a
8 changed files with 448 additions and 238 deletions

View File

@@ -161,6 +161,7 @@ class ServerArgs:
enable_dp_attention: bool = False
enable_ep_moe: bool = False
enable_deepep_moe: bool = False
deepep_mode: Optional[str] = "auto"
enable_torch_compile: bool = False
torch_compile_max_bs: int = 32
cuda_graph_max_bs: Optional[int] = None
@@ -285,6 +286,13 @@ class ServerArgs:
if self.grammar_backend is None:
self.grammar_backend = "xgrammar"
# Expert parallelism
if self.enable_ep_moe:
self.ep_size = self.tp_size
logger.info(
f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
)
# Data parallelism attention
if self.enable_dp_attention:
self.schedule_conservativeness = self.schedule_conservativeness * 0.3
@@ -300,6 +308,10 @@ class ServerArgs:
self.enable_sp_layernorm = False
# DeepEP MoE
if self.enable_deepep_moe:
if self.deepep_mode == "auto":
assert (
not self.enable_dp_attention
), "DeepEP MoE `auto` mode is not supported with DP Attention."
self.ep_size = self.tp_size
self.enable_sp_layernorm = (
self.dp_size < self.tp_size if self.enable_dp_attention else True
@@ -1082,6 +1094,12 @@ class ServerArgs:
action="store_true",
help="Enabling DeepEP MoE implementation for EP MoE.",
)
parser.add_argument(
"--deepep-mode",
type=str,
choices=["normal", "low_latency", "auto"],
help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
)
# Server warmups
parser.add_argument(