[Feature] Integrate DeepEP into SGLang (#4232)

Co-authored-by: Cheng Wan <cwan39@gatech.edu>
Co-authored-by: Xuting Zhou <xutingz@nvidia.com>
This commit is contained in:
Jinyan Chen
2025-03-19 23:16:31 +08:00
committed by GitHub
parent f9c53cbb42
commit f44db16c8e
12 changed files with 1228 additions and 35 deletions

View File

@@ -157,6 +157,7 @@ class ServerArgs:
enable_mixed_chunk: bool = False
enable_dp_attention: bool = False
enable_ep_moe: bool = False
enable_deepep_moe: bool = False
enable_torch_compile: bool = False
torch_compile_max_bs: int = 32
cuda_graph_max_bs: Optional[int] = None
@@ -281,6 +282,12 @@ class ServerArgs:
logger.warning(
f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
)
# DeepEP MoE
if self.enable_deepep_moe:
self.ep_size = self.dp_size
logger.info(
f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the data parallel size[{self.dp_size}]."
)
# Speculative Decoding
if self.speculative_algorithm == "NEXTN":
@@ -1018,6 +1025,11 @@ class ServerArgs:
default=ServerArgs.hicache_ratio,
help="The ratio of the size of host KV cache memory pool to the size of device pool.",
)
parser.add_argument(
"--enable-deepep-moe",
action="store_true",
help="Enabling DeepEP MoE implementation for EP MoE.",
)
# Server warmups
parser.add_argument(