[Feature] Integrate DeepEP into SGLang (#4232)
Co-authored-by: Cheng Wan <cwan39@gatech.edu>
Co-authored-by: Xuting Zhou <xutingz@nvidia.com>
This commit is contained in:
@@ -157,6 +157,7 @@ class ServerArgs:
|
||||
enable_mixed_chunk: bool = False
|
||||
enable_dp_attention: bool = False
|
||||
enable_ep_moe: bool = False
|
||||
enable_deepep_moe: bool = False
|
||||
enable_torch_compile: bool = False
|
||||
torch_compile_max_bs: int = 32
|
||||
cuda_graph_max_bs: Optional[int] = None
|
||||
@@ -281,6 +282,12 @@ class ServerArgs:
|
||||
logger.warning(
|
||||
f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
|
||||
)
|
||||
# DeepEP MoE
|
||||
if self.enable_deepep_moe:
|
||||
self.ep_size = self.dp_size
|
||||
logger.info(
|
||||
f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the data parallel size[{self.dp_size}]."
|
||||
)
|
||||
|
||||
# Speculative Decoding
|
||||
if self.speculative_algorithm == "NEXTN":
|
||||
@@ -1018,6 +1025,11 @@ class ServerArgs:
|
||||
default=ServerArgs.hicache_ratio,
|
||||
help="The ratio of the size of host KV cache memory pool to the size of device pool.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--enable-deepep-moe",
|
||||
action="store_true",
|
||||
help="Enabling DeepEP MoE implementation for EP MoE.",
|
||||
)
|
||||
|
||||
# Server warmups
|
||||
parser.add_argument(
|
||||
|
||||
Reference in New Issue
Block a user