[Feature] Integrate DeepEP into SGLang (#4232)

Co-authored-by: Cheng Wan <cwan39@gatech.edu>
Co-authored-by: Xuting Zhou <xutingz@nvidia.com>
This commit was authored by Jinyan Chen on 2025-03-19 23:16:31 +08:00 and committed by GitHub.
parent f9c53cbb42
commit f44db16c8e
12 changed files with 1228 additions and 35 deletions

View File

@@ -145,6 +145,7 @@ class ModelRunner:
"enable_nan_detection": server_args.enable_nan_detection,
"enable_dp_attention": server_args.enable_dp_attention,
"enable_ep_moe": server_args.enable_ep_moe,
"enable_deepep_moe": server_args.enable_deepep_moe,
"device": server_args.device,
"speculative_accept_threshold_single": server_args.speculative_accept_threshold_single,
"speculative_accept_threshold_acc": server_args.speculative_accept_threshold_acc,
@@ -277,6 +278,12 @@ class ModelRunner:
server_args.chunked_prefill_size = -1
server_args.disable_radix_cache = True
if server_args.enable_deepep_moe:
logger.info("DeepEP is turned on.")
assert (
server_args.enable_dp_attention == True
), "Currently DeepEP is bind to Attention DP. Set '--enable-dp-attention --enable-deepep-moe'"
def init_torch_distributed(self):
logger.info("Init torch distributed begin.")