From ccfe5c009d61844baf61f25d707fdcbb040e7d47 Mon Sep 17 00:00:00 2001
From: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com>
Date: Wed, 21 May 2025 17:05:53 +0800
Subject: [PATCH] Support redundant experts in expert parallel (#6461)

---
 python/sglang/srt/managers/expert_location.py    |  3 +--
 python/sglang/srt/managers/schedule_batch.py     |  1 +
 python/sglang/srt/model_executor/model_runner.py |  1 +
 python/sglang/srt/models/deepseek_v2.py          | 11 ++++++++---
 python/sglang/srt/server_args.py                 |  7 +++++++
 5 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/python/sglang/srt/managers/expert_location.py b/python/sglang/srt/managers/expert_location.py
index b31e51557..efd6c5273 100644
--- a/python/sglang/srt/managers/expert_location.py
+++ b/python/sglang/srt/managers/expert_location.py
@@ -163,8 +163,7 @@ class ExpertLocationMetadata:
 
         num_physical_experts = (
             model_config_for_expert_location.num_logical_experts
-            # TODO pr-chain: enable this later
-            # + server_args.ep_num_redundant_experts
+            + server_args.ep_num_redundant_experts
         )
         ep_size = server_args.ep_size
         assert num_physical_experts % ep_size == 0
diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
index 9981fe776..cd780b1ac 100644
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -90,6 +90,7 @@ global_server_args_dict = {
     "speculative_accept_threshold_single": ServerArgs.speculative_accept_threshold_single,
     "torchao_config": ServerArgs.torchao_config,
     "triton_attention_reduce_in_fp32": ServerArgs.triton_attention_reduce_in_fp32,
+    "ep_num_redundant_experts": ServerArgs.ep_num_redundant_experts,
 }
 
 logger = logging.getLogger(__name__)
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index 5fcc33865..3d50ce517 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -206,6 +206,7 @@ class ModelRunner:
                 "speculative_accept_threshold_acc": server_args.speculative_accept_threshold_acc,
                 "use_mla_backend": self.use_mla_backend,
                 "mm_attention_backend": server_args.mm_attention_backend,
+                "ep_num_redundant_experts": server_args.ep_num_redundant_experts,
             }
         )
 
diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py
index b90fadec4..62ecce141 100644
--- a/python/sglang/srt/models/deepseek_v2.py
+++ b/python/sglang/srt/models/deepseek_v2.py
@@ -243,7 +243,9 @@ class DeepseekV2MoE(nn.Module):
         self.gate = MoEGate(config=config, prefix=add_prefix("gate", prefix))
 
         self.experts = get_moe_impl_class()(
-            num_experts=config.n_routed_experts + self.n_share_experts_fusion,
+            num_experts=config.n_routed_experts
+            + self.n_share_experts_fusion
+            + global_server_args_dict["ep_num_redundant_experts"],
             top_k=config.num_experts_per_tok + min(self.n_share_experts_fusion, 1),
             hidden_size=config.hidden_size,
             intermediate_size=config.moe_intermediate_size,
@@ -285,7 +287,10 @@ class DeepseekV2MoE(nn.Module):
         if global_server_args_dict["enable_deepep_moe"]:
             # TODO: we will support tp < ep in the future
             self.ep_size = get_tensor_model_parallel_world_size()
-            self.num_experts = config.n_routed_experts
+            self.num_experts = (
+                config.n_routed_experts
+                + global_server_args_dict["ep_num_redundant_experts"]
+            )
             self.renormalize = config.norm_topk_prob
             self.topk_group = config.topk_group
             self.num_expert_group = config.n_group
@@ -299,7 +304,7 @@ class DeepseekV2MoE(nn.Module):
                 group=parallel_state.get_tp_group().device_group,
                 router_topk=self.top_k,
                 permute_fusion=True,
-                num_experts=config.n_routed_experts,
+                num_experts=self.num_experts,
                 num_local_experts=config.n_routed_experts // self.tp_size,
                 hidden_size=config.hidden_size,
                 params_dtype=config.torch_dtype,
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index ed9e92641..d0f641b45 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -170,6 +170,7 @@ class ServerArgs:
     enable_ep_moe: bool = False
     enable_deepep_moe: bool = False
     deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
+    ep_num_redundant_experts: int = 0
     ep_dispatch_algorithm: Optional[Literal["static", "dynamic"]] = None
     init_expert_location: str = "trivial"
     expert_distribution_recorder_mode: Optional[
@@ -1273,6 +1274,12 @@
             default="auto",
             help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
         )
+        parser.add_argument(
+            "--ep-num-redundant-experts",
+            type=int,
+            default=ServerArgs.ep_num_redundant_experts,
+            help="Allocate this number of redundant experts in expert parallel.",
+        )
         parser.add_argument(
             "--ep-dispatch-algorithm",
             type=str,
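
Notes: below is a minimal sketch of the expert-count arithmetic this patch
enables; it is illustration only, not part of the diff. The variable names
mirror those in expert_location.py, while the concrete values (256 logical
experts, 32 redundant experts, EP size 8) are made-up examples.

    # Illustrative sketch of the computation in ExpertLocationMetadata
    # with redundant experts enabled.
    num_logical_experts = 256      # routed experts from the model config
    ep_num_redundant_experts = 32  # value of --ep-num-redundant-experts
    ep_size = 8                    # expert-parallel world size

    # Redundant experts enlarge the physical expert pool, so frequently
    # routed logical experts can be replicated across several ranks.
    num_physical_experts = num_logical_experts + ep_num_redundant_experts
    assert num_physical_experts % ep_size == 0  # same check as in the patch
    experts_per_rank = num_physical_experts // ep_size  # 36 per rank here

A hypothetical launch using the new flag (model path and sizes are
placeholders) could look like:

    python -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3 \
        --tp 8 --enable-deepep-moe --ep-num-redundant-experts 32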