Expert distribution recording without overhead for EPLB (#4957)

This commit is contained in:
fzyzcjy
2025-05-20 11:07:43 +08:00
committed by GitHub
parent b146555749
commit f0653886a5
12 changed files with 1123 additions and 194 deletions

View File

@@ -170,6 +170,11 @@ class ServerArgs:
enable_ep_moe: bool = False
enable_deepep_moe: bool = False
deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
# Initial location of EP experts (exposed as --init-expert-location).
init_expert_location: str = "trivial"
# Mode of the expert distribution recorder; one of the listed literals, or
# None (the default).
# NOTE(review): None presumably means recording is disabled -- confirm
# against the recorder implementation.
expert_distribution_recorder_mode: Optional[
Literal["stat", "per_pass", "per_token"]
] = None
# Circular buffer size of the expert distribution recorder; -1 denotes an
# infinite buffer. When left as None and a recorder mode is set, the
# argument validation code later fills this in with 1000.
expert_distribution_recorder_buffer_size: Optional[int] = None
deepep_config: Optional[str] = None
enable_torch_compile: bool = False
torch_compile_max_bs: int = 32
@@ -361,6 +366,15 @@ class ServerArgs:
"Pipeline parallelism is incompatible with overlap schedule."
)
# Fill in a default recorder buffer size only when the user did not give one.
# TODO pr-chain: enable this later
# if (x := self.eplb_rebalance_num_iterations) is not None:
#     self.expert_distribution_recorder_buffer_size = x
if (
    self.expert_distribution_recorder_buffer_size is None
    and self.expert_distribution_recorder_mode is not None
):
    # A recorder mode was requested without an explicit size: fall back to 1000.
    self.expert_distribution_recorder_buffer_size = 1000
# Speculative Decoding
if self.speculative_algorithm == "NEXTN":
# NEXTN shares the same implementation of EAGLE
@@ -1257,6 +1271,24 @@ class ServerArgs:
default="auto",
help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
)
# Where the EP experts are initially placed.
parser.add_argument(
    "--init-expert-location",
    type=str,
    default=ServerArgs.init_expert_location,
    help="Initial location of EP experts.",
)
# Restrict the mode to the values declared on the ServerArgs field so that a
# typo is rejected at parse time instead of surfacing later at runtime.
parser.add_argument(
    "--expert-distribution-recorder-mode",
    type=str,
    choices=["stat", "per_pass", "per_token"],
    default=ServerArgs.expert_distribution_recorder_mode,
    help="Mode of expert distribution recorder.",
)
# The class default is None; -1 is the documented "infinite buffer" value.
parser.add_argument(
    "--expert-distribution-recorder-buffer-size",
    type=int,
    default=ServerArgs.expert_distribution_recorder_buffer_size,
    help="Circular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer.",
)
parser.add_argument(
"--deepep-config",
type=str,