Expert distribution recording without overhead for EPLB (#4957)
This commit is contained in:
@@ -170,6 +170,11 @@ class ServerArgs:
|
||||
enable_ep_moe: bool = False
|
||||
enable_deepep_moe: bool = False
|
||||
deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
|
||||
init_expert_location: str = "trivial"
|
||||
expert_distribution_recorder_mode: Optional[
|
||||
Literal["stat", "per_pass", "per_token"]
|
||||
] = None
|
||||
expert_distribution_recorder_buffer_size: Optional[int] = None
|
||||
deepep_config: Optional[str] = None
|
||||
enable_torch_compile: bool = False
|
||||
torch_compile_max_bs: int = 32
|
||||
@@ -361,6 +366,15 @@ class ServerArgs:
|
||||
"Pipeline parallelism is incompatible with overlap schedule."
|
||||
)
|
||||
|
||||
if self.expert_distribution_recorder_buffer_size is None:
|
||||
# TODO pr-chain: enable this later
|
||||
# if (x := self.eplb_rebalance_num_iterations) is not None:
|
||||
# self.expert_distribution_recorder_buffer_size = x
|
||||
if False:
|
||||
pass
|
||||
elif self.expert_distribution_recorder_mode is not None:
|
||||
self.expert_distribution_recorder_buffer_size = 1000
|
||||
|
||||
# Speculative Decoding
|
||||
if self.speculative_algorithm == "NEXTN":
|
||||
# NEXTN shares the same implementation of EAGLE
|
||||
@@ -1257,6 +1271,24 @@ class ServerArgs:
|
||||
default="auto",
|
||||
help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--init-expert-location",
|
||||
type=str,
|
||||
default=ServerArgs.init_expert_location,
|
||||
help="Initial location of EP experts.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--expert-distribution-recorder-mode",
|
||||
type=str,
|
||||
default=ServerArgs.expert_distribution_recorder_mode,
|
||||
help="Mode of expert distribution recorder.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--expert-distribution-recorder-buffer-size",
|
||||
type=int,
|
||||
default=ServerArgs.expert_distribution_recorder_buffer_size,
|
||||
help="Circular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--deepep-config",
|
||||
type=str,
|
||||
|
||||
Reference in New Issue
Block a user