[5/N] MoE Refactor: Update MoE parallelism arguments (#8658)

Author: Cheng Wan
Date: 2025-08-01 01:20:03 -07:00
Committed by: GitHub
Parent: c8d3a402c1
Commit: 6c88f6c8d9
38 changed files with 342 additions and 299 deletions


@@ -2205,27 +2205,6 @@ def flatten_nested_list(nested_list):
         return [nested_list]
 
 
-class DeepEPMode(Enum):
-    normal = "normal"
-    low_latency = "low_latency"
-    auto = "auto"
-
-    def enable_normal(self):
-        return self in [DeepEPMode.normal, DeepEPMode.auto]
-
-    def enable_low_latency(self):
-        return self in [DeepEPMode.low_latency, DeepEPMode.auto]
-
-    def resolve(self, is_extend_in_batch: bool):
-        if self != DeepEPMode.auto:
-            return self
-
-        if is_extend_in_batch:
-            return DeepEPMode.normal
-        else:
-            return DeepEPMode.low_latency
-
-
 def is_non_idle_and_non_empty(forward_mode, hidden_states):
     return (
         (forward_mode is not None)
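
For context, here is a minimal, self-contained sketch of the resolution logic implemented by the `DeepEPMode` class removed in this hunk: `auto` resolves to `normal` for extend (prefill) batches and to `low_latency` otherwise. The standalone usage below is illustrative only and is not part of the commit.

```python
from enum import Enum


class DeepEPMode(Enum):
    normal = "normal"
    low_latency = "low_latency"
    auto = "auto"

    def resolve(self, is_extend_in_batch: bool):
        # Explicit modes pass through unchanged; only "auto" is resolved.
        if self != DeepEPMode.auto:
            return self
        # "auto": prefill (extend) batches use normal mode, decode-only
        # batches use low-latency mode.
        return DeepEPMode.normal if is_extend_in_batch else DeepEPMode.low_latency


assert DeepEPMode.auto.resolve(is_extend_in_batch=True) is DeepEPMode.normal
assert DeepEPMode.auto.resolve(is_extend_in_batch=False) is DeepEPMode.low_latency
assert DeepEPMode.normal.resolve(is_extend_in_batch=False) is DeepEPMode.normal
```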
@@ -2414,7 +2393,7 @@ def require_mlp_tp_gather(server_args):
         return True
     elif not server_args.enable_dp_lm_head:
         return True
-    elif not server_args.enable_deepep_moe:
+    elif server_args.moe_a2a_backend is None:
         return True
     else:
         return (
@@ -2430,7 +2409,7 @@ def require_attn_tp_gather(server_args):
     Check if the input of attention is scattered.
     """
     assert server_args.moe_dense_tp_size in [1, None]
-    if server_args.enable_deepep_moe or server_args.moe_dense_tp_size == 1:
+    if server_args.moe_a2a_backend is not None or server_args.moe_dense_tp_size == 1:
         if server_args.enable_dp_attention:
             return server_args.dp_size < server_args.tp_size
         else:
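
The two hunks above show the same argument change: the boolean flag `enable_deepep_moe` is replaced by checking whether an all-to-all MoE backend is configured via `moe_a2a_backend`. The sketch below illustrates that predicate change; the `ServerArgsSketch` dataclass and the `"deepep"` example value are hypothetical stand-ins, not the real `ServerArgs` definition.

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class ServerArgsSketch:
    # Hypothetical stand-in for the relevant ServerArgs fields.
    enable_deepep_moe: bool = False        # old-style boolean flag
    moe_a2a_backend: Optional[str] = None  # new-style backend selector, e.g. "deepep" (assumed value)


def uses_a2a_moe_old(args: ServerArgsSketch) -> bool:
    # Old check: a dedicated DeepEP flag.
    return args.enable_deepep_moe


def uses_a2a_moe_new(args: ServerArgsSketch) -> bool:
    # New check: any configured all-to-all MoE backend counts.
    return args.moe_a2a_backend is not None


assert uses_a2a_moe_new(ServerArgsSketch(moe_a2a_backend="deepep"))
assert not uses_a2a_moe_new(ServerArgsSketch())
```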