Support redundant experts in expert parallel (#6461)

This commit is contained in:
fzyzcjy
2025-05-21 17:05:53 +08:00
committed by GitHub
parent a071dc4084
commit ccfe5c009d
5 changed files with 18 additions and 5 deletions

View File

@@ -163,8 +163,7 @@ class ExpertLocationMetadata:
num_physical_experts = (
model_config_for_expert_location.num_logical_experts
# TODO pr-chain: enable this later
# + server_args.ep_num_redundant_experts
+ server_args.ep_num_redundant_experts
)
ep_size = server_args.ep_size
assert num_physical_experts % ep_size == 0

View File

@@ -90,6 +90,7 @@ global_server_args_dict = {
"speculative_accept_threshold_single": ServerArgs.speculative_accept_threshold_single,
"torchao_config": ServerArgs.torchao_config,
"triton_attention_reduce_in_fp32": ServerArgs.triton_attention_reduce_in_fp32,
"ep_num_redundant_experts": ServerArgs.ep_num_redundant_experts,
}
logger = logging.getLogger(__name__)