[Refactor] Rename n_share_experts_fusion as num_fused_shared_experts (#6735)

This commit is contained in:
Cheng Wan
2025-06-03 17:48:24 -07:00
committed by GitHub
parent b6d0ce9f78
commit 8a5480528d
14 changed files with 82 additions and 93 deletions

View File

@@ -399,9 +399,8 @@ def main(args: argparse.Namespace):
intermediate_size = config.moe_intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
elif config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]:
n_share_fusion_experts = args.n_share_experts_fusion
E = (
config.n_routed_experts + n_share_fusion_experts
config.n_routed_experts + 1
if config.architectures[0] in ["DeepseekV3ForCausalLM"]
else config.n_routed_experts
)
@@ -409,8 +408,7 @@ def main(args: argparse.Namespace):
intermediate_size = config.moe_intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
elif config.architectures[0] == "Llama4ForConditionalGeneration":
n_share_fusion_experts = args.n_share_experts_fusion
E = config.text_config.num_local_experts + n_share_fusion_experts
E = config.text_config.num_local_experts + 1
topk = config.text_config.num_experts_per_tok
intermediate_size = config.text_config.intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
@@ -570,12 +568,6 @@ if __name__ == "__main__":
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--batch-size", type=int, required=False)
parser.add_argument("--tune", action="store_true")
parser.add_argument(
"--n-share-experts-fusion",
type=int,
default=0,
help="The number of shared_experts need to be replica to fuse with normal experts in deepseek v3/r1",
)
args = parser.parse_args()
main(args)