[Refactor] Rename n_share_experts_fusion as num_fused_shared_experts (#6735)
This commit is contained in:
@@ -27,19 +27,17 @@ python benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py \
|
||||
--dtype fp8_w8a8 \
|
||||
--tune
|
||||
|
||||
# Tune DeepSeek-V3 with FP8, TP=8 and n_share_experts_fusion=8
|
||||
# Tune DeepSeek-V3 with FP8 and TP=8
|
||||
python benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py \
|
||||
--model deepseek-ai/DeepSeek-V3-0324 \
|
||||
--tp-size 8 \
|
||||
--n-share-experts-fusion 8 \
|
||||
--dtype fp8_w8a8 \
|
||||
--tune
|
||||
|
||||
# Tune DeepSeek-R1 with channel-wise INT8, TP=16 and n_share_experts_fusion=16
|
||||
# Tune DeepSeek-R1 with channel-wise INT8 and TP=16
|
||||
python benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py \
|
||||
--model meituan/DeepSeek-R1-Channel-INT8 \
|
||||
--tp-size 16 \
|
||||
--n-share-experts-fusion 16 \
|
||||
--dtype int8_w8a8 \
|
||||
--tune
|
||||
```
|
||||
@@ -65,11 +63,10 @@ python benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_tri
|
||||
--model deepseek-ai/DeepSeek-V3-0324 \
|
||||
--tp-size 8
|
||||
|
||||
# Compare with custom TP size and n_share_experts_fusion
|
||||
# Compare with custom TP size
|
||||
python benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py \
|
||||
--model deepseek-ai/DeepSeek-V3-0324 \
|
||||
--tp-size 8 \
|
||||
--n-share-experts-fusion 8
|
||||
--tp-size 8
|
||||
```
|
||||
|
||||
The benchmark results will be saved as plots and data files in the specified output directory (default: `./configs/benchmark_ops/vllm_sglang_fused_moe/`).
|
||||
|
||||
@@ -18,7 +18,7 @@ from sglang.srt.layers.moe.fused_moe_triton.fused_moe import (
|
||||
)
|
||||
|
||||
|
||||
def get_model_config(model_name: str, tp_size: int, n_share_experts_fusion: int = 0):
|
||||
def get_model_config(model_name: str, tp_size: int):
|
||||
"""Get model configuration parameters"""
|
||||
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
|
||||
|
||||
@@ -43,9 +43,8 @@ def get_model_config(model_name: str, tp_size: int, n_share_experts_fusion: int
|
||||
intermediate_size = config.moe_intermediate_size
|
||||
shard_intermediate_size = 2 * intermediate_size // tp_size
|
||||
elif config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]:
|
||||
n_share_fusion_experts = n_share_experts_fusion
|
||||
E = (
|
||||
config.n_routed_experts + n_share_fusion_experts
|
||||
config.n_routed_experts + 1
|
||||
if config.architectures[0] in ["DeepseekV3ForCausalLM"]
|
||||
else config.n_routed_experts
|
||||
)
|
||||
@@ -294,7 +293,6 @@ def main():
|
||||
"--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1"
|
||||
)
|
||||
parser.add_argument("--tp-size", type=int, default=2)
|
||||
parser.add_argument("--n-share-experts-fusion", type=int, default=0)
|
||||
parser.add_argument("--use-fp8-w8a8", action="store_true")
|
||||
parser.add_argument(
|
||||
"--save-path",
|
||||
@@ -325,9 +323,7 @@ def main():
|
||||
pipeline_model_parallel_size=1,
|
||||
)
|
||||
|
||||
model_config = get_model_config(
|
||||
args.model, args.tp_size, args.n_share_experts_fusion
|
||||
)
|
||||
model_config = get_model_config(args.model, args.tp_size)
|
||||
benchmark.run(
|
||||
show_plots=True,
|
||||
print_data=True,
|
||||
|
||||
@@ -399,9 +399,8 @@ def main(args: argparse.Namespace):
|
||||
intermediate_size = config.moe_intermediate_size
|
||||
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||
elif config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]:
|
||||
n_share_fusion_experts = args.n_share_experts_fusion
|
||||
E = (
|
||||
config.n_routed_experts + n_share_fusion_experts
|
||||
config.n_routed_experts + 1
|
||||
if config.architectures[0] in ["DeepseekV3ForCausalLM"]
|
||||
else config.n_routed_experts
|
||||
)
|
||||
@@ -409,8 +408,7 @@ def main(args: argparse.Namespace):
|
||||
intermediate_size = config.moe_intermediate_size
|
||||
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||
elif config.architectures[0] == "Llama4ForConditionalGeneration":
|
||||
n_share_fusion_experts = args.n_share_experts_fusion
|
||||
E = config.text_config.num_local_experts + n_share_fusion_experts
|
||||
E = config.text_config.num_local_experts + 1
|
||||
topk = config.text_config.num_experts_per_tok
|
||||
intermediate_size = config.text_config.intermediate_size
|
||||
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||
@@ -570,12 +568,6 @@ if __name__ == "__main__":
|
||||
parser.add_argument("--seed", type=int, default=0)
|
||||
parser.add_argument("--batch-size", type=int, required=False)
|
||||
parser.add_argument("--tune", action="store_true")
|
||||
parser.add_argument(
|
||||
"--n-share-experts-fusion",
|
||||
type=int,
|
||||
default=0,
|
||||
help="The number of shared_experts need to be replica to fuse with normal experts in deepseek v3/r1",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
main(args)
|
||||
|
||||
Reference in New Issue
Block a user