From 2f47d710ae9cb1bdbbe0fe2392a0634827d257b3 Mon Sep 17 00:00:00 2001
From: Xiaoyu Zhang <35585791+BBuf@users.noreply.github.com>
Date: Mon, 10 Feb 2025 23:35:44 +0800
Subject: [PATCH] refine some typo (#3473)

---
 .../fused_moe_triton/benchmark_torch_compile_fused_moe.py  | 2 +-
 python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py | 2 +-
 python/sglang/srt/layers/moe/topk.py                       | 2 +-
 python/sglang/srt/server_args.py                           | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py b/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py
index b81a22800..ce5a3399a 100644
--- a/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py
+++ b/benchmark/kernels/fused_moe_triton/benchmark_torch_compile_fused_moe.py
@@ -30,7 +30,7 @@ def get_model_config(model_name: str, tp_size: int):
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // tp_size
-    elif config.architectures[0] == "DeepseekV2ForCausalLM":
+    elif config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]:
         E = config.n_routed_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.intermediate_size
diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
index 4cebe46ea..b9390fd42 100644
--- a/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
+++ b/python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
@@ -1094,7 +1094,7 @@ def fused_moe(
     - num_expert_group: Optional[int]: additional parameter for grouped_topk
     - topk_group: Optional[int]: additional parameter for grouped_topk
     - use_grouped_topk: If True, use grouped_topk instead of fused_topk
-        note: Deepseekv2 model uses grouped_topk
+        note: Deepseek V2/V3/R1 series models use grouped_topk
     - use_fp8_w8a8 (bool): If True, use fp8 arithmetic to compute the inner
         products for w1 and w2. Defaults to False.
     - use_int8_w8a16 (bool): If True, use fp8 arithmetic to compute the inner
diff --git a/python/sglang/srt/layers/moe/topk.py b/python/sglang/srt/layers/moe/topk.py
index dc53e4445..b0b57d68d 100644
--- a/python/sglang/srt/layers/moe/topk.py
+++ b/python/sglang/srt/layers/moe/topk.py
@@ -75,7 +75,7 @@ def fused_topk(
     return topk_weights, topk_ids


-# This is used by the Deepseek-V2 model
+# This is used by the Deepseek V2/V3/R1 series models
 @torch.compile(dynamic=True, backend=get_compiler_backend())
 def grouped_topk(
     hidden_states: torch.Tensor,
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index f90495824..9f67c2dba 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -795,7 +795,7 @@ class ServerArgs:
         parser.add_argument(
             "--disable-mla",
             action="store_true",
-            help="Disable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
+            help="Disable Multi-head Latent Attention (MLA) for DeepSeek V2/V3/R1 series models.",
         )
         parser.add_argument(
             "--disable-overlap-schedule",
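
Note on the first hunk: get_model_config dispatches on the model's Hugging Face
config, so adding "DeepseekV3ForCausalLM" to the membership check lets the
benchmark pick up DeepSeek V3 checkpoints as well. Below is a minimal sketch of
that dispatch; the model name is only an example, and the attribute reads are
taken from the hunk's context lines, not copied from the benchmark script.

    # Illustrative sketch only, not the benchmark script itself.
    from transformers import AutoConfig

    config = AutoConfig.from_pretrained(
        "deepseek-ai/DeepSeek-V3", trust_remote_code=True  # example model name
    )
    if config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]:
        E = config.n_routed_experts        # number of routed experts
        topk = config.num_experts_per_tok  # experts activated per token
        intermediate_size = config.intermediate_size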
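
Note on the topk.py hunk: grouped_topk implements the group-limited routing
used by DeepSeek V2/V3-style MoE models. Experts are partitioned into
num_expert_group groups, only the topk_group highest-scoring groups survive,
and the final top-k is taken over the surviving experts. The following is a
minimal, self-contained sketch of that idea; the softmax scoring, function
name, and shapes are our assumptions, not the sglang implementation.

    import torch

    def grouped_topk_sketch(
        gating_output: torch.Tensor,  # [num_tokens, num_experts] router logits
        topk: int,
        num_expert_group: int,
        topk_group: int,
    ):
        num_tokens, num_experts = gating_output.shape
        scores = torch.softmax(gating_output, dim=-1)

        # Score each expert group by its best expert.
        group_scores = scores.view(num_tokens, num_expert_group, -1).max(dim=-1).values

        # Keep only the topk_group best groups per token.
        group_idx = torch.topk(group_scores, k=topk_group, dim=-1).indices
        group_mask = torch.zeros_like(group_scores)
        group_mask.scatter_(1, group_idx, 1.0)

        # Broadcast the group mask back to per-expert granularity.
        experts_per_group = num_experts // num_expert_group
        score_mask = group_mask.repeat_interleave(experts_per_group, dim=1)

        # Final top-k over the surviving experts, with renormalized weights.
        masked_scores = scores.masked_fill(score_mask == 0, 0.0)
        topk_weights, topk_ids = torch.topk(masked_scores, k=topk, dim=-1)
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
        return topk_weights, topk_ids

    # Example: 4 tokens, 16 experts in 4 groups; keep 2 groups, route to 4 experts.
    weights, ids = grouped_topk_sketch(
        torch.randn(4, 16), topk=4, num_expert_group=4, topk_group=2
    )

Restricting routing to a few groups first keeps each token's experts clustered,
which is what makes the grouped variant cheaper to dispatch than a flat top-k
over all experts.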