diff --git a/benchmark/kernels/fused_moe_triton/README.md b/benchmark/kernels/fused_moe_triton/README.md
index d9a1d1af3..573e0ffaa 100644
--- a/benchmark/kernels/fused_moe_triton/README.md
+++ b/benchmark/kernels/fused_moe_triton/README.md
@@ -27,19 +27,17 @@ python benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py \
     --dtype fp8_w8a8 \
     --tune
 
-# Tune DeepSeek-V3 with FP8, TP=8 and n_share_experts_fusion=8
+# Tune DeepSeek-V3 with FP8 and TP=8
 python benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py \
     --model deepseek-ai/DeepSeek-V3-0324 \
     --tp-size 8 \
-    --n-share-experts-fusion 8 \
     --dtype fp8_w8a8 \
     --tune
 
-# Tune DeepSeek-R1 with channel-wise INT8, TP=16 and n_share_experts_fusion=16
+# Tune DeepSeek-R1 with channel-wise INT8 and TP=16
 python benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py \
     --model meituan/DeepSeek-R1-Channel-INT8 \
     --tp-size 16 \
-    --n-share-experts-fusion 16 \
     --dtype int8_w8a8 \
     --tune
 ```
@@ -65,11 +63,10 @@ python benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_tri
     --model deepseek-ai/DeepSeek-V3-0324 \
     --tp-size 8
 
-# Compare with custom TP size and n_share_experts_fusion
+# Compare with custom TP size
 python benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py \
     --model deepseek-ai/DeepSeek-V3-0324 \
-    --tp-size 8 \
-    --n-share-experts-fusion 8
+    --tp-size 8
 ```
 
 The benchmark results will be saved as plots and data files in the specified output directory (default: `./configs/benchmark_ops/vllm_sglang_fused_moe/`).
diff --git a/benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py b/benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py
index 98ebcd728..f6ddeb13b 100644
--- a/benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py
+++ b/benchmark/kernels/fused_moe_triton/benchmark_vllm_vs_sglang_fused_moe_triton.py
@@ -18,7 +18,7 @@ from sglang.srt.layers.moe.fused_moe_triton.fused_moe import (
 )
 
 
-def get_model_config(model_name: str, tp_size: int, n_share_experts_fusion: int = 0):
+def get_model_config(model_name: str, tp_size: int):
     """Get model configuration parameters"""
     config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
 
@@ -43,9 +43,8 @@ def get_model_config(model_name: str, tp_size: int, n_share_experts_fusion: int
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // tp_size
     elif config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]:
-        n_share_fusion_experts = n_share_experts_fusion
         E = (
-            config.n_routed_experts + n_share_fusion_experts
+            config.n_routed_experts + 1
             if config.architectures[0] in ["DeepseekV3ForCausalLM"]
             else config.n_routed_experts
         )
@@ -294,7 +293,6 @@ def main():
         "--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1"
     )
     parser.add_argument("--tp-size", type=int, default=2)
-    parser.add_argument("--n-share-experts-fusion", type=int, default=0)
     parser.add_argument("--use-fp8-w8a8", action="store_true")
     parser.add_argument(
         "--save-path",
@@ -325,9 +323,7 @@ def main():
             pipeline_model_parallel_size=1,
         )
 
-        model_config = get_model_config(
-            args.model, args.tp_size, args.n_share_experts_fusion
-        )
+        model_config = get_model_config(args.model, args.tp_size)
         benchmark.run(
             show_plots=True,
             print_data=True,
diff --git a/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py b/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py
index 30ffd1d40..7bac92f12 100644
--- a/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py
+++ b/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py
@@ -399,9 +399,8 @@ def main(args: argparse.Namespace):
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
     elif config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]:
-        n_share_fusion_experts = args.n_share_experts_fusion
         E = (
-            config.n_routed_experts + n_share_fusion_experts
+            config.n_routed_experts + 1
             if config.architectures[0] in ["DeepseekV3ForCausalLM"]
             else config.n_routed_experts
         )
@@ -409,8 +408,7 @@ def main(args: argparse.Namespace):
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
     elif config.architectures[0] == "Llama4ForConditionalGeneration":
-        n_share_fusion_experts = args.n_share_experts_fusion
-        E = config.text_config.num_local_experts + n_share_fusion_experts
+        E = config.text_config.num_local_experts + 1
         topk = config.text_config.num_experts_per_tok
         intermediate_size = config.text_config.intermediate_size
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
@@ -570,12 +568,6 @@ if __name__ == "__main__":
     parser.add_argument("--seed", type=int, default=0)
     parser.add_argument("--batch-size", type=int, required=False)
     parser.add_argument("--tune", action="store_true")
-    parser.add_argument(
-        "--n-share-experts-fusion",
-        type=int,
-        default=0,
-        help="The number of shared_experts need to be replica to fuse with normal experts in deepseek v3/r1",
-    )
     args = parser.parse_args()
 
     main(args)
diff --git a/python/sglang/srt/layers/moe/topk.py b/python/sglang/srt/layers/moe/topk.py
index 624162799..4d3eed9bb 100644
--- a/python/sglang/srt/layers/moe/topk.py
+++ b/python/sglang/srt/layers/moe/topk.py
@@ -103,7 +103,7 @@ def grouped_topk(
     renormalize: bool,
     num_expert_group: int = 0,
     topk_group: int = 0,
-    n_share_experts_fusion: int = 0,
+    num_fused_shared_experts: int = 0,
     routed_scaling_factor: Optional[float] = None,
     num_token_non_padded: Optional[torch.Tensor] = None,
     expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
@@ -128,10 +128,10 @@ def grouped_topk(
     )  # [n, e]
     tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0)  # [n, e]
     topk_weights, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)
-    if n_share_experts_fusion:
+    if num_fused_shared_experts:
         topk_ids[:, -1] = torch.randint(
             low=num_experts,
-            high=num_experts + n_share_experts_fusion,
+            high=num_experts + num_fused_shared_experts,
             size=(topk_ids.size(0),),
             dtype=topk_ids.dtype,
             device=topk_ids.device,
@@ -141,7 +141,7 @@ def grouped_topk(
     if renormalize:
         topk_weights_sum = (
             topk_weights.sum(dim=-1, keepdim=True)
-            if n_share_experts_fusion == 0
+            if num_fused_shared_experts == 0
             else topk_weights[:, :-1].sum(dim=-1, keepdim=True)
         )
         topk_weights = topk_weights / topk_weights_sum
@@ -160,7 +160,7 @@ def biased_grouped_topk_impl(
     renormalize: bool,
     num_expert_group: int = 0,
     topk_group: int = 0,
-    n_share_experts_fusion: int = 0,
+    num_fused_shared_experts: int = 0,
     routed_scaling_factor: Optional[float] = None,
     num_token_non_padded: Optional[torch.Tensor] = None,
     expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
@@ -192,10 +192,10 @@ def biased_grouped_topk_impl(
     _, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)
     topk_weights = scores.gather(1, topk_ids)
 
-    if n_share_experts_fusion:
+    if num_fused_shared_experts:
         topk_ids[:, -1] = torch.randint(
             low=num_experts,
-            high=num_experts + n_share_experts_fusion,
+            high=num_experts + num_fused_shared_experts,
             size=(topk_ids.size(0),),
             dtype=topk_ids.dtype,
             device=topk_ids.device,
@@ -205,7 +205,7 @@ def biased_grouped_topk_impl(
     if renormalize:
         topk_weights_sum = (
             topk_weights.sum(dim=-1, keepdim=True)
-            if n_share_experts_fusion == 0
+            if num_fused_shared_experts == 0
             else topk_weights[:, :-1].sum(dim=-1, keepdim=True)
         )
         topk_weights = topk_weights / topk_weights_sum
@@ -239,7 +239,7 @@ def biased_grouped_topk(
     num_expert_group: int = 0,
     topk_group: int = 0,
     compiled: bool = True,
-    n_share_experts_fusion: int = 0,
+    num_fused_shared_experts: int = 0,
     routed_scaling_factor: Optional[float] = None,
     num_token_non_padded: Optional[torch.Tensor] = None,
     expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
@@ -247,7 +247,7 @@ def biased_grouped_topk(
     assert (
         routed_scaling_factor is not None
     ), "routed_scaling_factor is required for biased_grouped_topk"
-    # TODO: moe_fused_gate kernel is not supported for n_share_experts_fusion > 0 now.
+    # TODO: moe_fused_gate kernel is not supported for num_fused_shared_experts > 0 now.
     if (
         _is_cuda
         and gating_output.shape[1] // num_expert_group
@@ -260,7 +260,7 @@ def biased_grouped_topk(
             num_expert_group,
             topk_group,
             topk,
-            n_share_experts_fusion,
+            num_fused_shared_experts,
             routed_scaling_factor,
         )
         # TODO merge into kernel for this branch
@@ -288,7 +288,7 @@ def biased_grouped_topk(
             renormalize,
             num_expert_group,
             topk_group,
-            n_share_experts_fusion=n_share_experts_fusion,
+            num_fused_shared_experts=num_fused_shared_experts,
             routed_scaling_factor=routed_scaling_factor,
             num_token_non_padded=num_token_non_padded,
             expert_location_dispatch_info=expert_location_dispatch_info,
@@ -310,7 +310,7 @@ def select_experts(
     num_token_non_padded: Optional[torch.Tensor] = None,
     expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
 ):
-    n_share_experts_fusion = global_server_args_dict["n_share_experts_fusion"]
+    num_fused_shared_experts = global_server_args_dict["num_fused_shared_experts"]
 
     router_logits, correction_bias = (
         expert_location_dispatch.transform_select_experts_inputs(
@@ -332,7 +332,7 @@ def select_experts(
                 renormalize=renormalize,
                 num_expert_group=num_expert_group,
                 topk_group=topk_group,
-                n_share_experts_fusion=n_share_experts_fusion,
+                num_fused_shared_experts=num_fused_shared_experts,
                 routed_scaling_factor=routed_scaling_factor,
                 num_token_non_padded=num_token_non_padded,
                 expert_location_dispatch_info=expert_location_dispatch_info,
@@ -346,7 +346,7 @@ def select_experts(
                 renormalize=renormalize,
                 num_expert_group=num_expert_group,
                 topk_group=topk_group,
-                n_share_experts_fusion=n_share_experts_fusion,
+                num_fused_shared_experts=num_fused_shared_experts,
                 routed_scaling_factor=routed_scaling_factor,
                 num_token_non_padded=num_token_non_padded,
                 expert_location_dispatch_info=expert_location_dispatch_info,
diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
index 2839cc9d2..9192648e8 100644
--- a/python/sglang/srt/managers/schedule_batch.py
+++ b/python/sglang/srt/managers/schedule_batch.py
@@ -89,7 +89,7 @@ global_server_args_dict = {
     "max_micro_batch_size": ServerArgs.max_micro_batch_size,
     "moe_dense_tp_size": ServerArgs.moe_dense_tp_size,
     "ep_dispatch_algorithm": ServerArgs.ep_dispatch_algorithm,
-    "n_share_experts_fusion": ServerArgs.n_share_experts_fusion,
+    "num_fused_shared_experts": ServerArgs.num_fused_shared_experts,
     "sampling_backend": ServerArgs.sampling_backend,
     "speculative_accept_threshold_acc": ServerArgs.speculative_accept_threshold_acc,
     "speculative_accept_threshold_single": ServerArgs.speculative_accept_threshold_single,
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index a50e4d3b6..a146df7a7 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -204,7 +204,7 @@ class ModelRunner:
                 "flashinfer_mla_disable_ragged": server_args.flashinfer_mla_disable_ragged,
                 "moe_dense_tp_size": server_args.moe_dense_tp_size,
                 "ep_dispatch_algorithm": server_args.ep_dispatch_algorithm,
-                "n_share_experts_fusion": server_args.n_share_experts_fusion,
+                "num_fused_shared_experts": server_args.num_fused_shared_experts,
                 "triton_attention_reduce_in_fp32": server_args.triton_attention_reduce_in_fp32,
                 "torchao_config": server_args.torchao_config,
                 "sampling_backend": server_args.sampling_backend,
diff --git a/python/sglang/srt/models/deepseek_nextn.py b/python/sglang/srt/models/deepseek_nextn.py
index 8f55d8408..ce31f355c 100644
--- a/python/sglang/srt/models/deepseek_nextn.py
+++ b/python/sglang/srt/models/deepseek_nextn.py
@@ -122,7 +122,7 @@ class DeepseekV3ForCausalLMNextN(DeepseekV3ForCausalLM):
         self.config = config
         self.tp_size = get_tensor_model_parallel_world_size()
         self.quant_config = quant_config
-        self.determine_n_share_experts_fusion("DeepseekV3ForCausalLMNextN")
+        self.determine_num_fused_shared_experts("DeepseekV3ForCausalLMNextN")
 
         self.model = DeepseekModelNextN(
             config, quant_config, prefix=add_prefix("model", prefix)
diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py
index 74c9b82cf..eab2930f4 100644
--- a/python/sglang/srt/models/deepseek_v2.py
+++ b/python/sglang/srt/models/deepseek_v2.py
@@ -224,7 +224,9 @@ class DeepseekV2MoE(nn.Module):
         self.tp_size = get_tensor_model_parallel_world_size()
         self.routed_scaling_factor = config.routed_scaling_factor
         self.n_shared_experts = config.n_shared_experts
-        self.n_share_experts_fusion = global_server_args_dict["n_share_experts_fusion"]
+        self.num_fused_shared_experts = global_server_args_dict[
+            "num_fused_shared_experts"
+        ]
         self.config = config
         self.layer_id = layer_id
 
@@ -244,9 +246,9 @@ class DeepseekV2MoE(nn.Module):
 
         self.experts = get_moe_impl_class()(
             num_experts=config.n_routed_experts
-            + self.n_share_experts_fusion
+            + self.num_fused_shared_experts
             + global_server_args_dict["ep_num_redundant_experts"],
-            top_k=config.num_experts_per_tok + min(self.n_share_experts_fusion, 1),
+            top_k=config.num_experts_per_tok + min(self.num_fused_shared_experts, 1),
             hidden_size=config.hidden_size,
             intermediate_size=config.moe_intermediate_size,
             layer_id=self.layer_id,
@@ -265,7 +267,7 @@ class DeepseekV2MoE(nn.Module):
             ),
         )
 
-        if config.n_shared_experts is not None and self.n_share_experts_fusion == 0:
+        if config.n_shared_experts is not None and self.num_fused_shared_experts == 0:
             intermediate_size = config.moe_intermediate_size * config.n_shared_experts
             # disable tp for shared experts when enable deepep moe
             self.shared_experts = DeepseekV2MLP(
@@ -418,7 +420,7 @@ class DeepseekV2MoE(nn.Module):
         return final_hidden_states
 
     def _forward_shared_experts(self, hidden_states):
-        if self.n_share_experts_fusion == 0:
+        if self.num_fused_shared_experts == 0:
             return self.shared_experts(hidden_states)
         else:
             return None
@@ -434,7 +436,7 @@ class DeepseekV2MoE(nn.Module):
 
     def op_shared_experts(self, state):
         hidden_states_mlp_input = state.pop("hidden_states_mlp_input")
-        if (self.n_share_experts_fusion == 0) and is_non_idle_and_non_empty(
+        if (self.num_fused_shared_experts == 0) and is_non_idle_and_non_empty(
             state.forward_batch.forward_mode, hidden_states_mlp_input
         ):
             state.shared_output = self.shared_experts(hidden_states_mlp_input)
@@ -1648,7 +1650,7 @@ class DeepseekV2ForCausalLM(nn.Module):
         self.config = config
         self.tp_size = get_tensor_model_parallel_world_size()
         self.quant_config = quant_config
-        self.determine_n_share_experts_fusion()
+        self.determine_num_fused_shared_experts()
         self.model = DeepseekV2Model(
             config, quant_config, prefix=add_prefix("model", prefix)
         )
@@ -1674,28 +1676,30 @@ class DeepseekV2ForCausalLM(nn.Module):
     def routed_experts_weights_of_layer(self):
         return self._routed_experts_weights_of_layer.value
 
-    def determine_n_share_experts_fusion(
+    def determine_num_fused_shared_experts(
         self, architecture: str = "DeepseekV3ForCausalLM"
     ):
-        self.n_share_experts_fusion = global_server_args_dict["n_share_experts_fusion"]
-        if self.n_share_experts_fusion > 0:
+        self.num_fused_shared_experts = global_server_args_dict[
+            "num_fused_shared_experts"
+        ]
+        if self.num_fused_shared_experts > 0:
             # Only Deepseek V3/R1 can use shared experts fusion optimization now.
             if (
                 not _is_cuda
                 or self.config.architectures[0] != architecture
                 or self.config.n_routed_experts != 256
             ):
-                self.n_share_experts_fusion = 0
-                global_server_args_dict["n_share_experts_fusion"] = 0
+                self.num_fused_shared_experts = 0
+                global_server_args_dict["num_fused_shared_experts"] = 0
                 log_info_on_rank0(
                     logger,
                     "Only Deepseek V3/R1 on NV-platform can use shared experts fusion optimization. Shared experts fusion optimization is disabled.",
                 )
             else:
                 assert (
-                    self.n_share_experts_fusion == self.tp_size
+                    self.num_fused_shared_experts == self.tp_size
                 ), f"Shared experts fusion optimization is enabled in DeepSeek V3/R1, set it to {self.tp_size} can get best optimized performance."
-        elif self.n_share_experts_fusion == 0:
+        elif self.num_fused_shared_experts == 0:
             if (
                 _is_cuda
                 and torch.cuda.get_device_capability("cuda") >= (9, 0)
@@ -1703,8 +1707,8 @@ class DeepseekV2ForCausalLM(nn.Module):
                 and self.config.n_routed_experts == 256
                 and (not global_server_args_dict["enable_deepep_moe"])
             ):
-                self.n_share_experts_fusion = self.tp_size
-                global_server_args_dict["n_share_experts_fusion"] = self.tp_size
+                self.num_fused_shared_experts = self.tp_size
+                global_server_args_dict["num_fused_shared_experts"] = self.tp_size
                 log_info_on_rank0(
                     logger,
                     "Deepseek V3/R1 with fp8 can use shared experts fusion optimization when SM version >=90. Shared experts fusion optimization is enabled.",
@@ -1905,7 +1909,7 @@ class DeepseekV2ForCausalLM(nn.Module):
             ("gate_up_proj", "gate_proj", 0),
             ("gate_up_proj", "up_proj", 1),
         ]
-        if self.n_share_experts_fusion > 0:
+        if self.num_fused_shared_experts > 0:
             weights_list = list(weights)
             weights_dict = dict(weights_list)
             if self.quant_config is not None:
@@ -1966,14 +1970,14 @@ class DeepseekV2ForCausalLM(nn.Module):
 
             for moe_layer in tqdm(
                 moe_layers,
-                desc=f"Cloning {self.n_share_experts_fusion} "
+                desc=f"Cloning {self.num_fused_shared_experts} "
                 "replicas of the shared expert into MoE",
             ):
                 for suffix in suffix_list:
                     shared_expert_weight_name = (
                         f"model.layers.{moe_layer}.mlp.shared_experts.{suffix}"
                     )
-                    for num_repeat in range(self.n_share_experts_fusion):
+                    for num_repeat in range(self.num_fused_shared_experts):
                         weights_list.append(
                             (
                                 f"model.layers.{moe_layer}."
@@ -1992,7 +1996,7 @@ class DeepseekV2ForCausalLM(nn.Module):
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
-            num_experts=self.config.n_routed_experts + self.n_share_experts_fusion,
+            num_experts=self.config.n_routed_experts + self.num_fused_shared_experts,
         )
 
         # Fuse q_a_proj and kv_a_proj_with_mqa along output dimension when q_lora_rank is not None
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index fedbc84eb..ab48627d0 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -206,7 +206,7 @@ class ServerArgs:
     flashinfer_mla_disable_ragged: bool = False
     warmups: Optional[str] = None
     moe_dense_tp_size: Optional[int] = None
-    n_share_experts_fusion: int = 0
+    num_fused_shared_experts: int = 0
     disable_chunked_prefix_cache: bool = False
     disable_fast_image_processor: bool = False
     mm_attention_backend: Optional[str] = None
@@ -1373,11 +1373,11 @@ class ServerArgs:
         )
 
         parser.add_argument(
-            "--n-share-experts-fusion",
+            "--num-fused-shared-experts",
             type=int,
             default=0,
             help="The number of shared_experts need to be replicated to fuse with normal experts in deepseek v3/r1, "
-            "set it to tp_size can get best optimized performance. Note that for architectures with SM==90, we have enabled the shared experts fusion optimization by default for DeepSeek V3/R1, with n_share_experts_fusion automatically set to the TP size.",
+            "set it to tp_size can get best optimized performance. Note that for architectures with SM==90, we have enabled the shared experts fusion optimization by default for DeepSeek V3/R1, with num_fused_shared_experts automatically set to the TP size.",
         )
         parser.add_argument(
             "--disable-chunked-prefix-cache",
diff --git a/sgl-kernel/csrc/common_extension.cc b/sgl-kernel/csrc/common_extension.cc
index 0936a88c8..1baa5aa07 100644
--- a/sgl-kernel/csrc/common_extension.cc
+++ b/sgl-kernel/csrc/common_extension.cc
@@ -161,7 +161,7 @@ TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) {
 
   m.def(
       "moe_fused_gate(Tensor input, Tensor bias, int num_expert_group, int topk_group, int topk, int "
-      "n_share_experts_fusion, float routed_scaling_factor) -> "
+      "num_fused_shared_experts, float routed_scaling_factor) -> "
       "(Tensor[])");
   m.impl("moe_fused_gate", torch::kCUDA, &moe_fused_gate);
   m.def(
diff --git a/sgl-kernel/csrc/moe/moe_fused_gate.cu b/sgl-kernel/csrc/moe/moe_fused_gate.cu
index 6f0c474ba..8c938ca48 100644
--- a/sgl-kernel/csrc/moe/moe_fused_gate.cu
+++ b/sgl-kernel/csrc/moe/moe_fused_gate.cu
@@ -57,7 +57,7 @@ __device__ void moe_fused_gate_impl(
     int64_t num_rows,
     int64_t topk_group,
     int64_t topk,
-    int64_t n_share_experts_fusion,
+    int64_t num_fused_shared_experts,
     double routed_scaling_factor,
     Params params) {
   int tidx = threadIdx.x;
@@ -68,7 +68,7 @@ __device__ void moe_fused_gate_impl(
   }
 
   // Calculate topk_excluding_share_expert_fusion from topk
-  int64_t topk_excluding_share_expert_fusion = topk - (n_share_experts_fusion > 0 ? 1 : 0);
+  int64_t topk_excluding_share_expert_fusion = topk - (num_fused_shared_experts > 0 ? 1 : 0);
 
   // Cast pointers to type T:
   auto* input_ptr = reinterpret_cast<T*>(input);
@@ -222,11 +222,11 @@ __device__ void moe_fused_gate_impl(
     __syncthreads();
   }
 
-  if (thread_group_idx == 0 && n_share_experts_fusion > 0) {
+  if (thread_group_idx == 0 && num_fused_shared_experts > 0) {
     int64_t last_idx = topk * thread_row + topk_excluding_share_expert_fusion;
 
     // Use round-robin to select expert
-    int64_t expert_offset = thread_row % n_share_experts_fusion;
+    int64_t expert_offset = thread_row % num_fused_shared_experts;
     indices_ptr[last_idx] = static_cast<int32_t>(params.NUM_EXPERTS + expert_offset);
 
     // Set the weight to the sum of all weights divided by routed_scaling_factor
@@ -273,7 +273,7 @@ __global__ void moe_fused_gate_kernel(
     int64_t num_rows,
     int64_t topk_group,
     int64_t topk,
-    int64_t n_share_experts_fusion,
+    int64_t num_fused_shared_experts,
     double routed_scaling_factor) {
   KernelParams<VPT, NUM_EXPERTS, THREADS_PER_ROW, ROWS_PER_WARP, ROWS_PER_CTA, WARPS_PER_CTA> params;
   moe_fused_gate_impl<T>(
@@ -284,7 +284,7 @@ __global__ void moe_fused_gate_kernel(
       num_rows,
       topk_group,
       topk,
-      n_share_experts_fusion,
+      num_fused_shared_experts,
       routed_scaling_factor,
       params);
 }
@@ -305,7 +305,7 @@ __global__ void moe_fused_gate_kernel(
             num_rows,                                                                                    \
             topk_group,                                                                                  \
             topk,                                                                                        \
-            n_share_experts_fusion,                                                                      \
+            num_fused_shared_experts,                                                                    \
             routed_scaling_factor);                                                                      \
     dispatched = true;                                                                                   \
   } while (0)
@@ -333,7 +333,7 @@ __global__ void moe_fused_gate_kernel_dynamic(
     int64_t num_expert_group,
     int64_t topk_group,
     int64_t topk,
-    int64_t n_share_experts_fusion,
+    int64_t num_fused_shared_experts,
     double routed_scaling_factor) {
   KernelParamsDynamic params;
   params.NUM_EXPERTS = num_experts;             // e.g, for deepseek v3, this is 256
@@ -351,7 +351,7 @@ __global__ void moe_fused_gate_kernel_dynamic(
       num_rows,
       topk_group,
       topk,
-      n_share_experts_fusion,
+      num_fused_shared_experts,
       routed_scaling_factor,
       params);
 }
@@ -365,7 +365,7 @@ std::vector<at::Tensor> moe_fused_gate(
     int64_t num_expert_group,
     int64_t topk_group,
     int64_t topk,
-    int64_t n_share_experts_fusion,
+    int64_t num_fused_shared_experts,
     double routed_scaling_factor) {
   int64_t num_rows = input.size(0);
   int32_t num_experts = input.size(1);
@@ -464,7 +464,7 @@ std::vector<at::Tensor> moe_fused_gate(
           num_expert_group,
           topk_group,
           topk,
-          n_share_experts_fusion,
+          num_fused_shared_experts,
           routed_scaling_factor);
     } else if (input.scalar_type() == at::kHalf) {
       moe_fused_gate_kernel_dynamic<float16_t><<<num_blocks, block_dim, 0, stream>>>(
@@ -477,7 +477,7 @@ std::vector<at::Tensor> moe_fused_gate(
           num_expert_group,
           topk_group,
           topk,
-          n_share_experts_fusion,
+          num_fused_shared_experts,
           routed_scaling_factor);
     } else if (input.scalar_type() == at::kFloat) {
       moe_fused_gate_kernel_dynamic<float32_t><<<num_blocks, block_dim, 0, stream>>>(
@@ -490,7 +490,7 @@ std::vector<at::Tensor> moe_fused_gate(
           num_expert_group,
           topk_group,
           topk,
-          n_share_experts_fusion,
+          num_fused_shared_experts,
           routed_scaling_factor);
     } else {
       TORCH_CHECK(false, "Unsupported data type for moe_fused_gate");
diff --git a/sgl-kernel/include/sgl_kernel_ops.h b/sgl-kernel/include/sgl_kernel_ops.h
index f66e754fd..4c6a37bf8 100644
--- a/sgl-kernel/include/sgl_kernel_ops.h
+++ b/sgl-kernel/include/sgl_kernel_ops.h
@@ -206,7 +206,7 @@ std::vector<at::Tensor> moe_fused_gate(
     int64_t num_expert_group,
     int64_t topk_group,
     int64_t topk,
-    int64_t n_share_experts_fusion,
+    int64_t num_fused_shared_experts,
     double routed_scaling_factor);
 
 void fp8_blockwise_scaled_grouped_mm(
diff --git a/sgl-kernel/python/sgl_kernel/moe.py b/sgl-kernel/python/sgl_kernel/moe.py
index b9e4adcc8..0f13fc6c5 100755
--- a/sgl-kernel/python/sgl_kernel/moe.py
+++ b/sgl-kernel/python/sgl_kernel/moe.py
@@ -42,7 +42,7 @@ def moe_fused_gate(
     num_expert_group,
     topk_group,
     topk,
-    n_share_experts_fusion=0,
+    num_fused_shared_experts=0,
     routed_scaling_factor=0,
 ):
     # This fused kernel function is used to select topk expert in a hierarchical 2-layer fashion
@@ -51,7 +51,7 @@ def moe_fused_gate(
     # the #experts is decided by the input tensor shape and we currently only support power of 2 #experts
     # and #experts should be divisible by num_expert_group. #expert/num_expert_group <= 32 is limited for now.
     # for non-supported case, we suggest to use the biased_grouped_topk func in sglang.srt.layers.moe.topk
-    # n_share_experts_fusion: if > 0, the last expert will be replaced with a round-robin shared expert
+    # num_fused_shared_experts: if > 0, the last expert will be replaced with a round-robin shared expert
     # routed_scaling_factor: if > 0, the last expert will be scaled by this factor
     return torch.ops.sgl_kernel.moe_fused_gate.default(
         input_tensor,
@@ -59,7 +59,7 @@ def moe_fused_gate(
         num_expert_group,
         topk_group,
         topk,
-        n_share_experts_fusion,
+        num_fused_shared_experts,
         routed_scaling_factor,
     )
 
diff --git a/sgl-kernel/tests/test_moe_fused_gate.py b/sgl-kernel/tests/test_moe_fused_gate.py
index 09787224b..c6faf5fc4 100644
--- a/sgl-kernel/tests/test_moe_fused_gate.py
+++ b/sgl-kernel/tests/test_moe_fused_gate.py
@@ -19,15 +19,15 @@ from sglang.srt.layers.moe.topk import biased_grouped_topk
         (512, 16, 8, 16),
     ],
 )
-@pytest.mark.parametrize("n_share_experts_fusion", [0, 1, 8, 16])
-def test_moe_fused_gate_combined(seq_length, dtype, params, n_share_experts_fusion):
+@pytest.mark.parametrize("num_fused_shared_experts", [0, 1])
+def test_moe_fused_gate_combined(seq_length, dtype, params, num_fused_shared_experts):
     num_experts, num_expert_group, topk_group, topk = params
 
     torch.manual_seed(seq_length)
     tensor = torch.rand((seq_length, num_experts)).to(dtype).cuda()
     scores = tensor.clone()
     bias = torch.rand(num_experts).to(dtype).cuda()
-    topk = topk + min(1, n_share_experts_fusion)
+    topk = topk + min(1, num_fused_shared_experts)
 
     output, indices = moe_fused_gate(
         tensor,
@@ -35,7 +35,7 @@ def test_moe_fused_gate_combined(seq_length, dtype, params, n_share_experts_fusi
         num_expert_group=num_expert_group,
         topk_group=topk_group,
         topk=topk,
-        n_share_experts_fusion=n_share_experts_fusion,
+        num_fused_shared_experts=num_fused_shared_experts,
         routed_scaling_factor=2.5,
     )
     ref_output, ref_indices = biased_grouped_topk(
@@ -47,12 +47,12 @@ def test_moe_fused_gate_combined(seq_length, dtype, params, n_share_experts_fusi
         num_expert_group=num_expert_group,
         topk_group=topk_group,
         compiled=False,
-        n_share_experts_fusion=n_share_experts_fusion,
+        num_fused_shared_experts=num_fused_shared_experts,
         routed_scaling_factor=2.5,
     )
 
-    # When n_share_experts_fusion > 0, ignore the comparison of the last topk dimension
-    if n_share_experts_fusion > 0:
+    # When num_fused_shared_experts > 0, ignore the comparison of the last topk dimension
+    if num_fused_shared_experts > 0:
         original_indices = indices.clone()
         original_ref_indices = ref_indices.clone()
 
@@ -60,7 +60,7 @@ def test_moe_fused_gate_combined(seq_length, dtype, params, n_share_experts_fusi
         ref_indices = ref_indices[:, :-1]
 
         valid_min = num_experts
-        valid_max = num_experts + n_share_experts_fusion
+        valid_max = num_experts + num_fused_shared_experts
         shared_indices = original_indices[:, -1]
         shared_ref_indices = original_ref_indices[:, -1]
         if shared_indices is not None:
@@ -87,11 +87,11 @@ def test_moe_fused_gate_combined(seq_length, dtype, params, n_share_experts_fusi
 
     assert idx_check, (
         f"Indices mismatch at seq_length {seq_length}, dtype {dtype}, "
-        f"params {params}, n_share_experts_fusion {n_share_experts_fusion}"
+        f"params {params}, num_fused_shared_experts {num_fused_shared_experts}"
     )
     assert output_check, (
         f"Output mismatch at seq_length {seq_length}, dtype {dtype}, "
-        f"params {params}, n_share_experts_fusion {n_share_experts_fusion}"
+        f"params {params}, num_fused_shared_experts {num_fused_shared_experts}"
     )