Set num_fused_shared_experts as num_shared_experts when shared_experts fusion is not disabled (#6736)

2025-06-04 15:53:22 -07:00
parent f0f84975f4
commit 81964328b7
22 changed files with 381 additions and 45 deletions
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -272,6 +272,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
        use_grouped_topk: bool = False,
        topk_group: Optional[int] = None,
        num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
        global_num_experts: int = -1,
        expert_map: Optional[torch.Tensor] = None,
        custom_routing_function: Optional[Callable] = None,
@@ -294,6 +295,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
            renormalize=renormalize,
            topk_group=topk_group,
            num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
            custom_routing_function=custom_routing_function,
            correction_bias=correction_bias,
            routed_scaling_factor=routed_scaling_factor,
@@ -627,6 +629,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
        use_grouped_topk: bool = False,
        topk_group: Optional[int] = None,
        num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
        global_num_experts: int = -1,
        expert_map: Optional[torch.Tensor] = None,
        custom_routing_function: Optional[Callable] = None,
@@ -651,6 +654,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
            renormalize=renormalize,
            topk_group=topk_group,
            num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
            custom_routing_function=custom_routing_function,
            scoring_func=scoring_func,
            correction_bias=correction_bias,