simplify the control logic for using shared experts fusion (#5504)

2025-04-20 04:17:35 +08:00
parent bf86c5e990
commit d58e354472
16 changed files with 69 additions and 54 deletions
--- a/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -283,6 +283,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
        inplace: bool = True,
        no_combine: bool = False,
        apply_router_weight_on_input: bool = False,
+        routed_scaling_factor: Optional[float] = None,
    ) -> torch.Tensor:
        from sglang.srt.layers.moe.fused_moe_triton import fused_experts
        from sglang.srt.layers.moe.topk import select_experts
@@ -297,6 +298,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
            num_expert_group=num_expert_group,
            custom_routing_function=custom_routing_function,
            correction_bias=correction_bias,
+            routed_scaling_factor=routed_scaling_factor,
        )

        return fused_experts(
@@ -633,6 +635,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
        scoring_func: str = "softmax",
        correction_bias: Optional[torch.Tensor] = None,
        activation: str = "silu",
+        routed_scaling_factor: Optional[float] = None,
    ) -> torch.Tensor:
        from sglang.srt.layers.moe.topk import select_experts

@@ -653,6 +656,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
            custom_routing_function=custom_routing_function,
            scoring_func=scoring_func,
            correction_bias=correction_bias,
+            routed_scaling_factor=routed_scaling_factor,
        )

        return torch.ops.vllm.fused_marlin_moe(