From 89caf7a3c6cde7a6d01f9fa1dd362885092a0a87 Mon Sep 17 00:00:00 2001
From: Trevor Morris
Date: Fri, 1 Aug 2025 19:00:24 -0700
Subject: [PATCH] [bugfix] Apply routed scaling factor to cutlass_fused_experts_fp8 (#8688)

---
 python/sglang/srt/layers/quantization/fp8.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/quantization/fp8.py b/python/sglang/srt/layers/quantization/fp8.py
index 0578ee60c..1b0824051 100644
--- a/python/sglang/srt/layers/quantization/fp8.py
+++ b/python/sglang/srt/layers/quantization/fp8.py
@@ -1039,7 +1039,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             from sglang.srt.layers.moe.cutlass_moe import cutlass_fused_experts_fp8
 
             topk_weights, topk_ids, _ = topk_output
-            return cutlass_fused_experts_fp8(
+            output = cutlass_fused_experts_fp8(
                 x,
                 layer.w13_weight.transpose(1, 2),
                 layer.w2_weight.transpose(1, 2),
@@ -1062,6 +1062,10 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 self.problem_sizes2,
                 use_fp8_blockscale=True,
             )
+            # TODO: Fuse into select_experts
+            if routed_scaling_factor is not None:
+                output *= routed_scaling_factor
+            return output
         # Expert fusion with FP8 quantization
         return fused_experts(
             x,