[2/2] Fuse routed scaling factor into select_experts (#8690)

2025-08-20 15:10:16 -07:00
parent f96413c444
commit a91e90d9a3
6 changed files with 55 additions and 25 deletions
--- a/python/sglang/srt/layers/quantization/modelopt_quant.py
+++ b/python/sglang/srt/layers/quantization/modelopt_quant.py
@@ -1305,8 +1305,7 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
                tp_rank=layer.moe_tp_rank,
                tune_max_num_tokens=next_power_of_2(x.shape[0]),
            )[0]
-            if moe_runner_config.routed_scaling_factor is not None:
-                output *= moe_runner_config.routed_scaling_factor
+            # Scaling by routed_scaling_factor is fused into select_experts.
            if should_use_flashinfer_cutlass_moe_fp4_allgather():
                output, global_output = get_local_dp_buffer(), output
                get_tp_group().reduce_scatterv(
@@ -1332,6 +1331,5 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
            params=layer.cutlass_moe_params,
            apply_router_weight_on_input=moe_runner_config.apply_router_weight_on_input,
        ).to(x.dtype)
-        if moe_runner_config.routed_scaling_factor is not None:
-            output *= moe_runner_config.routed_scaling_factor
+        # Scaling by routed_scaling_factor is fused into select_experts.
        return output