diff --git a/python/sglang/srt/layers/quantization/modelopt_quant.py b/python/sglang/srt/layers/quantization/modelopt_quant.py
index 73de5b0d1..9087f79b0 100644
--- a/python/sglang/srt/layers/quantization/modelopt_quant.py
+++ b/python/sglang/srt/layers/quantization/modelopt_quant.py
@@ -952,7 +952,6 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
         tp_rank: Optional[int] = None,
         tp_size: Optional[int] = None,
     ) -> torch.Tensor:
-        assert activation == "silu", "Only SiLU activation is supported."
 
         if self.enable_flashinfer_moe:
 
@@ -982,13 +981,15 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
                 tp_size=tp_size,
                 tp_rank=tp_rank,
                 tune_max_num_tokens=next_power_of_2(x.shape[0]),
-            )
-            return output[0]
+            )[0]
+            if routed_scaling_factor is not None:
+                output *= routed_scaling_factor
+            return output
 
         from sglang.srt.layers.moe.cutlass_moe import cutlass_moe_fp4
 
         topk_weights, topk_ids, _ = topk_output
-        return cutlass_moe_fp4(
+        output = cutlass_moe_fp4(
             a=x,
             a1_gscale=layer.w13_input_scale_quant,
             w1_fp4=layer.w13_weight,
@@ -1003,3 +1004,6 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
             params=layer.cutlass_moe_params,
             apply_router_weight_on_input=apply_router_weight_on_input,
         ).to(x.dtype)
+        if routed_scaling_factor is not None:
+            output *= routed_scaling_factor
+        return output
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 107c63646..6fec17bc0 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -433,10 +433,6 @@ class ServerArgs:
                 self.quantization == "modelopt_fp4"
             ), "modelopt_fp4 quantization is required for Flashinfer MOE"
             os.environ["TRTLLM_ENABLE_PDL"] = "1"
-            self.disable_shared_experts_fusion = True
-            logger.warning(
-                f"Flashinfer MoE is enabled. Shared expert fusion is disabled."
-            )
 
         # DeepEP MoE
         if self.enable_deepep_moe: