diff --git a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py index 6e9a5f35c..5f219739c 100644 --- a/python/sglang/srt/layers/moe/fused_moe_triton/layer.py +++ b/python/sglang/srt/layers/moe/fused_moe_triton/layer.py @@ -1074,16 +1074,3 @@ class FlashInferFP4MoE(FusedMoE): )[0] return result - - -def get_fused_moe_impl_class(): - """Factory function to get the appropriate FusedMoE implementation class.""" - if should_use_flashinfer_trtllm_moe() and _is_fp4_quantization_enabled(): - # Use FP4 variant when FP4 quantization is enabled - return FlashInferFP4MoE - elif should_use_flashinfer_trtllm_moe(): - # Use regular FlashInfer variant for non-FP4 FlashInfer cases - return FlashInferFusedMoE - else: - # Default case - return FusedMoE diff --git a/python/sglang/srt/layers/quantization/fp8.py b/python/sglang/srt/layers/quantization/fp8.py index 89938f4c3..31a2c2eb2 100644 --- a/python/sglang/srt/layers/quantization/fp8.py +++ b/python/sglang/srt/layers/quantization/fp8.py @@ -635,11 +635,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): layer.register_parameter("w13_weight_scale_inv", w13_weight_scale) layer.register_parameter("w2_weight_scale_inv", w2_weight_scale) assert self.quant_config.activation_scheme == "dynamic" - if ( - get_bool_env_var("SGLANG_CUTLASS_MOE") - and self.cutlass_fp8_supported - and (is_sm100_supported() or is_sm90_supported()) - ): + if self.use_cutlass_fused_experts_fp8: self.ab_strides1 = torch.full( (num_experts,), hidden_size,