From f1d789231896da438749b395f7bf007a5b0819c0 Mon Sep 17 00:00:00 2001
From: Yineng Zhang
Date: Sat, 20 Sep 2025 02:37:49 -0700
Subject: [PATCH] [Auto Sync] Update modelopt_quant.py (20250920) (#10688)

Co-authored-by: github-actions[bot]
---
 .../srt/layers/quantization/modelopt_quant.py | 30 ++++++++++++++-----
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/python/sglang/srt/layers/quantization/modelopt_quant.py b/python/sglang/srt/layers/quantization/modelopt_quant.py
index 0ab963396..d72526a61 100755
--- a/python/sglang/srt/layers/quantization/modelopt_quant.py
+++ b/python/sglang/srt/layers/quantization/modelopt_quant.py
@@ -77,6 +77,9 @@ logger = logging.getLogger(__name__)
 CUTEDSL_MOE_SCALAR_INPUT_SCALE = get_bool_env_var(
     "SGLANG_CUTEDSL_MOE_SCALAR_INPUT_SCALE", "true"
 )
+USE_CUTLASS_BACKEND_FOR_FP4_GEMM = get_bool_env_var(
+    "SGLANG_USE_CUTLASS_BACKEND_FOR_FP4_GEMM"
+)
 
 # Supported activation schemes for the current configuration
 ACTIVATION_SCHEMES = ["static"]
@@ -844,14 +847,25 @@ class ModelOptFp4LinearMethod(LinearMethodBase):
         if enable_flashinfer_fp4_gemm:
             w = layer.weight.T
             w_scale_interleaved = layer.weight_scale_interleaved.T
-            out = fp4_gemm(
-                x_fp4,
-                w,
-                x_scale_interleaved,
-                w_scale_interleaved,
-                layer.alpha,
-                output_dtype,
-            )
+            if USE_CUTLASS_BACKEND_FOR_FP4_GEMM:
+                out = fp4_gemm(
+                    x_fp4,
+                    w,
+                    x_scale_interleaved,
+                    w_scale_interleaved,
+                    layer.alpha,
+                    output_dtype,
+                    backend="cutlass",
+                )
+            else:
+                out = fp4_gemm(
+                    x_fp4,
+                    w,
+                    x_scale_interleaved,
+                    w_scale_interleaved,
+                    layer.alpha,
+                    output_dtype,
+                )
         if bias is not None:
             out = out + bias
         return out.view(*output_shape)