[Auto Sync] Update modelopt_quant.py (20250920) (#10688)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
2025-09-20 02:37:49 -07:00
parent 7c876de7f5
commit f1d7892318
1 changed files with 22 additions and 8 deletions
--- a/python/sglang/srt/layers/quantization/modelopt_quant.py
+++ b/python/sglang/srt/layers/quantization/modelopt_quant.py
@@ -77,6 +77,9 @@ logger = logging.getLogger(__name__)
 CUTEDSL_MOE_SCALAR_INPUT_SCALE = get_bool_env_var(
    "SGLANG_CUTEDSL_MOE_SCALAR_INPUT_SCALE", "true"
 )
+USE_CUTLASS_BACKEND_FOR_FP4_GEMM = get_bool_env_var(
+    "SGLANG_USE_CUTLASS_BACKEND_FOR_FP4_GEMM"
+)

 # Supported activation schemes for the current configuration
 ACTIVATION_SCHEMES = ["static"]
@@ -844,14 +847,25 @@ class ModelOptFp4LinearMethod(LinearMethodBase):
        if enable_flashinfer_fp4_gemm:
            w = layer.weight.T
            w_scale_interleaved = layer.weight_scale_interleaved.T
-        out = fp4_gemm(
-            x_fp4,
-            w,
-            x_scale_interleaved,
-            w_scale_interleaved,
-            layer.alpha,
-            output_dtype,
-        )
+        if USE_CUTLASS_BACKEND_FOR_FP4_GEMM:
+            out = fp4_gemm(
+                x_fp4,
+                w,
+                x_scale_interleaved,
+                w_scale_interleaved,
+                layer.alpha,
+                output_dtype,
+                backend="cutlass",
+            )
+        else:
+            out = fp4_gemm(
+                x_fp4,
+                w,
+                x_scale_interleaved,
+                w_scale_interleaved,
+                layer.alpha,
+                output_dtype,
+            )
        if bias is not None:
            out = out + bias
        return out.view(*output_shape)