[NVIDIA] [3/N] Nvfp4 Masked Gemm: Add flashinfer grouped_gemm_nt_masked (#9199)

2025-09-11 22:18:43 -05:00
parent 7b141f816c
commit 3df05f4d6a
11 changed files with 694 additions and 5 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -399,6 +399,7 @@ class ServerArgs:
    enable_ep_moe: bool = False
    enable_deepep_moe: bool = False
    enable_flashinfer_cutlass_moe: bool = False
+    enable_flashinfer_cutedsl_moe: bool = False
    enable_flashinfer_trtllm_moe: bool = False
    enable_triton_kernel_moe: bool = False
    enable_flashinfer_mxfp4_moe: bool = False
@@ -420,6 +421,11 @@ class ServerArgs:
            print_deprecated_warning(
                "NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead."
            )
+        if self.enable_flashinfer_cutedsl_moe:
+            self.moe_runner_backend = "flashinfer_cutedsl"
+            print_deprecated_warning(
+                "NOTE: --enable-flashinfer-cutedsl-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutedsl' instead."
+            )
        if self.enable_flashinfer_cutlass_moe:
            self.moe_runner_backend = "flashinfer_cutlass"
            print_deprecated_warning(
@@ -1622,6 +1628,7 @@ class ServerArgs:
                "flashinfer_trtllm",
                "flashinfer_cutlass",
                "flashinfer_mxfp4",
+                "flashinfer_cutedsl",
            ],
            default=ServerArgs.moe_runner_backend,
            help="Choose the runner backend for MoE.",
@@ -2204,6 +2211,11 @@ class ServerArgs:
            action="store_true",
            help="(Deprecated) Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
        )
+        parser.add_argument(
+            "--enable-flashinfer-cutedsl-moe",
+            action="store_true",
+            help="(Deprecated) Enable FlashInfer CuteDSL MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
+        )
        parser.add_argument(
            "--enable-flashinfer-trtllm-moe",
            action="store_true",