[NVIDIA] Add Flashinfer MoE blockscale fp8 backend (#8036)

Author: Kaixi Hou
Date: 2025-07-27 00:34:41 -07:00
Committed by: GitHub
Parent: e34cf6ad75
Commit: 85486b6f6f
8 changed files with 179 additions and 47 deletions

python/sglang/srt/server_args.py
@@ -169,7 +169,8 @@ class ServerArgs:
     ep_size: int = 1
     enable_ep_moe: bool = False
     enable_deepep_moe: bool = False
-    enable_flashinfer_moe: bool = False
+    enable_flashinfer_cutlass_moe: bool = False
+    enable_flashinfer_trtllm_moe: bool = False
     enable_flashinfer_allreduce_fusion: bool = False
     deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
     ep_num_redundant_experts: int = 0
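
The single enable_flashinfer_moe flag is split into two independent backends: CUTLASS (FP4) and TRTLLM (blockscale FP8). As a rough sketch of how downstream code could dispatch on the new flags (illustrative only; select_moe_backend is a hypothetical helper, not part of this commit):

    def select_moe_backend(args) -> str:
        # Hypothetical dispatch over the flags introduced above.
        if args.enable_flashinfer_trtllm_moe:
            return "flashinfer_trtllm"   # blockscale FP8 path; requires EP MoE
        if args.enable_flashinfer_cutlass_moe:
            return "flashinfer_cutlass"  # modelopt_fp4 path on Blackwell
        if args.enable_deepep_moe:
            return "deepep"
        return "triton"                  # default fused MoE path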
@@ -428,12 +429,16 @@ class ServerArgs:
         ), "Please enable dp attention when setting enable_dp_lm_head. "
 
         # MoE kernel
-        if self.enable_flashinfer_moe:
+        if self.enable_flashinfer_cutlass_moe:
             assert (
                 self.quantization == "modelopt_fp4"
             ), "modelopt_fp4 quantization is required for Flashinfer MOE"
             os.environ["TRTLLM_ENABLE_PDL"] = "1"
 
+        if self.enable_flashinfer_trtllm_moe:
+            assert self.enable_ep_moe, "EP MoE is required for Flashinfer TRTLLM MOE"
+            logger.warning(f"Flashinfer TRTLLM MoE is enabled.")
+
         # DeepEP MoE
         if self.enable_deepep_moe:
             if self.deepep_mode == "normal":
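
The two checks encode the supported configurations: the CUTLASS backend requires modelopt_fp4 quantization and turns on PDL (CUDA programmatic dependent launch) via TRTLLM_ENABLE_PDL=1, while the TRTLLM backend requires expert parallelism. A minimal self-contained sketch of the same constraints, with a stand-in dataclass instead of the real ServerArgs:

    import os
    from dataclasses import dataclass

    @dataclass
    class Args:  # stand-in for ServerArgs; only the fields used here
        quantization: str = ""
        enable_ep_moe: bool = False
        enable_flashinfer_cutlass_moe: bool = False
        enable_flashinfer_trtllm_moe: bool = False

    def validate(args: Args) -> None:
        if args.enable_flashinfer_cutlass_moe:
            assert (
                args.quantization == "modelopt_fp4"
            ), "modelopt_fp4 quantization is required for Flashinfer MOE"
            os.environ["TRTLLM_ENABLE_PDL"] = "1"
        if args.enable_flashinfer_trtllm_moe:
            assert args.enable_ep_moe, "EP MoE is required for Flashinfer TRTLLM MOE"

    validate(Args(enable_flashinfer_trtllm_moe=True, enable_ep_moe=True))  # passes
    validate(Args(enable_flashinfer_trtllm_moe=True))  # raises: EP MoE is required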
@@ -1293,10 +1298,15 @@ class ServerArgs:
             help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
         )
         parser.add_argument(
-            "--enable-flashinfer-moe",
+            "--enable-flashinfer-cutlass-moe",
             action="store_true",
             help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP with --enable-ep-moe",
         )
+        parser.add_argument(
+            "--enable-flashinfer-trtllm-moe",
+            action="store_true",
+            help="Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP with --enable-ep-moe",
+        )
         parser.add_argument(
             "--enable-flashinfer-allreduce-fusion",
             action="store_true",
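
Hypothetical launch lines exercising the renamed and new flags (model paths are placeholders; --model-path, --quantization, and --ep-size are existing sglang options):

    # Blockscale FP8 MoE through the new TRTLLM backend (EP MoE is mandatory):
    python -m sglang.launch_server --model-path <fp8-blockscale-moe-model> \
        --enable-ep-moe --ep-size 8 --enable-flashinfer-trtllm-moe

    # FP4 MoE through the renamed CUTLASS backend:
    python -m sglang.launch_server --model-path <modelopt-fp4-moe-model> \
        --quantization modelopt_fp4 --enable-flashinfer-cutlass-moe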