FlashInfer NVFP4 MoE with EP & 2-stream shared expert (#7327)

Co-authored-by: JieXin Liang <Alcanderian@users.noreply.github.com>
Co-authored-by: alcanderian <alcanderian@gmail.com>
2025-06-22 13:38:47 -07:00
parent edc21cc8ae
commit 5962e70d8d
6 changed files with 182 additions and 20 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -152,6 +152,7 @@ class ServerArgs:
    ep_size: int = 1
    enable_ep_moe: bool = False
    enable_deepep_moe: bool = False
+    enable_flashinfer_moe: bool = False
    deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
    ep_num_redundant_experts: int = 0
    ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
@@ -244,7 +245,15 @@ class ServerArgs:
            logger.warning(
                f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
            )
-
+        if self.enable_flashinfer_moe:
+            assert (
+                self.quantization == "modelopt_fp4"
+            ), "modelopt_fp4 quantization is required for Flashinfer MOE"
+            os.environ["TRTLLM_ENABLE_PDL"] = "1"
+            self.disable_shared_experts_fusion = True
+            logger.warning(
+                f"Flashinfer MoE is enabled. Shared expert fusion is disabled."
+            )
        # Set missing default values
        if self.tokenizer_path is None:
            self.tokenizer_path = self.model_path
@@ -1162,6 +1171,11 @@ class ServerArgs:
            action="store_true",
            help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
        )
+        parser.add_argument(
+            "--enable-flashinfer-moe",
+            action="store_true",
+            help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP with --enable-ep-moe",
+        )
        parser.add_argument(
            "--enable-deepep-moe",
            action="store_true",