Add enable_flashinfer_mxfp4_bf16_moe for higher precision and slower moe backend (#9004)

This commit is contained in:
fzyzcjy
2025-08-23 15:38:40 +08:00
committed by GitHub
parent 127d4b0d5e
commit 0374304a2c
3 changed files with 37 additions and 5 deletions

View File

@@ -190,6 +190,7 @@ class ServerArgs:
"flashinfer_cutlass",
"flashinfer_mxfp4",
] = "auto"
flashinfer_mxfp4_moe_precision: Literal["default", "bf16"] = "default"
enable_flashinfer_allreduce_fusion: bool = False
deepep_mode: Literal["auto", "normal", "low_latency"] = "auto"
ep_num_redundant_experts: int = 0
@@ -1496,10 +1497,18 @@ class ServerArgs:
"triton_kernel",
"flashinfer_trtllm",
"flashinfer_cutlass",
"flashinfer_mxfp4",
],
default=ServerArgs.moe_runner_backend,
help="Choose the runner backend for MoE.",
)
parser.add_argument(
    "--flashinfer-mxfp4-moe-precision",
    type=str,
    # Must agree with ServerArgs.flashinfer_mxfp4_moe_precision, which is
    # declared as Literal["default", "bf16"] with default "default".
    # The previous choices list ["mxfp4", "bf16"] made the declared default
    # unusable on the command line (argparse rejects values outside choices)
    # and admitted "mxfp4", which the Literal annotation does not.
    choices=["default", "bf16"],
    default=ServerArgs.flashinfer_mxfp4_moe_precision,
    help="Choose the computation precision of flashinfer mxfp4 moe",
)
parser.add_argument(
"--enable-flashinfer-allreduce-fusion",
action="store_true",