[NVIDIA] Enable Flashinfer MoE blockscale fp8 backend for TP MoE (#8450)

Co-authored-by: kushanam <42385577+kushanam@users.noreply.github.com>
Author: Kaixi Hou
Date: 2025-07-31 19:56:34 -07:00
Committed by: GitHub
Parent: 39decec10b
Commit: aa4c66b564
6 changed files with 131 additions and 46 deletions


@@ -460,10 +460,6 @@ class ServerArgs:
f"Flashinfer cutlass MoE and EP MoE are enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
)
if self.enable_flashinfer_trtllm_moe:
assert self.enable_ep_moe, "EP MoE is required for Flashinfer TRTLLM MOE"
logger.warning(f"Flashinfer TRTLLM MoE is enabled.")
# DeepEP MoE
if self.enable_deepep_moe:
if self.deepep_mode == "normal":
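The removed check above couples the Flashinfer TRTLLM MoE flag to expert parallelism. A minimal sketch of that validation pattern, using a simplified stand-in `ServerArgs` (the dataclass fields and `check_moe_args` helper here are hypothetical; only the assertion and warning text come from the diff):

```python
import logging
from dataclasses import dataclass

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)


@dataclass
class ServerArgs:
    # Simplified stand-in for the real ServerArgs; fields are assumptions.
    tp_size: int = 1
    enable_ep_moe: bool = False
    enable_flashinfer_trtllm_moe: bool = False

    def check_moe_args(self) -> None:
        # Mirrors the (removed) check in the diff: the Flashinfer TRTLLM
        # MoE path requires expert parallelism to be enabled first.
        if self.enable_flashinfer_trtllm_moe:
            assert self.enable_ep_moe, "EP MoE is required for Flashinfer TRTLLM MOE"
            logger.warning("Flashinfer TRTLLM MoE is enabled.")


# Valid configuration: both flags on, so the check passes with a warning.
args = ServerArgs(enable_ep_moe=True, enable_flashinfer_trtllm_moe=True)
args.check_moe_args()
```

With `enable_flashinfer_trtllm_moe=True` but `enable_ep_moe=False`, the assertion fails at startup instead of producing a confusing runtime error deep in the MoE kernels.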