[NVIDIA] Add Low Latency NVFP4 decode kernels from Flashinfer (#8552)

Co-authored-by: Cheng Wan <cwan@x.ai>
2025-08-04 03:10:02 -07:00
parent 36fc9260a2
commit 915140fd18
8 changed files with 504 additions and 117 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -481,6 +481,13 @@ class ServerArgs:
                self.tp_size,
            ], "The expert parallel size must be 1 or the same as the tensor parallel size"

+        if self.enable_flashinfer_trtllm_moe:
+            if not self.disable_shared_experts_fusion:
+                self.disable_shared_experts_fusion = True
+                logger.warning(
+                    "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
+                )
+
        # DeepEP MoE
        if self.moe_a2a_backend == "deepep":
            if self.deepep_mode == "normal":