[Bugfix] Fix weight loading for the original nvidia/Deepseek-R1-FP4 checkpoint (#9940)

Signed-off-by: Pavani Majety <pmajety@nvidia.com>
Co-authored-by: Yineng Zhang <me@zhyncs.com>
Co-authored-by: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com>
This commit is contained in:
Pavani Majety
2025-09-10 12:00:23 -07:00
committed by GitHub
parent 941002945b
commit 21176b0093
2 changed files with 19 additions and 8 deletions

View File

@@ -654,11 +654,13 @@ class ServerArgs:
], "The expert parallel size must be 1 or the same as the tensor parallel size"
if self.moe_runner_backend == "flashinfer_trtllm":
if not self.disable_shared_experts_fusion:
self.disable_shared_experts_fusion = True
logger.warning(
"FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
)
assert (
self.quantization == "modelopt_fp4" or self.quantization == "fp8"
), "modelopt_fp4 quantization is required for Flashinfer TRTLLM MoE"
self.disable_shared_experts_fusion = True
logger.warning(
"FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
)
# DeepEP MoE
if self.moe_a2a_backend == "deepep":