[Bugfix] Fix weight loading for the original nvidia/Deepseek-R1-FP4 checkpoint (#9940)
Signed-off-by: Pavani Majety <pmajety@nvidia.com> Co-authored-by: Yineng Zhang <me@zhyncs.com> Co-authored-by: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com>
This commit is contained in:
@@ -654,11 +654,13 @@ class ServerArgs:
|
||||
], "The expert parallel size must be 1 or the same as the tensor parallel size"
|
||||
|
||||
if self.moe_runner_backend == "flashinfer_trtllm":
|
||||
if not self.disable_shared_experts_fusion:
|
||||
self.disable_shared_experts_fusion = True
|
||||
logger.warning(
|
||||
"FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
|
||||
)
|
||||
assert (
|
||||
self.quantization == "modelopt_fp4" or self.quantization == "fp8"
|
||||
), "modelopt_fp4 quantization is required for Flashinfer TRTLLM MoE"
|
||||
self.disable_shared_experts_fusion = True
|
||||
logger.warning(
|
||||
"FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
|
||||
)
|
||||
|
||||
# DeepEP MoE
|
||||
if self.moe_a2a_backend == "deepep":
|
||||
|
||||
Reference in New Issue
Block a user