[NVIDIA] Add Low Latency NVFP4 decode kernels from Flashinfer (#8552)

Co-authored-by: Cheng Wan <cwan@x.ai>
This commit is contained in:
azhurkevich
2025-08-04 03:10:02 -07:00
committed by GitHub
parent 36fc9260a2
commit 915140fd18
8 changed files with 504 additions and 117 deletions

View File

@@ -481,6 +481,13 @@ class ServerArgs:
self.tp_size,
], "The expert parallel size must be 1 or the same as the tensor parallel size"
if self.enable_flashinfer_trtllm_moe:
if not self.disable_shared_experts_fusion:
self.disable_shared_experts_fusion = True
logger.warning(
"FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
)
# DeepEP MoE
if self.moe_a2a_backend == "deepep":
if self.deepep_mode == "normal":