[NVIDIA] Add Flashinfer MoE blockscale fp8 backend (#8036)

Author: Kaixi Hou
Date: 2025-07-27 00:34:41 -07:00
Committed by: GitHub
Parent: e34cf6ad75
Commit: 85486b6f6f
8 changed files with 179 additions and 47 deletions

python/sglang/srt/server_args.py
@@ -169,7 +169,8 @@ class ServerArgs:
     ep_size: int = 1
     enable_ep_moe: bool = False
     enable_deepep_moe: bool = False
-    enable_flashinfer_moe: bool = False
+    enable_flashinfer_cutlass_moe: bool = False
+    enable_flashinfer_trtllm_moe: bool = False
     enable_flashinfer_allreduce_fusion: bool = False
     deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
     ep_num_redundant_experts: int = 0
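
The single enable_flashinfer_moe flag is split into two independent backends: CUTLASS (FP4) and TRTLLM (blockscale FP8). As a rough sketch of how downstream code could dispatch on the new flags (illustrative only; select_moe_backend is a hypothetical helper, not part of this commit):

    def select_moe_backend(args) -> str:
        # Hypothetical dispatch over the flags introduced above.
        if args.enable_flashinfer_trtllm_moe:
            return "flashinfer_trtllm"   # blockscale FP8 path; requires EP MoE
        if args.enable_flashinfer_cutlass_moe:
            return "flashinfer_cutlass"  # modelopt_fp4 path on Blackwell
        if args.enable_deepep_moe:
            return "deepep"
        return "triton"                  # default fused MoE path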
@@ -428,12 +429,16 @@ class ServerArgs:
         ), "Please enable dp attention when setting enable_dp_lm_head. "
 
         # MoE kernel
-        if self.enable_flashinfer_moe:
+        if self.enable_flashinfer_cutlass_moe:
             assert (
                 self.quantization == "modelopt_fp4"
             ), "modelopt_fp4 quantization is required for Flashinfer MOE"
             os.environ["TRTLLM_ENABLE_PDL"] = "1"
 
+        if self.enable_flashinfer_trtllm_moe:
+            assert self.enable_ep_moe, "EP MoE is required for Flashinfer TRTLLM MOE"
+            logger.warning(f"Flashinfer TRTLLM MoE is enabled.")
+
         # DeepEP MoE
         if self.enable_deepep_moe:
             if self.deepep_mode == "normal":
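
The two checks encode the supported configurations: the CUTLASS backend requires modelopt_fp4 quantization and turns on PDL (CUDA programmatic dependent launch) via TRTLLM_ENABLE_PDL=1, while the TRTLLM backend requires expert parallelism. A minimal self-contained sketch of the same constraints, with a stand-in dataclass instead of the real ServerArgs:

    import os
    from dataclasses import dataclass

    @dataclass
    class Args:  # stand-in for ServerArgs; only the fields used here
        quantization: str = ""
        enable_ep_moe: bool = False
        enable_flashinfer_cutlass_moe: bool = False
        enable_flashinfer_trtllm_moe: bool = False

    def validate(args: Args) -> None:
        if args.enable_flashinfer_cutlass_moe:
            assert (
                args.quantization == "modelopt_fp4"
            ), "modelopt_fp4 quantization is required for Flashinfer MOE"
            os.environ["TRTLLM_ENABLE_PDL"] = "1"
        if args.enable_flashinfer_trtllm_moe:
            assert args.enable_ep_moe, "EP MoE is required for Flashinfer TRTLLM MOE"

    validate(Args(enable_flashinfer_trtllm_moe=True, enable_ep_moe=True))  # passes
    validate(Args(enable_flashinfer_trtllm_moe=True))  # raises: EP MoE is required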
@@ -1293,10 +1298,15 @@ class ServerArgs:
             help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
         )
         parser.add_argument(
-            "--enable-flashinfer-moe",
+            "--enable-flashinfer-cutlass-moe",
             action="store_true",
             help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP with --enable-ep-moe",
         )
+        parser.add_argument(
+            "--enable-flashinfer-trtllm-moe",
+            action="store_true",
+            help="Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP with --enable-ep-moe",
+        )
         parser.add_argument(
             "--enable-flashinfer-allreduce-fusion",
             action="store_true",
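
Hypothetical launch lines exercising the renamed and new flags (model paths are placeholders; --model-path, --quantization, and --ep-size are existing sglang options):

    # Blockscale FP8 MoE through the new TRTLLM backend (EP MoE is mandatory):
    python -m sglang.launch_server --model-path <fp8-blockscale-moe-model> \
        --enable-ep-moe --ep-size 8 --enable-flashinfer-trtllm-moe

    # FP4 MoE through the renamed CUTLASS backend:
    python -m sglang.launch_server --model-path <modelopt-fp4-moe-model> \
        --quantization modelopt_fp4 --enable-flashinfer-cutlass-moe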