FlashInfer NVFP4 MoE with EP & 2-stream shared expert (#7327)

Co-authored-by: JieXin Liang <Alcanderian@users.noreply.github.com>
Co-authored-by: alcanderian <alcanderian@gmail.com>
This commit is contained in:
Trevor Morris
2025-06-22 13:38:47 -07:00
committed by GitHub
parent edc21cc8ae
commit 5962e70d8d
6 changed files with 182 additions and 20 deletions

View File

@@ -152,6 +152,7 @@ class ServerArgs:
ep_size: int = 1
enable_ep_moe: bool = False
enable_deepep_moe: bool = False
enable_flashinfer_moe: bool = False
deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
ep_num_redundant_experts: int = 0
ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
@@ -244,7 +245,15 @@ class ServerArgs:
logger.warning(
f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
)
if self.enable_flashinfer_moe:
assert (
self.quantization == "modelopt_fp4"
), "modelopt_fp4 quantization is required for Flashinfer MOE"
os.environ["TRTLLM_ENABLE_PDL"] = "1"
self.disable_shared_experts_fusion = True
logger.warning(
f"Flashinfer MoE is enabled. Shared expert fusion is disabled."
)
# Set missing default values
if self.tokenizer_path is None:
self.tokenizer_path = self.model_path
@@ -1162,6 +1171,11 @@ class ServerArgs:
action="store_true",
help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
)
parser.add_argument(
"--enable-flashinfer-moe",
action="store_true",
help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP with --enable-ep-moe",
)
parser.add_argument(
"--enable-deepep-moe",
action="store_true",