[b200] support trt-llm allreduce fused rms_norm_add kernel (#7621)

This commit is contained in:
Xiaoyu Zhang
2025-07-03 10:36:20 +08:00
committed by GitHub
parent 82f021e22e
commit 8e64140e35
5 changed files with 253 additions and 2 deletions

View File

@@ -157,6 +157,7 @@ class ServerArgs:
enable_ep_moe: bool = False
enable_deepep_moe: bool = False
enable_flashinfer_moe: bool = False
enable_flashinfer_allreduce_fusion: bool = False
deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
ep_num_redundant_experts: int = 0
ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
@@ -1206,6 +1207,11 @@ class ServerArgs:
action="store_true",
help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP with --enable-ep-moe",
)
parser.add_argument(
"--enable-flashinfer-allreduce-fusion",
action="store_true",
help="Enable FlashInfer allreduce fusion for Add_RMSNorm.",
)
parser.add_argument(
"--enable-deepep-moe",
action="store_true",