[b200] support trt-llm allreduce fused rms_norm_add kernel (#7621)

This commit is contained in:
Xiaoyu Zhang
2025-07-03 10:36:20 +08:00
committed by GitHub
parent 82f021e22e
commit 8e64140e35
5 changed files with 253 additions and 2 deletions

View File

@@ -157,6 +157,7 @@ class ServerArgs:
enable_ep_moe: bool = False
enable_deepep_moe: bool = False
enable_flashinfer_moe: bool = False
enable_flashinfer_allreduce_fusion: bool = False
deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
ep_num_redundant_experts: int = 0
ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
@@ -1206,6 +1207,11 @@ class ServerArgs:
action="store_true",
help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP with --enable-ep-moe",
)
parser.add_argument(
"--enable-flashinfer-allreduce-fusion",
action="store_true",
help="Enable FlashInfer allreduce fusion for Add_RMSNorm.",
)
parser.add_argument(
"--enable-deepep-moe",
action="store_true",