Add fp4 quantize before all-gather for Flashinfer cutlass MoE DP (max throughput) (#7667)

2025-08-15 22:08:11 -07:00
parent 87dab54824
commit eff4eb3fdd
16 changed files with 360 additions and 52 deletions
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -230,6 +230,7 @@ class ServerArgs:
    enable_cudagraph_gc: bool = False
    enable_nccl_nvls: bool = False
    enable_symm_mem: bool = False
+    disable_flashinfer_cutlass_moe_fp4_allgather: bool = False
    enable_tokenizer_batch_encode: bool = False
    disable_outlines_disk_cache: bool = False
    disable_custom_all_reduce: bool = False
@@ -1714,6 +1715,11 @@ class ServerArgs:
            action="store_true",
            help="Enable NCCL symmetric memory for fast collectives.",
        )
+        parser.add_argument(
+            "--disable-flashinfer-cutlass-moe-fp4-allgather",
+            action="store_true",
+            help="Disables quantize before all-gather for flashinfer cutlass moe.",
+        )
        parser.add_argument(
            "--enable-tokenizer-batch-encode",
            action="store_true",