Add fp4 quantize before all-gather for Flashinfer cutlass MoE DP (max throughput) (#7667)

2025-08-15 22:08:11 -07:00
parent 87dab54824
commit eff4eb3fdd
16 changed files with 360 additions and 52 deletions
--- a/python/sglang/srt/layers/communicator.py
+++ b/python/sglang/srt/layers/communicator.py
@@ -35,7 +35,10 @@ from sglang.srt.layers.dp_attention import (
    get_global_dp_buffer,
    get_local_dp_buffer,
 )
-from sglang.srt.layers.moe import get_moe_a2a_backend
+from sglang.srt.layers.moe import (
+    get_moe_a2a_backend,
+    should_use_flashinfer_cutlass_moe_fp4_allgather,
+)
 from sglang.srt.layers.utils import is_sm100_supported
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
@@ -112,7 +115,11 @@ class LayerScatterModes:
        if context.is_layer_sparse:
            return (
                ScatterMode.SCATTERED
-                if not get_moe_a2a_backend().is_none()
+                if (
+                    # Token dispatch/combine is handled outside of LayerCommunicator when an a2a backend is set or the FP4 all-gather path is used.
+                    not get_moe_a2a_backend().is_none()
+                    or should_use_flashinfer_cutlass_moe_fp4_allgather()
+                )
                else ScatterMode.FULL
            )
        else: