Add fp4 quantize before all-gather for Flashinfer cutlass MoE DP (max throughput) (#7667)
This commit is contained in:
@@ -35,7 +35,10 @@ from sglang.srt.layers.dp_attention import (
|
||||
get_global_dp_buffer,
|
||||
get_local_dp_buffer,
|
||||
)
|
||||
-from sglang.srt.layers.moe import get_moe_a2a_backend
+from sglang.srt.layers.moe import (
+get_moe_a2a_backend,
+should_use_flashinfer_cutlass_moe_fp4_allgather,
+)
|
||||
from sglang.srt.layers.utils import is_sm100_supported
|
||||
from sglang.srt.managers.schedule_batch import global_server_args_dict
|
||||
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
|
||||
@@ -112,7 +115,11 @@ class LayerScatterModes:
|
||||
if context.is_layer_sparse:
|
||||
return (
|
||||
ScatterMode.SCATTERED
|
||||
-if not get_moe_a2a_backend().is_none()
+if (
+# Token dispatch/combine will be handled outside of LayerCommunicator for these modes.
+not get_moe_a2a_backend().is_none()
+or should_use_flashinfer_cutlass_moe_fp4_allgather()
+)
|
||||
else ScatterMode.FULL
|
||||
)
|
||||
else:
|
||||
|
||||
Reference in New Issue
Block a user