Implement gather before attn (#6378)

2025-06-15 21:08:56 -07:00
parent d2679f5109
commit 3c2274fbee
1 changed files with 23 additions and 5 deletions
--- a/python/sglang/srt/layers/communicator.py
+++ b/python/sglang/srt/layers/communicator.py
@@ -226,13 +226,13 @@ class LayerCommunicator:
@dataclass
 class CommunicateContext:
-    process_group_sizes: Dict["ScatterMode", int]
+    process_group_sizes: Dict[ScatterMode, int]
    attn_tp_rank: int
    attn_tp_size: int
    local_attn_dp_size: int
    tp_size: int
-    def is_same_group_size(self, a: "ScatterMode", b: "ScatterMode"):
+    def is_same_group_size(self, a: ScatterMode, b: ScatterMode):
        return self.process_group_sizes[a] == self.process_group_sizes[b]
    @classmethod
@@ -244,6 +244,7 @@ class CommunicateContext:
        process_group_sizes = {
            ScatterMode.SCATTERED: 1,
            ScatterMode.TP_ATTN_FULL: attn_tp_size,
            # TODO: support --moe-dense-tp-size > 1
            ScatterMode.FULL: tp_size,
        }
        return cls(
@@ -323,11 +324,16 @@ class CommunicateWithAllReduceAndLayerNormFn:
        if (
            (hidden_states_input_mode == ScatterMode.TP_ATTN_FULL)
-            and (residual_input_mode == ScatterMode.TP_ATTN_FULL)
+            and (
                residual_input_mode in [ScatterMode.SCATTERED, ScatterMode.TP_ATTN_FULL]
            )
            and (hidden_states_output_mode == ScatterMode.FULL)
            and (residual_output_mode == ScatterMode.TP_ATTN_FULL)
        ):
-            return CommunicateWithAllReduceAndLayerNormFn._gather_hidden_states
+            return partial(
                CommunicateWithAllReduceAndLayerNormFn._gather_hidden_states_and_residual,
                residual_input_mode=residual_input_mode,
            )
        if (
            (hidden_states_input_mode == ScatterMode.TP_ATTN_FULL)
@@ -360,13 +366,25 @@ class CommunicateWithAllReduceAndLayerNormFn:
        return hidden_states, residual
    @staticmethod
-    def _gather_hidden_states(
+    def _gather_hidden_states_and_residual(
        hidden_states: torch.Tensor,
        residual: torch.Tensor,
        forward_batch: ForwardBatch,
        layernorm: torch.nn.Module,
        context: CommunicateContext,
        *,
        residual_input_mode,
    ):
        if residual_input_mode == ScatterMode.SCATTERED and context.attn_tp_size > 1:
            residual, local_residual = (
                forward_batch.gathered_buffer[
                    : forward_batch.input_ids.shape[0]
                ].clone(),
                residual,
            )
            attn_tp_all_gather(
                list(residual.tensor_split(context.attn_tp_size)), local_residual
            )
        if context.local_attn_dp_size != 1:
            if context.attn_tp_rank == 0:
                hidden_states += residual