Do layernorm before allgather for DP attention (#8631)

2025-08-03 00:53:08 -07:00
parent f7b2853ff8
commit 32f2815451
1 changed file with 13 additions and 3 deletions
--- a/python/sglang/srt/layers/communicator.py
+++ b/python/sglang/srt/layers/communicator.py
@@ -404,11 +404,21 @@ class CommunicateWithAllReduceAndLayerNormFn:
        if context.attn_dp_size != 1:
            if context.attn_tp_rank == 0:
                hidden_states += residual
+
+            # Perform layernorm on smaller data before comm. Only valid when attn_tp_size is 1 (tp_size == dp_size)
+            use_layer_norm_before_gather = context.attn_tp_size == 1
+            if use_layer_norm_before_gather:
+                residual.copy_(hidden_states)
+                if hidden_states.shape[0] != 0:
+                    hidden_states = layernorm(hidden_states)
+
            hidden_states, local_hidden_states = (
                forward_batch.gathered_buffer,
                hidden_states,
            )
            dp_gather_partial(hidden_states, local_hidden_states, forward_batch)
+
+            if not use_layer_norm_before_gather:
                dp_scatter(residual, hidden_states, forward_batch)
                if hidden_states.shape[0] != 0:
                    hidden_states = layernorm(hidden_states)