Introduce moe_dense_tp_size to fix dense layer errors in DeepSeek V3 + 4x8xH100 (#4836)

This commit is contained in:
fzyzcjy
2025-04-18 12:38:26 +08:00
committed by GitHub
parent 1effba4c70
commit 53dcf38876
4 changed files with 31 additions and 1 deletions

View File

@@ -1066,12 +1066,18 @@ class DeepseekV2DecoderLayer(nn.Module):
prefix=add_prefix("mlp", prefix),
)
else:
if self._enable_moe_dense_fully_dp():
mlp_tp_rank, mlp_tp_size = 0, 1
else:
mlp_tp_rank, mlp_tp_size = None, None
self.mlp = DeepseekV2MLP(
hidden_size=config.hidden_size,
intermediate_size=config.intermediate_size,
hidden_act=config.hidden_act,
quant_config=quant_config,
prefix=add_prefix("mlp", prefix),
tp_rank=mlp_tp_rank,
tp_size=mlp_tp_size,
)
self.input_is_scattered = (
@@ -1084,6 +1090,10 @@ class DeepseekV2DecoderLayer(nn.Module):
config.hidden_size, eps=config.rms_norm_eps
)
@staticmethod
def _enable_moe_dense_fully_dp() -> bool:
    """Return True when dense (non-MoE) MLP layers run fully data-parallel.

    This is the case exactly when the ``moe_dense_tp_size`` server argument
    is set to 1 — presumably via ``--moe-dense-tp-size 1`` at launch
    (TODO confirm against server-args parsing).
    """
    dense_tp_size = global_server_args_dict["moe_dense_tp_size"]
    return dense_tp_size == 1
@staticmethod
def _compute_info(config: PretrainedConfig, layer_id: int, is_nextn: bool):
is_sparse = is_nextn or (
@@ -1094,6 +1104,7 @@ class DeepseekV2DecoderLayer(nn.Module):
ffn_input_mode = (
_FFNInputMode.SCATTERED
if (global_server_args_dict["enable_deepep_moe"] and is_sparse)
or (DeepseekV2DecoderLayer._enable_moe_dense_fully_dp() and not is_sparse)
else _FFNInputMode.FULL
)
return _DecoderLayerInfo(is_sparse=is_sparse, ffn_input_mode=ffn_input_mode)
@@ -1240,7 +1251,12 @@ class DeepseekV2DecoderLayer(nn.Module):
hidden_states, residual
)
hidden_states = self.mlp(hidden_states, forward_batch.forward_mode)
if not (
self._enable_moe_dense_fully_dp()
and (not self.info.is_sparse)
and hidden_states.shape[0] == 0
):
hidden_states = self.mlp(hidden_states, forward_batch.forward_mode)
if self.is_last_layer and self.attn_tp_size != 1:
hidden_states += residual