Introduce moe_dense_tp_size to fix dense layer errors in DeepSeek V3 + 4x8xH100 (#4836)

This commit is contained in:
fzyzcjy
2025-04-18 12:38:26 +08:00
committed by GitHub
parent 1effba4c70
commit 53dcf38876
4 changed files with 31 additions and 1 deletions

View File

@@ -1066,12 +1066,18 @@ class DeepseekV2DecoderLayer(nn.Module):
prefix=add_prefix("mlp", prefix),
)
else:
if self._enable_moe_dense_fully_dp():
mlp_tp_rank, mlp_tp_size = 0, 1
else:
mlp_tp_rank, mlp_tp_size = None, None
self.mlp = DeepseekV2MLP(
hidden_size=config.hidden_size,
intermediate_size=config.intermediate_size,
hidden_act=config.hidden_act,
quant_config=quant_config,
prefix=add_prefix("mlp", prefix),
tp_rank=mlp_tp_rank,
tp_size=mlp_tp_size,
)
self.input_is_scattered = (
@@ -1084,6 +1090,10 @@ class DeepseekV2DecoderLayer(nn.Module):
config.hidden_size, eps=config.rms_norm_eps
)
@staticmethod
def _enable_moe_dense_fully_dp() -> bool:
    """Return True when dense (non-MoE) MLP layers run fully data-parallel.

    This is the case exactly when the ``moe_dense_tp_size`` server argument
    is set to 1 — presumably via ``--moe-dense-tp-size 1`` at launch
    (TODO confirm against server-args parsing).
    """
    dense_tp_size = global_server_args_dict["moe_dense_tp_size"]
    return dense_tp_size == 1
@staticmethod
def _compute_info(config: PretrainedConfig, layer_id: int, is_nextn: bool):
is_sparse = is_nextn or (
@@ -1094,6 +1104,7 @@ class DeepseekV2DecoderLayer(nn.Module):
ffn_input_mode = (
_FFNInputMode.SCATTERED
if (global_server_args_dict["enable_deepep_moe"] and is_sparse)
or (DeepseekV2DecoderLayer._enable_moe_dense_fully_dp() and not is_sparse)
else _FFNInputMode.FULL
)
return _DecoderLayerInfo(is_sparse=is_sparse, ffn_input_mode=ffn_input_mode)
@@ -1240,7 +1251,12 @@ class DeepseekV2DecoderLayer(nn.Module):
hidden_states, residual
)
hidden_states = self.mlp(hidden_states, forward_batch.forward_mode)
if not (
self._enable_moe_dense_fully_dp()
and (not self.info.is_sparse)
and hidden_states.shape[0] == 0
):
hidden_states = self.mlp(hidden_states, forward_batch.forward_mode)
if self.is_last_layer and self.attn_tp_size != 1:
hidden_states += residual