Support updating expert locations dynamically (#6388)
This commit is contained in:
@@ -317,6 +317,13 @@ class DeepseekV2MoE(nn.Module):
|
||||
def _enable_deepep_moe(self):
    """Report whether DeepEP MoE dispatch is turned on in the server args.

    Thin accessor over the process-wide ``global_server_args_dict`` so
    callers (e.g. the gating op) don't reach into the dict directly.
    """
    deepep_enabled = global_server_args_dict["enable_deepep_moe"]
    return deepep_enabled
def get_moe_weights(self):
    """Collect the raw data tensors of this layer's routed-expert parameters.

    Iterates ``self.experts.named_parameters()`` and returns each
    parameter's ``.data`` tensor, skipping ``correction_bias`` — it is
    routing metadata, not an expert weight.
    """
    expert_weights = []
    for param_name, param in self.experts.named_parameters():
        # Only the genuine expert weights are exposed for relocation.
        if param_name in ("correction_bias",):
            continue
        expert_weights.append(param.data)
    return expert_weights
def op_gate(self, state):
|
||||
if (not self._enable_deepep_moe) or is_non_idle_and_non_empty(
|
||||
state.forward_batch.forward_mode, state.hidden_states_mlp_input
|
||||
@@ -1599,6 +1606,14 @@ class DeepseekV2ForCausalLM(nn.Module):
|
||||
self_attn.w_vc = w_vc.contiguous()
|
||||
self_attn.use_deep_gemm_bmm = True
|
||||
|
||||
# TODO support nextn later
|
||||
if not is_nextn:
|
||||
self.routed_experts_weights_of_layer = {
|
||||
layer_id: layer.mlp.get_moe_weights()
|
||||
for layer_id, layer in enumerate(self.model.layers)
|
||||
if isinstance(layer.mlp, DeepseekV2MoE)
|
||||
}
|
||||
|
||||
def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], is_nextn=False):
|
||||
if is_nextn:
|
||||
if hasattr(self.config, "num_nextn_predict_layers"):
|
||||
|
||||
Reference in New Issue
Block a user