diff --git a/python/sglang/srt/models/deepseek_nextn.py b/python/sglang/srt/models/deepseek_nextn.py
index 768a73d8a..4849177af 100644
--- a/python/sglang/srt/models/deepseek_nextn.py
+++ b/python/sglang/srt/models/deepseek_nextn.py
@@ -215,11 +215,11 @@ class DeepseekV3ForCausalLMNextN(DeepseekV3ForCausalLM):
                 "up_proj.weight_scale_inv",
             ]
             names_to_remove = []
-            for num_repeat in range(self.n_share_experts_fusion):
-                for suffix in suffix_list:
-                    shared_expert_weight_name = (
-                        f"model.layers.0.mlp.shared_experts.{suffix}"
-                    )
+            for suffix in suffix_list:
+                shared_expert_weight_name = (
+                    f"model.layers.0.mlp.shared_experts.{suffix}"
+                )
+                for num_repeat in range(self.n_share_experts_fusion):
                     weights_list.append(
                         (
                             f"model.layers.0."
@@ -229,7 +229,7 @@ class DeepseekV3ForCausalLMNextN(DeepseekV3ForCausalLM):
                             weights_dict[shared_expert_weight_name],
                         )
                     )
-                    names_to_remove += [shared_expert_weight_name]
+                names_to_remove += [shared_expert_weight_name]
             weights = [w for w in weights_list if w[0] not in names_to_remove]
 
         # Params for weights, fp8 weight scales, fp8 activation scales
diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py
index a552c11f7..2c709439b 100644
--- a/python/sglang/srt/models/deepseek_v2.py
+++ b/python/sglang/srt/models/deepseek_v2.py
@@ -1650,11 +1650,11 @@ class DeepseekV2ForCausalLM(nn.Module):
                 desc=f"Cloning {self.n_share_experts_fusion} "
                 "replicas of the shared expert into MoE",
             ):
-                for num_repeat in range(self.n_share_experts_fusion):
-                    for suffix in suffix_list:
-                        shared_expert_weight_name = (
-                            f"model.layers.{moe_layer}.mlp.shared_experts.{suffix}"
-                        )
+                for suffix in suffix_list:
+                    shared_expert_weight_name = (
+                        f"model.layers.{moe_layer}.mlp.shared_experts.{suffix}"
+                    )
+                    for num_repeat in range(self.n_share_experts_fusion):
                         weights_list.append(
                             (
                                 f"model.layers.{moe_layer}."
@@ -1664,7 +1664,7 @@ class DeepseekV2ForCausalLM(nn.Module):
                                 weights_dict[shared_expert_weight_name],
                             )
                         )
-                        names_to_remove += [shared_expert_weight_name]
+                    names_to_remove += [shared_expert_weight_name]
             weights = [w for w in weights_list if w[0] not in names_to_remove]
 
         # Params for weights, fp8 weight scales, fp8 activation scales
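For reference, a minimal standalone sketch of the reordered cloning loop: the suffix drives the outer loop, so the shared-expert weight name is built and scheduled for removal once per suffix, while the inner loop only appends the fused replicas. The counts and the plain-dict "checkpoint" below are illustrative placeholders, not values from the patch; only the loop structure mirrors it.

```python
n_share_experts_fusion = 2  # placeholder replica count (assumption, not from the patch)
n_routed_experts = 4        # placeholder routed-expert count (assumption)
moe_layer = 0

suffix_list = ["down_proj.weight", "gate_proj.weight", "up_proj.weight"]

# Stand-in checkpoint: name -> "tensor" (strings here for brevity).
weights_dict = {
    f"model.layers.{moe_layer}.mlp.shared_experts.{suffix}": f"<tensor:{suffix}>"
    for suffix in suffix_list
}
weights_list = list(weights_dict.items())

names_to_remove = []
for suffix in suffix_list:
    # Build the shared-expert name once per suffix ...
    shared_expert_weight_name = (
        f"model.layers.{moe_layer}.mlp.shared_experts.{suffix}"
    )
    for num_repeat in range(n_share_experts_fusion):
        # ... but append one fused-expert copy per replica.
        weights_list.append(
            (
                f"model.layers.{moe_layer}.mlp.experts."
                f"{n_routed_experts + num_repeat}.{suffix}",
                weights_dict[shared_expert_weight_name],
            )
        )
    # Record the original shared-expert entry for removal exactly once.
    names_to_remove.append(shared_expert_weight_name)

weights = [w for w in weights_list if w[0] not in names_to_remove]
print(len(weights), "fused entries;", len(names_to_remove), "shared names removed")
```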