From 1902c81fdd373943f17f5983eb8750758c7f4a69 Mon Sep 17 00:00:00 2001 From: Lu Xinlong Date: Tue, 30 Jun 2026 09:55:13 +0800 Subject: [PATCH] fix issue of loading weight --- qwen3_6_scripts/qwen3_5.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/qwen3_6_scripts/qwen3_5.py b/qwen3_6_scripts/qwen3_5.py index daaaa89..ca42760 100644 --- a/qwen3_6_scripts/qwen3_5.py +++ b/qwen3_6_scripts/qwen3_5.py @@ -1322,6 +1322,35 @@ class Qwen3_5MoeForCausalLM(Qwen3_5ForCausalLM): weight_loader(param, loaded_weight) continue + # --- Individual expert weights (FT checkpoint: experts.{i}.{proj}.weight) --- + # Standard transformers fine-tuning saves each expert separately instead of + # the pre-merged (num_experts, ...) tensors in the original checkpoint. + if ".mlp.experts." in name: + parts = name.split(".mlp.experts.", 1) + expert_rest = parts[1] # e.g. "0.gate_proj.weight" + dot_pos = expert_rest.find(".") + if dot_pos > 0 and expert_rest[:dot_pos].isdigit(): + eid = int(expert_rest[:dot_pos]) + proj_raw = expert_rest[dot_pos + 1:] + proj = proj_raw[:-7] if proj_raw.endswith(".weight") else proj_raw + prefix = parts[0] # e.g. "model.layers.0" + if proj == "gate_proj": + w13_name = f"{prefix}.mlp.experts.w13_weight" + if w13_name in params_dict: + param = params_dict[w13_name] + param.weight_loader(param, loaded_weight, "w1_weight", "w1", eid) + elif proj == "up_proj": + w13_name = f"{prefix}.mlp.experts.w13_weight" + if w13_name in params_dict: + param = params_dict[w13_name] + param.weight_loader(param, loaded_weight, "w3_weight", "w3", eid) + elif proj == "down_proj": + w2_name = f"{prefix}.mlp.experts.w2_weight" + if w2_name in params_dict: + param = params_dict[w2_name] + param.weight_loader(param, loaded_weight, "w2_weight", "w2", eid) + continue + # --- Stacked / standard weights --- for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: