From 1902c81fdd373943f17f5983eb8750758c7f4a69 Mon Sep 17 00:00:00 2001
From: Lu Xinlong <luxinlong02@4paradigm.com>
Date: Tue, 30 Jun 2026 09:55:13 +0800
Subject: [PATCH] Fix loading of per-expert MoE weights from fine-tuned checkpoints

---
 qwen3_6_scripts/qwen3_5.py | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/qwen3_6_scripts/qwen3_5.py b/qwen3_6_scripts/qwen3_5.py
index daaaa89..ca42760 100644
--- a/qwen3_6_scripts/qwen3_5.py
+++ b/qwen3_6_scripts/qwen3_5.py
@@ -1322,6 +1322,35 @@ class Qwen3_5MoeForCausalLM(Qwen3_5ForCausalLM):
                 weight_loader(param, loaded_weight)
                 continue
 
+            # --- Individual expert weights (FT checkpoint: experts.{i}.{proj}.weight) ---
+            # Standard transformers fine-tuning saves each expert separately instead of
+            # the pre-merged (num_experts, ...) tensors in the original checkpoint.
+            if ".mlp.experts." in name:
+                parts = name.split(".mlp.experts.", 1)
+                expert_rest = parts[1]          # e.g. "0.gate_proj.weight"
+                dot_pos = expert_rest.find(".")
+                if dot_pos > 0 and expert_rest[:dot_pos].isdigit():
+                    eid = int(expert_rest[:dot_pos])
+                    proj_raw = expert_rest[dot_pos + 1:]
+                    proj = proj_raw[:-7] if proj_raw.endswith(".weight") else proj_raw
+                    prefix = parts[0]           # e.g. "model.layers.0"
+                    if proj == "gate_proj":
+                        w13_name = f"{prefix}.mlp.experts.w13_weight"
+                        if w13_name in params_dict:
+                            param = params_dict[w13_name]
+                            param.weight_loader(param, loaded_weight, "w1_weight", "w1", eid)
+                    elif proj == "up_proj":
+                        w13_name = f"{prefix}.mlp.experts.w13_weight"
+                        if w13_name in params_dict:
+                            param = params_dict[w13_name]
+                            param.weight_loader(param, loaded_weight, "w3_weight", "w3", eid)
+                    elif proj == "down_proj":
+                        w2_name = f"{prefix}.mlp.experts.w2_weight"
+                        if w2_name in params_dict:
+                            param = params_dict[w2_name]
+                            param.weight_loader(param, loaded_weight, "w2_weight", "w2", eid)
+                    continue
+
             # --- Stacked / standard weights ---
             for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in name: