[bugfix] Fixed the bug in retrieving the quantization method for mlp.… (#4797)

When retrieving the quantization method for MoE layers (e.g., when the
quantization file of DeepSeek v3.2 exp does not match the model's naming
convention in eager mode), a KeyError is raised:
"model.layers.3.mlp.experts.weight not in self.quant_description".
However, the quantization file contains per-expert entries such as:
```json
  "model.layers.3.mlp.experts.255.gate_proj.weight": "W8A8_DYNAMIC",
  "model.layers.3.mlp.experts.255.gate_proj.weight_scale": "W8A8_DYNAMIC",
  "model.layers.3.mlp.experts.255.gate_proj.weight_offset": "W8A8_DYNAMIC",
  "model.layers.3.mlp.experts.255.down_proj.weight": "W8A8_DYNAMIC",
  "model.layers.3.mlp.experts.255.down_proj.weight_scale": "W8A8_DYNAMIC",
  "model.layers.3.mlp.experts.255.down_proj.weight_offset": "W8A8_DYNAMIC",
  "model.layers.3.mlp.experts.255.up_proj.weight": "W8A8_DYNAMIC",
  "model.layers.3.mlp.experts.255.up_proj.weight_scale": "W8A8_DYNAMIC",
  "model.layers.3.mlp.experts.255.up_proj.weight_offset": "W8A8_DYNAMIC",
```

Co-Authored-By: yangqinghao-cmss <yangqinghao_yewu@cmss.chinamobile.com>

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Co-authored-by: yangqinghao-cmss <yangqinghao_yewu@cmss.chinamobile.com>
This commit is contained in:
zhangxinyuehfad
2025-12-09 08:47:19 +08:00
committed by GitHub
parent 4e728f1f40
commit 0d094531b4
2 changed files with 20 additions and 0 deletions

View File

@@ -157,6 +157,15 @@ class AscendQuantConfig(QuantizationConfig):
f"Detected some but not all shards of {prefix} "
"are quantized. All shards of fused layers "
"to have the same precision.")
elif "experts" in prefix:
# For the experts' prefix (e.g., "model.layers.3.mlp.experts")
# Assume all experts within the same MLP use the same quantization method
experts_quant_description = [
self.quant_description[layer]
for layer in self.quant_description if prefix in layer
]
is_skipped = any(quantization == "FLOAT"
for quantization in experts_quant_description)
else:
is_skipped = self.quant_description[prefix + '.weight'] == "FLOAT"

View File

@@ -52,6 +52,17 @@ def get_linear_quant_type(quant_description: Dict[str, Any], prefix: str,
f"Not all shards of {prefix} are quantized with same quant type."
f"Shard {proj_name} uses {shard_quant_type}, but another shard"
f"use {quant_type}. Please check quantization config.")
elif "experts" in prefix:
# For the experts' prefix (e.g., "model.layers.3.mlp.experts")
# Assume all experts within the same MLP use the same quantization method
experts_quant_description = set(quant_description[layer]
for layer in quant_description
if prefix in layer)
if not len(experts_quant_description) == 1:
raise RuntimeError(
f"{prefix} has different quantization type: {experts_quant_description}."
)
quant_type = experts_quant_description.pop()
else:
quant_type = quant_description[prefix + '.weight']
return quant_type