[bugfix] Fixed the bug in retrieving the quantization method for mlp.… (#4797)
When retrieving the quantization method for MoE layers (e.g., when the quantization file of DeepSeek-V3.2-Exp does not match the model's naming convention in eager mode), a KeyError is raised: "model.layers.3.mlp.experts.weight not in self.quant_description". However, the quantization file only contains per-expert entries such as:

```json
"model.layers.3.mlp.experts.255.gate_proj.weight": "W8A8_DYNAMIC",
"model.layers.3.mlp.experts.255.gate_proj.weight_scale": "W8A8_DYNAMIC",
"model.layers.3.mlp.experts.255.gate_proj.weight_offset": "W8A8_DYNAMIC",
"model.layers.3.mlp.experts.255.down_proj.weight": "W8A8_DYNAMIC",
"model.layers.3.mlp.experts.255.down_proj.weight_scale": "W8A8_DYNAMIC",
"model.layers.3.mlp.experts.255.down_proj.weight_offset": "W8A8_DYNAMIC",
"model.layers.3.mlp.experts.255.up_proj.weight": "W8A8_DYNAMIC",
"model.layers.3.mlp.experts.255.up_proj.weight_scale": "W8A8_DYNAMIC",
"model.layers.3.mlp.experts.255.up_proj.weight_offset": "W8A8_DYNAMIC",
```

Co-authored-by: yangqinghao-cmss <yangqinghao_yewu@cmss.chinamobile.com>
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
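For context, here is a minimal sketch of the failure mode, assuming `quant_description` is the flat name-to-quant-type dict loaded from a file like the one above (abbreviated and hypothetical here): the fused experts module is addressed by a single prefix, but only per-expert keys exist, so the direct `.weight` lookup fails.

```python
# Minimal repro sketch (abbreviated, hypothetical excerpt of the file above).
quant_description = {
    "model.layers.3.mlp.experts.255.gate_proj.weight": "W8A8_DYNAMIC",
    "model.layers.3.mlp.experts.255.down_proj.weight": "W8A8_DYNAMIC",
    "model.layers.3.mlp.experts.255.up_proj.weight": "W8A8_DYNAMIC",
}

prefix = "model.layers.3.mlp.experts"  # how the fused MoE layer is addressed
try:
    quant_description[prefix + ".weight"]  # the pre-fix lookup
except KeyError:
    # There is no "model.layers.3.mlp.experts.weight" key, only
    # per-expert keys such as "...experts.255.gate_proj.weight".
    print(f"{prefix}.weight not in quant_description")
```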
```diff
@@ -157,6 +157,15 @@ class AscendQuantConfig(QuantizationConfig):
                     f"Detected some but not all shards of {prefix} "
                     "are quantized. All shards of fused layers "
                     "to have the same precision.")
+        elif "experts" in prefix:
+            # For the experts' prefix (e.g., "model.layers.3.mlp.experts"),
+            # assume all experts within the same MLP use the same quantization method.
+            experts_quant_description = [
+                self.quant_description[layer]
+                for layer in self.quant_description if prefix in layer
+            ]
+            is_skipped = any(quantization == "FLOAT"
+                             for quantization in experts_quant_description)
         else:
             is_skipped = self.quant_description[prefix + '.weight'] == "FLOAT"
```
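Extracted as a standalone, runnable sketch of the branch added above (the wrapper name `is_layer_skipped_for_experts` and the flat dict argument are illustrative assumptions, not the repo's API):

```python
def is_layer_skipped_for_experts(quant_description: dict, prefix: str) -> bool:
    """Hypothetical helper mirroring the branch added in the hunk above."""
    if "experts" in prefix:
        # Gather the quant type of every parameter under the experts prefix;
        # the layer is treated as skipped if any expert shard stayed FLOAT.
        experts_quant_description = [
            quant_description[layer]
            for layer in quant_description if prefix in layer
        ]
        return any(q == "FLOAT" for q in experts_quant_description)
    return quant_description[prefix + ".weight"] == "FLOAT"

# All experts quantized -> layer is not skipped.
print(is_layer_skipped_for_experts(
    {"model.layers.3.mlp.experts.0.up_proj.weight": "W8A8_DYNAMIC"},
    "model.layers.3.mlp.experts"))  # False
```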
```diff
@@ -52,6 +52,17 @@ def get_linear_quant_type(quant_description: Dict[str, Any], prefix: str,
                 f"Not all shards of {prefix} are quantized with same quant type."
                 f"Shard {proj_name} uses {shard_quant_type}, but another shard"
                 f"use {quant_type}. Please check quantization config.")
+    elif "experts" in prefix:
+        # For the experts' prefix (e.g., "model.layers.3.mlp.experts"),
+        # assume all experts within the same MLP use the same quantization method.
+        experts_quant_description = set(quant_description[layer]
+                                        for layer in quant_description
+                                        if prefix in layer)
+        if not len(experts_quant_description) == 1:
+            raise RuntimeError(
+                f"{prefix} has different quantization type: {experts_quant_description}."
+            )
+        quant_type = experts_quant_description.pop()
     else:
         quant_type = quant_description[prefix + '.weight']
     return quant_type
```
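Likewise, a standalone sketch of the experts branch added to `get_linear_quant_type` (the helper name `experts_quant_type` is hypothetical); unlike the first hunk, this path additionally enforces that all experts under one prefix share a single quant type:

```python
def experts_quant_type(quant_description: dict, prefix: str) -> str:
    """Hypothetical helper mirroring the experts branch in the hunk above."""
    types = set(quant_description[layer]
                for layer in quant_description if prefix in layer)
    if len(types) != 1:
        # Mixed precision across experts of one MLP is rejected outright.
        raise RuntimeError(f"{prefix} has different quantization types: {types}.")
    return types.pop()

print(experts_quant_type(
    {
        "model.layers.3.mlp.experts.0.up_proj.weight": "W8A8_DYNAMIC",
        "model.layers.3.mlp.experts.0.down_proj.weight": "W8A8_DYNAMIC",
    },
    "model.layers.3.mlp.experts"))  # W8A8_DYNAMIC
```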