[Quantization] Supported w8a8 int8 quantized Gemma3 and Qwen-VL models (#8619)

Co-authored-by: ronnie_zheng <zl19940307@163.com>
Author: ichernob
Date: 2025-08-12 23:31:18 +03:00 (committed by GitHub)
Parent: 48afa8f14f
Commit: 83123f481e
3 changed files with 131 additions and 9 deletions


@@ -255,17 +255,23 @@ class W8A8Int8Config(QuantizationConfig):
         if _is_npu:
             if isinstance(layer, LinearBase):
+                key = "model"
+                if "vision_model" in prefix:
+                    key = "vision_model"
+                elif "visual" in prefix:
+                    key = "visual"
+                packed_modules_mapping_subset = self.packed_modules_mapping.get(key, {})
                 prefix_in_quant_config = prefix
                 proj_name = prefix.split(".")[-1]
-                if proj_name in self.packed_modules_mapping:
+                if proj_name in packed_modules_mapping_subset:
                     prefix_in_quant_config = prefix.replace(
-                        proj_name, self.packed_modules_mapping[proj_name][0]
+                        proj_name, packed_modules_mapping_subset[proj_name][0]
                     )
                 self.is_dynamic = (
                     self.quant_description[prefix_in_quant_config + ".weight"]
                     == "W8A8_DYNAMIC"
                 )
-                if self.is_layer_skipped(prefix, self.packed_modules_mapping):
+                if self.is_layer_skipped(prefix, packed_modules_mapping_subset):
                     return UnquantizedLinearMethod()
                 return (
                     NPU_W8A8DynamicLinearMethod(self)
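
For context: after this change, get_quant_method first picks the sub-mapping
for the tower a layer belongs to ("model", "vision_model", or "visual") and
only then maps a fused module name onto the key used in the checkpoint's
quant description. A minimal standalone sketch of that resolution, with an
illustrative mapping and a hypothetical helper name (not the repository code):

NESTED_MAPPING = {
    "visual": {"qkv_proj": ["qkv"]},
    "vision_model": {"qkv_proj": ["q_proj", "k_proj", "v_proj"]},
    "model": {"qkv_proj": ["q_proj", "k_proj", "v_proj"]},
}

def resolve_quant_prefix(prefix: str, mapping: dict) -> str:
    # Pick the sub-mapping for the tower the layer lives in; "model" is the
    # default for the language backbone.
    key = "model"
    if "vision_model" in prefix:
        key = "vision_model"
    elif "visual" in prefix:
        key = "visual"
    subset = mapping.get(key, {})
    # A fused module is looked up in the quant description under the name of
    # its first constituent projection.
    proj_name = prefix.split(".")[-1]
    if proj_name in subset:
        return prefix.replace(proj_name, subset[proj_name][0])
    return prefix

# Gemma3-style vision tower: separate q/k/v projections in the checkpoint.
assert (
    resolve_quant_prefix("vision_model.encoder.layers.0.self_attn.qkv_proj", NESTED_MAPPING)
    == "vision_model.encoder.layers.0.self_attn.q_proj"
)
# Qwen-VL-style visual tower: QKV stored as a single fused "qkv" module.
assert (
    resolve_quant_prefix("visual.blocks.0.attn.qkv_proj", NESTED_MAPPING)
    == "visual.blocks.0.attn.qkv"
)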


@@ -162,12 +162,24 @@ def _initialize_model(
     model_class, _ = get_model_architecture(model_config)
     packed_modules_mapping = getattr(model_class, "packed_modules_mapping", {})
     if _is_npu:
-        packed_modules_mapping["fused_qkv_a_proj_with_mqa"] = [
-            "q_a_proj",
-            "kv_a_proj_with_mqa",
-        ]
-        packed_modules_mapping["qkv_proj"] = ["q_proj", "k_proj", "v_proj"]
-        packed_modules_mapping["gate_up_proj"] = ["gate_proj", "up_proj"]
+        packed_modules_mapping.update(
+            {
+                "visual": {"qkv_proj": ["qkv"]},
+                "vision_model": {
+                    "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+                    "proj": ["out_proj"],
+                },
+                "model": {
+                    "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+                    "gate_up_proj": ["gate_proj", "up_proj"],
+                    "fused_qkv_a_proj_with_mqa": [
+                        "q_a_proj",
+                        "kv_a_proj_with_mqa",
+                    ],
+                },
+            }
+        )
     quant_config = _get_quantization_config(
         model_config, load_config, packed_modules_mapping
     )
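
The motivation for nesting: a flat mapping gives each fused-module name exactly
one expansion, but Qwen-VL's visual tower stores attention QKV as one fused
"qkv" checkpoint module while the language model (and Gemma3's vision_model)
keeps separate q/k/v projections. A small illustration of the conflict and how
the nested form avoids it (assumed data, not repository code):

# A flat mapping can hold only one expansion per fused-module name:
flat = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
flat["qkv_proj"] = ["qkv"]  # the visual tower's entry clobbers the LM entry

# Keying by tower keeps both expansions side by side:
nested = {
    "visual": {"qkv_proj": ["qkv"]},
    "model": {"qkv_proj": ["q_proj", "k_proj", "v_proj"]},
}
assert nested["visual"]["qkv_proj"] == ["qkv"]
assert nested["model"]["qkv_proj"] == ["q_proj", "k_proj", "v_proj"]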