[Quantization] Supported w8a8 int8 quantized Gemma3 and Qwen-VL models (#8619)
Co-authored-by: ronnie_zheng <zl19940307@163.com>
This commit is contained in:
@@ -255,17 +255,23 @@ class W8A8Int8Config(QuantizationConfig):
|
||||
|
||||
if _is_npu:
|
||||
if isinstance(layer, LinearBase):
|
||||
key = "model"
|
||||
if "vision_model" in prefix:
|
||||
key = "vision_model"
|
||||
elif "visual" in prefix:
|
||||
key = "visual"
|
||||
packed_modules_mapping_subset = self.packed_modules_mapping.get(key, {})
|
||||
prefix_in_quant_config = prefix
|
||||
proj_name = prefix.split(".")[-1]
|
||||
if proj_name in self.packed_modules_mapping:
|
||||
if proj_name in packed_modules_mapping_subset:
|
||||
prefix_in_quant_config = prefix.replace(
|
||||
proj_name, self.packed_modules_mapping[proj_name][0]
|
||||
proj_name, packed_modules_mapping_subset[proj_name][0]
|
||||
)
|
||||
self.is_dynamic = (
|
||||
self.quant_description[prefix_in_quant_config + ".weight"]
|
||||
== "W8A8_DYNAMIC"
|
||||
)
|
||||
if self.is_layer_skipped(prefix, self.packed_modules_mapping):
|
||||
if self.is_layer_skipped(prefix, packed_modules_mapping_subset):
|
||||
return UnquantizedLinearMethod()
|
||||
return (
|
||||
NPU_W8A8DynamicLinearMethod(self)
|
||||
|
||||
@@ -162,12 +162,24 @@ def _initialize_model(
|
||||
model_class, _ = get_model_architecture(model_config)
|
||||
packed_modules_mapping = getattr(model_class, "packed_modules_mapping", {})
|
||||
if _is_npu:
|
||||
packed_modules_mapping["fused_qkv_a_proj_with_mqa"] = [
|
||||
"q_a_proj",
|
||||
"kv_a_proj_with_mqa",
|
||||
]
|
||||
packed_modules_mapping["qkv_proj"] = ["q_proj", "k_proj", "v_proj"]
|
||||
packed_modules_mapping["gate_up_proj"] = ["gate_proj", "up_proj"]
|
||||
packed_modules_mapping.update(
|
||||
{
|
||||
"visual": {"qkv_proj": ["qkv"]},
|
||||
"vision_model": {
|
||||
"qkv_proj": ["q_proj", "k_proj", "v_proj"],
|
||||
"proj": ["out_proj"],
|
||||
},
|
||||
"model": {
|
||||
"qkv_proj": ["q_proj", "k_proj", "v_proj"],
|
||||
"gate_up_proj": ["gate_proj", "up_proj"],
|
||||
"fused_qkv_a_proj_with_mqa": [
|
||||
"q_a_proj",
|
||||
"kv_a_proj_with_mqa",
|
||||
],
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
quant_config = _get_quantization_config(
|
||||
model_config, load_config, packed_modules_mapping
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user