[Model] Add Qwen3-Omni quantization Ascend NPU adaptation and optimization (#6828)
### What this PR does / why we need it?
This pull request adapts Qwen3-Omni quantization to Ascend NPU. It delivers operator-level optimization and AUT (Auto-Quantization Tuning) component optimization through patch-based modifications.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.15.0
- vLLM main: 83b47f67b1
---------
Signed-off-by: tanhaoan333 <tanhaoan@huawei.com>
```diff
@@ -79,6 +79,11 @@ class AscendW8A8DynamicLinearMethod(AscendLinearScheme):
         tp_rank: int | None = 0,
     ) -> torch.Tensor:
         quantized_x, pertoken_scale = torch_npu.npu_dynamic_quant(x)
+        need_unsqz = False
+        if pertoken_scale.dim() == 2:
+            need_unsqz = True
+            quantized_x = quantized_x.squeeze(dim=1)
+            pertoken_scale = pertoken_scale.squeeze(dim=1)
         output = torch_npu.npu_quant_matmul(
             quantized_x,
             layer.weight,
@@ -87,6 +92,8 @@ class AscendW8A8DynamicLinearMethod(AscendLinearScheme):
             bias=bias,
             output_dtype=x.dtype,
         )
+        if need_unsqz:
+            output = output.unsqueeze(dim=1)
         return output

     def process_weights_after_loading(self, layer):
```
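The two hunks above reshape the activation path in `AscendW8A8DynamicLinearMethod`: when `npu_dynamic_quant` returns a 2-D per-token scale (e.g. for a `[tokens, 1, hidden]` activation), the singleton dimension is squeezed away before `npu_quant_matmul` and restored on the output. Below is a minimal pure-PyTorch sketch of that shape handling; `fake_dynamic_quant` and `fake_quant_matmul` are stand-ins for the `torch_npu` kernels, and the assumed kernel semantics (2-D activation, 1-D scale) are illustrative, since `torch_npu` only runs on Ascend hardware:

```python
import torch

# Stand-ins for torch_npu.npu_dynamic_quant / npu_quant_matmul, which exist
# only on Ascend hardware; these mimic just the shapes the patch cares about.
def fake_dynamic_quant(x: torch.Tensor):
    scale = x.abs().amax(dim=-1) / 127.0              # per-token scale
    quantized = torch.round(x / scale.unsqueeze(-1)).to(torch.int8)
    return quantized, scale

def fake_quant_matmul(quantized_x, weight, pertoken_scale, output_dtype):
    # Assumption for this sketch: the kernel wants a 2-D activation, 1-D scale.
    assert quantized_x.dim() == 2 and pertoken_scale.dim() == 1
    out = quantized_x.to(torch.float32) @ weight.to(torch.float32)
    return (out * pertoken_scale.unsqueeze(-1)).to(output_dtype)

def apply_w8a8_dynamic(x: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
    quantized_x, pertoken_scale = fake_dynamic_quant(x)
    # The patch: a [tokens, 1, hidden] input yields a 2-D per-token scale, so
    # squeeze the singleton dim before the matmul and restore it afterwards.
    need_unsqz = False
    if pertoken_scale.dim() == 2:
        need_unsqz = True
        quantized_x = quantized_x.squeeze(dim=1)
        pertoken_scale = pertoken_scale.squeeze(dim=1)
    output = fake_quant_matmul(quantized_x, weight, pertoken_scale, x.dtype)
    if need_unsqz:
        output = output.unsqueeze(dim=1)
    return output

# A [4, 1, 64] activation round-trips through the 2-D kernel path:
x = torch.randn(4, 1, 64)
w = torch.randn(64, 32)
print(apply_w8a8_dynamic(x, w).shape)  # torch.Size([4, 1, 32])
```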
```diff
@@ -64,6 +64,13 @@ QUANT_MODEL_PREFIX_MAPPINGS: dict[str, dict[str, str]] = {
         "mm_projector.linear_1": "mm_projector.proj.0",
         "mm_projector.linear_2": "mm_projector.proj.2",
     },
+    "qwen3_omni_moe_thinker": {
+        "thinker.lm_head.": "language_model.lm_head.",
+        "thinker.model.": "language_model.model.",
+        "thinker.": "",
+        "lm_head.": "language_model.lm_head.",
+        "model.": "language_model.model.",
+    },
 }

 # key: model_type
```
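The new `qwen3_omni_moe_thinker` entry maps quantized-checkpoint prefixes onto the module names vLLM uses for the thinker submodel. A minimal sketch of how such a table could be applied follows; `remap_key` is a hypothetical helper for illustration, not code from this PR. Note that ordering matters: the specific `"thinker.lm_head."` entry must match before the catch-all `"thinker."`:

```python
QWEN3_OMNI_MOE_THINKER_PREFIXES = {
    "thinker.lm_head.": "language_model.lm_head.",
    "thinker.model.": "language_model.model.",
    "thinker.": "",
    "lm_head.": "language_model.lm_head.",
    "model.": "language_model.model.",
}

def remap_key(key: str, prefix_map: dict[str, str]) -> str:
    # First matching prefix wins; more specific entries must come first
    # (dicts preserve insertion order in Python 3.7+).
    for old, new in prefix_map.items():
        if key.startswith(old):
            return new + key[len(old):]
    return key

print(remap_key("thinker.model.layers.0.mlp.gate_proj.weight",
                QWEN3_OMNI_MOE_THINKER_PREFIXES))
# -> "language_model.model.layers.0.mlp.gate_proj.weight"
```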
```diff
@@ -186,6 +193,18 @@ packed_modules_model_mapping: dict[str, dict[str, list[str]]] = {
         ],
         "experts": ["experts.0.w1", "experts.0.w2", "experts.0.w3"],
     },
+    "qwen3_omni_moe_text": {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
+    },
 }

 
```
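The `qwen3_omni_moe_text` entry records which per-shard checkpoint weights (`q_proj`/`k_proj`/`v_proj`, `gate_proj`/`up_proj`) feed each fused vLLM module (`qkv_proj`, `gate_up_proj`). Here is one way such a mapping can be used to resolve a fused module name back to its checkpoint shards; `shard_names` is a hypothetical helper sketched for illustration, not the loader's actual code:

```python
QWEN3_OMNI_MOE_TEXT_PACKED = {
    "qkv_proj": ["q_proj", "k_proj", "v_proj"],
    "gate_up_proj": ["gate_proj", "up_proj"],
    "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
}

def shard_names(fused_prefix: str, packed: dict[str, list[str]]) -> list[str]:
    # Map a fused module name back to the per-shard names that appear in the
    # quantized checkpoint, e.g. "...self_attn.qkv_proj" -> q/k/v entries.
    module = fused_prefix.rsplit(".", 1)[-1]
    parent = fused_prefix[: -len(module)]
    return [parent + s for s in packed.get(module, [module])]

print(shard_names("model.layers.0.self_attn.qkv_proj", QWEN3_OMNI_MOE_TEXT_PACKED))
# -> ['model.layers.0.self_attn.q_proj', 'model.layers.0.self_attn.k_proj',
#     'model.layers.0.self_attn.v_proj']
```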
```diff
@@ -457,7 +476,10 @@ class AscendModelSlimConfig(QuantizationConfig):
                 "to have the same precision."
             )
         else:
-            is_skipped = self.quant_description[prefix + ".weight"] == "FLOAT"
+            is_skipped = any(
+                key.startswith(prefix) and key.endswith(".weight") and value == "FLOAT"
+                for key, value in self.quant_description.items()
+            )

         assert is_skipped is not None
         return is_skipped
```
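The last hunk relaxes the skip check in `AscendModelSlimConfig`: instead of requiring an exact `<prefix>.weight` key (which would raise `KeyError` when no such key exists, presumably the case for packed prefixes whose checkpoint keys keep per-shard suffixes), it scans every entry under the prefix and skips quantization if any matching `.weight` is marked `FLOAT`. A usage sketch against an illustrative `quant_description` follows; the keys and precision tags below are assumptions for the example, not taken from a real export:

```python
# Illustrative ModelSlim-style quant description (assumed keys/values).
quant_description = {
    "language_model.model.layers.0.self_attn.q_proj.weight": "W8A8_DYNAMIC",
    "language_model.model.layers.0.self_attn.k_proj.weight": "W8A8_DYNAMIC",
    "language_model.model.layers.0.self_attn.v_proj.weight": "W8A8_DYNAMIC",
    "visual.blocks.0.attn.qkv.weight": "FLOAT",
}

def is_layer_skipped(prefix: str, quant_description: dict[str, str]) -> bool:
    # Same predicate the hunk introduces: any FLOAT ".weight" entry under the
    # prefix means the layer stays unquantized.
    return any(
        key.startswith(prefix) and key.endswith(".weight") and value == "FLOAT"
        for key, value in quant_description.items()
    )

print(is_layer_skipped("visual.blocks.0.attn.qkv", quant_description))  # True
print(is_layer_skipped(
    "language_model.model.layers.0.self_attn.q_proj", quant_description))  # False
```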