[Model]Add Qwen3-Omni quantization Ascend NPU adaptation and optimization (#6828)
### What this PR does / why we need it?
This pull request adapts Qwen3-Omni for quantization on Ascend NPU. It delivers operator-level optimization and AUT (Auto-Quantization Tuning) component optimization through patch-based modifications.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.15.0
- vLLM main: 83b47f67b1
---------
Signed-off-by: tanhaoan333 <tanhaoan@huawei.com>
```diff
@@ -79,6 +79,11 @@ class AscendW8A8DynamicLinearMethod(AscendLinearScheme):
         tp_rank: int | None = 0,
     ) -> torch.Tensor:
         quantized_x, pertoken_scale = torch_npu.npu_dynamic_quant(x)
+        need_unsqz = False
+        if pertoken_scale.dim() == 2:
+            need_unsqz = True
+            quantized_x = quantized_x.squeeze(dim=1)
+            pertoken_scale = pertoken_scale.squeeze(dim=1)
         output = torch_npu.npu_quant_matmul(
             quantized_x,
             layer.weight,
@@ -87,6 +92,8 @@ class AscendW8A8DynamicLinearMethod(AscendLinearScheme):
             bias=bias,
             output_dtype=x.dtype,
         )
+        if need_unsqz:
+            output = output.unsqueeze(dim=1)
         return output

     def process_weights_after_loading(self, layer):
```
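The shape handling in the patch above can be sketched in plain PyTorch. This is a minimal illustration, not the actual implementation: the quantization and matmul stand-ins below (`dynamic_quant_matmul_sketch` and its per-token scaling) are hypothetical replacements for `torch_npu.npu_dynamic_quant` and `torch_npu.npu_quant_matmul`, which run only on Ascend NPU. The point shown is the added squeeze/unsqueeze logic: when the per-token scale comes back 2-D (i.e. the activation is a 3-D `[B, 1, H]` tensor), the inputs are flattened to 2-D before the quantized matmul and the output is restored afterwards.

```python
import torch

def dynamic_quant_matmul_sketch(x: torch.Tensor, weight: torch.Tensor) -> torch.Tensor:
    # Stand-in for torch_npu.npu_dynamic_quant: per-token int8 quantization.
    pertoken_scale = x.abs().amax(dim=-1) / 127.0            # one scale per token
    quantized_x = torch.round(x / pertoken_scale.unsqueeze(-1)).to(torch.int8)

    # The logic added by this patch: flatten a [B, 1, H] input to 2-D.
    need_unsqz = False
    if pertoken_scale.dim() == 2:
        need_unsqz = True
        quantized_x = quantized_x.squeeze(dim=1)             # [B, 1, H] -> [B, H]
        pertoken_scale = pertoken_scale.squeeze(dim=1)       # [B, 1]    -> [B]

    # Stand-in for torch_npu.npu_quant_matmul: dequantize and multiply.
    output = (quantized_x.float() * pertoken_scale.unsqueeze(-1)) @ weight.t()

    if need_unsqz:
        output = output.unsqueeze(dim=1)                     # restore [B, 1, N]
    return output

x = torch.randn(4, 1, 8)     # decode-style activation: batch 4, seq len 1
w = torch.randn(16, 8)
print(dynamic_quant_matmul_sketch(x, w).shape)               # torch.Size([4, 1, 16])
```

A 2-D input skips the squeeze/unsqueeze path entirely, so both layouts flow through the same matmul call.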