Support Qwen2.5-VL W8A8 quantization (#2778)

### What this PR does / why we need it?

Support Qwen2.5-VL W8A8 quantization.

### Does this PR introduce _any_ user-facing change?

N/A

### How was this patch tested?

- vLLM version: v0.10.1.1
- vLLM main: 62f66be1f7

---------

Signed-off-by: lijiaojiao <lijiaojiao990304@163.com>
Co-authored-by: lijiaojiao <lijiaojiao990304@163.com>
2025-09-11 16:40:51 +08:00
parent 2b9269b581
commit bd3dedea61
3 changed files with 103 additions and 3 deletions
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -53,6 +53,7 @@ class AscendQuantConfig(QuantizationConfig):
    """

    def __init__(self, quant_config: Dict[str, Any]):
+        super().__init__()
        self.quant_description = quant_config

    def __repr__(self) -> str:
@@ -89,6 +90,8 @@ class AscendQuantConfig(QuantizationConfig):
    def get_quant_method(self, layer: torch.nn.Module,
                         prefix: str) -> Optional["QuantizeMethodBase"]:
        from vllm.attention.layer import Attention
+        if prefix.startswith("language_model"):
+            prefix = prefix.split('.', 1)[-1]
        if isinstance(layer, LinearBase):
            if self.is_layer_skipped_ascend(prefix,
                                            self.packed_modules_mapping):