Support Qwen2.5-VL W8A8 quantization (#2778)

### What this PR does / why we need it?
Add W8A8 quantization support for Qwen2.5-VL. The model registers its decoder layers under a `language_model.` prefix, so `AscendQuantConfig.get_quant_method` now strips that leading component before matching a layer against the quantization description.
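
For context, a hedged usage sketch of how a quantized checkpoint would be served; the model path is a placeholder and the arguments follow vllm-ascend's general W8A8 examples rather than anything introduced by this PR:

```python
# Hypothetical usage sketch: serving a W8A8-quantized Qwen2.5-VL
# checkpoint through vllm-ascend. The model path is a placeholder;
# quantization="ascend" routes quantized layers through
# AscendQuantConfig, which this PR teaches to resolve prefixes
# under "language_model.".
from vllm import LLM, SamplingParams

llm = LLM(
    model="/path/to/Qwen2.5-VL-7B-Instruct-w8a8",  # placeholder path
    quantization="ascend",  # select the Ascend quantization backend
    max_model_len=4096,
)
outputs = llm.generate(
    ["Describe the image."],
    SamplingParams(temperature=0.0, max_tokens=64),
)
print(outputs[0].outputs[0].text)
```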
### Does this PR introduce _any_ user-facing change?
N/A
### How was this patch tested?

- vLLM version: v0.10.1.1
- vLLM main: 62f66be1f7

---------

Signed-off-by: lijiaojiao <lijiaojiao990304@163.com>
Co-authored-by: lijiaojiao <lijiaojiao990304@163.com>
Author: 6lazijiamo
Date: 2025-09-11 16:40:51 +08:00
Parent: 2b9269b581
Commit: bd3dedea61
Changed: 3 files, 103 additions, 3 deletions


```diff
@@ -53,6 +53,7 @@ class AscendQuantConfig(QuantizationConfig):
     """
 
     def __init__(self, quant_config: Dict[str, Any]):
+        super().__init__()
         self.quant_description = quant_config
 
     def __repr__(self) -> str:
@@ -89,6 +90,8 @@ class AscendQuantConfig(QuantizationConfig):
     def get_quant_method(self, layer: torch.nn.Module,
                          prefix: str) -> Optional["QuantizeMethodBase"]:
         from vllm.attention.layer import Attention
+        if prefix.startswith("language_model"):
+            prefix = prefix.split('.', 1)[-1]
         if isinstance(layer, LinearBase):
             if self.is_layer_skipped_ascend(prefix,
                                             self.packed_modules_mapping):
```
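
The prefix handling above is the core of the change: Qwen2.5-VL wraps its decoder as `language_model.<...>`, while the quantization description keys layers without that wrapper. A standalone sketch of the normalization follows; the `normalize_prefix` helper name is illustrative, not part of the patch:

```python
# Minimal sketch of the prefix normalization added in get_quant_method.
# Qwen2.5-VL exposes its decoder as "language_model.<...>", while the
# quant description keys layers without that wrapper, so the leading
# dotted component is stripped before the skip-layer lookup.
def normalize_prefix(prefix: str) -> str:
    if prefix.startswith("language_model"):
        # "language_model.model.layers.0.mlp.gate_proj"
        # becomes "model.layers.0.mlp.gate_proj".
        prefix = prefix.split('.', 1)[-1]
    return prefix

assert normalize_prefix(
    "language_model.model.layers.0.mlp.gate_proj"
) == "model.layers.0.mlp.gate_proj"
# Vision-tower layers are left untouched.
assert normalize_prefix("visual.blocks.0.attn.qkv") == "visual.blocks.0.attn.qkv"
```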