[Quantization] Support compressed tensors w8a8 static and w8a8 dynamic weight (#4036)
### What this PR does / why we need it? While using the LLM Compressor quantization tool from the VLLM community to generate quantized weights, the VLLM Ascend engine needs to be adapted to support the compressed tensors quantization format. 1. Add AscendCompressedTensorsConfig to replace CompressedTensorsConfig in vllm. 2. Support CompressedTensorsW8A8 static weight. - weight: per-channel, int8, symmetric; activation: per-tensor, int8, symmetric. 4. Support CompressedTensorsW8A8Dynamic weight. - weight: per-channel, int8, symmetric; activation: per-token, int8, symmetric, dynamic. 5. Modify the override_quantization_method in AscendQuantConfig. Co-authored-by: taoqun110 taoqun@huawei.com Co-authored-by: chenxi-hh chen464822955@163.com - vLLM version: v0.11.2 --------- Signed-off-by: LHXuuu <scut_xlh@163.com> Signed-off-by: chenxi-hh <chen464822955@163.com> Signed-off-by: chenxi-hh <32731611+chenxi-hh@users.noreply.github.com> Co-authored-by: chenxi-hh <chen464822955@163.com> Co-authored-by: chenxi-hh <32731611+chenxi-hh@users.noreply.github.com>
This commit is contained in:
@@ -1,7 +1,10 @@
|
||||
from typing import Any, Dict, Optional, Type
|
||||
|
||||
import torch
|
||||
from vllm.logger import logger
|
||||
|
||||
from vllm_ascend.utils import COMPRESSED_TENSORS_METHOD
|
||||
|
||||
from .w4a4_flatquant_dynamic import AscendW4A4FlatQuantDynamicLinearMethod
|
||||
from .w4a8_dynamic import (AscendW4A8DynamicFusedMoEMethod,
|
||||
AscendW4A8DynamicLinearMethod)
|
||||
@@ -60,8 +63,28 @@ def get_linear_quant_type(quant_description: Dict[str, Any], prefix: str,
|
||||
def get_quant_method(quant_description: Dict[str, Any],
|
||||
prefix: str,
|
||||
layer_type: str,
|
||||
packed_modules_mapping: Optional[Dict[str, Any]] = None):
|
||||
logger.info_once("Using the vLLM Ascend Quantization now!")
|
||||
packed_modules_mapping: Optional[Dict[str, Any]] = None,
|
||||
layer: torch.nn.Module = None):
|
||||
if quant_description.get("quant_method") == COMPRESSED_TENSORS_METHOD:
|
||||
return get_quant_method_llmcompressor(layer)
|
||||
|
||||
return get_quant_method_modelslim(quant_description, prefix, layer_type,
|
||||
packed_modules_mapping)
|
||||
|
||||
|
||||
def get_quant_method_llmcompressor(layer: torch.nn.Module):
|
||||
logger.info_once("Using the vLLM Ascend llmcompressor Quantization now!")
|
||||
if layer.scheme is None:
|
||||
raise ValueError("A scheme must be defined for each layer")
|
||||
return layer.scheme
|
||||
|
||||
|
||||
def get_quant_method_modelslim(
|
||||
quant_description: Dict[str, Any],
|
||||
prefix: str,
|
||||
layer_type: str,
|
||||
packed_modules_mapping: Optional[Dict[str, Any]] = None):
|
||||
logger.info_once("Using the vLLM Ascend modelslim Quantization now!")
|
||||
if packed_modules_mapping is None:
|
||||
packed_modules_mapping = dict()
|
||||
# Attention
|
||||
|
||||
Reference in New Issue
Block a user