xc-llm-ascend/vllm_ascend/quantization/compressed_tensors_config.py
Li Wang 83a4065b4b [CI] Add pre-commit check for patch logger (#7446)
### What this PR does / why we need it?
See https://github.com/vllm-project/vllm-ascend/pull/7402; the pre-commit
hook will forbid `init_logger(__name__)` in vllm_ascend patch modules.

- vLLM version: v0.17.0
- vLLM main: 8a680463fa

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2026-03-19 16:53:20 +08:00


#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
"""LLM-Compressor (compressed_tensors) quantization configuration for Ascend."""
from typing import Any, Optional, cast
import torch
from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy, QuantizationType
from vllm.logger import logger
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS, register_quantization_config
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig, QuantizeMethodBase
from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
find_matched_target,
is_activation_quantization_format,
should_ignore_layer,
)
from vllm.model_executor.models.utils import WeightsMapper
from vllm_ascend.utils import COMPRESSED_TENSORS_METHOD
from .methods import AscendLinearScheme, AscendMoEScheme


# Remove the original compressed_tensors method to replace with our implementation
def _remove_quantization_method():
    if COMPRESSED_TENSORS_METHOD in QUANTIZATION_METHODS:
        QUANTIZATION_METHODS.remove(COMPRESSED_TENSORS_METHOD)


_remove_quantization_method()

QUANTIZATION_SCHEME_MAP_TYPE = dict[str, dict[str, "QuantizationArgs"] | None]


@register_quantization_config(COMPRESSED_TENSORS_METHOD)
class AscendCompressedTensorsConfig(QuantizationConfig):
    """Config class for LLM-Compressor (compressed_tensors) quantization on Ascend.

    This class adapts the compressed_tensors format to work with Ascend's
    quantization implementations.
    """

    def __init__(
        self,
        target_scheme_map: dict[str, Any],
        ignore: list[str],
        quant_format: str,
        config: dict[str, Any] | None = None,
    ):
        super().__init__()
        self.ignore = ignore
        self.quant_format = quant_format
        # Map from [target -> scheme]
        self.target_scheme_map = target_scheme_map
        self.quant_description = config

    def get_name(self) -> str:
        return "compressed-tensors"

    @classmethod
    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
        return [torch.int8, torch.float16, torch.bfloat16]

    @classmethod
    def get_min_capability(cls) -> int:
        raise NotImplementedError('Ascend hardware does not support the "get_min_capability" feature.')

    @classmethod
    def get_config_filenames(cls) -> list[str]:
        return []

    def _add_fused_moe_to_target_scheme_map(self):
        """Update target_scheme_map so that schemes targeting 'Linear' also match FusedMoE modules.

        Since linear layers get fused into FusedMoE, a config group that targets
        'Linear' needs to apply to FusedMoE modules as well.
        """
        if "Linear" not in self.target_scheme_map or "FusedMoE" in self.target_scheme_map:
            return
        self.target_scheme_map["FusedMoE"] = self.target_scheme_map["Linear"]

    @classmethod
    def from_config(cls, config: dict[str, Any]) -> "AscendCompressedTensorsConfig":
        ignore: list[str] = cast(list[str], config.get("ignore", []))
        quant_format = cast(str, config.get("format"))
        target_scheme_map = cls._quantization_scheme_map_from_config(config=config)
        return cls(
            target_scheme_map=target_scheme_map,
            ignore=ignore,
            quant_format=quant_format,
            config=config,
        )

    @classmethod
    def _quantization_scheme_map_from_config(cls, config: dict[str, Any]) -> QUANTIZATION_SCHEME_MAP_TYPE:
        """Build target scheme map from config.

        :param config: The `quantization_config` dictionary from config.json
        :return: A dictionary mapping target layer names to their corresponding
            quantization_args for weights and input activations
        """
        target_scheme_map: dict[str, Any] = dict()
        quant_format = cast(str, config.get("format"))
        config_groups = config.get("config_groups", dict())
        for _, quant_config in config_groups.items():
            targets = quant_config.get("targets")
            for target in targets:
                target_scheme_map[target] = {}
                target_scheme_map[target]["weights"] = QuantizationArgs.model_validate(quant_config.get("weights"))
                target_scheme_map[target]["input_activations"] = None
                target_scheme_map[target]["format"] = quant_config.get("format")
                format = target_scheme_map[target].get("format")
                # If no per-config format defined, use global format in config
                act_quant_format = (
                    is_activation_quantization_format(format)
                    if format is not None
                    else is_activation_quantization_format(quant_format)
                )
                input_activations = quant_config.get("input_activations")
                if act_quant_format and input_activations is not None:
                    target_scheme_map[target]["input_activations"] = QuantizationArgs.model_validate(
                        quant_config.get("input_activations")
                    )
        return target_scheme_map

    def get_quant_method(
        self,
        layer: torch.nn.Module,
        prefix: str,
    ) -> Optional["QuantizeMethodBase"]:
        from .method_adapters import AscendFusedMoEMethod, AscendLinearMethod

        if isinstance(layer, LinearBase):
            layer.ascend_quant_method = COMPRESSED_TENSORS_METHOD
            # Get the scheme for this layer
            linear_scheme = self._get_linear_scheme(layer=layer, layer_name=prefix)
            # Return unquantized method if no scheme found
            if linear_scheme is None:
                return UnquantizedLinearMethod()
            # Store scheme on layer for reference (optional, for debugging)
            layer.scheme = linear_scheme
            logger.info_once("Using vLLM Ascend LLM-Compressor quantization.")
            return AscendLinearMethod(linear_scheme)

        if isinstance(layer, FusedMoE):
            # Delayed import to avoid circular import
            from vllm_ascend.ops.fused_moe.fused_moe import AscendUnquantizedFusedMoEMethod

            layer.ascend_quant_method = COMPRESSED_TENSORS_METHOD
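            # Use "<prefix>.0.gate_proj" (presumably the first expert's gate_proj) as a
            # representative sub-layer name when matching this FusedMoE against the targets.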
            layer_name = prefix + ".0.gate_proj"
            # Get the scheme for this layer
            moe_scheme = self._get_moe_scheme(layer=layer, layer_name=layer_name)
            # Return unquantized method if no scheme found
            if moe_scheme is None:
                return AscendUnquantizedFusedMoEMethod(layer.moe_config)
            # Store scheme on layer for reference (optional, for debugging)
            layer.scheme = moe_scheme
            logger.info_once("Using vLLM Ascend LLM-Compressor quantization.")
            return AscendFusedMoEMethod(moe_scheme, layer.moe_config)

        return None

    def _get_linear_scheme(self, layer: torch.nn.Module, layer_name: str | None = None) -> AscendLinearScheme | None:
        """Get the linear quantization scheme for a layer.

        Returns:
            An AscendLinearScheme instance, or None if the layer
            should use the unquantized method.
        """
        weight_quant, input_quant, format = self._get_quant_args(layer, layer_name)
        if weight_quant is None:
            return None
        scheme = self._create_scheme_for_layer_type(
            weight_quant=weight_quant,
            input_quant=input_quant,
            format=format,
            layer_type="linear",
        )
        return cast(AscendLinearScheme, scheme)

    def _get_moe_scheme(self, layer: torch.nn.Module, layer_name: str | None = None) -> AscendMoEScheme | None:
        """Get the MoE quantization scheme for a layer.

        Returns:
            An AscendMoEScheme instance, or None if the layer
            should use the unquantized method.
        """
        # Add FusedMoE to target scheme map if needed
        self._add_fused_moe_to_target_scheme_map()
        weight_quant, input_quant, format = self._get_quant_args(layer, layer_name)
        if weight_quant is None:
            return None
        scheme = self._create_scheme_for_layer_type(
            weight_quant=weight_quant,
            input_quant=input_quant,
            format=format,
            layer_type="moe",
        )
        return cast(AscendMoEScheme, scheme)

    def _get_quant_args(
        self, layer: torch.nn.Module, layer_name: str | None = None
    ) -> tuple[Optional["QuantizationArgs"], Optional["QuantizationArgs"], str | None]:
        """Extract quantization arguments for a layer.

        compressed-tensors supports non-uniform quantization as follows:
        there can be N config_groups, each with its own quantization scheme.
        Each config_group has a list of targets, where a target can be a full
        layer_name, a regex for a layer_name, or an nn.Module name.

        Detect whether a layer_name is found in any target and
        use the quantization scheme corresponding to the matched target.

        Returns:
            A tuple of (weight_quant, input_quant, format). weight_quant is
            None if the layer should use the unquantized method.
        """
        scheme_dict = self.get_scheme_dict(layer, layer_name)
        weight_quant = None
        input_quant = None
        format = None
        if scheme_dict:
            weight_quant = scheme_dict.get("weights")
            input_quant = scheme_dict.get("input_activations")
            format = scheme_dict.get("format")
        if weight_quant is None:
            logger.warning_once(
                "Acceleration for non-quantized schemes is "
                "not supported by Compressed Tensors. "
                "Falling back to UnquantizedLinearMethod"
            )
        return weight_quant, input_quant, format

    def get_scheme_dict(
        self, layer: torch.nn.Module, layer_name: str | None = None
    ) -> dict[str, QuantizationArgs | str | None] | None:
        """Extract the QuantizationArgs for a given layer.

        Returns:
            dict with {
                "weights": QuantizationArgs,
                "input_activations": QuantizationArgs | None,
                "format": str | None
            } | None
        """
        if should_ignore_layer(layer_name, ignore=self.ignore, fused_mapping=self.packed_modules_mapping):
            return None
        if self.target_scheme_map:
            matched_target = find_matched_target(
                layer_name=layer_name,
                module=layer,
                targets=self.target_scheme_map.keys(),
                fused_mapping=self.packed_modules_mapping,
            )
            scheme_dict = self.target_scheme_map[matched_target]
            if scheme_dict.get("format") is None:
                scheme_dict["format"] = self.quant_format
            return scheme_dict
        return None

    def _create_scheme_for_layer_type(
        self,
        weight_quant: "QuantizationArgs",
        input_quant: Optional["QuantizationArgs"],
        format: str | None,
        layer_type: str,
    ) -> AscendLinearScheme | AscendMoEScheme:
        """Create the appropriate Ascend scheme based on quantization args and layer type.

        Args:
            weight_quant: Weight quantization arguments.
            input_quant: Input activation quantization arguments.
            format: Per-layer format, if defined.
            layer_type: Type of layer ("linear" or "moe").

        Returns:
            An instance of the appropriate Ascend quantization scheme.
        """
        from .methods import get_scheme_class

        # Determine the quantization type
        quant_type = self._detect_quant_type(weight_quant, input_quant, format)
        # Get the scheme class from registry
        scheme_cls = get_scheme_class(quant_type, layer_type)
        if scheme_cls is None:
            raise NotImplementedError(
                f"No compressed-tensors compatible scheme was found for "
                f"quant_type={quant_type}, layer_type={layer_type}."
            )
        return scheme_cls()

    def _detect_quant_type(
        self,
        weight_quant: "QuantizationArgs",
        input_quant: Optional["QuantizationArgs"],
        format: str | None,
    ) -> str:
        """Detect the quantization type from quantization arguments.

        Args:
            weight_quant: Weight quantization arguments.
            input_quant: Input activation quantization arguments.
            format: Per-layer format, if defined.

        Returns:
            A string representing the quantization type (e.g., "W8A8", "W8A8_DYNAMIC").
        """
        # use the per-layer format if defined, otherwise, use global format
        format = format if format is not None else self.quant_format
        act_quant_format = is_activation_quantization_format(format)
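        # For example, int8 channel-wise static weights with static per-tensor int8
        # activations resolve to "W8A8", while dynamic per-token int8 activations
        # resolve to "W8A8_DYNAMIC" (see the predicate helpers below).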
        if act_quant_format and input_quant is not None:
            if self._is_static_tensor_w8a8(weight_quant, input_quant):
                return "W8A8"
            if self._is_dynamic_token_w8a8(weight_quant, input_quant):
                return "W8A8_DYNAMIC"
            if self._is_dynamic_token_w4a8(weight_quant, input_quant):
                return "W4A8_DYNAMIC"
        if self._is_w4a16(weight_quant, input_quant):
            return "W4A16"
        raise NotImplementedError("No compressed-tensors compatible quantization type was found.")

    def _is_static_tensor_w8a8(self, weight_quant: "QuantizationArgs", input_quant: "QuantizationArgs") -> bool:
        is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
        weight_strategy = weight_quant.strategy == QuantizationStrategy.CHANNEL.value
        is_tensor = weight_strategy and input_quant.strategy == QuantizationStrategy.TENSOR.value
        is_static = not weight_quant.dynamic and not input_quant.dynamic
        is_symmetric = weight_quant.symmetric and input_quant.symmetric
        # Only symmetric input quantization supported.
        # Only symmetric weight quantization supported.
        return is_8_bits and is_tensor and is_symmetric and is_static

    def _is_dynamic_token_w8a8(self, weight_quant: "QuantizationArgs", input_quant: "QuantizationArgs") -> bool:
        is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
        weight_strategy = weight_quant.strategy == QuantizationStrategy.CHANNEL.value
        is_token = weight_strategy and input_quant.strategy == QuantizationStrategy.TOKEN.value
        is_dynamic = not weight_quant.dynamic and input_quant.dynamic
        is_symmetric = weight_quant.symmetric and input_quant.symmetric
        # Only symmetric input quantization supported.
        # Only symmetric weight quantization supported.
        return is_8_bits and is_token and is_symmetric and is_dynamic

    def _is_dynamic_token_w4a8(self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs) -> bool:
        is_4_bits = weight_quant.num_bits == 4
        is_8_bits = input_quant.num_bits == 8
        weight_strategy = (weight_quant.strategy == QuantizationStrategy.CHANNEL.value) or (
            weight_quant.strategy == QuantizationStrategy.GROUP.value
        )
        is_token = weight_strategy and input_quant.strategy == QuantizationStrategy.TOKEN.value
        is_dynamic = not weight_quant.dynamic and input_quant.dynamic
        is_symmetric = weight_quant.symmetric and input_quant.symmetric
        # Adapt for AscendW4A8DynamicFusedMoEMethod
        assert self.quant_description is not None, "quant_description should not be None"
        if weight_strategy:
            self.quant_description["group_size"] = weight_quant.group_size if weight_quant.group_size else 0
            self.quant_description["version"] = "0"
            self.quant_description["ascend_quant_method"] = COMPRESSED_TENSORS_METHOD
            self.quant_description["weight_strategy"] = str(weight_quant.strategy)
        # Only symmetric input quantization supported.
        # Only symmetric weight quantization supported.
        return is_4_bits and is_8_bits and is_token and is_symmetric and is_dynamic

    def _is_w4a16(self, weight_quant: "QuantizationArgs", input_quant: Optional["QuantizationArgs"]) -> bool:
        # Confirm weights quantized.
        if weight_quant is None:
            return False
        # Confirm we have integer type.
        if weight_quant.type != QuantizationType.INT:
            return False
        input_quant_none = input_quant is None
        is_4_bits = weight_quant.num_bits == 4
        is_group = weight_quant.strategy == QuantizationStrategy.GROUP.value
        is_static = not weight_quant.dynamic
        return input_quant_none and is_4_bits and is_group and is_static

    def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
        self.target_scheme_map = hf_to_vllm_mapper.apply_dict(self.target_scheme_map)
        self.ignore = hf_to_vllm_mapper.apply_list(self.ignore)