[Quantization][Feature] Support compressed tensors moe w4a8 dynamic weight (#5889)

### What this PR does / why we need it?

When quantized weights are produced with the vLLM community's LLM Compressor
tool, the vLLM Ascend engine must be adapted to support the resulting
compressed-tensors quantization format.

1. Support W4A8 dynamic weights for MoE models (see the config sketch below).
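
For context, a compressed-tensors `quantization_config` along these lines would now be routed to the W4A8 dynamic MoE path. This is a sketch only: the field layout follows the compressed-tensors schema, but the concrete values (e.g. `group_size=128`) are illustrative, not taken from this PR.

```python
# Illustrative quantization_config, roughly as LLM Compressor might emit it
# for a MoE model: 4-bit grouped symmetric static weights paired with
# 8-bit per-token dynamic activations.
quantization_config = {
    "config_groups": {
        "group_0": {
            "targets": ["Linear"],
            "weights": {
                "num_bits": 4,
                "strategy": "group",
                "group_size": 128,   # assumed value for illustration
                "symmetric": True,
                "dynamic": False,
            },
            "input_activations": {
                "num_bits": 8,
                "strategy": "token",
                "symmetric": True,
                "dynamic": True,
            },
        }
    },
    "format": "pack-quantized",
    "quant_method": "compressed-tensors",
}
```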

- vLLM version: v0.13.0
- vLLM main: bde38c11df

---------

Signed-off-by: LHXuuu <scut_xlh@163.com>
Signed-off-by: menogrey <1299267905@qq.com>
Co-authored-by: menogrey <1299267905@qq.com>

@@ -187,8 +187,9 @@ class AscendCompressedTensorsConfig(QuantizationConfig):
                 AscendUnquantizedFusedMoEMethod
         layer.ascend_quant_method = COMPRESSED_TENSORS_METHOD
+        layer_name = prefix + ".0.gate_proj"
         # Get the scheme for this layer
-        moe_scheme = self._get_moe_scheme(layer=layer, layer_name=prefix)
+        moe_scheme = self._get_moe_scheme(layer=layer, layer_name=layer_name)
         # Return unquantized method if no scheme found
         if moe_scheme is None:
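
This first hunk fixes the scheme lookup for MoE layers: compressed-tensors schemes are matched against concrete linear-submodule names, so passing the bare MoE `prefix` found no scheme, while resolving against a representative expert projection (`prefix + ".0.gate_proj"`) does. A toy sketch of the assumed suffix-matching behavior (the helper below is hypothetical, not vLLM Ascend API):

```python
# Hypothetical stand-in for the scheme matcher's layer-name test.
def matches_target(layer_name: str, target: str) -> bool:
    return layer_name.endswith(target)

prefix = "model.layers.0.mlp.experts"  # illustrative MoE layer prefix

assert not matches_target(prefix, "gate_proj")                # old lookup: no hit
assert matches_target(prefix + ".0.gate_proj", "gate_proj")   # new lookup: hit
```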
@@ -382,6 +383,9 @@ class AscendCompressedTensorsConfig(QuantizationConfig):
         if self._is_dynamic_token_w8a8(weight_quant, input_quant):
             return "W8A8_DYNAMIC"
+        if self._is_dynamic_token_w4a8(weight_quant, input_quant):
+            return "W4A8_DYNAMIC"
         if self._is_w4a16(weight_quant, input_quant):
             return "W4A16"
@@ -415,6 +419,30 @@ class AscendCompressedTensorsConfig(QuantizationConfig):
         # Only symmetric input quantization supported.
         # Only symmetric weight quantization supported.
         return is_8_bits and is_token and is_symmetric and is_dynamic
+
+    def _is_dynamic_token_w4a8(self, weight_quant: QuantizationArgs,
+                               input_quant: QuantizationArgs) -> bool:
+        is_4_bits = weight_quant.num_bits == 4
+        is_8_bits = input_quant.num_bits == 8
+        weight_strategy = (
+            weight_quant.strategy == QuantizationStrategy.CHANNEL.value
+            or weight_quant.strategy == QuantizationStrategy.GROUP.value)
+        is_token = (weight_strategy and input_quant.strategy
+                    == QuantizationStrategy.TOKEN.value)
+        is_dynamic = not weight_quant.dynamic and input_quant.dynamic
+        is_symmetric = weight_quant.symmetric and input_quant.symmetric
+        # Adapt for AscendW4A8DynamicFusedMoEMethod
+        assert self.quant_description is not None, \
+            "quant_description should not be None"
+        if weight_strategy:
+            self.quant_description["group_size"] = (
+                weight_quant.group_size if weight_quant.group_size else 0)
+        self.quant_description["version"] = "0"
+        self.quant_description["ascend_quant_method"] = COMPRESSED_TENSORS_METHOD
+        self.quant_description["weight_strategy"] = str(weight_quant.strategy)
+        # Only symmetric input quantization supported.
+        # Only symmetric weight quantization supported.
+        return (is_4_bits and is_8_bits and is_token
+                and is_symmetric and is_dynamic)
+
     def _is_w4a16(self, weight_quant: "QuantizationArgs",
                   input_quant: Optional["QuantizationArgs"]) -> bool:
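
A minimal exercise of the new predicate, assuming `QuantizationArgs` and `QuantizationStrategy` from the compressed-tensors package; the constructor arguments are a sketch and validation details may differ by version:

```python
from compressed_tensors.quantization import (QuantizationArgs,
                                             QuantizationStrategy)

# 4-bit grouped symmetric static weights...
weight_quant = QuantizationArgs(num_bits=4,
                                strategy=QuantizationStrategy.GROUP,
                                group_size=128,  # illustrative value
                                symmetric=True,
                                dynamic=False)
# ...paired with 8-bit per-token dynamic activations.
input_quant = QuantizationArgs(num_bits=8,
                               strategy=QuantizationStrategy.TOKEN,
                               symmetric=True,
                               dynamic=True)

# config._is_dynamic_token_w4a8(weight_quant, input_quant) would return True,
# so the scheme resolver above returns "W4A8_DYNAMIC". Note the side effect:
# the predicate also stamps group_size / version / ascend_quant_method /
# weight_strategy into config.quant_description, which is what
# AscendW4A8DynamicFusedMoEMethod later reads.
```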