[Quantization] Support compressed tensors moe w8a8 int8 dynamic weight (#5718)

### What this PR does / why we need it?
When quantized weights are produced with the LLM Compressor tool from the vLLM community, the vLLM Ascend engine needs to be adapted to support the resulting compressed-tensors quantization format. This PR:

1. Support W8A8 Int8 dynamic weights for MoE models (see the loading sketch below).
2. Specify the W4A16 quantization configuration more precisely.
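For context, here is a minimal offline-inference sketch of how such a checkpoint would be loaded on vLLM Ascend. The model name matches the test added below; `quantization="compressed-tensors"` is normally auto-detected from the checkpoint's config, so passing it explicitly is optional:

```python
# Minimal sketch: load an LLM Compressor W8A8 dynamic MoE checkpoint.
# The quantization method is usually auto-detected from config.json;
# it is spelled out here only for clarity.
from vllm import LLM, SamplingParams

llm = LLM(model="vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8",
          quantization="compressed-tensors",
          tensor_parallel_size=2,
          max_model_len=4096)
print(llm.generate(["The president of the United States is"],
                   SamplingParams(temperature=0.0, max_tokens=5)))
```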

Co-authored-by: menogrey <1299267905@qq.com>
Co-authored-by: kunpengW-code <1289706727@qq.com>

### Does this PR introduce _any_ user-facing change?
No

- vLLM version: v0.13.0
- vLLM main: 2f4e6548ef

---------

Signed-off-by: LHXuuu <scut_xlh@163.com>
Signed-off-by: menogrey <1299267905@qq.com>
Signed-off-by: Wang Kunpeng <1289706727@qq.com>
Co-authored-by: menogrey <1299267905@qq.com>
Co-authored-by: Wang Kunpeng <1289706727@qq.com>
Authored by LHXuuu on 2026-01-14 09:17:26 +08:00; committed by GitHub.
Parent: ecf2fa482e
Commit: 0415e694cd
5 changed files with 192 additions and 43 deletions


@@ -204,6 +204,7 @@
     "vllm-ascend/Qwen3-30B-A3B-Puring",
     "vllm-ascend/Qwen3-30B-A3B-W8A8",
     "vllm-ascend/Qwen3-30B-A3B-W8A8-Pruning",
+    "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8",
     "vllm-ascend/Qwen3-32B-W4A4",
     "vllm-ascend/Qwen3-32B-W8A8",
     "vllm-ascend/Qwen3-8B",


@@ -72,7 +72,11 @@ pip install llmcompressor
 
 #### Model Quantization
-`LLM-Compressor` provides various quantization scheme examples. To generate W8A8 dynamic quantized weights:
+`LLM-Compressor` provides various quantization scheme examples.
+
+##### Dense Quantization
+An example to generate W8A8 dynamic quantized weights for a dense model:
 ```bash
 # Navigate to LLM-Compressor examples directory
@@ -82,6 +86,18 @@ cd examples/quantization/llm-compressor
 python3 w8a8_int8_dynamic.py
 ```
+
+##### MoE Quantization
+An example to generate W8A8 dynamic quantized weights for an MoE model:
+```bash
+# Navigate to LLM-Compressor examples directory
+cd examples/quantization/llm-compressor
+
+# Run quantization script
+python3 w8a8_int8_dynamic_moe.py
+```
+
 For more content, refer to the [official examples](https://github.com/vllm-project/llm-compressor/tree/main/examples).
 
 Currently supported quantization types by LLM-Compressor: `W8A8` and `W8A8_DYNAMIC`.
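For orientation, the `quantization_config` that LLM Compressor writes into the checkpoint's `config.json` for a W8A8 dynamic scheme looks roughly like the following. This is an illustrative sketch of the compressed-tensors format, not copied from a real checkpoint:

```python
# Illustrative compressed-tensors quantization_config for W8A8 dynamic:
# 8-bit static per-channel weights, 8-bit dynamic per-token activations.
quantization_config = {
    "quant_method": "compressed-tensors",
    "format": "int-quantized",
    "ignore": ["lm_head"],
    "config_groups": {
        "group_0": {
            "targets": ["Linear"],
            "weights": {"num_bits": 8, "type": "int", "symmetric": True,
                        "strategy": "channel", "dynamic": False},
            "input_activations": {"num_bits": 8, "type": "int",
                                  "symmetric": True, "strategy": "token",
                                  "dynamic": True},
        }
    },
}
```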


@@ -0,0 +1,29 @@
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor import oneshot
+from llmcompressor.modifiers.quantization import QuantizationModifier
+
+MODEL_ID = "Qwen/Qwen3-30B-A3B-Instruct-2507"
+
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID, dtype=torch.bfloat16, trust_remote_code=True
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
+recipe = QuantizationModifier(
+    targets="Linear",
+    scheme="INT8",
+    ignore=["lm_head", "re:.*mlp.gate$"],
+)
+
+oneshot(
+    model=model,
+    recipe=recipe,
+    trust_remote_code_model=True,
+)
+
+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-INT8_W8A8"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
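As a quick sanity check after the script above runs, the saved `config.json` should carry a compressed-tensors `quantization_config`. A small hedged snippet (the `SAVE_DIR` value is what the script above derives; exact field values may differ by llmcompressor version):

```python
import json
import os

# Directory name as produced by the script above.
SAVE_DIR = "Qwen3-30B-A3B-Instruct-2507-INT8_W8A8"

# Inspect the metadata written by save_pretrained(save_compressed=True).
with open(os.path.join(SAVE_DIR, "config.json")) as f:
    cfg = json.load(f)

qcfg = cfg["quantization_config"]
print(qcfg["quant_method"])   # expected: "compressed-tensors"
print(qcfg.get("format"))     # an int-quantized compressed-tensors format
```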


@@ -42,3 +42,26 @@ def test_qwen2_5_w8a8_external_quantized_tp2():
     for i in range(len(vllm_output)):
         assert golden_results[i] == vllm_output[i][1]
         print(f"Generated text: {vllm_output[i][1]!r}")
+
+
+def test_qwen3_moe_w8a8_dynamic_llm_compressor():
+    example_prompts = [
+        "The president of the United States is",
+    ]
+    max_tokens = 5
+
+    with VllmRunner(
+            snapshot_download(
+                "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8"),
+            tensor_parallel_size=2,
+            max_model_len=4096,
+            gpu_memory_utilization=0.8,
+    ) as vllm_model:
+        vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
+
+    golden_results = [
+        'The president of the United States is the head of state and',
+    ]
+    for i in range(len(vllm_output)):
+        assert golden_results[i] == vllm_output[i][1]
+        print(f"Generated text: {vllm_output[i][1]!r}")


@@ -2,7 +2,8 @@ from typing import TYPE_CHECKING, Any, Optional, cast
 
 import torch
 from compressed_tensors.quantization import (QuantizationArgs,
-                                             QuantizationStrategy)
+                                             QuantizationStrategy,
+                                             QuantizationType)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import (LinearBase,
@@ -23,7 +24,8 @@ from vllm_ascend.quantization.quant_config import (AscendFusedMoEMethod,
                                                    AscendQuantConfig)
 from vllm_ascend.quantization.w4a16 import AscendW4A16FusedMoEMethod
 from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod
-from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod
+from vllm_ascend.quantization.w8a8_dynamic import (
+    AscendW8A8DynamicFusedMoEMethod, AscendW8A8DynamicLinearMethod)
 from vllm_ascend.utils import COMPRESSED_TENSORS_METHOD
 
 if TYPE_CHECKING:
@@ -75,6 +77,18 @@ class AscendCompressedTensorsConfig(QuantizationConfig):
     def get_config_filenames(cls) -> list[str]:
         return []
 
+    def _add_fused_moe_to_target_scheme_map(self):
+        """
+        Helper function to update target_scheme_map:
+        since linear layers get fused into FusedMoE,
+        targeting 'Linear' needs to also match
+        FusedMoE modules.
+        """
+        if ("Linear" not in self.target_scheme_map
+                or "FusedMoE" in self.target_scheme_map):
+            return
+        self.target_scheme_map["FusedMoE"] = self.target_scheme_map["Linear"]
+
     @classmethod
     def from_config(cls, config: dict[str,
                                       Any]) -> "AscendCompressedTensorsConfig":
@@ -155,20 +169,48 @@ class AscendCompressedTensorsConfig(QuantizationConfig):
                 None, layer)
             return quant_method
         if isinstance(layer, FusedMoE):
-            layer.ascend_quant_method = COMPRESSED_TENSORS_METHOD
-            quant_scheme = self.get_scheme(layer=layer, layer_name=prefix)
-            # choose quantization method
-            quant_method = AscendUnquantizedFusedMoEMethod(layer.moe_config)
-            if quant_scheme is not None:
-                layer.scheme = quant_scheme
-                ascend_quant_config = AscendQuantConfig(self.quant_description
-                                                        or {})
-                quant_method = AscendFusedMoEMethod(
-                    ascend_quant_config, prefix,
-                    ascend_quant_config.packed_modules_mapping, layer)
-            return quant_method
+            # collect schemes
+            self._add_fused_moe_to_target_scheme_map()
+            unfused_names = [
+                prefix + proj_name for proj_name in
+                [".0.gate_proj", ".0.up_proj", ".0.down_proj"]
+            ]
+            # TODO: refactor this to use expert_mapping and check all layer numbers
+            all_scheme_dicts = [
+                self.get_scheme_dict(layer, name) for name in unfused_names
+            ]
+            scheme_dict = all_scheme_dicts.pop()
+            # multiple schemes found
+            if not all(
+                [cur_dict == scheme_dict for cur_dict in all_scheme_dicts]):
+                raise ValueError("All MoE projections need to have same "
+                                 "quantization scheme but found multiple")
+            if scheme_dict is None:
+                return AscendUnquantizedFusedMoEMethod(layer.moe_config)
+            weight_quant = scheme_dict.get("weights")
+            input_quant = scheme_dict.get("input_activations")
+            quant_scheme = None
+            act_quant_format = is_activation_quantization_format(self.quant_format)
+            if act_quant_format:
+                if self._is_dynamic_token_w8a8(weight_quant, input_quant):
+                    quant_scheme = AscendW8A8DynamicFusedMoEMethod()
+            else:
+                if self._is_w4a16(weight_quant, input_quant):
+                    quant_scheme = AscendW4A16FusedMoEMethod()
+            if quant_scheme is None:
+                raise RuntimeError(
+                    f"Unsupported FusedMoe scheme: {weight_quant}, {input_quant}"
+                )
+            layer.scheme = quant_scheme
+            layer.ascend_quant_method = COMPRESSED_TENSORS_METHOD
+            ascend_quant_config = AscendQuantConfig(self.quant_description
+                                                    or {})
+            return AscendFusedMoEMethod(ascend_quant_config, prefix,
+                                        self.packed_modules_mapping, layer)
         return None
 
     def get_scheme(self,
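The FusedMoE branch above mirrors the 'Linear' target onto 'FusedMoE' because the per-expert gate/up/down Linear layers are fused into a single module at load time, so the compressed-tensors targets no longer match them individually, and it then requires all projections to share one scheme. A minimal standalone sketch of that logic, with plain dicts standing in for the real compressed-tensors objects (names here are hypothetical, not the vllm-ascend API):

```python
# Standalone sketch of the scheme mirroring and consistency check.
target_scheme_map = {
    "Linear": {"weights": "int8", "input_activations": "int8-dynamic"},
}

# Linear sub-layers get fused into FusedMoE, so a 'Linear' target must also
# match FusedMoE modules.
if "Linear" in target_scheme_map and "FusedMoE" not in target_scheme_map:
    target_scheme_map["FusedMoE"] = target_scheme_map["Linear"]

# All expert projections must share one scheme, since a single fused kernel
# handles gate/up/down together.
projection_schemes = [target_scheme_map["FusedMoE"] for _ in range(3)]
reference = projection_schemes.pop()
if not all(scheme == reference for scheme in projection_schemes):
    raise ValueError("All MoE projections need the same quantization scheme")
```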
@@ -188,26 +230,14 @@ class AscendCompressedTensorsConfig(QuantizationConfig):
         to select the CompressedTensorsScheme used for inference.
         """
-        # Find the "target" in the compressed-tensors config
-        # that our layer conforms to.
-        if should_ignore_layer(layer_name,
-                               ignore=self.ignore,
-                               fused_mapping=self.packed_modules_mapping):
-            return None
-
-        # Will be empty for models with only sparsity
-        weight_quant = input_quant = None
-        if self.target_scheme_map:
-            matched_target = find_matched_target(
-                layer_name=layer_name,
-                module=layer,
-                targets=self.target_scheme_map.keys(),
-                fused_mapping=self.packed_modules_mapping,
-            )
-            scheme_dict = self.target_scheme_map[matched_target]
-            weight_quant = scheme_dict.get("weights")
-            input_quant = scheme_dict.get("input_activations")
+        scheme_dict = self.get_scheme_dict(layer, layer_name)
+        weight_quant = None
+        input_quant = None
+        format = None
+        if scheme_dict:
+            weight_quant = scheme_dict.get("weights")
+            input_quant = scheme_dict.get("input_activations")
+            format = scheme_dict.get("format")
 
         if weight_quant is None:
             logger.warning_once("Acceleration for non-quantized schemes is "
@@ -220,13 +250,54 @@ class AscendCompressedTensorsConfig(QuantizationConfig):
         scheme = self._get_scheme_from_parts(
             weight_quant=weight_quant,
             input_quant=input_quant,
+            format=format,
         )
         return scheme
 
+    def get_scheme_dict(
+        self,
+        layer: torch.nn.Module,
+        layer_name: str | None = None
+    ) -> dict[str, QuantizationArgs | str | None] | None:
+        """
+        Extract the QuantizationArgs for a given layer.
+
+        Returns:
+            dict with {
+                "weights": QuantizationArgs,
+                "input_activations": QuantizationArgs | None,
+                "format": str | None
+            } | None
+        """
+        if should_ignore_layer(layer_name,
+                               ignore=self.ignore,
+                               fused_mapping=self.packed_modules_mapping):
+            return None
+        if self.target_scheme_map:
+            matched_target = find_matched_target(
+                layer_name=layer_name,
+                module=layer,
+                targets=self.target_scheme_map.keys(),
+                fused_mapping=self.packed_modules_mapping,
+            )
+            scheme_dict = self.target_scheme_map[matched_target]
+            if scheme_dict.get("format") is None:
+                scheme_dict["format"] = self.quant_format
+            return scheme_dict
+        return None
+
     def _get_scheme_from_parts(
-            self, weight_quant: QuantizationArgs,
-            input_quant: QuantizationArgs) -> "CompressedTensorsScheme":
-        act_quant_format = is_activation_quantization_format(self.quant_format)
+        self,
+        weight_quant: QuantizationArgs,
+        input_quant: QuantizationArgs,
+        format: str | None = None,
+    ) -> "CompressedTensorsScheme":
+        # use the per-layer format if defined, otherwise, use global format
+        format = format if format is not None else self.quant_format
+        act_quant_format = is_activation_quantization_format(format)
 
         if act_quant_format and input_quant is not None:
             if self._is_static_tensor_w8a8(weight_quant, input_quant):
                 return AscendW8A8LinearMethod()
@@ -234,10 +305,6 @@ class AscendCompressedTensorsConfig(QuantizationConfig):
             if self._is_dynamic_token_w8a8(weight_quant, input_quant):
                 return AscendW8A8DynamicLinearMethod()
 
-        if weight_quant is not None:
-            if self._is_w4a16(weight_quant):
-                return AscendW4A16FusedMoEMethod()
-
         raise NotImplementedError(
             "No compressed-tensors compatible scheme was found.")
@@ -269,9 +336,22 @@ class AscendCompressedTensorsConfig(QuantizationConfig):
         # Only symmetric weight quantization supported.
         return is_8_bits and is_token and is_symmetric and is_dynamic
 
-    def _is_w4a16(self, weight_quant: QuantizationArgs) -> bool:
+    def _is_w4a16(self, weight_quant: QuantizationArgs,
+                  input_quant: QuantizationArgs) -> bool:
+        # Confirm weights quantized.
+        if weight_quant is None:
+            return False
+        # Confirm the weights are integer-typed.
+        if weight_quant.type != QuantizationType.INT:
+            return False
+        input_quant_none = input_quant is None
         is_4_bits = weight_quant.num_bits == 4
-        return is_4_bits
+        is_group = (weight_quant.strategy == QuantizationStrategy.GROUP.value)
+        is_static = not weight_quant.dynamic
+        return input_quant_none and is_4_bits and is_group and is_static
 
     def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
         self.target_scheme_map = hf_to_vllm_mapper.apply_dict(
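For reference, a weight-quantization config that passes the tightened `_is_w4a16` check would look roughly like the sketch below. It uses the compressed-tensors `QuantizationArgs` model; the concrete field values (for example `group_size=128`) are assumptions for illustration, not taken from this PR:

```python
# Illustrative W4A16 weight args: 4-bit integer, grouped, statically
# quantized weights with unquantized (None) input activations.
from compressed_tensors.quantization import (QuantizationArgs,
                                             QuantizationStrategy,
                                             QuantizationType)

w4a16_weight_args = QuantizationArgs(
    num_bits=4,                           # is_4_bits
    type=QuantizationType.INT,            # integer weights required
    strategy=QuantizationStrategy.GROUP,  # is_group
    group_size=128,                       # assumed group size
    symmetric=True,
    dynamic=False,                        # is_static
)
input_quant = None  # W4A16 keeps activations in 16-bit, so no input args
```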