[Quantization] Support compressed-tensors MoE W8A8 Int8 dynamic weight (#5718)
### What this PR does / why we need it?
To run quantized weights generated with the LLM Compressor quantization tool from the vLLM community, the vLLM Ascend engine needs to be adapted to support the compressed-tensors quantization format. This PR therefore:
1. Supports W8A8 Int8 dynamic weights for MoE models (see the usage sketch below).
2. Specifies the W4A16 quantization configuration.
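As an illustration of the intended usage, loading such a compressed-tensors quantized MoE checkpoint through the vLLM Python API could look roughly like the sketch below (the checkpoint name is the one exercised by the new e2e test; the other arguments are illustrative, not required):

```python
from vllm import LLM, SamplingParams

# Sketch only: the checkpoint name matches the model used in the new e2e test;
# parallelism and memory settings are illustrative rather than prescriptive.
llm = LLM(
    model="vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8",
    tensor_parallel_size=2,
    max_model_len=4096,
    gpu_memory_utilization=0.8,
)

outputs = llm.generate(
    ["The president of the United States is"],
    SamplingParams(temperature=0.0, max_tokens=5),
)
print(outputs[0].outputs[0].text)
```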
Co-authored-by: menogrey <1299267905@qq.com>
Co-authored-by: kunpengW-code <1289706727@qq.com>
### Does this PR introduce _any_ user-facing change?
No
- vLLM version: v0.13.0
- vLLM main: 2f4e6548ef
---------
Signed-off-by: LHXuuu <scut_xlh@163.com>
Signed-off-by: menogrey <1299267905@qq.com>
Signed-off-by: Wang Kunpeng <1289706727@qq.com>
Co-authored-by: menogrey <1299267905@qq.com>
Co-authored-by: Wang Kunpeng <1289706727@qq.com>
.github/workflows/misc/model_list.json
@@ -204,6 +204,7 @@
    "vllm-ascend/Qwen3-30B-A3B-Puring",
    "vllm-ascend/Qwen3-30B-A3B-W8A8",
    "vllm-ascend/Qwen3-30B-A3B-W8A8-Pruning",
    "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8",
    "vllm-ascend/Qwen3-32B-W4A4",
    "vllm-ascend/Qwen3-32B-W8A8",
    "vllm-ascend/Qwen3-8B",

@@ -72,7 +72,11 @@ pip install llmcompressor

#### Model Quantization

`LLM-Compressor` provides various quantization scheme examples. To generate W8A8 dynamic quantized weights:
`LLM-Compressor` provides various quantization scheme examples.

##### Dense Quantization

An example to generate W8A8 dynamic quantized weights for a dense model:

```bash
# Navigate to LLM-Compressor examples directory
@@ -82,6 +86,18 @@ cd examples/quantization/llm-compressor
python3 w8a8_int8_dynamic.py
```

##### MoE Quantization

An example to generate W8A8 dynamic quantized weights for an MoE model:

```bash
# Navigate to LLM-Compressor examples directory
cd examples/quantization/llm-compressor

# Run quantization script
python3 w8a8_int8_dynamic_moe.py
```

For more details, refer to the [official examples](https://github.com/vllm-project/llm-compressor/tree/main/examples).

Quantization types currently supported with LLM-Compressor: `W8A8` and `W8A8_DYNAMIC`.
@@ -0,0 +1,29 @@
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

MODEL_ID = "Qwen/Qwen3-30B-A3B-Instruct-2507"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, dtype=torch.bfloat16, trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

recipe = QuantizationModifier(
    targets="Linear",
    scheme="INT8",
    ignore=["lm_head", "re:.*mlp.gate$"],
)

oneshot(
    model=model,
    recipe=recipe,
    trust_remote_code_model=True,
)

# Save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-INT8_W8A8"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
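As a quick, purely illustrative sanity check of the script's output: a compressed-tensors checkpoint carries a `quantization_config` entry in its `config.json`, which is what vLLM uses to pick the quantization method (the path below simply mirrors `SAVE_DIR` from the script above):

```python
import json
import os

# Illustrative only: the path mirrors SAVE_DIR from the script above.
with open(os.path.join("Qwen3-30B-A3B-Instruct-2507-INT8_W8A8", "config.json")) as f:
    cfg = json.load(f)

# A compressed-tensors checkpoint is expected to report this quant method.
print(cfg.get("quantization_config", {}).get("quant_method"))
```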
@@ -42,3 +42,26 @@ def test_qwen2_5_w8a8_external_quantized_tp2():
    for i in range(len(vllm_output)):
        assert golden_results[i] == vllm_output[i][1]
        print(f"Generated text: {vllm_output[i][1]!r}")


def test_qwen3_moe_w8a8_dynamic_llm_compressor():
    example_prompts = [
        "The president of the United States is",
    ]
    max_tokens = 5
    with VllmRunner(
            snapshot_download(
                "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8"),
            tensor_parallel_size=2,
            max_model_len=4096,
            gpu_memory_utilization=0.8,
    ) as vllm_model:
        vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)

    golden_results = [
        'The president of the United States is the head of state and',
    ]

    for i in range(len(vllm_output)):
        assert golden_results[i] == vllm_output[i][1]
        print(f"Generated text: {vllm_output[i][1]!r}")
@@ -2,7 +2,8 @@ from typing import TYPE_CHECKING, Any, Optional, cast

import torch
from compressed_tensors.quantization import (QuantizationArgs,
                                              QuantizationStrategy)
                                              QuantizationStrategy,
                                              QuantizationType)
from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.linear import (LinearBase,
@@ -23,7 +24,8 @@ from vllm_ascend.quantization.quant_config import (AscendFusedMoEMethod,
                                                    AscendQuantConfig)
from vllm_ascend.quantization.w4a16 import AscendW4A16FusedMoEMethod
from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod
from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod
from vllm_ascend.quantization.w8a8_dynamic import (
    AscendW8A8DynamicFusedMoEMethod, AscendW8A8DynamicLinearMethod)
from vllm_ascend.utils import COMPRESSED_TENSORS_METHOD

if TYPE_CHECKING:
@@ -75,6 +77,18 @@ class AscendCompressedTensorsConfig(QuantizationConfig):
    def get_config_filenames(cls) -> list[str]:
        return []

    def _add_fused_moe_to_target_scheme_map(self):
        """
        Helper function to update target_scheme_map:
        since linear layers get fused into FusedMoE,
        targeting 'Linear' needs to also match
        FusedMoE modules.
        """
        if ("Linear" not in self.target_scheme_map
                or "FusedMoE" in self.target_scheme_map):
            return
        self.target_scheme_map["FusedMoE"] = self.target_scheme_map["Linear"]

    @classmethod
    def from_config(cls, config: dict[str,
                                      Any]) -> "AscendCompressedTensorsConfig":
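Purely as an illustrative sketch of what the helper above does (the map values here are hypothetical placeholders; the real entries hold `QuantizationArgs` objects):

```python
# Hypothetical map contents, for illustration only.
target_scheme_map = {
    "Linear": {"weights": "int8-per-channel", "input_activations": "int8-dynamic"},
}

# After _add_fused_moe_to_target_scheme_map(), FusedMoE modules match the
# same scheme that was declared for Linear layers.
target_scheme_map["FusedMoE"] = target_scheme_map["Linear"]
```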
@@ -155,20 +169,48 @@ class AscendCompressedTensorsConfig(QuantizationConfig):
                None, layer)
            return quant_method
        if isinstance(layer, FusedMoE):
            layer.ascend_quant_method = COMPRESSED_TENSORS_METHOD
            # collect schemes
            quant_scheme = self.get_scheme(layer=layer, layer_name=prefix)
            self._add_fused_moe_to_target_scheme_map()
            unfused_names = [
                prefix + proj_name for proj_name in
                [".0.gate_proj", ".0.up_proj", ".0.down_proj"]
            ]
            # TODO: refactor this to use expert_mapping and check all layer numbers
            all_scheme_dicts = [
                self.get_scheme_dict(layer, name) for name in unfused_names
            ]
            scheme_dict = all_scheme_dicts.pop()

            # choose quantization method
            quant_method = AscendUnquantizedFusedMoEMethod(layer.moe_config)
            if quant_scheme is not None:
            # multiple schemes found
            if not all(
                    [cur_dict == scheme_dict for cur_dict in all_scheme_dicts]):
                raise ValueError("All MoE projections need to have same "
                                 "quantization scheme but found multiple")

            if scheme_dict is None:
                return AscendUnquantizedFusedMoEMethod(layer.moe_config)

            weight_quant = scheme_dict.get("weights")
            input_quant = scheme_dict.get("input_activations")

            quant_scheme = None
            act_quant_format = is_activation_quantization_format(self.quant_format)
            if act_quant_format:
                if self._is_dynamic_token_w8a8(weight_quant, input_quant):
                    quant_scheme = AscendW8A8DynamicFusedMoEMethod()
            else:
                if self._is_w4a16(weight_quant, input_quant):
                    quant_scheme = AscendW4A16FusedMoEMethod()
            if quant_scheme is None:
                raise RuntimeError(
                    f"Unsupported FusedMoe scheme: {weight_quant}, {input_quant}"
                )
            layer.scheme = quant_scheme
            layer.ascend_quant_method = COMPRESSED_TENSORS_METHOD

            ascend_quant_config = AscendQuantConfig(self.quant_description
                                                    or {})
            quant_method = AscendFusedMoEMethod(
                ascend_quant_config, prefix,
                ascend_quant_config.packed_modules_mapping, layer)
            return quant_method
            return AscendFusedMoEMethod(ascend_quant_config, prefix,
                                        self.packed_modules_mapping, layer)
        return None

    def get_scheme(self,
@@ -188,26 +230,14 @@ class AscendCompressedTensorsConfig(QuantizationConfig):
        to select the CompressedTensorsScheme used for inference.
        """

        # Find the "target" in the compressed-tensors config
        # that our layer conforms to.
        if should_ignore_layer(layer_name,
                               ignore=self.ignore,
                               fused_mapping=self.packed_modules_mapping):
            return None

        # Will be empty for models with only sparsity
        weight_quant = input_quant = None
        if self.target_scheme_map:
            matched_target = find_matched_target(
                layer_name=layer_name,
                module=layer,
                targets=self.target_scheme_map.keys(),
                fused_mapping=self.packed_modules_mapping,
            )

            scheme_dict = self.target_scheme_map[matched_target]
        scheme_dict = self.get_scheme_dict(layer, layer_name)
        weight_quant = None
        input_quant = None
        format = None
        if scheme_dict:
            weight_quant = scheme_dict.get("weights")
            input_quant = scheme_dict.get("input_activations")
            format = scheme_dict.get("format")

        if weight_quant is None:
            logger.warning_once("Acceleration for non-quantized schemes is "
@@ -220,13 +250,54 @@ class AscendCompressedTensorsConfig(QuantizationConfig):
        scheme = self._get_scheme_from_parts(
            weight_quant=weight_quant,
            input_quant=input_quant,
            format=format,
        )
        return scheme

    def get_scheme_dict(
        self,
        layer: torch.nn.Module,
        layer_name: str | None = None
    ) -> dict[str, QuantizationArgs | str | None] | None:
        """
        Extract the QuantizationArgs for a given layer.

        Returns:
            dict with {
                "weights": QuantizationArgs,
                "input_activations": QuantizationArgs | None,
                "format": str | None
            } | None
        """
        if should_ignore_layer(layer_name,
                               ignore=self.ignore,
                               fused_mapping=self.packed_modules_mapping):
            return None

        if self.target_scheme_map:
            matched_target = find_matched_target(
                layer_name=layer_name,
                module=layer,
                targets=self.target_scheme_map.keys(),
                fused_mapping=self.packed_modules_mapping,
            )
            scheme_dict = self.target_scheme_map[matched_target]
            if scheme_dict.get("format") is None:
                scheme_dict["format"] = self.quant_format
            return scheme_dict

        return None

    def _get_scheme_from_parts(
            self, weight_quant: QuantizationArgs,
            input_quant: QuantizationArgs) -> "CompressedTensorsScheme":
        act_quant_format = is_activation_quantization_format(self.quant_format)
        self,
        weight_quant: QuantizationArgs,
        input_quant: QuantizationArgs,
        format: str | None = None,
    ) -> "CompressedTensorsScheme":
        # use the per-layer format if defined, otherwise, use global format
        format = format if format is not None else self.quant_format

        act_quant_format = is_activation_quantization_format(format)
        if act_quant_format and input_quant is not None:
            if self._is_static_tensor_w8a8(weight_quant, input_quant):
                return AscendW8A8LinearMethod()
@@ -234,10 +305,6 @@ class AscendCompressedTensorsConfig(QuantizationConfig):
            if self._is_dynamic_token_w8a8(weight_quant, input_quant):
                return AscendW8A8DynamicLinearMethod()

        if weight_quant is not None:
            if self._is_w4a16(weight_quant):
                return AscendW4A16FusedMoEMethod()

        raise NotImplementedError(
            "No compressed-tensors compatible scheme was found.")
@@ -269,9 +336,22 @@ class AscendCompressedTensorsConfig(QuantizationConfig):
        # Only symmetric weight quantization supported.
        return is_8_bits and is_token and is_symmetric and is_dynamic

    def _is_w4a16(self, weight_quant: QuantizationArgs) -> bool:
    def _is_w4a16(self, weight_quant: QuantizationArgs,
                  input_quant: QuantizationArgs) -> bool:
        # Confirm weights quantized.
        if weight_quant is None:
            return False

        # Confirm the weights are integer quantized.
        if weight_quant.type != QuantizationType.INT:
            return False

        input_quant_none = input_quant is None
        is_4_bits = weight_quant.num_bits == 4
        return is_4_bits
        is_group = (weight_quant.strategy == QuantizationStrategy.GROUP.value)
        is_static = not weight_quant.dynamic

        return input_quant_none and is_4_bits and is_group and is_static

    def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
        self.target_scheme_map = hf_to_vllm_mapper.apply_dict(