[Quantization][Feature] Support compressed tensors moe w4a8 dynamic weight (#5889)

### What this PR does / why we need it?

When quantized weights are produced with the LLM Compressor tool from the vLLM community, the vLLM Ascend engine needs to be adapted to support the compressed-tensors quantization format. This PR:

1. Supports W4A8 dynamic weights for MoE models (see the loading sketch below).
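For reference, a minimal offline-inference sketch for the new path. The model ID and parallel settings mirror the added e2e test; everything else is illustrative and not part of this change:

```python
from vllm import LLM, SamplingParams

# Illustrative only: load the W4A8 compressed-tensors MoE checkpoint with vLLM on Ascend.
llm = LLM(
    model="vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w4a8",
    tensor_parallel_size=2,
    max_model_len=4096,
    gpu_memory_utilization=0.8,
)
outputs = llm.generate(["The president of the United States is"],
                       SamplingParams(temperature=0.0, max_tokens=5))
print(outputs[0].outputs[0].text)
```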

- vLLM version: v0.13.0
- vLLM main: bde38c11df

---------

Signed-off-by: LHXuuu <scut_xlh@163.com>
Signed-off-by: menogrey <1299267905@qq.com>
Co-authored-by: menogrey <1299267905@qq.com>
LHXuuu
2026-02-02 16:39:32 +08:00
committed by GitHub
parent 082aa2e5b7
commit 45a573cff1
5 changed files with 250 additions and 4 deletions

View File

@@ -206,6 +206,7 @@
"vllm-ascend/Qwen3-30B-A3B-W8A8", "vllm-ascend/Qwen3-30B-A3B-W8A8",
"vllm-ascend/Qwen3-30B-A3B-W8A8-Pruning", "vllm-ascend/Qwen3-30B-A3B-W8A8-Pruning",
"vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8", "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8",
"vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w4a8",
"vllm-ascend/Qwen3-32B-W4A4", "vllm-ascend/Qwen3-32B-W4A4",
"vllm-ascend/Qwen3-32B-W8A8", "vllm-ascend/Qwen3-32B-W8A8",
"vllm-ascend/Qwen3-8B", "vllm-ascend/Qwen3-8B",

View File

@@ -0,0 +1,58 @@
from llmcompressor import oneshot
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "Qwen/Qwen3-30B-A3B-Instruct-2507"

# Load model.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

recipe = """
quant_stage:
    quant_modifiers:
        QuantizationModifier:
            ignore: ["lm_head", "re:.*mlp.gate$"]
            config_groups:
                group_0:
                    weights:
                        num_bits: 8
                        type: int
                        strategy: channel
                        dynamic: false
                        symmetric: true
                    input_activations:
                        num_bits: 8
                        type: int
                        strategy: token
                        dynamic: true
                        symmetric: true
                    targets: ["re:.*self_attn.k_proj.*", "re:.*self_attn.o_proj.*",
                              "re:.*self_attn.q_proj.*", "re:.*self_attn.v_proj.*"]
                group_1:
                    weights:
                        num_bits: 4
                        type: int
                        strategy: group
                        group_size: 128
                        dynamic: false
                        symmetric: true
                    input_activations:
                        num_bits: 8
                        type: int
                        strategy: token
                        dynamic: true
                        symmetric: true
                    targets: ["re:.*down_proj.*", "re:.*gate_proj.*", "re:.*up_proj.*"]
"""

# Apply quantization.
oneshot(
    model=model,
    recipe=recipe,
    trust_remote_code_model=True,
)

# Save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A8"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
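
# Optional sanity check (illustrative addition, not part of the original script):
# print the compressed-tensors quantization config written into config.json.
import json
import os

with open(os.path.join(SAVE_DIR, "config.json")) as f:
    print(json.dumps(json.load(f).get("quantization_config", {}), indent=2))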

View File

@@ -64,3 +64,24 @@ def test_qwen3_moe_w8a8_dynamic_llm_compressor():
    for i in range(len(vllm_output)):
        assert golden_results[i] == vllm_output[i][1]
        print(f"Generated text: {vllm_output[i][1]!r}")


def test_qwen3_moe_w4a8_dynamic_llm_compressor():
    example_prompts = [
        "The president of the United States is",
    ]
    max_tokens = 5

    with VllmRunner(
            "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w4a8",
            tensor_parallel_size=2,
            max_model_len=4096,
            gpu_memory_utilization=0.8,
    ) as vllm_model:
        vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)

    golden_results = [
        'The president of the United States is the head of state and',
    ]

    for i in range(len(vllm_output)):
        assert golden_results[i] == vllm_output[i][1]
        print(f"Generated text: {vllm_output[i][1]!r}")

View File

@@ -187,8 +187,9 @@ class AscendCompressedTensorsConfig(QuantizationConfig):
            AscendUnquantizedFusedMoEMethod
        layer.ascend_quant_method = COMPRESSED_TENSORS_METHOD
        layer_name = prefix + ".0.gate_proj"

        # Get the scheme for this layer
-        moe_scheme = self._get_moe_scheme(layer=layer, layer_name=prefix)
+        moe_scheme = self._get_moe_scheme(layer=layer, layer_name=layer_name)

        # Return unquantized method if no scheme found
        if moe_scheme is None:
@@ -382,6 +383,9 @@ class AscendCompressedTensorsConfig(QuantizationConfig):
        if self._is_dynamic_token_w8a8(weight_quant, input_quant):
            return "W8A8_DYNAMIC"
        if self._is_dynamic_token_w4a8(weight_quant, input_quant):
            return "W4A8_DYNAMIC"
        if self._is_w4a16(weight_quant, input_quant):
            return "W4A16"
@@ -415,6 +419,30 @@ class AscendCompressedTensorsConfig(QuantizationConfig):
        # Only symmetric input quantization supported.
        # Only symmetric weight quantization supported.
        return is_8_bits and is_token and is_symmetric and is_dynamic

    def _is_dynamic_token_w4a8(self, weight_quant: QuantizationArgs,
                               input_quant: QuantizationArgs) -> bool:
        is_4_bits = weight_quant.num_bits == 4
        is_8_bits = input_quant.num_bits == 8
        weight_strategy = (
            weight_quant.strategy == QuantizationStrategy.CHANNEL.value) or (
                weight_quant.strategy == QuantizationStrategy.GROUP.value)
        is_token = (weight_strategy and input_quant.strategy
                    == QuantizationStrategy.TOKEN.value)
        is_dynamic = not weight_quant.dynamic and input_quant.dynamic
        is_symmetric = weight_quant.symmetric and input_quant.symmetric

        # Adapt for AscendW4A8DynamicFusedMoEMethod
        assert self.quant_description is not None, \
            "quant_description should not be None"
        if weight_strategy:
            self.quant_description["group_size"] = \
                weight_quant.group_size if weight_quant.group_size else 0
            self.quant_description["version"] = "0"
            self.quant_description["ascend_quant_method"] = COMPRESSED_TENSORS_METHOD
            self.quant_description["weight_strategy"] = str(weight_quant.strategy)

        # Only symmetric input quantization supported.
        # Only symmetric weight quantization supported.
        return is_4_bits and is_8_bits and is_token and is_symmetric and is_dynamic
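
    # Illustrative note (added; not part of the original diff): with the
    # LLM Compressor recipe shown above, the MoE projections fall into
    # group_1, i.e. weight_quant has num_bits=4, strategy="group",
    # group_size=128, symmetric=True, dynamic=False and input_quant has
    # num_bits=8, strategy="token", dynamic=True, symmetric=True, so this
    # check returns True and the layer scheme resolves to "W4A8_DYNAMIC".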

    def _is_w4a16(self, weight_quant: "QuantizationArgs",
                  input_quant: Optional["QuantizationArgs"]) -> bool:

View File

@@ -27,7 +27,7 @@ from vllm.forward_context import get_forward_context
from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.distributed.parallel_state import get_mc2_group
from vllm_ascend.ops.fused_moe.experts_selector import select_experts
-from vllm_ascend.utils import maybe_trans_nz
+from vllm_ascend.utils import maybe_trans_nz, COMPRESSED_TENSORS_METHOD

from .base import AscendLinearScheme, AscendMoEScheme, QuantType
from .registry import register_scheme
@@ -217,6 +217,13 @@ class AscendW4A8DynamicFusedMoEMethod(AscendMoEScheme):
"version", "0") "version", "0")
# NOTE: new quantize weights: 2 int4 pack into int8 # NOTE: new quantize weights: 2 int4 pack into int8
self.new_quant_version = quant_version == "1.0.0" self.new_quant_version = quant_version == "1.0.0"
self.quant_method = vllm_config.quant_config.quant_description.get(
"ascend_quant_method", "")
if self.quant_method == COMPRESSED_TENSORS_METHOD:
self.weight_strategy = vllm_config.quant_config.quant_description.get(
"weight_strategy", "group")
self.tp_size = 1 if vllm_config.parallel_config.enable_expert_parallel else self.ep_group.world_size self.tp_size = 1 if vllm_config.parallel_config.enable_expert_parallel else self.ep_group.world_size
self.dynamic_eplb = get_ascend_config().eplb_config.dynamic_eplb self.dynamic_eplb = get_ascend_config().eplb_config.dynamic_eplb
if self.new_quant_version and self.tp_size > 16: if self.new_quant_version and self.tp_size > 16:
@@ -236,6 +243,35 @@ class AscendW4A8DynamicFusedMoEMethod(AscendMoEScheme):
    def get_weight(self, num_experts: int,
                   intermediate_size_per_partition: int, hidden_sizes: int,
                   params_dtype: torch.dtype) -> Dict[str, Any]:
        if self.quant_method == COMPRESSED_TENSORS_METHOD:
            return self.get_weight_compressed_tensors(
                num_experts, intermediate_size_per_partition,
                hidden_sizes, params_dtype)
        else:
            return self.get_weight_modelslim(
                num_experts, intermediate_size_per_partition,
                hidden_sizes, params_dtype)

    def get_weight_compressed_tensors(self, num_experts: int,
                                      intermediate_size_per_partition: int, hidden_sizes: int,
                                      params_dtype: torch.dtype) -> Dict[str, Any]:
        param_dict = {}
        E = num_experts
        H = hidden_sizes
        IN = intermediate_size_per_partition
        g = self.group_size
        param_dict["w13_weight"] = torch.empty(E, 2 * IN, H,
                                               dtype=torch.int8)
        param_dict["w2_weight"] = torch.empty(E, H, IN,
                                              dtype=torch.int8)
        return param_dict
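
    # Shape note (added for clarity; example values only): on the
    # compressed-tensors path the int4 weights are materialized as int8, one
    # value per element, e.g. with E=128 experts, H=2048 and IN=768 per rank,
    # w13_weight is [128, 1536, 2048] and w2_weight is [128, 2048, 768]; they
    # are packed to int32 later in
    # process_weights_after_loading_compressed_tensors.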

    def get_weight_modelslim(self, num_experts: int,
                             intermediate_size_per_partition: int, hidden_sizes: int,
                             params_dtype: torch.dtype) -> Dict[str, Any]:
        param_dict = {}
        if self.new_quant_version:
            w13_output_size = intermediate_size_per_partition
@@ -258,6 +294,42 @@ class AscendW4A8DynamicFusedMoEMethod(AscendMoEScheme):
                                intermediate_size_per_partition: int,
                                hidden_sizes: int,
                                params_dtype: torch.dtype) -> Dict[str, Any]:
        if self.quant_method == COMPRESSED_TENSORS_METHOD:
            return self.get_dynamic_quant_param_compressed_tensors(
                num_experts, intermediate_size_per_partition,
                hidden_sizes, params_dtype)
        else:
            return self.get_dynamic_quant_param_modelslim(
                num_experts, intermediate_size_per_partition,
                hidden_sizes, params_dtype)

    def get_dynamic_quant_param_compressed_tensors(self, num_experts: int,
                                                   intermediate_size_per_partition: int,
                                                   hidden_sizes: int,
                                                   params_dtype: torch.dtype) -> Dict[str, Any]:
        param_dict = {}
        E = num_experts
        H = hidden_sizes
        IN = intermediate_size_per_partition
        g = self.group_size

        # Per-row scale columns
        def _n_scale_cols(in_features: int) -> int:
            return 1 if g <= 0 else (in_features // g)

        param_dict["w13_weight_scale"] = torch.empty(
            E, 2 * IN, _n_scale_cols(H), dtype=torch.bfloat16)
        param_dict["w2_weight_scale"] = torch.empty(E, H, _n_scale_cols(IN),
                                                    dtype=torch.bfloat16)
        return param_dict
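
    # Shape note (added for clarity; example values only): with E=128, H=2048,
    # IN=768 and group_size g=128, w13_weight_scale is
    # [128, 1536, 2048 // 128] = [128, 1536, 16] and w2_weight_scale is
    # [128, 2048, 768 // 128] = [128, 2048, 6]; with a channel-wise scheme
    # (g <= 0) the last dimension collapses to 1.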

    def get_dynamic_quant_param_modelslim(self, num_experts: int,
                                          intermediate_size_per_partition: int,
                                          hidden_sizes: int,
                                          params_dtype: torch.dtype) -> Dict[str, Any]:
        param_dict = {}
        param_dict["w13_weight_scale"] = torch.empty(
            num_experts,
@@ -374,8 +446,10 @@ class AscendW4A8DynamicFusedMoEMethod(AscendMoEScheme):
            w2=[layer.w2_weight],
            w1_scale=[layer.w13_weight_scale],
            w2_scale=[layer.w2_weight_scale],
-            w1_scale_bias=layer.w13_scale_bias,
-            w2_scale_bias=layer.w2_scale_bias,
+            w1_scale_bias=layer.w13_scale_bias if hasattr(
+                layer, "w13_scale_bias") else None,
+            w2_scale_bias=layer.w2_scale_bias if hasattr(
+                layer, "w2_scale_bias") else None,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            use_int4_w4a8=True,
@@ -445,6 +519,70 @@ class AscendW4A8DynamicFusedMoEMethod(AscendMoEScheme):
                                        torch.quint4x2, -1, False)

    def process_weights_after_loading(self, layer):
        if self.quant_method == COMPRESSED_TENSORS_METHOD:
            self.process_weights_after_loading_compressed_tensors(layer)
        else:
            self.process_weights_after_loading_modelslim(layer)

    def process_weights_after_loading_compressed_tensors(self, layer):
        layer.w13_weight.data = layer.w13_weight.data.transpose(
            1, 2).contiguous()
        layer.w2_weight.data = layer.w2_weight.data.transpose(1,
                                                              2).contiguous()

        def process_scale_compressed_tensors(scale: torch.Tensor):
            scale = scale.transpose(1, 2).to(torch.float32).contiguous()
            scale_np = scale.cpu().numpy()
            scale_np.dtype = np.uint32
            scale_uint64_tensor = torch.from_numpy(scale_np.astype(
                np.int64)).npu()
            return scale_uint64_tensor
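
        # Note (added for clarity): the helper above reinterprets each float32
        # scale's bit pattern as uint32 and widens it to int64, so the raw
        # fp32 bits end up in the low 32 bits of each element; this presumably
        # matches the scale layout expected by the Ascend W4A8 fused-MoE
        # kernel.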

        def update_bias_compressed_tensors(weight: torch.Tensor,
                                           scale: torch.Tensor, strategy: str):
            group_num, k, n = weight.shape
            scale = scale.transpose(1, 2).contiguous()
            scale = scale.reshape(group_num, -1, n)
            group_num, quantgroup_num, n = scale.shape
            bias = None
            if strategy == "group":
                tmp = weight.to(torch.float32).reshape([group_num, quantgroup_num, -1, n]) * \
                    scale.reshape([group_num, quantgroup_num, 1, n])
                tmp = tmp.reshape([group_num, k, n])
                bias = 8 * tmp.sum(axis=1)
            elif strategy == "channel":
                bias = 8 * (weight.to(torch.float32) * scale).sum(axis=1)
            else:
                raise ValueError(f"Unsupported weight strategy: {strategy}")
            return bias
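
        # Note (added for clarity): both branches compute
        # bias[e, n] = 8 * sum_k(weight[e, k, n] * scale_for(e, k, n)),
        # one value per output channel of each expert; the factor of 8
        # presumably compensates for the signed int4 weight range [-8, 7]
        # in the fused int4-w4a8 kernel path.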

        w13_bias = update_bias_compressed_tensors(layer.w13_weight.data,
                                                  layer.w13_weight_scale.data,
                                                  self.weight_strategy)
        w2_bias = update_bias_compressed_tensors(layer.w2_weight.data,
                                                 layer.w2_weight_scale.data,
                                                 self.weight_strategy)
        layer.w13_weight_scale.data = process_scale_compressed_tensors(
            layer.w13_weight_scale.data)
        layer.w2_weight_scale.data = process_scale_compressed_tensors(
            layer.w2_weight_scale.data)

        w13_scale_bias = torch.nn.Parameter(w13_bias, requires_grad=False)
        layer.register_parameter("w13_scale_bias", w13_scale_bias)
        w2_scale_bias = torch.nn.Parameter(w2_bias, requires_grad=False)
        layer.register_parameter("w2_scale_bias", w2_scale_bias)

        # Accuracy problem in nz format
        # layer.w13_weight.data = maybe_trans_nz(layer.w13_weight.data)
        # layer.w2_weight.data = maybe_trans_nz(layer.w2_weight.data)
        layer.w13_weight.data = self.pack_to_int32(layer.w13_weight.data)
        layer.w2_weight.data = self.pack_to_int32(layer.w2_weight.data)

    def process_weights_after_loading_modelslim(self, layer):
        layer.w13_weight.data = layer.w13_weight.data.transpose(
            1, 2).contiguous()
        layer.w2_weight.data = layer.w2_weight.data.transpose(1,
                                                              2).contiguous()