From 45a573cff1e6ac1ffea571242eacce955bb6f087 Mon Sep 17 00:00:00 2001
From: LHXuuu
Date: Mon, 2 Feb 2026 16:39:32 +0800
Subject: [PATCH] [Quantization][Feature] Support compressed tensors moe w4a8 dynamic weight (#5889)

### What this PR does / why we need it?
When the LLM Compressor quantization tool from the vLLM community is used to generate quantized weights, the vLLM Ascend engine needs to be adapted to support the compressed-tensors quantization format.
1. Support W4A8 dynamic weights for MoE models.
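
For reference, a minimal offline-inference sketch of how such a checkpoint can be loaded (illustrative only, not part of this patch; the local model path and the engine arguments below are assumptions that mirror the new e2e test):

```python
from vllm import LLM, SamplingParams

# Hypothetical path to a checkpoint exported by the LLM Compressor recipe
# added in this PR; the e2e test uses the hosted
# "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w4a8" checkpoint instead.
llm = LLM(
    model="Qwen3-30B-A3B-Instruct-2507-W4A8",
    tensor_parallel_size=2,
    max_model_len=4096,
    gpu_memory_utilization=0.8,
)

# Greedy decoding of a single prompt, as in the e2e test.
outputs = llm.generate(
    ["The president of the United States is"],
    SamplingParams(temperature=0.0, max_tokens=5),
)
print(outputs[0].outputs[0].text)
```

vLLM normally picks the compressed-tensors scheme up from the checkpoint's quantization config, so no extra engine flag should be needed.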

- vLLM version: v0.13.0
- vLLM main: https://github.com/vllm-project/vllm/commit/bde38c11df0ea066a740efe9b77fff5418be45df

---------

Signed-off-by: LHXuuu
Signed-off-by: menogrey <1299267905@qq.com>
Co-authored-by: menogrey <1299267905@qq.com>
---
 .github/workflows/misc/model_list.json        |   1 +
 .../llm-compressor/w4a8_dynamic_moe.py        |  58 +++++++
 .../multicard/2-cards/test_quantization.py    |  21 +++
 .../quantization/compressed_tensors_config.py |  30 +++-
 vllm_ascend/quantization/methods/w4a8.py      | 144 +++++++++++++++++-
 5 files changed, 250 insertions(+), 4 deletions(-)
 create mode 100644 examples/quantization/llm-compressor/w4a8_dynamic_moe.py

diff --git a/.github/workflows/misc/model_list.json b/.github/workflows/misc/model_list.json
index 001c831d..ee709ac2 100644
--- a/.github/workflows/misc/model_list.json
+++ b/.github/workflows/misc/model_list.json
@@ -206,6 +206,7 @@
     "vllm-ascend/Qwen3-30B-A3B-W8A8",
     "vllm-ascend/Qwen3-30B-A3B-W8A8-Pruning",
     "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8",
+    "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w4a8",
     "vllm-ascend/Qwen3-32B-W4A4",
     "vllm-ascend/Qwen3-32B-W8A8",
     "vllm-ascend/Qwen3-8B",
diff --git a/examples/quantization/llm-compressor/w4a8_dynamic_moe.py b/examples/quantization/llm-compressor/w4a8_dynamic_moe.py
new file mode 100644
index 00000000..04ff8657
--- /dev/null
+++ b/examples/quantization/llm-compressor/w4a8_dynamic_moe.py
@@ -0,0 +1,58 @@
+from llmcompressor import oneshot
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+MODEL_ID = "Qwen/Qwen3-30B-A3B-Instruct-2507"
+
+# Load model.
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
+recipe = """
+quant_stage:
+  quant_modifiers:
+    QuantizationModifier:
+      ignore: ["lm_head", "re:.*mlp.gate$"]
+      config_groups:
+        group_0:
+          weights:
+            num_bits: 8
+            type: int
+            strategy: channel
+            dynamic: false
+            symmetric: true
+          input_activations:
+            num_bits: 8
+            type: int
+            strategy: token
+            dynamic: true
+            symmetric: true
+          targets: ["re:.*self_attn.k_proj.*", "re:.*self_attn.o_proj.*",
+                    "re:.*self_attn.q_proj.*", "re:.*self_attn.v_proj.*"]
+        group_1:
+          weights:
+            num_bits: 4
+            type: int
+            strategy: group
+            group_size: 128
+            dynamic: false
+            symmetric: true
+          input_activations:
+            num_bits: 8
+            type: int
+            strategy: token
+            dynamic: true
+            symmetric: true
+          targets: ["re:.*down_proj.*", "re:.*gate_proj.*", "re:.*up_proj.*"]
+"""
+
+# Apply quantization.
+oneshot(
+    model=model,
+    recipe=recipe,
+    trust_remote_code_model=True,
+)
+
+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A8"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
diff --git a/tests/e2e/multicard/2-cards/test_quantization.py b/tests/e2e/multicard/2-cards/test_quantization.py
index b356ba3b..da45628b 100644
--- a/tests/e2e/multicard/2-cards/test_quantization.py
+++ b/tests/e2e/multicard/2-cards/test_quantization.py
@@ -64,3 +64,24 @@ def test_qwen3_moe_w8a8_dynamic_llm_compressor():
     for i in range(len(vllm_output)):
         assert golden_results[i] == vllm_output[i][1]
         print(f"Generated text: {vllm_output[i][1]!r}")
+
+def test_qwen3_moe_w4a8_dynamic_llm_compressor():
+    example_prompts = [
+        "The president of the United States is",
+    ]
+    max_tokens = 5
+    with VllmRunner(
+            "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w4a8",
+            tensor_parallel_size=2,
+            max_model_len=4096,
+            gpu_memory_utilization=0.8,
+    ) as vllm_model:
+        vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
+
+    golden_results = [
+        'The president of the United States is the head of state and',
+    ]
+
+    for i in range(len(vllm_output)):
+        assert golden_results[i] == vllm_output[i][1]
+        print(f"Generated text: {vllm_output[i][1]!r}")
diff --git a/vllm_ascend/quantization/compressed_tensors_config.py b/vllm_ascend/quantization/compressed_tensors_config.py
index 30834e70..7896d1b4 100644
--- a/vllm_ascend/quantization/compressed_tensors_config.py
+++ b/vllm_ascend/quantization/compressed_tensors_config.py
@@ -187,8 +187,9 @@ class AscendCompressedTensorsConfig(QuantizationConfig):
                 AscendUnquantizedFusedMoEMethod
             layer.ascend_quant_method = COMPRESSED_TENSORS_METHOD
 
+            layer_name = prefix + ".0.gate_proj"
             # Get the scheme for this layer
-            moe_scheme = self._get_moe_scheme(layer=layer, layer_name=prefix)
+            moe_scheme = self._get_moe_scheme(layer=layer, layer_name=layer_name)
 
             # Return unquantized method if no scheme found
             if moe_scheme is None:
@@ -382,6 +383,9 @@ class AscendCompressedTensorsConfig(QuantizationConfig):
         if self._is_dynamic_token_w8a8(weight_quant, input_quant):
             return "W8A8_DYNAMIC"
 
+        if self._is_dynamic_token_w4a8(weight_quant, input_quant):
+            return "W4A8_DYNAMIC"
+
         if self._is_w4a16(weight_quant, input_quant):
             return "W4A16"
 
@@ -415,6 +419,30 @@ class AscendCompressedTensorsConfig(QuantizationConfig):
         # Only symmetric input quantization supported.
         # Only symmetric weight quantization supported.
         return is_8_bits and is_token and is_symmetric and is_dynamic
+
+    def _is_dynamic_token_w4a8(self, weight_quant: QuantizationArgs,
+                               input_quant: QuantizationArgs) -> bool:
+        is_4_bits = weight_quant.num_bits == 4
+        is_8_bits = input_quant.num_bits == 8
+        weight_strategy = (
+            weight_quant.strategy == QuantizationStrategy.CHANNEL.value) or (weight_quant.strategy == QuantizationStrategy.GROUP.value)
+        is_token = (weight_strategy and input_quant.strategy
+                    == QuantizationStrategy.TOKEN.value)
+        is_dynamic = not weight_quant.dynamic and input_quant.dynamic
+        is_symmetric = weight_quant.symmetric and input_quant.symmetric
+
+        # Adapt for AscendW4A8DynamicFusedMoEMethod
+        assert self.quant_description is not None, "quant_description should not be None"
+        if weight_strategy:
+            self.quant_description["group_size"] = weight_quant.group_size if weight_quant.group_size else 0
+
+        self.quant_description["version"] = "0"
+        self.quant_description["ascend_quant_method"] = COMPRESSED_TENSORS_METHOD
+        self.quant_description["weight_strategy"] = str(weight_quant.strategy)
+
+        # Only symmetric input quantization supported.
+        # Only symmetric weight quantization supported.
+        return is_4_bits and is_8_bits and is_token and is_symmetric and is_dynamic
 
     def _is_w4a16(self, weight_quant: "QuantizationArgs",
                   input_quant: Optional["QuantizationArgs"]) -> bool:
diff --git a/vllm_ascend/quantization/methods/w4a8.py b/vllm_ascend/quantization/methods/w4a8.py
index 4510b44a..a5fc3afa 100644
--- a/vllm_ascend/quantization/methods/w4a8.py
+++ b/vllm_ascend/quantization/methods/w4a8.py
@@ -27,7 +27,7 @@ from vllm.forward_context import get_forward_context
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.distributed.parallel_state import get_mc2_group
 from vllm_ascend.ops.fused_moe.experts_selector import select_experts
-from vllm_ascend.utils import maybe_trans_nz
+from vllm_ascend.utils import maybe_trans_nz, COMPRESSED_TENSORS_METHOD
 
 from .base import AscendLinearScheme, AscendMoEScheme, QuantType
 from .registry import register_scheme
@@ -217,6 +217,13 @@ class AscendW4A8DynamicFusedMoEMethod(AscendMoEScheme):
             "version", "0")
         # NOTE: new quantize weights: 2 int4 pack into int8
         self.new_quant_version = quant_version == "1.0.0"
+
+        self.quant_method = vllm_config.quant_config.quant_description.get(
+            "ascend_quant_method", "")
+        if self.quant_method == COMPRESSED_TENSORS_METHOD:
+            self.weight_strategy = vllm_config.quant_config.quant_description.get(
+                "weight_strategy", "group")
+
         self.tp_size = 1 if vllm_config.parallel_config.enable_expert_parallel else self.ep_group.world_size
         self.dynamic_eplb = get_ascend_config().eplb_config.dynamic_eplb
         if self.new_quant_version and self.tp_size > 16:
@@ -236,6 +243,35 @@ class AscendW4A8DynamicFusedMoEMethod(AscendMoEScheme):
     def get_weight(self, num_experts: int,
                    intermediate_size_per_partition: int, hidden_sizes: int,
                    params_dtype: torch.dtype) -> Dict[str, Any]:
+        if self.quant_method == COMPRESSED_TENSORS_METHOD:
+            return self.get_weight_compressed_tensors(
+                num_experts, intermediate_size_per_partition,
+                hidden_sizes, params_dtype)
+        else:
+            return self.get_weight_modelslim(
+                num_experts, intermediate_size_per_partition,
+                hidden_sizes, params_dtype)
+
+    def get_weight_compressed_tensors(self, num_experts: int,
+            intermediate_size_per_partition: int, hidden_sizes: int,
+            params_dtype: torch.dtype) -> Dict[str, Any]:
+
+        param_dict = {}
+        E = num_experts
+        H = hidden_sizes
+        IN = intermediate_size_per_partition
+        g = self.group_size
+
+        param_dict["w13_weight"] = torch.empty(E, 2 * IN, H,
+                                               dtype=torch.int8)
+        param_dict["w2_weight"] = torch.empty(E, H, IN,
+                                              dtype=torch.int8)
+        return param_dict
+
+
+    def get_weight_modelslim(self, num_experts: int,
+            intermediate_size_per_partition: int, hidden_sizes: int,
+            params_dtype: torch.dtype) -> Dict[str, Any]:
         param_dict = {}
         if self.new_quant_version:
             w13_output_size = intermediate_size_per_partition
@@ -258,6 +294,42 @@ class AscendW4A8DynamicFusedMoEMethod(AscendMoEScheme):
                                 intermediate_size_per_partition: int,
                                 hidden_sizes: int,
                                 params_dtype: torch.dtype) -> Dict[str, Any]:
+        if self.quant_method == COMPRESSED_TENSORS_METHOD:
+            return self.get_dynamic_quant_param_compressed_tensors(
+                num_experts, intermediate_size_per_partition,
+                hidden_sizes, params_dtype)
+        else:
+            return self.get_dynamic_quant_param_modelslim(
+                num_experts, intermediate_size_per_partition,
+                hidden_sizes, params_dtype)
+
+    def get_dynamic_quant_param_compressed_tensors(self, num_experts: int,
+            intermediate_size_per_partition: int,
+            hidden_sizes: int,
+            params_dtype: torch.dtype) -> Dict[str, Any]:
+        param_dict = {}
+
+        E = num_experts
+        H = hidden_sizes
+        IN = intermediate_size_per_partition
+        g = self.group_size
+
+        # Per-row scale columns
+        def _n_scale_cols(in_features: int) -> int:
+            return 1 if g <= 0 else (in_features // g)
+
+        param_dict["w13_weight_scale"] = torch.empty(
+            E, 2 * IN, _n_scale_cols(H), dtype=torch.bfloat16)
+
+        param_dict["w2_weight_scale"] = torch.empty(E, H, _n_scale_cols(IN),
+                                                    dtype=torch.bfloat16)
+
+        return param_dict
+
+    def get_dynamic_quant_param_modelslim(self, num_experts: int,
+            intermediate_size_per_partition: int,
+            hidden_sizes: int,
+            params_dtype: torch.dtype) -> Dict[str, Any]:
         param_dict = {}
         param_dict["w13_weight_scale"] = torch.empty(
             num_experts,
@@ -374,8 +446,10 @@ class AscendW4A8DynamicFusedMoEMethod(AscendMoEScheme):
             w2=[layer.w2_weight],
             w1_scale=[layer.w13_weight_scale],
             w2_scale=[layer.w2_weight_scale],
-            w1_scale_bias=layer.w13_scale_bias,
-            w2_scale_bias=layer.w2_scale_bias,
+            w1_scale_bias=layer.w13_scale_bias if hasattr(
+                layer, "w13_scale_bias") else None,
+            w2_scale_bias=layer.w2_scale_bias if hasattr(
+                layer, "w2_scale_bias") else None,
             topk_weights=topk_weights,
             topk_ids=topk_ids,
             use_int4_w4a8=True,
@@ -445,6 +519,70 @@ class AscendW4A8DynamicFusedMoEMethod(AscendMoEScheme):
                                       torch.quint4x2, -1, False)
 
     def process_weights_after_loading(self, layer):
+        if self.quant_method == COMPRESSED_TENSORS_METHOD:
+            self.process_weights_after_loading_compressed_tensors(layer)
+        else:
+            self.process_weights_after_loading_modelslim(layer)
+
+
+    def process_weights_after_loading_compressed_tensors(self, layer):
+        layer.w13_weight.data = layer.w13_weight.data.transpose(
+            1, 2).contiguous()
+        layer.w2_weight.data = layer.w2_weight.data.transpose(1,
+                                                              2).contiguous()
+
+        def process_scale_compressed_tensors(scale: torch.Tensor):
+            scale = scale.transpose(1, 2).to(torch.float32).contiguous()
+            scale_np = scale.cpu().numpy()
+            scale_np.dtype = np.uint32
+            scale_uint64_tensor = torch.from_numpy(scale_np.astype(
+                np.int64)).npu()
+            return scale_uint64_tensor
+
+        def update_bias_compressed_tensors(weight: torch.Tensor,
+                                           scale: torch.Tensor, strategy:str):
+            group_num, k, n = weight.shape
+            scale = scale.transpose(1, 2).contiguous()
+            scale = scale.reshape(group_num, -1, n)
+            group_num, quantgroup_num, n = scale.shape
+
+            bias = None
+            if strategy == "group":
+                tmp = weight.to(torch.float32).reshape([group_num, quantgroup_num, -1, n]) * \
+                    scale.reshape([group_num, quantgroup_num, 1, n])
+                tmp = tmp.reshape([group_num, k, n])
+                bias = 8 * tmp.sum(axis=1)
+            elif strategy == "channel":
+                bias = 8 * (weight.to(torch.float32) * scale).sum(axis=1)
+            else:
+                raise ValueError(f"Unsupported weight strategy: {strategy}")
+            return bias
+
+        w13_bias = update_bias_compressed_tensors(layer.w13_weight.data,
+                                                  layer.w13_weight_scale.data,
+                                                  self.weight_strategy)
+        w2_bias = update_bias_compressed_tensors(layer.w2_weight.data,
+                                                 layer.w2_weight_scale.data,
+                                                 self.weight_strategy)
+
+        layer.w13_weight_scale.data = process_scale_compressed_tensors(
+            layer.w13_weight_scale.data)
+        layer.w2_weight_scale.data = process_scale_compressed_tensors(
+            layer.w2_weight_scale.data)
+
+
+        w13_scale_bias = torch.nn.Parameter(w13_bias, requires_grad=False)
+        layer.register_parameter("w13_scale_bias", w13_scale_bias)
+        w2_scale_bias = torch.nn.Parameter(w2_bias, requires_grad=False)
+        layer.register_parameter("w2_scale_bias", w2_scale_bias)
+
+        # Accuracy problem in nz format
+        # layer.w13_weight.data = maybe_trans_nz(layer.w13_weight.data)
+        # layer.w2_weight.data = maybe_trans_nz(layer.w2_weight.data)
+        layer.w13_weight.data = self.pack_to_int32(layer.w13_weight.data)
+        layer.w2_weight.data = self.pack_to_int32(layer.w2_weight.data)
+
+    def process_weights_after_loading_modelslim(self, layer):
         layer.w13_weight.data = layer.w13_weight.data.transpose(
             1, 2).contiguous()
         layer.w2_weight.data = layer.w2_weight.data.transpose(1,