[Quantization][Feature] Support compressed tensors moe w4a8 dynamic weight (#5889)

### What this PR does / why we need it?

When the LLM Compressor quantization tool from the vLLM community is used to
generate quantized weights, the vLLM Ascend engine needs to be adapted to
support the compressed-tensors quantization format.

1. Support W4A8 dynamic weights for MoE models (see the loading sketch below).
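
As a rough illustration (not code from this PR), the resulting compressed-tensors checkpoint is expected to load on the engine like any other quantized model. The model path below matches the save directory produced by the example script in this PR, and the explicit `quantization` argument is shown only for clarity; vLLM normally detects the format from the checkpoint's `config.json`:

```python
from vllm import LLM, SamplingParams

# Load the W4A8 compressed-tensors checkpoint produced by the script below.
# "Qwen3-30B-A3B-Instruct-2507-W4A8" is the assumed output directory name.
llm = LLM(
    model="Qwen3-30B-A3B-Instruct-2507-W4A8",
    quantization="compressed-tensors",  # normally auto-detected from the checkpoint
)

outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```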

- vLLM version: v0.13.0
- vLLM main: bde38c11df

---------

Signed-off-by: LHXuuu <scut_xlh@163.com>
Signed-off-by: menogrey <1299267905@qq.com>
Co-authored-by: menogrey <1299267905@qq.com>
from llmcompressor import oneshot
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "Qwen/Qwen3-30B-A3B-Instruct-2507"

# Load model and tokenizer.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
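
# Recipe: group_0 keeps the attention projections at W8A8 (per-channel weights,
# dynamic per-token activations); group_1 quantizes the MLP/expert projections
# to W4A8 with group size 128. lm_head and the MoE router gate are skipped.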
recipe = """
quant_stage:
    quant_modifiers:
        QuantizationModifier:
            ignore: ["lm_head", "re:.*mlp.gate$"]
            config_groups:
                group_0:
                    weights:
                        num_bits: 8
                        type: int
                        strategy: channel
                        dynamic: false
                        symmetric: true
                    input_activations:
                        num_bits: 8
                        type: int
                        strategy: token
                        dynamic: true
                        symmetric: true
                    targets: ["re:.*self_attn.k_proj.*", "re:.*self_attn.o_proj.*",
                              "re:.*self_attn.q_proj.*", "re:.*self_attn.v_proj.*"]
                group_1:
                    weights:
                        num_bits: 4
                        type: int
                        strategy: group
                        group_size: 128
                        dynamic: false
                        symmetric: true
                    input_activations:
                        num_bits: 8
                        type: int
                        strategy: token
                        dynamic: true
                        symmetric: true
                    targets: ["re:.*down_proj.*", "re:.*gate_proj.*", "re:.*up_proj.*"]
"""
# Apply quantization.
oneshot(
model=model,
recipe=recipe,
trust_remote_code_model=True,
)
# Save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A8"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
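
As a quick sanity check (a sketch, not part of the PR), the saved checkpoint should advertise the compressed-tensors format that this PR teaches vLLM Ascend to read:

```python
import json
import os

# Inspect the quantization metadata written by save_pretrained(save_compressed=True).
with open(os.path.join(SAVE_DIR, "config.json")) as f:
    cfg = json.load(f)

# compressed-tensors checkpoints record their format under quantization_config.
print(cfg["quantization_config"]["quant_method"])  # expected: "compressed-tensors"
```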