[Quantization] Support compressed tensors w8a8 static and w8a8 dynamic weight (#4036)

### What this PR does / why we need it?

While using the LLM Compressor quantization tool from the VLLM community
to generate quantized weights, the VLLM Ascend engine needs to be
adapted to support the compressed tensors quantization format.

1. Add AscendCompressedTensorsConfig to replace CompressedTensorsConfig
in vllm.
2. Support CompressedTensorsW8A8 static weight.
- weight: per-channel, int8, symmetric; activation: per-tensor, int8,
symmetric.
3. Support CompressedTensorsW8A8Dynamic weight.
- weight: per-channel, int8, symmetric; activation: per-token, int8,
symmetric, dynamic.
4. Modify the override_quantization_method in AscendQuantConfig.

Co-authored-by: taoqun110 <taoqun@huawei.com>
Co-authored-by: chenxi-hh <chen464822955@163.com>

- vLLM version: v0.11.2

---------

Signed-off-by: LHXuuu <scut_xlh@163.com>
Signed-off-by: chenxi-hh <chen464822955@163.com>
Signed-off-by: chenxi-hh <32731611+chenxi-hh@users.noreply.github.com>
Co-authored-by: chenxi-hh <chen464822955@163.com>
Co-authored-by: chenxi-hh <32731611+chenxi-hh@users.noreply.github.com>
This commit is contained in:
LHXuuu
2025-11-28 14:09:39 +08:00
committed by GitHub
parent ab37a7d5ae
commit bdc66972db
18 changed files with 707 additions and 32 deletions

View File

@@ -0,0 +1,160 @@
import os
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration, \
AutoTokenizer, AutoProcessor, AutoConfig, AutoImageProcessor
from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationModifier
from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme, QuantizationType, QuantizationStrategy
# Compressed-tensors scheme for W8A8 static quantization:
# weights are per-channel int8 symmetric, activations per-tensor int8
# symmetric (both static, i.e. calibrated offline), applied to all Linear
# layers not excluded via an ignore list.
W8A8_W_cha_A_ten_static_symmetric = {
    "group_0": QuantizationScheme(
        targets=["Linear"],
        weights=QuantizationArgs(
            num_bits=8,
            type=QuantizationType.INT,
            strategy=QuantizationStrategy.CHANNEL,
            symmetric=True,
            dynamic=False
        ),
        input_activations=QuantizationArgs(
            num_bits=8,
            type=QuantizationType.INT,
            strategy=QuantizationStrategy.TENSOR,
            symmetric=True,
            dynamic=False
        ),
    ),
}
# supported modifiers (llmcompressor algorithm used to produce the weights)
MODIFIER_DICT = {
    "PTQ": QuantizationModifier,
    "AWQ": AWQModifier,
    "GPTQ": GPTQModifier,
}
# supported schemes, keyed by the name used in the run configuration
SCHEMES_DICT = {
    "W8A8_W_cha_A_ten_static_symmetric": W8A8_W_cha_A_ten_static_symmetric,
}
# model_type -> HF model class used to load the checkpoint
MODEL_DICT = {
    "qwen3": AutoModelForCausalLM,
}
# model_type -> HF tokenizer class used to load the tokenizer
TOKENIZER_DICT = {
    "qwen3": AutoTokenizer,
}
def load_environment_variables():
    """Build the run configuration and ensure the export directory exists.

    Each setting can be overridden through an environment variable
    (MODEL_PATH, EXPORT_PATH, MODIFIER, SCHEMES, CALIB_PROMPT_PATH) and
    falls back to the hard-coded default below when unset, so the default
    behavior matches the original script.

    Returns:
        dict with keys 'model_path', 'export_path', 'modifier', 'schemes'
        and 'calib_prompt_path'.
    """
    env_vars = {
        'model_path': os.environ.get('MODEL_PATH', "Qwen/Qwen3-32B"),
        'export_path': os.environ.get(
            'EXPORT_PATH',
            "/llm-compressor/export/GPTQ/W8A8_W_cha_A_ten_static_symmetric"),
        'modifier': os.environ.get('MODIFIER', "GPTQ"),
        'schemes': os.environ.get(
            'SCHEMES', "W8A8_W_cha_A_ten_static_symmetric"),
        'calib_prompt_path': os.environ.get(
            'CALIB_PROMPT_PATH', "HuggingFaceH4/ultrachat_200k"),
    }
    # Fallback: when no export path is configured, derive one from the
    # model path plus the modifier/scheme names. The scheme suffix belongs
    # INSIDE this branch — previously it was appended unconditionally,
    # duplicating the scheme name in the default export path.
    if env_vars['export_path'] is None:
        env_vars['export_path'] = (
            env_vars['model_path'].rstrip("/") + "-" + env_vars['modifier'])
        if env_vars['schemes'] is not None:
            env_vars['export_path'] += "-" + env_vars['schemes']
    os.makedirs(env_vars['export_path'], exist_ok=True)
    return env_vars
def load_calibration_text_dataset(calib_prompt_path, tokenizer):
    """Load and tokenize a calibration dataset from a local directory.

    Args:
        calib_prompt_path: directory holding ``.jsonl`` or ``.parquet``
            calibration files (jsonl takes precedence).
        tokenizer: HF tokenizer used for chat templating and tokenization.

    Returns:
        the tokenized dataset, with only the tokenizer output columns kept.

    Raises:
        ValueError: when the directory contains neither jsonl nor parquet
            files.
    """
    # Scan the directory once (was: three separate os.listdir calls plus a
    # leftover debug print loop).
    file_names = [name.lower() for name in os.listdir(calib_prompt_path)]
    if any(name.endswith('.jsonl') for name in file_names):
        ds = load_dataset('json', data_dir=calib_prompt_path, split='validation')
    elif any(name.endswith('.parquet') for name in file_names):
        ds = load_dataset("parquet", data_dir=calib_prompt_path, split="train[:512]")
    else:
        # Report the directory itself; the old message split the directory
        # path on '.' and printed a meaningless fragment.
        raise ValueError(
            "Unsupported calibration file format in {}: expected .jsonl or "
            ".parquet files".format(calib_prompt_path))
    # Render each chat-format example into plain text, using the chat
    # template when the tokenizer provides one.
    def preprocess(example):
        if tokenizer.chat_template is not None:
            return {"text": tokenizer.apply_chat_template(
                example["messages"], tokenize=False)}
        else:
            return {"text": example["messages"]}
    # Tokenize the rendered text; the chat template already inserted any
    # special tokens, so do not add them again.
    def tokenize(sample):
        return tokenizer(
            sample["text"],
            add_special_tokens=False,
        )
    ds = ds.map(preprocess)
    ds = ds.map(tokenize, remove_columns=ds.column_names)
    return ds
# Oneshot data collator for multimodal inputs.
def data_collator(batch):
    """Collate a single-sample batch into tensors for calibration.

    ``pixel_values`` are cast to bfloat16; every other field becomes a
    long tensor. Exactly one sample per batch is supported.
    """
    assert len(batch) == 1
    sample = batch[0]
    collated = {}
    for key, value in sample.items():
        dtype = torch.bfloat16 if key == "pixel_values" else torch.long
        collated[key] = torch.tensor(value, dtype=dtype)
    return collated
def quantize_model(model, env_vars, dataset_dict=None):
    """Run oneshot quantization on ``model`` with the configured recipe.

    Args:
        model: the loaded HF model, quantized in place.
        env_vars: run configuration; 'modifier' selects the llmcompressor
            modifier class and 'schemes' the quantization scheme group.
        dataset_dict: optional calibration dataset forwarded to ``oneshot``.
    """
    # MoE gate layers are sensitive to quantization, so they stay on the
    # ignore list and remain at full precision.
    skipped_layers = ["lm_head", "re:.*mlp.down_proj"]
    # Assemble the llmcompressor recipe from the configured modifier/scheme.
    modifier_cls = MODIFIER_DICT[env_vars['modifier']]
    scheme_group = SCHEMES_DICT[env_vars['schemes']]
    recipe = [modifier_cls(config_groups=scheme_group, ignore=skipped_layers)]
    # Quantize the model.
    oneshot(
        model=model,
        dataset=dataset_dict,
        recipe=recipe,
        trust_remote_code_model=True,
    )
def save_quantized_model(model, tokenizer, save_path, save_compressed=False):
    """Persist the quantized model and its tokenizer to ``save_path``.

    When ``save_compressed`` is True the weights are written in the
    compressed-tensors on-disk format.
    """
    model.save_pretrained(save_path, save_compressed=save_compressed)
    tokenizer.save_pretrained(save_path)
if __name__ == '__main__':
    # Resolve the run configuration (paths, modifier, scheme).
    config_vars = load_environment_variables()
    # Choose model/tokenizer classes from the checkpoint's model_type.
    hf_config = AutoConfig.from_pretrained(config_vars['model_path'], trust_remote_code=True)
    arch = hf_config.model_type
    model = MODEL_DICT[arch].from_pretrained(
        config_vars['model_path'], torch_dtype="auto", trust_remote_code=True
    )
    tokenizer = TOKENIZER_DICT[arch].from_pretrained(config_vars['model_path'], trust_remote_code=True)
    # Build the tokenized calibration dataset.
    calib_ds = load_calibration_text_dataset(config_vars["calib_prompt_path"], tokenizer)
    # Quantize the model, then export it in compressed form.
    quantize_model(model, config_vars, calib_ds)
    save_quantized_model(model, tokenizer, config_vars['export_path'], True)

View File

@@ -0,0 +1,83 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.utils import dispatch_for_generation
# Select model and load it.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
# Select number of samples. 512 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048
# Load dataset and preprocess. Only the first NUM_CALIBRATION_SAMPLES
# rows of the split are fetched.
ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)  # fixed seed keeps the calibration subset reproducible
def preprocess(example):
    """Render one chat-format example into a single "text" field."""
    rendered = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    return {"text": rendered}
ds = ds.map(preprocess)
# Tokenize inputs.
def tokenize(sample):
    # The chat template already inserted special tokens, so skip adding
    # them again; truncate to the calibration sequence length.
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )
# Keep only the tokenizer output columns for calibration.
ds = ds.map(tokenize, remove_columns=ds.column_names)
# Configure algorithms. In this case, we:
# * apply SmoothQuant to make the activations easier to quantize
# * quantize the weights to int8 with GPTQ (static per channel)
# * quantize the activations to int8 (dynamic per token)
recipe = [
    SmoothQuantModifier(smoothing_strength=0.8),
    GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
]
# Apply algorithms and save to output_dir
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)
# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
# NOTE(review): targets an Ascend NPU device ("npu") — assumes torch_npu
# is installed; confirm this is the intended runtime.
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("npu")
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")
# Save to disk compressed (compressed-tensors on-disk format).
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W8A8-Dynamic-Per-Token"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)