### What this PR does / why we need it?

When using the LLM Compressor quantization tool from the vLLM community to generate quantized weights, the vLLM Ascend engine needs to be adapted to support the compressed-tensors quantization format. This PR:

1. Adds `AscendCompressedTensorsConfig` to replace `CompressedTensorsConfig` in vLLM.
2. Supports the `CompressedTensorsW8A8` static scheme.
   - weight: per-channel, int8, symmetric; activation: per-tensor, int8, symmetric.
3. Supports the `CompressedTensorsW8A8Dynamic` scheme.
   - weight: per-channel, int8, symmetric; activation: per-token, int8, symmetric, dynamic.
4. Modifies `override_quantization_method` in `AscendQuantConfig`.

- vLLM version: v0.11.2

---------

Signed-off-by: LHXuuu <scut_xlh@163.com>
Signed-off-by: chenxi-hh <chen464822955@163.com>
Signed-off-by: chenxi-hh <32731611+chenxi-hh@users.noreply.github.com>
Co-authored-by: taoqun110 <taoqun@huawei.com>
Co-authored-by: chenxi-hh <chen464822955@163.com>
Co-authored-by: chenxi-hh <32731611+chenxi-hh@users.noreply.github.com>
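For reference, a minimal sketch of what the two quantization schemes compute, in plain PyTorch (the helper names are illustrative, not the Ascend kernel implementations):

```python
import torch


def quant_weight_per_channel_sym(w: torch.Tensor):
    """Per-channel symmetric int8: one scale per output channel
    (row of the Linear weight), zero-point fixed at 0."""
    scale = w.abs().amax(dim=1, keepdim=True).clamp(min=1e-8) / 127
    q = torch.clamp(torch.round(w / scale), -128, 127).to(torch.int8)
    return q, scale  # dequantize with q.float() * scale


def quant_act_per_token_sym(x: torch.Tensor):
    """Dynamic per-token symmetric int8: one scale per token (row),
    computed at runtime from the live activation."""
    scale = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) / 127
    q = torch.clamp(torch.round(x / scale), -128, 127).to(torch.int8)
    return q, scale
```

The static W8A8 scheme instead applies a single per-tensor activation scale calibrated offline; the dynamic variant trades a small runtime cost for better accuracy on outlier-heavy activations.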
Example LLM Compressor quantization script (83 lines, 2.5 KiB, Python):

```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.utils import dispatch_for_generation

# Select model and load it.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Select number of samples. 512 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)


def preprocess(example):
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
    }


ds = ds.map(preprocess)


# Tokenize inputs.
def tokenize(sample):
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


ds = ds.map(tokenize, remove_columns=ds.column_names)

# Configure algorithms. In this case, we:
# * apply SmoothQuant to make the activations easier to quantize
# * quantize the weights to int8 with GPTQ (static per channel)
# * quantize the activations to int8 (dynamic per token)
recipe = [
    SmoothQuantModifier(smoothing_strength=0.8),
    GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
]

# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
# Move inputs to the Ascend NPU (requires torch_npu).
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("npu")
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")

# Save to disk compressed.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W8A8-Dynamic-Per-Token"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
```
|
tokenizer.save_pretrained(SAVE_DIR) |