160 lines
4.9 KiB
Python
160 lines
4.9 KiB
Python
|
|
import os
|
||
|
|
import torch
|
||
|
|
|
||
|
|
from datasets import load_dataset
|
||
|
|
from transformers import AutoModelForCausalLM, Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration, \
|
||
|
|
AutoTokenizer, AutoProcessor, AutoConfig, AutoImageProcessor
|
||
|
|
|
||
|
|
from llmcompressor import oneshot
|
||
|
|
from llmcompressor.modifiers.awq import AWQModifier
|
||
|
|
from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationModifier
|
||
|
|
from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme, QuantizationType, QuantizationStrategy
|
||
|
|
|
||
|
|
# Settings shared by the weight and the activation quantizers of the scheme
# below: 8-bit INT, symmetric, non-dynamic (``dynamic=False``).
_INT8_STATIC_SYMMETRIC = {
    "num_bits": 8,
    "type": QuantizationType.INT,
    "symmetric": True,
    "dynamic": False,
}

# W8A8 scheme applied to every ``Linear`` target:
#   * weights           -> CHANNEL strategy
#   * input activations -> TENSOR strategy
W8A8_W_cha_A_ten_static_symmetric = {
    "group_0": QuantizationScheme(
        targets=["Linear"],
        weights=QuantizationArgs(
            strategy=QuantizationStrategy.CHANNEL,
            **_INT8_STATIC_SYMMETRIC,
        ),
        input_activations=QuantizationArgs(
            strategy=QuantizationStrategy.TENSOR,
            **_INT8_STATIC_SYMMETRIC,
        ),
    ),
}
|
||
|
|
|
||
|
|
# Supported quantization modifiers, looked up by the 'modifier' entry of the
# run configuration.
MODIFIER_DICT = {
    "PTQ": QuantizationModifier,
    "AWQ": AWQModifier,
    "GPTQ": GPTQModifier,
}

# Supported quantization schemes, looked up by the 'schemes' entry of the run
# configuration.
SCHEMES_DICT = {
    "W8A8_W_cha_A_ten_static_symmetric": W8A8_W_cha_A_ten_static_symmetric,
}

# Model class used to load the weights, looked up by ``config.model_type``.
MODEL_DICT = {
    "qwen3": AutoModelForCausalLM,
}

# Tokenizer class used for each supported ``config.model_type``.
TOKENIZER_DICT = {
    "qwen3": AutoTokenizer,
}
|
||
|
|
|
||
|
|
|
||
|
|
def load_environment_variables():
    """Build the run configuration, honouring environment-variable overrides.

    Every setting is read from the environment variable named after its key
    in upper case (``MODEL_PATH``, ``EXPORT_PATH``, ``MODIFIER``, ``SCHEMES``,
    ``CALIB_PROMPT_PATH``). When a variable is not set, the built-in default
    listed below is used, so an unconfigured environment yields the same
    values as before.

    If the export path is empty, it is derived as
    ``<model_path>-<modifier>[-<schemes>]``. The export directory is created
    when it does not exist yet.

    Returns:
        dict: the keys 'model_path', 'export_path', 'modifier', 'schemes'
        and 'calib_prompt_path', each mapped to a string.
    """
    defaults = {
        'model_path': "Qwen/Qwen3-32B",
        'export_path': "/llm-compressor/export/GPTQ/W8A8_W_cha_A_ten_static_symmetric",
        'modifier': "GPTQ",
        'schemes': "W8A8_W_cha_A_ten_static_symmetric",
        'calib_prompt_path': "HuggingFaceH4/ultrachat_200k",
    }
    env_vars = {
        key: os.environ.get(key.upper(), default)
        for key, default in defaults.items()
    }

    # verify export model path: an empty value means "derive it from the
    # model path" (os.makedirs("") would fail otherwise)
    if not env_vars['export_path']:
        derived = env_vars['model_path'].rstrip("/") + "-" + env_vars['modifier']
        if env_vars['schemes']:
            derived += "-" + env_vars['schemes']
        env_vars['export_path'] = derived
    os.makedirs(env_vars['export_path'], exist_ok=True)

    return env_vars
|
||
|
|
|
||
|
|
|
||
|
|
def load_calibration_text_dataset(calib_prompt_path, tokenizer):
    """Load and tokenize the calibration dataset stored in a directory.

    The directory is listed once (each entry is printed). If it holds any
    ``.jsonl`` file, the 'validation' split is loaded with the 'json'
    builder. Otherwise, if it holds any ``.parquet`` file, the first 512
    rows of the 'train' split are loaded with the 'parquet' builder.

    Args:
        calib_prompt_path: path of a local directory containing the
            calibration files (it is passed to ``os.listdir``).
        tokenizer: tokenizer used both to render the chat template and to
            tokenize the resulting text.

    Returns:
        The mapped dataset; the columns present before tokenization are
        removed, so only the tokenizer outputs remain.

    Raises:
        ValueError: if the directory holds neither ``.jsonl`` nor
            ``.parquet`` files.
    """
    # Load dataset -- list the directory a single time and reuse the result
    file_names = os.listdir(calib_prompt_path)
    for file_name in file_names:
        print(file_name)

    lowered_names = [file_name.lower() for file_name in file_names]
    if any(name.endswith('.jsonl') for name in lowered_names):
        ds = load_dataset('json', data_dir=calib_prompt_path, split='validation')
    elif any(name.endswith('.parquet') for name in lowered_names):
        ds = load_dataset("parquet", data_dir=calib_prompt_path, split="train[:512]")
    else:
        # Report the extensions that were actually found instead of a
        # fragment of the directory path.
        found = sorted({os.path.splitext(name)[1] or "<no extension>"
                        for name in lowered_names})
        raise ValueError(
            "Unsupported calibration file format: {} "
            "(expected .jsonl or .parquet files in {})".format(
                ", ".join(found) if found else "<empty directory>",
                calib_prompt_path))

    # Preprocess dataset
    def preprocess(example):
        if tokenizer.chat_template is not None:
            return {"text": tokenizer.apply_chat_template(
                example["messages"], tokenize=False)}
        # NOTE(review): without a chat template the raw "messages" value is
        # forwarded unchanged -- confirm it is already plain text in that case.
        return {"text": example["messages"]}

    # Tokenize inputs
    def tokenize(sample):
        return tokenizer(
            sample["text"],
            add_special_tokens=False,
        )

    ds = ds.map(preprocess)
    ds = ds.map(tokenize, remove_columns=ds.column_names)
    return ds
|
||
|
|
|
||
|
|
|
||
|
|
# Define a oneshot data collator for multimodal inputs.
|
||
|
|
def data_collator(batch):
    """Turn a batch holding exactly one sample into a dict of tensors.

    Args:
        batch: a sequence with a single mapping of field name -> values.

    Returns:
        dict: one ``torch.Tensor`` per field; the "pixel_values" field is
        created as ``torch.bfloat16``, every other field as ``torch.long``.

    Raises:
        AssertionError: if ``batch`` does not contain exactly one sample.
    """
    assert len(batch) == 1
    sample = batch[0]
    tensors = {}
    for name, data in sample.items():
        target_dtype = torch.long
        if name == "pixel_values":
            target_dtype = torch.bfloat16
        tensors[name] = torch.tensor(data, dtype=target_dtype)
    return tensors
|
||
|
|
|
||
|
|
|
||
|
|
def quantize_model(model, env_vars, dataset_dict=None):
    """Run a llmcompressor ``oneshot`` pass over ``model``.

    Args:
        model: the loaded model handed to ``oneshot``.
        env_vars: run configuration; 'modifier' selects the class from
            ``MODIFIER_DICT`` and 'schemes' the entry from ``SCHEMES_DICT``.
        dataset_dict: calibration dataset forwarded to ``oneshot`` as
            ``dataset`` (defaults to ``None``).

    Returns:
        None. The return value of ``oneshot`` is discarded.

    Raises:
        KeyError: if 'modifier' or 'schemes' names an unregistered entry.
    """
    # Modules excluded from quantization: ``lm_head`` plus everything matched
    # by the ``re:``-prefixed pattern for the MLP down projections.
    skipped_modules = ["lm_head", "re:.*mlp.down_proj"]

    # Resolve the modifier class and the scheme from the registries, then
    # assemble the single-entry llmcompressor recipe.
    modifier_cls = MODIFIER_DICT[env_vars['modifier']]
    scheme_groups = SCHEMES_DICT[env_vars['schemes']]
    recipe = [modifier_cls(config_groups=scheme_groups, ignore=skipped_modules)]

    # quantize the model
    oneshot(
        model=model,
        dataset=dataset_dict,
        recipe=recipe,
        trust_remote_code_model=True,
    )
|
||
|
|
|
||
|
|
|
||
|
|
def save_quantized_model(model, tokenizer, save_path, save_compressed=False):
    """Write the model and then its tokenizer to ``save_path``.

    Args:
        model: object exposing ``save_pretrained(path, save_compressed=...)``.
        tokenizer: object exposing ``save_pretrained(path)``.
        save_path: destination handed to both ``save_pretrained`` calls.
        save_compressed: forwarded to the model's ``save_pretrained``.
    """
    export_dir = save_path
    # The model goes first; the tokenizer files land in the same directory.
    model.save_pretrained(export_dir, save_compressed=save_compressed)
    tokenizer.save_pretrained(export_dir)
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """Quantize the configured model and export it in compressed form.

    Steps: build the run configuration, read the model type from the model's
    config, load the model and tokenizer registered for that type, build the
    calibration dataset, run quantization, and save the model and tokenizer
    to the export path with ``save_compressed=True``.

    Raises:
        ValueError: if the model type has no entry in ``MODEL_DICT`` or
            ``TOKENIZER_DICT``.
    """
    # get environment variables
    env_vars = load_environment_variables()

    # support model type list
    config = AutoConfig.from_pretrained(env_vars['model_path'], trust_remote_code=True)
    model_type = config.model_type
    # Fail with a descriptive message before any weights are loaded.
    if model_type not in MODEL_DICT or model_type not in TOKENIZER_DICT:
        raise ValueError("Unsupported model type: {!r} (supported: {})".format(
            model_type, ", ".join(sorted(MODEL_DICT))))

    model = MODEL_DICT[model_type].from_pretrained(
        env_vars['model_path'], torch_dtype="auto", trust_remote_code=True
    )
    tokenizer = TOKENIZER_DICT[model_type].from_pretrained(
        env_vars['model_path'], trust_remote_code=True
    )

    ds = load_calibration_text_dataset(env_vars["calib_prompt_path"], tokenizer)

    # Quantize the model
    quantize_model(model, env_vars, ds)

    # save the quantized model
    save_quantized_model(model, tokenizer, env_vars['export_path'], True)


if __name__ == '__main__':
    main()
|