初始化项目,由ModelHub XC社区提供模型
Model: QuixiAI/Llama-3.2-1B Source: Original Platform
This commit is contained in:
171
quant.py
Normal file
171
quant.py
Normal file
@@ -0,0 +1,171 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Convert a local BF16 model into Marlin-supported quant formats via llm-compressor."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import gc
|
||||
import os
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from datasets import load_dataset
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
# Allow running against a local llm-compressor checkout without installing it.
# The checkout location defaults to the original developer path and can be
# overridden with the LLM_COMPRESSOR_SRC environment variable so the script is
# not tied to one machine's directory layout.
LLM_COMPRESSOR_SRC = os.environ.get(
    "LLM_COMPRESSOR_SRC", "/home/quixi/marlin-cdna/llm-compressor/src"
)
if os.path.isdir(LLM_COMPRESSOR_SRC):
    # Prepend so the checkout is searched before the other sys.path entries.
    sys.path.insert(0, LLM_COMPRESSOR_SRC)
|
||||
|
||||
from llmcompressor import oneshot # noqa: E402
|
||||
from llmcompressor.modifiers.awq import AWQModifier # noqa: E402
|
||||
from llmcompressor.modifiers.quantization import ( # noqa: E402
|
||||
GPTQModifier,
|
||||
QuantizationModifier,
|
||||
)
|
||||
|
||||
# Source checkpoint directory and the root directory that receives the
# quantized outputs.  Both default to the original developer paths and can be
# overridden through the environment (QUANT_MODEL_PATH / QUANT_OUTPUT_ROOT) so
# the script can run on machines with a different layout.
MODEL_PATH = os.environ.get("QUANT_MODEL_PATH", "/home/quixi/models/Llama-3.2-1B")
OUTPUT_ROOT = os.environ.get("QUANT_OUTPUT_ROOT", "/home/quixi/models")

# Calibration dataset settings for the recipes that run with calibration data.
CALIB_DATASET_ID = "HuggingFaceH4/ultrachat_200k"
CALIB_DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 128
MAX_SEQUENCE_LENGTH = 512
|
||||
|
||||
|
||||
def _load_tokenized_dataset(tokenizer):
    """Build the tokenized calibration dataset.

    Loads the first ``NUM_CALIBRATION_SAMPLES`` rows of the configured split,
    shuffles them with a fixed seed, renders each row's ``messages`` through
    the tokenizer's chat template, and tokenizes the rendered text.

    Args:
        tokenizer: Object providing ``apply_chat_template`` and a tokenizing
            ``__call__``.

    Returns:
        The mapped dataset, with the columns that existed before
        tokenization removed.
    """
    split_spec = f"{CALIB_DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]"
    raw = load_dataset(CALIB_DATASET_ID, split=split_spec)
    shuffled = raw.shuffle(seed=42)

    def _render_chat(row):
        # Produce plain text here; tokenization happens in the next pass.
        rendered = tokenizer.apply_chat_template(row["messages"], tokenize=False)
        return {"text": rendered}

    with_text = shuffled.map(_render_chat)

    def _encode(row):
        # Truncate to MAX_SEQUENCE_LENGTH without padding; special tokens are
        # not added at this step.
        return tokenizer(
            row["text"],
            padding=False,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
            add_special_tokens=False,
        )

    return with_text.map(_encode, remove_columns=with_text.column_names)
|
||||
|
||||
|
||||
def _load_model_and_tokenizer():
    """Load the model and tokenizer from ``MODEL_PATH``.

    The model is loaded first, then the tokenizer.  When CUDA is available
    the model is moved to the ``"cuda"`` device.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    causal_lm = AutoModelForCausalLM.from_pretrained(MODEL_PATH, dtype="auto")
    tok = AutoTokenizer.from_pretrained(MODEL_PATH)

    # Without CUDA the model stays wherever from_pretrained placed it.
    if not torch.cuda.is_available():
        return causal_lm, tok

    causal_lm.to("cuda")
    return causal_lm, tok
|
||||
|
||||
|
||||
def _cleanup(model, tokenizer):
|
||||
del model
|
||||
del tokenizer
|
||||
gc.collect()
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
|
||||
def _run_recipe(
    name: str,
    recipe,
    *,
    save_compressed: bool,
    use_calibration: bool,
) -> Optional[str]:
    """Quantize the base model with one recipe and save the result.

    Args:
        name: Label for this variant; printed in the banner and appended to
            the output directory name.
        recipe: Recipe object handed to ``oneshot``.
        save_compressed: When true, ``save_compressed=True`` is passed to
            ``model.save_pretrained``; otherwise that argument is omitted.
        use_calibration: When true, the tokenized calibration dataset is
            built and passed to ``oneshot`` along with the sequence-length
            and sample-count settings.

    Returns:
        The directory the model and tokenizer were saved to.
    """
    print(f"\n=== Quantizing {name} ===")
    model, tokenizer = _load_model_and_tokenizer()

    # Calibration arguments are supplied only for recipes that ask for them.
    calibration_kwargs = {}
    if use_calibration:
        calibration_kwargs = {
            "dataset": _load_tokenized_dataset(tokenizer),
            "max_seq_length": MAX_SEQUENCE_LENGTH,
            "num_calibration_samples": NUM_CALIBRATION_SAMPLES,
        }
    oneshot(model=model, recipe=recipe, **calibration_kwargs)

    # Output directory: <OUTPUT_ROOT>/<model directory name>-<name>.
    model_dir_name = os.path.basename(MODEL_PATH.rstrip("/"))
    save_dir = os.path.join(OUTPUT_ROOT, f"{model_dir_name}-{name}")
    os.makedirs(save_dir, exist_ok=True)

    # Pass save_compressed only when requested; otherwise leave it off so
    # save_pretrained is called with the directory alone.
    save_kwargs = {"save_compressed": True} if save_compressed else {}
    model.save_pretrained(save_dir, **save_kwargs)
    tokenizer.save_pretrained(save_dir)

    _cleanup(model, tokenizer)
    return save_dir
|
||||
|
||||
|
||||
def main():
    """Run every quantization recipe, one after another, on the base model."""
    # Each job: (output suffix, recipe factory, save_compressed, use_calibration).
    # Factories keep each modifier from being constructed until right before
    # its own run.
    jobs = (
        # GPTQ W4A16 (INT4 weight-only).
        (
            "W4A16-GPTQ",
            lambda: GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
            True,
            True,
        ),
        # AWQ W4A16 (INT4 weight-only).
        (
            "W4A16-AWQ",
            lambda: AWQModifier(
                targets=["Linear"],
                scheme="W4A16_ASYM",
                ignore=["lm_head"],
                duo_scaling="both",
            ),
            True,
            True,
        ),
        # GPTQ W8A16 (INT8 weight-only).
        (
            "W8A16-GPTQ",
            lambda: GPTQModifier(targets="Linear", scheme="W8A16", ignore=["lm_head"]),
            True,
            True,
        ),
        # FP8 dynamic (W8A8-FP8).
        (
            "FP8-Dynamic",
            lambda: QuantizationModifier(
                targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
            ),
            False,
            False,
        ),
        # NVFP4A16 (FP4 weights + FP16 activations).
        (
            "NVFP4A16",
            lambda: QuantizationModifier(
                targets="Linear", scheme="NVFP4A16", ignore=["lm_head"]
            ),
            True,
            False,
        ),
        # MXFP4 (FP4 weights).
        (
            "MXFP4",
            lambda: QuantizationModifier(
                targets="Linear", scheme="MXFP4", ignore=["lm_head"]
            ),
            True,
            False,
        ),
    )

    for suffix, build_recipe, compressed, calibrate in jobs:
        _run_recipe(
            suffix,
            build_recipe(),
            save_compressed=compressed,
            use_calibration=calibrate,
        )
|
||||
|
||||
|
||||
# Run the quantization pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user