Llama-3.2-1B/quant.py

#!/usr/bin/env python3
"""Convert a local BF16 model into Marlin-supported quant formats via llm-compressor."""

from __future__ import annotations

import gc
import os
import sys
from typing import Optional

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

# Allow running against the local llm-compressor checkout without installing:
# when the checkout directory exists, put it at the front of sys.path so the
# `llmcompressor` imports below resolve to it first.
LLM_COMPRESSOR_SRC = "/home/quixi/marlin-cdna/llm-compressor/src"
if os.path.isdir(LLM_COMPRESSOR_SRC):
    sys.path.insert(0, LLM_COMPRESSOR_SRC)

from llmcompressor import oneshot  # noqa: E402
from llmcompressor.modifiers.awq import AWQModifier  # noqa: E402
from llmcompressor.modifiers.quantization import (  # noqa: E402
    GPTQModifier,
    QuantizationModifier,
)

# Local checkpoint that both the model and the tokenizer are loaded from.
MODEL_PATH = "/home/quixi/models/Llama-3.2-1B"
# Parent directory for outputs; each recipe saves to
# "<OUTPUT_ROOT>/<basename of MODEL_PATH>-<recipe name>".
OUTPUT_ROOT = "/home/quixi/models"

# Calibration data: dataset id and split handed to `load_dataset`.
CALIB_DATASET_ID = "HuggingFaceH4/ultrachat_200k"
CALIB_DATASET_SPLIT = "train_sft"
# Number of leading rows sliced from the split; also passed to `oneshot` as
# `num_calibration_samples`.
NUM_CALIBRATION_SAMPLES = 128
# Truncation length used when tokenizing; also passed to `oneshot` as
# `max_seq_length`.
MAX_SEQUENCE_LENGTH = 512


def _load_tokenized_dataset(tokenizer):
    """Build the tokenized calibration dataset.

    Loads the first ``NUM_CALIBRATION_SAMPLES`` rows of the configured split,
    shuffles them with a fixed seed, renders each row's ``messages`` column
    through the tokenizer's chat template, and tokenizes the rendered text.

    Args:
        tokenizer: Tokenizer used both for ``apply_chat_template`` and for
            tokenization.

    Returns:
        The mapped dataset holding only the tokenizer's output columns; every
        column present before tokenization is removed.
    """
    split_spec = f"{CALIB_DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]"
    raw_rows = load_dataset(CALIB_DATASET_ID, split=split_spec)
    # Fixed seed keeps the sample order reproducible between runs.
    shuffled_rows = raw_rows.shuffle(seed=42)

    def render_chat(row):
        # Produce the templated conversation as a plain string (no token ids).
        rendered = tokenizer.apply_chat_template(row["messages"], tokenize=False)
        return {"text": rendered}

    text_rows = shuffled_rows.map(render_chat)

    def encode(row):
        # Special tokens are not added here; the text was already rendered by
        # the chat template.
        return tokenizer(
            row["text"],
            truncation=True,
            max_length=MAX_SEQUENCE_LENGTH,
            padding=False,
            add_special_tokens=False,
        )

    stale_columns = text_rows.column_names
    return text_rows.map(encode, remove_columns=stale_columns)


def _load_model_and_tokenizer():
    """Load the model and tokenizer from ``MODEL_PATH``.

    The model is loaded with ``dtype="auto"`` and moved to ``"cuda"`` when
    CUDA is available; otherwise it is left where ``from_pretrained`` put it.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

    if not torch.cuda.is_available():
        return model, tokenizer

    model.to("cuda")
    return model, tokenizer


def _cleanup(model, tokenizer):
    del model
    del tokenizer
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def _run_recipe(
    name: str,
    recipe,
    *,
    save_compressed: bool,
    use_calibration: bool,
) -> Optional[str]:
    """Quantize a freshly loaded model with ``recipe`` and save the result.

    Args:
        name: Label printed in the progress banner and appended to the base
            model's directory name to form the output directory.
        recipe: Recipe object forwarded to ``oneshot`` as ``recipe``.
        save_compressed: When true, ``save_compressed=True`` is passed to
            ``model.save_pretrained``; otherwise it is called with the
            directory only.
        use_calibration: When true, the tokenized calibration dataset is
            built and passed to ``oneshot`` together with the sequence-length
            and sample-count settings.

    Returns:
        The directory the model and tokenizer were saved to.
    """
    print(f"\n=== Quantizing {name} ===")
    model, tokenizer = _load_model_and_tokenizer()

    oneshot_kwargs = {"model": model, "recipe": recipe}
    if use_calibration:
        oneshot_kwargs.update(
            dataset=_load_tokenized_dataset(tokenizer),
            max_seq_length=MAX_SEQUENCE_LENGTH,
            num_calibration_samples=NUM_CALIBRATION_SAMPLES,
        )

    oneshot(**oneshot_kwargs)

    base_name = os.path.basename(MODEL_PATH.rstrip("/"))
    save_dir = os.path.join(OUTPUT_ROOT, f"{base_name}-{name}")
    os.makedirs(save_dir, exist_ok=True)

    if save_compressed:
        model.save_pretrained(save_dir, save_compressed=True)
    else:
        model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

    # Release this frame's own references *before* collecting. A helper that
    # receives the model as an argument can only delete its parameter
    # bindings, so the weights would stay referenced from here (directly and
    # through `oneshot_kwargs`) and neither the collector nor
    # `empty_cache()` could hand their memory back until this function
    # returned.
    del oneshot_kwargs, model, tokenizer
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    return save_dir


def main():
    """Run every quantization recipe in sequence, saving one model per recipe."""
    # Each entry: (output name, recipe factory, save_compressed, use_calibration).
    # Factories defer building a modifier until right before its own run.
    jobs = (
        # GPTQ W4A16 (INT4 weight-only).
        (
            "W4A16-GPTQ",
            lambda: GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
            True,
            True,
        ),
        # AWQ W4A16 (INT4 weight-only).
        (
            "W4A16-AWQ",
            lambda: AWQModifier(
                targets=["Linear"],
                scheme="W4A16_ASYM",
                ignore=["lm_head"],
                duo_scaling="both",
            ),
            True,
            True,
        ),
        # GPTQ W8A16 (INT8 weight-only).
        (
            "W8A16-GPTQ",
            lambda: GPTQModifier(targets="Linear", scheme="W8A16", ignore=["lm_head"]),
            True,
            True,
        ),
        # FP8 dynamic (W8A8-FP8).
        (
            "FP8-Dynamic",
            lambda: QuantizationModifier(
                targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
            ),
            False,
            False,
        ),
        # NVFP4A16 (FP4 weights + FP16 activations).
        (
            "NVFP4A16",
            lambda: QuantizationModifier(
                targets="Linear", scheme="NVFP4A16", ignore=["lm_head"]
            ),
            True,
            False,
        ),
        # MXFP4 (FP4 weights).
        (
            "MXFP4",
            lambda: QuantizationModifier(
                targets="Linear", scheme="MXFP4", ignore=["lm_head"]
            ),
            True,
            False,
        ),
    )

    for label, build_recipe, compressed, calibrated in jobs:
        _run_recipe(
            label,
            build_recipe(),
            save_compressed=compressed,
            use_calibration=calibrated,
        )


# Run the conversions only when this file is executed as a script.
if __name__ == "__main__":
    main()