59 lines
1.9 KiB
Python
59 lines
1.9 KiB
Python
|
|
from llmcompressor import oneshot
|
||
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||
|
|
|
||
|
|
MODEL_ID = "Qwen/Qwen3-30B-A3B-Instruct-2507"
|
||
|
|
|
||
|
|
# Load model.
|
||
|
|
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
|
||
|
|
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
||
|
|
|
||
|
|
recipe = """
|
||
|
|
quant_stage:
|
||
|
|
quant_modifiers:
|
||
|
|
QuantizationModifier:
|
||
|
|
ignore: ["lm_head", "re:.*mlp.gate$"]
|
||
|
|
config_groups:
|
||
|
|
group_0:
|
||
|
|
weights:
|
||
|
|
num_bits: 8
|
||
|
|
type: int
|
||
|
|
strategy: channel
|
||
|
|
dynamic: false
|
||
|
|
symmetric: true
|
||
|
|
input_activations:
|
||
|
|
num_bits: 8
|
||
|
|
type: int
|
||
|
|
strategy: token
|
||
|
|
dynamic: true
|
||
|
|
symmetric: true
|
||
|
|
targets: ["re:.*self_attn.k_proj.*", "re:.*self_attn.o_proj.*",
|
||
|
|
"re:.*self_attn.q_proj.*", "re:.*self_attn.v_proj.*"]
|
||
|
|
group_1:
|
||
|
|
weights:
|
||
|
|
num_bits: 4
|
||
|
|
type: int
|
||
|
|
strategy: group
|
||
|
|
group_size: 128
|
||
|
|
dynamic: false
|
||
|
|
symmetric: true
|
||
|
|
input_activations:
|
||
|
|
num_bits: 8
|
||
|
|
type: int
|
||
|
|
strategy: token
|
||
|
|
dynamic: true
|
||
|
|
symmetric: true
|
||
|
|
targets: ["re:.*down_proj.*", "re:.*gate_proj.*", "re:.*up_proj.*"]
|
||
|
|
"""
|
||
|
|
|
||
|
|
# Apply quantization.
|
||
|
|
oneshot(
|
||
|
|
model=model,
|
||
|
|
recipe=recipe,
|
||
|
|
trust_remote_code_model=True,
|
||
|
|
)
|
||
|
|
|
||
|
|
# Save to disk in compressed-tensors format.
|
||
|
|
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A8"
|
||
|
|
model.save_pretrained(SAVE_DIR, save_compressed=True)
|
||
|
|
tokenizer.save_pretrained(SAVE_DIR)
|