Files
xc-llm-ascend/examples/quantization/llm-compressor/w8a8_int8_dynamic_moe.py
wangxiyuan e5c46bf169 [CI] Fix lint CI (#5880)
Quick fix for lint CI

- vLLM version: v0.13.0
- vLLM main:
bde38c11df

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2026-01-14 11:23:38 +08:00

27 lines
776 B
Python

import torch
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from transformers import AutoModelForCausalLM, AutoTokenizer
MODEL_ID = "Qwen/Qwen3-30B-A3B-Instruct-2507"

# Load the model in bfloat16; trust_remote_code is required for Qwen3-MoE
# custom modeling code.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    dtype=torch.bfloat16,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Quantization recipe: INT8 on all Linear layers, skipping the LM head and
# every MoE router gate (quantizing the gate would hurt expert routing).
recipe = QuantizationModifier(
    scheme="INT8",
    targets="Linear",
    ignore=["lm_head", "re:.*mlp.gate$"],
)

# Apply the recipe in one shot (data-free for this scheme).
oneshot(model=model, recipe=recipe, trust_remote_code_model=True)

# Save to disk in compressed-tensors format.
model_name = MODEL_ID.rstrip("/").split("/")[-1]
SAVE_DIR = f"{model_name}-INT8_W8A8"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)