This commit is contained in:
2025-10-09 16:47:16 +08:00
parent c8feb4deb5
commit e27e3f16bb
5248 changed files with 1778505 additions and 0 deletions

View File

@@ -0,0 +1,264 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import importlib
import tempfile
import unittest
from unittest import skip
import pytest
from packaging import version
from transformers import AqlmConfig, AutoConfig, AutoModelForCausalLM, AutoTokenizer, OPTForCausalLM, StaticCache
from transformers.testing_utils import (
backend_empty_cache,
require_accelerate,
require_aqlm,
require_torch_accelerator,
require_torch_multi_accelerator,
slow,
torch_device,
)
from transformers.utils import is_accelerate_available, is_aqlm_available, is_torch_available
if is_torch_available():
import torch
if is_accelerate_available():
from accelerate import init_empty_weights
@require_torch_accelerator
class AqlmConfigTest(unittest.TestCase):
def test_to_dict(self):
"""
Simple test that checks if one uses a config and converts it to a dict, the dict is the same as the config object
"""
quantization_config = AqlmConfig()
config_to_dict = quantization_config.to_dict()
for key in config_to_dict:
self.assertEqual(getattr(quantization_config, key), config_to_dict[key])
def test_from_dict(self):
"""
Simple test that checks if one uses a dict and converts it to a config object, the config object is the same as the dict
"""
dict = {
"in_group_size": 32,
"num_codebooks": 8,
"nbits_per_codebook": 8,
"linear_weights_not_to_quantize": ["lm_head.weight"],
}
quantization_config = AqlmConfig.from_dict(dict)
self.assertEqual(dict["in_group_size"], quantization_config.in_group_size)
self.assertEqual(dict["num_codebooks"], quantization_config.num_codebooks)
self.assertEqual(dict["nbits_per_codebook"], quantization_config.nbits_per_codebook)
self.assertEqual(dict["linear_weights_not_to_quantize"], quantization_config.linear_weights_not_to_quantize)
@slow
@require_torch_accelerator
@require_aqlm
@require_accelerate
class AqlmTest(unittest.TestCase):
model_name = "BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf"
input_text = "Hello my name is"
max_new_tokens = 32
EXPECTED_OUTPUT = "Hello my name is Katie. I am a 20 year old college student. I am a very outgoing person. I love to have fun and be active. I"
# called only once for all test in this class
@classmethod
def setUpClass(cls):
"""
Setup quantized model
"""
cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
cls.quantized_model = AutoModelForCausalLM.from_pretrained(
cls.model_name,
device_map=torch_device,
)
def tearDown(self):
gc.collect()
backend_empty_cache(torch_device)
gc.collect()
def test_quantized_model_conversion(self):
"""
Simple test that checks if the quantized model has been converted properly
"""
from aqlm import QuantizedLinear
from transformers.integrations import replace_with_aqlm_linear
model_id = "facebook/opt-350m"
config = AutoConfig.from_pretrained(model_id, revision="cb32f77e905cccbca1d970436fb0f5e6b58ee3c5")
quantization_config = AqlmConfig()
with init_empty_weights():
model = OPTForCausalLM(config)
nb_linears = 0
for module in model.modules():
if isinstance(module, torch.nn.Linear):
nb_linears += 1
model, _ = replace_with_aqlm_linear(model, quantization_config=quantization_config)
nb_aqlm_linear = 0
for module in model.modules():
if isinstance(module, QuantizedLinear):
nb_aqlm_linear += 1
self.assertEqual(nb_linears, nb_aqlm_linear)
# Try with `linear_weights_not_to_quantize`
with init_empty_weights():
model = OPTForCausalLM(config)
model, _ = replace_with_aqlm_linear(
model, quantization_config=quantization_config, linear_weights_not_to_quantize=["lm_head.weight"]
)
nb_aqlm_linear = 0
for module in model.modules():
if isinstance(module, QuantizedLinear):
nb_aqlm_linear += 1
self.assertEqual(nb_linears - 1, nb_aqlm_linear)
@skip(
"inference doesn't work with quantized aqlm models using torch.Any type with recent torch versions. Waiting for the fix from AQLM side"
)
def test_quantized_model(self):
"""
Simple test that checks if the quantized model is working properly
"""
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
def test_raise_if_non_quantized(self):
model_id = "facebook/opt-125m"
quantization_config = AqlmConfig(bits=4)
with self.assertRaises(ValueError):
_ = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
@skip(
"inference doesn't work with quantized aqlm models using torch.Any type with recent torch versions. Waiting for the fix from AQLM side"
)
def test_save_pretrained(self):
"""
Simple test that checks if the quantized model is working properly after being saved and loaded
"""
with tempfile.TemporaryDirectory() as tmpdirname:
self.quantized_model.save_pretrained(tmpdirname)
model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=torch_device)
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
@skip(
"inference doesn't work with quantized aqlm models using torch.Any type with recent torch versions. Waiting for the fix from AQLM side"
)
@require_torch_multi_accelerator
def test_quantized_model_multi_gpu(self):
"""
Simple test that checks if the quantized model is working properly with multiple GPUs
"""
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map="auto")
self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})
output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
@unittest.skipUnless(
is_aqlm_available() and version.parse(importlib.metadata.version("aqlm")) >= version.parse("1.0.3"),
"test requires `aqlm>=1.0.3`",
)
@pytest.mark.torch_compile_test
def test_quantized_model_compile(self):
"""
Simple test that checks if the quantized model is working properly
"""
# Sample tokens greedily
def decode_one_tokens(model, cur_token, input_pos, cache_position, past_key_values):
logits = model(
cur_token,
position_ids=input_pos,
cache_position=cache_position,
past_key_values=past_key_values,
return_dict=False,
use_cache=True,
)[0]
new_token = torch.argmax(logits[:, [-1]], dim=-1).to(torch.int)
return new_token
# Tokenize the test input
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)["input_ids"]
seq_length = input_ids.shape[1]
# Setup static KV cache for generation
past_key_values = StaticCache(
config=self.quantized_model.config,
batch_size=input_ids.shape[0],
max_cache_len=seq_length + self.max_new_tokens + 1,
)
# Allocate token ids to be generated and copy prefix ids
cache_position = torch.arange(seq_length, device=torch_device)
generated_ids = torch.zeros(1, seq_length + self.max_new_tokens, dtype=torch.int, device=torch_device)
generated_ids[:, cache_position] = input_ids.to(torch_device).to(torch.int)
# Do a forward pass to fill the prefix cache and compile the kernels if necessary
logits = self.quantized_model(
input_ids,
cache_position=cache_position,
past_key_values=past_key_values,
return_dict=False,
use_cache=True,
)[0]
next_token = torch.argmax(logits[:, [-1]], dim=-1).to(torch.int)
generated_ids[:, [seq_length]] = next_token
with torch.no_grad():
# Compile the CUDA graph
decode_one_tokens = torch.compile(decode_one_tokens, mode="reduce-overhead", fullgraph=True)
# Generate tokens one by one
cache_position = torch.tensor([seq_length + 1], device=torch_device)
for _ in range(1, self.max_new_tokens):
with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_mem_efficient=False, enable_math=True):
next_token = decode_one_tokens(
self.quantized_model, next_token.clone(), None, cache_position, past_key_values
)
generated_ids.index_copy_(1, cache_position, next_token)
cache_position += 1
# Check generated text
self.assertEqual(self.tokenizer.decode(generated_ids[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

View File

@@ -0,0 +1,543 @@
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import tempfile
import unittest
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AwqConfig, OPTForCausalLM
from transformers.testing_utils import (
backend_empty_cache,
get_device_properties,
require_accelerate,
require_auto_awq,
require_flash_attn,
require_intel_extension_for_pytorch,
require_torch_accelerator,
require_torch_gpu,
require_torch_multi_accelerator,
require_torch_multi_gpu,
slow,
torch_device,
)
from transformers.utils import is_accelerate_available, is_torch_available
if is_torch_available():
import torch
if is_accelerate_available():
from accelerate import init_empty_weights
@require_torch_accelerator
class AwqConfigTest(unittest.TestCase):
def test_wrong_backend(self):
"""
Simple test that checks if a user passes a wrong backend an error is raised
"""
# This should work fine
_ = AwqConfig(bits=4)
with self.assertRaises(ValueError):
AwqConfig(bits=4, backend="")
# These should work fine
_ = AwqConfig(bits=4, version="GEMM")
_ = AwqConfig(bits=4, version="gemm")
with self.assertRaises(ValueError):
AwqConfig(bits=4, backend="unexisting-backend")
# Only cuda and xpu devices can run this function
support_llm_awq = False
device_type, major, _ = get_device_properties()
if device_type == "cuda" and major >= 8:
support_llm_awq = True
elif device_type == "xpu":
support_llm_awq = True
if support_llm_awq:
# LLMAWQ should work on an A100
AwqConfig(bits=4, backend="llm-awq")
else:
# LLMAWQ does not work on a T4
with self.assertRaises(ValueError):
AwqConfig(bits=4, backend="llm-awq")
def test_to_dict(self):
"""
Simple test that checks if one uses a config and converts it to a dict, the dict is the same as the config object
"""
quantization_config = AwqConfig(bits=4)
config_to_dict = quantization_config.to_dict()
for key in config_to_dict:
self.assertEqual(getattr(quantization_config, key), config_to_dict[key])
def test_from_dict(self):
"""
Simple test that checks if one uses a dict and converts it to a config object, the config object is the same as the dict
"""
dict = {"bits": 2, "zero_point": False, "backend": "autoawq"}
quantization_config = AwqConfig.from_dict(dict)
self.assertEqual(dict["bits"], quantization_config.bits)
self.assertEqual(dict["zero_point"], quantization_config.zero_point)
self.assertEqual(dict["backend"], quantization_config.backend)
@slow
@require_torch_accelerator
@require_auto_awq
@require_accelerate
class AwqTest(unittest.TestCase):
model_name = "TheBloke/Mistral-7B-v0.1-AWQ"
dummy_transformers_model_name = "bigscience/bloom-560m"
model_with_no_k_proj_quantized = "hf-internal-testing/opt-125m-awq-no-k-proj"
input_text = "Hello my name is"
EXPECTED_OUTPUT = "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Journalism and minoring in Spanish"
EXPECTED_OUTPUT_BF16 = "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Journalism and minoring in Spanish"
EXPECTED_OUTPUT_EXLLAMA = [
"Hello my name is Katie and I am a 20 year old student from the UK. I am currently studying for a degree in English Literature and History at the University of York. I am a very out",
"Hello my name is Katie and I am a 20 year old student from the UK. I am currently studying for a degree in English Literature and History at the University of York. I am a very creative",
]
device_map = torch_device
# called only once for all test in this class
@classmethod
def setUpClass(cls):
"""
Setup quantized model
"""
cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
cls.quantized_model = AutoModelForCausalLM.from_pretrained(cls.model_name, device_map=cls.device_map)
def tearDown(self):
gc.collect()
backend_empty_cache(torch_device)
gc.collect()
def test_quantized_model_conversion(self):
"""
Simple test that checks if the quantized model has been converted properly
"""
from awq.modules.linear import WQLinear_GEMM, WQLinear_GEMV
from transformers.integrations.awq import replace_with_awq_linear
model_id = "facebook/opt-350m"
config = AutoConfig.from_pretrained(model_id, revision="cb32f77e905cccbca1d970436fb0f5e6b58ee3c5")
quantization_config = AwqConfig(bits=4)
with init_empty_weights():
model = OPTForCausalLM(config)
nb_linears = 0
for module in model.modules():
if isinstance(module, torch.nn.Linear):
nb_linears += 1
model, _ = replace_with_awq_linear(model, quantization_config=quantization_config)
nb_awq_linear = 0
for module in model.modules():
if isinstance(module, (WQLinear_GEMM, WQLinear_GEMV)):
nb_awq_linear += 1
self.assertEqual(nb_linears, nb_awq_linear)
# Try with `modules_not_to_convert`
with init_empty_weights():
model = OPTForCausalLM(config)
model, _ = replace_with_awq_linear(
model, quantization_config=quantization_config, modules_to_not_convert=["lm_head"]
)
nb_awq_linear = 0
for module in model.modules():
if isinstance(module, (WQLinear_GEMM, WQLinear_GEMV)):
nb_awq_linear += 1
self.assertEqual(nb_linears - 1, nb_awq_linear)
def test_quantized_model(self):
"""
Simple test that checks if the quantized model is working properly
"""
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
output = self.quantized_model.generate(**input_ids, max_new_tokens=40)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
def test_raise_if_non_quantized(self):
model_id = "facebook/opt-125m"
quantization_config = AwqConfig(bits=4)
with self.assertRaises(ValueError):
_ = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
def test_quantized_model_bf16(self):
"""
Simple test that checks if the quantized model is working properly with bf16
"""
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype=torch.bfloat16).to(torch_device)
output = quantized_model.generate(**input_ids, max_new_tokens=40)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT_BF16)
@require_torch_gpu
def test_quantized_model_exllama(self):
"""
Simple test that checks if the quantized model is working properly with exllama backend
"""
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
quantization_config = AwqConfig(version="exllama")
quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name, quantization_config=quantization_config, device_map=torch_device
)
output = quantized_model.generate(**input_ids, max_new_tokens=40)
self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT_EXLLAMA)
def test_quantized_model_no_device_map(self):
"""
Simple test that checks if the quantized model is working properly
"""
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name).to(torch_device)
output = quantized_model.generate(**input_ids, max_new_tokens=40)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
def test_save_pretrained(self):
"""
Simple test that checks if the quantized model is working properly after being saved and loaded
"""
with tempfile.TemporaryDirectory() as tmpdirname:
self.quantized_model.save_pretrained(tmpdirname)
model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map)
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
output = model.generate(**input_ids, max_new_tokens=40)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
@require_torch_multi_accelerator
def test_quantized_model_multi_accelerator(self):
"""
Simple test that checks if the quantized model is working properly with multiple GPUs
"""
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map="auto")
self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})
output = quantized_model.generate(**input_ids, max_new_tokens=40)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
def test_quantized_model_no_k_proj_quantized(self):
"""
Simple test that checks if the quantized model is working properly with multiple GPUs
"""
dummy_input = torch.LongTensor([[0, 1, 0]]).to(torch_device)
quantized_model = AutoModelForCausalLM.from_pretrained(self.model_with_no_k_proj_quantized).to(torch_device)
self.assertTrue(isinstance(quantized_model.model.decoder.layers[0].self_attn.k_proj, torch.nn.Linear))
self.assertFalse(isinstance(quantized_model.model.decoder.layers[0].self_attn.v_proj, torch.nn.Linear))
EXPECTED_OUTPUT = torch.LongTensor([[0, 1, 0, 50118, 50118, 133, 248, 12, 134, 16, 10, 372, 2031]]).to(
torch_device
)
output = quantized_model.generate(dummy_input, max_new_tokens=10)
self.assertTrue((EXPECTED_OUTPUT == output).all())
@slow
@require_torch_accelerator
@require_auto_awq
@require_accelerate
class AwqFusedTest(unittest.TestCase):
model_name = "TheBloke/Mistral-7B-OpenOrca-AWQ"
model_revision = "7048b2af77d0dd1c81b000b19d73f9cc8950b510"
custom_mapping_model_id = "TheBloke/Mistral-7B-v0.1-AWQ"
custom_model_revision = "f186bcfa9edbe2a4334262ec1e67f23e53ed1ae7"
mixtral_model_name = "casperhansen/mixtral-instruct-awq"
mixtral_model_revision = "87dd4ec502dde74fb3a624835c776b000d190c3b"
multi_modal_model_name = "ybelkada/llava-1.5-7b-hf-awq"
multi_modal_model_code_revision = "ad108a50f5b9e681bdd7378409f57b7fa59a7442"
prompt = (
"You're standing on the surface of the Earth. "
"You walk one mile south, one mile west and one mile north. "
"You end up exactly where you started. Where are you?"
)
EXPECTED_GENERATION = prompt + "\n\nYou're at the center of a square."
EXPECTED_GENERATION_CUSTOM_MODEL = "Hello,\n\nI have a problem with my 20"
EXPECTED_GENERATION_MIXTRAL = prompt + " You're on the North Pole.\n\nThe"
def tearDown(self):
gc.collect()
backend_empty_cache(torch_device)
gc.collect()
def _check_fused_modules(self, model):
has_fused_modules = False
fused_modules_name = ["QuantAttentionFused", "QuantFusedMLP", "FasterTransformerRMSNorm"]
for _, module in model.named_modules():
if module.__class__.__name__ in fused_modules_name:
has_fused_modules = True
break
self.assertTrue(has_fused_modules, "Modules fusing not performed correctly!")
def test_raise_save_pretrained(self):
"""
Test that `save_pretrained` is effectively blocked for fused models
"""
quantization_config = AwqConfig(bits=4, fuse_max_seq_len=128, do_fuse=True)
model = AutoModelForCausalLM.from_pretrained(
self.model_name,
quantization_config=quantization_config,
revision=self.model_revision,
).to(torch_device)
self._check_fused_modules(model)
with self.assertRaises(ValueError), tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
def test_fused_modules_to_not_convert(self):
"""
Test if fused + modules to_not_convert work as expected
"""
model_id = "hf-internal-testing/Mixtral-tiny-AWQ"
quantization_config = AwqConfig(bits=4, fuse_max_seq_len=128, do_fuse=True)
model = AutoModelForCausalLM.from_pretrained(
model_id,
quantization_config=quantization_config,
).to(torch_device)
# Check if model has been correctly fused
self._check_fused_modules(model)
# Checks if the modules_to_not_convert (here gate layer) is a Linear
self.assertTrue(isinstance(model.model.layers[0].block_sparse_moe.gate, torch.nn.Linear))
@unittest.skipIf(
get_device_properties()[0] == "cuda" and get_device_properties()[1] < 8,
"Skipping because RuntimeError: FlashAttention only supports Ampere GPUs or newer, so not supported on GPU with capability < 8.0",
)
@require_flash_attn
@require_torch_gpu
def test_generation_fused(self):
"""
Test generation quality for fused models - single batch case
"""
quantization_config = AwqConfig(bits=4, fuse_max_seq_len=128, do_fuse=True)
model = AutoModelForCausalLM.from_pretrained(
self.model_name,
quantization_config=quantization_config,
revision=self.model_revision,
).to(torch_device)
self._check_fused_modules(model)
tokenizer = AutoTokenizer.from_pretrained(self.model_name, revision=self.model_revision)
inputs = tokenizer(self.prompt, return_tensors="pt").to(torch_device)
outputs = model.generate(**inputs, max_new_tokens=12)
self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION)
@require_flash_attn
@require_torch_gpu
@unittest.skipIf(
get_device_properties()[0] == "cuda" and get_device_properties()[1] < 8,
"Skipping because RuntimeError: FlashAttention only supports Ampere GPUs or newer, so not supported on GPU with capability < 8.0",
)
def test_generation_fused_batched(self):
"""
Test generation quality for fused models - multi batch case
"""
quantization_config = AwqConfig(bits=4, fuse_max_seq_len=128, do_fuse=True)
model = AutoModelForCausalLM.from_pretrained(
self.model_name,
quantization_config=quantization_config,
revision=self.model_revision,
).to(torch_device)
self._check_fused_modules(model)
tokenizer = AutoTokenizer.from_pretrained(self.model_name, revision=self.model_revision)
tokenizer.pad_token_id = tokenizer.eos_token_id
inputs = tokenizer([self.prompt, self.prompt], return_tensors="pt", padding=True).to(torch_device)
outputs = model.generate(**inputs, max_new_tokens=12)
self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION)
def test_generation_llava_fused(self):
from transformers import pipeline
quantization_config = AwqConfig(do_fuse=True, fuse_max_seq_len=2048)
pipe = pipeline(
"image-to-text",
model=self.multi_modal_model_name,
device=0,
model_kwargs={
"quantization_config": quantization_config,
},
revision=self.multi_modal_model_code_revision,
)
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
prompt = "USER: <image>\nCan you please describe this image?\nASSISTANT:"
outputs = pipe(url, prompt=prompt, generate_kwargs={"max_new_tokens": 100})
EXPECTED_OUTPUT = "USER: \nCan you please describe this image?\nASSISTANT: The image features a brown and white cat sitting on a green surface, possibly a carpet or a grassy area. The cat is holding a red ball in its paws, seemingly playing with it. The cat appears to be focused on the ball, possibly preparing to play or just enjoying the toy."
self.assertEqual(outputs[0]["generated_text"], EXPECTED_OUTPUT)
@require_flash_attn
@require_torch_multi_gpu
@unittest.skipIf(
get_device_properties()[0] == "cuda" and get_device_properties()[1] < 8,
"Skipping because RuntimeError: FlashAttention only supports Ampere GPUs or newer, so not supported on GPU with capability < 8.0",
)
def test_generation_custom_model(self):
"""
Test generation quality for fused models using custom fused map.
"""
quantization_config = AwqConfig(
bits=4,
fuse_max_seq_len=512,
modules_to_fuse={
"attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
"mlp": ["gate_proj", "up_proj", "down_proj"],
"layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
"use_alibi": False,
"hidden_size": 4096,
"num_attention_heads": 32,
"num_key_value_heads": 8,
},
)
model = AutoModelForCausalLM.from_pretrained(
self.custom_mapping_model_id,
quantization_config=quantization_config,
device_map="balanced",
revision=self.custom_model_revision,
)
self._check_fused_modules(model)
tokenizer = AutoTokenizer.from_pretrained(self.custom_mapping_model_id, revision=self.custom_model_revision)
prompt = "Hello"
inputs = tokenizer(prompt, return_tensors="pt").to(torch_device)
outputs = model.generate(**inputs, max_new_tokens=12)
self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION_CUSTOM_MODEL)
@require_flash_attn
@require_torch_multi_gpu
@unittest.skip(reason="Not enough GPU memory on CI runners")
def test_generation_mixtral_fused(self):
"""
Text generation test for Mixtral + AWQ + fused
"""
quantization_config = AwqConfig(bits=4, fuse_max_seq_len=1024, do_fuse=True)
model = AutoModelForCausalLM.from_pretrained(
self.mixtral_model_name,
quantization_config=quantization_config,
device_map="auto",
revision=self.mixtral_model_revision,
)
tokenizer = AutoTokenizer.from_pretrained(self.mixtral_model_name)
tokenizer.pad_token = tokenizer.eos_token
inputs = tokenizer([self.prompt, self.prompt], return_tensors="pt", padding=True).to(torch_device)
outputs = model.generate(**inputs, max_new_tokens=12)
self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION_MIXTRAL)
@slow
@require_torch_accelerator
@require_auto_awq
@require_accelerate
class AwqScaleTest(unittest.TestCase):
model_name = "TechxGenus/starcoder2-3b-AWQ"
def test_load_quantized_model(self):
from awq.modules.act import ScaledActivation
"""
Simple test that checks if the scales have been replaced in the quantized model
"""
quantized_model = AutoModelForCausalLM.from_pretrained(
"TechxGenus/starcoder2-3b-AWQ", dtype=torch.float16, device_map=torch_device
)
self.assertTrue(isinstance(quantized_model.model.layers[0].mlp.act, ScaledActivation))
@slow
@require_auto_awq
@require_accelerate
@require_intel_extension_for_pytorch
class AwqIPEXTest(unittest.TestCase):
def test_quantized_model_ipex(self):
"""
Simple test that checks if the quantized model is working properly with ipex backend
"""
quantization_config = AwqConfig(version="ipex")
model = AutoModelForCausalLM.from_pretrained(
"TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
quantization_config=quantization_config,
device_map="cpu",
)
tokenizer = AutoTokenizer.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ")
input_ids = tokenizer.encode("How to make a cake", return_tensors="pt")
pad_token_id = tokenizer.eos_token_id
output = model.generate(input_ids, do_sample=False, max_length=20, pad_token_id=pad_token_id)
print(tokenizer.decode(output[0], skip_special_tokens=True))
expected_output = (
"How to make a cake with a round tin?\nHow to make a cake with a round tin?\n1. Preheat the oven to 180°"
)
self.assertIn(tokenizer.decode(output[0], skip_special_tokens=True), expected_output)

View File

@@ -0,0 +1,217 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import tempfile
import unittest
from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer
from transformers.testing_utils import (
backend_empty_cache,
backend_synchronize,
require_accelerate,
require_auto_round,
require_intel_extension_for_pytorch,
require_torch_accelerator,
require_torch_gpu,
require_torch_multi_accelerator,
slow,
torch_device,
)
from transformers.utils import is_torch_available
if is_torch_available():
import torch
@slow
@require_torch_accelerator
@require_auto_round
@require_accelerate
class AutoRoundTest(unittest.TestCase):
model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc"
input_text = "There is a girl who likes adventure,"
EXPECTED_OUTPUTS = set()
## Different backends may produce slight variations in output
EXPECTED_OUTPUTS.add(
"There is a girl who likes adventure, and she has been exploring the world "
"for many years. She travels to different countries and cultures, trying new "
"things every day. One of her favorite places to visit is a small village in "
"the mountains where"
)
EXPECTED_OUTPUTS.add(
"There is a girl who likes adventure, and she has been exploring the world for many years. She has visited every country in Europe and has even traveled to some of the most remote parts of Africa. She enjoys hiking through the mountains and discovering"
)
EXPECTED_OUTPUTS.add(
"There is a girl who likes adventure, and she has been exploring the world for many years. She has visited every country in Europe and has even traveled to some of the most remote parts of Africa. She has also climbed mountains and explored caves"
)
device_map = torch_device
# called only once for all test in this class
@classmethod
def setUpClass(cls):
"""
Setup quantized model
"""
backend_synchronize(torch_device)
cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
cls.quantized_model = AutoModelForCausalLM.from_pretrained(
cls.model_name, device_map=cls.device_map, dtype=torch.float16
)
def tearDown(self):
gc.collect()
backend_empty_cache(torch_device)
gc.collect()
def test_quantized_model(self):
"""
Simple test that checks if the quantized model is working properly
"""
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
output = self.quantized_model.generate(**input_ids, max_new_tokens=40, do_sample=False)
self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
def test_raise_if_non_quantized(self):
model_id = "facebook/opt-125m"
quantization_config = AutoRoundConfig(bits=4)
with self.assertRaises(ValueError):
_ = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
def test_quantized_model_bf16(self):
"""
Simple test that checks if the quantized model is working properly with bf16
"""
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
quantization_config = AutoRoundConfig(backend="triton")
quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name,
dtype=torch.bfloat16,
device_map=self.device_map,
quantization_config=quantization_config,
)
output = quantized_model.generate(**input_ids, max_new_tokens=40, do_sample=False)
self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
@require_intel_extension_for_pytorch
def test_quantized_model_on_cpu(self):
"""
Simple test that checks if the quantized model is working properly
"""
input_ids = self.tokenizer(self.input_text, return_tensors="pt")
quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto")
output = quantized_model.generate(**input_ids, max_new_tokens=40, do_sample=False)
self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
def test_save_pretrained(self):
"""
Simple test that checks if the quantized model is working properly after being saved and loaded
"""
## some backends like marlin/ipex will repack the weight that caused the weight shape changed
with tempfile.TemporaryDirectory() as tmpdirname:
quantization_config = AutoRoundConfig(backend="triton")
quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name,
device_map=self.device_map,
dtype=torch.float16,
quantization_config=quantization_config,
)
quantized_model.save_pretrained(tmpdirname)
model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=torch_device)
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
output = model.generate(**input_ids, max_new_tokens=40, do_sample=False)
output_tokens = self.tokenizer.decode(output[0], skip_special_tokens=True)
self.assertIn(output_tokens, self.EXPECTED_OUTPUTS)
@require_torch_multi_accelerator
def test_quantized_model_multi_accelerator(self):
"""
Simple test that checks if the quantized model is working properly with multiple accelerators
"""
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
quantization_config = AutoRoundConfig(backend="triton")
quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name, device_map="auto", quantization_config=quantization_config, dtype="auto"
)
output = quantized_model.generate(**input_ids, max_new_tokens=40, do_sample=False)
self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
def test_convert_from_gptq(self):
"""
Simple test that checks if auto-round work properly with gptq format
"""
model_name = "ybelkada/opt-125m-gptq-4bit"
quantization_config = AutoRoundConfig()
model = AutoModelForCausalLM.from_pretrained(
model_name, device_map=torch_device, quantization_config=quantization_config, dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
text = "There is a girl who likes adventure,"
inputs = tokenizer(text, return_tensors="pt").to(model.device)
tokenizer.decode(model.generate(**inputs, max_new_tokens=5)[0])
@require_intel_extension_for_pytorch
def test_convert_from_awq_cpu(self):
"""
Simple test that checks if auto-round work properly with awq format
"""
model_name = "casperhansen/opt-125m-awq"
quantization_config = AutoRoundConfig()
model = AutoModelForCausalLM.from_pretrained(
model_name, device_map="cpu", quantization_config=quantization_config, dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
text = "There is a girl who likes adventure,"
inputs = tokenizer(text, return_tensors="pt").to(model.device)
tokenizer.decode(model.generate(**inputs, max_new_tokens=5)[0])
@require_torch_gpu
def test_mixed_bits(self):
"""
Simple test that checks if auto-round work properly with mixed bits
"""
model_name = "facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
layer_config = {
"model.decoder.layers.0.self_attn.k_proj": {"bits": 8},
"model.decoder.layers.6.self_attn.out_proj": {"bits": 2, "group_size": 32},
}
bits, group_size, sym = 4, 128, True
from auto_round import AutoRound
autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, layer_config=layer_config)
with tempfile.TemporaryDirectory() as tmpdirname:
autoround.quantize_and_save(output_dir=tmpdirname)
model = AutoModelForCausalLM.from_pretrained(tmpdirname, dtype=torch.float16, device_map=torch_device)
text = "There is a girl who likes adventure,"
inputs = tokenizer(text, return_tensors="pt").to(model.device)
tokenizer.decode(model.generate(**inputs, max_new_tokens=5)[0])

View File

@@ -0,0 +1,225 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import unittest
from transformers import (
AutoConfig,
AutoModelForCausalLM,
AutoTokenizer,
BitNetQuantConfig,
OPTForCausalLM,
)
from transformers.testing_utils import (
backend_empty_cache,
require_accelerate,
require_torch_gpu,
slow,
torch_device,
)
from transformers.utils import is_accelerate_available, is_torch_available
if is_torch_available():
import torch
if is_accelerate_available():
from accelerate import init_empty_weights
@require_torch_gpu
class BitNetQuantConfigTest(unittest.TestCase):
def test_to_dict(self):
"""
Simple test that checks if one uses a config and converts it to a dict, the dict is the same as the config object
"""
quantization_config = BitNetQuantConfig()
config_to_dict = quantization_config.to_dict()
for key in config_to_dict:
self.assertEqual(getattr(quantization_config, key), config_to_dict[key])
@slow
@require_torch_gpu
@require_accelerate
class BitNetTest(unittest.TestCase):
model_name = "HF1BitLLM/Llama3-8B-1.58-100B-tokens"
# called only once for all test in this class
@classmethod
def setUpClass(cls):
"""
Load the model
"""
cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
cls.quantized_model = AutoModelForCausalLM.from_pretrained(cls.model_name, device_map=torch_device)
def tearDown(self):
gc.collect()
backend_empty_cache(torch_device)
gc.collect()
def test_replace_with_bitlinear(self):
from transformers.integrations import BitLinear, replace_with_bitnet_linear
model_id = "facebook/opt-350m"
config = AutoConfig.from_pretrained(model_id)
with init_empty_weights():
model = OPTForCausalLM(config)
nb_linears = 0
for module in model.modules():
if isinstance(module, torch.nn.Linear):
nb_linears += 1
model = replace_with_bitnet_linear(model)
nb_bitnet_linear = 0
for module in model.modules():
if isinstance(module, BitLinear):
nb_bitnet_linear += 1
self.assertEqual(nb_linears - 1, nb_bitnet_linear)
def test_quantized_model(self):
"""
Simple test that checks if the quantized model is working properly
"""
input_text = "What are we having for dinner?"
expected_output = "What are we having for dinner? What are we going to do for fun this weekend?"
input_ids = self.tokenizer(input_text, return_tensors="pt").to(torch_device)
output = self.quantized_model.generate(**input_ids, max_new_tokens=11, do_sample=False)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), expected_output)
def test_packing_unpacking(self):
"""
Simple test the packing and unpacking logic
"""
from transformers.integrations import pack_weights, unpack_weights
u = torch.randint(0, 255, (256, 256), dtype=torch.uint8)
unpacked_u = unpack_weights(u, dtype=torch.bfloat16)
repacked_u = pack_weights(unpacked_u)
for i in range(u.shape[0]):
for j in range(u.shape[1]):
self.assertEqual(repacked_u[i][j], u[i][j])
def test_activation_quant(self):
"""
test the activation function behaviour
"""
from transformers.integrations import BitLinear
layer = BitLinear(in_features=4, out_features=2, bias=False, dtype=torch.float32)
layer.to(torch_device)
input_tensor = torch.tensor([1.0, -1.0, -1.0, 1.0], dtype=torch.float32).to(torch_device)
# Quantize the input tensor
quantized_tensor, scale = layer.activation_quant(input_tensor)
# Verify the output quantized tensor
for i in range(input_tensor.shape[0]):
self.assertEqual(quantized_tensor[i] / scale, input_tensor[i])
# Verify the scale tensor
self.assertEqual(scale, 127)
def test_weights_dtype(self):
"""
test the weights dtype after loading
"""
self_attn_q = self.quantized_model.model.layers[0].self_attn.q_proj.weight
self_attn_k = self.quantized_model.model.layers[0].self_attn.k_proj.weight
self_attn_v = self.quantized_model.model.layers[0].self_attn.v_proj.weight
self_attn_o = self.quantized_model.model.layers[0].self_attn.o_proj.weight
mlp_gate = self.quantized_model.model.layers[0].mlp.gate_proj.weight
mlp_up = self.quantized_model.model.layers[0].mlp.up_proj.weight
mlp_down = self.quantized_model.model.layers[0].mlp.down_proj.weight
self.assertEqual(self_attn_q.dtype, torch.uint8)
self.assertEqual(self_attn_k.dtype, torch.uint8)
self.assertEqual(self_attn_v.dtype, torch.uint8)
self.assertEqual(self_attn_o.dtype, torch.uint8)
self.assertEqual(mlp_up.dtype, torch.uint8)
self.assertEqual(mlp_gate.dtype, torch.uint8)
self.assertEqual(mlp_down.dtype, torch.uint8)
def test_replace_with_bitlinear_shape(self):
"""
test that the BitNet layer weight shapes are correct, and the weight_scale is correctly initialized to 1
"""
from transformers.integrations import replace_with_bitnet_linear
out_features = 1024
in_features = 512
class SimpleLinearModule(torch.nn.Module):
"""
Simple class to test BitLinear
"""
def __init__(
self,
in_features: int = in_features,
out_features: int = out_features,
bias: bool = False,
):
super().__init__()
self.linear = torch.nn.Linear(in_features=in_features, out_features=out_features, bias=bias)
def forward(self, x):
return self.linear(x)
model = SimpleLinearModule()
replace_with_bitnet_linear(model)
self.assertEqual(list(model.linear.weight.shape), [out_features // 4, in_features])
self.assertEqual(model.linear.weight_scale, 1)
@slow
@require_torch_gpu
@require_accelerate
class BitNetSerializationTest(unittest.TestCase):
def test_model_serialization(self):
model_name = "HF1BitLLM/Llama3-8B-1.58-100B-tokens"
quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map=torch_device)
input_tensor = torch.zeros((1, 8), dtype=torch.int32, device=torch_device)
with torch.no_grad():
logits_ref = quantized_model.forward(input_tensor).logits
# Save
saved_model_id = "quant_model"
quantized_model.save_pretrained(saved_model_id)
# Remove old model
del quantized_model
backend_empty_cache(torch_device)
# Load and check if the logits match
model_loaded = AutoModelForCausalLM.from_pretrained("quant_model", device_map=torch_device)
with torch.no_grad():
logits_loaded = model_loaded.forward(input_tensor).logits
self.assertEqual((logits_loaded - logits_ref).abs().mean().item(), 0)

View File

@@ -0,0 +1,120 @@
# Testing mixed int8 quantization
![HFxbitsandbytes.png](https://cdn-uploads.huggingface.co/production/uploads/1660567705337-62441d1d9fdefb55a0b7d12c.png)
The following is the recipe on how to effectively debug `bitsandbytes` integration on Hugging Face `transformers`.
## Library requirements
+ `transformers>=4.22.0`
+ `accelerate>=0.12.0`
+ `bitsandbytes>=0.31.5`.
## Hardware requirements
The following instructions are tested with 2 NVIDIA-Tesla T4 GPUs. To run successfully `bitsandbytes` you would need a 8-bit core tensor supported GPU. Note that Turing, Ampere or newer architectures - e.g. T4, RTX20s RTX30s, A40-A100, A6000 should be supported.
## Virtual envs
```bash
conda create --name int8-testing python==3.8
pip install bitsandbytes>=0.31.5
pip install accelerate>=0.12.0
pip install transformers>=4.23.0
```
if `transformers>=4.23.0` is not released yet, then use:
```bash
pip install git+https://github.com/huggingface/transformers.git
```
## Troubleshooting
A list of common errors:
### Torch does not correctly do the operations on GPU
First check that:
```py
import torch
vec = torch.randn(1, 2, 3).to(0)
```
Works without any error. If not, install torch using `conda` like:
```bash
conda create --name int8-testing python==3.8
conda install pytorch torchvision torchaudio cudatoolkit=11.6 -c pytorch -c conda-forge
pip install bitsandbytes>=0.31.5
pip install accelerate>=0.12.0
pip install transformers>=4.23.0
```
For the latest pytorch instructions please see [this](https://pytorch.org/get-started/locally/)
and the snippet above should work.
### ` bitsandbytes operations are not supported under CPU!`
This happens when some Linear weights are set to the CPU when using `accelerate`. Please check carefully `model.hf_device_map` and make sure that there is no `Linear` module that is assigned to CPU. It is fine to have the last module (usually the Lm_head) set on CPU.
### `To use the type as a Parameter, please correct the detach() semantics defined by __torch_dispatch__() implementation.`
Use the latest version of `accelerate` with a command such as: `pip install -U accelerate` and the problem should be solved.
### `Parameter has no attribute .CB`
Same solution as above.
### `RuntimeError: CUDA error: an illegal memory access was encountered ... consider passing CUDA_LAUNCH_BLOCKING=1`
Run your script by prepending `CUDA_LAUNCH_BLOCKING=1` and you should observe an error as described in the next section.
### `CUDA illegal memory error: an illegal memory access at line...`:
Check the CUDA versions with:
```bash
nvcc --version
```
and confirm it is the same version as the one detected by `bitsandbytes`. If not, run:
```bash
ls -l $CONDA_PREFIX/lib/libcudart.so
```
or
```bash
ls -l $LD_LIBRARY_PATH
```
Check if `libcudart.so` has a correct symlink that is set. Sometimes `nvcc` detects the correct CUDA version but `bitsandbytes` doesn't. You have to make sure that the symlink that is set for the file `libcudart.so` is redirected to the correct CUDA file.
Here is an example of a badly configured CUDA installation:
`nvcc --version` gives:
![Screenshot 2022-08-15 at 15.12.23.png](https://cdn-uploads.huggingface.co/production/uploads/1660569220888-62441d1d9fdefb55a0b7d12c.png)
which means that the detected CUDA version is 11.3 but `bitsandbytes` outputs:
![image.png](https://cdn-uploads.huggingface.co/production/uploads/1660569284243-62441d1d9fdefb55a0b7d12c.png)
First check:
```bash
echo $LD_LIBRARY_PATH
```
If this contains multiple paths separated by `:`. Then you have to make sure that the correct CUDA version is set. By doing:
```bash
ls -l $path/libcudart.so
```
On each path (`$path`) separated by `:`.
If not, simply run
```bash
ls -l $LD_LIBRARY_PATH/libcudart.so
```
and you can see
![Screenshot 2022-08-15 at 15.12.33.png](https://cdn-uploads.huggingface.co/production/uploads/1660569176504-62441d1d9fdefb55a0b7d12c.png)
If you see that the file is linked to the wrong CUDA version (here 10.2), find the correct location for `libcudart.so` (`find --name libcudart.so`) and replace the environment variable `LD_LIBRARY_PATH` with the one containing the correct `libcudart.so` file.

View File

@@ -0,0 +1,868 @@
# Copyright 2022 The HuggingFace Team Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a clone of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import importlib.metadata
import tempfile
import unittest
import pytest
from packaging import version
from transformers import (
AutoConfig,
AutoModel,
AutoModelForCausalLM,
AutoModelForSeq2SeqLM,
AutoModelForSequenceClassification,
AutoTokenizer,
BitsAndBytesConfig,
pipeline,
set_seed,
)
from transformers.models.opt.modeling_opt import OPTAttention
from transformers.testing_utils import (
apply_skip_if_not_implemented,
backend_empty_cache,
backend_torch_accelerator_module,
is_bitsandbytes_available,
is_torch_available,
require_accelerate,
require_bitsandbytes,
require_torch,
require_torch_gpu_if_bnb_not_multi_backend_enabled,
require_torch_multi_accelerator,
slow,
torch_device,
)
def get_some_linear_layer(model):
if model.config.model_type == "gpt2":
return model.transformer.h[0].mlp.c_fc
elif model.config.model_type == "opt":
try:
return model.decoder.layers[0].fc1
except AttributeError:
# for AutoModelforCausalLM
return model.model.decoder.layers[0].fc1
elif model.config.model_type == "llama":
return model.model.layers[0].mlp.gate_proj
else:
return model.transformer.h[0].mlp.dense_4h_to_h
if is_torch_available():
import torch
import torch.nn as nn
class LoRALayer(nn.Module):
"""Wraps a linear layer with LoRA-like adapter - Used for testing purposes only"""
def __init__(self, module: nn.Module, rank: int):
super().__init__()
self.module = module
self.adapter = nn.Sequential(
nn.Linear(module.in_features, rank, bias=False),
nn.Linear(rank, module.out_features, bias=False),
)
small_std = (2.0 / (5 * min(module.in_features, module.out_features))) ** 0.5
nn.init.normal_(self.adapter[0].weight, std=small_std)
nn.init.zeros_(self.adapter[1].weight)
self.adapter.to(module.weight.device)
def forward(self, input, *args, **kwargs):
return self.module(input, *args, **kwargs) + self.adapter(input)
if is_bitsandbytes_available():
import bitsandbytes as bnb
@require_bitsandbytes
@require_accelerate
@require_torch
@require_torch_gpu_if_bnb_not_multi_backend_enabled
@slow
class Base4bitTest(unittest.TestCase):
# We keep the constants inside the init function and model loading inside setUp function
# We need to test on relatively large models (aka >1b parameters otherwise the quantiztion may not work as expected)
# Therefore here we use only bloom-1b3 to test our module
model_name = "bigscience/bloom-1b7"
# Constant values
EXPECTED_RELATIVE_DIFFERENCE = (
2.109659552692574 # This was obtained on a RTX Titan so the number might slightly change
)
input_text = "Hello my name is"
EXPECTED_OUTPUTS = set()
EXPECTED_OUTPUTS.add("Hello my name is John and I am a professional photographer. I")
EXPECTED_OUTPUTS.add("Hello my name is John.\nI am a friend of your father.\n")
EXPECTED_OUTPUTS.add("Hello my name is John Doe, I am a student at the University")
EXPECTED_OUTPUTS.add("Hello my name is John and I am 25 years old.")
EXPECTED_OUTPUTS.add("Hello my name is John and I am a student at the University of")
# Expected values on Intel XPU and NV A100
EXPECTED_OUTPUTS.add("Hello my name is Alina. I have been working as a professional")
MAX_NEW_TOKENS = 10
def setUp(self):
# Models and tokenizer
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
@apply_skip_if_not_implemented
class Bnb4BitTest(Base4bitTest):
def setUp(self):
super().setUp()
# Models and tokenizer
self.model_fp16 = AutoModelForCausalLM.from_pretrained(self.model_name, dtype=torch.float16, device_map="auto")
self.model_4bit = AutoModelForCausalLM.from_pretrained(self.model_name, load_in_4bit=True, device_map="auto")
def tearDown(self):
r"""
TearDown function needs to be called at the end of each test to free the GPU memory and cache, also to
avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
"""
del self.model_fp16
del self.model_4bit
gc.collect()
backend_empty_cache(torch_device)
def test_quantization_num_parameters(self):
r"""
Test if the number of returned parameters is correct
See: https://github.com/huggingface/transformers/issues/25978
"""
num_params_4bit = self.model_4bit.num_parameters()
num_params_fp16 = self.model_fp16.num_parameters()
self.assertEqual(num_params_4bit, num_params_fp16)
def test_quantization_config_json_serialization(self):
r"""
A simple test to check if the quantization config is correctly serialized and deserialized
"""
config = self.model_4bit.config
self.assertTrue(hasattr(config, "quantization_config"))
_ = config.to_dict()
_ = config.to_diff_dict()
_ = config.to_json_string()
def test_memory_footprint(self):
r"""
A simple test to check if the model conversion has been done correctly by checking on the
memory footprint of the converted model and the class type of the linear layers of the converted models
"""
from bitsandbytes.nn import Params4bit
mem_fp16 = self.model_fp16.get_memory_footprint()
mem_4bit = self.model_4bit.get_memory_footprint()
self.assertAlmostEqual(mem_fp16 / mem_4bit, self.EXPECTED_RELATIVE_DIFFERENCE, delta=1e-5)
linear = get_some_linear_layer(self.model_4bit)
self.assertTrue(linear.weight.__class__ == Params4bit)
def test_original_dtype(self):
r"""
A simple test to check if the model successfully stores the original dtype
"""
self.assertTrue(hasattr(self.model_4bit.config, "_pre_quantization_dtype"))
self.assertFalse(hasattr(self.model_fp16.config, "_pre_quantization_dtype"))
self.assertTrue(self.model_4bit.config._pre_quantization_dtype == torch.float16)
def test_linear_are_4bit(self):
r"""
A simple test to check if the model conversion has been done correctly by checking on the
memory footprint of the converted model and the class type of the linear layers of the converted models
"""
from transformers import T5PreTrainedModel
self.model_fp16.get_memory_footprint()
self.model_4bit.get_memory_footprint()
for name, module in self.model_4bit.named_modules():
if isinstance(module, torch.nn.Linear):
if name not in ["lm_head"] + T5PreTrainedModel._keep_in_fp32_modules:
# 4-bit parameters are packed in uint8 variables
self.assertTrue(module.weight.dtype == torch.uint8)
def test_rwkv_4bit(self):
r"""
A simple test to check if 4-bit RWKV inference works as expected.
"""
model_id = "RWKV/rwkv-4-169m-pile"
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
tok = AutoTokenizer.from_pretrained(model_id)
text = "Hello my name is"
input_ids = tok.encode(text, return_tensors="pt").to(torch_device)
_ = model.generate(input_ids, max_new_tokens=30)
def test_generate_quality(self):
r"""
Test the generation quality of the quantized model and see that we are matching the expected output.
Given that we are operating on small numbers + the testing model is relatively small, we might not get
the same output across GPUs. So we'll generate few tokens (5-10) and check their output.
"""
encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
output_sequences = self.model_4bit.generate(
input_ids=encoded_input["input_ids"].to(self.model_4bit.device), max_new_tokens=10
)
self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
def test_generate_quality_config(self):
r"""
Test that loading the model with the config is equivalent
"""
bnb_config = BitsAndBytesConfig()
bnb_config.load_in_4bit = True
model_4bit_from_config = AutoModelForCausalLM.from_pretrained(
self.model_name, quantization_config=bnb_config, device_map="auto"
)
encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
output_sequences = model_4bit_from_config.generate(
input_ids=encoded_input["input_ids"].to(model_4bit_from_config.device), max_new_tokens=10
)
self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
def test_generate_quality_dequantize(self):
r"""
Test that loading the model and unquantize it produce correct results
"""
bnb_config = BitsAndBytesConfig(load_in_4bit=True)
model_4bit = AutoModelForCausalLM.from_pretrained(
self.model_name, quantization_config=bnb_config, device_map="auto"
)
model_4bit.dequantize()
encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
output_sequences = model_4bit.generate(
input_ids=encoded_input["input_ids"].to(model_4bit.device), max_new_tokens=10
)
self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
def test_clear_quantization_trace(self):
r"""
Test that dequantizing the model won't leave any attribute relative to quantization in the model's configuration
"""
bnb_config = BitsAndBytesConfig(load_in_4bit=True)
model_4bit = AutoModelForCausalLM.from_pretrained(
self.model_name, quantization_config=bnb_config, device_map="auto"
)
model_4bit.dequantize()
self.assertFalse(hasattr(model_4bit, "hf_quantizer"))
self.assertFalse(hasattr(model_4bit.config, "quantization_config"))
self.assertFalse(hasattr(model_4bit.config, "_pre_quantization_dtype"))
self.assertFalse(hasattr(model_4bit, "quantization_method"))
self.assertFalse(model_4bit.is_quantized)
def test_to_device_dequantized(self):
r"""
Test that dequantizing the model won't prevent converting it to a different dtype
"""
bnb_config = BitsAndBytesConfig(load_in_4bit=True)
model_4bit = AutoModelForCausalLM.from_pretrained(
self.model_name, quantization_config=bnb_config, device_map="auto"
)
model_4bit.dequantize()
model_4bit.to(dtype=torch.float16)
def test_device_assignment(self):
if version.parse(importlib.metadata.version("bitsandbytes")) < version.parse("0.43.2"):
self.skipTest(reason="This test requires bitsandbytes >= 0.43.2")
mem_before = self.model_4bit.get_memory_footprint()
# Move to CPU
self.model_4bit.to("cpu")
self.assertEqual(self.model_4bit.device.type, "cpu")
self.assertAlmostEqual(self.model_4bit.get_memory_footprint(), mem_before)
if torch_device in ["cuda", "xpu"]:
# Move back to CUDA device
self.model_4bit.to(torch_device)
self.assertEqual(self.model_4bit.device.type, torch_device)
self.assertAlmostEqual(self.model_4bit.get_memory_footprint(), mem_before)
def test_device_and_dtype_assignment(self):
r"""
Test whether attempting to change the device or cast the dtype of a model
after converting it to 4-bit precision will raise an appropriate error.
The test ensures that such operations are prohibited on 4-bit models
to prevent invalid conversions.
"""
# Moving with `to` or `cuda` is not supported with versions < 0.43.2.
if version.parse(importlib.metadata.version("bitsandbytes")) < version.parse("0.43.2"):
with self.assertRaises(ValueError):
# Tries with `str`
self.model_4bit.to("cpu")
with self.assertRaises(ValueError):
# Tries with a `device`
self.model_4bit.to(torch.device("cuda:0"))
with self.assertRaises(ValueError):
# Tries with `cuda`
self.model_4bit.cuda()
with self.assertRaises(ValueError):
# Tries with a `dtype`
self.model_4bit.to(torch.float16)
with self.assertRaises(ValueError):
# Tries to cast the 4-bit model to float32 using `float()`
self.model_4bit.float()
with self.assertRaises(ValueError):
# Tries to cast the 4-bit model to float16 using `half()`
self.model_4bit.half()
# Test if we did not break anything
self.model_4bit.to(torch.device(torch_device))
encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
self.model_fp16 = self.model_fp16.to(torch.float32)
_ = self.model_fp16.generate(
input_ids=encoded_input["input_ids"].to(self.model_fp16.device), max_new_tokens=10
)
if torch_device in ["cuda", "xpu"]:
# Check that this does not throw an error
_ = self.model_fp16.to(torch_device)
# Check this does not throw an error
_ = self.model_fp16.to("cpu")
# Check this does not throw an error
_ = self.model_fp16.half()
# Check this does not throw an error
_ = self.model_fp16.float()
def test_fp32_4bit_conversion(self):
r"""
Test whether it is possible to mix both `4bit` and `fp32` weights when using `keep_in_fp32_modules` correctly.
"""
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small", load_in_4bit=True, device_map="auto")
self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wo.weight.dtype == torch.float32)
def test_bnb_4bit_wrong_config(self):
r"""
Test whether creating a bnb config with unsupported values leads to errors.
"""
with self.assertRaises(ValueError):
_ = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_storage="add")
@require_bitsandbytes
@require_accelerate
@require_torch
@require_torch_gpu_if_bnb_not_multi_backend_enabled
@slow
@apply_skip_if_not_implemented
class Bnb4BitT5Test(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.model_name = "google-t5/t5-small"
cls.dense_act_model_name = "google/flan-t5-small" # flan-t5 uses dense-act instead of dense-relu-dense
cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
cls.input_text = "Translate in German: Hello, my dog is cute"
def tearDown(self):
r"""
TearDown function needs to be called at the end of each test to free the GPU memory and cache, also to
avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
"""
gc.collect()
backend_empty_cache(torch_device)
def test_inference_without_keep_in_fp32(self):
r"""
Test whether it is possible to mix both `4bit` and `fp32` weights when using `keep_in_fp32_modules` correctly.
`flan-t5-small` uses `T5DenseGatedActDense` whereas `google-t5/t5-small` uses `T5DenseReluDense`. We need to test
both cases.
"""
from transformers import T5ForConditionalGeneration
modules = T5ForConditionalGeneration._keep_in_fp32_modules
T5ForConditionalGeneration._keep_in_fp32_modules = None
# test with `google-t5/t5-small`
model = T5ForConditionalGeneration.from_pretrained(self.model_name, load_in_4bit=True, device_map="auto")
encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(model.device)
_ = model.generate(**encoded_input)
# test with `flan-t5-small`
model = T5ForConditionalGeneration.from_pretrained(
self.dense_act_model_name, load_in_4bit=True, device_map="auto"
)
encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(model.device)
_ = model.generate(**encoded_input)
T5ForConditionalGeneration._keep_in_fp32_modules = modules
def test_inference_with_keep_in_fp32(self):
r"""
Test whether it is possible to mix both `4bit` and `fp32` weights when using `keep_in_fp32_modules` correctly.
`flan-t5-small` uses `T5DenseGatedActDense` whereas `google-t5/t5-small` uses `T5DenseReluDense`. We need to test
both cases.
"""
from transformers import T5ForConditionalGeneration
# test with `google-t5/t5-small`
model = T5ForConditionalGeneration.from_pretrained(self.model_name, load_in_4bit=True, device_map="auto")
# there was a bug with decoders - this test checks that it is fixed
self.assertTrue(isinstance(model.decoder.block[0].layer[0].SelfAttention.q, bnb.nn.Linear4bit))
encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(model.device)
_ = model.generate(**encoded_input)
# test with `flan-t5-small`
model = T5ForConditionalGeneration.from_pretrained(
self.dense_act_model_name, load_in_4bit=True, device_map="auto"
)
encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(model.device)
_ = model.generate(**encoded_input)
@apply_skip_if_not_implemented
class Classes4BitModelTest(Base4bitTest):
def setUp(self):
super().setUp()
# model_name
self.model_name = "bigscience/bloom-560m"
self.seq_to_seq_name = "google-t5/t5-small"
# Different types of model
self.base_model = AutoModel.from_pretrained(self.model_name, load_in_4bit=True, device_map="auto")
# Sequence classification model
self.sequence_model = AutoModelForSequenceClassification.from_pretrained(
self.model_name, load_in_4bit=True, device_map="auto"
)
# CausalLM model
self.model_4bit = AutoModelForCausalLM.from_pretrained(self.model_name, load_in_4bit=True, device_map="auto")
# Seq2seq model
self.seq_to_seq_model = AutoModelForSeq2SeqLM.from_pretrained(
self.seq_to_seq_name, load_in_4bit=True, device_map="auto"
)
def tearDown(self):
r"""
TearDown function needs to be called at the end of each test to free the GPU memory and cache, also to
avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
"""
del self.base_model
del self.sequence_model
del self.model_4bit
del self.seq_to_seq_model
gc.collect()
backend_empty_cache(torch_device)
def test_correct_head_class(self):
r"""
A simple test to check if the last modules for some classes (AutoModelForCausalLM or SequenceClassification)
are kept in their native class.
"""
from bitsandbytes.nn import Params4bit
self.assertTrue(self.base_model.h[-1].mlp.dense_4h_to_h.weight.__class__ == Params4bit)
# Other heads should be nn.Parameter
self.assertTrue(self.model_4bit.lm_head.weight.__class__ == torch.nn.Parameter)
self.assertTrue(self.sequence_model.score.weight.__class__ == torch.nn.Parameter)
self.assertTrue(self.seq_to_seq_model.lm_head.weight.__class__ == torch.nn.Parameter)
@apply_skip_if_not_implemented
class Pipeline4BitTest(Base4bitTest):
def setUp(self):
super().setUp()
def tearDown(self):
r"""
TearDown function needs to be called at the end of each test to free the GPU memory and cache, also to
avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
"""
if hasattr(self, "pipe"):
del self.pipe
gc.collect()
backend_empty_cache(torch_device)
def test_pipeline(self):
r"""
The aim of this test is to verify that the mixed 4bit is compatible with `pipeline` from transformers. Since
we used pipeline for inference speed benchmarking we want to make sure that this feature does not break anything
on pipeline.
"""
# self._clear_cuda_cache()
self.pipe = pipeline(
"text-generation",
model=self.model_name,
model_kwargs={
"device_map": "auto",
"load_in_4bit": True,
# float16 isn't supported on CPU, use bfloat16 instead
"dtype": torch.bfloat16 if torch_device == "cpu" else torch.float16,
},
max_new_tokens=self.MAX_NEW_TOKENS,
)
# Avoid sampling different outputs
set_seed(42)
# Real second forward pass
pipeline_output = self.pipe(self.input_text)
self.assertIn(pipeline_output[0]["generated_text"], self.EXPECTED_OUTPUTS)
@require_torch_multi_accelerator
@apply_skip_if_not_implemented
class Bnb4bitTestMultiAccelerator(Base4bitTest):
def setUp(self):
super().setUp()
def test_multi_accelerator_loading(self):
r"""
This tests that the model has been loaded and can be used correctly on a multi-accelerator setup.
Let's just try to load a model on 2 accelerators and see if it works. The model we test has ~2GB of total, 3GB should suffice
"""
device_map = {
"transformer.word_embeddings": 0,
"transformer.word_embeddings_layernorm": 0,
"lm_head": 0,
"transformer.h.0": 0,
"transformer.h.1": 0,
"transformer.h.2": 0,
"transformer.h.3": 0,
"transformer.h.4": 0,
"transformer.h.5": 0,
"transformer.h.6": 0,
"transformer.h.7": 0,
"transformer.h.8": 0,
"transformer.h.9": 0,
"transformer.h.10": 1,
"transformer.h.11": 1,
"transformer.h.12": 1,
"transformer.h.13": 1,
"transformer.h.14": 1,
"transformer.h.15": 1,
"transformer.h.16": 1,
"transformer.h.17": 0,
"transformer.h.18": 0,
"transformer.h.19": 0,
"transformer.h.20": 0,
"transformer.h.21": 0,
"transformer.h.22": 0,
"transformer.h.23": 1,
"transformer.ln_f": 0,
}
model_parallel = AutoModelForCausalLM.from_pretrained(
self.model_name, load_in_4bit=True, device_map=device_map
)
# Check correct device map
self.assertEqual(set(model_parallel.hf_device_map.values()), {0, 1})
# Check that inference pass works on the model
encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
# Second real batch
output_parallel = model_parallel.generate(
input_ids=encoded_input["input_ids"].to(torch_device), max_new_tokens=10
)
self.assertIn(self.tokenizer.decode(output_parallel[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
@apply_skip_if_not_implemented
class Bnb4BitTestTraining(Base4bitTest):
def setUp(self):
self.model_name = "facebook/opt-350m"
super().setUp()
def test_training(self):
if version.parse(importlib.metadata.version("bitsandbytes")) < version.parse("0.37.0"):
self.skipTest(reason="This test requires bitsandbytes >= 0.37.0")
# Step 1: freeze all parameters
model = AutoModelForCausalLM.from_pretrained(self.model_name, load_in_4bit=True)
if torch_device in ["cuda", "xpu"]:
self.assertEqual(
set(model.hf_device_map.values()), {backend_torch_accelerator_module(torch_device).current_device()}
)
else:
self.assertTrue(all(param.device.type == "cpu" for param in model.parameters()))
for param in model.parameters():
param.requires_grad = False # freeze the model - train adapters later
if param.ndim == 1:
# cast the small parameters (e.g. layernorm) to fp32 for stability
param.data = param.data.to(torch.float32)
# Step 2: add adapters
for _, module in model.named_modules():
if isinstance(module, OPTAttention):
module.q_proj = LoRALayer(module.q_proj, rank=16)
module.k_proj = LoRALayer(module.k_proj, rank=16)
module.v_proj = LoRALayer(module.v_proj, rank=16)
# Step 3: dummy batch
batch = self.tokenizer("Test batch ", return_tensors="pt").to(torch_device)
# Step 4: Check if the gradient is not None
with torch.autocast(torch_device):
out = model.forward(**batch)
out.logits.norm().backward()
for module in model.modules():
if isinstance(module, LoRALayer):
self.assertTrue(module.adapter[1].weight.grad is not None)
self.assertTrue(module.adapter[1].weight.grad.norm().item() > 0)
elif isinstance(module, nn.Embedding):
self.assertTrue(module.weight.grad is None)
@apply_skip_if_not_implemented
class Bnb4BitGPT2Test(Bnb4BitTest):
model_name = "openai-community/gpt2-xl"
EXPECTED_RELATIVE_DIFFERENCE = 3.3191854854152187
@apply_skip_if_not_implemented
class Bnb4BitLlamaTest(Bnb4BitTest):
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
EXPECTED_RELATIVE_DIFFERENCE = 2.9461410686392764
@require_bitsandbytes
@require_accelerate
@require_torch
@require_torch_gpu_if_bnb_not_multi_backend_enabled
@slow
@apply_skip_if_not_implemented
class BaseSerializationTest(unittest.TestCase):
model_name = "facebook/opt-125m"
input_text = "Mars colonists' favorite meals are"
def tearDown(self):
gc.collect()
backend_empty_cache(torch_device)
def test_serialization(self, quant_type="nf4", double_quant=True, safe_serialization=True):
r"""
Test whether it is possible to serialize a model in 4-bit. Uses most typical params as default.
See ExtendedSerializationTest class for more params combinations.
"""
tokenizer = AutoTokenizer.from_pretrained(self.model_name)
self.quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type=quant_type,
bnb_4bit_use_double_quant=double_quant,
bnb_4bit_compute_dtype=torch.bfloat16,
)
model_0 = AutoModelForCausalLM.from_pretrained(
self.model_name,
quantization_config=self.quantization_config,
device_map=torch_device,
)
with tempfile.TemporaryDirectory() as tmpdirname:
model_0.save_pretrained(tmpdirname, safe_serialization=safe_serialization)
config = AutoConfig.from_pretrained(tmpdirname)
self.assertTrue(hasattr(config, "quantization_config"))
model_1 = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=torch_device)
# checking quantized linear module weight
linear = get_some_linear_layer(model_1)
self.assertTrue(linear.weight.__class__ == bnb.nn.Params4bit)
self.assertTrue(hasattr(linear.weight, "quant_state"))
self.assertTrue(linear.weight.quant_state.__class__ == bnb.functional.QuantState)
# checking memory footpring
self.assertAlmostEqual(model_0.get_memory_footprint() / model_1.get_memory_footprint(), 1, places=2)
# Matching all parameters and their quant_state items:
d0 = dict(model_0.named_parameters())
d1 = dict(model_1.named_parameters())
self.assertTrue(d0.keys() == d1.keys())
for k in d0:
self.assertTrue(d0[k].shape == d1[k].shape)
self.assertTrue(d0[k].device.type == d1[k].device.type)
self.assertTrue(d0[k].device == d1[k].device)
self.assertTrue(d0[k].dtype == d1[k].dtype)
self.assertTrue(torch.equal(d0[k], d1[k].to(d0[k].device)))
if isinstance(d0[k], bnb.nn.modules.Params4bit):
for v0, v1 in zip(
d0[k].quant_state.as_dict().values(),
d1[k].quant_state.as_dict().values(),
):
if isinstance(v0, torch.Tensor):
# The absmax will not be saved in the quant_state when using NF4 in CPU
if v0.numel() != 0:
self.assertTrue(torch.equal(v0, v1.to(v0.device)))
else:
self.assertTrue(v0 == v1)
# comparing forward() outputs
encoded_input = tokenizer(self.input_text, return_tensors="pt").to(torch_device)
out_0 = model_0(**encoded_input)
out_1 = model_1(**encoded_input)
torch.testing.assert_close(out_0["logits"], out_1["logits"], rtol=0.05, atol=0.05)
# comparing generate() outputs
encoded_input = tokenizer(self.input_text, return_tensors="pt").to(torch_device)
output_sequences_0 = model_0.generate(**encoded_input, max_new_tokens=10)
output_sequences_1 = model_1.generate(**encoded_input, max_new_tokens=10)
def _decode(token):
return tokenizer.decode(token, skip_special_tokens=True)
self.assertEqual(
[_decode(x) for x in output_sequences_0],
[_decode(x) for x in output_sequences_1],
)
@apply_skip_if_not_implemented
class ExtendedSerializationTest(BaseSerializationTest):
"""
tests more combinations of parameters
"""
def test_nf4_single_unsafe(self):
self.test_serialization(quant_type="nf4", double_quant=False, safe_serialization=False)
def test_nf4_single_safe(self):
self.test_serialization(quant_type="nf4", double_quant=False, safe_serialization=True)
def test_nf4_double_unsafe(self):
self.test_serialization(quant_type="nf4", double_quant=True, safe_serialization=False)
# nf4 double safetensors quantization is tested in test_serialization() method from the parent class
def test_fp4_single_unsafe(self):
self.test_serialization(quant_type="fp4", double_quant=False, safe_serialization=False)
def test_fp4_single_safe(self):
self.test_serialization(quant_type="fp4", double_quant=False, safe_serialization=True)
def test_fp4_double_unsafe(self):
self.test_serialization(quant_type="fp4", double_quant=True, safe_serialization=False)
def test_fp4_double_safe(self):
self.test_serialization(quant_type="fp4", double_quant=True, safe_serialization=True)
class BloomSerializationTest(BaseSerializationTest):
"""
default BaseSerializationTest config tested with Bloom family model
"""
model_name = "bigscience/bloom-560m"
class GPTSerializationTest(BaseSerializationTest):
"""
default BaseSerializationTest config tested with GPT family model
"""
model_name = "openai-community/gpt2-xl"
class LlamaSerializationTest(BaseSerializationTest):
"""
default BaseSerializationTest config tested with Llama family model
"""
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
@require_bitsandbytes
@require_accelerate
@require_torch_gpu_if_bnb_not_multi_backend_enabled
@slow
@apply_skip_if_not_implemented
class Bnb4BitTestBasicConfigTest(unittest.TestCase):
def test_load_in_4_and_8_bit_fails(self):
with self.assertRaisesRegex(ValueError, "load_in_4bit and load_in_8bit are both True"):
AutoModelForCausalLM.from_pretrained("facebook/opt-125m", load_in_4bit=True, load_in_8bit=True)
def test_set_load_in_8_bit(self):
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
with self.assertRaisesRegex(ValueError, "load_in_4bit and load_in_8bit are both True"):
quantization_config.load_in_8bit = True
@require_bitsandbytes
@require_accelerate
@require_torch_gpu_if_bnb_not_multi_backend_enabled
@slow
@apply_skip_if_not_implemented
class Bnb4bitCompile(unittest.TestCase):
model_name = "hf-internal-testing/tiny-random-LlamaForCausalLM"
input_text = "Hello my name is"
def setUp(self):
# Models and tokenizer
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
self.model_4bit = AutoModelForCausalLM.from_pretrained(self.model_name, load_in_4bit=True)
@pytest.mark.torch_compile_test
def test_generate_compile(self):
encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
# if nothing is set, compile will be disabled for bnb
self.model_4bit.generate(
input_ids=encoded_input["input_ids"].to(self.model_4bit.device),
max_new_tokens=10,
cache_implementation="static",
)
with self.assertRaises(Exception):
# overwrite property
object.__setattr__(self.model_4bit.hf_quantizer, "is_compileable", True)
self.model_4bit.generate(
input_ids=encoded_input["input_ids"].to(self.model_4bit.device),
max_new_tokens=10,
cache_implementation="static",
)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,232 @@
import gc
import unittest
import warnings
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.testing_utils import backend_empty_cache, require_compressed_tensors, require_torch, torch_device
from transformers.utils import is_torch_available
from transformers.utils.quantization_config import CompressedTensorsConfig
if is_torch_available():
import torch
@require_compressed_tensors
@require_torch
class StackCompressedModelTest(unittest.TestCase):
# Define stubs as class attributes
compressed_uncompressed_model_stubs = [
(
"nm-testing/llama2.c-stories42M-gsm8k-quantized-only-compressed",
"nm-testing/llama2.c-stories42M-gsm8k-quantized-only-uncompressed",
),
(
"nm-testing/llama2.c-stories42M-gsm8k-sparse-only-compressed",
"nm-testing/llama2.c-stories42M-gsm8k-sparse-only-uncompressed",
),
(
"nm-testing/llama2.c-stories42M-gsm8k-stacked-compressed",
"nm-testing/llama2.c-stories42M-gsm8k-stacked-uncompressed",
),
]
# Flatten the list for tests that require a single list of stubs.
model_stubs = [stub for pair in compressed_uncompressed_model_stubs for stub in pair]
# For the outputs matching test, use the sparse-only pair.
sparse_compressed_model = "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-compressed"
sparse_uncompressed_model = "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-uncompressed"
prompt = "Paris is the capital of which country?"
def tearDown(self):
gc.collect()
backend_empty_cache(torch_device)
gc.collect()
def test_compressed_uncompressed_model_shapes(self):
"""
Verify that the weights of an uncompressed model and its decompressed compressed counterpart match.
Note: Weights for sparsely compressed models may differ due to packing.
"""
def _has_nested_attr(obj, attr_path):
attrs = attr_path.split(".")
for attr in attrs:
if not hasattr(obj, attr):
return None
obj = getattr(obj, attr)
return obj
from compressed_tensors.quantization.utils import iter_named_leaf_modules
for compressed_model, uncompressed_model in self.compressed_uncompressed_model_stubs:
with self.subTest(compressed_model=compressed_model, uncompressed_model=uncompressed_model):
uncompressed = AutoModelForCausalLM.from_pretrained(
uncompressed_model,
device_map="auto",
dtype="auto",
quantization_config=CompressedTensorsConfig(run_compressed=False),
)
compressed_decompressed = AutoModelForCausalLM.from_pretrained(
compressed_model,
device_map="auto",
dtype="auto",
quantization_config=CompressedTensorsConfig(run_compressed=False),
)
for name, submodule in iter_named_leaf_modules(uncompressed):
comp_decomp_obj = _has_nested_attr(compressed_decompressed, name)
if comp_decomp_obj is not None and hasattr(submodule, "weight"):
if "sparse-only" in uncompressed_model:
self.assertTrue(
torch.equal(submodule.weight, comp_decomp_obj.weight),
f"Weight mismatch for module '{name}' in sparse-only model.",
)
else:
self.assertTrue(
torch.allclose(submodule.weight, comp_decomp_obj.weight, atol=0.2),
f"Weight mismatch for module '{name}' in quantized-only or stacked model.",
)
def test_outputs_match(self):
"""
Ensure that the generated outputs match between the uncompressed model
and its decompressed compressed counterpart.
"""
tokenizer = AutoTokenizer.from_pretrained(self.sparse_uncompressed_model)
input_ids = tokenizer(self.prompt, return_tensors="pt").input_ids
uncompressed = AutoModelForCausalLM.from_pretrained(
self.sparse_uncompressed_model,
device_map="auto",
dtype="auto",
quantization_config=CompressedTensorsConfig(run_compressed=False),
)
output_uncompressed = uncompressed.generate(input_ids.to(uncompressed.device), max_new_tokens=100)
decompressed = AutoModelForCausalLM.from_pretrained(
self.sparse_compressed_model,
device_map="auto",
dtype="auto",
quantization_config=CompressedTensorsConfig(run_compressed=False),
)
output_decompressed = decompressed.generate(input_ids.to(decompressed.device), max_new_tokens=100)
self.assertEqual(
tokenizer.decode(output_uncompressed[0]),
tokenizer.decode(output_decompressed[0]),
"Generated outputs do not match between compressed and uncompressed models.",
)
def test_no_warnings_for_all_models(self):
"""
Confirm that loading any model using compressed tensors does not trigger
warnings about missing or unexpected keys.
"""
for model_stub in self.model_stubs:
with self.subTest(model_stub=model_stub):
with warnings.catch_warnings(record=True) as caught_warnings:
warnings.simplefilter("always")
AutoModelForCausalLM.from_pretrained(
model_stub,
device_map="auto",
dtype="auto",
quantization_config=CompressedTensorsConfig(run_compressed=False),
)
for warning in caught_warnings:
self.assertNotIn(
"missing keys",
str(warning.message).lower(),
f"'missing keys' found in warnings for model {model_stub}",
)
self.assertNotIn(
"unexpected keys",
str(warning.message).lower(),
f"'unexpected keys' found in warnings for model {model_stub}",
)
@require_compressed_tensors
@require_torch
class RunCompressedTest(unittest.TestCase):
tinyllama_w4a16 = "nm-testing/tinyllama-w4a16-compressed-hf-quantizer"
tinyllama_w8a8 = "nm-testing/tinyllama-w8a8-compressed-hf-quantizer"
prompt = "Paris is the capital of which country?"
stubs = [tinyllama_w4a16, tinyllama_w8a8]
def tearDown(self):
gc.collect()
backend_empty_cache(torch_device)
gc.collect()
def test_default_run_compressed__True(self):
from compressed_tensors.linear.compressed_linear import CompressedLinear
from compressed_tensors.quantization.utils import iter_named_leaf_modules
for stub in self.stubs:
model = AutoModelForCausalLM.from_pretrained(
stub,
)
compressed_linear_counts = 0
for _, submodule in iter_named_leaf_modules(
model,
):
if isinstance(submodule, CompressedLinear):
compressed_linear_counts += 1
# some linear models are not compressed - ex. lm_head
assert compressed_linear_counts > 0
def test_default_run_compressed__False(self):
from compressed_tensors.linear.compressed_linear import CompressedLinear
from compressed_tensors.quantization.utils import iter_named_leaf_modules
from transformers.utils.quantization_config import CompressedTensorsConfig
quantization_config = CompressedTensorsConfig(run_compressed=False)
for stub in self.stubs:
model = AutoModelForCausalLM.from_pretrained(
stub,
quantization_config=quantization_config,
)
compressed_linear_counts = 0
for _, submodule in iter_named_leaf_modules(
model,
):
if isinstance(submodule, CompressedLinear):
compressed_linear_counts += 1
# No modules should be CompressedLinear
assert compressed_linear_counts == 0
def test_run_compressed_outputs_match(self):
"""Check that run_compressed=True/False output are the same"""
from transformers import AutoTokenizer
from transformers.utils.quantization_config import CompressedTensorsConfig
quantization_config = CompressedTensorsConfig(run_compressed=False)
for stub in self.stubs:
tokenizer = AutoTokenizer.from_pretrained(stub)
input_ids = tokenizer(self.prompt, return_tensors="pt").input_ids
model_run_compressed__True = AutoModelForCausalLM.from_pretrained(
stub,
)
output_rc_true = model_run_compressed__True.generate(input_ids, max_new_tokens=100)
model_run_compressed__False = AutoModelForCausalLM.from_pretrained(
stub,
quantization_config=quantization_config,
)
output_rc_false = model_run_compressed__False.generate(input_ids, max_new_tokens=100)
assert tokenizer.decode(output_rc_true[0]) == tokenizer.decode(output_rc_false[0])

View File

@@ -0,0 +1,87 @@
import gc
import unittest
from transformers import AutoModelForCausalLM, AutoTokenizer, CompressedTensorsConfig
from transformers.testing_utils import backend_empty_cache, require_compressed_tensors, require_torch, torch_device
from transformers.utils import is_torch_available
if is_torch_available():
import torch
@require_compressed_tensors
@require_torch
class CompressedTensorsTest(unittest.TestCase):
tinyllama_w8a16 = "nm-testing/tinyllama-w8a16-dense-hf-quantizer"
tinyllama_w4a16 = "nm-testing/tinyllama-w4a16-compressed-hf-quantizer"
tinyllama_w8a8 = "nm-testing/tinyllama-w8a8-compressed-hf-quantizer"
llama3_8b_fp8 = "nm-testing/Meta-Llama-3-8B-Instruct-fp8-hf_compat"
prompt = "Paris is the capital of which country?"
def tearDown(self):
gc.collect()
backend_empty_cache(torch_device)
gc.collect()
def test_config_args(self):
with self.assertRaises(ValueError):
# passing quant scheme directly is not allowed
CompressedTensorsConfig(config_groups={"weights": {"num_bits": 8}})
CompressedTensorsConfig(
config_groups={"FP8": ["Linear"]},
ignore=["lm_head"],
quantization_status="frozen",
sparsity_config={"format": "dense"},
)
def test_config_to_from_dict(self):
config = CompressedTensorsConfig(config_groups={"FP8": ["Linear"]}, sparsity_config={"format": "dense"})
config_dict = config.to_dict()
config_from_dict = CompressedTensorsConfig.from_dict(config_dict)
from compressed_tensors import QuantizationConfig, SparsityCompressionConfig
self.assertIsInstance(config_from_dict.quantization_config, QuantizationConfig)
self.assertIsInstance(config_from_dict.sparsity_config, SparsityCompressionConfig)
def test_tinyllama_w8a8(self):
expected_out = "<s> Paris is the capital of which country?\n\n 1. Paris is the capital of which country?\n\n 1. Paris is the capital of which country?\n\n 1. Paris is the capital of which country?\n\n"
self._test_quantized_model(self.tinyllama_w8a8, expected_out)
def test_tinyllama_w4a16(self):
expected_out = "<s> Paris is the capital of which country?\nAnswer: Paris is the capital of France.\nQuestion: Which country is the capital of which city?\nAnswer: The capital of the city of New York is New York.\nQuestion: Which"
self._test_quantized_model(self.tinyllama_w4a16, expected_out)
def test_tinyllama_w8a16(self):
expected_out = "<s> Paris is the capital of which country?\nA. France\nB. Germany\nC. Spain\nD. Italy\nE. Switzerland\nQ10. Which of the following is not a country in the European Union?\nA."
self._test_quantized_model(self.tinyllama_w8a16, expected_out)
def test_llama_8b_fp8(self):
expected_out = "<|begin_of_text|>Paris is the capital of which country? France\nWhat is the name of the famous museum in Paris that is home to the Mona Lisa? The Louvre\nWhat is the name of the famous bridge in Paris that is often associated with the city"
self._test_quantized_model(self.llama3_8b_fp8, expected_out)
def _test_quantized_model(self, model_name: str, expected_output: str):
"""Carry out generation"""
quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
device = quantized_model.device
self.assertIsNotNone(
quantized_model.config.quantization_config,
"quantization_config should not be None",
)
self.assertTrue(
any(
key
for key, tensor in quantized_model.state_dict().items()
if "scale" in key and not torch.all(tensor == 1.0)
),
"quantized model should load a non-trivial scale into the state dict",
)
inputs = tokenizer(self.prompt, return_tensors="pt").to(device)
generated_ids = quantized_model.generate(**inputs, max_length=50, do_sample=False)
outputs = tokenizer.batch_decode(generated_ids)
self.assertIsNotNone(outputs)
self.assertEqual(outputs[0], expected_output)

View File

@@ -0,0 +1,171 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import tempfile
import unittest
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, EetqConfig, OPTForCausalLM
from transformers.testing_utils import (
backend_empty_cache,
require_accelerate,
require_eetq,
require_torch_gpu,
require_torch_multi_gpu,
slow,
torch_device,
)
from transformers.utils import is_accelerate_available, is_torch_available
if is_torch_available():
import torch
if is_accelerate_available():
from accelerate import init_empty_weights
@require_torch_gpu
class EetqConfigTest(unittest.TestCase):
def test_to_dict(self):
"""
Simple test that checks if one uses a config and converts it to a dict, the dict is the same as the config object
"""
quantization_config = EetqConfig()
config_to_dict = quantization_config.to_dict()
for key in config_to_dict:
self.assertEqual(getattr(quantization_config, key), config_to_dict[key])
def test_from_dict(self):
"""
Simple test that checks if one uses a dict and converts it to a config object, the config object is the same as the dict
"""
dict = {"modules_to_not_convert": ["lm_head.weight"], "quant_method": "eetq", "weights": "int8"}
quantization_config = EetqConfig.from_dict(dict)
self.assertEqual(dict["modules_to_not_convert"], quantization_config.modules_to_not_convert)
self.assertEqual(dict["quant_method"], quantization_config.quant_method)
self.assertEqual(dict["weights"], quantization_config.weights)
@slow
@require_torch_gpu
@require_eetq
@require_accelerate
class EetqTest(unittest.TestCase):
model_name = "facebook/opt-350m"
input_text = "What are we having for dinner?"
max_new_tokens = 9
EXPECTED_OUTPUT = "What are we having for dinner?\nI'm having a steak and a salad"
device_map = "cuda"
# called only once for all test in this class
@classmethod
def setUpClass(cls):
"""
Setup quantized model
"""
quantization_config = EetqConfig(weights="int8")
cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
cls.quantized_model = AutoModelForCausalLM.from_pretrained(
cls.model_name, device_map=cls.device_map, quantization_config=quantization_config
)
def tearDown(self):
gc.collect()
backend_empty_cache(torch_device)
gc.collect()
def test_quantized_model_conversion(self):
"""
Simple test that checks if the quantized model has been converted properly
"""
from eetq import EetqLinear
from transformers.integrations import replace_with_eetq_linear
model_id = "facebook/opt-350m"
config = AutoConfig.from_pretrained(model_id, revision="cb32f77e905cccbca1d970436fb0f5e6b58ee3c5")
quantization_config = EetqConfig(weights="int8")
with init_empty_weights():
model = OPTForCausalLM(config)
nb_linears = 0
for module in model.modules():
if isinstance(module, torch.nn.Linear):
nb_linears += 1
model = replace_with_eetq_linear(model, quantization_config=quantization_config)
nb_eetq_linear = 0
for module in model.modules():
if isinstance(module, EetqLinear):
nb_eetq_linear += 1
self.assertEqual(nb_linears - 1, nb_eetq_linear)
# Try with `modules_to_not_convert`
with init_empty_weights():
model = OPTForCausalLM(config)
quantization_config = EetqConfig(modules_to_not_convert=["fc1"])
model = replace_with_eetq_linear(model, quantization_config=quantization_config)
nb_eetq_linear = 0
for module in model.modules():
if isinstance(module, EetqLinear):
nb_eetq_linear += 1
# 25 corresponds to the lm_head along with 24 fc1 layers.
self.assertEqual(nb_linears - 25, nb_eetq_linear)
def test_quantized_model(self):
"""
Simple test that checks if the quantized model is working properly
"""
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
def test_save_pretrained(self):
"""
Simple test that checks if the quantized model is working properly after being saved and loaded
"""
with tempfile.TemporaryDirectory() as tmpdirname:
self.quantized_model.save_pretrained(tmpdirname)
model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map)
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
@require_torch_multi_gpu
def test_quantized_model_multi_gpu(self):
"""
Simple test that checks if the quantized model is working properly with multiple GPUs
set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 GPUs
"""
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
quantization_config = EetqConfig()
quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name, device_map="auto", quantization_config=quantization_config
)
self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})
output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

View File

@@ -0,0 +1,301 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import tempfile
import unittest
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, FbgemmFp8Config, OPTForCausalLM
from transformers.testing_utils import (
backend_empty_cache,
require_accelerate,
require_fbgemm_gpu,
require_read_token,
require_torch_gpu,
require_torch_multi_gpu,
slow,
torch_device,
)
from transformers.utils import is_accelerate_available, is_torch_available
if is_torch_available():
import torch
if is_accelerate_available():
from accelerate import init_empty_weights
@require_torch_gpu
class FbgemmFp8ConfigTest(unittest.TestCase):
def test_to_dict(self):
"""
Simple test that checks if one uses a config and converts it to a dict, the dict is the same as the config object
"""
quantization_config = FbgemmFp8Config()
config_to_dict = quantization_config.to_dict()
for key in config_to_dict:
self.assertEqual(getattr(quantization_config, key), config_to_dict[key])
def test_from_dict(self):
"""
Simple test that checks if one uses a dict and converts it to a config object, the config object is the same as the dict
"""
dict = {"modules_to_not_convert": ["lm_head.weight"], "quant_method": "fbgemm_fp8"}
quantization_config = FbgemmFp8Config.from_dict(dict)
self.assertEqual(dict["modules_to_not_convert"], quantization_config.modules_to_not_convert)
self.assertEqual(dict["quant_method"], quantization_config.quant_method)
@slow
@require_torch_gpu
@require_fbgemm_gpu
@require_accelerate
@require_read_token
class FbgemmFp8Test(unittest.TestCase):
model_name = "meta-llama/Meta-Llama-3-8B"
input_text = "What are we having for dinner?"
max_new_tokens = 9
EXPECTED_OUTPUT = "What are we having for dinner?\nI'm having a steak and a salad"
device_map = "cuda"
offload_device_map = {
"model.embed_tokens": 0,
"model.layers.0": 0,
"model.layers.1": 0,
"model.layers.2": 0,
"model.layers.3": 0,
"model.layers.4": 0,
"model.layers.5": 0,
"model.layers.6": 0,
"model.layers.7": 0,
"model.layers.8": 0,
"model.layers.9": 0,
"model.layers.10": 0,
"model.layers.11": 0,
"model.layers.12": 0,
"model.layers.13": 0,
"model.layers.14": 0,
"model.layers.15": 0,
"model.layers.16": "cpu",
"model.layers.17": "cpu",
"model.layers.18": "cpu",
"model.layers.19": "cpu",
"model.layers.20": "disk",
"model.layers.21": "disk",
"model.layers.22": "disk",
"model.layers.23": "disk",
"model.layers.24": "disk",
"model.layers.25": "disk",
"model.layers.26": "disk",
"model.layers.27": "disk",
"model.layers.28": "disk",
"model.layers.29": "disk",
"model.layers.30": "disk",
"model.layers.31": "disk",
"model.norm": "disk",
"lm_head": "disk",
}
# called only once for all test in this class
@classmethod
def setUpClass(cls):
"""
Setup quantized model
"""
quantization_config = FbgemmFp8Config()
cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
cls.quantized_model = AutoModelForCausalLM.from_pretrained(
cls.model_name, device_map=cls.device_map, quantization_config=quantization_config
)
def tearDown(self):
gc.collect()
backend_empty_cache(torch_device)
gc.collect()
def test_quantized_model_conversion(self):
"""
Simple test that checks if the quantized model has been converted properly
"""
from transformers.integrations import FbgemmFp8Linear, replace_with_fbgemm_fp8_linear
model_id = "facebook/opt-350m"
config = AutoConfig.from_pretrained(model_id, revision="cb32f77e905cccbca1d970436fb0f5e6b58ee3c5")
quantization_config = FbgemmFp8Config()
with init_empty_weights():
model = OPTForCausalLM(config)
nb_linears = 0
for module in model.modules():
if isinstance(module, torch.nn.Linear):
nb_linears += 1
model = replace_with_fbgemm_fp8_linear(model, quantization_config=quantization_config)
nb_fbgemm_linear = 0
for module in model.modules():
if isinstance(module, FbgemmFp8Linear):
nb_fbgemm_linear += 1
self.assertEqual(nb_linears - 1, nb_fbgemm_linear)
with init_empty_weights():
model = OPTForCausalLM(config)
quantization_config = FbgemmFp8Config(modules_to_not_convert=["fc1"])
model = replace_with_fbgemm_fp8_linear(model, quantization_config=quantization_config)
nb_fbgemm_linear = 0
for module in model.modules():
if isinstance(module, FbgemmFp8Linear):
nb_fbgemm_linear += 1
self.assertEqual(nb_linears - 25, nb_fbgemm_linear)
def test_quantized_model(self):
"""
Simple test that checks if the quantized model is working properly
"""
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
def test_save_pretrained(self):
"""
Simple test that checks if the quantized model is working properly after being saved and loaded
"""
with tempfile.TemporaryDirectory() as tmpdirname:
self.quantized_model.save_pretrained(tmpdirname)
model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map)
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
def test_change_loading_attributes(self):
"""
Simple test that checks if the quantized model is working properly after being saved and loaded
"""
with tempfile.TemporaryDirectory() as tmpdirname:
self.quantized_model.save_pretrained(tmpdirname)
quantization_config = FbgemmFp8Config(activation_scale_ub=1000.0)
model = AutoModelForCausalLM.from_pretrained(
tmpdirname, device_map=self.device_map, quantization_config=quantization_config
)
self.assertEqual(model.model.layers[1].mlp.down_proj.input_scale_ub.item(), 1000.0)
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
@require_torch_multi_gpu
def test_quantized_model_multi_gpu(self):
"""
Simple test that checks if the quantized model is working properly with multiple GPUs
set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 GPUs
"""
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
quantization_config = FbgemmFp8Config()
quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name, device_map="auto", quantization_config=quantization_config
)
self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})
output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
def test_quantized_model_offload(self):
"""
Simple test that checks if the quantized model returns an error when loading with cpu/disk offloaded
"""
quantization_config = FbgemmFp8Config()
with self.assertRaisesRegex(
ValueError, "You are attempting to load an FP8 model with a device_map that contains a CPU or disk device."
):
AutoModelForCausalLM.from_pretrained(
self.model_name, device_map=self.offload_device_map, quantization_config=quantization_config
)
def test_save_pretrained_offload(self):
"""
Simple test that checks if the saved quantized model is working properly cpu/disk offload
"""
with tempfile.TemporaryDirectory() as tmpdirname:
self.quantized_model.save_pretrained(tmpdirname)
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
quantized_model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.offload_device_map)
output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
@require_torch_multi_gpu
def test_save_pretrained_multi_gpu(self):
"""
Simple test that checks if the quantized model is working properly after being saved and loaded
"""
with tempfile.TemporaryDirectory() as tmpdirname:
self.quantized_model.save_pretrained(tmpdirname)
model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map="auto")
self.assertTrue(set(model.hf_device_map.values()) == {0, 1})
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
@require_torch_gpu
@require_accelerate
@require_fbgemm_gpu
class FbgemmFp8LinearTest(unittest.TestCase):
def test_linear_preserves_shape(self):
"""
Test that FbgemmFp8Linear preserves shape when in_features == out_features.
"""
from transformers.integrations import FbgemmFp8Linear
with init_empty_weights(include_buffers=True):
linear = FbgemmFp8Linear(1024, 1024, True)
x = torch.rand((17, 23, 1024))
x_ = linear(x)
self.assertEqual(x_.shape, x.shape)
def test_linear_with_diff_feature_size_preserves_shape(self):
"""
Test that FbgemmFp8Linear generates the correct shape when in_features != out_features.
"""
from transformers.integrations import FbgemmFp8Linear
with init_empty_weights(include_buffers=True):
linear = FbgemmFp8Linear(1024, 2048, True)
x = torch.rand((17, 23, 1024))
x_ = linear(x)
self.assertEqual(x_.shape, (17, 23, 2048))

View File

@@ -0,0 +1,287 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import tempfile
import unittest
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, FineGrainedFP8Config, OPTForCausalLM
from transformers.testing_utils import (
backend_empty_cache,
get_device_properties,
require_accelerate,
require_read_token,
require_torch_accelerator,
require_torch_multi_accelerator,
slow,
torch_device,
)
from transformers.utils import is_accelerate_available, is_torch_available
if is_torch_available():
import torch
if is_accelerate_available():
from accelerate import init_empty_weights
@require_torch_accelerator
class FineGrainedFP8ConfigTest(unittest.TestCase):
def test_to_dict(self):
"""
Simple test that checks if one uses a config and converts it to a dict, the dict is the same as the config object
"""
quantization_config = FineGrainedFP8Config()
config_to_dict = quantization_config.to_dict()
for key in config_to_dict:
self.assertEqual(getattr(quantization_config, key), config_to_dict[key])
def test_from_dict(self):
"""
Simple test that checks if one uses a dict and converts it to a config object, the config object is the same as the dict
"""
dict = {"modules_to_not_convert": ["lm_head.weight"], "quant_method": "fp8"}
quantization_config = FineGrainedFP8Config.from_dict(dict)
self.assertEqual(dict["modules_to_not_convert"], quantization_config.modules_to_not_convert)
self.assertEqual(dict["quant_method"], quantization_config.quant_method)
@slow
@require_accelerate
@require_read_token
@require_torch_accelerator
class FP8QuantizerTest(unittest.TestCase):
model_name = "meta-llama/Llama-3.2-1B"
input_text = "Once upon a time"
max_new_tokens = 10
EXPECTED_OUTPUT = "Once upon a time, there was a man who was very rich."
device_map = torch_device
offload_device_map = {
"model.embed_tokens": 0,
"model.layers.0": 0,
"model.layers.1": 0,
"model.layers.2": 0,
"model.layers.3": 0,
"model.layers.4": 0,
"model.layers.5": 0,
"model.layers.6": 0,
"model.layers.7": "cpu",
"model.layers.8": "cpu",
"model.layers.9": "cpu",
"model.layers.10": "cpu",
"model.layers.11": "cpu",
"model.layers.12": "cpu",
"model.layers.13": "cpu",
"model.layers.14": "cpu",
"model.layers.15": "cpu",
"model.rotary_emb": "disk",
"model.norm": "disk",
"lm_head": 0,
}
@classmethod
def setUpClass(cls):
"""
Setup quantized model
"""
cls.quantization_config = FineGrainedFP8Config()
cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
cls.quantized_model = AutoModelForCausalLM.from_pretrained(
cls.model_name, device_map=cls.device_map, quantization_config=cls.quantization_config
)
def tearDown(self):
gc.collect()
backend_empty_cache(torch_device)
gc.collect()
def test_quantized_model_conversion(self):
"""
Simple test that checks if the quantized model has been converted properly
"""
from transformers.integrations import FP8Linear, replace_with_fp8_linear
model_id = "facebook/opt-350m"
config = AutoConfig.from_pretrained(model_id, revision="cb32f77e905cccbca1d970436fb0f5e6b58ee3c5")
quantization_config = FineGrainedFP8Config()
with init_empty_weights():
model = OPTForCausalLM(config)
nb_linears = 0
for module in model.modules():
if isinstance(module, torch.nn.Linear):
nb_linears += 1
model = replace_with_fp8_linear(model, quantization_config=quantization_config)
nb_fp8_linear = 0
for module in model.modules():
if isinstance(module, FP8Linear):
nb_fp8_linear += 1
self.assertEqual(nb_linears - 1, nb_fp8_linear)
with init_empty_weights():
model = OPTForCausalLM(config)
quantization_config = FineGrainedFP8Config(modules_to_not_convert=["fc1"])
model = replace_with_fp8_linear(model, quantization_config=quantization_config)
nb_fp8_linear = 0
for module in model.modules():
if isinstance(module, FP8Linear):
nb_fp8_linear += 1
self.assertEqual(nb_linears - 25, nb_fp8_linear)
def test_quantized_model(self):
"""
Simple test that checks if the quantized model is working properly
"""
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(self.device_map)
output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
output_tokens = self.tokenizer.decode(output[0], skip_special_tokens=True)
self.assertEqual(output_tokens, self.EXPECTED_OUTPUT)
def test_save_pretrained(self):
"""
Simple test that checks if the quantized model is working properly after being saved and loaded
"""
with tempfile.TemporaryDirectory() as tmpdirname:
self.quantized_model.save_pretrained(tmpdirname)
model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map)
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(self.device_map)
output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
def test_weight_and_weight_scale_inv(self):
"""
Simple test that checks if the weight and weight_scale_inv are working properly
"""
weight = self.quantized_model.model.layers[0].self_attn.q_proj.weight
weight_scale_inv = self.quantized_model.model.layers[0].self_attn.q_proj.weight_scale_inv
self.assertEqual(weight.dtype, torch.float8_e4m3fn)
self.assertEqual(weight_scale_inv.dtype, torch.float32)
self.assertEqual(weight.shape, (weight_scale_inv.shape[0] * 128, weight_scale_inv.shape[1] * 128))
def test_block_size(self):
"""
Simple test that checks if the block size is working properly
"""
self.assertEqual(self.quantized_model.config.quantization_config.weight_block_size, (128, 128))
quantization_config = FineGrainedFP8Config(weight_block_size=(32, 32))
quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name, device_map=self.device_map, quantization_config=quantization_config
)
self.assertEqual(quantized_model.config.quantization_config.weight_block_size, (32, 32))
@require_torch_multi_accelerator
def test_quantized_model_multi_accelerator(self):
"""
Simple test that checks if the quantized model is working properly with multiple accelerators
set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 GPUs; or set ZE_AFFINITY_MASK=0,1 if you
have more than 2 XPUs.
"""
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(self.device_map)
quantization_config = FineGrainedFP8Config()
quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name, device_map="auto", quantization_config=quantization_config
)
self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})
output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
@require_torch_multi_accelerator
def test_save_pretrained_multi_accelerators(self):
"""
Simple test that checks if the quantized model is working properly after being saved and loaded
"""
with tempfile.TemporaryDirectory() as tmpdirname:
self.quantized_model.save_pretrained(tmpdirname)
model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map="auto")
self.assertTrue(set(model.hf_device_map.values()) == {0, 1})
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(self.device_map)
output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
def test_quantized_model_offload(self):
"""
Simple test that checks if the quantized model returns an error when loading with cpu/disk offloaded
"""
with self.assertRaisesRegex(
ValueError, "You are attempting to load an FP8 model with a device_map that contains a cpu/disk device."
):
AutoModelForCausalLM.from_pretrained(
self.model_name, device_map=self.offload_device_map, quantization_config=self.quantization_config
)
def test_save_pretrained_offload(self):
"""
Simple test that checks if the saved quantized model is working properly cpu/disk offload
"""
with tempfile.TemporaryDirectory() as tmpdirname:
self.quantized_model.save_pretrained(tmpdirname)
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(self.device_map)
quantized_model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.offload_device_map)
output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
@require_torch_accelerator
class FP8LinearTest(unittest.TestCase):
device = torch_device
@unittest.skipIf(
get_device_properties()[0] == "cuda" and get_device_properties()[1] < 9,
"Skipping FP8LinearTest because it is not supported on GPU with capability < 9.0",
)
def test_linear_preserves_shape(self):
"""
Test that FP8Linear preserves shape when in_features == out_features.
"""
from transformers.integrations import FP8Linear
linear = FP8Linear(256, 256, block_size=(128, 128), device=self.device)
x = torch.rand((1, 5, 256)).to(self.device)
x_ = linear(x)
self.assertEqual(x_.shape, x.shape)
@unittest.skipIf(
get_device_properties()[0] == "cuda" and get_device_properties()[1] < 9,
"Skipping FP8LinearTest because it is not supported on GPU with capability < 9.0",
)
def test_linear_with_diff_feature_size_preserves_shape(self):
"""
Test that FP8Linear generates the correct shape when in_features != out_features.
"""
from transformers.integrations import FP8Linear
linear = FP8Linear(128, 256, block_size=(128, 128), device=self.device)
x = torch.rand((1, 5, 128)).to(self.device)
x_ = linear(x)
self.assertEqual(x_.shape, (1, 5, 256))

View File

@@ -0,0 +1,227 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import tempfile
import unittest
from transformers import AutoModelForCausalLM, AutoTokenizer, FPQuantConfig
from transformers.testing_utils import (
backend_empty_cache,
require_accelerate,
require_fp_quant,
require_qutlass,
require_torch_gpu,
require_torch_multi_gpu,
slow,
torch_device,
)
@require_torch_gpu
class FPQuantConfigTest(unittest.TestCase):
def test_to_dict(self):
"""
Simple test that checks if one uses a config and converts it to a dict, the dict is the same as the config object
"""
quantization_config = FPQuantConfig()
config_to_dict = quantization_config.to_dict()
for key in config_to_dict:
self.assertEqual(getattr(quantization_config, key), config_to_dict[key])
def test_from_dict(self):
"""
Simple test that checks if one uses a dict and converts it to a config object, the config object is the same as the dict
"""
dict = {"modules_to_not_convert": ["embed_tokens", "lm_head"], "quant_method": "fp_quant"}
quantization_config = FPQuantConfig.from_dict(dict)
self.assertEqual(dict["modules_to_not_convert"], quantization_config.modules_to_not_convert)
self.assertEqual(dict["quant_method"], quantization_config.quant_method)
@slow
@require_torch_gpu
@require_fp_quant
@require_qutlass
@require_accelerate
class FPQuantTest(unittest.TestCase):
model_name = "unsloth/Llama-3.2-1B"
input_text = "1 2 3 4"
max_new_tokens = 4
EXPECTED_OUTPUT = "1 2 3 4 5 6"
device_map = "cuda"
# called only once for all test in this class
@classmethod
def setUpClass(cls):
"""
Setup quantized model
"""
quantization_config = FPQuantConfig(pseudoquantization=False)
cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
cls.quantized_model = AutoModelForCausalLM.from_pretrained(
cls.model_name, device_map=cls.device_map, quantization_config=quantization_config
)
def tearDown(self):
gc.collect()
backend_empty_cache(torch_device)
gc.collect()
def test_quantized_model(self):
"""
Simple test that checks if the quantized model is working properly
"""
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
def test_save_pretrained(self):
"""
Simple test that checks if the quantized model is working properly after being saved and loaded
"""
with tempfile.TemporaryDirectory() as tmpdirname:
self.quantized_model.save_pretrained(tmpdirname)
model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map)
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
@require_torch_multi_gpu
def test_quantized_model_multi_gpu(self):
"""
Simple test that checks if the quantized model is working properly with multiple GPUs
set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 GPUs
"""
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
quantization_config = FPQuantConfig()
quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name, device_map="auto", quantization_config=quantization_config
)
self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})
output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
@require_torch_multi_gpu
def test_save_pretrained_multi_gpu(self):
"""
Simple test that checks if the quantized model is working properly after being saved and loaded
"""
with tempfile.TemporaryDirectory() as tmpdirname:
self.quantized_model.save_pretrained(tmpdirname)
model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map="auto")
self.assertTrue(set(model.hf_device_map.values()) == {0, 1})
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
@slow
@require_torch_gpu
@require_fp_quant
@require_accelerate
class FPQuantPseudoquantTest(unittest.TestCase):
model_name = "unsloth/Llama-3.2-1B"
input_text = "1 2 3 4"
max_new_tokens = 4
EXPECTED_OUTPUT = "1 2 3 4 5 6"
device_map = "cuda"
# called only once for all test in this class
@classmethod
def setUpClass(cls):
"""
Setup quantized model
"""
quantization_config = FPQuantConfig(pseudoquantization=True)
cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
cls.quantized_model = AutoModelForCausalLM.from_pretrained(
cls.model_name, device_map=cls.device_map, quantization_config=quantization_config
)
def tearDown(self):
gc.collect()
backend_empty_cache(torch_device)
gc.collect()
def test_quantized_model(self):
"""
Simple test that checks if the quantized model is working properly
"""
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
def test_save_pretrained(self):
"""
Simple test that checks if the quantized model is working properly after being saved and loaded
"""
with tempfile.TemporaryDirectory() as tmpdirname:
self.quantized_model.save_pretrained(tmpdirname)
model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map)
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
@require_torch_multi_gpu
def test_quantized_model_multi_gpu(self):
"""
Simple test that checks if the quantized model is working properly with multiple GPUs
set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 GPUs
"""
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
quantization_config = FPQuantConfig(pseudoquantization=True)
quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name, device_map="auto", quantization_config=quantization_config
)
self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})
output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
@require_torch_multi_gpu
def test_save_pretrained_multi_gpu(self):
"""
Simple test that checks if the quantized model is working properly after being saved and loaded
"""
with tempfile.TemporaryDirectory() as tmpdirname:
self.quantized_model.save_pretrained(tmpdirname)
model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map="auto")
self.assertTrue(set(model.hf_device_map.values()) == {0, 1})
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,499 @@
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tempfile
import unittest
import pytest
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, GPTQConfig
from transformers.testing_utils import (
is_torch_available,
require_accelerate,
require_gptq,
require_optimum,
require_torch_gpu,
require_torch_multi_gpu,
slow,
)
from transformers.utils import is_auto_gptq_available, is_gptqmodel_available, is_ipex_available
if is_torch_available():
import torch
class GPTQConfigTest(unittest.TestCase):
def test_bits(self):
with self.assertRaises(ValueError):
GPTQConfig(bits="")
GPTQConfig(bits=1)
GPTQConfig(bits=2)
GPTQConfig(bits=4)
def test_dataset(self):
with self.assertRaises(ValueError):
GPTQConfig(bits=2, dataset="auto_gpt")
GPTQConfig(bits=2, dataset="c4")
def test_damp_percent(self):
with self.assertRaises(ValueError):
GPTQConfig(bits=2, damp_percent=10)
GPTQConfig(bits=2, damp_percent=-1)
GPTQConfig(bits=2, damp_percent="0")
GPTQConfig(bits=2, damp_percent=0.01)
def test_to_dict(self):
quantization_config = GPTQConfig(bits=2)
quantization_config.to_dict()
def test_from_dict(self):
dict = {"bits": 2}
quantization_config = GPTQConfig.from_dict(dict)
self.assertEqual(dict["bits"], quantization_config.bits)
@require_optimum
def test_optimum_config(self):
from optimum.gptq import GPTQQuantizer
config = GPTQConfig(bits=2)
optimum_config = GPTQQuantizer.from_dict(config.to_dict_optimum())
self.assertEqual(optimum_config.bits, config.bits)
new_config = GPTQConfig.from_dict_optimum(optimum_config.to_dict())
self.assertEqual(optimum_config.bits, new_config.bits)
@slow
@require_optimum
@require_gptq
class GPTQTest(unittest.TestCase):
model_name = "bigscience/bloom-560m"
input_text = "Hello my name is"
EXPECTED_OUTPUTS = set()
# flaky test: gptqmodel and auto-gptq are not output equivalent nor is string compare deterministic even between transformer/torch versions
EXPECTED_OUTPUTS.add("Hello my name is John and I am a professional photographer. I")
EXPECTED_OUTPUTS.add("Hello my name is John, I am a professional photographer and I")
EXPECTED_OUTPUTS.add("Hello my name is John, I am a student in the University of")
EXPECTED_OUTPUTS.add("Hello my name is John and I am a very good looking man.")
EXPECTED_OUTPUTS.add("Hello my name is Alyson, I am a student in the")
EXPECTED_OUTPUTS.add("Hello my name is Alyson and I am a very sweet,")
EXPECTED_OUTPUTS.add("Hello my name is Aiden, I am a student at the University")
EXPECTED_OUTPUTS.add("Hello my name is Nate and I am a member of the N")
EXPECTED_OUTPUTS.add("Hello my name is Nellie and I am a student at the")
EXPECTED_OUTPUTS.add("Hello my name is Nate and I am a new member of the")
# this seems a little small considering that we are doing 4bit quant but we have a small model and ww don't quantize the embeddings
EXPECTED_RELATIVE_DIFFERENCE = 1.664253062
bits = 4
sym = True
group_size = 128
desc_act = False
use_exllama = False
dataset = [
"auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
]
device_map = "cpu" if is_gptqmodel_available() else None
# called only once for all test in this class
@classmethod
def setUpClass(cls):
"""
Setup quantized model
"""
cls.model_fp16 = AutoModelForCausalLM.from_pretrained(
cls.model_name, dtype=torch.float16, device_map=cls.device_map
)
cls.mem_fp16 = cls.model_fp16.get_memory_footprint()
cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name, use_fast=True)
cls.config = AutoConfig.from_pretrained(cls.model_name)
cls.quantization_config = GPTQConfig(
bits=cls.bits,
dataset=cls.dataset,
tokenizer=cls.tokenizer,
group_size=cls.group_size,
desc_act=cls.desc_act,
sym=cls.sym,
use_exllama=cls.use_exllama,
)
cls.quantized_model = AutoModelForCausalLM.from_pretrained(
cls.model_name,
dtype=torch.float16,
device_map=cls.device_map,
quantization_config=cls.quantization_config,
)
def test_memory_footprint(self):
r"""
A simple test to check if the model conversion has been done correctly by checking on the
memory footprint of the converted model
"""
mem_quantized = self.quantized_model.get_memory_footprint()
self.assertAlmostEqual(self.mem_fp16 / mem_quantized, self.EXPECTED_RELATIVE_DIFFERENCE, places=4)
def test_device_and_dtype_assignment(self):
r"""
Test whether trying to cast (or assigning a device to) a model after quantization will throw an error.
Checks also if other models are casted correctly.
"""
# This should work
if self.device_map in (None, "cpu"):
_ = self.quantized_model.to(0)
with self.assertRaises(ValueError):
# Tries with a `dtype``
self.quantized_model.to(torch.float16)
def test_original_dtype(self):
r"""
A simple test to check if the model successfully stores the original dtype
"""
self.assertTrue(hasattr(self.quantized_model.config, "_pre_quantization_dtype"))
self.assertFalse(hasattr(self.model_fp16.config, "_pre_quantization_dtype"))
self.assertTrue(self.quantized_model.config._pre_quantization_dtype == torch.float16)
def test_quantized_layers_class(self):
"""
Simple test to check if the model conversion has been done correctly by checking on
the class type of the linear layers of the converted models
"""
if is_gptqmodel_available():
from gptqmodel.utils.importer import hf_select_quant_linear
if hasattr(self.config, "quantization_config"):
checkpoint_format = self.config.quantization_config.get("checkpoint_format")
meta = self.config.quantization_config.get("meta")
else:
checkpoint_format = "gptq"
meta = None
QuantLinear = hf_select_quant_linear(
bits=self.bits,
group_size=self.group_size,
desc_act=self.desc_act,
sym=self.sym,
device_map=self.device_map,
checkpoint_format=checkpoint_format,
meta=meta,
backend=self.quantization_config.backend,
)
elif is_auto_gptq_available():
from auto_gptq.utils.import_utils import dynamically_import_QuantLinear as hf_select_quant_linear
QuantLinear = hf_select_quant_linear(
use_triton=False,
desc_act=self.desc_act,
group_size=self.group_size,
bits=self.bits,
disable_exllama=not self.use_exllama,
disable_exllamav2=True,
)
self.assertTrue(self.quantized_model.transformer.h[0].mlp.dense_4h_to_h.__class__ == QuantLinear)
def check_inference_correctness(self, model):
r"""
Test the generation quality of the quantized model and see that we are matching the expected output.
Given that we are operating on small numbers + the testing model is relatively small, we might not get
the same output across GPUs. So we'll generate few tokens (5-10) and check their output.
"""
# Check that inference pass works on the model
encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
# Check the exactness of the results
output_sequences = model.generate(input_ids=encoded_input["input_ids"].to(model.device), max_new_tokens=10)
# Get the generation
self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
def check_quantized_layers_type(self, model, value):
self.assertTrue(model.transformer.h[0].mlp.dense_4h_to_h.QUANT_TYPE == value)
def test_generate_quality(self):
"""
Simple test to check the quality of the model by comparing the generated tokens with the expected tokens
"""
if self.device_map is None:
self.check_inference_correctness(self.quantized_model.to(0))
else:
if self.device_map == "cpu" and self.quantized_model.device.type != "cpu":
self.quantized_model.to("cpu")
self.check_inference_correctness(self.quantized_model)
def test_serialization(self):
"""
Test the serialization of the model and the loading of the quantized weights works
"""
with tempfile.TemporaryDirectory() as tmpdirname:
self.quantized_model.save_pretrained(tmpdirname)
if is_auto_gptq_available() and not is_gptqmodel_available():
quant_type = "cuda-old" if not self.use_exllama else "exllama"
if not self.use_exllama:
quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
tmpdirname, quantization_config=GPTQConfig(use_exllama=False, bits=4)
)
if self.device_map != "cpu":
quantized_model_from_saved = quantized_model_from_saved.to(0)
else:
quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
tmpdirname, device_map=self.device_map
)
else:
if self.device_map == "cpu":
quant_type = "ipex" if is_ipex_available() else "torch"
else:
# We expect tritonv2 to be used here, because exllama backend doesn't support packing https://github.com/ModelCloud/GPTQModel/issues/1354
# TODO: Remove this once GPTQModel exllama kernels supports packing
quant_type = "tritonv2"
quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
tmpdirname, device_map=self.device_map
)
self.check_quantized_layers_type(quantized_model_from_saved, quant_type)
self.check_inference_correctness(quantized_model_from_saved)
@require_accelerate
def test_serialization_big_model_inference(self):
"""
Test the serialization of the model and the loading of the quantized weights with big model inference
"""
with tempfile.TemporaryDirectory() as tmpdirname:
self.quantized_model.save_pretrained(tmpdirname)
device_map = self.device_map or "auto"
quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=device_map)
self.check_inference_correctness(quantized_model_from_saved)
@require_torch_gpu
class GPTQTestCUDA(GPTQTest):
device_map = {"": 0}
def test_change_loading_attributes(self):
"""
Test the serialization of the model and the loading of the quantized weights works with another config file
"""
with tempfile.TemporaryDirectory() as tmpdirname:
self.quantized_model.save_pretrained(tmpdirname)
if is_auto_gptq_available() and not is_gptqmodel_available() and not self.use_exllama:
self.check_quantized_layers_type(self.quantized_model, "cuda-old")
# we need to put it directly to the gpu. Otherwise, we won't be able to initialize the exllama kernel
quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
tmpdirname, quantization_config=GPTQConfig(use_exllama=True, bits=4), device_map=self.device_map
)
self.assertEqual(quantized_model_from_saved.config.quantization_config.bits, self.bits)
self.check_quantized_layers_type(quantized_model_from_saved, "exllama")
self.check_inference_correctness(quantized_model_from_saved)
@require_accelerate
@require_torch_multi_gpu
class GPTQTestDeviceMap(GPTQTestCUDA):
device_map = "auto"
@require_accelerate
@require_torch_multi_gpu
class GPTQTestDeviceMapExllama(GPTQTestCUDA):
device_map = "auto"
use_exllama = True
@slow
@require_optimum
@require_gptq
@require_torch_gpu
@require_accelerate
class GPTQTestActOrderExllama(unittest.TestCase):
"""
Test GPTQ model with exllama kernel and desc_act=True (also known as act-order).
More information on those arguments here:
https://huggingface.co/docs/transformers/main_classes/quantization#transformers.GPTQConfig
"""
EXPECTED_OUTPUTS = set()
# flaky test: gptqmodel and auto-gptq are not output equivalent nor is string compare deterministic even between transformer/torch versions
EXPECTED_OUTPUTS.add("Hello, how are you ? I'm doing good, thanks for asking.")
# 4bit + act_order + 128g
model_name = "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ"
input_text = "Hello, how are you ?"
@classmethod
def setUpClass(cls):
"""
Setup quantized model
"""
cls.quantization_config = GPTQConfig(bits=4, max_input_length=4028)
cls.quantized_model = AutoModelForCausalLM.from_pretrained(
cls.model_name,
dtype=torch.float16,
device_map={"": 0},
quantization_config=cls.quantization_config,
)
cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name, use_fast=True)
def check_inference_correctness(self, model):
"""
Test the generation quality of the quantized model and see that we are matching the expected output.
Given that we are operating on small numbers + the testing model is relatively small, we might not get
the same output across GPUs. So we'll generate few tokens (5-10) and check their output.
"""
# Check that inference pass works on the model
encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
# Check the exactness of the results
output_sequences = model.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10)
# Get the generation
self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
def test_quantized_layers_type(self):
self.assertTrue(self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE == "exllama")
def test_generate_quality(self):
"""
Simple test to check the quality of the model by comparing the generated tokens with the expected tokens
"""
self.check_inference_correctness(self.quantized_model)
def test_max_input_length(self):
"""
Test if the max_input_length works. It modifies the maximum input length that of the model that runs with exllama backend.
"""
prompt = "I am in Paris and" * 1000
inp = self.tokenizer(prompt, return_tensors="pt").to(0)
self.assertTrue(inp["input_ids"].shape[1] > 4028)
with self.assertRaises(RuntimeError) as cm:
self.quantized_model.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3)
self.assertTrue("temp_state buffer is too small" in str(cm.exception))
prompt = "I am in Paris and"
inp = self.tokenizer(prompt, return_tensors="pt").to(0)
self.assertTrue(inp["input_ids"].shape[1] < 4028)
self.quantized_model.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3)
@slow
@require_optimum
@require_gptq
@require_torch_gpu
@require_accelerate
class GPTQTestExllamaV2(unittest.TestCase):
"""
Test GPTQ model with exllamav2 kernel and desc_act=True (also known as act-order).
More information on those arguments here:
https://huggingface.co/docs/transformers/main_classes/quantization#transformers.GPTQConfig
"""
EXPECTED_OUTPUTS = set()
# flaky test: gptqmodel and auto-gptq are not output equivalent nor is string compare deterministic even between transformer/torch versions
EXPECTED_OUTPUTS.add("Hello, how are you ? I'm doing good, thanks for asking.")
# 4bit + act_order + 128g
model_name = "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ"
input_text = "Hello, how are you ?"
@classmethod
def setUpClass(cls):
"""
Setup quantized model
"""
cls.quantization_config = GPTQConfig(bits=4, exllama_config={"version": 2})
cls.quantized_model = AutoModelForCausalLM.from_pretrained(
cls.model_name,
dtype=torch.float16,
device_map={"": 0},
quantization_config=cls.quantization_config,
)
cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name, use_fast=True)
def test_quantized_layers_type(self):
if is_auto_gptq_available() and not is_gptqmodel_available():
self.assertEqual(
self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE,
"exllamav2",
)
else:
# We expect tritonv2 to be used here, because exllama backend doesn't support packing https://github.com/ModelCloud/GPTQModel/issues/1354
# TODO: Remove this once GPTQModel exllama kernels supports packing
self.assertEqual(
self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE,
"tritonv2",
)
def check_inference_correctness(self, model):
"""
Test the generation quality of the quantized model and see that we are matching the expected output.
Given that we are operating on small numbers + the testing model is relatively small, we might not get
the same output across GPUs. So we'll generate few tokens (5-10) and check their output.
"""
# Check that inference pass works on the model
encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
# Check the exactness of the results
output_sequences = model.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10)
# Get the generation
self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
def test_generate_quality(self):
"""
Simple test to check the quality of the model by comparing the generated tokens with the expected tokens
"""
self.check_inference_correctness(self.quantized_model)
# fail when run all together
@pytest.mark.skip
@require_accelerate
@require_torch_multi_gpu
class GPTQTestDeviceMapCPUOffload(GPTQTest):
device_map = {
"transformer.word_embeddings": 0,
"transformer.word_embeddings_layernorm": 0,
"lm_head": 0,
"transformer.h.0": 0,
"transformer.h.1": 0,
"transformer.h.2": 0,
"transformer.h.3": 0,
"transformer.h.4": 0,
"transformer.h.5": 0,
"transformer.h.6": 0,
"transformer.h.7": 0,
"transformer.h.8": 0,
"transformer.h.9": 0,
"transformer.h.10": 1,
"transformer.h.11": 1,
"transformer.h.12": 1,
"transformer.h.13": 1,
"transformer.h.14": 1,
"transformer.h.15": 1,
"transformer.h.16": 1,
"transformer.h.17": 0,
"transformer.h.18": "cpu",
"transformer.h.19": "cpu",
"transformer.h.20": "cpu",
"transformer.h.21": "cpu",
"transformer.h.22": "cpu",
"transformer.h.23": 1,
"transformer.ln_f": 0,
}

View File

@@ -0,0 +1,197 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import tempfile
import unittest
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, HiggsConfig, OPTForCausalLM
from transformers.testing_utils import (
backend_empty_cache,
require_accelerate,
require_flute_hadamard,
require_torch_gpu,
require_torch_multi_gpu,
slow,
torch_device,
)
from transformers.utils import is_accelerate_available, is_torch_available
if is_torch_available():
import torch
if is_accelerate_available():
from accelerate import init_empty_weights
@require_torch_gpu
class HiggsConfigTest(unittest.TestCase):
def test_to_dict(self):
"""
Simple test that checks if one uses a config and converts it to a dict, the dict is the same as the config object
"""
quantization_config = HiggsConfig()
config_to_dict = quantization_config.to_dict()
for key in config_to_dict:
self.assertEqual(getattr(quantization_config, key), config_to_dict[key])
def test_from_dict(self):
"""
Simple test that checks if one uses a dict and converts it to a config object, the config object is the same as the dict
"""
dict = {"modules_to_not_convert": ["embed_tokens", "lm_head"], "quant_method": "higgs"}
quantization_config = HiggsConfig.from_dict(dict)
self.assertEqual(dict["modules_to_not_convert"], quantization_config.modules_to_not_convert)
self.assertEqual(dict["quant_method"], quantization_config.quant_method)
@slow
@require_torch_gpu
@require_flute_hadamard
@require_accelerate
# @require_read_token
class HiggsTest(unittest.TestCase):
model_name = "unsloth/Llama-3.2-1B"
input_text = "Font test: A quick brown fox jumps over the"
max_new_tokens = 2
EXPECTED_OUTPUT = "Font test: A quick brown fox jumps over the lazy dog"
device_map = "cuda"
# called only once for all test in this class
@classmethod
def setUpClass(cls):
"""
Setup quantized model
"""
quantization_config = HiggsConfig()
cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
cls.quantized_model = AutoModelForCausalLM.from_pretrained(
cls.model_name, device_map=cls.device_map, quantization_config=quantization_config
)
def tearDown(self):
gc.collect()
backend_empty_cache(torch_device)
gc.collect()
def test_quantized_model_conversion(self):
"""
Simple test that checks if the quantized model has been converted properly
"""
from transformers.integrations import HiggsLinear, replace_with_higgs_linear
model_id = "facebook/opt-350m"
config = AutoConfig.from_pretrained(model_id, revision="cb32f77e905cccbca1d970436fb0f5e6b58ee3c5")
quantization_config = HiggsConfig()
with init_empty_weights():
model = OPTForCausalLM(config)
nb_linears = 0
for module in model.modules():
if isinstance(module, torch.nn.Linear):
nb_linears += 1
model, _ = replace_with_higgs_linear(model, quantization_config=quantization_config)
nb_higgs_linear = 0
for module in model.modules():
if isinstance(module, HiggsLinear):
nb_higgs_linear += 1
self.assertEqual(nb_linears - 1, nb_higgs_linear)
with init_empty_weights():
model = OPTForCausalLM(config)
quantization_config = HiggsConfig(modules_to_not_convert=["fc1"])
model, _ = replace_with_higgs_linear(model, quantization_config=quantization_config)
nb_higgs_linear = 0
for module in model.modules():
if isinstance(module, HiggsLinear):
nb_higgs_linear += 1
self.assertEqual(nb_linears - 24, nb_higgs_linear)
def test_quantized_model(self):
"""
Simple test that checks if the quantized model is working properly
"""
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
def test_save_pretrained(self):
"""
Simple test that checks if the quantized model is working properly after being saved and loaded
"""
with tempfile.TemporaryDirectory() as tmpdirname:
self.quantized_model.save_pretrained(tmpdirname)
model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map)
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
@require_torch_multi_gpu
def test_quantized_model_multi_gpu(self):
"""
Simple test that checks if the quantized model is working properly with multiple GPUs
set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 GPUs
"""
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
quantization_config = HiggsConfig()
quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name, device_map="auto", quantization_config=quantization_config
)
self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})
output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
@require_torch_multi_gpu
def test_save_pretrained_multi_gpu(self):
"""
Simple test that checks if the quantized model is working properly after being saved and loaded
"""
with tempfile.TemporaryDirectory() as tmpdirname:
self.quantized_model.save_pretrained(tmpdirname)
model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map="auto")
self.assertTrue(set(model.hf_device_map.values()) == {0, 1})
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
@unittest.skip("This will almost surely OOM. Enable when switched to a smaller model")
def test_dequantize(self):
"""
Test the ability to dequantize a model
"""
self.quantized_model.dequantize()
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

View File

@@ -0,0 +1,314 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import unittest
import accelerate
from transformers import AutoModelForCausalLM, AutoTokenizer, HqqConfig
from transformers.testing_utils import (
backend_empty_cache,
require_accelerate,
require_deterministic_for_xpu,
require_hqq,
require_torch_accelerator,
require_torch_multi_accelerator,
slow,
torch_device,
)
from transformers.utils import is_hqq_available, is_torch_available
if is_torch_available():
import torch
if is_hqq_available():
from hqq.core.quantize import HQQBackend, HQQLinear
class HQQLLMRunner:
def __init__(self, model_id, quant_config, compute_dtype, device, cache_dir=None):
self.model = AutoModelForCausalLM.from_pretrained(
model_id,
dtype=compute_dtype,
device_map=device,
quantization_config=quant_config,
cache_dir=cache_dir,
)
self.tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
self.device = self.model.device
HQQLinear.set_backend(HQQBackend.PYTORCH)
def cleanup():
backend_empty_cache(torch_device)
gc.collect()
def check_hqqlayer(test_module, hqq_layer, batch_size=1, context_size=1024):
# Test HQQ layer
W_dequant = hqq_layer.dequantize() # Reconstructed weights
inputs = (
torch.randn(
(batch_size, context_size, hqq_layer.meta["shape"][1]),
device=hqq_layer.device,
dtype=hqq_layer.compute_dtype,
)
/ 10.0
)
with torch.no_grad():
outputs = hqq_layer(inputs)
test_module.assertEqual(outputs.shape[-1], W_dequant.shape[0])
test_module.assertEqual(outputs.dtype, hqq_layer.compute_dtype)
del W_dequant, inputs, outputs
cleanup()
def check_forward(test_module, model, batch_size=1, context_size=1024):
# Test forward pass
with torch.no_grad():
out = model(torch.zeros([batch_size, context_size], device=model.device, dtype=torch.int32)).logits
test_module.assertEqual(out.shape[0], batch_size)
test_module.assertEqual(out.shape[1], context_size)
cleanup()
MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
@require_torch_accelerator
@require_hqq
class HqqConfigTest(unittest.TestCase):
def test_to_dict(self):
"""
Makes sure the config format is properly set
"""
quantization_config = HqqConfig()
hqq_orig_config = quantization_config.to_dict()
self.assertEqual(quantization_config.quant_config, hqq_orig_config["quant_config"])
@slow
@require_torch_accelerator
@require_accelerate
@require_hqq
class HQQTest(unittest.TestCase):
def tearDown(self):
cleanup()
def test_fp16_quantized_model(self):
"""
Simple LLM model testing fp16
"""
quant_config = HqqConfig(nbits=8, group_size=64)
hqq_runner = HQQLLMRunner(
model_id=MODEL_ID, quant_config=quant_config, compute_dtype=torch.float16, device=torch_device
)
check_hqqlayer(self, hqq_runner.model.model.layers[0].self_attn.v_proj)
check_forward(self, hqq_runner.model)
def test_quantized_model_to_new_device_and_new_dtype(self):
"""
Simple LLM model testing different devices and dtypes
"""
quant_config = HqqConfig(nbits=8, group_size=64)
hqq_runner = HQQLLMRunner(
model_id=MODEL_ID, quant_config=quant_config, compute_dtype=torch.float16, device=torch_device
)
check_hqqlayer(self, hqq_runner.model.model.layers[0].self_attn.v_proj)
check_forward(self, hqq_runner.model)
# Remove `accelerate` hooks to enable move the model to a new device
accelerate.hooks.remove_hook_from_module(hqq_runner.model, recurse=True)
hqq_runner.model.to("cpu", torch.bfloat16)
check_hqqlayer(self, hqq_runner.model.model.layers[0].self_attn.v_proj)
check_forward(self, hqq_runner.model)
hqq_runner.model.to(torch_device)
check_hqqlayer(self, hqq_runner.model.model.layers[0].self_attn.v_proj)
check_forward(self, hqq_runner.model)
def test_quantized_model_fake_weight_dtype(self):
quant_config = HqqConfig(nbits=8, group_size=64)
hqq_runner = HQQLLMRunner(
model_id=MODEL_ID, quant_config=quant_config, compute_dtype=torch.float16, device=torch_device
)
# We use a hack to inject a fake weight to HQQLinear. Check that it works
self.assertEqual(hqq_runner.model.model.layers[0].self_attn.v_proj.weight.dtype, torch.float16)
@slow
@require_torch_accelerator
@require_torch_multi_accelerator
@require_accelerate
@require_hqq
class HQQTestMultiGPU(unittest.TestCase):
def tearDown(self):
cleanup()
def test_fp16_quantized_model_multipgpu(self):
"""
Simple LLM model testing fp16 with multi-gpu
"""
quant_config = HqqConfig(nbits=8, group_size=64)
hqq_runner = HQQLLMRunner(
model_id=MODEL_ID, quant_config=quant_config, compute_dtype=torch.float16, device="auto"
)
check_hqqlayer(self, hqq_runner.model.model.layers[0].self_attn.v_proj)
check_forward(self, hqq_runner.model)
@slow
@require_torch_accelerator
@require_accelerate
@require_hqq
class HQQTestBias(unittest.TestCase):
def tearDown(self):
cleanup()
def test_fp16_quantized_model(self):
"""
Simple LLM model testing fp16 with bias
"""
quant_config = HqqConfig(nbits=8, group_size=64)
hqq_runner = HQQLLMRunner(
model_id="facebook/opt-125m", quant_config=quant_config, compute_dtype=torch.float16, device=torch_device
)
check_hqqlayer(self, hqq_runner.model.model.decoder.layers[0].self_attn.v_proj)
check_forward(self, hqq_runner.model)
@require_deterministic_for_xpu
def test_save_and_load_quantized_model(self):
"""
Test saving and loading a quantized model with bias
"""
import tempfile
quant_config = HqqConfig(nbits=8, group_size=64)
hqq_runner = HQQLLMRunner(
model_id="facebook/opt-125m", quant_config=quant_config, compute_dtype=torch.float16, device=torch_device
)
input_tensor = torch.zeros((1, 8), dtype=torch.int32, device=torch_device)
# Get reference logits
with torch.no_grad():
logits_ref = hqq_runner.model.forward(input_tensor).logits
with tempfile.TemporaryDirectory() as tmpdirname:
hqq_runner.model.save_pretrained(tmpdirname)
del hqq_runner.model
backend_empty_cache(torch_device)
model_loaded = AutoModelForCausalLM.from_pretrained(
tmpdirname, dtype=torch.float16, device_map=torch_device
)
with torch.no_grad():
logits_loaded = model_loaded.forward(input_tensor).logits
self.assertEqual((logits_loaded - logits_ref).abs().mean().item(), 0)
@slow
@require_torch_accelerator
@require_accelerate
@require_hqq
class HQQSerializationTest(unittest.TestCase):
def tearDown(self):
cleanup()
def test_model_serialization(self):
"""
Simple HQQ LLM save/load test
"""
quant_config = HqqConfig(nbits=4, group_size=64)
hqq_runner = HQQLLMRunner(
model_id=MODEL_ID, quant_config=quant_config, compute_dtype=torch.float16, device=torch_device
)
input_tensor = torch.zeros((1, 8), dtype=torch.int32, device=torch_device)
with torch.no_grad():
logits_ref = hqq_runner.model.forward(input_tensor).logits
# Save
saved_model_id = "quant_model"
hqq_runner.model.save_pretrained(saved_model_id)
# Remove old model
del hqq_runner.model
backend_empty_cache(torch_device)
# Load and check if the logits match
model_loaded = AutoModelForCausalLM.from_pretrained(
"quant_model",
dtype=torch.float16,
device_map=torch_device,
)
with torch.no_grad():
logits_loaded = model_loaded.forward(input_tensor).logits
self.assertEqual((logits_loaded - logits_ref).abs().mean().item(), 0)
def test_model_serialization_dynamic_quant_with_skip(self):
"""
Simple HQQ LLM save/load test with dynamic quant
"""
q4_config = {"nbits": 4, "group_size": 64}
q3_config = {"nbits": 3, "group_size": 64}
quant_config = HqqConfig(
dynamic_config={
"self_attn.q_proj": q4_config,
"self_attn.k_proj": q4_config,
"self_attn.v_proj": q4_config,
"self_attn.o_proj": q4_config,
"mlp.gate_proj": q3_config,
"mlp.up_proj": q3_config,
},
skip_modules=["lm_head", "down_proj"],
)
hqq_runner = HQQLLMRunner(
model_id=MODEL_ID, quant_config=quant_config, compute_dtype=torch.float16, device=torch_device
)
model = hqq_runner.model
input_tensor = torch.zeros((1, 8), dtype=torch.int32, device=torch_device)
with torch.no_grad():
model.forward(input_tensor).logits
self.assertEqual(isinstance(model.model.layers[1].mlp.down_proj, torch.nn.Linear), True)
self.assertEqual(model.model.layers[1].self_attn.v_proj.quant_config["weight_quant_params"]["nbits"], 4)
self.assertEqual(model.model.layers[1].mlp.gate_proj.quant_config["weight_quant_params"]["nbits"], 3)

View File

@@ -0,0 +1,505 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import tempfile
import unittest
from unittest.mock import patch
from transformers import AutoTokenizer, GptOssForCausalLM, Mxfp4Config
from transformers.testing_utils import (
require_kernels,
require_torch,
require_torch_gpu,
require_torch_large_gpu,
require_triton,
slow,
)
from transformers.utils import (
is_torch_available,
)
if is_torch_available():
import torch
class Mxfp4ConfigTest(unittest.TestCase):
def test_basic_config_creation(self):
"""Test basic configuration creation with default values"""
config = Mxfp4Config()
self.assertEqual(config.quant_method.value, "mxfp4")
self.assertIsNone(config.modules_to_not_convert)
self.assertFalse(config.dequantize)
def test_config_with_modules_to_not_convert(self):
"""Test configuration with modules to not convert"""
modules = ["model.layers.*.self_attn", "lm_head"]
config = Mxfp4Config(modules_to_not_convert=modules)
self.assertEqual(config.modules_to_not_convert, modules)
def test_config_with_dequantize(self):
"""Test configuration with dequantize enabled"""
config = Mxfp4Config(dequantize=True)
self.assertTrue(config.dequantize)
def test_get_loading_attributes(self):
"""Test get_loading_attributes method"""
config = Mxfp4Config(dequantize=True)
attrs = config.get_loading_attributes()
self.assertEqual(attrs["dequantize"], True)
def test_to_dict(self):
"""Test configuration serialization to dict"""
config = Mxfp4Config(modules_to_not_convert=["lm_head"], dequantize=True)
config_dict = config.to_dict()
self.assertEqual(config_dict["quant_method"], "mxfp4")
self.assertEqual(config_dict["modules_to_not_convert"], ["lm_head"])
# we don't keep dequantize in config_dict
self.assertTrue("dequantize" not in config_dict)
def test_from_dict(self):
"""Test configuration creation from dict"""
config_dict = {"quant_method": "mxfp4", "modules_to_not_convert": ["lm_head"], "dequantize": True}
config = Mxfp4Config.from_dict(config_dict)
self.assertEqual(config.modules_to_not_convert, ["lm_head"])
self.assertTrue(config.dequantize)
class Mxfp4QuantizerTest(unittest.TestCase):
"""Test the Mxfp4HfQuantizer class"""
def setUp(self):
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
def test_quantizer_validation_no_torch(self):
"""Test quantizer validation when torch is not available"""
with patch("transformers.quantizers.quantizer_mxfp4.is_torch_available", return_value=False):
from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer
config = Mxfp4Config()
quantizer = Mxfp4HfQuantizer(config)
with self.assertRaises(ImportError):
quantizer.validate_environment()
def test_quantizer_validation_no_cuda(self):
"""Test quantizer validation when CUDA is not available"""
with patch("torch.cuda.is_available", return_value=False):
from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer
config = Mxfp4Config()
quantizer = Mxfp4HfQuantizer(config)
quantizer.pre_quantized = False
with self.assertRaises(RuntimeError):
quantizer.validate_environment()
def test_quantizer_validation_low_compute_capability(self):
"""Test quantizer validation with low compute capability"""
with patch("torch.cuda.get_device_capability", return_value=(7, 0)):
from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer
config = Mxfp4Config()
quantizer = Mxfp4HfQuantizer(config)
quantizer.pre_quantized = False
with self.assertRaises(ValueError):
quantizer.validate_environment()
def test_quantizer_validation_low_compute_capability_with_prequantized(self):
"""Test quantizer validation with low compute capability"""
with patch("torch.cuda.get_device_capability", return_value=(7, 0)):
from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer
config = Mxfp4Config()
quantizer = Mxfp4HfQuantizer(config)
# Should automatically set dequantize=True and warn
quantizer.validate_environment()
self.assertTrue(quantizer.quantization_config.dequantize)
def test_quantizer_validation_low_compute_capability_with_dequantize(self):
"""Test quantizer validation with low compute capability but dequantize enabled"""
with patch("torch.cuda.get_device_capability", return_value=(7, 0)):
from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer
config = Mxfp4Config(dequantize=True)
quantizer = Mxfp4HfQuantizer(config)
# Should not raise error with dequantize=True
try:
quantizer.validate_environment()
except ValueError as e:
if "compute capability" in str(e):
self.fail("Should not raise compute capability error when dequantize=True")
def test_quantizer_validation_order_dequantize_before_cuda_check(self):
"""Test that dequantize check happens before CUDA availability check"""
# Mock torch.cuda.is_available
with patch("torch.cuda.is_available", return_value=False):
from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer
# Test with dequantize=True - should pass even without CUDA and accelerate
config = Mxfp4Config(dequantize=True)
quantizer = Mxfp4HfQuantizer(config)
# This should not raise any error because dequantize check comes first
quantizer.validate_environment()
# Test with dequantize=False - should still fail due to missing CUDA
config = Mxfp4Config(dequantize=False)
quantizer = Mxfp4HfQuantizer(config)
quantizer.pre_quantized = False
with self.assertRaises(RuntimeError):
quantizer.validate_environment()
def test_quantizer_validation_missing_triton(self):
"""Test quantizer validation when triton is not available"""
with (
patch("transformers.quantizers.quantizer_mxfp4.is_triton_available", return_value=False),
patch("transformers.quantizers.quantizer_mxfp4.is_kernels_available", return_value=False),
):
from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer
config = Mxfp4Config()
quantizer = Mxfp4HfQuantizer(config)
quantizer.pre_quantized = False
with self.assertRaises(ValueError):
quantizer.validate_environment()
def test_quantizer_validation_missing_triton_pre_quantized_no_dequantize(self):
"""Test quantizer validation when triton is not available but model is pre-quantized and dequantize is False"""
with (
patch("transformers.quantizers.quantizer_mxfp4.is_triton_available", return_value=False),
patch("transformers.quantizers.quantizer_mxfp4.is_kernels_available", return_value=False),
):
from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer
config = Mxfp4Config()
quantizer = Mxfp4HfQuantizer(config)
quantizer.pre_quantized = True
# Should automatically set dequantize=True and warn
quantizer.validate_environment()
self.assertTrue(quantizer.quantization_config.dequantize)
def test_update_dtype(self):
"""Test torch dtype updating"""
from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer
config = Mxfp4Config()
quantizer = Mxfp4HfQuantizer(config)
# Should default to bfloat16
result_dtype = quantizer.update_dtype(None)
self.assertEqual(result_dtype, torch.bfloat16)
# Should preserve existing dtype
result_dtype = quantizer.update_dtype(torch.float32)
self.assertEqual(result_dtype, torch.float32)
def test_update_expected_keys(self):
"""Test expected keys updating for quantized models"""
from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer
config = Mxfp4Config()
quantizer = Mxfp4HfQuantizer(config)
expected_keys = [
"model.layers.0.mlp.experts.gate_up_proj",
"model.layers.0.mlp.experts.down_proj",
"model.embed_tokens.weight",
]
updated_keys = quantizer.update_expected_keys(None, expected_keys, [])
expected_updated = [
"model.layers.0.mlp.experts.gate_up_proj_blocks",
"model.layers.0.mlp.experts.gate_up_proj_scales",
"model.layers.0.mlp.experts.down_proj_blocks",
"model.layers.0.mlp.experts.down_proj_scales",
"model.embed_tokens.weight",
]
self.assertEqual(set(updated_keys), set(expected_updated))
def test_update_param_name_dequantize(self):
"""Test parameter name updating when dequantizing"""
from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer
config = Mxfp4Config(dequantize=True)
quantizer = Mxfp4HfQuantizer(config)
# Should remove _blocks suffix
param_name = "model.layers.0.mlp.experts.gate_up_proj_blocks"
updated_name = quantizer.update_param_name(param_name)
self.assertEqual(updated_name, "model.layers.0.mlp.experts.gate_up_proj")
# Should remove _scales suffix
param_name = "model.layers.0.mlp.experts.down_proj_scales"
updated_name = quantizer.update_param_name(param_name)
self.assertEqual(updated_name, "model.layers.0.mlp.experts.down_proj")
# Should not change other names
param_name = "model.embed_tokens.weight"
updated_name = quantizer.update_param_name(param_name)
self.assertEqual(updated_name, "model.embed_tokens.weight")
def test_update_param_name_no_dequantize(self):
"""Test parameter name updating when not dequantizing"""
from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer
config = Mxfp4Config(dequantize=False)
quantizer = Mxfp4HfQuantizer(config)
param_name = "model.layers.0.mlp.experts.gate_up_proj_blocks"
updated_name = quantizer.update_param_name(param_name)
self.assertEqual(updated_name, param_name)
def test_is_trainable(self):
"""Test trainability"""
from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer
config = Mxfp4Config()
quantizer = Mxfp4HfQuantizer(config)
# MXFP4 is not trainable
self.assertFalse(quantizer.is_trainable)
class Mxfp4IntegrationTest(unittest.TestCase):
"""Test mxfp4 integration functions"""
def test_should_convert_module(self):
"""Test module conversion decision logic"""
from transformers.integrations.mxfp4 import should_convert_module
# Should convert by default
self.assertTrue(should_convert_module(["model", "layers", "0", "mlp"], []))
# Should not convert if in exclusion list
patterns = ["model.layers.*.self_attn", "lm_head"]
self.assertFalse(should_convert_module(["model", "layers", "0", "self_attn"], patterns))
self.assertFalse(should_convert_module(["lm_head"], patterns))
# Should convert if not in exclusion list
self.assertTrue(should_convert_module(["model", "layers", "0", "mlp", "experts"], patterns))
@require_torch
def test_convert_moe_packed_tensors(self):
"""Test unpacking of quantized tensors"""
from transformers.integrations.mxfp4 import convert_moe_packed_tensors
# Create dummy packed tensors
blocks = torch.randint(0, 255, (2, 4, 8, 16), dtype=torch.uint8)
scales = torch.randint(100, 150, (2, 4, 8), dtype=torch.uint8)
result = convert_moe_packed_tensors(blocks, scales, dtype=torch.bfloat16)
self.assertEqual(result.shape, (2, 8 * 16 * 2, 4))
self.assertEqual(result.dtype, torch.bfloat16)
@require_triton(min_version="3.4.0")
@require_kernels
@require_torch_gpu
@require_torch
def test_quantize_to_mxfp4(self):
"""Test quantization function"""
from transformers.integrations.mxfp4 import quantize_to_mxfp4
from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer
config = Mxfp4Config()
quantizer = Mxfp4HfQuantizer(config)
# Create dummy weight tensor
w = torch.randn(32, 64, 128, dtype=torch.bfloat16, device=torch.device("cuda"))
quantized_w, w_scale = quantize_to_mxfp4(w, quantizer._lazy_import_kernels())
# Check that shapes are reasonable
self.assertEqual(quantized_w.dtype, torch.uint8)
@require_torch
@require_torch_large_gpu
@require_triton(min_version="3.4.0")
@require_kernels
@slow
class Mxfp4ModelTest(unittest.TestCase):
"""Test mxfp4 with actual models (requires specific model and hardware)"""
# These should be paths to real OpenAI MoE models for proper testing
model_name = "openai/gpt-oss-20b"
input_text = "Once upon a time"
# Expected outputs for generation tests
EXPECTED_OUTPUTS = set()
EXPECTED_OUTPUTS.add("Once upon a time, in a small town, there lived a young")
def setUp(self):
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
def tearDown(self):
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
def check_inference_correctness_quantized(self, model, tokenizer):
# Check that inference pass works on the model
encoded_input = tokenizer(self.input_text, return_tensors="pt").to(model.device)
# Set pad token if not set
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
with torch.no_grad():
output_sequences = model.generate(
**encoded_input,
max_new_tokens=10,
do_sample=False,
pad_token_id=tokenizer.eos_token_id,
use_cache=False,
)
generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
self.assertIn(generated_text, self.EXPECTED_OUTPUTS)
def test_gpt_oss_model_loading_quantized_with_device_map(self):
"""Test loading OpenAI MoE model with mxfp4 quantization and device_map"""
model = GptOssForCausalLM.from_pretrained(
self.model_name,
dtype=torch.bfloat16,
device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(self.model_name)
self.check_inference_correctness_quantized(model, tokenizer)
def test_gpt_oss_model_loading_dequantized_with_device_map(self):
"""Test loading OpenAI MoE model with mxfp4 dequantization and device_map"""
quantization_config = Mxfp4Config(dequantize=True)
# Test that config is properly set up
self.assertTrue(quantization_config.dequantize)
model = GptOssForCausalLM.from_pretrained(
self.model_name,
quantization_config=quantization_config,
dtype=torch.bfloat16,
device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(self.model_name)
self.check_inference_correctness_quantized(model, tokenizer)
def test_model_device_map_validation(self):
"""Test device map validation"""
from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer
config = Mxfp4Config()
quantizer = Mxfp4HfQuantizer(config)
quantizer.pre_quantized = False
# Test with CPU in device map (should raise error for non-pre-quantized)
with self.assertRaises(ValueError):
quantizer.validate_environment(device_map={"": "cpu"})
def test_memory_footprint_comparison(self):
"""Test memory footprint differences between quantized and unquantized models"""
# Expected: quantized < dequantized < unquantized memory usage
quantization_config = Mxfp4Config(dequantize=True)
quantized_model = GptOssForCausalLM.from_pretrained(
self.model_name,
dtype=torch.bfloat16,
device_map="auto",
)
dequantized_model = GptOssForCausalLM.from_pretrained(
self.model_name,
dtype=torch.bfloat16,
device_map="auto",
quantization_config=quantization_config,
)
quantized_mem = quantized_model.get_memory_footprint()
dequantized_mem = dequantized_model.get_memory_footprint()
self.assertLess(quantized_mem, dequantized_mem)
def test_save_mxfp4(self):
"""Test saving quantized OpenAI MoE model with device_map"""
model = GptOssForCausalLM.from_pretrained(
self.model_name,
torch_dtype=torch.bfloat16,
device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(self.model_name)
with tempfile.TemporaryDirectory() as tmp:
# Save the model in mxfp4 format
model.save_pretrained(tmp)
torch.cuda.empty_cache()
gc.collect()
# test quantized model
loaded_model = GptOssForCausalLM.from_pretrained(
tmp,
torch_dtype=torch.bfloat16,
device_map="auto",
)
self.check_inference_correctness_quantized(loaded_model, tokenizer)
# test dequantized model
loaded_model = GptOssForCausalLM.from_pretrained(
tmp,
quantization_config=Mxfp4Config(dequantize=True),
torch_dtype=torch.bfloat16,
device_map="auto",
)
self.check_inference_correctness_quantized(loaded_model, tokenizer)
def test_save_mxfp4_non_quantized(self):
"""Test saving dequantized OpenAI MoE model with mxfp4 quantization and device_map"""
non_quantized_model_name = "hf-internal-testing/gpt-oss-20b-bf16"
tokenizer = AutoTokenizer.from_pretrained(non_quantized_model_name)
loaded_model = GptOssForCausalLM.from_pretrained(
non_quantized_model_name,
quantization_config=Mxfp4Config(),
torch_dtype=torch.bfloat16,
device_map="auto",
)
# save the quantized model
with tempfile.TemporaryDirectory() as tmp:
loaded_model.save_pretrained(tmp)
torch.cuda.empty_cache()
gc.collect()
# load it back to check with everything works as expected
loaded_model = GptOssForCausalLM.from_pretrained(
tmp,
torch_dtype=torch.bfloat16,
device_map="auto",
)
self.check_inference_correctness_quantized(loaded_model, tokenizer)
loaded_model = GptOssForCausalLM.from_pretrained(
tmp,
quantization_config=Mxfp4Config(dequantized=True),
torch_dtype=torch.bfloat16,
device_map="auto",
)
self.check_inference_correctness_quantized(loaded_model, tokenizer)

View File

@@ -0,0 +1,475 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tempfile
import unittest
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, QuantoConfig
from transformers.testing_utils import (
require_accelerate,
require_optimum_quanto,
require_read_token,
require_torch_accelerator,
slow,
torch_device,
)
from transformers.utils import is_accelerate_available, is_optimum_quanto_available, is_torch_available
if is_torch_available():
import torch
from transformers import LlamaForCausalLM
if is_accelerate_available():
from accelerate import init_empty_weights
if is_optimum_quanto_available():
from optimum.quanto import QLayerNorm, QLinear
from transformers.integrations.quanto import replace_with_quanto_layers
class QuantoConfigTest(unittest.TestCase):
def test_attributes(self):
pass
@require_optimum_quanto
@require_accelerate
class QuantoTestIntegration(unittest.TestCase):
model_id = "facebook/opt-350m"
def setUp(self):
config = AutoConfig.from_pretrained(self.model_id)
with init_empty_weights():
self.model = AutoModelForCausalLM.from_config(config)
self.nb_linear = 0
self.nb_layernorm = 0
for module in self.model.modules():
if isinstance(module, torch.nn.Linear):
self.nb_linear += 1
elif isinstance(module, torch.nn.LayerNorm):
self.nb_layernorm += 1
def test_weight_only_quantization_conversion(self):
"""
Simple test that checks if the quantized model has been converted properly when using weight only quantization
"""
# Try with weight only quantization
quantization_config = QuantoConfig(weights="int8", activations=None)
self.model, _ = replace_with_quanto_layers(self.model, quantization_config=quantization_config)
nb_qlinear = 0
for module in self.model.modules():
if isinstance(module, QLinear):
nb_qlinear += 1
self.assertEqual(self.nb_linear, nb_qlinear)
def test_weight_and_activation_quantization_conversion(self):
"""
Simple test that checks if the quantized model has been converted properly when using weight + activation quantization
"""
# Try with weight + activation quantization
quantization_config = QuantoConfig(weights="int8", activations="int8")
self.model, _ = replace_with_quanto_layers(self.model, quantization_config=quantization_config)
nb_qlinear = 0
nb_qlayernorm = 0
for module in self.model.modules():
if isinstance(module, QLinear):
nb_qlinear += 1
if isinstance(module, QLayerNorm):
nb_qlayernorm += 1
self.assertEqual(self.nb_linear, nb_qlinear)
self.assertEqual(self.nb_layernorm, nb_qlayernorm)
def test_conversion_with_modules_to_not_convert(self):
"""
Simple test that checks if the quantized model has been converted properly when specifying modules_to_not_convert argument
"""
# Try with weight + activatioin quantization
quantization_config = QuantoConfig(weights="int8", activations="int8")
self.model, _ = replace_with_quanto_layers(
self.model, quantization_config=quantization_config, modules_to_not_convert=["lm_head"]
)
nb_qlinear = 0
nb_qlayernorm = 0
for module in self.model.modules():
if isinstance(module, QLinear):
nb_qlinear += 1
if isinstance(module, QLayerNorm):
nb_qlayernorm += 1
self.assertEqual(self.nb_linear - 1, nb_qlinear)
@slow
@require_torch_accelerator
@require_optimum_quanto
@require_accelerate
class QuantoQuantizationTest(unittest.TestCase):
"""
Test 8-bit weights only quantization
"""
model_name = "bigscience/bloom-560m"
weights = "int8"
activations = None
device_map = "cpu"
input_text = "Hello my name is"
EXPECTED_OUTPUTS = "Hello my name is John, I am a professional photographer and I"
def setUp(self):
"""
Setup quantized model
"""
quantization_config = QuantoConfig(
weights=self.weights,
activations=self.activations,
)
self.quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name,
device_map=self.device_map,
quantization_config=quantization_config,
dtype=torch.float32,
)
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
self.have_accelerate_hooks = (
getattr(self.quantized_model, "hf_device_map", False) and len(self.quantized_model.hf_device_map) > 1
)
def check_inference_correctness(self, model, device):
r"""
Test the generation quality of the quantized model and see that we are matching the expected output.
Given that we are operating on small numbers + the testing model is relatively small, we might not get
the same output across GPUs. So we'll generate few tokens (5-10) and check their output.
"""
if not self.have_accelerate_hooks:
model.to(device)
encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
output_sequences = model.generate(input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10)
self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
def test_generate_quality_cpu(self):
"""
Simple test to check the quality of the model on cpu by comparing the generated tokens with the expected tokens
"""
self.check_inference_correctness(self.quantized_model, "cpu")
def test_generate_quality_accelerator(self):
"""
Simple test to check the quality of the model on accelerators by comparing the generated tokens with the expected tokens
"""
self.check_inference_correctness(self.quantized_model, torch_device)
def test_quantized_model_layers(self):
from optimum.quanto import QBitsTensor, QModuleMixin, QTensor
"""
Suite of simple test to check if the layers are quantized and are working properly
"""
# Test the type of the quantized layer
self.assertTrue(isinstance(self.quantized_model.transformer.h[0].self_attention.query_key_value, QModuleMixin))
self.assertTrue(
isinstance(self.quantized_model.transformer.h[0].self_attention.query_key_value.weight, QTensor)
)
if self.weights == "int4":
self.assertTrue(
isinstance(self.quantized_model.transformer.h[0].self_attention.query_key_value.weight, QBitsTensor)
)
# check that the lm_head was indeed not quantized, just like bnb
self.assertTrue(
isinstance(self.quantized_model.lm_head, torch.nn.Linear)
and not isinstance(self.quantized_model.lm_head, QModuleMixin)
)
if self.device_map in ["cpu", "cuda"]:
self.assertEqual(
self.quantized_model.transformer.h[0].self_attention.query_key_value.weight._data.device.type,
self.device_map,
)
self.quantized_model.to(0)
self.assertEqual(
self.quantized_model.transformer.h[0].self_attention.query_key_value.weight._data.device.type, torch_device
)
def test_serialization_bin(self):
"""
Test the serialization, the loading and the inference of the quantized weights
"""
with tempfile.TemporaryDirectory() as tmpdirname:
with self.assertRaises(ValueError) as e:
self.quantized_model.save_pretrained(tmpdirname, safe_serialization=False)
self.assertIn(
"The model is quantized with QuantizationMethod.QUANTO and is not serializable", str(e.exception)
)
# TODO: replace by the following when it works
# quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
# tmpdirname, dtype=torch.float32, device_map="cpu"
# )
# self.check_inference_correctness(quantized_model_from_saved, device="cuda")
def test_serialization_safetensors(self):
"""
Test the serialization, the loading and the inference of the quantized weights
"""
with tempfile.TemporaryDirectory() as tmpdirname:
with self.assertRaises(ValueError) as e:
self.quantized_model.save_pretrained(tmpdirname)
self.assertIn(
"The model is quantized with QuantizationMethod.QUANTO and is not serializable", str(e.exception)
)
# quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
# tmpdirname, dtype=torch.float32, device_map="cpu"
# )
# self.check_inference_correctness(quantized_model_from_saved, device="cuda")
def check_same_model(self, model1, model2):
d0 = dict(model1.named_parameters())
d1 = dict(model2.named_parameters())
self.assertTrue(d0.keys() == d1.keys())
for k in d0:
self.assertTrue(d0[k].shape == d1[k].shape)
self.assertTrue(d0[k].device.type == d1[k].device.type)
self.assertTrue(d0[k].device == d1[k].device)
self.assertTrue(d0[k].dtype == d1[k].dtype)
self.assertTrue(torch.equal(d0[k], d1[k].to(d0[k].device)))
def test_compare_with_quanto(self):
from optimum.quanto import freeze, qint4, qint8, quantize
w_mapping = {"int8": qint8, "int4": qint4}
model = AutoModelForCausalLM.from_pretrained(
self.model_name,
device_map=self.device_map,
dtype=torch.float32,
)
# we do not quantize the lm_head since we don't do that in transformers
quantize(model.transformer, weights=w_mapping[self.weights])
freeze(model.transformer)
self.check_same_model(model, self.quantized_model)
self.check_inference_correctness(model, device=torch_device)
@unittest.skip
def test_load_from_quanto_saved(self):
from optimum.quanto import freeze, qint4, qint8, quantize
from transformers import QuantoConfig
w_mapping = {"int8": qint8, "int4": qint4}
model = AutoModelForCausalLM.from_pretrained(
self.model_name,
device_map=self.device_map,
dtype=torch.float32,
)
# we do not quantize the lm_head since we don't do that in transformers
quantize(model.transformer, weights=w_mapping[self.weights])
freeze(model.transformer)
with tempfile.TemporaryDirectory() as tmpdirname:
model.config.quantization_config = QuantoConfig(
weights=self.weights, activations=self.activations, modules_to_not_convert=["lm_head"]
)
model.save_pretrained(tmpdirname, safe_serialization=False)
quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
tmpdirname,
device_map=self.device_map,
dtype=torch.float32,
)
self.check_same_model(model, quantized_model_from_saved)
self.check_inference_correctness(quantized_model_from_saved, device="cuda")
class QuantoQuantizationOffloadTest(QuantoQuantizationTest):
device_map = {
"transformer.word_embeddings": 0,
"transformer.word_embeddings_layernorm": 0,
"transformer.ln_f": 0,
"transformer.h.0": 0,
"transformer.h.1": 0,
"transformer.h.2": 0,
"transformer.h.3": 0,
"transformer.h.4": 0,
"transformer.h.5": 0,
"transformer.h.6": 0,
"transformer.h.7": 0,
"transformer.h.8": 0,
"transformer.h.9": 0,
"transformer.h.10": 0,
"transformer.h.11": 0,
"transformer.h.12": 0,
"transformer.h.13": 0,
"transformer.h.14": 0,
"transformer.h.15": 0,
"transformer.h.16": 0,
"transformer.h.17": 0,
"transformer.h.18": 0,
"transformer.h.19": 0,
"transformer.h.20": 0,
"transformer.h.21": 0,
"transformer.h.22": "cpu",
"transformer.h.23": "disk",
"lm_head": 0,
}
@unittest.skip(reason="The execution device is a gpu")
def test_generate_quality_cpu(self):
pass
@unittest.skip(reason="We can't save offloaded values")
def test_serialization_bin(self):
pass
@unittest.skip
def test_serialization_safetensors(self):
pass
@unittest.skip
def test_compare_with_quanto(self):
pass
@unittest.skip
def test_load_from_quanto_saved(self):
pass
def test_check_offload_quantized(self):
"""
We check that we have unquantized value in the cpu and in the disk
"""
from optimum.quanto import QBitsTensor, QTensor
cpu_weights = self.quantized_model.transformer.h[22].self_attention.query_key_value._hf_hook.weights_map[
"weight"
]
disk_weights = self.quantized_model.transformer.h[23].self_attention.query_key_value._hf_hook.weights_map[
"weight"
]
self.assertTrue(isinstance(cpu_weights, torch.Tensor) and not isinstance(cpu_weights, QTensor))
self.assertTrue(isinstance(disk_weights, torch.Tensor) and not isinstance(disk_weights, QTensor))
if self.weights == "int4":
self.assertTrue(isinstance(cpu_weights, torch.Tensor) and not isinstance(disk_weights, QBitsTensor))
self.assertTrue(isinstance(disk_weights, torch.Tensor) and not isinstance(disk_weights, QBitsTensor))
@unittest.skip(reason="Skipping test class because serialization is not supported yet")
class QuantoQuantizationSerializationTest(QuantoQuantizationTest):
"""
Perform the same tests as in QuantoQuantizationTest but with a serialized model.
"""
def setUp(self):
"""
Setup quantized model
"""
quantization_config = QuantoConfig(
weights=self.weights,
activations=self.activations,
)
quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name,
device_map=self.device_map,
quantization_config=quantization_config,
dtype=torch.float32,
)
with tempfile.TemporaryDirectory() as tmpdirname:
quantized_model.save_pretrained(tmpdirname, safe_serialization=False)
self.quantized_model = AutoModelForCausalLM.from_pretrained(
tmpdirname, dtype=torch.float32, device_map=self.device_map
)
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
self.have_accelerate_hooks = (
getattr(self.quantized_model, "hf_device_map", False) and len(self.quantized_model.hf_device_map) > 1
)
@unittest.skip(reason="Skipping test class because serialization is not supported yet")
class QuantoQuantizationSerializationCudaTest(QuantoQuantizationTest):
"""
Perform the same tests as in QuantoQuantizationTest but with model on cuda
"""
device_map = "cuda:0"
class QuantoQuantizationQBitsTensorTest(QuantoQuantizationTest):
EXPECTED_OUTPUTS = "Hello my name is John, I am a professional photographer, I"
weights = "int4"
class QuantoQuantizationQBitsTensorOffloadTest(QuantoQuantizationOffloadTest):
EXPECTED_OUTPUTS = [
"Hello my name is John, I am a professional photographer, I", # CUDA output
"Hello my name is Nils, I am a student of the University", # XPU output
]
weights = "int4"
@unittest.skip(reason="Skipping test class because serialization is not supported yet")
class QuantoQuantizationQBitsTensorSerializationTest(QuantoQuantizationSerializationTest):
EXPECTED_OUTPUTS = "Hello my name is John, I am a professional photographer, I"
weights = "int4"
@require_torch_accelerator
class QuantoQuantizationActivationTest(unittest.TestCase):
def test_quantize_activation(self):
quantization_config = QuantoConfig(
weights="int8",
activations="int8",
)
with self.assertRaises(ValueError) as e:
AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m", quantization_config=quantization_config)
self.assertIn("We don't support quantizing the activations with transformers library", str(e.exception))
@require_optimum_quanto
@require_torch_accelerator
class QuantoKVCacheQuantizationTest(unittest.TestCase):
@slow
@require_read_token
def test_quantized_cache(self):
EXPECTED_TEXT_COMPLETION = [
"Simply put, the theory of relativity states that 1) time and space are not absolute, but are relative to the observer, and 2) the laws of physics are the same everywhere in the universe. This means that the speed of light is",
"My favorite all time favorite condiment is ketchup. I love how it adds a sweet and tangy flavor to my food. I also enjoy using it as a dip for fries, burgers, and grilled meats. It's a classic condiment that never",
]
prompts = [
"Simply put, the theory of relativity states that ",
"My favorite all time favorite condiment is ketchup.",
]
tokenizer = AutoTokenizer.from_pretrained(
"unsloth/Llama-3.2-1B-Instruct", pad_token="</s>", padding_side="left"
)
model = LlamaForCausalLM.from_pretrained(
"unsloth/Llama-3.2-1B-Instruct", device_map="sequential", dtype=torch.float16
)
inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(torch_device)
generated_ids = model.generate(**inputs, max_new_tokens=40, do_sample=False, cache_implementation="quantized")
text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
self.assertEqual(EXPECTED_TEXT_COMPLETION, text)

View File

@@ -0,0 +1,153 @@
# Copyright 2025 Advanced Micro Devices, Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, GenerationConfig, QuarkConfig
from transformers.testing_utils import (
cleanup,
is_torch_available,
require_accelerate,
require_quark,
require_torch_gpu,
require_torch_multi_gpu,
slow,
torch_device,
)
from transformers.utils.import_utils import is_quark_available
if is_torch_available():
import torch
if is_quark_available():
from quark.torch.export.nn.modules.qparamslinear import QParamsLinear
@require_quark
class QuarkConfigTest(unittest.TestCase):
def test_common_args(self):
config = AutoConfig.from_pretrained("amd/Llama-3.1-8B-Instruct-w-int8-a-int8-sym-test")
QuarkConfig(**config.quantization_config)
@slow
@require_quark
@require_torch_gpu
class QuarkTest(unittest.TestCase):
reference_model_name = "unsloth/Meta-Llama-3.1-8B-Instruct"
quantized_model_name = "amd/Llama-3.1-8B-Instruct-w-int8-a-int8-sym-test"
input_text = "Today I am in Paris and"
EXPECTED_OUTPUTS = set()
EXPECTED_OUTPUTS.add("Today I am in Paris and I am not in Paris, France\nToday I am in Paris, Illinois")
EXPECTED_OUTPUTS.add("Today I am in Paris and I am enjoying the city of light. I am not just any ordinary Paris")
EXPECTED_OUTPUTS.add("Today I am in Paris and I am enjoying my day off! The sun is shining, the birds are")
EXPECTED_OUTPUTS.add("Today I am in Paris and I'm here to tell you about it. It's a beautiful day,")
EXPECTED_OUTPUTS.add("Today I am in Paris and I am not in Paris at all! I am not in Paris, but")
EXPECTED_RELATIVE_DIFFERENCE = 1.66
device_map = None
@classmethod
def setUpClass(cls):
"""
Setup reference & quantized model
"""
cls.model_fp16 = AutoModelForCausalLM.from_pretrained(
cls.reference_model_name, dtype=torch.float16, device_map=cls.device_map
)
cls.mem_fp16 = cls.model_fp16.get_memory_footprint()
cls.tokenizer = AutoTokenizer.from_pretrained(cls.reference_model_name, use_fast=True)
cls.quantized_model = AutoModelForCausalLM.from_pretrained(
cls.quantized_model_name,
dtype=torch.float16,
device_map=cls.device_map,
)
def tearDown(self):
r"""
TearDown function needs to be called at the end of each test to free the accelerator memory and cache, also to
avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
"""
cleanup(torch_device, gc_collect=True)
def test_memory_footprint(self):
mem_quantized = self.quantized_model.get_memory_footprint()
self.assertTrue(self.mem_fp16 / mem_quantized > self.EXPECTED_RELATIVE_DIFFERENCE)
def test_device_and_dtype_assignment(self):
r"""
Test whether trying to cast (or assigning a device to) a model after quantization will throw an error.
Checks also if other models are casted correctly .
"""
# This should work
if self.device_map is None:
_ = self.quantized_model.to(0)
with self.assertRaises(ValueError):
# Tries with a `dtype``
self.quantized_model.to(torch.float16)
def test_original_dtype(self):
r"""
A simple test to check if the model successfully stores the original dtype
"""
self.assertTrue(hasattr(self.quantized_model.config, "_pre_quantization_dtype"))
self.assertFalse(hasattr(self.model_fp16.config, "_pre_quantization_dtype"))
self.assertTrue(self.quantized_model.config._pre_quantization_dtype == torch.float16)
self.assertTrue(isinstance(self.quantized_model.model.layers[0].mlp.gate_proj, QParamsLinear))
def check_inference_correctness(self, model):
r"""
Test the generation quality of the quantized model and see that we are matching the expected output.
Given that we are operating on small numbers + the testing model is relatively small, we might not get
the same output across GPUs. So we'll generate few tokens (5-10) and check their output.
"""
# Check that inference pass works on the model
encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
gen_config = GenerationConfig(
max_new_tokens=15,
min_new_tokens=15,
use_cache=True,
num_beams=1,
do_sample=False,
)
# Check the exactness of the results
output_sequences = model.generate(input_ids=encoded_input["input_ids"].to(0), generation_config=gen_config)
# Get the generation
self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
def test_generate_quality(self):
"""
Simple test to check the quality of the model by comparing the generated tokens with the expected tokens
"""
if self.device_map is None:
self.check_inference_correctness(self.quantized_model.to(0))
else:
self.check_inference_correctness(self.quantized_model)
@require_accelerate
@require_torch_multi_gpu
@require_quark
class QuarkTestDeviceMap(QuarkTest):
device_map = "auto"

View File

@@ -0,0 +1,246 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import tempfile
import unittest
import pytest
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, SpQRConfig, StaticCache
from transformers.testing_utils import (
backend_empty_cache,
require_accelerate,
require_spqr,
require_torch_gpu,
require_torch_multi_gpu,
slow,
torch_device,
)
from transformers.utils import is_accelerate_available, is_torch_available
if is_torch_available():
import torch
if is_accelerate_available():
from accelerate import init_empty_weights
@require_torch_gpu
class SpQRConfigTest(unittest.TestCase):
def test_to_dict(self):
"""
Simple test that checks if one uses a config and converts it to a dict, the dict is the same as the config object
"""
quantization_config = SpQRConfig()
config_to_dict = quantization_config.to_dict()
for key in config_to_dict:
self.assertEqual(getattr(quantization_config, key), config_to_dict[key])
def test_from_dict(self):
"""
Simple test that checks if one uses a dict and converts it to a config object, the config object is the same as the dict
"""
dict = {
"beta1": 16,
"beta2": 16,
"bits": 3,
"modules_to_not_convert": ["lm_head.weight"],
"shapes": {"model.layers.0.self_attn.q_proj.dense_weights.shape": 16},
}
quantization_config = SpQRConfig.from_dict(dict)
self.assertEqual(dict["beta1"], quantization_config.beta1)
self.assertEqual(dict["beta2"], quantization_config.beta2)
self.assertEqual(dict["bits"], quantization_config.bits)
self.assertEqual(dict["modules_to_not_convert"], quantization_config.modules_to_not_convert)
self.assertEqual(dict["shapes"], quantization_config.shapes)
@slow
@require_torch_gpu
@require_spqr
@require_accelerate
class SpQRTest(unittest.TestCase):
model_name = "elvircrn/Llama-2-7b-SPQR-3Bit-16x16-red_pajama-hf"
input_text = "Hello my name is"
max_new_tokens = 32
EXPECTED_OUTPUT = (
"Hello my name is Jesse. (I'm also known as Jesse) I'm a 25 year old male from United States. I'm looking for"
)
EXPECTED_OUTPUT_COMPILE = "Hello my name is Jake and I am a 20 year old student at the University of North Texas. (Go Mean Green!) I am a huge fan of the Dallas"
# called only once for all test in this class
@classmethod
def setUpClass(cls):
"""
Setup quantized model
"""
cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
cls.quantized_model = AutoModelForCausalLM.from_pretrained(
cls.model_name,
device_map=torch_device,
)
def tearDown(self):
gc.collect()
backend_empty_cache(torch_device)
gc.collect()
def test_quantized_model_conversion(self):
"""
Simple test that checks if the quantized model has been converted properly
"""
from spqr_quant import QuantizedLinear
from transformers.integrations import replace_with_spqr_linear
model_id = "meta-llama/Llama-2-7b-hf"
config = AutoConfig.from_pretrained(model_id)
quantization_config = AutoConfig.from_pretrained(self.model_name, return_dict=False).quantization_config
quantization_config = SpQRConfig.from_dict(quantization_config)
with init_empty_weights():
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id, config=config)
nb_linears = 0
for module in model.modules():
if isinstance(module, torch.nn.Linear):
nb_linears += 1
model, _ = replace_with_spqr_linear(
model,
quantization_config=quantization_config,
modules_to_not_convert=quantization_config.modules_to_not_convert,
)
nb_spqr_linear = 0
for module in model.modules():
if isinstance(module, QuantizedLinear):
nb_spqr_linear += 1
self.assertEqual(nb_linears - 1, nb_spqr_linear)
def test_quantized_model(self):
"""
Simple test that checks if the quantized model is working properly
"""
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
def test_raise_if_non_quantized(self):
model_id = "meta-llama/Llama-2-7b-hf"
quantization_config = SpQRConfig()
with self.assertRaises(ValueError):
_ = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
@unittest.skip
def test_save_pretrained(self):
"""
Simple test that checks if the quantized model is working properly after being saved and loaded
"""
with tempfile.TemporaryDirectory() as tmpdirname:
self.quantized_model.save_pretrained(tmpdirname)
model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=torch_device)
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
@require_torch_multi_gpu
def test_quantized_model_multi_gpu(self):
"""
Simple test that checks if the quantized model is working properly with multiple GPUs
"""
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map="auto")
self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})
output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
@pytest.mark.torch_compile_test
def test_quantized_model_compile(self):
"""
Simple test that checks if the quantized model is working properly
"""
# Sample tokens greedily
def decode_one_tokens(model, cur_token, input_pos, cache_position, past_key_values):
logits = model(
cur_token,
position_ids=input_pos,
cache_position=cache_position,
past_key_values=past_key_values,
return_dict=False,
use_cache=True,
)[0]
new_token = torch.argmax(logits[:, [-1]], dim=-1).to(torch.int)
return new_token
# Tokenize the test input
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)["input_ids"]
seq_length = input_ids.shape[1]
# Setup static KV cache for generation
past_key_values = StaticCache(
config=self.quantized_model.config, max_cache_len=seq_length + self.max_new_tokens + 1
)
# Allocate token ids to be generated and copy prefix ids
cache_position = torch.arange(seq_length, device=torch_device)
generated_ids = torch.zeros(1, seq_length + self.max_new_tokens, dtype=torch.int, device=torch_device)
generated_ids[:, cache_position] = input_ids.to(torch_device).to(torch.int)
# Do a forward pass to fill the prefix cache and compile the kernels if necessary
logits = self.quantized_model(
input_ids,
cache_position=cache_position,
past_key_values=past_key_values,
return_dict=False,
use_cache=True,
)[0]
next_token = torch.argmax(logits[:, [-1]], dim=-1).to(torch.int)
generated_ids[:, [seq_length]] = next_token
with torch.no_grad():
# Compile the CUDA graph
decode_one_tokens = torch.compile(decode_one_tokens, mode="default", backend="inductor", fullgraph=True)
# Generate tokens one by one
cache_position = torch.tensor([seq_length + 1], device=torch_device)
for _ in range(1, self.max_new_tokens):
with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_mem_efficient=False, enable_math=True):
next_token = decode_one_tokens(
self.quantized_model, next_token.clone(), None, cache_position, past_key_values
)
generated_ids.index_copy_(1, cache_position, next_token)
cache_position += 1
# Check generated text
self.assertEqual(
self.tokenizer.decode(generated_ids[0], skip_special_tokens=True), self.EXPECTED_OUTPUT_COMPILE
)

View File

@@ -0,0 +1,636 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import importlib.metadata
import tempfile
import unittest
from packaging import version
from parameterized import parameterized
from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
from transformers.testing_utils import (
Expectations,
backend_empty_cache,
get_device_properties,
require_torch_accelerator,
require_torch_multi_accelerator,
require_torchao,
require_torchao_version_greater_or_equal,
torch_device,
)
from transformers.utils import is_torch_available, is_torchao_available
if is_torch_available():
import torch
if is_torchao_available():
import torchao
# renamed in torchao 0.7.0, please install the latest torchao
from torchao.dtypes import (
AffineQuantizedTensor,
TensorCoreTiledLayout,
)
from torchao.quantization import (
Int8WeightOnlyConfig,
IntxWeightOnlyConfig,
MappingType,
ModuleFqnToConfig,
PerAxis,
)
from torchao.quantization.autoquant import AQMixin
if version.parse(importlib.metadata.version("torchao")) >= version.parse("0.8.0"):
from torchao.dtypes import Int4CPULayout
if version.parse(importlib.metadata.version("torchao")) >= version.parse("0.11.0"):
from torchao.dtypes import Int4XPULayout
def check_torchao_int4_wo_quantized(test_module, qlayer):
weight = qlayer.weight
test_module.assertEqual(weight.quant_min, 0)
test_module.assertEqual(weight.quant_max, 15)
test_module.assertTrue(isinstance(weight, AffineQuantizedTensor))
layout = None
if weight.device.type == "cpu":
layout = Int4CPULayout
elif weight.device.type == "xpu":
layout = Int4XPULayout
elif weight.device.type == "cuda":
layout = TensorCoreTiledLayout
test_module.assertTrue(isinstance(weight.tensor_impl._layout, layout))
def check_autoquantized(test_module, qlayer):
weight = qlayer.weight
test_module.assertTrue(isinstance(weight, AQMixin))
def check_forward(test_module, model, batch_size=1, context_size=1024):
# Test forward pass
with torch.no_grad():
out = model(torch.zeros([batch_size, context_size], device=model.device, dtype=torch.int32)).logits
test_module.assertEqual(out.shape[0], batch_size)
test_module.assertEqual(out.shape[1], context_size)
@require_torchao
@require_torchao_version_greater_or_equal("0.8.0")
class TorchAoConfigTest(unittest.TestCase):
def test_to_dict(self):
"""
Makes sure the config format is properly set
"""
quantization_config = TorchAoConfig("int4_weight_only")
torchao_orig_config = quantization_config.to_dict()
for key in torchao_orig_config:
self.assertEqual(getattr(quantization_config, key), torchao_orig_config[key])
def test_post_init_check(self):
"""
Test kwargs validations in TorchAoConfig
"""
_ = TorchAoConfig("int4_weight_only")
with self.assertRaisesRegex(ValueError, "Unsupported string quantization type"):
_ = TorchAoConfig("fp6")
with self.assertRaisesRegex(ValueError, "Unexpected keyword arg"):
_ = TorchAoConfig("int4_weight_only", group_size1=32)
def test_repr(self):
"""
Check that there is no error in the repr
"""
quantization_config = TorchAoConfig("int4_weight_only", modules_to_not_convert=["conv"], group_size=8)
repr(quantization_config)
def test_json_serializable(self):
"""
Check that the config dict can be JSON serialized.
"""
quantization_config = TorchAoConfig("int4_weight_only", group_size=32, layout=TensorCoreTiledLayout())
d = quantization_config.to_dict()
self.assertIsInstance(d["quant_type_kwargs"]["layout"], list)
self.assertTrue("inner_k_tiles" in d["quant_type_kwargs"]["layout"][1])
quantization_config.to_json_string(use_diff=False)
@require_torchao
@require_torchao_version_greater_or_equal("0.8.0")
class TorchAoTest(unittest.TestCase):
input_text = "What are we having for dinner?"
max_new_tokens = 10
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
device = "cpu"
quant_scheme_kwargs = (
{"group_size": 32, "layout": Int4CPULayout(), "version": 1}
if is_torchao_available() and version.parse(importlib.metadata.version("torchao")) >= version.parse("0.8.0")
else {"group_size": 32}
)
# called only once for all test in this class
@classmethod
def setUpClass(cls):
cls.EXPECTED_OUTPUT = "What are we having for dinner?\n- 1. What is the temperature outside"
def tearDown(self):
gc.collect()
backend_empty_cache(torch_device)
gc.collect()
def test_int4wo_quant(self):
"""
Simple LLM model testing int4 weight only quantization
"""
quant_config = TorchAoConfig("int4_weight_only", **self.quant_scheme_kwargs)
# Note: we quantize the bfloat16 model on the fly to int4
quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name,
dtype=torch.bfloat16,
device_map=self.device,
quantization_config=quant_config,
)
tokenizer = AutoTokenizer.from_pretrained(self.model_name)
check_torchao_int4_wo_quantized(self, quantized_model.model.layers[0].self_attn.v_proj)
input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
def test_int4wo_quant_bfloat16_conversion(self):
"""
Testing the dtype of model will be modified to be bfloat16 for int4 weight only quantization
"""
quant_config = TorchAoConfig("int4_weight_only", **self.quant_scheme_kwargs)
# Note: we quantize the bfloat16 model on the fly to int4
quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name,
dtype=torch.bfloat16,
device_map=self.device,
quantization_config=quant_config,
)
tokenizer = AutoTokenizer.from_pretrained(self.model_name)
check_torchao_int4_wo_quantized(self, quantized_model.model.layers[0].self_attn.v_proj)
input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
def test_int8_dynamic_activation_int8_weight_quant(self):
"""
Simple LLM model testing int8_dynamic_activation_int8_weight
"""
quant_config = TorchAoConfig("int8_dynamic_activation_int8_weight")
quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name,
device_map=self.device,
quantization_config=quant_config,
)
tokenizer = AutoTokenizer.from_pretrained(self.model_name)
input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
EXPECTED_OUTPUT = [
"What are we having for dinner?\n\nJessica: (smiling)",
"What are we having for dinner?\n\nJess: (smiling) I",
]
self.assertTrue(tokenizer.decode(output[0], skip_special_tokens=True) in EXPECTED_OUTPUT)
@require_torchao_version_greater_or_equal("0.11.0")
def test_include_input_output_embeddings(self):
weight_dtype = torch.int8
granularity = PerAxis(0)
mapping_type = MappingType.ASYMMETRIC
embedding_config = IntxWeightOnlyConfig(
weight_dtype=weight_dtype,
granularity=granularity,
mapping_type=mapping_type,
version=1,
)
config = ModuleFqnToConfig(
{"_default": None, "model.embed_tokens": embedding_config, "lm_head": embedding_config}
)
# need set `include_input_output_embeddings` to True
quant_config = TorchAoConfig(quant_type=config, include_input_output_embeddings=True)
quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name,
device_map=self.device,
quantization_config=quant_config,
)
# making sure embedding is quantized
self.assertTrue(isinstance(quantized_model.model.embed_tokens.weight, AffineQuantizedTensor))
self.assertTrue(isinstance(quantized_model.lm_head.weight, AffineQuantizedTensor))
tokenizer = AutoTokenizer.from_pretrained(self.model_name)
input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
EXPECTED_OUTPUT = [
"What are we having for dinner?\n\nJessica: (smiling)",
"What are we having for dinner?\n\nJess: (smiling) I",
]
self.assertTrue(tokenizer.decode(output[0], skip_special_tokens=True) in EXPECTED_OUTPUT)
@require_torchao_version_greater_or_equal("0.11.0")
def test_per_module_config_skip(self):
linear_config = Int8WeightOnlyConfig()
config = ModuleFqnToConfig({"_default": linear_config, "model.layers.0.self_attn.q_proj": None})
quant_config = TorchAoConfig(quant_type=config)
quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name,
device_map=self.device,
quantization_config=quant_config,
)
# making sure `model.layers.0.self_attn.q_proj` is skipped
self.assertTrue(not isinstance(quantized_model.model.layers[0].self_attn.q_proj.weight, AffineQuantizedTensor))
tokenizer = AutoTokenizer.from_pretrained(self.model_name)
input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
EXPECTED_OUTPUT = [
"What are we having for dinner?\n\nJessica: (smiling)",
"What are we having for dinner?\n\nJess: (smiling) I",
]
self.assertTrue(tokenizer.decode(output[0], skip_special_tokens=True) in EXPECTED_OUTPUT)
@require_torch_accelerator
class TorchAoAcceleratorTest(TorchAoTest):
device = torch_device
quant_scheme_kwargs = {"group_size": 32, "version": 1}
# called only once for all test in this class
@classmethod
def setUpClass(cls):
super().setUpClass()
# fmt: off
EXPECTED_OUTPUTS = Expectations(
{
("xpu", 3): "What are we having for dinner?\n\nJessica: (smiling)",
("cuda", 7): "What are we having for dinner?\n- 1. What is the temperature outside",
}
)
# fmt: on
cls.EXPECTED_OUTPUT = EXPECTED_OUTPUTS.get_expectation()
def test_int4wo_offload(self):
"""
Simple test that checks if the quantized model int4 weight only is working properly with cpu/disk offload
"""
device_map_offload = {
"model.embed_tokens": 0,
"model.layers.0": 0,
"model.layers.1": 0,
"model.layers.2": 0,
"model.layers.3": 0,
"model.layers.4": 0,
"model.layers.5": 0,
"model.layers.6": 0,
"model.layers.7": 0,
"model.layers.8": 0,
"model.layers.9": 0,
"model.layers.10": 0,
"model.layers.11": 0,
"model.layers.12": 0,
"model.layers.13": 0,
"model.layers.14": 0,
"model.layers.15": 0,
"model.layers.16": 0,
"model.layers.17": 0,
"model.layers.18": 0,
"model.layers.19": "cpu",
"model.layers.20": "cpu",
"model.layers.21": "disk",
"model.norm": 0,
"model.rotary_emb": 0,
"lm_head": 0,
}
quant_config = TorchAoConfig("int4_weight_only", **self.quant_scheme_kwargs)
quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name,
dtype=torch.bfloat16,
device_map=device_map_offload,
quantization_config=quant_config,
)
tokenizer = AutoTokenizer.from_pretrained(self.model_name)
input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
# fmt: off
EXPECTED_OUTPUTS = Expectations(
{
("xpu", 3): "What are we having for dinner?\n\nJessica: (smiling)",
("cuda", 7): "What are we having for dinner?\n- 2. What is the temperature outside",
}
)
# fmt: on
EXPECTED_OUTPUT = EXPECTED_OUTPUTS.get_expectation()
output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
self.assertEqual(generated_text, EXPECTED_OUTPUT)
@require_torch_multi_accelerator
def test_int4wo_quant_multi_accelerator(self):
"""
Simple test that checks if the quantized model int4 weight only is working properly with multiple accelerators
set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 CUDA GPUs
set ZE_AFFINITY_MASK=0,1 if you have more than 2 Intel XPUs
"""
quant_config = TorchAoConfig("int4_weight_only", **self.quant_scheme_kwargs)
quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name,
dtype=torch.bfloat16,
device_map="auto",
quantization_config=quant_config,
)
tokenizer = AutoTokenizer.from_pretrained(self.model_name)
self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})
input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
def test_autoquant(self):
"""
Simple LLM model testing autoquant
"""
quant_config = TorchAoConfig("autoquant")
quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name,
dtype="auto",
device_map=self.device,
quantization_config=quant_config,
)
tokenizer = AutoTokenizer.from_pretrained(self.model_name)
input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
output = quantized_model.generate(
**input_ids, max_new_tokens=self.max_new_tokens, cache_implementation="static"
)
quantized_model.finalize_autoquant()
check_autoquantized(self, quantized_model.model.layers[0].self_attn.v_proj)
EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
output = quantized_model.generate(
**input_ids, max_new_tokens=self.max_new_tokens, cache_implementation="static"
)
self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), EXPECTED_OUTPUT)
@require_torchao
@require_torchao_version_greater_or_equal("0.8.0")
class TorchAoSerializationTest(unittest.TestCase):
input_text = "What are we having for dinner?"
max_new_tokens = 10
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
quant_scheme = "int4_weight_only"
quant_scheme_kwargs = (
{"group_size": 32, "layout": Int4CPULayout(), "version": 1}
if is_torchao_available() and version.parse(importlib.metadata.version("torchao")) >= version.parse("0.8.0")
else {"group_size": 32}
)
device = "cpu"
# called only once for all test in this class
@classmethod
def setUpClass(cls):
cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
cls.EXPECTED_OUTPUT = "What are we having for dinner?\n- 1. What is the temperature outside"
def setUp(self):
self.quant_config = TorchAoConfig(self.quant_scheme, **self.quant_scheme_kwargs)
dtype = torch.bfloat16 if self.quant_scheme == "int4_weight_only" else "auto"
self.quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name,
dtype=dtype,
device_map=self.device,
quantization_config=self.quant_config,
)
def tearDown(self):
gc.collect()
backend_empty_cache(torch_device)
gc.collect()
def test_original_model_expected_output(self):
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(self.device)
output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
def check_serialization_expected_output(self, device, expected_output, safe_serialization=False):
"""
Test if we can serialize and load/infer the model again on the same device
"""
dtype = torch.bfloat16 if self.quant_scheme == "int4_weight_only" else "auto"
with tempfile.TemporaryDirectory() as tmpdirname:
self.quantized_model.save_pretrained(tmpdirname, safe_serialization=safe_serialization)
loaded_quantized_model = AutoModelForCausalLM.from_pretrained(tmpdirname, dtype=dtype, device_map=device)
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(device)
output = loaded_quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), expected_output)
def test_serialization_expected_output(self):
self.check_serialization_expected_output(self.device, self.EXPECTED_OUTPUT)
@require_torchao
@require_torchao_version_greater_or_equal("0.14.0")
class TorchAoSafeSerializationTest(TorchAoSerializationTest):
# called only once for all test in this class
@classmethod
def setUpClass(cls):
cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
cls.EXPECTED_OUTPUT = "What are we having for dinner?\n- 1. What is the temperature outside"
def tearDown(self):
gc.collect()
backend_empty_cache(torch_device)
gc.collect()
if hasattr(self, "quantized_model"):
del self.quantized_model
gc.collect()
test_params = (
[
(
torchao.quantization.Float8DynamicActivationFloat8WeightConfig(),
"What are we having for dinner?\n\nJess: (smiling) I",
),
(torchao.quantization.Float8WeightOnlyConfig(), "What are we having for dinner?\n\nJessica: (smiling)"),
]
if is_torchao_available()
else []
)
@parameterized.expand(test_params, skip_on_empty=True)
def test_serialization_expected_output(self, config, expected_output):
device = "cuda"
self.quant_config = TorchAoConfig(config)
self.quantized_model = AutoModelForCausalLM.from_pretrained(
self.model_name,
dtype=torch.bfloat16,
device_map=device,
quantization_config=self.quant_config,
)
self.check_serialization_expected_output(device, expected_output, safe_serialization=True)
class TorchAoSerializationW8A8CPUTest(TorchAoSerializationTest):
quant_scheme, quant_scheme_kwargs = "int8_dynamic_activation_int8_weight", {}
# called only once for all test in this class
@classmethod
def setUpClass(cls):
super().setUpClass()
cls.EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
@require_torch_accelerator
def test_serialization_expected_output_on_accelerator(self):
"""
Test if we can serialize on device (cpu) and load/infer the model on accelerator
"""
self.check_serialization_expected_output(torch_device, self.EXPECTED_OUTPUT)
class TorchAoSerializationW8CPUTest(TorchAoSerializationTest):
quant_scheme, quant_scheme_kwargs = "int8_weight_only", {}
# called only once for all test in this class
@classmethod
def setUpClass(cls):
super().setUpClass()
cls.EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
@require_torch_accelerator
def test_serialization_expected_output_on_accelerator(self):
"""
Test if we can serialize on device (cpu) and load/infer the model on accelerator
"""
self.check_serialization_expected_output(torch_device, self.EXPECTED_OUTPUT)
@require_torch_accelerator
class TorchAoSerializationAcceleratorTest(TorchAoSerializationTest):
quant_scheme, quant_scheme_kwargs = "int4_weight_only", {"group_size": 32, "version": 1}
device = f"{torch_device}:0"
# called only once for all test in this class
@classmethod
def setUpClass(cls):
super().setUpClass()
# fmt: off
EXPECTED_OUTPUTS = Expectations(
{
("xpu", 3): "What are we having for dinner?\n\nJessica: (smiling)",
("cuda", 7): "What are we having for dinner?\n- 1. What is the temperature outside",
}
)
# fmt: on
cls.EXPECTED_OUTPUT = EXPECTED_OUTPUTS.get_expectation()
@require_torch_accelerator
class TorchAoSerializationW8A8AcceleratorTest(TorchAoSerializationTest):
quant_scheme, quant_scheme_kwargs = "int8_dynamic_activation_int8_weight", {}
device = f"{torch_device}:0"
# called only once for all test in this class
@classmethod
def setUpClass(cls):
super().setUpClass()
cls.EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
@require_torch_accelerator
class TorchAoSerializationW8AcceleratorTest(TorchAoSerializationTest):
quant_scheme, quant_scheme_kwargs = "int8_weight_only", {}
device = f"{torch_device}:0"
# called only once for all test in this class
@classmethod
def setUpClass(cls):
super().setUpClass()
cls.EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
@require_torch_accelerator
@require_torchao_version_greater_or_equal("0.10.0")
class TorchAoSerializationFP8AcceleratorTest(TorchAoSerializationTest):
device = f"{torch_device}:0"
# called only once for all test in this class
@classmethod
def setUpClass(cls):
device_type, major, minor = get_device_properties()
if device_type == "cuda" and major < 9:
raise unittest.SkipTest("CUDA compute capability 9.0 or higher required for FP8 tests")
from torchao.quantization import Float8WeightOnlyConfig
cls.quant_scheme = Float8WeightOnlyConfig()
cls.quant_scheme_kwargs = {}
super().setUpClass()
cls.EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
@require_torch_accelerator
@require_torchao_version_greater_or_equal("0.10.0")
class TorchAoSerializationA8W4Test(TorchAoSerializationTest):
device = f"{torch_device}:0"
# called only once for all test in this class
@classmethod
def setUpClass(cls):
device_type, major, minor = get_device_properties()
if device_type == "cuda" and major < 9:
raise unittest.SkipTest("CUDA compute capability 9.0 or higher required for FP8 tests")
from torchao.quantization import Int8DynamicActivationInt4WeightConfig
cls.quant_scheme = Int8DynamicActivationInt4WeightConfig()
cls.quant_scheme_kwargs = {}
super().setUpClass()
cls.EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,194 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import tempfile
import unittest
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, VptqConfig
from transformers.testing_utils import (
backend_empty_cache,
require_accelerate,
require_torch_gpu,
require_torch_multi_gpu,
require_vptq,
slow,
torch_device,
)
from transformers.utils import is_accelerate_available, is_torch_available
if is_torch_available():
import torch
if is_accelerate_available():
from accelerate import init_empty_weights
class VptqConfigTest(unittest.TestCase):
def test_to_dict(self):
"""
Makes sure the config format is properly set
"""
quantization_config = VptqConfig()
vptq_orig_config = quantization_config.to_dict()
self.assertEqual(vptq_orig_config["quant_method"], quantization_config.quant_method)
@slow
@require_torch_gpu
@require_vptq
@require_accelerate
class VptqTest(unittest.TestCase):
model_name = "VPTQ-community/Meta-Llama-3.1-8B-Instruct-v12-k65536-4096-woft"
input_text = "Hello my name is"
max_new_tokens = 32
EXPECTED_OUTPUT = "Hello my name is Sarah and I am a 25 year old woman from the United States. I am a college graduate and I am currently working as a marketing specialist for a small"
device_map = "cuda"
# called only once for all test in this class
@classmethod
def setUpClass(cls):
"""
Setup quantized model
"""
cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
cls.quantized_model = AutoModelForCausalLM.from_pretrained(
cls.model_name,
device_map=cls.device_map,
)
def tearDown(self):
gc.collect()
backend_empty_cache(torch_device)
gc.collect()
def test_quantized_model(self):
"""
Simple test that checks if the quantized model is working properly
"""
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
def test_raise_if_non_quantized(self):
model_id = "facebook/opt-125m"
quantization_config = VptqConfig()
with self.assertRaises(ValueError):
_ = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
def test_save_pretrained(self):
"""
Simple test that checks if the quantized model is working properly after being saved and loaded
"""
with tempfile.TemporaryDirectory() as tmpdirname:
self.quantized_model.save_pretrained(tmpdirname)
model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map)
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
@require_torch_multi_gpu
def test_quantized_model_multi_gpu(self):
"""
Simple test that checks if the quantized model is working properly with multiple GPUs
"""
input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map="auto")
self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})
output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
def test_quantized_model_conversion(self):
"""
Simple test that checks if the quantized model has been converted properly
"""
from vptq import VQuantLinear
from transformers.integrations import replace_with_vptq_linear
model_id = "facebook/opt-350m"
config = AutoConfig.from_pretrained(model_id, revision="cb32f77e905cccbca1d970436fb0f5e6b58ee3c5")
modules_to_not_convert = ["lm_head"]
names = [
"q_proj",
"k_proj",
"v_proj",
"out_proj",
"fc1",
"fc2",
]
value = {
"enable_norm": True,
"enable_perm": True,
"group_num": 1,
"group_size": 128,
"indices_as_float": False,
"num_centroids": [-1, 128],
"num_res_centroids": [-1, 128],
"outlier_size": 0,
"vector_lens": [-1, 12],
}
shared_layer_config = {}
for name in names:
shared_layer_config[name] = value
for i in range(24):
modules_to_not_convert.append(f"model.decoder.layers.{i}.fc1")
layer_configs = {}
layer_configs["model.decoder.project_out"] = value
layer_configs["model.decoder.project_in"] = value
quantization_config = VptqConfig(config_for_layers=layer_configs, shared_layer_config=shared_layer_config)
with init_empty_weights():
model = AutoModelForCausalLM.from_config(config)
nb_linears = 0
for module in model.modules():
if isinstance(module, torch.nn.Linear):
nb_linears += 1
model, _ = replace_with_vptq_linear(model, quantization_config=quantization_config)
nb_vptq_linear = 0
for module in model.modules():
if isinstance(module, VQuantLinear):
nb_vptq_linear += 1
self.assertEqual(nb_linears - 1, nb_vptq_linear)
# Try with `linear_weights_not_to_quantize`
with init_empty_weights():
model = AutoModelForCausalLM.from_config(config)
quantization_config = VptqConfig(config_for_layers=layer_configs, shared_layer_config=shared_layer_config)
model, _ = replace_with_vptq_linear(
model, quantization_config=quantization_config, modules_to_not_convert=modules_to_not_convert
)
nb_vptq_linear = 0
for module in model.modules():
if isinstance(module, VQuantLinear):
nb_vptq_linear += 1
# 25 comes from 24 decoder.layers.{layer_idx}.fc1
# and the last lm_head
self.assertEqual(nb_linears - 25, nb_vptq_linear)