init
This commit is contained in:
264
transformers/tests/quantization/aqlm_integration/test_aqlm.py
Normal file
264
transformers/tests/quantization/aqlm_integration/test_aqlm.py
Normal file
@@ -0,0 +1,264 @@
|
||||
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import gc
|
||||
import importlib
|
||||
import tempfile
|
||||
import unittest
|
||||
from unittest import skip
|
||||
|
||||
import pytest
|
||||
from packaging import version
|
||||
|
||||
from transformers import AqlmConfig, AutoConfig, AutoModelForCausalLM, AutoTokenizer, OPTForCausalLM, StaticCache
|
||||
from transformers.testing_utils import (
|
||||
backend_empty_cache,
|
||||
require_accelerate,
|
||||
require_aqlm,
|
||||
require_torch_accelerator,
|
||||
require_torch_multi_accelerator,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
from transformers.utils import is_accelerate_available, is_aqlm_available, is_torch_available
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_accelerate_available():
|
||||
from accelerate import init_empty_weights
|
||||
|
||||
|
||||
@require_torch_accelerator
class AqlmConfigTest(unittest.TestCase):
    """Round-trip tests for `AqlmConfig` serialization (`to_dict` / `from_dict`)."""

    def test_to_dict(self):
        """
        Simple test that checks if one uses a config and converts it to a dict, the dict is the same as the config object
        """
        quantization_config = AqlmConfig()
        config_to_dict = quantization_config.to_dict()

        # Every serialized key must mirror the corresponding config attribute.
        for key in config_to_dict:
            self.assertEqual(getattr(quantization_config, key), config_to_dict[key])

    def test_from_dict(self):
        """
        Simple test that checks if one uses a dict and converts it to a config object, the config object is the same as the dict
        """
        # Renamed from `dict` to avoid shadowing the builtin.
        config_dict = {
            "in_group_size": 32,
            "num_codebooks": 8,
            "nbits_per_codebook": 8,
            "linear_weights_not_to_quantize": ["lm_head.weight"],
        }
        quantization_config = AqlmConfig.from_dict(config_dict)

        self.assertEqual(config_dict["in_group_size"], quantization_config.in_group_size)
        self.assertEqual(config_dict["num_codebooks"], quantization_config.num_codebooks)
        self.assertEqual(config_dict["nbits_per_codebook"], quantization_config.nbits_per_codebook)
        self.assertEqual(config_dict["linear_weights_not_to_quantize"], quantization_config.linear_weights_not_to_quantize)
||||
@slow
@require_torch_accelerator
@require_aqlm
@require_accelerate
class AqlmTest(unittest.TestCase):
    """End-to-end tests for AQLM-quantized model loading, conversion and generation."""

    # Hub id of the 2-bit AQLM-quantized Llama-2-7b checkpoint under test.
    model_name = "BlackSamorez/Llama-2-7b-AQLM-2Bit-1x16-hf"

    input_text = "Hello my name is"
    max_new_tokens = 32

    # Greedy-decoding reference continuation of `input_text` for `max_new_tokens` tokens.
    EXPECTED_OUTPUT = "Hello my name is Katie. I am a 20 year old college student. I am a very outgoing person. I love to have fun and be active. I"

    # called only once for all test in this class
    @classmethod
    def setUpClass(cls):
        """
        Setup quantized model
        """
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.quantized_model = AutoModelForCausalLM.from_pretrained(
            cls.model_name,
            device_map=torch_device,
        )

    def tearDown(self):
        # Reclaim accelerator memory between tests to avoid OOM on small devices.
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_quantized_model_conversion(self):
        """
        Simple test that checks if the quantized model has been converted properly
        """
        from aqlm import QuantizedLinear

        from transformers.integrations import replace_with_aqlm_linear

        # A pinned revision keeps the architecture (and therefore the Linear count) stable.
        model_id = "facebook/opt-350m"
        config = AutoConfig.from_pretrained(model_id, revision="cb32f77e905cccbca1d970436fb0f5e6b58ee3c5")
        quantization_config = AqlmConfig()

        # Meta-device instantiation: no real weights are allocated.
        with init_empty_weights():
            model = OPTForCausalLM(config)

        nb_linears = 0
        for module in model.modules():
            if isinstance(module, torch.nn.Linear):
                nb_linears += 1

        model, _ = replace_with_aqlm_linear(model, quantization_config=quantization_config)
        nb_aqlm_linear = 0
        for module in model.modules():
            if isinstance(module, QuantizedLinear):
                nb_aqlm_linear += 1

        # Every nn.Linear must have been swapped for an AQLM QuantizedLinear.
        self.assertEqual(nb_linears, nb_aqlm_linear)

        # Try with `linear_weights_not_to_quantize`
        with init_empty_weights():
            model = OPTForCausalLM(config)

        model, _ = replace_with_aqlm_linear(
            model, quantization_config=quantization_config, linear_weights_not_to_quantize=["lm_head.weight"]
        )
        nb_aqlm_linear = 0
        for module in model.modules():
            if isinstance(module, QuantizedLinear):
                nb_aqlm_linear += 1

        # Exactly one layer (the lm_head) should have been left unconverted.
        self.assertEqual(nb_linears - 1, nb_aqlm_linear)

    @skip(
        "inference doesn't work with quantized aqlm models using torch.Any type with recent torch versions. Waiting for the fix from AQLM side"
    )
    def test_quantized_model(self):
        """
        Simple test that checks if the quantized model is working properly
        """
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

        output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    def test_raise_if_non_quantized(self):
        # Passing an AQLM quantization config with a non-quantized checkpoint must fail.
        model_id = "facebook/opt-125m"
        quantization_config = AqlmConfig(bits=4)

        with self.assertRaises(ValueError):
            _ = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)

    @skip(
        "inference doesn't work with quantized aqlm models using torch.Any type with recent torch versions. Waiting for the fix from AQLM side"
    )
    def test_save_pretrained(self):
        """
        Simple test that checks if the quantized model is working properly after being saved and loaded
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)
            model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=torch_device)

            input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

            output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
            self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    @skip(
        "inference doesn't work with quantized aqlm models using torch.Any type with recent torch versions. Waiting for the fix from AQLM side"
    )
    @require_torch_multi_accelerator
    def test_quantized_model_multi_gpu(self):
        """
        Simple test that checks if the quantized model is working properly with multiple GPUs
        """
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

        quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map="auto")

        # The model should actually be sharded over both devices.
        self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})

        output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)

        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    @unittest.skipUnless(
        # NOTE(review): relies on `importlib.metadata` being reachable via the plain
        # `import importlib` at the top of the file — confirm it is imported elsewhere.
        is_aqlm_available() and version.parse(importlib.metadata.version("aqlm")) >= version.parse("1.0.3"),
        "test requires `aqlm>=1.0.3`",
    )
    @pytest.mark.torch_compile_test
    def test_quantized_model_compile(self):
        """
        Simple test that checks if the quantized model is working properly
        """

        # Sample tokens greedily
        def decode_one_tokens(model, cur_token, input_pos, cache_position, past_key_values):
            logits = model(
                cur_token,
                position_ids=input_pos,
                cache_position=cache_position,
                past_key_values=past_key_values,
                return_dict=False,
                use_cache=True,
            )[0]
            new_token = torch.argmax(logits[:, [-1]], dim=-1).to(torch.int)

            return new_token

        # Tokenize the test input
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)["input_ids"]
        seq_length = input_ids.shape[1]

        # Setup static KV cache for generation
        past_key_values = StaticCache(
            config=self.quantized_model.config,
            batch_size=input_ids.shape[0],
            max_cache_len=seq_length + self.max_new_tokens + 1,
        )

        # Allocate token ids to be generated and copy prefix ids
        cache_position = torch.arange(seq_length, device=torch_device)
        generated_ids = torch.zeros(1, seq_length + self.max_new_tokens, dtype=torch.int, device=torch_device)
        generated_ids[:, cache_position] = input_ids.to(torch_device).to(torch.int)

        # Do a forward pass to fill the prefix cache and compile the kernels if necessary
        logits = self.quantized_model(
            input_ids,
            cache_position=cache_position,
            past_key_values=past_key_values,
            return_dict=False,
            use_cache=True,
        )[0]
        next_token = torch.argmax(logits[:, [-1]], dim=-1).to(torch.int)
        generated_ids[:, [seq_length]] = next_token

        with torch.no_grad():
            # Compile the CUDA graph
            decode_one_tokens = torch.compile(decode_one_tokens, mode="reduce-overhead", fullgraph=True)

            # Generate tokens one by one
            cache_position = torch.tensor([seq_length + 1], device=torch_device)
            for _ in range(1, self.max_new_tokens):
                # Force the math SDP backend so results are reproducible across GPUs.
                with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_mem_efficient=False, enable_math=True):
                    next_token = decode_one_tokens(
                        self.quantized_model, next_token.clone(), None, cache_position, past_key_values
                    )
                    generated_ids.index_copy_(1, cache_position, next_token)
                cache_position += 1

        # Check generated text
        self.assertEqual(self.tokenizer.decode(generated_ids[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
0
transformers/tests/quantization/autoawq/__init__.py
Normal file
0
transformers/tests/quantization/autoawq/__init__.py
Normal file
543
transformers/tests/quantization/autoawq/test_awq.py
Normal file
543
transformers/tests/quantization/autoawq/test_awq.py
Normal file
@@ -0,0 +1,543 @@
|
||||
# Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import gc
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AwqConfig, OPTForCausalLM
|
||||
from transformers.testing_utils import (
|
||||
backend_empty_cache,
|
||||
get_device_properties,
|
||||
require_accelerate,
|
||||
require_auto_awq,
|
||||
require_flash_attn,
|
||||
require_intel_extension_for_pytorch,
|
||||
require_torch_accelerator,
|
||||
require_torch_gpu,
|
||||
require_torch_multi_accelerator,
|
||||
require_torch_multi_gpu,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
from transformers.utils import is_accelerate_available, is_torch_available
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_accelerate_available():
|
||||
from accelerate import init_empty_weights
|
||||
|
||||
|
||||
@require_torch_accelerator
class AwqConfigTest(unittest.TestCase):
    """Validation and serialization tests for `AwqConfig`."""

    def test_wrong_backend(self):
        """
        Simple test that checks if a user passes a wrong backend an error is raised
        """
        # This should work fine
        _ = AwqConfig(bits=4)

        with self.assertRaises(ValueError):
            AwqConfig(bits=4, backend="")

        # These should work fine
        _ = AwqConfig(bits=4, version="GEMM")
        _ = AwqConfig(bits=4, version="gemm")

        with self.assertRaises(ValueError):
            AwqConfig(bits=4, backend="unexisting-backend")

        # Only cuda and xpu devices can run this function
        support_llm_awq = False
        device_type, major, _ = get_device_properties()
        if device_type == "cuda" and major >= 8:
            support_llm_awq = True
        elif device_type == "xpu":
            support_llm_awq = True

        if support_llm_awq:
            # LLMAWQ should work on an A100
            AwqConfig(bits=4, backend="llm-awq")
        else:
            # LLMAWQ does not work on a T4
            with self.assertRaises(ValueError):
                AwqConfig(bits=4, backend="llm-awq")

    def test_to_dict(self):
        """
        Simple test that checks if one uses a config and converts it to a dict, the dict is the same as the config object
        """
        quantization_config = AwqConfig(bits=4)
        config_to_dict = quantization_config.to_dict()

        for key in config_to_dict:
            self.assertEqual(getattr(quantization_config, key), config_to_dict[key])

    def test_from_dict(self):
        """
        Simple test that checks if one uses a dict and converts it to a config object, the config object is the same as the dict
        """
        # Renamed from `dict` to avoid shadowing the builtin.
        config_dict = {"bits": 2, "zero_point": False, "backend": "autoawq"}
        quantization_config = AwqConfig.from_dict(config_dict)

        self.assertEqual(config_dict["bits"], quantization_config.bits)
        self.assertEqual(config_dict["zero_point"], quantization_config.zero_point)
        self.assertEqual(config_dict["backend"], quantization_config.backend)
||||
@slow
@require_torch_accelerator
@require_auto_awq
@require_accelerate
class AwqTest(unittest.TestCase):
    """End-to-end tests for loading and generating with AWQ-quantized checkpoints."""

    model_name = "TheBloke/Mistral-7B-v0.1-AWQ"
    dummy_transformers_model_name = "bigscience/bloom-560m"
    model_with_no_k_proj_quantized = "hf-internal-testing/opt-125m-awq-no-k-proj"

    input_text = "Hello my name is"

    # Greedy reference continuations for `input_text`.
    EXPECTED_OUTPUT = "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Journalism and minoring in Spanish"
    EXPECTED_OUTPUT_BF16 = "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Journalism and minoring in Spanish"

    EXPECTED_OUTPUT_EXLLAMA = [
        "Hello my name is Katie and I am a 20 year old student from the UK. I am currently studying for a degree in English Literature and History at the University of York. I am a very out",
        "Hello my name is Katie and I am a 20 year old student from the UK. I am currently studying for a degree in English Literature and History at the University of York. I am a very creative",
    ]
    device_map = torch_device

    # called only once for all test in this class
    @classmethod
    def setUpClass(cls):
        """Load the tokenizer and the quantized model a single time for every test."""
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.quantized_model = AutoModelForCausalLM.from_pretrained(cls.model_name, device_map=cls.device_map)

    def tearDown(self):
        # Release accelerator memory between tests.
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_quantized_model_conversion(self):
        """Check that `replace_with_awq_linear` swaps every eligible `nn.Linear` layer."""
        from awq.modules.linear import WQLinear_GEMM, WQLinear_GEMV

        from transformers.integrations.awq import replace_with_awq_linear

        model_id = "facebook/opt-350m"
        config = AutoConfig.from_pretrained(model_id, revision="cb32f77e905cccbca1d970436fb0f5e6b58ee3c5")
        quantization_config = AwqConfig(bits=4)

        # Instantiate on the meta device: no real weights needed for a structural check.
        with init_empty_weights():
            model = OPTForCausalLM(config)

        num_linears = sum(isinstance(mod, torch.nn.Linear) for mod in model.modules())

        model, _ = replace_with_awq_linear(model, quantization_config=quantization_config)
        num_awq_linears = sum(isinstance(mod, (WQLinear_GEMM, WQLinear_GEMV)) for mod in model.modules())

        self.assertEqual(num_linears, num_awq_linears)

        # With `modules_to_not_convert`, exactly one Linear (the lm_head) stays untouched.
        with init_empty_weights():
            model = OPTForCausalLM(config)

        model, _ = replace_with_awq_linear(
            model, quantization_config=quantization_config, modules_to_not_convert=["lm_head"]
        )
        num_awq_linears = sum(isinstance(mod, (WQLinear_GEMM, WQLinear_GEMV)) for mod in model.modules())

        self.assertEqual(num_linears - 1, num_awq_linears)

    def test_quantized_model(self):
        """Generate with the shared quantized model and compare against the reference text."""
        encoded = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

        generated = self.quantized_model.generate(**encoded, max_new_tokens=40)
        self.assertEqual(self.tokenizer.decode(generated[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    def test_raise_if_non_quantized(self):
        """An AWQ config combined with a non-quantized checkpoint must raise."""
        quantization_config = AwqConfig(bits=4)
        model_id = "facebook/opt-125m"

        with self.assertRaises(ValueError):
            _ = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)

    def test_quantized_model_bf16(self):
        """Generation should also match the reference when loading in bfloat16."""
        encoded = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

        quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype=torch.bfloat16).to(torch_device)

        generated = quantized_model.generate(**encoded, max_new_tokens=40)
        self.assertEqual(self.tokenizer.decode(generated[0], skip_special_tokens=True), self.EXPECTED_OUTPUT_BF16)

    @require_torch_gpu
    def test_quantized_model_exllama(self):
        """Generation with the exllama kernels must produce one of the accepted outputs."""
        encoded = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

        quantization_config = AwqConfig(version="exllama")
        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name, quantization_config=quantization_config, device_map=torch_device
        )

        generated = quantized_model.generate(**encoded, max_new_tokens=40)
        self.assertIn(self.tokenizer.decode(generated[0], skip_special_tokens=True), self.EXPECTED_OUTPUT_EXLLAMA)

    def test_quantized_model_no_device_map(self):
        """Loading without a device map and moving manually must still generate correctly."""
        encoded = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

        quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name).to(torch_device)
        generated = quantized_model.generate(**encoded, max_new_tokens=40)

        self.assertEqual(self.tokenizer.decode(generated[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    def test_save_pretrained(self):
        """A save/reload round trip must not change generation."""
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)
            reloaded = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map)

            encoded = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

            generated = reloaded.generate(**encoded, max_new_tokens=40)
            self.assertEqual(self.tokenizer.decode(generated[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    @require_torch_multi_accelerator
    def test_quantized_model_multi_accelerator(self):
        """Sharded loading across two accelerators must still generate the reference text."""
        encoded = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

        quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map="auto")

        # Both devices must actually host part of the model.
        self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})

        generated = quantized_model.generate(**encoded, max_new_tokens=40)

        self.assertEqual(self.tokenizer.decode(generated[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    def test_quantized_model_no_k_proj_quantized(self):
        """A checkpoint whose k_proj was left unquantized must load and generate deterministically."""
        dummy_input = torch.LongTensor([[0, 1, 0]]).to(torch_device)

        quantized_model = AutoModelForCausalLM.from_pretrained(self.model_with_no_k_proj_quantized).to(torch_device)

        # k_proj stays a plain Linear while v_proj was converted.
        self.assertTrue(isinstance(quantized_model.model.decoder.layers[0].self_attn.k_proj, torch.nn.Linear))
        self.assertFalse(isinstance(quantized_model.model.decoder.layers[0].self_attn.v_proj, torch.nn.Linear))

        EXPECTED_OUTPUT = torch.LongTensor([[0, 1, 0, 50118, 50118, 133, 248, 12, 134, 16, 10, 372, 2031]]).to(
            torch_device
        )

        generated = quantized_model.generate(dummy_input, max_new_tokens=10)
        self.assertTrue((EXPECTED_OUTPUT == generated).all())
||||
@slow
@require_torch_accelerator
@require_auto_awq
@require_accelerate
class AwqFusedTest(unittest.TestCase):
    """Tests for AWQ module fusing (attention / MLP / layernorm fusion)."""

    model_name = "TheBloke/Mistral-7B-OpenOrca-AWQ"
    model_revision = "7048b2af77d0dd1c81b000b19d73f9cc8950b510"

    # Checkpoint used with a hand-written `modules_to_fuse` mapping.
    custom_mapping_model_id = "TheBloke/Mistral-7B-v0.1-AWQ"
    custom_model_revision = "f186bcfa9edbe2a4334262ec1e67f23e53ed1ae7"

    mixtral_model_name = "casperhansen/mixtral-instruct-awq"
    mixtral_model_revision = "87dd4ec502dde74fb3a624835c776b000d190c3b"

    multi_modal_model_name = "ybelkada/llava-1.5-7b-hf-awq"
    multi_modal_model_code_revision = "ad108a50f5b9e681bdd7378409f57b7fa59a7442"

    prompt = (
        "You're standing on the surface of the Earth. "
        "You walk one mile south, one mile west and one mile north. "
        "You end up exactly where you started. Where are you?"
    )

    # Greedy reference generations per checkpoint.
    EXPECTED_GENERATION = prompt + "\n\nYou're at the center of a square."
    EXPECTED_GENERATION_CUSTOM_MODEL = "Hello,\n\nI have a problem with my 20"
    EXPECTED_GENERATION_MIXTRAL = prompt + " You're on the North Pole.\n\nThe"

    def tearDown(self):
        # Reclaim accelerator memory between tests.
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def _check_fused_modules(self, model):
        """Assert that at least one AutoAWQ fused module is present in `model`."""
        has_fused_modules = False
        fused_modules_name = ["QuantAttentionFused", "QuantFusedMLP", "FasterTransformerRMSNorm"]

        # Class-name matching is used because the fused classes live inside AutoAWQ.
        for _, module in model.named_modules():
            if module.__class__.__name__ in fused_modules_name:
                has_fused_modules = True
                break

        self.assertTrue(has_fused_modules, "Modules fusing not performed correctly!")

    def test_raise_save_pretrained(self):
        """
        Test that `save_pretrained` is effectively blocked for fused models
        """
        quantization_config = AwqConfig(bits=4, fuse_max_seq_len=128, do_fuse=True)

        model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            quantization_config=quantization_config,
            revision=self.model_revision,
        ).to(torch_device)

        self._check_fused_modules(model)

        # Saving a fused model is not supported and must raise.
        with self.assertRaises(ValueError), tempfile.TemporaryDirectory() as tmpdirname:
            model.save_pretrained(tmpdirname)

    def test_fused_modules_to_not_convert(self):
        """
        Test if fused + modules to_not_convert work as expected
        """
        model_id = "hf-internal-testing/Mixtral-tiny-AWQ"

        quantization_config = AwqConfig(bits=4, fuse_max_seq_len=128, do_fuse=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=quantization_config,
        ).to(torch_device)

        # Check if model has been correctly fused
        self._check_fused_modules(model)
        # Checks if the modules_to_not_convert (here gate layer) is a Linear
        self.assertTrue(isinstance(model.model.layers[0].block_sparse_moe.gate, torch.nn.Linear))

    @unittest.skipIf(
        get_device_properties()[0] == "cuda" and get_device_properties()[1] < 8,
        "Skipping because RuntimeError: FlashAttention only supports Ampere GPUs or newer, so not supported on GPU with capability < 8.0",
    )
    @require_flash_attn
    @require_torch_gpu
    def test_generation_fused(self):
        """
        Test generation quality for fused models - single batch case
        """
        quantization_config = AwqConfig(bits=4, fuse_max_seq_len=128, do_fuse=True)

        model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            quantization_config=quantization_config,
            revision=self.model_revision,
        ).to(torch_device)

        self._check_fused_modules(model)

        tokenizer = AutoTokenizer.from_pretrained(self.model_name, revision=self.model_revision)

        inputs = tokenizer(self.prompt, return_tensors="pt").to(torch_device)

        outputs = model.generate(**inputs, max_new_tokens=12)

        self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION)

    @require_flash_attn
    @require_torch_gpu
    @unittest.skipIf(
        get_device_properties()[0] == "cuda" and get_device_properties()[1] < 8,
        "Skipping because RuntimeError: FlashAttention only supports Ampere GPUs or newer, so not supported on GPU with capability < 8.0",
    )
    def test_generation_fused_batched(self):
        """
        Test generation quality for fused models - multi batch case
        """
        quantization_config = AwqConfig(bits=4, fuse_max_seq_len=128, do_fuse=True)

        model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            quantization_config=quantization_config,
            revision=self.model_revision,
        ).to(torch_device)

        self._check_fused_modules(model)

        tokenizer = AutoTokenizer.from_pretrained(self.model_name, revision=self.model_revision)

        # Padding is required to batch two identical prompts together.
        tokenizer.pad_token_id = tokenizer.eos_token_id
        inputs = tokenizer([self.prompt, self.prompt], return_tensors="pt", padding=True).to(torch_device)

        outputs = model.generate(**inputs, max_new_tokens=12)

        self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION)

    def test_generation_llava_fused(self):
        """Fused generation through the image-to-text pipeline with a LLaVA AWQ checkpoint."""
        from transformers import pipeline

        quantization_config = AwqConfig(do_fuse=True, fuse_max_seq_len=2048)

        pipe = pipeline(
            "image-to-text",
            model=self.multi_modal_model_name,
            device=0,
            model_kwargs={
                "quantization_config": quantization_config,
            },
            revision=self.multi_modal_model_code_revision,
        )
        url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"

        prompt = "USER: <image>\nCan you please describe this image?\nASSISTANT:"

        outputs = pipe(url, prompt=prompt, generate_kwargs={"max_new_tokens": 100})
        EXPECTED_OUTPUT = "USER: \nCan you please describe this image?\nASSISTANT: The image features a brown and white cat sitting on a green surface, possibly a carpet or a grassy area. The cat is holding a red ball in its paws, seemingly playing with it. The cat appears to be focused on the ball, possibly preparing to play or just enjoying the toy."

        self.assertEqual(outputs[0]["generated_text"], EXPECTED_OUTPUT)

    @require_flash_attn
    @require_torch_multi_gpu
    @unittest.skipIf(
        get_device_properties()[0] == "cuda" and get_device_properties()[1] < 8,
        "Skipping because RuntimeError: FlashAttention only supports Ampere GPUs or newer, so not supported on GPU with capability < 8.0",
    )
    def test_generation_custom_model(self):
        """
        Test generation quality for fused models using custom fused map.
        """
        # Mistral-7B geometry is spelled out explicitly in the custom fuse mapping.
        quantization_config = AwqConfig(
            bits=4,
            fuse_max_seq_len=512,
            modules_to_fuse={
                "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
                "mlp": ["gate_proj", "up_proj", "down_proj"],
                "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
                "use_alibi": False,
                "hidden_size": 4096,
                "num_attention_heads": 32,
                "num_key_value_heads": 8,
            },
        )

        model = AutoModelForCausalLM.from_pretrained(
            self.custom_mapping_model_id,
            quantization_config=quantization_config,
            device_map="balanced",
            revision=self.custom_model_revision,
        )

        self._check_fused_modules(model)

        tokenizer = AutoTokenizer.from_pretrained(self.custom_mapping_model_id, revision=self.custom_model_revision)

        prompt = "Hello"
        inputs = tokenizer(prompt, return_tensors="pt").to(torch_device)

        outputs = model.generate(**inputs, max_new_tokens=12)
        self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION_CUSTOM_MODEL)

    @require_flash_attn
    @require_torch_multi_gpu
    @unittest.skip(reason="Not enough GPU memory on CI runners")
    def test_generation_mixtral_fused(self):
        """
        Text generation test for Mixtral + AWQ + fused
        """
        quantization_config = AwqConfig(bits=4, fuse_max_seq_len=1024, do_fuse=True)
        model = AutoModelForCausalLM.from_pretrained(
            self.mixtral_model_name,
            quantization_config=quantization_config,
            device_map="auto",
            revision=self.mixtral_model_revision,
        )

        tokenizer = AutoTokenizer.from_pretrained(self.mixtral_model_name)
        tokenizer.pad_token = tokenizer.eos_token

        inputs = tokenizer([self.prompt, self.prompt], return_tensors="pt", padding=True).to(torch_device)

        outputs = model.generate(**inputs, max_new_tokens=12)
        self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION_MIXTRAL)
||||
@slow
@require_torch_accelerator
@require_auto_awq
@require_accelerate
class AwqScaleTest(unittest.TestCase):
    """Checks that AWQ activation-scaling modules are installed on load."""

    # AWQ checkpoint whose MLP activations are wrapped in `ScaledActivation`.
    model_name = "TechxGenus/starcoder2-3b-AWQ"

    def test_load_quantized_model(self):
        """
        Simple test that checks if the scales have been replaced in the quantized model
        """
        # NOTE: the docstring above was originally placed *after* the import, where it
        # was a no-op string statement; it now documents the method properly.
        from awq.modules.act import ScaledActivation

        # Reuse the class attribute instead of repeating the hub id literal.
        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name, dtype=torch.float16, device_map=torch_device
        )
        self.assertTrue(isinstance(quantized_model.model.layers[0].mlp.act, ScaledActivation))
||||
@slow
@require_auto_awq
@require_accelerate
@require_intel_extension_for_pytorch
class AwqIPEXTest(unittest.TestCase):
    def test_quantized_model_ipex(self):
        """
        Simple test that checks if the quantized model is working properly with ipex backend
        """
        quantization_config = AwqConfig(version="ipex")

        model = AutoModelForCausalLM.from_pretrained(
            "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
            quantization_config=quantization_config,
            device_map="cpu",
        )
        tokenizer = AutoTokenizer.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ")
        input_ids = tokenizer.encode("How to make a cake", return_tensors="pt")
        pad_token_id = tokenizer.eos_token_id
        output = model.generate(input_ids, do_sample=False, max_length=20, pad_token_id=pad_token_id)
        # Decode once; the previous version decoded twice and printed debug output.
        decoded = tokenizer.decode(output[0], skip_special_tokens=True)

        expected_output = (
            "How to make a cake with a round tin?\nHow to make a cake with a round tin?\n1. Preheat the oven to 180°"
        )
        # NOTE(review): assertIn(decoded, expected_output) checks that the generated
        # text is a *substring* of the reference (max_length=20 yields a prefix of it).
        # Confirm this direction is intentional before flipping the arguments.
        self.assertIn(decoded, expected_output)
||||
217
transformers/tests/quantization/autoround/test_auto_round.py
Normal file
217
transformers/tests/quantization/autoround/test_auto_round.py
Normal file
@@ -0,0 +1,217 @@
|
||||
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import gc
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer
|
||||
from transformers.testing_utils import (
|
||||
backend_empty_cache,
|
||||
backend_synchronize,
|
||||
require_accelerate,
|
||||
require_auto_round,
|
||||
require_intel_extension_for_pytorch,
|
||||
require_torch_accelerator,
|
||||
require_torch_gpu,
|
||||
require_torch_multi_accelerator,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
from transformers.utils import is_torch_available
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
|
||||
@slow
@require_torch_accelerator
@require_auto_round
@require_accelerate
class AutoRoundTest(unittest.TestCase):
    """Integration tests for loading and running AutoRound-quantized checkpoints."""

    # Pre-quantized AutoRound checkpoint used by most tests in this class.
    model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc"
    input_text = "There is a girl who likes adventure,"
    EXPECTED_OUTPUTS = set()
    ## Different backends may produce slight variations in output
    EXPECTED_OUTPUTS.add(
        "There is a girl who likes adventure, and she has been exploring the world "
        "for many years. She travels to different countries and cultures, trying new "
        "things every day. One of her favorite places to visit is a small village in "
        "the mountains where"
    )
    EXPECTED_OUTPUTS.add(
        "There is a girl who likes adventure, and she has been exploring the world for many years. She has visited every country in Europe and has even traveled to some of the most remote parts of Africa. She enjoys hiking through the mountains and discovering"
    )
    EXPECTED_OUTPUTS.add(
        "There is a girl who likes adventure, and she has been exploring the world for many years. She has visited every country in Europe and has even traveled to some of the most remote parts of Africa. She has also climbed mountains and explored caves"
    )

    device_map = torch_device

    # called only once for all test in this class
    @classmethod
    def setUpClass(cls):
        """
        Setup quantized model
        """
        backend_synchronize(torch_device)
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.quantized_model = AutoModelForCausalLM.from_pretrained(
            cls.model_name, device_map=cls.device_map, dtype=torch.float16
        )

    def tearDown(self):
        # Free accelerator memory between tests so subsequent model loads do not OOM.
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_quantized_model(self):
        """
        Simple test that checks if the quantized model is working properly
        """
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
        # Greedy decoding so the output is deterministic and comparable to EXPECTED_OUTPUTS.
        output = self.quantized_model.generate(**input_ids, max_new_tokens=40, do_sample=False)
        self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)

    def test_raise_if_non_quantized(self):
        # Passing an AutoRoundConfig for a non-quantized checkpoint must fail loudly.
        model_id = "facebook/opt-125m"
        quantization_config = AutoRoundConfig(bits=4)
        with self.assertRaises(ValueError):
            _ = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)

    def test_quantized_model_bf16(self):
        """
        Simple test that checks if the quantized model is working properly with bf16
        """
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
        # Force the triton backend; other backends may repack weights differently.
        quantization_config = AutoRoundConfig(backend="triton")
        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            dtype=torch.bfloat16,
            device_map=self.device_map,
            quantization_config=quantization_config,
        )

        output = quantized_model.generate(**input_ids, max_new_tokens=40, do_sample=False)
        self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)

    @require_intel_extension_for_pytorch
    def test_quantized_model_on_cpu(self):
        """
        Simple test that checks if the quantized model is working properly
        """
        # No .to(torch_device): both inputs and model stay on CPU here
        # (no device_map is passed to from_pretrained).
        input_ids = self.tokenizer(self.input_text, return_tensors="pt")

        quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto")
        output = quantized_model.generate(**input_ids, max_new_tokens=40, do_sample=False)

        self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)

    def test_save_pretrained(self):
        """
        Simple test that checks if the quantized model is working properly after being saved and loaded
        """

        ## some backends like marlin/ipex will repack the weight that caused the weight shape changed
        with tempfile.TemporaryDirectory() as tmpdirname:
            quantization_config = AutoRoundConfig(backend="triton")
            quantized_model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                device_map=self.device_map,
                dtype=torch.float16,
                quantization_config=quantization_config,
            )

            # Round-trip: serialized model must generate the same expected text.
            quantized_model.save_pretrained(tmpdirname)
            model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=torch_device)

            input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

            output = model.generate(**input_ids, max_new_tokens=40, do_sample=False)
            output_tokens = self.tokenizer.decode(output[0], skip_special_tokens=True)
            self.assertIn(output_tokens, self.EXPECTED_OUTPUTS)

    @require_torch_multi_accelerator
    def test_quantized_model_multi_accelerator(self):
        """
        Simple test that checks if the quantized model is working properly with multiple accelerators
        """
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
        quantization_config = AutoRoundConfig(backend="triton")
        # device_map="auto" shards the model across all visible accelerators.
        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name, device_map="auto", quantization_config=quantization_config, dtype="auto"
        )

        output = quantized_model.generate(**input_ids, max_new_tokens=40, do_sample=False)

        self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)

    def test_convert_from_gptq(self):
        """
        Simple test that checks if auto-round work properly with gptq format
        """
        model_name = "ybelkada/opt-125m-gptq-4bit"

        quantization_config = AutoRoundConfig()

        model = AutoModelForCausalLM.from_pretrained(
            model_name, device_map=torch_device, quantization_config=quantization_config, dtype="auto"
        )
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        text = "There is a girl who likes adventure,"
        inputs = tokenizer(text, return_tensors="pt").to(model.device)
        # Smoke test only: no assertion, just verify generate/decode runs.
        tokenizer.decode(model.generate(**inputs, max_new_tokens=5)[0])

    @require_intel_extension_for_pytorch
    def test_convert_from_awq_cpu(self):
        """
        Simple test that checks if auto-round work properly with awq format
        """
        model_name = "casperhansen/opt-125m-awq"

        quantization_config = AutoRoundConfig()

        model = AutoModelForCausalLM.from_pretrained(
            model_name, device_map="cpu", quantization_config=quantization_config, dtype="auto"
        )
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        text = "There is a girl who likes adventure,"
        inputs = tokenizer(text, return_tensors="pt").to(model.device)
        # Smoke test only: no assertion, just verify generate/decode runs on CPU.
        tokenizer.decode(model.generate(**inputs, max_new_tokens=5)[0])

    @require_torch_gpu
    def test_mixed_bits(self):
        """
        Simple test that checks if auto-round work properly with mixed bits
        """
        model_name = "facebook/opt-125m"
        model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Per-layer overrides: one layer at 8 bits, one at 2 bits/group 32,
        # everything else uses the defaults below (4 bits, group 128, symmetric).
        layer_config = {
            "model.decoder.layers.0.self_attn.k_proj": {"bits": 8},
            "model.decoder.layers.6.self_attn.out_proj": {"bits": 2, "group_size": 32},
        }

        bits, group_size, sym = 4, 128, True
        from auto_round import AutoRound

        autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, layer_config=layer_config)
        with tempfile.TemporaryDirectory() as tmpdirname:
            # Quantize, save, reload, and run a short generation as a smoke test.
            autoround.quantize_and_save(output_dir=tmpdirname)
            model = AutoModelForCausalLM.from_pretrained(tmpdirname, dtype=torch.float16, device_map=torch_device)
            text = "There is a girl who likes adventure,"
            inputs = tokenizer(text, return_tensors="pt").to(model.device)
            tokenizer.decode(model.generate(**inputs, max_new_tokens=5)[0])
||||
@@ -0,0 +1,225 @@
|
||||
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import gc
|
||||
import unittest
|
||||
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoModelForCausalLM,
|
||||
AutoTokenizer,
|
||||
BitNetQuantConfig,
|
||||
OPTForCausalLM,
|
||||
)
|
||||
from transformers.testing_utils import (
|
||||
backend_empty_cache,
|
||||
require_accelerate,
|
||||
require_torch_gpu,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
from transformers.utils import is_accelerate_available, is_torch_available
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_accelerate_available():
|
||||
from accelerate import init_empty_weights
|
||||
|
||||
|
||||
@require_torch_gpu
class BitNetQuantConfigTest(unittest.TestCase):
    def test_to_dict(self):
        """
        Simple test that checks if one uses a config and converts it to a dict, the dict is the same as the config object
        """
        config = BitNetQuantConfig()
        serialized = config.to_dict()

        # Every serialized entry must mirror the corresponding config attribute.
        for key, value in serialized.items():
            self.assertEqual(getattr(config, key), value)
||||
|
||||
|
||||
@slow
@require_torch_gpu
@require_accelerate
class BitNetTest(unittest.TestCase):
    """Integration tests for BitNet (1.58-bit) quantized models and the BitLinear layer."""

    model_name = "HF1BitLLM/Llama3-8B-1.58-100B-tokens"

    # called only once for all test in this class
    @classmethod
    def setUpClass(cls):
        """
        Load the model
        """
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.quantized_model = AutoModelForCausalLM.from_pretrained(cls.model_name, device_map=torch_device)

    def tearDown(self):
        # Free accelerator memory between tests.
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_replace_with_bitlinear(self):
        # Count nn.Linear modules before and BitLinear modules after conversion.
        from transformers.integrations import BitLinear, replace_with_bitnet_linear

        model_id = "facebook/opt-350m"
        config = AutoConfig.from_pretrained(model_id)

        # Meta-device instantiation: no weights are materialized, structure only.
        with init_empty_weights():
            model = OPTForCausalLM(config)

        nb_linears = 0
        for module in model.modules():
            if isinstance(module, torch.nn.Linear):
                nb_linears += 1

        model = replace_with_bitnet_linear(model)
        nb_bitnet_linear = 0
        for module in model.modules():
            if isinstance(module, BitLinear):
                nb_bitnet_linear += 1

        # NOTE(review): the "- 1" presumably accounts for one Linear (likely the
        # lm_head) being kept unconverted — confirm against replace_with_bitnet_linear.
        self.assertEqual(nb_linears - 1, nb_bitnet_linear)

    def test_quantized_model(self):
        """
        Simple test that checks if the quantized model is working properly
        """
        input_text = "What are we having for dinner?"
        expected_output = "What are we having for dinner? What are we going to do for fun this weekend?"
        input_ids = self.tokenizer(input_text, return_tensors="pt").to(torch_device)

        # Greedy decoding so the output is deterministic.
        output = self.quantized_model.generate(**input_ids, max_new_tokens=11, do_sample=False)
        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), expected_output)

    def test_packing_unpacking(self):
        """
        Simple test the packing and unpacking logic
        """

        from transformers.integrations import pack_weights, unpack_weights

        # Round-trip: unpack uint8-packed weights then repack; every byte must match.
        u = torch.randint(0, 255, (256, 256), dtype=torch.uint8)
        unpacked_u = unpack_weights(u, dtype=torch.bfloat16)
        repacked_u = pack_weights(unpacked_u)
        for i in range(u.shape[0]):
            for j in range(u.shape[1]):
                self.assertEqual(repacked_u[i][j], u[i][j])

    def test_activation_quant(self):
        """
        test the activation function behaviour
        """

        from transformers.integrations import BitLinear

        layer = BitLinear(in_features=4, out_features=2, bias=False, dtype=torch.float32)
        layer.to(torch_device)

        # All entries have |x| == 1, so quantization should be lossless here.
        input_tensor = torch.tensor([1.0, -1.0, -1.0, 1.0], dtype=torch.float32).to(torch_device)

        # Quantize the input tensor
        quantized_tensor, scale = layer.activation_quant(input_tensor)

        # Verify the output quantized tensor
        for i in range(input_tensor.shape[0]):
            self.assertEqual(quantized_tensor[i] / scale, input_tensor[i])

        # Verify the scale tensor
        # NOTE(review): 127 is consistent with int8 activation quantization where
        # scale = 127 / max(|x|) and max(|x|) == 1 for this input.
        self.assertEqual(scale, 127)

    def test_weights_dtype(self):
        """
        test the weights dtype after loading
        """

        self_attn_q = self.quantized_model.model.layers[0].self_attn.q_proj.weight
        self_attn_k = self.quantized_model.model.layers[0].self_attn.k_proj.weight
        self_attn_v = self.quantized_model.model.layers[0].self_attn.v_proj.weight
        self_attn_o = self.quantized_model.model.layers[0].self_attn.o_proj.weight
        mlp_gate = self.quantized_model.model.layers[0].mlp.gate_proj.weight
        mlp_up = self.quantized_model.model.layers[0].mlp.up_proj.weight
        mlp_down = self.quantized_model.model.layers[0].mlp.down_proj.weight

        # BitNet stores packed low-bit weights in uint8 containers.
        self.assertEqual(self_attn_q.dtype, torch.uint8)
        self.assertEqual(self_attn_k.dtype, torch.uint8)
        self.assertEqual(self_attn_v.dtype, torch.uint8)
        self.assertEqual(self_attn_o.dtype, torch.uint8)
        self.assertEqual(mlp_up.dtype, torch.uint8)
        self.assertEqual(mlp_gate.dtype, torch.uint8)
        self.assertEqual(mlp_down.dtype, torch.uint8)

    def test_replace_with_bitlinear_shape(self):
        """
        test that the BitNet layer weight shapes are correct, and the weight_scale is correctly initialized to 1
        """

        from transformers.integrations import replace_with_bitnet_linear

        out_features = 1024
        in_features = 512

        class SimpleLinearModule(torch.nn.Module):
            """
            Simple class to test BitLinear
            """

            def __init__(
                self,
                in_features: int = in_features,
                out_features: int = out_features,
                bias: bool = False,
            ):
                super().__init__()
                self.linear = torch.nn.Linear(in_features=in_features, out_features=out_features, bias=bias)

            def forward(self, x):
                return self.linear(x)

        model = SimpleLinearModule()
        replace_with_bitnet_linear(model)

        # NOTE(review): out_features // 4 matches four 2-bit values packed per
        # uint8 along the output dimension — confirm against the packing code.
        self.assertEqual(list(model.linear.weight.shape), [out_features // 4, in_features])
        self.assertEqual(model.linear.weight_scale, 1)
||||
|
||||
|
||||
@slow
@require_torch_gpu
@require_accelerate
class BitNetSerializationTest(unittest.TestCase):
    def test_model_serialization(self):
        """
        Check that a BitNet quantized model round-trips through save_pretrained /
        from_pretrained without changing its logits.
        """
        # Local import: only this test needs a scratch directory.
        import tempfile

        model_name = "HF1BitLLM/Llama3-8B-1.58-100B-tokens"
        quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map=torch_device)
        input_tensor = torch.zeros((1, 8), dtype=torch.int32, device=torch_device)

        with torch.no_grad():
            logits_ref = quantized_model.forward(input_tensor).logits

        # Save into a temporary directory instead of a hard-coded "quant_model"
        # folder in the CWD, which was never cleaned up and could collide
        # between test runs.
        with tempfile.TemporaryDirectory() as saved_model_id:
            quantized_model.save_pretrained(saved_model_id)

            # Remove old model to free accelerator memory before reloading.
            del quantized_model
            backend_empty_cache(torch_device)

            # Load and check if the logits match
            model_loaded = AutoModelForCausalLM.from_pretrained(saved_model_id, device_map=torch_device)

            with torch.no_grad():
                logits_loaded = model_loaded.forward(input_tensor).logits

        self.assertEqual((logits_loaded - logits_ref).abs().mean().item(), 0)
||||
120
transformers/tests/quantization/bnb/README.md
Normal file
120
transformers/tests/quantization/bnb/README.md
Normal file
@@ -0,0 +1,120 @@
|
||||
# Testing mixed int8 quantization
|
||||
|
||||

|
||||
|
||||
The following is the recipe on how to effectively debug `bitsandbytes` integration on Hugging Face `transformers`.
|
||||
|
||||
## Library requirements
|
||||
|
||||
+ `transformers>=4.22.0`
|
||||
+ `accelerate>=0.12.0`
|
||||
+ `bitsandbytes>=0.31.5`.
|
||||
## Hardware requirements
|
||||
|
||||
The following instructions are tested with 2 NVIDIA-Tesla T4 GPUs. To run successfully `bitsandbytes` you would need an 8-bit core tensor supported GPU. Note that Turing, Ampere or newer architectures - e.g. T4, RTX20s, RTX30s, A40-A100, A6000 should be supported.
|
||||
|
||||
## Virtual envs
|
||||
|
||||
```bash
|
||||
conda create --name int8-testing python==3.8
|
||||
pip install bitsandbytes>=0.31.5
|
||||
pip install accelerate>=0.12.0
|
||||
pip install transformers>=4.23.0
|
||||
```
|
||||
if `transformers>=4.23.0` is not released yet, then use:
|
||||
```bash
|
||||
pip install git+https://github.com/huggingface/transformers.git
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
A list of common errors:
|
||||
|
||||
### Torch does not correctly do the operations on GPU
|
||||
|
||||
First check that:
|
||||
|
||||
```py
|
||||
import torch
|
||||
|
||||
vec = torch.randn(1, 2, 3).to(0)
|
||||
```
|
||||
|
||||
Works without any error. If not, install torch using `conda` like:
|
||||
|
||||
```bash
|
||||
conda create --name int8-testing python==3.8
|
||||
conda install pytorch torchvision torchaudio cudatoolkit=11.6 -c pytorch -c conda-forge
|
||||
pip install bitsandbytes>=0.31.5
|
||||
pip install accelerate>=0.12.0
|
||||
pip install transformers>=4.23.0
|
||||
```
|
||||
For the latest pytorch instructions please see [this](https://pytorch.org/get-started/locally/)
|
||||
|
||||
and the snippet above should work.
|
||||
|
||||
### ` bitsandbytes operations are not supported under CPU!`
|
||||
|
||||
This happens when some Linear weights are set to the CPU when using `accelerate`. Please check carefully `model.hf_device_map` and make sure that there is no `Linear` module that is assigned to CPU. It is fine to have the last module (usually the Lm_head) set on CPU.
|
||||
|
||||
### `To use the type as a Parameter, please correct the detach() semantics defined by __torch_dispatch__() implementation.`
|
||||
|
||||
Use the latest version of `accelerate` with a command such as: `pip install -U accelerate` and the problem should be solved.
|
||||
|
||||
### `Parameter has no attribute .CB`
|
||||
|
||||
Same solution as above.
|
||||
|
||||
### `RuntimeError: CUDA error: an illegal memory access was encountered ... consider passing CUDA_LAUNCH_BLOCKING=1`
|
||||
|
||||
Run your script by prepending `CUDA_LAUNCH_BLOCKING=1` and you should observe an error as described in the next section.
|
||||
|
||||
### `CUDA illegal memory error: an illegal memory access at line...`:
|
||||
|
||||
Check the CUDA versions with:
|
||||
```bash
|
||||
nvcc --version
|
||||
```
|
||||
and confirm it is the same version as the one detected by `bitsandbytes`. If not, run:
|
||||
```bash
|
||||
ls -l $CONDA_PREFIX/lib/libcudart.so
|
||||
```
|
||||
or
|
||||
```bash
|
||||
ls -l $LD_LIBRARY_PATH
|
||||
```
|
||||
Check if `libcudart.so` has a correct symlink that is set. Sometimes `nvcc` detects the correct CUDA version but `bitsandbytes` doesn't. You have to make sure that the symlink that is set for the file `libcudart.so` is redirected to the correct CUDA file.
|
||||
|
||||
Here is an example of a badly configured CUDA installation:
|
||||
|
||||
`nvcc --version` gives:
|
||||
|
||||

|
||||
|
||||
which means that the detected CUDA version is 11.3 but `bitsandbytes` outputs:
|
||||
|
||||

|
||||
|
||||
First check:
|
||||
|
||||
```bash
|
||||
echo $LD_LIBRARY_PATH
|
||||
```
|
||||
|
||||
If this contains multiple paths separated by `:`, then you have to make sure that the correct CUDA version is set by doing:
|
||||
|
||||
```bash
|
||||
ls -l $path/libcudart.so
|
||||
```
|
||||
|
||||
On each path (`$path`) separated by `:`.
|
||||
If not, simply run
|
||||
```bash
|
||||
ls -l $LD_LIBRARY_PATH/libcudart.so
|
||||
```
|
||||
|
||||
and you can see
|
||||
|
||||

|
||||
|
||||
If you see that the file is linked to the wrong CUDA version (here 10.2), find the correct location for `libcudart.so` (`find / -name libcudart.so`) and replace the environment variable `LD_LIBRARY_PATH` with the one containing the correct `libcudart.so` file.
|
||||
0
transformers/tests/quantization/bnb/__init__.py
Normal file
0
transformers/tests/quantization/bnb/__init__.py
Normal file
868
transformers/tests/quantization/bnb/test_4bit.py
Normal file
868
transformers/tests/quantization/bnb/test_4bit.py
Normal file
@@ -0,0 +1,868 @@
|
||||
# Copyright 2022 The HuggingFace Team Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import gc
|
||||
import importlib.metadata
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
import pytest
|
||||
from packaging import version
|
||||
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoModel,
|
||||
AutoModelForCausalLM,
|
||||
AutoModelForSeq2SeqLM,
|
||||
AutoModelForSequenceClassification,
|
||||
AutoTokenizer,
|
||||
BitsAndBytesConfig,
|
||||
pipeline,
|
||||
set_seed,
|
||||
)
|
||||
from transformers.models.opt.modeling_opt import OPTAttention
|
||||
from transformers.testing_utils import (
|
||||
apply_skip_if_not_implemented,
|
||||
backend_empty_cache,
|
||||
backend_torch_accelerator_module,
|
||||
is_bitsandbytes_available,
|
||||
is_torch_available,
|
||||
require_accelerate,
|
||||
require_bitsandbytes,
|
||||
require_torch,
|
||||
require_torch_gpu_if_bnb_not_multi_backend_enabled,
|
||||
require_torch_multi_accelerator,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
|
||||
|
||||
def get_some_linear_layer(model):
    """Return one representative linear layer of *model*, chosen by architecture.

    Used by the quantization tests to inspect the class/dtype of a converted
    linear module without hard-coding a single model family.
    """
    model_type = model.config.model_type
    if model_type == "gpt2":
        return model.transformer.h[0].mlp.c_fc
    if model_type == "opt":
        try:
            return model.decoder.layers[0].fc1
        except AttributeError:
            # for AutoModelforCausalLM
            return model.model.decoder.layers[0].fc1
    if model_type == "llama":
        return model.model.layers[0].mlp.gate_proj
    # Fallback for bloom-style architectures.
    return model.transformer.h[0].mlp.dense_4h_to_h
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
class LoRALayer(nn.Module):
    """Wraps a linear layer with LoRA-like adapter - Used for testing purposes only"""

    def __init__(self, module: nn.Module, rank: int):
        super().__init__()
        self.module = module
        down_proj = nn.Linear(module.in_features, rank, bias=False)
        up_proj = nn.Linear(rank, module.out_features, bias=False)
        # Small normal init on the down-projection, zeros on the up-projection:
        # the adapter therefore contributes nothing until it is trained.
        small_std = (2.0 / (5 * min(module.in_features, module.out_features))) ** 0.5
        nn.init.normal_(down_proj.weight, std=small_std)
        nn.init.zeros_(up_proj.weight)
        self.adapter = nn.Sequential(down_proj, up_proj)
        self.adapter.to(module.weight.device)

    def forward(self, input, *args, **kwargs):
        base_output = self.module(input, *args, **kwargs)
        return base_output + self.adapter(input)
||||
|
||||
|
||||
if is_bitsandbytes_available():
|
||||
import bitsandbytes as bnb
|
||||
|
||||
|
||||
@require_bitsandbytes
@require_accelerate
@require_torch
@require_torch_gpu_if_bnb_not_multi_backend_enabled
@slow
class Base4bitTest(unittest.TestCase):
    """Shared fixtures (model name, expected outputs, tokenizer) for the bnb 4-bit tests."""

    # We keep the constants inside the init function and model loading inside setUp function

    # We need to test on relatively large models (aka >1b parameters otherwise the quantization may not work as expected)
    # Therefore here we use only bloom-1b3 to test our module
    model_name = "bigscience/bloom-1b7"

    # Constant values
    # Expected fp16/4-bit memory-footprint ratio (hardware dependent, hence the tolerance in the test).
    EXPECTED_RELATIVE_DIFFERENCE = (
        2.109659552692574  # This was obtained on a RTX Titan so the number might slightly change
    )

    input_text = "Hello my name is"
    # Any of these continuations is accepted: greedy decoding differs slightly across hardware.
    EXPECTED_OUTPUTS = set()
    EXPECTED_OUTPUTS.add("Hello my name is John and I am a professional photographer. I")
    EXPECTED_OUTPUTS.add("Hello my name is John.\nI am a friend of your father.\n")
    EXPECTED_OUTPUTS.add("Hello my name is John Doe, I am a student at the University")
    EXPECTED_OUTPUTS.add("Hello my name is John and I am 25 years old.")
    EXPECTED_OUTPUTS.add("Hello my name is John and I am a student at the University of")
    # Expected values on Intel XPU and NV A100
    EXPECTED_OUTPUTS.add("Hello my name is Alina. I have been working as a professional")
    MAX_NEW_TOKENS = 10

    def setUp(self):
        # Models and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
||||
|
||||
|
||||
@apply_skip_if_not_implemented
|
||||
class Bnb4BitTest(Base4bitTest):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
|
||||
# Models and tokenizer
|
||||
self.model_fp16 = AutoModelForCausalLM.from_pretrained(self.model_name, dtype=torch.float16, device_map="auto")
|
||||
self.model_4bit = AutoModelForCausalLM.from_pretrained(self.model_name, load_in_4bit=True, device_map="auto")
|
||||
|
||||
def tearDown(self):
|
||||
r"""
|
||||
TearDown function needs to be called at the end of each test to free the GPU memory and cache, also to
|
||||
avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
|
||||
"""
|
||||
del self.model_fp16
|
||||
del self.model_4bit
|
||||
|
||||
gc.collect()
|
||||
backend_empty_cache(torch_device)
|
||||
|
||||
def test_quantization_num_parameters(self):
|
||||
r"""
|
||||
Test if the number of returned parameters is correct
|
||||
|
||||
See: https://github.com/huggingface/transformers/issues/25978
|
||||
"""
|
||||
num_params_4bit = self.model_4bit.num_parameters()
|
||||
num_params_fp16 = self.model_fp16.num_parameters()
|
||||
|
||||
self.assertEqual(num_params_4bit, num_params_fp16)
|
||||
|
||||
def test_quantization_config_json_serialization(self):
|
||||
r"""
|
||||
A simple test to check if the quantization config is correctly serialized and deserialized
|
||||
"""
|
||||
config = self.model_4bit.config
|
||||
|
||||
self.assertTrue(hasattr(config, "quantization_config"))
|
||||
|
||||
_ = config.to_dict()
|
||||
_ = config.to_diff_dict()
|
||||
|
||||
_ = config.to_json_string()
|
||||
|
||||
def test_memory_footprint(self):
|
||||
r"""
|
||||
A simple test to check if the model conversion has been done correctly by checking on the
|
||||
memory footprint of the converted model and the class type of the linear layers of the converted models
|
||||
"""
|
||||
from bitsandbytes.nn import Params4bit
|
||||
|
||||
mem_fp16 = self.model_fp16.get_memory_footprint()
|
||||
mem_4bit = self.model_4bit.get_memory_footprint()
|
||||
|
||||
self.assertAlmostEqual(mem_fp16 / mem_4bit, self.EXPECTED_RELATIVE_DIFFERENCE, delta=1e-5)
|
||||
linear = get_some_linear_layer(self.model_4bit)
|
||||
self.assertTrue(linear.weight.__class__ == Params4bit)
|
||||
|
||||
def test_original_dtype(self):
|
||||
r"""
|
||||
A simple test to check if the model successfully stores the original dtype
|
||||
"""
|
||||
self.assertTrue(hasattr(self.model_4bit.config, "_pre_quantization_dtype"))
|
||||
self.assertFalse(hasattr(self.model_fp16.config, "_pre_quantization_dtype"))
|
||||
self.assertTrue(self.model_4bit.config._pre_quantization_dtype == torch.float16)
|
||||
|
||||
def test_linear_are_4bit(self):
    r"""
    A simple test to check if the model conversion has been done correctly by checking on the
    memory footprint of the converted model and the class type of the linear layers of the converted models
    """
    from transformers import T5PreTrainedModel

    self.model_fp16.get_memory_footprint()
    self.model_4bit.get_memory_footprint()

    for name, module in self.model_4bit.named_modules():
        if isinstance(module, torch.nn.Linear):
            # `lm_head` and modules listed in `_keep_in_fp32_modules` are deliberately
            # left unquantized, so they are excluded from the dtype check.
            if name not in ["lm_head"] + T5PreTrainedModel._keep_in_fp32_modules:
                # 4-bit parameters are packed in uint8 variables
                self.assertTrue(module.weight.dtype == torch.uint8)
|
||||
|
||||
def test_rwkv_4bit(self):
    r"""
    A simple test to check if 4-bit RWKV inference works as expected.
    """
    model_id = "RWKV/rwkv-4-169m-pile"
    quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True)

    tok = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quant_config)

    input_ids = tok.encode("Hello my name is", return_tensors="pt").to(torch_device)
    # Smoke test: generation must simply run without raising.
    _ = model.generate(input_ids, max_new_tokens=30)
|
||||
|
||||
def test_generate_quality(self):
    r"""
    Test the generation quality of the quantized model and see that we are matching the expected output.
    Given that we are operating on small numbers + the testing model is relatively small, we might not get
    the same output across GPUs. So we'll generate few tokens (5-10) and check their output.
    """
    inputs = self.tokenizer(self.input_text, return_tensors="pt")
    generated = self.model_4bit.generate(
        input_ids=inputs["input_ids"].to(self.model_4bit.device), max_new_tokens=10
    )
    decoded = self.tokenizer.decode(generated[0], skip_special_tokens=True)
    self.assertIn(decoded, self.EXPECTED_OUTPUTS)
|
||||
|
||||
def test_generate_quality_config(self):
    r"""
    Test that loading the model with the config is equivalent
    """
    bnb_config = BitsAndBytesConfig()
    # Exercise the attribute-assignment path instead of the constructor argument.
    bnb_config.load_in_4bit = True

    model_4bit_from_config = AutoModelForCausalLM.from_pretrained(
        self.model_name, quantization_config=bnb_config, device_map="auto"
    )

    encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
    output_sequences = model_4bit_from_config.generate(
        input_ids=encoded_input["input_ids"].to(model_4bit_from_config.device), max_new_tokens=10
    )

    # Same quality bar as `test_generate_quality`: decoded text must be among known outputs.
    self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
|
||||
|
||||
def test_generate_quality_dequantize(self):
    r"""
    Test that loading the model and unquantize it produce correct results
    """
    bnb_config = BitsAndBytesConfig(load_in_4bit=True)

    model_4bit = AutoModelForCausalLM.from_pretrained(
        self.model_name, quantization_config=bnb_config, device_map="auto"
    )

    # Convert the 4-bit weights back to a dense floating-point model in place.
    model_4bit.dequantize()

    encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
    output_sequences = model_4bit.generate(
        input_ids=encoded_input["input_ids"].to(model_4bit.device), max_new_tokens=10
    )

    # The dequantized model should still generate one of the expected completions.
    self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
|
||||
|
||||
def test_clear_quantization_trace(self):
    r"""
    Test that dequantizing the model won't leave any attribute relative to quantization in the model's configuration
    """
    quant_config = BitsAndBytesConfig(load_in_4bit=True)
    model = AutoModelForCausalLM.from_pretrained(
        self.model_name, quantization_config=quant_config, device_map="auto"
    )
    model.dequantize()

    # Neither the model nor its config may keep any quantization trace.
    for model_attr in ("hf_quantizer", "quantization_method"):
        self.assertFalse(hasattr(model, model_attr))
    for config_attr in ("quantization_config", "_pre_quantization_dtype"):
        self.assertFalse(hasattr(model.config, config_attr))
    self.assertFalse(model.is_quantized)
|
||||
|
||||
def test_to_device_dequantized(self):
    r"""
    Test that dequantizing the model won't prevent converting it to a different dtype
    """
    model = AutoModelForCausalLM.from_pretrained(
        self.model_name,
        quantization_config=BitsAndBytesConfig(load_in_4bit=True),
        device_map="auto",
    )
    model.dequantize()
    # Casting must succeed now that the model is no longer quantized.
    model.to(dtype=torch.float16)
|
||||
|
||||
def test_device_assignment(self):
    """Moving a 4-bit model between devices must keep its memory footprint unchanged."""
    # `.to()` on quantized models is only supported from bitsandbytes 0.43.2 onwards.
    if version.parse(importlib.metadata.version("bitsandbytes")) < version.parse("0.43.2"):
        self.skipTest(reason="This test requires bitsandbytes >= 0.43.2")

    mem_before = self.model_4bit.get_memory_footprint()

    # Move to CPU
    self.model_4bit.to("cpu")
    self.assertEqual(self.model_4bit.device.type, "cpu")
    self.assertAlmostEqual(self.model_4bit.get_memory_footprint(), mem_before)

    if torch_device in ["cuda", "xpu"]:
        # Move back to CUDA device
        self.model_4bit.to(torch_device)
        self.assertEqual(self.model_4bit.device.type, torch_device)
        self.assertAlmostEqual(self.model_4bit.get_memory_footprint(), mem_before)
|
||||
|
||||
def test_device_and_dtype_assignment(self):
    r"""
    Test whether attempting to change the device or cast the dtype of a model
    after converting it to 4-bit precision will raise an appropriate error.
    The test ensures that such operations are prohibited on 4-bit models
    to prevent invalid conversions.
    """

    # Moving with `to` or `cuda` is not supported with versions < 0.43.2.
    if version.parse(importlib.metadata.version("bitsandbytes")) < version.parse("0.43.2"):
        with self.assertRaises(ValueError):
            # Tries with `str`
            self.model_4bit.to("cpu")

        with self.assertRaises(ValueError):
            # Tries with a `device`
            self.model_4bit.to(torch.device("cuda:0"))

        with self.assertRaises(ValueError):
            # Tries with `cuda`
            self.model_4bit.cuda()

    # dtype casts are rejected regardless of the bitsandbytes version.
    with self.assertRaises(ValueError):
        # Tries with a `dtype`
        self.model_4bit.to(torch.float16)

    with self.assertRaises(ValueError):
        # Tries to cast the 4-bit model to float32 using `float()`
        self.model_4bit.float()

    with self.assertRaises(ValueError):
        # Tries to cast the 4-bit model to float16 using `half()`
        self.model_4bit.half()

    # Test if we did not break anything
    self.model_4bit.to(torch.device(torch_device))

    encoded_input = self.tokenizer(self.input_text, return_tensors="pt")

    # The non-quantized model must still support all casts/moves after the above.
    self.model_fp16 = self.model_fp16.to(torch.float32)
    _ = self.model_fp16.generate(
        input_ids=encoded_input["input_ids"].to(self.model_fp16.device), max_new_tokens=10
    )

    if torch_device in ["cuda", "xpu"]:
        # Check that this does not throw an error
        _ = self.model_fp16.to(torch_device)

    # Check this does not throw an error
    _ = self.model_fp16.to("cpu")

    # Check this does not throw an error
    _ = self.model_fp16.half()

    # Check this does not throw an error
    _ = self.model_fp16.float()
|
||||
|
||||
def test_fp32_4bit_conversion(self):
    r"""
    Test whether it is possible to mix both `4bit` and `fp32` weights when using `keep_in_fp32_modules` correctly.
    """
    model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small", load_in_4bit=True, device_map="auto")
    # The `wo` projection is kept in fp32 by T5's `keep_in_fp32_modules` list.
    wo_weight = model.decoder.block[0].layer[2].DenseReluDense.wo.weight
    self.assertTrue(wo_weight.dtype == torch.float32)
|
||||
|
||||
def test_bnb_4bit_wrong_config(self):
    r"""
    Test whether creating a bnb config with unsupported values leads to errors.
    """
    with self.assertRaises(ValueError):
        # "add" is not a valid storage dtype for 4-bit quantization.
        BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_storage="add")
|
||||
|
||||
|
||||
@require_bitsandbytes
@require_accelerate
@require_torch
@require_torch_gpu_if_bnb_not_multi_backend_enabled
@slow
@apply_skip_if_not_implemented
class Bnb4BitT5Test(unittest.TestCase):
    """4-bit tests for T5, exercising the `keep_in_fp32_modules` code path."""

    @classmethod
    def setUpClass(cls):
        # Shared fixtures: model names, tokenizer and prompt reused by every test.
        cls.model_name = "google-t5/t5-small"
        cls.dense_act_model_name = "google/flan-t5-small"  # flan-t5 uses dense-act instead of dense-relu-dense
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.input_text = "Translate in German: Hello, my dog is cute"

    def tearDown(self):
        r"""
        TearDown function needs to be called at the end of each test to free the GPU memory and cache, also to
        avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
        """
        gc.collect()
        backend_empty_cache(torch_device)

    def test_inference_without_keep_in_fp32(self):
        r"""
        Test whether it is possible to mix both `4bit` and `fp32` weights when using `keep_in_fp32_modules` correctly.
        `flan-t5-small` uses `T5DenseGatedActDense` whereas `google-t5/t5-small` uses `T5DenseReluDense`. We need to test
        both cases.
        """
        from transformers import T5ForConditionalGeneration

        # Temporarily disable the fp32-module list to exercise the fully-quantized path.
        modules = T5ForConditionalGeneration._keep_in_fp32_modules
        T5ForConditionalGeneration._keep_in_fp32_modules = None

        # test with `google-t5/t5-small`
        model = T5ForConditionalGeneration.from_pretrained(self.model_name, load_in_4bit=True, device_map="auto")
        encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(model.device)
        _ = model.generate(**encoded_input)

        # test with `flan-t5-small`
        model = T5ForConditionalGeneration.from_pretrained(
            self.dense_act_model_name, load_in_4bit=True, device_map="auto"
        )
        encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(model.device)
        _ = model.generate(**encoded_input)
        # Restore the class attribute so later tests see the original list.
        T5ForConditionalGeneration._keep_in_fp32_modules = modules

    def test_inference_with_keep_in_fp32(self):
        r"""
        Test whether it is possible to mix both `4bit` and `fp32` weights when using `keep_in_fp32_modules` correctly.
        `flan-t5-small` uses `T5DenseGatedActDense` whereas `google-t5/t5-small` uses `T5DenseReluDense`. We need to test
        both cases.
        """
        from transformers import T5ForConditionalGeneration

        # test with `google-t5/t5-small`
        model = T5ForConditionalGeneration.from_pretrained(self.model_name, load_in_4bit=True, device_map="auto")

        # there was a bug with decoders - this test checks that it is fixed
        self.assertTrue(isinstance(model.decoder.block[0].layer[0].SelfAttention.q, bnb.nn.Linear4bit))

        encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(model.device)
        _ = model.generate(**encoded_input)

        # test with `flan-t5-small`
        model = T5ForConditionalGeneration.from_pretrained(
            self.dense_act_model_name, load_in_4bit=True, device_map="auto"
        )
        encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(model.device)
        _ = model.generate(**encoded_input)
|
||||
|
||||
|
||||
@apply_skip_if_not_implemented
class Classes4BitModelTest(Base4bitTest):
    """Check 4-bit loading across the different Auto* head classes."""

    def setUp(self):
        super().setUp()
        # model_name
        self.model_name = "bigscience/bloom-560m"
        self.seq_to_seq_name = "google-t5/t5-small"

        # Different types of model

        self.base_model = AutoModel.from_pretrained(self.model_name, load_in_4bit=True, device_map="auto")
        # Sequence classification model
        self.sequence_model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name, load_in_4bit=True, device_map="auto"
        )
        # CausalLM model
        self.model_4bit = AutoModelForCausalLM.from_pretrained(self.model_name, load_in_4bit=True, device_map="auto")
        # Seq2seq model
        self.seq_to_seq_model = AutoModelForSeq2SeqLM.from_pretrained(
            self.seq_to_seq_name, load_in_4bit=True, device_map="auto"
        )

    def tearDown(self):
        r"""
        TearDown function needs to be called at the end of each test to free the GPU memory and cache, also to
        avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
        """
        del self.base_model
        del self.sequence_model
        del self.model_4bit
        del self.seq_to_seq_model

        gc.collect()
        backend_empty_cache(torch_device)

    def test_correct_head_class(self):
        r"""
        A simple test to check if the last modules for some classes (AutoModelForCausalLM or SequenceClassification)
        are kept in their native class.
        """
        from bitsandbytes.nn import Params4bit

        # Body weights are quantized...
        self.assertTrue(self.base_model.h[-1].mlp.dense_4h_to_h.weight.__class__ == Params4bit)

        # Other heads should be nn.Parameter
        self.assertTrue(self.model_4bit.lm_head.weight.__class__ == torch.nn.Parameter)
        self.assertTrue(self.sequence_model.score.weight.__class__ == torch.nn.Parameter)
        self.assertTrue(self.seq_to_seq_model.lm_head.weight.__class__ == torch.nn.Parameter)
|
||||
|
||||
|
||||
@apply_skip_if_not_implemented
class Pipeline4BitTest(Base4bitTest):
    """Verify that 4-bit models work through the high-level `pipeline` API."""

    def setUp(self):
        super().setUp()

    def tearDown(self):
        r"""
        TearDown function needs to be called at the end of each test to free the GPU memory and cache, also to
        avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
        """
        # `self.pipe` only exists if test_pipeline ran; guard the cleanup.
        if hasattr(self, "pipe"):
            del self.pipe

        gc.collect()
        backend_empty_cache(torch_device)

    def test_pipeline(self):
        r"""
        The aim of this test is to verify that the mixed 4bit is compatible with `pipeline` from transformers. Since
        we used pipeline for inference speed benchmarking we want to make sure that this feature does not break anything
        on pipeline.
        """
        # self._clear_cuda_cache()
        self.pipe = pipeline(
            "text-generation",
            model=self.model_name,
            model_kwargs={
                "device_map": "auto",
                "load_in_4bit": True,
                # float16 isn't supported on CPU, use bfloat16 instead
                "dtype": torch.bfloat16 if torch_device == "cpu" else torch.float16,
            },
            max_new_tokens=self.MAX_NEW_TOKENS,
        )

        # Avoid sampling different outputs
        set_seed(42)
        # Real second forward pass
        pipeline_output = self.pipe(self.input_text)
        self.assertIn(pipeline_output[0]["generated_text"], self.EXPECTED_OUTPUTS)
|
||||
|
||||
|
||||
@require_torch_multi_accelerator
@apply_skip_if_not_implemented
class Bnb4bitTestMultiAccelerator(Base4bitTest):
    """4-bit loading sharded across two accelerators via an explicit device map."""

    def setUp(self):
        super().setUp()

    def test_multi_accelerator_loading(self):
        r"""
        This tests that the model has been loaded and can be used correctly on a multi-accelerator setup.
        Let's just try to load a model on 2 accelerators and see if it works. The model we test has ~2GB of total, 3GB should suffice
        """
        # Hand-crafted map placing a slice of the transformer blocks on device 1.
        device_map = {
            "transformer.word_embeddings": 0,
            "transformer.word_embeddings_layernorm": 0,
            "lm_head": 0,
            "transformer.h.0": 0,
            "transformer.h.1": 0,
            "transformer.h.2": 0,
            "transformer.h.3": 0,
            "transformer.h.4": 0,
            "transformer.h.5": 0,
            "transformer.h.6": 0,
            "transformer.h.7": 0,
            "transformer.h.8": 0,
            "transformer.h.9": 0,
            "transformer.h.10": 1,
            "transformer.h.11": 1,
            "transformer.h.12": 1,
            "transformer.h.13": 1,
            "transformer.h.14": 1,
            "transformer.h.15": 1,
            "transformer.h.16": 1,
            "transformer.h.17": 0,
            "transformer.h.18": 0,
            "transformer.h.19": 0,
            "transformer.h.20": 0,
            "transformer.h.21": 0,
            "transformer.h.22": 0,
            "transformer.h.23": 1,
            "transformer.ln_f": 0,
        }

        model_parallel = AutoModelForCausalLM.from_pretrained(
            self.model_name, load_in_4bit=True, device_map=device_map
        )

        # Check correct device map
        self.assertEqual(set(model_parallel.hf_device_map.values()), {0, 1})

        # Check that inference pass works on the model
        encoded_input = self.tokenizer(self.input_text, return_tensors="pt")

        # Second real batch
        output_parallel = model_parallel.generate(
            input_ids=encoded_input["input_ids"].to(torch_device), max_new_tokens=10
        )
        self.assertIn(self.tokenizer.decode(output_parallel[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)
|
||||
|
||||
|
||||
@apply_skip_if_not_implemented
class Bnb4BitTestTraining(Base4bitTest):
    """Train LoRA-style adapters on top of a frozen 4-bit base model."""

    def setUp(self):
        self.model_name = "facebook/opt-350m"
        super().setUp()

    def test_training(self):
        # Gradients through frozen 4-bit layers require bitsandbytes >= 0.37.0.
        if version.parse(importlib.metadata.version("bitsandbytes")) < version.parse("0.37.0"):
            self.skipTest(reason="This test requires bitsandbytes >= 0.37.0")

        # Step 1: freeze all parameters
        model = AutoModelForCausalLM.from_pretrained(self.model_name, load_in_4bit=True)

        if torch_device in ["cuda", "xpu"]:
            self.assertEqual(
                set(model.hf_device_map.values()), {backend_torch_accelerator_module(torch_device).current_device()}
            )
        else:
            self.assertTrue(all(param.device.type == "cpu" for param in model.parameters()))

        for param in model.parameters():
            param.requires_grad = False  # freeze the model - train adapters later
            if param.ndim == 1:
                # cast the small parameters (e.g. layernorm) to fp32 for stability
                param.data = param.data.to(torch.float32)

        # Step 2: add adapters
        for _, module in model.named_modules():
            if isinstance(module, OPTAttention):
                module.q_proj = LoRALayer(module.q_proj, rank=16)
                module.k_proj = LoRALayer(module.k_proj, rank=16)
                module.v_proj = LoRALayer(module.v_proj, rank=16)

        # Step 3: dummy batch
        batch = self.tokenizer("Test batch ", return_tensors="pt").to(torch_device)

        # Step 4: Check if the gradient is not None
        with torch.autocast(torch_device):
            out = model.forward(**batch)
            out.logits.norm().backward()

        for module in model.modules():
            if isinstance(module, LoRALayer):
                # Adapter weights must receive non-trivial gradients...
                self.assertTrue(module.adapter[1].weight.grad is not None)
                self.assertTrue(module.adapter[1].weight.grad.norm().item() > 0)
            elif isinstance(module, nn.Embedding):
                # ...while frozen base weights must not.
                self.assertTrue(module.weight.grad is None)
|
||||
|
||||
|
||||
@apply_skip_if_not_implemented
class Bnb4BitGPT2Test(Bnb4BitTest):
    """Run the full Bnb4BitTest suite against a GPT-2 XL checkpoint."""

    model_name = "openai-community/gpt2-xl"
    # Expected fp16/4-bit memory-footprint ratio for this checkpoint.
    EXPECTED_RELATIVE_DIFFERENCE = 3.3191854854152187
|
||||
|
||||
|
||||
@apply_skip_if_not_implemented
class Bnb4BitLlamaTest(Bnb4BitTest):
    """Run the full Bnb4BitTest suite against a TinyLlama checkpoint."""

    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    # Expected fp16/4-bit memory-footprint ratio for this checkpoint.
    EXPECTED_RELATIVE_DIFFERENCE = 2.9461410686392764
|
||||
|
||||
|
||||
@require_bitsandbytes
@require_accelerate
@require_torch
@require_torch_gpu_if_bnb_not_multi_backend_enabled
@slow
@apply_skip_if_not_implemented
class BaseSerializationTest(unittest.TestCase):
    """Round-trip a 4-bit model through save_pretrained/from_pretrained and compare."""

    model_name = "facebook/opt-125m"
    input_text = "Mars colonists' favorite meals are"

    def tearDown(self):
        gc.collect()
        backend_empty_cache(torch_device)

    def test_serialization(self, quant_type="nf4", double_quant=True, safe_serialization=True):
        r"""
        Test whether it is possible to serialize a model in 4-bit. Uses most typical params as default.
        See ExtendedSerializationTest class for more params combinations.
        """

        tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        self.quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type=quant_type,
            bnb_4bit_use_double_quant=double_quant,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        model_0 = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            quantization_config=self.quantization_config,
            device_map=torch_device,
        )

        with tempfile.TemporaryDirectory() as tmpdirname:
            model_0.save_pretrained(tmpdirname, safe_serialization=safe_serialization)

            # The saved config must embed the quantization settings.
            config = AutoConfig.from_pretrained(tmpdirname)
            self.assertTrue(hasattr(config, "quantization_config"))

            model_1 = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=torch_device)

            # checking quantized linear module weight
            linear = get_some_linear_layer(model_1)
            self.assertTrue(linear.weight.__class__ == bnb.nn.Params4bit)
            self.assertTrue(hasattr(linear.weight, "quant_state"))
            self.assertTrue(linear.weight.quant_state.__class__ == bnb.functional.QuantState)

            # checking memory footprint
            self.assertAlmostEqual(model_0.get_memory_footprint() / model_1.get_memory_footprint(), 1, places=2)

            # Matching all parameters and their quant_state items:
            d0 = dict(model_0.named_parameters())
            d1 = dict(model_1.named_parameters())
            self.assertTrue(d0.keys() == d1.keys())

            for k in d0:
                self.assertTrue(d0[k].shape == d1[k].shape)
                self.assertTrue(d0[k].device.type == d1[k].device.type)
                self.assertTrue(d0[k].device == d1[k].device)
                self.assertTrue(d0[k].dtype == d1[k].dtype)
                self.assertTrue(torch.equal(d0[k], d1[k].to(d0[k].device)))

                if isinstance(d0[k], bnb.nn.modules.Params4bit):
                    # quant_state carries the packed metadata (absmax, code, etc.);
                    # every entry must survive the round-trip.
                    for v0, v1 in zip(
                        d0[k].quant_state.as_dict().values(),
                        d1[k].quant_state.as_dict().values(),
                    ):
                        if isinstance(v0, torch.Tensor):
                            # The absmax will not be saved in the quant_state when using NF4 in CPU
                            if v0.numel() != 0:
                                self.assertTrue(torch.equal(v0, v1.to(v0.device)))
                        else:
                            self.assertTrue(v0 == v1)

            # comparing forward() outputs
            encoded_input = tokenizer(self.input_text, return_tensors="pt").to(torch_device)
            out_0 = model_0(**encoded_input)
            out_1 = model_1(**encoded_input)
            torch.testing.assert_close(out_0["logits"], out_1["logits"], rtol=0.05, atol=0.05)

            # comparing generate() outputs
            encoded_input = tokenizer(self.input_text, return_tensors="pt").to(torch_device)
            output_sequences_0 = model_0.generate(**encoded_input, max_new_tokens=10)
            output_sequences_1 = model_1.generate(**encoded_input, max_new_tokens=10)

            def _decode(token):
                return tokenizer.decode(token, skip_special_tokens=True)

            self.assertEqual(
                [_decode(x) for x in output_sequences_0],
                [_decode(x) for x in output_sequences_1],
            )
|
||||
|
||||
|
||||
@apply_skip_if_not_implemented
class ExtendedSerializationTest(BaseSerializationTest):
    """
    tests more combinations of parameters
    """

    # Each test delegates to the parent's parameterized test_serialization().

    def test_nf4_single_unsafe(self):
        self.test_serialization(quant_type="nf4", double_quant=False, safe_serialization=False)

    def test_nf4_single_safe(self):
        self.test_serialization(quant_type="nf4", double_quant=False, safe_serialization=True)

    def test_nf4_double_unsafe(self):
        self.test_serialization(quant_type="nf4", double_quant=True, safe_serialization=False)

    # nf4 double safetensors quantization is tested in test_serialization() method from the parent class

    def test_fp4_single_unsafe(self):
        self.test_serialization(quant_type="fp4", double_quant=False, safe_serialization=False)

    def test_fp4_single_safe(self):
        self.test_serialization(quant_type="fp4", double_quant=False, safe_serialization=True)

    def test_fp4_double_unsafe(self):
        self.test_serialization(quant_type="fp4", double_quant=True, safe_serialization=False)

    def test_fp4_double_safe(self):
        self.test_serialization(quant_type="fp4", double_quant=True, safe_serialization=True)
|
||||
|
||||
|
||||
class BloomSerializationTest(BaseSerializationTest):
    """
    default BaseSerializationTest config tested with Bloom family model
    """

    model_name = "bigscience/bloom-560m"
|
||||
|
||||
|
||||
class GPTSerializationTest(BaseSerializationTest):
    """
    default BaseSerializationTest config tested with GPT family model
    """

    model_name = "openai-community/gpt2-xl"
|
||||
|
||||
|
||||
class LlamaSerializationTest(BaseSerializationTest):
    """
    default BaseSerializationTest config tested with Llama family model
    """

    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
|
||||
|
||||
|
||||
@require_bitsandbytes
@require_accelerate
@require_torch_gpu_if_bnb_not_multi_backend_enabled
@slow
@apply_skip_if_not_implemented
class Bnb4BitTestBasicConfigTest(unittest.TestCase):
    """Sanity checks on mutually-exclusive 4-bit/8-bit configuration flags."""

    def test_load_in_4_and_8_bit_fails(self):
        # Requesting both precisions at load time must raise.
        with self.assertRaisesRegex(ValueError, "load_in_4bit and load_in_8bit are both True"):
            AutoModelForCausalLM.from_pretrained("facebook/opt-125m", load_in_4bit=True, load_in_8bit=True)

    def test_set_load_in_8_bit(self):
        # Flipping the flag after construction must be rejected too.
        quantization_config = BitsAndBytesConfig(load_in_4bit=True)
        with self.assertRaisesRegex(ValueError, "load_in_4bit and load_in_8bit are both True"):
            quantization_config.load_in_8bit = True
|
||||
|
||||
|
||||
@require_bitsandbytes
@require_accelerate
@require_torch_gpu_if_bnb_not_multi_backend_enabled
@slow
@apply_skip_if_not_implemented
class Bnb4bitCompile(unittest.TestCase):
    """Check interaction of 4-bit models with static-cache (compiled) generation."""

    model_name = "hf-internal-testing/tiny-random-LlamaForCausalLM"
    input_text = "Hello my name is"

    def setUp(self):
        # Models and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model_4bit = AutoModelForCausalLM.from_pretrained(self.model_name, load_in_4bit=True)

    @pytest.mark.torch_compile_test
    def test_generate_compile(self):
        encoded_input = self.tokenizer(self.input_text, return_tensors="pt")

        # if nothing is set, compile will be disabled for bnb
        self.model_4bit.generate(
            input_ids=encoded_input["input_ids"].to(self.model_4bit.device),
            max_new_tokens=10,
            cache_implementation="static",
        )
        # Forcing `is_compileable` on a bnb quantizer should make compiled
        # generation fail; the broad Exception is deliberate here.
        with self.assertRaises(Exception):
            # overwrite property
            object.__setattr__(self.model_4bit.hf_quantizer, "is_compileable", True)
            self.model_4bit.generate(
                input_ids=encoded_input["input_ids"].to(self.model_4bit.device),
                max_new_tokens=10,
                cache_implementation="static",
            )
|
||||
1015
transformers/tests/quantization/bnb/test_mixed_int8.py
Normal file
1015
transformers/tests/quantization/bnb/test_mixed_int8.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,232 @@
|
||||
import gc
|
||||
import unittest
|
||||
import warnings
|
||||
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
from transformers.testing_utils import backend_empty_cache, require_compressed_tensors, require_torch, torch_device
|
||||
from transformers.utils import is_torch_available
|
||||
from transformers.utils.quantization_config import CompressedTensorsConfig
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
|
||||
@require_compressed_tensors
|
||||
@require_torch
|
||||
class StackCompressedModelTest(unittest.TestCase):
|
||||
# Define stubs as class attributes
|
||||
compressed_uncompressed_model_stubs = [
|
||||
(
|
||||
"nm-testing/llama2.c-stories42M-gsm8k-quantized-only-compressed",
|
||||
"nm-testing/llama2.c-stories42M-gsm8k-quantized-only-uncompressed",
|
||||
),
|
||||
(
|
||||
"nm-testing/llama2.c-stories42M-gsm8k-sparse-only-compressed",
|
||||
"nm-testing/llama2.c-stories42M-gsm8k-sparse-only-uncompressed",
|
||||
),
|
||||
(
|
||||
"nm-testing/llama2.c-stories42M-gsm8k-stacked-compressed",
|
||||
"nm-testing/llama2.c-stories42M-gsm8k-stacked-uncompressed",
|
||||
),
|
||||
]
|
||||
# Flatten the list for tests that require a single list of stubs.
|
||||
model_stubs = [stub for pair in compressed_uncompressed_model_stubs for stub in pair]
|
||||
|
||||
# For the outputs matching test, use the sparse-only pair.
|
||||
sparse_compressed_model = "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-compressed"
|
||||
sparse_uncompressed_model = "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-uncompressed"
|
||||
|
||||
prompt = "Paris is the capital of which country?"
|
||||
|
||||
def tearDown(self):
    """Free accelerator memory and Python objects between tests."""
    gc.collect()
    backend_empty_cache(torch_device)
    # Collect again so objects released by the cache flush are reclaimed too.
    gc.collect()
|
||||
|
||||
def test_compressed_uncompressed_model_shapes(self):
    """
    Verify that the weights of an uncompressed model and its decompressed compressed counterpart match.
    Note: Weights for sparsely compressed models may differ due to packing.
    """

    def _has_nested_attr(obj, attr_path):
        # Resolve a dotted attribute path; return None if any hop is missing.
        attrs = attr_path.split(".")
        for attr in attrs:
            if not hasattr(obj, attr):
                return None
            obj = getattr(obj, attr)
        return obj

    from compressed_tensors.quantization.utils import iter_named_leaf_modules

    for compressed_model, uncompressed_model in self.compressed_uncompressed_model_stubs:
        with self.subTest(compressed_model=compressed_model, uncompressed_model=uncompressed_model):
            # run_compressed=False forces decompression on load for both variants.
            uncompressed = AutoModelForCausalLM.from_pretrained(
                uncompressed_model,
                device_map="auto",
                dtype="auto",
                quantization_config=CompressedTensorsConfig(run_compressed=False),
            )
            compressed_decompressed = AutoModelForCausalLM.from_pretrained(
                compressed_model,
                device_map="auto",
                dtype="auto",
                quantization_config=CompressedTensorsConfig(run_compressed=False),
            )

            for name, submodule in iter_named_leaf_modules(uncompressed):
                comp_decomp_obj = _has_nested_attr(compressed_decompressed, name)
                if comp_decomp_obj is not None and hasattr(submodule, "weight"):
                    if "sparse-only" in uncompressed_model:
                        # Sparse-only compression is lossless: expect exact equality.
                        self.assertTrue(
                            torch.equal(submodule.weight, comp_decomp_obj.weight),
                            f"Weight mismatch for module '{name}' in sparse-only model.",
                        )
                    else:
                        # Quantized paths are lossy: allow a coarse tolerance.
                        self.assertTrue(
                            torch.allclose(submodule.weight, comp_decomp_obj.weight, atol=0.2),
                            f"Weight mismatch for module '{name}' in quantized-only or stacked model.",
                        )
|
||||
|
||||
    def test_outputs_match(self):
        """
        Ensure that the generated outputs match between the uncompressed model
        and its decompressed compressed counterpart.
        """
        tokenizer = AutoTokenizer.from_pretrained(self.sparse_uncompressed_model)
        input_ids = tokenizer(self.prompt, return_tensors="pt").input_ids

        uncompressed = AutoModelForCausalLM.from_pretrained(
            self.sparse_uncompressed_model,
            device_map="auto",
            dtype="auto",
            quantization_config=CompressedTensorsConfig(run_compressed=False),
        )

        output_uncompressed = uncompressed.generate(input_ids.to(uncompressed.device), max_new_tokens=100)

        # Same checkpoint in its compressed form, decompressed at load time.
        decompressed = AutoModelForCausalLM.from_pretrained(
            self.sparse_compressed_model,
            device_map="auto",
            dtype="auto",
            quantization_config=CompressedTensorsConfig(run_compressed=False),
        )
        output_decompressed = decompressed.generate(input_ids.to(decompressed.device), max_new_tokens=100)

        # Greedy decode of identical weights should produce identical text.
        self.assertEqual(
            tokenizer.decode(output_uncompressed[0]),
            tokenizer.decode(output_decompressed[0]),
            "Generated outputs do not match between compressed and uncompressed models.",
        )
    def test_no_warnings_for_all_models(self):
        """
        Confirm that loading any model using compressed tensors does not trigger
        warnings about missing or unexpected keys.
        """
        # NOTE(review): `warnings` is assumed to be imported at file top — confirm.
        for model_stub in self.model_stubs:
            with self.subTest(model_stub=model_stub):
                with warnings.catch_warnings(record=True) as caught_warnings:
                    # Record everything, including warnings that would normally be filtered.
                    warnings.simplefilter("always")
                    AutoModelForCausalLM.from_pretrained(
                        model_stub,
                        device_map="auto",
                        dtype="auto",
                        quantization_config=CompressedTensorsConfig(run_compressed=False),
                    )
                    for warning in caught_warnings:
                        self.assertNotIn(
                            "missing keys",
                            str(warning.message).lower(),
                            f"'missing keys' found in warnings for model {model_stub}",
                        )
                        self.assertNotIn(
                            "unexpected keys",
                            str(warning.message).lower(),
                            f"'unexpected keys' found in warnings for model {model_stub}",
                        )
@require_compressed_tensors
@require_torch
class RunCompressedTest(unittest.TestCase):
    """Tests for the `run_compressed` flag of CompressedTensorsConfig on tiny llama checkpoints."""

    tinyllama_w4a16 = "nm-testing/tinyllama-w4a16-compressed-hf-quantizer"
    tinyllama_w8a8 = "nm-testing/tinyllama-w8a8-compressed-hf-quantizer"

    prompt = "Paris is the capital of which country?"

    stubs = [tinyllama_w4a16, tinyllama_w8a8]

    def tearDown(self):
        # Release model memory between tests.
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_default_run_compressed__True(self):
        """Default load keeps weights compressed: at least one CompressedLinear must be present."""
        from compressed_tensors.linear.compressed_linear import CompressedLinear
        from compressed_tensors.quantization.utils import iter_named_leaf_modules

        for stub in self.stubs:
            model = AutoModelForCausalLM.from_pretrained(
                stub,
            )
            compressed_linear_counts = 0

            for _, submodule in iter_named_leaf_modules(
                model,
            ):
                if isinstance(submodule, CompressedLinear):
                    compressed_linear_counts += 1

            # some linear models are not compressed - ex. lm_head
            assert compressed_linear_counts > 0

    def test_default_run_compressed__False(self):
        """With run_compressed=False the model is decompressed: no CompressedLinear modules remain."""
        from compressed_tensors.linear.compressed_linear import CompressedLinear
        from compressed_tensors.quantization.utils import iter_named_leaf_modules

        from transformers.utils.quantization_config import CompressedTensorsConfig

        quantization_config = CompressedTensorsConfig(run_compressed=False)

        for stub in self.stubs:
            model = AutoModelForCausalLM.from_pretrained(
                stub,
                quantization_config=quantization_config,
            )
            compressed_linear_counts = 0

            for _, submodule in iter_named_leaf_modules(
                model,
            ):
                if isinstance(submodule, CompressedLinear):
                    compressed_linear_counts += 1

            # No modules should be CompressedLinear
            assert compressed_linear_counts == 0

    def test_run_compressed_outputs_match(self):
        """Check that run_compressed=True/False output are the same"""

        from transformers import AutoTokenizer
        from transformers.utils.quantization_config import CompressedTensorsConfig

        quantization_config = CompressedTensorsConfig(run_compressed=False)

        for stub in self.stubs:
            tokenizer = AutoTokenizer.from_pretrained(stub)
            input_ids = tokenizer(self.prompt, return_tensors="pt").input_ids

            # Default path: forward pass runs on compressed weights.
            model_run_compressed__True = AutoModelForCausalLM.from_pretrained(
                stub,
            )
            output_rc_true = model_run_compressed__True.generate(input_ids, max_new_tokens=100)

            # Decompressed path must generate identical tokens.
            model_run_compressed__False = AutoModelForCausalLM.from_pretrained(
                stub,
                quantization_config=quantization_config,
            )
            output_rc_false = model_run_compressed__False.generate(input_ids, max_new_tokens=100)

            assert tokenizer.decode(output_rc_true[0]) == tokenizer.decode(output_rc_false[0])
@@ -0,0 +1,87 @@
|
||||
import gc
|
||||
import unittest
|
||||
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, CompressedTensorsConfig
|
||||
from transformers.testing_utils import backend_empty_cache, require_compressed_tensors, require_torch, torch_device
|
||||
from transformers.utils import is_torch_available
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
|
||||
@require_compressed_tensors
@require_torch
class CompressedTensorsTest(unittest.TestCase):
    """End-to-end generation tests for compressed-tensors quantized checkpoints."""

    tinyllama_w8a16 = "nm-testing/tinyllama-w8a16-dense-hf-quantizer"
    tinyllama_w4a16 = "nm-testing/tinyllama-w4a16-compressed-hf-quantizer"
    tinyllama_w8a8 = "nm-testing/tinyllama-w8a8-compressed-hf-quantizer"
    llama3_8b_fp8 = "nm-testing/Meta-Llama-3-8B-Instruct-fp8-hf_compat"

    prompt = "Paris is the capital of which country?"

    def tearDown(self):
        # Release model memory between tests.
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_config_args(self):
        with self.assertRaises(ValueError):
            # passing quant scheme directly is not allowed
            CompressedTensorsConfig(config_groups={"weights": {"num_bits": 8}})
        # A well-formed config must construct without raising.
        CompressedTensorsConfig(
            config_groups={"FP8": ["Linear"]},
            ignore=["lm_head"],
            quantization_status="frozen",
            sparsity_config={"format": "dense"},
        )

    def test_config_to_from_dict(self):
        """Round-trip a config through to_dict/from_dict and check sub-config types."""
        config = CompressedTensorsConfig(config_groups={"FP8": ["Linear"]}, sparsity_config={"format": "dense"})
        config_dict = config.to_dict()
        config_from_dict = CompressedTensorsConfig.from_dict(config_dict)

        from compressed_tensors import QuantizationConfig, SparsityCompressionConfig

        self.assertIsInstance(config_from_dict.quantization_config, QuantizationConfig)
        self.assertIsInstance(config_from_dict.sparsity_config, SparsityCompressionConfig)

    def test_tinyllama_w8a8(self):
        expected_out = "<s> Paris is the capital of which country?\n\n 1. Paris is the capital of which country?\n\n 1. Paris is the capital of which country?\n\n 1. Paris is the capital of which country?\n\n"
        self._test_quantized_model(self.tinyllama_w8a8, expected_out)

    def test_tinyllama_w4a16(self):
        expected_out = "<s> Paris is the capital of which country?\nAnswer: Paris is the capital of France.\nQuestion: Which country is the capital of which city?\nAnswer: The capital of the city of New York is New York.\nQuestion: Which"
        self._test_quantized_model(self.tinyllama_w4a16, expected_out)

    def test_tinyllama_w8a16(self):
        expected_out = "<s> Paris is the capital of which country?\nA. France\nB. Germany\nC. Spain\nD. Italy\nE. Switzerland\nQ10. Which of the following is not a country in the European Union?\nA."
        self._test_quantized_model(self.tinyllama_w8a16, expected_out)

    def test_llama_8b_fp8(self):
        expected_out = "<|begin_of_text|>Paris is the capital of which country? France\nWhat is the name of the famous museum in Paris that is home to the Mona Lisa? The Louvre\nWhat is the name of the famous bridge in Paris that is often associated with the city"
        self._test_quantized_model(self.llama3_8b_fp8, expected_out)

    def _test_quantized_model(self, model_name: str, expected_output: str):
        """Carry out generation"""
        quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        device = quantized_model.device
        self.assertIsNotNone(
            quantized_model.config.quantization_config,
            "quantization_config should not be None",
        )
        # A scale tensor that is all ones would indicate the scales were never loaded.
        self.assertTrue(
            any(
                key
                for key, tensor in quantized_model.state_dict().items()
                if "scale" in key and not torch.all(tensor == 1.0)
            ),
            "quantized model should load a non-trivial scale into the state dict",
        )
        inputs = tokenizer(self.prompt, return_tensors="pt").to(device)
        # Greedy decoding keeps the expected output deterministic.
        generated_ids = quantized_model.generate(**inputs, max_length=50, do_sample=False)
        outputs = tokenizer.batch_decode(generated_ids)

        self.assertIsNotNone(outputs)
        self.assertEqual(outputs[0], expected_output)
171
transformers/tests/quantization/eetq_integration/test_eetq.py
Normal file
171
transformers/tests/quantization/eetq_integration/test_eetq.py
Normal file
@@ -0,0 +1,171 @@
|
||||
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import gc
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, EetqConfig, OPTForCausalLM
|
||||
from transformers.testing_utils import (
|
||||
backend_empty_cache,
|
||||
require_accelerate,
|
||||
require_eetq,
|
||||
require_torch_gpu,
|
||||
require_torch_multi_gpu,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
from transformers.utils import is_accelerate_available, is_torch_available
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_accelerate_available():
|
||||
from accelerate import init_empty_weights
|
||||
|
||||
|
||||
@require_torch_gpu
class EetqConfigTest(unittest.TestCase):
    """Unit tests for EetqConfig serialization (to_dict / from_dict round-trips)."""

    def test_to_dict(self):
        """
        Simple test that checks if one uses a config and converts it to a dict, the dict is the same as the config object
        """
        quantization_config = EetqConfig()
        config_to_dict = quantization_config.to_dict()

        # Every serialized key must mirror the corresponding config attribute.
        for key in config_to_dict:
            self.assertEqual(getattr(quantization_config, key), config_to_dict[key])

    def test_from_dict(self):
        """
        Simple test that checks if one uses a dict and converts it to a config object, the config object is the same as the dict
        """
        # Renamed from `dict` to avoid shadowing the builtin.
        raw_config = {"modules_to_not_convert": ["lm_head.weight"], "quant_method": "eetq", "weights": "int8"}
        quantization_config = EetqConfig.from_dict(raw_config)

        self.assertEqual(raw_config["modules_to_not_convert"], quantization_config.modules_to_not_convert)
        self.assertEqual(raw_config["quant_method"], quantization_config.quant_method)
        self.assertEqual(raw_config["weights"], quantization_config.weights)
@slow
@require_torch_gpu
@require_eetq
@require_accelerate
class EetqTest(unittest.TestCase):
    """Slow GPU tests exercising EETQ int8 quantization on facebook/opt-350m."""

    model_name = "facebook/opt-350m"

    input_text = "What are we having for dinner?"
    max_new_tokens = 9

    EXPECTED_OUTPUT = "What are we having for dinner?\nI'm having a steak and a salad"

    device_map = "cuda"

    # called only once for all test in this class
    @classmethod
    def setUpClass(cls):
        """
        Setup quantized model
        """
        quantization_config = EetqConfig(weights="int8")
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.quantized_model = AutoModelForCausalLM.from_pretrained(
            cls.model_name, device_map=cls.device_map, quantization_config=quantization_config
        )

    def tearDown(self):
        # Release model memory between tests.
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_quantized_model_conversion(self):
        """
        Simple test that checks if the quantized model has been converted properly
        """
        from eetq import EetqLinear

        from transformers.integrations import replace_with_eetq_linear

        model_id = "facebook/opt-350m"
        # Pin the revision so the layer count used below stays stable.
        config = AutoConfig.from_pretrained(model_id, revision="cb32f77e905cccbca1d970436fb0f5e6b58ee3c5")
        quantization_config = EetqConfig(weights="int8")

        # Meta-device instantiation: no weights are allocated for this structural check.
        with init_empty_weights():
            model = OPTForCausalLM(config)

        nb_linears = 0
        for module in model.modules():
            if isinstance(module, torch.nn.Linear):
                nb_linears += 1

        model = replace_with_eetq_linear(model, quantization_config=quantization_config)
        nb_eetq_linear = 0
        for module in model.modules():
            if isinstance(module, EetqLinear):
                nb_eetq_linear += 1

        # All linears except lm_head should have been converted.
        self.assertEqual(nb_linears - 1, nb_eetq_linear)

        # Try with `modules_to_not_convert`
        with init_empty_weights():
            model = OPTForCausalLM(config)
        quantization_config = EetqConfig(modules_to_not_convert=["fc1"])
        model = replace_with_eetq_linear(model, quantization_config=quantization_config)
        nb_eetq_linear = 0
        for module in model.modules():
            if isinstance(module, EetqLinear):
                nb_eetq_linear += 1
        # 25 corresponds to the lm_head along with 24 fc1 layers.
        self.assertEqual(nb_linears - 25, nb_eetq_linear)

    def test_quantized_model(self):
        """
        Simple test that checks if the quantized model is working properly
        """
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

        output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    def test_save_pretrained(self):
        """
        Simple test that checks if the quantized model is working properly after being saved and loaded
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)

            model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map)

            input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

            output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
            self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    @require_torch_multi_gpu
    def test_quantized_model_multi_gpu(self):
        """
        Simple test that checks if the quantized model is working properly with multiple GPUs
        set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 GPUs
        """
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
        quantization_config = EetqConfig()
        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name, device_map="auto", quantization_config=quantization_config
        )
        # device_map="auto" should have sharded the model across both GPUs.
        self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})

        output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
301
transformers/tests/quantization/fbgemm_fp8/test_fbgemm_fp8.py
Normal file
301
transformers/tests/quantization/fbgemm_fp8/test_fbgemm_fp8.py
Normal file
@@ -0,0 +1,301 @@
|
||||
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import gc
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, FbgemmFp8Config, OPTForCausalLM
|
||||
from transformers.testing_utils import (
|
||||
backend_empty_cache,
|
||||
require_accelerate,
|
||||
require_fbgemm_gpu,
|
||||
require_read_token,
|
||||
require_torch_gpu,
|
||||
require_torch_multi_gpu,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
from transformers.utils import is_accelerate_available, is_torch_available
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_accelerate_available():
|
||||
from accelerate import init_empty_weights
|
||||
|
||||
|
||||
@require_torch_gpu
class FbgemmFp8ConfigTest(unittest.TestCase):
    """Unit tests for FbgemmFp8Config serialization (to_dict / from_dict round-trips)."""

    def test_to_dict(self):
        """
        Simple test that checks if one uses a config and converts it to a dict, the dict is the same as the config object
        """
        quantization_config = FbgemmFp8Config()
        config_to_dict = quantization_config.to_dict()

        # Every serialized key must mirror the corresponding config attribute.
        for key in config_to_dict:
            self.assertEqual(getattr(quantization_config, key), config_to_dict[key])

    def test_from_dict(self):
        """
        Simple test that checks if one uses a dict and converts it to a config object, the config object is the same as the dict
        """
        # Renamed from `dict` to avoid shadowing the builtin.
        raw_config = {"modules_to_not_convert": ["lm_head.weight"], "quant_method": "fbgemm_fp8"}
        quantization_config = FbgemmFp8Config.from_dict(raw_config)

        self.assertEqual(raw_config["modules_to_not_convert"], quantization_config.modules_to_not_convert)
        self.assertEqual(raw_config["quant_method"], quantization_config.quant_method)
@slow
@require_torch_gpu
@require_fbgemm_gpu
@require_accelerate
@require_read_token
class FbgemmFp8Test(unittest.TestCase):
    """Slow GPU tests exercising FBGEMM FP8 quantization on Meta-Llama-3-8B."""

    model_name = "meta-llama/Meta-Llama-3-8B"

    input_text = "What are we having for dinner?"
    max_new_tokens = 9

    EXPECTED_OUTPUT = "What are we having for dinner?\nI'm having a steak and a salad"

    device_map = "cuda"

    # Device map that spreads layers across GPU 0, CPU, and disk — used to test
    # that FP8 loading rejects (or survives, depending on the test) offloading.
    offload_device_map = {
        "model.embed_tokens": 0,
        "model.layers.0": 0,
        "model.layers.1": 0,
        "model.layers.2": 0,
        "model.layers.3": 0,
        "model.layers.4": 0,
        "model.layers.5": 0,
        "model.layers.6": 0,
        "model.layers.7": 0,
        "model.layers.8": 0,
        "model.layers.9": 0,
        "model.layers.10": 0,
        "model.layers.11": 0,
        "model.layers.12": 0,
        "model.layers.13": 0,
        "model.layers.14": 0,
        "model.layers.15": 0,
        "model.layers.16": "cpu",
        "model.layers.17": "cpu",
        "model.layers.18": "cpu",
        "model.layers.19": "cpu",
        "model.layers.20": "disk",
        "model.layers.21": "disk",
        "model.layers.22": "disk",
        "model.layers.23": "disk",
        "model.layers.24": "disk",
        "model.layers.25": "disk",
        "model.layers.26": "disk",
        "model.layers.27": "disk",
        "model.layers.28": "disk",
        "model.layers.29": "disk",
        "model.layers.30": "disk",
        "model.layers.31": "disk",
        "model.norm": "disk",
        "lm_head": "disk",
    }

    # called only once for all test in this class
    @classmethod
    def setUpClass(cls):
        """
        Setup quantized model
        """
        quantization_config = FbgemmFp8Config()
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.quantized_model = AutoModelForCausalLM.from_pretrained(
            cls.model_name, device_map=cls.device_map, quantization_config=quantization_config
        )

    def tearDown(self):
        # Release model memory between tests.
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_quantized_model_conversion(self):
        """
        Simple test that checks if the quantized model has been converted properly
        """

        from transformers.integrations import FbgemmFp8Linear, replace_with_fbgemm_fp8_linear

        model_id = "facebook/opt-350m"
        # Pin the revision so the layer count used below stays stable.
        config = AutoConfig.from_pretrained(model_id, revision="cb32f77e905cccbca1d970436fb0f5e6b58ee3c5")
        quantization_config = FbgemmFp8Config()

        # Meta-device instantiation: no weights allocated for this structural check.
        with init_empty_weights():
            model = OPTForCausalLM(config)

        nb_linears = 0
        for module in model.modules():
            if isinstance(module, torch.nn.Linear):
                nb_linears += 1

        model = replace_with_fbgemm_fp8_linear(model, quantization_config=quantization_config)
        nb_fbgemm_linear = 0
        for module in model.modules():
            if isinstance(module, FbgemmFp8Linear):
                nb_fbgemm_linear += 1

        # All linears except lm_head should have been converted.
        self.assertEqual(nb_linears - 1, nb_fbgemm_linear)

        with init_empty_weights():
            model = OPTForCausalLM(config)
        quantization_config = FbgemmFp8Config(modules_to_not_convert=["fc1"])
        model = replace_with_fbgemm_fp8_linear(model, quantization_config=quantization_config)
        nb_fbgemm_linear = 0
        for module in model.modules():
            if isinstance(module, FbgemmFp8Linear):
                nb_fbgemm_linear += 1

        # 25 = lm_head plus 24 excluded fc1 layers.
        self.assertEqual(nb_linears - 25, nb_fbgemm_linear)

    def test_quantized_model(self):
        """
        Simple test that checks if the quantized model is working properly
        """
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

        output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    def test_save_pretrained(self):
        """
        Simple test that checks if the quantized model is working properly after being saved and loaded
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)

            model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map)

            input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

            output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
            self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    def test_change_loading_attributes(self):
        """
        Simple test that checks if the quantized model is working properly after being saved and loaded
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)

            # Override the activation scale upper bound at reload time.
            quantization_config = FbgemmFp8Config(activation_scale_ub=1000.0)

            model = AutoModelForCausalLM.from_pretrained(
                tmpdirname, device_map=self.device_map, quantization_config=quantization_config
            )

            self.assertEqual(model.model.layers[1].mlp.down_proj.input_scale_ub.item(), 1000.0)

            input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

            output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
            self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    @require_torch_multi_gpu
    def test_quantized_model_multi_gpu(self):
        """
        Simple test that checks if the quantized model is working properly with multiple GPUs
        set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 GPUs
        """
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
        quantization_config = FbgemmFp8Config()
        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name, device_map="auto", quantization_config=quantization_config
        )
        # device_map="auto" should have sharded the model across both GPUs.
        self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})

        output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    def test_quantized_model_offload(self):
        """
        Simple test that checks if the quantized model returns an error when loading with cpu/disk offloaded
        """
        quantization_config = FbgemmFp8Config()

        with self.assertRaisesRegex(
            ValueError, "You are attempting to load an FP8 model with a device_map that contains a CPU or disk device."
        ):
            AutoModelForCausalLM.from_pretrained(
                self.model_name, device_map=self.offload_device_map, quantization_config=quantization_config
            )

    def test_save_pretrained_offload(self):
        """
        Simple test that checks if the saved quantized model is working properly cpu/disk offload
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)

            input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

            quantized_model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.offload_device_map)
            output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
            self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    @require_torch_multi_gpu
    def test_save_pretrained_multi_gpu(self):
        """
        Simple test that checks if the quantized model is working properly after being saved and loaded
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)

            model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map="auto")
            self.assertTrue(set(model.hf_device_map.values()) == {0, 1})

            input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

            output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
            self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
@require_torch_gpu
@require_accelerate
@require_fbgemm_gpu
class FbgemmFp8LinearTest(unittest.TestCase):
    """Shape-preservation tests for the FbgemmFp8Linear layer."""

    def test_linear_preserves_shape(self):
        """
        Test that FbgemmFp8Linear preserves shape when in_features == out_features.
        """
        from transformers.integrations import FbgemmFp8Linear

        # include_buffers=True so scale buffers are also created on the meta device.
        with init_empty_weights(include_buffers=True):
            linear = FbgemmFp8Linear(1024, 1024, True)
            x = torch.rand((17, 23, 1024))

            x_ = linear(x)
            self.assertEqual(x_.shape, x.shape)

    def test_linear_with_diff_feature_size_preserves_shape(self):
        """
        Test that FbgemmFp8Linear generates the correct shape when in_features != out_features.
        """
        from transformers.integrations import FbgemmFp8Linear

        with init_empty_weights(include_buffers=True):
            linear = FbgemmFp8Linear(1024, 2048, True)
            x = torch.rand((17, 23, 1024))

            x_ = linear(x)
            self.assertEqual(x_.shape, (17, 23, 2048))
287
transformers/tests/quantization/finegrained_fp8/test_fp8.py
Normal file
287
transformers/tests/quantization/finegrained_fp8/test_fp8.py
Normal file
@@ -0,0 +1,287 @@
|
||||
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import gc
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, FineGrainedFP8Config, OPTForCausalLM
|
||||
from transformers.testing_utils import (
|
||||
backend_empty_cache,
|
||||
get_device_properties,
|
||||
require_accelerate,
|
||||
require_read_token,
|
||||
require_torch_accelerator,
|
||||
require_torch_multi_accelerator,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
from transformers.utils import is_accelerate_available, is_torch_available
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_accelerate_available():
|
||||
from accelerate import init_empty_weights
|
||||
|
||||
|
||||
@require_torch_accelerator
class FineGrainedFP8ConfigTest(unittest.TestCase):
    """Unit tests for FineGrainedFP8Config serialization (to_dict / from_dict round-trips)."""

    def test_to_dict(self):
        """
        Simple test that checks if one uses a config and converts it to a dict, the dict is the same as the config object
        """
        quantization_config = FineGrainedFP8Config()
        config_to_dict = quantization_config.to_dict()

        # Every serialized key must mirror the corresponding config attribute.
        for key in config_to_dict:
            self.assertEqual(getattr(quantization_config, key), config_to_dict[key])

    def test_from_dict(self):
        """
        Simple test that checks if one uses a dict and converts it to a config object, the config object is the same as the dict
        """
        # Renamed from `dict` to avoid shadowing the builtin.
        raw_config = {"modules_to_not_convert": ["lm_head.weight"], "quant_method": "fp8"}
        quantization_config = FineGrainedFP8Config.from_dict(raw_config)

        self.assertEqual(raw_config["modules_to_not_convert"], quantization_config.modules_to_not_convert)
        self.assertEqual(raw_config["quant_method"], quantization_config.quant_method)
@slow
@require_accelerate
@require_read_token
@require_torch_accelerator
class FP8QuantizerTest(unittest.TestCase):
    """End-to-end tests for loading, running, saving and offloading FP8-quantized models."""

    # Gated checkpoint — @require_read_token above gates the whole class.
    model_name = "meta-llama/Llama-3.2-1B"
    input_text = "Once upon a time"
    max_new_tokens = 10
    # Greedy decoding (do_sample=False) makes the continuation deterministic.
    EXPECTED_OUTPUT = "Once upon a time, there was a man who was very rich."
    device_map = torch_device
    # Device map that places some modules on cpu/disk. Fresh FP8 quantization is
    # expected to REJECT this map (test_quantized_model_offload), while reloading
    # an already-quantized checkpoint with it is expected to work
    # (test_save_pretrained_offload).
    offload_device_map = {
        "model.embed_tokens": 0,
        "model.layers.0": 0,
        "model.layers.1": 0,
        "model.layers.2": 0,
        "model.layers.3": 0,
        "model.layers.4": 0,
        "model.layers.5": 0,
        "model.layers.6": 0,
        "model.layers.7": "cpu",
        "model.layers.8": "cpu",
        "model.layers.9": "cpu",
        "model.layers.10": "cpu",
        "model.layers.11": "cpu",
        "model.layers.12": "cpu",
        "model.layers.13": "cpu",
        "model.layers.14": "cpu",
        "model.layers.15": "cpu",
        "model.rotary_emb": "disk",
        "model.norm": "disk",
        "lm_head": 0,
    }

    @classmethod
    def setUpClass(cls):
        """
        Setup quantized model
        """
        # Quantize once for the whole class; individual tests reuse cls.quantized_model.
        cls.quantization_config = FineGrainedFP8Config()
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.quantized_model = AutoModelForCausalLM.from_pretrained(
            cls.model_name, device_map=cls.device_map, quantization_config=cls.quantization_config
        )

    def tearDown(self):
        # Free accelerator memory between tests so later loads don't OOM.
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_quantized_model_conversion(self):
        """
        Simple test that checks if the quantized model has been converted properly
        """

        from transformers.integrations import FP8Linear, replace_with_fp8_linear

        model_id = "facebook/opt-350m"
        config = AutoConfig.from_pretrained(model_id, revision="cb32f77e905cccbca1d970436fb0f5e6b58ee3c5")
        quantization_config = FineGrainedFP8Config()

        # Meta-device instantiation: no weights are materialized, only structure.
        with init_empty_weights():
            model = OPTForCausalLM(config)

        nb_linears = 0
        for module in model.modules():
            if isinstance(module, torch.nn.Linear):
                nb_linears += 1

        model = replace_with_fp8_linear(model, quantization_config=quantization_config)
        nb_fp8_linear = 0
        for module in model.modules():
            if isinstance(module, FP8Linear):
                nb_fp8_linear += 1

        # All Linear layers but one are converted (presumably the lm_head — TODO confirm).
        self.assertEqual(nb_linears - 1, nb_fp8_linear)

        with init_empty_weights():
            model = OPTForCausalLM(config)
        quantization_config = FineGrainedFP8Config(modules_to_not_convert=["fc1"])
        model = replace_with_fp8_linear(model, quantization_config=quantization_config)
        nb_fp8_linear = 0
        for module in model.modules():
            if isinstance(module, FP8Linear):
                nb_fp8_linear += 1

        # Excluding "fc1" leaves 25 Linear layers unconverted for this checkpoint.
        self.assertEqual(nb_linears - 25, nb_fp8_linear)

    def test_quantized_model(self):
        """
        Simple test that checks if the quantized model is working properly
        """
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(self.device_map)

        output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
        output_tokens = self.tokenizer.decode(output[0], skip_special_tokens=True)
        self.assertEqual(output_tokens, self.EXPECTED_OUTPUT)

    def test_save_pretrained(self):
        """
        Simple test that checks if the quantized model is working properly after being saved and loaded
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)

            model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map)

            input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(self.device_map)

            output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
            self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    def test_weight_and_weight_scale_inv(self):
        """
        Simple test that checks if the weight and weight_scale_inv are working properly
        """
        weight = self.quantized_model.model.layers[0].self_attn.q_proj.weight
        weight_scale_inv = self.quantized_model.model.layers[0].self_attn.q_proj.weight_scale_inv
        self.assertEqual(weight.dtype, torch.float8_e4m3fn)
        self.assertEqual(weight_scale_inv.dtype, torch.float32)
        # One fp32 scale per 128x128 block of fp8 weights (the default block size).
        self.assertEqual(weight.shape, (weight_scale_inv.shape[0] * 128, weight_scale_inv.shape[1] * 128))

    def test_block_size(self):
        """
        Simple test that checks if the block size is working properly
        """
        self.assertEqual(self.quantized_model.config.quantization_config.weight_block_size, (128, 128))
        # A non-default block size must survive the quantization round trip.
        quantization_config = FineGrainedFP8Config(weight_block_size=(32, 32))
        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name, device_map=self.device_map, quantization_config=quantization_config
        )
        self.assertEqual(quantized_model.config.quantization_config.weight_block_size, (32, 32))

    @require_torch_multi_accelerator
    def test_quantized_model_multi_accelerator(self):
        """
        Simple test that checks if the quantized model is working properly with multiple accelerators
        set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 GPUs; or set ZE_AFFINITY_MASK=0,1 if you
        have more than 2 XPUs.
        """
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(self.device_map)
        quantization_config = FineGrainedFP8Config()
        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name, device_map="auto", quantization_config=quantization_config
        )
        # device_map="auto" should have sharded the model across both accelerators.
        self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})

        output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    @require_torch_multi_accelerator
    def test_save_pretrained_multi_accelerators(self):
        """
        Simple test that checks if the quantized model is working properly after being saved and loaded
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)

            model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map="auto")
            self.assertTrue(set(model.hf_device_map.values()) == {0, 1})

            input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(self.device_map)

            output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
            self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    def test_quantized_model_offload(self):
        """
        Simple test that checks if the quantized model returns an error when loading with cpu/disk offloaded
        """
        with self.assertRaisesRegex(
            ValueError, "You are attempting to load an FP8 model with a device_map that contains a cpu/disk device."
        ):
            AutoModelForCausalLM.from_pretrained(
                self.model_name, device_map=self.offload_device_map, quantization_config=self.quantization_config
            )

    def test_save_pretrained_offload(self):
        """
        Simple test that checks if the saved quantized model is working properly cpu/disk offload
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)

            input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(self.device_map)

            # Unlike fresh quantization, reloading an already-quantized checkpoint
            # with a cpu/disk device_map is expected to succeed.
            quantized_model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.offload_device_map)
            output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
            self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
|
||||
|
||||
|
||||
@require_torch_accelerator
class FP8LinearTest(unittest.TestCase):
    """Shape-preservation checks for the standalone FP8Linear layer."""

    device = torch_device

    @unittest.skipIf(
        get_device_properties()[0] == "cuda" and get_device_properties()[1] < 9,
        "Skipping FP8LinearTest because it is not supported on GPU with capability < 9.0",
    )
    def test_linear_preserves_shape(self):
        """
        Test that FP8Linear preserves shape when in_features == out_features.
        """
        from transformers.integrations import FP8Linear

        layer = FP8Linear(256, 256, block_size=(128, 128), device=self.device)
        sample = torch.rand((1, 5, 256)).to(self.device)

        result = layer(sample)
        self.assertEqual(result.shape, sample.shape)

    @unittest.skipIf(
        get_device_properties()[0] == "cuda" and get_device_properties()[1] < 9,
        "Skipping FP8LinearTest because it is not supported on GPU with capability < 9.0",
    )
    def test_linear_with_diff_feature_size_preserves_shape(self):
        """
        Test that FP8Linear generates the correct shape when in_features != out_features.
        """
        from transformers.integrations import FP8Linear

        layer = FP8Linear(128, 256, block_size=(128, 128), device=self.device)
        sample = torch.rand((1, 5, 128)).to(self.device)

        result = layer(sample)
        self.assertEqual(result.shape, (1, 5, 256))
|
||||
@@ -0,0 +1,227 @@
|
||||
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import gc
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, FPQuantConfig
|
||||
from transformers.testing_utils import (
|
||||
backend_empty_cache,
|
||||
require_accelerate,
|
||||
require_fp_quant,
|
||||
require_qutlass,
|
||||
require_torch_gpu,
|
||||
require_torch_multi_gpu,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
|
||||
|
||||
@require_torch_gpu
class FPQuantConfigTest(unittest.TestCase):
    """Round-trip (de)serialization tests for `FPQuantConfig`."""

    def test_to_dict(self):
        """
        Simple test that checks if one uses a config and converts it to a dict, the dict is the same as the config object
        """
        quantization_config = FPQuantConfig()
        config_to_dict = quantization_config.to_dict()

        # Every serialized key must mirror the attribute on the config object.
        for key in config_to_dict:
            self.assertEqual(getattr(quantization_config, key), config_to_dict[key])

    def test_from_dict(self):
        """
        Simple test that checks if one uses a dict and converts it to a config object, the config object is the same as the dict
        """
        # Renamed the local from `dict` so it no longer shadows the builtin.
        config_dict = {"modules_to_not_convert": ["embed_tokens", "lm_head"], "quant_method": "fp_quant"}
        quantization_config = FPQuantConfig.from_dict(config_dict)

        self.assertEqual(config_dict["modules_to_not_convert"], quantization_config.modules_to_not_convert)
        self.assertEqual(config_dict["quant_method"], quantization_config.quant_method)
|
||||
|
||||
|
||||
@slow
@require_torch_gpu
@require_fp_quant
@require_qutlass
@require_accelerate
class FPQuantTest(unittest.TestCase):
    """Integration tests for real (kernel-backed) FPQuant quantization."""

    model_name = "unsloth/Llama-3.2-1B"

    # A trivially predictable prompt so the (default-sampling) continuation is stable.
    input_text = "1 2 3 4"
    max_new_tokens = 4

    EXPECTED_OUTPUT = "1 2 3 4 5 6"

    device_map = "cuda"

    # called only once for all test in this class
    @classmethod
    def setUpClass(cls):
        """
        Setup quantized model
        """
        # pseudoquantization=False -> real quantized kernels (hence @require_qutlass).
        quantization_config = FPQuantConfig(pseudoquantization=False)
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.quantized_model = AutoModelForCausalLM.from_pretrained(
            cls.model_name, device_map=cls.device_map, quantization_config=quantization_config
        )

    def tearDown(self):
        # Free accelerator memory between tests so later loads don't OOM.
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_quantized_model(self):
        """
        Simple test that checks if the quantized model is working properly
        """
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

        output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    def test_save_pretrained(self):
        """
        Simple test that checks if the quantized model is working properly after being saved and loaded
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)

            model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map)

            input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

            output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
            self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    @require_torch_multi_gpu
    def test_quantized_model_multi_gpu(self):
        """
        Simple test that checks if the quantized model is working properly with multiple GPUs
        set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 GPUs
        """
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
        quantization_config = FPQuantConfig()
        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name, device_map="auto", quantization_config=quantization_config
        )
        # device_map="auto" should have sharded the model across both GPUs.
        self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})

        output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    @require_torch_multi_gpu
    def test_save_pretrained_multi_gpu(self):
        """
        Simple test that checks if the quantized model is working properly after being saved and loaded
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)

            model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map="auto")
            self.assertTrue(set(model.hf_device_map.values()) == {0, 1})

            input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

            output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
            self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
|
||||
|
||||
|
||||
@slow
@require_torch_gpu
@require_fp_quant
@require_accelerate
class FPQuantPseudoquantTest(unittest.TestCase):
    """Same scenarios as FPQuantTest but with pseudoquantization (no qutlass kernels needed)."""

    model_name = "unsloth/Llama-3.2-1B"

    # A trivially predictable prompt so the (default-sampling) continuation is stable.
    input_text = "1 2 3 4"
    max_new_tokens = 4

    EXPECTED_OUTPUT = "1 2 3 4 5 6"

    device_map = "cuda"

    # called only once for all test in this class
    @classmethod
    def setUpClass(cls):
        """
        Setup quantized model
        """
        # pseudoquantization=True simulates quantization numerics without the
        # real kernels — note this class drops the @require_qutlass decorator.
        quantization_config = FPQuantConfig(pseudoquantization=True)
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.quantized_model = AutoModelForCausalLM.from_pretrained(
            cls.model_name, device_map=cls.device_map, quantization_config=quantization_config
        )

    def tearDown(self):
        # Free accelerator memory between tests so later loads don't OOM.
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_quantized_model(self):
        """
        Simple test that checks if the quantized model is working properly
        """
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

        output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    def test_save_pretrained(self):
        """
        Simple test that checks if the quantized model is working properly after being saved and loaded
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)

            model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map)

            input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

            output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
            self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    @require_torch_multi_gpu
    def test_quantized_model_multi_gpu(self):
        """
        Simple test that checks if the quantized model is working properly with multiple GPUs
        set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 GPUs
        """
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
        quantization_config = FPQuantConfig(pseudoquantization=True)
        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name, device_map="auto", quantization_config=quantization_config
        )
        # device_map="auto" should have sharded the model across both GPUs.
        self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})

        output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    @require_torch_multi_gpu
    def test_save_pretrained_multi_gpu(self):
        """
        Simple test that checks if the quantized model is working properly after being saved and loaded
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)

            model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map="auto")
            self.assertTrue(set(model.hf_device_map.values()) == {0, 1})

            input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

            output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
            self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
|
||||
0
transformers/tests/quantization/ggml/__init__.py
Normal file
0
transformers/tests/quantization/ggml/__init__.py
Normal file
1137
transformers/tests/quantization/ggml/test_ggml.py
Normal file
1137
transformers/tests/quantization/ggml/test_ggml.py
Normal file
File diff suppressed because it is too large
Load Diff
0
transformers/tests/quantization/gptq/__init__.py
Normal file
0
transformers/tests/quantization/gptq/__init__.py
Normal file
499
transformers/tests/quantization/gptq/test_gptq.py
Normal file
499
transformers/tests/quantization/gptq/test_gptq.py
Normal file
@@ -0,0 +1,499 @@
|
||||
# Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
import pytest
|
||||
|
||||
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, GPTQConfig
|
||||
from transformers.testing_utils import (
|
||||
is_torch_available,
|
||||
require_accelerate,
|
||||
require_gptq,
|
||||
require_optimum,
|
||||
require_torch_gpu,
|
||||
require_torch_multi_gpu,
|
||||
slow,
|
||||
)
|
||||
from transformers.utils import is_auto_gptq_available, is_gptqmodel_available, is_ipex_available
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
|
||||
class GPTQConfigTest(unittest.TestCase):
    """Validation tests for `GPTQConfig` argument checking and (de)serialization."""

    def test_bits(self):
        """Invalid bit-widths raise ValueError; supported ones construct cleanly."""
        # One assertRaises per invalid value: stacking several calls inside a single
        # assertRaises block leaves everything after the first raise unreachable.
        for invalid_bits in ("", 1):
            with self.assertRaises(ValueError):
                GPTQConfig(bits=invalid_bits)
        # Valid widths must not raise.
        GPTQConfig(bits=2)
        GPTQConfig(bits=4)

    def test_dataset(self):
        """Unknown calibration dataset names are rejected; known ones accepted."""
        with self.assertRaises(ValueError):
            GPTQConfig(bits=2, dataset="auto_gpt")
        GPTQConfig(bits=2, dataset="c4")

    def test_damp_percent(self):
        """`damp_percent` outside (0, 1) — or a non-number — raises ValueError."""
        # One assertRaises per invalid value so each case is actually exercised.
        for invalid_damp in (10, -1, "0"):
            with self.assertRaises(ValueError):
                GPTQConfig(bits=2, damp_percent=invalid_damp)
        # A value inside (0, 1) must not raise.
        GPTQConfig(bits=2, damp_percent=0.01)

    def test_to_dict(self):
        """Serialization to a plain dict must not raise."""
        quantization_config = GPTQConfig(bits=2)
        quantization_config.to_dict()

    def test_from_dict(self):
        """A config built from a dict reflects the dict's values."""
        # Renamed the local from `dict` so it no longer shadows the builtin.
        config_dict = {"bits": 2}
        quantization_config = GPTQConfig.from_dict(config_dict)
        self.assertEqual(config_dict["bits"], quantization_config.bits)

    @require_optimum
    def test_optimum_config(self):
        """GPTQConfig round-trips through optimum's GPTQQuantizer representation."""
        from optimum.gptq import GPTQQuantizer

        config = GPTQConfig(bits=2)
        optimum_config = GPTQQuantizer.from_dict(config.to_dict_optimum())
        self.assertEqual(optimum_config.bits, config.bits)
        new_config = GPTQConfig.from_dict_optimum(optimum_config.to_dict())
        self.assertEqual(optimum_config.bits, new_config.bits)
|
||||
|
||||
|
||||
@slow
@require_optimum
@require_gptq
class GPTQTest(unittest.TestCase):
    """Base GPTQ integration suite; subclasses override `device_map` to cover CPU/GPU/multi-GPU."""

    model_name = "bigscience/bloom-560m"

    input_text = "Hello my name is"

    EXPECTED_OUTPUTS = set()
    # flaky test: gptqmodel and auto-gptq are not output equivalent nor is string compare deterministic even between transformer/torch versions
    EXPECTED_OUTPUTS.add("Hello my name is John and I am a professional photographer. I")
    EXPECTED_OUTPUTS.add("Hello my name is John, I am a professional photographer and I")
    EXPECTED_OUTPUTS.add("Hello my name is John, I am a student in the University of")
    EXPECTED_OUTPUTS.add("Hello my name is John and I am a very good looking man.")
    EXPECTED_OUTPUTS.add("Hello my name is Alyson, I am a student in the")
    EXPECTED_OUTPUTS.add("Hello my name is Alyson and I am a very sweet,")
    EXPECTED_OUTPUTS.add("Hello my name is Aiden, I am a student at the University")
    EXPECTED_OUTPUTS.add("Hello my name is Nate and I am a member of the N")
    EXPECTED_OUTPUTS.add("Hello my name is Nellie and I am a student at the")
    EXPECTED_OUTPUTS.add("Hello my name is Nate and I am a new member of the")

    # this seems a little small considering that we are doing 4bit quant but we have a small model and ww don't quantize the embeddings
    EXPECTED_RELATIVE_DIFFERENCE = 1.664253062

    # Quantization hyperparameters shared by setUpClass and the layer-type checks.
    bits = 4
    sym = True
    group_size = 128
    desc_act = False
    use_exllama = False

    dataset = [
        "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
    ]

    # gptqmodel supports CPU execution; with auto-gptq the map is left unset (None).
    device_map = "cpu" if is_gptqmodel_available() else None

    # called only once for all test in this class
    @classmethod
    def setUpClass(cls):
        """
        Setup quantized model
        """
        # Keep an fp16 copy so test_memory_footprint can compare sizes.
        cls.model_fp16 = AutoModelForCausalLM.from_pretrained(
            cls.model_name, dtype=torch.float16, device_map=cls.device_map
        )
        cls.mem_fp16 = cls.model_fp16.get_memory_footprint()

        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name, use_fast=True)
        cls.config = AutoConfig.from_pretrained(cls.model_name)

        cls.quantization_config = GPTQConfig(
            bits=cls.bits,
            dataset=cls.dataset,
            tokenizer=cls.tokenizer,
            group_size=cls.group_size,
            desc_act=cls.desc_act,
            sym=cls.sym,
            use_exllama=cls.use_exllama,
        )

        # Quantizes on the fly using the calibration dataset above.
        cls.quantized_model = AutoModelForCausalLM.from_pretrained(
            cls.model_name,
            dtype=torch.float16,
            device_map=cls.device_map,
            quantization_config=cls.quantization_config,
        )

    def test_memory_footprint(self):
        r"""
        A simple test to check if the model conversion has been done correctly by checking on the
        memory footprint of the converted model
        """

        mem_quantized = self.quantized_model.get_memory_footprint()

        self.assertAlmostEqual(self.mem_fp16 / mem_quantized, self.EXPECTED_RELATIVE_DIFFERENCE, places=4)

    def test_device_and_dtype_assignment(self):
        r"""
        Test whether trying to cast (or assigning a device to) a model after quantization will throw an error.
        Checks also if other models are casted correctly.
        """
        # This should work
        if self.device_map in (None, "cpu"):
            _ = self.quantized_model.to(0)

        with self.assertRaises(ValueError):
            # Tries with a `dtype``
            self.quantized_model.to(torch.float16)

    def test_original_dtype(self):
        r"""
        A simple test to check if the model successfully stores the original dtype
        """
        self.assertTrue(hasattr(self.quantized_model.config, "_pre_quantization_dtype"))
        # The unquantized reference must NOT carry the marker attribute.
        self.assertFalse(hasattr(self.model_fp16.config, "_pre_quantization_dtype"))
        self.assertTrue(self.quantized_model.config._pre_quantization_dtype == torch.float16)

    def test_quantized_layers_class(self):
        """
        Simple test to check if the model conversion has been done correctly by checking on
        the class type of the linear layers of the converted models
        """
        # Resolve the expected QuantLinear class via whichever backend is installed,
        # mirroring the selection logic the quantizer itself uses.
        if is_gptqmodel_available():
            from gptqmodel.utils.importer import hf_select_quant_linear

            if hasattr(self.config, "quantization_config"):
                checkpoint_format = self.config.quantization_config.get("checkpoint_format")
                meta = self.config.quantization_config.get("meta")
            else:
                checkpoint_format = "gptq"
                meta = None
            QuantLinear = hf_select_quant_linear(
                bits=self.bits,
                group_size=self.group_size,
                desc_act=self.desc_act,
                sym=self.sym,
                device_map=self.device_map,
                checkpoint_format=checkpoint_format,
                meta=meta,
                backend=self.quantization_config.backend,
            )
        elif is_auto_gptq_available():
            from auto_gptq.utils.import_utils import dynamically_import_QuantLinear as hf_select_quant_linear

            QuantLinear = hf_select_quant_linear(
                use_triton=False,
                desc_act=self.desc_act,
                group_size=self.group_size,
                bits=self.bits,
                disable_exllama=not self.use_exllama,
                disable_exllamav2=True,
            )
        self.assertTrue(self.quantized_model.transformer.h[0].mlp.dense_4h_to_h.__class__ == QuantLinear)

    def check_inference_correctness(self, model):
        r"""
        Test the generation quality of the quantized model and see that we are matching the expected output.
        Given that we are operating on small numbers + the testing model is relatively small, we might not get
        the same output across GPUs. So we'll generate few tokens (5-10) and check their output.
        """
        # Check that inference pass works on the model
        encoded_input = self.tokenizer(self.input_text, return_tensors="pt")

        # Check the exactness of the results
        output_sequences = model.generate(input_ids=encoded_input["input_ids"].to(model.device), max_new_tokens=10)

        # Get the generation
        self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)

    def check_quantized_layers_type(self, model, value):
        # Helper: asserts which quantized kernel backend a converted layer reports.
        self.assertTrue(model.transformer.h[0].mlp.dense_4h_to_h.QUANT_TYPE == value)

    def test_generate_quality(self):
        """
        Simple test to check the quality of the model by comparing the generated tokens with the expected tokens
        """
        if self.device_map is None:
            self.check_inference_correctness(self.quantized_model.to(0))
        else:
            if self.device_map == "cpu" and self.quantized_model.device.type != "cpu":
                self.quantized_model.to("cpu")
            self.check_inference_correctness(self.quantized_model)

    def test_serialization(self):
        """
        Test the serialization of the model and the loading of the quantized weights works
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)
            # Expected kernel type after reload depends on the installed backend.
            if is_auto_gptq_available() and not is_gptqmodel_available():
                quant_type = "cuda-old" if not self.use_exllama else "exllama"
                if not self.use_exllama:
                    quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
                        tmpdirname, quantization_config=GPTQConfig(use_exllama=False, bits=4)
                    )
                    if self.device_map != "cpu":
                        quantized_model_from_saved = quantized_model_from_saved.to(0)
                else:
                    quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
                        tmpdirname, device_map=self.device_map
                    )
            else:
                if self.device_map == "cpu":
                    quant_type = "ipex" if is_ipex_available() else "torch"
                else:
                    # We expect tritonv2 to be used here, because exllama backend doesn't support packing https://github.com/ModelCloud/GPTQModel/issues/1354
                    # TODO: Remove this once GPTQModel exllama kernels supports packing
                    quant_type = "tritonv2"
                quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
                    tmpdirname, device_map=self.device_map
                )

            self.check_quantized_layers_type(quantized_model_from_saved, quant_type)
            self.check_inference_correctness(quantized_model_from_saved)

    @require_accelerate
    def test_serialization_big_model_inference(self):
        """
        Test the serialization of the model and the loading of the quantized weights with big model inference
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)
            device_map = self.device_map or "auto"
            quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=device_map)
            self.check_inference_correctness(quantized_model_from_saved)
|
||||
|
||||
|
||||
@require_torch_gpu
class GPTQTestCUDA(GPTQTest):
    """Runs the full GPTQTest suite with the model pinned to GPU 0."""

    device_map = {"": 0}

    def test_change_loading_attributes(self):
        """
        Test the serialization of the model and the loading of the quantized weights works with another config file
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)
            # Only meaningful on the auto-gptq backend when the model was saved
            # without exllama: reloading with use_exllama=True must switch kernels.
            if is_auto_gptq_available() and not is_gptqmodel_available() and not self.use_exllama:
                self.check_quantized_layers_type(self.quantized_model, "cuda-old")
                # we need to put it directly to the gpu. Otherwise, we won't be able to initialize the exllama kernel
                quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
                    tmpdirname, quantization_config=GPTQConfig(use_exllama=True, bits=4), device_map=self.device_map
                )
                self.assertEqual(quantized_model_from_saved.config.quantization_config.bits, self.bits)
                self.check_quantized_layers_type(quantized_model_from_saved, "exllama")
                self.check_inference_correctness(quantized_model_from_saved)
|
||||
|
||||
|
||||
@require_accelerate
@require_torch_multi_gpu
class GPTQTestDeviceMap(GPTQTestCUDA):
    """Run the CUDA GPTQ suite with an accelerate-computed device map."""

    device_map = "auto"
|
||||
|
||||
|
||||
@require_accelerate
@require_torch_multi_gpu
class GPTQTestDeviceMapExllama(GPTQTestCUDA):
    """Same as GPTQTestDeviceMap but with the exllama kernel enabled."""

    device_map = "auto"
    use_exllama = True
|
||||
|
||||
|
||||
@slow
@require_optimum
@require_gptq
@require_torch_gpu
@require_accelerate
class GPTQTestActOrderExllama(unittest.TestCase):
    """
    Test GPTQ model with exllama kernel and desc_act=True (also known as act-order).
    More information on those arguments here:
    https://huggingface.co/docs/transformers/main_classes/quantization#transformers.GPTQConfig
    """

    EXPECTED_OUTPUTS = set()
    # flaky test: gptqmodel and auto-gptq are not output equivalent nor is string compare deterministic even between transformer/torch versions
    EXPECTED_OUTPUTS.add("Hello, how are you ? I'm doing good, thanks for asking.")
    # 4bit + act_order + 128g
    model_name = "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ"
    input_text = "Hello, how are you ?"

    @classmethod
    def setUpClass(cls):
        """Load the act-order GPTQ checkpoint once for the whole class."""
        cls.quantization_config = GPTQConfig(bits=4, max_input_length=4028)
        cls.quantized_model = AutoModelForCausalLM.from_pretrained(
            cls.model_name,
            dtype=torch.float16,
            device_map={"": 0},
            quantization_config=cls.quantization_config,
        )
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name, use_fast=True)

    def check_inference_correctness(self, model):
        """
        Generate a few tokens and check the decoded text against the accepted
        outputs. Small models may diverge slightly across GPUs, hence the set
        of acceptable completions rather than a single string.
        """
        encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
        output_sequences = model.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10)
        decoded = self.tokenizer.decode(output_sequences[0], skip_special_tokens=True)
        self.assertIn(decoded, self.EXPECTED_OUTPUTS)

    def test_quantized_layers_type(self):
        """The quantized linear layers must use the exllama kernel."""
        self.assertTrue(self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE == "exllama")

    def test_generate_quality(self):
        """
        Simple test to check the quality of the model by comparing the generated tokens with the expected tokens
        """
        self.check_inference_correctness(self.quantized_model)

    def test_max_input_length(self):
        """
        Test if the max_input_length works. It modifies the maximum input length that of the model that runs with exllama backend.
        """
        # A prompt longer than max_input_length (4028) must raise.
        prompt = "I am in Paris and" * 1000
        inp = self.tokenizer(prompt, return_tensors="pt").to(0)
        self.assertTrue(inp["input_ids"].shape[1] > 4028)
        with self.assertRaises(RuntimeError) as cm:
            self.quantized_model.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3)
        # FIX: this assertion was previously inside the assertRaises block,
        # after the raising `generate` call, so it never executed. Check the
        # error message only after the context manager has captured it.
        self.assertTrue("temp_state buffer is too small" in str(cm.exception))

        # A short prompt must generate without error.
        prompt = "I am in Paris and"
        inp = self.tokenizer(prompt, return_tensors="pt").to(0)
        self.assertTrue(inp["input_ids"].shape[1] < 4028)
        self.quantized_model.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3)
|
||||
|
||||
|
||||
@slow
@require_optimum
@require_gptq
@require_torch_gpu
@require_accelerate
class GPTQTestExllamaV2(unittest.TestCase):
    """
    Test GPTQ model with exllamav2 kernel and desc_act=True (also known as act-order).
    More information on those arguments here:
    https://huggingface.co/docs/transformers/main_classes/quantization#transformers.GPTQConfig
    """

    EXPECTED_OUTPUTS = set()
    # flaky test: gptqmodel and auto-gptq are not output equivalent nor is string compare deterministic even between transformer/torch versions
    EXPECTED_OUTPUTS.add("Hello, how are you ? I'm doing good, thanks for asking.")
    # 4bit + act_order + 128g
    model_name = "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ"
    input_text = "Hello, how are you ?"

    @classmethod
    def setUpClass(cls):
        """Load the checkpoint once with the exllama v2 kernel selected."""
        cls.quantization_config = GPTQConfig(bits=4, exllama_config={"version": 2})
        cls.quantized_model = AutoModelForCausalLM.from_pretrained(
            cls.model_name,
            dtype=torch.float16,
            device_map={"": 0},
            quantization_config=cls.quantization_config,
        )
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name, use_fast=True)

    def test_quantized_layers_type(self):
        """Kernel selection depends on which GPTQ backend is installed."""
        kernel_type = self.quantized_model.model.layers[0].self_attn.k_proj.QUANT_TYPE
        if is_auto_gptq_available() and not is_gptqmodel_available():
            self.assertEqual(kernel_type, "exllamav2")
        else:
            # We expect tritonv2 to be used here, because exllama backend doesn't support packing https://github.com/ModelCloud/GPTQModel/issues/1354
            # TODO: Remove this once GPTQModel exllama kernels supports packing
            self.assertEqual(kernel_type, "tritonv2")

    def check_inference_correctness(self, model):
        """
        Generate ~10 tokens and check that the decoding is one of the
        known-good completions (small models may diverge across GPUs).
        """
        encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
        output_sequences = model.generate(input_ids=encoded_input["input_ids"].to(0), max_new_tokens=10)
        self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)

    def test_generate_quality(self):
        """
        Simple test to check the quality of the model by comparing the generated tokens with the expected tokens
        """
        self.check_inference_correctness(self.quantized_model)
|
||||
|
||||
|
||||
# fail when run all together
@pytest.mark.skip
@require_accelerate
@require_torch_multi_gpu
class GPTQTestDeviceMapCPUOffload(GPTQTest):
    """GPTQ suite with a hand-written device map that offloads some transformer blocks to CPU."""

    # Blocks 0-9 and 17 on GPU 0; 10-16 and 23 on GPU 1; 18-22 offloaded to CPU.
    device_map = {
        "transformer.word_embeddings": 0,
        "transformer.word_embeddings_layernorm": 0,
        "lm_head": 0,
        **{f"transformer.h.{i}": 0 for i in [*range(10), 17]},
        **{f"transformer.h.{i}": 1 for i in [*range(10, 17), 23]},
        **{f"transformer.h.{i}": "cpu" for i in range(18, 23)},
        "transformer.ln_f": 0,
    }
|
||||
0
transformers/tests/quantization/higgs/__init__.py
Normal file
0
transformers/tests/quantization/higgs/__init__.py
Normal file
197
transformers/tests/quantization/higgs/test_higgs.py
Normal file
197
transformers/tests/quantization/higgs/test_higgs.py
Normal file
@@ -0,0 +1,197 @@
|
||||
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import gc
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, HiggsConfig, OPTForCausalLM
|
||||
from transformers.testing_utils import (
|
||||
backend_empty_cache,
|
||||
require_accelerate,
|
||||
require_flute_hadamard,
|
||||
require_torch_gpu,
|
||||
require_torch_multi_gpu,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
from transformers.utils import is_accelerate_available, is_torch_available
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_accelerate_available():
|
||||
from accelerate import init_empty_weights
|
||||
|
||||
|
||||
@require_torch_gpu
class HiggsConfigTest(unittest.TestCase):
    def test_to_dict(self):
        """
        Simple test that checks if one uses a config and converts it to a dict, the dict is the same as the config object
        """
        quantization_config = HiggsConfig()
        config_to_dict = quantization_config.to_dict()

        for key in config_to_dict:
            self.assertEqual(getattr(quantization_config, key), config_to_dict[key])

    def test_from_dict(self):
        """
        Simple test that checks if one uses a dict and converts it to a config object, the config object is the same as the dict
        """
        # FIX: the local was previously named `dict`, shadowing the builtin.
        config_dict = {"modules_to_not_convert": ["embed_tokens", "lm_head"], "quant_method": "higgs"}
        quantization_config = HiggsConfig.from_dict(config_dict)

        self.assertEqual(config_dict["modules_to_not_convert"], quantization_config.modules_to_not_convert)
        self.assertEqual(config_dict["quant_method"], quantization_config.quant_method)
|
||||
|
||||
|
||||
@slow
@require_torch_gpu
@require_flute_hadamard
@require_accelerate
# @require_read_token
class HiggsTest(unittest.TestCase):
    """End-to-end tests for HIGGS quantization: conversion, generation, save/load, multi-GPU."""

    model_name = "unsloth/Llama-3.2-1B"

    input_text = "Font test: A quick brown fox jumps over the"
    max_new_tokens = 2

    EXPECTED_OUTPUT = "Font test: A quick brown fox jumps over the lazy dog"

    device_map = "cuda"

    # called only once for all test in this class
    @classmethod
    def setUpClass(cls):
        """
        Setup quantized model
        """
        quantization_config = HiggsConfig()
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.quantized_model = AutoModelForCausalLM.from_pretrained(
            cls.model_name, device_map=cls.device_map, quantization_config=quantization_config
        )

    def tearDown(self):
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_quantized_model_conversion(self):
        """
        Check that replace_with_higgs_linear converts the expected number of linear layers.
        """
        from transformers.integrations import HiggsLinear, replace_with_higgs_linear

        model_id = "facebook/opt-350m"
        config = AutoConfig.from_pretrained(model_id, revision="cb32f77e905cccbca1d970436fb0f5e6b58ee3c5")
        quantization_config = HiggsConfig()

        with init_empty_weights():
            model = OPTForCausalLM(config)

        # Count linear layers before conversion; booleans sum as 0/1.
        num_linear = sum(isinstance(m, torch.nn.Linear) for m in model.modules())

        model, _ = replace_with_higgs_linear(model, quantization_config=quantization_config)
        num_higgs = sum(isinstance(m, HiggsLinear) for m in model.modules())
        # All linears except one (lm_head) are converted.
        self.assertEqual(num_linear - 1, num_higgs)

        # With "fc1" excluded, 24 fewer modules get converted.
        with init_empty_weights():
            model = OPTForCausalLM(config)
        quantization_config = HiggsConfig(modules_to_not_convert=["fc1"])
        model, _ = replace_with_higgs_linear(model, quantization_config=quantization_config)
        num_higgs = sum(isinstance(m, HiggsLinear) for m in model.modules())
        self.assertEqual(num_linear - 24, num_higgs)

    def test_quantized_model(self):
        """
        Simple test that checks if the quantized model is working properly
        """
        encoded = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
        generated = self.quantized_model.generate(**encoded, max_new_tokens=self.max_new_tokens)
        self.assertEqual(self.tokenizer.decode(generated[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    def test_save_pretrained(self):
        """
        Simple test that checks if the quantized model is working properly after being saved and loaded
        """
        with tempfile.TemporaryDirectory() as saved_dir:
            self.quantized_model.save_pretrained(saved_dir)

            reloaded = AutoModelForCausalLM.from_pretrained(saved_dir, device_map=self.device_map)

            encoded = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
            generated = reloaded.generate(**encoded, max_new_tokens=self.max_new_tokens)
            self.assertEqual(self.tokenizer.decode(generated[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    @require_torch_multi_gpu
    def test_quantized_model_multi_gpu(self):
        """
        Simple test that checks if the quantized model is working properly with multiple GPUs
        set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 GPUs
        """
        encoded = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
        quantization_config = HiggsConfig()
        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name, device_map="auto", quantization_config=quantization_config
        )
        self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})

        generated = quantized_model.generate(**encoded, max_new_tokens=self.max_new_tokens)
        self.assertEqual(self.tokenizer.decode(generated[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    @require_torch_multi_gpu
    def test_save_pretrained_multi_gpu(self):
        """
        Simple test that checks if the quantized model is working properly after being saved and loaded
        """
        with tempfile.TemporaryDirectory() as saved_dir:
            self.quantized_model.save_pretrained(saved_dir)

            reloaded = AutoModelForCausalLM.from_pretrained(saved_dir, device_map="auto")
            self.assertTrue(set(reloaded.hf_device_map.values()) == {0, 1})

            encoded = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
            generated = reloaded.generate(**encoded, max_new_tokens=self.max_new_tokens)
            self.assertEqual(self.tokenizer.decode(generated[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    @unittest.skip("This will almost surely OOM. Enable when switched to a smaller model")
    def test_dequantize(self):
        """
        Test the ability to dequantize a model
        """
        self.quantized_model.dequantize()

        encoded = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
        generated = self.quantized_model.generate(**encoded, max_new_tokens=self.max_new_tokens)
        self.assertEqual(self.tokenizer.decode(generated[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
|
||||
314
transformers/tests/quantization/hqq/test_hqq.py
Executable file
314
transformers/tests/quantization/hqq/test_hqq.py
Executable file
@@ -0,0 +1,314 @@
|
||||
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import gc
|
||||
import unittest
|
||||
|
||||
import accelerate
|
||||
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, HqqConfig
|
||||
from transformers.testing_utils import (
|
||||
backend_empty_cache,
|
||||
require_accelerate,
|
||||
require_deterministic_for_xpu,
|
||||
require_hqq,
|
||||
require_torch_accelerator,
|
||||
require_torch_multi_accelerator,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
from transformers.utils import is_hqq_available, is_torch_available
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_hqq_available():
|
||||
from hqq.core.quantize import HQQBackend, HQQLinear
|
||||
|
||||
|
||||
class HQQLLMRunner:
    """Helper that loads an HQQ-quantized causal LM together with its tokenizer."""

    def __init__(self, model_id, quant_config, compute_dtype, device, cache_dir=None):
        # Load the quantized model directly onto the requested device.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=quant_config,
            dtype=compute_dtype,
            device_map=device,
            cache_dir=cache_dir,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
        self.device = self.model.device
        # Use the pure-PyTorch HQQ backend for portability in tests.
        HQQLinear.set_backend(HQQBackend.PYTORCH)
|
||||
|
||||
|
||||
def cleanup():
    """Free accelerator memory and run the garbage collector."""
    backend_empty_cache(torch_device)
    gc.collect()
|
||||
|
||||
|
||||
def check_hqqlayer(test_module, hqq_layer, batch_size=1, context_size=1024):
    """Push a random batch through a single HQQ layer and validate output shape/dtype."""
    reconstructed_w = hqq_layer.dequantize()  # Reconstructed weights
    sample = (
        torch.randn(
            (batch_size, context_size, hqq_layer.meta["shape"][1]),
            device=hqq_layer.device,
            dtype=hqq_layer.compute_dtype,
        )
        / 10.0
    )
    with torch.no_grad():
        layer_out = hqq_layer(sample)
    test_module.assertEqual(layer_out.shape[-1], reconstructed_w.shape[0])
    test_module.assertEqual(layer_out.dtype, hqq_layer.compute_dtype)
    del reconstructed_w, sample, layer_out
    cleanup()
|
||||
|
||||
|
||||
def check_forward(test_module, model, batch_size=1, context_size=1024):
    """Run a full forward pass on zero token ids and validate logits batch/sequence dims."""
    dummy_ids = torch.zeros([batch_size, context_size], device=model.device, dtype=torch.int32)
    with torch.no_grad():
        logits = model(dummy_ids).logits
    test_module.assertEqual(logits.shape[0], batch_size)
    test_module.assertEqual(logits.shape[1], context_size)
    cleanup()
|
||||
|
||||
|
||||
MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
|
||||
|
||||
|
||||
@require_torch_accelerator
@require_hqq
class HqqConfigTest(unittest.TestCase):
    def test_to_dict(self):
        """
        Makes sure the config format is properly set
        """
        quantization_config = HqqConfig()
        serialized = quantization_config.to_dict()

        self.assertEqual(quantization_config.quant_config, serialized["quant_config"])
|
||||
|
||||
|
||||
@slow
@require_torch_accelerator
@require_accelerate
@require_hqq
class HQQTest(unittest.TestCase):
    """Single-device HQQ tests: layer correctness, device/dtype moves, fake-weight dtype."""

    def tearDown(self):
        cleanup()

    def test_fp16_quantized_model(self):
        """
        Simple LLM model testing fp16
        """
        quant_config = HqqConfig(nbits=8, group_size=64)

        runner = HQQLLMRunner(
            model_id=MODEL_ID, quant_config=quant_config, compute_dtype=torch.float16, device=torch_device
        )

        check_hqqlayer(self, runner.model.model.layers[0].self_attn.v_proj)
        check_forward(self, runner.model)

    def test_quantized_model_to_new_device_and_new_dtype(self):
        """
        Simple LLM model testing different devices and dtypes
        """
        quant_config = HqqConfig(nbits=8, group_size=64)

        runner = HQQLLMRunner(
            model_id=MODEL_ID, quant_config=quant_config, compute_dtype=torch.float16, device=torch_device
        )

        check_hqqlayer(self, runner.model.model.layers[0].self_attn.v_proj)
        check_forward(self, runner.model)

        # Remove `accelerate` hooks to enable move the model to a new device
        accelerate.hooks.remove_hook_from_module(runner.model, recurse=True)

        # Move to CPU in bfloat16 and re-validate.
        runner.model.to("cpu", torch.bfloat16)
        check_hqqlayer(self, runner.model.model.layers[0].self_attn.v_proj)
        check_forward(self, runner.model)

        # Move back to the accelerator and re-validate.
        runner.model.to(torch_device)
        check_hqqlayer(self, runner.model.model.layers[0].self_attn.v_proj)
        check_forward(self, runner.model)

    def test_quantized_model_fake_weight_dtype(self):
        quant_config = HqqConfig(nbits=8, group_size=64)

        runner = HQQLLMRunner(
            model_id=MODEL_ID, quant_config=quant_config, compute_dtype=torch.float16, device=torch_device
        )

        # We use a hack to inject a fake weight to HQQLinear. Check that it works
        self.assertEqual(runner.model.model.layers[0].self_attn.v_proj.weight.dtype, torch.float16)
|
||||
|
||||
|
||||
@slow
@require_torch_accelerator
@require_torch_multi_accelerator
@require_accelerate
@require_hqq
class HQQTestMultiGPU(unittest.TestCase):
    """HQQ tests dispatched over multiple accelerators via device_map='auto'."""

    def tearDown(self):
        cleanup()

    def test_fp16_quantized_model_multipgpu(self):
        """
        Simple LLM model testing fp16 with multi-gpu
        """
        quant_config = HqqConfig(nbits=8, group_size=64)

        runner = HQQLLMRunner(
            model_id=MODEL_ID, quant_config=quant_config, compute_dtype=torch.float16, device="auto"
        )

        check_hqqlayer(self, runner.model.model.layers[0].self_attn.v_proj)
        check_forward(self, runner.model)
|
||||
|
||||
|
||||
@slow
@require_torch_accelerator
@require_accelerate
@require_hqq
class HQQTestBias(unittest.TestCase):
    """HQQ tests on a model whose linear layers carry bias terms (OPT-125m)."""

    def tearDown(self):
        cleanup()

    def test_fp16_quantized_model(self):
        """
        Simple LLM model testing fp16 with bias
        """
        quant_config = HqqConfig(nbits=8, group_size=64)

        runner = HQQLLMRunner(
            model_id="facebook/opt-125m", quant_config=quant_config, compute_dtype=torch.float16, device=torch_device
        )

        check_hqqlayer(self, runner.model.model.decoder.layers[0].self_attn.v_proj)
        check_forward(self, runner.model)

    @require_deterministic_for_xpu
    def test_save_and_load_quantized_model(self):
        """
        Test saving and loading a quantized model with bias
        """
        import tempfile

        quant_config = HqqConfig(nbits=8, group_size=64)

        runner = HQQLLMRunner(
            model_id="facebook/opt-125m", quant_config=quant_config, compute_dtype=torch.float16, device=torch_device
        )

        input_tensor = torch.zeros((1, 8), dtype=torch.int32, device=torch_device)

        # Get reference logits
        with torch.no_grad():
            logits_ref = runner.model.forward(input_tensor).logits

        with tempfile.TemporaryDirectory() as saved_dir:
            runner.model.save_pretrained(saved_dir)

            del runner.model
            backend_empty_cache(torch_device)

            model_loaded = AutoModelForCausalLM.from_pretrained(
                saved_dir, dtype=torch.float16, device_map=torch_device
            )

        # Logits of the reloaded model must match the reference exactly.
        with torch.no_grad():
            logits_loaded = model_loaded.forward(input_tensor).logits

        self.assertEqual((logits_loaded - logits_ref).abs().mean().item(), 0)
|
||||
|
||||
|
||||
@slow
@require_torch_accelerator
@require_accelerate
@require_hqq
class HQQSerializationTest(unittest.TestCase):
    """Save/load round-trip tests for HQQ-quantized models."""

    def tearDown(self):
        cleanup()

    def test_model_serialization(self):
        """
        Simple HQQ LLM save/load test
        """
        import tempfile

        quant_config = HqqConfig(nbits=4, group_size=64)

        hqq_runner = HQQLLMRunner(
            model_id=MODEL_ID, quant_config=quant_config, compute_dtype=torch.float16, device=torch_device
        )

        input_tensor = torch.zeros((1, 8), dtype=torch.int32, device=torch_device)

        with torch.no_grad():
            logits_ref = hqq_runner.model.forward(input_tensor).logits

        # FIX: previously this saved into a persistent "quant_model" directory
        # in the CWD and never removed it; use a temporary directory like the
        # sibling tests so the test leaves no artifacts behind.
        with tempfile.TemporaryDirectory() as saved_model_dir:
            hqq_runner.model.save_pretrained(saved_model_dir)

            # Remove old model
            del hqq_runner.model
            backend_empty_cache(torch_device)

            # Load and check if the logits match
            model_loaded = AutoModelForCausalLM.from_pretrained(
                saved_model_dir,
                dtype=torch.float16,
                device_map=torch_device,
            )

            with torch.no_grad():
                logits_loaded = model_loaded.forward(input_tensor).logits

        self.assertEqual((logits_loaded - logits_ref).abs().mean().item(), 0)

    def test_model_serialization_dynamic_quant_with_skip(self):
        """
        Simple HQQ LLM save/load test with dynamic quant
        """
        q4_config = {"nbits": 4, "group_size": 64}
        q3_config = {"nbits": 3, "group_size": 64}

        quant_config = HqqConfig(
            dynamic_config={
                "self_attn.q_proj": q4_config,
                "self_attn.k_proj": q4_config,
                "self_attn.v_proj": q4_config,
                "self_attn.o_proj": q4_config,
                "mlp.gate_proj": q3_config,
                "mlp.up_proj": q3_config,
            },
            skip_modules=["lm_head", "down_proj"],
        )

        hqq_runner = HQQLLMRunner(
            model_id=MODEL_ID, quant_config=quant_config, compute_dtype=torch.float16, device=torch_device
        )

        model = hqq_runner.model

        input_tensor = torch.zeros((1, 8), dtype=torch.int32, device=torch_device)
        with torch.no_grad():
            model.forward(input_tensor).logits

        # Skipped modules stay as plain nn.Linear; per-pattern bit widths are applied.
        self.assertEqual(isinstance(model.model.layers[1].mlp.down_proj, torch.nn.Linear), True)
        self.assertEqual(model.model.layers[1].self_attn.v_proj.quant_config["weight_quant_params"]["nbits"], 4)
        self.assertEqual(model.model.layers[1].mlp.gate_proj.quant_config["weight_quant_params"]["nbits"], 3)
|
||||
0
transformers/tests/quantization/mxfp4/__init__.py
Normal file
0
transformers/tests/quantization/mxfp4/__init__.py
Normal file
505
transformers/tests/quantization/mxfp4/test_mxfp4.py
Normal file
505
transformers/tests/quantization/mxfp4/test_mxfp4.py
Normal file
@@ -0,0 +1,505 @@
|
||||
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import gc
|
||||
import tempfile
|
||||
import unittest
|
||||
from unittest.mock import patch
|
||||
|
||||
from transformers import AutoTokenizer, GptOssForCausalLM, Mxfp4Config
|
||||
from transformers.testing_utils import (
|
||||
require_kernels,
|
||||
require_torch,
|
||||
require_torch_gpu,
|
||||
require_torch_large_gpu,
|
||||
require_triton,
|
||||
slow,
|
||||
)
|
||||
from transformers.utils import (
|
||||
is_torch_available,
|
||||
)
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
|
||||
class Mxfp4ConfigTest(unittest.TestCase):
    """Unit tests for Mxfp4Config construction and (de)serialization."""

    def test_basic_config_creation(self):
        """Test basic configuration creation with default values"""
        config = Mxfp4Config()
        self.assertEqual(config.quant_method.value, "mxfp4")
        self.assertIsNone(config.modules_to_not_convert)
        self.assertFalse(config.dequantize)

    def test_config_with_modules_to_not_convert(self):
        """Test configuration with modules to not convert"""
        skipped = ["model.layers.*.self_attn", "lm_head"]
        config = Mxfp4Config(modules_to_not_convert=skipped)
        self.assertEqual(config.modules_to_not_convert, skipped)

    def test_config_with_dequantize(self):
        """Test configuration with dequantize enabled"""
        self.assertTrue(Mxfp4Config(dequantize=True).dequantize)

    def test_get_loading_attributes(self):
        """Test get_loading_attributes method"""
        attrs = Mxfp4Config(dequantize=True).get_loading_attributes()
        self.assertEqual(attrs["dequantize"], True)

    def test_to_dict(self):
        """Test configuration serialization to dict"""
        config = Mxfp4Config(modules_to_not_convert=["lm_head"], dequantize=True)
        serialized = config.to_dict()
        self.assertEqual(serialized["quant_method"], "mxfp4")
        self.assertEqual(serialized["modules_to_not_convert"], ["lm_head"])
        # we don't keep dequantize in config_dict
        self.assertTrue("dequantize" not in serialized)

    def test_from_dict(self):
        """Test configuration creation from dict"""
        source = {"quant_method": "mxfp4", "modules_to_not_convert": ["lm_head"], "dequantize": True}
        config = Mxfp4Config.from_dict(source)
        self.assertEqual(config.modules_to_not_convert, ["lm_head"])
        self.assertTrue(config.dequantize)
|
||||
|
||||
|
||||
class Mxfp4QuantizerTest(unittest.TestCase):
    """Test the Mxfp4HfQuantizer class"""

    def setUp(self):
        # Start each test with collected garbage and an empty CUDA cache so
        # memory state from a previous test cannot leak in.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def test_quantizer_validation_no_torch(self):
        """Test quantizer validation when torch is not available"""
        # NOTE: the import is performed *inside* the patch context so the
        # quantizer module observes the patched availability flag.
        with patch("transformers.quantizers.quantizer_mxfp4.is_torch_available", return_value=False):
            from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

            config = Mxfp4Config()
            quantizer = Mxfp4HfQuantizer(config)

            with self.assertRaises(ImportError):
                quantizer.validate_environment()

    def test_quantizer_validation_no_cuda(self):
        """Test quantizer validation when CUDA is not available"""
        with patch("torch.cuda.is_available", return_value=False):
            from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

            config = Mxfp4Config()
            quantizer = Mxfp4HfQuantizer(config)
            # Quantizing from scratch (not pre-quantized) requires CUDA.
            quantizer.pre_quantized = False

            with self.assertRaises(RuntimeError):
                quantizer.validate_environment()

    def test_quantizer_validation_low_compute_capability(self):
        """Test quantizer validation with low compute capability"""
        # Compute capability (7, 0) is below what the mxfp4 path accepts,
        # so validation must fail for a non-pre-quantized model.
        with patch("torch.cuda.get_device_capability", return_value=(7, 0)):
            from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

            config = Mxfp4Config()
            quantizer = Mxfp4HfQuantizer(config)
            quantizer.pre_quantized = False

            with self.assertRaises(ValueError):
                quantizer.validate_environment()

    def test_quantizer_validation_low_compute_capability_with_prequantized(self):
        """Test quantizer validation with low compute capability"""
        # Same capability as above, but pre_quantized is left at its default,
        # so validation is expected to fall back to dequantization instead of
        # raising.
        with patch("torch.cuda.get_device_capability", return_value=(7, 0)):
            from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

            config = Mxfp4Config()
            quantizer = Mxfp4HfQuantizer(config)

            # Should automatically set dequantize=True and warn
            quantizer.validate_environment()
            self.assertTrue(quantizer.quantization_config.dequantize)

    def test_quantizer_validation_low_compute_capability_with_dequantize(self):
        """Test quantizer validation with low compute capability but dequantize enabled"""
        with patch("torch.cuda.get_device_capability", return_value=(7, 0)):
            from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

            config = Mxfp4Config(dequantize=True)
            quantizer = Mxfp4HfQuantizer(config)

            # Should not raise error with dequantize=True
            try:
                quantizer.validate_environment()
            except ValueError as e:
                # Only a capability-related ValueError is a failure here;
                # unrelated ValueErrors are deliberately ignored.
                if "compute capability" in str(e):
                    self.fail("Should not raise compute capability error when dequantize=True")

    def test_quantizer_validation_order_dequantize_before_cuda_check(self):
        """Test that dequantize check happens before CUDA availability check"""
        # Mock torch.cuda.is_available
        with patch("torch.cuda.is_available", return_value=False):
            from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

            # Test with dequantize=True - should pass even without CUDA and accelerate
            config = Mxfp4Config(dequantize=True)
            quantizer = Mxfp4HfQuantizer(config)

            # This should not raise any error because dequantize check comes first
            quantizer.validate_environment()

            # Test with dequantize=False - should still fail due to missing CUDA
            config = Mxfp4Config(dequantize=False)
            quantizer = Mxfp4HfQuantizer(config)
            quantizer.pre_quantized = False

            with self.assertRaises(RuntimeError):
                quantizer.validate_environment()

    def test_quantizer_validation_missing_triton(self):
        """Test quantizer validation when triton is not available"""
        # Both the triton and kernels fallbacks are patched out, so a
        # non-pre-quantized model has no way to run and must be rejected.
        with (
            patch("transformers.quantizers.quantizer_mxfp4.is_triton_available", return_value=False),
            patch("transformers.quantizers.quantizer_mxfp4.is_kernels_available", return_value=False),
        ):
            from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

            config = Mxfp4Config()
            quantizer = Mxfp4HfQuantizer(config)
            quantizer.pre_quantized = False
            with self.assertRaises(ValueError):
                quantizer.validate_environment()

    def test_quantizer_validation_missing_triton_pre_quantized_no_dequantize(self):
        """Test quantizer validation when triton is not available but model is pre-quantized and dequantize is False"""
        with (
            patch("transformers.quantizers.quantizer_mxfp4.is_triton_available", return_value=False),
            patch("transformers.quantizers.quantizer_mxfp4.is_kernels_available", return_value=False),
        ):
            from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

            config = Mxfp4Config()
            quantizer = Mxfp4HfQuantizer(config)
            quantizer.pre_quantized = True

            # Should automatically set dequantize=True and warn
            quantizer.validate_environment()
            self.assertTrue(quantizer.quantization_config.dequantize)

    def test_update_dtype(self):
        """Test torch dtype updating"""
        from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

        config = Mxfp4Config()
        quantizer = Mxfp4HfQuantizer(config)

        # Should default to bfloat16
        result_dtype = quantizer.update_dtype(None)
        self.assertEqual(result_dtype, torch.bfloat16)

        # Should preserve existing dtype
        result_dtype = quantizer.update_dtype(torch.float32)
        self.assertEqual(result_dtype, torch.float32)

    def test_update_expected_keys(self):
        """Test expected keys updating for quantized models"""
        from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

        config = Mxfp4Config()
        quantizer = Mxfp4HfQuantizer(config)

        expected_keys = [
            "model.layers.0.mlp.experts.gate_up_proj",
            "model.layers.0.mlp.experts.down_proj",
            "model.embed_tokens.weight",
        ]

        updated_keys = quantizer.update_expected_keys(None, expected_keys, [])

        # Each expert projection key is expected to split into a
        # _blocks/_scales pair; non-expert keys pass through unchanged.
        expected_updated = [
            "model.layers.0.mlp.experts.gate_up_proj_blocks",
            "model.layers.0.mlp.experts.gate_up_proj_scales",
            "model.layers.0.mlp.experts.down_proj_blocks",
            "model.layers.0.mlp.experts.down_proj_scales",
            "model.embed_tokens.weight",
        ]

        # Key order is irrelevant, so compare as sets.
        self.assertEqual(set(updated_keys), set(expected_updated))

    def test_update_param_name_dequantize(self):
        """Test parameter name updating when dequantizing"""
        from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

        config = Mxfp4Config(dequantize=True)
        quantizer = Mxfp4HfQuantizer(config)

        # Should remove _blocks suffix
        param_name = "model.layers.0.mlp.experts.gate_up_proj_blocks"
        updated_name = quantizer.update_param_name(param_name)
        self.assertEqual(updated_name, "model.layers.0.mlp.experts.gate_up_proj")

        # Should remove _scales suffix
        param_name = "model.layers.0.mlp.experts.down_proj_scales"
        updated_name = quantizer.update_param_name(param_name)
        self.assertEqual(updated_name, "model.layers.0.mlp.experts.down_proj")

        # Should not change other names
        param_name = "model.embed_tokens.weight"
        updated_name = quantizer.update_param_name(param_name)
        self.assertEqual(updated_name, "model.embed_tokens.weight")

    def test_update_param_name_no_dequantize(self):
        """Test parameter name updating when not dequantizing"""
        from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

        config = Mxfp4Config(dequantize=False)
        quantizer = Mxfp4HfQuantizer(config)

        # Without dequantization the packed parameter name is kept as-is.
        param_name = "model.layers.0.mlp.experts.gate_up_proj_blocks"
        updated_name = quantizer.update_param_name(param_name)
        self.assertEqual(updated_name, param_name)

    def test_is_trainable(self):
        """Test trainability"""
        from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

        config = Mxfp4Config()
        quantizer = Mxfp4HfQuantizer(config)

        # MXFP4 is not trainable
        self.assertFalse(quantizer.is_trainable)
|
||||
|
||||
|
||||
class Mxfp4IntegrationTest(unittest.TestCase):
    """Test mxfp4 integration functions"""

    def test_should_convert_module(self):
        """Test module conversion decision logic"""
        from transformers.integrations.mxfp4 import should_convert_module

        # Should convert by default
        self.assertTrue(should_convert_module(["model", "layers", "0", "mlp"], []))

        # Should not convert if in exclusion list
        # (patterns support "*" wildcards for the layer index)
        patterns = ["model.layers.*.self_attn", "lm_head"]
        self.assertFalse(should_convert_module(["model", "layers", "0", "self_attn"], patterns))
        self.assertFalse(should_convert_module(["lm_head"], patterns))

        # Should convert if not in exclusion list
        self.assertTrue(should_convert_module(["model", "layers", "0", "mlp", "experts"], patterns))

    @require_torch
    def test_convert_moe_packed_tensors(self):
        """Test unpacking of quantized tensors"""
        from transformers.integrations.mxfp4 import convert_moe_packed_tensors

        # Create dummy packed tensors
        blocks = torch.randint(0, 255, (2, 4, 8, 16), dtype=torch.uint8)
        scales = torch.randint(100, 150, (2, 4, 8), dtype=torch.uint8)

        result = convert_moe_packed_tensors(blocks, scales, dtype=torch.bfloat16)
        # Each packed uint8 holds two 4-bit values, hence the *2 factor;
        # presumably the last two packed dims (8, 16) are flattened together
        # and the remaining dim transposed — confirm against the implementation.
        self.assertEqual(result.shape, (2, 8 * 16 * 2, 4))
        self.assertEqual(result.dtype, torch.bfloat16)

    @require_triton(min_version="3.4.0")
    @require_kernels
    @require_torch_gpu
    @require_torch
    def test_quantize_to_mxfp4(self):
        """Test quantization function"""
        from transformers.integrations.mxfp4 import quantize_to_mxfp4
        from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

        config = Mxfp4Config()
        quantizer = Mxfp4HfQuantizer(config)

        # Create dummy weight tensor
        w = torch.randn(32, 64, 128, dtype=torch.bfloat16, device=torch.device("cuda"))

        quantized_w, w_scale = quantize_to_mxfp4(w, quantizer._lazy_import_kernels())

        # Check that shapes are reasonable
        self.assertEqual(quantized_w.dtype, torch.uint8)
|
||||
|
||||
|
||||
@require_torch
@require_torch_large_gpu
@require_triton(min_version="3.4.0")
@require_kernels
@slow
class Mxfp4ModelTest(unittest.TestCase):
    """Test mxfp4 with actual models (requires specific model and hardware)"""

    # These should be paths to real OpenAI MoE models for proper testing
    model_name = "openai/gpt-oss-20b"

    input_text = "Once upon a time"

    # Expected outputs for generation tests
    EXPECTED_OUTPUTS = set()
    EXPECTED_OUTPUTS.add("Once upon a time, in a small town, there lived a young")

    def setUp(self):
        # Reclaim GPU memory before each test; these models are large.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def tearDown(self):
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def check_inference_correctness_quantized(self, model, tokenizer):
        """Greedy-generate 10 tokens and check the text is one of EXPECTED_OUTPUTS."""
        encoded_input = tokenizer(self.input_text, return_tensors="pt").to(model.device)

        # Set pad token if not set
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        with torch.no_grad():
            output_sequences = model.generate(
                **encoded_input,
                max_new_tokens=10,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
                use_cache=False,
            )

        generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)

        self.assertIn(generated_text, self.EXPECTED_OUTPUTS)

    def test_gpt_oss_model_loading_quantized_with_device_map(self):
        """Test loading OpenAI MoE model with mxfp4 quantization and device_map"""
        model = GptOssForCausalLM.from_pretrained(
            self.model_name,
            dtype=torch.bfloat16,
            device_map="auto",
        )
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.check_inference_correctness_quantized(model, tokenizer)

    def test_gpt_oss_model_loading_dequantized_with_device_map(self):
        """Test loading OpenAI MoE model with mxfp4 dequantization and device_map"""
        quantization_config = Mxfp4Config(dequantize=True)

        # Test that config is properly set up
        self.assertTrue(quantization_config.dequantize)

        model = GptOssForCausalLM.from_pretrained(
            self.model_name,
            quantization_config=quantization_config,
            dtype=torch.bfloat16,
            device_map="auto",
        )
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.check_inference_correctness_quantized(model, tokenizer)

    def test_model_device_map_validation(self):
        """Test device map validation"""
        from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer

        config = Mxfp4Config()
        quantizer = Mxfp4HfQuantizer(config)
        quantizer.pre_quantized = False

        # Test with CPU in device map (should raise error for non-pre-quantized)
        with self.assertRaises(ValueError):
            quantizer.validate_environment(device_map={"": "cpu"})

    def test_memory_footprint_comparison(self):
        """Test memory footprint differences between quantized and unquantized models"""
        # Expected: quantized < dequantized < unquantized memory usage
        quantization_config = Mxfp4Config(dequantize=True)
        quantized_model = GptOssForCausalLM.from_pretrained(
            self.model_name,
            dtype=torch.bfloat16,
            device_map="auto",
        )
        dequantized_model = GptOssForCausalLM.from_pretrained(
            self.model_name,
            dtype=torch.bfloat16,
            device_map="auto",
            quantization_config=quantization_config,
        )
        quantized_mem = quantized_model.get_memory_footprint()
        dequantized_mem = dequantized_model.get_memory_footprint()
        self.assertLess(quantized_mem, dequantized_mem)

    def test_save_mxfp4(self):
        """Test saving quantized OpenAI MoE model with device_map"""
        # NOTE: use `dtype=` consistently with the rest of this file
        # (`torch_dtype` is the deprecated alias).
        model = GptOssForCausalLM.from_pretrained(
            self.model_name,
            dtype=torch.bfloat16,
            device_map="auto",
        )
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        with tempfile.TemporaryDirectory() as tmp:
            # Save the model in mxfp4 format
            model.save_pretrained(tmp)
            torch.cuda.empty_cache()
            gc.collect()
            # test quantized model
            loaded_model = GptOssForCausalLM.from_pretrained(
                tmp,
                dtype=torch.bfloat16,
                device_map="auto",
            )
            self.check_inference_correctness_quantized(loaded_model, tokenizer)

            # test dequantized model
            loaded_model = GptOssForCausalLM.from_pretrained(
                tmp,
                quantization_config=Mxfp4Config(dequantize=True),
                dtype=torch.bfloat16,
                device_map="auto",
            )
            self.check_inference_correctness_quantized(loaded_model, tokenizer)

    def test_save_mxfp4_non_quantized(self):
        """Test saving dequantized OpenAI MoE model with mxfp4 quantization and device_map"""
        non_quantized_model_name = "hf-internal-testing/gpt-oss-20b-bf16"
        tokenizer = AutoTokenizer.from_pretrained(non_quantized_model_name)
        loaded_model = GptOssForCausalLM.from_pretrained(
            non_quantized_model_name,
            quantization_config=Mxfp4Config(),
            dtype=torch.bfloat16,
            device_map="auto",
        )
        # save the quantized model
        with tempfile.TemporaryDirectory() as tmp:
            loaded_model.save_pretrained(tmp)
            torch.cuda.empty_cache()
            gc.collect()
            # load it back to check with everything works as expected
            loaded_model = GptOssForCausalLM.from_pretrained(
                tmp,
                dtype=torch.bfloat16,
                device_map="auto",
            )
            self.check_inference_correctness_quantized(loaded_model, tokenizer)

            # Fix: this previously passed ``dequantized=True`` (a typo for
            # ``dequantize=True``), so the dequantized-load path was never
            # actually exercised.
            loaded_model = GptOssForCausalLM.from_pretrained(
                tmp,
                quantization_config=Mxfp4Config(dequantize=True),
                dtype=torch.bfloat16,
                device_map="auto",
            )
            self.check_inference_correctness_quantized(loaded_model, tokenizer)
|
||||
@@ -0,0 +1,475 @@
|
||||
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, QuantoConfig
|
||||
from transformers.testing_utils import (
|
||||
require_accelerate,
|
||||
require_optimum_quanto,
|
||||
require_read_token,
|
||||
require_torch_accelerator,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
from transformers.utils import is_accelerate_available, is_optimum_quanto_available, is_torch_available
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
from transformers import LlamaForCausalLM
|
||||
|
||||
if is_accelerate_available():
|
||||
from accelerate import init_empty_weights
|
||||
|
||||
if is_optimum_quanto_available():
|
||||
from optimum.quanto import QLayerNorm, QLinear
|
||||
|
||||
from transformers.integrations.quanto import replace_with_quanto_layers
|
||||
|
||||
|
||||
class QuantoConfigTest(unittest.TestCase):
    """Placeholder suite for QuantoConfig attribute checks."""

    def test_attributes(self):
        # No attribute validation implemented yet.
        pass
|
||||
|
||||
|
||||
@require_optimum_quanto
@require_accelerate
class QuantoTestIntegration(unittest.TestCase):
    # Checks that replace_with_quanto_layers swaps exactly the expected
    # number of float modules for their quantized counterparts.
    model_id = "facebook/opt-350m"

    def setUp(self):
        # Build the model with empty (meta) weights: only module *types* are
        # inspected, so no real parameters are needed.
        config = AutoConfig.from_pretrained(self.model_id)
        with init_empty_weights():
            self.model = AutoModelForCausalLM.from_config(config)
        # Count the float modules up front so conversion can be checked 1:1.
        self.nb_linear = 0
        self.nb_layernorm = 0
        for module in self.model.modules():
            if isinstance(module, torch.nn.Linear):
                self.nb_linear += 1
            elif isinstance(module, torch.nn.LayerNorm):
                self.nb_layernorm += 1

    def test_weight_only_quantization_conversion(self):
        """
        Simple test that checks if the quantized model has been converted properly when using weight only quantization
        """

        # Try with weight only quantization
        quantization_config = QuantoConfig(weights="int8", activations=None)
        self.model, _ = replace_with_quanto_layers(self.model, quantization_config=quantization_config)

        nb_qlinear = 0
        for module in self.model.modules():
            if isinstance(module, QLinear):
                nb_qlinear += 1

        # Every Linear must have been replaced by a QLinear.
        self.assertEqual(self.nb_linear, nb_qlinear)

    def test_weight_and_activation_quantization_conversion(self):
        """
        Simple test that checks if the quantized model has been converted properly when using weight + activation quantization
        """

        # Try with weight + activation quantization
        quantization_config = QuantoConfig(weights="int8", activations="int8")
        self.model, _ = replace_with_quanto_layers(self.model, quantization_config=quantization_config)

        nb_qlinear = 0
        nb_qlayernorm = 0
        for module in self.model.modules():
            if isinstance(module, QLinear):
                nb_qlinear += 1
            if isinstance(module, QLayerNorm):
                nb_qlayernorm += 1

        # With activations quantized, LayerNorms are converted too.
        self.assertEqual(self.nb_linear, nb_qlinear)
        self.assertEqual(self.nb_layernorm, nb_qlayernorm)

    def test_conversion_with_modules_to_not_convert(self):
        """
        Simple test that checks if the quantized model has been converted properly when specifying modules_to_not_convert argument
        """

        # Try with weight + activation quantization
        quantization_config = QuantoConfig(weights="int8", activations="int8")
        self.model, _ = replace_with_quanto_layers(
            self.model, quantization_config=quantization_config, modules_to_not_convert=["lm_head"]
        )

        nb_qlinear = 0
        nb_qlayernorm = 0
        for module in self.model.modules():
            if isinstance(module, QLinear):
                nb_qlinear += 1
            if isinstance(module, QLayerNorm):
                nb_qlayernorm += 1

        # lm_head is excluded, so exactly one Linear stays unconverted.
        self.assertEqual(self.nb_linear - 1, nb_qlinear)
|
||||
|
||||
|
||||
@slow
@require_torch_accelerator
@require_optimum_quanto
@require_accelerate
class QuantoQuantizationTest(unittest.TestCase):
    """
    Test 8-bit weights only quantization
    """

    model_name = "bigscience/bloom-560m"

    # Subclasses override these to cover int4 / offloaded variants.
    weights = "int8"
    activations = None
    device_map = "cpu"

    input_text = "Hello my name is"
    EXPECTED_OUTPUTS = "Hello my name is John, I am a professional photographer and I"

    def setUp(self):
        """
        Setup quantized model
        """
        quantization_config = QuantoConfig(
            weights=self.weights,
            activations=self.activations,
        )

        self.quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            device_map=self.device_map,
            quantization_config=quantization_config,
            dtype=torch.float32,
        )

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        # True when accelerate dispatched the model over several devices; in
        # that case the model must not be moved manually before generating.
        self.have_accelerate_hooks = (
            getattr(self.quantized_model, "hf_device_map", False) and len(self.quantized_model.hf_device_map) > 1
        )

    def check_inference_correctness(self, model, device):
        r"""
        Test the generation quality of the quantized model and see that we are matching the expected output.
        Given that we are operating on small numbers + the testing model is relatively small, we might not get
        the same output across GPUs. So we'll generate few tokens (5-10) and check their output.
        """
        if not self.have_accelerate_hooks:
            model.to(device)
        encoded_input = self.tokenizer(self.input_text, return_tensors="pt")
        output_sequences = model.generate(input_ids=encoded_input["input_ids"].to(device), max_new_tokens=10)
        self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)

    def test_generate_quality_cpu(self):
        """
        Simple test to check the quality of the model on cpu by comparing the generated tokens with the expected tokens
        """
        self.check_inference_correctness(self.quantized_model, "cpu")

    def test_generate_quality_accelerator(self):
        """
        Simple test to check the quality of the model on accelerators by comparing the generated tokens with the expected tokens
        """
        self.check_inference_correctness(self.quantized_model, torch_device)

    def test_quantized_model_layers(self):
        from optimum.quanto import QBitsTensor, QModuleMixin, QTensor

        """
        Suite of simple test to check if the layers are quantized and are working properly
        """
        # Test the type of the quantized layer
        self.assertTrue(isinstance(self.quantized_model.transformer.h[0].self_attention.query_key_value, QModuleMixin))
        self.assertTrue(
            isinstance(self.quantized_model.transformer.h[0].self_attention.query_key_value.weight, QTensor)
        )
        if self.weights == "int4":
            # int4 weights are held in the more specific QBitsTensor subclass.
            self.assertTrue(
                isinstance(self.quantized_model.transformer.h[0].self_attention.query_key_value.weight, QBitsTensor)
            )

        # check that the lm_head was indeed not quantized, just like bnb
        self.assertTrue(
            isinstance(self.quantized_model.lm_head, torch.nn.Linear)
            and not isinstance(self.quantized_model.lm_head, QModuleMixin)
        )
        if self.device_map in ["cpu", "cuda"]:
            self.assertEqual(
                self.quantized_model.transformer.h[0].self_attention.query_key_value.weight._data.device.type,
                self.device_map,
            )
        # Moving the model must move the packed quantized data along with it.
        self.quantized_model.to(0)
        self.assertEqual(
            self.quantized_model.transformer.h[0].self_attention.query_key_value.weight._data.device.type, torch_device
        )

    def test_serialization_bin(self):
        """
        Test the serialization, the loading and the inference of the quantized weights
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            with self.assertRaises(ValueError) as e:
                self.quantized_model.save_pretrained(tmpdirname, safe_serialization=False)
            self.assertIn(
                "The model is quantized with QuantizationMethod.QUANTO and is not serializable", str(e.exception)
            )
            # TODO: replace by the following when it works
            # quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
            #     tmpdirname, dtype=torch.float32, device_map="cpu"
            # )
            # self.check_inference_correctness(quantized_model_from_saved, device="cuda")

    def test_serialization_safetensors(self):
        """
        Test the serialization, the loading and the inference of the quantized weights
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            with self.assertRaises(ValueError) as e:
                self.quantized_model.save_pretrained(tmpdirname)
            self.assertIn(
                "The model is quantized with QuantizationMethod.QUANTO and is not serializable", str(e.exception)
            )
            # quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
            #     tmpdirname, dtype=torch.float32, device_map="cpu"
            # )
            # self.check_inference_correctness(quantized_model_from_saved, device="cuda")

    def check_same_model(self, model1, model2):
        # Two models are "the same" when their parameter dicts match exactly:
        # same names, shapes, devices, dtypes and values.
        d0 = dict(model1.named_parameters())
        d1 = dict(model2.named_parameters())
        self.assertTrue(d0.keys() == d1.keys())
        for k in d0:
            self.assertTrue(d0[k].shape == d1[k].shape)
            self.assertTrue(d0[k].device.type == d1[k].device.type)
            self.assertTrue(d0[k].device == d1[k].device)
            self.assertTrue(d0[k].dtype == d1[k].dtype)
            self.assertTrue(torch.equal(d0[k], d1[k].to(d0[k].device)))

    def test_compare_with_quanto(self):
        from optimum.quanto import freeze, qint4, qint8, quantize

        w_mapping = {"int8": qint8, "int4": qint4}
        model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            device_map=self.device_map,
            dtype=torch.float32,
        )
        # we do not quantize the lm_head since we don't do that in transformers
        quantize(model.transformer, weights=w_mapping[self.weights])
        freeze(model.transformer)
        self.check_same_model(model, self.quantized_model)
        self.check_inference_correctness(model, device=torch_device)

    @unittest.skip
    def test_load_from_quanto_saved(self):
        from optimum.quanto import freeze, qint4, qint8, quantize

        from transformers import QuantoConfig

        w_mapping = {"int8": qint8, "int4": qint4}
        model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            device_map=self.device_map,
            dtype=torch.float32,
        )
        # we do not quantize the lm_head since we don't do that in transformers
        quantize(model.transformer, weights=w_mapping[self.weights])
        freeze(model.transformer)

        with tempfile.TemporaryDirectory() as tmpdirname:
            model.config.quantization_config = QuantoConfig(
                weights=self.weights, activations=self.activations, modules_to_not_convert=["lm_head"]
            )
            model.save_pretrained(tmpdirname, safe_serialization=False)
            quantized_model_from_saved = AutoModelForCausalLM.from_pretrained(
                tmpdirname,
                device_map=self.device_map,
                dtype=torch.float32,
            )
            self.check_same_model(model, quantized_model_from_saved)
            self.check_inference_correctness(quantized_model_from_saved, device="cuda")
|
||||
|
||||
|
||||
class QuantoQuantizationOffloadTest(QuantoQuantizationTest):
    """Same int8 suite, but with part of the model offloaded to CPU and disk."""

    # Everything lives on device 0 except layer 22 (CPU) and layer 23 (disk),
    # so both offload paths are represented.
    device_map = {
        "transformer.word_embeddings": 0,
        "transformer.word_embeddings_layernorm": 0,
        "transformer.ln_f": 0,
        "transformer.h.0": 0,
        "transformer.h.1": 0,
        "transformer.h.2": 0,
        "transformer.h.3": 0,
        "transformer.h.4": 0,
        "transformer.h.5": 0,
        "transformer.h.6": 0,
        "transformer.h.7": 0,
        "transformer.h.8": 0,
        "transformer.h.9": 0,
        "transformer.h.10": 0,
        "transformer.h.11": 0,
        "transformer.h.12": 0,
        "transformer.h.13": 0,
        "transformer.h.14": 0,
        "transformer.h.15": 0,
        "transformer.h.16": 0,
        "transformer.h.17": 0,
        "transformer.h.18": 0,
        "transformer.h.19": 0,
        "transformer.h.20": 0,
        "transformer.h.21": 0,
        "transformer.h.22": "cpu",
        "transformer.h.23": "disk",
        "lm_head": 0,
    }

    @unittest.skip(reason="The execution device is a gpu")
    def test_generate_quality_cpu(self):
        pass

    @unittest.skip(reason="We can't save offloaded values")
    def test_serialization_bin(self):
        pass

    @unittest.skip
    def test_serialization_safetensors(self):
        pass

    @unittest.skip
    def test_compare_with_quanto(self):
        pass

    @unittest.skip
    def test_load_from_quanto_saved(self):
        pass

    def test_check_offload_quantized(self):
        """
        We check that we have unquantized value in the cpu and in the disk
        """
        from optimum.quanto import QBitsTensor, QTensor

        # Offloaded weights are stored by accelerate hooks as plain tensors.
        cpu_weights = self.quantized_model.transformer.h[22].self_attention.query_key_value._hf_hook.weights_map[
            "weight"
        ]
        disk_weights = self.quantized_model.transformer.h[23].self_attention.query_key_value._hf_hook.weights_map[
            "weight"
        ]
        self.assertTrue(isinstance(cpu_weights, torch.Tensor) and not isinstance(cpu_weights, QTensor))
        self.assertTrue(isinstance(disk_weights, torch.Tensor) and not isinstance(disk_weights, QTensor))
        if self.weights == "int4":
            # Fix: the first assertion previously re-tested ``disk_weights``
            # instead of ``cpu_weights``, so the CPU offload path was never
            # verified for the int4 case.
            self.assertTrue(isinstance(cpu_weights, torch.Tensor) and not isinstance(cpu_weights, QBitsTensor))
            self.assertTrue(isinstance(disk_weights, torch.Tensor) and not isinstance(disk_weights, QBitsTensor))
|
||||
|
||||
|
||||
@unittest.skip(reason="Skipping test class because serialization is not supported yet")
class QuantoQuantizationSerializationTest(QuantoQuantizationTest):
    """
    Perform the same tests as in QuantoQuantizationTest but with a serialized model.
    """

    def setUp(self):
        """
        Setup quantized model
        """
        quantization_config = QuantoConfig(
            weights=self.weights,
            activations=self.activations,
        )
        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            device_map=self.device_map,
            quantization_config=quantization_config,
            dtype=torch.float32,
        )
        # Round-trip through disk: the *reloaded* model is what the inherited
        # tests exercise. NOTE(review): the temporary directory is removed
        # when the ``with`` block exits, so the reload must complete inside it.
        with tempfile.TemporaryDirectory() as tmpdirname:
            quantized_model.save_pretrained(tmpdirname, safe_serialization=False)
            self.quantized_model = AutoModelForCausalLM.from_pretrained(
                tmpdirname, dtype=torch.float32, device_map=self.device_map
            )

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        # True when accelerate dispatched the model over several devices; in
        # that case the model must not be moved manually before generating.
        self.have_accelerate_hooks = (
            getattr(self.quantized_model, "hf_device_map", False) and len(self.quantized_model.hf_device_map) > 1
        )
|
||||
|
||||
|
||||
@unittest.skip(reason="Skipping test class because serialization is not supported yet")
class QuantoQuantizationSerializationCudaTest(QuantoQuantizationTest):
    """Run the full ``QuantoQuantizationTest`` suite with the model placed on cuda."""

    device_map = "cuda:0"
|
||||
|
||||
|
||||
class QuantoQuantizationQBitsTensorTest(QuantoQuantizationTest):
    # int4 variant of the base suite; the lower precision slightly changes
    # the greedy generation, hence the different expected string.
    EXPECTED_OUTPUTS = "Hello my name is John, I am a professional photographer, I"
    weights = "int4"
|
||||
|
||||
|
||||
class QuantoQuantizationQBitsTensorOffloadTest(QuantoQuantizationOffloadTest):
    # Offload suite with int4 weights; greedy decoding differs per accelerator backend,
    # so either completion is accepted.
    EXPECTED_OUTPUTS = [
        "Hello my name is John, I am a professional photographer, I",  # CUDA output
        "Hello my name is Nils, I am a student of the University",  # XPU output
    ]
    weights = "int4"
|
||||
|
||||
|
||||
@unittest.skip(reason="Skipping test class because serialization is not supported yet")
class QuantoQuantizationQBitsTensorSerializationTest(QuantoQuantizationSerializationTest):
    # Serialization suite re-run with int4 weights.
    EXPECTED_OUTPUTS = "Hello my name is John, I am a professional photographer, I"
    weights = "int4"
|
||||
|
||||
|
||||
@require_torch_accelerator
class QuantoQuantizationActivationTest(unittest.TestCase):
    def test_quantize_activation(self):
        """Requesting activation quantization through transformers must raise a ValueError."""
        config_with_activations = QuantoConfig(weights="int8", activations="int8")

        with self.assertRaises(ValueError) as raised:
            AutoModelForCausalLM.from_pretrained(
                "bigscience/bloom-560m", quantization_config=config_with_activations
            )

        # The error must point the user at the unsupported-activations limitation.
        self.assertIn("We don't support quantizing the activations with transformers library", str(raised.exception))
|
||||
|
||||
|
||||
@require_optimum_quanto
@require_torch_accelerator
class QuantoKVCacheQuantizationTest(unittest.TestCase):
    """End-to-end check that generation works with the quanto-backed quantized KV cache."""

    @slow
    @require_read_token
    def test_quantized_cache(self):
        # Greedy decoding with `cache_implementation="quantized"` must still reproduce the
        # reference completions exactly.
        EXPECTED_TEXT_COMPLETION = [
            "Simply put, the theory of relativity states that 1) time and space are not absolute, but are relative to the observer, and 2) the laws of physics are the same everywhere in the universe. This means that the speed of light is",
            "My favorite all time favorite condiment is ketchup. I love how it adds a sweet and tangy flavor to my food. I also enjoy using it as a dip for fries, burgers, and grilled meats. It's a classic condiment that never",
        ]

        prompts = [
            "Simply put, the theory of relativity states that ",
            "My favorite all time favorite condiment is ketchup.",
        ]
        # Left padding so the batched prompts align on the right for generation.
        tokenizer = AutoTokenizer.from_pretrained(
            "unsloth/Llama-3.2-1B-Instruct", pad_token="</s>", padding_side="left"
        )
        model = LlamaForCausalLM.from_pretrained(
            "unsloth/Llama-3.2-1B-Instruct", device_map="sequential", dtype=torch.float16
        )
        inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(torch_device)

        generated_ids = model.generate(**inputs, max_new_tokens=40, do_sample=False, cache_implementation="quantized")
        text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        self.assertEqual(EXPECTED_TEXT_COMPLETION, text)
|
||||
153
transformers/tests/quantization/quark_integration/test_quark.py
Normal file
153
transformers/tests/quantization/quark_integration/test_quark.py
Normal file
@@ -0,0 +1,153 @@
|
||||
# Copyright 2025 Advanced Micro Devices, Inc. and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import unittest
|
||||
|
||||
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, GenerationConfig, QuarkConfig
|
||||
from transformers.testing_utils import (
|
||||
cleanup,
|
||||
is_torch_available,
|
||||
require_accelerate,
|
||||
require_quark,
|
||||
require_torch_gpu,
|
||||
require_torch_multi_gpu,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
from transformers.utils.import_utils import is_quark_available
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_quark_available():
|
||||
from quark.torch.export.nn.modules.qparamslinear import QParamsLinear
|
||||
|
||||
|
||||
@require_quark
class QuarkConfigTest(unittest.TestCase):
    def test_common_args(self):
        """The quantization_config stored on a Quark hub checkpoint round-trips into QuarkConfig."""
        hub_config = AutoConfig.from_pretrained("amd/Llama-3.1-8B-Instruct-w-int8-a-int8-sym-test")
        QuarkConfig(**hub_config.quantization_config)
|
||||
|
||||
|
||||
@slow
@require_quark
@require_torch_gpu
class QuarkTest(unittest.TestCase):
    """Integration tests for AMD Quark int8 weight/activation quantized checkpoints."""

    reference_model_name = "unsloth/Meta-Llama-3.1-8B-Instruct"
    quantized_model_name = "amd/Llama-3.1-8B-Instruct-w-int8-a-int8-sym-test"

    input_text = "Today I am in Paris and"

    # Any of these greedy completions is accepted -- outputs can vary across GPUs.
    EXPECTED_OUTPUTS = set()
    EXPECTED_OUTPUTS.add("Today I am in Paris and I am not in Paris, France\nToday I am in Paris, Illinois")
    EXPECTED_OUTPUTS.add("Today I am in Paris and I am enjoying the city of light. I am not just any ordinary Paris")
    EXPECTED_OUTPUTS.add("Today I am in Paris and I am enjoying my day off! The sun is shining, the birds are")
    EXPECTED_OUTPUTS.add("Today I am in Paris and I'm here to tell you about it. It's a beautiful day,")
    EXPECTED_OUTPUTS.add("Today I am in Paris and I am not in Paris at all! I am not in Paris, but")

    # Minimum fp16/quantized memory-footprint ratio expected from the quantization.
    EXPECTED_RELATIVE_DIFFERENCE = 1.66
    device_map = None

    @classmethod
    def setUpClass(cls):
        """
        Setup reference & quantized model
        """
        cls.model_fp16 = AutoModelForCausalLM.from_pretrained(
            cls.reference_model_name, dtype=torch.float16, device_map=cls.device_map
        )
        cls.mem_fp16 = cls.model_fp16.get_memory_footprint()

        cls.tokenizer = AutoTokenizer.from_pretrained(cls.reference_model_name, use_fast=True)

        cls.quantized_model = AutoModelForCausalLM.from_pretrained(
            cls.quantized_model_name,
            dtype=torch.float16,
            device_map=cls.device_map,
        )

    def tearDown(self):
        r"""
        TearDown function needs to be called at the end of each test to free the accelerator memory and cache, also to
        avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
        """
        cleanup(torch_device, gc_collect=True)

    def test_memory_footprint(self):
        """The quantized checkpoint must be meaningfully smaller than the fp16 reference."""
        mem_quantized = self.quantized_model.get_memory_footprint()

        self.assertTrue(self.mem_fp16 / mem_quantized > self.EXPECTED_RELATIVE_DIFFERENCE)

    def test_device_and_dtype_assignment(self):
        r"""
        Test whether trying to cast (or assigning a device to) a model after quantization will throw an error.
        Checks also if other models are cast correctly.
        """
        # This should work
        if self.device_map is None:
            _ = self.quantized_model.to(0)

        with self.assertRaises(ValueError):
            # Tries with a `dtype`
            self.quantized_model.to(torch.float16)

    def test_original_dtype(self):
        r"""
        A simple test to check if the model successfully stores the original dtype
        """
        self.assertTrue(hasattr(self.quantized_model.config, "_pre_quantization_dtype"))
        self.assertFalse(hasattr(self.model_fp16.config, "_pre_quantization_dtype"))
        self.assertTrue(self.quantized_model.config._pre_quantization_dtype == torch.float16)

        # Linear layers must have been swapped for Quark's QParamsLinear.
        self.assertTrue(isinstance(self.quantized_model.model.layers[0].mlp.gate_proj, QParamsLinear))

    def check_inference_correctness(self, model):
        r"""
        Test the generation quality of the quantized model and see that we are matching the expected output.
        Given that we are operating on small numbers + the testing model is relatively small, we might not get
        the same output across GPUs. So we'll generate few tokens (5-10) and check their output.
        """
        # Check that inference pass works on the model
        encoded_input = self.tokenizer(self.input_text, return_tensors="pt")

        # Deterministic greedy decoding with a fixed number of new tokens.
        gen_config = GenerationConfig(
            max_new_tokens=15,
            min_new_tokens=15,
            use_cache=True,
            num_beams=1,
            do_sample=False,
        )

        # Check the exactness of the results
        output_sequences = model.generate(input_ids=encoded_input["input_ids"].to(0), generation_config=gen_config)

        # Get the generation
        self.assertIn(self.tokenizer.decode(output_sequences[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS)

    def test_generate_quality(self):
        """
        Simple test to check the quality of the model by comparing the generated tokens with the expected tokens
        """
        if self.device_map is None:
            self.check_inference_correctness(self.quantized_model.to(0))
        else:
            self.check_inference_correctness(self.quantized_model)
|
||||
|
||||
|
||||
@require_accelerate
@require_torch_multi_gpu
@require_quark
class QuarkTestDeviceMap(QuarkTest):
    # Re-run the full QuarkTest suite with accelerate sharding the model across GPUs.
    device_map = "auto"
|
||||
246
transformers/tests/quantization/spqr_integration/test_spqr.py
Normal file
246
transformers/tests/quantization/spqr_integration/test_spqr.py
Normal file
@@ -0,0 +1,246 @@
|
||||
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import gc
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
import pytest
|
||||
|
||||
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, SpQRConfig, StaticCache
|
||||
from transformers.testing_utils import (
|
||||
backend_empty_cache,
|
||||
require_accelerate,
|
||||
require_spqr,
|
||||
require_torch_gpu,
|
||||
require_torch_multi_gpu,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
from transformers.utils import is_accelerate_available, is_torch_available
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_accelerate_available():
|
||||
from accelerate import init_empty_weights
|
||||
|
||||
|
||||
@require_torch_gpu
class SpQRConfigTest(unittest.TestCase):
    """Unit tests for the SpQRConfig dict round-trip."""

    def test_to_dict(self):
        """
        Simple test that checks if one uses a config and converts it to a dict, the dict is the same as the config object
        """
        quantization_config = SpQRConfig()
        config_to_dict = quantization_config.to_dict()

        for key in config_to_dict:
            self.assertEqual(getattr(quantization_config, key), config_to_dict[key])

    def test_from_dict(self):
        """
        Simple test that checks if one uses a dict and converts it to a config object, the config object is the same as the dict
        """
        # Fix: the local was previously named `dict`, shadowing the builtin.
        config_dict = {
            "beta1": 16,
            "beta2": 16,
            "bits": 3,
            "modules_to_not_convert": ["lm_head.weight"],
            "shapes": {"model.layers.0.self_attn.q_proj.dense_weights.shape": 16},
        }
        quantization_config = SpQRConfig.from_dict(config_dict)

        self.assertEqual(config_dict["beta1"], quantization_config.beta1)
        self.assertEqual(config_dict["beta2"], quantization_config.beta2)
        self.assertEqual(config_dict["bits"], quantization_config.bits)
        self.assertEqual(config_dict["modules_to_not_convert"], quantization_config.modules_to_not_convert)
        self.assertEqual(config_dict["shapes"], quantization_config.shapes)
|
||||
|
||||
|
||||
@slow
@require_torch_gpu
@require_spqr
@require_accelerate
class SpQRTest(unittest.TestCase):
    """Integration tests for SpQR 3-bit quantized checkpoints."""

    model_name = "elvircrn/Llama-2-7b-SPQR-3Bit-16x16-red_pajama-hf"

    input_text = "Hello my name is"
    max_new_tokens = 32

    EXPECTED_OUTPUT = (
        "Hello my name is Jesse. (I'm also known as Jesse) I'm a 25 year old male from United States. I'm looking for"
    )
    EXPECTED_OUTPUT_COMPILE = "Hello my name is Jake and I am a 20 year old student at the University of North Texas. (Go Mean Green!) I am a huge fan of the Dallas"

    # called only once for all test in this class
    @classmethod
    def setUpClass(cls):
        """
        Setup quantized model
        """
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.quantized_model = AutoModelForCausalLM.from_pretrained(
            cls.model_name,
            device_map=torch_device,
        )

    def tearDown(self):
        # Free accelerator memory between tests to avoid cross-test interference.
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_quantized_model_conversion(self):
        """
        Simple test that checks if the quantized model has been converted properly
        """
        from spqr_quant import QuantizedLinear

        from transformers.integrations import replace_with_spqr_linear

        model_id = "meta-llama/Llama-2-7b-hf"
        config = AutoConfig.from_pretrained(model_id)
        quantization_config = AutoConfig.from_pretrained(self.model_name, return_dict=False).quantization_config
        quantization_config = SpQRConfig.from_dict(quantization_config)

        # Build the skeleton on the meta device -- no weights are materialized.
        with init_empty_weights():
            model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id, config=config)

        nb_linears = 0
        for module in model.modules():
            if isinstance(module, torch.nn.Linear):
                nb_linears += 1

        model, _ = replace_with_spqr_linear(
            model,
            quantization_config=quantization_config,
            modules_to_not_convert=quantization_config.modules_to_not_convert,
        )

        nb_spqr_linear = 0
        for module in model.modules():
            if isinstance(module, QuantizedLinear):
                nb_spqr_linear += 1

        # All linears except the one excluded module (lm_head) should be converted.
        self.assertEqual(nb_linears - 1, nb_spqr_linear)

    def test_quantized_model(self):
        """
        Simple test that checks if the quantized model is working properly
        """
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

        output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    def test_raise_if_non_quantized(self):
        # SpQR cannot quantize on the fly; loading a plain checkpoint with an SpQRConfig must fail.
        model_id = "meta-llama/Llama-2-7b-hf"
        quantization_config = SpQRConfig()

        with self.assertRaises(ValueError):
            _ = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)

    @unittest.skip
    def test_save_pretrained(self):
        """
        Simple test that checks if the quantized model is working properly after being saved and loaded
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)
            model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=torch_device)

            input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

            output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
            self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    @require_torch_multi_gpu
    def test_quantized_model_multi_gpu(self):
        """
        Simple test that checks if the quantized model is working properly with multiple GPUs
        """
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

        quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map="auto")

        # The model must actually be sharded across both GPUs.
        self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})

        output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)

        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    @pytest.mark.torch_compile_test
    def test_quantized_model_compile(self):
        """
        Simple test that checks if the quantized model is working properly
        """

        # Sample tokens greedily
        def decode_one_tokens(model, cur_token, input_pos, cache_position, past_key_values):
            logits = model(
                cur_token,
                position_ids=input_pos,
                cache_position=cache_position,
                past_key_values=past_key_values,
                return_dict=False,
                use_cache=True,
            )[0]
            new_token = torch.argmax(logits[:, [-1]], dim=-1).to(torch.int)

            return new_token

        # Tokenize the test input
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)["input_ids"]
        seq_length = input_ids.shape[1]

        # Setup static KV cache for generation
        past_key_values = StaticCache(
            config=self.quantized_model.config, max_cache_len=seq_length + self.max_new_tokens + 1
        )

        # Allocate token ids to be generated and copy prefix ids
        cache_position = torch.arange(seq_length, device=torch_device)
        generated_ids = torch.zeros(1, seq_length + self.max_new_tokens, dtype=torch.int, device=torch_device)
        generated_ids[:, cache_position] = input_ids.to(torch_device).to(torch.int)

        # Do a forward pass to fill the prefix cache and compile the kernels if necessary
        logits = self.quantized_model(
            input_ids,
            cache_position=cache_position,
            past_key_values=past_key_values,
            return_dict=False,
            use_cache=True,
        )[0]
        next_token = torch.argmax(logits[:, [-1]], dim=-1).to(torch.int)
        generated_ids[:, [seq_length]] = next_token

        with torch.no_grad():
            # Compile the CUDA graph
            decode_one_tokens = torch.compile(decode_one_tokens, mode="default", backend="inductor", fullgraph=True)

            # Generate tokens one by one
            cache_position = torch.tensor([seq_length + 1], device=torch_device)
            for _ in range(1, self.max_new_tokens):
                # Force the math SDP backend so compilation stays deterministic.
                with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_mem_efficient=False, enable_math=True):
                    next_token = decode_one_tokens(
                        self.quantized_model, next_token.clone(), None, cache_position, past_key_values
                    )
                    generated_ids.index_copy_(1, cache_position, next_token)
                cache_position += 1

        # Check generated text
        self.assertEqual(
            self.tokenizer.decode(generated_ids[0], skip_special_tokens=True), self.EXPECTED_OUTPUT_COMPILE
        )
|
||||
@@ -0,0 +1,636 @@
|
||||
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import gc
|
||||
import importlib.metadata
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from packaging import version
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
|
||||
from transformers.testing_utils import (
|
||||
Expectations,
|
||||
backend_empty_cache,
|
||||
get_device_properties,
|
||||
require_torch_accelerator,
|
||||
require_torch_multi_accelerator,
|
||||
require_torchao,
|
||||
require_torchao_version_greater_or_equal,
|
||||
torch_device,
|
||||
)
|
||||
from transformers.utils import is_torch_available, is_torchao_available
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_torchao_available():
|
||||
import torchao
|
||||
|
||||
# renamed in torchao 0.7.0, please install the latest torchao
|
||||
from torchao.dtypes import (
|
||||
AffineQuantizedTensor,
|
||||
TensorCoreTiledLayout,
|
||||
)
|
||||
from torchao.quantization import (
|
||||
Int8WeightOnlyConfig,
|
||||
IntxWeightOnlyConfig,
|
||||
MappingType,
|
||||
ModuleFqnToConfig,
|
||||
PerAxis,
|
||||
)
|
||||
from torchao.quantization.autoquant import AQMixin
|
||||
|
||||
if version.parse(importlib.metadata.version("torchao")) >= version.parse("0.8.0"):
|
||||
from torchao.dtypes import Int4CPULayout
|
||||
if version.parse(importlib.metadata.version("torchao")) >= version.parse("0.11.0"):
|
||||
from torchao.dtypes import Int4XPULayout
|
||||
|
||||
|
||||
def check_torchao_int4_wo_quantized(test_module, qlayer):
    """Assert that *qlayer* holds an int4 weight-only quantized weight.

    Checks the quantization range (unsigned 4-bit: 0..15), the tensor subclass, and that
    the packing layout matches the device the weight lives on.

    Args:
        test_module: the ``unittest.TestCase`` instance providing assertion helpers.
        qlayer: a quantized linear layer whose ``.weight`` is inspected.
    """
    weight = qlayer.weight
    # Unsigned 4-bit value range.
    test_module.assertEqual(weight.quant_min, 0)
    test_module.assertEqual(weight.quant_max, 15)
    test_module.assertTrue(isinstance(weight, AffineQuantizedTensor))
    # Each backend packs int4 weights with its own layout class. The names are resolved
    # lazily per-branch because Int4CPULayout/Int4XPULayout are only imported on
    # sufficiently recent torchao versions.
    if weight.device.type == "cpu":
        layout = Int4CPULayout
    elif weight.device.type == "xpu":
        layout = Int4XPULayout
    elif weight.device.type == "cuda":
        layout = TensorCoreTiledLayout
    else:
        # Fix: previously `layout` stayed None here and `isinstance(x, None)` raised a
        # confusing TypeError; fail with an explicit message instead.
        test_module.fail(f"Unexpected device type for int4 weight: {weight.device.type!r}")
    test_module.assertTrue(isinstance(weight.tensor_impl._layout, layout))
|
||||
|
||||
|
||||
def check_autoquantized(test_module, qlayer):
    """Assert that the layer's weight was produced by torchao autoquant (an AQMixin instance)."""
    test_module.assertTrue(isinstance(qlayer.weight, AQMixin))
|
||||
|
||||
|
||||
def check_forward(test_module, model, batch_size=1, context_size=1024):
    """Run a dummy forward pass and verify the logits keep the input's batch/sequence dims."""
    dummy_tokens = torch.zeros([batch_size, context_size], device=model.device, dtype=torch.int32)
    with torch.no_grad():
        logits = model(dummy_tokens).logits
    test_module.assertEqual(logits.shape[0], batch_size)
    test_module.assertEqual(logits.shape[1], context_size)
|
||||
|
||||
|
||||
@require_torchao
@require_torchao_version_greater_or_equal("0.8.0")
class TorchAoConfigTest(unittest.TestCase):
    """Unit tests for TorchAoConfig construction, validation, and serialization."""

    def test_to_dict(self):
        """
        Makes sure the config format is properly set
        """
        quantization_config = TorchAoConfig("int4_weight_only")
        torchao_orig_config = quantization_config.to_dict()

        for key in torchao_orig_config:
            self.assertEqual(getattr(quantization_config, key), torchao_orig_config[key])

    def test_post_init_check(self):
        """
        Test kwargs validations in TorchAoConfig
        """
        # A supported quant type string must construct cleanly.
        _ = TorchAoConfig("int4_weight_only")
        with self.assertRaisesRegex(ValueError, "Unsupported string quantization type"):
            _ = TorchAoConfig("fp6")

        with self.assertRaisesRegex(ValueError, "Unexpected keyword arg"):
            _ = TorchAoConfig("int4_weight_only", group_size1=32)

    def test_repr(self):
        """
        Check that there is no error in the repr
        """
        quantization_config = TorchAoConfig("int4_weight_only", modules_to_not_convert=["conv"], group_size=8)
        repr(quantization_config)

    def test_json_serializable(self):
        """
        Check that the config dict can be JSON serialized.
        """
        quantization_config = TorchAoConfig("int4_weight_only", group_size=32, layout=TensorCoreTiledLayout())
        d = quantization_config.to_dict()
        # The layout object is expected to serialize as a list whose second element holds its kwargs.
        self.assertIsInstance(d["quant_type_kwargs"]["layout"], list)
        self.assertTrue("inner_k_tiles" in d["quant_type_kwargs"]["layout"][1])
        quantization_config.to_json_string(use_diff=False)
|
||||
|
||||
|
||||
@require_torchao
@require_torchao_version_greater_or_equal("0.8.0")
class TorchAoTest(unittest.TestCase):
    """CPU integration tests for torchao-quantized models (accelerator subclass overrides `device`)."""

    input_text = "What are we having for dinner?"
    max_new_tokens = 10
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    device = "cpu"
    # Int4CPULayout only exists on torchao >= 0.8.0, hence the guarded default kwargs.
    quant_scheme_kwargs = (
        {"group_size": 32, "layout": Int4CPULayout(), "version": 1}
        if is_torchao_available() and version.parse(importlib.metadata.version("torchao")) >= version.parse("0.8.0")
        else {"group_size": 32}
    )

    # called only once for all test in this class
    @classmethod
    def setUpClass(cls):
        cls.EXPECTED_OUTPUT = "What are we having for dinner?\n- 1. What is the temperature outside"

    def tearDown(self):
        # Free accelerator memory between tests to avoid cross-test interference.
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_int4wo_quant(self):
        """
        Simple LLM model testing int4 weight only quantization
        """
        quant_config = TorchAoConfig("int4_weight_only", **self.quant_scheme_kwargs)

        # Note: we quantize the bfloat16 model on the fly to int4
        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            dtype=torch.bfloat16,
            device_map=self.device,
            quantization_config=quant_config,
        )
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        check_torchao_int4_wo_quantized(self, quantized_model.model.layers[0].self_attn.v_proj)

        input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)

        output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
        self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    def test_int4wo_quant_bfloat16_conversion(self):
        """
        Testing the dtype of model will be modified to be bfloat16 for int4 weight only quantization
        """
        quant_config = TorchAoConfig("int4_weight_only", **self.quant_scheme_kwargs)

        # Note: we quantize the bfloat16 model on the fly to int4
        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            dtype=torch.bfloat16,
            device_map=self.device,
            quantization_config=quant_config,
        )
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        check_torchao_int4_wo_quantized(self, quantized_model.model.layers[0].self_attn.v_proj)

        input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)

        output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
        self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    def test_int8_dynamic_activation_int8_weight_quant(self):
        """
        Simple LLM model testing int8_dynamic_activation_int8_weight
        """
        quant_config = TorchAoConfig("int8_dynamic_activation_int8_weight")

        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            device_map=self.device,
            quantization_config=quant_config,
        )
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)

        output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
        # Greedy completion varies slightly across environments; accept either.
        EXPECTED_OUTPUT = [
            "What are we having for dinner?\n\nJessica: (smiling)",
            "What are we having for dinner?\n\nJess: (smiling) I",
        ]
        self.assertTrue(tokenizer.decode(output[0], skip_special_tokens=True) in EXPECTED_OUTPUT)

    @require_torchao_version_greater_or_equal("0.11.0")
    def test_include_input_output_embeddings(self):
        """Embedding and lm_head can be quantized via ModuleFqnToConfig when the flag is set."""
        weight_dtype = torch.int8
        granularity = PerAxis(0)
        mapping_type = MappingType.ASYMMETRIC
        embedding_config = IntxWeightOnlyConfig(
            weight_dtype=weight_dtype,
            granularity=granularity,
            mapping_type=mapping_type,
            version=1,
        )
        # Only the embedding and lm_head get a config; everything else stays unquantized.
        config = ModuleFqnToConfig(
            {"_default": None, "model.embed_tokens": embedding_config, "lm_head": embedding_config}
        )
        # need set `include_input_output_embeddings` to True
        quant_config = TorchAoConfig(quant_type=config, include_input_output_embeddings=True)
        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            device_map=self.device,
            quantization_config=quant_config,
        )
        # making sure embedding is quantized
        self.assertTrue(isinstance(quantized_model.model.embed_tokens.weight, AffineQuantizedTensor))
        self.assertTrue(isinstance(quantized_model.lm_head.weight, AffineQuantizedTensor))
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)

        output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
        EXPECTED_OUTPUT = [
            "What are we having for dinner?\n\nJessica: (smiling)",
            "What are we having for dinner?\n\nJess: (smiling) I",
        ]
        self.assertTrue(tokenizer.decode(output[0], skip_special_tokens=True) in EXPECTED_OUTPUT)

    @require_torchao_version_greater_or_equal("0.11.0")
    def test_per_module_config_skip(self):
        """A per-module None entry in ModuleFqnToConfig must leave that module unquantized."""
        linear_config = Int8WeightOnlyConfig()
        config = ModuleFqnToConfig({"_default": linear_config, "model.layers.0.self_attn.q_proj": None})
        quant_config = TorchAoConfig(quant_type=config)
        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            device_map=self.device,
            quantization_config=quant_config,
        )
        # making sure `model.layers.0.self_attn.q_proj` is skipped
        self.assertTrue(not isinstance(quantized_model.model.layers[0].self_attn.q_proj.weight, AffineQuantizedTensor))
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)

        output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
        EXPECTED_OUTPUT = [
            "What are we having for dinner?\n\nJessica: (smiling)",
            "What are we having for dinner?\n\nJess: (smiling) I",
        ]
        self.assertTrue(tokenizer.decode(output[0], skip_special_tokens=True) in EXPECTED_OUTPUT)
|
||||
|
||||
|
||||
@require_torch_accelerator
class TorchAoAcceleratorTest(TorchAoTest):
    """Runs the TorchAo quantization tests on the available accelerator (CUDA or XPU)."""

    device = torch_device
    quant_scheme_kwargs = {"group_size": 32, "version": 1}

    # called only once for all test in this class
    @classmethod
    def setUpClass(cls):
        """Resolve the backend-specific expected generation once for the whole class."""
        super().setUpClass()
        # fmt: off
        EXPECTED_OUTPUTS = Expectations(
            {
                ("xpu", 3): "What are we having for dinner?\n\nJessica: (smiling)",
                ("cuda", 7): "What are we having for dinner?\n- 1. What is the temperature outside",
            }
        )
        # fmt: on
        cls.EXPECTED_OUTPUT = EXPECTED_OUTPUTS.get_expectation()

    def test_int4wo_offload(self):
        """
        Simple test that checks if the quantized model int4 weight only is working properly with cpu/disk offload
        """
        # Layers 0-18 stay on the accelerator, 19-20 are offloaded to CPU and 21 to disk.
        device_map_offload = {
            "model.embed_tokens": 0,
            "model.layers.0": 0,
            "model.layers.1": 0,
            "model.layers.2": 0,
            "model.layers.3": 0,
            "model.layers.4": 0,
            "model.layers.5": 0,
            "model.layers.6": 0,
            "model.layers.7": 0,
            "model.layers.8": 0,
            "model.layers.9": 0,
            "model.layers.10": 0,
            "model.layers.11": 0,
            "model.layers.12": 0,
            "model.layers.13": 0,
            "model.layers.14": 0,
            "model.layers.15": 0,
            "model.layers.16": 0,
            "model.layers.17": 0,
            "model.layers.18": 0,
            "model.layers.19": "cpu",
            "model.layers.20": "cpu",
            "model.layers.21": "disk",
            "model.norm": 0,
            "model.rotary_emb": 0,
            "lm_head": 0,
        }

        quant_config = TorchAoConfig("int4_weight_only", **self.quant_scheme_kwargs)
        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            dtype=torch.bfloat16,
            device_map=device_map_offload,
            quantization_config=quant_config,
        )
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)

        # fmt: off
        EXPECTED_OUTPUTS = Expectations(
            {
                ("xpu", 3): "What are we having for dinner?\n\nJessica: (smiling)",
                ("cuda", 7): "What are we having for dinner?\n- 2. What is the temperature outside",
            }
        )
        # fmt: on
        EXPECTED_OUTPUT = EXPECTED_OUTPUTS.get_expectation()

        output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
        self.assertEqual(generated_text, EXPECTED_OUTPUT)

    @require_torch_multi_accelerator
    def test_int4wo_quant_multi_accelerator(self):
        """
        Simple test that checks if the quantized model int4 weight only is working properly with multiple accelerators
        set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 CUDA GPUs
        set ZE_AFFINITY_MASK=0,1 if you have more than 2 Intel XPUs
        """
        quant_config = TorchAoConfig("int4_weight_only", **self.quant_scheme_kwargs)
        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            dtype=torch.bfloat16,
            device_map="auto",
            quantization_config=quant_config,
        )
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        # assertEqual reports the actual device map on failure, unlike assertTrue(x == y).
        self.assertEqual(set(quantized_model.hf_device_map.values()), {0, 1})

        input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
        output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
        self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    def test_autoquant(self):
        """
        Simple LLM model testing autoquant
        """
        quant_config = TorchAoConfig("autoquant")
        quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            dtype="auto",
            device_map=self.device,
            quantization_config=quant_config,
        )
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
        # The first generation pass drives autoquant's calibration.
        output = quantized_model.generate(
            **input_ids, max_new_tokens=self.max_new_tokens, cache_implementation="static"
        )
        quantized_model.finalize_autoquant()

        check_autoquantized(self, quantized_model.model.layers[0].self_attn.v_proj)

        EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
        output = quantized_model.generate(
            **input_ids, max_new_tokens=self.max_new_tokens, cache_implementation="static"
        )
        self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), EXPECTED_OUTPUT)
|
||||
|
||||
|
||||
@require_torchao
@require_torchao_version_greater_or_equal("0.8.0")
class TorchAoSerializationTest(unittest.TestCase):
    """Base class for torchao serialization round-trip tests (CPU, int4 weight-only by default)."""

    input_text = "What are we having for dinner?"
    max_new_tokens = 10
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    quant_scheme = "int4_weight_only"
    # int4 on CPU needs the Int4CPULayout starting with torchao 0.8.0.
    quant_scheme_kwargs = (
        {"group_size": 32, "layout": Int4CPULayout(), "version": 1}
        if is_torchao_available() and version.parse(importlib.metadata.version("torchao")) >= version.parse("0.8.0")
        else {"group_size": 32}
    )
    device = "cpu"

    # called only once for all test in this class
    @classmethod
    def setUpClass(cls):
        """Load the tokenizer and the reference generation once for every test."""
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.EXPECTED_OUTPUT = "What are we having for dinner?\n- 1. What is the temperature outside"

    def setUp(self):
        """Quantize a fresh model before each test."""
        self.quant_config = TorchAoConfig(self.quant_scheme, **self.quant_scheme_kwargs)
        # int4 weight-only requires bf16 weights; other schemes pick their own dtype.
        dtype = torch.bfloat16 if self.quant_scheme == "int4_weight_only" else "auto"
        self.quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            dtype=dtype,
            device_map=self.device,
            quantization_config=self.quant_config,
        )

    def tearDown(self):
        """Free model memory between tests."""
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_original_model_expected_output(self):
        """The freshly quantized (not yet serialized) model produces the expected text."""
        inputs = self.tokenizer(self.input_text, return_tensors="pt").to(self.device)
        generated = self.quantized_model.generate(**inputs, max_new_tokens=self.max_new_tokens)
        self.assertEqual(self.tokenizer.decode(generated[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    def check_serialization_expected_output(self, device, expected_output, safe_serialization=False):
        """
        Test if we can serialize and load/infer the model again on the same device
        """
        dtype = torch.bfloat16 if self.quant_scheme == "int4_weight_only" else "auto"
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname, safe_serialization=safe_serialization)
            reloaded = AutoModelForCausalLM.from_pretrained(tmpdirname, dtype=dtype, device_map=device)
            inputs = self.tokenizer(self.input_text, return_tensors="pt").to(device)
            generated = reloaded.generate(**inputs, max_new_tokens=self.max_new_tokens)
            self.assertEqual(self.tokenizer.decode(generated[0], skip_special_tokens=True), expected_output)

    def test_serialization_expected_output(self):
        """Round-trip on the class device with default (non-safetensors) serialization."""
        self.check_serialization_expected_output(self.device, self.EXPECTED_OUTPUT)
|
||||
|
||||
|
||||
@require_torchao
@require_torchao_version_greater_or_equal("0.14.0")
class TorchAoSafeSerializationTest(TorchAoSerializationTest):
    """Safetensors serialization round-trips for float8 torchao configs."""

    # called only once for all test in this class
    @classmethod
    def setUpClass(cls):
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.EXPECTED_OUTPUT = "What are we having for dinner?\n- 1. What is the temperature outside"

    def tearDown(self):
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()
        # The parameterized test builds its own model; drop it explicitly.
        if hasattr(self, "quantized_model"):
            del self.quantized_model
            gc.collect()

    # (torchao config, expected generation) pairs; empty when torchao is
    # missing so parameterized.expand can skip cleanly.
    test_params = (
        [
            (
                torchao.quantization.Float8DynamicActivationFloat8WeightConfig(),
                "What are we having for dinner?\n\nJess: (smiling) I",
            ),
            (torchao.quantization.Float8WeightOnlyConfig(), "What are we having for dinner?\n\nJessica: (smiling)"),
        ]
        if is_torchao_available()
        else []
    )

    @parameterized.expand(test_params, skip_on_empty=True)
    def test_serialization_expected_output(self, config, expected_output):
        """Quantize with the given float8 config and round-trip with safe_serialization=True."""
        # NOTE(review): device is hard-coded to CUDA while the rest of the file
        # uses `torch_device`; confirm whether XPU should be covered here too.
        device = "cuda"
        self.quant_config = TorchAoConfig(config)
        self.quantized_model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            dtype=torch.bfloat16,
            device_map=device,
            quantization_config=self.quant_config,
        )
        self.check_serialization_expected_output(device, expected_output, safe_serialization=True)
|
||||
|
||||
|
||||
class TorchAoSerializationW8A8CPUTest(TorchAoSerializationTest):
    """int8 dynamic-activation / int8-weight serialization round-trip on CPU."""

    quant_scheme, quant_scheme_kwargs = "int8_dynamic_activation_int8_weight", {}

    # called only once for all test in this class
    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"

    @require_torch_accelerator
    def test_serialization_expected_output_on_accelerator(self):
        """
        Test if we can serialize on device (cpu) and load/infer the model on accelerator
        """
        self.check_serialization_expected_output(torch_device, self.EXPECTED_OUTPUT)
|
||||
|
||||
|
||||
class TorchAoSerializationW8CPUTest(TorchAoSerializationTest):
    """int8 weight-only serialization round-trip on CPU."""

    quant_scheme, quant_scheme_kwargs = "int8_weight_only", {}

    # called only once for all test in this class
    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"

    @require_torch_accelerator
    def test_serialization_expected_output_on_accelerator(self):
        """
        Test if we can serialize on device (cpu) and load/infer the model on accelerator
        """
        self.check_serialization_expected_output(torch_device, self.EXPECTED_OUTPUT)
|
||||
|
||||
|
||||
@require_torch_accelerator
class TorchAoSerializationAcceleratorTest(TorchAoSerializationTest):
    """int4 weight-only serialization round-trip on the first accelerator device."""

    quant_scheme, quant_scheme_kwargs = "int4_weight_only", {"group_size": 32, "version": 1}
    device = f"{torch_device}:0"

    # called only once for all test in this class
    @classmethod
    def setUpClass(cls):
        """Pick the backend-specific expected generation once for the class."""
        super().setUpClass()
        # fmt: off
        EXPECTED_OUTPUTS = Expectations(
            {
                ("xpu", 3): "What are we having for dinner?\n\nJessica: (smiling)",
                ("cuda", 7): "What are we having for dinner?\n- 1. What is the temperature outside",
            }
        )
        # fmt: on
        cls.EXPECTED_OUTPUT = EXPECTED_OUTPUTS.get_expectation()
|
||||
|
||||
|
||||
@require_torch_accelerator
class TorchAoSerializationW8A8AcceleratorTest(TorchAoSerializationTest):
    """int8 dynamic-activation / int8-weight serialization round-trip on the accelerator."""

    quant_scheme, quant_scheme_kwargs = "int8_dynamic_activation_int8_weight", {}
    device = f"{torch_device}:0"

    # called only once for all test in this class
    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
|
||||
|
||||
|
||||
@require_torch_accelerator
class TorchAoSerializationW8AcceleratorTest(TorchAoSerializationTest):
    """int8 weight-only serialization round-trip on the accelerator."""

    quant_scheme, quant_scheme_kwargs = "int8_weight_only", {}
    device = f"{torch_device}:0"

    # called only once for all test in this class
    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
|
||||
|
||||
|
||||
@require_torch_accelerator
@require_torchao_version_greater_or_equal("0.10.0")
class TorchAoSerializationFP8AcceleratorTest(TorchAoSerializationTest):
    """float8 weight-only serialization round-trip on the accelerator."""

    device = f"{torch_device}:0"

    # called only once for all test in this class
    @classmethod
    def setUpClass(cls):
        # FP8 kernels need compute capability 9.0 (Hopper) or newer on CUDA.
        device_type, major, minor = get_device_properties()
        if device_type == "cuda" and major < 9:
            raise unittest.SkipTest("CUDA compute capability 9.0 or higher required for FP8 tests")

        from torchao.quantization import Float8WeightOnlyConfig

        cls.quant_scheme = Float8WeightOnlyConfig()
        cls.quant_scheme_kwargs = {}

        super().setUpClass()

        cls.EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
|
||||
|
||||
|
||||
@require_torch_accelerator
@require_torchao_version_greater_or_equal("0.10.0")
class TorchAoSerializationA8W4Test(TorchAoSerializationTest):
    """int8 dynamic-activation / int4-weight (A8W4) serialization round-trip on the accelerator."""

    device = f"{torch_device}:0"

    # called only once for all test in this class
    @classmethod
    def setUpClass(cls):
        # A8W4 kernels need compute capability 9.0 (Hopper) or newer on CUDA.
        device_type, major, minor = get_device_properties()
        if device_type == "cuda" and major < 9:
            # Fixed copy-pasted skip message: this class tests A8W4, not FP8.
            raise unittest.SkipTest("CUDA compute capability 9.0 or higher required for A8W4 tests")

        from torchao.quantization import Int8DynamicActivationInt4WeightConfig

        cls.quant_scheme = Int8DynamicActivationInt4WeightConfig()
        cls.quant_scheme_kwargs = {}

        super().setUpClass()

        cls.EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)"
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Allow running this test module directly with `python <file>`.
    unittest.main()
|
||||
194
transformers/tests/quantization/vptq_integration/test_vptq.py
Normal file
194
transformers/tests/quantization/vptq_integration/test_vptq.py
Normal file
@@ -0,0 +1,194 @@
|
||||
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import gc
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, VptqConfig
|
||||
from transformers.testing_utils import (
|
||||
backend_empty_cache,
|
||||
require_accelerate,
|
||||
require_torch_gpu,
|
||||
require_torch_multi_gpu,
|
||||
require_vptq,
|
||||
slow,
|
||||
torch_device,
|
||||
)
|
||||
from transformers.utils import is_accelerate_available, is_torch_available
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_accelerate_available():
|
||||
from accelerate import init_empty_weights
|
||||
|
||||
|
||||
class VptqConfigTest(unittest.TestCase):
    def test_to_dict(self):
        """`VptqConfig.to_dict` must preserve the quantization method field."""
        quantization_config = VptqConfig()
        serialized = quantization_config.to_dict()
        self.assertEqual(serialized["quant_method"], quantization_config.quant_method)
|
||||
|
||||
|
||||
@slow
@require_torch_gpu
@require_vptq
@require_accelerate
class VptqTest(unittest.TestCase):
    """End-to-end tests for VPTQ-quantized model loading, generation and serialization."""

    model_name = "VPTQ-community/Meta-Llama-3.1-8B-Instruct-v12-k65536-4096-woft"

    input_text = "Hello my name is"
    max_new_tokens = 32

    EXPECTED_OUTPUT = "Hello my name is Sarah and I am a 25 year old woman from the United States. I am a college graduate and I am currently working as a marketing specialist for a small"

    device_map = "cuda"

    # called only once for all test in this class
    @classmethod
    def setUpClass(cls):
        """
        Setup quantized model
        """
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.quantized_model = AutoModelForCausalLM.from_pretrained(
            cls.model_name,
            device_map=cls.device_map,
        )

    def tearDown(self):
        # Release GPU memory between tests.
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_quantized_model(self):
        """
        Simple test that checks if the quantized model is working properly
        """
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

        output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    def test_raise_if_non_quantized(self):
        """Passing a VptqConfig when loading a non-VPTQ checkpoint must raise ValueError."""
        model_id = "facebook/opt-125m"
        quantization_config = VptqConfig()

        with self.assertRaises(ValueError):
            _ = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)

    def test_save_pretrained(self):
        """
        Simple test that checks if the quantized model is working properly after being saved and loaded
        """
        with tempfile.TemporaryDirectory() as tmpdirname:
            self.quantized_model.save_pretrained(tmpdirname)
            model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map)

            input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

            output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
            self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    @require_torch_multi_gpu
    def test_quantized_model_multi_gpu(self):
        """
        Simple test that checks if the quantized model is working properly with multiple GPUs
        """
        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)

        quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map="auto")

        # assertEqual reports the actual device map on failure, unlike assertTrue(x == y).
        self.assertEqual(set(quantized_model.hf_device_map.values()), {0, 1})

        output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)

        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)

    def test_quantized_model_conversion(self):
        """
        Simple test that checks if the quantized model has been converted properly
        """
        from vptq import VQuantLinear

        from transformers.integrations import replace_with_vptq_linear

        model_id = "facebook/opt-350m"
        config = AutoConfig.from_pretrained(model_id, revision="cb32f77e905cccbca1d970436fb0f5e6b58ee3c5")
        modules_to_not_convert = ["lm_head"]
        names = [
            "q_proj",
            "k_proj",
            "v_proj",
            "out_proj",
            "fc1",
            "fc2",
        ]
        # Shared VPTQ layer parameters applied to every projection above.
        value = {
            "enable_norm": True,
            "enable_perm": True,
            "group_num": 1,
            "group_size": 128,
            "indices_as_float": False,
            "num_centroids": [-1, 128],
            "num_res_centroids": [-1, 128],
            "outlier_size": 0,
            "vector_lens": [-1, 12],
        }
        shared_layer_config = {name: value for name in names}
        for i in range(24):
            modules_to_not_convert.append(f"model.decoder.layers.{i}.fc1")
        layer_configs = {
            "model.decoder.project_out": value,
            "model.decoder.project_in": value,
        }
        quantization_config = VptqConfig(config_for_layers=layer_configs, shared_layer_config=shared_layer_config)

        with init_empty_weights():
            model = AutoModelForCausalLM.from_config(config)

        nb_linears = sum(1 for module in model.modules() if isinstance(module, torch.nn.Linear))

        model, _ = replace_with_vptq_linear(model, quantization_config=quantization_config)
        nb_vptq_linear = sum(1 for module in model.modules() if isinstance(module, VQuantLinear))
        # Every Linear except lm_head is converted.
        self.assertEqual(nb_linears - 1, nb_vptq_linear)

        # Try with `linear_weights_not_to_quantize`
        with init_empty_weights():
            model = AutoModelForCausalLM.from_config(config)
        quantization_config = VptqConfig(config_for_layers=layer_configs, shared_layer_config=shared_layer_config)
        model, _ = replace_with_vptq_linear(
            model, quantization_config=quantization_config, modules_to_not_convert=modules_to_not_convert
        )
        nb_vptq_linear = sum(1 for module in model.modules() if isinstance(module, VQuantLinear))
        # 25 comes from 24 decoder.layers.{layer_idx}.fc1
        # and the last lm_head
        self.assertEqual(nb_linears - 25, nb_vptq_linear)
|
||||
Reference in New Issue
Block a user