init
@@ -0,0 +1,232 @@
import gc
import unittest
import warnings

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.testing_utils import backend_empty_cache, require_compressed_tensors, require_torch, torch_device
from transformers.utils import is_torch_available
from transformers.utils.quantization_config import CompressedTensorsConfig


if is_torch_available():
    import torch
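
# Background for the tests below: loading a checkpoint with
# CompressedTensorsConfig(run_compressed=False) asks the compressed-tensors quantizer to
# decompress the stored weights at load time, so the resulting modules hold dense tensors
# that can be compared directly against an uncompressed reference checkpoint. A minimal
# load sketch mirroring the calls made in these tests (the stub is one of the checkpoints
# used below):
#
#   model = AutoModelForCausalLM.from_pretrained(
#       "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-compressed",
#       quantization_config=CompressedTensorsConfig(run_compressed=False),
#   )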


@require_compressed_tensors
@require_torch
class StackCompressedModelTest(unittest.TestCase):
    # Define stubs as class attributes
    compressed_uncompressed_model_stubs = [
        (
            "nm-testing/llama2.c-stories42M-gsm8k-quantized-only-compressed",
            "nm-testing/llama2.c-stories42M-gsm8k-quantized-only-uncompressed",
        ),
        (
            "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-compressed",
            "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-uncompressed",
        ),
        (
            "nm-testing/llama2.c-stories42M-gsm8k-stacked-compressed",
            "nm-testing/llama2.c-stories42M-gsm8k-stacked-uncompressed",
        ),
    ]
    # Flatten the list for tests that require a single list of stubs.
    model_stubs = [stub for pair in compressed_uncompressed_model_stubs for stub in pair]

    # For the outputs matching test, use the sparse-only pair.
    sparse_compressed_model = "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-compressed"
    sparse_uncompressed_model = "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-uncompressed"

    prompt = "Paris is the capital of which country?"

    def tearDown(self):
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_compressed_uncompressed_model_shapes(self):
        """
        Verify that the weights of an uncompressed model and its decompressed compressed counterpart match.
        Note: Weights for sparsely compressed models may differ due to packing.
        """

        def _get_nested_attr(obj, attr_path):
            # Walk a dotted attribute path (e.g. "model.layers.0.mlp") and return the target
            # attribute, or None if any component along the path is missing.
            attrs = attr_path.split(".")
            for attr in attrs:
                if not hasattr(obj, attr):
                    return None
                obj = getattr(obj, attr)
            return obj

        from compressed_tensors.quantization.utils import iter_named_leaf_modules

        for compressed_model, uncompressed_model in self.compressed_uncompressed_model_stubs:
            with self.subTest(compressed_model=compressed_model, uncompressed_model=uncompressed_model):
                uncompressed = AutoModelForCausalLM.from_pretrained(
                    uncompressed_model,
                    device_map="auto",
                    dtype="auto",
                    quantization_config=CompressedTensorsConfig(run_compressed=False),
                )
                compressed_decompressed = AutoModelForCausalLM.from_pretrained(
                    compressed_model,
                    device_map="auto",
                    dtype="auto",
                    quantization_config=CompressedTensorsConfig(run_compressed=False),
                )

                for name, submodule in iter_named_leaf_modules(uncompressed):
                    comp_decomp_obj = _get_nested_attr(compressed_decompressed, name)
                    if comp_decomp_obj is not None and hasattr(submodule, "weight"):
                        if "sparse-only" in uncompressed_model:
                            self.assertTrue(
                                torch.equal(submodule.weight, comp_decomp_obj.weight),
                                f"Weight mismatch for module '{name}' in sparse-only model.",
                            )
                        else:
                            self.assertTrue(
                                torch.allclose(submodule.weight, comp_decomp_obj.weight, atol=0.2),
                                f"Weight mismatch for module '{name}' in quantized-only or stacked model.",
                            )

    def test_outputs_match(self):
        """
        Ensure that the generated outputs match between the uncompressed model
        and its decompressed compressed counterpart.
        """
        tokenizer = AutoTokenizer.from_pretrained(self.sparse_uncompressed_model)
        input_ids = tokenizer(self.prompt, return_tensors="pt").input_ids

        uncompressed = AutoModelForCausalLM.from_pretrained(
            self.sparse_uncompressed_model,
            device_map="auto",
            dtype="auto",
            quantization_config=CompressedTensorsConfig(run_compressed=False),
        )

        output_uncompressed = uncompressed.generate(input_ids.to(uncompressed.device), max_new_tokens=100)

        decompressed = AutoModelForCausalLM.from_pretrained(
            self.sparse_compressed_model,
            device_map="auto",
            dtype="auto",
            quantization_config=CompressedTensorsConfig(run_compressed=False),
        )
        output_decompressed = decompressed.generate(input_ids.to(decompressed.device), max_new_tokens=100)

        self.assertEqual(
            tokenizer.decode(output_uncompressed[0]),
            tokenizer.decode(output_decompressed[0]),
            "Generated outputs do not match between compressed and uncompressed models.",
        )

    def test_no_warnings_for_all_models(self):
        """
        Confirm that loading any model using compressed tensors does not trigger
        warnings about missing or unexpected keys.
        """
        for model_stub in self.model_stubs:
            with self.subTest(model_stub=model_stub):
                with warnings.catch_warnings(record=True) as caught_warnings:
                    warnings.simplefilter("always")
                    AutoModelForCausalLM.from_pretrained(
                        model_stub,
                        device_map="auto",
                        dtype="auto",
                        quantization_config=CompressedTensorsConfig(run_compressed=False),
                    )
                    for warning in caught_warnings:
                        self.assertNotIn(
                            "missing keys",
                            str(warning.message).lower(),
                            f"'missing keys' found in warnings for model {model_stub}",
                        )
                        self.assertNotIn(
                            "unexpected keys",
                            str(warning.message).lower(),
                            f"'unexpected keys' found in warnings for model {model_stub}",
                        )


@require_compressed_tensors
@require_torch
class RunCompressedTest(unittest.TestCase):
    tinyllama_w4a16 = "nm-testing/tinyllama-w4a16-compressed-hf-quantizer"
    tinyllama_w8a8 = "nm-testing/tinyllama-w8a8-compressed-hf-quantizer"

    prompt = "Paris is the capital of which country?"

    stubs = [tinyllama_w4a16, tinyllama_w8a8]

    def tearDown(self):
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()
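
    # The two tests below contrast the default load path, which is expected to keep
    # quantized layers as CompressedLinear modules (run_compressed=True by default), with
    # run_compressed=False, which should decompress them into regular Linear modules.
    # A condensed sketch of the two load calls being compared:
    #
    #   model_compressed = AutoModelForCausalLM.from_pretrained(stub)
    #   model_decompressed = AutoModelForCausalLM.from_pretrained(
    #       stub, quantization_config=CompressedTensorsConfig(run_compressed=False)
    #   )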

    def test_default_run_compressed__True(self):
        from compressed_tensors.linear.compressed_linear import CompressedLinear
        from compressed_tensors.quantization.utils import iter_named_leaf_modules

        for stub in self.stubs:
            model = AutoModelForCausalLM.from_pretrained(
                stub,
            )
            compressed_linear_counts = 0

            for _, submodule in iter_named_leaf_modules(
                model,
            ):
                if isinstance(submodule, CompressedLinear):
                    compressed_linear_counts += 1

            # Some Linear modules are not compressed (e.g. lm_head), so just require at least one.
            assert compressed_linear_counts > 0

    def test_default_run_compressed__False(self):
        from compressed_tensors.linear.compressed_linear import CompressedLinear
        from compressed_tensors.quantization.utils import iter_named_leaf_modules

        from transformers.utils.quantization_config import CompressedTensorsConfig

        quantization_config = CompressedTensorsConfig(run_compressed=False)

        for stub in self.stubs:
            model = AutoModelForCausalLM.from_pretrained(
                stub,
                quantization_config=quantization_config,
            )
            compressed_linear_counts = 0

            for _, submodule in iter_named_leaf_modules(
                model,
            ):
                if isinstance(submodule, CompressedLinear):
                    compressed_linear_counts += 1

            # No modules should be CompressedLinear
            assert compressed_linear_counts == 0

    def test_run_compressed_outputs_match(self):
        """Check that run_compressed=True and run_compressed=False produce the same generations."""

        from transformers import AutoTokenizer
        from transformers.utils.quantization_config import CompressedTensorsConfig

        quantization_config = CompressedTensorsConfig(run_compressed=False)

        for stub in self.stubs:
            tokenizer = AutoTokenizer.from_pretrained(stub)
            input_ids = tokenizer(self.prompt, return_tensors="pt").input_ids

            model_run_compressed__True = AutoModelForCausalLM.from_pretrained(
                stub,
            )
            output_rc_true = model_run_compressed__True.generate(input_ids, max_new_tokens=100)

            model_run_compressed__False = AutoModelForCausalLM.from_pretrained(
                stub,
                quantization_config=quantization_config,
            )
            output_rc_false = model_run_compressed__False.generate(input_ids, max_new_tokens=100)

            assert tokenizer.decode(output_rc_true[0]) == tokenizer.decode(output_rc_false[0])
@@ -0,0 +1,87 @@
import gc
import unittest

from transformers import AutoModelForCausalLM, AutoTokenizer, CompressedTensorsConfig
from transformers.testing_utils import backend_empty_cache, require_compressed_tensors, require_torch, torch_device
from transformers.utils import is_torch_available


if is_torch_available():
    import torch
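
# The generation tests below download nm-testing checkpoints from the Hugging Face Hub and
# compare deterministic (do_sample=False) generations against pinned strings, so the
# expected outputs are specific to those exact checkpoints. A short sketch of the config
# round-trip exercised by test_config_to_from_dict:
#
#   config = CompressedTensorsConfig(config_groups={"FP8": ["Linear"]}, sparsity_config={"format": "dense"})
#   restored = CompressedTensorsConfig.from_dict(config.to_dict())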


@require_compressed_tensors
@require_torch
class CompressedTensorsTest(unittest.TestCase):
    tinyllama_w8a16 = "nm-testing/tinyllama-w8a16-dense-hf-quantizer"
    tinyllama_w4a16 = "nm-testing/tinyllama-w4a16-compressed-hf-quantizer"
    tinyllama_w8a8 = "nm-testing/tinyllama-w8a8-compressed-hf-quantizer"
    llama3_8b_fp8 = "nm-testing/Meta-Llama-3-8B-Instruct-fp8-hf_compat"

    prompt = "Paris is the capital of which country?"

    def tearDown(self):
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_config_args(self):
        with self.assertRaises(ValueError):
            # passing a quantization scheme directly is not allowed
            CompressedTensorsConfig(config_groups={"weights": {"num_bits": 8}})
        CompressedTensorsConfig(
            config_groups={"FP8": ["Linear"]},
            ignore=["lm_head"],
            quantization_status="frozen",
            sparsity_config={"format": "dense"},
        )

    def test_config_to_from_dict(self):
        config = CompressedTensorsConfig(config_groups={"FP8": ["Linear"]}, sparsity_config={"format": "dense"})
        config_dict = config.to_dict()
        config_from_dict = CompressedTensorsConfig.from_dict(config_dict)

        from compressed_tensors import QuantizationConfig, SparsityCompressionConfig

        self.assertIsInstance(config_from_dict.quantization_config, QuantizationConfig)
        self.assertIsInstance(config_from_dict.sparsity_config, SparsityCompressionConfig)

    def test_tinyllama_w8a8(self):
        expected_out = "<s> Paris is the capital of which country?\n\n 1. Paris is the capital of which country?\n\n 1. Paris is the capital of which country?\n\n 1. Paris is the capital of which country?\n\n"
        self._test_quantized_model(self.tinyllama_w8a8, expected_out)

    def test_tinyllama_w4a16(self):
        expected_out = "<s> Paris is the capital of which country?\nAnswer: Paris is the capital of France.\nQuestion: Which country is the capital of which city?\nAnswer: The capital of the city of New York is New York.\nQuestion: Which"
        self._test_quantized_model(self.tinyllama_w4a16, expected_out)

    def test_tinyllama_w8a16(self):
        expected_out = "<s> Paris is the capital of which country?\nA. France\nB. Germany\nC. Spain\nD. Italy\nE. Switzerland\nQ10. Which of the following is not a country in the European Union?\nA."
        self._test_quantized_model(self.tinyllama_w8a16, expected_out)

    def test_llama_8b_fp8(self):
        expected_out = "<|begin_of_text|>Paris is the capital of which country? France\nWhat is the name of the famous museum in Paris that is home to the Mona Lisa? The Louvre\nWhat is the name of the famous bridge in Paris that is often associated with the city"
        self._test_quantized_model(self.llama3_8b_fp8, expected_out)

    def _test_quantized_model(self, model_name: str, expected_output: str):
        """Carry out generation and compare the decoded output against the expected string."""
        quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        device = quantized_model.device
        self.assertIsNotNone(
            quantized_model.config.quantization_config,
            "quantization_config should not be None",
        )
        self.assertTrue(
            any(
                key
                for key, tensor in quantized_model.state_dict().items()
                if "scale" in key and not torch.all(tensor == 1.0)
            ),
            "quantized model should load a non-trivial scale into the state dict",
        )
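
        # Greedy decoding (do_sample=False) keeps generation deterministic for a fixed
        # checkpoint, so it can be compared against the pinned expected_output string.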
        inputs = tokenizer(self.prompt, return_tensors="pt").to(device)
        generated_ids = quantized_model.generate(**inputs, max_length=50, do_sample=False)
        outputs = tokenizer.batch_decode(generated_ids)

        self.assertIsNotNone(outputs)
        self.assertEqual(outputs[0], expected_output)