This commit is contained in: (branch list not captured in this extract)
2025-10-09 16:47:16 +08:00
parent c8feb4deb5
commit e27e3f16bb
5248 changed files with 1778505 additions and 0 deletions

View File

@@ -0,0 +1,232 @@
import gc
import unittest
import warnings
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.testing_utils import backend_empty_cache, require_compressed_tensors, require_torch, torch_device
from transformers.utils import is_torch_available
from transformers.utils.quantization_config import CompressedTensorsConfig
if is_torch_available():
import torch
@require_compressed_tensors
@require_torch
class StackCompressedModelTest(unittest.TestCase):
    """Compare compressed checkpoints against their uncompressed counterparts.

    Covers quantized-only, sparse-only, and stacked (sparse + quantized)
    llama2.c story-model checkpoints hosted under ``nm-testing``.
    """

    # (compressed, uncompressed) stub pairs, one per compression scheme.
    compressed_uncompressed_model_stubs = [
        (
            "nm-testing/llama2.c-stories42M-gsm8k-quantized-only-compressed",
            "nm-testing/llama2.c-stories42M-gsm8k-quantized-only-uncompressed",
        ),
        (
            "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-compressed",
            "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-uncompressed",
        ),
        (
            "nm-testing/llama2.c-stories42M-gsm8k-stacked-compressed",
            "nm-testing/llama2.c-stories42M-gsm8k-stacked-uncompressed",
        ),
    ]
    # Flattened view for tests that iterate over every stub individually.
    model_stubs = [single for couple in compressed_uncompressed_model_stubs for single in couple]
    # The sparse-only pair is the one exercised by the output-equality test.
    sparse_compressed_model = "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-compressed"
    sparse_uncompressed_model = "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-uncompressed"
    prompt = "Paris is the capital of which country?"

    def tearDown(self):
        # Free model memory between tests so repeated loads fit on the device.
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_compressed_uncompressed_model_shapes(self):
        """
        Verify that the weights of an uncompressed model and its decompressed compressed counterpart match.
        Note: Weights for sparsely compressed models may differ due to packing.
        """

        def _resolve(root, dotted_name):
            # Walk a dotted attribute path; return None when any segment is missing.
            node = root
            for part in dotted_name.split("."):
                if not hasattr(node, part):
                    return None
                node = getattr(node, part)
            return node

        from compressed_tensors.quantization.utils import iter_named_leaf_modules

        for comp_stub, uncomp_stub in self.compressed_uncompressed_model_stubs:
            with self.subTest(compressed_model=comp_stub, uncompressed_model=uncomp_stub):
                reference = AutoModelForCausalLM.from_pretrained(
                    uncomp_stub,
                    device_map="auto",
                    dtype="auto",
                    quantization_config=CompressedTensorsConfig(run_compressed=False),
                )
                decompressed = AutoModelForCausalLM.from_pretrained(
                    comp_stub,
                    device_map="auto",
                    dtype="auto",
                    quantization_config=CompressedTensorsConfig(run_compressed=False),
                )
                for name, ref_module in iter_named_leaf_modules(reference):
                    counterpart = _resolve(decompressed, name)
                    # Skip modules absent from the decompressed model or without weights.
                    if counterpart is None or not hasattr(ref_module, "weight"):
                        continue
                    if "sparse-only" in uncomp_stub:
                        # Sparse-only decompression must reproduce weights exactly.
                        self.assertTrue(
                            torch.equal(ref_module.weight, counterpart.weight),
                            f"Weight mismatch for module '{name}' in sparse-only model.",
                        )
                    else:
                        # Quantized/stacked weights are only compared up to a loose tolerance.
                        self.assertTrue(
                            torch.allclose(ref_module.weight, counterpart.weight, atol=0.2),
                            f"Weight mismatch for module '{name}' in quantized-only or stacked model.",
                        )

    def test_outputs_match(self):
        """
        Ensure that the generated outputs match between the uncompressed model
        and its decompressed compressed counterpart.
        """
        tokenizer = AutoTokenizer.from_pretrained(self.sparse_uncompressed_model)
        input_ids = tokenizer(self.prompt, return_tensors="pt").input_ids

        reference = AutoModelForCausalLM.from_pretrained(
            self.sparse_uncompressed_model,
            device_map="auto",
            dtype="auto",
            quantization_config=CompressedTensorsConfig(run_compressed=False),
        )
        reference_tokens = reference.generate(input_ids.to(reference.device), max_new_tokens=100)

        decompressed = AutoModelForCausalLM.from_pretrained(
            self.sparse_compressed_model,
            device_map="auto",
            dtype="auto",
            quantization_config=CompressedTensorsConfig(run_compressed=False),
        )
        decompressed_tokens = decompressed.generate(input_ids.to(decompressed.device), max_new_tokens=100)

        self.assertEqual(
            tokenizer.decode(reference_tokens[0]),
            tokenizer.decode(decompressed_tokens[0]),
            "Generated outputs do not match between compressed and uncompressed models.",
        )

    def test_no_warnings_for_all_models(self):
        """
        Confirm that loading any model using compressed tensors does not trigger
        warnings about missing or unexpected keys.
        """
        for model_stub in self.model_stubs:
            with self.subTest(model_stub=model_stub):
                with warnings.catch_warnings(record=True) as caught_warnings:
                    warnings.simplefilter("always")
                    AutoModelForCausalLM.from_pretrained(
                        model_stub,
                        device_map="auto",
                        dtype="auto",
                        quantization_config=CompressedTensorsConfig(run_compressed=False),
                    )
                for emitted in caught_warnings:
                    text = str(emitted.message).lower()
                    self.assertNotIn(
                        "missing keys",
                        text,
                        f"'missing keys' found in warnings for model {model_stub}",
                    )
                    self.assertNotIn(
                        "unexpected keys",
                        text,
                        f"'unexpected keys' found in warnings for model {model_stub}",
                    )
@require_compressed_tensors
@require_torch
class RunCompressedTest(unittest.TestCase):
    """Tests for the ``run_compressed`` flag of ``CompressedTensorsConfig``.

    ``run_compressed=True`` (the default) keeps quantized linears as
    ``CompressedLinear`` modules; ``run_compressed=False`` decompresses the
    checkpoint into regular modules at load time.
    """

    tinyllama_w4a16 = "nm-testing/tinyllama-w4a16-compressed-hf-quantizer"
    tinyllama_w8a8 = "nm-testing/tinyllama-w8a8-compressed-hf-quantizer"
    prompt = "Paris is the capital of which country?"
    stubs = [tinyllama_w4a16, tinyllama_w8a8]

    def tearDown(self):
        # Free model memory between tests so repeated loads fit on the device.
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    @staticmethod
    def _count_compressed_linear(model):
        """Return the number of leaf modules that are ``CompressedLinear`` instances."""
        # Imported lazily: compressed_tensors is only guaranteed present under
        # @require_compressed_tensors at test run time.
        from compressed_tensors.linear.compressed_linear import CompressedLinear
        from compressed_tensors.quantization.utils import iter_named_leaf_modules

        return sum(1 for _, submodule in iter_named_leaf_modules(model) if isinstance(submodule, CompressedLinear))

    def test_default_run_compressed__True(self):
        """By default, quantized linear layers load as CompressedLinear modules."""
        for stub in self.stubs:
            model = AutoModelForCausalLM.from_pretrained(
                stub,
            )
            # Some linear modules are deliberately left uncompressed (e.g. lm_head),
            # so we only require at least one CompressedLinear, not all of them.
            self.assertGreater(
                self._count_compressed_linear(model),
                0,
                f"Expected at least one CompressedLinear module when loading {stub}",
            )

    def test_default_run_compressed__False(self):
        """With run_compressed=False, no CompressedLinear modules should remain."""
        quantization_config = CompressedTensorsConfig(run_compressed=False)
        for stub in self.stubs:
            model = AutoModelForCausalLM.from_pretrained(
                stub,
                quantization_config=quantization_config,
            )
            self.assertEqual(
                self._count_compressed_linear(model),
                0,
                f"Expected no CompressedLinear modules when loading {stub} decompressed",
            )

    def test_run_compressed_outputs_match(self):
        """Check that run_compressed=True/False output are the same"""
        quantization_config = CompressedTensorsConfig(run_compressed=False)
        for stub in self.stubs:
            tokenizer = AutoTokenizer.from_pretrained(stub)
            input_ids = tokenizer(self.prompt, return_tensors="pt").input_ids

            model_run_compressed__True = AutoModelForCausalLM.from_pretrained(
                stub,
            )
            output_rc_true = model_run_compressed__True.generate(input_ids, max_new_tokens=100)

            model_run_compressed__False = AutoModelForCausalLM.from_pretrained(
                stub,
                quantization_config=quantization_config,
            )
            output_rc_false = model_run_compressed__False.generate(input_ids, max_new_tokens=100)

            self.assertEqual(
                tokenizer.decode(output_rc_true[0]),
                tokenizer.decode(output_rc_false[0]),
                f"run_compressed=True/False generations differ for {stub}",
            )

View File

@@ -0,0 +1,87 @@
import gc
import unittest
from transformers import AutoModelForCausalLM, AutoTokenizer, CompressedTensorsConfig
from transformers.testing_utils import backend_empty_cache, require_compressed_tensors, require_torch, torch_device
from transformers.utils import is_torch_available
if is_torch_available():
import torch
@require_compressed_tensors
@require_torch
class CompressedTensorsTest(unittest.TestCase):
    """End-to-end checks for ``CompressedTensorsConfig`` and quantized generation."""

    tinyllama_w8a16 = "nm-testing/tinyllama-w8a16-dense-hf-quantizer"
    tinyllama_w4a16 = "nm-testing/tinyllama-w4a16-compressed-hf-quantizer"
    tinyllama_w8a8 = "nm-testing/tinyllama-w8a8-compressed-hf-quantizer"
    llama3_8b_fp8 = "nm-testing/Meta-Llama-3-8B-Instruct-fp8-hf_compat"
    prompt = "Paris is the capital of which country?"

    def tearDown(self):
        # Free model memory between tests so repeated loads fit on the device.
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_config_args(self):
        """Reject raw quant schemes in config_groups; accept a full keyword config."""
        with self.assertRaises(ValueError):
            # passing quant scheme directly is not allowed
            CompressedTensorsConfig(config_groups={"weights": {"num_bits": 8}})
        # A fully specified config must construct without raising.
        CompressedTensorsConfig(
            config_groups={"FP8": ["Linear"]},
            ignore=["lm_head"],
            quantization_status="frozen",
            sparsity_config={"format": "dense"},
        )

    def test_config_to_from_dict(self):
        """Round-trip a config through to_dict/from_dict and check sub-config types."""
        from compressed_tensors import QuantizationConfig, SparsityCompressionConfig

        original = CompressedTensorsConfig(config_groups={"FP8": ["Linear"]}, sparsity_config={"format": "dense"})
        round_tripped = CompressedTensorsConfig.from_dict(original.to_dict())
        self.assertIsInstance(round_tripped.quantization_config, QuantizationConfig)
        self.assertIsInstance(round_tripped.sparsity_config, SparsityCompressionConfig)

    def test_tinyllama_w8a8(self):
        expected_out = "<s> Paris is the capital of which country?\n\n 1. Paris is the capital of which country?\n\n 1. Paris is the capital of which country?\n\n 1. Paris is the capital of which country?\n\n"
        self._test_quantized_model(self.tinyllama_w8a8, expected_out)

    def test_tinyllama_w4a16(self):
        expected_out = "<s> Paris is the capital of which country?\nAnswer: Paris is the capital of France.\nQuestion: Which country is the capital of which city?\nAnswer: The capital of the city of New York is New York.\nQuestion: Which"
        self._test_quantized_model(self.tinyllama_w4a16, expected_out)

    def test_tinyllama_w8a16(self):
        expected_out = "<s> Paris is the capital of which country?\nA. France\nB. Germany\nC. Spain\nD. Italy\nE. Switzerland\nQ10. Which of the following is not a country in the European Union?\nA."
        self._test_quantized_model(self.tinyllama_w8a16, expected_out)

    def test_llama_8b_fp8(self):
        expected_out = "<|begin_of_text|>Paris is the capital of which country? France\nWhat is the name of the famous museum in Paris that is home to the Mona Lisa? The Louvre\nWhat is the name of the famous bridge in Paris that is often associated with the city"
        self._test_quantized_model(self.llama3_8b_fp8, expected_out)

    def _test_quantized_model(self, model_name: str, expected_output: str):
        """Carry out generation"""
        model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        device = model.device

        self.assertIsNotNone(
            model.config.quantization_config,
            "quantization_config should not be None",
        )
        # At least one scale tensor must differ from the all-ones placeholder,
        # i.e. real quantization parameters were loaded.
        has_nontrivial_scale = any(
            key for key, tensor in model.state_dict().items() if "scale" in key and not torch.all(tensor == 1.0)
        )
        self.assertTrue(
            has_nontrivial_scale,
            "quantized model should load a non-trivial scale into the state dict",
        )

        encoded = tokenizer(self.prompt, return_tensors="pt").to(device)
        generated = model.generate(**encoded, max_length=50, do_sample=False)
        decoded = tokenizer.batch_decode(generated)
        self.assertIsNotNone(decoded)
        self.assertEqual(decoded[0], expected_output)