forked from EngineX-Cambricon/enginex-mlu370-vllm
add qwen3
This commit is contained in:
0
vllm-v0.6.2/tests/quantization/__init__.py
Normal file
0
vllm-v0.6.2/tests/quantization/__init__.py
Normal file
168
vllm-v0.6.2/tests/quantization/test_bitsandbytes.py
Normal file
168
vllm-v0.6.2/tests/quantization/test_bitsandbytes.py
Normal file
@@ -0,0 +1,168 @@
|
||||
'''Tests whether bitsandbytes computation is enabled correctly.
|
||||
|
||||
Run `pytest tests/quantization/test_bitsandbytes.py`.
|
||||
'''
|
||||
|
||||
import gc
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
from tests.utils import compare_two_settings, fork_new_process_for_each_test
|
||||
|
||||
# Tiny fp16 checkpoint that is quantized to 4-bit on the fly at load time.
models_4bit_to_test = [
    ("facebook/opt-125m", "quantize opt model inflight"),
]

# Checkpoints that were already quantized to 4-bit with bitsandbytes.
# NOTE: the "qaunt" typo in the name is kept intentionally — the list is
# referenced by this exact name in the parametrize decorators below.
models_pre_qaunt_4bit_to_test = [
    ('PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed',
     'read pre-quantized 4-bit FP4 model'),
    ('poedator/opt-125m-bnb-4bit', 'read pre-quantized 4-bit NF4 opt model'),
]

# Checkpoints that were already quantized to 8-bit with bitsandbytes.
models_pre_quant_8bit_to_test = [
    ('meta-llama/Llama-Guard-3-8B-INT8',
     'read pre-quantized llama 8-bit model'),
    ("yec019/fbopt-350m-8bit", "read pre-quantized 8-bit opt model"),
]
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                    reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@fork_new_process_for_each_test
def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                             model_name, description) -> None:
    """Quantize an fp16 checkpoint to 4-bit in-flight and compare the
    generated text against the HF reference (also loaded in 4-bit)."""
    validate_generated_texts(hf_runner,
                             vllm_runner,
                             example_prompts[:1],
                             model_name,
                             hf_model_kwargs={"load_in_4bit": True})
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                    reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description",
                         models_pre_qaunt_4bit_to_test)
@fork_new_process_for_each_test
def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                                       model_name, description) -> None:
    """Load checkpoints that were already bnb-4bit quantized (FP4/NF4)
    and compare vLLM output with the HF reference."""
    validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1],
                             model_name)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                    reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description",
                         models_pre_quant_8bit_to_test)
@fork_new_process_for_each_test
def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                             model_name, description) -> None:
    """Load checkpoints that were already bnb-8bit quantized and compare
    vLLM output with the HF reference."""
    validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1],
                             model_name)
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason='Test requires at least 2 GPUs.')
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                    reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@fork_new_process_for_each_test
def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                                model_name, description) -> None:
    """In-flight 4-bit quantization with tensor parallelism (TP=2)."""
    validate_generated_texts(hf_runner,
                             vllm_runner,
                             example_prompts[:1],
                             model_name,
                             hf_model_kwargs={"load_in_4bit": True},
                             vllm_tp_size=2)
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason='Test requires at least 2 GPUs.')
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                    reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@fork_new_process_for_each_test
def test_load_pp_4bit_bnb_model(model_name, description) -> None:
    """Smoke-test 4-bit bnb under pipeline parallelism: PP=2 output must
    match the single-stage baseline (checked by compare_two_settings)."""
    common_args = [
        "--disable-log-stats",
        "--disable-log-requests",
        "--dtype",
        "bfloat16",
        "--enable-prefix-caching",
        "--quantization",
        "bitsandbytes",
        "--load-format",
        "bitsandbytes",
        "--gpu-memory-utilization",
        "0.7",
    ]
    # Same server flags, plus a two-stage pipeline.
    pp_args = common_args + ["--pipeline-parallel-size", "2"]
    compare_two_settings(model_name, common_args, pp_args)
|
||||
|
||||
|
||||
def log_generated_texts(prompts, outputs, runner_name):
    """Bundle prompts with their generated texts for later comparison.

    Each item of ``outputs`` is a ``(token_ids, generated_text)`` pair;
    only the text is kept. Returns a list of dicts with the keys
    ``prompt``, ``runner_name`` and ``generated_text``.
    """
    entries = []
    for idx, (_, text) in enumerate(outputs):
        entries.append({
            "prompt": prompts[idx],
            "runner_name": runner_name,
            "generated_text": text,
        })
    return entries
|
||||
|
||||
|
||||
def validate_generated_texts(hf_runner,
                             vllm_runner,
                             prompts,
                             model_name,
                             hf_model_kwargs=None,
                             vllm_tp_size=1):
    """Generate with vLLM (bitsandbytes) and HF and assert identical text.

    Args:
        hf_runner: factory for the HuggingFace test runner (context manager).
        vllm_runner: factory for the vLLM test runner (context manager).
        prompts: prompts to generate from (8 greedy tokens each).
        model_name: HF model id loaded by both runners.
        hf_model_kwargs: extra kwargs for the HF model (e.g. load_in_4bit).
        vllm_tp_size: tensor-parallel size for the vLLM side.

    Raises:
        AssertionError: if any generated string differs between runners.
    """

    # NOTE: run vLLM first, as it requires a clean process
    # when using distributed inference
    with vllm_runner(model_name,
                     quantization='bitsandbytes',
                     load_format='bitsandbytes',
                     tensor_parallel_size=vllm_tp_size,
                     enforce_eager=False) as llm:
        vllm_outputs = llm.generate_greedy(prompts, 8)
        vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")

    # Clean up the GPU memory for the next test
    gc.collect()
    torch.cuda.empty_cache()

    if hf_model_kwargs is None:
        hf_model_kwargs = {}

    # Run with HF runner
    with hf_runner(model_name, model_kwargs=hf_model_kwargs) as llm:
        hf_outputs = llm.generate_greedy(prompts, 8)
        hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")

    # Clean up the GPU memory for the next test
    gc.collect()
    torch.cuda.empty_cache()

    # Compare the generated strings
    for hf_log, vllm_log in zip(hf_logs, vllm_logs):
        hf_str = hf_log["generated_text"]
        vllm_str = vllm_log["generated_text"]
        prompt = hf_log["prompt"]

        # BUGFIX: the original message had no separator after the model
        # name, producing e.g. "Model: opt-125mMismatch between ...".
        assert hf_str == vllm_str, (f"Model: {model_name}\n"
                                    f"Mismatch between HF and vLLM outputs:\n"
                                    f"Prompt: {prompt}\n"
                                    f"HF Output: '{hf_str}'\n"
                                    f"vLLM Output: '{vllm_str}'")
|
||||
210
vllm-v0.6.2/tests/quantization/test_compressed_tensors.py
Normal file
210
vllm-v0.6.2/tests/quantization/test_compressed_tensors.py
Normal file
@@ -0,0 +1,210 @@
|
||||
"""Test model set-up and weight loading for llmcompressor-quantized models.
|
||||
|
||||
Run `pytest tests/quantization/test_compressed_tensors.py`.
|
||||
"""
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from compressed_tensors.quantization import QuantizationType
|
||||
|
||||
from tests.models.utils import check_logprobs_close
|
||||
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501
|
||||
CompressedTensorsLinearMethod, CompressedTensorsW4A16Sparse24,
|
||||
CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
|
||||
CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model_args",
    [("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", "tensor",
      QuantizationType.INT, 2560, True),
     ("nm-testing/tinyllama-oneshot-w8-channel-a8-tensor", "channel",
      QuantizationType.INT, 2560, True),
     ("nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama", "tensor",
      QuantizationType.INT, 2560, False)])
def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
    """Static w8a8 checkpoints must wire up the int8 scheme, zero points
    and scales as expected, and still generate text."""
    model_path, strategy, quant_type, shape_0, is_symmetric = model_args
    with vllm_runner(model_path, enforce_eager=True) as llm:
        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
        layer = model.model.layers[0]

        qkv_proj = layer.self_attn.qkv_proj
        o_proj = layer.self_attn.o_proj
        gate_up_proj = layer.mlp.gate_up_proj
        down_proj = layer.mlp.down_proj

        def zp_valid(zp: Optional[torch.Tensor]):
            # Symmetric schemes carry no zero point; asymmetric ones must
            # provide an int32 zero point.
            if is_symmetric:
                return zp is None
            return zp is not None and zp.dtype is torch.int32

        # Zero points and linear method on every projection.
        for proj in (qkv_proj, o_proj, gate_up_proj, down_proj):
            assert zp_valid(proj.input_zero_point)
            assert isinstance(proj.quant_method,
                              CompressedTensorsLinearMethod)
        assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8)

        assert qkv_proj.scheme.strategy == strategy
        assert qkv_proj.scheme.is_static_input_scheme

        for proj in (qkv_proj, o_proj, gate_up_proj):
            assert proj.weight.dtype is torch.int8

        if qkv_proj.scheme.strategy == "tensor":
            # Per-tensor scales are expanded to a channelwise buffer by
            # process_weights_after_loading.
            assert len(qkv_proj.weight_scale.shape) == 2
            assert qkv_proj.weight_scale.shape[0] == shape_0
            assert qkv_proj.weight_scale.shape[1] == 1
        assert qkv_proj.weight_scale.dtype is torch.float32
        assert qkv_proj.input_scale.dtype is torch.float32

        output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
        assert output
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model_path",
    [
        "neuralmagic/Llama-3.2-1B-quantized.w8a8"
        # TODO static & asymmetric
    ])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [10])
def test_compressed_tensors_w8a8_logprobs(hf_runner, vllm_runner,
                                          example_prompts, model_path,
                                          max_tokens, num_logprobs):
    """Compare greedy logprobs of the quantized model between HF and vLLM
    (close match rather than exact equality)."""
    dtype = "bfloat16"

    with hf_runner(model_path, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs)

    with vllm_runner(model_path, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
|
||||
|
||||
|
||||
def test_compressed_tensors_no_enforce_eager(vllm_runner):
    """The w8a8 model must also load and generate with CUDA graphs
    enabled (i.e. without enforce_eager)."""
    model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
    with vllm_runner(model_path) as llm:
        sampled = llm.generate_greedy("Hello my name is", max_tokens=20)
        assert sampled
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_args", [
    ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", "tensor"),
    ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym", "tensor"),
    ("nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2", "channel"),
    ("nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
     "channel"),
])
def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args):
    """Dynamic per-token w8a8 models must use the int8 scheme without a
    static input scale, and still generate text."""
    model_path, strategy = model_args
    with vllm_runner(model_path, dtype=torch.float16) as llm:
        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
        layer = model.model.layers[0]

        qkv_proj = layer.self_attn.qkv_proj

        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
        assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8)
        # Dynamic activation quantization: no static input scale.
        assert not qkv_proj.scheme.is_static_input_scheme
        assert qkv_proj.scheme.strategy == strategy
        assert qkv_proj.weight.dtype is torch.int8

        output = llm.generate_greedy(["Hello my name is"], max_tokens=20)
        assert output
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "wNa16_args",
    [("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None, 8),
     ("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128, 8),
     ("nm-testing/tinyllama-oneshot-w8a16-per-channel", "channel", None, 4)])
def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
    """Weight-only int quantization (wNa16) must map to the WNA16 scheme
    with the expected strategy, group size and pack factor."""
    model, strategy, group, pack_factor = wNa16_args
    with vllm_runner(model) as llm:
        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
        layer = model.model.layers[0]

        qkv_proj = layer.self_attn.qkv_proj
        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
        assert isinstance(qkv_proj.scheme, CompressedTensorsWNA16)

        assert qkv_proj.scheme.strategy == strategy
        # A missing group size maps to -1 (whole-channel grouping).
        assert qkv_proj.scheme.group_size == (-1 if group is None else group)

        assert qkv_proj.weight_packed.dtype is torch.int32
        assert qkv_proj.weight_scale.dtype is torch.float16
        assert qkv_proj.scheme.pack_factor == pack_factor

        output = llm.generate_greedy("Hello my name is", max_tokens=20)
        assert output
|
||||
|
||||
|
||||
def test_compressed_tensors_w4a16_marlin24(vllm_runner):
    """2:4-sparse w4a16 checkpoints must map to the Marlin24 scheme."""
    model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
    with vllm_runner(model_path) as llm:
        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
        layer = model.model.layers[0]

        qkv_proj = layer.self_attn.qkv_proj

        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
        assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Sparse24)
        assert qkv_proj.weight_packed.dtype is torch.int32

        output = llm.generate_greedy("Hello my name is", max_tokens=20)
        assert output
|
||||
|
||||
|
||||
def test_compressed_tensors_fp8(vllm_runner):
    """FP8 compressed-tensors checkpoints must use one of the fp8 schemes;
    the W8A8 variant additionally keeps fp8 weights with scalar scales."""
    model_path = "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
    with vllm_runner(model_path) as llm:
        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
        layer = model.model.layers[0]

        qkv_proj = layer.self_attn.qkv_proj

        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
        assert isinstance(
            qkv_proj.scheme,
            (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8))

        assert qkv_proj.input_scale.dtype is torch.float32

        if isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8):
            # Scalar (0-dim) scales and fp8 weight storage.
            assert len(qkv_proj.input_scale.shape) == 0
            assert qkv_proj.weight.dtype is torch.float8_e4m3fn
            assert qkv_proj.weight_scale.dtype is torch.float32
            assert len(qkv_proj.weight_scale.shape) == 0

        output = llm.generate_greedy("Hello my name is", max_tokens=20)
        assert output
|
||||
|
||||
|
||||
def test_compressed_tensors_kv_cache(vllm_runner):
    """Checkpoints carrying a kv-cache scheme must load with an fp8 kv
    cache and generate."""
    model_path = "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
    with vllm_runner(model_path, kv_cache_dtype="fp8") as llm:
        sampled = llm.generate_greedy("Hello world!", max_tokens=20)
        assert sampled
|
||||
75
vllm-v0.6.2/tests/quantization/test_configs.py
Normal file
75
vllm-v0.6.2/tests/quantization/test_configs.py
Normal file
@@ -0,0 +1,75 @@
|
||||
"""Tests whether Marlin models can be loaded from the autogptq config.
|
||||
|
||||
Run `pytest tests/quantization/test_configs.py --forked`.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Tuple
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
|
||||
|
||||
@dataclass
class ModelPair:
    """A checkpoint serialized in Marlin format paired with the same
    checkpoint serialized in plain GPTQ format."""
    # HF id of the Marlin-serialized checkpoint.
    model_marlin: str
    # HF id of the plain-GPTQ-serialized checkpoint.
    model_gptq: str
|
||||
|
||||
|
||||
# Each entry: (model id, --quantization argument, expected resolved
# quantization type; "ERROR" means ModelConfig must raise ValueError).
MODEL_ARG_EXPTYPES = [
    # AUTOGPTQ
    # compat: autogptq <=0.7.1 is_marlin_format: bool
    # Model Serialized in Marlin Format should always use Marlin kernel.
    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", None, "marlin"),
    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "marlin", "marlin"),
    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "gptq", "marlin"),
    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "awq", "ERROR"),
    # Model Serialized in Exllama Format.
    ("TheBloke/Llama-2-7B-Chat-GPTQ", None, "gptq_marlin"),
    ("TheBloke/Llama-2-7B-Chat-GPTQ", "marlin", "gptq_marlin"),
    ("TheBloke/Llama-2-7B-Chat-GPTQ", "gptq", "gptq"),
    ("TheBloke/Llama-2-7B-Chat-GPTQ", "awq", "ERROR"),
    # compat: autogptq >=0.8.0 use checkpoint_format: str
    # Model Serialized in Marlin Format should always use Marlin kernel.
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", None, "marlin"),
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "marlin", "marlin"),
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "gptq", "marlin"),
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "awq", "ERROR"),
    # Model Serialized in Exllama Format.
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", None, "gptq_marlin"),
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "marlin", "gptq_marlin"),
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "gptq", "gptq"),
    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "awq", "ERROR"),

    # AUTOAWQ
    ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", None, "awq_marlin"),
    ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "awq", "awq"),
    ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "marlin", "awq_marlin"),
    ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "gptq", "ERROR"),
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_arg_exptype", MODEL_ARG_EXPTYPES)
def test_auto_gptq(model_arg_exptype: Tuple[str, None, str]) -> None:
    """ModelConfig must resolve each checkpoint + --quantization combo to
    the expected quantization backend (or reject it with ValueError)."""
    model_path, quantization_arg, expected_type = model_arg_exptype

    try:
        cfg = ModelConfig(model_path,
                          task="auto",
                          tokenizer=model_path,
                          tokenizer_mode="auto",
                          trust_remote_code=False,
                          seed=0,
                          dtype="float16",
                          revision=None,
                          quantization=quantization_arg)
    except ValueError:
        # Incompatible combination: recorded as the sentinel "ERROR".
        found_quantization_type = "ERROR"
    else:
        found_quantization_type = cfg.quantization

    assert found_quantization_type == expected_type, (
        f"Expected quant_type == {expected_type} for {model_path}, "
        f"but found {found_quantization_type} "
        f"for no --quantization {quantization_arg} case")
|
||||
68
vllm-v0.6.2/tests/quantization/test_cpu_offload.py
Normal file
68
vllm-v0.6.2/tests/quantization/test_cpu_offload.py
Normal file
@@ -0,0 +1,68 @@
|
||||
# Expanded quantized model tests for CPU offloading
|
||||
# Base tests: tests/basic_correctness/test_cpu_offload.py
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
|
||||
from ..utils import compare_two_settings
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
                    reason="fp8 is not supported on this GPU type.")
def test_cpu_offload_fp8():
    """fp8 models must produce matching outputs with CPU offload enabled."""
    # Quantize an unquantized checkpoint on the fly.
    compare_two_settings("meta-llama/Meta-Llama-3-8B-Instruct",
                         ["--quantization", "fp8"],
                         ["--quantization", "fp8", "--cpu-offload-gb", "2"],
                         max_wait_seconds=480)
    # Load an already-quantized checkpoint.
    compare_two_settings("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", [],
                         ["--cpu-offload-gb", "2"],
                         max_wait_seconds=480)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                    reason="gptq_marlin is not supported on this GPU type.")
def test_cpu_offload_gptq():
    """GPTQ models must produce matching outputs with CPU offload enabled."""
    # GPTQ Marlin kernel (default backend for this checkpoint).
    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", [],
                         ["--cpu-offload-gb", "1"],
                         max_wait_seconds=480)
    # Plain GPTQ kernel, forced explicitly.
    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4",
                         ["--quantization", "gptq"],
                         ["--quantization", "gptq", "--cpu-offload-gb", "1"],
                         max_wait_seconds=480)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("awq_marlin"),
                    reason="awq_marlin is not supported on this GPU type.")
def test_cpu_offload_awq():
    """AWQ models must produce matching outputs with CPU offload enabled."""
    # AWQ Marlin kernel (default backend for this checkpoint).
    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ", [],
                         ["--cpu-offload-gb", "1"],
                         max_wait_seconds=480)
    # Plain AWQ kernel, forced explicitly.
    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ",
                         ["--quantization", "awq"],
                         ["--quantization", "awq", "--cpu-offload-gb", "1"],
                         max_wait_seconds=480)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                    reason="gptq_marlin is not supported on this GPU type.")
def test_cpu_offload_compressed_tensors():
    """compressed-tensors models must match with CPU offload enabled,
    across the wNa16, marlin24 and w8a8 schemes."""
    # wNa16 (weight-only int quantization).
    compare_two_settings("nm-testing/tinyllama-oneshot-w4a16-channel-v2", [],
                         ["--cpu-offload-gb", "1"],
                         max_wait_seconds=480)
    # w4a16 with 2:4 sparsity (marlin24).
    compare_two_settings("nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
                         [], ["--cpu-offload-gb", "1"],
                         max_wait_seconds=480)
    # w8a8 (int8 weights and activations).
    compare_two_settings(
        "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", [],
        ["--cpu-offload-gb", "1"],
        max_wait_seconds=480)
|
||||
28
vllm-v0.6.2/tests/quantization/test_experts_int8.py
Normal file
28
vllm-v0.6.2/tests/quantization/test_experts_int8.py
Normal file
@@ -0,0 +1,28 @@
|
||||
# flake8: noqa
|
||||
"""Tests experts_int8 quantization startup and generation,
|
||||
doesn't test correctness
|
||||
"""
|
||||
import pytest
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
|
||||
# MoE model small enough for a startup smoke test.
MODELS = ["ai21labs/Jamba-tiny-random"]


@pytest.mark.skipif(not is_quant_method_supported("experts_int8"),
                    reason="ExpertsInt8 is not supported on this GPU type.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [10])
def test_model_experts_int8_startup(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Startup + generation smoke test for experts_int8 quantization;
    correctness is not checked here."""
    with vllm_runner(model, dtype=dtype,
                     quantization="experts_int8") as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
142
vllm-v0.6.2/tests/quantization/test_fp8.py
Normal file
142
vllm-v0.6.2/tests/quantization/test_fp8.py
Normal file
@@ -0,0 +1,142 @@
|
||||
"""Tests whether FP8 computation is enabled correctly.
|
||||
|
||||
Run `pytest tests/quantization/test_fp8.py --forked`.
|
||||
"""
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.model_executor.layers.quantization.fp8 import (Fp8KVCacheMethod,
|
||||
Fp8LinearMethod)
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
MODELS = [
    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
    "nm-testing/Phi-3-mini-128k-instruct-FP8",
    "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
]


@pytest.mark.skipif(not is_quant_method_supported("fp8"),
                    reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_id", MODELS)
@pytest.mark.parametrize("force_marlin", [False, True])
def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool,
                            monkeypatch) -> None:
    """Load each FP8 checkpoint and run a short greedy generation,
    optionally forcing the Marlin weight-only fallback path."""
    if force_marlin:
        monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")

    with vllm_runner(model_id) as llm:
        # note: this does not test accuracy, just that we can run through
        # see lm-eval tests for accuracy
        outputs = llm.generate_greedy(prompts=["Hello my name is"],
                                      max_tokens=10)
        print(outputs[0][1])
|
||||
|
||||
|
||||
KV_CACHE_MODELS = [
    # Deprecated AutoFP8 format using .kv_scale
    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
    # AutoFP8 format using separate .k_scale and .v_scale
    "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
]


@pytest.mark.skipif(not is_quant_method_supported("fp8"),
                    reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_id", KV_CACHE_MODELS)
def test_kv_cache_model_load_and_run(vllm_runner, model_id: str):
    """Checkpoints shipping fp8 kv-cache scales must attach Fp8KVCacheMethod
    and load non-default scale values."""
    with vllm_runner(model_id, kv_cache_dtype="fp8") as llm:
        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
        attn = model.model.layers[0].self_attn.attn
        assert isinstance(attn.quant_method, Fp8KVCacheMethod)

        # NOTE: it is valid for scales to be 1.0 (default value), but we know
        # these checkpoints have scales < 1.0
        assert 0.0 < attn._k_scale < 1.0
        assert 0.0 < attn._v_scale < 1.0

        # note: this does not test accuracy, just that we can run through
        # see lm-eval tests for accuracy
        outputs = llm.generate_greedy(prompts=["Hello my name is"],
                                      max_tokens=10)
        print(outputs[0][1])
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
                    reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
@pytest.mark.parametrize("force_marlin", [False, True])
def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
                         monkeypatch) -> None:
    """In-flight fp8 quantization of an fp16 checkpoint: verify the linear
    method, default kv-cache scales, and the weight storage dtype."""
    if force_marlin:
        monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")

    with vllm_runner("facebook/opt-125m",
                     quantization="fp8",
                     kv_cache_dtype=kv_cache_dtype) as llm:
        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
        fc1 = model.model.decoder.layers[0].fc1
        assert isinstance(fc1.quant_method, Fp8LinearMethod)

        if kv_cache_dtype == "fp8":
            # No checkpoint scales here, so both stay at the default 1.0.
            attn = model.model.decoder.layers[0].self_attn.attn
            assert isinstance(attn.quant_method, Fp8KVCacheMethod)
            assert attn._k_scale == 1.0
            assert attn._v_scale == 1.0

        if current_platform.has_device_capability(89) and not force_marlin:
            # For GPUs with hardware support, we keep weights in fp8
            assert fc1.weight.dtype == torch.float8_e4m3fn
        else:
            # For GPUs without hardware support, we pack the fp8 weights
            # for weight-only quantization using Marlin kernels
            assert fc1.weight.dtype == torch.int32
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
                    reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
def test_scaled_fp8_quant(dtype) -> None:
    """Check ops.scaled_fp8_quant (dynamic, static and padded variants)
    against a pure-PyTorch reference after dequantization."""

    def quantize_ref(tensor, inv_scale):
        # Reference implementation that fully aligns to the kernel
        # being tested.
        finfo = torch.finfo(torch.float8_e4m3fn)
        scale = inv_scale.reciprocal()
        qweight = (tensor.to(torch.float32) * scale).clamp(min=finfo.min,
                                                           max=finfo.max)
        return qweight.to(torch.float8_e4m3fn)

    def per_tensor_dequantize(tensor, inv_scale, dtype):
        fake_qweight = tensor.to(dtype)
        return fake_qweight * inv_scale

    # A shape with % 4 != 0 exercises the tail of the 4-wide
    # vectorized kernel.
    x = (torch.randn(size=(11, 11), device="cuda") * 13).to(dtype)

    # Dynamic quantization
    ref_y, inv_scale = ops.scaled_fp8_quant(x, None)
    ref_y = per_tensor_dequantize(ref_y, inv_scale, dtype)

    # Reference dynamic quantization
    y = quantize_ref(x, inv_scale)
    torch.testing.assert_close(ref_y,
                               per_tensor_dequantize(y, inv_scale, dtype))

    # Static quantization
    y, _ = ops.scaled_fp8_quant(x, inv_scale)
    torch.testing.assert_close(ref_y,
                               per_tensor_dequantize(y, inv_scale, dtype))

    # Padding
    y, _ = ops.scaled_fp8_quant(x, inv_scale, num_token_padding=17)
    assert y.shape[0] == 17
    torch.testing.assert_close(
        ref_y,
        per_tensor_dequantize(torch.narrow(y, 0, 0, x.shape[0]), inv_scale,
                              dtype))
|
||||
28
vllm-v0.6.2/tests/quantization/test_ipex_quant.py
Normal file
28
vllm-v0.6.2/tests/quantization/test_ipex_quant.py
Normal file
@@ -0,0 +1,28 @@
|
||||
"""Test model set-up and inference for quantized HF models supported
|
||||
on the CPU backend using IPEX (including AWQ).
|
||||
|
||||
Validating the configuration and printing results for manual checking.
|
||||
|
||||
Run `pytest tests/quantization/test_ipex_quant.py`.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
MODELS = [
    "casperhansen/llama-3-8b-instruct-awq",
]
DTYPE = ["bfloat16"]


@pytest.mark.skipif(not current_platform.is_cpu(),
                    reason="only supports the CPU backend.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", DTYPE)
def test_ipex_quant(vllm_runner, model, dtype):
    """AWQ models must load and generate on the CPU backend via IPEX;
    output is printed for manual inspection, not checked for accuracy."""
    with vllm_runner(model, dtype=dtype) as llm:
        result = llm.generate_greedy(["The capital of France is"],
                                     max_tokens=32)
        assert result
        print(result)
|
||||
47
vllm-v0.6.2/tests/quantization/test_lm_head.py
Normal file
47
vllm-v0.6.2/tests/quantization/test_lm_head.py
Normal file
@@ -0,0 +1,47 @@
|
||||
"""Tests whether gptq models with quantized lm_head can be loaded.
|
||||
|
||||
Run `pytest tests/quantization/test_quant_lm_head_true.py --forked`.
|
||||
"""
|
||||
from typing import Tuple
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
|
||||
from vllm.model_executor.layers.quantization.gptq_marlin import (
|
||||
GPTQMarlinLinearMethod)
|
||||
from vllm.model_executor.layers.quantization.marlin import MarlinLinearMethod
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
||||
UnquantizedEmbeddingMethod)
|
||||
|
||||
PROMPT = "On the surface of Mars, we found"

# (model id, whether the checkpoint's lm_head is quantized)
MODELS_QUANT = [
    ("LnL-AI/TinyLlama-1.1B-intermediate-step-1341k-3T-autoround-lm_head-symFalse",
     True),
    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", False),
    ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", False),
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_lm_head_quant", MODELS_QUANT)
def test_lm_head(
    vllm_runner,
    model_lm_head_quant: Tuple[str, bool],
) -> None:
    """Verify lm_head gets a quantized linear method exactly when the
    checkpoint quantizes the lm_head, then run a short generation."""
    model, lm_head_quantized = model_lm_head_quant

    # FIX: use the runner as a context manager (like every other test in
    # this suite) so engine resources are released even when an assertion
    # fails; the original relied on a trailing `del vllm_model`, which is
    # skipped on failure.
    with vllm_runner(model, dtype=torch.float16,
                     max_model_len=2048) as vllm_model:
        lm_head_layer = (vllm_model.model.llm_engine.model_executor.
                         driver_worker.model_runner.model.lm_head)

        if lm_head_quantized:
            assert isinstance(
                lm_head_layer.linear_method,
                (GPTQLinearMethod, GPTQMarlinLinearMethod, MarlinLinearMethod))
        else:
            assert isinstance(lm_head_layer.linear_method,
                              UnquantizedEmbeddingMethod)

        print(
            vllm_model.generate_greedy(prompts=["Hello my name is"],
                                       max_tokens=10)[0][1])
|
||||
15
vllm-v0.6.2/tests/quantization/utils.py
Normal file
15
vllm-v0.6.2/tests/quantization/utils.py
Normal file
@@ -0,0 +1,15 @@
|
||||
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
def is_quant_method_supported(quant_method: str) -> bool:
    """Return True if the current device can run ``quant_method``.

    Checks that the platform is CUDA or ROCm and that the device's compute
    capability meets the method's minimum requirement.
    """
    # Currently, all quantization methods require Nvidia or AMD GPUs
    if not (current_platform.is_cuda() or current_platform.is_rocm()):
        return False

    capability = current_platform.get_device_capability()
    assert capability is not None

    required = QUANTIZATION_METHODS[quant_method].get_min_capability()
    return capability.to_int() >= required
|
||||
Reference in New Issue
Block a user