Sync from v0.13
This commit is contained in:
0
tests/models/quantization/__init__.py
Normal file
0
tests/models/quantization/__init__.py
Normal file
137
tests/models/quantization/test_awq.py
Normal file
137
tests/models/quantization/test_awq.py
Normal file
@@ -0,0 +1,137 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
|
||||
from ...conftest import IMAGE_ASSETS, ImageTestAssets, VllmRunner
|
||||
from ..utils import check_logprobs_close
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<|im_start|>User\n<image>\nWhat's the content in the center of the image?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
||||
"cherry_blossom": "<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
def run_awq_test(
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets,
|
||||
source_model: str,
|
||||
quant_model: str,
|
||||
*,
|
||||
size_factors: list[float],
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: str | None = None,
|
||||
):
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_image = [
|
||||
(
|
||||
[prompt for _ in size_factors],
|
||||
[rescale_image_size(image, factor) for factor in size_factors],
|
||||
)
|
||||
for image, prompt in zip(images, HF_IMAGE_PROMPTS)
|
||||
]
|
||||
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
|
||||
# max_model_len should be greater than image_feature_size
|
||||
with vllm_runner(
|
||||
source_model,
|
||||
max_model_len=4096,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True,
|
||||
default_torch_num_threads=1,
|
||||
) as vllm_model:
|
||||
source_outputs_per_image = [
|
||||
vllm_model.generate_greedy_logprobs(
|
||||
prompts, max_tokens, num_logprobs=num_logprobs, images=images
|
||||
)
|
||||
for prompts, images in inputs_per_image
|
||||
]
|
||||
|
||||
with vllm_runner(
|
||||
quant_model,
|
||||
quantization="awq",
|
||||
max_model_len=4096,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True,
|
||||
default_torch_num_threads=1,
|
||||
) as vllm_model:
|
||||
quant_outputs_per_image = [
|
||||
vllm_model.generate_greedy_logprobs(
|
||||
prompts, max_tokens, num_logprobs=num_logprobs, images=images
|
||||
)
|
||||
for prompts, images in inputs_per_image
|
||||
]
|
||||
|
||||
for source_outputs, quant_outputs in zip(
|
||||
source_outputs_per_image, quant_outputs_per_image
|
||||
):
|
||||
# TODO: Check whether using original CLIPVisionModel can improve
|
||||
# consistency against HF
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=source_outputs,
|
||||
outputs_1_lst=quant_outputs,
|
||||
name_0="source",
|
||||
name_1="awq",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("source_model", "quant_model"),
|
||||
[("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"size_factors",
|
||||
[
|
||||
# No image
|
||||
[],
|
||||
# Single-scale
|
||||
[1.0],
|
||||
# Single-scale, batched
|
||||
[1.0, 1.0, 1.0],
|
||||
# Multi-scale
|
||||
[0.25, 0.5, 1.0],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@torch.inference_mode()
|
||||
def test_awq_models(
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
source_model,
|
||||
quant_model,
|
||||
size_factors,
|
||||
dtype,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
) -> None:
|
||||
run_awq_test(
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
source_model,
|
||||
quant_model,
|
||||
size_factors=size_factors,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
68
tests/models/quantization/test_bitblas.py
Normal file
68
tests/models/quantization/test_bitblas.py
Normal file
@@ -0,0 +1,68 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Compare the outputs of a GPTQ model to a bitblas model.
|
||||
|
||||
Note: GPTQ and bitblas do not have bitwise correctness.
|
||||
As a result, in this test, we just confirm that the top selected tokens of the
|
||||
bitblas/GPTQ models are in the top 3 selections of each other.
|
||||
|
||||
Note: bitblas internally uses locks to synchronize the threads. This can
|
||||
result in very slight nondeterminism for bitblas. As a result, we re-run the
|
||||
test up to 3 times to see if we pass.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
import pytest
|
||||
|
||||
from ..utils import check_logprobs_close
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelPair:
|
||||
model_bitblas: str
|
||||
model_gptq: str
|
||||
|
||||
|
||||
model_pairs = [
|
||||
ModelPair(
|
||||
model_bitblas="hxbgsyxh/opt-125m-4bit-128g-bitblas",
|
||||
model_gptq="hxbgsyxh/opt-125m-4bit-128g",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.flaky(reruns=2)
|
||||
@pytest.mark.skipif(True, reason="BitBLAS takes too much time for tuning.")
|
||||
@pytest.mark.parametrize("model_pair", model_pairs)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_models(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model_pair: ModelPair,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
with vllm_runner(
|
||||
model_pair.model_bitblas, dtype=dtype, quantization="bitblas"
|
||||
) as bitblas_model:
|
||||
bitblas_outputs = bitblas_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
with vllm_runner(
|
||||
model_pair.model_gptq, dtype=dtype, quantization="gptq"
|
||||
) as gptq_model:
|
||||
gptq_outputs = gptq_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=gptq_outputs,
|
||||
outputs_1_lst=bitblas_outputs,
|
||||
name_0="gptq",
|
||||
name_1="bitblas",
|
||||
)
|
||||
290
tests/models/quantization/test_bitsandbytes.py
Normal file
290
tests/models/quantization/test_bitsandbytes.py
Normal file
@@ -0,0 +1,290 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests whether bitsandbytes computation is enabled correctly.
|
||||
|
||||
Run `pytest tests/quantization/test_bitsandbytes.py`.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from transformers import BitsAndBytesConfig
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ...utils import compare_two_settings, multi_gpu_test
|
||||
from ..utils import check_embeddings_close, check_logprobs_close
|
||||
|
||||
if current_platform.is_rocm():
|
||||
from vllm.platforms.rocm import on_gfx9
|
||||
|
||||
pytestmark = pytest.mark.skipif(
|
||||
on_gfx9(),
|
||||
reason="bitsandbytes not supported on gfx9 (warp size 64 limitation)",
|
||||
)
|
||||
|
||||
models_4bit_to_test = [
|
||||
("facebook/opt-125m", "quantize opt model inflight"),
|
||||
(
|
||||
"mistralai/Mistral-7B-Instruct-v0.3",
|
||||
"quantize inflight model with both HF and Mistral format weights",
|
||||
),
|
||||
]
|
||||
|
||||
models_4bit_to_embedding_test = [
|
||||
("intfloat/e5-mistral-7b-instruct", "quantize embedding model inflight"),
|
||||
]
|
||||
|
||||
models_4bit_to_moe_test = [
|
||||
("allenai/OLMoE-1B-7B-0125-Instruct", "quantize moe model inflight"),
|
||||
]
|
||||
|
||||
models_pre_qaunt_4bit_to_test = [
|
||||
(
|
||||
"PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed",
|
||||
"read pre-quantized 4-bit FP4 model",
|
||||
),
|
||||
("poedator/opt-125m-bnb-4bit", "read pre-quantized 4-bit NF4 opt model"),
|
||||
]
|
||||
|
||||
models_pre_quant_8bit_to_test = [
|
||||
("meta-llama/Llama-Guard-3-8B-INT8", "read pre-quantized llama 8-bit model"),
|
||||
("yec019/fbopt-350m-8bit", "read pre-quantized 8-bit opt model"),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("bitsandbytes"),
|
||||
reason="bitsandbytes is not supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
|
||||
def test_load_4bit_bnb_model(
|
||||
hf_runner, vllm_runner, example_prompts, model_name, description
|
||||
) -> None:
|
||||
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(load_in_4bit=True))
|
||||
validate_generated_texts(
|
||||
hf_runner, vllm_runner, example_prompts[:1], model_name, False, hf_model_kwargs
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("bitsandbytes"),
|
||||
reason="bitsandbytes is not supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize("model_name, description", models_pre_qaunt_4bit_to_test)
|
||||
def test_load_pre_quant_4bit_bnb_model(
|
||||
hf_runner, vllm_runner, example_prompts, model_name, description
|
||||
) -> None:
|
||||
validate_generated_texts(
|
||||
hf_runner, vllm_runner, example_prompts[:1], model_name, True
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("bitsandbytes"),
|
||||
reason="bitsandbytes is not supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize("model_name, description", models_pre_quant_8bit_to_test)
|
||||
def test_load_8bit_bnb_model(
|
||||
hf_runner, vllm_runner, example_prompts, model_name, description
|
||||
) -> None:
|
||||
validate_generated_texts(
|
||||
hf_runner, vllm_runner, example_prompts[:1], model_name, True
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("bitsandbytes"),
|
||||
reason="bitsandbytes is not supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
def test_load_tp_4bit_bnb_model(
|
||||
hf_runner, vllm_runner, example_prompts, model_name, description
|
||||
) -> None:
|
||||
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(load_in_4bit=True))
|
||||
validate_generated_texts(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts[:1],
|
||||
model_name,
|
||||
False,
|
||||
hf_model_kwargs,
|
||||
vllm_tp_size=2,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("bitsandbytes"),
|
||||
reason="bitsandbytes is not supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
def test_load_pp_4bit_bnb_model(model_name, description) -> None:
|
||||
common_args = [
|
||||
"--disable-log-stats",
|
||||
"--dtype",
|
||||
"bfloat16",
|
||||
"--enable-prefix-caching",
|
||||
"--quantization",
|
||||
"bitsandbytes",
|
||||
"--gpu-memory-utilization",
|
||||
"0.7",
|
||||
]
|
||||
pp_args = [
|
||||
*common_args,
|
||||
"--pipeline-parallel-size",
|
||||
"2",
|
||||
]
|
||||
compare_two_settings(model_name, common_args, pp_args)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("bitsandbytes"),
|
||||
reason="bitsandbytes is not supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize("model_name, description", models_4bit_to_moe_test)
|
||||
def test_4bit_bnb_moe_model(
|
||||
hf_runner, vllm_runner, example_prompts, model_name, description
|
||||
) -> None:
|
||||
hf_model_kwargs = dict(
|
||||
quantization_config=BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
bnb_4bit_use_double_quant=True,
|
||||
)
|
||||
)
|
||||
with vllm_runner(
|
||||
model_name,
|
||||
quantization="bitsandbytes",
|
||||
enforce_eager=False,
|
||||
default_torch_num_threads=1,
|
||||
) as llm:
|
||||
vllm_outputs = llm.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens=32, num_logprobs=5
|
||||
)
|
||||
|
||||
with hf_runner(
|
||||
model_name, model_kwargs=hf_model_kwargs, default_torch_num_threads=1
|
||||
) as llm:
|
||||
transformers_outputs = llm.generate_greedy_logprobs_limit(
|
||||
example_prompts, max_tokens=32, num_logprobs=5
|
||||
)
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=transformers_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="transformers",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("bitsandbytes"),
|
||||
reason="bitsandbytes is not supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize("model_name, description", models_4bit_to_embedding_test)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_4bit_bnb_embedding_model(
|
||||
model_name,
|
||||
description,
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
# The example_prompts has ending "\n", for example:
|
||||
# "Write a short story about a robot that dreams for the first time.\n"
|
||||
# sentence_transformers will strip the input texts, see:
|
||||
# https://github.com/UKPLab/sentence-transformers/blob/v3.1.1/sentence_transformers/models/Transformer.py#L159
|
||||
# This makes the input_ids different between hf_model and vllm_model.
|
||||
# So we need to strip the input texts to avoid test failing.
|
||||
example_prompts = [str(s).strip() for s in example_prompts]
|
||||
|
||||
# Inflight 4bit quantization
|
||||
with vllm_runner(
|
||||
model_name,
|
||||
runner="pooling",
|
||||
dtype=dtype,
|
||||
gpu_memory_utilization=0.5,
|
||||
quantization="bitsandbytes",
|
||||
default_torch_num_threads=1,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.embed(example_prompts)
|
||||
|
||||
hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(load_in_4bit=True))
|
||||
with hf_runner(
|
||||
model_name,
|
||||
dtype=dtype,
|
||||
model_kwargs=hf_model_kwargs,
|
||||
is_sentence_transformer=True,
|
||||
default_torch_num_threads=1,
|
||||
) as hf_model:
|
||||
hf_outputs = hf_model.encode(example_prompts)
|
||||
|
||||
check_embeddings_close(
|
||||
embeddings_0_lst=hf_outputs,
|
||||
embeddings_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
tol=5e-2,
|
||||
)
|
||||
|
||||
|
||||
def log_generated_texts(prompts, outputs, runner_name):
|
||||
logged_texts = []
|
||||
for i, (_, generated_text) in enumerate(outputs):
|
||||
log_entry = {
|
||||
"prompt": prompts[i],
|
||||
"runner_name": runner_name,
|
||||
"generated_text": generated_text,
|
||||
}
|
||||
logged_texts.append(log_entry)
|
||||
return logged_texts
|
||||
|
||||
|
||||
def validate_generated_texts(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
prompts,
|
||||
model_name,
|
||||
pre_quant=False,
|
||||
hf_model_kwargs=None,
|
||||
vllm_tp_size=1,
|
||||
max_tokens=8,
|
||||
):
|
||||
# NOTE: run vLLM first, as it requires a clean process
|
||||
# when using distributed inference
|
||||
with vllm_runner(
|
||||
model_name,
|
||||
quantization=None if pre_quant else "bitsandbytes",
|
||||
tensor_parallel_size=vllm_tp_size,
|
||||
enforce_eager=False,
|
||||
default_torch_num_threads=1,
|
||||
tokenizer_mode="hf",
|
||||
load_format="hf",
|
||||
config_format="hf",
|
||||
) as llm:
|
||||
vllm_outputs = llm.generate_greedy(prompts, max_tokens)
|
||||
vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")
|
||||
|
||||
if hf_model_kwargs is None:
|
||||
hf_model_kwargs = {}
|
||||
|
||||
# Run with HF runner
|
||||
with hf_runner(
|
||||
model_name, model_kwargs=hf_model_kwargs, default_torch_num_threads=1
|
||||
) as llm:
|
||||
hf_outputs = llm.generate_greedy(prompts, max_tokens)
|
||||
hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")
|
||||
|
||||
# Compare the generated strings
|
||||
for hf_log, vllm_log in zip(hf_logs, vllm_logs):
|
||||
hf_str = hf_log["generated_text"]
|
||||
vllm_str = vllm_log["generated_text"]
|
||||
prompt = hf_log["prompt"]
|
||||
assert hf_str == vllm_str, (
|
||||
f"Model: {model_name}"
|
||||
f"Mismatch between HF and vLLM outputs:\n"
|
||||
f"Prompt: {prompt}\n"
|
||||
f"HF Output: '{hf_str}'\n"
|
||||
f"vLLM Output: '{vllm_str}'"
|
||||
)
|
||||
172
tests/models/quantization/test_fp8.py
Normal file
172
tests/models/quantization/test_fp8.py
Normal file
@@ -0,0 +1,172 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# flake8: noqa
|
||||
"""Tests fp8 models against ground truth generation
|
||||
Note: these tests will only pass on L4 GPU.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
from vllm.attention.utils.fa_utils import flash_attn_supports_fp8
|
||||
from vllm.platforms import current_platform
|
||||
from ..utils import check_logprobs_close
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("fp8"),
|
||||
reason="fp8 is not supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"kv_cache_dtype,base_model,test_model",
|
||||
[
|
||||
# Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
|
||||
(
|
||||
"fp8_e4m3",
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
"nm-testing/Llama-3.2-1B-Instruct-FP8-KV",
|
||||
),
|
||||
# Test BF16 checkpoint w. fp8_e5m2 kv-cache.
|
||||
(
|
||||
"fp8_e5m2",
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
),
|
||||
# Test BF16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
|
||||
(
|
||||
"fp8_e4m3",
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
),
|
||||
],
|
||||
)
|
||||
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
|
||||
@pytest.mark.parametrize("max_tokens", [4])
|
||||
@pytest.mark.parametrize("enforce_eager", [True])
|
||||
@pytest.mark.parametrize("backend", ["FLASH_ATTN"])
|
||||
# NOTE: Increasing this in this suite will fail CI because we currently cannot
|
||||
# reset distributed env properly. Use a value > 1 just when you test.
|
||||
@pytest.mark.parametrize("tensor_parallel_size", [1])
|
||||
def test_models(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
kv_cache_dtype: str,
|
||||
base_model: str,
|
||||
test_model: str,
|
||||
max_tokens: int,
|
||||
enforce_eager: bool,
|
||||
backend: str,
|
||||
tensor_parallel_size: int,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
"""
|
||||
Only checks log probs match to cover the discrepancy in
|
||||
numerical sensitive kernels.
|
||||
"""
|
||||
|
||||
if kv_cache_dtype == "fp8_e5m2" and current_platform.is_rocm():
|
||||
pytest.skip(f"{kv_cache_dtype} is currently not supported on ROCm/HIP.")
|
||||
|
||||
if not flash_attn_supports_fp8():
|
||||
pytest.skip(
|
||||
f"{kv_cache_dtype} is not supported on this GPU type with {backend} attention."
|
||||
)
|
||||
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("TOKENIZERS_PARALLELISM", "true")
|
||||
m.setenv("VLLM_ATTENTION_BACKEND", backend)
|
||||
|
||||
MAX_MODEL_LEN = 1024
|
||||
NUM_LOG_PROBS = 8
|
||||
|
||||
with vllm_runner(
|
||||
base_model,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
enforce_eager=enforce_eager,
|
||||
kv_cache_dtype="auto",
|
||||
) as vllm_model:
|
||||
baseline_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, NUM_LOG_PROBS
|
||||
)
|
||||
|
||||
with vllm_runner(
|
||||
test_model,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
enforce_eager=enforce_eager,
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
) as vllm_model:
|
||||
test_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, NUM_LOG_PROBS
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=baseline_outputs,
|
||||
outputs_1_lst=test_outputs,
|
||||
name_0="fp16_kv_cache",
|
||||
name_1="fp8_kv_cache",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.cpu_model
|
||||
@pytest.mark.skipif(not current_platform.is_cpu(), reason="test for the CPU backend.")
|
||||
@pytest.mark.parametrize(
|
||||
"kv_cache_dtype,base_model,test_model",
|
||||
[
|
||||
# Test BF16 checkpoint w. fp8_e5m2 kv-cache.
|
||||
(
|
||||
"fp8_e5m2",
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
"meta-llama/Llama-3.2-1B-Instruct",
|
||||
),
|
||||
],
|
||||
)
|
||||
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
|
||||
@pytest.mark.parametrize("max_tokens", [4])
|
||||
def test_cpu_models(
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
kv_cache_dtype: str,
|
||||
base_model: str,
|
||||
test_model: str,
|
||||
max_tokens: int,
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
"""
|
||||
Only checks log probs match to cover the discrepancy in
|
||||
numerical sensitive kernels.
|
||||
"""
|
||||
with monkeypatch.context() as m:
|
||||
m.setenv("TOKENIZERS_PARALLELISM", "true")
|
||||
|
||||
MAX_MODEL_LEN = 1024
|
||||
NUM_LOG_PROBS = 8
|
||||
|
||||
with vllm_runner(
|
||||
base_model,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
dtype="bfloat16",
|
||||
kv_cache_dtype="auto",
|
||||
) as vllm_model:
|
||||
baseline_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, NUM_LOG_PROBS
|
||||
)
|
||||
|
||||
with vllm_runner(
|
||||
test_model,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
dtype="bfloat16",
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
) as vllm_model:
|
||||
test_outputs = vllm_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, NUM_LOG_PROBS
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=baseline_outputs,
|
||||
outputs_1_lst=test_outputs,
|
||||
name_0="bf16_kv_cache",
|
||||
name_1="fp8_kv_cache",
|
||||
)
|
||||
204
tests/models/quantization/test_gguf.py
Normal file
204
tests/models/quantization/test_gguf.py
Normal file
@@ -0,0 +1,204 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Tests gguf models against unquantized models generations
|
||||
Note: To pass the test, quantization higher than Q4 should be used
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import NamedTuple
|
||||
|
||||
import pytest
|
||||
from huggingface_hub import hf_hub_download
|
||||
from pytest import MarkDecorator
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
|
||||
from ...conftest import VllmRunner
|
||||
from ...utils import multi_gpu_test
|
||||
from ..utils import check_logprobs_close
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
||||
|
||||
MAX_MODEL_LEN = 1024
|
||||
|
||||
|
||||
class GGUFTestConfig(NamedTuple):
|
||||
original_model: str
|
||||
gguf_repo: str
|
||||
gguf_filename: str
|
||||
marks: list[MarkDecorator] = []
|
||||
|
||||
@property
|
||||
def gguf_model(self):
|
||||
return hf_hub_download(self.gguf_repo, filename=self.gguf_filename)
|
||||
|
||||
|
||||
LLAMA_CONFIG = GGUFTestConfig(
|
||||
original_model="meta-llama/Llama-3.2-1B-Instruct",
|
||||
gguf_repo="bartowski/Llama-3.2-1B-Instruct-GGUF",
|
||||
gguf_filename="Llama-3.2-1B-Instruct-Q6_K.gguf",
|
||||
)
|
||||
|
||||
QWEN2_CONFIG = GGUFTestConfig(
|
||||
original_model="Qwen/Qwen2.5-1.5B-Instruct",
|
||||
gguf_repo="Qwen/Qwen2.5-1.5B-Instruct-GGUF",
|
||||
gguf_filename="qwen2.5-1.5b-instruct-q6_k.gguf",
|
||||
)
|
||||
|
||||
QWEN3_CONFIG = GGUFTestConfig(
|
||||
original_model="Qwen/Qwen3-0.6B",
|
||||
gguf_repo="unsloth/Qwen3-0.6B-GGUF",
|
||||
gguf_filename="Qwen3-0.6B-BF16.gguf",
|
||||
)
|
||||
|
||||
PHI3_CONFIG = GGUFTestConfig(
|
||||
original_model="microsoft/Phi-3.5-mini-instruct",
|
||||
gguf_repo="bartowski/Phi-3.5-mini-instruct-GGUF",
|
||||
gguf_filename="Phi-3.5-mini-instruct-IQ4_XS.gguf",
|
||||
)
|
||||
|
||||
GPT2_CONFIG = GGUFTestConfig(
|
||||
original_model="openai-community/gpt2-large",
|
||||
gguf_repo="QuantFactory/gpt2-large-GGUF",
|
||||
gguf_filename="gpt2-large.Q4_K_M.gguf",
|
||||
)
|
||||
|
||||
STABLELM_CONFIG = GGUFTestConfig(
|
||||
original_model="stabilityai/stablelm-3b-4e1t",
|
||||
gguf_repo="afrideva/stablelm-3b-4e1t-GGUF",
|
||||
gguf_filename="stablelm-3b-4e1t.q4_k_m.gguf",
|
||||
)
|
||||
|
||||
STARCODER_CONFIG = GGUFTestConfig(
|
||||
original_model="bigcode/starcoder2-3b",
|
||||
gguf_repo="QuantFactory/starcoder2-3b-GGUF",
|
||||
gguf_filename="starcoder2-3b.Q6_K.gguf",
|
||||
)
|
||||
|
||||
DOLPHIN_CONFIG = GGUFTestConfig(
|
||||
# Test VocabParallelEmbedding sharding issue.
|
||||
original_model="cognitivecomputations/TinyDolphin-2.8-1.1b",
|
||||
gguf_repo="tsunemoto/TinyDolphin-2.8-1.1b-GGUF",
|
||||
gguf_filename="tinydolphin-2.8-1.1b.Q6_K.gguf",
|
||||
)
|
||||
|
||||
GEMMA3_CONFIG = GGUFTestConfig(
|
||||
original_model="google/gemma-3-270m-it",
|
||||
gguf_repo="ggml-org/gemma-3-270m-it-qat-GGUF",
|
||||
gguf_filename="gemma-3-270m-it-qat-Q4_0.gguf",
|
||||
)
|
||||
|
||||
MODELS = [
|
||||
# LLAMA_CONFIG, # broken: https://github.com/vllm-project/vllm/issues/19458
|
||||
QWEN2_CONFIG,
|
||||
QWEN3_CONFIG,
|
||||
PHI3_CONFIG,
|
||||
GPT2_CONFIG,
|
||||
STABLELM_CONFIG,
|
||||
DOLPHIN_CONFIG,
|
||||
GEMMA3_CONFIG,
|
||||
# STARCODER_CONFIG, # broken
|
||||
]
|
||||
|
||||
|
||||
def check_model_outputs(
|
||||
vllm_runner: type[VllmRunner],
|
||||
prompts: list[str],
|
||||
model: GGUFTestConfig,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
tp_size: int,
|
||||
):
|
||||
tokenizer = AutoTokenizer.from_pretrained(model.original_model)
|
||||
if tokenizer.chat_template is not None:
|
||||
messages = [[{"role": "user", "content": prompt}] for prompt in prompts]
|
||||
prompts = tokenizer.apply_chat_template(
|
||||
messages, tokenize=False, add_generation_prompt=True
|
||||
)
|
||||
|
||||
# Run gguf model.
|
||||
with vllm_runner(
|
||||
model_name=model.gguf_model,
|
||||
enforce_eager=True,
|
||||
tokenizer_name=model.original_model,
|
||||
dtype=dtype,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
tensor_parallel_size=tp_size,
|
||||
) as gguf_model:
|
||||
gguf_outputs = gguf_model.generate_greedy_logprobs(
|
||||
prompts[:-1], max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
# Run unquantized model.
|
||||
# Should run with tp=1, otherwise the test will stuck at
|
||||
# nccl initialization.
|
||||
with vllm_runner(
|
||||
model_name=model.original_model,
|
||||
enforce_eager=True, # faster tests
|
||||
dtype=dtype,
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
tensor_parallel_size=1,
|
||||
) as original_model:
|
||||
original_outputs = original_model.generate_greedy_logprobs(
|
||||
prompts[:-1], max_tokens, num_logprobs
|
||||
)
|
||||
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=original_outputs,
|
||||
outputs_1_lst=gguf_outputs,
|
||||
name_0="original",
|
||||
name_1="gguf",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("gguf"),
|
||||
reason="gguf is not supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"model",
|
||||
[pytest.param(test_config, marks=test_config.marks) for test_config in MODELS],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("tp_size", [1])
|
||||
def test_models(
|
||||
vllm_runner: type[VllmRunner],
|
||||
example_prompts: list[str],
|
||||
model: GGUFTestConfig,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
tp_size: int,
|
||||
) -> None:
|
||||
check_model_outputs(
|
||||
vllm_runner, example_prompts, model, dtype, max_tokens, num_logprobs, tp_size
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("gguf"),
|
||||
reason="gguf is not supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize("model", [LLAMA_CONFIG])
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [8])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("tp_size", [2])
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
def test_distributed(
|
||||
vllm_runner: type[VllmRunner],
|
||||
example_prompts: list[str],
|
||||
model: GGUFTestConfig,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
tp_size: int,
|
||||
) -> None:
|
||||
check_model_outputs(
|
||||
vllm_runner, example_prompts, model, dtype, max_tokens, num_logprobs, tp_size
|
||||
)
|
||||
80
tests/models/quantization/test_gpt_oss_attn_quantization.py
Normal file
80
tests/models/quantization/test_gpt_oss_attn_quantization.py
Normal file
@@ -0,0 +1,80 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Test attention quantization of gpt-oss model.
|
||||
The qkv_proj and o_proj in self_attention can be either quantized or excluded.
|
||||
|
||||
Run `pytest tests/models/quantization/test_gpt_oss_attn_quantization.py`.
|
||||
|
||||
"""
|
||||
|
||||
import importlib
|
||||
import importlib.metadata
|
||||
from dataclasses import dataclass
|
||||
|
||||
import huggingface_hub
|
||||
import lm_eval
|
||||
import pytest
|
||||
from packaging import version
|
||||
|
||||
MODEL_NAMES = ["amd/gpt-oss-20b-customized-attention-quantization"]
|
||||
|
||||
QUARK_MXFP4_AVAILABLE = importlib.util.find_spec("quark") is not None and version.parse(
|
||||
importlib.metadata.version("amd-quark")
|
||||
) >= version.parse("0.8.99")
|
||||
|
||||
|
||||
def has_huggingface_access(repo):
|
||||
try:
|
||||
huggingface_hub.list_repo_refs(repo)
|
||||
return True
|
||||
except huggingface_hub.errors.RepositoryNotFoundError:
|
||||
return False
|
||||
|
||||
|
||||
HF_HUB_AMD_ORG_ACCESS = all(
|
||||
[has_huggingface_access(model_name) for model_name in MODEL_NAMES]
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelCase:
|
||||
model_id: str
|
||||
tp: int
|
||||
|
||||
|
||||
@dataclass
|
||||
class EvaluationConfig:
|
||||
model_name: str
|
||||
|
||||
def get_model_args(self) -> str:
|
||||
return (
|
||||
f"pretrained={self.model_name},"
|
||||
"tensor_parallel_size=4,dtype=auto,gpu_memory_utilization=0.9,trust_remote_code=False"
|
||||
)
|
||||
|
||||
|
||||
EXPECTED_ACCURACIES = {"arc_challenge": 0.20}
|
||||
|
||||
|
||||
@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE, reason="amd-quark>=0.9 is not available")
@pytest.mark.skipif(
    not HF_HUB_AMD_ORG_ACCESS,
    reason="Read access to huggingface.co/amd is required for this test.",
)
@pytest.mark.parametrize("model_name", MODEL_NAMES)
@pytest.mark.parametrize("task_name, expected_accuracy", EXPECTED_ACCURACIES.items())
def test_gpt_oss_attention_quantization(
    model_name: str, task_name: str, expected_accuracy: float
):
    """Run lm-eval on the quantized-attention gpt-oss model and check accuracy."""
    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=EvaluationConfig(model_name).get_model_args(),
        tasks=task_name,
        batch_size="auto",
    )
    measured_accuracy = results["results"][task_name]["acc,none"]

    # Accept any measurement strictly within +/- rtol of the target
    # (equivalent to the original two-sided comparison).
    rtol = 0.05
    assert abs(measured_accuracy - expected_accuracy) < rtol, (
        f"Expected: {expected_accuracy} | Measured: {measured_accuracy}"
    )
|
||||
64
tests/models/quantization/test_gptq_bitblas.py
Normal file
64
tests/models/quantization/test_gptq_bitblas.py
Normal file
@@ -0,0 +1,64 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Compare the outputs of a GPTQ model to a bitblas model.
|
||||
|
||||
Note: GPTQ and bitblas do not have bitwise correctness.
|
||||
As a result, in this test, we just confirm that the top selected tokens of the
|
||||
bitblas/GPTQ models are in the top 3 selections of each other.
|
||||
|
||||
Note: bitblas internally uses locks to synchronize the threads. This can
|
||||
result in very slight nondeterminism for bitblas. As a result, we re-run the
|
||||
test up to 3 times to see if we pass.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
import pytest
|
||||
|
||||
from ..utils import check_logprobs_close
|
||||
|
||||
|
||||
@dataclass
class ModelPair:
    # GPTQ checkpoint evaluated with both the "bitblas" and "gptq" backends.
    model_gptq: str
|
||||
|
||||
# The same GPTQ checkpoint is run through both backends in the test below.
model_pairs = [
    ModelPair(model_gptq="hxbgsyxh/opt-125m-4bit-128g"),
]
|
||||
|
||||
|
||||
@pytest.mark.flaky(reruns=2)
@pytest.mark.skipif(True, reason="BitBLAS takes too much time for tuning.")
@pytest.mark.parametrize("model_pair", model_pairs)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
    vllm_runner,
    example_prompts,
    model_pair: ModelPair,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Check that bitblas and plain-gptq backends agree on top logprobs."""
    # Run the same checkpoint through the bitblas backend first, ...
    with vllm_runner(
        model_pair.model_gptq, dtype=dtype, quantization="bitblas"
    ) as runner:
        outputs_bitblas = runner.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs
        )

    # ... then through the reference gptq backend.
    with vllm_runner(
        model_pair.model_gptq, dtype=dtype, quantization="gptq"
    ) as runner:
        outputs_gptq = runner.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs
        )

    check_logprobs_close(
        outputs_0_lst=outputs_gptq,
        outputs_1_lst=outputs_bitblas,
        name_0="gptq",
        name_1="gptq_bitblas",
    )
|
||||
93
tests/models/quantization/test_gptq_marlin.py
Normal file
93
tests/models/quantization/test_gptq_marlin.py
Normal file
@@ -0,0 +1,93 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Compares the outputs of gptq vs gptq_marlin.
|
||||
|
||||
Note: GPTQ and Marlin do not have bitwise correctness.
|
||||
As a result, in this test, we just confirm that the top selected tokens of the
|
||||
Marlin/GPTQ models are in the top 5 selections of each other.
|
||||
Note: Marlin internally uses locks to synchronize the threads. This can
|
||||
result in very slight nondeterminism for Marlin. As a result, we re-run the test
|
||||
up to 3 times to see if we pass.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ..utils import check_logprobs_close
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
||||
|
||||
# Cap context length so the small test models load quickly.
MAX_MODEL_LEN = 1024

# (model repo, revision) pairs covering different GPTQ configurations.
MODELS = [
    # act_order==True, group_size=128
    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"),
    # 8-bit, act_order==True, group_size=channelwise
    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"),
    # 4-bit, act_order==True, group_size=128
    ("TechxGenus/gemma-1.1-2b-it-GPTQ", "main"),
]
|
||||
|
||||
|
||||
@pytest.mark.flaky(reruns=3)
@pytest.mark.skipif(
    not is_quant_method_supported("gptq_marlin")
    or current_platform.is_rocm()
    or not current_platform.is_cuda(),
    reason="gptq_marlin is not supported on this GPU type.",
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half", "bfloat16"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
    vllm_runner,
    example_prompts,
    model,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Check that the marlin kernel agrees with the gptq kernel on top logprobs."""
    model_name, revision = model
    prompts = example_prompts[:-1]

    # Marlin run (exercises both fp16 and bf16 via the dtype parametrization).
    with vllm_runner(
        model_name=model_name,
        revision=revision,
        dtype=dtype,
        quantization="marlin",
        max_model_len=MAX_MODEL_LEN,
        tensor_parallel_size=1,
    ) as runner:
        gptq_marlin_outputs = runner.generate_greedy_logprobs(
            prompts, max_tokens, num_logprobs
        )
    _ROPE_DICT.clear()  # clear rope cache to avoid rope dtype error

    # Reference gptq run. The naive gptq kernel doesn't support bf16 yet, so
    # the fp16/bf16 marlin kernel is always compared against the fp16 gptq
    # kernel.
    with vllm_runner(
        model_name=model_name,
        revision=revision,
        dtype="half",
        quantization="gptq",
        max_model_len=MAX_MODEL_LEN,
        tensor_parallel_size=1,
    ) as runner:
        gptq_outputs = runner.generate_greedy_logprobs(
            prompts, max_tokens, num_logprobs
        )

    check_logprobs_close(
        outputs_0_lst=gptq_outputs,
        outputs_1_lst=gptq_marlin_outputs,
        name_0="gptq",
        name_1="gptq_marlin",
    )
|
||||
84
tests/models/quantization/test_gptq_marlin_24.py
Normal file
84
tests/models/quantization/test_gptq_marlin_24.py
Normal file
@@ -0,0 +1,84 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Compare the outputs of a GPTQ model to a Marlin_24 model.
|
||||
|
||||
Note: GPTQ and Marlin_24 do not have bitwise correctness.
|
||||
As a result, in this test, we just confirm that the top selected tokens of the
|
||||
Marlin/GPTQ models are in the top 3 selections of each other.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ..utils import check_logprobs_close
|
||||
|
||||
|
||||
@dataclass
class ModelPair:
    # A marlin_24 (2:4 sparse) checkpoint and its plain-GPTQ counterpart.
    model_marlin: str
    model_gptq: str
|
||||
|
||||
|
||||
# Equivalent checkpoint pairs: one exported for marlin_24, one as plain GPTQ.
# The channelwise variants are currently disabled.
model_pairs = [
    # 4-bit, group_size == 128
    ModelPair(
        model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128",
        model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128",
    ),
    # # 4-bit, group_size == channelwise
    # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
    # model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),
    # 8-bit, group_size == 128
    ModelPair(
        model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128",
        model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128",
    ),
    # # 8-bit, group_size == channelwise
    # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
    # model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
]
|
||||
|
||||
|
||||
@pytest.mark.flaky(reruns=2)
@pytest.mark.skipif(
    not is_quant_method_supported("gptq_marlin_24")
    or current_platform.is_rocm()
    or not current_platform.is_cuda(),
    reason="Marlin24 is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_pair", model_pairs)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [8])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
    vllm_runner,
    example_prompts,
    model_pair: ModelPair,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Check that the marlin_24 kernel agrees with plain GPTQ on top logprobs."""
    # Marlin_24 run on the sparse checkpoint.
    with vllm_runner(
        model_pair.model_marlin, dtype=dtype, quantization="gptq_marlin_24"
    ) as runner:
        marlin_24_outputs = runner.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs
        )

    # Reference gptq run on the matching dense checkpoint.
    with vllm_runner(
        model_pair.model_gptq, dtype=dtype, quantization="gptq"
    ) as runner:
        gptq_outputs = runner.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs
        )

    check_logprobs_close(
        outputs_0_lst=gptq_outputs,
        outputs_1_lst=marlin_24_outputs,
        name_0="gptq",
        name_1="marlin_24",
    )
|
||||
84
tests/models/quantization/test_modelopt.py
Normal file
84
tests/models/quantization/test_modelopt.py
Normal file
@@ -0,0 +1,84 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# flake8: noqa
|
||||
"""Tests Model Optimizer fp8 models against ground truth generation
|
||||
Note: these tests will only pass on H100
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
||||
|
||||
# Cap context length to keep memory usage and load time down.
MAX_MODEL_LEN = 1024

# Model Optimizer fp8 checkpoints under test.
MODELS = ["nvidia/Llama-3.1-8B-Instruct-FP8"]

# Golden outputs (first 20 tokens, greedy) per model. These depend on the
# exact fp8 implementation and hardware; regenerate if either changes.
EXPECTED_STRS_MAP = {
    "nvidia/Llama-3.1-8B-Instruct-FP8": [
        "You're referring to VLLM, a high-performance Large Language Model (LLM) inference and",
        "Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ",
        "The comparison between artificial intelligence (AI) and human intelligence in terms of processing information is a complex and",
        'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne',
        "**The Spark of Imagination**\n\nZeta-5, a sleek and efficient robot, whir",
        "The COVID-19 pandemic has had a profound impact on global economic structures and business models, leading to",
        "The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of",
        "Here are the translations:\n\n**Japanese:** 「早起きは早く獲物をとる",
    ]
}
|
||||
|
||||
|
||||
# This test compares against golden strings for exact match since
# there is no baseline implementation to compare against
# and is unstable w.r.t specifics of the fp8 implementation or
# the hardware being run on.
# Disabled to prevent it from breaking the build
@pytest.mark.skip(
    reason="Prevent unstable test based on golden strings from breaking the build."
)
@pytest.mark.skipif(
    not is_quant_method_supported("fp8"),
    reason="fp8 is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_name", MODELS)
def test_models(example_prompts, model_name) -> None:
    """Generate with the modelopt fp8 model and compare to golden strings."""
    llm = LLM(
        model=model_name,
        max_model_len=MAX_MODEL_LEN,
        trust_remote_code=True,
        enforce_eager=True,
        quantization="modelopt",
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    formatted_prompts = [
        tokenizer.apply_chat_template(
            [{"role": "user", "content": prompt}],
            tokenize=False,
            add_generation_prompt=True,
        )
        for prompt in example_prompts
    ]
    params = SamplingParams(max_tokens=20, temperature=0)

    # Note: these need to be run 1 at a time due to numerical precision,
    # since the expected strs were generated this way.
    generations: list[str] = []
    for formatted_prompt in formatted_prompts:
        request_outputs = llm.generate(formatted_prompt, params)
        generations.append(request_outputs[0].outputs[0].text)
    del llm

    print(model_name, generations)
    expected_strs = EXPECTED_STRS_MAP[model_name]
    for i in range(len(example_prompts)):
        assert expected_strs[i] == generations[i], (
            f"Test{i}:\nExpected: {expected_strs[i]!r}\nvLLM: {generations[i]!r}"
        )
|
||||
42
tests/models/quantization/test_mxfp4.py
Normal file
42
tests/models/quantization/test_mxfp4.py
Normal file
@@ -0,0 +1,42 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# flake8: noqa
|
||||
"""Tests Quark mxfp4 models against ground truth generation"""
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
# Quark mxfp4 checkpoint under test (fp8 KV cache, uint8 scales).
MODELS = ["amd/Llama-2-7b-chat-hf-wmxfp4-amxfp4-kvfp8-scale-uint8"]

# Golden outputs (first 20 tokens, greedy) per model.
EXPECTED_STRS_MAP = {
    "amd/Llama-2-7b-chat-hf-wmxfp4-amxfp4-kvfp8-scale-uint8": [
        "\n### Key Features\n\n* **High-throughput Inference**: vLL",
        "\nArtificial intelligence (AI) has evolved significantly since its inception in the 1",
        "Artificial intelligence (AI) and human intelligence (HI) are two distinct concepts that have been",
        "A neural network is a machine learning model inspired by the structure of the human brain. It consists of",
        "\nTitle: The Dreaming Robot\n\nAs the sun set on the bustling metropol",
        "\nThe COVID-19 pandemic has had a profound impact on global economic structures and business",
        "The Mona Lisa painting, created by Leonardo da Vinci in the early 16th",
        " everybody knows this proverbial saying, but did you know that it's not entirely accurate?",
    ]
}
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Model to be released in the future")
@pytest.mark.quant_model
@pytest.mark.parametrize("model_name", MODELS)
def test_models(example_prompts, model_name) -> None:
    """Generate with the quark mxfp4 model and compare to golden strings."""
    sampling_params = SamplingParams(max_tokens=20, temperature=0)
    llm = LLM(
        model=model_name,
        kv_cache_dtype="fp8",
        quantization="quark",
    )
    outputs = llm.generate(example_prompts, sampling_params)
    expected_strs = EXPECTED_STRS_MAP[model_name]
    for i, request_output in enumerate(outputs):
        generated_str = request_output.outputs[0].text
        assert expected_strs[i] == generated_str, (
            f"Expected: {expected_strs[i]!r}\nvLLM: {generated_str!r}"
        )
|
||||
85
tests/models/quantization/test_nvfp4.py
Normal file
85
tests/models/quantization/test_nvfp4.py
Normal file
@@ -0,0 +1,85 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# flake8: noqa
|
||||
"""Tests Model Optimizer nvfp4 models against ground truth generation
|
||||
Note: these tests will only pass on B200
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import List
|
||||
|
||||
import pytest
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
||||
|
||||
# Cap context length to keep memory usage and load time down.
MAX_MODEL_LEN = 1024

# Model Optimizer nvfp4 checkpoints under test.
MODELS = ["nvidia/Llama-3.3-70B-Instruct-FP4"]

# Golden outputs (first 20 tokens, greedy) per model. These depend on the
# exact fp4 implementation and hardware; regenerate if either changes.
EXPECTED_STRS_MAP = {
    "nvidia/Llama-3.3-70B-Instruct-FP4": [
        "vLLM (Vectorized Large Language Model) is indeed a high-throughput and memory-efficient inference",
        "Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ",
        "Artificial intelligence (AI) and human intelligence (HI) are two distinct forms of intelligence that process",
        "A neural network is a type of machine learning model inspired by the structure and function of the human brain",
        "In the heart of a cutting-edge robotics lab, a team of engineers had been working tirelessly to push",
        "The COVID-19 pandemic has had a profound impact on global economic structures and future business models, leading",
        "The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of",
        "Here are the translations:\n\n* Japanese: (Sasuga no tori ga miwa o ts",
    ]
}
|
||||
|
||||
|
||||
# This test compares against golden strings for exact match since
# there is no baseline implementation to compare against
# and is unstable w.r.t specifics of the fp4 implementation or
# the hardware being run on.
# Disabled to prevent it from breaking the build
@pytest.mark.skip(
    reason="Prevent unstable test based on golden strings from breaking the build "
    " and test input model being too large and hanging the system."
)
@pytest.mark.skipif(
    not is_quant_method_supported("modelopt_fp4"),
    reason="modelopt_fp4 is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_name", MODELS)
def test_models(example_prompts, model_name) -> None:
    """Generate with the modelopt nvfp4 model and compare to golden strings."""
    llm = LLM(
        model=model_name,
        max_model_len=MAX_MODEL_LEN,
        trust_remote_code=True,
        enforce_eager=True,
        quantization="modelopt_fp4",
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    formatted_prompts = [
        tokenizer.apply_chat_template(
            [{"role": "user", "content": prompt}],
            tokenize=False,
            add_generation_prompt=True,
        )
        for prompt in example_prompts
    ]
    params = SamplingParams(max_tokens=20, temperature=0)
    # Builtin generic (PEP 585) instead of the legacy typing.List, matching
    # the sibling modelopt fp8 test in this directory.
    generations: list[str] = []
    # Note: these need to be run 1 at a time due to numerical precision,
    # since the expected strs were generated this way.
    for prompt in formatted_prompts:
        outputs = llm.generate(prompt, params)
        generations.append(outputs[0].outputs[0].text)
    del llm

    print(model_name, generations)
    expected_strs = EXPECTED_STRS_MAP[model_name]
    for i in range(len(example_prompts)):
        generated_str = generations[i]
        expected_str = expected_strs[i]
        assert expected_str == generated_str, (
            f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}"
        )
|
||||
Reference in New Issue
Block a user