Sync from v0.13
This commit is contained in:
0
tests/quantization/__init__.py
Normal file
0
tests/quantization/__init__.py
Normal file
32
tests/quantization/fp_quant.py
Normal file
32
tests/quantization/fp_quant.py
Normal file
@@ -0,0 +1,32 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Test model set-up and inference for quantized HF models supported
on the GPU backend using FPQuant.

Validating the configuration and printing results for manual checking.

Run `pytest tests/quantization/test_fp_quant.py`.
"""

import pytest

from tests.quantization.utils import is_quant_method_supported

MODELS = [
    "ISTA-DASLab/Qwen3-0.6B-RTN-NVFP4",
    "ISTA-DASLab/Qwen3-0.6B-RTN-MXFP4",
]
# NOTE(review): DTYPE is not referenced by any parametrization below —
# confirm whether a dtype sweep was intended.
DTYPE = ["bfloat16"]
EAGER = [True, False]


@pytest.mark.skipif(
    not is_quant_method_supported("fp_quant"),
    reason="FPQuant is not supported on this GPU type.",
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("eager", EAGER)
def test_fpquant(vllm_runner, model, eager):
    """Greedy-decode a counting prompt and expect the continuation token."""
    with vllm_runner(model, enforce_eager=eager) as llm:
        generated = llm.generate_greedy(["1 2 3 4 5"], max_tokens=2)
        assert generated[0][1] == "1 2 3 4 5 6"
|
||||
292
tests/quantization/reference_mxfp4.py
Normal file
292
tests/quantization/reference_mxfp4.py
Normal file
@@ -0,0 +1,292 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import torch
|
||||
|
||||
BFLOAT16_EXP_BIAS = 127
|
||||
BFLOAT16_MANTISSA_BITS = 7
|
||||
BFLOAT16_EXP_BITS = 8
|
||||
|
||||
FLOAT16_EXP_BIAS = 15
|
||||
FLOAT16_MANTISSA_BITS = 10
|
||||
FLOAT16_EXP_BITS = 5
|
||||
|
||||
FLOAT8_E8M0_MAX_EXP = 127
|
||||
FLOAT4_EXP_BIAS = 1
|
||||
FLOAT4_MANTISSA_BITS = 1
|
||||
|
||||
FLOAT16_VAL_TO_ADD = 1 << (FLOAT16_MANTISSA_BITS - FLOAT4_MANTISSA_BITS - 1)
|
||||
FLOAT16_SIGN_EXPONENT_MASK = (
|
||||
(1 << (FLOAT16_EXP_BITS + 1)) - 1
|
||||
) << FLOAT16_MANTISSA_BITS
|
||||
|
||||
BFLOAT16_VAL_TO_ADD = 1 << (BFLOAT16_MANTISSA_BITS - FLOAT4_MANTISSA_BITS - 1)
|
||||
BFLOAT16_SIGN_EXPONENT_MASK = (
|
||||
(1 << (BFLOAT16_EXP_BITS + 1)) - 1
|
||||
) << BFLOAT16_MANTISSA_BITS
|
||||
|
||||
|
||||
def e8m0_to_half(scale, half_dtype: torch.dtype):
    """Decode e8m0 (biased power-of-two) scales into fp16/bf16.

    Each uint8 entry stores an exponent with bias 127, so the decoded
    value is ``2 ** (scale - 127)``.
    """
    assert scale.dtype == torch.uint8

    # Remove the e8m0 bias; int16 avoids uint8 wraparound on the subtraction.
    unbiased_exp = scale.to(torch.int16) - 127

    # Power-of-two reconstruction. A proper kernel would do this bitwise.
    decoded = torch.pow(2.0, unbiased_exp.to(torch.float))

    return decoded.to(half_dtype)
|
||||
|
||||
|
||||
def upcast_fp4_to_fp16_or_bf16(
|
||||
val, float_dtype: torch.dtype, half_exp_bias: int, half_mantissa_bits: int
|
||||
):
|
||||
assert val.dtype == torch.uint8
|
||||
|
||||
unpacked = torch.zeros(
|
||||
*val.shape[:-1], val.shape[-1] * 2, dtype=torch.uint8, device=val.device
|
||||
)
|
||||
unpacked[..., 1::2] = (val >> 4) & 0x0F # Extract high 4 bits.
|
||||
unpacked[..., ::2] = val & 0x0F # Extract low 4 bits.
|
||||
|
||||
# Takes one float4 values represented as b0000xxxx,
|
||||
# and converts it to the corresponding float16 value.
|
||||
|
||||
sign = unpacked >> 3
|
||||
|
||||
exp = (unpacked >> 1) & 3
|
||||
new_mantissa = unpacked & 1
|
||||
|
||||
# if exp == 0 and new_mantissa == 0:
|
||||
# new_exp = 0
|
||||
# else:
|
||||
# new_exp = exp - FLOAT4_EXP_BIAS + FLOAT16_EXP_BIAS
|
||||
|
||||
# int8_t works with float16, but may overflow with bfloat16.
|
||||
new_exp = exp - FLOAT4_EXP_BIAS + half_exp_bias
|
||||
|
||||
# Cast b0000 to 0. in fp16/bf16.
|
||||
new_exp = new_exp * torch.logical_or(exp > 0, new_mantissa > 0)
|
||||
|
||||
# Cast b0001 to 0.5 in fp16/bf16.
|
||||
new_mantissa = torch.logical_and(new_mantissa, exp > 0)
|
||||
|
||||
new_mantissa = new_mantissa.to(torch.int32)
|
||||
new_exp = new_exp.to(torch.int32)
|
||||
sign = sign.to(torch.int32)
|
||||
|
||||
qdq_val = (
|
||||
(sign << 15)
|
||||
+ (new_exp << half_mantissa_bits)
|
||||
+ (new_mantissa << (half_mantissa_bits - 1))
|
||||
)
|
||||
|
||||
assert qdq_val.max() <= 65535
|
||||
assert qdq_val.min() >= 0
|
||||
qdq_val = qdq_val.to(torch.uint16)
|
||||
|
||||
result = qdq_val.view(float_dtype)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def dq_mxfp4_torch(
    x: torch.Tensor, scale: torch.Tensor, float_dtype: torch.dtype
) -> torch.Tensor:
    """Dequantize packed mxfp4 data to fp16/bf16.

    Args:
        x: uint8 tensor of packed fp4 codes (two per byte).
        scale: uint8 e8m0 per-block scales, one per 32 unpacked elements.
        float_dtype: torch.float16 or torch.bfloat16.

    Returns:
        Dequantized tensor in `float_dtype`; the last dim is twice `x`'s.

    Raises:
        ValueError: if `float_dtype` is not fp16/bf16.
    """
    assert x.dtype == torch.uint8
    assert scale.dtype == torch.uint8

    if float_dtype == torch.float16:
        half_exp_bias = FLOAT16_EXP_BIAS
        half_mantissa_bits = FLOAT16_MANTISSA_BITS
    elif float_dtype == torch.bfloat16:
        half_exp_bias = BFLOAT16_EXP_BIAS
        half_mantissa_bits = BFLOAT16_MANTISSA_BITS
    else:
        # Previously this fell through and crashed later with a confusing
        # UnboundLocalError; fail fast instead (mirrors qdq_mxfp4_torch).
        raise ValueError(
            f"dq_mxfp4_torch only supports float16/bfloat16, got {float_dtype}"
        )

    scale_half = e8m0_to_half(scale, half_dtype=float_dtype)

    x_half = upcast_fp4_to_fp16_or_bf16(
        x,
        float_dtype=float_dtype,
        half_exp_bias=half_exp_bias,
        half_mantissa_bits=half_mantissa_bits,
    )

    # Apply one decoded scale per 32-element block, then flatten back.
    x_half = x_half.reshape(*x_half.shape[:-1], -1, 32)
    x_half = x_half * scale_half[..., None]
    x_half = x_half.reshape(*x_half.shape[:-2], -1)

    return x_half
|
||||
|
||||
|
||||
def fp16_to_fp4_simulate(
|
||||
val, half_mantissa_bits: int, half_exp_bits: int, half_exp_bias: int
|
||||
):
|
||||
# Casts an fp16/bf16 input to the restricted values of float4_e2m1,
|
||||
# that is to say [0., 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0, -0.0,
|
||||
# -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0].
|
||||
|
||||
float_type = val.dtype
|
||||
|
||||
# "rshift_cuda" not implemented for 'UInt16'
|
||||
val_view = val.view(torch.int16) # .to(torch.int32)
|
||||
|
||||
exp = val_view >> half_mantissa_bits
|
||||
exp = exp & ((1 << half_exp_bits) - 1)
|
||||
|
||||
exp = exp.view(torch.uint16).to(torch.int32)
|
||||
|
||||
sign = (val_view >> (half_mantissa_bits + half_exp_bits)) & 1
|
||||
|
||||
mantissa_last = (val_view >> (half_mantissa_bits - 1)) & 1
|
||||
|
||||
exp_unbias = exp - half_exp_bias
|
||||
new_exp = exp_unbias + FLOAT4_EXP_BIAS
|
||||
|
||||
exp_shift = (new_exp <= 0) * (1 - new_exp)
|
||||
|
||||
# Typically 9.
|
||||
# Take the min to prevent overflow on `uint16_t half`. This is the case for
|
||||
# very small values, correctly mapped to `round_close`.
|
||||
tail_bits = half_mantissa_bits - FLOAT4_MANTISSA_BITS + exp_shift
|
||||
tail_bits[tail_bits >= 16] = 16
|
||||
|
||||
mantissa_plus_one = val_view & ((1 << (half_mantissa_bits + 1)) - 1)
|
||||
|
||||
half = 1 << (tail_bits - 1)
|
||||
|
||||
tail = mantissa_plus_one & ((1 << tail_bits) - 1)
|
||||
|
||||
round_close = tail < half # round towards 0
|
||||
round_away = tail > half # round away from 0
|
||||
tie = tail == half
|
||||
|
||||
new_mantissa_close = torch.zeros(val.shape, device=val.device, dtype=torch.bool)
|
||||
new_exp_close = torch.zeros(val.shape, device=val.device, dtype=torch.uint16)
|
||||
|
||||
new_mantissa_away = torch.zeros(val.shape, device=val.device, dtype=torch.bool)
|
||||
new_exp_away = torch.zeros(val.shape, device=val.device, dtype=torch.uint16)
|
||||
|
||||
new_exp_tie = torch.zeros(val.shape, device=val.device, dtype=torch.uint16)
|
||||
|
||||
# 1. round down
|
||||
# if new_exp == 0: # case [0.5, 0.749999]
|
||||
# new_mantissa = 0
|
||||
# elif new_exp < 0: # case [0, 0.24999]
|
||||
# new_mantissa = 0
|
||||
# else:
|
||||
# new_mantissa = mantissa_last
|
||||
|
||||
new_mantissa_close = (new_exp > 0) * mantissa_last
|
||||
new_exp_close = exp
|
||||
|
||||
# # 2. round up
|
||||
# if new_exp <= 0: # case [0.250001, 0.499999] and [0.75001, 0.99999]
|
||||
# new_mantissa = 0
|
||||
# new_exp += 1
|
||||
# elif mantissa_last == 0:
|
||||
# new_mantissa = 1
|
||||
# else:
|
||||
# new_mantissa = 0
|
||||
# new_exp += 1
|
||||
|
||||
new_mantissa_away = torch.logical_and(new_exp > 0, mantissa_last == 0)
|
||||
new_exp_away = exp + torch.logical_or(new_exp <= 0, mantissa_last == 1)
|
||||
|
||||
# # 3. tie
|
||||
# 0.25 -> 0. (handled by `exp > (half_exp_bias - 2)`)
|
||||
# 0.75 -> 1.
|
||||
# 1.25 -> 1.
|
||||
# 1.75 -> 2.
|
||||
# 2.5 -> 2.
|
||||
# 3.5 -> 4.
|
||||
# 5. -> 4.
|
||||
new_exp_tie = (exp > (half_exp_bias - 2)) * (exp + (mantissa_last == 1))
|
||||
|
||||
# Gather round up, round down and tie.
|
||||
new_exp = (
|
||||
round_away * new_exp_away + round_close * new_exp_close + tie * new_exp_tie
|
||||
)
|
||||
|
||||
new_mantissa = round_away * new_mantissa_away + round_close * new_mantissa_close
|
||||
|
||||
# if new_exp > 3:
|
||||
# new_mantissa = 1
|
||||
new_mantissa = new_mantissa + (new_exp > (2 + half_exp_bias)) * (new_mantissa == 0)
|
||||
|
||||
# Clamp the exponent to acceptable values.
|
||||
new_exp = (new_exp >= (half_exp_bias - 2)) * torch.clamp(
|
||||
new_exp, half_exp_bias - 2, half_exp_bias + 2
|
||||
)
|
||||
|
||||
sign = sign.to(torch.int32)
|
||||
new_mantissa = new_mantissa.to(torch.int32)
|
||||
|
||||
qdq_val = (
|
||||
(sign << 15)
|
||||
+ (new_exp << half_mantissa_bits)
|
||||
+ (new_mantissa << (half_mantissa_bits - 1))
|
||||
)
|
||||
|
||||
assert qdq_val.max() <= 65535
|
||||
assert qdq_val.min() >= 0
|
||||
assert qdq_val.dtype == torch.int32
|
||||
qdq_val = qdq_val.to(torch.uint16)
|
||||
|
||||
result = qdq_val.view(float_type)
|
||||
return result
|
||||
|
||||
|
||||
def qdq_mxfp4_torch(
    x: torch.Tensor, scale_calculation_mode: str = "even"
) -> torch.Tensor:
    """Quantize-dequantize `x` through a simulated mxfp4 round trip.

    The last dimension is processed in blocks of 32 elements, each sharing
    one power-of-two (e8m0-style) scale derived from the block amax.

    NOTE(review): `scale_calculation_mode` is currently unused — confirm
    whether alternative scale modes were meant to be supported.
    """
    half_dtype = x.dtype

    if half_dtype == torch.float16:
        half_mantissa_bits = FLOAT16_MANTISSA_BITS
        half_exp_bits = FLOAT16_EXP_BITS
        half_exp_bias = FLOAT16_EXP_BIAS
        val_to_add = FLOAT16_VAL_TO_ADD
        sign_exponent_mask = FLOAT16_SIGN_EXPONENT_MASK
    elif half_dtype == torch.bfloat16:
        half_mantissa_bits = BFLOAT16_MANTISSA_BITS
        half_exp_bits = BFLOAT16_EXP_BITS
        half_exp_bias = BFLOAT16_EXP_BIAS
        val_to_add = BFLOAT16_VAL_TO_ADD
        sign_exponent_mask = BFLOAT16_SIGN_EXPONENT_MASK
    else:
        raise ValueError("not implemented")

    # Work on 32-element blocks along the last dimension.
    blocks = x.reshape(*x.shape[:-1], -1, 32)

    block_max = torch.max(torch.abs(blocks), dim=-1).values

    # Round the block amax up to a power-of-two boundary by adding half an
    # fp4 ULP to its bit pattern and masking off the mantissa.
    max_bits = block_max.view(torch.uint16).to(torch.int32)
    rounded_bits = torch.bitwise_and(max_bits + val_to_add, sign_exponent_mask)

    assert rounded_bits.max() <= 65535
    assert rounded_bits.min() >= 0
    assert rounded_bits.dtype == torch.int32
    rounded_bits = rounded_bits.to(torch.uint16)

    block_max = rounded_bits.view(half_dtype)

    # Scale exponent so the block maximum maps onto the fp4 range (|6.0|,
    # hence the -2 on the exponent).
    scale_exp = (
        FLOAT8_E8M0_MAX_EXP + torch.floor(torch.log2(block_max)).to(torch.int32) - 2
    )
    scale_exp = torch.clamp(scale_exp, 0, 2 * FLOAT8_E8M0_MAX_EXP)

    scale = 2.0 ** (scale_exp - FLOAT8_E8M0_MAX_EXP)
    scale = scale.to(half_dtype)

    # Normalize, snap to the fp4 grid, then undo the scaling.
    normalized = blocks / scale[..., None]
    snapped = fp16_to_fp4_simulate(
        normalized,
        half_exp_bits=half_exp_bits,
        half_mantissa_bits=half_mantissa_bits,
        half_exp_bias=half_exp_bias,
    )
    restored = snapped * scale[..., None]
    return restored.reshape(*restored.shape[:-2], -1)
|
||||
32
tests/quantization/test_auto_round.py
Normal file
32
tests/quantization/test_auto_round.py
Normal file
@@ -0,0 +1,32 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Test model set-up and inference for quantized HF models supported
by AutoRound.

Validating the configuration and printing results for manual checking.

Run `pytest tests/quantization/test_auto_round.py`.
"""

import pytest

from vllm.platforms import current_platform

MODELS = [
    "OPEA/Qwen2.5-0.5B-Instruct-int4-sym-inc",  ##auto_round:auto_gptq
    "Intel/Qwen2-0.5B-Instruct-int4-sym-AutoRound",  ##auto_round:auto_awq
]


@pytest.mark.skipif(
    not (
        current_platform.is_cpu()
        or current_platform.is_xpu()
        or current_platform.is_cuda()
    ),
    reason="only supports CPU/XPU/CUDA backend.",
)
@pytest.mark.parametrize("model", MODELS)
def test_auto_round(vllm_runner, model):
    """Load the model eagerly, greedy-decode one prompt, and print it."""
    with vllm_runner(model, enforce_eager=True) as llm:
        completions = llm.generate_greedy(["The capital of France is"], max_tokens=8)
        assert completions
        print(f"{completions[0][1]}")
|
||||
180
tests/quantization/test_blackwell_moe.py
Normal file
180
tests/quantization/test_blackwell_moe.py
Normal file
@@ -0,0 +1,180 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if not current_platform.is_device_capability_family(100):
|
||||
pytest.skip(
|
||||
"This test only runs on Blackwell GPUs (SM10x).", allow_module_level=True
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", autouse=True)
def set_test_environment():
    """Sets environment variables required for this test module."""
    # Make sure TRTLLM attention is available
    os.environ["VLLM_HAS_FLASHINFER_CUBIN"] = "1"
    # Set compilation threads to 16 to speed up startup
    os.environ["FLASHINFER_NVCC_THREADS"] = "16"


# Override the backbone layers to 4 for faster startup.
# HF_OVERRIDE_TEXT is for text-only configs; HF_OVERRIDE_MM nests the same
# override under "text_config" for multimodal model configs.
HF_OVERRIDE_TEXT = {
    "num_layers": 4,
    "num_hidden_layers": 4,
}
HF_OVERRIDE_MM = {
    "text_config": {"num_layers": 4, "num_hidden_layers": 4},
}
|
||||
|
||||
|
||||
def can_initialize(
    model: str,
    hf_overrides: dict[str, Any] | None = None,
    extra_args: list[str] | None = None,
):
    """Start an OpenAI-compatible server for `model` and run one completion.

    Args:
        model: HF model id to serve (weights are dummy-loaded).
        hf_overrides: optional HF config overrides (e.g. fewer layers).
        extra_args: additional CLI arguments appended to the server args.
    """
    # Server arguments: small context and dummy weights keep startup cheap.
    server_args = [
        "--max-model-len",
        "2048",
        "--max-num-batched-tokens",
        "256",
        "--load-format",
        "dummy",
        "--trust-remote-code",
        "--limit-mm-per-prompt",
        json.dumps({"image": 0}),
    ]
    if extra_args is not None:
        server_args.extend(extra_args)

    # Launch the server and issue a trivial request to prove it came up.
    with RemoteOpenAIServer(
        model,
        server_args,
        max_wait_seconds=1500,  # Due to FlashInfer compile
        override_hf_configs=hf_overrides,
    ) as server:
        client = server.get_client()
        # Make a simple request to verify the server works
        completion = client.completions.create(
            model=model,
            prompt=["Hello, World!"],
            temperature=0,
            max_tokens=2,
        )
        print(completion)
        assert completion.choices[0].text is not None
|
||||
|
||||
|
||||
## Llama4 ##


@pytest.mark.skip(
    reason=(
        "RuntimeError: run_moe() Expected a value of type "
        "'Optional[List[Tensor]]' for argument '_9' but instead found type "
        "'list'."
    )
)
def test_llama4_fp8_tensor_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
    """Llama4 FP8 MoE via the FlashInfer CUTLASS (throughput) backend."""
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
    can_initialize(
        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", hf_overrides=HF_OVERRIDE_MM
    )


def test_llama4_fp8_tensor_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
    """Llama4 FP8 MoE via the FlashInfer TRTLLM (latency) backend."""
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
    can_initialize(
        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", hf_overrides=HF_OVERRIDE_MM
    )


def test_llama4_nvfp4_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
    """Llama4 NVFP4 MoE via the FlashInfer CUTLASS (throughput) backend."""
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
    can_initialize(
        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", hf_overrides=HF_OVERRIDE_MM
    )


def test_llama4_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
    """Llama4 NVFP4 MoE via the FlashInfer TRTLLM (latency) backend."""
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
    can_initialize(
        "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", hf_overrides=HF_OVERRIDE_MM
    )


## DeepSeekV3 ##


def test_deepseek_fp8_block_moe_deep_gemm(monkeypatch: pytest.MonkeyPatch):
    """DeepSeek FP8 block-quantized MoE via DeepGEMM."""
    monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1")
    can_initialize("deepseek-ai/DeepSeek-V3.1", hf_overrides=HF_OVERRIDE_TEXT)


@pytest.mark.skip(
    reason=(
        "Known issue: lack of kernel support. "
        "Expected failure: assert self.block_quant is None"
    )
)
def test_deepseek_fp8_block_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
    """DeepSeek FP8 block-quantized MoE via FlashInfer CUTLASS (throughput)."""
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
    can_initialize("deepseek-ai/DeepSeek-V3.1", hf_overrides=HF_OVERRIDE_TEXT)


def test_deepseek_fp8_block_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
    """DeepSeek FP8 block-quantized MoE via FlashInfer TRTLLM (latency)."""
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
    can_initialize("deepseek-ai/DeepSeek-V3.1", hf_overrides=HF_OVERRIDE_TEXT)


def test_deepseek_nvfp4_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
    """DeepSeek NVFP4 MoE via FlashInfer CUTLASS (throughput)."""
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2", hf_overrides=HF_OVERRIDE_TEXT)


def test_deepseek_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
    """DeepSeek NVFP4 MoE via FlashInfer TRTLLM (latency)."""
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2", hf_overrides=HF_OVERRIDE_TEXT)


## GPT-OSS ##


def test_gptoss_mxfp4bf16_moe_flashinfer(monkeypatch: pytest.MonkeyPatch):
    """GPT-OSS MXFP4 weights with BF16 activations via FlashInfer."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "1")
    can_initialize("openai/gpt-oss-20b", hf_overrides=HF_OVERRIDE_TEXT)


def test_gptoss_mxfp4mxfp8_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
    """GPT-OSS MXFP4 weights with MXFP8 activations via FlashInfer CUTLASS."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", "1")
    can_initialize("openai/gpt-oss-20b", hf_overrides=HF_OVERRIDE_TEXT)


def test_gptoss_mxfp4mxfp8_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
    """GPT-OSS MXFP4 weights with MXFP8 activations via FlashInfer TRTLLM."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "1")
    can_initialize("openai/gpt-oss-20b", hf_overrides=HF_OVERRIDE_TEXT)


def test_gptoss_eager(monkeypatch: pytest.MonkeyPatch):
    """GPT-OSS must also come up with CUDA graphs disabled."""
    can_initialize(
        "openai/gpt-oss-20b",
        hf_overrides=HF_OVERRIDE_TEXT,
        extra_args=["--enforce-eager"],
    )
|
||||
817
tests/quantization/test_compressed_tensors.py
Normal file
817
tests/quantization/test_compressed_tensors.py
Normal file
@@ -0,0 +1,817 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Test model set-up and weight loading for llmcompressor-quantized models.
|
||||
|
||||
Run `pytest tests/quantization/test_compressed_tensors.py`.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from compressed_tensors.quantization import QuantizationType
|
||||
|
||||
from tests.models.utils import check_logprobs_close
|
||||
from vllm.model_executor.layers.fused_moe import UnquantizedFusedMoEMethod
|
||||
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501
|
||||
CompressedTensors24,
|
||||
CompressedTensorsLinearMethod,
|
||||
CompressedTensorsW4A4Fp4,
|
||||
CompressedTensorsW4A8Fp8,
|
||||
CompressedTensorsW4A16Fp4,
|
||||
CompressedTensorsW4A16Sparse24,
|
||||
CompressedTensorsW8A8Fp8,
|
||||
CompressedTensorsW8A8Int8,
|
||||
CompressedTensorsW8A16Fp8,
|
||||
CompressedTensorsWNA16,
|
||||
)
|
||||
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import W8A8BlockFp8LinearOp
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
cutlass_fp4_supported,
|
||||
)
|
||||
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
|
||||
sparse_cutlass_supported,
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
# Model allowlists consulted by the ROCm-specific skips in the tests below.

# AITER only supports per-channel-per-channel INT8 gemm
# and per-tensor-per-tensor INT8 GEMM.
# It does not support mix precision MM and mix quantization scheme.
ROCM_AITER_SUPPORTED_INT8_MODEL = [
    "neuralmagic/Llama-3.2-1B-quantized.w8a8",
    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
]

# TritonScaledMMLinearKernel only supports symmetric quantization.
ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL = [
    "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
    "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
    "neuralmagic/Llama-3.2-1B-quantized.w8a8",
    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
def enable_pickle(monkeypatch):
    """`LLM.apply_model` requires pickling a function."""
    # Applied automatically to every test in this module.
    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model_args",
    [
        (
            "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
            "tensor",
            QuantizationType.INT,
            2560,
            True,
        ),
        (
            "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
            "tensor",
            QuantizationType.INT,
            2560,
            False,
        ),
    ],
)
def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
    """Verify layer wiring and scale/zero-point metadata for static W8A8."""
    model_path, strategy, quant_type, shape_0, is_symmetric = model_args

    if (
        current_platform.is_rocm()
        and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
    ):
        pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")

    with vllm_runner(model_path, enforce_eager=True) as llm:

        def check_model(model):
            first_layer = model.model.layers[0]

            qkv_proj = first_layer.self_attn.qkv_proj
            o_proj = first_layer.self_attn.o_proj
            gate_up_proj = first_layer.mlp.gate_up_proj
            down_proj = first_layer.mlp.down_proj

            def zp_valid(zp: torch.Tensor | None):
                # Symmetric quantization carries no zero point; asymmetric
                # must expose an int32 one.
                if is_symmetric:
                    return zp is None
                return zp is not None and zp.dtype is torch.int32

            for proj in (qkv_proj, o_proj, gate_up_proj, down_proj):
                assert zp_valid(proj.input_zero_point)

            for proj in (qkv_proj, o_proj, gate_up_proj, down_proj):
                assert isinstance(proj.quant_method, CompressedTensorsLinearMethod)
            assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8)

            assert qkv_proj.scheme.strategy == strategy
            assert qkv_proj.scheme.is_static_input_scheme

            for proj in (qkv_proj, o_proj, gate_up_proj):
                assert proj.weight.dtype is torch.int8

            if qkv_proj.scheme.strategy == "tensor":
                # process_weights_after_loading expands the per-tensor scale
                # into a channelwise (shape_0 x 1) buffer.
                assert qkv_proj.weight_scale.shape == (shape_0, 1)
                assert qkv_proj.weight_scale.dtype is torch.float32
                assert qkv_proj.input_scale.dtype is torch.float32

        llm.apply_model(check_model)

        assert llm.generate_greedy(["Hello my name is"], max_tokens=4)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model_path",
    [
        "neuralmagic/Llama-3.2-1B-quantized.w8a8",
    ],
)
@pytest.mark.parametrize("max_tokens", [4])
@pytest.mark.parametrize("num_logprobs", [10])
@pytest.mark.parametrize(
    "use_aiter", [True, False] if current_platform.is_rocm() else [False]
)
def test_compressed_tensors_w8a8_logprobs(
    hf_runner,
    vllm_runner,
    example_prompts,
    model_path,
    max_tokens,
    num_logprobs,
    use_aiter,
    monkeypatch,
):
    """Compare greedy logprobs between HF and vLLM for W8A8 checkpoints."""
    on_rocm = current_platform.is_rocm()
    if on_rocm and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL:
        pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")

    if use_aiter:
        if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL:
            pytest.skip(f"Skip model {model_path} as it is not support by aiter.")
        # this will enable VLLM_ROCM_USE_AITER_LINEAR
        monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")

    dtype = "bfloat16"

    # skip language translation prompt for the static per tensor models
    if model_path in (
        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
        "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
    ):
        example_prompts = example_prompts[0:-1]

    with hf_runner(model_path, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs
        )

    with vllm_runner(model_path, dtype=dtype, enforce_eager=True) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs
        )

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )

    if on_rocm:
        torch.cuda.synchronize()
|
||||
|
||||
|
||||
def test_compressed_tensors_no_enforce_eager(vllm_runner):
    """The quantized model must also load and generate with graphs enabled."""
    model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
    with vllm_runner(model_path) as llm:
        assert llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model_args",
    [
        ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", "tensor"),
        (
            "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
            "channel",
        ),
    ],
)
@pytest.mark.parametrize(
    "use_aiter", [True, False] if current_platform.is_rocm() else [False]
)
def test_compressed_tensors_w8a8_dynamic_per_token(
    vllm_runner,
    model_args,
    use_aiter,
    monkeypatch,
):
    """Dynamic per-token W8A8 INT8: verify scheme wiring, then generate."""
    model_path, strategy = model_args

    if (
        current_platform.is_rocm()
        and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
    ):
        pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")

    if use_aiter:
        if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL:
            pytest.skip(f"Skip model {model_path} as it is not support by aiter.")
        # this will enable VLLM_ROCM_USE_AITER_LINEAR
        monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")

    with vllm_runner(model_path, enforce_eager=True, dtype=torch.float16) as llm:

        def check_model(model):
            qkv_proj = model.model.layers[0].self_attn.qkv_proj

            assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
            assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8)
            # Dynamic activation quantization: no static input scale.
            assert not qkv_proj.scheme.is_static_input_scheme
            assert qkv_proj.scheme.strategy == strategy
            assert qkv_proj.weight.dtype is torch.int8

        llm.apply_model(check_model)

        assert llm.generate_greedy(["Hello my name is"], max_tokens=4)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "wNa16_args",
    [
        (
            "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
            "channel",
            None,
            8,
            True,
            False,
        ),
        (
            "nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-ActOrder",
            "group",
            128,
            8,
            False,
            True,
        ),
    ],
)
@pytest.mark.skipif(
    not current_platform.is_cuda(), reason="The tests are skipped on non-CUDA platform."
)
def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
    """Weight-only wNa16: check strategy/group/pack metadata, then generate."""
    model, strategy, group, pack_factor, symmetric, has_g_idx = wNa16_args
    with vllm_runner(model, enforce_eager=True) as llm:

        def check_model(model):
            qkv_proj = model.model.layers[0].self_attn.qkv_proj

            assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
            assert isinstance(qkv_proj.scheme, CompressedTensorsWNA16)

            assert qkv_proj.scheme.strategy == strategy
            # Channelwise checkpoints encode the group size as -1.
            expected_group = -1 if group is None else group
            assert qkv_proj.scheme.group_size == expected_group

            assert qkv_proj.scheme.pack_factor == pack_factor
            assert qkv_proj.scheme.symmetric == symmetric
            assert qkv_proj.scheme.has_g_idx == has_g_idx

        llm.apply_model(check_model)

        assert llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform."
)
def test_compressed_tensors_w4a16_marlin24(vllm_runner):
    """W4A16 with 2:4 sparsity (Marlin24): packed int32 weights must load."""
    model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
    with vllm_runner(model_path, enforce_eager=True) as llm:

        def check_model(model):
            qkv_proj = model.model.layers[0].self_attn.qkv_proj

            assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
            assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Sparse24)
            assert qkv_proj.weight_packed.dtype is torch.int32

        llm.apply_model(check_model)

        assert llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||
|
||||
|
||||
def test_compressed_tensors_fp8(vllm_runner):
    """FP8 checkpoints may resolve to W8A8 or W8A16; validate either scheme."""
    model_path = "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
    with vllm_runner(model_path, enforce_eager=True) as llm:

        def check_model(model):
            qkv_proj = model.model.layers[0].self_attn.qkv_proj

            assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
            assert isinstance(
                qkv_proj.scheme,
                (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8),
            )

            assert qkv_proj.input_scale.dtype is torch.float32

            if isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8):
                # Fully quantized path: scalar scales and fp8 weights.
                assert len(qkv_proj.input_scale.shape) == 0
                assert qkv_proj.weight.dtype is current_platform.fp8_dtype()
                assert qkv_proj.weight_scale.dtype is torch.float32
                assert len(qkv_proj.weight_scale.shape) == 0

        llm.apply_model(check_model)

        assert llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform."
|
||||
)
|
||||
def test_compressed_tensors_kv_cache(vllm_runner):
|
||||
model_path = "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
|
||||
with vllm_runner(model_path, enforce_eager=True, kv_cache_dtype="fp8") as llm:
|
||||
output = llm.generate_greedy("Hello world!", max_tokens=4)
|
||||
assert output
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not sparse_cutlass_supported(),
|
||||
reason="Sparse FP8 is not yet supported on this GPU type.",
|
||||
)
|
||||
def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy, format="dense"):
|
||||
assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
|
||||
assert isinstance(qkv_proj.scheme, CompressedTensors24)
|
||||
|
||||
assert qkv_proj.scheme.weight_quant.strategy == weight_strategy
|
||||
assert qkv_proj.scheme.input_quant.strategy == input_strategy
|
||||
assert qkv_proj.scheme.quantized
|
||||
assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map
|
||||
sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map # noqa: E501
|
||||
assert sparsity_map.get("Linear").format == format
|
||||
assert sparsity_map.get("Linear").sparsity_structure == "2:4"
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not current_platform.is_cuda() or not current_platform.has_device_capability(90),
|
||||
reason="Sparse FP8 is not yet supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"args_2of4",
|
||||
[
|
||||
(
|
||||
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing",
|
||||
"channel",
|
||||
"token",
|
||||
),
|
||||
(
|
||||
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
|
||||
"channel",
|
||||
"tensor",
|
||||
),
|
||||
(
|
||||
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing",
|
||||
"tensor",
|
||||
"tensor",
|
||||
),
|
||||
(
|
||||
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
|
||||
"tensor",
|
||||
"token",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
|
||||
model, weight_strategy, input_strategy = args_2of4
|
||||
with vllm_runner(model, enforce_eager=True) as llm:
|
||||
|
||||
def check_model(model):
|
||||
layer = model.model.layers[0]
|
||||
|
||||
qkv_proj = layer.self_attn.qkv_proj
|
||||
assert qkv_proj.scheme.weights_dtype == torch.float8_e4m3fn
|
||||
_test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy)
|
||||
|
||||
llm.apply_model(check_model)
|
||||
|
||||
output = llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||
print(output)
|
||||
assert output
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not current_platform.is_cuda() or not current_platform.has_device_capability(90),
|
||||
reason="Sparse FP8 is not yet supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"args_2of4",
|
||||
[
|
||||
(
|
||||
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM",
|
||||
"channel",
|
||||
"token",
|
||||
),
|
||||
(
|
||||
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM",
|
||||
"channel",
|
||||
"tensor",
|
||||
),
|
||||
(
|
||||
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM",
|
||||
"tensor",
|
||||
"token",
|
||||
),
|
||||
(
|
||||
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM",
|
||||
"tensor",
|
||||
"tensor",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_compressed_tensors_2of4_quant_fp8_compressed(vllm_runner, args_2of4):
|
||||
model, weight_strategy, input_strategy = args_2of4
|
||||
with vllm_runner(model, enforce_eager=True) as llm:
|
||||
|
||||
def check_model(model):
|
||||
layer = model.model.layers[0]
|
||||
|
||||
qkv_proj = layer.self_attn.qkv_proj
|
||||
assert qkv_proj.scheme.weights_dtype == torch.float8_e4m3fn
|
||||
_test_2of4_quant_models(
|
||||
qkv_proj,
|
||||
weight_strategy,
|
||||
input_strategy,
|
||||
format="sparse-24-bitmask",
|
||||
)
|
||||
|
||||
llm.apply_model(check_model)
|
||||
|
||||
output = llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||
print(output)
|
||||
assert output
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not sparse_cutlass_supported(),
|
||||
reason="cutlass is not yet supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"args_2of4",
|
||||
[
|
||||
(
|
||||
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM",
|
||||
"channel",
|
||||
"token",
|
||||
),
|
||||
(
|
||||
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM",
|
||||
"channel",
|
||||
"tensor",
|
||||
),
|
||||
(
|
||||
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM",
|
||||
"tensor",
|
||||
"token",
|
||||
),
|
||||
(
|
||||
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM",
|
||||
"tensor",
|
||||
"tensor",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_compressed_tensors_2of4_quant_int8_compressed(vllm_runner, args_2of4):
|
||||
model, weight_strategy, input_strategy = args_2of4
|
||||
with vllm_runner(model, enforce_eager=True) as llm:
|
||||
|
||||
def check_model(model):
|
||||
layer = model.model.layers[0]
|
||||
|
||||
qkv_proj = layer.self_attn.qkv_proj
|
||||
assert qkv_proj.scheme.weights_dtype == torch.int8
|
||||
_test_2of4_quant_models(
|
||||
qkv_proj,
|
||||
weight_strategy,
|
||||
input_strategy,
|
||||
format="sparse-24-bitmask",
|
||||
)
|
||||
|
||||
llm.apply_model(check_model)
|
||||
|
||||
output = llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||
print(output)
|
||||
assert output
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not sparse_cutlass_supported(),
|
||||
reason="Sparse FP8 is not yet supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"args_2of4",
|
||||
[
|
||||
(
|
||||
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
|
||||
"channel",
|
||||
"token",
|
||||
),
|
||||
(
|
||||
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing",
|
||||
"tensor",
|
||||
"tensor",
|
||||
),
|
||||
(
|
||||
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
|
||||
"tensor",
|
||||
"token",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
|
||||
model, weight_strategy, input_strategy = args_2of4
|
||||
with vllm_runner(model, enforce_eager=True) as llm:
|
||||
|
||||
def check_model(model):
|
||||
layer = model.model.layers[0]
|
||||
|
||||
qkv_proj = layer.self_attn.qkv_proj
|
||||
assert qkv_proj.scheme.weights_dtype == torch.int8
|
||||
_test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy)
|
||||
|
||||
llm.apply_model(check_model)
|
||||
|
||||
output = llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||
print(output)
|
||||
assert output
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not sparse_cutlass_supported(),
|
||||
reason="2of4 Sparse is not yet supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"args_2of4",
|
||||
[("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")],
|
||||
)
|
||||
def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4):
|
||||
model = args_2of4
|
||||
with vllm_runner(model, enforce_eager=True) as llm:
|
||||
|
||||
def check_model(model):
|
||||
layer = model.model.layers[0]
|
||||
|
||||
qkv_proj = layer.self_attn.qkv_proj
|
||||
assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
|
||||
assert isinstance(qkv_proj.scheme, CompressedTensors24)
|
||||
|
||||
assert qkv_proj.scheme.weight_quant is None
|
||||
assert qkv_proj.scheme.input_quant is None
|
||||
assert not qkv_proj.scheme.quantized
|
||||
assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map
|
||||
sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map # noqa: E501
|
||||
assert sparsity_map.get("Linear").format == "dense"
|
||||
assert sparsity_map.get("Linear").sparsity_structure == "2:4"
|
||||
|
||||
llm.apply_model(check_model)
|
||||
|
||||
output = llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||
print(output)
|
||||
assert output
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not sparse_cutlass_supported(),
|
||||
reason="Cutlass is not yet supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"args_2of4", [("nm-testing/llama2.c-stories42M-pruned2.4-compressed")]
|
||||
)
|
||||
def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
|
||||
model = args_2of4
|
||||
with vllm_runner(model, enforce_eager=True) as llm:
|
||||
|
||||
def check_model(model):
|
||||
layer = model.model.layers[0]
|
||||
|
||||
qkv_proj = layer.self_attn.qkv_proj
|
||||
assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
|
||||
assert isinstance(qkv_proj.scheme, CompressedTensors24)
|
||||
|
||||
assert qkv_proj.scheme.weight_quant is None
|
||||
assert qkv_proj.scheme.input_quant is None
|
||||
assert not qkv_proj.scheme.quantized
|
||||
assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map
|
||||
sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map # noqa: E501
|
||||
assert sparsity_map.get("Linear").format == "sparse-24-bitmask"
|
||||
assert sparsity_map.get("Linear").sparsity_structure == "2:4"
|
||||
|
||||
llm.apply_model(check_model)
|
||||
|
||||
output = llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||
print(output)
|
||||
assert output
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"args",
|
||||
[
|
||||
# TODO: Enable once model is available again
|
||||
# ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A16", CompressedTensorsW4A16Fp4),
|
||||
("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4", CompressedTensorsW4A4Fp4),
|
||||
],
|
||||
)
|
||||
def test_compressed_tensors_nvfp4(vllm_runner, args):
|
||||
model, scheme = args
|
||||
with vllm_runner(model, enforce_eager=True) as llm:
|
||||
|
||||
def check_model(model):
|
||||
layer = model.model.layers[0]
|
||||
|
||||
qkv_proj = layer.self_attn.qkv_proj
|
||||
assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
|
||||
if (
|
||||
isinstance(qkv_proj.scheme, scheme)
|
||||
or isinstance(qkv_proj.scheme, CompressedTensorsW4A16Fp4)
|
||||
and not cutlass_fp4_supported()
|
||||
):
|
||||
assert True
|
||||
else:
|
||||
raise AssertionError("FP4 Scheme Mismatch")
|
||||
|
||||
assert qkv_proj.scheme.group_size == 16
|
||||
|
||||
llm.apply_model(check_model)
|
||||
output = llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||
print(output)
|
||||
assert output
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not current_platform.is_cuda() or not current_platform.has_device_capability(90),
|
||||
reason="W4A8 FP8 is not yet supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"args",
|
||||
[("czhu-cohere/TinyLlama-1.1B-Chat-v1.0-W4A8-e2e", CompressedTensorsW4A8Fp8)],
|
||||
)
|
||||
def test_compressed_tensors_w4a8_fp8(vllm_runner, args):
|
||||
model, scheme = args
|
||||
with vllm_runner(model, enforce_eager=True) as llm:
|
||||
|
||||
def check_model(model):
|
||||
layer = model.model.layers[0]
|
||||
|
||||
qkv_proj = layer.self_attn.qkv_proj
|
||||
o_proj = layer.self_attn.o_proj
|
||||
gate_up_proj = layer.mlp.gate_up_proj
|
||||
down_proj = layer.mlp.down_proj
|
||||
|
||||
for proj in (qkv_proj, o_proj, gate_up_proj, down_proj):
|
||||
assert isinstance(proj.quant_method, CompressedTensorsLinearMethod)
|
||||
assert isinstance(proj.scheme, scheme)
|
||||
|
||||
assert proj.weight_packed.dtype is torch.int32
|
||||
assert proj.weight_scale.dtype is torch.float8_e4m3fn
|
||||
assert proj.weight_chan_scale.dtype is torch.float32
|
||||
assert proj.scheme.group_size == 128
|
||||
|
||||
llm.apply_model(check_model)
|
||||
output = llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||
print(output)
|
||||
assert output
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform."
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"model,prompt,exp_perplexity",
|
||||
[
|
||||
(
|
||||
"nm-testing/Llama-3.2-1B-Instruct-spinquantR1R2R4-w4a16",
|
||||
"Flat is better than nested.\nSparse is better than dense.",
|
||||
150.0,
|
||||
),
|
||||
(
|
||||
"nm-testing/Llama-3.2-1B-Instruct-quip-w4a16",
|
||||
"Flat is better than nested.\nSparse is better than dense.",
|
||||
150.0,
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_compressed_tensors_transforms_perplexity(
|
||||
vllm_runner, model, prompt, exp_perplexity
|
||||
):
|
||||
with vllm_runner(model, enforce_eager=True) as llm:
|
||||
perplexity = llm.generate_prompt_perplexity([prompt])[0]
|
||||
print(perplexity)
|
||||
assert perplexity <= exp_perplexity
|
||||
|
||||
|
||||
def test_compressed_tensors_fp8_block_enabled(vllm_runner):
|
||||
model_path = "RedHatAI/Qwen3-0.6B-FP8-BLOCK"
|
||||
with vllm_runner(model_path, enforce_eager=True) as llm:
|
||||
fp8_dtype = current_platform.fp8_dtype()
|
||||
|
||||
def check_model(model):
|
||||
layer = model.model.layers[0]
|
||||
|
||||
qkv_proj = layer.self_attn.qkv_proj
|
||||
assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
|
||||
assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8)
|
||||
assert isinstance(
|
||||
qkv_proj.scheme.w8a8_block_fp8_linear, W8A8BlockFp8LinearOp
|
||||
)
|
||||
|
||||
assert qkv_proj.weight.dtype is fp8_dtype
|
||||
assert qkv_proj.weight_scale.dtype is torch.float32
|
||||
assert len(qkv_proj.weight.shape) == 2
|
||||
assert len(qkv_proj.weight_scale.shape) == 2
|
||||
|
||||
input_quant_op = qkv_proj.scheme.w8a8_block_fp8_linear.input_quant_op
|
||||
assert isinstance(input_quant_op, QuantFP8)
|
||||
assert input_quant_op._forward_method == input_quant_op.forward_cuda
|
||||
|
||||
llm.apply_model(check_model)
|
||||
|
||||
output = llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||
assert output
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not current_platform.is_cuda(),
|
||||
reason="This test is not for non-CUDA platforms",
|
||||
)
|
||||
def test_compressed_tensors_moe_ignore_with_model(vllm_runner):
|
||||
"""
|
||||
Integration test for MoE layer ignore functionality with a real model.
|
||||
|
||||
This test would verify that when loading a compressed-tensors quantized
|
||||
MoE model where some MoE layers are in the ignore list, those layers
|
||||
use UnquantizedFusedMoEMethod while non-ignored layers use the
|
||||
quantized method.
|
||||
|
||||
Expected model structure:
|
||||
- Compressed-tensors quantized MoE model (e.g., Mixtral-based)
|
||||
- Config with ignore list containing specific MoE layers
|
||||
- Multiple MoE layers where some are quantized and some are not
|
||||
"""
|
||||
|
||||
# model_path = "nm-testing/tinysmokeqwen3moe-W4A16-first-only" # CT 12.3
|
||||
model_path = "nm-testing/tinysmokeqwen3moe-W4A16-first-only-CTstable" # CT 12.2
|
||||
|
||||
with vllm_runner(model_path, enforce_eager=True) as llm:
|
||||
|
||||
def check_model(model):
|
||||
from vllm.model_executor.layers.fused_moe import FusedMoE
|
||||
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import ( # noqa: E501
|
||||
CompressedTensorsMoEMethod,
|
||||
)
|
||||
|
||||
# Check layer 0 MoE (should be quantized)
|
||||
layer_quantized = model.model.layers[0].mlp.experts
|
||||
assert isinstance(layer_quantized, FusedMoE)
|
||||
assert isinstance(layer_quantized.quant_method, CompressedTensorsMoEMethod)
|
||||
|
||||
# Check layer 10 MoE (should be unquantized + ignored)
|
||||
layer_unquantized = model.model.layers[3].mlp.experts
|
||||
assert isinstance(layer_unquantized, FusedMoE)
|
||||
assert isinstance(layer_unquantized.quant_method, UnquantizedFusedMoEMethod)
|
||||
|
||||
llm.apply_model(check_model)
|
||||
|
||||
# Verify the model can generate output
|
||||
output = llm.generate_greedy("Hello, my name is", max_tokens=4)
|
||||
assert output
|
||||
@@ -1,3 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests whether Marlin models can be loaded from the autogptq config.
|
||||
|
||||
Run `pytest tests/quantization/test_configs.py --forked`.
|
||||
@@ -20,49 +22,31 @@ class ModelPair:
|
||||
MODEL_ARG_EXPTYPES = [
|
||||
# AUTOGPTQ
|
||||
# compat: autogptq <=0.7.1 is_marlin_format: bool
|
||||
# Model Serialized in Marlin Format should always use Marlin kernel.
|
||||
("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", None, "marlin"),
|
||||
("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "marlin", "marlin"),
|
||||
("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "gptq", "marlin"),
|
||||
("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "awq", "ERROR"),
|
||||
# Model Serialized in Exllama Format.
|
||||
("TheBloke/Llama-2-7B-Chat-GPTQ", None, "gptq_marlin"),
|
||||
("TheBloke/Llama-2-7B-Chat-GPTQ", "marlin", "gptq_marlin"),
|
||||
("TheBloke/Llama-2-7B-Chat-GPTQ", "gptq", "gptq"),
|
||||
("TheBloke/Llama-2-7B-Chat-GPTQ", "awq", "ERROR"),
|
||||
# compat: autogptq >=0.8.0 use checkpoint_format: str
|
||||
# Model Serialized in Marlin Format should always use Marlin kernel.
|
||||
("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", None, "marlin"),
|
||||
("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "marlin", "marlin"),
|
||||
("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "gptq", "marlin"),
|
||||
("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "awq", "ERROR"),
|
||||
# Model Serialized in Exllama Format.
|
||||
("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", None, "gptq_marlin"),
|
||||
("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "marlin", "gptq_marlin"),
|
||||
("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "gptq", "gptq"),
|
||||
("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "awq", "ERROR"),
|
||||
|
||||
# AUTOAWQ
|
||||
("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", None, "awq"),
|
||||
("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", None, "awq_marlin"),
|
||||
("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "awq", "awq"),
|
||||
("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "marlin", "ERROR"),
|
||||
("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "marlin", "awq_marlin"),
|
||||
("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "gptq", "ERROR"),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_arg_exptype", MODEL_ARG_EXPTYPES)
|
||||
def test_auto_gptq(model_arg_exptype: str) -> None:
|
||||
def test_auto_gptq(model_arg_exptype: tuple[str, None, str]) -> None:
|
||||
model_path, quantization_arg, expected_type = model_arg_exptype
|
||||
|
||||
try:
|
||||
model_config = ModelConfig(model_path,
|
||||
model_path,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=False,
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
revision=None,
|
||||
quantization=quantization_arg)
|
||||
model_config = ModelConfig(model_path, quantization=quantization_arg)
|
||||
found_quantization_type = model_config.quantization
|
||||
except ValueError:
|
||||
found_quantization_type = "ERROR"
|
||||
@@ -70,4 +54,5 @@ def test_auto_gptq(model_arg_exptype: str) -> None:
|
||||
assert found_quantization_type == expected_type, (
|
||||
f"Expected quant_type == {expected_type} for {model_path}, "
|
||||
f"but found {found_quantization_type} "
|
||||
f"for no --quantization {quantization_arg} case")
|
||||
f"for no --quantization {quantization_arg} case"
|
||||
)
|
||||
|
||||
73
tests/quantization/test_cpu_offload.py
Normal file
73
tests/quantization/test_cpu_offload.py
Normal file
@@ -0,0 +1,73 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Expanded quantized model tests for CPU offloading
|
||||
# Base tests: tests/basic_correctness/test_cpu_offload.py
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
|
||||
from ..utils import compare_two_settings
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("fp8"),
|
||||
reason="fp8 is not supported on this GPU type.",
|
||||
)
|
||||
def test_cpu_offload_fp8():
|
||||
# Test loading a quantized checkpoint
|
||||
compare_two_settings(
|
||||
"neuralmagic/Qwen2-1.5B-Instruct-FP8",
|
||||
["--enforce_eager"],
|
||||
["--enforce_eager", "--cpu-offload-gb", "1"],
|
||||
max_wait_seconds=480,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("gptq_marlin"),
|
||||
reason="gptq_marlin is not supported on this GPU type.",
|
||||
)
|
||||
def test_cpu_offload_gptq(monkeypatch):
|
||||
# This quant method is sensitive to dummy weights, so we force real weights
|
||||
monkeypatch.setenv("VLLM_TEST_FORCE_LOAD_FORMAT", "auto")
|
||||
# Test GPTQ Marlin
|
||||
compare_two_settings(
|
||||
"Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4",
|
||||
["--enforce_eager"],
|
||||
["--enforce_eager", "--cpu-offload-gb", "1"],
|
||||
max_wait_seconds=480,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("awq_marlin"),
|
||||
reason="awq_marlin is not supported on this GPU type.",
|
||||
)
|
||||
def test_cpu_offload_awq(monkeypatch):
|
||||
# This quant method is sensitive to dummy weights, so we force real weights
|
||||
monkeypatch.setenv("VLLM_TEST_FORCE_LOAD_FORMAT", "auto")
|
||||
# Test AWQ Marlin
|
||||
compare_two_settings(
|
||||
"Qwen/Qwen2-1.5B-Instruct-AWQ",
|
||||
["--enforce_eager"],
|
||||
["--enforce_eager", "--cpu-offload-gb", "1"],
|
||||
max_wait_seconds=480,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("gptq_marlin"),
|
||||
reason="gptq_marlin is not supported on this GPU type.",
|
||||
)
|
||||
def test_cpu_offload_compressed_tensors(monkeypatch):
|
||||
# This quant method is sensitive to dummy weights, so we force real weights
|
||||
monkeypatch.setenv("VLLM_TEST_FORCE_LOAD_FORMAT", "auto")
|
||||
# Test wNa16
|
||||
compare_two_settings(
|
||||
"nm-testing/tinyllama-oneshot-w4a16-channel-v2",
|
||||
["--enforce_eager"],
|
||||
["--enforce_eager", "--cpu-offload-gb", "1"],
|
||||
max_wait_seconds=480,
|
||||
)
|
||||
23
tests/quantization/test_cpu_wna16.py
Normal file
23
tests/quantization/test_cpu_wna16.py
Normal file
@@ -0,0 +1,23 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
if not current_platform.is_cpu():
|
||||
pytest.skip("skipping CPU-only tests", allow_module_level=True)
|
||||
|
||||
MODELS = [
|
||||
"TheBloke/TinyLlama-1.1B-Chat-v1.0-AWQ",
|
||||
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", # with g_idx
|
||||
]
|
||||
DTYPE = ["bfloat16"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", DTYPE)
|
||||
def test_ipex_quant(vllm_runner, model, dtype):
|
||||
with vllm_runner(model, dtype=dtype) as llm:
|
||||
output = llm.generate_greedy(["The capital of France is"], max_tokens=32)
|
||||
assert output
|
||||
print(output)
|
||||
39
tests/quantization/test_experts_int8.py
Normal file
39
tests/quantization/test_experts_int8.py
Normal file
@@ -0,0 +1,39 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# flake8: noqa
|
||||
"""Tests experts_int8 quantization startup and generation,
|
||||
doesn't test correctness
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
|
||||
from ..models.registry import HF_EXAMPLE_MODELS
|
||||
|
||||
MODELS = ["ai21labs/Jamba-tiny-random", "pfnet/plamo-2-1b"]
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("experts_int8"),
|
||||
reason="ExpertsInt8 is not supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
@pytest.mark.parametrize("max_tokens", [4])
|
||||
def test_model_experts_int8_startup(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts,
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
) -> None:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
with vllm_runner(
|
||||
model, dtype=dtype, enforce_eager=True, quantization="experts_int8"
|
||||
) as vllm_model:
|
||||
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
@@ -1,24 +1,351 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests whether FP8 computation is enabled correctly.
|
||||
|
||||
Run `pytest tests/quantization/test_fp8.py --forked`.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
||||
from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.model_executor.layers.fused_moe import FusedMoE
|
||||
from vllm.model_executor.layers.quantization.fp8 import (
|
||||
Fp8Config,
|
||||
Fp8KVCacheMethod,
|
||||
Fp8LinearMethod,
|
||||
Fp8MoEMethod,
|
||||
)
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
capability = torch.cuda.get_device_capability()
|
||||
capability = capability[0] * 10 + capability[1]
|
||||
MODELS = [
|
||||
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
|
||||
# The checkpoint below was removed from the HF.
|
||||
# TODO: add a small replacement checkpoint.
|
||||
pytest.param(
|
||||
"nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
|
||||
marks=pytest.mark.skip(reason="Checkpoint removed from HF."),
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
|
||||
reason="FP8 is not supported on this GPU type.")
|
||||
def test_load_fp16_model(vllm_runner) -> None:
|
||||
llm = vllm_runner("facebook/opt-125m", quantization="fp8")
|
||||
not is_quant_method_supported("fp8"),
|
||||
reason="FP8 is not supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize("model_id", MODELS)
|
||||
@pytest.mark.parametrize("force_marlin", [False, True])
|
||||
@pytest.mark.parametrize(
|
||||
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
|
||||
)
|
||||
def test_model_load_and_run(
|
||||
vllm_runner, model_id: str, force_marlin: bool, use_rocm_aiter: bool, monkeypatch
|
||||
) -> None:
|
||||
if use_rocm_aiter:
|
||||
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
|
||||
|
||||
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model
|
||||
fc1 = model.model.decoder.layers[0].fc1
|
||||
assert isinstance(fc1.quant_method, Fp8LinearMethod)
|
||||
assert fc1.weight.dtype == torch.float8_e4m3fn
|
||||
if force_marlin:
|
||||
monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")
|
||||
|
||||
with vllm_runner(model_id, enforce_eager=True) as llm:
|
||||
# note: this does not test accuracy, just that we can run through
|
||||
# see lm-eval tests for accuracy
|
||||
outputs = llm.generate_greedy(["Hello my name is"], max_tokens=4)
|
||||
print(outputs[0][1])
|
||||
|
||||
|
||||
KV_CACHE_MODELS = [
|
||||
# AutoFP8 format using separate .k_scale and .v_scale
|
||||
# The original checkpoint below was removed from the Hub. To unblock CI and
|
||||
# until a small replacement with split K/V scales is found, skip this case.
|
||||
# See PR #27717 for context.
|
||||
pytest.param(
|
||||
"nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
|
||||
marks=pytest.mark.skip(
|
||||
reason=(
|
||||
"Checkpoint removed from HF; temporarily disabling this "
|
||||
"AutoFP8 split K/V case (PR #27717)."
|
||||
)
|
||||
),
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not is_quant_method_supported("fp8"),
|
||||
reason="FP8 is not supported on this GPU type.",
|
||||
)
|
||||
@pytest.mark.parametrize("model_id", KV_CACHE_MODELS)
|
||||
@pytest.mark.parametrize(
|
||||
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
|
||||
)
|
||||
def test_kv_cache_model_load_and_run(
|
||||
vllm_runner, model_id: str, use_rocm_aiter: bool, monkeypatch
|
||||
):
|
||||
if use_rocm_aiter:
|
||||
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
|
||||
|
||||
# `LLM.apply_model` requires pickling a function.
|
||||
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||
with vllm_runner(model_id, kv_cache_dtype="fp8", enforce_eager=True) as llm:
|
||||
|
||||
def check_model(model):
|
||||
attn = model.model.layers[0].self_attn.attn
|
||||
|
||||
assert isinstance(attn.quant_method, Fp8KVCacheMethod)
|
||||
|
||||
if not current_platform.is_rocm():
|
||||
# NOTE: This code path requires validation on Non-CUDA platform
|
||||
# NOTE: it is valid for scales to be 1.0 (default value), but
|
||||
# we know these checkpoints have scales < 1.0
|
||||
assert 0.0 < attn._k_scale < 1.0
|
||||
assert 0.0 < attn._v_scale < 1.0
|
||||
else:
|
||||
# NOTE: This code path is for ROCm platform
|
||||
# NOTE: it is valid for scales to be 1.0 (default value), but
|
||||
# we know these checkpoints have scales < 1.0
|
||||
# However on ROCm platform, the _k_scale and _v_scale will be
|
||||
# scaled by a factor of 2 as described in
|
||||
# vllm/model_executor/layers/quantization/kv_cache.py
|
||||
assert 0.0 < attn._k_scale < (1.0 * 2.0)
|
||||
assert 0.0 < attn._v_scale < (1.0 * 2.0)
|
||||
|
||||
llm.apply_model(check_model)
|
||||
|
||||
# note: this does not test accuracy, just that we can run through
|
||||
# see lm-eval tests for accuracy
|
||||
outputs = llm.generate_greedy(["Hello my name is"], max_tokens=4)
|
||||
print(outputs[0][1])
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not is_quant_method_supported("fp8"),
    reason="FP8 is not supported on this GPU type.",
)
@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
@pytest.mark.parametrize("force_marlin", [False, True])
@pytest.mark.parametrize(
    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
)
def test_load_fp16_model(
    vllm_runner,
    kv_cache_dtype: str,
    force_marlin: bool,
    use_rocm_aiter: bool,
    monkeypatch,
) -> None:
    """Load an fp16 checkpoint with online FP8 quantization and verify
    the resulting quant methods and weight dtypes per platform.

    Online quantization means the checkpoint itself is fp16; vLLM
    quantizes weights at load time (`quantization="fp8"`). With
    `force_marlin`, weight-only Marlin kernels are forced even on
    hardware with native FP8 support.
    """
    if use_rocm_aiter:
        monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")

    # `LLM.apply_model` requires pickling a function.
    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")

    if force_marlin:
        monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")

    with vllm_runner(
        "facebook/opt-125m",
        quantization="fp8",
        enforce_eager=True,
        kv_cache_dtype=kv_cache_dtype,
    ) as llm:

        def check_model(model):
            fc1 = model.model.decoder.layers[0].fc1
            assert isinstance(fc1.quant_method, Fp8LinearMethod)
            if kv_cache_dtype == "fp8":
                attn = model.model.decoder.layers[0].self_attn.attn
                assert isinstance(attn.quant_method, Fp8KVCacheMethod)
                # Online quantization: no calibrated scales in the
                # checkpoint, so the defaults of 1.0 are expected here.
                assert attn._k_scale == 1.0
                assert attn._v_scale == 1.0

            if current_platform.is_cuda():
                if current_platform.supports_fp8() and not force_marlin:
                    # For GPUs with hardware support, we keep weights in fp8
                    assert fc1.weight.dtype == torch.float8_e4m3fn
                else:
                    # For GPUs without hardware support, we pack the fp8 weights
                    # for weight-only quantization using Marlin kernels
                    assert fc1.weight.dtype == torch.int32
            elif current_platform.is_rocm():
                if current_platform.supports_fp8() and not force_marlin:
                    # For GPUs with hardware support, we keep weights in fp8
                    # (ROCm may use a different fp8 dtype than CUDA)
                    assert fc1.weight.dtype == current_platform.fp8_dtype()
                else:  # unsupported ROCm platform
                    pytest.skip(
                        "Skip `test_load_fp16_model`. "
                        "It only runs on ROCm platform with FP8 compute."
                        " e.g. MI300X and above."
                    )
            else:  # unsupported platform
                pytest.skip(
                    "Skip `test_load_fp16_model`. "
                    "It only runs on CUDA and ROCm platform."
                )

        llm.apply_model(check_model)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not is_quant_method_supported("fp8"),
    reason="FP8 is not supported on this GPU type.",
)
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
def test_scaled_fp8_quant(dtype) -> None:
    """Check `ops.scaled_fp8_quant` against a pure-PyTorch reference for
    dynamic quantization, static quantization, token padding, and
    non-contiguous (strided) inputs.

    The inv_scale produced by the dynamic path is deliberately reused
    for the reference and static paths, so all variants quantize with
    the same scale and can be compared after dequantization.
    """

    def quantize_ref(tensor, inv_scale):
        # The reference implementation that fully aligns to
        # the kernel being tested.
        finfo = torch.finfo(torch.float8_e4m3fn)
        scale = inv_scale.reciprocal()
        qweight = (tensor.to(torch.float32) * scale).clamp(min=finfo.min, max=finfo.max)
        qweight = qweight.to(torch.float8_e4m3fn)
        return qweight

    def per_tensor_dequantize(tensor, inv_scale, dtype):
        # Upcast fp8 values to `dtype`, then undo the scaling.
        fake_qweight = tensor.to(dtype)
        dq_weight = fake_qweight * inv_scale
        return dq_weight

    # Note that we use a shape % 4 != 0 to cover edge cases,
    # because scaled_fp8_quant is vectorized by 4.
    x = (torch.randn(size=(11, 11), device="cuda") * 13).to(dtype)

    # Dynamic quantization (kernel computes inv_scale itself)
    ref_y, inv_scale = ops.scaled_fp8_quant(x, None)
    ref_y = per_tensor_dequantize(ref_y, inv_scale, dtype)

    # Reference dynamic quantization
    y = quantize_ref(x, inv_scale)
    torch.testing.assert_close(ref_y, per_tensor_dequantize(y, inv_scale, dtype))

    # Static quantization (inv_scale supplied by the caller)
    y, _ = ops.scaled_fp8_quant(x, inv_scale)
    torch.testing.assert_close(ref_y, per_tensor_dequantize(y, inv_scale, dtype))

    # Padding: output is padded to 17 rows; only the first 11 are data.
    y, _ = ops.scaled_fp8_quant(x, inv_scale, num_token_padding=17)
    assert y.shape[0] == 17
    torch.testing.assert_close(
        ref_y,
        per_tensor_dequantize(torch.narrow(y, 0, 0, x.shape[0]), inv_scale, dtype),
    )

    # non-contiguous input with padding: slice a wider tensor so the row
    # stride (padded_stride) exceeds the logical width (n).
    m, n, padded_stride = 975, 512, 576
    padded_tensor = (torch.randn(size=(m, padded_stride), device="cuda") * 13).to(dtype)
    x_nc = padded_tensor[:, :n]  # shape (m, n) with stride (padded_stride, 1)

    assert not x_nc.is_contiguous()
    assert x_nc.stride(0) == padded_stride

    # dynamic quantization
    ref_y_nc, inv_scale_nc = ops.scaled_fp8_quant(x_nc, None)
    ref_y_nc = per_tensor_dequantize(ref_y_nc, inv_scale_nc, dtype)

    # reference dynamic quantization
    y_nc = quantize_ref(x_nc, inv_scale_nc)
    torch.testing.assert_close(
        ref_y_nc, per_tensor_dequantize(y_nc, inv_scale_nc, dtype)
    )

    # static quantization
    y_nc, _ = ops.scaled_fp8_quant(x_nc, inv_scale_nc)
    torch.testing.assert_close(
        ref_y_nc, per_tensor_dequantize(y_nc, inv_scale_nc, dtype)
    )

    # padding after non-contiguous input quantization
    y_nc_pad, _ = ops.scaled_fp8_quant(x_nc, inv_scale_nc, num_token_padding=m + 10)
    assert y_nc_pad.shape[0] == m + 10
    torch.testing.assert_close(
        ref_y_nc,
        per_tensor_dequantize(
            torch.narrow(y_nc_pad, 0, 0, x_nc.shape[0]), inv_scale_nc, dtype
        ),
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method_cls", [Fp8LinearMethod, Fp8MoEMethod])
# FP8 weight reloading does not support online quantization
@pytest.mark.parametrize("is_checkpoint_fp8_serialized", [True])  # skip False
@pytest.mark.parametrize("weight_block_size", [None, [1, 1]])
# any postprocessing that is applied to the weights such as padding and repacking
# (excluding device sharding) must also be applied to the reloaded weights
#
# this is the case for marlin as well as per-tensor Fp8MoEMethod
@pytest.mark.parametrize("use_marlin", [False])  # skip True
def test_fp8_reloading(
    method_cls, is_checkpoint_fp8_serialized, weight_block_size, use_marlin, dist_init
):
    """Verify that FP8 layer weights can be loaded a second time after
    `process_weights_after_loading` has run.

    The key invariants checked: parameter names/shapes survive the first
    load+postprocess cycle, and each parameter's `weight_loader` is still
    the very object captured before the first load (identity check).
    """
    if is_checkpoint_fp8_serialized is False:
        pytest.skip("FP8 weight reloading does not support online quantization")

    if method_cls is Fp8MoEMethod and weight_block_size is None:
        pytest.skip(
            "FP8 Tensor weight reloading does not support fusing w13_weight_scale. "
            "If this is your use case, consider using a restore function like #26327"
        )

    with torch.device("cuda:0"):
        config = Fp8Config(
            is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized,
            weight_block_size=weight_block_size,
        )

        # Build the smallest possible layer for each method kind
        # (1x1 linear, or a single-expert MoE).
        if method_cls is Fp8LinearMethod:
            layer = torch.nn.Linear(1, 1)
            method = method_cls(config)
            method.create_weights(
                layer=layer,
                input_size_per_partition=1,
                output_partition_sizes=[1],
                input_size=1,
                output_size=1,
                params_dtype=torch.bfloat16,
                weight_loader=default_weight_loader,
            )

        else:
            layer = FusedMoE(
                num_experts=1,
                top_k=1,
                hidden_size=1,
                intermediate_size=1,
            )
            method = method_cls(config, layer)
            method.create_weights(
                layer=layer,
                num_experts=1,
                hidden_size=1,
                intermediate_size_per_partition=1,
                params_dtype=torch.bfloat16,
                weight_loader=default_weight_loader,
            )

        method.use_marlin = use_marlin

        # capture weights format during loading
        original_metadata = [
            (name, param.shape, getattr(param, "weight_loader", default_weight_loader))
            for name, param in layer.named_parameters()
        ]

        # test loading
        for name, shape, _ in original_metadata:
            param = getattr(layer, name)
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, torch.zeros(shape))  # cannot use empty

        method.process_weights_after_loading(layer)

        # test reloading works after loading
        # assuming that no reshaping occurred
        for name, shape, original_weight_loader in original_metadata:
            param = getattr(layer, name)
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            # identity, not equality: the loader must not have been replaced
            assert weight_loader is original_weight_loader
            weight_loader(param, torch.zeros(shape))  # cannot use empty

        method.process_weights_after_loading(layer)
|
||||
|
||||
81
tests/quantization/test_gptq_dynamic.py
Normal file
81
tests/quantization/test_gptq_dynamic.py
Normal file
@@ -0,0 +1,81 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests whether gptq models with dynamic quantized can be loaded.
|
||||
|
||||
Run `pytest tests/quantization/test_gptq_dynamic.py --forked`.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
|
||||
from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
|
||||
from vllm.model_executor.layers.quantization.gptq_marlin import GPTQMarlinLinearMethod
|
||||
from vllm.model_executor.layers.quantization.utils.gptq_utils import (
|
||||
get_dynamic_override,
|
||||
)
|
||||
|
||||
PROMPT = "On the surface of Mars, we found"
|
||||
|
||||
# The first layer is quantized using bits=4, group_size=128
|
||||
# The second layer is quantized using bits=8, group_size=32
|
||||
# All other layers (layer index >= 2) are not quantized
|
||||
MODEL_QUANT = [
|
||||
("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue", True),
|
||||
(
|
||||
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
|
||||
False,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id, use_marlin_kernel", MODEL_QUANT)
def test_gptq_with_dynamic(
    vllm_runner, model_id: str, use_marlin_kernel: bool, monkeypatch
):
    """Verify per-layer GPTQ `dynamic` overrides: layer 0 uses 4-bit/g128,
    layer 1 uses 8-bit/g32, and layers >= 2 are left unquantized.

    The sym=True checkpoint is expected to route to the Marlin kernel,
    the sym=False one to the plain GPTQ kernel (per MODEL_QUANT).
    """
    # `LLM.apply_model` requires pickling a function.
    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")

    linear_method_cls = (
        GPTQMarlinLinearMethod if use_marlin_kernel else (GPTQLinearMethod)
    )

    with vllm_runner(
        model_id, dtype=torch.float16, max_model_len=2048, enforce_eager=True
    ) as llm:

        def check_model(model):
            for name, submodule in model.named_modules():
                if name == "lm_head":
                    # These checkpoints quantize lm_head too.
                    assert isinstance(submodule.quant_method, linear_method_cls)
                elif name == "model.layers.0.self_attn.qkv_proj":
                    # The first layer is quantized using bits=4, group_size=128
                    # desc_act=True
                    assert isinstance(submodule.quant_method, linear_method_cls)
                    config = submodule.quant_method.quant_config
                    assert config.weight_bits == 4
                    assert config.group_size == 128
                    assert config.desc_act
                elif name == "model.layers.1.self_attn.qkv_proj":
                    # The second layer is quantized using bits=8, group_size=32
                    # desc_act=False; read these through the dynamic-override
                    # helper since they differ from the global config.
                    assert isinstance(submodule.quant_method, linear_method_cls)
                    config = submodule.quant_method.quant_config
                    assert (
                        get_dynamic_override(config, layer_name=name, key="bits") == 8
                    )
                    assert (
                        get_dynamic_override(config, layer_name=name, key="group_size")
                        == 32
                    )
                    assert not get_dynamic_override(
                        config, layer_name=name, key="desc_act"
                    )
                elif (
                    name == "model.layers.2.self_attn.qkv_proj"
                    or name == "model.layers.2.mlp.gate_up_proj"
                ):
                    # All other layers (layer index >= 2) are not quantized
                    assert isinstance(submodule.quant_method, UnquantizedLinearMethod)

        llm.apply_model(check_model)
|
||||
109
tests/quantization/test_gptq_v2.py
Normal file
109
tests/quantization/test_gptq_v2.py
Normal file
@@ -0,0 +1,109 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests whether vllm correctly load and run gptq_v2 format checkpoints.
|
||||
|
||||
Run `pytest tests/quantization/test_gptq_v2.py --forked`.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm import SamplingParams
|
||||
from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
|
||||
|
||||
# A dummy small model quantized by GPTQModel, stored in GPTQ v2 format
|
||||
MODELS = ["XXXXyu/Qwen3-1.7B-w2g64-gptq_v2"]
|
||||
|
||||
# Generate multiple sequences for testing, because an 1.7B 2-bit model
|
||||
# cannot always generate normal texts.
|
||||
N_SEQ = 5
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", MODELS)
def test_model_load(vllm_runner, model_id, monkeypatch):
    """Verify a gptq_v2-format checkpoint loads with the default GPTQ
    linear method and that the v2 checkpoint format flag is propagated.

    BUG FIX: the check callback previously shadowed `model_id` with the
    model object and compared module names against
    "model_id.layers.0.self_attn.qkv_proj". vLLM module names use the
    "model." prefix (see `test_gptq_with_dynamic`), so the condition
    never matched and the assertions were silently skipped. Fixed the
    prefix, renamed the parameter, and added a `checked` flag so a
    future naming change fails loudly instead of silently.
    """
    # `LLM.apply_model` requires pickling a function.
    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")

    # Only check the default GPTQ linear method (used for 2/3-bit models).
    # 4/8-bit linear methods like Marlin already support gptq_v2.
    linear_method_cls = GPTQLinearMethod

    with vllm_runner(model_id, dtype=torch.float16, max_model_len=512) as llm:

        def check_model(model):
            checked = False
            for name, submodule in model.named_modules():
                # Could check more modules if necessary
                if name == "model.layers.0.self_attn.qkv_proj":
                    assert isinstance(submodule.quant_method, linear_method_cls)

                    config = submodule.quant_method.quant_config
                    assert config.checkpoint_format == "gptq_v2"
                    assert submodule.quant_method.use_v2_format

                    checked = True
                    # Just break since currently we only check 1 module
                    break
            assert checked, "qkv_proj module not found; did module naming change?"

        # Check if gptq_v2 format is correctly loaded
        llm.apply_model(check_model)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", MODELS)
def test_model_inference(vllm_runner, model_id):
    """Sample N_SEQ completions from a 2-bit gptq_v2 model and check that
    at least one generated sequence looks like natural text.

    BUG FIX: `has_normal_char_distribution` previously returned False as
    soon as it saw one too-short sequence, vetoing the whole batch even
    when other sequences were fine — contradicting its own "at least 1
    normal text should exist" rule. A short sequence is now skipped
    (`continue`) instead. The loop variable is also renamed so it no
    longer shadows the outer `text`.
    """
    # Prepare prompt to test the model's generation result.
    prompt = "What is the meaning of life?"
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,  # If thinking model, set it to false
    )
    sampling_params = SamplingParams(
        n=N_SEQ,
        max_tokens=128,
        temperature=0.7,
        top_p=0.8,
        top_k=20,
        min_p=0,
        presence_penalty=2,
    )

    with vllm_runner(model_id, dtype=torch.float16, max_model_len=512) as llm:
        # Generate a response to verify inference correctness
        output = llm.generate(text, sampling_params)

        # Make sure the output exists
        assert output
        assert output[0][1]
        assert len(output[0][1]) == N_SEQ

        def has_normal_char_distribution(texts, min_len):
            """Return True if at least one sequence in `texts` is mostly
            letters with reasonable spacing (i.e. looks like prose)."""
            for seq in texts:
                # Response too short: skip it, another sequence may still
                # satisfy the "at least 1 normal text" requirement.
                if len(seq) < min_len:
                    continue

                # Basic ratio checks
                letters = sum(c.isalpha() for c in seq)
                spaces = sum(c.isspace() for c in seq)
                total = len(seq)

                letter_ratio = letters / total
                space_ratio = spaces / total

                # At least 1 normal text should exist within output sequences
                # Normal text should be mostly letters with reasonable spacing
                # Some magic numbers, could be adjusted
                if 0.5 <= letter_ratio <= 0.9 and 0.01 <= space_ratio <= 0.3:
                    return True
            # No sequence contains normal text, output might be broken
            return False

        # Apply some simple checks for gibberish output
        # Print the output sequences if failed
        assert has_normal_char_distribution(output[0][1], 5), output[0][1]
|
||||
32
tests/quantization/test_ipex_quant.py
Normal file
32
tests/quantization/test_ipex_quant.py
Normal file
@@ -0,0 +1,32 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Test model set-up and inference for quantized HF models supported
|
||||
on the CPU/GPU backend using IPEX (including AWQ/GPTQ).
|
||||
|
||||
Validating the configuration and printing results for manual checking.
|
||||
|
||||
Run `pytest tests/quantization/test_ipex_quant.py`.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
MODELS = [
|
||||
"AMead10/Llama-3.2-1B-Instruct-AWQ",
|
||||
"shuyuej/Llama-3.2-1B-Instruct-GPTQ", # with g_idx
|
||||
]
|
||||
DTYPE = ["bfloat16"]
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not (current_platform.is_cpu() or current_platform.is_xpu()),
    reason="only supports Intel CPU/XPU backend.",
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", DTYPE)
def test_ipex_quant(vllm_runner, model, dtype):
    """Smoke-test IPEX-backed AWQ/GPTQ checkpoints: load the model,
    greedily generate a few tokens, and print the result for manual
    inspection."""
    with vllm_runner(model, dtype=dtype, enforce_eager=True) as llm:
        result = llm.generate_greedy(["The capital of France is"], max_tokens=4)
        assert result
        print(result)
|
||||
52
tests/quantization/test_lm_head.py
Normal file
52
tests/quantization/test_lm_head.py
Normal file
@@ -0,0 +1,52 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests whether gptq models with quantized lm_head can be loaded.
|
||||
|
||||
Run `pytest tests/quantization/test_quant_lm_head_true.py --forked`.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
|
||||
from vllm.model_executor.layers.quantization.gptq_marlin import GPTQMarlinLinearMethod
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
||||
UnquantizedEmbeddingMethod,
|
||||
)
|
||||
|
||||
PROMPT = "On the surface of Mars, we found"
|
||||
|
||||
MODELS_QUANT = [
|
||||
("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head", True),
|
||||
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", False),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id, lm_head_quantized", MODELS_QUANT)
def test_lm_head(
    vllm_runner,
    model_id: str,
    lm_head_quantized: bool,
    monkeypatch,
) -> None:
    """Check that lm_head gets a GPTQ quant method when the checkpoint
    quantizes it, and stays unquantized otherwise."""
    # `LLM.apply_model` requires pickling a function.
    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
    with vllm_runner(
        model_id, dtype=torch.float16, max_model_len=2048, enforce_eager=True
    ) as vllm_model:

        def check_model(model):
            # Pick the expected quant-method class(es) up front, then make
            # a single isinstance assertion.
            if lm_head_quantized:
                expected = (GPTQLinearMethod, GPTQMarlinLinearMethod)
            else:
                expected = UnquantizedEmbeddingMethod
            assert isinstance(model.lm_head.quant_method, expected)

        vllm_model.apply_model(check_model)

        print(vllm_model.generate_greedy(["Hello my name is"], max_tokens=4)[0][1])
|
||||
69
tests/quantization/test_mixed_precision.py
Executable file
69
tests/quantization/test_mixed_precision.py
Executable file
@@ -0,0 +1,69 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Test quark-quantized {MXFP4, FP8} mixed precision models.
|
||||
|
||||
Run `pytest tests/quantization/test_mixed_precision.py`.
|
||||
|
||||
"""
|
||||
|
||||
import importlib
|
||||
import importlib.metadata
|
||||
from dataclasses import dataclass
|
||||
|
||||
import lm_eval
|
||||
import pytest
|
||||
from packaging import version
|
||||
|
||||
QUARK_MXFP4_AVAILABLE = importlib.util.find_spec("quark") is not None and version.parse(
|
||||
importlib.metadata.version("amd-quark")
|
||||
) >= version.parse("0.8.99")
|
||||
|
||||
|
||||
@dataclass
class ModelCase:
    """A model under test paired with its tensor-parallel degree."""

    # HF model identifier
    model_id: str
    # tensor-parallel size; NOTE(review): not referenced by the tests
    # visible in this file — confirm whether it is still needed.
    tp: int
|
||||
|
||||
|
||||
@dataclass
class EvaluationConfig:
    """Builds the lm-eval `model_args` string for one vLLM model."""

    # HF model identifier passed through as lm-eval's `pretrained=`.
    model_name: str

    def get_model_args(self, tp_size: int = 4) -> str:
        """Return the comma-separated lm-eval `model_args` string.

        Args:
            tp_size: tensor-parallel size. Defaults to 4, the value that
                was previously hard-coded, so existing callers that pass
                no argument are unaffected.
        """
        return (
            f"pretrained={self.model_name},"
            f"tensor_parallel_size={tp_size},"
            "dtype=auto,gpu_memory_utilization=0.8,trust_remote_code=False"
        )
|
||||
|
||||
|
||||
TEST_CONFIGS = {
|
||||
# Mixed-precision (AMP) model
|
||||
# - Demonstrates end-to-end pipeline functionality
|
||||
"amd/Qwen3-8B-WMXFP4FP8-AMXFP4FP8-AMP-KVFP8": {"arc_challenge": 0.52, "mmlu": 0.72},
|
||||
# Non-mixed-precision (PTQ) model
|
||||
# - Reference for pipeline compatibility verification -> No conflicts or breakings
|
||||
"amd/Llama-2-70b-chat-hf-FP8-MLPerf-fp8_attn_quark_format": {
|
||||
"arc_challenge": 0.53,
|
||||
"mmlu": 0.61,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_name, accuracy_numbers", TEST_CONFIGS.items())
@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE, reason="amd-quark>=0.9 is not available")
def test_mixed_precision_model_accuracies(model_name: str, accuracy_numbers: dict):
    """Run lm-eval on each model and require the measured accuracy of
    every task to lie strictly within +/- 0.05 of its reference value."""
    eval_results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=EvaluationConfig(model_name).get_model_args(),
        tasks=list(accuracy_numbers.keys()),
        batch_size=8,
    )

    tolerance = 0.05
    for task, expect_accuracy in accuracy_numbers.items():
        measured_accuracy = eval_results["results"][task]["acc,none"]
        # |measured - expected| < tol  <=>  expected in (measured-tol, measured+tol)
        assert abs(measured_accuracy - expect_accuracy) < tolerance, (
            f"Expected: {expect_accuracy} | Measured: {measured_accuracy}"
        )
|
||||
93
tests/quantization/test_modelopt.py
Normal file
93
tests/quantization/test_modelopt.py
Normal file
@@ -0,0 +1,93 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Test ModelOpt quantization method setup and weight loading.
|
||||
|
||||
Run `pytest tests/quantization/test_modelopt.py`.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
def enable_pickle(monkeypatch):
    """`LLM.apply_model` requires pickling a function."""
    # apply_model ships the callback to worker processes; allow
    # pickle-based serialization for the duration of each test.
    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not is_quant_method_supported("modelopt"),
    reason="ModelOpt FP8 is not supported on this GPU type.",
)
def test_modelopt_fp8_checkpoint_setup(vllm_runner):
    """Test ModelOpt FP8 checkpoint loading and structure validation.

    For each linear projection of the first decoder layer, verifies:
    the quant method is ModelOptFp8LinearMethod, the weight is stored in
    FP8, and float32 weight/input scales are attached.

    IMPROVEMENT: the four identical copy-pasted assertion stanzas
    (qkv/o/gate_up/down) are replaced by a loop with failure messages
    that name the offending projection.
    """
    # TODO: provide a small publicly available test checkpoint
    model_path = (
        "/home/scratch.omniml_data_1/zhiyu/ckpts/test_ckpts/"
        "TinyLlama-1.1B-Chat-v1.0-fp8-0710"
    )

    # Skip test if checkpoint doesn't exist
    if not os.path.exists(model_path):
        pytest.skip(
            f"Test checkpoint not found at {model_path}. "
            "This test requires a local ModelOpt FP8 checkpoint."
        )

    with vllm_runner(model_path, quantization="modelopt", enforce_eager=True) as llm:

        def check_model(model):
            # Check that ModelOpt quantization method is properly applied
            from vllm.model_executor.layers.quantization.modelopt import (
                ModelOptFp8LinearMethod,
            )

            layer = model.model.layers[0]
            projections = {
                "qkv_proj": layer.self_attn.qkv_proj,
                "o_proj": layer.self_attn.o_proj,
                "gate_up_proj": layer.mlp.gate_up_proj,
                "down_proj": layer.mlp.down_proj,
            }
            for proj_name, proj in projections.items():
                assert isinstance(proj.quant_method, ModelOptFp8LinearMethod), proj_name
                # Check weight dtype is FP8
                assert proj.weight.dtype == torch.float8_e4m3fn, proj_name
                # Check scales are present and have correct dtype
                assert hasattr(proj, "weight_scale"), proj_name
                assert hasattr(proj, "input_scale"), proj_name
                assert proj.weight_scale.dtype == torch.float32, proj_name
                assert proj.input_scale.dtype == torch.float32, proj_name

        llm.apply_model(check_model)

        # Run a simple generation test to ensure the model works
        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
        assert output
        print(f"ModelOpt FP8 output: {output}")
|
||||
70
tests/quantization/test_ptpc_fp8.py
Normal file
70
tests/quantization/test_ptpc_fp8.py
Normal file
@@ -0,0 +1,70 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests whether PTPC w8a8 FP8 computation is enabled correctly.
|
||||
|
||||
Run `pytest tests/quantization/test_ptpc_fp8.py --forked`.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
from vllm.model_executor.layers.quantization.fp8 import Fp8KVCacheMethod
|
||||
from vllm.model_executor.layers.quantization.ptpc_fp8 import PTPCFp8LinearMethod
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
UNSUPPORTED_STR = (
|
||||
"Currently torch._scaled_mm (hipBLASLt) rowwise gemm only "
|
||||
"support output dtype of bfloat16. torch.float16 is specified."
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
def enable_pickle(monkeypatch):
    """`LLM.apply_model` requires pickling a function."""
    # apply_model ships the callback to worker processes; allow
    # pickle-based serialization for the duration of each test.
    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not is_quant_method_supported("ptpc_fp8"),
    reason="PTPC FP8 is not supported on this GPU type.",
)
@pytest.mark.skipif(not current_platform.is_rocm(), reason="This test is for ROCm GPU.")
@pytest.mark.parametrize("dtype", ["auto", "bfloat16", "float16"])
@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8", "fp8_e4m3"])
def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None:
    """Load opt-125m with PTPC (per-token-per-channel) FP8 on ROCm and
    verify the linear and KV-cache quant methods.

    float16 is expected to fail model construction with a known
    assertion (hipBLASLt rowwise gemm only supports bf16 output); that
    exact message is treated as a pass.

    BUG FIX: the KV-cache branch previously tested
    `kv_cache_dtype == "ptpc_fp8"`, a value that is never parametrized
    ("auto"/"fp8"/"fp8_e4m3"), so those assertions were dead code. It
    now matches the fp8 kv-cache dtypes, mirroring `test_load_fp16_model`.
    """
    try:
        llm = vllm_runner(
            "facebook/opt-125m",
            dtype=dtype,
            quantization="ptpc_fp8",
            enforce_eager=True,
            kv_cache_dtype=kv_cache_dtype,
        )
    except AssertionError as e:
        if str(e) == UNSUPPORTED_STR:
            # If the error message matches, the test passes
            return
        else:
            # If the error message does not match, re-raise the exception
            raise

    with llm:

        def check_model(model):
            fc1 = model.model.decoder.layers[0].fc1
            assert isinstance(fc1.quant_method, PTPCFp8LinearMethod)
            if kv_cache_dtype in ("fp8", "fp8_e4m3"):
                attn = model.model.decoder.layers[0].self_attn.attn
                assert isinstance(attn.quant_method, Fp8KVCacheMethod)
                # Online quantization: default (uncalibrated) scales.
                assert attn._k_scale == 1.0
                assert attn._v_scale == 1.0

            if current_platform.has_device_capability(94):
                # For GPUs with hardware support, we keep weights in fp8
                assert fc1.weight.dtype == torch.float8_e4m3fnuz

        llm.apply_model(check_model)

        output = llm.generate_greedy("Hello my name is", max_tokens=4)
        assert output
|
||||
341
tests/quantization/test_quark.py
Normal file
341
tests/quantization/test_quark.py
Normal file
@@ -0,0 +1,341 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Test model set-up and weight loading for quark-quantized models.
|
||||
|
||||
Run `pytest tests/quantization/test_quark.py`.
|
||||
|
||||
See also `tests/kernels/moe/test_ocp_mx_moe.py`.
|
||||
"""
|
||||
|
||||
import importlib.metadata
|
||||
from dataclasses import dataclass
|
||||
from importlib.util import find_spec
|
||||
|
||||
import huggingface_hub
|
||||
import lm_eval
|
||||
import pytest
|
||||
import torch
|
||||
from packaging import version
|
||||
|
||||
from vllm.model_executor.layers.quantization.quark.quark import ( # noqa: E501
|
||||
QuarkLinearMethod,
|
||||
QuarkW8A8Fp8,
|
||||
QuarkW8A8Int8,
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from .reference_mxfp4 import dq_mxfp4_torch, qdq_mxfp4_torch
|
||||
|
||||
QUARK_MXFP4_AVAILABLE = find_spec("quark") is not None and version.parse(
|
||||
importlib.metadata.version("amd-quark")
|
||||
) >= version.parse("0.8.99")
|
||||
|
||||
if QUARK_MXFP4_AVAILABLE:
|
||||
from quark.torch.export.nn.modules.realquantizer import StaticScaledRealQuantizer
|
||||
from quark.torch.kernel import mx as mx_kernel
|
||||
from quark.torch.quantization.config.config import FP4PerGroupSpec
|
||||
|
||||
try:
|
||||
huggingface_hub.list_repo_refs(
|
||||
"amd/Llama-3.3-70B-Instruct-WMXFP4-AMXFP4-KVFP8-Scale-UINT8-SQ"
|
||||
)
|
||||
HF_HUB_AMD_ORG_ACCESS = True
|
||||
except huggingface_hub.errors.RepositoryNotFoundError:
|
||||
HF_HUB_AMD_ORG_ACCESS = False
|
||||
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
def enable_pickle(monkeypatch):
    """`LLM.apply_model` requires pickling a function."""
    # apply_model ships the callback to worker processes; allow
    # pickle-based serialization for the duration of each test.
    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
@pytest.mark.parametrize("tp", [1])
def test_quark_fp8_w_per_tensor_a_per_tensor(vllm_runner, kv_cache_dtype, tp):
    """Load a Quark FP8 checkpoint with per-tensor weight and activation
    scales and verify scheme selection and weight/scale layout."""
    model_path = "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test"
    with vllm_runner(
        model_path,
        enforce_eager=True,
        kv_cache_dtype=kv_cache_dtype,
        tensor_parallel_size=tp,
    ) as llm:

        def check_model(model):
            layer = model.model.layers[0]

            qkv_proj = layer.self_attn.qkv_proj

            assert isinstance(qkv_proj.quant_method, QuarkLinearMethod)
            assert isinstance(qkv_proj.scheme, QuarkW8A8Fp8)

            if isinstance(qkv_proj.scheme, QuarkW8A8Fp8):
                # Per-tensor scales are scalar (0-dim) tensors.
                assert len(qkv_proj.input_scale.shape) == 0
                assert qkv_proj.weight.dtype is current_platform.fp8_dtype()
                assert len(qkv_proj.weight_scale.shape) == 0

        llm.apply_model(check_model)

        output = llm.generate_greedy("Hello my name is", max_tokens=4)
        assert output
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tp", [1])
def test_quark_fp8_w_per_channel_a_per_token(vllm_runner, tp):
    """Load a Quark FP8 checkpoint with per-channel weight scales
    (PTPC-style) and verify the scale tensor shape matches the weight's
    output-channel dimension."""
    model_path = "amd/Qwen2.5-1.5B-Instruct-ptpc-Quark-ts"
    with vllm_runner(model_path, enforce_eager=True, tensor_parallel_size=tp) as llm:

        def check_model(model):
            layer = model.model.layers[0]

            qkv_proj = layer.self_attn.qkv_proj

            assert isinstance(qkv_proj.quant_method, QuarkLinearMethod)
            assert isinstance(qkv_proj.scheme, QuarkW8A8Fp8)

            if isinstance(qkv_proj.scheme, QuarkW8A8Fp8):
                assert qkv_proj.weight.dtype is current_platform.fp8_dtype()
                # One scale per channel: shape (channels, 1).
                # NOTE(review): this indexes weight.shape[1] — assumes the
                # weight is stored transposed; confirm against the scheme.
                assert qkv_proj.weight_scale.shape[0] == qkv_proj.weight.shape[1]
                assert qkv_proj.weight_scale.shape[1] == 1

        llm.apply_model(check_model)

        output = llm.generate_greedy("Hello my name is", max_tokens=4)
        assert output
|
||||
|
||||
|
||||
@pytest.mark.parametrize("tp", [1])
def test_quark_int8_w_per_tensor_a_per_tensor(vllm_runner, tp):
    """Smoke-test a Quark INT8 per-tensor weight/activation checkpoint:
    the INT8 scheme must be selected and greedy generation must run."""
    checkpoint = "amd/Llama-3.1-8B-Instruct-w-int8-a-int8-sym-test"

    def verify(model):
        qkv = model.model.layers[0].self_attn.qkv_proj
        assert isinstance(qkv.quant_method, QuarkLinearMethod)
        assert isinstance(qkv.scheme, QuarkW8A8Int8)

    with vllm_runner(checkpoint, enforce_eager=True, tensor_parallel_size=tp) as llm:
        llm.apply_model(verify)
        assert llm.generate_greedy("Hello my name is", max_tokens=4)
|
||||
|
||||
|
||||
def test_quark_fp8_parity(vllm_runner):
    """Verify that the same checkpoint quantized via the `quark` and plain
    `fp8` quant methods produces byte-identical state dicts."""
    quark_model_id = "amd-quark/llama-tiny-fp8-quark-quant-method"
    fp8_model_id = "amd-quark/llama-tiny-fp8-quant-method"

    # Both engines are tiny; keep memory usage low so they fit together.
    llm_kwargs = {
        "tensor_parallel_size": 1,
        "enforce_eager": True,
        "gpu_memory_utilization": 0.1,
    }
    with (
        vllm_runner(quark_model_id, **llm_kwargs) as quark_handle,
        vllm_runner(fp8_model_id, **llm_kwargs) as fp8_handle,
    ):

        def _snapshot_cpu(model):
            # Move everything to CPU so tensors can be compared host-side.
            return {name: t.cpu() for name, t in model.state_dict().items()}

        (quark_state_dict,) = quark_handle.apply_model(_snapshot_cpu)
        (fp8_state_dict,) = fp8_handle.apply_model(_snapshot_cpu)

        assert fp8_state_dict.keys() == quark_state_dict.keys()

        for key in fp8_state_dict:
            assert torch.equal(fp8_state_dict[key], quark_state_dict[key])
|
||||
|
||||
|
||||
@dataclass
|
||||
class AccuracyTestConfig:
|
||||
model_name: str
|
||||
excepted_value: float
|
||||
|
||||
def get_model_args(
|
||||
self,
|
||||
tp_size: int,
|
||||
model_max_len: int | None = None,
|
||||
kwargs: dict | None = None,
|
||||
) -> dict:
|
||||
if kwargs is None:
|
||||
kwargs = {}
|
||||
|
||||
model_args = {
|
||||
"pretrained": self.model_name,
|
||||
"dtype": "auto",
|
||||
"add_bos_token": True,
|
||||
"tensor_parallel_size": tp_size,
|
||||
"gpu_memory_utilization": 0.7,
|
||||
**kwargs,
|
||||
}
|
||||
if model_max_len is not None:
|
||||
model_args["max_model_len"] = model_max_len
|
||||
|
||||
return model_args
|
||||
|
||||
|
||||
# Accuracy baselines for the GSM8K exact-match test.
GSM8K_ACCURACY_CONFIGS = [
    # Private model.
    AccuracyTestConfig(
        model_name="amd/DeepSeek-R1-WMXFP4-AMXFP4-Scale-UINT8-MoE-Quant",
        excepted_value=0.96,
    ),
]

# Perplexity baselines for the wikitext test (lower is better).
WIKITEXT_ACCURACY_CONFIGS = [
    AccuracyTestConfig(
        model_name="fxmarty/qwen1.5_moe_a2.7b_chat_w_fp4_a_fp6_e2m3",
        excepted_value=11.3,
    ),
    AccuracyTestConfig(
        model_name="fxmarty/qwen1.5_moe_a2.7b_chat_w_fp6_e3m2_a_fp6_e3m2",
        excepted_value=10.6,
    ),
    AccuracyTestConfig(
        model_name="fxmarty/qwen_1.5-moe-a2.7b-mxfp4",
        excepted_value=12.4,
    ),
]
|
||||
|
||||
|
||||
@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE, reason="amd-quark>=0.9 is not available")
@pytest.mark.parametrize("config", WIKITEXT_ACCURACY_CONFIGS)
@pytest.mark.parametrize("tp_size", [1, 2])
def test_ocp_mx_wikitext_correctness(config: AccuracyTestConfig, tp_size: int):
    """Check wikitext perplexity of OCP MX checkpoints against the baseline."""
    if torch.cuda.device_count() < tp_size:
        pytest.skip(
            f"This test requires >={tp_size} gpus, got only {torch.cuda.device_count()}"
        )

    task = "wikitext"
    rtol = 0.1

    # Smaller cudagraph_capture_sizes to speed up the test.
    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=config.get_model_args(
            tp_size=tp_size, kwargs={"cudagraph_capture_sizes": [16]}
        ),
        tasks=task,
        batch_size=64,
    )

    EXPECTED_VALUE = config.excepted_value
    measured_value = results["results"][task]["word_perplexity,none"]
    # Perplexity must land strictly inside the +/- rtol band.
    assert abs(measured_value - EXPECTED_VALUE) < rtol, (
        f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("config", GSM8K_ACCURACY_CONFIGS)
@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE, reason="amd-quark>=0.9 is not available")
@pytest.mark.skipif(
    not HF_HUB_AMD_ORG_ACCESS,
    reason="Read access to huggingface.co/amd is required for this test.",
)
def test_mxfp4_gsm8k_correctness(config: AccuracyTestConfig):
    """Check GSM8K exact-match accuracy of an MXFP4 checkpoint (needs 8 GPUs)."""
    if torch.cuda.device_count() < 8:
        pytest.skip(
            f"This test requires >=8 gpus, got only {torch.cuda.device_count()}"
        )

    task = "gsm8k"
    rtol = 0.03

    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=config.get_model_args(tp_size=8, model_max_len=38768),
        tasks=task,
        batch_size=64,
        num_fewshot=8,
    )

    EXPECTED_VALUE = config.excepted_value
    measured_value = results["results"][task]["exact_match,strict-match"]
    # Accuracy must land strictly inside the +/- rtol band.
    assert abs(measured_value - EXPECTED_VALUE) < rtol, (
        f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
    )
|
||||
|
||||
|
||||
@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE, reason="amd-quark>=0.9 is not available")
@pytest.mark.parametrize("float_dtype", [torch.bfloat16, torch.float16])
@pytest.mark.parametrize("scalings", [[2.3, 0.03, 7.3, 0.1, 0.004, 17.3, 1e4, 1e-4]])
def test_mxfp4_fused_qdq_match_quark(float_dtype: torch.dtype, scalings: list[float]):
    """Verify the fused HIP MXFP4 quantize-dequantize kernel matches the
    torch reference implementation group by group.

    Fix: the `scalings` annotation was `list[int]` but the parametrized
    values are floats; corrected to `list[float]`.
    """
    torch.manual_seed(0)

    hidden_size = 64 * 32
    # Uniform input in [-1, 1), then rescale each 32-wide MX group
    # differently so per-group scales are exercised.
    inp = (torch.rand(1, hidden_size, dtype=float_dtype, device="cuda") - 0.5) * 2
    for i in range(hidden_size // 32):
        inp[:, i * 32 : (i + 1) * 32] = (
            inp[:, i * 32 : (i + 1) * 32] * scalings[i % len(scalings)]
        )

    inp_kernel = inp.clone()
    inp_kernel_clone = inp_kernel.clone()

    res_hip = mx_kernel.qdq_mxfp4_hip(inp_kernel_clone, "even")
    res_torch = qdq_mxfp4_torch(inp_kernel, "even")

    # Compare per group so a failure pinpoints the offending scale.
    for i in range(hidden_size // 32):
        assert torch.all(torch.isfinite(res_hip[:, i * 32 : (i + 1) * 32]))
        assert torch.all(torch.isfinite(res_torch[:, i * 32 : (i + 1) * 32]))

        torch.testing.assert_close(
            res_hip[:, i * 32 : (i + 1) * 32], res_torch[:, i * 32 : (i + 1) * 32]
        )
|
||||
|
||||
|
||||
@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE, reason="amd-quark>=0.9 is not available")
@pytest.mark.parametrize("float_dtype", [torch.bfloat16, torch.float16])
@pytest.mark.parametrize("scalings", [[2.3, 0.03, 7.3, 0.1, 0.004, 17.3, 1e4, 1e-4]])
def test_mxfp4_dequant_kernel_match_quark(
    float_dtype: torch.dtype, scalings: list[float]
):
    """Verify the HIP MXFP4 dequantization kernel exactly matches the torch
    reference on real-quantized Quark weights.

    Fix: the `scalings` annotation was `list[int]` but the parametrized
    values are floats; corrected to `list[float]`.
    """
    qspec = FP4PerGroupSpec(
        ch_axis=-1,
        group_size=32,
        scale_format="e8m0",
        scale_calculation_mode="even",
        is_dynamic=False,
    ).to_quantization_spec()

    weight_quantizer = StaticScaledRealQuantizer(
        qspec=qspec,
        quantizer=None,
        reorder=False,
        real_quantized=True,
        float_dtype=float_dtype,
        device="cuda",
    )

    observer = qspec.observer_cls(qspec, device="cuda")

    hidden_size = 512
    shape = (11008, hidden_size)

    w = (torch.rand(shape, device="cuda", dtype=float_dtype) - 0.5) * 2

    # Make it so that different groups have different scales.
    for i in range(hidden_size // 32):
        w[:, i * 32 : (i + 1) * 32] = (
            w[:, i * 32 : (i + 1) * 32] * scalings[i % len(scalings)]
        )

    # Calibrate scales from the weight, then real-quantize with them.
    observer(w)
    scale, _ = observer._calculate_qparams()
    weight_quantizer.scale = scale

    w_mxfp4 = weight_quantizer.to_real_quantize_params(w).to("cuda")
    weight_quantizer.maybe_convert_and_transpose_scale()

    scale = weight_quantizer.scale

    out_hip = mx_kernel.dq_mxfp4_hip(w_mxfp4, scale, float_dtype)

    out_torch = dq_mxfp4_torch(w_mxfp4, scale, float_dtype)

    # Dequantization is expected to be bit-exact between the two paths.
    assert torch.equal(out_hip, out_torch)
|
||||
146
tests/quantization/test_register_quantization_config.py
Normal file
146
tests/quantization/test_register_quantization_config.py
Normal file
@@ -0,0 +1,146 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests register custom quantization config.
|
||||
|
||||
See https://github.com/vllm-project/vllm/issues/11926 for more details.
|
||||
|
||||
Run `pytest tests/quantization/test_register_quantization_config.py`.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from vllm.model_executor.layers.linear import (
|
||||
LinearBase, # noqa: E501
|
||||
UnquantizedLinearMethod,
|
||||
)
|
||||
from vllm.model_executor.layers.quantization import (
|
||||
QuantizationMethods,
|
||||
get_quantization_config,
|
||||
register_quantization_config,
|
||||
)
|
||||
from vllm.model_executor.layers.quantization.base_config import (
|
||||
QuantizationConfig, # noqa: E501
|
||||
)
|
||||
|
||||
|
||||
class FakeQuantLinearMethod(UnquantizedLinearMethod):
    """Fake-quantization linear method: dynamically quantize-dequantize the
    activation to `num_bits` before running the unquantized linear layer."""

    def __init__(self, num_bits: int = 8) -> None:
        """Store the bit width used for fake quantization."""
        super().__init__()
        self.num_bits = num_bits

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Fake-quantize `x` per token, then apply the linear layer."""
        # Dynamic range over dims (0, -1), spread across 2**n - 1 levels.
        hi = torch.amax(x, dim=(0, -1), keepdims=True)
        lo = torch.amin(x, dim=(0, -1), keepdims=True)
        scales = (hi - lo) / (2**self.num_bits - 1)

        # Signed n-bit clamp bounds.
        q_min = -(2 ** (self.num_bits - 1))
        q_max = 2 ** (self.num_bits - 1) - 1
        quantized = torch.round(x / scales).clamp_(q_min, q_max)

        return F.linear(quantized * scales, layer.weight, bias)
|
||||
|
||||
|
||||
@register_quantization_config("custom_quant")
class CustomQuantConfig(QuantizationConfig):
    """Custom quantization config for per-token dynamic fake quantization."""

    def __init__(self, num_bits: int = 8) -> None:
        """Remember the fake-quantization bit width."""
        super().__init__()
        self.num_bits = num_bits

    def get_name(self) -> QuantizationMethods:
        """Name of the quantization method."""
        return "custom_quant"

    def get_supported_act_dtypes(self) -> list[torch.dtype]:
        """Activation dtypes this method supports."""
        return [torch.float16, torch.bfloat16]

    @classmethod
    def get_min_capability(cls) -> int:
        """Minimum GPU capability; -1 means no restriction."""
        return -1

    @staticmethod
    def get_config_filenames() -> list[str]:
        """No config files are read from the model directory."""
        return []

    @classmethod
    def from_config(cls, config: dict[str, Any]) -> "CustomQuantConfig":
        """Build the config from the model's quantization config dict."""
        return CustomQuantConfig(num_bits=config.get("num_bits", 8))

    def get_quant_method(
        self, layer: torch.nn.Module, prefix: str
    ) -> FakeQuantLinearMethod | None:
        """Return the fake-quant method for linear layers, else None."""
        if isinstance(layer, LinearBase):
            return FakeQuantLinearMethod(num_bits=self.num_bits)
        return None
|
||||
|
||||
|
||||
def test_register_quantization_config(caplog_vllm):
    """Registering, looking up, and re-registering a custom quant config."""
    # Registration happened at import time via the class decorator.
    assert get_quantization_config("custom_quant") == CustomQuantConfig

    # Re-registering an existing method must warn rather than fail.
    with caplog_vllm.at_level(logging.WARNING):
        register_quantization_config("custom_quant")(CustomQuantConfig)

    assert any(
        "The quantization method 'custom_quant' already exists" in message
        for message in caplog_vllm.messages
    ), "Expected a warning when re-registering custom_quant"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    argnames="model",
    argvalues=[
        "meta-llama/Llama-3.2-1B-Instruct",
    ],
)
def test_custom_quant(vllm_runner, model, monkeypatch):
    """Test infer with the custom quantization method."""
    # `LLM.apply_model` requires pickling a function.
    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")

    with vllm_runner(
        model_name=model, quantization="custom_quant", enforce_eager=True
    ) as llm:

        def _assert_fake_quant(model):
            qkv_proj = model.model.layers[0].self_attn.qkv_proj
            # The custom config must have installed FakeQuantLinearMethod.
            assert isinstance(qkv_proj.quant_method, FakeQuantLinearMethod)

        llm.apply_model(_assert_fake_quant)

        generated = llm.generate_greedy("Hello my name is", max_tokens=1)
        assert generated
|
||||
35
tests/quantization/test_rtn.py
Normal file
35
tests/quantization/test_rtn.py
Normal file
@@ -0,0 +1,35 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# Copyright © 2025, Oracle and/or its affiliates.
|
||||
"""Tests RTN quantization startup and generation,
|
||||
doesn't test correctness
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.quantization.utils import is_quant_method_supported
|
||||
|
||||
# Models exercised by the RTN startup test.
MODELS = [
    "ai21labs/Jamba-tiny-dev",  # MoE model
]
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not is_quant_method_supported("rtn"),
    reason="RTN is not supported on this GPU type.",
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [10])
def test_model_rtn_startup(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Smoke-test that an RTN-quantized model starts up and generates.

    NOTE(review): `hf_runner` is requested but unused in the body —
    presumably kept for fixture side effects; confirm before removing.
    """
    with vllm_runner(
        model, enforce_eager=True, dtype=dtype, quantization="rtn"
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
400
tests/quantization/test_torchao.py
Normal file
400
tests/quantization/test_torchao.py
Normal file
@@ -0,0 +1,400 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import importlib.metadata
|
||||
import importlib.util
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
# Default dtypes exercised by the torchao tests in this module.
DTYPE = ["bfloat16"]

# torchao is an optional dependency; tests below skip when it is absent.
TORCHAO_AVAILABLE = importlib.util.find_spec("torchao") is not None
|
||||
|
||||
|
||||
@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
def test_pre_quantized_model(vllm_runner):
    """Smoke-test loading and generating with a pre-quantized fp8 checkpoint."""
    with vllm_runner(
        "drisspg/fp8-opt-125m",
        quantization="torchao",
        dtype="bfloat16",
        enforce_eager=True,
    ) as llm:
        generated = llm.generate_greedy(["The capital of France is"], max_tokens=4)
        assert generated
|
||||
|
||||
|
||||
@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
@pytest.mark.parametrize(
    "pt_load_map_location",
    [
        "cuda:0",
        # {"": "cuda"},
    ],
)
def test_opt_125m_int8wo_model_loading_with_params(vllm_runner, pt_load_map_location):
    """Load an int8 weight-only partially-quantized OPT model and generate."""
    torch._dynamo.reset()
    model_name = "jerryzh168/opt-125m-int8wo-partial-quant"
    with vllm_runner(
        model_name=model_name,
        quantization="torchao",
        dtype="bfloat16",
        pt_load_map_location=pt_load_map_location,
        enforce_eager=True,
    ) as llm:
        generated = llm.generate_greedy(["The capital of France is"], max_tokens=4)

        assert generated
|
||||
|
||||
|
||||
@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
def test_opt_125m_int4wo_model_per_module_quant(vllm_runner):
    """Load an int4 weight-only, per-module-quantized OPT model and generate."""
    torch._dynamo.reset()
    model_name = "jerryzh168/opt-125m-int4wo-per-module"
    with vllm_runner(
        model_name=model_name,
        quantization="torchao",
        dtype="bfloat16",
        pt_load_map_location="cuda:0",
        enforce_eager=True,
    ) as llm:
        generated = llm.generate_greedy(["The capital of France is"], max_tokens=4)

        assert generated
|
||||
|
||||
|
||||
@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
def test_qwenvl_int8wo_model_loading_with_params(vllm_runner):
    """Load an int8 weight-only Qwen2.5-VL checkpoint and generate."""
    torch._dynamo.reset()
    model_name = "mobicham/Qwen2.5-VL-3B-Instruct_int8wo_ao"
    with vllm_runner(
        model_name=model_name,
        quantization="torchao",
        dtype="bfloat16",
        pt_load_map_location="cuda:0",
        enforce_eager=True,
    ) as llm:
        generated = llm.generate_greedy(["The capital of France is"], max_tokens=4)

        assert generated
|
||||
|
||||
|
||||
@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
@pytest.mark.skip(
    reason="since torchao nightly is only compatible with torch nightly"
    "currently https://github.com/pytorch/ao/issues/2919, we'll have to skip "
    "torchao tests that requires newer versions (0.14.0.dev+) for now"
)
def test_opt_125m_awq_int4wo_model_loading_with_params(vllm_runner):
    """Load an AWQ int4 weight-only OPT checkpoint and generate."""
    torch._dynamo.reset()
    model_name = "torchao-testing/opt-125m-AWQConfig-Int4WeightOnlyConfig-v2-0.14.0.dev"
    with vllm_runner(
        model_name=model_name,
        quantization="torchao",
        dtype="bfloat16",
        pt_load_map_location="cuda:0",
    ) as llm:
        generated = llm.generate_greedy(["The capital of France is"], max_tokens=4)

        assert generated
|
||||
|
||||
|
||||
@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
def test_online_quant_config_dict_json(vllm_runner):
    """Testing on the fly quantization, load_weights integration point,
    with config dict serialized to json string
    """
    torch._dynamo.reset()
    model_name = "facebook/opt-125m"

    import json

    from torchao.core.config import config_to_dict
    from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, PerRow

    # Serialize a row-wise fp8 config into the HF override field vLLM reads.
    quant_cfg = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
    hf_overrides = {
        "quantization_config_dict_json": json.dumps(config_to_dict(quant_cfg))
    }
    with vllm_runner(
        model_name=model_name,
        dtype="bfloat16",
        pt_load_map_location="cuda:0",
        quantization="torchao",
        hf_overrides=hf_overrides,
        enforce_eager=True,
    ) as llm:
        generated = llm.generate_greedy(["The capital of France is"], max_tokens=4)

        assert generated
|
||||
|
||||
|
||||
@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
def test_online_quant_config_file(vllm_runner):
    """Testing on the fly quantization, load_weights integration point,
    with config file.

    Fixes: the NamedTemporaryFile was created with delete=False and never
    removed (leaked on every run), and f.close() inside the `with` block
    was redundant — the context manager already closes the file.
    """
    torch._dynamo.reset()
    model_name = "facebook/opt-125m"
    import json
    import os
    from tempfile import NamedTemporaryFile

    from torchao.core.config import config_to_dict
    from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, PerRow

    config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())

    # delete=False so the file persists after the `with` block and can be
    # read by path when the engine starts; removed explicitly below.
    with NamedTemporaryFile(mode="w", delete=False) as f:
        f.write(json.dumps(config_to_dict(config)))
        config_file_name = str(f.name)

    try:
        hf_overrides = {"quantization_config_file": config_file_name}
        with vllm_runner(
            model_name=model_name,
            dtype="bfloat16",
            pt_load_map_location="cuda:0",
            quantization="torchao",
            hf_overrides=hf_overrides,
            enforce_eager=True,
        ) as llm:
            output = llm.generate_greedy(["The capital of France is"], max_tokens=4)

            assert output
    finally:
        # Don't leak the temp config file.
        os.remove(config_file_name)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
def test_reload_weights():
    """Start with dummy weights under online torchao quantization, then
    reload real weights in place and check generation still runs."""
    import json

    from torchao.core.config import config_to_dict
    from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, PerRow

    from vllm import LLM, SamplingParams

    quant_cfg = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())

    hf_overrides = {
        "quantization_config_dict_json": json.dumps(config_to_dict(quant_cfg))
    }

    llm = LLM(
        model="Qwen/Qwen3-0.6B",
        dtype="bfloat16",
        load_format="dummy",
        enforce_eager=True,
        quantization="torchao",
        hf_overrides=hf_overrides,
    )
    # Update load format from `dummy` to `auto`
    llm.collective_rpc(
        "update_config", args=({"load_config": {"load_format": "auto"}},)
    )
    # Now reload real weights inplace
    llm.collective_rpc("reload_weights")

    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0, top_p=0.95)
    outputs = llm.generate(prompts, sampling_params)

    # make sure it runs
    for output in outputs:
        generated_text = output.outputs[0].text
        assert generated_text
        # can also uncomment locally to make sure the generated
        # output makes sense
        # prompt = output.prompt
        # print(f"Prompt: {prompt!r}")
        # print(f"Output: {generated_text!r}")
        # print("-" * 60)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
@pytest.mark.skip(
    reason="since torchao nightly is only compatible with torch nightly"
    "currently https://github.com/pytorch/ao/issues/2919, we'll have to skip "
    "torchao tests that requires newer versions (0.15.0.dev+) for now"
)
def test_safetensors_model_loading_with_params(vllm_runner):
    """Exercise safetensors loading with file sharding for a torchao model."""
    torch._dynamo.reset()
    # using this model to test safetensors loading with file sharding
    model_name = "torchao-testing/Qwen3-8B-INT4-0.15.0dev-safetensors"
    with vllm_runner(model_name=model_name, dtype="bfloat16") as llm:
        generated = llm.generate_greedy(["The capital of France is"], max_tokens=4)

        assert generated
|
||||
|
||||
|
||||
@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
@pytest.mark.skip(
    reason="since torchao nightly is only compatible with torch nightly"
    "currently https://github.com/pytorch/ao/issues/2919, we'll have to skip "
    "torchao tests that requires newer versions (0.14.0.dev+) for now"
)
def test_opt_125m_module_fqn_to_config_regex_model(vllm_runner):
    """Load a checkpoint quantized via regex-based ModuleFqnToConfig."""
    torch._dynamo.reset()
    model_name = "torchao-testing/opt-125m-ModuleFqnToConfig-v1-regex-0.14.0.dev"
    with vllm_runner(
        model_name=model_name, dtype="bfloat16", pt_load_map_location="cuda:0"
    ) as llm:
        generated = llm.generate_greedy(["The capital of France is"], max_tokens=4)

        assert generated
|
||||
|
||||
|
||||
@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
@pytest.mark.skip(
    reason="since torchao nightly is only compatible with torch nightly"
    "currently https://github.com/pytorch/ao/issues/2919, we'll have to skip "
    "torchao tests that requires newer versions (0.14.0.dev+) for now"
)
def test_opt_125m_int4wo_model_running_preshuffled_kernel(vllm_runner, monkeypatch):
    """We load a model with Int4Tensor (plain format) linear weights
    and verify that the weight is updated to Int4PreshuffledTensor
    after loading in vllm
    """
    from torchao.quantization import Int4PreshuffledTensor
    from torchao.utils import _is_fbgemm_gpu_genai_available, is_sm_at_least_90

    torch._dynamo.reset()
    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
    model_name = "torchao-testing/opt-125m-Int4WeightOnlyConfig-v2-0.14.0.dev"
    # Note: using enforce_eager=True because the `bf16i4bf16_shuffled` doesn't
    # have meta kernel implemented yet, can remove this flag after that is implemented
    with vllm_runner(
        model_name=model_name,
        quantization="torchao",
        dtype="bfloat16",
        pt_load_map_location="cuda:0",
        enforce_eager=True,
    ) as llm:

        def has_int4_preshuffled_tensor_weight(model):
            return isinstance(
                model.model.decoder.layers[0].self_attn.qkv_proj.weight,
                Int4PreshuffledTensor,
            )

        def get_weight_attrs(model):
            weight = model.model.decoder.layers[0].self_attn.qkv_proj.weight
            return [
                weight.requires_grad,
                weight.input_dim,
                weight.output_dim,
                hasattr(weight, "weight_loader"),
            ]

        engine = llm.get_llm().llm_engine
        has_preshuffled = any(
            engine.apply_model(has_int4_preshuffled_tensor_weight)
        )
        attrs = engine.apply_model(get_weight_attrs)[0]

        # Int4PreshuffledTensor is only used when fbgemm_gpu_genai is
        # installed on an SM90+ GPU; otherwise plain Int4Tensor remains.
        if _is_fbgemm_gpu_genai_available() and is_sm_at_least_90():
            assert has_preshuffled
        else:
            assert not has_preshuffled

        assert attrs == [False, 1, 0, True]
        generated = llm.generate_greedy(["The capital of France is"], max_tokens=32)

        assert generated
|
||||
|
||||
|
||||
@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
@pytest.mark.skip(
    reason="since torchao nightly is only compatible with torch nightly"
    "currently https://github.com/pytorch/ao/issues/2919, we'll have to skip "
    "torchao tests that requires newer versions (0.14.0.dev+) for now"
)
def test_opt_125m_int4wo_model_running_preshuffled_kernel_online_quant(
    vllm_runner, monkeypatch
):
    """We load a bf16 model and online quantize the model to int4, then verify that
    the weights are updated to Int4PreshuffledTensor after online quantization
    """
    from torchao.quantization import Int4PreshuffledTensor
    from torchao.utils import _is_fbgemm_gpu_genai_available, is_sm_at_least_90

    torch._dynamo.reset()
    model_name = "facebook/opt-125m"

    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")

    import json

    from torchao.core.config import config_to_dict
    from torchao.quantization import Int4WeightOnlyConfig

    # Request plain int4 packing; vLLM may upgrade it to preshuffled.
    quant_cfg = Int4WeightOnlyConfig(group_size=128, int4_packing_format="plain")
    hf_overrides = {
        "quantization_config_dict_json": json.dumps(config_to_dict(quant_cfg))
    }

    # Note: using enforce_eager=True because the `bf16i4bf16_shuffled` doesn't
    # have meta kernel implemented yet, can remove this flag after that is implemented
    with vllm_runner(
        model_name=model_name,
        quantization="torchao",
        dtype="bfloat16",
        pt_load_map_location="cuda:0",
        hf_overrides=hf_overrides,
        enforce_eager=True,
    ) as llm:

        def has_int4_preshuffled_tensor_weight(model):
            return isinstance(
                model.model.decoder.layers[0].self_attn.qkv_proj.weight,
                Int4PreshuffledTensor,
            )

        def get_weight_attrs(model):
            weight = model.model.decoder.layers[0].self_attn.qkv_proj.weight
            return [
                weight.requires_grad,
                weight.input_dim,
                weight.output_dim,
                hasattr(weight, "weight_loader"),
            ]

        engine = llm.get_llm().llm_engine
        has_preshuffled = any(
            engine.apply_model(has_int4_preshuffled_tensor_weight)
        )
        attrs = engine.apply_model(get_weight_attrs)[0]

        # Int4PreshuffledTensor is only used when fbgemm_gpu_genai is
        # installed on an SM90+ GPU; otherwise plain Int4Tensor remains.
        if _is_fbgemm_gpu_genai_available() and is_sm_at_least_90():
            assert has_preshuffled
        else:
            assert not has_preshuffled

        assert attrs == [False, 1, 0, True]
        generated = llm.generate_greedy(["The capital of France is"], max_tokens=4)

        assert generated
|
||||
|
||||
|
||||
# Allow running this module directly as a script.
if __name__ == "__main__":
    pytest.main([__file__])
|
||||
18
tests/quantization/utils.py
Normal file
18
tests/quantization/utils.py
Normal file
@@ -0,0 +1,18 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from vllm.model_executor.layers.quantization import get_quantization_config
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
def is_quant_method_supported(quant_method: str) -> bool:
    """Return whether `quant_method` can run on the current device."""
    # Currently, all quantization methods require Nvidia or AMD GPUs.
    if not (current_platform.is_cuda() or current_platform.is_rocm()):
        return False

    capability = current_platform.get_device_capability()
    assert capability is not None

    required = get_quantization_config(quant_method).get_min_capability()
    return capability.to_int() >= required
|
||||
Reference in New Issue
Block a user