First commit
This commit is contained in:
@@ -0,0 +1,3 @@
|
||||
from .layer_utils import replace_parameter, update_tensor_inplace
|
||||
|
||||
__all__ = ['update_tensor_inplace', 'replace_parameter']
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
37
vllm/model_executor/layers/quantization/utils/layer_utils.py
Normal file
37
vllm/model_executor/layers/quantization/utils/layer_utils.py
Normal file
@@ -0,0 +1,37 @@
|
||||
from typing import Union
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
def update_tensor_inplace(dst: torch.Tensor, src: torch.Tensor):
    """Make ``dst`` take on the shape, stride and values of ``src`` in place.

    ``dst`` is first re-strided to ``src``'s shape and stride; the values are
    then copied over unless both tensors already report the same data pointer.

    Raises:
        AssertionError: if ``dst`` and ``src`` have different dtypes.
    """
    assert dst.dtype == src.dtype, "Tensors must have the same dtype"

    # Adopt the source geometry first so the copy below is shape-compatible.
    dst.as_strided_(src.shape, src.stride())

    shares_memory = dst.data_ptr() == src.data_ptr()
    if not shares_memory:
        # Different underlying memory: move the values across.
        dst.copy_(src)
        del src
|
||||
|
||||
|
||||
# Newly generated tensors need to replace existing tensors that are
|
||||
# already registered as parameters by vLLM (and won't be freed)
|
||||
def replace_parameter(mod: torch.nn.Module, name: str,
                      new: Union[torch.Tensor, torch.nn.Parameter]) -> None:
    """Replace the attribute ``name`` on ``mod`` with the tensor ``new``.

    When the existing attribute has exactly the same Python type, the same
    dtype and the same storage size in bytes as ``new``, it is updated in
    place via ``update_tensor_inplace``. Otherwise ``new`` is re-registered
    on the module as a non-trainable ``torch.nn.Parameter``.
    """
    current = getattr(mod, name)

    reusable = (type(current) is type(new) and current.dtype == new.dtype
                and current.untyped_storage().nbytes()
                == new.untyped_storage().nbytes())
    if reusable:
        # In-place update avoids re-registering; it can be faster when the
        # underlying storage is the same.
        update_tensor_inplace(current, new)
        return

    # Fallback: re-register. Plain tensors are converted to a Parameter
    # first, and the result is wrapped once more so that Parameter subclasses
    # also end up re-registered as parameters (`torch.compile` compatibility).
    if not isinstance(new, torch.nn.Parameter):
        new = torch.nn.Parameter(new, requires_grad=False)
    rewrapped = torch.nn.Parameter(new, requires_grad=False)
    mod.register_parameter(name, rewrapped)
|
||||
@@ -0,0 +1,30 @@
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.scalar_type import ScalarType, scalar_types
|
||||
|
||||
MACHETE_SUPPORTED_GROUP_SIZES = [-1, 128]
|
||||
MACHETE_PREPACKED_BLOCK_SHAPE = [64, 128]
|
||||
|
||||
|
||||
def query_machete_supported_quant_types(zero_points: bool) -> List[ScalarType]:
    """Return the weight quantization types accepted for Machete.

    Args:
        zero_points: selects which of the two lists is returned; when true
            the plain unsigned types are returned, otherwise the biased
            ``uint4b8`` / ``uint8b128`` types.
    """
    if not zero_points:
        return [scalar_types.uint4b8, scalar_types.uint8b128]
    return [scalar_types.uint4, scalar_types.uint8]
|
||||
|
||||
|
||||
def query_machete_supported_act_types(zero_points: bool) -> List[torch.dtype]:
    """Return the activation dtypes accepted for Machete.

    Args:
        zero_points: accepted for signature symmetry with
            ``query_machete_supported_quant_types``; the result does not
            depend on it.

    Returns:
        ``[torch.float16, torch.bfloat16]``. These are ``torch.dtype``
        objects, which is what the return annotation now states.
    """
    return [torch.float16, torch.bfloat16]
|
||||
|
||||
|
||||
def check_machete_supports_shape(in_features: int, out_featrues: int) \
    -> Tuple[bool, Optional[str]]:
    """Check both feature sizes against ``MACHETE_PREPACKED_BLOCK_SHAPE``.

    Returns:
        ``(True, None)`` when the input size is a multiple of the first block
        dimension and the output size is a multiple of the second; otherwise
        ``(False, reason)`` for the first check that fails.
    """
    checks = (
        ("Input", in_features, 0),
        ("Output", out_featrues, 1),
    )
    for label, size, axis in checks:
        block = MACHETE_PREPACKED_BLOCK_SHAPE[axis]
        if size % block != 0:
            return False, f"{label} features size must be divisible by {block}"
    return True, None
|
||||
348
vllm/model_executor/layers/quantization/utils/marlin_utils.py
Normal file
348
vllm/model_executor/layers/quantization/utils/marlin_utils.py
Normal file
@@ -0,0 +1,348 @@
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
import numpy
|
||||
import torch
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.scalar_type import ScalarType, scalar_types
|
||||
|
||||
from .quant_utils import pack_cols, unpack_cols
|
||||
|
||||
GPTQ_MARLIN_TILE = 16
|
||||
GPTQ_MARLIN_MIN_THREAD_N = 64
|
||||
GPTQ_MARLIN_MIN_THREAD_K = 128
|
||||
GPTQ_MARLIN_MAX_PARALLEL = 16
|
||||
|
||||
MARLIN_SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]
|
||||
|
||||
# In case there is a performance issue with Marlin, the variable below can be
|
||||
# changed to False, which allows Marlin to perform global reductions in fp16
|
||||
# precision (instead of fp32), and therefore, save on some memory movements.
|
||||
USE_FP32_REDUCE_DEFAULT = True
|
||||
|
||||
|
||||
# For binary size and compile time, we don't support the same types for with and
|
||||
# without runtime zero-point. We support common cases, i.e. AWQ and GPTQ.
|
||||
# TODO: we may want to move this into the C++ so it's closer to the actual impl
|
||||
def query_marlin_supported_quant_types(has_zp: bool,
                                       device_capability: Optional[int] = None
                                       ):
    """Return the weight quantization types Marlin accepts.

    Args:
        has_zp: whether runtime zero-points are used.
        device_capability: integer device capability; when ``None`` it is
            taken from ``current_platform`` (``-1`` if that reports ``None``).

    Returns:
        An empty list when the capability is below 80, otherwise the list of
        types matching ``has_zp``.
    """
    if device_capability is None:
        cap = current_platform.get_device_capability()
        device_capability = cap.to_int() if cap is not None else -1

    if device_capability < 80:
        return []

    if not has_zp:
        # GPTQ style, unsigned + symmetric bias
        # TODO: once fp8_marlin is merged into "gptq_marlin" we should be able
        #  to add `scalar_types.float8_e4m3fn` here
        return [scalar_types.uint4b8, scalar_types.uint8b128]

    # AWQ style, unsigned + runtime zero-point
    return [scalar_types.uint4, scalar_types.uint8]
|
||||
|
||||
|
||||
def _check_marlin_supported(
        quant_type: ScalarType,
        group_size: Optional[int],
        has_zp: bool,
        device_capability: Optional[int] = None) -> Tuple[bool, Optional[str]]:
    """Check a quantization configuration against what Marlin supports.

    Returns:
        ``(True, None)`` when both the quant type and the group size are
        supported, otherwise ``(False, reason)``. The quant type is checked
        first.
    """
    if device_capability is None:
        cap = current_platform.get_device_capability()
        device_capability = cap.to_int() if cap is not None else -1

    supported_types = query_marlin_supported_quant_types(
        has_zp, device_capability)

    if quant_type not in supported_types:
        reason = (f"Marlin does not support weight_bits = {quant_type}. "
                  f"Only types = {supported_types} "
                  f"are supported (for group_size = {group_size}, "
                  f"device_capability = {device_capability}, zp = {has_zp}).")
        return (False, reason)

    group_ok = (group_size is not None
                and group_size in MARLIN_SUPPORTED_GROUP_SIZES)
    if not group_ok:
        reason = (f"Marlin does not support group_size = {group_size}. "
                  f"Only group_sizes = {MARLIN_SUPPORTED_GROUP_SIZES} "
                  "are supported.")
        return (False, reason)

    return True, None
|
||||
|
||||
|
||||
def check_marlin_supported(quant_type: ScalarType,
                           group_size: int,
                           has_zp: bool = False,
                           device_capability: Optional[int] = None) -> bool:
    """Return whether Marlin supports this configuration.

    Boolean-only wrapper around ``_check_marlin_supported``; the failure
    reason is discarded.
    """
    supported = _check_marlin_supported(quant_type, group_size, has_zp,
                                        device_capability)[0]
    return supported
|
||||
|
||||
|
||||
def verify_marlin_supported(quant_type: ScalarType,
                            group_size: int,
                            has_zp: bool = False) -> None:
    """Raise if Marlin does not support this configuration.

    Raises:
        ValueError: carrying the reason from ``_check_marlin_supported`` when
            the configuration is rejected.
    """
    supported, reason = _check_marlin_supported(quant_type, group_size, has_zp)
    if supported:
        return
    assert reason is not None
    raise ValueError(reason)
|
||||
|
||||
|
||||
def verify_marlin_supports_shape(output_size_per_partition: int,
                                 input_size_per_partition: int,
                                 input_size: int, group_size: int) -> None:
    """Validate per-partition weight sizes against Marlin's constraints.

    Checks, in order:
      1. ``output_size_per_partition`` is a multiple of
         ``GPTQ_MARLIN_MIN_THREAD_N``.
      2. ``input_size_per_partition`` is a multiple of
         ``GPTQ_MARLIN_MIN_THREAD_K``.
      3. When ``group_size < input_size``, ``input_size_per_partition`` is a
         multiple of ``group_size``. (``group_size == -1`` always passes this
         check, since any integer modulo -1 is 0.)

    Raises:
        ValueError: describing the first constraint that is violated.
    """
    # Validate output_size_per_partition
    if output_size_per_partition % GPTQ_MARLIN_MIN_THREAD_N != 0:
        raise ValueError("Weight output_size_per_partition = "
                         f"{output_size_per_partition} is not divisible by "
                         f"min_thread_n = {GPTQ_MARLIN_MIN_THREAD_N}. "
                         "Consider reducing tensor_parallel_size or running "
                         "with --quantization gptq.")

    # Validate input_size_per_partition
    if input_size_per_partition % GPTQ_MARLIN_MIN_THREAD_K != 0:
        raise ValueError("Weight input_size_per_partition = "
                         f"{input_size_per_partition} is not divisible "
                         f"by min_thread_k = {GPTQ_MARLIN_MIN_THREAD_K}. "
                         "Consider reducing tensor_parallel_size or running "
                         "with --quantization gptq.")

    if (group_size < input_size
            and input_size_per_partition % group_size != 0):
        # Note the trailing space after the period: without it the two
        # sentences of the message run together.
        raise ValueError(
            f"Weight input_size_per_partition = {input_size_per_partition}"
            f" is not divisible by group_size = {group_size}. "
            "Consider reducing tensor_parallel_size or running "
            "with --quantization gptq.")
|
||||
|
||||
|
||||
def check_marlin_supports_shape(output_size_per_partition: int,
                                input_size_per_partition: int,
                                input_size: int, group_size: int) \
                                    -> Tuple[bool, Optional[str]]:
    """Non-raising variant of ``verify_marlin_supports_shape``.

    Returns:
        ``(True, None)`` when verification passes, otherwise
        ``(False, message)`` with the text of the ``ValueError`` raised.
    """
    try:
        verify_marlin_supports_shape(output_size_per_partition,
                                     input_size_per_partition, input_size,
                                     group_size)
    except ValueError as err:
        return False, str(err)
    else:
        return True, None
|
||||
|
||||
|
||||
def marlin_make_workspace(output_size_per_partition: int,
                          device: torch.device) -> torch.Tensor:
    """Allocate a zero-filled ``torch.int`` workspace tensor on ``device``.

    Its length is ``(output_size_per_partition // GPTQ_MARLIN_MIN_THREAD_N)
    * GPTQ_MARLIN_MAX_PARALLEL``.
    """
    n_blocks = output_size_per_partition // GPTQ_MARLIN_MIN_THREAD_N
    workspace_len = n_blocks * GPTQ_MARLIN_MAX_PARALLEL

    return torch.zeros(workspace_len,
                       dtype=torch.int,
                       device=device,
                       requires_grad=False)
|
||||
|
||||
|
||||
def marlin_is_k_full(act_order: bool, is_row_parallel: bool) -> bool:
    """Return ``False`` only when both ``act_order`` and ``is_row_parallel``
    are set; ``True`` in every other case."""
    return not (act_order and is_row_parallel)
|
||||
|
||||
|
||||
def marlin_repeat_scales_on_all_ranks(act_order: bool, group_size: int,
                                      is_row_parallel: bool) -> bool:
    """Decide whether scales must be repeated on every rank.

    That is the case with activation ordering, or when the quantization is
    channelwise (``group_size == -1``) on a row-parallel layer.
    """
    channelwise_row_parallel = group_size == -1 and is_row_parallel
    return act_order or channelwise_row_parallel
|
||||
|
||||
|
||||
def marlin_make_empty_g_idx(device: torch.device) -> torch.Tensor:
    """Return a zero-length, non-trainable ``torch.int`` Parameter on
    ``device``, used as an empty ``g_idx``."""
    empty = torch.empty(0, dtype=torch.int, device=device)
    return torch.nn.Parameter(empty, requires_grad=False)
|
||||
|
||||
|
||||
def marlin_make_empty_zp(device: torch.device) -> torch.Tensor:
    """Return a zero-length, non-trainable ``torch.int`` Parameter on
    ``device``, used as an empty zero-point tensor."""
    empty = torch.empty(0, dtype=torch.int, device=device)
    return torch.nn.Parameter(empty, requires_grad=False)
|
||||
|
||||
|
||||
def marlin_sort_g_idx(
        g_idx: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    """Sort ``g_idx`` in ascending order.

    Returns:
        ``(sorted_g_idx, sort_indices)`` where ``sort_indices`` is the
        ``argsort`` of ``g_idx`` cast to ``torch.int``.
    """
    order = torch.argsort(g_idx).to(torch.int)
    return g_idx[order], order
|
||||
|
||||
|
||||
def get_scale_perms():
    """Build the two index permutations used to reorder Marlin scales.

    Returns:
        ``(scale_perm, scale_perm_single)``: a 64-entry and a 32-entry list
        of integer indices.
    """
    scale_perm: List[int] = [i + 8 * j for i in range(8) for j in range(8)]
    single_offsets = (0, 1, 8, 9, 16, 17, 24, 25)
    scale_perm_single: List[int] = [
        2 * i + j for i in range(4) for j in single_offsets
    ]
    return scale_perm, scale_perm_single
|
||||
|
||||
|
||||
def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int,
                          group_size: int) -> torch.Tensor:
    """Reorder the columns of the scales tensor ``s`` for Marlin.

    The full permutation from ``get_scale_perms`` is used when the scales are
    grouped (``group_size`` smaller than ``size_k`` and not ``-1``); otherwise
    the "single" permutation is used. The result is reshaped to
    ``(-1, size_n)`` and made contiguous.
    """
    scale_perm, scale_perm_single = get_scale_perms()

    is_grouped = group_size < size_k and group_size != -1
    chosen_perm = scale_perm if is_grouped else scale_perm_single

    permuted = s.reshape((-1, len(chosen_perm)))[:, chosen_perm]
    return permuted.reshape((-1, size_n)).contiguous()
|
||||
|
||||
|
||||
def marlin_moe_permute_scales(
    s: torch.Tensor,
    size_k: int,
    size_n: int,
    group_size: int,
):
    """Apply ``marlin_permute_scales`` to each slice along dim 0 of ``s``.

    The output is a fresh tensor with the first three dimensions, device and
    dtype of ``s``; the leading dimension is treated as the expert index.
    """
    expert_count = s.shape[0]
    permuted = torch.empty(
        (expert_count, s.shape[1], s.shape[2]),
        device=s.device,
        dtype=s.dtype,
    )

    for expert in range(expert_count):
        permuted[expert] = marlin_permute_scales(s[expert], size_k, size_n,
                                                 group_size)
    return permuted
|
||||
|
||||
|
||||
def marlin_zero_points(zp: torch.Tensor, size_k: int, size_n: int,
                       num_bits: int) -> torch.Tensor:
    """Permute, interleave and pack zero-points into Marlin's layout.

    Raises:
        Exception: if ``num_bits`` is neither 4 nor 8.
    """
    # Zero-points are permuted like scales, but never with the "single"
    # permutation, since zero-points are applied on every MMA.
    full_perm, _ = get_scale_perms()
    zp = zp.reshape((-1, len(full_perm)))[:, full_perm]

    # Column interleave (for the dequantize code), chosen by bit width.
    if num_bits == 4:
        col_order = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])
    elif num_bits == 8:
        col_order = numpy.array([0, 2, 1, 3])
    else:
        raise Exception("num_bits must be 4 or 8, got {}".format(num_bits))

    interleaved = zp.reshape((-1, len(col_order)))[:, col_order].ravel()
    as_matrix = interleaved.reshape((-1, size_n)).contiguous()

    # Pack via the pack_cols helper.
    return pack_cols(as_matrix, num_bits, size_k, size_n)
|
||||
|
||||
|
||||
def awq_to_marlin_zero_points(q_zp_packed: torch.Tensor, size_k: int,
                              size_n: int, num_bits: int) -> torch.Tensor:
    """Convert packed AWQ zero-points to Marlin's packed layout.

    AWQ zero-points are quantized and packed on the column dim, and the
    values are permuted based on the dequantizer. Both are undone here before
    the Marlin permutation and packing are applied.

    Raises:
        Exception: if ``num_bits`` is neither 4 nor 8.
    """
    unpacked = unpack_cols(q_zp_packed, num_bits, size_k, size_n)

    # argsort of the interleave order gives its inverse permutation.
    if num_bits == 4:
        inverse_order = numpy.argsort(numpy.array([0, 2, 4, 6, 1, 3, 5, 7]))
    elif num_bits == 8:
        inverse_order = numpy.argsort(numpy.array([0, 2, 1, 3]))
    else:
        raise Exception("num_bits must be 4 or 8, got {}".format(num_bits))

    restored = unpacked.reshape(
        (-1, len(inverse_order)))[:, inverse_order].ravel()
    restored = restored.reshape((-1, size_n)).contiguous()

    return marlin_zero_points(restored, size_k, size_n, num_bits)
|
||||
|
||||
|
||||
def moe_awq_to_marlin_zero_points(q_zp_packed: torch.Tensor, size_k: int,
                                  size_n: int, num_bits: int):
    """Apply ``awq_to_marlin_zero_points`` to each slice along dim 0.

    The output is a fresh tensor with the first three dimensions, device and
    dtype of ``q_zp_packed``; the leading dimension is treated as the expert
    index.
    """
    expert_count = q_zp_packed.shape[0]
    converted = torch.empty(
        (expert_count, q_zp_packed.shape[1], q_zp_packed.shape[2]),
        device=q_zp_packed.device,
        dtype=q_zp_packed.dtype,
    )
    for expert in range(expert_count):
        converted[expert] = awq_to_marlin_zero_points(q_zp_packed[expert],
                                                      size_k, size_n,
                                                      num_bits)
    return converted
|
||||
|
||||
|
||||
def apply_gptq_marlin_linear(
        input: torch.Tensor,
        weight: torch.Tensor,
        weight_scale: torch.Tensor,
        weight_zp: torch.Tensor,
        g_idx: torch.Tensor,
        g_idx_sort_indices: torch.Tensor,
        workspace: torch.Tensor,
        wtype: ScalarType,
        output_size_per_partition: int,
        input_size_per_partition: int,
        is_k_full: bool,
        bias: Optional[torch.Tensor] = None,
        use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
    """Run a linear layer through ``ops.gptq_marlin_gemm`` (``has_zp=False``).

    ``input`` is flattened to 2-D over its last dimension for the GEMM. The
    optional ``bias`` is added in place to the GEMM output, and the result is
    reshaped to ``input.shape[:-1] + (output_size_per_partition,)``.
    """
    x_2d = input.reshape(-1, input.shape[-1])
    final_shape = input.shape[:-1] + (output_size_per_partition, )

    result = ops.gptq_marlin_gemm(x_2d,
                                  weight,
                                  weight_scale,
                                  weight_zp,
                                  g_idx,
                                  g_idx_sort_indices,
                                  workspace,
                                  wtype,
                                  size_m=x_2d.shape[0],
                                  size_n=output_size_per_partition,
                                  size_k=input_size_per_partition,
                                  is_k_full=is_k_full,
                                  has_zp=False,
                                  use_fp32_reduce=use_fp32_reduce)

    if bias is not None:
        result.add_(bias)  # In-place add

    return result.reshape(final_shape)
|
||||
|
||||
|
||||
def apply_awq_marlin_linear(
        input: torch.Tensor,
        weight: torch.Tensor,
        weight_scale: torch.Tensor,
        weight_zp: torch.Tensor,
        g_idx: torch.Tensor,
        g_idx_sort_indices: torch.Tensor,
        workspace: torch.Tensor,
        quant_type: ScalarType,
        output_size_per_partition: int,
        input_size_per_partition: int,
        bias: Optional[torch.Tensor] = None,
        use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor:
    """Run a linear layer through ``ops.gptq_marlin_gemm`` with zero-points.

    The GEMM is always called with ``is_k_full=True`` and ``has_zp=True``.
    ``input`` is flattened to 2-D over its last dimension, the optional
    ``bias`` is added in place, and the result is reshaped to
    ``input.shape[:-1] + (output_size_per_partition,)``.
    """
    x_2d = input.reshape(-1, input.shape[-1])
    final_shape = input.shape[:-1] + (output_size_per_partition, )

    result = ops.gptq_marlin_gemm(x_2d,
                                  weight,
                                  weight_scale,
                                  weight_zp,
                                  g_idx,
                                  g_idx_sort_indices,
                                  workspace,
                                  quant_type,
                                  size_m=x_2d.shape[0],
                                  size_n=output_size_per_partition,
                                  size_k=input_size_per_partition,
                                  is_k_full=True,
                                  has_zp=True,
                                  use_fp32_reduce=use_fp32_reduce)

    if bias is not None:
        result.add_(bias)  # In-place add

    return result.reshape(final_shape)
|
||||
@@ -0,0 +1,106 @@
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
|
||||
import vllm._custom_ops as ops
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import print_warning_once
|
||||
|
||||
from .marlin_utils import marlin_make_workspace, marlin_permute_scales
|
||||
|
||||
|
||||
def is_fp8_marlin_supported():
    """Return whether the current platform reports device capability 80."""
    has_required_capability = current_platform.has_device_capability(80)
    return has_required_capability
|
||||
|
||||
|
||||
def apply_fp8_marlin_linear(
    input: torch.Tensor,
    weight: torch.Tensor,
    weight_scale: torch.Tensor,
    workspace: torch.Tensor,
    size_n: int,
    size_k: int,
    bias: Optional[torch.Tensor],
) -> torch.Tensor:
    """Run a linear layer through ``ops.fp8_marlin_gemm`` (``num_bits=8``).

    For GPUs that lack FP8 hardware support, the Marlin kernel can be
    leveraged for fast weight-only FP8 quantization.

    ``input`` is flattened to 2-D over its last dimension, the optional
    ``bias`` is added in place, and the result is reshaped to
    ``input.shape[:-1] + (size_n,)``.
    """
    x_2d = input.reshape(-1, input.shape[-1])
    final_shape = input.shape[:-1] + (size_n, )

    result = ops.fp8_marlin_gemm(
        a=x_2d,
        b_q_weight=weight,
        b_scales=weight_scale,
        workspace=workspace,
        num_bits=8,
        size_m=x_2d.shape[0],
        size_n=size_n,
        size_k=size_k,
    )

    if bias is not None:
        result.add_(bias)  # In-place add

    return result.reshape(final_shape)
|
||||
|
||||
|
||||
def prepare_fp8_layer_for_marlin(layer: torch.nn.Module,
                                 strategy: str = "tensor") -> None:
    """Convert an FP8 layer in place to the format the Marlin kernel uses.

    Sets ``layer.workspace`` and replaces ``layer.weight`` and
    ``layer.weight_scale`` with repacked / permuted, non-trainable
    Parameters. Reads ``output_size_per_partition``,
    ``input_size_per_partition`` and ``orig_dtype`` from the layer.
    ``strategy`` is not used in this function body.
    """
    print_warning_once(
        "Your GPU does not have native support for FP8 computation but "
        "FP8 quantization is being used. Weight-only FP8 compression will "
        "be used leveraging the Marlin kernel. This may degrade "
        "performance for compute-heavy workloads.")

    size_n = layer.output_size_per_partition
    size_k = layer.input_size_per_partition
    device = layer.weight.device

    # WORKSPACE
    layer.workspace = marlin_make_workspace(size_n, device)

    # WEIGHT
    # Repack weights to marlin format; an empty permutation is passed.
    packed_weight = pack_fp8_to_int32(layer.weight)
    no_perm = torch.empty(0, dtype=torch.int, device=device)
    marlin_qweight = ops.gptq_marlin_repack(b_q_weight=packed_weight,
                                            perm=no_perm,
                                            size_k=size_k,
                                            size_n=size_n,
                                            num_bits=8)
    layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False)

    # WEIGHT SCALES
    # Cast to the layer's original dtype, then permute (group_size=-1).
    scales = layer.weight_scale.to(layer.orig_dtype)
    marlin_scales = marlin_permute_scales(s=scales,
                                          size_k=size_k,
                                          size_n=size_n,
                                          group_size=-1)
    layer.weight_scale = torch.nn.Parameter(marlin_scales, requires_grad=False)
|
||||
|
||||
|
||||
def pack_fp8_to_int32(fp8_tensor: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Repack FP8 weights to gptq format (packed int32 elements)
|
||||
"""
|
||||
assert fp8_tensor.dtype == torch.float8_e4m3fn
|
||||
assert fp8_tensor.shape[0] % 4 == 0
|
||||
|
||||
# Reshape to prepare for packing
|
||||
reshaped = fp8_tensor.reshape(-1, 4, *fp8_tensor.shape[1:])
|
||||
|
||||
# Convert fp8 to uint8 (byte) representation
|
||||
byte_tensor = reshaped.view(torch.uint8)
|
||||
|
||||
# Pack 4 uint8 values into one int32
|
||||
packed = (byte_tensor[:, 0].to(torch.int32) |
|
||||
(byte_tensor[:, 1].to(torch.int32) << 8) |
|
||||
(byte_tensor[:, 2].to(torch.int32) << 16) |
|
||||
(byte_tensor[:, 3].to(torch.int32) << 24))
|
||||
|
||||
return packed.view(fp8_tensor.shape[0] // 4,
|
||||
*fp8_tensor.shape[1:]).contiguous()
|
||||
@@ -0,0 +1,163 @@
|
||||
"""Utility functions used for tests and benchmarks"""
|
||||
|
||||
from typing import List, Optional
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from vllm.scalar_type import ScalarType
|
||||
|
||||
from .marlin_utils import (GPTQ_MARLIN_TILE, marlin_permute_scales,
|
||||
marlin_zero_points)
|
||||
from .quant_utils import (get_pack_factor, gptq_quantize_weights,
|
||||
quantize_weights, sort_weights)
|
||||
|
||||
|
||||
class MarlinWorkspace:
    """Holds a zero-initialised ``torch.int`` scratch tensor on "cuda"."""

    def __init__(self, out_features, min_thread_n, max_parallel):
        """Allocate ``(out_features // min_thread_n) * max_parallel`` ints.

        Raises:
            AssertionError: if ``out_features`` is not a multiple of
                ``min_thread_n``.
        """
        remainder = out_features % min_thread_n
        assert (remainder == 0), (
            "out_features = {} is undivisible by min_thread_n = {}".format(
                out_features, min_thread_n))

        n_blocks = out_features // min_thread_n
        scratch_len = n_blocks * max_parallel

        self.scratch = torch.zeros(scratch_len,
                                   dtype=torch.int,
                                   device="cuda")
|
||||
|
||||
|
||||
def marlin_permute_weights(q_w, size_k, size_n, perm, tile=GPTQ_MARLIN_TILE):
    """Rearrange a ``(size_k, size_n)`` weight matrix into Marlin tile order.

    The matrix is split into ``tile x tile`` blocks, the blocks are laid out
    row by row, and ``perm`` is then applied within each chunk of
    ``perm.numel()`` elements.

    Raises:
        AssertionError: if ``q_w`` is not ``(size_k, size_n)`` or either size
            is not a multiple of ``tile``.
    """
    assert q_w.shape == (size_k, size_n)
    assert size_k % tile == 0, f"size_k = {size_k}, tile = {tile}"
    # This message was previously labelled "size_k" although it reports size_n.
    assert size_n % tile == 0, f"size_n = {size_n}, tile = {tile}"

    # Permute weights to 16x64 marlin tiles
    q_w = q_w.reshape((size_k // tile, tile, size_n // tile, tile))
    q_w = q_w.permute((0, 2, 1, 3))
    q_w = q_w.reshape((size_k // tile, size_n * tile))

    # Apply the element permutation chunk by chunk, keeping the overall shape.
    q_w = q_w.reshape((-1, perm.numel()))[:, perm].reshape(q_w.shape)

    return q_w
|
||||
|
||||
|
||||
def marlin_weights(q_w, size_k, size_n, num_bits, perm):
    """Permute quantized weights into Marlin order and pack them into int32.

    Packing is done with numpy on the CPU; the packed tensor is returned on
    the device of the permuted weights.
    """
    # Permute
    permuted = marlin_permute_weights(q_w, size_k, size_n, perm)

    # Pack
    pack_factor = get_pack_factor(num_bits)
    target_device = permuted.device

    values = permuted.cpu().numpy().astype(np.uint32)

    packed = np.zeros((values.shape[0], values.shape[1] // pack_factor),
                      dtype=np.uint32)
    for slot in range(pack_factor):
        # Every pack_factor-th column goes into bit slot `slot`.
        packed |= values[:, slot::pack_factor] << num_bits * slot

    return torch.from_numpy(packed.astype(np.int32)).to(target_device)
|
||||
|
||||
|
||||
def get_weight_perm(num_bits: int):
    """Build the 1024-entry weight permutation used by ``marlin_weights``.

    Returns:
        A 1-D integer tensor of indices, interleaved according to
        ``num_bits``.

    Raises:
        Exception: if ``num_bits`` is neither 4 nor 8.
    """
    perm_list: List[int] = []
    for i in range(32):
        col, lane = divmod(i, 4)
        rows = (
            2 * lane,
            2 * lane + 1,
            2 * (lane + 4),
            2 * (lane + 4) + 1,
        )
        perm1 = [
            16 * row + col + 8 * block for block in (0, 1) for row in rows
        ]
        for j in range(4):
            perm_list.extend(p + 256 * j for p in perm1)

    perm = np.array(perm_list)

    if num_bits == 4:
        interleave = np.array([0, 2, 4, 6, 1, 3, 5, 7])
    elif num_bits == 8:
        interleave = np.array([0, 2, 1, 3])
    else:
        raise Exception("num_bits must be 4 or 8, got {}".format(num_bits))

    interleaved = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
    return torch.from_numpy(interleaved)
|
||||
|
||||
|
||||
def marlin_quantize(w: torch.Tensor,
                    quant_type: ScalarType,
                    group_size: int,
                    act_order: bool,
                    test_perm: Optional[torch.Tensor] = None):
    """Quantize ``w`` with ``gptq_quantize_weights`` and reformat for Marlin.

    Returns:
        ``[w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, rand_perm]``,
        every entry moved to ``w.device``.
    """
    size_k, size_n = w.shape
    num_bits = quant_type.size_bits

    # Normalize group_size: -1 means a single group spanning all of K.
    if group_size == -1:
        group_size = size_k
    assert group_size <= size_k

    # Quantize (and apply act_order if provided)
    w_ref, q_w, s, g_idx, rand_perm = gptq_quantize_weights(
        w, quant_type, group_size, act_order, test_perm)

    # For act_order, sort the "weights" and "g_idx" so that group ids are
    # increasing
    sort_indices = torch.empty(0, dtype=torch.int, device=w.device)
    if act_order:
        q_w, g_idx, sort_indices = sort_weights(q_w, g_idx)

    # Reformat to marlin
    weight_perm = get_weight_perm(num_bits)
    marlin_q_w = marlin_weights(q_w, size_k, size_n, num_bits, weight_perm)
    marlin_s = marlin_permute_scales(s, size_k, size_n, group_size)

    # Create result
    results = (w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, rand_perm)
    return [tensor.to(w.device) for tensor in results]
|
||||
|
||||
|
||||
def awq_marlin_quantize(w: torch.Tensor, quant_type: ScalarType,
                        group_size: int):
    """Quantize ``w`` with zero-points and reformat for Marlin.

    Returns:
        ``[w_ref, marlin_q_w, marlin_s, marlin_zp]``, every entry moved to
        ``w.device``.
    """
    size_k, size_n = w.shape
    num_bits = quant_type.size_bits

    # Normalize group_size: -1 means a single group spanning all of K.
    if group_size == -1:
        group_size = size_k
    assert group_size <= size_k

    # Detect num groups
    assert size_k % group_size == 0
    num_groups = size_k // group_size

    # Quantize with zp
    w_ref, q_w, s, zp = quantize_weights(w,
                                         quant_type,
                                         group_size,
                                         zero_points=True)

    # Reformat to marlin
    weight_perm = get_weight_perm(num_bits)
    marlin_q_w = marlin_weights(q_w, size_k, size_n, num_bits, weight_perm)
    marlin_s = marlin_permute_scales(s, size_k, size_n, group_size)
    marlin_zp = marlin_zero_points(zp, num_groups, size_n, num_bits)

    # Create result
    results = (w_ref, marlin_q_w, marlin_s, marlin_zp)
    return [tensor.to(w.device) for tensor in results]
|
||||
@@ -0,0 +1,463 @@
|
||||
"""Utility functions used for tests and benchmarks"""
|
||||
|
||||
import random
|
||||
from typing import List
|
||||
|
||||
import numpy
|
||||
import torch
|
||||
|
||||
from vllm.scalar_type import ScalarType
|
||||
|
||||
from .marlin_utils_test import marlin_weights
|
||||
from .quant_utils import gptq_quantize_weights
|
||||
|
||||
|
||||
# This is PyTorch implementation of main part of reorder_meta()
|
||||
# function, from tools/util/include/cutlass/util/host_reorder.h file
|
||||
# of CUTLASS source tree. Furthermore, CUTLASS template for sparse
|
||||
# GEMM decides upon layout of this matrix, and at the moment for the
|
||||
# sparse GEMM executed on tensor cores, this is layout described by
|
||||
# ColumnMajorInterleaved<2> data structure, in
|
||||
# include/cutlass/layout/matrix.h of CUTLASS source tree. The
|
||||
# reordering of meta matrix into meta_reordered matrix calculated
|
||||
# according to these segments of CUTLASS code is re-implemented here.
|
||||
# Note that this calculation produces offsets for scattering metadata
|
||||
# matrix elements into reordered metadata matrix elements (or,
|
||||
# equivalently, for gathering reordered metadata matrix element back
|
||||
# into metadata matrix elements).
|
||||
def _calculate_meta_reordering_scatter_offsets(m, meta_ncols, meta_dtype,
|
||||
device):
|
||||
dst_rows = torch.arange(0, m, device=device)[:, None].repeat(1, meta_ncols)
|
||||
dst_cols = torch.arange(0, meta_ncols, device=device).repeat(m, 1)
|
||||
|
||||
# Reorder the rows, then swizzle the 2x2 blocks.
|
||||
group_x = 64
|
||||
group_y = 32 if meta_dtype.itemsize == 2 else 16
|
||||
|
||||
dst_rows = (dst_rows // group_x * group_x + (dst_rows % 2) * 2 +
|
||||
(dst_rows % 8) // 4 + ((dst_rows % group_y) % 4) // 2 * 32 +
|
||||
((dst_rows % group_x) // 8) * 4)
|
||||
|
||||
topright = ((dst_rows % 2 == 0) & (dst_cols % 2 == 1)).to(torch.int8)
|
||||
bottomleft = ((dst_rows % 2 == 1) & (dst_cols % 2 == 0)).to(torch.int8)
|
||||
dst_rows += topright - bottomleft
|
||||
dst_cols -= topright - bottomleft
|
||||
|
||||
# Assumed that meta tensor is to be stored in CUTLASS
|
||||
# InterleavedColumnMajor layout, and reverse engineered
|
||||
# corresponding code to store values into this tensor.
|
||||
interleave = 2
|
||||
cols_maj = dst_cols // interleave
|
||||
cols_min = dst_cols % interleave
|
||||
return (cols_maj * m * interleave + dst_rows * interleave +
|
||||
cols_min).view(-1)
|
||||
|
||||
|
||||
# This function converts dense matrix into sparse semi-structured
|
||||
# representation, producing "compressed" matrix, in the layout used by
|
||||
# CUTLASS backend, and corresponding metadata matrix.
|
||||
def sparse_semi_structured_from_dense_cutlass(dense):
    """Compress a dense 2-D matrix into sparse semi-structured form.

    Returns:
        ``(sparse, meta)``: ``sparse`` has shape ``(m, k // 2)`` and ``meta``
        is the reordered metadata matrix of shape ``(m, meta_ncols)``.

    Raises:
        RuntimeError: if ``dense`` is not 2-D, has an unsupported dtype, or
            its row/column counts do not meet the divisibility requirements.
    """
    if dense.dim() != 2:
        raise RuntimeError(
            f"Expected 2-dimensional dense tensor, got {dense.dim()}-dimensional tensor"  # noqa: E501
        )

    m, k = dense.shape
    device = dense.device

    # The metadata element type is chosen from the dense dtype.
    if dense.dtype == torch.int8:
        meta_dtype = torch.int32
    elif dense.dtype in [torch.half, torch.bfloat16, torch.float, torch.int32]:
        meta_dtype = torch.int16
    else:
        raise RuntimeError(f"Invalid datatype {dense.dtype} of dense matrix")
    quadbits_per_meta_elem = meta_dtype.itemsize * 8 // 4
    if quadbits_per_meta_elem not in (4, 8):
        raise RuntimeError(
            "Invalid number of elements per meta element calculated")

    row_multiple = 16 if meta_dtype == torch.int32 else 32
    if m % row_multiple != 0:
        raise RuntimeError(
            f"Number of rows of dense matrix {m} must be divisible by {row_multiple}"  # noqa: E501
        )
    if k % (4 * quadbits_per_meta_elem) != 0:
        raise RuntimeError(
            f"Number of columns of dense matrix {k} must be divisible by {4 * quadbits_per_meta_elem}"  # noqa: E501
        )

    is_fp32 = dense.dtype == torch.float
    if not is_fp32:
        ksparse = 4
        dense_4 = dense.view(-1, k // ksparse, ksparse)
        m0, m1, m2, m3 = (dense_4 != 0).unbind(-1)
    else:
        # Pairs are used for float: each non-zero flag is duplicated.
        ksparse = 2
        dense_2 = dense.view(-1, k // ksparse, ksparse)
        first, second = (dense_2 != 0).unbind(-1)
        m0 = m1 = first
        m2 = m3 = second
    meta_ncols = k // (ksparse * quadbits_per_meta_elem)

    # Encoding quadruples of True/False values as follows:
    #     [True,  True,  False, False] -> 0b0100
    #     [True,  False, True,  False] -> 0b1000
    #     [False, True,  True,  False] -> 0b1001
    #     [True,  False, False, True ] -> 0b1100
    #     [False, True,  False, True ] -> 0b1101
    #     [False, False, True,  True ] -> 0b1110
    # Thus, lower two bits in the encoding are index of the True value
    # at the lowest index in the quadruple, and the higher two bits in
    # the encoding are index of the other True value in the quadruple.
    # In case there are fewer than two True values, then False value or
    # values at some index or indices are considered True for the
    # encoding.  In case there are more than two True values, then the
    # excess True value(s) at some indices are considered False for
    # the encoding.  The exact encodings used for these cases are as
    # follows:
    #     [False, False, False, False] -> 0b1110
    #     [False, False, False, True ] -> 0b1110
    #     [False, False, True,  False] -> 0b1110
    #     [False, True,  False, False] -> 0b1001
    #     [False, True,  True,  True ] -> 0b1101
    #     [True,  False, False, False] -> 0b1000
    #     [True,  False, True,  True ] -> 0b1100
    #     [True,  True,  False, True ] -> 0b0100
    #     [True,  True,  True,  False] -> 0b0100
    #     [True,  True,  True,  True ] -> 0b0100
    # These particular encodings are chosen, with the help of Espresso
    # logic minimizer software, for the purpose of minimization of
    # corresponding Boolean functions, that translate non-zero flags
    # into encoding bits.  Note also possible choices for the first
    # and last of these encodings were limited only to (0b0100,
    # 0b1110), in order to produce valid encodings for 1:2 sparsity
    # case.

    expr0 = m0 & m1
    expr1 = ~m0 & m1
    expr2 = ~m0 & ~m1
    bit0 = expr1
    bit1 = expr2
    bit2 = expr0 | expr2 | m3
    bit3 = expr1 | ~m1
    idxs0 = bit0 | (bit1.to(torch.int64) << 1)
    idxs1 = bit2 | (bit3.to(torch.int64) << 1)

    if not is_fp32:
        sparse0 = dense_4.gather(
            -1, idxs0.unsqueeze(-1))  # type: ignore[possibly-undefined]
        sparse1 = dense_4.gather(-1, idxs1.unsqueeze(-1))
        sparse = torch.stack((sparse0, sparse1), dim=-1).view(m, k // 2)
    else:
        sparse = dense_2.gather(
            -1, idxs0.unsqueeze(-1) // 2).view(
                m, k // 2)  # type: ignore[possibly-undefined]

    # One 4-bit code per group; pack quadbits_per_meta_elem of them into
    # each metadata element, lowest group in the lowest bits.
    meta_4 = idxs0 | (idxs1 << 2)
    meta_n = meta_4.view(
        (-1, meta_ncols, quadbits_per_meta_elem)).to(meta_dtype)

    meta = meta_n[:, :, 0]
    for nibble in range(1, quadbits_per_meta_elem):
        meta = meta | (meta_n[:, :, nibble] << (4 * nibble))

    # Reorder meta tensor elements.
    meta_reordered = meta.new_empty((m * meta_ncols, ))
    meta_offsets = _calculate_meta_reordering_scatter_offsets(
        m, meta_ncols, meta_dtype, device)
    meta_reordered.scatter_(0, meta_offsets, meta.view(-1))

    return (sparse, meta_reordered.view(m, meta_ncols))
|
||||
|
||||
|
||||
def sparse_semi_structured_to_dense_cutlass(sparse, meta_reordered):
    """Reconstruct a dense matrix from its CUTLASS "compressed" form.

    This is the reverse of ``sparse_semi_structured_from_dense_cutlass``: it
    takes the compressed matrix (in the layout used by the CUTLASS backend)
    together with the accompanying metadata matrix and scatters the kept
    values back into a zero-initialised dense matrix.

    :param sparse: 2-D compressed tensor of shape ``(m, k)``.
    :param meta_reordered: 2-D ``torch.int16``/``torch.int32`` metadata
        tensor with ``m`` rows, on the same device as ``sparse``.
    :return: Dense tensor of shape ``(m, 2 * k)`` with ``sparse``'s dtype.
    :raises RuntimeError: If either tensor is not 2-D, the devices differ,
        the metadata dtype is unsupported, or the shapes are inconsistent.
    """
    if sparse.dim() != 2:
        raise RuntimeError(
            f"Expected 2-dimensional sparse tensor, got {sparse.dim()}-dimensional tensor"  # noqa: E501
        )

    m, k = sparse.shape
    device = sparse.device

    if meta_reordered.dim() != 2:
        raise RuntimeError(
            f"Expected 2-dimensional meta tensor, got {meta_reordered.dim()}-dimensional tensor"  # noqa: E501
        )
    if meta_reordered.device != device:
        raise RuntimeError(
            f"Expected meta matrix to be on {device} device, got matrix on {meta_reordered.device} device"  # noqa: E501
        )

    meta_dtype = meta_reordered.dtype
    if meta_dtype not in (torch.int16, torch.int32):
        raise RuntimeError(f"Invalid datatype {meta_dtype} of meta matrix")
    # Number of 4-bit groups ("quadbits") stored in one metadata element.
    quadbits_per_meta_elem = meta_dtype.itemsize * 8 // 4

    ksparse = 4 if sparse.dtype != torch.float else 2

    meta_nrows, meta_ncols = meta_reordered.shape
    if meta_nrows != m:
        raise RuntimeError(
            f"Number of rows of meta matrix {meta_nrows} must be equal to number of rows of sparse matrix {m}"  # noqa: E501
        )
    if meta_ncols * ksparse * quadbits_per_meta_elem != 2 * k:
        raise RuntimeError(
            f"Number of columns of sparse matrix {k} different from the {meta_ncols * ksparse * quadbits_per_meta_elem // 2}, "  # noqa: E501
            "expected according to the number of columns of meta matrix")

    # Undo meta tensor elements reordering.
    meta_offsets = _calculate_meta_reordering_scatter_offsets(
        m, meta_ncols, meta_dtype, device)
    meta = torch.gather(meta_reordered.view(-1), 0,
                        meta_offsets).view(m, meta_ncols)

    # Unpack sparse tensor back to original dense tensor, using
    # information provided by meta tensor. Note that torch.float
    # datatype is handled pretty much the same as
    # torch.half/torch.bfloat16, as metadata for a pair of torch.float
    # value is encoded as if underlying 8 bytes contain four
    # torch.half/torch.bfloat16 values, where either first two or last
    # two are zeros.
    meta_2 = torch.empty(
        (m, meta_ncols, 2 * quadbits_per_meta_elem),
        dtype=meta_dtype,
        device=device,
    )
    # Every metadata element holds 2 * quadbits_per_meta_elem two-bit
    # indices, lowest bits first.
    for i in range(2 * quadbits_per_meta_elem):
        meta_2[:, :, i] = (meta >> (2 * i)) & 0b11

    # Each consecutive pair of two-bit indices addresses the same group of
    # four dense slots, hence the per-pair base offset (step of 4).
    dense_offsets = meta_2.view(-1) + (
        torch.arange(0, 2 * m * k // ksparse, device=device) * 4).view(
            -1, 1).repeat(1, 2).view(-1)

    dense = torch.zeros((m * 2 * k, ), dtype=sparse.dtype, device=device)
    if sparse.dtype != torch.float:
        # reshape (not view) so non-contiguous inputs are accepted
        dense.scatter_(0, dense_offsets, sparse.reshape(-1))
    else:
        dense.view(torch.half).scatter_(0, dense_offsets,
                                        sparse.view(torch.half).view(-1))

    return dense.view(m, 2 * k)
|
||||
|
||||
|
||||
def mask_creator(tensor):
    """Create a 2:4 (N:M) sparsity mask for ``tensor``.

    The elements of ``tensor`` are taken in flattened order and split into
    consecutive groups of ``M = 4``. Within each group the ``M - N = 2``
    entries with the smallest absolute value are masked out (0) and the
    remaining ``N = 2`` entries are kept (1).

    :param tensor: Tensor whose number of elements is divisible by 4.
    :return: Tensor of ones and zeros with the same shape as ``tensor``,
        created with ``torch.ones`` (default dtype) on ``tensor``'s device.
    :raises ValueError: If the number of elements is not divisible by 4.
    """
    N = 2  # number of weights kept per group
    M = 4  # size of a weight group

    if tensor.numel() % M != 0:
        raise ValueError(
            f"Tensor of size {tensor.shape} can't be evenly divided into "
            f"{M} groups")

    num_groups = tensor.numel() // M

    # N:M sparsity for linear layers: rank by magnitude inside every group
    tensor_temp = tensor.detach().abs().reshape(num_groups, M)
    # argsort is ascending, so the first M - N indices are the ones to prune
    index = torch.argsort(tensor_temp, dim=1)[:, :int(M - N)]

    w_b = torch.ones(tensor_temp.shape, device=tensor_temp.device)
    return w_b.scatter_(dim=1, index=index, value=0).reshape(tensor.shape)
|
||||
|
||||
|
||||
def inject_24(w, size_k, size_n):
    """Zero out entries of ``w`` so that it follows a 2:4 sparsity pattern.

    The mask is computed on the transpose of ``w`` (so groups of four run
    along the first dimension of ``w``) and transposed back.

    :param w: Weight tensor of shape ``(size_k, size_n)``.
    :param size_k: Expected first dimension of ``w``.
    :param size_n: Expected second dimension of ``w``.
    :return: ``(masked_w, mask)``, both contiguous; ``mask`` is boolean.
    """
    assert w.shape == (size_k, size_n)

    # Keep the mask on the same device as ``w`` instead of forcing the
    # current CUDA device, so CPU / non-default-GPU inputs work too.
    mask = mask_creator(w.t()).t().to(w.device).bool()

    return (mask * w).contiguous(), mask.contiguous()
|
||||
|
||||
|
||||
def check_24(w, num_rows_to_sample=50, _verbose=False):
    """Print how many sampled 4-element segments of ``w`` violate 2:4.

    ``w`` is transposed first; rows of the transposed matrix are sampled
    (with replacement, via ``random.choices``) and each full, aligned block
    of four consecutive values is checked for having at most two non-zeros.
    Offending blocks and a final summary line are printed.

    :param w: 2-D tensor to inspect.
    :param num_rows_to_sample: Number of row indices to draw.
    :param _verbose: If true, also print the sampled row indices.
    """
    BLOCK_SIZE = 4
    MAX_NON_ZEROS = 2

    w = w.t().contiguous()

    print("check_24: w.shape = {}".format(w.shape))

    num_rows, num_cols = w.shape
    sampled_row_idxs = random.choices(range(num_rows), k=num_rows_to_sample)
    if _verbose:
        print(f"Sampled row idxs = {sampled_row_idxs}")

    total_segments = 0
    non_24_segments = 0
    for i in sampled_row_idxs:
        # "+ 1" so the last full block (starting at num_cols - BLOCK_SIZE)
        # is checked as well; trailing partial blocks are still skipped.
        for j in range(0, num_cols - BLOCK_SIZE + 1, BLOCK_SIZE):
            total_segments += 1
            block = w[i, j:j + BLOCK_SIZE]
            num_nonzero = torch.count_nonzero(block)
            if num_nonzero > MAX_NON_ZEROS:
                print("i = {} j = {} block = {}".format(i, j, block))
                non_24_segments += 1

    print(f"{non_24_segments} / {total_segments} do not have 2:4 structure.")
|
||||
|
||||
|
||||
def compress_quantized_24_weight(q_24, size_k, size_n, wtype: ScalarType):
    """Compress a quantized 2:4-sparse weight and produce its metadata.

    :param q_24: Quantized weight of shape ``(size_k, size_n)``.
    :param size_k: Expected first dimension of ``q_24``.
    :param size_n: Expected second dimension of ``q_24``.
    :param wtype: Scalar type whose ``bias`` is removed before compression
        and added back afterwards.
    :return: ``(compressed_weight, meta)``.
    """
    assert q_24.shape == (size_k, size_n)

    bias = wtype.bias

    # Remove the bias to normalize over 0, then hand the transposed,
    # contiguous matrix to the compressor.
    centered_t = (q_24 - bias).t().contiguous()
    compressed_t, meta = sparse_semi_structured_from_dense_cutlass(
        centered_t)

    # Transpose back and restore the bias.
    q_24_comp = compressed_t.t().contiguous() + bias

    # Resize meta to its actual shape (without moving any data)
    meta_rows, meta_cols = meta.shape
    meta = meta.resize_(meta_cols // 2, meta_rows * 2)

    return q_24_comp, meta
|
||||
|
||||
|
||||
def get_scale_perms_24():
    """Return the two 64-entry scale permutations used for Marlin 2:4.

    :return: ``(scale_perm, scale_perm_single)``. The first interleaves each
        run of eight indices as ``0, 4, 1, 5, 2, 6, 3, 7``; the second keeps
        each run of eight in natural order.
    """
    interleaved = (0, 4, 1, 5, 2, 6, 3, 7)
    natural = (0, 1, 2, 3, 4, 5, 6, 7)

    scale_perm = [8 * row + col for row in range(8) for col in interleaved]
    scale_perm_single = [8 * row + col for row in range(8) for col in natural]
    return scale_perm, scale_perm_single
|
||||
|
||||
|
||||
def get_weight_perm_24(num_bits: int):
    """Build the 1024-entry weight permutation for Marlin 2:4 packing.

    :param num_bits: Quantization width; must be 4 or 8. It selects how the
        permutation is interleaved at the end.
    :return: 1-D tensor holding the permutation indices.
    :raises ValueError: If ``num_bits`` is neither 4 nor 8.
    """
    perm_list = []
    for i in range(32):
        col, lane = divmod(i, 4)
        # Part of the index that does not depend on row/block.
        base = (col // 2) * 256 + 8 * (col % 2)
        rows = (
            2 * lane,
            2 * lane + 1,
            2 * (lane + 4),
            2 * (lane + 4) + 1,
        )
        perm1 = [
            16 * row + base + 4 * block for block in (0, 1) for row in rows
        ]
        for j in range(4):
            perm_list.extend(p + 1 * j for p in perm1)
    perm = numpy.array(perm_list)

    if num_bits not in (4, 8):
        raise ValueError("num_bits must be 4 or 8, got {}".format(num_bits))
    if num_bits == 4:
        interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])
    else:
        interleave = numpy.array([0, 2, 1, 3])

    perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
    return torch.from_numpy(perm)
|
||||
|
||||
|
||||
def marlin_permute_scales_24(s: torch.Tensor, size_k: int, size_n: int,
                             group_size: int) -> torch.Tensor:
    """Permute quantization scales into the Marlin 2:4 layout.

    The grouped permutation is used when ``group_size`` is a real group size
    smaller than ``size_k``; otherwise (``-1`` or ``>= size_k``) the
    single-group permutation is used.

    :return: Contiguous tensor reshaped to ``(-1, size_n)``.
    """
    scale_perm, scale_perm_single = get_scale_perms_24()

    is_grouped = group_size != -1 and group_size < size_k
    chosen_perm = scale_perm if is_grouped else scale_perm_single

    permuted = s.reshape((-1, len(chosen_perm)))[:, chosen_perm]
    return permuted.reshape((-1, size_n)).contiguous()
|
||||
|
||||
|
||||
def marlin_24_quantize(
    w: torch.Tensor,
    quant_type: ScalarType,
    group_size: int,
):
    """Sparsify, quantize and repack ``w`` for the Marlin 2:4 format.

    :param w: Weight tensor of shape ``(size_k, size_n)``.
    :param quant_type: Quantized scalar type.
    :param group_size: Quantization group size; ``-1`` means ``size_k``.
    :return: List ``[w_24_ref, marlin_24_q_w_comp, meta, marlin_24_s]`` with
        every tensor moved to ``w.device``.
    """
    size_k, size_n = w.shape

    # Normalize group_size
    if group_size == -1:
        group_size = size_k
    assert group_size <= size_k

    # Inject 2:4 sparsity (the mask itself is not needed afterwards)
    w_24, _ = inject_24(w, size_k, size_n)

    # Quantize; act-order outputs are unused because act_order is disabled
    w_24_ref, q_w_24, s, _, _ = gptq_quantize_weights(w_24,
                                                      quant_type,
                                                      group_size,
                                                      act_order=False)

    # Compress quantized weight; K is halved by the compression
    q_w_24_comp, meta = compress_quantized_24_weight(q_w_24, size_k, size_n,
                                                     quant_type)
    size_k_comp = size_k // 2

    # Reformat to marlin
    num_bits = quant_type.size_bits
    weight_perm = get_weight_perm_24(num_bits)
    marlin_24_q_w_comp = marlin_weights(q_w_24_comp, size_k_comp, size_n,
                                        num_bits, weight_perm)
    marlin_24_s = marlin_permute_scales_24(s, size_k, size_n, group_size)

    # Create result
    results = (w_24_ref, marlin_24_q_w_comp, meta, marlin_24_s)
    return [t.to(w.device) for t in results]
|
||||
@@ -0,0 +1,125 @@
|
||||
from typing import List
|
||||
|
||||
import numpy
|
||||
import torch
|
||||
|
||||
from .marlin_utils_test import marlin_permute_weights
|
||||
from .quant_utils import get_pack_factor, qqq_quantize_weights
|
||||
|
||||
|
||||
def marlin_qqq_weights(q_w, size_k, size_n, num_bits, perm, group_size):
    """Permute quantized weights and pack them into int32 words.

    :param q_w: Quantized weight tensor.
    :param size_k: K dimension passed to the permutation helper.
    :param size_n: N dimension passed to the permutation helper.
    :param num_bits: Bits per quantized value.
    :param perm: Weight permutation.
    :param group_size: When equal to ``size_k`` each value is masked to its
        low 4 bits before being shifted into place.
    :return: Packed ``int32`` tensor on the device of the permuted weights.
    """
    # Permute
    q_w = marlin_permute_weights(q_w, size_k, size_n, perm)

    # Pack
    pack_factor = get_pack_factor(num_bits)
    orig_device = q_w.device

    q_w = q_w.cpu().numpy().astype(numpy.uint32)
    n_rows, n_cols = q_w.shape[0], q_w.shape[1]
    q_packed = numpy.zeros((n_rows, n_cols // pack_factor),
                           dtype=numpy.uint32)

    mask_low_nibble = group_size == size_k
    for i in range(pack_factor):
        lane = q_w[:, i::pack_factor]
        if mask_low_nibble:
            lane = lane & 0xF
        q_packed |= lane << num_bits * i

    return torch.from_numpy(q_packed.astype(numpy.int32)).to(orig_device)
|
||||
|
||||
|
||||
def get_qqq_scale_perms():
    """Return the scale permutations used by the QQQ Marlin layout.

    :return: ``(scale_perm, scale_perm_single)`` with 64 and 32 entries
        respectively.
    """
    scale_perm = [i + 8 * j for i in range(8) for j in range(8)]

    pair_offsets = (0, 1, 8, 9, 16, 17, 24, 25)
    scale_perm_single = [2 * i + j for i in range(4) for j in pair_offsets]

    return scale_perm, scale_perm_single
|
||||
|
||||
|
||||
# NOTE(HandH1998): QQQ employs different perms for per-group and per-channel weight quantization. # noqa: E501
|
||||
def get_qqq_weight_perm(num_bits: int, quant_type: str):
    """Build the 1024-entry weight permutation for the QQQ Marlin layout.

    :param num_bits: Quantization width; only 4 is accepted.
    :param quant_type: ``"per-channel"`` or ``"per-group"``; selects the
        interleave applied to every run of eight indices.
    :return: 1-D tensor holding the permutation indices.
    :raises AssertionError: If ``quant_type`` is not one of the two values.
    :raises Exception: If ``num_bits`` is not 4.
    """
    perm_list = []
    for i in range(32):
        col, lane = divmod(i, 4)
        first_row = 4 * lane
        perm1 = [
            16 * row + col + 8 * block for block in (0, 1)
            for row in range(first_row, first_row + 4)
        ]
        for j in range(4):
            perm_list.extend(p + 256 * j for p in perm1)

    perm = numpy.array(perm_list)

    assert quant_type in ["per-channel",
                          "per-group"], "not supported quantization type"
    if num_bits != 4:
        raise Exception("num_bits must be 4, got {}".format(num_bits))

    if quant_type == "per-channel":
        interleave = numpy.array([4, 0, 5, 1, 6, 2, 7, 3])
    else:
        interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])

    perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
    return torch.from_numpy(perm)
|
||||
|
||||
|
||||
def marlin_qqq_permute_scales(s_group, s_channel, size_k, size_n, group_size):
    """Permute QQQ group and channel scales into the Marlin layout.

    The channel scales are always permuted; the group scales are only
    permuted (and reshaped to ``(-1, size_n)``) when ``group_size`` is a
    real group size smaller than ``size_k``, otherwise they are returned
    unchanged.

    :return: ``(s_group, s_channel)``.
    """
    scale_perm, scale_perm_single = get_qqq_scale_perms()

    is_grouped = group_size != -1 and group_size < size_k
    if is_grouped:
        s_group = s_group.reshape((-1, len(scale_perm)))[:, scale_perm]
        s_group = s_group.reshape((-1, size_n)).contiguous()

    # Both the per-group and per-channel cases permute s_channel the same way
    s_channel = s_channel.reshape(
        (-1, len(scale_perm_single)))[:, scale_perm_single]
    s_channel = s_channel.reshape((-1, size_n)).contiguous()

    return s_group, s_channel
|
||||
|
||||
|
||||
def marlin_qqq_quantize(
    w: torch.Tensor,
    num_bits: int,
    group_size: int,
):
    """Quantize ``w`` with QQQ and repack it into the Marlin QQQ layout.

    :param w: Weight tensor of shape ``(size_k, size_n)``.
    :param num_bits: Bits per quantized value.
    :param group_size: Quantization group size; ``-1`` means ``size_k``
        (per-channel quantization).
    :return: List ``[w_ref, marlin_qqq_q_w, marlin_qqq_s_group,
        marlin_qqq_s_channel]`` with every tensor moved to ``w.device``.
    """
    size_k, size_n = w.shape

    # Normalize group_size
    if group_size == -1:
        group_size = size_k
    assert group_size <= size_k
    quant_type = "per-group" if group_size != size_k else "per-channel"

    # Quantize
    w_ref, q_w, s_group, s_channel = qqq_quantize_weights(
        w, num_bits, group_size)

    # Reformat to marlin_qqq
    weight_perm = get_qqq_weight_perm(num_bits, quant_type)
    marlin_qqq_q_w = marlin_qqq_weights(q_w, size_k, size_n, num_bits,
                                        weight_perm, group_size)
    marlin_qqq_s_group, marlin_qqq_s_channel = marlin_qqq_permute_scales(
        s_group, s_channel, size_k, size_n, group_size)

    # Create result
    results = (w_ref, marlin_qqq_q_w, marlin_qqq_s_group,
               marlin_qqq_s_channel)
    return [t.to(w.device) for t in results]
|
||||
451
vllm/model_executor/layers/quantization/utils/quant_utils.py
Normal file
451
vllm/model_executor/layers/quantization/utils/quant_utils.py
Normal file
@@ -0,0 +1,451 @@
|
||||
"""This file is used for /tests and /benchmarks"""
|
||||
from typing import List, Optional
|
||||
|
||||
import numpy
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.quantization.qqq import (
|
||||
MARLIN_QQQ_SUPPORTED_NUM_BITS)
|
||||
from vllm.scalar_type import ScalarType, scalar_types
|
||||
|
||||
SUPPORTED_GPTQ_QUANT_TYPES = [scalar_types.uint4b8, scalar_types.uint8b128]
|
||||
SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]
|
||||
|
||||
# Note: this is a hack. We should update each model to register the
|
||||
# stacked params and get it from there instead in a future PR.
|
||||
# fused_name: List[shard_name]
|
||||
FUSED_LAYER_NAME_MAPPING = {
|
||||
"qkv_proj": ["q_proj", "k_proj", "v_proj"],
|
||||
"gate_up_proj": ["gate_proj", "up_proj"]
|
||||
}
|
||||
|
||||
|
||||
def pack_weights_into_int32(w_q: torch.Tensor,
                            wtype: ScalarType,
                            packed_dim: int = 0):
    """Pack ``32 // wtype.size_bits`` values into each int32 along a dim.

    Values are masked to ``wtype.size_bits`` bits; the value at position
    ``i`` inside a pack occupies bits ``[i * size_bits, (i + 1) * size_bits)``.

    :param w_q: Integer tensor to pack; its size along ``packed_dim`` must
        be divisible by the pack factor.
    :param wtype: Scalar type providing ``size_bits``.
    :param packed_dim: Dimension along which values are packed.
    :return: ``torch.int32`` tensor with ``packed_dim`` shrunk by the pack
        factor.
    """
    # Move the dim to pack to the end, remembering how to undo that.
    ndim = len(w_q.shape)
    perm = tuple(d for d in range(ndim) if d != packed_dim) + (packed_dim, )
    inv_perm = tuple(perm.index(d) for d in range(len(perm)))
    w_q_perm = w_q.permute(perm)

    bits = wtype.size_bits
    pack_factor = 32 // bits
    mask = (1 << bits) - 1

    packed_shape = list(w_q_perm.shape)
    assert w_q_perm.shape[-1] % pack_factor == 0
    packed_shape[-1] //= pack_factor

    res = torch.zeros(packed_shape, dtype=torch.int32, device=w_q.device)
    for slot in range(pack_factor):
        res |= (w_q_perm[..., slot::pack_factor] & mask) << bits * slot

    return res.permute(inv_perm)
|
||||
|
||||
|
||||
def unpack_weights_into_int32(w_q: torch.Tensor,
                              wtype: ScalarType,
                              packed_dim: int = 0):
    """Unpack values of ``wtype.size_bits`` bits from a packed tensor.

    Position ``i`` inside each pack is read from bits
    ``[i * size_bits, (i + 1) * size_bits)`` of the packed element.

    :param w_q: Packed integer tensor.
    :param wtype: Scalar type providing ``size_bits``.
    :param packed_dim: Dimension along which values were packed.
    :return: ``torch.int32`` tensor with ``packed_dim`` grown by the pack
        factor ``32 // size_bits``.
    """
    # Move the packed dim to the end, remembering how to undo that.
    ndim = len(w_q.shape)
    perm = tuple(d for d in range(ndim) if d != packed_dim) + (packed_dim, )
    inv_perm = tuple(perm.index(d) for d in range(len(perm)))
    w_q_perm = w_q.permute(perm)

    bits = wtype.size_bits
    pack_factor = 32 // bits
    mask = (1 << bits) - 1

    unpacked_shape = list(w_q_perm.shape)
    unpacked_shape[-1] *= pack_factor

    res = torch.zeros(unpacked_shape, dtype=torch.int32, device=w_q.device)
    for slot in range(pack_factor):
        res[..., slot::pack_factor] = (w_q_perm >> bits * slot) & mask

    return res.permute(inv_perm)
|
||||
|
||||
|
||||
def is_layer_skipped(prefix: str, ignored_layers: List[str]) -> bool:
    """Tell whether the layer named ``prefix`` is listed as ignored.

    For fused layers (final name component found in
    ``FUSED_LAYER_NAME_MAPPING``) every shard name is looked up in
    ``ignored_layers`` instead, and all shards must agree.

    :param prefix: Dotted layer name, e.g.
        ``model.layers.0.self_attn.q_proj``.
    :param ignored_layers: Names of layers that are skipped.
    :return: ``True`` if the layer (or all of its shards) is ignored.
    :raises ValueError: If only some shards of a fused layer are ignored.
    """
    # prefix: model.layers.0.self_attn.q_proj
    # proj_name: q_proj
    proj_name = prefix.split(".")[-1]
    if proj_name not in FUSED_LAYER_NAME_MAPPING:
        return prefix in ignored_layers

    # Swap only the final component; str.replace would also rewrite any
    # earlier occurrence of proj_name inside the prefix.
    parent = prefix[:len(prefix) - len(proj_name)]
    shard_flags = [
        (parent + shard_proj_name) in ignored_layers
        for shard_proj_name in FUSED_LAYER_NAME_MAPPING[proj_name]
    ]

    if any(shard_flags) and not all(shard_flags):
        raise ValueError(
            f"Detected some but not all shards of {prefix} "
            "are quantized. All shards of fused layers "
            "to have the same precision.")

    # A fused entry with no shards gives no answer at all.
    assert shard_flags
    return shard_flags[0]
|
||||
|
||||
|
||||
def get_pack_factor(num_bits):
    """Return how many ``num_bits``-wide values fit into 32 bits.

    :raises AssertionError: If ``num_bits`` does not divide 32.
    """
    factor, leftover = divmod(32, num_bits)
    assert leftover == 0, f"Unsupported num_bits = {num_bits}"
    return factor
|
||||
|
||||
|
||||
def permute_rows(q_w: torch.Tensor,
                 w_ref: torch.Tensor,
                 group_size: int,
                 test_perm: Optional[torch.Tensor] = None):
    """Apply one row permutation to ``q_w``, ``w_ref`` and the group index.

    Used to simulate act_order: row ``i`` initially belongs to group
    ``i // group_size``; the rows and their group indices are then shuffled
    with ``test_perm`` or, if that is ``None``, a random permutation.

    :param q_w: Quantized weights, shape ``(k_size, n)``.
    :param w_ref: Reference weights with the same shape as ``q_w``.
    :param group_size: Number of consecutive rows per group.
    :param test_perm: Optional fixed permutation of the ``k_size`` rows.
    :return: ``(w_ref, q_w, g_idx, rand_perm)``, all on ``q_w``'s device.
    """
    assert q_w.shape == w_ref.shape

    orig_device = q_w.device
    k_size, _ = q_w.shape

    # Group index of every row, computed in one vectorised floor division
    # instead of a Python loop over rows.
    g_idx = torch.div(torch.arange(k_size, dtype=torch.int32),
                      group_size,
                      rounding_mode="floor")

    # Simulate act_order by doing a random permutation on K
    rand_perm = test_perm if test_perm is not None else torch.randperm(k_size)

    g_idx = g_idx[rand_perm].contiguous()
    q_w = q_w[rand_perm, :].contiguous()
    w_ref = w_ref[rand_perm, :].contiguous()

    return (
        w_ref.to(device=orig_device),
        q_w.to(device=orig_device),
        g_idx.to(device=orig_device),
        rand_perm.to(device=orig_device),
    )
|
||||
|
||||
|
||||
def quantize_weights(w: torch.Tensor,
                     quant_type: ScalarType,
                     group_size: int,
                     zero_points: bool = False,
                     ref_zero_points_after_scales: bool = False):
    """Quantize ``w`` group-wise and compute the dequantized reference.

    :param w: Floating-point weight tensor of shape ``(size_k, size_n)``.
    :param quant_type: Integer scalar type providing ``min()``, ``max()``
        and, when ``has_bias()`` is true, ``bias``.
    :param group_size: Rows per quantization group; ``-1`` means ``size_k``.
    :param zero_points: Use asymmetric quantization with zero points.
    :param ref_zero_points_after_scales: Compute the reference with the
        zero point applied after scaling (only with ``zero_points``).
    :return: ``(w_ref, w_q, w_s, maybe_w_zp)``; the last entry is ``None``
        when ``zero_points`` is false.
    """
    assert quant_type.is_integer(), \
        "Floating point quantization may work but has not been tested"

    orig_device = w.device
    orig_type = w.dtype
    size_k, size_n = w.shape

    assert w.is_floating_point(), "w must be float"

    if group_size == -1:
        group_size = size_k
    assert group_size <= size_k

    is_grouped = group_size < size_k

    # Reshape to [groupsize, -1] so each column holds exactly one group
    if is_grouped:
        w = w.reshape((-1, group_size, size_n)).permute(1, 0, 2)
        w = w.reshape((group_size, -1))

    # Compute scale for each group
    max_val = torch.max(w, 0, keepdim=True).values
    min_val = torch.min(w, 0, keepdim=True).values

    max_q_val = quant_type.max()
    min_q_val = quant_type.min()

    if zero_points:
        assert not quant_type.is_signed() and max_q_val > 0
        w_s = (max_val - min_val).clamp(min=1e-5) / max_q_val
        maybe_w_zp = torch.round(torch.abs(min_val / w_s)) \
            .clamp(min_q_val, max_q_val).int()
        zp_offset = maybe_w_zp
    else:
        # If the bias is such that there are no possible negative/positive
        # values, set the max value to inf to avoid divide by 0
        pos_divisor = max_q_val if max_q_val != 0 else torch.inf
        neg_divisor = min_q_val if min_q_val != 0 else torch.inf
        w_s = torch.max(abs(max_val / pos_divisor),
                        abs(min_val / neg_divisor))
        maybe_w_zp = None
        zp_offset = 0

    # Quantize
    w_q = torch.round(w / w_s).int() + zp_offset
    w_q = torch.clamp(w_q, min_q_val, max_q_val)

    # Compute ref (dequantized)
    # For some kernels (namely Machete) the zero-points are applied after the
    # scales are applied, for this case computing the reference in similar way
    # allows us to use tighter error tolerances in our unit tests.
    if ref_zero_points_after_scales and zero_points:
        w_ref = w_q.to(orig_type) * w_s - maybe_w_zp.to(orig_type) * w_s
    else:
        w_ref = (w_q - zp_offset).to(orig_type) * w_s

    if quant_type.has_bias():
        w_q += quant_type.bias

    # Restore original shapes
    if is_grouped:

        def restore_shape(t):
            t = t.reshape((group_size, -1, size_n)).permute(1, 0, 2)
            return t.reshape((size_k, size_n)).contiguous()

        w_q = restore_shape(w_q)
        w_ref = restore_shape(w_ref)

    w_s = w_s.reshape((-1, size_n)).contiguous()

    if zero_points:
        maybe_w_zp = maybe_w_zp.reshape((-1, size_n)).contiguous()
        maybe_w_zp = maybe_w_zp.to(device=orig_device)

    return (
        w_ref.to(device=orig_device),
        w_q.to(device=orig_device),
        w_s.to(device=orig_device),
        maybe_w_zp,
    )
|
||||
|
||||
|
||||
def gptq_quantize_weights(w: torch.Tensor,
                          quant_type: ScalarType,
                          group_size: int,
                          act_order: bool,
                          test_perm: Optional[torch.Tensor] = None):
    """Quantize ``w`` GPTQ-style, optionally simulating act_order.

    :param w: Floating-point weight tensor with ``size_k`` rows.
    :param quant_type: One of ``SUPPORTED_GPTQ_QUANT_TYPES``.
    :param group_size: One of ``SUPPORTED_GROUP_SIZES`` or ``size_k``.
    :param act_order: If true, rows are permuted via ``permute_rows``;
        this requires ``group_size < size_k``.
    :param test_perm: Optional fixed permutation forwarded to
        ``permute_rows``.
    :return: ``(w_ref, w_q, w_s, g_idx, rand_perm)``; the last two are empty
        int tensors on ``w.device`` when ``act_order`` is false.
    """
    size_k, _ = w.shape

    assert w.is_floating_point(), "w must be float"
    assert quant_type in SUPPORTED_GPTQ_QUANT_TYPES, \
        f"Unsupported gptq type = {quant_type}"
    assert group_size in SUPPORTED_GROUP_SIZES + [
        size_k
    ], f"Unsupported groupsize = {group_size}"

    w_ref, w_q, w_s, _ = quantize_weights(w, quant_type, group_size)

    if not act_order:
        # No permutation requested: return empty placeholders
        g_idx = torch.empty(0, dtype=torch.int, device=w.device)
        rand_perm = torch.empty(0, dtype=torch.int, device=w.device)
        return w_ref, w_q, w_s, g_idx, rand_perm

    # Apply act_order
    assert (
        group_size < size_k
    ), "For act_order, groupsize = {} must be less than size_k = {}".format(
        group_size, size_k)

    w_ref, w_q, g_idx, rand_perm = permute_rows(w_q, w_ref, group_size,
                                                test_perm)

    return w_ref, w_q, w_s, g_idx, rand_perm
|
||||
|
||||
|
||||
# QQQ employs different quant schemes for per-group and
|
||||
# per-channel quantization.
|
||||
def qqq_quantize_weights(w: torch.Tensor, num_bits: int, group_size: int):
    """Quantize ``w`` with the QQQ scheme.

    When ``group_size < size_k`` the weights are quantized per group and the
    dequantized result is quantized again to int8 per channel; otherwise a
    single symmetric per-channel quantization is performed.

    :param w: Floating-point weight tensor of shape ``(size_k, size_n)``.
    :param num_bits: One of ``MARLIN_QQQ_SUPPORTED_NUM_BITS``.
    :param group_size: One of ``SUPPORTED_GROUP_SIZES`` or ``size_k``;
        ``-1`` means ``size_k``.
    :return: ``(w_ref, q_w, s_group, s_channel)`` on ``w``'s device;
        ``s_group`` is an empty half tensor in the per-channel case.
    """
    orig_device = w.device
    size_k, size_n = w.shape

    assert w.is_floating_point(), "w must be float"
    assert num_bits in MARLIN_QQQ_SUPPORTED_NUM_BITS, \
        f"Unsupported num_bits = {num_bits}"
    assert group_size in SUPPORTED_GROUP_SIZES + [
        size_k
    ], f"Unsupported groupsize = {group_size}"

    if group_size == -1:
        group_size = size_k
    assert group_size <= size_k

    if group_size < size_k:
        # Reshape to [groupsize, -1]
        w = w.reshape((-1, group_size, size_n)).permute(1, 0, 2)
        w = w.reshape((group_size, -1))

        max_q_val = 2**num_bits - 1
        half_q_val = (max_q_val + 1) // 2

        # Compute scale for each group
        s_group = torch.max(torch.abs(w), 0, keepdim=True)[0]
        s_group *= 2 / max_q_val  # 2 => symmetric

        # Quantize
        q_w = torch.round(w / s_group).int()
        q_w += half_q_val
        q_w = torch.clamp(q_w, 0, max_q_val)
        # Compute ref (dequantized)
        w_ref = (q_w - half_q_val).half() * s_group

        # Restore original shapes
        def restore_shape(t):
            t = t.reshape((group_size, -1, size_n)).permute(1, 0, 2)
            return t.reshape((size_k, size_n)).contiguous()

        q_w = restore_shape(q_w)
        w_ref = restore_shape(w_ref)

        # Compute int8 quantization scale for each channel
        s_channel = torch.max(torch.abs(w_ref), 0, keepdim=True)[0]
        s_channel /= 127.0
        t_int8 = (w_ref / s_channel).round().clamp(-128, 127).to(torch.int8)
        w_ref = t_int8.half() * s_channel
        s_channel = s_channel.reshape(1, -1).to(dtype=torch.float)

        # Fuse scales
        s_group = (s_group.reshape(-1, size_n).contiguous() /
                   s_channel).to(dtype=torch.half)
    else:
        max_q_val = 2**(num_bits - 1) - 1

        # Compute scale for each channel
        s_channel = torch.max(torch.abs(w), 0, keepdim=True)[0]
        s_channel /= max_q_val

        # Quantize
        q_w = torch.round(w / s_channel).int()
        q_w = torch.clamp(q_w, -max_q_val, max_q_val)
        # Compute ref (dequantized)
        w_ref = q_w.half() * s_channel

        s_group = torch.tensor([], dtype=torch.half)
        # div 2 ** (8 - self.bits)) to offset right shift in unpacking
        s_channel /= (2**(8 - num_bits))
        s_channel = s_channel.reshape(-1, size_n).contiguous().to(torch.float)

    return (
        w_ref.to(device=orig_device),
        q_w.to(device=orig_device),
        s_group.to(device=orig_device),
        s_channel.to(device=orig_device),
    )
|
||||
|
||||
|
||||
def sort_weights(q_w: torch.Tensor, g_idx: torch.Tensor):
    """Reorder the rows of ``q_w`` so that ``g_idx`` becomes ascending.

    :param q_w: 2-D weight tensor whose rows are reordered.
    :param g_idx: 1-D group index per row.
    :return: ``(sorted_q_w, sorted_g_idx, sort_indices)`` on ``q_w``'s
        device; ``sort_indices`` has dtype ``torch.int32``.
    """
    target_device = q_w.device

    # Sort based on g_idx
    order = torch.argsort(g_idx).to(dtype=torch.int32)

    sorted_g_idx = g_idx[order].contiguous()
    sorted_q_w = q_w[order, :].contiguous()

    return (
        sorted_q_w.to(device=target_device),
        sorted_g_idx.to(device=target_device),
        order.to(device=target_device),
    )
|
||||
|
||||
|
||||
def pack_rows(
    q_w: torch.Tensor,
    num_bits: int,
    size_k: int,
    size_n: int,
):
    """Pack ``num_bits``-wide values along the row (K) dimension.

    Row ``r`` of the result combines input rows
    ``r * pack_factor + i`` for ``i`` in ``range(pack_factor)``, with the
    ``i``-th value shifted left by ``num_bits * i``.

    :param q_w: Tensor of shape ``(size_k, size_n)``.
    :return: ``int32`` tensor of shape ``(size_k // pack_factor, size_n)``
        on ``q_w``'s device.
    """
    assert q_w.shape == (size_k, size_n)

    pack_factor = get_pack_factor(num_bits)
    assert size_k % pack_factor == 0

    target_device = q_w.device

    # Packing is done on the CPU with unsigned 32-bit arithmetic
    values = q_w.cpu().numpy().astype(numpy.uint32)
    packed = numpy.zeros((size_k // pack_factor, size_n), dtype=numpy.uint32)

    for slot in range(pack_factor):
        packed |= values[slot::pack_factor, :] << num_bits * slot

    return torch.from_numpy(packed.astype(numpy.int32)).to(target_device)
|
||||
|
||||
|
||||
def pack_cols(
    q_w: torch.Tensor,
    num_bits: int,
    size_k: int,
    size_n: int,
):
    """Pack ``num_bits``-wide values along the column (N) dimension.

    Column ``c`` of the result combines input columns
    ``c * pack_factor + i`` for ``i`` in ``range(pack_factor)``, with the
    ``i``-th value shifted left by ``num_bits * i``.

    :param q_w: Tensor of shape ``(size_k, size_n)``.
    :return: Contiguous ``int32`` tensor of shape
        ``(size_k, size_n // pack_factor)`` on ``q_w``'s device.
    """
    assert q_w.shape == (size_k, size_n)

    pack_factor = get_pack_factor(num_bits)
    assert size_n % pack_factor == 0

    target_device = q_w.device

    # Packing is done on the CPU with unsigned 32-bit arithmetic
    values = q_w.cpu().numpy().astype(numpy.uint32)
    packed = numpy.zeros((size_k, size_n // pack_factor), dtype=numpy.uint32)

    for slot in range(pack_factor):
        packed |= values[:, slot::pack_factor] << num_bits * slot

    result = torch.from_numpy(packed.astype(numpy.int32)).to(target_device)
    return result.contiguous()
|
||||
|
||||
|
||||
def unpack_cols(
    packed_q_w: torch.Tensor,
    num_bits: int,
    size_k: int,
    size_n: int,
):
    """Unpack ``num_bits``-wide values packed along the column dimension.

    :param packed_q_w: Tensor of shape ``(size_k, size_n // pack_factor)``.
    :return: Contiguous ``int32`` tensor of shape ``(size_k, size_n)`` on
        ``packed_q_w``'s device.
    """
    pack_factor = get_pack_factor(num_bits)
    assert size_n % pack_factor == 0
    assert packed_q_w.shape == (
        size_k, size_n // pack_factor
    ), "packed_q_w.shape = {} size_k = {}, size_n = {} pack_Factor = {}".format(
        packed_q_w.shape, size_k, size_n, pack_factor)

    target_device = packed_q_w.device

    # astype() makes a private copy that is shifted in place below
    remaining = packed_q_w.cpu().numpy().astype(numpy.uint32)
    unpacked = numpy.zeros((size_k, size_n), dtype=numpy.uint32)

    mask = (1 << num_bits) - 1
    for slot in range(pack_factor):
        # The lowest num_bits hold the value for this slot; drop them after
        unpacked[:, slot::pack_factor] = remaining & mask
        remaining >>= num_bits

    result = torch.from_numpy(unpacked.astype(numpy.int32)).to(target_device)
    return result.contiguous()
|
||||
|
||||
|
||||
def gptq_pack(
    q_w: torch.Tensor,
    num_bits: int,
    size_k: int,
    size_n: int,
):
    """Pack quantized weights the GPTQ way, i.e. along the row dimension.

    Thin wrapper that forwards all arguments to ``pack_rows``.
    """
    packed = pack_rows(q_w, num_bits, size_k, size_n)
    return packed
|
||||
|
||||
|
||||
def awq_pack(
    q_w: torch.Tensor,
    num_bits: int,
    size_k: int,
    size_n: int,
):
    """Interleave columns AWQ-style and pack them into int32 words.

    :param q_w: Tensor of shape ``(size_k, size_n)``.
    :param num_bits: 4 or 8; selects the interleave pattern.
    :return: Result of ``pack_cols`` on the interleaved tensor.
    :raises Exception: If ``num_bits`` is neither 4 nor 8.
    """
    assert q_w.shape == (size_k, size_n)

    # Interleave column dim (for the dequantize code) and pack it to int32
    if num_bits not in (4, 8):
        raise Exception("num_bits must be 4 or 8, got {}".format(num_bits))
    if num_bits == 4:
        interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])
    else:
        interleave = numpy.array([0, 2, 1, 3])

    interleaved = q_w.reshape((-1, len(interleave)))[:, interleave].ravel()
    interleaved = interleaved.reshape((-1, size_n)).contiguous()

    return pack_cols(interleaved, num_bits, size_k, size_n)
|
||||
246
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
Normal file
246
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
Normal file
@@ -0,0 +1,246 @@
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import is_hip
|
||||
|
||||
# Input scaling factors are no longer optional in _scaled_mm starting
|
||||
# from pytorch 2.5. Allocating a dummy tensor to pass as input_scale
|
||||
TORCH_DEVICE_IDENTITY = torch.ones(1).cuda() if is_hip() else None
|
||||
|
||||
|
||||
def cutlass_fp8_supported() -> bool:
    """Report whether the CUTLASS scaled-mm kernels support FP8 here.

    Always ``False`` on ROCm; otherwise the current device capability
    (``-1`` when it is unknown) is passed to
    ``ops.cutlass_scaled_mm_supports_fp8``.
    """
    # cutlass is not supported on Rocm
    if is_hip():
        return False

    capability_tuple = current_platform.get_device_capability()
    if capability_tuple is None:
        capability = -1
    else:
        capability = capability_tuple.to_int()

    return ops.cutlass_scaled_mm_supports_fp8(capability)
|
||||
|
||||
|
||||
def per_tensor_dequantize(
        tensor: torch.Tensor, inv_scale: Union[float,
                                               torch.Tensor]) -> torch.Tensor:
    """Dequantize ``tensor`` with a single scale.

    The tensor is cast to ``torch.float16`` and multiplied by ``inv_scale``.
    """
    return tensor.to(torch.float16) * inv_scale
|
||||
|
||||
|
||||
def all_close_1d(x: torch.Tensor) -> bool:
    """Return whether every element of the 1-D tensor is close to the first.

    Uses ``torch.allclose`` with its default tolerances, with ``x[0]`` as
    ``input`` and each element as ``other``. An empty tensor yields ``True``.
    """
    assert len(x.shape) == 1
    if x.shape[0] == 0:
        return True
    # One vectorised comparison instead of one allclose call per element.
    return torch.allclose(x[0].expand_as(x), x)
|
||||
|
||||
|
||||
def convert_to_channelwise(weight_scale: torch.Tensor,
                           logical_widths: List[int]) -> torch.Tensor:
    """Expand one scale per logical matrix into one scale per channel.

    :param weight_scale: Tensor indexed by logical matrix; entry ``idx`` is
        the scale for ``logical_widths[idx]`` consecutive channels.
    :param logical_widths: Number of channels of each logical matrix.
    :return: ``torch.float32`` tensor of shape ``(sum(logical_widths), 1)``
        on ``weight_scale``'s device.
    """
    # Create channelwise buffer
    weight_scale_channel = torch.empty((sum(logical_widths), 1),
                                       dtype=torch.float32,
                                       device=weight_scale.device)

    # Expand each scale to match the size of each logical matrix.
    start = 0
    for idx, logical_width in enumerate(logical_widths):
        end = start + logical_width
        weight_scale_channel[start:end, :] = weight_scale[idx]
        start = end

    return weight_scale_channel
|
||||
|
||||
|
||||
def requantize_with_max_scale(
|
||||
weight: torch.Tensor, weight_scale: torch.Tensor,
|
||||
logical_widths: List[int]) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
# Max scale to be used for requanitzation.
|
||||
max_w_scale = weight_scale.max()
|
||||
|
||||
# QKV / MLP is fused in the on disk checkpoint if any of the
|
||||
# weight scales are still set to the default since we initialize
|
||||
# N weight scales for N shards but we only load 1 weight scale
|
||||
# from disk in this case. Skip requantization in this case (since)
|
||||
# we already are quantized with the single scale.
|
||||
# * Sample Model: nm-testing/Phi-3-mini-128k-instruct-FP8
|
||||
unfused_module_in_checkpoint = (weight_scale[-1] > torch.finfo(
|
||||
torch.float8_e4m3fn).min)
|
||||
|
||||
# If unfused checkpoint, need requanize with the single scale.
|
||||
if unfused_module_in_checkpoint:
|
||||
start = 0
|
||||
for idx, logical_width in enumerate(logical_widths):
|
||||
end = start + logical_width
|
||||
weight_dq = per_tensor_dequantize(weight[start:end, :],
|
||||
weight_scale[idx])
|
||||
weight[start:end, :], _ = ops.scaled_fp8_quant(
|
||||
weight_dq, max_w_scale)
|
||||
start = end
|
||||
|
||||
return max_w_scale, weight
|
||||
|
||||
|
||||
def apply_fp8_linear(
    input: torch.Tensor,
    weight: torch.Tensor,
    weight_scale: torch.Tensor,
    input_scale: Optional[torch.Tensor] = None,
    input_scale_ub: Optional[torch.Tensor] = None,
    bias: Optional[torch.Tensor] = None,
    cutlass_fp8_supported: bool = True,
    use_per_token_if_dynamic: bool = False,
) -> torch.Tensor:
    """Quantize ``input`` to FP8 and multiply it with the FP8 ``weight``.

    Three execution paths exist:

    * ``cutlass_fp8_supported``: ``ops.cutlass_scaled_mm`` (fused GEMM + DQ).
    * otherwise, if both scales have a single element: fused
      ``torch._scaled_mm``.
    * otherwise: ``torch._scaled_mm`` with ``TORCH_DEVICE_IDENTITY`` scales,
      followed by an explicit dequantization and bias add.

    ``ops.scaled_fp8_quant`` handles both dynamic and static quantization:
    with ``input_scale=None`` the activation scale is computed from the
    input, otherwise ``input_scale`` is used. ``input_scale_ub`` is only
    forwarded on the CUTLASS path.

    Returns:
        The result converted to / produced in ``input.dtype``.
    """
    global TORCH_DEVICE_IDENTITY

    # cutlass_scaled_mm supports per tensor/channel W and per tensor/token A
    if cutlass_fp8_supported:
        qinput, x_scale = ops.scaled_fp8_quant(
            input,
            input_scale,
            scale_ub=input_scale_ub,
            use_per_token_if_dynamic=use_per_token_if_dynamic)

        # Fused GEMM_DQ
        return ops.cutlass_scaled_mm(qinput,
                                     weight,
                                     out_dtype=input.dtype,
                                     scale_a=x_scale,
                                     scale_b=weight_scale,
                                     bias=bias)

    # torch._scaled_mm supports per tensor weights + activations only,
    # so fall back to the unfused form if per channel or per token.
    #
    # Note: the input is padded because torch._scaled_mm is more performant
    # for matrices with batch dimension > 16. This could change in the
    # future.
    qinput, x_scale = ops.scaled_fp8_quant(
        input,
        input_scale,
        num_token_padding=17,
        use_per_token_if_dynamic=use_per_token_if_dynamic)

    num_tokens = input.shape[0]
    per_tensor_weights = weight_scale.numel() == 1
    per_tensor_activations = x_scale.numel() == 1

    if per_tensor_weights and per_tensor_activations:
        # Fused GEMM_DQ
        fused = torch._scaled_mm(qinput,
                                 weight,
                                 out_dtype=input.dtype,
                                 scale_a=x_scale,
                                 scale_b=weight_scale,
                                 bias=bias)
        # _scaled_mm returns a tuple for torch < 2.5 and a single value for
        # torch >= 2.5; normalize before removing the padding rows.
        if type(fused) is tuple and len(fused) == 2:
            fused = fused[0]
        return torch.narrow(fused, 0, 0, num_tokens)

    # Fallback for the channelwise case, where DQ is done unfused due to
    # limitations of _scaled_mm.
    #
    # Symmetric quantized GEMM by definition computes the following:
    #   C = (s_x * X) (s_w * W) + bias
    # This is equivalent to dequantizing the weights and activations
    # before applying a GEMM.
    #
    # In order to compute on quantized operands, a quantized kernel
    # rewrites the above like so:
    #   C = s_w * s_x * (X * W) + bias
    #
    # For the _scaled_mm fallback case this is broken into steps, since
    # _scaled_mm does not support s_w being a vector.

    # Make sure the dummy scale tensor lives on the same device as the weight.
    if (TORCH_DEVICE_IDENTITY is not None
            and TORCH_DEVICE_IDENTITY.device != weight.device):
        TORCH_DEVICE_IDENTITY = TORCH_DEVICE_IDENTITY.to(weight.device)

    # GEMM: C = (X * W), produced in fp32 so the scaling below happens at
    # full precision.
    raw = torch._scaled_mm(qinput,
                           weight,
                           scale_a=TORCH_DEVICE_IDENTITY,
                           scale_b=TORCH_DEVICE_IDENTITY,
                           out_dtype=torch.float32)
    # Same tuple-vs-tensor discrepancy as above (torch < 2.5 vs >= 2.5).
    if type(raw) is tuple and len(raw) == 2:
        raw = raw[0]

    # Unpad (undo num_token_padding)
    raw = torch.narrow(raw, 0, 0, num_tokens)
    x_scale = torch.narrow(x_scale, 0, 0, num_tokens)

    # DQ: C = sw * sx * (X * W) + bias
    result = raw * x_scale * weight_scale.t()
    if bias is not None:
        result = result + bias
    return result.to(dtype=input.dtype)
|
||||
|
||||
|
||||
def apply_int8_linear(
    input: torch.Tensor,
    weight: torch.Tensor,
    weight_scale: torch.Tensor,
    input_scale: Optional[torch.Tensor] = None,
    input_zero_point: Optional[torch.Tensor] = None,
    azp_adj: Optional[torch.Tensor] = None,
    bias: Optional[torch.Tensor] = None,
):
    """Quantize ``input`` to int8 and multiply it with the int8 ``weight``.

    ``ops.scaled_int8_quant`` supports both dynamic and static quantization:
      * dynamic: ``input_scale`` is None and the scale is computed from
        the input.
      * static: ``input_scale`` is provided and used as the scale.

    Quantization is requested as symmetric exactly when ``azp_adj`` is None.
    If the quantizer returns a zero point, the product is computed with
    ``ops.cutlass_scaled_mm_azp``; otherwise with ``ops.cutlass_scaled_mm``.
    The output dtype requested from either kernel is ``input.dtype``.
    """
    x_q, x_scale, x_zp = ops.scaled_int8_quant(input,
                                               input_scale,
                                               input_zero_point,
                                               symmetric=azp_adj is None)

    # No zero point returned: plain scaled matmul.
    if x_zp is None:
        return ops.cutlass_scaled_mm(x_q,
                                     weight,
                                     scale_a=x_scale,
                                     scale_b=weight_scale,
                                     out_dtype=input.dtype,
                                     bias=bias)

    # Zero point present: use the kernel that applies the azp correction.
    return ops.cutlass_scaled_mm_azp(x_q,
                                     weight,
                                     scale_a=x_scale,
                                     scale_b=weight_scale,
                                     out_dtype=input.dtype,
                                     azp_adj=azp_adj,
                                     azp=x_zp,
                                     bias=bias)
|
||||
|
||||
|
||||
def normalize_e4m3fn_to_e4m3fnuz(
|
||||
weight: torch.Tensor,
|
||||
weight_scale: torch.Tensor,
|
||||
input_scale: Optional[torch.Tensor] = None
|
||||
) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
|
||||
assert weight.dtype == torch.float8_e4m3fn
|
||||
# The bits pattern 10000000(-128) represents zero in e4m3fn
|
||||
# but NaN in e4m3fnuz. So here we set it to 0.
|
||||
# https://onnx.ai/onnx/technical/float8.html
|
||||
weight_as_int8 = weight.view(torch.int8)
|
||||
ROCM_FP8_NAN_AS_INT = -128
|
||||
weight_as_int8[weight_as_int8 == ROCM_FP8_NAN_AS_INT] = 0
|
||||
weight = weight_as_int8.view(torch.float8_e4m3fnuz)
|
||||
|
||||
# For the same bits representation, e4m3fnuz value is half of
|
||||
# the e4m3fn value, so we should double the scaling factor to
|
||||
# get the same dequantized value.
|
||||
# https://onnx.ai/onnx/technical/float8.html
|
||||
weight_scale = weight_scale * 2.0
|
||||
if input_scale is not None:
|
||||
input_scale = input_scale * 2.0
|
||||
return weight, weight_scale, input_scale
|
||||
Reference in New Issue
Block a user