Upgrade to vllm 0.17.0 corex v4.1 overlay
@@ -3,6 +3,8 @@
 """Utility methods for model layers."""

 from collections.abc import Callable
+import ast
+import re

 import torch

@@ -13,6 +15,7 @@ from vllm.logger import init_logger
 from vllm.platforms import CpuArchEnum, current_platform
 from vllm.utils.platform_utils import num_compute_units
 from vllm.utils.torch_utils import direct_register_custom_op
+import ixformer.inference.functions as IXF

 logger = init_logger(__name__)

@@ -31,27 +34,6 @@ def is_layer_moe_router_gate(prefix: str) -> bool:
     return prefix.rsplit(".", 1)[-1] in MOE_LAYER_ROUTER_GATE_SUFFIXES


-def shuffle_weight(w: torch.Tensor) -> torch.Tensor:
-    # Shuffle weight along the last dimension so that
-    # we folded the weights to adjance location
-    # Example:
-    # input:
-    # [[1, 2, 3, 4, 5, 6],
-    #  [7, 8, 9, 10, 11, 12]]
-    # output:
-    # [[1, 4, 2, 5, 3, 6],
-    #  [7, 10, 8, 11, 9, 12]]
-    # This will be used together with triton swiglu kernel
-    shape = w.shape
-    N = shape[-1]
-    first = w[..., : N // 2]
-    second = w[..., N // 2 :]
-
-    stacked = torch.stack((first, second), dim=-1)
-    w_shuffled = stacked.reshape(shape)
-    return w_shuffled
-
-
 def get_token_bin_counts_and_mask(
     tokens: torch.Tensor,
     vocab_size: int,
@@ -116,7 +98,11 @@ def default_unquantized_gemm(
     weight: torch.Tensor,
     bias: torch.Tensor | None = None,
 ):
-    return torch.nn.functional.linear(x, weight, bias)
+    if bias is None and x.dtype in [torch.half, torch.bfloat16] and weight.dtype == torch.float32:
+        return IXF.mixed_type_linear(input=x, weight=layer.weight)
+    if x.dtype == torch.float32:
+        return torch.nn.functional.linear(x, weight, bias)
+    return IXF.linear(x, weight, bias)


 def use_aiter_triton_gemm(n, m, k, dtype):
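Reviewer note: a minimal pure-PyTorch sketch of the dtype routing the new default_unquantized_gemm body implements. The IXF stand-ins are assumptions for illustration only; ixformer's actual kernels and numerics are not shown in this diff.

import torch
import torch.nn.functional as F

def reference_unquantized_gemm(
    x: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor | None = None,
) -> torch.Tensor:
    # Half/bf16 activations against fp32 weights take the mixed-type path
    # (IXF.mixed_type_linear in the overlay; approximated here by an upcast).
    if bias is None and x.dtype in (torch.half, torch.bfloat16) and weight.dtype == torch.float32:
        return F.linear(x.float(), weight).to(x.dtype)
    # Pure fp32 stays on the stock PyTorch path, exactly as in the overlay.
    if x.dtype == torch.float32:
        return F.linear(x, weight, bias)
    # Everything else goes to the IXF kernel (stand-in here: F.linear).
    return F.linear(x, weight, bias)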
@@ -191,7 +177,6 @@ def rocm_unquantized_gemm_impl(
         and on_gfx9()
         and x.dtype in [torch.float16, torch.bfloat16]
         and k % 8 == 0
-        and x.is_contiguous()
     )

     if use_skinny is not True:
@@ -302,3 +287,72 @@ def dispatch_unquantized_gemm() -> Callable[..., torch.Tensor]:
         return cpu_unquantized_gemm
     else:
         return default_unquantized_gemm
+
+def weight_quant_l1(input: torch.Tensor):
+    qmax = 127.0
+    input = input.to(device="cuda")
+    abs_max = torch.abs(input).max(dim=1, keepdim=True)[0]
+    scale = abs_max / qmax
+    assert scale.shape == (input.shape[0], 1)
+    quantized = torch.round(input / scale)
+    quantized = torch.clamp(quantized, -qmax, qmax)
+    return quantized.to(torch.int8), scale.to(torch.float32)
+
+def weight_quant_l2(input: torch.Tensor, format: str = "TN"):
+    qmax = 127.0
+    input = input.to(device="cuda")
+    abs_max = torch.abs(input).max(dim=1, keepdim=True)[0]  # [rows, 1]
+    scale = abs_max / qmax  # [rows, 1]
+    assert scale.shape == (input.shape[0], 1)
+    quantized = torch.round(input / scale)
+    quantized = torch.clamp(quantized, -qmax, qmax)
+
+    i4_weights, i8scales, i8zeros = IXF.quant_repack_int4(quantized.to(torch.int8).unsqueeze_(0), -1, 2, format, False)
+    return i4_weights.squeeze(0), scale.to(torch.float32)
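Reviewer note: weight_quant_l1 is plain per-row symmetric int8 quantization (one fp32 scale per row, qmax = 127), so dequantization is just quantized * scale. A self-contained sanity check in stock PyTorch, illustrative only (CPU, no device move, not part of the commit):

import torch

def quant_roundtrip(w: torch.Tensor) -> torch.Tensor:
    # Same scheme as weight_quant_l1: per-row max-abs scale, round, clamp.
    qmax = 127.0
    scale = w.abs().max(dim=1, keepdim=True)[0] / qmax
    q = torch.clamp(torch.round(w / scale), -qmax, qmax).to(torch.int8)
    return q.float() * scale  # dequantize

w = torch.randn(4, 16)
err = (quant_roundtrip(w) - w).abs().max()
print(f"max round-trip error: {err.item():.5f}")  # at most scale / 2 per row

weight_quant_l2 then hands the int8 tensor to IXF.quant_repack_int4 for int4 repacking; that call is ixformer-specific and its argument semantics are not documented in this diff.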
+
+
+def parse_opt_exclude_layers(
+    opt_exclude_layers_str: str,
+    prefix: str,
+) -> bool:
+    """
+    Parses the VLLM_OPT_EXCLUDE_LAYERS environment variable to determine if
+    the current layer should be excluded from optimization.
+
+    Args:
+        opt_exclude_layers_str: The string value from the
+            VLLM_OPT_EXCLUDE_LAYERS environment variable.
+        prefix: The prefix of the current layer (e.g.,
+            "model.layers.12.qkv_proj").
+
+    Returns:
+        A boolean indicating whether the layer should be excluded.
+    """
+    if not opt_exclude_layers_str:
+        return False
+
+    try:
+        # Safely evaluate the string to a Python object
+        excluded_layers = ast.literal_eval(opt_exclude_layers_str)
+
+        # If a single integer is provided, convert it to a set
+        if isinstance(excluded_layers, int):
+            excluded_layers = {excluded_layers}
+        elif not isinstance(excluded_layers, (set, tuple, list)):
+            raise TypeError
+
+        excluded_layers: set[int] = set(excluded_layers)
+
+        # Extract layer number from the prefix string
+        layer_match = re.search(r"\.(\d+)", prefix)
+        if layer_match and int(layer_match.group(1)) in excluded_layers:
+            return True  # Exclude this layer
+    except (ValueError, SyntaxError, TypeError):
+        logger.warning(
+            "Failed to parse VLLM_OPT_EXCLUDE_LAYERS: %s. "
+            "Expected a string representation of an integer or a "
+            "tuple/list/set of integers.",
+            opt_exclude_layers_str,
+        )
+
+    return False  # Do not exclude this layer
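Expected behavior of the new helper, for review (illustrative calls derived from the code above; the layer index is taken from the first ".<digits>" group in the prefix):

parse_opt_exclude_layers("(3, 7)", "model.layers.7.qkv_proj")    # True: layer 7 is excluded
parse_opt_exclude_layers("12", "model.layers.12.mlp.gate_proj")  # True: a bare int is accepted
parse_opt_exclude_layers("[0, 1]", "model.layers.5.o_proj")      # False: layer 5 not listed
parse_opt_exclude_layers("", "model.layers.3.qkv_proj")          # False: env var unset/empty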