Upgrade to vllm 0.17.0 corex v4.1 overlay

This commit is contained in:
2026-04-29 19:38:22 +08:00
parent 8fac6062e4
commit 938d0854a5
430 changed files with 35969 additions and 14511 deletions

View File

@@ -3,6 +3,8 @@
"""Utility methods for model layers."""
from collections.abc import Callable
import ast
import re
import torch
@@ -13,6 +15,7 @@ from vllm.logger import init_logger
from vllm.platforms import CpuArchEnum, current_platform
from vllm.utils.platform_utils import num_compute_units
from vllm.utils.torch_utils import direct_register_custom_op
import ixformer.inference.functions as IXF
logger = init_logger(__name__)
@@ -31,27 +34,6 @@ def is_layer_moe_router_gate(prefix: str) -> bool:
return prefix.rsplit(".", 1)[-1] in MOE_LAYER_ROUTER_GATE_SUFFIXES
def shuffle_weight(w: torch.Tensor) -> torch.Tensor:
    """Interleave the two halves of the last dimension of ``w``.

    Element ``i`` of the first half is placed next to element ``i`` of the
    second half, so the folded weights end up in adjacent locations, e.g.::

        [[1, 2, 3, 4, 5, 6],          [[1, 4, 2, 5, 3, 6],
         [7, 8, 9, 10, 11, 12]]  ->    [7, 10, 8, 11, 9, 12]]

    This layout is consumed by the triton swiglu kernel.
    """
    original_shape = w.shape
    half = original_shape[-1] // 2
    # View the last dim as (2, half), swap those two axes so matching
    # positions of the halves become neighbours, then flatten back.
    return w.unflatten(-1, (2, half)).transpose(-1, -2).reshape(original_shape)
def get_token_bin_counts_and_mask(
tokens: torch.Tensor,
vocab_size: int,
@@ -116,7 +98,11 @@ def default_unquantized_gemm(
weight: torch.Tensor,
bias: torch.Tensor | None = None,
):
return torch.nn.functional.linear(x, weight, bias)
if bias is None and x.dtype in [torch.half, torch.bfloat16] and weight.dtype == torch.float32:
return IXF.mixed_type_linear(input=x, weight=layer.weight)
if x.dtype == torch.float32:
return torch.nn.functional.linear(x, weight, bias)
return IXF.linear(x, weight, bias)
def use_aiter_triton_gemm(n, m, k, dtype):
@@ -191,7 +177,6 @@ def rocm_unquantized_gemm_impl(
and on_gfx9()
and x.dtype in [torch.float16, torch.bfloat16]
and k % 8 == 0
and x.is_contiguous()
)
if use_skinny is not True:
@@ -302,3 +287,72 @@ def dispatch_unquantized_gemm() -> Callable[..., torch.Tensor]:
return cpu_unquantized_gemm
else:
return default_unquantized_gemm
def weight_quant_l1(input: torch.Tensor):
    """Per-row symmetric int8 quantization of a 2-D weight matrix.

    Args:
        input: 2-D floating-point weight tensor; each row is quantized
            independently against its own absolute maximum.

    Returns:
        Tuple of (int8 quantized weights, per-row float32 scales with
        shape ``(rows, 1)``).
    """
    qmax = 127.0
    # Quantized weights are consumed by CUDA kernels downstream.
    input = input.to(device="cuda")
    abs_max = torch.abs(input).max(dim=1, keepdim=True)[0]  # (rows, 1)
    # Guard against all-zero rows: abs_max == 0 would turn the division
    # below into NaNs that survive the int8 cast. Compute the scale in
    # float32 so the epsilon does not underflow for half-precision inputs.
    scale = (abs_max.to(torch.float32) / qmax).clamp_min(1e-12)
    assert scale.shape == (input.shape[0], 1)
    quantized = torch.round(input / scale)
    quantized = torch.clamp(quantized, -qmax, qmax)
    return quantized.to(torch.int8), scale
def weight_quant_l2(input: torch.Tensor, format: str = "TN"):
    """Per-row symmetric quantization, repacked to int4 via IXF.

    Rows are first quantized to int8 range [-127, 127] against their own
    absolute maximum, then handed to ``IXF.quant_repack_int4`` for int4
    repacking in the requested ``format`` layout.

    Returns:
        Tuple of (repacked int4 weights, per-row float32 scales of shape
        ``(rows, 1)``).
    """
    q_limit = 127.0
    input = input.to(device="cuda")
    row_abs_max = input.abs().max(dim=1, keepdim=True)[0]  # (rows, 1)
    scale = row_abs_max / q_limit  # (rows, 1)
    assert scale.shape == (input.shape[0], 1)
    q = torch.clamp(torch.round(input / scale), -q_limit, q_limit)
    # NOTE(review): the int8 scales/zeros produced by quant_repack_int4 are
    # discarded here and the row-wise scale is returned instead — presumably
    # intentional, but worth confirming against the kernel's expectations.
    packed, _i8scales, _i8zeros = IXF.quant_repack_int4(
        q.to(torch.int8).unsqueeze_(0), -1, 2, format, False
    )
    return packed.squeeze(0), scale.to(torch.float32)
def parse_opt_exclude_layers(
    opt_exclude_layers_str: str,
    prefix: str,
) -> bool:
    """
    Parses the VLLM_OPT_EXCLUDE_LAYERS environment variable to determine if
    the current layer should be excluded from optimization.
    Args:
        opt_exclude_layers_str: The string value from the
            VLLM_OPT_EXCLUDE_LAYERS environment variable.
        prefix: The prefix of the current layer (e.g.,
            "model.layers.12.qkv_proj").
    Returns:
        A boolean indicating whether the layer should be excluded.
    """
    if not opt_exclude_layers_str:
        return False
    try:
        # literal_eval safely handles "3", "(1, 2)", "[1, 2]", "{1, 2}".
        parsed = ast.literal_eval(opt_exclude_layers_str)
        if isinstance(parsed, int):
            layer_ids = {parsed}
        elif isinstance(parsed, (set, tuple, list)):
            layer_ids = set(parsed)
        else:
            # Funnel unsupported shapes into the shared warning below.
            raise TypeError
        # The first ".<digits>" in the prefix is the layer index.
        index_match = re.search(r"\.(\d+)", prefix)
        if index_match is not None and int(index_match.group(1)) in layer_ids:
            return True  # Exclude this layer
    except (ValueError, SyntaxError, TypeError):
        logger.warning(
            "Failed to parse VLLM_OPT_EXCLUDE_LAYERS: %s. "
            "Expected a string representation of an integer or a "
            "tuple/list/set of integers.",
            opt_exclude_layers_str,
        )
    return False  # Do not exclude this layer