Upgrade to vllm 0.17.0 corex v4.1 overlay
@@ -3,6 +3,8 @@
 """Utility methods for model layers."""

 from collections.abc import Callable
+import ast
+import re

 import torch

@@ -13,6 +15,7 @@ from vllm.logger import init_logger
 from vllm.platforms import CpuArchEnum, current_platform
 from vllm.utils.platform_utils import num_compute_units
 from vllm.utils.torch_utils import direct_register_custom_op
+import ixformer.inference.functions as IXF

 logger = init_logger(__name__)

@@ -31,27 +34,6 @@ def is_layer_moe_router_gate(prefix: str) -> bool:
     return prefix.rsplit(".", 1)[-1] in MOE_LAYER_ROUTER_GATE_SUFFIXES


-def shuffle_weight(w: torch.Tensor) -> torch.Tensor:
-    # Shuffle weight along the last dimension so that
-    # we folded the weights to adjance location
-    # Example:
-    # input:
-    # [[1, 2, 3, 4, 5, 6],
-    #  [7, 8, 9, 10, 11, 12]]
-    # output:
-    # [[1, 4, 2, 5, 3, 6],
-    #  [7, 10, 8, 11, 9, 12]]
-    # This will be used together with triton swiglu kernel
-    shape = w.shape
-    N = shape[-1]
-    first = w[..., : N // 2]
-    second = w[..., N // 2 :]
-
-    stacked = torch.stack((first, second), dim=-1)
-    w_shuffled = stacked.reshape(shape)
-    return w_shuffled
-
-
 def get_token_bin_counts_and_mask(
     tokens: torch.Tensor,
     vocab_size: int,
@@ -116,7 +98,11 @@ def default_unquantized_gemm(
     weight: torch.Tensor,
     bias: torch.Tensor | None = None,
 ):
-    return torch.nn.functional.linear(x, weight, bias)
+    if bias is None and x.dtype in [torch.half, torch.bfloat16] and weight.dtype == torch.float32:
+        return IXF.mixed_type_linear(input=x, weight=layer.weight)
+    if x.dtype == torch.float32:
+        return torch.nn.functional.linear(x, weight, bias)
+    return IXF.linear(x, weight, bias)


 def use_aiter_triton_gemm(n, m, k, dtype):
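Reviewer note: a minimal pure-PyTorch sketch of the dtype routing the new default_unquantized_gemm body implements. The IXF stand-ins are assumptions for illustration only; ixformer's actual kernels and numerics are not shown in this diff.

import torch
import torch.nn.functional as F

def reference_unquantized_gemm(
    x: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor | None = None,
) -> torch.Tensor:
    # Half/bf16 activations against fp32 weights take the mixed-type path
    # (IXF.mixed_type_linear in the overlay; approximated here by an upcast).
    if bias is None and x.dtype in (torch.half, torch.bfloat16) and weight.dtype == torch.float32:
        return F.linear(x.float(), weight).to(x.dtype)
    # Pure fp32 stays on the stock PyTorch path, exactly as in the overlay.
    if x.dtype == torch.float32:
        return F.linear(x, weight, bias)
    # Everything else goes to the IXF kernel (stand-in here: F.linear).
    return F.linear(x, weight, bias)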
@@ -191,7 +177,6 @@ def rocm_unquantized_gemm_impl(
         and on_gfx9()
         and x.dtype in [torch.float16, torch.bfloat16]
         and k % 8 == 0
-        and x.is_contiguous()
     )

     if use_skinny is not True:
@@ -302,3 +287,72 @@ def dispatch_unquantized_gemm() -> Callable[..., torch.Tensor]:
         return cpu_unquantized_gemm
     else:
         return default_unquantized_gemm
+
+def weight_quant_l1(input: torch.Tensor):
+    qmax = 127.0
+    input = input.to(device="cuda")
+    abs_max = torch.abs(input).max(dim=1, keepdim=True)[0]
+    scale = abs_max / qmax
+    assert scale.shape == (input.shape[0], 1)
+    quantized = torch.round(input / scale)
+    quantized = torch.clamp(quantized, -qmax, qmax)
+    return quantized.to(torch.int8), scale.to(torch.float32)
+
+def weight_quant_l2(input: torch.Tensor, format: str = "TN"):
+    qmax = 127.0
+    input = input.to(device="cuda")
+    abs_max = torch.abs(input).max(dim=1, keepdim=True)[0]  # [rows, 1]
+    scale = abs_max / qmax  # [rows, 1]
+    assert scale.shape == (input.shape[0], 1)
+    quantized = torch.round(input / scale)
+    quantized = torch.clamp(quantized, -qmax, qmax)
+
+    i4_weights, i8scales, i8zeros = IXF.quant_repack_int4(quantized.to(torch.int8).unsqueeze_(0), -1, 2, format, False)
+    return i4_weights.squeeze(0), scale.to(torch.float32)
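Reviewer note: weight_quant_l1 is plain per-row symmetric int8 quantization (one fp32 scale per row, qmax = 127), so dequantization is just quantized * scale. A self-contained sanity check in stock PyTorch, illustrative only (CPU, no device move, not part of the commit):

import torch

def quant_roundtrip(w: torch.Tensor) -> torch.Tensor:
    # Same scheme as weight_quant_l1: per-row max-abs scale, round, clamp.
    qmax = 127.0
    scale = w.abs().max(dim=1, keepdim=True)[0] / qmax
    q = torch.clamp(torch.round(w / scale), -qmax, qmax).to(torch.int8)
    return q.float() * scale  # dequantize

w = torch.randn(4, 16)
err = (quant_roundtrip(w) - w).abs().max()
print(f"max round-trip error: {err.item():.5f}")  # at most scale / 2 per row

weight_quant_l2 then hands the int8 tensor to IXF.quant_repack_int4 for int4 repacking; that call is ixformer-specific and its argument semantics are not documented in this diff.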
+
+
+def parse_opt_exclude_layers(
+    opt_exclude_layers_str: str,
+    prefix: str,
+) -> bool:
+    """
+    Parses the VLLM_OPT_EXCLUDE_LAYERS environment variable to determine if
+    the current layer should be excluded from optimization.
+
+    Args:
+        opt_exclude_layers_str: The string value from the
+            VLLM_OPT_EXCLUDE_LAYERS environment variable.
+        prefix: The prefix of the current layer (e.g.,
+            "model.layers.12.qkv_proj").
+
+    Returns:
+        A boolean indicating whether the layer should be excluded.
+    """
+    if not opt_exclude_layers_str:
+        return False
+
+    try:
+        # Safely evaluate the string to a Python object
+        excluded_layers = ast.literal_eval(opt_exclude_layers_str)
+
+        # If a single integer is provided, convert it to a set
+        if isinstance(excluded_layers, int):
+            excluded_layers = {excluded_layers}
+        elif not isinstance(excluded_layers, (set, tuple, list)):
+            raise TypeError
+
+        excluded_layers: set[int] = set(excluded_layers)
+
+        # Extract layer number from the prefix string
+        layer_match = re.search(r"\.(\d+)", prefix)
+        if layer_match and int(layer_match.group(1)) in excluded_layers:
+            return True  # Exclude this layer
+    except (ValueError, SyntaxError, TypeError):
+        logger.warning(
+            "Failed to parse VLLM_OPT_EXCLUDE_LAYERS: %s. "
+            "Expected a string representation of an integer or a "
+            "tuple/list/set of integers.",
+            opt_exclude_layers_str,
+        )
+
+    return False  # Do not exclude this layer
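Expected behavior of the new helper, for review (illustrative calls derived from the code above; the layer index is taken from the first ".<digits>" group in the prefix):

parse_opt_exclude_layers("(3, 7)", "model.layers.7.qkv_proj")    # True: layer 7 is excluded
parse_opt_exclude_layers("12", "model.layers.12.mlp.gate_proj")  # True: a bare int is accepted
parse_opt_exclude_layers("[0, 1]", "model.layers.5.o_proj")      # False: layer 5 not listed
parse_opt_exclude_layers("", "model.layers.3.qkv_proj")          # False: env var unset/empty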