# xc-llm-kunlun/vllm_kunlun/ops/quantization/awq.py

#
# Copyright (c) 2025 Baidu, Inc. All Rights Reserved.
# Author: Li Wei, Pan Xiakai, You Zeyu, Tang Shiwen
# Email: liwei157@baidu.com
# This file is a part of the vllm-kunlun project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional, Union

import torch

from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe.layer import FusedMoE
from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod
from vllm.model_executor.layers.quantization.awq import (
    AWQLinearMethod,
    AWQConfig,
    is_layer_skipped_awq,
)
from vllm.model_executor.layers.quantization.moe_wna16 import MoeWNA16Config

logger = init_logger(__name__)


class KunlunAWQLinearMethod(AWQLinearMethod):
    """AWQ linear method adapted to the Kunlun XPU kernels.

    After weight loading, int32-packed ``qweight`` / ``qzeros`` tensors are
    permuted from the AWQ nibble order into the order expected by the Kunlun
    kernels. The forward pass then dispatches either to
    ``torch.ops._C.awq_dequantize`` followed by a dense matmul, or to
    ``torch.ops._C.awq_gemm``, depending on the number of tokens.

    Attributes:
        align_type: 1 ("fast" layout) when the packed width K is a multiple
            of 8, otherwise 0 ("normal" layout). It is forwarded to both
            Kunlun kernels in ``apply``.
    """

    # Gather indices applied to the 8 nibbles of a single int32 word.
    # AWQ stores logical elements in nibble order [0, 2, 4, 6, 1, 3, 5, 7];
    # after this gather the nibbles (LSB first) hold [1, 0, 3, 2, 5, 4, 7, 6],
    # i.e. MSB-first order [6, 7, 4, 5, 2, 3, 0, 1].
    _AWQ_TO_KUNLUN_ORDER_NORMAL = (4, 0, 5, 1, 6, 2, 7, 3)

    # Gather indices applied to the 64 nibbles of 8 consecutive int32 words
    # (flattened word-major), used when K is a multiple of 8.
    _AWQ_TO_KUNLUN_ORDER_FAST = (
        32, 0, 36, 4, 33, 1, 37, 5,
        34, 2, 38, 6, 35, 3, 39, 7,
        40, 8, 44, 12, 41, 9, 45, 13,
        42, 10, 46, 14, 43, 11, 47, 15,
        48, 16, 52, 20, 49, 17, 53, 21,
        50, 18, 54, 22, 51, 19, 55, 23,
        56, 24, 60, 28, 57, 25, 61, 29,
        58, 26, 62, 30, 59, 27, 63, 31,
    )

    # Token count at or above which `apply` dequantizes the weight and runs a
    # regular matmul instead of calling the fused AWQ GEMM kernel.
    _FP16_MATMUL_MIN_TOKENS = 256

    @staticmethod
    def _align_type_for_width(packed_width: int) -> int:
        """Return 1 (fast layout) if the packed width is a multiple of 8, else 0."""
        return 1 if packed_width % 8 == 0 else 0

    def repack_int4_for_kunlun(
        self, packed: torch.Tensor, num_bits: int = 4
    ) -> torch.Tensor:
        """Convert AWQ-packed int4 weights to Kunlun XPU format.

        Input:  packed[N, K], dtype=int32, saved as AWQ order
        Output: packed_reordered[N, K], dtype=int32, saved as Kunlun order

        Side effect: sets ``self.align_type`` from K (1 if ``K % 8 == 0``,
        else 0); that value selects which permutation is applied.

        Raises:
            ValueError: if ``num_bits`` is not 4.
        """
        # Validate before touching any state; an explicit raise (unlike
        # `assert`) is still enforced when Python runs with -O.
        if num_bits != 4:
            raise ValueError("Only int4 supported now")

        N, K = packed.shape
        self.align_type = self._align_type_for_width(K)
        # Bit offsets of the 8 nibbles inside one int32 word: 0, 4, ..., 28.
        shifts = torch.arange(0, 32, num_bits, device=packed.device, dtype=torch.int32)

        if self.align_type == 1:  # FAST MODE: permute across groups of 8 words
            groups = packed.reshape(N, K // 8, 8)
            # [N, K//8, 8, 8] -> flatten each group to its 64 nibbles.
            nibbles = ((groups.unsqueeze(-1) >> shifts) & 0xF).reshape(N, K // 8, 64)
            nibbles = nibbles[..., list(self._AWQ_TO_KUNLUN_ORDER_FAST)]
            # Back to 8 words x 8 nibbles, then re-pack every word.
            nibbles = nibbles.reshape(N, K // 8, 8, 8)
            return (nibbles << shifts).sum(dim=-1, dtype=torch.int32).reshape(N, K)

        # NORMAL MODE: permute the 8 nibbles inside every word independently.
        nibbles = (packed.unsqueeze(-1) >> shifts) & 0xF  # [N, K, 8]
        nibbles = nibbles[..., list(self._AWQ_TO_KUNLUN_ORDER_NORMAL)]
        return (nibbles << shifts).sum(dim=-1, dtype=torch.int32)  # [N, K]

    def _repack_if_int32(self, tensor: torch.Tensor) -> torch.Tensor:
        """Repack ``tensor`` when it is int32; return it untouched otherwise."""
        if tensor.dtype == torch.int32:
            return self.repack_int4_for_kunlun(tensor)
        return tensor

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Repack ``layer.qweight`` / ``layer.qzeros`` and freeze all parameters.

        ``qweight``, ``qzeros`` and ``scales`` are re-registered on ``layer``
        as ``torch.nn.Parameter`` objects with ``requires_grad=False``.
        """
        logger.warning_once("Repacking INT4 for XPU ...")

        # Seed align_type from qweight's packed width so `apply` always finds
        # the attribute, even when neither tensor is int32 and repacking is
        # skipped. For int32 tensors the repack calls below overwrite it
        # exactly as before.
        # NOTE(review): whether non-int32 tensors are already stored in the
        # Kunlun layout is not visible from this file — confirm.
        self.align_type = self._align_type_for_width(layer.qweight.data.shape[-1])

        layer.qweight = torch.nn.Parameter(
            self._repack_if_int32(layer.qweight.data), requires_grad=False
        )
        layer.qzeros = torch.nn.Parameter(
            self._repack_if_int32(layer.qzeros.data), requires_grad=False
        )
        layer.scales = torch.nn.Parameter(layer.scales.data, requires_grad=False)

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Compute the quantized linear projection of ``x``.

        Args:
            layer: module carrying ``qweight``, ``scales`` and ``qzeros``.
            x: input activations; all leading dimensions are treated as tokens.
            bias: optional bias, added in place to the kernel output.

        Returns:
            Tensor shaped ``x.shape[:-1] + (qweight.shape[-1] * pack_factor,)``.
        """
        qweight = layer.qweight
        scales = layer.scales
        qzeros = layer.qzeros
        pack_factor = self.quant_config.pack_factor
        out_shape = x.shape[:-1] + (qweight.shape[-1] * pack_factor,)
        reshaped_x = x.reshape(-1, x.shape[-1])

        # num_tokens >= threshold: dequantize once and use a dense matmul.
        use_fp16_matmul = x.shape[:-1].numel() >= self._FP16_MATMUL_MIN_TOKENS

        if use_fp16_matmul:
            dense_weight = torch.ops._C.awq_dequantize(
                qweight, scales, qzeros, quant_type=0, align_type=self.align_type
            )
            out = torch.matmul(reshaped_x, dense_weight)
        else:
            out = torch.ops._C.awq_gemm(
                reshaped_x, qweight, scales, qzeros, align_type=self.align_type
            )
        if bias is not None:
            out.add_(bias)
        return out.reshape(out_shape)


class KunlunAWQConfig(AWQConfig):
    """AWQ config that hands out Kunlun-specific quantization methods."""

    def get_quant_method(
        self, layer: torch.nn.Module, prefix: str
    ) -> Optional[Union["LinearMethodBase", "QuantizeMethodBase"]]: # type: ignore
        """Pick the quantization method for ``layer``.

        * ``LinearBase`` layers get ``UnquantizedLinearMethod`` when
          ``is_layer_skipped_awq`` reports the prefix as excluded via
          ``modules_to_not_convert``, otherwise ``KunlunAWQLinearMethod``.
        * ``FusedMoE`` layers are delegated to a ``MoeWNA16Config`` built from
          this config's bit width, group size and zero-point setting.
        * Any other layer type yields ``None``.
        """
        if isinstance(layer, LinearBase):
            skip_quant = is_layer_skipped_awq(prefix, self.modules_to_not_convert)
            if skip_quant:
                return UnquantizedLinearMethod()
            return KunlunAWQLinearMethod(self)

        if not isinstance(layer, FusedMoE):
            return None

        # MoE layers: warn once, then reuse the generic WNA16 MoE kernels.
        logger.warning_once(
            f"Layer '{prefix}' is not supported by AWQMoeMarlin. "
            "Falling back to Moe WNA16 kernels."
        )
        wna16_settings = {
            "quant_method": "awq",
            "bits": self.weight_bits,
            "group_size": self.group_size,
            "zero_point": self.zero_point,
            "lm_head": False,
        }
        moe_config = MoeWNA16Config.from_config(wna16_settings)
        return moe_config.get_quant_method(layer, prefix)


# Monkey patch: rebind the upstream vLLM AWQ classes to the Kunlun variants
# defined above, so code that looks them up on the `awq` module after this
# point receives the Kunlun implementations.
from vllm.model_executor.layers.quantization import awq

# The run of spaces before "-->" is part of the emitted message and is kept
# byte-for-byte.
awq.AWQLinearMethod = KunlunAWQLinearMethod
print(
    "[Monkey Patch Applied] >>> "
    "vllm.model_executor.layers.quantization.awq.AWQLinearMethod "
    "      --> vllm_kunlun.ops.quantization.awq.KunlunAWQLinearMethod"
)

awq.AWQConfig = KunlunAWQConfig
print(
    "[Monkey Patch Applied] >>> "
    "vllm.model_executor.layers.quantization.awq.AWQConfig "
    "      --> vllm_kunlun.ops.quantization.awq.KunlunAWQConfig"
)