[Feature] Support AWQ MoE W4A16 Quantization (#142)

Signed-off-by: tangshiwen <tangshiwen@baidu.com> Co-authored-by: Li Wei <liwei.109@outlook.com>
2026-01-26 18:56:05 +08:00
parent 2a998286c0
commit 0711c1abfa
7 changed files with 639 additions and 126 deletions
--- a/vllm_kunlun/ops/quantization/kernels/quant_ops.py
+++ b/vllm_kunlun/ops/quantization/kernels/quant_ops.py
@@ -0,0 +1,68 @@
+#
+# Copyright (c) 2025 Baidu, Inc. All Rights Reserved.
+# Author: Tang Shiwen
+# Email: tangshiwen@baidu.com
+# This file is a part of the vllm-kunlun project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+
+def dequant_int4(
+    qweight: torch.Tensor,
+    scale: torch.Tensor,
+    zp: torch.Tensor,
+    int4_signed: bool = False,
+    use_mode_fast: bool = False,
+) -> torch.Tensor:
+
+    fpweight = torch.empty(
+        (
+            qweight.shape[0],
+            qweight.shape[2],
+            scale.shape[1],
+        ),
+        dtype=scale.dtype,
+        device=qweight.device,
+    )
+
+    qweight_t = qweight.transpose(1, 2).contiguous()
+    qscale_t = scale.transpose(1, 2).contiguous() * 15.0
+
+    zp_t = zp.transpose(1, 2).contiguous()
+    zp_unpack = torch.stack((zp_t & 0xF, (zp_t >> 4) & 0xF), dim=-1)
+    zp_fp = (
+        zp_unpack.reshape(
+            zp_unpack.shape[0],
+            zp_unpack.shape[1],
+            zp_unpack.shape[2] * zp_unpack.shape[3],
+        )
+        .contiguous()
+        .to(scale.dtype)
+        - 8.0
+    )
+
+    group_m = qweight_t.shape[-2] // qscale_t.shape[-2]
+
+    torch.ops._C.dequant_int4(
+        x=qweight_t,
+        scale=qscale_t,
+        zero=zp_fp,
+        y=fpweight,
+        group_m=group_m,
+        int4_signed=int4_signed,
+        use_mode_fast=use_mode_fast,
+    )
+
+    return fpweight.transpose(1, 2).contiguous()