[Feature] support compressed-tensors w4a16 quantization (#154)

- Native int4 Kimi model inference is supported

Signed-off-by: Li Wei <liwei.109@outlook.com>
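
For context, "w4a16" means 4-bit integer weights with 16-bit floating-point activations: kernels keep weights packed as int4 with per-group scales and dequantize them on the fly before an fp16 GEMM. Below is a minimal reference sketch of that group-wise dequantization; the function name, layout, and group size are illustrative only (the actual kunlun_scale_mm / kunlun_exllama_linear kernels fuse these steps on device):

    # Reference sketch of w4a16 group dequantization (names and
    # group_size are illustrative, not the kernel's actual interface).
    import torch

    def w4a16_matmul(x: torch.Tensor, qweight: torch.Tensor,
                     scales: torch.Tensor, zeros: torch.Tensor,
                     group_size: int = 128) -> torch.Tensor:
        """x: [M, K] fp16 activations; qweight: [K, N] int4 levels (0..15);
        scales/zeros: [K // group_size, N] fp16, one pair per weight group."""
        K = qweight.shape[0]
        group = torch.arange(K, device=qweight.device) // group_size
        # Dequantize: each row of qweight uses its group's scale/zero-point.
        w = (qweight.to(torch.float16) - zeros[group]) * scales[group]
        return x @ w  # fp16 GEMM against the dequantized weights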
Author: Li Wei
Date: 2026-01-27 19:56:22 +08:00 (committed by GitHub)
Parent: 0711c1abfa
Commit: 71bd70ad6c
9 changed files with 369 additions and 28 deletions
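
A checkpoint quantized this way would typically be served through the standard vLLM entry points. A usage sketch follows; the model path is a placeholder, and vLLM normally auto-detects the compressed-tensors scheme from the checkpoint's config, so the explicit quantization argument is usually optional:

    # Sketch: running a compressed-tensors w4a16 checkpoint (placeholder path).
    from vllm import LLM, SamplingParams

    llm = LLM(
        model="/models/kimi-w4a16",         # hypothetical local checkpoint
        quantization="compressed-tensors",  # usually auto-detected from config
    )
    outputs = llm.generate(["Hello, world"], SamplingParams(max_tokens=32))
    print(outputs[0].outputs[0].text)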

@@ -15,13 +15,20 @@
 # This file is a part of the vllm-ascend project.
 #
 # embedding
 import vllm_kunlun.ops.rotary_embedding
 import vllm_kunlun.ops.layernorm
 import vllm_kunlun.ops.vocab_parallel_embedding
 # quantization
 import vllm_kunlun.ops.quantization.awq
 import vllm_kunlun.ops.quantization.gptq
 import vllm_kunlun.ops.quantization.moe_wna16
-import vllm_kunlun.ops.vocab_parallel_embedding
+import vllm_kunlun.ops.quantization.compressed_tensors.compressed_tensors
+import vllm_kunlun.ops.quantization.compressed_tensors.compressed_tensors_moe
+import vllm_kunlun.ops.quantization.kernels.kunlun_scale_mm
+import vllm_kunlun.ops.quantization.kernels.kunlun_exllama_linear
+# base layers
+import vllm_kunlun.ops.layernorm
+import vllm_kunlun.ops.linear
+import vllm_kunlun.ops.fused_moe.layer
 import vllm_kunlun.ops.quantization.compressed_tensors.compressed_tensors_moe
 import vllm_kunlun.ops.quantization.kernels.kunlun_scale_mm
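
The imports above appear to work by side effect: loading each module registers or patches the corresponding Kunlun implementation into vLLM, so the plugin entry point only needs these import statements. A generic sketch of that registration-by-import pattern, with hypothetical names (the actual vllm_kunlun registration hooks are not shown in this diff):

    # Generic registration-by-import sketch; all names are hypothetical.
    _QUANT_REGISTRY: dict[str, type] = {}

    def register_quant_method(name: str):
        def decorator(cls: type) -> type:
            _QUANT_REGISTRY[name] = cls  # runs at import time, as a side effect
            return cls
        return decorator

    @register_quant_method("compressed-tensors")
    class KunlunCompressedTensorsLinear:
        """Placeholder for a device-specific linear method."""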