[Feature] support compressed-tensors w4a16 quantization (#154)
- Native int4 Kimi model inference is now supported.

Signed-off-by: Li Wei <liwei.109@outlook.com>
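For context, "w4a16" means the weights are stored as 4-bit integers while activations stay in 16-bit floats. Below is a minimal sketch of such a scheme, assuming symmetric per-group quantization with an illustrative group size of 128; the function name and packed layout are assumptions for illustration, not the exact compressed-tensors format:

```python
import torch

def quantize_w4a16(w: torch.Tensor, group_size: int = 128):
    """Illustrative symmetric per-group int4 weight quantization.

    Returns packed uint8 weights (two int4 values per byte) and fp16
    per-group scales; activations are untouched (the "a16" part).
    """
    out_f, in_f = w.shape
    wg = w.reshape(out_f, in_f // group_size, group_size)
    # One scale per group, chosen so the group maps into [-7, 7].
    scales = wg.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) / 7.0
    q = torch.clamp(torch.round(wg / scales), -8, 7).to(torch.int8)
    q = q.reshape(out_f, in_f)
    lo = (q[:, 0::2] & 0xF).to(torch.uint8)   # even columns -> low nibble
    hi = (q[:, 1::2] & 0xF).to(torch.uint8)   # odd columns  -> high nibble
    return lo | (hi << 4), scales.squeeze(-1).to(torch.float16)
```

This halves storage relative to int8 and quarters it relative to fp16, at the cost of a dequantization step in the matmul path.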
@@ -2275,7 +2275,7 @@ fwd_kvcache_mla.register_fake(_fake_fwd_kvcache_mla)
 
 
 ##################################################
-# --------------- dequant_int4 -----------------
+# --------------- dequant_int4 -------------------
 ##################################################
 @custom_op("_C::dequant_int4", mutates_args=())
 def dequant_int4(
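For readers without the full file, here is a hedged eager-mode sketch of what an op like `dequant_int4` computes. The real `_C::dequant_int4` is a compiled kernel whose signature is not visible in this hunk, so the parameter names and the group-wise fp16 scale layout below are assumptions:

```python
import torch

def dequant_int4_ref(packed: torch.Tensor,   # uint8, two int4 values per byte
                     scales: torch.Tensor,   # fp16, shape (rows, cols // group_size)
                     group_size: int = 128) -> torch.Tensor:
    # Split each byte into its low and high nibble.
    lo = (packed & 0xF).to(torch.int8)
    hi = (packed >> 4).to(torch.int8)
    # Sign-extend the 4-bit values from [0, 15] back to [-8, 7].
    lo = torch.where(lo > 7, lo - 16, lo)
    hi = torch.where(hi > 7, hi - 16, hi)
    # Re-interleave the nibbles to restore the original column order.
    q = torch.stack((lo, hi), dim=-1).flatten(-2)
    rows, cols = q.shape
    qg = q.reshape(rows, cols // group_size, group_size).to(torch.float16)
    # Apply the per-group scale to recover approximate fp16 weights.
    return (qg * scales.unsqueeze(-1)).reshape(rows, cols)
```

A fake (meta) implementation registered via `register_fake`, like the `fwd_kvcache_mla` registration visible in the hunk header, would only need to return an empty fp16 tensor of the unpacked output shape.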