[Feature] support compressed-tensors w4a16 quantization (#154)
- Native int4 Kimi model inference is supported.

Signed-off-by: Li Wei <liwei.109@outlook.com>
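For context, a minimal usage sketch (not part of this commit) of loading a w4a16 compressed-tensors checkpoint through vLLM's offline API. The model path is a hypothetical placeholder; vLLM normally infers the quantization scheme from the checkpoint's config, so the explicit quantization argument is optional.

    # Hedged sketch: "/path/to/kimi-w4a16-int4" is a placeholder, not a
    # real checkpoint name.
    from vllm import LLM, SamplingParams

    llm = LLM(
        model="/path/to/kimi-w4a16-int4",   # hypothetical local checkpoint
        quantization="compressed-tensors",  # int4 weights, 16-bit activations
    )
    out = llm.generate(["Hello"], SamplingParams(max_tokens=32))
    print(out[0].outputs[0].text)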
@@ -99,12 +99,5 @@ class KunlunScaledMMLinearKernel(CutlassScaledMMLinearKernel):
         # )

-        # monkey patch
-        _POSSIBLE_KERNELS[PlatformEnum.CUDA] = [KunlunScaledMMLinearKernel]
-        from vllm.model_executor.layers.quantization.kernels.scaled_mm import cutlass
-
-        cutlass.CutlassScaledMMLinearKernel = KunlunScaledMMLinearKernel
-        print(
-            "[Monkey Patch Applied] >>> vllm.model_executor.layers.quantization.kernels.scaled_mm.cutlass.CutlassScaledMMLinearKernel \
-            --> vllm_kunlun.ops.quantization.kernels.kunlun_scale_mm.KunlunScaledMMLinearKernel"
-        )
+        # replace CutlassScaledMMLinearKernel with KunlunScaledMMLinearKernel
+        _POSSIBLE_KERNELS[PlatformEnum.CUDA] = [KunlunScaledMMLinearKernel]
 
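The hunk drops the verbose module-attribute patch and its print banner, keeping only the registry override, presumably because vLLM selects scaled-mm kernels from the _POSSIBLE_KERNELS mapping. Below is a generic, runnable sketch of that substitution pattern; BaseKernel, FastKernel, and this _POSSIBLE_KERNELS dict are made-up stand-ins, not the actual vLLM classes.

    from enum import Enum


    class PlatformEnum(Enum):
        CUDA = "cuda"


    class BaseKernel:
        """Stand-in for CutlassScaledMMLinearKernel."""

        def apply(self, x: float) -> float:
            return x


    class FastKernel(BaseKernel):
        """Stand-in for KunlunScaledMMLinearKernel."""

        def apply(self, x: float) -> float:
            return x  # a platform-specific scaled-mm would go here


    # The framework consults this mapping when picking a kernel class,
    # so overwriting the CUDA entry reroutes selection to FastKernel
    # without touching any other import site.
    _POSSIBLE_KERNELS = {PlatformEnum.CUDA: [BaseKernel]}
    _POSSIBLE_KERNELS[PlatformEnum.CUDA] = [FastKernel]

    kernel = _POSSIBLE_KERNELS[PlatformEnum.CUDA][0]()
    assert isinstance(kernel, FastKernel)

Overriding the registry entry is also more robust than rebinding cutlass.CutlassScaledMMLinearKernel on the module: callers that imported the class directly via "from ... import CutlassScaledMMLinearKernel" keep their original binding and would never see a module-level patch.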