[fix] update compressed-tensors scheme

DeepSeek v3.2 is now supported.

Signed-off-by: Li Wei <liwei.109@outlook.com>
2026-01-06 22:30:27 +08:00
parent 9c2b908908
commit 1c1b84d78c
4 changed files with 4 additions and 82 deletions
--- a/vllm_kunlun/ops/__init__.py
+++ b/vllm_kunlun/ops/__init__.py
@@ -21,8 +21,6 @@ import vllm_kunlun.ops.quantization.awq
 import vllm_kunlun.ops.quantization.gptq
 import vllm_kunlun.ops.vocab_parallel_embedding
 import vllm_kunlun.ops.linear
-# import vllm_kunlun.ops.quantization.kernels.scaled_mm.cutlass
 import vllm_kunlun.ops.fused_moe.layer
-import vllm_kunlun.ops.quantization.compressed_tensors.compressed_tensors
 import vllm_kunlun.ops.quantization.compressed_tensors.compressed_tensors_moe
-import vllm_kunlun.ops.quantization.kernels.scaled_mm.kunlun
+import vllm_kunlun.ops.quantization.kernels.kunlun_scale_mm
--- a/vllm_kunlun/ops/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm_kunlun/ops/quantization/compressed_tensors/compressed_tensors.py
@@ -1,75 +0,0 @@
-#
-# Copyright (c) 2025 Baidu, Inc. All Rights Reserved.
-# Author: Tang Shiwen, Li Wei
-# Email: tangshiwen@baidu.com, liwei157@baidu.com
-# This file is a part of the vllm-kunlun project.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Optional
-
-import torch
-from vllm.model_executor.layers.linear import (
-    LinearBase,
-    LinearMethodBase,
-    UnquantizedLinearMethod,
-)
-from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
-    CompressedTensorsConfig,
-    CompressedTensorsLinearMethod,
-    CompressedTensorsMoEMethod,
-    CompressedTensorsKVCacheMethod,
-    CompressedTensorsLinearTransformMethod,
-    get_linear_transform_schemes,
-)
-from vllm.model_executor.layers.quantization.base_config import QuantizeMethodBase
-from vllm_kunlun.ops.fused_moe.layer import FusedMoE
-
-
-def get_quant_method(
-    self,
-    layer: torch.nn.Module,
-    prefix: str,
-) -> Optional["QuantizeMethodBase"]:
-    from vllm_kunlun.ops.attention.layer import Attention  # Avoid circular import
-
-    if isinstance(layer, LinearBase):
-        # collect schemes
-        quant_scheme = self.get_scheme(layer=layer, layer_name=prefix)
-        input_tfms, output_tfms = get_linear_transform_schemes(
-            layer, prefix, self.transform_config, self.packed_modules_mapping
-        )
-
-        # choose quantization method
-        quant_method: LinearMethodBase = UnquantizedLinearMethod()
-        if quant_scheme is not None:
-            layer.scheme = quant_scheme
-            quant_method = CompressedTensorsLinearMethod(self)
-
-        # choose transform method
-        if any((input_tfms, output_tfms)):
-            return CompressedTensorsLinearTransformMethod.from_schemes(
-                quant_method, quant_scheme, input_tfms, output_tfms
-            )
-
-        else:
-            return quant_method
-
-    if isinstance(layer, Attention):
-        return CompressedTensorsKVCacheMethod(self)
-    if isinstance(layer, FusedMoE):
-        return CompressedTensorsMoEMethod.get_moe_method(self, layer)
-    return None
-
-
-CompressedTensorsConfig.get_quant_method = get_quant_method
--- a/vllm_kunlun/ops/quantization/kernels/scaled_mm/kunlun.py
+++ b/vllm_kunlun/ops/quantization/kernels/scaled_mm/kunlun.py
@@ -20,17 +20,16 @@ from typing import Optional

 import torch
 import xspeedgate_ops
+from vllm.platforms import current_platform, PlatformEnum
 from vllm.model_executor.layers.quantization.utils import replace_parameter
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
    convert_to_channelwise,
 )
-from vllm.platforms import current_platform
-from vllm.model_executor.layers.quantization.kernels.scaled_mm import (  # noqa: E501
+from vllm.model_executor.layers.quantization.kernels.scaled_mm import (
+    _POSSIBLE_KERNELS,
    ScaledMMLinearLayerConfig,
    CutlassScaledMMLinearKernel,
 )
-from vllm.platforms import PlatformEnum
-from vllm.model_executor.layers.quantization.kernels.scaled_mm import _POSSIBLE_KERNELS


 class KunlunScaledMMLinearKernel(CutlassScaledMMLinearKernel):
--- a/vllm_kunlun/ops/quantization/kernels/scaled_mm/__init__.py
+++ b/vllm_kunlun/ops/quantization/kernels/scaled_mm/__init__.py