Merge pull request #85 from liwei109/liwei-dev

[fix] update compressed-tensors scheme
2026-01-07 09:27:23 +08:00
parent 9c2b908908 1c1b84d78c
commit 62a97db6ed
4 changed files with 4 additions and 82 deletions
--- a/vllm_kunlun/ops/__init__.py
+++ b/vllm_kunlun/ops/__init__.py
@@ -21,8 +21,6 @@ import vllm_kunlun.ops.quantization.awq
 import vllm_kunlun.ops.quantization.gptq
 import vllm_kunlun.ops.vocab_parallel_embedding
 import vllm_kunlun.ops.linear
 # import vllm_kunlun.ops.quantization.kernels.scaled_mm.cutlass
 import vllm_kunlun.ops.fused_moe.layer
 import vllm_kunlun.ops.quantization.compressed_tensors.compressed_tensors
 import vllm_kunlun.ops.quantization.compressed_tensors.compressed_tensors_moe
-import vllm_kunlun.ops.quantization.kernels.scaled_mm.kunlun
+import vllm_kunlun.ops.quantization.kernels.kunlun_scale_mm
--- a/vllm_kunlun/ops/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm_kunlun/ops/quantization/compressed_tensors/compressed_tensors.py
@@ -1,75 +0,0 @@
 #
 # Copyright (c) 2025 Baidu, Inc. All Rights Reserved.
 # Author: Tang Shiwen, Li Wei
 # Email: tangshiwen@baidu.com, liwei157@baidu.com
 # This file is a part of the vllm-kunlun project.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from typing import Optional
 import torch
 from vllm.model_executor.layers.linear import (
    LinearBase,
    LinearMethodBase,
    UnquantizedLinearMethod,
 )
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
    CompressedTensorsConfig,
    CompressedTensorsLinearMethod,
    CompressedTensorsMoEMethod,
    CompressedTensorsKVCacheMethod,
    CompressedTensorsLinearTransformMethod,
    get_linear_transform_schemes,
 )
 from vllm.model_executor.layers.quantization.base_config import QuantizeMethodBase
 from vllm_kunlun.ops.fused_moe.layer import FusedMoE
 def get_quant_method(
    self,
    layer: torch.nn.Module,
    prefix: str,
 ) -> Optional["QuantizeMethodBase"]:
    """Select the quantization method object to use for ``layer``.

    Args:
        self: Config object this function is attached to; it supplies
            ``get_scheme``, ``transform_config`` and
            ``packed_modules_mapping`` and is handed to the method
            constructors.
        layer: Module for which a quantization method is requested.
        prefix: Layer name, forwarded to the scheme and transform lookups.

    Returns:
        For a ``LinearBase`` layer: a linear method, wrapped in a
        transform method when any input/output transform scheme exists.
        For an ``Attention`` layer: a ``CompressedTensorsKVCacheMethod``.
        For a ``FusedMoE`` layer: the result of
        ``CompressedTensorsMoEMethod.get_moe_method``.
        Otherwise ``None``.
    """
    # Deferred to call time so that importing this module does not
    # trigger a circular import.
    from vllm_kunlun.ops.attention.layer import Attention
    if not isinstance(layer, LinearBase):
        # Non-linear layers: attention is tested before fused MoE.
        if isinstance(layer, Attention):
            return CompressedTensorsKVCacheMethod(self)
        if isinstance(layer, FusedMoE):
            return CompressedTensorsMoEMethod.get_moe_method(self, layer)
        return None
    # Linear layer: gather the quantization scheme and transform schemes.
    scheme = self.get_scheme(layer=layer, layer_name=prefix)
    input_tfms, output_tfms = get_linear_transform_schemes(
        layer, prefix, self.transform_config, self.packed_modules_mapping
    )
    # Start from the unquantized method; switch to the compressed-tensors
    # method only when a scheme was found, recording it on the layer.
    base_method: LinearMethodBase = UnquantizedLinearMethod()
    if scheme is not None:
        layer.scheme = scheme
        base_method = CompressedTensorsLinearMethod(self)
    # Without any transform scheme the base method is returned as-is.
    has_transforms = any((input_tfms, output_tfms))
    if not has_transforms:
        return base_method
    return CompressedTensorsLinearTransformMethod.from_schemes(
        base_method, scheme, input_tfms, output_tfms
    )
 # Monkey-patch: executed when this module is imported, this rebinds
 # CompressedTensorsConfig.get_quant_method to the function defined above.
 CompressedTensorsConfig.get_quant_method = get_quant_method
--- a/vllm_kunlun/ops/quantization/kernels/scaled_mm/kunlun.py
+++ b/vllm_kunlun/ops/quantization/kernels/scaled_mm/kunlun.py
@@ -20,17 +20,16 @@ from typing import Optional
 import torch
 import xspeedgate_ops
 from vllm.platforms import current_platform, PlatformEnum
 from vllm.model_executor.layers.quantization.utils import replace_parameter
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
    convert_to_channelwise,
 )
-from vllm.platforms import current_platform
+from vllm.model_executor.layers.quantization.kernels.scaled_mm import (
-from vllm.model_executor.layers.quantization.kernels.scaled_mm import (  # noqa: E501
+    _POSSIBLE_KERNELS,
    ScaledMMLinearLayerConfig,
    CutlassScaledMMLinearKernel,
 )
 from vllm.platforms import PlatformEnum
 from vllm.model_executor.layers.quantization.kernels.scaled_mm import _POSSIBLE_KERNELS
 class KunlunScaledMMLinearKernel(CutlassScaledMMLinearKernel):
--- a/vllm_kunlun/ops/quantization/kernels/scaled_mm/__init__.py
+++ b/vllm_kunlun/ops/quantization/kernels/scaled_mm/__init__.py