diff --git a/vllm_kunlun/ops/__init__.py b/vllm_kunlun/ops/__init__.py index 0bae89f..fa5f0cc 100644 --- a/vllm_kunlun/ops/__init__.py +++ b/vllm_kunlun/ops/__init__.py @@ -21,8 +21,6 @@ import vllm_kunlun.ops.quantization.awq import vllm_kunlun.ops.quantization.gptq import vllm_kunlun.ops.vocab_parallel_embedding import vllm_kunlun.ops.linear -# import vllm_kunlun.ops.quantization.kernels.scaled_mm.cutlass import vllm_kunlun.ops.fused_moe.layer -import vllm_kunlun.ops.quantization.compressed_tensors.compressed_tensors import vllm_kunlun.ops.quantization.compressed_tensors.compressed_tensors_moe -import vllm_kunlun.ops.quantization.kernels.scaled_mm.kunlun \ No newline at end of file +import vllm_kunlun.ops.quantization.kernels.kunlun_scale_mm \ No newline at end of file diff --git a/vllm_kunlun/ops/quantization/compressed_tensors/compressed_tensors.py b/vllm_kunlun/ops/quantization/compressed_tensors/compressed_tensors.py deleted file mode 100644 index 23b9ba1..0000000 --- a/vllm_kunlun/ops/quantization/compressed_tensors/compressed_tensors.py +++ /dev/null @@ -1,75 +0,0 @@ -# -# Copyright (c) 2025 Baidu, Inc. All Rights Reserved. -# Author: Tang Shiwen, Li Wei -# Email: tangshiwen@baidu.com, liwei157@baidu.com -# This file is a part of the vllm-kunlun project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional - -import torch -from vllm.model_executor.layers.linear import ( - LinearBase, - LinearMethodBase, - UnquantizedLinearMethod, -) -from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 - CompressedTensorsConfig, - CompressedTensorsLinearMethod, - CompressedTensorsMoEMethod, - CompressedTensorsKVCacheMethod, - CompressedTensorsLinearTransformMethod, - get_linear_transform_schemes, -) -from vllm.model_executor.layers.quantization.base_config import QuantizeMethodBase -from vllm_kunlun.ops.fused_moe.layer import FusedMoE - - -def get_quant_method( - self, - layer: torch.nn.Module, - prefix: str, -) -> Optional["QuantizeMethodBase"]: - from vllm_kunlun.ops.attention.layer import Attention # Avoid circular import - - if isinstance(layer, LinearBase): - # collect schemes - quant_scheme = self.get_scheme(layer=layer, layer_name=prefix) - input_tfms, output_tfms = get_linear_transform_schemes( - layer, prefix, self.transform_config, self.packed_modules_mapping - ) - - # choose quantization method - quant_method: LinearMethodBase = UnquantizedLinearMethod() - if quant_scheme is not None: - layer.scheme = quant_scheme - quant_method = CompressedTensorsLinearMethod(self) - - # choose transform method - if any((input_tfms, output_tfms)): - return CompressedTensorsLinearTransformMethod.from_schemes( - quant_method, quant_scheme, input_tfms, output_tfms - ) - - else: - return quant_method - - if isinstance(layer, Attention): - return CompressedTensorsKVCacheMethod(self) - if isinstance(layer, FusedMoE): - return CompressedTensorsMoEMethod.get_moe_method(self, layer) - return None - - -CompressedTensorsConfig.get_quant_method = get_quant_method diff --git a/vllm_kunlun/ops/quantization/kernels/scaled_mm/kunlun.py b/vllm_kunlun/ops/quantization/kernels/kunlun_scale_mm.py similarity index 94% rename from vllm_kunlun/ops/quantization/kernels/scaled_mm/kunlun.py rename to vllm_kunlun/ops/quantization/kernels/kunlun_scale_mm.py index 24f29d7..34d8c94 100644 --- a/vllm_kunlun/ops/quantization/kernels/scaled_mm/kunlun.py +++ b/vllm_kunlun/ops/quantization/kernels/kunlun_scale_mm.py @@ -20,17 +20,16 @@ from typing import Optional import torch import xspeedgate_ops +from vllm.platforms import current_platform, PlatformEnum from vllm.model_executor.layers.quantization.utils import replace_parameter from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( convert_to_channelwise, ) -from vllm.platforms import current_platform -from vllm.model_executor.layers.quantization.kernels.scaled_mm import ( # noqa: E501 +from vllm.model_executor.layers.quantization.kernels.scaled_mm import ( + _POSSIBLE_KERNELS, ScaledMMLinearLayerConfig, CutlassScaledMMLinearKernel, ) -from vllm.platforms import PlatformEnum -from vllm.model_executor.layers.quantization.kernels.scaled_mm import _POSSIBLE_KERNELS class KunlunScaledMMLinearKernel(CutlassScaledMMLinearKernel): diff --git a/vllm_kunlun/ops/quantization/kernels/scaled_mm/__init__.py b/vllm_kunlun/ops/quantization/kernels/scaled_mm/__init__.py deleted file mode 100644 index e69de29..0000000