[fix] Update compressed-tensors scheme
DeepSeek v3.2 is now supported. Signed-off-by: Li Wei <liwei.109@outlook.com>
This commit is contained in:
@@ -21,8 +21,6 @@ import vllm_kunlun.ops.quantization.awq
|
||||
import vllm_kunlun.ops.quantization.gptq
|
||||
import vllm_kunlun.ops.vocab_parallel_embedding
|
||||
import vllm_kunlun.ops.linear
|
||||
# import vllm_kunlun.ops.quantization.kernels.scaled_mm.cutlass
|
||||
import vllm_kunlun.ops.fused_moe.layer
|
||||
import vllm_kunlun.ops.quantization.compressed_tensors.compressed_tensors
|
||||
import vllm_kunlun.ops.quantization.compressed_tensors.compressed_tensors_moe
|
||||
import vllm_kunlun.ops.quantization.kernels.scaled_mm.kunlun
|
||||
import vllm_kunlun.ops.quantization.kernels.kunlun_scale_mm
|
||||
@@ -1,75 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2025 Baidu, Inc. All Rights Reserved.
|
||||
# Author: Tang Shiwen, Li Wei
|
||||
# Email: tangshiwen@baidu.com, liwei157@baidu.com
|
||||
# This file is a part of the vllm-kunlun project.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from vllm.model_executor.layers.linear import (
|
||||
LinearBase,
|
||||
LinearMethodBase,
|
||||
UnquantizedLinearMethod,
|
||||
)
|
||||
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501
|
||||
CompressedTensorsConfig,
|
||||
CompressedTensorsLinearMethod,
|
||||
CompressedTensorsMoEMethod,
|
||||
CompressedTensorsKVCacheMethod,
|
||||
CompressedTensorsLinearTransformMethod,
|
||||
get_linear_transform_schemes,
|
||||
)
|
||||
from vllm.model_executor.layers.quantization.base_config import QuantizeMethodBase
|
||||
from vllm_kunlun.ops.fused_moe.layer import FusedMoE
|
||||
|
||||
|
||||
def get_quant_method(
    self,
    layer: torch.nn.Module,
    prefix: str,
) -> Optional["QuantizeMethodBase"]:
    """Pick the quantization method object for ``layer``.

    Dispatch is by layer type, checked in this order:

    * ``LinearBase``: the scheme is looked up with ``self.get_scheme``. When
      a scheme exists it is stored on ``layer.scheme`` and a
      ``CompressedTensorsLinearMethod`` is used; otherwise an
      ``UnquantizedLinearMethod`` is used. If any input/output transform
      schemes are found, the chosen method is wrapped through
      ``CompressedTensorsLinearTransformMethod.from_schemes``.
    * ``Attention``: a ``CompressedTensorsKVCacheMethod``.
    * ``FusedMoE``: whatever ``CompressedTensorsMoEMethod.get_moe_method``
      returns.

    Args:
        layer: The module to select a quantization method for.
        prefix: Name passed as ``layer_name`` to the scheme lookup and to
            the transform-scheme lookup.

    Returns:
        The selected method object, or ``None`` when ``layer`` is none of
        the handled types.
    """
    # Imported here rather than at module level to avoid a circular import.
    from vllm_kunlun.ops.attention.layer import Attention

    # Handle the non-linear layer kinds first so the linear path reads flat.
    if not isinstance(layer, LinearBase):
        if isinstance(layer, Attention):
            return CompressedTensorsKVCacheMethod(self)
        if isinstance(layer, FusedMoE):
            return CompressedTensorsMoEMethod.get_moe_method(self, layer)
        return None

    # Gather the quantization scheme and the transform schemes for the layer.
    scheme = self.get_scheme(layer=layer, layer_name=prefix)
    in_tfms, out_tfms = get_linear_transform_schemes(
        layer, prefix, self.transform_config, self.packed_modules_mapping
    )

    # Default to the unquantized method; upgrade when a scheme is present.
    linear_method: LinearMethodBase = UnquantizedLinearMethod()
    if scheme is not None:
        layer.scheme = scheme
        linear_method = CompressedTensorsLinearMethod(self)

    # Without transforms, the plain linear method is the answer.
    if not any((in_tfms, out_tfms)):
        return linear_method

    return CompressedTensorsLinearTransformMethod.from_schemes(
        linear_method, scheme, in_tfms, out_tfms
    )
|
||||
|
||||
|
||||
# Monkey-patch: rebind CompressedTensorsConfig.get_quant_method to the
# module-level get_quant_method function defined earlier in this file, so the
# patch takes effect as a side effect of importing this module.
CompressedTensorsConfig.get_quant_method = get_quant_method
|
||||
@@ -20,17 +20,16 @@ from typing import Optional
|
||||
|
||||
import torch
|
||||
import xspeedgate_ops
|
||||
from vllm.platforms import current_platform, PlatformEnum
|
||||
from vllm.model_executor.layers.quantization.utils import replace_parameter
|
||||
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
|
||||
convert_to_channelwise,
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.model_executor.layers.quantization.kernels.scaled_mm import ( # noqa: E501
|
||||
from vllm.model_executor.layers.quantization.kernels.scaled_mm import (
|
||||
_POSSIBLE_KERNELS,
|
||||
ScaledMMLinearLayerConfig,
|
||||
CutlassScaledMMLinearKernel,
|
||||
)
|
||||
from vllm.platforms import PlatformEnum
|
||||
from vllm.model_executor.layers.quantization.kernels.scaled_mm import _POSSIBLE_KERNELS
|
||||
|
||||
|
||||
class KunlunScaledMMLinearKernel(CutlassScaledMMLinearKernel):
|
||||
Reference in New Issue
Block a user