[fix] update compressed-tensors scheme

DeepSeek V3.2 is now supported.

Signed-off-by: Li Wei <liwei.109@outlook.com>
Li Wei
2026-01-06 22:30:27 +08:00
parent 9c2b908908
commit 1c1b84d78c
4 changed files with 4 additions and 82 deletions

View File

@@ -21,8 +21,6 @@ import vllm_kunlun.ops.quantization.awq
import vllm_kunlun.ops.quantization.gptq
import vllm_kunlun.ops.vocab_parallel_embedding
import vllm_kunlun.ops.linear
# import vllm_kunlun.ops.quantization.kernels.scaled_mm.cutlass
import vllm_kunlun.ops.fused_moe.layer
import vllm_kunlun.ops.quantization.compressed_tensors.compressed_tensors
import vllm_kunlun.ops.quantization.compressed_tensors.compressed_tensors_moe
import vllm_kunlun.ops.quantization.kernels.scaled_mm.kunlun
import vllm_kunlun.ops.quantization.kernels.kunlun_scale_mm
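Note: vllm_kunlun wires itself into vLLM purely through import side effects: each module imported above rebinds attributes on the corresponding vLLM class when the plugin loads. A minimal sketch of that pattern, with hypothetical names (my_plugin and _patched_apply are illustrative, not part of vllm-kunlun) and assuming UnquantizedLinearMethod.apply keeps its usual (layer, x, bias) signature:

# my_plugin/patched_linear.py -- the patch applies when this module is imported
from vllm.model_executor.layers.linear import UnquantizedLinearMethod

_orig_apply = UnquantizedLinearMethod.apply

def _patched_apply(self, layer, x, bias=None):
    # Route to a device-specific matmul here; this sketch just
    # delegates to the original implementation.
    return _orig_apply(self, layer, x, bias)

# Rebinding the method patches all existing and future instances.
UnquantizedLinearMethod.apply = _patched_apply

Listing all such modules in one registration file, as above, keeps the patching deterministic: everything is applied exactly once, at plugin import time.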

View File

@@ -1,75 +0,0 @@
#
# Copyright (c) 2025 Baidu, Inc. All Rights Reserved.
# Author: Tang Shiwen, Li Wei
# Email: tangshiwen@baidu.com, liwei157@baidu.com
# This file is a part of the vllm-kunlun project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional

import torch
from vllm.model_executor.layers.linear import (
    LinearBase,
    LinearMethodBase,
    UnquantizedLinearMethod,
)
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
    CompressedTensorsConfig,
    CompressedTensorsLinearMethod,
    CompressedTensorsMoEMethod,
    CompressedTensorsKVCacheMethod,
    CompressedTensorsLinearTransformMethod,
    get_linear_transform_schemes,
)
from vllm.model_executor.layers.quantization.base_config import QuantizeMethodBase
from vllm_kunlun.ops.fused_moe.layer import FusedMoE


def get_quant_method(
    self,
    layer: torch.nn.Module,
    prefix: str,
) -> Optional["QuantizeMethodBase"]:
    from vllm_kunlun.ops.attention.layer import Attention  # Avoid circular import

    if isinstance(layer, LinearBase):
        # collect schemes
        quant_scheme = self.get_scheme(layer=layer, layer_name=prefix)
        input_tfms, output_tfms = get_linear_transform_schemes(
            layer, prefix, self.transform_config, self.packed_modules_mapping
        )
        # choose quantization method
        quant_method: LinearMethodBase = UnquantizedLinearMethod()
        if quant_scheme is not None:
            layer.scheme = quant_scheme
            quant_method = CompressedTensorsLinearMethod(self)
        # choose transform method
        if any((input_tfms, output_tfms)):
            return CompressedTensorsLinearTransformMethod.from_schemes(
                quant_method, quant_scheme, input_tfms, output_tfms
            )
        else:
            return quant_method
    if isinstance(layer, Attention):
        return CompressedTensorsKVCacheMethod(self)
    if isinstance(layer, FusedMoE):
        return CompressedTensorsMoEMethod.get_moe_method(self, layer)
    return None


CompressedTensorsConfig.get_quant_method = get_quant_method
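Note: the deleted module above replaced CompressedTensorsConfig.get_quant_method wholesale so that the plugin's FusedMoE and Attention classes would be recognized; with DeepSeek V3.2 handled upstream, the local copy is no longer needed. Had only the MoE branch differed, a narrower wrapper could have avoided duplicating the whole method. A hedged sketch of that alternative (illustrative only, not what this commit does):

from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
    CompressedTensorsConfig,
    CompressedTensorsMoEMethod,
)
from vllm_kunlun.ops.fused_moe.layer import FusedMoE

_orig_get_quant_method = CompressedTensorsConfig.get_quant_method

def get_quant_method(self, layer, prefix):
    # Special-case only the plugin's FusedMoE; defer everything else
    # to the upstream implementation.
    if isinstance(layer, FusedMoE):
        return CompressedTensorsMoEMethod.get_moe_method(self, layer)
    return _orig_get_quant_method(self, layer, prefix)

CompressedTensorsConfig.get_quant_method = get_quant_method

Wrapping rather than replacing keeps the override resilient to upstream changes in the branches it does not touch.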

View File

@@ -20,17 +20,16 @@ from typing import Optional
import torch
import xspeedgate_ops
from vllm.platforms import current_platform, PlatformEnum
from vllm.model_executor.layers.quantization.utils import replace_parameter
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
    convert_to_channelwise,
)
from vllm.platforms import current_platform
from vllm.model_executor.layers.quantization.kernels.scaled_mm import (  # noqa: E501
from vllm.model_executor.layers.quantization.kernels.scaled_mm import (
    _POSSIBLE_KERNELS,
    ScaledMMLinearLayerConfig,
    CutlassScaledMMLinearKernel,
)
from vllm.platforms import PlatformEnum
from vllm.model_executor.layers.quantization.kernels.scaled_mm import _POSSIBLE_KERNELS
class KunlunScaledMMLinearKernel(CutlassScaledMMLinearKernel):
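Note: KunlunScaledMMLinearKernel reuses the Cutlass kernel's parameter handling by subclassing it. For vLLM to ever select it, the class presumably also has to appear in _POSSIBLE_KERNELS. A hedged sketch of such a registration, assuming _POSSIBLE_KERNELS maps a PlatformEnum entry to an ordered list of candidate kernel classes (suggested by the paired imports above, not confirmed by this diff) and treating PlatformEnum.OOT as an assumed key for out-of-tree platforms:

from vllm.platforms import PlatformEnum
from vllm.model_executor.layers.quantization.kernels.scaled_mm import (
    _POSSIBLE_KERNELS,
)

# Prepend so the Kunlun kernel is preferred when the platform matches.
_POSSIBLE_KERNELS.setdefault(PlatformEnum.OOT, []).insert(
    0, KunlunScaledMMLinearKernel
)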