Merge pull request #85 from liwei109/liwei-dev
[fix] update compressed-tensors scheme
This commit is contained in:
@@ -21,8 +21,6 @@ import vllm_kunlun.ops.quantization.awq
|
|||||||
import vllm_kunlun.ops.quantization.gptq
|
import vllm_kunlun.ops.quantization.gptq
|
||||||
import vllm_kunlun.ops.vocab_parallel_embedding
|
import vllm_kunlun.ops.vocab_parallel_embedding
|
||||||
import vllm_kunlun.ops.linear
|
import vllm_kunlun.ops.linear
|
||||||
# import vllm_kunlun.ops.quantization.kernels.scaled_mm.cutlass
|
|
||||||
import vllm_kunlun.ops.fused_moe.layer
|
import vllm_kunlun.ops.fused_moe.layer
|
||||||
import vllm_kunlun.ops.quantization.compressed_tensors.compressed_tensors
|
|
||||||
import vllm_kunlun.ops.quantization.compressed_tensors.compressed_tensors_moe
|
import vllm_kunlun.ops.quantization.compressed_tensors.compressed_tensors_moe
|
||||||
import vllm_kunlun.ops.quantization.kernels.scaled_mm.kunlun
|
import vllm_kunlun.ops.quantization.kernels.kunlun_scale_mm
|
||||||
@@ -1,75 +0,0 @@
|
|||||||
#
|
|
||||||
# Copyright (c) 2025 Baidu, Inc. All Rights Reserved.
|
|
||||||
# Author: Tang Shiwen, Li Wei
|
|
||||||
# Email: tangshiwen@baidu.com, liwei157@baidu.com
|
|
||||||
# This file is a part of the vllm-kunlun project.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
import torch
|
|
||||||
from vllm.model_executor.layers.linear import (
|
|
||||||
LinearBase,
|
|
||||||
LinearMethodBase,
|
|
||||||
UnquantizedLinearMethod,
|
|
||||||
)
|
|
||||||
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501
|
|
||||||
CompressedTensorsConfig,
|
|
||||||
CompressedTensorsLinearMethod,
|
|
||||||
CompressedTensorsMoEMethod,
|
|
||||||
CompressedTensorsKVCacheMethod,
|
|
||||||
CompressedTensorsLinearTransformMethod,
|
|
||||||
get_linear_transform_schemes,
|
|
||||||
)
|
|
||||||
from vllm.model_executor.layers.quantization.base_config import QuantizeMethodBase
|
|
||||||
from vllm_kunlun.ops.fused_moe.layer import FusedMoE
|
|
||||||
|
|
||||||
|
|
||||||
def get_quant_method(
|
|
||||||
self,
|
|
||||||
layer: torch.nn.Module,
|
|
||||||
prefix: str,
|
|
||||||
) -> Optional["QuantizeMethodBase"]:
|
|
||||||
from vllm_kunlun.ops.attention.layer import Attention # Avoid circular import
|
|
||||||
|
|
||||||
if isinstance(layer, LinearBase):
|
|
||||||
# collect schemes
|
|
||||||
quant_scheme = self.get_scheme(layer=layer, layer_name=prefix)
|
|
||||||
input_tfms, output_tfms = get_linear_transform_schemes(
|
|
||||||
layer, prefix, self.transform_config, self.packed_modules_mapping
|
|
||||||
)
|
|
||||||
|
|
||||||
# choose quantization method
|
|
||||||
quant_method: LinearMethodBase = UnquantizedLinearMethod()
|
|
||||||
if quant_scheme is not None:
|
|
||||||
layer.scheme = quant_scheme
|
|
||||||
quant_method = CompressedTensorsLinearMethod(self)
|
|
||||||
|
|
||||||
# choose transform method
|
|
||||||
if any((input_tfms, output_tfms)):
|
|
||||||
return CompressedTensorsLinearTransformMethod.from_schemes(
|
|
||||||
quant_method, quant_scheme, input_tfms, output_tfms
|
|
||||||
)
|
|
||||||
|
|
||||||
else:
|
|
||||||
return quant_method
|
|
||||||
|
|
||||||
if isinstance(layer, Attention):
|
|
||||||
return CompressedTensorsKVCacheMethod(self)
|
|
||||||
if isinstance(layer, FusedMoE):
|
|
||||||
return CompressedTensorsMoEMethod.get_moe_method(self, layer)
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
CompressedTensorsConfig.get_quant_method = get_quant_method
|
|
||||||
@@ -20,17 +20,16 @@ from typing import Optional
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
import xspeedgate_ops
|
import xspeedgate_ops
|
||||||
|
from vllm.platforms import current_platform, PlatformEnum
|
||||||
from vllm.model_executor.layers.quantization.utils import replace_parameter
|
from vllm.model_executor.layers.quantization.utils import replace_parameter
|
||||||
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
|
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
|
||||||
convert_to_channelwise,
|
convert_to_channelwise,
|
||||||
)
|
)
|
||||||
from vllm.platforms import current_platform
|
from vllm.model_executor.layers.quantization.kernels.scaled_mm import (
|
||||||
from vllm.model_executor.layers.quantization.kernels.scaled_mm import ( # noqa: E501
|
_POSSIBLE_KERNELS,
|
||||||
ScaledMMLinearLayerConfig,
|
ScaledMMLinearLayerConfig,
|
||||||
CutlassScaledMMLinearKernel,
|
CutlassScaledMMLinearKernel,
|
||||||
)
|
)
|
||||||
from vllm.platforms import PlatformEnum
|
|
||||||
from vllm.model_executor.layers.quantization.kernels.scaled_mm import _POSSIBLE_KERNELS
|
|
||||||
|
|
||||||
|
|
||||||
class KunlunScaledMMLinearKernel(CutlassScaledMMLinearKernel):
|
class KunlunScaledMMLinearKernel(CutlassScaledMMLinearKernel):
|
||||||
Reference in New Issue
Block a user