Co-authored-by: kunpengW-code <1289706727@qq.com>
Co-authored-by: linsheng1 <1950916997@qq.com>
### What this PR does / why we need it?
Currently, chunked prefill is forcibly enabled. DeepSeek V3.1 W8A8C8
supports only the PD separation scenario. C8 refers to quantizing the KV
cache to int8, which aims to reduce the GPU memory usage of the KV cache
and improve the inference throughput.
Constraints:
1. Only the PD (prefill/decode) separation mode is supported, and the
MooncakeLayerwiseConnector must be used to run the model.
2. Currently, only the activation value supports dynamic quantization,
and the KV cache supports static quantization. C8 quantization with MTP
is not supported. You can use ModelSlim for quantization. The
quantization procedure is as follows:
pip install transformers==4.48.2
git clone https://gitcode.com/Ascend/msmodelslim.git
cd msmodelslim
bash install.sh
cd example/DeepSeek/
python3 quant_deepseek_w8a8.py --model_path <path/weight> --save_path
<path/quant_weight>
--anti_dataset ../common/deepseek_anti_prompt_50_v3_1.json
--calib_dataset ../common/deepseek_calib_prompt_50_v3_1.json --rot
--trust_remote_code True --fa_quant --dynamic --anti_method m6
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
- vLLM version: v0.17.0
- vLLM main:
4034c3d32e
---------
Signed-off-by: pichangping <1337510399@qq.com>
Signed-off-by: Wang Kunpeng <1289706727@qq.com>
Co-authored-by: Wang Kunpeng <1289706727@qq.com>
83 lines
2.9 KiB
Python
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
|
"""Ascend quantization scheme implementations.

This module provides all quantization scheme implementations for Ascend NPU.
Schemes are automatically registered via the @register_scheme decorator.

Usage:
    from vllm_ascend.quantization.methods import get_scheme_class

    # Get a scheme class by quant_type and layer_type
    scheme_cls = get_scheme_class("W8A8_DYNAMIC", "linear")
    scheme = scheme_cls()
"""
|
|
|
|
from typing import Any
|
|
|
|
# Import base classes
|
|
from .base import AscendAttentionScheme, AscendLinearScheme, AscendMoEScheme, QuantType
|
|
|
|
# Import all scheme classes for external access
|
|
from .kv_c8 import AscendFAQuantAttentionMethod
|
|
|
|
# Import registry functions
|
|
from .registry import get_scheme_class, register_scheme
|
|
from .w4a4_flatquant import AscendW4A4FlatQuantDynamicLinearMethod
|
|
from .w4a4_laos_dynamic import AscendW4A4LaosDynamicLinearMethod
|
|
from .w4a8 import AscendW4A8DynamicFusedMoEMethod, AscendW4A8DynamicLinearMethod
|
|
from .w4a16 import AscendW4A16FusedMoEMethod
|
|
from .w8a8_dynamic import AscendW8A8DynamicFusedMoEMethod, AscendW8A8DynamicLinearMethod
|
|
from .w8a8_mxfp8 import AscendW8A8MXFP8DynamicLinearMethod
|
|
from .w8a8_pdmix import AscendW8A8PDMixFusedMoeMethod, AscendW8A8PDMixLinearMethod
|
|
from .w8a8_static import AscendW8A8LinearMethod
|
|
from .w8a16 import AscendW8A16LinearMethod
|
|
|
|
|
|
def is_mx_quant_type(instance: Any) -> bool:
    """Return True if *instance* is a microscaling (MX) quantization method.

    Currently the only MX-format scheme is the W8A8 MXFP8 dynamic linear
    method; extend the tuple below when new MX schemes are added.
    """
    mx_method_classes = (AscendW8A8MXFP8DynamicLinearMethod,)
    return isinstance(instance, mx_method_classes)
|
|
|
|
|
|
# Public API of this package: names exported by `from ... import *` and the
# canonical list of supported quantization scheme classes.
__all__ = [
    # Base classes
    "AscendAttentionScheme",
    "AscendLinearScheme",
    "AscendMoEScheme",
    "QuantType",
    # Registry functions
    "register_scheme",
    "get_scheme_class",
    # Utility functions
    "is_mx_quant_type",
    # Scheme classes
    "AscendW8A8LinearMethod",
    "AscendW8A8DynamicLinearMethod",
    "AscendW8A8DynamicFusedMoEMethod",
    "AscendW8A8MXFP8DynamicLinearMethod",
    "AscendW8A8PDMixLinearMethod",
    "AscendW8A8PDMixFusedMoeMethod",
    "AscendW8A16LinearMethod",
    "AscendW4A8DynamicLinearMethod",
    "AscendW4A8DynamicFusedMoEMethod",
    "AscendW4A16FusedMoEMethod",
    "AscendW4A4FlatQuantDynamicLinearMethod",
    "AscendW4A4LaosDynamicLinearMethod",
    "AscendFAQuantAttentionMethod",
]
|