Files
xc-llm-ascend/vllm_ascend/quantization/methods/__init__.py
pichangping 3f39ac9c8d [Feature]Supports DSv3.1 PD separation and C8 quantization (#7222)
Co-authored-by: kunpengW-code <1289706727@qq.com>
Co-authored-by: linsheng1 <1950916997@qq.com>

### What this PR does / why we need it?
Currently, chunked prefill is forcibly enabled. DeepSeek V3.1 W8A8C8
supports only the PD separation scenario. C8 refers to quantizing the KV
cache to int8, which aims to reduce the GPU memory usage of the KV cache
and improve the inference throughput.
Constraints: 
1. Only the PD separation mode is supported, and the
MooncakeLayerwiseConnector must be used to run the model.
2. Currently, only the activation value supports dynamic quantization,
and the KV cache supports static quantization. C8 quantization with MTP
is not supported. You can use ModelSlim for quantization. The
quantization procedure is as follows:
pip install transformers==4.48.2
git clone https://gitcode.com/Ascend/msmodelslim.git
cd msmodelslim
bash install.sh
cd example/DeepSeek/
python3 quant_deepseek_w8a8.py --model_path <path/weight> --save_path
<path/quant_weight>
--anti_dataset ../common/deepseek_anti_prompt_50_v3_1.json
--calib_dataset ../common/deepseek_calib_prompt_50_v3_1.json --rot
--trust_remote_code True --fa_quant --dynamic --anti_method m6

### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?

- vLLM version: v0.17.0
- vLLM main:
4034c3d32e

---------

Signed-off-by: pichangping <1337510399@qq.com>
Signed-off-by: Wang Kunpeng <1289706727@qq.com>
Co-authored-by: Wang Kunpeng <1289706727@qq.com>
2026-03-16 22:49:05 +08:00

83 lines
2.9 KiB
Python

#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Ascend quantization scheme implementations.
This module provides all quantization scheme implementations for Ascend NPU.
Schemes are automatically registered via the @register_scheme decorator.
Usage:
from vllm_ascend.quantization.methods import get_scheme_class
# Get a scheme class by quant_type and layer_type
scheme_cls = get_scheme_class("W8A8_DYNAMIC", "linear")
scheme = scheme_cls()
"""
from typing import Any
# Import base classes
from .base import AscendAttentionScheme, AscendLinearScheme, AscendMoEScheme, QuantType
# Import all scheme classes for external access
from .kv_c8 import AscendFAQuantAttentionMethod
# Import registry functions
from .registry import get_scheme_class, register_scheme
from .w4a4_flatquant import AscendW4A4FlatQuantDynamicLinearMethod
from .w4a4_laos_dynamic import AscendW4A4LaosDynamicLinearMethod
from .w4a8 import AscendW4A8DynamicFusedMoEMethod, AscendW4A8DynamicLinearMethod
from .w4a16 import AscendW4A16FusedMoEMethod
from .w8a8_dynamic import AscendW8A8DynamicFusedMoEMethod, AscendW8A8DynamicLinearMethod
from .w8a8_mxfp8 import AscendW8A8MXFP8DynamicLinearMethod
from .w8a8_pdmix import AscendW8A8PDMixFusedMoeMethod, AscendW8A8PDMixLinearMethod
from .w8a8_static import AscendW8A8LinearMethod
from .w8a16 import AscendW8A16LinearMethod
def is_mx_quant_type(instance: Any) -> bool:
    """Return True if *instance* is a microscaling (MX) quantization method.

    MXFP8 dynamic linear quantization is currently the only MX-format
    scheme available on Ascend; extend the check here if more MX schemes
    are added.
    """
    return isinstance(instance, AscendW8A8MXFP8DynamicLinearMethod)
# Explicit public API of the quantization methods package: controls what
# `from vllm_ascend.quantization.methods import *` exposes and documents
# which names are intentionally exported to callers.
__all__ = [
    # Base classes
    "AscendAttentionScheme",
    "AscendLinearScheme",
    "AscendMoEScheme",
    "QuantType",
    # Registry functions
    "register_scheme",
    "get_scheme_class",
    # Utility functions
    "is_mx_quant_type",
    # Scheme classes
    "AscendW8A8LinearMethod",
    "AscendW8A8DynamicLinearMethod",
    "AscendW8A8DynamicFusedMoEMethod",
    "AscendW8A8MXFP8DynamicLinearMethod",
    "AscendW8A8PDMixLinearMethod",
    "AscendW8A8PDMixFusedMoeMethod",
    "AscendW8A16LinearMethod",
    "AscendW4A8DynamicLinearMethod",
    "AscendW4A8DynamicFusedMoEMethod",
    "AscendW4A16FusedMoEMethod",
    "AscendW4A4FlatQuantDynamicLinearMethod",
    "AscendW4A4LaosDynamicLinearMethod",
    "AscendFAQuantAttentionMethod",
]