#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from types import MappingProxyType
from typing import Any, Callable, Dict, List, Mapping, Optional

import torch
from vllm.distributed import get_tensor_model_parallel_rank
from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
                                                  FusedMoeWeightScaleSupported)
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
                                               RowParallelLinear,
                                               UnquantizedLinearMethod)
from vllm.model_executor.layers.quantization import \
    register_quantization_config
from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig, QuantizeMethodBase)
from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
from vllm.model_executor.layers.vocab_parallel_embedding import (
    UnquantizedEmbeddingMethod, VocabParallelEmbedding)
from vllm.model_executor.parameter import PerTensorScaleParameter
from vllm.model_executor.utils import set_weight_attrs

from vllm_ascend.ops.fused_moe import AscendUnquantizedFusedMoEMethod
from vllm_ascend.utils import ASCEND_QUATIZATION_METHOD

from .quantizer import AscendQuantizer

@register_quantization_config(ASCEND_QUATIZATION_METHOD)
class AscendQuantConfig(QuantizationConfig):
    """Config class for Ascend quantization.

    This class is a general class that parses quantization configs
    that are supported on Ascend hardware.
    """

    def __init__(self, quant_config: Dict[str, Any]):
        self.quant_description = quant_config

    def __repr__(self) -> str:
        return "AscendQuantConfig:\n" + super().__repr__()

    @classmethod
    def get_name(cls) -> str:
        return ASCEND_QUATIZATION_METHOD

    @classmethod
    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
        return [torch.int8, torch.float16, torch.bfloat16]

    @classmethod
    def get_min_capability(cls) -> int:
        raise NotImplementedError(
            "Ascend hardware does not support \"get_min_capability\" feature.")

    @classmethod
    def get_config_filenames(cls) -> List[str]:
        return ["quant_model_description.json"]

    @classmethod
    def from_config(cls, config: Dict[str, Any]) -> "AscendQuantConfig":
        return cls(config)
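
    # Note: whenever an NPU is visible, `override_quantization_method` below
    # selects the Ascend quantizer regardless of the quantization method named
    # in the checkpoint's Hugging Face config or requested by the user.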
    @classmethod
    def override_quantization_method(cls, hf_quant_cfg,
                                     user_quant) -> Optional[str]:
        if torch.npu.is_available():
            return ASCEND_QUATIZATION_METHOD
        return None

    def get_quant_method(self, layer: torch.nn.Module,
                         prefix: str) -> Optional["QuantizeMethodBase"]:
        from vllm.attention.layer import Attention
        if isinstance(layer, LinearBase):
            if self.is_layer_skipped_ascend(prefix,
                                            self.packed_modules_mapping):
                return UnquantizedLinearMethod()
            return AscendLinearMethod(self, prefix,
                                      self.packed_modules_mapping)
        elif isinstance(layer, Attention) and \
                'fa_quant_type' in self.quant_description.keys() and \
                self.quant_description['fa_quant_type'] is not None:
            return AscendKVCacheMethod(self, prefix)
        elif isinstance(layer, Attention) and self.quant_description.get(
                'kv_quant_type') == 'C8':
            return AscendKVCacheMethod(self, prefix)
        elif isinstance(layer, FusedMoE):
            if self.is_layer_skipped_ascend(prefix,
                                            self.packed_modules_mapping):
                return AscendUnquantizedFusedMoEMethod(layer.moe)
            return AscendFusedMoEMethod(self, prefix,
                                        self.packed_modules_mapping)
        elif isinstance(layer, VocabParallelEmbedding):
            if self.is_layer_skipped_ascend(prefix,
                                            self.packed_modules_mapping):
                return UnquantizedEmbeddingMethod()
            return AscendEmbeddingMethod(self, prefix,
                                         self.packed_modules_mapping)
        return None
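
    # Illustration for the fused-mapping check below. With the usual vLLM
    # convention, e.g. {"qkv_proj": ["q_proj", "k_proj", "v_proj"]} (the layer
    # name here is hypothetical), a prefix such as
    # "model.layers.0.self_attn.qkv_proj" expands to the q/k/v shard prefixes;
    # the layer is skipped only if *all* shards are marked "FLOAT" in the
    # quant description, and mixed precision across shards raises a ValueError.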
    def is_layer_skipped_ascend(
        self,
        prefix: str,
        fused_mapping: Mapping[str, List[str]] = MappingProxyType({})):
        # adapted from vllm.model_executor.layers.quantization.utils.quant_utils.is_layer_skipped
        proj_name = prefix.split(".")[-1]
        if proj_name in fused_mapping:
            shard_prefixes = [
                prefix.replace(proj_name, shard_proj_name)
                for shard_proj_name in fused_mapping[proj_name]
            ]

            is_skipped = None
            for shard_prefix in shard_prefixes:
                is_shard_skipped = self.quant_description[shard_prefix +
                                                          '.weight'] == "FLOAT"

                if is_skipped is None:
                    is_skipped = is_shard_skipped
                elif is_shard_skipped != is_skipped:
                    raise ValueError(
                        f"Detected some but not all shards of {prefix} "
                        "are quantized. All shards of fused layers "
                        "must have the same precision.")
        else:
            is_skipped = self.quant_description[prefix + '.weight'] == "FLOAT"

        assert is_skipped is not None
        return is_skipped

    def get_scaled_act_names(self) -> List[str]:
        return []
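
# A minimal usage sketch (illustrative only; the description dict is a
# hypothetical stand-in for a real quant_model_description.json, and layers
# are normally wired up by vLLM's model loader rather than by hand):
#
#     config = AscendQuantConfig.from_config({
#         "model.layers.0.mlp.gate_proj.weight": "W8A8",
#     })
#     # vLLM then calls config.get_quant_method(layer, prefix) per layer.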


class AscendLinearMethod(LinearMethodBase):
    """Linear method for Ascend quantization.

    This class calls AscendQuantizer to search for a specific quantization
    implementation supported on Ascend hardware for linear methods.

    Args:
        quant_config: The Ascend quantization config.
    """

    def __init__(self, quant_config: AscendQuantConfig, prefix: str,
                 packed_modules_mapping: Dict[str, Any]) -> None:
        self.quantizer = AscendQuantizer.get_quantizer(
            quant_config.quant_description, prefix, packed_modules_mapping)
        self.quant_method = self.quantizer.build_linear_method()

    def create_weights(
        self,
        layer: torch.nn.Module,
        input_size_per_partition: int,
        output_partition_sizes: List[int],
        input_size: int,
        output_size: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ) -> None:
        output_size_per_partition = sum(output_partition_sizes)
        weight_loader = extra_weight_attrs.get("weight_loader")

        weight_dict = self.quant_method.get_weight(input_size_per_partition,
                                                   output_size_per_partition,
                                                   params_dtype)
        for weight_name, weight_param in weight_dict.items():
            param = torch.nn.Parameter(weight_param, requires_grad=False)
            set_weight_attrs(param, {"input_dim": 1, "output_dim": 0})
            layer.register_parameter(weight_name, param)
            set_weight_attrs(param, extra_weight_attrs)
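
        # Besides the weight itself, the quant method can expose three
        # families of quantization parameters: per-tensor (one scalar per
        # tensor), per-channel (one value per output channel, output_dim=0),
        # and per-group (a second-level scale/offset along the input
        # dimension, used e.g. by w4a8 group quantization).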
        pertensor_dict = self.quant_method.get_pertensor_param(params_dtype)
        for pertensor_name, pertensor_param in pertensor_dict.items():
            param = PerTensorScaleParameter(data=pertensor_param,
                                            weight_loader=weight_loader)
            # disable warning
            param.ignore_warning = True
            layer.register_parameter(pertensor_name, param)

        perchannel_dict = self.quant_method.get_perchannel_param(
            output_size_per_partition, params_dtype)
        for perchannel_name, perchannel_param in perchannel_dict.items():
            param = torch.nn.Parameter(perchannel_param, requires_grad=False)
            set_weight_attrs(param, {"output_dim": 0})
            layer.register_parameter(perchannel_name, param)
            set_weight_attrs(param, extra_weight_attrs)

        pergroup_dict = self.quant_method.get_pergroup_param(
            input_size_per_partition, output_size_per_partition, params_dtype)
        for pergroup_name, pergroup_param in pergroup_dict.items():
            param = torch.nn.Parameter(pergroup_param, requires_grad=False)
            set_weight_attrs(param, {"output_dim": 0})
            layer.register_parameter(pergroup_name, param)
            set_weight_attrs(param, extra_weight_attrs)
            if "weight_scale_second" in pergroup_name or "weight_offset_second" in pergroup_name:
                setattr(param, "input_dim", 1)

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        if hasattr(self.quant_method, "process_weights_after_loading"):
            self.quant_method.process_weights_after_loading(layer)
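
    # For row-parallel layers the TP rank is forwarded to the underlying quant
    # method, presumably so rank-dependent handling (such as adding the bias
    # on only one rank before the all-reduce) can be applied.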
    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        if isinstance(layer, RowParallelLinear):
            tp_rank = get_tensor_model_parallel_rank()
            return self.quant_method.apply(layer, x, bias, tp_rank)
        return self.quant_method.apply(layer, x, bias)


class AscendKVCacheMethod(BaseKVCacheMethod):
    """KVCache method for Ascend quantization.

    This class calls AscendQuantizer to search for a specific quantization
    implementation supported on Ascend hardware for KV cache methods.

    Args:
        quant_config: The Ascend quantization config.
    """

    def __init__(self, quant_config: AscendQuantConfig, prefix: str) -> None:
        self.quantizer = AscendQuantizer.get_quantizer(
            quant_config.quant_description, prefix)
        self.quant_method = self.quantizer.build_attention_method()

    def create_weights(self, layer: torch.nn.Module) -> None:
        # Unlike the linear method, vLLM performs no weight processing or
        # slicing for attention, so weight creation is delegated entirely to
        # the specific quant method.
        self.quant_method.create_weights(layer)

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        if hasattr(self.quant_method, "process_weights_after_loading"):
            self.quant_method.process_weights_after_loading(layer)

    def apply(self, layer: torch.nn.Module, query: torch.Tensor,
              key: torch.Tensor, value: torch.Tensor, kv_cache, attn_metadata,
              attn_type, scale, output) -> torch.Tensor:
        return self.quant_method.apply(layer, query, key, value, kv_cache,
                                       attn_metadata, attn_type, scale, output)


class AscendFusedMoEMethod(FusedMoEMethodBase):
    """FusedMoE method for Ascend quantization.

    This class calls AscendQuantizer to search for a specific quantization
    implementation supported on Ascend hardware for fused MoE methods.

    Args:
        quant_config: The Ascend quantization config.
    """

    def __init__(self, quant_config: AscendQuantConfig, prefix: str,
                 packed_modules_mapping: Dict[str, Any]):
        self.quantizer = AscendQuantizer.get_quantizer(
            quant_config.quant_description, prefix, packed_modules_mapping)
        self.quant_method = self.quantizer.build_moe_method()
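
    # Weight-scale granularity in create_weights below: dynamic quant params
    # default to per-channel (FusedMoeWeightScaleSupported.CHANNEL), while the
    # second-level "*_second" scales/offsets are marked per-group
    # (FusedMoeWeightScaleSupported.GROUP), matching w4a8 group quantization.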

    def create_weights(
        self,
        layer: torch.nn.Module,
        num_experts: int,
        hidden_size: int,
        intermediate_size_per_partition: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ) -> None:
        weight_param = self.quant_method.get_weight(
            num_experts, intermediate_size_per_partition, hidden_size,
            params_dtype)
        for param_key, param_value in weight_param.items():
            param = torch.nn.Parameter(param_value, requires_grad=False)
            layer.register_parameter(param_key, param)
            set_weight_attrs(param, extra_weight_attrs)

        extra_weight_attrs.update(
            {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value})
        dynamic_quant_param = self.quant_method.get_dynamic_quant_param(
            num_experts, intermediate_size_per_partition, hidden_size,
            params_dtype)
        for param_key, param_value in dynamic_quant_param.items():
            param = torch.nn.Parameter(param_value, requires_grad=False)
            layer.register_parameter(param_key, param)
            set_weight_attrs(param, extra_weight_attrs)
if "weight_scale_second" in param_key or "weight_offset_second" in param_key:
setattr(param, "quant_method",
FusedMoeWeightScaleSupported.GROUP.value)

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        router_logits: torch.Tensor,
        top_k: int,
        renormalize: bool,
        use_grouped_topk: bool = False,
        global_num_experts: int = -1,
        expert_map: Optional[torch.Tensor] = None,
        topk_group: Optional[int] = None,
        num_expert_group: Optional[int] = None,
        custom_routing_function: Optional[Callable] = None,
        scoring_func: str = "softmax",
        e_score_correction_bias: Optional[torch.Tensor] = None,
        is_prefill: bool = True,
        enable_force_load_balance: bool = False,
        log2phy: Optional[torch.Tensor] = None,
        global_redundant_expert_num: int = 0,
        **kwargs,
    ) -> torch.Tensor:
        return self.quant_method.apply(
            layer, x, router_logits, top_k, renormalize, use_grouped_topk,
            global_num_experts, expert_map, topk_group, num_expert_group,
            custom_routing_function, scoring_func, e_score_correction_bias,
            is_prefill, enable_force_load_balance, log2phy,
            global_redundant_expert_num, **kwargs)

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        if hasattr(self.quant_method, "process_weights_after_loading"):
            self.quant_method.process_weights_after_loading(layer)


class AscendEmbeddingMethod(AscendLinearMethod):
    """Embedding method for Ascend quantization.

    This class calls AscendQuantizer to search for a specific quantization
    implementation supported on Ascend hardware for embedding methods.

    Args:
        quant_config: The Ascend quantization config.
    """

    def __init__(self, quant_config: AscendQuantConfig, prefix: str,
                 packed_modules_mapping: Dict[str, Any]) -> None:
        self.quantizer = AscendQuantizer.get_quantizer(
            quant_config.quant_description, prefix, packed_modules_mapping)
        self.quant_method = self.quantizer.build_linear_method()
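
# End-to-end, this config is activated via vLLM's `--quantization ascend`
# flag (or `quantization="ascend"` in the LLM constructor). A hedged example,
# with a hypothetical model path:
#
#     vllm serve /path/to/Qwen2.5-7B-Instruct-w8a8 --quantization ascend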