Co-authored-by: kunpengW-code <1289706727@qq.com>
Co-authored-by: linsheng1 <1950916997@qq.com>
### What this PR does / why we need it?
Currently, chunked prefill is forcibly enabled. DeepSeek V3.1 W8A8C8
supports only the PD separation scenario. C8 refers to quantizing the KV
cache to int8, which aims to reduce the GPU memory usage of the KV cache
and improve the inference throughput.
Constraints:
1. The model can be run only in PD separation mode, using
MooncakeLayerwiseConnector.
2. Currently, dynamic quantization is supported only for activations,
while the KV cache uses static quantization. C8 quantization with MTP
is not supported. You can use ModelSlim for quantization. The
quantization procedure is as follows:
```bash
pip install transformers==4.48.2
git clone https://gitcode.com/Ascend/msmodelslim.git
cd msmodelslim
bash install.sh
cd example/DeepSeek/
python3 quant_deepseek_w8a8.py --model_path <path/weight> --save_path <path/quant_weight> \
    --anti_dataset ../common/deepseek_anti_prompt_50_v3_1.json \
    --calib_dataset ../common/deepseek_calib_prompt_50_v3_1.json --rot \
    --trust_remote_code True --fa_quant --dynamic --anti_method m6
```
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
- vLLM version: v0.17.0
- vLLM main:
4034c3d32e
---------
Signed-off-by: pichangping <1337510399@qq.com>
Signed-off-by: Wang Kunpeng <1289706727@qq.com>
Co-authored-by: Wang Kunpeng <1289706727@qq.com>
722 lines
27 KiB
Python
722 lines
27 KiB
Python
#
|
|
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
|
# Copyright 2023 The vLLM team.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
# This file is a part of the vllm-ascend project.
|
|
#
|
|
"""ModelSlim quantization configuration and model mappings for Ascend.
|
|
|
|
This module provides the AscendModelSlimConfig class for parsing quantization
|
|
configs generated by the ModelSlim tool, along with model-specific mappings.
|
|
"""
|
|
|
|
import glob
|
|
import json
|
|
import os
|
|
import re
|
|
from collections.abc import Mapping
|
|
from types import MappingProxyType
|
|
from typing import Any, Optional
|
|
|
|
import torch
|
|
from vllm.config import get_current_vllm_config
|
|
from vllm.logger import init_logger
|
|
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
|
|
from vllm.model_executor.layers.fused_moe import FusedMoE
|
|
from vllm.model_executor.layers.linear import LinearBase
|
|
from vllm.model_executor.layers.quantization import register_quantization_config
|
|
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig, QuantizeMethodBase
|
|
from vllm.model_executor.layers.vocab_parallel_embedding import UnquantizedEmbeddingMethod, VocabParallelEmbedding
|
|
from vllm.model_executor.models.utils import WeightsMapper
|
|
|
|
from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, calc_split_factor
|
|
|
|
from .methods import get_scheme_class
|
|
|
|
# The config filename that ModelSlim generates after quantizing a model.
# AscendModelSlimConfig.maybe_update_config() looks this file up for the model.
MODELSLIM_CONFIG_FILENAME = "quant_model_description.json"

# Module-level logger obtained from vLLM's init_logger for this module.
logger = init_logger(__name__)
|
|
|
|
# key: model_type
# value: orig_to_new_prefix
#
# Several model types share the exact same rewrite table, so each table is
# written once as an ordered tuple of (original prefix, new prefix) pairs and
# turned into a fresh, independent dict for every model type that uses it.
_LANGUAGE_MODEL_PREFIX_PAIRS = (
    ("visual.", "model.visual."),
    ("language_model.lm_head.", "lm_head."),
    ("language_model.model.", "model.language_model."),
)
_THINKER_PREFIX_PAIRS = (
    ("language_model.lm_head.", "thinker.lm_head."),
    ("language_model.model.", "thinker.model."),
    ("visual.", "thinker.visual."),
)

QUANT_MODEL_PREFIX_MAPPINGS: dict[str, dict[str, str]] = {
    "qwen3_vl_moe": dict(_LANGUAGE_MODEL_PREFIX_PAIRS),
    "qwen3_vl_text": dict(_LANGUAGE_MODEL_PREFIX_PAIRS),
    "kimi_k25": {
        "mm_projector.linear_1": "mm_projector.proj.0",
        "mm_projector.linear_2": "mm_projector.proj.2",
    },
    "qwen3_omni_moe": dict(_THINKER_PREFIX_PAIRS),
    "qwen2_5_omni": dict(_THINKER_PREFIX_PAIRS),
    "qwen2_5_omni_text": {"language_model.": "thinker."},
    "glm4v_moe": dict(_LANGUAGE_MODEL_PREFIX_PAIRS),
    "glm4v_moe_text": dict(_LANGUAGE_MODEL_PREFIX_PAIRS),
    "qwen3_5": dict(_LANGUAGE_MODEL_PREFIX_PAIRS),
    "qwen3_5_moe": dict(_LANGUAGE_MODEL_PREFIX_PAIRS),
}
|
|
|
|
# key: model_type
# value: dict of fused module name -> list of original module names
#
# The same shard-name groups recur across many architectures. They are spelled
# out once as immutable tuples and copied into a fresh list wherever they are
# used, so every model entry still owns independent list objects.
_QKV_SHARDS = ("q_proj", "k_proj", "v_proj")
_ATTN_QKV_SHARDS = ("attn_q_proj", "attn_k_proj", "attn_v_proj")
_GATE_UP_SHARDS = ("gate_proj", "up_proj")
_EXPERT_SHARDS = ("experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj")
_FUSED_QKV_A_SHARDS = ("q_a_proj", "kv_a_proj_with_mqa")


def _qkv_moe_modules() -> dict[str, list[str]]:
    """Return a new qkv_proj / gate_up_proj / experts mapping."""
    return {
        "qkv_proj": list(_QKV_SHARDS),
        "gate_up_proj": list(_GATE_UP_SHARDS),
        "experts": list(_EXPERT_SHARDS),
    }


def _fused_qkv_a_moe_modules() -> dict[str, list[str]]:
    """Return a new gate_up_proj / experts / fused_qkv_a_proj mapping."""
    return {
        "gate_up_proj": list(_GATE_UP_SHARDS),
        "experts": list(_EXPERT_SHARDS),
        "fused_qkv_a_proj": list(_FUSED_QKV_A_SHARDS),
    }


packed_modules_model_mapping: dict[str, dict[str, list[str]]] = {
    "qwen3_moe": _qkv_moe_modules(),
    "qwen3_5": {
        "qkv_proj": list(_QKV_SHARDS),
        "gate_up_proj": list(_GATE_UP_SHARDS),
        "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
        "in_proj_ba": ["in_proj_b", "in_proj_a"],
    },
    "qwen3_5_moe": {
        "qkv_proj": list(_QKV_SHARDS),
        "gate_up_proj": list(_GATE_UP_SHARDS),
        "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
        "in_proj_ba": ["in_proj_b", "in_proj_a"],
        "experts": list(_EXPERT_SHARDS),
    },
    "deepseek_v2": _fused_qkv_a_moe_modules(),
    "deepseek_v3": _fused_qkv_a_moe_modules(),
    "pangu_ultra_moe": _fused_qkv_a_moe_modules(),
    "kimi_k2": _fused_qkv_a_moe_modules(),
    "deepseek_v32": _fused_qkv_a_moe_modules(),
    "glm_moe_dsa": _fused_qkv_a_moe_modules(),
    # NOTE 1: The MTP layer of a quantized deepseek model is not quantized on the NPU.
    # NOTE 2: The description file generated by the current msmodelslim tool does not
    # have MTP layer info. Please manually add it and set the value to FLOAT.
    "deepseek_mtp": {
        "gate_up_proj": list(_GATE_UP_SHARDS),
        "experts": list(_EXPERT_SHARDS),
    },
    "pangu_ultra_moe_mtp": _fused_qkv_a_moe_modules(),
    "qwen3_next": {
        "qkv_proj": list(_QKV_SHARDS),
        "gate_up_proj": list(_GATE_UP_SHARDS),
        "in_proj": ["in_proj_qkvz", "in_proj_ba"],
        "experts": list(_EXPERT_SHARDS),
    },
    "qwen2_5_vl": {
        "qkv_proj": list(_QKV_SHARDS),
        "gate_up_proj": list(_GATE_UP_SHARDS),
    },
    "qwen3_vl_moe": _qkv_moe_modules(),
    "glm4_moe": _qkv_moe_modules(),
    "glm4_moe_lite": _fused_qkv_a_moe_modules(),
    "glm4v_moe": _qkv_moe_modules(),
    "glm4v_moe_text": _qkv_moe_modules(),
    "longcat_flash": _fused_qkv_a_moe_modules(),
    "minimax_m2": {
        "qkv_proj": list(_QKV_SHARDS),
        "experts": ["experts.0.w1", "experts.0.w2", "experts.0.w3"],
    },
    "qwen3_omni_moe": {
        "qkv_proj": list(_QKV_SHARDS),
        "attn_qkv_proj": list(_ATTN_QKV_SHARDS),
        "gate_up_proj": list(_GATE_UP_SHARDS),
        "experts": list(_EXPERT_SHARDS),
    },
    "qwen2_5_omni": {
        "qkv_proj": list(_QKV_SHARDS),
        "attn_qkv_proj": list(_ATTN_QKV_SHARDS),
        "qkv": ["q", "k", "v"],
        "gate_up_proj": list(_GATE_UP_SHARDS),
    },
}
|
|
|
|
|
|
def get_packed_modules_mapping(model_type: str) -> dict[str, list[str]]:
    """Look up the fused-module table registered for ``model_type``.

    Args:
        model_type: The model type string (e.g., "deepseek_v3").

    Returns:
        The entry of ``packed_modules_model_mapping`` for this model type
        (fused module name -> list of component module names), or a new
        empty dict when the model type has no entry.
    """
    if model_type in packed_modules_model_mapping:
        return packed_modules_model_mapping[model_type]
    return {}
|
|
|
|
|
|
def get_prefix_mapping(model_type: str) -> dict[str, str]:
    """Look up the prefix rewrite table registered for ``model_type``.

    Args:
        model_type: The model type string (e.g., "qwen3_vl_moe").

    Returns:
        The entry of ``QUANT_MODEL_PREFIX_MAPPINGS`` for this model type
        (original prefix -> new prefix), or a new empty dict when the model
        type has no entry.
    """
    if model_type in QUANT_MODEL_PREFIX_MAPPINGS:
        return QUANT_MODEL_PREFIX_MAPPINGS[model_type]
    return {}
|
|
|
|
|
|
def get_linear_quant_type(
|
|
quant_description: dict[str, Any], prefix: str, packed_modules_mapping: dict[str, Any]
|
|
) -> str | None:
|
|
"""Determine the quantization type for a linear layer.
|
|
|
|
Args:
|
|
quant_description: The quantization description dictionary.
|
|
prefix: The layer prefix.
|
|
packed_modules_mapping: Mapping for packed/fused modules.
|
|
|
|
Returns:
|
|
The quantization type string (e.g., "W8A8_DYNAMIC").
|
|
"""
|
|
proj_name = prefix.split(".")[-1]
|
|
if proj_name in packed_modules_mapping:
|
|
quant_type = None
|
|
shard_prefixes = [
|
|
prefix.replace(proj_name, shard_proj_name) for shard_proj_name in packed_modules_mapping[proj_name]
|
|
]
|
|
for shard_prefix in shard_prefixes:
|
|
shard_quant_type = quant_description[shard_prefix + ".weight"]
|
|
|
|
if quant_type is None:
|
|
quant_type = shard_quant_type
|
|
elif shard_quant_type != quant_type:
|
|
raise ValueError(
|
|
f"Not all shards of {prefix} are quantized with same quant type."
|
|
f"Shard {proj_name} uses {shard_quant_type}, but another shard"
|
|
f"use {quant_type}. Please check quantization config."
|
|
)
|
|
else:
|
|
quant_type = quant_description[prefix + ".weight"]
|
|
return quant_type
|
|
|
|
|
|
def get_quant_type_for_layer(
|
|
quant_description: dict[str, Any],
|
|
prefix: str,
|
|
layer_type: str,
|
|
packed_modules_mapping: dict[str, Any] | None = None,
|
|
) -> str | None:
|
|
"""Determine the quantization type for a layer.
|
|
|
|
Args:
|
|
quant_description: The quantization description dictionary.
|
|
prefix: The layer prefix.
|
|
layer_type: The type of layer ("linear", "moe", "attention").
|
|
packed_modules_mapping: Mapping for packed/fused modules.
|
|
|
|
Returns:
|
|
The quantization type string (e.g., "W8A8_DYNAMIC").
|
|
"""
|
|
if packed_modules_mapping is None:
|
|
packed_modules_mapping = dict()
|
|
# Attention
|
|
if layer_type == "attention" and "fa_quant_type" in quant_description:
|
|
return quant_description["fa_quant_type"]
|
|
# Linear / MoE
|
|
return get_linear_quant_type(quant_description, prefix, packed_modules_mapping)
|
|
|
|
|
|
def create_scheme_for_layer(
    quant_description: dict[str, Any],
    prefix: str,
    layer_type: str,
    packed_modules_mapping: dict[str, Any] | None = None,
):
    """Instantiate the quantization scheme that applies to one layer.

    Args:
        quant_description: The quantization description dictionary.
        prefix: The layer prefix.
        layer_type: The type of layer ("linear", "moe", "attention").
        packed_modules_mapping: Mapping for packed/fused modules.

    Returns:
        A new instance of the scheme class that the registry returns for the
        resolved quantization type and ``layer_type``.

    Raises:
        ValueError: If no quantization type could be resolved for the layer.
        NotImplementedError: If the registry has no scheme class for the
            resolved type / layer type combination.
    """
    logger.info_once("Using the vLLM Ascend modelslim Quantization now!")

    quant_type = get_quant_type_for_layer(quant_description, prefix, layer_type, packed_modules_mapping)
    if quant_type is None:
        raise ValueError(f"Could not determine quantization type for layer {prefix}.")

    # Use registry to get scheme class
    scheme_cls = get_scheme_class(quant_type, layer_type)
    if scheme_cls is None:
        raise NotImplementedError(f"Currently, vLLM Ascend doesn't support {quant_type} for {layer_type}.")
    return scheme_cls()
|
|
|
|
|
|
@register_quantization_config(ASCEND_QUANTIZATION_METHOD)
class AscendModelSlimConfig(QuantizationConfig):
    """Config class for Ascend ModelSlim quantization.

    This class is a general class that parses quantization configs
    that are supported on Ascend hardware, specifically for models
    quantized using the ModelSlim tool.

    Attributes set by this class:
        quant_description: The ModelSlim description dict (key -> quant type
            or other metadata such as ``"fa_quant_type"``).
        enable_fa_quant: True when the description has a non-empty
            ``"fa_quant_type"`` entry.
        kvcache_quant_layers: Layer indices collected from description keys
            that contain ``"fa_k.scale"``.
    """

    # Matches a purely numeric dotted path component, e.g. the "3" in
    # "model.layers.3.self_attn". Shared by the KV-cache layer-index helpers.
    _LAYER_INDEX_PATTERN = re.compile(r"\.(\d+)\.")

    def __init__(self, quant_config: dict[str, Any] | None = None):
        """Store the description dict and derive the KV-cache quant metadata.

        Args:
            quant_config: The parsed ModelSlim description. It is stored by
                reference and extended in place with alias keys. ``None``
                starts with an empty description that
                ``maybe_update_config`` can fill in later.
        """
        super().__init__()
        self.quant_description = quant_config if quant_config is not None else {}
        self._apply_extra_quant_adaptations()
        self._add_kvcache_quant_metadata()

    def __repr__(self) -> str:
        return "AscendModelSlimConfig:\n" + super().__repr__()

    @classmethod
    def get_name(cls) -> str:
        """Return the registered name of this quantization method."""
        return ASCEND_QUANTIZATION_METHOD

    @classmethod
    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
        """Return the activation dtypes this config declares as supported."""
        return [torch.int8, torch.float16, torch.bfloat16]

    @classmethod
    def get_min_capability(cls) -> int:
        """Not applicable on Ascend hardware; always raises.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError('Ascend hardware dose not support "get_min_capability" feature.')

    @classmethod
    def get_config_filenames(cls) -> list[str]:
        """Return no filenames so config loading happens in maybe_update_config."""
        # Return empty list so that vllm's get_quant_config() skips the
        # file-based lookup (which raises an unfriendly "Cannot find the
        # config file for ascend" error when the model is not quantized).
        # Instead, the config file is loaded in maybe_update_config(),
        # which can provide a user-friendly error message.
        return []

    @classmethod
    def from_config(cls, config: dict[str, Any]) -> "AscendModelSlimConfig":
        """Build a config instance directly from a description dict."""
        return cls(config)

    @classmethod
    def override_quantization_method(cls, hf_quant_cfg, user_quant) -> str | None:
        """Select this method for configs that name no quant method on NPU.

        Args:
            hf_quant_cfg: The quantization config dict, or ``None``.
            user_quant: Unused by this implementation.

        Returns:
            ``ASCEND_QUANTIZATION_METHOD`` when ``hf_quant_cfg`` is given, has
            no truthy ``"quant_method"`` entry and ``torch.npu`` reports it is
            available; otherwise ``None``.
        """
        if hf_quant_cfg is None:
            return None
        quant_method = hf_quant_cfg.get("quant_method", None)
        if not quant_method and torch.npu.is_available():
            return ASCEND_QUANTIZATION_METHOD
        return None

    def quant_prefix_mapper(self, model_type: str, prefix: str) -> str:
        """Rewrite ``prefix`` using the prefix table registered for ``model_type``.

        Returns ``prefix`` unchanged when the model type has no (or an empty)
        entry in ``QUANT_MODEL_PREFIX_MAPPINGS``.
        """
        # TODO (Levi-JQ): will be removed when QuantizationConfig.apply_vllm_mapper is implemented
        prefix_mapping = QUANT_MODEL_PREFIX_MAPPINGS.get(model_type)
        if not prefix_mapping:
            return prefix
        hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix=prefix_mapping)
        return hf_to_vllm_mapper._map_name(prefix)

    def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["QuantizeMethodBase"]:
        """Pick the quantization method adapter for ``layer``.

        The prefix is first normalized (Minimax renaming, model-specific
        prefix mapping, stripping a leading ``language_model`` component) so
        that it lines up with the keys of ``quant_description``.

        Args:
            layer: The module being constructed.
            prefix: The module's dotted name.

        Returns:
            An unquantized method when the description marks the layer as
            ``FLOAT``, a quantized method adapter wrapping the layer's scheme
            otherwise, or ``None`` for layer kinds this config does not
            handle (including attention layers that are not FA-quant layers).
        """
        from .method_adapters import (
            AscendEmbeddingMethod,
            AscendFusedMoEMethod,
            AscendKVCacheMethod,
            AscendLinearMethod,
        )

        vllm_config = get_current_vllm_config()
        model_type = vllm_config.model_config.hf_config.model_type

        if model_type in ["minimax", "minimax_m2"]:
            # Adapt to Minimax architecture: update layer names to MoE convention
            prefix = prefix.replace("mlp", "block_sparse_moe")
            # Normalize the prefix by stripping specific expert indices (e.g., 'experts.0' -> 'experts')
            parts = prefix.split(".")
            if "experts" in parts and len(parts) > 2:
                exp_idx = parts.index("experts")
                if exp_idx + 1 < len(parts) and parts[exp_idx + 1].isdigit():
                    prefix = ".".join(parts[: exp_idx + 1])

        if model_type in packed_modules_model_mapping:
            self.packed_modules_mapping = packed_modules_model_mapping[model_type]
        prefix = self.quant_prefix_mapper(model_type, prefix)

        if model_type != "kimi_k2" and prefix.startswith("language_model"):
            prefix = prefix.split(".", 1)[-1]

        if isinstance(layer, LinearBase):
            if self.is_layer_skipped_ascend(prefix, self.packed_modules_mapping):
                # Delayed import to avoid circular import
                from vllm_ascend.ops.linear import AscendUnquantizedLinearMethod

                return AscendUnquantizedLinearMethod()
            scheme = create_scheme_for_layer(self.quant_description, prefix, "linear", self.packed_modules_mapping)
            return AscendLinearMethod(scheme)

        if isinstance(layer, AttentionLayerBase) and self.is_fa_quant_layer(prefix):
            scheme = create_scheme_for_layer(self.quant_description, prefix, "attention", self.packed_modules_mapping)
            return AscendKVCacheMethod(scheme)

        if isinstance(layer, FusedMoE):
            if self.is_layer_skipped_ascend(prefix, self.packed_modules_mapping):
                # Delayed import to avoid circular import
                from vllm_ascend.ops.fused_moe.fused_moe import AscendUnquantizedFusedMoEMethod

                return AscendUnquantizedFusedMoEMethod(layer.moe_config)
            scheme = create_scheme_for_layer(self.quant_description, prefix, "moe", self.packed_modules_mapping)
            return AscendFusedMoEMethod(scheme, layer.moe_config)

        if isinstance(layer, VocabParallelEmbedding):
            if self.is_layer_skipped_ascend(prefix, self.packed_modules_mapping):
                return UnquantizedEmbeddingMethod()
            scheme = create_scheme_for_layer(self.quant_description, prefix, "linear", self.packed_modules_mapping)
            return AscendEmbeddingMethod(scheme)

        return None

    def is_layer_skipped_ascend(self, prefix: str, fused_mapping: Mapping[str, list[str]] = MappingProxyType({})):
        """Tell whether the layer at ``prefix`` is left unquantized (``FLOAT``).

        Args:
            prefix: The (already normalized) layer prefix.
            fused_mapping: Fused module name -> list of shard module names.

        Returns:
            For a fused module: whether its shards are ``FLOAT`` (all shards
            must agree). Otherwise: whether any description key that starts
            with ``prefix`` and ends with ``".weight"`` is ``FLOAT``.

        Raises:
            KeyError: If a shard's ``".weight"`` key is missing.
            ValueError: If only some shards of a fused module are ``FLOAT``.
        """
        # adapted from vllm.model_executor.layers.quantization.utils.quant_utils.is_layer_skipped
        proj_name = prefix.split(".")[-1]
        if proj_name in fused_mapping:
            # Swap only the trailing component; str.replace() would also
            # rewrite an earlier occurrence of the fused name in the path.
            parent = prefix[: len(prefix) - len(proj_name)]

            is_skipped = None
            for shard_proj_name in fused_mapping[proj_name]:
                is_shard_skipped = self.quant_description[f"{parent}{shard_proj_name}.weight"] == "FLOAT"

                if is_skipped is None:
                    is_skipped = is_shard_skipped
                elif is_shard_skipped != is_skipped:
                    raise ValueError(
                        f"Detected some but not all shards of {prefix} "
                        "are quantized. All shards of fused layers "
                        "to have the same precision."
                    )
        else:
            is_skipped = any(
                key.startswith(prefix) and key.endswith(".weight") and value == "FLOAT"
                for key, value in self.quant_description.items()
            )

        assert is_skipped is not None
        return is_skipped

    @classmethod
    def _layer_index_digits(cls, name: str) -> str:
        """Concatenate every numeric dotted component found in ``name``.

        Returns an empty string when ``name`` has no ``.<digits>.`` component.
        """
        return "".join(cls._LAYER_INDEX_PATTERN.findall(name))

    def is_fa_quant_layer(self, prefix):
        """Return True if FA quant is enabled and ``prefix`` names a quantized layer.

        The layer index is taken from the numeric components of ``prefix``
        and checked against ``kvcache_quant_layers``.
        """
        if not self.enable_fa_quant:
            return False
        layer_id_str = self._layer_index_digits(prefix)
        return layer_id_str.isdigit() and int(layer_id_str) in self.kvcache_quant_layers

    def enabling_fa_quant(self, vllm_config, layer_name) -> bool:
        """Return True when FA quant should be active for ``layer_name``.

        That requires ``vllm_config.kv_transfer_config`` to mark this instance
        as a KV consumer that is not also a KV producer (treated here as a
        decode instance), and ``layer_name`` to be an FA-quant layer.
        """
        is_decode_instance = (
            vllm_config.kv_transfer_config is not None
            and vllm_config.kv_transfer_config.is_kv_consumer
            and not vllm_config.kv_transfer_config.is_kv_producer
        )
        return bool(is_decode_instance and self.is_fa_quant_layer(layer_name))

    def get_kv_quant_dtype(self, layer_name, cache_dtype, model_config):
        """Return the pair of cache dtypes to use for ``layer_name``.

        Returns:
            ``(cache_dtype, cache_dtype)`` for layers without FA quant. For
            FA-quant layers, ``(torch.int8, model_config.dtype)`` when
            ``model_config.use_mla`` is set and ``(torch.int8, torch.int8)``
            otherwise.
        """
        if self.enable_fa_quant and self.is_fa_quant_layer(layer_name):
            ori_dtype = model_config.dtype
            quant_dtype = torch.int8
            # For MLA models like deepseek, we only quantify K cache to ensure accuracy
            if model_config.use_mla:
                return quant_dtype, ori_dtype
            return quant_dtype, quant_dtype
        return cache_dtype, cache_dtype

    def get_kv_quant_split_factor(self, layer_name, kv_head_dim_list):
        """Compute the split factor for the given K/V head dims.

        For FA-quant layers the second entry of ``kv_head_dim_list`` is
        doubled before calling ``calc_split_factor``.
        NOTE(review): presumably this accounts for the second cache keeping a
        2-byte dtype while the first is int8 -- confirm against the caller.
        """
        if self.enable_fa_quant and self.is_fa_quant_layer(layer_name):
            k_quant_head_dim = kv_head_dim_list[0]
            v_quant_head_dim = kv_head_dim_list[1] * 2
            kv_head_dim_list = [k_quant_head_dim, v_quant_head_dim]
        return calc_split_factor(kv_head_dim_list)

    def maybe_update_config(self, model_name: str, revision: str | None = None) -> None:
        """Load the ModelSlim quantization config from model directory.

        This method is called by vllm after get_quant_config() returns
        successfully. Since we return an empty list from get_config_filenames()
        to bypass vllm's built-in file lookup, we do the actual config loading
        here and provide user-friendly error messages when the config is missing.

        Works with both local directories (``/path/to/model``) and remote
        repository identifiers (``org/model-name``). For remote repos the
        lookup goes through the HuggingFace / ModelScope cache via
        ``get_model_file`` to fetch the config if not already cached.

        Args:
            model_name: Path to the model directory or HuggingFace /
                ModelScope repo id.
            revision: Optional revision (branch, tag, or commit hash) for
                remote repos.

        Raises:
            ValueError: If the ModelSlim config file cannot be found.
        """
        from vllm_ascend.quantization.utils import get_model_file

        # If quant_description is already populated (e.g. from from_config()),
        # there is nothing to do.
        if self.quant_description:
            return

        # Try to get the config file (local or remote)
        config_path = get_model_file(model_name, MODELSLIM_CONFIG_FILENAME, revision=revision)

        if config_path is not None:
            # JSON is UTF-8 by specification; do not depend on the locale default.
            with open(config_path, encoding="utf-8") as f:
                self.quant_description = json.load(f)
            self._apply_extra_quant_adaptations()
            self._add_kvcache_quant_metadata()
            return

        # Collect diagnostic info for the error message
        json_names: list[str] = []
        if os.path.isdir(model_name):
            json_files = glob.glob(os.path.join(model_name, "*.json"))
            json_names = [os.path.basename(f) for f in json_files]

        # Config file not found - raise a friendly error message
        raise ValueError(self._missing_config_message(model_name, json_names))

    @staticmethod
    def _missing_config_message(model_name: str, json_names: list[str]) -> str:
        """Build the multi-line error text shown when the config file is missing."""
        return (
            "\n"
            + "=" * 80
            + "\n"
            + "ERROR: ModelSlim Quantization Config Not Found\n"
            + "=" * 80
            + "\n"
            + "\n"
            + f"You have enabled '--quantization {ASCEND_QUANTIZATION_METHOD}' "
            + "(ModelSlim quantization),\n"
            + f"but the model '{model_name}' does not contain the required\n"
            + f"quantization config file ('{MODELSLIM_CONFIG_FILENAME}').\n"
            + "\n"
            + "This usually means the model weights are NOT quantized by "
            + "ModelSlim.\n"
            + "\n"
            + "Please choose one of the following solutions:\n"
            + "\n"
            + " Solution 1: Remove the quantization option "
            + "(for float/unquantized models)\n"
            + " "
            + "-" * 58
            + "\n"
            + f" Remove '--quantization {ASCEND_QUANTIZATION_METHOD}' from "
            + "your command if you want to\n"
            + " run the model with the original (float) weights.\n"
            + "\n"
            + " Example:\n"
            + f" vllm serve {model_name}\n"
            + "\n"
            + " Solution 2: Quantize your model weights with ModelSlim first\n"
            + " "
            + "-" * 58
            + "\n"
            + " Use the ModelSlim tool to quantize your model weights "
            + "before deployment.\n"
            + " After quantization, the model directory should contain "
            + f"'{MODELSLIM_CONFIG_FILENAME}'.\n"
            + " For more information, please refer to:\n"
            + " https://gitee.com/ascend/msit/tree/master/msmodelslim\n"
            + "\n"
            + (f" (Found JSON files in model directory: {json_names})\n" if json_names else "")
            + "=" * 80
        )

    def _apply_extra_quant_adaptations(self) -> None:
        """Apply extra adaptations to the quant_description dict.

        This handles known key transformations such as shared_head and
        weight_packed mappings: for every matching key an alias key with the
        same value is added; the original keys are kept.
        """
        # TODO(whx): remove this adaptation after adding "shared_head"
        # to prefix of DeepSeekShareHead in vLLM.
        extra_quant_dict = {}
        for key, value in self.quant_description.items():
            if "shared_head" in key:
                extra_quant_dict[key.replace(".shared_head.", ".")] = value
            if "weight_packed" in key:
                extra_quant_dict[key.replace("weight_packed", "weight")] = value
        # Merged after the loop so the dict is not resized while iterating.
        self.quant_description.update(extra_quant_dict)

    def get_scaled_act_names(self) -> list[str]:
        """Return the names of scaled activations; this config has none."""
        return []

    def _add_kvcache_quant_metadata(self):
        """Derive ``enable_fa_quant`` and ``kvcache_quant_layers`` from the description.

        FA quant is enabled when ``"fa_quant_type"`` is present and non-empty.
        The quantized layer indices are read from the numeric components of
        every description key that contains ``"fa_k.scale"``.
        """
        fa_quant_type = self.quant_description.get("fa_quant_type", "")
        self.enable_fa_quant = fa_quant_type != ""
        self.kvcache_quant_layers = []
        if not self.enable_fa_quant:
            return
        for key in self.quant_description:
            if "fa_k.scale" not in key:
                continue
            layer_id_str = self._layer_index_digits(key)
            if not layer_id_str:
                # A scale key without a numeric path component carries no layer
                # index; int("") would abort config loading, so skip it.
                logger.warning("Ignoring KV cache scale key without a layer index: %s", key)
                continue
            self.kvcache_quant_layers.append(int(layer_id_str))
|