# # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. # Copyright 2023 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # This file is a part of the vllm-ascend project. # """ModelSlim quantization configuration and model mappings for Ascend. This module provides the AscendModelSlimConfig class for parsing quantization configs generated by the ModelSlim tool, along with model-specific mappings. """ import glob import json import os from collections.abc import Mapping from types import MappingProxyType from typing import Any, Optional import regex as re import torch from vllm.config import get_current_vllm_config from vllm.logger import logger from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import LinearBase from vllm.model_executor.layers.quantization import register_quantization_config from vllm.model_executor.layers.quantization.base_config import QuantizationConfig, QuantizeMethodBase from vllm.model_executor.layers.vocab_parallel_embedding import UnquantizedEmbeddingMethod, VocabParallelEmbedding from vllm.model_executor.models.utils import WeightsMapper from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, calc_split_factor from .methods import get_scheme_class # The config filename that ModelSlim generates after quantizing a model. 
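# ModelSlim (msmodelslim) writes this file into the model directory next to the
# quantized weights; maybe_update_config() below loads it from a local path or,
# for remote repos, fetches it through the HuggingFace / ModelScope cache.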
MODELSLIM_CONFIG_FILENAME = "quant_model_description.json"

# key: model_type
# value: dict of fused module name -> list of original module names
packed_modules_model_mapping: dict[str, dict[str, list[str]]] = {
    "qwen3_moe": {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "qwen3_5": {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
        "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
        "in_proj_ba": ["in_proj_b", "in_proj_a"],
    },
    "qwen3_5_moe": {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
        "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
        "in_proj_ba": ["in_proj_b", "in_proj_a"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "deepseek_v2": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "deepseek_v3": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "pangu_ultra_moe": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "kimi_k2": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "deepseek_v32": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "glm_moe_dsa": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    # NOTE 1. The MTP layer of deepseek is not quantized on the NPU;
    # NOTE 2. The description file generated by the current msmodelslim tool does not have
    # MTP layer info. Please manually add it and set the value to FLOAT.
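    # For example (the key below is illustrative; the actual MTP layer index and
    # module names depend on the model), adding an entry like
    #     "model.layers.61.mlp.gate_proj.weight": "FLOAT"
    # to quant_model_description.json marks that MTP weight as unquantized.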
"deepseek_mtp": { "gate_up_proj": ["gate_proj", "up_proj"], "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"], }, "pangu_ultra_moe_mtp": { "gate_up_proj": ["gate_proj", "up_proj"], "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"], "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"], }, "qwen3_next": { "qkv_proj": [ "q_proj", "k_proj", "v_proj", ], "gate_up_proj": ["gate_proj", "up_proj"], "in_proj": ["in_proj_qkvz", "in_proj_ba"], "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"], }, "qwen2_5_vl": { "qkv_proj": [ "q_proj", "k_proj", "v_proj", ], "gate_up_proj": [ "gate_proj", "up_proj", ], }, "qwen3_vl_moe": { "qkv_proj": [ "q_proj", "k_proj", "v_proj", ], "gate_up_proj": [ "gate_proj", "up_proj", ], "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"], }, "glm4_moe": { "qkv_proj": [ "q_proj", "k_proj", "v_proj", ], "gate_up_proj": [ "gate_proj", "up_proj", ], "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"], }, "glm4_moe_lite": { "gate_up_proj": ["gate_proj", "up_proj"], "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"], "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"], }, "glm4v_moe": { "qkv_proj": [ "q_proj", "k_proj", "v_proj", ], "gate_up_proj": [ "gate_proj", "up_proj", ], "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"], }, "glm4v_moe_text": { "qkv_proj": [ "q_proj", "k_proj", "v_proj", ], "gate_up_proj": [ "gate_proj", "up_proj", ], "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"], }, "longcat_flash": { "gate_up_proj": ["gate_proj", "up_proj"], "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"], "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"], }, "minimax_m2": { "qkv_proj": [ "q_proj", "k_proj", "v_proj", ], "experts": ["experts.0.w1", "experts.0.w2", "experts.0.w3"], }, "qwen3_omni_moe": { "qkv_proj": [ "q_proj", "k_proj", "v_proj", ], "attn_qkv_proj": [ "attn_q_proj", "attn_k_proj", "attn_v_proj", ], "gate_up_proj": [ "gate_proj", "up_proj", ], "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"], }, "qwen2_5_omni": { "qkv_proj": [ "q_proj", "k_proj", "v_proj", ], "attn_qkv_proj": [ "attn_q_proj", "attn_k_proj", "attn_v_proj", ], "qkv": [ "q", "k", "v", ], "gate_up_proj": [ "gate_proj", "up_proj", ], }, } def get_packed_modules_mapping(model_type: str) -> dict[str, list[str]]: """Get packed modules mapping for a model type. Args: model_type: The model type string (e.g., "deepseek_v3"). Returns: Dictionary mapping fused module names to their component module names. Returns empty dict if model_type is not found. """ return packed_modules_model_mapping.get(model_type, {}) def get_linear_quant_type( quant_description: dict[str, Any], prefix: str, packed_modules_mapping: dict[str, Any] ) -> str | None: """Determine the quantization type for a linear layer. Args: quant_description: The quantization description dictionary. prefix: The layer prefix. packed_modules_mapping: Mapping for packed/fused modules. Returns: The quantization type string (e.g., "W8A8_DYNAMIC"). 
""" proj_name = prefix.split(".")[-1] if proj_name in packed_modules_mapping: quant_type = None shard_prefixes = [ prefix.replace(proj_name, shard_proj_name) for shard_proj_name in packed_modules_mapping[proj_name] ] for shard_prefix in shard_prefixes: shard_quant_type = quant_description[shard_prefix + ".weight"] if quant_type is None: quant_type = shard_quant_type elif shard_quant_type != quant_type: raise ValueError( f"Not all shards of {prefix} are quantized with same quant type." f"Shard {proj_name} uses {shard_quant_type}, but another shard" f"use {quant_type}. Please check quantization config." ) else: quant_type = quant_description[prefix + ".weight"] return quant_type def get_quant_type_for_layer( quant_description: dict[str, Any], prefix: str, layer_type: str, packed_modules_mapping: dict[str, Any] | None = None, ) -> str | None: """Determine the quantization type for a layer. Args: quant_description: The quantization description dictionary. prefix: The layer prefix. layer_type: The type of layer ("linear", "moe", "attention"). packed_modules_mapping: Mapping for packed/fused modules. Returns: The quantization type string (e.g., "W8A8_DYNAMIC"). """ if packed_modules_mapping is None: packed_modules_mapping = dict() # Attention if layer_type == "attention" and "fa_quant_type" in quant_description: return quant_description["fa_quant_type"] if layer_type == "attention" and "indexer_quant_type" in quant_description: return quant_description["indexer_quant_type"] # Linear / MoE return get_linear_quant_type(quant_description, prefix, packed_modules_mapping) def create_scheme_for_layer( quant_description: dict[str, Any], prefix: str, layer_type: str, packed_modules_mapping: dict[str, Any] | None = None, ): """Create a quantization scheme instance for a layer. Args: quant_description: The quantization description dictionary. prefix: The layer prefix. layer_type: The type of layer ("linear", "moe", "attention"). packed_modules_mapping: Mapping for packed/fused modules. Returns: An instance of the appropriate quantization scheme class. """ logger.info_once("Using the vLLM Ascend modelslim Quantization now!") quant_type = get_quant_type_for_layer(quant_description, prefix, layer_type, packed_modules_mapping) if quant_type is None: raise ValueError(f"Could not determine quantization type for layer {prefix}.") # Use registry to get scheme class scheme_cls = get_scheme_class(quant_type, layer_type) if scheme_cls is not None: return scheme_cls() raise NotImplementedError(f"Currently, vLLM Ascend doesn't support {quant_type} for {layer_type}.") @register_quantization_config(ASCEND_QUANTIZATION_METHOD) class AscendModelSlimConfig(QuantizationConfig): """Config class for Ascend ModelSlim quantization. This class is a general class that parses quantization configs that are supported on Ascend hardware, specifically for models quantized using the ModelSlim tool. 
""" def __init__(self, quant_config: dict[str, Any] | None = None): super().__init__() self.quant_description = quant_config if quant_config is not None else {} self._apply_extra_quant_adaptations() self.model_type: str | None = None self.hf_to_vllm_mapper: WeightsMapper | None = None self._mapper_applied = False self._add_kvcache_quant_metadata() def __repr__(self) -> str: return "AscendModelSlimConfig:\n" + super().__repr__() @classmethod def get_name(cls) -> str: return ASCEND_QUANTIZATION_METHOD @classmethod def get_supported_act_dtypes(cls) -> list[torch.dtype]: return [torch.int8, torch.float16, torch.bfloat16] @classmethod def get_min_capability(cls) -> int: raise NotImplementedError('Ascend hardware dose not support "get_min_capability" feature.') @classmethod def get_config_filenames(cls) -> list[str]: # Return empty list so that vllm's get_quant_config() skips the # file-based lookup (which raises an unfriendly "Cannot find the # config file for ascend" error when the model is not quantized). # Instead, the config file is loaded in maybe_update_config(), # which can provide a user-friendly error message. return [] @classmethod def from_config(cls, config: dict[str, Any]) -> "AscendModelSlimConfig": return cls(config) @classmethod def override_quantization_method(cls, hf_quant_cfg, user_quant) -> str | None: if hf_quant_cfg is not None: quant_method = hf_quant_cfg.get("quant_method", None) if not quant_method and torch.npu.is_available(): return ASCEND_QUANTIZATION_METHOD return None def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): """Apply the vLLM model-specific mapper to this quantization config. This method is called by vLLM to apply the model-specific weight mapper to the quantization configuration. It directly uses the forward mapping (HF -> vLLM) to transform keys in quant_description from HF format to vLLM format. Args: hf_to_vllm_mapper: The WeightsMapper instance provided by vLLM that contains model-specific prefix mappings (HF to vLLM). 
""" if self._mapper_applied and self.hf_to_vllm_mapper is hf_to_vllm_mapper: return self.hf_to_vllm_mapper = hf_to_vllm_mapper self._mapper_applied = True if self.quant_description: self.quant_description = hf_to_vllm_mapper.apply_dict(self.quant_description) self._add_kvcache_quant_metadata() logger.info("Applied hf_to_vllm_mapper to quant_description keys") def quant_prefix_mapper(self, model_type: str, prefix: str) -> str: self.model_type = model_type return prefix def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["QuantizeMethodBase"]: from .method_adapters import ( AscendEmbeddingMethod, AscendFusedMoEMethod, AscendKVCacheMethod, AscendLinearMethod, ) vllm_config = get_current_vllm_config() model_type = vllm_config.model_config.hf_config.model_type if model_type in ["minimax", "minimax_m2"]: # Adapt to Minimax architecture: update layer names to MoE convention prefix = prefix.replace("mlp", "block_sparse_moe") # Normalize the prefix by stripping specific expert indices (e.g., 'experts.0' -> 'experts') parts = prefix.split(".") if "experts" in parts and len(parts) > 2: exp_idx = parts.index("experts") if exp_idx + 1 < len(parts) and parts[exp_idx + 1].isdigit(): parts = parts[: exp_idx + 1] prefix = ".".join(parts) if model_type in packed_modules_model_mapping: self.packed_modules_mapping = packed_modules_model_mapping[model_type] prefix = self.quant_prefix_mapper(model_type, prefix) if isinstance(layer, LinearBase): if self.is_layer_skipped_ascend(prefix, self.packed_modules_mapping): # Delayed import to avoid circular import from vllm_ascend.ops.linear import AscendUnquantizedLinearMethod return AscendUnquantizedLinearMethod() scheme = create_scheme_for_layer(self.quant_description, prefix, "linear", self.packed_modules_mapping) return AscendLinearMethod(scheme) elif isinstance(layer, AttentionLayerBase) and ( self.is_fa_quant_layer(prefix) or self.is_indexer_quant_layer(prefix) ): scheme = create_scheme_for_layer(self.quant_description, prefix, "attention", self.packed_modules_mapping) return AscendKVCacheMethod(scheme) elif isinstance(layer, FusedMoE): if self.is_layer_skipped_ascend(prefix, self.packed_modules_mapping): # Delayed import to avoid circular import from vllm_ascend.ops.fused_moe.fused_moe import AscendUnquantizedFusedMoEMethod return AscendUnquantizedFusedMoEMethod(layer.moe_config) scheme = create_scheme_for_layer(self.quant_description, prefix, "moe", self.packed_modules_mapping) return AscendFusedMoEMethod(scheme, layer.moe_config) elif isinstance(layer, VocabParallelEmbedding): if self.is_layer_skipped_ascend(prefix, self.packed_modules_mapping): return UnquantizedEmbeddingMethod() scheme = create_scheme_for_layer(self.quant_description, prefix, "linear", self.packed_modules_mapping) return AscendEmbeddingMethod(scheme) return None def is_layer_skipped_ascend(self, prefix: str, fused_mapping: Mapping[str, list[str]] = MappingProxyType({})): # adapted from vllm.model_executor.layers.quantization.utils.quant_utils.is_layer_skipped proj_name = prefix.split(".")[-1] if proj_name in fused_mapping: shard_prefixes = [ prefix.replace(proj_name, shard_proj_name) for shard_proj_name in fused_mapping[proj_name] ] is_skipped = None for shard_prefix in shard_prefixes: is_shard_skipped = self.quant_description[shard_prefix + ".weight"] == "FLOAT" if is_skipped is None: is_skipped = is_shard_skipped elif is_shard_skipped != is_skipped: raise ValueError( f"Detected some but not all shards of {prefix} " "are quantized. 
All shards of fused layers "
                        "need to have the same precision."
                    )
        else:
            is_skipped = any(
                key.startswith(prefix) and key.endswith(".weight") and value == "FLOAT"
                for key, value in self.quant_description.items()
            )

        assert is_skipped is not None
        return is_skipped

    def is_fa_quant_layer(self, prefix):
        if self.enable_fa_quant:
            layer_id_str = "".join(re.findall(r"\.(\d+)\.", prefix))
            if layer_id_str.isdigit() and int(layer_id_str) in self.kvcache_quant_layers:
                return True
        return False

    def is_indexer_quant_layer(self, prefix):
        if self.enable_indexer_quant:
            layer_id_str = "".join(re.findall(r"\.(\d+)\.", prefix))
            if layer_id_str.isdigit() and int(layer_id_str) in self.indexer_quant_layers:
                return True
        return False

    def enabling_fa_quant(self, vllm_config, layer_name) -> bool:
        is_decode_instance = (
            vllm_config.kv_transfer_config is not None
            and vllm_config.kv_transfer_config.is_kv_consumer
            and not vllm_config.kv_transfer_config.is_kv_producer
        )
        return bool(is_decode_instance and self.is_fa_quant_layer(layer_name))

    def get_kv_quant_dtype(self, layer_name, cache_dtype, model_config):
        if self.enable_fa_quant and self.is_fa_quant_layer(layer_name):
            ori_dtype = model_config.dtype
            quant_dtype = torch.int8
            # For MLA models like deepseek, we only quantize the K cache to ensure accuracy
            if model_config.use_mla:
                return quant_dtype, ori_dtype
            else:
                return quant_dtype, quant_dtype
        return cache_dtype, cache_dtype

    def get_kv_quant_split_factor(self, layer_name, kv_head_dim_list):
        if self.enable_fa_quant and self.is_fa_quant_layer(layer_name):
            k_quant_head_dim = kv_head_dim_list[0]
            v_quant_head_dim = kv_head_dim_list[1] * 2
            kv_head_dim_list = [k_quant_head_dim, v_quant_head_dim]
        return calc_split_factor(kv_head_dim_list)

    def maybe_update_config(self, model_name: str, revision: str | None = None) -> None:
        """Load the ModelSlim quantization config from the model directory.

        This method is called by vLLM after get_quant_config() returns
        successfully. Since we return an empty list from get_config_filenames()
        to bypass vLLM's built-in file lookup, we do the actual config loading
        here and provide user-friendly error messages when the config is
        missing.

        Works with both local directories (``/path/to/model``) and remote
        repository identifiers (``org/model-name``). For remote repos the
        lookup goes through the HuggingFace / ModelScope cache via
        ``get_model_file`` to fetch the config if not already cached.

        Args:
            model_name: Path to the model directory or HuggingFace /
                ModelScope repo id.
            revision: Optional revision (branch, tag, or commit hash) for
                remote repos.
        """
        from vllm_ascend.quantization.utils import get_model_file

        # If quant_description is already populated (e.g. from from_config()),
        # there is nothing to do.
if self.quant_description: return # Try to get the config file (local or remote) config_path = get_model_file(model_name, MODELSLIM_CONFIG_FILENAME, revision=revision) if config_path is not None: with open(config_path) as f: self.quant_description = json.load(f) self._apply_extra_quant_adaptations() self._add_kvcache_quant_metadata() return # Collect diagnostic info for the error message json_names: list[str] = [] if os.path.isdir(model_name): json_files = glob.glob(os.path.join(model_name, "*.json")) json_names = [os.path.basename(f) for f in json_files] # Config file not found - raise a friendly error message raise ValueError( "\n" + "=" * 80 + "\n" + "ERROR: ModelSlim Quantization Config Not Found\n" + "=" * 80 + "\n" + "\n" + f"You have enabled '--quantization {ASCEND_QUANTIZATION_METHOD}' " + "(ModelSlim quantization),\n" + f"but the model '{model_name}' does not contain the required\n" + f"quantization config file ('{MODELSLIM_CONFIG_FILENAME}').\n" + "\n" + "This usually means the model weights are NOT quantized by " + "ModelSlim.\n" + "\n" + "Please choose one of the following solutions:\n" + "\n" + " Solution 1: Remove the quantization option " + "(for float/unquantized models)\n" + " " + "-" * 58 + "\n" + f" Remove '--quantization {ASCEND_QUANTIZATION_METHOD}' from " + "your command if you want to\n" + " run the model with the original (float) weights.\n" + "\n" + " Example:\n" + f" vllm serve {model_name}\n" + "\n" + " Solution 2: Quantize your model weights with ModelSlim first\n" + " " + "-" * 58 + "\n" + " Use the ModelSlim tool to quantize your model weights " + "before deployment.\n" + " After quantization, the model directory should contain " + f"'{MODELSLIM_CONFIG_FILENAME}'.\n" + " For more information, please refer to:\n" + " https://gitee.com/ascend/msit/tree/master/msmodelslim\n" + "\n" + (f" (Found JSON files in model directory: {json_names})\n" if json_names else "") + "=" * 80 ) def _apply_extra_quant_adaptations(self) -> None: """Apply extra adaptations to the quant_description dict. This handles known key transformations such as shared_head and weight_packed mappings. """ extra_quant_dict = {} for k in self.quant_description: if "shared_head" in k: new_k = k.replace(".shared_head.", ".") extra_quant_dict[new_k] = self.quant_description[k] if "weight_packed" in k: new_k = k.replace("weight_packed", "weight") extra_quant_dict[new_k] = self.quant_description[k] self.quant_description.update(extra_quant_dict) def _add_kvcache_quant_metadata(self): fa_quant_type = self.quant_description.get("fa_quant_type", "") self.enable_fa_quant = fa_quant_type != "" self.kvcache_quant_layers = [] indexer_quant_type = self.quant_description.get("indexer_quant_type", "") self.enable_indexer_quant = indexer_quant_type != "" self.indexer_quant_layers = [] if self.enable_fa_quant or self.enable_indexer_quant: for key in self.quant_description: _id = "".join(re.findall(r"\.(\d+)\.", key)) if "fa_k.scale" in key: self.kvcache_quant_layers.append(int(_id)) if "indexer.quant_type" in key: self.indexer_quant_layers.append(int(_id))
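

# A typical end-to-end flow (illustrative; the path is a placeholder and the
# quantization method name comes from ASCEND_QUANTIZATION_METHOD, i.e. "ascend"):
#   1. Quantize the checkpoint with msmodelslim so that the output directory
#      contains quant_model_description.json.
#   2. Serve it with the quantization flag enabled, e.g.:
#      vllm serve /path/to/model-w8a8 --quantization ascend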