#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
"""ModelSlim quantization configuration and model mappings for Ascend.

This module provides the AscendModelSlimConfig class for parsing quantization
configs generated by the ModelSlim tool, along with model-specific mappings.
"""

import glob
import json
import os
import re
from collections.abc import Mapping
from types import MappingProxyType
from typing import Any, Optional

import torch
from vllm.config import get_current_vllm_config
from vllm.logger import init_logger
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.linear import LinearBase
from vllm.model_executor.layers.quantization import register_quantization_config
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig, QuantizeMethodBase
from vllm.model_executor.layers.vocab_parallel_embedding import UnquantizedEmbeddingMethod, VocabParallelEmbedding
from vllm.model_executor.models.utils import WeightsMapper

from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, calc_split_factor

from .methods import get_scheme_class

# The config filename that ModelSlim generates after quantizing a model.
MODELSLIM_CONFIG_FILENAME = "quant_model_description.json"

logger = init_logger(__name__)

# key: model_type
# value: orig_to_new_prefix
QUANT_MODEL_PREFIX_MAPPINGS: dict[str, dict[str, str]] = {
    "qwen3_vl_moe": {
        "visual.": "model.visual.",
        "language_model.lm_head.": "lm_head.",
        "language_model.model.": "model.language_model.",
    },
    "qwen3_vl_text": {
        "visual.": "model.visual.",
        "language_model.lm_head.": "lm_head.",
        "language_model.model.": "model.language_model.",
    },
    "kimi_k25": {
        "mm_projector.linear_1": "mm_projector.proj.0",
        "mm_projector.linear_2": "mm_projector.proj.2",
    },
    "qwen3_omni_moe": {
        "language_model.lm_head.": "thinker.lm_head.",
        "language_model.model.": "thinker.model.",
        "visual.": "thinker.visual.",
    },
    "qwen2_5_omni": {
        "language_model.lm_head.": "thinker.lm_head.",
        "language_model.model.": "thinker.model.",
        "visual.": "thinker.visual.",
    },
    "qwen2_5_omni_text": {
        "language_model.": "thinker.",
    },
    "glm4v_moe": {
        "visual.": "model.visual.",
        "language_model.lm_head.": "lm_head.",
        "language_model.model.": "model.language_model.",
    },
    "glm4v_moe_text": {
        "visual.": "model.visual.",
        "language_model.lm_head.": "lm_head.",
        "language_model.model.": "model.language_model.",
    },
    "qwen3_5": {
        "visual.": "model.visual.",
        "language_model.lm_head.": "lm_head.",
        "language_model.model.": "model.language_model.",
    },
    "qwen3_5_moe": {
        "visual.": "model.visual.",
        "language_model.lm_head.": "lm_head.",
        "language_model.model.": "model.language_model.",
    },
}
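
# Illustrative usage (a sketch, not executed at import time; the layer name below
# is hypothetical): quant_prefix_mapper() below feeds one of these tables into
# WeightsMapper to rewrite checkpoint prefixes into vLLM module names, e.g. for
# "qwen3_vl_moe":
#
#     mapper = WeightsMapper(orig_to_new_prefix=QUANT_MODEL_PREFIX_MAPPINGS["qwen3_vl_moe"])
#     mapper._map_name("language_model.model.layers.0.mlp.gate_proj")
#     # -> "model.language_model.layers.0.mlp.gate_proj"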

# key: model_type
# value: dict of fused module name -> list of original module names
packed_modules_model_mapping: dict[str, dict[str, list[str]]] = {
    "qwen3_moe": {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "qwen3_5": {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
        "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
        "in_proj_ba": ["in_proj_b", "in_proj_a"],
    },
    "qwen3_5_moe": {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
        "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
        "in_proj_ba": ["in_proj_b", "in_proj_a"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "deepseek_v2": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "deepseek_v3": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "pangu_ultra_moe": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "kimi_k2": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "deepseek_v32": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "glm_moe_dsa": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    # NOTE 1. The MTP layer of DeepSeek is not quantized on the NPU.
    # NOTE 2. The description file generated by the current msmodelslim tool does not
    # include MTP layer info. Please add it manually and set the value to FLOAT.
    "deepseek_mtp": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "pangu_ultra_moe_mtp": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "qwen3_next": {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
        "in_proj": ["in_proj_qkvz", "in_proj_ba"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "qwen2_5_vl": {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
    },
    "qwen3_vl_moe": {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "glm4_moe": {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "glm4_moe_lite": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "glm4v_moe": {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "glm4v_moe_text": {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "longcat_flash": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "minimax_m2": {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "experts": ["experts.0.w1", "experts.0.w2", "experts.0.w3"],
    },
    "qwen3_omni_moe": {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "attn_qkv_proj": ["attn_q_proj", "attn_k_proj", "attn_v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "qwen2_5_omni": {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "attn_qkv_proj": ["attn_q_proj", "attn_k_proj", "attn_v_proj"],
        "qkv": ["q", "k", "v"],
        "gate_up_proj": ["gate_proj", "up_proj"],
    },
}
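
# Illustrative example (a sketch; the layer name below is hypothetical): for a
# "qwen3_moe" model, a fused "experts" module with prefix
#     "model.layers.0.mlp.experts"
# is expanded via the entries above before the quant-type lookup, so the
# "<prefix>.weight" keys checked in the quant description are
#     "model.layers.0.mlp.experts.0.gate_proj.weight"
#     "model.layers.0.mlp.experts.0.up_proj.weight"
#     "model.layers.0.mlp.experts.0.down_proj.weight"
# All shards of a fused module must share the same quant type; see
# get_linear_quant_type() below.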

def get_packed_modules_mapping(model_type: str) -> dict[str, list[str]]:
    """Get the packed modules mapping for a model type.

    Args:
        model_type: The model type string (e.g., "deepseek_v3").

    Returns:
        Dictionary mapping fused module names to their component module names.
        Returns an empty dict if model_type is not found.
    """
    return packed_modules_model_mapping.get(model_type, {})


def get_prefix_mapping(model_type: str) -> dict[str, str]:
    """Get the prefix mapping for a model type.

    Args:
        model_type: The model type string (e.g., "qwen3_vl_moe").

    Returns:
        Dictionary mapping original prefixes to new prefixes.
        Returns an empty dict if model_type is not found.
    """
    return QUANT_MODEL_PREFIX_MAPPINGS.get(model_type, {})


def get_linear_quant_type(
    quant_description: dict[str, Any], prefix: str, packed_modules_mapping: dict[str, Any]
) -> str | None:
    """Determine the quantization type for a linear layer.

    Args:
        quant_description: The quantization description dictionary.
        prefix: The layer prefix.
        packed_modules_mapping: Mapping for packed/fused modules.

    Returns:
        The quantization type string (e.g., "W8A8_DYNAMIC").
    """
    proj_name = prefix.split(".")[-1]
    if proj_name in packed_modules_mapping:
        quant_type = None
        shard_prefixes = [
            prefix.replace(proj_name, shard_proj_name) for shard_proj_name in packed_modules_mapping[proj_name]
        ]
        for shard_prefix in shard_prefixes:
            shard_quant_type = quant_description[shard_prefix + ".weight"]
            if quant_type is None:
                quant_type = shard_quant_type
            elif shard_quant_type != quant_type:
                raise ValueError(
                    f"Not all shards of {prefix} are quantized with the same quant type. "
                    f"Shard {proj_name} uses {shard_quant_type}, but another shard "
                    f"uses {quant_type}. Please check the quantization config."
                )
    else:
        quant_type = quant_description[prefix + ".weight"]
    return quant_type


def get_quant_type_for_layer(
    quant_description: dict[str, Any],
    prefix: str,
    layer_type: str,
    packed_modules_mapping: dict[str, Any] | None = None,
) -> str | None:
    """Determine the quantization type for a layer.

    Args:
        quant_description: The quantization description dictionary.
        prefix: The layer prefix.
        layer_type: The type of layer ("linear", "moe", "attention").
        packed_modules_mapping: Mapping for packed/fused modules.

    Returns:
        The quantization type string (e.g., "W8A8_DYNAMIC").
    """
    if packed_modules_mapping is None:
        packed_modules_mapping = dict()
    # Attention
    if layer_type == "attention" and "fa_quant_type" in quant_description:
        return quant_description["fa_quant_type"]
    # Linear / MoE
    return get_linear_quant_type(quant_description, prefix, packed_modules_mapping)


def create_scheme_for_layer(
    quant_description: dict[str, Any],
    prefix: str,
    layer_type: str,
    packed_modules_mapping: dict[str, Any] | None = None,
):
    """Create a quantization scheme instance for a layer.

    Args:
        quant_description: The quantization description dictionary.
        prefix: The layer prefix.
        layer_type: The type of layer ("linear", "moe", "attention").
        packed_modules_mapping: Mapping for packed/fused modules.

    Returns:
        An instance of the appropriate quantization scheme class.
    """
    logger.info_once("Using the vLLM Ascend modelslim Quantization now!")
    quant_type = get_quant_type_for_layer(quant_description, prefix, layer_type, packed_modules_mapping)
    if quant_type is None:
        raise ValueError(f"Could not determine quantization type for layer {prefix}.")
    # Use the registry to get the scheme class
    scheme_cls = get_scheme_class(quant_type, layer_type)
    if scheme_cls is not None:
        return scheme_cls()
    raise NotImplementedError(f"Currently, vLLM Ascend doesn't support {quant_type} for {layer_type}.")
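
# Illustrative usage (a minimal sketch; the keys below are hypothetical, the real
# description comes from quant_model_description.json generated by ModelSlim):
#
#     quant_description = {
#         "model.layers.0.self_attn.q_proj.weight": "W8A8_DYNAMIC",
#         "model.layers.0.self_attn.k_proj.weight": "W8A8_DYNAMIC",
#         "model.layers.0.self_attn.v_proj.weight": "W8A8_DYNAMIC",
#         "model.layers.0.mlp.down_proj.weight": "FLOAT",
#     }
#     get_quant_type_for_layer(
#         quant_description,
#         "model.layers.0.self_attn.qkv_proj",
#         "linear",
#         {"qkv_proj": ["q_proj", "k_proj", "v_proj"]},
#     )
#     # -> "W8A8_DYNAMIC"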

@register_quantization_config(ASCEND_QUANTIZATION_METHOD)
class AscendModelSlimConfig(QuantizationConfig):
    """Config class for Ascend ModelSlim quantization.

    This class is a general class that parses quantization configs that are
    supported on Ascend hardware, specifically for models quantized using the
    ModelSlim tool.
    """

    def __init__(self, quant_config: dict[str, Any] | None = None):
        super().__init__()
        self.quant_description = quant_config if quant_config is not None else {}
        # TODO(whx): remove this adaptation after adding "shared_head"
        # to prefix of DeepSeekShareHead in vLLM.
        self._apply_extra_quant_adaptations()
        self._add_kvcache_quant_metadata()

    def __repr__(self) -> str:
        return "AscendModelSlimConfig:\n" + super().__repr__()

    @classmethod
    def get_name(cls) -> str:
        return ASCEND_QUANTIZATION_METHOD

    @classmethod
    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
        return [torch.int8, torch.float16, torch.bfloat16]

    @classmethod
    def get_min_capability(cls) -> int:
        raise NotImplementedError('Ascend hardware does not support the "get_min_capability" feature.')

    @classmethod
    def get_config_filenames(cls) -> list[str]:
        # Return an empty list so that vllm's get_quant_config() skips the
        # file-based lookup (which raises an unfriendly "Cannot find the
        # config file for ascend" error when the model is not quantized).
        # Instead, the config file is loaded in maybe_update_config(),
        # which can provide a user-friendly error message.
        return []

    @classmethod
    def from_config(cls, config: dict[str, Any]) -> "AscendModelSlimConfig":
        return cls(config)

    @classmethod
    def override_quantization_method(cls, hf_quant_cfg, user_quant) -> str | None:
        if hf_quant_cfg is not None:
            quant_method = hf_quant_cfg.get("quant_method", None)
            if not quant_method and torch.npu.is_available():
                return ASCEND_QUANTIZATION_METHOD
        return None

    def quant_prefix_mapper(self, model_type: str, prefix: str) -> str:
        # TODO (Levi-JQ): will be removed when QuantizationConfig.apply_vllm_mapper is implemented
        prefix_mapping = QUANT_MODEL_PREFIX_MAPPINGS.get(model_type)
        if prefix_mapping:
            hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix=prefix_mapping)
            return hf_to_vllm_mapper._map_name(prefix)
        return prefix

    def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["QuantizeMethodBase"]:
        from .method_adapters import (
            AscendEmbeddingMethod,
            AscendFusedMoEMethod,
            AscendKVCacheMethod,
            AscendLinearMethod,
        )

        vllm_config = get_current_vllm_config()
        model_type = vllm_config.model_config.hf_config.model_type
        if model_type in ["minimax", "minimax_m2"]:
            # Adapt to the Minimax architecture: update layer names to the MoE convention
            prefix = prefix.replace("mlp", "block_sparse_moe")

        # Normalize the prefix by stripping specific expert indices (e.g., 'experts.0' -> 'experts')
        parts = prefix.split(".")
        if "experts" in parts and len(parts) > 2:
            exp_idx = parts.index("experts")
            if exp_idx + 1 < len(parts) and parts[exp_idx + 1].isdigit():
                parts = parts[: exp_idx + 1]
                prefix = ".".join(parts)

        if model_type in packed_modules_model_mapping:
            self.packed_modules_mapping = packed_modules_model_mapping[model_type]

        prefix = self.quant_prefix_mapper(model_type, prefix)
        if model_type != "kimi_k2":
            if prefix.startswith("language_model"):
                prefix = prefix.split(".", 1)[-1]

        if isinstance(layer, LinearBase):
            if self.is_layer_skipped_ascend(prefix, self.packed_modules_mapping):
                # Delayed import to avoid circular import
                from vllm_ascend.ops.linear import AscendUnquantizedLinearMethod

                return AscendUnquantizedLinearMethod()
            scheme = create_scheme_for_layer(self.quant_description, prefix, "linear", self.packed_modules_mapping)
            return AscendLinearMethod(scheme)
        elif isinstance(layer, AttentionLayerBase) and self.is_fa_quant_layer(prefix):
            scheme = create_scheme_for_layer(self.quant_description, prefix, "attention", self.packed_modules_mapping)
            return AscendKVCacheMethod(scheme)
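
        # Illustrative dispatch (a sketch; the layer name is hypothetical): with a
        # quant description such as
        #     {"model.layers.0.mlp.gate_proj.weight": "W8A8_DYNAMIC",
        #      "model.layers.0.mlp.up_proj.weight": "W8A8_DYNAMIC"}
        # a LinearBase layer with prefix "model.layers.0.mlp.gate_up_proj" gets an
        # AscendLinearMethod wrapping the W8A8_DYNAMIC scheme, while a linear layer
        # whose shard weights are marked "FLOAT" is treated as skipped and falls
        # back to AscendUnquantizedLinearMethod.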
        elif isinstance(layer, FusedMoE):
            if self.is_layer_skipped_ascend(prefix, self.packed_modules_mapping):
                # Delayed import to avoid circular import
                from vllm_ascend.ops.fused_moe.fused_moe import AscendUnquantizedFusedMoEMethod

                return AscendUnquantizedFusedMoEMethod(layer.moe_config)
            scheme = create_scheme_for_layer(self.quant_description, prefix, "moe", self.packed_modules_mapping)
            return AscendFusedMoEMethod(scheme, layer.moe_config)
        elif isinstance(layer, VocabParallelEmbedding):
            if self.is_layer_skipped_ascend(prefix, self.packed_modules_mapping):
                return UnquantizedEmbeddingMethod()
            scheme = create_scheme_for_layer(self.quant_description, prefix, "linear", self.packed_modules_mapping)
            return AscendEmbeddingMethod(scheme)
        return None

    def is_layer_skipped_ascend(self, prefix: str, fused_mapping: Mapping[str, list[str]] = MappingProxyType({})):
        # Adapted from vllm.model_executor.layers.quantization.utils.quant_utils.is_layer_skipped
        proj_name = prefix.split(".")[-1]
        if proj_name in fused_mapping:
            shard_prefixes = [
                prefix.replace(proj_name, shard_proj_name) for shard_proj_name in fused_mapping[proj_name]
            ]
            is_skipped = None
            for shard_prefix in shard_prefixes:
                is_shard_skipped = self.quant_description[shard_prefix + ".weight"] == "FLOAT"
                if is_skipped is None:
                    is_skipped = is_shard_skipped
                elif is_shard_skipped != is_skipped:
                    raise ValueError(
                        f"Detected some but not all shards of {prefix} "
                        "are quantized. All shards of fused layers "
                        "must have the same precision."
                    )
        else:
            is_skipped = any(
                key.startswith(prefix) and key.endswith(".weight") and value == "FLOAT"
                for key, value in self.quant_description.items()
            )
        assert is_skipped is not None
        return is_skipped

    def is_fa_quant_layer(self, prefix):
        if self.enable_fa_quant:
            layer_id_str = "".join(re.findall(r"\.(\d+)\.", prefix))
            if layer_id_str.isdigit() and int(layer_id_str) in self.kvcache_quant_layers:
                return True
        return False

    def enabling_fa_quant(self, vllm_config, layer_name) -> bool:
        is_decode_instance = (
            vllm_config.kv_transfer_config is not None
            and vllm_config.kv_transfer_config.is_kv_consumer
            and not vllm_config.kv_transfer_config.is_kv_producer
        )
        return bool(is_decode_instance and self.is_fa_quant_layer(layer_name))

    def get_kv_quant_dtype(self, layer_name, cache_dtype, model_config):
        if self.enable_fa_quant and self.is_fa_quant_layer(layer_name):
            ori_dtype = model_config.dtype
            quant_dtype = torch.int8
            # For MLA models like DeepSeek, only the K cache is quantized to preserve accuracy
            if model_config.use_mla:
                return quant_dtype, ori_dtype
            else:
                return quant_dtype, quant_dtype
        return cache_dtype, cache_dtype

    def get_kv_quant_split_factor(self, layer_name, kv_head_dim_list):
        if self.enable_fa_quant and self.is_fa_quant_layer(layer_name):
            k_quant_head_dim = kv_head_dim_list[0]
            v_quant_head_dim = kv_head_dim_list[1] * 2
            kv_head_dim_list = [k_quant_head_dim, v_quant_head_dim]
        return calc_split_factor(kv_head_dim_list)
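
    # Illustrative metadata (a sketch; the layer index is hypothetical): a quant
    # description with a non-empty "fa_quant_type" entry and a key such as
    #     "model.layers.3.self_attn.fa_k.scale"
    # enables FA quantization and records layer 3 in kvcache_quant_layers (see
    # _add_kvcache_quant_metadata() below), so get_kv_quant_dtype() returns
    # (torch.int8, <model dtype>) for that layer on MLA models, where only the
    # K cache is quantized, and (torch.int8, torch.int8) otherwise.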

    def maybe_update_config(self, model_name: str, revision: str | None = None) -> None:
        """Load the ModelSlim quantization config from the model directory.

        This method is called by vllm after get_quant_config() returns
        successfully. Since we return an empty list from get_config_filenames()
        to bypass vllm's built-in file lookup, we do the actual config loading
        here and provide user-friendly error messages when the config is
        missing.

        Works with both local directories (``/path/to/model``) and remote
        repository identifiers (``org/model-name``). For remote repos the
        lookup goes through the HuggingFace / ModelScope cache via
        ``get_model_file`` to fetch the config if it is not already cached.

        Args:
            model_name: Path to the model directory or HuggingFace / ModelScope repo id.
            revision: Optional revision (branch, tag, or commit hash) for remote repos.
        """
        from vllm_ascend.quantization.utils import get_model_file

        # If quant_description is already populated (e.g. from from_config()),
        # there is nothing to do.
        if self.quant_description:
            return

        # Try to get the config file (local or remote)
        config_path = get_model_file(model_name, MODELSLIM_CONFIG_FILENAME, revision=revision)
        if config_path is not None:
            with open(config_path) as f:
                self.quant_description = json.load(f)
            self._apply_extra_quant_adaptations()
            self._add_kvcache_quant_metadata()
            return

        # Collect diagnostic info for the error message
        json_names: list[str] = []
        if os.path.isdir(model_name):
            json_files = glob.glob(os.path.join(model_name, "*.json"))
            json_names = [os.path.basename(f) for f in json_files]

        # Config file not found - raise a friendly error message
        raise ValueError(
            "\n"
            + "=" * 80
            + "\n"
            + "ERROR: ModelSlim Quantization Config Not Found\n"
            + "=" * 80
            + "\n"
            + "\n"
            + f"You have enabled '--quantization {ASCEND_QUANTIZATION_METHOD}' "
            + "(ModelSlim quantization),\n"
            + f"but the model '{model_name}' does not contain the required\n"
            + f"quantization config file ('{MODELSLIM_CONFIG_FILENAME}').\n"
            + "\n"
            + "This usually means the model weights are NOT quantized by "
            + "ModelSlim.\n"
            + "\n"
            + "Please choose one of the following solutions:\n"
            + "\n"
            + "  Solution 1: Remove the quantization option "
            + "(for float/unquantized models)\n"
            + "  "
            + "-" * 58
            + "\n"
            + f"  Remove '--quantization {ASCEND_QUANTIZATION_METHOD}' from "
            + "your command if you want to\n"
            + "  run the model with the original (float) weights.\n"
            + "\n"
            + "  Example:\n"
            + f"    vllm serve {model_name}\n"
            + "\n"
            + "  Solution 2: Quantize your model weights with ModelSlim first\n"
            + "  "
            + "-" * 58
            + "\n"
            + "  Use the ModelSlim tool to quantize your model weights "
            + "before deployment.\n"
            + "  After quantization, the model directory should contain "
            + f"'{MODELSLIM_CONFIG_FILENAME}'.\n"
            + "  For more information, please refer to:\n"
            + "  https://gitee.com/ascend/msit/tree/master/msmodelslim\n"
            + "\n"
            + (f"  (Found JSON files in model directory: {json_names})\n" if json_names else "")
            + "=" * 80
        )

    def _apply_extra_quant_adaptations(self) -> None:
        """Apply extra adaptations to the quant_description dict.

        This handles known key transformations such as shared_head and
        weight_packed mappings.
        """
        extra_quant_dict = {}
        for k in self.quant_description:
            if "shared_head" in k:
                new_k = k.replace(".shared_head.", ".")
                extra_quant_dict[new_k] = self.quant_description[k]
            if "weight_packed" in k:
                new_k = k.replace("weight_packed", "weight")
                extra_quant_dict[new_k] = self.quant_description[k]
        self.quant_description.update(extra_quant_dict)

    def get_scaled_act_names(self) -> list[str]:
        return []

    def _add_kvcache_quant_metadata(self):
        fa_quant_type = self.quant_description.get("fa_quant_type", "")
        self.enable_fa_quant = fa_quant_type != ""
        self.kvcache_quant_layers = []
        if self.enable_fa_quant:
            for key in self.quant_description:
                if "fa_k.scale" in key:
                    _id = "".join(re.findall(r"\.(\d+)\.", key))
                    self.kvcache_quant_layers.append(int(_id))
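
# Typical invocation (for reference; assumes ASCEND_QUANTIZATION_METHOD resolves
# to "ascend", matching the error message above): serve a ModelSlim-quantized
# checkpoint with
#     vllm serve <quantized-model-path> --quantization ascend
# where the checkpoint directory contains quant_model_description.json.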