# # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. # Copyright 2023 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # This file is a part of the vllm-ascend project. # """ModelSlim quantization configuration and model mappings for Ascend. This module provides the AscendModelSlimConfig class for parsing quantization configs generated by the ModelSlim tool, along with model-specific mappings. """ import glob import json import os from collections.abc import Mapping from types import MappingProxyType from typing import Any, Optional import torch from vllm.config import get_current_vllm_config from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import LinearBase from vllm.model_executor.layers.quantization import register_quantization_config from vllm.model_executor.layers.quantization.base_config import QuantizationConfig, QuantizeMethodBase from vllm.model_executor.layers.vocab_parallel_embedding import UnquantizedEmbeddingMethod, VocabParallelEmbedding from vllm.model_executor.models.utils import WeightsMapper from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD from .methods import get_scheme_class # The config filename that ModelSlim generates after quantizing a model. 
MODELSLIM_CONFIG_FILENAME = "quant_model_description.json"

logger = init_logger(__name__)

# key: model_type
# value: orig_to_new_prefix
QUANT_MODEL_PREFIX_MAPPINGS: dict[str, dict[str, str]] = {
    "qwen3_vl_moe": {
        "visual.": "model.visual.",
        "language_model.lm_head.": "lm_head.",
        "language_model.model.": "model.language_model.",
    },
    "qwen3_vl_text": {
        "visual.": "model.visual.",
        "language_model.lm_head.": "lm_head.",
        "language_model.model.": "model.language_model.",
    },
    "kimi_k25": {
        "mm_projector.linear_1": "mm_projector.proj.0",
        "mm_projector.linear_2": "mm_projector.proj.2",
    },
}

# Shard-name tuples shared by many model types below. Stored as tuples and
# copied with list() at each use so no two mapping entries alias the same
# mutable list object.
_QKV_SHARDS = ("q_proj", "k_proj", "v_proj")
_GATE_UP_SHARDS = ("gate_proj", "up_proj")
_EXPERT_SHARDS = ("experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj")
_MLA_QKV_A_SHARDS = ("q_a_proj", "kv_a_proj_with_mqa")


def _mla_moe_mapping() -> dict[str, list[str]]:
    """Return a fresh packed-modules mapping for MLA-style MoE models (DeepSeek family)."""
    return {
        "gate_up_proj": list(_GATE_UP_SHARDS),
        "experts": list(_EXPERT_SHARDS),
        "fused_qkv_a_proj": list(_MLA_QKV_A_SHARDS),
    }


# key: model_type
# value: dict of fused module name -> list of original module names
packed_modules_model_mapping: dict[str, dict[str, list[str]]] = {
    "qwen3_moe": {
        "qkv_proj": list(_QKV_SHARDS),
        "gate_up_proj": list(_GATE_UP_SHARDS),
        "experts": list(_EXPERT_SHARDS),
    },
    "deepseek_v2": _mla_moe_mapping(),
    "deepseek_v3": _mla_moe_mapping(),
    "pangu_ultra_moe": _mla_moe_mapping(),
    "kimi_k2": _mla_moe_mapping(),
    "deepseek_v32": _mla_moe_mapping(),
    "glm_moe_dsa": _mla_moe_mapping(),
    # NOTE 1. The quantized MTP layer of deepseek on the NPU is not quantized;
    # NOTE 2. The description file generated by the current msmodelslim tool does not have
    # MTP layer info. Please manually add it and set the value to FLOAT.
    "deepseek_mtp": {
        "gate_up_proj": list(_GATE_UP_SHARDS),
        "experts": list(_EXPERT_SHARDS),
    },
    "pangu_ultra_moe_mtp": _mla_moe_mapping(),
    "qwen3_next": {
        "qkv_proj": list(_QKV_SHARDS),
        "gate_up_proj": list(_GATE_UP_SHARDS),
        "in_proj": ["in_proj_qkvz", "in_proj_ba"],
        "experts": list(_EXPERT_SHARDS),
    },
    "qwen2_5_vl": {
        "qkv_proj": list(_QKV_SHARDS),
        "gate_up_proj": list(_GATE_UP_SHARDS),
    },
    "qwen3_vl_moe": {
        "qkv_proj": list(_QKV_SHARDS),
        "gate_up_proj": list(_GATE_UP_SHARDS),
        "experts": list(_EXPERT_SHARDS),
    },
    "glm4_moe": {
        "qkv_proj": list(_QKV_SHARDS),
        "gate_up_proj": list(_GATE_UP_SHARDS),
        "experts": list(_EXPERT_SHARDS),
    },
    "glm4_moe_lite": _mla_moe_mapping(),
    "longcat_flash": _mla_moe_mapping(),
    "minimax_m2": {
        "qkv_proj": list(_QKV_SHARDS),
        "experts": ["experts.0.w1", "experts.0.w2", "experts.0.w3"],
    },
}


def get_packed_modules_mapping(model_type: str) -> dict[str, list[str]]:
    """Get packed modules mapping for a model type.

    Args:
        model_type: The model type string (e.g., "deepseek_v3").

    Returns:
        Dictionary mapping fused module names to their component module names.
        Returns empty dict if model_type is not found.
    """
    return packed_modules_model_mapping.get(model_type, {})


def get_prefix_mapping(model_type: str) -> dict[str, str]:
    """Get prefix mapping for a model type.

    Args:
        model_type: The model type string (e.g., "qwen3_vl_moe").

    Returns:
        Dictionary mapping original prefixes to new prefixes.
        Returns empty dict if model_type is not found.
    """
    return QUANT_MODEL_PREFIX_MAPPINGS.get(model_type, {})


def get_linear_quant_type(
    quant_description: dict[str, Any], prefix: str, packed_modules_mapping: dict[str, Any]
) -> str | None:
    """Determine the quantization type for a linear layer.

    For a fused/packed module (e.g. "qkv_proj"), all component shards must
    share one quant type, which is then returned.

    Args:
        quant_description: The quantization description dictionary.
        prefix: The layer prefix.
        packed_modules_mapping: Mapping for packed/fused modules.

    Returns:
        The quantization type string (e.g., "W8A8_DYNAMIC").

    Raises:
        ValueError: If shards of a fused module disagree on their quant type.
    """
    proj_name = prefix.split(".")[-1]
    if proj_name in packed_modules_mapping:
        quant_type = None
        shard_prefixes = [
            prefix.replace(proj_name, shard_proj_name) for shard_proj_name in packed_modules_mapping[proj_name]
        ]
        for shard_prefix in shard_prefixes:
            shard_quant_type = quant_description[shard_prefix + ".weight"]
            if quant_type is None:
                quant_type = shard_quant_type
            elif shard_quant_type != quant_type:
                raise ValueError(
                    f"Not all shards of {prefix} are quantized with same quant type. "
                    f"Shard {proj_name} uses {shard_quant_type}, but another shard "
                    f"uses {quant_type}. Please check quantization config."
                )
    else:
        quant_type = quant_description[prefix + ".weight"]
    return quant_type


def get_quant_type_for_layer(
    quant_description: dict[str, Any],
    prefix: str,
    layer_type: str,
    packed_modules_mapping: dict[str, Any] | None = None,
) -> str | None:
    """Determine the quantization type for a layer.

    Args:
        quant_description: The quantization description dictionary.
        prefix: The layer prefix.
        layer_type: The type of layer ("linear", "moe", "attention").
        packed_modules_mapping: Mapping for packed/fused modules.

    Returns:
        The quantization type string (e.g., "W8A8_DYNAMIC").
    """
    if packed_modules_mapping is None:
        packed_modules_mapping = dict()
    # Attention layers are driven by the global "fa_quant_type" entry.
    if layer_type == "attention" and "fa_quant_type" in quant_description:
        return quant_description["fa_quant_type"]
    # Linear / MoE layers are looked up per weight name.
    return get_linear_quant_type(quant_description, prefix, packed_modules_mapping)


def create_scheme_for_layer(
    quant_description: dict[str, Any],
    prefix: str,
    layer_type: str,
    packed_modules_mapping: dict[str, Any] | None = None,
):
    """Create a quantization scheme instance for a layer.

    Args:
        quant_description: The quantization description dictionary.
        prefix: The layer prefix.
        layer_type: The type of layer ("linear", "moe", "attention").
        packed_modules_mapping: Mapping for packed/fused modules.

    Returns:
        An instance of the appropriate quantization scheme class.

    Raises:
        ValueError: If no quant type can be determined for the layer.
        NotImplementedError: If no scheme class is registered for the
            (quant_type, layer_type) combination.
    """
    logger.info_once("Using the vLLM Ascend modelslim Quantization now!")
    quant_type = get_quant_type_for_layer(quant_description, prefix, layer_type, packed_modules_mapping)
    if quant_type is None:
        raise ValueError(f"Could not determine quantization type for layer {prefix}.")
    # Use registry to get scheme class
    scheme_cls = get_scheme_class(quant_type, layer_type)
    if scheme_cls is not None:
        return scheme_cls()
    raise NotImplementedError(f"Currently, vLLM Ascend doesn't support {quant_type} for {layer_type}.")


@register_quantization_config(ASCEND_QUANTIZATION_METHOD)
class AscendModelSlimConfig(QuantizationConfig):
    """Config class for Ascend ModelSlim quantization.

    This class is a general class that parses quantization configs that are
    supported on Ascend hardware, specifically for models quantized using
    the ModelSlim tool.
    """

    def __init__(self, quant_config: dict[str, Any] | None = None):
        super().__init__()
        # Per-weight quant descriptions, e.g. {"model.layers.0...weight": "W8A8_DYNAMIC"}.
        self.quant_description = quant_config if quant_config is not None else {}
        # Defensive default: get_quant_method() reads this attribute even when
        # the model type has no entry in packed_modules_model_mapping. Only set
        # it if the base class did not already provide a mapping.
        if getattr(self, "packed_modules_mapping", None) is None:
            self.packed_modules_mapping: dict[str, list[str]] = {}
        # TODO(whx): remove this adaptation after adding "shared_head"
        # to prefix of DeepSeekShareHead in vLLM.
        self._apply_extra_quant_adaptations()

    def __repr__(self) -> str:
        return "AscendModelSlimConfig:\n" + super().__repr__()

    @classmethod
    def get_name(cls) -> str:
        return ASCEND_QUANTIZATION_METHOD

    @classmethod
    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
        return [torch.int8, torch.float16, torch.bfloat16]

    @classmethod
    def get_min_capability(cls) -> int:
        # CUDA-style compute capability has no meaning on Ascend hardware.
        raise NotImplementedError('Ascend hardware does not support "get_min_capability" feature.')

    @classmethod
    def get_config_filenames(cls) -> list[str]:
        # Return empty list so that vllm's get_quant_config() skips the
        # file-based lookup (which raises an unfriendly "Cannot find the
        # config file for ascend" error when the model is not quantized).
        # Instead, the config file is loaded in maybe_update_config(),
        # which can provide a user-friendly error message.
        return []

    @classmethod
    def from_config(cls, config: dict[str, Any]) -> "AscendModelSlimConfig":
        return cls(config)

    @classmethod
    def override_quantization_method(cls, hf_quant_cfg, user_quant) -> str | None:
        """Select Ascend quantization when the HF config carries no quant method
        but an NPU is available; otherwise defer to the default resolution."""
        if hf_quant_cfg is not None:
            quant_method = hf_quant_cfg.get("quant_method", None)
            if not quant_method and torch.npu.is_available():
                return ASCEND_QUANTIZATION_METHOD
        return None

    def quant_prefix_mapper(self, model_type: str, prefix: str) -> str:
        """Remap a layer prefix using the model-specific prefix mapping, if any."""
        # TODO (Levi-JQ): will be removed when QuantizationConfig.apply_vllm_mapper is implemented
        prefix_mapping = get_prefix_mapping(model_type)
        if prefix_mapping:
            hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix=prefix_mapping)
            return hf_to_vllm_mapper._map_name(prefix)
        return prefix

    def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["QuantizeMethodBase"]:
        """Return the Ascend quantize method for ``layer``, or None if unhandled.

        Normalizes the layer prefix (MoE expert indices, model-specific
        renames), then dispatches on the layer class to the matching
        Ascend method adapter.
        """
        from .method_adapters import (
            AscendEmbeddingMethod,
            AscendFusedMoEMethod,
            AscendKVCacheMethod,
            AscendLinearMethod,
        )

        vllm_config = get_current_vllm_config()
        model_type = vllm_config.model_config.hf_config.model_type
        if model_type in ["minimax", "minimax_m2"]:
            # Adapt to Minimax architecture: update layer names to MoE convention
            prefix = prefix.replace("mlp", "block_sparse_moe")

        # Normalize the prefix by stripping specific expert indices (e.g., 'experts.0' -> 'experts')
        parts = prefix.split(".")
        if "experts" in parts and len(parts) > 2:
            exp_idx = parts.index("experts")
            if exp_idx + 1 < len(parts) and parts[exp_idx + 1].isdigit():
                parts = parts[: exp_idx + 1]
                prefix = ".".join(parts)

        if model_type in packed_modules_model_mapping:
            self.packed_modules_mapping = get_packed_modules_mapping(model_type)
        prefix = self.quant_prefix_mapper(model_type, prefix)

        from vllm.model_executor.layers.attention import Attention

        if model_type != "kimi_k2" and prefix.startswith("language_model"):
            prefix = prefix.split(".", 1)[-1]

        if isinstance(layer, LinearBase):
            if self.is_layer_skipped_ascend(prefix, self.packed_modules_mapping):
                # Delayed import to avoid circular import
                from vllm_ascend.ops.linear import AscendUnquantizedLinearMethod

                return AscendUnquantizedLinearMethod()
            scheme = create_scheme_for_layer(self.quant_description, prefix, "linear", self.packed_modules_mapping)
            return AscendLinearMethod(scheme)
        elif (
            isinstance(layer, Attention)
            and "fa_quant_type" in self.quant_description
            and self.quant_description["fa_quant_type"] is not None
        ):
            scheme = create_scheme_for_layer(self.quant_description, prefix, "attention", self.packed_modules_mapping)
            return AscendKVCacheMethod(scheme)
        elif isinstance(layer, FusedMoE):
            if self.is_layer_skipped_ascend(prefix, self.packed_modules_mapping):
                # Delayed import to avoid circular import
                from vllm_ascend.ops.fused_moe.fused_moe import AscendUnquantizedFusedMoEMethod

                return AscendUnquantizedFusedMoEMethod(layer.moe_config)
            scheme = create_scheme_for_layer(self.quant_description, prefix, "moe", self.packed_modules_mapping)
            return AscendFusedMoEMethod(scheme, layer.moe_config)
        elif isinstance(layer, VocabParallelEmbedding):
            if self.is_layer_skipped_ascend(prefix, self.packed_modules_mapping):
                return UnquantizedEmbeddingMethod()
            scheme = create_scheme_for_layer(self.quant_description, prefix, "linear", self.packed_modules_mapping)
            return AscendEmbeddingMethod(scheme)
        return None

    def is_layer_skipped_ascend(self, prefix: str, fused_mapping: Mapping[str, list[str]] = MappingProxyType({})):
        """Return True if the layer at ``prefix`` is unquantized ("FLOAT").

        For fused modules, all shards must agree on being quantized or not.

        Raises:
            ValueError: If only some shards of a fused module are quantized.
        """
        # adapted from vllm.model_executor.layers.quantization.utils.quant_utils.is_layer_skipped
        proj_name = prefix.split(".")[-1]
        if proj_name in fused_mapping:
            shard_prefixes = [
                prefix.replace(proj_name, shard_proj_name) for shard_proj_name in fused_mapping[proj_name]
            ]
            is_skipped = None
            for shard_prefix in shard_prefixes:
                is_shard_skipped = self.quant_description[shard_prefix + ".weight"] == "FLOAT"
                if is_skipped is None:
                    is_skipped = is_shard_skipped
                elif is_shard_skipped != is_skipped:
                    raise ValueError(
                        f"Detected some but not all shards of {prefix} "
                        "are quantized. All shards of fused layers need "
                        "to have the same precision."
                    )
        else:
            is_skipped = self.quant_description[prefix + ".weight"] == "FLOAT"
        assert is_skipped is not None
        return is_skipped

    def maybe_update_config(self, model_name: str) -> None:
        """Load the ModelSlim quantization config from model directory.

        This method is called by vllm after get_quant_config() returns
        successfully. Since we return an empty list from get_config_filenames()
        to bypass vllm's built-in file lookup, we do the actual config loading
        here and provide user-friendly error messages when the config is
        missing.

        Args:
            model_name: Path to the model directory or model name.

        Raises:
            ValueError: If the ModelSlim config file cannot be found.
        """
        # If quant_description is already populated (e.g. from from_config()),
        # there is nothing to do.
        if self.quant_description:
            return

        # Try to find and load the ModelSlim config file
        if os.path.isdir(model_name):
            config_path = os.path.join(model_name, MODELSLIM_CONFIG_FILENAME)
            if os.path.isfile(config_path):
                with open(config_path) as f:
                    self.quant_description = json.load(f)
                self._apply_extra_quant_adaptations()
                return
            # Check if there are any json files at all to help diagnose
            json_files = glob.glob(os.path.join(model_name, "*.json"))
            json_names = [os.path.basename(f) for f in json_files]
        else:
            json_names = []

        # Config file not found - raise a friendly error message
        raise ValueError(
            "\n"
            + "=" * 80
            + "\n"
            + "ERROR: ModelSlim Quantization Config Not Found\n"
            + "=" * 80
            + "\n"
            + "\n"
            + f"You have enabled '--quantization {ASCEND_QUANTIZATION_METHOD}' "
            + "(ModelSlim quantization),\n"
            + f"but the model at '{model_name}' does not contain the required\n"
            + f"quantization config file ('{MODELSLIM_CONFIG_FILENAME}').\n"
            + "\n"
            + "This usually means the model weights are NOT quantized by "
            + "ModelSlim.\n"
            + "\n"
            + "Please choose one of the following solutions:\n"
            + "\n"
            + " Solution 1: Remove the quantization option "
            + "(for float/unquantized models)\n"
            + " "
            + "-" * 58
            + "\n"
            + f" Remove '--quantization {ASCEND_QUANTIZATION_METHOD}' from "
            + "your command if you want to\n"
            + " run the model with the original (float) weights.\n"
            + "\n"
            + " Example:\n"
            + f" vllm serve {model_name}\n"
            + "\n"
            + " Solution 2: Quantize your model weights with ModelSlim first\n"
            + " "
            + "-" * 58
            + "\n"
            + " Use the ModelSlim tool to quantize your model weights "
            + "before deployment.\n"
            + " After quantization, the model directory should contain "
            + f"'{MODELSLIM_CONFIG_FILENAME}'.\n"
            + " For more information, please refer to:\n"
            + " https://gitee.com/ascend/msit/tree/master/msmodelslim\n"
            + "\n"
            + (f" (Found JSON files in model directory: {json_names})\n" if json_names else "")
            + "=" * 80
        )

    def _apply_extra_quant_adaptations(self) -> None:
        """Apply extra adaptations to the quant_description dict.

        This handles known key transformations such as shared_head and
        weight_packed mappings.
        """
        extra_quant_dict = {}
        for k in self.quant_description:
            if "shared_head" in k:
                new_k = k.replace(".shared_head.", ".")
                extra_quant_dict[new_k] = self.quant_description[k]
            if "weight_packed" in k:
                new_k = k.replace("weight_packed", "weight")
                extra_quant_dict[new_k] = self.quant_description[k]
        self.quant_description.update(extra_quant_dict)

    def get_scaled_act_names(self) -> list[str]:
        return []