### What this PR does / why we need it?
This PR introduces several upstream-`vllm`-aligned lint hooks into
`vllm-ascend` and wires them into the `pre-commit` flow.
Main changes in this PR:
- add `check-boolean-context-manager` to catch boolean expressions in
`with` statements (see the example after this list)
- add `check-forbidden-imports` to flag direct `re` imports and
disallowed direct `triton` imports
- enable shell script linting through `tools/shellcheck.sh`
- add a root `.clang-format` aligned with upstream `vllm` and enable
`clang-format` in `pre-commit`; all of `csrc/**` is temporarily
**excluded** so this PR does not pull a large native-code reformat into
the diff
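
For context, here is a minimal, hypothetical sketch of the kind of code the two new Python hooks are aimed at; the exact rules live in the upstream-aligned hook scripts, so treat the snippet as an illustration rather than the authoritative rule set:

```python
import contextlib

import regex as re  # `check-forbidden-imports` expects `regex` instead of the stdlib `re`

cm_a, cm_b = contextlib.nullcontext("a"), contextlib.nullcontext("b")

# Flagged by `check-boolean-context-manager`: `cm_a and cm_b` evaluates to
# `cm_b`, so `cm_a` is silently never entered (usually a missing comma).
with cm_a and cm_b as value:
    print(value)  # -> b

# Intended form: enter both context managers.
with cm_a as a, cm_b as b:
    print(a, b)  # -> a b

# With `regex` imported as `re`, existing call sites keep working unchanged.
print(re.findall(r"\d+", "model.layers.3.qkv_proj"))  # -> ['3']
```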
This PR focuses on landing the smaller and immediately useful lint
alignment first, without mixing in the larger requirements-management
migration.
### Does this PR introduce _any_ user-facing change?
No.
This PR only updates repository lint configuration, static checks, and
internal import/style enforcement. It does not change runtime behavior
or public interfaces.
### How was this patch tested?
Tested locally in the project virtual environment.
Commands used:
```bash
bash format.sh
```
Verified checks passed:
```text
ruff check...............................................................Passed
ruff format..............................................................Passed
codespell................................................................Passed
typos....................................................................Passed
clang-format.............................................................Passed
Lint GitHub Actions workflow files.......................................Passed
Lint shell scripts.......................................................Passed
Lint PNG exports from excalidraw.........................................Passed
Check for spaces in all filenames........................................Passed
Enforce __init__.py in Python packages...................................Passed
Check for forbidden imports..............................................Passed
Check for boolean ops in with-statements.................................Passed
Suggestion...............................................................Passed
- hook id: suggestion
- duration: 0s
To bypass pre-commit hooks, add --no-verify to git commit.
```
**Note:** `clang-format` is enabled but currently excludes all of `csrc/**`.
- vLLM version: v0.17.0
- vLLM main: 8b6325758c
---------
Signed-off-by: MrZ20 <2609716663@qq.com>
Attached file (781 lines · 30 KiB · Python):
```python
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
"""ModelSlim quantization configuration and model mappings for Ascend.

This module provides the AscendModelSlimConfig class for parsing quantization
configs generated by the ModelSlim tool, along with model-specific mappings.
"""

import glob
import json
import os
from collections.abc import Mapping
from types import MappingProxyType
from typing import Any, Optional

import regex as re
import torch
from vllm.config import get_current_vllm_config
from vllm.logger import logger
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.linear import LinearBase
from vllm.model_executor.layers.quantization import register_quantization_config
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig, QuantizeMethodBase
from vllm.model_executor.layers.vocab_parallel_embedding import UnquantizedEmbeddingMethod, VocabParallelEmbedding
from vllm.model_executor.models.utils import WeightsMapper

from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, calc_split_factor

from .methods import get_scheme_class

# The config filename that ModelSlim generates after quantizing a model.
MODELSLIM_CONFIG_FILENAME = "quant_model_description.json"

# key: model_type
# value: vLLM prefix -> HF prefix mapping (used to convert vLLM layer names to HF format
# for looking up keys in quant_model_description.json)
QUANT_MODEL_PREFIX_MAPPINGS: dict[str, dict[str, str]] = {
    "qwen3_vl_moe": {
        "visual.": "model.visual.",
        "language_model.lm_head.": "lm_head.",
        "language_model.model.": "model.language_model.",
    },
    "qwen3_vl": {
        "visual.": "model.visual.",
        "language_model.lm_head.": "lm_head.",
        "language_model.model.": "model.language_model.",
    },
    "kimi_k25": {
        "mm_projector.linear_1": "mm_projector.proj.0",
        "mm_projector.linear_2": "mm_projector.proj.2",
    },
    "qwen3_omni_moe": {
        "language_model.lm_head.": "thinker.lm_head.",
        "language_model.model.": "thinker.model.",
        "visual.": "thinker.visual.",
    },
    "qwen2_5_omni": {
        "language_model.lm_head.": "thinker.lm_head.",
        "language_model.model.": "thinker.model.",
        "visual.": "thinker.visual.",
    },
    "qwen2_5_omni_text": {
        "language_model.": "thinker.",
        "language_model.lm_head.": "thinker.lm_head.",
        "language_model.model.": "thinker.model.",
    },
    "glm4v_moe": {
        "visual.": "model.visual.",
        "language_model.lm_head.": "lm_head.",
        "language_model.model.": "model.language_model.",
    },
    "glm4v_moe_text": {
        "visual.": "model.visual.",
        "language_model.lm_head.": "lm_head.",
        "language_model.model.": "model.language_model.",
    },
    "kimi_k2": {
        "language_model.layers.": "language_model.model.layers.",
        # mm projector
        "mm_projector.proj.0": "mm_projector.linear_1",
        "mm_projector.proj.2": "mm_projector.linear_2",
    },
}

# key: model_type
# value: dict of fused module name -> list of original module names
packed_modules_model_mapping: dict[str, dict[str, list[str]]] = {
    "qwen3_moe": {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "qwen3_5": {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
        "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
        "in_proj_ba": ["in_proj_b", "in_proj_a"],
    },
    "qwen3_5_moe": {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],
        "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"],
        "in_proj_ba": ["in_proj_b", "in_proj_a"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "deepseek_v2": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "deepseek_v3": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "pangu_ultra_moe": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "kimi_k2": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "deepseek_v32": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "glm_moe_dsa": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    # NOTE 1.The quantized MTP layer of deepseek on the NPU is not quantized;
    # NOTE 2.The description file generated by the current msmodelslim tool does not have
    # MTP layer info. Please manually add it and set the value to FLOAT.
    "deepseek_mtp": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "pangu_ultra_moe_mtp": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "qwen3_next": {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": ["gate_proj", "up_proj"],
        "in_proj": ["in_proj_qkvz", "in_proj_ba"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "qwen2_5_vl": {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
    },
    "qwen3_vl_moe": {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "glm4_moe": {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "glm4_moe_lite": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "glm4v_moe": {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "glm4v_moe_text": {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "longcat_flash": {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
        "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"],
    },
    "minimax_m2": {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "experts": ["experts.0.w1", "experts.0.w2", "experts.0.w3"],
    },
    "qwen3_omni_moe": {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "attn_qkv_proj": [
            "attn_q_proj",
            "attn_k_proj",
            "attn_v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
    },
    "qwen2_5_omni": {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "attn_qkv_proj": [
            "attn_q_proj",
            "attn_k_proj",
            "attn_v_proj",
        ],
        "qkv": [
            "q",
            "k",
            "v",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
    },
}


def get_packed_modules_mapping(model_type: str) -> dict[str, list[str]]:
    """Get packed modules mapping for a model type.

    Args:
        model_type: The model type string (e.g., "deepseek_v3").

    Returns:
        Dictionary mapping fused module names to their component module names.
        Returns empty dict if model_type is not found.
    """
    return packed_modules_model_mapping.get(model_type, {})


def get_prefix_mapping(model_type: str) -> dict[str, str]:
    """Get prefix mapping for a model type.

    Args:
        model_type: The model type string (e.g., "qwen3_vl_moe").

    Returns:
        Dictionary mapping original prefixes to new prefixes.
        Returns empty dict if model_type is not found.
    """
    return QUANT_MODEL_PREFIX_MAPPINGS.get(model_type, {})


def get_linear_quant_type(
    quant_description: dict[str, Any], prefix: str, packed_modules_mapping: dict[str, Any]
) -> str | None:
    """Determine the quantization type for a linear layer.

    Args:
        quant_description: The quantization description dictionary.
        prefix: The layer prefix.
        packed_modules_mapping: Mapping for packed/fused modules.

    Returns:
        The quantization type string (e.g., "W8A8_DYNAMIC").
    """
    proj_name = prefix.split(".")[-1]
    if proj_name in packed_modules_mapping:
        quant_type = None
        shard_prefixes = [
            prefix.replace(proj_name, shard_proj_name) for shard_proj_name in packed_modules_mapping[proj_name]
        ]
        for shard_prefix in shard_prefixes:
            shard_quant_type = quant_description[shard_prefix + ".weight"]

            if quant_type is None:
                quant_type = shard_quant_type
            elif shard_quant_type != quant_type:
                raise ValueError(
                    f"Not all shards of {prefix} are quantized with same quant type."
                    f"Shard {proj_name} uses {shard_quant_type}, but another shard"
                    f"use {quant_type}. Please check quantization config."
                )
    else:
        quant_type = quant_description[prefix + ".weight"]
    return quant_type


def get_quant_type_for_layer(
    quant_description: dict[str, Any],
    prefix: str,
    layer_type: str,
    packed_modules_mapping: dict[str, Any] | None = None,
) -> str | None:
    """Determine the quantization type for a layer.

    Args:
        quant_description: The quantization description dictionary.
        prefix: The layer prefix.
        layer_type: The type of layer ("linear", "moe", "attention").
        packed_modules_mapping: Mapping for packed/fused modules.

    Returns:
        The quantization type string (e.g., "W8A8_DYNAMIC").
    """
    if packed_modules_mapping is None:
        packed_modules_mapping = dict()
    # Attention
    if layer_type == "attention" and "fa_quant_type" in quant_description:
        return quant_description["fa_quant_type"]
    # Linear / MoE
    return get_linear_quant_type(quant_description, prefix, packed_modules_mapping)


def create_scheme_for_layer(
    quant_description: dict[str, Any],
    prefix: str,
    layer_type: str,
    packed_modules_mapping: dict[str, Any] | None = None,
):
    """Create a quantization scheme instance for a layer.

    Args:
        quant_description: The quantization description dictionary.
        prefix: The layer prefix.
        layer_type: The type of layer ("linear", "moe", "attention").
        packed_modules_mapping: Mapping for packed/fused modules.

    Returns:
        An instance of the appropriate quantization scheme class.
    """
    logger.info_once("Using the vLLM Ascend modelslim Quantization now!")
    quant_type = get_quant_type_for_layer(quant_description, prefix, layer_type, packed_modules_mapping)

    if quant_type is None:
        raise ValueError(f"Could not determine quantization type for layer {prefix}.")

    # Use registry to get scheme class
    scheme_cls = get_scheme_class(quant_type, layer_type)
    if scheme_cls is not None:
        return scheme_cls()

    raise NotImplementedError(f"Currently, vLLM Ascend doesn't support {quant_type} for {layer_type}.")


@register_quantization_config(ASCEND_QUANTIZATION_METHOD)
class AscendModelSlimConfig(QuantizationConfig):
    """Config class for Ascend ModelSlim quantization.

    This class is a general class that parses quantization configs
    that are supported on Ascend hardware, specifically for models
    quantized using the ModelSlim tool.
    """

    def __init__(self, quant_config: dict[str, Any] | None = None):
        super().__init__()
        self.quant_description = quant_config if quant_config is not None else {}
        # TODO(whx): remove this adaptation after adding "shared_head"
        # to prefix of DeepSeekShareHead in vLLM.
        extra_quant_dict = {}
        for k in self.quant_description:
            if "shared_head" in k:
                new_k = k.replace(".shared_head.", ".")
                extra_quant_dict[new_k] = self.quant_description[k]
            if "weight_packed" in k:
                new_k = k.replace("weight_packed", "weight")
                extra_quant_dict[new_k] = self.quant_description[k]
        self.quant_description.update(extra_quant_dict)
        # Initialize attributes for type checking
        self.model_type: str | None = None
        self.hf_to_vllm_mapper: WeightsMapper | None = None
        self.vllm_to_hf_mapper: WeightsMapper | None = None
        self._add_kvcache_quant_metadata()

    def __repr__(self) -> str:
        return "AscendModelSlimConfig:\n" + super().__repr__()

    @classmethod
    def get_name(cls) -> str:
        return ASCEND_QUANTIZATION_METHOD

    @classmethod
    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
        return [torch.int8, torch.float16, torch.bfloat16]

    @classmethod
    def get_min_capability(cls) -> int:
        raise NotImplementedError('Ascend hardware dose not support "get_min_capability" feature.')

    @classmethod
    def get_config_filenames(cls) -> list[str]:
        # Return empty list so that vllm's get_quant_config() skips the
        # file-based lookup (which raises an unfriendly "Cannot find the
        # config file for ascend" error when the model is not quantized).
        # Instead, the config file is loaded in maybe_update_config(),
        # which can provide a user-friendly error message.
        return []

    @classmethod
    def from_config(cls, config: dict[str, Any]) -> "AscendModelSlimConfig":
        return cls(config)

    @classmethod
    def override_quantization_method(cls, hf_quant_cfg, user_quant) -> str | None:
        if hf_quant_cfg is not None:
            quant_method = hf_quant_cfg.get("quant_method", None)
            if not quant_method and torch.npu.is_available():
                return ASCEND_QUANTIZATION_METHOD
        return None

    # TODO: Modify the key values in self.quant_description instead of flipping the hf_to_vllm_mapper
    def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
        """Apply the vLLM model-specific mapper to this quantization config.

        This method is called by vLLM to apply the model-specific weight mapper
        to the quantization configuration. It creates a reverse mapper to convert
        vLLM prefixes back to HF format for looking up keys in quant_config.json.

        Args:
            hf_to_vllm_mapper: The WeightsMapper instance provided by vLLM
                that contains model-specific prefix mappings (HF to vLLM).
        """
        # Check if we already have a valid vllm_to_hf_mapper for this hf_to_vllm_mapper
        if hasattr(self, "hf_to_vllm_mapper") and self.hf_to_vllm_mapper is hf_to_vllm_mapper:
            # Same mapper instance, no need to recreate
            return

        # Store the original mapper
        self.hf_to_vllm_mapper = hf_to_vllm_mapper

        # Try different ways to get the mapping based on WeightsMapper implementation
        mapping_attrs = ["orig_to_new_prefix"]
        orig_to_new_prefix = {}

        for attr_name in mapping_attrs:
            if hasattr(hf_to_vllm_mapper, attr_name):
                orig_to_new_prefix = getattr(hf_to_vllm_mapper, attr_name)
                break

        # Create reverse mapping (vLLM -> HF), skipping empty values
        vllm_to_hf_mapping = {}
        for orig_prefix, new_prefix in orig_to_new_prefix.items():
            # Skip empty values to avoid invalid keys in reverse mapping
            if new_prefix:
                vllm_to_hf_mapping[new_prefix] = orig_prefix

        # Create and store the reverse WeightsMapper instance
        if vllm_to_hf_mapping:
            self.vllm_to_hf_mapper = WeightsMapper(orig_to_new_prefix=vllm_to_hf_mapping)
            logger.debug(f"Created reverse mapping from hf_to_vllm_mapper: {vllm_to_hf_mapping}")
        else:
            logger.info("No valid reverse mapping found for WeightsMapper.")

    def quant_prefix_mapper(self, model_type: str, prefix: str) -> str:
        # Store model_type for reference
        self.model_type = model_type

        # Check if manual mapping exists for this model type
        # Manual mapping takes priority and is used exclusively to avoid conflicts
        if model_type in QUANT_MODEL_PREFIX_MAPPINGS:
            manual_mapping = QUANT_MODEL_PREFIX_MAPPINGS[model_type]
            # Manual mapping is already in vLLM -> HF direction, use directly
            mapper = WeightsMapper(orig_to_new_prefix=manual_mapping)
            return mapper._map_name(prefix)

        # Use the reverse mapper (vLLM to HF) if available
        if hasattr(self, "vllm_to_hf_mapper") and self.vllm_to_hf_mapper:
            return self.vllm_to_hf_mapper._map_name(prefix)

        # Fall back to manual mapping for backward compatibility (simplified)
        # This is only used if apply_vllm_mapper wasn't called or failed
        prefix_mapping = QUANT_MODEL_PREFIX_MAPPINGS.get(model_type)
        if prefix_mapping:
            # Manual mapping is already in vLLM -> HF direction, use directly
            mapper = WeightsMapper(orig_to_new_prefix=prefix_mapping)
            return mapper._map_name(prefix)

        return prefix

    def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["QuantizeMethodBase"]:
        from .method_adapters import (
            AscendEmbeddingMethod,
            AscendFusedMoEMethod,
            AscendKVCacheMethod,
            AscendLinearMethod,
        )

        vllm_config = get_current_vllm_config()
        model_type = vllm_config.model_config.hf_config.model_type

        if model_type in ["minimax", "minimax_m2"]:
            # Adapt to Minimax architecture: update layer names to MoE convention
            prefix = prefix.replace("mlp", "block_sparse_moe")
            # Normalize the prefix by stripping specific expert indices (e.g., 'experts.0' -> 'experts')
            parts = prefix.split(".")
            if "experts" in parts and len(parts) > 2:
                exp_idx = parts.index("experts")
                if exp_idx + 1 < len(parts) and parts[exp_idx + 1].isdigit():
                    parts = parts[: exp_idx + 1]
                    prefix = ".".join(parts)

        if model_type in packed_modules_model_mapping:
            self.packed_modules_mapping = packed_modules_model_mapping[model_type]
        prefix = self.quant_prefix_mapper(model_type, prefix)

        if isinstance(layer, LinearBase):
            if self.is_layer_skipped_ascend(prefix, self.packed_modules_mapping):
                # Delayed import to avoid circular import
                from vllm_ascend.ops.linear import AscendUnquantizedLinearMethod

                return AscendUnquantizedLinearMethod()
            scheme = create_scheme_for_layer(self.quant_description, prefix, "linear", self.packed_modules_mapping)
            return AscendLinearMethod(scheme)
        elif isinstance(layer, AttentionLayerBase) and self.is_fa_quant_layer(prefix):
            scheme = create_scheme_for_layer(self.quant_description, prefix, "attention", self.packed_modules_mapping)
            return AscendKVCacheMethod(scheme)
        elif isinstance(layer, FusedMoE):
            if self.is_layer_skipped_ascend(prefix, self.packed_modules_mapping):
                # Delayed import to avoid circular import
                from vllm_ascend.ops.fused_moe.fused_moe import AscendUnquantizedFusedMoEMethod

                return AscendUnquantizedFusedMoEMethod(layer.moe_config)
            scheme = create_scheme_for_layer(self.quant_description, prefix, "moe", self.packed_modules_mapping)
            return AscendFusedMoEMethod(scheme, layer.moe_config)
        elif isinstance(layer, VocabParallelEmbedding):
            if self.is_layer_skipped_ascend(prefix, self.packed_modules_mapping):
                return UnquantizedEmbeddingMethod()
            scheme = create_scheme_for_layer(self.quant_description, prefix, "linear", self.packed_modules_mapping)
            return AscendEmbeddingMethod(scheme)
        return None

    def is_layer_skipped_ascend(self, prefix: str, fused_mapping: Mapping[str, list[str]] = MappingProxyType({})):
        # adapted from vllm.model_executor.layers.quantization.utils.quant_utils.is_layer_skipped
        proj_name = prefix.split(".")[-1]
        if proj_name in fused_mapping:
            shard_prefixes = [
                prefix.replace(proj_name, shard_proj_name) for shard_proj_name in fused_mapping[proj_name]
            ]

            is_skipped = None
            for shard_prefix in shard_prefixes:
                is_shard_skipped = self.quant_description[shard_prefix + ".weight"] == "FLOAT"

                if is_skipped is None:
                    is_skipped = is_shard_skipped
                elif is_shard_skipped != is_skipped:
                    raise ValueError(
                        f"Detected some but not all shards of {prefix} "
                        "are quantized. All shards of fused layers "
                        "to have the same precision."
                    )
        else:
            is_skipped = any(
                key.startswith(prefix) and key.endswith(".weight") and value == "FLOAT"
                for key, value in self.quant_description.items()
            )

        assert is_skipped is not None
        return is_skipped

    def is_fa_quant_layer(self, prefix):
        if self.enable_fa_quant:
            layer_id_str = "".join(re.findall(r"\.(\d+)\.", prefix))
            if layer_id_str.isdigit() and int(layer_id_str) in self.kvcache_quant_layers:
                return True
        return False

    def enabling_fa_quant(self, vllm_config, layer_name) -> bool:
        is_decode_instance = (
            vllm_config.kv_transfer_config is not None
            and vllm_config.kv_transfer_config.is_kv_consumer
            and not vllm_config.kv_transfer_config.is_kv_producer
        )
        return bool(is_decode_instance and self.is_fa_quant_layer(layer_name))

    def get_kv_quant_dtype(self, layer_name, cache_dtype, model_config):
        if self.enable_fa_quant and self.is_fa_quant_layer(layer_name):
            ori_dtype = model_config.dtype
            quant_dtype = torch.int8
            # For MLA models like deepseek, we only quantify K cache to ensure accuracy
            if model_config.use_mla:
                return quant_dtype, ori_dtype
            else:
                return quant_dtype, quant_dtype
        return cache_dtype, cache_dtype

    def get_kv_quant_split_factor(self, layer_name, kv_head_dim_list):
        if self.enable_fa_quant and self.is_fa_quant_layer(layer_name):
            k_quant_head_dim = kv_head_dim_list[0]
            v_quant_head_dim = kv_head_dim_list[1] * 2
            kv_head_dim_list = [k_quant_head_dim, v_quant_head_dim]
        return calc_split_factor(kv_head_dim_list)

    def maybe_update_config(self, model_name: str, revision: str | None = None) -> None:
        """Load the ModelSlim quantization config from model directory.

        This method is called by vllm after get_quant_config() returns
        successfully. Since we return an empty list from get_config_filenames()
        to bypass vllm's built-in file lookup, we do the actual config loading
        here and provide user-friendly error messages when the config is missing.

        Works with both local directories (``/path/to/model``) and remote
        repository identifiers (``org/model-name``). For remote repos the
        lookup goes through the HuggingFace / ModelScope cache via
        ``get_model_file`` to fetch the config if not already cached.

        Args:
            model_name: Path to the model directory or HuggingFace /
                ModelScope repo id.
            revision: Optional revision (branch, tag, or commit hash) for
                remote repos.
        """
        from vllm_ascend.quantization.utils import get_model_file

        # If quant_description is already populated (e.g. from from_config()),
        # there is nothing to do.
        if self.quant_description:
            return

        # Try to get the config file (local or remote)
        config_path = get_model_file(model_name, MODELSLIM_CONFIG_FILENAME, revision=revision)

        if config_path is not None:
            with open(config_path) as f:
                self.quant_description = json.load(f)
            self._apply_extra_quant_adaptations()
            self._add_kvcache_quant_metadata()
            return

        # Collect diagnostic info for the error message
        json_names: list[str] = []
        if os.path.isdir(model_name):
            json_files = glob.glob(os.path.join(model_name, "*.json"))
            json_names = [os.path.basename(f) for f in json_files]

        # Config file not found - raise a friendly error message
        raise ValueError(
            "\n"
            + "=" * 80
            + "\n"
            + "ERROR: ModelSlim Quantization Config Not Found\n"
            + "=" * 80
            + "\n"
            + "\n"
            + f"You have enabled '--quantization {ASCEND_QUANTIZATION_METHOD}' "
            + "(ModelSlim quantization),\n"
            + f"but the model '{model_name}' does not contain the required\n"
            + f"quantization config file ('{MODELSLIM_CONFIG_FILENAME}').\n"
            + "\n"
            + "This usually means the model weights are NOT quantized by "
            + "ModelSlim.\n"
            + "\n"
            + "Please choose one of the following solutions:\n"
            + "\n"
            + " Solution 1: Remove the quantization option "
            + "(for float/unquantized models)\n"
            + " "
            + "-" * 58
            + "\n"
            + f" Remove '--quantization {ASCEND_QUANTIZATION_METHOD}' from "
            + "your command if you want to\n"
            + " run the model with the original (float) weights.\n"
            + "\n"
            + " Example:\n"
            + f" vllm serve {model_name}\n"
            + "\n"
            + " Solution 2: Quantize your model weights with ModelSlim first\n"
            + " "
            + "-" * 58
            + "\n"
            + " Use the ModelSlim tool to quantize your model weights "
            + "before deployment.\n"
            + " After quantization, the model directory should contain "
            + f"'{MODELSLIM_CONFIG_FILENAME}'.\n"
            + " For more information, please refer to:\n"
            + " https://gitee.com/ascend/msit/tree/master/msmodelslim\n"
            + "\n"
            + (f" (Found JSON files in model directory: {json_names})\n" if json_names else "")
            + "=" * 80
        )

    def _apply_extra_quant_adaptations(self) -> None:
        """Apply extra adaptations to the quant_description dict.

        This handles known key transformations such as shared_head and
        weight_packed mappings.
        """
        extra_quant_dict = {}
        for k in self.quant_description:
            if "shared_head" in k:
                new_k = k.replace(".shared_head.", ".")
                extra_quant_dict[new_k] = self.quant_description[k]
            if "weight_packed" in k:
                new_k = k.replace("weight_packed", "weight")
                extra_quant_dict[new_k] = self.quant_description[k]
        self.quant_description.update(extra_quant_dict)

    def get_scaled_act_names(self) -> list[str]:
        return []

    def _add_kvcache_quant_metadata(self):
        fa_quant_type = self.quant_description.get("fa_quant_type", "")
        self.enable_fa_quant = fa_quant_type != ""
        self.kvcache_quant_layers = []
        if self.enable_fa_quant:
            for key in self.quant_description:
                if "fa_k.scale" in key:
                    _id = "".join(re.findall(r"\.(\d+)\.", key))
                    self.kvcache_quant_layers.append(int(_id))
```
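
For orientation, the module-level helpers near the top of the attached file are plain dictionary lookups over the tables shown above. A minimal usage sketch follows; the import path is hypothetical, and only the function names and return values come from the file itself:

```python
# Hypothetical import path -- adjust to wherever the module above lives in the tree.
from vllm_ascend.quantization.modelslim import (
    get_packed_modules_mapping,
    get_prefix_mapping,
)

# Which original projections are packed into each fused module for DeepSeek-V3.
print(get_packed_modules_mapping("deepseek_v3")["fused_qkv_a_proj"])
# -> ['q_a_proj', 'kv_a_proj_with_mqa']

# vLLM -> HF prefix mapping used to look up keys in quant_model_description.json.
print(get_prefix_mapping("qwen3_vl_moe")["language_model.model."])
# -> 'model.language_model.'

# Unknown model types fall back to an empty dict.
assert get_packed_modules_mapping("not_a_model_type") == {}
```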